Rewrite ‘parametrize’ and ‘parametrizeCustom’

This commit is contained in:
mrkkrp 2016-07-05 12:48:17 +03:00
parent 042ea42a20
commit ad52094310
4 changed files with 34 additions and 117 deletions

View File

@ -9,124 +9,40 @@
--
-- Parametrization for strings, useful for transliteration.
{-# LANGUAGE CPP #-}
{-# LANGUAGE CPP #-}
{-# LANGUAGE OverloadedStrings #-}
module Text.Inflections.Parameterize
( parameterize
, parameterizeCustom )
where
import Control.Monad (guard)
import Data.Char (isAscii, isAsciiLower, isAsciiUpper, isDigit, toLower)
import Data.List (group)
import Data.Maybe (mapMaybe)
import Data.Char (isAscii, isAlphaNum, isPunctuation, toLower)
import Data.Text (Text)
import Text.Inflections.Data
import Text.Megaparsec
import Text.Megaparsec.String
import qualified Data.HashMap.Strict as M
import qualified Data.Text as T
#if !MIN_VERSION_base(4,8,0)
import Control.Applicative
#endif
-- | Parametrizable character.
data PChar
= UCase {-# UNPACK #-} !Char -- ^ Uppercase charater
| Acceptable String
-- ^ Since some of the transliterating approximations expand from one
-- Unicode to two ASCII chars (eg., œ to oe), we represent this as a
-- String.
| Separator -- ^ Separator
| Underscore -- ^ Underscore
| OtherAscii {-# UNPACK #-} !Char -- ^ Other ASCII charater
| NonAscii {-# UNPACK #-} !Char -- ^ Non-ASCII charater
deriving (Eq, Show)
-- |Matches 'acceptable' characters for parameterization purposes.
acceptableParser :: Parser PChar
acceptableParser = do
c <- satisfy ((||) <$> isAsciiLower <*> isDigit)
return $ Acceptable [c]
parser :: Parser [PChar]
parser = many $ label "a parametrizable character" $ choice
[ acceptableParser
, UCase <$> satisfy isAsciiUpper
, Separator <$ char '-'
, Underscore <$ char '_'
, OtherAscii <$> asciiChar
, NonAscii <$> satisfy (not . isAscii) ]
-- | Parser that accepts rows of parametrizable characters.
-- | Check if given char is “acceptable”, that is, it's lowercase ASCII
-- letter or digit.
isValidParamChar :: Char -> Bool
isValidParamChar c = isAsciiLower c || isDigit c
-- |Given a Transliteration table and a PChar, returns Maybe PChar indicating
-- how this character should appear in a URL.
parameterizeChar :: Transliterations -> PChar -> Maybe PChar
parameterizeChar _ (UCase c) = Just $ Acceptable [toLower c]
parameterizeChar _ (Acceptable c) = Just $ Acceptable c
parameterizeChar _ Separator = Just Separator
parameterizeChar _ Underscore = Just Underscore
parameterizeChar _ (OtherAscii _) = Just Separator
parameterizeChar ts (NonAscii c) = transliteratePCharCustom ts c
-- |Look up character in transliteration list. Accepts a Transliteration map
-- which has Chars as keys and Strings as values for approximating common
-- international Unicode characters within the ASCII range.
transliteratePCharCustom :: Transliterations -> Char -> Maybe PChar
transliteratePCharCustom ts c = do
-- We may have expanded into multiple characters during
-- transliteration, so check validity of all characters in
-- result.
v <- M.lookup c ts
guard (all isValidParamChar v)
return (Acceptable v)
-- |Turns PChar tokens into their String representation.
pCharToC :: PChar -> String
pCharToC (UCase c) = [c]
pCharToC (Acceptable str) = str
pCharToC Separator = "-"
pCharToC Underscore = "_"
pCharToC (OtherAscii c) = [c]
pCharToC (NonAscii c) = [c]
-- |Reduce sequences of separators down to only one separator.
squeezeSeparators :: [PChar] -> [PChar]
squeezeSeparators ps = concatMap squashSeparatorGroup $ group ps
where squashSeparatorGroup g = case head g of
Separator -> [Separator] -- only take head
_ -> g -- don't change
-- |Replaces special characters in a string so that it may be used as part of a
-- 'pretty' URL. Uses the default transliterations in this library
parameterize :: String -> String
-- 'pretty' URL. Uses the default transliterations in this library.
parameterize :: Text -> Text
parameterize = parameterizeCustom defaultMap
{-# INLINE parameterize #-}
-- |Transliterate a String with a custom transliteration table.
parameterizeCustom :: Transliterations -> String -> String
parameterizeCustom ts s =
case parsed of
Right ast -> (concatMap pCharToC . squeezeSeparators .
trimUnwanted wanted . mapMaybe (parameterizeChar ts))
ast
-- Note that this should never fail, since we accommodate all Unicode
-- characters as valid input.
Left err -> fail $ "Parse failed, please report a bug! Error: " ++
show err
where parsed = parse parser "" s
wanted :: [PChar] -- All valid URL chars - we shouldn't trim these.
wanted = Underscore :
map (Acceptable . (: [])) (['a'..'z'] ++ ['0'..'9'])
-- |Trim non-wanted elements from the beginning and end of list.
trimUnwanted :: Eq a => [a] -> [a] -> [a]
trimUnwanted wanted = dropWhile notWanted . reverse . dropWhile notWanted
. reverse
where notWanted = (`notElem` wanted)
-- |Transliterate 'Text' with a custom transliteration table.
parameterizeCustom :: Transliterations -> Text -> Text
parameterizeCustom m txt = (T.intercalate "-" . T.words) (T.unfoldr f ("", txt))
where
f ("", t) = uncurry g <$> T.uncons t
f ((x:xs), t) = Just (x, (xs, t))
g x xs
| (isAscii x && isAlphaNum x) || x == '_' = (toLower x, ("", xs))
| isPunctuation x = (' ', ("", xs))
| otherwise =
case toLower <$> M.lookupDefault " " x m of
"" -> (' ', ("",xs))
(y:ys) -> (y, (ys,xs))

View File

@ -51,5 +51,5 @@ transliterateCustom replacement m txt = T.unfoldr f ("", txt)
then (x, ("", xs))
else
case M.lookupDefault replacement x m of
"" -> ('?', ("",xs))
(y:ys) -> (y, (ys,xs))
"" -> ('?', ("",xs))
(y:ys) -> (y, (ys,xs))

View File

@ -49,7 +49,6 @@ library
else
ghc-options: -O2 -Wall
build-depends: base >= 4.6 && < 4.10
, containers >= 0.5 && < 0.6
, exceptions >= 0.6 && < 0.9
, megaparsec >= 5.0 && < 6.0
, text >= 0.2 && < 1.3

View File

@ -6,6 +6,7 @@ import Data.Char (toLower)
import Data.List (group)
import Test.Hspec
import Test.QuickCheck
import qualified Data.Text as T
import Text.Inflections
import Text.Inflections.Parse.Types (Word(..))
@ -48,44 +49,45 @@ dasherizeSpacedSentence =
onlyValidCharacters :: Spec
onlyValidCharacters =
it "returns only valid characters" (property onlyValidCharactersPredicate)
where onlyValidCharactersPredicate sf = all (`elem` (alphaNumerics ++ "-_")) $ parameterize sf
where onlyValidCharactersPredicate sf
= T.all (`elem` (alphaNumerics ++ "-_")) $ parameterize (T.pack sf)
notBeginWithSeparator :: Spec
notBeginWithSeparator =
it "never returns a string beginning ending with a separator" (property notBeginWithSeparatorPredicate)
where
notBeginWithSeparatorPredicate s =
let parameterized = parameterize s in
(not . null) parameterized ==> head parameterized /= '-'
let parameterized = parameterize (T.pack s) in
(not . T.null) parameterized ==> T.head parameterized /= '-'
notEndWithSeparator :: Spec
notEndWithSeparator =
it "never returns a string beginning with a separator" (property notBeginWithSeparatorPredicate)
where
notBeginWithSeparatorPredicate s =
let parameterized = parameterize s in
(not . null) parameterized ==> last parameterized /= '-'
let parameterized = parameterize (T.pack s) in
(not . T.null) parameterized ==> T.last parameterized /= '-'
noMissingAlphanumerics :: Spec
noMissingAlphanumerics =
it "returns every alphanumeric character from the input" (property noMissingAlphanumericsPredicate)
where noMissingAlphanumericsPredicate s =
let parameterized = parameterize s in
let parameterized = parameterize (T.pack s) in
all (\c -> c `notElem` alphaNumerics ||
c `elem` (alphaNumerics ++ "-") &&
c `elem` parameterized) $ map toLower s
c `elem` T.unpack parameterized) $ map toLower s
noMoreThanOneHyphen :: Spec
noMoreThanOneHyphen =
it "never returns a string with a sequence of dashes" (property noMoreThanOneHyphenPredicate)
where noMoreThanOneHyphenPredicate s =
let parameterized = parameterize s in longestSequenceOf '-' parameterized <= 1
let parameterized = parameterize (T.pack s)
in longestSequenceOf '-' (T.unpack parameterized) <= 1
longestSequenceOf :: Char -> String -> Int
longestSequenceOf _ [] = 0
longestSequenceOf c s =
if null subseqLengths then 0 else maximum subseqLengths
where subseqLengths = (map length . filter (\str -> head str == c) . group) s
alphaNumerics :: String