Rewrite ‘parametrize’ and ‘parametrizeCustom’

2024-08-15 17:10:21 +03:00 · 2016-07-05 12:48:17 +03:00 · 2016-07-05 12:48:17 +03:00 · ad52094310
commit ad52094310
parent 042ea42a20
4 changed files with 34 additions and 117 deletions
--- a/Text/Inflections/Parameterize.hs
+++ b/Text/Inflections/Parameterize.hs
@ -9,124 +9,40 @@
 --
 -- Parametrization for strings, useful for transliteration.

-{-# LANGUAGE CPP #-}
+{-# LANGUAGE CPP               #-}
+{-# LANGUAGE OverloadedStrings #-}

 module Text.Inflections.Parameterize
  ( parameterize
  , parameterizeCustom )
 where

-import Control.Monad (guard)
-import Data.Char (isAscii, isAsciiLower, isAsciiUpper, isDigit, toLower)
-import Data.List (group)
-import Data.Maybe (mapMaybe)
+import Data.Char (isAscii, isAlphaNum, isPunctuation, toLower)
+import Data.Text (Text)
 import Text.Inflections.Data
-import Text.Megaparsec
-import Text.Megaparsec.String
 import qualified Data.HashMap.Strict as M
+import qualified Data.Text           as T

 #if !MIN_VERSION_base(4,8,0)
 import Control.Applicative
 #endif

-- | Parametrizable character.
-
-data PChar
-  = UCase      {-# UNPACK #-} !Char -- ^ Uppercase charater
-  | Acceptable String
-  -- ^ Since some of the transliterating approximations expand from one
-  -- Unicode to two ASCII chars (eg., œ to oe), we represent this as a
-  -- String.
-  | Separator                       -- ^ Separator
-  | Underscore                      -- ^ Underscore
-  | OtherAscii {-# UNPACK #-} !Char -- ^ Other ASCII charater
-  | NonAscii   {-# UNPACK #-} !Char -- ^ Non-ASCII charater
-  deriving (Eq, Show)
-
-- |Matches 'acceptable' characters for parameterization purposes.
-acceptableParser :: Parser PChar
-acceptableParser = do
-  c <- satisfy ((||) <$> isAsciiLower <*> isDigit)
-  return $ Acceptable [c]
-
-parser :: Parser [PChar]
-parser = many $ label "a parametrizable character" $ choice
-  [ acceptableParser
-  , UCase      <$> satisfy isAsciiUpper
-  , Separator  <$  char '-'
-  , Underscore <$  char '_'
-  , OtherAscii <$> asciiChar
-  , NonAscii   <$> satisfy (not . isAscii) ]
-
-- | Parser that accepts rows of parametrizable characters.
-- | Check if given char is “acceptable”, that is, it's lowercase ASCII
-- letter or digit.
-isValidParamChar :: Char -> Bool
-isValidParamChar c = isAsciiLower c || isDigit c
-
-- |Given a Transliteration table and a PChar, returns Maybe PChar indicating
-- how this character should appear in a URL.
-parameterizeChar :: Transliterations -> PChar -> Maybe PChar
-parameterizeChar _  (UCase c)      = Just $ Acceptable [toLower c]
-parameterizeChar _  (Acceptable c) = Just $ Acceptable c
-parameterizeChar _  Separator      = Just Separator
-parameterizeChar _  Underscore     = Just Underscore
-parameterizeChar _  (OtherAscii _) = Just Separator
-parameterizeChar ts (NonAscii c)   = transliteratePCharCustom ts c
-
-- |Look up character in transliteration list. Accepts a Transliteration map
-- which has Chars as keys and Strings as values for approximating common
-- international Unicode characters within the ASCII range.
-transliteratePCharCustom :: Transliterations -> Char -> Maybe PChar
-transliteratePCharCustom ts c = do
-  -- We may have expanded into multiple characters during
-  -- transliteration, so check validity of all characters in
-  -- result.
-  v <- M.lookup c ts
-  guard (all isValidParamChar v)
-  return (Acceptable v)
-
-- |Turns PChar tokens into their String representation.
-pCharToC :: PChar -> String
-pCharToC (UCase c)        = [c]
-pCharToC (Acceptable str) = str
-pCharToC Separator        = "-"
-pCharToC Underscore       = "_"
-pCharToC (OtherAscii c)   = [c]
-pCharToC (NonAscii c)     = [c]
-
-- |Reduce sequences of separators down to only one separator.
-squeezeSeparators :: [PChar] -> [PChar]
-squeezeSeparators ps = concatMap squashSeparatorGroup $ group ps
-    where squashSeparatorGroup g = case head g of
-                                     Separator -> [Separator] -- only take head
-                                     _         -> g           -- don't change
-
 -- |Replaces special characters in a string so that it may be used as part of a
-- 'pretty' URL. Uses the default transliterations in this library
-parameterize :: String -> String
+-- 'pretty' URL. Uses the default transliterations in this library.
+parameterize :: Text -> Text
 parameterize = parameterizeCustom defaultMap
+{-# INLINE parameterize #-}

-- |Transliterate a String with a custom transliteration table.
-parameterizeCustom :: Transliterations -> String -> String
-parameterizeCustom ts s =
-    case parsed of
-      Right ast -> (concatMap pCharToC . squeezeSeparators .
-                    trimUnwanted wanted . mapMaybe (parameterizeChar ts))
-                   ast
-
-      -- Note that this should never fail, since we accommodate all Unicode
-      -- characters as valid input.
-      Left err -> fail $ "Parse failed, please report a bug! Error: " ++
-                         show err
-
-    where parsed = parse parser  "" s
-          wanted :: [PChar] -- All valid URL chars - we shouldn't trim these.
-          wanted = Underscore :
-                   map (Acceptable . (: [])) (['a'..'z'] ++ ['0'..'9'])
-
-- |Trim non-wanted elements from the beginning and end of list.
-trimUnwanted :: Eq a => [a] -> [a] -> [a]
-trimUnwanted wanted = dropWhile notWanted . reverse . dropWhile notWanted
-                      . reverse
-  where notWanted = (`notElem` wanted)
+-- |Transliterate 'Text' with a custom transliteration table.
+parameterizeCustom :: Transliterations -> Text -> Text
+parameterizeCustom m txt = (T.intercalate "-" . T.words) (T.unfoldr f ("", txt))
+  where
+    f ("", t) = uncurry g <$> T.uncons t
+    f ((x:xs), t) = Just (x, (xs, t))
+    g x xs
+      | (isAscii x && isAlphaNum x) || x == '_' = (toLower x, ("", xs))
+      | isPunctuation x = (' ', ("", xs))
+      | otherwise =
+          case toLower <$> M.lookupDefault " " x m of
+            ""     -> (' ', ("",xs))
+            (y:ys) -> (y,   (ys,xs))
--- a/Text/Inflections/Transliterate.hs
+++ b/Text/Inflections/Transliterate.hs
@ -51,5 +51,5 @@ transliterateCustom replacement m txt = T.unfoldr f ("", txt)
        then (x, ("", xs))
        else
          case M.lookupDefault replacement x m of
-            "" -> ('?', ("",xs))
-            (y:ys) -> (y, (ys,xs))
+            ""     -> ('?', ("",xs))
+            (y:ys) -> (y,   (ys,xs))
--- a/inflections.cabal
+++ b/inflections.cabal
@ -49,7 +49,6 @@ library
  else
    ghc-options:      -O2 -Wall
  build-depends:       base         >= 4.6   && < 4.10
-                     , containers   >= 0.5   && < 0.6
                     , exceptions   >= 0.6   && < 0.9
                     , megaparsec   >= 5.0   && < 6.0
                     , text         >= 0.2   && < 1.3
--- a/test/Text/Inflections/PropertiesSpec.hs
+++ b/test/Text/Inflections/PropertiesSpec.hs
@ -6,6 +6,7 @@ import Data.Char (toLower)
 import Data.List (group)
 import Test.Hspec
 import Test.QuickCheck
+import qualified Data.Text as T

 import Text.Inflections
 import Text.Inflections.Parse.Types (Word(..))
@ -48,44 +49,45 @@ dasherizeSpacedSentence =
 onlyValidCharacters :: Spec
 onlyValidCharacters =
  it "returns only valid characters" (property onlyValidCharactersPredicate)
-    where onlyValidCharactersPredicate sf = all (`elem` (alphaNumerics ++ "-_")) $ parameterize sf
+    where onlyValidCharactersPredicate sf
+            = T.all (`elem` (alphaNumerics ++ "-_")) $ parameterize (T.pack sf)

 notBeginWithSeparator :: Spec
 notBeginWithSeparator =
  it "never returns a string beginning ending with a separator" (property notBeginWithSeparatorPredicate)
      where
        notBeginWithSeparatorPredicate s =
-            let parameterized = parameterize s in
-            (not . null) parameterized ==> head parameterized /= '-'
+            let parameterized = parameterize (T.pack s) in
+            (not . T.null) parameterized ==> T.head parameterized /= '-'

 notEndWithSeparator :: Spec
 notEndWithSeparator =
  it "never returns a string beginning with a separator" (property notBeginWithSeparatorPredicate)
     where
       notBeginWithSeparatorPredicate s =
-            let parameterized = parameterize s in
-            (not . null) parameterized ==> last parameterized /= '-'
+            let parameterized = parameterize (T.pack s) in
+            (not . T.null) parameterized ==> T.last parameterized /= '-'

 noMissingAlphanumerics :: Spec
 noMissingAlphanumerics =
  it "returns every alphanumeric character from the input" (property noMissingAlphanumericsPredicate)
  where noMissingAlphanumericsPredicate s =
-            let parameterized = parameterize s in
+            let parameterized = parameterize (T.pack s) in
            all (\c -> c `notElem` alphaNumerics ||
                c `elem` (alphaNumerics ++ "-") &&
-                c `elem` parameterized) $ map toLower s
+                c `elem` T.unpack parameterized) $ map toLower s

 noMoreThanOneHyphen :: Spec
 noMoreThanOneHyphen =
  it "never returns a string with a sequence of dashes" (property noMoreThanOneHyphenPredicate)
  where noMoreThanOneHyphenPredicate s =
-            let parameterized = parameterize s in longestSequenceOf '-' parameterized <= 1
+            let parameterized = parameterize (T.pack s)
+            in longestSequenceOf '-' (T.unpack parameterized) <= 1

 longestSequenceOf :: Char -> String -> Int
 longestSequenceOf _ [] = 0
 longestSequenceOf c s =
    if null subseqLengths then 0 else maximum subseqLengths
-
  where subseqLengths = (map length . filter (\str -> head str == c) . group) s

 alphaNumerics :: String