From ad520943106e745e8a02341caf456db725da8a80 Mon Sep 17 00:00:00 2001
From: mrkkrp <markkarpov@opmbx.org>
Date: Tue, 5 Jul 2016 12:48:17 +0300
Subject: [PATCH] =?UTF-8?q?Rewrite=20=E2=80=98parametrize=E2=80=99=20and?=
 =?UTF-8?q?=20=E2=80=98parametrizeCustom=E2=80=99?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 Text/Inflections/Parameterize.hs        | 126 ++++--------------------
 Text/Inflections/Transliterate.hs       |   4 +-
 inflections.cabal                       |   1 -
 test/Text/Inflections/PropertiesSpec.hs |  20 ++--
 4 files changed, 34 insertions(+), 117 deletions(-)

diff --git a/Text/Inflections/Parameterize.hs b/Text/Inflections/Parameterize.hs
index 9896d65..5a16705 100644
--- a/Text/Inflections/Parameterize.hs
+++ b/Text/Inflections/Parameterize.hs
@@ -9,124 +9,40 @@
 --
 -- Parametrization for strings, useful for transliteration.
 
-{-# LANGUAGE CPP #-}
+{-# LANGUAGE CPP               #-}
+{-# LANGUAGE OverloadedStrings #-}
 
 module Text.Inflections.Parameterize
   ( parameterize
   , parameterizeCustom )
 where
 
-import Control.Monad (guard)
-import Data.Char (isAscii, isAsciiLower, isAsciiUpper, isDigit, toLower)
-import Data.List (group)
-import Data.Maybe (mapMaybe)
+import Data.Char (isAscii, isAlphaNum, isPunctuation, toLower)
+import Data.Text (Text)
 import Text.Inflections.Data
-import Text.Megaparsec
-import Text.Megaparsec.String
 import qualified Data.HashMap.Strict as M
+import qualified Data.Text           as T
 
 #if !MIN_VERSION_base(4,8,0)
 import Control.Applicative
 #endif
 
--- | Parametrizable character.
-
-data PChar
-  = UCase      {-# UNPACK #-} !Char -- ^ Uppercase charater
-  | Acceptable String
-  -- ^ Since some of the transliterating approximations expand from one
-  -- Unicode to two ASCII chars (eg., œ to oe), we represent this as a
-  -- String.
-  | Separator                       -- ^ Separator
-  | Underscore                      -- ^ Underscore
-  | OtherAscii {-# UNPACK #-} !Char -- ^ Other ASCII charater
-  | NonAscii   {-# UNPACK #-} !Char -- ^ Non-ASCII charater
-  deriving (Eq, Show)
-
--- |Matches 'acceptable' characters for parameterization purposes.
-acceptableParser :: Parser PChar
-acceptableParser = do
-  c <- satisfy ((||) <$> isAsciiLower <*> isDigit)
-  return $ Acceptable [c]
-
-parser :: Parser [PChar]
-parser = many $ label "a parametrizable character" $ choice
-  [ acceptableParser
-  , UCase      <$> satisfy isAsciiUpper
-  , Separator  <$  char '-'
-  , Underscore <$  char '_'
-  , OtherAscii <$> asciiChar
-  , NonAscii   <$> satisfy (not . isAscii) ]
-
--- | Parser that accepts rows of parametrizable characters.
--- | Check if given char is “acceptable”, that is, it's lowercase ASCII
--- letter or digit.
-isValidParamChar :: Char -> Bool
-isValidParamChar c = isAsciiLower c || isDigit c
-
--- |Given a Transliteration table and a PChar, returns Maybe PChar indicating
--- how this character should appear in a URL.
-parameterizeChar :: Transliterations -> PChar -> Maybe PChar
-parameterizeChar _  (UCase c)      = Just $ Acceptable [toLower c]
-parameterizeChar _  (Acceptable c) = Just $ Acceptable c
-parameterizeChar _  Separator      = Just Separator
-parameterizeChar _  Underscore     = Just Underscore
-parameterizeChar _  (OtherAscii _) = Just Separator
-parameterizeChar ts (NonAscii c)   = transliteratePCharCustom ts c
-
--- |Look up character in transliteration list. Accepts a Transliteration map
--- which has Chars as keys and Strings as values for approximating common
--- international Unicode characters within the ASCII range.
-transliteratePCharCustom :: Transliterations -> Char -> Maybe PChar
-transliteratePCharCustom ts c = do
-  -- We may have expanded into multiple characters during
-  -- transliteration, so check validity of all characters in
-  -- result.
-  v <- M.lookup c ts
-  guard (all isValidParamChar v)
-  return (Acceptable v)
-
--- |Turns PChar tokens into their String representation.
-pCharToC :: PChar -> String
-pCharToC (UCase c)        = [c]
-pCharToC (Acceptable str) = str
-pCharToC Separator        = "-"
-pCharToC Underscore       = "_"
-pCharToC (OtherAscii c)   = [c]
-pCharToC (NonAscii c)     = [c]
-
--- |Reduce sequences of separators down to only one separator.
-squeezeSeparators :: [PChar] -> [PChar]
-squeezeSeparators ps = concatMap squashSeparatorGroup $ group ps
-    where squashSeparatorGroup g = case head g of
-                                     Separator -> [Separator] -- only take head
-                                     _         -> g           -- don't change
-
 -- |Replaces special characters in a string so that it may be used as part of a
--- 'pretty' URL. Uses the default transliterations in this library
-parameterize :: String -> String
+-- 'pretty' URL. Uses the default transliterations in this library.
+parameterize :: Text -> Text
 parameterize = parameterizeCustom defaultMap
+{-# INLINE parameterize #-}
 
--- |Transliterate a String with a custom transliteration table.
-parameterizeCustom :: Transliterations -> String -> String
-parameterizeCustom ts s =
-    case parsed of
-      Right ast -> (concatMap pCharToC . squeezeSeparators .
-                    trimUnwanted wanted . mapMaybe (parameterizeChar ts))
-                   ast
-
-      -- Note that this should never fail, since we accommodate all Unicode
-      -- characters as valid input.
-      Left err -> fail $ "Parse failed, please report a bug! Error: " ++
-                         show err
-
-    where parsed = parse parser  "" s
-          wanted :: [PChar] -- All valid URL chars - we shouldn't trim these.
-          wanted = Underscore :
-                   map (Acceptable . (: [])) (['a'..'z'] ++ ['0'..'9'])
-
--- |Trim non-wanted elements from the beginning and end of list.
-trimUnwanted :: Eq a => [a] -> [a] -> [a]
-trimUnwanted wanted = dropWhile notWanted . reverse . dropWhile notWanted
-                      . reverse
-  where notWanted = (`notElem` wanted)
+-- |Transliterate 'Text' with a custom transliteration table.
+parameterizeCustom :: Transliterations -> Text -> Text
+parameterizeCustom m txt = (T.intercalate "-" . T.words) (T.unfoldr f ("", txt))
+  where
+    f ("", t) = uncurry g <$> T.uncons t
+    f ((x:xs), t) = Just (x, (xs, t))
+    g x xs
+      | (isAscii x && isAlphaNum x) || x == '_' = (toLower x, ("", xs))
+      | isPunctuation x = (' ', ("", xs))
+      | otherwise =
+          case toLower <$> M.lookupDefault " " x m of
+            ""     -> (' ', ("",xs))
+            (y:ys) -> (y,   (ys,xs))
diff --git a/Text/Inflections/Transliterate.hs b/Text/Inflections/Transliterate.hs
index 2d5e274..b030f0c 100644
--- a/Text/Inflections/Transliterate.hs
+++ b/Text/Inflections/Transliterate.hs
@@ -51,5 +51,5 @@ transliterateCustom replacement m txt = T.unfoldr f ("", txt)
         then (x, ("", xs))
         else
           case M.lookupDefault replacement x m of
-            "" -> ('?', ("",xs))
-            (y:ys) -> (y, (ys,xs))
+            ""     -> ('?', ("",xs))
+            (y:ys) -> (y,   (ys,xs))
diff --git a/inflections.cabal b/inflections.cabal
index 66821ae..4beb0a7 100644
--- a/inflections.cabal
+++ b/inflections.cabal
@@ -49,7 +49,6 @@ library
   else
     ghc-options:      -O2 -Wall
   build-depends:       base         >= 4.6   && < 4.10
-                     , containers   >= 0.5   && < 0.6
                      , exceptions   >= 0.6   && < 0.9
                      , megaparsec   >= 5.0   && < 6.0
                      , text         >= 0.2   && < 1.3
diff --git a/test/Text/Inflections/PropertiesSpec.hs b/test/Text/Inflections/PropertiesSpec.hs
index 59c07d4..72d0864 100644
--- a/test/Text/Inflections/PropertiesSpec.hs
+++ b/test/Text/Inflections/PropertiesSpec.hs
@@ -6,6 +6,7 @@ import Data.Char (toLower)
 import Data.List (group)
 import Test.Hspec
 import Test.QuickCheck
+import qualified Data.Text as T
 
 import Text.Inflections
 import Text.Inflections.Parse.Types (Word(..))
@@ -48,44 +49,45 @@ dasherizeSpacedSentence =
 onlyValidCharacters :: Spec
 onlyValidCharacters =
   it "returns only valid characters" (property onlyValidCharactersPredicate)
-    where onlyValidCharactersPredicate sf = all (`elem` (alphaNumerics ++ "-_")) $ parameterize sf
+    where onlyValidCharactersPredicate sf
+            = T.all (`elem` (alphaNumerics ++ "-_")) $ parameterize (T.pack sf)
 
 notBeginWithSeparator :: Spec
 notBeginWithSeparator =
   it "never returns a string beginning ending with a separator" (property notBeginWithSeparatorPredicate)
       where
         notBeginWithSeparatorPredicate s =
-            let parameterized = parameterize s in
-            (not . null) parameterized ==> head parameterized /= '-'
+            let parameterized = parameterize (T.pack s) in
+            (not . T.null) parameterized ==> T.head parameterized /= '-'
 
 notEndWithSeparator :: Spec
 notEndWithSeparator =
   it "never returns a string beginning with a separator" (property notBeginWithSeparatorPredicate)
      where
        notBeginWithSeparatorPredicate s =
-            let parameterized = parameterize s in
-            (not . null) parameterized ==> last parameterized /= '-'
+            let parameterized = parameterize (T.pack s) in
+            (not . T.null) parameterized ==> T.last parameterized /= '-'
 
 noMissingAlphanumerics :: Spec
 noMissingAlphanumerics =
   it "returns every alphanumeric character from the input" (property noMissingAlphanumericsPredicate)
   where noMissingAlphanumericsPredicate s =
-            let parameterized = parameterize s in
+            let parameterized = parameterize (T.pack s) in
             all (\c -> c `notElem` alphaNumerics ||
                 c `elem` (alphaNumerics ++ "-") &&
-                c `elem` parameterized) $ map toLower s
+                c `elem` T.unpack parameterized) $ map toLower s
 
 noMoreThanOneHyphen :: Spec
 noMoreThanOneHyphen =
   it "never returns a string with a sequence of dashes" (property noMoreThanOneHyphenPredicate)
   where noMoreThanOneHyphenPredicate s =
-            let parameterized = parameterize s in longestSequenceOf '-' parameterized <= 1
+            let parameterized = parameterize (T.pack s)
+            in longestSequenceOf '-' (T.unpack parameterized) <= 1
 
 longestSequenceOf :: Char -> String -> Int
 longestSequenceOf _ [] = 0
 longestSequenceOf c s =
     if null subseqLengths then 0 else maximum subseqLengths
-
   where subseqLengths = (map length . filter (\str -> head str == c) . group) s
 
 alphaNumerics :: String