From ad520943106e745e8a02341caf456db725da8a80 Mon Sep 17 00:00:00 2001 From: mrkkrp Date: Tue, 5 Jul 2016 12:48:17 +0300 Subject: [PATCH] =?UTF-8?q?Rewrite=20=E2=80=98parametrize=E2=80=99=20and?= =?UTF-8?q?=20=E2=80=98parametrizeCustom=E2=80=99?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Text/Inflections/Parameterize.hs | 126 ++++-------------------- Text/Inflections/Transliterate.hs | 4 +- inflections.cabal | 1 - test/Text/Inflections/PropertiesSpec.hs | 20 ++-- 4 files changed, 34 insertions(+), 117 deletions(-) diff --git a/Text/Inflections/Parameterize.hs b/Text/Inflections/Parameterize.hs index 9896d65..5a16705 100644 --- a/Text/Inflections/Parameterize.hs +++ b/Text/Inflections/Parameterize.hs @@ -9,124 +9,40 @@ -- -- Parametrization for strings, useful for transliteration. -{-# LANGUAGE CPP #-} +{-# LANGUAGE CPP #-} +{-# LANGUAGE OverloadedStrings #-} module Text.Inflections.Parameterize ( parameterize , parameterizeCustom ) where -import Control.Monad (guard) -import Data.Char (isAscii, isAsciiLower, isAsciiUpper, isDigit, toLower) -import Data.List (group) -import Data.Maybe (mapMaybe) +import Data.Char (isAscii, isAlphaNum, isPunctuation, toLower) +import Data.Text (Text) import Text.Inflections.Data -import Text.Megaparsec -import Text.Megaparsec.String import qualified Data.HashMap.Strict as M +import qualified Data.Text as T #if !MIN_VERSION_base(4,8,0) import Control.Applicative #endif --- | Parametrizable character. - -data PChar - = UCase {-# UNPACK #-} !Char -- ^ Uppercase charater - | Acceptable String - -- ^ Since some of the transliterating approximations expand from one - -- Unicode to two ASCII chars (eg., œ to oe), we represent this as a - -- String. - | Separator -- ^ Separator - | Underscore -- ^ Underscore - | OtherAscii {-# UNPACK #-} !Char -- ^ Other ASCII charater - | NonAscii {-# UNPACK #-} !Char -- ^ Non-ASCII charater - deriving (Eq, Show) - --- |Matches 'acceptable' characters for parameterization purposes. -acceptableParser :: Parser PChar -acceptableParser = do - c <- satisfy ((||) <$> isAsciiLower <*> isDigit) - return $ Acceptable [c] - -parser :: Parser [PChar] -parser = many $ label "a parametrizable character" $ choice - [ acceptableParser - , UCase <$> satisfy isAsciiUpper - , Separator <$ char '-' - , Underscore <$ char '_' - , OtherAscii <$> asciiChar - , NonAscii <$> satisfy (not . isAscii) ] - --- | Parser that accepts rows of parametrizable characters. --- | Check if given char is “acceptable”, that is, it's lowercase ASCII --- letter or digit. -isValidParamChar :: Char -> Bool -isValidParamChar c = isAsciiLower c || isDigit c - --- |Given a Transliteration table and a PChar, returns Maybe PChar indicating --- how this character should appear in a URL. -parameterizeChar :: Transliterations -> PChar -> Maybe PChar -parameterizeChar _ (UCase c) = Just $ Acceptable [toLower c] -parameterizeChar _ (Acceptable c) = Just $ Acceptable c -parameterizeChar _ Separator = Just Separator -parameterizeChar _ Underscore = Just Underscore -parameterizeChar _ (OtherAscii _) = Just Separator -parameterizeChar ts (NonAscii c) = transliteratePCharCustom ts c - --- |Look up character in transliteration list. Accepts a Transliteration map --- which has Chars as keys and Strings as values for approximating common --- international Unicode characters within the ASCII range. -transliteratePCharCustom :: Transliterations -> Char -> Maybe PChar -transliteratePCharCustom ts c = do - -- We may have expanded into multiple characters during - -- transliteration, so check validity of all characters in - -- result. - v <- M.lookup c ts - guard (all isValidParamChar v) - return (Acceptable v) - --- |Turns PChar tokens into their String representation. -pCharToC :: PChar -> String -pCharToC (UCase c) = [c] -pCharToC (Acceptable str) = str -pCharToC Separator = "-" -pCharToC Underscore = "_" -pCharToC (OtherAscii c) = [c] -pCharToC (NonAscii c) = [c] - --- |Reduce sequences of separators down to only one separator. -squeezeSeparators :: [PChar] -> [PChar] -squeezeSeparators ps = concatMap squashSeparatorGroup $ group ps - where squashSeparatorGroup g = case head g of - Separator -> [Separator] -- only take head - _ -> g -- don't change - -- |Replaces special characters in a string so that it may be used as part of a --- 'pretty' URL. Uses the default transliterations in this library -parameterize :: String -> String +-- 'pretty' URL. Uses the default transliterations in this library. +parameterize :: Text -> Text parameterize = parameterizeCustom defaultMap +{-# INLINE parameterize #-} --- |Transliterate a String with a custom transliteration table. -parameterizeCustom :: Transliterations -> String -> String -parameterizeCustom ts s = - case parsed of - Right ast -> (concatMap pCharToC . squeezeSeparators . - trimUnwanted wanted . mapMaybe (parameterizeChar ts)) - ast - - -- Note that this should never fail, since we accommodate all Unicode - -- characters as valid input. - Left err -> fail $ "Parse failed, please report a bug! Error: " ++ - show err - - where parsed = parse parser "" s - wanted :: [PChar] -- All valid URL chars - we shouldn't trim these. - wanted = Underscore : - map (Acceptable . (: [])) (['a'..'z'] ++ ['0'..'9']) - --- |Trim non-wanted elements from the beginning and end of list. -trimUnwanted :: Eq a => [a] -> [a] -> [a] -trimUnwanted wanted = dropWhile notWanted . reverse . dropWhile notWanted - . reverse - where notWanted = (`notElem` wanted) +-- |Transliterate 'Text' with a custom transliteration table. +parameterizeCustom :: Transliterations -> Text -> Text +parameterizeCustom m txt = (T.intercalate "-" . T.words) (T.unfoldr f ("", txt)) + where + f ("", t) = uncurry g <$> T.uncons t + f ((x:xs), t) = Just (x, (xs, t)) + g x xs + | (isAscii x && isAlphaNum x) || x == '_' = (toLower x, ("", xs)) + | isPunctuation x = (' ', ("", xs)) + | otherwise = + case toLower <$> M.lookupDefault " " x m of + "" -> (' ', ("",xs)) + (y:ys) -> (y, (ys,xs)) diff --git a/Text/Inflections/Transliterate.hs b/Text/Inflections/Transliterate.hs index 2d5e274..b030f0c 100644 --- a/Text/Inflections/Transliterate.hs +++ b/Text/Inflections/Transliterate.hs @@ -51,5 +51,5 @@ transliterateCustom replacement m txt = T.unfoldr f ("", txt) then (x, ("", xs)) else case M.lookupDefault replacement x m of - "" -> ('?', ("",xs)) - (y:ys) -> (y, (ys,xs)) + "" -> ('?', ("",xs)) + (y:ys) -> (y, (ys,xs)) diff --git a/inflections.cabal b/inflections.cabal index 66821ae..4beb0a7 100644 --- a/inflections.cabal +++ b/inflections.cabal @@ -49,7 +49,6 @@ library else ghc-options: -O2 -Wall build-depends: base >= 4.6 && < 4.10 - , containers >= 0.5 && < 0.6 , exceptions >= 0.6 && < 0.9 , megaparsec >= 5.0 && < 6.0 , text >= 0.2 && < 1.3 diff --git a/test/Text/Inflections/PropertiesSpec.hs b/test/Text/Inflections/PropertiesSpec.hs index 59c07d4..72d0864 100644 --- a/test/Text/Inflections/PropertiesSpec.hs +++ b/test/Text/Inflections/PropertiesSpec.hs @@ -6,6 +6,7 @@ import Data.Char (toLower) import Data.List (group) import Test.Hspec import Test.QuickCheck +import qualified Data.Text as T import Text.Inflections import Text.Inflections.Parse.Types (Word(..)) @@ -48,44 +49,45 @@ dasherizeSpacedSentence = onlyValidCharacters :: Spec onlyValidCharacters = it "returns only valid characters" (property onlyValidCharactersPredicate) - where onlyValidCharactersPredicate sf = all (`elem` (alphaNumerics ++ "-_")) $ parameterize sf + where onlyValidCharactersPredicate sf + = T.all (`elem` (alphaNumerics ++ "-_")) $ parameterize (T.pack sf) notBeginWithSeparator :: Spec notBeginWithSeparator = it "never returns a string beginning ending with a separator" (property notBeginWithSeparatorPredicate) where notBeginWithSeparatorPredicate s = - let parameterized = parameterize s in - (not . null) parameterized ==> head parameterized /= '-' + let parameterized = parameterize (T.pack s) in + (not . T.null) parameterized ==> T.head parameterized /= '-' notEndWithSeparator :: Spec notEndWithSeparator = it "never returns a string beginning with a separator" (property notBeginWithSeparatorPredicate) where notBeginWithSeparatorPredicate s = - let parameterized = parameterize s in - (not . null) parameterized ==> last parameterized /= '-' + let parameterized = parameterize (T.pack s) in + (not . T.null) parameterized ==> T.last parameterized /= '-' noMissingAlphanumerics :: Spec noMissingAlphanumerics = it "returns every alphanumeric character from the input" (property noMissingAlphanumericsPredicate) where noMissingAlphanumericsPredicate s = - let parameterized = parameterize s in + let parameterized = parameterize (T.pack s) in all (\c -> c `notElem` alphaNumerics || c `elem` (alphaNumerics ++ "-") && - c `elem` parameterized) $ map toLower s + c `elem` T.unpack parameterized) $ map toLower s noMoreThanOneHyphen :: Spec noMoreThanOneHyphen = it "never returns a string with a sequence of dashes" (property noMoreThanOneHyphenPredicate) where noMoreThanOneHyphenPredicate s = - let parameterized = parameterize s in longestSequenceOf '-' parameterized <= 1 + let parameterized = parameterize (T.pack s) + in longestSequenceOf '-' (T.unpack parameterized) <= 1 longestSequenceOf :: Char -> String -> Int longestSequenceOf _ [] = 0 longestSequenceOf c s = if null subseqLengths then 0 else maximum subseqLengths - where subseqLengths = (map length . filter (\str -> head str == c) . group) s alphaNumerics :: String