mirror of
https://github.com/typeable/inflections-hs.git
synced 2024-08-15 17:10:21 +03:00
Rewrite ‘parametrize’ and ‘parametrizeCustom’
This commit is contained in:
parent
042ea42a20
commit
ad52094310
@ -9,124 +9,40 @@
|
||||
--
|
||||
-- Parametrization for strings, useful for transliteration.
|
||||
|
||||
{-# LANGUAGE CPP #-}
|
||||
{-# LANGUAGE CPP #-}
|
||||
{-# LANGUAGE OverloadedStrings #-}
|
||||
|
||||
module Text.Inflections.Parameterize
|
||||
( parameterize
|
||||
, parameterizeCustom )
|
||||
where
|
||||
|
||||
import Control.Monad (guard)
|
||||
import Data.Char (isAscii, isAsciiLower, isAsciiUpper, isDigit, toLower)
|
||||
import Data.List (group)
|
||||
import Data.Maybe (mapMaybe)
|
||||
import Data.Char (isAscii, isAlphaNum, isPunctuation, toLower)
|
||||
import Data.Text (Text)
|
||||
import Text.Inflections.Data
|
||||
import Text.Megaparsec
|
||||
import Text.Megaparsec.String
|
||||
import qualified Data.HashMap.Strict as M
|
||||
import qualified Data.Text as T
|
||||
|
||||
#if !MIN_VERSION_base(4,8,0)
|
||||
import Control.Applicative
|
||||
#endif
|
||||
|
||||
-- | Parametrizable character.
|
||||
|
||||
data PChar
|
||||
= UCase {-# UNPACK #-} !Char -- ^ Uppercase charater
|
||||
| Acceptable String
|
||||
-- ^ Since some of the transliterating approximations expand from one
|
||||
-- Unicode to two ASCII chars (eg., œ to oe), we represent this as a
|
||||
-- String.
|
||||
| Separator -- ^ Separator
|
||||
| Underscore -- ^ Underscore
|
||||
| OtherAscii {-# UNPACK #-} !Char -- ^ Other ASCII charater
|
||||
| NonAscii {-# UNPACK #-} !Char -- ^ Non-ASCII charater
|
||||
deriving (Eq, Show)
|
||||
|
||||
-- |Matches 'acceptable' characters for parameterization purposes.
|
||||
acceptableParser :: Parser PChar
|
||||
acceptableParser = do
|
||||
c <- satisfy ((||) <$> isAsciiLower <*> isDigit)
|
||||
return $ Acceptable [c]
|
||||
|
||||
parser :: Parser [PChar]
|
||||
parser = many $ label "a parametrizable character" $ choice
|
||||
[ acceptableParser
|
||||
, UCase <$> satisfy isAsciiUpper
|
||||
, Separator <$ char '-'
|
||||
, Underscore <$ char '_'
|
||||
, OtherAscii <$> asciiChar
|
||||
, NonAscii <$> satisfy (not . isAscii) ]
|
||||
|
||||
-- | Parser that accepts rows of parametrizable characters.
|
||||
-- | Check if given char is “acceptable”, that is, it's lowercase ASCII
|
||||
-- letter or digit.
|
||||
isValidParamChar :: Char -> Bool
|
||||
isValidParamChar c = isAsciiLower c || isDigit c
|
||||
|
||||
-- |Given a Transliteration table and a PChar, returns Maybe PChar indicating
|
||||
-- how this character should appear in a URL.
|
||||
parameterizeChar :: Transliterations -> PChar -> Maybe PChar
|
||||
parameterizeChar _ (UCase c) = Just $ Acceptable [toLower c]
|
||||
parameterizeChar _ (Acceptable c) = Just $ Acceptable c
|
||||
parameterizeChar _ Separator = Just Separator
|
||||
parameterizeChar _ Underscore = Just Underscore
|
||||
parameterizeChar _ (OtherAscii _) = Just Separator
|
||||
parameterizeChar ts (NonAscii c) = transliteratePCharCustom ts c
|
||||
|
||||
-- |Look up character in transliteration list. Accepts a Transliteration map
|
||||
-- which has Chars as keys and Strings as values for approximating common
|
||||
-- international Unicode characters within the ASCII range.
|
||||
transliteratePCharCustom :: Transliterations -> Char -> Maybe PChar
|
||||
transliteratePCharCustom ts c = do
|
||||
-- We may have expanded into multiple characters during
|
||||
-- transliteration, so check validity of all characters in
|
||||
-- result.
|
||||
v <- M.lookup c ts
|
||||
guard (all isValidParamChar v)
|
||||
return (Acceptable v)
|
||||
|
||||
-- |Turns PChar tokens into their String representation.
|
||||
pCharToC :: PChar -> String
|
||||
pCharToC (UCase c) = [c]
|
||||
pCharToC (Acceptable str) = str
|
||||
pCharToC Separator = "-"
|
||||
pCharToC Underscore = "_"
|
||||
pCharToC (OtherAscii c) = [c]
|
||||
pCharToC (NonAscii c) = [c]
|
||||
|
||||
-- |Reduce sequences of separators down to only one separator.
|
||||
squeezeSeparators :: [PChar] -> [PChar]
|
||||
squeezeSeparators ps = concatMap squashSeparatorGroup $ group ps
|
||||
where squashSeparatorGroup g = case head g of
|
||||
Separator -> [Separator] -- only take head
|
||||
_ -> g -- don't change
|
||||
|
||||
-- |Replaces special characters in a string so that it may be used as part of a
|
||||
-- 'pretty' URL. Uses the default transliterations in this library
|
||||
parameterize :: String -> String
|
||||
-- 'pretty' URL. Uses the default transliterations in this library.
|
||||
parameterize :: Text -> Text
|
||||
parameterize = parameterizeCustom defaultMap
|
||||
{-# INLINE parameterize #-}
|
||||
|
||||
-- |Transliterate a String with a custom transliteration table.
|
||||
parameterizeCustom :: Transliterations -> String -> String
|
||||
parameterizeCustom ts s =
|
||||
case parsed of
|
||||
Right ast -> (concatMap pCharToC . squeezeSeparators .
|
||||
trimUnwanted wanted . mapMaybe (parameterizeChar ts))
|
||||
ast
|
||||
|
||||
-- Note that this should never fail, since we accommodate all Unicode
|
||||
-- characters as valid input.
|
||||
Left err -> fail $ "Parse failed, please report a bug! Error: " ++
|
||||
show err
|
||||
|
||||
where parsed = parse parser "" s
|
||||
wanted :: [PChar] -- All valid URL chars - we shouldn't trim these.
|
||||
wanted = Underscore :
|
||||
map (Acceptable . (: [])) (['a'..'z'] ++ ['0'..'9'])
|
||||
|
||||
-- |Trim non-wanted elements from the beginning and end of list.
|
||||
trimUnwanted :: Eq a => [a] -> [a] -> [a]
|
||||
trimUnwanted wanted = dropWhile notWanted . reverse . dropWhile notWanted
|
||||
. reverse
|
||||
where notWanted = (`notElem` wanted)
|
||||
-- |Transliterate 'Text' with a custom transliteration table.
|
||||
parameterizeCustom :: Transliterations -> Text -> Text
|
||||
parameterizeCustom m txt = (T.intercalate "-" . T.words) (T.unfoldr f ("", txt))
|
||||
where
|
||||
f ("", t) = uncurry g <$> T.uncons t
|
||||
f ((x:xs), t) = Just (x, (xs, t))
|
||||
g x xs
|
||||
| (isAscii x && isAlphaNum x) || x == '_' = (toLower x, ("", xs))
|
||||
| isPunctuation x = (' ', ("", xs))
|
||||
| otherwise =
|
||||
case toLower <$> M.lookupDefault " " x m of
|
||||
"" -> (' ', ("",xs))
|
||||
(y:ys) -> (y, (ys,xs))
|
||||
|
@ -51,5 +51,5 @@ transliterateCustom replacement m txt = T.unfoldr f ("", txt)
|
||||
then (x, ("", xs))
|
||||
else
|
||||
case M.lookupDefault replacement x m of
|
||||
"" -> ('?', ("",xs))
|
||||
(y:ys) -> (y, (ys,xs))
|
||||
"" -> ('?', ("",xs))
|
||||
(y:ys) -> (y, (ys,xs))
|
||||
|
@ -49,7 +49,6 @@ library
|
||||
else
|
||||
ghc-options: -O2 -Wall
|
||||
build-depends: base >= 4.6 && < 4.10
|
||||
, containers >= 0.5 && < 0.6
|
||||
, exceptions >= 0.6 && < 0.9
|
||||
, megaparsec >= 5.0 && < 6.0
|
||||
, text >= 0.2 && < 1.3
|
||||
|
@ -6,6 +6,7 @@ import Data.Char (toLower)
|
||||
import Data.List (group)
|
||||
import Test.Hspec
|
||||
import Test.QuickCheck
|
||||
import qualified Data.Text as T
|
||||
|
||||
import Text.Inflections
|
||||
import Text.Inflections.Parse.Types (Word(..))
|
||||
@ -48,44 +49,45 @@ dasherizeSpacedSentence =
|
||||
onlyValidCharacters :: Spec
|
||||
onlyValidCharacters =
|
||||
it "returns only valid characters" (property onlyValidCharactersPredicate)
|
||||
where onlyValidCharactersPredicate sf = all (`elem` (alphaNumerics ++ "-_")) $ parameterize sf
|
||||
where onlyValidCharactersPredicate sf
|
||||
= T.all (`elem` (alphaNumerics ++ "-_")) $ parameterize (T.pack sf)
|
||||
|
||||
notBeginWithSeparator :: Spec
|
||||
notBeginWithSeparator =
|
||||
it "never returns a string beginning ending with a separator" (property notBeginWithSeparatorPredicate)
|
||||
where
|
||||
notBeginWithSeparatorPredicate s =
|
||||
let parameterized = parameterize s in
|
||||
(not . null) parameterized ==> head parameterized /= '-'
|
||||
let parameterized = parameterize (T.pack s) in
|
||||
(not . T.null) parameterized ==> T.head parameterized /= '-'
|
||||
|
||||
notEndWithSeparator :: Spec
|
||||
notEndWithSeparator =
|
||||
it "never returns a string beginning with a separator" (property notBeginWithSeparatorPredicate)
|
||||
where
|
||||
notBeginWithSeparatorPredicate s =
|
||||
let parameterized = parameterize s in
|
||||
(not . null) parameterized ==> last parameterized /= '-'
|
||||
let parameterized = parameterize (T.pack s) in
|
||||
(not . T.null) parameterized ==> T.last parameterized /= '-'
|
||||
|
||||
noMissingAlphanumerics :: Spec
|
||||
noMissingAlphanumerics =
|
||||
it "returns every alphanumeric character from the input" (property noMissingAlphanumericsPredicate)
|
||||
where noMissingAlphanumericsPredicate s =
|
||||
let parameterized = parameterize s in
|
||||
let parameterized = parameterize (T.pack s) in
|
||||
all (\c -> c `notElem` alphaNumerics ||
|
||||
c `elem` (alphaNumerics ++ "-") &&
|
||||
c `elem` parameterized) $ map toLower s
|
||||
c `elem` T.unpack parameterized) $ map toLower s
|
||||
|
||||
noMoreThanOneHyphen :: Spec
|
||||
noMoreThanOneHyphen =
|
||||
it "never returns a string with a sequence of dashes" (property noMoreThanOneHyphenPredicate)
|
||||
where noMoreThanOneHyphenPredicate s =
|
||||
let parameterized = parameterize s in longestSequenceOf '-' parameterized <= 1
|
||||
let parameterized = parameterize (T.pack s)
|
||||
in longestSequenceOf '-' (T.unpack parameterized) <= 1
|
||||
|
||||
longestSequenceOf :: Char -> String -> Int
|
||||
longestSequenceOf _ [] = 0
|
||||
longestSequenceOf c s =
|
||||
if null subseqLengths then 0 else maximum subseqLengths
|
||||
|
||||
where subseqLengths = (map length . filter (\str -> head str == c) . group) s
|
||||
|
||||
alphaNumerics :: String
|
||||
|
Loading…
Reference in New Issue
Block a user