2014-02-23 17:34:46 +04:00
|
|
|
|
{-# LANGUAGE FlexibleContexts, NoMonomorphismRestriction #-}
|
|
|
|
|
|
|
|
|
|
module Text.Inflections
|
|
|
|
|
( dasherize
|
|
|
|
|
, parameterize
|
2014-02-25 22:39:24 +04:00
|
|
|
|
, transliterate
|
|
|
|
|
, transliterateCustom
|
2014-02-23 17:34:46 +04:00
|
|
|
|
, defaultTransliterations
|
|
|
|
|
) where
|
|
|
|
|
|
|
|
|
|
import Data.Char (toLower, isAsciiLower, isAsciiUpper, isAscii, isDigit)
|
|
|
|
|
import qualified Text.Parsec as P
|
|
|
|
|
import Control.Applicative
|
2014-02-25 13:11:37 +04:00
|
|
|
|
import Control.Monad (guard)
|
2014-02-23 17:34:46 +04:00
|
|
|
|
import qualified Text.ParserCombinators.Parsec.Char as C
|
|
|
|
|
import Data.List (group)
|
|
|
|
|
import Data.Maybe (mapMaybe)
|
|
|
|
|
import qualified Data.Map as Map
|
|
|
|
|
|
2014-02-26 01:36:36 +04:00
|
|
|
|
-- |A Map containing mappings from international characters to sequences
|
|
|
|
|
-- approximating these characters within the ASCII range.
|
2014-02-23 17:34:46 +04:00
|
|
|
|
type Transliterations = Map.Map Char String
|
|
|
|
|
|
|
|
|
|
data PChar = UCase Char
|
|
|
|
|
-- Since some of the transliterating approximations expand from
|
|
|
|
|
-- one Unicode to two ASCII chars (eg., œ to oe), we represent
|
|
|
|
|
-- this as a String.
|
|
|
|
|
| Acceptable String
|
|
|
|
|
| Separator
|
|
|
|
|
| Underscore
|
|
|
|
|
| OtherAscii Char
|
|
|
|
|
| NonAscii Char
|
|
|
|
|
deriving (Eq, Show)
|
|
|
|
|
|
|
|
|
|
-- |Replaces special characters in a string so that it may be used as part of a
|
2014-02-25 22:39:24 +04:00
|
|
|
|
-- 'pretty' URL. Uses the default transliterations in this library
|
|
|
|
|
parameterize :: String -> String
|
|
|
|
|
parameterize = parameterizeCustom defaultTransliterations
|
|
|
|
|
|
|
|
|
|
parameterizeCustom :: Transliterations -> String -> String
|
|
|
|
|
parameterizeCustom ts s =
|
2014-02-23 17:34:46 +04:00
|
|
|
|
case parsed of
|
|
|
|
|
Right ast -> (concatMap pCharToC . squeezeSeparators .
|
|
|
|
|
trimUnwanted wanted . mapMaybe (parameterizeChar ts))
|
|
|
|
|
ast
|
|
|
|
|
|
|
|
|
|
-- Note that this should never fail, since we accommodate all Unicode
|
|
|
|
|
-- characters as valid input.
|
|
|
|
|
Left err -> fail $ "Parse failed, please report a bug! Error: " ++
|
|
|
|
|
show err
|
|
|
|
|
|
|
|
|
|
where parsed = P.parse parameterizableString "" s
|
|
|
|
|
wanted :: [PChar] -- All valid URL chars - we shouldn't trim these.
|
|
|
|
|
wanted = Underscore :
|
|
|
|
|
map (Acceptable . (: [])) (['a'..'z'] ++ ['0'..'9'])
|
|
|
|
|
|
|
|
|
|
-- |Replaces underscores with dashes in the string.
|
|
|
|
|
dasherize :: String -> String
|
|
|
|
|
dasherize = map (\c -> if c == ' ' then '-' else c)
|
|
|
|
|
|
2014-02-25 22:39:24 +04:00
|
|
|
|
-- |Returns a String after default approximations for changing Unicode characters
|
|
|
|
|
-- to a valid ASCII range are applied. If you want to supplement the default
|
|
|
|
|
-- approximations with your own, you should use the transliterateCustom
|
|
|
|
|
-- function instead of transliterate.
|
|
|
|
|
transliterate :: String -> String
|
|
|
|
|
transliterate = transliterateCustom "?" defaultTransliterations
|
|
|
|
|
|
|
|
|
|
-- |Returns a String after default approximations for changing Unicode characters
|
|
|
|
|
-- to a valid ASCII range are applied.
|
|
|
|
|
transliterateCustom :: String -> Transliterations -> String -> String
|
|
|
|
|
transliterateCustom replacement ts = concatMap lookupCharTransliteration
|
|
|
|
|
where lookupCharTransliteration c =
|
|
|
|
|
if isAscii c then -- Don't bother looking up Chars in ASCII range
|
|
|
|
|
[c]
|
|
|
|
|
else
|
|
|
|
|
case Map.lookup c ts of
|
|
|
|
|
Nothing -> replacement
|
|
|
|
|
Just val -> val
|
|
|
|
|
|
|
|
|
|
-- |These default transliterations stolen from the Ruby i18n library -
|
|
|
|
|
-- https://github.com/svenfuchs/i18n/blob/master/lib/i18n/backend/transliterator.rb#L41:L69
|
|
|
|
|
defaultTransliterations :: Map.Map Char String
|
|
|
|
|
defaultTransliterations = Map.fromList [
|
|
|
|
|
('À', "A"), ('Á', "A"), ('Â', "A"), ('Ã', "A"), ('Ä', "A"), ('Å', "A"),
|
|
|
|
|
('Æ', "AE"), ('Ç', "C"), ('È', "E"), ('É', "E"), ('Ê', "E"), ('Ë', "E"),
|
|
|
|
|
('Ì', "I"), ('Í', "I"), ('Î', "I"), ('Ï', "I"), ('Ð', "D"), ('Ñ', "N"),
|
|
|
|
|
('Ò', "O"), ('Ó', "O"), ('Ô', "O"), ('Õ', "O"), ('Ö', "O"), ('×', "x"),
|
|
|
|
|
('Ø', "O"), ('Ù', "U"), ('Ú', "U"), ('Û', "U"), ('Ü', "U"), ('Ý', "Y"),
|
|
|
|
|
('Þ', "Th"), ('ß', "ss"), ('à', "a"), ('á', "a"), ('â', "a"), ('ã', "a"),
|
|
|
|
|
('ä', "a"), ('å', "a"), ('æ', "ae"), ('ç', "c"), ('è', "e"), ('é', "e"),
|
|
|
|
|
('ê', "e"), ('ë', "e"), ('ì', "i"), ('í', "i"), ('î', "i"), ('ï', "i"),
|
|
|
|
|
('ð', "d"), ('ñ', "n"), ('ò', "o"), ('ó', "o"), ('ô', "o"), ('õ', "o"),
|
|
|
|
|
('ö', "o"), ('ø', "o"), ('ù', "u"), ('ú', "u"), ('û', "u"), ('ü', "u"),
|
|
|
|
|
('ý', "y"), ('þ', "th"), ('ÿ', "y"), ('Ā', "A"), ('ā', "a"), ('Ă', "A"),
|
|
|
|
|
('ă', "a"), ('Ą', "A"), ('ą', "a"), ('Ć', "C"), ('ć', "c"), ('Ĉ', "C"),
|
|
|
|
|
('ĉ', "c"), ('Ċ', "C"), ('ċ', "c"), ('Č', "C"), ('č', "c"), ('Ď', "D"),
|
|
|
|
|
('ď', "d"), ('Đ', "D"), ('đ', "d"), ('Ē', "E"), ('ē', "e"), ('Ĕ', "E"),
|
|
|
|
|
('ĕ', "e"), ('Ė', "E"), ('ė', "e"), ('Ę', "E"), ('ę', "e"), ('Ě', "E"),
|
|
|
|
|
('ě', "e"), ('Ĝ', "G"), ('ĝ', "g"), ('Ğ', "G"), ('ğ', "g"), ('Ġ', "G"),
|
|
|
|
|
('ġ', "g"), ('Ģ', "G"), ('ģ', "g"), ('Ĥ', "H"), ('ĥ', "h"), ('Ħ', "H"),
|
|
|
|
|
('ħ', "h"), ('Ĩ', "I"), ('ĩ', "i"), ('Ī', "I"), ('ī', "i"), ('Ĭ', "I"),
|
|
|
|
|
('ĭ', "i"), ('Į', "I"), ('į', "i"), ('İ', "I"), ('ı', "i"), ('IJ', "IJ"),
|
|
|
|
|
('ij', "ij"), ('Ĵ', "J"), ('ĵ', "j"), ('Ķ', "K"), ('ķ', "k"), ('ĸ', "k"),
|
|
|
|
|
('Ĺ', "L"), ('ĺ', "l"), ('Ļ', "L"), ('ļ', "l"), ('Ľ', "L"), ('ľ', "l"),
|
|
|
|
|
('Ŀ', "L"), ('ŀ', "l"), ('Ł', "L"), ('ł', "l"), ('Ń', "N"), ('ń', "n"),
|
|
|
|
|
('Ņ', "N"), ('ņ', "n"), ('Ň', "N"), ('ň', "n"), ('ʼn', "'n"), ('Ŋ', "NG"),
|
|
|
|
|
('ŋ', "ng"), ('Ō', "O"), ('ō', "o"), ('Ŏ', "O"), ('ŏ', "o"), ('Ő', "O"),
|
|
|
|
|
('ő', "o"), ('Œ', "OE"), ('œ', "oe"), ('Ŕ', "R"), ('ŕ', "r"), ('Ŗ', "R"),
|
|
|
|
|
('ŗ', "r"), ('Ř', "R"), ('ř', "r"), ('Ś', "S"), ('ś', "s"), ('Ŝ', "S"),
|
|
|
|
|
('ŝ', "s"), ('Ş', "S"), ('ş', "s"), ('Š', "S"), ('š', "s"), ('Ţ', "T"),
|
|
|
|
|
('ţ', "t"), ('Ť', "T"), ('ť', "t"), ('Ŧ', "T"), ('ŧ', "t"), ('Ũ', "U"),
|
|
|
|
|
('ũ', "u"), ('Ū', "U"), ('ū', "u"), ('Ŭ', "U"), ('ŭ', "u"), ('Ů', "U"),
|
|
|
|
|
('ů', "u"), ('Ű', "U"), ('ű', "u"), ('Ų', "U"), ('ų', "u"), ('Ŵ', "W"),
|
|
|
|
|
('ŵ', "w"), ('Ŷ', "Y"), ('ŷ', "y"), ('Ÿ', "Y"), ('Ź', "Z"), ('ź', "z"),
|
|
|
|
|
('Ż', "Z"), ('ż', "z"), ('Ž', "Z"), ('ž', "z")]
|
|
|
|
|
|
2014-02-23 17:34:46 +04:00
|
|
|
|
|
|
|
|
|
-- Private functions
|
|
|
|
|
|
2014-02-25 22:39:24 +04:00
|
|
|
|
|
|
|
|
|
-- |Look up character in transliteration list. Accepts a Transliteration map
|
|
|
|
|
-- which has Chars as keys and Strings as values for approximating common
|
|
|
|
|
-- international Unicode characters within the ASCII range.
|
|
|
|
|
transliteratePCharCustom :: Transliterations -> Char -> Maybe PChar
|
|
|
|
|
transliteratePCharCustom ts c = do
|
|
|
|
|
-- We may have expanded into multiple characters during
|
|
|
|
|
-- transliteration, so check validity of all characters in
|
|
|
|
|
-- result.
|
|
|
|
|
v <- Map.lookup c ts
|
|
|
|
|
guard (all isValidParamChar v)
|
|
|
|
|
return (Acceptable v)
|
|
|
|
|
|
2014-02-23 17:34:46 +04:00
|
|
|
|
-- |Matches 'acceptable' characters for parameterization purposes.
|
|
|
|
|
acceptableParser :: P.Stream s m Char => P.ParsecT s u m PChar
|
|
|
|
|
acceptableParser = do
|
|
|
|
|
c <- C.satisfy isValidParamChar
|
|
|
|
|
return $ Acceptable [c]
|
|
|
|
|
|
|
|
|
|
parameterizableString :: P.Stream s m Char => P.ParsecT s u m [PChar]
|
|
|
|
|
parameterizableString = P.many $ P.choice [
|
|
|
|
|
acceptableParser
|
|
|
|
|
, UCase <$> C.satisfy isAsciiUpper
|
2014-02-25 13:11:37 +04:00
|
|
|
|
, Separator <$ C.char '-'
|
|
|
|
|
, Underscore <$ C.char '_'
|
2014-02-23 17:34:46 +04:00
|
|
|
|
, OtherAscii <$> C.satisfy isAscii
|
|
|
|
|
, NonAscii <$> C.satisfy (not . isAscii)
|
|
|
|
|
]
|
|
|
|
|
|
|
|
|
|
isValidParamChar :: Char -> Bool
|
|
|
|
|
isValidParamChar c = isAsciiLower c || isDigit c
|
|
|
|
|
|
|
|
|
|
-- |Given a Transliteration table and a PChar, returns Maybe PChar indicating
|
|
|
|
|
-- how this character should appear in a URL.
|
|
|
|
|
parameterizeChar :: Transliterations -> PChar -> Maybe PChar
|
|
|
|
|
parameterizeChar _ (UCase c) = Just $ Acceptable [toLower c]
|
|
|
|
|
parameterizeChar _ (Acceptable c) = Just $ Acceptable c
|
|
|
|
|
parameterizeChar _ Separator = Just Separator
|
|
|
|
|
parameterizeChar _ Underscore = Just Underscore
|
|
|
|
|
parameterizeChar _ (OtherAscii _) = Just Separator
|
2014-02-25 22:39:24 +04:00
|
|
|
|
parameterizeChar ts (NonAscii c) = transliteratePCharCustom ts c
|
2014-02-23 17:34:46 +04:00
|
|
|
|
|
|
|
|
|
-- |Turns PChar tokens into their String representation.
|
|
|
|
|
pCharToC :: PChar -> String
|
|
|
|
|
pCharToC (UCase c) = [c]
|
|
|
|
|
pCharToC (Acceptable str) = str
|
|
|
|
|
pCharToC Separator = "-"
|
|
|
|
|
pCharToC Underscore = "_"
|
|
|
|
|
pCharToC (OtherAscii c) = [c]
|
|
|
|
|
pCharToC (NonAscii c) = [c]
|
|
|
|
|
|
|
|
|
|
-- |Reduce sequences of separators down to only one separator.
|
|
|
|
|
squeezeSeparators :: [PChar] -> [PChar]
|
|
|
|
|
squeezeSeparators ps = concatMap squashSeparatorGroup $ group ps
|
|
|
|
|
where squashSeparatorGroup g = case head g of
|
|
|
|
|
Separator -> [Separator] -- only take head
|
|
|
|
|
_ -> g -- don't change
|
|
|
|
|
|
|
|
|
|
-- |Trim non-wanted elements from the beginning and end of list.
|
|
|
|
|
trimUnwanted :: Eq a => [a] -> [a] -> [a]
|
2014-02-25 13:11:37 +04:00
|
|
|
|
trimUnwanted wanted = dropWhile notWanted . reverse . dropWhile notWanted
|
|
|
|
|
. reverse
|
|
|
|
|
where notWanted = (`notElem` wanted)
|