mirror of
https://github.com/typeable/inflections-hs.git
synced 2024-10-26 08:08:21 +03:00
Expose transliterate function
This commit is contained in:
parent
c2af533223
commit
ffc7e44965
13
README.md
13
README.md
@ -6,16 +6,21 @@ This library is a partial port of the [String Inflector](http://api.rubyonrails.
|
||||
|
||||
## Usage
|
||||
|
||||
The most common usage of this library at this point is to parameterize a URL. This is accomplished as follows:
|
||||
The following examples demonstrate usage of the `parameterize` and `transliterate` functions:
|
||||
|
||||
```haskell
|
||||
λ: parameterize defaultTransliterations "¡Feliz año nuevo!"
|
||||
"feliz-ano-nuevo"
|
||||
λ: parameterize "Hola. ¿Cómo estás?"
|
||||
"hola-como-estas"
|
||||
|
||||
λ: transliterate "Hola. ¿Cómo estás?"
|
||||
"Hola. ?Como estas?"
|
||||
```
|
||||
|
||||
## Customization
|
||||
|
||||
Part of parameterizing strings is approximating all characters in the input encoding to ASCII characters. This library copies the character approximation table from the Ruby i18n library. This data structure is provided as `defaultCharacterTransliterations`. You can provide your own transliteration map by passing a Map structure (from Data.Map) to the `parameterize` function.
|
||||
Part of parameterizing strings is approximating all characters in the input encoding to ASCII characters. This library copies the character approximation table from the Ruby i18n library. This data structure is provided as `defaultTransliterations`. You can provide your own transliteration map by passing a `Map` (from `Data.Map`) to the `parameterizeCustom` function.
|
||||
|
||||
If you want to specify a custom replacement string or a custom approximation table for the `transliterate` function, call the `transliterateCustom` function instead. It accepts a `String` that is substituted for any non-ASCII character missing from the table, and a `Map` of character substitutions.
|
||||
|
||||
## TODO
|
||||
|
||||
|
@ -3,6 +3,8 @@
|
||||
module Text.Inflections
|
||||
( dasherize
|
||||
, parameterize
|
||||
, transliterate
|
||||
, transliterateCustom
|
||||
, defaultTransliterations
|
||||
) where
|
||||
|
||||
@ -29,9 +31,12 @@ data PChar = UCase Char
|
||||
deriving (Eq, Show)
|
||||
|
||||
-- |Replaces special characters in a string so that it may be used as part of a
|
||||
-- 'pretty' URL.
|
||||
parameterize :: Transliterations -> String -> String
|
||||
parameterize ts s =
|
||||
-- 'pretty' URL. Uses the default transliterations in this library.
|
||||
parameterize :: String -> String
|
||||
parameterize = parameterizeCustom defaultTransliterations
|
||||
|
||||
parameterizeCustom :: Transliterations -> String -> String
|
||||
parameterizeCustom ts s =
|
||||
case parsed of
|
||||
Right ast -> (concatMap pCharToC . squeezeSeparators .
|
||||
trimUnwanted wanted . mapMaybe (parameterizeChar ts))
|
||||
@ -51,56 +56,24 @@ parameterize ts s =
|
||||
dasherize :: String -> String
|
||||
dasherize = map (\c -> if c == ' ' then '-' else c)
|
||||
|
||||
-- |Returns a String after default approximations for changing Unicode characters
|
||||
-- to a valid ASCII range are applied. If you want to supplement the default
|
||||
-- approximations with your own, you should use the transliterateCustom
|
||||
-- function instead of transliterate.
|
||||
transliterate :: String -> String
|
||||
transliterate = transliterateCustom "?" defaultTransliterations
|
||||
|
||||
-- Private functions
|
||||
|
||||
-- |Matches 'acceptable' characters for parameterization purposes.
|
||||
acceptableParser :: P.Stream s m Char => P.ParsecT s u m PChar
|
||||
acceptableParser = do
|
||||
c <- C.satisfy isValidParamChar
|
||||
return $ Acceptable [c]
|
||||
|
||||
parameterizableString :: P.Stream s m Char => P.ParsecT s u m [PChar]
|
||||
parameterizableString = P.many $ P.choice [
|
||||
acceptableParser
|
||||
, UCase <$> C.satisfy isAsciiUpper
|
||||
, Separator <$ C.char '-'
|
||||
, Underscore <$ C.char '_'
|
||||
, OtherAscii <$> C.satisfy isAscii
|
||||
, NonAscii <$> C.satisfy (not . isAscii)
|
||||
]
|
||||
|
||||
-- |Look up character in transliteration list.
|
||||
transliterate :: Transliterations -> Char -> Maybe PChar
|
||||
transliterate ts c = do
|
||||
-- We may have expanded into multiple characters during
|
||||
-- transliteration, so check validity of all characters in
|
||||
-- result.
|
||||
v <- Map.lookup c ts
|
||||
guard (all isValidParamChar v)
|
||||
return (Acceptable v)
|
||||
|
||||
isValidParamChar :: Char -> Bool
|
||||
isValidParamChar c = isAsciiLower c || isDigit c
|
||||
|
||||
-- |Given a Transliteration table and a PChar, returns Maybe PChar indicating
|
||||
-- how this character should appear in a URL.
|
||||
parameterizeChar :: Transliterations -> PChar -> Maybe PChar
|
||||
parameterizeChar _ (UCase c) = Just $ Acceptable [toLower c]
|
||||
parameterizeChar _ (Acceptable c) = Just $ Acceptable c
|
||||
parameterizeChar _ Separator = Just Separator
|
||||
parameterizeChar _ Underscore = Just Underscore
|
||||
parameterizeChar _ (OtherAscii _) = Just Separator
|
||||
parameterizeChar ts (NonAscii c) = transliterate ts c
|
||||
|
||||
-- |Turns PChar tokens into their String representation.
|
||||
pCharToC :: PChar -> String
|
||||
pCharToC (UCase c) = [c]
|
||||
pCharToC (Acceptable str) = str
|
||||
pCharToC Separator = "-"
|
||||
pCharToC Underscore = "_"
|
||||
pCharToC (OtherAscii c) = [c]
|
||||
pCharToC (NonAscii c) = [c]
|
||||
-- |Returns a String in which every non-ASCII character is replaced by its entry
|
||||
-- in the given transliteration map, or by the replacement String if it has none.
|
||||
transliterateCustom :: String -> Transliterations -> String -> String
|
||||
transliterateCustom replacement ts = concatMap lookupCharTransliteration
|
||||
where lookupCharTransliteration c =
|
||||
if isAscii c then -- Don't bother looking up Chars in ASCII range
|
||||
[c]
|
||||
else
|
||||
case Map.lookup c ts of
|
||||
Nothing -> replacement
|
||||
Just val -> val
|
||||
|
||||
-- |These default transliterations are taken from the Ruby i18n library -
|
||||
-- https://github.com/svenfuchs/i18n/blob/master/lib/i18n/backend/transliterator.rb#L41:L69
|
||||
@ -139,6 +112,60 @@ defaultTransliterations = Map.fromList [
|
||||
('ŵ', "w"), ('Ŷ', "Y"), ('ŷ', "y"), ('Ÿ', "Y"), ('Ź', "Z"), ('ź', "z"),
|
||||
('Ż', "Z"), ('ż', "z"), ('Ž', "Z"), ('ž', "z")]
|
||||
|
||||
|
||||
-- Private functions
|
||||
|
||||
|
||||
-- |Look up character in transliteration list. Accepts a Transliteration map
|
||||
-- which has Chars as keys and Strings as values for approximating common
|
||||
-- international Unicode characters within the ASCII range.
|
||||
transliteratePCharCustom :: Transliterations -> Char -> Maybe PChar
|
||||
transliteratePCharCustom ts c = do
|
||||
-- We may have expanded into multiple characters during
|
||||
-- transliteration, so check validity of all characters in
|
||||
-- result.
|
||||
v <- Map.lookup c ts
|
||||
guard (all isValidParamChar v)
|
||||
return (Acceptable v)
|
||||
|
||||
-- |Matches 'acceptable' characters for parameterization purposes.
|
||||
acceptableParser :: P.Stream s m Char => P.ParsecT s u m PChar
|
||||
acceptableParser = do
|
||||
c <- C.satisfy isValidParamChar
|
||||
return $ Acceptable [c]
|
||||
|
||||
parameterizableString :: P.Stream s m Char => P.ParsecT s u m [PChar]
|
||||
parameterizableString = P.many $ P.choice [
|
||||
acceptableParser
|
||||
, UCase <$> C.satisfy isAsciiUpper
|
||||
, Separator <$ C.char '-'
|
||||
, Underscore <$ C.char '_'
|
||||
, OtherAscii <$> C.satisfy isAscii
|
||||
, NonAscii <$> C.satisfy (not . isAscii)
|
||||
]
|
||||
|
||||
isValidParamChar :: Char -> Bool
|
||||
isValidParamChar c = isAsciiLower c || isDigit c
|
||||
|
||||
-- |Given a Transliteration table and a PChar, returns Maybe PChar indicating
|
||||
-- how this character should appear in a URL.
|
||||
parameterizeChar :: Transliterations -> PChar -> Maybe PChar
|
||||
parameterizeChar _ (UCase c) = Just $ Acceptable [toLower c]
|
||||
parameterizeChar _ (Acceptable c) = Just $ Acceptable c
|
||||
parameterizeChar _ Separator = Just Separator
|
||||
parameterizeChar _ Underscore = Just Underscore
|
||||
parameterizeChar _ (OtherAscii _) = Just Separator
|
||||
parameterizeChar ts (NonAscii c) = transliteratePCharCustom ts c
|
||||
|
||||
-- |Turns PChar tokens into their String representation.
|
||||
pCharToC :: PChar -> String
|
||||
pCharToC (UCase c) = [c]
|
||||
pCharToC (Acceptable str) = str
|
||||
pCharToC Separator = "-"
|
||||
pCharToC Underscore = "_"
|
||||
pCharToC (OtherAscii c) = [c]
|
||||
pCharToC (NonAscii c) = [c]
|
||||
|
||||
-- |Reduce sequences of separators down to only one separator.
|
||||
squeezeSeparators :: [PChar] -> [PChar]
|
||||
squeezeSeparators ps = concatMap squashSeparatorGroup $ group ps
|
||||
|
@ -1,5 +1,5 @@
|
||||
name: inflections
|
||||
version: 0.1.0.1
|
||||
version: 0.1.0.2
|
||||
synopsis: Inflections library for Haskell
|
||||
description:
|
||||
Inflections provides methods for singularization, pluralization, dasherizing, etc. The library is based on Rails' inflections library.
|
||||
|
@ -3,6 +3,7 @@ module Text.Inflections.Tests where
|
||||
import Test.HUnit hiding (Test)
|
||||
|
||||
import Test.Framework.Providers.QuickCheck2 (testProperty)
|
||||
import Test.Framework.Providers.HUnit (testCase)
|
||||
|
||||
import Test.QuickCheck
|
||||
import Test.QuickCheck.Arbitrary
|
||||
@ -22,6 +23,12 @@ tests = [testGroup "dasherize"
|
||||
[ testProperty "Substitutes spaces for hyphens" prop_dasherize1
|
||||
],
|
||||
|
||||
testGroup "transliterate"
|
||||
[ testCase "Without substitutions" test_correctTransliterationWithoutSubs
|
||||
, testCase "With substitutions" test_correctTransliterationWithSubs
|
||||
, testCase "Missing subs" test_correctTransliterationMissingSubs
|
||||
],
|
||||
|
||||
testGroup "parameterize"
|
||||
[ testProperty "Contains only valid chars"
|
||||
prop_parameterize1
|
||||
@ -37,33 +44,42 @@ tests = [testGroup "dasherize"
|
||||
]
|
||||
|
||||
|
||||
test_correctTransliterationWithoutSubs =
|
||||
transliterate "this is a test" @?= "this is a test"
|
||||
|
||||
test_correctTransliterationWithSubs =
|
||||
transliterate "Feliz año nuevo" @?= "Feliz ano nuevo"
|
||||
|
||||
test_correctTransliterationMissingSubs =
|
||||
transliterate "Have a ❤ ñ!" @?= "Have a ? n!"
|
||||
|
||||
prop_dasherize1 :: String -> Property
|
||||
prop_dasherize1 s =
|
||||
'-' `notElem` s ==> numMatching '-' (dasherize s) == numMatching ' ' s
|
||||
|
||||
prop_parameterize1 :: String -> Bool
|
||||
prop_parameterize1 sf = all (`elem` (alphaNumerics ++ "-_")) $
|
||||
parameterize defaultTransliterations sf
|
||||
parameterize sf
|
||||
|
||||
prop_parameterize2 :: String -> Property
|
||||
prop_parameterize2 s =
|
||||
(not . null) parameterized ==> head parameterized /= '-'
|
||||
where parameterized = parameterize defaultTransliterations s
|
||||
where parameterized = parameterize s
|
||||
|
||||
prop_parameterize3 :: String -> Property
|
||||
prop_parameterize3 s =
|
||||
(not . null) parameterized ==> last parameterized /= '-'
|
||||
where parameterized = parameterize defaultTransliterations s
|
||||
where parameterized = parameterize s
|
||||
|
||||
prop_parameterize4 :: String -> Bool
|
||||
prop_parameterize4 s = all (\c -> c `notElem` alphaNumerics ||
|
||||
c `elem` (alphaNumerics ++ "-") &&
|
||||
c `elem` parameterized) $ map toLower s
|
||||
where parameterized = parameterize defaultTransliterations s
|
||||
where parameterized = parameterize s
|
||||
|
||||
prop_parameterize5 :: String -> Bool
|
||||
prop_parameterize5 s = longestSequenceOf '-' parameterized <= 1
|
||||
where parameterized = parameterize defaultTransliterations s
|
||||
where parameterized = parameterize s
|
||||
|
||||
|
||||
-- Helper functions and shared tests
|
||||
|
Loading…
Reference in New Issue
Block a user