Expose transliterate function

This commit is contained in:
Justin Leitgeb 2014-02-25 13:39:24 -05:00
parent c2af533223
commit ffc7e44965
4 changed files with 110 additions and 62 deletions

View File

@ -6,16 +6,21 @@ This library is a partial port of the [String Inflector](http://api.rubyonrails.
## Usage
The most common usage of this library at this point is to parameterize a URL. This is accomplished as follows:
The following examples demonstrate usage of the `parameterize` and `transliterate` functions:
```haskell
λ: parameterize defaultTransliterations "¡Feliz año nuevo!"
"feliz-ano-nuevo"
λ: parameterize "Hola. ¿Cómo estás?"
"hola-como-estas"
λ: transliterate "Hola. ¿Cómo estás?"
"Hola. ?Como estas?"
```
## Customization
Part of parameterizing strings is approximating all characters in the input encoding to ASCII characters. This library copies the character approximation table from the Ruby i18n library. This data structure is provided as `defaultCharacterTransliterations`. You can provide your own transliteration map by passing a Map structure (from Data.Map) to the `parameterize` function.
Part of parameterizing strings is approximating all characters in the input encoding to ASCII characters. This library copies the character approximation table from the Ruby i18n library. This data structure is provided as `defaultTransliterations`. You can provide your own transliteration map by passing a Map structure (from Data.Map) to the `parameterizeCustom` function.
If you want to specify a custom default replacement or approximation table for the `transliterate` function, you should instead call the `transliterateCustom` function which accepts a String for replacements and a Map for substitution.
## TODO

View File

@ -3,6 +3,8 @@
module Text.Inflections
    ( dasherize
      -- Parameterization: 'parameterize' uses the default table,
      -- 'parameterizeCustom' takes a caller-supplied one.
    , parameterize
    , parameterizeCustom
      -- Transliteration: 'transliterate' uses the default table and
      -- replacement, 'transliterateCustom' takes both from the caller.
    , transliterate
    , transliterateCustom
    , defaultTransliterations
    ) where
@ -29,9 +31,12 @@ data PChar = UCase Char
deriving (Eq, Show)
-- |Replaces special characters in a string so that it may be used as part of a
-- 'pretty' URL.
parameterize :: Transliterations -> String -> String
parameterize ts s =
-- 'pretty' URL. Uses the default transliterations in this library
parameterize :: String -> String
-- Delegates to 'parameterizeCustom', fixing the table to
-- 'defaultTransliterations'.
parameterize str = parameterizeCustom defaultTransliterations str
parameterizeCustom :: Transliterations -> String -> String
parameterizeCustom ts s =
case parsed of
Right ast -> (concatMap pCharToC . squeezeSeparators .
trimUnwanted wanted . mapMaybe (parameterizeChar ts))
@ -51,56 +56,24 @@ parameterize ts s =
dasherize :: String -> String
-- Every space becomes a hyphen; all other characters pass through unchanged.
dasherize = map spaceToHyphen
  where
    spaceToHyphen ' ' = '-'
    spaceToHyphen ch  = ch
-- |Approximates the non-ASCII characters of a String using the
-- 'defaultTransliterations' table. A non-ASCII character that has no entry in
-- that table is replaced by the string @?@. To supply a different table or a
-- different replacement string, call 'transliterateCustom' instead.
transliterate :: String -> String
transliterate str = transliterateCustom "?" defaultTransliterations str
-- Private functions
-- |Matches 'acceptable' characters for parameterization purposes.
-- A character is accepted when 'isValidParamChar' holds for it; the result
-- wraps that single character, as a one-element String, in 'Acceptable'.
acceptableParser :: P.Stream s m Char => P.ParsecT s u m PChar
acceptableParser = do
c <- C.satisfy isValidParamChar
return $ Acceptable [c]
-- |Parses zero or more characters into 'PChar' tokens. The alternatives are
-- tried in the order listed, so a character is classified by the first
-- alternative that matches it.
parameterizableString :: P.Stream s m Char => P.ParsecT s u m [PChar]
parameterizableString = P.many $ P.choice [
-- ASCII lowercase letters and digits (see 'isValidParamChar')
acceptableParser
, UCase <$> C.satisfy isAsciiUpper
, Separator <$ C.char '-'
, Underscore <$ C.char '_'
-- any ASCII character not claimed by an earlier alternative
, OtherAscii <$> C.satisfy isAscii
, NonAscii <$> C.satisfy (not . isAscii)
]
-- |Look up character in transliteration list.
-- Yields Nothing when the character has no entry in the table, or when its
-- replacement contains a character that 'isValidParamChar' rejects; otherwise
-- yields the replacement wrapped in 'Acceptable'.
transliterate :: Transliterations -> Char -> Maybe PChar
transliterate ts c = do
-- We may have expanded into multiple characters during
-- transliteration, so check validity of all characters in
-- result.
v <- Map.lookup c ts
guard (all isValidParamChar v)
return (Acceptable v)
-- |True for ASCII lowercase letters and for digits; False for everything else.
isValidParamChar :: Char -> Bool
isValidParamChar c = isAsciiLower c || isDigit c
-- |Given a Transliteration table and a PChar, returns Maybe PChar indicating
-- how this character should appear in a URL.
-- Only the 'NonAscii' case consults the table; it is also the only case that
-- can yield Nothing.
parameterizeChar :: Transliterations -> PChar -> Maybe PChar
-- Upper-case ASCII letters are lowered and become acceptable.
parameterizeChar _ (UCase c) = Just $ Acceptable [toLower c]
parameterizeChar _ (Acceptable c) = Just $ Acceptable c
parameterizeChar _ Separator = Just Separator
parameterizeChar _ Underscore = Just Underscore
-- Any other ASCII character is turned into a separator.
parameterizeChar _ (OtherAscii _) = Just Separator
parameterizeChar ts (NonAscii c) = transliterate ts c
-- |Turns PChar tokens into their String representation.
-- Single-character tokens render as that character, 'Acceptable' renders as
-- the String it carries, 'Separator' as "-" and 'Underscore' as "_".
pCharToC :: PChar -> String
pCharToC (UCase c) = [c]
pCharToC (Acceptable str) = str
pCharToC Separator = "-"
pCharToC Underscore = "_"
pCharToC (OtherAscii c) = [c]
pCharToC (NonAscii c) = [c]
-- |Returns a String in which every non-ASCII character has been replaced
-- according to the supplied transliteration table. ASCII characters are
-- passed through unchanged without consulting the table. A non-ASCII
-- character that has no entry in the table is replaced by the replacement
-- String given as the first argument.
transliterateCustom :: String -> Transliterations -> String -> String
transliterateCustom replacement ts = concatMap approximate
  where
    approximate ch
      | isAscii ch = [ch]  -- already ASCII, so no lookup is needed
      | otherwise  = maybe replacement id (Map.lookup ch ts)
-- |These default transliterations stolen from the Ruby i18n library -
-- https://github.com/svenfuchs/i18n/blob/master/lib/i18n/backend/transliterator.rb#L41:L69
@ -139,6 +112,60 @@ defaultTransliterations = Map.fromList [
('ŵ', "w"), ('Ŷ', "Y"), ('ŷ', "y"), ('Ÿ', "Y"), ('Ź', "Z"), ('ź', "z"),
('Ż', "Z"), ('ż', "z"), ('Ž', "Z"), ('ž', "z")]
-- Private functions
-- |Looks a character up in the given transliteration table, whose keys are
-- Chars and whose values are the Strings that approximate them.
--
-- The result is Nothing when the character has no entry, and also when any
-- character of its replacement fails 'isValidParamChar' (a replacement may be
-- several characters long, so each one is checked). Otherwise the replacement
-- is returned wrapped in 'Acceptable'.
transliteratePCharCustom :: Transliterations -> Char -> Maybe PChar
transliteratePCharCustom ts c =
  case Map.lookup c ts of
    Just approximation
      | all isValidParamChar approximation -> Just (Acceptable approximation)
    _ -> Nothing
-- |Parses one character that 'isValidParamChar' accepts and wraps it, as a
-- one-character String, in an 'Acceptable' token.
acceptableParser :: P.Stream s m Char => P.ParsecT s u m PChar
acceptableParser = fmap (\ch -> Acceptable [ch]) (C.satisfy isValidParamChar)
-- |Parses zero or more characters into 'PChar' tokens. The alternatives are
-- tried in the order listed, so each character is classified by the first
-- alternative that matches it.
parameterizableString :: P.Stream s m Char => P.ParsecT s u m [PChar]
parameterizableString =
  P.many
    ( P.choice
        [ acceptableParser  -- ASCII lowercase letters and digits
        , fmap UCase (C.satisfy isAsciiUpper)
        , fmap (const Separator) (C.char '-')
        , fmap (const Underscore) (C.char '_')
        , fmap OtherAscii (C.satisfy isAscii)  -- ASCII not matched above
        , fmap NonAscii (C.satisfy (not . isAscii))
        ]
    )
-- |True only for the characters that may appear verbatim in a parameterized
-- string: ASCII lowercase letters and digits.
isValidParamChar :: Char -> Bool
isValidParamChar c = any ($ c) [isAsciiLower, isDigit]
-- |Decides how a single 'PChar' token should appear in a URL, returning
-- Nothing when the token has no representation. The transliteration table is
-- consulted only for 'NonAscii' tokens.
parameterizeChar :: Transliterations -> PChar -> Maybe PChar
parameterizeChar ts token =
  case token of
    UCase c        -> Just (Acceptable [toLower c])  -- lower-case it
    Acceptable str -> Just (Acceptable str)
    Separator      -> Just Separator
    Underscore     -> Just Underscore
    OtherAscii _   -> Just Separator                 -- becomes a separator
    NonAscii c     -> transliteratePCharCustom ts c
-- |Renders a 'PChar' token as a String: single-character tokens become that
-- one character, 'Acceptable' yields the String it carries, and the two
-- punctuation tokens yield "-" and "_".
pCharToC :: PChar -> String
pCharToC token =
  case token of
    UCase c        -> [c]
    Acceptable str -> str
    Separator      -> "-"
    Underscore     -> "_"
    OtherAscii c   -> [c]
    NonAscii c     -> [c]
-- |Reduce sequences of separators down to only one separator.
squeezeSeparators :: [PChar] -> [PChar]
squeezeSeparators ps = concatMap squashSeparatorGroup $ group ps

View File

@ -1,5 +1,5 @@
name: inflections
version: 0.1.0.1
version: 0.1.0.2
synopsis: Inflections library for Haskell
description:
Inflections provides methods for singularization, pluralization, dasherizing, etc. The library is based on Rails' inflections library.

View File

@ -3,6 +3,7 @@ module Text.Inflections.Tests where
import Test.HUnit hiding (Test)
import Test.Framework.Providers.QuickCheck2 (testProperty)
import Test.Framework.Providers.HUnit (testCase)
import Test.QuickCheck
import Test.QuickCheck.Arbitrary
@ -22,6 +23,12 @@ tests = [testGroup "dasherize"
[ testProperty "Substitutes spaces for hyphens" prop_dasherize1
],
testGroup "transliterate"
[ testCase "Without substitutions" test_correctTransliterationWithoutSubs
, testCase "With substitutions" test_correctTransliterationWithSubs
, testCase "Missing subs" test_correctTransliterationMissingSubs
],
testGroup "parameterize"
[ testProperty "Contains only valid chars"
prop_parameterize1
@ -37,33 +44,42 @@ tests = [testGroup "dasherize"
]
-- |Input that is already plain ASCII must come back from 'transliterate'
-- unchanged.
test_correctTransliterationWithoutSubs :: Assertion
test_correctTransliterationWithoutSubs =
  transliterate "this is a test" @?= "this is a test"
-- |A non-ASCII character is expected to be replaced by its ASCII
-- approximation (here ñ becomes n).
test_correctTransliterationWithSubs :: Assertion
test_correctTransliterationWithSubs =
  transliterate "Feliz año nuevo" @?= "Feliz ano nuevo"
-- |A non-ASCII character without an approximation (here ❤) is expected to be
-- replaced by "?", while a character that has one (ñ) is still approximated.
test_correctTransliterationMissingSubs :: Assertion
test_correctTransliterationMissingSubs =
  transliterate "Have a ❤ ñ!" @?= "Have a ? n!"
-- |For input that contains no hyphen, the result of 'dasherize' must give the
-- same 'numMatching' value for '-' as the input gives for ' '.
prop_dasherize1 :: String -> Property
prop_dasherize1 s = noHyphenInInput ==> hyphensAfter == spacesBefore
  where
    noHyphenInInput = '-' `notElem` s
    hyphensAfter    = numMatching '-' (dasherize s)
    spacesBefore    = numMatching ' ' s
-- |Every character of the parameterized output must be a member of
-- alphaNumerics or one of '-' and '_'.
prop_parameterize1 :: String -> Bool
prop_parameterize1 sf = all (`elem` (alphaNumerics ++ "-_")) $
parameterize defaultTransliterations sf
parameterize sf
-- |A non-empty parameterized output must not start with '-'.
prop_parameterize2 :: String -> Property
prop_parameterize2 s =
-- The non-empty precondition is what makes the use of head acceptable here.
(not . null) parameterized ==> head parameterized /= '-'
where parameterized = parameterize defaultTransliterations s
where parameterized = parameterize s
-- |A non-empty parameterized output must not end with '-'.
prop_parameterize3 :: String -> Property
prop_parameterize3 s =
-- The non-empty precondition is what makes the use of last acceptable here.
(not . null) parameterized ==> last parameterized /= '-'
where parameterized = parameterize defaultTransliterations s
where parameterized = parameterize s
-- |For each character of the lower-cased input: either it is not in
-- alphaNumerics, or it is in alphaNumerics ++ "-" and also occurs somewhere in
-- the parameterized output.
prop_parameterize4 :: String -> Bool
prop_parameterize4 s = all (\c -> c `notElem` alphaNumerics ||
-- && binds tighter than ||, so the two conditions below form one alternative
c `elem` (alphaNumerics ++ "-") &&
c `elem` parameterized) $ map toLower s
where parameterized = parameterize defaultTransliterations s
where parameterized = parameterize s
-- |The value longestSequenceOf reports for '-' in the parameterized output
-- must be at most 1.
-- NOTE(review): presumably this means no two hyphens are adjacent; confirm
-- against the definition of longestSequenceOf, which is not visible here.
prop_parameterize5 :: String -> Bool
prop_parameterize5 s = longestSequenceOf '-' parameterized <= 1
where parameterized = parameterize defaultTransliterations s
where parameterized = parameterize s
-- Helper functions and shared tests