First commit

This commit is contained in:
Justin Leitgeb 2014-02-23 08:34:46 -05:00
commit 2c22ed5daa
7 changed files with 337 additions and 0 deletions

22
LICENSE Normal file
View File

@ -0,0 +1,22 @@
Copyright (c) 2014 Justin Leitgeb
MIT License
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

33
README.md Normal file
View File

@ -0,0 +1,33 @@
# String Inflections for Haskell
This library is a partial port of the [String Inflector](http://api.rubyonrails.org/classes/ActiveSupport/Inflector.html) from Ruby on Rails. Specifically, it implements the [`parameterize`](http://api.rubyonrails.org/classes/ActiveSupport/Inflector.html#method-i-parameterize) and [`dasherize`](http://api.rubyonrails.org/classes/ActiveSupport/Inflector.html#method-i-dasherize) functions from the Inflector.
## Usage
The most common usage of this library at this point is to parameterize a URL. This is accomplished as follows:
```haskell
λ: parameterize defaultTransliterations "¡Feliz año nuevo!"
"feliz-ano-nuevo"
```
## Customization
Part of parameterizing strings is approximating all characters in the input encoding to ASCII characters. This library copies the character approximation table from the Ruby i18n library. This data structure is provided as `defaultCharacterTransliterations`. You can provide your own transliteration map by passing a Map structure (from Data.Map) to the `parameterize` function.
## TODO
I'd like this library to implement other functions found in the Rails inflections library. If you need one of those functions, please submit a pull request!
## Author
Justin Leitgeb <justin@stackbuilders.com>
## Contributing
You may submit pull requests to this repository on GitHub. Please add property
tests for any functional changes that you make to this library.
## License
MIT - see [the LICENSE file](LICENSE).

2
Setup.hs Normal file
View File

@ -0,0 +1,2 @@
import Distribution.Simple
main = defaultMain

156
Text/Inflections.hs Normal file
View File

@ -0,0 +1,156 @@
{-# LANGUAGE FlexibleContexts, NoMonomorphismRestriction #-}
module Text.Inflections
( dasherize
, parameterize
, defaultTransliterations
) where
import Data.Char (toLower, isAsciiLower, isAsciiUpper, isAscii, isDigit)
import qualified Text.Parsec as P
import Control.Applicative
import qualified Text.ParserCombinators.Parsec.Char as C
import Data.List (group)
import Data.Maybe (mapMaybe)
import qualified Data.Map as Map
type Transliterations = Map.Map Char String
data PChar = UCase Char
-- Since some of the transliterating approximations expand from
-- one Unicode to two ASCII chars (eg., œ to oe), we represent
-- this as a String.
| Acceptable String
| Separator
| Underscore
| OtherAscii Char
| NonAscii Char
deriving (Eq, Show)
-- |Replaces special characters in a string so that it may be used as part of a
-- 'pretty' URL.
parameterize :: Transliterations -> String -> String
parameterize ts s =
case parsed of
Right ast -> (concatMap pCharToC . squeezeSeparators .
trimUnwanted wanted . mapMaybe (parameterizeChar ts))
ast
-- Note that this should never fail, since we accommodate all Unicode
-- characters as valid input.
Left err -> fail $ "Parse failed, please report a bug! Error: " ++
show err
where parsed = P.parse parameterizableString "" s
wanted :: [PChar] -- All valid URL chars - we shouldn't trim these.
wanted = Underscore :
map (Acceptable . (: [])) (['a'..'z'] ++ ['0'..'9'])
-- |Replaces underscores with dashes in the string.
dasherize :: String -> String
dasherize = map (\c -> if c == ' ' then '-' else c)
-- Private functions
-- |Matches 'acceptable' characters for parameterization purposes.
acceptableParser :: P.Stream s m Char => P.ParsecT s u m PChar
acceptableParser = do
c <- C.satisfy isValidParamChar
return $ Acceptable [c]
parameterizableString :: P.Stream s m Char => P.ParsecT s u m [PChar]
parameterizableString = P.many $ P.choice [
acceptableParser
, UCase <$> C.satisfy isAsciiUpper
, C.char '-' >> return Separator
, C.char '_' >> return Underscore
, OtherAscii <$> C.satisfy isAscii
, NonAscii <$> C.satisfy (not . isAscii)
]
-- |Look up character in transliteration list.
transliterate :: Transliterations -> Char -> Maybe PChar
transliterate ts c =
case Map.lookup c ts of
Just v -> -- We may have expanded into multiple characters during
-- transliteration, so check validity of all characters in
-- result.
if all isValidParamChar v then
Just $ Acceptable v
else
Nothing
Nothing -> Nothing
isValidParamChar :: Char -> Bool
isValidParamChar c = isAsciiLower c || isDigit c
-- |Given a Transliteration table and a PChar, returns Maybe PChar indicating
-- how this character should appear in a URL.
parameterizeChar :: Transliterations -> PChar -> Maybe PChar
parameterizeChar _ (UCase c) = Just $ Acceptable [toLower c]
parameterizeChar _ (Acceptable c) = Just $ Acceptable c
parameterizeChar _ Separator = Just Separator
parameterizeChar _ Underscore = Just Underscore
parameterizeChar _ (OtherAscii _) = Just Separator
parameterizeChar ts (NonAscii c) = transliterate ts c
-- |Turns PChar tokens into their String representation.
pCharToC :: PChar -> String
pCharToC (UCase c) = [c]
pCharToC (Acceptable str) = str
pCharToC Separator = "-"
pCharToC Underscore = "_"
pCharToC (OtherAscii c) = [c]
pCharToC (NonAscii c) = [c]
-- |These default transliterations stolen from the Ruby i18n library -
-- https://github.com/svenfuchs/i18n/blob/master/lib/i18n/backend/transliterator.rb#L41:L69
defaultTransliterations :: Map.Map Char String
defaultTransliterations = Map.fromList [
('À', "A"), ('Á', "A"), ('Â', "A"), ('Ã', "A"), ('Ä', "A"), ('Å', "A"),
('Æ', "AE"), ('Ç', "C"), ('È', "E"), ('É', "E"), ('Ê', "E"), ('Ë', "E"),
('Ì', "I"), ('Í', "I"), ('Î', "I"), ('Ï', "I"), ('Ð', "D"), ('Ñ', "N"),
('Ò', "O"), ('Ó', "O"), ('Ô', "O"), ('Õ', "O"), ('Ö', "O"), ('×', "x"),
('Ø', "O"), ('Ù', "U"), ('Ú', "U"), ('Û', "U"), ('Ü', "U"), ('Ý', "Y"),
('Þ', "Th"), ('ß', "ss"), ('à', "a"), ('á', "a"), ('â', "a"), ('ã', "a"),
('ä', "a"), ('å', "a"), ('æ', "ae"), ('ç', "c"), ('è', "e"), ('é', "e"),
('ê', "e"), ('ë', "e"), ('ì', "i"), ('í', "i"), ('î', "i"), ('ï', "i"),
('ð', "d"), ('ñ', "n"), ('ò', "o"), ('ó', "o"), ('ô', "o"), ('õ', "o"),
('ö', "o"), ('ø', "o"), ('ù', "u"), ('ú', "u"), ('û', "u"), ('ü', "u"),
('ý', "y"), ('þ', "th"), ('ÿ', "y"), ('Ā', "A"), ('ā', "a"), ('Ă', "A"),
('ă', "a"), ('Ą', "A"), ('ą', "a"), ('Ć', "C"), ('ć', "c"), ('Ĉ', "C"),
('ĉ', "c"), ('Ċ', "C"), ('ċ', "c"), ('Č', "C"), ('č', "c"), ('Ď', "D"),
('ď', "d"), ('Đ', "D"), ('đ', "d"), ('Ē', "E"), ('ē', "e"), ('Ĕ', "E"),
('ĕ', "e"), ('Ė', "E"), ('ė', "e"), ('Ę', "E"), ('ę', "e"), ('Ě', "E"),
('ě', "e"), ('Ĝ', "G"), ('ĝ', "g"), ('Ğ', "G"), ('ğ', "g"), ('Ġ', "G"),
('ġ', "g"), ('Ģ', "G"), ('ģ', "g"), ('Ĥ', "H"), ('ĥ', "h"), ('Ħ', "H"),
('ħ', "h"), ('Ĩ', "I"), ('ĩ', "i"), ('Ī', "I"), ('ī', "i"), ('Ĭ', "I"),
('ĭ', "i"), ('Į', "I"), ('į', "i"), ('İ', "I"), ('ı', "i"), ('IJ', "IJ"),
('ij', "ij"), ('Ĵ', "J"), ('ĵ', "j"), ('Ķ', "K"), ('ķ', "k"), ('ĸ', "k"),
('Ĺ', "L"), ('ĺ', "l"), ('Ļ', "L"), ('ļ', "l"), ('Ľ', "L"), ('ľ', "l"),
('Ŀ', "L"), ('ŀ', "l"), ('Ł', "L"), ('ł', "l"), ('Ń', "N"), ('ń', "n"),
('Ņ', "N"), ('ņ', "n"), ('Ň', "N"), ('ň', "n"), ('ʼn', "'n"), ('Ŋ', "NG"),
('ŋ', "ng"), ('Ō', "O"), ('ō', "o"), ('Ŏ', "O"), ('ŏ', "o"), ('Ő', "O"),
('ő', "o"), ('Œ', "OE"), ('œ', "oe"), ('Ŕ', "R"), ('ŕ', "r"), ('Ŗ', "R"),
('ŗ', "r"), ('Ř', "R"), ('ř', "r"), ('Ś', "S"), ('ś', "s"), ('Ŝ', "S"),
('ŝ', "s"), ('Ş', "S"), ('ş', "s"), ('Š', "S"), ('š', "s"), ('Ţ', "T"),
('ţ', "t"), ('Ť', "T"), ('ť', "t"), ('Ŧ', "T"), ('ŧ', "t"), ('Ũ', "U"),
('ũ', "u"), ('Ū', "U"), ('ū', "u"), ('Ŭ', "U"), ('ŭ', "u"), ('Ů', "U"),
('ů', "u"), ('Ű', "U"), ('ű', "u"), ('Ų', "U"), ('ų', "u"), ('Ŵ', "W"),
('ŵ', "w"), ('Ŷ', "Y"), ('ŷ', "y"), ('Ÿ', "Y"), ('Ź', "Z"), ('ź', "z"),
('Ż', "Z"), ('ż', "z"), ('Ž', "Z"), ('ž', "z")]
-- |Reduce sequences of separators down to only one separator.
squeezeSeparators :: [PChar] -> [PChar]
squeezeSeparators ps = concatMap squashSeparatorGroup $ group ps
where squashSeparatorGroup g = case head g of
Separator -> [Separator] -- only take head
_ -> g -- don't change
-- |Trim non-wanted elements from the beginning and end of list.
trimUnwanted :: Eq a => [a] -> [a] -> [a]
trimUnwanted wanted xs = dropWhile notWanted $ reverse $ dropWhile notWanted $
reverse xs
where notWanted elt = elt `notElem` wanted

35
inflections.cabal Normal file
View File

@ -0,0 +1,35 @@
name: inflections
version: 0.1.0.0
synopsis: Inflections library for Haskell
description:
Inflections provides methods for singularization, pluralization, dasherizing, etc. The library is based on Rails' inflections library.
license: MIT
license-file: LICENSE
author: Justin Leitgeb
maintainer: justin@stackbuilders.com
copyright: 2014 Justin Leitgeb
category: Text
build-type: Simple
-- extra-source-files:
cabal-version: >=1.10
library
exposed-modules: Text.Inflections
ghc-options: -Wall
build-depends: base >=4.6 && <4.7, parsec, containers
default-language: Haskell2010
test-suite test
type: exitcode-stdio-1.0
hs-source-dirs: test
main-is: Suite.hs
build-depends:
inflections
, base >=4.6 && <4.7
, test-framework
, HUnit
, QuickCheck
, test-framework-hunit
, test-framework-quickcheck2
, containers
default-language: Haskell2010

8
test/Suite.hs Normal file
View File

@ -0,0 +1,8 @@
module Main where
import Test.Framework (defaultMain)
import qualified Text.Inflections.Tests
main :: IO ()
main = defaultMain Text.Inflections.Tests.tests

View File

@ -0,0 +1,81 @@
module Text.Inflections.Tests where
import Test.HUnit hiding (Test)
import Test.Framework.Providers.QuickCheck2 (testProperty)
import Test.QuickCheck
import Test.QuickCheck.Arbitrary
import Test.Framework (Test, testGroup)
import Data.List (all, group)
import Data.Char (toLower)
import Data.Map (fromList)
import Text.Inflections
{-# ANN module "HLint: ignore Use camelCase" #-}
tests :: [Test]
tests = [testGroup "dasherize"
[ testProperty "Substitutes spaces for hyphens" prop_dasherize1
],
testGroup "parameterize"
[ testProperty "Contains only valid chars"
prop_parameterize1
, testProperty "Does not begin with a separator character"
prop_parameterize2
, testProperty "Does not end in a separator character"
prop_parameterize3
, testProperty "All alphanumerics in input exist in output"
prop_parameterize4
, testProperty "Doesn't have subsequences of more than one hyphen"
prop_parameterize5
]
]
prop_dasherize1 :: String -> Property
prop_dasherize1 s =
'-' `notElem` s ==> numMatching '-' (dasherize s) == numMatching ' ' s
prop_parameterize1 :: String -> Bool
prop_parameterize1 sf = all (`elem` (alphaNumerics ++ "-_")) $
parameterize defaultTransliterations sf
prop_parameterize2 :: String -> Property
prop_parameterize2 s =
(not . null) parameterized ==> head parameterized /= '-'
where parameterized = parameterize defaultTransliterations s
prop_parameterize3 :: String -> Property
prop_parameterize3 s =
(not . null) parameterized ==> last parameterized /= '-'
where parameterized = parameterize defaultTransliterations s
prop_parameterize4 :: String -> Bool
prop_parameterize4 s = all (\c -> c `notElem` alphaNumerics ||
c `elem` (alphaNumerics ++ "-") &&
c `elem` parameterized) $ map toLower s
where parameterized = parameterize defaultTransliterations s
prop_parameterize5 :: String -> Bool
prop_parameterize5 s = longestSequenceOf '-' parameterized <= 1
where parameterized = parameterize defaultTransliterations s
-- Helper functions and shared tests
longestSequenceOf :: Char -> String -> Int
longestSequenceOf c [] = 0
longestSequenceOf c s =
if null subseqLengths then 0 else maximum subseqLengths
where subseqLengths = (map length . filter (\str -> head str == c) . group) s
numMatching char str = length $ filter (== char) str
alphaNumerics :: String
alphaNumerics = ['a'..'z'] ++ ['0'..'9']