Refactor and switch to Megaparsec 5

Close #17.
This commit is contained in:
mrkkrp 2016-06-24 11:03:23 +03:00
parent 77aba55099
commit 794cc8c51c
13 changed files with 178 additions and 193 deletions

View File

@ -39,7 +39,7 @@ script:
*) cabal test --show-details=always ;;
esac
- cabal sdist
- cabal haddock | grep "100%" | wc -l | grep "15"
- cabal haddock | grep "100%" | wc -l | grep "14"
# after_script:
# - cabal install hpc-coveralls

View File

@ -4,6 +4,8 @@
* Switched test suite to Hspec.
* Switched to Megaparsec 5 for parsing.
* Improved documentation.
## Inflections 0.2.0.1

View File

@ -82,6 +82,7 @@ module Text.Inflections
, titleize
, Transliterations
, defaultMap
, parameterize
@ -95,7 +96,6 @@ module Text.Inflections
, parseSnakeCase
, parseCamelCase
, Transliterations
-- * Often used combinators
, toUnderscore
, toDashed
@ -103,12 +103,9 @@ module Text.Inflections
)
where
import Text.Inflections.Data ( Transliterations, defaultMap )
import Text.Inflections.Data ( defaultMap )
import Text.Inflections.Parameterize ( Transliterations
, parameterize
, parameterizeCustom )
import Text.Inflections.Parameterize ( parameterize, parameterizeCustom )
import Text.Inflections.Underscore ( underscore )

View File

@ -9,25 +9,22 @@
--
-- Conversion to dasherized phrases.
{-# LANGUAGE CPP #-}
module Text.Inflections.Dasherize ( dasherize ) where
import Text.Inflections.Parse.Types (Word(..))
import Data.List (intercalate)
import Text.Inflections.Parse.Types
import Prelude (String, (.), map)
#if MIN_VERSION_base(4,8,0)
import Prelude hiding (Word)
#endif
-- | Replaces underscores in a snake_cased string with dashes (hyphens).
--
-- >>> dasherize [ Word "foo", Acronym "bar", Word "bazz" ]
-- "foo-bar-bazz"
dasherize
:: [Word] -- ^ Input Words to separate with dashes
-> String -- ^ The dasherized String
dasherize = intercalate "-" . map toString
-- | Covert a 'Word' into its 'String' representation.
toString :: Word -> String
toString (Acronym s) = s
toString (Word s) = s
dasherize = intercalate "-" . fmap unWord

View File

@ -11,12 +11,17 @@
module Text.Inflections.Data where
import Data.Map (Map, fromList)
import Data.Map (Map)
import qualified Data.Map as M
-- |A 'Data.Map.Map' containing mappings from international characters to
-- sequences approximating these characters within the ASCII range.
type Transliterations = Map Char String
-- |These default transliterations stolen from the Ruby i18n library - see
-- <https://github.com/svenfuchs/i18n/blob/master/lib/i18n/backend/transliterator.rb#L41:L69>.
defaultMap :: Map Char String
defaultMap = fromList [
defaultMap :: Transliterations
defaultMap = M.fromList [
('À', "A"), ('Á', "A"), ('Â', "A"), ('Ã', "A"), ('Ä', "A"), ('Å', "A"),
('Æ', "AE"), ('Ç', "C"), ('È', "E"), ('É', "E"), ('Ê', "E"), ('Ë', "E"),
('Ì', "I"), ('Í', "I"), ('Î', "I"), ('Ï', "I"), ('Ð', "D"), ('Ñ', "N"),

View File

@ -9,29 +9,98 @@
--
-- Parametrization for strings, useful for transliteration.
{-# LANGUAGE CPP #-}
module Text.Inflections.Parameterize
( parameterize
, parameterizeCustom
, Transliterations )
, parameterizeCustom )
where
import qualified Data.Map as Map
import Control.Monad (guard)
import Data.Maybe (mapMaybe)
import Data.Char (toLower)
import Data.Char (isAscii, isAsciiLower, isAsciiUpper, isDigit, toLower)
import Data.List (group)
import qualified Text.Parsec as P
import Data.Maybe (mapMaybe)
import Text.Inflections.Data
import Text.Megaparsec
import Text.Megaparsec.String
import qualified Data.Map as Map
import Text.Inflections.Data (defaultMap)
import Text.Inflections.Parse.Parameterizable ( PChar(..)
, parser
, isValidParamChar )
#if !MIN_VERSION_base(4,8,0)
import Control.Applicative
#endif
-- |A 'Data.Map.Map' containing mappings from international characters to
-- sequences approximating these characters within the ASCII range.
type Transliterations = Map.Map Char String
-- | Parametrizable character.
data PChar
= UCase {-# UNPACK #-} !Char -- ^ Uppercase charater
| Acceptable String
-- ^ Since some of the transliterating approximations expand from one
-- Unicode to two ASCII chars (eg., œ to oe), we represent this as a
-- String.
| Separator -- ^ Separator
| Underscore -- ^ Underscore
| OtherAscii {-# UNPACK #-} !Char -- ^ Other ASCII charater
| NonAscii {-# UNPACK #-} !Char -- ^ Non-ASCII charater
deriving (Eq, Show)
-- |Matches 'acceptable' characters for parameterization purposes.
acceptableParser :: Parser PChar
acceptableParser = do
c <- satisfy ((||) <$> isAsciiLower <*> isDigit)
return $ Acceptable [c]
parser :: Parser [PChar]
parser = many $ label "a parametrizable character" $ choice
[ acceptableParser
, UCase <$> satisfy isAsciiUpper
, Separator <$ char '-'
, Underscore <$ char '_'
, OtherAscii <$> asciiChar
, NonAscii <$> satisfy (not . isAscii) ]
-- | Parser that accepts rows of parametrizable characters.
-- | Check if given char is “acceptable”, that is, it's lowercase ASCII
-- letter or digit.
isValidParamChar :: Char -> Bool
isValidParamChar c = isAsciiLower c || isDigit c
-- |Given a Transliteration table and a PChar, returns Maybe PChar indicating
-- how this character should appear in a URL.
parameterizeChar :: Transliterations -> PChar -> Maybe PChar
parameterizeChar _ (UCase c) = Just $ Acceptable [toLower c]
parameterizeChar _ (Acceptable c) = Just $ Acceptable c
parameterizeChar _ Separator = Just Separator
parameterizeChar _ Underscore = Just Underscore
parameterizeChar _ (OtherAscii _) = Just Separator
parameterizeChar ts (NonAscii c) = transliteratePCharCustom ts c
-- |Look up character in transliteration list. Accepts a Transliteration map
-- which has Chars as keys and Strings as values for approximating common
-- international Unicode characters within the ASCII range.
transliteratePCharCustom :: Transliterations -> Char -> Maybe PChar
transliteratePCharCustom ts c = do
-- We may have expanded into multiple characters during
-- transliteration, so check validity of all characters in
-- result.
v <- Map.lookup c ts
guard (all isValidParamChar v)
return (Acceptable v)
-- |Turns PChar tokens into their String representation.
pCharToC :: PChar -> String
pCharToC (UCase c) = [c]
pCharToC (Acceptable str) = str
pCharToC Separator = "-"
pCharToC Underscore = "_"
pCharToC (OtherAscii c) = [c]
pCharToC (NonAscii c) = [c]
-- |Reduce sequences of separators down to only one separator.
squeezeSeparators :: [PChar] -> [PChar]
squeezeSeparators ps = concatMap squashSeparatorGroup $ group ps
where squashSeparatorGroup g = case head g of
Separator -> [Separator] -- only take head
_ -> g -- don't change
-- |Replaces special characters in a string so that it may be used as part of a
-- 'pretty' URL. Uses the default transliterations in this library
@ -51,49 +120,11 @@ parameterizeCustom ts s =
Left err -> fail $ "Parse failed, please report a bug! Error: " ++
show err
where parsed = P.parse parser "" s
where parsed = parse parser "" s
wanted :: [PChar] -- All valid URL chars - we shouldn't trim these.
wanted = Underscore :
map (Acceptable . (: [])) (['a'..'z'] ++ ['0'..'9'])
-- |Look up character in transliteration list. Accepts a Transliteration map
-- which has Chars as keys and Strings as values for approximating common
-- international Unicode characters within the ASCII range.
transliteratePCharCustom :: Transliterations -> Char -> Maybe PChar
transliteratePCharCustom ts c = do
-- We may have expanded into multiple characters during
-- transliteration, so check validity of all characters in
-- result.
v <- Map.lookup c ts
guard (all isValidParamChar v)
return (Acceptable v)
-- |Given a Transliteration table and a PChar, returns Maybe PChar indicating
-- how this character should appear in a URL.
parameterizeChar :: Transliterations -> PChar -> Maybe PChar
parameterizeChar _ (UCase c) = Just $ Acceptable [toLower c]
parameterizeChar _ (Acceptable c) = Just $ Acceptable c
parameterizeChar _ Separator = Just Separator
parameterizeChar _ Underscore = Just Underscore
parameterizeChar _ (OtherAscii _) = Just Separator
parameterizeChar ts (NonAscii c) = transliteratePCharCustom ts c
-- |Turns PChar tokens into their String representation.
pCharToC :: PChar -> String
pCharToC (UCase c) = [c]
pCharToC (Acceptable str) = str
pCharToC Separator = "-"
pCharToC Underscore = "_"
pCharToC (OtherAscii c) = [c]
pCharToC (NonAscii c) = [c]
-- |Reduce sequences of separators down to only one separator.
squeezeSeparators :: [PChar] -> [PChar]
squeezeSeparators ps = concatMap squashSeparatorGroup $ group ps
where squashSeparatorGroup g = case head g of
Separator -> [Separator] -- only take head
_ -> g -- don't change
-- |Trim non-wanted elements from the beginning and end of list.
trimUnwanted :: Eq a => [a] -> [a] -> [a]
trimUnwanted wanted = dropWhile notWanted . reverse . dropWhile notWanted

View File

@ -9,21 +9,19 @@
--
-- Parser for acronyms.
{-# LANGUAGE FlexibleContexts, NoMonomorphismRestriction #-}
{-# LANGUAGE CPP #-}
module Text.Inflections.Parse.Acronym ( acronym ) where
import qualified Text.ParserCombinators.Parsec.Char as C
import qualified Text.Parsec as P
import qualified Text.Parsec.Prim as Prim
import Text.Inflections.Parse.Types
import Text.Megaparsec
import Text.Megaparsec.String
import Control.Applicative ((<$>))
import Prelude (Char, String, (.), map)
#if MIN_VERSION_base(4,8,0)
import Prelude hiding (Word)
#endif
-- | Parser that accepts a string from given collection and turns it into
-- an 'Acronym'.
acronym :: P.Stream s m Char => [String] -> P.ParsecT s u m Word
acronym as = Acronym <$> P.choice (map (Prim.try . C.string) as)
acronym :: [String] -> Parser Word
acronym = fmap Acronym . choice . fmap string

View File

@ -9,17 +9,21 @@
--
-- Parser for camel case “symbols”.
{-# LANGUAGE FlexibleContexts, NoMonomorphismRestriction #-}
{-# LANGUAGE CPP #-}
module Text.Inflections.Parse.CamelCase ( parseCamelCase )
where
import Text.Parsec
import Text.Inflections.Parse.Types (Word(..))
import Text.Inflections.Parse.Acronym (acronym)
import Text.Inflections.Parse.Types (Word(..))
import Text.Megaparsec
import Text.Megaparsec.String
import Prelude (Char, String, Either, return, ($))
#if MIN_VERSION_base(4,8,0)
import Prelude hiding (Word)
#else
import Control.Applicative
#endif
-- |Parses a CamelCase string.
--
@ -28,19 +32,19 @@ import Prelude (Char, String, Either, return, ($))
-- >>> parseCamelCase [] "foo_bar_bazz"
-- Left "(unknown)" (line 1, column 4):
-- unexpected '_'
parseCamelCase :: [String] -> String -> Either ParseError [Word]
parseCamelCase acronyms = parse (parser acronyms) "(unknown)"
parseCamelCase
:: [String] -- ^ Collection of acronyms
-> String -- ^ Input
-> Either (ParseError Char Dec) [Word] -- ^ Result of parsing
parseCamelCase acronyms = parse (parser acronyms) ""
-- |Recognizes an input String in CamelCase.
parser :: Stream s m Char => [String] -> ParsecT s u m [Word]
parser acronyms = do
ws <- many $ choice [ acronym acronyms, word ]
eof
return ws
parser
:: [String] -- ^ Collection of acronyms
-> Parser [Word] -- ^ CamelCase parser
parser acronyms = many (acronym acronyms <|> word) <* eof
-- | Parser that accepts lower-cased and capitalized words.
word :: Stream s m Char => ParsecT s u m Word
word :: Parser Word
word = do
firstChar <- upper <|> lower
restChars <- many lower
return $ Word $ firstChar : restChars
firstChar <- upperChar <|> lowerChar
restChars <- many lowerChar
return . Word $ firstChar : restChars

View File

@ -1,62 +0,0 @@
-- |
-- Module : Text.Inflections.Parse.Parameterizable
-- Copyright : © 20142016 Justin Leitgeb
-- License : MIT
--
-- Maintainer : Justin Leitgeb <justin@stackbuilders.com>
-- Stability : experimental
-- Portability : portable
--
-- Parametrizable characters and parser for them.
{-# LANGUAGE CPP, FlexibleContexts, NoMonomorphismRestriction #-}
module Text.Inflections.Parse.Parameterizable
( parser
, isValidParamChar
, PChar(..) )
where
import Data.Char (isAsciiLower, isAsciiUpper, isAscii, isDigit)
import qualified Text.Parsec as P
import qualified Text.ParserCombinators.Parsec.Char as C
#if !MIN_VERSION_base(4,8,0)
import Control.Applicative
#endif
-- | Parametrizable character.
data PChar
= UCase Char -- ^ Uppercase charater
| Acceptable String
-- ^ Since some of the transliterating approximations expand from one
-- Unicode to two ASCII chars (eg., œ to oe), we represent this as a
-- String.
| Separator
| Underscore
| OtherAscii Char
| NonAscii Char
deriving (Eq, Show)
-- |Matches 'acceptable' characters for parameterization purposes.
acceptableParser :: P.Stream s m Char => P.ParsecT s u m PChar
acceptableParser = do
c <- C.satisfy isValidParamChar
return $ Acceptable [c]
-- | Parser that accepts rows of parametrizable characters.
parser :: P.Stream s m Char => P.ParsecT s u m [PChar]
parser = P.many $ P.choice [
acceptableParser
, UCase <$> C.satisfy isAsciiUpper
, Separator <$ C.char '-'
, Underscore <$ C.char '_'
, OtherAscii <$> C.satisfy isAscii
, NonAscii <$> C.satisfy (not . isAscii)
]
-- | Check if given char is “acceptable”, that is, it's lowercase ASCII
-- letter or digit.
isValidParamChar :: Char -> Bool
isValidParamChar c = isAsciiLower c || isDigit c

View File

@ -9,18 +9,21 @@
--
-- Parser for snake case “symbols”.
{-# LANGUAGE FlexibleContexts, NoMonomorphismRestriction #-}
{-# LANGUAGE CPP #-}
module Text.Inflections.Parse.SnakeCase ( parseSnakeCase )
where
import Control.Applicative ((<$>))
import Text.Parsec
import Text.Inflections.Parse.Types (Word(..))
import Text.Inflections.Parse.Acronym (acronym)
import Text.Inflections.Parse.Types (Word(..))
import Text.Megaparsec
import Text.Megaparsec.String
import Prelude (Char, String, Either, return)
#if MIN_VERSION_base(4,8,0)
import Prelude hiding (Word)
#else
import Control.Applicative
#endif
-- |Parses a snake_case string.
--
@ -29,14 +32,18 @@ import Prelude (Char, String, Either, return)
-- >>> parseSnakeCase [] "fooBarBazz"
-- Left "(unknown)" (line 1, column 4):
-- unexpected 'B'
parseSnakeCase :: [String] -> String -> Either ParseError [Word]
parseSnakeCase acronyms = parse (parser acronyms) "(unknown)"
parseSnakeCase
:: [String] -- ^ Collection of acronyms
-> String -- ^ Input
-> Either (ParseError Char Dec) [Word] -- ^ Result of parsing
parseSnakeCase acronyms = parse (parser acronyms) ""
parser :: Stream s m Char => [String] -> ParsecT s u m [Word]
parser
:: [String] -> Parser [Word]
parser acronyms = do
ws <- (acronym acronyms <|> word) `sepBy` char '_'
eof
return ws
word :: Stream s m Char => ParsecT s u m Word
word = Word <$> (many1 lower <|> many1 digit)
word :: Parser Word
word = Word <$> (some lowerChar <|> some digitChar)

View File

@ -9,22 +9,32 @@
--
-- Types used in the library.
module Text.Inflections.Parse.Types ( Word(..), mapWord ) where
{-# LANGUAGE CPP #-}
import Prelude (String, Show, Eq, ($))
module Text.Inflections.Parse.Types
( Word (..) -- FIXME we should not export the constructor
, unWord
, mapWord )
where
#if MIN_VERSION_base(4,8,0)
import Prelude hiding (Word)
#endif
-- | A 'String' that should be kept whole through applied inflections
data Word
= Word String -- ^ A word that may be transformed by inflection
| Acronym String -- ^ A word that may not be transformed by inflections
deriving (Show, Eq)
-- | A word that may be transformed by inflection
= Word String
-- | A word that may not be transformed by inflections
| Acronym String
deriving (Show, Eq)
-- | Get a 'String' from 'Word'.
unWord :: Word -> String
unWord (Word s) = s
unWord (Acronym s) = s
{-# INLINE unWord #-}
-- | Apply 'String' transforming function to a 'Word'.
mapWord :: (String -> String) -> Word -> Word
mapWord f (Word s) = Word $ f s
mapWord f (Acronym s) = Acronym $ f s
mapWord f (Word s) = Word (f s)
mapWord f (Acronym s) = Acronym (f s)
{-# INLINE mapWord #-}

View File

@ -15,12 +15,9 @@ module Text.Inflections.Transliterate
)
where
import Text.Inflections.Parameterize ( Transliterations )
import Text.Inflections.Data (defaultMap)
import Data.Char (isAscii)
import Data.Maybe(fromMaybe)
import Text.Inflections.Data
import qualified Data.Map as Map
-- |Returns a String after default approximations for changing Unicode characters

View File

@ -41,7 +41,6 @@ library
, Text.Inflections.Titleize
, Text.Inflections.Transliterate
, Text.Inflections.Parse.Acronym
, Text.Inflections.Parse.Parameterizable
, Text.Inflections.Parse.SnakeCase
, Text.Inflections.Parse.CamelCase
@ -51,7 +50,7 @@ library
ghc-options: -O2 -Wall
build-depends: base >= 4.6 && < 4.10
, containers >= 0.5 && < 0.6
, parsec >= 3.1.9 && < 4.0
, megaparsec >= 5.0 && < 6.0
default-language: Haskell2010
test-suite test
@ -63,7 +62,7 @@ test-suite test
, QuickCheck >= 2.7.6 && < 3.0
, containers >= 0.5 && < 0.6
, hspec >= 2.0 && < 3.0
, parsec >= 3.1.9 && < 4.0
, megaparsec >= 5.0 && < 6.0
if flag(dev)
ghc-options: -Wall -Werror
else