Refactor and switch to Megaparsec 5

Close #17.
2024-09-11 06:05:33 +03:00 · 2016-06-24 11:03:23 +03:00 · 2016-06-24 11:03:23 +03:00 · 794cc8c51c
commit 794cc8c51c
parent 77aba55099
13 changed files with 178 additions and 193 deletions
--- a/.travis.yml
+++ b/.travis.yml
@ -39,7 +39,7 @@ script:
    *)       cabal test --show-details=always ;;
   esac
 - cabal sdist
- - cabal haddock | grep "100%" | wc -l | grep "15"
+ - cabal haddock | grep "100%" | wc -l | grep "14"

 # after_script:
 #  - cabal install hpc-coveralls
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -4,6 +4,8 @@

 * Switched test suite to Hspec.

+* Switched to Megaparsec 5 for parsing.
+
 * Improved documentation.

 ## Inflections 0.2.0.1
--- a/Text/Inflections.hs
+++ b/Text/Inflections.hs
@ -82,6 +82,7 @@ module Text.Inflections

    , titleize

+    , Transliterations
    , defaultMap

    , parameterize
@ -95,7 +96,6 @@ module Text.Inflections

    , parseSnakeCase
    , parseCamelCase
-    , Transliterations
    -- * Often used combinators
    , toUnderscore
    , toDashed
@ -103,12 +103,9 @@ module Text.Inflections
    )
 where

+import Text.Inflections.Data ( Transliterations, defaultMap )

-import Text.Inflections.Data ( defaultMap )
-
-import Text.Inflections.Parameterize ( Transliterations
-                                     , parameterize
-                                     , parameterizeCustom )
+import Text.Inflections.Parameterize ( parameterize, parameterizeCustom )

 import Text.Inflections.Underscore ( underscore )

--- a/Text/Inflections/Dasherize.hs
+++ b/Text/Inflections/Dasherize.hs
@ -9,25 +9,22 @@
 --
 -- Conversion to dasherized phrases.

+{-# LANGUAGE CPP #-}
+
 module Text.Inflections.Dasherize ( dasherize ) where

-import Text.Inflections.Parse.Types (Word(..))
-
 import Data.List (intercalate)
+import Text.Inflections.Parse.Types

-import Prelude (String, (.), map)
+#if MIN_VERSION_base(4,8,0)
+import Prelude hiding (Word)
+#endif

 -- | Replaces underscores in a snake_cased string with dashes (hyphens).
 --
 -- >>> dasherize [ Word "foo", Acronym "bar", Word "bazz" ]
 -- "foo-bar-bazz"
-
 dasherize
  :: [Word] -- ^ Input Words to separate with dashes
  -> String -- ^ The dasherized String
-dasherize = intercalate "-" . map toString
-
-- | Covert a 'Word' into its 'String' representation.
-toString :: Word -> String
-toString (Acronym s) = s
-toString (Word s)    = s
+dasherize = intercalate "-" . fmap unWord
--- a/Text/Inflections/Data.hs
+++ b/Text/Inflections/Data.hs
@ -11,12 +11,17 @@

 module Text.Inflections.Data where

-import Data.Map (Map, fromList)
+import Data.Map (Map)
+import qualified Data.Map as M
+
+-- |A 'Data.Map.Map' containing mappings from international characters to
+-- sequences approximating these characters within the ASCII range.
+type Transliterations = Map Char String

 -- |These default transliterations stolen from the Ruby i18n library - see
 -- <https://github.com/svenfuchs/i18n/blob/master/lib/i18n/backend/transliterator.rb#L41:L69>.
-defaultMap :: Map Char String
-defaultMap = fromList [
+defaultMap :: Transliterations
+defaultMap = M.fromList [
  ('À', "A"), ('Á', "A"), ('Â', "A"), ('Ã', "A"), ('Ä', "A"), ('Å', "A"),
  ('Æ', "AE"), ('Ç', "C"), ('È', "E"), ('É', "E"), ('Ê', "E"), ('Ë', "E"),
  ('Ì', "I"), ('Í', "I"), ('Î', "I"), ('Ï', "I"), ('Ð', "D"), ('Ñ', "N"),
--- a/Text/Inflections/Parameterize.hs
+++ b/Text/Inflections/Parameterize.hs
@ -9,29 +9,98 @@
 --
 -- Parametrization for strings, useful for transliteration.

+{-# LANGUAGE CPP #-}
+
 module Text.Inflections.Parameterize
  ( parameterize
-  , parameterizeCustom
-  , Transliterations )
+  , parameterizeCustom )
 where

-import qualified Data.Map as Map
 import Control.Monad (guard)
-import Data.Maybe (mapMaybe)
-
-import Data.Char (toLower)
-
+import Data.Char (isAscii, isAsciiLower, isAsciiUpper, isDigit, toLower)
 import Data.List (group)
-import qualified Text.Parsec as P
+import Data.Maybe (mapMaybe)
+import Text.Inflections.Data
+import Text.Megaparsec
+import Text.Megaparsec.String
+import qualified Data.Map as Map

-import Text.Inflections.Data (defaultMap)
-import Text.Inflections.Parse.Parameterizable ( PChar(..)
-                                              , parser
-                                              , isValidParamChar )
+#if !MIN_VERSION_base(4,8,0)
+import Control.Applicative
+#endif

-- |A 'Data.Map.Map' containing mappings from international characters to
-- sequences approximating these characters within the ASCII range.
-type Transliterations = Map.Map Char String
+-- | Parametrizable character.
+
+data PChar
+  = UCase      {-# UNPACK #-} !Char -- ^ Uppercase charater
+  | Acceptable String
+  -- ^ Since some of the transliterating approximations expand from one
+  -- Unicode to two ASCII chars (eg., œ to oe), we represent this as a
+  -- String.
+  | Separator                       -- ^ Separator
+  | Underscore                      -- ^ Underscore
+  | OtherAscii {-# UNPACK #-} !Char -- ^ Other ASCII charater
+  | NonAscii   {-# UNPACK #-} !Char -- ^ Non-ASCII charater
+  deriving (Eq, Show)
+
+-- |Matches 'acceptable' characters for parameterization purposes.
+acceptableParser :: Parser PChar
+acceptableParser = do
+  c <- satisfy ((||) <$> isAsciiLower <*> isDigit)
+  return $ Acceptable [c]
+
+parser :: Parser [PChar]
+parser = many $ label "a parametrizable character" $ choice
+  [ acceptableParser
+  , UCase      <$> satisfy isAsciiUpper
+  , Separator  <$  char '-'
+  , Underscore <$  char '_'
+  , OtherAscii <$> asciiChar
+  , NonAscii   <$> satisfy (not . isAscii) ]
+
+-- | Parser that accepts rows of parametrizable characters.
+-- | Check if given char is “acceptable”, that is, it's lowercase ASCII
+-- letter or digit.
+isValidParamChar :: Char -> Bool
+isValidParamChar c = isAsciiLower c || isDigit c
+
+-- |Given a Transliteration table and a PChar, returns Maybe PChar indicating
+-- how this character should appear in a URL.
+parameterizeChar :: Transliterations -> PChar -> Maybe PChar
+parameterizeChar _  (UCase c)      = Just $ Acceptable [toLower c]
+parameterizeChar _  (Acceptable c) = Just $ Acceptable c
+parameterizeChar _  Separator      = Just Separator
+parameterizeChar _  Underscore     = Just Underscore
+parameterizeChar _  (OtherAscii _) = Just Separator
+parameterizeChar ts (NonAscii c)   = transliteratePCharCustom ts c
+
+-- |Look up character in transliteration list. Accepts a Transliteration map
+-- which has Chars as keys and Strings as values for approximating common
+-- international Unicode characters within the ASCII range.
+transliteratePCharCustom :: Transliterations -> Char -> Maybe PChar
+transliteratePCharCustom ts c = do
+  -- We may have expanded into multiple characters during
+  -- transliteration, so check validity of all characters in
+  -- result.
+  v <- Map.lookup c ts
+  guard (all isValidParamChar v)
+  return (Acceptable v)
+
+-- |Turns PChar tokens into their String representation.
+pCharToC :: PChar -> String
+pCharToC (UCase c)        = [c]
+pCharToC (Acceptable str) = str
+pCharToC Separator        = "-"
+pCharToC Underscore       = "_"
+pCharToC (OtherAscii c)   = [c]
+pCharToC (NonAscii c)     = [c]
+
+-- |Reduce sequences of separators down to only one separator.
+squeezeSeparators :: [PChar] -> [PChar]
+squeezeSeparators ps = concatMap squashSeparatorGroup $ group ps
+    where squashSeparatorGroup g = case head g of
+                                     Separator -> [Separator] -- only take head
+                                     _         -> g           -- don't change

 -- |Replaces special characters in a string so that it may be used as part of a
 -- 'pretty' URL. Uses the default transliterations in this library
@ -51,49 +120,11 @@ parameterizeCustom ts s =
      Left err -> fail $ "Parse failed, please report a bug! Error: " ++
                         show err

-    where parsed = P.parse parser  "" s
+    where parsed = parse parser  "" s
          wanted :: [PChar] -- All valid URL chars - we shouldn't trim these.
          wanted = Underscore :
                   map (Acceptable . (: [])) (['a'..'z'] ++ ['0'..'9'])

-- |Look up character in transliteration list. Accepts a Transliteration map
-- which has Chars as keys and Strings as values for approximating common
-- international Unicode characters within the ASCII range.
-transliteratePCharCustom :: Transliterations -> Char -> Maybe PChar
-transliteratePCharCustom ts c = do
-  -- We may have expanded into multiple characters during
-  -- transliteration, so check validity of all characters in
-  -- result.
-  v <- Map.lookup c ts
-  guard (all isValidParamChar v)
-  return (Acceptable v)
-
-- |Given a Transliteration table and a PChar, returns Maybe PChar indicating
-- how this character should appear in a URL.
-parameterizeChar :: Transliterations -> PChar -> Maybe PChar
-parameterizeChar _  (UCase c)      = Just $ Acceptable [toLower c]
-parameterizeChar _  (Acceptable c) = Just $ Acceptable c
-parameterizeChar _  Separator      = Just Separator
-parameterizeChar _  Underscore     = Just Underscore
-parameterizeChar _  (OtherAscii _) = Just Separator
-parameterizeChar ts (NonAscii c)   = transliteratePCharCustom ts c
-
-- |Turns PChar tokens into their String representation.
-pCharToC :: PChar -> String
-pCharToC (UCase c)        = [c]
-pCharToC (Acceptable str) = str
-pCharToC Separator        = "-"
-pCharToC Underscore       = "_"
-pCharToC (OtherAscii c)   = [c]
-pCharToC (NonAscii c)     = [c]
-
-- |Reduce sequences of separators down to only one separator.
-squeezeSeparators :: [PChar] -> [PChar]
-squeezeSeparators ps = concatMap squashSeparatorGroup $ group ps
-    where squashSeparatorGroup g = case head g of
-                                     Separator -> [Separator] -- only take head
-                                     _         -> g           -- don't change
-
 -- |Trim non-wanted elements from the beginning and end of list.
 trimUnwanted :: Eq a => [a] -> [a] -> [a]
 trimUnwanted wanted = dropWhile notWanted . reverse . dropWhile notWanted
--- a/Text/Inflections/Parse/Acronym.hs
+++ b/Text/Inflections/Parse/Acronym.hs
@ -9,21 +9,19 @@
 --
 -- Parser for acronyms.

-{-# LANGUAGE FlexibleContexts, NoMonomorphismRestriction #-}
+{-# LANGUAGE CPP #-}

 module Text.Inflections.Parse.Acronym ( acronym ) where

-import qualified Text.ParserCombinators.Parsec.Char as C
-import qualified Text.Parsec as P
-import qualified Text.Parsec.Prim as Prim
-
 import Text.Inflections.Parse.Types
+import Text.Megaparsec
+import Text.Megaparsec.String

-import Control.Applicative ((<$>))
-
-import Prelude (Char, String, (.), map)
+#if MIN_VERSION_base(4,8,0)
+import Prelude hiding (Word)
+#endif

 -- | Parser that accepts a string from given collection and turns it into
 -- an 'Acronym'.
-acronym :: P.Stream s m Char => [String] -> P.ParsecT s u m Word
-acronym as = Acronym <$> P.choice (map (Prim.try . C.string) as)
+acronym :: [String] -> Parser Word
+acronym = fmap Acronym . choice . fmap string
--- a/Text/Inflections/Parse/CamelCase.hs
+++ b/Text/Inflections/Parse/CamelCase.hs
@ -9,17 +9,21 @@
 --
 -- Parser for camel case “symbols”.

-{-# LANGUAGE FlexibleContexts, NoMonomorphismRestriction #-}
+{-# LANGUAGE CPP #-}

 module Text.Inflections.Parse.CamelCase ( parseCamelCase )
 where

-import Text.Parsec
-
-import Text.Inflections.Parse.Types (Word(..))
 import Text.Inflections.Parse.Acronym (acronym)
+import Text.Inflections.Parse.Types (Word(..))
+import Text.Megaparsec
+import Text.Megaparsec.String

-import Prelude (Char, String, Either, return, ($))
+#if MIN_VERSION_base(4,8,0)
+import Prelude hiding (Word)
+#else
+import Control.Applicative
+#endif

 -- |Parses a CamelCase string.
 --
@ -28,19 +32,19 @@ import Prelude (Char, String, Either, return, ($))
 -- >>> parseCamelCase [] "foo_bar_bazz"
 -- Left "(unknown)" (line 1, column 4):
 -- unexpected '_'
-parseCamelCase :: [String] -> String -> Either ParseError [Word]
-parseCamelCase acronyms = parse (parser acronyms) "(unknown)"
+parseCamelCase
+  :: [String]          -- ^ Collection of acronyms
+  -> String            -- ^ Input
+  -> Either (ParseError Char Dec) [Word] -- ^ Result of parsing
+parseCamelCase acronyms = parse (parser acronyms) ""

-- |Recognizes an input String in CamelCase.
-parser :: Stream s m Char => [String] -> ParsecT s u m [Word]
-parser acronyms = do
-  ws <- many $ choice [ acronym acronyms, word ]
-  eof
-  return ws
+parser
+  :: [String]          -- ^ Collection of acronyms
+  -> Parser [Word]     -- ^ CamelCase parser
+parser acronyms = many (acronym acronyms <|> word) <* eof

-- | Parser that accepts lower-cased and capitalized words.
-word :: Stream s m Char => ParsecT s u m Word
+word :: Parser Word
 word = do
-  firstChar <- upper <|> lower
-  restChars <- many lower
-  return $ Word $ firstChar : restChars
+  firstChar <- upperChar <|> lowerChar
+  restChars <- many lowerChar
+  return . Word $ firstChar : restChars
--- a/Text/Inflections/Parse/Parameterizable.hs
+++ b/Text/Inflections/Parse/Parameterizable.hs
@ -1,62 +0,0 @@
-- |
-- Module      :  Text.Inflections.Parse.Parameterizable
-- Copyright   :  © 2014–2016 Justin Leitgeb
-- License     :  MIT
--
-- Maintainer  :  Justin Leitgeb <justin@stackbuilders.com>
-- Stability   :  experimental
-- Portability :  portable
--
-- Parametrizable characters and parser for them.
-
-{-# LANGUAGE CPP, FlexibleContexts, NoMonomorphismRestriction #-}
-
-module Text.Inflections.Parse.Parameterizable
-  ( parser
-  , isValidParamChar
-  , PChar(..) )
-where
-
-import Data.Char (isAsciiLower, isAsciiUpper, isAscii, isDigit)
-import qualified Text.Parsec as P
-import qualified Text.ParserCombinators.Parsec.Char as C
-
-#if !MIN_VERSION_base(4,8,0)
-import Control.Applicative
-#endif
-
-- | Parametrizable character.
-
-data PChar
-  = UCase Char -- ^ Uppercase charater
-  | Acceptable String
-  -- ^ Since some of the transliterating approximations expand from one
-  -- Unicode to two ASCII chars (eg., œ to oe), we represent this as a
-  -- String.
-  | Separator
-  | Underscore
-  | OtherAscii Char
-  | NonAscii Char
-  deriving (Eq, Show)
-
-- |Matches 'acceptable' characters for parameterization purposes.
-acceptableParser :: P.Stream s m Char => P.ParsecT s u m PChar
-acceptableParser = do
-  c <- C.satisfy isValidParamChar
-  return $ Acceptable [c]
-
-- | Parser that accepts rows of parametrizable characters.
-parser :: P.Stream s m Char => P.ParsecT s u m [PChar]
-parser = P.many $ P.choice [
-           acceptableParser
-         , UCase      <$> C.satisfy isAsciiUpper
-         , Separator  <$  C.char '-'
-         , Underscore <$  C.char '_'
-         , OtherAscii <$> C.satisfy isAscii
-         , NonAscii   <$> C.satisfy (not . isAscii)
-         ]
-
-- | Check if given char is “acceptable”, that is, it's lowercase ASCII
-- letter or digit.
-isValidParamChar :: Char -> Bool
-isValidParamChar c = isAsciiLower c || isDigit c
--- a/Text/Inflections/Parse/SnakeCase.hs
+++ b/Text/Inflections/Parse/SnakeCase.hs
@ -9,18 +9,21 @@
 --
 -- Parser for snake case “symbols”.

-{-# LANGUAGE FlexibleContexts, NoMonomorphismRestriction #-}
+{-# LANGUAGE CPP #-}

 module Text.Inflections.Parse.SnakeCase ( parseSnakeCase )
 where

-import Control.Applicative ((<$>))
-import Text.Parsec
-
-import Text.Inflections.Parse.Types (Word(..))
 import Text.Inflections.Parse.Acronym (acronym)
+import Text.Inflections.Parse.Types (Word(..))
+import Text.Megaparsec
+import Text.Megaparsec.String

-import Prelude (Char, String, Either, return)
+#if MIN_VERSION_base(4,8,0)
+import Prelude hiding (Word)
+#else
+import Control.Applicative
+#endif

 -- |Parses a snake_case string.
 --
@ -29,14 +32,18 @@ import Prelude (Char, String, Either, return)
 -- >>> parseSnakeCase [] "fooBarBazz"
 -- Left "(unknown)" (line 1, column 4):
 -- unexpected 'B'
-parseSnakeCase :: [String] -> String -> Either ParseError [Word]
-parseSnakeCase acronyms = parse (parser acronyms) "(unknown)"
+parseSnakeCase
+  :: [String]          -- ^ Collection of acronyms
+  -> String            -- ^ Input
+  -> Either (ParseError Char Dec) [Word] -- ^ Result of parsing
+parseSnakeCase acronyms = parse (parser acronyms) ""

-parser :: Stream s m Char => [String] -> ParsecT s u m [Word]
+parser
+  :: [String] -> Parser [Word]
 parser acronyms = do
  ws <- (acronym acronyms <|> word) `sepBy` char '_'
  eof
  return ws

-word :: Stream s m Char => ParsecT s u m Word
-word = Word <$> (many1 lower <|> many1 digit)
+word :: Parser Word
+word = Word <$> (some lowerChar <|> some digitChar)
--- a/Text/Inflections/Parse/Types.hs
+++ b/Text/Inflections/Parse/Types.hs
@ -9,22 +9,32 @@
 --
 -- Types used in the library.

-module Text.Inflections.Parse.Types ( Word(..), mapWord ) where
+{-# LANGUAGE CPP #-}

-import Prelude (String, Show, Eq, ($))
+module Text.Inflections.Parse.Types
+  ( Word (..) -- FIXME we should not export the constructor
+  , unWord
+  , mapWord )
+where
+
+#if MIN_VERSION_base(4,8,0)
+import Prelude hiding (Word)
+#endif

 -- | A 'String' that should be kept whole through applied inflections
 data Word
+  = Word String -- ^ A word that may be transformed by inflection
+  | Acronym String -- ^ A word that may not be transformed by inflections
+  deriving (Show, Eq)

-    -- | A word that may be transformed by inflection
-    = Word String
-
-    -- | A word that may not be transformed by inflections
-    | Acronym String
-
-    deriving (Show, Eq)
+-- | Get a 'String' from 'Word'.
+unWord :: Word -> String
+unWord (Word    s) = s
+unWord (Acronym s) = s
+{-# INLINE unWord #-}

 -- | Apply 'String' transforming function to a 'Word'.
 mapWord :: (String -> String) -> Word -> Word
-mapWord f (Word s) = Word $ f s
-mapWord f (Acronym s) = Acronym $ f s
+mapWord f (Word    s) = Word (f s)
+mapWord f (Acronym s) = Acronym (f s)
+{-# INLINE mapWord #-}
--- a/Text/Inflections/Transliterate.hs
+++ b/Text/Inflections/Transliterate.hs
@ -15,12 +15,9 @@ module Text.Inflections.Transliterate
    )
 where

-import Text.Inflections.Parameterize ( Transliterations )
-import Text.Inflections.Data (defaultMap)
-
 import Data.Char (isAscii)
 import Data.Maybe(fromMaybe)
-
+import Text.Inflections.Data
 import qualified Data.Map as Map

 -- |Returns a String after default approximations for changing Unicode characters
--- a/inflections.cabal
+++ b/inflections.cabal
@ -41,7 +41,6 @@ library
                     , Text.Inflections.Titleize
                     , Text.Inflections.Transliterate
                     , Text.Inflections.Parse.Acronym
-                     , Text.Inflections.Parse.Parameterizable
                     , Text.Inflections.Parse.SnakeCase
                     , Text.Inflections.Parse.CamelCase

@ -51,7 +50,7 @@ library
    ghc-options:      -O2 -Wall
  build-depends:       base         >= 4.6   && < 4.10
                     , containers   >= 0.5   && < 0.6
-                     , parsec       >= 3.1.9 && < 4.0
+                     , megaparsec   >= 5.0   && < 6.0
  default-language:    Haskell2010

 test-suite test
@ -63,7 +62,7 @@ test-suite test
                     , QuickCheck   >= 2.7.6 && < 3.0
                     , containers   >= 0.5   && < 0.6
                     , hspec        >= 2.0   && < 3.0
-                     , parsec       >= 3.1.9 && < 4.0
+                     , megaparsec   >= 5.0   && < 6.0
  if flag(dev)
    ghc-options:      -Wall -Werror
  else