Char and byte modules (#230)

2024-11-26 23:02:07 +03:00 · 2017-07-03 18:34:00 +07:00 · 2017-07-03 18:34:00 +07:00 · 3b9812bf76
commit 3b9812bf76
parent 45f30ae7e1
12 changed files with 633 additions and 150 deletions
--- a/.travis.yml
+++ b/.travis.yml
@ -33,7 +33,7 @@ script:
 - travis_wait 60 cabal build
 - cabal test --show-details=always --test-option=--qc-max-success=1000
 - cabal sdist
- - cabal haddock | grep "100%" | wc -l | grep "9"
+ - cabal haddock | grep "100%" | wc -l | grep "10"

 after_script:
 - export PATH=~/.cabal/bin:$PATH
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -5,10 +5,11 @@
 * Re-organized the module hierarchy. Some modules such as
  `Text.Megaparsec.Prim` do not exist anymore. Stream definitions were moved
  to `Text.Megaparsec.Stream`. Generic combinators are now re-exported from
-  the `Control.Applicative.Combinators`. Just import `Text.Megaparsec` and
-  you should be OK. Then add `Text.Megaparsec.Char` if you are working with
-  a stream of `Char`s, then add qualified modules you need (permutation
-  parsing, lexing, expression parsing, etc.).
+  the `Control.Applicative.Combinators` from the package
+  `parser-combinators`. Just import `Text.Megaparsec` and you should be OK.
+  Then add `Text.Megaparsec.Char` if you are working with a stream of
+  `Char`s, then add qualified modules you need (permutation parsing, lexing,
+  expression parsing, etc.).

 * Dropped per-stream modules, the `Parser` type synonym is to be defined
  manually by user.
@ -79,6 +80,18 @@
 * Added `takeWhileP` and `takeWhile1P` to `MonadParsec`. Added `skipWhileP`,
  `skipWhile1P` as derivatives from those primitive combinators.

+* Dropped `oneOf'` and `noneOf'` from `Text.Megaparsec.Char`. These were
+  seldom (if ever) used and are easily re-implemented.
+
+* Added `notChar` in `Text.Megaparsec.Char`.
+
+* Added `space1` in `Text.Megaprasec.Char`. This parser is like `space` but
+  requires at least one space character to be present to succeed.
+
+* Added new module `Text.Megaparsec.Byte`, which is similar to
+  `Text.Megaparsec.Char`, but target token type is `Word8` instead of
+  `Char`.
+
 ## Megaparsec 5.3.1

 * Various updates to the docs.
--- a/README.md
+++ b/README.md
@ -72,11 +72,11 @@ features some combinators that are missing in other parsing libraries:
 In addition to that, Megaparsec 6 features high-performance combinators
 similar to those found in Attoparsec:

-* `tokens` makes it easy to parse several tokens in a row. This is about 100
-  time faster than matching a string token by token. `string` and `string'`
-  are built on top of this combinator. `tokens` returns “chunk” of original
-  input, meaning that if you parse `Text`, it'll return `Text` without any
-  repacking.
+* `tokens` makes it easy to parse several tokens in a row (`string` and
+  `string'` are built on top of this primitive). This is about 100 times
+  faster than matching a string token by token. `tokens` returns “chunk” of
+  original input, meaning that if you parse `Text`, it'll return `Text`
+  without any repacking.

 * `takeWhile` and `takeWhile1` are about 150 times faster than approaches
  involving `many`, `manyTill` and other similar combinators.
--- a/Text/Megaparsec.hs
+++ b/Text/Megaparsec.hs
@ -15,8 +15,8 @@
 --
 -- In addition to the "Text.Megaparsec" module, which exports and re-exports
 -- most everything that you may need, we advise to import
-- "Text.Megaparsec.Char" if you plan to work with a stream of 'Char'
-- tokens.
+-- "Text.Megaparsec.Char" if you plan to work with a stream of 'Char' tokens
+-- or "Text.Megaparsec.Byte" if you indend to parse binary data.
 --
 -- It is common to start working with the library by defining a type synonym
 -- like this:
@ -756,6 +756,8 @@ class (Stream s, A.Alternative m, MonadPlus m)

  -- | Similar to 'takeWhileP', but fails if it can't parse at least one
  -- token.
+  --
+  -- @since 6.0.0

  takeWhile1P
    :: Maybe String    -- ^ Name for a single token in the row
@ -1227,6 +1229,8 @@ region f m = do
 {-# INLINEABLE region #-}

 -- | The same as 'takeWhileP', but discards the result.
+--
+-- @since 6.0.0

 skipWhileP :: MonadParsec e s m
  => Maybe String      -- ^ Name of a single token in the row
@ -1236,6 +1240,8 @@ skipWhileP l f = void (takeWhileP l f)
 {-# INLINE skipWhileP #-}

 -- | The same as 'takeWhile1P', but discards the result.
+--
+-- @since 6.0.0

 skipWhile1P :: MonadParsec e s m
  => Maybe String      -- ^ Name of a single token in the row
--- a/Text/Megaparsec/Byte.hs
+++ b/Text/Megaparsec/Byte.hs
@ -0,0 +1,234 @@
+-- |
+-- Module      :  Text.Megaparsec.Byte
+-- Copyright   :  © 2015–2017 Megaparsec contributors
+-- License     :  FreeBSD
+--
+-- Maintainer  :  Mark Karpov <markkarpov92@gmail.com>
+-- Stability   :  experimental
+-- Portability :  portable
+--
+-- Commonly used binary parsers.
+--
+-- @since 6.0.0
+
+{-# LANGUAGE ScopedTypeVariables #-}
+{-# LANGUAGE TypeFamilies        #-}
+
+module Text.Megaparsec.Byte
+  ( -- * Simple parsers
+    newline
+  , crlf
+  , eol
+  , tab
+  , space
+  , space1
+    -- * Categories of characters
+  , controlChar
+  , spaceChar
+  , upperChar
+  , lowerChar
+  , letterChar
+  , alphaNumChar
+  , printChar
+  , digitChar
+  , octDigitChar
+  , hexDigitChar
+  , asciiChar
+    -- * More general parsers
+  , C.char
+  , char'
+  , C.anyChar
+  , C.notChar
+  , C.oneOf
+  , C.noneOf
+  , C.satisfy
+    -- * Sequence of bytes
+  , C.string
+  , C.string' )
+where
+
+import Control.Applicative
+import Data.Char
+import Data.Proxy
+import Data.Word (Word8)
+import qualified Text.Megaparsec.Char as C
+
+import Text.Megaparsec
+
+----------------------------------------------------------------------------
+-- Simple parsers
+
+-- | Parse a newline byte.
+
+newline :: (MonadParsec e s m, Token s ~ Word8) => m (Token s)
+newline = C.char 10
+{-# INLINE newline #-}
+
+-- | Parse a carriage return character followed by a newline character.
+-- Return the sequence of characters parsed.
+
+crlf :: forall e s m. (MonadParsec e s m, Token s ~ Word8) => m (Tokens s)
+crlf = C.string (tokensToChunk (Proxy :: Proxy s) [13,10])
+{-# INLINE crlf #-}
+
+-- | Parse a CRLF (see 'crlf') or LF (see 'newline') end of line. Return the
+-- sequence of characters parsed.
+
+eol :: forall e s m. (MonadParsec e s m, Token s ~ Word8) => m (Tokens s)
+eol = (tokenToChunk (Proxy :: Proxy s) <$> newline)
+  <|> crlf
+  <?> "end of line"
+{-# INLINE eol #-}
+
+-- | Parse a tab character.
+
+tab :: (MonadParsec e s m, Token s ~ Word8) => m (Token s)
+tab = C.char 9
+{-# INLINE tab #-}
+
+-- | Skip /zero/ or more white space characters.
+--
+-- See also: 'skipMany' and 'spaceChar'.
+
+space :: (MonadParsec e s m, Token s ~ Word8) => m ()
+space = skipWhileP (Just "white space") isSpace'
+{-# INLINE space #-}
+
+-- | Skip /one/ or more white space characters.
+--
+-- See also: 'skipSome' and 'spaceChar'.
+
+space1 :: (MonadParsec e s m, Token s ~ Word8) => m ()
+space1 = skipWhile1P (Just "white space") isSpace'
+{-# INLINE space1 #-}
+
+----------------------------------------------------------------------------
+-- Categories of characters
+
+-- | Parse a control character (a non-printing character of the Latin-1
+-- subset of Unicode).
+
+controlChar :: (MonadParsec e s m, Token s ~ Word8) => m (Token s)
+controlChar = C.satisfy (isControl . toChar) <?> "control character"
+{-# INLINE controlChar #-}
+
+-- | Parse a space character, and the control characters: tab, newline,
+-- carriage return, form feed, and vertical tab.
+
+spaceChar :: (MonadParsec e s m, Token s ~ Word8) => m (Token s)
+spaceChar = C.satisfy isSpace' <?> "white space"
+{-# INLINE spaceChar #-}
+
+-- | Parse an upper-case character.
+
+upperChar :: (MonadParsec e s m, Token s ~ Word8) => m (Token s)
+upperChar = C.satisfy (isUpper . toChar) <?> "uppercase letter"
+{-# INLINE upperChar #-}
+
+-- | Parse a lower-case alphabetic character.
+
+lowerChar :: (MonadParsec e s m, Token s ~ Word8) => m (Token s)
+lowerChar = C.satisfy (isLower . toChar) <?> "lowercase letter"
+{-# INLINE lowerChar #-}
+
+-- | Parse an alphabetic character: lower-case or upper-case.
+
+letterChar :: (MonadParsec e s m, Token s ~ Word8) => m (Token s)
+letterChar = C.satisfy (isLetter . toChar) <?> "letter"
+{-# INLINE letterChar #-}
+
+-- | Parse an alphabetic or numeric digit Unicode characters.
+--
+-- Note that the numeric digits outside the ASCII range are parsed by this
+-- parser but not by 'digitChar'. Such digits may be part of identifiers but
+-- are not used by the printer and reader to represent numbers.
+
+alphaNumChar :: (MonadParsec e s m, Token s ~ Word8) => m (Token s)
+alphaNumChar = C.satisfy (isAlphaNum . toChar) <?> "alphanumeric character"
+{-# INLINE alphaNumChar #-}
+
+-- | Parse a printable character: letter, number, mark, punctuation, symbol
+-- or space.
+
+printChar :: (MonadParsec e s m, Token s ~ Word8) => m (Token s)
+printChar = C.satisfy (isPrint . toChar) <?> "printable character"
+{-# INLINE printChar #-}
+
+-- | Parse an ASCII digit, i.e between “0” and “9”.
+
+digitChar :: (MonadParsec e s m, Token s ~ Word8) => m (Token s)
+digitChar = C.satisfy isDigit' <?> "digit"
+  where
+    isDigit' x = x >= 48 && x <= 57
+{-# INLINE digitChar #-}
+
+-- | Parse an octal digit, i.e. between “0” and “7”.
+
+octDigitChar :: (MonadParsec e s m, Token s ~ Word8) => m (Token s)
+octDigitChar = C.satisfy isOctDigit' <?> "octal digit"
+  where
+    isOctDigit' x = x >= 48 && x <= 55
+{-# INLINE octDigitChar #-}
+
+-- | Parse a hexadecimal digit, i.e. between “0” and “9”, or “a” and “f”, or
+-- “A” and “F”.
+
+hexDigitChar :: (MonadParsec e s m, Token s ~ Word8) => m (Token s)
+hexDigitChar = C.satisfy (isHexDigit . toChar) <?> "hexadecimal digit"
+{-# INLINE hexDigitChar #-}
+
+-- | Parse a character from the first 128 characters of the Unicode
+-- character set, corresponding to the ASCII character set.
+
+asciiChar :: (MonadParsec e s m, Token s ~ Word8) => m (Token s)
+asciiChar = C.satisfy (< 128) <?> "ASCII character"
+{-# INLINE asciiChar #-}
+
+----------------------------------------------------------------------------
+-- More general parsers
+
+-- | The same as 'char' but case-insensitive. This parser returns the
+-- actually parsed character preserving its case.
+--
+-- >>> parseTest (char' 'e') "E"
+-- 'E'
+-- >>> parseTest (char' 'e') "G"
+-- 1:1:
+-- unexpected 'G'
+-- expecting 'E' or 'e'
+
+char' :: (MonadParsec e s m, Token s ~ Word8) => Token s -> m (Token s)
+char' c = choice [C.char c, C.char (swapCase c)]
+  where
+    swapCase x
+      | isUpper g = fromChar (toLower g)
+      | isLower g = fromChar (toUpper g)
+      | otherwise = x
+      where
+        g = toChar x
+{-# INLINE char' #-}
+
+----------------------------------------------------------------------------
+-- Helpers
+
+-- | 'Word8'-specialized version of 'isSpace'.
+
+isSpace' :: Word8 -> Bool
+isSpace' x
+  | x >= 9 && x <= 13 = True
+  | x == 32           = True
+  | x == 160          = True
+  | otherwise         = False
+{-# INLINE isSpace' #-}
+
+-- | Convert a byte to char.
+
+toChar :: Word8 -> Char
+toChar = chr . fromIntegral
+{-# INLINE toChar #-}
+
+-- | Convert a char to byte.
+
+fromChar :: Char -> Word8
+fromChar = fromIntegral . ord
+{-# INLINE fromChar #-}
--- a/Text/Megaparsec/Char.hs
+++ b/Text/Megaparsec/Char.hs
@ -24,6 +24,7 @@ module Text.Megaparsec.Char
  , eol
  , tab
  , space
+  , space1
    -- * Categories of characters
  , controlChar
  , spaceChar
@ -48,10 +49,9 @@ module Text.Megaparsec.Char
  , char
  , char'
  , anyChar
+  , notChar
  , oneOf
-  , oneOf'
  , noneOf
-  , noneOf'
  , satisfy
    -- * Sequence of characters
  , string
@ -69,8 +69,8 @@ import qualified Data.Set             as E
 import Text.Megaparsec

 #if !MIN_VERSION_base(4,8,0)
-import Data.Foldable (Foldable (), any, elem, notElem)
-import Prelude hiding (any, elem, notElem)
+import Data.Foldable (Foldable (), elem, notElem)
+import Prelude hiding (elem, notElem)
 #endif

 ----------------------------------------------------------------------------
@ -112,6 +112,16 @@ space :: (MonadParsec e s m, Token s ~ Char) => m ()
 space = skipWhileP (Just "white space") isSpace
 {-# INLINE space #-}

+-- | Skip /one/ or more white space characters.
+--
+-- See also: 'skipSome' and 'spaceChar'.
+--
+-- @since 6.0.0
+
+space1 :: (MonadParsec e s m, Token s ~ Char) => m ()
+space1 = skipWhile1P (Just "white space") isSpace
+{-# INLINE space1 #-}
+
 ----------------------------------------------------------------------------
 -- Categories of characters

@ -285,7 +295,7 @@ categoryName = \case
 --
 -- > semicolon = char ';'

-char :: (MonadParsec e s m, Token s ~ Char) => Token s -> m (Token s)
+char :: MonadParsec e s m => Token s -> m (Token s)
 char c = token testChar (Just c)
  where
    f x = Tokens (x:|[])
@ -306,7 +316,7 @@ char c = token testChar (Just c)
 -- expecting 'E' or 'e'

 char' :: (MonadParsec e s m, Token s ~ Char) => Token s -> m (Token s)
-char' c = choice [char c, char $ swapCase c]
+char' c = choice [char c, char (swapCase c)]
  where
    swapCase x
      | isUpper x = toLower x
@ -316,10 +326,19 @@ char' c = choice [char c, char $ swapCase c]

 -- | This parser succeeds for any character. Returns the parsed character.

-anyChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
+anyChar :: MonadParsec e s m => m (Token s)
 anyChar = satisfy (const True) <?> "character"
 {-# INLINE anyChar #-}

+-- | Match any character but the given one. It's a good idea to attach a
+-- 'label' to this parser manually.
+--
+-- @since 6.0.0
+
+notChar :: MonadParsec e s m => Token s -> m (Token s)
+notChar c = satisfy (/= c)
+{-# INLINE notChar #-}
+
 -- | @'oneOf' cs@ succeeds if the current character is in the supplied
 -- collection of characters @cs@. Returns the parsed character. Note that
 -- this parser cannot automatically generate the “expected” component of
@ -329,43 +348,36 @@ anyChar = satisfy (const True) <?> "character"
 -- See also: 'satisfy'.
 --
 -- > digit = oneOf ['0'..'9'] <?> "digit"
+--
+-- __Performance note__: prefer 'satisfy' when you can because it's faster
+-- when you have only a couple of tokens to compare to:
+--
+-- > quoteFast = satisfy (\x -> x == '\'' || x == '\"')
+-- > quoteSlow = oneOf "'\""

-oneOf :: (Foldable f, MonadParsec e s m, Token s ~ Char)
-  => f (Token s) -> m (Token s)
+oneOf :: (Foldable f, MonadParsec e s m)
+  => f (Token s)
+  -> m (Token s)
 oneOf cs = satisfy (`elem` cs)
 {-# INLINE oneOf #-}

-- | The same as 'oneOf', but case-insensitive. Returns the parsed character
-- preserving its case.
--
-- > vowel = oneOf' "aeiou" <?> "vowel"
-
-oneOf' :: (Foldable f, MonadParsec e s m, Token s ~ Char)
-  => f (Token s)
-  -> m (Token s)
-oneOf' cs = satisfy (`elemi` cs)
-{-# INLINE oneOf' #-}
-
 -- | As the dual of 'oneOf', @noneOf cs@ succeeds if the current character
 -- /not/ in the supplied list of characters @cs@. Returns the parsed
-- character.
+-- character. Note that this parser cannot automatically generate the
+-- “expected” component of error message, so usually you should label it
+-- manually with 'label' or ('<?>').
+--
+-- See also: 'satisfy'.
+--
+-- __Performance note__: prefer 'satisfy' and 'notChar' when you can because
+-- it's faster.

-noneOf :: (Foldable f, MonadParsec e s m, Token s ~ Char)
+noneOf :: (Foldable f, MonadParsec e s m)
  => f (Token s)
  -> m (Token s)
 noneOf cs = satisfy (`notElem` cs)
 {-# INLINE noneOf #-}

-- | The same as 'noneOf', but case-insensitive.
--
-- > consonant = noneOf' "aeiou" <?> "consonant"
-
-noneOf' :: (Foldable f, MonadParsec e s m, Token s ~ Char)
-  => f (Token s)
-  -> m (Token s)
-noneOf' cs = satisfy (`notElemi` cs)
-{-# INLINE noneOf' #-}
-
 -- | The parser @'satisfy' f@ succeeds for any character for which the
 -- supplied function @f@ returns 'True'. Returns the character that is
 -- actually parsed.
@ -373,7 +385,7 @@ noneOf' cs = satisfy (`notElemi` cs)
 -- > digitChar = satisfy isDigit <?> "digit"
 -- > oneOf cs  = satisfy (`elem` cs)

-satisfy :: (MonadParsec e s m, Token s ~ Char)
+satisfy :: MonadParsec e s m
  => (Token s -> Bool) -- ^ Predicate to apply
  -> m (Token s)
 satisfy f = token testChar Nothing
@ -409,24 +421,3 @@ string' :: (MonadParsec e s m, CI.FoldCase (Tokens s))
  -> m (Tokens s)
 string' = tokens ((==) `on` CI.mk)
 {-# INLINE string' #-}
-
----------------------------------------------------------------------------
-- Helpers
-
-- | Case-insensitive equality test for characters.
-
-casei :: Char -> Char -> Bool
-casei x y = toUpper x == toUpper y
-{-# INLINE casei #-}
-
-- | Case-insensitive 'elem'.
-
-elemi :: Foldable f => Char -> f Char -> Bool
-elemi = any . casei
-{-# INLINE elemi #-}
-
-- | Case-insensitive 'notElem'.
-
-notElemi :: Foldable f => Char -> f Char -> Bool
-notElemi c = not . elemi c
-{-# INLINE notElemi #-}
--- a/Text/Megaparsec/Error.hs
+++ b/Text/Megaparsec/Error.hs
@ -36,6 +36,7 @@ where

 import Control.DeepSeq
 import Control.Exception
+import Data.Char (chr)
 import Data.Data (Data)
 import Data.List (intercalate)
 import Data.List.NonEmpty (NonEmpty (..))
@ -44,6 +45,7 @@ import Data.Semigroup
 import Data.Set (Set)
 import Data.Typeable (Typeable)
 import Data.Void
+import Data.Word (Word8)
 import GHC.Generics
 import Prelude hiding (concat)
 import qualified Data.List.NonEmpty as NE
@ -199,6 +201,9 @@ class ShowToken a where
 instance ShowToken Char where
  showTokens = stringPretty

+instance ShowToken Word8 where
+  showTokens = stringPretty . fmap (chr . fromIntegral)
+
 -- | The type class defines how to print custom data component of
 -- 'ParseError'.
 --
--- a/Text/Megaparsec/Lexer.hs
+++ b/Text/Megaparsec/Lexer.hs
@ -68,16 +68,18 @@ import qualified Text.Megaparsec.Char as C
 ----------------------------------------------------------------------------
 -- White space

-- | @space spaceChar lineComment blockComment@ produces parser that can
-- parse white space in general. It's expected that you create such a parser
-- once and pass it to other functions in this module as needed (when you
-- see @spaceConsumer@ in documentation, usually it means that something
-- like 'space' is expected there).
+-- | @space sc lineComment blockComment@ produces parser that can parse
+-- white space in general. It's expected that you create such a parser once
+-- and pass it to other functions in this module as needed (when you see
+-- @spaceConsumer@ in documentation, usually it means that something like
+-- 'space' is expected there).
 --
-- @spaceChar@ is used to parse trivial space characters. You can use
-- 'C.spaceChar' from "Text.Megaparsec.Char" for this purpose as well as
-- your own parser (if you don't want to automatically consume newlines, for
-- example).
+-- @sc@ is used to parse blocks of space characters. You can use 'C.space1'
+-- from "Text.Megaparsec.Char" for this purpose as well as your own parser
+-- (if you don't want to automatically consume newlines, for example). Make
+-- sure the parser does not succeed on empty input though. In earlier
+-- version 'C.spaceChar' was recommended, but now parsers based on
+-- 'takeWhile1P' are preferred because of their speed.
 --
 -- @lineComment@ is used to parse line comments. You can use
 -- 'skipLineComment' if you don't need anything special.
@ -94,7 +96,8 @@ import qualified Text.Megaparsec.Char as C
 -- of the file).

 space :: MonadParsec e s m
-  => m () -- ^ A parser for a space character (e.g. @'void' 'C.spaceChar'@)
+  => m () -- ^ A parser for space characters which does not accept empty
+          -- input (e.g. 'C.space1')
  -> m () -- ^ A parser for a line comment (e.g. 'skipLineComment')
  -> m () -- ^ A parser for a block comment (e.g. 'skipBlockComment')
  -> m ()
@ -128,7 +131,7 @@ lexeme spc p = p <* spc
 -- > colon     = symbol ":"
 -- > dot       = symbol "."

-symbol :: (MonadParsec e s m, Token s ~ Char)
+symbol :: MonadParsec e s m
  => m ()              -- ^ How to consume white space after lexeme
  -> Tokens s          -- ^ String to parse
  -> m (Tokens s)
@ -137,7 +140,7 @@ symbol spc = lexeme spc . C.string
 -- | Case-insensitive version of 'symbol'. This may be helpful if you're
 -- working with case-insensitive languages.

-symbol' :: (MonadParsec e s m, Token s ~ Char, CI.FoldCase (Tokens s))
+symbol' :: (MonadParsec e s m, CI.FoldCase (Tokens s))
  => m ()              -- ^ How to consume white space after lexeme
  -> Tokens s          -- ^ String to parse (case-insensitive)
  -> m (Tokens s)
--- a/megaparsec.cabal
+++ b/megaparsec.cabal
@ -49,6 +49,7 @@ library
  if !impl(ghc >= 7.10)
    build-depends:    void         == 0.7.*
  exposed-modules:    Text.Megaparsec
+                    , Text.Megaparsec.Byte
                    , Text.Megaparsec.Char
                    , Text.Megaparsec.Error
                    , Text.Megaparsec.Error.Builder
@ -74,6 +75,7 @@ test-suite tests
  other-modules:      Control.Applicative.CombinatorsSpec
                    , Test.Hspec.Megaparsec
                    , Test.Hspec.Megaparsec.AdHoc
+                    , Text.Megaparsec.ByteSpec
                    , Text.Megaparsec.CharSpec
                    , Text.Megaparsec.ErrorSpec
                    , Text.Megaparsec.ExprSpec
--- a/tests/Text/Megaparsec/ByteSpec.hs
+++ b/tests/Text/Megaparsec/ByteSpec.hs
@ -0,0 +1,247 @@
+{-# LANGUAGE CPP               #-}
+{-# LANGUAGE OverloadedStrings #-}
+
+module Text.Megaparsec.ByteSpec (spec) where
+
+import Control.Monad
+import Data.ByteString (ByteString)
+import Data.Char
+import Data.Proxy
+import Data.Semigroup ((<>))
+import Data.Void
+import Data.Word (Word8)
+import Test.Hspec
+import Test.Hspec.Megaparsec
+import Test.Hspec.Megaparsec.AdHoc (nes)
+import Test.QuickCheck
+import Text.Megaparsec
+import Text.Megaparsec.Byte
+import qualified Data.ByteString as B
+
+#if !MIN_VERSION_base(4,8,0)
+import Control.Applicative
+#endif
+
+type Parser = Parsec Void ByteString
+
+spec :: Spec
+spec = do
+
+  describe "newline" $
+    checkStrLit "newline" "\n" (tokenToChunk bproxy <$> newline)
+
+  describe "csrf" $
+    checkStrLit "crlf newline" "\r\n" crlf
+
+  describe "eol" $ do
+    context "when stream begins with a newline" $
+      it "succeeds returning the newline" $
+        property $ \s -> do
+          let s' = "\n" <> s
+          prs  eol s' `shouldParse`     "\n"
+          prs' eol s' `succeedsLeaving` s
+    context "when stream begins with CRLF sequence" $
+      it "parses the CRLF sequence" $
+        property $ \s -> do
+          let s' = "\r\n" <> s
+          prs  eol s' `shouldParse`     "\r\n"
+          prs' eol s' `succeedsLeaving` s
+    context "when stream begins with '\\r', but it's not followed by '\\n'" $
+      it "signals correct parse error" $
+        property $ \ch -> ch /= 10 ==> do
+          let s = "\r" <> B.singleton ch
+          prs eol s `shouldFailWith`
+            err posI (utoks (B.unpack s) <> elabel "end of line")
+    context "when input stream is '\\r'" $
+      it "signals correct parse error" $
+        prs eol "\r" `shouldFailWith` err posI
+          (utok 13 <> elabel "end of line")
+    context "when stream does not begin with newline or CRLF sequence" $
+      it "signals correct parse error" $
+        property $ \ch s -> (ch /= 13 && ch /= 10) ==> do
+          let s' = B.singleton ch <> s
+          prs eol s' `shouldFailWith` err posI
+            (utoks (B.unpack $ B.take 2 s') <> elabel "end of line")
+    context "when stream is empty" $
+      it "signals correct parse error" $
+        prs eol "" `shouldFailWith` err posI
+          (ueof <> elabel "end of line")
+
+  describe "tab" $
+    checkStrLit "tab" "\t" (tokenToChunk bproxy <$> tab)
+
+  describe "space" $
+    it "consumes space up to first non-space character" $
+      property $ \s' -> do
+        let (s0,s1) = B.partition isSpace' s'
+            s = s0 <> s1
+        prs  space s `shouldParse` ()
+        prs' space s `succeedsLeaving` s1
+
+  describe "space1" $ do
+    context "when stream does not start with a space character" $
+      it "signals correct parse error" $
+        property $ \ch s' -> not (isSpace' ch) ==> do
+          let (s0,s1) = B.partition isSpace' s'
+              s = B.singleton ch <> s0 <> s1
+          prs  space1 s `shouldFailWith` err posI (utok ch <> elabel "white space")
+          prs' space1 s `failsLeaving` s
+    context "when stream starts with a space character" $
+      it "consumes space up to first non-space character" $
+        property $ \s' -> do
+          let (s0,s1) = B.partition isSpace' s'
+              s = " " <> s0 <> s1
+          prs  space1 s `shouldParse` ()
+          prs' space1 s `succeedsLeaving` s1
+    context "when stream is empty" $
+      it "signals correct parse error" $
+        prs space1 "" `shouldFailWith` err posI (ueof <> elabel "white space")
+
+  describe "controlChar" $
+    checkCharPred "control character" (isControl . toChar) controlChar
+
+  describe "spaceChar" $
+    checkCharRange "white space" [9,10,11,12,13,32,160] spaceChar
+
+  -- describe "upperChar" $
+  --   checkCharPred "uppercase letter" (isUpper . toChar) upperChar
+
+  -- describe "lowerChar" $
+  --   checkCharPred "lowercase letter" (isLower . toChar) lowerChar
+
+  -- describe "letterChar" $
+  --   checkCharPred "letter" (isAlpha . toChar) letterChar
+
+  describe "alphaNumChar" $
+    checkCharPred "alphanumeric character" (isAlphaNum . toChar) alphaNumChar
+
+  describe "printChar" $
+    checkCharPred "printable character" (isPrint . toChar) printChar
+
+  describe "digitChar" $
+    checkCharRange "digit" [48..57] digitChar
+
+  describe "octDigitChar" $
+    checkCharRange "octal digit" [48..55] octDigitChar
+
+  describe "hexDigitChar" $
+    checkCharRange "hexadecimal digit" ([48..57] ++ [97..102] ++ [65..70]) hexDigitChar
+
+  -- describe "asciiChar" $
+  --   checkCharPred "ASCII character" (isAscii . toChar) asciiChar
+
+  describe "char'" $ do
+    context "when stream begins with the character specified as argument" $
+      it "parses the character" $
+        property $ \ch s -> do
+          let sl = B.cons (liftChar toLower ch) s
+              su = B.cons (liftChar toUpper ch) s
+          prs  (char' ch) sl `shouldParse`     liftChar toLower ch
+          prs  (char' ch) su `shouldParse`     liftChar toUpper ch
+          prs' (char' ch) sl `succeedsLeaving` s
+          prs' (char' ch) su `succeedsLeaving` s
+    context "when stream does not begin with the character specified as argument" $
+      it "signals correct parse error" $
+        property $ \ch ch' s -> liftChar toLower ch /= liftChar toLower ch' ==> do
+          let s' = B.cons ch' s
+              ms = utok ch' <> etok (liftChar toLower ch) <> etok (liftChar toUpper ch)
+          prs  (char' ch) s' `shouldFailWith` err posI ms
+          prs' (char' ch) s' `failsLeaving`   s'
+    context "when stream is empty" $
+      it "signals correct parse error" $
+        property $ \ch -> do
+          let ms = ueof <> etok (liftChar toLower ch) <> etok (liftChar toUpper ch)
+          prs  (char' ch) "" `shouldFailWith` err posI ms
+
+----------------------------------------------------------------------------
+-- Helpers
+
+checkStrLit :: String -> ByteString -> Parser ByteString -> SpecWith ()
+checkStrLit name ts p = do
+  context ("when stream begins with " ++ name) $
+    it ("parses the " ++ name) $
+      property $ \s -> do
+        let s' = ts <> s
+        prs  p s' `shouldParse`     ts
+        prs' p s' `succeedsLeaving` s
+  context ("when stream does not begin with " ++ name) $
+    it "signals correct parse error" $
+      property $ \ch s -> ch /= B.head ts ==> do
+       let s' = B.cons ch s
+           us = B.unpack $ B.take (B.length ts) s'
+           ps = B.unpack ts
+       prs  p s' `shouldFailWith` err posI (utoks us <> etoks ps)
+       prs' p s' `failsLeaving`   s'
+  context "when stream is empty" $
+    it "signals correct parse error" $
+      prs p "" `shouldFailWith` err posI (ueof <> etoks (B.unpack ts))
+
+checkCharPred :: String -> (Word8 -> Bool) -> Parser Word8 -> SpecWith ()
+checkCharPred name f p = do
+  context ("when stream begins with " ++ name) $
+    it ("parses the " ++ name) $
+      property $ \ch s -> f ch ==> do
+        let s' = B.singleton ch <> s
+        prs  p s' `shouldParse`     ch
+        prs' p s' `succeedsLeaving` s
+  context ("when stream does not begin with " ++ name) $
+    it "signals correct parse error" $
+      property $ \ch s -> not (f ch) ==> do
+       let s' = B.singleton ch <> s
+       prs  p s' `shouldFailWith` err posI (utok ch <> elabel name)
+       prs' p s' `failsLeaving`   s'
+  context "when stream is empty" $
+    it "signals correct parse error" $
+      prs p "" `shouldFailWith` err posI (ueof <> elabel name)
+
+checkCharRange :: String -> [Word8] -> Parser Word8 -> SpecWith ()
+checkCharRange name tchs p = do
+  forM_ tchs $ \tch ->
+    context ("when stream begins with " ++ showTokens (nes tch)) $
+      it ("parses the " ++ showTokens (nes tch)) $
+        property $ \s -> do
+          let s' = B.singleton tch <> s
+          prs  p s' `shouldParse`     tch
+          prs' p s' `succeedsLeaving` s
+  context "when stream is empty" $
+    it "signals correct parse error" $
+      prs p "" `shouldFailWith` err posI (ueof <> elabel name)
+
+prs
+  :: Parser a          -- ^ Parser to run
+  -> ByteString        -- ^ Input for the parser
+  -> Either (ParseError Word8 Void) a -- ^ Result of parsing
+prs p = parse p ""
+
+prs'
+  :: Parser a          -- ^ Parser to run
+  -> ByteString        -- ^ Input for the parser
+  -> (State ByteString, Either (ParseError Word8 Void) a) -- ^ Result of parsing
+prs' p s = runParser' p (initialState s)
+
+bproxy :: Proxy ByteString
+bproxy = Proxy
+
+-- | 'Word8'-specialized version of 'isSpace'.
+
+isSpace' :: Word8 -> Bool
+isSpace' x
+  | x >= 9 && x <= 13 = True
+  | x == 32           = True
+  | x == 160          = True
+  | otherwise         = False
+
+-- | Convert a byte to char.
+
+toChar :: Word8 -> Char
+toChar = chr . fromIntegral
+
+-- | Covert a char to byte.
+
+fromChar :: Char -> Word8
+fromChar = fromIntegral . ord
+
+-- | Lift char transformation to byte transformation.
+
+liftChar :: (Char -> Char) -> Word8 -> Word8
+liftChar f = fromChar . f . toChar
--- a/tests/Text/Megaparsec/CharSpec.hs
+++ b/tests/Text/Megaparsec/CharSpec.hs
@ -7,7 +7,6 @@ import Control.Monad
 import Data.Char
 import Data.List (partition, isPrefixOf)
 import Data.Monoid ((<>))
-import Data.Void
 import Test.Hspec
 import Test.Hspec.Megaparsec
 import Test.Hspec.Megaparsec.AdHoc
@ -68,12 +67,31 @@ spec = do
    checkStrLit "tab" "\t" (pure <$> tab)

  describe "space" $
-    it "consumes it up to first non-space character" $
-      property $ \s -> do
-        let (s0,s1) = partition isSpace s
-            s' = s0 ++ s1
-        prs  space s' `shouldParse` ()
-        prs' space s' `succeedsLeaving` s1
+    it "consumes space up to first non-space character" $
+      property $ \s' -> do
+        let (s0,s1) = partition isSpace s'
+            s = s0 ++ s1
+        prs  space s `shouldParse` ()
+        prs' space s `succeedsLeaving` s1
+
+  describe "space1" $ do
+    context "when stream does not start with a space character" $
+      it "signals correct parse error" $
+        property $ \ch s' -> not (isSpace ch) ==> do
+          let (s0,s1) = partition isSpace s'
+              s = ch : s0 ++ s1
+          prs  space1 s `shouldFailWith` err posI (utok ch <> elabel "white space")
+          prs' space1 s `failsLeaving` s
+    context "when stream starts with a space character" $
+      it "consumes space up to first non-space character" $
+        property $ \s' -> do
+          let (s0,s1) = partition isSpace s'
+              s = ' ' : s0 ++ s1
+          prs  space1 s `shouldParse` ()
+          prs' space1 s `succeedsLeaving` s1
+    context "when stream is empty" $
+      it "signals correct parse error" $
+        prs space1 "" `shouldFailWith` err posI (ueof <> elabel "white space")

  describe "controlChar" $
    checkCharPred "control character" isControl controlChar
@ -220,6 +238,24 @@ spec = do
      it "signals correct parse error" $
        prs anyChar "" `shouldFailWith` err posI (ueof <> elabel "character")

+  describe "notChar" $ do
+    context "when stream begins with the character specified as argument" $
+      it "signals correct parse error" $
+        property $ \ch s' -> do
+          let p = notChar ch
+              s = ch : s'
+          prs p s `shouldFailWith` err posI (utok ch)
+          prs' p s `failsLeaving` s
+    context "when stream does not begin with the character specified as argument" $
+      it "parses first character in the stream" $
+        property $ \ch s -> not (null s) && ch /= head s ==> do
+          let p = notChar ch
+          prs  p s `shouldParse` head s
+          prs' p s `succeedsLeaving` tail s
+    context "when stream is empty" $
+      it "signals correct parse error" $
+        prs (notChar 'a') "" `shouldFailWith` err posI ueof
+
  describe "oneOf" $ do
    context "when stream begins with one of specified characters" $
      it "parses the character" $
@ -240,29 +276,6 @@ spec = do
        property $ \chs ->
          prs (oneOf (chs :: String)) "" `shouldFailWith` err posI ueof

-  describe "oneOf'" $ do
-    context "when stream begins with one of specified characters" $
-      it "parses the character" $
-        property $ \chs' n s -> do
-          let chs = getNonEmpty chs'
-              ch  = chs !! (getNonNegative n `rem` length chs)
-              sl  = toLower ch : s
-              su  = toUpper ch : s
-          prs  (oneOf' chs) sl `shouldParse`     toLower ch
-          prs  (oneOf' chs) su `shouldParse`     toUpper ch
-          prs' (oneOf' chs) sl `succeedsLeaving` s
-          prs' (oneOf' chs) su `succeedsLeaving` s
-    context "when stream does not begin with any of specified characters" $
-      it "signals correct parse error" $
-        property $ \chs ch s  -> ch `notElemi` (chs :: String) ==> do
-          let s' = ch : s
-          prs  (oneOf' chs) s' `shouldFailWith` err posI (utok ch)
-          prs' (oneOf' chs) s' `failsLeaving`   s'
-    context "when stream is empty" $
-      it "signals correct parse error" $
-        property $ \chs ->
-          prs (oneOf' (chs :: String)) "" `shouldFailWith` err posI ueof
-
  describe "noneOf" $ do
    context "when stream does not begin with any of specified characters" $
      it "parses the character" $
@ -283,29 +296,6 @@ spec = do
        property $ \chs ->
          prs (noneOf (chs :: String)) "" `shouldFailWith` err posI ueof

-  describe "noneOf'" $ do
-    context "when stream does not begin with any of specified characters" $
-      it "parses the character" $
-        property $ \chs ch s  -> ch `notElemi` (chs :: String) ==> do
-          let sl  = toLower ch : s
-              su  = toUpper ch : s
-          prs  (noneOf' chs) sl `shouldParse`     toLower ch
-          prs  (noneOf' chs) su `shouldParse`     toUpper ch
-          prs' (noneOf' chs) sl `succeedsLeaving` s
-          prs' (noneOf' chs) su `succeedsLeaving` s
-    context "when stream begins with one of specified characters" $
-      it "signals correct parse error" $
-        property $ \chs' n s -> do
-          let chs = getNonEmpty chs'
-              ch  = chs !! (getNonNegative n `rem` length chs)
-              s'  = ch : s
-          prs  (noneOf' chs) s' `shouldFailWith` err posI (utok ch)
-          prs' (noneOf' chs) s' `failsLeaving`   s'
-    context "when stream is empty" $
-      it "signals correct parse error" $
-        property $ \chs ->
-          prs (noneOf' (chs :: String)) "" `shouldFailWith` err posI ueof
-
  describe "string" $ do
    context "when stream is prefixed with given string" $
      it "parses the string" $
@ -338,7 +328,7 @@ spec = do
 ----------------------------------------------------------------------------
 -- Helpers

-checkStrLit :: String -> String -> Parsec Void String String -> SpecWith ()
+checkStrLit :: String -> String -> Parser String -> SpecWith ()
 checkStrLit name ts p = do
  context ("when stream begins with " ++ name) $
    it ("parses the " ++ name) $
@ -357,7 +347,7 @@ checkStrLit name ts p = do
    it "signals correct parse error" $
      prs p "" `shouldFailWith` err posI (ueof <> etoks ts)

-checkCharPred :: String -> (Char -> Bool) -> Parsec Void String Char -> SpecWith ()
+checkCharPred :: String -> (Char -> Bool) -> Parser Char -> SpecWith ()
 checkCharPred name f p = do
  context ("when stream begins with " ++ name) $
    it ("parses the " ++ name) $
@ -375,7 +365,7 @@ checkCharPred name f p = do
    it "signals correct parse error" $
      prs p "" `shouldFailWith` err posI (ueof <> elabel name)

-checkCharRange :: String -> String -> Parsec Void String Char -> SpecWith ()
+checkCharRange :: String -> String -> Parser Char -> SpecWith ()
 checkCharRange name tchs p = do
  forM_ tchs $ \tch ->
    context ("when stream begins with " ++ showTokens (nes tch)) $
@ -384,12 +374,6 @@ checkCharRange name tchs p = do
          let s' = tch : s
          prs  p s' `shouldParse`     tch
          prs' p s' `succeedsLeaving` s
-  -- context ("when stream does not begin with " ++ name) $
-  --   it "signals correct parse error" $
-  --     property $ \ch s -> ch `notElem` tchs ==> do
-  --      let s' = ch : s
-  --      prs  p s' `shouldFailWith` err posI (utok ch <> elabel name)
-  --      prs' p s' `failsLeaving`   s'
  context "when stream is empty" $
    it "signals correct parse error" $
      prs p "" `shouldFailWith` err posI (ueof <> elabel name)
@ -406,16 +390,6 @@ fuzzyCase s = zipWith f s <$> vector (length s)
 casei :: Char -> Char -> Bool
 casei x y = toUpper x == toUpper y

-- | Case-insensitive 'elem'.
-
-elemi :: Char -> String -> Bool
-elemi c = any (casei c)
-
-- | Case-insensitive 'notElem'.
-
-notElemi :: Char -> String -> Bool
-notElemi c = not . elemi c
-
 -- | The 'isPrefixOf' function takes two 'String's and returns 'True' iff
 -- the first list is a prefix of the second with case-insensitive
 -- comparison.
--- a/tests/Text/Megaparsec/ErrorSpec.hs
+++ b/tests/Text/Megaparsec/ErrorSpec.hs
@ -7,6 +7,7 @@ import Data.List (isInfixOf, isSuffixOf)
 import Data.List.NonEmpty (NonEmpty (..))
 import Data.Monoid
 import Data.Void
+import Data.Word (Word8)
 import Test.Hspec
 import Test.Hspec.Megaparsec.AdHoc ()
 import Test.QuickCheck
@ -173,6 +174,13 @@ spec = do
    it "shows control characters in long strings property"
      (f "{\n" "\"{<newline>\"")

+  describe "showTokens (Word8 instance)" $
+    it "basically works" $ do
+      -- NOTE Currently the Word8 instance is defined via Char intance, so
+      -- the testing is rather shallow.
+      let ts = NE.fromList [10,48,49,50] :: NonEmpty Word8
+      showTokens ts `shouldBe` "\"<newline>012\""
+
  describe "parseErrorPretty" $ do
    it "shows unknown ParseError correctly" $
      parseErrorPretty (mempty :: PE) `shouldBe` "1:1:\nunknown parse error\n"