mirror of
https://github.com/mrkkrp/megaparsec.git
synced 2024-11-23 19:38:05 +03:00
236 lines
6.3 KiB
Haskell
236 lines
6.3 KiB
Haskell
-- |
|
||
-- Module : Text.Megaparsec.Byte
|
||
-- Copyright : © 2015–2017 Megaparsec contributors
|
||
-- License : FreeBSD
|
||
--
|
||
-- Maintainer : Mark Karpov <markkarpov92@gmail.com>
|
||
-- Stability : experimental
|
||
-- Portability : portable
|
||
--
|
||
-- Commonly used binary parsers.
|
||
--
|
||
-- @since 6.0.0
|
||
|
||
{-# LANGUAGE ScopedTypeVariables #-}
|
||
{-# LANGUAGE TypeFamilies #-}
|
||
|
||
module Text.Megaparsec.Byte
|
||
( -- * Simple parsers
|
||
newline
|
||
, crlf
|
||
, eol
|
||
, tab
|
||
, space
|
||
, space1
|
||
-- * Categories of characters
|
||
, controlChar
|
||
, spaceChar
|
||
, upperChar
|
||
, lowerChar
|
||
, letterChar
|
||
, alphaNumChar
|
||
, printChar
|
||
, digitChar
|
||
, octDigitChar
|
||
, hexDigitChar
|
||
, asciiChar
|
||
-- * More general parsers
|
||
, C.char
|
||
, char'
|
||
, C.anyChar
|
||
, C.notChar
|
||
, C.oneOf
|
||
, C.noneOf
|
||
, C.satisfy
|
||
-- * Sequence of bytes
|
||
, C.string
|
||
, C.string' )
|
||
where
|
||
|
||
import Control.Applicative
|
||
import Data.Char
|
||
import Data.Functor (void)
|
||
import Data.Maybe (fromMaybe)
|
||
import Data.Proxy
|
||
import Data.Word (Word8)
|
||
import Text.Megaparsec
|
||
import qualified Text.Megaparsec.Char as C
|
||
|
||
----------------------------------------------------------------------------
|
||
-- Simple parsers
|
||
|
||
-- | Parse a newline byte.
|
||
|
||
newline :: (MonadParsec e s m, Token s ~ Word8) => m (Token s)
|
||
newline = C.char 10
|
||
{-# INLINE newline #-}
|
||
|
||
-- | Parse a carriage return character followed by a newline character.
|
||
-- Return the sequence of characters parsed.
|
||
|
||
crlf :: forall e s m. (MonadParsec e s m, Token s ~ Word8) => m (Tokens s)
|
||
crlf = C.string (tokensToChunk (Proxy :: Proxy s) [13,10])
|
||
{-# INLINE crlf #-}
|
||
|
||
-- | Parse a CRLF (see 'crlf') or LF (see 'newline') end of line. Return the
|
||
-- sequence of characters parsed.
|
||
|
||
eol :: forall e s m. (MonadParsec e s m, Token s ~ Word8) => m (Tokens s)
|
||
eol = (tokenToChunk (Proxy :: Proxy s) <$> newline)
|
||
<|> crlf
|
||
<?> "end of line"
|
||
{-# INLINE eol #-}
|
||
|
||
-- | Parse a tab character.
|
||
|
||
tab :: (MonadParsec e s m, Token s ~ Word8) => m (Token s)
|
||
tab = C.char 9
|
||
{-# INLINE tab #-}
|
||
|
||
-- | Skip /zero/ or more white space characters.
|
||
--
|
||
-- See also: 'skipMany' and 'spaceChar'.
|
||
|
||
space :: (MonadParsec e s m, Token s ~ Word8) => m ()
|
||
space = void $ takeWhileP (Just "white space") isSpace'
|
||
{-# INLINE space #-}
|
||
|
||
-- | Skip /one/ or more white space characters.
|
||
--
|
||
-- See also: 'skipSome' and 'spaceChar'.
|
||
|
||
space1 :: (MonadParsec e s m, Token s ~ Word8) => m ()
|
||
space1 = void $ takeWhile1P (Just "white space") isSpace'
|
||
{-# INLINE space1 #-}
|
||
|
||
----------------------------------------------------------------------------
|
||
-- Categories of characters
|
||
|
||
-- | Parse a control character.
|
||
|
||
controlChar :: (MonadParsec e s m, Token s ~ Word8) => m (Token s)
|
||
controlChar = C.satisfy (isControl . toChar) <?> "control character"
|
||
{-# INLINE controlChar #-}
|
||
|
||
-- | Parse a space character, and the control characters: tab, newline,
|
||
-- carriage return, form feed, and vertical tab.
|
||
|
||
spaceChar :: (MonadParsec e s m, Token s ~ Word8) => m (Token s)
|
||
spaceChar = C.satisfy isSpace' <?> "white space"
|
||
{-# INLINE spaceChar #-}
|
||
|
||
-- | Parse an upper-case character.
|
||
|
||
upperChar :: (MonadParsec e s m, Token s ~ Word8) => m (Token s)
|
||
upperChar = C.satisfy (isUpper . toChar) <?> "uppercase letter"
|
||
{-# INLINE upperChar #-}
|
||
|
||
-- | Parse a lower-case alphabetic character.
|
||
|
||
lowerChar :: (MonadParsec e s m, Token s ~ Word8) => m (Token s)
|
||
lowerChar = C.satisfy (isLower . toChar) <?> "lowercase letter"
|
||
{-# INLINE lowerChar #-}
|
||
|
||
-- | Parse an alphabetic character: lower-case or upper-case.
|
||
|
||
letterChar :: (MonadParsec e s m, Token s ~ Word8) => m (Token s)
|
||
letterChar = C.satisfy (isLetter . toChar) <?> "letter"
|
||
{-# INLINE letterChar #-}
|
||
|
||
-- | Parse an alphabetic or digit characters.
|
||
|
||
alphaNumChar :: (MonadParsec e s m, Token s ~ Word8) => m (Token s)
|
||
alphaNumChar = C.satisfy (isAlphaNum . toChar) <?> "alphanumeric character"
|
||
{-# INLINE alphaNumChar #-}
|
||
|
||
-- | Parse a printable character: letter, number, mark, punctuation, symbol
|
||
-- or space.
|
||
|
||
printChar :: (MonadParsec e s m, Token s ~ Word8) => m (Token s)
|
||
printChar = C.satisfy (isPrint . toChar) <?> "printable character"
|
||
{-# INLINE printChar #-}
|
||
|
||
-- | Parse an ASCII digit, i.e between “0” and “9”.
|
||
|
||
digitChar :: (MonadParsec e s m, Token s ~ Word8) => m (Token s)
|
||
digitChar = C.satisfy isDigit' <?> "digit"
|
||
where
|
||
isDigit' x = x >= 48 && x <= 57
|
||
{-# INLINE digitChar #-}
|
||
|
||
-- | Parse an octal digit, i.e. between “0” and “7”.
|
||
|
||
octDigitChar :: (MonadParsec e s m, Token s ~ Word8) => m (Token s)
|
||
octDigitChar = C.satisfy isOctDigit' <?> "octal digit"
|
||
where
|
||
isOctDigit' x = x >= 48 && x <= 55
|
||
{-# INLINE octDigitChar #-}
|
||
|
||
-- | Parse a hexadecimal digit, i.e. between “0” and “9”, or “a” and “f”, or
|
||
-- “A” and “F”.
|
||
|
||
hexDigitChar :: (MonadParsec e s m, Token s ~ Word8) => m (Token s)
|
||
hexDigitChar = C.satisfy (isHexDigit . toChar) <?> "hexadecimal digit"
|
||
{-# INLINE hexDigitChar #-}
|
||
|
||
-- | Parse a character from the first 128 characters of the Unicode
|
||
-- character set, corresponding to the ASCII character set.
|
||
|
||
asciiChar :: (MonadParsec e s m, Token s ~ Word8) => m (Token s)
|
||
asciiChar = C.satisfy (< 128) <?> "ASCII character"
|
||
{-# INLINE asciiChar #-}
|
||
|
||
----------------------------------------------------------------------------
|
||
-- More general parsers
|
||
|
||
-- | The same as 'char' but case-insensitive. This parser returns the
|
||
-- actually parsed character preserving its case.
|
||
--
|
||
-- >>> parseTest (char' 101) "E"
|
||
-- 69 -- 'E'
|
||
-- >>> parseTest (char' 101) "G"
|
||
-- 1:1:
|
||
-- unexpected 'G'
|
||
-- expecting 'E' or 'e'
|
||
|
||
char' :: (MonadParsec e s m, Token s ~ Word8) => Token s -> m (Token s)
|
||
char' c = choice
|
||
[ C.char c
|
||
, C.char (fromMaybe c (swapCase c)) ]
|
||
where
|
||
swapCase x
|
||
| isUpper g = fromChar (toLower g)
|
||
| isLower g = fromChar (toUpper g)
|
||
| otherwise = Nothing
|
||
where
|
||
g = toChar x
|
||
{-# INLINE char' #-}
|
||
|
||
----------------------------------------------------------------------------
|
||
-- Helpers
|
||
|
||
-- | 'Word8'-specialized version of 'isSpace'.
|
||
|
||
isSpace' :: Word8 -> Bool
|
||
isSpace' x
|
||
| x >= 9 && x <= 13 = True
|
||
| x == 32 = True
|
||
| x == 160 = True
|
||
| otherwise = False
|
||
{-# INLINE isSpace' #-}
|
||
|
||
-- | Convert a byte to char.
|
||
|
||
toChar :: Word8 -> Char
|
||
toChar = chr . fromIntegral
|
||
{-# INLINE toChar #-}
|
||
|
||
-- | Convert a char to byte.
|
||
|
||
fromChar :: Char -> Maybe Word8
|
||
fromChar x = let p = ord x in
|
||
if p > 0xff
|
||
then Nothing
|
||
else Just (fromIntegral p)
|
||
{-# INLINE fromChar #-}
|