mirror of
https://github.com/mrkkrp/megaparsec.git
synced 2024-11-27 15:32:14 +03:00
424 lines
13 KiB
Haskell
424 lines
13 KiB
Haskell
-- |
|
||
-- Module : Text.Megaparsec.Char
|
||
-- Copyright : © 2015–2017 Megaparsec contributors
|
||
-- © 2007 Paolo Martini
|
||
-- © 1999–2001 Daan Leijen
|
||
-- License : FreeBSD
|
||
--
|
||
-- Maintainer : Mark Karpov <markkarpov92@gmail.com>
|
||
-- Stability : experimental
|
||
-- Portability : non-portable
|
||
--
|
||
-- Commonly used character parsers.
|
||
|
||
{-# LANGUAGE CPP #-}
|
||
{-# LANGUAGE FlexibleContexts #-}
|
||
{-# LANGUAGE LambdaCase #-}
|
||
{-# LANGUAGE ScopedTypeVariables #-}
|
||
{-# LANGUAGE TypeFamilies #-}
|
||
|
||
module Text.Megaparsec.Char
|
||
( -- * Simple parsers
|
||
newline
|
||
, crlf
|
||
, eol
|
||
, tab
|
||
, space
|
||
, space1
|
||
-- * Categories of characters
|
||
, controlChar
|
||
, spaceChar
|
||
, upperChar
|
||
, lowerChar
|
||
, letterChar
|
||
, alphaNumChar
|
||
, printChar
|
||
, digitChar
|
||
, octDigitChar
|
||
, hexDigitChar
|
||
, markChar
|
||
, numberChar
|
||
, punctuationChar
|
||
, symbolChar
|
||
, separatorChar
|
||
, asciiChar
|
||
, latin1Char
|
||
, charCategory
|
||
, categoryName
|
||
-- * More general parsers
|
||
, char
|
||
, char'
|
||
, anyChar
|
||
, notChar
|
||
, oneOf
|
||
, noneOf
|
||
, satisfy
|
||
-- * Sequence of characters
|
||
, string
|
||
, string' )
|
||
where
|
||
|
||
import Control.Applicative
|
||
import Data.Char
|
||
import Data.Function (on)
|
||
import Data.Functor (void)
|
||
import Data.List.NonEmpty (NonEmpty (..))
|
||
import Data.Proxy
|
||
import Text.Megaparsec
|
||
import qualified Data.CaseInsensitive as CI
|
||
import qualified Data.Set as E
|
||
|
||
#if !MIN_VERSION_base(4,8,0)
|
||
import Data.Foldable (Foldable (), elem, notElem)
|
||
import Prelude hiding (elem, notElem)
|
||
#endif
|
||
|
||
----------------------------------------------------------------------------
|
||
-- Simple parsers
|
||
|
||
-- | Parse a newline character.
|
||
|
||
newline :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
|
||
newline = char '\n'
|
||
{-# INLINE newline #-}
|
||
|
||
-- | Parse a carriage return character followed by a newline character.
|
||
-- Return the sequence of characters parsed.
|
||
|
||
crlf :: forall e s m. (MonadParsec e s m, Token s ~ Char) => m (Tokens s)
|
||
crlf = string (tokensToChunk (Proxy :: Proxy s) "\r\n")
|
||
{-# INLINE crlf #-}
|
||
|
||
-- | Parse a CRLF (see 'crlf') or LF (see 'newline') end of line. Return the
|
||
-- sequence of characters parsed.
|
||
|
||
eol :: forall e s m. (MonadParsec e s m, Token s ~ Char) => m (Tokens s)
|
||
eol = (tokenToChunk (Proxy :: Proxy s) <$> newline)
|
||
<|> crlf
|
||
<?> "end of line"
|
||
{-# INLINE eol #-}
|
||
|
||
-- | Parse a tab character.
|
||
|
||
tab :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
|
||
tab = char '\t'
|
||
{-# INLINE tab #-}
|
||
|
||
-- | Skip /zero/ or more white space characters.
|
||
--
|
||
-- See also: 'skipMany' and 'spaceChar'.
|
||
|
||
space :: (MonadParsec e s m, Token s ~ Char) => m ()
|
||
space = void $ takeWhileP (Just "white space") isSpace
|
||
{-# INLINE space #-}
|
||
|
||
-- | Skip /one/ or more white space characters.
|
||
--
|
||
-- See also: 'skipSome' and 'spaceChar'.
|
||
--
|
||
-- @since 6.0.0
|
||
|
||
space1 :: (MonadParsec e s m, Token s ~ Char) => m ()
|
||
space1 = void $ takeWhile1P (Just "white space") isSpace
|
||
{-# INLINE space1 #-}
|
||
|
||
----------------------------------------------------------------------------
|
||
-- Categories of characters
|
||
|
||
-- | Parse a control character (a non-printing character of the Latin-1
|
||
-- subset of Unicode).
|
||
|
||
controlChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
|
||
controlChar = satisfy isControl <?> "control character"
|
||
{-# INLINE controlChar #-}
|
||
|
||
-- | Parse a Unicode space character, and the control characters: tab,
|
||
-- newline, carriage return, form feed, and vertical tab.
|
||
|
||
spaceChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
|
||
spaceChar = satisfy isSpace <?> "white space"
|
||
{-# INLINE spaceChar #-}
|
||
|
||
-- | Parse an upper-case or title-case alphabetic Unicode character. Title
|
||
-- case is used by a small number of letter ligatures like the
|
||
-- single-character form of Lj.
|
||
|
||
upperChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
|
||
upperChar = satisfy isUpper <?> "uppercase letter"
|
||
{-# INLINE upperChar #-}
|
||
|
||
-- | Parse a lower-case alphabetic Unicode character.
|
||
|
||
lowerChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
|
||
lowerChar = satisfy isLower <?> "lowercase letter"
|
||
{-# INLINE lowerChar #-}
|
||
|
||
-- | Parse an alphabetic Unicode character: lower-case, upper-case, or
|
||
-- title-case letter, or a letter of case-less scripts\/modifier letter.
|
||
|
||
letterChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
|
||
letterChar = satisfy isLetter <?> "letter"
|
||
{-# INLINE letterChar #-}
|
||
|
||
-- | Parse an alphabetic or numeric digit Unicode characters.
|
||
--
|
||
-- Note that the numeric digits outside the ASCII range are parsed by this
|
||
-- parser but not by 'digitChar'. Such digits may be part of identifiers but
|
||
-- are not used by the printer and reader to represent numbers.
|
||
|
||
alphaNumChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
|
||
alphaNumChar = satisfy isAlphaNum <?> "alphanumeric character"
|
||
{-# INLINE alphaNumChar #-}
|
||
|
||
-- | Parse a printable Unicode character: letter, number, mark, punctuation,
|
||
-- symbol or space.
|
||
|
||
printChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
|
||
printChar = satisfy isPrint <?> "printable character"
|
||
{-# INLINE printChar #-}
|
||
|
||
-- | Parse an ASCII digit, i.e between “0” and “9”.
|
||
|
||
digitChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
|
||
digitChar = satisfy isDigit <?> "digit"
|
||
{-# INLINE digitChar #-}
|
||
|
||
-- | Parse an octal digit, i.e. between “0” and “7”.
|
||
|
||
octDigitChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
|
||
octDigitChar = satisfy isOctDigit <?> "octal digit"
|
||
{-# INLINE octDigitChar #-}
|
||
|
||
-- | Parse a hexadecimal digit, i.e. between “0” and “9”, or “a” and “f”, or
|
||
-- “A” and “F”.
|
||
|
||
hexDigitChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
|
||
hexDigitChar = satisfy isHexDigit <?> "hexadecimal digit"
|
||
{-# INLINE hexDigitChar #-}
|
||
|
||
-- | Parse a Unicode mark character (accents and the like), which combines
|
||
-- with preceding characters.
|
||
|
||
markChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
|
||
markChar = satisfy isMark <?> "mark character"
|
||
{-# INLINE markChar #-}
|
||
|
||
-- | Parse a Unicode numeric character, including digits from various
|
||
-- scripts, Roman numerals, etc.
|
||
|
||
numberChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
|
||
numberChar = satisfy isNumber <?> "numeric character"
|
||
{-# INLINE numberChar #-}
|
||
|
||
-- | Parse a Unicode punctuation character, including various kinds of
|
||
-- connectors, brackets and quotes.
|
||
|
||
punctuationChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
|
||
punctuationChar = satisfy isPunctuation <?> "punctuation"
|
||
{-# INLINE punctuationChar #-}
|
||
|
||
-- | Parse a Unicode symbol characters, including mathematical and currency
|
||
-- symbols.
|
||
|
||
symbolChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
|
||
symbolChar = satisfy isSymbol <?> "symbol"
|
||
{-# INLINE symbolChar #-}
|
||
|
||
-- | Parse a Unicode space and separator characters.
|
||
|
||
separatorChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
|
||
separatorChar = satisfy isSeparator <?> "separator"
|
||
{-# INLINE separatorChar #-}
|
||
|
||
-- | Parse a character from the first 128 characters of the Unicode
|
||
-- character set, corresponding to the ASCII character set.
|
||
|
||
asciiChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
|
||
asciiChar = satisfy isAscii <?> "ASCII character"
|
||
{-# INLINE asciiChar #-}
|
||
|
||
-- | Parse a character from the first 256 characters of the Unicode
|
||
-- character set, corresponding to the ISO 8859-1 (Latin-1) character set.
|
||
|
||
latin1Char :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
|
||
latin1Char = satisfy isLatin1 <?> "Latin-1 character"
|
||
{-# INLINE latin1Char #-}
|
||
|
||
-- | @'charCategory' cat@ parses character in Unicode General Category
|
||
-- @cat@, see 'Data.Char.GeneralCategory'.
|
||
|
||
charCategory :: (MonadParsec e s m, Token s ~ Char)
|
||
=> GeneralCategory
|
||
-> m (Token s)
|
||
charCategory cat = satisfy ((== cat) . generalCategory) <?> categoryName cat
|
||
{-# INLINE charCategory #-}
|
||
|
||
-- | Return the human-readable name of Unicode General Category.
|
||
|
||
categoryName :: GeneralCategory -> String
|
||
categoryName = \case
|
||
UppercaseLetter -> "uppercase letter"
|
||
LowercaseLetter -> "lowercase letter"
|
||
TitlecaseLetter -> "titlecase letter"
|
||
ModifierLetter -> "modifier letter"
|
||
OtherLetter -> "other letter"
|
||
NonSpacingMark -> "non-spacing mark"
|
||
SpacingCombiningMark -> "spacing combining mark"
|
||
EnclosingMark -> "enclosing mark"
|
||
DecimalNumber -> "decimal number character"
|
||
LetterNumber -> "letter number character"
|
||
OtherNumber -> "other number character"
|
||
ConnectorPunctuation -> "connector punctuation"
|
||
DashPunctuation -> "dash punctuation"
|
||
OpenPunctuation -> "open punctuation"
|
||
ClosePunctuation -> "close punctuation"
|
||
InitialQuote -> "initial quote"
|
||
FinalQuote -> "final quote"
|
||
OtherPunctuation -> "other punctuation"
|
||
MathSymbol -> "math symbol"
|
||
CurrencySymbol -> "currency symbol"
|
||
ModifierSymbol -> "modifier symbol"
|
||
OtherSymbol -> "other symbol"
|
||
Space -> "white space"
|
||
LineSeparator -> "line separator"
|
||
ParagraphSeparator -> "paragraph separator"
|
||
Control -> "control character"
|
||
Format -> "format character"
|
||
Surrogate -> "surrogate character"
|
||
PrivateUse -> "private-use Unicode character"
|
||
NotAssigned -> "non-assigned Unicode character"
|
||
|
||
----------------------------------------------------------------------------
|
||
-- More general parsers
|
||
|
||
-- | @'char' c@ parses a single character @c@.
|
||
--
|
||
-- > semicolon = char ';'
|
||
|
||
char :: MonadParsec e s m => Token s -> m (Token s)
|
||
char c = token testChar (Just c)
|
||
where
|
||
f x = Tokens (x:|[])
|
||
testChar x =
|
||
if x == c
|
||
then Right x
|
||
else Left (pure (f x), E.singleton (f c))
|
||
{-# INLINE char #-}
|
||
|
||
-- | The same as 'char' but case-insensitive. This parser returns the
|
||
-- actually parsed character preserving its case.
|
||
--
|
||
-- >>> parseTest (char' 'e') "E"
|
||
-- 'E'
|
||
-- >>> parseTest (char' 'e') "G"
|
||
-- 1:1:
|
||
-- unexpected 'G'
|
||
-- expecting 'E' or 'e'
|
||
|
||
char' :: (MonadParsec e s m, Token s ~ Char) => Token s -> m (Token s)
|
||
char' c = choice [char c, char (swapCase c)]
|
||
where
|
||
swapCase x
|
||
| isUpper x = toLower x
|
||
| isLower x = toUpper x
|
||
| otherwise = x
|
||
{-# INLINE char' #-}
|
||
|
||
-- | This parser succeeds for any character. Returns the parsed character.
|
||
|
||
anyChar :: MonadParsec e s m => m (Token s)
|
||
anyChar = satisfy (const True) <?> "character"
|
||
{-# INLINE anyChar #-}
|
||
|
||
-- | Match any character but the given one. It's a good idea to attach a
|
||
-- 'label' to this parser manually.
|
||
--
|
||
-- @since 6.0.0
|
||
|
||
notChar :: MonadParsec e s m => Token s -> m (Token s)
|
||
notChar c = satisfy (/= c)
|
||
{-# INLINE notChar #-}
|
||
|
||
-- | @'oneOf' cs@ succeeds if the current character is in the supplied
|
||
-- collection of characters @cs@. Returns the parsed character. Note that
|
||
-- this parser cannot automatically generate the “expected” component of
|
||
-- error message, so usually you should label it manually with 'label' or
|
||
-- ('<?>').
|
||
--
|
||
-- See also: 'satisfy'.
|
||
--
|
||
-- > digit = oneOf ['0'..'9'] <?> "digit"
|
||
--
|
||
-- __Performance note__: prefer 'satisfy' when you can because it's faster
|
||
-- when you have only a couple of tokens to compare to:
|
||
--
|
||
-- > quoteFast = satisfy (\x -> x == '\'' || x == '\"')
|
||
-- > quoteSlow = oneOf "'\""
|
||
|
||
oneOf :: (Foldable f, MonadParsec e s m)
|
||
=> f (Token s)
|
||
-> m (Token s)
|
||
oneOf cs = satisfy (`elem` cs)
|
||
{-# INLINE oneOf #-}
|
||
|
||
-- | As the dual of 'oneOf', @'noneOf' cs@ succeeds if the current character
|
||
-- /not/ in the supplied list of characters @cs@. Returns the parsed
|
||
-- character. Note that this parser cannot automatically generate the
|
||
-- “expected” component of error message, so usually you should label it
|
||
-- manually with 'label' or ('<?>').
|
||
--
|
||
-- See also: 'satisfy'.
|
||
--
|
||
-- __Performance note__: prefer 'satisfy' and 'notChar' when you can because
|
||
-- it's faster.
|
||
|
||
noneOf :: (Foldable f, MonadParsec e s m)
|
||
=> f (Token s)
|
||
-> m (Token s)
|
||
noneOf cs = satisfy (`notElem` cs)
|
||
{-# INLINE noneOf #-}
|
||
|
||
-- | The parser @'satisfy' f@ succeeds for any character for which the
|
||
-- supplied function @f@ returns 'True'. Returns the character that is
|
||
-- actually parsed.
|
||
--
|
||
-- > digitChar = satisfy isDigit <?> "digit"
|
||
-- > oneOf cs = satisfy (`elem` cs)
|
||
|
||
satisfy :: MonadParsec e s m
|
||
=> (Token s -> Bool) -- ^ Predicate to apply
|
||
-> m (Token s)
|
||
satisfy f = token testChar Nothing
|
||
where
|
||
testChar x =
|
||
if f x
|
||
then Right x
|
||
else Left (pure (Tokens (x:|[])), E.empty)
|
||
{-# INLINE satisfy #-}
|
||
|
||
----------------------------------------------------------------------------
|
||
-- Sequence of characters
|
||
|
||
-- | @'string' s@ parses a sequence of characters given by @s@. Returns the
|
||
-- parsed string (i.e. @s@).
|
||
--
|
||
-- > divOrMod = string "div" <|> string "mod"
|
||
|
||
string :: MonadParsec e s m
|
||
=> Tokens s
|
||
-> m (Tokens s)
|
||
string = tokens (==)
|
||
{-# INLINE string #-}
|
||
|
||
-- | The same as 'string', but case-insensitive. On success returns string
|
||
-- cased as actually parsed input.
|
||
--
|
||
-- >>> parseTest (string' "foobar") "foObAr"
|
||
-- "foObAr"
|
||
|
||
string' :: (MonadParsec e s m, CI.FoldCase (Tokens s))
|
||
=> Tokens s
|
||
-> m (Tokens s)
|
||
string' = tokens ((==) `on` CI.mk)
|
||
{-# INLINE string' #-}
|