mirror of
https://github.com/mrkkrp/megaparsec.git
synced 2024-12-24 08:43:14 +03:00
339 lines
11 KiB
Haskell
339 lines
11 KiB
Haskell
-- |
|
||
-- Module : Text.Megaparsec.Char
|
||
-- Copyright : © 2015 Megaparsec contributors
|
||
-- © 2007 Paolo Martini
|
||
-- © 1999–2001 Daan Leijen
|
||
-- License : BSD3
|
||
--
|
||
-- Maintainer : Mark Karpov <markkarpov@opmbx.org>
|
||
-- Stability : experimental
|
||
-- Portability : portable
|
||
--
|
||
-- Commonly used character parsers.
|
||
|
||
module Text.Megaparsec.Char
|
||
( -- * Simple parsers
|
||
newline
|
||
, crlf
|
||
, eol
|
||
, tab
|
||
, space
|
||
-- * Categories of characters
|
||
, controlChar
|
||
, spaceChar
|
||
, upperChar
|
||
, lowerChar
|
||
, letterChar
|
||
, alphaNumChar
|
||
, printChar
|
||
, digitChar
|
||
, octDigitChar
|
||
, hexDigitChar
|
||
, markChar
|
||
, numberChar
|
||
, punctuationChar
|
||
, symbolChar
|
||
, separatorChar
|
||
, asciiChar
|
||
, latin1Char
|
||
, charCategory
|
||
, categoryName
|
||
-- * More general parsers
|
||
, char
|
||
, char'
|
||
, anyChar
|
||
, oneOf
|
||
, oneOf'
|
||
, noneOf
|
||
, noneOf'
|
||
, satisfy
|
||
-- * Sequence of characters
|
||
, string
|
||
, string' )
|
||
where
|
||
|
||
import Control.Applicative ((<|>))
|
||
import Data.Char
|
||
import Data.List (nub)
|
||
import Data.Maybe (fromJust)
|
||
|
||
import Text.Megaparsec.Combinator
|
||
import Text.Megaparsec.Pos
|
||
import Text.Megaparsec.Prim
|
||
import Text.Megaparsec.ShowToken
|
||
|
||
-- | Parses a newline character.
|
||
|
||
newline :: Stream s m Char => ParsecT s u m Char
|
||
newline = char '\n' <?> "newline"
|
||
|
||
-- | Parses a carriage return character followed by a newline
|
||
-- character. Returns sequence of characters parsed.
|
||
|
||
crlf :: Stream s m Char => ParsecT s u m String
|
||
crlf = string "\r\n"
|
||
|
||
-- | Parses a CRLF (see 'crlf') or LF (see 'newline') end of line.
|
||
-- Returns the sequence of characters parsed.
|
||
--
|
||
-- > eol = (pure <$> newline) <|> crlf
|
||
|
||
eol :: Stream s m Char => ParsecT s u m String
|
||
eol = (pure <$> newline) <|> crlf <?> "end of line"
|
||
|
||
-- | Parses a tab character.
|
||
|
||
tab :: Stream s m Char => ParsecT s u m Char
|
||
tab = char '\t' <?> "tab"
|
||
|
||
-- | Skips /zero/ or more white space characters. See also 'skipMany' and
|
||
-- 'spaceChar'.
|
||
|
||
space :: Stream s m Char => ParsecT s u m ()
|
||
space = skipMany spaceChar
|
||
|
||
-- | Parses control characters, which are the non-printing characters of the
|
||
-- Latin-1 subset of Unicode.
|
||
|
||
controlChar :: Stream s m Char => ParsecT s u m Char
|
||
controlChar = satisfy isControl <?> "control character"
|
||
|
||
-- | Parses a Unicode space character, and the control characters: tab,
|
||
-- newline, carriage return, form feed, and vertical tab.
|
||
|
||
spaceChar :: Stream s m Char => ParsecT s u m Char
|
||
spaceChar = satisfy isSpace <?> "white space"
|
||
|
||
-- | Parses an upper-case or title-case alphabetic Unicode character. Title
|
||
-- case is used by a small number of letter ligatures like the
|
||
-- single-character form of Lj.
|
||
|
||
upperChar :: Stream s m Char => ParsecT s u m Char
|
||
upperChar = satisfy isUpper <?> "uppercase letter"
|
||
|
||
-- | Parses a lower-case alphabetic Unicode character.
|
||
|
||
lowerChar :: Stream s m Char => ParsecT s u m Char
|
||
lowerChar = satisfy isLower <?> "lowercase letter"
|
||
|
||
-- | Parses alphabetic Unicode characters: lower-case, upper-case and
|
||
-- title-case letters, plus letters of case-less scripts and modifiers
|
||
-- letters.
|
||
|
||
letterChar :: Stream s m Char => ParsecT s u m Char
|
||
letterChar = satisfy isLetter <?> "letter"
|
||
|
||
-- | Parses alphabetic or numeric digit Unicode characters.
|
||
--
|
||
-- Note that numeric digits outside the ASCII range are parsed by this
|
||
-- parser but not by 'digitChar'. Such digits may be part of identifiers but
|
||
-- are not used by the printer and reader to represent numbers.
|
||
|
||
alphaNumChar :: Stream s m Char => ParsecT s u m Char
|
||
alphaNumChar = satisfy isAlphaNum <?> "alphanumeric character"
|
||
|
||
-- | Parses printable Unicode characters: letters, numbers, marks,
|
||
-- punctuation, symbols and spaces.
|
||
|
||
printChar :: Stream s m Char => ParsecT s u m Char
|
||
printChar = satisfy isPrint <?> "printable character"
|
||
|
||
-- | Parses an ASCII digit, i.e between “0” and “9”.
|
||
|
||
digitChar :: Stream s m Char => ParsecT s u m Char
|
||
digitChar = satisfy isDigit <?> "digit"
|
||
|
||
-- | Parses an octal digit, i.e. between “0” and “7”.
|
||
|
||
octDigitChar :: Stream s m Char => ParsecT s u m Char
|
||
octDigitChar = satisfy isOctDigit <?> "octal digit"
|
||
|
||
-- | Parses a hexadecimal digit, i.e. between “0” and “9”, or “a” and “f”,
|
||
-- or “A” and “F”.
|
||
|
||
hexDigitChar :: Stream s m Char => ParsecT s u m Char
|
||
hexDigitChar = satisfy isHexDigit <?> "hexadecimal digit"
|
||
|
||
-- | Parses Unicode mark characters, for example accents and the like, which
|
||
-- combine with preceding characters.
|
||
|
||
markChar :: Stream s m Char => ParsecT s u m Char
|
||
markChar = satisfy isMark <?> "mark character"
|
||
|
||
-- | Parses Unicode numeric characters, including digits from various
|
||
-- scripts, Roman numerals, et cetera.
|
||
|
||
numberChar :: Stream s m Char => ParsecT s u m Char
|
||
numberChar = satisfy isNumber <?> "numeric character"
|
||
|
||
-- | Parses Unicode punctuation characters, including various kinds of
|
||
-- connectors, brackets and quotes.
|
||
|
||
punctuationChar :: Stream s m Char => ParsecT s u m Char
|
||
punctuationChar = satisfy isPunctuation <?> "punctuation"
|
||
|
||
-- | Parses Unicode symbol characters, including mathematical and currency
|
||
-- symbols.
|
||
|
||
symbolChar :: Stream s m Char => ParsecT s u m Char
|
||
symbolChar = satisfy isSymbol <?> "symbol"
|
||
|
||
-- | Parses Unicode space and separator characters.
|
||
|
||
separatorChar :: Stream s m Char => ParsecT s u m Char
|
||
separatorChar = satisfy isSeparator <?> "separator"
|
||
|
||
-- | Parses a character from the first 128 characters of the Unicode character set,
|
||
-- corresponding to the ASCII character set.
|
||
|
||
asciiChar :: Stream s m Char => ParsecT s u m Char
|
||
asciiChar = satisfy isAscii <?> "ASCII character"
|
||
|
||
-- | Parses a character from the first 256 characters of the Unicode
|
||
-- character set, corresponding to the ISO 8859-1 (Latin-1) character set.
|
||
|
||
latin1Char :: Stream s m Char => ParsecT s u m Char
|
||
latin1Char = satisfy isLatin1 <?> "Latin-1 character"
|
||
|
||
-- | @charCategory cat@ Parses character in Unicode General Category @cat@,
|
||
-- see 'Data.Char.GeneralCategory'.
|
||
|
||
charCategory :: Stream s m Char => GeneralCategory -> ParsecT s u m Char
|
||
charCategory cat = satisfy ((== cat) . generalCategory) <?> categoryName cat
|
||
|
||
-- | Returns human-readable name of Unicode General Category.
|
||
|
||
categoryName :: GeneralCategory -> String
|
||
categoryName cat =
|
||
fromJust $ lookup cat
|
||
[ (UppercaseLetter , "uppercase letter")
|
||
, (LowercaseLetter , "lowercase letter")
|
||
, (TitlecaseLetter , "titlecase letter")
|
||
, (ModifierLetter , "modifier letter")
|
||
, (OtherLetter , "other letter")
|
||
, (NonSpacingMark , "non-spacing mark")
|
||
, (SpacingCombiningMark, "spacing combining mark")
|
||
, (EnclosingMark , "enclosing mark")
|
||
, (DecimalNumber , "decimal number character")
|
||
, (LetterNumber , "letter number character")
|
||
, (OtherNumber , "other number character")
|
||
, (ConnectorPunctuation, "connector punctuation")
|
||
, (DashPunctuation , "dash punctuation")
|
||
, (OpenPunctuation , "open punctuation")
|
||
, (ClosePunctuation , "close punctuation")
|
||
, (InitialQuote , "initial quote")
|
||
, (FinalQuote , "final quote")
|
||
, (OtherPunctuation , "other punctuation")
|
||
, (MathSymbol , "math symbol")
|
||
, (CurrencySymbol , "currency symbol")
|
||
, (ModifierSymbol , "modifier symbol")
|
||
, (OtherSymbol , "other symbol")
|
||
, (Space , "white space")
|
||
, (LineSeparator , "line separator")
|
||
, (ParagraphSeparator , "paragraph separator")
|
||
, (Control , "control character")
|
||
, (Format , "format character")
|
||
, (Surrogate , "surrogate character")
|
||
, (PrivateUse , "private-use Unicode character")
|
||
, (NotAssigned , "non-assigned Unicode character") ]
|
||
|
||
-- | @char c@ parses a single character @c@.
|
||
--
|
||
-- > semicolon = char ';'
|
||
|
||
char :: Stream s m Char => Char -> ParsecT s u m Char
|
||
char c = satisfy (== c) <?> showToken c
|
||
|
||
-- | The same as 'char' but case-insensitive. This parser returns actually
|
||
-- parsed character preserving its case.
|
||
--
|
||
-- >>> parseTest (char' 'e') "E"
|
||
-- 'E'
|
||
-- >>> parseTest (char' 'e') "G"
|
||
-- parse error at line 1, column 1:
|
||
-- unexpected 'G'
|
||
-- expecting 'E' or 'e'
|
||
|
||
char' :: Stream s m Char => Char -> ParsecT s u m Char
|
||
char' = choice . fmap char . extendi . pure
|
||
|
||
-- | Extends given list of characters adding uppercase version of every
|
||
-- lowercase characters and vice versa. Resulting list is guaranteed to have
|
||
-- no duplicates.
|
||
|
||
extendi :: String -> String
|
||
extendi cs = nub (cs >>= f)
|
||
where f c | isLower c = [c, toUpper c]
|
||
| isUpper c = [c, toLower c]
|
||
| otherwise = [c]
|
||
|
||
-- | This parser succeeds for any character. Returns the parsed character.
|
||
|
||
anyChar :: Stream s m Char => ParsecT s u m Char
|
||
anyChar = satisfy (const True) <?> "character"
|
||
|
||
-- | @oneOf cs@ succeeds if the current character is in the supplied
|
||
-- list of characters @cs@. Returns the parsed character. Note that this
|
||
-- parser doesn't automatically generate “expected” component of error
|
||
-- message, so usually you should label it manually with 'label' or
|
||
-- ('<?>').
|
||
--
|
||
-- See also 'satisfy'.
|
||
--
|
||
-- > digit = oneOf ['0'..'9'] <?> "digit"
|
||
|
||
oneOf :: Stream s m Char => String -> ParsecT s u m Char
|
||
oneOf cs = satisfy (`elem` cs)
|
||
|
||
-- | The same as 'oneOf', but case-insensitive. Returns the parsed character
|
||
-- preserving its case.
|
||
--
|
||
-- > vowel = oneOf' "aeiou" <?> "vowel"
|
||
|
||
oneOf' :: Stream s m Char => String -> ParsecT s u m Char
|
||
oneOf' = oneOf . extendi
|
||
|
||
-- | As the dual of 'oneOf', @noneOf cs@ succeeds if the current
|
||
-- character /not/ in the supplied list of characters @cs@. Returns the
|
||
-- parsed character.
|
||
|
||
noneOf :: Stream s m Char => String -> ParsecT s u m Char
|
||
noneOf cs = satisfy (`notElem` cs)
|
||
|
||
-- | The same as 'noneOf', but case-insensitive.
|
||
--
|
||
-- > consonant = noneOf' "aeiou" <?> "consonant"
|
||
|
||
noneOf' :: Stream s m Char => String -> ParsecT s u m Char
|
||
noneOf' = noneOf . extendi
|
||
|
||
-- | The parser @satisfy f@ succeeds for any character for which the
|
||
-- supplied function @f@ returns 'True'. Returns the character that is
|
||
-- actually parsed.
|
||
--
|
||
-- > digitChar = satisfy isDigit <?> "digit"
|
||
-- > oneOf cs = satisfy (`elem` cs)
|
||
|
||
satisfy :: Stream s m Char => (Char -> Bool) -> ParsecT s u m Char
|
||
satisfy f = token nextPos testChar
|
||
where nextPos pos x _ = updatePosChar pos x
|
||
testChar x = if f x then Just x else Nothing
|
||
|
||
-- | @string s@ parses a sequence of characters given by @s@. Returns
|
||
-- the parsed string (i.e. @s@).
|
||
--
|
||
-- > divOrMod = string "div" <|> string "mod"
|
||
|
||
string :: Stream s m Char => String -> ParsecT s u m String
|
||
string = tokens updatePosString (==)
|
||
|
||
-- | The same as 'string', but case-insensitive. On success returns string
|
||
-- cased as argument of the function.
|
||
--
|
||
-- >>> parseTest (string' "foobar") "foObAr"
|
||
-- "foobar"
|
||
|
||
string' :: Stream s m Char => String -> ParsecT s u m String
|
||
string' = tokens updatePosString test
|
||
where test x y = toLower x == toLower y
|