megaparsec/Text/Megaparsec/Char.hs

280 lines
8.9 KiB
Haskell
Raw Normal View History

2015-07-28 16:32:19 +03:00
-- |
-- Module : Text.Megaparsec.Char
-- Copyright : © 2015 Megaparsec contributors
-- © 2007 Paolo Martini
-- © 19992001 Daan Leijen
2015-07-28 16:32:19 +03:00
-- License : BSD3
--
-- Maintainer : Mark Karpov <markkarpov@opmbx.org>
2015-07-29 11:38:32 +03:00
-- Stability : experimental
2015-07-28 16:32:19 +03:00
-- Portability : portable
--
-- Commonly used character parsers.
module Text.Megaparsec.Char
2015-08-12 20:51:06 +03:00
( newline
, crlf
, eol
, tab
, space
, controlChar
, spaceChar
, upperChar
, lowerChar
, letterChar
, alphaNumChar
, printChar
, digitChar
, octDigitChar
, hexDigitChar
, markChar
, numberChar
, punctuationChar
, symbolChar
, separatorChar
, asciiChar
, latin1Char
, charCategory
, categoryName
, char
, anyChar
, oneOf
, noneOf
, satisfy
, string )
2015-07-28 16:32:19 +03:00
where
import Control.Applicative ((<|>))
2015-07-28 16:32:19 +03:00
import Data.Char
import Data.Maybe (fromJust)
2015-07-28 16:32:19 +03:00
2015-08-12 15:41:22 +03:00
import Text.Megaparsec.Combinator
import Text.Megaparsec.Pos
import Text.Megaparsec.Prim
import Text.Megaparsec.ShowToken
2015-07-28 16:32:19 +03:00
2015-08-12 15:41:22 +03:00
-- | Parses a newline character.
2015-07-28 16:32:19 +03:00
newline :: Stream s m Char => ParsecT s u m Char
newline = char '\n' <?> "newline"
2015-07-28 16:32:19 +03:00
2015-08-12 15:41:22 +03:00
-- | Parses a carriage return character followed by a newline
-- character. Returns sequence of characters parsed.
2015-07-28 16:32:19 +03:00
crlf :: Stream s m Char => ParsecT s u m String
crlf = string "\r\n"
2015-07-28 16:32:19 +03:00
2015-08-12 15:41:22 +03:00
-- | Parses a CRLF (see 'crlf') or LF (see 'newline') end of line.
-- Returns the sequence of characters parsed.
2015-07-28 16:32:19 +03:00
--
2015-08-12 15:41:22 +03:00
-- > eol = (pure <$> newline) <|> crlf
2015-07-28 16:32:19 +03:00
eol :: Stream s m Char => ParsecT s u m String
eol = (pure <$> newline) <|> crlf <?> "end of line"
2015-07-28 16:32:19 +03:00
2015-08-12 15:41:22 +03:00
-- | Parses a tab character.
2015-07-28 16:32:19 +03:00
tab :: Stream s m Char => ParsecT s u m Char
tab = char '\t' <?> "tab"
-- | Skips /zero/ or more white space characters. See also 'skipMany' and
-- 'spaceChar'.
space :: Stream s m Char => ParsecT s u m ()
space = skipMany spaceChar
2015-08-12 15:41:22 +03:00
-- | Parses control characters, which are the non-printing characters of the
-- Latin-1 subset of Unicode.
2015-08-12 15:41:22 +03:00
controlChar :: Stream s m Char => ParsecT s u m Char
controlChar = satisfy isControl <?> "control character"
2015-07-28 16:32:19 +03:00
-- | Parses a Unicode space character, and the control characters: tab,
-- newline, carriage return, form feed, and vertical tab.
spaceChar :: Stream s m Char => ParsecT s u m Char
spaceChar = satisfy isSpace <?> "white space"
-- | Parses an upper-case or title-case alphabetic Unicode character. Title
-- case is used by a small number of letter ligatures like the
-- single-character form of Lj.
upperChar :: Stream s m Char => ParsecT s u m Char
upperChar = satisfy isUpper <?> "uppercase letter"
-- | Parses a lower-case alphabetic Unicode character.
lowerChar :: Stream s m Char => ParsecT s u m Char
lowerChar = satisfy isLower <?> "lowercase letter"
-- | Parses alphabetic Unicode characters: lower-case, upper-case and
-- title-case letters, plus letters of case-less scripts and modifiers
-- letters.
letterChar :: Stream s m Char => ParsecT s u m Char
letterChar = satisfy isLetter <?> "letter"
-- | Parses alphabetic or numeric digit Unicode characters.
--
-- Note that numeric digits outside the ASCII range are parsed by this
-- parser but not by 'digitChar'. Such digits may be part of identifiers but
-- are not used by the printer and reader to represent numbers.
2015-07-28 16:32:19 +03:00
alphaNumChar :: Stream s m Char => ParsecT s u m Char
alphaNumChar = satisfy isAlphaNum <?> "alphanumeric character"
2015-07-28 16:32:19 +03:00
-- | Parses printable Unicode characters: letters, numbers, marks,
-- punctuation, symbols and spaces.
2015-07-28 16:32:19 +03:00
printChar :: Stream s m Char => ParsecT s u m Char
printChar = satisfy isPrint <?> "printable character"
2015-07-28 16:32:19 +03:00
-- | Parses an ASCII digit, i.e between “0” and “9”.
2015-07-28 16:32:19 +03:00
digitChar :: Stream s m Char => ParsecT s u m Char
digitChar = satisfy isDigit <?> "digit"
2015-07-28 16:32:19 +03:00
-- | Parses an octal digit, i.e. between “0” and “7”.
2015-07-28 16:32:19 +03:00
octDigitChar :: Stream s m Char => ParsecT s u m Char
octDigitChar = satisfy isOctDigit <?> "octal digit"
2015-07-28 16:32:19 +03:00
-- | Parses a hexadecimal digit, i.e. between “0” and “9”, or “a” and “f”,
-- or “A” and “F”.
2015-07-28 16:32:19 +03:00
hexDigitChar :: Stream s m Char => ParsecT s u m Char
hexDigitChar = satisfy isHexDigit <?> "hexadecimal digit"
2015-08-12 15:41:22 +03:00
-- | Parses Unicode mark characters, for example accents and the like, which
-- combine with preceding characters.
markChar :: Stream s m Char => ParsecT s u m Char
markChar = satisfy isMark <?> "mark character"
-- | Parses Unicode numeric characters, including digits from various
-- scripts, Roman numerals, et cetera.
numberChar :: Stream s m Char => ParsecT s u m Char
numberChar = satisfy isNumber <?> "numeric character"
-- | Parses Unicode punctuation characters, including various kinds of
-- connectors, brackets and quotes.
punctuationChar :: Stream s m Char => ParsecT s u m Char
punctuationChar = satisfy isPunctuation <?> "punctuation"
-- | Parses Unicode symbol characters, including mathematical and currency
-- symbols.
symbolChar :: Stream s m Char => ParsecT s u m Char
symbolChar = satisfy isSymbol <?> "symbol"
-- | Parses Unicode space and separator characters.
separatorChar :: Stream s m Char => ParsecT s u m Char
separatorChar = satisfy isSeparator <?> "separator"
-- | Parses a character from the first 128 characters of the Unicode character set,
-- corresponding to the ASCII character set.
asciiChar :: Stream s m Char => ParsecT s u m Char
asciiChar = satisfy isAscii <?> "ASCII character"
-- | Parses a character from the first 256 characters of the Unicode
-- character set, corresponding to the ISO 8859-1 (Latin-1) character set.
latin1Char :: Stream s m Char => ParsecT s u m Char
latin1Char = satisfy isLatin1 <?> "Latin-1 character"
-- | @charCategory cat@ Parses character in Unicode General Category @cat@,
-- see 'Data.Char.GeneralCategory'.
charCategory :: Stream s m Char => GeneralCategory -> ParsecT s u m Char
charCategory cat = satisfy ((== cat) . generalCategory) <?> categoryName cat
-- | Returns human-readable name of Unicode General Category.
categoryName :: GeneralCategory -> String
categoryName cat =
fromJust $ lookup cat
[ (UppercaseLetter , "uppercase letter")
, (LowercaseLetter , "lowercase letter")
, (TitlecaseLetter , "titlecase letter")
, (ModifierLetter , "modifier letter")
, (OtherLetter , "other letter")
, (NonSpacingMark , "non-spacing mark")
, (SpacingCombiningMark, "spacing combining mark")
, (EnclosingMark , "enclosing mark")
, (DecimalNumber , "decimal number character")
, (LetterNumber , "letter number character")
, (OtherNumber , "other number character")
, (ConnectorPunctuation, "connector punctuation")
, (DashPunctuation , "dash punctuation")
, (OpenPunctuation , "open punctuation")
, (ClosePunctuation , "close punctuation")
, (InitialQuote , "initial quote")
, (FinalQuote , "final quote")
, (OtherPunctuation , "other punctuation")
, (MathSymbol , "math symbol")
, (CurrencySymbol , "currency symbol")
, (ModifierSymbol , "modifier symbol")
, (OtherSymbol , "other symbol")
, (Space , "white space")
, (LineSeparator , "line separator")
, (ParagraphSeparator , "paragraph separator")
, (Control , "control character")
, (Format , "format character")
, (Surrogate , "surrogate character")
, (PrivateUse , "private-use Unicode character")
, (NotAssigned , "non-assigned Unicode character") ]
2015-08-12 15:41:22 +03:00
2015-07-28 16:32:19 +03:00
-- | @char c@ parses a single character @c@.
--
2015-08-12 15:41:22 +03:00
-- > semicolon = char ';'
2015-07-28 16:32:19 +03:00
char :: Stream s m Char => Char -> ParsecT s u m Char
char c = satisfy (== c) <?> showToken c
2015-07-28 16:32:19 +03:00
-- | This parser succeeds for any character. Returns the parsed character.
anyChar :: Stream s m Char => ParsecT s u m Char
anyChar = satisfy (const True) <?> "character"
2015-07-28 16:32:19 +03:00
-- | @oneOf cs@ succeeds if the current character is in the supplied
-- list of characters @cs@. Returns the parsed character. See also
-- 'satisfy'.
--
-- > vowel = oneOf "aeiou" <?> "vowel"
oneOf :: Stream s m Char => String -> ParsecT s u m Char
oneOf cs = satisfy (`elem` cs)
-- | As the dual of 'oneOf', @noneOf cs@ succeeds if the current
-- character /not/ in the supplied list of characters @cs@. Returns the
-- parsed character.
--
-- > consonant = noneOf "aeiou" <?> "consonant"
noneOf :: Stream s m Char => String -> ParsecT s u m Char
noneOf cs = satisfy (`notElem` cs)
2015-07-28 16:32:19 +03:00
-- | The parser @satisfy f@ succeeds for any character for which the
-- supplied function @f@ returns 'True'. Returns the character that is
-- actually parsed.
--
-- > digit = satisfy isDigit
-- > oneOf cs = satisfy (`elem` cs)
satisfy :: Stream s m Char => (Char -> Bool) -> ParsecT s u m Char
satisfy f = token nextPos testChar
2015-08-12 20:51:06 +03:00
where nextPos pos x _ = updatePosChar pos x
testChar x = if f x then Just x else Nothing
2015-07-28 16:32:19 +03:00
-- | @string s@ parses a sequence of characters given by @s@. Returns
-- the parsed string (i.e. @s@).
--
-- > divOrMod = string "div" <|> string "mod"
string :: Stream s m Char => String -> ParsecT s u m String
string = tokens updatePosString