megaparsec/Text/Megaparsec/Char.hs

-- |
-- Module      :  Text.Megaparsec.Char
-- Copyright   :  © 2015 Megaparsec contributors
--                © 2007 Paolo Martini
--                © 1999–2001 Daan Leijen
-- License     :  BSD3
--
-- Maintainer  :  Mark Karpov <markkarpov@opmbx.org>
-- Stability   :  experimental
-- Portability :  portable
--
-- Commonly used character parsers.

module Text.Megaparsec.Char
  ( -- * Simple parsers
    newline
  , crlf
  , eol
  , tab
  , space
    -- * Categories of characters
  , controlChar
  , spaceChar
  , upperChar
  , lowerChar
  , letterChar
  , alphaNumChar
  , printChar
  , digitChar
  , octDigitChar
  , hexDigitChar
  , markChar
  , numberChar
  , punctuationChar
  , symbolChar
  , separatorChar
  , asciiChar
  , latin1Char
  , charCategory
  , categoryName
    -- * More general parsers
  , char
  , char'
  , anyChar
  , oneOf
  , oneOf'
  , noneOf
  , noneOf'
  , satisfy
    -- * Sequence of characters
  , string
  , string' )
where

import Control.Applicative ((<|>))
import Data.Char
import Data.List (nub)
import Data.Maybe (fromJust)

import Text.Megaparsec.Combinator
import Text.Megaparsec.Pos
import Text.Megaparsec.Prim
import Text.Megaparsec.ShowToken

-- | Parses a newline character.

newline :: Stream s m Char => ParsecT s u m Char
newline = char '\n' <?> "newline"

-- | Parses a carriage return character followed by a newline
-- character. Returns sequence of characters parsed.

crlf :: Stream s m Char => ParsecT s u m String
crlf = string "\r\n"

-- | Parses a CRLF (see 'crlf') or LF (see 'newline') end of line.
-- Returns the sequence of characters parsed.
--
-- > eol = (pure <$> newline) <|> crlf

eol :: Stream s m Char => ParsecT s u m String
eol = (pure <$> newline) <|> crlf <?> "end of line"

-- | Parses a tab character.

tab :: Stream s m Char => ParsecT s u m Char
tab = char '\t' <?> "tab"

-- | Skips /zero/ or more white space characters. See also 'skipMany' and
-- 'spaceChar'.

space :: Stream s m Char => ParsecT s u m ()
space = skipMany spaceChar

-- | Parses control characters, which are the non-printing characters of the
-- Latin-1 subset of Unicode.

controlChar :: Stream s m Char => ParsecT s u m Char
controlChar = satisfy isControl <?> "control character"

-- | Parses a Unicode space character, and the control characters: tab,
-- newline, carriage return, form feed, and vertical tab.

spaceChar :: Stream s m Char => ParsecT s u m Char
spaceChar = satisfy isSpace <?> "white space"

-- | Parses an upper-case or title-case alphabetic Unicode character. Title
-- case is used by a small number of letter ligatures like the
-- single-character form of Lj.

upperChar :: Stream s m Char => ParsecT s u m Char
upperChar = satisfy isUpper <?> "uppercase letter"

-- | Parses a lower-case alphabetic Unicode character.

lowerChar :: Stream s m Char => ParsecT s u m Char
lowerChar = satisfy isLower <?> "lowercase letter"

-- | Parses alphabetic Unicode characters: lower-case, upper-case and
-- title-case letters, plus letters of case-less scripts and modifiers
-- letters.

letterChar :: Stream s m Char => ParsecT s u m Char
letterChar = satisfy isLetter <?> "letter"

-- | Parses alphabetic or numeric digit Unicode characters.
--
-- Note that numeric digits outside the ASCII range are parsed by this
-- parser but not by 'digitChar'. Such digits may be part of identifiers but
-- are not used by the printer and reader to represent numbers.

alphaNumChar :: Stream s m Char => ParsecT s u m Char
alphaNumChar = satisfy isAlphaNum <?> "alphanumeric character"

-- | Parses printable Unicode characters: letters, numbers, marks,
-- punctuation, symbols and spaces.

printChar :: Stream s m Char => ParsecT s u m Char
printChar = satisfy isPrint <?> "printable character"

-- | Parses an ASCII digit, i.e between “0” and “9”.

digitChar :: Stream s m Char => ParsecT s u m Char
digitChar = satisfy isDigit <?> "digit"

-- | Parses an octal digit, i.e. between “0” and “7”.

octDigitChar :: Stream s m Char => ParsecT s u m Char
octDigitChar = satisfy isOctDigit <?> "octal digit"

-- | Parses a hexadecimal digit, i.e. between “0” and “9”, or “a” and “f”,
-- or “A” and “F”.

hexDigitChar :: Stream s m Char => ParsecT s u m Char
hexDigitChar = satisfy isHexDigit <?> "hexadecimal digit"

-- | Parses Unicode mark characters, for example accents and the like, which
-- combine with preceding characters.

markChar :: Stream s m Char => ParsecT s u m Char
markChar = satisfy isMark <?> "mark character"

-- | Parses Unicode numeric characters, including digits from various
-- scripts, Roman numerals, et cetera.

numberChar :: Stream s m Char => ParsecT s u m Char
numberChar = satisfy isNumber <?> "numeric character"

-- | Parses Unicode punctuation characters, including various kinds of
-- connectors, brackets and quotes.

punctuationChar :: Stream s m Char => ParsecT s u m Char
punctuationChar = satisfy isPunctuation <?> "punctuation"

-- | Parses Unicode symbol characters, including mathematical and currency
-- symbols.

symbolChar :: Stream s m Char => ParsecT s u m Char
symbolChar = satisfy isSymbol <?> "symbol"

-- | Parses Unicode space and separator characters.

separatorChar :: Stream s m Char => ParsecT s u m Char
separatorChar = satisfy isSeparator <?> "separator"

-- | Parses a character from the first 128 characters of the Unicode character set,
-- corresponding to the ASCII character set.

asciiChar :: Stream s m Char => ParsecT s u m Char
asciiChar = satisfy isAscii <?> "ASCII character"

-- | Parses a character from the first 256 characters of the Unicode
-- character set, corresponding to the ISO 8859-1 (Latin-1) character set.

latin1Char :: Stream s m Char => ParsecT s u m Char
latin1Char = satisfy isLatin1 <?> "Latin-1 character"

-- | @charCategory cat@ Parses character in Unicode General Category @cat@,
-- see 'Data.Char.GeneralCategory'.

charCategory :: Stream s m Char => GeneralCategory -> ParsecT s u m Char
charCategory cat = satisfy ((== cat) . generalCategory) <?> categoryName cat

-- | Returns human-readable name of Unicode General Category.

categoryName :: GeneralCategory -> String
categoryName cat =
  fromJust $ lookup cat
  [ (UppercaseLetter     , "uppercase letter")
  , (LowercaseLetter     , "lowercase letter")
  , (TitlecaseLetter     , "titlecase letter")
  , (ModifierLetter      , "modifier letter")
  , (OtherLetter         , "other letter")
  , (NonSpacingMark      , "non-spacing mark")
  , (SpacingCombiningMark, "spacing combining mark")
  , (EnclosingMark       , "enclosing mark")
  , (DecimalNumber       , "decimal number character")
  , (LetterNumber        , "letter number character")
  , (OtherNumber         , "other number character")
  , (ConnectorPunctuation, "connector punctuation")
  , (DashPunctuation     , "dash punctuation")
  , (OpenPunctuation     , "open punctuation")
  , (ClosePunctuation    , "close punctuation")
  , (InitialQuote        , "initial quote")
  , (FinalQuote          , "final quote")
  , (OtherPunctuation    , "other punctuation")
  , (MathSymbol          , "math symbol")
  , (CurrencySymbol      , "currency symbol")
  , (ModifierSymbol      , "modifier symbol")
  , (OtherSymbol         , "other symbol")
  , (Space               , "white space")
  , (LineSeparator       , "line separator")
  , (ParagraphSeparator  , "paragraph separator")
  , (Control             , "control character")
  , (Format              , "format character")
  , (Surrogate           , "surrogate character")
  , (PrivateUse          , "private-use Unicode character")
  , (NotAssigned         , "non-assigned Unicode character") ]

-- | @char c@ parses a single character @c@.
--
-- > semicolon = char ';'

char :: Stream s m Char => Char -> ParsecT s u m Char
char c = satisfy (== c) <?> showToken c

-- | The same as 'char' but case-insensitive. This parser returns actually
-- parsed character preserving its case.
--
-- >>> parseTest (char' 'e') "E"
-- 'E'
-- >>> parseTest (char' 'e') "G"
-- parse error at line 1, column 1:
-- unexpected 'G'
-- expecting 'E' or 'e'

char' :: Stream s m Char => Char -> ParsecT s u m Char
char' = choice . fmap char . extendi . pure

-- | Extends given list of characters adding uppercase version of every
-- lowercase characters and vice versa. Resulting list is guaranteed to have
-- no duplicates.

extendi :: String -> String
extendi cs = nub (cs >>= f)
  where f c | isLower c = [c, toUpper c]
            | isUpper c = [c, toLower c]
            | otherwise = [c]

-- | This parser succeeds for any character. Returns the parsed character.

anyChar :: Stream s m Char => ParsecT s u m Char
anyChar = satisfy (const True) <?> "character"

-- | @oneOf cs@ succeeds if the current character is in the supplied
-- list of characters @cs@. Returns the parsed character. Note that this
-- parser doesn't automatically generate “expected” component of error
-- message, so usually you should label it manually with 'label' or
-- ('<?>').
--
-- See also 'satisfy'.
--
-- > digit = oneOf ['0'..'9'] <?> "digit"

oneOf :: Stream s m Char => String -> ParsecT s u m Char
oneOf cs = satisfy (`elem` cs)

-- | The same as 'oneOf', but case-insensitive. Returns the parsed character
-- preserving its case.
--
-- > vowel = oneOf' "aeiou" <?> "vowel"

oneOf' :: Stream s m Char => String -> ParsecT s u m Char
oneOf' = oneOf . extendi

-- | As the dual of 'oneOf', @noneOf cs@ succeeds if the current
-- character /not/ in the supplied list of characters @cs@. Returns the
-- parsed character.

noneOf :: Stream s m Char => String -> ParsecT s u m Char
noneOf cs = satisfy (`notElem` cs)

-- | The same as 'noneOf', but case-insensitive.
--
-- > consonant = noneOf' "aeiou" <?> "consonant"

noneOf' :: Stream s m Char => String -> ParsecT s u m Char
noneOf' = noneOf . extendi

-- | The parser @satisfy f@ succeeds for any character for which the
-- supplied function @f@ returns 'True'. Returns the character that is
-- actually parsed.
--
-- > digitChar = satisfy isDigit <?> "digit"
-- > oneOf cs  = satisfy (`elem` cs)

satisfy :: Stream s m Char => (Char -> Bool) -> ParsecT s u m Char
satisfy f = token nextPos testChar
  where nextPos pos x _ = updatePosChar pos x
        testChar x      = if f x then Just x else Nothing

-- | @string s@ parses a sequence of characters given by @s@. Returns
-- the parsed string (i.e. @s@).
--
-- > divOrMod = string "div" <|> string "mod"

string :: Stream s m Char => String -> ParsecT s u m String
string = tokens updatePosString (==)

-- | The same as 'string', but case-insensitive. On success returns string
-- cased as argument of the function.
--
-- >>> parseTest (string' "foobar") "foObAr"
-- "foobar"

string' :: Stream s m Char => String -> ParsecT s u m String
string' = tokens updatePosString test
  where test x y = toLower x == toLower y