megaparsec/Text/Megaparsec/Char.hs

-- |
-- Module      :  Text.Megaparsec.Char
-- Copyright   :  © 2015–2017 Megaparsec contributors
--                © 2007 Paolo Martini
--                © 1999–2001 Daan Leijen
-- License     :  FreeBSD
--
-- Maintainer  :  Mark Karpov <markkarpov92@gmail.com>
-- Stability   :  experimental
-- Portability :  non-portable
--
-- Commonly used character parsers.

{-# LANGUAGE CPP              #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE LambdaCase       #-}
{-# LANGUAGE TypeFamilies     #-}

module Text.Megaparsec.Char
  ( -- * Simple parsers
    newline
  , crlf
  , eol
  , tab
  , space
    -- * Categories of characters
  , controlChar
  , spaceChar
  , upperChar
  , lowerChar
  , letterChar
  , alphaNumChar
  , printChar
  , digitChar
  , octDigitChar
  , hexDigitChar
  , markChar
  , numberChar
  , punctuationChar
  , symbolChar
  , separatorChar
  , asciiChar
  , latin1Char
  , charCategory
  , categoryName
    -- * More general parsers
  , char
  , char'
  , anyChar
  , oneOf
  , oneOf'
  , noneOf
  , noneOf'
  , satisfy
    -- * Sequence of characters
  , string
  , string' )
where

import Control.Applicative
import Data.Char
import Data.List.NonEmpty (NonEmpty (..))
import qualified Data.Set as E

import Text.Megaparsec

#if !MIN_VERSION_base(4,8,0)
import Data.Foldable (Foldable (), any, elem, notElem)
import Prelude hiding (any, elem, notElem)
#endif

----------------------------------------------------------------------------
-- Simple parsers

-- | Parse a newline character.

newline :: (MonadParsec e s m, Token s ~ Char) => m Char
newline = char '\n'
{-# INLINE newline #-}

-- | Parse a carriage return character followed by a newline character.
-- Return the sequence of characters parsed.

crlf :: (MonadParsec e s m, Token s ~ Char) => m String
crlf = string "\r\n"
{-# INLINE crlf #-}

-- | Parse a CRLF (see 'crlf') or LF (see 'newline') end of line. Return the
-- sequence of characters parsed.
--
-- > eol = (pure <$> newline) <|> crlf <?> "end of line"

eol :: (MonadParsec e s m, Token s ~ Char) => m String
eol = (pure <$> newline) <|> crlf <?> "end of line"
{-# INLINE eol #-}

-- | Parse a tab character.

tab :: (MonadParsec e s m, Token s ~ Char) => m Char
tab = char '\t'
{-# INLINE tab #-}

-- | Skip /zero/ or more white space characters.
--
-- See also: 'skipMany' and 'spaceChar'.

space :: (MonadParsec e s m, Token s ~ Char) => m ()
space = skipMany spaceChar
{-# INLINE space #-}

----------------------------------------------------------------------------
-- Categories of characters

-- | Parse a control character (a non-printing character of the Latin-1
-- subset of Unicode).

controlChar :: (MonadParsec e s m, Token s ~ Char) => m Char
controlChar = satisfy isControl <?> "control character"
{-# INLINE controlChar #-}

-- | Parse a Unicode space character, and the control characters: tab,
-- newline, carriage return, form feed, and vertical tab.

spaceChar :: (MonadParsec e s m, Token s ~ Char) => m Char
spaceChar = satisfy isSpace <?> "white space"
{-# INLINE spaceChar #-}

-- | Parse an upper-case or title-case alphabetic Unicode character. Title
-- case is used by a small number of letter ligatures like the
-- single-character form of Lj.

upperChar :: (MonadParsec e s m, Token s ~ Char) => m Char
upperChar = satisfy isUpper <?> "uppercase letter"
{-# INLINE upperChar #-}

-- | Parse a lower-case alphabetic Unicode character.

lowerChar :: (MonadParsec e s m, Token s ~ Char) => m Char
lowerChar = satisfy isLower <?> "lowercase letter"
{-# INLINE lowerChar #-}

-- | Parse an alphabetic Unicode character: lower-case, upper-case, or
-- title-case letter, or a letter of case-less scripts\/modifier letter.

letterChar :: (MonadParsec e s m, Token s ~ Char) => m Char
letterChar = satisfy isLetter <?> "letter"
{-# INLINE letterChar #-}

-- | Parse an alphabetic or numeric digit Unicode characters.
--
-- Note that the numeric digits outside the ASCII range are parsed by this
-- parser but not by 'digitChar'. Such digits may be part of identifiers but
-- are not used by the printer and reader to represent numbers.

alphaNumChar :: (MonadParsec e s m, Token s ~ Char) => m Char
alphaNumChar = satisfy isAlphaNum <?> "alphanumeric character"
{-# INLINE alphaNumChar #-}

-- | Parse a printable Unicode character: letter, number, mark, punctuation,
-- symbol or space.

printChar :: (MonadParsec e s m, Token s ~ Char) => m Char
printChar = satisfy isPrint <?> "printable character"
{-# INLINE printChar #-}

-- | Parse an ASCII digit, i.e between “0” and “9”.

digitChar :: (MonadParsec e s m, Token s ~ Char) => m Char
digitChar = satisfy isDigit <?> "digit"
{-# INLINE digitChar #-}

-- | Parse an octal digit, i.e. between “0” and “7”.

octDigitChar :: (MonadParsec e s m, Token s ~ Char) => m Char
octDigitChar = satisfy isOctDigit <?> "octal digit"
{-# INLINE octDigitChar #-}

-- | Parse a hexadecimal digit, i.e. between “0” and “9”, or “a” and “f”, or
-- “A” and “F”.

hexDigitChar :: (MonadParsec e s m, Token s ~ Char) => m Char
hexDigitChar = satisfy isHexDigit <?> "hexadecimal digit"
{-# INLINE hexDigitChar #-}

-- | Parse a Unicode mark character (accents and the like), which combines
-- with preceding characters.

markChar :: (MonadParsec e s m, Token s ~ Char) => m Char
markChar = satisfy isMark <?> "mark character"
{-# INLINE markChar #-}

-- | Parse a Unicode numeric character, including digits from various
-- scripts, Roman numerals, etc.

numberChar :: (MonadParsec e s m, Token s ~ Char) => m Char
numberChar = satisfy isNumber <?> "numeric character"
{-# INLINE numberChar #-}

-- | Parse a Unicode punctuation character, including various kinds of
-- connectors, brackets and quotes.

punctuationChar :: (MonadParsec e s m, Token s ~ Char) => m Char
punctuationChar = satisfy isPunctuation <?> "punctuation"
{-# INLINE punctuationChar #-}

-- | Parse a Unicode symbol characters, including mathematical and currency
-- symbols.

symbolChar :: (MonadParsec e s m, Token s ~ Char) => m Char
symbolChar = satisfy isSymbol <?> "symbol"
{-# INLINE symbolChar #-}

-- | Parse a Unicode space and separator characters.

separatorChar :: (MonadParsec e s m, Token s ~ Char) => m Char
separatorChar = satisfy isSeparator <?> "separator"
{-# INLINE separatorChar #-}

-- | Parse a character from the first 128 characters of the Unicode
-- character set, corresponding to the ASCII character set.

asciiChar :: (MonadParsec e s m, Token s ~ Char) => m Char
asciiChar = satisfy isAscii <?> "ASCII character"
{-# INLINE asciiChar #-}

-- | Parse a character from the first 256 characters of the Unicode
-- character set, corresponding to the ISO 8859-1 (Latin-1) character set.

latin1Char :: (MonadParsec e s m, Token s ~ Char) => m Char
latin1Char = satisfy isLatin1 <?> "Latin-1 character"
{-# INLINE latin1Char #-}

-- | @charCategory cat@ parses character in Unicode General Category @cat@,
-- see 'Data.Char.GeneralCategory'.

charCategory :: (MonadParsec e s m, Token s ~ Char) => GeneralCategory -> m Char
charCategory cat = satisfy ((== cat) . generalCategory) <?> categoryName cat
{-# INLINE charCategory #-}

-- | Return the human-readable name of Unicode General Category.

categoryName :: GeneralCategory -> String
categoryName = \case
  UppercaseLetter      -> "uppercase letter"
  LowercaseLetter      -> "lowercase letter"
  TitlecaseLetter      -> "titlecase letter"
  ModifierLetter       -> "modifier letter"
  OtherLetter          -> "other letter"
  NonSpacingMark       -> "non-spacing mark"
  SpacingCombiningMark -> "spacing combining mark"
  EnclosingMark        -> "enclosing mark"
  DecimalNumber        -> "decimal number character"
  LetterNumber         -> "letter number character"
  OtherNumber          -> "other number character"
  ConnectorPunctuation -> "connector punctuation"
  DashPunctuation      -> "dash punctuation"
  OpenPunctuation      -> "open punctuation"
  ClosePunctuation     -> "close punctuation"
  InitialQuote         -> "initial quote"
  FinalQuote           -> "final quote"
  OtherPunctuation     -> "other punctuation"
  MathSymbol           -> "math symbol"
  CurrencySymbol       -> "currency symbol"
  ModifierSymbol       -> "modifier symbol"
  OtherSymbol          -> "other symbol"
  Space                -> "white space"
  LineSeparator        -> "line separator"
  ParagraphSeparator   -> "paragraph separator"
  Control              -> "control character"
  Format               -> "format character"
  Surrogate            -> "surrogate character"
  PrivateUse           -> "private-use Unicode character"
  NotAssigned          -> "non-assigned Unicode character"

----------------------------------------------------------------------------
-- More general parsers

-- | @char c@ parses a single character @c@.
--
-- > semicolon = char ';'

char :: (MonadParsec e s m, Token s ~ Char) => Char -> m Char
char c = token testChar (Just c)
  where
    f x = E.singleton (Tokens (x:|[]))
    testChar x =
      if x == c
        then Right x
        else Left (f x, f c, E.empty)
{-# INLINE char #-}

-- | The same as 'char' but case-insensitive. This parser returns the
-- actually parsed character preserving its case.
--
-- >>> parseTest (char' 'e') "E"
-- 'E'
-- >>> parseTest (char' 'e') "G"
-- 1:1:
-- unexpected 'G'
-- expecting 'E' or 'e'

char' :: (MonadParsec e s m, Token s ~ Char) => Char -> m Char
char' c = choice [char c, char $ swapCase c]
  where
    swapCase x
      | isUpper x = toLower x
      | isLower x = toUpper x
      | otherwise = x
{-# INLINE char' #-}

-- | This parser succeeds for any character. Returns the parsed character.

anyChar :: (MonadParsec e s m, Token s ~ Char) => m Char
anyChar = satisfy (const True) <?> "character"
{-# INLINE anyChar #-}

-- | @oneOf cs@ succeeds if the current character is in the supplied
-- collection of characters @cs@. Returns the parsed character. Note that
-- this parser cannot automatically generate the “expected” component of
-- error message, so usually you should label it manually with 'label' or
-- ('<?>').
--
-- See also: 'satisfy'.
--
-- > digit = oneOf ['0'..'9'] <?> "digit"

oneOf :: (Foldable f, MonadParsec e s m, Token s ~ Char) => f Char -> m Char
oneOf cs = satisfy (`elem` cs)
{-# INLINE oneOf #-}

-- | The same as 'oneOf', but case-insensitive. Returns the parsed character
-- preserving its case.
--
-- > vowel = oneOf' "aeiou" <?> "vowel"

oneOf' :: (Foldable f, MonadParsec e s m, Token s ~ Char) => f Char -> m Char
oneOf' cs = satisfy (`elemi` cs)
{-# INLINE oneOf' #-}

-- | As the dual of 'oneOf', @noneOf cs@ succeeds if the current character
-- /not/ in the supplied list of characters @cs@. Returns the parsed
-- character.

noneOf :: (Foldable f, MonadParsec e s m, Token s ~ Char) => f Char -> m Char
noneOf cs = satisfy (`notElem` cs)
{-# INLINE noneOf #-}

-- | The same as 'noneOf', but case-insensitive.
--
-- > consonant = noneOf' "aeiou" <?> "consonant"

noneOf' :: (Foldable f, MonadParsec e s m, Token s ~ Char) => f Char -> m Char
noneOf' cs = satisfy (`notElemi` cs)
{-# INLINE noneOf' #-}

-- | The parser @satisfy f@ succeeds for any character for which the
-- supplied function @f@ returns 'True'. Returns the character that is
-- actually parsed.
--
-- > digitChar = satisfy isDigit <?> "digit"
-- > oneOf cs  = satisfy (`elem` cs)

satisfy :: (MonadParsec e s m, Token s ~ Char) => (Char -> Bool) -> m Char
satisfy f = token testChar Nothing
  where
    testChar x =
      if f x
        then Right x
        else Left (E.singleton (Tokens (x:|[])), E.empty, E.empty)
{-# INLINE satisfy #-}

----------------------------------------------------------------------------
-- Sequence of characters

-- | @string s@ parses a sequence of characters given by @s@. Returns the
-- parsed string (i.e. @s@).
--
-- > divOrMod = string "div" <|> string "mod"

string :: (MonadParsec e s m, Token s ~ Char) => String -> m String
string = tokens (==)
{-# INLINE string #-}

-- | The same as 'string', but case-insensitive. On success returns string
-- cased as actually parsed input.
--
-- >>> parseTest (string' "foobar") "foObAr"
-- "foObAr"

string' :: (MonadParsec e s m, Token s ~ Char) => String -> m String
string' = tokens casei
{-# INLINE string' #-}

----------------------------------------------------------------------------
-- Helpers

-- | Case-insensitive equality test for characters.

casei :: Char -> Char -> Bool
casei x y = toUpper x == toUpper y
{-# INLINE casei #-}

-- | Case-insensitive 'elem'.

elemi :: Foldable f => Char -> f Char -> Bool
elemi = any . casei
{-# INLINE elemi #-}

-- | Case-insensitive 'notElem'.

notElemi :: Foldable f => Char -> f Char -> Bool
notElemi c = not . elemi c
{-# INLINE notElemi #-}