megaparsec/Text/Megaparsec/Char.hs

{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE Safe #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TypeFamilies #-}

-- |
-- Module      :  Text.Megaparsec.Char
-- Copyright   :  © 2015–present Megaparsec contributors
--                © 2007 Paolo Martini
--                © 1999–2001 Daan Leijen
-- License     :  FreeBSD
--
-- Maintainer  :  Mark Karpov <markkarpov92@gmail.com>
-- Stability   :  experimental
-- Portability :  non-portable
--
-- Commonly used character parsers.
module Text.Megaparsec.Char
  ( -- * Simple parsers
    newline,
    crlf,
    eol,
    tab,
    space,
    hspace,
    space1,
    hspace1,

    -- * Categories of characters
    controlChar,
    spaceChar,
    upperChar,
    lowerChar,
    letterChar,
    alphaNumChar,
    printChar,
    digitChar,
    binDigitChar,
    octDigitChar,
    hexDigitChar,
    markChar,
    numberChar,
    punctuationChar,
    symbolChar,
    separatorChar,
    asciiChar,
    latin1Char,
    charCategory,
    categoryName,

    -- * Single character
    char,
    char',

    -- * Sequence of characters
    string,
    string',
  )
where

import Control.Applicative
import Data.Char
import Data.Functor (void)
import Data.Proxy
import Text.Megaparsec
import Text.Megaparsec.Common

----------------------------------------------------------------------------
-- Simple parsers

-- | Parse a newline character.
newline :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
newline = char '\n'
{-# INLINE newline #-}

-- | Parse a carriage return character followed by a newline character.
-- Return the sequence of characters parsed.
crlf :: forall e s m. (MonadParsec e s m, Token s ~ Char) => m (Tokens s)
crlf = string (tokensToChunk (Proxy :: Proxy s) "\r\n")
{-# INLINE crlf #-}

-- | Parse a CRLF (see 'crlf') or LF (see 'newline') end of line. Return the
-- sequence of characters parsed.
eol :: forall e s m. (MonadParsec e s m, Token s ~ Char) => m (Tokens s)
eol =
  (tokenToChunk (Proxy :: Proxy s) <$> newline)
    <|> crlf
    <?> "end of line"
{-# INLINE eol #-}

-- | Parse a tab character.
tab :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
tab = char '\t'
{-# INLINE tab #-}

-- | Skip /zero/ or more white space characters.
--
-- See also: 'skipMany' and 'spaceChar'.
space :: (MonadParsec e s m, Token s ~ Char) => m ()
space = void $ takeWhileP (Just "white space") isSpace
{-# INLINE space #-}

-- | Like 'space', but does not accept newlines and carriage returns.
--
-- @since 9.0.0
hspace :: (MonadParsec e s m, Token s ~ Char) => m ()
hspace = void $ takeWhileP (Just "white space") isHSpace
{-# INLINE hspace #-}

-- | Skip /one/ or more white space characters.
--
-- See also: 'skipSome' and 'spaceChar'.
--
-- @since 6.0.0
space1 :: (MonadParsec e s m, Token s ~ Char) => m ()
space1 = void $ takeWhile1P (Just "white space") isSpace
{-# INLINE space1 #-}

-- | Like 'space1', but does not accept newlines and carriage returns.
--
-- @since 9.0.0
hspace1 :: (MonadParsec e s m, Token s ~ Char) => m ()
hspace1 = void $ takeWhile1P (Just "white space") isHSpace
{-# INLINE hspace1 #-}

----------------------------------------------------------------------------
-- Categories of characters

-- | Parse a control character (a non-printing character of the Latin-1
-- subset of Unicode).
controlChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
controlChar = satisfy isControl <?> "control character"
{-# INLINE controlChar #-}

-- | Parse a Unicode space character, and the control characters: tab,
-- newline, carriage return, form feed, and vertical tab.
spaceChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
spaceChar = satisfy isSpace <?> "white space"
{-# INLINE spaceChar #-}

-- | Parse an upper-case or title-case alphabetic Unicode character. Title
-- case is used by a small number of letter ligatures like the
-- single-character form of Lj.
upperChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
upperChar = satisfy isUpper <?> "uppercase letter"
{-# INLINE upperChar #-}

-- | Parse a lower-case alphabetic Unicode character.
lowerChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
lowerChar = satisfy isLower <?> "lowercase letter"
{-# INLINE lowerChar #-}

-- | Parse an alphabetic Unicode character: lower-case, upper-case, or
-- title-case letter, or a letter of case-less scripts\/modifier letter.
letterChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
letterChar = satisfy isLetter <?> "letter"
{-# INLINE letterChar #-}

-- | Parse an alphabetic or numeric digit Unicode characters.
--
-- Note that the numeric digits outside the ASCII range are parsed by this
-- parser but not by 'digitChar'. Such digits may be part of identifiers but
-- are not used by the printer and reader to represent numbers.
alphaNumChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
alphaNumChar = satisfy isAlphaNum <?> "alphanumeric character"
{-# INLINE alphaNumChar #-}

-- | Parse a printable Unicode character: letter, number, mark, punctuation,
-- symbol or space.
printChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
printChar = satisfy isPrint <?> "printable character"
{-# INLINE printChar #-}

-- | Parse an ASCII digit, i.e between “0” and “9”.
digitChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
digitChar = satisfy isDigit <?> "digit"
{-# INLINE digitChar #-}

-- | Parse a binary digit, i.e. "0" or "1".
--
-- @since 7.0.0
binDigitChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
binDigitChar = satisfy isBinDigit <?> "binary digit"
  where
    isBinDigit x = x == '0' || x == '1'
{-# INLINE binDigitChar #-}

-- | Parse an octal digit, i.e. between “0” and “7”.
octDigitChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
octDigitChar = satisfy isOctDigit <?> "octal digit"
{-# INLINE octDigitChar #-}

-- | Parse a hexadecimal digit, i.e. between “0” and “9”, or “a” and “f”, or
-- “A” and “F”.
hexDigitChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
hexDigitChar = satisfy isHexDigit <?> "hexadecimal digit"
{-# INLINE hexDigitChar #-}

-- | Parse a Unicode mark character (accents and the like), which combines
-- with preceding characters.
markChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
markChar = satisfy isMark <?> "mark character"
{-# INLINE markChar #-}

-- | Parse a Unicode numeric character, including digits from various
-- scripts, Roman numerals, etc.
numberChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
numberChar = satisfy isNumber <?> "numeric character"
{-# INLINE numberChar #-}

-- | Parse a Unicode punctuation character, including various kinds of
-- connectors, brackets and quotes.
punctuationChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
punctuationChar = satisfy isPunctuation <?> "punctuation"
{-# INLINE punctuationChar #-}

-- | Parse a Unicode symbol characters, including mathematical and currency
-- symbols.
symbolChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
symbolChar = satisfy isSymbol <?> "symbol"
{-# INLINE symbolChar #-}

-- | Parse a Unicode space and separator characters.
separatorChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
separatorChar = satisfy isSeparator <?> "separator"
{-# INLINE separatorChar #-}

-- | Parse a character from the first 128 characters of the Unicode
-- character set, corresponding to the ASCII character set.
asciiChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
asciiChar = satisfy isAscii <?> "ASCII character"
{-# INLINE asciiChar #-}

-- | Parse a character from the first 256 characters of the Unicode
-- character set, corresponding to the ISO 8859-1 (Latin-1) character set.
latin1Char :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
latin1Char = satisfy isLatin1 <?> "Latin-1 character"
{-# INLINE latin1Char #-}

-- | @'charCategory' cat@ parses character in Unicode General Category
-- @cat@, see 'Data.Char.GeneralCategory'.
charCategory ::
  (MonadParsec e s m, Token s ~ Char) =>
  GeneralCategory ->
  m (Token s)
charCategory cat = satisfy ((== cat) . generalCategory) <?> categoryName cat
{-# INLINE charCategory #-}

-- | Return the human-readable name of Unicode General Category.
categoryName :: GeneralCategory -> String
categoryName = \case
  UppercaseLetter -> "uppercase letter"
  LowercaseLetter -> "lowercase letter"
  TitlecaseLetter -> "titlecase letter"
  ModifierLetter -> "modifier letter"
  OtherLetter -> "other letter"
  NonSpacingMark -> "non-spacing mark"
  SpacingCombiningMark -> "spacing combining mark"
  EnclosingMark -> "enclosing mark"
  DecimalNumber -> "decimal number character"
  LetterNumber -> "letter number character"
  OtherNumber -> "other number character"
  ConnectorPunctuation -> "connector punctuation"
  DashPunctuation -> "dash punctuation"
  OpenPunctuation -> "open punctuation"
  ClosePunctuation -> "close punctuation"
  InitialQuote -> "initial quote"
  FinalQuote -> "final quote"
  OtherPunctuation -> "other punctuation"
  MathSymbol -> "math symbol"
  CurrencySymbol -> "currency symbol"
  ModifierSymbol -> "modifier symbol"
  OtherSymbol -> "other symbol"
  Space -> "white space"
  LineSeparator -> "line separator"
  ParagraphSeparator -> "paragraph separator"
  Control -> "control character"
  Format -> "format character"
  Surrogate -> "surrogate character"
  PrivateUse -> "private-use Unicode character"
  NotAssigned -> "non-assigned Unicode character"

----------------------------------------------------------------------------
-- Single character

-- | A type-constrained version of 'single'.
--
-- > semicolon = char ';'
char :: (MonadParsec e s m, Token s ~ Char) => Token s -> m (Token s)
char = single
{-# INLINE char #-}

-- | The same as 'char' but case-insensitive. This parser returns the
-- actually parsed character preserving its case.
--
-- >>> parseTest (char' 'e') "E"
-- 'E'
-- >>> parseTest (char' 'e') "G"
-- 1:1:
-- unexpected 'G'
-- expecting 'E' or 'e'
char' :: (MonadParsec e s m, Token s ~ Char) => Token s -> m (Token s)
char' c =
  choice
    [ char (toLower c),
      char (toUpper c),
      char (toTitle c)
    ]
{-# INLINE char' #-}

----------------------------------------------------------------------------
-- Helpers

-- | Is it a horizontal space character?
isHSpace :: Char -> Bool
isHSpace x = isSpace x && x /= '\n' && x /= '\r'