megaparsec/Text/Megaparsec/Char.hs
mlugg ccf314b0b9 Add Safe Haskell support
Remove dependence on GeneralizedNewtypeDeriving and add Safe Haskell
extensions to every file.
2020-10-17 18:53:15 +02:00

318 lines
10 KiB
Haskell
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE Safe #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TypeFamilies #-}
-- |
-- Module : Text.Megaparsec.Char
-- Copyright : © 2015present Megaparsec contributors
-- © 2007 Paolo Martini
-- © 19992001 Daan Leijen
-- License : FreeBSD
--
-- Maintainer : Mark Karpov <markkarpov92@gmail.com>
-- Stability : experimental
-- Portability : non-portable
--
-- Commonly used character parsers.
module Text.Megaparsec.Char
( -- * Simple parsers
newline,
crlf,
eol,
tab,
space,
hspace,
space1,
hspace1,
-- * Categories of characters
controlChar,
spaceChar,
upperChar,
lowerChar,
letterChar,
alphaNumChar,
printChar,
digitChar,
binDigitChar,
octDigitChar,
hexDigitChar,
markChar,
numberChar,
punctuationChar,
symbolChar,
separatorChar,
asciiChar,
latin1Char,
charCategory,
categoryName,
-- * Single character
char,
char',
-- * Sequence of characters
string,
string',
)
where
import Control.Applicative
import Data.Char
import Data.Functor (void)
import Data.Proxy
import Text.Megaparsec
import Text.Megaparsec.Common
----------------------------------------------------------------------------
-- Simple parsers
-- | Parse a newline character.
newline :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
newline = char '\n'
{-# INLINE newline #-}
-- | Parse a carriage return character followed by a newline character.
-- Return the sequence of characters parsed.
crlf :: forall e s m. (MonadParsec e s m, Token s ~ Char) => m (Tokens s)
crlf = string (tokensToChunk (Proxy :: Proxy s) "\r\n")
{-# INLINE crlf #-}
-- | Parse a CRLF (see 'crlf') or LF (see 'newline') end of line. Return the
-- sequence of characters parsed.
eol :: forall e s m. (MonadParsec e s m, Token s ~ Char) => m (Tokens s)
eol =
(tokenToChunk (Proxy :: Proxy s) <$> newline)
<|> crlf
<?> "end of line"
{-# INLINE eol #-}
-- | Parse a tab character.
tab :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
tab = char '\t'
{-# INLINE tab #-}
-- | Skip /zero/ or more white space characters.
--
-- See also: 'skipMany' and 'spaceChar'.
space :: (MonadParsec e s m, Token s ~ Char) => m ()
space = void $ takeWhileP (Just "white space") isSpace
{-# INLINE space #-}
-- | Like 'space', but does not accept newlines and carriage returns.
--
-- @since 9.0.0
hspace :: (MonadParsec e s m, Token s ~ Char) => m ()
hspace = void $ takeWhileP (Just "white space") isHSpace
{-# INLINE hspace #-}
-- | Skip /one/ or more white space characters.
--
-- See also: 'skipSome' and 'spaceChar'.
--
-- @since 6.0.0
space1 :: (MonadParsec e s m, Token s ~ Char) => m ()
space1 = void $ takeWhile1P (Just "white space") isSpace
{-# INLINE space1 #-}
-- | Like 'space1', but does not accept newlines and carriage returns.
--
-- @since 9.0.0
hspace1 :: (MonadParsec e s m, Token s ~ Char) => m ()
hspace1 = void $ takeWhile1P (Just "white space") isHSpace
{-# INLINE hspace1 #-}
----------------------------------------------------------------------------
-- Categories of characters
-- | Parse a control character (a non-printing character of the Latin-1
-- subset of Unicode).
controlChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
controlChar = satisfy isControl <?> "control character"
{-# INLINE controlChar #-}
-- | Parse a Unicode space character, and the control characters: tab,
-- newline, carriage return, form feed, and vertical tab.
spaceChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
spaceChar = satisfy isSpace <?> "white space"
{-# INLINE spaceChar #-}
-- | Parse an upper-case or title-case alphabetic Unicode character. Title
-- case is used by a small number of letter ligatures like the
-- single-character form of Lj.
upperChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
upperChar = satisfy isUpper <?> "uppercase letter"
{-# INLINE upperChar #-}
-- | Parse a lower-case alphabetic Unicode character.
lowerChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
lowerChar = satisfy isLower <?> "lowercase letter"
{-# INLINE lowerChar #-}
-- | Parse an alphabetic Unicode character: lower-case, upper-case, or
-- title-case letter, or a letter of case-less scripts\/modifier letter.
letterChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
letterChar = satisfy isLetter <?> "letter"
{-# INLINE letterChar #-}
-- | Parse an alphabetic or numeric digit Unicode characters.
--
-- Note that the numeric digits outside the ASCII range are parsed by this
-- parser but not by 'digitChar'. Such digits may be part of identifiers but
-- are not used by the printer and reader to represent numbers.
alphaNumChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
alphaNumChar = satisfy isAlphaNum <?> "alphanumeric character"
{-# INLINE alphaNumChar #-}
-- | Parse a printable Unicode character: letter, number, mark, punctuation,
-- symbol or space.
printChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
printChar = satisfy isPrint <?> "printable character"
{-# INLINE printChar #-}
-- | Parse an ASCII digit, i.e between “0” and “9”.
digitChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
digitChar = satisfy isDigit <?> "digit"
{-# INLINE digitChar #-}
-- | Parse a binary digit, i.e. "0" or "1".
--
-- @since 7.0.0
binDigitChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
binDigitChar = satisfy isBinDigit <?> "binary digit"
where
isBinDigit x = x == '0' || x == '1'
{-# INLINE binDigitChar #-}
-- | Parse an octal digit, i.e. between “0” and “7”.
octDigitChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
octDigitChar = satisfy isOctDigit <?> "octal digit"
{-# INLINE octDigitChar #-}
-- | Parse a hexadecimal digit, i.e. between “0” and “9”, or “a” and “f”, or
-- “A” and “F”.
hexDigitChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
hexDigitChar = satisfy isHexDigit <?> "hexadecimal digit"
{-# INLINE hexDigitChar #-}
-- | Parse a Unicode mark character (accents and the like), which combines
-- with preceding characters.
markChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
markChar = satisfy isMark <?> "mark character"
{-# INLINE markChar #-}
-- | Parse a Unicode numeric character, including digits from various
-- scripts, Roman numerals, etc.
numberChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
numberChar = satisfy isNumber <?> "numeric character"
{-# INLINE numberChar #-}
-- | Parse a Unicode punctuation character, including various kinds of
-- connectors, brackets and quotes.
punctuationChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
punctuationChar = satisfy isPunctuation <?> "punctuation"
{-# INLINE punctuationChar #-}
-- | Parse a Unicode symbol characters, including mathematical and currency
-- symbols.
symbolChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
symbolChar = satisfy isSymbol <?> "symbol"
{-# INLINE symbolChar #-}
-- | Parse a Unicode space and separator characters.
separatorChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
separatorChar = satisfy isSeparator <?> "separator"
{-# INLINE separatorChar #-}
-- | Parse a character from the first 128 characters of the Unicode
-- character set, corresponding to the ASCII character set.
asciiChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
asciiChar = satisfy isAscii <?> "ASCII character"
{-# INLINE asciiChar #-}
-- | Parse a character from the first 256 characters of the Unicode
-- character set, corresponding to the ISO 8859-1 (Latin-1) character set.
latin1Char :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
latin1Char = satisfy isLatin1 <?> "Latin-1 character"
{-# INLINE latin1Char #-}
-- | @'charCategory' cat@ parses character in Unicode General Category
-- @cat@, see 'Data.Char.GeneralCategory'.
charCategory ::
(MonadParsec e s m, Token s ~ Char) =>
GeneralCategory ->
m (Token s)
charCategory cat = satisfy ((== cat) . generalCategory) <?> categoryName cat
{-# INLINE charCategory #-}
-- | Return the human-readable name of Unicode General Category.
categoryName :: GeneralCategory -> String
categoryName = \case
UppercaseLetter -> "uppercase letter"
LowercaseLetter -> "lowercase letter"
TitlecaseLetter -> "titlecase letter"
ModifierLetter -> "modifier letter"
OtherLetter -> "other letter"
NonSpacingMark -> "non-spacing mark"
SpacingCombiningMark -> "spacing combining mark"
EnclosingMark -> "enclosing mark"
DecimalNumber -> "decimal number character"
LetterNumber -> "letter number character"
OtherNumber -> "other number character"
ConnectorPunctuation -> "connector punctuation"
DashPunctuation -> "dash punctuation"
OpenPunctuation -> "open punctuation"
ClosePunctuation -> "close punctuation"
InitialQuote -> "initial quote"
FinalQuote -> "final quote"
OtherPunctuation -> "other punctuation"
MathSymbol -> "math symbol"
CurrencySymbol -> "currency symbol"
ModifierSymbol -> "modifier symbol"
OtherSymbol -> "other symbol"
Space -> "white space"
LineSeparator -> "line separator"
ParagraphSeparator -> "paragraph separator"
Control -> "control character"
Format -> "format character"
Surrogate -> "surrogate character"
PrivateUse -> "private-use Unicode character"
NotAssigned -> "non-assigned Unicode character"
----------------------------------------------------------------------------
-- Single character
-- | A type-constrained version of 'single'.
--
-- > semicolon = char ';'
char :: (MonadParsec e s m, Token s ~ Char) => Token s -> m (Token s)
char = single
{-# INLINE char #-}
-- | The same as 'char' but case-insensitive. This parser returns the
-- actually parsed character preserving its case.
--
-- >>> parseTest (char' 'e') "E"
-- 'E'
-- >>> parseTest (char' 'e') "G"
-- 1:1:
-- unexpected 'G'
-- expecting 'E' or 'e'
char' :: (MonadParsec e s m, Token s ~ Char) => Token s -> m (Token s)
char' c =
choice
[ char (toLower c),
char (toUpper c),
char (toTitle c)
]
{-# INLINE char' #-}
----------------------------------------------------------------------------
-- Helpers
-- | Is it a horizontal space character?
isHSpace :: Char -> Bool
isHSpace x = isSpace x && x /= '\n' && x /= '\r'