megaparsec/Text/Megaparsec/Char.hs

-- |
-- Module      :  Text.Megaparsec.Char
-- Copyright   :  © 2015 Megaparsec contributors
--                © 2007 Paolo Martini
--                © 1999–2001 Daan Leijen
-- License     :  BSD3
--
-- Maintainer  :  Mark Karpov <markkarpov@opmbx.org>
-- Stability   :  experimental
-- Portability :  portable
--
-- Commonly used character parsers.

module Text.Megaparsec.Char
    ( newline
    , crlf
    , eol
    , tab
    , space
    , controlChar
    , spaceChar
    , upperChar
    , lowerChar
    , letterChar
    , alphaNumChar
    , printChar
    , digitChar
    , octDigitChar
    , hexDigitChar
    , markChar
    , numberChar
    , punctuationChar
    , symbolChar
    , separatorChar
    , asciiChar
    , latin1Char
    , charCategory
    , categoryName
    , char
    , anyChar
    , oneOf
    , noneOf
    , satisfy
    , string )
where

import Control.Applicative ((<|>))
import Data.Char
import Data.Maybe (fromJust)

import Text.Megaparsec.Combinator
import Text.Megaparsec.Pos
import Text.Megaparsec.Prim
import Text.Megaparsec.ShowToken

-- | Matches exactly one line feed (LF) character and returns it.

newline :: Stream s m Char => ParsecT s u m Char
newline = char lineFeed <?> "newline"
    where lineFeed = '\n'

-- | Matches a carriage return immediately followed by a line feed (CRLF).
-- The result is the sequence of characters parsed.

crlf :: Stream s m Char => ParsecT s u m String
crlf = string carriageReturnLineFeed
    where carriageReturnLineFeed = "\r\n"

-- | Matches an end of line: either a lone LF (see 'newline'), which is
-- tried first, or a CRLF pair (see 'crlf'). The result is the sequence of
-- characters parsed.
--
-- > eol = (pure <$> newline) <|> crlf

eol :: Stream s m Char => ParsecT s u m String
eol = lf <|> crlf <?> "end of line"
    where lf = fmap pure newline -- wrap the single LF into a 'String'

-- | Matches exactly one horizontal tab character and returns it.

tab :: Stream s m Char => ParsecT s u m Char
tab = char horizontalTab <?> "tab"
    where horizontalTab = '\t'

-- | Skips /zero/ or more white space characters; nothing is returned. It
-- is 'skipMany' applied to 'spaceChar'.

space :: Stream s m Char => ParsecT s u m ()
space = skipMany whiteSpace
    where whiteSpace = spaceChar

-- | Matches a control character, as decided by 'isControl': one of the
-- non-printing characters of the Latin-1 subset of Unicode.

controlChar :: Stream s m Char => ParsecT s u m Char
controlChar = satisfy isControl <?> description
    where description = "control character"

-- | Matches a single white space character, as decided by 'isSpace': a
-- Unicode space character or one of the control characters tab, newline,
-- carriage return, form feed and vertical tab.

spaceChar :: Stream s m Char => ParsecT s u m Char
spaceChar = satisfy isSpace <?> description
    where description = "white space"

-- | Matches an alphabetic Unicode character that is upper-case or
-- title-case, as decided by 'isUpper'. Title case occurs for a small
-- number of letter ligatures, such as the single-character form of Lj.

upperChar :: Stream s m Char => ParsecT s u m Char
upperChar = satisfy isUpper <?> description
    where description = "uppercase letter"

-- | Matches a lower-case alphabetic Unicode character, as decided by
-- 'isLower'.

lowerChar :: Stream s m Char => ParsecT s u m Char
lowerChar = satisfy isLower <?> description
    where description = "lowercase letter"

-- | Matches an alphabetic Unicode character, as decided by 'isLetter':
-- lower-case, upper-case and title-case letters, as well as letters of
-- case-less scripts and modifier letters.

letterChar :: Stream s m Char => ParsecT s u m Char
letterChar = satisfy isLetter <?> description
    where description = "letter"

-- | Matches an alphabetic or numeric digit Unicode character, as decided
-- by 'isAlphaNum'.
--
-- Numeric digits outside the ASCII range are accepted here but not by
-- 'digitChar'. Such digits may be part of identifiers, but the printer and
-- reader do not use them to represent numbers.

alphaNumChar :: Stream s m Char => ParsecT s u m Char
alphaNumChar = satisfy isAlphaNum <?> description
    where description = "alphanumeric character"

-- | Matches a printable Unicode character, as decided by 'isPrint':
-- letters, numbers, marks, punctuation, symbols and spaces.

printChar :: Stream s m Char => ParsecT s u m Char
printChar = satisfy isPrint <?> description
    where description = "printable character"

-- | Matches an ASCII digit, i.e. a character between “0” and “9”, as
-- decided by 'isDigit'.

digitChar :: Stream s m Char => ParsecT s u m Char
digitChar = satisfy isDigit <?> description
    where description = "digit"

-- | Matches an octal digit, i.e. a character between “0” and “7”, as
-- decided by 'isOctDigit'.

octDigitChar :: Stream s m Char => ParsecT s u m Char
octDigitChar = satisfy isOctDigit <?> description
    where description = "octal digit"

-- | Matches a hexadecimal digit, as decided by 'isHexDigit': a character
-- between “0” and “9”, between “a” and “f”, or between “A” and “F”.

hexDigitChar :: Stream s m Char => ParsecT s u m Char
hexDigitChar = satisfy isHexDigit <?> description
    where description = "hexadecimal digit"

-- | Matches a Unicode mark character, as decided by 'isMark': for example
-- an accent or a similar character that combines with the preceding
-- character.

markChar :: Stream s m Char => ParsecT s u m Char
markChar = satisfy isMark <?> description
    where description = "mark character"

-- | Matches a Unicode numeric character, as decided by 'isNumber'. This
-- includes digits from various scripts, Roman numerals, et cetera.

numberChar :: Stream s m Char => ParsecT s u m Char
numberChar = satisfy isNumber <?> description
    where description = "numeric character"

-- | Matches a Unicode punctuation character, as decided by
-- 'isPunctuation'. This includes various kinds of connectors, brackets and
-- quotes.

punctuationChar :: Stream s m Char => ParsecT s u m Char
punctuationChar = satisfy isPunctuation <?> description
    where description = "punctuation"

-- | Matches a Unicode symbol character, as decided by 'isSymbol'. This
-- includes mathematical and currency symbols.

symbolChar :: Stream s m Char => ParsecT s u m Char
symbolChar = satisfy isSymbol <?> description
    where description = "symbol"

-- | Matches a Unicode space or separator character, as decided by
-- 'isSeparator'.

separatorChar :: Stream s m Char => ParsecT s u m Char
separatorChar = satisfy isSeparator <?> description
    where description = "separator"

-- | Matches a character that belongs to the first 128 characters of the
-- Unicode character set, i.e. to the ASCII character set, as decided by
-- 'isAscii'.

asciiChar :: Stream s m Char => ParsecT s u m Char
asciiChar = satisfy isAscii <?> description
    where description = "ASCII character"

-- | Matches a character that belongs to the first 256 characters of the
-- Unicode character set, i.e. to the ISO 8859-1 (Latin-1) character set,
-- as decided by 'isLatin1'.

latin1Char :: Stream s m Char => ParsecT s u m Char
latin1Char = satisfy isLatin1 <?> description
    where description = "Latin-1 character"

-- | @charCategory cat@ matches a character whose Unicode General Category
-- (as reported by 'generalCategory') equals @cat@; see
-- 'Data.Char.GeneralCategory'. The parser is named after the category
-- using 'categoryName'.

charCategory :: Stream s m Char => GeneralCategory -> ParsecT s u m Char
charCategory cat = satisfy inCategory <?> categoryName cat
    where inCategory c = generalCategory c == cat

-- | Returns the human-readable name of a Unicode General Category, for
-- example @categoryName MathSymbol@ is @math symbol@.
--
-- The function is total: every constructor of 'GeneralCategory' is matched
-- explicitly instead of being looked up in an association list, so there
-- is no run-time failure path. With incomplete-pattern warnings enabled
-- the compiler reports any constructor that is not covered here.

categoryName :: GeneralCategory -> String
categoryName cat = case cat of
  UppercaseLetter      -> "uppercase letter"
  LowercaseLetter      -> "lowercase letter"
  TitlecaseLetter      -> "titlecase letter"
  ModifierLetter       -> "modifier letter"
  OtherLetter          -> "other letter"
  NonSpacingMark       -> "non-spacing mark"
  SpacingCombiningMark -> "spacing combining mark"
  EnclosingMark        -> "enclosing mark"
  DecimalNumber        -> "decimal number character"
  LetterNumber         -> "letter number character"
  OtherNumber          -> "other number character"
  ConnectorPunctuation -> "connector punctuation"
  DashPunctuation      -> "dash punctuation"
  OpenPunctuation      -> "open punctuation"
  ClosePunctuation     -> "close punctuation"
  InitialQuote         -> "initial quote"
  FinalQuote           -> "final quote"
  OtherPunctuation     -> "other punctuation"
  MathSymbol           -> "math symbol"
  CurrencySymbol       -> "currency symbol"
  ModifierSymbol       -> "modifier symbol"
  OtherSymbol          -> "other symbol"
  Space                -> "white space"
  LineSeparator        -> "line separator"
  ParagraphSeparator   -> "paragraph separator"
  Control              -> "control character"
  Format               -> "format character"
  Surrogate            -> "surrogate character"
  PrivateUse           -> "private-use Unicode character"
  NotAssigned          -> "non-assigned Unicode character"

-- | @char c@ matches the single character @c@ and returns it. The parser
-- is named using @showToken c@.
--
-- > semicolon = char ';'

char :: Stream s m Char => Char -> ParsecT s u m Char
char c = satisfy isWanted <?> showToken c
    where isWanted x = x == c

-- | Accepts whatever character comes next, without any restriction, and
-- returns it.

anyChar :: Stream s m Char => ParsecT s u m Char
anyChar = satisfy acceptAll <?> "character"
    where acceptAll _ = True

-- | @oneOf cs@ succeeds when the current character is an element of the
-- supplied list of characters @cs@, and returns that character. See also
-- 'satisfy'.
--
-- > vowel = oneOf "aeiou" <?> "vowel"

oneOf :: Stream s m Char => String -> ParsecT s u m Char
oneOf cs = satisfy isListed
    where isListed c = c `elem` cs

-- | The dual of 'oneOf': @noneOf cs@ succeeds when the current character
-- is /not/ an element of the supplied list of characters @cs@, and returns
-- that character.
--
-- > consonant = noneOf "aeiou" <?> "consonant"

noneOf :: Stream s m Char => String -> ParsecT s u m Char
noneOf cs = satisfy isUnlisted
    where isUnlisted c = c `notElem` cs

-- | @satisfy f@ succeeds for any character on which the predicate @f@
-- returns 'True', and returns the character that was actually parsed. The
-- source position is advanced with 'updatePosChar'.
--
-- > digit    = satisfy isDigit
-- > oneOf cs = satisfy (`elem` cs)

satisfy :: Stream s m Char => (Char -> Bool) -> ParsecT s u m Char
satisfy f = tokenPrim advance accept
    where advance pos c _ = updatePosChar pos c
          accept c
              | f c       = Just c
              | otherwise = Nothing

-- | @string s@ matches the exact sequence of characters @s@ and returns
-- the parsed string (i.e. @s@). It is 'tokens' specialised with
-- 'updatePosString' for position tracking.
--
-- > divOrMod = string "div" <|> string "mod"

string :: Stream s m Char => String -> ParsecT s u m String
string s = tokens updatePosString s
-												refactoring, phase 1

											
										
										
											2015-07-28 16:32:19 +03:00
+								-- |
-												renamed ‘MegaParsec’ → ‘Megaparsec’, close #10

											
										
										
											2015-08-01 19:24:45 +03:00
+								-- Module      :  Text.Megaparsec.Char
 								-- Copyright   :  © 2015 Megaparsec contributors
-												cosmetic changes in copyright (headers)

											
										
										
											2015-07-30 19:20:37 +03:00
+								--                © 2007 Paolo Martini
 								--                © 1999–2001 Daan Leijen
-												refactoring, phase 1

											
										
										
											2015-07-28 16:32:19 +03:00
+								-- License     :  BSD3
 								--
 								-- Maintainer  :  Mark Karpov <markkarpov@opmbx.org>
-												refactoring, phase 2

											
										
										
											2015-07-29 11:38:32 +03:00
+								-- Stability   :  experimental
-												refactoring, phase 1

											
										
										
											2015-07-28 16:32:19 +03:00
+								-- Portability :  portable
 								--
 								-- Commonly used character parsers.
-												renamed ‘MegaParsec’ → ‘Megaparsec’, close #10

											
										
										
											2015-08-01 19:24:45 +03:00
+								module Text.Megaparsec.Char
-												extend collection of character parsers, close #16

Added new character parsers in ‘Text.Megaparsec.Char’:

* ‘controlChar’
* ‘printChar’
* ‘markChar’
* ‘numberChar’
* ‘punctuationChar’
* ‘symbolChar’
* ‘separatorChar’
* ‘asciiChar’
* ‘latin1Char’
* ‘charCategory’

Renamed some parsers:

‘spaces’   → ‘space’
‘space’    → ‘spaceChar’
‘lower’    → ‘lowerChar’
‘upper’    → ‘upperChar’
‘letter’   → ‘letterChar’
‘alphaNum’ → ‘alphaNumChar’
‘digit’    → ‘digitChar’
‘octDigit’ → ‘octDigitChar’
‘hexDigit’ → ‘hexDigitChar’

Descriptions of old parsers have been updated to accent some
Unicode-specific moments. For example, old description of ‘letter’
stated that it parses letters from “a” to “z” and from “A” to “Z”. This
is wrong, since it used ‘Data.Char.isAlpha’ predicate internally and
thus parsed many more characters.

											
										
										
											2015-08-12 20:00:03 +03:00
+								    ( newline
-												refactoring, phase 1

											
										
										
											2015-07-28 16:32:19 +03:00
+								    , crlf
-												further cosmetic changes in ‘Text.Megaparsec.Char’

											
										
										
											2015-08-08 18:17:27 +03:00
+								    , eol
-												refactoring, phase 1

											
										
										
											2015-07-28 16:32:19 +03:00
+								    , tab
-												extend collection of character parsers, close #16

Added new character parsers in ‘Text.Megaparsec.Char’:

* ‘controlChar’
* ‘printChar’
* ‘markChar’
* ‘numberChar’
* ‘punctuationChar’
* ‘symbolChar’
* ‘separatorChar’
* ‘asciiChar’
* ‘latin1Char’
* ‘charCategory’

Renamed some parsers:

‘spaces’   → ‘space’
‘space’    → ‘spaceChar’
‘lower’    → ‘lowerChar’
‘upper’    → ‘upperChar’
‘letter’   → ‘letterChar’
‘alphaNum’ → ‘alphaNumChar’
‘digit’    → ‘digitChar’
‘octDigit’ → ‘octDigitChar’
‘hexDigit’ → ‘hexDigitChar’

Descriptions of old parsers have been updated to accent some
Unicode-specific moments. For example, old description of ‘letter’
stated that it parses letters from “a” to “z” and from “A” to “Z”. This
is wrong, since it used ‘Data.Char.isAlpha’ predicate internally and
thus parsed many more characters.

											
										
										
											2015-08-12 20:00:03 +03:00
+								    , space
 								    , controlChar
 								    , spaceChar
 								    , upperChar
 								    , lowerChar
 								    , letterChar
 								    , alphaNumChar
 								    , printChar
 								    , digitChar
 								    , octDigitChar
 								    , hexDigitChar
 								    , markChar
 								    , numberChar
 								    , punctuationChar
 								    , symbolChar
 								    , separatorChar
 								    , asciiChar
 								    , latin1Char
 								    , charCategory
 								    , categoryName
-												refactoring, phase 1

											
										
										
											2015-07-28 16:32:19 +03:00
+								    , char
 								    , anyChar
-												extend collection of character parsers, close #16

Added new character parsers in ‘Text.Megaparsec.Char’:

* ‘controlChar’
* ‘printChar’
* ‘markChar’
* ‘numberChar’
* ‘punctuationChar’
* ‘symbolChar’
* ‘separatorChar’
* ‘asciiChar’
* ‘latin1Char’
* ‘charCategory’

Renamed some parsers:

‘spaces’   → ‘space’
‘space’    → ‘spaceChar’
‘lower’    → ‘lowerChar’
‘upper’    → ‘upperChar’
‘letter’   → ‘letterChar’
‘alphaNum’ → ‘alphaNumChar’
‘digit’    → ‘digitChar’
‘octDigit’ → ‘octDigitChar’
‘hexDigit’ → ‘hexDigitChar’

Descriptions of old parsers have been updated to accent some
Unicode-specific moments. For example, old description of ‘letter’
stated that it parses letters from “a” to “z” and from “A” to “Z”. This
is wrong, since it used ‘Data.Char.isAlpha’ predicate internally and
thus parsed many more characters.

											
										
										
											2015-08-12 20:00:03 +03:00
+								    , oneOf
 								    , noneOf
-												refactoring, phase 1

											
										
										
											2015-07-28 16:32:19 +03:00
+								    , satisfy
 								    , string )
 								where
-												re-export ‘(<|>)’, ‘many’, ‘some’, and ‘optional’, fixes #9

These functions are now re-exported from ‘Control.Applicative’
module. ‘many’ and ‘some’ are now part of ‘Alternative’ instance of
‘ParsecT’.

Note that these functions are re-exported only in ‘Text.MegaParsec’
module, but not in ‘Text.MegaParsec.Prim’ to avoid duplication of
floating doc-strings. Others internal modules now just casually import
‘Control.Applicative’ for their needs.

Note that ‘many1’ was renamed to ‘some’, the same is done for other
parsers that had ‘many1’ part in their names (for consistency).

											
										
										
											2015-08-01 17:39:20 +03:00
+								import Control.Applicative ((<|>))
-												refactoring, phase 1

											
										
										
											2015-07-28 16:32:19 +03:00
+								import Data.Char
-												extend collection of character parsers, close #16

Added new character parsers in ‘Text.Megaparsec.Char’:

* ‘controlChar’
* ‘printChar’
* ‘markChar’
* ‘numberChar’
* ‘punctuationChar’
* ‘symbolChar’
* ‘separatorChar’
* ‘asciiChar’
* ‘latin1Char’
* ‘charCategory’

Renamed some parsers:

‘spaces’   → ‘space’
‘space’    → ‘spaceChar’
‘lower’    → ‘lowerChar’
‘upper’    → ‘upperChar’
‘letter’   → ‘letterChar’
‘alphaNum’ → ‘alphaNumChar’
‘digit’    → ‘digitChar’
‘octDigit’ → ‘octDigitChar’
‘hexDigit’ → ‘hexDigitChar’

Descriptions of old parsers have been updated to accent some
Unicode-specific moments. For example, old description of ‘letter’
stated that it parses letters from “a” to “z” and from “A” to “Z”. This
is wrong, since it used ‘Data.Char.isAlpha’ predicate internally and
thus parsed many more characters.

											
										
										
											2015-08-12 20:00:03 +03:00
+								import Data.Maybe (fromJust)
-												refactoring, phase 1

											
										
										
											2015-07-28 16:32:19 +03:00
-												various minor changes

											
										
										
											2015-08-12 15:41:22 +03:00
+								import Text.Megaparsec.Combinator
-												renamed ‘MegaParsec’ → ‘Megaparsec’, close #10

											
										
										
											2015-08-01 19:24:45 +03:00
+								import Text.Megaparsec.Pos
 								import Text.Megaparsec.Prim
-												further cosmetic changes in ‘Text.Megaparsec.Char’

											
										
										
											2015-08-08 18:17:27 +03:00
+								import Text.Megaparsec.ShowToken
-												refactoring, phase 1

											
										
										
											2015-07-28 16:32:19 +03:00
-												various minor changes

											
										
										
											2015-08-12 15:41:22 +03:00
+								-- | Parses a newline character.
-												refactoring, phase 1

											
										
										
											2015-07-28 16:32:19 +03:00
 								newline :: Stream s m Char => ParsecT s u m Char
-												further cosmetic changes in ‘Text.Megaparsec.Char’

											
										
										
											2015-08-08 18:17:27 +03:00
+								newline = char '\n' <?> "newline"
-												refactoring, phase 1

											
										
										
											2015-07-28 16:32:19 +03:00
-												various minor changes

											
										
										
											2015-08-12 15:41:22 +03:00
+								-- | Parses a carriage return character followed by a newline
 								-- character. Returns sequence of characters parsed.
-												refactoring, phase 1

											
										
										
											2015-07-28 16:32:19 +03:00
-												improve quality of error messages, fixed #13

											
										
										
											2015-08-11 00:21:52 +03:00
+								crlf :: Stream s m Char => ParsecT s u m String
 								crlf = string "\r\n"
-												refactoring, phase 1

											
										
										
											2015-07-28 16:32:19 +03:00
-												various minor changes

											
										
										
											2015-08-12 15:41:22 +03:00
+								-- | Parses a CRLF (see 'crlf') or LF (see 'newline') end of line.
 								-- Returns the sequence of characters parsed.
-												refactoring, phase 1

											
										
										
											2015-07-28 16:32:19 +03:00
+								--
-												various minor changes

											
										
										
											2015-08-12 15:41:22 +03:00
+								-- > eol = (pure <$> newline) <|> crlf
-												refactoring, phase 1

											
										
										
											2015-07-28 16:32:19 +03:00
-												improve quality of error messages, fixed #13

											
										
										
											2015-08-11 00:21:52 +03:00
+								eol :: Stream s m Char => ParsecT s u m String
 								eol = (pure <$> newline) <|> crlf <?> "end of line"
-												refactoring, phase 1

											
										
										
											2015-07-28 16:32:19 +03:00
-												various minor changes

											
										
										
											2015-08-12 15:41:22 +03:00
+								-- | Parses a tab character.
-												refactoring, phase 1

											
										
										
											2015-07-28 16:32:19 +03:00
 								tab :: Stream s m Char => ParsecT s u m Char
 								tab = char '\t' <?> "tab"
-												extend collection of character parsers, close #16

Added new character parsers in ‘Text.Megaparsec.Char’:

* ‘controlChar’
* ‘printChar’
* ‘markChar’
* ‘numberChar’
* ‘punctuationChar’
* ‘symbolChar’
* ‘separatorChar’
* ‘asciiChar’
* ‘latin1Char’
* ‘charCategory’

Renamed some parsers:

‘spaces’   → ‘space’
‘space’    → ‘spaceChar’
‘lower’    → ‘lowerChar’
‘upper’    → ‘upperChar’
‘letter’   → ‘letterChar’
‘alphaNum’ → ‘alphaNumChar’
‘digit’    → ‘digitChar’
‘octDigit’ → ‘octDigitChar’
‘hexDigit’ → ‘hexDigitChar’

Descriptions of old parsers have been updated to accent some
Unicode-specific moments. For example, old description of ‘letter’
stated that it parses letters from “a” to “z” and from “A” to “Z”. This
is wrong, since it used ‘Data.Char.isAlpha’ predicate internally and
thus parsed many more characters.

											
										
										
											2015-08-12 20:00:03 +03:00
+								-- | Skips /zero/ or more white space characters. See also 'skipMany' and
 								-- 'spaceChar'.
 								space :: Stream s m Char => ParsecT s u m ()
 								space = skipMany spaceChar
-												various minor changes

											
										
										
											2015-08-12 15:41:22 +03:00
-												extend collection of character parsers, close #16

Added new character parsers in ‘Text.Megaparsec.Char’:

* ‘controlChar’
* ‘printChar’
* ‘markChar’
* ‘numberChar’
* ‘punctuationChar’
* ‘symbolChar’
* ‘separatorChar’
* ‘asciiChar’
* ‘latin1Char’
* ‘charCategory’

Renamed some parsers:

‘spaces’   → ‘space’
‘space’    → ‘spaceChar’
‘lower’    → ‘lowerChar’
‘upper’    → ‘upperChar’
‘letter’   → ‘letterChar’
‘alphaNum’ → ‘alphaNumChar’
‘digit’    → ‘digitChar’
‘octDigit’ → ‘octDigitChar’
‘hexDigit’ → ‘hexDigitChar’

Descriptions of old parsers have been updated to accent some
Unicode-specific moments. For example, old description of ‘letter’
stated that it parses letters from “a” to “z” and from “A” to “Z”. This
is wrong, since it used ‘Data.Char.isAlpha’ predicate internally and
thus parsed many more characters.

											
										
										
											2015-08-12 20:00:03 +03:00
+								-- | Parses control characters, which are the non-printing characters of the
 								-- Latin-1 subset of Unicode.
-												various minor changes

											
										
										
											2015-08-12 15:41:22 +03:00
-												extend collection of character parsers, close #16

Added new character parsers in ‘Text.Megaparsec.Char’:

* ‘controlChar’
* ‘printChar’
* ‘markChar’
* ‘numberChar’
* ‘punctuationChar’
* ‘symbolChar’
* ‘separatorChar’
* ‘asciiChar’
* ‘latin1Char’
* ‘charCategory’

Renamed some parsers:

‘spaces’   → ‘space’
‘space’    → ‘spaceChar’
‘lower’    → ‘lowerChar’
‘upper’    → ‘upperChar’
‘letter’   → ‘letterChar’
‘alphaNum’ → ‘alphaNumChar’
‘digit’    → ‘digitChar’
‘octDigit’ → ‘octDigitChar’
‘hexDigit’ → ‘hexDigitChar’

Descriptions of old parsers have been updated to accent some
Unicode-specific moments. For example, old description of ‘letter’
stated that it parses letters from “a” to “z” and from “A” to “Z”. This
is wrong, since it used ‘Data.Char.isAlpha’ predicate internally and
thus parsed many more characters.

											
										
										
											2015-08-12 20:00:03 +03:00
+								controlChar :: Stream s m Char => ParsecT s u m Char
 								controlChar = satisfy isControl <?> "control character"
-												refactoring, phase 1

											
										
										
											2015-07-28 16:32:19 +03:00
-												extend collection of character parsers, close #16

Added new character parsers in ‘Text.Megaparsec.Char’:

* ‘controlChar’
* ‘printChar’
* ‘markChar’
* ‘numberChar’
* ‘punctuationChar’
* ‘symbolChar’
* ‘separatorChar’
* ‘asciiChar’
* ‘latin1Char’
* ‘charCategory’

Renamed some parsers:

‘spaces’   → ‘space’
‘space’    → ‘spaceChar’
‘lower’    → ‘lowerChar’
‘upper’    → ‘upperChar’
‘letter’   → ‘letterChar’
‘alphaNum’ → ‘alphaNumChar’
‘digit’    → ‘digitChar’
‘octDigit’ → ‘octDigitChar’
‘hexDigit’ → ‘hexDigitChar’

Descriptions of old parsers have been updated to accent some
Unicode-specific moments. For example, old description of ‘letter’
stated that it parses letters from “a” to “z” and from “A” to “Z”. This
is wrong, since it used ‘Data.Char.isAlpha’ predicate internally and
thus parsed many more characters.

											
										
										
											2015-08-12 20:00:03 +03:00
+								-- | Parses a Unicode space character, and the control characters: tab,
 								-- newline, carriage return, form feed, and vertical tab.
 								spaceChar :: Stream s m Char => ParsecT s u m Char
 								spaceChar = satisfy isSpace <?> "white space"
 								-- | Parses an upper-case or title-case alphabetic Unicode character. Title
 								-- case is used by a small number of letter ligatures like the
 								-- single-character form of Lj.
 								upperChar :: Stream s m Char => ParsecT s u m Char
 								upperChar = satisfy isUpper <?> "uppercase letter"
 								-- | Parses a lower-case alphabetic Unicode character.
 								lowerChar :: Stream s m Char => ParsecT s u m Char
 								lowerChar = satisfy isLower <?> "lowercase letter"
 								-- | Parses alphabetic Unicode characters: lower-case, upper-case and
 								-- title-case letters, plus letters of case-less scripts and modifiers
 								-- letters.
 								letterChar :: Stream s m Char => ParsecT s u m Char
 								letterChar = satisfy isLetter <?> "letter"
 								-- | Parses alphabetic or numeric digit Unicode characters.
 								--
 								-- Note that numeric digits outside the ASCII range are parsed by this
 								-- parser but not by 'digitChar'. Such digits may be part of identifiers but
 								-- are not used by the printer and reader to represent numbers.
-												refactoring, phase 1

											
										
										
											2015-07-28 16:32:19 +03:00
-												extend collection of character parsers, close #16

Added new character parsers in ‘Text.Megaparsec.Char’:

* ‘controlChar’
* ‘printChar’
* ‘markChar’
* ‘numberChar’
* ‘punctuationChar’
* ‘symbolChar’
* ‘separatorChar’
* ‘asciiChar’
* ‘latin1Char’
* ‘charCategory’

Renamed some parsers:

‘spaces’   → ‘space’
‘space’    → ‘spaceChar’
‘lower’    → ‘lowerChar’
‘upper’    → ‘upperChar’
‘letter’   → ‘letterChar’
‘alphaNum’ → ‘alphaNumChar’
‘digit’    → ‘digitChar’
‘octDigit’ → ‘octDigitChar’
‘hexDigit’ → ‘hexDigitChar’

Descriptions of old parsers have been updated to accent some
Unicode-specific moments. For example, old description of ‘letter’
stated that it parses letters from “a” to “z” and from “A” to “Z”. This
is wrong, since it used ‘Data.Char.isAlpha’ predicate internally and
thus parsed many more characters.

											
										
										
											2015-08-12 20:00:03 +03:00
+								alphaNumChar :: Stream s m Char => ParsecT s u m Char
 								alphaNumChar = satisfy isAlphaNum <?> "alphanumeric character"
-												refactoring, phase 1

											
										
										
											2015-07-28 16:32:19 +03:00
-												extend collection of character parsers, close #16

Added new character parsers in ‘Text.Megaparsec.Char’:

* ‘controlChar’
* ‘printChar’
* ‘markChar’
* ‘numberChar’
* ‘punctuationChar’
* ‘symbolChar’
* ‘separatorChar’
* ‘asciiChar’
* ‘latin1Char’
* ‘charCategory’

Renamed some parsers:

‘spaces’   → ‘space’
‘space’    → ‘spaceChar’
‘lower’    → ‘lowerChar’
‘upper’    → ‘upperChar’
‘letter’   → ‘letterChar’
‘alphaNum’ → ‘alphaNumChar’
‘digit’    → ‘digitChar’
‘octDigit’ → ‘octDigitChar’
‘hexDigit’ → ‘hexDigitChar’

Descriptions of old parsers have been updated to accent some
Unicode-specific moments. For example, old description of ‘letter’
stated that it parses letters from “a” to “z” and from “A” to “Z”. This
is wrong, since it used ‘Data.Char.isAlpha’ predicate internally and
thus parsed many more characters.

											
										
										
											2015-08-12 20:00:03 +03:00
+								-- | Parses printable Unicode characters: letters, numbers, marks,
 								-- punctuation, symbols and spaces.
-												refactoring, phase 1

											
										
										
											2015-07-28 16:32:19 +03:00
-												extend collection of character parsers, close #16

Added new character parsers in ‘Text.Megaparsec.Char’:

* ‘controlChar’
* ‘printChar’
* ‘markChar’
* ‘numberChar’
* ‘punctuationChar’
* ‘symbolChar’
* ‘separatorChar’
* ‘asciiChar’
* ‘latin1Char’
* ‘charCategory’

Renamed some parsers:

‘spaces’   → ‘space’
‘space’    → ‘spaceChar’
‘lower’    → ‘lowerChar’
‘upper’    → ‘upperChar’
‘letter’   → ‘letterChar’
‘alphaNum’ → ‘alphaNumChar’
‘digit’    → ‘digitChar’
‘octDigit’ → ‘octDigitChar’
‘hexDigit’ → ‘hexDigitChar’

Descriptions of old parsers have been updated to accent some
Unicode-specific moments. For example, old description of ‘letter’
stated that it parses letters from “a” to “z” and from “A” to “Z”. This
is wrong, since it used ‘Data.Char.isAlpha’ predicate internally and
thus parsed many more characters.

											
										
										
											2015-08-12 20:00:03 +03:00
+								printChar :: Stream s m Char => ParsecT s u m Char
 								printChar = satisfy isPrint <?> "printable character"
-												refactoring, phase 1

											
										
										
											2015-07-28 16:32:19 +03:00
-												extend collection of character parsers, close #16

Added new character parsers in ‘Text.Megaparsec.Char’:

* ‘controlChar’
* ‘printChar’
* ‘markChar’
* ‘numberChar’
* ‘punctuationChar’
* ‘symbolChar’
* ‘separatorChar’
* ‘asciiChar’
* ‘latin1Char’
* ‘charCategory’

Renamed some parsers:

‘spaces’   → ‘space’
‘space’    → ‘spaceChar’
‘lower’    → ‘lowerChar’
‘upper’    → ‘upperChar’
‘letter’   → ‘letterChar’
‘alphaNum’ → ‘alphaNumChar’
‘digit’    → ‘digitChar’
‘octDigit’ → ‘octDigitChar’
‘hexDigit’ → ‘hexDigitChar’

Descriptions of old parsers have been updated to accent some
Unicode-specific moments. For example, old description of ‘letter’
stated that it parses letters from “a” to “z” and from “A” to “Z”. This
is wrong, since it used ‘Data.Char.isAlpha’ predicate internally and
thus parsed many more characters.

											
										
										
											2015-08-12 20:00:03 +03:00
+								-- | Parses an ASCII digit, i.e between “0” and “9”.
-												refactoring, phase 1

											
										
										
											2015-07-28 16:32:19 +03:00
-												extend collection of character parsers, close #16

Added new character parsers in ‘Text.Megaparsec.Char’:

* ‘controlChar’
* ‘printChar’
* ‘markChar’
* ‘numberChar’
* ‘punctuationChar’
* ‘symbolChar’
* ‘separatorChar’
* ‘asciiChar’
* ‘latin1Char’
* ‘charCategory’

Renamed some parsers:

‘spaces’   → ‘space’
‘space’    → ‘spaceChar’
‘lower’    → ‘lowerChar’
‘upper’    → ‘upperChar’
‘letter’   → ‘letterChar’
‘alphaNum’ → ‘alphaNumChar’
‘digit’    → ‘digitChar’
‘octDigit’ → ‘octDigitChar’
‘hexDigit’ → ‘hexDigitChar’

Descriptions of old parsers have been updated to accent some
Unicode-specific moments. For example, old description of ‘letter’
stated that it parses letters from “a” to “z” and from “A” to “Z”. This
is wrong, since it used ‘Data.Char.isAlpha’ predicate internally and
thus parsed many more characters.

											
										
										
											2015-08-12 20:00:03 +03:00
+								digitChar :: Stream s m Char => ParsecT s u m Char
 								digitChar = satisfy isDigit <?> "digit"
-												refactoring, phase 1

											
										
										
											2015-07-28 16:32:19 +03:00
-												extend collection of character parsers, close #16

Added new character parsers in ‘Text.Megaparsec.Char’:

* ‘controlChar’
* ‘printChar’
* ‘markChar’
* ‘numberChar’
* ‘punctuationChar’
* ‘symbolChar’
* ‘separatorChar’
* ‘asciiChar’
* ‘latin1Char’
* ‘charCategory’

Renamed some parsers:

‘spaces’   → ‘space’
‘space’    → ‘spaceChar’
‘lower’    → ‘lowerChar’
‘upper’    → ‘upperChar’
‘letter’   → ‘letterChar’
‘alphaNum’ → ‘alphaNumChar’
‘digit’    → ‘digitChar’
‘octDigit’ → ‘octDigitChar’
‘hexDigit’ → ‘hexDigitChar’

Descriptions of old parsers have been updated to accent some
Unicode-specific moments. For example, old description of ‘letter’
stated that it parses letters from “a” to “z” and from “A” to “Z”. This
is wrong, since it used ‘Data.Char.isAlpha’ predicate internally and
thus parsed many more characters.

											
										
										
											2015-08-12 20:00:03 +03:00
+								-- | Parses an octal digit, i.e. between “0” and “7”.
-												refactoring, phase 1

											
										
										
											2015-07-28 16:32:19 +03:00
-												extend collection of character parsers, close #16

Added new character parsers in ‘Text.Megaparsec.Char’:

* ‘controlChar’
* ‘printChar’
* ‘markChar’
* ‘numberChar’
* ‘punctuationChar’
* ‘symbolChar’
* ‘separatorChar’
* ‘asciiChar’
* ‘latin1Char’
* ‘charCategory’

Renamed some parsers:

‘spaces’   → ‘space’
‘space’    → ‘spaceChar’
‘lower’    → ‘lowerChar’
‘upper’    → ‘upperChar’
‘letter’   → ‘letterChar’
‘alphaNum’ → ‘alphaNumChar’
‘digit’    → ‘digitChar’
‘octDigit’ → ‘octDigitChar’
‘hexDigit’ → ‘hexDigitChar’

Descriptions of old parsers have been updated to highlight some
Unicode-specific details. For example, the old description of ‘letter’
stated that it parses letters from “a” to “z” and from “A” to “Z”. That
was wrong, since it used the ‘Data.Char.isAlpha’ predicate internally and
thus parsed many more characters.

											
										
										
											2015-08-12 20:00:03 +03:00
+								octDigitChar :: Stream s m Char => ParsecT s u m Char
 								octDigitChar = octalDigit <?> "octal digit"
 								  where octalDigit = satisfy isOctDigit
-												refactoring, phase 1

											
										
										
											2015-07-28 16:32:19 +03:00
-												extend collection of character parsers, close #16

Added new character parsers in ‘Text.Megaparsec.Char’:

* ‘controlChar’
* ‘printChar’
* ‘markChar’
* ‘numberChar’
* ‘punctuationChar’
* ‘symbolChar’
* ‘separatorChar’
* ‘asciiChar’
* ‘latin1Char’
* ‘charCategory’

Renamed some parsers:

‘spaces’   → ‘space’
‘space’    → ‘spaceChar’
‘lower’    → ‘lowerChar’
‘upper’    → ‘upperChar’
‘letter’   → ‘letterChar’
‘alphaNum’ → ‘alphaNumChar’
‘digit’    → ‘digitChar’
‘octDigit’ → ‘octDigitChar’
‘hexDigit’ → ‘hexDigitChar’

Descriptions of old parsers have been updated to highlight some
Unicode-specific details. For example, the old description of ‘letter’
stated that it parses letters from “a” to “z” and from “A” to “Z”. That
was wrong, since it used the ‘Data.Char.isAlpha’ predicate internally and
thus parsed many more characters.

											
										
										
											2015-08-12 20:00:03 +03:00
+								-- | Parses a hexadecimal digit, i.e. between “0” and “9”, or “a” and “f”,
 								-- or “A” and “F”.
-												refactoring, phase 1

											
										
										
											2015-07-28 16:32:19 +03:00
-												extend collection of character parsers, close #16

Added new character parsers in ‘Text.Megaparsec.Char’:

* ‘controlChar’
* ‘printChar’
* ‘markChar’
* ‘numberChar’
* ‘punctuationChar’
* ‘symbolChar’
* ‘separatorChar’
* ‘asciiChar’
* ‘latin1Char’
* ‘charCategory’

Renamed some parsers:

‘spaces’   → ‘space’
‘space’    → ‘spaceChar’
‘lower’    → ‘lowerChar’
‘upper’    → ‘upperChar’
‘letter’   → ‘letterChar’
‘alphaNum’ → ‘alphaNumChar’
‘digit’    → ‘digitChar’
‘octDigit’ → ‘octDigitChar’
‘hexDigit’ → ‘hexDigitChar’

Descriptions of old parsers have been updated to highlight some
Unicode-specific details. For example, the old description of ‘letter’
stated that it parses letters from “a” to “z” and from “A” to “Z”. That
was wrong, since it used the ‘Data.Char.isAlpha’ predicate internally and
thus parsed many more characters.

											
										
										
											2015-08-12 20:00:03 +03:00
+								hexDigitChar :: Stream s m Char => ParsecT s u m Char
 								hexDigitChar = hexDigit <?> "hexadecimal digit"
 								  where hexDigit = satisfy isHexDigit
-												various minor changes

											
										
										
											2015-08-12 15:41:22 +03:00
-												extend collection of character parsers, close #16

Added new character parsers in ‘Text.Megaparsec.Char’:

* ‘controlChar’
* ‘printChar’
* ‘markChar’
* ‘numberChar’
* ‘punctuationChar’
* ‘symbolChar’
* ‘separatorChar’
* ‘asciiChar’
* ‘latin1Char’
* ‘charCategory’

Renamed some parsers:

‘spaces’   → ‘space’
‘space’    → ‘spaceChar’
‘lower’    → ‘lowerChar’
‘upper’    → ‘upperChar’
‘letter’   → ‘letterChar’
‘alphaNum’ → ‘alphaNumChar’
‘digit’    → ‘digitChar’
‘octDigit’ → ‘octDigitChar’
‘hexDigit’ → ‘hexDigitChar’

Descriptions of old parsers have been updated to highlight some
Unicode-specific details. For example, the old description of ‘letter’
stated that it parses letters from “a” to “z” and from “A” to “Z”. That
was wrong, since it used the ‘Data.Char.isAlpha’ predicate internally and
thus parsed many more characters.

											
										
										
											2015-08-12 20:00:03 +03:00
+								-- | Parses a Unicode mark character, i.e. a character (an accent, for
 								-- instance) that combines with the character preceding it.
 								markChar :: Stream s m Char => ParsecT s u m Char
 								markChar = combining <?> "mark character"
 								  where combining = satisfy isMark
 								-- | Parses a Unicode numeric character: a digit from any of various
 								-- scripts, a Roman numeral, and so on.
 								numberChar :: Stream s m Char => ParsecT s u m Char
 								numberChar =
 								  let numeric = satisfy isNumber
 								  in numeric <?> "numeric character"
 								-- | Parses a Unicode punctuation character, e.g. one of the various kinds
 								-- of connectors, brackets or quotes.
 								punctuationChar :: Stream s m Char => ParsecT s u m Char
 								punctuationChar = punctuation <?> "punctuation"
 								  where punctuation = satisfy isPunctuation
 								-- | Parses a Unicode symbol character, such as a mathematical or currency
 								-- symbol.
 								symbolChar :: Stream s m Char => ParsecT s u m Char
 								symbolChar =
 								  let symbol = satisfy isSymbol
 								  in symbol <?> "symbol"
 								-- | Parses a Unicode space or separator character.
 								separatorChar :: Stream s m Char => ParsecT s u m Char
 								separatorChar = separator <?> "separator"
 								  where separator = satisfy isSeparator
 								-- | Parses a character belonging to the ASCII character set, that is, one
 								-- of the first 128 characters of the Unicode character set.
 								asciiChar :: Stream s m Char => ParsecT s u m Char
 								asciiChar =
 								  let ascii = satisfy isAscii
 								  in ascii <?> "ASCII character"
 								-- | Parses a character belonging to the ISO 8859-1 (Latin-1) character
 								-- set, that is, one of the first 256 characters of the Unicode character
 								-- set.
 								latin1Char :: Stream s m Char => ParsecT s u m Char
 								latin1Char = latin1 <?> "Latin-1 character"
 								  where latin1 = satisfy isLatin1
 								-- | @charCategory cat@ parses a character whose Unicode General Category
 								-- is @cat@, see 'Data.Char.GeneralCategory'. The parser is labelled with
 								-- the result of @categoryName cat@.
 								charCategory :: Stream s m Char => GeneralCategory -> ParsecT s u m Char
 								charCategory cat = satisfy inCategory <?> categoryName cat
 								  where inCategory c = generalCategory c == cat
 								-- | Returns the human-readable name of a Unicode General Category.
 								--
 								-- Every constructor of 'GeneralCategory' is matched explicitly, so the
 								-- function is total: there is no partial lookup that could fail at run
 								-- time, and the compiler can check the match for exhaustiveness.
 								categoryName :: GeneralCategory -> String
 								categoryName cat = case cat of
 								  UppercaseLetter      -> "uppercase letter"
 								  LowercaseLetter      -> "lowercase letter"
 								  TitlecaseLetter      -> "titlecase letter"
 								  ModifierLetter       -> "modifier letter"
 								  OtherLetter          -> "other letter"
 								  NonSpacingMark       -> "non-spacing mark"
 								  SpacingCombiningMark -> "spacing combining mark"
 								  EnclosingMark        -> "enclosing mark"
 								  DecimalNumber        -> "decimal number character"
 								  LetterNumber         -> "letter number character"
 								  OtherNumber          -> "other number character"
 								  ConnectorPunctuation -> "connector punctuation"
 								  DashPunctuation      -> "dash punctuation"
 								  OpenPunctuation      -> "open punctuation"
 								  ClosePunctuation     -> "close punctuation"
 								  InitialQuote         -> "initial quote"
 								  FinalQuote           -> "final quote"
 								  OtherPunctuation     -> "other punctuation"
 								  MathSymbol           -> "math symbol"
 								  CurrencySymbol       -> "currency symbol"
 								  ModifierSymbol       -> "modifier symbol"
 								  OtherSymbol          -> "other symbol"
 								  Space                -> "white space"
 								  LineSeparator        -> "line separator"
 								  ParagraphSeparator   -> "paragraph separator"
 								  Control              -> "control character"
 								  Format               -> "format character"
 								  Surrogate            -> "surrogate character"
 								  PrivateUse           -> "private-use Unicode character"
 								  NotAssigned          -> "non-assigned Unicode character"
-												various minor changes

											
										
										
											2015-08-12 15:41:22 +03:00
-												refactoring, phase 1

											
										
										
											2015-07-28 16:32:19 +03:00
+								-- | @char c@ parses a single character @c@.
 								--
-												various minor changes

											
										
										
											2015-08-12 15:41:22 +03:00
+								-- > semicolon = char ';'
-												refactoring, phase 1

											
										
										
											2015-07-28 16:32:19 +03:00
 								char :: Stream s m Char => Char -> ParsecT s u m Char
-												further cosmetic changes in ‘Text.Megaparsec.Char’

											
										
										
											2015-08-08 18:17:27 +03:00
+								char c = satisfy (== c) <?> showToken c
-												refactoring, phase 1

											
										
										
											2015-07-28 16:32:19 +03:00
 								-- | This parser succeeds for any character. Returns the parsed character.
 								anyChar :: Stream s m Char => ParsecT s u m Char
-												further cosmetic changes in ‘Text.Megaparsec.Char’

											
										
										
											2015-08-08 18:17:27 +03:00
+								anyChar = satisfy (const True) <?> "character"
-												refactoring, phase 1

											
										
										
											2015-07-28 16:32:19 +03:00
-												extend collection of character parsers, close #16

Added new character parsers in ‘Text.Megaparsec.Char’:

* ‘controlChar’
* ‘printChar’
* ‘markChar’
* ‘numberChar’
* ‘punctuationChar’
* ‘symbolChar’
* ‘separatorChar’
* ‘asciiChar’
* ‘latin1Char’
* ‘charCategory’

Renamed some parsers:

‘spaces’   → ‘space’
‘space’    → ‘spaceChar’
‘lower’    → ‘lowerChar’
‘upper’    → ‘upperChar’
‘letter’   → ‘letterChar’
‘alphaNum’ → ‘alphaNumChar’
‘digit’    → ‘digitChar’
‘octDigit’ → ‘octDigitChar’
‘hexDigit’ → ‘hexDigitChar’

Descriptions of old parsers have been updated to highlight some
Unicode-specific details. For example, the old description of ‘letter’
stated that it parses letters from “a” to “z” and from “A” to “Z”. That
was wrong, since it used the ‘Data.Char.isAlpha’ predicate internally and
thus parsed many more characters.

											
										
										
											2015-08-12 20:00:03 +03:00
+								-- | @oneOf cs@ succeeds when the current character is an element of the
 								-- supplied list of characters @cs@, and returns that character. See also
 								-- 'satisfy'.
 								--
 								-- > vowel = oneOf "aeiou" <?> "vowel"
 								oneOf :: Stream s m Char => String -> ParsecT s u m Char
 								oneOf cs = satisfy isListed
 								  where isListed c = c `elem` cs
 								-- | The dual of 'oneOf': @noneOf cs@ succeeds when the current character
 								-- is /not/ an element of the supplied list of characters @cs@, and
 								-- returns that character.
 								--
 								-- > consonant = noneOf "aeiou" <?> "consonant"
 								noneOf :: Stream s m Char => String -> ParsecT s u m Char
 								noneOf cs = satisfy isUnlisted
 								  where isUnlisted c = c `notElem` cs
-												refactoring, phase 1

											
										
										
											2015-07-28 16:32:19 +03:00
+								-- | The parser @satisfy f@ succeeds for any character for which the
 								-- supplied function @f@ returns 'True'. Returns the character that is
 								-- actually parsed.
 								--
 								-- > digit    = satisfy isDigit
 								-- > oneOf cs = satisfy (`elem` cs)
 								satisfy :: Stream s m Char => (Char -> Bool) -> ParsecT s u m Char
-												representation of tokens in error messages, fixed #12

* Type class ‘ShowToken’ introduced to pretty-print tokens.

* For now, we have defined instances for ‘String’ and ‘Char’.

											
										
										
											2015-08-06 13:37:08 +03:00
+								satisfy f = tokenPrim nextPos testChar
 								    where nextPos pos x _ = updatePosChar pos x
-												refactoring, phase 1

											
										
										
											2015-07-28 16:32:19 +03:00
+								          testChar x      = if f x then Just x else Nothing
 								-- | @string s@ parses a sequence of characters given by @s@. Returns
 								-- the parsed string (i.e. @s@).
 								--
 								-- > divOrMod = string "div" <|> string "mod"
 								string :: Stream s m Char => String -> ParsecT s u m String
-												representation of tokens in error messages, fixed #12

* Type class ‘ShowToken’ introduced to pretty-print tokens.

* For now, we have defined instances for ‘String’ and ‘Char’.

											
										
										
											2015-08-06 13:37:08 +03:00
+								string = tokens updatePosString