2008-01-13 20:53:15 +03:00
|
|
|
|
-- |
|
2015-08-30 13:00:07 +03:00
|
|
|
|
-- Module : Text.Megaparsec.Lexer
|
2015-08-01 19:24:45 +03:00
|
|
|
|
-- Copyright : © 2015 Megaparsec contributors
|
2015-07-30 19:20:37 +03:00
|
|
|
|
-- © 2007 Paolo Martini
|
|
|
|
|
-- © 1999–2001 Daan Leijen
|
2015-07-28 16:32:19 +03:00
|
|
|
|
-- License : BSD3
|
|
|
|
|
--
|
|
|
|
|
-- Maintainer : Mark Karpov <markkarpov@opmbx.org>
|
2015-07-29 11:38:32 +03:00
|
|
|
|
-- Stability : experimental
|
2008-01-20 09:39:18 +03:00
|
|
|
|
-- Portability : non-portable (uses local universal quantification: PolymorphicComponents)
|
2015-07-28 16:32:19 +03:00
|
|
|
|
--
|
2015-08-30 13:00:07 +03:00
|
|
|
|
-- A helper module to parse lexical elements. See 'makeLexer' for a
|
2015-08-23 11:04:12 +03:00
|
|
|
|
-- description of how to use it.
|
2008-01-13 20:53:15 +03:00
|
|
|
|
|
2008-02-13 07:32:24 +03:00
|
|
|
|
{-# OPTIONS_GHC -fno-warn-name-shadowing #-}
|
2008-01-13 20:53:15 +03:00
|
|
|
|
|
2015-08-30 13:00:07 +03:00
|
|
|
|
module Text.Megaparsec.Lexer
|
2015-08-12 20:51:06 +03:00
|
|
|
|
( LanguageDef (..)
|
2015-08-23 11:04:12 +03:00
|
|
|
|
, Lexer (..)
|
2015-09-02 16:27:48 +03:00
|
|
|
|
, defaultLang
|
2015-08-23 11:04:12 +03:00
|
|
|
|
, makeLexer )
|
2015-07-28 16:32:19 +03:00
|
|
|
|
where
|
2008-01-13 20:53:15 +03:00
|
|
|
|
|
2015-09-02 16:27:48 +03:00
|
|
|
|
import Control.Applicative ((<|>), many, some, empty)
|
2015-08-01 17:39:20 +03:00
|
|
|
|
import Control.Monad (void)
|
2015-08-12 20:00:03 +03:00
|
|
|
|
import Data.Char (isAlpha, toLower, toUpper)
|
2015-08-19 22:11:21 +03:00
|
|
|
|
import Data.List (sort)
|
2015-07-30 18:45:06 +03:00
|
|
|
|
|
2015-08-01 19:24:45 +03:00
|
|
|
|
import Text.Megaparsec.Combinator
|
2015-09-02 16:27:48 +03:00
|
|
|
|
import Text.Megaparsec.Pos
|
|
|
|
|
import Text.Megaparsec.Prim
|
|
|
|
|
import qualified Text.Megaparsec.Char as C
|
2008-01-13 20:53:15 +03:00
|
|
|
|
|
2015-07-30 18:45:06 +03:00
|
|
|
|
-- Language definition
|
2008-01-22 08:14:30 +03:00
|
|
|
|
|
2015-08-12 15:41:22 +03:00
|
|
|
|
-- | The @LanguageDef@ type is a record that contains all parameters used to
|
2015-09-02 16:27:48 +03:00
|
|
|
|
-- control features of the "Text.Megaparsec.Lexer" module. 'defaultLang' can
|
|
|
|
|
-- be used as a basis for new language definitions.
|
2008-01-22 08:14:30 +03:00
|
|
|
|
|
2015-07-30 21:36:54 +03:00
|
|
|
|
data LanguageDef s u m =
|
2015-08-12 20:51:06 +03:00
|
|
|
|
LanguageDef {
|
2015-07-28 16:32:19 +03:00
|
|
|
|
|
2015-09-02 16:27:48 +03:00
|
|
|
|
-- | The parser is used to parse single white space character. If
|
|
|
|
|
-- indentation is important in your language you should probably not treat
|
|
|
|
|
-- newline as white space character.
|
2008-01-22 08:14:30 +03:00
|
|
|
|
|
2015-09-02 16:27:48 +03:00
|
|
|
|
spaceChar :: ParsecT s u m Char
|
2008-01-22 08:14:30 +03:00
|
|
|
|
|
2015-09-02 16:27:48 +03:00
|
|
|
|
-- | The parser parses line comments. It's responsibility of the parser to
|
|
|
|
|
-- stop at the end of line. If your language doesn't support this type of
|
|
|
|
|
-- comments, set this value to 'empty'.
|
2008-01-22 08:14:30 +03:00
|
|
|
|
|
2015-09-02 16:27:48 +03:00
|
|
|
|
, lineComment :: ParsecT s u m ()
|
2008-01-22 08:14:30 +03:00
|
|
|
|
|
2015-09-02 16:27:48 +03:00
|
|
|
|
-- | The parser parses block (multi-line) comments. If your language
|
|
|
|
|
-- doesn't support this type of comments, set this value to 'empty'.
|
2008-01-22 08:14:30 +03:00
|
|
|
|
|
2015-09-02 16:27:48 +03:00
|
|
|
|
, blockComment :: ParsecT s u m ()
|
2008-01-22 08:14:30 +03:00
|
|
|
|
|
2015-09-02 16:27:48 +03:00
|
|
|
|
-- NEXT
|
2008-01-22 08:14:30 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
-- | This parser should accept any start characters of identifiers, for
|
|
|
|
|
-- example @letter \<|> char \'_\'@.
|
2008-01-22 08:14:30 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
, identStart :: ParsecT s u m Char
|
2008-01-22 08:14:30 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
-- | This parser should accept any legal tail characters of identifiers,
|
|
|
|
|
-- for example @alphaNum \<|> char \'_\'@.
|
2008-01-22 08:14:30 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
, identLetter :: ParsecT s u m Char
|
2008-01-22 08:14:30 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
-- | This parser should accept any start characters of operators, for
|
|
|
|
|
-- example @oneOf \":!#$%&*+.\/\<=>?\@\\\\^|-~\"@
|
2008-01-22 08:14:30 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
, opStart :: ParsecT s u m Char
|
2008-01-22 08:14:30 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
-- | This parser should accept any legal tail characters of operators.
|
|
|
|
|
-- Note that this parser should even be defined if the language doesn't
|
|
|
|
|
-- support user-defined operators, or otherwise the 'reservedOp' parser
|
|
|
|
|
-- won't work correctly.
|
2008-01-22 08:14:30 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
, opLetter :: ParsecT s u m Char
|
2008-01-22 08:14:30 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
-- | The list of reserved identifiers.
|
2008-01-22 08:14:30 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
, reservedNames :: [String]
|
2008-01-22 08:14:30 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
-- | The list of reserved operators.
|
2008-01-22 08:14:30 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
, reservedOpNames :: [String]
|
2008-01-22 08:14:30 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
-- | Set to 'True' if the language is case sensitive.
|
2008-01-22 08:14:30 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
, caseSensitive :: Bool }
|
2008-01-13 20:53:15 +03:00
|
|
|
|
|
2015-09-02 16:27:48 +03:00
|
|
|
|
-- Default language definition
|
|
|
|
|
|
|
|
|
|
-- | This is standard language definition. It is recommended to use
|
|
|
|
|
-- this definition as the basis for other definitions. @defaultLang@ has no
|
|
|
|
|
-- reserved names or operators, is case sensitive and doesn't accept
|
|
|
|
|
-- comments, identifiers or operators.
|
|
|
|
|
|
|
|
|
|
defaultLang :: Stream s m Char => LanguageDef s u m
|
|
|
|
|
defaultLang =
|
|
|
|
|
LanguageDef
|
|
|
|
|
{ spaceChar = C.spaceChar
|
|
|
|
|
, lineComment = empty
|
|
|
|
|
, blockComment = empty
|
|
|
|
|
-- NEXT
|
|
|
|
|
, identStart = C.letterChar <|> C.char '_'
|
|
|
|
|
, identLetter = C.alphaNumChar <|> C.oneOf "_'"
|
|
|
|
|
, opStart = opLetter defaultLang
|
|
|
|
|
, opLetter = C.oneOf ":!#$%&*+./<=>?@\\^|-~"
|
|
|
|
|
, reservedOpNames = []
|
|
|
|
|
, reservedNames = []
|
|
|
|
|
, caseSensitive = True }
|
|
|
|
|
|
2015-08-30 13:00:07 +03:00
|
|
|
|
-- Lexer
|
2008-01-22 08:14:30 +03:00
|
|
|
|
|
|
|
|
|
-- | The type of the record that holds lexical parsers that work on
|
|
|
|
|
-- @s@ streams with state @u@ over a monad @m@.
|
|
|
|
|
|
2015-08-23 11:04:12 +03:00
|
|
|
|
data Lexer s u m =
|
|
|
|
|
Lexer {
|
2008-01-22 08:14:30 +03:00
|
|
|
|
|
2015-09-02 16:27:48 +03:00
|
|
|
|
-- | Skips any white space. White space consists of /zero/ or more
|
|
|
|
|
-- occurrences of 'spaceChar', a line comment or a block (multi-line)
|
|
|
|
|
-- comment.
|
|
|
|
|
|
|
|
|
|
space :: ParsecT s u m ()
|
|
|
|
|
|
|
|
|
|
-- | @lexeme p@ first applies parser @p@ and then the 'space' parser,
|
|
|
|
|
-- returning the value of @p@. Every lexical token (lexeme) is defined
|
|
|
|
|
-- using @lexeme@, this way every parse starts at a point without white
|
|
|
|
|
-- space. Parsers that use @lexeme@ are called /lexeme/ parsers in this
|
|
|
|
|
-- document.
|
|
|
|
|
--
|
|
|
|
|
-- The only point where the 'space' parser should be called explicitly is
|
|
|
|
|
-- the start of the main parser in order to skip any leading white space.
|
|
|
|
|
|
|
|
|
|
, lexeme :: forall a. ParsecT s u m a -> ParsecT s u m a
|
|
|
|
|
|
|
|
|
|
-- | Lexeme parser @symbol s@ parses 'string' @s@ and skips
|
|
|
|
|
-- trailing white space.
|
|
|
|
|
|
|
|
|
|
, symbol :: String -> ParsecT s u m String
|
|
|
|
|
|
|
|
|
|
-- | @indentGuard p@ consumes all white space it can consume, then checks
|
|
|
|
|
-- column number. The column number should satisfy given predicate @p@,
|
|
|
|
|
-- otherwise the parser fails with “incorrect indentation” message. In
|
|
|
|
|
-- successful cases @indentGuard@ returns current column number.
|
|
|
|
|
|
|
|
|
|
, indentGuard :: (Int -> Bool) -> ParsecT s u m Int
|
|
|
|
|
|
|
|
|
|
-- NEXT
|
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
-- | The lexeme parser parses a legal identifier. Returns the identifier
|
|
|
|
|
-- string. This parser will fail on identifiers that are reserved
|
|
|
|
|
-- words. Legal identifier (start) characters and reserved words are
|
2015-08-23 11:04:12 +03:00
|
|
|
|
-- defined in the 'LanguageDef' that is passed to 'makeLexer'.
|
2015-07-28 16:32:19 +03:00
|
|
|
|
|
2015-09-02 16:27:48 +03:00
|
|
|
|
, identifier :: ParsecT s u m String
|
2008-01-22 08:14:30 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
-- | The lexeme parser @reserved name@ parses @symbol name@, but it also
|
|
|
|
|
-- checks that the @name@ is not a prefix of a valid identifier.
|
2008-01-22 08:14:30 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
, reserved :: String -> ParsecT s u m ()
|
2008-01-22 08:14:30 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
-- | The lexeme parser parses a legal operator. Returns the name of the
|
|
|
|
|
-- operator. This parser will fail on any operators that are reserved
|
|
|
|
|
-- operators. Legal operator (start) characters and reserved operators are
|
2015-08-23 11:04:12 +03:00
|
|
|
|
-- defined in the 'LanguageDef' that is passed to 'makeLexer'.
|
2008-01-22 08:14:30 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
, operator :: ParsecT s u m String
|
2008-01-22 08:14:30 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
-- | The lexeme parser @reservedOp name@ parses @symbol name@, but it
|
|
|
|
|
-- also checks that the @name@ is not a prefix of a valid operator.
|
2008-01-22 08:14:30 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
, reservedOp :: String -> ParsecT s u m ()
|
2008-01-22 08:14:30 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
-- | The lexeme parser parses a single literal character. Returns the
|
|
|
|
|
-- literal character value. This parsers deals correctly with escape
|
|
|
|
|
-- sequences. The literal character is parsed according to the grammar
|
|
|
|
|
-- rules defined in the Haskell report (which matches most programming
|
|
|
|
|
-- languages quite closely).
|
2008-01-22 08:14:30 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
, charLiteral :: ParsecT s u m Char
|
2008-01-22 08:14:30 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
-- | The lexeme parser parses a literal string. Returns the literal
|
|
|
|
|
-- string value. This parsers deals correctly with escape sequences and
|
|
|
|
|
-- gaps. The literal string is parsed according to the grammar rules
|
|
|
|
|
-- defined in the Haskell report (which matches most programming languages
|
|
|
|
|
-- quite closely).
|
2008-01-22 08:14:30 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
, stringLiteral :: ParsecT s u m String
|
2008-01-22 08:14:30 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
-- | The lexeme parser parses an integer (a whole number). This parser
|
|
|
|
|
-- /does not/ parse sign. Returns the value of the number. The number can
|
|
|
|
|
-- be specified in 'decimal', 'hexadecimal' or 'octal'. The number is
|
|
|
|
|
-- parsed according to the grammar rules in the Haskell report.
|
2015-07-28 16:32:19 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
, integer :: ParsecT s u m Integer
|
2008-01-22 08:14:30 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
-- | This is just like 'integer', except it can parse sign.
|
2008-01-22 08:14:30 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
, integer' :: ParsecT s u m Integer
|
2008-01-22 08:14:30 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
-- | The lexeme parses a positive whole number in the decimal system.
|
|
|
|
|
-- Returns the value of the number.
|
2008-01-22 08:14:30 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
, decimal :: ParsecT s u m Integer
|
2008-01-22 08:14:30 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
-- | The lexeme parses a positive whole number in the hexadecimal
|
|
|
|
|
-- system. The number should be prefixed with “0x” or “0X”. Returns the
|
|
|
|
|
-- value of the number.
|
2008-01-22 08:14:30 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
, hexadecimal :: ParsecT s u m Integer
|
2008-01-22 08:14:30 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
-- | The lexeme parses a positive whole number in the octal system.
|
|
|
|
|
-- The number should be prefixed with “0o” or “0O”. Returns the value of
|
|
|
|
|
-- the number.
|
2008-01-22 08:14:30 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
, octal :: ParsecT s u m Integer
|
2008-01-22 08:14:30 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
-- | @signed p@ tries to parse sign (i.e. “+”, “-”, or nothing) and
|
|
|
|
|
-- then runs parser @p@, changing sign of its result accordingly. Note
|
|
|
|
|
-- that there may be white space after the sign but not before it.
|
2015-07-31 14:30:38 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
, signed :: forall a . Num a => ParsecT s u m a -> ParsecT s u m a
|
2015-07-31 14:30:38 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
-- | The lexeme parser parses a floating point value. Returns the value
|
|
|
|
|
-- of the number. The number is parsed according to the grammar rules
|
|
|
|
|
-- defined in the Haskell report, sign is /not/ parsed, use 'float'' to
|
|
|
|
|
-- achieve parsing of signed floating point values.
|
2015-07-31 14:30:38 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
, float :: ParsecT s u m Double
|
2015-07-31 14:30:38 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
-- | This is just like 'float', except it can parse sign.
|
2015-07-31 14:30:38 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
, float' :: ParsecT s u m Double
|
2015-07-31 14:30:38 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
-- | The lexeme parser parses either 'integer' or a 'float'.
|
|
|
|
|
-- Returns the value of the number. This parser deals with any overlap in
|
|
|
|
|
-- the grammar rules for integers and floats. The number is parsed
|
|
|
|
|
-- according to the grammar rules defined in the Haskell report.
|
2015-07-31 14:30:38 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
, number :: ParsecT s u m (Either Integer Double)
|
2015-07-31 14:30:38 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
-- | This is just like 'number', except it can parse sign.
|
2015-07-31 14:30:38 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
, number' :: ParsecT s u m (Either Integer Double)
|
2015-07-31 14:30:38 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
-- | Lexeme parser @parens p@ parses @p@ enclosed in parenthesis,
|
|
|
|
|
-- returning the value of @p@.
|
2008-01-22 08:14:30 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
, parens :: forall a. ParsecT s u m a -> ParsecT s u m a
|
2008-01-22 08:14:30 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
-- | Lexeme parser @braces p@ parses @p@ enclosed in braces (“{” and
|
|
|
|
|
-- “}”), returning the value of @p@.
|
2008-01-22 08:14:30 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
, braces :: forall a. ParsecT s u m a -> ParsecT s u m a
|
2008-01-22 08:14:30 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
-- | Lexeme parser @angles p@ parses @p@ enclosed in angle brackets (“\<”
|
|
|
|
|
-- and “>”), returning the value of @p@.
|
2008-01-22 08:14:30 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
, angles :: forall a. ParsecT s u m a -> ParsecT s u m a
|
2008-01-22 08:14:30 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
-- | Lexeme parser @brackets p@ parses @p@ enclosed in brackets (“[”
|
|
|
|
|
-- and “]”), returning the value of @p@.
|
2008-01-22 08:14:30 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
, brackets :: forall a. ParsecT s u m a -> ParsecT s u m a
|
2008-01-22 08:14:30 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
-- | Lexeme parser @semicolon@ parses the character “;” and skips any
|
|
|
|
|
-- trailing white space. Returns the string “;”.
|
2008-01-22 08:14:30 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
, semicolon :: ParsecT s u m String
|
2008-01-22 08:14:30 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
-- | Lexeme parser @comma@ parses the character “,” and skips any
|
|
|
|
|
-- trailing white space. Returns the string “,”.
|
2008-01-22 08:14:30 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
, comma :: ParsecT s u m String
|
2008-01-22 08:14:30 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
-- | Lexeme parser @colon@ parses the character “:” and skips any
|
|
|
|
|
-- trailing white space. Returns the string “:”.
|
2008-01-22 08:14:30 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
, colon :: ParsecT s u m String
|
2008-01-22 08:14:30 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
-- | Lexeme parser @dot@ parses the character “.” and skips any
|
|
|
|
|
-- trailing white space. Returns the string “.”.
|
2008-01-22 08:14:30 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
, dot :: ParsecT s u m String
|
2008-01-22 08:14:30 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
-- | Lexeme parser @semiSep p@ parses /zero/ or more occurrences of @p@
|
|
|
|
|
-- separated by 'semicolon'. Returns a list of values returned by @p@.
|
2008-01-22 08:14:30 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
, semicolonSep :: forall a . ParsecT s u m a -> ParsecT s u m [a]
|
2008-01-22 08:14:30 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
-- | Lexeme parser @semiSep1 p@ parses /one/ or more occurrences of @p@
|
|
|
|
|
-- separated by 'semi'. Returns a list of values returned by @p@.
|
2008-01-22 08:14:30 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
, semicolonSep1 :: forall a . ParsecT s u m a -> ParsecT s u m [a]
|
2008-01-22 08:14:30 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
-- | Lexeme parser @commaSep p@ parses /zero/ or more occurrences of
|
|
|
|
|
-- @p@ separated by 'comma'. Returns a list of values returned by @p@.
|
2008-01-22 08:14:30 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
, commaSep :: forall a . ParsecT s u m a -> ParsecT s u m [a]
|
2008-01-22 08:14:30 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
-- | Lexeme parser @commaSep1 p@ parses /one/ or more occurrences of
|
|
|
|
|
-- @p@ separated by 'comma'. Returns a list of values returned by @p@.
|
2008-01-22 08:14:30 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
, commaSep1 :: forall a . ParsecT s u m a -> ParsecT s u m [a] }
|
2008-01-22 08:14:30 +03:00
|
|
|
|
|
2015-08-23 11:04:12 +03:00
|
|
|
|
-- | The expression @makeLexer language@ creates a 'Lexer' record that
|
|
|
|
|
-- contains lexical parsers that are defined using the definitions in the
|
|
|
|
|
-- @language@ record.
|
2008-01-22 08:14:30 +03:00
|
|
|
|
--
|
2015-07-30 18:45:06 +03:00
|
|
|
|
-- The use of this function is quite stylized — one imports the appropriate
|
|
|
|
|
-- language definition and selects the lexical parsers that are needed from
|
2015-08-23 11:04:12 +03:00
|
|
|
|
-- the resulting 'Lexer'.
|
2008-01-22 08:14:30 +03:00
|
|
|
|
--
|
2015-07-30 18:45:06 +03:00
|
|
|
|
-- > module Main (main) where
|
2008-01-22 08:14:30 +03:00
|
|
|
|
-- >
|
2015-08-30 13:00:07 +03:00
|
|
|
|
-- > import Text.Megaparsec
|
|
|
|
|
-- > import Text.Megaparsec.Language (haskellDef)
|
|
|
|
|
-- > import qualified Text.Megaparsec.Lexer as L
|
2008-01-22 08:14:30 +03:00
|
|
|
|
-- >
|
2015-07-30 18:45:06 +03:00
|
|
|
|
-- > -- The parser
|
2015-08-30 13:00:07 +03:00
|
|
|
|
-- > …
|
2015-07-28 16:32:19 +03:00
|
|
|
|
-- >
|
2015-07-30 18:45:06 +03:00
|
|
|
|
-- > expr = parens expr
|
|
|
|
|
-- > <|> identifier
|
2015-08-30 13:00:07 +03:00
|
|
|
|
-- > <|> …
|
2008-01-22 08:14:30 +03:00
|
|
|
|
-- >
|
2015-07-30 18:45:06 +03:00
|
|
|
|
-- > -- The lexer
|
2015-08-30 13:00:07 +03:00
|
|
|
|
-- > lexer = L.makeLexer haskellDef
|
2015-07-28 16:32:19 +03:00
|
|
|
|
-- >
|
2015-08-30 13:00:07 +03:00
|
|
|
|
-- > parens = L.parens lexer
|
|
|
|
|
-- > braces = L.braces lexer
|
|
|
|
|
-- > identifier = L.identifier lexer
|
|
|
|
|
-- > reserved = L.reserved lexer
|
|
|
|
|
-- > …
|
2015-07-30 18:45:06 +03:00
|
|
|
|
|
2015-08-23 11:04:12 +03:00
|
|
|
|
makeLexer :: Stream s m Char => LanguageDef s u m -> Lexer s u m
|
2015-09-02 16:27:48 +03:00
|
|
|
|
makeLexer lang =
|
2015-08-23 11:04:12 +03:00
|
|
|
|
Lexer
|
2015-09-02 16:27:48 +03:00
|
|
|
|
{ space = space
|
|
|
|
|
, lexeme = lexeme
|
|
|
|
|
, symbol = symbol
|
|
|
|
|
, indentGuard = indentGuard
|
|
|
|
|
|
|
|
|
|
, identifier = identifier
|
2015-08-12 20:51:06 +03:00
|
|
|
|
, reserved = reserved
|
|
|
|
|
, operator = operator
|
|
|
|
|
, reservedOp = reservedOp
|
|
|
|
|
|
|
|
|
|
, charLiteral = charLiteral
|
|
|
|
|
, stringLiteral = stringLiteral
|
|
|
|
|
|
|
|
|
|
, integer = integer
|
|
|
|
|
, integer' = integer'
|
|
|
|
|
, decimal = decimal
|
|
|
|
|
, hexadecimal = hexadecimal
|
|
|
|
|
, octal = octal
|
|
|
|
|
, signed = signed
|
|
|
|
|
, float = float
|
|
|
|
|
, float' = float'
|
|
|
|
|
, number = number
|
|
|
|
|
, number' = number'
|
|
|
|
|
|
|
|
|
|
, parens = parens
|
|
|
|
|
, braces = braces
|
|
|
|
|
, angles = angles
|
|
|
|
|
, brackets = brackets
|
|
|
|
|
, semicolon = semicolon
|
|
|
|
|
, comma = comma
|
|
|
|
|
, colon = colon
|
|
|
|
|
, dot = dot
|
|
|
|
|
, semicolonSep = semicolonSep
|
|
|
|
|
, semicolonSep1 = semicolonSep1
|
|
|
|
|
, commaSep = commaSep
|
|
|
|
|
, commaSep1 = commaSep1 }
|
|
|
|
|
where
|
2008-01-13 20:53:15 +03:00
|
|
|
|
|
2015-09-02 16:27:48 +03:00
|
|
|
|
-- white space & indentation
|
|
|
|
|
|
|
|
|
|
space = hidden . skipMany . choice $
|
|
|
|
|
($ lang) <$> [blockComment, lineComment, void . spaceChar]
|
|
|
|
|
lexeme p = p <* space
|
|
|
|
|
symbol = lexeme . C.string
|
|
|
|
|
indentGuard p = do
|
|
|
|
|
space
|
|
|
|
|
pos <- sourceColumn <$> getPosition
|
|
|
|
|
if p pos
|
|
|
|
|
then return pos
|
|
|
|
|
else fail "incorrect indentation"
|
|
|
|
|
|
|
|
|
|
-- bracketing NEXT
|
2008-01-13 20:53:15 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
parens = between (symbol "(") (symbol ")")
|
|
|
|
|
braces = between (symbol "{") (symbol "}")
|
|
|
|
|
angles = between (symbol "<") (symbol ">")
|
|
|
|
|
brackets = between (symbol "[") (symbol "]")
|
2008-01-13 20:53:15 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
semicolon = symbol ";"
|
|
|
|
|
comma = symbol ","
|
|
|
|
|
dot = symbol "."
|
|
|
|
|
colon = symbol ":"
|
2008-01-13 20:53:15 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
commaSep = (`sepBy` comma)
|
|
|
|
|
semicolonSep = (`sepBy` semicolon)
|
2008-01-13 20:53:15 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
commaSep1 = (`sepBy1` comma)
|
|
|
|
|
semicolonSep1 = (`sepBy1` semicolon)
|
2008-01-13 20:53:15 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
-- chars & strings
|
2008-01-13 20:53:15 +03:00
|
|
|
|
|
2015-09-02 16:27:48 +03:00
|
|
|
|
charLiteral = lexeme ( between (C.char '\'')
|
|
|
|
|
(C.char '\'' <?> "end of character")
|
2015-08-12 20:51:06 +03:00
|
|
|
|
characterChar )
|
|
|
|
|
<?> "character"
|
2008-01-13 20:53:15 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
characterChar = charLetter <|> charEscape <?> "literal character"
|
2008-01-13 20:53:15 +03:00
|
|
|
|
|
2015-09-02 16:27:48 +03:00
|
|
|
|
charEscape = C.char '\\' >> escapeCode
|
|
|
|
|
charLetter = C.satisfy (\c -> (c /= '\'') && (c /= '\\') && (c > '\026'))
|
2008-01-13 20:53:15 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
stringLiteral =
|
|
|
|
|
lexeme ((foldr (maybe id (:)) "" <$>
|
2015-09-02 16:27:48 +03:00
|
|
|
|
between (C.char '"') (C.char '"' <?> "end of string")
|
2015-08-12 20:51:06 +03:00
|
|
|
|
(many stringChar)) <?> "literal string")
|
2008-01-13 20:53:15 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
stringChar = (Just <$> stringLetter) <|> stringEscape <?> "string character"
|
2008-01-13 20:53:15 +03:00
|
|
|
|
|
2015-09-02 16:27:48 +03:00
|
|
|
|
stringLetter = C.satisfy (\c -> (c /= '"') && (c /= '\\') && (c > '\026'))
|
2008-01-13 20:53:15 +03:00
|
|
|
|
|
2015-09-02 16:27:48 +03:00
|
|
|
|
stringEscape = C.char '\\' >>
|
2015-08-12 20:51:06 +03:00
|
|
|
|
( (escapeGap >> return Nothing) <|>
|
|
|
|
|
(escapeEmpty >> return Nothing) <|>
|
|
|
|
|
(Just <$> escapeCode) )
|
2008-01-13 20:53:15 +03:00
|
|
|
|
|
2015-09-02 16:27:48 +03:00
|
|
|
|
escapeEmpty = C.char '&'
|
|
|
|
|
escapeGap = some C.spaceChar >> C.char '\\' <?> "end of string gap"
|
2008-01-13 20:53:15 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
-- escape codes
|
2008-01-13 20:53:15 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
escapeCode = charEsc <|> charNum <|> charAscii <|> charControl
|
|
|
|
|
<?> "escape code"
|
2008-01-13 20:53:15 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
charEsc = choice (parseEsc <$> escMap)
|
2015-09-02 16:27:48 +03:00
|
|
|
|
where parseEsc (c, code) = C.char c >> return code
|
2008-01-13 20:53:15 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
charNum = toEnum . fromInteger <$>
|
|
|
|
|
( decimal <|>
|
2015-09-02 16:27:48 +03:00
|
|
|
|
(C.char 'o' >> nump "0o" C.octDigitChar) <|>
|
|
|
|
|
(C.char 'x' >> nump "0x" C.hexDigitChar) )
|
2008-01-13 20:53:15 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
charAscii = choice (parseAscii <$> asciiMap)
|
2015-09-02 16:27:48 +03:00
|
|
|
|
where parseAscii (asc, code) = try (C.string asc >> return code)
|
2015-07-31 14:30:38 +03:00
|
|
|
|
|
2015-09-02 16:27:48 +03:00
|
|
|
|
charControl = toEnum . subtract 64 . fromEnum <$> (C.char '^' >> C.upperChar)
|
2008-01-13 20:53:15 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
-- escape code tables
|
2008-01-13 20:53:15 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
escMap = zip "abfnrtv\\\"\'" "\a\b\f\n\r\t\v\\\"\'"
|
|
|
|
|
asciiMap = zip (ascii3codes ++ ascii2codes) (ascii3 ++ ascii2)
|
2008-01-13 20:53:15 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
ascii2codes = ["BS","HT","LF","VT","FF","CR","SO","SI","EM",
|
|
|
|
|
"FS","GS","RS","US","SP"]
|
|
|
|
|
ascii3codes = ["NUL","SOH","STX","ETX","EOT","ENQ","ACK","BEL",
|
|
|
|
|
"DLE","DC1","DC2","DC3","DC4","NAK","SYN","ETB",
|
|
|
|
|
"CAN","SUB","ESC","DEL"]
|
2008-01-13 20:53:15 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
ascii2 = "\b\t\n\v\f\r\SO\SI\EM\FS\GS\RS\US "
|
|
|
|
|
ascii3 = "\NUL\SOH\STX\ETX\EOT\ENQ\ACK\a\DLE\DC1\DC2\DC3\DC4\NAK\SYN\ETB\CAN\SUB\ESC\DEL"
|
2015-07-30 18:45:06 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
-- numbers — integers
|
2015-07-30 18:45:06 +03:00
|
|
|
|
|
2015-08-19 22:11:21 +03:00
|
|
|
|
integer = decimal
|
2015-08-12 20:51:06 +03:00
|
|
|
|
integer' = signed integer
|
2015-07-30 18:45:06 +03:00
|
|
|
|
|
2015-09-02 16:27:48 +03:00
|
|
|
|
decimal = lexeme (nump "" C.digitChar <?> "integer")
|
|
|
|
|
hexadecimal = lexeme $ C.char '0' >> C.oneOf "xX" >> nump "0x" C.hexDigitChar
|
|
|
|
|
octal = lexeme $ C.char '0' >> C.oneOf "oO" >> nump "0o" C.octDigitChar
|
2015-07-30 18:45:06 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
nump prefix baseDigit = read . (prefix ++) <$> some baseDigit
|
2015-07-30 18:45:06 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
signed p = ($) <$> option id (lexeme sign) <*> p
|
2015-07-30 18:45:06 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
sign :: (Stream s m Char, Num a) => ParsecT s u m (a -> a)
|
2015-09-02 16:27:48 +03:00
|
|
|
|
sign = (C.char '+' *> return id) <|> (C.char '-' *> return negate)
|
2015-07-30 18:45:06 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
-- numbers — floats
|
2008-01-13 20:53:15 +03:00
|
|
|
|
|
2015-08-12 20:51:06 +03:00
|
|
|
|
float = lexeme ffloat <?> "float"
|
|
|
|
|
float' = signed float
|
|
|
|
|
|
|
|
|
|
ffloat = read <$> ffloat'
|
|
|
|
|
where
|
|
|
|
|
ffloat' = do
|
|
|
|
|
decimal <- fDec
|
|
|
|
|
rest <- fraction <|> fExp
|
|
|
|
|
return $ decimal ++ rest
|
|
|
|
|
|
|
|
|
|
fraction = do
|
2015-09-02 16:27:48 +03:00
|
|
|
|
void $ C.char '.'
|
2015-08-12 20:51:06 +03:00
|
|
|
|
decimal <- fDec
|
|
|
|
|
exp <- option "" fExp
|
|
|
|
|
return $ '.' : decimal ++ exp
|
|
|
|
|
|
2015-09-02 16:27:48 +03:00
|
|
|
|
fDec = some C.digitChar
|
2015-08-12 20:51:06 +03:00
|
|
|
|
|
|
|
|
|
fExp = do
|
2015-09-02 16:27:48 +03:00
|
|
|
|
expChar <- C.oneOf "eE"
|
|
|
|
|
signStr <- option "" (pure <$> C.oneOf "+-")
|
2015-08-12 20:51:06 +03:00
|
|
|
|
decimal <- fDec
|
|
|
|
|
return $ expChar : signStr ++ decimal
|
|
|
|
|
|
|
|
|
|
-- numbers — a more general case
|
|
|
|
|
|
|
|
|
|
number = (Right <$> try float) <|> (Left <$> integer) <?> "number"
|
|
|
|
|
number' = (Right <$> try float') <|> (Left <$> integer') <?> "number"
|
|
|
|
|
|
|
|
|
|
-- operators & reserved ops
|
|
|
|
|
|
|
|
|
|
reservedOp name =
|
|
|
|
|
lexeme $ try $ do
|
2015-09-02 16:27:48 +03:00
|
|
|
|
void $ C.string name
|
|
|
|
|
notFollowedBy (opLetter lang) <?> ("end of " ++ show name)
|
2015-08-12 20:51:06 +03:00
|
|
|
|
|
|
|
|
|
operator =
|
|
|
|
|
lexeme $ try $ do
|
|
|
|
|
name <- oper
|
|
|
|
|
if isReservedOp name
|
|
|
|
|
then unexpected ("reserved operator " ++ show name)
|
|
|
|
|
else return name
|
|
|
|
|
|
2015-09-02 16:27:48 +03:00
|
|
|
|
oper = ((:) <$> opStart lang <*> many (opLetter lang))
|
2015-08-12 20:51:06 +03:00
|
|
|
|
<?> "operator"
|
|
|
|
|
|
2015-09-02 16:27:48 +03:00
|
|
|
|
isReservedOp = isReserved . sort $ reservedOpNames lang
|
2015-08-12 20:51:06 +03:00
|
|
|
|
|
|
|
|
|
-- identifiers & reserved words
|
|
|
|
|
|
|
|
|
|
reserved name =
|
|
|
|
|
lexeme $ try $ do
|
|
|
|
|
void $ caseString name
|
2015-09-02 16:27:48 +03:00
|
|
|
|
notFollowedBy (identLetter lang) <?> ("end of " ++ show name)
|
2015-08-12 20:51:06 +03:00
|
|
|
|
|
|
|
|
|
caseString name
|
2015-09-02 16:27:48 +03:00
|
|
|
|
| caseSensitive lang = C.string name
|
2015-08-12 20:51:06 +03:00
|
|
|
|
| otherwise = walk name >> return name
|
|
|
|
|
where walk = foldr (\c -> ((caseChar c <?> show name) >>)) (return ())
|
|
|
|
|
caseChar c
|
2015-09-02 16:27:48 +03:00
|
|
|
|
| isAlpha c = C.char (toLower c) <|> C.char (toUpper c)
|
|
|
|
|
| otherwise = C.char c
|
2015-08-12 20:51:06 +03:00
|
|
|
|
|
|
|
|
|
identifier =
|
|
|
|
|
lexeme $ try $ do
|
|
|
|
|
name <- ident
|
|
|
|
|
if isReservedName name
|
|
|
|
|
then unexpected ("reserved word " ++ show name)
|
|
|
|
|
else return name
|
|
|
|
|
|
2015-09-02 16:27:48 +03:00
|
|
|
|
ident = ((:) <$> identStart lang <*> many (identLetter lang))
|
2015-08-12 20:51:06 +03:00
|
|
|
|
<?> "identifier"
|
|
|
|
|
|
|
|
|
|
isReservedName name = isReserved theReservedNames caseName
|
|
|
|
|
where caseName
|
2015-09-02 16:27:48 +03:00
|
|
|
|
| caseSensitive lang = name
|
2015-08-12 20:51:06 +03:00
|
|
|
|
| otherwise = toLower <$> name
|
|
|
|
|
|
|
|
|
|
isReserved names name = scan names
|
|
|
|
|
where scan [] = False
|
|
|
|
|
scan (r:rs) = case compare r name of
|
|
|
|
|
LT -> scan rs
|
|
|
|
|
EQ -> True
|
|
|
|
|
GT -> False
|
|
|
|
|
|
|
|
|
|
theReservedNames
|
2015-09-02 16:27:48 +03:00
|
|
|
|
| caseSensitive lang = sort reserved
|
2015-08-12 20:51:06 +03:00
|
|
|
|
| otherwise = sort . fmap (fmap toLower) $ reserved
|
2015-09-02 16:27:48 +03:00
|
|
|
|
where reserved = reservedNames lang
|