2020-05-18 15:59:07 +03:00
|
|
|
module Text.Lexer
|
|
|
|
|
2020-06-12 00:14:11 +03:00
|
|
|
import Data.Bool
|
2020-05-18 15:59:07 +03:00
|
|
|
import Data.List
|
|
|
|
import Data.Nat
|
|
|
|
|
|
|
|
import public Text.Lexer.Core
|
|
|
|
import public Text.Quantity
|
|
|
|
import public Text.Token
|
|
|
|
|
2021-08-06 12:03:13 +03:00
|
|
|
%default total
|
|
|
|
|
2020-05-18 15:59:07 +03:00
|
|
|
export
|
|
|
|
toTokenMap : List (Lexer, k) -> TokenMap (Token k)
|
|
|
|
toTokenMap = map $ \(l, kind) => (l, Tok kind)
|
|
|
|
|
|
|
|
||| Recognise any character.
|
|
|
|
||| /./
|
|
|
|
export
|
|
|
|
any : Lexer
|
|
|
|
any = pred (const True)
|
|
|
|
|
|
|
|
||| Recognise a lexer or recognise no input. This is not guaranteed
|
|
|
|
||| to consume input.
|
|
|
|
||| /`l`?/
|
|
|
|
export
|
|
|
|
opt : (l : Lexer) -> Recognise False
|
|
|
|
opt l = l <|> empty
|
|
|
|
|
|
|
|
||| Recognise any character if the sub-lexer `l` fails.
|
|
|
|
||| /(?!`l`)./
|
|
|
|
export
|
|
|
|
non : (l : Lexer) -> Lexer
|
|
|
|
non l = reject l <+> any
|
|
|
|
|
|
|
|
||| Produce recognisers by applying a function to elements of a container, and
|
|
|
|
||| recognise the first match. Consumes input if the function produces consuming
|
|
|
|
||| recognisers. Fails if the container is empty.
|
|
|
|
export
|
|
|
|
choiceMap : {c : Bool} ->
|
|
|
|
Foldable t => (a -> Recognise c) -> t a -> Recognise c
|
2021-08-06 12:03:13 +03:00
|
|
|
choiceMap {c} f xs = foldr (\x, acc => rewrite sym (andSameNeutral c) in
|
|
|
|
f x <|> acc)
|
|
|
|
fail xs
|
2020-05-18 15:59:07 +03:00
|
|
|
|
|
|
|
||| Recognise the first matching recogniser in a container. Consumes input if
|
|
|
|
||| recognisers in the list consume. Fails if the container is empty.
|
|
|
|
export
|
|
|
|
choice : {c : _} ->
|
|
|
|
Foldable t => t (Recognise c) -> Recognise c
|
2021-08-06 12:03:13 +03:00
|
|
|
choice = choiceMap id
|
2020-05-18 15:59:07 +03:00
|
|
|
|
|
|
|
||| Sequence a list of recognisers. Guaranteed to consume input if the list is
|
|
|
|
||| non-empty and the recognisers consume.
|
|
|
|
export
|
|
|
|
concat : {c : _} ->
|
2021-08-06 12:03:13 +03:00
|
|
|
(xs : List (Recognise c)) -> Recognise (isCons xs && c)
|
2020-05-18 15:59:07 +03:00
|
|
|
concat = Lexer.Core.concatMap id
|
|
|
|
|
|
|
|
||| Recognise a specific character.
|
|
|
|
||| /[`x`]/
|
|
|
|
export
|
|
|
|
is : (x : Char) -> Lexer
|
|
|
|
is x = pred (==x)
|
|
|
|
|
|
|
|
||| Recognise anything but the given character.
|
|
|
|
||| /[\^`x`]/
|
|
|
|
export
|
|
|
|
isNot : (x : Char) -> Lexer
|
|
|
|
isNot x = pred (/=x)
|
|
|
|
|
|
|
|
||| Recognise a specific character (case-insensitive).
|
|
|
|
||| /[`x`]/i
|
|
|
|
export
|
|
|
|
like : (x : Char) -> Lexer
|
|
|
|
like x = pred (\y => toUpper x == toUpper y)
|
|
|
|
|
|
|
|
||| Recognise anything but the given character (case-insensitive).
|
|
|
|
||| /[\^`x`]/i
|
|
|
|
export
|
|
|
|
notLike : (x : Char) -> Lexer
|
|
|
|
notLike x = pred (\y => toUpper x /= toUpper y)
|
|
|
|
|
|
|
|
||| Recognise a specific string.
|
|
|
|
||| Fails if the string is empty.
|
|
|
|
||| /`str`/
|
|
|
|
export
|
|
|
|
exact : (str : String) -> Lexer
|
|
|
|
exact str = case unpack str of
|
|
|
|
[] => fail
|
|
|
|
(x :: xs) => concatMap is (x :: xs)
|
|
|
|
|
|
|
|
||| Recognise a specific string (case-insensitive).
|
|
|
|
||| Fails if the string is empty.
|
|
|
|
||| /`str`/i
|
|
|
|
export
|
|
|
|
approx : (str : String) -> Lexer
|
|
|
|
approx str = case unpack str of
|
|
|
|
[] => fail
|
|
|
|
(x :: xs) => concatMap like (x :: xs)
|
|
|
|
|
|
|
|
||| Recognise any of the characters in the given string.
|
|
|
|
||| /[`chars`]/
|
|
|
|
export
|
|
|
|
oneOf : (chars : String) -> Lexer
|
|
|
|
oneOf chars = pred (\x => x `elem` unpack chars)
|
|
|
|
|
|
|
|
||| Recognise a character range. Also works in reverse!
|
|
|
|
||| /[`start`-`end`]/
|
|
|
|
export
|
|
|
|
range : (start : Char) -> (end : Char) -> Lexer
|
|
|
|
range start end = pred (\x => (x >= min start end)
|
|
|
|
&& (x <= max start end))
|
|
|
|
|
|
|
|
mutual
|
|
|
|
||| Recognise a sequence of at least one sub-lexers
|
|
|
|
||| /`l`+/
|
|
|
|
export
|
|
|
|
some : Lexer -> Lexer
|
|
|
|
some l = l <+> many l
|
|
|
|
|
|
|
|
||| Recognise a sequence of at zero or more sub-lexers. This is not
|
|
|
|
||| guaranteed to consume input
|
|
|
|
||| /`l`\*/
|
|
|
|
export
|
|
|
|
many : Lexer -> Recognise False
|
|
|
|
many l = opt (some l)
|
|
|
|
|
2021-08-06 12:03:13 +03:00
|
|
|
||| Repeat the sub-lexer `l` one or more times until the lexer
|
|
|
|
||| `stopBefore` is encountered. `stopBefore` will not be consumed.
|
|
|
|
||| /((?!`stopBefore`)`l`)\+/
|
|
|
|
export
|
|
|
|
someUntil : (stopBefore : Recognise c) -> (l : Lexer) -> Lexer
|
|
|
|
someUntil stopBefore l = some (reject stopBefore <+> l)
|
|
|
|
|
2020-05-18 15:59:07 +03:00
|
|
|
||| Repeat the sub-lexer `l` zero or more times until the lexer
|
|
|
|
||| `stopBefore` is encountered. `stopBefore` will not be consumed.
|
|
|
|
||| Not guaranteed to consume input.
|
|
|
|
||| /((?!`stopBefore`)`l`)\*/
|
|
|
|
export
|
|
|
|
manyUntil : (stopBefore : Recognise c) -> (l : Lexer) -> Recognise False
|
|
|
|
manyUntil stopBefore l = many (reject stopBefore <+> l)
|
|
|
|
|
|
|
|
||| Repeat the sub-lexer `l` zero or more times until the lexer
|
|
|
|
||| `stopAfter` is encountered, and consume it. Guaranteed to
|
|
|
|
||| consume if `stopAfter` consumes.
|
|
|
|
||| /`l`\*?`stopAfter`/
|
|
|
|
export
|
|
|
|
manyThen : (stopAfter : Recognise c) -> (l : Lexer) -> Recognise c
|
|
|
|
manyThen stopAfter l = manyUntil stopAfter l <+> stopAfter
|
|
|
|
|
|
|
|
||| Recognise a sub-lexer repeated as specified by `q`. Fails if `q` has
|
|
|
|
||| `min` and `max` in the wrong order. Consumes input unless `min q` is zero.
|
|
|
|
||| /`l`{`q`}/
|
|
|
|
export
|
|
|
|
count : (q : Quantity) -> (l : Lexer) -> Recognise (isSucc (min q))
|
|
|
|
count (Qty Z Nothing) l = many l
|
|
|
|
count (Qty Z (Just Z)) _ = empty
|
|
|
|
count (Qty Z (Just (S max))) l = opt $ l <+> count (atMost max) l
|
|
|
|
count (Qty (S min) Nothing) l = l <+> count (atLeast min) l
|
|
|
|
count (Qty (S min) (Just Z)) _ = fail
|
|
|
|
count (Qty (S min) (Just (S max))) l = l <+> count (between min max) l
|
|
|
|
|
|
|
|
||| Recognise a single digit 0-9
|
|
|
|
||| /[0-9]/
|
|
|
|
export
|
|
|
|
digit : Lexer
|
|
|
|
digit = pred isDigit
|
|
|
|
|
|
|
|
||| Recognise one or more digits
|
|
|
|
||| /[0-9]+/
|
|
|
|
export
|
|
|
|
digits : Lexer
|
|
|
|
digits = some digit
|
|
|
|
|
2021-08-06 12:03:13 +03:00
|
|
|
||| Recognise a single binary digit
|
|
|
|
||| /[0-1]/
|
|
|
|
export
|
|
|
|
binDigit : Lexer
|
|
|
|
binDigit = pred (\c => c == '0' || c == '1')
|
|
|
|
|
|
|
|
||| Recognise one or more binary digits
|
|
|
|
||| /[0-1]+/
|
|
|
|
export
|
|
|
|
binDigits : Lexer
|
|
|
|
binDigits = some binDigit
|
|
|
|
|
2020-05-18 15:59:07 +03:00
|
|
|
||| Recognise a single hexidecimal digit
|
|
|
|
||| /[0-9A-Fa-f]/
|
|
|
|
export
|
|
|
|
hexDigit : Lexer
|
|
|
|
hexDigit = pred isHexDigit
|
|
|
|
|
|
|
|
||| Recognise one or more hexidecimal digits
|
|
|
|
||| /[0-9A-Fa-f]+/
|
|
|
|
export
|
|
|
|
hexDigits : Lexer
|
|
|
|
hexDigits = some hexDigit
|
|
|
|
|
|
|
|
||| Recognise a single octal digit
|
|
|
|
||| /[0-8]/
|
|
|
|
export
|
|
|
|
octDigit : Lexer
|
2021-08-06 12:03:13 +03:00
|
|
|
octDigit = pred isOctDigit
|
2020-05-18 15:59:07 +03:00
|
|
|
|
|
|
|
||| Recognise one or more octal digits
|
|
|
|
||| /[0-8]+/
|
|
|
|
export
|
|
|
|
octDigits : Lexer
|
2021-08-06 12:03:13 +03:00
|
|
|
octDigits = some octDigit
|
2020-05-18 15:59:07 +03:00
|
|
|
|
|
|
|
||| Recognise a single alpha character
|
|
|
|
||| /[A-Za-z]/
|
|
|
|
export
|
|
|
|
alpha : Lexer
|
|
|
|
alpha = pred isAlpha
|
|
|
|
|
|
|
|
||| Recognise one or more alpha characters
|
|
|
|
||| /[A-Za-z]+/
|
|
|
|
export
|
|
|
|
alphas : Lexer
|
|
|
|
alphas = some alpha
|
|
|
|
|
|
|
|
||| Recognise a lowercase alpha character
|
|
|
|
||| /[a-z]/
|
|
|
|
export
|
|
|
|
lower : Lexer
|
|
|
|
lower = pred isLower
|
|
|
|
|
|
|
|
||| Recognise one or more lowercase alpha characters
|
|
|
|
||| /[a-z]+/
|
|
|
|
export
|
|
|
|
lowers : Lexer
|
|
|
|
lowers = some lower
|
|
|
|
|
|
|
|
||| Recognise an uppercase alpha character
|
|
|
|
||| /[A-Z]/
|
|
|
|
export
|
|
|
|
upper : Lexer
|
|
|
|
upper = pred isUpper
|
|
|
|
|
|
|
|
||| Recognise one or more uppercase alpha characters
|
|
|
|
||| /[A-Z]+/
|
|
|
|
export
|
|
|
|
uppers : Lexer
|
|
|
|
uppers = some upper
|
|
|
|
|
|
|
|
||| Recognise an alphanumeric character
|
|
|
|
||| /[A-Za-z0-9]/
|
|
|
|
export
|
|
|
|
alphaNum : Lexer
|
|
|
|
alphaNum = pred isAlphaNum
|
|
|
|
|
|
|
|
||| Recognise one or more alphanumeric characters
|
|
|
|
||| /[A-Za-z0-9]+/
|
|
|
|
export
|
|
|
|
alphaNums : Lexer
|
|
|
|
alphaNums = some alphaNum
|
|
|
|
|
|
|
|
||| Recognise a single whitespace character
|
|
|
|
||| /\\s/
|
|
|
|
export
|
|
|
|
space : Lexer
|
|
|
|
space = pred isSpace
|
|
|
|
|
|
|
|
||| Recognise one or more whitespace characters
|
|
|
|
||| /\\s+/
|
|
|
|
export
|
|
|
|
spaces : Lexer
|
|
|
|
spaces = some space
|
|
|
|
|
|
|
|
||| Recognise a single newline sequence. Understands CRLF, CR, and LF
|
|
|
|
||| /\\r\\n|[\\r\\n]/
|
|
|
|
export
|
|
|
|
newline : Lexer
|
|
|
|
newline = let crlf = "\r\n" in
|
|
|
|
exact crlf <|> oneOf crlf
|
|
|
|
|
|
|
|
||| Recognise one or more newline sequences. Understands CRLF, CR, and LF
|
|
|
|
||| /(\\r\\n|[\\r\\n])+)/
|
|
|
|
export
|
|
|
|
newlines : Lexer
|
|
|
|
newlines = some newline
|
|
|
|
|
|
|
|
||| Recognise a single non-whitespace, non-alphanumeric character
|
|
|
|
||| /[\^\\sA-Za-z0-9]/
|
|
|
|
export
|
|
|
|
symbol : Lexer
|
|
|
|
symbol = pred (\x => not (isSpace x || isAlphaNum x))
|
|
|
|
|
|
|
|
||| Recognise one or more non-whitespace, non-alphanumeric characters
|
|
|
|
||| /[\^\\sA-Za-z0-9]+/
|
|
|
|
export
|
|
|
|
symbols : Lexer
|
|
|
|
symbols = some symbol
|
|
|
|
|
|
|
|
||| Recognise a single control character
|
|
|
|
||| /[\\x00-\\x1f\\x7f-\\x9f]/
|
|
|
|
export
|
|
|
|
control : Lexer
|
|
|
|
control = pred isControl
|
|
|
|
|
|
|
|
||| Recognise one or more control characters
|
|
|
|
||| /[\\x00-\\x1f\\x7f-\\x9f]+/
|
|
|
|
export
|
|
|
|
controls : Lexer
|
|
|
|
controls = some control
|
|
|
|
|
|
|
|
||| Recognise zero or more occurrences of a sub-lexer between
|
|
|
|
||| delimiting lexers
|
|
|
|
||| /`start`(`l`)\*?`end`/
|
|
|
|
export
|
|
|
|
surround : (start : Lexer) -> (end : Lexer) -> (l : Lexer) -> Lexer
|
|
|
|
surround start end l = start <+> manyThen end l
|
|
|
|
|
|
|
|
||| Recognise zero or more occurrences of a sub-lexer surrounded
|
|
|
|
||| by the same quote lexer on both sides (useful for strings)
|
|
|
|
||| /`q`(`l`)\*?`q`/
|
|
|
|
export
|
|
|
|
quote : (q : Lexer) -> (l : Lexer) -> Lexer
|
|
|
|
quote q l = surround q q l
|
|
|
|
|
2021-08-06 12:03:13 +03:00
|
|
|
||| Recognise an escape sub-lexer (often '\\') followed by
|
|
|
|
||| another sub-lexer
|
2020-05-18 15:59:07 +03:00
|
|
|
||| /[`esc`]`l`/
|
|
|
|
export
|
2021-08-06 12:03:13 +03:00
|
|
|
escape : (esc : Lexer) -> Lexer -> Lexer
|
|
|
|
escape esc l = esc <+> l
|
2020-05-18 15:59:07 +03:00
|
|
|
|
|
|
|
||| Recognise a string literal, including escaped characters.
|
|
|
|
||| (Note: doesn't yet handle escape sequences such as \123)
|
|
|
|
||| /"(\\\\.|.)\*?"/
|
|
|
|
export
|
|
|
|
stringLit : Lexer
|
2021-08-06 12:03:13 +03:00
|
|
|
stringLit = quote (is '"') (escape (is '\\') any <|> any)
|
2020-05-18 15:59:07 +03:00
|
|
|
|
|
|
|
||| Recognise a character literal, including escaped characters.
|
|
|
|
||| (Note: doesn't yet handle escape sequences such as \123)
|
|
|
|
||| /'(\\\\.|[\^'])'/
|
|
|
|
export
|
|
|
|
charLit : Lexer
|
|
|
|
charLit = let q = '\'' in
|
2021-08-06 12:03:13 +03:00
|
|
|
is q <+> (escape (is '\\') (control <|> any) <|> isNot q) <+> is q
|
2020-05-18 15:59:07 +03:00
|
|
|
where
|
|
|
|
lexStr : List String -> Lexer
|
|
|
|
lexStr [] = fail
|
|
|
|
lexStr (t :: ts) = exact t <|> lexStr ts
|
|
|
|
|
|
|
|
control : Lexer
|
|
|
|
control = lexStr ["NUL", "SOH", "STX", "ETX", "EOT", "ENQ", "ACK", "BEL",
|
|
|
|
"BS", "HT", "LF", "VT", "FF", "CR", "SO", "SI",
|
|
|
|
"DLE", "DC1", "DC2", "DC3", "DC4", "NAK", "SYN", "ETB",
|
|
|
|
"CAN", "EM", "SUB", "ESC", "FS", "GS", "RS", "US",
|
|
|
|
"SP", "DEL"]
|
|
|
|
<|> (is 'x' <+> hexDigits)
|
|
|
|
<|> (is 'o' <+> octDigits)
|
|
|
|
<|> digits
|
|
|
|
|
|
|
|
||| Recognise an integer literal (possibly with a '-' prefix)
|
|
|
|
||| /-?[0-9]+/
|
|
|
|
export
|
|
|
|
intLit : Lexer
|
|
|
|
intLit = opt (is '-') <+> digits
|
|
|
|
|
2021-08-06 12:03:13 +03:00
|
|
|
||| Recognise a binary literal, prefixed by "0b"
|
|
|
|
||| /0b[0-1]+/
|
|
|
|
export
|
|
|
|
binLit : Lexer
|
|
|
|
binLit = exact "0b" <+> binDigits
|
|
|
|
|
2020-05-18 15:59:07 +03:00
|
|
|
||| Recognise a hexidecimal literal, prefixed by "0x" or "0X"
|
|
|
|
||| /0[Xx][0-9A-Fa-f]+/
|
|
|
|
export
|
|
|
|
hexLit : Lexer
|
|
|
|
hexLit = approx "0x" <+> hexDigits
|
|
|
|
|
2021-08-06 12:03:13 +03:00
|
|
|
||| Recognise an octal literal, prefixed by "0o"
|
|
|
|
||| /0o[0-9A-Fa-f]+/
|
|
|
|
export
|
|
|
|
octLit : Lexer
|
|
|
|
octLit = exact "0o" <+> octDigits
|
|
|
|
|
|
|
|
export
|
|
|
|
digitsUnderscoredLit : Lexer
|
|
|
|
digitsUnderscoredLit = digits <+> many (is '_' <+> digits)
|
|
|
|
|
|
|
|
export
|
|
|
|
binUnderscoredLit : Lexer
|
|
|
|
binUnderscoredLit = binLit <+> many (is '_' <+> binDigits)
|
|
|
|
|
|
|
|
export
|
|
|
|
hexUnderscoredLit : Lexer
|
|
|
|
hexUnderscoredLit = hexLit <+> many (is '_' <+> hexDigits)
|
|
|
|
|
|
|
|
export
|
|
|
|
octUnderscoredLit : Lexer
|
|
|
|
octUnderscoredLit = octLit <+> many (is '_' <+> octDigits)
|
|
|
|
|
2020-05-18 15:59:07 +03:00
|
|
|
||| Recognise `start`, then recognise all input until a newline is encountered,
|
|
|
|
||| and consume the newline. Will succeed if end-of-input is encountered before
|
|
|
|
||| a newline.
|
|
|
|
||| /`start`[\^\\r\\n]+(\\r\\n|[\\r\\n])?/
|
|
|
|
export
|
|
|
|
lineComment : (start : Lexer) -> Lexer
|
|
|
|
lineComment start = start <+> manyUntil newline any <+> opt newline
|
|
|
|
|
|
|
|
||| Recognise all input between `start` and `end` lexers.
|
|
|
|
||| Supports balanced nesting.
|
|
|
|
|||
|
|
|
|
||| For block comments that don't support nesting (such as C-style comments),
|
|
|
|
||| use `surround`
|
|
|
|
export
|
|
|
|
blockComment : (start : Lexer) -> (end : Lexer) -> Lexer
|
|
|
|
blockComment start end = start <+> middle <+> end
|
|
|
|
where
|
|
|
|
middle : Recognise False
|
|
|
|
middle = manyUntil end (blockComment start end <|> any)
|