module Text.Lexer import Data.Bool import Data.List import Data.Nat import public Text.Lexer.Core import public Text.Quantity import public Text.Token export toTokenMap : List (Lexer, k) -> TokenMap (Token k) toTokenMap = map $ \(l, kind) => (l, Tok kind) ||| Recognise any character. ||| /./ export any : Lexer any = pred (const True) ||| Recognise a lexer or recognise no input. This is not guaranteed ||| to consume input. ||| /`l`?/ export opt : (l : Lexer) -> Recognise False opt l = l <|> empty ||| Recognise any character if the sub-lexer `l` fails. ||| /(?!`l`)./ export non : (l : Lexer) -> Lexer non l = reject l <+> any ||| Produce recognisers by applying a function to elements of a container, and ||| recognise the first match. Consumes input if the function produces consuming ||| recognisers. Fails if the container is empty. export choiceMap : {c : Bool} -> Foldable t => (a -> Recognise c) -> t a -> Recognise c choiceMap {c} f xs = foldr (\x, acc => rewrite sym (andSameNeutral c) in f x <|> acc) fail xs ||| Recognise the first matching recogniser in a container. Consumes input if ||| recognisers in the list consume. Fails if the container is empty. export choice : {c : _} -> Foldable t => t (Recognise c) -> Recognise c choice = Lexer.choiceMap id ||| Sequence a list of recognisers. Guaranteed to consume input if the list is ||| non-empty and the recognisers consume. export concat : {c : _} -> (xs : List (Recognise c)) -> Recognise (c && isCons xs) concat = Lexer.Core.concatMap id ||| Recognise a specific character. ||| /[`x`]/ export is : (x : Char) -> Lexer is x = pred (==x) ||| Recognise anything but the given character. ||| /[\^`x`]/ export isNot : (x : Char) -> Lexer isNot x = pred (/=x) ||| Recognise a specific character (case-insensitive). ||| /[`x`]/i export like : (x : Char) -> Lexer like x = pred (\y => toUpper x == toUpper y) ||| Recognise anything but the given character (case-insensitive). ||| /[\^`x`]/i export notLike : (x : Char) -> Lexer notLike x = pred (\y => toUpper x /= toUpper y) ||| Recognise a specific string. ||| Fails if the string is empty. ||| /`str`/ export exact : (str : String) -> Lexer exact str = case unpack str of [] => fail (x :: xs) => concatMap is (x :: xs) ||| Recognise a specific string (case-insensitive). ||| Fails if the string is empty. ||| /`str`/i export approx : (str : String) -> Lexer approx str = case unpack str of [] => fail (x :: xs) => concatMap like (x :: xs) ||| Recognise any of the characters in the given string. ||| /[`chars`]/ export oneOf : (chars : String) -> Lexer oneOf chars = pred (\x => x `elem` unpack chars) ||| Recognise a character range. Also works in reverse! ||| /[`start`-`end`]/ export range : (start : Char) -> (end : Char) -> Lexer range start end = pred (\x => (x >= min start end) && (x <= max start end)) mutual ||| Recognise a sequence of at least one sub-lexers ||| /`l`+/ export some : Lexer -> Lexer some l = l <+> many l ||| Recognise a sequence of at zero or more sub-lexers. This is not ||| guaranteed to consume input ||| /`l`\*/ export many : Lexer -> Recognise False many l = opt (some l) ||| Repeat the sub-lexer `l` zero or more times until the lexer ||| `stopBefore` is encountered. `stopBefore` will not be consumed. ||| Not guaranteed to consume input. ||| /((?!`stopBefore`)`l`)\*/ export manyUntil : (stopBefore : Recognise c) -> (l : Lexer) -> Recognise False manyUntil stopBefore l = many (reject stopBefore <+> l) ||| Repeat the sub-lexer `l` zero or more times until the lexer ||| `stopAfter` is encountered, and consume it. Guaranteed to ||| consume if `stopAfter` consumes. ||| /`l`\*?`stopAfter`/ export manyThen : (stopAfter : Recognise c) -> (l : Lexer) -> Recognise c manyThen stopAfter l = manyUntil stopAfter l <+> stopAfter ||| Recognise many instances of `l` until an instance of `end` is ||| encountered. ||| ||| Useful for defining comments. export manyTill : (l : Lexer) -> (end : Lexer) -> Recognise False manyTill l end = end <|> opt (l <+> manyTill l end) ||| Recognise a sub-lexer repeated as specified by `q`. Fails if `q` has ||| `min` and `max` in the wrong order. Consumes input unless `min q` is zero. ||| /`l`{`q`}/ export count : (q : Quantity) -> (l : Lexer) -> Recognise (isSucc (min q)) count (Qty Z Nothing) l = many l count (Qty Z (Just Z)) _ = empty count (Qty Z (Just (S max))) l = opt $ l <+> count (atMost max) l count (Qty (S min) Nothing) l = l <+> count (atLeast min) l count (Qty (S min) (Just Z)) _ = fail count (Qty (S min) (Just (S max))) l = l <+> count (between min max) l ||| Recognise a single digit 0-9 ||| /[0-9]/ export digit : Lexer digit = pred isDigit ||| Recognise one or more digits ||| /[0-9]+/ export digits : Lexer digits = some digit ||| Recognise a single hexidecimal digit ||| /[0-9A-Fa-f]/ export hexDigit : Lexer hexDigit = pred isHexDigit ||| Recognise one or more hexidecimal digits ||| /[0-9A-Fa-f]+/ export hexDigits : Lexer hexDigits = some hexDigit ||| Recognise a single octal digit ||| /[0-8]/ export octDigit : Lexer octDigit = pred isHexDigit ||| Recognise one or more octal digits ||| /[0-8]+/ export octDigits : Lexer octDigits = some hexDigit ||| Recognise a single alpha character ||| /[A-Za-z]/ export alpha : Lexer alpha = pred isAlpha ||| Recognise one or more alpha characters ||| /[A-Za-z]+/ export alphas : Lexer alphas = some alpha ||| Recognise a lowercase alpha character ||| /[a-z]/ export lower : Lexer lower = pred isLower ||| Recognise one or more lowercase alpha characters ||| /[a-z]+/ export lowers : Lexer lowers = some lower ||| Recognise an uppercase alpha character ||| /[A-Z]/ export upper : Lexer upper = pred isUpper ||| Recognise one or more uppercase alpha characters ||| /[A-Z]+/ export uppers : Lexer uppers = some upper ||| Recognise an alphanumeric character ||| /[A-Za-z0-9]/ export alphaNum : Lexer alphaNum = pred isAlphaNum ||| Recognise one or more alphanumeric characters ||| /[A-Za-z0-9]+/ export alphaNums : Lexer alphaNums = some alphaNum ||| Recognise a single whitespace character ||| /\\s/ export space : Lexer space = pred isSpace ||| Recognise one or more whitespace characters ||| /\\s+/ export spaces : Lexer spaces = some space ||| Recognise a single newline sequence. Understands CRLF, CR, and LF ||| /\\r\\n|[\\r\\n]/ export newline : Lexer newline = let crlf = "\r\n" in exact crlf <|> oneOf crlf ||| Recognise one or more newline sequences. Understands CRLF, CR, and LF ||| /(\\r\\n|[\\r\\n])+)/ export newlines : Lexer newlines = some newline ||| Recognise a single non-whitespace, non-alphanumeric character ||| /[\^\\sA-Za-z0-9]/ export symbol : Lexer symbol = pred (\x => not (isSpace x || isAlphaNum x)) ||| Recognise one or more non-whitespace, non-alphanumeric characters ||| /[\^\\sA-Za-z0-9]+/ export symbols : Lexer symbols = some symbol ||| Recognise a single control character ||| /[\\x00-\\x1f\\x7f-\\x9f]/ export control : Lexer control = pred isControl ||| Recognise one or more control characters ||| /[\\x00-\\x1f\\x7f-\\x9f]+/ export controls : Lexer controls = some control ||| Recognise zero or more occurrences of a sub-lexer between ||| delimiting lexers ||| /`start`(`l`)\*?`end`/ export surround : (start : Lexer) -> (end : Lexer) -> (l : Lexer) -> Lexer surround start end l = start <+> manyThen end l ||| Recognise zero or more occurrences of a sub-lexer surrounded ||| by the same quote lexer on both sides (useful for strings) ||| /`q`(`l`)\*?`q`/ export quote : (q : Lexer) -> (l : Lexer) -> Lexer quote q l = surround q q l ||| Recognise an escape character (often '\\') followed by a sub-lexer ||| /[`esc`]`l`/ export escape : (esc : Char) -> Lexer -> Lexer escape esc l = is esc <+> l ||| Recognise a string literal, including escaped characters. ||| (Note: doesn't yet handle escape sequences such as \123) ||| /"(\\\\.|.)\*?"/ export stringLit : Lexer stringLit = quote (is '"') (escape '\\' any <|> any) ||| Recognise a character literal, including escaped characters. ||| (Note: doesn't yet handle escape sequences such as \123) ||| /'(\\\\.|[\^'])'/ export charLit : Lexer charLit = let q = '\'' in is q <+> (escape '\\' (control <|> any) <|> isNot q) <+> is q where lexStr : List String -> Lexer lexStr [] = fail lexStr (t :: ts) = exact t <|> lexStr ts control : Lexer control = lexStr ["NUL", "SOH", "STX", "ETX", "EOT", "ENQ", "ACK", "BEL", "BS", "HT", "LF", "VT", "FF", "CR", "SO", "SI", "DLE", "DC1", "DC2", "DC3", "DC4", "NAK", "SYN", "ETB", "CAN", "EM", "SUB", "ESC", "FS", "GS", "RS", "US", "SP", "DEL"] <|> (is 'x' <+> hexDigits) <|> (is 'o' <+> octDigits) <|> digits ||| Recognise an integer literal (possibly with a '-' prefix) ||| /-?[0-9]+/ export intLit : Lexer intLit = opt (is '-') <+> digits ||| Recognise a hexidecimal literal, prefixed by "0x" or "0X" ||| /0[Xx][0-9A-Fa-f]+/ export hexLit : Lexer hexLit = approx "0x" <+> hexDigits ||| Recognise `start`, then recognise all input until a newline is encountered, ||| and consume the newline. Will succeed if end-of-input is encountered before ||| a newline. ||| /`start`[\^\\r\\n]+(\\r\\n|[\\r\\n])?/ export lineComment : (start : Lexer) -> Lexer lineComment start = start <+> manyUntil newline any <+> opt newline ||| Recognise all input between `start` and `end` lexers. ||| Supports balanced nesting. ||| ||| For block comments that don't support nesting (such as C-style comments), ||| use `surround` export blockComment : (start : Lexer) -> (end : Lexer) -> Lexer blockComment start end = start <+> middle <+> end where middle : Recognise False middle = manyUntil end (blockComment start end <|> any)