lexer: add a language-composable tokenizer

Andy Lok 2021-02-05 17:15:13 +08:00
parent 052e3f0908
commit 404fe5fd19
3 changed files with 132 additions and 0 deletions

@@ -118,6 +118,7 @@ modules =
     Libraries.Text.Bounded,
     Libraries.Text.Lexer,
     Libraries.Text.Lexer.Core,
+    Libraries.Text.Lexer.Tokenizer,
     Libraries.Text.Literate,
     Libraries.Text.Parser,
     Libraries.Text.Parser.Core,

@@ -97,6 +97,7 @@ strTail start (MkStrLen str len)
 -- If the string is recognised, returns the consumed characters (in
 -- reverse order) and the remaining input
+export
 scan : Recognise c -> List Char -> List Char -> Maybe (List Char, List Char)
 scan Empty tok str = pure (tok, str)
 scan Fail tok str = Nothing

@@ -0,0 +1,130 @@
module Libraries.Text.Lexer.Tokenizer

import public Libraries.Control.Delayed
import public Libraries.Text.Bounded
import Libraries.Data.Bool.Extra
import Libraries.Data.String.Extra
import Libraries.Text.Lexer.Core
import Libraries.Text.Lexer
import Data.List
import Data.Either
import Data.Nat
import Data.Strings

%default total

||| Describes a language's tokenizer rules: `Match` turns a single
||| recognised lexeme into a token, `Compose` runs another tokenizer
||| between two delimiting lexers, and `Alt` tries two rules in order.
export
data Tokenizer : (tokenType : Type) -> Type where
     Match : Lexer -> (String -> tokenType) -> Tokenizer tokenType
     Compose : (begin : Lexer) ->
               (tagger : String -> tag) ->
               (middle : tag -> Tokenizer midTokenType) ->
               (end : tag -> Lexer) ->
               (map : tag -> List (WithBounds midTokenType) -> tokenType) ->
               Tokenizer tokenType
     Alt : Tokenizer tokenType -> Lazy (Tokenizer tokenType) -> Tokenizer tokenType

||| Try the first tokenizer rule; if it does not apply, try the second.
export
(<|>) : Tokenizer t -> Lazy (Tokenizer t) -> Tokenizer t
(<|>) = Alt

||| Match on a recogniser and cast the string to a token.
export
match : Lexer -> (String -> a) -> Tokenizer a
match = Match
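
-- For example, a rule turning digit runs into a hypothetical `Num` token,
-- assuming `digits` from Libraries.Text.Lexer:
--     match digits (\s => Num (cast s))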

||| Compose another language's tokenizer. The composed tokens must be
||| delimited by a begin recogniser and an end recogniser. The begin token
||| can be used to choose both the middle tokenizer and the end recogniser.
export
compose : (begin : Lexer) ->
          (tagger : String -> tag) ->
          (middle : tag -> Tokenizer b) ->
          (end : tag -> Lexer) ->
          (map : tag -> List (WithBounds b) -> a) ->
          Tokenizer a
compose = Compose
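
-- For example, a sketch of a rule that tokenizes a brace-delimited region
-- with a separate inner tokenizer (`innerRules` and `Block` are
-- hypothetical; `is` comes from Libraries.Text.Lexer):
--     compose (is '{') (const ()) (const innerRules) (const (is '}'))
--             (\_, ts => Block ts)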

||| The reason why the tokenizer stopped making progress.
||| @ ComposeNotClosing carries the span of the unclosed composition begin
||| token, in the form `(startLine, startCol), (endLine, endCol)`.
export
data StopReason
    = EndInput
    | NoRuleApply
    | ComposeNotClosing (Int, Int) (Int, Int)

Show StopReason where
    show EndInput = "EndInput"
    show NoRuleApply = "NoRuleApply"
    show (ComposeNotClosing start end) = "ComposeNotClosing " ++ show start ++ " " ++ show end

tokenise : Tokenizer a -> (reject : Lexer) ->
           (line, col : Int) -> List (WithBounds a) ->
           List Char ->
           (List (WithBounds a), (Int, Int, List Char, StopReason))
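-- Repeatedly strip one token from the input, stopping when the reject
-- lexer matches, no rule applies, or the input is exhausted.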
tokenise tokenizer reject line col acc str
    = case scan reject [] str of
           Just _ => (reverse acc, (line, col, str, NoRuleApply))
           Nothing => case getFirstToken tokenizer str of
                           Right (tok, line', col', rest) =>
                               -- assert total because getFirstToken must consume something
                               assert_total (tokenise tokenizer reject line' col' (tok :: acc) rest)
                           Left reason => (reverse acc, (line, col, str, reason))
  where
    countNLs : List Char -> Nat
    countNLs str = List.length (filter (== '\n') str)
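
    -- Compute the column after consuming the (reversed) token `x`: since
    -- `scan` accumulates characters in reverse, everything before the first
    -- '\n' here comes after the last newline in the consumed input.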
getCols : List Char -> Int -> Int
getCols x c
= case span (/= '\n') x of
(incol, []) => c + cast (length incol)
(incol, _) => cast (length incol)

    getFirstToken : Tokenizer a -> List Char ->
                    Either StopReason (WithBounds a, Int, Int, List Char)
    getFirstToken _ [] = Left EndInput
    getFirstToken (Match lex fn) str
        = do let Just (tok, rest) = scan lex [] str
                 | _ => Left NoRuleApply
             let line' = line + cast (countNLs tok)
             let col' = getCols tok col
             Right (MkBounded (fn (fastPack (reverse tok))) False line col line' col',
                    line', col', rest)
    getFirstToken (Compose begin tagger middleFn endFn fn) str
        = do let Just (beginTok, rest) = scan begin [] str
                 | _ => Left NoRuleApply
             let line' = line + cast (countNLs beginTok)
             let col' = getCols beginTok col
             -- scan yields the token characters in reverse order, so restore
             -- the original order before tagging
             let tag = tagger $ fastPack $ reverse beginTok
             let middle = middleFn tag
             let end = endFn tag
             let (midToks, (line'', col'', rest'', reason)) =
                   tokenise middle end line' col' [] rest
             case reason of
                  NoRuleApply => Right ()
                  EndInput => Right ()
                  notClosing => Left notClosing
             let Just (endTok, rest''') = scan end [] rest''
                 | _ => Left $ ComposeNotClosing (line, col) (line', col')
             let line''' = line'' + cast (countNLs endTok)
             let col''' = getCols endTok col''
             let midToks' = MkBounded (fn tag midToks) False line col line''' col'''
             Right (midToks', line''', col''', rest''')
    getFirstToken (Alt t1 t2) str
        = case getFirstToken t1 str of
               Right result => Right result
               Left reason@(ComposeNotClosing _ _) => Left reason
               Left _ => getFirstToken t2 str

||| Given a tokenizer and an input string, return a list of recognised tokens,
||| together with the line, column, and remainder of the input at the first
||| point where no more tokens are recognised, and the reason tokenisation
||| stopped.
export
lex : Tokenizer a -> String -> (List (WithBounds a), (Int, Int, String, StopReason))
lex tokenizer str
    = let (ts, (l, c, str', reason)) =
            tokenise tokenizer (pred $ const False) 0 0 [] (fastUnpack str) in
          (ts, (l, c, fastPack str', reason))
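
As a usage sketch (not part of this commit): the token types, innerRules, and rules below are hypothetical, while alphas, digits, spaces, and is come from Libraries.Text.Lexer. A tokenizer for words, numbers, and brace-delimited sub-blocks could look like this:

    import Libraries.Text.Lexer
    import Libraries.Text.Lexer.Tokenizer

    data InnerTok = Word String | InnerSpace

    data Tok = Ident String | Number Integer | Whitespace
             | Block (List (WithBounds InnerTok))

    -- Tokenizer used between the braces.
    innerRules : Tokenizer InnerTok
    innerRules = match alphas Word <|> match spaces (const InnerSpace)

    rules : Tokenizer Tok
    rules = match alphas Ident
        <|> match digits (\s => Number (cast s))
        <|> match spaces (const Whitespace)
        <|> compose (is '{')             -- begin recogniser
                    (const ())           -- the tag carries no information here
                    (const innerRules)   -- middle tokenizer chosen from the tag
                    (const (is '}'))     -- end recogniser chosen from the tag
                    (\_, ts => Block ts) -- wrap the inner tokens into one token

    -- lex rules "abc {x y} 12" yields an Ident, a Whitespace, a Block of the
    -- inner tokens, another Whitespace, and a Number (each with bounds),
    -- together with the final position, empty remainder, and EndInput.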