lexer: add a language-composable tokenizer
parent 052e3f0908
commit 404fe5fd19
@@ -118,6 +118,7 @@ modules =
     Libraries.Text.Bounded,
     Libraries.Text.Lexer,
     Libraries.Text.Lexer.Core,
+    Libraries.Text.Lexer.Tokenizer,
     Libraries.Text.Literate,
     Libraries.Text.Parser,
     Libraries.Text.Parser.Core,
@@ -97,6 +97,7 @@ strTail start (MkStrLen str len)

 -- If the string is recognised, returns the consumed characters (in
 -- reverse order) together with the remaining input
+export
 scan : Recognise c -> List Char -> List Char -> Maybe (List Char, List Char)
 scan Empty tok str = pure (tok, str)
 scan Fail tok str = Nothing
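
(Illustrative note, not part of this diff: with `scan` now exported, callers
can drive a single lexer by hand. For example, `scan (exact "ab") [] (unpack "abc")`
evaluates to `Just (['b', 'a'], ['c'])`: the consumed characters in reverse
order, plus the rest of the input.)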
src/Libraries/Text/Lexer/Tokenizer.idr (new file, 130 lines)
@@ -0,0 +1,130 @@
module Libraries.Text.Lexer.Tokenizer

import public Libraries.Control.Delayed
import public Libraries.Text.Bounded
import Libraries.Data.Bool.Extra
import Libraries.Data.String.Extra
import Libraries.Text.Lexer.Core
import Libraries.Text.Lexer
import Data.List
import Data.Either
import Data.Nat
import Data.Strings

%default total

||| Description of a language's tokenizer rule.
export
data Tokenizer : (tokenType : Type) -> Type where
     Match : Lexer -> (String -> tokenType) -> Tokenizer tokenType
     Compose : (begin : Lexer) ->
               (tagger : String -> tag) ->
               (middle : tag -> Tokenizer midTokenType) ->
               (end : tag -> Lexer) ->
               (map : tag -> List (WithBounds midTokenType) -> tokenType) ->
               Tokenizer tokenType
     Alt : Tokenizer tokenType -> Lazy (Tokenizer tokenType) -> Tokenizer tokenType

||| Alternative tokenizer rules.
export
(<|>) : Tokenizer t -> Lazy (Tokenizer t) -> Tokenizer t
(<|>) = Alt

||| Match on a recogniser and cast the string to a token.
export
match : Lexer -> (String -> a) -> Tokenizer a
match = Match
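
-- Illustrative sketch (not part of this commit): a tokenizer for a toy
-- language built from `match` and `<|>`. `ToyTok`, its constructors, and
-- `toyTokenizer` are names invented for this example; `alphas`, `digits`,
-- and `spaces` come from Libraries.Text.Lexer.
data ToyTok = TIdent String | TNum String | TWs

toyTokenizer : Tokenizer ToyTok
toyTokenizer = match alphas TIdent   -- runs of letters become identifiers
           <|> match digits TNum     -- runs of digits become numerals
           <|> match spaces (const TWs) -- whitespace becomes a filler token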

||| Compose another language's tokenizer. The composed language must be
||| quoted between a begin lexer and an end lexer. The begin token
||| can be used to generate the middle tokenizer and the end lexer.
export
compose : (begin : Lexer) ->
          (tagger : String -> tag) ->
          (middle : tag -> Tokenizer b) ->
          (end : tag -> Lexer) ->
          (map : tag -> List (WithBounds b) -> a) ->
          Tokenizer a
compose = Compose
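
-- Illustrative sketch (not part of this commit): using `compose` to give
-- double-quoted regions their own inner tokenizer. `StrTok`, `OuterTok`,
-- and `quoted` are names invented for this example; `is` and `non` come
-- from the lexer library.
data StrTok = Chr String

data OuterTok = Str (List (WithBounds StrTok))

quoted : Tokenizer OuterTok
quoted = compose (is '"')                           -- begin: an opening quote
                 (const ())                         -- tagger: nothing to remember
                 (const (match (non (is '"')) Chr)) -- middle: any non-quote char
                 (const (is '"'))                   -- end: the closing quote
                 (\_, toks => Str toks)             -- map: wrap the inner tokens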

||| The reason why the tokenizer stopped and cannot make more progress.
||| @ ComposeNotClosing carries the span of the composition's begin token,
||| in the form `(startLine, startCol), (endLine, endCol)`.
export
data StopReason =
     EndInput
   | NoRuleApply
   | ComposeNotClosing (Int, Int) (Int, Int)

Show StopReason where
    show EndInput = "EndInput"
    show NoRuleApply = "NoRuleApply"
    show (ComposeNotClosing start end) = "ComposeNotClosing " ++ show start ++ " " ++ show end

tokenise : Tokenizer a -> (reject : Lexer) ->
           (line, col : Int) -> List (WithBounds a) ->
           List Char ->
           (List (WithBounds a), (Int, Int, List Char, StopReason))
tokenise tokenizer reject line col acc str
    = case scan reject [] str of
           Just _ => (reverse acc, (line, col, str, NoRuleApply))
           Nothing => case getFirstToken tokenizer str of
                           Right (tok, line', col', rest) =>
                               -- assert total because getFirstToken must consume something
                               assert_total (tokenise tokenizer reject line' col' (tok :: acc) rest)
                           Left reason => (reverse acc, (line, col, str, reason))
  where
    countNLs : List Char -> Nat
    countNLs str = List.length (filter (== '\n') str)

    getCols : List Char -> Int -> Int
    getCols x c
        = case span (/= '\n') x of
               (incol, []) => c + cast (length incol)
               (incol, _) => cast (length incol)

    getFirstToken : Tokenizer a -> List Char ->
                    Either StopReason (WithBounds a, Int, Int, List Char)
    getFirstToken _ [] = Left EndInput
    getFirstToken (Match lex fn) str
        = do let Just (tok, rest) = scan lex [] str
                 | _ => Left NoRuleApply
             let line' = line + cast (countNLs tok)
             let col' = getCols tok col
             Right (MkBounded (fn (fastPack (reverse tok))) False line col line' col',
                    line', col', rest)
    getFirstToken (Compose begin tagger middleFn endFn fn) str
        = do let Just (beginTok, rest) = scan begin [] str
                 | _ => Left NoRuleApply
             let line' = line + cast (countNLs beginTok)
             let col' = getCols beginTok col
             let tag = tagger $ fastPack beginTok
             let middle = middleFn tag
             let end = endFn tag
             let (midToks, (line'', col'', rest'', reason)) =
                     tokenise middle end line' col' [] rest
             case reason of
                  NoRuleApply => Right ()
                  EndInput => Right ()
                  notClosing => Left notClosing
             let Just (endTok, rest''') = scan end [] rest''
                 | _ => Left $ ComposeNotClosing (line, col) (line', col')
             let line''' = line'' + cast (countNLs endTok)
             let col''' = getCols endTok col''
             let midToks' = MkBounded (fn tag midToks) False line col line''' col'''
             Right (midToks', line''', col''', rest''')
    getFirstToken (Alt t1 t2) str
        = case getFirstToken t1 str of
               Right result => Right result
               Left reason@(ComposeNotClosing _ _) => Left reason
               Left _ => getFirstToken t2 str

||| Given a tokenizer and an input string, return a list of recognised tokens,
||| the reason the tokenizer stopped, and the line, column, and remainder of
||| the input at the first point in the string where there are no recognised
||| tokens.
export
lex : Tokenizer a -> String -> (List (WithBounds a), (Int, Int, String, StopReason))
lex tokenizer str
    = let (ts, (l, c, str', reason)) =
              tokenise tokenizer (pred $ const False) 0 0 [] (fastUnpack str) in
          (ts, (l, c, fastPack str', reason))
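
-- Illustrative usage (not part of this commit), assuming the hypothetical
-- `toyTokenizer` sketched above:
--
--     lex toyTokenizer "abc 123"
--
-- would yield the bounded tokens for "abc", the space, and "123", together
-- with the final position (line 0, column 7), the empty remainder, and
-- `EndInput` as the stop reason.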