mirror of
https://github.com/idris-lang/Idris2.git
synced 2024-12-01 01:09:03 +03:00
lexer: add a language-composable tokenizer
parent 052e3f0908
commit 404fe5fd19
@@ -118,6 +118,7 @@ modules =
     Libraries.Text.Bounded,
     Libraries.Text.Lexer,
     Libraries.Text.Lexer.Core,
+    Libraries.Text.Lexer.Tokenizer,
     Libraries.Text.Literate,
     Libraries.Text.Parser,
     Libraries.Text.Parser.Core,
@@ -97,6 +97,7 @@ strTail start (MkStrLen str len)
 
 -- If the string is recognised, returns the index at which the token
 -- ends
+export
 scan : Recognise c -> List Char -> List Char -> Maybe (List Char, List Char)
 scan Empty tok str = pure (tok, str)
 scan Fail tok str = Nothing
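Aside (not part of the diff): `scan` tries a recogniser against the head of
the input and, on success, returns the consumed characters together with the
remaining input. The consumed characters are accumulated in reverse order,
which is why the new Tokenizer module reverses them before packing. A rough
illustration, using `exact` from Libraries.Text.Lexer:

    scan (exact "ab") [] (unpack "abc")
    -- Just (['b', 'a'], ['c'])  -- matched characters reversed; "c" remains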
src/Libraries/Text/Lexer/Tokenizer.idr (new file, 130 lines)
@@ -0,0 +1,130 @@
+module Libraries.Text.Lexer.Tokenizer
+
+import public Libraries.Control.Delayed
+import public Libraries.Text.Bounded
+import Libraries.Data.Bool.Extra
+import Libraries.Data.String.Extra
+import Libraries.Text.Lexer.Core
+import Libraries.Text.Lexer
+import Data.List
+import Data.Either
+import Data.Nat
+import Data.Strings
+
+%default total
+
+||| Description of a language's tokenizer rule.
+export
+data Tokenizer : (tokenType : Type) -> Type where
+     Match : Lexer -> (String -> tokenType) -> Tokenizer tokenType
+     Compose : (begin : Lexer) ->
+               (tagger : String -> tag) ->
+               (middle : tag -> Tokenizer midTokenType) ->
+               (end : tag -> Lexer) ->
+               (map : tag -> List (WithBounds midTokenType) -> tokenType) ->
+               Tokenizer tokenType
+     Alt : Tokenizer tokenType -> Lazy (Tokenizer tokenType) -> Tokenizer tokenType
+
+||| Alternative tokenizer rules.
+export
+(<|>) : Tokenizer t -> Lazy (Tokenizer t) -> Tokenizer t
+(<|>) = Alt
+
+||| Match on a recogniser and cast the string to a token.
+export
+match : Lexer -> (String -> a) -> Tokenizer a
+match = Match
+
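For illustration (not part of the commit), a minimal sketch of `match` and
`(<|>)`, with a hypothetical token type and the `alphas`/`digits`/`spaces`
lexers from Libraries.Text.Lexer:

    data Tok = Name String | Num String | WS String  -- hypothetical

    rawTokens : Tokenizer Tok
    rawTokens = match alphas Name
            <|> match digits Num
            <|> match spaces WS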
+||| Compose another language's tokenizer. The composed language is quoted
+||| between a begin recogniser and an end recogniser. The begin token can
+||| be used to generate the middle tokenizer and the end recogniser.
+export
+compose : (begin : Lexer) ->
+          (tagger : String -> tag) ->
+          (middle : tag -> Tokenizer b) ->
+          (end : tag -> Lexer) ->
+          (map : tag -> List (WithBounds b) -> a) ->
+          Tokenizer a
+compose = Compose
+
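For illustration (not part of the commit), a sketch of `compose` that lexes a
nested block delimited by `[|` and `|]`, collecting the inner tokens into a
single outer token; `Tok` and `rawTokens` are the hypothetical names from the
previous sketch, and `exact`/`any` come from Libraries.Text.Lexer:

    data Block = Plain String | Nested (List (WithBounds Tok))  -- hypothetical

    blockTokens : Tokenizer Block
    blockTokens = compose (exact "[|")
                          (const ())                -- the tag carries no information here
                          (const rawTokens)         -- tokenizer for the inside of the block
                          (const (exact "|]"))      -- closing delimiter
                          (\_, toks => Nested toks) -- collect inner tokens into one token
              <|> match any Plain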
+||| The reason why the tokenizer stopped making progress.
+||| @ ComposeNotClosing carries the span of the begin token of an unclosed
+||| composition, in the form `(startLine, startCol), (endLine, endCol)`.
+export
+data StopReason =
+     EndInput
+   | NoRuleApply
+   | ComposeNotClosing (Int, Int) (Int, Int)
+
+Show StopReason where
+    show EndInput = "EndInput"
+    show NoRuleApply = "NoRuleApply"
+    show (ComposeNotClosing start end) = "ComposeNotClosing " ++ show start ++ " " ++ show end
+
+tokenise : Tokenizer a -> (reject : Lexer) ->
+           (line, col : Int) -> List (WithBounds a) ->
+           List Char ->
+           (List (WithBounds a), (Int, Int, List Char, StopReason))
+tokenise tokenizer reject line col acc str
+    = case scan reject [] str of
+           Just _ => (reverse acc, (line, col, str, NoRuleApply))
+           Nothing => case getFirstToken tokenizer str of
+                           Right (tok, line', col', rest) =>
+                               -- assert total because getFirstToken must consume something
+                               assert_total (tokenise tokenizer reject line' col' (tok :: acc) rest)
+                           Left reason => (reverse acc, (line, col, str, reason))
+  where
+    countNLs : List Char -> Nat
+    countNLs str = List.length (filter (== '\n') str)
+
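Aside (not part of the diff): because `scan` returns the consumed characters
in reverse order, `countNLs` is unaffected (the newline count is
order-independent), and `getCols` below can read the new column directly off
the reversed token: the characters before its first '\n' are exactly those
after the token's last newline. A rough illustration, for a token whose text
was "ab\ncd":

    getCols ['d', 'c', '\n', 'b', 'a'] 10  -- = 2, the length of "cd"
    getCols ['b', 'a'] 10                  -- = 12, no newline so 10 + 2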
+    getCols : List Char -> Int -> Int
+    getCols x c
+        = case span (/= '\n') x of
+               (incol, []) => c + cast (length incol)
+               (incol, _) => cast (length incol)
+
+    getFirstToken : Tokenizer a -> List Char ->
+                    Either StopReason (WithBounds a, Int, Int, List Char)
+    getFirstToken _ [] = Left EndInput
+    getFirstToken (Match lex fn) str
+        = do let Just (tok, rest) = scan lex [] str
+                 | _ => Left NoRuleApply
+             let line' = line + cast (countNLs tok)
+             let col' = getCols tok col
+             Right (MkBounded (fn (fastPack (reverse tok))) False line col line' col',
+                    line', col', rest)
+    getFirstToken (Compose begin tagger middleFn endFn fn) str
+        = do let Just (beginTok, rest) = scan begin [] str
+                 | _ => Left NoRuleApply
+             let line' = line + cast (countNLs beginTok)
+             let col' = getCols beginTok col
+             let tag = tagger $ fastPack beginTok
+             let middle = middleFn tag
+             let end = endFn tag
+             let (midToks, (line'', col'', rest'', reason)) =
+                   tokenise middle end line' col' [] rest
+             case reason of
+                  NoRuleApply => Right ()
+                  EndInput => Right ()
+                  notClosing => Left notClosing
+             let Just (endTok, rest''') = scan end [] rest''
+                 | _ => Left $ ComposeNotClosing (line, col) (line', col')
+             let line''' = line'' + cast (countNLs endTok)
+             let col''' = getCols endTok col''
+             let midToks' = MkBounded (fn tag midToks) False line col line''' col'''
+             Right (midToks', line''', col''', rest''')
+    getFirstToken (Alt t1 t2) str
+        = case getFirstToken t1 str of
+               Right result => Right result
+               Left reason@(ComposeNotClosing _ _) => Left reason
+               Left _ => getFirstToken t2 str
+
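Aside (not part of the diff): in the `Alt` case above, `ComposeNotClosing` is
deliberately not recovered by trying the second alternative, so an unclosed
composition aborts tokenisation instead of being re-lexed by a later rule.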
+||| Given a tokenizer and an input string, return a list of recognised tokens,
+||| and the line, column, and remainder of the input at the first point in the
+||| string where there are no recognised tokens.
+export
+lex : Tokenizer a -> String -> (List (WithBounds a), (Int, Int, String, StopReason))
+lex tokenizer str
+    = let (ts, (l, c, str', reason)) =
+            tokenise tokenizer (pred $ const False) 0 0 [] (fastUnpack str) in
+          (ts, (l, c, fastPack str', reason))
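For illustration (not part of the commit), running `lex` with the hypothetical
`blockTokens` rules sketched earlier gives, roughly (bounds omitted):

    -- lex blockTokens "x[|abc 12|]y"
    -- ==> ([Plain "x", Nested [Name "abc", WS " ", Num "12"], Plain "y"],
    --      (line, col, "", EndInput))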