From 79d52c5207621a965420b14a32b4dca0fdd402c0 Mon Sep 17 00:00:00 2001 From: Edwin Brady Date: Sat, 16 May 2020 16:30:51 +0100 Subject: [PATCH] Switch to List Char for the lexer Surprisingly, this doesn't affect performance - it shows up as about the same in the profile. But more importantly, it ports more directly to Idris 2, which can't guarantee that 'strTail' doesn't allocate at runtime, so using a List Char here is much more efficient. This probably points to the need for better string primitives. Later... --- libs/prelude/Prelude.idr | 2 +- src/Text/Lexer/Core.idr | 110 +++++++++++++++++++-------------------- 2 files changed, 54 insertions(+), 58 deletions(-) diff --git a/libs/prelude/Prelude.idr b/libs/prelude/Prelude.idr index 814aa5c..7fbbed1 100644 --- a/libs/prelude/Prelude.idr +++ b/libs/prelude/Prelude.idr @@ -1066,7 +1066,7 @@ reverse = prim__strReverse public export substr : (index : Nat) -> (len : Nat) -> (subject : String) -> String substr s e subj - = if s < length subj + = if natToInteger s < natToInteger (length subj) then prim__strSubstr (prim__cast_IntegerInt (natToInteger s)) (prim__cast_IntegerInt (natToInteger e)) subj diff --git a/src/Text/Lexer/Core.idr b/src/Text/Lexer/Core.idr index d623218..1dece22 100644 --- a/src/Text/Lexer/Core.idr +++ b/src/Text/Lexer/Core.idr @@ -91,34 +91,32 @@ strTail : Nat -> StrLen -> StrLen strTail start (MkStrLen str len) = MkStrLen (substr start len str) (minus len start) -||| If the string is recognised, returns the index at which the token ends -scan : Recognise c -> Nat -> StrLen -> Maybe Nat -scan Empty idx str = pure idx -scan Fail idx str = Nothing -scan (Lookahead positive r) idx str = do - guard (isJust (scan r idx str) == positive) - pure idx -scan (Pred f) idx str = do - c <- strIndex str idx - guard (f c) - pure (1 + idx) -scan (SeqEat r1 r2) idx str - = do idx' <- scan r1 idx str +-- If the string is recognised, returns the index at which the token +-- ends +scan : Recognise c -> List Char -> List Char -> Maybe (List Char, List Char) +scan Empty tok str = pure (tok, str) +scan Fail tok str = Nothing +scan (Lookahead positive r) tok str + = if isJust (scan r tok str) == positive + then pure (tok, str) + else Nothing +scan (Pred f) tok [] = Nothing +scan (Pred f) tok (c :: str) + = if f c + then Just (c :: tok, str) + else Nothing +scan (SeqEat r1 r2) tok str + = do (tok', rest) <- scan r1 tok str -- TODO: Can we prove totality instead by showing idx has increased? - assert_total (scan r2 idx' str) -scan (SeqEmpty r1 r2) idx str - = do idx' <- scan r1 idx str - scan r2 idx' str -scan (SeqSame r1 r2) idx str - = do idx' <- scan r1 idx str - scan r2 idx' str -scan (Alt r1 r2) idx str - = maybe (scan r2 idx str) Just (scan r1 idx str) - -takeToken : Lexer -> StrLen -> Maybe (String, StrLen) -takeToken lex str - = do i <- scan lex 0 str -- i must be > 0 if successful - pure (substr 0 i (getString str), strTail i str) + assert_total (scan r2 tok' rest) +scan (SeqEmpty r1 r2) tok str + = do (tok', rest) <- scan r1 tok str + scan r2 tok' rest +scan (SeqSame r1 r2) tok str + = do (tok', rest) <- scan r1 tok str + scan r2 tok' rest +scan (Alt r1 r2) tok str + = maybe (scan r2 tok str) Just (scan r1 tok str) ||| A mapping from lexers to the tokens they produce. ||| This is a list of pairs `(Lexer, String -> tokenType)` @@ -136,56 +134,54 @@ record TokenData a where col : Int tok : a -||| `fspanEnd 0` returns the index at which the span stopped together -||| with the leftover string. -||| BEWARE: It is NOT a `span` starting from the string's end! -fspanEnd : Nat -> (Char -> Bool) -> String -> (Nat, String) -fspanEnd k p "" = (k, "") -fspanEnd k p xxs - = assert_total $ - let x = prim__strHead xxs - xs = prim__strTail xxs in - if p x then fspanEnd (S k) p xs - else (k, xxs) - -||| Faster version of 'span' from the prelude (avoids unpacking) export -fspan : (Char -> Bool) -> String -> (String, String) -fspan p xs - = let (end, rest) = fspanEnd 0 p xs in - (substr 0 end xs, rest) +Show a => Show (TokenData a) where + show t = show (line t) ++ ":" ++ show (col t) ++ ":" ++ show (tok t) tokenise : (TokenData a -> Bool) -> (line : Int) -> (col : Int) -> List (TokenData a) -> TokenMap a -> - StrLen -> (List (TokenData a), (Int, Int, StrLen)) + List Char -> (List (TokenData a), (Int, Int, List Char)) tokenise pred line col acc tmap str = case getFirstToken tmap str of Just (tok, line', col', rest) => -- assert total because getFirstToken must consume something if pred tok - then (reverse acc, (line, col, MkStrLen "" 0)) + then (reverse acc, (line, col, [])) else assert_total (tokenise pred line' col' (tok :: acc) tmap rest) Nothing => (reverse acc, (line, col, str)) where countNLs : List Char -> Nat countNLs str = List.length (filter (== '\n') str) - getCols : String -> Int -> Int + getCols : List Char -> Int -> Int getCols x c - = case fspan (/= '\n') (reverse x) of - (incol, "") => c + cast (length incol) + = case span (/= '\n') (reverse x) of + (incol, []) => c + cast (length incol) (incol, _) => cast (length incol) - getFirstToken : TokenMap a -> StrLen -> Maybe (TokenData a, Int, Int, StrLen) + getFirstToken : TokenMap a -> List Char -> + Maybe (TokenData a, Int, Int, List Char) getFirstToken [] str = Nothing getFirstToken ((lex, fn) :: ts) str - = case takeToken lex str of - Just (tok, rest) => Just (MkToken line col (fn tok), - line + cast (countNLs (unpack tok)), + = case scan lex [] str of + Just (tok, rest) => Just (MkToken line col (fn (pack (reverse tok))), + line + cast (countNLs tok), getCols tok col, rest) Nothing => getFirstToken ts str +-- faster than the prelude version since there's no intermediate StrM +funpack' : String -> List Char -> List Char +funpack' "" acc = reverse acc +funpack' str acc + = assert_total $ + let x = prim__strHead str + xs = prim__strTail str in + funpack' xs (x :: acc) + +funpack : String -> List Char +funpack str = funpack' str [] + ||| Given a mapping from lexers to token generating functions (the ||| TokenMap a) and an input string, return a list of recognised tokens, ||| and the line, column, and remainder of the input at the first point in the @@ -193,12 +189,12 @@ tokenise pred line col acc tmap str export lex : TokenMap a -> String -> (List (TokenData a), (Int, Int, String)) lex tmap str - = let (ts, (l, c, str')) = tokenise (const False) 0 0 [] tmap (mkStr str) in - (ts, (l, c, getString str')) + = let (ts, (l, c, str')) = tokenise (const False) 0 0 [] tmap (funpack str) in + (ts, (l, c, pack str')) export lexTo : (TokenData a -> Bool) -> TokenMap a -> String -> (List (TokenData a), (Int, Int, String)) lexTo pred tmap str - = let (ts, (l, c, str')) = tokenise pred 0 0 [] tmap (mkStr str) in - (ts, (l, c, getString str')) + = let (ts, (l, c, str')) = tokenise pred 0 0 [] tmap (funpack str) in + (ts, (l, c, pack str'))