Mirror of https://github.com/edwinb/Idris2-boot.git (synced 2024-11-23 20:22:34 +03:00)
Switch to List Char for the lexer
Surprisingly, this doesn't affect performance - it shows up as about the same in the profile. But more importantly, it ports more directly to Idris 2, which can't guarantee that 'strTail' doesn't allocate at runtime, so using a List Char here is much more efficient. This probably points to the need for better string primitives. Later...
parent 560449e087
commit 79d52c5207
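To illustrate the point (a hypothetical sketch, not code from this patch): scanning a String by index forces each step through 'strTail'/'substr', which Idris 2 may compile to a fresh string allocation, whereas scanning a List Char only follows a cons cell. Something like:

    -- Hypothetical helper (takeWhileChars is not in the patch): consume
    -- leading characters satisfying p, returning the consumed prefix and
    -- the remainder. Each step pattern-matches a cons cell; no string is
    -- built until the caller packs the result.
    takeWhileChars : (Char -> Bool) -> List Char -> (List Char, List Char)
    takeWhileChars p [] = ([], [])
    takeWhileChars p (c :: cs)
        = if p c
             then let (tok, rest) = takeWhileChars p cs in
                      (c :: tok, rest)
             else ([], c :: cs)

The patch's scan follows the same shape: the 'Pred' case conses each matched character onto an accumulator, and getFirstToken packs the accumulator back into a String only once a whole token has been recognised.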
@@ -1066,7 +1066,7 @@ reverse = prim__strReverse
 public export
 substr : (index : Nat) -> (len : Nat) -> (subject : String) -> String
 substr s e subj
-    = if s < length subj
+    = if natToInteger s < natToInteger (length subj)
         then prim__strSubstr (prim__cast_IntegerInt (natToInteger s))
                              (prim__cast_IntegerInt (natToInteger e))
                              subj
@@ -91,34 +91,32 @@ strTail : Nat -> StrLen -> StrLen
 strTail start (MkStrLen str len)
     = MkStrLen (substr start len str) (minus len start)
 
-||| If the string is recognised, returns the index at which the token ends
-scan : Recognise c -> Nat -> StrLen -> Maybe Nat
-scan Empty idx str = pure idx
-scan Fail idx str = Nothing
-scan (Lookahead positive r) idx str = do
-    guard (isJust (scan r idx str) == positive)
-    pure idx
-scan (Pred f) idx str = do
-    c <- strIndex str idx
-    guard (f c)
-    pure (1 + idx)
-scan (SeqEat r1 r2) idx str
-    = do idx' <- scan r1 idx str
+-- If the string is recognised, returns the index at which the token
+-- ends
+scan : Recognise c -> List Char -> List Char -> Maybe (List Char, List Char)
+scan Empty tok str = pure (tok, str)
+scan Fail tok str = Nothing
+scan (Lookahead positive r) tok str
+    = if isJust (scan r tok str) == positive
+         then pure (tok, str)
+         else Nothing
+scan (Pred f) tok [] = Nothing
+scan (Pred f) tok (c :: str)
+    = if f c
+         then Just (c :: tok, str)
+         else Nothing
+scan (SeqEat r1 r2) tok str
+    = do (tok', rest) <- scan r1 tok str
          -- TODO: Can we prove totality instead by showing idx has increased?
-         assert_total (scan r2 idx' str)
-scan (SeqEmpty r1 r2) idx str
-    = do idx' <- scan r1 idx str
-         scan r2 idx' str
-scan (SeqSame r1 r2) idx str
-    = do idx' <- scan r1 idx str
-         scan r2 idx' str
-scan (Alt r1 r2) idx str
-    = maybe (scan r2 idx str) Just (scan r1 idx str)
-
-takeToken : Lexer -> StrLen -> Maybe (String, StrLen)
-takeToken lex str
-    = do i <- scan lex 0 str -- i must be > 0 if successful
-         pure (substr 0 i (getString str), strTail i str)
+         assert_total (scan r2 tok' rest)
+scan (SeqEmpty r1 r2) tok str
+    = do (tok', rest) <- scan r1 tok str
+         scan r2 tok' rest
+scan (SeqSame r1 r2) tok str
+    = do (tok', rest) <- scan r1 tok str
+         scan r2 tok' rest
+scan (Alt r1 r2) tok str
+    = maybe (scan r2 tok str) Just (scan r1 tok str)
 
 ||| A mapping from lexers to the tokens they produce.
 ||| This is a list of pairs `(Lexer, String -> tokenType)`
@@ -136,56 +134,54 @@ record TokenData a where
   col : Int
   tok : a
 
-||| `fspanEnd 0` returns the index at which the span stopped together
-||| with the leftover string.
-||| BEWARE: It is NOT a `span` starting from the string's end!
-fspanEnd : Nat -> (Char -> Bool) -> String -> (Nat, String)
-fspanEnd k p "" = (k, "")
-fspanEnd k p xxs
-    = assert_total $
-      let x = prim__strHead xxs
-          xs = prim__strTail xxs in
-      if p x then fspanEnd (S k) p xs
-             else (k, xxs)
-
-||| Faster version of 'span' from the prelude (avoids unpacking)
-export
-fspan : (Char -> Bool) -> String -> (String, String)
-fspan p xs
-    = let (end, rest) = fspanEnd 0 p xs in
-          (substr 0 end xs, rest)
-
 Show a => Show (TokenData a) where
   show t = show (line t) ++ ":" ++ show (col t) ++ ":" ++ show (tok t)
 
 tokenise : (TokenData a -> Bool) ->
            (line : Int) -> (col : Int) ->
            List (TokenData a) -> TokenMap a ->
-           StrLen -> (List (TokenData a), (Int, Int, StrLen))
+           List Char -> (List (TokenData a), (Int, Int, List Char))
 tokenise pred line col acc tmap str
     = case getFirstToken tmap str of
            Just (tok, line', col', rest) =>
              -- assert total because getFirstToken must consume something
             if pred tok
-               then (reverse acc, (line, col, MkStrLen "" 0))
+               then (reverse acc, (line, col, []))
                else assert_total (tokenise pred line' col' (tok :: acc) tmap rest)
            Nothing => (reverse acc, (line, col, str))
   where
    countNLs : List Char -> Nat
    countNLs str = List.length (filter (== '\n') str)
 
-   getCols : String -> Int -> Int
+   getCols : List Char -> Int -> Int
    getCols x c
-      = case fspan (/= '\n') (reverse x) of
-             (incol, "") => c + cast (length incol)
+      = case span (/= '\n') (reverse x) of
+             (incol, []) => c + cast (length incol)
             (incol, _) => cast (length incol)
 
-   getFirstToken : TokenMap a -> StrLen -> Maybe (TokenData a, Int, Int, StrLen)
+   getFirstToken : TokenMap a -> List Char ->
+                   Maybe (TokenData a, Int, Int, List Char)
    getFirstToken [] str = Nothing
    getFirstToken ((lex, fn) :: ts) str
-      = case takeToken lex str of
-             Just (tok, rest) => Just (MkToken line col (fn tok),
-                                       line + cast (countNLs (unpack tok)),
+      = case scan lex [] str of
+             Just (tok, rest) => Just (MkToken line col (fn (pack (reverse tok))),
+                                       line + cast (countNLs tok),
                                        getCols tok col, rest)
             Nothing => getFirstToken ts str
 
+-- faster than the prelude version since there's no intermediate StrM
+funpack' : String -> List Char -> List Char
+funpack' "" acc = reverse acc
+funpack' str acc
+    = assert_total $
+      let x = prim__strHead str
+          xs = prim__strTail str in
+          funpack' xs (x :: acc)
+
+funpack : String -> List Char
+funpack str = funpack' str []
+
 ||| Given a mapping from lexers to token generating functions (the
 ||| TokenMap a) and an input string, return a list of recognised tokens,
 ||| and the line, column, and remainder of the input at the first point in the
@@ -193,12 +189,12 @@ tokenise pred line col acc tmap str
 export
 lex : TokenMap a -> String -> (List (TokenData a), (Int, Int, String))
 lex tmap str
-    = let (ts, (l, c, str')) = tokenise (const False) 0 0 [] tmap (mkStr str) in
-          (ts, (l, c, getString str'))
+    = let (ts, (l, c, str')) = tokenise (const False) 0 0 [] tmap (funpack str) in
+          (ts, (l, c, pack str'))
 
 export
 lexTo : (TokenData a -> Bool) ->
         TokenMap a -> String -> (List (TokenData a), (Int, Int, String))
 lexTo pred tmap str
-    = let (ts, (l, c, str')) = tokenise pred 0 0 [] tmap (mkStr str) in
-          (ts, (l, c, getString str'))
+    = let (ts, (l, c, str')) = tokenise pred 0 0 [] tmap (funpack str) in
+          (ts, (l, c, pack str'))