Idris2/libs/contrib/Text/Lexer/Core.idr

module Text.Lexer.Core

import public Control.Delayed
import Data.Bool
import Data.List
import Data.Nat
import Data.Strings

||| A language of token recognisers.
||| @ consumes If `True`, this recogniser is guaranteed to consume at
|||            least one character of input when it succeeds.
public export
data Recognise : (consumes : Bool) -> Type where
     Empty : Recognise False
     Fail : Recognise c
     Lookahead : (positive : Bool) -> Recognise c -> Recognise False
     Pred : (Char -> Bool) -> Recognise True
     SeqEat : Recognise True -> Inf (Recognise e) -> Recognise True
     SeqEmpty : Recognise e1 -> Recognise e2 -> Recognise (e1 || e2)
     Alt : Recognise e1 -> Recognise e2 -> Recognise (e1 && e2)

||| A token recogniser. Guaranteed to consume at least one character.
public export
Lexer : Type
Lexer = Recognise True

-- %allow_overloads (<+>)

||| Sequence two recognisers. If either consumes a character, the sequence
||| is guaranteed to consume a character.
export %inline
(<+>) : {c1 : Bool} ->
        Recognise c1 -> inf c1 (Recognise c2) -> Recognise (c1 || c2)
(<+>) {c1 = False} = SeqEmpty
(<+>) {c1 = True} = SeqEat

%allow_overloads (<|>)

||| Alternative recognisers. If both consume, the combination is guaranteed
||| to consumer a character.
%inline
export
(<|>) : Recognise c1 -> Recognise c2 -> Recognise (c1 && c2)
(<|>) = Alt

||| A recogniser that always fails.
export
fail : Recognise c
fail = Fail

||| Recognise no input (doesn't consume any input)
%inline
export
empty : Recognise False
empty = Empty

||| Recognise a character that matches a predicate
%inline
export
pred : (Char -> Bool) -> Lexer
pred = Pred

||| Positive lookahead. Never consumes input.
export
expect : Recognise c -> Recognise False
expect = Lookahead True

||| Negative lookahead. Never consumes input.
export
reject : Recognise c -> Recognise False
reject = Lookahead False

%allow_overloads concatMap

||| Sequence the recognisers resulting from applying a function to each element
||| of a list. The resulting recogniser will consume input if the produced
||| recognisers consume and the list is non-empty.
export
concatMap : (a -> Recognise c) -> (xs : List a) ->
            Recognise (c && (isCons xs))
concatMap {c} _ [] = rewrite andFalseFalse c in Empty
concatMap {c} f (x :: xs)
   = rewrite andTrueNeutral c in
     rewrite sym (orSameAndRightNeutral c (isCons xs)) in
             SeqEmpty (f x) (Core.concatMap f xs)

data StrLen : Type where
     MkStrLen : String -> Nat -> StrLen

Show StrLen where
  show (MkStrLen str n) = str ++ "(" ++ show n ++ ")"

getString : StrLen -> String
getString (MkStrLen str n) = str

strIndex : StrLen -> Nat -> Maybe Char
strIndex (MkStrLen str len) i
    = if cast {to = Integer} i >= cast len then Nothing
                  else Just (assert_total (prim__strIndex str (cast i)))

mkStr : String -> StrLen
mkStr str = MkStrLen str (length str)

strTail : Nat -> StrLen -> StrLen
strTail start (MkStrLen str len)
    = MkStrLen (substr start len str) (minus len start)

isJust : Maybe a -> Bool
isJust Nothing = False
isJust (Just x) = True

export
unpack' : String -> List Char
unpack' str
    = case strUncons str of
           Nothing => []
           Just (x, xs) => x :: unpack' xs

-- If the string is recognised, returns the index at which the token
-- ends
scan : Recognise c -> List Char -> List Char -> Maybe (List Char, List Char)
scan Empty tok str = pure (tok, str)
scan Fail tok str = Nothing
scan (Lookahead positive r) tok str
    = if isJust (scan r tok str) == positive
         then pure (tok, str)
         else Nothing
scan (Pred f) tok [] = Nothing
scan (Pred f) tok (c :: str)
    = if f c
         then Just (c :: tok, str)
         else Nothing
scan (SeqEat r1 r2) tok str
    = do (tok', rest) <- scan r1 tok str
         -- TODO: Can we prove totality instead by showing idx has increased?
         assert_total (scan r2 tok' rest)
scan (SeqEmpty r1 r2) tok str
    = do (tok', rest) <- scan r1 tok str
         scan r2 tok' rest
scan (Alt r1 r2) tok str
    = maybe (scan r2 tok str) Just (scan r1 tok str)

||| A mapping from lexers to the tokens they produce.
||| This is a list of pairs `(Lexer, String -> tokenType)`
||| For each Lexer in the list, if a substring in the input matches, run
||| the associated function to produce a token of type `tokenType`
public export
TokenMap : (tokenType : Type) -> Type
TokenMap tokenType = List (Lexer, String -> tokenType)

||| A token, and the line and column where it was in the input
public export
record TokenData a where
  constructor MkToken
  line : Int
  col : Int
  tok : a

export
Show a => Show (TokenData a) where
  show t = show (line t) ++ ":" ++ show (col t) ++ ":" ++ show (tok t)

tokenise : (TokenData a -> Bool) ->
           (line : Int) -> (col : Int) ->
           List (TokenData a) -> TokenMap a ->
           List Char -> (List (TokenData a), (Int, Int, List Char))
tokenise pred line col acc tmap str
    = case getFirstToken tmap str of
           Just (tok, line', col', rest) =>
           -- assert total because getFirstToken must consume something
               if pred tok
                  then (reverse acc, (line, col, []))
                  else assert_total (tokenise pred line' col' (tok :: acc) tmap rest)
           Nothing => (reverse acc, (line, col, str))
  where
    countNLs : List Char -> Nat
    countNLs str = List.length (filter (== '\n') str)

    getCols : List Char -> Int -> Int
    getCols x c
         = case span (/= '\n') (reverse x) of
                (incol, []) => c + cast (length incol)
                (incol, _) => cast (length incol)

    getFirstToken : TokenMap a -> List Char ->
                    Maybe (TokenData a, Int, Int, List Char)
    getFirstToken [] str = Nothing
    getFirstToken ((lex, fn) :: ts) str
        = case scan lex [] str of
               Just (tok, rest) => Just (MkToken line col (fn (pack (reverse tok))),
                                         line + cast (countNLs tok),
                                         getCols tok col, rest)
               Nothing => getFirstToken ts str

||| Given a mapping from lexers to token generating functions (the
||| TokenMap a) and an input string, return a list of recognised tokens,
||| and the line, column, and remainder of the input at the first point in the
||| string where there are no recognised tokens.
export
lex : TokenMap a -> String -> (List (TokenData a), (Int, Int, String))
lex tmap str
    = let (ts, (l, c, str')) = tokenise (const False) 0 0 [] tmap (fastUnpack str) in
          (ts, (l, c, pack str'))

export
lexTo : (TokenData a -> Bool) ->
        TokenMap a -> String -> (List (TokenData a), (Int, Int, String))
lexTo pred tmap str
    = let (ts, (l, c, str')) = tokenise pred 0 0 [] tmap (fastUnpack str) in
          (ts, (l, c, pack str'))
Add libraries 2020-05-18 15:59:07 +03:00			`module Text.Lexer.Core`

			`import public Control.Delayed`
various stdlib updates 2020-06-12 00:14:11 +03:00			`import Data.Bool`
Add libraries 2020-05-18 15:59:07 +03:00			`import Data.List`
			`import Data.Nat`
			`import Data.Strings`

			`\|\|\| A language of token recognisers.`
			\|\|\| @ consumes If `True`, this recogniser is guaranteed to consume at
			`\|\|\| least one character of input when it succeeds.`
			`public export`
			`data Recognise : (consumes : Bool) -> Type where`
			`Empty : Recognise False`
			`Fail : Recognise c`
			`Lookahead : (positive : Bool) -> Recognise c -> Recognise False`
			`Pred : (Char -> Bool) -> Recognise True`
			`SeqEat : Recognise True -> Inf (Recognise e) -> Recognise True`
			`SeqEmpty : Recognise e1 -> Recognise e2 -> Recognise (e1 \|\| e2)`
			`Alt : Recognise e1 -> Recognise e2 -> Recognise (e1 && e2)`

			`\|\|\| A token recogniser. Guaranteed to consume at least one character.`
			`public export`
			`Lexer : Type`
			`Lexer = Recognise True`

			`-- %allow_overloads (<+>)`

			`\|\|\| Sequence two recognisers. If either consumes a character, the sequence`
			`\|\|\| is guaranteed to consume a character.`
			`export %inline`
			`(<+>) : {c1 : Bool} ->`
			`Recognise c1 -> inf c1 (Recognise c2) -> Recognise (c1 \|\| c2)`
			`(<+>) {c1 = False} = SeqEmpty`
			`(<+>) {c1 = True} = SeqEat`

			`%allow_overloads (<\|>)`

			`\|\|\| Alternative recognisers. If both consume, the combination is guaranteed`
			`\|\|\| to consumer a character.`
			`%inline`
			`export`
			`(<\|>) : Recognise c1 -> Recognise c2 -> Recognise (c1 && c2)`
			`(<\|>) = Alt`

			`\|\|\| A recogniser that always fails.`
			`export`
			`fail : Recognise c`
			`fail = Fail`

			`\|\|\| Recognise no input (doesn't consume any input)`
			`%inline`
			`export`
			`empty : Recognise False`
			`empty = Empty`

			`\|\|\| Recognise a character that matches a predicate`
			`%inline`
			`export`
			`pred : (Char -> Bool) -> Lexer`
			`pred = Pred`

			`\|\|\| Positive lookahead. Never consumes input.`
			`export`
			`expect : Recognise c -> Recognise False`
			`expect = Lookahead True`

			`\|\|\| Negative lookahead. Never consumes input.`
			`export`
			`reject : Recognise c -> Recognise False`
			`reject = Lookahead False`

			`%allow_overloads concatMap`

			`\|\|\| Sequence the recognisers resulting from applying a function to each element`
			`\|\|\| of a list. The resulting recogniser will consume input if the produced`
			`\|\|\| recognisers consume and the list is non-empty.`
			`export`
various stdlib updates 2020-06-12 00:14:11 +03:00			`concatMap : (a -> Recognise c) -> (xs : List a) ->`
Add libraries 2020-05-18 15:59:07 +03:00			`Recognise (c && (isCons xs))`
			`concatMap {c} _ [] = rewrite andFalseFalse c in Empty`
various stdlib updates 2020-06-12 00:14:11 +03:00			`concatMap {c} f (x :: xs)`
Add libraries 2020-05-18 15:59:07 +03:00			`= rewrite andTrueNeutral c in`
			`rewrite sym (orSameAndRightNeutral c (isCons xs)) in`
			`SeqEmpty (f x) (Core.concatMap f xs)`

			`data StrLen : Type where`
			`MkStrLen : String -> Nat -> StrLen`

			`Show StrLen where`
			`show (MkStrLen str n) = str ++ "(" ++ show n ++ ")"`

			`getString : StrLen -> String`
			`getString (MkStrLen str n) = str`

			`strIndex : StrLen -> Nat -> Maybe Char`
			`strIndex (MkStrLen str len) i`
			`= if cast {to = Integer} i >= cast len then Nothing`
			`else Just (assert_total (prim__strIndex str (cast i)))`

			`mkStr : String -> StrLen`
			`mkStr str = MkStrLen str (length str)`

			`strTail : Nat -> StrLen -> StrLen`
			`strTail start (MkStrLen str len)`
			`= MkStrLen (substr start len str) (minus len start)`

			`isJust : Maybe a -> Bool`
			`isJust Nothing = False`
			`isJust (Just x) = True`

			`export`
			`unpack' : String -> List Char`
various stdlib updates 2020-06-12 00:14:11 +03:00			`unpack' str`
Add libraries 2020-05-18 15:59:07 +03:00			`= case strUncons str of`
			`Nothing => []`
			`Just (x, xs) => x :: unpack' xs`

			`-- If the string is recognised, returns the index at which the token`
			`-- ends`
			`scan : Recognise c -> List Char -> List Char -> Maybe (List Char, List Char)`
			`scan Empty tok str = pure (tok, str)`
			`scan Fail tok str = Nothing`
			`scan (Lookahead positive r) tok str`
			`= if isJust (scan r tok str) == positive`
			`then pure (tok, str)`
			`else Nothing`
			`scan (Pred f) tok [] = Nothing`
			`scan (Pred f) tok (c :: str)`
			`= if f c`
			`then Just (c :: tok, str)`
			`else Nothing`
			`scan (SeqEat r1 r2) tok str`
			`= do (tok', rest) <- scan r1 tok str`
			`-- TODO: Can we prove totality instead by showing idx has increased?`
			`assert_total (scan r2 tok' rest)`
			`scan (SeqEmpty r1 r2) tok str`
			`= do (tok', rest) <- scan r1 tok str`
			`scan r2 tok' rest`
			`scan (Alt r1 r2) tok str`
			`= maybe (scan r2 tok str) Just (scan r1 tok str)`

			`\|\|\| A mapping from lexers to the tokens they produce.`
			\|\|\| This is a list of pairs `(Lexer, String -> tokenType)`
			`\|\|\| For each Lexer in the list, if a substring in the input matches, run`
			\|\|\| the associated function to produce a token of type `tokenType`
			`public export`
			`TokenMap : (tokenType : Type) -> Type`
			`TokenMap tokenType = List (Lexer, String -> tokenType)`

			`\|\|\| A token, and the line and column where it was in the input`
			`public export`
			`record TokenData a where`
			`constructor MkToken`
			`line : Int`
			`col : Int`
			`tok : a`

			`export`
			`Show a => Show (TokenData a) where`
			`show t = show (line t) ++ ":" ++ show (col t) ++ ":" ++ show (tok t)`

			`tokenise : (TokenData a -> Bool) ->`
			`(line : Int) -> (col : Int) ->`
			`List (TokenData a) -> TokenMap a ->`
			`List Char -> (List (TokenData a), (Int, Int, List Char))`
			`tokenise pred line col acc tmap str`
			`= case getFirstToken tmap str of`
			`Just (tok, line', col', rest) =>`
			`-- assert total because getFirstToken must consume something`
			`if pred tok`
			`then (reverse acc, (line, col, []))`
			`else assert_total (tokenise pred line' col' (tok :: acc) tmap rest)`
			`Nothing => (reverse acc, (line, col, str))`
			`where`
			`countNLs : List Char -> Nat`
			`countNLs str = List.length (filter (== '\n') str)`

			`getCols : List Char -> Int -> Int`
			`getCols x c`
			`= case span (/= '\n') (reverse x) of`
			`(incol, []) => c + cast (length incol)`
			`(incol, _) => cast (length incol)`

various stdlib updates 2020-06-12 00:14:11 +03:00			`getFirstToken : TokenMap a -> List Char ->`
Add libraries 2020-05-18 15:59:07 +03:00			`Maybe (TokenData a, Int, Int, List Char)`
			`getFirstToken [] str = Nothing`
			`getFirstToken ((lex, fn) :: ts) str`
			`= case scan lex [] str of`
various stdlib updates 2020-06-12 00:14:11 +03:00			`Just (tok, rest) => Just (MkToken line col (fn (pack (reverse tok))),`
Add libraries 2020-05-18 15:59:07 +03:00			`line + cast (countNLs tok),`
			`getCols tok col, rest)`
			`Nothing => getFirstToken ts str`

			`\|\|\| Given a mapping from lexers to token generating functions (the`
			`\|\|\| TokenMap a) and an input string, return a list of recognised tokens,`
			`\|\|\| and the line, column, and remainder of the input at the first point in the`
			`\|\|\| string where there are no recognised tokens.`
			`export`
			`lex : TokenMap a -> String -> (List (TokenData a), (Int, Int, String))`
			`lex tmap str`
String-related stdlib tweaks. 2020-09-19 15:22:54 +03:00			`= let (ts, (l, c, str')) = tokenise (const False) 0 0 [] tmap (fastUnpack str) in`
Add libraries 2020-05-18 15:59:07 +03:00			`(ts, (l, c, pack str'))`

			`export`
			`lexTo : (TokenData a -> Bool) ->`
			`TokenMap a -> String -> (List (TokenData a), (Int, Int, String))`
			`lexTo pred tmap str`
String-related stdlib tweaks. 2020-09-19 15:22:54 +03:00			`= let (ts, (l, c, str')) = tokenise pred 0 0 [] tmap (fastUnpack str) in`
Add libraries 2020-05-18 15:59:07 +03:00			`(ts, (l, c, pack str'))`