Idris2/libs/contrib/Text/Literate.idr

||| A simple module to process 'literate' documents.
|||
||| The module uses a lexer to split the document into code blocks,
||| delineated by user-defined markers, and code lines that are
||| indicated by a line marker. The lexer returns a document stripped
||| of non-code elements but preserving the original document's line
||| count. Column numbering of code lines is not preserved.
|||
||| The underlying tokeniser is greedy.
|||
||| Once it identifies a line marker it reads a prettifying space then
||| consumes until the end of line. Once it identifies a starting code
||| block marker, the lexer will consume input until the next
||| identifiable end block is encountered. Any other content is
||| treated as part of the original document.
|||
||| Thus, the input literate files *must* be well-formed w.r.t.
||| code line markers and code blocks.
|||
||| A further restriction is that literate documents cannot contain
||| the markers within the document's main text: This will confuse the
||| lexer.
|||
module Text.Literate

import Text.Lexer

import Data.List
import Data.List1
import Data.List.Views
import Data.String

%default total

||| Recognise the (possibly empty) rest of the current line, stopping
||| in front of the next newline.
untilEOL : Recognise False
untilEOL = manyUntil lineBreak any
  where
    -- The stop condition: the recogniser halts before this matches.
    lineBreak : Lexer
    lineBreak = newline

||| Recognise a code line introduced by the line marker `s`.
|||
||| After the marker the line is either empty (the newline is consumed
||| as part of the lexeme) or has one space followed by the rest of the
||| line.
line : String -> Lexer
line s = exact s <+> (blankLine <|> codeText)
  where
    -- The marker stands alone on its line.
    blankLine : Lexer
    blankLine = newline

    -- One prettifying space, then everything up to the end of the line.
    codeText : Lexer
    codeText = space <+> untilEOL

||| Recognise a code block that starts with the deliminator `s` and
||| ends with the deliminator `e`. Both deliminators extend to the end
||| of their respective lines.
block : String -> String -> Lexer
block s e = surround opening closing any
  where
    -- The opening deliminator plus whatever trails it on that line.
    opening : Lexer
    opening = exact s <+> untilEOL

    -- The closing deliminator plus whatever trails it on that line.
    closing : Lexer
    closing = exact e <+> untilEOL

||| Recognise document text that is not code: either a lone newline,
||| or at least one character followed by the rest of its line.
notCodeLine : Lexer
notCodeLine = newline <|> (any <+> untilEOL)

||| The tokens produced when lexing a literate document.
data Token : Type where
  ||| A deliminated code block: the (trimmed) opening deliminator, the
  ||| (trimmed) closing deliminator, and the matched text.
  CodeBlock : String -> String -> String -> Token
  ||| Any text that is not recognised as code.
  Any : String -> Token
  ||| A marked code line: the (trimmed) line marker and the matched text.
  CodeLine : String -> String -> Token

-- Render a token in constructor-application form.
Show Token where
  showPrec p tok = case tok of
    CodeBlock opener closer body =>
      showCon p "CodeBlock" (showArg opener ++ showArg closer ++ showArg body)
    Any text =>
      showCon p "Any" (showArg text)
    CodeLine marker text =>
      showCon p "CodeLine" (showArg marker ++ showArg text)

||| Build the token map for a literate style.
|||
||| Rules are tried in this order: one block rule per deliminator
||| pair, one line rule per line marker, and finally a catch-all rule
||| for non-code text.
|||
||| @delims  The pairs of opening and closing code block deliminators.
||| @markers The code line markers.
rawTokens : (delims  : List (String, String))
         -> (markers : List String)
         -> TokenMap Token
rawTokens delims ls =
  map blockRule delims ++ map lineRule ls ++ [(notCodeLine, Any)]
  where
    -- A code block rule; the token remembers the trimmed deliminators.
    blockRule : (String, String) -> (Lexer, String -> Token)
    blockRule (l, r) = (block l r, CodeBlock (trim l) (trim r))

    -- A code line rule; the token remembers the trimmed marker.
    lineRule : String -> (Lexer, String -> Token)
    lineRule m = (line m, CodeLine (trim m))

namespace Compat
  -- `reduce` below was depending on the old behaviour of `lines` before #1585
  -- was merged. That old `lines` function is added here to preserve behaviour
  -- of `reduce`.

  ||| Split a list of characters at every character for which `isNL`
  ||| holds. The separators themselves are dropped.
  |||
  ||| The result is never empty: empty input yields a single empty
  ||| segment, and input that ends in a separator yields a final empty
  ||| segment.
  lines' : List Char -> List1 (List Char)
  lines' [] = singleton []
  lines' s  = case break isNL s of
                   (l, s') => l ::: case s' of
                                         [] => []
                                         -- `s''` is a strict suffix of `s` (at least the
                                         -- separator was dropped); `assert_smaller` tells
                                         -- the totality checker so.
                                         _ :: s'' => forget $ lines' (assert_smaller s s'')
  ||| String version of `lines'`: unpack, split, and pack each segment.
  export
  lines : String -> List1 String
  lines s = map pack (lines' (unpack s))

||| Merge the tokens into a single source file.
|||
||| The second argument accumulates the output fragments in reverse
||| order. Once all tokens are consumed the fragments are reversed and
||| concatenated.
|||
||| + `Any` tokens are dropped, except a lone "\n", which is kept.
||| + `CodeLine` tokens contribute their text with the marker (and the
|||   character following it) removed.
||| + `CodeBlock` tokens contribute the lines between their first and
|||   last line; the first and last line (the deliminators) are removed.
reduce : List (WithBounds Token) -> List String -> String
reduce [] acc = concat (reverse acc)
reduce (MkBounded (Any x) _ _ :: rest) acc =
  -- newline will always be tokenized as a single token
  if x == "\n"
  then reduce rest ("\n"::acc)
  else reduce rest acc

-- A line holding nothing but the marker becomes an empty line. Otherwise
-- the text after the marker is kept as is.
-- NOTE(review): when the lexeme is the marker followed only by spaces (no
-- newline consumed by the `line` lexer) the first branch still emits "\n";
-- confirm that a following newline token does not double it.
reduce (MkBounded (CodeLine m src) _ _ :: rest) acc =
    if m == trim src
    then reduce rest ("\n"::acc)
    else reduce rest ((substr (length m + 1) -- remove space to right of marker.
                              (length src)
                              src
                      )::acc)

-- Split the block into lines: `s` is the first line (opening deliminator),
-- `f` the last line (closing deliminator), and `srcs` everything in between.
reduce (MkBounded (CodeBlock l r src) _ _ :: rest) acc with (Compat.lines src) -- Strip the deliminators surrounding the block.
  reduce (MkBounded (CodeBlock l r src) _ _ :: rest) acc | (s ::: ys) with (snocList ys)
    -- The block consists of a single line: it contributes nothing.
    reduce (MkBounded (CodeBlock l r src) _ _ :: rest) acc | (s ::: []) | Empty = reduce rest acc -- 2
    reduce (MkBounded (CodeBlock l r src) _ _ :: rest) acc | (s ::: (srcs ++ [f])) | (Snoc f srcs rec) =
        -- the "\n" counts for the open deliminator; the closing deliminator should always be followed by a (Any "\n"), so we don't add a newline
        -- `acc` is reversed at the end, so the "\n" consed second here
        -- precedes the code lines in the output.
        reduce rest (((unlines srcs) ++ "\n") :: "\n" :: acc)

-- [ NOTE ] The case marked `2` above (a code block consisting of a single line) shouldn't happen as code blocks are well formed i.e. have two deliminators.


||| Information about where processing of a literate document stopped.
|||
||| The three fields are filled, in order, from the line, column and
||| remaining input that the lexer reports when it could not consume
||| the whole document.
public export
record LiterateError where
  constructor MkLitErr
  ||| The line reported by the lexer.
  line   : Int
  ||| The column reported by the lexer.
  column : Int
  ||| The input that was left unconsumed.
  input  : String

||| Description of literate styles.
|||
||| A 'literate' style comprises
|||
||| + a list of code block deliminators (`deliminators`);
||| + a list of code line markers (`line_markers`); and
||| + a list of known file extensions (`file_extensions`).
|||
||| Some example specifications:
|||
||| + Bird Style
|||
|||```
|||MkLitStyle Nil [">"] [".lidr"]
|||```
|||
||| + Literate Haskell (for LaTeX)
|||
|||```
|||MkLitStyle [("\\begin{code}", "\\end{code}"),("\\begin{spec}","\\end{spec}")]
|||           Nil
|||           [".lhs", ".tex"]
|||```
|||
||| + OrgMode
|||
|||```
|||MkLitStyle [("#+BEGIN_SRC idris","#+END_SRC"), ("#+COMMENT idris","#+END_COMMENT")]
|||           ["#+IDRIS:"]
|||           [".org"]
|||```
|||
||| + Common Mark
|||
|||```
|||MkLitStyle [("```idris","```"), ("<!-- idris","-->")]
|||           Nil
|||           [".md", ".markdown"]
|||```
|||
public export
record LiterateStyle where
  constructor MkLitStyle
  ||| The pairs of start and end tags for code blocks.
  deliminators : List (String, String)

  ||| Line markers that indicate a line contains code.
  line_markers : List String

  ||| Recognised file extensions. Not used by the module, but will be
  ||| of use when connecting to code that reads in the original source
  ||| files.
  file_extensions : List String

||| Extract the code from a literate source file (`litStr`) written in
||| the given literate style.
|||
||| The document is tokenised with the style's deliminators and line
||| markers, and the resulting tokens are merged back into a single
||| source string.
|||
||| @specification The literate specification to use.
||| @litStr  The literate source file.
|||
||| Returns a `LiterateError` (line, column, and remaining input as
||| reported by the lexer) if the lexer could not consume the whole
||| document.
|||
export
extractCode : (specification : LiterateStyle)
           -> (litStr        : String)
           -> Either LiterateError String
extractCode (MkLitStyle delims markers _) str =
  let (toks, (ln, col, leftover)) = lex (rawTokens delims markers) str
   in if leftover == ""
        then Right (reduce toks Nil)       -- everything was tokenised
        else Left (MkLitErr ln col leftover)

||| Synonym for `extractCode`: strip the non-code parts of a literate
||| document.
|||
||| @specification The literate specification to use.
||| @litStr  The literate source file.
export
unlit : (specification : LiterateStyle)
     -> (litStr        : String)
     -> Either LiterateError String
unlit specification litStr = extractCode specification litStr

||| Is the provided line marked up using a line marker?
|||
||| The line is tokenised with the given style. If it lexes completely
||| as exactly one code line, return the (trimmed) line marker that
||| matched together with the text of that token; the text still
||| contains the marker. Otherwise, return `Nothing` and the line
||| unchanged.
|||
||| @specification The literate specification to use.
||| @str The line to inspect.
export
isLiterateLine : (specification : LiterateStyle)
              -> (str : String)
              -> Pair (Maybe String) String
isLiterateLine (MkLitStyle delims markers _) str =
  case lex (rawTokens delims markers) str of
    -- a single code line token and no input left over
    ([MkBounded (CodeLine m str') _ _], (_, _, "")) => (Just m, str')
    _ => (Nothing, str)

||| Embed the given code in a literate document using the literate
||| style provided.
|||
||| + If the style has code block deliminators, the code is placed
|||   between the first pair of deliminators.
||| + Otherwise, if the style has line markers, the first marker is
|||   prepended to the code. The marker is added once to the whole
|||   string, not to each line within it.
||| + If the style has neither, the code is returned unembedded.
|||
||| @specification The literate specification to use.
||| @code  The code to embed.
|||
export
embedCode : (specification : LiterateStyle)
         -> (code : String)
         -> String
embedCode (MkLitStyle delims markers _) str =
  case (delims, markers) of
    ((s, e) :: _, _) => unlines [s, str, e]
    ([], m :: _)     => unwords [m, str]
    _                => str

||| Synonym for `embedCode`: wrap code in the markup of a literate
||| style.
|||
||| @specification The literate specification to use.
||| @code  The code to embed.
export
relit : (specification : LiterateStyle)
     -> (code : String)
     -> String
relit specification code = embedCode specification code

-- --------------------------------------------------------------------- [ EOF ]
Update contrib Text.Parser to match Library.Text.Parser (#1808) Co-authored-by: Guillaume ALLAIS <guillaume.allais@ens-lyon.org> 2021-08-06 12:03:13 +03:00			`\|\|\| A simple module to process 'literate' documents.`
			`\|\|\|`
			`\|\|\| The module uses a lexer to split the document into code blocks,`
			`\|\|\| delineated by user-defined markers, and code lines that are`
			`\|\|\| indicated be a line marker. The lexer returns a document stripped`
			`\|\|\| of non-code elements but preserving the original document's line`
			`\|\|\| count. Column numbering of code lines are not preserved.`
			`\|\|\|`
			`\|\|\| The underlying tokeniser is greedy.`
			`\|\|\|`
			`\|\|\| Once it identifies a line marker it reads a prettifying space then`
			`\|\|\| consumes until the end of line. Once identifies a starting code`
			`\|\|\| block marker, the lexer will consume input until the next`
			`\|\|\| identifiable end block is encountered. Any other content is`
			`\|\|\| treated as part of the original document.`
			`\|\|\|`
			`\|\|\| Thus, the input literate files must be well-formed w.r.t`
			`\|\|\| to code line markers and code blocks.`
			`\|\|\|`
			`\|\|\| A further restriction is that literate documents cannot contain`
			`\|\|\| the markers within the document's main text: This will confuse the`
			`\|\|\| lexer.`
			`\|\|\|`
			`module Text.Literate`

			`import Text.Lexer`

			`import Data.List`
			`import Data.List1`
			`import Data.List.Views`
			`import Data.String`

			`%default total`

			`untilEOL : Recognise False`
			`untilEOL = manyUntil newline any`

			`line : String -> Lexer`
			`line s = exact s <+> (newline <\|> space <+> untilEOL)`

			`block : String -> String -> Lexer`
			`block s e = surround (exact s <+> untilEOL) (exact e <+> untilEOL) any`

			`notCodeLine : Lexer`
			`notCodeLine = newline`
			`<\|> any <+> untilEOL`

			`data Token = CodeBlock String String String`
			`\| Any String`
			`\| CodeLine String String`

			`Show Token where`
			`showPrec d (CodeBlock l r x) = showCon d "CodeBlock" $ showArg l ++ showArg r ++ showArg x`
			`showPrec d (Any x) = showCon d "Any" $ showArg x`
			`showPrec d (CodeLine m x) = showCon d "CodeLine" $ showArg m ++ showArg x`

			`rawTokens : (delims : List (String, String))`
			`-> (markers : List String)`
			`-> TokenMap (Token)`
			`rawTokens delims ls =`
			`map (\(l,r) => (block l r, CodeBlock (trim l) (trim r))) delims`
			`++ map (\m => (line m, CodeLine (trim m))) ls`
			`++ [(notCodeLine, Any)]`

			`namespace Compat`
			-- `reduce` below was depending on the old behaviour of `lines` before #1585
			-- was merged. That old `lines` function is added here to preserve behaviour
			-- of `reduce`.
			`lines' : List Char -> List1 (List Char)`
			`lines' [] = singleton []`
			`lines' s = case break isNL s of`
			`(l, s') => l ::: case s' of`
			`[] => []`
			`_ :: s'' => forget $ lines' (assert_smaller s s'')`
			`export`
			`lines : String -> List1 String`
			`lines s = map pack (lines' (unpack s))`

			`\|\|\| Merge the tokens into a single source file.`
			`reduce : List (WithBounds Token) -> List String -> String`
Switch from 'fast' string functions to normal string functions 2023-10-18 20:14:02 +03:00			`reduce [] acc = concat (reverse acc)`
Update contrib Text.Parser to match Library.Text.Parser (#1808) Co-authored-by: Guillaume ALLAIS <guillaume.allais@ens-lyon.org> 2021-08-06 12:03:13 +03:00			`reduce (MkBounded (Any x) _ _ :: rest) acc =`
			`-- newline will always be tokenized as a single token`
			`if x == "\n"`
			`then reduce rest ("\n"::acc)`
			`else reduce rest acc`

			`reduce (MkBounded (CodeLine m src) _ _ :: rest) acc =`
			`if m == trim src`
			`then reduce rest ("\n"::acc)`
			`else reduce rest ((substr (length m + 1) -- remove space to right of marker.`
			`(length src)`
			`src`
			`)::acc)`

			`reduce (MkBounded (CodeBlock l r src) _ _ :: rest) acc with (Compat.lines src) -- Strip the deliminators surrounding the block.`
			`reduce (MkBounded (CodeBlock l r src) _ _ :: rest) acc \| (s ::: ys) with (snocList ys)`
			`reduce (MkBounded (CodeBlock l r src) _ _ :: rest) acc \| (s ::: []) \| Empty = reduce rest acc -- 2`
			`reduce (MkBounded (CodeBlock l r src) _ _ :: rest) acc \| (s ::: (srcs ++ [f])) \| (Snoc f srcs rec) =`
			`-- the "\n" counts for the open deliminator; the closing deliminator should always be followed by a (Any "\n"), so we don't add a newline`
			`reduce rest (((unlines srcs) ++ "\n") :: "\n" :: acc)`

			`-- [ NOTE ] 1 & 2 shouldn't happen as code blocks are well formed i.e. have two deliminators.`


			`public export`
			`record LiterateError where`
			`constructor MkLitErr`
			`line : Int`
			`column : Int`
			`input : String`

			`\|\|\| Description of literate styles.`
			`\|\|\|`
			`\|\|\| A 'literate' style comprises of`
			`\|\|\|`
			\|\|\| + a list of code block deliminators (`deliminators`);
			\|\|\| + a list of code line markers (`line_markers`); and
			\|\|\| + a list of known file extensions `file_extensions`.
			`\|\|\|`
			`\|\|\| Some example specifications:`
			`\|\|\|`
			`\|\|\| + Bird Style`
			`\|\|\|`
			\|\|\|```
			`\|\|\|MkLitStyle Nil [">"] [".lidr"]`
			\|\|\|```
			`\|\|\|`
			`\|\|\| + Literate Haskell (for LaTeX)`
			`\|\|\|`
			\|\|\|```
			`\|\|\|MkLitStyle [("\\begin{code}", "\\end{code}"),("\\begin{spec}","\\end{spec}")]`
			`\|\|\| Nil`
			`\|\|\| [".lhs", ".tex"]`
			\|\|\|```
			`\|\|\|`
			`\|\|\| + OrgMode`
			`\|\|\|`
			\|\|\|```
			`\|\|\|MkLitStyle [("#+BEGIN_SRC idris","#+END_SRC"), ("#+COMMENT idris","#+END_COMMENT")]`
			`\|\|\| ["#+IDRIS:"]`
			`\|\|\| [".org"]`
			\|\|\|```
			`\|\|\|`
			`\|\|\| + Common Mark`
			`\|\|\|`
			\|\|\|```
			\|\|\|MkLitStyle [("```idris","```"), ("<!-- idris","--!>")]
			`\|\|\| Nil`
			`\|\|\| [".md", ".markdown"]`
			\|\|\|```
			`\|\|\|`
			`public export`
			`record LiterateStyle where`
			`constructor MkLitStyle`
			`\|\|\| The pairs of start and end tags for code blocks.`
			`deliminators : List (String, String)`

			`\|\|\| Line markers that indicate a line contains code.`
			`line_markers : List String`

			`\|\|\| Recognised file extensions. Not used by the module, but will be`
			`\|\|\| of use when connecting to code that reads in the original source`
			`\|\|\| files.`
			`file_extensions : List String`

			`\|\|\| Given a 'literate specification' extract the code from the`
			\|\|\| literate source file (`litStr`) that follows the presented style.
			`\|\|\|`
			`\|\|\| @specification The literate specification to use.`
			`\|\|\| @litStr The literate source file.`
			`\|\|\|`
			\|\|\| Returns a `LiterateError` if the literate file contains malformed
			`\|\|\| code blocks or code lines.`
			`\|\|\|`
			`export`
			`extractCode : (specification : LiterateStyle)`
			`-> (litStr : String)`
			`-> Either LiterateError String`
			`extractCode (MkLitStyle delims markers exts) str =`
			`case lex (rawTokens delims markers) str of`
			`(toks, (_,_,"")) => Right (reduce toks Nil)`
			`(_, (l,c,i)) => Left (MkLitErr l c i)`

			\|\|\| Synonym for `extractCode`.
			`export`
			`unlit : (specification : LiterateStyle)`
			`-> (litStr : String)`
			`-> Either LiterateError String`
			`unlit = extractCode`

			`\|\|\| Is the provided line marked up using a line marker?`
			`\|\|\|`
			`\|\|\| If the line is suffixed by any one of the style's set of line`
			`\|\|\| markers then return length of literate line marker, and the code stripped from the line`
			`\|\|\| marker. Otherwise, return Nothing and the unmarked line.`
			`export`
			`isLiterateLine : (specification : LiterateStyle)`
			`-> (str : String)`
			`-> Pair (Maybe String) String`
			`isLiterateLine (MkLitStyle delims markers _) str with (lex (rawTokens delims markers) str)`
			`isLiterateLine (MkLitStyle delims markers _) str \| ([MkBounded (CodeLine m str') _ _], (_,_, "")) = (Just m, str')`
			`isLiterateLine (MkLitStyle delims markers _) str \| (_, _) = (Nothing, str)`

			`\|\|\| Given a 'literate specification' embed the given code using the`
			`\|\|\| literate style provided.`
			`\|\|\|`
			`\|\|\| If the style uses deliminators to denote code blocks use the first`
			`\|\|\| pair of deliminators in the style. Otherwise use first linemarker`
			`\|\|\| in the style. If there is no style return the presented code`
			`\|\|\| string unembedded.`
			`\|\|\|`
			`\|\|\|`
			`\|\|\| @specification The literate specification to use.`
			`\|\|\| @code The code to embed,`
			`\|\|\|`
			`\|\|\|`
			`export`
			`embedCode : (specification : LiterateStyle)`
			`-> (code : String)`
			`-> String`
			`embedCode (MkLitStyle ((s,e)::delims) _ _) str = unlines [s,str,e]`
			`embedCode (MkLitStyle Nil (m::markers) _) str = unwords [m, str]`
			`embedCode (MkLitStyle _ _ _) str = str`

			\|\|\| Synonm for `embedCode`
			`export`
			`relit : (specification : LiterateStyle)`
			`-> (code : String)`
			`-> String`
			`relit = embedCode`

			`-- --------------------------------------------------------------------- [ EOF ]`