Idris2/libs/contrib/Text/Literate.idr
Alex Humphreys f3855d7100
Update contrib Text.Parser to match Library.Text.Parser (#1808)
Co-authored-by: Guillaume ALLAIS <guillaume.allais@ens-lyon.org>
2021-08-06 10:03:13 +01:00

235 lines
7.8 KiB
Idris

||| A simple module to process 'literate' documents.
|||
||| The module uses a lexer to split the document into code blocks,
||| delineated by user-defined markers, and code lines that are
||| indicated be a line marker. The lexer returns a document stripped
||| of non-code elements but preserving the original document's line
||| count. Column numbering of code lines are not preserved.
|||
||| The underlying tokeniser is greedy.
|||
||| Once it identifies a line marker it reads a prettifying space then
||| consumes until the end of line. Once identifies a starting code
||| block marker, the lexer will consume input until the next
||| identifiable end block is encountered. Any other content is
||| treated as part of the original document.
|||
||| Thus, the input literate files *must* be well-formed w.r.t
||| to code line markers and code blocks.
|||
||| A further restriction is that literate documents cannot contain
||| the markers within the document's main text: This will confuse the
||| lexer.
|||
module Text.Literate
import Text.Lexer
import Data.List
import Data.List1
import Data.List.Views
import Data.String
import Data.String.Extra
%default total
untilEOL : Recognise False
untilEOL = manyUntil newline any
line : String -> Lexer
line s = exact s <+> (newline <|> space <+> untilEOL)
block : String -> String -> Lexer
block s e = surround (exact s <+> untilEOL) (exact e <+> untilEOL) any
notCodeLine : Lexer
notCodeLine = newline
<|> any <+> untilEOL
data Token = CodeBlock String String String
| Any String
| CodeLine String String
Show Token where
showPrec d (CodeBlock l r x) = showCon d "CodeBlock" $ showArg l ++ showArg r ++ showArg x
showPrec d (Any x) = showCon d "Any" $ showArg x
showPrec d (CodeLine m x) = showCon d "CodeLine" $ showArg m ++ showArg x
rawTokens : (delims : List (String, String))
-> (markers : List String)
-> TokenMap (Token)
rawTokens delims ls =
map (\(l,r) => (block l r, CodeBlock (trim l) (trim r))) delims
++ map (\m => (line m, CodeLine (trim m))) ls
++ [(notCodeLine, Any)]
namespace Compat
-- `reduce` below was depending on the old behaviour of `lines` before #1585
-- was merged. That old `lines` function is added here to preserve behaviour
-- of `reduce`.
lines' : List Char -> List1 (List Char)
lines' [] = singleton []
lines' s = case break isNL s of
(l, s') => l ::: case s' of
[] => []
_ :: s'' => forget $ lines' (assert_smaller s s'')
export
lines : String -> List1 String
lines s = map pack (lines' (unpack s))
||| Merge the tokens into a single source file.
reduce : List (WithBounds Token) -> List String -> String
reduce [] acc = fastAppend (reverse acc)
reduce (MkBounded (Any x) _ _ :: rest) acc =
-- newline will always be tokenized as a single token
if x == "\n"
then reduce rest ("\n"::acc)
else reduce rest acc
reduce (MkBounded (CodeLine m src) _ _ :: rest) acc =
if m == trim src
then reduce rest ("\n"::acc)
else reduce rest ((substr (length m + 1) -- remove space to right of marker.
(length src)
src
)::acc)
reduce (MkBounded (CodeBlock l r src) _ _ :: rest) acc with (Compat.lines src) -- Strip the deliminators surrounding the block.
reduce (MkBounded (CodeBlock l r src) _ _ :: rest) acc | (s ::: ys) with (snocList ys)
reduce (MkBounded (CodeBlock l r src) _ _ :: rest) acc | (s ::: []) | Empty = reduce rest acc -- 2
reduce (MkBounded (CodeBlock l r src) _ _ :: rest) acc | (s ::: (srcs ++ [f])) | (Snoc f srcs rec) =
-- the "\n" counts for the open deliminator; the closing deliminator should always be followed by a (Any "\n"), so we don't add a newline
reduce rest (((unlines srcs) ++ "\n") :: "\n" :: acc)
-- [ NOTE ] 1 & 2 shouldn't happen as code blocks are well formed i.e. have two deliminators.
public export
record LiterateError where
constructor MkLitErr
line : Int
column : Int
input : String
||| Description of literate styles.
|||
||| A 'literate' style comprises of
|||
||| + a list of code block deliminators (`deliminators`);
||| + a list of code line markers (`line_markers`); and
||| + a list of known file extensions `file_extensions`.
|||
||| Some example specifications:
|||
||| + Bird Style
|||
|||```
|||MkLitStyle Nil [">"] [".lidr"]
|||```
|||
||| + Literate Haskell (for LaTeX)
|||
|||```
|||MkLitStyle [("\\begin{code}", "\\end{code}"),("\\begin{spec}","\\end{spec}")]
||| Nil
||| [".lhs", ".tex"]
|||```
|||
||| + OrgMode
|||
|||```
|||MkLitStyle [("#+BEGIN_SRC idris","#+END_SRC"), ("#+COMMENT idris","#+END_COMMENT")]
||| ["#+IDRIS:"]
||| [".org"]
|||```
|||
||| + Common Mark
|||
|||```
|||MkLitStyle [("```idris","```"), ("<!-- idris","--!>")]
||| Nil
||| [".md", ".markdown"]
|||```
|||
public export
record LiterateStyle where
constructor MkLitStyle
||| The pairs of start and end tags for code blocks.
deliminators : List (String, String)
||| Line markers that indicate a line contains code.
line_markers : List String
||| Recognised file extensions. Not used by the module, but will be
||| of use when connecting to code that reads in the original source
||| files.
file_extensions : List String
||| Given a 'literate specification' extract the code from the
||| literate source file (`litStr`) that follows the presented style.
|||
||| @specification The literate specification to use.
||| @litStr The literate source file.
|||
||| Returns a `LiterateError` if the literate file contains malformed
||| code blocks or code lines.
|||
export
extractCode : (specification : LiterateStyle)
-> (litStr : String)
-> Either LiterateError String
extractCode (MkLitStyle delims markers exts) str =
case lex (rawTokens delims markers) str of
(toks, (_,_,"")) => Right (reduce toks Nil)
(_, (l,c,i)) => Left (MkLitErr l c i)
||| Synonym for `extractCode`.
export
unlit : (specification : LiterateStyle)
-> (litStr : String)
-> Either LiterateError String
unlit = extractCode
||| Is the provided line marked up using a line marker?
|||
||| If the line is suffixed by any one of the style's set of line
||| markers then return length of literate line marker, and the code stripped from the line
||| marker. Otherwise, return Nothing and the unmarked line.
export
isLiterateLine : (specification : LiterateStyle)
-> (str : String)
-> Pair (Maybe String) String
isLiterateLine (MkLitStyle delims markers _) str with (lex (rawTokens delims markers) str)
isLiterateLine (MkLitStyle delims markers _) str | ([MkBounded (CodeLine m str') _ _], (_,_, "")) = (Just m, str')
isLiterateLine (MkLitStyle delims markers _) str | (_, _) = (Nothing, str)
||| Given a 'literate specification' embed the given code using the
||| literate style provided.
|||
||| If the style uses deliminators to denote code blocks use the first
||| pair of deliminators in the style. Otherwise use first linemarker
||| in the style. If there is **no style** return the presented code
||| string unembedded.
|||
|||
||| @specification The literate specification to use.
||| @code The code to embed,
|||
|||
export
embedCode : (specification : LiterateStyle)
-> (code : String)
-> String
embedCode (MkLitStyle ((s,e)::delims) _ _) str = unlines [s,str,e]
embedCode (MkLitStyle Nil (m::markers) _) str = unwords [m, str]
embedCode (MkLitStyle _ _ _) str = str
||| Synonm for `embedCode`
export
relit : (specification : LiterateStyle)
-> (code : String)
-> String
relit = embedCode
-- --------------------------------------------------------------------- [ EOF ]