||| A simple module to process 'literate' documents. ||| ||| The module uses a lexer to split the document into code blocks, ||| delineated by user-defined markers, and code lines that are ||| indicated be a line marker. The lexer returns a document stripped ||| of non-code elements but preserving the original document's line ||| count. Column numbering of code lines are not preserved. ||| ||| The underlying tokeniser is greedy. ||| ||| Once it identifies a line marker it reads a prettifying space then ||| consumes until the end of line. Once identifies a starting code ||| block marker, the lexer will consume input until the next ||| identifiable end block is encountered. Any other content is ||| treated as part of the original document. ||| ||| Thus, the input literate files *must* be well-formed w.r.t ||| to code line markers and code blocks. ||| ||| A further restriction is that literate documents cannot contain ||| the markers within the document's main text: This will confuse the ||| lexer. ||| module Text.Literate import Text.Lexer import Data.List import Data.List1 import Data.List.Views import Data.String %default total untilEOL : Recognise False untilEOL = manyUntil newline any line : String -> Lexer line s = exact s <+> (newline <|> space <+> untilEOL) block : String -> String -> Lexer block s e = surround (exact s <+> untilEOL) (exact e <+> untilEOL) any notCodeLine : Lexer notCodeLine = newline <|> any <+> untilEOL data Token = CodeBlock String String String | Any String | CodeLine String String Show Token where showPrec d (CodeBlock l r x) = showCon d "CodeBlock" $ showArg l ++ showArg r ++ showArg x showPrec d (Any x) = showCon d "Any" $ showArg x showPrec d (CodeLine m x) = showCon d "CodeLine" $ showArg m ++ showArg x rawTokens : (delims : List (String, String)) -> (markers : List String) -> TokenMap (Token) rawTokens delims ls = map (\(l,r) => (block l r, CodeBlock (trim l) (trim r))) delims ++ map (\m => (line m, CodeLine (trim m))) ls ++ [(notCodeLine, Any)] namespace Compat -- `reduce` below was depending on the old behaviour of `lines` before #1585 -- was merged. That old `lines` function is added here to preserve behaviour -- of `reduce`. lines' : List Char -> List1 (List Char) lines' [] = singleton [] lines' s = case break isNL s of (l, s') => l ::: case s' of [] => [] _ :: s'' => forget $ lines' (assert_smaller s s'') export lines : String -> List1 String lines s = map pack (lines' (unpack s)) ||| Merge the tokens into a single source file. reduce : List (WithBounds Token) -> List String -> String reduce [] acc = fastConcat (reverse acc) reduce (MkBounded (Any x) _ _ :: rest) acc = -- newline will always be tokenized as a single token if x == "\n" then reduce rest ("\n"::acc) else reduce rest acc reduce (MkBounded (CodeLine m src) _ _ :: rest) acc = if m == trim src then reduce rest ("\n"::acc) else reduce rest ((substr (length m + 1) -- remove space to right of marker. (length src) src )::acc) reduce (MkBounded (CodeBlock l r src) _ _ :: rest) acc with (Compat.lines src) -- Strip the deliminators surrounding the block. reduce (MkBounded (CodeBlock l r src) _ _ :: rest) acc | (s ::: ys) with (snocList ys) reduce (MkBounded (CodeBlock l r src) _ _ :: rest) acc | (s ::: []) | Empty = reduce rest acc -- 2 reduce (MkBounded (CodeBlock l r src) _ _ :: rest) acc | (s ::: (srcs ++ [f])) | (Snoc f srcs rec) = -- the "\n" counts for the open deliminator; the closing deliminator should always be followed by a (Any "\n"), so we don't add a newline reduce rest (((unlines srcs) ++ "\n") :: "\n" :: acc) -- [ NOTE ] 1 & 2 shouldn't happen as code blocks are well formed i.e. have two deliminators. public export record LiterateError where constructor MkLitErr line : Int column : Int input : String ||| Description of literate styles. ||| ||| A 'literate' style comprises of ||| ||| + a list of code block deliminators (`deliminators`); ||| + a list of code line markers (`line_markers`); and ||| + a list of known file extensions `file_extensions`. ||| ||| Some example specifications: ||| ||| + Bird Style ||| |||``` |||MkLitStyle Nil [">"] [".lidr"] |||``` ||| ||| + Literate Haskell (for LaTeX) ||| |||``` |||MkLitStyle [("\\begin{code}", "\\end{code}"),("\\begin{spec}","\\end{spec}")] ||| Nil ||| [".lhs", ".tex"] |||``` ||| ||| + OrgMode ||| |||``` |||MkLitStyle [("#+BEGIN_SRC idris","#+END_SRC"), ("#+COMMENT idris","#+END_COMMENT")] ||| ["#+IDRIS:"] ||| [".org"] |||``` ||| ||| + Common Mark ||| |||``` |||MkLitStyle [("```idris","```"), ("")] ||| Nil ||| [".md", ".markdown"] |||``` ||| public export record LiterateStyle where constructor MkLitStyle ||| The pairs of start and end tags for code blocks. deliminators : List (String, String) ||| Line markers that indicate a line contains code. line_markers : List String ||| Recognised file extensions. Not used by the module, but will be ||| of use when connecting to code that reads in the original source ||| files. file_extensions : List String ||| Given a 'literate specification' extract the code from the ||| literate source file (`litStr`) that follows the presented style. ||| ||| @specification The literate specification to use. ||| @litStr The literate source file. ||| ||| Returns a `LiterateError` if the literate file contains malformed ||| code blocks or code lines. ||| export extractCode : (specification : LiterateStyle) -> (litStr : String) -> Either LiterateError String extractCode (MkLitStyle delims markers exts) str = case lex (rawTokens delims markers) str of (toks, (_,_,"")) => Right (reduce toks Nil) (_, (l,c,i)) => Left (MkLitErr l c i) ||| Synonym for `extractCode`. export unlit : (specification : LiterateStyle) -> (litStr : String) -> Either LiterateError String unlit = extractCode ||| Is the provided line marked up using a line marker? ||| ||| If the line is suffixed by any one of the style's set of line ||| markers then return length of literate line marker, and the code stripped from the line ||| marker. Otherwise, return Nothing and the unmarked line. export isLiterateLine : (specification : LiterateStyle) -> (str : String) -> Pair (Maybe String) String isLiterateLine (MkLitStyle delims markers _) str with (lex (rawTokens delims markers) str) isLiterateLine (MkLitStyle delims markers _) str | ([MkBounded (CodeLine m str') _ _], (_,_, "")) = (Just m, str') isLiterateLine (MkLitStyle delims markers _) str | (_, _) = (Nothing, str) ||| Given a 'literate specification' embed the given code using the ||| literate style provided. ||| ||| If the style uses deliminators to denote code blocks use the first ||| pair of deliminators in the style. Otherwise use first linemarker ||| in the style. If there is **no style** return the presented code ||| string unembedded. ||| ||| ||| @specification The literate specification to use. ||| @code The code to embed, ||| ||| export embedCode : (specification : LiterateStyle) -> (code : String) -> String embedCode (MkLitStyle ((s,e)::delims) _ _) str = unlines [s,str,e] embedCode (MkLitStyle Nil (m::markers) _) str = unwords [m, str] embedCode (MkLitStyle _ _ _) str = str ||| Synonm for `embedCode` export relit : (specification : LiterateStyle) -> (code : String) -> String relit = embedCode -- --------------------------------------------------------------------- [ EOF ]