mirror of
synced 2024-12-18 08:02:12 +03:00
329 lines
9.6 KiB
329 lines
9.6 KiB
"""Process Markdown document.
This module provides tools to manipulate a Markdown document:
- create a navigable Markdown object from a text,
- add child nodes to, or remove them from, a Markdown object,
- extend a Markdown document with another Markdown document,
- provide utility methods to build a table of contents.
"""
import re
import unicodedata
from textwrap import dedent
from typing import List, Optional
from parser import Parser
class Node:
    """Represent the base class for a Markdown document token.

    Attributes:
        content: Markdown text of the token (may be None).
    """

    content: Optional[str]

    def __init__(self, content: Optional[str]) -> None:
        """Store the text of the token.

        Args:
            content: Markdown text of the token (may be None).
        """
        self.content = content

    def __repr__(self) -> str:
        """Return a debugging representation naming the concrete token type."""
        # getattr keeps repr() usable on subclasses that have not set
        # `content` yet.
        content = getattr(self, "content", None)
        return f"{type(self).__name__}(content={content!r})"
class Code(Node):
    """Token holding a code block.

    See https://daringfireball.net/projects/markdown/syntax#precode.
    """
class Paragraph(Node):
    """Token holding a paragraph.

    See https://daringfireball.net/projects/markdown/syntax#p.
    """
class Whitespace(Node):
    """Token holding a run of whitespace characters."""
def build_header(title: str, level: int) -> str:
    """Construct a header line in Markdown format.

    Args:
        title: title of the header.
        level: 1-based header level, i.e. the number of leading ``#``.

    Returns:
        The header line (hashes, one space, the title), ending with a newline.
    """
    hashes = "#" * level
    return f"{hashes} {title}\n"
class Header(Node):
    """A header token (https://daringfireball.net/projects/markdown/syntax#header).

    Attributes:
        title: text of the header, without the leading hashes.
        level: header level, i.e. the number of leading ``#``.
    """

    title: str
    level: int

    def __init__(self, title: str, level: int) -> None:
        """Create a header and render its Markdown text.

        Args:
            title: text of the header.
            level: header level (number of ``#``).
        """
        # Initialise `content` through the base class so the attribute always
        # exists; otherwise reading it before update_content() raises
        # AttributeError.
        super().__init__(content=build_header(title=title, level=level))
        self.title = title
        self.level = level

    def indent(self, count: int) -> None:
        """Indent or dedent the header.

        Only `level` is changed; call update_content() afterwards to refresh
        the rendered text.

        Args:
            count: number of levels to indent, can be negative to dedent.
        """
        self.level += count

    def update_content(self) -> None:
        """Rebuild `content` from the current title and level."""
        self.content = build_header(title=self.title, level=self.level)
class RefLink(Node):
    """A reference link token (https://daringfireball.net/projects/markdown/syntax#link).

    Attributes:
        ref: label of the reference (the text between the brackets).
        link: target of the reference.
    """

    ref: str
    link: str

    def __init__(self, ref: str, link: str) -> None:
        """Create a reference link and render its Markdown text.

        Args:
            ref: label of the reference.
            link: target of the reference.
        """
        # Initialise `content` through the base class so the attribute always
        # exists; otherwise reading it before update_content() raises
        # AttributeError.
        super().__init__(content=f"[{ref}]: {link}\n")
        self.ref = ref
        self.link = link

    def update_content(self) -> None:
        """Rebuild `content` from the current label and target."""
        self.content = f"[{self.ref}]: {self.link}\n"
def parse_paragraph(parser: Parser) -> Paragraph:
    """Parse and return a paragraph token.

    Characters are consumed until the end of the input, or until a newline
    that is followed by a blank line; that newline is part of the paragraph,
    the blank line is not consumed.

    NOTE(review): the nesting below was reconstructed from a copy of the file
    whose indentation was lost; it assumes the blank-line lookahead only runs
    right after a newline has been read — confirm against the upstream file.
    """
    chunks = []
    while parser.peek() != "":
        if parser.peek() == "\n":
            chunks.append(parser.read())
            # Look ahead at the next line without consuming it.
            upcoming = parser.peek_while(lambda it: it != "\n")
            if is_blank(upcoming):
                break
        chunks.append(parser.read())
    return Paragraph(content="".join(chunks))
def is_blank(line: str) -> bool:
    """Return True if line is made of whitespace only, False otherwise.

    An empty line is considered blank.
    """
    return all(is_whitespace(c) for c in line)
def is_whitespace(c: str) -> bool:
    """Return True if c is a space, a tab or a newline, False otherwise."""
    # Tuple membership (not substring search) so that "" and multi-character
    # strings are rejected.
    return c in (" ", "\t", "\n")
def parse_whitespace(parser: Parser) -> Whitespace:
    """Consume the leading run of whitespace and wrap it in a token."""
    return Whitespace(content=parser.read_while(is_whitespace))
def parse_code(parser: Parser) -> Code:
    """Parse and return a code block token.

    The first three characters are taken as the fence; everything up to and
    including the next occurrence of that fence (or the end of the input when
    the block is never closed) becomes the token content.
    """
    fence = parser.read(3)
    pieces = [fence]
    while parser.peek() != "":
        if parser.peek(3) == fence:
            # Closing fence: consume it and stop.
            pieces.append(parser.read(3))
            break
        pieces.append(parser.read())
    return Code(content="".join(pieces))
def parse_header(parser: Parser) -> Header:
    """Parse and return a header token.

    The leading run of ``#`` gives the level, the rest of the line (after the
    separating spaces or tabs) gives the title. The newline ending the line
    is consumed.
    """
    hashes = parser.read_while(lambda it: it == "#")
    # Skip only spaces and tabs here: skipping newlines as well would make an
    # empty header ("#\n") swallow the following line as its title.
    parser.read_while(lambda it: it == " " or it == "\t")
    title = parser.read_while(lambda it: it != "\n")
    # Consume the end-of-line character.
    parser.read()
    return Header(title=title, level=len(hashes))
def parse_ref_link(parser: Parser) -> RefLink:
    """Parse and return a reference link token.

    The current line is consumed, including its newline.

    Raises:
        ValueError: if the line is not of the form ``[ref]: link``.
    """
    line = parser.read_while(lambda it: it != "\n")
    # Consume the end-of-line character.
    parser.read()
    ret = re.match(r"\[(?P<ref>.+)]:\s+(?P<link>.+)", line)
    if ret is None:
        # Explicit raise instead of `assert`, which is stripped under -O.
        raise ValueError(f"not a reference link: {line!r}")
    return RefLink(ref=ret.group("ref"), link=ret.group("link"))
def _parse_node(parser: Parser) -> Node:
    """Parse and return the single token starting at the current position."""
    c = parser.peek()
    # Whitespace parsing:
    if is_whitespace(c):
        return parse_whitespace(parser=parser)
    # Code parsing:
    if c in ("-", "~", "`") and parser.peek(3) in ("---", "~~~", "```"):
        return parse_code(parser=parser)
    # Header parsing:
    if c == "#":
        return parse_header(parser=parser)
    # Parse Reference-style Links
    if c == "[":
        line = parser.peek_while(lambda it: it != "\n")
        if re.match(r"\[.+]: .+", line):
            return parse_ref_link(parser=parser)
    # Default node parsing:
    return parse_paragraph(parser=parser)


def parse_markdown(text: str) -> "MarkdownDoc":
    """Parse a Markdown text and return a document instance.

    The text is split into consecutive tokens (whitespace, code block,
    header, reference link, paragraph), each added as a child of the
    returned document in reading order.

    Args:
        text: Markdown source to parse.

    Returns:
        A document whose children are the parsed tokens.
    """
    parser = Parser(buffer=text)
    root = MarkdownDoc()
    while parser.peek() != "":
        root.add_child(_parse_node(parser))
    return root
class MarkdownDoc:
    """A class used to represent a Markdown document.

    Attributes:
        children: children nodes of this document.
    """

    children: List[Node]

    def __init__(self) -> None:
        """Create an empty document."""
        self.children = []

    def add_child(self, node: Node) -> None:
        """Add a node at the end of the document."""
        self.children.append(node)

    def find_first(self, func, start: Optional[Node] = None) -> Optional[Node]:
        """Search the first child node that meets a criterion.

        Args:
            func: a callable predicate to filter against.
            start: a node to start the search from (it can be the returned
                result); the search starts at the first child when omitted.

        Returns:
            The first matching child, or None when there is none.

        Raises:
            ValueError: if `start` is not a child of this document.
        """
        start_index = self.children.index(start) if start is not None else 0
        for child in self.children[start_index:]:
            if func(child):
                return child
        return None

    def to_text(self) -> str:
        """Return the text representation of this document.

        Reference links are moved after all the other nodes; nodes without
        content are skipped.
        """
        ref_links_nodes = [c for c in self.children if isinstance(c, RefLink)]
        other_nodes = [c for c in self.children if not isinstance(c, RefLink)]
        nodes = [*other_nodes, *ref_links_nodes]
        return "".join(node.content for node in nodes if node.content)

    def indent(self, count: int = 1) -> None:
        """Indent all headers of a specified count level."""
        for c in self.children:
            if isinstance(c, Header):
                c.indent(count)
                # Header.indent() only changes the level; refresh the text.
                c.update_content()

    def extend(self, other: "MarkdownDoc") -> None:
        """Extend the current document with another Markdown document instance."""
        self.children.extend(other.children)

    def insert_node(self, start: Node, node: Node) -> None:
        """Insert a child node at the position of a specified node.

        The new node takes the index of `start`, so it ends up immediately
        before it.

        Raises:
            ValueError: if `start` is not a child of this document.
        """
        index = self.children.index(start)
        self.children.insert(index, node)

    def insert_nodes(self, start: Node, nodes: List[Node]) -> None:
        """Insert children nodes at the position of a specified node.

        The new nodes are placed, in order, immediately before `start`.

        Raises:
            ValueError: if `start` is not a child of this document.
        """
        index = self.children.index(start)
        self.children[index:index] = nodes

    def remove_node(self, node: Node) -> None:
        """Remove a child node; do nothing when it is not a child."""
        try:
            self.children.remove(node)
        except ValueError:
            # Best effort: removing an unknown node is not an error.
            pass

    def remove_nodes(self, nodes: List[Node]) -> None:
        """Remove children nodes."""
        self.children = [node for node in self.children if node not in nodes]

    def slice(self, node_a: Node, node_b: Node) -> List[Node]:
        """Return a slice of the current children nodes.

        Args:
            node_a: lower node (included in the returned slice)
            node_b: upper node (excluded from the returned slice)

        Raises:
            ValueError: if either node is not a child of this document.
        """
        index_a = self.children.index(node_a)
        index_b = self.children.index(node_b)
        return self.children[index_a:index_b]

    def next_node(self, node: Node) -> Optional[Node]:
        """Return the following node of a specified child node.

        Returns:
            The node right after `node`, or None when `node` is the last child.

        Raises:
            ValueError: if `node` is not a child of this document.
        """
        index = self.children.index(node)
        # Compare index + 1 (not index) to the length so that the last child
        # yields None instead of an IndexError.
        if index + 1 < len(self.children):
            return self.children[index + 1]
        return None

    def previous_node(self, node: Node) -> Optional[Node]:
        """Return the preceding node of a specified child node.

        Returns:
            The node right before `node`, or None when `node` is the first
            child.

        Raises:
            ValueError: if `node` is not a child of this document.
        """
        index = self.children.index(node)
        if index > 0:
            return self.children[index - 1]
        return None

    def toc(self) -> str:
        """Return a table-of-content of the current document.

        One bullet is produced per header, indented by its level and linking
        to an anchor derived from its title.
        """

        def slugify(value: str) -> str:
            """Turn a header title into an anchor name."""
            # Strip accents and drop any remaining non-ASCII character.
            value = (
                unicodedata.normalize("NFKD", value)
                .encode("ascii", "ignore")
                .decode("ascii")
            )
            value = re.sub(r"[^\w\s/-]", "", value).strip().lower()
            return re.sub(r"[-\s]+", "-", value).replace("/", "")

        headers = [child for child in self.children if isinstance(child, Header)]
        # NOTE(review): the exact heading literal was lost in the copy of the
        # file this was restored from — confirm against the upstream file.
        toc = dedent(
            """\
            Table of Contents

            """
        )
        for header in headers:
            indent = " " * header.level
            slug = slugify(header.title)
            line = f"{indent}* [{header.title}](#{slug})\n"
            toc += line
        return toc