mirror of
https://github.com/Orange-OpenSource/hurl.git
synced 2024-12-30 14:38:28 +03:00
329 lines
9.6 KiB
Python
329 lines
9.6 KiB
Python
|
"""Process Markdown document.
|
||
|
|
||
|
This module allows to manipulate Markdown document:
|
||
|
- create a navigable Markdown object from a text,
|
||
|
- add, remove child nodes to a Markdown object,
|
||
|
- extend a Markdown document with another Markdown document,
|
||
|
- provide utility methods to construct toc.
|
||
|
"""
|
||
|
import re
|
||
|
import unicodedata
|
||
|
from textwrap import dedent
|
||
|
from typing import List, Optional
|
||
|
|
||
|
from parser import Parser
|
||
|
|
||
|
|
||
|
class Node:
|
||
|
"""Represent the base class for a Markdown document token."""
|
||
|
|
||
|
content: Optional[str]
|
||
|
|
||
|
def __init__(self, content: Optional[str]) -> None:
|
||
|
self.content = content
|
||
|
|
||
|
|
||
|
class Code(Node):
|
||
|
"""A code block token (https://daringfireball.net/projects/markdown/syntax#precode)."""
|
||
|
|
||
|
pass
|
||
|
|
||
|
|
||
|
class Paragraph(Node):
|
||
|
"""A paragraph token (https://daringfireball.net/projects/markdown/syntax#p)."""
|
||
|
|
||
|
pass
|
||
|
|
||
|
|
||
|
class Whitespace(Node):
|
||
|
"""A whitespace token."""
|
||
|
|
||
|
pass
|
||
|
|
||
|
|
||
|
def build_header(title: str, level: int) -> str:
|
||
|
"""Constructs a header in Markdown format.
|
||
|
|
||
|
Arg:
|
||
|
title: title of the header.
|
||
|
level: 1 base index of the header level
|
||
|
"""
|
||
|
hashes = "#" * level
|
||
|
return f"{hashes} {title}\n"
|
||
|
|
||
|
|
||
|
class Header(Node):
|
||
|
"""A header token (https://daringfireball.net/projects/markdown/syntax#header)."""
|
||
|
|
||
|
title: str
|
||
|
level: int
|
||
|
|
||
|
def __init__(self, title: str, level: int) -> None:
|
||
|
super().__init__(content=None)
|
||
|
self.title = title
|
||
|
self.level = level
|
||
|
self.update_content()
|
||
|
|
||
|
def indent(self, count: int) -> None:
|
||
|
"""Indent or dedent a header
|
||
|
|
||
|
Args:
|
||
|
count: number of level to indent, can be negative to dedent.
|
||
|
"""
|
||
|
self.level += count
|
||
|
self.update_content()
|
||
|
|
||
|
def update_content(self) -> None:
|
||
|
self.content = build_header(title=self.title, level=self.level)
|
||
|
|
||
|
|
||
|
class RefLink(Node):
|
||
|
"""A reference link token (https://daringfireball.net/projects/markdown/syntax#link)."""
|
||
|
|
||
|
ref: str
|
||
|
link: str
|
||
|
|
||
|
def __init__(self, ref: str, link: str) -> None:
|
||
|
super().__init__(content=None)
|
||
|
self.ref = ref
|
||
|
self.link = link
|
||
|
self.update_content()
|
||
|
|
||
|
def update_content(self) -> None:
|
||
|
self.content = f"[{self.ref}]: {self.link}\n"
|
||
|
|
||
|
|
||
|
def parse_paragraph(parser: Parser) -> Paragraph:
|
||
|
content = ""
|
||
|
while parser.peek() != "":
|
||
|
if parser.peek() == "\n":
|
||
|
content += parser.read()
|
||
|
line = parser.peek_while(lambda it: it != "\n")
|
||
|
if is_blank(line):
|
||
|
return Paragraph(content=content)
|
||
|
continue
|
||
|
content += parser.read()
|
||
|
return Paragraph(content=content)
|
||
|
|
||
|
|
||
|
def is_blank(line: str) -> bool:
|
||
|
"""Return True if line is made of whitespace, False otherwise."""
|
||
|
for c in line:
|
||
|
if not is_whitespace(c):
|
||
|
return False
|
||
|
return True
|
||
|
|
||
|
|
||
|
def is_whitespace(c: str) -> bool:
|
||
|
"""Return True if c is a whitespace, False otherwise."""
|
||
|
return c == " " or c == "\t" or c == "\n"
|
||
|
|
||
|
|
||
|
def parse_whitespace(parser: Parser) -> Whitespace:
|
||
|
"""Parse and return a whitespace token."""
|
||
|
content = parser.read_while(is_whitespace)
|
||
|
return Whitespace(content=content)
|
||
|
|
||
|
|
||
|
def parse_code(parser: Parser) -> Code:
|
||
|
"""Parse and return a code block token."""
|
||
|
separator = parser.read(3)
|
||
|
content = separator
|
||
|
|
||
|
while parser.peek() != "":
|
||
|
c = parser.peek(3)
|
||
|
if c == separator:
|
||
|
content += parser.read(3)
|
||
|
return Code(content=content)
|
||
|
content += parser.read()
|
||
|
return Code(content=content)
|
||
|
|
||
|
|
||
|
def parse_header(parser: Parser) -> Header:
|
||
|
"""Parse and return a header token."""
|
||
|
hashes = parser.read_while(lambda it: it == "#")
|
||
|
_ = parser.read_while(lambda it: is_whitespace(it))
|
||
|
title = parser.read_while(lambda it: it != "\n")
|
||
|
_ = parser.read()
|
||
|
return Header(title=title, level=len(hashes))
|
||
|
|
||
|
|
||
|
def parse_ref_link(parser: Parser) -> RefLink:
|
||
|
"""Parse and return a reference link token."""
|
||
|
line = parser.read_while(lambda it: it != "\n")
|
||
|
_ = parser.read()
|
||
|
ret = re.match(r"\[(?P<ref>.+)]:\s+(?P<link>.+)", line)
|
||
|
assert ret is not None
|
||
|
return RefLink(ref=ret.group("ref"), link=ret.group("link"))
|
||
|
|
||
|
|
||
|
def parse_markdown(text: str) -> "MarkdownDoc":
|
||
|
"""Parse a Markdown text and return a document instance."""
|
||
|
processed_text = text
|
||
|
parser = Parser(buffer=processed_text)
|
||
|
|
||
|
root = MarkdownDoc()
|
||
|
|
||
|
while parser.peek() != "":
|
||
|
node: Node
|
||
|
c = parser.peek()
|
||
|
|
||
|
# Whitespace parsing:
|
||
|
if is_whitespace(c):
|
||
|
node = parse_whitespace(parser=parser)
|
||
|
root.add_child(node)
|
||
|
continue
|
||
|
|
||
|
# Code parsing:
|
||
|
if c == "-" or c == "~" or c == "`":
|
||
|
sep = parser.peek(3)
|
||
|
if sep == "---" or sep == "~~~" or sep == "```":
|
||
|
node = parse_code(parser=parser)
|
||
|
root.add_child(node)
|
||
|
continue
|
||
|
|
||
|
# Header parsing:
|
||
|
if c == "#":
|
||
|
node = parse_header(parser=parser)
|
||
|
root.add_child(node)
|
||
|
continue
|
||
|
|
||
|
# Parse Reference-style Links
|
||
|
if c == "[":
|
||
|
line = parser.peek_while(lambda it: it != "\n")
|
||
|
if re.match(r"\[.+]: .+", line):
|
||
|
node = parse_ref_link(parser=parser)
|
||
|
root.add_child(node)
|
||
|
continue
|
||
|
|
||
|
# Default node parsing:
|
||
|
node = parse_paragraph(parser=parser)
|
||
|
root.add_child(node)
|
||
|
|
||
|
return root
|
||
|
|
||
|
|
||
|
class MarkdownDoc:
|
||
|
"""A class used to represent Markdown document.
|
||
|
|
||
|
Attributes:
|
||
|
children: children nodes of this document.
|
||
|
"""
|
||
|
|
||
|
children: List[Node]
|
||
|
|
||
|
def __init__(self) -> None:
|
||
|
self.children = []
|
||
|
|
||
|
def add_child(self, node) -> None:
|
||
|
"""Add a node to the document."""
|
||
|
self.children.append(node)
|
||
|
|
||
|
def find_first(self, func, start: Optional[Node] = None) -> Optional[Node]:
|
||
|
"""Search the first child node that meet a criteria.
|
||
|
|
||
|
Args:
|
||
|
func: a callable predicate to filter against.
|
||
|
start: a node to start the search from (it can be the returned result).
|
||
|
"""
|
||
|
if start:
|
||
|
start_index = self.children.index(start)
|
||
|
else:
|
||
|
start_index = 0
|
||
|
for child in self.children[start_index:]:
|
||
|
if func(child):
|
||
|
return child
|
||
|
return None
|
||
|
|
||
|
def to_text(self) -> str:
|
||
|
"""Return the text representation of this document."""
|
||
|
ref_links_nodes = [c for c in self.children if isinstance(c, RefLink)]
|
||
|
other_nodes = [c for c in self.children if not isinstance(c, RefLink)]
|
||
|
nodes = [*other_nodes, *ref_links_nodes]
|
||
|
return "".join([node.content for node in nodes if node.content])
|
||
|
|
||
|
def indent(self, count: int = 1) -> None:
|
||
|
"""Indent all headers of a specified count level."""
|
||
|
for c in self.children:
|
||
|
if isinstance(c, Header):
|
||
|
c.indent(count=count)
|
||
|
|
||
|
def extend(self, other: "MarkdownDoc") -> None:
|
||
|
"""Extend the current document with another Markdown document instance."""
|
||
|
self.children.extend(other.children)
|
||
|
|
||
|
def insert_node(self, start: Node, node: Node) -> None:
|
||
|
"""Insert a child node to the current document, after a specified node."""
|
||
|
index = self.children.index(start)
|
||
|
self.children.insert(index, node)
|
||
|
|
||
|
def insert_nodes(self, start: Node, nodes: List[Node]) -> None:
|
||
|
"""Insert children nodes to the current document, after a specified node."""
|
||
|
index = self.children.index(start)
|
||
|
self.children[index:index] = nodes
|
||
|
|
||
|
def remove_node(self, node: Node) -> None:
|
||
|
"""Remove a child node."""
|
||
|
try:
|
||
|
index = self.children.index(node)
|
||
|
self.children.pop(index)
|
||
|
except ValueError:
|
||
|
pass
|
||
|
|
||
|
def remove_nodes(self, nodes: List[Node]) -> None:
|
||
|
"""Remove children nodes."""
|
||
|
self.children = [node for node in self.children if node not in nodes]
|
||
|
|
||
|
def slice(self, node_a: Node, node_b: Node) -> List[Node]:
|
||
|
"""Return a slice of the current children nodes
|
||
|
|
||
|
Args:
|
||
|
node_a: lower node (included in the returned slice)
|
||
|
node_b: upper node (excluded from the returned slice)
|
||
|
"""
|
||
|
index_a = self.children.index(node_a)
|
||
|
index_b = self.children.index(node_b)
|
||
|
return self.children[index_a:index_b]
|
||
|
|
||
|
def next_node(self, node: Node) -> Optional[Node]:
|
||
|
"""Return the following node of a specified child node."""
|
||
|
index = self.children.index(node)
|
||
|
if index < len(self.children):
|
||
|
return self.children[index + 1]
|
||
|
else:
|
||
|
return None
|
||
|
|
||
|
def previous_node(self, node: Node) -> Optional[Node]:
|
||
|
"""Return the following node of a specified child node."""
|
||
|
index = self.children.index(node)
|
||
|
if index > 0:
|
||
|
return self.children[index - 1]
|
||
|
else:
|
||
|
return None
|
||
|
|
||
|
def toc(self) -> str:
|
||
|
"""Return a table-of-content of the current document."""
|
||
|
|
||
|
def slugify(value: str) -> str:
|
||
|
value = (
|
||
|
unicodedata.normalize("NFKD", value)
|
||
|
.encode("ascii", "ignore")
|
||
|
.decode("ascii")
|
||
|
)
|
||
|
value = re.sub(r"[^\w\s/-]", "", value).strip().lower()
|
||
|
return re.sub(r"[-\s]+", "-", value).replace("/", "")
|
||
|
|
||
|
headers = [child for child in self.children if isinstance(child, Header)]
|
||
|
toc = dedent(
|
||
|
"""\
|
||
|
Table of Contents
|
||
|
=================
|
||
|
"""
|
||
|
)
|
||
|
for header in headers:
|
||
|
indent = " " * header.level
|
||
|
slug = slugify(header.title)
|
||
|
line = f"{indent}* [{header.title}](#{slug})\n"
|
||
|
toc += line
|
||
|
return toc
|