hurl/bin/docs/markdown.py

"""Process Markdown document.

This module allows to manipulate Markdown document:
- create a navigable Markdown object from a text,
- add, remove child nodes to a Markdown object,
- extend a Markdown document with another Markdown document,
- provide utility methods to construct toc.
"""
import re
import unicodedata
from textwrap import dedent
from typing import List, Optional

from parser import Parser


class Node:
    """Base class shared by every token of a Markdown document.

    Attributes:
        content: Markdown text carried by the token, or None when the token
            has no text.
    """

    content: Optional[str]

    def __init__(self, content: Optional[str]) -> None:
        """Store the Markdown text carried by this token."""
        self.content = content


class Code(Node):
    """Token for a code block (https://daringfireball.net/projects/markdown/syntax#precode)."""


class Paragraph(Node):
    """Token for a paragraph (https://daringfireball.net/projects/markdown/syntax#p)."""


class Whitespace(Node):
    """Token for a run of whitespace characters."""


def build_header(title: str, level: int) -> str:
    """Build a header line in Markdown format.

    Args:
        title: text of the header.
        level: 1-based header level, i.e. the number of leading `#`.

    Returns:
        The header line, terminated by a newline.
    """
    return f"{'#' * level} {title}\n"


class Header(Node):
    """Token for a header (https://daringfireball.net/projects/markdown/syntax#header).

    Attributes:
        title: text of the header, without the leading hashes.
        level: header level, i.e. the number of leading hashes.
    """

    title: str
    level: int

    def __init__(self, title: str, level: int) -> None:
        """Create a header and render its Markdown text into `content`."""
        super().__init__(content=None)
        self.title, self.level = title, level
        self.update_content()

    def indent(self, count: int) -> None:
        """Shift the header level and refresh the rendered text.

        Args:
            count: number of levels to add; a negative value dedents.
        """
        self.level = self.level + count
        self.update_content()

    def update_content(self) -> None:
        """Regenerate `content` from the current title and level."""
        self.content = build_header(self.title, self.level)


class RefLink(Node):
    """Token for a reference link definition (https://daringfireball.net/projects/markdown/syntax#link).

    Attributes:
        ref: label of the reference, i.e. the text between the brackets.
        link: target the label points to.
    """

    ref: str
    link: str

    def __init__(self, ref: str, link: str) -> None:
        """Create a reference link and render its Markdown text into `content`."""
        super().__init__(content=None)
        self.ref, self.link = ref, link
        self.update_content()

    def update_content(self) -> None:
        """Regenerate `content` from the current label and target."""
        rendered = f"[{self.ref}]: {self.link}\n"
        self.content = rendered


class Table(Node):
    """A table token, stored in `content` as pipe-delimited Markdown rows."""

    def reformat(self) -> None:
        """Pad the cells of the table so that every column is aligned.

        Each column is widened to its longest cell, and every cell is
        surrounded by one padding character on each side. The second row is
        treated as the delimiter row and is padded with `-` instead of spaces;
        alignment colons of delimiter cells (`:--`, `--:`, `:-:`) are kept at
        the edges of the cell.

        Rows with fewer cells than the widest row are completed with empty
        cells. An empty table, or a table without any cell, is left untouched.

        Example:

        ```markdown
        | a | b | c|
        |---|---|--|
        | aaaaa | bbbbb | cccc |
        | dd | ee | f |
        ```

        is reformatted to

        ```markdown
        | a     | b     | c    |
        |-------|-------|------|
        | aaaaa | bbbbb | cccc |
        | dd    | ee    | f    |
        ```
        """
        if not self.content:
            return

        # One list of stripped cells per line; the text before the first pipe
        # and after the last pipe of a line is not part of the table.
        rows = [
            [cell.strip() for cell in line.split("|")[1:-1]]
            for line in self.content.splitlines()
        ]
        cols_count = max(len(row) for row in rows)
        if cols_count == 0:
            return
        # Make the table rectangular so ragged rows neither fail nor get cut.
        for row in rows:
            row.extend([""] * (cols_count - len(row)))

        widths = [max(len(row[c]) for row in rows) for c in range(cols_count)]

        formatted = []
        for r, row in enumerate(rows):
            is_delimiter = r == 1
            cells = []
            for cell, width in zip(row, widths):
                if is_delimiter and re.fullmatch(r":?-+:?", cell):
                    # Keep alignment colons on the outer edges of the cell,
                    # otherwise the padded cell is no longer a valid delimiter.
                    left = ":" if cell.startswith(":") else "-"
                    right = ":" if cell.endswith(":") else "-"
                    cells.append(left + "-" * width + right)
                else:
                    pad_char = "-" if is_delimiter else " "
                    cells.append(pad_char + cell.ljust(width, pad_char) + pad_char)
            formatted.append("|" + "|".join(cells) + "|\n")

        self.content = "".join(formatted)


def parse_paragraph(parser: Parser) -> Paragraph:
    """Parse and return a paragraph token.

    Characters are consumed until the end of the input, or until a newline
    whose following line is blank; that newline is part of the paragraph.
    """
    pieces = []
    while parser.peek() != "":
        at_newline = parser.peek() == "\n"
        pieces.append(parser.read())
        # A blank line right after the newline just read ends the paragraph.
        if at_newline and is_blank(parser.peek_while(lambda it: it != "\n")):
            break
    return Paragraph(content="".join(pieces))


def is_blank(line: str) -> bool:
    """Tell whether a line holds nothing but whitespace (an empty line is blank)."""
    return all(is_whitespace(char) for char in line)


def is_whitespace(c: str) -> bool:
    """Tell whether c is a space, a tab or a newline."""
    return c in (" ", "\t", "\n")


def parse_whitespace(parser: Parser) -> Whitespace:
    """Consume the leading run of whitespace and wrap it in a token."""
    return Whitespace(content=parser.read_while(is_whitespace))


def parse_code(parser: Parser) -> Code:
    """Parse and return a code block token.

    The first three characters are taken as the fence. The block runs up to
    and including the next occurrence of that fence, or to the end of the
    input when the fence is never repeated.
    """
    fence = parser.read(3)
    pieces = [fence]

    while parser.peek() != "":
        if parser.peek(3) == fence:
            # Closing fence: keep it in the block and stop.
            pieces.append(parser.read(3))
            break
        pieces.append(parser.read())
    return Code(content="".join(pieces))


def parse_header(parser: Parser) -> Header:
    """Parse and return a header token.

    The leading hashes give the level, and the rest of the line (after the
    blanks that follow the hashes) is the title. The character that ends the
    line is consumed but is not part of the title.
    """
    hashes = parser.read_while(lambda it: it == "#")
    # Skip spaces and tabs only: also skipping "\n" would let a header with no
    # title swallow the line break and take the next line as its title.
    parser.read_while(lambda it: it == " " or it == "\t")
    title = parser.read_while(lambda it: it != "\n")
    # Consume the character that stopped the title scan.
    parser.read()
    return Header(title=title, level=len(hashes))


def parse_ref_link(parser: Parser) -> RefLink:
    """Parse and return a reference link token.

    The current line is consumed and is expected to look like `[ref]: link`.
    """
    raw_line = parser.read_while(lambda it: it != "\n")
    # Consume the character that stopped the line scan.
    parser.read()
    found = re.match(r"\[(?P<ref>.+)]:\s+(?P<link>.+)", raw_line)
    assert found is not None
    # The named groups are exactly the constructor arguments.
    return RefLink(**found.groupdict())


def parse_markdown(text: str) -> "MarkdownDoc":
    """Split a Markdown text into tokens and return them as a document.

    The kind of each token is chosen from the next character(s) of the input:
    whitespace, a `---` / `~~~` / triple-backtick fence, a `#` header, a
    `[ref]: link` reference definition, and a paragraph for anything else.
    """
    parser = Parser(buffer=text)
    root = MarkdownDoc()

    while parser.peek() != "":
        node: Node
        c = parser.peek()

        if is_whitespace(c):
            node = parse_whitespace(parser=parser)
        elif c in ("-", "~", "`") and parser.peek(3) in ("---", "~~~", "```"):
            node = parse_code(parser=parser)
        elif c == "#":
            node = parse_header(parser=parser)
        elif c == "[" and re.match(
            r"\[.+]: .+", parser.peek_while(lambda it: it != "\n")
        ):
            # Reference-style link definition.
            node = parse_ref_link(parser=parser)
        else:
            # Anything not recognised above is plain paragraph text.
            node = parse_paragraph(parser=parser)

        root.add_child(node)

    return root


class MarkdownDoc:
    """A class used to represent a Markdown document.

    Attributes:
        children: children nodes of this document, in document order.
    """

    children: List[Node]

    def __init__(self) -> None:
        """Create an empty document."""
        self.children = []

    def add_child(self, node: Node) -> None:
        """Append a node at the end of the document."""
        self.children.append(node)

    def find_first(self, func, start: Optional[Node] = None) -> Optional[Node]:
        """Search the first child node that meets a criteria.

        Args:
            func: a callable predicate, called with each candidate node.
            start: a node to start the search from (it can be the returned result);
                the search starts at the first child when omitted.

        Returns:
            The first matching node, or None when no node matches.

        Raises:
            ValueError: if start is not a child of this document.
        """
        start_index = 0 if start is None else self.children.index(start)
        for child in self.children[start_index:]:
            if func(child):
                return child
        return None

    def to_text(self) -> str:
        """Return the text representation of this document.

        Reference links are moved after all the other nodes; nodes without
        content are skipped.
        """
        ref_links_nodes = [c for c in self.children if isinstance(c, RefLink)]
        other_nodes = [c for c in self.children if not isinstance(c, RefLink)]
        nodes = [*other_nodes, *ref_links_nodes]
        return "".join([node.content for node in nodes if node.content])

    def indent(self, count: int = 1) -> None:
        """Indent all headers of a specified count level (negative to dedent)."""
        for c in self.children:
            if isinstance(c, Header):
                c.indent(count=count)

    def extend(self, other: "MarkdownDoc") -> None:
        """Extend the current document with the children of another document."""
        self.children.extend(other.children)

    def insert_node(self, start: Node, node: Node) -> None:
        """Insert a child node to the current document, before a specified node.

        Raises:
            ValueError: if start is not a child of this document.
        """
        index = self.children.index(start)
        self.children.insert(index, node)

    def insert_nodes(self, start: Node, nodes: List[Node]) -> None:
        """Insert children nodes to the current document, before a specified node.

        Raises:
            ValueError: if start is not a child of this document.
        """
        index = self.children.index(start)
        self.children[index:index] = nodes

    def remove_node(self, node: Node) -> None:
        """Remove a child node; a node that is not a child is ignored."""
        try:
            index = self.children.index(node)
            self.children.pop(index)
        except ValueError:
            # Best effort on purpose: removing an absent node is a no-op.
            pass

    def remove_nodes(self, nodes: List[Node]) -> None:
        """Remove children nodes; nodes that are not children are ignored."""
        self.children = [node for node in self.children if node not in nodes]

    def slice(self, node_a: Node, node_b: Node) -> List[Node]:
        """Return a slice of the current children nodes.

        Args:
            node_a: lower node (included in the returned slice)
            node_b: upper node (excluded from the returned slice)

        Raises:
            ValueError: if node_a or node_b is not a child of this document.
        """
        index_a = self.children.index(node_a)
        index_b = self.children.index(node_b)
        return self.children[index_a:index_b]

    def next_node(self, node: Node) -> Optional[Node]:
        """Return the node following a specified child node, None for the last one.

        Raises:
            ValueError: if node is not a child of this document.
        """
        index = self.children.index(node)
        # Compare the index of the *next* node with the length, otherwise the
        # last child indexes past the end of the list.
        if index + 1 < len(self.children):
            return self.children[index + 1]
        return None

    def previous_node(self, node: Node) -> Optional[Node]:
        """Return the node preceding a specified child node, None for the first one.

        Raises:
            ValueError: if node is not a child of this document.
        """
        index = self.children.index(node)
        if index > 0:
            return self.children[index - 1]
        return None

    def toc(self) -> str:
        """Return a table-of-content of the current document.

        The result is a title block followed by one bullet per header,
        indented by three spaces per header level and linking to the anchor
        derived from the header title.
        """

        def slugify(value: str) -> str:
            """Turn a header title into its anchor name."""
            # Drop accents and any character that cannot be encoded in ASCII.
            value = (
                unicodedata.normalize("NFKD", value)
                .encode("ascii", "ignore")
                .decode("ascii")
            )
            value = re.sub(r"[^\w\s/-]", "", value).strip().lower()
            return re.sub(r"[-\s]+", "-", value).replace("/", "")

        headers = [child for child in self.children if isinstance(child, Header)]
        lines = [
            f"{'   ' * header.level}* [{header.title}](#{slugify(header.title)})\n"
            for header in headers
        ]
        return "Table of Contents\n=================\n" + "".join(lines)
Add scripts to construct GitHub and crates.io README, and to construct grammar and man canonical document from the source. 2022-05-31 15:37:58 +03:00			`"""Process Markdown document.`

			`This module allows to manipulate Markdown document:`
			`- create a navigable Markdown object from a text,`
			`- add, remove child nodes to a Markdown object,`
			`- extend a Markdown document with another Markdown document,`
			`- provide utility methods to construct toc.`
			`"""`
			`import re`
			`import unicodedata`
			`from textwrap import dedent`
			`from typing import List, Optional`

			`from parser import Parser`


			`class Node:`
			`"""Represent the base class for a Markdown document token."""`

			`content: Optional[str]`

			`def __init__(self, content: Optional[str]) -> None:`
			`self.content = content`


			`class Code(Node):`
			`"""A code block token (https://daringfireball.net/projects/markdown/syntax#precode)."""`

			`pass`


			`class Paragraph(Node):`
			`"""A paragraph token (https://daringfireball.net/projects/markdown/syntax#p)."""`

			`pass`


			`class Whitespace(Node):`
			`"""A whitespace token."""`

			`pass`


			`def build_header(title: str, level: int) -> str:`
			`"""Constructs a header in Markdown format.`

			`Arg:`
			`title: title of the header.`
			`level: 1 base index of the header level`
			`"""`
			`hashes = "#" * level`
			`return f"{hashes} {title}\n"`


			`class Header(Node):`
			`"""A header token (https://daringfireball.net/projects/markdown/syntax#header)."""`

			`title: str`
			`level: int`

			`def __init__(self, title: str, level: int) -> None:`
			`super().__init__(content=None)`
			`self.title = title`
			`self.level = level`
			`self.update_content()`

			`def indent(self, count: int) -> None:`
			`"""Indent or dedent a header`

			`Args:`
			`count: number of level to indent, can be negative to dedent.`
			`"""`
			`self.level += count`
			`self.update_content()`

			`def update_content(self) -> None:`
			`self.content = build_header(title=self.title, level=self.level)`


			`class RefLink(Node):`
			`"""A reference link token (https://daringfireball.net/projects/markdown/syntax#link)."""`

			`ref: str`
			`link: str`

			`def __init__(self, ref: str, link: str) -> None:`
			`super().__init__(content=None)`
			`self.ref = ref`
			`self.link = link`
			`self.update_content()`

			`def update_content(self) -> None:`
			`self.content = f"[{self.ref}]: {self.link}\n"`


Improve Markdown tables generation. 2023-09-14 19:00:37 +03:00			`class Table(Node):`
			`"""A table"""`

			`def reformat(self) -> None:`
			`"""`
Fix typo in markdown.py colum -> column 2023-09-24 08:05:18 +03:00			`Format a Markdown table so all column are maximized.`
Improve Markdown tables generation. 2023-09-14 19:00:37 +03:00
			`Example:`

			```markdown
			`\| a \| b \| c\|`
			`\|---\|---\|--\|`
			`\| aaaaa \| bbbbb \| cccc \|`
			`\| dd \| ee \| f \|`
			```

			`is reformatted to`

			```markdown
			`\| a \| b \| c \|`
			`\|-------\|-------\|------\|`
			`\| aaaaa \| bbbbb \| cccc \|`
			`\| dd \| ee \| f \|`
			```
			`"""`
			`# Convert our content to an array of strings`
			`lines = self.content.splitlines()`
			`rows = [line.split("\|")[1:-1] for line in lines]`
			`rows_count = len(rows)`
			`cols_count = len(rows[0])`
			`cells = [`
			`[rows[r][c].strip() for c in range(cols_count)] for r in range(rows_count)`
			`]`
			`max_lengths = [`
			`max([len(cells[r][c]) for r in range(rows_count)])`
			`for c in range(cols_count)`
			`]`
			`cells_normalized = [["" for _ in range(cols_count)] for _ in range(rows_count)]`
			`for c in range(cols_count):`
			`max_length = max_lengths[c]`
			`for r in range(rows_count):`
			`cell = cells[r][c]`
			`if r == 1:`
			`pad_char = "-"`
			`else:`
			`pad_char = " "`
			`cell_normalized = cell.ljust(max_length, pad_char).center(`
			`max_length + 2, pad_char`
			`)`
			`cells_normalized[r][c] = cell_normalized`
			`# Reconstruct content`
			`self.content = "".join(`
			`["\|" + "\|".join(row) + "\|\n" for row in cells_normalized]`
			`)`


Add scripts to construct GitHub and crates.io README, and to construct grammar and man canonical document from the source. 2022-05-31 15:37:58 +03:00			`def parse_paragraph(parser: Parser) -> Paragraph:`
			`content = ""`
			`while parser.peek() != "":`
			`if parser.peek() == "\n":`
			`content += parser.read()`
			`line = parser.peek_while(lambda it: it != "\n")`
			`if is_blank(line):`
			`return Paragraph(content=content)`
			`continue`
			`content += parser.read()`
			`return Paragraph(content=content)`


			`def is_blank(line: str) -> bool:`
			`"""Return True if line is made of whitespace, False otherwise."""`
			`for c in line:`
			`if not is_whitespace(c):`
			`return False`
			`return True`


			`def is_whitespace(c: str) -> bool:`
			`"""Return True if c is a whitespace, False otherwise."""`
			`return c == " " or c == "\t" or c == "\n"`


			`def parse_whitespace(parser: Parser) -> Whitespace:`
			`"""Parse and return a whitespace token."""`
			`content = parser.read_while(is_whitespace)`
			`return Whitespace(content=content)`


			`def parse_code(parser: Parser) -> Code:`
			`"""Parse and return a code block token."""`
			`separator = parser.read(3)`
			`content = separator`

			`while parser.peek() != "":`
			`c = parser.peek(3)`
			`if c == separator:`
			`content += parser.read(3)`
			`return Code(content=content)`
			`content += parser.read()`
			`return Code(content=content)`


			`def parse_header(parser: Parser) -> Header:`
			`"""Parse and return a header token."""`
			`hashes = parser.read_while(lambda it: it == "#")`
			`_ = parser.read_while(lambda it: is_whitespace(it))`
			`title = parser.read_while(lambda it: it != "\n")`
			`_ = parser.read()`
			`return Header(title=title, level=len(hashes))`


			`def parse_ref_link(parser: Parser) -> RefLink:`
			`"""Parse and return a reference link token."""`
			`line = parser.read_while(lambda it: it != "\n")`
			`_ = parser.read()`
			`ret = re.match(r"\[(?P<ref>.+)]:\s+(?P<link>.+)", line)`
			`assert ret is not None`
			`return RefLink(ref=ret.group("ref"), link=ret.group("link"))`


			`def parse_markdown(text: str) -> "MarkdownDoc":`
			`"""Parse a Markdown text and return a document instance."""`
			`processed_text = text`
			`parser = Parser(buffer=processed_text)`

			`root = MarkdownDoc()`

			`while parser.peek() != "":`
			`node: Node`
			`c = parser.peek()`

			`# Whitespace parsing:`
			`if is_whitespace(c):`
			`node = parse_whitespace(parser=parser)`
			`root.add_child(node)`
			`continue`

			`# Code parsing:`
			if c == "-" or c == "~" or c == "`":
			`sep = parser.peek(3)`
			if sep == "---" or sep == "~~~" or sep == "```":
			`node = parse_code(parser=parser)`
			`root.add_child(node)`
			`continue`

			`# Header parsing:`
			`if c == "#":`
			`node = parse_header(parser=parser)`
			`root.add_child(node)`
			`continue`

			`# Parse Reference-style Links`
			`if c == "[":`
			`line = parser.peek_while(lambda it: it != "\n")`
			`if re.match(r"\[.+]: .+", line):`
			`node = parse_ref_link(parser=parser)`
			`root.add_child(node)`
			`continue`

			`# Default node parsing:`
			`node = parse_paragraph(parser=parser)`
			`root.add_child(node)`

			`return root`


			`class MarkdownDoc:`
			`"""A class used to represent Markdown document.`

			`Attributes:`
			`children: children nodes of this document.`
			`"""`

			`children: List[Node]`

			`def __init__(self) -> None:`
			`self.children = []`

			`def add_child(self, node) -> None:`
			`"""Add a node to the document."""`
			`self.children.append(node)`

			`def find_first(self, func, start: Optional[Node] = None) -> Optional[Node]:`
			`"""Search the first child node that meet a criteria.`

			`Args:`
			`func: a callable predicate to filter against.`
			`start: a node to start the search from (it can be the returned result).`
			`"""`
			`if start:`
			`start_index = self.children.index(start)`
			`else:`
			`start_index = 0`
			`for child in self.children[start_index:]:`
			`if func(child):`
			`return child`
			`return None`

			`def to_text(self) -> str:`
			`"""Return the text representation of this document."""`
			`ref_links_nodes = [c for c in self.children if isinstance(c, RefLink)]`
			`other_nodes = [c for c in self.children if not isinstance(c, RefLink)]`
			`nodes = [other_nodes, ref_links_nodes]`
			`return "".join([node.content for node in nodes if node.content])`

			`def indent(self, count: int = 1) -> None:`
			`"""Indent all headers of a specified count level."""`
			`for c in self.children:`
			`if isinstance(c, Header):`
			`c.indent(count=count)`

			`def extend(self, other: "MarkdownDoc") -> None:`
			`"""Extend the current document with another Markdown document instance."""`
			`self.children.extend(other.children)`

			`def insert_node(self, start: Node, node: Node) -> None:`
			`"""Insert a child node to the current document, after a specified node."""`
			`index = self.children.index(start)`
			`self.children.insert(index, node)`

			`def insert_nodes(self, start: Node, nodes: List[Node]) -> None:`
			`"""Insert children nodes to the current document, after a specified node."""`
			`index = self.children.index(start)`
			`self.children[index:index] = nodes`

			`def remove_node(self, node: Node) -> None:`
			`"""Remove a child node."""`
			`try:`
			`index = self.children.index(node)`
			`self.children.pop(index)`
			`except ValueError:`
			`pass`

			`def remove_nodes(self, nodes: List[Node]) -> None:`
			`"""Remove children nodes."""`
			`self.children = [node for node in self.children if node not in nodes]`

			`def slice(self, node_a: Node, node_b: Node) -> List[Node]:`
			`"""Return a slice of the current children nodes`

			`Args:`
			`node_a: lower node (included in the returned slice)`
			`node_b: upper node (excluded from the returned slice)`
			`"""`
			`index_a = self.children.index(node_a)`
			`index_b = self.children.index(node_b)`
			`return self.children[index_a:index_b]`

			`def next_node(self, node: Node) -> Optional[Node]:`
			`"""Return the following node of a specified child node."""`
			`index = self.children.index(node)`
			`if index < len(self.children):`
			`return self.children[index + 1]`
			`else:`
			`return None`

			`def previous_node(self, node: Node) -> Optional[Node]:`
			`"""Return the following node of a specified child node."""`
			`index = self.children.index(node)`
			`if index > 0:`
			`return self.children[index - 1]`
			`else:`
			`return None`

			`def toc(self) -> str:`
			`"""Return a table-of-content of the current document."""`

			`def slugify(value: str) -> str:`
			`value = (`
			`unicodedata.normalize("NFKD", value)`
			`.encode("ascii", "ignore")`
			`.decode("ascii")`
			`)`
			`value = re.sub(r"[^\w\s/-]", "", value).strip().lower()`
			`return re.sub(r"[-\s]+", "-", value).replace("/", "")`

			`headers = [child for child in self.children if isinstance(child, Header)]`
			`toc = dedent(`
			`"""\`
			`Table of Contents`
			`=================`
			`"""`
			`)`
			`for header in headers:`
			`indent = " " * header.level`
			`slug = slugify(header.title)`
			`line = f"{indent}* [{header.title}](#{slug})\n"`
			`toc += line`
			`return toc`