Make HTML to Markdown conversion more pluggable (#12653)

This PR overhauls the HTML to Markdown conversion functionality in order
to make it more pluggable. This will ultimately allow for supporting a
variety of different HTML input structures (both natively and via
extensions).

As part of this, the `rustdoc_to_markdown` crate has been renamed to
`html_to_markdown`.

The `MarkdownWriter` now accepts a list of trait objects that can be
used to drive the conversion of the HTML into Markdown. Right now we
have some generic handler implementations for going from plain HTML
elements to their Markdown equivalents, as well as some rustdoc-specific
ones.

Release Notes:

- N/A
This commit is contained in:
Marshall Bowers 2024-06-04 16:14:26 -04:00 committed by GitHub
parent 1c617474fe
commit 2d9479667f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
15 changed files with 671 additions and 320 deletions

26
Cargo.lock generated
View File

@ -353,6 +353,7 @@ dependencies = [
"fuzzy",
"gpui",
"heed",
"html_to_markdown",
"http 0.1.0",
"indoc",
"language",
@ -367,7 +368,6 @@ dependencies = [
"rand 0.8.5",
"regex",
"rope",
"rustdoc_to_markdown",
"schemars",
"search",
"semantic_index",
@ -5067,6 +5067,18 @@ dependencies = [
"syn 2.0.59",
]
[[package]]
name = "html_to_markdown"
version = "0.1.0"
dependencies = [
"anyhow",
"html5ever",
"indoc",
"markup5ever_rcdom",
"pretty_assertions",
"regex",
]
[[package]]
name = "http"
version = "0.1.0"
@ -8618,18 +8630,6 @@ dependencies = [
"semver",
]
[[package]]
name = "rustdoc_to_markdown"
version = "0.1.0"
dependencies = [
"anyhow",
"html5ever",
"indoc",
"markup5ever_rcdom",
"pretty_assertions",
"regex",
]
[[package]]
name = "rustix"
version = "0.37.23"

View File

@ -41,6 +41,7 @@ members = [
"crates/gpui",
"crates/gpui_macros",
"crates/headless",
"crates/html_to_markdown",
"crates/http",
"crates/image_viewer",
"crates/inline_completion_button",
@ -76,7 +77,6 @@ members = [
"crates/rich_text",
"crates/rope",
"crates/rpc",
"crates/rustdoc_to_markdown",
"crates/task",
"crates/tasks_ui",
"crates/search",
@ -187,6 +187,7 @@ google_ai = { path = "crates/google_ai" }
gpui = { path = "crates/gpui" }
gpui_macros = { path = "crates/gpui_macros" }
headless = { path = "crates/headless" }
html_to_markdown = { path = "crates/html_to_markdown" }
http = { path = "crates/http" }
install_cli = { path = "crates/install_cli" }
image_viewer = { path = "crates/image_viewer" }
@ -223,7 +224,6 @@ dev_server_projects = { path = "crates/dev_server_projects" }
rich_text = { path = "crates/rich_text" }
rope = { path = "crates/rope" }
rpc = { path = "crates/rpc" }
rustdoc_to_markdown = { path = "crates/rustdoc_to_markdown" }
task = { path = "crates/task" }
tasks_ui = { path = "crates/tasks_ui" }
search = { path = "crates/search" }

View File

@ -28,6 +28,7 @@ futures.workspace = true
fuzzy.workspace = true
gpui.workspace = true
heed.workspace = true
html_to_markdown.workspace = true
http.workspace = true
indoc.workspace = true
language.workspace = true
@ -40,7 +41,6 @@ parking_lot.workspace = true
project.workspace = true
regex.workspace = true
rope.workspace = true
rustdoc_to_markdown.workspace = true
schemars.workspace = true
search.workspace = true
semantic_index.workspace = true

View File

@ -5,9 +5,9 @@ use anyhow::{anyhow, bail, Context, Result};
use assistant_slash_command::{SlashCommand, SlashCommandOutput, SlashCommandOutputSection};
use futures::AsyncReadExt;
use gpui::{AppContext, Task, WeakView};
use html_to_markdown::convert_html_to_markdown;
use http::{AsyncBody, HttpClient, HttpClientWithUrl};
use language::LspAdapterDelegate;
use rustdoc_to_markdown::convert_html_to_markdown;
use ui::{prelude::*, ButtonLike, ElevationIndex};
use workspace::Workspace;

View File

@ -7,10 +7,10 @@ use assistant_slash_command::{SlashCommand, SlashCommandOutput, SlashCommandOutp
use fs::Fs;
use futures::AsyncReadExt;
use gpui::{AppContext, Model, Task, WeakView};
use html_to_markdown::convert_rustdoc_to_markdown;
use http::{AsyncBody, HttpClient, HttpClientWithUrl};
use language::LspAdapterDelegate;
use project::{Project, ProjectPath};
use rustdoc_to_markdown::convert_rustdoc_to_markdown;
use ui::{prelude::*, ButtonLike, ElevationIndex};
use workspace::Workspace;

View File

@ -1,5 +1,5 @@
[package]
name = "rustdoc_to_markdown"
name = "html_to_markdown"
version = "0.1.0"
edition = "2021"
publish = false
@ -9,7 +9,7 @@ license = "GPL-3.0-or-later"
workspace = true
[lib]
path = "src/rustdoc_to_markdown.rs"
path = "src/html_to_markdown.rs"
[dependencies]
anyhow.workspace = true

View File

@ -1,5 +1,5 @@
use html_to_markdown::convert_rustdoc_to_markdown;
use indoc::indoc;
use rustdoc_to_markdown::convert_rustdoc_to_markdown;
pub fn main() {
let html = indoc! {"

View File

@ -3,7 +3,9 @@
#![deny(missing_docs)]
mod html_element;
mod markdown;
mod markdown_writer;
mod structure;
use std::io::Read;
@ -14,15 +16,28 @@ use html5ever::tendril::TendrilSink;
use html5ever::tree_builder::TreeBuilderOpts;
use markup5ever_rcdom::RcDom;
use crate::markdown_writer::MarkdownWriter;
use crate::markdown::{HeadingHandler, ListHandler, ParagraphHandler, StyledTextHandler};
use crate::markdown_writer::{HandleTag, MarkdownWriter};
/// Converts the provided HTML to Markdown.
pub fn convert_html_to_markdown(html: impl Read) -> Result<String> {
let dom = parse_html(html).context("failed to parse HTML")?;
let handlers: Vec<Box<dyn HandleTag>> = vec![
Box::new(ParagraphHandler),
Box::new(HeadingHandler),
Box::new(ListHandler),
Box::new(StyledTextHandler),
Box::new(structure::rustdoc::RustdocChromeRemover),
Box::new(structure::rustdoc::RustdocHeadingHandler),
Box::new(structure::rustdoc::RustdocCodeHandler),
Box::new(structure::rustdoc::RustdocTableHandler::new()),
Box::new(structure::rustdoc::RustdocItemHandler),
];
let markdown_writer = MarkdownWriter::new();
let markdown = markdown_writer
.run(&dom.document)
.run(&dom.document, handlers)
.context("failed to convert HTML to Markdown")?;
Ok(markdown)
@ -32,9 +47,21 @@ pub fn convert_html_to_markdown(html: impl Read) -> Result<String> {
pub fn convert_rustdoc_to_markdown(html: impl Read) -> Result<String> {
let dom = parse_html(html).context("failed to parse rustdoc HTML")?;
let handlers: Vec<Box<dyn HandleTag>> = vec![
Box::new(ParagraphHandler),
Box::new(HeadingHandler),
Box::new(ListHandler),
Box::new(StyledTextHandler),
Box::new(structure::rustdoc::RustdocChromeRemover),
Box::new(structure::rustdoc::RustdocHeadingHandler),
Box::new(structure::rustdoc::RustdocCodeHandler),
Box::new(structure::rustdoc::RustdocTableHandler::new()),
Box::new(structure::rustdoc::RustdocItemHandler),
];
let markdown_writer = MarkdownWriter::new();
let markdown = markdown_writer
.run(&dom.document)
.run(&dom.document, handlers)
.context("failed to convert rustdoc HTML to Markdown")?;
Ok(markdown)

View File

@ -0,0 +1,135 @@
use crate::html_element::HtmlElement;
use crate::markdown_writer::{HandleTag, MarkdownWriter, StartTagOutcome};
/// Emits blank lines for `<p>` elements and keeps inline elements inside a
/// paragraph separated from the text that precedes them.
pub struct ParagraphHandler;

impl HandleTag for ParagraphHandler {
    /// Accepts every tag: any inline element may need a separating space, not
    /// only `<p>` itself.
    fn should_handle(&self, _tag: &str) -> bool {
        true
    }

    /// Inserts a single space before an inline element that sits directly
    /// under a non-inline parent somewhere inside a `<p>` (unless the output
    /// already ends in a space or newline), then starts a new paragraph when
    /// the tag is `<p>`.
    fn handle_tag_start(
        &mut self,
        tag: &HtmlElement,
        writer: &mut MarkdownWriter,
    ) -> StartTagOutcome {
        let inline_in_paragraph = tag.is_inline() && writer.is_inside("p");
        if inline_in_paragraph {
            // The innermost open element is the direct parent of `tag`.
            let parent_is_block = writer
                .current_element_stack()
                .back()
                .map_or(false, |parent| !parent.is_inline());
            let already_separated =
                writer.markdown.ends_with(' ') || writer.markdown.ends_with('\n');

            if parent_is_block && !already_separated {
                writer.push_str(" ");
            }
        }

        if tag.tag == "p" {
            writer.push_blank_line();
        }

        StartTagOutcome::Continue
    }
}
/// Converts `<h1>`–`<h6>` elements into ATX-style (`#`) Markdown headings.
pub struct HeadingHandler;

impl HeadingHandler {
    /// Returns the text written at the start of the given heading tag, or
    /// `None` when the tag is not a heading.
    fn prefix_for(tag: &str) -> Option<&'static str> {
        let prefix = match tag {
            "h1" => "\n\n# ",
            "h2" => "\n\n## ",
            "h3" => "\n\n### ",
            "h4" => "\n\n#### ",
            "h5" => "\n\n##### ",
            "h6" => "\n\n###### ",
            _ => return None,
        };
        Some(prefix)
    }
}

impl HandleTag for HeadingHandler {
    /// Only the six heading tags are handled.
    fn should_handle(&self, tag: &str) -> bool {
        Self::prefix_for(tag).is_some()
    }

    /// Writes a blank line followed by the `#` marker for the heading level.
    fn handle_tag_start(
        &mut self,
        tag: &HtmlElement,
        writer: &mut MarkdownWriter,
    ) -> StartTagOutcome {
        if let Some(prefix) = Self::prefix_for(tag.tag.as_str()) {
            writer.push_str(prefix);
        }

        StartTagOutcome::Continue
    }

    /// Terminates the heading with a blank line.
    fn handle_tag_end(&mut self, tag: &HtmlElement, writer: &mut MarkdownWriter) {
        if Self::prefix_for(tag.tag.as_str()).is_some() {
            writer.push_blank_line();
        }
    }
}
/// Converts `<ul>`, `<ol>` and `<li>` elements into Markdown list syntax.
///
/// Every item is written with a `- ` bullet, for ordered and unordered lists
/// alike.
pub struct ListHandler;

impl HandleTag for ListHandler {
    /// Only list containers and list items are handled.
    fn should_handle(&self, tag: &str) -> bool {
        matches!(tag, "ul" | "ol" | "li")
    }

    /// Starts a list on a fresh line and prefixes each item with `- `.
    fn handle_tag_start(
        &mut self,
        tag: &HtmlElement,
        writer: &mut MarkdownWriter,
    ) -> StartTagOutcome {
        let name = tag.tag.as_str();
        if name == "li" {
            writer.push_str("- ");
        } else if matches!(name, "ul" | "ol") {
            writer.push_newline();
        }

        StartTagOutcome::Continue
    }

    /// Ends both lists and list items with a newline.
    fn handle_tag_end(&mut self, tag: &HtmlElement, writer: &mut MarkdownWriter) {
        if matches!(tag.tag.as_str(), "ul" | "ol" | "li") {
            writer.push_newline();
        }
    }
}
/// Converts `<strong>` and `<em>` elements into Markdown emphasis markers.
pub struct StyledTextHandler;

impl StyledTextHandler {
    /// Returns the marker written on both sides of the given tag's content,
    /// or `None` when the tag is not a styled-text tag.
    fn delimiter_for(tag: &str) -> Option<&'static str> {
        match tag {
            "strong" => Some("**"),
            "em" => Some("_"),
            _ => None,
        }
    }
}

impl HandleTag for StyledTextHandler {
    /// Only `<strong>` and `<em>` are handled.
    fn should_handle(&self, tag: &str) -> bool {
        Self::delimiter_for(tag).is_some()
    }

    /// Writes the opening emphasis marker.
    fn handle_tag_start(
        &mut self,
        tag: &HtmlElement,
        writer: &mut MarkdownWriter,
    ) -> StartTagOutcome {
        if let Some(delimiter) = Self::delimiter_for(tag.tag.as_str()) {
            writer.push_str(delimiter);
        }

        StartTagOutcome::Continue
    }

    /// Writes the closing emphasis marker, which is identical to the opening one.
    fn handle_tag_end(&mut self, tag: &HtmlElement, writer: &mut MarkdownWriter) {
        if let Some(delimiter) = Self::delimiter_for(tag.tag.as_str()) {
            writer.push_str(delimiter);
        }
    }
}

View File

@ -0,0 +1,198 @@
use std::collections::VecDeque;
use std::sync::OnceLock;
use anyhow::Result;
use markup5ever_rcdom::{Handle, NodeData};
use regex::Regex;
use crate::html_element::HtmlElement;
/// Returns the process-wide regex `^\s*$`, compiling it on first use.
///
/// NOTE(review): the pattern has no `(?m)` flag, so with the `regex` crate's
/// default anchoring it presumably only matches when the whole input is
/// whitespace rather than individual blank lines — confirm this is intended.
fn empty_line_regex() -> &'static Regex {
    static EMPTY_LINE: OnceLock<Regex> = OnceLock::new();

    EMPTY_LINE.get_or_init(|| {
        let compiled = Regex::new(r"^\s*$");
        compiled.unwrap()
    })
}
/// Returns the process-wide regex `\n{3,}`, compiling it on first use.
///
/// Despite the function's name, the pattern matches runs of three *or more*
/// consecutive newlines.
fn more_than_three_newlines_regex() -> &'static Regex {
    static NEWLINE_RUN: OnceLock<Regex> = OnceLock::new();

    NEWLINE_RUN.get_or_init(|| {
        let compiled = Regex::new(r"\n{3,}");
        compiled.unwrap()
    })
}
/// The result of handling the start of a tag.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum StartTagOutcome {
    /// Keep processing: run the remaining handlers and visit the element's
    /// children.
    Continue,
    /// Stop processing this element: its children are not visited and no
    /// end-tag handlers run for it.
    Skip,
}
/// Accumulates Markdown output while an HTML node tree is walked.
pub struct MarkdownWriter {
/// The elements whose children are currently being visited. An element is
/// pushed after its start-tag handlers have run and popped before its
/// end-tag handlers run.
current_element_stack: VecDeque<HtmlElement>,
/// The Markdown output produced so far.
pub(crate) markdown: String,
}
impl MarkdownWriter {
    /// Creates a writer with an empty element stack and empty output.
    pub fn new() -> Self {
        Self {
            current_element_stack: VecDeque::new(),
            markdown: String::new(),
        }
    }

    /// Returns the elements whose children are currently being visited,
    /// outermost first.
    pub fn current_element_stack(&self) -> &VecDeque<HtmlElement> {
        &self.current_element_stack
    }

    /// Returns whether any element on the current element stack has the given
    /// tag name.
    pub fn is_inside(&self, tag: &str) -> bool {
        self.current_element_stack
            .iter()
            .any(|parent_element| parent_element.tag == tag)
    }

    /// Appends the given string slice onto the end of the Markdown output.
    pub fn push_str(&mut self, str: &str) {
        self.markdown.push_str(str);
    }

    /// Appends a newline to the end of the Markdown output.
    pub fn push_newline(&mut self) {
        self.push_str("\n");
    }

    /// Appends a blank line to the end of the Markdown output.
    pub fn push_blank_line(&mut self) {
        self.push_str("\n\n");
    }

    /// Walks the tree rooted at `root_node`, letting `handlers` drive the
    /// conversion, and returns the prettified Markdown.
    ///
    /// Handlers are consulted in the order they appear in `handlers`.
    pub fn run(
        mut self,
        root_node: &Handle,
        mut handlers: Vec<Box<dyn HandleTag>>,
    ) -> Result<String> {
        self.visit_node(root_node, &mut handlers)?;
        Ok(Self::prettify_markdown(self.markdown))
    }

    /// Applies the two clean-up regexes to the raw output and trims
    /// surrounding whitespace.
    fn prettify_markdown(markdown: String) -> String {
        let markdown = empty_line_regex().replace_all(&markdown, "");
        let markdown = more_than_three_newlines_regex().replace_all(&markdown, "\n\n");

        markdown.trim().to_string()
    }

    /// Visits `node` and, unless a handler asks to skip it, all of its
    /// children.
    fn visit_node(&mut self, node: &Handle, handlers: &mut [Box<dyn HandleTag>]) -> Result<()> {
        let mut current_element = None;

        match node.data {
            NodeData::Document
            | NodeData::Doctype { .. }
            | NodeData::ProcessingInstruction { .. }
            | NodeData::Comment { .. } => {
                // Currently left unimplemented, as we're not interested in this data
                // at this time.
            }
            NodeData::Element {
                ref name,
                ref attrs,
                ..
            } => {
                let tag_name = name.local.to_string();
                if !tag_name.is_empty() {
                    current_element = Some(HtmlElement {
                        tag: tag_name,
                        attrs: attrs.clone(),
                    });
                }
            }
            NodeData::Text { ref contents } => {
                let text = contents.borrow().to_string();
                self.visit_text(text, handlers)?;
            }
        }

        // Move the element onto the stack instead of cloning it; it is taken
        // back off the stack below once the children have been visited.
        let pushed_element = match current_element {
            Some(element) => {
                if let StartTagOutcome::Skip = self.start_tag(&element, handlers) {
                    return Ok(());
                }
                self.current_element_stack.push_back(element);
                true
            }
            None => false,
        };

        for child in node.children.borrow().iter() {
            self.visit_node(child, handlers)?;
        }

        if pushed_element {
            // The element is popped *before* the end-tag handlers run, so they
            // see only its ancestors on the stack.
            if let Some(element) = self.current_element_stack.pop_back() {
                self.end_tag(&element, handlers);
            }
        }

        Ok(())
    }

    /// Runs the start-tag hook of every interested handler, in order, stopping
    /// at the first one that returns [`StartTagOutcome::Skip`].
    fn start_tag(
        &mut self,
        tag: &HtmlElement,
        handlers: &mut [Box<dyn HandleTag>],
    ) -> StartTagOutcome {
        for handler in handlers.iter_mut() {
            if !handler.should_handle(tag.tag.as_str()) {
                continue;
            }

            match handler.handle_tag_start(tag, self) {
                StartTagOutcome::Continue => {}
                StartTagOutcome::Skip => return StartTagOutcome::Skip,
            }
        }

        StartTagOutcome::Continue
    }

    /// Runs the end-tag hook of every interested handler, in order.
    fn end_tag(&mut self, tag: &HtmlElement, handlers: &mut [Box<dyn HandleTag>]) {
        for handler in handlers.iter_mut() {
            if handler.should_handle(tag.tag.as_str()) {
                handler.handle_tag_end(tag, self);
            }
        }
    }

    /// Offers a text node to every handler in order; if none of them handles
    /// it, writes the text with leading/trailing line breaks removed and
    /// interior newlines replaced by spaces.
    fn visit_text(&mut self, text: String, handlers: &mut [Box<dyn HandleTag>]) -> Result<()> {
        for handler in handlers.iter_mut() {
            match handler.handle_text(&text, self) {
                HandlerOutcome::Handled => return Ok(()),
                HandlerOutcome::NoOp => {}
            }
        }

        let text = text
            .trim_matches(|char| char == '\n' || char == '\r')
            .replace('\n', " ");

        self.push_str(&text);

        Ok(())
    }
}
/// The result of offering a text node to a handler.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum HandlerOutcome {
    /// The handler consumed the text; no further handler is consulted.
    Handled,
    /// The handler did nothing with the text; the next handler is tried.
    NoOp,
}
/// A pluggable handler that drives part of the HTML to Markdown conversion.
///
/// Only `should_handle` is required; the other hooks have no-op defaults.
pub trait HandleTag {
/// Returns whether this handler should handle the given tag.
///
/// This gates `handle_tag_start` and `handle_tag_end` only.
fn should_handle(&self, tag: &str) -> bool;
/// Handles the start of the given tag.
///
/// The default implementation does nothing and returns
/// `StartTagOutcome::Continue`.
fn handle_tag_start(
&mut self,
_tag: &HtmlElement,
_writer: &mut MarkdownWriter,
) -> StartTagOutcome {
StartTagOutcome::Continue
}
/// Handles the end of the given tag.
///
/// The default implementation does nothing.
fn handle_tag_end(&mut self, _tag: &HtmlElement, _writer: &mut MarkdownWriter) {}
/// Handles a text node.
///
/// The writer offers text to every handler without consulting
/// `should_handle`, and stops at the first handler that returns
/// `HandlerOutcome::Handled`. The default implementation returns
/// `HandlerOutcome::NoOp`.
fn handle_text(&mut self, _text: &str, _writer: &mut MarkdownWriter) -> HandlerOutcome {
HandlerOutcome::NoOp
}
}

View File

@ -0,0 +1 @@
pub mod rustdoc;

View File

@ -0,0 +1,286 @@
use crate::html_element::HtmlElement;
use crate::markdown_writer::{HandleTag, HandlerOutcome, MarkdownWriter, StartTagOutcome};
/// Cleans up the text of headings by stripping the `§` anchor marker and
/// line breaks from its ends.
pub struct RustdocHeadingHandler;

impl HandleTag for RustdocHeadingHandler {
    fn should_handle(&self, _tag: &str) -> bool {
        // We're only handling text, so we don't need to visit any tags.
        false
    }

    /// Writes heading text with leading/trailing `\n`, `\r` and `§` removed
    /// and interior newlines turned into spaces. Text outside of headings is
    /// left for other handlers.
    fn handle_text(&mut self, text: &str, writer: &mut MarkdownWriter) -> HandlerOutcome {
        const HEADING_TAGS: [&str; 6] = ["h1", "h2", "h3", "h4", "h5", "h6"];

        let inside_heading = HEADING_TAGS
            .iter()
            .any(|heading| writer.is_inside(heading));
        if !inside_heading {
            return HandlerOutcome::NoOp;
        }

        let trimmed = text.trim_matches(|char| matches!(char, '\n' | '\r' | '§'));
        let single_line = trimmed.replace('\n', " ");
        writer.push_str(&single_line);

        HandlerOutcome::Handled
    }
}
/// Converts `<code>` and `<pre>` elements into inline code spans and fenced
/// code blocks.
pub struct RustdocCodeHandler;

impl HandleTag for RustdocCodeHandler {
    /// Only `<pre>` and `<code>` are handled.
    fn should_handle(&self, tag: &str) -> bool {
        matches!(tag, "pre" | "code")
    }

    /// Opens an inline code span for `<code>` (unless it is nested in a
    /// `<pre>`) or a fenced code block for `<pre>`.
    ///
    /// The fence's language is `rs` when the `<pre>` has a `rust` class,
    /// otherwise the remainder of the first class that starts with
    /// `language-`, otherwise empty.
    fn handle_tag_start(
        &mut self,
        tag: &HtmlElement,
        writer: &mut MarkdownWriter,
    ) -> StartTagOutcome {
        match tag.tag.as_str() {
            "code" => {
                if !writer.is_inside("pre") {
                    writer.push_str("`");
                }
            }
            "pre" => {
                let classes = tag.classes();
                let is_rust = classes.iter().any(|class| class == "rust");
                let language = is_rust
                    .then_some("rs")
                    .or_else(|| {
                        // Only a leading `language-` marks a language class;
                        // a class that merely contains it is ignored.
                        classes.iter().find_map(|class| {
                            class
                                .strip_prefix("language-")
                                .map(|language| language.trim())
                        })
                    })
                    .unwrap_or("");

                writer.push_str(&format!("\n\n```{language}\n"));
            }
            _ => {}
        }

        StartTagOutcome::Continue
    }

    /// Closes the inline code span or the fenced code block.
    fn handle_tag_end(&mut self, tag: &HtmlElement, writer: &mut MarkdownWriter) {
        match tag.tag.as_str() {
            "code" => {
                if !writer.is_inside("pre") {
                    writer.push_str("`");
                }
            }
            "pre" => writer.push_str("\n```\n"),
            _ => {}
        }
    }

    /// Writes text inside a `<pre>` verbatim so that code keeps its line
    /// breaks; other text is left for other handlers.
    fn handle_text(&mut self, text: &str, writer: &mut MarkdownWriter) -> HandlerOutcome {
        if writer.is_inside("pre") {
            writer.push_str(text);
            return HandlerOutcome::Handled;
        }

        HandlerOutcome::NoOp
    }
}
/// Converts `<table>` markup into a pipe-delimited Markdown table.
pub struct RustdocTableHandler {
    /// The number of columns in the current `<table>`.
    current_table_columns: usize,
    /// Whether the next `<th>` is the first one since the last `</thead>`.
    is_first_th: bool,
    /// Whether the next `<td>` is the first one since the last `</tr>`.
    is_first_td: bool,
}

impl RustdocTableHandler {
    /// Creates a handler with no columns counted and both "first cell" flags set.
    pub fn new() -> Self {
        Self {
            current_table_columns: 0,
            is_first_th: true,
            is_first_td: true,
        }
    }

    /// Writes the opening `| ` of a cell, preceded by a space for every cell
    /// but the first, and clears the "first cell" flag.
    fn open_cell(is_first_cell: &mut bool, writer: &mut MarkdownWriter) {
        if *is_first_cell {
            *is_first_cell = false;
        } else {
            writer.push_str(" ");
        }
        writer.push_str("| ");
    }
}

impl HandleTag for RustdocTableHandler {
    /// Only table-related tags are handled.
    fn should_handle(&self, tag: &str) -> bool {
        matches!(tag, "table" | "thead" | "tbody" | "tr" | "th" | "td")
    }

    /// Starts header sections, rows and cells; every `<th>` also increments
    /// the column count.
    fn handle_tag_start(
        &mut self,
        tag: &HtmlElement,
        writer: &mut MarkdownWriter,
    ) -> StartTagOutcome {
        let name = tag.tag.as_str();
        if name == "thead" {
            writer.push_blank_line();
        } else if name == "tr" {
            writer.push_newline();
        } else if name == "th" {
            self.current_table_columns += 1;
            Self::open_cell(&mut self.is_first_th, writer);
        } else if name == "td" {
            Self::open_cell(&mut self.is_first_td, writer);
        }

        StartTagOutcome::Continue
    }

    /// Closes rows, writes the `| --- |` separator row after the header, and
    /// resets the column count when the table ends.
    fn handle_tag_end(&mut self, tag: &HtmlElement, writer: &mut MarkdownWriter) {
        match tag.tag.as_str() {
            "thead" => {
                writer.push_newline();
                // One `| ---` per header column, separated by single spaces.
                let separator_cells = vec!["| ---"; self.current_table_columns];
                writer.push_str(&separator_cells.join(" "));
                writer.push_str(" |");

                self.is_first_th = true;
            }
            "tr" => {
                writer.push_str(" |");
                self.is_first_td = true;
            }
            "table" => self.current_table_columns = 0,
            _ => {}
        }
    }
}
/// The class rustdoc puts on the element holding an item's name in an item list.
const RUSTDOC_ITEM_NAME_CLASS: &str = "item-name";

/// Formats the entries of rustdoc item lists: item names become inline code,
/// `stab` badges are wrapped in square brackets, and a `: ` follows the name.
pub struct RustdocItemHandler;

impl RustdocItemHandler {
    /// Returns whether we're currently inside of an `.item-name` element, which
    /// rustdoc uses to display Rust items in a list.
    fn is_inside_item_name(writer: &MarkdownWriter) -> bool {
        writer
            .current_element_stack()
            .iter()
            .any(|element| element.has_class(RUSTDOC_ITEM_NAME_CLASS))
    }
}

impl HandleTag for RustdocItemHandler {
    /// Only `<div>` and `<span>` are handled.
    fn should_handle(&self, tag: &str) -> bool {
        matches!(tag, "div" | "span")
    }

    /// Opens a ` [` before a `stab` badge that appears inside an item name.
    fn handle_tag_start(
        &mut self,
        tag: &HtmlElement,
        writer: &mut MarkdownWriter,
    ) -> StartTagOutcome {
        match tag.tag.as_str() {
            "div" | "span" => {
                if Self::is_inside_item_name(writer) && tag.has_class("stab") {
                    writer.push_str(" [");
                }
            }
            _ => {}
        }

        StartTagOutcome::Continue
    }

    /// Writes `: ` after an item name and closes a `stab` badge with `]`.
    fn handle_tag_end(&mut self, tag: &HtmlElement, writer: &mut MarkdownWriter) {
        match tag.tag.as_str() {
            "div" | "span" => {
                if tag.has_class(RUSTDOC_ITEM_NAME_CLASS) {
                    writer.push_str(": ");
                }

                if Self::is_inside_item_name(writer) && tag.has_class("stab") {
                    writer.push_str("]");
                }
            }
            _ => {}
        }
    }

    /// Wraps item-name text in backticks, unless it sits inside a `<span>` or
    /// `<code>` element.
    ///
    /// Line breaks are stripped from the ends of the text and interior
    /// newlines become spaces, so that no raw newline ends up inside the
    /// inline code span.
    fn handle_text(&mut self, text: &str, writer: &mut MarkdownWriter) -> HandlerOutcome {
        if Self::is_inside_item_name(writer)
            && !writer.is_inside("span")
            && !writer.is_inside("code")
        {
            let text = text
                .trim_matches(|char| char == '\n' || char == '\r')
                .replace('\n', " ");
            writer.push_str(&format!("`{text}`"));
            return HandlerOutcome::Handled;
        }

        HandlerOutcome::NoOp
    }
}
/// Drops the parts of a rustdoc page that are page chrome rather than
/// documentation content, by skipping the matching elements.
pub struct RustdocChromeRemover;

impl HandleTag for RustdocChromeRemover {
    /// Only the tags that can carry page chrome are handled.
    fn should_handle(&self, tag: &str) -> bool {
        matches!(
            tag,
            "head" | "script" | "nav" | "summary" | "button" | "div" | "span"
        )
    }

    /// Returns [`StartTagOutcome::Skip`] for `<head>`, `<script>` and `<nav>`,
    /// for `<summary class="hideme">`, for the `copy-path` button, and for
    /// `<div>`/`<span>` elements carrying one of the chrome classes.
    fn handle_tag_start(
        &mut self,
        tag: &HtmlElement,
        _writer: &mut MarkdownWriter,
    ) -> StartTagOutcome {
        let is_chrome = match tag.tag.as_str() {
            "head" | "script" | "nav" => true,
            "summary" => tag.has_class("hideme"),
            "button" => tag.attr("id").as_deref() == Some("copy-path"),
            "div" | "span" => {
                let classes_to_skip = ["nav-container", "sidebar-elems", "out-of-band"];
                tag.has_any_classes(&classes_to_skip)
            }
            _ => false,
        };

        if is_chrome {
            StartTagOutcome::Skip
        } else {
            StartTagOutcome::Continue
        }
    }
}

View File

@ -1,296 +0,0 @@
use std::collections::VecDeque;
use std::sync::OnceLock;
use anyhow::Result;
use markup5ever_rcdom::{Handle, NodeData};
use regex::Regex;
use crate::html_element::HtmlElement;
/// Returns the process-wide regex `^\s*$`, compiling it on first use.
///
/// NOTE(review): the pattern has no `(?m)` flag, so it presumably only matches
/// when the whole input is whitespace — confirm this is intended.
fn empty_line_regex() -> &'static Regex {
static REGEX: OnceLock<Regex> = OnceLock::new();
REGEX.get_or_init(|| Regex::new(r"^\s*$").unwrap())
}
/// Returns the process-wide regex `\n{3,}`, compiling it on first use.
///
/// Despite the function's name, the pattern matches runs of three *or more*
/// consecutive newlines.
fn more_than_three_newlines_regex() -> &'static Regex {
static REGEX: OnceLock<Regex> = OnceLock::new();
REGEX.get_or_init(|| Regex::new(r"\n{3,}").unwrap())
}
const RUSTDOC_ITEM_NAME_CLASS: &str = "item-name";
/// The result of handling the start of a tag.
enum StartTagOutcome {
/// Keep processing: the element's children are visited.
Continue,
/// Stop processing this element: its children are not visited and its end
/// tag is not handled.
Skip,
}
/// Walks an HTML node tree and accumulates the equivalent Markdown, including
/// the rustdoc-specific handling and the table state.
pub struct MarkdownWriter {
/// The elements whose children are currently being visited, outermost first.
current_element_stack: VecDeque<HtmlElement>,
/// The number of columns in the current `<table>`.
current_table_columns: usize,
/// Whether the next `<th>` is the first one since the last `</thead>`.
is_first_th: bool,
/// Whether the next `<td>` is the first one since the last `</tr>`.
is_first_td: bool,
/// The Markdown output.
markdown: String,
}
impl MarkdownWriter {
/// Creates a writer with an empty element stack, no table columns counted,
/// both "first cell" flags set, and empty output.
pub fn new() -> Self {
Self {
current_element_stack: VecDeque::new(),
current_table_columns: 0,
is_first_th: true,
is_first_td: true,
markdown: String::new(),
}
}
/// Returns whether any element on the current element stack has the given
/// tag name.
fn is_inside(&self, tag: &str) -> bool {
self.current_element_stack
.iter()
.any(|parent_element| parent_element.tag == tag)
}
/// Appends the given string slice onto the end of the Markdown output.
fn push_str(&mut self, str: &str) {
self.markdown.push_str(str);
}
/// Appends a newline to the end of the Markdown output.
fn push_newline(&mut self) {
self.push_str("\n");
}
/// Appends a blank line to the end of the Markdown output.
fn push_blank_line(&mut self) {
self.push_str("\n\n");
}
/// Walks the tree rooted at `root_node` and returns the prettified Markdown.
pub fn run(mut self, root_node: &Handle) -> Result<String> {
self.visit_node(&root_node)?;
Ok(Self::prettify_markdown(self.markdown))
}
/// Applies the two clean-up regexes to the raw output and trims surrounding
/// whitespace.
fn prettify_markdown(markdown: String) -> String {
let markdown = empty_line_regex().replace_all(&markdown, "");
let markdown = more_than_three_newlines_regex().replace_all(&markdown, "\n\n");
markdown.trim().to_string()
}
/// Visits `node` and, unless `start_tag` asks to skip it, all of its children.
fn visit_node(&mut self, node: &Handle) -> Result<()> {
let mut current_element = None;
match node.data {
NodeData::Document
| NodeData::Doctype { .. }
| NodeData::ProcessingInstruction { .. }
| NodeData::Comment { .. } => {
// Currently left unimplemented, as we're not interested in this data
// at this time.
}
NodeData::Element {
ref name,
ref attrs,
..
} => {
let tag_name = name.local.to_string();
if !tag_name.is_empty() {
current_element = Some(HtmlElement {
tag: tag_name,
attrs: attrs.clone(),
});
}
}
NodeData::Text { ref contents } => {
let text = contents.borrow().to_string();
self.visit_text(text)?;
}
}
if let Some(current_element) = current_element.as_ref() {
match self.start_tag(&current_element) {
StartTagOutcome::Continue => {}
StartTagOutcome::Skip => return Ok(()),
}
// The element goes on the stack only after `start_tag` has run, so
// `start_tag` sees just its ancestors.
self.current_element_stack
.push_back(current_element.clone());
}
for child in node.children.borrow().iter() {
self.visit_node(child)?;
}
if let Some(current_element) = current_element {
// Popped before `end_tag` runs, so `end_tag` also sees only ancestors.
self.current_element_stack.pop_back();
self.end_tag(&current_element);
}
Ok(())
}
/// Writes the Markdown that opens the given element, or returns
/// `StartTagOutcome::Skip` for elements whose content should be dropped.
fn start_tag(&mut self, tag: &HtmlElement) -> StartTagOutcome {
// Separate an inline element from preceding text when it sits directly
// under a non-inline parent somewhere inside a `<p>`.
if tag.is_inline() && self.is_inside("p") {
if let Some(parent) = self.current_element_stack.iter().last() {
if !parent.is_inline() {
if !(self.markdown.ends_with(' ') || self.markdown.ends_with('\n')) {
self.push_str(" ");
}
}
}
}
match tag.tag.as_str() {
"head" | "script" | "nav" => return StartTagOutcome::Skip,
"h1" => self.push_str("\n\n# "),
"h2" => self.push_str("\n\n## "),
"h3" => self.push_str("\n\n### "),
"h4" => self.push_str("\n\n#### "),
"h5" => self.push_str("\n\n##### "),
"h6" => self.push_str("\n\n###### "),
"p" => self.push_blank_line(),
"strong" => self.push_str("**"),
"em" => self.push_str("_"),
"code" => {
// Inside a `<pre>` the fenced block already marks the text as code.
if !self.is_inside("pre") {
self.push_str("`");
}
}
"pre" => {
// Fence language: `rs` for a `rust` class, otherwise whatever follows
// `language-` in the first class containing it, otherwise empty.
let classes = tag.classes();
let is_rust = classes.iter().any(|class| class == "rust");
let language = is_rust
.then(|| "rs")
.or_else(|| {
classes.iter().find_map(|class| {
if let Some((_, language)) = class.split_once("language-") {
Some(language.trim())
} else {
None
}
})
})
.unwrap_or("");
self.push_str(&format!("\n\n```{language}\n"));
}
"ul" | "ol" => self.push_newline(),
"li" => self.push_str("- "),
"thead" => self.push_blank_line(),
"tr" => self.push_newline(),
"th" => {
// Every header cell counts as one table column.
self.current_table_columns += 1;
if self.is_first_th {
self.is_first_th = false;
} else {
self.push_str(" ");
}
self.push_str("| ");
}
"td" => {
if self.is_first_td {
self.is_first_td = false;
} else {
self.push_str(" ");
}
self.push_str("| ");
}
"summary" => {
if tag.has_class("hideme") {
return StartTagOutcome::Skip;
}
}
"button" => {
if tag.attr("id").as_deref() == Some("copy-path") {
return StartTagOutcome::Skip;
}
}
"div" | "span" => {
let classes_to_skip = ["nav-container", "sidebar-elems", "out-of-band"];
if tag.has_any_classes(&classes_to_skip) {
return StartTagOutcome::Skip;
}
if self.is_inside_item_name() && tag.has_class("stab") {
self.push_str(" [");
}
}
_ => {}
}
StartTagOutcome::Continue
}
/// Writes the Markdown that closes the given element and resets the table
/// state where needed.
fn end_tag(&mut self, tag: &HtmlElement) {
match tag.tag.as_str() {
"h1" | "h2" | "h3" | "h4" | "h5" | "h6" => self.push_str("\n\n"),
"strong" => self.push_str("**"),
"em" => self.push_str("_"),
"code" => {
if !self.is_inside("pre") {
self.push_str("`");
}
}
"pre" => self.push_str("\n```\n"),
"ul" | "ol" => self.push_newline(),
"li" => self.push_newline(),
"thead" => {
// Write the `| --- |` separator row, one cell per header column.
self.push_newline();
for ix in 0..self.current_table_columns {
if ix > 0 {
self.push_str(" ");
}
self.push_str("| ---");
}
self.push_str(" |");
self.is_first_th = true;
}
"tr" => {
self.push_str(" |");
self.is_first_td = true;
}
"table" => {
self.current_table_columns = 0;
}
"div" | "span" => {
if tag.has_class(RUSTDOC_ITEM_NAME_CLASS) {
self.push_str(": ");
}
if self.is_inside_item_name() && tag.has_class("stab") {
self.push_str("]");
}
}
_ => {}
}
}
/// Writes a text node: verbatim inside a `<pre>`, otherwise with leading and
/// trailing `\n`, `\r` and `§` removed and interior newlines replaced by
/// spaces. Item-name text outside of `<span>` and `<code>` is additionally
/// wrapped in backticks.
fn visit_text(&mut self, text: String) -> Result<()> {
if self.is_inside("pre") {
self.push_str(&text);
return Ok(());
}
let text = text
.trim_matches(|char| char == '\n' || char == '\r' || char == '§')
.replace('\n', " ");
if self.is_inside_item_name() && !self.is_inside("span") && !self.is_inside("code") {
self.push_str(&format!("`{text}`"));
return Ok(());
}
self.push_str(&text);
Ok(())
}
/// Returns whether we're currently inside of an `.item-name` element, which
/// rustdoc uses to display Rust items in a list.
fn is_inside_item_name(&self) -> bool {
self.current_element_stack
.iter()
.any(|element| element.has_class(RUSTDOC_ITEM_NAME_CLASS))
}
}