From 5bcb9ed0174cd6cebe4bb087a23914d4eb7d5d25 Mon Sep 17 00:00:00 2001 From: Marshall Bowers Date: Wed, 29 May 2024 16:05:16 -0400 Subject: [PATCH] Add `rustdoc_to_markdown` crate (#12445) This PR adds a new crate for converting rustdoc output to Markdown. We're leveraging Servo's `html5ever` to parse the HTML content, and then walking the DOM nodes to convert it to a Markdown string. The Markdown output will continue to be refined, but it's already in a reasonable place. Release Notes: - N/A --- Cargo.lock | 146 +++++++++++-- Cargo.toml | 4 + crates/rustdoc_to_markdown/Cargo.toml | 20 ++ crates/rustdoc_to_markdown/LICENSE-GPL | 1 + crates/rustdoc_to_markdown/examples/test.rs | 29 +++ .../src/markdown_writer.rs | 201 ++++++++++++++++++ .../src/rustdoc_to_markdown.rs | 36 ++++ 7 files changed, 420 insertions(+), 17 deletions(-) create mode 100644 crates/rustdoc_to_markdown/Cargo.toml create mode 120000 crates/rustdoc_to_markdown/LICENSE-GPL create mode 100644 crates/rustdoc_to_markdown/examples/test.rs create mode 100644 crates/rustdoc_to_markdown/src/markdown_writer.rs create mode 100644 crates/rustdoc_to_markdown/src/rustdoc_to_markdown.rs diff --git a/Cargo.lock b/Cargo.lock index a381dd863a..b67c2a1f59 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5060,6 +5060,20 @@ version = "3.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4d13cdbd5dbb29f9c88095bbdc2590c9cba0d0a1269b983fef6b2cdd7e9f4db1" +[[package]] +name = "html5ever" +version = "0.27.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c13771afe0e6e846f1e67d038d4cb29998a6779f93c809212e4e9c32efd244d4" +dependencies = [ + "log", + "mac", + "markup5ever", + "proc-macro2", + "quote", + "syn 2.0.59", +] + [[package]] name = "http" version = "0.1.0" @@ -5719,7 +5733,7 @@ dependencies = [ "tree-sitter-embedded-template", "tree-sitter-heex", "tree-sitter-html", - "tree-sitter-json 0.20.2", + "tree-sitter-json", "tree-sitter-markdown",
"tree-sitter-ruby", "tree-sitter-rust", @@ -5809,7 +5823,7 @@ dependencies = [ "tree-sitter-gomod", "tree-sitter-gowork", "tree-sitter-jsdoc", - "tree-sitter-json 0.20.2", + "tree-sitter-json", "tree-sitter-markdown", "tree-sitter-proto", "tree-sitter-python", @@ -6181,6 +6195,32 @@ dependencies = [ "workspace", ] +[[package]] +name = "markup5ever" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "16ce3abbeba692c8b8441d036ef91aea6df8da2c6b6e21c7e14d3c18e526be45" +dependencies = [ + "log", + "phf", + "phf_codegen", + "string_cache", + "string_cache_codegen", + "tendril", +] + +[[package]] +name = "markup5ever_rcdom" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "edaa21ab3701bfee5099ade5f7e1f84553fd19228cf332f13cd6e964bf59be18" +dependencies = [ + "html5ever", + "markup5ever", + "tendril", + "xml5ever", +] + [[package]] name = "matchers" version = "0.1.0" @@ -7286,7 +7326,27 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ade2d8b8f33c7333b51bcf0428d37e217e9f32192ae4772156f65063b8ce03dc" dependencies = [ "phf_macros", - "phf_shared", + "phf_shared 0.11.2", +] + +[[package]] +name = "phf_codegen" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e8d39688d359e6b34654d328e262234662d16cc0f60ec8dcbe5e718709342a5a" +dependencies = [ + "phf_generator 0.11.2", + "phf_shared 0.11.2", +] + +[[package]] +name = "phf_generator" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d5285893bb5eb82e6aaf5d59ee909a06a16737a8970984dd7746ba9283498d6" +dependencies = [ + "phf_shared 0.10.0", + "rand 0.8.5", ] [[package]] @@ -7295,7 +7355,7 @@ version = "0.11.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "48e4cc64c2ad9ebe670cb8fd69dd50ae301650392e81c05f9bfcb2d5bdbc24b0" dependencies = [ - "phf_shared", + "phf_shared 0.11.2", "rand 
0.8.5", ] @@ -7305,13 +7365,22 @@ version = "0.11.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3444646e286606587e49f3bcf1679b8cef1dc2c5ecc29ddacaffc305180d464b" dependencies = [ - "phf_generator", - "phf_shared", + "phf_generator 0.11.2", + "phf_shared 0.11.2", "proc-macro2", "quote", "syn 2.0.59", ] +[[package]] +name = "phf_shared" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6796ad771acdc0123d2a88dc428b5e38ef24456743ddb1744ed628f9815c096" +dependencies = [ + "siphasher 0.3.11", +] + [[package]] name = "phf_shared" version = "0.11.2" @@ -7555,6 +7624,12 @@ version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" +[[package]] +name = "precomputed-hash" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c" + [[package]] name = "prettier" version = "0.1.0" @@ -8554,6 +8629,16 @@ dependencies = [ "semver", ] +[[package]] +name = "rustdoc_to_markdown" +version = "0.1.0" +dependencies = [ + "anyhow", + "html5ever", + "indoc", + "markup5ever_rcdom", +] + [[package]] name = "rustix" version = "0.37.23" @@ -9118,7 +9203,7 @@ dependencies = [ "serde_json_lenient", "smallvec", "tree-sitter", - "tree-sitter-json 0.19.0", + "tree-sitter-json", "unindent", "util", ] @@ -9802,6 +9887,32 @@ dependencies = [ "float-cmp", ] +[[package]] +name = "string_cache" +version = "0.8.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f91138e76242f575eb1d3b38b4f1362f10d3a43f47d182a5b359af488a02293b" +dependencies = [ + "new_debug_unreachable", + "once_cell", + "parking_lot", + "phf_shared 0.10.0", + "precomputed-hash", + "serde", +] + +[[package]] +name = "string_cache_codegen" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "6bb30289b722be4ff74a408c3cc27edeaad656e06cb1fe8fa9231fa59c728988" +dependencies = [ + "phf_generator 0.10.0", + "phf_shared 0.10.0", + "proc-macro2", + "quote", +] + [[package]] name = "stringprep" version = "0.1.4" @@ -10991,16 +11102,6 @@ dependencies = [ "tree-sitter", ] -[[package]] -name = "tree-sitter-json" -version = "0.19.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "90b04c4e1a92139535eb9fca4ec8fa9666cc96b618005d3ae35f3c957fa92f92" -dependencies = [ - "cc", - "tree-sitter", -] - [[package]] name = "tree-sitter-json" version = "0.20.2" @@ -12937,6 +13038,17 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "054a8e68b76250b253f671d1268cb7f1ae089ec35e195b2efb2a4e9a836d0621" +[[package]] +name = "xml5ever" +version = "0.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c376f76ed09df711203e20c3ef5ce556f0166fa03d39590016c0fd625437fad" +dependencies = [ + "log", + "mac", + "markup5ever", +] + [[package]] name = "xmlparser" version = "0.13.5" diff --git a/Cargo.toml b/Cargo.toml index de1d25ee89..b50cdf8ca6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -76,6 +76,7 @@ members = [ "crates/rich_text", "crates/rope", "crates/rpc", + "crates/rustdoc_to_markdown", "crates/task", "crates/tasks_ui", "crates/search", @@ -220,6 +221,7 @@ dev_server_projects = { path = "crates/dev_server_projects" } rich_text = { path = "crates/rich_text" } rope = { path = "crates/rope" } rpc = { path = "crates/rpc" } +rustdoc_to_markdown = { path = "crates/rustdoc_to_markdown" } task = { path = "crates/task" } tasks_ui = { path = "crates/tasks_ui" } search = { path = "crates/search" } @@ -288,6 +290,7 @@ heed = { version = "0.20.1", features = [ "read-txn-no-tls", ] } hex = "0.4.3" +html5ever = "0.27.0" ignore = "0.4.22" indoc = "1" # We explicitly disable http2 support in isahc. 
@@ -300,6 +303,7 @@ lazy_static = "1.4.0" libc = "0.2" linkify = "0.10.0" log = { version = "0.4.16", features = ["kv_unstable_serde"] } +markup5ever_rcdom = "0.3.0" nanoid = "0.4" nix = "0.28" once_cell = "1.19.0" diff --git a/crates/rustdoc_to_markdown/Cargo.toml b/crates/rustdoc_to_markdown/Cargo.toml new file mode 100644 index 0000000000..001e476be7 --- /dev/null +++ b/crates/rustdoc_to_markdown/Cargo.toml @@ -0,0 +1,20 @@ +[package] +name = "rustdoc_to_markdown" +version = "0.1.0" +edition = "2021" +publish = false +license = "GPL-3.0-or-later" + +[lints] +workspace = true + +[lib] +path = "src/rustdoc_to_markdown.rs" + +[dependencies] +anyhow.workspace = true +html5ever.workspace = true +markup5ever_rcdom.workspace = true + +[dev-dependencies] +indoc.workspace = true diff --git a/crates/rustdoc_to_markdown/LICENSE-GPL b/crates/rustdoc_to_markdown/LICENSE-GPL new file mode 120000 index 0000000000..89e542f750 --- /dev/null +++ b/crates/rustdoc_to_markdown/LICENSE-GPL @@ -0,0 +1 @@ +../../LICENSE-GPL \ No newline at end of file diff --git a/crates/rustdoc_to_markdown/examples/test.rs b/crates/rustdoc_to_markdown/examples/test.rs new file mode 100644 index 0000000000..3df85b2b1f --- /dev/null +++ b/crates/rustdoc_to_markdown/examples/test.rs @@ -0,0 +1,29 @@ +use indoc::indoc; +use rustdoc_to_markdown::convert_rustdoc_to_markdown; + +pub fn main() { + let html = indoc! {" + + +

Hello World

+

+ Here is some content. +

+

Some items

+
+
+
+    "};
+    // To test this out with some real input, try this:
+    //
+    // ```
+    // let html = include_str!("/path/to/zed/target/doc/gpui/index.html");
+    // ```
+    let markdown = convert_rustdoc_to_markdown(html).unwrap();
+
+    println!("{markdown}");
+}
diff --git a/crates/rustdoc_to_markdown/src/markdown_writer.rs b/crates/rustdoc_to_markdown/src/markdown_writer.rs
new file mode 100644
index 0000000000..59aa7e1b37
--- /dev/null
+++ b/crates/rustdoc_to_markdown/src/markdown_writer.rs
@@ -0,0 +1,227 @@
+use std::cell::RefCell;
+use std::collections::VecDeque;
+
+use anyhow::Result;
+use html5ever::Attribute;
+use markup5ever_rcdom::{Handle, NodeData};
+
+/// An HTML element recorded while walking the DOM.
+#[derive(Debug, Clone)]
+struct HtmlElement {
+    /// The element's local tag name.
+    tag: String,
+    /// The element's attributes, cloned from the DOM node.
+    attrs: RefCell<Vec<Attribute>>,
+}
+
+impl HtmlElement {
+    /// Returns whether the element's `class` attribute is exactly `class`.
+    ///
+    /// The whole attribute value is compared, so an element that lists several
+    /// classes does not match any single one of them.
+    fn class_is(&self, class: &str) -> bool {
+        self.attrs
+            .borrow()
+            .iter()
+            .any(|attr| &*attr.name.local == "class" && &*attr.value == class)
+    }
+}
+
+/// Tells `visit_node` how to proceed once an element's start tag was handled.
+enum StartTagOutcome {
+    /// Visit the element's children.
+    Continue,
+    /// Ignore the element together with its whole subtree.
+    Skip,
+}
+
+/// Walks a DOM tree and accumulates its Markdown rendering.
+pub struct MarkdownWriter {
+    /// The elements enclosing the node currently being visited, outermost first.
+    current_element_stack: VecDeque<HtmlElement>,
+    /// The Markdown output.
+    markdown: String,
+}
+
+impl MarkdownWriter {
+    /// Creates a writer with an empty element stack and empty output.
+    pub fn new() -> Self {
+        Self {
+            current_element_stack: VecDeque::new(),
+            markdown: String::new(),
+        }
+    }
+
+    /// Returns whether any enclosing element has the given tag name.
+    fn is_inside(&self, tag: &str) -> bool {
+        self.current_element_stack
+            .iter()
+            .any(|parent_element| parent_element.tag == tag)
+    }
+
+    /// Returns whether any enclosing element is a heading (`h1` through `h6`).
+    fn is_inside_heading(&self) -> bool {
+        ["h1", "h2", "h3", "h4", "h5", "h6"]
+            .into_iter()
+            .any(|heading| self.is_inside(heading))
+    }
+
+    /// Appends the given string slice onto the end of the Markdown output.
+    fn push_str(&mut self, str: &str) {
+        self.markdown.push_str(str);
+    }
+
+    /// Appends a newline to the end of the Markdown output.
+    fn push_newline(&mut self) {
+        self.push_str("\n");
+    }
+
+    /// Converts the tree rooted at `root_node` and returns the resulting
+    /// Markdown with leading and trailing whitespace trimmed.
+    pub fn run(mut self, root_node: &Handle) -> Result<String> {
+        self.visit_node(root_node)?;
+        Ok(self.markdown.trim().to_string())
+    }
+
+    /// Emits Markdown for `node` and, unless the node is skipped, its children.
+    fn visit_node(&mut self, node: &Handle) -> Result<()> {
+        let mut current_element = None;
+
+        match node.data {
+            NodeData::Document
+            | NodeData::Doctype { .. }
+            | NodeData::ProcessingInstruction { .. }
+            | NodeData::Comment { .. } => {
+                // Currently left unimplemented, as we're not interested in this data
+                // at this time.
+            }
+            NodeData::Element {
+                ref name,
+                ref attrs,
+                ..
+            } => {
+                let tag_name = name.local.to_string();
+                if !tag_name.is_empty() {
+                    current_element = Some(HtmlElement {
+                        tag: tag_name,
+                        attrs: attrs.clone(),
+                    });
+                }
+            }
+            NodeData::Text { ref contents } => {
+                self.visit_text(&contents.borrow())?;
+            }
+        }
+
+        // Only elements are pushed onto the stack. Remember whether this call
+        // pushed one so that exactly that entry is popped again below.
+        let entered_element = match current_element {
+            Some(element) => {
+                if let StartTagOutcome::Skip = self.start_tag(&element) {
+                    return Ok(());
+                }
+                self.current_element_stack.push_back(element);
+                true
+            }
+            None => false,
+        };
+
+        for child in node.children.borrow().iter() {
+            self.visit_node(child)?;
+        }
+
+        // Popping unconditionally would let a text or comment node remove its
+        // parent from the stack, so the following siblings would no longer be
+        // treated as being inside `pre`, a heading, and so on.
+        if entered_element {
+            if let Some(element) = self.current_element_stack.pop_back() {
+                self.end_tag(&element);
+            }
+        }
+
+        Ok(())
+    }
+
+    /// Writes the Markdown that opens `tag` and reports whether the element's
+    /// subtree should be visited.
+    fn start_tag(&mut self, tag: &HtmlElement) -> StartTagOutcome {
+        match tag.tag.as_str() {
+            "head" | "script" | "nav" => return StartTagOutcome::Skip,
+            "h1" => self.push_str("\n# "),
+            "h2" => self.push_str("\n## "),
+            "h3" => self.push_str("\n### "),
+            "h4" => self.push_str("\n#### "),
+            "h5" => self.push_str("\n##### "),
+            "h6" => self.push_str("\n###### "),
+            "code" => {
+                // Inside `pre` the code fence already marks the text as code.
+                if !self.is_inside("pre") {
+                    self.push_str("`")
+                }
+            }
+            "pre" => self.push_str("\n```\n"),
+            "ul" | "ol" => self.push_newline(),
+            "li" => self.push_str("- "),
+            "summary" => {
+                if tag.class_is("hideme") {
+                    return StartTagOutcome::Skip;
+                }
+            }
+            "div" | "span" => {
+                if tag.class_is("sidebar-elems") || tag.class_is("out-of-band") {
+                    return StartTagOutcome::Skip;
+                }
+
+                if tag.class_is("item-name") {
+                    self.push_str("`");
+                }
+            }
+            _ => {}
+        }
+
+        StartTagOutcome::Continue
+    }
+
+    /// Writes the Markdown that closes `tag`.
+    fn end_tag(&mut self, tag: &HtmlElement) {
+        match tag.tag.as_str() {
+            "h1" | "h2" | "h3" | "h4" | "h5" | "h6" => self.push_str("\n\n"),
+            "code" => {
+                if !self.is_inside("pre") {
+                    self.push_str("`")
+                }
+            }
+            "pre" => self.push_str("\n```\n"),
+            "ul" | "ol" => self.push_newline(),
+            "li" => self.push_newline(),
+            // `start_tag` opens the backtick for both tags, so close it for both.
+            "div" | "span" => {
+                if tag.class_is("item-name") {
+                    self.push_str("`: ");
+                }
+            }
+            _ => {}
+        }
+    }
+
+    /// Appends the contents of a text node to the output.
+    ///
+    /// Text inside `pre` is copied verbatim, and text inside a link within a
+    /// heading is dropped. Any other text has leading and trailing newlines,
+    /// carriage returns and `§` characters removed before it is appended.
+    fn visit_text(&mut self, text: &str) -> Result<()> {
+        if self.is_inside("pre") {
+            self.push_str(text);
+            return Ok(());
+        }
+
+        if self.is_inside_heading() && self.is_inside("a") {
+            return Ok(());
+        }
+
+        let trimmed_text = text.trim_matches(|char| char == '\n' || char == '\r' || char == '§');
+        self.push_str(trimmed_text);
+
+        Ok(())
+    }
+}
diff --git a/crates/rustdoc_to_markdown/src/rustdoc_to_markdown.rs b/crates/rustdoc_to_markdown/src/rustdoc_to_markdown.rs
new file mode 100644
index 0000000000..d3afe2a264
--- /dev/null
+++ b/crates/rustdoc_to_markdown/src/rustdoc_to_markdown.rs
@@ -0,0 +1,36 @@
+//! Provides conversion from rustdoc's HTML output to Markdown.
+
+#![deny(missing_docs)]
+
+mod markdown_writer;
+
+use anyhow::{Context, Result};
+use html5ever::driver::ParseOpts;
+use html5ever::parse_document;
+use html5ever::tendril::TendrilSink;
+use html5ever::tree_builder::TreeBuilderOpts;
+use markup5ever_rcdom::RcDom;
+
+use crate::markdown_writer::MarkdownWriter;
+
+/// Converts the provided rustdoc HTML to Markdown.
+pub fn convert_rustdoc_to_markdown(html: &str) -> Result<String> {
+    let parse_options = ParseOpts {
+        tree_builder: TreeBuilderOpts {
+            drop_doctype: true, // the Markdown writer ignores doctype nodes anyway
+            ..Default::default()
+        },
+        ..Default::default()
+    };
+    let dom = parse_document(RcDom::default(), parse_options)
+        .from_utf8()
+        .read_from(&mut html.as_bytes())
+        .context("failed to parse rustdoc HTML")?;
+
+    let markdown_writer = MarkdownWriter::new();
+    let markdown = markdown_writer
+        .run(&dom.document)
+        .context("failed to convert rustdoc to Markdown")?; // the output is Markdown, not HTML
+
+    Ok(markdown)
+}