Add rustdoc_to_markdown crate (#12445)

This PR adds a new crate for converting rustdoc output to Markdown.

We're leveraging Servo's `html5ever` to parse the Markdown content, and
then walking the DOM nodes to convert it to a Markdown string.

The Markdown output will be continued to be refined, but it's in a place
where it should be reasonable.

Release Notes:

- N/A
This commit is contained in:
Marshall Bowers 2024-05-29 16:05:16 -04:00 committed by GitHub
parent a22cd95f9d
commit 5bcb9ed017
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 420 additions and 17 deletions

146
Cargo.lock generated
View File

@ -5060,6 +5060,20 @@ version = "3.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4d13cdbd5dbb29f9c88095bbdc2590c9cba0d0a1269b983fef6b2cdd7e9f4db1"
[[package]]
name = "html5ever"
version = "0.27.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c13771afe0e6e846f1e67d038d4cb29998a6779f93c809212e4e9c32efd244d4"
dependencies = [
"log",
"mac",
"markup5ever",
"proc-macro2",
"quote",
"syn 2.0.59",
]
[[package]]
name = "http"
version = "0.1.0"
@ -5719,7 +5733,7 @@ dependencies = [
"tree-sitter-embedded-template",
"tree-sitter-heex",
"tree-sitter-html",
"tree-sitter-json 0.20.2",
"tree-sitter-json",
"tree-sitter-markdown",
"tree-sitter-ruby",
"tree-sitter-rust",
@ -5809,7 +5823,7 @@ dependencies = [
"tree-sitter-gomod",
"tree-sitter-gowork",
"tree-sitter-jsdoc",
"tree-sitter-json 0.20.2",
"tree-sitter-json",
"tree-sitter-markdown",
"tree-sitter-proto",
"tree-sitter-python",
@ -6181,6 +6195,32 @@ dependencies = [
"workspace",
]
[[package]]
name = "markup5ever"
version = "0.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "16ce3abbeba692c8b8441d036ef91aea6df8da2c6b6e21c7e14d3c18e526be45"
dependencies = [
"log",
"phf",
"phf_codegen",
"string_cache",
"string_cache_codegen",
"tendril",
]
[[package]]
name = "markup5ever_rcdom"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "edaa21ab3701bfee5099ade5f7e1f84553fd19228cf332f13cd6e964bf59be18"
dependencies = [
"html5ever",
"markup5ever",
"tendril",
"xml5ever",
]
[[package]]
name = "matchers"
version = "0.1.0"
@ -7286,7 +7326,27 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ade2d8b8f33c7333b51bcf0428d37e217e9f32192ae4772156f65063b8ce03dc"
dependencies = [
"phf_macros",
"phf_shared",
"phf_shared 0.11.2",
]
[[package]]
name = "phf_codegen"
version = "0.11.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e8d39688d359e6b34654d328e262234662d16cc0f60ec8dcbe5e718709342a5a"
dependencies = [
"phf_generator 0.11.2",
"phf_shared 0.11.2",
]
[[package]]
name = "phf_generator"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5d5285893bb5eb82e6aaf5d59ee909a06a16737a8970984dd7746ba9283498d6"
dependencies = [
"phf_shared 0.10.0",
"rand 0.8.5",
]
[[package]]
@ -7295,7 +7355,7 @@ version = "0.11.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "48e4cc64c2ad9ebe670cb8fd69dd50ae301650392e81c05f9bfcb2d5bdbc24b0"
dependencies = [
"phf_shared",
"phf_shared 0.11.2",
"rand 0.8.5",
]
@ -7305,13 +7365,22 @@ version = "0.11.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3444646e286606587e49f3bcf1679b8cef1dc2c5ecc29ddacaffc305180d464b"
dependencies = [
"phf_generator",
"phf_shared",
"phf_generator 0.11.2",
"phf_shared 0.11.2",
"proc-macro2",
"quote",
"syn 2.0.59",
]
[[package]]
name = "phf_shared"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b6796ad771acdc0123d2a88dc428b5e38ef24456743ddb1744ed628f9815c096"
dependencies = [
"siphasher 0.3.11",
]
[[package]]
name = "phf_shared"
version = "0.11.2"
@ -7555,6 +7624,12 @@ version = "0.2.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de"
[[package]]
name = "precomputed-hash"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c"
[[package]]
name = "prettier"
version = "0.1.0"
@ -8554,6 +8629,16 @@ dependencies = [
"semver",
]
[[package]]
name = "rustdoc_to_markdown"
version = "0.1.0"
dependencies = [
"anyhow",
"html5ever",
"indoc",
"markup5ever_rcdom",
]
[[package]]
name = "rustix"
version = "0.37.23"
@ -9118,7 +9203,7 @@ dependencies = [
"serde_json_lenient",
"smallvec",
"tree-sitter",
"tree-sitter-json 0.19.0",
"tree-sitter-json",
"unindent",
"util",
]
@ -9802,6 +9887,32 @@ dependencies = [
"float-cmp",
]
[[package]]
name = "string_cache"
version = "0.8.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f91138e76242f575eb1d3b38b4f1362f10d3a43f47d182a5b359af488a02293b"
dependencies = [
"new_debug_unreachable",
"once_cell",
"parking_lot",
"phf_shared 0.10.0",
"precomputed-hash",
"serde",
]
[[package]]
name = "string_cache_codegen"
version = "0.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6bb30289b722be4ff74a408c3cc27edeaad656e06cb1fe8fa9231fa59c728988"
dependencies = [
"phf_generator 0.10.0",
"phf_shared 0.10.0",
"proc-macro2",
"quote",
]
[[package]]
name = "stringprep"
version = "0.1.4"
@ -10991,16 +11102,6 @@ dependencies = [
"tree-sitter",
]
[[package]]
name = "tree-sitter-json"
version = "0.19.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "90b04c4e1a92139535eb9fca4ec8fa9666cc96b618005d3ae35f3c957fa92f92"
dependencies = [
"cc",
"tree-sitter",
]
[[package]]
name = "tree-sitter-json"
version = "0.20.2"
@ -12937,6 +13038,17 @@ version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "054a8e68b76250b253f671d1268cb7f1ae089ec35e195b2efb2a4e9a836d0621"
[[package]]
name = "xml5ever"
version = "0.18.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7c376f76ed09df711203e20c3ef5ce556f0166fa03d39590016c0fd625437fad"
dependencies = [
"log",
"mac",
"markup5ever",
]
[[package]]
name = "xmlparser"
version = "0.13.5"

View File

@ -76,6 +76,7 @@ members = [
"crates/rich_text",
"crates/rope",
"crates/rpc",
"crates/rustdoc_to_markdown",
"crates/task",
"crates/tasks_ui",
"crates/search",
@ -220,6 +221,7 @@ dev_server_projects = { path = "crates/dev_server_projects" }
rich_text = { path = "crates/rich_text" }
rope = { path = "crates/rope" }
rpc = { path = "crates/rpc" }
rustdoc_to_markdown = { path = "crates/rustdoc_to_markdown" }
task = { path = "crates/task" }
tasks_ui = { path = "crates/tasks_ui" }
search = { path = "crates/search" }
@ -288,6 +290,7 @@ heed = { version = "0.20.1", features = [
"read-txn-no-tls",
] }
hex = "0.4.3"
html5ever = "0.27.0"
ignore = "0.4.22"
indoc = "1"
# We explicitly disable http2 support in isahc.
@ -300,6 +303,7 @@ lazy_static = "1.4.0"
libc = "0.2"
linkify = "0.10.0"
log = { version = "0.4.16", features = ["kv_unstable_serde"] }
markup5ever_rcdom = "0.3.0"
nanoid = "0.4"
nix = "0.28"
once_cell = "1.19.0"

View File

@ -0,0 +1,20 @@
[package]
name = "rustdoc_to_markdown"
version = "0.1.0"
edition = "2021"
publish = false
license = "GPL-3.0-or-later"
[lints]
workspace = true
[lib]
path = "src/rustdoc_to_markdown.rs"
[dependencies]
anyhow.workspace = true
html5ever.workspace = true
markup5ever_rcdom.workspace = true
[dev-dependencies]
indoc.workspace = true

View File

@ -0,0 +1 @@
../../LICENSE-GPL

View File

@ -0,0 +1,29 @@
use indoc::indoc;
use rustdoc_to_markdown::convert_rustdoc_to_markdown;
pub fn main() {
let html = indoc! {"
<html>
<body>
<h1>Hello World</h1>
<p>
Here is some content.
</p>
<h2>Some items</h2>
<ul>
<li>One</li>
<li>Two</li>
<li>Three</li>
</ul>
</body>
</html>
"};
// To test this out with some real input, try this:
//
// ```
// let html = include_str!("/path/to/zed/target/doc/gpui/index.html");
// ```
let markdown = convert_rustdoc_to_markdown(html).unwrap();
println!("{markdown}");
}

View File

@ -0,0 +1,201 @@
use std::cell::RefCell;
use std::collections::VecDeque;
use anyhow::Result;
use html5ever::Attribute;
use markup5ever_rcdom::{Handle, NodeData};
#[derive(Debug, Clone)]
struct HtmlElement {
tag: String,
attrs: RefCell<Vec<Attribute>>,
}
enum StartTagOutcome {
Continue,
Skip,
}
pub struct MarkdownWriter {
current_element_stack: VecDeque<HtmlElement>,
/// The Markdown output.
markdown: String,
}
impl MarkdownWriter {
pub fn new() -> Self {
Self {
current_element_stack: VecDeque::new(),
markdown: String::new(),
}
}
fn is_inside(&self, tag: &str) -> bool {
self.current_element_stack
.iter()
.any(|parent_element| parent_element.tag == tag)
}
fn is_inside_heading(&self) -> bool {
["h1", "h2", "h3", "h4", "h5", "h6"]
.into_iter()
.any(|heading| self.is_inside(heading))
}
/// Appends the given string slice onto the end of the Markdown output.
fn push_str(&mut self, str: &str) {
self.markdown.push_str(str);
}
/// Appends a newline to the end of the Markdown output.
fn push_newline(&mut self) {
self.push_str("\n");
}
pub fn run(mut self, root_node: &Handle) -> Result<String> {
self.visit_node(&root_node)?;
Ok(self.markdown.trim().to_string())
}
fn visit_node(&mut self, node: &Handle) -> Result<()> {
let mut current_element = None;
match node.data {
NodeData::Document
| NodeData::Doctype { .. }
| NodeData::ProcessingInstruction { .. }
| NodeData::Comment { .. } => {
// Currently left unimplemented, as we're not interested in this data
// at this time.
}
NodeData::Element {
ref name,
ref attrs,
..
} => {
let tag_name = name.local.to_string();
if !tag_name.is_empty() {
current_element = Some(HtmlElement {
tag: tag_name,
attrs: attrs.clone(),
});
}
}
NodeData::Text { ref contents } => {
let text = contents.borrow().to_string();
self.visit_text(text)?;
}
}
if let Some(current_element) = current_element.as_ref() {
match self.start_tag(&current_element) {
StartTagOutcome::Continue => {}
StartTagOutcome::Skip => return Ok(()),
}
self.current_element_stack
.push_back(current_element.clone());
}
for child in node.children.borrow().iter() {
self.visit_node(child)?;
}
self.current_element_stack.pop_back();
if let Some(current_element) = current_element {
self.end_tag(&current_element);
}
Ok(())
}
fn start_tag(&mut self, tag: &HtmlElement) -> StartTagOutcome {
match tag.tag.as_str() {
"head" | "script" | "nav" => return StartTagOutcome::Skip,
"h1" => self.push_str("\n# "),
"h2" => self.push_str("\n## "),
"h3" => self.push_str("\n### "),
"h4" => self.push_str("\n#### "),
"h5" => self.push_str("\n##### "),
"h6" => self.push_str("\n###### "),
"code" => {
if !self.is_inside("pre") {
self.push_str("`")
}
}
"pre" => self.push_str("\n```\n"),
"ul" | "ol" => self.push_newline(),
"li" => self.push_str("- "),
"summary" => {
if tag.attrs.borrow().iter().any(|attr| {
attr.name.local.to_string() == "class" && attr.value.to_string() == "hideme"
}) {
return StartTagOutcome::Skip;
}
}
"div" | "span" => {
if tag.attrs.borrow().iter().any(|attr| {
attr.name.local.to_string() == "class"
&& attr.value.to_string() == "sidebar-elems"
}) {
return StartTagOutcome::Skip;
}
if tag.attrs.borrow().iter().any(|attr| {
attr.name.local.to_string() == "class"
&& attr.value.to_string() == "out-of-band"
}) {
return StartTagOutcome::Skip;
}
if tag.attrs.borrow().iter().any(|attr| {
attr.name.local.to_string() == "class" && attr.value.to_string() == "item-name"
}) {
self.push_str("`");
}
}
_ => {}
}
StartTagOutcome::Continue
}
fn end_tag(&mut self, tag: &HtmlElement) {
match tag.tag.as_str() {
"h1" | "h2" | "h3" | "h4" | "h5" | "h6" => self.push_str("\n\n"),
"code" => {
if !self.is_inside("pre") {
self.push_str("`")
}
}
"pre" => self.push_str("\n```\n"),
"ul" | "ol" => self.push_newline(),
"li" => self.push_newline(),
"div" => {
if tag.attrs.borrow().iter().any(|attr| {
attr.name.local.to_string() == "class" && attr.value.to_string() == "item-name"
}) {
self.push_str("`: ");
}
}
_ => {}
}
}
fn visit_text(&mut self, text: String) -> Result<()> {
if self.is_inside("pre") {
self.push_str(&text);
return Ok(());
}
if self.is_inside_heading() && self.is_inside("a") {
return Ok(());
}
let trimmed_text = text.trim_matches(|char| char == '\n' || char == '\r' || char == '§');
self.push_str(trimmed_text);
Ok(())
}
}

View File

@ -0,0 +1,36 @@
//! Provides conversion from rustdoc's HTML output to Markdown.
#![deny(missing_docs)]
mod markdown_writer;
use anyhow::{Context, Result};
use html5ever::driver::ParseOpts;
use html5ever::parse_document;
use html5ever::tendril::TendrilSink;
use html5ever::tree_builder::TreeBuilderOpts;
use markup5ever_rcdom::RcDom;
use crate::markdown_writer::MarkdownWriter;
/// Converts the provided rustdoc HTML to Markdown.
pub fn convert_rustdoc_to_markdown(html: &str) -> Result<String> {
let parse_options = ParseOpts {
tree_builder: TreeBuilderOpts {
drop_doctype: true,
..Default::default()
},
..Default::default()
};
let dom = parse_document(RcDom::default(), parse_options)
.from_utf8()
.read_from(&mut html.as_bytes())
.context("failed to parse rustdoc HTML")?;
let markdown_writer = MarkdownWriter::new();
let markdown = markdown_writer
.run(&dom.document)
.context("failed to convert rustdoc to HTML")?;
Ok(markdown)
}