diff --git a/Cargo.lock b/Cargo.lock index fb6ad1f833..a9394d1ed1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -353,6 +353,7 @@ dependencies = [ "fuzzy", "gpui", "heed", + "html_to_markdown", "http 0.1.0", "indoc", "language", @@ -367,7 +368,6 @@ dependencies = [ "rand 0.8.5", "regex", "rope", - "rustdoc_to_markdown", "schemars", "search", "semantic_index", @@ -5067,6 +5067,18 @@ dependencies = [ "syn 2.0.59", ] +[[package]] +name = "html_to_markdown" +version = "0.1.0" +dependencies = [ + "anyhow", + "html5ever", + "indoc", + "markup5ever_rcdom", + "pretty_assertions", + "regex", +] + [[package]] name = "http" version = "0.1.0" @@ -8618,18 +8630,6 @@ dependencies = [ "semver", ] -[[package]] -name = "rustdoc_to_markdown" -version = "0.1.0" -dependencies = [ - "anyhow", - "html5ever", - "indoc", - "markup5ever_rcdom", - "pretty_assertions", - "regex", -] - [[package]] name = "rustix" version = "0.37.23" diff --git a/Cargo.toml b/Cargo.toml index 7c3cf3762b..cb58423a81 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -41,6 +41,7 @@ members = [ "crates/gpui", "crates/gpui_macros", "crates/headless", + "crates/html_to_markdown", "crates/http", "crates/image_viewer", "crates/inline_completion_button", @@ -76,7 +77,6 @@ members = [ "crates/rich_text", "crates/rope", "crates/rpc", - "crates/rustdoc_to_markdown", "crates/task", "crates/tasks_ui", "crates/search", @@ -187,6 +187,7 @@ google_ai = { path = "crates/google_ai" } gpui = { path = "crates/gpui" } gpui_macros = { path = "crates/gpui_macros" } headless = { path = "crates/headless" } +html_to_markdown = { path = "crates/html_to_markdown" } http = { path = "crates/http" } install_cli = { path = "crates/install_cli" } image_viewer = { path = "crates/image_viewer" } @@ -223,7 +224,6 @@ dev_server_projects = { path = "crates/dev_server_projects" } rich_text = { path = "crates/rich_text" } rope = { path = "crates/rope" } rpc = { path = "crates/rpc" } -rustdoc_to_markdown = { path = "crates/rustdoc_to_markdown" } task = { path = "crates/task" } tasks_ui = { path = "crates/tasks_ui" } search = { path = "crates/search" } diff --git a/crates/assistant/Cargo.toml b/crates/assistant/Cargo.toml index 029da5d553..06df24d69a 100644 --- a/crates/assistant/Cargo.toml +++ b/crates/assistant/Cargo.toml @@ -28,6 +28,7 @@ futures.workspace = true fuzzy.workspace = true gpui.workspace = true heed.workspace = true +html_to_markdown.workspace = true http.workspace = true indoc.workspace = true language.workspace = true @@ -40,7 +41,6 @@ parking_lot.workspace = true project.workspace = true regex.workspace = true rope.workspace = true -rustdoc_to_markdown.workspace = true schemars.workspace = true search.workspace = true semantic_index.workspace = true diff --git a/crates/assistant/src/slash_command/fetch_command.rs b/crates/assistant/src/slash_command/fetch_command.rs index 6bd870c1b6..37483cbb1a 100644 --- a/crates/assistant/src/slash_command/fetch_command.rs +++ b/crates/assistant/src/slash_command/fetch_command.rs @@ -5,9 +5,9 @@ use anyhow::{anyhow, bail, Context, Result}; use assistant_slash_command::{SlashCommand, SlashCommandOutput, SlashCommandOutputSection}; use futures::AsyncReadExt; use gpui::{AppContext, Task, WeakView}; +use html_to_markdown::convert_html_to_markdown; use http::{AsyncBody, HttpClient, HttpClientWithUrl}; use language::LspAdapterDelegate; -use rustdoc_to_markdown::convert_html_to_markdown; use ui::{prelude::*, ButtonLike, ElevationIndex}; use workspace::Workspace; diff --git a/crates/assistant/src/slash_command/rustdoc_command.rs b/crates/assistant/src/slash_command/rustdoc_command.rs index 85015798b2..cf48dc28dc 100644 --- a/crates/assistant/src/slash_command/rustdoc_command.rs +++ b/crates/assistant/src/slash_command/rustdoc_command.rs @@ -7,10 +7,10 @@ use assistant_slash_command::{SlashCommand, SlashCommandOutput, SlashCommandOutp use fs::Fs; use futures::AsyncReadExt; use gpui::{AppContext, Model, Task, WeakView}; +use html_to_markdown::convert_rustdoc_to_markdown; use http::{AsyncBody, HttpClient, HttpClientWithUrl}; use language::LspAdapterDelegate; use project::{Project, ProjectPath}; -use rustdoc_to_markdown::convert_rustdoc_to_markdown; use ui::{prelude::*, ButtonLike, ElevationIndex}; use workspace::Workspace; diff --git a/crates/rustdoc_to_markdown/Cargo.toml b/crates/html_to_markdown/Cargo.toml similarity index 83% rename from crates/rustdoc_to_markdown/Cargo.toml rename to crates/html_to_markdown/Cargo.toml index 58a60bc7bf..bac60ef9a6 100644 --- a/crates/rustdoc_to_markdown/Cargo.toml +++ b/crates/html_to_markdown/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "rustdoc_to_markdown" +name = "html_to_markdown" version = "0.1.0" edition = "2021" publish = false @@ -9,7 +9,7 @@ license = "GPL-3.0-or-later" workspace = true [lib] -path = "src/rustdoc_to_markdown.rs" +path = "src/html_to_markdown.rs" [dependencies] anyhow.workspace = true diff --git a/crates/rustdoc_to_markdown/LICENSE-GPL b/crates/html_to_markdown/LICENSE-GPL similarity index 100% rename from crates/rustdoc_to_markdown/LICENSE-GPL rename to crates/html_to_markdown/LICENSE-GPL diff --git a/crates/rustdoc_to_markdown/examples/test.rs b/crates/html_to_markdown/examples/test.rs similarity index 92% rename from crates/rustdoc_to_markdown/examples/test.rs rename to crates/html_to_markdown/examples/test.rs index 38a85874df..3937a7b3b3 100644 --- a/crates/rustdoc_to_markdown/examples/test.rs +++ b/crates/html_to_markdown/examples/test.rs @@ -1,5 +1,5 @@ +use html_to_markdown::convert_rustdoc_to_markdown; use indoc::indoc; -use rustdoc_to_markdown::convert_rustdoc_to_markdown; pub fn main() { let html = indoc! {" diff --git a/crates/rustdoc_to_markdown/src/html_element.rs b/crates/html_to_markdown/src/html_element.rs similarity index 100% rename from crates/rustdoc_to_markdown/src/html_element.rs rename to crates/html_to_markdown/src/html_element.rs diff --git a/crates/rustdoc_to_markdown/src/rustdoc_to_markdown.rs b/crates/html_to_markdown/src/html_to_markdown.rs similarity index 93% rename from crates/rustdoc_to_markdown/src/rustdoc_to_markdown.rs rename to crates/html_to_markdown/src/html_to_markdown.rs index 05d0b53128..34eec2b001 100644 --- a/crates/rustdoc_to_markdown/src/rustdoc_to_markdown.rs +++ b/crates/html_to_markdown/src/html_to_markdown.rs @@ -3,7 +3,9 @@ #![deny(missing_docs)] mod html_element; +mod markdown; mod markdown_writer; +mod structure; use std::io::Read; @@ -14,15 +16,28 @@ use html5ever::tendril::TendrilSink; use html5ever::tree_builder::TreeBuilderOpts; use markup5ever_rcdom::RcDom; -use crate::markdown_writer::MarkdownWriter; +use crate::markdown::{HeadingHandler, ListHandler, ParagraphHandler, StyledTextHandler}; +use crate::markdown_writer::{HandleTag, MarkdownWriter}; /// Converts the provided HTML to Markdown. pub fn convert_html_to_markdown(html: impl Read) -> Result { let dom = parse_html(html).context("failed to parse HTML")?; + let handlers: Vec> = vec![ + Box::new(ParagraphHandler), + Box::new(HeadingHandler), + Box::new(ListHandler), + Box::new(StyledTextHandler), + Box::new(structure::rustdoc::RustdocChromeRemover), + Box::new(structure::rustdoc::RustdocHeadingHandler), + Box::new(structure::rustdoc::RustdocCodeHandler), + Box::new(structure::rustdoc::RustdocTableHandler::new()), + Box::new(structure::rustdoc::RustdocItemHandler), + ]; + let markdown_writer = MarkdownWriter::new(); let markdown = markdown_writer - .run(&dom.document) + .run(&dom.document, handlers) .context("failed to convert HTML to Markdown")?; Ok(markdown) @@ -32,9 +47,21 @@ pub fn convert_html_to_markdown(html: impl Read) -> Result { pub fn convert_rustdoc_to_markdown(html: impl Read) -> Result { let dom = parse_html(html).context("failed to parse rustdoc HTML")?; + let handlers: Vec> = vec![ + Box::new(ParagraphHandler), + Box::new(HeadingHandler), + Box::new(ListHandler), + Box::new(StyledTextHandler), + Box::new(structure::rustdoc::RustdocChromeRemover), + Box::new(structure::rustdoc::RustdocHeadingHandler), + Box::new(structure::rustdoc::RustdocCodeHandler), + Box::new(structure::rustdoc::RustdocTableHandler::new()), + Box::new(structure::rustdoc::RustdocItemHandler), + ]; + let markdown_writer = MarkdownWriter::new(); let markdown = markdown_writer - .run(&dom.document) + .run(&dom.document, handlers) .context("failed to convert rustdoc HTML to Markdown")?; Ok(markdown) diff --git a/crates/html_to_markdown/src/markdown.rs b/crates/html_to_markdown/src/markdown.rs new file mode 100644 index 0000000000..f6af5794b5 --- /dev/null +++ b/crates/html_to_markdown/src/markdown.rs @@ -0,0 +1,135 @@ +use crate::html_element::HtmlElement; +use crate::markdown_writer::{HandleTag, MarkdownWriter, StartTagOutcome}; + +pub struct ParagraphHandler; + +impl HandleTag for ParagraphHandler { + fn should_handle(&self, _tag: &str) -> bool { + true + } + + fn handle_tag_start( + &mut self, + tag: &HtmlElement, + writer: &mut MarkdownWriter, + ) -> StartTagOutcome { + if tag.is_inline() && writer.is_inside("p") { + if let Some(parent) = writer.current_element_stack().iter().last() { + if !parent.is_inline() { + if !(writer.markdown.ends_with(' ') || writer.markdown.ends_with('\n')) { + writer.push_str(" "); + } + } + } + } + + match tag.tag.as_str() { + "p" => writer.push_blank_line(), + _ => {} + } + + StartTagOutcome::Continue + } +} + +pub struct HeadingHandler; + +impl HandleTag for HeadingHandler { + fn should_handle(&self, tag: &str) -> bool { + match tag { + "h1" | "h2" | "h3" | "h4" | "h5" | "h6" => true, + _ => false, + } + } + + fn handle_tag_start( + &mut self, + tag: &HtmlElement, + writer: &mut MarkdownWriter, + ) -> StartTagOutcome { + match tag.tag.as_str() { + "h1" => writer.push_str("\n\n# "), + "h2" => writer.push_str("\n\n## "), + "h3" => writer.push_str("\n\n### "), + "h4" => writer.push_str("\n\n#### "), + "h5" => writer.push_str("\n\n##### "), + "h6" => writer.push_str("\n\n###### "), + _ => {} + } + + StartTagOutcome::Continue + } + + fn handle_tag_end(&mut self, tag: &HtmlElement, writer: &mut MarkdownWriter) { + match tag.tag.as_str() { + "h1" | "h2" | "h3" | "h4" | "h5" | "h6" => writer.push_blank_line(), + _ => {} + } + } +} + +pub struct ListHandler; + +impl HandleTag for ListHandler { + fn should_handle(&self, tag: &str) -> bool { + match tag { + "ul" | "ol" | "li" => true, + _ => false, + } + } + + fn handle_tag_start( + &mut self, + tag: &HtmlElement, + writer: &mut MarkdownWriter, + ) -> StartTagOutcome { + match tag.tag.as_str() { + "ul" | "ol" => writer.push_newline(), + "li" => writer.push_str("- "), + _ => {} + } + + StartTagOutcome::Continue + } + + fn handle_tag_end(&mut self, tag: &HtmlElement, writer: &mut MarkdownWriter) { + match tag.tag.as_str() { + "ul" | "ol" => writer.push_newline(), + "li" => writer.push_newline(), + _ => {} + } + } +} + +pub struct StyledTextHandler; + +impl HandleTag for StyledTextHandler { + fn should_handle(&self, tag: &str) -> bool { + match tag { + "strong" | "em" => true, + _ => false, + } + } + + fn handle_tag_start( + &mut self, + tag: &HtmlElement, + writer: &mut MarkdownWriter, + ) -> StartTagOutcome { + match tag.tag.as_str() { + "strong" => writer.push_str("**"), + "em" => writer.push_str("_"), + _ => {} + } + + StartTagOutcome::Continue + } + + fn handle_tag_end(&mut self, tag: &HtmlElement, writer: &mut MarkdownWriter) { + match tag.tag.as_str() { + "strong" => writer.push_str("**"), + "em" => writer.push_str("_"), + _ => {} + } + } +} diff --git a/crates/html_to_markdown/src/markdown_writer.rs b/crates/html_to_markdown/src/markdown_writer.rs new file mode 100644 index 0000000000..436f895d7e --- /dev/null +++ b/crates/html_to_markdown/src/markdown_writer.rs @@ -0,0 +1,198 @@ +use std::collections::VecDeque; +use std::sync::OnceLock; + +use anyhow::Result; +use markup5ever_rcdom::{Handle, NodeData}; +use regex::Regex; + +use crate::html_element::HtmlElement; + +fn empty_line_regex() -> &'static Regex { + static REGEX: OnceLock = OnceLock::new(); + REGEX.get_or_init(|| Regex::new(r"^\s*$").unwrap()) +} + +fn more_than_three_newlines_regex() -> &'static Regex { + static REGEX: OnceLock = OnceLock::new(); + REGEX.get_or_init(|| Regex::new(r"\n{3,}").unwrap()) +} + +pub enum StartTagOutcome { + Continue, + Skip, +} + +pub struct MarkdownWriter { + current_element_stack: VecDeque, + pub(crate) markdown: String, +} + +impl MarkdownWriter { + pub fn new() -> Self { + Self { + current_element_stack: VecDeque::new(), + markdown: String::new(), + } + } + + pub fn current_element_stack(&self) -> &VecDeque { + &self.current_element_stack + } + + pub fn is_inside(&self, tag: &str) -> bool { + self.current_element_stack + .iter() + .any(|parent_element| parent_element.tag == tag) + } + + /// Appends the given string slice onto the end of the Markdown output. + pub fn push_str(&mut self, str: &str) { + self.markdown.push_str(str); + } + + /// Appends a newline to the end of the Markdown output. + pub fn push_newline(&mut self) { + self.push_str("\n"); + } + + /// Appends a blank line to the end of the Markdown output. + pub fn push_blank_line(&mut self) { + self.push_str("\n\n"); + } + + pub fn run( + mut self, + root_node: &Handle, + mut handlers: Vec>, + ) -> Result { + self.visit_node(&root_node, &mut handlers)?; + Ok(Self::prettify_markdown(self.markdown)) + } + + fn prettify_markdown(markdown: String) -> String { + let markdown = empty_line_regex().replace_all(&markdown, ""); + let markdown = more_than_three_newlines_regex().replace_all(&markdown, "\n\n"); + + markdown.trim().to_string() + } + + fn visit_node(&mut self, node: &Handle, handlers: &mut [Box]) -> Result<()> { + let mut current_element = None; + + match node.data { + NodeData::Document + | NodeData::Doctype { .. } + | NodeData::ProcessingInstruction { .. } + | NodeData::Comment { .. } => { + // Currently left unimplemented, as we're not interested in this data + // at this time. + } + NodeData::Element { + ref name, + ref attrs, + .. + } => { + let tag_name = name.local.to_string(); + if !tag_name.is_empty() { + current_element = Some(HtmlElement { + tag: tag_name, + attrs: attrs.clone(), + }); + } + } + NodeData::Text { ref contents } => { + let text = contents.borrow().to_string(); + self.visit_text(text, handlers)?; + } + } + + if let Some(current_element) = current_element.as_ref() { + match self.start_tag(¤t_element, handlers) { + StartTagOutcome::Continue => {} + StartTagOutcome::Skip => return Ok(()), + } + + self.current_element_stack + .push_back(current_element.clone()); + } + + for child in node.children.borrow().iter() { + self.visit_node(child, handlers)?; + } + + if let Some(current_element) = current_element { + self.current_element_stack.pop_back(); + self.end_tag(¤t_element, handlers); + } + + Ok(()) + } + + fn start_tag( + &mut self, + tag: &HtmlElement, + handlers: &mut [Box], + ) -> StartTagOutcome { + for handler in handlers { + if handler.should_handle(tag.tag.as_str()) { + match handler.handle_tag_start(tag, self) { + StartTagOutcome::Continue => {} + StartTagOutcome::Skip => return StartTagOutcome::Skip, + } + } + } + + StartTagOutcome::Continue + } + + fn end_tag(&mut self, tag: &HtmlElement, handlers: &mut [Box]) { + for handler in handlers { + if handler.should_handle(tag.tag.as_str()) { + handler.handle_tag_end(tag, self); + } + } + } + + fn visit_text(&mut self, text: String, handlers: &mut [Box]) -> Result<()> { + for handler in handlers { + match handler.handle_text(&text, self) { + HandlerOutcome::Handled => return Ok(()), + HandlerOutcome::NoOp => {} + } + } + + let text = text + .trim_matches(|char| char == '\n' || char == '\r') + .replace('\n', " "); + + self.push_str(&text); + + Ok(()) + } +} + +pub enum HandlerOutcome { + Handled, + NoOp, +} + +pub trait HandleTag { + /// Returns whether this handler should handle the given tag. + fn should_handle(&self, tag: &str) -> bool; + + /// Handles the start of the given tag. + fn handle_tag_start( + &mut self, + _tag: &HtmlElement, + _writer: &mut MarkdownWriter, + ) -> StartTagOutcome { + StartTagOutcome::Continue + } + + /// Handles the end of the given tag. + fn handle_tag_end(&mut self, _tag: &HtmlElement, _writer: &mut MarkdownWriter) {} + + fn handle_text(&mut self, _text: &str, _writer: &mut MarkdownWriter) -> HandlerOutcome { + HandlerOutcome::NoOp + } +} diff --git a/crates/html_to_markdown/src/structure.rs b/crates/html_to_markdown/src/structure.rs new file mode 100644 index 0000000000..c6505a2ab6 --- /dev/null +++ b/crates/html_to_markdown/src/structure.rs @@ -0,0 +1 @@ +pub mod rustdoc; diff --git a/crates/html_to_markdown/src/structure/rustdoc.rs b/crates/html_to_markdown/src/structure/rustdoc.rs new file mode 100644 index 0000000000..b1ae7d2425 --- /dev/null +++ b/crates/html_to_markdown/src/structure/rustdoc.rs @@ -0,0 +1,286 @@ +use crate::html_element::HtmlElement; +use crate::markdown_writer::{HandleTag, HandlerOutcome, MarkdownWriter, StartTagOutcome}; + +pub struct RustdocHeadingHandler; + +impl HandleTag for RustdocHeadingHandler { + fn should_handle(&self, _tag: &str) -> bool { + // We're only handling text, so we don't need to visit any tags. + false + } + + fn handle_text(&mut self, text: &str, writer: &mut MarkdownWriter) -> HandlerOutcome { + if writer.is_inside("h1") + || writer.is_inside("h2") + || writer.is_inside("h3") + || writer.is_inside("h4") + || writer.is_inside("h5") + || writer.is_inside("h6") + { + let text = text + .trim_matches(|char| char == '\n' || char == '\r' || char == '§') + .replace('\n', " "); + writer.push_str(&text); + + return HandlerOutcome::Handled; + } + + HandlerOutcome::NoOp + } +} + +pub struct RustdocCodeHandler; + +impl HandleTag for RustdocCodeHandler { + fn should_handle(&self, tag: &str) -> bool { + match tag { + "pre" | "code" => true, + _ => false, + } + } + + fn handle_tag_start( + &mut self, + tag: &HtmlElement, + writer: &mut MarkdownWriter, + ) -> StartTagOutcome { + match tag.tag.as_str() { + "code" => { + if !writer.is_inside("pre") { + writer.push_str("`"); + } + } + "pre" => { + let classes = tag.classes(); + let is_rust = classes.iter().any(|class| class == "rust"); + let language = is_rust + .then(|| "rs") + .or_else(|| { + classes.iter().find_map(|class| { + if let Some((_, language)) = class.split_once("language-") { + Some(language.trim()) + } else { + None + } + }) + }) + .unwrap_or(""); + + writer.push_str(&format!("\n\n```{language}\n")); + } + _ => {} + } + + StartTagOutcome::Continue + } + + fn handle_tag_end(&mut self, tag: &HtmlElement, writer: &mut MarkdownWriter) { + match tag.tag.as_str() { + "code" => { + if !writer.is_inside("pre") { + writer.push_str("`"); + } + } + "pre" => writer.push_str("\n```\n"), + _ => {} + } + } + + fn handle_text(&mut self, text: &str, writer: &mut MarkdownWriter) -> HandlerOutcome { + if writer.is_inside("pre") { + writer.push_str(&text); + return HandlerOutcome::Handled; + } + + HandlerOutcome::NoOp + } +} + +pub struct RustdocTableHandler { + /// The number of columns in the current ``. + current_table_columns: usize, + is_first_th: bool, + is_first_td: bool, +} + +impl RustdocTableHandler { + pub fn new() -> Self { + Self { + current_table_columns: 0, + is_first_th: true, + is_first_td: true, + } + } +} + +impl HandleTag for RustdocTableHandler { + fn should_handle(&self, tag: &str) -> bool { + match tag { + "table" | "thead" | "tbody" | "tr" | "th" | "td" => true, + _ => false, + } + } + + fn handle_tag_start( + &mut self, + tag: &HtmlElement, + writer: &mut MarkdownWriter, + ) -> StartTagOutcome { + match tag.tag.as_str() { + "thead" => writer.push_blank_line(), + "tr" => writer.push_newline(), + "th" => { + self.current_table_columns += 1; + if self.is_first_th { + self.is_first_th = false; + } else { + writer.push_str(" "); + } + writer.push_str("| "); + } + "td" => { + if self.is_first_td { + self.is_first_td = false; + } else { + writer.push_str(" "); + } + writer.push_str("| "); + } + _ => {} + } + + StartTagOutcome::Continue + } + + fn handle_tag_end(&mut self, tag: &HtmlElement, writer: &mut MarkdownWriter) { + match tag.tag.as_str() { + "thead" => { + writer.push_newline(); + for ix in 0..self.current_table_columns { + if ix > 0 { + writer.push_str(" "); + } + writer.push_str("| ---"); + } + writer.push_str(" |"); + self.is_first_th = true; + } + "tr" => { + writer.push_str(" |"); + self.is_first_td = true; + } + "table" => { + self.current_table_columns = 0; + } + _ => {} + } + } +} + +const RUSTDOC_ITEM_NAME_CLASS: &str = "item-name"; + +pub struct RustdocItemHandler; + +impl RustdocItemHandler { + /// Returns whether we're currently inside of an `.item-name` element, which + /// rustdoc uses to display Rust items in a list. + fn is_inside_item_name(writer: &MarkdownWriter) -> bool { + writer + .current_element_stack() + .iter() + .any(|element| element.has_class(RUSTDOC_ITEM_NAME_CLASS)) + } +} + +impl HandleTag for RustdocItemHandler { + fn should_handle(&self, tag: &str) -> bool { + match tag { + "div" | "span" => true, + _ => false, + } + } + + fn handle_tag_start( + &mut self, + tag: &HtmlElement, + writer: &mut MarkdownWriter, + ) -> StartTagOutcome { + match tag.tag.as_str() { + "div" | "span" => { + if Self::is_inside_item_name(writer) && tag.has_class("stab") { + writer.push_str(" ["); + } + } + _ => {} + } + + StartTagOutcome::Continue + } + + fn handle_tag_end(&mut self, tag: &HtmlElement, writer: &mut MarkdownWriter) { + match tag.tag.as_str() { + "div" | "span" => { + if tag.has_class(RUSTDOC_ITEM_NAME_CLASS) { + writer.push_str(": "); + } + + if Self::is_inside_item_name(writer) && tag.has_class("stab") { + writer.push_str("]"); + } + } + _ => {} + } + } + + fn handle_text(&mut self, text: &str, writer: &mut MarkdownWriter) -> HandlerOutcome { + if Self::is_inside_item_name(writer) + && !writer.is_inside("span") + && !writer.is_inside("code") + { + writer.push_str(&format!("`{text}`")); + return HandlerOutcome::Handled; + } + + HandlerOutcome::NoOp + } +} + +pub struct RustdocChromeRemover; + +impl HandleTag for RustdocChromeRemover { + fn should_handle(&self, tag: &str) -> bool { + match tag { + "head" | "script" | "nav" | "summary" | "button" | "div" | "span" => true, + _ => false, + } + } + + fn handle_tag_start( + &mut self, + tag: &HtmlElement, + _writer: &mut MarkdownWriter, + ) -> StartTagOutcome { + match tag.tag.as_str() { + "head" | "script" | "nav" => return StartTagOutcome::Skip, + "summary" => { + if tag.has_class("hideme") { + return StartTagOutcome::Skip; + } + } + "button" => { + if tag.attr("id").as_deref() == Some("copy-path") { + return StartTagOutcome::Skip; + } + } + "div" | "span" => { + let classes_to_skip = ["nav-container", "sidebar-elems", "out-of-band"]; + if tag.has_any_classes(&classes_to_skip) { + return StartTagOutcome::Skip; + } + } + _ => {} + } + + StartTagOutcome::Continue + } +} diff --git a/crates/rustdoc_to_markdown/src/markdown_writer.rs b/crates/rustdoc_to_markdown/src/markdown_writer.rs deleted file mode 100644 index bafac18a33..0000000000 --- a/crates/rustdoc_to_markdown/src/markdown_writer.rs +++ /dev/null @@ -1,296 +0,0 @@ -use std::collections::VecDeque; -use std::sync::OnceLock; - -use anyhow::Result; -use markup5ever_rcdom::{Handle, NodeData}; -use regex::Regex; - -use crate::html_element::HtmlElement; - -fn empty_line_regex() -> &'static Regex { - static REGEX: OnceLock = OnceLock::new(); - REGEX.get_or_init(|| Regex::new(r"^\s*$").unwrap()) -} - -fn more_than_three_newlines_regex() -> &'static Regex { - static REGEX: OnceLock = OnceLock::new(); - REGEX.get_or_init(|| Regex::new(r"\n{3,}").unwrap()) -} - -const RUSTDOC_ITEM_NAME_CLASS: &str = "item-name"; - -enum StartTagOutcome { - Continue, - Skip, -} - -pub struct MarkdownWriter { - current_element_stack: VecDeque, - /// The number of columns in the current `
`. - current_table_columns: usize, - is_first_th: bool, - is_first_td: bool, - /// The Markdown output. - markdown: String, -} - -impl MarkdownWriter { - pub fn new() -> Self { - Self { - current_element_stack: VecDeque::new(), - current_table_columns: 0, - is_first_th: true, - is_first_td: true, - markdown: String::new(), - } - } - - fn is_inside(&self, tag: &str) -> bool { - self.current_element_stack - .iter() - .any(|parent_element| parent_element.tag == tag) - } - - /// Appends the given string slice onto the end of the Markdown output. - fn push_str(&mut self, str: &str) { - self.markdown.push_str(str); - } - - /// Appends a newline to the end of the Markdown output. - fn push_newline(&mut self) { - self.push_str("\n"); - } - - /// Appends a blank line to the end of the Markdown output. - fn push_blank_line(&mut self) { - self.push_str("\n\n"); - } - - pub fn run(mut self, root_node: &Handle) -> Result { - self.visit_node(&root_node)?; - Ok(Self::prettify_markdown(self.markdown)) - } - - fn prettify_markdown(markdown: String) -> String { - let markdown = empty_line_regex().replace_all(&markdown, ""); - let markdown = more_than_three_newlines_regex().replace_all(&markdown, "\n\n"); - - markdown.trim().to_string() - } - - fn visit_node(&mut self, node: &Handle) -> Result<()> { - let mut current_element = None; - - match node.data { - NodeData::Document - | NodeData::Doctype { .. } - | NodeData::ProcessingInstruction { .. } - | NodeData::Comment { .. } => { - // Currently left unimplemented, as we're not interested in this data - // at this time. - } - NodeData::Element { - ref name, - ref attrs, - .. - } => { - let tag_name = name.local.to_string(); - if !tag_name.is_empty() { - current_element = Some(HtmlElement { - tag: tag_name, - attrs: attrs.clone(), - }); - } - } - NodeData::Text { ref contents } => { - let text = contents.borrow().to_string(); - self.visit_text(text)?; - } - } - - if let Some(current_element) = current_element.as_ref() { - match self.start_tag(¤t_element) { - StartTagOutcome::Continue => {} - StartTagOutcome::Skip => return Ok(()), - } - - self.current_element_stack - .push_back(current_element.clone()); - } - - for child in node.children.borrow().iter() { - self.visit_node(child)?; - } - - if let Some(current_element) = current_element { - self.current_element_stack.pop_back(); - self.end_tag(¤t_element); - } - - Ok(()) - } - - fn start_tag(&mut self, tag: &HtmlElement) -> StartTagOutcome { - if tag.is_inline() && self.is_inside("p") { - if let Some(parent) = self.current_element_stack.iter().last() { - if !parent.is_inline() { - if !(self.markdown.ends_with(' ') || self.markdown.ends_with('\n')) { - self.push_str(" "); - } - } - } - } - - match tag.tag.as_str() { - "head" | "script" | "nav" => return StartTagOutcome::Skip, - "h1" => self.push_str("\n\n# "), - "h2" => self.push_str("\n\n## "), - "h3" => self.push_str("\n\n### "), - "h4" => self.push_str("\n\n#### "), - "h5" => self.push_str("\n\n##### "), - "h6" => self.push_str("\n\n###### "), - "p" => self.push_blank_line(), - "strong" => self.push_str("**"), - "em" => self.push_str("_"), - "code" => { - if !self.is_inside("pre") { - self.push_str("`"); - } - } - "pre" => { - let classes = tag.classes(); - let is_rust = classes.iter().any(|class| class == "rust"); - let language = is_rust - .then(|| "rs") - .or_else(|| { - classes.iter().find_map(|class| { - if let Some((_, language)) = class.split_once("language-") { - Some(language.trim()) - } else { - None - } - }) - }) - .unwrap_or(""); - - self.push_str(&format!("\n\n```{language}\n")); - } - "ul" | "ol" => self.push_newline(), - "li" => self.push_str("- "), - "thead" => self.push_blank_line(), - "tr" => self.push_newline(), - "th" => { - self.current_table_columns += 1; - if self.is_first_th { - self.is_first_th = false; - } else { - self.push_str(" "); - } - self.push_str("| "); - } - "td" => { - if self.is_first_td { - self.is_first_td = false; - } else { - self.push_str(" "); - } - self.push_str("| "); - } - "summary" => { - if tag.has_class("hideme") { - return StartTagOutcome::Skip; - } - } - "button" => { - if tag.attr("id").as_deref() == Some("copy-path") { - return StartTagOutcome::Skip; - } - } - "div" | "span" => { - let classes_to_skip = ["nav-container", "sidebar-elems", "out-of-band"]; - if tag.has_any_classes(&classes_to_skip) { - return StartTagOutcome::Skip; - } - - if self.is_inside_item_name() && tag.has_class("stab") { - self.push_str(" ["); - } - } - _ => {} - } - - StartTagOutcome::Continue - } - - fn end_tag(&mut self, tag: &HtmlElement) { - match tag.tag.as_str() { - "h1" | "h2" | "h3" | "h4" | "h5" | "h6" => self.push_str("\n\n"), - "strong" => self.push_str("**"), - "em" => self.push_str("_"), - "code" => { - if !self.is_inside("pre") { - self.push_str("`"); - } - } - "pre" => self.push_str("\n```\n"), - "ul" | "ol" => self.push_newline(), - "li" => self.push_newline(), - "thead" => { - self.push_newline(); - for ix in 0..self.current_table_columns { - if ix > 0 { - self.push_str(" "); - } - self.push_str("| ---"); - } - self.push_str(" |"); - self.is_first_th = true; - } - "tr" => { - self.push_str(" |"); - self.is_first_td = true; - } - "table" => { - self.current_table_columns = 0; - } - "div" | "span" => { - if tag.has_class(RUSTDOC_ITEM_NAME_CLASS) { - self.push_str(": "); - } - - if self.is_inside_item_name() && tag.has_class("stab") { - self.push_str("]"); - } - } - _ => {} - } - } - - fn visit_text(&mut self, text: String) -> Result<()> { - if self.is_inside("pre") { - self.push_str(&text); - return Ok(()); - } - - let text = text - .trim_matches(|char| char == '\n' || char == '\r' || char == '§') - .replace('\n', " "); - - if self.is_inside_item_name() && !self.is_inside("span") && !self.is_inside("code") { - self.push_str(&format!("`{text}`")); - return Ok(()); - } - - self.push_str(&text); - - Ok(()) - } - - /// Returns whether we're currently inside of an `.item-name` element, which - /// rustdoc uses to display Rust items in a list. - fn is_inside_item_name(&self) -> bool { - self.current_element_stack - .iter() - .any(|element| element.has_class(RUSTDOC_ITEM_NAME_CLASS)) - } -}