mirror of
https://github.com/zed-industries/zed.git
synced 2024-09-19 10:29:35 +03:00
rustdoc_to_markdown: Improve paragraph handling (#12498)
This PR improves `rustdoc_to_markdown`'s paragraph handling to produce better output. Specifically, there should now be fewer instances where a space is missing between words as the result of line breaks in the source HTML. Release Notes: - N/A
This commit is contained in:
parent
4dc98026c4
commit
99901801f4
75
crates/rustdoc_to_markdown/src/html_element.rs
Normal file
75
crates/rustdoc_to_markdown/src/html_element.rs
Normal file
@ -0,0 +1,75 @@
|
||||
use std::cell::RefCell;
|
||||
use std::collections::HashSet;
|
||||
use std::sync::OnceLock;
|
||||
|
||||
use html5ever::Attribute;
|
||||
|
||||
/// Returns a [`HashSet`] containing the HTML elements that are inline by default.
///
/// The set is built on the first call and the same instance is returned by every
/// later call.
///
/// [MDN: List of "inline" elements](https://developer.mozilla.org/en-US/docs/Web/HTML/Inline_elements)
fn inline_elements() -> &'static HashSet<&'static str> {
    static INLINE_ELEMENTS: OnceLock<HashSet<&'static str>> = OnceLock::new();
    // `get_or_init` already hands back a `&'static HashSet`, so no extra borrow is needed.
    INLINE_ELEMENTS.get_or_init(|| {
        HashSet::from([
            "a", "abbr", "acronym", "audio", "b", "bdi", "bdo", "big", "br", "button", "canvas",
            "cite", "code", "data", "datalist", "del", "dfn", "em", "embed", "i", "iframe", "img",
            "input", "ins", "kbd", "label", "map", "mark", "meter", "noscript", "object", "output",
            "picture", "progress", "q", "ruby", "s", "samp", "script", "select", "slot", "small",
            "span", "strong", "sub", "sup", "svg", "template", "textarea", "time", "tt", "u",
            "var", "video", "wbr",
        ])
    })
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct HtmlElement {
|
||||
pub(crate) tag: String,
|
||||
pub(crate) attrs: RefCell<Vec<Attribute>>,
|
||||
}
|
||||
|
||||
impl HtmlElement {
|
||||
/// Returns whether this [`HtmlElement`] is an inline element.
|
||||
pub fn is_inline(&self) -> bool {
|
||||
inline_elements().contains(self.tag.as_str())
|
||||
}
|
||||
|
||||
/// Returns the attribute with the specified name.
|
||||
pub fn attr(&self, name: &str) -> Option<String> {
|
||||
self.attrs
|
||||
.borrow()
|
||||
.iter()
|
||||
.find(|attr| attr.name.local.to_string() == name)
|
||||
.map(|attr| attr.value.to_string())
|
||||
}
|
||||
|
||||
/// Returns the list of classes on this [`HtmlElement`].
|
||||
pub fn classes(&self) -> Vec<String> {
|
||||
self.attrs
|
||||
.borrow()
|
||||
.iter()
|
||||
.find(|attr| attr.name.local.to_string() == "class")
|
||||
.map(|attr| {
|
||||
attr.value
|
||||
.split(' ')
|
||||
.map(|class| class.trim().to_string())
|
||||
.collect::<Vec<_>>()
|
||||
})
|
||||
.unwrap_or_default()
|
||||
}
|
||||
|
||||
/// Returns whether this [`HtmlElement`] has the specified class.
|
||||
pub fn has_class(&self, class: &str) -> bool {
|
||||
self.has_any_classes(&[class])
|
||||
}
|
||||
|
||||
/// Returns whether this [`HtmlElement`] has any of the specified classes.
|
||||
pub fn has_any_classes(&self, classes: &[&str]) -> bool {
|
||||
self.attrs.borrow().iter().any(|attr| {
|
||||
attr.name.local.to_string() == "class"
|
||||
&& attr
|
||||
.value
|
||||
.split(' ')
|
||||
.any(|class| classes.contains(&class.trim()))
|
||||
})
|
||||
}
|
||||
}
|
@ -1,12 +1,12 @@
|
||||
use std::cell::RefCell;
|
||||
use std::collections::VecDeque;
|
||||
use std::sync::OnceLock;
|
||||
|
||||
use anyhow::Result;
|
||||
use html5ever::Attribute;
|
||||
use markup5ever_rcdom::{Handle, NodeData};
|
||||
use regex::Regex;
|
||||
|
||||
use crate::html_element::HtmlElement;
|
||||
|
||||
fn empty_line_regex() -> &'static Regex {
|
||||
static REGEX: OnceLock<Regex> = OnceLock::new();
|
||||
REGEX.get_or_init(|| Regex::new(r"^\s*$").unwrap())
|
||||
@ -17,54 +17,6 @@ fn more_than_three_newlines_regex() -> &'static Regex {
|
||||
REGEX.get_or_init(|| Regex::new(r"\n{3,}").unwrap())
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
struct HtmlElement {
|
||||
tag: String,
|
||||
attrs: RefCell<Vec<Attribute>>,
|
||||
}
|
||||
|
||||
impl HtmlElement {
|
||||
/// Returns the attribute with the specified name.
|
||||
pub fn attr(&self, name: &str) -> Option<String> {
|
||||
self.attrs
|
||||
.borrow()
|
||||
.iter()
|
||||
.find(|attr| attr.name.local.to_string() == name)
|
||||
.map(|attr| attr.value.to_string())
|
||||
}
|
||||
|
||||
/// Returns the list of classes on this [`HtmlElement`].
|
||||
pub fn classes(&self) -> Vec<String> {
|
||||
self.attrs
|
||||
.borrow()
|
||||
.iter()
|
||||
.find(|attr| attr.name.local.to_string() == "class")
|
||||
.map(|attr| {
|
||||
attr.value
|
||||
.split(' ')
|
||||
.map(|class| class.trim().to_string())
|
||||
.collect::<Vec<_>>()
|
||||
})
|
||||
.unwrap_or_default()
|
||||
}
|
||||
|
||||
/// Returns whether this [`HtmlElement`] has the specified class.
|
||||
pub fn has_class(&self, class: &str) -> bool {
|
||||
self.has_any_classes(&[class])
|
||||
}
|
||||
|
||||
/// Returns whether this [`HtmlElement`] has any of the specified classes.
|
||||
pub fn has_any_classes(&self, classes: &[&str]) -> bool {
|
||||
self.attrs.borrow().iter().any(|attr| {
|
||||
attr.name.local.to_string() == "class"
|
||||
&& attr
|
||||
.value
|
||||
.split(' ')
|
||||
.any(|class| classes.contains(&class.trim()))
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
const RUSTDOC_ITEM_NAME_CLASS: &str = "item-name";
|
||||
|
||||
enum StartTagOutcome {
|
||||
@ -179,6 +131,12 @@ impl MarkdownWriter {
|
||||
}
|
||||
|
||||
fn start_tag(&mut self, tag: &HtmlElement) -> StartTagOutcome {
|
||||
if tag.is_inline() && self.is_inside("p") {
|
||||
if !self.markdown.ends_with(' ') {
|
||||
self.push_str(" ");
|
||||
}
|
||||
}
|
||||
|
||||
match tag.tag.as_str() {
|
||||
"head" | "script" | "nav" => return StartTagOutcome::Skip,
|
||||
"h1" => self.push_str("\n\n# "),
|
||||
@ -187,6 +145,7 @@ impl MarkdownWriter {
|
||||
"h4" => self.push_str("\n\n#### "),
|
||||
"h5" => self.push_str("\n\n##### "),
|
||||
"h6" => self.push_str("\n\n###### "),
|
||||
"p" => self.push_blank_line(),
|
||||
"code" => {
|
||||
if !self.is_inside("pre") {
|
||||
self.push_str("`");
|
||||
@ -305,14 +264,16 @@ impl MarkdownWriter {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let trimmed_text = text.trim_matches(|char| char == '\n' || char == '\r' || char == '§');
|
||||
let text = text
|
||||
.trim_matches(|char| char == '\n' || char == '\r' || char == '§')
|
||||
.replace('\n', " ");
|
||||
|
||||
if self.is_inside_item_name() && !self.is_inside("span") && !self.is_inside("code") {
|
||||
self.push_str(&format!("`{trimmed_text}`"));
|
||||
self.push_str(&format!("`{text}`"));
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
self.push_str(trimmed_text);
|
||||
self.push_str(&text);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
@ -2,6 +2,7 @@
|
||||
|
||||
#![deny(missing_docs)]
|
||||
|
||||
mod html_element;
|
||||
mod markdown_writer;
|
||||
|
||||
use std::io::Read;
|
||||
@ -65,6 +66,72 @@ mod tests {
|
||||
)
|
||||
}
|
||||
|
||||
// A single `<p>` whose source is wrapped over several lines must come out as one
// line of Markdown, with the line breaks turned into spaces and inline `<code>`
// rendered in backticks.
#[test]
fn test_single_paragraph() {
    let input = indoc! {r#"
        <p>In particular, the last point is what sets <code>axum</code> apart from other frameworks.
        <code>axum</code> doesn’t have its own middleware system but instead uses
        <a href="https://docs.rs/tower-service/0.3.2/x86_64-unknown-linux-gnu/tower_service/trait.Service.html" title="trait tower_service::Service"><code>tower::Service</code></a>. This means <code>axum</code> gets timeouts, tracing, compression,
        authorization, and more, for free. It also enables you to share middleware with
        applications written using <a href="http://crates.io/crates/hyper"><code>hyper</code></a> or <a href="http://crates.io/crates/tonic"><code>tonic</code></a>.</p>
    "#};
    let want = indoc! {"
        In particular, the last point is what sets `axum` apart from other frameworks. `axum` doesn’t have its own middleware system but instead uses `tower::Service`. This means `axum` gets timeouts, tracing, compression, authorization, and more, for free. It also enables you to share middleware with applications written using `hyper` or `tonic`.
    "}
    .trim();

    let got = convert_rustdoc_to_markdown(input.as_bytes()).unwrap();

    assert_eq!(got, want)
}
|
||||
|
||||
// Headings and consecutive `<p>` elements must each be separated by one blank
// line, and every paragraph must be joined onto a single line.
#[test]
fn test_multiple_paragraphs() {
    let input = indoc! {r##"
        <h2 id="serde"><a class="doc-anchor" href="#serde">§</a>Serde</h2>
        <p>Serde is a framework for <em><strong>ser</strong></em>ializing and <em><strong>de</strong></em>serializing Rust data
        structures efficiently and generically.</p>
        <p>The Serde ecosystem consists of data structures that know how to serialize
        and deserialize themselves along with data formats that know how to
        serialize and deserialize other things. Serde provides the layer by which
        these two groups interact with each other, allowing any supported data
        structure to be serialized and deserialized using any supported data format.</p>
        <p>See the Serde website <a href="https://serde.rs/">https://serde.rs/</a> for additional documentation and
        usage examples.</p>
        <h3 id="design"><a class="doc-anchor" href="#design">§</a>Design</h3>
        <p>Where many other languages rely on runtime reflection for serializing data,
        Serde is instead built on Rust’s powerful trait system. A data structure
        that knows how to serialize and deserialize itself is one that implements
        Serde’s <code>Serialize</code> and <code>Deserialize</code> traits (or uses Serde’s derive
        attribute to automatically generate implementations at compile time). This
        avoids any overhead of reflection or runtime type information. In fact in
        many situations the interaction between data structure and data format can
        be completely optimized away by the Rust compiler, leaving Serde
        serialization to perform the same speed as a handwritten serializer for the
        specific selection of data structure and data format.</p>
    "##};
    let want = indoc! {"
        ## Serde

        Serde is a framework for serializing and deserializing Rust data structures efficiently and generically.

        The Serde ecosystem consists of data structures that know how to serialize and deserialize themselves along with data formats that know how to serialize and deserialize other things. Serde provides the layer by which these two groups interact with each other, allowing any supported data structure to be serialized and deserialized using any supported data format.

        See the Serde website https://serde.rs/ for additional documentation and usage examples.

        ### Design

        Where many other languages rely on runtime reflection for serializing data, Serde is instead built on Rust’s powerful trait system. A data structure that knows how to serialize and deserialize itself is one that implements Serde’s `Serialize` and `Deserialize` traits (or uses Serde’s derive attribute to automatically generate implementations at compile time). This avoids any overhead of reflection or runtime type information. In fact in many situations the interaction between data structure and data format can be completely optimized away by the Rust compiler, leaving Serde serialization to perform the same speed as a handwritten serializer for the specific selection of data structure and data format.
    "}
    .trim();

    let got = convert_rustdoc_to_markdown(input.as_bytes()).unwrap();

    assert_eq!(got, want)
}
|
||||
|
||||
#[test]
|
||||
fn test_rust_code_block() {
|
||||
let html = indoc! {r#"
|
||||
@ -201,8 +268,9 @@ mod tests {
|
||||
let expected = indoc! {r#"
|
||||
## Feature flags
|
||||
|
||||
axum uses a set of feature flags to reduce the amount of compiled and
|
||||
optional dependencies.The following optional features are available:
|
||||
axum uses a set of feature flags to reduce the amount of compiled and optional dependencies.
|
||||
|
||||
The following optional features are available:
|
||||
|
||||
| Name | Description | Default? |
|
||||
| --- | --- | --- |
|
||||
|
Loading…
Reference in New Issue
Block a user