html_to_markdown: Move TableHandler out of rustdoc (#12697)

This PR moves the `TableHandler` out of the `rustdoc` module, as it
doesn't contain anything specific to rustdoc.

Release Notes:

- N/A
This commit is contained in:
Marshall Bowers 2024-06-05 15:37:02 -04:00 committed by GitHub
parent 071270fe88
commit f3460d440c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 86 additions and 84 deletions

View File

@ -16,7 +16,9 @@ use html5ever::tendril::TendrilSink;
use html5ever::tree_builder::TreeBuilderOpts;
use markup5ever_rcdom::RcDom;
use crate::markdown::{HeadingHandler, ListHandler, ParagraphHandler, StyledTextHandler};
use crate::markdown::{
HeadingHandler, ListHandler, ParagraphHandler, StyledTextHandler, TableHandler,
};
use crate::markdown_writer::{HandleTag, MarkdownWriter};
/// Converts the provided HTML to Markdown.
@ -27,11 +29,11 @@ pub fn convert_html_to_markdown(html: impl Read) -> Result<String> {
Box::new(ParagraphHandler),
Box::new(HeadingHandler),
Box::new(ListHandler),
Box::new(TableHandler::new()),
Box::new(StyledTextHandler),
Box::new(structure::rustdoc::RustdocChromeRemover),
Box::new(structure::rustdoc::RustdocHeadingHandler),
Box::new(structure::rustdoc::RustdocCodeHandler),
Box::new(structure::rustdoc::RustdocTableHandler::new()),
Box::new(structure::rustdoc::RustdocItemHandler),
];
@ -51,11 +53,11 @@ pub fn convert_rustdoc_to_markdown(html: impl Read) -> Result<String> {
Box::new(ParagraphHandler),
Box::new(HeadingHandler),
Box::new(ListHandler),
Box::new(TableHandler::new()),
Box::new(StyledTextHandler),
Box::new(structure::rustdoc::RustdocChromeRemover),
Box::new(structure::rustdoc::RustdocHeadingHandler),
Box::new(structure::rustdoc::RustdocCodeHandler),
Box::new(structure::rustdoc::RustdocTableHandler::new()),
Box::new(structure::rustdoc::RustdocItemHandler),
];

View File

@ -101,6 +101,87 @@ impl HandleTag for ListHandler {
}
}
pub struct TableHandler {
/// The number of columns in the current `<table>`.
current_table_columns: usize,
is_first_th: bool,
is_first_td: bool,
}
impl TableHandler {
pub fn new() -> Self {
Self {
current_table_columns: 0,
is_first_th: true,
is_first_td: true,
}
}
}
impl HandleTag for TableHandler {
fn should_handle(&self, tag: &str) -> bool {
match tag {
"table" | "thead" | "tbody" | "tr" | "th" | "td" => true,
_ => false,
}
}
fn handle_tag_start(
&mut self,
tag: &HtmlElement,
writer: &mut MarkdownWriter,
) -> StartTagOutcome {
match tag.tag.as_str() {
"thead" => writer.push_blank_line(),
"tr" => writer.push_newline(),
"th" => {
self.current_table_columns += 1;
if self.is_first_th {
self.is_first_th = false;
} else {
writer.push_str(" ");
}
writer.push_str("| ");
}
"td" => {
if self.is_first_td {
self.is_first_td = false;
} else {
writer.push_str(" ");
}
writer.push_str("| ");
}
_ => {}
}
StartTagOutcome::Continue
}
fn handle_tag_end(&mut self, tag: &HtmlElement, writer: &mut MarkdownWriter) {
match tag.tag.as_str() {
"thead" => {
writer.push_newline();
for ix in 0..self.current_table_columns {
if ix > 0 {
writer.push_str(" ");
}
writer.push_str("| ---");
}
writer.push_str(" |");
self.is_first_th = true;
}
"tr" => {
writer.push_str(" |");
self.is_first_td = true;
}
"table" => {
self.current_table_columns = 0;
}
_ => {}
}
}
}
pub struct StyledTextHandler;
impl HandleTag for StyledTextHandler {

View File

@ -96,87 +96,6 @@ impl HandleTag for RustdocCodeHandler {
}
}
pub struct RustdocTableHandler {
/// The number of columns in the current `<table>`.
current_table_columns: usize,
is_first_th: bool,
is_first_td: bool,
}
impl RustdocTableHandler {
pub fn new() -> Self {
Self {
current_table_columns: 0,
is_first_th: true,
is_first_td: true,
}
}
}
impl HandleTag for RustdocTableHandler {
fn should_handle(&self, tag: &str) -> bool {
match tag {
"table" | "thead" | "tbody" | "tr" | "th" | "td" => true,
_ => false,
}
}
fn handle_tag_start(
&mut self,
tag: &HtmlElement,
writer: &mut MarkdownWriter,
) -> StartTagOutcome {
match tag.tag.as_str() {
"thead" => writer.push_blank_line(),
"tr" => writer.push_newline(),
"th" => {
self.current_table_columns += 1;
if self.is_first_th {
self.is_first_th = false;
} else {
writer.push_str(" ");
}
writer.push_str("| ");
}
"td" => {
if self.is_first_td {
self.is_first_td = false;
} else {
writer.push_str(" ");
}
writer.push_str("| ");
}
_ => {}
}
StartTagOutcome::Continue
}
fn handle_tag_end(&mut self, tag: &HtmlElement, writer: &mut MarkdownWriter) {
match tag.tag.as_str() {
"thead" => {
writer.push_newline();
for ix in 0..self.current_table_columns {
if ix > 0 {
writer.push_str(" ");
}
writer.push_str("| ---");
}
writer.push_str(" |");
self.is_first_th = true;
}
"tr" => {
writer.push_str(" |");
self.is_first_td = true;
}
"table" => {
self.current_table_columns = 0;
}
_ => {}
}
}
}
const RUSTDOC_ITEM_NAME_CLASS: &str = "item-name";
pub struct RustdocItemHandler;