mirror of
https://github.com/zed-industries/zed.git
synced 2024-11-08 07:35:01 +03:00
Add basic Wikipedia support to /fetch
(#12777)
This PR extends the `/fetch` slash command with initial support for Wikipedia's HTML structure. Release Notes: - N/A
This commit is contained in:
parent
a910f192db
commit
9174858225
@ -5,7 +5,7 @@ use anyhow::{anyhow, bail, Context, Result};
|
|||||||
use assistant_slash_command::{SlashCommand, SlashCommandOutput, SlashCommandOutputSection};
|
use assistant_slash_command::{SlashCommand, SlashCommandOutput, SlashCommandOutputSection};
|
||||||
use futures::AsyncReadExt;
|
use futures::AsyncReadExt;
|
||||||
use gpui::{AppContext, Task, WeakView};
|
use gpui::{AppContext, Task, WeakView};
|
||||||
use html_to_markdown::convert_html_to_markdown;
|
use html_to_markdown::{convert_html_to_markdown, markdown, HandleTag};
|
||||||
use http::{AsyncBody, HttpClient, HttpClientWithUrl};
|
use http::{AsyncBody, HttpClient, HttpClientWithUrl};
|
||||||
use language::LspAdapterDelegate;
|
use language::LspAdapterDelegate;
|
||||||
use ui::{prelude::*, ButtonLike, ElevationIndex};
|
use ui::{prelude::*, ButtonLike, ElevationIndex};
|
||||||
@ -37,7 +37,21 @@ impl FetchSlashCommand {
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
convert_html_to_markdown(&body[..])
|
let mut handlers: Vec<Box<dyn HandleTag>> = vec![
|
||||||
|
Box::new(markdown::ParagraphHandler),
|
||||||
|
Box::new(markdown::HeadingHandler),
|
||||||
|
Box::new(markdown::ListHandler),
|
||||||
|
Box::new(markdown::TableHandler::new()),
|
||||||
|
Box::new(markdown::StyledTextHandler),
|
||||||
|
Box::new(markdown::CodeHandler),
|
||||||
|
];
|
||||||
|
if url.contains("wikipedia.org") {
|
||||||
|
use html_to_markdown::structure::wikipedia;
|
||||||
|
|
||||||
|
handlers.push(Box::new(wikipedia::WikipediaChromeRemover));
|
||||||
|
}
|
||||||
|
|
||||||
|
convert_html_to_markdown(&body[..], handlers)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,11 +1,9 @@
|
|||||||
//! Provides conversion from rustdoc's HTML output to Markdown.
|
//! Provides conversion from rustdoc's HTML output to Markdown.
|
||||||
|
|
||||||
#![deny(missing_docs)]
|
|
||||||
|
|
||||||
mod html_element;
|
mod html_element;
|
||||||
mod markdown;
|
pub mod markdown;
|
||||||
mod markdown_writer;
|
mod markdown_writer;
|
||||||
mod structure;
|
pub mod structure;
|
||||||
|
|
||||||
use std::io::Read;
|
use std::io::Read;
|
||||||
|
|
||||||
@ -19,24 +17,17 @@ use markup5ever_rcdom::RcDom;
|
|||||||
use crate::markdown::{
|
use crate::markdown::{
|
||||||
HeadingHandler, ListHandler, ParagraphHandler, StyledTextHandler, TableHandler,
|
HeadingHandler, ListHandler, ParagraphHandler, StyledTextHandler, TableHandler,
|
||||||
};
|
};
|
||||||
use crate::markdown_writer::{HandleTag, MarkdownWriter};
|
use crate::markdown_writer::MarkdownWriter;
|
||||||
|
|
||||||
|
pub use crate::markdown_writer::HandleTag;
|
||||||
|
|
||||||
/// Converts the provided HTML to Markdown.
|
/// Converts the provided HTML to Markdown.
|
||||||
pub fn convert_html_to_markdown(html: impl Read) -> Result<String> {
|
pub fn convert_html_to_markdown(
|
||||||
|
html: impl Read,
|
||||||
|
handlers: Vec<Box<dyn HandleTag>>,
|
||||||
|
) -> Result<String> {
|
||||||
let dom = parse_html(html).context("failed to parse HTML")?;
|
let dom = parse_html(html).context("failed to parse HTML")?;
|
||||||
|
|
||||||
let handlers: Vec<Box<dyn HandleTag>> = vec![
|
|
||||||
Box::new(ParagraphHandler),
|
|
||||||
Box::new(HeadingHandler),
|
|
||||||
Box::new(ListHandler),
|
|
||||||
Box::new(TableHandler::new()),
|
|
||||||
Box::new(StyledTextHandler),
|
|
||||||
Box::new(structure::rustdoc::RustdocChromeRemover),
|
|
||||||
Box::new(structure::rustdoc::RustdocHeadingHandler),
|
|
||||||
Box::new(structure::rustdoc::RustdocCodeHandler),
|
|
||||||
Box::new(structure::rustdoc::RustdocItemHandler),
|
|
||||||
];
|
|
||||||
|
|
||||||
let markdown_writer = MarkdownWriter::new();
|
let markdown_writer = MarkdownWriter::new();
|
||||||
let markdown = markdown_writer
|
let markdown = markdown_writer
|
||||||
.run(&dom.document, handlers)
|
.run(&dom.document, handlers)
|
||||||
@ -47,26 +38,20 @@ pub fn convert_html_to_markdown(html: impl Read) -> Result<String> {
|
|||||||
|
|
||||||
/// Converts the provided rustdoc HTML to Markdown.
|
/// Converts the provided rustdoc HTML to Markdown.
|
||||||
pub fn convert_rustdoc_to_markdown(html: impl Read) -> Result<String> {
|
pub fn convert_rustdoc_to_markdown(html: impl Read) -> Result<String> {
|
||||||
let dom = parse_html(html).context("failed to parse rustdoc HTML")?;
|
convert_html_to_markdown(
|
||||||
|
html,
|
||||||
let handlers: Vec<Box<dyn HandleTag>> = vec![
|
vec![
|
||||||
Box::new(ParagraphHandler),
|
Box::new(ParagraphHandler),
|
||||||
Box::new(HeadingHandler),
|
Box::new(HeadingHandler),
|
||||||
Box::new(ListHandler),
|
Box::new(ListHandler),
|
||||||
Box::new(TableHandler::new()),
|
Box::new(TableHandler::new()),
|
||||||
Box::new(StyledTextHandler),
|
Box::new(StyledTextHandler),
|
||||||
Box::new(structure::rustdoc::RustdocChromeRemover),
|
Box::new(structure::rustdoc::RustdocChromeRemover),
|
||||||
Box::new(structure::rustdoc::RustdocHeadingHandler),
|
Box::new(structure::rustdoc::RustdocHeadingHandler),
|
||||||
Box::new(structure::rustdoc::RustdocCodeHandler),
|
Box::new(structure::rustdoc::RustdocCodeHandler),
|
||||||
Box::new(structure::rustdoc::RustdocItemHandler),
|
Box::new(structure::rustdoc::RustdocItemHandler),
|
||||||
];
|
],
|
||||||
|
)
|
||||||
let markdown_writer = MarkdownWriter::new();
|
|
||||||
let markdown = markdown_writer
|
|
||||||
.run(&dom.document, handlers)
|
|
||||||
.context("failed to convert rustdoc HTML to Markdown")?;
|
|
||||||
|
|
||||||
Ok(markdown)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn parse_html(mut html: impl Read) -> Result<RcDom> {
|
fn parse_html(mut html: impl Read) -> Result<RcDom> {
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
use crate::html_element::HtmlElement;
|
use crate::html_element::HtmlElement;
|
||||||
use crate::markdown_writer::{HandleTag, MarkdownWriter, StartTagOutcome};
|
use crate::markdown_writer::{HandleTag, HandlerOutcome, MarkdownWriter, StartTagOutcome};
|
||||||
|
|
||||||
pub struct ParagraphHandler;
|
pub struct ParagraphHandler;
|
||||||
|
|
||||||
@ -214,3 +214,53 @@ impl HandleTag for StyledTextHandler {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub struct CodeHandler;
|
||||||
|
|
||||||
|
impl HandleTag for CodeHandler {
|
||||||
|
fn should_handle(&self, tag: &str) -> bool {
|
||||||
|
match tag {
|
||||||
|
"pre" | "code" => true,
|
||||||
|
_ => false,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn handle_tag_start(
|
||||||
|
&mut self,
|
||||||
|
tag: &HtmlElement,
|
||||||
|
writer: &mut MarkdownWriter,
|
||||||
|
) -> StartTagOutcome {
|
||||||
|
match tag.tag.as_str() {
|
||||||
|
"code" => {
|
||||||
|
if !writer.is_inside("pre") {
|
||||||
|
writer.push_str("`");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
"pre" => writer.push_str("\n\n```\n"),
|
||||||
|
_ => {}
|
||||||
|
}
|
||||||
|
|
||||||
|
StartTagOutcome::Continue
|
||||||
|
}
|
||||||
|
|
||||||
|
fn handle_tag_end(&mut self, tag: &HtmlElement, writer: &mut MarkdownWriter) {
|
||||||
|
match tag.tag.as_str() {
|
||||||
|
"code" => {
|
||||||
|
if !writer.is_inside("pre") {
|
||||||
|
writer.push_str("`");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
"pre" => writer.push_str("\n```\n"),
|
||||||
|
_ => {}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn handle_text(&mut self, text: &str, writer: &mut MarkdownWriter) -> HandlerOutcome {
|
||||||
|
if writer.is_inside("pre") {
|
||||||
|
writer.push_str(&text);
|
||||||
|
return HandlerOutcome::Handled;
|
||||||
|
}
|
||||||
|
|
||||||
|
HandlerOutcome::NoOp
|
||||||
|
}
|
||||||
|
}
|
||||||
|
@ -1 +1,2 @@
|
|||||||
pub mod rustdoc;
|
pub mod rustdoc;
|
||||||
|
pub mod wikipedia;
|
||||||
|
80
crates/html_to_markdown/src/structure/wikipedia.rs
Normal file
80
crates/html_to_markdown/src/structure/wikipedia.rs
Normal file
@ -0,0 +1,80 @@
|
|||||||
|
use crate::html_element::HtmlElement;
|
||||||
|
use crate::markdown_writer::{MarkdownWriter, StartTagOutcome};
|
||||||
|
use crate::HandleTag;
|
||||||
|
|
||||||
|
pub struct WikipediaChromeRemover;
|
||||||
|
|
||||||
|
impl HandleTag for WikipediaChromeRemover {
|
||||||
|
fn should_handle(&self, _tag: &str) -> bool {
|
||||||
|
true
|
||||||
|
}
|
||||||
|
|
||||||
|
fn handle_tag_start(
|
||||||
|
&mut self,
|
||||||
|
tag: &HtmlElement,
|
||||||
|
_writer: &mut MarkdownWriter,
|
||||||
|
) -> StartTagOutcome {
|
||||||
|
match tag.tag.as_str() {
|
||||||
|
"head" | "script" | "style" | "nav" => return StartTagOutcome::Skip,
|
||||||
|
"sup" => {
|
||||||
|
if tag.has_class("reference") {
|
||||||
|
return StartTagOutcome::Skip;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
"div" | "span" | "a" => {
|
||||||
|
if tag.attr("id").as_deref() == Some("p-lang-btn") {
|
||||||
|
return StartTagOutcome::Skip;
|
||||||
|
}
|
||||||
|
|
||||||
|
if tag.attr("id").as_deref() == Some("p-search") {
|
||||||
|
return StartTagOutcome::Skip;
|
||||||
|
}
|
||||||
|
|
||||||
|
let classes_to_skip = ["mw-editsection", "mw-jump-link"];
|
||||||
|
if tag.has_any_classes(&classes_to_skip) {
|
||||||
|
return StartTagOutcome::Skip;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
_ => {}
|
||||||
|
}
|
||||||
|
|
||||||
|
StartTagOutcome::Continue
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;
    use crate::{convert_html_to_markdown, markdown};
    use indoc::indoc;
    use pretty_assertions::assert_eq;

    /// Builds the handler stack used by the Wikipedia tests: the generic
    /// paragraph, heading, list, and styled-text handlers followed by the
    /// Wikipedia chrome remover.
    fn wikipedia_handlers() -> Vec<Box<dyn HandleTag>> {
        let mut handlers: Vec<Box<dyn HandleTag>> = Vec::new();
        handlers.push(Box::new(markdown::ParagraphHandler));
        handlers.push(Box::new(markdown::HeadingHandler));
        handlers.push(Box::new(markdown::ListHandler));
        handlers.push(Box::new(markdown::StyledTextHandler));
        handlers.push(Box::new(WikipediaChromeRemover));
        handlers
    }

    /// Citation markers (`<sup class="reference">`) must not appear in the
    /// converted Markdown.
    #[test]
    fn test_citation_references_get_removed() {
        let html = indoc! {r##"
            <p>Rust began as a personal project in 2006 by <a href="/wiki/Mozilla" title="Mozilla">Mozilla</a> Research employee Graydon Hoare.<sup id="cite_ref-MITTechReview_23-0" class="reference"><a href="#cite_note-MITTechReview-23">[20]</a></sup> Mozilla began sponsoring the project in 2009 as a part of the ongoing development of an experimental <a href="/wiki/Browser_engine" title="Browser engine">browser engine</a> called <a href="/wiki/Servo_(software)" title="Servo (software)">Servo</a>,<sup id="cite_ref-infoq2012_24-0" class="reference"><a href="#cite_note-infoq2012-24">[21]</a></sup> which was officially announced by Mozilla in 2010.<sup id="cite_ref-MattAsay_25-0" class="reference"><a href="#cite_note-MattAsay-25">[22]</a></sup><sup id="cite_ref-26" class="reference"><a href="#cite_note-26">[23]</a></sup> Rust's memory and ownership system was influenced by <a href="/wiki/Region-based_memory_management" title="Region-based memory management">region-based memory management</a> in languages such as <a href="/wiki/Cyclone_(programming_language)" title="Cyclone (programming language)">Cyclone</a> and ML Kit.<sup id="cite_ref-influences_8-13" class="reference"><a href="#cite_note-influences-8">[5]</a></sup>
            </p>
        "##};
        let expected = indoc! {"
            Rust began as a personal project in 2006 by Mozilla Research employee Graydon Hoare. Mozilla began sponsoring the project in 2009 as a part of the ongoing development of an experimental browser engine called Servo, which was officially announced by Mozilla in 2010. Rust's memory and ownership system was influenced by region-based memory management in languages such as Cyclone and ML Kit.
        "}
        .trim();

        let actual = convert_html_to_markdown(html.as_bytes(), wikipedia_handlers()).unwrap();

        assert_eq!(actual, expected)
    }
}
|
Loading…
Reference in New Issue
Block a user