assistant: Strip out general website chrome in /fetch command (#13264)

This PR updates the `/fetch` command to strip out general website chrome
that is unlikely to contain meaningful content on any website.

Release Notes:

- N/A
This commit is contained in:
Marshall Bowers 2024-06-19 09:50:02 -04:00 committed by GitHub
parent aff7a83815
commit ddf07253c4
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 25 additions and 0 deletions

View File

@ -62,6 +62,7 @@ impl FetchSlashCommand {
match content_type {
ContentType::Html => {
let mut handlers: Vec<TagHandler> = vec![
Rc::new(RefCell::new(markdown::WebpageChromeRemover)),
Rc::new(RefCell::new(markdown::ParagraphHandler)),
Rc::new(RefCell::new(markdown::HeadingHandler)),
Rc::new(RefCell::new(markdown::ListHandler)),

View File

@ -1,6 +1,30 @@
use crate::html_element::HtmlElement;
use crate::markdown_writer::{HandleTag, HandlerOutcome, MarkdownWriter, StartTagOutcome};
/// Tag handler that strips general website chrome from a page being converted.
///
/// It claims the `head`, `script`, `style`, and `nav` elements and answers
/// [`StartTagOutcome::Skip`] for each of them.
// NOTE(review): presumably `Skip` makes the converter drop the element together
// with its children — confirm against the `StartTagOutcome` definition.
pub struct WebpageChromeRemover;

impl WebpageChromeRemover {
    /// Returns `true` when `tag` names an element treated as website chrome.
    ///
    /// This is the single source of truth for the tag list, so that
    /// `should_handle` and `handle_tag_start` cannot drift apart.
    fn is_chrome_tag(tag: &str) -> bool {
        matches!(tag, "head" | "script" | "style" | "nav")
    }
}

impl HandleTag for WebpageChromeRemover {
    /// Reports whether this handler wants to see `tag` (only chrome tags).
    fn should_handle(&self, tag: &str) -> bool {
        Self::is_chrome_tag(tag)
    }

    /// Answers `Skip` for chrome elements and `Continue` for anything else.
    ///
    /// The writer is never touched: this handler emits no output of its own.
    fn handle_tag_start(
        &mut self,
        tag: &HtmlElement,
        _writer: &mut MarkdownWriter,
    ) -> StartTagOutcome {
        if Self::is_chrome_tag(tag.tag()) {
            StartTagOutcome::Skip
        } else {
            StartTagOutcome::Continue
        }
    }
}
pub struct ParagraphHandler;
impl HandleTag for ParagraphHandler {