gleam: Improve indexing of HexDocs (#13787)

This PR improves the indexing of HexDocs content for Gleam packages.

We now index each of the modules in the package instead of just the
root.

Release Notes:

- N/A
This commit is contained in:
Marshall Bowers 2024-07-03 12:57:08 -04:00 committed by GitHub
parent f024fcff3d
commit 98699a65c1
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 211 additions and 37 deletions

View File

@ -1,7 +1,6 @@
use html_to_markdown::{convert_html_to_markdown, TagHandler};
use std::cell::RefCell;
mod hexdocs;
use std::fs;
use std::rc::Rc;
use zed::lsp::CompletionKind;
use zed::{
CodeLabel, CodeLabelSpan, HttpRequest, KeyValueStore, LanguageServerId, SlashCommand,
@ -9,6 +8,8 @@ use zed::{
};
use zed_extension_api::{self as zed, Result};
use crate::hexdocs::convert_hexdocs_to_markdown;
struct GleamExtension {
cached_binary_path: Option<String>,
}
@ -191,19 +192,7 @@ impl zed::Extension for GleamExtension {
),
})?;
let mut handlers: Vec<TagHandler> = vec![
Rc::new(RefCell::new(
html_to_markdown::markdown::WebpageChromeRemover,
)),
Rc::new(RefCell::new(html_to_markdown::markdown::ParagraphHandler)),
Rc::new(RefCell::new(html_to_markdown::markdown::HeadingHandler)),
Rc::new(RefCell::new(html_to_markdown::markdown::ListHandler)),
Rc::new(RefCell::new(html_to_markdown::markdown::TableHandler::new())),
Rc::new(RefCell::new(html_to_markdown::markdown::StyledTextHandler)),
];
let markdown = convert_html_to_markdown(response.body.as_bytes(), &mut handlers)
.map_err(|err| format!("failed to convert docs to Markdown {err}"))?;
let (markdown, _modules) = convert_hexdocs_to_markdown(response.body.as_bytes())?;
let mut text = String::new();
text.push_str(&markdown);
@ -244,27 +233,7 @@ impl zed::Extension for GleamExtension {
database: &KeyValueStore,
) -> Result<(), String> {
match provider.as_str() {
"gleam-hexdocs" => {
let response = zed::fetch(&HttpRequest {
url: format!("https://hexdocs.pm/{package}"),
})?;
let mut handlers: Vec<TagHandler> = vec![
Rc::new(RefCell::new(
html_to_markdown::markdown::WebpageChromeRemover,
)),
Rc::new(RefCell::new(html_to_markdown::markdown::ParagraphHandler)),
Rc::new(RefCell::new(html_to_markdown::markdown::HeadingHandler)),
Rc::new(RefCell::new(html_to_markdown::markdown::ListHandler)),
Rc::new(RefCell::new(html_to_markdown::markdown::TableHandler::new())),
Rc::new(RefCell::new(html_to_markdown::markdown::StyledTextHandler)),
];
let markdown = convert_html_to_markdown(response.body.as_bytes(), &mut handlers)
.map_err(|err| format!("failed to convert docs to Markdown {err}"))?;
Ok(database.insert(&package, &markdown)?)
}
"gleam-hexdocs" => hexdocs::index(package, database),
_ => Ok(()),
}
}

View File

@ -0,0 +1,205 @@
use std::cell::RefCell;
use std::collections::BTreeSet;
use std::io::Read;
use std::rc::Rc;
use html_to_markdown::markdown::{
HeadingHandler, ListHandler, ParagraphHandler, StyledTextHandler, TableHandler,
};
use html_to_markdown::{
convert_html_to_markdown, HandleTag, HandlerOutcome, HtmlElement, MarkdownWriter,
StartTagOutcome, TagHandler,
};
use zed_extension_api::{self as zed, HttpRequest, KeyValueStore, Result};
/// Indexes the HexDocs documentation of `package` into `database`.
///
/// The package's root page is fetched from `https://hexdocs.pm/{package}`,
/// converted to Markdown, and stored under the package name. Every module
/// discovered on the root page is then fetched from
/// `https://hexdocs.pm/{package}/{module}.html`, converted, and stored under
/// the module name.
///
/// # Errors
///
/// Returns the first error produced by a fetch, a conversion, or a database
/// insert. Entries inserted before the failure are not rolled back here.
pub fn index(package: String, database: &KeyValueStore) -> Result<()> {
    let root_page = zed::fetch(&HttpRequest {
        url: format!("https://hexdocs.pm/{package}"),
    })?;
    let (root_markdown, module_names) = convert_hexdocs_to_markdown(root_page.body.as_bytes())?;
    database.insert(&package, &root_markdown)?;

    // NOTE(review): module pages are keyed by the bare module name, so two
    // packages exposing a module with the same name would presumably
    // overwrite each other's entry — confirm against the lookup side.
    for module_name in module_names {
        let module_page = zed::fetch(&HttpRequest {
            url: format!("https://hexdocs.pm/{package}/{module_name}.html"),
        })?;
        let (module_markdown, _) = convert_hexdocs_to_markdown(module_page.body.as_bytes())?;
        database.insert(&module_name, &module_markdown)?;
    }

    Ok(())
}
/// Converts a HexDocs HTML page into Markdown and collects the module names
/// linked from it.
///
/// Returns the rendered Markdown together with the collected module names.
/// The names come out of a `BTreeSet`, so they are sorted and free of
/// duplicates.
///
/// # Errors
///
/// Returns an error message if the HTML-to-Markdown conversion fails.
pub fn convert_hexdocs_to_markdown(html: impl Read) -> Result<(String, Vec<String>)> {
    // Keep our own handle on the collector so its results can be read back
    // after the conversion has run.
    let collector = Rc::new(RefCell::new(GleamModuleCollector::new()));

    let mut handlers: Vec<TagHandler> = vec![
        collector.clone(),
        Rc::new(RefCell::new(GleamChromeRemover)),
        Rc::new(RefCell::new(NavSkipper::new(ParagraphHandler))),
        Rc::new(RefCell::new(NavSkipper::new(HeadingHandler))),
        Rc::new(RefCell::new(NavSkipper::new(ListHandler))),
        Rc::new(RefCell::new(NavSkipper::new(TableHandler::new()))),
        Rc::new(RefCell::new(NavSkipper::new(StyledTextHandler))),
    ];

    let markdown = match convert_html_to_markdown(html, &mut handlers) {
        Ok(markdown) => markdown,
        Err(err) => return Err(format!("failed to convert docs to Markdown {err}")),
    };

    let modules: Vec<String> = collector.borrow().modules.iter().cloned().collect();

    Ok((markdown, modules))
}
/// A wrapper around another [`HandleTag`] implementation that suppresses the
/// wrapped handler while the writer is inside a `nav` element.
///
/// The `nav` still has to be traversed so that information can be collected
/// from it, but none of its content should end up in the resulting Markdown.
pub struct NavSkipper<T>
where
    T: HandleTag,
{
    /// The handler that every call outside of a `nav` is delegated to.
    handler: T,
}

impl<T> NavSkipper<T>
where
    T: HandleTag,
{
    /// Wraps `handler` so that it is bypassed inside `nav` elements.
    pub fn new(handler: T) -> Self {
        NavSkipper { handler }
    }
}
impl<T: HandleTag> HandleTag for NavSkipper<T> {
    /// Claims `nav` itself in addition to every tag the wrapped handler claims.
    fn should_handle(&self, tag: &str) -> bool {
        if tag == "nav" {
            return true;
        }

        self.handler.should_handle(tag)
    }

    /// Delegates to the wrapped handler unless the writer is inside a `nav`,
    /// in which case traversal simply continues.
    fn handle_tag_start(
        &mut self,
        tag: &HtmlElement,
        writer: &mut MarkdownWriter,
    ) -> StartTagOutcome {
        if writer.is_inside("nav") {
            StartTagOutcome::Continue
        } else {
            self.handler.handle_tag_start(tag, writer)
        }
    }

    /// Delegates to the wrapped handler unless the writer is inside a `nav`.
    fn handle_tag_end(&mut self, tag: &HtmlElement, writer: &mut MarkdownWriter) {
        if !writer.is_inside("nav") {
            self.handler.handle_tag_end(tag, writer);
        }
    }

    /// Reports text inside a `nav` as handled without forwarding it to the
    /// wrapped handler; all other text is delegated.
    fn handle_text(&mut self, text: &str, writer: &mut MarkdownWriter) -> HandlerOutcome {
        if writer.is_inside("nav") {
            HandlerOutcome::Handled
        } else {
            self.handler.handle_text(text, writer)
        }
    }
}
/// A handler that skips page chrome on Gleam HexDocs pages: `head`, `script`,
/// `style`, `svg`, `header`, and `footer` elements, as well as links that
/// carry an `onclick` attribute.
pub struct GleamChromeRemover;

impl HandleTag for GleamChromeRemover {
    /// Claims every chrome tag, plus `a` so that links can be inspected.
    fn should_handle(&self, tag: &str) -> bool {
        matches!(
            tag,
            "head" | "script" | "style" | "svg" | "header" | "footer" | "a"
        )
    }

    /// Skips chrome elements and `onclick` links; everything else continues.
    fn handle_tag_start(
        &mut self,
        tag: &HtmlElement,
        _writer: &mut MarkdownWriter,
    ) -> StartTagOutcome {
        let is_chrome = match tag.tag() {
            "head" | "script" | "style" | "svg" | "header" | "footer" => true,
            // Only links with an `onclick` attribute are dropped.
            "a" => tag.attr("onclick").is_some(),
            _ => false,
        };

        if is_chrome {
            StartTagOutcome::Skip
        } else {
            StartTagOutcome::Continue
        }
    }
}
/// A handler that collects the names of the modules linked from a HexDocs
/// page.
pub struct GleamModuleCollector {
    /// Names of the modules found so far, sorted and deduplicated.
    modules: BTreeSet<String>,
    /// Whether the "Modules" heading has been encountered yet; links are only
    /// collected once it has been.
    has_seen_modules_header: bool,
}

impl GleamModuleCollector {
    /// Creates a collector with no modules that has not yet seen the
    /// "Modules" heading.
    pub fn new() -> Self {
        Self {
            modules: BTreeSet::new(),
            has_seen_modules_header: false,
        }
    }

    /// Extracts a module name from an `a` element.
    ///
    /// Returns `None` when `tag` is not an `a`, has no `href`, or its `href`
    /// does not point at a module page (see [`Self::module_name_from_href`]).
    fn parse_module(tag: &HtmlElement) -> Option<String> {
        if tag.tag() != "a" {
            return None;
        }

        let href = tag.attr("href")?;
        Self::module_name_from_href(&href)
    }

    /// Derives a module name from a link target.
    ///
    /// In-page anchors, absolute `http(s)://` links, and links that leave the
    /// package (`../`) are rejected. For anything else a single leading `./`
    /// and a single trailing `.html` are removed, so that appending `.html`
    /// to the result yields the linked page again. An empty result is
    /// rejected as well.
    fn module_name_from_href(href: &str) -> Option<String> {
        const NON_MODULE_PREFIXES: [&str; 4] = ["#", "https://", "http://", "../"];

        if NON_MODULE_PREFIXES
            .iter()
            .any(|&prefix| href.starts_with(prefix))
        {
            return None;
        }

        // Strip each affix at most once: the repeated stripping done by
        // `trim_*_matches` would turn e.g. `a.html.html` into `a`, which no
        // longer identifies the linked page.
        let module_name = href.strip_prefix("./").unwrap_or(href);
        let module_name = module_name.strip_suffix(".html").unwrap_or(module_name);

        (!module_name.is_empty()).then(|| module_name.to_owned())
    }
}
impl HandleTag for GleamModuleCollector {
    /// Claims `h2` (to spot the "Modules" heading) and `a` (to read links).
    fn should_handle(&self, tag: &str) -> bool {
        matches!(tag, "h2" | "a")
    }

    /// Records the module behind an `a` element that sits inside an `li`,
    /// once the "Modules" heading has been seen. Never alters traversal.
    fn handle_tag_start(
        &mut self,
        tag: &HtmlElement,
        writer: &mut MarkdownWriter,
    ) -> StartTagOutcome {
        let is_module_link_candidate =
            tag.tag() == "a" && self.has_seen_modules_header && writer.is_inside("li");

        if is_module_link_candidate {
            // `parse_module` yields at most one name; `extend` inserts it if present.
            self.modules.extend(Self::parse_module(tag));
        }

        StartTagOutcome::Continue
    }

    /// Flags the "Modules" heading when its text shows up inside an `h2`
    /// within a `nav`. The text itself is left for other handlers.
    fn handle_text(&mut self, text: &str, writer: &mut MarkdownWriter) -> HandlerOutcome {
        let is_modules_heading =
            writer.is_inside("nav") && writer.is_inside("h2") && text == "Modules";

        if is_modules_heading {
            self.has_seen_modules_header = true;
        }

        HandlerOutcome::NoOp
    }
}