mirror of
https://github.com/zed-industries/zed.git
synced 2024-09-18 18:08:07 +03:00
gleam: Improve indexing of HexDocs (#13787)
This PR improves the indexing of HexDocs content for Gleam packages. We now index each of the modules in the package instead of just the root. Release Notes: - N/A
This commit is contained in:
parent
f024fcff3d
commit
98699a65c1
@ -1,7 +1,6 @@
|
||||
use html_to_markdown::{convert_html_to_markdown, TagHandler};
|
||||
use std::cell::RefCell;
|
||||
mod hexdocs;
|
||||
|
||||
use std::fs;
|
||||
use std::rc::Rc;
|
||||
use zed::lsp::CompletionKind;
|
||||
use zed::{
|
||||
CodeLabel, CodeLabelSpan, HttpRequest, KeyValueStore, LanguageServerId, SlashCommand,
|
||||
@ -9,6 +8,8 @@ use zed::{
|
||||
};
|
||||
use zed_extension_api::{self as zed, Result};
|
||||
|
||||
use crate::hexdocs::convert_hexdocs_to_markdown;
|
||||
|
||||
struct GleamExtension {
|
||||
cached_binary_path: Option<String>,
|
||||
}
|
||||
@ -191,19 +192,7 @@ impl zed::Extension for GleamExtension {
|
||||
),
|
||||
})?;
|
||||
|
||||
let mut handlers: Vec<TagHandler> = vec![
|
||||
Rc::new(RefCell::new(
|
||||
html_to_markdown::markdown::WebpageChromeRemover,
|
||||
)),
|
||||
Rc::new(RefCell::new(html_to_markdown::markdown::ParagraphHandler)),
|
||||
Rc::new(RefCell::new(html_to_markdown::markdown::HeadingHandler)),
|
||||
Rc::new(RefCell::new(html_to_markdown::markdown::ListHandler)),
|
||||
Rc::new(RefCell::new(html_to_markdown::markdown::TableHandler::new())),
|
||||
Rc::new(RefCell::new(html_to_markdown::markdown::StyledTextHandler)),
|
||||
];
|
||||
|
||||
let markdown = convert_html_to_markdown(response.body.as_bytes(), &mut handlers)
|
||||
.map_err(|err| format!("failed to convert docs to Markdown {err}"))?;
|
||||
let (markdown, _modules) = convert_hexdocs_to_markdown(response.body.as_bytes())?;
|
||||
|
||||
let mut text = String::new();
|
||||
text.push_str(&markdown);
|
||||
@ -244,27 +233,7 @@ impl zed::Extension for GleamExtension {
|
||||
database: &KeyValueStore,
|
||||
) -> Result<(), String> {
|
||||
match provider.as_str() {
|
||||
"gleam-hexdocs" => {
|
||||
let response = zed::fetch(&HttpRequest {
|
||||
url: format!("https://hexdocs.pm/{package}"),
|
||||
})?;
|
||||
|
||||
let mut handlers: Vec<TagHandler> = vec![
|
||||
Rc::new(RefCell::new(
|
||||
html_to_markdown::markdown::WebpageChromeRemover,
|
||||
)),
|
||||
Rc::new(RefCell::new(html_to_markdown::markdown::ParagraphHandler)),
|
||||
Rc::new(RefCell::new(html_to_markdown::markdown::HeadingHandler)),
|
||||
Rc::new(RefCell::new(html_to_markdown::markdown::ListHandler)),
|
||||
Rc::new(RefCell::new(html_to_markdown::markdown::TableHandler::new())),
|
||||
Rc::new(RefCell::new(html_to_markdown::markdown::StyledTextHandler)),
|
||||
];
|
||||
|
||||
let markdown = convert_html_to_markdown(response.body.as_bytes(), &mut handlers)
|
||||
.map_err(|err| format!("failed to convert docs to Markdown {err}"))?;
|
||||
|
||||
Ok(database.insert(&package, &markdown)?)
|
||||
}
|
||||
"gleam-hexdocs" => hexdocs::index(package, database),
|
||||
_ => Ok(()),
|
||||
}
|
||||
}
|
||||
|
205
extensions/gleam/src/hexdocs.rs
Normal file
205
extensions/gleam/src/hexdocs.rs
Normal file
@ -0,0 +1,205 @@
|
||||
use std::cell::RefCell;
|
||||
use std::collections::BTreeSet;
|
||||
use std::io::Read;
|
||||
use std::rc::Rc;
|
||||
|
||||
use html_to_markdown::markdown::{
|
||||
HeadingHandler, ListHandler, ParagraphHandler, StyledTextHandler, TableHandler,
|
||||
};
|
||||
use html_to_markdown::{
|
||||
convert_html_to_markdown, HandleTag, HandlerOutcome, HtmlElement, MarkdownWriter,
|
||||
StartTagOutcome, TagHandler,
|
||||
};
|
||||
use zed_extension_api::{self as zed, HttpRequest, KeyValueStore, Result};
|
||||
|
||||
pub fn index(package: String, database: &KeyValueStore) -> Result<()> {
|
||||
let response = zed::fetch(&HttpRequest {
|
||||
url: format!("https://hexdocs.pm/{package}"),
|
||||
})?;
|
||||
|
||||
let (package_root_markdown, modules) = convert_hexdocs_to_markdown(response.body.as_bytes())?;
|
||||
|
||||
database.insert(&package, &package_root_markdown)?;
|
||||
|
||||
for module in modules {
|
||||
let response = zed::fetch(&HttpRequest {
|
||||
url: format!("https://hexdocs.pm/{package}/{module}.html"),
|
||||
})?;
|
||||
|
||||
let (markdown, _modules) = convert_hexdocs_to_markdown(response.body.as_bytes())?;
|
||||
|
||||
database.insert(&module, &markdown)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn convert_hexdocs_to_markdown(html: impl Read) -> Result<(String, Vec<String>)> {
|
||||
let module_collector = Rc::new(RefCell::new(GleamModuleCollector::new()));
|
||||
|
||||
let mut handlers: Vec<TagHandler> = vec![
|
||||
module_collector.clone(),
|
||||
Rc::new(RefCell::new(GleamChromeRemover)),
|
||||
Rc::new(RefCell::new(NavSkipper::new(ParagraphHandler))),
|
||||
Rc::new(RefCell::new(NavSkipper::new(HeadingHandler))),
|
||||
Rc::new(RefCell::new(NavSkipper::new(ListHandler))),
|
||||
Rc::new(RefCell::new(NavSkipper::new(TableHandler::new()))),
|
||||
Rc::new(RefCell::new(NavSkipper::new(StyledTextHandler))),
|
||||
];
|
||||
|
||||
let markdown = convert_html_to_markdown(html, &mut handlers)
|
||||
.map_err(|err| format!("failed to convert docs to Markdown {err}"))?;
|
||||
|
||||
let modules = module_collector
|
||||
.borrow()
|
||||
.modules
|
||||
.iter()
|
||||
.cloned()
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
Ok((markdown, modules))
|
||||
}
|
||||
|
||||
/// A higher-order handler that skips all content from the `nav`.
|
||||
///
|
||||
/// We still need to traverse the `nav` for collecting information, but
|
||||
/// we don't want to include any of its content in the resulting Markdown.
|
||||
pub struct NavSkipper<T: HandleTag> {
|
||||
handler: T,
|
||||
}
|
||||
|
||||
impl<T: HandleTag> NavSkipper<T> {
|
||||
pub fn new(handler: T) -> Self {
|
||||
Self { handler }
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: HandleTag> HandleTag for NavSkipper<T> {
|
||||
fn should_handle(&self, tag: &str) -> bool {
|
||||
tag == "nav" || self.handler.should_handle(tag)
|
||||
}
|
||||
|
||||
fn handle_tag_start(
|
||||
&mut self,
|
||||
tag: &HtmlElement,
|
||||
writer: &mut MarkdownWriter,
|
||||
) -> StartTagOutcome {
|
||||
if writer.is_inside("nav") {
|
||||
return StartTagOutcome::Continue;
|
||||
}
|
||||
|
||||
self.handler.handle_tag_start(tag, writer)
|
||||
}
|
||||
|
||||
fn handle_tag_end(&mut self, tag: &HtmlElement, writer: &mut MarkdownWriter) {
|
||||
if writer.is_inside("nav") {
|
||||
return;
|
||||
}
|
||||
|
||||
self.handler.handle_tag_end(tag, writer)
|
||||
}
|
||||
|
||||
fn handle_text(&mut self, text: &str, writer: &mut MarkdownWriter) -> HandlerOutcome {
|
||||
if writer.is_inside("nav") {
|
||||
return HandlerOutcome::Handled;
|
||||
}
|
||||
|
||||
self.handler.handle_text(text, writer)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct GleamChromeRemover;
|
||||
|
||||
impl HandleTag for GleamChromeRemover {
|
||||
fn should_handle(&self, tag: &str) -> bool {
|
||||
match tag {
|
||||
"head" | "script" | "style" | "svg" | "header" | "footer" | "a" => true,
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
|
||||
fn handle_tag_start(
|
||||
&mut self,
|
||||
tag: &HtmlElement,
|
||||
_writer: &mut MarkdownWriter,
|
||||
) -> StartTagOutcome {
|
||||
match tag.tag() {
|
||||
"head" | "script" | "style" | "svg" | "header" | "footer" => {
|
||||
return StartTagOutcome::Skip;
|
||||
}
|
||||
"a" => {
|
||||
if tag.attr("onclick").is_some() {
|
||||
return StartTagOutcome::Skip;
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
|
||||
StartTagOutcome::Continue
|
||||
}
|
||||
}
|
||||
|
||||
pub struct GleamModuleCollector {
|
||||
modules: BTreeSet<String>,
|
||||
has_seen_modules_header: bool,
|
||||
}
|
||||
|
||||
impl GleamModuleCollector {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
modules: BTreeSet::new(),
|
||||
has_seen_modules_header: false,
|
||||
}
|
||||
}
|
||||
|
||||
fn parse_module(tag: &HtmlElement) -> Option<String> {
|
||||
if tag.tag() != "a" {
|
||||
return None;
|
||||
}
|
||||
|
||||
let href = tag.attr("href")?;
|
||||
if href.starts_with('#') || href.starts_with("https://") || href.starts_with("../") {
|
||||
return None;
|
||||
}
|
||||
|
||||
let module_name = href.trim_start_matches("./").trim_end_matches(".html");
|
||||
|
||||
Some(module_name.to_owned())
|
||||
}
|
||||
}
|
||||
|
||||
impl HandleTag for GleamModuleCollector {
|
||||
fn should_handle(&self, tag: &str) -> bool {
|
||||
match tag {
|
||||
"h2" | "a" => true,
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
|
||||
fn handle_tag_start(
|
||||
&mut self,
|
||||
tag: &HtmlElement,
|
||||
writer: &mut MarkdownWriter,
|
||||
) -> StartTagOutcome {
|
||||
match tag.tag() {
|
||||
"a" => {
|
||||
if self.has_seen_modules_header && writer.is_inside("li") {
|
||||
if let Some(module_name) = Self::parse_module(tag) {
|
||||
self.modules.insert(module_name);
|
||||
}
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
|
||||
StartTagOutcome::Continue
|
||||
}
|
||||
|
||||
fn handle_text(&mut self, text: &str, writer: &mut MarkdownWriter) -> HandlerOutcome {
|
||||
if writer.is_inside("nav") && writer.is_inside("h2") && text == "Modules" {
|
||||
self.has_seen_modules_header = true;
|
||||
}
|
||||
|
||||
HandlerOutcome::NoOp
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue
Block a user