From c3df9b79c652d283f2559f6cd71fbee642f8d566 Mon Sep 17 00:00:00 2001 From: Marshall Bowers Date: Wed, 12 Jun 2024 15:21:50 -0400 Subject: [PATCH] Start on rustdoc crawler (#12942) This PR adds a first pass at a rustdoc crawler. We'll be using this to get information about a crate from the rustdoc artifacts for use in the Assistant. Release Notes: - N/A --------- Co-authored-by: Richard --- Cargo.lock | 13 ++ Cargo.toml | 2 + .../html_to_markdown/src/html_to_markdown.rs | 2 +- .../html_to_markdown/src/structure/rustdoc.rs | 58 +++-- crates/rustdoc/Cargo.toml | 23 ++ crates/rustdoc/LICENSE-GPL | 1 + crates/rustdoc/src/crawler.rs | 211 ++++++++++++++++++ crates/rustdoc/src/rustdoc.rs | 1 + 8 files changed, 292 insertions(+), 19 deletions(-) create mode 100644 crates/rustdoc/Cargo.toml create mode 120000 crates/rustdoc/LICENSE-GPL create mode 100644 crates/rustdoc/src/crawler.rs create mode 100644 crates/rustdoc/src/rustdoc.rs diff --git a/Cargo.lock b/Cargo.lock index e19273ee8e..fb0f84637e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -8664,6 +8664,19 @@ dependencies = [ "semver", ] +[[package]] +name = "rustdoc" +version = "0.1.0" +dependencies = [ + "anyhow", + "async-trait", + "collections", + "fs", + "futures 0.3.28", + "html_to_markdown", + "http 0.1.0", +] + [[package]] name = "rustix" version = "0.37.23" diff --git a/Cargo.toml b/Cargo.toml index 79510e808e..336d5d8559 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -79,6 +79,7 @@ members = [ "crates/rich_text", "crates/rope", "crates/rpc", + "crates/rustdoc", "crates/task", "crates/tasks_ui", "crates/search", @@ -227,6 +228,7 @@ dev_server_projects = { path = "crates/dev_server_projects" } rich_text = { path = "crates/rich_text" } rope = { path = "crates/rope" } rpc = { path = "crates/rpc" } +rustdoc = { path = "crates/rustdoc" } task = { path = "crates/task" } tasks_ui = { path = "crates/tasks_ui" } search = { path = "crates/search" } diff --git a/crates/html_to_markdown/src/html_to_markdown.rs 
b/crates/html_to_markdown/src/html_to_markdown.rs index 3246b53a42..be89050054 100644 --- a/crates/html_to_markdown/src/html_to_markdown.rs +++ b/crates/html_to_markdown/src/html_to_markdown.rs @@ -58,7 +58,7 @@ pub fn convert_rustdoc_to_markdown(html: impl Read) -> Result<(String, Vec>(); diff --git a/crates/html_to_markdown/src/structure/rustdoc.rs b/crates/html_to_markdown/src/structure/rustdoc.rs index 20ed6b1748..16eae9cb2f 100644 --- a/crates/html_to_markdown/src/structure/rustdoc.rs +++ b/crates/html_to_markdown/src/structure/rustdoc.rs @@ -1,4 +1,6 @@ -use indexmap::IndexMap; +use std::sync::Arc; + +use indexmap::IndexSet; use strum::{EnumIter, IntoEnumIterator}; use crate::html_element::HtmlElement; @@ -238,17 +240,25 @@ impl RustdocItemKind { } } -#[derive(Debug, Clone)] +#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Clone)] pub struct RustdocItem { pub kind: RustdocItemKind, - pub name: String, + /// The item path, up until the name of the item. + pub path: Vec<Arc<str>>, + /// The name of the item. 
+ pub name: Arc<str>, } impl RustdocItem { pub fn url_path(&self) -> String { let name = &self.name; + let mut path_components = self.path.clone(); + match self.kind { - RustdocItemKind::Mod => format!("{name}/index.html"), + RustdocItemKind::Mod => { + path_components.push(name.clone()); + path_components.push("index.html".into()); + } RustdocItemKind::Macro | RustdocItemKind::Struct | RustdocItemKind::Enum @@ -258,20 +268,23 @@ impl RustdocItem { | RustdocItemKind::TypeAlias | RustdocItemKind::AttributeMacro | RustdocItemKind::DeriveMacro => { - format!("{kind}.{name}.html", kind = self.kind.class()) + path_components + .push(format!("{kind}.{name}.html", kind = self.kind.class()).into()); } } + + path_components.join("/") } } pub struct RustdocItemCollector { - pub items: IndexMap<(RustdocItemKind, String), RustdocItem>, + pub items: IndexSet<RustdocItem>, } impl RustdocItemCollector { pub fn new() -> Self { Self { - items: IndexMap::new(), + items: IndexSet::new(), } } @@ -281,21 +294,30 @@ impl RustdocItemCollector { } let href = tag.attr("href")?; - if href == "#" { + if href.starts_with('#') || href.starts_with("https://") || href.starts_with("../") { return None; } for kind in RustdocItemKind::iter() { if tag.has_class(kind.class()) { - let name = href - .trim_start_matches(&format!("{}.", kind.class())) - .trim_end_matches("/index.html") - .trim_end_matches(".html"); + let mut parts = href.trim_end_matches("/index.html").split('/'); - return Some(RustdocItem { - kind, - name: name.to_owned(), - }); + if let Some(last_component) = parts.next_back() { + let last_component = match last_component.split_once('#') { + Some((component, _fragment)) => component, + None => last_component, + }; + + let name = last_component + .trim_start_matches(&format!("{}.", kind.class())) + .trim_end_matches(".html"); + + return Some(RustdocItem { + kind, + name: name.into(), + path: parts.map(Into::into).collect(), + }); + } } } @@ -317,7 +339,7 @@ impl HandleTag for RustdocItemCollector { "a" 
=> { let is_reexport = writer.current_element_stack().iter().any(|element| { if let Some(id) = element.attr("id") { - id.starts_with("reexport.") + id.starts_with("reexport.") || id.starts_with("method.") } else { false } @@ -325,7 +347,7 @@ impl HandleTag for RustdocItemCollector { if !is_reexport { if let Some(item) = Self::parse_item(tag) { - self.items.insert((item.kind, item.name.clone()), item); + self.items.insert(item); } } } diff --git a/crates/rustdoc/Cargo.toml b/crates/rustdoc/Cargo.toml new file mode 100644 index 0000000000..2a3f9a476f --- /dev/null +++ b/crates/rustdoc/Cargo.toml @@ -0,0 +1,23 @@ +[package] +name = "rustdoc" +version = "0.1.0" +edition = "2021" +publish = false +license = "GPL-3.0-or-later" + +[lints] +workspace = true + +[lib] +path = "src/rustdoc.rs" + +[dependencies] +anyhow.workspace = true +async-trait.workspace = true +collections.workspace = true +fs.workspace = true +futures.workspace = true +html_to_markdown.workspace = true +http.workspace = true + +[dev-dependencies] diff --git a/crates/rustdoc/LICENSE-GPL b/crates/rustdoc/LICENSE-GPL new file mode 120000 index 0000000000..89e542f750 --- /dev/null +++ b/crates/rustdoc/LICENSE-GPL @@ -0,0 +1 @@ +../../LICENSE-GPL \ No newline at end of file diff --git a/crates/rustdoc/src/crawler.rs b/crates/rustdoc/src/crawler.rs new file mode 100644 index 0000000000..bc829c8860 --- /dev/null +++ b/crates/rustdoc/src/crawler.rs @@ -0,0 +1,211 @@ +use std::path::PathBuf; +use std::sync::Arc; + +use anyhow::{bail, Context, Result}; +use async_trait::async_trait; +use collections::{HashSet, VecDeque}; +use fs::Fs; +use futures::AsyncReadExt; +use html_to_markdown::convert_rustdoc_to_markdown; +use html_to_markdown::structure::rustdoc::{RustdocItem, RustdocItemKind}; +use http::{AsyncBody, HttpClient, HttpClientWithUrl}; + +#[derive(Debug, Clone, Copy)] +pub enum RustdocSource { + /// The docs were sourced from local `cargo doc` output. + Local, + /// The docs were sourced from `docs.rs`. 
+ DocsDotRs, +} + +#[async_trait] +pub trait RustdocProvider { + async fn fetch_page( + &self, + crate_name: &str, + item: Option<&RustdocItem>, + ) -> Result<Option<String>>; +} + +pub struct LocalProvider { + fs: Arc<dyn Fs>, + cargo_workspace_root: PathBuf, +} + +impl LocalProvider { + pub fn new(fs: Arc<dyn Fs>, cargo_workspace_root: PathBuf) -> Self { + Self { + fs, + cargo_workspace_root, + } + } +} + +#[async_trait] +impl RustdocProvider for LocalProvider { + async fn fetch_page( + &self, + crate_name: &str, + item: Option<&RustdocItem>, + ) -> Result<Option<String>> { + let mut local_cargo_doc_path = self.cargo_workspace_root.join("target/doc"); + local_cargo_doc_path.push(&crate_name); + if let Some(item) = item { + if !item.path.is_empty() { + local_cargo_doc_path.push(item.path.join("/")); + } + } + local_cargo_doc_path.push("index.html"); + + let Ok(contents) = self.fs.load(&local_cargo_doc_path).await else { + return Ok(None); + }; + + Ok(Some(contents)) + } +} + +pub struct DocsDotRsProvider { + http_client: Arc<HttpClientWithUrl>, +} + +impl DocsDotRsProvider { + pub fn new(http_client: Arc<HttpClientWithUrl>) -> Self { + Self { http_client } + } +} + +#[async_trait] +impl RustdocProvider for DocsDotRsProvider { + async fn fetch_page( + &self, + crate_name: &str, + item: Option<&RustdocItem>, + ) -> Result<Option<String>> { + let version = "latest"; + let path = format!( + "{crate_name}/{version}/{crate_name}{item_path}", + item_path = item + .map(|item| format!("/{}", item.url_path())) + .unwrap_or_default() + ); + + println!("Fetching {}", &format!("https://docs.rs/{path}")); + + let mut response = self + .http_client + .get( + &format!("https://docs.rs/{path}"), + AsyncBody::default(), + true, + ) + .await?; + + let mut body = Vec::new(); + response + .body_mut() + .read_to_end(&mut body) + .await + .context("error reading docs.rs response body")?; + + if response.status().is_client_error() { + let text = String::from_utf8_lossy(body.as_slice()); + bail!( + "status error {}, response: {text:?}", + response.status().as_u16() + ); + } + + 
Ok(Some(String::from_utf8(body)?)) + } +} + +pub struct RustdocItemWithHistory { + pub item: RustdocItem, + #[cfg(debug_assertions)] + pub history: Vec<String>, +} + +pub struct RustdocCrawler { + provider: Box<dyn RustdocProvider>, +} + +impl RustdocCrawler { + pub fn new(provider: Box<dyn RustdocProvider>) -> Self { + Self { provider } + } + + pub async fn crawl(&self, crate_name: String) -> Result<Option<String>> { + let Some(crate_index_content) = self.provider.fetch_page(&crate_name, None).await? else { + return Ok(None); + }; + + let (_markdown, items) = convert_rustdoc_to_markdown(crate_index_content.as_bytes())?; + + let mut seen_items = HashSet::default(); + let mut items_to_visit: VecDeque<RustdocItemWithHistory> = + VecDeque::from_iter(items.into_iter().map(|item| RustdocItemWithHistory { + item, + #[cfg(debug_assertions)] + history: Vec::new(), + })); + + while let Some(item_with_history) = items_to_visit.pop_front() { + let item = &item_with_history.item; + println!("Visiting {:?} {:?} {}", &item.kind, &item.path, &item.name); + + let Some(result) = self + .provider + .fetch_page(&crate_name, Some(&item)) + .await + .with_context(|| { + #[cfg(debug_assertions)] + { + format!( + "failed to fetch {item:?}: {history:?}", + history = item_with_history.history + ) + } + + #[cfg(not(debug_assertions))] + { + format!("failed to fetch {item:?}") + } + })? 
+ else { + continue; + }; + + let (_markdown, mut items) = convert_rustdoc_to_markdown(result.as_bytes())?; + + seen_items.insert(item.clone()); + + for child in &mut items { + child.path.extend(item.path.clone()); + match item.kind { + RustdocItemKind::Mod => { + child.path.push(item.name.clone()); + } + _ => {} + } + } + + let unseen_items = items + .into_iter() + .map(|item| RustdocItemWithHistory { + #[cfg(debug_assertions)] + history: { + let mut history = item_with_history.history.clone(); + history.push(item.url_path()); + history + }, + item, + }) + .filter(|item| !seen_items.contains(&item.item)); + + items_to_visit.extend(unseen_items); + } + + Ok(Some(String::new())) + } +} diff --git a/crates/rustdoc/src/rustdoc.rs b/crates/rustdoc/src/rustdoc.rs new file mode 100644 index 0000000000..145a637bc3 --- /dev/null +++ b/crates/rustdoc/src/rustdoc.rs @@ -0,0 +1 @@ +pub mod crawler;