Start on rustdoc crawler (#12942)

This PR adds a first pass at a rustdoc crawler.

We'll be using this to get information about a crate from the rustdoc
artifacts for use in the Assistant.

Release Notes:

- N/A

---------

Co-authored-by: Richard <richard@zed.dev>
This commit is contained in:
Marshall Bowers 2024-06-12 15:21:50 -04:00 committed by GitHub
parent 72dac24acf
commit c3df9b79c6
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 292 additions and 19 deletions

13
Cargo.lock generated
View File

@ -8664,6 +8664,19 @@ dependencies = [
"semver", "semver",
] ]
[[package]]
name = "rustdoc"
version = "0.1.0"
dependencies = [
"anyhow",
"async-trait",
"collections",
"fs",
"futures 0.3.28",
"html_to_markdown",
"http 0.1.0",
]
[[package]] [[package]]
name = "rustix" name = "rustix"
version = "0.37.23" version = "0.37.23"

View File

@ -79,6 +79,7 @@ members = [
"crates/rich_text", "crates/rich_text",
"crates/rope", "crates/rope",
"crates/rpc", "crates/rpc",
"crates/rustdoc",
"crates/task", "crates/task",
"crates/tasks_ui", "crates/tasks_ui",
"crates/search", "crates/search",
@ -227,6 +228,7 @@ dev_server_projects = { path = "crates/dev_server_projects" }
rich_text = { path = "crates/rich_text" } rich_text = { path = "crates/rich_text" }
rope = { path = "crates/rope" } rope = { path = "crates/rope" }
rpc = { path = "crates/rpc" } rpc = { path = "crates/rpc" }
rustdoc = { path = "crates/rustdoc" }
task = { path = "crates/task" } task = { path = "crates/task" }
tasks_ui = { path = "crates/tasks_ui" } tasks_ui = { path = "crates/tasks_ui" }
search = { path = "crates/search" } search = { path = "crates/search" }

View File

@ -58,7 +58,7 @@ pub fn convert_rustdoc_to_markdown(html: impl Read) -> Result<(String, Vec<Rustd
let items = item_collector let items = item_collector
.borrow() .borrow()
.items .items
.values() .iter()
.cloned() .cloned()
.collect::<Vec<_>>(); .collect::<Vec<_>>();

View File

@ -1,4 +1,6 @@
use indexmap::IndexMap; use std::sync::Arc;
use indexmap::IndexSet;
use strum::{EnumIter, IntoEnumIterator}; use strum::{EnumIter, IntoEnumIterator};
use crate::html_element::HtmlElement; use crate::html_element::HtmlElement;
@ -238,17 +240,25 @@ impl RustdocItemKind {
} }
} }
#[derive(Debug, Clone)] #[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Clone)]
pub struct RustdocItem { pub struct RustdocItem {
pub kind: RustdocItemKind, pub kind: RustdocItemKind,
pub name: String, /// The item path, up until the name of the item.
pub path: Vec<Arc<str>>,
/// The name of the item.
pub name: Arc<str>,
} }
impl RustdocItem { impl RustdocItem {
pub fn url_path(&self) -> String { pub fn url_path(&self) -> String {
let name = &self.name; let name = &self.name;
let mut path_components = self.path.clone();
match self.kind { match self.kind {
RustdocItemKind::Mod => format!("{name}/index.html"), RustdocItemKind::Mod => {
path_components.push(name.clone());
path_components.push("index.html".into());
}
RustdocItemKind::Macro RustdocItemKind::Macro
| RustdocItemKind::Struct | RustdocItemKind::Struct
| RustdocItemKind::Enum | RustdocItemKind::Enum
@ -258,20 +268,23 @@ impl RustdocItem {
| RustdocItemKind::TypeAlias | RustdocItemKind::TypeAlias
| RustdocItemKind::AttributeMacro | RustdocItemKind::AttributeMacro
| RustdocItemKind::DeriveMacro => { | RustdocItemKind::DeriveMacro => {
format!("{kind}.{name}.html", kind = self.kind.class()) path_components
.push(format!("{kind}.{name}.html", kind = self.kind.class()).into());
} }
} }
path_components.join("/")
} }
} }
pub struct RustdocItemCollector { pub struct RustdocItemCollector {
pub items: IndexMap<(RustdocItemKind, String), RustdocItem>, pub items: IndexSet<RustdocItem>,
} }
impl RustdocItemCollector { impl RustdocItemCollector {
pub fn new() -> Self { pub fn new() -> Self {
Self { Self {
items: IndexMap::new(), items: IndexSet::new(),
} }
} }
@ -281,21 +294,30 @@ impl RustdocItemCollector {
} }
let href = tag.attr("href")?; let href = tag.attr("href")?;
if href == "#" { if href.starts_with('#') || href.starts_with("https://") || href.starts_with("../") {
return None; return None;
} }
for kind in RustdocItemKind::iter() { for kind in RustdocItemKind::iter() {
if tag.has_class(kind.class()) { if tag.has_class(kind.class()) {
let name = href let mut parts = href.trim_end_matches("/index.html").split('/');
.trim_start_matches(&format!("{}.", kind.class()))
.trim_end_matches("/index.html")
.trim_end_matches(".html");
return Some(RustdocItem { if let Some(last_component) = parts.next_back() {
kind, let last_component = match last_component.split_once('#') {
name: name.to_owned(), Some((component, _fragment)) => component,
}); None => last_component,
};
let name = last_component
.trim_start_matches(&format!("{}.", kind.class()))
.trim_end_matches(".html");
return Some(RustdocItem {
kind,
name: name.into(),
path: parts.map(Into::into).collect(),
});
}
} }
} }
@ -317,7 +339,7 @@ impl HandleTag for RustdocItemCollector {
"a" => { "a" => {
let is_reexport = writer.current_element_stack().iter().any(|element| { let is_reexport = writer.current_element_stack().iter().any(|element| {
if let Some(id) = element.attr("id") { if let Some(id) = element.attr("id") {
id.starts_with("reexport.") id.starts_with("reexport.") || id.starts_with("method.")
} else { } else {
false false
} }
@ -325,7 +347,7 @@ impl HandleTag for RustdocItemCollector {
if !is_reexport { if !is_reexport {
if let Some(item) = Self::parse_item(tag) { if let Some(item) = Self::parse_item(tag) {
self.items.insert((item.kind, item.name.clone()), item); self.items.insert(item);
} }
} }
} }

23
crates/rustdoc/Cargo.toml Normal file
View File

@ -0,0 +1,23 @@
# Manifest for the internal `rustdoc` crate (not published to a registry).
[package]
name = "rustdoc"
version = "0.1.0"
edition = "2021"
publish = false
license = "GPL-3.0-or-later"
# Lint configuration is inherited from the workspace manifest.
[lints]
workspace = true
# The library root is `src/rustdoc.rs` rather than the default `src/lib.rs`.
[lib]
path = "src/rustdoc.rs"
# Every dependency takes its version/path from the workspace manifest.
[dependencies]
anyhow.workspace = true
async-trait.workspace = true
collections.workspace = true
fs.workspace = true
futures.workspace = true
html_to_markdown.workspace = true
http.workspace = true
# No dev-dependencies are declared yet.
[dev-dependencies]

1
crates/rustdoc/LICENSE-GPL Symbolic link
View File

@ -0,0 +1 @@
../../LICENSE-GPL

View File

@ -0,0 +1,211 @@
use std::path::PathBuf;
use std::sync::Arc;
use anyhow::{bail, Context, Result};
use async_trait::async_trait;
use collections::{HashSet, VecDeque};
use fs::Fs;
use futures::AsyncReadExt;
use html_to_markdown::convert_rustdoc_to_markdown;
use html_to_markdown::structure::rustdoc::{RustdocItem, RustdocItemKind};
use http::{AsyncBody, HttpClient, HttpClientWithUrl};
/// Identifies where a set of rustdoc pages was sourced from.
#[derive(Debug, Clone, Copy)]
pub enum RustdocSource {
/// The docs were sourced from local `cargo doc` output.
Local,
/// The docs were sourced from `docs.rs`.
DocsDotRs,
}
/// A source of rustdoc HTML pages for a crate.
///
/// [`RustdocCrawler`] is generic over this trait (via a boxed trait object), so the
/// same crawl logic can run against different backends.
#[async_trait]
pub trait RustdocProvider {
/// Fetches the HTML of one rustdoc page belonging to `crate_name`.
///
/// When `item` is `None` the crate's index page is requested; otherwise the
/// page for `item` is requested.
///
/// `Ok(None)` means no page was available; the crawler treats that as
/// "skip this item" rather than as a failure. `Err` aborts the crawl.
async fn fetch_page(
&self,
crate_name: &str,
item: Option<&RustdocItem>,
) -> Result<Option<String>>;
}
/// A [`RustdocProvider`] that reads pages from a workspace's `target/doc` directory.
pub struct LocalProvider {
// Filesystem abstraction used to load the HTML files.
fs: Arc<dyn Fs>,
// Root of the Cargo workspace; `target/doc` is resolved relative to it.
cargo_workspace_root: PathBuf,
}
impl LocalProvider {
/// Creates a provider that loads docs from `<cargo_workspace_root>/target/doc`
/// through the given filesystem.
pub fn new(fs: Arc<dyn Fs>, cargo_workspace_root: PathBuf) -> Self {
Self {
fs,
cargo_workspace_root,
}
}
}
#[async_trait]
impl RustdocProvider for LocalProvider {
    /// Loads a rustdoc HTML page from the local `cargo doc` output.
    ///
    /// Pages are resolved under `<cargo_workspace_root>/target/doc/<crate_name>/`:
    ///
    /// - `item == None` loads the crate's `index.html`.
    /// - `item == Some(..)` loads the file named by [`RustdocItem::url_path`]
    ///   (for example `a/b/index.html` for a module or `a/struct.Foo.html` for a
    ///   struct), which is the same relative path the docs.rs provider requests.
    ///
    /// Returns `Ok(None)` when the file cannot be loaded.
    async fn fetch_page(
        &self,
        crate_name: &str,
        item: Option<&RustdocItem>,
    ) -> Result<Option<String>> {
        let mut local_cargo_doc_path = self.cargo_workspace_root.join("target/doc");
        local_cargo_doc_path.push(crate_name);
        match item {
            // The item's own page, not the index of the module that contains it.
            Some(item) => local_cargo_doc_path.push(item.url_path()),
            None => local_cargo_doc_path.push("index.html"),
        }

        // Deliberately best-effort: any load failure is reported as "no page"
        // so that the crawler can skip the item and carry on.
        let Ok(contents) = self.fs.load(&local_cargo_doc_path).await else {
            return Ok(None);
        };

        Ok(Some(contents))
    }
}
/// A [`RustdocProvider`] that downloads pages from `https://docs.rs`.
pub struct DocsDotRsProvider {
// HTTP client used to issue the GET requests.
http_client: Arc<HttpClientWithUrl>,
}
impl DocsDotRsProvider {
/// Creates a provider that fetches pages using `http_client`.
pub fn new(http_client: Arc<HttpClientWithUrl>) -> Self {
Self { http_client }
}
}
#[async_trait]
impl RustdocProvider for DocsDotRsProvider {
async fn fetch_page(
&self,
crate_name: &str,
item: Option<&RustdocItem>,
) -> Result<Option<String>> {
let version = "latest";
let path = format!(
"{crate_name}/{version}/{crate_name}{item_path}",
item_path = item
.map(|item| format!("/{}", item.url_path()))
.unwrap_or_default()
);
println!("Fetching {}", &format!("https://docs.rs/{path}"));
let mut response = self
.http_client
.get(
&format!("https://docs.rs/{path}"),
AsyncBody::default(),
true,
)
.await?;
let mut body = Vec::new();
response
.body_mut()
.read_to_end(&mut body)
.await
.context("error reading docs.rs response body")?;
if response.status().is_client_error() {
let text = String::from_utf8_lossy(body.as_slice());
bail!(
"status error {}, response: {text:?}",
response.status().as_u16()
);
}
Ok(Some(String::from_utf8(body)?))
}
}
/// A [`RustdocItem`] queued for crawling, together with debugging breadcrumbs.
pub struct RustdocItemWithHistory {
/// The item whose page should be fetched.
pub item: RustdocItem,
/// URL paths recorded as the crawler moved from page to page towards this item.
///
/// Only present in debug builds; it is included in the error context when
/// fetching the item fails.
#[cfg(debug_assertions)]
pub history: Vec<String>,
}
/// Walks the rustdoc pages of a crate, starting from the crate index and
/// following the items discovered on each page.
pub struct RustdocCrawler {
// Backend that supplies the HTML for each page.
provider: Box<dyn RustdocProvider + Send + Sync + 'static>,
}
impl RustdocCrawler {
    /// Creates a crawler that fetches pages through `provider`.
    pub fn new(provider: Box<dyn RustdocProvider + Send + Sync + 'static>) -> Self {
        Self { provider }
    }

    /// Crawls the rustdoc pages of `crate_name` breadth-first.
    ///
    /// The crate index page is fetched first; every item found on it is queued,
    /// and each visited page contributes the items it links to. Each distinct
    /// item is queued (and therefore fetched) at most once.
    ///
    /// Returns `Ok(None)` when the provider has no index page for the crate.
    /// Otherwise returns `Ok(Some(..))`; the string is currently always empty
    /// because the converted Markdown is not collected yet.
    ///
    /// # Errors
    ///
    /// Propagates any error from the provider or from converting a page.
    pub async fn crawl(&self, crate_name: String) -> Result<Option<String>> {
        let Some(crate_index_content) = self.provider.fetch_page(&crate_name, None).await? else {
            return Ok(None);
        };

        let (_markdown, items) = convert_rustdoc_to_markdown(crate_index_content.as_bytes())?;

        // Items are recorded as seen when they are queued, not when they are
        // visited, so an item linked from several pages is only fetched once.
        let mut seen_items = HashSet::default();
        let mut items_to_visit: VecDeque<RustdocItemWithHistory> = VecDeque::new();
        for item in items {
            if seen_items.insert(item.clone()) {
                items_to_visit.push_back(RustdocItemWithHistory {
                    item,
                    #[cfg(debug_assertions)]
                    history: Vec::new(),
                });
            }
        }

        while let Some(item_with_history) = items_to_visit.pop_front() {
            let item = &item_with_history.item;
            println!("Visiting {:?} {:?} {}", &item.kind, &item.path, &item.name);

            let Some(result) = self
                .provider
                .fetch_page(&crate_name, Some(item))
                .await
                .with_context(|| {
                    #[cfg(debug_assertions)]
                    {
                        format!(
                            "failed to fetch {item:?}: {history:?}",
                            history = item_with_history.history
                        )
                    }

                    #[cfg(not(debug_assertions))]
                    {
                        format!("failed to fetch {item:?}")
                    }
                })?
            else {
                continue;
            };

            let (_markdown, children) = convert_rustdoc_to_markdown(result.as_bytes())?;

            // Links on a page are relative to the directory that page lives in:
            // the item's own path, plus the item's name when the page is a
            // module index (`<path>/<name>/index.html`).
            let mut parent_path = item.path.clone();
            if matches!(item.kind, RustdocItemKind::Mod) {
                parent_path.push(item.name.clone());
            }

            for mut child in children {
                // Prefix the href-relative path with the parent's location so
                // that the child's path is relative to the crate root.
                let mut full_path = parent_path.clone();
                full_path.append(&mut child.path);
                child.path = full_path;

                if !seen_items.insert(child.clone()) {
                    continue;
                }

                items_to_visit.push_back(RustdocItemWithHistory {
                    #[cfg(debug_assertions)]
                    history: {
                        let mut history = item_with_history.history.clone();
                        history.push(child.url_path());
                        history
                    },
                    item: child,
                });
            }
        }

        Ok(Some(String::new()))
    }
}

View File

@ -0,0 +1 @@
/// Crawler for rustdoc output.
pub mod crawler;