Start on rustdoc crawler (#12942)
This PR adds a first pass at a rustdoc crawler. We'll be using this to get information about a crate from the rustdoc artifacts for use in the Assistant.

Release Notes:

- N/A

Co-authored-by: Richard <richard@zed.dev>
parent 72dac24acf
commit c3df9b79c6
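As a quick orientation (this sketch is not part of the commit), the crawler introduced below could be driven against docs.rs roughly like this, assuming an `http_client: Arc<HttpClientWithUrl>` is already available from the surrounding application:

    use std::sync::Arc;

    use rustdoc::crawler::{DocsDotRsProvider, RustdocCrawler};

    // Hypothetical driver; the crate name "serde" and the http_client handle are
    // illustrative assumptions, not values taken from this commit.
    async fn crawl_docs_rs(http_client: Arc<http::HttpClientWithUrl>) -> anyhow::Result<()> {
        let crawler = RustdocCrawler::new(Box::new(DocsDotRsProvider::new(http_client)));
        // `crawl` fetches the crate's index page, then walks the items it links to.
        let _output = crawler.crawl("serde".to_string()).await?;
        Ok(())
    }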
Cargo.lock (generated): 13 lines changed

@@ -8664,6 +8664,19 @@ dependencies = [
  "semver",
 ]

+[[package]]
+name = "rustdoc"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+ "async-trait",
+ "collections",
+ "fs",
+ "futures 0.3.28",
+ "html_to_markdown",
+ "http 0.1.0",
+]
+
 [[package]]
 name = "rustix"
 version = "0.37.23"
Cargo.toml

@@ -79,6 +79,7 @@ members = [
     "crates/rich_text",
     "crates/rope",
     "crates/rpc",
+    "crates/rustdoc",
     "crates/task",
     "crates/tasks_ui",
     "crates/search",

@@ -227,6 +228,7 @@ dev_server_projects = { path = "crates/dev_server_projects" }
 rich_text = { path = "crates/rich_text" }
 rope = { path = "crates/rope" }
 rpc = { path = "crates/rpc" }
+rustdoc = { path = "crates/rustdoc" }
 task = { path = "crates/task" }
 tasks_ui = { path = "crates/tasks_ui" }
 search = { path = "crates/search" }
@@ -58,7 +58,7 @@ pub fn convert_rustdoc_to_markdown(html: impl Read) -> Result<(String, Vec<Rustd
     let items = item_collector
         .borrow()
         .items
-        .values()
+        .iter()
         .cloned()
         .collect::<Vec<_>>();

@@ -1,4 +1,6 @@
-use indexmap::IndexMap;
+use std::sync::Arc;
+
+use indexmap::IndexSet;
 use strum::{EnumIter, IntoEnumIterator};

 use crate::html_element::HtmlElement;
@@ -238,17 +240,25 @@ impl RustdocItemKind {
     }
 }

-#[derive(Debug, Clone)]
+#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Clone)]
 pub struct RustdocItem {
     pub kind: RustdocItemKind,
-    pub name: String,
+    /// The item path, up until the name of the item.
+    pub path: Vec<Arc<str>>,
+    /// The name of the item.
+    pub name: Arc<str>,
 }

 impl RustdocItem {
     pub fn url_path(&self) -> String {
         let name = &self.name;
+        let mut path_components = self.path.clone();
+
         match self.kind {
-            RustdocItemKind::Mod => format!("{name}/index.html"),
+            RustdocItemKind::Mod => {
+                path_components.push(name.clone());
+                path_components.push("index.html".into());
+            }
             RustdocItemKind::Macro
             | RustdocItemKind::Struct
             | RustdocItemKind::Enum
@@ -258,20 +268,23 @@ impl RustdocItem {
             | RustdocItemKind::TypeAlias
             | RustdocItemKind::AttributeMacro
             | RustdocItemKind::DeriveMacro => {
-                format!("{kind}.{name}.html", kind = self.kind.class())
+                path_components
+                    .push(format!("{kind}.{name}.html", kind = self.kind.class()).into());
             }
         }
+
+        path_components.join("/")
     }
 }

 pub struct RustdocItemCollector {
-    pub items: IndexMap<(RustdocItemKind, String), RustdocItem>,
+    pub items: IndexSet<RustdocItem>,
 }

 impl RustdocItemCollector {
     pub fn new() -> Self {
         Self {
-            items: IndexMap::new(),
+            items: IndexSet::new(),
         }
     }

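As a worked illustration of the new url_path behavior (added commentary, not part of the diff, and assuming RustdocItemKind::Struct's class() returns "struct"):

    // Hypothetical items and the URL paths they would now produce:
    // RustdocItem { kind: Mod, path: [], name: "collections" }
    //     => "collections/index.html"
    // RustdocItem { kind: Struct, path: ["collections"], name: "HashMap" }
    //     => "collections/struct.HashMap.html"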
@@ -281,21 +294,30 @@ impl RustdocItemCollector {
        }

        let href = tag.attr("href")?;
-        if href == "#" {
+        if href.starts_with('#') || href.starts_with("https://") || href.starts_with("../") {
            return None;
        }

        for kind in RustdocItemKind::iter() {
            if tag.has_class(kind.class()) {
-                let name = href
-                    .trim_start_matches(&format!("{}.", kind.class()))
-                    .trim_end_matches("/index.html")
-                    .trim_end_matches(".html");
+                let mut parts = href.trim_end_matches("/index.html").split('/');

-                return Some(RustdocItem {
-                    kind,
-                    name: name.to_owned(),
-                });
+                if let Some(last_component) = parts.next_back() {
+                    let last_component = match last_component.split_once('#') {
+                        Some((component, _fragment)) => component,
+                        None => last_component,
+                    };
+
+                    let name = last_component
+                        .trim_start_matches(&format!("{}.", kind.class()))
+                        .trim_end_matches(".html");
+
+                    return Some(RustdocItem {
+                        kind,
+                        name: name.into(),
+                        path: parts.map(Into::into).collect(),
+                    });
+                }
            }
        }

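To make the new href parsing concrete (an illustrative walkthrough, not from the diff): the link is split on '/' so the leading components become the item's path, while the final component, minus any '#' fragment, the kind prefix, and the '.html' suffix, becomes its name:

    // Hypothetical href = "submodule/struct.Foo.html#fields", with the tag carrying
    // the "struct" class:
    //   parts          = ["submodule", "struct.Foo.html#fields"]
    //   last_component = "struct.Foo.html"   (fragment dropped by split_once('#'))
    //   name           = "Foo"               ("struct." and ".html" trimmed away)
    //   path           = ["submodule"]       (the remaining leading components)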
@@ -317,7 +339,7 @@ impl HandleTag for RustdocItemCollector {
             "a" => {
                 let is_reexport = writer.current_element_stack().iter().any(|element| {
                     if let Some(id) = element.attr("id") {
-                        id.starts_with("reexport.")
+                        id.starts_with("reexport.") || id.starts_with("method.")
                     } else {
                         false
                     }
@@ -325,7 +347,7 @@ impl HandleTag for RustdocItemCollector {

                 if !is_reexport {
                     if let Some(item) = Self::parse_item(tag) {
-                        self.items.insert((item.kind, item.name.clone()), item);
+                        self.items.insert(item);
                     }
                 }
             }
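One consequence of the collection change above (added commentary, not in the diff): RustdocItem now derives Eq and Hash over all of its fields, so the IndexSet keeps items that share a kind and name but live under different paths as separate entries, where the old IndexMap keyed on (kind, name) would have overwritten one with the other:

    // Hypothetical: both entries survive in the IndexSet, whereas previously the
    // second insert would have replaced the first under the key (Struct, "Error").
    // RustdocItem { kind: Struct, path: ["fmt"], name: "Error" }
    // RustdocItem { kind: Struct, path: ["io"],  name: "Error" }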
crates/rustdoc/Cargo.toml (new file): 23 lines

@@ -0,0 +1,23 @@
[package]
name = "rustdoc"
version = "0.1.0"
edition = "2021"
publish = false
license = "GPL-3.0-or-later"

[lints]
workspace = true

[lib]
path = "src/rustdoc.rs"

[dependencies]
anyhow.workspace = true
async-trait.workspace = true
collections.workspace = true
fs.workspace = true
futures.workspace = true
html_to_markdown.workspace = true
http.workspace = true

[dev-dependencies]
crates/rustdoc/LICENSE-GPL (new symbolic link): 1 line

@@ -0,0 +1 @@
../../LICENSE-GPL
crates/rustdoc/src/crawler.rs (new file): 211 lines

@@ -0,0 +1,211 @@
use std::path::PathBuf;
use std::sync::Arc;

use anyhow::{bail, Context, Result};
use async_trait::async_trait;
use collections::{HashSet, VecDeque};
use fs::Fs;
use futures::AsyncReadExt;
use html_to_markdown::convert_rustdoc_to_markdown;
use html_to_markdown::structure::rustdoc::{RustdocItem, RustdocItemKind};
use http::{AsyncBody, HttpClient, HttpClientWithUrl};

#[derive(Debug, Clone, Copy)]
pub enum RustdocSource {
    /// The docs were sourced from local `cargo doc` output.
    Local,
    /// The docs were sourced from `docs.rs`.
    DocsDotRs,
}

#[async_trait]
pub trait RustdocProvider {
    async fn fetch_page(
        &self,
        crate_name: &str,
        item: Option<&RustdocItem>,
    ) -> Result<Option<String>>;
}

pub struct LocalProvider {
    fs: Arc<dyn Fs>,
    cargo_workspace_root: PathBuf,
}

impl LocalProvider {
    pub fn new(fs: Arc<dyn Fs>, cargo_workspace_root: PathBuf) -> Self {
        Self {
            fs,
            cargo_workspace_root,
        }
    }
}

#[async_trait]
impl RustdocProvider for LocalProvider {
    async fn fetch_page(
        &self,
        crate_name: &str,
        item: Option<&RustdocItem>,
    ) -> Result<Option<String>> {
        let mut local_cargo_doc_path = self.cargo_workspace_root.join("target/doc");
        local_cargo_doc_path.push(&crate_name);
        if let Some(item) = item {
            if !item.path.is_empty() {
                local_cargo_doc_path.push(item.path.join("/"));
            }
        }
        local_cargo_doc_path.push("index.html");

        let Ok(contents) = self.fs.load(&local_cargo_doc_path).await else {
            return Ok(None);
        };

        Ok(Some(contents))
    }
}

pub struct DocsDotRsProvider {
    http_client: Arc<HttpClientWithUrl>,
}

impl DocsDotRsProvider {
    pub fn new(http_client: Arc<HttpClientWithUrl>) -> Self {
        Self { http_client }
    }
}

#[async_trait]
impl RustdocProvider for DocsDotRsProvider {
    async fn fetch_page(
        &self,
        crate_name: &str,
        item: Option<&RustdocItem>,
    ) -> Result<Option<String>> {
        let version = "latest";
        let path = format!(
            "{crate_name}/{version}/{crate_name}{item_path}",
            item_path = item
                .map(|item| format!("/{}", item.url_path()))
                .unwrap_or_default()
        );

        println!("Fetching {}", &format!("https://docs.rs/{path}"));

        let mut response = self
            .http_client
            .get(
                &format!("https://docs.rs/{path}"),
                AsyncBody::default(),
                true,
            )
            .await?;

        let mut body = Vec::new();
        response
            .body_mut()
            .read_to_end(&mut body)
            .await
            .context("error reading docs.rs response body")?;

        if response.status().is_client_error() {
            let text = String::from_utf8_lossy(body.as_slice());
            bail!(
                "status error {}, response: {text:?}",
                response.status().as_u16()
            );
        }

        Ok(Some(String::from_utf8(body)?))
    }
}

pub struct RustdocItemWithHistory {
    pub item: RustdocItem,
    #[cfg(debug_assertions)]
    pub history: Vec<String>,
}

pub struct RustdocCrawler {
    provider: Box<dyn RustdocProvider + Send + Sync + 'static>,
}

impl RustdocCrawler {
    pub fn new(provider: Box<dyn RustdocProvider + Send + Sync + 'static>) -> Self {
        Self { provider }
    }

    pub async fn crawl(&self, crate_name: String) -> Result<Option<String>> {
        let Some(crate_index_content) = self.provider.fetch_page(&crate_name, None).await? else {
            return Ok(None);
        };

        let (_markdown, items) = convert_rustdoc_to_markdown(crate_index_content.as_bytes())?;

        let mut seen_items = HashSet::default();
        let mut items_to_visit: VecDeque<RustdocItemWithHistory> =
            VecDeque::from_iter(items.into_iter().map(|item| RustdocItemWithHistory {
                item,
                #[cfg(debug_assertions)]
                history: Vec::new(),
            }));

        while let Some(item_with_history) = items_to_visit.pop_front() {
            let item = &item_with_history.item;
            println!("Visiting {:?} {:?} {}", &item.kind, &item.path, &item.name);

            let Some(result) = self
                .provider
                .fetch_page(&crate_name, Some(&item))
                .await
                .with_context(|| {
                    #[cfg(debug_assertions)]
                    {
                        format!(
                            "failed to fetch {item:?}: {history:?}",
                            history = item_with_history.history
                        )
                    }

                    #[cfg(not(debug_assertions))]
                    {
                        format!("failed to fetch {item:?}")
                    }
                })?
            else {
                continue;
            };

            let (_markdown, mut items) = convert_rustdoc_to_markdown(result.as_bytes())?;

            seen_items.insert(item.clone());

            for child in &mut items {
                child.path.extend(item.path.clone());
                match item.kind {
                    RustdocItemKind::Mod => {
                        child.path.push(item.name.clone());
                    }
                    _ => {}
                }
            }

            let unseen_items = items
                .into_iter()
                .map(|item| RustdocItemWithHistory {
                    #[cfg(debug_assertions)]
                    history: {
                        let mut history = item_with_history.history.clone();
                        history.push(item.url_path());
                        history
                    },
                    item,
                })
                .filter(|item| !seen_items.contains(&item.item));

            items_to_visit.extend(unseen_items);
        }

        Ok(Some(String::new()))
    }
}
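RustdocSource is declared in this file but not yet consumed by it; one plausible way a caller could map it onto the two providers is sketched below (purely illustrative, with the fs, workspace_root, and http_client values assumed to come from the surrounding application, and assuming both providers are Send + Sync):

    use std::path::PathBuf;
    use std::sync::Arc;

    use rustdoc::crawler::{
        DocsDotRsProvider, LocalProvider, RustdocCrawler, RustdocProvider, RustdocSource,
    };

    // Hypothetical helper, not part of the commit.
    fn crawler_for(
        source: RustdocSource,
        fs: Arc<dyn fs::Fs>,
        workspace_root: PathBuf,
        http_client: Arc<http::HttpClientWithUrl>,
    ) -> RustdocCrawler {
        let provider: Box<dyn RustdocProvider + Send + Sync + 'static> = match source {
            // Read docs from the local `cargo doc` output in target/doc.
            RustdocSource::Local => Box::new(LocalProvider::new(fs, workspace_root)),
            // Fetch docs over HTTP from docs.rs.
            RustdocSource::DocsDotRs => Box::new(DocsDotRsProvider::new(http_client)),
        };
        RustdocCrawler::new(provider)
    }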
crates/rustdoc/src/rustdoc.rs (new file): 1 line

@@ -0,0 +1 @@
pub mod crawler;