mirror of
https://github.com/zed-industries/zed.git
synced 2024-11-07 20:39:04 +03:00
Start on rustdoc crawler (#12942)
This PR adds a first pass at a rustdoc crawler. We'll be using this to get information about a crate from the rustdoc artifacts for use in the Assistant. Release Notes: - N/A --------- Co-authored-by: Richard <richard@zed.dev>
This commit is contained in:
parent
72dac24acf
commit
c3df9b79c6
13
Cargo.lock
generated
13
Cargo.lock
generated
@ -8664,6 +8664,19 @@ dependencies = [
|
|||||||
"semver",
|
"semver",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "rustdoc"
|
||||||
|
version = "0.1.0"
|
||||||
|
dependencies = [
|
||||||
|
"anyhow",
|
||||||
|
"async-trait",
|
||||||
|
"collections",
|
||||||
|
"fs",
|
||||||
|
"futures 0.3.28",
|
||||||
|
"html_to_markdown",
|
||||||
|
"http 0.1.0",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "rustix"
|
name = "rustix"
|
||||||
version = "0.37.23"
|
version = "0.37.23"
|
||||||
|
@ -79,6 +79,7 @@ members = [
|
|||||||
"crates/rich_text",
|
"crates/rich_text",
|
||||||
"crates/rope",
|
"crates/rope",
|
||||||
"crates/rpc",
|
"crates/rpc",
|
||||||
|
"crates/rustdoc",
|
||||||
"crates/task",
|
"crates/task",
|
||||||
"crates/tasks_ui",
|
"crates/tasks_ui",
|
||||||
"crates/search",
|
"crates/search",
|
||||||
@ -227,6 +228,7 @@ dev_server_projects = { path = "crates/dev_server_projects" }
|
|||||||
rich_text = { path = "crates/rich_text" }
|
rich_text = { path = "crates/rich_text" }
|
||||||
rope = { path = "crates/rope" }
|
rope = { path = "crates/rope" }
|
||||||
rpc = { path = "crates/rpc" }
|
rpc = { path = "crates/rpc" }
|
||||||
|
rustdoc = { path = "crates/rustdoc" }
|
||||||
task = { path = "crates/task" }
|
task = { path = "crates/task" }
|
||||||
tasks_ui = { path = "crates/tasks_ui" }
|
tasks_ui = { path = "crates/tasks_ui" }
|
||||||
search = { path = "crates/search" }
|
search = { path = "crates/search" }
|
||||||
|
@ -58,7 +58,7 @@ pub fn convert_rustdoc_to_markdown(html: impl Read) -> Result<(String, Vec<Rustd
|
|||||||
let items = item_collector
|
let items = item_collector
|
||||||
.borrow()
|
.borrow()
|
||||||
.items
|
.items
|
||||||
.values()
|
.iter()
|
||||||
.cloned()
|
.cloned()
|
||||||
.collect::<Vec<_>>();
|
.collect::<Vec<_>>();
|
||||||
|
|
||||||
|
@ -1,4 +1,6 @@
|
|||||||
use indexmap::IndexMap;
|
use std::sync::Arc;
|
||||||
|
|
||||||
|
use indexmap::IndexSet;
|
||||||
use strum::{EnumIter, IntoEnumIterator};
|
use strum::{EnumIter, IntoEnumIterator};
|
||||||
|
|
||||||
use crate::html_element::HtmlElement;
|
use crate::html_element::HtmlElement;
|
||||||
@ -238,17 +240,25 @@ impl RustdocItemKind {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone)]
|
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Clone)]
|
||||||
pub struct RustdocItem {
|
pub struct RustdocItem {
|
||||||
pub kind: RustdocItemKind,
|
pub kind: RustdocItemKind,
|
||||||
pub name: String,
|
/// The item path, up until the name of the item.
|
||||||
|
pub path: Vec<Arc<str>>,
|
||||||
|
/// The name of the item.
|
||||||
|
pub name: Arc<str>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl RustdocItem {
|
impl RustdocItem {
|
||||||
pub fn url_path(&self) -> String {
|
pub fn url_path(&self) -> String {
|
||||||
let name = &self.name;
|
let name = &self.name;
|
||||||
|
let mut path_components = self.path.clone();
|
||||||
|
|
||||||
match self.kind {
|
match self.kind {
|
||||||
RustdocItemKind::Mod => format!("{name}/index.html"),
|
RustdocItemKind::Mod => {
|
||||||
|
path_components.push(name.clone());
|
||||||
|
path_components.push("index.html".into());
|
||||||
|
}
|
||||||
RustdocItemKind::Macro
|
RustdocItemKind::Macro
|
||||||
| RustdocItemKind::Struct
|
| RustdocItemKind::Struct
|
||||||
| RustdocItemKind::Enum
|
| RustdocItemKind::Enum
|
||||||
@ -258,20 +268,23 @@ impl RustdocItem {
|
|||||||
| RustdocItemKind::TypeAlias
|
| RustdocItemKind::TypeAlias
|
||||||
| RustdocItemKind::AttributeMacro
|
| RustdocItemKind::AttributeMacro
|
||||||
| RustdocItemKind::DeriveMacro => {
|
| RustdocItemKind::DeriveMacro => {
|
||||||
format!("{kind}.{name}.html", kind = self.kind.class())
|
path_components
|
||||||
|
.push(format!("{kind}.{name}.html", kind = self.kind.class()).into());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
path_components.join("/")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct RustdocItemCollector {
|
pub struct RustdocItemCollector {
|
||||||
pub items: IndexMap<(RustdocItemKind, String), RustdocItem>,
|
pub items: IndexSet<RustdocItem>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl RustdocItemCollector {
|
impl RustdocItemCollector {
|
||||||
pub fn new() -> Self {
|
pub fn new() -> Self {
|
||||||
Self {
|
Self {
|
||||||
items: IndexMap::new(),
|
items: IndexSet::new(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -281,23 +294,32 @@ impl RustdocItemCollector {
|
|||||||
}
|
}
|
||||||
|
|
||||||
let href = tag.attr("href")?;
|
let href = tag.attr("href")?;
|
||||||
if href == "#" {
|
if href.starts_with('#') || href.starts_with("https://") || href.starts_with("../") {
|
||||||
return None;
|
return None;
|
||||||
}
|
}
|
||||||
|
|
||||||
for kind in RustdocItemKind::iter() {
|
for kind in RustdocItemKind::iter() {
|
||||||
if tag.has_class(kind.class()) {
|
if tag.has_class(kind.class()) {
|
||||||
let name = href
|
let mut parts = href.trim_end_matches("/index.html").split('/');
|
||||||
|
|
||||||
|
if let Some(last_component) = parts.next_back() {
|
||||||
|
let last_component = match last_component.split_once('#') {
|
||||||
|
Some((component, _fragment)) => component,
|
||||||
|
None => last_component,
|
||||||
|
};
|
||||||
|
|
||||||
|
let name = last_component
|
||||||
.trim_start_matches(&format!("{}.", kind.class()))
|
.trim_start_matches(&format!("{}.", kind.class()))
|
||||||
.trim_end_matches("/index.html")
|
|
||||||
.trim_end_matches(".html");
|
.trim_end_matches(".html");
|
||||||
|
|
||||||
return Some(RustdocItem {
|
return Some(RustdocItem {
|
||||||
kind,
|
kind,
|
||||||
name: name.to_owned(),
|
name: name.into(),
|
||||||
|
path: parts.map(Into::into).collect(),
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
None
|
None
|
||||||
}
|
}
|
||||||
@ -317,7 +339,7 @@ impl HandleTag for RustdocItemCollector {
|
|||||||
"a" => {
|
"a" => {
|
||||||
let is_reexport = writer.current_element_stack().iter().any(|element| {
|
let is_reexport = writer.current_element_stack().iter().any(|element| {
|
||||||
if let Some(id) = element.attr("id") {
|
if let Some(id) = element.attr("id") {
|
||||||
id.starts_with("reexport.")
|
id.starts_with("reexport.") || id.starts_with("method.")
|
||||||
} else {
|
} else {
|
||||||
false
|
false
|
||||||
}
|
}
|
||||||
@ -325,7 +347,7 @@ impl HandleTag for RustdocItemCollector {
|
|||||||
|
|
||||||
if !is_reexport {
|
if !is_reexport {
|
||||||
if let Some(item) = Self::parse_item(tag) {
|
if let Some(item) = Self::parse_item(tag) {
|
||||||
self.items.insert((item.kind, item.name.clone()), item);
|
self.items.insert(item);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
23
crates/rustdoc/Cargo.toml
Normal file
23
crates/rustdoc/Cargo.toml
Normal file
@ -0,0 +1,23 @@
|
|||||||
|
[package]
|
||||||
|
name = "rustdoc"
|
||||||
|
version = "0.1.0"
|
||||||
|
edition = "2021"
|
||||||
|
publish = false
|
||||||
|
license = "GPL-3.0-or-later"
|
||||||
|
|
||||||
|
[lints]
|
||||||
|
workspace = true
|
||||||
|
|
||||||
|
[lib]
|
||||||
|
path = "src/rustdoc.rs"
|
||||||
|
|
||||||
|
[dependencies]
|
||||||
|
anyhow.workspace = true
|
||||||
|
async-trait.workspace = true
|
||||||
|
collections.workspace = true
|
||||||
|
fs.workspace = true
|
||||||
|
futures.workspace = true
|
||||||
|
html_to_markdown.workspace = true
|
||||||
|
http.workspace = true
|
||||||
|
|
||||||
|
[dev-dependencies]
|
1
crates/rustdoc/LICENSE-GPL
Symbolic link
1
crates/rustdoc/LICENSE-GPL
Symbolic link
@ -0,0 +1 @@
|
|||||||
|
../../LICENSE-GPL
|
211
crates/rustdoc/src/crawler.rs
Normal file
211
crates/rustdoc/src/crawler.rs
Normal file
@ -0,0 +1,211 @@
|
|||||||
|
use std::path::PathBuf;
|
||||||
|
use std::sync::Arc;
|
||||||
|
|
||||||
|
use anyhow::{bail, Context, Result};
|
||||||
|
use async_trait::async_trait;
|
||||||
|
use collections::{HashSet, VecDeque};
|
||||||
|
use fs::Fs;
|
||||||
|
use futures::AsyncReadExt;
|
||||||
|
use html_to_markdown::convert_rustdoc_to_markdown;
|
||||||
|
use html_to_markdown::structure::rustdoc::{RustdocItem, RustdocItemKind};
|
||||||
|
use http::{AsyncBody, HttpClient, HttpClientWithUrl};
|
||||||
|
|
||||||
|
/// Where a crate's rustdoc output was obtained from.
///
/// `PartialEq`/`Eq` are derived so that callers can compare sources directly
/// instead of pattern-matching on every check.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum RustdocSource {
    /// The docs were sourced from local `cargo doc` output.
    Local,
    /// The docs were sourced from `docs.rs`.
    DocsDotRs,
}
|
||||||
|
|
||||||
|
#[async_trait]
|
||||||
|
pub trait RustdocProvider {
|
||||||
|
async fn fetch_page(
|
||||||
|
&self,
|
||||||
|
crate_name: &str,
|
||||||
|
item: Option<&RustdocItem>,
|
||||||
|
) -> Result<Option<String>>;
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct LocalProvider {
|
||||||
|
fs: Arc<dyn Fs>,
|
||||||
|
cargo_workspace_root: PathBuf,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl LocalProvider {
|
||||||
|
pub fn new(fs: Arc<dyn Fs>, cargo_workspace_root: PathBuf) -> Self {
|
||||||
|
Self {
|
||||||
|
fs,
|
||||||
|
cargo_workspace_root,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[async_trait]
|
||||||
|
impl RustdocProvider for LocalProvider {
|
||||||
|
async fn fetch_page(
|
||||||
|
&self,
|
||||||
|
crate_name: &str,
|
||||||
|
item: Option<&RustdocItem>,
|
||||||
|
) -> Result<Option<String>> {
|
||||||
|
let mut local_cargo_doc_path = self.cargo_workspace_root.join("target/doc");
|
||||||
|
local_cargo_doc_path.push(&crate_name);
|
||||||
|
if let Some(item) = item {
|
||||||
|
if !item.path.is_empty() {
|
||||||
|
local_cargo_doc_path.push(item.path.join("/"));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
local_cargo_doc_path.push("index.html");
|
||||||
|
|
||||||
|
let Ok(contents) = self.fs.load(&local_cargo_doc_path).await else {
|
||||||
|
return Ok(None);
|
||||||
|
};
|
||||||
|
|
||||||
|
Ok(Some(contents))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct DocsDotRsProvider {
|
||||||
|
http_client: Arc<HttpClientWithUrl>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl DocsDotRsProvider {
|
||||||
|
pub fn new(http_client: Arc<HttpClientWithUrl>) -> Self {
|
||||||
|
Self { http_client }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[async_trait]
|
||||||
|
impl RustdocProvider for DocsDotRsProvider {
|
||||||
|
async fn fetch_page(
|
||||||
|
&self,
|
||||||
|
crate_name: &str,
|
||||||
|
item: Option<&RustdocItem>,
|
||||||
|
) -> Result<Option<String>> {
|
||||||
|
let version = "latest";
|
||||||
|
let path = format!(
|
||||||
|
"{crate_name}/{version}/{crate_name}{item_path}",
|
||||||
|
item_path = item
|
||||||
|
.map(|item| format!("/{}", item.url_path()))
|
||||||
|
.unwrap_or_default()
|
||||||
|
);
|
||||||
|
|
||||||
|
println!("Fetching {}", &format!("https://docs.rs/{path}"));
|
||||||
|
|
||||||
|
let mut response = self
|
||||||
|
.http_client
|
||||||
|
.get(
|
||||||
|
&format!("https://docs.rs/{path}"),
|
||||||
|
AsyncBody::default(),
|
||||||
|
true,
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
let mut body = Vec::new();
|
||||||
|
response
|
||||||
|
.body_mut()
|
||||||
|
.read_to_end(&mut body)
|
||||||
|
.await
|
||||||
|
.context("error reading docs.rs response body")?;
|
||||||
|
|
||||||
|
if response.status().is_client_error() {
|
||||||
|
let text = String::from_utf8_lossy(body.as_slice());
|
||||||
|
bail!(
|
||||||
|
"status error {}, response: {text:?}",
|
||||||
|
response.status().as_u16()
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(Some(String::from_utf8(body)?))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct RustdocItemWithHistory {
|
||||||
|
pub item: RustdocItem,
|
||||||
|
#[cfg(debug_assertions)]
|
||||||
|
pub history: Vec<String>,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct RustdocCrawler {
|
||||||
|
provider: Box<dyn RustdocProvider + Send + Sync + 'static>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl RustdocCrawler {
|
||||||
|
pub fn new(provider: Box<dyn RustdocProvider + Send + Sync + 'static>) -> Self {
|
||||||
|
Self { provider }
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn crawl(&self, crate_name: String) -> Result<Option<String>> {
|
||||||
|
let Some(crate_index_content) = self.provider.fetch_page(&crate_name, None).await? else {
|
||||||
|
return Ok(None);
|
||||||
|
};
|
||||||
|
|
||||||
|
let (_markdown, items) = convert_rustdoc_to_markdown(crate_index_content.as_bytes())?;
|
||||||
|
|
||||||
|
let mut seen_items = HashSet::default();
|
||||||
|
let mut items_to_visit: VecDeque<RustdocItemWithHistory> =
|
||||||
|
VecDeque::from_iter(items.into_iter().map(|item| RustdocItemWithHistory {
|
||||||
|
item,
|
||||||
|
#[cfg(debug_assertions)]
|
||||||
|
history: Vec::new(),
|
||||||
|
}));
|
||||||
|
|
||||||
|
while let Some(item_with_history) = items_to_visit.pop_front() {
|
||||||
|
let item = &item_with_history.item;
|
||||||
|
println!("Visiting {:?} {:?} {}", &item.kind, &item.path, &item.name);
|
||||||
|
|
||||||
|
let Some(result) = self
|
||||||
|
.provider
|
||||||
|
.fetch_page(&crate_name, Some(&item))
|
||||||
|
.await
|
||||||
|
.with_context(|| {
|
||||||
|
#[cfg(debug_assertions)]
|
||||||
|
{
|
||||||
|
format!(
|
||||||
|
"failed to fetch {item:?}: {history:?}",
|
||||||
|
history = item_with_history.history
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(not(debug_assertions))]
|
||||||
|
{
|
||||||
|
format!("failed to fetch {item:?}")
|
||||||
|
}
|
||||||
|
})?
|
||||||
|
else {
|
||||||
|
continue;
|
||||||
|
};
|
||||||
|
|
||||||
|
let (_markdown, mut items) = convert_rustdoc_to_markdown(result.as_bytes())?;
|
||||||
|
|
||||||
|
seen_items.insert(item.clone());
|
||||||
|
|
||||||
|
for child in &mut items {
|
||||||
|
child.path.extend(item.path.clone());
|
||||||
|
match item.kind {
|
||||||
|
RustdocItemKind::Mod => {
|
||||||
|
child.path.push(item.name.clone());
|
||||||
|
}
|
||||||
|
_ => {}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let unseen_items = items
|
||||||
|
.into_iter()
|
||||||
|
.map(|item| RustdocItemWithHistory {
|
||||||
|
#[cfg(debug_assertions)]
|
||||||
|
history: {
|
||||||
|
let mut history = item_with_history.history.clone();
|
||||||
|
history.push(item.url_path());
|
||||||
|
history
|
||||||
|
},
|
||||||
|
item,
|
||||||
|
})
|
||||||
|
.filter(|item| !seen_items.contains(&item.item));
|
||||||
|
|
||||||
|
items_to_visit.extend(unseen_items);
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(Some(String::new()))
|
||||||
|
}
|
||||||
|
}
|
1
crates/rustdoc/src/rustdoc.rs
Normal file
1
crates/rustdoc/src/rustdoc.rs
Normal file
@ -0,0 +1 @@
|
|||||||
|
pub mod crawler;
|
Loading…
Reference in New Issue
Block a user