mirror of
https://github.com/gitbutlerapp/gitbutler.git
synced 2024-12-25 10:33:21 +03:00
make searcher non-project specific
This commit is contained in:
parent
d04df229d6
commit
d85b40f472
@ -1,15 +1,75 @@
|
||||
use crate::{deltas, projects, sessions};
|
||||
use anyhow::Result;
|
||||
use std::{fs, path::Path};
|
||||
use tantivy::{collector, directory::MmapDirectory, schema};
|
||||
use std::{
|
||||
collections::HashMap,
|
||||
fs,
|
||||
path::Path,
|
||||
sync::{Arc, Mutex},
|
||||
vec,
|
||||
};
|
||||
use tantivy::{collector, directory::MmapDirectory, schema, IndexWriter};
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct DeltasIndex {
|
||||
index: tantivy::Index,
|
||||
reader: tantivy::IndexReader,
|
||||
pub struct Deltas {
|
||||
base_path: String,
|
||||
|
||||
indexes: HashMap<String, tantivy::Index>,
|
||||
readers: HashMap<String, tantivy::IndexReader>,
|
||||
writers: HashMap<String, Arc<Mutex<tantivy::IndexWriter>>>,
|
||||
}
|
||||
|
||||
fn schema() -> schema::Schema {
|
||||
impl Deltas {
|
||||
pub fn at<P: AsRef<Path>>(path: P) -> Self {
|
||||
Self {
|
||||
base_path: path.as_ref().to_str().unwrap().to_string(),
|
||||
readers: HashMap::new(),
|
||||
writers: HashMap::new(),
|
||||
indexes: HashMap::new(),
|
||||
}
|
||||
}
|
||||
|
||||
fn init(&mut self, project_id: &str) -> Result<()> {
|
||||
if self.indexes.contains_key(project_id) {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let index = open_or_create(Path::new(&self.base_path), project_id)?;
|
||||
let reader = index.reader()?;
|
||||
let writer = index.writer(WRITE_BUFFER_SIZE)?;
|
||||
self.readers.insert(project_id.to_string(), reader);
|
||||
self.writers
|
||||
.insert(project_id.to_string(), Arc::new(Mutex::new(writer)));
|
||||
self.indexes.insert(project_id.to_string(), index);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn search(&self, project_id: &str, query: &str) -> Result<Vec<SearchResult>> {
|
||||
match self.readers.get(project_id) {
|
||||
None => Ok(vec![]),
|
||||
Some(reader) => {
|
||||
let index = self.indexes.get(project_id).unwrap();
|
||||
search(index, reader, query)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn index(
|
||||
&mut self,
|
||||
repo: &git2::Repository,
|
||||
project: &projects::Project,
|
||||
session: &sessions::Session,
|
||||
) -> Result<()> {
|
||||
self.init(&project.id)?;
|
||||
index(
|
||||
&self.indexes.get(&project.id).unwrap(),
|
||||
&mut self.writers.get(&project.id).unwrap().lock().unwrap(),
|
||||
session,
|
||||
repo,
|
||||
project,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
fn build_schema() -> schema::Schema {
|
||||
let mut schema_builder = schema::Schema::builder();
|
||||
schema_builder.add_text_field(
|
||||
"session_hash",
|
||||
@ -48,151 +108,144 @@ pub struct SearchResult {
|
||||
pub index: u64,
|
||||
}
|
||||
|
||||
impl DeltasIndex {
|
||||
pub fn open_or_create<P: AsRef<Path>>(
|
||||
base_path: P,
|
||||
project: &projects::Project,
|
||||
) -> Result<Self> {
|
||||
let dir = base_path
|
||||
.as_ref()
|
||||
.join("indexes")
|
||||
.join(&project.id)
|
||||
.join("deltas");
|
||||
fs::create_dir_all(&dir)?;
|
||||
fn open_or_create<P: AsRef<Path>>(base_path: P, project_id: &str) -> Result<tantivy::Index> {
|
||||
let dir = base_path
|
||||
.as_ref()
|
||||
.join("indexes")
|
||||
.join(&project_id)
|
||||
.join("deltas");
|
||||
fs::create_dir_all(&dir)?;
|
||||
|
||||
let schema = schema();
|
||||
let mmap_dir = MmapDirectory::open(dir)?;
|
||||
let index = tantivy::Index::open_or_create(mmap_dir, schema)?;
|
||||
Ok(Self {
|
||||
index: index.clone(),
|
||||
reader: index.reader()?,
|
||||
})
|
||||
let mmap_dir = MmapDirectory::open(dir)?;
|
||||
let schema = build_schema();
|
||||
let index = tantivy::Index::open_or_create(mmap_dir, schema)?;
|
||||
Ok(index)
|
||||
}
|
||||
|
||||
fn index(
|
||||
index: &tantivy::Index,
|
||||
writer: &mut IndexWriter,
|
||||
session: &sessions::Session,
|
||||
repo: &git2::Repository,
|
||||
project: &projects::Project,
|
||||
) -> Result<()> {
|
||||
let reference = repo.find_reference(&project.refname())?;
|
||||
let deltas = deltas::list(repo, project, &reference, &session.id)?;
|
||||
println!("Found {} deltas", deltas.len());
|
||||
if deltas.is_empty() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
fn with_writer(&self, f: impl FnOnce(&tantivy::IndexWriter) -> Result<()>) -> Result<()> {
|
||||
let mut writer = self.index.writer(WRITE_BUFFER_SIZE)?;
|
||||
f(&mut writer)?;
|
||||
writer.commit()?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn write(
|
||||
&self,
|
||||
session: &sessions::Session,
|
||||
repo: &git2::Repository,
|
||||
project: &projects::Project,
|
||||
reference: &git2::Reference,
|
||||
) -> Result<()> {
|
||||
let deltas = deltas::list(repo, project, reference, &session.id)?;
|
||||
println!("Found {} deltas", deltas.len());
|
||||
if deltas.is_empty() {
|
||||
return Ok(());
|
||||
}
|
||||
let files = sessions::list_files(
|
||||
repo,
|
||||
project,
|
||||
reference,
|
||||
&session.id,
|
||||
Some(deltas.keys().map(|k| k.as_str()).collect()),
|
||||
)?;
|
||||
match &session.hash {
|
||||
None => Err(anyhow::anyhow!("Session hash is not set, on")),
|
||||
Some(hash) => self.with_writer(|writer| {
|
||||
let field_session_hash = self.index.schema().get_field("session_hash").unwrap();
|
||||
let field_file_path = self.index.schema().get_field("file_path").unwrap();
|
||||
let field_diff = self.index.schema().get_field("diff").unwrap();
|
||||
let field_is_addition = self.index.schema().get_field("is_addition").unwrap();
|
||||
let field_is_deletion = self.index.schema().get_field("is_deletion").unwrap();
|
||||
let field_index = self.index.schema().get_field("index").unwrap();
|
||||
|
||||
// index every file
|
||||
for (file_path, deltas) in deltas.into_iter() {
|
||||
// keep the state of the file after each delta operation
|
||||
// we need it to calculate diff for delete operations
|
||||
let mut file_text: Vec<char> = files
|
||||
.get(&file_path)
|
||||
.map(|f| f.as_str())
|
||||
.unwrap_or("")
|
||||
.chars()
|
||||
.collect();
|
||||
// for every deltas for the file
|
||||
for (i, delta) in deltas.into_iter().enumerate() {
|
||||
// for every operation in the delta
|
||||
for operation in &delta.operations {
|
||||
let mut doc = tantivy::Document::default();
|
||||
doc.add_u64(field_index, i.try_into()?);
|
||||
doc.add_text(field_session_hash, hash);
|
||||
doc.add_text(field_file_path, file_path.as_str());
|
||||
match operation {
|
||||
deltas::Operation::Delete((from, len)) => {
|
||||
// here we use the file_text to calculate the diff
|
||||
let diff = file_text
|
||||
.iter()
|
||||
.skip((*from).try_into()?)
|
||||
.take((*len).try_into()?)
|
||||
.collect::<String>();
|
||||
doc.add_text(field_diff, diff);
|
||||
doc.add_bool(field_is_deletion, true);
|
||||
}
|
||||
deltas::Operation::Insert((_from, value)) => {
|
||||
doc.add_text(field_diff, value);
|
||||
doc.add_bool(field_is_addition, true);
|
||||
}
|
||||
let files = sessions::list_files(
|
||||
repo,
|
||||
project,
|
||||
&reference,
|
||||
&session.id,
|
||||
Some(deltas.keys().map(|k| k.as_str()).collect()),
|
||||
)?;
|
||||
match &session.hash {
|
||||
None => Err(anyhow::anyhow!("Session hash is not set, on")),
|
||||
Some(hash) => {
|
||||
// index every file
|
||||
for (file_path, deltas) in deltas.into_iter() {
|
||||
// keep the state of the file after each delta operation
|
||||
// we need it to calculate diff for delete operations
|
||||
let mut file_text: Vec<char> = files
|
||||
.get(&file_path)
|
||||
.map(|f| f.as_str())
|
||||
.unwrap_or("")
|
||||
.chars()
|
||||
.collect();
|
||||
// for every deltas for the file
|
||||
for (i, delta) in deltas.into_iter().enumerate() {
|
||||
// for every operation in the delta
|
||||
for operation in &delta.operations {
|
||||
let mut doc = tantivy::Document::default();
|
||||
doc.add_u64(index.schema().get_field("index").unwrap(), i.try_into()?);
|
||||
doc.add_text(index.schema().get_field("session_hash").unwrap(), hash);
|
||||
doc.add_text(
|
||||
index.schema().get_field("file_path").unwrap(),
|
||||
file_path.as_str(),
|
||||
);
|
||||
match operation {
|
||||
deltas::Operation::Delete((from, len)) => {
|
||||
// here we use the file_text to calculate the diff
|
||||
let diff = file_text
|
||||
.iter()
|
||||
.skip((*from).try_into()?)
|
||||
.take((*len).try_into()?)
|
||||
.collect::<String>();
|
||||
doc.add_text(index.schema().get_field("diff").unwrap(), diff);
|
||||
doc.add_bool(
|
||||
index.schema().get_field("is_deletion").unwrap(),
|
||||
true,
|
||||
);
|
||||
}
|
||||
deltas::Operation::Insert((_from, value)) => {
|
||||
doc.add_text(index.schema().get_field("diff").unwrap(), value);
|
||||
doc.add_bool(
|
||||
index.schema().get_field("is_addition").unwrap(),
|
||||
true,
|
||||
);
|
||||
}
|
||||
writer.add_document(doc)?;
|
||||
|
||||
// don't forget to apply the operation to the file_text
|
||||
operation.apply(&mut file_text);
|
||||
}
|
||||
writer.add_document(doc)?;
|
||||
|
||||
// don't forget to apply the operation to the file_text
|
||||
operation.apply(&mut file_text);
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}),
|
||||
}
|
||||
writer.commit()?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
pub fn search(&self, q: &str) -> Result<Vec<SearchResult>> {
|
||||
let field_file_path = self.index.schema().get_field("file_path").unwrap();
|
||||
let field_diff = self.index.schema().get_field("diff").unwrap();
|
||||
let field_session_hash = self.index.schema().get_field("session_hash").unwrap();
|
||||
let field_index = self.index.schema().get_field("index").unwrap();
|
||||
|
||||
let query_parser =
|
||||
&tantivy::query::QueryParser::for_index(&self.index, vec![field_file_path, field_diff]);
|
||||
|
||||
let query = query_parser.parse_query(q)?;
|
||||
|
||||
self.reader.reload()?;
|
||||
let searcher = self.reader.searcher();
|
||||
let top_docs = searcher.search(&query, &collector::TopDocs::with_limit(10))?;
|
||||
|
||||
let results = top_docs
|
||||
.iter()
|
||||
.map(|(_score, doc_address)| {
|
||||
let retrieved_doc = searcher.doc(*doc_address)?;
|
||||
let file_path = retrieved_doc
|
||||
.get_first(field_file_path)
|
||||
.unwrap()
|
||||
.as_text()
|
||||
.unwrap();
|
||||
let session_hash = retrieved_doc
|
||||
.get_first(field_session_hash)
|
||||
.unwrap()
|
||||
.as_text()
|
||||
.unwrap();
|
||||
let index = retrieved_doc
|
||||
.get_first(field_index)
|
||||
.unwrap()
|
||||
.as_u64()
|
||||
.unwrap();
|
||||
Ok(SearchResult {
|
||||
file_path: file_path.to_string(),
|
||||
session_hash: session_hash.to_string(),
|
||||
index,
|
||||
})
|
||||
})
|
||||
.collect::<Result<Vec<SearchResult>>>()?;
|
||||
|
||||
Ok(results)
|
||||
}
|
||||
}
|
||||
|
||||
pub fn search(
|
||||
index: &tantivy::Index,
|
||||
reader: &tantivy::IndexReader,
|
||||
q: &str,
|
||||
) -> Result<Vec<SearchResult>> {
|
||||
let query_parser = &tantivy::query::QueryParser::for_index(
|
||||
index,
|
||||
vec![
|
||||
index.schema().get_field("diff").unwrap(),
|
||||
index.schema().get_field("file_path").unwrap(),
|
||||
],
|
||||
);
|
||||
|
||||
let query = query_parser.parse_query(q)?;
|
||||
|
||||
reader.reload()?;
|
||||
let searcher = reader.searcher();
|
||||
let top_docs = searcher.search(&query, &collector::TopDocs::with_limit(10))?;
|
||||
|
||||
let results = top_docs
|
||||
.iter()
|
||||
.map(|(_score, doc_address)| {
|
||||
let retrieved_doc = searcher.doc(*doc_address)?;
|
||||
let file_path = retrieved_doc
|
||||
.get_first(index.schema().get_field("file_path").unwrap())
|
||||
.unwrap()
|
||||
.as_text()
|
||||
.unwrap();
|
||||
let session_hash = retrieved_doc
|
||||
.get_first(index.schema().get_field("session_hash").unwrap())
|
||||
.unwrap()
|
||||
.as_text()
|
||||
.unwrap();
|
||||
let index = retrieved_doc
|
||||
.get_first(index.schema().get_field("index").unwrap())
|
||||
.unwrap()
|
||||
.as_u64()
|
||||
.unwrap();
|
||||
Ok(SearchResult {
|
||||
file_path: file_path.to_string(),
|
||||
session_hash: session_hash.to_string(),
|
||||
index,
|
||||
})
|
||||
})
|
||||
.collect::<Result<Vec<SearchResult>>>()?;
|
||||
|
||||
Ok(results)
|
||||
}
|
||||
|
@ -50,15 +50,14 @@ fn test_simple() {
|
||||
.unwrap();
|
||||
session.flush(&repo, &None, &project).unwrap();
|
||||
|
||||
let index = super::DeltasIndex::open_or_create(&index_path, &project).unwrap();
|
||||
let mut searcher = super::Deltas::at(&index_path);
|
||||
|
||||
let reference = repo.find_reference(&project.refname()).unwrap();
|
||||
let write_result = index.write(&session, &repo, &project, &reference);
|
||||
let write_result = searcher.index(&repo, &project, &session);
|
||||
assert!(write_result.is_ok());
|
||||
|
||||
let session_hash = session.hash.unwrap();
|
||||
|
||||
let search_result1 = index.search("hello");
|
||||
let search_result1 = searcher.search(&project.id, "hello");
|
||||
assert!(search_result1.is_ok());
|
||||
let search_result1 = search_result1.unwrap();
|
||||
assert_eq!(search_result1.len(), 1);
|
||||
@ -66,7 +65,7 @@ fn test_simple() {
|
||||
assert_eq!(search_result1[0].file_path, "test.txt");
|
||||
assert_eq!(search_result1[0].index, 0);
|
||||
|
||||
let search_result2 = index.search("world");
|
||||
let search_result2 = searcher.search(&project.id, "world");
|
||||
assert!(search_result2.is_ok());
|
||||
let search_result2 = search_result2.unwrap();
|
||||
assert_eq!(search_result2.len(), 1);
|
||||
@ -74,7 +73,7 @@ fn test_simple() {
|
||||
assert_eq!(search_result2[0].file_path, "test.txt");
|
||||
assert_eq!(search_result2[0].index, 1);
|
||||
|
||||
let search_result3 = index.search("hello world");
|
||||
let search_result3 = searcher.search(&project.id, "hello world");
|
||||
assert!(search_result3.is_ok());
|
||||
let search_result3 = search_result3.unwrap();
|
||||
assert_eq!(search_result3.len(), 2);
|
||||
@ -83,10 +82,15 @@ fn test_simple() {
|
||||
assert_eq!(search_result3[1].session_hash, session_hash);
|
||||
assert_eq!(search_result3[1].file_path, "test.txt");
|
||||
|
||||
let search_by_filename_result = index.search("test.txt");
|
||||
let search_by_filename_result = searcher.search(&project.id, "test.txt");
|
||||
assert!(search_by_filename_result.is_ok());
|
||||
let search_by_filename_result = search_by_filename_result.unwrap();
|
||||
assert_eq!(search_by_filename_result.len(), 2);
|
||||
assert_eq!(search_by_filename_result[0].session_hash, session_hash);
|
||||
assert_eq!(search_by_filename_result[0].file_path, "test.txt");
|
||||
|
||||
let not_found_result = searcher.search("404", "hello world");
|
||||
assert!(not_found_result.is_ok());
|
||||
let not_found_result = not_found_result.unwrap();
|
||||
assert_eq!(not_found_result.len(), 0);
|
||||
}
|
||||
|
@ -1,6 +1,6 @@
|
||||
mod deltas;
|
||||
|
||||
pub use deltas::DeltasIndex;
|
||||
pub use deltas::Deltas;
|
||||
|
||||
#[cfg(test)]
|
||||
mod deltas_test;
|
||||
|
Loading…
Reference in New Issue
Block a user