make searcher non-project specific

This commit is contained in:
Nikita Galaiko 2023-03-03 10:39:08 +01:00
parent d04df229d6
commit d85b40f472
3 changed files with 211 additions and 154 deletions

View File

@ -1,15 +1,75 @@
use crate::{deltas, projects, sessions};
use anyhow::Result;
use std::{fs, path::Path};
use tantivy::{collector, directory::MmapDirectory, schema};
use std::{
collections::HashMap,
fs,
path::Path,
sync::{Arc, Mutex},
vec,
};
use tantivy::{collector, directory::MmapDirectory, schema, IndexWriter};
#[derive(Clone)]
pub struct DeltasIndex {
index: tantivy::Index,
reader: tantivy::IndexReader,
pub struct Deltas {
base_path: String,
indexes: HashMap<String, tantivy::Index>,
readers: HashMap<String, tantivy::IndexReader>,
writers: HashMap<String, Arc<Mutex<tantivy::IndexWriter>>>,
}
fn schema() -> schema::Schema {
impl Deltas {
    /// Creates a searcher rooted at `path`.
    ///
    /// No index is opened here; per-project indexes, readers and writers are
    /// created on first use by [`Deltas::index`].
    ///
    /// # Panics
    ///
    /// Panics if `path` is not valid UTF-8, because the base path is stored as a `String`.
    pub fn at<P: AsRef<Path>>(path: P) -> Self {
        let base_path = path
            .as_ref()
            .to_str()
            .expect("index base path must be valid UTF-8")
            .to_string();
        Self {
            base_path,
            indexes: HashMap::new(),
            readers: HashMap::new(),
            writers: HashMap::new(),
        }
    }

    /// Opens (or creates) the index for `project_id` and caches it together
    /// with its reader and writer. Does nothing if the project is already cached.
    ///
    /// # Errors
    ///
    /// Returns an error if the index cannot be opened or created, or if its
    /// reader or writer cannot be constructed.
    fn init(&mut self, project_id: &str) -> Result<()> {
        if self.indexes.contains_key(project_id) {
            return Ok(());
        }

        // Run every fallible step before touching the maps, so a failure
        // never leaves one map populated without the others.
        let index = open_or_create(Path::new(&self.base_path), project_id)?;
        let reader = index.reader()?;
        let writer = index.writer(WRITE_BUFFER_SIZE)?;

        let key = project_id.to_string();
        self.readers.insert(key.clone(), reader);
        self.writers
            .insert(key.clone(), Arc::new(Mutex::new(writer)));
        self.indexes.insert(key, index);
        Ok(())
    }

    /// Searches the index of `project_id` for `query`.
    ///
    /// Returns an empty list when no index has been initialised for the
    /// project on this instance (initialisation happens in [`Deltas::index`]).
    ///
    /// # Errors
    ///
    /// Propagates any error returned by the underlying search.
    pub fn search(&self, project_id: &str, query: &str) -> Result<Vec<SearchResult>> {
        // Both maps are filled together by `init`; matching on the pair keeps
        // this path panic-free even if they ever disagree.
        match (self.indexes.get(project_id), self.readers.get(project_id)) {
            (Some(index), Some(reader)) => search(index, reader, query),
            _ => Ok(vec![]),
        }
    }

    /// Indexes the deltas of `session` into the index of `project`,
    /// initialising that index first if needed.
    ///
    /// # Errors
    ///
    /// Returns an error if initialisation or indexing fails.
    ///
    /// # Panics
    ///
    /// Panics if the writer mutex for the project is poisoned.
    pub fn index(
        &mut self,
        repo: &git2::Repository,
        project: &projects::Project,
        session: &sessions::Session,
    ) -> Result<()> {
        self.init(&project.id)?;

        // `init` succeeded, so both entries exist for this project.
        // The local is deliberately not called `index`: that would shadow the
        // free function called below.
        let project_index = self
            .indexes
            .get(&project.id)
            .expect("index is cached after a successful init");
        let mut writer = self
            .writers
            .get(&project.id)
            .expect("writer is cached after a successful init")
            .lock()
            .expect("index writer mutex poisoned");

        index(project_index, &mut writer, session, repo, project)
    }
}
fn build_schema() -> schema::Schema {
let mut schema_builder = schema::Schema::builder();
schema_builder.add_text_field(
"session_hash",
@ -48,151 +108,144 @@ pub struct SearchResult {
pub index: u64,
}
impl DeltasIndex {
pub fn open_or_create<P: AsRef<Path>>(
base_path: P,
project: &projects::Project,
) -> Result<Self> {
let dir = base_path
.as_ref()
.join("indexes")
.join(&project.id)
.join("deltas");
fs::create_dir_all(&dir)?;
fn open_or_create<P: AsRef<Path>>(base_path: P, project_id: &str) -> Result<tantivy::Index> {
let dir = base_path
.as_ref()
.join("indexes")
.join(&project_id)
.join("deltas");
fs::create_dir_all(&dir)?;
let schema = schema();
let mmap_dir = MmapDirectory::open(dir)?;
let index = tantivy::Index::open_or_create(mmap_dir, schema)?;
Ok(Self {
index: index.clone(),
reader: index.reader()?,
})
let mmap_dir = MmapDirectory::open(dir)?;
let schema = build_schema();
let index = tantivy::Index::open_or_create(mmap_dir, schema)?;
Ok(index)
}
fn index(
index: &tantivy::Index,
writer: &mut IndexWriter,
session: &sessions::Session,
repo: &git2::Repository,
project: &projects::Project,
) -> Result<()> {
let reference = repo.find_reference(&project.refname())?;
let deltas = deltas::list(repo, project, &reference, &session.id)?;
println!("Found {} deltas", deltas.len());
if deltas.is_empty() {
return Ok(());
}
/// Creates a writer for this index, hands it to `f`, and commits once `f` succeeds.
///
/// Any error from creating the writer, from `f`, or from the commit is
/// returned; the commit is skipped if `f` fails.
fn with_writer(&self, f: impl FnOnce(&tantivy::IndexWriter) -> Result<()>) -> Result<()> {
    let mut index_writer = self.index.writer(WRITE_BUFFER_SIZE)?;
    // The callback only needs shared access to the writer.
    f(&index_writer)?;
    index_writer.commit()?;
    Ok(())
}
pub fn write(
&self,
session: &sessions::Session,
repo: &git2::Repository,
project: &projects::Project,
reference: &git2::Reference,
) -> Result<()> {
let deltas = deltas::list(repo, project, reference, &session.id)?;
println!("Found {} deltas", deltas.len());
if deltas.is_empty() {
return Ok(());
}
let files = sessions::list_files(
repo,
project,
reference,
&session.id,
Some(deltas.keys().map(|k| k.as_str()).collect()),
)?;
match &session.hash {
None => Err(anyhow::anyhow!("Session hash is not set, on")),
Some(hash) => self.with_writer(|writer| {
let field_session_hash = self.index.schema().get_field("session_hash").unwrap();
let field_file_path = self.index.schema().get_field("file_path").unwrap();
let field_diff = self.index.schema().get_field("diff").unwrap();
let field_is_addition = self.index.schema().get_field("is_addition").unwrap();
let field_is_deletion = self.index.schema().get_field("is_deletion").unwrap();
let field_index = self.index.schema().get_field("index").unwrap();
// index every file
for (file_path, deltas) in deltas.into_iter() {
// keep the state of the file after each delta operation
// we need it to calculate diff for delete operations
let mut file_text: Vec<char> = files
.get(&file_path)
.map(|f| f.as_str())
.unwrap_or("")
.chars()
.collect();
// for every deltas for the file
for (i, delta) in deltas.into_iter().enumerate() {
// for every operation in the delta
for operation in &delta.operations {
let mut doc = tantivy::Document::default();
doc.add_u64(field_index, i.try_into()?);
doc.add_text(field_session_hash, hash);
doc.add_text(field_file_path, file_path.as_str());
match operation {
deltas::Operation::Delete((from, len)) => {
// here we use the file_text to calculate the diff
let diff = file_text
.iter()
.skip((*from).try_into()?)
.take((*len).try_into()?)
.collect::<String>();
doc.add_text(field_diff, diff);
doc.add_bool(field_is_deletion, true);
}
deltas::Operation::Insert((_from, value)) => {
doc.add_text(field_diff, value);
doc.add_bool(field_is_addition, true);
}
let files = sessions::list_files(
repo,
project,
&reference,
&session.id,
Some(deltas.keys().map(|k| k.as_str()).collect()),
)?;
match &session.hash {
None => Err(anyhow::anyhow!("Session hash is not set, on")),
Some(hash) => {
// index every file
for (file_path, deltas) in deltas.into_iter() {
// keep the state of the file after each delta operation
// we need it to calculate diff for delete operations
let mut file_text: Vec<char> = files
.get(&file_path)
.map(|f| f.as_str())
.unwrap_or("")
.chars()
.collect();
// for every deltas for the file
for (i, delta) in deltas.into_iter().enumerate() {
// for every operation in the delta
for operation in &delta.operations {
let mut doc = tantivy::Document::default();
doc.add_u64(index.schema().get_field("index").unwrap(), i.try_into()?);
doc.add_text(index.schema().get_field("session_hash").unwrap(), hash);
doc.add_text(
index.schema().get_field("file_path").unwrap(),
file_path.as_str(),
);
match operation {
deltas::Operation::Delete((from, len)) => {
// here we use the file_text to calculate the diff
let diff = file_text
.iter()
.skip((*from).try_into()?)
.take((*len).try_into()?)
.collect::<String>();
doc.add_text(index.schema().get_field("diff").unwrap(), diff);
doc.add_bool(
index.schema().get_field("is_deletion").unwrap(),
true,
);
}
deltas::Operation::Insert((_from, value)) => {
doc.add_text(index.schema().get_field("diff").unwrap(), value);
doc.add_bool(
index.schema().get_field("is_addition").unwrap(),
true,
);
}
writer.add_document(doc)?;
// don't forget to apply the operation to the file_text
operation.apply(&mut file_text);
}
writer.add_document(doc)?;
// don't forget to apply the operation to the file_text
operation.apply(&mut file_text);
}
}
Ok(())
}),
}
writer.commit()?;
Ok(())
}
}
/// Runs the query `q` against the `file_path` and `diff` fields and returns
/// at most 10 hits as [`SearchResult`]s.
///
/// The reader is reloaded before searching.
pub fn search(&self, q: &str) -> Result<Vec<SearchResult>> {
    let schema = self.index.schema();
    let path_field = schema.get_field("file_path").unwrap();
    let diff_field = schema.get_field("diff").unwrap();
    let hash_field = schema.get_field("session_hash").unwrap();
    let position_field = schema.get_field("index").unwrap();

    let parser =
        tantivy::query::QueryParser::for_index(&self.index, vec![path_field, diff_field]);
    let parsed = parser.parse_query(q)?;

    self.reader.reload()?;
    let searcher = self.reader.searcher();
    let hits = searcher.search(&parsed, &collector::TopDocs::with_limit(10))?;

    let mut results = Vec::with_capacity(hits.len());
    for (_score, address) in hits.iter() {
        let stored = searcher.doc(*address)?;
        let file_path = stored.get_first(path_field).unwrap().as_text().unwrap();
        let session_hash = stored.get_first(hash_field).unwrap().as_text().unwrap();
        let index = stored.get_first(position_field).unwrap().as_u64().unwrap();
        results.push(SearchResult {
            file_path: file_path.to_string(),
            session_hash: session_hash.to_string(),
            index,
        });
    }
    Ok(results)
}
}
/// Searches `index` for `q` and returns at most 10 matching deltas.
///
/// The query is parsed against the `diff` and `file_path` fields. `reader` is
/// reloaded before the search is run. Each hit is turned into a
/// [`SearchResult`] from the document's `file_path`, `session_hash` and
/// `index` values.
///
/// # Errors
///
/// Returns an error if the query cannot be parsed, the reader cannot be
/// reloaded, the search fails, or a matching document cannot be loaded.
///
/// # Panics
///
/// Panics if the schema lacks one of the fields named above, or if a matching
/// document lacks a `file_path`, `session_hash` or `index` value of the
/// expected type.
pub fn search(
    index: &tantivy::Index,
    reader: &tantivy::IndexReader,
    q: &str,
) -> Result<Vec<SearchResult>> {
    // Resolve the fields once up front instead of once per hit.
    let schema = index.schema();
    let field_diff = schema
        .get_field("diff")
        .expect("schema has no `diff` field");
    let field_file_path = schema
        .get_field("file_path")
        .expect("schema has no `file_path` field");
    let field_session_hash = schema
        .get_field("session_hash")
        .expect("schema has no `session_hash` field");
    let field_index = schema
        .get_field("index")
        .expect("schema has no `index` field");

    let query_parser =
        tantivy::query::QueryParser::for_index(index, vec![field_diff, field_file_path]);
    let query = query_parser.parse_query(q)?;

    reader.reload()?;
    let searcher = reader.searcher();
    let top_docs = searcher.search(&query, &collector::TopDocs::with_limit(10))?;

    top_docs
        .iter()
        .map(|(_score, doc_address)| {
            let retrieved_doc = searcher.doc(*doc_address)?;
            let file_path = retrieved_doc
                .get_first(field_file_path)
                .expect("document has no `file_path` value")
                .as_text()
                .expect("`file_path` value is not text");
            let session_hash = retrieved_doc
                .get_first(field_session_hash)
                .expect("document has no `session_hash` value")
                .as_text()
                .expect("`session_hash` value is not text");
            // Named `delta_index` so it does not shadow the `index` parameter.
            let delta_index = retrieved_doc
                .get_first(field_index)
                .expect("document has no `index` value")
                .as_u64()
                .expect("`index` value is not a u64");
            Ok(SearchResult {
                file_path: file_path.to_string(),
                session_hash: session_hash.to_string(),
                index: delta_index,
            })
        })
        .collect::<Result<Vec<SearchResult>>>()
}

View File

@ -50,15 +50,14 @@ fn test_simple() {
.unwrap();
session.flush(&repo, &None, &project).unwrap();
let index = super::DeltasIndex::open_or_create(&index_path, &project).unwrap();
let mut searcher = super::Deltas::at(&index_path);
let reference = repo.find_reference(&project.refname()).unwrap();
let write_result = index.write(&session, &repo, &project, &reference);
let write_result = searcher.index(&repo, &project, &session);
assert!(write_result.is_ok());
let session_hash = session.hash.unwrap();
let search_result1 = index.search("hello");
let search_result1 = searcher.search(&project.id, "hello");
assert!(search_result1.is_ok());
let search_result1 = search_result1.unwrap();
assert_eq!(search_result1.len(), 1);
@ -66,7 +65,7 @@ fn test_simple() {
assert_eq!(search_result1[0].file_path, "test.txt");
assert_eq!(search_result1[0].index, 0);
let search_result2 = index.search("world");
let search_result2 = searcher.search(&project.id, "world");
assert!(search_result2.is_ok());
let search_result2 = search_result2.unwrap();
assert_eq!(search_result2.len(), 1);
@ -74,7 +73,7 @@ fn test_simple() {
assert_eq!(search_result2[0].file_path, "test.txt");
assert_eq!(search_result2[0].index, 1);
let search_result3 = index.search("hello world");
let search_result3 = searcher.search(&project.id, "hello world");
assert!(search_result3.is_ok());
let search_result3 = search_result3.unwrap();
assert_eq!(search_result3.len(), 2);
@ -83,10 +82,15 @@ fn test_simple() {
assert_eq!(search_result3[1].session_hash, session_hash);
assert_eq!(search_result3[1].file_path, "test.txt");
let search_by_filename_result = index.search("test.txt");
let search_by_filename_result = searcher.search(&project.id, "test.txt");
assert!(search_by_filename_result.is_ok());
let search_by_filename_result = search_by_filename_result.unwrap();
assert_eq!(search_by_filename_result.len(), 2);
assert_eq!(search_by_filename_result[0].session_hash, session_hash);
assert_eq!(search_by_filename_result[0].file_path, "test.txt");
let not_found_result = searcher.search("404", "hello world");
assert!(not_found_result.is_ok());
let not_found_result = not_found_result.unwrap();
assert_eq!(not_found_result.len(), 0);
}

View File

@ -1,6 +1,6 @@
mod deltas;
pub use deltas::DeltasIndex;
pub use deltas::Deltas;
#[cfg(test)]
mod deltas_test;