From d85b40f47226361a81419ed7de45f05b86d79d8d Mon Sep 17 00:00:00 2001 From: Nikita Galaiko Date: Fri, 3 Mar 2023 10:39:08 +0100 Subject: [PATCH] make searcher non-project specific --- src-tauri/src/search/deltas.rs | 345 ++++++++++++++++------------ src-tauri/src/search/deltas_test.rs | 18 +- src-tauri/src/search/mod.rs | 2 +- 3 files changed, 211 insertions(+), 154 deletions(-) diff --git a/src-tauri/src/search/deltas.rs b/src-tauri/src/search/deltas.rs index 81154210e..6d3f3bd5f 100644 --- a/src-tauri/src/search/deltas.rs +++ b/src-tauri/src/search/deltas.rs @@ -1,15 +1,75 @@ use crate::{deltas, projects, sessions}; use anyhow::Result; -use std::{fs, path::Path}; -use tantivy::{collector, directory::MmapDirectory, schema}; +use std::{ + collections::HashMap, + fs, + path::Path, + sync::{Arc, Mutex}, + vec, +}; +use tantivy::{collector, directory::MmapDirectory, schema, IndexWriter}; -#[derive(Clone)] -pub struct DeltasIndex { - index: tantivy::Index, - reader: tantivy::IndexReader, +pub struct Deltas { + base_path: String, + + indexes: HashMap, + readers: HashMap, + writers: HashMap>>, } -fn schema() -> schema::Schema { +impl Deltas { + pub fn at>(path: P) -> Self { + Self { + base_path: path.as_ref().to_str().unwrap().to_string(), + readers: HashMap::new(), + writers: HashMap::new(), + indexes: HashMap::new(), + } + } + + fn init(&mut self, project_id: &str) -> Result<()> { + if self.indexes.contains_key(project_id) { + return Ok(()); + } + + let index = open_or_create(Path::new(&self.base_path), project_id)?; + let reader = index.reader()?; + let writer = index.writer(WRITE_BUFFER_SIZE)?; + self.readers.insert(project_id.to_string(), reader); + self.writers + .insert(project_id.to_string(), Arc::new(Mutex::new(writer))); + self.indexes.insert(project_id.to_string(), index); + Ok(()) + } + + pub fn search(&self, project_id: &str, query: &str) -> Result> { + match self.readers.get(project_id) { + None => Ok(vec![]), + Some(reader) => { + let index = self.indexes.get(project_id).unwrap(); + search(index, reader, query) + } + } + } + + pub fn index( + &mut self, + repo: &git2::Repository, + project: &projects::Project, + session: &sessions::Session, + ) -> Result<()> { + self.init(&project.id)?; + index( + &self.indexes.get(&project.id).unwrap(), + &mut self.writers.get(&project.id).unwrap().lock().unwrap(), + session, + repo, + project, + ) + } +} + +fn build_schema() -> schema::Schema { let mut schema_builder = schema::Schema::builder(); schema_builder.add_text_field( "session_hash", @@ -48,151 +108,144 @@ pub struct SearchResult { pub index: u64, } -impl DeltasIndex { - pub fn open_or_create>( - base_path: P, - project: &projects::Project, - ) -> Result { - let dir = base_path - .as_ref() - .join("indexes") - .join(&project.id) - .join("deltas"); - fs::create_dir_all(&dir)?; +fn open_or_create>(base_path: P, project_id: &str) -> Result { + let dir = base_path + .as_ref() + .join("indexes") + .join(&project_id) + .join("deltas"); + fs::create_dir_all(&dir)?; - let schema = schema(); - let mmap_dir = MmapDirectory::open(dir)?; - let index = tantivy::Index::open_or_create(mmap_dir, schema)?; - Ok(Self { - index: index.clone(), - reader: index.reader()?, - }) + let mmap_dir = MmapDirectory::open(dir)?; + let schema = build_schema(); + let index = tantivy::Index::open_or_create(mmap_dir, schema)?; + Ok(index) +} + +fn index( + index: &tantivy::Index, + writer: &mut IndexWriter, + session: &sessions::Session, + repo: &git2::Repository, + project: &projects::Project, +) -> Result<()> { + let reference = repo.find_reference(&project.refname())?; + let deltas = deltas::list(repo, project, &reference, &session.id)?; + println!("Found {} deltas", deltas.len()); + if deltas.is_empty() { + return Ok(()); } - - fn with_writer(&self, f: impl FnOnce(&tantivy::IndexWriter) -> Result<()>) -> Result<()> { - let mut writer = self.index.writer(WRITE_BUFFER_SIZE)?; - f(&mut writer)?; - writer.commit()?; - Ok(()) - } - - pub fn write( - &self, - session: &sessions::Session, - repo: &git2::Repository, - project: &projects::Project, - reference: &git2::Reference, - ) -> Result<()> { - let deltas = deltas::list(repo, project, reference, &session.id)?; - println!("Found {} deltas", deltas.len()); - if deltas.is_empty() { - return Ok(()); - } - let files = sessions::list_files( - repo, - project, - reference, - &session.id, - Some(deltas.keys().map(|k| k.as_str()).collect()), - )?; - match &session.hash { - None => Err(anyhow::anyhow!("Session hash is not set, on")), - Some(hash) => self.with_writer(|writer| { - let field_session_hash = self.index.schema().get_field("session_hash").unwrap(); - let field_file_path = self.index.schema().get_field("file_path").unwrap(); - let field_diff = self.index.schema().get_field("diff").unwrap(); - let field_is_addition = self.index.schema().get_field("is_addition").unwrap(); - let field_is_deletion = self.index.schema().get_field("is_deletion").unwrap(); - let field_index = self.index.schema().get_field("index").unwrap(); - - // index every file - for (file_path, deltas) in deltas.into_iter() { - // keep the state of the file after each delta operation - // we need it to calculate diff for delete operations - let mut file_text: Vec = files - .get(&file_path) - .map(|f| f.as_str()) - .unwrap_or("") - .chars() - .collect(); - // for every deltas for the file - for (i, delta) in deltas.into_iter().enumerate() { - // for every operation in the delta - for operation in &delta.operations { - let mut doc = tantivy::Document::default(); - doc.add_u64(field_index, i.try_into()?); - doc.add_text(field_session_hash, hash); - doc.add_text(field_file_path, file_path.as_str()); - match operation { - deltas::Operation::Delete((from, len)) => { - // here we use the file_text to calculate the diff - let diff = file_text - .iter() - .skip((*from).try_into()?) - .take((*len).try_into()?) - .collect::(); - doc.add_text(field_diff, diff); - doc.add_bool(field_is_deletion, true); - } - deltas::Operation::Insert((_from, value)) => { - doc.add_text(field_diff, value); - doc.add_bool(field_is_addition, true); - } + let files = sessions::list_files( + repo, + project, + &reference, + &session.id, + Some(deltas.keys().map(|k| k.as_str()).collect()), + )?; + match &session.hash { + None => Err(anyhow::anyhow!("Session hash is not set, on")), + Some(hash) => { + // index every file + for (file_path, deltas) in deltas.into_iter() { + // keep the state of the file after each delta operation + // we need it to calculate diff for delete operations + let mut file_text: Vec = files + .get(&file_path) + .map(|f| f.as_str()) + .unwrap_or("") + .chars() + .collect(); + // for every deltas for the file + for (i, delta) in deltas.into_iter().enumerate() { + // for every operation in the delta + for operation in &delta.operations { + let mut doc = tantivy::Document::default(); + doc.add_u64(index.schema().get_field("index").unwrap(), i.try_into()?); + doc.add_text(index.schema().get_field("session_hash").unwrap(), hash); + doc.add_text( + index.schema().get_field("file_path").unwrap(), + file_path.as_str(), + ); + match operation { + deltas::Operation::Delete((from, len)) => { + // here we use the file_text to calculate the diff + let diff = file_text + .iter() + .skip((*from).try_into()?) + .take((*len).try_into()?) + .collect::(); + doc.add_text(index.schema().get_field("diff").unwrap(), diff); + doc.add_bool( + index.schema().get_field("is_deletion").unwrap(), + true, + ); + } + deltas::Operation::Insert((_from, value)) => { + doc.add_text(index.schema().get_field("diff").unwrap(), value); + doc.add_bool( + index.schema().get_field("is_addition").unwrap(), + true, + ); } - writer.add_document(doc)?; - - // don't forget to apply the operation to the file_text - operation.apply(&mut file_text); } + writer.add_document(doc)?; + + // don't forget to apply the operation to the file_text + operation.apply(&mut file_text); } } - Ok(()) - }), + } + writer.commit()?; + Ok(()) } } - - pub fn search(&self, q: &str) -> Result> { - let field_file_path = self.index.schema().get_field("file_path").unwrap(); - let field_diff = self.index.schema().get_field("diff").unwrap(); - let field_session_hash = self.index.schema().get_field("session_hash").unwrap(); - let field_index = self.index.schema().get_field("index").unwrap(); - - let query_parser = - &tantivy::query::QueryParser::for_index(&self.index, vec![field_file_path, field_diff]); - - let query = query_parser.parse_query(q)?; - - self.reader.reload()?; - let searcher = self.reader.searcher(); - let top_docs = searcher.search(&query, &collector::TopDocs::with_limit(10))?; - - let results = top_docs - .iter() - .map(|(_score, doc_address)| { - let retrieved_doc = searcher.doc(*doc_address)?; - let file_path = retrieved_doc - .get_first(field_file_path) - .unwrap() - .as_text() - .unwrap(); - let session_hash = retrieved_doc - .get_first(field_session_hash) - .unwrap() - .as_text() - .unwrap(); - let index = retrieved_doc - .get_first(field_index) - .unwrap() - .as_u64() - .unwrap(); - Ok(SearchResult { - file_path: file_path.to_string(), - session_hash: session_hash.to_string(), - index, - }) - }) - .collect::>>()?; - - Ok(results) - } +} + +pub fn search( + index: &tantivy::Index, + reader: &tantivy::IndexReader, + q: &str, +) -> Result> { + let query_parser = &tantivy::query::QueryParser::for_index( + index, + vec![ + index.schema().get_field("diff").unwrap(), + index.schema().get_field("file_path").unwrap(), + ], + ); + + let query = query_parser.parse_query(q)?; + + reader.reload()?; + let searcher = reader.searcher(); + let top_docs = searcher.search(&query, &collector::TopDocs::with_limit(10))?; + + let results = top_docs + .iter() + .map(|(_score, doc_address)| { + let retrieved_doc = searcher.doc(*doc_address)?; + let file_path = retrieved_doc + .get_first(index.schema().get_field("file_path").unwrap()) + .unwrap() + .as_text() + .unwrap(); + let session_hash = retrieved_doc + .get_first(index.schema().get_field("session_hash").unwrap()) + .unwrap() + .as_text() + .unwrap(); + let index = retrieved_doc + .get_first(index.schema().get_field("index").unwrap()) + .unwrap() + .as_u64() + .unwrap(); + Ok(SearchResult { + file_path: file_path.to_string(), + session_hash: session_hash.to_string(), + index, + }) + }) + .collect::>>()?; + + Ok(results) } diff --git a/src-tauri/src/search/deltas_test.rs b/src-tauri/src/search/deltas_test.rs index 988933129..a5909f75f 100644 --- a/src-tauri/src/search/deltas_test.rs +++ b/src-tauri/src/search/deltas_test.rs @@ -50,15 +50,14 @@ fn test_simple() { .unwrap(); session.flush(&repo, &None, &project).unwrap(); - let index = super::DeltasIndex::open_or_create(&index_path, &project).unwrap(); + let mut searcher = super::Deltas::at(&index_path); - let reference = repo.find_reference(&project.refname()).unwrap(); - let write_result = index.write(&session, &repo, &project, &reference); + let write_result = searcher.index(&repo, &project, &session); assert!(write_result.is_ok()); let session_hash = session.hash.unwrap(); - let search_result1 = index.search("hello"); + let search_result1 = searcher.search(&project.id, "hello"); assert!(search_result1.is_ok()); let search_result1 = search_result1.unwrap(); assert_eq!(search_result1.len(), 1); @@ -66,7 +65,7 @@ fn test_simple() { assert_eq!(search_result1[0].file_path, "test.txt"); assert_eq!(search_result1[0].index, 0); - let search_result2 = index.search("world"); + let search_result2 = searcher.search(&project.id, "world"); assert!(search_result2.is_ok()); let search_result2 = search_result2.unwrap(); assert_eq!(search_result2.len(), 1); @@ -74,7 +73,7 @@ fn test_simple() { assert_eq!(search_result2[0].file_path, "test.txt"); assert_eq!(search_result2[0].index, 1); - let search_result3 = index.search("hello world"); + let search_result3 = searcher.search(&project.id, "hello world"); assert!(search_result3.is_ok()); let search_result3 = search_result3.unwrap(); assert_eq!(search_result3.len(), 2); @@ -83,10 +82,15 @@ fn test_simple() { assert_eq!(search_result3[1].session_hash, session_hash); assert_eq!(search_result3[1].file_path, "test.txt"); - let search_by_filename_result = index.search("test.txt"); + let search_by_filename_result = searcher.search(&project.id, "test.txt"); assert!(search_by_filename_result.is_ok()); let search_by_filename_result = search_by_filename_result.unwrap(); assert_eq!(search_by_filename_result.len(), 2); assert_eq!(search_by_filename_result[0].session_hash, session_hash); assert_eq!(search_by_filename_result[0].file_path, "test.txt"); + + let not_found_result = searcher.search("404", "hello world"); + assert!(not_found_result.is_ok()); + let not_found_result = not_found_result.unwrap(); + assert_eq!(not_found_result.len(), 0); } diff --git a/src-tauri/src/search/mod.rs b/src-tauri/src/search/mod.rs index 9d0107cde..7d707a7a4 100644 --- a/src-tauri/src/search/mod.rs +++ b/src-tauri/src/search/mod.rs @@ -1,6 +1,6 @@ mod deltas; -pub use deltas::DeltasIndex; +pub use deltas::Deltas; #[cfg(test)] mod deltas_test;