make searcher non-project specific

2024-12-25 10:33:21 +03:00 · 2023-03-03 10:39:08 +01:00 · 2023-03-03 10:39:08 +01:00 · d85b40f472
commit d85b40f472
parent d04df229d6
3 changed files with 211 additions and 154 deletions
--- a/src-tauri/src/search/deltas.rs
+++ b/src-tauri/src/search/deltas.rs
@@ -1,15 +1,75 @@
 use crate::{deltas, projects, sessions};
 use anyhow::Result;
-use std::{fs, path::Path};
-use tantivy::{collector, directory::MmapDirectory, schema};
+use std::{
+    collections::HashMap,
+    fs,
+    path::Path,
+    sync::{Arc, Mutex},
+    vec,
+};
+use tantivy::{collector, directory::MmapDirectory, schema, IndexWriter};

-#[derive(Clone)]
-pub struct DeltasIndex {
-    index: tantivy::Index,
-    reader: tantivy::IndexReader,
+pub struct Deltas {
+    base_path: String,
+
+    indexes: HashMap<String, tantivy::Index>,
+    readers: HashMap<String, tantivy::IndexReader>,
+    writers: HashMap<String, Arc<Mutex<tantivy::IndexWriter>>>,
 }

-fn schema() -> schema::Schema {
+impl Deltas {
+    pub fn at<P: AsRef<Path>>(path: P) -> Self {
+        Self {
+            base_path: path.as_ref().to_str().unwrap().to_string(),
+            readers: HashMap::new(),
+            writers: HashMap::new(),
+            indexes: HashMap::new(),
+        }
+    }
+
+    fn init(&mut self, project_id: &str) -> Result<()> {
+        if self.indexes.contains_key(project_id) {
+            return Ok(());
+        }
+
+        let index = open_or_create(Path::new(&self.base_path), project_id)?;
+        let reader = index.reader()?;
+        let writer = index.writer(WRITE_BUFFER_SIZE)?;
+        self.readers.insert(project_id.to_string(), reader);
+        self.writers
+            .insert(project_id.to_string(), Arc::new(Mutex::new(writer)));
+        self.indexes.insert(project_id.to_string(), index);
+        Ok(())
+    }
+
+    pub fn search(&self, project_id: &str, query: &str) -> Result<Vec<SearchResult>> {
+        match self.readers.get(project_id) {
+            None => Ok(vec![]),
+            Some(reader) => {
+                let index = self.indexes.get(project_id).unwrap();
+                search(index, reader, query)
+            }
+        }
+    }
+
+    pub fn index(
+        &mut self,
+        repo: &git2::Repository,
+        project: &projects::Project,
+        session: &sessions::Session,
+    ) -> Result<()> {
+        self.init(&project.id)?;
+        index(
+            &self.indexes.get(&project.id).unwrap(),
+            &mut self.writers.get(&project.id).unwrap().lock().unwrap(),
+            session,
+            repo,
+            project,
+        )
+    }
+}
+
+fn build_schema() -> schema::Schema {
    let mut schema_builder = schema::Schema::builder();
    schema_builder.add_text_field(
        "session_hash",
@@ -48,151 +108,144 @@ pub struct SearchResult {
    pub index: u64,
 }

-impl DeltasIndex {
-    pub fn open_or_create<P: AsRef<Path>>(
-        base_path: P,
-        project: &projects::Project,
-    ) -> Result<Self> {
-        let dir = base_path
-            .as_ref()
-            .join("indexes")
-            .join(&project.id)
-            .join("deltas");
-        fs::create_dir_all(&dir)?;
+fn open_or_create<P: AsRef<Path>>(base_path: P, project_id: &str) -> Result<tantivy::Index> {
+    let dir = base_path
+        .as_ref()
+        .join("indexes")
+        .join(&project_id)
+        .join("deltas");
+    fs::create_dir_all(&dir)?;

-        let schema = schema();
-        let mmap_dir = MmapDirectory::open(dir)?;
-        let index = tantivy::Index::open_or_create(mmap_dir, schema)?;
-        Ok(Self {
-            index: index.clone(),
-            reader: index.reader()?,
-        })
+    let mmap_dir = MmapDirectory::open(dir)?;
+    let schema = build_schema();
+    let index = tantivy::Index::open_or_create(mmap_dir, schema)?;
+    Ok(index)
+}
+
+fn index(
+    index: &tantivy::Index,
+    writer: &mut IndexWriter,
+    session: &sessions::Session,
+    repo: &git2::Repository,
+    project: &projects::Project,
+) -> Result<()> {
+    let reference = repo.find_reference(&project.refname())?;
+    let deltas = deltas::list(repo, project, &reference, &session.id)?;
+    println!("Found {} deltas", deltas.len());
+    if deltas.is_empty() {
+        return Ok(());
    }
-
-    fn with_writer(&self, f: impl FnOnce(&tantivy::IndexWriter) -> Result<()>) -> Result<()> {
-        let mut writer = self.index.writer(WRITE_BUFFER_SIZE)?;
-        f(&mut writer)?;
-        writer.commit()?;
-        Ok(())
-    }
-
-    pub fn write(
-        &self,
-        session: &sessions::Session,
-        repo: &git2::Repository,
-        project: &projects::Project,
-        reference: &git2::Reference,
-    ) -> Result<()> {
-        let deltas = deltas::list(repo, project, reference, &session.id)?;
-        println!("Found {} deltas", deltas.len());
-        if deltas.is_empty() {
-            return Ok(());
-        }
-        let files = sessions::list_files(
-            repo,
-            project,
-            reference,
-            &session.id,
-            Some(deltas.keys().map(|k| k.as_str()).collect()),
-        )?;
-        match &session.hash {
-            None => Err(anyhow::anyhow!("Session hash is not set, on")),
-            Some(hash) => self.with_writer(|writer| {
-                let field_session_hash = self.index.schema().get_field("session_hash").unwrap();
-                let field_file_path = self.index.schema().get_field("file_path").unwrap();
-                let field_diff = self.index.schema().get_field("diff").unwrap();
-                let field_is_addition = self.index.schema().get_field("is_addition").unwrap();
-                let field_is_deletion = self.index.schema().get_field("is_deletion").unwrap();
-                let field_index = self.index.schema().get_field("index").unwrap();
-
-                // index every file
-                for (file_path, deltas) in deltas.into_iter() {
-                    // keep the state of the file after each delta operation
-                    // we need it to calculate diff for delete operations
-                    let mut file_text: Vec<char> = files
-                        .get(&file_path)
-                        .map(|f| f.as_str())
-                        .unwrap_or("")
-                        .chars()
-                        .collect();
-                    // for every deltas for the file
-                    for (i, delta) in deltas.into_iter().enumerate() {
-                        // for every operation in the delta
-                        for operation in &delta.operations {
-                            let mut doc = tantivy::Document::default();
-                            doc.add_u64(field_index, i.try_into()?);
-                            doc.add_text(field_session_hash, hash);
-                            doc.add_text(field_file_path, file_path.as_str());
-                            match operation {
-                                deltas::Operation::Delete((from, len)) => {
-                                    // here we use the file_text to calculate the diff
-                                    let diff = file_text
-                                        .iter()
-                                        .skip((*from).try_into()?)
-                                        .take((*len).try_into()?)
-                                        .collect::<String>();
-                                    doc.add_text(field_diff, diff);
-                                    doc.add_bool(field_is_deletion, true);
-                                }
-                                deltas::Operation::Insert((_from, value)) => {
-                                    doc.add_text(field_diff, value);
-                                    doc.add_bool(field_is_addition, true);
-                                }
+    let files = sessions::list_files(
+        repo,
+        project,
+        &reference,
+        &session.id,
+        Some(deltas.keys().map(|k| k.as_str()).collect()),
+    )?;
+    match &session.hash {
+        None => Err(anyhow::anyhow!("Session hash is not set, on")),
+        Some(hash) => {
+            // index every file
+            for (file_path, deltas) in deltas.into_iter() {
+                // keep the state of the file after each delta operation
+                // we need it to calculate diff for delete operations
+                let mut file_text: Vec<char> = files
+                    .get(&file_path)
+                    .map(|f| f.as_str())
+                    .unwrap_or("")
+                    .chars()
+                    .collect();
+                // for every deltas for the file
+                for (i, delta) in deltas.into_iter().enumerate() {
+                    // for every operation in the delta
+                    for operation in &delta.operations {
+                        let mut doc = tantivy::Document::default();
+                        doc.add_u64(index.schema().get_field("index").unwrap(), i.try_into()?);
+                        doc.add_text(index.schema().get_field("session_hash").unwrap(), hash);
+                        doc.add_text(
+                            index.schema().get_field("file_path").unwrap(),
+                            file_path.as_str(),
+                        );
+                        match operation {
+                            deltas::Operation::Delete((from, len)) => {
+                                // here we use the file_text to calculate the diff
+                                let diff = file_text
+                                    .iter()
+                                    .skip((*from).try_into()?)
+                                    .take((*len).try_into()?)
+                                    .collect::<String>();
+                                doc.add_text(index.schema().get_field("diff").unwrap(), diff);
+                                doc.add_bool(
+                                    index.schema().get_field("is_deletion").unwrap(),
+                                    true,
+                                );
+                            }
+                            deltas::Operation::Insert((_from, value)) => {
+                                doc.add_text(index.schema().get_field("diff").unwrap(), value);
+                                doc.add_bool(
+                                    index.schema().get_field("is_addition").unwrap(),
+                                    true,
+                                );
                            }
-                            writer.add_document(doc)?;
-
-                            // don't forget to apply the operation to the file_text
-                            operation.apply(&mut file_text);
                        }
+                        writer.add_document(doc)?;
+
+                        // don't forget to apply the operation to the file_text
+                        operation.apply(&mut file_text);
                    }
                }
-                Ok(())
-            }),
+            }
+            writer.commit()?;
+            Ok(())
        }
    }
-
-    pub fn search(&self, q: &str) -> Result<Vec<SearchResult>> {
-        let field_file_path = self.index.schema().get_field("file_path").unwrap();
-        let field_diff = self.index.schema().get_field("diff").unwrap();
-        let field_session_hash = self.index.schema().get_field("session_hash").unwrap();
-        let field_index = self.index.schema().get_field("index").unwrap();
-
-        let query_parser =
-            &tantivy::query::QueryParser::for_index(&self.index, vec![field_file_path, field_diff]);
-
-        let query = query_parser.parse_query(q)?;
-
-        self.reader.reload()?;
-        let searcher = self.reader.searcher();
-        let top_docs = searcher.search(&query, &collector::TopDocs::with_limit(10))?;
-
-        let results = top_docs
-            .iter()
-            .map(|(_score, doc_address)| {
-                let retrieved_doc = searcher.doc(*doc_address)?;
-                let file_path = retrieved_doc
-                    .get_first(field_file_path)
-                    .unwrap()
-                    .as_text()
-                    .unwrap();
-                let session_hash = retrieved_doc
-                    .get_first(field_session_hash)
-                    .unwrap()
-                    .as_text()
-                    .unwrap();
-                let index = retrieved_doc
-                    .get_first(field_index)
-                    .unwrap()
-                    .as_u64()
-                    .unwrap();
-                Ok(SearchResult {
-                    file_path: file_path.to_string(),
-                    session_hash: session_hash.to_string(),
-                    index,
-                })
-            })
-            .collect::<Result<Vec<SearchResult>>>()?;
-
-        Ok(results)
-    }
+}
+
+pub fn search(
+    index: &tantivy::Index,
+    reader: &tantivy::IndexReader,
+    q: &str,
+) -> Result<Vec<SearchResult>> {
+    let query_parser = &tantivy::query::QueryParser::for_index(
+        index,
+        vec![
+            index.schema().get_field("diff").unwrap(),
+            index.schema().get_field("file_path").unwrap(),
+        ],
+    );
+
+    let query = query_parser.parse_query(q)?;
+
+    reader.reload()?;
+    let searcher = reader.searcher();
+    let top_docs = searcher.search(&query, &collector::TopDocs::with_limit(10))?;
+
+    let results = top_docs
+        .iter()
+        .map(|(_score, doc_address)| {
+            let retrieved_doc = searcher.doc(*doc_address)?;
+            let file_path = retrieved_doc
+                .get_first(index.schema().get_field("file_path").unwrap())
+                .unwrap()
+                .as_text()
+                .unwrap();
+            let session_hash = retrieved_doc
+                .get_first(index.schema().get_field("session_hash").unwrap())
+                .unwrap()
+                .as_text()
+                .unwrap();
+            let index = retrieved_doc
+                .get_first(index.schema().get_field("index").unwrap())
+                .unwrap()
+                .as_u64()
+                .unwrap();
+            Ok(SearchResult {
+                file_path: file_path.to_string(),
+                session_hash: session_hash.to_string(),
+                index,
+            })
+        })
+        .collect::<Result<Vec<SearchResult>>>()?;
+
+    Ok(results)
 }
--- a/src-tauri/src/search/deltas_test.rs
+++ b/src-tauri/src/search/deltas_test.rs
@@ -50,15 +50,14 @@ fn test_simple() {
    .unwrap();
    session.flush(&repo, &None, &project).unwrap();

-    let index = super::DeltasIndex::open_or_create(&index_path, &project).unwrap();
+    let mut searcher = super::Deltas::at(&index_path);

-    let reference = repo.find_reference(&project.refname()).unwrap();
-    let write_result = index.write(&session, &repo, &project, &reference);
+    let write_result = searcher.index(&repo, &project, &session);
    assert!(write_result.is_ok());

    let session_hash = session.hash.unwrap();

-    let search_result1 = index.search("hello");
+    let search_result1 = searcher.search(&project.id, "hello");
    assert!(search_result1.is_ok());
    let search_result1 = search_result1.unwrap();
    assert_eq!(search_result1.len(), 1);
@@ -66,7 +65,7 @@ fn test_simple() {
    assert_eq!(search_result1[0].file_path, "test.txt");
    assert_eq!(search_result1[0].index, 0);

-    let search_result2 = index.search("world");
+    let search_result2 = searcher.search(&project.id, "world");
    assert!(search_result2.is_ok());
    let search_result2 = search_result2.unwrap();
    assert_eq!(search_result2.len(), 1);
@@ -74,7 +73,7 @@ fn test_simple() {
    assert_eq!(search_result2[0].file_path, "test.txt");
    assert_eq!(search_result2[0].index, 1);

-    let search_result3 = index.search("hello world");
+    let search_result3 = searcher.search(&project.id, "hello world");
    assert!(search_result3.is_ok());
    let search_result3 = search_result3.unwrap();
    assert_eq!(search_result3.len(), 2);
@@ -83,10 +82,15 @@ fn test_simple() {
    assert_eq!(search_result3[1].session_hash, session_hash);
    assert_eq!(search_result3[1].file_path, "test.txt");

-    let search_by_filename_result = index.search("test.txt");
+    let search_by_filename_result = searcher.search(&project.id, "test.txt");
    assert!(search_by_filename_result.is_ok());
    let search_by_filename_result = search_by_filename_result.unwrap();
    assert_eq!(search_by_filename_result.len(), 2);
    assert_eq!(search_by_filename_result[0].session_hash, session_hash);
    assert_eq!(search_by_filename_result[0].file_path, "test.txt");
+
+    let not_found_result = searcher.search("404", "hello world");
+    assert!(not_found_result.is_ok());
+    let not_found_result = not_found_result.unwrap();
+    assert_eq!(not_found_result.len(), 0);
 }
--- a/src-tauri/src/search/mod.rs
+++ b/src-tauri/src/search/mod.rs
@@ -1,6 +1,6 @@
 mod deltas;

-pub use deltas::DeltasIndex;
+pub use deltas::Deltas;

 #[cfg(test)]
 mod deltas_test;