setup ngram search

2024-12-23 09:33:01 +03:00 · 2023-05-25 16:33:33 +02:00 · 2023-05-25 16:33:33 +02:00 · afa4c397d8
commit afa4c397d8
parent 165fe531c4
3 changed files with 188 additions and 245 deletions
--- a/src-tauri/src/main.rs
+++ b/src-tauri/src/main.rs
@ -162,8 +162,6 @@ async fn search(
    query: &str,
    limit: Option<usize>,
    offset: Option<usize>,
-    timestamp_ms_gte: Option<u64>,
-    timestamp_ms_lt: Option<u64>,
 ) -> Result<search::SearchResults, Error> {
    let app = handle.state::<app::App>();

@ -172,10 +170,6 @@ async fn search(
        q: query.to_string(),
        limit: limit.unwrap_or(100),
        offset,
-        range: ops::Range {
-            start: timestamp_ms_gte.unwrap_or(0),
-            end: timestamp_ms_lt.unwrap_or(u64::MAX),
-        },
    };

    let results = app.search(&query).with_context(|| {
--- a/src-tauri/src/search/deltas.rs
+++ b/src-tauri/src/search/deltas.rs
@ -1,4 +1,3 @@
-use std::ops::Range;
 use std::{
    fs,
    path::{Path, PathBuf},
@ -10,10 +9,16 @@ use anyhow::{Context, Result};
 use serde::Serialize;
 use similar::{ChangeTag, TextDiff};
 use tantivy::{collector, directory::MmapDirectory, schema, IndexWriter};
+use tantivy::{query::QueryParser, Term};
+use tantivy::{
+    query::{Occur, TermQuery},
+    schema::{TextFieldIndexing, TextOptions},
+};
+use tantivy::{schema::IndexRecordOption, tokenizer};

 use crate::{deltas, gb_repository, sessions, storage};

-const CURRENT_VERSION: u64 = 4; // should not decrease
+const CURRENT_VERSION: u64 = 5; // should not decrease

 #[derive(Clone)]
 struct MetaStorage {
@ -91,6 +96,12 @@ impl Deltas {
            .settings(index_settings)
            .open_or_create(mmap_dir)?;

+        index.tokenizers().register(
+            "ngram2_3",
+            tokenizer::TextAnalyzer::from(tokenizer::NgramTokenizer::all_ngrams(2, 3))
+                .filter(tokenizer::LowerCaser),
+        );
+
        let reader = index.reader()?;
        let writer = index.writer_with_num_threads(1, WRITE_BUFFER_SIZE)?;

@ -102,8 +113,96 @@ impl Deltas {
        })
    }

-    pub fn search(&self, query: &SearchQuery) -> Result<SearchResults> {
-        search(&self.index, &self.reader, query)
+    /// Runs a paginated full-text search over the indexed deltas of one project.
+    ///
+    /// A document matches when its `version` equals `CURRENT_VERSION`, its
+    /// `project_id` is exactly `q.project_id`, and it satisfies `q.q` as parsed
+    /// by tantivy's query parser with `diff` and `file_path` as default fields.
+    /// Hits are ranked by the `timestamp_ms` fast field; `q.limit` and
+    /// `q.offset` select the returned page, while `total` counts every match.
+    ///
+    /// # Errors
+    ///
+    /// Fails when `q.q` cannot be parsed, when reloading the reader, building
+    /// the snippet generator or running the search fails, or when a hit cannot
+    /// be loaded or lacks one of the stored `project_id`, `file_path`,
+    /// `session_id` or `index` values.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the index schema lacks one of the fields looked up below.
+    pub fn search(&self, q: &SearchQuery) -> Result<SearchResults> {
+        // Resolve every field handle once, instead of once per hit.
+        let schema = self.index.schema();
+        let version_field = schema.get_field("version").unwrap();
+        let project_id_field = schema.get_field("project_id").unwrap();
+        let session_id_field = schema.get_field("session_id").unwrap();
+        let index_field = schema.get_field("index").unwrap();
+        let diff_field = schema.get_field("diff").unwrap();
+        let file_path_field = schema.get_field("file_path").unwrap();
+        let timestamp_ms_field = schema.get_field("timestamp_ms").unwrap();
+
+        let version_term_query = Box::new(TermQuery::new(
+            Term::from_field_u64(version_field, CURRENT_VERSION),
+            IndexRecordOption::Basic,
+        ));
+        let project_id_term_query = Box::new(TermQuery::new(
+            Term::from_field_text(project_id_field, q.project_id.as_str()),
+            IndexRecordOption::Basic,
+        ));
+        // Only this clause is built from user input; a malformed `q.q` is
+        // reported as an error here.
+        let diff_or_file_path_query = Box::new(
+            QueryParser::for_index(&self.index, vec![diff_field, file_path_field])
+                .parse_query(&q.q)?,
+        );
+
+        let query = tantivy::query::BooleanQuery::new(vec![
+            (Occur::Must, version_term_query),
+            (Occur::Must, project_id_term_query),
+            (Occur::Must, diff_or_file_path_query),
+        ]);
+
+        // Reload before acquiring the searcher so it reflects the latest commit.
+        self.reader.reload()?;
+        let searcher = self.reader.searcher();
+
+        // One pass over the index collects both the requested page and the
+        // total number of matches.
+        let mut collectors = collector::MultiCollector::new();
+        let top_docs_handle = collectors.add_collector(
+            collector::TopDocs::with_limit(q.limit)
+                .and_offset(q.offset.unwrap_or(0))
+                .order_by_u64_field(timestamp_ms_field),
+        );
+        let count_handle = collectors.add_collector(collector::Count);
+
+        let snippet_generator =
+            tantivy::SnippetGenerator::create(&searcher, &query, diff_field)?;
+
+        let mut result = searcher.search(&query, &collectors)?;
+        let count = count_handle.extract(&mut result);
+        let top_docs = top_docs_handle.extract(&mut result);
+
+        let page = top_docs
+            .iter()
+            .map(|(_score, doc_address)| {
+                let retrieved_doc = searcher.doc(*doc_address)?;
+
+                // A missing or mistyped stored value is surfaced as an error
+                // rather than a panic, so one bad document cannot crash the caller.
+                let project_id = retrieved_doc
+                    .get_first(project_id_field)
+                    .and_then(|value| value.as_text())
+                    .context("indexed document has no text `project_id` value")?;
+                let file_path = retrieved_doc
+                    .get_first(file_path_field)
+                    .and_then(|value| value.as_text())
+                    .context("indexed document has no text `file_path` value")?;
+                let session_id = retrieved_doc
+                    .get_first(session_id_field)
+                    .and_then(|value| value.as_text())
+                    .context("indexed document has no text `session_id` value")?;
+                let index = retrieved_doc
+                    .get_first(index_field)
+                    .and_then(|value| value.as_u64())
+                    .context("indexed document has no u64 `index` value")?;
+
+                // `highlighted()` yields ranges into the snippet fragment;
+                // copy out the substrings they cover.
+                let snippet = snippet_generator.snippet_from_doc(&retrieved_doc);
+                let fragment = snippet.fragment();
+                let highlighted: Vec<String> = snippet
+                    .highlighted()
+                    .iter()
+                    .map(|range| fragment[range.start..range.end].to_string())
+                    .collect();
+                Ok(SearchResult {
+                    project_id: project_id.to_string(),
+                    file_path: file_path.to_string(),
+                    session_id: session_id.to_string(),
+                    highlighted,
+                    index,
+                })
+            })
+            .collect::<Result<Vec<SearchResult>>>()?;
+
+        Ok(SearchResults { page, total: count })
     }

    pub fn delete_all_data(&self) -> Result<()> {
@ -158,15 +257,29 @@ impl Deltas {

 fn build_schema() -> schema::Schema { // builds the tantivy schema of the deltas search index
    let mut schema_builder = schema::Schema::builder();
-    schema_builder.add_u64_field("version", schema::INDEXED | schema::FAST);
-    schema_builder.add_text_field("project_id", schema::TEXT | schema::STORED | schema::FAST);
-    schema_builder.add_text_field("session_id", schema::STORED);
-    schema_builder.add_u64_field("index", schema::STORED);
-    schema_builder.add_text_field("file_path", schema::TEXT | schema::STORED | schema::FAST);
-    schema_builder.add_text_field("diff", schema::TEXT | schema::STORED);
-    schema_builder.add_bool_field("is_addition", schema::FAST);
-    schema_builder.add_bool_field("is_deletion", schema::FAST);
+
+    schema_builder.add_u64_field("version", schema::INDEXED); // search matches this against CURRENT_VERSION
    schema_builder.add_u64_field("timestamp_ms", schema::INDEXED | schema::FAST); // search orders hits by this field
+    schema_builder.add_u64_field("index", schema::STORED); // stored only; returned with each hit
+
+    let id_options = TextOptions::default()
+        .set_indexing_options(TextFieldIndexing::default().set_tokenizer("raw")) // presumably keeps an id as one term — confirm
+        .set_stored();
+
+    schema_builder.add_text_field("project_id", id_options.clone()); // search filters on it with an exact TermQuery
+    schema_builder.add_text_field("session_id", id_options);
+
+    let text_options = TextOptions::default()
+        .set_indexing_options(
+            TextFieldIndexing::default()
+                .set_tokenizer("ngram2_3") // name of the n-gram analyzer registered when the index is opened
+                .set_index_option(schema::IndexRecordOption::WithFreqsAndPositions),
+        )
+        .set_stored();
+
+    schema_builder.add_text_field("file_path", text_options.clone()); // default field of the search query parser
+    schema_builder.add_text_field("diff", text_options); // default field of the search query parser; snippets come from it
+
    schema_builder.build()
 }

@ -300,91 +413,4 @@ pub struct SearchQuery {
    pub project_id: String,
    pub limit: usize,
    pub offset: Option<usize>,
-    pub range: Range<u64>,
-}
-
-pub fn search(
-    index: &tantivy::Index,
-    reader: &tantivy::IndexReader,
-    q: &SearchQuery,
-) -> Result<SearchResults> {
-    let query = tantivy::query::QueryParser::for_index(
-        index,
-        vec![
-            index.schema().get_field("diff").unwrap(),
-            index.schema().get_field("file_path").unwrap(),
-        ],
-    )
-    .parse_query(
-        format!(
-            "version:\"{}\" AND project_id:\"{}\" AND timestamp_ms:[{} TO {}}} AND ({})",
-            CURRENT_VERSION, q.project_id, q.range.start, q.range.end, q.q,
-        )
-        .as_str(),
-    )?;
-
-    reader.reload()?;
-    let searcher = reader.searcher();
-
-    let mut collectors = collector::MultiCollector::new();
-    let top_docs_handle = collectors.add_collector(
-        collector::TopDocs::with_limit(q.limit)
-            .and_offset(q.offset.unwrap_or(0))
-            .order_by_u64_field(index.schema().get_field("timestamp_ms").unwrap()),
-    );
-    let count_handle = collectors.add_collector(collector::Count);
-
-    let snippet_generator = tantivy::SnippetGenerator::create(
-        &searcher,
-        &*query,
-        index.schema().get_field("diff").unwrap(),
-    )?;
-
-    let mut result = searcher.search(&query, &collectors)?;
-    let count = count_handle.extract(&mut result);
-    let top_docs = top_docs_handle.extract(&mut result);
-
-    let page = top_docs
-        .iter()
-        .map(|(_score, doc_address)| {
-            let retrieved_doc = searcher.doc(*doc_address)?;
-
-            let project_id = retrieved_doc
-                .get_first(index.schema().get_field("project_id").unwrap())
-                .unwrap()
-                .as_text()
-                .unwrap();
-            let file_path = retrieved_doc
-                .get_first(index.schema().get_field("file_path").unwrap())
-                .unwrap()
-                .as_text()
-                .unwrap();
-            let session_id = retrieved_doc
-                .get_first(index.schema().get_field("session_id").unwrap())
-                .unwrap()
-                .as_text()
-                .unwrap();
-            let index = retrieved_doc
-                .get_first(index.schema().get_field("index").unwrap())
-                .unwrap()
-                .as_u64()
-                .unwrap();
-            let snippet = snippet_generator.snippet_from_doc(&retrieved_doc);
-            let fragment = snippet.fragment();
-            let highlighted: Vec<String> = snippet
-                .highlighted()
-                .iter()
-                .map(|range| fragment[range.start..range.end].to_string())
-                .collect();
-            Ok(SearchResult {
-                project_id: project_id.to_string(),
-                file_path: file_path.to_string(),
-                session_id: session_id.to_string(),
-                highlighted,
-                index,
-            })
-        })
-        .collect::<Result<Vec<SearchResult>>>()?;
-
-    Ok(SearchResults { page, total: count })
 }
--- a/src-tauri/src/search/deltas_test.rs
+++ b/src-tauri/src/search/deltas_test.rs
@ -1,4 +1,3 @@
-use core::ops::Range;
 use std::path::Path;

 use anyhow::Result;
@ -36,82 +35,6 @@ fn test_project(repository: &git2::Repository) -> Result<projects::Project> {
    Ok(project)
 }

-#[test]
-fn test_filter_by_timestamp() -> Result<()> {
-    let repository = test_repository()?;
-    let project = test_project(&repository)?;
-    let gb_repo_path = tempdir()?.path().to_str().unwrap().to_string();
-    let storage = storage::Storage::from_path(tempdir()?.path().to_path_buf());
-    let project_store = projects::Storage::new(storage.clone());
-    project_store.add_project(&project)?;
-    let user_store = users::Storage::new(storage);
-    let gb_repo = gb_repository::Repository::open(
-        gb_repo_path,
-        project.id.clone(),
-        project_store.clone(),
-        user_store,
-    )?;
-
-    let index_path = tempdir()?.path().to_str().unwrap().to_string();
-
-    let session = gb_repo.get_or_create_current_session()?;
-    let writer = sessions::Writer::open(&gb_repo, &session)?;
-    writer.write_deltas(
-        Path::new("test.txt"),
-        &vec![
-            deltas::Delta {
-                operations: vec![deltas::Operation::Insert((0, "Hello".to_string()))],
-                timestamp_ms: 0,
-            },
-            deltas::Delta {
-                operations: vec![deltas::Operation::Insert((5, "World".to_string()))],
-                timestamp_ms: 1,
-            },
-            deltas::Delta {
-                operations: vec![deltas::Operation::Insert((5, " ".to_string()))],
-                timestamp_ms: 2,
-            },
-        ],
-    )?;
-    let session = gb_repo.flush()?;
-
-    let searcher = super::Deltas::at(index_path)?;
-
-    searcher.index_session(&gb_repo, &session.unwrap())?;
-
-    let search_result_from = searcher.search(&super::SearchQuery {
-        project_id: gb_repo.get_project_id().to_string(),
-        q: "test.txt".to_string(),
-        limit: 10,
-        range: Range { start: 2, end: 10 },
-        offset: None,
-    })?;
-    assert_eq!(search_result_from.total, 1);
-    assert_eq!(search_result_from.page[0].index, 2);
-
-    let search_result_to = searcher.search(&super::SearchQuery {
-        project_id: gb_repo.get_project_id().to_string(),
-        q: "test.txt".to_string(),
-        limit: 10,
-        range: Range { start: 0, end: 1 },
-        offset: None,
-    })?;
-    assert_eq!(search_result_to.total, 1);
-    assert_eq!(search_result_to.page[0].index, 0);
-
-    let search_result_from_to = searcher.search(&super::SearchQuery {
-        project_id: gb_repo.get_project_id().to_string(),
-        q: "test.txt".to_string(),
-        limit: 10,
-        range: Range { start: 1, end: 2 },
-        offset: None,
-    })?;
-    assert_eq!(search_result_from_to.total, 1);
-    assert_eq!(search_result_from_to.page[0].index, 1);
-
-    Ok(())
-}
-
 #[test]
 fn test_sorted_by_timestamp() -> Result<()> {
    let repository = test_repository()?;
@ -156,7 +79,6 @@ fn test_sorted_by_timestamp() -> Result<()> {
        project_id: gb_repo.get_project_id().to_string(),
        q: "hello world".to_string(),
        limit: 10,
-        range: Range { start: 0, end: 10 },
        offset: None,
    });
    assert!(search_result.is_ok());
@ -169,7 +91,7 @@ fn test_sorted_by_timestamp() -> Result<()> {
 }

 #[test]
-fn test_simple() -> Result<()> {
+fn search_by_diff() -> Result<()> {
    let repository = test_repository()?;
    let project = test_project(&repository)?;
    let gb_repo_path = tempdir()?.path().to_str().unwrap().to_string();
@ -209,79 +131,81 @@ fn test_simple() -> Result<()> {
    let write_result = searcher.index_session(&gb_repo, &session);
    assert!(write_result.is_ok());

-    let search_result1 = searcher.search(&super::SearchQuery {
+    let result = searcher.search(&super::SearchQuery {
        project_id: gb_repo.get_project_id().to_string(),
        q: "hello".to_string(),
        limit: 10,
        offset: None,
-        range: Range { start: 0, end: 10 },
-    });
-    assert!(search_result1.is_ok());
-    let search_result1 = search_result1.unwrap();
-    assert_eq!(search_result1.total, 1);
-    assert_eq!(search_result1.page[0].session_id, session.id);
-    assert_eq!(search_result1.page[0].project_id, gb_repo.get_project_id());
-    assert_eq!(search_result1.page[0].file_path, "test.txt");
-    assert_eq!(search_result1.page[0].index, 0);
+    })?;
+    assert_eq!(result.total, 1);
+    assert_eq!(result.page[0].session_id, session.id);
+    assert_eq!(result.page[0].project_id, gb_repo.get_project_id());
+    assert_eq!(result.page[0].file_path, "test.txt");
+    assert_eq!(result.page[0].index, 0);

-    let search_result2 = searcher.search(&super::SearchQuery {
-        project_id: gb_repo.get_project_id().to_string(),
-        q: "world".to_string(),
-        limit: 10,
-        offset: None,
-        range: Range { start: 0, end: 10 },
-    });
-    assert!(search_result2.is_ok());
-    let search_result2 = search_result2.unwrap().page;
-    assert_eq!(search_result2.len(), 1);
-    assert_eq!(search_result2[0].session_id, session.id);
-    assert_eq!(search_result2[0].project_id, gb_repo.get_project_id());
-    assert_eq!(search_result2[0].file_path, "test.txt");
-    assert_eq!(search_result2[0].index, 1);
+    Ok(())
+}

-    let search_result3 = searcher.search(&super::SearchQuery {
-        project_id: gb_repo.get_project_id().to_string(),
-        q: "hello world".to_string(),
-        limit: 10,
-        offset: None,
-        range: Range { start: 0, end: 10 },
-    });
-    assert!(search_result3.is_ok());
-    let search_result3 = search_result3.unwrap().page;
-    assert_eq!(search_result3.len(), 2);
-    assert_eq!(search_result3[0].project_id, gb_repo.get_project_id());
-    assert_eq!(search_result3[0].session_id, session.id);
-    assert_eq!(search_result3[0].file_path, "test.txt");
-    assert_eq!(search_result3[1].session_id, session.id);
-    assert_eq!(search_result3[1].project_id, gb_repo.get_project_id());
-    assert_eq!(search_result3[1].file_path, "test.txt");
+#[test]
+fn search_by_filename() -> Result<()> {
+    let repository = test_repository()?;
+    let project = test_project(&repository)?;
+    let gb_repo_path = tempdir()?.path().to_str().unwrap().to_string();
+    let storage = storage::Storage::from_path(tempdir()?.path().to_path_buf());
+    let project_store = projects::Storage::new(storage.clone());
+    project_store.add_project(&project)?;
+    let user_store = users::Storage::new(storage);
+    let gb_repo = gb_repository::Repository::open(
+        gb_repo_path,
+        project.id.clone(),
+        project_store.clone(),
+        user_store,
+    )?;

-    let search_by_filename_result = searcher.search(&super::SearchQuery {
-        project_id: gb_repo.get_project_id().to_string(),
-        q: "test.txt".to_string(),
-        limit: 10,
-        offset: None,
-        range: Range { start: 0, end: 10 },
-    });
-    assert!(search_by_filename_result.is_ok());
-    let search_by_filename_result = search_by_filename_result.unwrap().page;
-    assert_eq!(search_by_filename_result.len(), 2);
-    assert_eq!(search_by_filename_result[0].session_id, session.id);
-    assert_eq!(
-        search_by_filename_result[0].project_id,
-        gb_repo.get_project_id()
-    );
-    assert_eq!(search_by_filename_result[0].file_path, "test.txt");
+    let index_path = tempdir()?.path().to_str().unwrap().to_string();
+
+    let session = gb_repo.get_or_create_current_session()?;
+    let writer = sessions::Writer::open(&gb_repo, &session)?;
+    writer.write_deltas(
+        Path::new("test.txt"),
+        &vec![
+            deltas::Delta {
+                operations: vec![deltas::Operation::Insert((0, "Hello".to_string()))],
+                timestamp_ms: 0,
+            },
+            deltas::Delta {
+                operations: vec![deltas::Operation::Insert((5, "World".to_string()))],
+                timestamp_ms: 1,
+            },
+        ],
+    )?;
+    let session = gb_repo.flush()?;
+    let session = session.unwrap();
+
+    let searcher = super::Deltas::at(index_path).unwrap();
+
+    let write_result = searcher.index_session(&gb_repo, &session);
+    assert!(write_result.is_ok());
+
+    let found_result = searcher
+        .search(&super::SearchQuery {
+            project_id: gb_repo.get_project_id().to_string(),
+            q: "test.txt".to_string(),
+            limit: 10,
+            offset: None,
+        })?
+        .page;
+    assert_eq!(found_result.len(), 2);
+    assert_eq!(found_result[0].session_id, session.id);
+    assert_eq!(found_result[0].project_id, gb_repo.get_project_id());
+    assert_eq!(found_result[0].file_path, "test.txt");

    let not_found_result = searcher.search(&super::SearchQuery {
        project_id: "not found".to_string(),
        q: "test.txt".to_string(),
        limit: 10,
        offset: None,
-        range: Range { start: 0, end: 10 },
-    });
-    assert!(not_found_result.is_ok());
-    let not_found_result = not_found_result.unwrap();
+    })?;
    assert_eq!(not_found_result.total, 0);

    Ok(())
@ -334,7 +258,6 @@ fn test_delete_all() -> Result<()> {
        project_id: gb_repo.get_project_id().to_string(),
        q: "test.txt".to_string(),
        limit: 10,
-        range: Range { start: 2, end: 10 },
        offset: None,
    })?;
    assert_eq!(search_result_from.total, 0);