setup ngram search

This commit is contained in:
Nikita Galaiko 2023-05-25 16:33:33 +02:00
parent 165fe531c4
commit afa4c397d8
3 changed files with 188 additions and 245 deletions

View File

@ -162,8 +162,6 @@ async fn search(
query: &str,
limit: Option<usize>,
offset: Option<usize>,
timestamp_ms_gte: Option<u64>,
timestamp_ms_lt: Option<u64>,
) -> Result<search::SearchResults, Error> {
let app = handle.state::<app::App>();
@ -172,10 +170,6 @@ async fn search(
q: query.to_string(),
limit: limit.unwrap_or(100),
offset,
range: ops::Range {
start: timestamp_ms_gte.unwrap_or(0),
end: timestamp_ms_lt.unwrap_or(u64::MAX),
},
};
let results = app.search(&query).with_context(|| {

View File

@ -1,4 +1,3 @@
use std::ops::Range;
use std::{
fs,
path::{Path, PathBuf},
@ -10,10 +9,16 @@ use anyhow::{Context, Result};
use serde::Serialize;
use similar::{ChangeTag, TextDiff};
use tantivy::{collector, directory::MmapDirectory, schema, IndexWriter};
use tantivy::{query::QueryParser, Term};
use tantivy::{
query::{Occur, TermQuery},
schema::{TextFieldIndexing, TextOptions},
};
use tantivy::{schema::IndexRecordOption, tokenizer};
use crate::{deltas, gb_repository, sessions, storage};
const CURRENT_VERSION: u64 = 4; // should not decrease
const CURRENT_VERSION: u64 = 5; // should not decrease
#[derive(Clone)]
struct MetaStorage {
@ -91,6 +96,12 @@ impl Deltas {
.settings(index_settings)
.open_or_create(mmap_dir)?;
index.tokenizers().register(
"ngram2_3",
tokenizer::TextAnalyzer::from(tokenizer::NgramTokenizer::all_ngrams(2, 3))
.filter(tokenizer::LowerCaser),
);
let reader = index.reader()?;
let writer = index.writer_with_num_threads(1, WRITE_BUFFER_SIZE)?;
@ -102,8 +113,96 @@ impl Deltas {
})
}
pub fn search(&self, query: &SearchQuery) -> Result<SearchResults> {
search(&self.index, &self.reader, query)
/// Searches the index for deltas of project `q.project_id` whose diff or
/// file path matches the user query `q.q`.
///
/// Only documents whose `version` equals `CURRENT_VERSION` are considered.
/// Hits are ordered by the `timestamp_ms` field and paginated with
/// `q.limit` and `q.offset` (a missing offset means 0). `total` in the
/// returned value is the number of matching documents, independent of
/// pagination.
///
/// # Errors
///
/// Returns an error when `q.q` cannot be parsed, when the reader cannot be
/// reloaded, when the search or a document fetch fails, or when a hit lacks
/// one of the stored values needed to build a `SearchResult`.
///
/// # Panics
///
/// Panics if the index schema does not define one of the fields resolved at
/// the top of the function; that would mean the index was created with a
/// different schema than this code expects.
pub fn search(&self, q: &SearchQuery) -> Result<SearchResults> {
    // Resolve every field handle once instead of once per hit.
    let schema = self.index.schema();
    let version_field = schema
        .get_field("version")
        .expect("schema has a `version` field");
    let project_id_field = schema
        .get_field("project_id")
        .expect("schema has a `project_id` field");
    let session_id_field = schema
        .get_field("session_id")
        .expect("schema has a `session_id` field");
    let index_field = schema
        .get_field("index")
        .expect("schema has an `index` field");
    let diff_field = schema
        .get_field("diff")
        .expect("schema has a `diff` field");
    let file_path_field = schema
        .get_field("file_path")
        .expect("schema has a `file_path` field");
    let timestamp_ms_field = schema
        .get_field("timestamp_ms")
        .expect("schema has a `timestamp_ms` field");

    // Version and project are matched as exact terms, so the values never
    // pass through the query parser; only the user text `q.q` is parsed.
    let version_term_query = Box::new(TermQuery::new(
        Term::from_field_u64(version_field, CURRENT_VERSION),
        IndexRecordOption::Basic,
    ));
    let project_id_term_query = Box::new(TermQuery::new(
        Term::from_field_text(project_id_field, q.project_id.as_str()),
        IndexRecordOption::Basic,
    ));
    let diff_or_file_path_query = Box::new(
        QueryParser::for_index(&self.index, vec![diff_field, file_path_field])
            .parse_query(&q.q)?,
    );

    // All three clauses must match.
    let query = tantivy::query::BooleanQuery::new(vec![
        (Occur::Must, version_term_query),
        (Occur::Must, project_id_term_query),
        (Occur::Must, diff_or_file_path_query),
    ]);

    self.reader.reload()?;
    let searcher = self.reader.searcher();

    // One pass over the index collects both the requested page and the
    // total number of matches.
    let mut collectors = collector::MultiCollector::new();
    let top_docs_handle = collectors.add_collector(
        collector::TopDocs::with_limit(q.limit)
            .and_offset(q.offset.unwrap_or(0))
            .order_by_u64_field(timestamp_ms_field),
    );
    let count_handle = collectors.add_collector(collector::Count);

    let snippet_generator = tantivy::SnippetGenerator::create(&searcher, &query, diff_field)?;

    let mut result = searcher.search(&query, &collectors)?;
    let count = count_handle.extract(&mut result);
    let top_docs = top_docs_handle.extract(&mut result);

    let page = top_docs
        .iter()
        .map(|(_score, doc_address)| -> Result<SearchResult> {
            let retrieved_doc = searcher.doc(*doc_address)?;

            // A hit without these stored values is reported as an error
            // rather than aborting the process.
            let project_id = retrieved_doc
                .get_first(project_id_field)
                .and_then(|value| value.as_text())
                .context("search hit has no stored text value for `project_id`")?;
            let file_path = retrieved_doc
                .get_first(file_path_field)
                .and_then(|value| value.as_text())
                .context("search hit has no stored text value for `file_path`")?;
            let session_id = retrieved_doc
                .get_first(session_id_field)
                .and_then(|value| value.as_text())
                .context("search hit has no stored text value for `session_id`")?;
            let index = retrieved_doc
                .get_first(index_field)
                .and_then(|value| value.as_u64())
                .context("search hit has no stored u64 value for `index`")?;

            // Keep only the highlighted pieces of the snippet fragment.
            let snippet = snippet_generator.snippet_from_doc(&retrieved_doc);
            let fragment = snippet.fragment();
            let highlighted: Vec<String> = snippet
                .highlighted()
                .iter()
                .map(|range| fragment[range.start..range.end].to_string())
                .collect();

            Ok(SearchResult {
                project_id: project_id.to_string(),
                file_path: file_path.to_string(),
                session_id: session_id.to_string(),
                highlighted,
                index,
            })
        })
        .collect::<Result<Vec<SearchResult>>>()?;

    Ok(SearchResults { page, total: count })
}
pub fn delete_all_data(&self) -> Result<()> {
@ -158,15 +257,29 @@ impl Deltas {
/// Builds the tantivy schema describing the documents stored in the deltas index.
///
/// NOTE(review): as listed here, `version`, `project_id`, `session_id`,
/// `index`, `file_path` and `diff` are each added twice with different
/// options. This looks like the previous and the new field definitions shown
/// side by side — confirm which set is meant to be live before editing.
fn build_schema() -> schema::Schema {
let mut schema_builder = schema::Schema::builder();
// First set of definitions: text fields use the predefined `schema::TEXT` options.
schema_builder.add_u64_field("version", schema::INDEXED | schema::FAST);
schema_builder.add_text_field("project_id", schema::TEXT | schema::STORED | schema::FAST);
schema_builder.add_text_field("session_id", schema::STORED);
schema_builder.add_u64_field("index", schema::STORED);
schema_builder.add_text_field("file_path", schema::TEXT | schema::STORED | schema::FAST);
schema_builder.add_text_field("diff", schema::TEXT | schema::STORED);
schema_builder.add_bool_field("is_addition", schema::FAST);
schema_builder.add_bool_field("is_deletion", schema::FAST);
// Second set of definitions: explicit tokenizer choices per field.
schema_builder.add_u64_field("version", schema::INDEXED);
// `timestamp_ms` is a fast field; the search code orders hits by it.
schema_builder.add_u64_field("timestamp_ms", schema::INDEXED | schema::FAST);
schema_builder.add_u64_field("index", schema::STORED);
// Identifier fields are stored and indexed with the "raw" tokenizer
// (presumably tantivy's built-in analyzer that keeps the value as one token — confirm).
let id_options = TextOptions::default()
.set_indexing_options(TextFieldIndexing::default().set_tokenizer("raw"))
.set_stored();
schema_builder.add_text_field("project_id", id_options.clone());
schema_builder.add_text_field("session_id", id_options);
// Searchable text fields are stored and indexed with the "ngram2_3" tokenizer,
// which must be registered on the index under that name before use.
let text_options = TextOptions::default()
.set_indexing_options(
TextFieldIndexing::default()
.set_tokenizer("ngram2_3")
.set_index_option(schema::IndexRecordOption::WithFreqsAndPositions),
)
.set_stored();
schema_builder.add_text_field("file_path", text_options.clone());
schema_builder.add_text_field("diff", text_options);
schema_builder.build()
}
@ -300,91 +413,4 @@ pub struct SearchQuery {
pub project_id: String,
pub limit: usize,
pub offset: Option<usize>,
pub range: Range<u64>,
}
/// Runs the search described by `q` against `index`, reading through `reader`.
///
/// The final query text is assembled from the current index version, the
/// project id, the half-open timestamp window `q.range` (start included, end
/// excluded) and the user query `q.q`. It is parsed with `diff` and
/// `file_path` as the default fields. Hits are ordered by the `timestamp_ms`
/// field and paginated by `q.limit` and `q.offset`; `total` counts every
/// match regardless of pagination.
///
/// NOTE(review): `q.project_id` and `q.q` are spliced into the query text
/// unescaped — confirm callers cannot smuggle query syntax through them.
pub fn search(
    index: &tantivy::Index,
    reader: &tantivy::IndexReader,
    q: &SearchQuery,
) -> Result<SearchResults> {
    // Looks a field up by name in the index schema; panics if it is missing.
    let field = |name: &str| index.schema().get_field(name).unwrap();

    let parser =
        tantivy::query::QueryParser::for_index(index, vec![field("diff"), field("file_path")]);
    let raw_query = format!(
        "version:\"{}\" AND project_id:\"{}\" AND timestamp_ms:[{} TO {}}} AND ({})",
        CURRENT_VERSION, q.project_id, q.range.start, q.range.end, q.q,
    );
    let query = parser.parse_query(raw_query.as_str())?;

    reader.reload()?;
    let searcher = reader.searcher();

    // Collect the requested page and the overall match count in one pass.
    let mut collectors = collector::MultiCollector::new();
    let page_collector = collector::TopDocs::with_limit(q.limit)
        .and_offset(q.offset.unwrap_or(0))
        .order_by_u64_field(field("timestamp_ms"));
    let top_docs_handle = collectors.add_collector(page_collector);
    let count_handle = collectors.add_collector(collector::Count);

    let snippet_generator =
        tantivy::SnippetGenerator::create(&searcher, &*query, field("diff"))?;

    let mut fruits = searcher.search(&query, &collectors)?;
    let total = count_handle.extract(&mut fruits);
    let top_docs = top_docs_handle.extract(&mut fruits);

    let mut page: Vec<SearchResult> = Vec::with_capacity(top_docs.len());
    for (_score, doc_address) in top_docs.iter() {
        let doc = searcher.doc(*doc_address)?;

        let project_id = doc
            .get_first(field("project_id"))
            .unwrap()
            .as_text()
            .unwrap();
        let file_path = doc
            .get_first(field("file_path"))
            .unwrap()
            .as_text()
            .unwrap();
        let session_id = doc
            .get_first(field("session_id"))
            .unwrap()
            .as_text()
            .unwrap();
        let delta_index = doc.get_first(field("index")).unwrap().as_u64().unwrap();

        // Extract just the highlighted pieces of the snippet fragment.
        let snippet = snippet_generator.snippet_from_doc(&doc);
        let fragment = snippet.fragment();
        let mut highlighted: Vec<String> = Vec::new();
        for range in snippet.highlighted().iter() {
            highlighted.push(fragment[range.start..range.end].to_string());
        }

        page.push(SearchResult {
            project_id: project_id.to_string(),
            file_path: file_path.to_string(),
            session_id: session_id.to_string(),
            highlighted,
            index: delta_index,
        });
    }

    Ok(SearchResults { page, total })
}

View File

@ -1,4 +1,3 @@
use core::ops::Range;
use std::path::Path;
use anyhow::Result;
@ -36,82 +35,6 @@ fn test_project(repository: &git2::Repository) -> Result<projects::Project> {
Ok(project)
}
/// Verifies that the `range` of a `SearchQuery` narrows hits by delta timestamp.
///
/// Three deltas with timestamps 0, 1 and 2 are written to `test.txt`. Each
/// queried window is expected to match exactly one of them, identified by
/// its delta index.
#[test]
fn test_filter_by_timestamp() -> Result<()> {
    let repository = test_repository()?;
    let project = test_project(&repository)?;
    let gb_repo_path = tempdir()?.path().to_str().unwrap().to_string();

    let storage = storage::Storage::from_path(tempdir()?.path().to_path_buf());
    let project_store = projects::Storage::new(storage.clone());
    project_store.add_project(&project)?;
    let user_store = users::Storage::new(storage);

    let gb_repo = gb_repository::Repository::open(
        gb_repo_path,
        project.id.clone(),
        project_store.clone(),
        user_store,
    )?;

    let index_path = tempdir()?.path().to_str().unwrap().to_string();

    // Write three consecutive inserts, one per millisecond.
    let session = gb_repo.get_or_create_current_session()?;
    let writer = sessions::Writer::open(&gb_repo, &session)?;
    let written = vec![
        deltas::Delta {
            operations: vec![deltas::Operation::Insert((0, "Hello".to_string()))],
            timestamp_ms: 0,
        },
        deltas::Delta {
            operations: vec![deltas::Operation::Insert((5, "World".to_string()))],
            timestamp_ms: 1,
        },
        deltas::Delta {
            operations: vec![deltas::Operation::Insert((5, " ".to_string()))],
            timestamp_ms: 2,
        },
    ];
    writer.write_deltas(Path::new("test.txt"), &written)?;
    let session = gb_repo.flush()?;

    let searcher = super::Deltas::at(index_path)?;
    searcher.index_session(&gb_repo, &session.unwrap())?;

    // (window, index of the only delta expected inside it)
    let cases = vec![
        (Range { start: 2, end: 10 }, 2_u64),
        (Range { start: 0, end: 1 }, 0_u64),
        (Range { start: 1, end: 2 }, 1_u64),
    ];
    for (range, expected_index) in cases {
        let hits = searcher.search(&super::SearchQuery {
            project_id: gb_repo.get_project_id().to_string(),
            q: "test.txt".to_string(),
            limit: 10,
            range,
            offset: None,
        })?;
        assert_eq!(hits.total, 1);
        assert_eq!(hits.page[0].index, expected_index);
    }

    Ok(())
}
#[test]
fn test_sorted_by_timestamp() -> Result<()> {
let repository = test_repository()?;
@ -156,7 +79,6 @@ fn test_sorted_by_timestamp() -> Result<()> {
project_id: gb_repo.get_project_id().to_string(),
q: "hello world".to_string(),
limit: 10,
range: Range { start: 0, end: 10 },
offset: None,
});
assert!(search_result.is_ok());
@ -169,7 +91,7 @@ fn test_sorted_by_timestamp() -> Result<()> {
}
#[test]
fn test_simple() -> Result<()> {
fn search_by_diff() -> Result<()> {
let repository = test_repository()?;
let project = test_project(&repository)?;
let gb_repo_path = tempdir()?.path().to_str().unwrap().to_string();
@ -209,79 +131,81 @@ fn test_simple() -> Result<()> {
let write_result = searcher.index_session(&gb_repo, &session);
assert!(write_result.is_ok());
let search_result1 = searcher.search(&super::SearchQuery {
let result = searcher.search(&super::SearchQuery {
project_id: gb_repo.get_project_id().to_string(),
q: "hello".to_string(),
limit: 10,
offset: None,
range: Range { start: 0, end: 10 },
});
assert!(search_result1.is_ok());
let search_result1 = search_result1.unwrap();
assert_eq!(search_result1.total, 1);
assert_eq!(search_result1.page[0].session_id, session.id);
assert_eq!(search_result1.page[0].project_id, gb_repo.get_project_id());
assert_eq!(search_result1.page[0].file_path, "test.txt");
assert_eq!(search_result1.page[0].index, 0);
})?;
assert_eq!(result.total, 1);
assert_eq!(result.page[0].session_id, session.id);
assert_eq!(result.page[0].project_id, gb_repo.get_project_id());
assert_eq!(result.page[0].file_path, "test.txt");
assert_eq!(result.page[0].index, 0);
let search_result2 = searcher.search(&super::SearchQuery {
project_id: gb_repo.get_project_id().to_string(),
q: "world".to_string(),
limit: 10,
offset: None,
range: Range { start: 0, end: 10 },
});
assert!(search_result2.is_ok());
let search_result2 = search_result2.unwrap().page;
assert_eq!(search_result2.len(), 1);
assert_eq!(search_result2[0].session_id, session.id);
assert_eq!(search_result2[0].project_id, gb_repo.get_project_id());
assert_eq!(search_result2[0].file_path, "test.txt");
assert_eq!(search_result2[0].index, 1);
Ok(())
}
let search_result3 = searcher.search(&super::SearchQuery {
project_id: gb_repo.get_project_id().to_string(),
q: "hello world".to_string(),
limit: 10,
offset: None,
range: Range { start: 0, end: 10 },
});
assert!(search_result3.is_ok());
let search_result3 = search_result3.unwrap().page;
assert_eq!(search_result3.len(), 2);
assert_eq!(search_result3[0].project_id, gb_repo.get_project_id());
assert_eq!(search_result3[0].session_id, session.id);
assert_eq!(search_result3[0].file_path, "test.txt");
assert_eq!(search_result3[1].session_id, session.id);
assert_eq!(search_result3[1].project_id, gb_repo.get_project_id());
assert_eq!(search_result3[1].file_path, "test.txt");
#[test]
fn search_by_filename() -> Result<()> {
let repository = test_repository()?;
let project = test_project(&repository)?;
let gb_repo_path = tempdir()?.path().to_str().unwrap().to_string();
let storage = storage::Storage::from_path(tempdir()?.path().to_path_buf());
let project_store = projects::Storage::new(storage.clone());
project_store.add_project(&project)?;
let user_store = users::Storage::new(storage);
let gb_repo = gb_repository::Repository::open(
gb_repo_path,
project.id.clone(),
project_store.clone(),
user_store,
)?;
let search_by_filename_result = searcher.search(&super::SearchQuery {
project_id: gb_repo.get_project_id().to_string(),
q: "test.txt".to_string(),
limit: 10,
offset: None,
range: Range { start: 0, end: 10 },
});
assert!(search_by_filename_result.is_ok());
let search_by_filename_result = search_by_filename_result.unwrap().page;
assert_eq!(search_by_filename_result.len(), 2);
assert_eq!(search_by_filename_result[0].session_id, session.id);
assert_eq!(
search_by_filename_result[0].project_id,
gb_repo.get_project_id()
);
assert_eq!(search_by_filename_result[0].file_path, "test.txt");
let index_path = tempdir()?.path().to_str().unwrap().to_string();
let session = gb_repo.get_or_create_current_session()?;
let writer = sessions::Writer::open(&gb_repo, &session)?;
writer.write_deltas(
Path::new("test.txt"),
&vec![
deltas::Delta {
operations: vec![deltas::Operation::Insert((0, "Hello".to_string()))],
timestamp_ms: 0,
},
deltas::Delta {
operations: vec![deltas::Operation::Insert((5, "World".to_string()))],
timestamp_ms: 1,
},
],
)?;
let session = gb_repo.flush()?;
let session = session.unwrap();
let searcher = super::Deltas::at(index_path).unwrap();
let write_result = searcher.index_session(&gb_repo, &session);
assert!(write_result.is_ok());
let found_result = searcher
.search(&super::SearchQuery {
project_id: gb_repo.get_project_id().to_string(),
q: "test.txt".to_string(),
limit: 10,
offset: None,
})?
.page;
assert_eq!(found_result.len(), 2);
assert_eq!(found_result[0].session_id, session.id);
assert_eq!(found_result[0].project_id, gb_repo.get_project_id());
assert_eq!(found_result[0].file_path, "test.txt");
let not_found_result = searcher.search(&super::SearchQuery {
project_id: "not found".to_string(),
q: "test.txt".to_string(),
limit: 10,
offset: None,
range: Range { start: 0, end: 10 },
});
assert!(not_found_result.is_ok());
let not_found_result = not_found_result.unwrap();
})?;
assert_eq!(not_found_result.total, 0);
Ok(())
@ -334,7 +258,6 @@ fn test_delete_all() -> Result<()> {
project_id: gb_repo.get_project_id().to_string(),
q: "test.txt".to_string(),
limit: 10,
range: Range { start: 2, end: 10 },
offset: None,
})?;
assert_eq!(search_result_from.total, 0);