mirror of
https://github.com/gitbutlerapp/gitbutler.git
synced 2024-12-23 01:22:12 +03:00
setup ngram search
This commit is contained in:
parent
165fe531c4
commit
afa4c397d8
@ -162,8 +162,6 @@ async fn search(
|
||||
query: &str,
|
||||
limit: Option<usize>,
|
||||
offset: Option<usize>,
|
||||
timestamp_ms_gte: Option<u64>,
|
||||
timestamp_ms_lt: Option<u64>,
|
||||
) -> Result<search::SearchResults, Error> {
|
||||
let app = handle.state::<app::App>();
|
||||
|
||||
@ -172,10 +170,6 @@ async fn search(
|
||||
q: query.to_string(),
|
||||
limit: limit.unwrap_or(100),
|
||||
offset,
|
||||
range: ops::Range {
|
||||
start: timestamp_ms_gte.unwrap_or(0),
|
||||
end: timestamp_ms_lt.unwrap_or(u64::MAX),
|
||||
},
|
||||
};
|
||||
|
||||
let results = app.search(&query).with_context(|| {
|
||||
|
@ -1,4 +1,3 @@
|
||||
use std::ops::Range;
|
||||
use std::{
|
||||
fs,
|
||||
path::{Path, PathBuf},
|
||||
@ -10,10 +9,16 @@ use anyhow::{Context, Result};
|
||||
use serde::Serialize;
|
||||
use similar::{ChangeTag, TextDiff};
|
||||
use tantivy::{collector, directory::MmapDirectory, schema, IndexWriter};
|
||||
use tantivy::{query::QueryParser, Term};
|
||||
use tantivy::{
|
||||
query::{Occur, TermQuery},
|
||||
schema::{TextFieldIndexing, TextOptions},
|
||||
};
|
||||
use tantivy::{schema::IndexRecordOption, tokenizer};
|
||||
|
||||
use crate::{deltas, gb_repository, sessions, storage};
|
||||
|
||||
const CURRENT_VERSION: u64 = 4; // should not decrease
|
||||
const CURRENT_VERSION: u64 = 5; // should not decrease
|
||||
|
||||
#[derive(Clone)]
|
||||
struct MetaStorage {
|
||||
@ -91,6 +96,12 @@ impl Deltas {
|
||||
.settings(index_settings)
|
||||
.open_or_create(mmap_dir)?;
|
||||
|
||||
index.tokenizers().register(
|
||||
"ngram2_3",
|
||||
tokenizer::TextAnalyzer::from(tokenizer::NgramTokenizer::all_ngrams(2, 3))
|
||||
.filter(tokenizer::LowerCaser),
|
||||
);
|
||||
|
||||
let reader = index.reader()?;
|
||||
let writer = index.writer_with_num_threads(1, WRITE_BUFFER_SIZE)?;
|
||||
|
||||
@ -102,8 +113,96 @@ impl Deltas {
|
||||
})
|
||||
}
|
||||
|
||||
pub fn search(&self, query: &SearchQuery) -> Result<SearchResults> {
|
||||
search(&self.index, &self.reader, query)
|
||||
pub fn search(&self, q: &SearchQuery) -> Result<SearchResults> {
|
||||
let version_field = self.index.schema().get_field("version").unwrap();
|
||||
let project_id_field = self.index.schema().get_field("project_id").unwrap();
|
||||
let diff_field = self.index.schema().get_field("diff").unwrap();
|
||||
let file_path_field = self.index.schema().get_field("file_path").unwrap();
|
||||
let timestamp_ns_field = self.index.schema().get_field("timestamp_ms").unwrap();
|
||||
|
||||
let version_term_query = Box::new(TermQuery::new(
|
||||
Term::from_field_u64(version_field, CURRENT_VERSION),
|
||||
IndexRecordOption::Basic,
|
||||
));
|
||||
let project_id_term_query = Box::new(TermQuery::new(
|
||||
Term::from_field_text(project_id_field, q.project_id.as_str()),
|
||||
IndexRecordOption::Basic,
|
||||
));
|
||||
let diff_or_file_path_query = Box::new(
|
||||
QueryParser::for_index(&self.index, vec![diff_field, file_path_field])
|
||||
.parse_query(&q.q)?,
|
||||
);
|
||||
|
||||
let query = tantivy::query::BooleanQuery::new(vec![
|
||||
(Occur::Must, version_term_query),
|
||||
(Occur::Must, project_id_term_query),
|
||||
(Occur::Must, diff_or_file_path_query),
|
||||
]);
|
||||
|
||||
self.reader.reload()?;
|
||||
let searcher = self.reader.searcher();
|
||||
|
||||
let mut collectors = collector::MultiCollector::new();
|
||||
let top_docs_handle = collectors.add_collector(
|
||||
collector::TopDocs::with_limit(q.limit)
|
||||
.and_offset(q.offset.unwrap_or(0))
|
||||
.order_by_u64_field(timestamp_ns_field),
|
||||
);
|
||||
let count_handle = collectors.add_collector(collector::Count);
|
||||
|
||||
let snippet_generator = tantivy::SnippetGenerator::create(
|
||||
&searcher,
|
||||
&query,
|
||||
self.index.schema().get_field("diff").unwrap(),
|
||||
)?;
|
||||
|
||||
let mut result = searcher.search(&query, &collectors)?;
|
||||
let count = count_handle.extract(&mut result);
|
||||
let top_docs = top_docs_handle.extract(&mut result);
|
||||
|
||||
let page = top_docs
|
||||
.iter()
|
||||
.map(|(_score, doc_address)| {
|
||||
let retrieved_doc = searcher.doc(*doc_address)?;
|
||||
|
||||
let project_id = retrieved_doc
|
||||
.get_first(self.index.schema().get_field("project_id").unwrap())
|
||||
.unwrap()
|
||||
.as_text()
|
||||
.unwrap();
|
||||
let file_path = retrieved_doc
|
||||
.get_first(self.index.schema().get_field("file_path").unwrap())
|
||||
.unwrap()
|
||||
.as_text()
|
||||
.unwrap();
|
||||
let session_id = retrieved_doc
|
||||
.get_first(self.index.schema().get_field("session_id").unwrap())
|
||||
.unwrap()
|
||||
.as_text()
|
||||
.unwrap();
|
||||
let index = retrieved_doc
|
||||
.get_first(self.index.schema().get_field("index").unwrap())
|
||||
.unwrap()
|
||||
.as_u64()
|
||||
.unwrap();
|
||||
let snippet = snippet_generator.snippet_from_doc(&retrieved_doc);
|
||||
let fragment = snippet.fragment();
|
||||
let highlighted: Vec<String> = snippet
|
||||
.highlighted()
|
||||
.iter()
|
||||
.map(|range| fragment[range.start..range.end].to_string())
|
||||
.collect();
|
||||
Ok(SearchResult {
|
||||
project_id: project_id.to_string(),
|
||||
file_path: file_path.to_string(),
|
||||
session_id: session_id.to_string(),
|
||||
highlighted,
|
||||
index,
|
||||
})
|
||||
})
|
||||
.collect::<Result<Vec<SearchResult>>>()?;
|
||||
|
||||
Ok(SearchResults { page, total: count })
|
||||
}
|
||||
|
||||
pub fn delete_all_data(&self) -> Result<()> {
|
||||
@ -158,15 +257,29 @@ impl Deltas {
|
||||
|
||||
fn build_schema() -> schema::Schema {
|
||||
let mut schema_builder = schema::Schema::builder();
|
||||
schema_builder.add_u64_field("version", schema::INDEXED | schema::FAST);
|
||||
schema_builder.add_text_field("project_id", schema::TEXT | schema::STORED | schema::FAST);
|
||||
schema_builder.add_text_field("session_id", schema::STORED);
|
||||
schema_builder.add_u64_field("index", schema::STORED);
|
||||
schema_builder.add_text_field("file_path", schema::TEXT | schema::STORED | schema::FAST);
|
||||
schema_builder.add_text_field("diff", schema::TEXT | schema::STORED);
|
||||
schema_builder.add_bool_field("is_addition", schema::FAST);
|
||||
schema_builder.add_bool_field("is_deletion", schema::FAST);
|
||||
|
||||
schema_builder.add_u64_field("version", schema::INDEXED);
|
||||
schema_builder.add_u64_field("timestamp_ms", schema::INDEXED | schema::FAST);
|
||||
schema_builder.add_u64_field("index", schema::STORED);
|
||||
|
||||
let id_options = TextOptions::default()
|
||||
.set_indexing_options(TextFieldIndexing::default().set_tokenizer("raw"))
|
||||
.set_stored();
|
||||
|
||||
schema_builder.add_text_field("project_id", id_options.clone());
|
||||
schema_builder.add_text_field("session_id", id_options);
|
||||
|
||||
let text_options = TextOptions::default()
|
||||
.set_indexing_options(
|
||||
TextFieldIndexing::default()
|
||||
.set_tokenizer("ngram2_3")
|
||||
.set_index_option(schema::IndexRecordOption::WithFreqsAndPositions),
|
||||
)
|
||||
.set_stored();
|
||||
|
||||
schema_builder.add_text_field("file_path", text_options.clone());
|
||||
schema_builder.add_text_field("diff", text_options);
|
||||
|
||||
schema_builder.build()
|
||||
}
|
||||
|
||||
@ -300,91 +413,4 @@ pub struct SearchQuery {
|
||||
pub project_id: String,
|
||||
pub limit: usize,
|
||||
pub offset: Option<usize>,
|
||||
pub range: Range<u64>,
|
||||
}
|
||||
|
||||
pub fn search(
|
||||
index: &tantivy::Index,
|
||||
reader: &tantivy::IndexReader,
|
||||
q: &SearchQuery,
|
||||
) -> Result<SearchResults> {
|
||||
let query = tantivy::query::QueryParser::for_index(
|
||||
index,
|
||||
vec![
|
||||
index.schema().get_field("diff").unwrap(),
|
||||
index.schema().get_field("file_path").unwrap(),
|
||||
],
|
||||
)
|
||||
.parse_query(
|
||||
format!(
|
||||
"version:\"{}\" AND project_id:\"{}\" AND timestamp_ms:[{} TO {}}} AND ({})",
|
||||
CURRENT_VERSION, q.project_id, q.range.start, q.range.end, q.q,
|
||||
)
|
||||
.as_str(),
|
||||
)?;
|
||||
|
||||
reader.reload()?;
|
||||
let searcher = reader.searcher();
|
||||
|
||||
let mut collectors = collector::MultiCollector::new();
|
||||
let top_docs_handle = collectors.add_collector(
|
||||
collector::TopDocs::with_limit(q.limit)
|
||||
.and_offset(q.offset.unwrap_or(0))
|
||||
.order_by_u64_field(index.schema().get_field("timestamp_ms").unwrap()),
|
||||
);
|
||||
let count_handle = collectors.add_collector(collector::Count);
|
||||
|
||||
let snippet_generator = tantivy::SnippetGenerator::create(
|
||||
&searcher,
|
||||
&*query,
|
||||
index.schema().get_field("diff").unwrap(),
|
||||
)?;
|
||||
|
||||
let mut result = searcher.search(&query, &collectors)?;
|
||||
let count = count_handle.extract(&mut result);
|
||||
let top_docs = top_docs_handle.extract(&mut result);
|
||||
|
||||
let page = top_docs
|
||||
.iter()
|
||||
.map(|(_score, doc_address)| {
|
||||
let retrieved_doc = searcher.doc(*doc_address)?;
|
||||
|
||||
let project_id = retrieved_doc
|
||||
.get_first(index.schema().get_field("project_id").unwrap())
|
||||
.unwrap()
|
||||
.as_text()
|
||||
.unwrap();
|
||||
let file_path = retrieved_doc
|
||||
.get_first(index.schema().get_field("file_path").unwrap())
|
||||
.unwrap()
|
||||
.as_text()
|
||||
.unwrap();
|
||||
let session_id = retrieved_doc
|
||||
.get_first(index.schema().get_field("session_id").unwrap())
|
||||
.unwrap()
|
||||
.as_text()
|
||||
.unwrap();
|
||||
let index = retrieved_doc
|
||||
.get_first(index.schema().get_field("index").unwrap())
|
||||
.unwrap()
|
||||
.as_u64()
|
||||
.unwrap();
|
||||
let snippet = snippet_generator.snippet_from_doc(&retrieved_doc);
|
||||
let fragment = snippet.fragment();
|
||||
let highlighted: Vec<String> = snippet
|
||||
.highlighted()
|
||||
.iter()
|
||||
.map(|range| fragment[range.start..range.end].to_string())
|
||||
.collect();
|
||||
Ok(SearchResult {
|
||||
project_id: project_id.to_string(),
|
||||
file_path: file_path.to_string(),
|
||||
session_id: session_id.to_string(),
|
||||
highlighted,
|
||||
index,
|
||||
})
|
||||
})
|
||||
.collect::<Result<Vec<SearchResult>>>()?;
|
||||
|
||||
Ok(SearchResults { page, total: count })
|
||||
}
|
||||
|
@ -1,4 +1,3 @@
|
||||
use core::ops::Range;
|
||||
use std::path::Path;
|
||||
|
||||
use anyhow::Result;
|
||||
@ -36,82 +35,6 @@ fn test_project(repository: &git2::Repository) -> Result<projects::Project> {
|
||||
Ok(project)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_filter_by_timestamp() -> Result<()> {
|
||||
let repository = test_repository()?;
|
||||
let project = test_project(&repository)?;
|
||||
let gb_repo_path = tempdir()?.path().to_str().unwrap().to_string();
|
||||
let storage = storage::Storage::from_path(tempdir()?.path().to_path_buf());
|
||||
let project_store = projects::Storage::new(storage.clone());
|
||||
project_store.add_project(&project)?;
|
||||
let user_store = users::Storage::new(storage);
|
||||
let gb_repo = gb_repository::Repository::open(
|
||||
gb_repo_path,
|
||||
project.id.clone(),
|
||||
project_store.clone(),
|
||||
user_store,
|
||||
)?;
|
||||
|
||||
let index_path = tempdir()?.path().to_str().unwrap().to_string();
|
||||
|
||||
let session = gb_repo.get_or_create_current_session()?;
|
||||
let writer = sessions::Writer::open(&gb_repo, &session)?;
|
||||
writer.write_deltas(
|
||||
Path::new("test.txt"),
|
||||
&vec![
|
||||
deltas::Delta {
|
||||
operations: vec![deltas::Operation::Insert((0, "Hello".to_string()))],
|
||||
timestamp_ms: 0,
|
||||
},
|
||||
deltas::Delta {
|
||||
operations: vec![deltas::Operation::Insert((5, "World".to_string()))],
|
||||
timestamp_ms: 1,
|
||||
},
|
||||
deltas::Delta {
|
||||
operations: vec![deltas::Operation::Insert((5, " ".to_string()))],
|
||||
timestamp_ms: 2,
|
||||
},
|
||||
],
|
||||
)?;
|
||||
let session = gb_repo.flush()?;
|
||||
|
||||
let searcher = super::Deltas::at(index_path)?;
|
||||
|
||||
searcher.index_session(&gb_repo, &session.unwrap())?;
|
||||
|
||||
let search_result_from = searcher.search(&super::SearchQuery {
|
||||
project_id: gb_repo.get_project_id().to_string(),
|
||||
q: "test.txt".to_string(),
|
||||
limit: 10,
|
||||
range: Range { start: 2, end: 10 },
|
||||
offset: None,
|
||||
})?;
|
||||
assert_eq!(search_result_from.total, 1);
|
||||
assert_eq!(search_result_from.page[0].index, 2);
|
||||
|
||||
let search_result_to = searcher.search(&super::SearchQuery {
|
||||
project_id: gb_repo.get_project_id().to_string(),
|
||||
q: "test.txt".to_string(),
|
||||
limit: 10,
|
||||
range: Range { start: 0, end: 1 },
|
||||
offset: None,
|
||||
})?;
|
||||
assert_eq!(search_result_to.total, 1);
|
||||
assert_eq!(search_result_to.page[0].index, 0);
|
||||
|
||||
let search_result_from_to = searcher.search(&super::SearchQuery {
|
||||
project_id: gb_repo.get_project_id().to_string(),
|
||||
q: "test.txt".to_string(),
|
||||
limit: 10,
|
||||
range: Range { start: 1, end: 2 },
|
||||
offset: None,
|
||||
})?;
|
||||
assert_eq!(search_result_from_to.total, 1);
|
||||
assert_eq!(search_result_from_to.page[0].index, 1);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_sorted_by_timestamp() -> Result<()> {
|
||||
let repository = test_repository()?;
|
||||
@ -156,7 +79,6 @@ fn test_sorted_by_timestamp() -> Result<()> {
|
||||
project_id: gb_repo.get_project_id().to_string(),
|
||||
q: "hello world".to_string(),
|
||||
limit: 10,
|
||||
range: Range { start: 0, end: 10 },
|
||||
offset: None,
|
||||
});
|
||||
assert!(search_result.is_ok());
|
||||
@ -169,7 +91,7 @@ fn test_sorted_by_timestamp() -> Result<()> {
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_simple() -> Result<()> {
|
||||
fn search_by_diff() -> Result<()> {
|
||||
let repository = test_repository()?;
|
||||
let project = test_project(&repository)?;
|
||||
let gb_repo_path = tempdir()?.path().to_str().unwrap().to_string();
|
||||
@ -209,79 +131,81 @@ fn test_simple() -> Result<()> {
|
||||
let write_result = searcher.index_session(&gb_repo, &session);
|
||||
assert!(write_result.is_ok());
|
||||
|
||||
let search_result1 = searcher.search(&super::SearchQuery {
|
||||
let result = searcher.search(&super::SearchQuery {
|
||||
project_id: gb_repo.get_project_id().to_string(),
|
||||
q: "hello".to_string(),
|
||||
limit: 10,
|
||||
offset: None,
|
||||
range: Range { start: 0, end: 10 },
|
||||
});
|
||||
assert!(search_result1.is_ok());
|
||||
let search_result1 = search_result1.unwrap();
|
||||
assert_eq!(search_result1.total, 1);
|
||||
assert_eq!(search_result1.page[0].session_id, session.id);
|
||||
assert_eq!(search_result1.page[0].project_id, gb_repo.get_project_id());
|
||||
assert_eq!(search_result1.page[0].file_path, "test.txt");
|
||||
assert_eq!(search_result1.page[0].index, 0);
|
||||
})?;
|
||||
assert_eq!(result.total, 1);
|
||||
assert_eq!(result.page[0].session_id, session.id);
|
||||
assert_eq!(result.page[0].project_id, gb_repo.get_project_id());
|
||||
assert_eq!(result.page[0].file_path, "test.txt");
|
||||
assert_eq!(result.page[0].index, 0);
|
||||
|
||||
let search_result2 = searcher.search(&super::SearchQuery {
|
||||
project_id: gb_repo.get_project_id().to_string(),
|
||||
q: "world".to_string(),
|
||||
limit: 10,
|
||||
offset: None,
|
||||
range: Range { start: 0, end: 10 },
|
||||
});
|
||||
assert!(search_result2.is_ok());
|
||||
let search_result2 = search_result2.unwrap().page;
|
||||
assert_eq!(search_result2.len(), 1);
|
||||
assert_eq!(search_result2[0].session_id, session.id);
|
||||
assert_eq!(search_result2[0].project_id, gb_repo.get_project_id());
|
||||
assert_eq!(search_result2[0].file_path, "test.txt");
|
||||
assert_eq!(search_result2[0].index, 1);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
let search_result3 = searcher.search(&super::SearchQuery {
|
||||
project_id: gb_repo.get_project_id().to_string(),
|
||||
q: "hello world".to_string(),
|
||||
limit: 10,
|
||||
offset: None,
|
||||
range: Range { start: 0, end: 10 },
|
||||
});
|
||||
assert!(search_result3.is_ok());
|
||||
let search_result3 = search_result3.unwrap().page;
|
||||
assert_eq!(search_result3.len(), 2);
|
||||
assert_eq!(search_result3[0].project_id, gb_repo.get_project_id());
|
||||
assert_eq!(search_result3[0].session_id, session.id);
|
||||
assert_eq!(search_result3[0].file_path, "test.txt");
|
||||
assert_eq!(search_result3[1].session_id, session.id);
|
||||
assert_eq!(search_result3[1].project_id, gb_repo.get_project_id());
|
||||
assert_eq!(search_result3[1].file_path, "test.txt");
|
||||
#[test]
|
||||
fn search_by_filename() -> Result<()> {
|
||||
let repository = test_repository()?;
|
||||
let project = test_project(&repository)?;
|
||||
let gb_repo_path = tempdir()?.path().to_str().unwrap().to_string();
|
||||
let storage = storage::Storage::from_path(tempdir()?.path().to_path_buf());
|
||||
let project_store = projects::Storage::new(storage.clone());
|
||||
project_store.add_project(&project)?;
|
||||
let user_store = users::Storage::new(storage);
|
||||
let gb_repo = gb_repository::Repository::open(
|
||||
gb_repo_path,
|
||||
project.id.clone(),
|
||||
project_store.clone(),
|
||||
user_store,
|
||||
)?;
|
||||
|
||||
let search_by_filename_result = searcher.search(&super::SearchQuery {
|
||||
project_id: gb_repo.get_project_id().to_string(),
|
||||
q: "test.txt".to_string(),
|
||||
limit: 10,
|
||||
offset: None,
|
||||
range: Range { start: 0, end: 10 },
|
||||
});
|
||||
assert!(search_by_filename_result.is_ok());
|
||||
let search_by_filename_result = search_by_filename_result.unwrap().page;
|
||||
assert_eq!(search_by_filename_result.len(), 2);
|
||||
assert_eq!(search_by_filename_result[0].session_id, session.id);
|
||||
assert_eq!(
|
||||
search_by_filename_result[0].project_id,
|
||||
gb_repo.get_project_id()
|
||||
);
|
||||
assert_eq!(search_by_filename_result[0].file_path, "test.txt");
|
||||
let index_path = tempdir()?.path().to_str().unwrap().to_string();
|
||||
|
||||
let session = gb_repo.get_or_create_current_session()?;
|
||||
let writer = sessions::Writer::open(&gb_repo, &session)?;
|
||||
writer.write_deltas(
|
||||
Path::new("test.txt"),
|
||||
&vec![
|
||||
deltas::Delta {
|
||||
operations: vec![deltas::Operation::Insert((0, "Hello".to_string()))],
|
||||
timestamp_ms: 0,
|
||||
},
|
||||
deltas::Delta {
|
||||
operations: vec![deltas::Operation::Insert((5, "World".to_string()))],
|
||||
timestamp_ms: 1,
|
||||
},
|
||||
],
|
||||
)?;
|
||||
let session = gb_repo.flush()?;
|
||||
let session = session.unwrap();
|
||||
|
||||
let searcher = super::Deltas::at(index_path).unwrap();
|
||||
|
||||
let write_result = searcher.index_session(&gb_repo, &session);
|
||||
assert!(write_result.is_ok());
|
||||
|
||||
let found_result = searcher
|
||||
.search(&super::SearchQuery {
|
||||
project_id: gb_repo.get_project_id().to_string(),
|
||||
q: "test.txt".to_string(),
|
||||
limit: 10,
|
||||
offset: None,
|
||||
})?
|
||||
.page;
|
||||
assert_eq!(found_result.len(), 2);
|
||||
assert_eq!(found_result[0].session_id, session.id);
|
||||
assert_eq!(found_result[0].project_id, gb_repo.get_project_id());
|
||||
assert_eq!(found_result[0].file_path, "test.txt");
|
||||
|
||||
let not_found_result = searcher.search(&super::SearchQuery {
|
||||
project_id: "not found".to_string(),
|
||||
q: "test.txt".to_string(),
|
||||
limit: 10,
|
||||
offset: None,
|
||||
range: Range { start: 0, end: 10 },
|
||||
});
|
||||
assert!(not_found_result.is_ok());
|
||||
let not_found_result = not_found_result.unwrap();
|
||||
})?;
|
||||
assert_eq!(not_found_result.total, 0);
|
||||
|
||||
Ok(())
|
||||
@ -334,7 +258,6 @@ fn test_delete_all() -> Result<()> {
|
||||
project_id: gb_repo.get_project_id().to_string(),
|
||||
q: "test.txt".to_string(),
|
||||
limit: 10,
|
||||
range: Range { start: 2, end: 10 },
|
||||
offset: None,
|
||||
})?;
|
||||
assert_eq!(search_result_from.total, 0);
|
||||
|
Loading…
Reference in New Issue
Block a user