custom snippet generator

This commit is contained in:
Nikita Galaiko 2023-05-29 11:00:36 +02:00
parent 86209c04a5
commit 9e73226702
4 changed files with 156 additions and 38 deletions

View File

@ -0,0 +1,81 @@
use std::{collections::HashSet, ops::Range};
use tantivy::Snippet;
/// Returns the distinct highlighted pieces of text in `snippet`, sorted.
///
/// This is similar to `Snippet::to_html`, but instead of rendering the whole
/// fragment it only extracts the highlighted parts. Overlapping highlight
/// ranges are merged by `collapse_overlapped_ranges` before slicing, and
/// identical pieces of text are reported once.
pub fn get_highlighted(snippet: &Snippet) -> Vec<String> {
    let fragment = snippet.fragment();
    // Collect into a set first so that repeated matches of the same text
    // end up in the output only once.
    let unique: HashSet<String> = collapse_overlapped_ranges(&snippet.highlighted())
        .into_iter()
        .map(|range| fragment[range].to_string())
        .collect();
    // A set has no stable order; sort to make the result deterministic.
    let mut highlighted: Vec<String> = unique.into_iter().collect();
    highlighted.sort();
    highlighted
}
/// Merges overlapping ranges into single, wider ranges.
///
/// Adapted from `tantivy::Snippet`. The ranges are processed in the order
/// given (no sorting is done here). A range is merged into the current one
/// when it starts strictly before the current end; ranges that merely touch
/// (`a.end == b.start`) are kept separate.
///
/// Returns an empty vector for empty input.
fn collapse_overlapped_ranges(ranges: &[Range<usize>]) -> Vec<Range<usize>> {
    let (first, rest) = match ranges.split_first() {
        Some(parts) => parts,
        None => return Vec::new(),
    };
    let mut result = Vec::new();
    let mut current = first.clone();
    // Only the ranges *after* the first are visited: `current` already holds
    // the first one, and visiting it again would emit it twice whenever it is
    // empty (e.g. `0..0`), because an empty range never overlaps itself.
    for range in rest {
        if current.end > range.start {
            current = current.start..std::cmp::max(current.end, range.end);
        } else {
            result.push(current);
            current = range.clone();
        }
    }
    result.push(current);
    result
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks `collapse_overlapped_ranges` against a table of
    /// `(input, expected)` pairs.
    #[test]
    fn test_collapse_overlapped_ranges() {
        let cases: [(&[Range<usize>], &[Range<usize>]); 5] = [
            // disjoint ranges are kept as they are
            (&[0..1, 2..3], &[0..1, 2..3]),
            // ranges that only touch are not merged
            (&[0..1, 1..2], &[0..1, 1..2]),
            // the second range ends where the first one does
            (&[0..2, 1..2], &[0..2]),
            // a partial overlap extends the end
            (&[0..2, 1..3], &[0..3]),
            // a fully contained range is absorbed
            (&[0..3, 1..2], &[0..3]),
        ];
        for (input, expected) in cases.iter() {
            assert_eq!(collapse_overlapped_ranges(input).as_slice(), *expected);
        }
    }

    // A snippet-level test for overlapping highlights (terms "ab" and "bc"
    // over the text "abc", expecting `<b>abc</b>` from `to_html`) is not
    // enabled here: it needs `search_fragments`,
    // `select_best_fragment_combination`, `NgramTokenizer` and `BTreeMap`,
    // none of which are imported by this module.
}

View File

@ -1,6 +1,7 @@
mod index;
mod meta;
mod searcher;
mod highlighted;
pub use searcher::{Query, Results, Searcher};

View File

@ -12,7 +12,7 @@ use tantivy::{collector, directory::MmapDirectory, IndexWriter};
use tantivy::{query::QueryParser, Term};
use tantivy::{schema::IndexRecordOption, tokenizer};
use crate::{bookmarks, deltas, gb_repository, sessions};
use crate::{bookmarks, deltas, gb_repository, search::highlighted::get_highlighted, sessions};
use super::{index, meta};
@ -99,7 +99,7 @@ impl Searcher {
);
let count_handle = collectors.add_collector(collector::Count);
let snippet_generator = tantivy::SnippetGenerator::create(
let diff_snippet_generator = tantivy::SnippetGenerator::create(
&searcher,
&query,
self.index.schema().get_field("diff").unwrap(),
@ -112,41 +112,16 @@ impl Searcher {
let page = top_docs
.iter()
.map(|(_score, doc_address)| {
let retrieved_doc = searcher.doc(*doc_address)?;
let project_id = retrieved_doc
.get_first(self.index.schema().get_field("project_id").unwrap())
.unwrap()
.as_text()
.unwrap();
let file_path = retrieved_doc
.get_first(self.index.schema().get_field("file_path").unwrap())
.unwrap()
.as_text()
.unwrap();
let session_id = retrieved_doc
.get_first(self.index.schema().get_field("session_id").unwrap())
.unwrap()
.as_text()
.unwrap();
let index = retrieved_doc
.get_first(self.index.schema().get_field("index").unwrap())
.unwrap()
.as_u64()
.unwrap();
let snippet = snippet_generator.snippet_from_doc(&retrieved_doc);
let fragment = snippet.fragment();
let highlighted: Vec<String> = snippet
.highlighted()
.iter()
.map(|range| fragment[range.start..range.end].to_string())
.collect();
let doc = &searcher.doc(*doc_address)?;
let index_document =
index::IndexDocument::from_document(&self.index.schema(), &doc);
let snippet = diff_snippet_generator.snippet_from_doc(&doc);
Ok(SearchResult {
project_id: project_id.to_string(),
file_path: file_path.to_string(),
session_id: session_id.to_string(),
highlighted,
index,
project_id: index_document.project_id.unwrap(),
file_path: index_document.file_path.unwrap(),
session_id: index_document.session_id.unwrap(),
highlighted: get_highlighted(&snippet),
index: index_document.index.unwrap(),
})
})
.collect::<Result<Vec<SearchResult>>>()?;

View File

@ -514,8 +514,7 @@ fn search_by_filename() -> Result<()> {
let searcher = super::Searcher::at(index_path).unwrap();
let write_result = searcher.index_session(&gb_repo, &session);
assert!(write_result.is_ok());
searcher.index_session(&gb_repo, &session)?;
let found_result = searcher
.search(&super::Query {
@ -540,3 +539,65 @@ fn search_by_filename() -> Result<()> {
Ok(())
}
/// Indexes one session whose only delta inserts "hello world hello" into
/// `test.txt`, then checks the highlighted pieces reported for each query.
#[test]
fn test_highlight() -> Result<()> {
    let repository = test_repository()?;
    let project = test_project(&repository)?;

    // NOTE(review): each `tempdir()?` value is a temporary that is dropped at
    // the end of its statement, so only the path string survives — confirm
    // the callees are expected to (re)create these directories.
    let repo_dir = tempdir()?.path().to_str().unwrap().to_string();
    let storage = storage::Storage::from_path(tempdir()?.path().to_path_buf());
    let project_store = projects::Storage::new(storage.clone());
    project_store.add_project(&project)?;
    let user_store = users::Storage::new(storage);
    let gb_repo = gb_repository::Repository::open(
        repo_dir,
        project.id.clone(),
        project_store.clone(),
        user_store,
    )?;
    let index_dir = tempdir()?.path().to_str().unwrap().to_string();

    // Record a single insert delta in the current session and flush it.
    let current_session = gb_repo.get_or_create_current_session()?;
    let session_writer = sessions::Writer::open(&gb_repo, &current_session)?;
    let insert_text = deltas::Delta {
        operations: vec![deltas::Operation::Insert((
            0,
            "hello world hello".to_string(),
        ))],
        timestamp_ms: 0,
    };
    session_writer.write_deltas(Path::new("test.txt"), &vec![insert_text])?;
    let flushed_session = gb_repo.flush()?.unwrap();

    let searcher = super::Searcher::at(index_dir).unwrap();
    searcher.index_session(&gb_repo, &flushed_session)?;

    // (query, expected highlighted pieces) — one matching document each.
    let expectations = [
        ("hello", vec!["hello"]),
        ("hello world", vec!["hello", "hello world"]),
    ];
    for (q, expected) in expectations {
        let page = searcher
            .search(&super::Query {
                project_id: gb_repo.get_project_id().to_string(),
                q: q.to_string(),
                limit: 10,
                offset: None,
            })?
            .page;
        assert_eq!(page.len(), 1);
        assert_eq!(page[0].highlighted, expected);
    }
    Ok(())
}