Handle the edge case in which file documents are larger than the allowable limit

This commit is contained in:
KCaverly 2023-08-21 11:29:45 +02:00
parent b7e03507c2
commit 1cae4758cc
2 changed files with 63 additions and 22 deletions

View File

@ -156,25 +156,27 @@ impl VectorDatabase {
mtime: SystemTime, mtime: SystemTime,
documents: Vec<Document>, documents: Vec<Document>,
) -> Result<()> { ) -> Result<()> {
// Write to files table, and return generated id. // Return the existing ID, if both the file and mtime match
self.db.execute(
"
DELETE FROM files WHERE worktree_id = ?1 AND relative_path = ?2;
",
params![worktree_id, path.to_str()],
)?;
let mtime = Timestamp::from(mtime); let mtime = Timestamp::from(mtime);
self.db.execute( let mut existing_id_query = self.db.prepare("SELECT id FROM files WHERE worktree_id = ?1 AND relative_path = ?2 AND mtime_seconds = ?3 AND mtime_nanos = ?4")?;
" let existing_id = existing_id_query
INSERT INTO files .query_row(
(worktree_id, relative_path, mtime_seconds, mtime_nanos) params![worktree_id, path.to_str(), mtime.seconds, mtime.nanos],
VALUES |row| Ok(row.get::<_, i64>(0)?),
(?1, ?2, $3, $4); )
", .map_err(|err| anyhow!(err));
params![worktree_id, path.to_str(), mtime.seconds, mtime.nanos], let file_id = if existing_id.is_ok() {
)?; // If already exists, just return the existing id
existing_id.unwrap()
let file_id = self.db.last_insert_rowid(); } else {
// Delete Existing Row
self.db.execute(
"DELETE FROM files WHERE worktree_id = ?1 AND relative_path = ?2;",
params![worktree_id, path.to_str()],
)?;
self.db.execute("INSERT INTO files (worktree_id, relative_path, mtime_seconds, mtime_nanos) VALUES (?1, ?2, ?3, ?4);", params![worktree_id, path.to_str(), mtime.seconds, mtime.nanos])?;
self.db.last_insert_rowid()
};
// Currently inserting at approximately 3400 documents a second // Currently inserting at approximately 3400 documents a second
// I imagine we can speed this up with a bulk insert of some kind. // I imagine we can speed this up with a bulk insert of some kind.

View File

@ -96,6 +96,7 @@ struct ProjectState {
_outstanding_job_count_tx: Arc<Mutex<watch::Sender<usize>>>, _outstanding_job_count_tx: Arc<Mutex<watch::Sender<usize>>>,
} }
#[derive(Clone)]
struct JobHandle { struct JobHandle {
tx: Weak<Mutex<watch::Sender<usize>>>, tx: Weak<Mutex<watch::Sender<usize>>>,
} }
@ -389,6 +390,7 @@ impl SemanticIndex {
embeddings_queue: &mut Vec<(i64, Vec<Document>, PathBuf, SystemTime, JobHandle)>, embeddings_queue: &mut Vec<(i64, Vec<Document>, PathBuf, SystemTime, JobHandle)>,
embed_batch_tx: &channel::Sender<Vec<(i64, Vec<Document>, PathBuf, SystemTime, JobHandle)>>, embed_batch_tx: &channel::Sender<Vec<(i64, Vec<Document>, PathBuf, SystemTime, JobHandle)>>,
) { ) {
// Handle edge case where individual file has more documents than max batch size
let should_flush = match job { let should_flush = match job {
EmbeddingJob::Enqueue { EmbeddingJob::Enqueue {
documents, documents,
@ -397,9 +399,43 @@ impl SemanticIndex {
mtime, mtime,
job_handle, job_handle,
} => { } => {
*queue_len += &documents.len(); // If documents is greater than embeddings batch size, recursively batch existing rows.
embeddings_queue.push((worktree_id, documents, path, mtime, job_handle)); if &documents.len() > &EMBEDDINGS_BATCH_SIZE {
*queue_len >= EMBEDDINGS_BATCH_SIZE let first_job = EmbeddingJob::Enqueue {
documents: documents[..EMBEDDINGS_BATCH_SIZE].to_vec(),
worktree_id,
path: path.clone(),
mtime,
job_handle: job_handle.clone(),
};
Self::enqueue_documents_to_embed(
first_job,
queue_len,
embeddings_queue,
embed_batch_tx,
);
let second_job = EmbeddingJob::Enqueue {
documents: documents[EMBEDDINGS_BATCH_SIZE..].to_vec(),
worktree_id,
path: path.clone(),
mtime,
job_handle: job_handle.clone(),
};
Self::enqueue_documents_to_embed(
second_job,
queue_len,
embeddings_queue,
embed_batch_tx,
);
return;
} else {
*queue_len += &documents.len();
embeddings_queue.push((worktree_id, documents, path, mtime, job_handle));
*queue_len >= EMBEDDINGS_BATCH_SIZE
}
} }
EmbeddingJob::Flush => true, EmbeddingJob::Flush => true,
}; };
@ -796,7 +832,10 @@ impl Drop for JobHandle {
fn drop(&mut self) { fn drop(&mut self) {
if let Some(tx) = self.tx.upgrade() { if let Some(tx) = self.tx.upgrade() {
let mut tx = tx.lock(); let mut tx = tx.lock();
*tx.borrow_mut() -= 1; // Manage for overflow, cause we are cloning the Job Handle
if *tx.borrow() > 0 {
*tx.borrow_mut() -= 1;
};
} }
} }
} }