Improve git-import speed - save_bonsai_changesets

Summary:
Improve git-import speed by grouping many BonsaiChangesets into each call to save_bonsai_changesets.

During earlier profiling we noticed that gitimport's speed was dominated by save_bonsai_changesets. Batching these calls splits the time more evenly between save_bonsai_changesets and the steps that derive the diff and upload the missing file-blobs.
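
The batching reuses the usual futures::stream shape. A minimal sketch of that pattern, with placeholder types and a save_batch stand-in for save_bonsai_changesets (illustrative names, not the real Mononoke API):

    use futures::stream::{self, StreamExt, TryStreamExt};

    // Stand-in for save_bonsai_changesets: persists a whole batch in one call.
    async fn save_batch(batch: Vec<u64>) -> Result<(), String> {
        println!("saving {} changesets in one call", batch.len());
        Ok(())
    }

    async fn import(ids: Vec<u64>, concurrency: usize) -> Result<(), String> {
        stream::iter(ids)
            // Prepare each commit concurrently (extract, derive the diff, upload blobs).
            .map(|id| async move { Ok::<_, String>(id) })
            .buffered(concurrency)
            // Group the prepared changesets, then Vec<Result<T, E>> -> Result<Vec<T>, E>.
            .chunks(concurrency)
            .map(|v| v.into_iter().collect::<Result<Vec<_>, String>>())
            // One batched save per chunk instead of one save per commit.
            .try_for_each(save_batch)
            .await
    }

    fn main() {
        futures::executor::block_on(import((0..10).collect(), 4)).unwrap();
    }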

For profiling and performance analysis details, please see
https://fb.workplace.com/groups/1619247701796399/permalink/1709890502732118/

Reviewed By: StanislavGlebik

Differential Revision: D28497973

fbshipit-source-id: 0fbcf37535554dd96664da4906633eeb07c58f7c
Author: Robin Håkanson, 2021-05-18 16:30:08 -07:00 (committed by Facebook GitHub Bot)
parent ffe5a114f8
commit 4790bab686
2 changed files with 84 additions and 46 deletions


@@ -20,6 +20,7 @@ filestore = { version = "0.1.0", path = "../../filestore" }
 futures = { version = "0.3.13", features = ["async-await", "compat"] }
 futures-old = { package = "futures", version = "0.1.31" }
 futures_ext = { package = "futures_01_ext", version = "0.1.0", git = "https://github.com/facebookexperimental/rust-shed.git", branch = "master" }
+futures_stats = { version = "0.1.0", git = "https://github.com/facebookexperimental/rust-shed.git", branch = "master" }
 git-pool = { version = "0.1.0", path = "../git-pool" }
 git2 = "0.13"
 git_types = { version = "0.1.0", path = "../git_types" }


@@ -25,6 +25,7 @@ use context::CoreContext;
 use derived_data::BonsaiDerived;
 use filestore::{self, Alias, FetchKey, FilestoreConfig, StoreRequest};
 use futures::{future, stream, Stream, StreamExt, TryStreamExt};
+use futures_stats::TimedTryFutureExt;
 use git2::{ObjectType, Oid, Repository, Sort, TreeWalkMode, TreeWalkResult};
 pub use git_pool::GitPool;
 use git_types::TreeHandle;
@@ -37,6 +38,7 @@ use mononoke_types::{
 };
 use slog::{debug, info};
 use sorted_vector_map::SortedVectorMap;
+use std::cell::RefCell;
 use std::collections::{BTreeMap, HashMap};
 use std::convert::TryInto;
 use std::path::Path;
@@ -135,7 +137,7 @@ pub trait GitimportAccumulator: Sized {
     fn is_empty(&self) -> bool {
         self.len() == 0
     }
 
-    fn insert(&mut self, oid: Oid, cs_id: ChangesetId, bonsai: BonsaiChangeset);
+    fn insert(&mut self, oid: Oid, cs_id: ChangesetId, bonsai: &BonsaiChangeset);
     fn get(&self, oid: &Oid) -> Option<ChangesetId>;
 }
@@ -154,8 +156,8 @@ impl GitimportAccumulator for BufferingGitimportAccumulator {
         self.inner.len()
     }
 
-    fn insert(&mut self, oid: Oid, cs_id: ChangesetId, bonsai: BonsaiChangeset) {
-        self.inner.insert(oid, (cs_id, bonsai));
+    fn insert(&mut self, oid: Oid, cs_id: ChangesetId, bonsai: &BonsaiChangeset) {
+        self.inner.insert(oid, (cs_id, bonsai.clone()));
     }
 
     fn get(&self, oid: &Oid) -> Option<ChangesetId> {
@@ -196,9 +198,11 @@ pub async fn gitimport_acc<Acc: GitimportAccumulator>(
         return Ok(Acc::new());
     }
 
+    let acc = RefCell::new(Acc::new());
+
     // Kick off a stream that consumes the walk and prepared commits. Then, produce the Bonsais.
-    let ret = stream::iter(walk)
-        .map(|oid| async move {
+    stream::iter(walk)
+        .map(|oid| async {
             let oid = oid.with_context(|| "While walking commits")?;
 
             let ExtractedCommit {
@@ -226,46 +230,73 @@ pub async fn gitimport_acc<Acc: GitimportAccumulator>(
             Ok((metadata, file_changes))
         })
         .buffered(prefs.concurrency)
-        .try_fold(Acc::new(), {
-            move |mut acc, (metadata, file_changes)| async move {
-                let oid = metadata.oid;
-                let parents = metadata
-                    .parents
-                    .iter()
-                    .map(|p| {
-                        roots
-                            .get(&p)
-                            .copied()
-                            .or_else(|| acc.get(p))
-                            .ok_or_else(|| format_err!("Commit was not imported: {}", p))
-                    })
-                    .collect::<Result<Vec<_>, _>>()
-                    .with_context(|| format_err!("While looking for parents of {}", oid))?;
+        .and_then(|(metadata, file_changes)| async {
+            let oid = metadata.oid;
+            let parents = metadata
+                .parents
+                .iter()
+                .map(|p| {
+                    roots
+                        .get(&p)
+                        .copied()
+                        .or_else(|| acc.borrow().get(p))
+                        .ok_or_else(|| format_err!("Commit was not imported: {}", p))
+                })
+                .collect::<Result<Vec<_>, _>>()
+                .with_context(|| format_err!("While looking for parents of {}", oid))?;
+            let bcs = generate_bonsai_changeset(metadata, parents, file_changes, &prefs)?;
+            let bcs_id = bcs.get_changeset_id();
+            acc.borrow_mut().insert(oid, bcs_id, &bcs);
 
-                let bcs =
-                    import_bonsai_changeset(ctx, repo, metadata, parents, file_changes, &prefs)
-                        .await?;
+            let git_sha1 = oid_to_sha1(&oid)?;
+            info!(
+                ctx.logger(),
+                "GitRepo:{} commit {} of {} - Oid:{} => Bid:{}",
+                repo_name_ref,
+                acc.borrow().len(),
+                nb_commits_to_import,
+                git_sha1.to_brief(),
+                bcs_id.to_brief()
+            );
+            Ok((bcs, git_sha1))
+        })
+        // Chunk together into Vec<std::result::Result<(bcs, git_sha1), Error>>
+        .chunks(prefs.concurrency)
+        // Go from Vec<Result<X, Y>> -> Result<Vec<X>, Y>
+        //.then(|v| future::ready(v.into_iter().collect::<Result<Vec<_>, Error>>()))
+        .map(|v| v.into_iter().collect::<Result<Vec<_>, Error>>())
+        .try_for_each(|v| async {
+            let oid_to_bcsid = v
+                .iter()
+                .map(|(bcs, git_sha1)| {
+                    BonsaiGitMappingEntry::new(*git_sha1, bcs.get_changeset_id())
+                })
+                .collect::<Vec<BonsaiGitMappingEntry>>();
+            let vbcs = v.into_iter().map(|x| x.0).collect();
 
-                let bcs_id = bcs.get_changeset_id();
+            // We know that the commits are in order (this is guaranteed by the Walk), so we
+            // can insert them as-is, one by one, without extra dependency / ordering checks.
+            let (stats, ()) = save_bonsai_changesets(vbcs, ctx.clone(), repo.clone())
+                .try_timed()
+                .await?;
+            debug!(
+                ctx.logger(),
+                "save_bonsai_changesets for {} commits in {:?}",
+                oid_to_bcsid.len(),
+                stats.completion_time
+            );
 
-                acc.insert(oid, bcs_id, bcs);
-
-                info!(
-                    ctx.logger(),
-                    "GitRepo:{} commit {} of {} - Oid:{} => Bid:{}",
-                    repo_name_ref,
-                    acc.len(),
-                    nb_commits_to_import,
-                    oid_to_sha1(&oid)?.to_brief(),
-                    bcs_id.to_brief()
-                );
-
-                Result::<_, Error>::Ok(acc)
-            }
+            if prefs.bonsai_git_mapping {
+                repo.bonsai_git_mapping()
+                    .bulk_add(&ctx, &oid_to_bcsid)
+                    .await?;
+            }
+            Ok(())
         })
         .await?;
 
-    Ok(ret)
+    Ok(acc.into_inner())
 }
 
 pub async fn gitimport(
@@ -338,9 +369,7 @@ pub async fn gitimport(
     Ok(import_map)
 }
 
-async fn import_bonsai_changeset(
-    ctx: &CoreContext,
-    repo: &BlobRepo,
+fn generate_bonsai_changeset(
     metadata: CommitMetadata,
     parents: Vec<ChangesetId>,
     file_changes: SortedVectorMap<MPath, Option<FileChange>>,
@@ -365,7 +394,7 @@ async fn import_bonsai_changeset(
     }
 
     // TODO: Should we have further extras?
-    let bcs = BonsaiChangesetMut {
+    BonsaiChangesetMut {
         parents,
         author,
         author_date,
@@ -375,13 +404,21 @@ async fn import_bonsai_changeset(
         extra,
         file_changes,
     }
-    .freeze()?;
+    .freeze()
+}
+
+async fn import_bonsai_changeset(
+    ctx: &CoreContext,
+    repo: &BlobRepo,
+    metadata: CommitMetadata,
+    parents: Vec<ChangesetId>,
+    file_changes: SortedVectorMap<MPath, Option<FileChange>>,
+    prefs: &GitimportPreferences,
+) -> Result<BonsaiChangeset, Error> {
+    let oid = metadata.oid;
+    let bcs = generate_bonsai_changeset(metadata, parents, file_changes, prefs)?;
     let bcs_id = bcs.get_changeset_id();
 
     // We know that the commits are in order (this is guaranteed by the Walk), so we
     // can insert them as-is, one by one, without extra dependency / ordering checks.
     save_bonsai_changesets(vec![bcs.clone()], ctx.clone(), repo.clone()).await?;
 
     if prefs.bonsai_git_mapping {