benchmarks: add benchmark_large_directory

Summary:
Add a microbenchmark for deriving data with large directories.

This benchmark creates a commit with 100k files in a single directory, and then
derives data for that commit and for 10 descendant commits, each of which adds,
modifies, and removes some files.
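
The derived data type to benchmark is read from the first command-line
argument and defaults to fsnodes. For example (a sketch, assuming the built
binary is invoked directly):

    benchmark_large_directory fsnodes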

Reviewed By: ahornby

Differential Revision: D26947361

fbshipit-source-id: 4215f1ac806c53a112217ceb10e50cfad56f4f28
Author: Mark Juggurnauth-Thomas, 2021-03-11 04:25:00 -08:00, committed by Facebook GitHub Bot
parent eb4d31cc82
commit 36f78eadb8
3 changed files with 209 additions and 0 deletions


@@ -239,6 +239,7 @@ toml = { git = "https://github.com/jsgf/toml-rs", branch = "dotted-table-0.5.7"
members = [
    ".",
    "alpn",
    "benchmarks/derived_data",
    "benchmarks/simulated_repo",
    "blobimport_lib",
    "blobimport_lib/consts",


@@ -0,0 +1,27 @@
[package]
name = "benchmark_large_directory"
version = "0.1.0"
authors = ["Facebook"]
edition = "2018"
license = "GPLv2+"

[[bin]]
name = "benchmark_large_directory"
path = "benchmark_large_directory.rs"

[dependencies]
anyhow = "1.0"
blobrepo = { version = "0.1.0", path = "../../blobrepo" }
blobrepo_factory = { version = "0.1.0", path = "../../blobrepo/factory" }
context = { version = "0.1.0", path = "../../server/context" }
deleted_files_manifest = { version = "0.1.0", path = "../../derived_data/deleted_files_manifest" }
derived_data = { version = "0.1.0", path = "../../derived_data" }
fbinit = { version = "0.1.0", git = "https://github.com/facebookexperimental/rust-shed.git", branch = "master" }
fbinit-tokio-02 = { version = "0.1.0", git = "https://github.com/facebookexperimental/rust-shed.git", branch = "master" }
fsnodes = { version = "0.1.0", path = "../../derived_data/fsnodes" }
futures_stats = { version = "0.1.0", git = "https://github.com/facebookexperimental/rust-shed.git", branch = "master" }
mononoke_types = { version = "0.1.0", path = "../../mononoke_types" }
rand = { version = "0.7", features = ["small_rng"] }
skeleton_manifest = { version = "0.1.0", path = "../../derived_data/skeleton_manifest" }
tests_utils = { version = "0.1.0", path = "../../tests/utils" }
unodes = { version = "0.1.0", path = "../../derived_data/unodes" }


@@ -0,0 +1,181 @@
/*
 * Copyright (c) Facebook, Inc. and its affiliates.
 *
 * This software may be used and distributed according to the terms of the
 * GNU General Public License version 2.
 */

//! This benchmark generates a single initial commit that adds 100k files to
//! a single large directory, and then 10 more commits that add, modify, and
//! remove some of those files at random.
//!
//! It then benchmarks deriving one of the derived data types (fsnodes,
//! unodes, skeleton manifests, or deleted files manifests) for those commits.
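//!
//! The type to derive is read from the first command-line argument and
//! defaults to "fsnodes". A sketch of an invocation, assuming the built
//! binary is run directly:
//!
//!     benchmark_large_directory fsnodes
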
use std::collections::BTreeSet;
use anyhow::Result;
use blobrepo::BlobRepo;
use context::CoreContext;
use deleted_files_manifest::RootDeletedManifestId;
use derived_data::{BonsaiDerivable, BonsaiDerived};
use fbinit::FacebookInit;
use fsnodes::RootFsnodeId;
use futures_stats::TimedFutureExt;
use mononoke_types::ChangesetId;
use rand::distributions::{Alphanumeric, Uniform};
use rand::{thread_rng, Rng};
use skeleton_manifest::RootSkeletonManifestId;
use tests_utils::CreateCommitContext;
use unodes::RootUnodeManifestId;
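
/// Generate a random alphanumeric filename of length `len`.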
fn gen_filename(rng: &mut impl Rng, len: usize) -> String {
    std::iter::repeat_with(|| rng.sample(Alphanumeric))
        .take(len)
        .collect()
}
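
/// Create the initial commit: `count` randomly named files (name lengths
/// drawn uniformly from 5..50) inside a single `large_directory/` directory.
/// Returns the commit and the set of filenames it created.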
async fn make_initial_large_directory(
    ctx: &CoreContext,
    repo: &BlobRepo,
    count: usize,
) -> Result<(ChangesetId, BTreeSet<String>)> {
    let mut filenames = BTreeSet::new();
    let mut rng = thread_rng();
    let len_distr = Uniform::new(5, 50);
    while filenames.len() < count {
        let len = rng.sample(len_distr);
        let filename = gen_filename(&mut rng, len);
        filenames.insert(filename);
    }

    let mut create = CreateCommitContext::new_root(ctx, repo);
    for filename in filenames.iter() {
        create = create.add_file(
            format!("large_directory/{}", filename).as_str(),
            format!("content of {}", filename),
        );
    }
    let csid = create.commit().await?;

    Ok((csid, filenames))
}
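
/// Create a descendant commit of `csid` that adds `add_count` new files,
/// modifies `modify_count` existing files, and deletes `delete_count` files,
/// all chosen at random inside `large_directory/`. `index` is the commit's
/// position in the chain and is mixed into the modified file contents.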
async fn modify_large_directory(
    ctx: &CoreContext,
    repo: &BlobRepo,
    filenames: &mut BTreeSet<String>,
    csid: ChangesetId,
    index: usize,
    add_count: usize,
    modify_count: usize,
    delete_count: usize,
) -> Result<ChangesetId> {
    let mut create = CreateCommitContext::new(ctx, repo, vec![csid]);
    let mut rng = thread_rng();
    let len_distr = Uniform::new(5, 50);

    let mut add_filenames = BTreeSet::new();
    while add_filenames.len() < add_count {
        let len = rng.sample(len_distr);
        let filename = gen_filename(&mut rng, len);
        if !filenames.contains(&filename) {
            add_filenames.insert(filename);
        }
    }
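
    // Clamp the delete and modify counts so that, together, they never
    // exceed the number of files that currently exist.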
    let delete_count = delete_count.min(filenames.len());
    let modify_count = modify_count.min(filenames.len() - delete_count);
    let mut modify_filename_indexes = BTreeSet::new();
    let index_distr = Uniform::new(0, filenames.len());
    while modify_filename_indexes.len() < modify_count {
        let index = rng.sample(index_distr);
        modify_filename_indexes.insert(index);
    }
    let mut delete_filename_indexes = BTreeSet::new();
    while delete_filename_indexes.len() < delete_count {
        let index = rng.sample(index_distr);
        if !modify_filename_indexes.contains(&index) {
            delete_filename_indexes.insert(index);
        }
    }
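
    // Resolve the selected indexes back to the filenames they refer to.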
    let mut modify_filenames = BTreeSet::new();
    let mut delete_filenames = BTreeSet::new();
    for (index, filename) in filenames.iter().enumerate() {
        if modify_filename_indexes.contains(&index) {
            modify_filenames.insert(filename);
        } else if delete_filename_indexes.contains(&index) {
            delete_filenames.insert(filename);
        }
    }
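
    // Stage the adds and modifications; the commit index goes into the file
    // content so that modified files change in every commit.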
    for filename in add_filenames.iter().chain(modify_filenames) {
        create = create.add_file(
            format!("large_directory/{}", filename).as_str(),
            format!("content {} of {}", index, filename),
        );
    }
    for filename in delete_filenames.iter() {
        create = create.delete_file(format!("large_directory/{}", filename).as_str());
    }
    let csid = create.commit().await?;

    // Keep the caller's set of live filenames in sync with the new commit,
    // so later commits don't try to modify or delete files that are gone.
    let delete_filenames: Vec<String> = delete_filenames.into_iter().cloned().collect();
    for filename in delete_filenames {
        filenames.remove(&filename);
    }
    filenames.extend(add_filenames);

    Ok(csid)
}
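
/// Derive the named derived data type for `csid` and return the root id as
/// a string. `data` is matched against each type's `NAME` constant.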
async fn derive(ctx: &CoreContext, repo: &BlobRepo, data: &str, csid: ChangesetId) -> String {
    match data {
        RootSkeletonManifestId::NAME => RootSkeletonManifestId::derive(&ctx, &repo, csid)
            .await
            .unwrap()
            .skeleton_manifest_id()
            .to_string(),
        RootUnodeManifestId::NAME => RootUnodeManifestId::derive(&ctx, &repo, csid)
            .await
            .unwrap()
            .manifest_unode_id()
            .to_string(),
        RootDeletedManifestId::NAME => RootDeletedManifestId::derive(&ctx, &repo, csid)
            .await
            .unwrap()
            .deleted_manifest_id()
            .to_string(),
        RootFsnodeId::NAME => RootFsnodeId::derive(&ctx, &repo, csid)
            .await
            .unwrap()
            .fsnode_id()
            .to_string(),
        _ => panic!("invalid derived data type: {}", data),
    }
}
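
/// Entry point: build an in-memory repo, create the commits, and time
/// derivation for the initial commit and for the chain of descendants.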
#[fbinit::main]
async fn main(fb: FacebookInit) -> Result<()> {
    let ctx = CoreContext::test_mock(fb);
    let mut args = std::env::args();
    let _ = args.next();
    let data = args.next().unwrap_or_else(|| String::from("fsnodes"));
    println!("Deriving: {}", data);
    let repo = blobrepo_factory::new_memblob_empty(None)?;

    let (mut csid, mut filenames) = make_initial_large_directory(&ctx, &repo, 100_000).await?;
    println!("First commit: {}", csid);
    let (stats, derived_id) = derive(&ctx, &repo, &data, csid).timed().await;
    println!("Derived id: {} stats: {:?}", derived_id, stats);
    let commit_count = 10;
    for commit in 0..commit_count {
        csid =
            modify_large_directory(&ctx, &repo, &mut filenames, csid, commit, 25, 100, 25).await?;
    }
    println!("Last commit: {}", csid);
    let (stats, derived_id) = derive(&ctx, &repo, &data, csid).timed().await;
    println!("Derived id: {} stats: {:?}", derived_id, stats);

    Ok(())
}