mirror of https://github.com/facebook/sapling.git, synced 2024-10-10 16:57:49 +03:00
c902acbc3f
Summary: remove the need to pass mapping to `::derive` method

Reviewed By: StanislavGlebik

Differential Revision: D19856560

fbshipit-source-id: 219af827ea7e077a4c3e678a85c51dc0e3822d79
923 lines
35 KiB
Rust
/*
 * Copyright (c) Facebook, Inc. and its affiliates.
 *
 * This software may be used and distributed according to the terms of the
 * GNU General Public License version 2.
 */

#![deny(warnings)]

use anyhow::{format_err, Error};
use blame::{fetch_file_full_content, BlameRoot};
use blobrepo::{BlobRepo, DangerousOverride};
use blobstore::{Blobstore, Loadable};
use bookmarks::{BookmarkPrefix, Bookmarks, Freshness};
use bytes::Bytes;
use cacheblob::{dummy::DummyLease, LeaseOps};
use changesets::{
    deserialize_cs_entries, serialize_cs_entries, ChangesetEntry, Changesets, SqlChangesets,
};
use clap::{Arg, ArgMatches, SubCommand};
use cloned::cloned;
use cmdlib::{
    args, helpers,
    monitoring::{start_fb303_and_stats_agg, AliveService},
};
use context::CoreContext;
use dbbookmarks::SqlBookmarks;
use deleted_files_manifest::RootDeletedManifestId;
use derived_data::BonsaiDerived;
use derived_data_utils::{derived_data_utils, POSSIBLE_DERIVED_TYPES};
use fastlog::{fetch_parent_root_unodes, RootFastlog};
use fbinit::FacebookInit;
use fsnodes::RootFsnodeId;
use futures::{future, stream, Future, IntoFuture, Stream};
use futures_ext::{spawn_future, try_boxfuture, BoxFuture, FutureExt};
use futures_preview::{compat::Future01CompatExt, future::try_join3, stream::FuturesUnordered};
use futures_stats::Timed;
use futures_util::{
    future::{ready, FutureExt as _, TryFutureExt},
    stream::TryStreamExt,
    try_join,
};
use lock_ext::LockExt;
use manifest::find_intersection_of_diffs;
use mononoke_types::{ChangesetId, FileUnodeId, MPath, RepositoryId};
use phases::SqlPhases;
use slog::{info, Logger};
use stats::prelude::*;
use std::{
    collections::HashMap,
    fs,
    path::Path,
    sync::{
        atomic::{AtomicUsize, Ordering},
        Arc, Mutex,
    },
    time::Duration,
};
use unodes::{find_unode_renames, RootUnodeManifestId};

define_stats_struct! {
    DerivedDataStats("mononoke.backfill_derived_data.{}.{}", repo_name: String, data_type: &'static str),
    pending_heads: timeseries(Rate, Sum),
}

const ARG_DERIVED_DATA_TYPE: &'static str = "derived-data-type";
const ARG_OUT_FILENAME: &'static str = "out-filename";
const ARG_SKIP: &'static str = "skip-changesets";
const ARG_REGENERATE: &'static str = "regenerate";
const ARG_PREFETCHED_COMMITS_PATH: &'static str = "prefetched-commits-path";
const ARG_CHANGESET: &'static str = "changeset";

const SUBCOMMAND_BACKFILL: &'static str = "backfill";
const SUBCOMMAND_TAIL: &'static str = "tail";
const SUBCOMMAND_PREFETCH_COMMITS: &'static str = "prefetch-commits";
const SUBCOMMAND_SINGLE: &'static str = "single";

const CHUNK_SIZE: usize = 4096;

/// Derived data types that are permitted to access redacted files. This list
/// should be limited to those data types that need access to the content of
/// redacted files in order to compute their data, and will not leak redacted
/// data; for example, derived data types that compute hashes of file
/// contents that form part of a Merkle tree, and thus need to have correct
/// hashes for file content.
const UNREDACTED_TYPES: &[&str] = &[
    // Fsnodes need access to redacted file contents to compute SHA-1 and
    // SHA-256 hashes of the file content, which form part of the fsnode
    // tree hashes. Redacted content is only hashed, and so cannot be
    // discovered via the fsnode tree.
    RootFsnodeId::NAME,
    // Blame does not contain any content of the file itself
    BlameRoot::NAME,
];

/// Types of derived data for which prefetching content for changed files
/// might speed up derivation.
const PREFETCH_CONTENT_TYPES: &[&str] = &[BlameRoot::NAME];
const PREFETCH_UNODE_TYPES: &[&str] = &[RootFastlog::NAME, RootDeletedManifestId::NAME];

fn open_repo_maybe_unredacted<'a>(
    fb: FacebookInit,
    logger: &Logger,
    matches: &ArgMatches<'a>,
    data_type: &str,
) -> impl Future<Item = BlobRepo, Error = Error> {
    if UNREDACTED_TYPES.contains(&data_type) {
        args::open_repo_unredacted(fb, logger, matches).left_future()
    } else {
        args::open_repo(fb, logger, matches).right_future()
    }
}

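// Illustrative invocations of the four subcommands, assembled from the clap
// definitions in `main` below. The binary name and the concrete derived data
// type names are placeholders (valid names come from POSSIBLE_DERIVED_TYPES),
// and the global repo/config arguments provided by cmdlib are omitted:
//
//   backfill_derived_data prefetch-commits --out-filename /tmp/prefetched-commits
//   backfill_derived_data backfill <derived-data-type> \
//       --prefetched-commits-path /tmp/prefetched-commits --skip-changesets 100000 [--regenerate]
//   backfill_derived_data tail <derived-data-type>...
//   backfill_derived_data single <derived-data-type> <hg-or-bonsai-hash-or-bookmark>
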
#[fbinit::main]
fn main(fb: FacebookInit) -> Result<(), Error> {
    let app = args::MononokeApp::new("Utility to work with bonsai derived data")
        .with_advanced_args_hidden()
        .with_fb303_args()
        .build()
        .version("0.0.0")
        .about("Utility to work with bonsai derived data")
        .subcommand(
            SubCommand::with_name(SUBCOMMAND_BACKFILL)
                .about("backfill derived data for public commits")
                .arg(
                    Arg::with_name(ARG_DERIVED_DATA_TYPE)
                        .required(true)
                        .index(1)
                        .possible_values(POSSIBLE_DERIVED_TYPES)
                        .help("derived data type for which backfill will be run"),
                )
                .arg(
                    Arg::with_name(ARG_SKIP)
                        .long(ARG_SKIP)
                        .takes_value(true)
                        .help("skip this number of changesets"),
                )
                .arg(
                    Arg::with_name(ARG_REGENERATE)
                        .long(ARG_REGENERATE)
                        .help("regenerate derivations even if mapping contains changeset"),
                )
                .arg(
                    Arg::with_name(ARG_PREFETCHED_COMMITS_PATH)
                        .long(ARG_PREFETCHED_COMMITS_PATH)
                        .takes_value(true)
                        .required(false)
                        .help("a file with a list of bonsai changesets to backfill"),
                ),
        )
        .subcommand(
            SubCommand::with_name(SUBCOMMAND_TAIL)
                .about("tail public commits and fill derived data")
                .arg(
                    Arg::with_name(ARG_DERIVED_DATA_TYPE)
                        .required(true)
                        .multiple(true)
                        .index(1)
                        .possible_values(POSSIBLE_DERIVED_TYPES)
                        .help("comma separated list of derived data types"),
                ),
        )
        .subcommand(
            SubCommand::with_name(SUBCOMMAND_PREFETCH_COMMITS)
                .about("fetch commits metadata from the database and save them to a file")
                .arg(
                    Arg::with_name(ARG_OUT_FILENAME)
                        .long(ARG_OUT_FILENAME)
                        .takes_value(true)
                        .required(true)
                        .help("file name where commits will be saved"),
                ),
        )
        .subcommand(
            SubCommand::with_name(SUBCOMMAND_SINGLE)
                .about("backfill single changeset (mainly for performance testing purposes)")
                .arg(
                    Arg::with_name(ARG_DERIVED_DATA_TYPE)
                        .required(true)
                        .index(1)
                        .possible_values(POSSIBLE_DERIVED_TYPES)
                        .help("derived data type for which backfill will be run"),
                )
                .arg(
                    Arg::with_name(ARG_CHANGESET)
                        .required(true)
                        .index(2)
                        .help("changeset by {hg|bonsai} hash or bookmark"),
                ),
        );
    let matches = app.get_matches();
    args::init_cachelib(fb, &matches, None);

    let logger = args::init_logging(fb, &matches);
    let ctx = CoreContext::new_with_logger(fb, logger.clone());
    let mut runtime = args::init_runtime(&matches)?;

    let run = match matches.subcommand() {
        (SUBCOMMAND_BACKFILL, Some(sub_m)) => {
            let derived_data_type = sub_m
                .value_of(ARG_DERIVED_DATA_TYPE)
                .ok_or_else(|| format_err!("missing required argument: {}", ARG_DERIVED_DATA_TYPE))?
                .to_string();

            let prefetched_commits_path = sub_m
                .value_of(ARG_PREFETCHED_COMMITS_PATH)
                .ok_or_else(|| {
                    format_err!("missing required argument: {}", ARG_PREFETCHED_COMMITS_PATH)
                })?
                .to_string();

            let skip = sub_m
                .value_of(ARG_SKIP)
                .map(|skip| skip.parse::<usize>())
                .transpose()
                .map(|skip| skip.unwrap_or(0))
                .into_future()
                .from_err();
            let regenerate = sub_m.is_present(ARG_REGENERATE);

            (
                open_repo_maybe_unredacted(fb, &logger, &matches, &derived_data_type),
                skip,
            )
                .into_future()
                .and_then(move |(repo, skip)| {
                    subcommand_backfill(
                        ctx,
                        repo,
                        derived_data_type,
                        skip,
                        regenerate,
                        prefetched_commits_path,
                    )
                })
                .boxify()
        }
        (SUBCOMMAND_TAIL, Some(sub_m)) => {
            let derived_data_types: Vec<_> = sub_m
                .values_of_lossy(ARG_DERIVED_DATA_TYPE)
                .ok_or_else(|| {
                    format_err!("missing required argument: {}", ARG_DERIVED_DATA_TYPE)
                })?;
            let service_name =
                std::env::var("TW_JOB_NAME").unwrap_or("backfill_derived_data".to_string());

            let stats = {
                let repo_name = format!(
                    "{}_{}",
                    args::get_repo_name(fb, &matches)?,
                    args::get_repo_id(fb, &matches)?
                );
                move |data_type| DerivedDataStats::new(repo_name.clone(), data_type)
            };
            start_fb303_and_stats_agg(
                fb,
                &mut runtime,
                &service_name,
                &logger,
                &matches,
                AliveService,
            )?;
            (
                args::open_repo(fb, &logger, &matches),
                args::open_repo_unredacted(fb, &logger, &matches),
                args::open_sql::<SqlBookmarks>(fb, &matches),
            )
                .into_future()
                .and_then(move |(repo, unredacted_repo, bookmarks)| {
                    subcommand_tail(
                        ctx,
                        stats,
                        repo,
                        unredacted_repo,
                        bookmarks,
                        derived_data_types,
                    )
                })
                .boxify()
        }
        (SUBCOMMAND_PREFETCH_COMMITS, Some(sub_m)) => {
            let out_filename = sub_m
                .value_of(ARG_OUT_FILENAME)
                .ok_or_else(|| format_err!("missing required argument: {}", ARG_OUT_FILENAME))?
                .to_string();

            (
                args::open_repo(fb, &logger, &matches),
                args::open_sql::<SqlChangesets>(fb, &matches),
            )
                .into_future()
                .and_then(move |(repo, changesets)| {
                    let phases = repo.get_phases();
                    let sql_phases = phases.get_sql_phases();
                    fetch_all_public_changesets(
                        ctx.clone(),
                        repo.get_repoid(),
                        changesets,
                        sql_phases.clone(),
                    )
                    .collect()
                })
                .and_then(move |css| {
                    let serialized = serialize_cs_entries(css);
                    fs::write(out_filename, serialized).map_err(Error::from)
                })
                .boxify()
        }
        (SUBCOMMAND_SINGLE, Some(sub_m)) => {
            let hash_or_bookmark = sub_m
                .value_of_lossy(ARG_CHANGESET)
                .ok_or_else(|| format_err!("missing required argument: {}", ARG_CHANGESET))?
                .to_string();
            let derived_data_type = sub_m
                .value_of(ARG_DERIVED_DATA_TYPE)
                .ok_or_else(|| format_err!("missing required argument: {}", ARG_DERIVED_DATA_TYPE))?
                .to_string();
            open_repo_maybe_unredacted(fb, &logger, &matches, &derived_data_type)
                .and_then(move |repo| {
                    helpers::csid_resolve(ctx.clone(), repo.clone(), hash_or_bookmark)
                        .and_then(move |csid| subcommand_single(ctx, repo, csid, derived_data_type))
                })
                .boxify()
        }
        (name, _) => {
            return Err(format_err!("unhandled subcommand: {}", name));
        }
    };
    runtime.block_on_std(run.compat())
}

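/// Splits the half-open range `[start, stop)` into consecutive windows of size
/// `step`, clamping the last window to `stop`. Illustration (added here, not in
/// the original source): `windows(0, 10, 4)` yields `(0, 4)`, `(4, 8)`, `(8, 10)`.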
fn windows(start: u64, stop: u64, step: u64) -> impl Iterator<Item = (u64, u64)> {
    (0..)
        .map(move |index| (start + index * step, start + (index + 1) * step))
        .take_while(move |(low, _high)| *low < stop)
        .map(move |(low, high)| (low, std::cmp::min(stop, high)))
}

// This function is not optimal since it could be made faster by doing more processing
// on the XDB side, but for the purpose of this binary it is good enough
fn fetch_all_public_changesets(
    ctx: CoreContext,
    repo_id: RepositoryId,
    changesets: SqlChangesets,
    phases: SqlPhases,
) -> impl Stream<Item = ChangesetEntry, Error = Error> {
    changesets
        .get_changesets_ids_bounds(repo_id.clone())
        .and_then(move |(start, stop)| {
            let start = start.ok_or_else(|| Error::msg("changesets table is empty"))?;
            let stop = stop.ok_or_else(|| Error::msg("changesets table is empty"))?;
            let step = 65536;
            Ok(stream::iter_ok(windows(start, stop, step)))
        })
        .flatten_stream()
        .and_then(move |(lower_bound, upper_bound)| {
            changesets
                .get_list_bs_cs_id_in_range(repo_id, lower_bound, upper_bound)
                .collect()
                .and_then({
                    cloned!(ctx, changesets, phases);
                    move |ids| {
                        changesets
                            .get_many(ctx, repo_id, ids)
                            .and_then(move |mut entries| {
                                phases
                                    .get_public_raw(
                                        &entries.iter().map(|entry| entry.cs_id).collect(),
                                    )
                                    .map(move |public| {
                                        entries.retain(|entry| public.contains(&entry.cs_id));
                                        stream::iter_ok(entries)
                                    })
                            })
                    }
                })
        })
        .flatten()
}

fn parse_serialized_commits<P: AsRef<Path>>(file: P) -> Result<Vec<ChangesetEntry>, Error> {
    let data = fs::read(file).map_err(Error::from)?;
    deserialize_cs_entries(&Bytes::from(data))
}

fn subcommand_backfill<P: AsRef<Path>>(
    ctx: CoreContext,
    repo: BlobRepo,
    derived_data_type: String,
    skip: usize,
    regenerate: bool,
    prefetched_commits_path: P,
) -> BoxFuture<(), Error> {
    let derived_utils = try_boxfuture!(derived_data_utils(
        ctx.clone(),
        repo.clone(),
        derived_data_type.clone(),
    ));

    info!(
        ctx.logger(),
        "reading all changesets for: {:?}",
        repo.get_repoid()
    );
    parse_serialized_commits(prefetched_commits_path)
        .into_future()
        .and_then(move |mut changesets| {
            changesets.sort_by_key(|cs_entry| cs_entry.gen);
            let changesets: Vec<_> = changesets
                .into_iter()
                .skip(skip)
                .map(|entry| entry.cs_id)
                .collect();
            info!(
                ctx.logger(),
                "starting deriving data for {} changesets",
                changesets.len()
            );

            let total_count = changesets.len();
            let generated_count = Arc::new(AtomicUsize::new(0));
            let total_duration = Arc::new(Mutex::new(Duration::from_secs(0)));

            if regenerate {
                derived_utils.regenerate(&changesets);
            }

            stream::iter_ok(changesets)
                .chunks(CHUNK_SIZE)
                .and_then({
                    cloned!(ctx, repo, derived_utils);
                    move |chunk| derived_utils.pending(ctx.clone(), repo.clone(), chunk.clone())
                })
                .and_then({
                    cloned!(ctx, derived_data_type, repo);
                    move |chunk| {
                        warmup(
                            ctx.clone(),
                            repo.clone(),
                            derived_data_type.clone(),
                            chunk.clone(),
                        )
                        .boxed()
                        .compat()
                        .map(move |()| chunk)
                    }
                })
                .for_each(move |chunk| {
                    let chunk_size = chunk.len();
                    derived_utils
                        .derive_batch(ctx.clone(), repo.clone(), chunk)
                        .timed({
                            cloned!(ctx, generated_count, total_duration);
                            move |stats, _| {
                                generated_count.fetch_add(chunk_size, Ordering::SeqCst);
                                let elapsed = total_duration.with(|total_duration| {
                                    *total_duration += stats.completion_time;
                                    *total_duration
                                });

                                let generated = generated_count.load(Ordering::SeqCst);
                                if generated != 0 {
                                    let generated = generated as f32;
                                    let total = total_count as f32;
                                    info!(
                                        ctx.logger(),
                                        "{}/{} estimate:{:.2?} speed:{:.2}/s mean_speed:{:.2}/s",
                                        generated,
                                        total_count,
                                        elapsed.mul_f32((total - generated) / generated),
                                        chunk_size as f32 / stats.completion_time.as_secs() as f32,
                                        generated / elapsed.as_secs() as f32,
                                    );
                                }
                                Ok(())
                            }
                        })
                })
        })
        .boxify()
}

async fn warmup(
    ctx: CoreContext,
    repo: BlobRepo,
    derived_data_type: String,
    chunk: Vec<ChangesetId>,
) -> Result<(), Error> {
    // Warm up bonsai changesets unconditionally because most likely all derived
    // data needs them, and they are cheap to warm up anyway

    let bcs_warmup = {
        cloned!(ctx, chunk, repo);
        async move {
            stream::iter_ok(chunk.clone())
                .map({
                    cloned!(ctx, repo);
                    move |cs_id| cs_id.load(ctx.clone(), repo.blobstore())
                })
                .buffer_unordered(100)
                .for_each(|_| Ok(()))
                .compat()
                .await
        }
    };

    let content_warmup = async {
        if PREFETCH_CONTENT_TYPES.contains(&derived_data_type.as_ref()) {
            content_warmup(ctx.clone(), repo.clone(), chunk.clone()).await?
        }
        Ok(())
    };

    let unode_warmup = async {
        if PREFETCH_UNODE_TYPES.contains(&derived_data_type.as_ref()) {
            unode_warmup(ctx.clone(), repo.clone(), &chunk).await?
        }
        Ok(())
    };

    try_join3(bcs_warmup, content_warmup, unode_warmup).await?;

    Ok(())
}

async fn content_warmup(
    ctx: CoreContext,
    repo: BlobRepo,
    chunk: Vec<ChangesetId>,
) -> Result<(), Error> {
    stream::iter_ok(chunk)
        .map({
            cloned!(ctx, repo);
            move |csid| prefetch_content(ctx.clone(), repo.clone(), csid)
        })
        .buffered(CHUNK_SIZE)
        .for_each(|_| Ok(()))
        .compat()
        .await
}

async fn unode_warmup(
    ctx: CoreContext,
    repo: BlobRepo,
    chunk: &Vec<ChangesetId>,
) -> Result<(), Error> {
    let futs = FuturesUnordered::new();
    for cs_id in chunk {
        cloned!(ctx, repo);
        let f = async move {
            let bcs = cs_id.load(ctx.clone(), repo.blobstore()).compat().await?;

            let root_mf_id =
                RootUnodeManifestId::derive(ctx.clone(), repo.clone(), bcs.get_changeset_id());

            let parent_unodes = fetch_parent_root_unodes(ctx.clone(), repo.clone(), bcs);
            let (root_mf_id, parent_unodes) =
                try_join!(root_mf_id.compat(), parent_unodes.compat())?;
            let unode_mf_id = root_mf_id.manifest_unode_id().clone();
            find_intersection_of_diffs(
                ctx.clone(),
                Arc::new(repo.get_blobstore()),
                unode_mf_id,
                parent_unodes,
            )
            .for_each(|_| Ok(()))
            .compat()
            .await
        };
        futs.push(f);
    }

    futs.try_for_each(|_| ready(Ok(()))).await
}

fn subcommand_tail(
    ctx: CoreContext,
    stats_constructor: impl Fn(&'static str) -> DerivedDataStats,
    repo: BlobRepo,
    unredacted_repo: BlobRepo,
    bookmarks: SqlBookmarks,
    derived_data_types: Vec<String>,
) -> impl Future<Item = (), Error = Error> {
    let derive_utils: Result<Vec<_>, Error> = derived_data_types
        .into_iter()
        .map(|name| {
            let maybe_unredacted_repo = if UNREDACTED_TYPES.contains(&name.as_ref()) {
                unredacted_repo.clone()
            } else {
                repo.clone()
            };
            let derive = derived_data_utils(ctx.clone(), repo.clone(), name)?;
            let stats = stats_constructor(derive.name());
            Ok((derive, maybe_unredacted_repo, Arc::new(stats)))
        })
        .collect();
    derive_utils.into_future().and_then(move |derive_utils| {
        let derive_utils = Arc::new(derive_utils);
        stream::repeat::<_, Error>(())
            .and_then(move |_| {
                bookmarks
                    .list_publishing_by_prefix(
                        ctx.clone(),
                        &BookmarkPrefix::empty(),
                        repo.get_repoid(),
                        Freshness::MostRecent,
                    )
                    .map(|(_name, csid)| csid)
                    .collect()
                    .and_then({
                        cloned!(ctx, derive_utils);
                        move |heads| {
                            let pending: Vec<_> = derive_utils
                                .iter()
                                .map({
                                    cloned!(ctx);
                                    move |(derive, maybe_unredacted_repo, stats)| {
                                        // create new context so each derivation would have its own trace
                                        let ctx = CoreContext::new_with_logger(
                                            ctx.fb,
                                            ctx.logger().clone(),
                                        );
                                        derive
                                            .pending(
                                                ctx.clone(),
                                                maybe_unredacted_repo.clone(),
                                                heads.clone(),
                                            )
                                            .map({
                                                cloned!(ctx, maybe_unredacted_repo, derive, stats);
                                                move |pending| {
                                                    stats
                                                        .pending_heads
                                                        .add_value(pending.len() as i64);
                                                    pending
                                                        .into_iter()
                                                        .map(|csid| {
                                                            derive.derive(
                                                                ctx.clone(),
                                                                maybe_unredacted_repo.clone(),
                                                                csid,
                                                            )
                                                        })
                                                        .collect::<Vec<_>>()
                                                }
                                            })
                                    }
                                })
                                .collect();

                            future::join_all(pending).and_then(move |pending| {
                                let pending: Vec<_> = pending.into_iter().flatten().collect();
                                if pending.is_empty() {
                                    tokio_timer::sleep(Duration::from_millis(250))
                                        .from_err()
                                        .left_future()
                                } else {
                                    let count = pending.len();
                                    info!(ctx.logger(), "found {} outdated heads", count);
                                    stream::iter_ok(pending)
                                        .buffered(1024)
                                        .for_each(|_| Ok(()))
                                        .timed({
                                            cloned!(ctx);
                                            move |stats, _| {
                                                info!(
                                                    ctx.logger(),
                                                    "derived data for {} heads in {:?}",
                                                    count,
                                                    stats.completion_time
                                                );
                                                Ok(())
                                            }
                                        })
                                        .right_future()
                                }
                            })
                        }
                    })
            })
            .for_each(|_| Ok(()))
    })
}

fn subcommand_single(
    ctx: CoreContext,
    repo: BlobRepo,
    csid: ChangesetId,
    derived_data_type: String,
) -> impl Future<Item = (), Error = Error> {
    let repo = repo.dangerous_override(|_| Arc::new(DummyLease {}) as Arc<dyn LeaseOps>);
    let derived_utils = match derived_data_utils(ctx.clone(), repo.clone(), derived_data_type) {
        Ok(derived_utils) => derived_utils,
        Err(error) => return future::err(error).left_future(),
    };
    derived_utils.regenerate(&vec![csid]);
    derived_utils
        .derive(ctx.clone(), repo, csid)
        .timed(move |stats, result| {
            info!(
                ctx.logger(),
                "derived in {:?}: {:?}", stats.completion_time, result
            );
            Ok(())
        })
        .map(|_| ())
        .right_future()
}

// Prefetch content of changed files between parents
fn prefetch_content(
    ctx: CoreContext,
    repo: BlobRepo,
    csid: ChangesetId,
) -> impl Future<Item = (), Error = Error> {
    fn prefetch_content_unode(
        ctx: CoreContext,
        blobstore: Arc<dyn Blobstore>,
        renames: &HashMap<MPath, FileUnodeId>,
        path: MPath,
        file_unode_id: FileUnodeId,
    ) -> impl Future<Item = (), Error = Error> {
        let rename = renames.get(&path).copied();
        file_unode_id
            .load(ctx.clone(), &blobstore)
            .from_err()
            .and_then(move |file_unode| {
                let parents_content: Vec<_> = file_unode
                    .parents()
                    .iter()
                    .cloned()
                    .chain(rename)
                    .map({
                        cloned!(ctx, blobstore);
                        move |file_unode_id| {
                            fetch_file_full_content(ctx.clone(), blobstore.clone(), file_unode_id)
                        }
                    })
                    .collect();

                (
                    fetch_file_full_content(ctx.clone(), blobstore.clone(), file_unode_id),
                    future::join_all(parents_content),
                )
                    .into_future()
                    .map(|_| ())
            })
            .boxify()
    }

    csid.load(ctx.clone(), repo.blobstore())
        .from_err()
        .and_then(move |bonsai| {
            let root_manifest = RootUnodeManifestId::derive(ctx.clone(), repo.clone(), csid)
                .map(|mf| mf.manifest_unode_id().clone());

            let parents_manifest = bonsai.parents().collect::<Vec<_>>().into_iter().map({
                cloned!(ctx, repo);
                move |csid| {
                    RootUnodeManifestId::derive(ctx.clone(), repo.clone(), csid)
                        .map(|mf| mf.manifest_unode_id().clone())
                }
            });

            (
                root_manifest,
                future::join_all(parents_manifest),
                find_unode_renames(ctx.clone(), repo.clone(), &bonsai),
            )
                .into_future()
                .and_then(move |(root_mf, parents_mf, renames)| {
                    let blobstore = repo.get_blobstore().boxed();
                    find_intersection_of_diffs(ctx.clone(), blobstore.clone(), root_mf, parents_mf)
                        .filter_map(|(path, entry)| Some((path?, entry.into_leaf()?)))
                        .map(move |(path, file)| {
                            spawn_future(prefetch_content_unode(
                                ctx.clone(),
                                blobstore.clone(),
                                &renames,
                                path,
                                file,
                            ))
                        })
                        .buffered(256)
                        .for_each(|_| Ok(()))
                })
        })
}

#[cfg(test)]
mod tests {
    use super::*;
    use blobstore::BlobstoreBytes;
    use fixtures::linear;
    use mercurial_types::HgChangesetId;
    use std::str::FromStr;
    use tokio_compat::runtime::Runtime;

    #[fbinit::test]
    fn test_backfill_data_latest(fb: FacebookInit) -> Result<(), Error> {
        let ctx = CoreContext::test_mock(fb);
        let repo = linear::getrepo(fb);
        let mut runtime = Runtime::new()?;

        let hg_cs_id = HgChangesetId::from_str("79a13814c5ce7330173ec04d279bf95ab3f652fb")?;
        let maybe_bcs_id = runtime.block_on(repo.get_bonsai_from_hg(ctx.clone(), hg_cs_id))?;
        let bcs_id = maybe_bcs_id.unwrap();

        let derived_utils =
            derived_data_utils(ctx.clone(), repo.clone(), RootUnodeManifestId::NAME)?;
        runtime.block_on(derived_utils.derive_batch(ctx.clone(), repo.clone(), vec![bcs_id]))?;

        Ok(())
    }
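
    // Added illustration (not part of the original file): a small unit test for the
    // pure `windows` helper above, checking that it covers [start, stop) step by step
    // and clamps the final window to `stop`.
    #[test]
    fn test_windows_clamps_last_range() {
        let ranges: Vec<_> = windows(0, 10, 4).collect();
        assert_eq!(ranges, vec![(0, 4), (4, 8), (8, 10)]);
    }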

    #[fbinit::test]
    fn test_backfill_data_batch(fb: FacebookInit) -> Result<(), Error> {
        let ctx = CoreContext::test_mock(fb);
        let repo = linear::getrepo(fb);
        let mut runtime = Runtime::new()?;

        let mut batch = vec![];
        let hg_cs_ids = vec![
            "a9473beb2eb03ddb1cccc3fbaeb8a4820f9cd157",
            "3c15267ebf11807f3d772eb891272b911ec68759",
            "a5ffa77602a066db7d5cfb9fb5823a0895717c5a",
            "79a13814c5ce7330173ec04d279bf95ab3f652fb",
        ];
        for hg_cs_id in &hg_cs_ids {
            let hg_cs_id = HgChangesetId::from_str(hg_cs_id)?;
            let maybe_bcs_id = runtime.block_on(repo.get_bonsai_from_hg(ctx.clone(), hg_cs_id))?;
            batch.push(maybe_bcs_id.unwrap());
        }

        let derived_utils =
            derived_data_utils(ctx.clone(), repo.clone(), RootUnodeManifestId::NAME)?;
        let pending =
            runtime.block_on(derived_utils.pending(ctx.clone(), repo.clone(), batch.clone()))?;
        assert_eq!(pending.len(), hg_cs_ids.len());
        runtime.block_on(derived_utils.derive_batch(ctx.clone(), repo.clone(), batch.clone()))?;
        let pending = runtime.block_on(derived_utils.pending(ctx.clone(), repo, batch))?;
        assert_eq!(pending.len(), 0);

        Ok(())
    }

    #[fbinit::test]
    fn test_backfill_data_failing_blobstore(fb: FacebookInit) -> Result<(), Error> {
        // The test exercises that derived data mapping entries are written only after
        // all other blobstore writes were successful, i.e. a mapping entry shouldn't exist
        // if any of the corresponding blobs weren't successfully saved

        let ctx = CoreContext::test_mock(fb);
        let origrepo = linear::getrepo(fb);
        let mut runtime = Runtime::new()?;

        let repo = origrepo.dangerous_override(|blobstore| -> Arc<dyn Blobstore> {
            Arc::new(FailingBlobstore::new("manifest".to_string(), blobstore))
        });

        let first_hg_cs_id = HgChangesetId::from_str("2d7d4ba9ce0a6ffd222de7785b249ead9c51c536")?;
        let maybe_bcs_id =
            runtime.block_on(repo.get_bonsai_from_hg(ctx.clone(), first_hg_cs_id))?;
        let bcs_id = maybe_bcs_id.unwrap();

        let derived_utils =
            derived_data_utils(ctx.clone(), repo.clone(), RootUnodeManifestId::NAME)?;
        let res =
            runtime.block_on(derived_utils.derive_batch(ctx.clone(), repo.clone(), vec![bcs_id]));
        // Deriving should fail because blobstore writes fail
        assert!(res.is_err());

        // Make sure that since deriving for first_hg_cs_id failed it didn't
        // write any mapping entries. And since it didn't, deriving the parent changeset
        // is now safe
        let repo = origrepo;
        let second_hg_cs_id = HgChangesetId::from_str("3e0e761030db6e479a7fb58b12881883f9f8c63f")?;
        let maybe_bcs_id =
            runtime.block_on(repo.get_bonsai_from_hg(ctx.clone(), second_hg_cs_id))?;
        let bcs_id = maybe_bcs_id.unwrap();
        runtime.block_on(derived_utils.derive_batch(ctx.clone(), repo.clone(), vec![bcs_id]))?;

        Ok(())
    }

    #[derive(Debug)]
    struct FailingBlobstore {
        bad_key_substring: String,
        inner: Arc<dyn Blobstore>,
    }

    impl FailingBlobstore {
        fn new(bad_key_substring: String, inner: Arc<dyn Blobstore>) -> Self {
            Self {
                bad_key_substring,
                inner,
            }
        }
    }

    impl Blobstore for FailingBlobstore {
        fn put(
            &self,
            ctx: CoreContext,
            key: String,
            value: BlobstoreBytes,
        ) -> BoxFuture<(), Error> {
            if key.find(&self.bad_key_substring).is_some() {
                tokio_timer::sleep(Duration::new(1, 0))
                    .from_err()
                    .and_then(|_| future::err(format_err!("failed")))
                    .boxify()
            } else {
                self.inner.put(ctx, key, value).boxify()
            }
        }

        fn get(&self, ctx: CoreContext, key: String) -> BoxFuture<Option<BlobstoreBytes>, Error> {
            self.inner.get(ctx, key)
        }
    }
}