/*
 * Copyright (c) Facebook, Inc. and its affiliates.
 *
 * This software may be used and distributed according to the terms of the
 * GNU General Public License version 2.
 */

#![deny(warnings)]

use anyhow::{anyhow, format_err, Error};
use async_trait::async_trait;
use blame::{BlameRoot, BlameRootMapping};
use blobrepo::BlobRepo;
use blobrepo_override::DangerousOverride;
use blobstore::{Blobstore, Loadable};
use cacheblob::{dummy::DummyLease, LeaseOps, MemWritesBlobstore};
use changeset_info::{ChangesetInfo, ChangesetInfoMapping};
use cloned::cloned;
use context::{CoreContext, SessionContainer};
use deleted_files_manifest::{RootDeletedManifestId, RootDeletedManifestMapping};
use derived_data::{
    derive_impl, BlobstoreExistsMapping, BlobstoreRootIdMapping, BonsaiDerivable,
    BonsaiDerivedMapping, DerivedDataTypesConfig, RegenerateMapping,
};
use derived_data_filenodes::{FilenodesOnlyPublic, FilenodesOnlyPublicMapping};
use fastlog::{RootFastlog, RootFastlogMapping};
use fsnodes::{RootFsnodeId, RootFsnodeMapping};
use futures::{
    future::{self, ready, try_join_all, BoxFuture, FutureExt},
    stream::{self, futures_unordered::FuturesUnordered},
    Future, Stream, StreamExt, TryFutureExt, TryStreamExt,
};
use lazy_static::lazy_static;
use lock_ext::LockExt;
use mercurial_derived_data::{HgChangesetIdMapping, MappedHgChangesetId};
use mononoke_types::{BonsaiChangeset, ChangesetId};
use scuba_ext::MononokeScubaSampleBuilder;
use skeleton_manifest::{RootSkeletonManifestId, RootSkeletonManifestMapping};
use std::{
    collections::{HashMap, HashSet},
    hash::{Hash, Hasher},
    io::Write,
    sync::{Arc, Mutex},
};
use topo_sort::sort_topological;
use tunables::tunables;
use unodes::{RootUnodeManifestId, RootUnodeManifestMapping};

pub const POSSIBLE_DERIVED_TYPES: &[&str] = &[
    RootUnodeManifestId::NAME,
    RootFastlog::NAME,
    MappedHgChangesetId::NAME,
    RootFsnodeId::NAME,
    BlameRoot::NAME,
    ChangesetInfo::NAME,
    RootDeletedManifestId::NAME,
    FilenodesOnlyPublic::NAME,
    RootSkeletonManifestId::NAME,
];

lazy_static! {
    // TODO: come up with a better way to maintain these dependencies T77090285
    pub static ref DERIVED_DATA_DEPS: HashMap<&'static str, Vec<&'static str>> = {
        let unodes = RootUnodeManifestId::NAME;
        let fastlog = RootFastlog::NAME;
        let hgchangeset = MappedHgChangesetId::NAME;
        let fsnodes = RootFsnodeId::NAME;
        let blame = BlameRoot::NAME;
        let changesets_info = ChangesetInfo::NAME;
        let deleted_mf = RootDeletedManifestId::NAME;
        let filenodes = FilenodesOnlyPublic::NAME;
        let skeleton_mf = RootSkeletonManifestId::NAME;

        let mut dag = HashMap::new();

        dag.insert(hgchangeset, vec![]);
        dag.insert(unodes, vec![]);
        dag.insert(blame, vec![unodes]);
        dag.insert(fastlog, vec![unodes]);
        dag.insert(changesets_info, vec![]);
        dag.insert(filenodes, vec![hgchangeset]);
        dag.insert(fsnodes, vec![]);
        dag.insert(deleted_mf, vec![unodes]);
        dag.insert(skeleton_mf, vec![]);

        dag
    };
    pub static ref DERIVED_DATA_ORDER: HashMap<&'static str, usize> = {
        let order = sort_topological(&*DERIVED_DATA_DEPS).expect("derived data cannot form a loop");
        order
            .into_iter()
            .enumerate()
            .map(|(index, name)| (name, index))
            .collect::<HashMap<_, _>>()
    };
}
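
// A minimal sketch of how these tables are consulted (illustrative only;
// `blame_deriver` and `unodes_deriver` stand in for real `DerivedUtils`
// instances obtained via `derived_data_utils` below):
//
//     // blame depends on unodes, so unodes sorts first
//     let mut derivers = vec![blame_deriver, unodes_deriver];
//     derivers.sort_by_key(|d| DERIVED_DATA_ORDER.get(d.name()));
//     assert_eq!(derivers[0].name(), RootUnodeManifestId::NAME);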

pub fn derive_data_for_csids(
    ctx: &CoreContext,
    repo: &BlobRepo,
    csids: Vec<ChangesetId>,
    derived_data_types: &[String],
) -> Result<impl Future<Output = Result<(), Error>>, Error> {
    let derivations = FuturesUnordered::new();

    for data_type in derived_data_types {
        let derived_utils = derived_data_utils(repo, data_type)?;

        let mut futs = vec![];
        for csid in &csids {
            let fut = derived_utils
                .derive(ctx.clone(), repo.clone(), *csid)
                .map_ok(|_| ());
            futs.push(fut);
        }

        derivations.push(async move {
            // Derive sequentially: derivation of a commit depends on its
            // parents, so there's no point in trying to derive in parallel.
            for f in futs {
                f.await?;
            }
            Result::<_, Error>::Ok(())
        });
    }

    Ok(async move {
        derivations.try_for_each(|_| ready(Ok(()))).await?;
        Ok(())
    })
}
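
// Illustrative call site (the `ctx`, `repo` and `csids` values are assumed to
// come from the caller): the returned future drives all requested derivations.
//
//     let derivation =
//         derive_data_for_csids(&ctx, &repo, csids, &["unodes".to_string()])?;
//     derivation.await?;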

#[async_trait]
pub trait DerivedUtils: Send + Sync + 'static {
    /// Derive data for changeset
    fn derive(
        &self,
        ctx: CoreContext,
        repo: BlobRepo,
        csid: ChangesetId,
    ) -> BoxFuture<'static, Result<String, Error>>;

    fn backfill_batch_dangerous(
        &self,
        ctx: CoreContext,
        repo: BlobRepo,
        csids: Vec<ChangesetId>,
        parallel: bool,
        gap_size: Option<usize>,
    ) -> BoxFuture<'static, Result<(), Error>>;

    /// Find pending changesets (changesets for which data has not been derived)
    async fn pending(
        &self,
        ctx: CoreContext,
        repo: BlobRepo,
        csids: Vec<ChangesetId>,
    ) -> Result<Vec<ChangesetId>, Error>;

    /// Regenerate derived data for the specified set of commits
    fn regenerate(&self, csids: &Vec<ChangesetId>);

    /// Get a name for this type of derived data
    fn name(&self) -> &'static str;

    async fn find_oldest_underived<'a>(
        &'a self,
        ctx: &'a CoreContext,
        repo: &'a BlobRepo,
        csids: &'a Vec<ChangesetId>,
    ) -> Result<Option<BonsaiChangeset>, Error>;
}
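
// `DerivedUtils` implementations are obtained through `derived_data_utils`
// below. A minimal sketch, mirroring the tests at the bottom of this file:
//
//     let deriver = derived_data_utils(&repo, "unodes")?;
//     let underived = deriver.pending(ctx.clone(), repo.clone(), csids).await?;
//     deriver.derive(ctx.clone(), repo.clone(), csid).await?;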

#[derive(Clone)]
struct DerivedUtilsFromMapping<M> {
    mapping: RegenerateMapping<M>,
}

impl<M> DerivedUtilsFromMapping<M> {
    fn new(mapping: M) -> Self {
        let mapping = RegenerateMapping::new(mapping);
        Self { mapping }
    }
}

#[async_trait]
impl<M> DerivedUtils for DerivedUtilsFromMapping<M>
where
    M: BonsaiDerivedMapping + Clone + 'static,
    M::Value: BonsaiDerivable + std::fmt::Debug,
{
    fn derive(
        &self,
        ctx: CoreContext,
        repo: BlobRepo,
        csid: ChangesetId,
    ) -> BoxFuture<'static, Result<String, Error>> {
        // We call derive_impl directly so that we can pass `self.mapping`
        // there. This allows us to e.g. regenerate derived data for a commit
        // even if it was already generated (see the RegenerateMapping call).
        cloned!(self.mapping);
        async move {
            let result =
                derive_impl::derive_impl::<M::Value, _>(&ctx, &repo, &mapping, csid).await?;
            Ok(format!("{:?}", result))
        }
        .boxed()
    }

    /// !!!!This function is dangerous and should be used with care!!!!
    /// In particular it might corrupt the data if it tries to derive data that
    /// depends on another derived data type (e.g. blame depends on unodes) and
    /// both of them are underived.
    /// For example, if unodes and blame are both underived and we are trying
    /// to derive blame, then the unodes mapping might be inserted in the
    /// blobstore before all unodes were derived.
    ///
    /// This function is safe to use only if the derived data type doesn't
    /// depend on another derived data type (e.g. unodes), or if that
    /// dependency is already derived (e.g. deriving blame when unodes are
    /// already derived).
    fn backfill_batch_dangerous(
        &self,
        ctx: CoreContext,
        repo: BlobRepo,
        csids: Vec<ChangesetId>,
        parallel: bool,
        gap_size: Option<usize>,
    ) -> BoxFuture<'static, Result<(), Error>> {
        let orig_mapping = self.mapping.clone();
        // With InMemoryMapping we can ensure that mapping entries are written
        // only after all corresponding blobs were successfully saved.
        let in_memory_mapping = InMemoryMapping::new(self.mapping.clone());

        // Use `MemWritesBlobstore` to avoid blocking on writes to the
        // underlying blobstore. `::persist` is later used to bulk write all
        // pending data.
        let mut memblobstore = None;
        let repo = repo
            .dangerous_override(|_| Arc::new(DummyLease {}) as Arc<dyn LeaseOps>)
            .dangerous_override(|blobstore| -> Arc<dyn Blobstore> {
                let blobstore = Arc::new(MemWritesBlobstore::new(blobstore));
                memblobstore = Some(blobstore.clone());
                blobstore
            });
        let memblobstore = memblobstore.expect("memblobstore should have been updated");

        async move {
            if parallel || gap_size.is_some() {
                // create a new context so each derivation batch has its own
                // trace and is rate-limited
                let session = SessionContainer::builder(ctx.fb)
                    .blobstore_maybe_read_qps_limiter(tunables().get_backfill_read_qps())
                    .await
                    .blobstore_maybe_write_qps_limiter(tunables().get_backfill_write_qps())
                    .await
                    .build();
                let ctx = session.new_context(
                    ctx.logger().clone(),
                    MononokeScubaSampleBuilder::with_discard(),
                );

                // derive the batch of derived data in parallel
                M::Value::batch_derive(&ctx, &repo, csids, &in_memory_mapping, gap_size).await?;
            } else {
                for csid in csids {
                    // create a new context so each derivation has its own trace
                    let ctx = CoreContext::new_with_logger(ctx.fb, ctx.logger().clone());

                    // derive each changeset sequentially
                    derive_impl::derive_impl::<M::Value, _>(&ctx, &repo, &in_memory_mapping, csid)
                        .await?;
                }
            }

            {
                let ctx = derived_data::override_ctx(ctx.clone(), &repo);
                // flush blobstore
                memblobstore.persist(&ctx).await?;
            }

            // flush mapping
            let futs = FuturesUnordered::new();
            {
                let buffer = in_memory_mapping.into_buffer();
                let buffer = buffer.lock().unwrap();
                for (cs_id, value) in buffer.iter() {
                    futs.push(orig_mapping.put(ctx.clone(), *cs_id, value.clone()));
                }
            }
            futs.try_for_each(|_| future::ok(())).await?;
            Ok(())
        }
        .boxed()
    }
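
    // Safe-usage sketch (illustrative only): backfill dependencies before
    // dependents, e.g. unodes before blame, so the dangerous batching above
    // never publishes a mapping whose dependency is missing:
    //
    //     let unodes = derived_data_utils_for_backfill(&repo, "unodes")?;
    //     let blame = derived_data_utils_for_backfill(&repo, "blame")?;
    //     unodes
    //         .backfill_batch_dangerous(ctx.clone(), repo.clone(), csids.clone(), true, None)
    //         .await?;
    //     blame
    //         .backfill_batch_dangerous(ctx.clone(), repo.clone(), csids, true, None)
    //         .await?;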

    async fn pending(
        &self,
        ctx: CoreContext,
        _repo: BlobRepo,
        mut csids: Vec<ChangesetId>,
    ) -> Result<Vec<ChangesetId>, Error> {
        let derived = self.mapping.get(ctx, csids.clone()).await?;
        csids.retain(|csid| !derived.contains_key(&csid));
        Ok(csids)
    }

    async fn find_oldest_underived<'a>(
        &'a self,
        ctx: &'a CoreContext,
        repo: &'a BlobRepo,
        csids: &'a Vec<ChangesetId>,
    ) -> Result<Option<BonsaiChangeset>, Error> {
        let mut underived_ancestors = vec![];
        for cs_id in csids {
            underived_ancestors.push(derive_impl::find_topo_sorted_underived::<M::Value, _, _>(
                ctx,
                repo,
                &self.mapping,
                vec![*cs_id],
                None,
            ));
        }

        let boxed_stream = stream::iter(underived_ancestors)
            .map(Result::<_, Error>::Ok)
            .try_buffer_unordered(100)
            // boxed() is necessary to avoid "one type is more general than the other" error
            .boxed();

        let res = boxed_stream.try_collect::<Vec<_>>().await?;
        let oldest_changesets = stream::iter(
            res.into_iter()
                // The first element is the first underived ancestor in
                // toposorted order. Let's use it as a proxy for the oldest
                // underived commit.
                .map(|all_underived| all_underived.get(0).cloned())
                .flatten()
                .map(|cs_id| async move { cs_id.load(ctx, repo.blobstore()).await }),
        )
        .map(Ok)
        .try_buffer_unordered(100)
        // boxed() is necessary to avoid "one type is more general than the other" error
        .boxed();

        let oldest_changesets = oldest_changesets.try_collect::<Vec<_>>().await?;
        Ok(oldest_changesets
            .into_iter()
            .min_by_key(|bcs| *bcs.author_date()))
    }

    fn regenerate(&self, csids: &Vec<ChangesetId>) {
        self.mapping.regenerate(csids.iter().copied())
    }

    fn name(&self) -> &'static str {
        M::Value::NAME
    }
}

#[derive(Clone)]
struct InMemoryMapping<M: BonsaiDerivedMapping + Clone> {
    mapping: M,
    buffer: Arc<Mutex<HashMap<ChangesetId, M::Value>>>,
}

impl<M> InMemoryMapping<M>
where
    M: BonsaiDerivedMapping + Clone,
    <M as BonsaiDerivedMapping>::Value: Clone,
{
    fn new(mapping: M) -> Self {
        Self {
            mapping,
            buffer: Arc::new(Mutex::new(HashMap::new())),
        }
    }

    fn into_buffer(self) -> Arc<Mutex<HashMap<ChangesetId, M::Value>>> {
        self.buffer
    }
}

#[async_trait]
impl<M> BonsaiDerivedMapping for InMemoryMapping<M>
where
    M: BonsaiDerivedMapping + Clone,
    <M as BonsaiDerivedMapping>::Value: Clone,
{
    type Value = M::Value;

    async fn get(
        &self,
        ctx: CoreContext,
        mut csids: Vec<ChangesetId>,
    ) -> Result<HashMap<ChangesetId, Self::Value>, Error> {
        let mut ans = HashMap::new();
        {
            let buffer = self.buffer.lock().unwrap();
            csids.retain(|cs_id| {
                if let Some(v) = buffer.get(cs_id) {
                    ans.insert(*cs_id, v.clone());
                    false
                } else {
                    true
                }
            });
        }

        let fetched = self.mapping.get(ctx, csids).await?;
        Ok(ans.into_iter().chain(fetched.into_iter()).collect())
    }

    async fn put(
        &self,
        _ctx: CoreContext,
        csid: ChangesetId,
        id: Self::Value,
    ) -> Result<(), Error> {
        let mut buffer = self.buffer.lock().unwrap();
        buffer.insert(csid, id);
        Ok(())
    }

    fn options(&self) -> <M::Value as BonsaiDerivable>::Options {
        self.mapping.options()
    }
}

pub fn derived_data_utils(
    repo: &BlobRepo,
    name: impl AsRef<str>,
) -> Result<Arc<dyn DerivedUtils>, Error> {
    let name = name.as_ref();
    let config = repo.get_derived_data_config();
    let types_config = if config.enabled.types.contains(name) {
        &config.enabled
    } else {
        return Err(anyhow!("Derived data type {} is not configured", name));
    };
    derived_data_utils_impl(repo, name, types_config)
}

pub fn derived_data_utils_for_backfill(
    repo: &BlobRepo,
    name: impl AsRef<str>,
) -> Result<Arc<dyn DerivedUtils>, Error> {
    let name = name.as_ref();
    let config = repo.get_derived_data_config();
    let types_config = if config.backfilling.types.contains(name) {
        &config.backfilling
    } else if config.enabled.types.contains(name) {
        &config.enabled
    } else {
        return Err(anyhow!(
            "Derived data type {} is not configured for backfilling",
            name
        ));
    };
    derived_data_utils_impl(repo, name, types_config)
}

fn derived_data_utils_impl(
    repo: &BlobRepo,
    name: &str,
    config: &DerivedDataTypesConfig,
) -> Result<Arc<dyn DerivedUtils>, Error> {
    match name {
        RootUnodeManifestId::NAME => {
            let mapping = RootUnodeManifestMapping::new(repo, config)?;
            Ok(Arc::new(DerivedUtilsFromMapping::new(mapping)))
        }
        RootFastlog::NAME => {
            let mapping = RootFastlogMapping::new(repo, config)?;
            Ok(Arc::new(DerivedUtilsFromMapping::new(mapping)))
        }
        MappedHgChangesetId::NAME => {
            let mapping = HgChangesetIdMapping::new(repo, config)?;
            Ok(Arc::new(DerivedUtilsFromMapping::new(mapping)))
        }
        RootFsnodeId::NAME => {
            let mapping = RootFsnodeMapping::new(repo, config)?;
            Ok(Arc::new(DerivedUtilsFromMapping::new(mapping)))
        }
        BlameRoot::NAME => {
            let mapping = BlameRootMapping::new(repo, config)?;
            Ok(Arc::new(DerivedUtilsFromMapping::new(mapping)))
        }
        ChangesetInfo::NAME => {
            let mapping = ChangesetInfoMapping::new(repo, config)?;
            Ok(Arc::new(DerivedUtilsFromMapping::new(mapping)))
        }
        RootDeletedManifestId::NAME => {
            let mapping = RootDeletedManifestMapping::new(repo, config)?;
            Ok(Arc::new(DerivedUtilsFromMapping::new(mapping)))
        }
        FilenodesOnlyPublic::NAME => {
            let mapping = FilenodesOnlyPublicMapping::new(repo, config)?;
            Ok(Arc::new(DerivedUtilsFromMapping::new(mapping)))
        }
        RootSkeletonManifestId::NAME => {
            let mapping = RootSkeletonManifestMapping::new(repo, config)?;
            Ok(Arc::new(DerivedUtilsFromMapping::new(mapping)))
        }
        name => Err(format_err!("Unsupported derived data type: {}", name)),
    }
}

pub struct DeriveGraphInner {
    pub id: usize,
    // `deriver` can be None only for the root element, in which case `csids`
    // is empty.
    pub deriver: Option<Arc<dyn DerivedUtils>>,
    pub csids: Vec<ChangesetId>,
    pub dependencies: Vec<DeriveGraph>,
}

#[derive(Clone)]
pub struct DeriveGraph {
    inner: Arc<DeriveGraphInner>,
}

impl DeriveGraph {
    fn new(
        id: usize,
        deriver: Arc<dyn DerivedUtils>,
        csids: Vec<ChangesetId>,
        dependencies: Vec<DeriveGraph>,
    ) -> Self {
        let inner = DeriveGraphInner {
            id,
            deriver: Some(deriver),
            csids,
            dependencies,
        };
        Self {
            inner: Arc::new(inner),
        }
    }

    pub fn is_empty(&self) -> bool {
        self.size() == 0
    }

    /// Number of derivations to be carried out by this graph
    pub fn size(&self) -> usize {
        let mut stack = vec![self];
        let mut visited = HashSet::new();
        let mut count = 0;
        while let Some(node) = stack.pop() {
            count += node.csids.len();
            for dep in node.dependencies.iter() {
                if visited.insert(dep.id) {
                    stack.push(dep);
                }
            }
        }
        count
    }

    /// Derive all data in the graph
    pub async fn derive(
        &self,
        ctx: CoreContext,
        repo: BlobRepo,
        parallel: bool,
        gap_size: Option<usize>,
    ) -> Result<(), Error> {
        bounded_traversal::bounded_traversal_dag(
            100,
            self.clone(),
            |node| async move {
                let deps = node.dependencies.clone();
                Ok((node, deps))
            },
            move |node, _| {
                cloned!(ctx, repo);
                async move {
                    if let Some(deriver) = &node.deriver {
                        deriver
                            .backfill_batch_dangerous(
                                ctx.clone(),
                                repo,
                                node.csids.clone(),
                                parallel,
                                gap_size,
                            )
                            .await?;
                        if let (Some(first), Some(last)) = (node.csids.first(), node.csids.last()) {
                            slog::info!(
                                ctx.logger(),
                                "[{}:{}] count:{} start:{} end:{}",
                                deriver.name(),
                                node.id,
                                node.csids.len(),
                                first,
                                last
                            );
                        }
                    }
                    Ok::<_, Error>(())
                }
            },
        )
        .await?
        .ok_or_else(|| anyhow!("derive graph contains a cycle"))
    }

    // Render the derive graph as a digraph that can be drawn with graphviz,
    // for debugging purposes:
    // $ dot -Tpng <output> -o <image>
    pub fn digraph(&self, w: &mut dyn Write) -> std::io::Result<()> {
        writeln!(w, "digraph DeriveGraph {{")?;

        let mut stack = vec![self];
        let mut visited = HashSet::new();
        while let Some(node) = stack.pop() {
            let deriver = node.deriver.as_ref().map_or_else(|| "root", |d| d.name());
            writeln!(
                w,
                " {0} [label=\"[{1}] id:{0} csids:{2}\"]",
                node.id,
                deriver,
                node.csids.len()
            )?;
            for dep in node.dependencies.iter() {
                writeln!(w, " {} -> {}", node.id, dep.id)?;
                if visited.insert(dep.id) {
                    stack.push(dep);
                }
            }
        }

        writeln!(w, "}}")?;
        Ok(())
    }
}
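
// Illustrative use of `digraph` (the file name is arbitrary): write the DOT
// output to a file, then render it offline:
//
//     let mut out = std::fs::File::create("derive_graph.dot")?;
//     graph.digraph(&mut out)?;
//     // then: dot -Tpng derive_graph.dot -o derive_graph.png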

impl std::ops::Deref for DeriveGraph {
    type Target = DeriveGraphInner;

    fn deref(&self) -> &Self::Target {
        &self.inner
    }
}

impl PartialEq for DeriveGraph {
    fn eq(&self, other: &Self) -> bool {
        self.inner.id == other.inner.id
    }
}

impl Eq for DeriveGraph {}

impl Hash for DeriveGraph {
    fn hash<H: Hasher>(&self, state: &mut H) {
        self.inner.id.hash(state)
    }
}

/// Generate batched derivation graph
///
/// This function generates a graph of changeset batches with an associated
/// `DerivedUtils` for each batch, accounting for dependencies between
/// changesets and between derived data types.
///
/// NOTE: This function might take a very long time to run and consume a lot
/// of memory if a lot of underived changesets are present. Avoid direct use
/// of this function in mononoke.
pub async fn build_derive_graph(
    ctx: &CoreContext,
    repo: &BlobRepo,
    csids: Vec<ChangesetId>,
    mut derivers: Vec<Arc<dyn DerivedUtils>>,
    batch_size: usize,
    thin_out: ThinOut,
) -> Result<DeriveGraph, Error> {
    // Resolve derived data type dependencies, and require derivers for all
    // dependencies to be provided.
    derivers.sort_by_key(|d| DERIVED_DATA_ORDER.get(d.name()));
    let deriver_to_index: HashMap<_, _> = derivers
        .iter()
        .enumerate()
        .map(|(i, d)| (d.name(), i))
        .collect();
    let mut derivers_dependencies = Vec::new();
    derivers_dependencies.resize_with(derivers.len(), Vec::new);
    for (index, deriver) in derivers.iter().enumerate() {
        let dep_names = DERIVED_DATA_DEPS
            .get(deriver.name())
            .ok_or_else(|| anyhow!("unknown derived data type: {}", deriver.name()))?;
        for dep_name in dep_names {
            let dep_index = deriver_to_index.get(dep_name).ok_or_else(|| {
                anyhow!(
                    "{0} depends on {1}, but deriver for {1} is not provided",
                    deriver.name(),
                    dep_name,
                )
            })?;
            derivers_dependencies[index].push(dep_index);
        }
    }

    // find underived changesets
    let mut underived_to_derivers = HashMap::new();
    let mut underived_dag = HashMap::new();
    let mut underived_stream =
        find_underived_many(ctx.clone(), repo.clone(), csids, derivers.clone(), thin_out);
    let mut found_changesets = 0usize;
    let start = std::time::Instant::now();
    while let Some((csid, parents, derivers)) = underived_stream.try_next().await? {
        underived_dag.insert(csid, parents);
        underived_to_derivers.insert(csid, derivers);
        found_changesets += 1;
        if found_changesets % 1000 == 0 {
            slog::info!(
                ctx.logger(),
                "found changesets: {} {:.3}/s",
                found_changesets,
                found_changesets as f32 / start.elapsed().as_secs_f32(),
            );
        }
    }
    if found_changesets > 0 {
        slog::info!(
            ctx.logger(),
            "found changesets: {} {:.3}/s",
            found_changesets,
            found_changesets as f32 / start.elapsed().as_secs_f32(),
        );
    }

    // topologically sort changesets
    let underived_ordered = sort_topological(&underived_dag).expect("commit graph has cycles!");
    let underived_ordered: Vec<_> = underived_ordered
        .into_iter()
        // Note - sort_topological returns all nodes, including commits which were
        // already derived, i.e. sort_topological({"a" -> ["b"]}) returns ("a", "b").
        // The '.filter()' below removes ["b"]
        .filter(|csid| underived_dag.contains_key(csid))
        .collect();

    // Build the derive graph.
    // `nodes` keeps the most recent node for each derived data type. It is
    // important to note that derived data types are sorted in topological
    // order (each type can only depend on types with a smaller index in this
    // list); this is used while constructing the node for a given chunk of
    // changesets to create dependencies between different types.
    let mut nodes: Vec<Option<DeriveGraph>> = Vec::new();
    nodes.resize_with(derivers.len(), || None);
    let mut node_ids = 0;
    for csids in underived_ordered.chunks(batch_size) {
        // group csids by derivers
        let mut csids_by_deriver = Vec::new();
        csids_by_deriver.resize_with(derivers.len(), Vec::new);
        for csid in csids {
            match underived_to_derivers.get(csid) {
                None => continue,
                Some(csid_derivers) => {
                    for deriver in csid_derivers.iter() {
                        let index = deriver_to_index[deriver.name()];
                        csids_by_deriver[index].push(*csid);
                    }
                }
            }
        }

        // generate a node per deriver
        for (index, csids) in csids_by_deriver.into_iter().enumerate() {
            if csids.is_empty() {
                continue;
            }
            let mut dependencies = Vec::new();
            // add a dependency on the previous chunk associated with the same
            // derived data type
            dependencies.extend(nodes[index].clone());
            // add dependencies on derived types which are required for derivation
            // of the given type. `dep_index` is always less than `index` since
            // derived data types are topologically sorted.
            for dep_index in derivers_dependencies[index].iter() {
                dependencies.extend(nodes[**dep_index].clone());
            }
            // update the node associated with the current type of derived data
            let node = DeriveGraph::new(node_ids, derivers[index].clone(), csids, dependencies);
            node_ids += 1;
            nodes[index] = Some(node);
        }
    }

    let root = DeriveGraphInner {
        id: node_ids,
        deriver: None,
        csids: vec![],
        dependencies: nodes.into_iter().flatten().collect(),
    };
    Ok(DeriveGraph {
        inner: Arc::new(root),
    })
}
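
// Illustrative end-to-end flow (mirrors `test_build_derive_graph` below);
// `master` is an assumed starting changeset id:
//
//     let blame = derived_data_utils(&repo, "blame")?;
//     let unodes = derived_data_utils(&repo, "unodes")?;
//     let graph = build_derive_graph(
//         &ctx,
//         &repo,
//         vec![master],
//         vec![blame, unodes], // must include all dependencies of each type
//         20,                  // batch size
//         ThinOut::new_keep_all(),
//     )
//     .await?;
//     graph.derive(ctx, repo, /* parallel */ false, /* gap_size */ None).await?;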

// This structure is used to thin out some actions. Initially it allows all of
// them until `threshold` is reached, then 1/step is allowed until `threshold`
// elements are collected, then 1/step^2 is allowed, and so on.
#[derive(Clone, Copy)]
pub struct ThinOut {
    counter: f64,
    step: f64,
    threshold: f64,
    multiplier: f64,
}

impl ThinOut {
    pub fn new(threshold: f64, multiplier: f64) -> Self {
        Self {
            counter: 0.0,
            step: 1.0,
            threshold,
            multiplier,
        }
    }

    pub fn new_keep_all() -> Self {
        Self::new(1.0, 1.0)
    }

    pub fn check_and_update(&mut self) -> bool {
        self.counter += 1.0;
        if self.counter > self.step * self.threshold {
            self.step *= self.multiplier;
            self.counter = 1.0;
        }
        self.counter.rem_euclid(self.step) < 1.0
    }
}
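
// Illustrative behaviour (taken from `test_thin_out` below): with
// `ThinOut::new(3.0, 2.0)`, the accept/reject pattern over 40 calls is
//
//     XXX_X_X_X___X___X___X_______X_______X___
//
// i.e. the first three calls pass, then every 2nd, then every 4th, and so on.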

/// Find underived ancestors for many derived data types at once.
///
/// This function finds all underived ancestors, for the derived data types
/// specified by `derivers`, reachable from the provided set of changesets
/// `csids`. It also accepts a `thin_out` argument that can reduce the number
/// of lookups to the derived data mapping at the expense of overfetching
/// changesets, that is, it may return some changesets for which we already
/// have all the derived data.
///
/// Returns a `Stream` of (in no particular order)
///  - underived changeset id
///  - parents of this changeset
///  - derivers that should be used on this changeset
pub fn find_underived_many(
    ctx: CoreContext,
    repo: BlobRepo,
    csids: Vec<ChangesetId>,
    derivers: Vec<Arc<dyn DerivedUtils>>,
    thin_out: ThinOut,
) -> impl Stream<
    Item = Result<
        (
            ChangesetId,
            Vec<ChangesetId>,
            Arc<Vec<Arc<dyn DerivedUtils>>>,
        ),
        Error,
    >,
> {
    let derivers = Arc::new(derivers);
    let init: Vec<_> = csids
        .into_iter()
        .map(|csid| (csid, derivers.clone(), thin_out))
        .collect();

    let changeset_fetcher = repo.get_changeset_fetcher();
    let visited = Arc::new(Mutex::new(HashSet::new()));
    bounded_traversal::bounded_traversal_stream(100, init, {
        move |(csid, derivers, mut thin_out)| {
            cloned!(changeset_fetcher, visited, repo, ctx);
            async move {
                let derivers = if thin_out.check_and_update() {
                    let repo = &repo;
                    let ctx = &ctx;
                    // exclude derivers that are not applicable (already derived) for this changeset id
                    let derivers = derivers.iter().map(|deriver| async move {
                        if deriver
                            .pending(ctx.clone(), repo.clone(), vec![csid])
                            .await?
                            .is_empty()
                        {
                            Ok::<_, Error>(None)
                        } else {
                            Ok(Some(deriver.clone()))
                        }
                    });
                    let derivers = try_join_all(derivers)
                        .await?
                        .into_iter()
                        .filter_map(|v| v)
                        .collect::<Vec<_>>();
                    Arc::new(derivers)
                } else {
                    // Based on thin-out, we are not actually checking whether we
                    // need to derive data for this changeset, and instead do an
                    // unconditional unfold.
                    derivers
                };
                if derivers.is_empty() {
                    // all derived data has already been derived
                    Ok::<_, Error>((None, Vec::new()))
                } else {
                    let parents = changeset_fetcher.get_parents(ctx.clone(), csid).await?;
                    let dependencies: Vec<_> = parents
                        .iter()
                        .copied()
                        .filter_map(|p| {
                            if visited.with(|vs| vs.insert(p)) {
                                Some((p, derivers.clone(), thin_out))
                            } else {
                                None
                            }
                        })
                        .collect();
                    Ok((Some((csid, parents, derivers)), dependencies))
                }
            }
        }
    })
    .try_filter_map(future::ok)
}

#[cfg(test)]
mod tests {
    use super::*;
    use bookmarks::BookmarkName;
    use derived_data::BonsaiDerived;
    use fbinit::FacebookInit;
    use fixtures::merge_even;
    use maplit::{btreemap, hashset};
    use metaconfig_types::UnodeVersion;
    use std::{
        collections::BTreeMap,
        sync::atomic::{AtomicUsize, Ordering},
    };
    use tests_utils::drawdag::create_from_dag;

    // decompose graph into a map between node indices and a list of nodes
    fn derive_graph_unpack(node: &DeriveGraph) -> (BTreeMap<usize, Vec<usize>>, Vec<DeriveGraph>) {
        let mut graph = BTreeMap::new();
        let mut nodes = Vec::new();
        let mut stack = vec![node];
        while let Some(node) = stack.pop() {
            let mut deps = Vec::new();
            for dep in node.dependencies.iter() {
                deps.push(dep.id);
                if !graph.contains_key(&dep.id) {
                    stack.push(dep);
                }
            }
            graph.insert(node.id, deps);
            nodes.push(node.clone());
        }
        nodes.sort_by_key(|n| n.id);
        (graph, nodes)
    }

    #[fbinit::test]
    async fn test_build_derive_graph(fb: FacebookInit) -> Result<(), Error> {
        let ctx = CoreContext::test_mock(fb);
        let repo = merge_even::getrepo(fb).await;
        let thin_out = ThinOut::new_keep_all();
        let master = repo
            .get_bonsai_bookmark(ctx.clone(), &BookmarkName::new("master").unwrap())
            .await?
            .unwrap();
        let blame_deriver = derived_data_utils(&repo, "blame")?;
        let unodes_deriver = derived_data_utils(&repo, "unodes")?;

        // make sure we require all dependencies; blame depends on unodes
        let graph = build_derive_graph(
            &ctx,
            &repo,
            vec![master],
            vec![blame_deriver.clone()],
            2,
            thin_out,
        )
        .await;
        assert!(graph.is_err());

        let graph = build_derive_graph(
            &ctx,
            &repo,
            vec![master],
            vec![blame_deriver.clone(), unodes_deriver.clone()],
            2,
            thin_out,
        )
        .await?;
        assert_eq!(graph.size(), 16);
        let (graph_ids, nodes) = derive_graph_unpack(&graph);
        assert_eq!(
            graph_ids,
            btreemap! {
                0 => vec![],
                1 => vec![0],
                2 => vec![0],
                3 => vec![1, 2],
                4 => vec![2],
                5 => vec![3, 4],
                6 => vec![4],
                7 => vec![5, 6],
                8 => vec![6, 7],
            }
        );
        assert_eq!(nodes[0].deriver.as_ref().unwrap().name(), "unodes");
        assert_eq!(nodes[1].deriver.as_ref().unwrap().name(), "blame");
        assert_eq!(nodes[2].deriver.as_ref().unwrap().name(), "unodes");

        let graph = build_derive_graph(
            &ctx,
            &repo,
            vec![master],
            vec![blame_deriver.clone(), unodes_deriver.clone()],
            2,
            thin_out,
        )
        .await?;
        graph.derive(ctx.clone(), repo.clone(), false, None).await?;

        let graph = build_derive_graph(
            &ctx,
            &repo,
            vec![master],
            vec![blame_deriver, unodes_deriver],
            2,
            thin_out,
        )
        .await?;
        assert!(graph.is_empty());

        Ok::<_, Error>(())
    }

    #[test]
    fn test_thin_out() {
        let mut thin_out = ThinOut::new(3.0, 2.0);
        let result: String = (0..40)
            .map(|_| {
                if thin_out.check_and_update() {
                    "X"
                } else {
                    "_"
                }
            })
            .collect();
        assert_eq!("XXX_X_X_X___X___X___X_______X_______X___", result);

        let mut thin_out = ThinOut::new_keep_all();
        assert_eq!(40, (0..40).filter(|_| thin_out.check_and_update()).count());

        let mut thin_out = ThinOut::new(1000.0, 1.5);
        let count: usize = (0..10_000_000)
            .map(|_| if thin_out.check_and_update() { 1 } else { 0 })
            .sum();
        assert_eq!(count, 20988);
    }

    struct CountedDerivedUtils {
        deriver: Arc<dyn DerivedUtils>,
        count: Arc<AtomicUsize>,
    }

    impl CountedDerivedUtils {
        fn new(deriver: Arc<dyn DerivedUtils>) -> Self {
            Self {
                deriver,
                count: Arc::new(AtomicUsize::new(0)),
            }
        }

        fn count(&self) -> usize {
            self.count.load(Ordering::SeqCst)
        }
    }

    #[async_trait]
    impl DerivedUtils for CountedDerivedUtils {
        fn derive(
            &self,
            ctx: CoreContext,
            repo: BlobRepo,
            csid: ChangesetId,
        ) -> BoxFuture<'static, Result<String, Error>> {
            self.deriver.derive(ctx, repo, csid)
        }

        fn backfill_batch_dangerous(
            &self,
            ctx: CoreContext,
            repo: BlobRepo,
            csids: Vec<ChangesetId>,
            parallel: bool,
            gap_size: Option<usize>,
        ) -> BoxFuture<'static, Result<(), Error>> {
            self.deriver
                .backfill_batch_dangerous(ctx, repo, csids, parallel, gap_size)
        }

        async fn pending(
            &self,
            ctx: CoreContext,
            repo: BlobRepo,
            csids: Vec<ChangesetId>,
        ) -> Result<Vec<ChangesetId>, Error> {
            self.count.fetch_add(1, Ordering::SeqCst);
            self.deriver.pending(ctx, repo, csids).await
        }

        fn regenerate(&self, _csids: &Vec<ChangesetId>) {
            unimplemented!()
        }

        fn name(&self) -> &'static str {
            self.deriver.name()
        }

        async fn find_oldest_underived<'a>(
            &'a self,
            _ctx: &'a CoreContext,
            _repo: &'a BlobRepo,
            _csids: &'a Vec<ChangesetId>,
        ) -> Result<Option<BonsaiChangeset>, Error> {
            unimplemented!()
        }
    }

    #[fbinit::test]
    async fn test_find_underived_many(fb: FacebookInit) -> Result<(), Error> {
        let ctx = CoreContext::test_mock(fb);
        let repo = blobrepo_factory::new_memblob_empty(None)?;
        let dag = create_from_dag(&ctx, &repo, "A-B-C").await?;
        let a = *dag.get("A").unwrap();
        let b = *dag.get("B").unwrap();
        let c = *dag.get("C").unwrap();

        let thin_out = ThinOut::new_keep_all();
        let blame_deriver = derived_data_utils(&repo, "blame")?;
        let unodes_deriver = {
            let deriver = derived_data_utils(&repo, "unodes")?;
            Arc::new(CountedDerivedUtils::new(deriver))
        };

        let entries: BTreeMap<_, _> = find_underived_many(
            ctx.clone(),
            repo.clone(),
            vec![c],
            vec![blame_deriver.clone(), unodes_deriver.clone()],
            thin_out,
        )
        .map_ok(|(csid, _parents, derivers)| {
            let names: Vec<_> = derivers.iter().map(|d| d.name()).collect();
            (csid, names)
        })
        .try_collect()
        .await?;
        assert_eq!(unodes_deriver.count(), 3);
        assert_eq!(
            entries,
            btreemap! {
                a => vec!["blame", "unodes"],
                b => vec!["blame", "unodes"],
                c => vec!["blame", "unodes"],
            }
        );

        unodes_deriver.derive(ctx.clone(), repo.clone(), b).await?;
        blame_deriver.derive(ctx.clone(), repo.clone(), a).await?;

        let entries: BTreeMap<_, _> = find_underived_many(
            ctx.clone(),
            repo.clone(),
            vec![c],
            vec![blame_deriver.clone(), unodes_deriver.clone()],
            thin_out,
        )
        .map_ok(|(csid, _parents, derivers)| {
            let names: Vec<_> = derivers.iter().map(|d| d.name()).collect();
            (csid, names)
        })
        .try_collect()
        .await?;
        assert_eq!(unodes_deriver.count(), 5);
        assert_eq!(
            entries,
            btreemap! {
                b => vec!["blame"],
                c => vec!["blame", "unodes"],
            }
        );

        Ok::<_, Error>(())
    }

    #[fbinit::test]
    async fn multiple_independent_mappings(fb: FacebookInit) -> Result<(), Error> {
        let ctx = CoreContext::test_mock(fb);
        let repo = blobrepo_factory::new_memblob_empty(None)?;
        let dag = create_from_dag(&ctx, &repo, "A-B-C").await?;
        let a = *dag.get("A").unwrap();
        let b = *dag.get("B").unwrap();
        let c = *dag.get("C").unwrap();

        // Create utils for both versions of unodes.
        let utils_v1 = derived_data_utils_impl(
            &repo,
            "unodes",
            &DerivedDataTypesConfig {
                types: hashset! { String::from("unodes") },
                unode_version: UnodeVersion::V1,
                ..Default::default()
            },
        )?;

        let utils_v2 = derived_data_utils_impl(
            &repo,
            "unodes",
            &DerivedDataTypesConfig {
                types: hashset! { String::from("unodes") },
                unode_version: UnodeVersion::V2,
                ..Default::default()
            },
        )?;

        assert_eq!(
            utils_v1
                .find_oldest_underived(&ctx, &repo, &vec![c])
                .await?
                .unwrap()
                .get_changeset_id(),
            a
        );
        assert_eq!(
            utils_v2
                .find_oldest_underived(&ctx, &repo, &vec![c])
                .await?
                .unwrap()
                .get_changeset_id(),
            a
        );

        // Derive V1 of A using the V1 utils. V2 of A should still be underived.
        utils_v1.derive(ctx.clone(), repo.clone(), a).await?;
        assert_eq!(
            utils_v1
                .find_oldest_underived(&ctx, &repo, &vec![c])
                .await?
                .unwrap()
                .get_changeset_id(),
            b
        );
        assert_eq!(
            utils_v2
                .find_oldest_underived(&ctx, &repo, &vec![c])
                .await?
                .unwrap()
                .get_changeset_id(),
            a
        );

        // Derive B directly, which should use the V2 mapping, as that is the
        // version configured on the repo. V1 of B should still be underived.
        RootUnodeManifestId::derive(&ctx, &repo, b).await?;
        assert_eq!(
            utils_v1
                .find_oldest_underived(&ctx, &repo, &vec![c])
                .await?
                .unwrap()
                .get_changeset_id(),
            b
        );
        assert_eq!(
            utils_v2
                .find_oldest_underived(&ctx, &repo, &vec![c])
                .await?
                .unwrap()
                .get_changeset_id(),
            c
        );

        Ok(())
    }
}