backfill_derived_data mononoke: fail if derived data disabled

Summary:
Now let's fail if we try to derive data that's not enabled in the config.
Note that this diff also adds a `derive_unsafe()` method, which should be used in the backfiller.

Reviewed By: krallin

Differential Revision: D19807956

fbshipit-source-id: c39af8555164314cafa9691197629ab9ddb956f1
This commit is contained in:
Stanislau Hlebik 2020-02-16 04:54:41 -08:00 committed by Facebook Github Bot
parent bd0c0b89fb
commit ee7c0a8d26
11 changed files with 153 additions and 45 deletions

View File

@ -51,7 +51,7 @@ fn derive_data_for_csids(
let derivations = FuturesUnordered::new();
for data_type in derived_data_types {
let derived_utils = derived_data_utils(ctx.clone(), repo.clone(), data_type)?;
let derived_utils = derived_data_utils(repo.clone(), data_type)?;
derivations.push(
derived_utils

View File

@ -30,6 +30,7 @@ use filestore::FilestoreConfig;
use fsnodes::RootFsnodeId;
use futures::{future::IntoFuture, Future};
use futures_ext::{try_boxfuture, BoxFuture, FutureExt};
use git_types::TreeHandle;
use maplit::btreeset;
use memblob::EagerMemblob;
use metaconfig_types::{self, DerivedDataConfig, FilestoreParams, Redaction, StorageConfig};
@ -262,12 +263,13 @@ pub fn init_all_derived_data() -> DerivedDataConfig {
DerivedDataConfig {
scuba_table: None,
derived_data_types: btreeset! {
RootUnodeManifestId::NAME.to_string(),
BlameRoot::NAME.to_string(),
FilenodesOnlyPublic::NAME.to_string(),
RootFastlog::NAME.to_string(),
RootFsnodeId::NAME.to_string(),
BlameRoot::NAME.to_string(),
RootDeletedManifestId::NAME.to_string(),
FilenodesOnlyPublic::NAME.to_string(),
RootUnodeManifestId::NAME.to_string(),
TreeHandle::NAME.to_string(),
},
}
}

View File

@ -2049,3 +2049,16 @@ impl DangerousOverride<Arc<dyn Filenodes>> for BlobRepo {
}
}
}
/// Lets tests and tools replace this repo's `DerivedDataConfig` wholesale.
/// NOTE(review): "dangerous" presumably because it bypasses normal config
/// loading/validation — confirm against the other `DangerousOverride` impls.
impl DangerousOverride<DerivedDataConfig> for BlobRepo {
fn dangerous_override<F>(&self, modify: F) -> Self
where
F: FnOnce(DerivedDataConfig) -> DerivedDataConfig,
{
// Apply the caller's transformation to a clone of the current config.
let derived_data_config = modify(self.derived_data_config.clone());
// Struct-update clone: only the derived data config field changes.
BlobRepo {
derived_data_config,
..self.clone()
}
}
}

View File

@ -85,7 +85,7 @@ async fn check_derived_data_exists(
derived_data_type: String,
hashes_or_bookmarks: Vec<String>,
) -> Result<(), SubcommandError> {
let derived_utils = derived_data_utils(ctx.clone(), repo.clone(), derived_data_type)?;
let derived_utils = derived_data_utils(repo.clone(), derived_data_type)?;
let cs_id_futs: Vec<_> = hashes_or_bookmarks
.into_iter()

View File

@ -27,7 +27,7 @@ use context::CoreContext;
use dbbookmarks::SqlBookmarks;
use deleted_files_manifest::RootDeletedManifestId;
use derived_data::BonsaiDerived;
use derived_data_utils::{derived_data_utils, POSSIBLE_DERIVED_TYPES};
use derived_data_utils::{derived_data_utils, derived_data_utils_unsafe, POSSIBLE_DERIVED_TYPES};
use fastlog::{fetch_parent_root_unodes, RootFastlog};
use fbinit::FacebookInit;
use fsnodes::RootFsnodeId;
@ -391,8 +391,7 @@ fn subcommand_backfill<P: AsRef<Path>>(
regenerate: bool,
prefetched_commits_path: P,
) -> BoxFuture<(), Error> {
let derived_utils = try_boxfuture!(derived_data_utils(
ctx.clone(),
let derived_utils = try_boxfuture!(derived_data_utils_unsafe(
repo.clone(),
derived_data_type.clone(),
));
@ -589,7 +588,7 @@ fn subcommand_tail(
} else {
repo.clone()
};
let derive = derived_data_utils(ctx.clone(), repo.clone(), name)?;
let derive = derived_data_utils(repo.clone(), name)?;
let stats = stats_constructor(derive.name());
Ok((derive, maybe_unredacted_repo, Arc::new(stats)))
})
@ -689,7 +688,7 @@ fn subcommand_single(
derived_data_type: String,
) -> impl Future<Item = (), Error = Error> {
let repo = repo.dangerous_override(|_| Arc::new(DummyLease {}) as Arc<dyn LeaseOps>);
let derived_utils = match derived_data_utils(ctx.clone(), repo.clone(), derived_data_type) {
let derived_utils = match derived_data_utils(repo.clone(), derived_data_type) {
Ok(derived_utils) => derived_utils,
Err(error) => return future::err(error).left_future(),
};
@ -806,8 +805,7 @@ mod tests {
let maybe_bcs_id = runtime.block_on(repo.get_bonsai_from_hg(ctx.clone(), hg_cs_id))?;
let bcs_id = maybe_bcs_id.unwrap();
let derived_utils =
derived_data_utils(ctx.clone(), repo.clone(), RootUnodeManifestId::NAME)?;
let derived_utils = derived_data_utils(repo.clone(), RootUnodeManifestId::NAME)?;
runtime.block_on(derived_utils.derive_batch(ctx.clone(), repo.clone(), vec![bcs_id]))?;
Ok(())
@ -832,8 +830,7 @@ mod tests {
batch.push(maybe_bcs_id.unwrap());
}
let derived_utils =
derived_data_utils(ctx.clone(), repo.clone(), RootUnodeManifestId::NAME)?;
let derived_utils = derived_data_utils(repo.clone(), RootUnodeManifestId::NAME)?;
let pending =
runtime.block_on(derived_utils.pending(ctx.clone(), repo.clone(), batch.clone()))?;
assert_eq!(pending.len(), hg_cs_ids.len());
@ -863,8 +860,7 @@ mod tests {
runtime.block_on(repo.get_bonsai_from_hg(ctx.clone(), first_hg_cs_id))?;
let bcs_id = maybe_bcs_id.unwrap();
let derived_utils =
derived_data_utils(ctx.clone(), repo.clone(), RootUnodeManifestId::NAME)?;
let derived_utils = derived_data_utils(repo.clone(), RootUnodeManifestId::NAME)?;
let res =
runtime.block_on(derived_utils.derive_batch(ctx.clone(), repo.clone(), vec![bcs_id]));
// Deriving should fail because blobstore writes fail

View File

@ -5,8 +5,8 @@
* GNU General Public License version 2.
*/
use crate::{BonsaiDerived, BonsaiDerivedMapping};
use anyhow::Error;
use crate::{BonsaiDerived, BonsaiDerivedMapping, Mode};
use anyhow::{format_err, Error};
use blobrepo::BlobRepo;
use blobstore::Loadable;
use cloned::cloned;
@ -15,7 +15,7 @@ use futures::{
future::{self, Loop},
stream, Future, IntoFuture, Stream,
};
use futures_ext::{bounded_traversal, BoxFuture, FutureExt, StreamExt};
use futures_ext::{bounded_traversal, try_boxfuture, BoxFuture, FutureExt, StreamExt};
use futures_stats::Timed;
use lock_ext::LockExt;
use mononoke_types::ChangesetId;
@ -76,8 +76,9 @@ pub fn derive_impl<
repo: BlobRepo,
derived_mapping: Mapping,
start_csid: ChangesetId,
mode: Mode,
) -> impl Future<Item = Derived, Error = Error> {
find_underived(&ctx, &repo, &derived_mapping, &start_csid, None)
find_underived(&ctx, &repo, &derived_mapping, &start_csid, None, mode)
.map({
cloned!(ctx);
move |commits_not_derived_to_parents| {
@ -172,14 +173,20 @@ pub fn derive_impl<
.and_then(move |_| fetch_derived_may_panic(ctx, start_csid, derived_mapping))
}
fn log_if_disabled<Derived: BonsaiDerived>(repo: &BlobRepo) {
/// Returns an error if derived data type `Derived` is not listed in the
/// repo's derived data config; otherwise Ok(()).
fn fail_if_disabled<Derived: BonsaiDerived>(repo: &BlobRepo) -> Result<(), Error> {
if !repo
.get_derived_data_config()
.derived_data_types
.contains(Derived::NAME)
{
// Bump the stats counter so attempts to use a disabled type stay visible.
STATS::derived_data_disabled.add_value(1, (repo.get_repoid().id(), Derived::NAME));
return Err(format_err!(
"{} is not enabled for repo {}",
Derived::NAME,
repo.get_repoid()
));
}
Ok(())
}
pub(crate) fn find_underived<
@ -191,8 +198,12 @@ pub(crate) fn find_underived<
derived_mapping: &Mapping,
start_csid: &ChangesetId,
limit: Option<u64>,
mode: Mode,
) -> impl Future<Item = HashMap<ChangesetId, Vec<ChangesetId>>, Error = Error> {
log_if_disabled::<Derived>(repo);
if mode == Mode::OnlyIfEnabled {
try_boxfuture!(fail_if_disabled::<Derived>(repo));
}
let changeset_fetcher = repo.get_changeset_fetcher();
// This is necessary to avoid visiting the same commit a lot of times in mergy repos
let visited: Arc<Mutex<HashSet<ChangesetId>>> = Arc::new(Mutex::new(HashSet::new()));
@ -236,6 +247,7 @@ pub(crate) fn find_underived<
.traced(&ctx.trace(), "derive::find_dependencies", None)
.filter_map(|x| x)
.collect_to()
.boxify()
}
// Panics if any of the parents is not derived yet
@ -516,6 +528,7 @@ mod test {
use lock_ext::LockExt;
use maplit::hashmap;
use mercurial_types::HgChangesetId;
use metaconfig_types::DerivedDataConfig;
use mononoke_types::BonsaiChangeset;
use revset::AncestorsNodeStream;
use std::{
@ -613,6 +626,13 @@ mod test {
}
fn derive_for_master(runtime: &mut Runtime, ctx: CoreContext, repo: BlobRepo) {
let repo = repo.dangerous_override(|mut derived_data_config: DerivedDataConfig| {
derived_data_config
.derived_data_types
.insert(TestGenNum::NAME.to_string());
derived_data_config
});
let master_book = BookmarkName::new("master").unwrap();
let bcs_id = runtime
.block_on(repo.get_bonsai_bookmark(ctx.clone(), &master_book))
@ -653,12 +673,21 @@ mod test {
.unwrap();
}
/// Test fixture: the `linear` test repo with the `TestGenNum` derived data
/// type enabled in its config (required now that deriving a type missing
/// from the config fails).
fn init_linear(fb: FacebookInit) -> BlobRepo {
linear::getrepo(fb).dangerous_override(|mut derived_data_config: DerivedDataConfig| {
derived_data_config
.derived_data_types
.insert(TestGenNum::NAME.to_string());
derived_data_config
})
}
#[fbinit::test]
fn test_incomplete_maping(fb: FacebookInit) -> Result<(), Error> {
let ctx = CoreContext::test_mock(fb);
let mut runtime = Runtime::new()?;
let repo = linear::getrepo(fb);
let repo = init_linear(fb);
runtime.block_on(
async move {
// This is the parent of the root commit
@ -700,7 +729,7 @@ mod test {
let ctx = CoreContext::test_mock(fb);
let mut runtime = Runtime::new()?;
let repo = linear::getrepo(fb);
let repo = init_linear(fb);
runtime.block_on(
async move {
// This is the parent of the root commit
@ -782,9 +811,9 @@ mod test {
#[fbinit::test]
fn test_leases(fb: FacebookInit) -> Result<(), Error> {
let ctx = CoreContext::test_mock(fb);
let repo = linear::getrepo(fb);
let mapping = TestGenNum::mapping(&ctx, &repo);
let mut runtime = Runtime::new()?;
let repo = init_linear(fb);
let mapping = TestGenNum::mapping(&ctx, &repo);
let hg_csid = HgChangesetId::from_str("2d7d4ba9ce0a6ffd222de7785b249ead9c51c536")?;
let csid = runtime
@ -878,7 +907,7 @@ mod test {
fn test_always_failing_lease(fb: FacebookInit) -> Result<(), Error> {
let ctx = CoreContext::test_mock(fb);
let repo =
linear::getrepo(fb).dangerous_override(|_| Arc::new(FailingLease) as Arc<dyn LeaseOps>);
init_linear(fb).dangerous_override(|_| Arc::new(FailingLease) as Arc<dyn LeaseOps>);
let mapping = TestGenNum::mapping(&ctx, &repo);
let mut runtime = Runtime::new()?;

View File

@ -21,6 +21,15 @@ use std::{
pub mod derive_impl;
/// Controls whether derivation first checks that the derived data type is
/// enabled in the repo config.
#[derive(Copy, Clone, PartialEq, Eq)]
pub enum Mode {
/// This mode should almost always be preferred
OnlyIfEnabled,
/// This mode should rarely be used, perhaps only for backfilling type of derived data
/// which is not enabled in this repo yet
Unsafe,
}
/// Trait for the data that can be derived from bonsai changeset.
/// Examples of that are hg changeset id, unodes root manifest id, git changeset ids etc
pub trait BonsaiDerived: Sized + 'static + Send + Sync + Clone {
@ -53,12 +62,35 @@ pub trait BonsaiDerived: Sized + 'static + Send + Sync + Clone {
/// This function is the entrypoint for changeset derivation, it converts
/// bonsai representation to derived one by calling derive_from_parents(), and saves mapping
/// from csid -> BonsaiDerived in BonsaiDerivedMapping
///
/// This function fails immediately if this type of derived data is not enabled for this repo.
fn derive(ctx: CoreContext, repo: BlobRepo, csid: ChangesetId) -> BoxFuture<Self, Error> {
let mapping = Self::mapping(&ctx, &repo);
derive_impl::derive_impl::<Self, Self::Mapping>(ctx, repo, mapping, csid).boxify()
derive_impl::derive_impl::<Self, Self::Mapping>(
ctx,
repo,
mapping,
csid,
Mode::OnlyIfEnabled,
)
.boxify()
}
/// Like `derive`, but takes an explicit `Mode`. With `Mode::Unsafe` the data
/// is derived even if the type is disabled in the config; should normally be
/// used only for backfilling.
fn derive_with_mode(
ctx: CoreContext,
repo: BlobRepo,
csid: ChangesetId,
mode: Mode,
) -> BoxFuture<Self, Error> {
let mapping = Self::mapping(&ctx, &repo);
derive_impl::derive_impl::<Self, Self::Mapping>(ctx, repo, mapping, csid, mode).boxify()
}
/// Returns min(number of ancestors of `csid` to be derived, `limit`)
///
/// This function fails immediately if derived data is not enabled for this repo.
fn count_underived(
ctx: &CoreContext,
repo: &BlobRepo,
@ -66,9 +98,16 @@ pub trait BonsaiDerived: Sized + 'static + Send + Sync + Clone {
limit: u64,
) -> BoxFuture<u64, Error> {
let mapping = Self::mapping(&ctx, &repo);
derive_impl::find_underived::<Self, Self::Mapping>(ctx, repo, &mapping, csid, Some(limit))
.map(|underived| underived.len() as u64)
.boxify()
derive_impl::find_underived::<Self, Self::Mapping>(
ctx,
repo,
&mapping,
csid,
Some(limit),
Mode::OnlyIfEnabled,
)
.map(|underived| underived.len() as u64)
.boxify()
}
}

View File

@ -14,7 +14,8 @@ use cloned::cloned;
use context::CoreContext;
use deleted_files_manifest::{RootDeletedManifestId, RootDeletedManifestMapping};
use derived_data::{
derive_impl::derive_impl, BonsaiDerived, BonsaiDerivedMapping, RegenerateMapping,
derive_impl::derive_impl, BonsaiDerived, BonsaiDerivedMapping, Mode as DeriveMode,
RegenerateMapping,
};
use derived_data_filenodes::{FilenodesOnlyPublic, FilenodesOnlyPublicMapping};
use fastlog::{RootFastlog, RootFastlogMapping};
@ -73,12 +74,13 @@ pub trait DerivedUtils: Send + Sync + 'static {
#[derive(Clone)]
struct DerivedUtilsFromMapping<M> {
mapping: RegenerateMapping<M>,
mode: DeriveMode,
}
impl<M> DerivedUtilsFromMapping<M> {
fn new(mapping: M) -> Self {
fn new(mapping: M, mode: DeriveMode) -> Self {
let mapping = RegenerateMapping::new(mapping);
Self { mapping }
Self { mapping, mode }
}
}
@ -93,7 +95,7 @@ where
repo: BlobRepo,
csid: ChangesetId,
) -> BoxFuture<String, Error> {
<M::Value as BonsaiDerived>::derive(ctx.clone(), repo, csid)
<M::Value as BonsaiDerived>::derive_with_mode(ctx.clone(), repo, csid, self.mode)
.map(|result| format!("{:?}", result))
.boxify()
}
@ -132,6 +134,7 @@ where
repo.clone(),
in_memory_mapping.clone(),
csid,
DeriveMode::Unsafe,
)
.map(|_| ())
}
@ -236,38 +239,52 @@ where
}
pub fn derived_data_utils(
_ctx: CoreContext,
repo: BlobRepo,
name: impl AsRef<str>,
) -> Result<Arc<dyn DerivedUtils>, Error> {
derived_data_utils_impl(repo, name, DeriveMode::OnlyIfEnabled)
}
/// Same as `derived_data_utils`, but the returned utils derive with
/// `DeriveMode::Unsafe`, i.e. even when the requested type is not enabled in
/// the repo config. Intended for the backfiller only.
pub fn derived_data_utils_unsafe(
repo: BlobRepo,
name: impl AsRef<str>,
) -> Result<Arc<dyn DerivedUtils>, Error> {
derived_data_utils_impl(repo, name, DeriveMode::Unsafe)
}
fn derived_data_utils_impl(
repo: BlobRepo,
name: impl AsRef<str>,
mode: DeriveMode,
) -> Result<Arc<dyn DerivedUtils>, Error> {
match name.as_ref() {
RootUnodeManifestId::NAME => {
let mapping = RootUnodeManifestMapping::new(repo.get_blobstore());
Ok(Arc::new(DerivedUtilsFromMapping::new(mapping)))
Ok(Arc::new(DerivedUtilsFromMapping::new(mapping, mode)))
}
RootFastlog::NAME => {
let mapping = RootFastlogMapping::new(repo.get_blobstore().boxed());
Ok(Arc::new(DerivedUtilsFromMapping::new(mapping)))
Ok(Arc::new(DerivedUtilsFromMapping::new(mapping, mode)))
}
MappedHgChangesetId::NAME => {
let mapping = HgChangesetIdMapping::new(&repo);
Ok(Arc::new(DerivedUtilsFromMapping::new(mapping)))
Ok(Arc::new(DerivedUtilsFromMapping::new(mapping, mode)))
}
RootFsnodeId::NAME => {
let mapping = RootFsnodeMapping::new(repo.get_blobstore());
Ok(Arc::new(DerivedUtilsFromMapping::new(mapping)))
Ok(Arc::new(DerivedUtilsFromMapping::new(mapping, mode)))
}
BlameRoot::NAME => {
let mapping = BlameRootMapping::new(repo.get_blobstore().boxed());
Ok(Arc::new(DerivedUtilsFromMapping::new(mapping)))
Ok(Arc::new(DerivedUtilsFromMapping::new(mapping, mode)))
}
RootDeletedManifestId::NAME => {
let mapping = RootDeletedManifestMapping::new(repo.get_blobstore());
Ok(Arc::new(DerivedUtilsFromMapping::new(mapping)))
Ok(Arc::new(DerivedUtilsFromMapping::new(mapping, mode)))
}
FilenodesOnlyPublic::NAME => {
let mapping = FilenodesOnlyPublicMapping::new(repo);
Ok(Arc::new(DerivedUtilsFromMapping::new(mapping)))
Ok(Arc::new(DerivedUtilsFromMapping::new(mapping, mode)))
}
name => Err(format_err!("Unsupported derived data type: {}", name)),
}

View File

@ -725,6 +725,18 @@ allow_writes = ${INFINITEPUSH_ALLOW_WRITES:-true}
${namespace}
CONFIG
fi
if [[ -v ENABLED_DERIVED_DATA ]]; then
cat >> "repos/$reponame/server.toml" <<CONFIG
[derived_data_config]
derived_data_types = $ENABLED_DERIVED_DATA
CONFIG
else
cat >> "repos/$reponame/server.toml" <<CONFIG
[derived_data_config]
derived_data_types=["blame", "deleted_manifest", "fastlog", "filenodes", "fsnodes", "unodes"]
CONFIG
fi
}
function register_hook {

View File

@ -5,7 +5,7 @@
# directory of this source tree.
$ . "${TEST_FIXTURES}/library.sh"
$ setup_common_config "blob_files"
$ ENABLED_DERIVED_DATA='["git_trees", "filenodes"]' setup_common_config "blob_files"
$ GIT_REPO="${TESTTMP}/repo-git"
$ HG_REPO="${TESTTMP}/repo-hg"

View File

@ -5,7 +5,7 @@
# directory of this source tree.
$ . "${TEST_FIXTURES}/library.sh"
$ setup_common_config
$ ENABLED_DERIVED_DATA='["git_trees", "filenodes"]' setup_common_config
$ GIT_REPO="${TESTTMP}/repo-git"
$ HG_REPO="${TESTTMP}/repo-hg"