sapling/eden/fs/store/ObjectStore.cpp
Andrey Chursin 3051184ce0 do not recompute metadata if using aux data from hg
Summary: Sha1 computation shows up in many profiles; we can skip this step entirely if we are using metadata from hg.

Reviewed By: xavierd

Differential Revision: D32396312

fbshipit-source-id: 928f8458a91bba4d18da6730d3fc244eb5499489
2021-11-15 13:56:47 -08:00


/*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This software may be used and distributed according to the terms of the
* GNU General Public License version 2.
*/
#include "ObjectStore.h"
#include <folly/Conv.h>
#include <folly/Executor.h>
#include <folly/futures/Future.h>
#include <folly/io/IOBuf.h>
#include <stdexcept>
#include "eden/fs/model/Blob.h"
#include "eden/fs/model/Tree.h"
#include "eden/fs/store/BackingStore.h"
#include "eden/fs/store/LocalStore.h"
#include "eden/fs/store/ObjectFetchContext.h"
#include "eden/fs/telemetry/EdenStats.h"
#include "eden/fs/utils/ImmediateFuture.h"
using folly::Future;
using folly::makeFuture;
using std::shared_ptr;
using std::string;
using std::unique_ptr;
namespace facebook::eden {
namespace {
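// How much to lower a fetch's priority for processes that have crossed the
// fetch-heavy threshold (see deprioritizeWhenFetchHeavy below).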
constexpr uint64_t kImportPriorityDeprioritizeAmount = 1;
}
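// Factory function: ObjectStore is always handed out via std::shared_ptr
// because several methods below rely on shared_from_this().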
std::shared_ptr<ObjectStore> ObjectStore::create(
shared_ptr<LocalStore> localStore,
shared_ptr<BackingStore> backingStore,
shared_ptr<TreeCache> treeCache,
shared_ptr<EdenStats> stats,
folly::Executor::KeepAlive<folly::Executor> executor,
std::shared_ptr<ProcessNameCache> processNameCache,
std::shared_ptr<StructuredLogger> structuredLogger,
std::shared_ptr<const EdenConfig> edenConfig) {
return std::shared_ptr<ObjectStore>{new ObjectStore{
std::move(localStore),
std::move(backingStore),
std::move(treeCache),
std::move(stats),
executor,
processNameCache,
structuredLogger,
edenConfig}};
}
ObjectStore::ObjectStore(
shared_ptr<LocalStore> localStore,
shared_ptr<BackingStore> backingStore,
shared_ptr<TreeCache> treeCache,
shared_ptr<EdenStats> stats,
folly::Executor::KeepAlive<folly::Executor> executor,
std::shared_ptr<ProcessNameCache> processNameCache,
std::shared_ptr<StructuredLogger> structuredLogger,
std::shared_ptr<const EdenConfig> edenConfig)
: metadataCache_{folly::in_place, kCacheSize},
treeCache_{std::move(treeCache)},
localStore_{std::move(localStore)},
backingStore_{std::move(backingStore)},
stats_{std::move(stats)},
executor_{executor},
pidFetchCounts_{std::make_unique<PidFetchCounts>()},
processNameCache_(processNameCache),
structuredLogger_(structuredLogger),
edenConfig_(edenConfig) {}
ObjectStore::~ObjectStore() {}
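// Record a fetch attributed to the client process (when known) and emit a
// FetchHeavy log event each time its fetch count crosses a multiple of the
// configured fetchHeavyThreshold.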
void ObjectStore::updateProcessFetch(
const ObjectFetchContext& fetchContext) const {
if (auto pid = fetchContext.getClientPid()) {
auto fetch_count = pidFetchCounts_->recordProcessFetch(pid.value());
auto threshold = edenConfig_->fetchHeavyThreshold.getValue();
if (fetch_count && threshold && !(fetch_count % threshold)) {
sendFetchHeavyEvent(pid.value(), fetch_count);
}
}
}
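// Log a FetchHeavy structured event for the given process, if its name can
// still be resolved.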
void ObjectStore::sendFetchHeavyEvent(pid_t pid, uint64_t fetch_count) const {
auto processName = processNameCache_->getSpacedProcessName(pid);
if (processName.has_value()) {
structuredLogger_->logEvent(
FetchHeavy{processName.value(), pid, fetch_count});
}
}
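// Lower the priority of this fetch if the requesting process has already
// reached the fetch-heavy threshold, so fetch-heavy processes yield to other
// clients.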
void ObjectStore::deprioritizeWhenFetchHeavy(
ObjectFetchContext& context) const {
auto pid = context.getClientPid();
if (pid.has_value()) {
auto fetch_count = pidFetchCounts_->getCountByPid(pid.value());
auto threshold = edenConfig_->fetchHeavyThreshold.getValue();
if (threshold && fetch_count >= threshold) {
context.deprioritize(kImportPriorityDeprioritizeAmount);
}
}
}
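// RootId formats are specific to each BackingStore implementation, so parsing
// and rendering are delegated to it.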
RootId ObjectStore::parseRootId(folly::StringPiece rootId) {
return backingStore_->parseRootId(rootId);
}
std::string ObjectStore::renderRootId(const RootId& rootId) {
return backingStore_->renderRootId(rootId);
}
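// Fetch the root tree for a commit from the BackingStore and insert it into
// the in-memory TreeCache on success.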
Future<shared_ptr<const Tree>> ObjectStore::getRootTree(
const RootId& rootId,
ObjectFetchContext& context) const {
XLOG(DBG3) << "getRootTree(" << rootId << ")";
return backingStore_->getRootTree(rootId, context)
.via(executor_)
.thenValue(
[treeCache = treeCache_, rootId](std::shared_ptr<const Tree> tree) {
if (!tree) {
throw std::domain_error(
folly::to<string>("unable to import root ", rootId));
}
treeCache->insert(tree);
return tree;
});
}
folly::Future<std::shared_ptr<TreeEntry>> ObjectStore::getTreeEntryForRootId(
const RootId& rootId,
facebook::eden::TreeEntryType treeEntryType,
facebook::eden::PathComponentPiece pathComponentPiece,
ObjectFetchContext& context) const {
XLOG(DBG3) << "getTreeEntryForRootId(" << rootId << ")";
return backingStore_
->getTreeEntryForRootId(
rootId, treeEntryType, pathComponentPiece, context)
.via(executor_);
}
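// Fetch a tree by ObjectId: consult the in-memory TreeCache first, then fall
// back to the BackingStore, caching the result on success.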
ImmediateFuture<shared_ptr<const Tree>> ObjectStore::getTree(
const ObjectId& id,
ObjectFetchContext& fetchContext) const {
// Check the in-memory TreeCache first.
// TODO: We should consider checking if we have in-flight BackingStore
// requests on this layer instead of only in the BackingStore. Consider the
// case in which thread A and thread B both request a Tree at the same time.
// Let's say thread A checks the cache, then thread B checks the cache,
// fetches the Tree from the BackingStore (making a request to the server),
// and then caches it. Now when thread A checks for in-flight requests in the
// BackingStore, it will not see any since thread B has completely finished,
// so thread A will make a duplicate request. If we were to mark here that we
// got a request on this layer, then we could avoid that case.
if (auto maybeTree = treeCache_->get(id)) {
fetchContext.didFetch(
ObjectFetchContext::Tree, id, ObjectFetchContext::FromMemoryCache);
updateProcessFetch(fetchContext);
return maybeTree;
}
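// Not in the in-memory cache, so this lookup will go to the BackingStore;
// deprioritize fetch-heavy processes before issuing the request.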
deprioritizeWhenFetchHeavy(fetchContext);
return backingStore_->getTree(id, fetchContext)
.via(executor_)
.thenValue([self = shared_from_this(), id, &fetchContext](
BackingStore::GetTreeRes result) {
if (!result.tree) {
// TODO: Perhaps we should do some short-term negative
// caching?
XLOG(DBG2) << "unable to find tree " << id;
throw std::domain_error(fmt::format("tree {} not found", id));
}
// promote to shared_ptr so we can store in the cache and return
auto sharedTree = std::shared_ptr<const Tree>(std::move(result.tree));
self->treeCache_->insert(sharedTree);
fetchContext.didFetch(ObjectFetchContext::Tree, id, result.origin);
self->updateProcessFetch(fetchContext);
return sharedTree;
})
.semi();
}
folly::Future<folly::Unit> ObjectStore::prefetchBlobs(
ObjectIdRange ids,
ObjectFetchContext& fetchContext) const {
// In theory we could/should ask the localStore_ to filter the list
// of ids down to just the set that we need to load, but there is no
// bulk key existence check in rocksdb, so we would need to cause it
// to load all the blocks of those keys into memory.
// So for the moment we are committing a layering violation in the
// interest of making things faster in practice by just asking the
// mercurial backing store to ensure that its local hgcache storage
// has entries for all of the requested keys.
if (ids.empty()) {
return folly::unit;
}
return backingStore_->prefetchBlobs(ids, fetchContext).via(executor_);
}
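// Fetch a blob from the BackingStore. Unless aux metadata from Mercurial is
// in use, also compute and store the blob's metadata (SHA-1 and size) in the
// LocalStore as a side effect.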
Future<shared_ptr<const Blob>> ObjectStore::getBlob(
const ObjectId& id,
ObjectFetchContext& fetchContext) const {
deprioritizeWhenFetchHeavy(fetchContext);
return backingStore_->getBlob(id, fetchContext)
.via(executor_)
.thenValue([self = shared_from_this(), id, &fetchContext](
BackingStore::GetBlobRes result) {
if (!result.blob) {
// TODO: Perhaps we should do some short-term negative caching?
XLOG(DBG2) << "unable to find blob " << id;
throw std::domain_error(fmt::format("blob {} not found", id));
}
// Check the in-memory cache first, before doing expensive work: if the
// metadata is already present in the in-memory cache, it almost certainly
// exists in the local store too.
// Additionally, if we are using aux metadata from Mercurial, skip computing
// blob metadata here entirely.
if (!self->edenConfig_->useAuxMetadata.getValue() &&
!self->metadataCache_.rlock()->exists(id)) {
auto metadata =
self->localStore_->putBlobMetadata(id, result.blob.get());
self->metadataCache_.wlock()->set(id, metadata);
}
self->updateProcessFetch(fetchContext);
fetchContext.didFetch(ObjectFetchContext::Blob, id, result.origin);
return std::move(result.blob);
});
}
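// Look up blob metadata (SHA-1 and size), trying progressively slower
// sources: the in-memory metadataCache_, hg aux data (when useAuxMetadata is
// enabled), the LocalStore, and finally the BackingStore, which requires
// fetching the whole blob and computing the metadata from it.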
ImmediateFuture<BlobMetadata> ObjectStore::getBlobMetadata(
const ObjectId& id,
ObjectFetchContext& context) const {
// Check in-memory cache
{
auto metadataCache = metadataCache_.wlock();
auto cacheIter = metadataCache->find(id);
if (cacheIter != metadataCache->end()) {
stats_->getObjectStoreStatsForCurrentThread()
.getBlobMetadataFromMemory.addValue(1);
context.didFetch(
ObjectFetchContext::BlobMetadata,
id,
ObjectFetchContext::FromMemoryCache);
updateProcessFetch(context);
return cacheIter->second;
}
}
if (edenConfig_->useAuxMetadata.getValue()) {
// if configured, check hg cache for aux metadata
auto localMetadata = backingStore_->getLocalBlobMetadata(id, context);
if (localMetadata) {
metadataCache_.wlock()->set(id, *localMetadata);
context.didFetch(
ObjectFetchContext::BlobMetadata,
id,
ObjectFetchContext::FromDiskCache);
updateProcessFetch(context);
return *localMetadata;
}
}
auto self = shared_from_this();
// Check local store
return localStore_->getBlobMetadata(id)
.thenValue([self, id, &context](std::optional<BlobMetadata>&& metadata) {
if (metadata) {
self->stats_->getObjectStoreStatsForCurrentThread()
.getBlobMetadataFromLocalStore.addValue(1);
self->metadataCache_.wlock()->set(id, *metadata);
context.didFetch(
ObjectFetchContext::BlobMetadata,
id,
ObjectFetchContext::FromDiskCache);
self->updateProcessFetch(context);
return makeFuture(*metadata);
}
self->deprioritizeWhenFetchHeavy(context);
// Check backing store
//
// TODO: It would be nice to add a smarter API to the BackingStore so
// that we can query it just for the blob metadata if it supports
// getting that without retrieving the full blob data.
//
// TODO: This should probably check the LocalStore for the blob first,
// especially when we begin to expire entries in RocksDB.
return self->backingStore_->getBlob(id, context)
.via(self->executor_)
.thenValue([self, id, &context](BackingStore::GetBlobRes result) {
if (result.blob) {
self->stats_->getObjectStoreStatsForCurrentThread()
.getBlobMetadataFromBackingStore.addValue(1);
self->localStore_->putBlob(id, result.blob.get());
auto metadata =
self->localStore_->putBlobMetadata(id, result.blob.get());
self->metadataCache_.wlock()->set(id, metadata);
// I could see an argument for recording this fetch with
// type Blob instead of BlobMetadata, but it's probably more
// useful in context to know how many metadata fetches
// occurred. Also, since backing stores don't directly
// support fetching metadata, it should be clear.
context.didFetch(
ObjectFetchContext::BlobMetadata, id, result.origin);
self->updateProcessFetch(context);
return makeFuture(metadata);
}
throw std::domain_error(fmt::format("blob {} not found", id));
});
})
.semi();
}
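// getBlobSha1() and getBlobSize() are thin wrappers that project a single
// field out of getBlobMetadata().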
ImmediateFuture<Hash20> ObjectStore::getBlobSha1(
const ObjectId& id,
ObjectFetchContext& context) const {
return getBlobMetadata(id, context)
.thenValue([](const BlobMetadata& metadata) { return metadata.sha1; });
}
ImmediateFuture<uint64_t> ObjectStore::getBlobSize(
const ObjectId& id,
ObjectFetchContext& context) const {
return getBlobMetadata(id, context)
.thenValue([](const BlobMetadata& metadata) { return metadata.size; });
}
} // namespace facebook::eden