Update HgDatapackStore to handle duplicate tree requests

Summary:
As part of the project to remove HgImporter (aka `hg debugedenimporthelper`), it was discovered that we present duplicate keys to EdenAPI when making batched requests - https://fburl.com/scuba/edenfs_events/owb8b65e. This results in the requests being failed in the `backingstore` library (to avoid them being failed by Mononoke). When these batched requests fail, they fall back to using HgImporter and fetch each object one at a time.

This diff addresses the batching of trees. It creates a map from unique object IDs to vectors of requests; in most cases the vector contains exactly one request.
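
For orientation, the grouping structure can be sketched as below. The exact alias definitions are assumptions inferred from how the map is used in the diff, not the authoritative declarations:

    // Assumed aliases, inferred from usage; the real definitions live with
    // HgDatapackStore.
    using ImportRequestsList = std::vector<std::shared_ptr<HgImportRequest>>;
    using ImportRequestsMap = std::map<
        sapling::NodeId,
        std::pair<ImportRequestsList, RequestMetricsScope>>;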

With the help of chadaustin, we were able to root-cause why duplicate requests are submitted to EdenAPI. From Eden's POV, each individual file and directory is distinct and identified by its own hash. From Mononoke's POV, content (a tree or blob) that contains the exact same bytes is the same object regardless of where it lives in the repo. For this reason, it is possible (and somewhat common) for a batch submitted to EdenAPI to contain requests with the same proxy hash. The solution put in place here is the correct one: identify each set of duplicates, submit only one hash for the set, and fulfill all of the set's promises when the fetch completes.
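
Conceptually, the dedup-and-fan-out pattern looks like the following standalone sketch. It is a minimal illustration using folly::Promise; the names and the synchronous "fetch" are stand-ins, not the EdenFS implementation:

    #include <folly/futures/Promise.h>

    #include <map>
    #include <string>
    #include <utility>
    #include <vector>

    // Hypothetical, simplified illustration: group requests by key, perform
    // one fetch per unique key, then fulfill every caller's promise from that
    // single result.
    void fetchBatch(
        std::vector<std::pair<std::string, folly::Promise<std::string>>>&
            requests) {
      std::map<std::string, std::vector<folly::Promise<std::string>*>> byKey;
      for (auto& [key, promise] : requests) {
        byKey[key].push_back(&promise); // duplicate keys collapse onto one entry
      }
      for (auto& [key, promises] : byKey) {
        // Stand-in for the real (batched, asynchronous) EdenAPI fetch.
        std::string content = "content-for-" + key;
        for (auto* p : promises) {
          p->setValue(content); // all duplicates are satisfied by one fetch
        }
      }
    }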

This diff and subsequent diffs also increase the amount of duplicated code between getBlobBatch, getBlobMetadataBatch, and getTreeBatch. This will be addressed in later diffs - see T165043385.

Reviewed By: kmancini

Differential Revision: D49707255

fbshipit-source-id: f176b785e74e71a2d13865397e9023ce538e928c
John Elliott 2023-10-03 18:16:39 -07:00 committed by Facebook GitHub Bot
parent 18a8aec9d7
commit 6abee9e9bb

@@ -110,24 +110,66 @@ TreePtr fromRawTree(
} // namespace
void HgDatapackStore::getTreeBatch(const ImportRequestsList& importRequests) {
auto count = importRequests.size();
// TODO: extract each ClientRequestInfo from importRequests into a
// sapling::ClientRequestInfo and pass them with the corresponding
// sapling::NodeId
std::vector<sapling::NodeId> requests;
requests.reserve(count);
// Group requests by proxyHash to ensure no duplicates in fetch request to
// SaplingNativeBackingStore.
ImportRequestsMap importRequestsMap;
for (const auto& importRequest : importRequests) {
auto& proxyHash =
importRequest->getRequest<HgImportRequest::TreeImport>()->proxyHash;
requests.emplace_back(proxyHash.byteHash());
auto nodeId = importRequest->getRequest<HgImportRequest::TreeImport>()
->proxyHash.byteHash();
// Look for and log duplicates.
const auto& importRequestsEntry = importRequestsMap.find(nodeId);
if (importRequestsEntry != importRequestsMap.end()) {
XLOGF(DBG9, "Duplicate tree fetch request with proxyHash: {}", nodeId);
auto& importRequestList = importRequestsEntry->second.first;
// Only look for mismatched requests if logging level is high enough.
// Make sure this level is the same as the XLOGF_IF statement below.
if (XLOG_IS_ON(DBG9)) {
// Log requests that do not have the same hash (ObjectId).
// This happens when two paths (file or directory) have same content.
for (const auto& priorRequest : importRequestList) {
XLOGF_IF(
DBG9,
UNLIKELY(
priorRequest->getRequest<HgImportRequest::TreeImport>()
->hash !=
importRequest->getRequest<HgImportRequest::TreeImport>()
->hash),
"Tree requests have the same proxyHash (HgProxyHash) but different hash (ObjectId). "
"This should not happen. Previous request: proxyHash='{}', hash='{}'; "
"current request: proxyHash ='{}', hash='{}'.",
folly::hexlify(
priorRequest->getRequest<HgImportRequest::TreeImport>()
->proxyHash.getValue()),
priorRequest->getRequest<HgImportRequest::TreeImport>()
->hash.asHexString(),
folly::hexlify(
importRequest->getRequest<HgImportRequest::TreeImport>()
->proxyHash.getValue()),
importRequest->getRequest<HgImportRequest::TreeImport>()
->hash.asHexString());
}
}
importRequestList.emplace_back(importRequest);
} else {
std::vector<std::shared_ptr<HgImportRequest>> requests({importRequest});
importRequestsMap.emplace(
nodeId, make_pair(requests, &liveBatchedTreeWatches_));
}
}
std::vector<RequestMetricsScope> requestsWatches;
requestsWatches.reserve(count);
for (auto i = 0ul; i < count; i++) {
requestsWatches.emplace_back(&liveBatchedTreeWatches_);
}
// Indexable vector of nodeIds - required by SaplingNativeBackingStore API.
std::vector<sapling::NodeId> requests;
requests.reserve(importRequestsMap.size());
std::transform(
importRequestsMap.begin(),
importRequestsMap.end(),
std::back_inserter(requests),
[](auto& pair) { return pair.first; });
auto hgObjectIdFormat = config_->getEdenConfig()->hgObjectIdFormat.getValue();
const auto& filteredPaths =
@@ -150,19 +192,19 @@ void HgDatapackStore::getTreeBatch(const ImportRequestsList& importRequests) {
// If we're falling back, the caller will fulfill this Promise with a
// tree from HgImporter.
// TODO: Remove this.
return;
}
XLOGF(DBG4, "Imported tree node={}", folly::hexlify(requests[index]));
auto& importRequest = importRequests[index];
XLOGF(DBG9, "Imported tree node={}", folly::hexlify(requests[index]));
const auto& nodeId = requests[index];
auto& [importRequestList, watch] = importRequestsMap[nodeId];
for (auto& importRequest : importRequestList) {
auto* treeRequest =
importRequest->getRequest<HgImportRequest::TreeImport>();
// A proposed folly::Try::and_then would make the following much
// simpler.
importRequest->getPromise<TreePtr>()->setWith(
[&]() -> folly::Try<TreePtr> {
if (content.hasException()) {
return folly::Try<TreePtr>{std::move(content).exception()};
return folly::Try<TreePtr>{content.exception()};
}
return folly::Try{fromRawTree(
content.value().get(),
@@ -171,9 +213,10 @@ void HgDatapackStore::getTreeBatch(const ImportRequestsList& importRequests) {
hgObjectIdFormat,
filteredPaths)};
});
}
// Make sure that we're stopping this watch.
requestsWatches[index].reset();
watch.reset();
});
}
@@ -260,7 +303,7 @@ void HgDatapackStore::getBlobBatch(const ImportRequestsList& importRequests) {
->hash !=
importRequest->getRequest<HgImportRequest::BlobImport>()
->hash),
"Requests have the same proxyHash (HgProxyHash) but different hash (ObjectId). "
"Blob requests have the same proxyHash (HgProxyHash) but different hash (ObjectId). "
"This should not happen. Previous request: proxyHash='{}', hash='{}'; "
"current request: proxyHash ='{}', hash='{}'.",
folly::hexlify(
@@ -311,7 +354,7 @@ void HgDatapackStore::getBlobBatch(const ImportRequestsList& importRequests) {
return;
}
XLOGF(DBG9, "Imported node={}", folly::hexlify(requests[index]));
XLOGF(DBG9, "Imported blob node={}", folly::hexlify(requests[index]));
const auto& nodeId = requests[index];
auto& [importRequestList, watch] = importRequestsMap[nodeId];
auto result = content.hasException()