sapling/eden/fs/store/Diff.cpp
Xavier Deguillard d80d66bdd2 store: split loadGitIgnoreThenDiffTrees
Summary:
One of the unfortunate behaviors of this function is that it forces inodes to be
loaded when calling getLoadFileContentsFromPath. As we now know, loading inodes
is very expensive on some platforms and thus we should make sure to limit it.
In this case, we should be able to read the .gitignore file directly from the
ObjectStore without needing to load inodes. Splitting the function will help in
making this happen.

Reviewed By: kmancini

Differential Revision: D36363213

fbshipit-source-id: 6bb106cee4e77a33fc922c5e4f23215298f9d2cf
2022-06-21 17:27:16 -07:00

544 lines
19 KiB
C++

/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
*
* This software may be used and distributed according to the terms of the
* GNU General Public License version 2.
*/
#include "eden/fs/store/Diff.h"
#include <folly/Portability.h>
#include <folly/Synchronized.h>
#include <folly/futures/Future.h>
#include <folly/logging/xlog.h>
#include <memory>
#include <vector>
#include "eden/fs/model/Tree.h"
#include "eden/fs/model/TreeEntry.h"
#include "eden/fs/model/git/GitIgnoreStack.h"
#include "eden/fs/store/DiffContext.h"
#include "eden/fs/store/ObjectStore.h"
#include "eden/fs/store/ScmStatusDiffCallback.h"
#include "eden/fs/utils/Future.h"
#include "eden/fs/utils/ImmediateFuture.h"
#include "eden/fs/utils/PathFuncs.h"
using folly::Future;
using folly::makeFuture;
using folly::Try;
using folly::Unit;
using std::make_unique;
using std::vector;
namespace facebook::eden {
/*
* In practice, while the functions in this file are comparing two source
* control Tree objects, they are used for comparing the current
* (non-materialized) working directory state (as wdTree) to its corresponding
* source control state (as scmTree).
*/
namespace {
/**
 * Pending child diff operations, stored as two parallel vectors: futures[i] is
 * the outstanding work for the entry located at paths[i].
 */
struct ChildFutures {
  vector<RelativePath> paths;
  vector<Future<Unit>> futures;

  // Record one pending operation together with the path it belongs to.
  void add(RelativePath&& path, Future<Unit>&& future) {
    paths.push_back(std::move(path));
    futures.push_back(std::move(future));
  }
};
// Name of the ignore file that diffTrees() looks up in a working directory
// tree before diffing that tree's entries.
static constexpr PathComponentPiece kIgnoreFilename{".gitignore"};
/**
 * Process a TreeEntry that is present only on the source control side of the
 * diff, i.e. an entry that no longer exists in the working directory tree.
 *
 * The entry itself is always reported through the removedPath() callback.
 * If the entry is a tree, a diffRemovedTree() operation is started for it and
 * the resulting future is added to childFutures.
 */
void processRemovedSide(
    DiffContext* context,
    ChildFutures& childFutures,
    RelativePathPiece currentPath,
    const Tree::value_type& scmEntry) {
  // Build the full path once and reuse it for both the callback and the
  // recursive diff, rather than concatenating it twice for directories.
  auto entryPath = currentPath + scmEntry.first;
  context->callback->removedPath(entryPath, scmEntry.second.getDType());
  if (!scmEntry.second.isTree()) {
    return;
  }
  auto childFuture =
      diffRemovedTree(context, entryPath, scmEntry.second.getHash());
  childFutures.add(std::move(entryPath), std::move(childFuture));
}
/**
 * Process a TreeEntry that is present only on the working directory side of
 * the diff.
 *
 * The entry is reported as added or ignored depending on its ignore status.
 * If it is a tree whose contents still need to be listed, a diffAddedTree()
 * operation is started and its future is added to childFutures.
 */
void processAddedSide(
    DiffContext* context,
    ChildFutures& childFutures,
    RelativePathPiece currentPath,
    const Tree::value_type& wdEntry,
    const GitIgnoreStack* ignore,
    bool isIgnored) {
  const auto& entry = wdEntry.second;
  const bool entryIsTree = entry.isTree();
  auto entryPath = currentPath + wdEntry.first;

  // Entries inside an already ignored directory stay ignored; otherwise
  // consult the ignore rules, when there are any.
  bool entryIgnored = isIgnored;
  if (ignore && !isIgnored) {
    const auto status = ignore->match(
        entryPath, entryIsTree ? GitIgnore::TYPE_DIR : GitIgnore::TYPE_FILE);
    if (status == GitIgnore::HIDDEN) {
      // Hidden entries are skipped entirely.
      // This is used for reserved directories like .hg and .eden
      return;
    }
    entryIgnored = (status == GitIgnore::EXCLUDE);
  }

  // Ignored entries are only reported when the caller asked for them.
  if (entryIgnored) {
    if (context->listIgnored) {
      context->callback->ignoredPath(entryPath, entry.getDType());
    }
  } else {
    context->callback->addedPath(entryPath, entry.getDType());
  }

  if (!entryIsTree) {
    return;
  }
  if (entryIgnored && !context->listIgnored) {
    // Nothing below an ignored directory would be reported.
    return;
  }
  auto childFuture =
      diffAddedTree(context, entryPath, entry.getHash(), ignore, entryIgnored);
  childFutures.add(std::move(entryPath), std::move(childFuture));
}
/**
* Process TreeEntry objects that exist on both sides of the diff.
*
* Four cases are handled, depending on whether each side is a tree:
* - tree-to-tree: recurse with diffTrees() unless both hashes are equal.
* - tree-to-file: report the file as added (or ignored) and the tree as removed.
* - file-to-tree: report the file as removed and the tree as added.
* - file-to-file: report a modification if the entry types differ, otherwise
*   compare the SHA-1 of both blobs asynchronously.
*
* Any asynchronous work that is started is added to childFutures.
*/
void processBothPresent(
DiffContext* context,
ChildFutures& childFutures,
RelativePathPiece currentPath,
const Tree::value_type& scmEntry,
const Tree::value_type& wdEntry,
const GitIgnoreStack* ignore,
bool isIgnored) {
bool entryIgnored = isIgnored;
auto entryPath = currentPath + scmEntry.first;
bool isTreeSCM = scmEntry.second.isTree();
bool isTreeWD = wdEntry.second.isTree();
// If wdEntry and scmEntry are both files (or symlinks) then we don't need
// to bother computing the ignore status: the file is explicitly tracked in
// source control, so we should report its status even if it would normally
// be ignored.
if (!isIgnored && (isTreeWD || isTreeSCM) && ignore) {
auto fileType = isTreeWD ? GitIgnore::TYPE_DIR : GitIgnore::TYPE_FILE;
auto ignoreStatus = ignore->match(entryPath, fileType);
if (ignoreStatus == GitIgnore::HIDDEN) {
// This is rather unexpected. We don't expect to find entries in
// source control using reserved hidden names.
// Treat this as ignored for now.
entryIgnored = true;
} else if (ignoreStatus == GitIgnore::EXCLUDE) {
entryIgnored = true;
} else {
entryIgnored = false;
}
}
if (isTreeSCM) {
if (isTreeWD) {
// tree-to-tree diff
XDCHECK_EQ(scmEntry.second.getType(), wdEntry.second.getType());
// Identical hashes mean there is nothing to report below this directory.
if (scmEntry.second.getHash() == wdEntry.second.getHash()) {
return;
}
context->callback->modifiedPath(entryPath, wdEntry.second.getDType());
auto childFuture = diffTrees(
context,
entryPath,
scmEntry.second.getHash(),
wdEntry.second.getHash(),
ignore,
entryIgnored);
childFutures.add(std::move(entryPath), std::move(childFuture));
} else {
// tree-to-file
// Add an ADDED entry for this path and a removal of the directory
if (entryIgnored) {
if (context->listIgnored) {
context->callback->ignoredPath(entryPath, wdEntry.second.getDType());
}
} else {
context->callback->addedPath(entryPath, wdEntry.second.getDType());
}
// Report everything in scmTree as REMOVED
context->callback->removedPath(entryPath, scmEntry.second.getDType());
auto childFuture =
diffRemovedTree(context, entryPath, scmEntry.second.getHash());
childFutures.add(std::move(entryPath), std::move(childFuture));
}
} else {
if (isTreeWD) {
// file-to-tree
// Add a REMOVED entry for this path
context->callback->removedPath(entryPath, scmEntry.second.getDType());
// Report everything in wdEntry as ADDED
// NOTE(review): the directory itself is reported as added even when
// entryIgnored is true; entryIgnored is only forwarded to diffAddedTree.
context->callback->addedPath(entryPath, wdEntry.second.getDType());
auto childFuture = diffAddedTree(
context, entryPath, wdEntry.second.getHash(), ignore, entryIgnored);
childFutures.add(std::move(entryPath), std::move(childFuture));
} else {
// file-to-file diff
// Even if blobs have different hashes, they could have the same contents.
// For example, if between the two revisions being compared, if a file was
// changed and then later reverted. In that case, the contents would be
// the same but the blobs would have different hashes
// If the types are different, then this entry is definitely modified
if (scmEntry.second.getType() != wdEntry.second.getType()) {
context->callback->modifiedPath(entryPath, wdEntry.second.getDType());
} else {
// If Mercurial eventually switches to using blob IDs that are solely
// based on the file contents (as opposed to file contents + history)
// then we could drop this extra load of the blob SHA-1, and rely only
// on the blob ID comparison instead.
// NOTE(review): scmEntry and wdEntry are captured by reference below;
// this presumably relies on makeFutureWith invoking the lambda before
// this function returns -- confirm. The continuation only captures
// values (its own copy of the path, context and the dtype).
auto compareEntryContents =
folly::makeFutureWith([context,
entryPath = currentPath + scmEntry.first,
&scmEntry,
&wdEntry] {
auto scmFuture = context->store->getBlobSha1(
scmEntry.second.getHash(), context->getFetchContext());
auto wdFuture = context->store->getBlobSha1(
wdEntry.second.getHash(), context->getFetchContext());
return collectAllSafe(scmFuture, wdFuture)
.thenValue([entryPath = entryPath.copy(),
context,
dtype = scmEntry.second.getDType()](
const std::tuple<Hash20, Hash20>& info) {
const auto& [scmHash, wdHash] = info;
// Only differing contents count as a modification.
if (scmHash != wdHash) {
context->callback->modifiedPath(entryPath, dtype);
}
})
.semi()
.via(&folly::QueuedImmediateExecutor::instance());
});
childFutures.add(std::move(entryPath), std::move(compareEntryContents));
}
}
}
}
/**
 * Wait for every pending child operation in childFutures to finish.
 *
 * Each child that finished with an exception is logged and handed to the
 * callback's diffError() together with the path it was registered under; the
 * exception is not rethrown here.
 */
FOLLY_NODISCARD Future<Unit> waitOnResults(
    DiffContext* context,
    ChildFutures&& childFutures) {
  XDCHECK_EQ(childFutures.paths.size(), childFutures.futures.size());
  if (childFutures.futures.empty()) {
    return makeFuture();
  }

  // Runs once all children have completed, successfully or not.
  auto reportFailures = [context, paths = std::move(childFutures.paths)](
                            vector<Try<Unit>>&& results) {
    XDCHECK_EQ(paths.size(), results.size());
    for (size_t idx = 0; idx < results.size(); ++idx) {
      const auto& outcome = results[idx];
      if (outcome.hasException()) {
        XLOG(ERR) << "error computing SCM diff for " << paths.at(idx);
        context->callback->diffError(paths.at(idx), outcome.exception());
      }
    }
  };

  return folly::collectAll(std::move(childFutures.futures))
      .toUnsafeFuture()
      .thenValue(std::move(reportFailures));
}
/**
 * Diff two trees.
 *
 * The path argument specifies the path to these trees, and will be prefixed
 * to all differences recorded in the results.
 *
 * The differences will be recorded using a callback provided by the caller.
 *
 * Either tree may be null, in which case it is treated as having no entries.
 * The ignore stack may also be null; it is kept alive until all child
 * operations started by this call have finished.
 */
FOLLY_NODISCARD Future<Unit> computeTreeDiff(
    DiffContext* context,
    RelativePathPiece currentPath,
    std::shared_ptr<const Tree> scmTree,
    std::shared_ptr<const Tree> wdTree,
    std::unique_ptr<GitIgnoreStack> ignore,
    bool isIgnored) {
  // A list of Futures to wait on for our children's results.
  ChildFutures childFutures;

  // Walk through the entries in both trees.
  // This relies on the fact that the entry list in each tree is always sorted.
  //
  // A missing tree is iterated as an empty range. Each begin/end pair is taken
  // from the same container with the matching accessor.
  Tree::container emptyEntries{kPathMapDefaultCaseSensitive};
  auto scmIter = scmTree ? scmTree->cbegin() : emptyEntries.cbegin();
  auto scmEnd = scmTree ? scmTree->cend() : emptyEntries.cend();
  auto wdIter = wdTree ? wdTree->cbegin() : emptyEntries.cbegin();
  auto wdEnd = wdTree ? wdTree->cend() : emptyEntries.cend();

  while (true) {
    if (scmIter == scmEnd) {
      if (wdIter == wdEnd) {
        // All Done
        break;
      }
      // This entry is present in wdTree but not scmTree
      processAddedSide(
          context, childFutures, currentPath, *wdIter, ignore.get(), isIgnored);
      ++wdIter;
    } else if (wdIter == wdEnd) {
      // This entry is present in scmTree but not wdTree
      processRemovedSide(context, childFutures, currentPath, *scmIter);
      ++scmIter;
    } else {
      // Both sides still have entries: the name ordering decides whether the
      // current entries match or one side is missing an entry.
      auto compare = comparePathPiece(
          scmIter->first, wdIter->first, context->getCaseSensitive());
      if (compare == CompareResult::BEFORE) {
        processRemovedSide(context, childFutures, currentPath, *scmIter);
        ++scmIter;
      } else if (compare == CompareResult::AFTER) {
        processAddedSide(
            context,
            childFutures,
            currentPath,
            *wdIter,
            ignore.get(),
            isIgnored);
        ++wdIter;
      } else {
        processBothPresent(
            context,
            childFutures,
            currentPath,
            *scmIter,
            *wdIter,
            ignore.get(),
            isIgnored);
        ++scmIter;
        ++wdIter;
      }
    }
  }

  // Add an ensure() block that makes sure the ignore stack exists until all of
  // our children results have finished processing
  return waitOnResults(context, std::move(childFutures))
      .ensure([ignore = std::move(ignore)] {});
}
/**
 * Load the contents of the .gitignore file located at gitIgnorePath.
 *
 * A failure to load the file is logged as a warning and turned into an empty
 * string.
 */
ImmediateFuture<std::string> loadGitIgnore(
    DiffContext* context,
    RelativePath gitIgnorePath) {
  // TODO: load file contents directly from context->store if gitIgnoreEntry is
  // a regular file
  auto loader = context->getLoadFileContentsFromPath();
  auto pendingContents = loader(context->getFetchContext(), gitIgnorePath);

  // The path is moved into the handler only after the load has been started,
  // since starting the load still needs it.
  auto onLoadFailure = [entryPath = std::move(gitIgnorePath)](
                           const folly::exception_wrapper& ex) {
    // TODO: add an API to DiffCallback to report user errors like this
    // (errors that do not indicate a problem with EdenFS itself) that
    // can be returned to the caller in a thrift response
    XLOG(WARN) << "error loading gitignore at " << entryPath << ": "
               << folly::exceptionStr(ex);
    return std::string{};
  };

  return std::move(pendingContents).thenError(std::move(onLoadFailure)).semi();
}
/**
 * Diff two already loaded trees, loading the .gitignore file of the working
 * directory tree first when ignore rules need to be evaluated.
 *
 * Returns immediately without reporting anything if the diff was cancelled.
 */
FOLLY_NODISCARD Future<Unit> diffTrees(
    DiffContext* context,
    RelativePathPiece currentPath,
    std::shared_ptr<const Tree> scmTree,
    std::shared_ptr<const Tree> wdTree,
    const GitIgnoreStack* parentIgnore,
    bool isIgnored) {
  if (context->isCancelled()) {
    XLOG(DBG7) << "diff() on directory " << currentPath
               << " cancelled due to client request no longer being active";
    return makeFuture();
  }

  // An already ignored directory needs no .gitignore of its own: everything
  // inside it is ignored too unless it is explicitly tracked in source
  // control, and include rules cannot unignore files inside an ignored
  // directory.
  //
  // context->getLoadFileContentsFromPath() doubles as the switch telling us
  // whether gitignore files are processed at all: it is only set by code that
  // enters through eden/fs/inodes/Diff.cpp, and file contents cannot be loaded
  // without it.
  const bool skipIgnoreRules =
      isIgnored || !context->getLoadFileContentsFromPath();
  if (skipIgnoreRules) {
    // A null GitIgnoreStack is fine here: no ignore status will have to be
    // computed for entries that aren't already tracked in source control.
    return computeTreeDiff(
        context,
        currentPath,
        std::move(scmTree),
        std::move(wdTree),
        nullptr,
        isIgnored);
  }

  // Stays an empty string unless this directory has a .gitignore file.
  ImmediateFuture<std::string> ignoreContents{};
  if (wdTree) {
    const auto ignoreEntry = wdTree->find(kIgnoreFilename);
    const bool hasIgnoreFile =
        ignoreEntry != wdTree->cend() && !ignoreEntry->second.isTree();
    if (hasIgnoreFile) {
      ignoreContents =
          loadGitIgnore(context, currentPath + ignoreEntry->first);
    }
  }

  // Once the contents are known, stack this directory's rules on top of the
  // parent's and run the actual diff.
  auto diffWithIgnoreRules = [context,
                              currentPath = currentPath.copy(),
                              scmTree = std::move(scmTree),
                              wdTree = std::move(wdTree),
                              parentIgnore,
                              isIgnored](std::string contents) mutable {
    std::unique_ptr<GitIgnoreStack> ignoreStack;
    if (contents.empty()) {
      ignoreStack = std::make_unique<GitIgnoreStack>(parentIgnore);
    } else {
      ignoreStack = std::make_unique<GitIgnoreStack>(parentIgnore, contents);
    }
    return computeTreeDiff(
               context,
               currentPath,
               std::move(scmTree),
               std::move(wdTree),
               std::move(ignoreStack),
               isIgnored)
        .semi();
  };

  return std::move(ignoreContents)
      .thenValue(std::move(diffWithIgnoreRules))
      .semi()
      .via(&folly::QueuedImmediateExecutor::instance());
}
/**
 * Diff two trees that may still be loading.
 *
 * Waits for both trees, then hands them to the diffTrees() overload that
 * takes loaded trees. Nothing is reported when both trees are present and
 * have the same hash.
 */
FOLLY_NODISCARD Future<Unit> diffTrees(
    DiffContext* context,
    RelativePathPiece currentPath,
    ImmediateFuture<std::shared_ptr<const Tree>> scmFuture,
    ImmediateFuture<std::shared_ptr<const Tree>> wdFuture,
    const GitIgnoreStack* ignore,
    bool isIgnored) {
  auto bothTrees = collectAllSafe(std::move(scmFuture), std::move(wdFuture));

  // Optimization for the case when the trees are immediately ready. We can
  // avoid copying the input path in this case.
  std::optional<RelativePath> ownedPath;
  if (!bothTrees.isReady()) {
    ownedPath = currentPath.copy();
  }

  auto diffLoadedTrees = [context,
                          ownedPath = std::move(ownedPath),
                          currentPath,
                          ignore,
                          isIgnored](std::tuple<
                                     std::shared_ptr<const Tree>,
                                     std::shared_ptr<const Tree>> loaded) {
    auto [scmTree, wdTree] = std::move(loaded);

    // Shortcut in the case where we're trying to diff the same tree. This
    // happens in the case in which the CLI (during eden doctor) calls
    // getScmStatusBetweenRevisions() with the same hash in order to check
    // if a commit hash is valid.
    const bool sameTree =
        scmTree && wdTree && scmTree->getHash() == wdTree->getHash();
    if (sameTree) {
      return folly::makeSemiFuture();
    }

    // Use our own copy of the path when one was made, the caller's otherwise.
    const auto effectivePath = ownedPath ? ownedPath->piece() : currentPath;
    return diffTrees(
               context,
               effectivePath,
               std::move(scmTree),
               std::move(wdTree),
               ignore,
               isIgnored)
        .semi();
  };

  return std::move(bothTrees)
      .thenValue(std::move(diffLoadedTrees))
      .semi()
      .via(&folly::QueuedImmediateExecutor::instance());
}
} // namespace
/**
 * Diff the root trees of root1 and root2, with root1 as the source control
 * side and root2 as the working directory side. Paths are reported relative
 * to the root; no ignore rules are passed in.
 */
Future<Unit>
diffRoots(DiffContext* context, const RootId& root1, const RootId& root2) {
  auto firstRootTree =
      context->store->getRootTree(root1, context->getFetchContext());
  auto secondRootTree =
      context->store->getRootTree(root2, context->getFetchContext());

  // The root has no parent ignore stack and is not itself ignored.
  const GitIgnoreStack* const noParentIgnore = nullptr;
  constexpr bool rootIsIgnored = false;

  return diffTrees(
      context,
      RelativePathPiece{},
      std::move(firstRootTree).semi(),
      std::move(secondRootTree).semi(),
      noParentIgnore,
      rootIsIgnored);
}
/**
* Diff the trees identified by scmHash and wdHash.
*
* Both trees are fetched from context->store and then handed to the diffTrees()
* overload above together with the given ignore stack and ignore status.
*/
FOLLY_NODISCARD Future<Unit> diffTrees(
DiffContext* context,
RelativePathPiece currentPath,
ObjectId scmHash,
ObjectId wdHash,
const GitIgnoreStack* ignore,
bool isIgnored) {
return diffTrees(
context,
currentPath,
context->store->getTree(scmHash, context->getFetchContext()),
context->store->getTree(wdHash, context->getFetchContext()),
ignore,
isIgnored);
}
/**
* Process a tree that exists only on the working directory side of the diff.
*
* The tree identified by wdHash is fetched from context->store and diffed
* against a null source control tree, so every entry it contains is seen as
* present on the working directory side only.
*/
FOLLY_NODISCARD Future<Unit> diffAddedTree(
DiffContext* context,
RelativePathPiece currentPath,
ObjectId wdHash,
const GitIgnoreStack* ignore,
bool isIgnored) {
return diffTrees(
context,
currentPath,
// No source control counterpart for this tree.
std::shared_ptr<const Tree>{nullptr},
context->store->getTree(wdHash, context->getFetchContext()),
ignore,
isIgnored);
}
/**
* Process a tree that exists only on the source control side of the diff.
*
* The tree identified by scmHash is fetched from context->store and diffed
* against a null working directory tree, so every entry it contains is seen as
* present on the source control side only. No ignore stack is passed and the
* tree is not marked as ignored.
*/
FOLLY_NODISCARD Future<Unit> diffRemovedTree(
DiffContext* context,
RelativePathPiece currentPath,
ObjectId scmHash) {
return diffTrees(
context,
currentPath,
context->store->getTree(scmHash, context->getFetchContext()),
// No working directory counterpart for this tree.
std::shared_ptr<const Tree>{nullptr},
nullptr,
false);
}
} // namespace facebook::eden