switch hg manifest import to two passes

Summary:
Previously, the importer would read the entire manifest
and emit data to the store as it resolved complete directory entries.
The entire manifest data would be buffered and sent out to the store.

In the scenario where one subtree has been modified and a commit has
been made, only the parents of the subdirectory need to be hashed
and stored, but we would compute and try to store everything anyway.

While this diff can't avoid having to compute hashes for everything (we need
tree manifest data for that), by breaking the import into two passes we can
potentially avoid interrogating the LocalStore about every tree in the entire
manifest during an import; we only need to store the trees that are missing and
can simply cut out subtrees that are already present.

This saves us IO at the expense of buffering the manifest tree in memory
for the duration of the first pass; it is an acceptable trade-off.

Reviewed By: simpkins

Differential Revision: D4832166

fbshipit-source-id: 0a40cb851c65393b407a8161db05c4b1795fb11a
This commit is contained in:
Wez Furlong 2017-04-06 10:47:57 -07:00 committed by Facebook Github Bot
parent fd073f1961
commit d32fc630f2
4 changed files with 90 additions and 15 deletions

View File

@ -271,22 +271,34 @@ BlobMetadata LocalStore::putBlob(const Hash& id, const Blob* blob) {
return metadata;
}
Hash LocalStore::putTree(const Tree* tree) {
// Serialize `tree` with GitTreeSerializer and return the pair
// (key, serialized data). Nothing is written to the store here.
// The key is the hash reported by the tree itself unless that hash compares
// equal to a default-constructed Hash, in which case the SHA-1 of the
// serialized buffer is used instead.
std::pair<Hash, folly::IOBuf> LocalStore::serializeTree(
const Tree* tree) const {
GitTreeSerializer serializer;
// Hand every entry of the tree to the serializer.
for (auto& entry : tree->getTreeEntries()) {
serializer.addEntry(std::move(entry));
}
IOBuf treeBuf = serializer.finalize();
// NOTE(review): treeData is never read in this function, so only the side
// effect of coalesce() on treeBuf can matter here. The declaration comment
// for this method describes the returned data as "not coalesced" -- confirm
// whether this line is still intended.
ByteRange treeData = treeBuf.coalesce();
auto id = tree->getHash();
// Presumably a default-constructed Hash means "no id assigned yet"; in that
// case derive the id from the serialized bytes.
if (id == Hash()) {
id = Hash::sha1(&treeBuf);
}
return std::make_pair(id, treeBuf);
}
/**
 * Serialize `tree` via serializeTree() and hand the result to put().
 *
 * The serialized buffer is coalesced into a single contiguous byte range
 * before it is passed on. Returns the key produced by serializeTree().
 */
Hash LocalStore::putTree(const Tree* tree) {
  auto keyAndData = serializeTree(tree);
  const auto contiguous = keyAndData.second.coalesce();
  put(keyAndData.first.getBytes(), contiguous);
  return keyAndData.first;
}
/**
 * Convenience overload: store `value` under the bytes of `id`.
 * Forwards to put() with the result of id.getBytes() as the key.
 */
void LocalStore::put(const Hash& id, folly::ByteRange value) {
  const auto keyBytes = id.getBytes();
  put(keyBytes, value);
}
void LocalStore::put(folly::ByteRange key, folly::ByteRange value) {
if (hasKey(key)) {
// Don't try to overwrite an existing key

View File

@ -99,6 +99,15 @@ class LocalStore {
*/
folly::Optional<Hash> getSha1ForBlob(const Hash& id) const;
/**
* Compute the serialized version of the tree.
* Returns the key and the (not coalesced) serialized data.
* This does not modify the contents of the store; it is the method
* used by the putTree method to compute the data that it stores.
* This is useful when computing the overall set of data during a
* two-phase import. */
std::pair<Hash, folly::IOBuf> serializeTree(const Tree* tree) const;
Hash putTree(const Tree* tree);
/**
@ -113,6 +122,7 @@ class LocalStore {
* Put arbitrary data in the store.
*/
void put(folly::ByteRange key, folly::ByteRange value);
void put(const Hash& id, folly::ByteRange value);
/**
* Enables batch loading mode.

View File

@ -48,21 +48,46 @@ class HgManifestImporter::PartialTree {
}
void addEntry(TreeEntry&& entry);
/** Move in a computed sub-tree.
* The tree will be recorded in the store in the second pass of
* the import, but only if the parent(s) are not stored. */
void addPartialTree(PartialTree&& tree);
/** Record this node against the store.
* May only be called after compute() has been called (this method
* will check and assert on this). */
Hash record(LocalStore* store);
/** Compute the serialized version of this tree.
* Records the id and data ready to be stored by a later call
* to the record() method. */
Hash compute(LocalStore* store);
private:
// The full path from the root of this repository
RelativePath path_;
// LocalStore currently requires that all data be stored in git tree format.
GitTreeSerializer serializer_;
unsigned int numPaths_{0};
std::vector<TreeEntry> entries_;
// Serialized data and id that we may need to store;
// this is the representation of this PartialTree instance.
Hash id_;
folly::IOBuf treeData_;
bool computed_{false};
// Children that we may need to store
std::vector<PartialTree> trees_;
};
// Construct a PartialTree for the directory at `path`; the path is stored in
// path_ and every other member keeps its in-class default.
HgManifestImporter::PartialTree::PartialTree(RelativePathPiece path)
: path_(std::move(path)) {}
// Take ownership of a child PartialTree by moving it onto the end of trees_,
// where record() will find it when it walks this node's children.
void HgManifestImporter::PartialTree::addPartialTree(PartialTree&& tree) {
  trees_.push_back(std::move(tree));
}
void HgManifestImporter::PartialTree::addEntry(TreeEntry&& entry) {
// Common case should be that we append because we expect the entries
// to be in the correct sorted order most of the time.
@ -87,14 +112,39 @@ void HgManifestImporter::PartialTree::addEntry(TreeEntry&& entry) {
++numPaths_;
}
Hash HgManifestImporter::PartialTree::record(LocalStore* store) {
Hash HgManifestImporter::PartialTree::compute(LocalStore* store) {
DCHECK(!computed_) << "Can only compute a PartialTree once";
auto tree = Tree(std::move(entries_));
auto hash = store->putTree(&tree);
std::tie(id_, treeData_) = store->serializeTree(&tree);
VLOG(6) << "record tree: '" << path_ << "' --> " << hash.toString() << " ("
computed_ = true;
VLOG(6) << "compute tree: '" << path_ << "' --> " << id_.toString() << " ("
<< numPaths_ << " paths)";
return hash;
return id_;
}
/**
 * Write this node's serialized data to `store`, unless the store already
 * has an entry for id_.
 *
 * compute() must have been called first; this is DCHECKed. When the store
 * already has the key, nothing is written and the children are not visited.
 * Returns id_ in every case.
 */
Hash HgManifestImporter::PartialTree::record(LocalStore* store) {
  DCHECK(computed_) << "Must have computed PartialTree prior to recording";

  // A node that is already present lets us skip its whole subtree.
  const bool alreadyStored = store->hasKey(id_);
  if (!alreadyStored) {
    // Children are recorded before this node so that a failure while storing
    // a child stops us before we store a parent whose children are missing.
    for (auto& child : trees_) {
      child.record(store);
    }

    store->put(id_, treeData_.coalesce());

    VLOG(6) << "record tree: '" << path_ << "' --> " << id_.toString() << " ("
            << numPaths_ << " paths, " << trees_.size() << " trees)";
  }

  return id_;
}
HgManifestImporter::HgManifestImporter(LocalStore* store) : store_(store) {
@ -140,7 +190,7 @@ void HgManifestImporter::processEntry(
// the stack.
VLOG(5) << "pop '" << dirStack_.back().getPath() << "' --> '"
<< (dirStack_.end() - 2)->getPath() << "' # '" << dirname << "'";
popAndRecordCurrentDir();
popCurrentDir();
CHECK(!dirStack_.empty());
// Continue around the while loop, now that the current directory
// is updated.
@ -155,10 +205,11 @@ Hash HgManifestImporter::finish() {
// Pop everything off dirStack_, and record the trees as we go.
while (dirStack_.size() > 1) {
VLOG(5) << "final pop '" << dirStack_.back().getPath() << "'";
popAndRecordCurrentDir();
popCurrentDir();
}
auto rootHash = dirStack_.back().record(store_);
auto rootHash = dirStack_.back().compute(store_);
dirStack_.back().record(store_);
dirStack_.pop_back();
CHECK(dirStack_.empty());
@ -167,17 +218,20 @@ Hash HgManifestImporter::finish() {
return rootHash;
}
void HgManifestImporter::popAndRecordCurrentDir() {
void HgManifestImporter::popCurrentDir() {
PathComponent entryName = dirStack_.back().getPath().basename().copy();
auto dirHash = dirStack_.back().record(store_);
PartialTree back = std::move(dirStack_.back());
dirStack_.pop_back();
DCHECK(!dirStack_.empty());
auto dirHash = back.compute(store_);
uint8_t ownerPermissions = 0111;
TreeEntry dirEntry(
dirHash, entryName.stringPiece(), FileType::DIRECTORY, ownerPermissions);
dirStack_.back().addEntry(std::move(dirEntry));
dirStack_.back().addPartialTree(std::move(back));
}
}
} // facebook::eden

View File

@ -52,8 +52,7 @@ class HgManifestImporter {
HgManifestImporter(const HgManifestImporter&) = delete;
HgManifestImporter& operator=(const HgManifestImporter&) = delete;
void popAndRecordCurrentDir();
Hash recordCurrentDir();
void popCurrentDir();
LocalStore* store_{nullptr};
std::vector<PartialTree> dirStack_;