sapling/eden/fs/inodes/Overlay.cpp

974 lines
32 KiB
C++
Raw Normal View History

/*
* Copyright (c) 2016-present, Facebook, Inc.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree. An additional grant
* of patent rights can be found in the PATENTS file in the same directory.
*
*/
#include "eden/fs/inodes/Overlay.h"
#include <boost/filesystem.hpp>
#include <folly/Exception.h>
#include <folly/File.h>
#include <folly/FileUtil.h>
#include <folly/io/Cursor.h>
#include <folly/io/IOBuf.h>
#include <folly/logging/xlog.h>
#include <thrift/lib/cpp2/protocol/Serializer.h>
#include "eden/fs/inodes/DirEntry.h"
#include "eden/fs/inodes/InodeMap.h"
#include "eden/fs/inodes/InodeTable.h"
#include "eden/fs/utils/PathFuncs.h"
namespace facebook {
namespace eden {
using apache::thrift::CompactSerializer;
using folly::ByteRange;
using folly::fbvector;
using folly::File;
using folly::IOBuf;
using folly::MutableStringPiece;
using folly::Optional;
using folly::StringPiece;
using folly::literals::string_piece_literals::operator""_sp;
using std::string;
/*
* Names of the files that live directly inside the overlay directory
* (localDir_):
*  - kInfoFile: the info file. Its header is written by initNewOverlay(),
*    validated by readExistingOverlay(), and the file itself is locked by
*    initOverlay().
*  - kMetadataFile: the path handed to InodeMetadataTable::open().
*  - kNextInodeNumberFile: written by saveNextInodeNumber() and consumed by
*    tryLoadNextInodeNumber().
*
* Per-inode directory data is stored separately (see saveOverlayDir()) and is
* serialized with thrift's CompactSerializer.
*/
constexpr StringPiece kInfoFile{"info"};
constexpr StringPiece kMetadataFile{"metadata.table"};
constexpr const char* kNextInodeNumberFile{"next-inode-number"};
/**
* 4-byte magic identifier to put at the start of the info file.
* This merely helps confirm that we are in fact reading an overlay info file
*/
constexpr StringPiece kInfoHeaderMagic{"\xed\xe0\x00\x01"};
/**
* A version number for the overlay directory format.
*
* If we change the overlay storage format in the future we can bump this
* version number to help identify when eden is reading overlay data created by
* an older version of the code.
*
* The value is stored in the info file in big-endian byte order, immediately
* after kInfoHeaderMagic.
*/
constexpr uint32_t kOverlayVersion = 1;
// Total size of the info file header: the magic bytes followed by the
// version number.
constexpr size_t kInfoHeaderSize =
kInfoHeaderMagic.size() + sizeof(kOverlayVersion);
namespace {
/**
 * Fill subdirPath with the name of the shard subdirectory that holds the
 * overlay data for the given inode number.
 *
 * Inode files are sharded across 256 subdirectories. The subdirectory name is
 * the least significant byte of the inode number rendered as two lowercase
 * hexadecimal digits. Since inode numbers are handed out in monotonically
 * increasing order, this distributes them evenly over the subdirectories.
 *
 * subdirPath must be exactly two characters long; no NUL terminator is
 * written.
 */
void formatSubdirPath(MutableStringPiece subdirPath, uint64_t inode) {
  DCHECK_EQ(subdirPath.size(), 2);
  constexpr char kHexDigits[] = "0123456789abcdef";
  const auto lowByte = static_cast<uint8_t>(inode & 0xff);
  subdirPath[0] = kHexDigits[lowByte >> 4];
  subdirPath[1] = kHexDigits[lowByte & 0xf];
}
} // namespace
// Out-of-line definitions for the static constexpr data members of Overlay.
// Before C++17 such a definition is required whenever the member is odr-used
// (for example when it is bound to a reference).
constexpr folly::StringPiece Overlay::kHeaderIdentifierDir;
constexpr folly::StringPiece Overlay::kHeaderIdentifierFile;
constexpr uint32_t Overlay::kHeaderVersion;
constexpr size_t Overlay::kHeaderLength;
/**
 * Open (or create) the overlay stored in localDir.
 *
 * The on-disk state is initialized first, then any saved next-inode-number is
 * loaded, and only after both succeed is the background GC thread started.
 */
Overlay::Overlay(AbsolutePathPiece localDir) : localDir_(localDir) {
  initOverlay();
  tryLoadNextInodeNumber();
  // Start the worker last so it never observes a partially set-up overlay.
  gcThread_ = std::thread(&Overlay::gcThread, this);
}
// Destroying the overlay simply closes it; close() returns early when the
// overlay has already been closed.
Overlay::~Overlay() {
close();
}
/**
 * Shut the overlay down and return the current next-inode-number.
 *
 * Calling close() on an overlay whose info file is no longer open (i.e. one
 * that was already closed) performs no shutdown work and only returns the
 * counter. Must not be called from the GC thread, since it joins that thread.
 */
uint64_t Overlay::close() {
  CHECK_NE(std::this_thread::get_id(), gcThread_.get_id());

  if (infoFile_) {
    // Tear everything down in the reverse of construction order: first stop
    // and join the GC thread...
    gcQueue_.lock()->stop = true;
    gcCondVar_.notify_one();
    gcThread_.join();

    // ...then persist the inode counter and release the on-disk handles.
    saveNextInodeNumber();
    inodeMetadataTable_.reset();
    dirFile_.close();
    infoFile_.close();
  }

  return nextInodeNumber_.load(std::memory_order_relaxed);
}
/**
 * Open the overlay directory, creating it if it does not exist yet.
 *
 * This validates or creates the info file, takes the overlay lock on it, opens
 * a directory handle on localDir_, makes sure the "tmp" subdirectory exists,
 * and finally opens the inode metadata table.
 *
 * Throws std::system_error / std::runtime_error on any failure.
 */
void Overlay::initOverlay() {
  auto infoPath = localDir_ + PathComponentPiece{kInfoFile};
  const int infoFd =
      folly::openNoInt(infoPath.value().c_str(), O_RDONLY | O_CLOEXEC);
  if (infoFd < 0) {
    // Anything other than "file not found" is a hard error.
    if (errno != ENOENT) {
      folly::throwSystemError(
          "error reading eden overlay info file ", infoPath.stringPiece());
    }
    // No info file: this is a brand new overlay directory.
    initNewOverlay();
    infoFile_ = File{infoPath.value().c_str(), O_RDONLY | O_CLOEXEC};
  } else {
    // Existing overlay: take ownership of the descriptor and make sure we
    // understand the format version recorded in the info file.
    infoFile_ = File{infoFd, /* ownsFd */ true};
    readExistingOverlay(infoFile_.fd());
  }

  if (!infoFile_.try_lock()) {
    folly::throwSystemError("failed to acquire overlay lock on ", infoPath);
  }

  // Keep a handle on the overlay directory itself so later operations can use
  // the *at() family of calls relative to it.
  const int overlayDirFd =
      open(localDir_.c_str(), O_RDONLY | O_PATH | O_DIRECTORY | O_CLOEXEC);
  folly::checkUnixError(
      overlayDirFd,
      "error opening overlay directory handle for ",
      localDir_.value());
  dirFile_ = File{overlayDirFd, /* ownsFd */ true};

  // Make sure the tmp directory exists.
  // TODO: It would be a bit expensive, but it might be worth checking
  // all of the numbered subdirectories here too.
  struct stat tmpStat;
  if (fstatat(dirFile_.fd(), "tmp", &tmpStat, AT_SYMLINK_NOFOLLOW) != 0) {
    if (errno != ENOENT) {
      folly::throwSystemError("fstatat(\"tmp\") failed");
    }
    folly::checkUnixError(
        mkdirat(dirFile_.fd(), "tmp", 0700),
        "failed to create overlay tmp directory");
  } else if (!S_ISDIR(tmpStat.st_mode)) {
    folly::throwSystemErrorExplicit(
        ENOTDIR, "overlay tmp is not a directory");
  }

  // The InodeTable takes its own lock, which should be released before
  // infoFile_'s lock, so it is opened only after infoFile_ has been locked.
  inodeMetadataTable_ = InodeMetadataTable::open(
      (localDir_ + PathComponentPiece{kMetadataFile}).c_str());
}
void Overlay::tryLoadNextInodeNumber() {
// If we ever want to extend this file, it should be renamed and a proper
// header with version number added. In the meantime, we enforce the file is
// 8 bytes.
int fd = openat(dirFile_.fd(), kNextInodeNumberFile, O_RDONLY | O_CLOEXEC);
if (fd == -1) {
if (errno == ENOENT) {
// No max inode number file was written which usually means either Eden
// was not shut down cleanly or an old overlay is being loaded.
// Either way, a full scan of the overlay is necessary, so leave
// nextInodeNumber_ at 0.
return;
} else {
folly::throwSystemError("Failed to open ", kNextInodeNumberFile);
}
}
folly::File nextInodeNumberFile{fd, /* ownsFd */ true};
// Immediately unlink - the presence of the file indicates a clean shutdown.
if (unlinkat(dirFile_.fd(), kNextInodeNumberFile, 0)) {
folly::throwSystemError(
"Failed to unlink ", kNextInodeNumberFile, " in overlay");
}
uint64_t nextInodeNumber;
auto readResult =
folly::readFull(fd, &nextInodeNumber, sizeof(nextInodeNumber));
if (readResult < 0) {
folly::throwSystemError(
"Failed to read ", kNextInodeNumberFile, " from overlay");
}
if (readResult != sizeof(nextInodeNumber)) {
XLOG(WARN) << "Failed to read entire inode number. Only read " << readResult
<< " bytes. Full overlay scan required.";
return;
}
if (nextInodeNumber <= kRootNodeId.get()) {
XLOG(WARN) << "Invalid max inode number " << nextInodeNumber
<< ". Full overlay scan required.";
return;
}
nextInodeNumber_.store(nextInodeNumber, std::memory_order_relaxed);
}
void Overlay::saveNextInodeNumber() {
auto nextInodeNumber = nextInodeNumber_.load(std::memory_order_relaxed);
if (!nextInodeNumber) {
return;
}
auto nextInodeNumberPath =
localDir_ + PathComponentPiece{kNextInodeNumberFile};
folly::writeFileAtomic(
nextInodeNumberPath.value().c_str(),
ByteRange(
reinterpret_cast<const uint8_t*>(&nextInodeNumber),
reinterpret_cast<const uint8_t*>(&nextInodeNumber + 1)));
}
void Overlay::readExistingOverlay(int infoFD) {
// Read the info file header
std::array<uint8_t, kInfoHeaderSize> infoHeader;
auto sizeRead = folly::readFull(infoFD, infoHeader.data(), infoHeader.size());
folly::checkUnixError(
sizeRead,
"error reading from overlay info file in ",
localDir_.stringPiece());
if (sizeRead != infoHeader.size()) {
throw std::runtime_error(folly::to<string>(
"truncated info file in overlay directory ", localDir_));
}
// Verify the magic value is correct
if (memcmp(
infoHeader.data(),
kInfoHeaderMagic.data(),
kInfoHeaderMagic.size()) != 0) {
throw std::runtime_error(
folly::to<string>("bad data in overlay info file for ", localDir_));
}
// Extract the version number
uint32_t version;
memcpy(
&version, infoHeader.data() + kInfoHeaderMagic.size(), sizeof(version));
version = folly::Endian::big(version);
// Make sure we understand this version number
if (version != kOverlayVersion) {
throw std::runtime_error(folly::to<string>(
"Unsupported eden overlay format ", version, " in ", localDir_));
}
}
/**
 * Create the on-disk layout of a brand new overlay.
 *
 * This creates the overlay directory (if needed), the 256 shard
 * subdirectories, and the info file containing the magic bytes and the
 * big-endian format version. Because a new overlay cannot contain any inodes
 * yet, the next inode number is initialized directly without a scan.
 *
 * Throws std::system_error if any directory or the info file cannot be
 * created.
 */
void Overlay::initNewOverlay() {
  // Make sure the overlay directory itself exists. It's fine if it already
  // exists (although presumably it should be empty).
  auto result = ::mkdir(localDir_.value().c_str(), 0755);
  if (result != 0 && errno != EEXIST) {
    folly::throwSystemError(
        "error creating eden overlay directory ", localDir_.stringPiece());
  }
  // Open with O_CLOEXEC, like every other descriptor opened in this file, so
  // this short-lived handle cannot leak into a child process that another
  // thread happens to spawn while the subdirectories are being created.
  auto localDirFile = File(localDir_.stringPiece(), O_RDONLY | O_CLOEXEC);

  // We split the inode files across 256 subdirectories.
  // Populate these subdirectories now.
  std::array<char, 3> subdirPath;
  subdirPath[2] = '\0';
  for (uint64_t n = 0; n < 256; ++n) {
    formatSubdirPath(MutableStringPiece{subdirPath.data(), 2}, n);
    result = ::mkdirat(localDirFile.fd(), subdirPath.data(), 0755);
    if (result != 0 && errno != EEXIST) {
      folly::throwSystemError(
          "error creating eden overlay directory ",
          StringPiece{subdirPath.data()});
    }
  }

  // For now we just write a simple header, with a magic number to identify
  // this as an eden overlay file, and the version number of the overlay
  // format.
  std::array<uint8_t, kInfoHeaderSize> infoHeader;
  memcpy(infoHeader.data(), kInfoHeaderMagic.data(), kInfoHeaderMagic.size());
  auto version = folly::Endian::big(kOverlayVersion);
  memcpy(
      infoHeader.data() + kInfoHeaderMagic.size(), &version, sizeof(version));

  auto infoPath = localDir_ + PathComponentPiece{kInfoFile};
  folly::writeFileAtomic(
      infoPath.stringPiece(), ByteRange(infoHeader.data(), infoHeader.size()));

  // kRootNodeId is reserved - start at the next one. No scan is necessary.
  setNextInodeNumber(InodeNumber{kRootNodeId.get() + 1});
}
/**
 * Initialize the next inode number to hand out.
 *
 * If the counter has already been initialized, the call is accepted only as a
 * consistency check: debug builds assert that the supplied value matches the
 * stored one, and the stored value is left alone. This path will disappear
 * when takeover transitions to relying on the Overlay efficiently remembering
 * the next inode number itself.
 */
void Overlay::setNextInodeNumber(InodeNumber nextInodeNumber) {
  const auto current = nextInodeNumber_.load(std::memory_order_relaxed);
  if (current == 0) {
    // First initialization: the value must lie beyond the reserved root inode.
    DCHECK_GT(nextInodeNumber, kRootNodeId);
    nextInodeNumber_.store(nextInodeNumber.get(), std::memory_order_relaxed);
    return;
  }

  DCHECK_EQ(current, nextInodeNumber.get())
      << "Overlay nextInodeNumber already initialized with " << current
      << " so it should not be initialized with " << nextInodeNumber;
}
/**
 * Hand out a fresh inode number by atomically advancing nextInodeNumber_.
 *
 * The counter must have been initialized (be non-zero) before this is called;
 * debug builds assert on that.
 */
InodeNumber Overlay::allocateInodeNumber() {
  // With a 64-bit counter, wrap-around is not worth handling, and without
  // wrap-around a newly allocated number can never collide with an existing
  // inode number. Platforms with 32-bit inode numbers are not currently
  // supported.
  static_assert(
      sizeof(InodeNumber) >= 8, "expected InodeNumber to be at least 64 bits");
  static_assert(
      sizeof(nextInodeNumber_) == sizeof(InodeNumber),
      "expected nextInodeNumber_ and InodeNumber to have the same size");

  // This could be a relaxed atomic operation. It doesn't matter on x86 but
  // might on ARM.
  const auto allocated = nextInodeNumber_.fetch_add(1);
  DCHECK_NE(0, allocated) << "allocateInodeNumber called before initialize";
  return InodeNumber{allocated};
}
/**
 * Load the directory contents and timestamps stored for inodeNumber.
 *
 * Returns folly::none when the overlay holds no data for this inode.
 *
 * Entries stored without an inode number (older format) are assigned a newly
 * allocated one; when that happens the directory is immediately written back
 * so the assignment is persisted.
 */
Optional<std::pair<DirContents, InodeTimestamps>> Overlay::loadOverlayDir(
    InodeNumber inodeNumber) {
  InodeTimestamps timestamps;
  auto stored = deserializeOverlayDir(inodeNumber, timestamps);
  if (!stored.hasValue()) {
    return folly::none;
  }

  DirContents contents;
  bool assignedNewInodeNumbers = false;
  for (const auto& entry : stored.value().entries) {
    const auto& childName = entry.first;
    const auto& child = entry.second;

    InodeNumber childIno;
    if (child.inodeNumber) {
      childIno = InodeNumber::fromThrift(child.inodeNumber);
    } else {
      childIno = allocateInodeNumber();
      assignedNewInodeNumbers = true;
    }

    // An entry with a non-empty hash is not materialized; everything else is.
    const bool hasSourceHash = child.__isset.hash && !child.hash.empty();
    if (hasSourceHash) {
      auto hash = Hash{folly::ByteRange{folly::StringPiece{child.hash}}};
      contents.emplace(
          PathComponentPiece{childName}, child.mode, childIno, hash);
    } else {
      contents.emplace(PathComponentPiece{childName}, child.mode, childIno);
    }
  }

  if (assignedNewInodeNumbers) {
    saveOverlayDir(inodeNumber, contents, timestamps);
  }

  return std::pair<DirContents, InodeTimestamps>{std::move(contents),
                                                 timestamps};
}
/**
 * Serialize a directory's entries and write them, preceded by a directory
 * header carrying the timestamps, to the overlay file for inodeNumber.
 *
 * Both inodeNumber and every entry's inode number must already have been
 * allocated; this is enforced with CHECKs.
 */
void Overlay::saveOverlayDir(
    InodeNumber inodeNumber,
    const DirContents& dir,
    const InodeTimestamps& timestamps) {
  const auto allocationLimit =
      nextInodeNumber_.load(std::memory_order_relaxed);
  CHECK_LT(inodeNumber.get(), allocationLimit)
      << "saveOverlayDir called with unallocated inode number";

  // TODO: T20282158 clean up access of child inode information.
  //
  // Convert every entry into its thrift representation.
  overlay::OverlayDir thriftDir;
  for (const auto& item : dir) {
    const auto& childName = item.first;
    const auto& child = item.second;
    CHECK_LT(child.getInodeNumber().get(), allocationLimit)
        << "saveOverlayDir called with entry using unallocated inode number";

    overlay::OverlayEntry thriftEntry;
    thriftEntry.mode = child.getModeUnsafe();
    thriftEntry.inodeNumber = child.getInodeNumber().get();
    // Only non-materialized entries carry a source hash.
    if (!child.isMaterialized()) {
      auto hashBytes = child.getHash().getBytes();
      thriftEntry.set_hash(std::string(
          reinterpret_cast<const char*>(hashBytes.data()), hashBytes.size()));
    }

    thriftDir.entries.emplace(
        childName.stringPiece().str(), std::move(thriftEntry));
  }

  // Let thrift produce the serialized payload.
  auto payload = CompactSerializer::serialize<std::string>(thriftDir);

  // Header first, then the payload, written in a single gather operation.
  auto header = createHeader(kHeaderIdentifierDir, kHeaderVersion, timestamps);
  std::array<struct iovec, 2> iov;
  iov[0].iov_base = header.data();
  iov[0].iov_len = header.size();
  iov[1].iov_base = const_cast<char*>(payload.data());
  iov[1].iov_len = payload.size();
  (void)createOverlayFileImpl(inodeNumber, iov.data(), iov.size());
}
/**
 * Drop everything the overlay stores for inodeNumber: its entry in the inode
 * metadata table and its overlay file.
 *
 * A missing overlay file is not an error. Any other unlink failure throws
 * std::system_error.
 */
void Overlay::removeOverlayData(InodeNumber inodeNumber) {
  // TODO: batch request during GC
  getInodeMetadataTable()->freeInode(inodeNumber);

  InodePath path;
  getFilePath(inodeNumber, path);
  if (::unlinkat(dirFile_.fd(), path.data(), 0) != 0) {
    if (errno != ENOENT) {
      folly::throwSystemError("error unlinking overlay file: ", path);
    }
    return;
  }
  XLOG(DBG4) << "removed overlay data for inode " << inodeNumber;
}
/**
 * Remove the overlay data for inodeNumber and, if it was a directory, queue
 * its children for removal by the GC thread.
 */
void Overlay::recursivelyRemoveOverlayData(InodeNumber inodeNumber) {
  // Capture the directory listing before the file disappears.
  InodeTimestamps unusedTimestamps;
  auto children = deserializeOverlayDir(inodeNumber, unusedTimestamps);

  // This inode's own data has to be gone before we return. Otherwise a call to
  // recursivelyRemoveOverlayData(I) immediately followed by saveOverlayDir(I)
  // could race. Our durability guarantees are not at risk either if the
  // process dies after this call but before the GC thread gets to the
  // children.
  removeOverlayData(inodeNumber);

  if (!children) {
    return;
  }
  gcQueue_.lock()->queue.emplace_back(std::move(*children));
  gcCondVar_.notify_one();
}
/**
 * Enqueue a flush request on the GC queue and return a future that the GC
 * thread fulfills when it reaches that request.
 */
folly::Future<folly::Unit> Overlay::flushPendingAsync() {
  folly::Promise<folly::Unit> flushDone;
  auto completion = flushDone.getFuture();
  {
    auto queueState = gcQueue_.lock();
    queueState->queue.emplace_back(std::move(flushDone));
  }
  gcCondVar_.notify_one();
  return completion;
}
/**
 * Report whether the overlay has a regular file stored for inodeNumber.
 *
 * Any failure to stat the path is treated as "no data".
 */
bool Overlay::hasOverlayData(InodeNumber inodeNumber) {
  // TODO: It might be worth maintaining a memory-mapped set to rapidly
  // query whether the overlay has an entry for a particular inode. As it is,
  // this function requires a syscall to see if the overlay has an entry.
  InodePath path;
  getFilePath(inodeNumber, path);

  struct stat st;
  const bool statSucceeded =
      fstatat(dirFile_.fd(), path.data(), &st, AT_SYMLINK_NOFOLLOW) == 0;
  return statSucceeded && S_ISREG(st.st_mode);
}
/**
 * Determine the largest inode number in use and initialize nextInodeNumber_
 * to one past it.
 *
 * If nextInodeNumber_ is already set, no scan happens and the value just
 * below it is returned. Otherwise the overlay is scanned in two passes: a walk
 * of the directory tree from the root, followed by a listing of every shard
 * subdirectory.
 *
 * @return the maximum inode number found (or implied by nextInodeNumber_)
 */
InodeNumber Overlay::scanForNextInodeNumber() {
  const auto alreadyKnown = nextInodeNumber_.load(std::memory_order_relaxed);
  if (alreadyKnown) {
    // Already defined.
    CHECK_GT(alreadyKnown, 1);
    return InodeNumber{alreadyKnown - 1};
  }

  // Pass 1: walk the root directory downwards to find all (non-unlinked)
  // directory inodes stored in the overlay.
  //
  // TODO: It would be nicer if each overlay file contained a short header so
  // we could tell if it was a file or directory. This way we could do a
  // simpler scan of opening every single file. For now we have to walk the
  // directory tree from the root downwards.
  auto largestSeen = kRootNodeId;
  std::vector<InodeNumber> pendingDirs;
  pendingDirs.push_back(largestSeen);
  while (!pendingDirs.empty()) {
    const auto currentDir = pendingDirs.back();
    pendingDirs.pop_back();

    InodeTimestamps unusedTimestamps;
    auto listing = deserializeOverlayDir(currentDir, unusedTimestamps);
    if (!listing.hasValue()) {
      continue;
    }

    for (const auto& entry : listing.value().entries) {
      const auto& child = entry.second;
      // Entries without an inode number contribute nothing.
      if (child.inodeNumber == 0) {
        continue;
      }
      const auto childIno = InodeNumber::fromThrift(child.inodeNumber);
      largestSeen = std::max(largestSeen, childIno);
      if (mode_to_dtype(child.mode) == dtype_t::Dir) {
        pendingDirs.push_back(childIno);
      }
    }
  }

  // Pass 2: look through the shard subdirectories and raise the maximum based
  // on the file names we see. This is needed in case there are unlinked
  // inodes present.
  std::array<char, 2> shardName;
  for (uint64_t shard = 0; shard < 256; ++shard) {
    formatSubdirPath(
        MutableStringPiece{shardName.data(), shardName.size()}, shard);
    auto shardPath = localDir_ +
        PathComponentPiece{StringPiece{shardName.data(), shardName.size()}};
    auto boostPath = boost::filesystem::path{shardPath.value().c_str()};
    for (const auto& dirEntry :
         boost::filesystem::directory_iterator(boostPath)) {
      // Only file names that parse as an integer are considered.
      auto parsed = folly::tryTo<uint64_t>(dirEntry.path().filename().string());
      if (parsed.hasValue()) {
        largestSeen = std::max(largestSeen, InodeNumber{parsed.value()});
      }
    }
  }

  setNextInodeNumber(InodeNumber{largestSeen.get() + 1});
  return largestSeen;
}
/**
 * Write the overlay-relative path for inodeNumber into outPath as a
 * NUL-terminated string of the form "<shard>/<decimal inode number>".
 *
 * @return the length of the path, not counting the terminating NUL
 */
size_t Overlay::getFilePath(InodeNumber inodeNumber, InodePath& outPath) {
  // Two shard characters plus the '/' separator precede the decimal digits.
  constexpr size_t kPrefixLength = 3;

  formatSubdirPath(MutableStringPiece{outPath.data(), 2}, inodeNumber.get());
  outPath[2] = '/';

  const auto digitCount = folly::uint64ToBufferUnsafe(
      inodeNumber.get(), outPath.data() + kPrefixLength);
  const size_t pathLength = digitCount + kPrefixLength;
  DCHECK_LT(pathLength, outPath.size());
  outPath[pathLength] = '\0';
  return pathLength;
}
/**
 * Read and decode the directory stored in the overlay for inodeNumber.
 *
 * @param inodeNumber  inode whose overlay file should be read
 * @param timeStamps   receives the timestamps parsed from the file header
 * @return the decoded directory, or folly::none if no overlay file exists
 *
 * Throws std::system_error if the file cannot be opened or read, or if it is
 * too short to contain a header. Header validation failures from
 * parseHeader() propagate as well.
 */
Optional<overlay::OverlayDir> Overlay::deserializeOverlayDir(
    InodeNumber inodeNumber,
    InodeTimestamps& timeStamps) const {
  InodePath path;
  getFilePath(inodeNumber, path);

  const int fd =
      openat(dirFile_.fd(), path.data(), O_RDWR | O_CLOEXEC | O_NOFOLLOW);
  if (fd == -1) {
    const int openErr = errno;
    if (openErr != ENOENT) {
      folly::throwSystemErrorExplicit(
          openErr,
          "error opening overlay file for inode ",
          inodeNumber,
          " in ",
          localDir_);
    }
    // There is no overlay here
    return folly::none;
  }
  folly::File file{fd, /* ownsFd */ true};

  // Slurp the whole file: header followed by the serialized directory.
  std::string fileData;
  if (!folly::readFile(file.fd(), fileData)) {
    const int readErr = errno;
    if (readErr == ENOENT) {
      // There is no overlay here
      return folly::none;
    }
    folly::throwSystemErrorExplicit(readErr, "failed to read ", path);
  }

  if (fileData.size() < kHeaderLength) {
    // Something is wrong with the file (it may be corrupted).
    folly::throwSystemErrorExplicit(
        EIO,
        "Overlay file ",
        path,
        " is too short for header: size=",
        fileData.size());
  }

  // Validate the header and extract the timestamps, then decode whatever
  // follows it.
  const StringPiece wholeFile{fileData};
  parseHeader(
      wholeFile.subpiece(0, kHeaderLength), kHeaderIdentifierDir, timeStamps);
  return CompactSerializer::deserialize<overlay::OverlayDir>(
      wholeFile.subpiece(kHeaderLength));
}
/**
 * Build the fixed-size header that precedes the contents of every overlay
 * file.
 *
 * Layout: the identifier, the big-endian version, then the atime, ctime and
 * mtime, each as a big-endian 64-bit seconds value followed by a big-endian
 * 64-bit nanoseconds value. The remainder up to kHeaderLength is zero-filled.
 */
std::array<uint8_t, Overlay::kHeaderLength> Overlay::createHeader(
    folly::StringPiece identifier,
    uint32_t version,
    const InodeTimestamps& timestamps) {
  std::array<uint8_t, kHeaderLength> headerStorage;
  // Wrap the array in an IOBuf and reset its length to zero so the Appender
  // starts writing at the first byte.
  IOBuf header{IOBuf::WRAP_BUFFER, folly::MutableByteRange{headerStorage}};
  header.clear();
  folly::io::Appender appender(&header, 0);

  appender.push(identifier);
  appender.writeBE(version);

  const auto atime = timestamps.atime.toTimespec();
  const auto ctime = timestamps.ctime.toTimespec();
  const auto mtime = timestamps.mtime.toTimespec();
  for (const auto* stamp : {&atime, &ctime, &mtime}) {
    appender.writeBE<uint64_t>(stamp->tv_sec);
    appender.writeBE<uint64_t>(stamp->tv_nsec);
  }

  // Zero out whatever space is left so the header is always kHeaderLength.
  const auto remaining = kHeaderLength - header.length();
  appender.ensure(remaining);
  memset(appender.writableData(), 0, remaining);
  appender.append(remaining);

  return headerStorage;
}
// Helper function to open,validate,
// get file pointer of an overlay file
/**
 * Open the overlay file for inodeNumber, validate its header against
 * headerId, and return the open file.
 *
 * @param timeStamps  receives the timestamps stored in the header
 *
 * Throws std::system_error if the file cannot be opened or read; header
 * validation failures from parseHeader() propagate to the caller.
 */
folly::File Overlay::openFile(
    InodeNumber inodeNumber,
    folly::StringPiece headerId,
    InodeTimestamps& timeStamps) {
  auto file = openFileNoVerify(inodeNumber);

  // Only the header is needed here, so read at most kHeaderLength bytes.
  std::string headerBytes;
  const bool readOk = folly::readFile(file.fd(), headerBytes, kHeaderLength);
  if (!readOk) {
    folly::throwSystemErrorExplicit(
        errno,
        "failed to read overlay file for inode ",
        inodeNumber,
        " in ",
        localDir_);
  }

  parseHeader(StringPiece{headerBytes}, headerId, timeStamps);
  return file;
}
/**
 * Open the overlay file for inodeNumber for reading and writing without
 * looking at its header.
 *
 * Throws std::system_error if the file cannot be opened.
 */
folly::File Overlay::openFileNoVerify(InodeNumber inodeNumber) {
  InodePath path;
  getFilePath(inodeNumber, path);

  constexpr int kOpenFlags = O_RDWR | O_CLOEXEC | O_NOFOLLOW;
  const int rawFd = openat(dirFile_.fd(), path.data(), kOpenFlags);
  folly::checkUnixError(
      rawFd,
      "error opening overlay file for inode ",
      inodeNumber,
      " in ",
      localDir_);
  return folly::File{rawFd, /* ownsFd */ true};
}
/**
 * Write the given buffers to the overlay file for inodeNumber and return the
 * open file.
 *
 * The data is first written to "tmp/<inode number>" and then renamed into its
 * final location, so the destination path is only ever replaced as a whole.
 * If writing or renaming fails, the temporary file is unlinked and
 * std::system_error is thrown.
 *
 * @param iov       buffers to write, in order
 * @param iovCount  number of entries in iov
 */
folly::File Overlay::createOverlayFileImpl(
    InodeNumber inodeNumber,
    iovec* iov,
    size_t iovCount) {
  CHECK_LT(inodeNumber.get(), nextInodeNumber_.load(std::memory_order_relaxed))
      << "createOverlayFile called with unallocated inode number";

  // mkstemp() is not used for the temporary file because there is no
  // mkstempat() equivalent that creates files relative to dirFile_. Instead
  // the file gets a fixed name and is created without O_EXCL. This is not a
  // security risk: only the current user should have permission to create
  // files inside the overlay directory, so no one else can plant symlinks
  // there, and the temporary file is opened with O_NOFOLLOW as well.
  //
  // O_TMPFILE followed by linkat() would be an alternative way to commit the
  // file, but it may not be supported by all filesystems and seems to provide
  // minimal benefits for our use case.
  InodePath finalPath;
  getFilePath(inodeNumber, finalPath);

  // It's substantially faster on XFS to create this temporary file in
  // an empty directory and then move it into its destination rather
  // than to create it directly in the subtree.
  constexpr auto tmpPrefix = "tmp/"_sp;
  std::array<char, tmpPrefix.size() + kMaxDecimalInodeNumberLength + 1> tmpPath;
  memcpy(tmpPath.data(), tmpPrefix.data(), tmpPrefix.size());
  const auto digitCount = folly::uint64ToBufferUnsafe(
      inodeNumber.get(), tmpPath.data() + tmpPrefix.size());
  tmpPath[tmpPrefix.size() + digitCount] = '\0';

  const int tmpFD = openat(
      dirFile_.fd(),
      tmpPath.data(),
      O_CREAT | O_RDWR | O_CLOEXEC | O_NOFOLLOW | O_TRUNC,
      0600);
  folly::checkUnixError(
      tmpFD,
      "failed to create temporary overlay file for inode ",
      inodeNumber,
      " in ",
      localDir_);
  folly::File file{tmpFD, /* ownsFd */ true};

  // Until the rename has succeeded, any exit from this function must clean
  // up the temporary file.
  bool committed = false;
  SCOPE_EXIT {
    if (!committed) {
      unlinkat(dirFile_.fd(), tmpPath.data(), 0);
    }
  };

  const auto bytesWritten = folly::writevFull(tmpFD, iov, iovCount);
  folly::checkUnixError(
      bytesWritten,
      "error writing to overlay file for inode ",
      inodeNumber,
      " in ",
      localDir_);

  // Eden used to call fdatasync() here because technically that's required to
  // reliably, atomically write a file. But, per docs/InodeStorage.md, Eden
  // does not claim to handle disk, kernel, or power failure, and fdatasync has
  // a nearly 300 microsecond cost.

  const int renameResult =
      renameat(dirFile_.fd(), tmpPath.data(), dirFile_.fd(), finalPath.data());
  folly::checkUnixError(
      renameResult,
      "error committing overlay file for inode ",
      inodeNumber,
      " in ",
      localDir_);

  // The temporary name no longer exists, so there is nothing left to unlink.
  committed = true;
  return file;
}
remove dep on libfuse Summary: This serves a few purposes: 1. We can avoid some conditional code inside eden if we know that we have a specific fuse_kernel.h header implementation. 2. We don't have to figure out a way to propagate the kernel capabilities through the graceful restart process. 3. libfuse3 removed the channel/session hooks that we've been using thus far to interject ourselves for mounting and graceful restarting, so we were already effectively the walking dead here. 4. We're now able to take advantage of the latest aspects of the fuse kernel interface without being tied to the implementation of libfuse2 or libfuse3. We're interested in the readdirplus functionality and will look at enabling that in a future diff. This may make some things slightly harder for the more immediate macOS port but I believe that we're in a much better place overall. This diff is relatively mechanical and sadly is (unavoidably) large. The main aspects of this diff are: 1. The `fuse_ino_t` type was provided by libfuse so we needed to replace it with our own definition. This has decent penetration throughout the codebase. 2. The confusing `fuse_file_info` type that was multi-purpose and had fields that were sometimes *in* parameters and sometimes *out* parameters has been removed and replaced with a simpler *flags* parameter that corresponds to the `open(2)` flags parameter. The *out* portions are subsumed by existing file handle metadata methods. 3. The fuse parameters returned from variations of the `LOOKUP` opcode now return the fuse kernel type for this directly. I suspect that we may need to introduce a compatibility type when we revisit the macOS port, but this at least makes this diff slightly simpler. You'll notice that some field and symbol name prefixes vary as a result of this. 4. 
Similarly for `setattr`, libfuse separated the kernel data into two parameters that were a little awkward to use; we're now just passing the kernel data through and this, IMO, makes the interface slightly more understandable. 5. The bulk of the code from `Dispatcher.cpp` that shimmed the libfuse callbacks into the C++ virtual methods has been removed and replaced by a `switch` statement based dispatcher in `FuseChannel`. I'm not married to this being `switch` based and may revise this to be driven by an `unordered_map` of opcode -> dispatcher method defined in `FuseChannel`. Regardless, `Dispatcher.cpp` is now much slimmer and should be easier to replace by rolling it together into `EdenDispatcher.cpp` in the future should we desire to do so. 6. This diff disables dispatching `poll` and `ioctl` calls. We didn't make use of them and their interfaces are a bit fiddly. 7. `INTERRUPT` is also disabled here. I will re-enable it in a follow-up diff where I can also revise how we track outstanding requests for graceful shutdown. 8. I've imported `fuse_kernel.h` from libfuse. This is included under the permissive 2-clause BSD license that it allows for exactly this integration purpose. Reviewed By: simpkins Differential Revision: D6576472 fbshipit-source-id: 7cb088af5e06fe27bf22a1bed295c18c17d8006c
2018-01-03 03:25:03 +03:00
/**
 * Create the overlay file for inodeNumber from a single contiguous buffer.
 *
 * The file consists of a file header carrying the timestamps, followed by
 * contents. Returns the open file.
 */
folly::File Overlay::createOverlayFile(
    InodeNumber inodeNumber,
    const InodeTimestamps& timestamps,
    ByteRange contents) {
  auto header = createHeader(kHeaderIdentifierFile, kHeaderVersion, timestamps);

  // Header and body are handed over as two separate buffers so that they do
  // not have to be copied into one.
  std::array<struct iovec, 2> iov{{
      {header.data(), header.size()},
      {const_cast<uint8_t*>(contents.data()), contents.size()},
  }};
  return createOverlayFileImpl(inodeNumber, iov.data(), iov.size());
}
/**
 * Create the overlay file for inodeNumber from an IOBuf, which may be a chain
 * of several buffers.
 *
 * The file consists of a file header carrying the timestamps, followed by the
 * data of every buffer in the chain. Returns the open file.
 */
folly::File Overlay::createOverlayFile(
    InodeNumber inodeNumber,
    const InodeTimestamps& timestamps,
    const IOBuf& contents) {
  // A buffer whose successor is itself is the only element of its chain. That
  // is the common case, and delegating to the ByteRange overload avoids
  // allocating the iovec array on the heap.
  const bool isSingleBuffer = contents.next() == &contents;
  if (isSingleBuffer) {
    return createOverlayFile(
        inodeNumber, timestamps, ByteRange{contents.data(), contents.length()});
  }

  auto header = createHeader(kHeaderIdentifierFile, kHeaderVersion, timestamps);

  // Slot 0 holds the header; the chain's buffers are appended after it.
  fbvector<struct iovec> iov;
  iov.resize(1);
  auto& headerSlot = iov[0];
  headerSlot.iov_base = header.data();
  headerSlot.iov_len = header.size();
  contents.appendToIov(&iov);

  return createOverlayFileImpl(inodeNumber, iov.data(), iov.size());
}
/**
 * Validate an overlay file header and extract the timestamps stored in it.
 *
 * @param header      the raw header bytes
 * @param headerId    the identifier the header is expected to start with
 * @param timestamps  receives the atime, ctime and mtime read from the header
 *
 * Throws std::system_error with code EIO if the identifier or the version
 * does not match what is expected.
 */
void Overlay::parseHeader(
    folly::StringPiece header,
    folly::StringPiece headerId,
    InodeTimestamps& timestamps) {
  IOBuf buf(IOBuf::WRAP_BUFFER, ByteRange{header});
  folly::io::Cursor cursor(&buf);

  // Validate header identifier
  auto id = cursor.readFixedString(kHeaderIdentifierDir.size());
  StringPiece identifier{id};
  if (identifier.compare(headerId) != 0) {
    // throwSystemErrorExplicit() must be used here: throwSystemError() takes
    // its error code from errno and would treat EIO as part of the message.
    folly::throwSystemErrorExplicit(
        EIO,
        "unexpected overlay header identifier : ",
        folly::hexlify(ByteRange{identifier}));
  }

  // Validate header version
  auto version = cursor.readBE<uint32_t>();
  if (version != kHeaderVersion) {
    folly::throwSystemErrorExplicit(
        EIO, "Unexpected overlay version :", version);
  }

  // The three timestamps follow, each stored as big-endian 64-bit seconds and
  // nanoseconds, in the same order createHeader() writes them.
  timespec atime, ctime, mtime;
  atime.tv_sec = cursor.readBE<uint64_t>();
  atime.tv_nsec = cursor.readBE<uint64_t>();
  ctime.tv_sec = cursor.readBE<uint64_t>();
  ctime.tv_nsec = cursor.readBE<uint64_t>();
  mtime.tv_sec = cursor.readBE<uint64_t>();
  mtime.tv_nsec = cursor.readBE<uint64_t>();
  timestamps.atime = atime;
  timestamps.ctime = ctime;
  timestamps.mtime = mtime;
}
// Helper function to update timestamps into overlay file
/**
 * Overwrite the timestamps stored in the header of an open overlay file.
 *
 * @param fd          descriptor of the overlay file, open for writing
 * @param timestamps  the new atime, ctime and mtime
 *
 * Throws std::system_error if the write fails, or with code EIO if fewer
 * bytes than requested were written.
 */
void Overlay::updateTimestampToHeader(
    int fd,
    const InodeTimestamps& timestamps) {
  // Serialize the six 64-bit big-endian fields into a stack buffer, in the
  // same order createHeader() writes them.
  std::array<uint64_t, 6> buf;
  IOBuf iobuf(IOBuf::WRAP_BUFFER, buf.data(), sizeof(buf));
  iobuf.clear();
  folly::io::Appender appender(&iobuf, 0);
  auto atime = timestamps.atime.toTimespec();
  auto ctime = timestamps.ctime.toTimespec();
  auto mtime = timestamps.mtime.toTimespec();
  appender.writeBE<uint64_t>(atime.tv_sec);
  appender.writeBE<uint64_t>(atime.tv_nsec);
  appender.writeBE<uint64_t>(ctime.tv_sec);
  appender.writeBE<uint64_t>(ctime.tv_nsec);
  appender.writeBE<uint64_t>(mtime.tv_sec);
  appender.writeBE<uint64_t>(mtime.tv_nsec);

  // Replace the timestamps of the current header with the new ones. They
  // start right after the identifier and the version number.
  auto newHeader = iobuf.coalesce();
  auto wrote = folly::pwriteNoInt(
      fd,
      newHeader.data(),
      newHeader.size(),
      kHeaderIdentifierDir.size() + sizeof(kHeaderVersion));
  if (wrote == -1) {
    folly::throwSystemError("pwriteNoInt failed");
  }
  if (wrote != static_cast<ssize_t>(newHeader.size())) {
    // A short write does not set errno, so report an explicit error code
    // rather than whatever value errno happens to hold.
    folly::throwSystemErrorExplicit(
        EIO,
        "writeNoInt wrote only ",
        wrote,
        " of ",
        newHeader.size(),
        " bytes");
  }
}
/**
 * Body of the background GC thread.
 *
 * Repeatedly takes the whole batch of queued requests and handles them one by
 * one outside the lock. The thread exits once the queue is empty and the stop
 * flag has been set, so requests queued before the stop are still processed.
 */
void Overlay::gcThread() noexcept {
  for (;;) {
    std::vector<GCRequest> batch;
    {
      auto state = gcQueue_.lock();
      // Sleep until there is work to do or we have been asked to stop.
      while (state->queue.empty()) {
        if (state->stop) {
          return;
        }
        gcCondVar_.wait(state.getUniqueLock());
      }
      batch = std::move(state->queue);
    }

    for (auto& request : batch) {
      try {
        handleGCRequest(request);
      } catch (const std::exception& e) {
        XLOG(ERR) << "handleGCRequest should never throw, but it did: "
                  << e.what();
      }
    }
  }
}
/**
 * Process one GC request.
 *
 * A flush request just fulfills its promise. Any other request carries a
 * directory listing whose descendants are removed from the overlay: files are
 * unlinked directly, while directories are loaded, removed, and their own
 * children processed in turn (breadth first).
 *
 * Failures to load or remove an individual inode are logged and skipped.
 */
void Overlay::handleGCRequest(GCRequest& request) {
  if (request.flush) {
    request.flush->setValue();
    return;
  }

  // Directory inodes still waiting to be visited. Should only include inode
  // numbers for trees.
  std::queue<InodeNumber> pendingTrees;

  // TODO: For better throughput on large tree collections, it might make
  // sense to split this into two threads: one for traversing the tree and
  // another that makes the actual unlink calls.

  // Remove one inode's data, logging rather than propagating any failure.
  auto removeQuietly = [&](InodeNumber inodeNumber) {
    try {
      removeOverlayData(inodeNumber);
    } catch (const std::exception& e) {
      XLOG(ERR) << "Failed to remove overlay data for inode " << inodeNumber
                << ": " << e.what();
    }
  };

  // Queue the subdirectories of a listing and delete its non-directory
  // children.
  auto visitChildren = [&](const overlay::OverlayDir& listing) {
    for (const auto& entry : listing.entries) {
      const auto& child = entry.second;
      if (!child.inodeNumber) {
        // Legacy-only. All new Overlay trees have inode numbers for all
        // children.
        continue;
      }

      const auto childIno = InodeNumber::fromThrift(child.inodeNumber);
      if (S_ISDIR(child.mode)) {
        pendingTrees.push(childIno);
        continue;
      }
      // No need to recurse, but delete any file at this inode. Note that,
      // under normal operation, there should be nothing at this path
      // because files are only written into the overlay if they're
      // materialized.
      removeQuietly(childIno);
    }
  };

  visitChildren(request.dir);

  while (!pendingTrees.empty()) {
    const auto treeIno = pendingTrees.front();
    pendingTrees.pop();

    Optional<overlay::OverlayDir> loaded;
    try {
      InodeTimestamps unusedTimestamps;
      loaded = deserializeOverlayDir(treeIno, unusedTimestamps);
    } catch (const std::exception& e) {
      XLOG(ERR) << "While collecting, failed to load tree data for inode "
                << treeIno << ": " << e.what();
      continue;
    }
    if (!loaded.hasValue()) {
      XLOG(DBG3) << "no dir data for inode " << treeIno;
      continue;
    }

    removeQuietly(treeIno);
    visitChildren(*loaded);
  }
}
} // namespace eden
} // namespace facebook