implement an InodeTable for storing persistent per-inode info decoupled from memory

Summary:
Adds InodeTable, a persistent (but notably non-durable) mapping from inode
number to a fixed-size record stored in a memory-mapped file.  The two
primary goals here are:

1. efficiently (and lazily) reify timestamps for inodes that aren't in the overlay
2. allow the kernel's page cache to drop pages under memory pressure

Reviewed By: simpkins

Differential Revision: D6877361

fbshipit-source-id: a4366b12e21e2bf483c83069cd93ef150829b2ac
This commit is contained in:
Chad Austin 2018-05-09 16:20:46 -07:00 committed by Facebook Github Bot
parent d9a6089dd5
commit fd20487c7b
3 changed files with 494 additions and 0 deletions

View File

@ -0,0 +1,50 @@
/*
* Copyright (c) 2018-present, Facebook, Inc.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree. An additional grant
* of patent rights can be found in the PATENTS file in the same directory.
*
*/
#pragma once
#include <sys/stat.h>
#include "eden/fs/inodes/InodeTimestamps.h"
namespace facebook {
namespace eden {
/**
 * Per-inode attributes that need to survive across runs, packed into a
 * fixed-size record.
 *
 * Warning: InodeTable serializes this structure directly to disk. The order,
 * sizes, and meanings of the fields must not change. To evolve the format,
 * rename this struct, create a new InodeMetadata struct with the next VERSION
 * value, give it an explicit constructor from the old version, and add the old
 * version to the InodeMetadataTable typedef in InodeTable.h.
 */
struct InodeMetadata {
  enum { VERSION = 0 };

  // Serialized fields. Their declaration order is part of the on-disk format.
  mode_t mode = 0;
  uid_t uid = 0;
  gid_t gid = 0;
  InodeTimestamps timestamps;

  // Other potential things to include:
  // nlink_t nlinks;
  // dev_t rdev;
  // creation time

  /// Leaves mode, uid and gid at zero and default-initializes the timestamps.
  InodeMetadata() = default;

  /// Builds a record from explicit mode, owner, group and timestamp values.
  explicit InodeMetadata(
      mode_t modeBits,
      uid_t owner,
      gid_t group,
      const InodeTimestamps& times) noexcept
      : mode(modeBits), uid(owner), gid(group), timestamps(times) {}
};
} // namespace eden
} // namespace facebook

276
eden/fs/inodes/InodeTable.h Normal file
View File

@ -0,0 +1,276 @@
/*
* Copyright (c) 2018-present, Facebook, Inc.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree. An additional grant
* of patent rights can be found in the PATENTS file in the same directory.
*
*/
#pragma once
#include "eden/fs/fuse/FuseTypes.h"
#include "eden/fs/inodes/InodeMetadata.h"
#include "eden/fs/utils/Bug.h"
#include "eden/fs/utils/MappedDiskVector.h"
namespace facebook {
namespace eden {
namespace detail {
template <typename Record>
struct InodeTableEntry {
enum { VERSION = Record::VERSION };
InodeTableEntry() = delete;
InodeTableEntry(InodeNumber ino, const Record& rec)
: inode{ino}, record{rec} {}
/// Conversion from old versions.
template <typename OldRecord>
explicit InodeTableEntry(const InodeTableEntry<OldRecord>& old)
: inode{old.inode}, record{old.record} {}
// WARNING: this data structure is serialized directly to disk via
// MappedDiskVector. Do not change the order, set, or types of fields. We
// could, if we want to change Entry itself, coopt high bits of VERSION and
// modify MappedDiskVector to allow direct upgrades rather than linear
// upgrades.
InodeNumber inode;
// TODO: should we maintain a 64-bit SpookyHashV2 checksum to ignore
// corrupted entries?
Record record;
};
} // namespace detail
/**
 * InodeTable is an efficient storage engine for fixed-size inode records.
 * It is intended for timestamps and mode bits (and any additional fixed-size
 * per-inode state.)
 *
 * The data is stored in a memory-mapped file and flushed to disk on occasion.
 * Durability on kernel or disk shutdown is not a primary goal. Timestamps and
 * permission bits are easy enough to fix and uncommitted changes are
 * short-lived, and the kernel will flush dirty pages if the process is killed.
 *
 * Rather than using a free list, upon removal of an entry, the last entry is
 * moved to the removed entry's index.
 *
 * The locking strategy is as follows:
 *
 * The index from inode number to record index is wrapped in a SharedMutex.
 * Most accesses will only take a reader lock unless a new entry is added or
 * an inode number is removed.
 *
 * The contents of each record are protected by the FileInode and
 * TreeInode's locks.
 */
template <typename Record>
class InodeTable {
 public:
  using Entry = detail::InodeTableEntry<Record>;

  // Instances are only created through open() and are neither copyable nor
  // movable.
  InodeTable() = delete;
  InodeTable(const InodeTable&) = delete;
  InodeTable(InodeTable&&) = delete;
  InodeTable& operator=(const InodeTable&) = delete;
  InodeTable& operator=(InodeTable&&) = delete;

  /**
   * Create or open an InodeTable at the specified path.
   *
   * OldRecords lists previous record formats. Each one is wrapped in
   * InodeTableEntry and forwarded to MappedDiskVector::open, which is
   * presumably what performs the upgrade from an older on-disk version.
   */
  template <typename... OldRecords>
  static std::unique_ptr<InodeTable> open(folly::StringPiece path) {
    // The constructor is private, so std::make_unique cannot be used here.
    return std::unique_ptr<InodeTable>{
        new InodeTable{MappedDiskVector<Entry>::template open<
            detail::InodeTableEntry<OldRecords>...>(path, true)}};
  }

  /**
   * If no value is stored for this inode, assigns one.
   *
   * Returns the record stored for the inode once the call completes: the
   * pre-existing record if there was one, otherwise a copy of `record`.
   */
  Record setDefault(InodeNumber ino, const Record& record) {
    // The lookup runs under an upgrade lock, which is only promoted to a
    // write lock when an entry actually has to be appended.
    return state_.withULockPtr([&](auto&& ulock) {
      const auto& indices = ulock->indices;
      auto iter = indices.find(ino);
      if (iter != indices.end()) {
        return ulock->storage[iter->second].record;
      } else {
        auto wlock = ulock.moveFromUpgradeToWrite();
        size_t index = wlock->storage.size();
        wlock->storage.emplace_back(ino, record);
        wlock->indices.emplace(ino, index);
        return wlock->storage[index].record;
      }
    });
  }

  /**
   * If no value is stored for this inode, calls a function to populate its
   * initial data. This is more efficient than setDefault when computing the
   * default value is nontrivial.
   *
   * Note that the callback is run while the table's locks are held. Don't
   * call any other InodeTable methods from it.
   */
  template <typename PopFn>
  void populateIfNotSet(InodeNumber ino, PopFn&& pop) {
    state_.withULockPtr([&](auto&& ulock) {
      const auto& indices = ulock->indices;
      if (indices.find(ino) != indices.end()) {
        // Already populated; pop() is never invoked.
        return;
      }
      auto wlock = ulock.moveFromUpgradeToWrite();
      size_t index = wlock->storage.size();
      wlock->storage.emplace_back(ino, pop());
      wlock->indices.emplace(ino, index);
    });
  }

  /**
   * Assign or overwrite a value for this inode.
   */
  void set(InodeNumber ino, const Record& record) {
    state_.withWLock([&](auto& state) {
      auto iter = state.indices.find(ino);
      if (iter != state.indices.end()) {
        const size_t index = iter->second;
        // The index must point back at an entry for the same inode.
        DCHECK(ino == state.storage[index].inode);
        state.storage[index].record = record;
      } else {
        const size_t index = state.storage.size();
        state.storage.emplace_back(ino, record);
        state.indices.emplace(ino, index);
      }
    });
  }

  /**
   * If a value is present for the given inode, returns it. Otherwise, throws
   * std::out_of_range.
   */
  Record getOrThrow(InodeNumber ino) {
    auto rv = getOptional(ino);
    if (rv) {
      return *rv;
    } else {
      throw std::out_of_range(
          folly::to<std::string>("no entry in InodeTable for inode ", ino));
    }
  }

  /**
   * If the table has an entry for this inode, returns it. Otherwise, returns
   * folly::none.
   */
  folly::Optional<Record> getOptional(InodeNumber ino) {
    return state_.withRLock([&](const auto& state) -> folly::Optional<Record> {
      auto iter = state.indices.find(ino);
      if (iter == state.indices.end()) {
        return folly::none;
      } else {
        auto index = iter->second;
        CHECK_LT(index, state.storage.size());
        return state.storage[index].record;
      }
    });
  }

  /**
   * Calls a function that can modify the data at the given InodeNumber. Throws
   * std::out_of_range if there is no record.
   *
   * `fn` receives a mutable reference to the stored Record. The return value
   * is a copy of the record as it stands after `fn` has run.
   *
   * Note that the callback is run while the table's locks are held. Don't
   * call any other InodeTable methods from it.
   */
  template <typename ModFn>
  Record modifyOrThrow(InodeNumber ino, ModFn&& fn) {
    // The state must be bound as a mutable reference: the callback is meant
    // to edit the record in place.
    return state_.withWLock([&](auto& state) {
      auto iter = state.indices.find(ino);
      if (iter == state.indices.end()) {
        throw std::out_of_range(
            folly::to<std::string>("no entry in InodeTable for inode ", ino));
      }
      auto index = iter->second;
      CHECK_LT(index, state.storage.size());
      // Hand the callback the Record, not the enclosing Entry, so it cannot
      // touch the inode number the index depends on.
      Record& record = state.storage[index].record;
      fn(record);
      // TODO: maybe trigger a background msync
      return record;
    });
  }

  // TODO: replace with freeInodes - it's much more efficient to free a bunch
  // at once.
  /**
   * Removes the entry for the given inode. The last entry in storage is moved
   * into the vacated slot so that storage stays dense.
   */
  void freeInode(InodeNumber ino) {
    state_.withWLock([&](auto& state) {
      auto& storage = state.storage;
      auto& indices = state.indices;
      auto iter = indices.find(ino);
      if (iter == indices.end()) {
        // NOTE(review): the code below dereferences iter, so this relies on
        // EDEN_BUG not returning control here - confirm.
        EDEN_BUG() << "tried to deallocate unknown (or already freed) inode";
      }
      size_t indexToDelete = iter->second;
      indices.erase(iter);
      DCHECK_GT(storage.size(), 0);
      size_t lastIndex = storage.size() - 1;
      if (lastIndex != indexToDelete) {
        // Fill the hole with the last entry and repoint its index.
        auto lastInode = storage[lastIndex].inode;
        storage[indexToDelete] = storage[lastIndex];
        indices[lastInode] = indexToDelete;
      }
      storage.pop_back();
    });
  }

 private:
  explicit InodeTable(MappedDiskVector<Entry>&& storage)
      : state_{folly::in_place, std::move(storage)} {}

  struct State {
    /**
     * Takes ownership of the stored entries and rebuilds the in-memory index
     * from them.
     */
    explicit State(MappedDiskVector<Entry>&& mdv) : storage{std::move(mdv)} {
      for (size_t i = 0; i < storage.size(); ++i) {
        const Entry& entry = storage[i];
        auto ret = indices.insert({entry.inode, i});
        if (!ret.second) {
          // The first occurrence stays indexed. The later duplicate is left
          // in storage but is not reachable through the index.
          XLOG(WARNING) << "Duplicate records for the same inode: indices "
                        << ret.first->second << " and " << i;
        }
      }
    }

    /**
     * Holds the actual records, indexed by the values in indices. The
     * records are stored densely. Freeing an inode moves the last entry into
     * the newly-freed hole.
     */
    MappedDiskVector<Entry> storage;

    /// Maintains an index from inode number to index in storage.
    std::unordered_map<InodeNumber, size_t> indices;
  };

  folly::Synchronized<State> state_;
};
// InodeMetadata is serialized directly to disk via InodeTable, so its size is
// pinned here: the build fails if sizeof(InodeMetadata) is ever not 40 bytes.
static_assert(
sizeof(InodeMetadata) == 40,
"Don't change InodeMetadata without implementing a migration path");
/// The InodeTable instantiation that stores InodeMetadata records.
using InodeMetadataTable = InodeTable<InodeMetadata>;
} // namespace eden
} // namespace facebook

View File

@ -0,0 +1,168 @@
/*
* Copyright (c) 2018-present, Facebook, Inc.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree. An additional grant
* of patent rights can be found in the PATENTS file in the same directory.
*
*/
#include "eden/fs/inodes/InodeTable.h"
#include <folly/chrono/Conv.h>
#include <folly/experimental/TestUtil.h>
#include <folly/test/TestUtils.h>
#include <gtest/gtest.h>
using namespace facebook::eden;
using std::chrono::system_clock;
namespace {
struct InodeTableTest : public ::testing::Test {
InodeTableTest()
: tmpDir{"eden_inodetable_"},
tablePath{(tmpDir.path() / "test.inodes").string()} {}
folly::test::TemporaryDirectory tmpDir;
std::string tablePath;
};
/**
 * Minimal record type wrapping a single int. It converts implicitly to and
 * from int so tests can pass and compare plain integer literals.
 */
struct Int {
  enum { VERSION = 0 };

  int value;

  /* implicit */ Int(int v) : value{v} {}

  operator int() const {
    return value;
  }
};
} // namespace
// A record written through one table instance must be readable from a second
// instance opened on the same path after the first has been destroyed.
TEST_F(InodeTableTest, persists_record) {
  {
    auto writer = InodeTable<Int>::open(tablePath);
    writer->set(10_ino, 15);
  }

  auto reader = InodeTable<Int>::open(tablePath);
  EXPECT_EQ(15, reader->getOrThrow(10_ino));
}
namespace {
// Record with a single 64-bit field.
struct Small {
enum { VERSION = 0 };
uint64_t x;
};
// Record that reuses Small's VERSION value (0) but has a second 64-bit field,
// so the two differ in size while claiming the same version.
struct Large {
enum { VERSION = 0 };
uint64_t x;
uint64_t y;
};
} // namespace
// Opening a file written with Small records as a table of Large records
// (same VERSION, different size, no migration listed) must throw.
TEST_F(InodeTableTest, fails_to_load_if_record_changes_size_without_migration) {
  {
    auto smallTable = InodeTable<Small>::open(tablePath);
    smallTable->set(1_ino, {1});
  }

  ASSERT_THROW({ InodeTable<Large>::open(tablePath); }, std::runtime_error);
}
namespace {

/// Version 0 record: two 32-bit fields.
struct OldRecord {
  enum { VERSION = 0 };

  uint32_t x;
  uint32_t y;
};

/// Version 1 record: wider fields plus a third value derived from the old
/// record during conversion.
struct NewRecord {
  enum { VERSION = 1 };

  uint64_t x;
  uint64_t y;
  uint64_t z;

  /// Upgrade from OldRecord: x and y are carried over, z is the sum of the
  /// old x and y (computed on the 32-bit values before widening).
  explicit NewRecord(const OldRecord& prev)
      : x{prev.x}, y{prev.y}, z{prev.x + prev.y} {}
};

} // namespace
// Data written as OldRecord must be readable as NewRecord when OldRecord is
// listed as a previous format, with each record converted through NewRecord's
// constructor.
TEST_F(InodeTableTest, migrate_from_one_record_format_to_another) {
  {
    auto oldTable = InodeTable<OldRecord>::open(tablePath);
    oldTable->set(1_ino, {11, 22});
    oldTable->set(2_ino, {100, 200});
  }

  auto newTable = InodeTable<NewRecord>::open<OldRecord>(tablePath);
  const auto first = newTable->getOrThrow(1_ino);
  const auto second = newTable->getOrThrow(2_ino);

  EXPECT_EQ(11, first.x);
  EXPECT_EQ(22, first.y);
  EXPECT_EQ(33, first.z);

  EXPECT_EQ(100, second.x);
  EXPECT_EQ(200, second.y);
  EXPECT_EQ(300, second.z);
}
namespace {

/// Version 0 record: two 32-bit fields.
struct OldVersion {
  enum { VERSION = 0 };

  uint32_t x;
  uint32_t y;
};

/// Version 1 record with the same field types as OldVersion but different
/// contents: x holds the old sum and y the old difference.
struct NewVersion {
  enum { VERSION = 1 };

  uint32_t x;
  uint32_t y;

  /// Upgrade from OldVersion.
  explicit NewVersion(const OldVersion& prev)
      : x{prev.x + prev.y}, y{prev.x - prev.y} {}
};

} // namespace
// Migration must also run when the old and new record types have the same
// field types: the values read back are the converted ones, not the raw bytes.
TEST_F(
    InodeTableTest,
    migrate_from_one_record_format_to_another_even_if_same_size) {
  {
    auto oldTable = InodeTable<OldVersion>::open(tablePath);
    oldTable->set(1_ino, {7, 3});
    oldTable->set(2_ino, {60, 40});
  }

  auto newTable = InodeTable<NewVersion>::open<OldVersion>(tablePath);
  const auto first = newTable->getOrThrow(1_ino);
  const auto second = newTable->getOrThrow(2_ino);

  EXPECT_EQ(10, first.x);
  EXPECT_EQ(4, first.y);

  EXPECT_EQ(100, second.x);
  EXPECT_EQ(20, second.y);
}
// populateIfNotSet must leave an existing record untouched and must store the
// callback's value for an inode that has no record yet.
TEST_F(InodeTableTest, populateIfNotSet) {
  auto table = InodeTable<Int>::open(tablePath);
  table->set(1_ino, 15);

  // Inode 1 already has a value, inode 2 does not.
  table->populateIfNotSet(1_ino, [&] { return 100; });
  table->populateIfNotSet(2_ino, [&] { return 101; });

  EXPECT_EQ(15, table->getOrThrow(1_ino));
  EXPECT_EQ(101, table->getOrThrow(2_ino));
}
// TODO: add tests for the remaining InodeTable operations:
// TEST_F(InodeTableTest, setDefault) {}
// TEST_F(InodeTableTest, set) {}
// TEST_F(InodeTableTest, getOrThrow) {}
// TEST_F(InodeTableTest, getOptional) {}
// TEST_F(InodeTableTest, modifyOrThrow) {}
// TEST_F(InodeTableTest, freeInode) {}