mirror of
https://github.com/facebook/sapling.git
synced 2024-10-07 07:17:55 +03:00
implement an InodeTable for storing persistent per-inode info decoupled from memory
Summary: A persistent (but notably non-durable) mapping from inode number to a fixed-size record stored in a memory-mapped file. The two primary goals here are: (1) efficiently (and lazily) reify timestamps for inodes that aren't in the overlay; (2) allow the kernel's page cache to drop pages under memory pressure. Reviewed By: simpkins. Differential Revision: D6877361. fbshipit-source-id: a4366b12e21e2bf483c83069cd93ef150829b2ac
This commit is contained in:
parent
d9a6089dd5
commit
fd20487c7b
50
eden/fs/inodes/InodeMetadata.h
Normal file
50
eden/fs/inodes/InodeMetadata.h
Normal file
@ -0,0 +1,50 @@
|
||||
/*
|
||||
* Copyright (c) 2018-present, Facebook, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* This source code is licensed under the BSD-style license found in the
|
||||
* LICENSE file in the root directory of this source tree. An additional grant
|
||||
* of patent rights can be found in the PATENTS file in the same directory.
|
||||
*
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
#include <sys/stat.h>
|
||||
#include "eden/fs/inodes/InodeTimestamps.h"
|
||||
|
||||
namespace facebook {
|
||||
namespace eden {
|
||||
|
||||
/**
|
||||
* Fixed-size structure of per-inode bits that should be persisted across runs.
|
||||
*
|
||||
* Warning: This data structure is serialized directly to disk via InodeTable.
|
||||
* Do not change the order, sizes, or meanings of the fields. Instead, rename
|
||||
* this struct, create a new InodeMetadata struct with the next VERSION value,
|
||||
* add an explicit constructor from the old version, and add the old version to
|
||||
* the InodeMetadataTable typedef in InodeTable.h.
|
||||
*/
|
||||
struct InodeMetadata {
|
||||
enum { VERSION = 0 };
|
||||
|
||||
InodeMetadata() = default;
|
||||
|
||||
explicit InodeMetadata(
|
||||
mode_t m,
|
||||
uid_t u,
|
||||
gid_t g,
|
||||
const InodeTimestamps& ts) noexcept
|
||||
: mode{m}, uid{u}, gid{g}, timestamps{ts} {}
|
||||
|
||||
mode_t mode{0};
|
||||
uid_t uid{0};
|
||||
gid_t gid{0};
|
||||
InodeTimestamps timestamps;
|
||||
|
||||
// Other potential things to include:
|
||||
// nlink_t nlinks;
|
||||
// dev_t rdev;
|
||||
// creation time
|
||||
};
|
||||
} // namespace eden
|
||||
} // namespace facebook
|
276
eden/fs/inodes/InodeTable.h
Normal file
276
eden/fs/inodes/InodeTable.h
Normal file
@ -0,0 +1,276 @@
|
||||
/*
|
||||
* Copyright (c) 2018-present, Facebook, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* This source code is licensed under the BSD-style license found in the
|
||||
* LICENSE file in the root directory of this source tree. An additional grant
|
||||
* of patent rights can be found in the PATENTS file in the same directory.
|
||||
*
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
#include "eden/fs/fuse/FuseTypes.h"
|
||||
#include "eden/fs/inodes/InodeMetadata.h"
|
||||
#include "eden/fs/utils/Bug.h"
|
||||
#include "eden/fs/utils/MappedDiskVector.h"
|
||||
|
||||
namespace facebook {
|
||||
namespace eden {
|
||||
|
||||
namespace detail {
|
||||
template <typename Record>
|
||||
struct InodeTableEntry {
|
||||
enum { VERSION = Record::VERSION };
|
||||
|
||||
InodeTableEntry() = delete;
|
||||
InodeTableEntry(InodeNumber ino, const Record& rec)
|
||||
: inode{ino}, record{rec} {}
|
||||
|
||||
/// Conversion from old versions.
|
||||
template <typename OldRecord>
|
||||
explicit InodeTableEntry(const InodeTableEntry<OldRecord>& old)
|
||||
: inode{old.inode}, record{old.record} {}
|
||||
|
||||
// WARNING: this data structure is serialized directly to disk via
|
||||
// MappedDiskVector. Do not change the order, set, or types of fields. We
|
||||
// could, if we want to change Entry itself, coopt high bits of VERSION and
|
||||
// modify MappedDiskVector to allow direct upgrades rather than linear
|
||||
// upgrades.
|
||||
InodeNumber inode;
|
||||
// TODO: should we maintain a 64-bit SpookyHashV2 checksum to ignore
|
||||
// corrupted entries?
|
||||
Record record;
|
||||
};
|
||||
} // namespace detail
|
||||
|
||||
/**
|
||||
* InodeTable is an efficient storage engine for fixed-size inode records.
|
||||
* It is intended for timestamps and mode bits (and any additional fixed-size
|
||||
* per-inode state.)
|
||||
*
|
||||
* The data is stored in a memory-mapped file and flushed to disk on occasion.
|
||||
* Durability on kernel or disk shutdown is not a primary goal. Timestamps and
|
||||
* permission bits are easy enough to fix and uncommitted changes are
|
||||
* short-lived, and the kernel will flush dirty pages if the process is killed.
|
||||
*
|
||||
* Rather than using a free list, upon removal of an entry, the last entry is
|
||||
* moved to the removed entry's index.
|
||||
*
|
||||
* The locking strategy is as follows:
|
||||
*
|
||||
* The index from inode number to record index is wrapped in a SharedMutex.
|
||||
* Most accesses will only take a reader lock unless a new entry is added or
|
||||
* an inode number is removed.
|
||||
*
|
||||
* The contents of each record itself is protected by the FileInode and
|
||||
* TreeInode's locks.
|
||||
*/
|
||||
template <typename Record>
|
||||
class InodeTable {
|
||||
public:
|
||||
using Entry = detail::InodeTableEntry<Record>;
|
||||
|
||||
InodeTable() = delete;
|
||||
InodeTable(const InodeTable&) = delete;
|
||||
InodeTable(InodeTable&&) = delete;
|
||||
|
||||
InodeTable& operator=(const InodeTable&) = delete;
|
||||
InodeTable& operator=(InodeTable&&) = delete;
|
||||
|
||||
/**
|
||||
* Create or open an InodeTable at the specified path.
|
||||
*/
|
||||
template <typename... OldRecords>
|
||||
static std::unique_ptr<InodeTable> open(folly::StringPiece path) {
|
||||
return std::unique_ptr<InodeTable>{
|
||||
new InodeTable{MappedDiskVector<Entry>::template open<
|
||||
detail::InodeTableEntry<OldRecords>...>(path, true)}};
|
||||
}
|
||||
|
||||
/**
|
||||
* If no value is stored for this inode, assigns one. Returns the new value,
|
||||
* whether it was set to the default or not.
|
||||
*/
|
||||
Record setDefault(InodeNumber ino, const Record& record) {
|
||||
return state_.withULockPtr([&](auto&& ulock) {
|
||||
const auto& indices = ulock->indices;
|
||||
auto iter = indices.find(ino);
|
||||
if (iter != indices.end()) {
|
||||
return ulock->storage[iter->second].record;
|
||||
} else {
|
||||
auto wlock = ulock.moveFromUpgradeToWrite();
|
||||
|
||||
size_t index = wlock->storage.size();
|
||||
wlock->storage.emplace_back(ino, record);
|
||||
wlock->indices.emplace(ino, index);
|
||||
return wlock->storage[index].record;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* If no value is stored for this inode, calls a function to populate its
|
||||
* initial data. This is more efficient than setDefault when computing the
|
||||
* default value is nontrivial.
|
||||
*
|
||||
* Note that the callback is run while the table's locks are held. Don't
|
||||
* call any other InodeTable methods from it.
|
||||
*/
|
||||
template <typename PopFn>
|
||||
void populateIfNotSet(InodeNumber ino, PopFn&& pop) {
|
||||
return state_.withULockPtr([&](auto&& ulock) {
|
||||
const auto& indices = ulock->indices;
|
||||
auto iter = indices.find(ino);
|
||||
if (iter != indices.end()) {
|
||||
return;
|
||||
} else {
|
||||
auto wlock = ulock.moveFromUpgradeToWrite();
|
||||
|
||||
size_t index = wlock->storage.size();
|
||||
wlock->storage.emplace_back(ino, pop());
|
||||
wlock->indices.emplace(ino, index);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Assign or overwrite a value for this inode.
|
||||
*/
|
||||
void set(InodeNumber ino, const Record& record) {
|
||||
return state_.withWLock([&](auto& state) {
|
||||
const auto& indices = state.indices;
|
||||
auto iter = indices.find(ino);
|
||||
size_t index;
|
||||
if (iter != indices.end()) {
|
||||
index = iter->second;
|
||||
assert(ino == state.storage[index].inode);
|
||||
state.storage[index].record = record;
|
||||
} else {
|
||||
index = state.storage.size();
|
||||
state.storage.emplace_back(ino, record);
|
||||
state.indices.emplace(ino, index);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* If a value is present for the given inode, returns it. Otherwise, throws
|
||||
* std::out_of_range.
|
||||
*/
|
||||
Record getOrThrow(InodeNumber ino) {
|
||||
auto rv = getOptional(ino);
|
||||
if (rv) {
|
||||
return *rv;
|
||||
} else {
|
||||
throw std::out_of_range(
|
||||
folly::to<std::string>("no entry in InodeTable for inode ", ino));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* If the table has an entry for this inode, returns it. Otherwise, returns
|
||||
* folly::none.
|
||||
*/
|
||||
folly::Optional<Record> getOptional(InodeNumber ino) {
|
||||
return state_.withRLock([&](const auto& state) -> folly::Optional<Record> {
|
||||
auto iter = state.indices.find(ino);
|
||||
if (iter == state.indices.end()) {
|
||||
return folly::none;
|
||||
} else {
|
||||
auto index = iter->second;
|
||||
CHECK_LT(index, state.storage.size());
|
||||
return state.storage[index].record;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Calls a function that can modify the data at the given InodeNumber. Throws
|
||||
* std::out_of_range if there is no record.
|
||||
*
|
||||
* Note that the callback is run while the table's locks are held. Don't
|
||||
* call any other InodeTable methods from it.
|
||||
*/
|
||||
template <typename ModFn>
|
||||
Record modifyOrThrow(InodeNumber ino, ModFn&& fn) {
|
||||
return state_.withWLock([&](const auto& state) {
|
||||
auto iter = state.indices.find(ino);
|
||||
if (iter == state.indices.end()) {
|
||||
throw std::out_of_range(
|
||||
folly::to<std::string>("no entry in InodeTable for inode ", ino));
|
||||
}
|
||||
auto index = iter->second;
|
||||
CHECK_LT(index, state.storage.size());
|
||||
fn(state.storage[index]);
|
||||
// TODO: maybe trigger a background msync
|
||||
return state.storage[index];
|
||||
});
|
||||
}
|
||||
|
||||
// TODO: replace with freeInodes - it's much more efficient to free a bunch
|
||||
// at once.
|
||||
void freeInode(InodeNumber ino) {
|
||||
state_.withWLock([&](auto& state) {
|
||||
auto& storage = state.storage;
|
||||
auto& indices = state.indices;
|
||||
|
||||
auto iter = indices.find(ino);
|
||||
if (iter == indices.end()) {
|
||||
EDEN_BUG() << "tried to deallocate unknown (or already freed) inode";
|
||||
}
|
||||
|
||||
size_t indexToDelete = iter->second;
|
||||
indices.erase(iter);
|
||||
|
||||
DCHECK_GT(storage.size(), 0);
|
||||
size_t lastIndex = storage.size() - 1;
|
||||
|
||||
if (lastIndex != indexToDelete) {
|
||||
auto lastInode = storage[lastIndex].inode;
|
||||
storage[indexToDelete] = storage[lastIndex];
|
||||
indices[lastInode] = indexToDelete;
|
||||
}
|
||||
|
||||
storage.pop_back();
|
||||
});
|
||||
}
|
||||
|
||||
private:
|
||||
explicit InodeTable(MappedDiskVector<Entry>&& storage)
|
||||
: state_{folly::in_place, std::move(storage)} {}
|
||||
|
||||
struct State {
|
||||
State(MappedDiskVector<Entry>&& mdv) : storage{std::move(mdv)} {
|
||||
for (size_t i = 0; i < storage.size(); ++i) {
|
||||
const Entry& entry = storage[i];
|
||||
auto ret = indices.insert({entry.inode, i});
|
||||
if (!ret.second) {
|
||||
XLOG(WARNING) << "Duplicate records for the same inode: indices "
|
||||
<< indices[entry.inode] << " and " << i;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Holds the actual records, indexed by the values in indices_. The
|
||||
* records are stored densely. Freeing an inode moves the last entry into
|
||||
* the newly-freed hole.
|
||||
*/
|
||||
MappedDiskVector<Entry> storage;
|
||||
|
||||
/// Maintains an index from inode number to index in storage_.
|
||||
std::unordered_map<InodeNumber, size_t> indices;
|
||||
};
|
||||
|
||||
folly::Synchronized<State> state_;
|
||||
};
|
||||
|
||||
static_assert(
|
||||
sizeof(InodeMetadata) == 40,
|
||||
"Don't change InodeMetadata without implementing a migration path");
|
||||
|
||||
using InodeMetadataTable = InodeTable<InodeMetadata>;
|
||||
|
||||
} // namespace eden
|
||||
} // namespace facebook
|
168
eden/fs/inodes/test/InodeTableTest.cpp
Normal file
168
eden/fs/inodes/test/InodeTableTest.cpp
Normal file
@ -0,0 +1,168 @@
|
||||
/*
|
||||
* Copyright (c) 2018-present, Facebook, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* This source code is licensed under the BSD-style license found in the
|
||||
* LICENSE file in the root directory of this source tree. An additional grant
|
||||
* of patent rights can be found in the PATENTS file in the same directory.
|
||||
*
|
||||
*/
|
||||
#include "eden/fs/inodes/InodeTable.h"
|
||||
|
||||
#include <folly/chrono/Conv.h>
|
||||
#include <folly/experimental/TestUtil.h>
|
||||
#include <folly/test/TestUtils.h>
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
using namespace facebook::eden;
|
||||
using std::chrono::system_clock;
|
||||
|
||||
namespace {
|
||||
struct InodeTableTest : public ::testing::Test {
|
||||
InodeTableTest()
|
||||
: tmpDir{"eden_inodetable_"},
|
||||
tablePath{(tmpDir.path() / "test.inodes").string()} {}
|
||||
|
||||
folly::test::TemporaryDirectory tmpDir;
|
||||
std::string tablePath;
|
||||
};
|
||||
|
||||
/**
 * Minimal record type for tests: wraps a single int and converts implicitly
 * to and from int so tests can pass and compare plain integer literals.
 */
struct Int {
  enum { VERSION = 0 };
  /* implicit */ Int(int v) : value(v) {}
  operator int() const {
    return value;
  }

  int value;
};
|
||||
} // namespace
|
||||
|
||||
// A record written through one InodeTable instance must be readable after the
// table is destroyed and reopened from the same path.
TEST_F(InodeTableTest, persists_record) {
  {
    auto inodeTable = InodeTable<Int>::open(tablePath);
    inodeTable->set(10_ino, 15);
  }

  auto inodeTable = InodeTable<Int>::open(tablePath);
  EXPECT_EQ(15, inodeTable->getOrThrow(10_ino));
}
|
||||
|
||||
namespace {
/// One-field record; shares VERSION 0 with Large but has a different size.
struct Small {
  enum { VERSION = 0 };
  uint64_t x;
};
/// Two-field record; shares VERSION 0 with Small but has a different size.
struct Large {
  enum { VERSION = 0 };
  uint64_t x;
  uint64_t y;
};
} // namespace
|
||||
|
||||
// Small and Large declare the same VERSION but differ in size; opening a file
// written with one as the other is expected to throw.
TEST_F(InodeTableTest, fails_to_load_if_record_changes_size_without_migration) {
  {
    auto inodeTable = InodeTable<Small>::open(tablePath);
    inodeTable->set(1_ino, {1});
  }

  ASSERT_THROW({ InodeTable<Large>::open(tablePath); }, std::runtime_error);
}
|
||||
|
||||
namespace {
/// Version 0 record used as the migration source.
struct OldRecord {
  enum { VERSION = 0 };
  uint32_t x;
  uint32_t y;
};

/// Version 1 record: widens x and y to 64 bits and adds z = x + y.
struct NewRecord {
  enum { VERSION = 1 };

  // Widen before adding: summing the two uint32_t fields directly would wrap
  // around in 32 bits before the result is stored in the 64-bit z.
  explicit NewRecord(const OldRecord& old)
      : x{old.x}, y{old.y}, z{static_cast<uint64_t>(old.x) + old.y} {}

  uint64_t x;
  uint64_t y;
  uint64_t z;
};
} // namespace
|
||||
|
||||
// Write entries as OldRecord, then reopen as NewRecord while naming OldRecord
// as a previous version; the values must come back converted through
// NewRecord's constructor.
TEST_F(InodeTableTest, migrate_from_one_record_format_to_another) {
  {
    auto inodeTable = InodeTable<OldRecord>::open(tablePath);
    inodeTable->set(1_ino, {11, 22});
    inodeTable->set(2_ino, {100, 200});
  }

  {
    auto inodeTable = InodeTable<NewRecord>::open<OldRecord>(tablePath);
    auto one = inodeTable->getOrThrow(1_ino);
    auto two = inodeTable->getOrThrow(2_ino);

    EXPECT_EQ(11, one.x);
    EXPECT_EQ(22, one.y);
    EXPECT_EQ(33, one.z);
    EXPECT_EQ(100, two.x);
    EXPECT_EQ(200, two.y);
    EXPECT_EQ(300, two.z);
  }
}
|
||||
|
||||
namespace {
/// Version 0 record used as the migration source.
struct OldVersion {
  enum { VERSION = 0 };
  uint32_t x;
  uint32_t y;
};

/// Version 1 record with the same size as OldVersion but different contents:
/// x holds the old sum and y the old difference.
struct NewVersion {
  enum { VERSION = 1 };

  explicit NewVersion(const OldVersion& old)
      : x{old.x + old.y}, y{old.x - old.y} {}

  uint32_t x;
  uint32_t y;
};
} // namespace
|
||||
|
||||
// OldVersion and NewVersion have the same size, so only the VERSION value
// distinguishes them; the stored values must still be converted on reopen.
TEST_F(
    InodeTableTest,
    migrate_from_one_record_format_to_another_even_if_same_size) {
  {
    auto inodeTable = InodeTable<OldVersion>::open(tablePath);
    inodeTable->set(1_ino, {7, 3});
    inodeTable->set(2_ino, {60, 40});
  }

  {
    auto inodeTable = InodeTable<NewVersion>::open<OldVersion>(tablePath);
    auto one = inodeTable->getOrThrow(1_ino);
    auto two = inodeTable->getOrThrow(2_ino);

    EXPECT_EQ(10, one.x);
    EXPECT_EQ(4, one.y);
    EXPECT_EQ(100, two.x);
    EXPECT_EQ(20, two.y);
  }
}
|
||||
|
||||
// populateIfNotSet must leave an existing record untouched and must store the
// callback's result for an inode that has no record yet.
TEST_F(InodeTableTest, populateIfNotSet) {
  auto inodeTable = InodeTable<Int>::open(tablePath);
  inodeTable->set(1_ino, 15);

  inodeTable->populateIfNotSet(1_ino, [&] { return 100; });
  inodeTable->populateIfNotSet(2_ino, [&] { return 101; });

  EXPECT_EQ(15, inodeTable->getOrThrow(1_ino));
  EXPECT_EQ(101, inodeTable->getOrThrow(2_ino));
}
|
||||
|
||||
// TODO: add tests for the remaining InodeTable operations:
// TEST(INodeTable, setDefault) {}
// TEST(INodeTable, set) {}
// TEST(INodeTable, getOrThrow) {}
// TEST(INodeTable, getOptional) {}
// TEST(INodeTable, modifyOrThrow) {}
// TEST(INodeTable, freeInodes) {}
|
Loading…
Reference in New Issue
Block a user