From 61c02ae9ef02deaabe6a32c92e2beae99f274c72 Mon Sep 17 00:00:00 2001 From: Michael Cuevas Date: Wed, 17 May 2023 17:47:42 -0700 Subject: [PATCH] remove entropy location assumptions from ObjectId hash Summary: Having a strict ObjectID format is quite inconvenient. We will need to introduce a new ObjectID format for Eden x Sparse, so it's in our best interest to remove the ObjectID format restrictions before hand. This will allow us to place the high entropy data (proxy hash in our case) in any location in the ObjectId without causing a ton of hash collisions. This will enable us to introduce FilteredObjectIDs in the form: `` where the `` contains the high entropy bits we need to hash. Reviewed By: xavierd Differential Revision: D45793298 fbshipit-source-id: 77385e32f63d5f3d1fc37b72b9971f5717cbd872 --- eden/fs/model/ObjectId.cpp | 23 ++++-- eden/fs/model/ObjectId.h | 23 ++---- eden/fs/model/test/ObjectIdTest.cpp | 107 ++++++++++++++++++++++++++++ 3 files changed, 130 insertions(+), 23 deletions(-) create mode 100644 eden/fs/model/test/ObjectIdTest.cpp diff --git a/eden/fs/model/ObjectId.cpp b/eden/fs/model/ObjectId.cpp index 1e678e926f..9ac3442676 100644 --- a/eden/fs/model/ObjectId.cpp +++ b/eden/fs/model/ObjectId.cpp @@ -36,12 +36,25 @@ std::string ObjectId::asString() const { } size_t ObjectId::getHashCode() const noexcept { - if (bytes_.size() > sizeof(size_t) + 1) { - size_t ret; - memcpy(&ret, bytes_.data() + 1, sizeof(size_t)); - return ret; + const char* p = bytes_.data(); + size_t n = bytes_.size(); + + if (UNLIKELY(n < sizeof(uint64_t))) { + size_t rv = 0; + memcpy(&rv, p, n); + return rv; } - return std::hash{}(bytes_); + + // unaligned load of tail (requires size_t to be at least 8 bytes wide) + size_t rv; + size_t incrementSize = sizeof(uint64_t); + memcpy(&rv, p + (n - incrementSize), incrementSize); + for (const char* end = p + (n - incrementSize); p < end; p += incrementSize) { + size_t x; + memcpy(&x, p, incrementSize); + rv ^= x; + } + return rv; } ObjectId ObjectId::sha1(const folly::IOBuf& 
buf) { diff --git a/eden/fs/model/ObjectId.h b/eden/fs/model/ObjectId.h index 99c105db3b..f73a4ec45d 100644 --- a/eden/fs/model/ObjectId.h +++ b/eden/fs/model/ObjectId.h @@ -24,10 +24,6 @@ namespace facebook::eden { * Identifies tree and blob objects. * This identifier is a variable length string. * - * NOTE: The hash function assumes that ObjectID are stored in a specific - * format to provide constant time hash functions. The high entropy data must - * be stored in the 2nd to 9th bytes of the ObjectID. This property must be - * respected for new ObjectID types. */ class ObjectId { public: @@ -101,16 +97,11 @@ class ObjectId { /** * Computes a hash for this ObjectID. * - * ObjectID are currently of 2 forms: - * - <20-byte hash> - * - <1-byte type><20-bytes hash> - * - <20-byte-hash><8-byte size> - * - * With this, we can compute a hash code by simply returning part of the - * already stored hash. The implementation currently returns the bytes at - * [1,9]. - * - * Smaller ObjectID will use `std::hash`. + * Short ObjectIDs hash to themselves as we assume the ObjectID itself has + * high entropy. Long ObjectIDs are hashed by mixing the bits of the ID + * together with a XOR operation. This is okay since we assume at least one + * eight byte range in the ObjectID has high entropy and XORing with that + * range will give us a decent hash. */ size_t getHashCode() const noexcept; @@ -207,10 +198,6 @@ class ObjectIdCodec { /** * Parse the string as an ObjectId. - * - * Note to implementer: ObjectId::getHashCode is very specific as to how it - * expects ObjectId to be layed out in memory. Make sure to respect this - * layout. 
*/ virtual ObjectId parseObjectId(folly::StringPiece objectId) = 0; virtual std::string renderObjectId(const ObjectId& objectId) = 0; diff --git a/eden/fs/model/test/ObjectIdTest.cpp b/eden/fs/model/test/ObjectIdTest.cpp new file mode 100644 index 0000000000..00ff47f0fb --- /dev/null +++ b/eden/fs/model/test/ObjectIdTest.cpp @@ -0,0 +1,107 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This software may be used and distributed according to the terms of the + * GNU General Public License version 2. + */ + +#include "eden/fs/model/ObjectId.h" + +#include +#include +#include + +namespace { + +using namespace facebook::eden; +using folly::ByteRange; + +TEST(ObjectId, testHashCodeExact) { + auto bytes = folly::make_array( + 0x00, 0x00, 0xff, 0xff, 0x00, 0x00, 0xff, 0xff); + auto byteRange = folly::ByteRange(bytes.data(), bytes.size()); + auto exactObjectId = ObjectId(byteRange); + auto hashCode = exactObjectId.getHashCode(); + EXPECT_EQ(hashCode, folly::Endian::big(0x0000ffff0000ffff)); +} + +TEST(ObjectId, testHashCodeShort) { + auto bytes = folly::make_array(0x00, 0xff); + auto byteRange = folly::ByteRange(bytes.data(), bytes.size()); + auto exactObjectId = ObjectId(byteRange); + auto hashCode = exactObjectId.getHashCode(); + // Expected value assumes a little-endian machine; fails on big-endian. 
+ EXPECT_EQ(hashCode, 0xff00); +} + +TEST(ObjectId, testHashCodeLong) { + auto bytes = folly::make_array( + // 0x01, 0x02 and 0x04 groups XOR to 0x07 in every byte + 0x01, + 0x01, + 0x01, + 0x01, + + 0x01, + 0x01, + 0x01, + 0x01, + + 0x02, + 0x02, + 0x02, + 0x02, + + 0x02, + 0x02, + 0x02, + 0x02, + + 0x04, + 0x04, + 0x04, + 0x04, + + 0x04, + 0x04, + 0x04, + 0x04); + auto byteRange = folly::ByteRange(bytes.data(), bytes.size()); + auto exactObjectId = ObjectId(byteRange); + auto hashCode = exactObjectId.getHashCode(); + EXPECT_EQ(hashCode, folly::Endian::big(0x0707070707070707)); +} + +TEST(ObjectId, testHashCodeNotMod8) { + auto bytes = folly::make_array( + // all 1s in binary + 0xff, + 0xff, + 0xff, + 0xff, + + 0xff, + 0xff, + 0xff, + 0xff, + + // all 0s in binary + 0x00, + 0x00, + 0x00, + 0x00); + auto byteRange = folly::ByteRange(bytes.data(), bytes.size()); + auto exactObjectId = ObjectId(byteRange); + auto hashCode = exactObjectId.getHashCode(); + + // When length of ObjectID is not a multiple of 8, we end up overlapping + // xor byte ranges. On little-endian, we'll xor as follows: + // + // 0x00 00 00 00 ff ff ff ff + // 0xff ff ff ff ff ff ff ff ^ + // -------------------------- + // 0xff ff ff ff 00 00 00 00 + // + EXPECT_EQ(hashCode, 0xffffffff00000000); +} +} // namespace