remove entropy location assumptions from ObjectId hash

Summary: Having a strict ObjectID format is quite inconvenient. We will need to introduce a new ObjectID format for Eden x Sparse, so it's in our best interest to remove the ObjectID format restrictions beforehand. This will allow us to place the high entropy data (proxy hash in our case) in any location in the ObjectId without causing a ton of hash collisions. This will enable us to introduce FilteredObjectIDs in the form: `<tree_or_blob_byte><filterset_id><path><ObjectId>` where the `<ObjectId>` contains the high entropy bits we need to hash. Reviewed By: xavierd Differential Revision: D45793298 fbshipit-source-id: 77385e32f63d5f3d1fc37b72b9971f5717cbd872
2024-10-05 14:28:17 +03:00 · 2023-05-17 17:47:42 -07:00 · 2023-05-17 17:47:42 -07:00 · 61c02ae9ef
commit 61c02ae9ef
parent e691b04a24
3 changed files with 130 additions and 23 deletions
--- a/eden/fs/model/ObjectId.cpp
+++ b/eden/fs/model/ObjectId.cpp
@ -36,12 +36,25 @@ std::string ObjectId::asString() const {
 }

 size_t ObjectId::getHashCode() const noexcept {
-  if (bytes_.size() > sizeof(size_t) + 1) {
-    size_t ret;
-    memcpy(&ret, bytes_.data() + 1, sizeof(size_t));
-    return ret;
+  const char* p = bytes_.data();
+  size_t n = bytes_.size();
+
+  if (UNLIKELY(n < sizeof(uint64_t))) {
+    size_t rv = 0;
+    memcpy(&rv, p, n);
+    return rv;
  }
-  return std::hash<folly::fbstring>{}(bytes_);
+
+  // unaligned load of tail
+  size_t rv;
+  size_t incrementSize = sizeof(uint64_t);
+  memcpy(&rv, p + (n - incrementSize), incrementSize);
+  for (const char* end = p + (n - incrementSize); p < end; p += incrementSize) {
+    size_t x;
+    memcpy(&x, p, incrementSize);
+    rv ^= x;
+  }
+  return rv;
 }

 ObjectId ObjectId::sha1(const folly::IOBuf& buf) {
--- a/eden/fs/model/ObjectId.h
+++ b/eden/fs/model/ObjectId.h
@ -24,10 +24,6 @@ namespace facebook::eden {
 * Identifies tree and blob objects.
 * This identifier is a variable length string.
 *
- * NOTE: The hash function assumes that ObjectID are stored in a specific
- * format to provide constant time hash functions. The high entropy data must
- * be stored in the 2nd to 9th bytes of the ObjectID. This property must be
- * respected for new ObjectID types.
 */
 class ObjectId {
 public:
@ -101,16 +97,11 @@ class ObjectId {
  /**
   * Computes a hash for this ObjectID.
   *
-   * ObjectID are currently of 2 forms:
-   *  - <20-byte hash>
-   *  - <1-byte type><20-bytes hash><path>
-   *  - <20-byte-hash><8-byte size>
-   *
-   * With this, we can compute a hash code by simply returning part of the
-   * already stored hash. The implementation currently returns the bytes at
-   * [1,9].
-   *
-   * Smaller ObjectID will use `std::hash`.
+   * ObjectIDs shorter than 8 bytes hash to their own zero-padded bytes, as we
+   * assume the ObjectID itself has high entropy. Longer ObjectIDs are hashed by
+   * XORing their 8-byte words together (the final word may overlap the previous
+   * one). This is okay since we assume at least one eight byte range in the
+   * ObjectID has high entropy and XORing with that range gives a decent hash.
   */
  size_t getHashCode() const noexcept;

@ -207,10 +198,6 @@ class ObjectIdCodec {

  /**
   * Parse the string as an ObjectId.
-   *
-   * Note to implementer: ObjectId::getHashCode is very specific as to how it
-   * expects ObjectId to be layed out in memory. Make sure to respect this
-   * layout.
   */
  virtual ObjectId parseObjectId(folly::StringPiece objectId) = 0;
  virtual std::string renderObjectId(const ObjectId& objectId) = 0;
--- a/eden/fs/model/test/ObjectIdTest.cpp
+++ b/eden/fs/model/test/ObjectIdTest.cpp
@ -0,0 +1,107 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * This software may be used and distributed according to the terms of the
+ * GNU General Public License version 2.
+ */
+
+#include "eden/fs/model/ObjectId.h"
+
+#include <folly/Range.h>
+#include <folly/container/Array.h>
+#include <folly/portability/GTest.h>
+
+namespace {
+
+using namespace facebook::eden;
+using folly::ByteRange;
+
+TEST(ObjectId, testHashCodeExact) {
+  auto bytes = folly::make_array<uint8_t>(
+      0x00, 0x00, 0xff, 0xff, 0x00, 0x00, 0xff, 0xff);
+  auto byteRange = folly::ByteRange(bytes.data(), bytes.size());
+  auto exactObjectId = ObjectId(byteRange);
+  auto hashCode = exactObjectId.getHashCode();
+  EXPECT_EQ(hashCode, folly::Endian::big(0x0000ffff0000ffff));
+}
+
+TEST(ObjectId, testHashCodeShort) {
+  auto bytes = folly::make_array<uint8_t>(0x00, 0xff);
+  auto byteRange = folly::ByteRange(bytes.data(), bytes.size());
+  auto exactObjectId = ObjectId(byteRange);
+  auto hashCode = exactObjectId.getHashCode();
+  // Assumes little-endian; big-endian hosts would produce 0x00ff000000000000.
+  EXPECT_EQ(hashCode, 0xff00);
+}
+
+TEST(ObjectId, testHashCodeLong) {
+  auto bytes = folly::make_array<uint8_t>(
+      // bit 0 set in every byte of the first 8-byte word
+      0x01,
+      0x01,
+      0x01,
+      0x01,
+
+      0x01,
+      0x01,
+      0x01,
+      0x01,
+
+      0x02,
+      0x02,
+      0x02,
+      0x02,
+
+      0x02,
+      0x02,
+      0x02,
+      0x02,
+
+      0x04,
+      0x04,
+      0x04,
+      0x04,
+
+      0x04,
+      0x04,
+      0x04,
+      0x04);
+  auto byteRange = folly::ByteRange(bytes.data(), bytes.size());
+  auto exactObjectId = ObjectId(byteRange);
+  auto hashCode = exactObjectId.getHashCode();
+  EXPECT_EQ(hashCode, folly::Endian::big(0x0707070707070707));
+}
+
+TEST(ObjectId, testHashCodeNotMod8) {
+  auto bytes = folly::make_array<uint8_t>(
+      // all 1s in binary
+      0xff,
+      0xff,
+      0xff,
+      0xff,
+
+      0xff,
+      0xff,
+      0xff,
+      0xff,
+
+      // all 0s in binary
+      0x00,
+      0x00,
+      0x00,
+      0x00);
+  auto byteRange = folly::ByteRange(bytes.data(), bytes.size());
+  auto exactObjectId = ObjectId(byteRange);
+  auto hashCode = exactObjectId.getHashCode();
+
+  // When length of ObjectID is not a multiple of 8, we end up overlapping
+  // xor byte ranges. On a little-endian host, we'll xor as follows:
+  //
+  // 0x00 00 00 00 ff ff ff ff
+  // 0xff ff ff ff ff ff ff ff ^
+  // --------------------------
+  // 0xff ff ff ff 00 00 00 00
+  //
+  EXPECT_EQ(hashCode, 0xffffffff00000000);
+}
+} // namespace