2020-01-18 11:38:21 +03:00
|
|
|
/*
|
|
|
|
* Copyright (c) 2018-2020, Andreas Kling <kling@serenityos.org>
|
|
|
|
*
|
2021-04-22 11:24:48 +03:00
|
|
|
* SPDX-License-Identifier: BSD-2-Clause
|
2020-01-18 11:38:21 +03:00
|
|
|
*/
|
|
|
|
|
2018-10-10 12:53:07 +03:00
|
|
|
#pragma once
|
|
|
|
|
2021-11-07 16:52:20 +03:00
|
|
|
#include <AK/Concepts.h>
|
2021-11-11 01:00:21 +03:00
|
|
|
#include <AK/Error.h>
|
2021-06-13 17:26:08 +03:00
|
|
|
#include <AK/Forward.h>
|
2020-10-16 00:34:07 +03:00
|
|
|
#include <AK/HashFunctions.h>
|
2019-06-27 17:36:31 +03:00
|
|
|
#include <AK/StdLibExtras.h>
|
2021-09-16 01:00:33 +03:00
|
|
|
#include <AK/Traits.h>
|
2020-10-16 00:34:07 +03:00
|
|
|
#include <AK/Types.h>
|
|
|
|
#include <AK/kmalloc.h>
|
2018-10-10 12:53:07 +03:00
|
|
|
|
|
|
|
namespace AK {
|
|
|
|
|
2020-07-07 00:44:33 +03:00
|
|
|
// Outcome reported by the table's set operations.
enum class HashSetResult {
    InsertedNewEntry,      // No equal entry was stored; the value was added.
    ReplacedExistingEntry, // An equal entry was stored and has been overwritten.
    KeptExistingEntry,     // An equal entry was stored and was left untouched.
};
|
|
|
|
|
|
|
|
// Policy applied by set operations when an equal entry is already stored.
enum class HashSetExistingEntryBehavior {
    Keep,    // Leave the stored entry as it is.
    Replace, // Assign the new value over the stored entry.
};
|
|
|
|
|
2022-03-08 18:23:08 +03:00
|
|
|
// Upper nibble determines state class:
|
|
|
|
// - 0: unused bucket
|
|
|
|
// - 1: used bucket
|
|
|
|
// - F: end bucket
|
|
|
|
// Lower nibble determines state within a class.
|
2022-03-07 17:10:10 +03:00
|
|
|
enum class BucketState : u8 {
|
2022-03-08 18:23:08 +03:00
|
|
|
Free = 0x00,
|
|
|
|
Used = 0x10,
|
|
|
|
Deleted = 0x01,
|
|
|
|
Rehashed = 0x12,
|
|
|
|
End = 0xFF,
|
2022-03-07 17:10:10 +03:00
|
|
|
};
|
|
|
|
|
2022-03-08 18:23:08 +03:00
|
|
|
// Note that because there's the end state, used and free are not 100% opposites!
|
|
|
|
constexpr bool is_used_bucket(BucketState state)
|
|
|
|
{
|
|
|
|
return (static_cast<u8>(state) & 0xf0) == 0x10;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Returns true when the state's upper nibble is 0, i.e. it belongs to the unused class
// (this covers both Free and Deleted).
constexpr bool is_free_bucket(BucketState state)
{
    return (static_cast<u8>(state) >> 4) == 0x0;
}
|
|
|
|
|
2020-10-16 00:34:07 +03:00
|
|
|
template<typename HashTableType, typename T, typename BucketType>
|
2019-06-27 16:57:49 +03:00
|
|
|
class HashTableIterator {
|
2020-10-16 00:34:07 +03:00
|
|
|
friend HashTableType;
|
|
|
|
|
2019-06-27 16:57:49 +03:00
|
|
|
public:
|
2022-04-01 20:58:27 +03:00
|
|
|
bool operator==(HashTableIterator const& other) const { return m_bucket == other.m_bucket; }
|
|
|
|
bool operator!=(HashTableIterator const& other) const { return m_bucket != other.m_bucket; }
|
2020-10-16 00:34:07 +03:00
|
|
|
T& operator*() { return *m_bucket->slot(); }
|
|
|
|
T* operator->() { return m_bucket->slot(); }
|
|
|
|
void operator++() { skip_to_next(); }
|
2019-06-27 16:57:49 +03:00
|
|
|
|
2020-10-16 00:34:07 +03:00
|
|
|
private:
|
2019-06-27 16:57:49 +03:00
|
|
|
void skip_to_next()
|
|
|
|
{
|
2020-10-16 00:34:07 +03:00
|
|
|
if (!m_bucket)
|
|
|
|
return;
|
|
|
|
do {
|
|
|
|
++m_bucket;
|
2022-03-07 17:10:10 +03:00
|
|
|
if (m_bucket->state == BucketState::Used)
|
2019-06-27 16:57:49 +03:00
|
|
|
return;
|
2022-03-07 17:10:10 +03:00
|
|
|
} while (m_bucket->state != BucketState::End);
|
|
|
|
if (m_bucket->state == BucketState::End)
|
2020-10-16 00:34:07 +03:00
|
|
|
m_bucket = nullptr;
|
2019-06-27 16:57:49 +03:00
|
|
|
}
|
|
|
|
|
2020-10-16 00:34:07 +03:00
|
|
|
explicit HashTableIterator(BucketType* bucket)
|
|
|
|
: m_bucket(bucket)
|
2019-06-27 16:57:49 +03:00
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2020-10-16 00:34:07 +03:00
|
|
|
BucketType* m_bucket { nullptr };
|
2019-06-27 16:57:49 +03:00
|
|
|
};
|
|
|
|
|
2021-06-13 17:26:08 +03:00
|
|
|
// Bidirectional iterator that follows the `next` / `previous` links stored in
// each bucket. A null bucket pointer represents the end iterator. Only
// OrderedHashTableType (a friend) can construct one from a bucket pointer.
template<typename OrderedHashTableType, typename T, typename BucketType>
class OrderedHashTableIterator {
    friend OrderedHashTableType;

public:
    bool operator==(OrderedHashTableIterator const& other) const
    {
        return m_bucket == other.m_bucket;
    }

    bool operator!=(OrderedHashTableIterator const& other) const
    {
        return !(m_bucket == other.m_bucket);
    }

    // Access the element stored in the current bucket. Must not be called on end().
    T& operator*()
    {
        return *m_bucket->slot();
    }

    T* operator->()
    {
        return m_bucket->slot();
    }

    // Step along the bucket links; neither direction checks for a null bucket.
    void operator++()
    {
        m_bucket = m_bucket->next;
    }

    void operator--()
    {
        m_bucket = m_bucket->previous;
    }

private:
    explicit OrderedHashTableIterator(BucketType* bucket)
        : m_bucket(bucket)
    {
    }

    BucketType* m_bucket { nullptr };
};
|
|
|
|
|
|
|
|
template<typename T, typename TraitsForT, bool IsOrdered>
|
2018-10-10 12:53:07 +03:00
|
|
|
class HashTable {
|
2020-10-16 09:32:35 +03:00
|
|
|
// Load-factor threshold, expressed in percent.
// NOTE(review): the code that consumes this constant is outside the visible
// part of the file — confirm exactly how it is applied.
static constexpr size_t load_factor_in_percent = 60;
|
|
|
|
|
2020-10-16 00:34:07 +03:00
|
|
|
struct Bucket {
|
2022-03-07 17:10:10 +03:00
|
|
|
BucketState state;
|
2020-10-16 00:34:07 +03:00
|
|
|
alignas(T) u8 storage[sizeof(T)];
|
|
|
|
|
|
|
|
T* slot() { return reinterpret_cast<T*>(storage); }
|
2022-10-17 01:06:11 +03:00
|
|
|
T const* slot() const { return reinterpret_cast<T const*>(storage); }
|
2020-10-16 00:34:07 +03:00
|
|
|
};
|
2018-10-10 12:53:07 +03:00
|
|
|
|
2021-06-13 17:26:08 +03:00
|
|
|
struct OrderedBucket {
|
|
|
|
OrderedBucket* previous;
|
|
|
|
OrderedBucket* next;
|
2022-03-07 17:10:10 +03:00
|
|
|
BucketState state;
|
2021-06-13 17:26:08 +03:00
|
|
|
alignas(T) u8 storage[sizeof(T)];
|
|
|
|
T* slot() { return reinterpret_cast<T*>(storage); }
|
2022-10-17 01:06:11 +03:00
|
|
|
T const* slot() const { return reinterpret_cast<T const*>(storage); }
|
2021-06-13 17:26:08 +03:00
|
|
|
};
|
|
|
|
|
|
|
|
// Bucket layout actually used by this instantiation: OrderedBucket when
// IsOrdered is true, Bucket otherwise.
using BucketType = Conditional<IsOrdered, OrderedBucket, Bucket>;
|
|
|
|
|
|
|
|
// Per-table bookkeeping for the unordered variant: nothing is needed.
struct CollectionData {};
|
|
|
|
|
|
|
|
struct OrderedCollectionData {
|
|
|
|
BucketType* head { nullptr };
|
|
|
|
BucketType* tail { nullptr };
|
|
|
|
};
|
|
|
|
|
|
|
|
// Bookkeeping type used by this instantiation: OrderedCollectionData when
// IsOrdered is true, the empty CollectionData otherwise.
using CollectionDataType = Conditional<IsOrdered, OrderedCollectionData, CollectionData>;
|
|
|
|
|
2018-10-10 12:53:07 +03:00
|
|
|
public:
|
2021-01-11 02:29:28 +03:00
|
|
|
HashTable() = default;
|
2021-04-11 11:24:35 +03:00
|
|
|
explicit HashTable(size_t capacity) { rehash(capacity); }
|
2020-10-17 16:44:43 +03:00
|
|
|
|
|
|
|
// Destroys every element held in a used bucket, then releases the bucket array.
// Does nothing when no bucket array was ever allocated.
~HashTable()
{
    if (m_buckets == nullptr)
        return;

    for (size_t index = 0; index != m_capacity; ++index) {
        auto& bucket = m_buckets[index];
        if (!is_used_bucket(bucket.state))
            continue;
        bucket.slot()->~T();
    }

    kfree_sized(m_buckets, size_in_bytes(m_capacity));
}
|
2020-10-16 00:34:07 +03:00
|
|
|
|
2022-04-01 20:58:27 +03:00
|
|
|
// Copy constructor: sizes the new table from the source's capacity, then
// inserts a copy of every element in the source's iteration order.
HashTable(HashTable const& other)
{
    rehash(other.capacity());
    for (auto it = other.begin(); it != other.end(); ++it)
        set(*it);
}
|
2020-10-16 00:34:07 +03:00
|
|
|
|
2022-04-01 20:58:27 +03:00
|
|
|
// Copy assignment via copy-and-swap: the previous contents end up in the local
// copy and are released when it goes out of scope.
HashTable& operator=(HashTable const& other)
{
    HashTable copy(other);
    swap(*this, copy);
    return *this;
}
|
2020-10-16 00:34:07 +03:00
|
|
|
|
2020-10-17 16:08:09 +03:00
|
|
|
// Move constructor: takes over the source's bucket array and counters, then
// resets the source so that it no longer refers to the transferred storage.
HashTable(HashTable&& other) noexcept
    : m_buckets(other.m_buckets)
    , m_collection_data(other.m_collection_data)
    , m_size(other.m_size)
    , m_capacity(other.m_capacity)
    , m_deleted_count(other.m_deleted_count)
{
    other.m_buckets = nullptr;
    if constexpr (IsOrdered) {
        // The chain pointers refer into the bucket array that was just taken over.
        other.m_collection_data = { nullptr, nullptr };
    }
    other.m_capacity = 0;
    other.m_size = 0;
    other.m_deleted_count = 0;
}
|
2020-10-16 00:34:07 +03:00
|
|
|
|
2020-10-17 16:08:09 +03:00
|
|
|
// Move assignment via move-and-swap: `other` is emptied into a local, which is
// swapped with *this and then releases the previous contents on scope exit.
HashTable& operator=(HashTable&& other) noexcept
{
    HashTable stolen { move(other) };
    swap(*this, stolen);
    return *this;
}
|
|
|
|
|
2020-10-17 15:44:59 +03:00
|
|
|
// Exchanges the complete state of two tables member by member.
friend void swap(HashTable& a, HashTable& b) noexcept
{
    swap(a.m_buckets, b.m_buckets);
    swap(a.m_capacity, b.m_capacity);
    swap(a.m_size, b.m_size);
    swap(a.m_deleted_count, b.m_deleted_count);

    // The unordered variant's collection data is an empty struct, so only the
    // ordered variant has anything to exchange here.
    if constexpr (IsOrdered) {
        swap(a.m_collection_data, b.m_collection_data);
    }
}
|
|
|
|
|
2021-11-06 23:12:16 +03:00
|
|
|
[[nodiscard]] bool is_empty() const { return m_size == 0; }
|
2021-04-11 11:25:22 +03:00
|
|
|
[[nodiscard]] size_t size() const { return m_size; }
|
|
|
|
[[nodiscard]] size_t capacity() const { return m_capacity; }
|
2018-10-10 12:53:07 +03:00
|
|
|
|
2020-10-16 00:34:07 +03:00
|
|
|
template<typename U, size_t N>
|
2021-11-11 01:00:21 +03:00
|
|
|
ErrorOr<void> try_set_from(U (&from_array)[N])
|
2020-10-16 00:34:07 +03:00
|
|
|
{
|
2021-11-11 01:00:21 +03:00
|
|
|
for (size_t i = 0; i < N; ++i)
|
|
|
|
TRY(try_set(from_array[i]));
|
|
|
|
return {};
|
2021-08-14 03:07:39 +03:00
|
|
|
}
|
|
|
|
// Infallible counterpart of try_set_from(): the result is passed through
// MUST(), so an error is not returned to the caller.
template<typename U, size_t N>
void set_from(U (&from_array)[N])
{
    MUST(try_set_from(from_array));
}
|
|
|
|
|
2020-02-24 11:42:52 +03:00
|
|
|
void ensure_capacity(size_t capacity)
|
2019-05-27 14:07:20 +03:00
|
|
|
{
|
2021-02-23 22:42:32 +03:00
|
|
|
VERIFY(capacity >= size());
|
2020-10-16 00:34:07 +03:00
|
|
|
rehash(capacity * 2);
|
2019-05-27 14:07:20 +03:00
|
|
|
}
|
|
|
|
|
2022-01-25 03:31:20 +03:00
|
|
|
// Fallible variant of ensure_capacity(): rehashes to twice the requested
// element count and returns whatever try_rehash() reports.
// `capacity` must not be smaller than the current size (verified).
ErrorOr<void> try_ensure_capacity(size_t capacity)
{
    VERIFY(size() <= capacity);
    auto const bucket_count = capacity * 2;
    return try_rehash(bucket_count);
}
|
|
|
|
|
2021-07-21 19:18:29 +03:00
|
|
|
// True when find() locates an entry for `value`.
[[nodiscard]] bool contains(T const& value) const
{
    auto const position = find(value);
    return position != end();
}
|
2020-08-16 12:04:00 +03:00
|
|
|
|
2021-11-07 16:52:20 +03:00
|
|
|
template<Concepts::HashCompatible<T> K>
|
|
|
|
requires(IsSame<TraitsForT, Traits<T>>) [[nodiscard]] bool contains(K const& value) const
|
|
|
|
{
|
|
|
|
return find(value) != end();
|
|
|
|
}
|
|
|
|
|
2021-06-13 17:26:08 +03:00
|
|
|
// Mutable iterator type: follows the bucket links for ordered tables and
// walks the bucket array for unordered ones.
using Iterator = Conditional<IsOrdered, OrderedHashTableIterator<HashTable, T, BucketType>, HashTableIterator<HashTable, T, BucketType>>;
|
2020-10-16 00:34:07 +03:00
|
|
|
|
2021-07-21 19:18:29 +03:00
|
|
|
// Iterator to the first element: the head of the bucket chain for ordered
// tables, otherwise the first used bucket in array order. Returns end() when
// there is no such bucket.
[[nodiscard]] Iterator begin()
{
    if constexpr (IsOrdered) {
        return Iterator(m_collection_data.head);
    } else {
        size_t index = 0;
        while (index < m_capacity) {
            if (is_used_bucket(m_buckets[index].state))
                return Iterator(&m_buckets[index]);
            ++index;
        }
        return end();
    }
}
|
|
|
|
|
2021-07-21 19:18:29 +03:00
|
|
|
// The end iterator is represented by a null bucket pointer.
[[nodiscard]] Iterator end()
{
    return Iterator { nullptr };
}
|
2018-10-10 12:53:07 +03:00
|
|
|
|
2021-06-13 17:26:08 +03:00
|
|
|
// Read-only iterator type; same selection as Iterator, instantiated with
// const-qualified table, element and bucket types.
using ConstIterator = Conditional<IsOrdered, OrderedHashTableIterator<HashTable const, T const, BucketType const>, HashTableIterator<HashTable const, T const, BucketType const>>;
|
2018-10-10 12:53:07 +03:00
|
|
|
|
2021-07-21 19:18:29 +03:00
|
|
|
// Const counterpart of begin(): the head of the bucket chain for ordered
// tables, otherwise the first used bucket in array order. Returns end() when
// there is no such bucket.
[[nodiscard]] ConstIterator begin() const
{
    if constexpr (IsOrdered) {
        return ConstIterator(m_collection_data.head);
    } else {
        size_t index = 0;
        while (index < m_capacity) {
            if (is_used_bucket(m_buckets[index].state))
                return ConstIterator(&m_buckets[index]);
            ++index;
        }
        return end();
    }
}
|
|
|
|
|
2021-07-21 19:18:29 +03:00
|
|
|
// The const end iterator is represented by a null bucket pointer.
[[nodiscard]] ConstIterator end() const
{
    return ConstIterator { nullptr };
}
|
|
|
|
|
|
|
|
void clear()
|
|
|
|
{
|
2020-10-17 16:44:43 +03:00
|
|
|
*this = HashTable();
|
2020-10-16 00:34:07 +03:00
|
|
|
}
|
2021-09-21 00:43:52 +03:00
|
|
|
void clear_with_capacity()
|
|
|
|
{
|
2022-11-11 04:54:43 +03:00
|
|
|
if (m_capacity == 0)
|
|
|
|
return;
|
2022-11-14 21:25:18 +03:00
|
|
|
if constexpr (!IsTriviallyDestructible<T>) {
|
2021-09-21 00:43:52 +03:00
|
|
|
for (auto* bucket : *this)
|
|
|
|
bucket->~T();
|
|
|
|
}
|
|
|
|
__builtin_memset(m_buckets, 0, size_in_bytes(capacity()));
|
|
|
|
m_size = 0;
|
|
|
|
m_deleted_count = 0;
|
|
|
|
|
|
|
|
if constexpr (IsOrdered)
|
|
|
|
m_collection_data = { nullptr, nullptr };
|
|
|
|
else
|
2022-03-07 17:10:10 +03:00
|
|
|
m_buckets[m_capacity].state = BucketState::End;
|
2021-09-21 00:43:52 +03:00
|
|
|
}
|
2020-10-16 00:34:07 +03:00
|
|
|
|
2021-01-16 01:59:55 +03:00
|
|
|
template<typename U = T>
|
2021-11-11 01:00:21 +03:00
|
|
|
ErrorOr<HashSetResult> try_set(U&& value, HashSetExistingEntryBehavior existing_entry_behavior = HashSetExistingEntryBehavior::Replace)
|
2020-10-16 00:34:07 +03:00
|
|
|
{
|
2021-11-11 01:00:21 +03:00
|
|
|
auto* bucket = TRY(try_lookup_for_writing(value));
|
2022-03-08 18:23:08 +03:00
|
|
|
if (is_used_bucket(bucket->state)) {
|
2021-09-07 13:56:50 +03:00
|
|
|
if (existing_entry_behavior == HashSetExistingEntryBehavior::Keep)
|
2021-06-08 23:42:07 +03:00
|
|
|
return HashSetResult::KeptExistingEntry;
|
2021-08-14 03:07:39 +03:00
|
|
|
(*bucket->slot()) = forward<U>(value);
|
2020-10-16 00:34:07 +03:00
|
|
|
return HashSetResult::ReplacedExistingEntry;
|
|
|
|
}
|
|
|
|
|
2021-08-14 03:07:39 +03:00
|
|
|
new (bucket->slot()) T(forward<U>(value));
|
2022-03-07 17:10:10 +03:00
|
|
|
if (bucket->state == BucketState::Deleted)
|
2020-10-16 00:34:07 +03:00
|
|
|
--m_deleted_count;
|
2022-03-07 17:10:10 +03:00
|
|
|
bucket->state = BucketState::Used;
|
2021-06-13 17:26:08 +03:00
|
|
|
|
|
|
|
if constexpr (IsOrdered) {
|
|
|
|
if (!m_collection_data.head) [[unlikely]] {
|
2021-08-14 03:07:39 +03:00
|
|
|
m_collection_data.head = bucket;
|
2021-06-13 17:26:08 +03:00
|
|
|
} else {
|
2021-08-14 03:07:39 +03:00
|
|
|
bucket->previous = m_collection_data.tail;
|
|
|
|
m_collection_data.tail->next = bucket;
|
2021-06-13 17:26:08 +03:00
|
|
|
}
|
2021-08-14 03:07:39 +03:00
|
|
|
m_collection_data.tail = bucket;
|
2021-06-13 17:26:08 +03:00
|
|
|
}
|
|
|
|
|
2020-10-16 00:34:07 +03:00
|
|
|
++m_size;
|
|
|
|
return HashSetResult::InsertedNewEntry;
|
|
|
|
}
|
2021-08-14 03:07:39 +03:00
|
|
|
template<typename U = T>
|
|
|
|
HashSetResult set(U&& value, HashSetExistingEntryBehavior existing_entry_behaviour = HashSetExistingEntryBehavior::Replace)
|
|
|
|
{
|
2021-11-11 01:00:21 +03:00
|
|
|
return MUST(try_set(forward<U>(value), existing_entry_behaviour));
|
2021-08-14 03:07:39 +03:00
|
|
|
}
|
2020-10-16 00:34:07 +03:00
|
|
|
|
2021-07-13 00:23:08 +03:00
|
|
|
template<typename TUnaryPredicate>
|
2021-07-21 19:18:29 +03:00
|
|
|
[[nodiscard]] Iterator find(unsigned hash, TUnaryPredicate predicate)
|
2019-06-29 22:09:40 +03:00
|
|
|
{
|
2021-07-13 00:23:08 +03:00
|
|
|
return Iterator(lookup_with_hash(hash, move(predicate)));
|
2019-06-29 22:09:40 +03:00
|
|
|
}
|
|
|
|
|
2021-07-21 19:18:29 +03:00
|
|
|
// Looks up `value` using TraitsForT for both hashing and equality.
[[nodiscard]] Iterator find(T const& value)
{
    auto matches_value = [&](auto& candidate) { return TraitsForT::equals(value, candidate); };
    return find(TraitsForT::hash(value), move(matches_value));
}
|
|
|
|
|
2021-07-13 00:23:08 +03:00
|
|
|
template<typename TUnaryPredicate>
|
2021-07-21 19:18:29 +03:00
|
|
|
[[nodiscard]] ConstIterator find(unsigned hash, TUnaryPredicate predicate) const
|
2020-10-16 00:34:07 +03:00
|
|
|
{
|
2021-07-13 00:23:08 +03:00
|
|
|
return ConstIterator(lookup_with_hash(hash, move(predicate)));
|
2020-10-16 00:34:07 +03:00
|
|
|
}
|
|
|
|
|
2021-07-21 19:18:29 +03:00
|
|
|
// Const lookup of `value` using TraitsForT for both hashing and equality.
[[nodiscard]] ConstIterator find(T const& value) const
{
    auto matches_value = [&](auto& candidate) { return TraitsForT::equals(value, candidate); };
    return find(TraitsForT::hash(value), move(matches_value));
}
|
2021-11-07 16:52:20 +03:00
|
|
|
// FIXME: Support for predicates, while guaranteeing that the predicate call
|
|
|
|
// does not call a non trivial constructor each time invoked
|
|
|
|
template<Concepts::HashCompatible<T> K>
|
|
|
|
requires(IsSame<TraitsForT, Traits<T>>) [[nodiscard]] Iterator find(K const& value)
|
|
|
|
{
|
2022-01-29 21:01:35 +03:00
|
|
|
return find(Traits<K>::hash(value), [&](auto& other) { return Traits<T>::equals(other, value); });
|
2021-11-07 16:52:20 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
template<Concepts::HashCompatible<T> K, typename TUnaryPredicate>
|
|
|
|
requires(IsSame<TraitsForT, Traits<T>>) [[nodiscard]] Iterator find(K const& value, TUnaryPredicate predicate)
|
|
|
|
{
|
|
|
|
return find(Traits<K>::hash(value), move(predicate));
|
|
|
|
}
|
|
|
|
|
|
|
|
template<Concepts::HashCompatible<T> K>
|
|
|
|
requires(IsSame<TraitsForT, Traits<T>>) [[nodiscard]] ConstIterator find(K const& value) const
|
|
|
|
{
|
2022-01-29 21:01:35 +03:00
|
|
|
return find(Traits<K>::hash(value), [&](auto& other) { return Traits<T>::equals(other, value); });
|
2021-11-07 16:52:20 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
template<Concepts::HashCompatible<T> K, typename TUnaryPredicate>
|
|
|
|
requires(IsSame<TraitsForT, Traits<T>>) [[nodiscard]] ConstIterator find(K const& value, TUnaryPredicate predicate) const
|
|
|
|
{
|
|
|
|
return find(Traits<K>::hash(value), move(predicate));
|
|
|
|
}
|
2018-10-10 12:53:07 +03:00
|
|
|
|
2022-10-17 01:06:11 +03:00
|
|
|
// Removes the entry equal to `value`, if there is one.
// Returns true when an entry was removed, false when none was found.
bool remove(T const& value)
{
    auto position = find(value);
    if (position == end())
        return false;

    remove(position);
    return true;
}
|
|
|
|
|
2021-12-15 17:18:30 +03:00
|
|
|
template<Concepts::HashCompatible<T> K>
|
|
|
|
requires(IsSame<TraitsForT, Traits<T>>) bool remove(K const& value)
|
|
|
|
{
|
|
|
|
auto it = find(value);
|
|
|
|
if (it != end()) {
|
|
|
|
remove(it);
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2022-03-06 21:14:29 +03:00
|
|
|
// Removes the entry the iterator points at. The iterator must refer to a used
// bucket (both conditions are verified).
void remove(Iterator iterator)
{
    auto* bucket = iterator.m_bucket;
    VERIFY(bucket);
    VERIFY(is_used_bucket(bucket->state));

    delete_bucket(*bucket);
    --m_size;
    ++m_deleted_count;

    // Give the table the chance to clean up accumulated deleted buckets.
    rehash_in_place_if_needed();
}
|
|
|
|
|
|
|
|
template<typename TUnaryPredicate>
|
2022-04-12 20:21:05 +03:00
|
|
|
bool remove_all_matching(TUnaryPredicate const& predicate)
|
2022-01-05 18:45:42 +03:00
|
|
|
{
|
2022-03-06 21:11:17 +03:00
|
|
|
size_t removed_count = 0;
|
|
|
|
for (size_t i = 0; i < m_capacity; ++i) {
|
|
|
|
auto& bucket = m_buckets[i];
|
2022-03-08 18:23:08 +03:00
|
|
|
if (is_used_bucket(bucket.state) && predicate(*bucket.slot())) {
|
2022-03-06 21:11:17 +03:00
|
|
|
delete_bucket(bucket);
|
|
|
|
++removed_count;
|
2022-01-05 18:57:45 +03:00
|
|
|
}
|
2022-01-05 18:45:42 +03:00
|
|
|
}
|
2022-03-06 21:11:17 +03:00
|
|
|
if (removed_count) {
|
|
|
|
m_deleted_count += removed_count;
|
|
|
|
m_size -= removed_count;
|
|
|
|
}
|
AK: Rehash HashTable in-place instead of shrinking
As seen on TV, HashTable can get "thrashed", i.e. it has a bunch of
deleted buckets that count towards the load factor. This means that hash
tables which are large enough for their contents need to be resized.
This was fixed in 9d8da16 with a workaround that shrinks the HashTable
back down in these cases, as after the resize and re-hash the load
factor is very low again. However, that's not a good solution. If you
insert and remove repeatedly around a size boundary, you might get
frequent resizes, which involve frequent re-allocations.
The new solution is an in-place rehashing algorithm that I came up with.
(Do complain to me, I'm at fault.) Basically, it iterates the buckets
and re-hashes the used buckets while marking the deleted slots empty.
The issue arises with collisions in the re-hash. For this reason, there
are two kinds of used buckets during the re-hashing: the normal "used"
buckets, which are old and are treated as free space, and the
"re-hashed" buckets, which are new and treated as used space, i.e. they
trigger probing. Therefore, the procedure for relocating a bucket's
contents is as follows:
- Locate the "real" bucket of the contents with the hash. That bucket is
the starting point for the target bucket, and the current (old) bucket
is the bucket we want to move.
- While we still need to move the bucket:
- If we're the target, something strange happened last iteration or we
just re-hashed to the same location. We're done.
- If the target is empty or deleted, just move the bucket. We're done.
- If the target is a re-hashed full bucket, we probe by double-hashing
our hash as usual. Henceforth, we move our target for the next
iteration.
- If the target is an old full bucket, we swap the target and to-move
buckets. Therefore, the bucket to move is a the correct location and the
former target, which still needs to find a new place, is now in the
bucket to move. So we can just continue with the loop; the target is
re-obtained from the bucket to move. This happens for each and every
bucket, though some buckets are "coincidentally" moved before their
point of iteration is reached. Either way, this guarantees full in-place
movement (even without stack storage) and therefore space complexity of
O(1). Time complexity is amortized O(2n) asssuming a good hashing
function.
This leads to a performance improvement of ~30% on the benchmark
introduced with the last commit.
Co-authored-by: Hendiadyoin1 <leon.a@serenityos.org>
2022-03-08 01:56:54 +03:00
|
|
|
rehash_in_place_if_needed();
|
2022-03-06 21:26:04 +03:00
|
|
|
return removed_count;
|
2020-10-16 00:34:07 +03:00
|
|
|
}
|
2018-10-13 15:22:09 +03:00
|
|
|
|
2022-12-09 11:29:36 +03:00
|
|
|
// Removes one element from the table and returns it. The table must not be
// empty (verified).
// For ordered tables this is the element at the tail of the bucket chain; for
// unordered tables it is the element in the first used bucket in array order.
//
// The element is moved out of its bucket and the bucket is removed directly,
// so T needs neither a default constructor nor copy assignment, and no second
// hash lookup is performed.
T pop()
{
    VERIFY(!is_empty());

    BucketType* bucket = nullptr;
    if constexpr (IsOrdered) {
        bucket = m_collection_data.tail;
    } else {
        for (size_t i = 0; i < m_capacity; ++i) {
            if (is_used_bucket(m_buckets[i].state)) {
                bucket = &m_buckets[i];
                break;
            }
        }
    }
    // A non-empty table always has at least one used bucket.
    VERIFY(bucket);

    T element = move(*bucket->slot());
    // NOTE(review): relies on delete_bucket() (outside this excerpt) doing no
    // more with the slot's moved-from value than destroying it — confirm.
    remove(Iterator(bucket));
    return element;
}
|
|
|
|
|
2018-10-10 12:53:07 +03:00
|
|
|
private:
|
2020-10-16 00:34:07 +03:00
|
|
|
// Moves `value` into the bucket chosen by lookup_for_writing() while the table
// is being rebuilt. Unlike try_set() it neither looks at the bucket's previous
// state nor touches m_size.
void insert_during_rehash(T&& value)
{
    auto& target = lookup_for_writing(value);
    new (target.slot()) T(move(value));
    target.state = BucketState::Used;

    if constexpr (IsOrdered) {
        // Append the bucket to the end of the chain.
        if (m_collection_data.head) [[likely]] {
            target.previous = m_collection_data.tail;
            m_collection_data.tail->next = &target;
        } else {
            m_collection_data.head = &target;
        }
        m_collection_data.tail = &target;
    }
}
|
|
|
|
|
2021-08-15 23:20:37 +03:00
|
|
|
[[nodiscard]] static constexpr size_t size_in_bytes(size_t capacity)
|
2021-07-11 14:22:58 +03:00
|
|
|
{
|
|
|
|
if constexpr (IsOrdered) {
|
|
|
|
return sizeof(BucketType) * capacity;
|
|
|
|
} else {
|
|
|
|
return sizeof(BucketType) * (capacity + 1);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-11-11 01:00:21 +03:00
|
|
|
// Rebuilds the table with (at least) `new_capacity` buckets.
// Returns ENOMEM when the new bucket array cannot be allocated; in that case
// the table has not been modified.
ErrorOr<void> try_rehash(size_t new_capacity)
{
    // Unchanged capacity: clean up in place instead of reallocating.
    if (new_capacity >= 4 && new_capacity == m_capacity) {
        rehash_in_place();
        return {};
    }

    // Never go below four buckets, and round to what the allocator would hand
    // out for that many buckets anyway.
    if (new_capacity < static_cast<size_t>(4))
        new_capacity = static_cast<size_t>(4);
    new_capacity = kmalloc_good_size(new_capacity * sizeof(BucketType)) / sizeof(BucketType);

    // Remember the old storage, and where iteration over it starts, before
    // any member is overwritten.
    auto* previous_buckets = m_buckets;
    auto const previous_capacity = m_capacity;
    Iterator previous_begin = begin();

    auto* allocation = kcalloc(1, size_in_bytes(new_capacity));
    if (allocation == nullptr)
        return Error::from_errno(ENOMEM);

    m_buckets = static_cast<BucketType*>(allocation);
    m_capacity = new_capacity;
    m_deleted_count = 0;

    if constexpr (IsOrdered)
        m_collection_data = { nullptr, nullptr };
    else
        m_buckets[m_capacity].state = BucketState::End;

    // Nothing to migrate when there was no previous bucket array.
    if (previous_buckets == nullptr)
        return {};

    // Move every element into the new array and destroy the moved-from original.
    for (auto it = move(previous_begin); it != end(); ++it) {
        insert_during_rehash(move(*it));
        it->~T();
    }

    kfree_sized(previous_buckets, size_in_bytes(previous_capacity));
    return {};
}
|
|
|
|
// Infallible front end for try_rehash().
// Forwards new_capacity unchanged; any error produced by try_rehash() is
// handed to MUST() rather than being returned to the caller.
void rehash(size_t new_capacity)
{
    MUST(try_rehash(new_capacity));
}
|
|
|
|
|
AK: Rehash HashTable in-place instead of shrinking
As seen on TV, HashTable can get "thrashed", i.e. it has a bunch of
deleted buckets that count towards the load factor. This means that hash
tables which are large enough for their contents need to be resized.
This was fixed in 9d8da16 with a workaround that shrinks the HashTable
back down in these cases, as after the resize and re-hash the load
factor is very low again. However, that's not a good solution. If you
insert and remove repeatedly around a size boundary, you might get
frequent resizes, which involve frequent re-allocations.
The new solution is an in-place rehashing algorithm that I came up with.
(Do complain to me, I'm at fault.) Basically, it iterates the buckets
and re-hashes the used buckets while marking the deleted slots empty.
The issue arises with collisions in the re-hash. For this reason, there
are two kinds of used buckets during the re-hashing: the normal "used"
buckets, which are old and are treated as free space, and the
"re-hashed" buckets, which are new and treated as used space, i.e. they
trigger probing. Therefore, the procedure for relocating a bucket's
contents is as follows:
- Locate the "real" bucket of the contents with the hash. That bucket is
the starting point for the target bucket, and the current (old) bucket
is the bucket we want to move.
- While we still need to move the bucket:
- If we're the target, something strange happened last iteration or we
just re-hashed to the same location. We're done.
- If the target is empty or deleted, just move the bucket. We're done.
- If the target is a re-hashed full bucket, we probe by double-hashing
our hash as usual. Henceforth, we move our target for the next
iteration.
- If the target is an old full bucket, we swap the target and to-move
buckets. Therefore, the bucket to move is at the correct location and the
former target, which still needs to find a new place, is now in the
bucket to move. So we can just continue with the loop; the target is
re-obtained from the bucket to move. This happens for each and every
bucket, though some buckets are "coincidentally" moved before their
point of iteration is reached. Either way, this guarantees full in-place
movement (even without stack storage) and therefore space complexity of
O(1). Time complexity is amortized O(2n) assuming a good hashing
function.
This leads to a performance improvement of ~30% on the benchmark
introduced with the last commit.
Co-authored-by: Hendiadyoin1 <leon.a@serenityos.org>
2022-03-08 01:56:54 +03:00
|
|
|
void rehash_in_place()
|
|
|
|
{
|
|
|
|
// FIXME: This implementation takes two loops over the entire bucket array, but avoids re-allocation.
|
|
|
|
// Please benchmark your new implementation before you replace this.
|
|
|
|
// The reason is that because of collisions, we use the special "rehashed" bucket state to mark already-rehashed used buckets.
|
|
|
|
// Because we of course want to write into old used buckets, but already rehashed data shall not be touched.
|
|
|
|
|
|
|
|
// FIXME: Find a way to reduce the cognitive complexity of this function.
|
|
|
|
|
|
|
|
for (size_t i = 0; i < m_capacity; ++i) {
|
|
|
|
auto& bucket = m_buckets[i];
|
|
|
|
|
|
|
|
// FIXME: Bail out when we have handled every filled bucket.
|
|
|
|
|
|
|
|
if (bucket.state == BucketState::Rehashed || bucket.state == BucketState::End || bucket.state == BucketState::Free)
|
|
|
|
continue;
|
|
|
|
if (bucket.state == BucketState::Deleted) {
|
|
|
|
bucket.state = BucketState::Free;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
auto const new_hash = TraitsForT::hash(*bucket.slot());
|
|
|
|
if (new_hash % m_capacity == i) {
|
|
|
|
bucket.state = BucketState::Rehashed;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
auto target_hash = new_hash;
|
|
|
|
auto const to_move_hash = i;
|
|
|
|
BucketType* target_bucket = &m_buckets[target_hash % m_capacity];
|
|
|
|
BucketType* bucket_to_move = &m_buckets[i];
|
|
|
|
|
|
|
|
// Try to move the bucket to move into its correct spot.
|
|
|
|
// During the procedure, we might re-hash or actually change the bucket to move.
|
2022-03-08 18:23:08 +03:00
|
|
|
while (!is_free_bucket(bucket_to_move->state)) {
|
AK: Rehash HashTable in-place instead of shrinking
As seen on TV, HashTable can get "thrashed", i.e. it has a bunch of
deleted buckets that count towards the load factor. This means that hash
tables which are large enough for their contents need to be resized.
This was fixed in 9d8da16 with a workaround that shrinks the HashTable
back down in these cases, as after the resize and re-hash the load
factor is very low again. However, that's not a good solution. If you
insert and remove repeatedly around a size boundary, you might get
frequent resizes, which involve frequent re-allocations.
The new solution is an in-place rehashing algorithm that I came up with.
(Do complain to me, I'm at fault.) Basically, it iterates the buckets
and re-hashes the used buckets while marking the deleted slots empty.
The issue arises with collisions in the re-hash. For this reason, there
are two kinds of used buckets during the re-hashing: the normal "used"
buckets, which are old and are treated as free space, and the
"re-hashed" buckets, which are new and treated as used space, i.e. they
trigger probing. Therefore, the procedure for relocating a bucket's
contents is as follows:
- Locate the "real" bucket of the contents with the hash. That bucket is
the starting point for the target bucket, and the current (old) bucket
is the bucket we want to move.
- While we still need to move the bucket:
- If we're the target, something strange happened last iteration or we
just re-hashed to the same location. We're done.
- If the target is empty or deleted, just move the bucket. We're done.
- If the target is a re-hashed full bucket, we probe by double-hashing
our hash as usual. Henceforth, we move our target for the next
iteration.
- If the target is an old full bucket, we swap the target and to-move
buckets. Therefore, the bucket to move is a the correct location and the
former target, which still needs to find a new place, is now in the
bucket to move. So we can just continue with the loop; the target is
re-obtained from the bucket to move. This happens for each and every
bucket, though some buckets are "coincidentally" moved before their
point of iteration is reached. Either way, this guarantees full in-place
movement (even without stack storage) and therefore space complexity of
O(1). Time complexity is amortized O(2n) asssuming a good hashing
function.
This leads to a performance improvement of ~30% on the benchmark
introduced with the last commit.
Co-authored-by: Hendiadyoin1 <leon.a@serenityos.org>
2022-03-08 01:56:54 +03:00
|
|
|
|
|
|
|
// If we're targeting ourselves, there's nothing to do.
|
|
|
|
if (to_move_hash == target_hash % m_capacity) {
|
|
|
|
bucket_to_move->state = BucketState::Rehashed;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2022-03-08 18:23:08 +03:00
|
|
|
if (is_free_bucket(target_bucket->state)) {
|
AK: Rehash HashTable in-place instead of shrinking
As seen on TV, HashTable can get "thrashed", i.e. it has a bunch of
deleted buckets that count towards the load factor. This means that hash
tables which are large enough for their contents need to be resized.
This was fixed in 9d8da16 with a workaround that shrinks the HashTable
back down in these cases, as after the resize and re-hash the load
factor is very low again. However, that's not a good solution. If you
insert and remove repeatedly around a size boundary, you might get
frequent resizes, which involve frequent re-allocations.
The new solution is an in-place rehashing algorithm that I came up with.
(Do complain to me, I'm at fault.) Basically, it iterates the buckets
and re-hashes the used buckets while marking the deleted slots empty.
The issue arises with collisions in the re-hash. For this reason, there
are two kinds of used buckets during the re-hashing: the normal "used"
buckets, which are old and are treated as free space, and the
"re-hashed" buckets, which are new and treated as used space, i.e. they
trigger probing. Therefore, the procedure for relocating a bucket's
contents is as follows:
- Locate the "real" bucket of the contents with the hash. That bucket is
the starting point for the target bucket, and the current (old) bucket
is the bucket we want to move.
- While we still need to move the bucket:
- If we're the target, something strange happened last iteration or we
just re-hashed to the same location. We're done.
- If the target is empty or deleted, just move the bucket. We're done.
- If the target is a re-hashed full bucket, we probe by double-hashing
our hash as usual. Henceforth, we move our target for the next
iteration.
- If the target is an old full bucket, we swap the target and to-move
buckets. Therefore, the bucket to move is a the correct location and the
former target, which still needs to find a new place, is now in the
bucket to move. So we can just continue with the loop; the target is
re-obtained from the bucket to move. This happens for each and every
bucket, though some buckets are "coincidentally" moved before their
point of iteration is reached. Either way, this guarantees full in-place
movement (even without stack storage) and therefore space complexity of
O(1). Time complexity is amortized O(2n) asssuming a good hashing
function.
This leads to a performance improvement of ~30% on the benchmark
introduced with the last commit.
Co-authored-by: Hendiadyoin1 <leon.a@serenityos.org>
2022-03-08 01:56:54 +03:00
|
|
|
// We can just overwrite the target bucket and bail out.
|
|
|
|
new (target_bucket->slot()) T(move(*bucket_to_move->slot()));
|
|
|
|
target_bucket->state = BucketState::Rehashed;
|
|
|
|
bucket_to_move->state = BucketState::Free;
|
|
|
|
|
|
|
|
if constexpr (IsOrdered) {
|
|
|
|
swap(bucket_to_move->previous, target_bucket->previous);
|
|
|
|
swap(bucket_to_move->next, target_bucket->next);
|
|
|
|
|
|
|
|
if (target_bucket->previous)
|
|
|
|
target_bucket->previous->next = target_bucket;
|
|
|
|
else
|
|
|
|
m_collection_data.head = target_bucket;
|
|
|
|
if (target_bucket->next)
|
|
|
|
target_bucket->next->previous = target_bucket;
|
|
|
|
else
|
|
|
|
m_collection_data.tail = target_bucket;
|
|
|
|
}
|
|
|
|
} else if (target_bucket->state == BucketState::Rehashed) {
|
|
|
|
// If the target bucket is already re-hashed, we do normal probing.
|
2023-01-20 23:25:33 +03:00
|
|
|
target_hash = rehash_for_collision(target_hash);
|
AK: Rehash HashTable in-place instead of shrinking
As seen on TV, HashTable can get "thrashed", i.e. it has a bunch of
deleted buckets that count towards the load factor. This means that hash
tables which are large enough for their contents need to be resized.
This was fixed in 9d8da16 with a workaround that shrinks the HashTable
back down in these cases, as after the resize and re-hash the load
factor is very low again. However, that's not a good solution. If you
insert and remove repeatedly around a size boundary, you might get
frequent resizes, which involve frequent re-allocations.
The new solution is an in-place rehashing algorithm that I came up with.
(Do complain to me, I'm at fault.) Basically, it iterates the buckets
and re-hashes the used buckets while marking the deleted slots empty.
The issue arises with collisions in the re-hash. For this reason, there
are two kinds of used buckets during the re-hashing: the normal "used"
buckets, which are old and are treated as free space, and the
"re-hashed" buckets, which are new and treated as used space, i.e. they
trigger probing. Therefore, the procedure for relocating a bucket's
contents is as follows:
- Locate the "real" bucket of the contents with the hash. That bucket is
the starting point for the target bucket, and the current (old) bucket
is the bucket we want to move.
- While we still need to move the bucket:
- If we're the target, something strange happened last iteration or we
just re-hashed to the same location. We're done.
- If the target is empty or deleted, just move the bucket. We're done.
- If the target is a re-hashed full bucket, we probe by double-hashing
our hash as usual. Henceforth, we move our target for the next
iteration.
- If the target is an old full bucket, we swap the target and to-move
buckets. Therefore, the bucket to move is a the correct location and the
former target, which still needs to find a new place, is now in the
bucket to move. So we can just continue with the loop; the target is
re-obtained from the bucket to move. This happens for each and every
bucket, though some buckets are "coincidentally" moved before their
point of iteration is reached. Either way, this guarantees full in-place
movement (even without stack storage) and therefore space complexity of
O(1). Time complexity is amortized O(2n) asssuming a good hashing
function.
This leads to a performance improvement of ~30% on the benchmark
introduced with the last commit.
Co-authored-by: Hendiadyoin1 <leon.a@serenityos.org>
2022-03-08 01:56:54 +03:00
|
|
|
target_bucket = &m_buckets[target_hash % m_capacity];
|
|
|
|
} else {
|
|
|
|
VERIFY(target_bucket->state != BucketState::End);
|
|
|
|
// The target bucket is a used bucket that hasn't been re-hashed.
|
|
|
|
// Swap the data into the target; now the target's data resides in the bucket to move again.
|
|
|
|
// (That's of course what we want, how neat!)
|
|
|
|
swap(*bucket_to_move->slot(), *target_bucket->slot());
|
|
|
|
bucket_to_move->state = target_bucket->state;
|
|
|
|
target_bucket->state = BucketState::Rehashed;
|
|
|
|
|
|
|
|
if constexpr (IsOrdered) {
|
|
|
|
// Update state for the target bucket, we'll do the bucket to move later.
|
|
|
|
swap(bucket_to_move->previous, target_bucket->previous);
|
|
|
|
swap(bucket_to_move->next, target_bucket->next);
|
|
|
|
|
|
|
|
if (target_bucket->previous)
|
|
|
|
target_bucket->previous->next = target_bucket;
|
|
|
|
else
|
|
|
|
m_collection_data.head = target_bucket;
|
|
|
|
if (target_bucket->next)
|
|
|
|
target_bucket->next->previous = target_bucket;
|
|
|
|
else
|
|
|
|
m_collection_data.tail = target_bucket;
|
|
|
|
}
|
|
|
|
|
|
|
|
target_hash = TraitsForT::hash(*bucket_to_move->slot());
|
|
|
|
target_bucket = &m_buckets[target_hash % m_capacity];
|
|
|
|
|
|
|
|
// The data is already in the correct location: Adjust the pointers
|
|
|
|
if (target_hash % m_capacity == to_move_hash) {
|
|
|
|
bucket_to_move->state = BucketState::Rehashed;
|
|
|
|
if constexpr (IsOrdered) {
|
|
|
|
// Update state for the bucket to move as it's not actually moved anymore.
|
|
|
|
if (bucket_to_move->previous)
|
|
|
|
bucket_to_move->previous->next = bucket_to_move;
|
|
|
|
else
|
|
|
|
m_collection_data.head = bucket_to_move;
|
|
|
|
if (bucket_to_move->next)
|
|
|
|
bucket_to_move->next->previous = bucket_to_move;
|
|
|
|
else
|
|
|
|
m_collection_data.tail = bucket_to_move;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// After this, the bucket_to_move either contains data that rehashes to itself, or it contains nothing as we were able to move the last thing.
|
|
|
|
if (bucket_to_move->state == BucketState::Deleted)
|
|
|
|
bucket_to_move->state = BucketState::Free;
|
|
|
|
}
|
|
|
|
|
|
|
|
for (size_t i = 0; i < m_capacity; ++i) {
|
|
|
|
if (m_buckets[i].state == BucketState::Rehashed)
|
|
|
|
m_buckets[i].state = BucketState::Used;
|
|
|
|
}
|
|
|
|
|
|
|
|
m_deleted_count = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
void rehash_in_place_if_needed()
|
|
|
|
{
|
|
|
|
// This signals a "thrashed" hash table with many deleted slots.
|
|
|
|
if (m_deleted_count >= m_size && should_grow())
|
|
|
|
rehash_in_place();
|
|
|
|
}
|
|
|
|
|
2021-07-13 00:23:08 +03:00
|
|
|
template<typename TUnaryPredicate>
|
2021-07-21 19:18:29 +03:00
|
|
|
[[nodiscard]] BucketType* lookup_with_hash(unsigned hash, TUnaryPredicate predicate) const
|
2020-10-16 00:34:07 +03:00
|
|
|
{
|
|
|
|
if (is_empty())
|
|
|
|
return nullptr;
|
2021-04-02 04:52:32 +03:00
|
|
|
|
2020-10-16 00:34:07 +03:00
|
|
|
for (;;) {
|
2021-04-02 04:52:32 +03:00
|
|
|
auto& bucket = m_buckets[hash % m_capacity];
|
2020-10-16 00:34:07 +03:00
|
|
|
|
2022-03-08 18:23:08 +03:00
|
|
|
if (is_used_bucket(bucket.state) && predicate(*bucket.slot()))
|
2020-10-16 00:34:07 +03:00
|
|
|
return &bucket;
|
|
|
|
|
2022-03-07 17:10:10 +03:00
|
|
|
if (bucket.state != BucketState::Used && bucket.state != BucketState::Deleted)
|
2020-10-16 00:34:07 +03:00
|
|
|
return nullptr;
|
|
|
|
|
2023-01-20 23:25:33 +03:00
|
|
|
hash = rehash_for_collision(hash);
|
2018-10-10 12:53:07 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-11-11 01:00:21 +03:00
|
|
|
// Finds the bucket that `value` should be written to.
//
// If should_grow() is true the table is first rehashed to twice capacity(); a failure of
// that rehash is propagated via TRY(). Afterwards the probe sequence for `value` is walked:
//  - a used bucket holding an equal value (TraitsForT::equals) is returned as-is;
//  - otherwise the first non-used bucket seen along the way is remembered, and it is
//    returned as soon as a non-used bucket that is not Deleted terminates the sequence.
ErrorOr<BucketType*> try_lookup_for_writing(T const& value)
{
    // FIXME: Maybe overrun the "allowed" load factor to avoid OOM
    //        If we are allowed to do that, separate that logic from
    //        the normal lookup_for_writing
    if (should_grow())
        TRY(try_rehash(capacity() * 2));

    BucketType* first_reusable_bucket = nullptr;
    for (auto hash = TraitsForT::hash(value);; hash = rehash_for_collision(hash)) {
        auto& bucket = m_buckets[hash % m_capacity];

        if (is_used_bucket(bucket.state)) {
            if (TraitsForT::equals(*bucket.slot(), value))
                return &bucket;
            // Occupied by a different value: keep probing.
            continue;
        }

        if (first_reusable_bucket == nullptr)
            first_reusable_bucket = &bucket;

        // A Deleted bucket does not end the probe sequence; anything else does.
        if (bucket.state != BucketState::Deleted)
            return first_reusable_bucket;
    }
}
|
2021-08-14 03:07:39 +03:00
|
|
|
// Infallible front end for try_lookup_for_writing().
// The result of try_lookup_for_writing(value) goes through MUST() and the resulting
// bucket pointer is dereferenced, so callers receive a reference.
[[nodiscard]] BucketType& lookup_for_writing(T const& value)
{
    return *MUST(try_lookup_for_writing(value));
}
|
2018-10-13 15:22:09 +03:00
|
|
|
|
2021-04-11 11:25:22 +03:00
|
|
|
// Number of buckets that count towards the load factor: live entries plus deleted buckets.
[[nodiscard]] size_t used_bucket_count() const
{
    return m_deleted_count + m_size;
}
|
|
|
|
// True when (used_bucket_count() + 1), expressed in percent of m_capacity, reaches
// load_factor_in_percent. Both sides are scaled instead of dividing.
// NOTE(review): the "+ 1" presumably accounts for the entry about to be inserted - confirm.
[[nodiscard]] bool should_grow() const
{
    auto const scaled_occupancy = (used_bucket_count() + 1) * 100;
    auto const scaled_limit = m_capacity * load_factor_in_percent;
    return scaled_occupancy >= scaled_limit;
}
|
2018-10-10 12:53:07 +03:00
|
|
|
|
2022-03-06 21:11:17 +03:00
|
|
|
void delete_bucket(auto& bucket)
|
|
|
|
{
|
|
|
|
bucket.slot()->~T();
|
2022-03-07 17:10:10 +03:00
|
|
|
bucket.state = BucketState::Deleted;
|
2022-03-06 21:11:17 +03:00
|
|
|
|
|
|
|
if constexpr (IsOrdered) {
|
|
|
|
if (bucket.previous)
|
|
|
|
bucket.previous->next = bucket.next;
|
|
|
|
else
|
|
|
|
m_collection_data.head = bucket.next;
|
|
|
|
if (bucket.next)
|
|
|
|
bucket.next->previous = bucket.previous;
|
|
|
|
else
|
|
|
|
m_collection_data.tail = bucket.previous;
|
2022-06-23 18:00:41 +03:00
|
|
|
bucket.previous = nullptr;
|
2022-06-22 21:06:28 +03:00
|
|
|
bucket.next = nullptr;
|
2022-03-06 21:11:17 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-06-13 17:26:08 +03:00
|
|
|
BucketType* m_buckets { nullptr };
|
|
|
|
|
|
|
|
[[no_unique_address]] CollectionDataType m_collection_data;
|
2020-10-16 00:34:07 +03:00
|
|
|
size_t m_size { 0 };
|
|
|
|
size_t m_capacity { 0 };
|
|
|
|
size_t m_deleted_count { 0 };
|
|
|
|
};
|
2018-10-10 12:53:07 +03:00
|
|
|
}
|
|
|
|
|
2022-11-26 14:18:30 +03:00
|
|
|
#if USING_AK_GLOBALLY
|
|
|
|
using AK::HashSetResult;
|
2018-10-10 12:53:07 +03:00
|
|
|
using AK::HashTable;
|
2021-06-13 17:26:08 +03:00
|
|
|
using AK::OrderedHashTable;
|
2022-11-26 14:18:30 +03:00
|
|
|
#endif
|