mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-11-09 16:04:41 +03:00
14e02978fc
Avoid unspecified behavior of mmap when a file is resized reported by Christian Hardmeier
Fixes for Mavericks and a workaround for Boost's broken semaphore
Clean clang compile (of kenlm)
Merged some of 744376b3fb
but also undid some of it because it was just masking a fundaemntal problem with pread rather than working around windows limitations
327 lines
10 KiB
C++
327 lines
10 KiB
C++
#ifndef UTIL_PROBING_HASH_TABLE__
|
|
#define UTIL_PROBING_HASH_TABLE__
|
|
|
|
#include "util/exception.hh"
|
|
#include "util/scoped.hh"
|
|
|
|
#include <algorithm>
|
|
#include <cstddef>
|
|
#include <functional>
|
|
#include <vector>
|
|
|
|
#include <assert.h>
|
|
#include <stdint.h>
|
|
|
|
namespace util {
|
|
|
|
/* Thrown when table grows too large */
|
|
class ProbingSizeException : public Exception {
|
|
public:
|
|
ProbingSizeException() throw() {}
|
|
~ProbingSizeException() throw() {}
|
|
};
|
|
|
|
// std::identity is an SGI extension :-(
|
|
struct IdentityHash {
|
|
template <class T> T operator()(T arg) const { return arg; }
|
|
};
|
|
|
|
template <class EntryT, class HashT, class EqualT> class AutoProbing;
|
|
|
|
/* Non-standard hash table
|
|
* Buckets must be set at the beginning and must be greater than maximum number
|
|
* of elements, else it throws ProbingSizeException.
|
|
* Memory management and initialization is externalized to make it easier to
|
|
* serialize these to disk and load them quickly.
|
|
* Uses linear probing to find value.
|
|
* Only insert and lookup operations.
|
|
*/
|
|
template <class EntryT, class HashT, class EqualT = std::equal_to<typename EntryT::Key> > class ProbingHashTable {
|
|
public:
|
|
typedef EntryT Entry;
|
|
typedef typename Entry::Key Key;
|
|
typedef const Entry *ConstIterator;
|
|
typedef Entry *MutableIterator;
|
|
typedef HashT Hash;
|
|
typedef EqualT Equal;
|
|
|
|
static uint64_t Size(uint64_t entries, float multiplier) {
|
|
uint64_t buckets = std::max(entries + 1, static_cast<uint64_t>(multiplier * static_cast<float>(entries)));
|
|
return buckets * sizeof(Entry);
|
|
}
|
|
|
|
// Must be assigned to later.
|
|
ProbingHashTable() : entries_(0)
|
|
#ifdef DEBUG
|
|
, initialized_(false)
|
|
#endif
|
|
{}
|
|
|
|
ProbingHashTable(void *start, std::size_t allocated, const Key &invalid = Key(), const Hash &hash_func = Hash(), const Equal &equal_func = Equal())
|
|
: begin_(reinterpret_cast<MutableIterator>(start)),
|
|
buckets_(allocated / sizeof(Entry)),
|
|
end_(begin_ + buckets_),
|
|
invalid_(invalid),
|
|
hash_(hash_func),
|
|
equal_(equal_func),
|
|
entries_(0)
|
|
#ifdef DEBUG
|
|
, initialized_(true)
|
|
#endif
|
|
{}
|
|
|
|
void Relocate(void *new_base) {
|
|
begin_ = reinterpret_cast<MutableIterator>(new_base);
|
|
end_ = begin_ + buckets_;
|
|
}
|
|
|
|
template <class T> MutableIterator Insert(const T &t) {
|
|
#ifdef DEBUG
|
|
assert(initialized_);
|
|
#endif
|
|
UTIL_THROW_IF(++entries_ >= buckets_, ProbingSizeException, "Hash table with " << buckets_ << " buckets is full.");
|
|
return UncheckedInsert(t);
|
|
}
|
|
|
|
// Return true if the value was found (and not inserted). This is consistent with Find but the opposite if hash_map!
|
|
template <class T> bool FindOrInsert(const T &t, MutableIterator &out) {
|
|
#ifdef DEBUG
|
|
assert(initialized_);
|
|
#endif
|
|
for (MutableIterator i = Ideal(t);;) {
|
|
Key got(i->GetKey());
|
|
if (equal_(got, t.GetKey())) { out = i; return true; }
|
|
if (equal_(got, invalid_)) {
|
|
UTIL_THROW_IF(++entries_ >= buckets_, ProbingSizeException, "Hash table with " << buckets_ << " buckets is full.");
|
|
*i = t;
|
|
out = i;
|
|
return false;
|
|
}
|
|
if (++i == end_) i = begin_;
|
|
}
|
|
}
|
|
|
|
void FinishedInserting() {}
|
|
|
|
// Don't change anything related to GetKey,
|
|
template <class Key> bool UnsafeMutableFind(const Key key, MutableIterator &out) {
|
|
#ifdef DEBUG
|
|
assert(initialized_);
|
|
#endif
|
|
for (MutableIterator i(begin_ + (hash_(key) % buckets_));;) {
|
|
Key got(i->GetKey());
|
|
if (equal_(got, key)) { out = i; return true; }
|
|
if (equal_(got, invalid_)) return false;
|
|
if (++i == end_) i = begin_;
|
|
}
|
|
}
|
|
|
|
// Like UnsafeMutableFind, but the key must be there.
|
|
template <class Key> MutableIterator UnsafeMutableMustFind(const Key key) {
|
|
for (MutableIterator i(begin_ + (hash_(key) % buckets_));;) {
|
|
Key got(i->GetKey());
|
|
if (equal_(got, key)) { return i; }
|
|
assert(!equal_(got, invalid_));
|
|
if (++i == end_) i = begin_;
|
|
}
|
|
}
|
|
|
|
|
|
template <class Key> bool Find(const Key key, ConstIterator &out) const {
|
|
#ifdef DEBUG
|
|
assert(initialized_);
|
|
#endif
|
|
for (ConstIterator i(begin_ + (hash_(key) % buckets_));;) {
|
|
Key got(i->GetKey());
|
|
if (equal_(got, key)) { out = i; return true; }
|
|
if (equal_(got, invalid_)) return false;
|
|
if (++i == end_) i = begin_;
|
|
}
|
|
}
|
|
|
|
// Like Find but we're sure it must be there.
|
|
template <class Key> ConstIterator MustFind(const Key key) const {
|
|
for (ConstIterator i(begin_ + (hash_(key) % buckets_));;) {
|
|
Key got(i->GetKey());
|
|
if (equal_(got, key)) { return i; }
|
|
assert(!equal_(got, invalid_));
|
|
if (++i == end_) i = begin_;
|
|
}
|
|
}
|
|
|
|
void Clear() {
|
|
Entry invalid;
|
|
invalid.SetKey(invalid_);
|
|
std::fill(begin_, end_, invalid);
|
|
entries_ = 0;
|
|
}
|
|
|
|
// Return number of entries assuming no serialization went on.
|
|
std::size_t SizeNoSerialization() const {
|
|
return entries_;
|
|
}
|
|
|
|
// Return memory size expected by Double.
|
|
std::size_t DoubleTo() const {
|
|
return buckets_ * 2 * sizeof(Entry);
|
|
}
|
|
|
|
// Inform the table that it has double the amount of memory.
|
|
// Pass clear_new = false if you are sure the new memory is initialized
|
|
// properly (to invalid_) i.e. by mremap.
|
|
void Double(void *new_base, bool clear_new = true) {
|
|
begin_ = static_cast<MutableIterator>(new_base);
|
|
MutableIterator old_end = begin_ + buckets_;
|
|
buckets_ *= 2;
|
|
end_ = begin_ + buckets_;
|
|
if (clear_new) {
|
|
Entry invalid;
|
|
invalid.SetKey(invalid_);
|
|
std::fill(old_end, end_, invalid);
|
|
}
|
|
std::vector<Entry> rolled_over;
|
|
// Move roll-over entries to a buffer because they might not roll over anymore. This should be small.
|
|
for (MutableIterator i = begin_; i != old_end && !equal_(i->GetKey(), invalid_); ++i) {
|
|
rolled_over.push_back(*i);
|
|
i->SetKey(invalid_);
|
|
}
|
|
/* Re-insert everything. Entries might go backwards to take over a
|
|
* recently opened gap, stay, move to new territory, or wrap around. If
|
|
* an entry wraps around, it might go to a pointer greater than i (which
|
|
* can happen at the beginning) and it will be revisited to possibly fill
|
|
* in a gap created later.
|
|
*/
|
|
Entry temp;
|
|
for (MutableIterator i = begin_; i != old_end; ++i) {
|
|
if (!equal_(i->GetKey(), invalid_)) {
|
|
temp = *i;
|
|
i->SetKey(invalid_);
|
|
UncheckedInsert(temp);
|
|
}
|
|
}
|
|
// Put the roll-over entries back in.
|
|
for (typename std::vector<Entry>::const_iterator i(rolled_over.begin()); i != rolled_over.end(); ++i) {
|
|
UncheckedInsert(*i);
|
|
}
|
|
}
|
|
|
|
// Mostly for tests, check consistency of every entry.
|
|
void CheckConsistency() {
|
|
MutableIterator last;
|
|
for (last = end_ - 1; last >= begin_ && !equal_(last->GetKey(), invalid_); --last) {}
|
|
UTIL_THROW_IF(last == begin_, ProbingSizeException, "Completely full");
|
|
MutableIterator i;
|
|
// Beginning can be wrap-arounds.
|
|
for (i = begin_; !equal_(i->GetKey(), invalid_); ++i) {
|
|
MutableIterator ideal = Ideal(*i);
|
|
UTIL_THROW_IF(ideal > i && ideal <= last, Exception, "Inconsistency at position " << (i - begin_) << " should be at " << (ideal - begin_));
|
|
}
|
|
MutableIterator pre_gap = i;
|
|
for (; i != end_; ++i) {
|
|
if (equal_(i->GetKey(), invalid_)) {
|
|
pre_gap = i;
|
|
continue;
|
|
}
|
|
MutableIterator ideal = Ideal(*i);
|
|
UTIL_THROW_IF(ideal > i || ideal <= pre_gap, Exception, "Inconsistency at position " << (i - begin_) << " with ideal " << (ideal - begin_));
|
|
}
|
|
}
|
|
|
|
private:
|
|
friend class AutoProbing<Entry, Hash, Equal>;
|
|
|
|
template <class T> MutableIterator Ideal(const T &t) {
|
|
return begin_ + (hash_(t.GetKey()) % buckets_);
|
|
}
|
|
|
|
template <class T> MutableIterator UncheckedInsert(const T &t) {
|
|
for (MutableIterator i(Ideal(t));;) {
|
|
if (equal_(i->GetKey(), invalid_)) { *i = t; return i; }
|
|
if (++i == end_) { i = begin_; }
|
|
}
|
|
}
|
|
|
|
MutableIterator begin_;
|
|
std::size_t buckets_;
|
|
MutableIterator end_;
|
|
Key invalid_;
|
|
Hash hash_;
|
|
Equal equal_;
|
|
std::size_t entries_;
|
|
#ifdef DEBUG
|
|
bool initialized_;
|
|
#endif
|
|
};
|
|
|
|
// Resizable linear probing hash table. This owns the memory.
|
|
template <class EntryT, class HashT, class EqualT = std::equal_to<typename EntryT::Key> > class AutoProbing {
|
|
private:
|
|
typedef ProbingHashTable<EntryT, HashT, EqualT> Backend;
|
|
public:
|
|
typedef EntryT Entry;
|
|
typedef typename Entry::Key Key;
|
|
typedef const Entry *ConstIterator;
|
|
typedef Entry *MutableIterator;
|
|
typedef HashT Hash;
|
|
typedef EqualT Equal;
|
|
|
|
AutoProbing(std::size_t initial_size = 10, const Key &invalid = Key(), const Hash &hash_func = Hash(), const Equal &equal_func = Equal()) :
|
|
allocated_(Backend::Size(initial_size, 1.5)), mem_(util::MallocOrThrow(allocated_)), backend_(mem_.get(), allocated_, invalid, hash_func, equal_func) {
|
|
threshold_ = initial_size * 1.2;
|
|
}
|
|
|
|
// Assumes that the key is unique. Multiple insertions won't cause a failure, just inconsistent lookup.
|
|
template <class T> MutableIterator Insert(const T &t) {
|
|
DoubleIfNeeded();
|
|
return backend_.UncheckedInsert(t);
|
|
}
|
|
|
|
template <class T> bool FindOrInsert(const T &t, MutableIterator &out) {
|
|
DoubleIfNeeded();
|
|
return backend_.FindOrInsert(t, out);
|
|
}
|
|
|
|
template <class Key> bool UnsafeMutableFind(const Key key, MutableIterator &out) {
|
|
return backend_.UnsafeMutableFind(key, out);
|
|
}
|
|
|
|
template <class Key> MutableIterator UnsafeMutableMustFind(const Key key) {
|
|
return backend_.UnsafeMutableMustFind(key);
|
|
}
|
|
|
|
template <class Key> bool Find(const Key key, ConstIterator &out) const {
|
|
return backend_.Find(key, out);
|
|
}
|
|
|
|
template <class Key> ConstIterator MustFind(const Key key) const {
|
|
return backend_.MustFind(key);
|
|
}
|
|
|
|
std::size_t Size() const {
|
|
return backend_.SizeNoSerialization();
|
|
}
|
|
|
|
void Clear() {
|
|
backend_.Clear();
|
|
}
|
|
|
|
private:
|
|
void DoubleIfNeeded() {
|
|
if (Size() < threshold_)
|
|
return;
|
|
mem_.call_realloc(backend_.DoubleTo());
|
|
allocated_ = backend_.DoubleTo();
|
|
backend_.Double(mem_.get());
|
|
threshold_ *= 2;
|
|
}
|
|
|
|
std::size_t allocated_;
|
|
util::scoped_malloc mem_;
|
|
Backend backend_;
|
|
std::size_t threshold_;
|
|
};
|
|
|
|
} // namespace util
|
|
|
|
#endif // UTIL_PROBING_HASH_TABLE__
|