Update kenlm

This commit is contained in:
Kenneth Heafield 2015-09-10 16:04:09 +01:00
parent 37f7326057
commit 5732129bd4
12 changed files with 236 additions and 62 deletions

View File

@ -71,7 +71,7 @@ foreach(exe ${EXE_LIST})
add_executable(${exe} ${exe}_main.cc $<TARGET_OBJECTS:kenlm> $<TARGET_OBJECTS:kenlm_util>)
# Link the executable against boost
target_link_libraries(${exe} ${Boost_LIBRARIES})
target_link_libraries(${exe} ${Boost_LIBRARIES} pthread)
# Group executables together
set_target_properties(${exe} PROPERTIES FOLDER executables)
@ -104,7 +104,7 @@ if(BUILD_TESTING)
set_target_properties(${test} PROPERTIES COMPILE_FLAGS -DBOOST_TEST_DYN_LINK)
# Link the executable against boost
target_link_libraries(${test} ${Boost_LIBRARIES})
target_link_libraries(${test} ${Boost_LIBRARIES} pthread)
# model_test requires an extra command line parameter
if ("${test}" STREQUAL "model_test")

View File

@ -45,7 +45,7 @@ add_library(kenlm_builder OBJECT ${KENLM_BUILDER_SOURCE})
add_executable(lmplz lmplz_main.cc $<TARGET_OBJECTS:kenlm> $<TARGET_OBJECTS:kenlm_common> $<TARGET_OBJECTS:kenlm_builder> $<TARGET_OBJECTS:kenlm_util>)
# Link the executable against boost
target_link_libraries(lmplz ${Boost_LIBRARIES})
target_link_libraries(lmplz ${Boost_LIBRARIES} pthread)
# Group executables together
set_target_properties(lmplz PROPERTIES FOLDER executables)
@ -68,7 +68,7 @@ if(BUILD_TESTING)
set_target_properties(${test} PROPERTIES COMPILE_FLAGS "-DBOOST_TEST_DYN_LINK -DBOOST_PROGRAM_OPTIONS_DYN_LINK")
# Link the executable against boost
target_link_libraries(${test} ${Boost_LIBRARIES})
target_link_libraries(${test} ${Boost_LIBRARIES} pthread)
# Specify command arguments for how to run each unit test
#

View File

@ -9,6 +9,7 @@
#include "util/fixed_array.hh"
#include "util/murmur_hash.hh"
#include <iostream>
#include <cassert>
#include <cmath>

View File

@ -191,6 +191,9 @@ class Master {
chains_.clear();
std::cerr << "Chain sizes:";
for (std::size_t i = 0; i < config_.order; ++i) {
// Always have enough for at least one record.
// This was crashing if e.g. there was no 5-gram.
assignments[i] = std::max(assignments[i], block_count[i] * NGram<BuildingPayload>::TotalSize(i + 1));
std::cerr << ' ' << (i+1) << ":" << assignments[i];
chains_.push_back(util::stream::ChainConfig(NGram<BuildingPayload>::TotalSize(i + 1), block_count[i], assignments[i]));
}

View File

@ -52,7 +52,7 @@ foreach(exe ${EXE_LIST})
add_executable(${exe} ${exe}_main.cc $<TARGET_OBJECTS:kenlm> $<TARGET_OBJECTS:kenlm_filter> $<TARGET_OBJECTS:kenlm_util>)
# Link the executable against boost
target_link_libraries(${exe} ${Boost_LIBRARIES})
target_link_libraries(${exe} ${Boost_LIBRARIES} pthread)
# Group executables together
set_target_properties(${exe} PROPERTIES FOLDER executables)

View File

@ -31,9 +31,7 @@ template <class Model, class Width> void QueryFromBytes(const Model &model, int
Width kEOS = model.GetVocabulary().EndSentence();
Width buf[4096];
float sum = 0.0;
while (true) {
std::size_t got = util::ReadOrEOF(fd_in, buf, sizeof(buf));
if (!got) break;
while (std::size_t got = util::ReadOrEOF(fd_in, buf, sizeof(buf))) {
UTIL_THROW_IF2(got % sizeof(Width), "File size not a multiple of vocab id size " << sizeof(Width));
got /= sizeof(Width);
// Do even stuff first.

View File

@ -80,7 +80,7 @@ if(BUILD_TESTING)
set_target_properties(${test} PROPERTIES COMPILE_FLAGS -DBOOST_TEST_DYN_LINK)
# Link the executable against boost
target_link_libraries(${test} ${Boost_LIBRARIES})
target_link_libraries(${test} ${Boost_LIBRARIES} pthread)
# file_piece_test requires an extra command line parameter
if ("${test}" STREQUAL "file_piece_test")

View File

@ -11,10 +11,10 @@
namespace util {
/**
* Defines a fixed-size collection.
* Defines an array with fixed maximum size.
*
* Ever want an array of things by they don't have a default constructor or are
* non-copyable? FixedArray allows constructing one at a time.
* Ever want an array of things but they don't have a default constructor or
* are non-copyable? FixedArray allows constructing one at a time.
*/
template <class T> class FixedArray {
public:

View File

@ -26,6 +26,65 @@ struct IdentityHash {
template <class T> T operator()(T arg) const { return arg; }
};
class DivMod {
public:
explicit DivMod(std::size_t buckets) : buckets_(buckets) {}
static std::size_t RoundBuckets(std::size_t from) {
return from;
}
template <class It> It Ideal(It begin, uint64_t hash) const {
return begin + (hash % buckets_);
}
template <class BaseIt, class OutIt> void Next(BaseIt begin, BaseIt end, OutIt &it) const {
if (++it == end) it = begin;
}
void Double() {
buckets_ *= 2;
}
private:
std::size_t buckets_;
};
class Power2Mod {
public:
explicit Power2Mod(std::size_t buckets) {
UTIL_THROW_IF(!buckets || (((buckets - 1) & buckets)), ProbingSizeException, "Size " << buckets << " is not a power of 2.");
mask_ = buckets - 1;
}
// Round up to next power of 2.
static std::size_t RoundBuckets(std::size_t from) {
--from;
from |= from >> 1;
from |= from >> 2;
from |= from >> 4;
from |= from >> 8;
from |= from >> 16;
from |= from >> 32;
return from + 1;
}
template <class It> It Ideal(It begin, uint64_t hash) const {
return begin + (hash & mask_);
}
template <class BaseIt, class OutIt> void Next(BaseIt begin, BaseIt /*end*/, OutIt &it) const {
it = begin + ((it - begin + 1) & mask_);
}
void Double() {
mask_ = (mask_ << 1) | 1;
}
private:
std::size_t mask_;
};
template <class EntryT, class HashT, class EqualT> class AutoProbing;
/* Non-standard hash table
@ -36,7 +95,7 @@ template <class EntryT, class HashT, class EqualT> class AutoProbing;
* Uses linear probing to find value.
* Only insert and lookup operations.
*/
template <class EntryT, class HashT, class EqualT = std::equal_to<typename EntryT::Key> > class ProbingHashTable {
template <class EntryT, class HashT, class EqualT = std::equal_to<typename EntryT::Key>, class ModT = DivMod> class ProbingHashTable {
public:
typedef EntryT Entry;
typedef typename Entry::Key Key;
@ -44,14 +103,15 @@ template <class EntryT, class HashT, class EqualT = std::equal_to<typename Entry
typedef Entry *MutableIterator;
typedef HashT Hash;
typedef EqualT Equal;
typedef ModT Mod;
static uint64_t Size(uint64_t entries, float multiplier) {
uint64_t buckets = std::max(entries + 1, static_cast<uint64_t>(multiplier * static_cast<float>(entries)));
uint64_t buckets = Mod::RoundBuckets(std::max(entries + 1, static_cast<uint64_t>(multiplier * static_cast<float>(entries))));
return buckets * sizeof(Entry);
}
// Must be assigned to later.
ProbingHashTable() : entries_(0)
ProbingHashTable() : mod_(1), entries_(0)
#ifdef DEBUG
, initialized_(false)
#endif
@ -59,11 +119,12 @@ template <class EntryT, class HashT, class EqualT = std::equal_to<typename Entry
ProbingHashTable(void *start, std::size_t allocated, const Key &invalid = Key(), const Hash &hash_func = Hash(), const Equal &equal_func = Equal())
: begin_(reinterpret_cast<MutableIterator>(start)),
buckets_(allocated / sizeof(Entry)),
end_(begin_ + buckets_),
end_(begin_ + allocated / sizeof(Entry)),
buckets_(end_ - begin_),
invalid_(invalid),
hash_(hash_func),
equal_(equal_func),
mod_(end_ - begin_),
entries_(0)
#ifdef DEBUG
, initialized_(true)
@ -75,6 +136,13 @@ template <class EntryT, class HashT, class EqualT = std::equal_to<typename Entry
end_ = begin_ + buckets_;
}
MutableIterator Ideal(const Key key) {
return mod_.Ideal(begin_, hash_(key));
}
ConstIterator Ideal(const Key key) const {
return mod_.Ideal(begin_, hash_(key));
}
template <class T> MutableIterator Insert(const T &t) {
#ifdef DEBUG
assert(initialized_);
@ -83,12 +151,12 @@ template <class EntryT, class HashT, class EqualT = std::equal_to<typename Entry
return UncheckedInsert(t);
}
// Return true if the value was found (and not inserted). This is consistent with Find but the opposite if hash_map!
// Return true if the value was found (and not inserted). This is consistent with Find but the opposite of hash_map!
template <class T> bool FindOrInsert(const T &t, MutableIterator &out) {
#ifdef DEBUG
assert(initialized_);
#endif
for (MutableIterator i = Ideal(t.GetKey());;) {
for (MutableIterator i = Ideal(t.GetKey());;mod_.Next(begin_, end_, i)) {
Key got(i->GetKey());
if (equal_(got, t.GetKey())) { out = i; return true; }
if (equal_(got, invalid_)) {
@ -97,7 +165,6 @@ template <class EntryT, class HashT, class EqualT = std::equal_to<typename Entry
out = i;
return false;
}
if (++i == end_) i = begin_;
}
}
@ -108,44 +175,45 @@ template <class EntryT, class HashT, class EqualT = std::equal_to<typename Entry
#ifdef DEBUG
assert(initialized_);
#endif
for (MutableIterator i(Ideal(key));;) {
for (MutableIterator i(Ideal(key));; mod_.Next(begin_, end_, i)) {
Key got(i->GetKey());
if (equal_(got, key)) { out = i; return true; }
if (equal_(got, invalid_)) return false;
if (++i == end_) i = begin_;
}
}
// Like UnsafeMutableFind, but the key must be there.
template <class Key> MutableIterator UnsafeMutableMustFind(const Key key) {
for (MutableIterator i(Ideal(key));;) {
for (MutableIterator i(Ideal(key));; mod_.Next(begin_, end_, i)) {
Key got(i->GetKey());
if (equal_(got, key)) { return i; }
assert(!equal_(got, invalid_));
if (++i == end_) i = begin_;
}
}
template <class Key> bool Find(const Key key, ConstIterator &out) const {
// Iterator is both input and output.
template <class Key> bool FindFromIdeal(const Key key, ConstIterator &i) const {
#ifdef DEBUG
assert(initialized_);
#endif
for (ConstIterator i(Ideal(key));;) {
for (;; mod_.Next(begin_, end_, i)) {
Key got(i->GetKey());
if (equal_(got, key)) { out = i; return true; }
if (equal_(got, key)) return true;
if (equal_(got, invalid_)) return false;
if (++i == end_) i = begin_;
}
}
template <class Key> bool Find(const Key key, ConstIterator &out) const {
out = Ideal(key);
return FindFromIdeal(key, out);
}
// Like Find but we're sure it must be there.
template <class Key> ConstIterator MustFind(const Key key) const {
for (ConstIterator i(Ideal(key));;) {
for (ConstIterator i(Ideal(key));; mod_.Next(begin_, end_, i)) {
Key got(i->GetKey());
if (equal_(got, key)) { return i; }
assert(!equal_(got, invalid_));
if (++i == end_) i = begin_;
}
}
@ -174,6 +242,7 @@ template <class EntryT, class HashT, class EqualT = std::equal_to<typename Entry
MutableIterator old_end = begin_ + buckets_;
buckets_ *= 2;
end_ = begin_ + buckets_;
mod_.Double();
if (clear_new) {
Entry invalid;
invalid.SetKey(invalid_);
@ -230,26 +299,20 @@ template <class EntryT, class HashT, class EqualT = std::equal_to<typename Entry
private:
friend class AutoProbing<Entry, Hash, Equal>;
MutableIterator Ideal(const Key key) {
return begin_ + (hash_(key) % buckets_);
}
ConstIterator Ideal(const Key key) const {
return begin_ + (hash_(key) % buckets_);
}
template <class T> MutableIterator UncheckedInsert(const T &t) {
for (MutableIterator i(Ideal(t.GetKey()));;) {
for (MutableIterator i(Ideal(t.GetKey()));; mod_.Next(begin_, end_, i)) {
if (equal_(i->GetKey(), invalid_)) { *i = t; return i; }
if (++i == end_) { i = begin_; }
}
}
MutableIterator begin_;
std::size_t buckets_;
MutableIterator end_;
std::size_t buckets_;
Key invalid_;
Hash hash_;
Equal equal_;
Mod mod_;
std::size_t entries_;
#ifdef DEBUG
bool initialized_;
@ -259,7 +322,7 @@ template <class EntryT, class HashT, class EqualT = std::equal_to<typename Entry
// Resizable linear probing hash table. This owns the memory.
template <class EntryT, class HashT, class EqualT = std::equal_to<typename EntryT::Key> > class AutoProbing {
private:
typedef ProbingHashTable<EntryT, HashT, EqualT> Backend;
typedef ProbingHashTable<EntryT, HashT, EqualT, Power2Mod> Backend;
public:
static std::size_t MemUsage(std::size_t size, float multiplier = 1.5) {
return Backend::Size(size, multiplier);
@ -272,7 +335,7 @@ template <class EntryT, class HashT, class EqualT = std::equal_to<typename Entry
typedef HashT Hash;
typedef EqualT Equal;
AutoProbing(std::size_t initial_size = 10, const Key &invalid = Key(), const Hash &hash_func = Hash(), const Equal &equal_func = Equal()) :
AutoProbing(std::size_t initial_size = 5, const Key &invalid = Key(), const Hash &hash_func = Hash(), const Equal &equal_func = Equal()) :
allocated_(Backend::Size(initial_size, 1.5)), mem_(util::MallocOrThrow(allocated_)), backend_(mem_.get(), allocated_, invalid, hash_func, equal_func) {
threshold_ = initial_size * 1.2;
Clear();

View File

@ -1,10 +1,8 @@
#include "util/file.hh"
#include "util/probing_hash_table.hh"
#include "util/scoped.hh"
#include "util/usage.hh"
#include <boost/random/mersenne_twister.hpp>
#include <boost/random/uniform_int_distribution.hpp>
#include <iostream>
namespace util {
@ -16,34 +14,143 @@ struct Entry {
Key GetKey() const { return key; }
};
typedef util::ProbingHashTable<Entry, util::IdentityHash> Table;
// I don't care if this doesn't run on Windows. Empirically /dev/urandom was faster than boost::random's Mersenne Twister.
class URandom {
public:
URandom() :
it_(buf_ + 1024), end_(buf_ + 1024),
file_(util::OpenReadOrThrow("/dev/urandom")) {}
void Test(uint64_t entries, uint64_t lookups, float multiplier = 1.5) {
std::size_t size = Table::Size(entries, multiplier);
uint64_t Get() {
if (it_ == end_) {
it_ = buf_;
util::ReadOrThrow(file_.get(), buf_, sizeof(buf_));
it_ = buf_;
}
return *it_++;
}
void Batch(uint64_t *begin, uint64_t *end) {
util::ReadOrThrow(file_.get(), begin, (end - begin) * sizeof(uint64_t));
}
private:
uint64_t buf_[1024];
uint64_t *it_, *end_;
util::scoped_fd file_;
};
struct PrefetchEntry {
uint64_t key;
const Entry *pointer;
};
const std::size_t kPrefetchSize = 4;
template <class Table> class PrefetchQueue {
public:
explicit PrefetchQueue(Table &table) : table_(table), cur_(0), twiddle_(false) {
for (PrefetchEntry *i = entries_; i != entries_ + kPrefetchSize; ++i)
i->pointer = NULL;
}
void Add(uint64_t key) {
if (Cur().pointer) {
twiddle_ ^= table_.FindFromIdeal(Cur().key, Cur().pointer);
}
Cur().key = key;
Cur().pointer = table_.Ideal(key);
__builtin_prefetch(Cur().pointer, 0, 0);
Next();
}
bool Drain() {
if (Cur().pointer) {
for (PrefetchEntry *i = &Cur(); i < entries_ + kPrefetchSize; ++i) {
twiddle_ ^= table_.FindFromIdeal(i->key, i->pointer);
}
}
for (PrefetchEntry *i = entries_; i < &Cur(); ++i) {
twiddle_ ^= table_.FindFromIdeal(i->key, i->pointer);
}
return twiddle_;
}
private:
PrefetchEntry &Cur() { return entries_[cur_]; }
void Next() {
++cur_;
cur_ = cur_ % kPrefetchSize;
}
Table &table_;
PrefetchEntry entries_[kPrefetchSize];
std::size_t cur_;
bool twiddle_;
PrefetchQueue(const PrefetchQueue&);
void operator=(const PrefetchQueue&);
};
/*template <class Table> class Immediate {
public:
private:
Table &table_;
};*/
std::size_t Size(uint64_t entries, float multiplier = 1.5) {
typedef util::ProbingHashTable<Entry, util::IdentityHash, std::equal_to<Entry::Key>, Power2Mod> Table;
// Always round up to power of 2 for fair comparison.
return Power2Mod::RoundBuckets(Table::Size(entries, multiplier) / sizeof(Entry)) * sizeof(Entry);
}
template <class Mod> bool Test(URandom &rn, uint64_t entries, const uint64_t *const queries_begin, const uint64_t *const queries_end, float multiplier = 1.5) {
typedef util::ProbingHashTable<Entry, util::IdentityHash, std::equal_to<Entry::Key>, Mod> Table;
std::size_t size = Size(entries, multiplier);
scoped_malloc backing(util::CallocOrThrow(size));
Table table(backing.get(), size);
boost::random::mt19937 gen;
boost::random::uniform_int_distribution<> dist(std::numeric_limits<uint64_t>::min(), std::numeric_limits<uint64_t>::max());
double start = UserTime();
for (uint64_t i = 0; i < entries; ++i) {
Entry entry;
entry.key = dist(gen);
entry.key = rn.Get();
table.Insert(entry);
}
double inserted = UserTime();
bool meaningless = true;
for (uint64_t i = 0; i < lookups; ++i) {
Table::ConstIterator it;
meaningless ^= table.Find(dist(gen), it);
double inserted = UserTime() - start;
double before_lookup = UserTime();
PrefetchQueue<Table> queue(table);
for (const uint64_t *i = queries_begin; i != queries_end; ++i) {
queue.Add(*i);
/* typename Table::ConstIterator it;
meaningless ^= table.Find(*i, it);*/
}
std::cout << meaningless << ' ' << entries << ' ' << multiplier << ' ' << (inserted - start) << ' ' << (UserTime() - inserted) / static_cast<double>(lookups) << std::endl;
bool meaningless = queue.Drain();
std::cout << entries << ' ' << size << ' ' << (inserted / static_cast<double>(entries)) << ' ' << (UserTime() - before_lookup) / static_cast<double>(queries_end - queries_begin) << '\n';
return meaningless;
}
template <class Mod> bool TestRun(uint64_t lookups = 20000000, float multiplier = 1.5) {
URandom rn;
util::scoped_malloc queries(util::CallocOrThrow(lookups * sizeof(uint64_t)));
rn.Batch(static_cast<uint64_t*>(queries.get()), static_cast<uint64_t*>(queries.get()) + lookups);
uint64_t physical_mem_limit = util::GuessPhysicalMemory() / 2;
bool meaningless = true;
for (uint64_t i = 4; Size(i / multiplier) < physical_mem_limit; i *= 4) {
meaningless ^= util::Test<Mod>(rn, i / multiplier, static_cast<const uint64_t*>(queries.get()), static_cast<const uint64_t*>(queries.get()) + lookups, multiplier);
}
return meaningless;
}
} // namespace
} // namespace util
int main() {
for (uint64_t i = 1; i <= 10000000ULL; i *= 10) {
util::Test(i, 10000000);
}
bool meaningless = false;
std::cout << "#Integer division\n";
meaningless ^= util::TestRun<util::DivMod>();
std::cout << "#Masking\n";
meaningless ^= util::TestRun<util::Power2Mod>();
std::cerr << "Meaningless: " << meaningless << '\n';
}

View File

@ -55,7 +55,7 @@ if(BUILD_TESTING)
set_target_properties(${test} PROPERTIES COMPILE_FLAGS -DBOOST_TEST_DYN_LINK)
# Link the executable against boost
target_link_libraries(${test} ${Boost_LIBRARIES})
target_link_libraries(${test} ${Boost_LIBRARIES} pthread)
# Specify command arguments for how to run each unit test
#

View File

@ -1,6 +1,8 @@
#include "util/stream/rewindable_stream.hh"
#include "util/pcqueue.hh"
#include <iostream>
namespace util {
namespace stream {