Updates to kenlm:

Add a kludged and slow stateless interface, requested by Hieu because apparently Moses can't store language model state.  
Split ARPA reading, vocabulary, and weights into separate files.  
Remove build shell scripts that won't work after Hieu changed the header file layout.  



git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3572 1f5c12ca-751b-0410-a591-d2e778427230
heafield 2010-09-27 03:46:44 +00:00
parent 61f5472f1c
commit e6184ae947
13 changed files with 523 additions and 310 deletions


@ -1,2 +0,0 @@
#!/bin/bash
rm */*.o query build_binary */*_test


@ -1,11 +0,0 @@
#!/bin/bash
#This is just an example compilation. You should integrate these files into your build system. I can provide boost jam if you want.
#If your code uses ICU, edit util/string_piece.hh and uncomment #define USE_ICU
set -e
for i in util/{ersatz_progress,exception,file_piece,murmur_hash,scoped,string_piece,mmap} lm/{exception,virtual_interface,ngram}; do
g++ -I. -O3 -c $i.cc -o $i.o
done
g++ -I. -O3 lm/ngram_build_binary.cc {lm,util}/*.o -o build_binary
g++ -I. -O3 lm/ngram_query.cc {lm,util}/*.o -o query


@ -6,6 +6,8 @@ libkenlm_a_SOURCES = \
virtual_interface.cc \
ngram.cc \
exception.cc \
read_arpa.cc \
vocab.cc \
../util/string_piece.cc \
../util/scoped.cc \
../util/murmur_hash.cc \


@ -1,6 +1,7 @@
#include "ngram.hh"
#include "exception.hh"
#include "read_arpa.hh"
#include "../util/file_piece.hh"
#include "../util/joint_sort.hh"
#include "../util/murmur_hash.hh"
@ -28,97 +29,7 @@ size_t hash_value(const State &state) {
return util::MurmurHashNative(state.history_, sizeof(WordIndex) * state.valid_length_);
}
namespace detail {
uint64_t HashForVocab(const char *str, std::size_t len) {
// This proved faster than Boost's hash in speed trials: total load time Murmur 67090000, Boost 72210000
// Chose to use 64A instead of native so binary format will be portable across 64 and 32 bit.
return util::MurmurHash64A(str, len, 0);
}
void Prob::SetBackoff(float to) {
UTIL_THROW(FormatLoadException, "Attempt to set backoff " << to << " for the highest order n-gram");
}
// Normally static initialization is a bad idea but MurmurHash is pure arithmetic, so this is ok.
const uint64_t kUnknownHash = HashForVocab("<unk>", 5);
// Sadly some LMs have <UNK>.
const uint64_t kUnknownCapHash = HashForVocab("<UNK>", 5);
} // namespace detail
SortedVocabulary::SortedVocabulary() : begin_(NULL), end_(NULL) {}
std::size_t SortedVocabulary::Size(std::size_t entries, float /*ignored*/) {
// Lead with the number of entries.
return sizeof(uint64_t) + sizeof(Entry) * entries;
}
void SortedVocabulary::Init(void *start, std::size_t allocated, std::size_t entries) {
assert(allocated >= Size(entries));
// Leave space for number of entries.
begin_ = reinterpret_cast<Entry*>(reinterpret_cast<uint64_t*>(start) + 1);
end_ = begin_;
saw_unk_ = false;
}
WordIndex SortedVocabulary::Insert(const StringPiece &str) {
uint64_t hashed = detail::HashForVocab(str);
if (hashed == detail::kUnknownHash || hashed == detail::kUnknownCapHash) {
saw_unk_ = true;
return 0;
}
end_->key = hashed;
++end_;
// This is 1 + the offset where it was inserted to make room for unk.
return end_ - begin_;
}
bool SortedVocabulary::FinishedLoading(detail::ProbBackoff *reorder_vocab) {
util::JointSort(begin_, end_, reorder_vocab + 1);
SetSpecial(Index("<s>"), Index("</s>"), 0, end_ - begin_ + 1);
// Save size.
*(reinterpret_cast<uint64_t*>(begin_) - 1) = end_ - begin_;
return saw_unk_;
}
void SortedVocabulary::LoadedBinary() {
end_ = begin_ + *(reinterpret_cast<const uint64_t*>(begin_) - 1);
SetSpecial(Index("<s>"), Index("</s>"), 0, end_ - begin_ + 1);
}
namespace detail {
template <class Search> MapVocabulary<Search>::MapVocabulary() {}
template <class Search> void MapVocabulary<Search>::Init(void *start, std::size_t allocated, std::size_t /*entries*/) {
lookup_ = Lookup(start, allocated);
available_ = 1;
// Later if available_ != expected_available_ then we can throw UnknownMissingException.
saw_unk_ = false;
}
template <class Search> WordIndex MapVocabulary<Search>::Insert(const StringPiece &str) {
uint64_t hashed = HashForVocab(str);
// Prevent unknown from going into the table.
if (hashed == kUnknownHash || hashed == kUnknownCapHash) {
saw_unk_ = true;
return 0;
} else {
lookup_.Insert(Lookup::Packing::Make(hashed, available_));
return available_++;
}
}
template <class Search> bool MapVocabulary<Search>::FinishedLoading(ProbBackoff * /*reorder_vocab*/) {
lookup_.FinishedInserting();
SetSpecial(Index("<s>"), Index("</s>"), 0, available_);
return saw_unk_;
}
template <class Search> void MapVocabulary<Search>::LoadedBinary() {
lookup_.LoadedBinary();
SetSpecial(Index("<s>"), Index("</s>"), 0, available_);
}
namespace {
/* All of the entropy is in low order bits and boost::hash does poorly with
* these. Odd numbers near 2^64 chosen by mashing on the keyboard. There is a
@ -138,71 +49,6 @@ uint64_t ChainedWordHash(const WordIndex *word, const WordIndex *word_end) {
return current;
}
bool IsEntirelyWhiteSpace(const StringPiece &line) {
for (size_t i = 0; i < static_cast<size_t>(line.size()); ++i) {
if (!isspace(line.data()[i])) return false;
}
return true;
}
void ReadARPACounts(util::FilePiece &in, std::vector<size_t> &number) {
number.clear();
StringPiece line;
if (!IsEntirelyWhiteSpace(line = in.ReadLine())) UTIL_THROW(FormatLoadException, "First line was \"" << line << "\" not blank");
if ((line = in.ReadLine()) != "\\data\\") UTIL_THROW(FormatLoadException, "Second line was \"" << line << "\" not \\data\\.");
while (!IsEntirelyWhiteSpace(line = in.ReadLine())) {
if (line.size() < 6 || strncmp(line.data(), "ngram ", 6)) UTIL_THROW(FormatLoadException, "Count line \"" << line << "\" doesn't begin with \"ngram \"");
// So strtol doesn't go off the end of line.
std::string remaining(line.data() + 6, line.size() - 6);
char *end_ptr;
unsigned long int length = std::strtol(remaining.c_str(), &end_ptr, 10);
if ((end_ptr == remaining.c_str()) || (length - 1 != number.size())) UTIL_THROW(FormatLoadException, "ngram count lengths should be consecutive starting with 1: " << line);
if (*end_ptr != '=') UTIL_THROW(FormatLoadException, "Expected = immediately following the first number in the count line " << line);
++end_ptr;
const char *start = end_ptr;
long int count = std::strtol(start, &end_ptr, 10);
if (count < 0) UTIL_THROW(FormatLoadException, "Negative n-gram count " << count);
if (start == end_ptr) UTIL_THROW(FormatLoadException, "Couldn't parse n-gram count from " << line);
number.push_back(count);
}
}
void ReadNGramHeader(util::FilePiece &in, unsigned int length) {
StringPiece line;
while (IsEntirelyWhiteSpace(line = in.ReadLine())) {}
std::stringstream expected;
expected << '\\' << length << "-grams:";
if (line != expected.str()) UTIL_THROW(FormatLoadException, "Was expecting n-gram header " << expected.str() << " but got " << line << " instead.");
}
// Special unigram reader because unigram's data structure is different and because we're inserting vocab words.
template <class Voc> void Read1Grams(util::FilePiece &f, const size_t count, Voc &vocab, ProbBackoff *unigrams) {
ReadNGramHeader(f, 1);
for (size_t i = 0; i < count; ++i) {
try {
float prob = f.ReadFloat();
if (f.get() != '\t') UTIL_THROW(FormatLoadException, "Expected tab after probability");
ProbBackoff &value = unigrams[vocab.Insert(f.ReadDelimited())];
value.prob = prob;
switch (f.get()) {
case '\t':
value.SetBackoff(f.ReadFloat());
if ((f.get() != '\n')) UTIL_THROW(FormatLoadException, "Expected newline after backoff");
break;
case '\n':
value.ZeroBackoff();
break;
default:
UTIL_THROW(FormatLoadException, "Expected tab or newline after unigram");
}
} catch(util::Exception &e) {
e << " in the " << i << "th 1-gram at byte " << f.Offset();
throw;
}
}
if (f.ReadLine().size()) UTIL_THROW(FormatLoadException, "Expected blank line after unigrams at byte " << f.Offset());
}
template <class Voc, class Store> void ReadNGrams(util::FilePiece &f, const unsigned int n, const size_t count, const Voc &vocab, Store &store) {
ReadNGramHeader(f, n);
@ -210,35 +56,18 @@ template <class Voc, class Store> void ReadNGrams(util::FilePiece &f, const unsi
WordIndex vocab_ids[n];
typename Store::Packing::Value value;
for (size_t i = 0; i < count; ++i) {
try {
value.prob = f.ReadFloat();
for (WordIndex *vocab_out = &vocab_ids[n-1]; vocab_out >= vocab_ids; --vocab_out) {
*vocab_out = vocab.Index(f.ReadDelimited());
}
uint64_t key = ChainedWordHash(vocab_ids, vocab_ids + n);
switch (f.get()) {
case '\t':
value.SetBackoff(f.ReadFloat());
if ((f.get() != '\n')) UTIL_THROW(FormatLoadException, "Expected newline after backoff");
break;
case '\n':
value.ZeroBackoff();
break;
default:
UTIL_THROW(FormatLoadException, "Expected tab or newline after n-gram");
}
store.Insert(Store::Packing::Make(key, value));
} catch(util::Exception &e) {
e << " in the " << i << "th " << n << "-gram at byte " << f.Offset();
throw;
}
ReadNGram(f, n, vocab, vocab_ids, value);
uint64_t key = ChainedWordHash(vocab_ids, vocab_ids + n);
store.Insert(Store::Packing::Make(key, value));
}
if (f.ReadLine().size()) UTIL_THROW(FormatLoadException, "Expected blank line after " << n << "-grams at byte " << f.Offset());
store.FinishedInserting();
}
} // namespace
namespace detail {
template <class Search, class VocabularyT> size_t GenericModel<Search, VocabularyT>::Size(const std::vector<size_t> &counts, const Config &config) {
if (counts.size() > kMaxOrder) UTIL_THROW(FormatLoadException, "This model has order " << counts.size() << ". Edit ngram.hh's kMaxOrder to at least this value and recompile.");
if (counts.size() < 2) UTIL_THROW(FormatLoadException, "This ngram implementation assumes at least a bigram model.");
@ -402,7 +231,7 @@ template <class Search, class VocabularyT> GenericModel<Search, VocabularyT>::Ge
template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT>::LoadFromARPA(util::FilePiece &f, const std::vector<size_t> &counts, const Config &config) {
// Read the unigrams.
Read1Grams(f, counts[0], vocab_, unigram_);
bool saw_unk = vocab_.FinishedLoading(unigram_);
bool saw_unk = vocab_.SawUnk();
if (!saw_unk) {
switch(config.unknown_missing) {
case Config::THROW_UP:
@ -443,7 +272,6 @@ template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search,
State &out_state) const {
FullScoreReturn ret;
// This is end pointer passed to SumBackoffs.
const ProbBackoff &unigram = unigram_[new_word];
if (new_word == 0) {
ret.ngram_length = out_state.valid_length_ = 0;
@ -515,7 +343,95 @@ template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search,
return ret;
}
template class GenericModel<ProbingSearch, MapVocabulary<ProbingSearch> >;
/* Until somebody implements stateful language models in Moses, here's a slower stateless version. It also provides a mostly meaningless void * value that can be used for pruning. */
template <class Search, class VocabularyT> HieuShouldRefactorMoses GenericModel<Search, VocabularyT>::SlowStatelessScore(
const WordIndex *begin, const WordIndex *end) const {
begin = std::max(begin, end - P::Order());
HieuShouldRefactorMoses ret;
// Start from the unigram for the newest word, *(end - 1).
const ProbBackoff &unigram = unigram_[*(end - 1)];
if (!*(end - 1)) {
ret.ngram_length = 0;
// all of backoff.
ret.prob = unigram.prob + SlowBackoffLookup(begin, end - 1, 1);
ret.meaningless_unique_state = 0;
return ret;
}
ret.prob = unigram.prob;
if (begin == end - 1) {
ret.ngram_length = 1;
ret.meaningless_unique_state = reinterpret_cast<void*>(*(end - 1));
// No backoff because the context is empty.
return ret;
}
// OK, now we know that the bigram contains known words. Start by looking it up.
uint64_t lookup_hash = static_cast<uint64_t>(*(end - 1));
const WordIndex *hist_iter = end - 2;
const WordIndex *const hist_none = begin - 1;
typename std::vector<Middle>::const_iterator mid_iter = middle_.begin();
for (; ; ++mid_iter, --hist_iter) {
if (hist_iter == hist_none) {
// Ran out. No backoff.
ret.ngram_length = end - begin;
// ret.prob was already set.
ret.meaningless_unique_state = reinterpret_cast<void*>(lookup_hash + 1);
return ret;
}
lookup_hash = CombineWordHash(lookup_hash, *hist_iter);
if (mid_iter == middle_.end()) break;
typename Middle::ConstIterator found;
if (!mid_iter->Find(lookup_hash, found)) {
// Didn't find an ngram using hist_iter.
ret.ngram_length = end - 1 - hist_iter;
ret.prob += SlowBackoffLookup(begin, end - 1, mid_iter - middle_.begin() + 1);
ret.meaningless_unique_state = reinterpret_cast<void*>(lookup_hash + 2);
return ret;
}
ret.prob = found->GetValue().prob;
}
typename Longest::ConstIterator found;
if (!longest_.Find(lookup_hash, found)) {
// It's a (P::Order()-1)-gram
ret.ngram_length = P::Order() - 1;
ret.prob += SlowBackoffLookup(begin, end - 1, P::Order() - 1);
ret.meaningless_unique_state = reinterpret_cast<void*>(lookup_hash + 3);
return ret;
}
// It's a P::Order()-gram
ret.ngram_length = P::Order();
ret.prob = found->GetValue().prob;
ret.meaningless_unique_state = reinterpret_cast<void*>(lookup_hash + 4);
return ret;
}
template <class Search, class VocabularyT> float GenericModel<Search, VocabularyT>::SlowBackoffLookup(
const WordIndex *const begin, const WordIndex *const end, unsigned char start) const {
// Add the backoff weights for n-grams of order start to (end - begin).
if (end - begin < static_cast<std::ptrdiff_t>(start)) return 0.0;
float ret = 0.0;
if (start == 1) {
ret += unigram_[*(end - 1)].backoff;
start = 2;
}
uint64_t lookup_hash = static_cast<uint64_t>(*(end - 1));
for (unsigned char i = 2; i < start; ++i) {
lookup_hash = CombineWordHash(lookup_hash, *(end - i));
}
typename Middle::ConstIterator found;
// i is the order of the backoff we're looking for.
for (unsigned char i = start; i <= static_cast<unsigned char>(end - begin); ++i) {
lookup_hash = CombineWordHash(lookup_hash, *(end - i));
if (!middle_[i - 2].Find(lookup_hash, found)) break;
ret += found->GetValue().backoff;
}
return ret;
}
template class GenericModel<ProbingSearch, ProbingVocabulary>;
template class GenericModel<SortedUniformSearch, SortedVocabulary>;
} // namespace detail
} // namespace ngram
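
For orientation, here is a minimal sketch of the two calling conventions side by side. It is not part of this commit: the function is hypothetical, the words are borrowed from the test vocabulary below, and the include path assumes kenlm's root is on the include path.

#include "lm/ngram.hh"

float ScoreBothWays(const lm::ngram::Model &model) {
  const lm::WordIndex words[3] = {
      model.GetVocabulary().Index("<s>"),
      model.GetVocabulary().Index("looking"),
      model.GetVocabulary().Index("on")};
  // Stateless kludge: every call rebuilds the chained hash over the whole
  // context and returns only a mostly meaningless pointer for pruning.
  lm::ngram::HieuShouldRefactorMoses slow = model.SlowStatelessScore(words, words + 3);
  // Stateful interface: State carries the matched history forward, so each
  // call extends the previous lookup instead of repeating it.
  lm::ngram::Model::State state(model.BeginSentenceState()), out;
  model.FullScore(state, words[1], out);
  state = out;
  float fast = model.FullScore(state, words[2], out).prob;
  (void)fast;  // Should agree with slow.prob for the same context.
  return slow.prob;
}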


@ -3,6 +3,8 @@
#include "facade.hh"
#include "ngram_config.hh"
#include "vocab.hh"
#include "weights.hh"
#include "../util/key_value_packing.hh"
#include "../util/mmap.hh"
#include "../util/probing_hash_table.hh"
@ -11,7 +13,6 @@
#include "../util/string_piece.hh"
#include <algorithm>
#include <memory>
#include <vector>
namespace util { class FilePiece; }
@ -48,104 +49,15 @@ class State {
size_t hash_value(const State &state);
namespace detail {
uint64_t HashForVocab(const char *str, std::size_t len);
inline uint64_t HashForVocab(const StringPiece &str) {
return HashForVocab(str.data(), str.length());
}
struct Prob {
// TODO(hieuhoang1972): refactor language models to keep arbitrary state, not a void* pointer. Then use FullScore like good people do. For now, you get a stateless interface.
struct HieuShouldRefactorMoses {
float prob;
void SetBackoff(float to);
void ZeroBackoff() {}
};
// No inheritance so this will be a POD.
struct ProbBackoff {
float prob;
float backoff;
void SetBackoff(float to) { backoff = to; }
void ZeroBackoff() { backoff = 0.0; }
};
} // namespace detail
// Vocabulary based on sorted uniform find storing only uint64_t values and using their offsets as indices.
class SortedVocabulary : public base::Vocabulary {
private:
// Sorted uniform requires a GetKey function.
struct Entry {
uint64_t GetKey() const { return key; }
uint64_t key;
bool operator<(const Entry &other) const {
return key < other.key;
}
};
public:
SortedVocabulary();
WordIndex Index(const StringPiece &str) const {
const Entry *found;
if (util::SortedUniformFind<const Entry *, uint64_t>(begin_, end_, detail::HashForVocab(str), found)) {
return found - begin_ + 1; // +1 because <unk> is 0 and does not appear in the lookup table.
} else {
return 0;
}
}
// Ignores second argument for consistency with probing hash which has a float here.
static size_t Size(std::size_t entries, float ignored = 0.0);
// Everything else is for populating. I'm too lazy to hide and friend these, but you'll only get a const reference anyway.
void Init(void *start, std::size_t allocated, std::size_t entries);
WordIndex Insert(const StringPiece &str);
// Returns true if unknown was seen. Reorders reorder_vocab so that the IDs are sorted.
bool FinishedLoading(detail::ProbBackoff *reorder_vocab);
void LoadedBinary();
private:
Entry *begin_, *end_;
bool saw_unk_;
unsigned char ngram_length;
void *meaningless_unique_state;
};
namespace detail {
// Vocabulary storing a map from uint64_t to WordIndex.
template <class Search> class MapVocabulary : public base::Vocabulary {
public:
MapVocabulary();
WordIndex Index(const StringPiece &str) const {
typename Lookup::ConstIterator i;
return lookup_.Find(HashForVocab(str), i) ? i->GetValue() : 0;
}
static size_t Size(std::size_t entries, float probing_multiplier) {
return Lookup::Size(entries, probing_multiplier);
}
// Everything else is for populating. I'm too lazy to hide and friend these, but you'll only get a const reference anyway.
void Init(void *start, std::size_t allocated, std::size_t entries);
WordIndex Insert(const StringPiece &str);
// Returns true if unknown was seen. Does nothing with reorder_vocab.
bool FinishedLoading(ProbBackoff *reorder_vocab);
void LoadedBinary();
private:
typedef typename Search::template Table<WordIndex>::T Lookup;
Lookup lookup_;
bool saw_unk_;
};
// std::identity is an SGI extension :-(
struct IdentityHash : public std::unary_function<uint64_t, size_t> {
size_t operator()(uint64_t arg) const { return static_cast<size_t>(arg); }
@ -166,8 +78,18 @@ template <class Search, class VocabularyT> class GenericModel : public base::Mod
FullScoreReturn FullScore(const State &in_state, const WordIndex new_word, State &out_state) const;
/* Slower but stateless call. Don't use this if you can avoid it. This
* is mostly a hack for Hieu to integrate it into Moses which is currently
* unable to handle arbitrary LM state. Sigh.
* The word indices should be in an array. *begin is the earliest word of context.
* *(end-1) is the word being appended.
*/
HieuShouldRefactorMoses SlowStatelessScore(const WordIndex *begin, const WordIndex *end) const;
private:
// Appears after Size in the cc.
float SlowBackoffLookup(const WordIndex *const begin, const WordIndex *const end, unsigned char start) const;
// Appears after Size in the cc file.
void SetupMemory(char *start, const std::vector<size_t> &counts, const Config &config);
void LoadFromARPA(util::FilePiece &f, const std::vector<size_t> &counts, const Config &config);
@ -214,10 +136,10 @@ struct SortedUniformSearch {
} // namespace detail
// These must also be instantiated in the cc file.
typedef detail::MapVocabulary<detail::ProbingSearch> Vocabulary;
typedef ::lm::ProbingVocabulary Vocabulary;
typedef detail::GenericModel<detail::ProbingSearch, Vocabulary> Model;
// SortedVocabulary was defined above.
typedef ::lm::SortedVocabulary SortedVocabulary;
typedef detail::GenericModel<detail::SortedUniformSearch, SortedVocabulary> SortedModel;
} // namespace ngram


@ -22,7 +22,7 @@ namespace {
StartTest(word, ngram, score) \
state = out;
template <class M> void Starters(M &model) {
template <class M> void Starters(const M &model) {
FullScoreReturn ret;
Model::State state(model.BeginSentenceState());
Model::State out;
@ -35,7 +35,7 @@ template <class M> void Starters(M &model) {
StartTest("this_is_not_found", 0, -1.995635 + -0.4149733);
}
template <class M> void Continuation(M &model) {
template <class M> void Continuation(const M &model) {
FullScoreReturn ret;
Model::State state(model.BeginSentenceState());
Model::State out;
@ -57,10 +57,43 @@ template <class M> void Continuation(M &model) {
AppendTest("loin", 5, -0.0432557);
}
BOOST_AUTO_TEST_CASE(starters_probing) { Model m("test.arpa"); Starters(m); }
BOOST_AUTO_TEST_CASE(continuation_probing) { Model m("test.arpa"); Continuation(m); }
BOOST_AUTO_TEST_CASE(starters_sorted) { SortedModel m("test.arpa"); Starters(m); }
BOOST_AUTO_TEST_CASE(continuation_sorted) { SortedModel m("test.arpa"); Continuation(m); }
#define StatelessTest(begin, end, ngram, score) \
ret = model.SlowStatelessScore(begin, end); \
BOOST_CHECK_CLOSE(score, ret.prob, 0.001); \
BOOST_CHECK_EQUAL(static_cast<unsigned int>(ngram), ret.ngram_length);
template <class M> void Stateless(const M &model) {
const char *words[] = {"<s>", "looking", "on", "a", "little", "the", "biarritz", "not_found", "more", ".", "</s>"};
WordIndex indices[sizeof(words) / sizeof(const char*)];
for (unsigned int i = 0; i < sizeof(words) / sizeof(const char*); ++i) {
indices[i] = model.GetVocabulary().Index(words[i]);
}
FullScoreReturn ret;
StatelessTest(indices, indices + 2, 2, -0.484652);
StatelessTest(indices, indices + 3, 3, -0.348837);
StatelessTest(indices, indices + 4, 4, -0.0155266);
StatelessTest(indices, indices + 5, 5, -0.00306122);
// the
StatelessTest(indices, indices + 6, 1, -4.04005);
StatelessTest(indices + 1, indices + 6, 1, -4.04005);
// biarritz
StatelessTest(indices, indices + 7, 1, -1.9889);
// not found
StatelessTest(indices, indices + 8, 0, -2.29666);
}
BOOST_AUTO_TEST_CASE(probing) {
Model m("test.arpa");
Starters(m);
Continuation(m);
Stateless(m);
}
BOOST_AUTO_TEST_CASE(sorted) {
SortedModel m("test.arpa");
Starters(m);
Continuation(m);
Stateless(m);
}
BOOST_AUTO_TEST_CASE(write_and_read_probing) {
Config config;
@ -71,6 +104,7 @@ BOOST_AUTO_TEST_CASE(write_and_read_probing) {
Model binary("test.binary");
Starters(binary);
Continuation(binary);
Stateless(binary);
}
BOOST_AUTO_TEST_CASE(write_and_read_sorted) {
@ -83,6 +117,7 @@ BOOST_AUTO_TEST_CASE(write_and_read_sorted) {
SortedModel binary("test.binary");
Starters(binary);
Continuation(binary);
Stateless(binary);
}
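
An aside on the Stateless expectations above: ngram_length reports how much of the supplied window actually matched, and the window is first clamped to the model order (begin = std::max(begin, end - P::Order())). That is why the two "the" tests agree: with a 5-gram model, the six-word window is clamped to the same five words the indices + 1 call passes, and "the" after "looking on a little" matches only its unigram, hence length 1.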

kenlm/lm/read_arpa.cc (new file, 71 lines)

@ -0,0 +1,71 @@
#include "read_arpa.hh"
#include <cstdlib>
#include <cstring>
#include <ctype.h>
#include <sstream>
namespace lm {
bool IsEntirelyWhiteSpace(const StringPiece &line) {
for (size_t i = 0; i < static_cast<size_t>(line.size()); ++i) {
if (!isspace(line.data()[i])) return false;
}
return true;
}
void ReadARPACounts(util::FilePiece &in, std::vector<size_t> &number) {
number.clear();
StringPiece line;
if (!IsEntirelyWhiteSpace(line = in.ReadLine())) UTIL_THROW(FormatLoadException, "First line was \"" << line << "\" not blank");
if ((line = in.ReadLine()) != "\\data\\") UTIL_THROW(FormatLoadException, "Second line was \"" << line << "\" not \\data\\.");
while (!IsEntirelyWhiteSpace(line = in.ReadLine())) {
if (line.size() < 6 || strncmp(line.data(), "ngram ", 6)) UTIL_THROW(FormatLoadException, "Count line \"" << line << "\" doesn't begin with \"ngram \"");
// So strtol doesn't go off the end of line.
std::string remaining(line.data() + 6, line.size() - 6);
char *end_ptr;
unsigned long int length = std::strtol(remaining.c_str(), &end_ptr, 10);
if ((end_ptr == remaining.c_str()) || (length - 1 != number.size())) UTIL_THROW(FormatLoadException, "ngram count lengths should be consecutive starting with 1: " << line);
if (*end_ptr != '=') UTIL_THROW(FormatLoadException, "Expected = immediately following the first number in the count line " << line);
++end_ptr;
const char *start = end_ptr;
long int count = std::strtol(start, &end_ptr, 10);
if (count < 0) UTIL_THROW(FormatLoadException, "Negative n-gram count " << count);
if (start == end_ptr) UTIL_THROW(FormatLoadException, "Couldn't parse n-gram count from " << line);
number.push_back(count);
}
}
void ReadNGramHeader(util::FilePiece &in, unsigned int length) {
StringPiece line;
while (IsEntirelyWhiteSpace(line = in.ReadLine())) {}
std::stringstream expected;
expected << '\\' << length << "-grams:";
if (line != expected.str()) UTIL_THROW(FormatLoadException, "Was expecting n-gram header " << expected.str() << " but got " << line << " instead.");
}
void ReadBackoff(util::FilePiece &f, Prob &weights) {
switch (f.get()) {
case '\t':
UTIL_THROW(FormatLoadException, "Backoff " << f.ReadDelimited() << " provided for an n-gram that should have no backoff.");
break;
case '\n':
break;
default:
UTIL_THROW(FormatLoadException, "Expected tab or newline after n-gram");
}
}
void ReadBackoff(util::FilePiece &f, ProbBackoff &weights) {
switch (f.get()) {
case '\t':
weights.backoff = f.ReadFloat();
if ((f.get() != '\n')) UTIL_THROW(FormatLoadException, "Expected newline after backoff");
break;
case '\n':
weights.backoff = 0.0;
break;
default:
UTIL_THROW(FormatLoadException, "Expected tab or newline after n-gram");
}
}
} // namespace lm
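
For reference, a sketch of the ARPA preamble ReadARPACounts consumes, with a minimal driver. This is not part of the commit: the file name and counts are invented, and it assumes util::FilePiece opens from a path the way the query tools do.

#include "lm/read_arpa.hh"
#include "util/file_piece.hh"

#include <vector>

// ReadARPACounts expects, in order: a blank first line, "\data\", then one
// "ngram N=count" line per order with N consecutive from 1, e.g.
//
//   \data\
//   ngram 1=37
//   ngram 2=120
//   ngram 3=97
//
// terminated by a blank line before the \1-grams: section begins.
std::vector<std::size_t> CountsFromARPA(const char *name) {
  util::FilePiece f(name);
  std::vector<std::size_t> counts;
  lm::ReadARPACounts(f, counts);  // Throws FormatLoadException on malformed input.
  return counts;  // counts[0] is the unigram count, counts[1] bigrams, ...
}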

kenlm/lm/read_arpa.hh (new file, 59 lines)

@ -0,0 +1,59 @@
#ifndef LM_READ_ARPA__
#define LM_READ_ARPA__
#include "exception.hh"
#include "weights.hh"
#include "word_index.hh"
#include "../util/file_piece.hh"
#include <cstddef>
#include <vector>
namespace lm {
void ReadARPACounts(util::FilePiece &in, std::vector<std::size_t> &number);
void ReadNGramHeader(util::FilePiece &in, unsigned int length);
void ReadBackoff(util::FilePiece &f, Prob &weights);
void ReadBackoff(util::FilePiece &f, ProbBackoff &weights);
template <class Voc> void Read1Gram(util::FilePiece &f, Voc &vocab, ProbBackoff *unigrams) {
try {
float prob = f.ReadFloat();
if (prob > 0) UTIL_THROW(FormatLoadException, "Positive probability " << prob);
if (f.get() != '\t') UTIL_THROW(FormatLoadException, "Expected tab after probability");
ProbBackoff &value = unigrams[vocab.Insert(f.ReadDelimited())];
value.prob = prob;
ReadBackoff(f, value);
} catch(util::Exception &e) {
e << " in the 1-gram at byte " << f.Offset();
throw;
}
}
template <class Voc> void Read1Grams(util::FilePiece &f, std::size_t count, Voc &vocab, ProbBackoff *unigrams) {
ReadNGramHeader(f, 1);
for (std::size_t i = 0; i < count; ++i) {
Read1Gram(f, vocab, unigrams);
}
if (f.ReadLine().size()) UTIL_THROW(FormatLoadException, "Expected blank line after unigrams at byte " << f.Offset());
vocab.FinishedLoading(unigrams);
}
template <class Voc, class Weights> void ReadNGram(util::FilePiece &f, const unsigned char n, const Voc &vocab, WordIndex *const reverse_indices, Weights &weights) {
try {
weights.prob = f.ReadFloat();
if (weights.prob > 0) UTIL_THROW(FormatLoadException, "Positive probability " << weights.prob);
for (WordIndex *vocab_out = reverse_indices + n - 1; vocab_out >= reverse_indices; --vocab_out) {
*vocab_out = vocab.Index(f.ReadDelimited());
}
ReadBackoff(f, weights);
} catch(util::Exception &e) {
e << " in the " << n << "-gram at byte " << f.Offset();
throw;
}
}
} // namespace lm
#endif // LM_READ_ARPA__
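
One subtlety worth noting in ReadNGram: the words land newest-first. An annotated illustration, with an invented ARPA line and vocabulary:

// Given the (invented) ARPA 3-gram line
//   -0.2553	in biarritz .
// ReadNGram(f, 3, vocab, reverse_indices, weights) reads the words left to
// right but fills the array from the back, leaving
//   reverse_indices[0] == vocab.Index(".")         (the predicted word)
//   reverse_indices[1] == vocab.Index("biarritz")  (most recent context)
//   reverse_indices[2] == vocab.Index("in")        (oldest context)
// so lookup hashes can be chained outward from the newest word, matching
// ChainedWordHash and the order in which FullScore walks history.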

kenlm/lm/vocab.cc (new file, 97 lines)

@ -0,0 +1,97 @@
#include "vocab.hh"
#include "weights.hh"
#include "../util/joint_sort.hh"
#include "../util/murmur_hash.hh"
#include "../util/probing_hash_table.hh"
#include <cassert>
#include <string>
namespace lm {
namespace detail {
uint64_t HashForVocab(const char *str, std::size_t len) {
// This proved faster than Boost's hash in speed trials: total load time Murmur 67090000, Boost 72210000
// Chose to use 64A instead of native so binary format will be portable across 64 and 32 bit.
return util::MurmurHash64A(str, len, 0);
}
} // namespace detail
namespace {
// Normally static initialization is a bad idea but MurmurHash is pure arithmetic, so this is ok.
const uint64_t kUnknownHash = detail::HashForVocab("<unk>", 5);
// Sadly some LMs have <UNK>.
const uint64_t kUnknownCapHash = detail::HashForVocab("<UNK>", 5);
} // namespace
SortedVocabulary::SortedVocabulary() : begin_(NULL), end_(NULL) {}
std::size_t SortedVocabulary::Size(std::size_t entries, float /*ignored*/) {
// Lead with the number of entries.
return sizeof(uint64_t) + sizeof(Entry) * entries;
}
void SortedVocabulary::Init(void *start, std::size_t allocated, std::size_t entries) {
assert(allocated >= Size(entries));
// Leave space for number of entries.
begin_ = reinterpret_cast<Entry*>(reinterpret_cast<uint64_t*>(start) + 1);
end_ = begin_;
saw_unk_ = false;
}
WordIndex SortedVocabulary::Insert(const StringPiece &str) {
uint64_t hashed = detail::HashForVocab(str);
if (hashed == kUnknownHash || hashed == kUnknownCapHash) {
saw_unk_ = true;
return 0;
}
end_->key = hashed;
++end_;
// This is 1 + the offset where it was inserted to make room for unk.
return end_ - begin_;
}
void SortedVocabulary::FinishedLoading(ProbBackoff *reorder_vocab) {
util::JointSort(begin_, end_, reorder_vocab + 1);
SetSpecial(Index("<s>"), Index("</s>"), 0, end_ - begin_ + 1);
// Save size.
*(reinterpret_cast<uint64_t*>(begin_) - 1) = end_ - begin_;
}
void SortedVocabulary::LoadedBinary() {
end_ = begin_ + *(reinterpret_cast<const uint64_t*>(begin_) - 1);
SetSpecial(Index("<s>"), Index("</s>"), 0, end_ - begin_ + 1);
}
ProbingVocabulary::ProbingVocabulary() {}
void ProbingVocabulary::Init(void *start, std::size_t allocated, std::size_t entries) {
lookup_ = Lookup(start, allocated);
available_ = 1;
// Later if available_ != expected_available_ then we can throw UnknownMissingException.
saw_unk_ = false;
}
WordIndex ProbingVocabulary::Insert(const StringPiece &str) {
uint64_t hashed = detail::HashForVocab(str);
// Prevent unknown from going into the table.
if (hashed == kUnknownHash || hashed == kUnknownCapHash) {
saw_unk_ = true;
return 0;
} else {
lookup_.Insert(Lookup::Packing::Make(hashed, available_));
return available_++;
}
}
void ProbingVocabulary::FinishedLoading(ProbBackoff *reorder_vocab) {
lookup_.FinishedInserting();
SetSpecial(Index("<s>"), Index("</s>"), 0, available_);
}
void ProbingVocabulary::LoadedBinary() {
lookup_.LoadedBinary();
SetSpecial(Index("<s>"), Index("</s>"), 0, available_);
}
} // namespace lm
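
The sorted layout deserves a short walk-through: Size leads with a uint64_t entry count, <unk> is pinned to ID 0 outside the table, every other word gets 1 + its offset, and FinishedLoading JointSorts the keys together with the unigram weights so the offsets stay valid. A minimal sketch, not part of this commit; the words, probabilities, and hand-rolled buffer (normally GenericModel's mmap setup) are invented:

#include "lm/vocab.hh"
#include "lm/weights.hh"

#include <vector>

void SortedVocabularySketch() {
  const std::size_t entries = 3;
  std::vector<char> backing(lm::SortedVocabulary::Size(entries));
  // entries + 1 unigram slots because IDs start at 1; <unk> owns slot 0.
  std::vector<lm::ProbBackoff> unigrams(entries + 1);
  lm::SortedVocabulary vocab;
  vocab.Init(&backing[0], backing.size(), entries);
  unigrams[vocab.Insert("<s>")].prob = -1.0f;
  unigrams[vocab.Insert("</s>")].prob = -2.0f;
  unigrams[vocab.Insert("iran")].prob = -3.0f;
  // JointSort reorders keys and weights together; the count is then saved in
  // front of the entries so LoadedBinary can recover end_ later.
  vocab.FinishedLoading(&unigrams[0]);
  lm::WordIndex w = vocab.Index("iran");  // In [1, entries]; 0 means unknown.
  (void)w;
}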

kenlm/lm/vocab.hh (new file, 106 lines)

@ -0,0 +1,106 @@
#ifndef LM_VOCAB__
#define LM_VOCAB__
#include "virtual_interface.hh"
#include "../util/key_value_packing.hh"
#include "../util/probing_hash_table.hh"
#include "../util/sorted_uniform.hh"
#include "../util/string_piece.hh"
namespace lm {
struct ProbBackoff;
namespace detail {
uint64_t HashForVocab(const char *str, std::size_t len);
inline uint64_t HashForVocab(const StringPiece &str) {
return HashForVocab(str.data(), str.length());
}
} // namespace detail
// Vocabulary based on sorted uniform find storing only uint64_t values and using their offsets as indices.
class SortedVocabulary : public base::Vocabulary {
private:
// Sorted uniform requires a GetKey function.
struct Entry {
uint64_t GetKey() const { return key; }
uint64_t key;
bool operator<(const Entry &other) const {
return key < other.key;
}
};
public:
SortedVocabulary();
WordIndex Index(const StringPiece &str) const {
const Entry *found;
if (util::SortedUniformFind<const Entry *, uint64_t>(begin_, end_, detail::HashForVocab(str), found)) {
return found - begin_ + 1; // +1 because <unk> is 0 and does not appear in the lookup table.
} else {
return 0;
}
}
// Ignores second argument for consistency with probing hash which has a float here.
static size_t Size(std::size_t entries, float ignored = 0.0);
// Everything else is for populating. I'm too lazy to hide and friend these, but you'll only get a const reference anyway.
void Init(void *start, std::size_t allocated, std::size_t entries);
WordIndex Insert(const StringPiece &str);
// Reorders reorder_vocab so that the IDs are sorted.
void FinishedLoading(ProbBackoff *reorder_vocab);
bool SawUnk() const { return saw_unk_; }
void LoadedBinary();
private:
Entry *begin_, *end_;
bool saw_unk_;
};
// Vocabulary storing a map from uint64_t to WordIndex.
class ProbingVocabulary : public base::Vocabulary {
public:
ProbingVocabulary();
WordIndex Index(const StringPiece &str) const {
Lookup::ConstIterator i;
return lookup_.Find(detail::HashForVocab(str), i) ? i->GetValue() : 0;
}
static size_t Size(std::size_t entries, float probing_multiplier) {
return Lookup::Size(entries, probing_multiplier);
}
// Everything else is for populating. I'm too lazy to hide and friend these, but you'll only get a const reference anyway.
void Init(void *start, std::size_t allocated, std::size_t entries);
WordIndex Insert(const StringPiece &str);
void FinishedLoading(ProbBackoff *reorder_vocab);
bool SawUnk() const { return saw_unk_; }
void LoadedBinary();
private:
// std::identity is an SGI extension :-(
struct IdentityHash : public std::unary_function<uint64_t, std::size_t> {
std::size_t operator()(uint64_t arg) const { return static_cast<std::size_t>(arg); }
};
typedef util::ProbingHashTable<util::ByteAlignedPacking<uint64_t, WordIndex>, IdentityHash> Lookup;
Lookup lookup_;
bool saw_unk_;
};
} // namespace lm
#endif // LM_VOCAB__

kenlm/lm/weights.hh (new file, 17 lines)

@ -0,0 +1,17 @@
#ifndef LM_WEIGHTS__
#define LM_WEIGHTS__
// Weights for n-grams. Probability and possibly a backoff.
namespace lm {
struct Prob {
float prob;
};
// No inheritance so this will be a POD.
struct ProbBackoff {
float prob;
float backoff;
};
} // namespace lm
#endif // LM_WEIGHTS__
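
Keeping these structs as PODs with no inheritance is what lets the binary format mmap them directly as arrays. A hypothetical compile-time guard one could add (BOOST_STATIC_ASSERT, since this predates C++11's static_assert; not in this commit):

#include "lm/weights.hh"

#include <boost/static_assert.hpp>

// Hypothetical guard: the binary format assumes these exact layouts.
BOOST_STATIC_ASSERT(sizeof(lm::Prob) == sizeof(float));
BOOST_STATIC_ASSERT(sizeof(lm::ProbBackoff) == 2 * sizeof(float));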


@ -1,8 +0,0 @@
#!/bin/bash
#Run tests. Requires Boost.
set -e
./compile.sh
for i in util/{file_piece,joint_sort,key_value_packing,probing_hash_table,sorted_uniform}_test lm/ngram_test; do
g++ -I. -O3 $i.cc {lm,util}/*.o -lboost_test_exec_monitor -o $i
pushd $(dirname $i) && ./$(basename $i); popd
done


@ -87,13 +87,22 @@ float LanguageModelKen::GetValue(const vector<const Word*> &contextFactor, State
const Factor *factor = contextFactor[i]->GetFactor(factorType);
const string &word = factor->GetString();
//ngramId[i] = StringToId(word); FOR_KEN
// TODO(hieuhoang1972): precompute this.
ngramId[i] = m_ngram->GetVocabulary().Index(word);
}
float prob;
//prob = m_ngram.GetScore(ngramId); FOR_KEN
// TODO(hieuhoang1972): use my stateful interface instead of this stateless one you asked heafield to kludge for you.
lm::ngram::HieuShouldRefactorMoses ret(m_ngram->SlowStatelessScore(&*ngramId.begin(), &*ngramId.begin() + ngramId.size()));
if (finalState)
{
*finalState = ret.meaningless_unique_state;
}
if (len)
{
*len = ret.ngram_length;
}
return TransformLMScore(prob);
return TransformLMScore(ret.prob);
}