KenLM 0e5d259 including read_compressed fix

This commit is contained in:
Kenneth Heafield 2013-01-04 21:02:47 +00:00
parent 3203f7c92d
commit f9ee7ae4b3
14 changed files with 111 additions and 98 deletions

View File

@ -16,11 +16,11 @@ namespace ngram {
namespace { namespace {
const char kMagicBeforeVersion[] = "mmap lm http://kheafield.com/code format version"; const char kMagicBeforeVersion[] = "mmap lm http://kheafield.com/code format version";
const char kMagicBytes[] = "mmap lm http://kheafield.com/code format version 5\n\0"; const char kMagicBytes[] = "mmap lm http://kheafield.com/code format version 5\n\0";
// This must be shorter than kMagicBytes and indicates an incomplete binary file (i.e. build failed). // This must be shorter than kMagicBytes and indicates an incomplete binary file (i.e. build failed).
const char kMagicIncomplete[] = "mmap lm http://kheafield.com/code incomplete\n"; const char kMagicIncomplete[] = "mmap lm http://kheafield.com/code incomplete\n";
const long int kMagicVersion = 5; const long int kMagicVersion = 5;
// Old binary files built on 32-bit machines have this header. // Old binary files built on 32-bit machines have this header.
// TODO: eliminate with next binary release. // TODO: eliminate with next binary release.
struct OldSanity { struct OldSanity {
char magic[sizeof(kMagicBytes)]; char magic[sizeof(kMagicBytes)];
@ -39,7 +39,7 @@ struct OldSanity {
}; };
// Test values aligned to 8 bytes. // Test values aligned to 8 bytes.
struct Sanity { struct Sanity {
char magic[ALIGN8(sizeof(kMagicBytes))]; char magic[ALIGN8(sizeof(kMagicBytes))];
float zero_f, one_f, minus_half_f; float zero_f, one_f, minus_half_f;
@ -101,7 +101,7 @@ uint8_t *SetupJustVocab(const Config &config, uint8_t order, std::size_t memory_
uint8_t *GrowForSearch(const Config &config, std::size_t vocab_pad, std::size_t memory_size, Backing &backing) { uint8_t *GrowForSearch(const Config &config, std::size_t vocab_pad, std::size_t memory_size, Backing &backing) {
std::size_t adjusted_vocab = backing.vocab.size() + vocab_pad; std::size_t adjusted_vocab = backing.vocab.size() + vocab_pad;
if (config.write_mmap) { if (config.write_mmap) {
// Grow the file to accommodate the search, using zeros. // Grow the file to accommodate the search, using zeros.
try { try {
util::ResizeOrThrow(backing.file.get(), adjusted_vocab + memory_size); util::ResizeOrThrow(backing.file.get(), adjusted_vocab + memory_size);
} catch (util::ErrnoException &e) { } catch (util::ErrnoException &e) {
@ -114,7 +114,7 @@ uint8_t *GrowForSearch(const Config &config, std::size_t vocab_pad, std::size_t
return reinterpret_cast<uint8_t*>(backing.search.get()); return reinterpret_cast<uint8_t*>(backing.search.get());
} }
// mmap it now. // mmap it now.
// We're skipping over the header and vocab for the search space mmap. mmap likes page aligned offsets, so some arithmetic to round the offset down. // We're skipping over the header and vocab for the search space mmap. mmap likes page aligned offsets, so some arithmetic to round the offset down.
std::size_t page_size = util::SizePage(); std::size_t page_size = util::SizePage();
std::size_t alignment_cruft = adjusted_vocab % page_size; std::size_t alignment_cruft = adjusted_vocab % page_size;
backing.search.reset(util::MapOrThrow(alignment_cruft + memory_size, true, util::kFileFlags, false, backing.file.get(), adjusted_vocab - alignment_cruft), alignment_cruft + memory_size, util::scoped_memory::MMAP_ALLOCATED); backing.search.reset(util::MapOrThrow(alignment_cruft + memory_size, true, util::kFileFlags, false, backing.file.get(), adjusted_vocab - alignment_cruft), alignment_cruft + memory_size, util::scoped_memory::MMAP_ALLOCATED);
@ -122,7 +122,7 @@ uint8_t *GrowForSearch(const Config &config, std::size_t vocab_pad, std::size_t
} else { } else {
util::MapAnonymous(memory_size, backing.search); util::MapAnonymous(memory_size, backing.search);
return reinterpret_cast<uint8_t*>(backing.search.get()); return reinterpret_cast<uint8_t*>(backing.search.get());
} }
} }
void FinishFile(const Config &config, ModelType model_type, unsigned int search_version, const std::vector<uint64_t> &counts, std::size_t vocab_pad, Backing &backing) { void FinishFile(const Config &config, ModelType model_type, unsigned int search_version, const std::vector<uint64_t> &counts, std::size_t vocab_pad, Backing &backing) {
@ -140,7 +140,7 @@ void FinishFile(const Config &config, ModelType model_type, unsigned int search_
util::FSyncOrThrow(backing.file.get()); util::FSyncOrThrow(backing.file.get());
break; break;
} }
// header and vocab share the same mmap. The header is written here because we know the counts. // header and vocab share the same mmap. The header is written here because we know the counts.
Parameters params = Parameters(); Parameters params = Parameters();
params.counts = counts; params.counts = counts;
params.fixed.order = counts.size(); params.fixed.order = counts.size();
@ -160,7 +160,7 @@ namespace detail {
bool IsBinaryFormat(int fd) { bool IsBinaryFormat(int fd) {
const uint64_t size = util::SizeFile(fd); const uint64_t size = util::SizeFile(fd);
if (size == util::kBadSize || (size <= static_cast<uint64_t>(sizeof(Sanity)))) return false; if (size == util::kBadSize || (size <= static_cast<uint64_t>(sizeof(Sanity)))) return false;
// Try reading the header. // Try reading the header.
util::scoped_memory memory; util::scoped_memory memory;
try { try {
util::MapRead(util::LAZY, fd, 0, sizeof(Sanity), memory); util::MapRead(util::LAZY, fd, 0, sizeof(Sanity), memory);
@ -214,7 +214,7 @@ void SeekPastHeader(int fd, const Parameters &params) {
uint8_t *SetupBinary(const Config &config, const Parameters &params, uint64_t memory_size, Backing &backing) { uint8_t *SetupBinary(const Config &config, const Parameters &params, uint64_t memory_size, Backing &backing) {
const uint64_t file_size = util::SizeFile(backing.file.get()); const uint64_t file_size = util::SizeFile(backing.file.get());
// The header is smaller than a page, so we have to map the whole header as well. // The header is smaller than a page, so we have to map the whole header as well.
std::size_t total_map = util::CheckOverflow(TotalHeaderSize(params.counts.size()) + memory_size); std::size_t total_map = util::CheckOverflow(TotalHeaderSize(params.counts.size()) + memory_size);
if (file_size != util::kBadSize && static_cast<uint64_t>(file_size) < total_map) if (file_size != util::kBadSize && static_cast<uint64_t>(file_size) < total_map)
UTIL_THROW(FormatLoadException, "Binary file has size " << file_size << " but the headers say it should be at least " << total_map); UTIL_THROW(FormatLoadException, "Binary file has size " << file_size << " but the headers say it should be at least " << total_map);
@ -233,7 +233,8 @@ void ComplainAboutARPA(const Config &config, ModelType model_type) {
if (config.write_mmap || !config.messages) return; if (config.write_mmap || !config.messages) return;
if (config.arpa_complain == Config::ALL) { if (config.arpa_complain == Config::ALL) {
*config.messages << "Loading the LM will be faster if you build a binary file." << std::endl; *config.messages << "Loading the LM will be faster if you build a binary file." << std::endl;
} else if (config.arpa_complain == Config::EXPENSIVE && model_type == TRIE_SORTED) { } else if (config.arpa_complain == Config::EXPENSIVE &&
(model_type == TRIE || model_type == QUANT_TRIE || model_type == ARRAY_TRIE || model_type == QUANT_ARRAY_TRIE)) {
*config.messages << "Building " << kModelNames[model_type] << " from ARPA is expensive. Save time by building a binary format." << std::endl; *config.messages << "Building " << kModelNames[model_type] << " from ARPA is expensive. Save time by building a binary format." << std::endl;
} }
} }

View File

@ -6,6 +6,7 @@ namespace lm {
namespace ngram { namespace ngram {
Config::Config() : Config::Config() :
show_progress(true),
messages(&std::cerr), messages(&std::cerr),
enumerate_vocab(NULL), enumerate_vocab(NULL),
unknown_missing(COMPLAIN), unknown_missing(COMPLAIN),

View File

@ -11,46 +11,52 @@
/* Configuration for ngram model. Separate header to reduce pollution. */ /* Configuration for ngram model. Separate header to reduce pollution. */
namespace lm { namespace lm {
class EnumerateVocab; class EnumerateVocab;
namespace ngram { namespace ngram {
struct Config { struct Config {
// EFFECTIVE FOR BOTH ARPA AND BINARY READS // EFFECTIVE FOR BOTH ARPA AND BINARY READS
// (default true) print progress bar to messages
bool show_progress;
// Where to log messages including the progress bar. Set to NULL for // Where to log messages including the progress bar. Set to NULL for
// silence. // silence.
std::ostream *messages; std::ostream *messages;
// Stream that progress bars should be written to: the messages stream when
// show_progress is set, otherwise a null pointer.
std::ostream *ProgressMessages() const {
  if (!show_progress) return 0;
  return messages;
}
// This will be called with every string in the vocabulary. See // This will be called with every string in the vocabulary. See
// enumerate_vocab.hh for more detail. Config does not take ownership; you // enumerate_vocab.hh for more detail. Config does not take ownership; you
// are still responsible for deleting it (or stack allocating). // are still responsible for deleting it (or stack allocating).
EnumerateVocab *enumerate_vocab; EnumerateVocab *enumerate_vocab;
// ONLY EFFECTIVE WHEN READING ARPA // ONLY EFFECTIVE WHEN READING ARPA
// What to do when <unk> isn't in the provided model. // What to do when <unk> isn't in the provided model.
WarningAction unknown_missing; WarningAction unknown_missing;
// What to do when <s> or </s> is missing from the model. // What to do when <s> or </s> is missing from the model.
// If THROW_UP, the exception will be of type util::SpecialWordMissingException. // If THROW_UP, the exception will be of type util::SpecialWordMissingException.
WarningAction sentence_marker_missing; WarningAction sentence_marker_missing;
// What to do with a positive log probability. For COMPLAIN and SILENT, map // What to do with a positive log probability. For COMPLAIN and SILENT, map
// to 0. // to 0.
WarningAction positive_log_probability; WarningAction positive_log_probability;
// The probability to substitute for <unk> if it's missing from the model. // The probability to substitute for <unk> if it's missing from the model.
// No effect if the model has <unk> or unknown_missing == THROW_UP. // No effect if the model has <unk> or unknown_missing == THROW_UP.
float unknown_missing_logprob; float unknown_missing_logprob;
// Size multiplier for probing hash table. Must be > 1. Space is linear in // Size multiplier for probing hash table. Must be > 1. Space is linear in
// this. Time is probing_multiplier / (probing_multiplier - 1). No effect // this. Time is probing_multiplier / (probing_multiplier - 1). No effect
// for sorted variant. // for sorted variant.
// If you find yourself setting this to a low number, consider using the // If you find yourself setting this to a low number, consider using the
// TrieModel which has lower memory consumption. // TrieModel which has lower memory consumption.
float probing_multiplier; float probing_multiplier;
// Amount of memory to use for building. The actual memory usage will be // Amount of memory to use for building. The actual memory usage will be
@ -58,10 +64,10 @@ struct Config {
// models. // models.
std::size_t building_memory; std::size_t building_memory;
// Template for temporary directory appropriate for passing to mkdtemp. // Template for temporary directory appropriate for passing to mkdtemp.
// The characters XXXXXX are appended before passing to mkdtemp. Only // The characters XXXXXX are appended before passing to mkdtemp. Only
// applies to trie. If NULL, defaults to write_mmap. If that's NULL, // applies to trie. If NULL, defaults to write_mmap. If that's NULL,
// defaults to input file name. // defaults to input file name.
const char *temporary_directory_prefix; const char *temporary_directory_prefix;
// Level of complaining to do when loading from ARPA instead of binary format. // Level of complaining to do when loading from ARPA instead of binary format.
@ -69,49 +75,46 @@ struct Config {
ARPALoadComplain arpa_complain; ARPALoadComplain arpa_complain;
// While loading an ARPA file, also write out this binary format file. Set // While loading an ARPA file, also write out this binary format file. Set
// to NULL to disable. // to NULL to disable.
const char *write_mmap; const char *write_mmap;
enum WriteMethod { enum WriteMethod {
WRITE_MMAP, // Map the file directly. WRITE_MMAP, // Map the file directly.
WRITE_AFTER // Write after we're done. WRITE_AFTER // Write after we're done.
}; };
WriteMethod write_method; WriteMethod write_method;
// Include the vocab in the binary file? Only effective if write_mmap != NULL. // Include the vocab in the binary file? Only effective if write_mmap != NULL.
bool include_vocab; bool include_vocab;
// Left rest options. Only used when the model includes rest costs. // Left rest options. Only used when the model includes rest costs.
enum RestFunction { enum RestFunction {
REST_MAX, // Maximum of any score to the left REST_MAX, // Maximum of any score to the left
REST_LOWER, // Use lower-order files given below. REST_LOWER, // Use lower-order files given below.
}; };
RestFunction rest_function; RestFunction rest_function;
// Only used for REST_LOWER. // Only used for REST_LOWER.
std::vector<std::string> rest_lower_files; std::vector<std::string> rest_lower_files;
// Quantization options. Only effective for QuantTrieModel. One value is // Quantization options. Only effective for QuantTrieModel. One value is
// reserved for each of prob and backoff, so 2^bits - 1 buckets will be used // reserved for each of prob and backoff, so 2^bits - 1 buckets will be used
// to quantize (and one of the remaining backoffs will be 0). // to quantize (and one of the remaining backoffs will be 0).
uint8_t prob_bits, backoff_bits; uint8_t prob_bits, backoff_bits;
// Bhiksha compression (simple form). Only works with trie. // Bhiksha compression (simple form). Only works with trie.
uint8_t pointer_bhiksha_bits; uint8_t pointer_bhiksha_bits;
// ONLY EFFECTIVE WHEN READING BINARY // ONLY EFFECTIVE WHEN READING BINARY
// How to get the giant array into memory: lazy mmap, populate, read etc. // How to get the giant array into memory: lazy mmap, populate, read etc.
// See util/mmap.hh for details of MapMethod. // See util/mmap.hh for details of MapMethod.
util::LoadMethod load_method; util::LoadMethod load_method;
// Set defaults.
// Set defaults.
Config(); Config();
}; };

View File

@ -37,7 +37,7 @@ template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT
template <class Search, class VocabularyT> GenericModel<Search, VocabularyT>::GenericModel(const char *file, const Config &config) { template <class Search, class VocabularyT> GenericModel<Search, VocabularyT>::GenericModel(const char *file, const Config &config) {
LoadLM(file, config, *this); LoadLM(file, config, *this);
// g++ prints warnings unless these are fully initialized. // g++ prints warnings unless these are fully initialized.
State begin_sentence = State(); State begin_sentence = State();
begin_sentence.length = 1; begin_sentence.length = 1;
begin_sentence.words[0] = vocab_.BeginSentence(); begin_sentence.words[0] = vocab_.BeginSentence();
@ -69,8 +69,8 @@ template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT
} }
template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT>::InitializeFromARPA(const char *file, const Config &config) { template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT>::InitializeFromARPA(const char *file, const Config &config) {
// Backing file is the ARPA. Steal it so we can make the backing file the mmap output if any. // Backing file is the ARPA. Steal it so we can make the backing file the mmap output if any.
util::FilePiece f(backing_.file.release(), file, config.messages); util::FilePiece f(backing_.file.release(), file, config.ProgressMessages());
try { try {
std::vector<uint64_t> counts; std::vector<uint64_t> counts;
// File counts do not include pruned trigrams that extend to quadgrams etc. These will be fixed by search_. // File counts do not include pruned trigrams that extend to quadgrams etc. These will be fixed by search_.
@ -80,7 +80,7 @@ template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT
if (config.probing_multiplier <= 1.0) UTIL_THROW(ConfigException, "probing multiplier must be > 1.0"); if (config.probing_multiplier <= 1.0) UTIL_THROW(ConfigException, "probing multiplier must be > 1.0");
std::size_t vocab_size = util::CheckOverflow(VocabularyT::Size(counts[0], config)); std::size_t vocab_size = util::CheckOverflow(VocabularyT::Size(counts[0], config));
// Setup the binary file for writing the vocab lookup table. The search_ is responsible for growing the binary file to its needs. // Setup the binary file for writing the vocab lookup table. The search_ is responsible for growing the binary file to its needs.
vocab_.SetupMemory(SetupJustVocab(config, counts.size(), vocab_size, backing_), vocab_size, counts[0], config); vocab_.SetupMemory(SetupJustVocab(config, counts.size(), vocab_size, backing_), vocab_size, counts[0], config);
if (config.write_mmap) { if (config.write_mmap) {
@ -95,7 +95,7 @@ template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT
if (!vocab_.SawUnk()) { if (!vocab_.SawUnk()) {
assert(config.unknown_missing != THROW_UP); assert(config.unknown_missing != THROW_UP);
// Default probabilities for unknown. // Default probabilities for unknown.
search_.UnknownUnigram().backoff = 0.0; search_.UnknownUnigram().backoff = 0.0;
search_.UnknownUnigram().prob = config.unknown_missing_logprob; search_.UnknownUnigram().prob = config.unknown_missing_logprob;
} }
@ -147,7 +147,7 @@ template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search,
} }
template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT>::GetState(const WordIndex *context_rbegin, const WordIndex *context_rend, State &out_state) const { template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT>::GetState(const WordIndex *context_rbegin, const WordIndex *context_rend, State &out_state) const {
// Generate a state from context. // Generate a state from context.
context_rend = std::min(context_rend, context_rbegin + P::Order() - 1); context_rend = std::min(context_rend, context_rbegin + P::Order() - 1);
if (context_rend == context_rbegin) { if (context_rend == context_rbegin) {
out_state.length = 0; out_state.length = 0;
@ -191,7 +191,7 @@ template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search,
ret.rest = ptr.Rest(); ret.rest = ptr.Rest();
ret.prob = ptr.Prob(); ret.prob = ptr.Prob();
ret.extend_left = extend_pointer; ret.extend_left = extend_pointer;
// If this function is called, then it does depend on left words. // If this function is called, then it does depend on left words.
ret.independent_left = false; ret.independent_left = false;
} }
float subtract_me = ret.rest; float subtract_me = ret.rest;
@ -199,7 +199,7 @@ template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search,
next_use = extend_length; next_use = extend_length;
ResumeScore(add_rbegin, add_rend, extend_length - 1, node, backoff_out, next_use, ret); ResumeScore(add_rbegin, add_rend, extend_length - 1, node, backoff_out, next_use, ret);
next_use -= extend_length; next_use -= extend_length;
// Charge backoffs. // Charge backoffs.
for (const float *b = backoff_in + ret.ngram_length - extend_length; b < backoff_in + (add_rend - add_rbegin); ++b) ret.prob += *b; for (const float *b = backoff_in + ret.ngram_length - extend_length; b < backoff_in + (add_rend - add_rbegin); ++b) ret.prob += *b;
ret.prob -= subtract_me; ret.prob -= subtract_me;
ret.rest -= subtract_me; ret.rest -= subtract_me;
@ -209,7 +209,7 @@ template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search,
namespace { namespace {
// Do a paranoid copy of history, assuming new_word has already been copied // Do a paranoid copy of history, assuming new_word has already been copied
// (hence the -1). out_state.length could be zero so I avoided using // (hence the -1). out_state.length could be zero so I avoided using
// std::copy. // std::copy.
void CopyRemainingHistory(const WordIndex *from, State &out_state) { void CopyRemainingHistory(const WordIndex *from, State &out_state) {
WordIndex *out = out_state.words + 1; WordIndex *out = out_state.words + 1;
const WordIndex *in_end = from + static_cast<ptrdiff_t>(out_state.length) - 1; const WordIndex *in_end = from + static_cast<ptrdiff_t>(out_state.length) - 1;
@ -217,10 +217,10 @@ void CopyRemainingHistory(const WordIndex *from, State &out_state) {
} }
} // namespace } // namespace
/* Ugly optimized function. Produce a score excluding backoff. /* Ugly optimized function. Produce a score excluding backoff.
* The search goes in increasing order of ngram length. * The search goes in increasing order of ngram length.
* Context goes backward, so context_rbegin is the word immediately preceding * Context goes backward, so context_rbegin is the word immediately preceding
* new_word. * new_word.
*/ */
template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search, VocabularyT>::ScoreExceptBackoff( template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search, VocabularyT>::ScoreExceptBackoff(
const WordIndex *const context_rbegin, const WordIndex *const context_rbegin,
@ -229,7 +229,7 @@ template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search,
State &out_state) const { State &out_state) const {
assert(new_word < vocab_.Bound()); assert(new_word < vocab_.Bound());
FullScoreReturn ret; FullScoreReturn ret;
// ret.ngram_length contains the last known non-blank ngram length. // ret.ngram_length contains the last known non-blank ngram length.
ret.ngram_length = 1; ret.ngram_length = 1;
typename Search::Node node; typename Search::Node node;
@ -238,9 +238,9 @@ template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search,
ret.prob = uni.Prob(); ret.prob = uni.Prob();
ret.rest = uni.Rest(); ret.rest = uni.Rest();
// This is the length of the context that should be used for continuation to the right. // This is the length of the context that should be used for continuation to the right.
out_state.length = HasExtension(out_state.backoff[0]) ? 1 : 0; out_state.length = HasExtension(out_state.backoff[0]) ? 1 : 0;
// We'll write the word anyway since it will probably be used and does no harm being there. // We'll write the word anyway since it will probably be used and does no harm being there.
out_state.words[0] = new_word; out_state.words[0] = new_word;
if (context_rbegin == context_rend) return ret; if (context_rbegin == context_rend) return ret;

View File

@ -55,7 +55,7 @@ struct ProbPointer {
uint64_t index; uint64_t index;
}; };
// Array of n-grams and float indices. // Array of n-grams and float indices.
class BackoffMessages { class BackoffMessages {
public: public:
void Init(std::size_t entry_size) { void Init(std::size_t entry_size) {
@ -100,7 +100,7 @@ class BackoffMessages {
void Apply(float *const *const base, RecordReader &reader) { void Apply(float *const *const base, RecordReader &reader) {
FinishedAdding(); FinishedAdding();
if (current_ == allocated_) return; if (current_ == allocated_) return;
// We'll also use the same buffer to record messages to blanks that they extend. // We'll also use the same buffer to record messages to blanks that they extend.
WordIndex *extend_out = reinterpret_cast<WordIndex*>(current_); WordIndex *extend_out = reinterpret_cast<WordIndex*>(current_);
const unsigned char order = (entry_size_ - sizeof(ProbPointer)) / sizeof(WordIndex); const unsigned char order = (entry_size_ - sizeof(ProbPointer)) / sizeof(WordIndex);
for (reader.Rewind(); reader && (current_ != allocated_); ) { for (reader.Rewind(); reader && (current_ != allocated_); ) {
@ -109,7 +109,7 @@ class BackoffMessages {
++reader; ++reader;
break; break;
case 1: case 1:
// Message but nobody to receive it. Write it down at the beginning of the buffer so we can inform this blank that it extends. // Message but nobody to receive it. Write it down at the beginning of the buffer so we can inform this blank that it extends.
for (const WordIndex *w = reinterpret_cast<const WordIndex *>(current_); w != reinterpret_cast<const WordIndex *>(current_) + order; ++w, ++extend_out) *extend_out = *w; for (const WordIndex *w = reinterpret_cast<const WordIndex *>(current_); w != reinterpret_cast<const WordIndex *>(current_) + order; ++w, ++extend_out) *extend_out = *w;
current_ += entry_size_; current_ += entry_size_;
break; break;
@ -126,7 +126,7 @@ class BackoffMessages {
break; break;
} }
} }
// Now this is a list of blanks that extend right. // Now this is a list of blanks that extend right.
entry_size_ = sizeof(WordIndex) * order; entry_size_ = sizeof(WordIndex) * order;
Resize(sizeof(WordIndex) * (extend_out - (const WordIndex*)backing_.get())); Resize(sizeof(WordIndex) * (extend_out - (const WordIndex*)backing_.get()));
current_ = (uint8_t*)backing_.get(); current_ = (uint8_t*)backing_.get();
@ -153,7 +153,7 @@ class BackoffMessages {
private: private:
void FinishedAdding() { void FinishedAdding() {
Resize(current_ - (uint8_t*)backing_.get()); Resize(current_ - (uint8_t*)backing_.get());
// Sort requests in same order as files. // Sort requests in same order as files.
std::sort( std::sort(
util::SizedIterator(util::SizedProxy(backing_.get(), entry_size_)), util::SizedIterator(util::SizedProxy(backing_.get(), entry_size_)),
util::SizedIterator(util::SizedProxy(current_, entry_size_)), util::SizedIterator(util::SizedProxy(current_, entry_size_)),
@ -220,7 +220,7 @@ class SRISucks {
} }
private: private:
// This used to be one array. Then I needed to separate it by order for quantization to work. // This used to be one array. Then I needed to separate it by order for quantization to work.
std::vector<float> values_[KENLM_MAX_ORDER - 1]; std::vector<float> values_[KENLM_MAX_ORDER - 1];
BackoffMessages messages_[KENLM_MAX_ORDER - 1]; BackoffMessages messages_[KENLM_MAX_ORDER - 1];
@ -253,7 +253,7 @@ class FindBlanks {
++counts_.back(); ++counts_.back();
} }
// Unigrams wrote one past. // Unigrams wrote one past.
void Cleanup() { void Cleanup() {
--counts_[0]; --counts_[0];
} }
@ -270,15 +270,15 @@ class FindBlanks {
SRISucks &sri_; SRISucks &sri_;
}; };
// Phase to actually write n-grams to the trie. // Phase to actually write n-grams to the trie.
template <class Quant, class Bhiksha> class WriteEntries { template <class Quant, class Bhiksha> class WriteEntries {
public: public:
WriteEntries(RecordReader *contexts, const Quant &quant, UnigramValue *unigrams, BitPackedMiddle<Bhiksha> *middle, BitPackedLongest &longest, unsigned char order, SRISucks &sri) : WriteEntries(RecordReader *contexts, const Quant &quant, UnigramValue *unigrams, BitPackedMiddle<Bhiksha> *middle, BitPackedLongest &longest, unsigned char order, SRISucks &sri) :
contexts_(contexts), contexts_(contexts),
quant_(quant), quant_(quant),
unigrams_(unigrams), unigrams_(unigrams),
middle_(middle), middle_(middle),
longest_(longest), longest_(longest),
bigram_pack_((order == 2) ? static_cast<BitPacked&>(longest_) : static_cast<BitPacked&>(*middle_)), bigram_pack_((order == 2) ? static_cast<BitPacked&>(longest_) : static_cast<BitPacked&>(*middle_)),
order_(order), order_(order),
sri_(sri) {} sri_(sri) {}
@ -328,7 +328,7 @@ struct Gram {
const WordIndex *begin, *end; const WordIndex *begin, *end;
// For queue, this is the direction we want. // For queue, this is the direction we want.
bool operator<(const Gram &other) const { bool operator<(const Gram &other) const {
return std::lexicographical_compare(other.begin, other.end, begin, end); return std::lexicographical_compare(other.begin, other.end, begin, end);
} }
@ -353,7 +353,7 @@ template <class Doing> class BlankManager {
been_length_ = length; been_length_ = length;
return; return;
} }
// There are blanks to insert starting with order blank. // There are blanks to insert starting with order blank.
unsigned char blank = cur - to + 1; unsigned char blank = cur - to + 1;
UTIL_THROW_IF(blank == 1, FormatLoadException, "Missing a unigram that appears as context."); UTIL_THROW_IF(blank == 1, FormatLoadException, "Missing a unigram that appears as context.");
const float *lower_basis; const float *lower_basis;
@ -363,7 +363,7 @@ template <class Doing> class BlankManager {
assert(*lower_basis != kBadProb); assert(*lower_basis != kBadProb);
doing_.MiddleBlank(blank, to, based_on, *lower_basis); doing_.MiddleBlank(blank, to, based_on, *lower_basis);
*pre = *cur; *pre = *cur;
// Mark that the probability is a blank so it shouldn't be used as the basis for a later n-gram. // Mark that the probability is a blank so it shouldn't be used as the basis for a later n-gram.
basis_[blank - 1] = kBadProb; basis_[blank - 1] = kBadProb;
} }
*pre = *cur; *pre = *cur;
@ -377,7 +377,7 @@ template <class Doing> class BlankManager {
unsigned char been_length_; unsigned char been_length_;
float basis_[KENLM_MAX_ORDER]; float basis_[KENLM_MAX_ORDER];
Doing &doing_; Doing &doing_;
}; };
@ -451,7 +451,7 @@ template <class Quant> void TrainProbQuantizer(uint8_t order, uint64_t count, Re
} }
void PopulateUnigramWeights(FILE *file, WordIndex unigram_count, RecordReader &contexts, UnigramValue *unigrams) { void PopulateUnigramWeights(FILE *file, WordIndex unigram_count, RecordReader &contexts, UnigramValue *unigrams) {
// Fill unigram probabilities. // Fill unigram probabilities.
try { try {
rewind(file); rewind(file);
for (WordIndex i = 0; i < unigram_count; ++i) { for (WordIndex i = 0; i < unigram_count; ++i) {
@ -486,7 +486,7 @@ template <class Quant, class Bhiksha> void BuildTrie(SortedFiles &files, std::ve
util::scoped_memory unigrams; util::scoped_memory unigrams;
MapRead(util::POPULATE_OR_READ, unigram_fd.get(), 0, counts[0] * sizeof(ProbBackoff), unigrams); MapRead(util::POPULATE_OR_READ, unigram_fd.get(), 0, counts[0] * sizeof(ProbBackoff), unigrams);
FindBlanks finder(counts.size(), reinterpret_cast<const ProbBackoff*>(unigrams.get()), sri); FindBlanks finder(counts.size(), reinterpret_cast<const ProbBackoff*>(unigrams.get()), sri);
RecursiveInsert(counts.size(), counts[0], inputs, config.messages, "Identifying n-grams omitted by SRI", finder); RecursiveInsert(counts.size(), counts[0], inputs, config.ProgressMessages(), "Identifying n-grams omitted by SRI", finder);
fixed_counts = finder.Counts(); fixed_counts = finder.Counts();
} }
unigram_file.reset(util::FDOpenOrThrow(unigram_fd)); unigram_file.reset(util::FDOpenOrThrow(unigram_fd));
@ -504,7 +504,8 @@ template <class Quant, class Bhiksha> void BuildTrie(SortedFiles &files, std::ve
inputs[i-2].Rewind(); inputs[i-2].Rewind();
} }
if (Quant::kTrain) { if (Quant::kTrain) {
util::ErsatzProgress progress(std::accumulate(counts.begin() + 1, counts.end(), 0), config.messages, "Quantizing"); util::ErsatzProgress progress(std::accumulate(counts.begin() + 1, counts.end(), 0),
config.ProgressMessages(), "Quantizing");
for (unsigned char i = 2; i < counts.size(); ++i) { for (unsigned char i = 2; i < counts.size(); ++i) {
TrainQuantizer(i, counts[i-1], sri.Values(i), inputs[i-2], progress, quant); TrainQuantizer(i, counts[i-1], sri.Values(i), inputs[i-2], progress, quant);
} }
@ -519,13 +520,13 @@ template <class Quant, class Bhiksha> void BuildTrie(SortedFiles &files, std::ve
for (unsigned char i = 2; i <= counts.size(); ++i) { for (unsigned char i = 2; i <= counts.size(); ++i) {
inputs[i-2].Rewind(); inputs[i-2].Rewind();
} }
// Fill entries except unigram probabilities. // Fill entries except unigram probabilities.
{ {
WriteEntries<Quant, Bhiksha> writer(contexts, quant, unigrams, out.middle_begin_, out.longest_, counts.size(), sri); WriteEntries<Quant, Bhiksha> writer(contexts, quant, unigrams, out.middle_begin_, out.longest_, counts.size(), sri);
RecursiveInsert(counts.size(), counts[0], inputs, config.messages, "Writing trie", writer); RecursiveInsert(counts.size(), counts[0], inputs, config.ProgressMessages(), "Writing trie", writer);
} }
// Do not disable this error message or else too little state will be returned. Both WriteEntries::Middle and returning state based on found n-grams will need to be fixed to handle this situation. // Do not disable this error message or else too little state will be returned. Both WriteEntries::Middle and returning state based on found n-grams will need to be fixed to handle this situation.
for (unsigned char order = 2; order <= counts.size(); ++order) { for (unsigned char order = 2; order <= counts.size(); ++order) {
const RecordReader &context = contexts[order - 2]; const RecordReader &context = contexts[order - 2];
if (context) { if (context) {
@ -541,13 +542,13 @@ template <class Quant, class Bhiksha> void BuildTrie(SortedFiles &files, std::ve
} }
/* Set ending offsets so the last entry will be sized properly */ /* Set ending offsets so the last entry will be sized properly */
// Last entry for unigrams was already set. // Last entry for unigrams was already set.
if (out.middle_begin_ != out.middle_end_) { if (out.middle_begin_ != out.middle_end_) {
for (typename TrieSearch<Quant, Bhiksha>::Middle *i = out.middle_begin_; i != out.middle_end_ - 1; ++i) { for (typename TrieSearch<Quant, Bhiksha>::Middle *i = out.middle_begin_; i != out.middle_end_ - 1; ++i) {
i->FinishedLoading((i+1)->InsertIndex(), config); i->FinishedLoading((i+1)->InsertIndex(), config);
} }
(out.middle_end_ - 1)->FinishedLoading(out.longest_.InsertIndex(), config); (out.middle_end_ - 1)->FinishedLoading(out.longest_.InsertIndex(), config);
} }
} }
template <class Quant, class Bhiksha> uint8_t *TrieSearch<Quant, Bhiksha>::SetupMemory(uint8_t *start, const std::vector<uint64_t> &counts, const Config &config) { template <class Quant, class Bhiksha> uint8_t *TrieSearch<Quant, Bhiksha>::SetupMemory(uint8_t *start, const std::vector<uint64_t> &counts, const Config &config) {
@ -595,7 +596,7 @@ template <class Quant, class Bhiksha> void TrieSearch<Quant, Bhiksha>::Initializ
} else { } else {
temporary_prefix = file; temporary_prefix = file;
} }
// At least 1MB sorting memory. // At least 1MB sorting memory.
SortedFiles sorted(config, f, counts, std::max<size_t>(config.building_memory, 1048576), temporary_prefix, vocab); SortedFiles sorted(config, f, counts, std::max<size_t>(config.building_memory, 1048576), temporary_prefix, vocab);
BuildTrie(sorted, counts, config, *this, quant_, vocab, backing); BuildTrie(sorted, counts, config, *this, quant_, vocab, backing);

View File

@ -38,7 +38,7 @@ void ErsatzProgress::Milestone() {
next_ = std::numeric_limits<uint64_t>::max(); next_ = std::numeric_limits<uint64_t>::max();
out_ = NULL; out_ = NULL;
} else { } else {
next_ = std::max(next_, (stone * complete_) / kWidth); next_ = std::max(next_, ((stone + 1) * complete_ + kWidth - 1) / kWidth);
} }
} }

View File

@ -32,7 +32,6 @@ class ErsatzProgress {
void Set(uint64_t to) { void Set(uint64_t to) {
if ((current_ = to) >= next_) Milestone(); if ((current_ = to) >= next_) Milestone();
Milestone();
} }
void Finished() { void Finished() {

View File

@ -1,3 +1,5 @@
#define _LARGEFILE64_SOURCE
#include "util/file.hh" #include "util/file.hh"
#include "util/exception.hh" #include "util/exception.hh"
@ -91,7 +93,7 @@ void ReadOrThrow(int fd, void *to_void, std::size_t amount) {
uint8_t *to = static_cast<uint8_t*>(to_void); uint8_t *to = static_cast<uint8_t*>(to_void);
while (amount) { while (amount) {
std::size_t ret = PartialRead(fd, to, amount); std::size_t ret = PartialRead(fd, to, amount);
UTIL_THROW_IF(ret == 0, EndOfFileException, "Hit EOF in fd " << fd << " but there should be " << amount << " more bytes to read."); UTIL_THROW_IF(ret == 0, EndOfFileException, " in fd " << fd << " but there should be " << amount << " more bytes to read.");
amount -= ret; amount -= ret;
to += ret; to += ret;
} }
@ -141,7 +143,7 @@ void InternalSeek(int fd, int64_t off, int whence) {
UTIL_THROW_IF((__int64)-1 == _lseeki64(fd, off, whence), ErrnoException, "Windows seek failed"); UTIL_THROW_IF((__int64)-1 == _lseeki64(fd, off, whence), ErrnoException, "Windows seek failed");
#else #else
UTIL_THROW_IF((off_t)-1 == lseek(fd, off, whence), ErrnoException, "Seek failed"); UTIL_THROW_IF((off_t)-1 == lseek64(fd, off, whence), ErrnoException, "Seek failed");
#endif #endif
} }
} // namespace } // namespace

View File

@ -32,8 +32,6 @@ class scoped_fd {
return ret; return ret;
} }
operator bool() { return fd_ != -1; }
private: private:
int fd_; int fd_;

View File

@ -23,17 +23,17 @@ class ParseNumberException : public Exception {
extern const bool kSpaces[256]; extern const bool kSpaces[256];
// Memory backing the returned StringPiece may vanish on the next call. // Memory backing the returned StringPiece may vanish on the next call.
class FilePiece { class FilePiece {
public: public:
// 1 MB default. // 1 MB default.
explicit FilePiece(const char *file, std::ostream *show_progress = NULL, std::size_t min_buffer = 1048576); explicit FilePiece(const char *file, std::ostream *show_progress = NULL, std::size_t min_buffer = 1048576);
// Takes ownership of fd. name is used for messages. // Takes ownership of fd. name is used for messages.
explicit FilePiece(int fd, const char *name, std::ostream *show_progress = NULL, std::size_t min_buffer = 1048576); explicit FilePiece(int fd, const char *name, std::ostream *show_progress = NULL, std::size_t min_buffer = 1048576);
~FilePiece(); ~FilePiece();
char get() { char get() {
if (position_ == position_end_) { if (position_ == position_end_) {
Shift(); Shift();
if (at_end_) throw EndOfFileException(); if (at_end_) throw EndOfFileException();
@ -41,14 +41,14 @@ class FilePiece {
return *(position_++); return *(position_++);
} }
// Leaves the delimiter, if any, to be returned by get(). Delimiters defined by isspace(). // Leaves the delimiter, if any, to be returned by get(). Delimiters defined by isspace().
StringPiece ReadDelimited(const bool *delim = kSpaces) { StringPiece ReadDelimited(const bool *delim = kSpaces) {
SkipSpaces(delim); SkipSpaces(delim);
return Consume(FindDelimiterOrEOF(delim)); return Consume(FindDelimiterOrEOF(delim));
} }
// Unlike ReadDelimited, this includes leading spaces and consumes the delimiter. // Unlike ReadDelimited, this includes leading spaces and consumes the delimiter.
// It is similar to getline in that way. // It is similar to getline in that way.
StringPiece ReadLine(char delim = '\n'); StringPiece ReadLine(char delim = '\n');
float ReadFloat(); float ReadFloat();
@ -56,7 +56,7 @@ class FilePiece {
long int ReadLong(); long int ReadLong();
unsigned long int ReadULong(); unsigned long int ReadULong();
// Skip spaces defined by isspace. // Skip spaces defined by isspace.
void SkipSpaces(const bool *delim = kSpaces) { void SkipSpaces(const bool *delim = kSpaces) {
for (; ; ++position_) { for (; ; ++position_) {
if (position_ == position_end_) Shift(); if (position_ == position_end_) Shift();
@ -69,7 +69,7 @@ class FilePiece {
} }
const std::string &FileName() const { return file_name_; } const std::string &FileName() const { return file_name_; }
private: private:
void Initialize(const char *name, std::ostream *show_progress, std::size_t min_buffer); void Initialize(const char *name, std::ostream *show_progress, std::size_t min_buffer);

View File

@ -6,8 +6,8 @@
//#define HAVE_ICU //#define HAVE_ICU
#endif #endif
#ifndef HAVE_THREADS #ifndef HAVE_BOOST
//#define HAVE_THREADS #define HAVE_BOOST
#endif #endif
#endif // UTIL_HAVE__ #endif // UTIL_HAVE__

View File

@ -60,7 +60,7 @@ template <class KeyIter, class ValueIter> class JointProxy {
JointProxy(const KeyIter &key_iter, const ValueIter &value_iter) : inner_(key_iter, value_iter) {} JointProxy(const KeyIter &key_iter, const ValueIter &value_iter) : inner_(key_iter, value_iter) {}
JointProxy(const JointProxy<KeyIter, ValueIter> &other) : inner_(other.inner_) {} JointProxy(const JointProxy<KeyIter, ValueIter> &other) : inner_(other.inner_) {}
operator const value_type() const { operator value_type() const {
value_type ret; value_type ret;
ret.key = *inner_.key_; ret.key = *inner_.key_;
ret.value = *inner_.value_; ret.value = *inner_.value_;
@ -121,7 +121,7 @@ template <class Proxy, class Less> class LessWrapper : public std::binary_functi
template <class KeyIter, class ValueIter> class PairedIterator : public ProxyIterator<detail::JointProxy<KeyIter, ValueIter> > { template <class KeyIter, class ValueIter> class PairedIterator : public ProxyIterator<detail::JointProxy<KeyIter, ValueIter> > {
public: public:
PairedIterator(const KeyIter &key, const ValueIter &value) : PairedIterator(const KeyIter &key, const ValueIter &value) :
ProxyIterator<detail::JointProxy<KeyIter, ValueIter> >(detail::JointProxy<KeyIter, ValueIter>(key, value)) {} ProxyIterator<detail::JointProxy<KeyIter, ValueIter> >(detail::JointProxy<KeyIter, ValueIter>(key, value)) {}
}; };

View File

@ -370,7 +370,7 @@ ReadBase *ReadFactory(int fd, uint64_t &raw_amount) {
break; break;
} }
try { try {
AdvanceOrThrow(fd, -ReadCompressed::kMagicSize); SeekOrThrow(fd, 0);
} catch (const util::ErrnoException &e) { } catch (const util::ErrnoException &e) {
return new UncompressedWithHeader(hold.release(), header, ReadCompressed::kMagicSize); return new UncompressedWithHeader(hold.release(), header, ReadCompressed::kMagicSize);
} }

View File

@ -1,6 +1,6 @@
/* If you use ICU in your program, then compile with -DHAVE_ICU -licui18n. If /* If you use ICU in your program, then compile with -DHAVE_ICU -licui18n. If
* you don't use ICU, then this will use the Google implementation from Chrome. * you don't use ICU, then this will use the Google implementation from Chrome.
* This has been modified from the original version to let you choose. * This has been modified from the original version to let you choose.
*/ */
// Copyright 2008, Google Inc. // Copyright 2008, Google Inc.
@ -49,7 +49,11 @@
#define BASE_STRING_PIECE_H__ #define BASE_STRING_PIECE_H__
#include "util/have.hh" #include "util/have.hh"
#ifdef HAVE_BOOST
#include <boost/functional/hash/hash.hpp> #include <boost/functional/hash/hash.hpp>
#endif // HAVE_BOOST
#include <cstring> #include <cstring>
#include <iosfwd> #include <iosfwd>
#include <ostream> #include <ostream>
@ -58,9 +62,9 @@
#include <unicode/stringpiece.h> #include <unicode/stringpiece.h>
#include <unicode/uversion.h> #include <unicode/uversion.h>
// Old versions of ICU don't define operator== and operator!=. // Old versions of ICU don't define operator== and operator!=.
#if (U_ICU_VERSION_MAJOR_NUM < 4) || ((U_ICU_VERSION_MAJOR_NUM == 4) && (U_ICU_VERSION_MINOR_NUM < 4)) #if (U_ICU_VERSION_MAJOR_NUM < 4) || ((U_ICU_VERSION_MAJOR_NUM == 4) && (U_ICU_VERSION_MINOR_NUM < 4))
#warning You are using an old version of ICU. Consider upgrading to ICU >= 4.6. #warning You are using an old version of ICU. Consider upgrading to ICU >= 4.6.
inline bool operator==(const StringPiece& x, const StringPiece& y) { inline bool operator==(const StringPiece& x, const StringPiece& y) {
if (x.size() != y.size()) if (x.size() != y.size())
return false; return false;
@ -252,6 +256,7 @@ inline std::ostream& operator<<(std::ostream& o, const StringPiece& piece) {
return o.write(piece.data(), static_cast<std::streamsize>(piece.size())); return o.write(piece.data(), static_cast<std::streamsize>(piece.size()));
} }
#ifdef HAVE_BOOST
inline size_t hash_value(const StringPiece &str) { inline size_t hash_value(const StringPiece &str) {
return boost::hash_range(str.data(), str.data() + str.length()); return boost::hash_range(str.data(), str.data() + str.length());
} }
@ -285,9 +290,12 @@ template <class T> typename T::iterator FindStringPiece(T &t, const StringPiece
return t.find(key, StringPieceCompatibleHash(), StringPieceCompatibleEquals()); return t.find(key, StringPieceCompatibleHash(), StringPieceCompatibleEquals());
#endif #endif
} }
#endif
#ifdef HAVE_ICU #ifdef HAVE_ICU
U_NAMESPACE_END U_NAMESPACE_END
using U_NAMESPACE_QUALIFIER StringPiece;
#endif #endif
#endif // BASE_STRING_PIECE_H__ #endif // BASE_STRING_PIECE_H__