mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-26 13:23:25 +03:00
KenLM 0e5d259 including read_compressed fix
This commit is contained in:
parent
3203f7c92d
commit
f9ee7ae4b3
@ -16,11 +16,11 @@ namespace ngram {
|
||||
namespace {
|
||||
const char kMagicBeforeVersion[] = "mmap lm http://kheafield.com/code format version";
|
||||
const char kMagicBytes[] = "mmap lm http://kheafield.com/code format version 5\n\0";
|
||||
// This must be shorter than kMagicBytes and indicates an incomplete binary file (i.e. build failed).
|
||||
// This must be shorter than kMagicBytes and indicates an incomplete binary file (i.e. build failed).
|
||||
const char kMagicIncomplete[] = "mmap lm http://kheafield.com/code incomplete\n";
|
||||
const long int kMagicVersion = 5;
|
||||
|
||||
// Old binary files built on 32-bit machines have this header.
|
||||
// Old binary files built on 32-bit machines have this header.
|
||||
// TODO: eliminate with next binary release.
|
||||
struct OldSanity {
|
||||
char magic[sizeof(kMagicBytes)];
|
||||
@ -39,7 +39,7 @@ struct OldSanity {
|
||||
};
|
||||
|
||||
|
||||
// Test values aligned to 8 bytes.
|
||||
// Test values aligned to 8 bytes.
|
||||
struct Sanity {
|
||||
char magic[ALIGN8(sizeof(kMagicBytes))];
|
||||
float zero_f, one_f, minus_half_f;
|
||||
@ -101,7 +101,7 @@ uint8_t *SetupJustVocab(const Config &config, uint8_t order, std::size_t memory_
|
||||
uint8_t *GrowForSearch(const Config &config, std::size_t vocab_pad, std::size_t memory_size, Backing &backing) {
|
||||
std::size_t adjusted_vocab = backing.vocab.size() + vocab_pad;
|
||||
if (config.write_mmap) {
|
||||
// Grow the file to accommodate the search, using zeros.
|
||||
// Grow the file to accommodate the search, using zeros.
|
||||
try {
|
||||
util::ResizeOrThrow(backing.file.get(), adjusted_vocab + memory_size);
|
||||
} catch (util::ErrnoException &e) {
|
||||
@ -114,7 +114,7 @@ uint8_t *GrowForSearch(const Config &config, std::size_t vocab_pad, std::size_t
|
||||
return reinterpret_cast<uint8_t*>(backing.search.get());
|
||||
}
|
||||
// mmap it now.
|
||||
// We're skipping over the header and vocab for the search space mmap. mmap likes page aligned offsets, so some arithmetic to round the offset down.
|
||||
// We're skipping over the header and vocab for the search space mmap. mmap likes page aligned offsets, so some arithmetic to round the offset down.
|
||||
std::size_t page_size = util::SizePage();
|
||||
std::size_t alignment_cruft = adjusted_vocab % page_size;
|
||||
backing.search.reset(util::MapOrThrow(alignment_cruft + memory_size, true, util::kFileFlags, false, backing.file.get(), adjusted_vocab - alignment_cruft), alignment_cruft + memory_size, util::scoped_memory::MMAP_ALLOCATED);
|
||||
@ -122,7 +122,7 @@ uint8_t *GrowForSearch(const Config &config, std::size_t vocab_pad, std::size_t
|
||||
} else {
|
||||
util::MapAnonymous(memory_size, backing.search);
|
||||
return reinterpret_cast<uint8_t*>(backing.search.get());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void FinishFile(const Config &config, ModelType model_type, unsigned int search_version, const std::vector<uint64_t> &counts, std::size_t vocab_pad, Backing &backing) {
|
||||
@ -140,7 +140,7 @@ void FinishFile(const Config &config, ModelType model_type, unsigned int search_
|
||||
util::FSyncOrThrow(backing.file.get());
|
||||
break;
|
||||
}
|
||||
// header and vocab share the same mmap. The header is written here because we know the counts.
|
||||
// header and vocab share the same mmap. The header is written here because we know the counts.
|
||||
Parameters params = Parameters();
|
||||
params.counts = counts;
|
||||
params.fixed.order = counts.size();
|
||||
@ -160,7 +160,7 @@ namespace detail {
|
||||
bool IsBinaryFormat(int fd) {
|
||||
const uint64_t size = util::SizeFile(fd);
|
||||
if (size == util::kBadSize || (size <= static_cast<uint64_t>(sizeof(Sanity)))) return false;
|
||||
// Try reading the header.
|
||||
// Try reading the header.
|
||||
util::scoped_memory memory;
|
||||
try {
|
||||
util::MapRead(util::LAZY, fd, 0, sizeof(Sanity), memory);
|
||||
@ -214,7 +214,7 @@ void SeekPastHeader(int fd, const Parameters &params) {
|
||||
|
||||
uint8_t *SetupBinary(const Config &config, const Parameters &params, uint64_t memory_size, Backing &backing) {
|
||||
const uint64_t file_size = util::SizeFile(backing.file.get());
|
||||
// The header is smaller than a page, so we have to map the whole header as well.
|
||||
// The header is smaller than a page, so we have to map the whole header as well.
|
||||
std::size_t total_map = util::CheckOverflow(TotalHeaderSize(params.counts.size()) + memory_size);
|
||||
if (file_size != util::kBadSize && static_cast<uint64_t>(file_size) < total_map)
|
||||
UTIL_THROW(FormatLoadException, "Binary file has size " << file_size << " but the headers say it should be at least " << total_map);
|
||||
@ -233,7 +233,8 @@ void ComplainAboutARPA(const Config &config, ModelType model_type) {
|
||||
if (config.write_mmap || !config.messages) return;
|
||||
if (config.arpa_complain == Config::ALL) {
|
||||
*config.messages << "Loading the LM will be faster if you build a binary file." << std::endl;
|
||||
} else if (config.arpa_complain == Config::EXPENSIVE && model_type == TRIE_SORTED) {
|
||||
} else if (config.arpa_complain == Config::EXPENSIVE &&
|
||||
(model_type == TRIE || model_type == QUANT_TRIE || model_type == ARRAY_TRIE || model_type == QUANT_ARRAY_TRIE)) {
|
||||
*config.messages << "Building " << kModelNames[model_type] << " from ARPA is expensive. Save time by building a binary format." << std::endl;
|
||||
}
|
||||
}
|
||||
|
@ -6,6 +6,7 @@ namespace lm {
|
||||
namespace ngram {
|
||||
|
||||
Config::Config() :
|
||||
show_progress(true),
|
||||
messages(&std::cerr),
|
||||
enumerate_vocab(NULL),
|
||||
unknown_missing(COMPLAIN),
|
||||
|
59
lm/config.hh
59
lm/config.hh
@ -11,46 +11,52 @@
|
||||
/* Configuration for ngram model. Separate header to reduce pollution. */
|
||||
|
||||
namespace lm {
|
||||
|
||||
|
||||
class EnumerateVocab;
|
||||
|
||||
namespace ngram {
|
||||
|
||||
struct Config {
|
||||
// EFFECTIVE FOR BOTH ARPA AND BINARY READS
|
||||
// EFFECTIVE FOR BOTH ARPA AND BINARY READS
|
||||
|
||||
// (default true) print progress bar to messages
|
||||
bool show_progress;
|
||||
|
||||
// Where to log messages including the progress bar. Set to NULL for
|
||||
// silence.
|
||||
std::ostream *messages;
|
||||
|
||||
std::ostream *ProgressMessages() const {
|
||||
return show_progress ? messages : 0;
|
||||
}
|
||||
|
||||
// This will be called with every string in the vocabulary. See
|
||||
// enumerate_vocab.hh for more detail. Config does not take ownership; you
|
||||
// are still responsible for deleting it (or stack allocating).
|
||||
// are still responsible for deleting it (or stack allocating).
|
||||
EnumerateVocab *enumerate_vocab;
|
||||
|
||||
|
||||
|
||||
// ONLY EFFECTIVE WHEN READING ARPA
|
||||
|
||||
// What to do when <unk> isn't in the provided model.
|
||||
// What to do when <unk> isn't in the provided model.
|
||||
WarningAction unknown_missing;
|
||||
// What to do when <s> or </s> is missing from the model.
|
||||
// If THROW_UP, the exception will be of type util::SpecialWordMissingException.
|
||||
// What to do when <s> or </s> is missing from the model.
|
||||
// If THROW_UP, the exception will be of type util::SpecialWordMissingException.
|
||||
WarningAction sentence_marker_missing;
|
||||
|
||||
// What to do with a positive log probability. For COMPLAIN and SILENT, map
|
||||
// to 0.
|
||||
// to 0.
|
||||
WarningAction positive_log_probability;
|
||||
|
||||
// The probability to substitute for <unk> if it's missing from the model.
|
||||
// The probability to substitute for <unk> if it's missing from the model.
|
||||
// No effect if the model has <unk> or unknown_missing == THROW_UP.
|
||||
float unknown_missing_logprob;
|
||||
|
||||
// Size multiplier for probing hash table. Must be > 1. Space is linear in
|
||||
// this. Time is probing_multiplier / (probing_multiplier - 1). No effect
|
||||
// for sorted variant.
|
||||
// for sorted variant.
|
||||
// If you find yourself setting this to a low number, consider using the
|
||||
// TrieModel which has lower memory consumption.
|
||||
// TrieModel which has lower memory consumption.
|
||||
float probing_multiplier;
|
||||
|
||||
// Amount of memory to use for building. The actual memory usage will be
|
||||
@ -58,10 +64,10 @@ struct Config {
|
||||
// models.
|
||||
std::size_t building_memory;
|
||||
|
||||
// Template for temporary directory appropriate for passing to mkdtemp.
|
||||
// Template for temporary directory appropriate for passing to mkdtemp.
|
||||
// The characters XXXXXX are appended before passing to mkdtemp. Only
|
||||
// applies to trie. If NULL, defaults to write_mmap. If that's NULL,
|
||||
// defaults to input file name.
|
||||
// defaults to input file name.
|
||||
const char *temporary_directory_prefix;
|
||||
|
||||
// Level of complaining to do when loading from ARPA instead of binary format.
|
||||
@ -69,49 +75,46 @@ struct Config {
|
||||
ARPALoadComplain arpa_complain;
|
||||
|
||||
// While loading an ARPA file, also write out this binary format file. Set
|
||||
// to NULL to disable.
|
||||
// to NULL to disable.
|
||||
const char *write_mmap;
|
||||
|
||||
enum WriteMethod {
|
||||
WRITE_MMAP, // Map the file directly.
|
||||
WRITE_AFTER // Write after we're done.
|
||||
WRITE_MMAP, // Map the file directly.
|
||||
WRITE_AFTER // Write after we're done.
|
||||
};
|
||||
WriteMethod write_method;
|
||||
|
||||
// Include the vocab in the binary file? Only effective if write_mmap != NULL.
|
||||
// Include the vocab in the binary file? Only effective if write_mmap != NULL.
|
||||
bool include_vocab;
|
||||
|
||||
|
||||
// Left rest options. Only used when the model includes rest costs.
|
||||
// Left rest options. Only used when the model includes rest costs.
|
||||
enum RestFunction {
|
||||
REST_MAX, // Maximum of any score to the left
|
||||
REST_LOWER, // Use lower-order files given below.
|
||||
REST_LOWER, // Use lower-order files given below.
|
||||
};
|
||||
RestFunction rest_function;
|
||||
// Only used for REST_LOWER.
|
||||
// Only used for REST_LOWER.
|
||||
std::vector<std::string> rest_lower_files;
|
||||
|
||||
|
||||
|
||||
// Quantization options. Only effective for QuantTrieModel. One value is
|
||||
// reserved for each of prob and backoff, so 2^bits - 1 buckets will be used
|
||||
// to quantize (and one of the remaining backoffs will be 0).
|
||||
// to quantize (and one of the remaining backoffs will be 0).
|
||||
uint8_t prob_bits, backoff_bits;
|
||||
|
||||
// Bhiksha compression (simple form). Only works with trie.
|
||||
uint8_t pointer_bhiksha_bits;
|
||||
|
||||
|
||||
|
||||
|
||||
// ONLY EFFECTIVE WHEN READING BINARY
|
||||
|
||||
|
||||
// How to get the giant array into memory: lazy mmap, populate, read etc.
|
||||
// See util/mmap.hh for details of MapMethod.
|
||||
// See util/mmap.hh for details of MapMethod.
|
||||
util::LoadMethod load_method;
|
||||
|
||||
|
||||
|
||||
// Set defaults.
|
||||
// Set defaults.
|
||||
Config();
|
||||
};
|
||||
|
||||
|
30
lm/model.cc
30
lm/model.cc
@ -37,7 +37,7 @@ template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT
|
||||
template <class Search, class VocabularyT> GenericModel<Search, VocabularyT>::GenericModel(const char *file, const Config &config) {
|
||||
LoadLM(file, config, *this);
|
||||
|
||||
// g++ prints warnings unless these are fully initialized.
|
||||
// g++ prints warnings unless these are fully initialized.
|
||||
State begin_sentence = State();
|
||||
begin_sentence.length = 1;
|
||||
begin_sentence.words[0] = vocab_.BeginSentence();
|
||||
@ -69,8 +69,8 @@ template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT
|
||||
}
|
||||
|
||||
template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT>::InitializeFromARPA(const char *file, const Config &config) {
|
||||
// Backing file is the ARPA. Steal it so we can make the backing file the mmap output if any.
|
||||
util::FilePiece f(backing_.file.release(), file, config.messages);
|
||||
// Backing file is the ARPA. Steal it so we can make the backing file the mmap output if any.
|
||||
util::FilePiece f(backing_.file.release(), file, config.ProgressMessages());
|
||||
try {
|
||||
std::vector<uint64_t> counts;
|
||||
// File counts do not include pruned trigrams that extend to quadgrams etc. These will be fixed by search_.
|
||||
@ -80,7 +80,7 @@ template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT
|
||||
if (config.probing_multiplier <= 1.0) UTIL_THROW(ConfigException, "probing multiplier must be > 1.0");
|
||||
|
||||
std::size_t vocab_size = util::CheckOverflow(VocabularyT::Size(counts[0], config));
|
||||
// Setup the binary file for writing the vocab lookup table. The search_ is responsible for growing the binary file to its needs.
|
||||
// Setup the binary file for writing the vocab lookup table. The search_ is responsible for growing the binary file to its needs.
|
||||
vocab_.SetupMemory(SetupJustVocab(config, counts.size(), vocab_size, backing_), vocab_size, counts[0], config);
|
||||
|
||||
if (config.write_mmap) {
|
||||
@ -95,7 +95,7 @@ template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT
|
||||
|
||||
if (!vocab_.SawUnk()) {
|
||||
assert(config.unknown_missing != THROW_UP);
|
||||
// Default probabilities for unknown.
|
||||
// Default probabilities for unknown.
|
||||
search_.UnknownUnigram().backoff = 0.0;
|
||||
search_.UnknownUnigram().prob = config.unknown_missing_logprob;
|
||||
}
|
||||
@ -147,7 +147,7 @@ template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search,
|
||||
}
|
||||
|
||||
template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT>::GetState(const WordIndex *context_rbegin, const WordIndex *context_rend, State &out_state) const {
|
||||
// Generate a state from context.
|
||||
// Generate a state from context.
|
||||
context_rend = std::min(context_rend, context_rbegin + P::Order() - 1);
|
||||
if (context_rend == context_rbegin) {
|
||||
out_state.length = 0;
|
||||
@ -191,7 +191,7 @@ template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search,
|
||||
ret.rest = ptr.Rest();
|
||||
ret.prob = ptr.Prob();
|
||||
ret.extend_left = extend_pointer;
|
||||
// If this function is called, then it does depend on left words.
|
||||
// If this function is called, then it does depend on left words.
|
||||
ret.independent_left = false;
|
||||
}
|
||||
float subtract_me = ret.rest;
|
||||
@ -199,7 +199,7 @@ template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search,
|
||||
next_use = extend_length;
|
||||
ResumeScore(add_rbegin, add_rend, extend_length - 1, node, backoff_out, next_use, ret);
|
||||
next_use -= extend_length;
|
||||
// Charge backoffs.
|
||||
// Charge backoffs.
|
||||
for (const float *b = backoff_in + ret.ngram_length - extend_length; b < backoff_in + (add_rend - add_rbegin); ++b) ret.prob += *b;
|
||||
ret.prob -= subtract_me;
|
||||
ret.rest -= subtract_me;
|
||||
@ -209,7 +209,7 @@ template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search,
|
||||
namespace {
|
||||
// Do a paranoid copy of history, assuming new_word has already been copied
|
||||
// (hence the -1). out_state.length could be zero so I avoided using
|
||||
// std::copy.
|
||||
// std::copy.
|
||||
void CopyRemainingHistory(const WordIndex *from, State &out_state) {
|
||||
WordIndex *out = out_state.words + 1;
|
||||
const WordIndex *in_end = from + static_cast<ptrdiff_t>(out_state.length) - 1;
|
||||
@ -217,10 +217,10 @@ void CopyRemainingHistory(const WordIndex *from, State &out_state) {
|
||||
}
|
||||
} // namespace
|
||||
|
||||
/* Ugly optimized function. Produce a score excluding backoff.
|
||||
* The search goes in increasing order of ngram length.
|
||||
/* Ugly optimized function. Produce a score excluding backoff.
|
||||
* The search goes in increasing order of ngram length.
|
||||
* Context goes backward, so context_begin is the word immediately preceding
|
||||
* new_word.
|
||||
* new_word.
|
||||
*/
|
||||
template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search, VocabularyT>::ScoreExceptBackoff(
|
||||
const WordIndex *const context_rbegin,
|
||||
@ -229,7 +229,7 @@ template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search,
|
||||
State &out_state) const {
|
||||
assert(new_word < vocab_.Bound());
|
||||
FullScoreReturn ret;
|
||||
// ret.ngram_length contains the last known non-blank ngram length.
|
||||
// ret.ngram_length contains the last known non-blank ngram length.
|
||||
ret.ngram_length = 1;
|
||||
|
||||
typename Search::Node node;
|
||||
@ -238,9 +238,9 @@ template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search,
|
||||
ret.prob = uni.Prob();
|
||||
ret.rest = uni.Rest();
|
||||
|
||||
// This is the length of the context that should be used for continuation to the right.
|
||||
// This is the length of the context that should be used for continuation to the right.
|
||||
out_state.length = HasExtension(out_state.backoff[0]) ? 1 : 0;
|
||||
// We'll write the word anyway since it will probably be used and does no harm being there.
|
||||
// We'll write the word anyway since it will probably be used and does no harm being there.
|
||||
out_state.words[0] = new_word;
|
||||
if (context_rbegin == context_rend) return ret;
|
||||
|
||||
|
@ -55,7 +55,7 @@ struct ProbPointer {
|
||||
uint64_t index;
|
||||
};
|
||||
|
||||
// Array of n-grams and float indices.
|
||||
// Array of n-grams and float indices.
|
||||
class BackoffMessages {
|
||||
public:
|
||||
void Init(std::size_t entry_size) {
|
||||
@ -100,7 +100,7 @@ class BackoffMessages {
|
||||
void Apply(float *const *const base, RecordReader &reader) {
|
||||
FinishedAdding();
|
||||
if (current_ == allocated_) return;
|
||||
// We'll also use the same buffer to record messages to blanks that they extend.
|
||||
// We'll also use the same buffer to record messages to blanks that they extend.
|
||||
WordIndex *extend_out = reinterpret_cast<WordIndex*>(current_);
|
||||
const unsigned char order = (entry_size_ - sizeof(ProbPointer)) / sizeof(WordIndex);
|
||||
for (reader.Rewind(); reader && (current_ != allocated_); ) {
|
||||
@ -109,7 +109,7 @@ class BackoffMessages {
|
||||
++reader;
|
||||
break;
|
||||
case 1:
|
||||
// Message but nobody to receive it. Write it down at the beginning of the buffer so we can inform this blank that it extends.
|
||||
// Message but nobody to receive it. Write it down at the beginning of the buffer so we can inform this blank that it extends.
|
||||
for (const WordIndex *w = reinterpret_cast<const WordIndex *>(current_); w != reinterpret_cast<const WordIndex *>(current_) + order; ++w, ++extend_out) *extend_out = *w;
|
||||
current_ += entry_size_;
|
||||
break;
|
||||
@ -126,7 +126,7 @@ class BackoffMessages {
|
||||
break;
|
||||
}
|
||||
}
|
||||
// Now this is a list of blanks that extend right.
|
||||
// Now this is a list of blanks that extend right.
|
||||
entry_size_ = sizeof(WordIndex) * order;
|
||||
Resize(sizeof(WordIndex) * (extend_out - (const WordIndex*)backing_.get()));
|
||||
current_ = (uint8_t*)backing_.get();
|
||||
@ -153,7 +153,7 @@ class BackoffMessages {
|
||||
private:
|
||||
void FinishedAdding() {
|
||||
Resize(current_ - (uint8_t*)backing_.get());
|
||||
// Sort requests in same order as files.
|
||||
// Sort requests in same order as files.
|
||||
std::sort(
|
||||
util::SizedIterator(util::SizedProxy(backing_.get(), entry_size_)),
|
||||
util::SizedIterator(util::SizedProxy(current_, entry_size_)),
|
||||
@ -220,7 +220,7 @@ class SRISucks {
|
||||
}
|
||||
|
||||
private:
|
||||
// This used to be one array. Then I needed to separate it by order for quantization to work.
|
||||
// This used to be one array. Then I needed to separate it by order for quantization to work.
|
||||
std::vector<float> values_[KENLM_MAX_ORDER - 1];
|
||||
BackoffMessages messages_[KENLM_MAX_ORDER - 1];
|
||||
|
||||
@ -253,7 +253,7 @@ class FindBlanks {
|
||||
++counts_.back();
|
||||
}
|
||||
|
||||
// Unigrams wrote one past.
|
||||
// Unigrams wrote one past.
|
||||
void Cleanup() {
|
||||
--counts_[0];
|
||||
}
|
||||
@ -270,15 +270,15 @@ class FindBlanks {
|
||||
SRISucks &sri_;
|
||||
};
|
||||
|
||||
// Phase to actually write n-grams to the trie.
|
||||
// Phase to actually write n-grams to the trie.
|
||||
template <class Quant, class Bhiksha> class WriteEntries {
|
||||
public:
|
||||
WriteEntries(RecordReader *contexts, const Quant &quant, UnigramValue *unigrams, BitPackedMiddle<Bhiksha> *middle, BitPackedLongest &longest, unsigned char order, SRISucks &sri) :
|
||||
WriteEntries(RecordReader *contexts, const Quant &quant, UnigramValue *unigrams, BitPackedMiddle<Bhiksha> *middle, BitPackedLongest &longest, unsigned char order, SRISucks &sri) :
|
||||
contexts_(contexts),
|
||||
quant_(quant),
|
||||
unigrams_(unigrams),
|
||||
middle_(middle),
|
||||
longest_(longest),
|
||||
longest_(longest),
|
||||
bigram_pack_((order == 2) ? static_cast<BitPacked&>(longest_) : static_cast<BitPacked&>(*middle_)),
|
||||
order_(order),
|
||||
sri_(sri) {}
|
||||
@ -328,7 +328,7 @@ struct Gram {
|
||||
|
||||
const WordIndex *begin, *end;
|
||||
|
||||
// For queue, this is the direction we want.
|
||||
// For queue, this is the direction we want.
|
||||
bool operator<(const Gram &other) const {
|
||||
return std::lexicographical_compare(other.begin, other.end, begin, end);
|
||||
}
|
||||
@ -353,7 +353,7 @@ template <class Doing> class BlankManager {
|
||||
been_length_ = length;
|
||||
return;
|
||||
}
|
||||
// There are blanks to insert starting with order blank.
|
||||
// There are blanks to insert starting with order blank.
|
||||
unsigned char blank = cur - to + 1;
|
||||
UTIL_THROW_IF(blank == 1, FormatLoadException, "Missing a unigram that appears as context.");
|
||||
const float *lower_basis;
|
||||
@ -363,7 +363,7 @@ template <class Doing> class BlankManager {
|
||||
assert(*lower_basis != kBadProb);
|
||||
doing_.MiddleBlank(blank, to, based_on, *lower_basis);
|
||||
*pre = *cur;
|
||||
// Mark that the probability is a blank so it shouldn't be used as the basis for a later n-gram.
|
||||
// Mark that the probability is a blank so it shouldn't be used as the basis for a later n-gram.
|
||||
basis_[blank - 1] = kBadProb;
|
||||
}
|
||||
*pre = *cur;
|
||||
@ -377,7 +377,7 @@ template <class Doing> class BlankManager {
|
||||
unsigned char been_length_;
|
||||
|
||||
float basis_[KENLM_MAX_ORDER];
|
||||
|
||||
|
||||
Doing &doing_;
|
||||
};
|
||||
|
||||
@ -451,7 +451,7 @@ template <class Quant> void TrainProbQuantizer(uint8_t order, uint64_t count, Re
|
||||
}
|
||||
|
||||
void PopulateUnigramWeights(FILE *file, WordIndex unigram_count, RecordReader &contexts, UnigramValue *unigrams) {
|
||||
// Fill unigram probabilities.
|
||||
// Fill unigram probabilities.
|
||||
try {
|
||||
rewind(file);
|
||||
for (WordIndex i = 0; i < unigram_count; ++i) {
|
||||
@ -486,7 +486,7 @@ template <class Quant, class Bhiksha> void BuildTrie(SortedFiles &files, std::ve
|
||||
util::scoped_memory unigrams;
|
||||
MapRead(util::POPULATE_OR_READ, unigram_fd.get(), 0, counts[0] * sizeof(ProbBackoff), unigrams);
|
||||
FindBlanks finder(counts.size(), reinterpret_cast<const ProbBackoff*>(unigrams.get()), sri);
|
||||
RecursiveInsert(counts.size(), counts[0], inputs, config.messages, "Identifying n-grams omitted by SRI", finder);
|
||||
RecursiveInsert(counts.size(), counts[0], inputs, config.ProgressMessages(), "Identifying n-grams omitted by SRI", finder);
|
||||
fixed_counts = finder.Counts();
|
||||
}
|
||||
unigram_file.reset(util::FDOpenOrThrow(unigram_fd));
|
||||
@ -504,7 +504,8 @@ template <class Quant, class Bhiksha> void BuildTrie(SortedFiles &files, std::ve
|
||||
inputs[i-2].Rewind();
|
||||
}
|
||||
if (Quant::kTrain) {
|
||||
util::ErsatzProgress progress(std::accumulate(counts.begin() + 1, counts.end(), 0), config.messages, "Quantizing");
|
||||
util::ErsatzProgress progress(std::accumulate(counts.begin() + 1, counts.end(), 0),
|
||||
config.ProgressMessages(), "Quantizing");
|
||||
for (unsigned char i = 2; i < counts.size(); ++i) {
|
||||
TrainQuantizer(i, counts[i-1], sri.Values(i), inputs[i-2], progress, quant);
|
||||
}
|
||||
@ -519,13 +520,13 @@ template <class Quant, class Bhiksha> void BuildTrie(SortedFiles &files, std::ve
|
||||
for (unsigned char i = 2; i <= counts.size(); ++i) {
|
||||
inputs[i-2].Rewind();
|
||||
}
|
||||
// Fill entries except unigram probabilities.
|
||||
// Fill entries except unigram probabilities.
|
||||
{
|
||||
WriteEntries<Quant, Bhiksha> writer(contexts, quant, unigrams, out.middle_begin_, out.longest_, counts.size(), sri);
|
||||
RecursiveInsert(counts.size(), counts[0], inputs, config.messages, "Writing trie", writer);
|
||||
RecursiveInsert(counts.size(), counts[0], inputs, config.ProgressMessages(), "Writing trie", writer);
|
||||
}
|
||||
|
||||
// Do not disable this error message or else too little state will be returned. Both WriteEntries::Middle and returning state based on found n-grams will need to be fixed to handle this situation.
|
||||
// Do not disable this error message or else too little state will be returned. Both WriteEntries::Middle and returning state based on found n-grams will need to be fixed to handle this situation.
|
||||
for (unsigned char order = 2; order <= counts.size(); ++order) {
|
||||
const RecordReader &context = contexts[order - 2];
|
||||
if (context) {
|
||||
@ -541,13 +542,13 @@ template <class Quant, class Bhiksha> void BuildTrie(SortedFiles &files, std::ve
|
||||
}
|
||||
|
||||
/* Set ending offsets so the last entry will be sized properly */
|
||||
// Last entry for unigrams was already set.
|
||||
// Last entry for unigrams was already set.
|
||||
if (out.middle_begin_ != out.middle_end_) {
|
||||
for (typename TrieSearch<Quant, Bhiksha>::Middle *i = out.middle_begin_; i != out.middle_end_ - 1; ++i) {
|
||||
i->FinishedLoading((i+1)->InsertIndex(), config);
|
||||
}
|
||||
(out.middle_end_ - 1)->FinishedLoading(out.longest_.InsertIndex(), config);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <class Quant, class Bhiksha> uint8_t *TrieSearch<Quant, Bhiksha>::SetupMemory(uint8_t *start, const std::vector<uint64_t> &counts, const Config &config) {
|
||||
@ -595,7 +596,7 @@ template <class Quant, class Bhiksha> void TrieSearch<Quant, Bhiksha>::Initializ
|
||||
} else {
|
||||
temporary_prefix = file;
|
||||
}
|
||||
// At least 1MB sorting memory.
|
||||
// At least 1MB sorting memory.
|
||||
SortedFiles sorted(config, f, counts, std::max<size_t>(config.building_memory, 1048576), temporary_prefix, vocab);
|
||||
|
||||
BuildTrie(sorted, counts, config, *this, quant_, vocab, backing);
|
||||
|
@ -38,7 +38,7 @@ void ErsatzProgress::Milestone() {
|
||||
next_ = std::numeric_limits<uint64_t>::max();
|
||||
out_ = NULL;
|
||||
} else {
|
||||
next_ = std::max(next_, (stone * complete_) / kWidth);
|
||||
next_ = std::max(next_, ((stone + 1) * complete_ + kWidth - 1) / kWidth);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -32,7 +32,6 @@ class ErsatzProgress {
|
||||
|
||||
void Set(uint64_t to) {
|
||||
if ((current_ = to) >= next_) Milestone();
|
||||
Milestone();
|
||||
}
|
||||
|
||||
void Finished() {
|
||||
|
@ -1,3 +1,5 @@
|
||||
#define _LARGEFILE64_SOURCE
|
||||
|
||||
#include "util/file.hh"
|
||||
|
||||
#include "util/exception.hh"
|
||||
@ -91,7 +93,7 @@ void ReadOrThrow(int fd, void *to_void, std::size_t amount) {
|
||||
uint8_t *to = static_cast<uint8_t*>(to_void);
|
||||
while (amount) {
|
||||
std::size_t ret = PartialRead(fd, to, amount);
|
||||
UTIL_THROW_IF(ret == 0, EndOfFileException, "Hit EOF in fd " << fd << " but there should be " << amount << " more bytes to read.");
|
||||
UTIL_THROW_IF(ret == 0, EndOfFileException, " in fd " << fd << " but there should be " << amount << " more bytes to read.");
|
||||
amount -= ret;
|
||||
to += ret;
|
||||
}
|
||||
@ -141,7 +143,7 @@ void InternalSeek(int fd, int64_t off, int whence) {
|
||||
UTIL_THROW_IF((__int64)-1 == _lseeki64(fd, off, whence), ErrnoException, "Windows seek failed");
|
||||
|
||||
#else
|
||||
UTIL_THROW_IF((off_t)-1 == lseek(fd, off, whence), ErrnoException, "Seek failed");
|
||||
UTIL_THROW_IF((off_t)-1 == lseek64(fd, off, whence), ErrnoException, "Seek failed");
|
||||
#endif
|
||||
}
|
||||
} // namespace
|
||||
|
@ -32,8 +32,6 @@ class scoped_fd {
|
||||
return ret;
|
||||
}
|
||||
|
||||
operator bool() { return fd_ != -1; }
|
||||
|
||||
private:
|
||||
int fd_;
|
||||
|
||||
|
@ -23,17 +23,17 @@ class ParseNumberException : public Exception {
|
||||
|
||||
extern const bool kSpaces[256];
|
||||
|
||||
// Memory backing the returned StringPiece may vanish on the next call.
|
||||
// Memory backing the returned StringPiece may vanish on the next call.
|
||||
class FilePiece {
|
||||
public:
|
||||
// 1 MB default.
|
||||
explicit FilePiece(const char *file, std::ostream *show_progress = NULL, std::size_t min_buffer = 1048576);
|
||||
// Takes ownership of fd. name is used for messages.
|
||||
// Takes ownership of fd. name is used for messages.
|
||||
explicit FilePiece(int fd, const char *name, std::ostream *show_progress = NULL, std::size_t min_buffer = 1048576);
|
||||
|
||||
~FilePiece();
|
||||
|
||||
char get() {
|
||||
|
||||
char get() {
|
||||
if (position_ == position_end_) {
|
||||
Shift();
|
||||
if (at_end_) throw EndOfFileException();
|
||||
@ -41,14 +41,14 @@ class FilePiece {
|
||||
return *(position_++);
|
||||
}
|
||||
|
||||
// Leaves the delimiter, if any, to be returned by get(). Delimiters defined by isspace().
|
||||
// Leaves the delimiter, if any, to be returned by get(). Delimiters defined by isspace().
|
||||
StringPiece ReadDelimited(const bool *delim = kSpaces) {
|
||||
SkipSpaces(delim);
|
||||
return Consume(FindDelimiterOrEOF(delim));
|
||||
}
|
||||
|
||||
// Unlike ReadDelimited, this includes leading spaces and consumes the delimiter.
|
||||
// It is similar to getline in that way.
|
||||
// It is similar to getline in that way.
|
||||
StringPiece ReadLine(char delim = '\n');
|
||||
|
||||
float ReadFloat();
|
||||
@ -56,7 +56,7 @@ class FilePiece {
|
||||
long int ReadLong();
|
||||
unsigned long int ReadULong();
|
||||
|
||||
// Skip spaces defined by isspace.
|
||||
// Skip spaces defined by isspace.
|
||||
void SkipSpaces(const bool *delim = kSpaces) {
|
||||
for (; ; ++position_) {
|
||||
if (position_ == position_end_) Shift();
|
||||
@ -69,7 +69,7 @@ class FilePiece {
|
||||
}
|
||||
|
||||
const std::string &FileName() const { return file_name_; }
|
||||
|
||||
|
||||
private:
|
||||
void Initialize(const char *name, std::ostream *show_progress, std::size_t min_buffer);
|
||||
|
||||
|
@ -6,8 +6,8 @@
|
||||
//#define HAVE_ICU
|
||||
#endif
|
||||
|
||||
#ifndef HAVE_THREADS
|
||||
//#define HAVE_THREADS
|
||||
#ifndef HAVE_BOOST
|
||||
#define HAVE_BOOST
|
||||
#endif
|
||||
|
||||
#endif // UTIL_HAVE__
|
||||
|
@ -60,7 +60,7 @@ template <class KeyIter, class ValueIter> class JointProxy {
|
||||
JointProxy(const KeyIter &key_iter, const ValueIter &value_iter) : inner_(key_iter, value_iter) {}
|
||||
JointProxy(const JointProxy<KeyIter, ValueIter> &other) : inner_(other.inner_) {}
|
||||
|
||||
operator const value_type() const {
|
||||
operator value_type() const {
|
||||
value_type ret;
|
||||
ret.key = *inner_.key_;
|
||||
ret.value = *inner_.value_;
|
||||
@ -121,7 +121,7 @@ template <class Proxy, class Less> class LessWrapper : public std::binary_functi
|
||||
|
||||
template <class KeyIter, class ValueIter> class PairedIterator : public ProxyIterator<detail::JointProxy<KeyIter, ValueIter> > {
|
||||
public:
|
||||
PairedIterator(const KeyIter &key, const ValueIter &value) :
|
||||
PairedIterator(const KeyIter &key, const ValueIter &value) :
|
||||
ProxyIterator<detail::JointProxy<KeyIter, ValueIter> >(detail::JointProxy<KeyIter, ValueIter>(key, value)) {}
|
||||
};
|
||||
|
||||
|
@ -370,7 +370,7 @@ ReadBase *ReadFactory(int fd, uint64_t &raw_amount) {
|
||||
break;
|
||||
}
|
||||
try {
|
||||
AdvanceOrThrow(fd, -ReadCompressed::kMagicSize);
|
||||
SeekOrThrow(fd, 0);
|
||||
} catch (const util::ErrnoException &e) {
|
||||
return new UncompressedWithHeader(hold.release(), header, ReadCompressed::kMagicSize);
|
||||
}
|
||||
|
@ -1,6 +1,6 @@
|
||||
/* If you use ICU in your program, then compile with -DHAVE_ICU -licui18n. If
|
||||
* you don't use ICU, then this will use the Google implementation from Chrome.
|
||||
* This has been modified from the original version to let you choose.
|
||||
* This has been modified from the original version to let you choose.
|
||||
*/
|
||||
|
||||
// Copyright 2008, Google Inc.
|
||||
@ -49,7 +49,11 @@
|
||||
#define BASE_STRING_PIECE_H__
|
||||
|
||||
#include "util/have.hh"
|
||||
|
||||
#ifdef HAVE_BOOST
|
||||
#include <boost/functional/hash/hash.hpp>
|
||||
#endif // HAVE_BOOST
|
||||
|
||||
#include <cstring>
|
||||
#include <iosfwd>
|
||||
#include <ostream>
|
||||
@ -58,9 +62,9 @@
|
||||
#include <unicode/stringpiece.h>
|
||||
#include <unicode/uversion.h>
|
||||
|
||||
// Old versions of ICU don't define operator== and operator!=.
|
||||
// Old versions of ICU don't define operator== and operator!=.
|
||||
#if (U_ICU_VERSION_MAJOR_NUM < 4) || ((U_ICU_VERSION_MAJOR_NUM == 4) && (U_ICU_VERSION_MINOR_NUM < 4))
|
||||
#warning You are using an old version of ICU. Consider upgrading to ICU >= 4.6.
|
||||
#warning You are using an old version of ICU. Consider upgrading to ICU >= 4.6.
|
||||
inline bool operator==(const StringPiece& x, const StringPiece& y) {
|
||||
if (x.size() != y.size())
|
||||
return false;
|
||||
@ -252,6 +256,7 @@ inline std::ostream& operator<<(std::ostream& o, const StringPiece& piece) {
|
||||
return o.write(piece.data(), static_cast<std::streamsize>(piece.size()));
|
||||
}
|
||||
|
||||
#ifdef HAVE_BOOST
|
||||
inline size_t hash_value(const StringPiece &str) {
|
||||
return boost::hash_range(str.data(), str.data() + str.length());
|
||||
}
|
||||
@ -285,9 +290,12 @@ template <class T> typename T::iterator FindStringPiece(T &t, const StringPiece
|
||||
return t.find(key, StringPieceCompatibleHash(), StringPieceCompatibleEquals());
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_ICU
|
||||
U_NAMESPACE_END
|
||||
using U_NAMESPACE_QUALIFIER StringPiece;
|
||||
#endif
|
||||
|
||||
|
||||
#endif // BASE_STRING_PIECE_H__
|
||||
|
Loading…
Reference in New Issue
Block a user