mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-27 05:55:02 +03:00
KenLM 0e5d259 including read_compressed fix
This commit is contained in:
parent
3203f7c92d
commit
f9ee7ae4b3
@ -16,11 +16,11 @@ namespace ngram {
|
|||||||
namespace {
|
namespace {
|
||||||
const char kMagicBeforeVersion[] = "mmap lm http://kheafield.com/code format version";
|
const char kMagicBeforeVersion[] = "mmap lm http://kheafield.com/code format version";
|
||||||
const char kMagicBytes[] = "mmap lm http://kheafield.com/code format version 5\n\0";
|
const char kMagicBytes[] = "mmap lm http://kheafield.com/code format version 5\n\0";
|
||||||
// This must be shorter than kMagicBytes and indicates an incomplete binary file (i.e. build failed).
|
// This must be shorter than kMagicBytes and indicates an incomplete binary file (i.e. build failed).
|
||||||
const char kMagicIncomplete[] = "mmap lm http://kheafield.com/code incomplete\n";
|
const char kMagicIncomplete[] = "mmap lm http://kheafield.com/code incomplete\n";
|
||||||
const long int kMagicVersion = 5;
|
const long int kMagicVersion = 5;
|
||||||
|
|
||||||
// Old binary files built on 32-bit machines have this header.
|
// Old binary files built on 32-bit machines have this header.
|
||||||
// TODO: eliminate with next binary release.
|
// TODO: eliminate with next binary release.
|
||||||
struct OldSanity {
|
struct OldSanity {
|
||||||
char magic[sizeof(kMagicBytes)];
|
char magic[sizeof(kMagicBytes)];
|
||||||
@ -39,7 +39,7 @@ struct OldSanity {
|
|||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
// Test values aligned to 8 bytes.
|
// Test values aligned to 8 bytes.
|
||||||
struct Sanity {
|
struct Sanity {
|
||||||
char magic[ALIGN8(sizeof(kMagicBytes))];
|
char magic[ALIGN8(sizeof(kMagicBytes))];
|
||||||
float zero_f, one_f, minus_half_f;
|
float zero_f, one_f, minus_half_f;
|
||||||
@ -101,7 +101,7 @@ uint8_t *SetupJustVocab(const Config &config, uint8_t order, std::size_t memory_
|
|||||||
uint8_t *GrowForSearch(const Config &config, std::size_t vocab_pad, std::size_t memory_size, Backing &backing) {
|
uint8_t *GrowForSearch(const Config &config, std::size_t vocab_pad, std::size_t memory_size, Backing &backing) {
|
||||||
std::size_t adjusted_vocab = backing.vocab.size() + vocab_pad;
|
std::size_t adjusted_vocab = backing.vocab.size() + vocab_pad;
|
||||||
if (config.write_mmap) {
|
if (config.write_mmap) {
|
||||||
// Grow the file to accomodate the search, using zeros.
|
// Grow the file to accomodate the search, using zeros.
|
||||||
try {
|
try {
|
||||||
util::ResizeOrThrow(backing.file.get(), adjusted_vocab + memory_size);
|
util::ResizeOrThrow(backing.file.get(), adjusted_vocab + memory_size);
|
||||||
} catch (util::ErrnoException &e) {
|
} catch (util::ErrnoException &e) {
|
||||||
@ -114,7 +114,7 @@ uint8_t *GrowForSearch(const Config &config, std::size_t vocab_pad, std::size_t
|
|||||||
return reinterpret_cast<uint8_t*>(backing.search.get());
|
return reinterpret_cast<uint8_t*>(backing.search.get());
|
||||||
}
|
}
|
||||||
// mmap it now.
|
// mmap it now.
|
||||||
// We're skipping over the header and vocab for the search space mmap. mmap likes page aligned offsets, so some arithmetic to round the offset down.
|
// We're skipping over the header and vocab for the search space mmap. mmap likes page aligned offsets, so some arithmetic to round the offset down.
|
||||||
std::size_t page_size = util::SizePage();
|
std::size_t page_size = util::SizePage();
|
||||||
std::size_t alignment_cruft = adjusted_vocab % page_size;
|
std::size_t alignment_cruft = adjusted_vocab % page_size;
|
||||||
backing.search.reset(util::MapOrThrow(alignment_cruft + memory_size, true, util::kFileFlags, false, backing.file.get(), adjusted_vocab - alignment_cruft), alignment_cruft + memory_size, util::scoped_memory::MMAP_ALLOCATED);
|
backing.search.reset(util::MapOrThrow(alignment_cruft + memory_size, true, util::kFileFlags, false, backing.file.get(), adjusted_vocab - alignment_cruft), alignment_cruft + memory_size, util::scoped_memory::MMAP_ALLOCATED);
|
||||||
@ -122,7 +122,7 @@ uint8_t *GrowForSearch(const Config &config, std::size_t vocab_pad, std::size_t
|
|||||||
} else {
|
} else {
|
||||||
util::MapAnonymous(memory_size, backing.search);
|
util::MapAnonymous(memory_size, backing.search);
|
||||||
return reinterpret_cast<uint8_t*>(backing.search.get());
|
return reinterpret_cast<uint8_t*>(backing.search.get());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void FinishFile(const Config &config, ModelType model_type, unsigned int search_version, const std::vector<uint64_t> &counts, std::size_t vocab_pad, Backing &backing) {
|
void FinishFile(const Config &config, ModelType model_type, unsigned int search_version, const std::vector<uint64_t> &counts, std::size_t vocab_pad, Backing &backing) {
|
||||||
@ -140,7 +140,7 @@ void FinishFile(const Config &config, ModelType model_type, unsigned int search_
|
|||||||
util::FSyncOrThrow(backing.file.get());
|
util::FSyncOrThrow(backing.file.get());
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
// header and vocab share the same mmap. The header is written here because we know the counts.
|
// header and vocab share the same mmap. The header is written here because we know the counts.
|
||||||
Parameters params = Parameters();
|
Parameters params = Parameters();
|
||||||
params.counts = counts;
|
params.counts = counts;
|
||||||
params.fixed.order = counts.size();
|
params.fixed.order = counts.size();
|
||||||
@ -160,7 +160,7 @@ namespace detail {
|
|||||||
bool IsBinaryFormat(int fd) {
|
bool IsBinaryFormat(int fd) {
|
||||||
const uint64_t size = util::SizeFile(fd);
|
const uint64_t size = util::SizeFile(fd);
|
||||||
if (size == util::kBadSize || (size <= static_cast<uint64_t>(sizeof(Sanity)))) return false;
|
if (size == util::kBadSize || (size <= static_cast<uint64_t>(sizeof(Sanity)))) return false;
|
||||||
// Try reading the header.
|
// Try reading the header.
|
||||||
util::scoped_memory memory;
|
util::scoped_memory memory;
|
||||||
try {
|
try {
|
||||||
util::MapRead(util::LAZY, fd, 0, sizeof(Sanity), memory);
|
util::MapRead(util::LAZY, fd, 0, sizeof(Sanity), memory);
|
||||||
@ -214,7 +214,7 @@ void SeekPastHeader(int fd, const Parameters ¶ms) {
|
|||||||
|
|
||||||
uint8_t *SetupBinary(const Config &config, const Parameters ¶ms, uint64_t memory_size, Backing &backing) {
|
uint8_t *SetupBinary(const Config &config, const Parameters ¶ms, uint64_t memory_size, Backing &backing) {
|
||||||
const uint64_t file_size = util::SizeFile(backing.file.get());
|
const uint64_t file_size = util::SizeFile(backing.file.get());
|
||||||
// The header is smaller than a page, so we have to map the whole header as well.
|
// The header is smaller than a page, so we have to map the whole header as well.
|
||||||
std::size_t total_map = util::CheckOverflow(TotalHeaderSize(params.counts.size()) + memory_size);
|
std::size_t total_map = util::CheckOverflow(TotalHeaderSize(params.counts.size()) + memory_size);
|
||||||
if (file_size != util::kBadSize && static_cast<uint64_t>(file_size) < total_map)
|
if (file_size != util::kBadSize && static_cast<uint64_t>(file_size) < total_map)
|
||||||
UTIL_THROW(FormatLoadException, "Binary file has size " << file_size << " but the headers say it should be at least " << total_map);
|
UTIL_THROW(FormatLoadException, "Binary file has size " << file_size << " but the headers say it should be at least " << total_map);
|
||||||
@ -233,7 +233,8 @@ void ComplainAboutARPA(const Config &config, ModelType model_type) {
|
|||||||
if (config.write_mmap || !config.messages) return;
|
if (config.write_mmap || !config.messages) return;
|
||||||
if (config.arpa_complain == Config::ALL) {
|
if (config.arpa_complain == Config::ALL) {
|
||||||
*config.messages << "Loading the LM will be faster if you build a binary file." << std::endl;
|
*config.messages << "Loading the LM will be faster if you build a binary file." << std::endl;
|
||||||
} else if (config.arpa_complain == Config::EXPENSIVE && model_type == TRIE_SORTED) {
|
} else if (config.arpa_complain == Config::EXPENSIVE &&
|
||||||
|
(model_type == TRIE || model_type == QUANT_TRIE || model_type == ARRAY_TRIE || model_type == QUANT_ARRAY_TRIE)) {
|
||||||
*config.messages << "Building " << kModelNames[model_type] << " from ARPA is expensive. Save time by building a binary format." << std::endl;
|
*config.messages << "Building " << kModelNames[model_type] << " from ARPA is expensive. Save time by building a binary format." << std::endl;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -6,6 +6,7 @@ namespace lm {
|
|||||||
namespace ngram {
|
namespace ngram {
|
||||||
|
|
||||||
Config::Config() :
|
Config::Config() :
|
||||||
|
show_progress(true),
|
||||||
messages(&std::cerr),
|
messages(&std::cerr),
|
||||||
enumerate_vocab(NULL),
|
enumerate_vocab(NULL),
|
||||||
unknown_missing(COMPLAIN),
|
unknown_missing(COMPLAIN),
|
||||||
|
59
lm/config.hh
59
lm/config.hh
@ -11,46 +11,52 @@
|
|||||||
/* Configuration for ngram model. Separate header to reduce pollution. */
|
/* Configuration for ngram model. Separate header to reduce pollution. */
|
||||||
|
|
||||||
namespace lm {
|
namespace lm {
|
||||||
|
|
||||||
class EnumerateVocab;
|
class EnumerateVocab;
|
||||||
|
|
||||||
namespace ngram {
|
namespace ngram {
|
||||||
|
|
||||||
struct Config {
|
struct Config {
|
||||||
// EFFECTIVE FOR BOTH ARPA AND BINARY READS
|
// EFFECTIVE FOR BOTH ARPA AND BINARY READS
|
||||||
|
|
||||||
|
// (default true) print progress bar to messages
|
||||||
|
bool show_progress;
|
||||||
|
|
||||||
// Where to log messages including the progress bar. Set to NULL for
|
// Where to log messages including the progress bar. Set to NULL for
|
||||||
// silence.
|
// silence.
|
||||||
std::ostream *messages;
|
std::ostream *messages;
|
||||||
|
|
||||||
|
std::ostream *ProgressMessages() const {
|
||||||
|
return show_progress ? messages : 0;
|
||||||
|
}
|
||||||
|
|
||||||
// This will be called with every string in the vocabulary. See
|
// This will be called with every string in the vocabulary. See
|
||||||
// enumerate_vocab.hh for more detail. Config does not take ownership; you
|
// enumerate_vocab.hh for more detail. Config does not take ownership; you
|
||||||
// are still responsible for deleting it (or stack allocating).
|
// are still responsible for deleting it (or stack allocating).
|
||||||
EnumerateVocab *enumerate_vocab;
|
EnumerateVocab *enumerate_vocab;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
// ONLY EFFECTIVE WHEN READING ARPA
|
// ONLY EFFECTIVE WHEN READING ARPA
|
||||||
|
|
||||||
// What to do when <unk> isn't in the provided model.
|
// What to do when <unk> isn't in the provided model.
|
||||||
WarningAction unknown_missing;
|
WarningAction unknown_missing;
|
||||||
// What to do when <s> or </s> is missing from the model.
|
// What to do when <s> or </s> is missing from the model.
|
||||||
// If THROW_UP, the exception will be of type util::SpecialWordMissingException.
|
// If THROW_UP, the exception will be of type util::SpecialWordMissingException.
|
||||||
WarningAction sentence_marker_missing;
|
WarningAction sentence_marker_missing;
|
||||||
|
|
||||||
// What to do with a positive log probability. For COMPLAIN and SILENT, map
|
// What to do with a positive log probability. For COMPLAIN and SILENT, map
|
||||||
// to 0.
|
// to 0.
|
||||||
WarningAction positive_log_probability;
|
WarningAction positive_log_probability;
|
||||||
|
|
||||||
// The probability to substitute for <unk> if it's missing from the model.
|
// The probability to substitute for <unk> if it's missing from the model.
|
||||||
// No effect if the model has <unk> or unknown_missing == THROW_UP.
|
// No effect if the model has <unk> or unknown_missing == THROW_UP.
|
||||||
float unknown_missing_logprob;
|
float unknown_missing_logprob;
|
||||||
|
|
||||||
// Size multiplier for probing hash table. Must be > 1. Space is linear in
|
// Size multiplier for probing hash table. Must be > 1. Space is linear in
|
||||||
// this. Time is probing_multiplier / (probing_multiplier - 1). No effect
|
// this. Time is probing_multiplier / (probing_multiplier - 1). No effect
|
||||||
// for sorted variant.
|
// for sorted variant.
|
||||||
// If you find yourself setting this to a low number, consider using the
|
// If you find yourself setting this to a low number, consider using the
|
||||||
// TrieModel which has lower memory consumption.
|
// TrieModel which has lower memory consumption.
|
||||||
float probing_multiplier;
|
float probing_multiplier;
|
||||||
|
|
||||||
// Amount of memory to use for building. The actual memory usage will be
|
// Amount of memory to use for building. The actual memory usage will be
|
||||||
@ -58,10 +64,10 @@ struct Config {
|
|||||||
// models.
|
// models.
|
||||||
std::size_t building_memory;
|
std::size_t building_memory;
|
||||||
|
|
||||||
// Template for temporary directory appropriate for passing to mkdtemp.
|
// Template for temporary directory appropriate for passing to mkdtemp.
|
||||||
// The characters XXXXXX are appended before passing to mkdtemp. Only
|
// The characters XXXXXX are appended before passing to mkdtemp. Only
|
||||||
// applies to trie. If NULL, defaults to write_mmap. If that's NULL,
|
// applies to trie. If NULL, defaults to write_mmap. If that's NULL,
|
||||||
// defaults to input file name.
|
// defaults to input file name.
|
||||||
const char *temporary_directory_prefix;
|
const char *temporary_directory_prefix;
|
||||||
|
|
||||||
// Level of complaining to do when loading from ARPA instead of binary format.
|
// Level of complaining to do when loading from ARPA instead of binary format.
|
||||||
@ -69,49 +75,46 @@ struct Config {
|
|||||||
ARPALoadComplain arpa_complain;
|
ARPALoadComplain arpa_complain;
|
||||||
|
|
||||||
// While loading an ARPA file, also write out this binary format file. Set
|
// While loading an ARPA file, also write out this binary format file. Set
|
||||||
// to NULL to disable.
|
// to NULL to disable.
|
||||||
const char *write_mmap;
|
const char *write_mmap;
|
||||||
|
|
||||||
enum WriteMethod {
|
enum WriteMethod {
|
||||||
WRITE_MMAP, // Map the file directly.
|
WRITE_MMAP, // Map the file directly.
|
||||||
WRITE_AFTER // Write after we're done.
|
WRITE_AFTER // Write after we're done.
|
||||||
};
|
};
|
||||||
WriteMethod write_method;
|
WriteMethod write_method;
|
||||||
|
|
||||||
// Include the vocab in the binary file? Only effective if write_mmap != NULL.
|
// Include the vocab in the binary file? Only effective if write_mmap != NULL.
|
||||||
bool include_vocab;
|
bool include_vocab;
|
||||||
|
|
||||||
|
|
||||||
// Left rest options. Only used when the model includes rest costs.
|
// Left rest options. Only used when the model includes rest costs.
|
||||||
enum RestFunction {
|
enum RestFunction {
|
||||||
REST_MAX, // Maximum of any score to the left
|
REST_MAX, // Maximum of any score to the left
|
||||||
REST_LOWER, // Use lower-order files given below.
|
REST_LOWER, // Use lower-order files given below.
|
||||||
};
|
};
|
||||||
RestFunction rest_function;
|
RestFunction rest_function;
|
||||||
// Only used for REST_LOWER.
|
// Only used for REST_LOWER.
|
||||||
std::vector<std::string> rest_lower_files;
|
std::vector<std::string> rest_lower_files;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
// Quantization options. Only effective for QuantTrieModel. One value is
|
// Quantization options. Only effective for QuantTrieModel. One value is
|
||||||
// reserved for each of prob and backoff, so 2^bits - 1 buckets will be used
|
// reserved for each of prob and backoff, so 2^bits - 1 buckets will be used
|
||||||
// to quantize (and one of the remaining backoffs will be 0).
|
// to quantize (and one of the remaining backoffs will be 0).
|
||||||
uint8_t prob_bits, backoff_bits;
|
uint8_t prob_bits, backoff_bits;
|
||||||
|
|
||||||
// Bhiksha compression (simple form). Only works with trie.
|
// Bhiksha compression (simple form). Only works with trie.
|
||||||
uint8_t pointer_bhiksha_bits;
|
uint8_t pointer_bhiksha_bits;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
// ONLY EFFECTIVE WHEN READING BINARY
|
// ONLY EFFECTIVE WHEN READING BINARY
|
||||||
|
|
||||||
// How to get the giant array into memory: lazy mmap, populate, read etc.
|
// How to get the giant array into memory: lazy mmap, populate, read etc.
|
||||||
// See util/mmap.hh for details of MapMethod.
|
// See util/mmap.hh for details of MapMethod.
|
||||||
util::LoadMethod load_method;
|
util::LoadMethod load_method;
|
||||||
|
|
||||||
|
|
||||||
|
// Set defaults.
|
||||||
// Set defaults.
|
|
||||||
Config();
|
Config();
|
||||||
};
|
};
|
||||||
|
|
||||||
|
30
lm/model.cc
30
lm/model.cc
@ -37,7 +37,7 @@ template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT
|
|||||||
template <class Search, class VocabularyT> GenericModel<Search, VocabularyT>::GenericModel(const char *file, const Config &config) {
|
template <class Search, class VocabularyT> GenericModel<Search, VocabularyT>::GenericModel(const char *file, const Config &config) {
|
||||||
LoadLM(file, config, *this);
|
LoadLM(file, config, *this);
|
||||||
|
|
||||||
// g++ prints warnings unless these are fully initialized.
|
// g++ prints warnings unless these are fully initialized.
|
||||||
State begin_sentence = State();
|
State begin_sentence = State();
|
||||||
begin_sentence.length = 1;
|
begin_sentence.length = 1;
|
||||||
begin_sentence.words[0] = vocab_.BeginSentence();
|
begin_sentence.words[0] = vocab_.BeginSentence();
|
||||||
@ -69,8 +69,8 @@ template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT
|
|||||||
}
|
}
|
||||||
|
|
||||||
template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT>::InitializeFromARPA(const char *file, const Config &config) {
|
template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT>::InitializeFromARPA(const char *file, const Config &config) {
|
||||||
// Backing file is the ARPA. Steal it so we can make the backing file the mmap output if any.
|
// Backing file is the ARPA. Steal it so we can make the backing file the mmap output if any.
|
||||||
util::FilePiece f(backing_.file.release(), file, config.messages);
|
util::FilePiece f(backing_.file.release(), file, config.ProgressMessages());
|
||||||
try {
|
try {
|
||||||
std::vector<uint64_t> counts;
|
std::vector<uint64_t> counts;
|
||||||
// File counts do not include pruned trigrams that extend to quadgrams etc. These will be fixed by search_.
|
// File counts do not include pruned trigrams that extend to quadgrams etc. These will be fixed by search_.
|
||||||
@ -80,7 +80,7 @@ template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT
|
|||||||
if (config.probing_multiplier <= 1.0) UTIL_THROW(ConfigException, "probing multiplier must be > 1.0");
|
if (config.probing_multiplier <= 1.0) UTIL_THROW(ConfigException, "probing multiplier must be > 1.0");
|
||||||
|
|
||||||
std::size_t vocab_size = util::CheckOverflow(VocabularyT::Size(counts[0], config));
|
std::size_t vocab_size = util::CheckOverflow(VocabularyT::Size(counts[0], config));
|
||||||
// Setup the binary file for writing the vocab lookup table. The search_ is responsible for growing the binary file to its needs.
|
// Setup the binary file for writing the vocab lookup table. The search_ is responsible for growing the binary file to its needs.
|
||||||
vocab_.SetupMemory(SetupJustVocab(config, counts.size(), vocab_size, backing_), vocab_size, counts[0], config);
|
vocab_.SetupMemory(SetupJustVocab(config, counts.size(), vocab_size, backing_), vocab_size, counts[0], config);
|
||||||
|
|
||||||
if (config.write_mmap) {
|
if (config.write_mmap) {
|
||||||
@ -95,7 +95,7 @@ template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT
|
|||||||
|
|
||||||
if (!vocab_.SawUnk()) {
|
if (!vocab_.SawUnk()) {
|
||||||
assert(config.unknown_missing != THROW_UP);
|
assert(config.unknown_missing != THROW_UP);
|
||||||
// Default probabilities for unknown.
|
// Default probabilities for unknown.
|
||||||
search_.UnknownUnigram().backoff = 0.0;
|
search_.UnknownUnigram().backoff = 0.0;
|
||||||
search_.UnknownUnigram().prob = config.unknown_missing_logprob;
|
search_.UnknownUnigram().prob = config.unknown_missing_logprob;
|
||||||
}
|
}
|
||||||
@ -147,7 +147,7 @@ template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search,
|
|||||||
}
|
}
|
||||||
|
|
||||||
template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT>::GetState(const WordIndex *context_rbegin, const WordIndex *context_rend, State &out_state) const {
|
template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT>::GetState(const WordIndex *context_rbegin, const WordIndex *context_rend, State &out_state) const {
|
||||||
// Generate a state from context.
|
// Generate a state from context.
|
||||||
context_rend = std::min(context_rend, context_rbegin + P::Order() - 1);
|
context_rend = std::min(context_rend, context_rbegin + P::Order() - 1);
|
||||||
if (context_rend == context_rbegin) {
|
if (context_rend == context_rbegin) {
|
||||||
out_state.length = 0;
|
out_state.length = 0;
|
||||||
@ -191,7 +191,7 @@ template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search,
|
|||||||
ret.rest = ptr.Rest();
|
ret.rest = ptr.Rest();
|
||||||
ret.prob = ptr.Prob();
|
ret.prob = ptr.Prob();
|
||||||
ret.extend_left = extend_pointer;
|
ret.extend_left = extend_pointer;
|
||||||
// If this function is called, then it does depend on left words.
|
// If this function is called, then it does depend on left words.
|
||||||
ret.independent_left = false;
|
ret.independent_left = false;
|
||||||
}
|
}
|
||||||
float subtract_me = ret.rest;
|
float subtract_me = ret.rest;
|
||||||
@ -199,7 +199,7 @@ template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search,
|
|||||||
next_use = extend_length;
|
next_use = extend_length;
|
||||||
ResumeScore(add_rbegin, add_rend, extend_length - 1, node, backoff_out, next_use, ret);
|
ResumeScore(add_rbegin, add_rend, extend_length - 1, node, backoff_out, next_use, ret);
|
||||||
next_use -= extend_length;
|
next_use -= extend_length;
|
||||||
// Charge backoffs.
|
// Charge backoffs.
|
||||||
for (const float *b = backoff_in + ret.ngram_length - extend_length; b < backoff_in + (add_rend - add_rbegin); ++b) ret.prob += *b;
|
for (const float *b = backoff_in + ret.ngram_length - extend_length; b < backoff_in + (add_rend - add_rbegin); ++b) ret.prob += *b;
|
||||||
ret.prob -= subtract_me;
|
ret.prob -= subtract_me;
|
||||||
ret.rest -= subtract_me;
|
ret.rest -= subtract_me;
|
||||||
@ -209,7 +209,7 @@ template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search,
|
|||||||
namespace {
|
namespace {
|
||||||
// Do a paraonoid copy of history, assuming new_word has already been copied
|
// Do a paraonoid copy of history, assuming new_word has already been copied
|
||||||
// (hence the -1). out_state.length could be zero so I avoided using
|
// (hence the -1). out_state.length could be zero so I avoided using
|
||||||
// std::copy.
|
// std::copy.
|
||||||
void CopyRemainingHistory(const WordIndex *from, State &out_state) {
|
void CopyRemainingHistory(const WordIndex *from, State &out_state) {
|
||||||
WordIndex *out = out_state.words + 1;
|
WordIndex *out = out_state.words + 1;
|
||||||
const WordIndex *in_end = from + static_cast<ptrdiff_t>(out_state.length) - 1;
|
const WordIndex *in_end = from + static_cast<ptrdiff_t>(out_state.length) - 1;
|
||||||
@ -217,10 +217,10 @@ void CopyRemainingHistory(const WordIndex *from, State &out_state) {
|
|||||||
}
|
}
|
||||||
} // namespace
|
} // namespace
|
||||||
|
|
||||||
/* Ugly optimized function. Produce a score excluding backoff.
|
/* Ugly optimized function. Produce a score excluding backoff.
|
||||||
* The search goes in increasing order of ngram length.
|
* The search goes in increasing order of ngram length.
|
||||||
* Context goes backward, so context_begin is the word immediately preceeding
|
* Context goes backward, so context_begin is the word immediately preceeding
|
||||||
* new_word.
|
* new_word.
|
||||||
*/
|
*/
|
||||||
template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search, VocabularyT>::ScoreExceptBackoff(
|
template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search, VocabularyT>::ScoreExceptBackoff(
|
||||||
const WordIndex *const context_rbegin,
|
const WordIndex *const context_rbegin,
|
||||||
@ -229,7 +229,7 @@ template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search,
|
|||||||
State &out_state) const {
|
State &out_state) const {
|
||||||
assert(new_word < vocab_.Bound());
|
assert(new_word < vocab_.Bound());
|
||||||
FullScoreReturn ret;
|
FullScoreReturn ret;
|
||||||
// ret.ngram_length contains the last known non-blank ngram length.
|
// ret.ngram_length contains the last known non-blank ngram length.
|
||||||
ret.ngram_length = 1;
|
ret.ngram_length = 1;
|
||||||
|
|
||||||
typename Search::Node node;
|
typename Search::Node node;
|
||||||
@ -238,9 +238,9 @@ template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search,
|
|||||||
ret.prob = uni.Prob();
|
ret.prob = uni.Prob();
|
||||||
ret.rest = uni.Rest();
|
ret.rest = uni.Rest();
|
||||||
|
|
||||||
// This is the length of the context that should be used for continuation to the right.
|
// This is the length of the context that should be used for continuation to the right.
|
||||||
out_state.length = HasExtension(out_state.backoff[0]) ? 1 : 0;
|
out_state.length = HasExtension(out_state.backoff[0]) ? 1 : 0;
|
||||||
// We'll write the word anyway since it will probably be used and does no harm being there.
|
// We'll write the word anyway since it will probably be used and does no harm being there.
|
||||||
out_state.words[0] = new_word;
|
out_state.words[0] = new_word;
|
||||||
if (context_rbegin == context_rend) return ret;
|
if (context_rbegin == context_rend) return ret;
|
||||||
|
|
||||||
|
@ -55,7 +55,7 @@ struct ProbPointer {
|
|||||||
uint64_t index;
|
uint64_t index;
|
||||||
};
|
};
|
||||||
|
|
||||||
// Array of n-grams and float indices.
|
// Array of n-grams and float indices.
|
||||||
class BackoffMessages {
|
class BackoffMessages {
|
||||||
public:
|
public:
|
||||||
void Init(std::size_t entry_size) {
|
void Init(std::size_t entry_size) {
|
||||||
@ -100,7 +100,7 @@ class BackoffMessages {
|
|||||||
void Apply(float *const *const base, RecordReader &reader) {
|
void Apply(float *const *const base, RecordReader &reader) {
|
||||||
FinishedAdding();
|
FinishedAdding();
|
||||||
if (current_ == allocated_) return;
|
if (current_ == allocated_) return;
|
||||||
// We'll also use the same buffer to record messages to blanks that they extend.
|
// We'll also use the same buffer to record messages to blanks that they extend.
|
||||||
WordIndex *extend_out = reinterpret_cast<WordIndex*>(current_);
|
WordIndex *extend_out = reinterpret_cast<WordIndex*>(current_);
|
||||||
const unsigned char order = (entry_size_ - sizeof(ProbPointer)) / sizeof(WordIndex);
|
const unsigned char order = (entry_size_ - sizeof(ProbPointer)) / sizeof(WordIndex);
|
||||||
for (reader.Rewind(); reader && (current_ != allocated_); ) {
|
for (reader.Rewind(); reader && (current_ != allocated_); ) {
|
||||||
@ -109,7 +109,7 @@ class BackoffMessages {
|
|||||||
++reader;
|
++reader;
|
||||||
break;
|
break;
|
||||||
case 1:
|
case 1:
|
||||||
// Message but nobody to receive it. Write it down at the beginning of the buffer so we can inform this blank that it extends.
|
// Message but nobody to receive it. Write it down at the beginning of the buffer so we can inform this blank that it extends.
|
||||||
for (const WordIndex *w = reinterpret_cast<const WordIndex *>(current_); w != reinterpret_cast<const WordIndex *>(current_) + order; ++w, ++extend_out) *extend_out = *w;
|
for (const WordIndex *w = reinterpret_cast<const WordIndex *>(current_); w != reinterpret_cast<const WordIndex *>(current_) + order; ++w, ++extend_out) *extend_out = *w;
|
||||||
current_ += entry_size_;
|
current_ += entry_size_;
|
||||||
break;
|
break;
|
||||||
@ -126,7 +126,7 @@ class BackoffMessages {
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// Now this is a list of blanks that extend right.
|
// Now this is a list of blanks that extend right.
|
||||||
entry_size_ = sizeof(WordIndex) * order;
|
entry_size_ = sizeof(WordIndex) * order;
|
||||||
Resize(sizeof(WordIndex) * (extend_out - (const WordIndex*)backing_.get()));
|
Resize(sizeof(WordIndex) * (extend_out - (const WordIndex*)backing_.get()));
|
||||||
current_ = (uint8_t*)backing_.get();
|
current_ = (uint8_t*)backing_.get();
|
||||||
@ -153,7 +153,7 @@ class BackoffMessages {
|
|||||||
private:
|
private:
|
||||||
void FinishedAdding() {
|
void FinishedAdding() {
|
||||||
Resize(current_ - (uint8_t*)backing_.get());
|
Resize(current_ - (uint8_t*)backing_.get());
|
||||||
// Sort requests in same order as files.
|
// Sort requests in same order as files.
|
||||||
std::sort(
|
std::sort(
|
||||||
util::SizedIterator(util::SizedProxy(backing_.get(), entry_size_)),
|
util::SizedIterator(util::SizedProxy(backing_.get(), entry_size_)),
|
||||||
util::SizedIterator(util::SizedProxy(current_, entry_size_)),
|
util::SizedIterator(util::SizedProxy(current_, entry_size_)),
|
||||||
@ -220,7 +220,7 @@ class SRISucks {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
// This used to be one array. Then I needed to separate it by order for quantization to work.
|
// This used to be one array. Then I needed to separate it by order for quantization to work.
|
||||||
std::vector<float> values_[KENLM_MAX_ORDER - 1];
|
std::vector<float> values_[KENLM_MAX_ORDER - 1];
|
||||||
BackoffMessages messages_[KENLM_MAX_ORDER - 1];
|
BackoffMessages messages_[KENLM_MAX_ORDER - 1];
|
||||||
|
|
||||||
@ -253,7 +253,7 @@ class FindBlanks {
|
|||||||
++counts_.back();
|
++counts_.back();
|
||||||
}
|
}
|
||||||
|
|
||||||
// Unigrams wrote one past.
|
// Unigrams wrote one past.
|
||||||
void Cleanup() {
|
void Cleanup() {
|
||||||
--counts_[0];
|
--counts_[0];
|
||||||
}
|
}
|
||||||
@ -270,15 +270,15 @@ class FindBlanks {
|
|||||||
SRISucks &sri_;
|
SRISucks &sri_;
|
||||||
};
|
};
|
||||||
|
|
||||||
// Phase to actually write n-grams to the trie.
|
// Phase to actually write n-grams to the trie.
|
||||||
template <class Quant, class Bhiksha> class WriteEntries {
|
template <class Quant, class Bhiksha> class WriteEntries {
|
||||||
public:
|
public:
|
||||||
WriteEntries(RecordReader *contexts, const Quant &quant, UnigramValue *unigrams, BitPackedMiddle<Bhiksha> *middle, BitPackedLongest &longest, unsigned char order, SRISucks &sri) :
|
WriteEntries(RecordReader *contexts, const Quant &quant, UnigramValue *unigrams, BitPackedMiddle<Bhiksha> *middle, BitPackedLongest &longest, unsigned char order, SRISucks &sri) :
|
||||||
contexts_(contexts),
|
contexts_(contexts),
|
||||||
quant_(quant),
|
quant_(quant),
|
||||||
unigrams_(unigrams),
|
unigrams_(unigrams),
|
||||||
middle_(middle),
|
middle_(middle),
|
||||||
longest_(longest),
|
longest_(longest),
|
||||||
bigram_pack_((order == 2) ? static_cast<BitPacked&>(longest_) : static_cast<BitPacked&>(*middle_)),
|
bigram_pack_((order == 2) ? static_cast<BitPacked&>(longest_) : static_cast<BitPacked&>(*middle_)),
|
||||||
order_(order),
|
order_(order),
|
||||||
sri_(sri) {}
|
sri_(sri) {}
|
||||||
@ -328,7 +328,7 @@ struct Gram {
|
|||||||
|
|
||||||
const WordIndex *begin, *end;
|
const WordIndex *begin, *end;
|
||||||
|
|
||||||
// For queue, this is the direction we want.
|
// For queue, this is the direction we want.
|
||||||
bool operator<(const Gram &other) const {
|
bool operator<(const Gram &other) const {
|
||||||
return std::lexicographical_compare(other.begin, other.end, begin, end);
|
return std::lexicographical_compare(other.begin, other.end, begin, end);
|
||||||
}
|
}
|
||||||
@ -353,7 +353,7 @@ template <class Doing> class BlankManager {
|
|||||||
been_length_ = length;
|
been_length_ = length;
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
// There are blanks to insert starting with order blank.
|
// There are blanks to insert starting with order blank.
|
||||||
unsigned char blank = cur - to + 1;
|
unsigned char blank = cur - to + 1;
|
||||||
UTIL_THROW_IF(blank == 1, FormatLoadException, "Missing a unigram that appears as context.");
|
UTIL_THROW_IF(blank == 1, FormatLoadException, "Missing a unigram that appears as context.");
|
||||||
const float *lower_basis;
|
const float *lower_basis;
|
||||||
@ -363,7 +363,7 @@ template <class Doing> class BlankManager {
|
|||||||
assert(*lower_basis != kBadProb);
|
assert(*lower_basis != kBadProb);
|
||||||
doing_.MiddleBlank(blank, to, based_on, *lower_basis);
|
doing_.MiddleBlank(blank, to, based_on, *lower_basis);
|
||||||
*pre = *cur;
|
*pre = *cur;
|
||||||
// Mark that the probability is a blank so it shouldn't be used as the basis for a later n-gram.
|
// Mark that the probability is a blank so it shouldn't be used as the basis for a later n-gram.
|
||||||
basis_[blank - 1] = kBadProb;
|
basis_[blank - 1] = kBadProb;
|
||||||
}
|
}
|
||||||
*pre = *cur;
|
*pre = *cur;
|
||||||
@ -377,7 +377,7 @@ template <class Doing> class BlankManager {
|
|||||||
unsigned char been_length_;
|
unsigned char been_length_;
|
||||||
|
|
||||||
float basis_[KENLM_MAX_ORDER];
|
float basis_[KENLM_MAX_ORDER];
|
||||||
|
|
||||||
Doing &doing_;
|
Doing &doing_;
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -451,7 +451,7 @@ template <class Quant> void TrainProbQuantizer(uint8_t order, uint64_t count, Re
|
|||||||
}
|
}
|
||||||
|
|
||||||
void PopulateUnigramWeights(FILE *file, WordIndex unigram_count, RecordReader &contexts, UnigramValue *unigrams) {
|
void PopulateUnigramWeights(FILE *file, WordIndex unigram_count, RecordReader &contexts, UnigramValue *unigrams) {
|
||||||
// Fill unigram probabilities.
|
// Fill unigram probabilities.
|
||||||
try {
|
try {
|
||||||
rewind(file);
|
rewind(file);
|
||||||
for (WordIndex i = 0; i < unigram_count; ++i) {
|
for (WordIndex i = 0; i < unigram_count; ++i) {
|
||||||
@ -486,7 +486,7 @@ template <class Quant, class Bhiksha> void BuildTrie(SortedFiles &files, std::ve
|
|||||||
util::scoped_memory unigrams;
|
util::scoped_memory unigrams;
|
||||||
MapRead(util::POPULATE_OR_READ, unigram_fd.get(), 0, counts[0] * sizeof(ProbBackoff), unigrams);
|
MapRead(util::POPULATE_OR_READ, unigram_fd.get(), 0, counts[0] * sizeof(ProbBackoff), unigrams);
|
||||||
FindBlanks finder(counts.size(), reinterpret_cast<const ProbBackoff*>(unigrams.get()), sri);
|
FindBlanks finder(counts.size(), reinterpret_cast<const ProbBackoff*>(unigrams.get()), sri);
|
||||||
RecursiveInsert(counts.size(), counts[0], inputs, config.messages, "Identifying n-grams omitted by SRI", finder);
|
RecursiveInsert(counts.size(), counts[0], inputs, config.ProgressMessages(), "Identifying n-grams omitted by SRI", finder);
|
||||||
fixed_counts = finder.Counts();
|
fixed_counts = finder.Counts();
|
||||||
}
|
}
|
||||||
unigram_file.reset(util::FDOpenOrThrow(unigram_fd));
|
unigram_file.reset(util::FDOpenOrThrow(unigram_fd));
|
||||||
@ -504,7 +504,8 @@ template <class Quant, class Bhiksha> void BuildTrie(SortedFiles &files, std::ve
|
|||||||
inputs[i-2].Rewind();
|
inputs[i-2].Rewind();
|
||||||
}
|
}
|
||||||
if (Quant::kTrain) {
|
if (Quant::kTrain) {
|
||||||
util::ErsatzProgress progress(std::accumulate(counts.begin() + 1, counts.end(), 0), config.messages, "Quantizing");
|
util::ErsatzProgress progress(std::accumulate(counts.begin() + 1, counts.end(), 0),
|
||||||
|
config.ProgressMessages(), "Quantizing");
|
||||||
for (unsigned char i = 2; i < counts.size(); ++i) {
|
for (unsigned char i = 2; i < counts.size(); ++i) {
|
||||||
TrainQuantizer(i, counts[i-1], sri.Values(i), inputs[i-2], progress, quant);
|
TrainQuantizer(i, counts[i-1], sri.Values(i), inputs[i-2], progress, quant);
|
||||||
}
|
}
|
||||||
@ -519,13 +520,13 @@ template <class Quant, class Bhiksha> void BuildTrie(SortedFiles &files, std::ve
|
|||||||
for (unsigned char i = 2; i <= counts.size(); ++i) {
|
for (unsigned char i = 2; i <= counts.size(); ++i) {
|
||||||
inputs[i-2].Rewind();
|
inputs[i-2].Rewind();
|
||||||
}
|
}
|
||||||
// Fill entries except unigram probabilities.
|
// Fill entries except unigram probabilities.
|
||||||
{
|
{
|
||||||
WriteEntries<Quant, Bhiksha> writer(contexts, quant, unigrams, out.middle_begin_, out.longest_, counts.size(), sri);
|
WriteEntries<Quant, Bhiksha> writer(contexts, quant, unigrams, out.middle_begin_, out.longest_, counts.size(), sri);
|
||||||
RecursiveInsert(counts.size(), counts[0], inputs, config.messages, "Writing trie", writer);
|
RecursiveInsert(counts.size(), counts[0], inputs, config.ProgressMessages(), "Writing trie", writer);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Do not disable this error message or else too little state will be returned. Both WriteEntries::Middle and returning state based on found n-grams will need to be fixed to handle this situation.
|
// Do not disable this error message or else too little state will be returned. Both WriteEntries::Middle and returning state based on found n-grams will need to be fixed to handle this situation.
|
||||||
for (unsigned char order = 2; order <= counts.size(); ++order) {
|
for (unsigned char order = 2; order <= counts.size(); ++order) {
|
||||||
const RecordReader &context = contexts[order - 2];
|
const RecordReader &context = contexts[order - 2];
|
||||||
if (context) {
|
if (context) {
|
||||||
@ -541,13 +542,13 @@ template <class Quant, class Bhiksha> void BuildTrie(SortedFiles &files, std::ve
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* Set ending offsets so the last entry will be sized properly */
|
/* Set ending offsets so the last entry will be sized properly */
|
||||||
// Last entry for unigrams was already set.
|
// Last entry for unigrams was already set.
|
||||||
if (out.middle_begin_ != out.middle_end_) {
|
if (out.middle_begin_ != out.middle_end_) {
|
||||||
for (typename TrieSearch<Quant, Bhiksha>::Middle *i = out.middle_begin_; i != out.middle_end_ - 1; ++i) {
|
for (typename TrieSearch<Quant, Bhiksha>::Middle *i = out.middle_begin_; i != out.middle_end_ - 1; ++i) {
|
||||||
i->FinishedLoading((i+1)->InsertIndex(), config);
|
i->FinishedLoading((i+1)->InsertIndex(), config);
|
||||||
}
|
}
|
||||||
(out.middle_end_ - 1)->FinishedLoading(out.longest_.InsertIndex(), config);
|
(out.middle_end_ - 1)->FinishedLoading(out.longest_.InsertIndex(), config);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class Quant, class Bhiksha> uint8_t *TrieSearch<Quant, Bhiksha>::SetupMemory(uint8_t *start, const std::vector<uint64_t> &counts, const Config &config) {
|
template <class Quant, class Bhiksha> uint8_t *TrieSearch<Quant, Bhiksha>::SetupMemory(uint8_t *start, const std::vector<uint64_t> &counts, const Config &config) {
|
||||||
@ -595,7 +596,7 @@ template <class Quant, class Bhiksha> void TrieSearch<Quant, Bhiksha>::Initializ
|
|||||||
} else {
|
} else {
|
||||||
temporary_prefix = file;
|
temporary_prefix = file;
|
||||||
}
|
}
|
||||||
// At least 1MB sorting memory.
|
// At least 1MB sorting memory.
|
||||||
SortedFiles sorted(config, f, counts, std::max<size_t>(config.building_memory, 1048576), temporary_prefix, vocab);
|
SortedFiles sorted(config, f, counts, std::max<size_t>(config.building_memory, 1048576), temporary_prefix, vocab);
|
||||||
|
|
||||||
BuildTrie(sorted, counts, config, *this, quant_, vocab, backing);
|
BuildTrie(sorted, counts, config, *this, quant_, vocab, backing);
|
||||||
|
@ -38,7 +38,7 @@ void ErsatzProgress::Milestone() {
|
|||||||
next_ = std::numeric_limits<uint64_t>::max();
|
next_ = std::numeric_limits<uint64_t>::max();
|
||||||
out_ = NULL;
|
out_ = NULL;
|
||||||
} else {
|
} else {
|
||||||
next_ = std::max(next_, (stone * complete_) / kWidth);
|
next_ = std::max(next_, ((stone + 1) * complete_ + kWidth - 1) / kWidth);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -32,7 +32,6 @@ class ErsatzProgress {
|
|||||||
|
|
||||||
void Set(uint64_t to) {
|
void Set(uint64_t to) {
|
||||||
if ((current_ = to) >= next_) Milestone();
|
if ((current_ = to) >= next_) Milestone();
|
||||||
Milestone();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void Finished() {
|
void Finished() {
|
||||||
|
@ -1,3 +1,5 @@
|
|||||||
|
#define _LARGEFILE64_SOURCE
|
||||||
|
|
||||||
#include "util/file.hh"
|
#include "util/file.hh"
|
||||||
|
|
||||||
#include "util/exception.hh"
|
#include "util/exception.hh"
|
||||||
@ -91,7 +93,7 @@ void ReadOrThrow(int fd, void *to_void, std::size_t amount) {
|
|||||||
uint8_t *to = static_cast<uint8_t*>(to_void);
|
uint8_t *to = static_cast<uint8_t*>(to_void);
|
||||||
while (amount) {
|
while (amount) {
|
||||||
std::size_t ret = PartialRead(fd, to, amount);
|
std::size_t ret = PartialRead(fd, to, amount);
|
||||||
UTIL_THROW_IF(ret == 0, EndOfFileException, "Hit EOF in fd " << fd << " but there should be " << amount << " more bytes to read.");
|
UTIL_THROW_IF(ret == 0, EndOfFileException, " in fd " << fd << " but there should be " << amount << " more bytes to read.");
|
||||||
amount -= ret;
|
amount -= ret;
|
||||||
to += ret;
|
to += ret;
|
||||||
}
|
}
|
||||||
@ -141,7 +143,7 @@ void InternalSeek(int fd, int64_t off, int whence) {
|
|||||||
UTIL_THROW_IF((__int64)-1 == _lseeki64(fd, off, whence), ErrnoException, "Windows seek failed");
|
UTIL_THROW_IF((__int64)-1 == _lseeki64(fd, off, whence), ErrnoException, "Windows seek failed");
|
||||||
|
|
||||||
#else
|
#else
|
||||||
UTIL_THROW_IF((off_t)-1 == lseek(fd, off, whence), ErrnoException, "Seek failed");
|
UTIL_THROW_IF((off_t)-1 == lseek64(fd, off, whence), ErrnoException, "Seek failed");
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
} // namespace
|
} // namespace
|
||||||
|
@ -32,8 +32,6 @@ class scoped_fd {
|
|||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
operator bool() { return fd_ != -1; }
|
|
||||||
|
|
||||||
private:
|
private:
|
||||||
int fd_;
|
int fd_;
|
||||||
|
|
||||||
|
@ -23,17 +23,17 @@ class ParseNumberException : public Exception {
|
|||||||
|
|
||||||
extern const bool kSpaces[256];
|
extern const bool kSpaces[256];
|
||||||
|
|
||||||
// Memory backing the returned StringPiece may vanish on the next call.
|
// Memory backing the returned StringPiece may vanish on the next call.
|
||||||
class FilePiece {
|
class FilePiece {
|
||||||
public:
|
public:
|
||||||
// 1 MB default.
|
// 1 MB default.
|
||||||
explicit FilePiece(const char *file, std::ostream *show_progress = NULL, std::size_t min_buffer = 1048576);
|
explicit FilePiece(const char *file, std::ostream *show_progress = NULL, std::size_t min_buffer = 1048576);
|
||||||
// Takes ownership of fd. name is used for messages.
|
// Takes ownership of fd. name is used for messages.
|
||||||
explicit FilePiece(int fd, const char *name, std::ostream *show_progress = NULL, std::size_t min_buffer = 1048576);
|
explicit FilePiece(int fd, const char *name, std::ostream *show_progress = NULL, std::size_t min_buffer = 1048576);
|
||||||
|
|
||||||
~FilePiece();
|
~FilePiece();
|
||||||
|
|
||||||
char get() {
|
char get() {
|
||||||
if (position_ == position_end_) {
|
if (position_ == position_end_) {
|
||||||
Shift();
|
Shift();
|
||||||
if (at_end_) throw EndOfFileException();
|
if (at_end_) throw EndOfFileException();
|
||||||
@ -41,14 +41,14 @@ class FilePiece {
|
|||||||
return *(position_++);
|
return *(position_++);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Leaves the delimiter, if any, to be returned by get(). Delimiters defined by isspace().
|
// Leaves the delimiter, if any, to be returned by get(). Delimiters defined by isspace().
|
||||||
StringPiece ReadDelimited(const bool *delim = kSpaces) {
|
StringPiece ReadDelimited(const bool *delim = kSpaces) {
|
||||||
SkipSpaces(delim);
|
SkipSpaces(delim);
|
||||||
return Consume(FindDelimiterOrEOF(delim));
|
return Consume(FindDelimiterOrEOF(delim));
|
||||||
}
|
}
|
||||||
|
|
||||||
// Unlike ReadDelimited, this includes leading spaces and consumes the delimiter.
|
// Unlike ReadDelimited, this includes leading spaces and consumes the delimiter.
|
||||||
// It is similar to getline in that way.
|
// It is similar to getline in that way.
|
||||||
StringPiece ReadLine(char delim = '\n');
|
StringPiece ReadLine(char delim = '\n');
|
||||||
|
|
||||||
float ReadFloat();
|
float ReadFloat();
|
||||||
@ -56,7 +56,7 @@ class FilePiece {
|
|||||||
long int ReadLong();
|
long int ReadLong();
|
||||||
unsigned long int ReadULong();
|
unsigned long int ReadULong();
|
||||||
|
|
||||||
// Skip spaces defined by isspace.
|
// Skip spaces defined by isspace.
|
||||||
void SkipSpaces(const bool *delim = kSpaces) {
|
void SkipSpaces(const bool *delim = kSpaces) {
|
||||||
for (; ; ++position_) {
|
for (; ; ++position_) {
|
||||||
if (position_ == position_end_) Shift();
|
if (position_ == position_end_) Shift();
|
||||||
@ -69,7 +69,7 @@ class FilePiece {
|
|||||||
}
|
}
|
||||||
|
|
||||||
const std::string &FileName() const { return file_name_; }
|
const std::string &FileName() const { return file_name_; }
|
||||||
|
|
||||||
private:
|
private:
|
||||||
void Initialize(const char *name, std::ostream *show_progress, std::size_t min_buffer);
|
void Initialize(const char *name, std::ostream *show_progress, std::size_t min_buffer);
|
||||||
|
|
||||||
|
@ -6,8 +6,8 @@
|
|||||||
//#define HAVE_ICU
|
//#define HAVE_ICU
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifndef HAVE_THREADS
|
#ifndef HAVE_BOOST
|
||||||
//#define HAVE_THREADS
|
#define HAVE_BOOST
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#endif // UTIL_HAVE__
|
#endif // UTIL_HAVE__
|
||||||
|
@ -60,7 +60,7 @@ template <class KeyIter, class ValueIter> class JointProxy {
|
|||||||
JointProxy(const KeyIter &key_iter, const ValueIter &value_iter) : inner_(key_iter, value_iter) {}
|
JointProxy(const KeyIter &key_iter, const ValueIter &value_iter) : inner_(key_iter, value_iter) {}
|
||||||
JointProxy(const JointProxy<KeyIter, ValueIter> &other) : inner_(other.inner_) {}
|
JointProxy(const JointProxy<KeyIter, ValueIter> &other) : inner_(other.inner_) {}
|
||||||
|
|
||||||
operator const value_type() const {
|
operator value_type() const {
|
||||||
value_type ret;
|
value_type ret;
|
||||||
ret.key = *inner_.key_;
|
ret.key = *inner_.key_;
|
||||||
ret.value = *inner_.value_;
|
ret.value = *inner_.value_;
|
||||||
@ -121,7 +121,7 @@ template <class Proxy, class Less> class LessWrapper : public std::binary_functi
|
|||||||
|
|
||||||
template <class KeyIter, class ValueIter> class PairedIterator : public ProxyIterator<detail::JointProxy<KeyIter, ValueIter> > {
|
template <class KeyIter, class ValueIter> class PairedIterator : public ProxyIterator<detail::JointProxy<KeyIter, ValueIter> > {
|
||||||
public:
|
public:
|
||||||
PairedIterator(const KeyIter &key, const ValueIter &value) :
|
PairedIterator(const KeyIter &key, const ValueIter &value) :
|
||||||
ProxyIterator<detail::JointProxy<KeyIter, ValueIter> >(detail::JointProxy<KeyIter, ValueIter>(key, value)) {}
|
ProxyIterator<detail::JointProxy<KeyIter, ValueIter> >(detail::JointProxy<KeyIter, ValueIter>(key, value)) {}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -370,7 +370,7 @@ ReadBase *ReadFactory(int fd, uint64_t &raw_amount) {
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
try {
|
try {
|
||||||
AdvanceOrThrow(fd, -ReadCompressed::kMagicSize);
|
SeekOrThrow(fd, 0);
|
||||||
} catch (const util::ErrnoException &e) {
|
} catch (const util::ErrnoException &e) {
|
||||||
return new UncompressedWithHeader(hold.release(), header, ReadCompressed::kMagicSize);
|
return new UncompressedWithHeader(hold.release(), header, ReadCompressed::kMagicSize);
|
||||||
}
|
}
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
/* If you use ICU in your program, then compile with -DHAVE_ICU -licui18n. If
|
/* If you use ICU in your program, then compile with -DHAVE_ICU -licui18n. If
|
||||||
* you don't use ICU, then this will use the Google implementation from Chrome.
|
* you don't use ICU, then this will use the Google implementation from Chrome.
|
||||||
* This has been modified from the original version to let you choose.
|
* This has been modified from the original version to let you choose.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
// Copyright 2008, Google Inc.
|
// Copyright 2008, Google Inc.
|
||||||
@ -49,7 +49,11 @@
|
|||||||
#define BASE_STRING_PIECE_H__
|
#define BASE_STRING_PIECE_H__
|
||||||
|
|
||||||
#include "util/have.hh"
|
#include "util/have.hh"
|
||||||
|
|
||||||
|
#ifdef HAVE_BOOST
|
||||||
#include <boost/functional/hash/hash.hpp>
|
#include <boost/functional/hash/hash.hpp>
|
||||||
|
#endif // HAVE_BOOST
|
||||||
|
|
||||||
#include <cstring>
|
#include <cstring>
|
||||||
#include <iosfwd>
|
#include <iosfwd>
|
||||||
#include <ostream>
|
#include <ostream>
|
||||||
@ -58,9 +62,9 @@
|
|||||||
#include <unicode/stringpiece.h>
|
#include <unicode/stringpiece.h>
|
||||||
#include <unicode/uversion.h>
|
#include <unicode/uversion.h>
|
||||||
|
|
||||||
// Old versions of ICU don't define operator== and operator!=.
|
// Old versions of ICU don't define operator== and operator!=.
|
||||||
#if (U_ICU_VERSION_MAJOR_NUM < 4) || ((U_ICU_VERSION_MAJOR_NUM == 4) && (U_ICU_VERSION_MINOR_NUM < 4))
|
#if (U_ICU_VERSION_MAJOR_NUM < 4) || ((U_ICU_VERSION_MAJOR_NUM == 4) && (U_ICU_VERSION_MINOR_NUM < 4))
|
||||||
#warning You are using an old version of ICU. Consider upgrading to ICU >= 4.6.
|
#warning You are using an old version of ICU. Consider upgrading to ICU >= 4.6.
|
||||||
inline bool operator==(const StringPiece& x, const StringPiece& y) {
|
inline bool operator==(const StringPiece& x, const StringPiece& y) {
|
||||||
if (x.size() != y.size())
|
if (x.size() != y.size())
|
||||||
return false;
|
return false;
|
||||||
@ -252,6 +256,7 @@ inline std::ostream& operator<<(std::ostream& o, const StringPiece& piece) {
|
|||||||
return o.write(piece.data(), static_cast<std::streamsize>(piece.size()));
|
return o.write(piece.data(), static_cast<std::streamsize>(piece.size()));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef HAVE_BOOST
|
||||||
inline size_t hash_value(const StringPiece &str) {
|
inline size_t hash_value(const StringPiece &str) {
|
||||||
return boost::hash_range(str.data(), str.data() + str.length());
|
return boost::hash_range(str.data(), str.data() + str.length());
|
||||||
}
|
}
|
||||||
@ -285,9 +290,12 @@ template <class T> typename T::iterator FindStringPiece(T &t, const StringPiece
|
|||||||
return t.find(key, StringPieceCompatibleHash(), StringPieceCompatibleEquals());
|
return t.find(key, StringPieceCompatibleHash(), StringPieceCompatibleEquals());
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef HAVE_ICU
|
#ifdef HAVE_ICU
|
||||||
U_NAMESPACE_END
|
U_NAMESPACE_END
|
||||||
|
using U_NAMESPACE_QUALIFIER StringPiece;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
#endif // BASE_STRING_PIECE_H__
|
#endif // BASE_STRING_PIECE_H__
|
||||||
|
Loading…
Reference in New Issue
Block a user