KenLM 98814b2 including faster malloc-backed building and portability improvements

commit 1be424bcb3, parent bf54a5f38b
@@ -87,7 +87,7 @@ uint8_t *SetupJustVocab(const Config &config, uint8_t order, std::size_t memory_
     strncpy(reinterpret_cast<char*>(backing.vocab.get()), kMagicIncomplete, TotalHeaderSize(order));
     return reinterpret_cast<uint8_t*>(backing.vocab.get()) + TotalHeaderSize(order);
   } else {
-    backing.vocab.reset(util::MapAnonymous(memory_size), memory_size, util::scoped_memory::MMAP_ALLOCATED);
+    util::MapAnonymous(memory_size, backing.vocab);
     return reinterpret_cast<uint8_t*>(backing.vocab.get());
   }
 }
@@ -103,32 +103,44 @@ uint8_t *GrowForSearch(const Config &config, std::size_t vocab_pad, std::size_t
       throw e;
     }

+    if (config.write_method == Config::WRITE_AFTER) {
+      util::MapAnonymous(memory_size, backing.search);
+      return reinterpret_cast<uint8_t*>(backing.search.get());
+    }
+    // mmap it now.
     // We're skipping over the header and vocab for the search space mmap. mmap likes page aligned offsets, so some arithmetic to round the offset down.
     std::size_t page_size = util::SizePage();
     std::size_t alignment_cruft = adjusted_vocab % page_size;
     backing.search.reset(util::MapOrThrow(alignment_cruft + memory_size, true, util::kFileFlags, false, backing.file.get(), adjusted_vocab - alignment_cruft), alignment_cruft + memory_size, util::scoped_memory::MMAP_ALLOCATED);

     return reinterpret_cast<uint8_t*>(backing.search.get()) + alignment_cruft;
   } else {
-    backing.search.reset(util::MapAnonymous(memory_size), memory_size, util::scoped_memory::MMAP_ALLOCATED);
+    util::MapAnonymous(memory_size, backing.search);
     return reinterpret_cast<uint8_t*>(backing.search.get());
   }
 }

-void FinishFile(const Config &config, ModelType model_type, unsigned int search_version, const std::vector<uint64_t> &counts, Backing &backing) {
-  if (config.write_mmap) {
-    util::SyncOrThrow(backing.search.get(), backing.search.size());
-    util::SyncOrThrow(backing.vocab.get(), backing.vocab.size());
-    // header and vocab share the same mmap. The header is written here because we know the counts.
-    Parameters params = Parameters();
-    params.counts = counts;
-    params.fixed.order = counts.size();
-    params.fixed.probing_multiplier = config.probing_multiplier;
-    params.fixed.model_type = model_type;
-    params.fixed.has_vocabulary = config.include_vocab;
-    params.fixed.search_version = search_version;
-    WriteHeader(backing.vocab.get(), params);
-  }
+void FinishFile(const Config &config, ModelType model_type, unsigned int search_version, const std::vector<uint64_t> &counts, std::size_t vocab_pad, Backing &backing) {
+  if (!config.write_mmap) return;
+  util::SyncOrThrow(backing.vocab.get(), backing.vocab.size());
+  switch (config.write_method) {
+    case Config::WRITE_MMAP:
+      util::SyncOrThrow(backing.search.get(), backing.search.size());
+      break;
+    case Config::WRITE_AFTER:
+      util::SeekOrThrow(backing.file.get(), backing.vocab.size() + vocab_pad);
+      util::WriteOrThrow(backing.file.get(), backing.search.get(), backing.search.size());
+      util::FSyncOrThrow(backing.file.get());
+      break;
+  }
+  // header and vocab share the same mmap. The header is written here because we know the counts.
+  Parameters params = Parameters();
+  params.counts = counts;
+  params.fixed.order = counts.size();
+  params.fixed.probing_multiplier = config.probing_multiplier;
+  params.fixed.model_type = model_type;
+  params.fixed.has_vocabulary = config.include_vocab;
+  params.fixed.search_version = search_version;
+  WriteHeader(backing.vocab.get(), params);
 }

 namespace detail {
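Note: the hunk above is the heart of the "malloc-backed building" change. With Config::WRITE_AFTER the search structure is built in plain anonymous memory and only written to the output file once it is complete, instead of being populated inside a file-backed mmap; WRITE_MMAP keeps the old behavior of syncing the mapping in place. A minimal standalone sketch of that write-after pattern follows; it is not KenLM's actual code, and the buffer, file name, and sizes are invented for illustration.

#include <cstdio>
#include <cstdlib>
#include <vector>

// Sketch: build a table in ordinary heap memory, then write it out in one pass.
// This mirrors the WRITE_AFTER idea: no page faults against the output file
// while building, and a single sequential write (plus a sync) at the end.
int main() {
  const std::size_t table_size = 1 << 20;           // hypothetical search size
  std::vector<unsigned char> search(table_size, 0); // anonymous, zeroed memory

  // ... populate `search` here (this is where building happens) ...

  std::FILE *out = std::fopen("model.bin", "wb");   // hypothetical output name
  if (!out) return 1;
  // Header and vocab would already occupy the start of the file; seek past them.
  const long header_and_vocab = 4096;               // hypothetical padding
  std::fseek(out, header_and_vocab, SEEK_SET);
  std::fwrite(&search[0], 1, search.size(), out);
  std::fflush(out);                                 // then fsync(fileno(out)) on POSIX
  std::fclose(out);
  return 0;
}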
@@ -172,7 +184,7 @@ void ReadHeader(int fd, Parameters &out) {
     UTIL_THROW(FormatLoadException, "Binary format claims to have a probing multiplier of " << out.fixed.probing_multiplier << " which is < 1.0.");

   out.counts.resize(static_cast<std::size_t>(out.fixed.order));
-  util::ReadOrThrow(fd, &*out.counts.begin(), sizeof(uint64_t) * out.fixed.order);
+  if (out.fixed.order) util::ReadOrThrow(fd, &*out.counts.begin(), sizeof(uint64_t) * out.fixed.order);
 }

 void MatchCheck(ModelType model_type, unsigned int search_version, const Parameters &params) {
@@ -58,7 +58,7 @@ uint8_t *GrowForSearch(const Config &config, std::size_t vocab_pad, std::size_t

 // Write header to binary file. This is done last to prevent incomplete files
 // from loading.
-void FinishFile(const Config &config, ModelType model_type, unsigned int search_version, const std::vector<uint64_t> &counts, Backing &backing);
+void FinishFile(const Config &config, ModelType model_type, unsigned int search_version, const std::vector<uint64_t> &counts, std::size_t vocab_pad, Backing &backing);

 namespace detail {

@@ -18,11 +18,14 @@ namespace ngram {
 namespace {

 void Usage(const char *name) {
-  std::cerr << "Usage: " << name << " [-u log10_unknown_probability] [-s] [-i] [-p probing_multiplier] [-t trie_temporary] [-m trie_building_megabytes] [-q bits] [-b bits] [-a bits] [type] input.arpa [output.mmap]\n\n"
+  std::cerr << "Usage: " << name << " [-u log10_unknown_probability] [-s] [-i] [-w mmap|after] [-p probing_multiplier] [-t trie_temporary] [-m trie_building_megabytes] [-q bits] [-b bits] [-a bits] [type] input.arpa [output.mmap]\n\n"
 "-u sets the log10 probability for <unk> if the ARPA file does not have one.\n"
 "   Default is -100. The ARPA file will always take precedence.\n"
 "-s allows models to be built even if they do not have <s> and </s>.\n"
-"-i allows buggy models from IRSTLM by mapping positive log probability to 0.\n\n"
+"-i allows buggy models from IRSTLM by mapping positive log probability to 0.\n"
+"-w mmap|after determines how writing is done.\n"
+"   mmap maps the binary file and writes to it. Default for trie.\n"
+"   after allocates anonymous memory, builds, and writes. Default for probing.\n\n"
 "type is either probing or trie. Default is probing.\n\n"
 "probing uses a probing hash table. It is the fastest but uses the most memory.\n"
 "-p sets the space multiplier and must be >1.0. The default is 1.5.\n\n"
@@ -58,7 +61,7 @@ uint8_t ParseBitCount(const char *from) {
   unsigned long val = ParseUInt(from);
   if (val > 25) {
     util::ParseNumberException e(from);
-    e << " bit counts are limited to 256.";
+    e << " bit counts are limited to 25.";
   }
   return val;
 }
@@ -115,10 +118,10 @@ int main(int argc, char *argv[]) {
   using namespace lm::ngram;

   try {
-    bool quantize = false, set_backoff_bits = false, bhiksha = false;
+    bool quantize = false, set_backoff_bits = false, bhiksha = false, set_write_method = false;
     lm::ngram::Config config;
     int opt;
-    while ((opt = getopt(argc, argv, "siu:p:t:m:q:b:a:")) != -1) {
+    while ((opt = getopt(argc, argv, "q:b:a:u:p:t:m:w:si")) != -1) {
       switch(opt) {
         case 'q':
           config.prob_bits = ParseBitCount(optarg);
@@ -132,6 +135,7 @@ int main(int argc, char *argv[]) {
         case 'a':
           config.pointer_bhiksha_bits = ParseBitCount(optarg);
           bhiksha = true;
+          break;
         case 'u':
           config.unknown_missing_logprob = ParseFloat(optarg);
           break;
@@ -144,6 +148,16 @@ int main(int argc, char *argv[]) {
         case 'm':
           config.building_memory = ParseUInt(optarg) * 1048576;
           break;
+        case 'w':
+          set_write_method = true;
+          if (!strcmp(optarg, "mmap")) {
+            config.write_method = Config::WRITE_MMAP;
+          } else if (!strcmp(optarg, "after")) {
+            config.write_method = Config::WRITE_AFTER;
+          } else {
+            Usage(argv[0]);
+          }
+          break;
         case 's':
           config.sentence_marker_missing = lm::SILENT;
           break;
@@ -160,45 +174,45 @@ int main(int argc, char *argv[]) {
     }
     if (optind + 1 == argc) {
       ShowSizes(argv[optind], config);
-      return 0;
-    }
-    const char *model_type, *from_file;
-    if (optind + 2 == argc) {
-      model_type = "probing";
-      from_file = argv[optind];
-      config.write_mmap = argv[optind + 1];
-    } else if (optind + 3 == argc) {
-      model_type = argv[optind];
-      from_file = argv[optind + 1];
-      config.write_mmap = argv[optind + 2];
-    } else {
-      Usage(argv[0]);
-    }
-    if (!strcmp(model_type, "probing")) {
-      if (quantize || set_backoff_bits) ProbingQuantizationUnsupported();
-      ProbingModel(from_file, config);
-    } else if (!strcmp(model_type, "trie")) {
-      if (quantize) {
-        if (bhiksha) {
-          QuantArrayTrieModel(from_file, config);
-        } else {
-          QuantTrieModel(from_file, config);
-        }
-      } else {
-        if (bhiksha) {
-          ArrayTrieModel(from_file, config);
-        } else {
-          TrieModel(from_file, config);
-        }
-      }
-    } else {
-      Usage(argv[0]);
-    }
-    std::cerr << "Built " << config.write_mmap << " successfully." << std::endl;
-  } catch (const std::exception &e) {
+    } else if (optind + 2 == argc) {
+      config.write_mmap = argv[optind + 1];
+      if (quantize || set_backoff_bits) ProbingQuantizationUnsupported();
+      ProbingModel(argv[optind], config);
+    } else if (optind + 3 == argc) {
+      const char *model_type = argv[optind];
+      const char *from_file = argv[optind + 1];
+      config.write_mmap = argv[optind + 2];
+      if (!strcmp(model_type, "probing")) {
+        if (!set_write_method) config.write_method = Config::WRITE_AFTER;
+        if (quantize || set_backoff_bits) ProbingQuantizationUnsupported();
+        ProbingModel(from_file, config);
+      } else if (!strcmp(model_type, "trie")) {
+        if (!set_write_method) config.write_method = Config::WRITE_MMAP;
+        if (quantize) {
+          if (bhiksha) {
+            QuantArrayTrieModel(from_file, config);
+          } else {
+            QuantTrieModel(from_file, config);
+          }
+        } else {
+          if (bhiksha) {
+            ArrayTrieModel(from_file, config);
+          } else {
+            TrieModel(from_file, config);
+          }
+        }
+      } else {
+        Usage(argv[0]);
+      }
+    } else {
+      Usage(argv[0]);
+    }
+  }
+  catch (const std::exception &e) {
     std::cerr << e.what() << std::endl;
+    std::cerr << "ERROR" << std::endl;
     return 1;
   }
+  std::cerr << "SUCCESS" << std::endl;
   return 0;
 }
@@ -17,6 +17,7 @@ Config::Config() :
   temporary_directory_prefix(NULL),
   arpa_complain(ALL),
   write_mmap(NULL),
+  write_method(WRITE_AFTER),
   include_vocab(true),
   prob_bits(8),
   backoff_bits(8),
@@ -70,9 +70,17 @@ struct Config {
   // to NULL to disable.
   const char *write_mmap;

+  typedef enum {
+    WRITE_MMAP, // Map the file directly.
+    WRITE_AFTER // Write after we're done.
+  } WriteMethod;
+  WriteMethod write_method;
+
   // Include the vocab in the binary file? Only effective if write_mmap != NULL.
   bool include_vocab;

+
+
   // Quantization options. Only effective for QuantTrieModel. One value is
   // reserved for each of prob and backoff, so 2^bits - 1 buckets will be used
   // to quantize (and one of the remaining backoffs will be 0).
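Note: for callers that use the library directly rather than build_binary, the new write_method is just another knob on lm::ngram::Config. A hedged sketch of how a caller might request write-after behavior; the output path and the choice of ProbingModel are placeholders for whatever the caller actually needs.

#include "lm/model.hh"

void BuildProbingBinary(const char *arpa_path) {
  lm::ngram::Config config;
  config.write_mmap = "out.mmap";                        // placeholder output path
  config.write_method = lm::ngram::Config::WRITE_AFTER;  // build in RAM, write once
  // Constructing the model from an ARPA file triggers building and, because
  // write_mmap is set, writing the binary file with the chosen method.
  lm::ngram::ProbingModel model(arpa_path, config);
}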
lm/model.cc (12 lines changed)

@@ -46,7 +46,7 @@ template <class Search, class VocabularyT> GenericModel<Search, VocabularyT>::Ge

 template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT>::InitializeFromBinary(void *start, const Parameters &params, const Config &config, int fd) {
   SetupMemory(start, params.counts, config);
-  vocab_.LoadedBinary(fd, config.enumerate_vocab);
+  vocab_.LoadedBinary(params.fixed.has_vocabulary, fd, config.enumerate_vocab);
   search_.LoadedBinary();
 }

@@ -82,7 +82,7 @@ template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT
       search_.unigram.Unknown().backoff = 0.0;
       search_.unigram.Unknown().prob = config.unknown_missing_logprob;
     }
-    FinishFile(config, kModelType, kVersion, counts, backing_);
+    FinishFile(config, kModelType, kVersion, counts, vocab_.UnkCountChangePadding(), backing_);
   } catch (util::Exception &e) {
     e << " Byte: " << f.Offset();
     throw;

@@ -119,7 +119,7 @@ template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search,
   }
   float backoff;
   // i is the order of the backoff we're looking for.
-  const Middle *mid_iter = search_.MiddleBegin() + start - 2;
+  typename Search::MiddleIter mid_iter = search_.MiddleBegin() + start - 2;
   for (const WordIndex *i = context_rbegin + start - 1; i < context_rend; ++i, ++mid_iter) {
     if (!search_.LookupMiddleNoProb(*mid_iter, *i, backoff, node)) break;
     ret.prob += backoff;

@@ -139,7 +139,7 @@ template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT
   search_.LookupUnigram(*context_rbegin, out_state.backoff[0], node, ignored);
   out_state.length = HasExtension(out_state.backoff[0]) ? 1 : 0;
   float *backoff_out = out_state.backoff + 1;
-  const typename Search::Middle *mid = search_.MiddleBegin();
+  typename Search::MiddleIter mid(search_.MiddleBegin());
   for (const WordIndex *i = context_rbegin + 1; i < context_rend; ++i, ++backoff_out, ++mid) {
     if (!search_.LookupMiddleNoProb(*mid, *i, *backoff_out, node)) {
       std::copy(context_rbegin, context_rbegin + out_state.length, out_state.words);

@@ -166,7 +166,7 @@ template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search,
   // If this function is called, then it does depend on left words.
   ret.independent_left = false;
   ret.extend_left = extend_pointer;
-  const typename Search::Middle *mid_iter = search_.MiddleBegin() + extend_length - 1;
+  typename Search::MiddleIter mid_iter(search_.MiddleBegin() + extend_length - 1);
   const WordIndex *i = add_rbegin;
   for (; ; ++i, ++backoff_out, ++mid_iter) {
     if (i == add_rend) {

@@ -235,7 +235,7 @@ template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search,

   // Ok start by looking up the bigram.
   const WordIndex *hist_iter = context_rbegin;
-  const typename Search::Middle *mid_iter = search_.MiddleBegin();
+  typename Search::MiddleIter mid_iter(search_.MiddleBegin());
   for (; ; ++mid_iter, ++hist_iter, ++backoff_out) {
     if (hist_iter == context_rend) {
       // Ran out of history. Typically no backoff, but this could be a blank.
@@ -20,11 +20,11 @@ namespace ngram {

 namespace {

-void MakeBins(float *values, float *values_end, float *centers, uint32_t bins) {
-  std::sort(values, values_end);
-  const float *start = values, *finish;
+void MakeBins(std::vector<float> &values, float *centers, uint32_t bins) {
+  std::sort(values.begin(), values.end());
+  std::vector<float>::const_iterator start = values.begin(), finish;
   for (uint32_t i = 0; i < bins; ++i, ++centers, start = finish) {
-    finish = values + (((values_end - values) * static_cast<uint64_t>(i + 1)) / bins);
+    finish = values.begin() + ((values.size() * static_cast<uint64_t>(i + 1)) / bins);
     if (finish == start) {
       // zero length bucket.
       *centers = i ? *(centers - 1) : -std::numeric_limits<float>::infinity();

@@ -66,12 +66,12 @@ void SeparatelyQuantize::Train(uint8_t order, std::vector<float> &prob, std::vec
   float *centers = start_ + TableStart(order) + ProbTableLength();
   *(centers++) = kNoExtensionBackoff;
   *(centers++) = kExtensionBackoff;
-  MakeBins(&*backoff.begin(), &*backoff.end(), centers, (1ULL << backoff_bits_) - 2);
+  MakeBins(backoff, centers, (1ULL << backoff_bits_) - 2);
 }

 void SeparatelyQuantize::TrainProb(uint8_t order, std::vector<float> &prob) {
   float *centers = start_ + TableStart(order);
-  MakeBins(&*prob.begin(), &*prob.end(), centers, (1ULL << prob_bits_));
+  MakeBins(prob, centers, (1ULL << prob_bits_));
 }

 void SeparatelyQuantize::FinishedLoading(const Config &config) {
@@ -84,9 +84,11 @@ template <class Middle> void FixSRI(int lower, float negative_lower_prob, unsign
 }

 template <class Voc, class Store, class Middle, class Activate> void ReadNGrams(util::FilePiece &f, const unsigned int n, const size_t count, const Voc &vocab, ProbBackoff *unigrams, std::vector<Middle> &middle, Activate activate, Store &store, PositiveProbWarn &warn) {
+  assert(n >= 2);
   ReadNGramHeader(f, n);

-  // vocab ids of words in reverse order
+  // Both vocab_ids and keys are non-empty because n >= 2.
+  // vocab ids of words in reverse order.
   std::vector<WordIndex> vocab_ids(n);
   std::vector<uint64_t> keys(n-1);
   typename Store::Entry::Value value;

@@ -147,7 +149,7 @@ template <class MiddleT, class LongestT> uint8_t *TemplateHashedSearch<MiddleT,

 template <class MiddleT, class LongestT> template <class Voc> void TemplateHashedSearch<MiddleT, LongestT>::InitializeFromARPA(const char * /*file*/, util::FilePiece &f, const std::vector<uint64_t> &counts, const Config &config, Voc &vocab, Backing &backing) {
   // TODO: fix sorted.
-  SetupMemory(GrowForSearch(config, 0, Size(counts, config), backing), counts, config);
+  SetupMemory(GrowForSearch(config, vocab.UnkCountChangePadding(), Size(counts, config), backing), counts, config);

   PositiveProbWarn warn(config.positive_log_probability);

@@ -91,8 +91,10 @@ template <class MiddleT, class LongestT> class TemplateHashedSearch : public Has

     template <class Voc> void InitializeFromARPA(const char *file, util::FilePiece &f, const std::vector<uint64_t> &counts, const Config &config, Voc &vocab, Backing &backing);

-    const Middle *MiddleBegin() const { return &*middle_.begin(); }
-    const Middle *MiddleEnd() const { return &*middle_.end(); }
+    typedef typename std::vector<Middle>::const_iterator MiddleIter;
+
+    MiddleIter MiddleBegin() const { return middle_.begin(); }
+    MiddleIter MiddleEnd() const { return middle_.end(); }

     Node Unpack(uint64_t extend_pointer, unsigned char extend_length, float &prob) const {
       util::FloatEnc val;
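Note: the MiddleIter typedef lets lm/model.cc iterate over the middle orders without caring whether the search backend stores them in a std::vector (hashed search, above) or in a raw array (trie search, which typedefs const Middle *). A minimal sketch of the pattern with made-up backend classes, purely for illustration:

#include <vector>

// Two hypothetical backends exposing the same iteration interface.
struct VectorBackend {
  typedef std::vector<int>::const_iterator MiddleIter;
  MiddleIter MiddleBegin() const { return middle_.begin(); }
  MiddleIter MiddleEnd() const { return middle_.end(); }
  std::vector<int> middle_;
};

struct ArrayBackend {
  typedef const int *MiddleIter;
  MiddleIter MiddleBegin() const { return middle_; }
  MiddleIter MiddleEnd() const { return middle_ + 3; }
  int middle_[3];
};

// Generic code names the iterator through the backend, exactly as model.cc
// now does with typename Search::MiddleIter.
template <class Search> int CountMiddles(const Search &s) {
  int count = 0;
  for (typename Search::MiddleIter i = s.MiddleBegin(); i != s.MiddleEnd(); ++i) ++count;
  return count;
}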
@@ -197,7 +197,7 @@ class SRISucks {

     void ObtainBackoffs(unsigned char total_order, FILE *unigram_file, RecordReader *reader) {
       for (unsigned char i = 0; i < kMaxOrder - 1; ++i) {
-        it_[i] = &*values_[i].begin();
+        it_[i] = values_[i].empty() ? NULL : &*values_[i].begin();
       }
       messages_[0].Apply(it_, unigram_file);
       BackoffMessages *messages = messages_ + 1;

@@ -229,8 +229,8 @@ class SRISucks {

 class FindBlanks {
   public:
-    FindBlanks(uint64_t *counts, unsigned char order, const ProbBackoff *unigrams, SRISucks &messages)
-      : counts_(counts), longest_counts_(counts + order - 1), unigrams_(unigrams), sri_(messages) {}
+    FindBlanks(unsigned char order, const ProbBackoff *unigrams, SRISucks &messages)
+      : counts_(order), unigrams_(unigrams), sri_(messages) {}

     float UnigramProb(WordIndex index) const {
       return unigrams_[index].prob;

@@ -250,7 +250,7 @@ class FindBlanks {
     }

     void Longest(const void * /*data*/) {
-      ++*longest_counts_;
+      ++counts_.back();
     }

     // Unigrams wrote one past.

@@ -258,8 +258,12 @@ class FindBlanks {
       --counts_[0];
     }

+    const std::vector<uint64_t> &Counts() const {
+      return counts_;
+    }
+
   private:
-    uint64_t *const counts_, *const longest_counts_;
+    std::vector<uint64_t> counts_;

     const ProbBackoff *unigrams_;

@@ -473,14 +477,15 @@ template <class Quant, class Bhiksha> void BuildTrie(SortedFiles &files, std::ve
   }

   SRISucks sri;
-  std::vector<uint64_t> fixed_counts(counts.size());
+  std::vector<uint64_t> fixed_counts;
   util::scoped_FILE unigram_file;
   util::scoped_fd unigram_fd(files.StealUnigram());
   {
     util::scoped_memory unigrams;
     MapRead(util::POPULATE_OR_READ, unigram_fd.get(), 0, counts[0] * sizeof(ProbBackoff), unigrams);
-    FindBlanks finder(&*fixed_counts.begin(), counts.size(), reinterpret_cast<const ProbBackoff*>(unigrams.get()), sri);
+    FindBlanks finder(counts.size(), reinterpret_cast<const ProbBackoff*>(unigrams.get()), sri);
     RecursiveInsert(counts.size(), counts[0], inputs, config.messages, "Identifying n-grams omitted by SRI", finder);
+    fixed_counts = finder.Counts();
   }
   unigram_file.reset(util::FDOpenOrThrow(unigram_fd));
   for (const RecordReader *i = inputs; i != inputs + counts.size() - 2; ++i) {
@@ -62,6 +62,8 @@ template <class Quant, class Bhiksha> class TrieSearch {

     void LoadedBinary();

+    typedef const Middle *MiddleIter;
+
     const Middle *MiddleBegin() const { return middle_begin_; }
     const Middle *MiddleEnd() const { return middle_end_; }

@@ -83,7 +83,12 @@ FILE *WriteContextFile(uint8_t *begin, uint8_t *end, const util::TempMaker &make
   PartialIter context_begin(PartialViewProxy(begin + sizeof(WordIndex), entry_size, context_size));
   PartialIter context_end(PartialViewProxy(end + sizeof(WordIndex), entry_size, context_size));

-  std::sort(context_begin, context_end, util::SizedCompare<EntryCompare, PartialViewProxy>(EntryCompare(order - 1)));
+#if defined(_WIN32) || defined(_WIN64)
+  std::stable_sort
+#else
+  std::sort
+#endif
+    (context_begin, context_end, util::SizedCompare<EntryCompare, PartialViewProxy>(EntryCompare(order - 1)));

   util::scoped_FILE out(maker.MakeFile());

@@ -157,7 +162,10 @@ void RecordReader::Overwrite(const void *start, std::size_t amount) {
   UTIL_THROW_IF(fseek(file_, internal - entry_size_, SEEK_CUR), util::ErrnoException, "Couldn't seek backwards for revision");
   WriteOrThrow(file_, start, amount);
   long forward = entry_size_ - internal - amount;
-  if (forward) UTIL_THROW_IF(fseek(file_, forward, SEEK_CUR), util::ErrnoException, "Couldn't seek forwards past revision");
+#if !defined(_WIN32) && !defined(_WIN64)
+  if (forward)
+#endif
+    UTIL_THROW_IF(fseek(file_, forward, SEEK_CUR), util::ErrnoException, "Couldn't seek forwards past revision");
 }

 void RecordReader::Rewind() {

@@ -244,8 +252,13 @@ void SortedFiles::ConvertToSorted(util::FilePiece &f, const SortedVocabulary &vo
     }
     // Sort full records by full n-gram.
     util::SizedProxy proxy_begin(begin, entry_size), proxy_end(out_end, entry_size);
-    // parallel_sort uses too much RAM
-    std::sort(NGramIter(proxy_begin), NGramIter(proxy_end), util::SizedCompare<EntryCompare>(EntryCompare(order)));
+    // parallel_sort uses too much RAM. TODO: figure out why windows sort doesn't like my proxies.
+#if defined(_WIN32) || defined(_WIN64)
+    std::stable_sort
+#else
+    std::sort
+#endif
+      (NGramIter(proxy_begin), NGramIter(proxy_end), util::SizedCompare<EntryCompare>(EntryCompare(order)));
     files.push_back(DiskFlush(begin, out_end, maker));
     contexts.push_back(WriteContextFile(begin, out_end, maker, entry_size, order));

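Note: both sort sites switch to std::stable_sort on Windows, and the TODO comment in the last hunk hints at why: the Windows std::sort was not accepting the proxy iterators used here, while std::stable_sort happened to work. A small sketch of the platform-conditional call pattern, using an ordinary vector rather than KenLM's sized proxies:

#include <algorithm>
#include <vector>

// Sort with std::stable_sort on Windows and std::sort elsewhere, mirroring the
// #if pattern in the diff. The data here is a placeholder.
void PortableSort(std::vector<int> &values) {
#if defined(_WIN32) || defined(_WIN64)
  std::stable_sort
#else
  std::sort
#endif
    (values.begin(), values.end());
}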
lm/vocab.cc (16 lines changed)

@@ -125,8 +125,10 @@ WordIndex SortedVocabulary::Insert(const StringPiece &str) {

 void SortedVocabulary::FinishedLoading(ProbBackoff *reorder_vocab) {
   if (enumerate_) {
-    util::PairedIterator<ProbBackoff*, std::string*> values(reorder_vocab + 1, &*strings_to_enumerate_.begin());
-    util::JointSort(begin_, end_, values);
+    if (!strings_to_enumerate_.empty()) {
+      util::PairedIterator<ProbBackoff*, std::string*> values(reorder_vocab + 1, &*strings_to_enumerate_.begin());
+      util::JointSort(begin_, end_, values);
+    }
     for (WordIndex i = 0; i < static_cast<WordIndex>(end_ - begin_); ++i) {
       // <unk> strikes again: +1 here.
       enumerate_->Add(i + 1, strings_to_enumerate_[i]);

@@ -142,11 +144,11 @@ void SortedVocabulary::FinishedLoading(ProbBackoff *reorder_vocab) {
   bound_ = end_ - begin_ + 1;
 }

-void SortedVocabulary::LoadedBinary(int fd, EnumerateVocab *to) {
+void SortedVocabulary::LoadedBinary(bool have_words, int fd, EnumerateVocab *to) {
   end_ = begin_ + *(reinterpret_cast<const uint64_t*>(begin_) - 1);
   SetSpecial(Index("<s>"), Index("</s>"), 0);
   bound_ = end_ - begin_ + 1;
-  ReadWords(fd, to, bound_);
+  if (have_words) ReadWords(fd, to, bound_);
 }

 namespace {

@@ -201,12 +203,12 @@ void ProbingVocabulary::FinishedLoading(ProbBackoff * /*reorder_vocab*/) {
   SetSpecial(Index("<s>"), Index("</s>"), 0);
 }

-void ProbingVocabulary::LoadedBinary(int fd, EnumerateVocab *to) {
+void ProbingVocabulary::LoadedBinary(bool have_words, int fd, EnumerateVocab *to) {
   UTIL_THROW_IF(header_->version != kProbingVocabularyVersion, FormatLoadException, "The binary file has probing version " << header_->version << " but the code expects version " << kProbingVocabularyVersion << ". Please rerun build_binary using the same version of the code.");
   lookup_.LoadedBinary();
   bound_ = header_->bound;
   SetSpecial(Index("<s>"), Index("</s>"), 0);
-  ReadWords(fd, to, bound_);
+  if (have_words) ReadWords(fd, to, bound_);
 }

 void MissingUnknown(const Config &config) throw(SpecialWordMissingException) {

@@ -229,7 +231,7 @@ void MissingSentenceMarker(const Config &config, const char *str) throw(SpecialW
       if (config.messages) *config.messages << "Missing special word " << str << "; will treat it as <unk>.";
       break;
     case THROW_UP:
-      UTIL_THROW(SpecialWordMissingException, "The ARPA file is missing " << str << " and the model is configured to reject these models. If you built your APRA with IRSTLM and forgot to run add-start-end.sh, complain to <bertoldi at fbk.eu> stating that you think build-lm.sh should do this by default, then go back and retrain your model from the start. To bypass this check and treat " << str << " as an OOV, pass -s. The resulting model will not work with e.g. Moses.");
+      UTIL_THROW(SpecialWordMissingException, "The ARPA file is missing " << str << " and the model is configured to reject these models. Run build_binary -s to disable this check.");
   }
 }

@@ -82,7 +82,7 @@ class SortedVocabulary : public base::Vocabulary {

     bool SawUnk() const { return saw_unk_; }

-    void LoadedBinary(int fd, EnumerateVocab *to);
+    void LoadedBinary(bool have_words, int fd, EnumerateVocab *to);

   private:
     uint64_t *begin_, *end_;

@@ -143,9 +143,11 @@ class ProbingVocabulary : public base::Vocabulary {

     void FinishedLoading(ProbBackoff *reorder_vocab);

+    std::size_t UnkCountChangePadding() const { return 0; }
+
     bool SawUnk() const { return saw_unk_; }

-    void LoadedBinary(int fd, EnumerateVocab *to);
+    void LoadedBinary(bool have_words, int fd, EnumerateVocab *to);

   private:
     typedef util::ProbingHashTable<ProbingVocabuaryEntry, util::IdentityHash> Lookup;
@@ -47,7 +47,14 @@ inline uint8_t BitPackShift(uint8_t bit, uint8_t length) {
 #endif

 inline uint64_t ReadOff(const void *base, uint64_t bit_off) {
+#if defined(__arm) || defined(__arm__)
+  const uint8_t *base_off = reinterpret_cast<const uint8_t*>(base) + (bit_off >> 3);
+  uint64_t value64;
+  memcpy(&value64, base_off, sizeof(value64));
+  return value64;
+#else
   return *reinterpret_cast<const uint64_t*>(reinterpret_cast<const uint8_t*>(base) + (bit_off >> 3));
+#endif
 }

 /* Pack integers up to 57 bits using their least significant digits.

@@ -75,7 +82,14 @@ inline void WriteInt57(void *base, uint64_t bit_off, uint8_t length, uint64_t va

 /* Same caveats as above, but for a 25 bit limit. */
 inline uint32_t ReadInt25(const void *base, uint64_t bit_off, uint8_t length, uint32_t mask) {
+#if defined(__arm) || defined(__arm__)
+  const uint8_t *base_off = reinterpret_cast<const uint8_t*>(base) + (bit_off >> 3);
+  uint32_t value32;
+  memcpy(&value32, base_off, sizeof(value32));
+  return (value32 >> BitPackShift(bit_off & 7, length)) & mask;
+#else
   return (*reinterpret_cast<const uint32_t*>(reinterpret_cast<const uint8_t*>(base) + (bit_off >> 3)) >> BitPackShift(bit_off & 7, length)) & mask;
+#endif
 }

 inline void WriteInt25(void *base, uint64_t bit_off, uint8_t length, uint32_t value) {
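Note: the ARM branches avoid dereferencing a uint64_t* or uint32_t* that may not be suitably aligned; on ARM such a load can fault or, on older cores, return rotated bytes, whereas memcpy into a local is always well defined and compilers lower it to a plain load where that is legal. A standalone sketch of the idiom; the buffer and offset are invented test data.

#include <cstring>
#include <stdint.h>

// Read a 64-bit value starting at an arbitrary byte offset without assuming
// alignment. This is the same memcpy idiom the diff adds for ARM.
inline uint64_t LoadUnaligned64(const void *base, uint64_t byte_off) {
  const uint8_t *p = reinterpret_cast<const uint8_t*>(base) + byte_off;
  uint64_t value;
  memcpy(&value, p, sizeof(value));
  return value;
}

int main() {
  unsigned char buf[16] = {0};
  buf[3] = 0x2a;                                 // invented test data
  return LoadUnaligned64(buf, 3) ? 0 : 1;        // nonzero regardless of endianness
}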
@@ -99,6 +99,13 @@ void WriteOrThrow(int fd, const void *data_void, std::size_t size) {
   }
 }

+void FSyncOrThrow(int fd) {
+  // Apparently windows doesn't have fsync?
+#if !defined(_WIN32) && !defined(_WIN64)
+  UTIL_THROW_IF(-1 == fsync(fd), ErrnoException, "Sync of " << fd << " failed.");
+#endif
+}
+
 namespace {
 void InternalSeek(int fd, off_t off, int whence) {
   UTIL_THROW_IF((off_t)-1 == lseek(fd, off, whence), ErrnoException, "Seek failed");
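Note: FSyncOrThrow rounds out the small fd-helper family (WriteOrThrow, SeekOrThrow, ...) that the WRITE_AFTER path in FinishFile relies on. A hedged sketch of the intended call sequence; the fd, data, and offset are placeholders supplied by the caller.

#include "util/file.hh"
#include <vector>

void WriteAndSync(int fd, const std::vector<char> &data, uint64_t offset) {
  if (data.empty()) return;                        // nothing to write
  util::SeekOrThrow(fd, offset);                   // position past header/vocab
  util::WriteOrThrow(fd, &data[0], data.size());   // one sequential write
  util::FSyncOrThrow(fd);                          // flush to disk (no-op on Windows)
}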
@@ -78,6 +78,8 @@ std::size_t ReadOrEOF(int fd, void *to_void, std::size_t amount);

 void WriteOrThrow(int fd, const void *data_void, std::size_t size);

+void FSyncOrThrow(int fd);
+
 // Seeking
 void SeekOrThrow(int fd, uint64_t off);
 void AdvanceOrThrow(int fd, int64_t off);
@@ -24,12 +24,12 @@ ParseNumberException::ParseNumberException(StringPiece value) throw() {
   *this << "Could not parse \"" << value << "\" into a number";
 }

-GZException::GZException(gzFile file) {
 #ifdef HAVE_ZLIB
+GZException::GZException(gzFile file) {
   int num;
   *this << gzerror( file, &num) << " from zlib";
-#endif // HAVE_ZLIB
 }
+#endif // HAVE_ZLIB

 // Sigh this is the only way I could come up with to do a _const_ bool. It has ' ', '\f', '\n', '\r', '\t', and '\v' (same as isspace on C locale).
 const bool kSpaces[256] = {0,0,0,0,0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
@@ -27,7 +27,9 @@ class ParseNumberException : public Exception {

 class GZException : public Exception {
   public:
+#ifdef HAVE_ZLIB
     explicit GZException(gzFile file);
+#endif
     GZException() throw() {}
     ~GZException() throw() {}
 };
Deleted: util/key_value_packing.hh (per the include in its test below)

@@ -1,126 +0,0 @@
-#ifndef UTIL_KEY_VALUE_PACKING__
-#define UTIL_KEY_VALUE_PACKING__
-
-/* Why such a general interface? I'm planning on doing bit-level packing. */
-
-#include <algorithm>
-#include <cstddef>
-#include <cstring>
-
-#include <stdint.h>
-
-namespace util {
-
-template <class Key, class Value> struct Entry {
-  Key key;
-  Value value;
-
-  const Key &GetKey() const { return key; }
-  const Value &GetValue() const { return value; }
-
-  Value &MutableValue() { return value; }
-
-  void Set(const Key &key_in, const Value &value_in) {
-    SetKey(key_in);
-    SetValue(value_in);
-  }
-  void SetKey(const Key &key_in) { key = key_in; }
-  void SetValue(const Value &value_in) { value = value_in; }
-
-  bool operator<(const Entry<Key, Value> &other) const { return GetKey() < other.GetKey(); }
-};
-
-// And now for a brief interlude to specialize std::swap.
-} // namespace util
-namespace std {
-template <class Key, class Value> void swap(util::Entry<Key, Value> &first, util::Entry<Key, Value> &second) {
-  swap(first.key, second.key);
-  swap(first.value, second.value);
-}
-}// namespace std
-namespace util {
-
-template <class KeyT, class ValueT> class AlignedPacking {
-  public:
-    typedef KeyT Key;
-    typedef ValueT Value;
-
-  public:
-    static const std::size_t kBytes = sizeof(Entry<Key, Value>);
-    static const std::size_t kBits = kBytes * 8;
-
-    typedef Entry<Key, Value> * MutableIterator;
-    typedef const Entry<Key, Value> * ConstIterator;
-    typedef const Entry<Key, Value> & ConstReference;
-
-    static MutableIterator FromVoid(void *start) {
-      return reinterpret_cast<MutableIterator>(start);
-    }
-
-    static Entry<Key, Value> Make(const Key &key, const Value &value) {
-      Entry<Key, Value> ret;
-      ret.Set(key, value);
-      return ret;
-    }
-};
-
-template <class KeyT, class ValueT> class ByteAlignedPacking {
-  public:
-    typedef KeyT Key;
-    typedef ValueT Value;
-
-  private:
-#pragma pack(push)
-#pragma pack(1)
-    struct RawEntry {
-      Key key;
-      Value value;
-
-      const Key &GetKey() const { return key; }
-      const Value &GetValue() const { return value; }
-
-      Value &MutableValue() { return value; }
-
-      void Set(const Key &key_in, const Value &value_in) {
-        SetKey(key_in);
-        SetValue(value_in);
-      }
-      void SetKey(const Key &key_in) { key = key_in; }
-      void SetValue(const Value &value_in) { value = value_in; }
-
-      bool operator<(const RawEntry &other) const { return GetKey() < other.GetKey(); }
-    };
-#pragma pack(pop)
-
-    friend void std::swap<>(RawEntry&, RawEntry&);
-
-  public:
-    typedef RawEntry *MutableIterator;
-    typedef const RawEntry *ConstIterator;
-    typedef RawEntry &ConstReference;
-
-    static const std::size_t kBytes = sizeof(RawEntry);
-    static const std::size_t kBits = kBytes * 8;
-
-    static MutableIterator FromVoid(void *start) {
-      return MutableIterator(reinterpret_cast<RawEntry*>(start));
-    }
-
-    static RawEntry Make(const Key &key, const Value &value) {
-      RawEntry ret;
-      ret.Set(key, value);
-      return ret;
-    }
-};
-
-} // namespace util
-namespace std {
-template <class Key, class Value> void swap(
-    typename util::ByteAlignedPacking<Key, Value>::RawEntry &first,
-    typename util::ByteAlignedPacking<Key, Value>::RawEntry &second) {
-  swap(first.key, second.key);
-  swap(first.value, second.value);
-}
-}// namespace std
-
-#endif // UTIL_KEY_VALUE_PACKING__
Deleted: the Boost test for the packing header (BOOST_TEST_MODULE KeyValueStoreTest)

@@ -1,75 +0,0 @@
-#include "util/key_value_packing.hh"
-
-#include <boost/random/mersenne_twister.hpp>
-#include <boost/random/uniform_int.hpp>
-#include <boost/random/variate_generator.hpp>
-#include <boost/scoped_array.hpp>
-#define BOOST_TEST_MODULE KeyValueStoreTest
-#include <boost/test/unit_test.hpp>
-
-#include <limits>
-#include <stdlib.h>
-
-namespace util {
-namespace {
-
-BOOST_AUTO_TEST_CASE(basic_in_out) {
-  typedef ByteAlignedPacking<uint64_t, unsigned char> Packing;
-  void *backing = malloc(Packing::kBytes * 2);
-  Packing::MutableIterator i(Packing::FromVoid(backing));
-  i->SetKey(10);
-  BOOST_CHECK_EQUAL(10, i->GetKey());
-  i->SetValue(3);
-  BOOST_CHECK_EQUAL(3, i->GetValue());
-  ++i;
-  i->SetKey(5);
-  BOOST_CHECK_EQUAL(5, i->GetKey());
-  i->SetValue(42);
-  BOOST_CHECK_EQUAL(42, i->GetValue());
-
-  Packing::ConstIterator c(i);
-  BOOST_CHECK_EQUAL(5, c->GetKey());
-  --c;
-  BOOST_CHECK_EQUAL(10, c->GetKey());
-  BOOST_CHECK_EQUAL(42, i->GetValue());
-
-  BOOST_CHECK_EQUAL(5, i->GetKey());
-  free(backing);
-}
-
-BOOST_AUTO_TEST_CASE(simple_sort) {
-  typedef ByteAlignedPacking<uint64_t, unsigned char> Packing;
-  char foo[Packing::kBytes * 4];
-  Packing::MutableIterator begin(Packing::FromVoid(foo));
-  Packing::MutableIterator i = begin;
-  i->SetKey(0); ++i;
-  i->SetKey(2); ++i;
-  i->SetKey(3); ++i;
-  i->SetKey(1); ++i;
-  std::sort(begin, i);
-  BOOST_CHECK_EQUAL(0, begin[0].GetKey());
-  BOOST_CHECK_EQUAL(1, begin[1].GetKey());
-  BOOST_CHECK_EQUAL(2, begin[2].GetKey());
-  BOOST_CHECK_EQUAL(3, begin[3].GetKey());
-}
-
-BOOST_AUTO_TEST_CASE(big_sort) {
-  typedef ByteAlignedPacking<uint64_t, unsigned char> Packing;
-  boost::scoped_array<char> memory(new char[Packing::kBytes * 1000]);
-  Packing::MutableIterator begin(Packing::FromVoid(memory.get()));
-
-  boost::mt19937 rng;
-  boost::uniform_int<uint64_t> range(0, std::numeric_limits<uint64_t>::max());
-  boost::variate_generator<boost::mt19937&, boost::uniform_int<uint64_t> > gen(rng, range);
-
-  for (size_t i = 0; i < 1000; ++i) {
-    (begin + i)->SetKey(gen());
-  }
-  std::sort(begin, begin + 1000);
-  for (size_t i = 0; i < 999; ++i) {
-    BOOST_CHECK(begin[i] < begin[i+1]);
-  }
-}
-
-} // namespace
-} // namespace util
util/mmap.cc (21 lines changed)

@@ -101,9 +101,10 @@ void *MapOrThrow(std::size_t size, bool for_write, int flags, bool prefault, int
 #if defined(_WIN32) || defined(_WIN64)
   int protectC = for_write ? PAGE_READWRITE : PAGE_READONLY;
   int protectM = for_write ? FILE_MAP_WRITE : FILE_MAP_READ;
-  HANDLE hMapping = CreateFileMapping((HANDLE)_get_osfhandle(fd), NULL, protectC, 0, size + offset, NULL);
+  uint64_t total_size = size + offset;
+  HANDLE hMapping = CreateFileMapping((HANDLE)_get_osfhandle(fd), NULL, protectC, total_size >> 32, static_cast<DWORD>(total_size), NULL);
   UTIL_THROW_IF(!hMapping, ErrnoException, "CreateFileMapping failed");
-  LPVOID ret = MapViewOfFile(hMapping, protectM, 0, offset, size);
+  LPVOID ret = MapViewOfFile(hMapping, protectM, offset >> 32, offset, size);
   CloseHandle(hMapping);
   UTIL_THROW_IF(!ret, ErrnoException, "MapViewOfFile failed");
 #else
@@ -147,16 +148,20 @@ void MapRead(LoadMethod method, int fd, uint64_t offset, std::size_t size, scope
   }
 }

-void *MapAnonymous(std::size_t size) {
-  return MapOrThrow(size, true,
+// Allocates zeroed memory in to.
+void MapAnonymous(std::size_t size, util::scoped_memory &to) {
+  to.reset();
 #if defined(_WIN32) || defined(_WIN64)
-      0 // MapOrThrow ignores the flags anyway.
-#elif defined(MAP_ANONYMOUS)
-      MAP_ANONYMOUS | MAP_PRIVATE // Linux
+  to.reset(calloc(1, size), size, scoped_memory::MALLOC_ALLOCATED);
 #else
+  to.reset(MapOrThrow(size, true,
+#  if defined(MAP_ANONYMOUS)
+      MAP_ANONYMOUS | MAP_PRIVATE // Linux
+#  else
       MAP_ANON | MAP_PRIVATE // BSD
+#  endif
+      , false, -1, 0), size, scoped_memory::MMAP_ALLOCATED);
 #endif
-      , false, -1, 0);
 }

 void *MapZeroedWrite(int fd, std::size_t size) {
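Note: this is where the commit title's "malloc-backed" comes from. On Windows the anonymous buffer is now plain calloc memory tracked as MALLOC_ALLOCATED, while POSIX systems keep an anonymous private mmap. A reduced sketch of the same decision outside of scoped_memory; the ZeroedBlock wrapper here is invented for illustration, and real code should also check for calloc/MAP_FAILED failures.

#include <cstddef>
#include <cstdlib>
#if !defined(_WIN32) && !defined(_WIN64)
#include <sys/mman.h>
#endif

// Minimal stand-in for scoped_memory: remembers how the block was obtained so
// the matching release call (free vs munmap) can be used later.
struct ZeroedBlock {
  void *base;
  std::size_t size;
  bool from_malloc;
};

ZeroedBlock AllocateZeroed(std::size_t size) {
  ZeroedBlock block;
  block.size = size;
#if defined(_WIN32) || defined(_WIN64)
  block.base = std::calloc(1, size);            // zeroed heap memory
  block.from_malloc = true;
#else
  block.base = mmap(NULL, size, PROT_READ | PROT_WRITE,
#ifdef MAP_ANONYMOUS
                    MAP_ANONYMOUS | MAP_PRIVATE,
#else
                    MAP_ANON | MAP_PRIVATE,     // BSD spelling
#endif
                    -1, 0);                     // anonymous mappings start out zeroed
  block.from_malloc = false;
#endif
  return block;
}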
@@ -100,7 +100,7 @@ void *MapOrThrow(std::size_t size, bool for_write, int flags, bool prefault, int

 void MapRead(LoadMethod method, int fd, uint64_t offset, std::size_t size, scoped_memory &out);

-void *MapAnonymous(std::size_t size);
+void MapAnonymous(std::size_t size, scoped_memory &to);

 // Open file name with mmap of size bytes, all of which are initially zero.
 void *MapZeroedWrite(int fd, std::size_t size);
@@ -7,9 +7,11 @@
  * placed in namespace util
  * add MurmurHashNative
  * default option = 0 for seed
+ * ARM port from NICT
  */

 #include "util/murmur_hash.hh"
+#include <string.h>

 namespace util {

@@ -28,12 +30,24 @@ uint64_t MurmurHash64A ( const void * key, std::size_t len, unsigned int seed )

   uint64_t h = seed ^ (len * m);

+#if defined(__arm) || defined(__arm__)
+  const size_t ksize = sizeof(uint64_t);
+  const unsigned char * data = (const unsigned char *)key;
+  const unsigned char * end = data + (std::size_t)(len/8) * ksize;
+#else
   const uint64_t * data = (const uint64_t *)key;
   const uint64_t * end = data + (len/8);
+#endif

   while(data != end)
   {
+#if defined(__arm) || defined(__arm__)
+    uint64_t k;
+    memcpy(&k, data, ksize);
+    data += ksize;
+#else
     uint64_t k = *data++;
+#endif

     k *= m;
     k ^= k >> r;

@@ -75,16 +89,30 @@ uint64_t MurmurHash64B ( const void * key, std::size_t len, unsigned int seed )
   unsigned int h1 = seed ^ len;
   unsigned int h2 = 0;

+#if defined(__arm) || defined(__arm__)
+  size_t ksize = sizeof(unsigned int);
+  const unsigned char * data = (const unsigned char *)key;
+#else
   const unsigned int * data = (const unsigned int *)key;
+#endif

+  unsigned int k1, k2;
   while(len >= 8)
   {
-    unsigned int k1 = *data++;
+#if defined(__arm) || defined(__arm__)
+    memcpy(&k1, data, ksize);
+    data += ksize;
+    memcpy(&k2, data, ksize);
+    data += ksize;
+#else
+    k1 = *data++;
+    k2 = *data++;
+#endif

     k1 *= m; k1 ^= k1 >> r; k1 *= m;
     h1 *= m; h1 ^= k1;
     len -= 4;

-    unsigned int k2 = *data++;
     k2 *= m; k2 ^= k2 >> r; k2 *= m;
     h2 *= m; h2 ^= k2;
     len -= 4;

@@ -92,7 +120,12 @@ uint64_t MurmurHash64B ( const void * key, std::size_t len, unsigned int seed )

   if(len >= 4)
   {
-    unsigned int k1 = *data++;
+#if defined(__arm) || defined(__arm__)
+    memcpy(&k1, data, ksize);
+    data += ksize;
+#else
+    k1 = *data++;
+#endif
     k1 *= m; k1 ^= k1 >> r; k1 *= m;
     h1 *= m; h1 ^= k1;
     len -= 4;
@@ -5,6 +5,7 @@
 #include <boost/random/variate_generator.hpp>
 #include <boost/scoped_array.hpp>
+#include <boost/unordered_map.hpp>

 #define BOOST_TEST_MODULE SortedUniformTest
 #include <boost/test/unit_test.hpp>
