mosesdecoder/lm/quantize.cc
Kenneth Heafield 14e02978fc KenLM 5cc905bc2d214efa7de2db56a9a672b749a95591
Avoid unspecified behavior of mmap when a file is resized reported by Christian Hardmeier
Fixes for Mavericks and a workaround for Boost's broken semaphore
Clean clang compile (of kenlm)

Merged some of 744376b3fb but also undid some of it because it was just masking a fundaemntal problem with pread rather than working around windows limitations
2014-01-27 16:51:35 -08:00

94 lines
3.7 KiB
C++
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/* Quantize into bins of equal size as described in
* M. Federico and N. Bertoldi. 2006. How many bits are needed
* to store probabilities for phrase-based translation? In Proc.
* of the Workshop on Statistical Machine Translation, pages
* 94101, New York City, June. Association for Computa-
* tional Linguistics.
*/
#include "lm/quantize.hh"
#include "lm/binary_format.hh"
#include "lm/lm_exception.hh"
#include "util/file.hh"
#include <algorithm>
#include <numeric>
namespace lm {
namespace ngram {
namespace {
void MakeBins(std::vector<float> &values, float *centers, uint32_t bins) {
std::sort(values.begin(), values.end());
std::vector<float>::const_iterator start = values.begin(), finish;
for (uint32_t i = 0; i < bins; ++i, ++centers, start = finish) {
finish = values.begin() + ((values.size() * static_cast<uint64_t>(i + 1)) / bins);
if (finish == start) {
// zero length bucket.
*centers = i ? *(centers - 1) : -std::numeric_limits<float>::infinity();
} else {
*centers = std::accumulate(start, finish, 0.0) / static_cast<float>(finish - start);
}
}
}
const char kSeparatelyQuantizeVersion = 2;
} // namespace
void SeparatelyQuantize::UpdateConfigFromBinary(const BinaryFormat &file, uint64_t offset, Config &config) {
unsigned char buffer[3];
file.ReadForConfig(buffer, 3, offset);
char version = buffer[0];
config.prob_bits = buffer[1];
config.backoff_bits = buffer[2];
if (version != kSeparatelyQuantizeVersion) UTIL_THROW(FormatLoadException, "This file has quantization version " << (unsigned)version << " but the code expects version " << (unsigned)kSeparatelyQuantizeVersion);
}
void SeparatelyQuantize::SetupMemory(void *base, unsigned char order, const Config &config) {
prob_bits_ = config.prob_bits;
backoff_bits_ = config.backoff_bits;
// We need the reserved values.
if (config.prob_bits == 0) UTIL_THROW(ConfigException, "You can't quantize probability to zero");
if (config.backoff_bits == 0) UTIL_THROW(ConfigException, "You can't quantize backoff to zero");
if (config.prob_bits > 25) UTIL_THROW(ConfigException, "For efficiency reasons, quantizing probability supports at most 25 bits. Currently you have requested " << static_cast<unsigned>(config.prob_bits) << " bits.");
if (config.backoff_bits > 25) UTIL_THROW(ConfigException, "For efficiency reasons, quantizing backoff supports at most 25 bits. Currently you have requested " << static_cast<unsigned>(config.backoff_bits) << " bits.");
// Reserve 8 byte header for bit counts.
actual_base_ = static_cast<uint8_t*>(base);
float *start = reinterpret_cast<float*>(actual_base_ + 8);
for (unsigned char i = 0; i < order - 2; ++i) {
tables_[i][0] = Bins(prob_bits_, start);
start += (1ULL << prob_bits_);
tables_[i][1] = Bins(backoff_bits_, start);
start += (1ULL << backoff_bits_);
}
longest_ = tables_[order - 2][0] = Bins(prob_bits_, start);
}
void SeparatelyQuantize::Train(uint8_t order, std::vector<float> &prob, std::vector<float> &backoff) {
TrainProb(order, prob);
// Backoff
float *centers = tables_[order - 2][1].Populate();
*(centers++) = kNoExtensionBackoff;
*(centers++) = kExtensionBackoff;
MakeBins(backoff, centers, (1ULL << backoff_bits_) - 2);
}
void SeparatelyQuantize::TrainProb(uint8_t order, std::vector<float> &prob) {
float *centers = tables_[order - 2][0].Populate();
MakeBins(prob, centers, (1ULL << prob_bits_));
}
void SeparatelyQuantize::FinishedLoading(const Config &config) {
uint8_t *actual_base = actual_base_;
*(actual_base++) = kSeparatelyQuantizeVersion; // version
*(actual_base++) = config.prob_bits;
*(actual_base++) = config.backoff_bits;
}
} // namespace ngram
} // namespace lm