// mosesdecoder/lm/quantize.cc

/* Quantize into bins of equal size as described in
 * M. Federico and N. Bertoldi. 2006. How many bits are needed
 * to store probabilities for phrase-based translation? In Proc.
 * of the Workshop on Statistical Machine Translation, pages
 * 94–101, New York City, June. Association for
 * Computational Linguistics.
 */

#include "lm/quantize.hh"

#include "lm/binary_format.hh"
#include "lm/lm_exception.hh"
#include "util/file.hh"

#include <algorithm>
#include <numeric>

namespace lm {
namespace ngram {

namespace {

/* Sorts values in place, splits the sorted sequence into `bins` consecutive
 * groups of (nearly) equal element count and stores the mean of each group
 * in centers[0 .. bins).  Group i covers the sorted index range
 * [size * i / bins, size * (i + 1) / bins).
 *
 * An empty group repeats the center of the group before it; if the very
 * first group is empty its center is -infinity.
 *
 * values:  samples to quantize; reordered (sorted ascending) by this call.
 * centers: output array, must have room for `bins` floats.
 * bins:    number of centers to produce; zero means only the sort happens.
 */
void MakeBins(std::vector<float> &values, float *centers, uint32_t bins) {
  std::sort(values.begin(), values.end());
  std::vector<float>::const_iterator group_begin = values.begin();
  for (uint32_t bin = 0; bin < bins; ++bin) {
    // The bin index is widened to 64 bits before multiplying by the size.
    const std::vector<float>::const_iterator group_end =
        values.begin() + ((values.size() * static_cast<uint64_t>(bin + 1)) / bins);
    if (group_end != group_begin) {
      // Sum in double precision; the mean is narrowed to float on store.
      const double total = std::accumulate(group_begin, group_end, 0.0);
      centers[bin] = total / static_cast<float>(group_end - group_begin);
    } else if (bin != 0) {
      // Nothing landed in this group: reuse the previous center.
      centers[bin] = centers[bin - 1];
    } else {
      // Empty first group: there is no previous center to copy.
      centers[bin] = -std::numeric_limits<float>::infinity();
    }
    group_begin = group_end;
  }
}

// Version tag of the quantization header.  FinishedLoading writes it as the
// first header byte, and UpdateConfigFromBinary throws when the first byte it
// reads differs from this value.
const char kSeparatelyQuantizeVersion = 2;

} // namespace

/* Loads the quantization header from a binary file into config.
 *
 * Reads three single-byte fields from fd, in order: the format version,
 * config.prob_bits and config.backoff_bits.  On success the descriptor is
 * then moved by -3 via util::AdvanceOrThrow (presumably a relative seek that
 * puts the position back where it was on entry -- confirm against
 * util/file.hh).
 *
 * fd:     descriptor positioned at the start of the quantization header.
 * counts: unused.
 * config: prob_bits and backoff_bits are overwritten, even when the version
 *         check below fails.
 *
 * Throws FormatLoadException if the stored version is not
 * kSeparatelyQuantizeVersion; in that case the -3 advance is not performed.
 */
void SeparatelyQuantize::UpdateConfigFromBinary(int fd, const std::vector<uint64_t> &/*counts*/, Config &config) {
  char version;
  util::ReadOrThrow(fd, &version, 1);
  util::ReadOrThrow(fd, &config.prob_bits, 1);
  util::ReadOrThrow(fd, &config.backoff_bits, 1);
  if (version != kSeparatelyQuantizeVersion) {
    // Convert through unsigned char first: where plain char is signed, a
    // stored byte >= 0x80 would otherwise be sign-extended and reported as a
    // number around four billion instead of 128..255.
    UTIL_THROW(FormatLoadException, "This file has quantization version " << static_cast<unsigned>(static_cast<unsigned char>(version)) << " but the code expects version " << static_cast<unsigned>(static_cast<unsigned char>(kSeparatelyQuantizeVersion)));
  }
  util::AdvanceOrThrow(fd, -3);
}

/* Records the bit widths from config and lays the quantization tables out
 * in the memory starting at base.
 *
 * Layout: the first 8 bytes are reserved for the header, then come float
 * tables back to back.  For every index below order - 2 there is a
 * probability table of 2^prob_bits floats followed by a backoff table of
 * 2^backoff_bits floats; index order - 2 gets a probability table only, and
 * that table is also stored in longest_.
 *
 * base:   start of the region; treated as bytes for the header and as
 *         floats from offset 8 onwards.
 * order:  model order.  NOTE(review): order - 2 is used as an array index,
 *         so this assumes order >= 2 -- confirm callers guarantee it.
 * config: source of prob_bits and backoff_bits.
 *
 * Throws ConfigException if either bit width is 0 or greater than 25.  The
 * member bit widths have already been updated when that happens.
 */
void SeparatelyQuantize::SetupMemory(void *base, unsigned char order, const Config &config) {
  prob_bits_ = config.prob_bits;
  backoff_bits_ = config.backoff_bits;

  // Zero bits would leave no room for the reserved values.
  if (config.prob_bits == 0) {
    UTIL_THROW(ConfigException, "You can't quantize probability to zero");
  }
  if (config.backoff_bits == 0) {
    UTIL_THROW(ConfigException, "You can't quantize backoff to zero");
  }
  if (config.prob_bits > 25) {
    UTIL_THROW(ConfigException, "For efficiency reasons, quantizing probability supports at most 25 bits.  Currently you have requested " << static_cast<unsigned>(config.prob_bits) << " bits.");
  }
  if (config.backoff_bits > 25) {
    UTIL_THROW(ConfigException, "For efficiency reasons, quantizing backoff supports at most 25 bits.  Currently you have requested " << static_cast<unsigned>(config.backoff_bits) << " bits.");
  }

  // Skip the 8 byte header; the float tables begin right after it.
  actual_base_ = static_cast<uint8_t*>(base);
  float *cursor = reinterpret_cast<float*>(actual_base_ + 8);

  const int longest_index = order - 2;
  for (int middle = 0; middle < longest_index; ++middle) {
    tables_[middle][0] = Bins(prob_bits_, cursor);
    cursor += (1ULL << prob_bits_);
    tables_[middle][1] = Bins(backoff_bits_, cursor);
    cursor += (1ULL << backoff_bits_);
  }

  // The last table has probabilities only; no backoff table is set up for it.
  tables_[longest_index][0] = Bins(prob_bits_, cursor);
  longest_ = tables_[longest_index][0];
}

/* Trains both quantization tables for n-grams of the given order.
 *
 * The probability table is handled by TrainProb.  The backoff table at
 * tables_[order - 2][1] gets two fixed leading entries, kNoExtensionBackoff
 * and kExtensionBackoff, and its remaining 2^backoff_bits_ - 2 entries are
 * filled by MakeBins from the backoff samples.
 *
 * order:   n-gram order; order - 2 selects the table pair.
 * prob:    probability samples; sorted in place as a side effect.
 * backoff: backoff samples; sorted in place as a side effect.
 */
void SeparatelyQuantize::Train(uint8_t order, std::vector<float> &prob, std::vector<float> &backoff) {
  TrainProb(order, prob);

  // Populate() is taken to return the table's writable float storage
  // (inferred from how the result is used here).
  float *const backoff_table = tables_[order - 2][1].Populate();
  backoff_table[0] = kNoExtensionBackoff;
  backoff_table[1] = kExtensionBackoff;
  // Two slots are already taken by the reserved values above.
  MakeBins(backoff, backoff_table + 2, (1ULL << backoff_bits_) - 2);
}

/* Trains the probability table for n-grams of the given order: all
 * 2^prob_bits_ entries of tables_[order - 2][0] are filled by MakeBins from
 * the samples in prob.
 *
 * order: n-gram order; order - 2 selects the table.
 * prob:  probability samples; sorted in place as a side effect.
 */
void SeparatelyQuantize::TrainProb(uint8_t order, std::vector<float> &prob) {
  const unsigned long long bin_count = 1ULL << prob_bits_;
  MakeBins(prob, tables_[order - 2][0].Populate(), bin_count);
}

void SeparatelyQuantize::FinishedLoading(const Config &config) {
  uint8_t *actual_base = actual_base_;
  *(actual_base++) = kSeparatelyQuantizeVersion; // version
  *(actual_base++) = config.prob_bits;
  *(actual_base++) = config.backoff_bits;
}

} // namespace ngram
} // namespace lm
-												KenLM update including progress on ARM and MinGW from NICT

											
										
										
											2011-11-11 00:46:59 +04:00
+								/* Quantize into bins of equal size as described in
 								 * M. Federico and N. Bertoldi. 2006. How many bits are needed
 								 * to store probabilities for phrase-based translation? In Proc.
 								 * of the Workshop on Statistical Machine Translation, pages
 								 * 94–101, New York City, June. Association for Computa-
 								 * tional Linguistics.
 								 */
-												Quantization.  


git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@4037 1f5c12ca-751b-0410-a591-d2e778427230

											
										
										
											2011-06-27 02:21:44 +04:00
+								#include "lm/quantize.hh"
-												Merge mtm_lm into trunk.
There's a fair number of files with no change that somebody must have touched in the branch so metadata is being recorded. 
Updates kenlm binary file format, sorry. 
It looks like OOV isn't being computed in EvaluateChart anyway, just phrasal.  
  


git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@4247 1f5c12ca-751b-0410-a591-d2e778427230

											
										
										
											2011-09-21 20:06:48 +04:00
+								#include "lm/binary_format.hh"
-												Quantization.  


git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@4037 1f5c12ca-751b-0410-a591-d2e778427230

											
										
										
											2011-06-27 02:21:44 +04:00
+								#include "lm/lm_exception.hh"
-												KenLM update including progress on ARM and MinGW from NICT

											
										
										
											2011-11-11 00:46:59 +04:00
+								#include "util/file.hh"
-												Quantization.  


git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@4037 1f5c12ca-751b-0410-a591-d2e778427230

											
										
										
											2011-06-27 02:21:44 +04:00
 								#include <algorithm>
 								#include <numeric>
 								namespace lm {
 								namespace ngram {
 								namespace {
-												KenLM 98814b2 including faster malloc-backed building and portability improvements

											
										
										
											2012-02-28 22:58:00 +04:00
+								void MakeBins(std::vector<float> &values, float *centers, uint32_t bins) {
 								  std::sort(values.begin(), values.end());
 								  std::vector<float>::const_iterator start = values.begin(), finish;
-												Quantization.  


git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@4037 1f5c12ca-751b-0410-a591-d2e778427230

											
										
										
											2011-06-27 02:21:44 +04:00
+								  for (uint32_t i = 0; i < bins; ++i, ++centers, start = finish) {
-												KenLM 98814b2 including faster malloc-backed building and portability improvements

											
										
										
											2012-02-28 22:58:00 +04:00
+								    finish = values.begin() + ((values.size() * static_cast<uint64_t>(i + 1)) / bins);
-												Quantization.  


git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@4037 1f5c12ca-751b-0410-a591-d2e778427230

											
										
										
											2011-06-27 02:21:44 +04:00
+								    if (finish == start) {
 								      // zero length bucket.
 								      *centers = i ? *(centers - 1) : -std::numeric_limits<float>::infinity();
 								    } else {
 								      *centers = std::accumulate(start, finish, 0.0) / static_cast<float>(finish - start);
 								    }
 								  }
 								}
-												Fix accidental format change


git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@4040 1f5c12ca-751b-0410-a591-d2e778427230

											
										
										
											2011-06-28 01:20:42 +04:00
+								const char kSeparatelyQuantizeVersion = 2;
-												Quantization.  


git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@4037 1f5c12ca-751b-0410-a591-d2e778427230

											
										
										
											2011-06-27 02:21:44 +04:00
 								} // namespace
-												KenLM update including progress on ARM and MinGW from NICT

											
										
										
											2011-11-11 00:46:59 +04:00
+								void SeparatelyQuantize::UpdateConfigFromBinary(int fd, const std::vector<uint64_t> &/*counts*/, Config &config) {
-												Quantization.  


git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@4037 1f5c12ca-751b-0410-a591-d2e778427230

											
										
										
											2011-06-27 02:21:44 +04:00
+								  char version;
-												KenLM update including progress on ARM and MinGW from NICT

											
										
										
											2011-11-11 00:46:59 +04:00
+								  util::ReadOrThrow(fd, &version, 1);
 								  util::ReadOrThrow(fd, &config.prob_bits, 1);
 								  util::ReadOrThrow(fd, &config.backoff_bits, 1);
-												Quantization.  


git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@4037 1f5c12ca-751b-0410-a591-d2e778427230

											
										
										
											2011-06-27 02:21:44 +04:00
+								  if (version != kSeparatelyQuantizeVersion) UTIL_THROW(FormatLoadException, "This file has quantization version " << (unsigned)version << " but the code expects version " << (unsigned)kSeparatelyQuantizeVersion);
-												KenLM update including progress on ARM and MinGW from NICT

											
										
										
											2011-11-11 00:46:59 +04:00
+								  util::AdvanceOrThrow(fd, -3);
-												Quantization.  


git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@4037 1f5c12ca-751b-0410-a591-d2e778427230

											
										
										
											2011-06-27 02:21:44 +04:00
+								}
-												KenLM e3b5c55910 including rest costs for probing

											
										
										
											2012-06-28 18:58:59 +04:00
+								void SeparatelyQuantize::SetupMemory(void *base, unsigned char order, const Config &config) {
-												Quantization.  


git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@4037 1f5c12ca-751b-0410-a591-d2e778427230

											
										
										
											2011-06-27 02:21:44 +04:00
+								  prob_bits_ = config.prob_bits;
 								  backoff_bits_ = config.backoff_bits;
 								  // We need the reserved values.
 								  if (config.prob_bits == 0) UTIL_THROW(ConfigException, "You can't quantize probability to zero");
 								  if (config.backoff_bits == 0) UTIL_THROW(ConfigException, "You can't quantize backoff to zero");
 								  if (config.prob_bits > 25) UTIL_THROW(ConfigException, "For efficiency reasons, quantizing probability supports at most 25 bits.  Currently you have requested " << static_cast<unsigned>(config.prob_bits) << " bits.");
 								  if (config.backoff_bits > 25) UTIL_THROW(ConfigException, "For efficiency reasons, quantizing backoff supports at most 25 bits.  Currently you have requested " << static_cast<unsigned>(config.backoff_bits) << " bits.");
-												KenLM e3b5c55910 including rest costs for probing

											
										
										
											2012-06-28 18:58:59 +04:00
+								  // Reserve 8 byte header for bit counts.
 								  actual_base_ = static_cast<uint8_t*>(base);
 								  float *start = reinterpret_cast<float*>(actual_base_ + 8);
 								  for (unsigned char i = 0; i < order - 2; ++i) {
 								    tables_[i][0] = Bins(prob_bits_, start);
 								    start += (1ULL << prob_bits_);
 								    tables_[i][1] = Bins(backoff_bits_, start);
 								    start += (1ULL << backoff_bits_);
 								  }
 								  longest_ = tables_[order - 2][0] = Bins(prob_bits_, start);
-												Quantization.  


git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@4037 1f5c12ca-751b-0410-a591-d2e778427230

											
										
										
											2011-06-27 02:21:44 +04:00
+								}
 								void SeparatelyQuantize::Train(uint8_t order, std::vector<float> &prob, std::vector<float> &backoff) {
 								  TrainProb(order, prob);
 								  // Backoff
-												KenLM e3b5c55910 including rest costs for probing

											
										
										
											2012-06-28 18:58:59 +04:00
+								  float *centers = tables_[order - 2][1].Populate();
-												Quantization.  


git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@4037 1f5c12ca-751b-0410-a591-d2e778427230

											
										
										
											2011-06-27 02:21:44 +04:00
+								  *(centers++) = kNoExtensionBackoff;
 								  *(centers++) = kExtensionBackoff;
-												KenLM 98814b2 including faster malloc-backed building and portability improvements

											
										
										
											2012-02-28 22:58:00 +04:00
+								  MakeBins(backoff, centers, (1ULL << backoff_bits_) - 2);
-												Quantization.  


git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@4037 1f5c12ca-751b-0410-a591-d2e778427230

											
										
										
											2011-06-27 02:21:44 +04:00
+								}
 								void SeparatelyQuantize::TrainProb(uint8_t order, std::vector<float> &prob) {
-												KenLM e3b5c55910 including rest costs for probing

											
										
										
											2012-06-28 18:58:59 +04:00
+								  float *centers = tables_[order - 2][0].Populate();
-												KenLM 98814b2 including faster malloc-backed building and portability improvements

											
										
										
											2012-02-28 22:58:00 +04:00
+								  MakeBins(prob, centers, (1ULL << prob_bits_));
-												Quantization.  


git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@4037 1f5c12ca-751b-0410-a591-d2e778427230

											
										
										
											2011-06-27 02:21:44 +04:00
+								}
 								void SeparatelyQuantize::FinishedLoading(const Config &config) {
-												KenLM e3b5c55910 including rest costs for probing

											
										
										
											2012-06-28 18:58:59 +04:00
+								  uint8_t *actual_base = actual_base_;
-												Quantization.  


git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@4037 1f5c12ca-751b-0410-a591-d2e778427230

											
										
										
											2011-06-27 02:21:44 +04:00
+								  *(actual_base++) = kSeparatelyQuantizeVersion; // version
 								  *(actual_base++) = config.prob_bits;
 								  *(actual_base++) = config.backoff_bits;
 								}
 								} // namespace ngram
 								} // namespace lm