2011-11-11 00:46:59 +04:00
/* Quantize into bins of equal size as described in
 * M. Federico and N. Bertoldi. 2006. How many bits are needed
 * to store probabilities for phrase-based translation? In Proc.
 * of the Workshop on Statistical Machine Translation, pages
 * 94–101, New York City, June. Association for Computational
 * Linguistics.
 */
2011-06-27 02:21:44 +04:00
# include "lm/quantize.hh"
2011-09-21 20:06:48 +04:00
# include "lm/binary_format.hh"
2011-06-27 02:21:44 +04:00
# include "lm/lm_exception.hh"
2011-11-11 00:46:59 +04:00
# include "util/file.hh"
2011-06-27 02:21:44 +04:00
# include <algorithm>
# include <numeric>
namespace lm {
namespace ngram {
namespace {
2012-02-28 22:58:00 +04:00
/* Sort values in place, split the sorted sequence into `bins` consecutive
 * groups of (as near as possible) equal population, and store the mean of
 * each group in centers[0 .. bins).
 *
 * values:  samples to quantize; left sorted ascending on return.
 * centers: output array with room for at least `bins` floats.
 * bins:    number of groups to produce; nothing is written when it is 0.
 *
 * A group that ends up empty (fewer samples than bins) reuses the previous
 * group's center, or negative infinity when it is the very first group.
 */
void MakeBins(std::vector<float> &values, float *centers, uint32_t bins) {
  std::sort(values.begin(), values.end());
  const std::vector<float>::const_iterator first = values.begin();
  std::vector<float>::const_iterator lower = first;
  for (uint32_t bin = 0; bin < bins; ++bin) {
    // Upper edge of this group: the first (bin + 1) / bins share of the sorted samples.
    const std::vector<float>::const_iterator upper =
        first + ((values.size() * static_cast<uint64_t>(bin + 1)) / bins);
    if (upper != lower) {
      // The sum is accumulated in double (0.0 seed) before narrowing to float.
      centers[bin] = std::accumulate(lower, upper, 0.0) / static_cast<float>(upper - lower);
    } else if (bin != 0) {
      // Empty group: repeat the preceding center.
      centers[bin] = centers[bin - 1];
    } else {
      // Empty first group: there is no preceding center to copy.
      centers[bin] = -std::numeric_limits<float>::infinity();
    }
    lower = upper;
  }
}
2011-06-28 01:20:42 +04:00
// Version tag of the quantization header. FinishedLoading writes it as the
// first header byte, and UpdateConfigFromBinary throws FormatLoadException
// when the byte read from a file differs from it.
const char kSeparatelyQuantizeVersion = 2 ;
2011-06-27 02:21:44 +04:00
} // namespace
2011-11-11 00:46:59 +04:00
void SeparatelyQuantize : : UpdateConfigFromBinary ( int fd , const std : : vector < uint64_t > & /*counts*/ , Config & config ) {
2011-06-27 02:21:44 +04:00
char version ;
2011-11-11 00:46:59 +04:00
util : : ReadOrThrow ( fd , & version , 1 ) ;
util : : ReadOrThrow ( fd , & config . prob_bits , 1 ) ;
util : : ReadOrThrow ( fd , & config . backoff_bits , 1 ) ;
2011-06-27 02:21:44 +04:00
if ( version ! = kSeparatelyQuantizeVersion ) UTIL_THROW ( FormatLoadException , " This file has quantization version " < < ( unsigned ) version < < " but the code expects version " < < ( unsigned ) kSeparatelyQuantizeVersion ) ;
2011-11-11 00:46:59 +04:00
util : : AdvanceOrThrow ( fd , - 3 ) ;
2011-06-27 02:21:44 +04:00
}
2012-06-28 18:58:59 +04:00
void SeparatelyQuantize : : SetupMemory ( void * base , unsigned char order , const Config & config ) {
2011-06-27 02:21:44 +04:00
prob_bits_ = config . prob_bits ;
backoff_bits_ = config . backoff_bits ;
// We need the reserved values.
if ( config . prob_bits = = 0 ) UTIL_THROW ( ConfigException , " You can't quantize probability to zero " ) ;
if ( config . backoff_bits = = 0 ) UTIL_THROW ( ConfigException , " You can't quantize backoff to zero " ) ;
if ( config . prob_bits > 25 ) UTIL_THROW ( ConfigException , " For efficiency reasons, quantizing probability supports at most 25 bits. Currently you have requested " < < static_cast < unsigned > ( config . prob_bits ) < < " bits. " ) ;
if ( config . backoff_bits > 25 ) UTIL_THROW ( ConfigException , " For efficiency reasons, quantizing backoff supports at most 25 bits. Currently you have requested " < < static_cast < unsigned > ( config . backoff_bits ) < < " bits. " ) ;
2012-06-28 18:58:59 +04:00
// Reserve 8 byte header for bit counts.
actual_base_ = static_cast < uint8_t * > ( base ) ;
float * start = reinterpret_cast < float * > ( actual_base_ + 8 ) ;
for ( unsigned char i = 0 ; i < order - 2 ; + + i ) {
tables_ [ i ] [ 0 ] = Bins ( prob_bits_ , start ) ;
start + = ( 1ULL < < prob_bits_ ) ;
tables_ [ i ] [ 1 ] = Bins ( backoff_bits_ , start ) ;
start + = ( 1ULL < < backoff_bits_ ) ;
}
longest_ = tables_ [ order - 2 ] [ 0 ] = Bins ( prob_bits_ , start ) ;
2011-06-27 02:21:44 +04:00
}
void SeparatelyQuantize : : Train ( uint8_t order , std : : vector < float > & prob , std : : vector < float > & backoff ) {
TrainProb ( order , prob ) ;
// Backoff
2012-06-28 18:58:59 +04:00
float * centers = tables_ [ order - 2 ] [ 1 ] . Populate ( ) ;
2011-06-27 02:21:44 +04:00
* ( centers + + ) = kNoExtensionBackoff ;
* ( centers + + ) = kExtensionBackoff ;
2012-02-28 22:58:00 +04:00
MakeBins ( backoff , centers , ( 1ULL < < backoff_bits_ ) - 2 ) ;
2011-06-27 02:21:44 +04:00
}
void SeparatelyQuantize : : TrainProb ( uint8_t order , std : : vector < float > & prob ) {
2012-06-28 18:58:59 +04:00
float * centers = tables_ [ order - 2 ] [ 0 ] . Populate ( ) ;
2012-02-28 22:58:00 +04:00
MakeBins ( prob , centers , ( 1ULL < < prob_bits_ ) ) ;
2011-06-27 02:21:44 +04:00
}
void SeparatelyQuantize : : FinishedLoading ( const Config & config ) {
2012-06-28 18:58:59 +04:00
uint8_t * actual_base = actual_base_ ;
2011-06-27 02:21:44 +04:00
* ( actual_base + + ) = kSeparatelyQuantizeVersion ; // version
* ( actual_base + + ) = config . prob_bits ;
* ( actual_base + + ) = config . backoff_bits ;
}
} // namespace ngram
} // namespace lm