2011-07-14 00:53:18 +04:00
# include "lm/bhiksha.hh"
# include "lm/config.hh"
2011-11-11 00:46:59 +04:00
# include "util/file.hh"
2011-07-14 00:53:18 +04:00
# include <limits>
namespace lm {
namespace ngram {
namespace trie {
DontBhiksha : : DontBhiksha ( const void * /*base*/ , uint64_t /*max_offset*/ , uint64_t max_next , const Config & /*config*/ ) :
next_ ( util : : BitsMask : : ByMax ( max_next ) ) { }
const uint8_t kArrayBhikshaVersion = 0 ;
2011-11-11 00:46:59 +04:00
// TODO: put this in binary file header instead when I change the binary file format again.
void ArrayBhiksha : : UpdateConfigFromBinary ( int fd , Config & config ) {
2011-07-14 00:53:18 +04:00
uint8_t version ;
uint8_t configured_bits ;
2011-11-11 00:46:59 +04:00
util : : ReadOrThrow ( fd , & version , 1 ) ;
util : : ReadOrThrow ( fd , & configured_bits , 1 ) ;
2011-07-14 00:53:18 +04:00
if ( version ! = kArrayBhikshaVersion ) UTIL_THROW ( FormatLoadException , " This file has sorted array compression version " < < ( unsigned ) version < < " but the code expects version " < < ( unsigned ) kArrayBhikshaVersion ) ;
config . pointer_bhiksha_bits = configured_bits ;
}
namespace {
// Find argmin_{chopped \in [0, RequiredBits(max_next)]} ChoppedDelta(max_offset)
uint8_t ChopBits ( uint64_t max_offset , uint64_t max_next , const Config & config ) {
uint8_t required = util : : RequiredBits ( max_next ) ;
uint8_t best_chop = 0 ;
int64_t lowest_change = std : : numeric_limits < int64_t > : : max ( ) ;
// There are probably faster ways but I don't care because this is only done once per order at construction time.
for ( uint8_t chop = 0 ; chop < = std : : min ( required , config . pointer_bhiksha_bits ) ; + + chop ) {
int64_t change = ( max_next > > ( required - chop ) ) * 64 /* table cost in bits */
- max_offset * static_cast < int64_t > ( chop ) ; /* savings in bits*/
if ( change < lowest_change ) {
lowest_change = change ;
best_chop = chop ;
}
}
return best_chop ;
}
std : : size_t ArrayCount ( uint64_t max_offset , uint64_t max_next , const Config & config ) {
uint8_t required = util : : RequiredBits ( max_next ) ;
uint8_t chopping = ChopBits ( max_offset , max_next , config ) ;
return ( max_next > > ( required - chopping ) ) + 1 /* we store 0 too */ ;
}
} // namespace
std : : size_t ArrayBhiksha : : Size ( uint64_t max_offset , uint64_t max_next , const Config & config ) {
2011-08-16 16:57:21 +04:00
return sizeof ( uint64_t ) * ( 1 /* header */ + ArrayCount ( max_offset , max_next , config ) ) + 7 /* 8-byte alignment */ ;
2011-07-14 00:53:18 +04:00
}
uint8_t ArrayBhiksha : : InlineBits ( uint64_t max_offset , uint64_t max_next , const Config & config ) {
return util : : RequiredBits ( max_next ) - ChopBits ( max_offset , max_next , config ) ;
}
namespace {
void * AlignTo8 ( void * from ) {
uint8_t * val = reinterpret_cast < uint8_t * > ( from ) ;
std : : size_t remainder = reinterpret_cast < std : : size_t > ( val ) & 7 ;
if ( ! remainder ) return val ;
return val + 8 - remainder ;
}
} // namespace
ArrayBhiksha : : ArrayBhiksha ( void * base , uint64_t max_offset , uint64_t max_next , const Config & config )
: next_inline_ ( util : : BitsMask : : ByBits ( InlineBits ( max_offset , max_next , config ) ) ) ,
offset_begin_ ( reinterpret_cast < const uint64_t * > ( AlignTo8 ( base ) ) + 1 /* 8-byte header */ ) ,
offset_end_ ( offset_begin_ + ArrayCount ( max_offset , max_next , config ) ) ,
write_to_ ( reinterpret_cast < uint64_t * > ( AlignTo8 ( base ) ) + 1 /* 8-byte header */ + 1 /* first entry is 0 */ ) ,
original_base_ ( base ) { }
void ArrayBhiksha : : FinishedLoading ( const Config & config ) {
// *offset_begin_ = 0 but without a const_cast.
* ( write_to_ - ( write_to_ - offset_begin_ ) ) = 0 ;
if ( write_to_ ! = offset_end_ ) UTIL_THROW ( util : : Exception , " Did not get all the array entries that were expected. " ) ;
uint8_t * head_write = reinterpret_cast < uint8_t * > ( original_base_ ) ;
* ( head_write + + ) = kArrayBhikshaVersion ;
* ( head_write + + ) = config . pointer_bhiksha_bits ;
}
void ArrayBhiksha : : LoadedBinary ( ) {
}
} // namespace trie
} // namespace ngram
} // namespace lm