2010-10-27 21:50:40 +04:00
# ifndef LM_BINARY_FORMAT__
# define LM_BINARY_FORMAT__
2010-11-06 03:40:16 +03:00
# include "lm/config.hh"
2011-09-21 20:06:48 +04:00
# include "lm/model_type.hh"
2010-10-27 21:50:40 +04:00
# include "lm/read_arpa.hh"
# include "util/file_piece.hh"
# include "util/mmap.hh"
# include "util/scoped.hh"
# include <cstddef>
# include <vector>
2011-11-12 02:39:27 +04:00
# include <stdint.h>
2010-10-27 21:50:40 +04:00
namespace lm {
namespace ngram {
2011-04-19 19:17:01 +04:00
// Check whether `file` is a binary language model.
//   - Not a binary lm: returns false.
//   - Binary lm: returns true and stores the model's type in `recognized`.
// Of everything declared in this header, this is the only API designed for
// use by decoder authors.
bool RecognizeBinary(const char *file, ModelType &recognized);
2010-10-27 21:50:40 +04:00
// The fixed-width part of the parameters kept in a binary file's header
// (see Parameters, which pairs this with a variable-length list of counts).
// NOTE(review): presumably serialized as-is, so member order and types should
// not be changed without confirming against the reader/writer code.
struct FixedWidthParameters {
  unsigned char order;
  // LoadLM substitutes this value for the run-time configured
  // probing_multiplier when loading a binary file.
  float probing_multiplier;
  // Which kind of model the file holds.
  ModelType model_type;
  // Whether the actual vocabulary strings are stored at the end of the file.
  bool has_vocabulary;
  // Passed alongside the model type to detail::MatchCheck by LoadLM.
  unsigned int search_version;
};
2011-11-11 00:46:59 +04:00
// Round a up to the nearest multiple of 8.  A value that is already a multiple
// of 8 -- including 0 -- is returned unchanged.  The result has type
// std::ptrdiff_t.  Intended for non-negative sizes.
//
// This is a macro instead of an inline function so constants can be assigned
// using it.
//
// The argument is converted to std::ptrdiff_t before any arithmetic and the
// rounding is done by adding 7 rather than subtracting 1: subtracting 1 from
// an unsigned 0 wraps around, and with a signed 0 the truncating division of
// -1 yields 0, which made the old form report 8 for an empty size.  For every
// a >= 1 the result is the same as before.
#define ALIGN8(a) (((std::ptrdiff_t(a) + 7) / 8) * 8)
2011-09-21 20:06:48 +04:00
2011-04-19 19:17:01 +04:00
// Parameters stored in the header of a binary file.
2010-10-27 21:50:40 +04:00
struct Parameters {
FixedWidthParameters fixed ;
std : : vector < uint64_t > counts ;
} ;
// Handles for the file and memory regions that sit behind a loaded model.
struct Backing {
  // The file behind the memory, when there is one.
  util::scoped_fd file;

  // The vocabulary lookup table.  This is distinct from the vocabulary words
  // themselves.
  util::scoped_memory vocab;

  // The raw block of memory that backs the language model data structures.
  util::scoped_memory search;
};
2011-04-19 19:17:01 +04:00
// Create just enough of a binary file to write vocabulary to it.
2011-01-25 22:11:48 +03:00
uint8_t * SetupJustVocab ( const Config & config , uint8_t order , std : : size_t memory_size , Backing & backing ) ;
// Grow the binary file for the search data structure and set backing.search, returning the memory address where the search data structure should begin.
2011-08-16 16:57:21 +04:00
uint8_t * GrowForSearch ( const Config & config , std : : size_t vocab_pad , std : : size_t memory_size , Backing & backing ) ;
2011-02-24 20:11:53 +03:00
2011-04-19 19:17:01 +04:00
// Write header to binary file. This is done last to prevent incomplete files
// from loading.
2011-09-21 20:06:48 +04:00
void FinishFile ( const Config & config , ModelType model_type , unsigned int search_version , const std : : vector < uint64_t > & counts , Backing & backing ) ;
2011-01-25 22:11:48 +03:00
2010-10-27 21:50:40 +04:00
namespace detail {
2011-11-11 00:46:59 +04:00
bool IsBinaryFormat ( int fd ) ;
2010-10-27 21:50:40 +04:00
2011-11-11 00:46:59 +04:00
void ReadHeader ( int fd , Parameters & params ) ;
2010-10-28 05:05:04 +04:00
2011-09-21 20:06:48 +04:00
void MatchCheck ( ModelType model_type , unsigned int search_version , const Parameters & params ) ;
2010-10-27 21:50:40 +04:00
2011-11-11 00:46:59 +04:00
void SeekPastHeader ( int fd , const Parameters & params ) ;
2011-06-27 02:21:44 +04:00
2010-10-27 21:50:40 +04:00
uint8_t * SetupBinary ( const Config & config , const Parameters & params , std : : size_t memory_size , Backing & backing ) ;
void ComplainAboutARPA ( const Config & config , ModelType model_type ) ;
} // namespace detail
template < class To > void LoadLM ( const char * file , const Config & config , To & to ) {
Backing & backing = to . MutableBacking ( ) ;
backing . file . reset ( util : : OpenReadOrThrow ( file ) ) ;
try {
if ( detail : : IsBinaryFormat ( backing . file . get ( ) ) ) {
2011-01-25 22:11:48 +03:00
Parameters params ;
2010-10-28 05:05:04 +04:00
detail : : ReadHeader ( backing . file . get ( ) , params ) ;
2011-09-21 20:06:48 +04:00
detail : : MatchCheck ( To : : kModelType , To : : kVersion , params ) ;
2011-01-25 22:11:48 +03:00
// Replace the run-time configured probing_multiplier with the one in the file.
2010-12-08 06:15:37 +03:00
Config new_config ( config ) ;
new_config . probing_multiplier = params . fixed . probing_multiplier ;
2011-06-27 02:21:44 +04:00
detail : : SeekPastHeader ( backing . file . get ( ) , params ) ;
To : : UpdateConfigFromBinary ( backing . file . get ( ) , params . counts , new_config ) ;
2010-12-08 06:15:37 +03:00
std : : size_t memory_size = To : : Size ( params . counts , new_config ) ;
uint8_t * start = detail : : SetupBinary ( new_config , params , memory_size , backing ) ;
to . InitializeFromBinary ( start , params , new_config , backing . file . get ( ) ) ;
2010-10-27 21:50:40 +04:00
} else {
detail : : ComplainAboutARPA ( config , To : : kModelType ) ;
2011-01-25 22:11:48 +03:00
to . InitializeFromARPA ( file , config ) ;
2010-10-27 21:50:40 +04:00
}
} catch ( util : : Exception & e ) {
2011-02-24 22:37:39 +03:00
e < < " File: " < < file ;
2010-10-27 21:50:40 +04:00
throw ;
}
}
} // namespace ngram
} // namespace lm
# endif // LM_BINARY_FORMAT__