// Mirror of https://github.com/moses-smt/mosesdecoder.git
// (synced 2025-01-06 11:38:34 +03:00; 107 lines, 3.4 KiB, C++)
#ifndef LM_BINARY_FORMAT_H
|
|
#define LM_BINARY_FORMAT_H
|
|
|
|
#include "lm/config.hh"
|
|
#include "lm/model_type.hh"
|
|
#include "lm/read_arpa.hh"
|
|
|
|
#include "util/file_piece.hh"
|
|
#include "util/mmap.hh"
|
|
#include "util/scoped.hh"
|
|
|
|
#include <cstddef>
|
|
#include <vector>
|
|
|
|
#include <stdint.h>
|
|
|
|
namespace lm {
|
|
namespace ngram {
|
|
|
|
// Array of six C strings, declared here and defined outside this header.
// NOTE(review): presumably one printable name per ModelType value -- confirm
// against the definition.
extern const char *kModelNames[6];
|
|
|
|
/*Inspect a file to determine if it is a binary lm. If not, return false.
|
|
* If so, return true and set recognized to the type. This is the only API in
|
|
* this header designed for use by decoder authors.
|
|
*/
|
|
bool RecognizeBinary(const char *file, ModelType &recognized);
|
|
|
|
struct FixedWidthParameters {
|
|
unsigned char order;
|
|
float probing_multiplier;
|
|
// What type of model is this?
|
|
ModelType model_type;
|
|
// Does the end of the file have the actual strings in the vocabulary?
|
|
bool has_vocabulary;
|
|
unsigned int search_version;
|
|
};
|
|
|
|
// This is a macro instead of an inline function so constants can be assigned using it.
|
|
// For a >= 1, yields the smallest multiple of 8 that is >= a, as a std::ptrdiff_t.
#define ALIGN8(a) (8 * (1 + std::ptrdiff_t(((a) - 1) / 8)))
|
|
|
|
// Parameters stored in the header of a binary file.
|
|
struct Parameters {
|
|
FixedWidthParameters fixed;
|
|
std::vector<uint64_t> counts;
|
|
};
|
|
|
|
class BinaryFormat {
|
|
public:
|
|
explicit BinaryFormat(const Config &config);
|
|
|
|
// Reading a binary file:
|
|
// Takes ownership of fd
|
|
void InitializeBinary(int fd, ModelType model_type, unsigned int search_version, Parameters ¶ms);
|
|
// Used to read parts of the file to update the config object before figuring out full size.
|
|
void ReadForConfig(void *to, std::size_t amount, uint64_t offset_excluding_header) const;
|
|
// Actually load the binary file and return a pointer to the beginning of the search area.
|
|
void *LoadBinary(std::size_t size);
|
|
|
|
uint64_t VocabStringReadingOffset() const {
|
|
assert(vocab_string_offset_ != kInvalidOffset);
|
|
return vocab_string_offset_;
|
|
}
|
|
|
|
// Writing a binary file or initializing in RAM from ARPA:
|
|
// Size for vocabulary.
|
|
void *SetupJustVocab(std::size_t memory_size, uint8_t order);
|
|
// Warning: can change the vocaulary base pointer.
|
|
void *GrowForSearch(std::size_t memory_size, std::size_t vocab_pad, void *&vocab_base);
|
|
// Warning: can change vocabulary and search base addresses.
|
|
void WriteVocabWords(const std::string &buffer, void *&vocab_base, void *&search_base);
|
|
// Write the header at the beginning of the file.
|
|
void FinishFile(const Config &config, ModelType model_type, unsigned int search_version, const std::vector<uint64_t> &counts);
|
|
|
|
private:
|
|
void MapFile(void *&vocab_base, void *&search_base);
|
|
|
|
// Copied from configuration.
|
|
const Config::WriteMethod write_method_;
|
|
const char *write_mmap_;
|
|
util::LoadMethod load_method_;
|
|
|
|
// File behind memory, if any.
|
|
util::scoped_fd file_;
|
|
|
|
// If there is a file involved, a single mapping.
|
|
util::scoped_memory mapping_;
|
|
|
|
// If the data is only in memory, separately allocate each because the trie
|
|
// knows vocab's size before it knows search's size (because SRILM might
|
|
// have pruned).
|
|
util::scoped_memory memory_vocab_, memory_search_;
|
|
|
|
// Memory ranges. Note that these may not be contiguous and may not all
|
|
// exist.
|
|
std::size_t header_size_, vocab_size_, vocab_pad_;
|
|
// aka end of search.
|
|
uint64_t vocab_string_offset_;
|
|
|
|
static const uint64_t kInvalidOffset = (uint64_t)-1;
|
|
};
|
|
|
|
// NOTE(review): presumably reports whether the file open on descriptor fd is a
// binary language model; the definition is not in this header -- confirm there,
// including whether fd's ownership or file position is affected.
bool IsBinaryFormat(int fd);
|
|
|
|
} // namespace ngram
|
|
} // namespace lm
|
|
#endif // LM_BINARY_FORMAT_H
|