2014-06-02 21:28:02 +04:00
|
|
|
#ifndef LM_CONFIG_H
|
|
|
|
#define LM_CONFIG_H
|
2010-09-15 01:33:11 +04:00
|
|
|
|
2011-05-17 20:43:05 +04:00
|
|
|
#include "lm/lm_exception.hh"
|
2010-11-06 03:40:16 +03:00
|
|
|
#include "util/mmap.hh"
|
|
|
|
|
2012-06-28 18:58:59 +04:00
|
|
|
#include <iosfwd>
|
|
|
|
#include <string>
|
|
|
|
#include <vector>
|
|
|
|
|
2010-10-27 21:50:40 +04:00
|
|
|
/* Configuration for ngram model. Separate header to reduce pollution. */
|
2010-09-15 01:33:11 +04:00
|
|
|
|
2011-10-12 14:18:23 +04:00
|
|
|
namespace lm {
|
2013-01-05 01:02:47 +04:00
|
|
|
|
2010-10-27 21:50:40 +04:00
|
|
|
// Forward declaration only: Config holds just a pointer to this type, so the
// full definition (see enumerate_vocab.hh) is not required in this header.
class EnumerateVocab;
|
|
|
|
|
2011-10-12 14:18:23 +04:00
|
|
|
namespace ngram {
|
|
|
|
|
2010-09-15 01:33:11 +04:00
|
|
|
struct Config {
|
2013-01-05 01:02:47 +04:00
|
|
|
// EFFECTIVE FOR BOTH ARPA AND BINARY READS
|
|
|
|
|
|
|
|
// (default true) print progress bar to messages
|
|
|
|
bool show_progress;
|
2010-10-27 21:50:40 +04:00
|
|
|
|
2010-09-15 01:33:11 +04:00
|
|
|
// Where to log messages including the progress bar. Set to NULL for
|
|
|
|
// silence.
|
|
|
|
std::ostream *messages;
|
|
|
|
|
2013-01-05 01:02:47 +04:00
|
|
|
std::ostream *ProgressMessages() const {
|
|
|
|
return show_progress ? messages : 0;
|
|
|
|
}
|
|
|
|
|
2015-03-25 17:40:21 +03:00
|
|
|
// This will be called with every string in the vocabulary by the
|
|
|
|
// constructor; it need only exist for the lifetime of the constructor.
|
|
|
|
// See enumerate_vocab.hh for more detail. Config does not take ownership;
|
|
|
|
// just delete/let it go out of scope after the constructor exits.
|
2010-10-27 21:50:40 +04:00
|
|
|
EnumerateVocab *enumerate_vocab;
|
|
|
|
|
2010-09-15 01:33:11 +04:00
|
|
|
|
2010-10-27 21:50:40 +04:00
|
|
|
// ONLY EFFECTIVE WHEN READING ARPA
|
2010-09-15 01:33:11 +04:00
|
|
|
|
2013-01-05 01:02:47 +04:00
|
|
|
// What to do when <unk> isn't in the provided model.
|
2011-02-24 22:37:39 +03:00
|
|
|
WarningAction unknown_missing;
|
2013-01-05 01:02:47 +04:00
|
|
|
// What to do when <s> or </s> is missing from the model.
|
|
|
|
// If THROW_UP, the exception will be of type util::SpecialWordMissingException.
|
2011-02-24 22:37:39 +03:00
|
|
|
WarningAction sentence_marker_missing;
|
2010-09-15 01:33:11 +04:00
|
|
|
|
2011-05-23 06:23:01 +04:00
|
|
|
// What to do with a positive log probability. For COMPLAIN and SILENT, map
|
2013-01-05 01:02:47 +04:00
|
|
|
// to 0.
|
2011-05-17 20:43:05 +04:00
|
|
|
WarningAction positive_log_probability;
|
|
|
|
|
2013-01-05 01:02:47 +04:00
|
|
|
// The probability to substitute for <unk> if it's missing from the model.
|
2010-09-15 01:33:11 +04:00
|
|
|
// No effect if the model has <unk> or unknown_missing == THROW_UP.
|
2011-03-21 17:40:21 +03:00
|
|
|
float unknown_missing_logprob;
|
2010-09-15 01:33:11 +04:00
|
|
|
|
|
|
|
// Size multiplier for probing hash table. Must be > 1. Space is linear in
|
|
|
|
// this. Time is probing_multiplier / (probing_multiplier - 1). No effect
|
2013-01-05 01:02:47 +04:00
|
|
|
// for sorted variant.
|
2010-09-15 01:33:11 +04:00
|
|
|
// If you find yourself setting this to a low number, consider using the
|
2013-01-05 01:02:47 +04:00
|
|
|
// TrieModel which has lower memory consumption.
|
2010-09-15 01:33:11 +04:00
|
|
|
float probing_multiplier;
|
|
|
|
|
2010-10-27 21:50:40 +04:00
|
|
|
// Amount of memory to use for building. The actual memory usage will be
|
|
|
|
// higher since this just sets sort buffer size. Only applies to trie
|
|
|
|
// models.
|
|
|
|
std::size_t building_memory;
|
|
|
|
|
2013-01-05 01:02:47 +04:00
|
|
|
// Template for temporary directory appropriate for passing to mkdtemp.
|
2010-10-27 21:50:40 +04:00
|
|
|
// The characters XXXXXX are appended before passing to mkdtemp. Only
|
2015-01-22 19:42:46 +03:00
|
|
|
// applies to trie. If empty, defaults to write_mmap. If that's NULL,
|
2013-01-05 01:02:47 +04:00
|
|
|
// defaults to input file name.
|
2015-01-22 19:42:46 +03:00
|
|
|
std::string temporary_directory_prefix;
|
2010-10-27 21:50:40 +04:00
|
|
|
|
2011-01-25 22:11:48 +03:00
|
|
|
// Level of complaining to do when loading from ARPA instead of binary format.
|
2012-06-28 18:58:59 +04:00
|
|
|
enum ARPALoadComplain {ALL, EXPENSIVE, NONE};
|
2010-10-27 21:50:40 +04:00
|
|
|
ARPALoadComplain arpa_complain;
|
|
|
|
|
2010-09-15 01:33:11 +04:00
|
|
|
// While loading an ARPA file, also write out this binary format file. Set
|
2013-01-05 01:02:47 +04:00
|
|
|
// to NULL to disable.
|
2010-09-15 01:33:11 +04:00
|
|
|
const char *write_mmap;
|
|
|
|
|
2012-06-28 18:58:59 +04:00
|
|
|
enum WriteMethod {
|
2013-01-05 01:02:47 +04:00
|
|
|
WRITE_MMAP, // Map the file directly.
|
|
|
|
WRITE_AFTER // Write after we're done.
|
2012-06-28 18:58:59 +04:00
|
|
|
};
|
2012-02-28 22:58:00 +04:00
|
|
|
WriteMethod write_method;
|
|
|
|
|
2013-01-05 01:02:47 +04:00
|
|
|
// Include the vocab in the binary file? Only effective if write_mmap != NULL.
|
2010-10-27 21:50:40 +04:00
|
|
|
bool include_vocab;
|
|
|
|
|
2012-02-28 22:58:00 +04:00
|
|
|
|
2013-01-05 01:02:47 +04:00
|
|
|
// Left rest options. Only used when the model includes rest costs.
|
2012-06-28 18:58:59 +04:00
|
|
|
enum RestFunction {
|
|
|
|
REST_MAX, // Maximum of any score to the left
|
2013-01-05 01:02:47 +04:00
|
|
|
REST_LOWER, // Use lower-order files given below.
|
2012-06-28 18:58:59 +04:00
|
|
|
};
|
|
|
|
RestFunction rest_function;
|
2013-01-05 01:02:47 +04:00
|
|
|
// Only used for REST_LOWER.
|
2012-06-28 18:58:59 +04:00
|
|
|
std::vector<std::string> rest_lower_files;
|
|
|
|
|
|
|
|
|
2011-06-27 02:21:44 +04:00
|
|
|
// Quantization options. Only effective for QuantTrieModel. One value is
|
|
|
|
// reserved for each of prob and backoff, so 2^bits - 1 buckets will be used
|
2013-01-05 01:02:47 +04:00
|
|
|
// to quantize (and one of the remaining backoffs will be 0).
|
2011-06-27 02:21:44 +04:00
|
|
|
uint8_t prob_bits, backoff_bits;
|
|
|
|
|
2011-07-14 00:53:18 +04:00
|
|
|
// Bhiksha compression (simple form). Only works with trie.
|
|
|
|
uint8_t pointer_bhiksha_bits;
|
|
|
|
|
2013-01-05 01:02:47 +04:00
|
|
|
|
2010-10-27 21:50:40 +04:00
|
|
|
// ONLY EFFECTIVE WHEN READING BINARY
|
2013-01-05 01:02:47 +04:00
|
|
|
|
2010-11-06 03:40:16 +03:00
|
|
|
// How to get the giant array into memory: lazy mmap, populate, read etc.
|
2013-01-05 01:02:47 +04:00
|
|
|
// See util/mmap.hh for details of MapMethod.
|
2010-11-06 03:40:16 +03:00
|
|
|
util::LoadMethod load_method;
|
2010-09-15 01:33:11 +04:00
|
|
|
|
|
|
|
|
2013-01-05 01:02:47 +04:00
|
|
|
// Set defaults.
|
2010-10-27 21:50:40 +04:00
|
|
|
Config();
|
2010-09-15 01:33:11 +04:00
|
|
|
};
|
|
|
|
|
|
|
|
} /* namespace ngram */ } /* namespace lm */
|
|
|
|
|
2014-06-02 21:28:02 +04:00
|
|
|
#endif // LM_CONFIG_H
|