#ifndef LM_MODEL__
#define LM_MODEL__

#include "lm/bhiksha.hh"
#include "lm/binary_format.hh"
#include "lm/config.hh"
#include "lm/facade.hh"
#include "lm/max_order.hh"
#include "lm/quantize.hh"
#include "lm/search_hashed.hh"
#include "lm/search_trie.hh"
#include "lm/vocab.hh"
#include "lm/weights.hh"

#include "util/murmur_hash.hh"
#include "util/portability.hh"

#include <algorithm>
#include <vector>

#include <string.h>

namespace util { class FilePiece; }

namespace lm {
namespace ngram {

// This is a POD but if you want memcmp to return the same as operator==, call
// ZeroRemaining first.
class State {
  public:
    bool operator==(const State &other) const {
      if (length != other.length) return false;
      return !memcmp(words, other.words, length * sizeof(WordIndex));
    }

    // Three-way comparison function.
    int Compare(const State &other) const {
      if (length != other.length) return length < other.length ? -1 : 1;
      return memcmp(words, other.words, length * sizeof(WordIndex));
    }

    bool operator<(const State &other) const {
      if (length != other.length) return length < other.length;
      return memcmp(words, other.words, length * sizeof(WordIndex)) < 0;
    }

    // Call this before using raw memcmp.
    void ZeroRemaining() {
      for (unsigned char i = length; i < kMaxOrder - 1; ++i) {
        words[i] = 0;
        backoff[i] = 0.0;
      }
    }

    unsigned char Length() const { return length; }

    // You shouldn't need to touch anything below this line, but the members
    // are public so State will qualify as a POD.
    // This order minimizes total size of the struct if WordIndex is 64 bit,
    // float is 32 bit, and alignment of 64 bit integers is 64 bit.
    WordIndex words[kMaxOrder - 1];
    float backoff[kMaxOrder - 1];
    unsigned char length;
};

inline size_t hash_value(const State &state) {
  return util::MurmurHashNative(state.words, sizeof(WordIndex) * state.length);
}
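
/* Usage note (illustrative sketch, not part of this header): hash_value is
 * found by argument-dependent lookup, e.g. by boost::hash, so State can key a
 * hash table.  If you instead compare raw State memory, zero the unused tail
 * first so padding past `length` compares equal:
 *
 *   State a, b;
 *   // ... fill a and b via FullScore or GetState ...
 *   a.ZeroRemaining();
 *   b.ZeroRemaining();
 *   bool same = (memcmp(&a, &b, sizeof(State)) == 0);
 */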

namespace detail {

// Should return the same results as SRI.
// ModelFacade typedefs Vocabulary so we use VocabularyT to avoid naming conflicts.
template <class Search, class VocabularyT> class GenericModel : public base::ModelFacade<GenericModel<Search, VocabularyT>, State, VocabularyT> {
  private:
    typedef base::ModelFacade<GenericModel<Search, VocabularyT>, State, VocabularyT> P;

  public:
    // This is the model type returned by RecognizeBinary.
    static const ModelType kModelType;

    // Binary format version, provided by the search structure.
    static const unsigned int kVersion = Search::kVersion;

    /* Get the size of memory that will be mapped given ngram counts.  This
     * does not include small non-mapped control structures, such as this
     * class itself.
     */
    static size_t Size(const std::vector<uint64_t> &counts, const Config &config = Config());
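
    /* Illustrative sketch (not part of the API): counts usually come from an
     * ARPA header, e.g. via ReadARPACounts in lm/read_arpa.hh:
     *
     *   std::vector<uint64_t> counts;
     *   // counts[0] = number of unigrams, counts[1] = bigrams, ...
     *   std::size_t mapped = ProbingModel::Size(counts, config);
     *
     * (ProbingModel is one of the instantiations declared below.)
     */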

    /* Load the model from a file.  It may be an ARPA or binary file.  Binary
     * files must have the format expected by this class or you'll get an
     * exception.  So TrieModel can only load an ARPA file or a binary file
     * created by TrieModel.  To classify binary files, call RecognizeBinary
     * in lm/binary_format.hh.
     */
    GenericModel(const char *file, const Config &config = Config());
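
    /* Illustrative sketch (hypothetical file names): the same class reads
     * both formats, so these are equivalent entry points:
     *
     *   Config config;
     *   ProbingModel from_arpa("lm.arpa", config);     // text ARPA
     *   ProbingModel from_binary("lm.binary", config); // binary built by ProbingModel
     *
     * Loading a binary file built by a different class throws an exception.
     */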

    /* Score p(new_word | in_state) and incorporate new_word into out_state.
     * Note that in_state and out_state must be different references:
     * &in_state != &out_state.
     */
    FullScoreReturn FullScore(const State &in_state, const WordIndex new_word, State &out_state) const;
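
    /* Illustrative sketch: score a sentence left to right, assuming `model`
     * is a loaded instance and `words` is a std::vector<std::string>.
     * BeginSentenceState and GetVocabulary are inherited from ModelFacade.
     *
     *   State state(model.BeginSentenceState()), out;
     *   float total = 0.0;
     *   for (size_t i = 0; i < words.size(); ++i) {
     *     total += model.FullScore(state, model.GetVocabulary().Index(words[i]), out).prob;
     *     state = out;
     *   }
     */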

    /* Slower call without in_state.  Try to remember state instead; use this
     * when caching states would cost too much memory or your decoder isn't
     * set up to pass them.
     * To use this function, make an array of WordIndex containing the context
     * vocabulary ids in reverse order.  Then, pass the bounds of the array:
     * [context_rbegin, context_rend).  The new_word is not part of the
     * context array unless you intend to repeat words.
     */
    FullScoreReturn FullScoreForgotState(const WordIndex *context_rbegin, const WordIndex *context_rend, const WordIndex new_word, State &out_state) const;
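
    /* Illustrative sketch: for the context "the quick" followed by the new
     * word "fox", the context array is reversed (most recent word first):
     *
     *   WordIndex context[2];
     *   context[0] = model.GetVocabulary().Index("quick");
     *   context[1] = model.GetVocabulary().Index("the");
     *   State out;
     *   FullScoreReturn ret(model.FullScoreForgotState(
     *       context, context + 2, model.GetVocabulary().Index("fox"), out));
     */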

    /* Get the state for a context.  Don't use this if you can avoid it.  Use
     * BeginSentenceState or EmptyContextState and extend from those.  If
     * you're only going to use this state to call FullScore once, use
     * FullScoreForgotState.
     * To use this function, make an array of WordIndex containing the context
     * vocabulary ids in reverse order.  Then, pass the bounds of the array:
     * [context_rbegin, context_rend).
     */
    void GetState(const WordIndex *context_rbegin, const WordIndex *context_rend, State &out_state) const;

    /* More efficient version of FullScore where a partial n-gram has already
     * been scored.
     * NOTE: THE RETURNED .prob IS RELATIVE, NOT ABSOLUTE.  So for example, if
     * the n-gram does not end up extending further left, then 0 is returned.
     */
    FullScoreReturn ExtendLeft(
        // Additional context in reverse order.
        const WordIndex *add_rbegin, const WordIndex *add_rend,
        // Backoff weights to use.
        const float *backoff_in,
        // extend_left returned by a previous query.
        uint64_t extend_pointer,
        // Length of n-gram that the pointer corresponds to.
        unsigned char extend_length,
        // Where to write additional backoffs for [extend_length + 1, min(Order() - 1, return.ngram_length)].
        float *backoff_out,
        // Amount of additional context that should be considered by the next call.
        unsigned char &next_use) const;

  private:
    friend void lm::ngram::LoadLM<>(const char *file, const Config &config, GenericModel<Search, VocabularyT> &to);

    static void UpdateConfigFromBinary(FD fd, const std::vector<uint64_t> &counts, Config &config) {
      AdvanceOrThrow(fd, VocabularyT::Size(counts[0], config));
      Search::UpdateConfigFromBinary(fd, counts, config);
    }

    float SlowBackoffLookup(const WordIndex *const context_rbegin, const WordIndex *const context_rend, unsigned char start) const;

    FullScoreReturn ScoreExceptBackoff(const WordIndex *context_rbegin, const WordIndex *context_rend, const WordIndex new_word, State &out_state) const;

    // Appears after Size in the cc file.
    void SetupMemory(void *start, const std::vector<uint64_t> &counts, const Config &config);

    void InitializeFromBinary(void *start, const Parameters &params, const Config &config, FD fd);

    void InitializeFromARPA(const char *file, const Config &config);

    Backing &MutableBacking() { return backing_; }

    Backing backing_;

    VocabularyT vocab_;

    typedef typename Search::Middle Middle;

    Search search_;
};

} // namespace detail

// These must also be instantiated in the cc file.
typedef ::lm::ngram::ProbingVocabulary Vocabulary;
typedef detail::GenericModel<detail::ProbingHashedSearch, Vocabulary> ProbingModel; // HASH_PROBING
// Default implementation.  No real reason for it to be the default.
typedef ProbingModel Model;

// Smaller implementations.
typedef ::lm::ngram::SortedVocabulary SortedVocabulary;
typedef detail::GenericModel<trie::TrieSearch<DontQuantize, trie::DontBhiksha>, SortedVocabulary> TrieModel; // TRIE_SORTED
typedef detail::GenericModel<trie::TrieSearch<DontQuantize, trie::ArrayBhiksha>, SortedVocabulary> ArrayTrieModel;
typedef detail::GenericModel<trie::TrieSearch<SeparatelyQuantize, trie::DontBhiksha>, SortedVocabulary> QuantTrieModel; // QUANT_TRIE_SORTED
typedef detail::GenericModel<trie::TrieSearch<SeparatelyQuantize, trie::ArrayBhiksha>, SortedVocabulary> QuantArrayTrieModel;
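
/* Illustrative sketch: choosing among these typedefs by the ModelType that
 * RecognizeBinary (lm/binary_format.hh) reports for a binary file.
 *
 *   ModelType type;
 *   if (RecognizeBinary(file, type)) {
 *     switch (type) {
 *       case HASH_PROBING: // use ProbingModel
 *         break;
 *       case TRIE_SORTED:  // use TrieModel
 *         break;
 *       default:           // quantized / bhiksha variants
 *         break;
 *     }
 *   } else {
 *     // Not binary; any of the model classes can read the ARPA file.
 *   }
 */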

} // namespace ngram
} // namespace lm

#endif // LM_MODEL__