2010-11-06 03:40:16 +03:00
|
|
|
#ifndef LM_MODEL__
|
|
|
|
#define LM_MODEL__
|
2010-09-10 04:36:07 +04:00
|
|
|
|
2010-10-27 21:50:40 +04:00
|
|
|
#include "lm/binary_format.hh"
|
2010-11-06 03:40:16 +03:00
|
|
|
#include "lm/config.hh"
|
2010-09-28 20:26:55 +04:00
|
|
|
#include "lm/facade.hh"
|
2010-11-06 03:40:16 +03:00
|
|
|
#include "lm/search_hashed.hh"
|
|
|
|
#include "lm/search_trie.hh"
|
2010-09-28 20:26:55 +04:00
|
|
|
#include "lm/vocab.hh"
|
|
|
|
#include "lm/weights.hh"
|
2010-09-10 04:36:07 +04:00
|
|
|
|
|
|
|
#include <algorithm>
|
|
|
|
#include <vector>
|
|
|
|
|
|
|
|
// Forward declaration only: in this header util::FilePiece appears solely as a reference parameter of GenericModel::InitializeFromARPA.
namespace util { class FilePiece; }
|
|
|
|
|
|
|
|
namespace lm {
|
|
|
|
namespace ngram {
|
|
|
|
|
|
|
|
// If you need higher order, change this and recompile.
|
|
|
|
// Having this limit means that State can be
|
|
|
|
// (kMaxOrder - 1) * sizeof(float) bytes instead of
|
|
|
|
// sizeof(float*) + (kMaxOrder - 1) * sizeof(float) + malloc overhead
|
|
|
|
// Compile-time cap on n-gram order; State::history_ and State::backoff_ are each sized kMaxOrder - 1.
const std::size_t kMaxOrder = 6;
|
|
|
|
|
|
|
|
// This is a POD.
|
|
|
|
class State {
|
|
|
|
public:
|
|
|
|
bool operator==(const State &other) const {
|
|
|
|
if (valid_length_ != other.valid_length_) return false;
|
|
|
|
const WordIndex *end = history_ + valid_length_;
|
|
|
|
for (const WordIndex *first = history_, *second = other.history_;
|
|
|
|
first != end; ++first, ++second) {
|
|
|
|
if (*first != *second) return false;
|
|
|
|
}
|
|
|
|
// If the histories are equal, so are the backoffs.
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
// You shouldn't need to touch anything below this line, but the members are public so FullState will qualify as a POD.
|
2010-09-15 01:33:11 +04:00
|
|
|
// This order minimizes total size of the struct if WordIndex is 64 bit, float is 32 bit, and alignment of 64 bit integers is 64 bit.
|
2010-09-10 04:36:07 +04:00
|
|
|
WordIndex history_[kMaxOrder - 1];
|
2010-09-15 01:33:11 +04:00
|
|
|
float backoff_[kMaxOrder - 1];
|
|
|
|
unsigned char valid_length_;
|
2010-09-10 04:36:07 +04:00
|
|
|
};
|
|
|
|
|
2010-09-16 23:53:33 +04:00
|
|
|
// Hash of a State; declared here only. NOTE(review): presumably covers the same fields operator== compares (history_ up to valid_length_) — confirm against the definition.
size_t hash_value(const State &state);
|
2010-09-10 04:36:07 +04:00
|
|
|
|
2010-09-15 01:33:11 +04:00
|
|
|
namespace detail {
|
|
|
|
|
|
|
|
// Should return the same results as SRI.
|
|
|
|
// Why VocabularyT instead of just Vocabulary? ModelFacade defines Vocabulary.
|
|
|
|
// Language model parameterized on the n-gram lookup structure (Search) and
// the vocabulary implementation (VocabularyT).  It derives from
// base::ModelFacade via CRTP, with State as the state type.  Member
// functions that are only declared here are defined in the cc file, which
// must also explicitly instantiate every Search/VocabularyT combination used.
template <class Search, class VocabularyT> class GenericModel : public base::ModelFacade<GenericModel<Search, VocabularyT>, State, VocabularyT> {
  private:
    // Shorthand for the facade base class.
    typedef base::ModelFacade<GenericModel<Search, VocabularyT>, State, VocabularyT> P;

  public:
    // Get the size of memory that will be mapped given ngram counts.  This
    // does not include small non-mapped control structures, such as this class
    // itself.
    static size_t Size(const std::vector<uint64_t> &counts, const Config &config = Config());

    // Build a model from `file` using `config` (default-constructed Config if
    // omitted).
    // NOTE(review): the InitializeFromBinary / InitializeFromARPA hooks below
    // suggest both binary and ARPA inputs are handled -- confirm in the cc file.
    GenericModel(const char *file, const Config &config = Config());

    // Score new_word given in_state and write the follow-on state to
    // out_state.  This is the normal (fast) entry point; see
    // FullScoreForgotState for the variant that takes no in_state.
    FullScoreReturn FullScore(const State &in_state, const WordIndex new_word, State &out_state) const;

    /* Slower call without in_state.  Don't use this if you can avoid it.  This
     * is mostly a hack for Hieu to integrate it into Moses which sometimes
     * forgets LM state (i.e. it doesn't store it with the phrase).  Sigh.
     * The context indices should be in an array.
     * If context_rbegin != context_rend then *context_rbegin is the word
     * before new_word.
     */
    FullScoreReturn FullScoreForgotState(const WordIndex *context_rbegin, const WordIndex *context_rend, const WordIndex new_word, State &out_state) const;

    /* Get the state for a context.  Don't use this if you can avoid it.  Use
     * BeginSentenceState or EmptyContextState and extend from those.  If
     * you're only going to use this state to call FullScore once, use
     * FullScoreForgotState. */
    void GetState(const WordIndex *context_rbegin, const WordIndex *context_rend, State &out_state) const;

  private:
    // LoadLM is granted access to the private members below.
    // NOTE(review): presumably so it can drive the Initialize*/SetupMemory
    // hooks and MutableBacking -- confirm at its definition.
    friend void LoadLM<>(const char *file, const Config &config, GenericModel<Search, VocabularyT> &to);

    // Internal scoring helpers; declared here, defined in the cc file.
    float SlowBackoffLookup(const WordIndex *const context_rbegin, const WordIndex *const context_rend, unsigned char start) const;

    FullScoreReturn ScoreExceptBackoff(const WordIndex *context_rbegin, const WordIndex *context_rend, const WordIndex new_word, unsigned char &backoff_start, State &out_state) const;

    // Appears after Size in the cc file.
    void SetupMemory(void *start, const std::vector<uint64_t> &counts, const Config &config);

    // `params` is taken by const reference.  The scraped source showed
    // "¶ms" here: "&para" had been decoded as an HTML entity.
    void InitializeFromBinary(void *start, const Parameters &params, const Config &config, int fd);

    void InitializeFromARPA(const char *file, util::FilePiece &f, void *start, const Parameters &params, const Config &config);

    // Non-const access to backing_.
    Backing &MutableBacking() { return backing_; }

    // Model type tag, forwarded from the Search policy.
    static const ModelType kModelType = Search::kModelType;

    // Data members.  Declaration order is kept as-is because it fixes
    // construction order: backing_, then vocab_, then search_.
    Backing backing_;

    VocabularyT vocab_;

    // Per-order record types exposed by the Search policy.
    typedef typename Search::Unigram Unigram;
    typedef typename Search::Middle Middle;
    typedef typename Search::Longest Longest;

    Search search_;
};
|
|
|
|
|
|
|
|
} // namespace detail
|
|
|
|
|
|
|
|
// These must also be instantiated in the cc file.
|
2010-10-27 21:50:40 +04:00
|
|
|
// Vocabulary paired with ProbingModel below; alias of ProbingVocabulary.
typedef ::lm::ngram::ProbingVocabulary Vocabulary;
|
2010-10-28 05:05:04 +04:00
|
|
|
// GenericModel over detail::ProbingHashedSearch, using the Vocabulary (ProbingVocabulary) alias.
typedef detail::GenericModel<detail::ProbingHashedSearch, Vocabulary> ProbingModel;
|
|
|
|
// Default implementation. No real reason for it to be the default.
|
|
|
|
// Model names exactly the same type as ProbingModel.
typedef ProbingModel Model;
|
2010-10-27 21:50:40 +04:00
|
|
|
|
|
|
|
// NOTE(review): this aliases ::lm::ngram::SortedVocabulary to the same name inside lm::ngram, so it looks like a no-op — confirm before removing.
typedef ::lm::ngram::SortedVocabulary SortedVocabulary;
|
|
|
|
// GenericModel over detail::SortedHashedSearch, paired with SortedVocabulary.
typedef detail::GenericModel<detail::SortedHashedSearch, SortedVocabulary> SortedModel;
|
2010-09-10 04:36:07 +04:00
|
|
|
|
2010-10-27 21:50:40 +04:00
|
|
|
// GenericModel over trie::TrieSearch; uses the same SortedVocabulary as SortedModel.
typedef detail::GenericModel<trie::TrieSearch, SortedVocabulary> TrieModel;
|
2010-09-10 04:36:07 +04:00
|
|
|
|
|
|
|
} // namespace ngram
|
|
|
|
} // namespace lm
|
|
|
|
|
2010-11-06 03:40:16 +03:00
|
|
|
#endif // LM_MODEL__
|