More documentation

git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3951 1f5c12ca-751b-0410-a591-d2e778427230
heafield 2011-04-19 15:17:01 +00:00
parent 83d02c11c6
commit 3274f72bfb
5 changed files with 76 additions and 18 deletions

lm/binary_format.hh View File

@@ -18,6 +18,12 @@ namespace ngram {
typedef enum {HASH_PROBING=0, HASH_SORTED=1, TRIE_SORTED=2} ModelType;
/* Inspect a file to determine if it is a binary lm. If not, return false.
* If so, return true and set recognized to the type. This is the only API in
* this header designed for use by decoder authors.
*/
bool RecognizeBinary(const char *file, ModelType &recognized);
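/* Illustrative sketch (not part of this commit): a decoder author can probe a
 * file and dispatch on the result. The concrete classes live in lm/model.hh;
 * this is a hedged example, not library code.
 *   ModelType type;
 *   if (RecognizeBinary(file, type)) {
 *     // type is HASH_PROBING, HASH_SORTED, or TRIE_SORTED; construct
 *     // ProbingModel, SortedModel, or TrieModel from lm/model.hh accordingly.
 *   } else {
 *     // Not a binary file: treat it as ARPA text, which any model class loads.
 *   }
 */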
struct FixedWidthParameters {
unsigned char order;
float probing_multiplier;
@@ -27,6 +33,7 @@ struct FixedWidthParameters {
bool has_vocabulary;
};
// Parameters stored in the header of a binary file.
struct Parameters {
FixedWidthParameters fixed;
std::vector<uint64_t> counts;
@@ -41,10 +48,13 @@ struct Backing {
util::scoped_memory search;
};
// Create just enough of a binary file to write vocabulary to it.
uint8_t *SetupJustVocab(const Config &config, uint8_t order, std::size_t memory_size, Backing &backing);
// Grow the binary file for the search data structure and set backing.search, returning the memory address where the search data structure should begin.
uint8_t *GrowForSearch(const Config &config, std::size_t memory_size, Backing &backing);
// Write header to binary file. This is done last to prevent incomplete files
// from loading.
void FinishFile(const Config &config, ModelType model_type, const std::vector<uint64_t> &counts, Backing &backing);
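/* The three calls above compose into a write sequence (sketch, assuming the
 * caller knows vocab_bytes and search_bytes for its own data structures):
 *   uint8_t *vocab_start = SetupJustVocab(config, order, vocab_bytes, backing);
 *   // ... write the vocabulary at vocab_start ...
 *   uint8_t *search_start = GrowForSearch(config, search_bytes, backing);
 *   // ... write the search data structure at search_start ...
 *   FinishFile(config, model_type, counts, backing);  // header written last
 */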
namespace detail {
@@ -61,8 +71,6 @@ void ComplainAboutARPA(const Config &config, ModelType model_type);
} // namespace detail
bool RecognizeBinary(const char *file, ModelType &recognized);
template <class To> void LoadLM(const char *file, const Config &config, To &to) {
Backing &backing = to.MutableBacking();
backing.file.reset(util::OpenReadOrThrow(file));
@@ -86,7 +94,6 @@ template <class To> void LoadLM(const char *file, const Config &config, To &to)
e << " File: " << file;
throw;
}
}
} // namespace ngram

lm/build_binary.cc View File

@@ -63,7 +63,6 @@ void ShowSizes(const char *file, const lm::ngram::Config &config) {
std::cout << "bytes\n"
"probing " << std::setw(length) << probing_size << " assuming -p " << config.probing_multiplier << "\n"
"trie " << std::setw(length) << TrieModel::Size(counts, config) << "\n";
/* "sorted " << std::setw(length) << SortedModel::Size(counts, config) << "\n";*/
}
} // namespace ngram
@@ -108,8 +107,6 @@ int main(int argc, char *argv[]) {
config.write_mmap = argv[optind + 2];
if (!strcmp(model_type, "probing")) {
ProbingModel(from_file, config);
} else if (!strcmp(model_type, "sorted")) {
SortedModel(from_file, config);
} else if (!strcmp(model_type, "trie")) {
TrieModel(from_file, config);
} else {

lm/model.hh View File

@@ -65,7 +65,7 @@ size_t hash_value(const State &state);
namespace detail {
// Should return the same results as SRI.
// Why VocabularyT instead of just Vocabulary? ModelFacade defines Vocabulary.
// ModelFacade typedefs Vocabulary so we use VocabularyT to avoid naming conflicts.
template <class Search, class VocabularyT> class GenericModel : public base::ModelFacade<GenericModel<Search, VocabularyT>, State, VocabularyT> {
private:
typedef base::ModelFacade<GenericModel<Search, VocabularyT>, State, VocabularyT> P;
@@ -75,23 +75,37 @@ template <class Search, class VocabularyT> class GenericModel : public base::Mod
// itself.
static size_t Size(const std::vector<uint64_t> &counts, const Config &config = Config());
/* Load the model from a file. It may be an ARPA or binary file. Binary
* files must have the format expected by this class or you'll get an
* exception. So TrieModel can only load ARPA or binary created by
* TrieModel. To classify binary files, call RecognizeBinary in
* lm/binary_format.hh.
*/
GenericModel(const char *file, const Config &config = Config());
/* Score p(new_word | in_state) and incorporate new_word into out_state.
* Note that in_state and out_state must be different references:
* &in_state != &out_state.
*/
FullScoreReturn FullScore(const State &in_state, const WordIndex new_word, State &out_state) const;
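/* Usage sketch (illustrative, not from this commit): score a sentence left to
 * right by threading state through FullScore. GetVocabulary() is the facade's
 * vocabulary accessor; prob is a log10 value.
 *   Model model("file.arpa");
 *   Model::State state(model.BeginSentenceState()), out_state;
 *   float total = 0.0;
 *   // for each word w in the sentence:
 *   total += model.FullScore(state, model.GetVocabulary().Index(w), out_state).prob;
 *   state = out_state;  // distinct references per call, as required above
 */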
/* Slower call without in_state. Don't use this if you can avoid it. This
* is mostly a hack for Hieu to integrate it into Moses which sometimes
* forgets LM state (i.e. it doesn't store it with the phrase). Sigh.
* The context indices should be in an array.
* If context_rbegin != context_rend then *context_rbegin is the word
* before new_word.
/* Slower call without in_state. Try to remember state, but sometimes it
* would cost too much memory or your decoder isn't set up properly.
* To use this function, make an array of WordIndex containing the context
* vocabulary ids in reverse order. Then, pass the bounds of the array:
* [context_rbegin, context_rend). The new_word is not part of the context
* array unless you intend to repeat words.
*/
FullScoreReturn FullScoreForgotState(const WordIndex *context_rbegin, const WordIndex *context_rend, const WordIndex new_word, State &out_state) const;
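/* Worked example (sketch): scoring "fox" after the context "the quick brown",
 * where vocab stands for the model's vocabulary. Reverse order, as described:
 *   WordIndex context[3];
 *   context[0] = vocab.Index("brown");  // word immediately before new_word
 *   context[1] = vocab.Index("quick");
 *   context[2] = vocab.Index("the");
 *   State out_state;
 *   FullScoreReturn ret = model.FullScoreForgotState(
 *       context, context + 3, vocab.Index("fox"), out_state);
 */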
/* Get the state for a context. Don't use this if you can avoid it. Use
* BeginSentenceState or EmptyContextState and extend from those. If
* you're only going to use this state to call FullScore once, use
* FullScoreForgotState. */
* FullScoreForgotState.
* To use this function, make an array of WordIndex containing the context
* vocabulary ids in reverse order. Then, pass the bounds of the array:
* [context_rbegin, context_rend).
*/
void GetState(const WordIndex *context_rbegin, const WordIndex *context_rend, State &out_state) const;
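/* Sketch: recover a State for the context "the quick brown" (same reversed
 * array convention as above), then extend it with ordinary FullScore calls:
 *   WordIndex context[3] = {brown_id, quick_id, the_id};
 *   State state;
 *   model.GetState(context, context + 3, state);
 */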
private:
@@ -131,9 +145,8 @@ typedef detail::GenericModel<detail::ProbingHashedSearch, Vocabulary> ProbingMod
// Default implementation. No real reason for it to be the default.
typedef ProbingModel Model;
// Smaller implementation.
typedef ::lm::ngram::SortedVocabulary SortedVocabulary;
typedef detail::GenericModel<detail::SortedHashedSearch, SortedVocabulary> SortedModel;
typedef detail::GenericModel<trie::TrieSearch, SortedVocabulary> TrieModel;
} // namespace ngram

lm/virtual_interface.hh View File

@@ -8,8 +8,27 @@
namespace lm {
/* Structure returned by scoring routines. */
struct FullScoreReturn {
// log10 probability
float prob;
/* The length of n-gram matched. Do not use this for recombination.
* Consider a model containing only the following n-grams:
* -1 foo
* -3.14 bar
* -2.718 baz -5
* -6 foo bar
*
* If you score ``bar'' then ngram_length is 1 and recombination state is the
* empty string because bar has zero backoff and does not extend to the
* right.
* If you score ``foo'' then ngram_length is 1 and recombination state is
* ``foo''.
*
* Ideally, keep output states around and compare them. Failing that,
* get out_state.ValidLength() and use that length for recombination.
*/
unsigned char ngram_length;
};
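/* Recombination sketch (illustrative, with the State API assumed from the
 * comment above): compare output states directly, or fall back on ValidLength.
 *   unsigned char len = out_state.ValidLength();
 *   // Hypotheses whose states agree (or whose last len words agree) may be
 *   // recombined; ngram_length itself is not safe for this, as noted.
 */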
@@ -72,7 +91,8 @@ class Vocabulary {
/* There are two ways to access a Model.
*
*
* OPTION 1: Access the Model directly (e.g. lm::ngram::Model in ngram.hh).
* OPTION 1: Access the Model directly (e.g. lm::ngram::Model in model.hh).
*
* Every Model implements the scoring function:
* float Score(
* const Model::State &in_state,
@@ -85,6 +105,7 @@ class Vocabulary {
* const WordIndex new_word,
* Model::State &out_state) const;
*
*
* There are also accessor functions:
* const State &BeginSentenceState() const;
* const State &NullContextState() const;
@@ -114,6 +135,7 @@ class Vocabulary {
*
* All the State objects are POD, so it's ok to use raw memory for storing
* State.
* in_state and out_state must not have the same address.
*/
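/* OPTION 1 usage sketch (illustrative; names follow the signatures quoted
 * above, and GetVocabulary() is assumed to be the model's vocabulary accessor):
 *   lm::ngram::Model model("file.arpa");
 *   lm::ngram::Model::State state(model.BeginSentenceState()), out;
 *   float total = 0.0;
 *   // for each word w in the sentence:
 *   total += model.Score(state, model.GetVocabulary().Index(w), out);
 *   state = out;  // cheap copy: State is POD
 */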
class Model {
public:
@@ -123,8 +145,10 @@ class Model {
const void *BeginSentenceMemory() const { return begin_sentence_memory_; }
const void *NullContextMemory() const { return null_context_memory_; }
// Requires in_state != out_state
virtual float Score(const void *in_state, const WordIndex new_word, void *out_state) const = 0;
// Requires in_state != out_state
virtual FullScoreReturn FullScore(const void *in_state, const WordIndex new_word, void *out_state) const = 0;
unsigned char Order() const { return order_; }
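// Sketch for the abstract interface (illustrative): states are opaque blobs,
// so a decoder that only sees lm::base::Model can keep them in raw memory.
// StateSize() is assumed to report the concrete State's size in bytes.
//   std::vector<char> in(model.StateSize()), out(model.StateSize());
//   std::memcpy(&in[0], model.BeginSentenceMemory(), model.StateSize());
//   float logprob = model.Score(&in[0], word_id, &out[0]);  // in != out, per above
//   in.swap(out);  // thread the state forward to the next word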

util/string_piece.hh View File

@@ -60,6 +60,23 @@
#ifdef HAVE_ICU
#include <unicode/stringpiece.h>
#include <unicode/uversion.h>
// Old versions of ICU don't define operator== and operator!=.
#if (U_ICU_VERSION_MAJOR_NUM < 4) || ((U_ICU_VERSION_MAJOR_NUM == 4) && (U_ICU_VERSION_MINOR_NUM < 4))
#warning You are using an old version of ICU. Consider upgrading to ICU >= 4.6.
inline bool operator==(const StringPiece& x, const StringPiece& y) {
if (x.size() != y.size())
return false;
return std::memcmp(x.data(), y.data(), x.size()) == 0;
}
inline bool operator!=(const StringPiece& x, const StringPiece& y) {
return !(x == y);
}
#endif // old version of ICU
U_NAMESPACE_BEGIN
#else
@@ -209,7 +226,7 @@ inline bool operator!=(const StringPiece& x, const StringPiece& y) {
return !(x == y);
}
#endif
#endif // HAVE_ICU undefined
inline bool operator<(const StringPiece& x, const StringPiece& y) {
const int r = std::memcmp(x.data(), y.data(),