mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-27 22:14:57 +03:00
More documentation
git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3951 1f5c12ca-751b-0410-a591-d2e778427230
This commit is contained in:
parent
83d02c11c6
commit
3274f72bfb
@ -18,6 +18,12 @@ namespace ngram {
|
||||
|
||||
typedef enum {HASH_PROBING=0, HASH_SORTED=1, TRIE_SORTED=2} ModelType;
|
||||
|
||||
/*Inspect a file to determine if it is a binary lm. If not, return false.
|
||||
* If so, return true and set recognized to the type. This is the only API in
|
||||
* this header designed for use by decoder authors.
|
||||
*/
|
||||
bool RecognizeBinary(const char *file, ModelType &recognized);
|
||||
|
||||
struct FixedWidthParameters {
|
||||
unsigned char order;
|
||||
float probing_multiplier;
|
||||
@ -27,6 +33,7 @@ struct FixedWidthParameters {
|
||||
bool has_vocabulary;
|
||||
};
|
||||
|
||||
// Parameters stored in the header of a binary file.
|
||||
struct Parameters {
|
||||
FixedWidthParameters fixed;
|
||||
std::vector<uint64_t> counts;
|
||||
@ -41,10 +48,13 @@ struct Backing {
|
||||
util::scoped_memory search;
|
||||
};
|
||||
|
||||
// Create just enough of a binary file to write vocabulary to it.
|
||||
uint8_t *SetupJustVocab(const Config &config, uint8_t order, std::size_t memory_size, Backing &backing);
|
||||
// Grow the binary file for the search data structure and set backing.search, returning the memory address where the search data structure should begin.
|
||||
uint8_t *GrowForSearch(const Config &config, std::size_t memory_size, Backing &backing);
|
||||
|
||||
// Write header to binary file. This is done last to prevent incomplete files
|
||||
// from loading.
|
||||
void FinishFile(const Config &config, ModelType model_type, const std::vector<uint64_t> &counts, Backing &backing);
|
||||
|
||||
namespace detail {
|
||||
@ -61,8 +71,6 @@ void ComplainAboutARPA(const Config &config, ModelType model_type);
|
||||
|
||||
} // namespace detail
|
||||
|
||||
bool RecognizeBinary(const char *file, ModelType &recognized);
|
||||
|
||||
template <class To> void LoadLM(const char *file, const Config &config, To &to) {
|
||||
Backing &backing = to.MutableBacking();
|
||||
backing.file.reset(util::OpenReadOrThrow(file));
|
||||
@ -86,7 +94,6 @@ template <class To> void LoadLM(const char *file, const Config &config, To &to)
|
||||
e << " File: " << file;
|
||||
throw;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
} // namespace ngram
|
||||
|
@ -63,7 +63,6 @@ void ShowSizes(const char *file, const lm::ngram::Config &config) {
|
||||
std::cout << "bytes\n"
|
||||
"probing " << std::setw(length) << probing_size << " assuming -p " << config.probing_multiplier << "\n"
|
||||
"trie " << std::setw(length) << TrieModel::Size(counts, config) << "\n";
|
||||
/* "sorted " << std::setw(length) << SortedModel::Size(counts, config) << "\n";*/
|
||||
}
|
||||
|
||||
} // namespace ngram
|
||||
@ -108,8 +107,6 @@ int main(int argc, char *argv[]) {
|
||||
config.write_mmap = argv[optind + 2];
|
||||
if (!strcmp(model_type, "probing")) {
|
||||
ProbingModel(from_file, config);
|
||||
} else if (!strcmp(model_type, "sorted")) {
|
||||
SortedModel(from_file, config);
|
||||
} else if (!strcmp(model_type, "trie")) {
|
||||
TrieModel(from_file, config);
|
||||
} else {
|
||||
|
@ -65,7 +65,7 @@ size_t hash_value(const State &state);
|
||||
namespace detail {
|
||||
|
||||
// Should return the same results as SRI.
|
||||
// Why VocabularyT instead of just Vocabulary? ModelFacade defines Vocabulary.
|
||||
// ModelFacade typedefs Vocabulary so we use VocabularyT to avoid naming conflicts.
|
||||
template <class Search, class VocabularyT> class GenericModel : public base::ModelFacade<GenericModel<Search, VocabularyT>, State, VocabularyT> {
|
||||
private:
|
||||
typedef base::ModelFacade<GenericModel<Search, VocabularyT>, State, VocabularyT> P;
|
||||
@ -75,23 +75,37 @@ template <class Search, class VocabularyT> class GenericModel : public base::Mod
|
||||
// itself.
|
||||
static size_t Size(const std::vector<uint64_t> &counts, const Config &config = Config());
|
||||
|
||||
/* Load the model from a file. It may be an ARPA or binary file. Binary
|
||||
* files must have the format expected by this class or you'll get an
|
||||
* exception. So TrieModel can only load ARPA or binary created by
|
||||
* TrieModel. To classify binary files, call RecognizeBinary in
|
||||
* lm/binary_format.hh.
|
||||
*/
|
||||
GenericModel(const char *file, const Config &config = Config());
|
||||
|
||||
/* Score p(new_word | in_state) and incorporate new_word into out_state.
|
||||
* Note that in_state and out_state must be different references:
|
||||
* &in_state != &out_state.
|
||||
*/
|
||||
FullScoreReturn FullScore(const State &in_state, const WordIndex new_word, State &out_state) const;
|
||||
|
||||
/* Slower call without in_state. Don't use this if you can avoid it. This
|
||||
* is mostly a hack for Hieu to integrate it into Moses which sometimes
|
||||
* forgets LM state (i.e. it doesn't store it with the phrase). Sigh.
|
||||
* The context indices should be in an array.
|
||||
* If context_rbegin != context_rend then *context_rbegin is the word
|
||||
* before new_word.
|
||||
/* Slower call without in_state. Try to remember state, but sometimes it
|
||||
* would cost too much memory or your decoder isn't set up properly.
|
||||
* To use this function, make an array of WordIndex containing the context
|
||||
* vocabulary ids in reverse order. Then, pass the bounds of the array:
|
||||
* [context_rbegin, context_rend). The new_word is not part of the context
|
||||
* array unless you intend to repeat words.
|
||||
*/
|
||||
FullScoreReturn FullScoreForgotState(const WordIndex *context_rbegin, const WordIndex *context_rend, const WordIndex new_word, State &out_state) const;
|
||||
|
||||
/* Get the state for a context. Don't use this if you can avoid it. Use
|
||||
* BeginSentenceState or EmptyContextState and extend from those. If
|
||||
* you're only going to use this state to call FullScore once, use
|
||||
* FullScoreForgotState. */
|
||||
* FullScoreForgotState.
|
||||
* To use this function, make an array of WordIndex containing the context
|
||||
* vocabulary ids in reverse order. Then, pass the bounds of the array:
|
||||
* [context_rbegin, context_rend).
|
||||
*/
|
||||
void GetState(const WordIndex *context_rbegin, const WordIndex *context_rend, State &out_state) const;
|
||||
|
||||
private:
|
||||
@ -131,9 +145,8 @@ typedef detail::GenericModel<detail::ProbingHashedSearch, Vocabulary> ProbingMod
|
||||
// Default implementation. No real reason for it to be the default.
|
||||
typedef ProbingModel Model;
|
||||
|
||||
// Smaller implementation.
|
||||
typedef ::lm::ngram::SortedVocabulary SortedVocabulary;
|
||||
typedef detail::GenericModel<detail::SortedHashedSearch, SortedVocabulary> SortedModel;
|
||||
|
||||
typedef detail::GenericModel<trie::TrieSearch, SortedVocabulary> TrieModel;
|
||||
|
||||
} // namespace ngram
|
||||
|
@ -8,8 +8,27 @@
|
||||
|
||||
namespace lm {
|
||||
|
||||
/* Structure returned by scoring routines: the log10 probability together
 * with the length of the n-gram that was matched. */
|
||||
struct FullScoreReturn {
|
||||
// log10 probability produced by the scoring call.
|
||||
float prob;
|
||||
|
||||
/* The length of the n-gram matched. Do not use this for recombination.
|
||||
* Consider a model containing only the following n-grams:
|
||||
* -1 foo
|
||||
* -3.14 bar
|
||||
* -2.718 baz -5
|
||||
* -6 foo bar
|
||||
*
|
||||
* If you score ``bar'' then ngram_length is 1 and recombination state is the
|
||||
* empty string because bar has zero backoff and does not extend to the
|
||||
* right.
|
||||
* If you score ``foo'' then ngram_length is 1 and recombination state is
|
||||
* ``foo''.
|
||||
*
|
||||
* Ideally, keep output states around and compare them. Failing that,
|
||||
* get out_state.ValidLength() and use that length for recombination.
|
||||
*/
|
||||
unsigned char ngram_length;
|
||||
};
|
||||
|
||||
@ -72,7 +91,8 @@ class Vocabulary {
|
||||
/* There are two ways to access a Model.
|
||||
*
|
||||
*
|
||||
* OPTION 1: Access the Model directly (e.g. lm::ngram::Model in ngram.hh).
|
||||
* OPTION 1: Access the Model directly (e.g. lm::ngram::Model in model.hh).
|
||||
*
|
||||
* Every Model implements the scoring function:
|
||||
* float Score(
|
||||
* const Model::State &in_state,
|
||||
@ -85,6 +105,7 @@ class Vocabulary {
|
||||
* const WordIndex new_word,
|
||||
* Model::State &out_state) const;
|
||||
*
|
||||
*
|
||||
* There are also accessor functions:
|
||||
* const State &BeginSentenceState() const;
|
||||
* const State &NullContextState() const;
|
||||
@ -114,6 +135,7 @@ class Vocabulary {
|
||||
*
|
||||
* All the State objects are POD, so it's ok to use raw memory for storing
|
||||
* State.
|
||||
* in_state and out_state must not have the same address.
|
||||
*/
|
||||
class Model {
|
||||
public:
|
||||
@ -123,8 +145,10 @@ class Model {
|
||||
const void *BeginSentenceMemory() const { return begin_sentence_memory_; }
|
||||
const void *NullContextMemory() const { return null_context_memory_; }
|
||||
|
||||
// Requires in_state != out_state
|
||||
virtual float Score(const void *in_state, const WordIndex new_word, void *out_state) const = 0;
|
||||
|
||||
// Requires in_state != out_state
|
||||
virtual FullScoreReturn FullScore(const void *in_state, const WordIndex new_word, void *out_state) const = 0;
|
||||
|
||||
unsigned char Order() const { return order_; }
|
||||
|
@ -60,6 +60,23 @@
|
||||
|
||||
#ifdef HAVE_ICU
|
||||
#include <unicode/stringpiece.h>
|
||||
#include <unicode/uversion.h>
|
||||
|
||||
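// Old versions of ICU (before 4.4, per the version check below) don't define
// operator== and operator!= for StringPiece, so they are supplied here.
// NOTE(review): the check triggers below 4.4 but the #warning recommends
// ICU >= 4.6 -- confirm which threshold is intended.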
|
||||
#if (U_ICU_VERSION_MAJOR_NUM < 4) || ((U_ICU_VERSION_MAJOR_NUM == 4) && (U_ICU_VERSION_MINOR_NUM < 4))
|
||||
#warning You are using an old version of ICU. Consider upgrading to ICU >= 4.6.
|
||||
// Equality for StringPiece: two pieces compare equal when they have the same
// size and their contents are byte-for-byte identical.
inline bool operator==(const StringPiece& x, const StringPiece& y) {
|
||||
// Different lengths can never match; this also guarantees the memcmp below
// reads exactly x.size() bytes from both operands.
if (x.size() != y.size())
|
||||
return false;
|
||||
|
||||
return std::memcmp(x.data(), y.data(), x.size()) == 0;
|
||||
}
|
||||
|
||||
// Inequality for StringPiece, defined as the negation of operator== so the
// two operators always agree.
inline bool operator!=(const StringPiece& x, const StringPiece& y) {
|
||||
return !(x == y);
|
||||
}
|
||||
#endif // old version of ICU
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
#else
|
||||
|
||||
@ -209,7 +226,7 @@ inline bool operator!=(const StringPiece& x, const StringPiece& y) {
|
||||
return !(x == y);
|
||||
}
|
||||
|
||||
#endif
|
||||
#endif // HAVE_ICU undefined
|
||||
|
||||
inline bool operator<(const StringPiece& x, const StringPiece& y) {
|
||||
const int r = std::memcmp(x.data(), y.data(),
|
||||
|
Loading…
Reference in New Issue
Block a user