More documentation

git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3951 1f5c12ca-751b-0410-a591-d2e778427230
heafield 2011-04-19 15:17:01 +00:00
parent 83d02c11c6
commit 3274f72bfb
5 changed files with 76 additions and 18 deletions

lm/binary_format.hh View File

@@ -18,6 +18,12 @@ namespace ngram {
typedef enum {HASH_PROBING=0, HASH_SORTED=1, TRIE_SORTED=2} ModelType;
/* Inspect a file to determine if it is a binary lm. If not, return false.
* If so, return true and set recognized to the type. This is the only API in
* this header designed for use by decoder authors.
*/
bool RecognizeBinary(const char *file, ModelType &recognized);
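/* Illustrative sketch (not part of this commit): a decoder author can probe a
 * file and dispatch on the result. The concrete classes live in lm/model.hh;
 * this is a hedged example, not library code.
 *   ModelType type;
 *   if (RecognizeBinary(file, type)) {
 *     // type is HASH_PROBING, HASH_SORTED, or TRIE_SORTED; construct
 *     // ProbingModel, SortedModel, or TrieModel from lm/model.hh accordingly.
 *   } else {
 *     // Not a binary file: treat it as ARPA text, which any model class loads.
 *   }
 */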
struct FixedWidthParameters {
unsigned char order;
float probing_multiplier;
@@ -27,6 +33,7 @@ struct FixedWidthParameters {
bool has_vocabulary;
};
// Parameters stored in the header of a binary file.
struct Parameters {
FixedWidthParameters fixed;
std::vector<uint64_t> counts;
@@ -41,10 +48,13 @@ struct Backing {
util::scoped_memory search;
};
// Create just enough of a binary file to write vocabulary to it.
uint8_t *SetupJustVocab(const Config &config, uint8_t order, std::size_t memory_size, Backing &backing);
// Grow the binary file for the search data structure and set backing.search, returning the memory address where the search data structure should begin.
uint8_t *GrowForSearch(const Config &config, std::size_t memory_size, Backing &backing);
// Write header to binary file. This is done last to prevent incomplete files
// from loading.
void FinishFile(const Config &config, ModelType model_type, const std::vector<uint64_t> &counts, Backing &backing);
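/* The three calls above compose into a write sequence (sketch, assuming the
 * caller knows vocab_bytes and search_bytes for its own data structures):
 *   uint8_t *vocab_start = SetupJustVocab(config, order, vocab_bytes, backing);
 *   // ... write the vocabulary at vocab_start ...
 *   uint8_t *search_start = GrowForSearch(config, search_bytes, backing);
 *   // ... write the search data structure at search_start ...
 *   FinishFile(config, model_type, counts, backing);  // header written last
 */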
namespace detail {
@@ -61,8 +71,6 @@ void ComplainAboutARPA(const Config &config, ModelType model_type);
} // namespace detail
bool RecognizeBinary(const char *file, ModelType &recognized);
template <class To> void LoadLM(const char *file, const Config &config, To &to) {
Backing &backing = to.MutableBacking();
backing.file.reset(util::OpenReadOrThrow(file));
@@ -86,7 +94,6 @@ template <class To> void LoadLM(const char *file, const Config &config, To &to)
e << " File: " << file;
throw;
}
}
} // namespace ngram

lm/build_binary.cc View File

@@ -63,7 +63,6 @@ void ShowSizes(const char *file, const lm::ngram::Config &config) {
std::cout << "bytes\n"
"probing " << std::setw(length) << probing_size << " assuming -p " << config.probing_multiplier << "\n"
"trie " << std::setw(length) << TrieModel::Size(counts, config) << "\n";
/* "sorted " << std::setw(length) << SortedModel::Size(counts, config) << "\n";*/
}
} // namespace ngram
@@ -108,8 +107,6 @@ int main(int argc, char *argv[]) {
config.write_mmap = argv[optind + 2];
if (!strcmp(model_type, "probing")) {
ProbingModel(from_file, config);
} else if (!strcmp(model_type, "sorted")) {
SortedModel(from_file, config);
} else if (!strcmp(model_type, "trie")) {
TrieModel(from_file, config);
} else {

lm/model.hh View File

@@ -65,7 +65,7 @@ size_t hash_value(const State &state);
namespace detail {
// Should return the same results as SRI.
// Why VocabularyT instead of just Vocabulary? ModelFacade defines Vocabulary.
// ModelFacade typedefs Vocabulary so we use VocabularyT to avoid naming conflicts.
template <class Search, class VocabularyT> class GenericModel : public base::ModelFacade<GenericModel<Search, VocabularyT>, State, VocabularyT> {
private:
typedef base::ModelFacade<GenericModel<Search, VocabularyT>, State, VocabularyT> P;
@@ -75,23 +75,37 @@ template <class Search, class VocabularyT> class GenericModel : public base::Mod
// itself.
static size_t Size(const std::vector<uint64_t> &counts, const Config &config = Config());
/* Load the model from a file. It may be an ARPA or binary file. Binary
* files must have the format expected by this class or you'll get an
* exception. So TrieModel can only load ARPA or binary created by
* TrieModel. To classify binary files, call RecognizeBinary in
* lm/binary_format.hh.
*/
GenericModel(const char *file, const Config &config = Config());
/* Score p(new_word | in_state) and incorporate new_word into out_state.
* Note that in_state and out_state must be different references:
* &in_state != &out_state.
*/
FullScoreReturn FullScore(const State &in_state, const WordIndex new_word, State &out_state) const;
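/* Usage sketch (illustrative, not from this commit): score a sentence left to
 * right by threading state through FullScore. GetVocabulary() is the facade's
 * vocabulary accessor; prob is a log10 value.
 *   Model model("file.arpa");
 *   Model::State state(model.BeginSentenceState()), out_state;
 *   float total = 0.0;
 *   // for each word w in the sentence:
 *   total += model.FullScore(state, model.GetVocabulary().Index(w), out_state).prob;
 *   state = out_state;  // distinct references per call, as required above
 */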
/* Slower call without in_state. Don't use this if you can avoid it. This
* is mostly a hack for Hieu to integrate it into Moses which sometimes
* forgets LM state (i.e. it doesn't store it with the phrase). Sigh.
* The context indices should be in an array.
* If context_rbegin != context_rend then *context_rbegin is the word
* before new_word.
/* Slower call without in_state. Try to remember state, but sometimes it
* would cost too much memory or your decoder isn't set up properly.
* To use this function, make an array of WordIndex containing the context
* vocabulary ids in reverse order. Then, pass the bounds of the array:
* [context_rbegin, context_rend). The new_word is not part of the context
* array unless you intend to repeat words.
*/
FullScoreReturn FullScoreForgotState(const WordIndex *context_rbegin, const WordIndex *context_rend, const WordIndex new_word, State &out_state) const;
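/* Worked example (sketch): scoring "fox" after the context "the quick brown",
 * where vocab stands for the model's vocabulary. Reverse order, as described:
 *   WordIndex context[3];
 *   context[0] = vocab.Index("brown");  // word immediately before new_word
 *   context[1] = vocab.Index("quick");
 *   context[2] = vocab.Index("the");
 *   State out_state;
 *   FullScoreReturn ret = model.FullScoreForgotState(
 *       context, context + 3, vocab.Index("fox"), out_state);
 */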
/* Get the state for a context. Don't use this if you can avoid it. Use
* BeginSentenceState or EmptyContextState and extend from those. If
* you're only going to use this state to call FullScore once, use
* FullScoreForgotState. */
* FullScoreForgotState.
* To use this function, make an array of WordIndex containing the context
* vocabulary ids in reverse order. Then, pass the bounds of the array:
* [context_rbegin, context_rend).
*/
void GetState(const WordIndex *context_rbegin, const WordIndex *context_rend, State &out_state) const;
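/* Sketch: recover a State for the context "the quick brown" (same reversed
 * array convention as above), then extend it with ordinary FullScore calls:
 *   WordIndex context[3] = {brown_id, quick_id, the_id};
 *   State state;
 *   model.GetState(context, context + 3, state);
 */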
private:
@@ -131,9 +145,8 @@ typedef detail::GenericModel<detail::ProbingHashedSearch, Vocabulary> ProbingMod
// Default implementation. No real reason for it to be the default.
typedef ProbingModel Model;
// Smaller implementation.
typedef ::lm::ngram::SortedVocabulary SortedVocabulary;
typedef detail::GenericModel<detail::SortedHashedSearch, SortedVocabulary> SortedModel;
typedef detail::GenericModel<trie::TrieSearch, SortedVocabulary> TrieModel;
} // namespace ngram

lm/virtual_interface.hh View File

@@ -8,8 +8,27 @@
namespace lm {
/* Structure returned by scoring routines. */
struct FullScoreReturn {
// log10 probability
float prob;
/* The length of n-gram matched. Do not use this for recombination.
* Consider a model containing only the following n-grams:
* -1 foo
* -3.14 bar
* -2.718 baz -5
* -6 foo bar
*
* If you score ``bar'' then ngram_length is 1 and recombination state is the
* empty string because bar has zero backoff and does not extend to the
* right.
* If you score ``foo'' then ngram_length is 1 and recombination state is
* ``foo''.
*
* Ideally, keep output states around and compare them. Failing that,
* get out_state.ValidLength() and use that length for recombination.
*/
unsigned char ngram_length;
};
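/* Recombination sketch (illustrative, with the State API assumed from the
 * comment above): compare output states directly, or fall back on ValidLength.
 *   unsigned char len = out_state.ValidLength();
 *   // Hypotheses whose states agree (or whose last len words agree) may be
 *   // recombined; ngram_length itself is not safe for this, as noted.
 */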
@@ -72,7 +91,8 @@ class Vocabulary {
/* There are two ways to access a Model.
*
*
* OPTION 1: Access the Model directly (e.g. lm::ngram::Model in ngram.hh).
* OPTION 1: Access the Model directly (e.g. lm::ngram::Model in model.hh).
*
* Every Model implements the scoring function:
* float Score(
* const Model::State &in_state,
@@ -85,6 +105,7 @@ class Vocabulary {
* const WordIndex new_word,
* Model::State &out_state) const;
*
*
* There are also accessor functions:
* const State &BeginSentenceState() const;
* const State &NullContextState() const;
@@ -114,6 +135,7 @@ class Vocabulary {
*
* All the State objects are POD, so it's ok to use raw memory for storing
* State.
* in_state and out_state must not have the same address.
*/
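/* OPTION 1 usage sketch (illustrative; names follow the signatures quoted
 * above, and GetVocabulary() is assumed to be the model's vocabulary accessor):
 *   lm::ngram::Model model("file.arpa");
 *   lm::ngram::Model::State state(model.BeginSentenceState()), out;
 *   float total = 0.0;
 *   // for each word w in the sentence:
 *   total += model.Score(state, model.GetVocabulary().Index(w), out);
 *   state = out;  // cheap copy: State is POD
 */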
class Model {
public:
@@ -123,8 +145,10 @@ class Model {
const void *BeginSentenceMemory() const { return begin_sentence_memory_; }
const void *NullContextMemory() const { return null_context_memory_; }
// Requires in_state != out_state
virtual float Score(const void *in_state, const WordIndex new_word, void *out_state) const = 0;
// Requires in_state != out_state
virtual FullScoreReturn FullScore(const void *in_state, const WordIndex new_word, void *out_state) const = 0;
unsigned char Order() const { return order_; }
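// Sketch for the abstract interface (illustrative): states are opaque blobs,
// so a decoder that only sees lm::base::Model can keep them in raw memory.
// StateSize() is assumed to report the concrete State's size in bytes.
//   std::vector<char> in(model.StateSize()), out(model.StateSize());
//   std::memcpy(&in[0], model.BeginSentenceMemory(), model.StateSize());
//   float logprob = model.Score(&in[0], word_id, &out[0]);  // in != out, per above
//   in.swap(out);  // thread the state forward to the next word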

util/string_piece.hh View File

@@ -60,6 +60,23 @@
#ifdef HAVE_ICU
#include <unicode/stringpiece.h>
#include <unicode/uversion.h>
// Old versions of ICU don't define operator== and operator!=.
#if (U_ICU_VERSION_MAJOR_NUM < 4) || ((U_ICU_VERSION_MAJOR_NUM == 4) && (U_ICU_VERSION_MINOR_NUM < 4))
#warning You are using an old version of ICU. Consider upgrading to ICU >= 4.6.
inline bool operator==(const StringPiece& x, const StringPiece& y) {
if (x.size() != y.size())
return false;
return std::memcmp(x.data(), y.data(), x.size()) == 0;
}
inline bool operator!=(const StringPiece& x, const StringPiece& y) {
return !(x == y);
}
#endif // old version of ICU
U_NAMESPACE_BEGIN
#else
@@ -209,7 +226,7 @@ inline bool operator!=(const StringPiece& x, const StringPiece& y) {
return !(x == y);
}
#endif
#endif // HAVE_ICU undefined
inline bool operator<(const StringPiece& x, const StringPiece& y) {
const int r = std::memcmp(x.data(), y.data(),