Allow broken IRST ARPAs to still build but be passive-aggressive about it.

Slight update to SRI wrapper that nobody uses anyway.  



git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3980 1f5c12ca-751b-0410-a591-d2e778427230
heafield 2011-05-17 16:43:05 +00:00
parent 2c9f74ef71
commit 1e05ab182e
13 changed files with 89 additions and 47 deletions
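
In effect, the commit adds a positive_log_probability field to lm::ngram::Config plus a matching -i flag on build_binary. A minimal sketch of opting in from code, under the assumption that lm/model.hh is the public header and with a hypothetical ARPA file name:

#include "lm/model.hh"

int main() {
  lm::ngram::Config config;
  // New in this commit: silently map IRSTLM's buggy positive log probabilities to 0.
  config.positive_log_probability = lm::NOTHING;
  // Pre-existing option, the programmatic equivalent of build_binary -s.
  config.sentence_marker_missing = lm::NOTHING;
  lm::ngram::Model model("buggy_irst.arpa", config);  // hypothetical file name
  return 0;
}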

View File

@@ -8,7 +8,7 @@
set -e
for i in util/{bit_packing,ersatz_progress,exception,file_piece,murmur_hash,scoped,mmap} lm/{binary_format,config,lm_exception,model,read_arpa,search_hashed,search_trie,trie,virtual_interface,vocab}; do
g++ -I. -O3 $CXXFLAGS -c $i.cc -o $i.o
g++ -I. -O3 -DNDEBUG $CXXFLAGS -c $i.cc -o $i.o
done
g++ -I. -O3 $CXXFLAGS lm/build_binary.cc {lm,util}/*.o -lz -o build_binary
g++ -I. -O3 $CXXFLAGS lm/ngram_query.cc {lm,util}/*.o -lz -o query
g++ -I. -O3 -DNDEBUG $CXXFLAGS lm/build_binary.cc {lm,util}/*.o -lz -o build_binary
g++ -I. -O3 -DNDEBUG $CXXFLAGS lm/ngram_query.cc {lm,util}/*.o -lz -o query

View File

@@ -15,10 +15,11 @@ namespace ngram {
namespace {
void Usage(const char *name) {
std::cerr << "Usage: " << name << " [-u log10_unknown_probability] [-s] [-p probing_multiplier] [-t trie_temporary] [-m trie_building_megabytes] [type] input.arpa output.mmap\n\n"
std::cerr << "Usage: " << name << " [-u log10_unknown_probability] [-s] [-i] [-p probing_multiplier] [-t trie_temporary] [-m trie_building_megabytes] [type] input.arpa output.mmap\n\n"
"-u sets the default log10 probability for <unk> if the ARPA file does not have\n"
"one.\n"
"-s allows models to be built even if they do not have <s> and </s>.\n\n"
"-s allows models to be built even if they do not have <s> and </s>.\n"
"-i allows buggy models from IRSTLM by mapping positive log probability to 0.\n"
"type is one of probing, trie, or sorted:\n\n"
"probing uses a probing hash table. It is the fastest but uses the most memory.\n"
"-p sets the space multiplier and must be >1.0. The default is 1.5.\n\n"
@@ -75,7 +76,7 @@ int main(int argc, char *argv[]) {
try {
lm::ngram::Config config;
int opt;
while ((opt = getopt(argc, argv, "su:p:t:m:")) != -1) {
while ((opt = getopt(argc, argv, "siu:p:t:m:")) != -1) {
switch(opt) {
case 'u':
config.unknown_missing_logprob = ParseFloat(optarg);
@@ -90,7 +91,10 @@ int main(int argc, char *argv[]) {
config.building_memory = ParseUInt(optarg) * 1048576;
break;
case 's':
config.sentence_marker_missing = lm::ngram::Config::SILENT;
config.sentence_marker_missing = lm::NOTHING;
break;
case 'i':
config.positive_log_probability = lm::NOTHING;
break;
default:
Usage(argv[0]);
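
Combined with the usage text above, a conversion of a buggy IRSTLM-produced ARPA file would look like this (file names hypothetical; trie is one of the three documented types):

build_binary -i trie buggy_irst.arpa model.mmap

With -i, positive log probabilities are replaced by 0.0 during the build; without it, the default positive_log_probability = THROW_UP aborts with a FormatLoadException.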

View File

@@ -10,6 +10,7 @@ Config::Config() :
enumerate_vocab(NULL),
unknown_missing(COMPLAIN),
sentence_marker_missing(THROW_UP),
positive_log_probability(THROW_UP),
unknown_missing_logprob(-100.0),
probing_multiplier(1.5),
building_memory(1073741824ULL), // 1 GB

View File

@@ -3,6 +3,7 @@
#include <iosfwd>
#include "lm/lm_exception.hh"
#include "util/mmap.hh"
/* Configuration for ngram model. Separate header to reduce pollution. */
@@ -27,13 +28,16 @@ struct Config {
// ONLY EFFECTIVE WHEN READING ARPA
typedef enum {THROW_UP, COMPLAIN, SILENT} WarningAction;
// What to do when <unk> isn't in the provided model.
WarningAction unknown_missing;
// What to do when <s> or </s> is missing from the model.
// If THROW_UP, the exception will be of type util::SpecialWordMissingException.
WarningAction sentence_marker_missing;
// What to do with a positive log probability. For COMPLAIN and NOTHING, map
// to 0.
WarningAction positive_log_probability;
// The probability to substitute for <unk> if it's missing from the model.
// No effect if the model has <unk> or unknown_missing == THROW_UP.
float unknown_missing_logprob;
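
Because THROW_UP stays the default for positive_log_probability, existing callers keep strict behavior unless they opt out. One possible tolerant-retry pattern, sketched under the assumption that lm/model.hh is included and with a hypothetical file name:

#include "lm/model.hh"

void LoadTolerantly() {
  try {
    lm::ngram::Model strict("model.arpa");  // default THROW_UP rejects positive log probs
  } catch (const lm::FormatLoadException &e) {
    lm::ngram::Config config;
    config.positive_log_probability = lm::COMPLAIN;  // warn once, then map to 0
    lm::ngram::Model tolerant("model.arpa", config);
  }
}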

View File

@@ -11,6 +11,8 @@
namespace lm {
typedef enum {THROW_UP, COMPLAIN, NOTHING} WarningAction;
class ConfigException : public util::Exception {
public:
ConfigException() throw();
@@ -45,4 +47,4 @@ class SpecialWordMissingException : public VocabLoadException {
} // namespace lm
#endif // LM_LM_EXCEPTION
#endif // LM_LM_EXCEPTION__

View File

@@ -83,7 +83,7 @@ template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT
}
if (!vocab_.SawUnk()) {
assert(config.unknown_missing != Config::THROW_UP);
assert(config.unknown_missing != THROW_UP);
// Default probabilities for unknown.
search_.unigram.Unknown().backoff = 0.0;
search_.unigram.Unknown().prob = config.unknown_missing_logprob;
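
The assert above holds because the loader has already applied the configured fallback. From the caller's side, that fallback looks roughly like this (sketch; the file name is hypothetical):

#include "lm/model.hh"

int main() {
  lm::ngram::Config config;
  config.unknown_missing = lm::COMPLAIN;    // print a message rather than throw
  config.unknown_missing_logprob = -100.0;  // log10 probability substituted for <unk>
  lm::ngram::Model model("no_unk.arpa", config);  // hypothetical file lacking <unk>
  return 0;
}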

View File

@@ -147,7 +147,6 @@ typedef ProbingModel Model;
// Smaller implementation.
typedef ::lm::ngram::SortedVocabulary SortedVocabulary;
typedef detail::GenericModel<detail::SortedHashedSearch, SortedVocabulary> SortedModel;
typedef detail::GenericModel<trie::TrieSearch, SortedVocabulary> TrieModel;
} // namespace ngram

View File

@@ -3,6 +3,7 @@
#include "lm/blank.hh"
#include <cstdlib>
#include <iostream>
#include <vector>
#include <ctype.h>
@@ -115,4 +116,17 @@ void ReadEnd(util::FilePiece &in) {
} catch (const util::EndOfFileException &e) {}
}
void PositiveProbWarn::Warn(float prob) {
switch (action_) {
case THROW_UP:
UTIL_THROW(FormatLoadException, "Positive log probability " << prob << " in the model. This is a bug in IRSTLM; you can set config.positive_log_probability = NOTHING or pass -i to build_binary to substitute 0.0 for the log probability.");
case COMPLAIN:
std::cerr << "There's a positive log probability " << prob << " in the ARPA file, probably because of a bug in IRSTLM. This and subsequent entries will be mapped to 0 log probability." << std::endl;
action_ = NOTHING;
break;
case NOTHING:
break;
}
}
} // namespace lm
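
The switch implements a complain-once policy: after the first COMPLAIN, action_ drops to NOTHING, so a model with thousands of bad entries prints a single warning. A tiny illustration, assuming lm/read_arpa.hh is included:

lm::PositiveProbWarn warn(lm::COMPLAIN);
warn.Warn(0.1f);  // writes the IRSTLM warning to stderr
warn.Warn(0.2f);  // silent: action_ was downgraded to NOTHING by the first call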

View File

@@ -22,10 +22,26 @@ void ReadEnd(util::FilePiece &in);
extern const bool kARPASpaces[256];
template <class Voc> void Read1Gram(util::FilePiece &f, Voc &vocab, ProbBackoff *unigrams) {
// Positive log probability warning.
class PositiveProbWarn {
public:
PositiveProbWarn() : action_(THROW_UP) {}
explicit PositiveProbWarn(WarningAction action) : action_(action) {}
void Warn(float prob);
private:
WarningAction action_;
};
template <class Voc> void Read1Gram(util::FilePiece &f, Voc &vocab, ProbBackoff *unigrams, PositiveProbWarn &warn) {
try {
float prob = f.ReadFloat();
if (prob > 0) UTIL_THROW(FormatLoadException, "Positive probability " << prob);
if (prob > 0.0) {
warn.Warn(prob);
prob = 0.0;
}
if (f.get() != '\t') UTIL_THROW(FormatLoadException, "Expected tab after probability");
ProbBackoff &value = unigrams[vocab.Insert(f.ReadDelimited(kARPASpaces))];
value.prob = prob;
@@ -36,18 +52,23 @@ template <class Voc> void Read1Gram(util::FilePiece &f, Voc &vocab, ProbBackoff
}
}
template <class Voc> void Read1Grams(util::FilePiece &f, std::size_t count, Voc &vocab, ProbBackoff *unigrams) {
// Reads the unigram section; positive log probabilities are reported through warn and mapped to 0.
template <class Voc> void Read1Grams(util::FilePiece &f, std::size_t count, Voc &vocab, ProbBackoff *unigrams, PositiveProbWarn &warn) {
ReadNGramHeader(f, 1);
for (std::size_t i = 0; i < count; ++i) {
Read1Gram(f, vocab, unigrams);
Read1Gram(f, vocab, unigrams, warn);
}
vocab.FinishedLoading(unigrams);
}
template <class Voc, class Weights> void ReadNGram(util::FilePiece &f, const unsigned char n, const Voc &vocab, WordIndex *const reverse_indices, Weights &weights) {
// Reads a single n-gram entry; positive log probabilities are reported through warn and mapped to 0.
template <class Voc, class Weights> void ReadNGram(util::FilePiece &f, const unsigned char n, const Voc &vocab, WordIndex *const reverse_indices, Weights &weights, PositiveProbWarn &warn) {
try {
weights.prob = f.ReadFloat();
if (weights.prob > 0) UTIL_THROW(FormatLoadException, "Positive probability " << weights.prob);
if (weights.prob > 0.0) {
warn.Warn(weights.prob);
weights.prob = 0.0;
}
for (WordIndex *vocab_out = reverse_indices + n - 1; vocab_out >= reverse_indices; --vocab_out) {
*vocab_out = vocab.Index(f.ReadDelimited(kARPASpaces));
}

View File

@@ -48,7 +48,7 @@ class ActivateUnigram {
ProbBackoff *modify_;
};
template <class Voc, class Store, class Middle, class Activate> void ReadNGrams(util::FilePiece &f, const unsigned int n, const size_t count, const Voc &vocab, std::vector<Middle> &middle, Activate activate, Store &store) {
template <class Voc, class Store, class Middle, class Activate> void ReadNGrams(util::FilePiece &f, const unsigned int n, const size_t count, const Voc &vocab, std::vector<Middle> &middle, Activate activate, Store &store, PositiveProbWarn &warn) {
ReadNGramHeader(f, n);
ProbBackoff blank;
@@ -61,7 +61,7 @@ template <class Voc, class Store, class Middle, class Activate> void ReadNGrams(
typename Store::Packing::Value value;
typename Middle::ConstIterator found;
for (size_t i = 0; i < count; ++i) {
ReadNGram(f, n, vocab, vocab_ids, value);
ReadNGram(f, n, vocab, vocab_ids, value, warn);
keys[0] = detail::CombineWordHash(static_cast<uint64_t>(*vocab_ids), vocab_ids[1]);
for (unsigned int h = 1; h < n - 1; ++h) {
keys[h] = detail::CombineWordHash(keys[h-1], vocab_ids[h+1]);
@@ -85,20 +85,22 @@ template <class MiddleT, class LongestT> template <class Voc> void TemplateHashe
// TODO: fix sorted.
SetupMemory(GrowForSearch(config, Size(counts, config), backing), counts, config);
Read1Grams(f, counts[0], vocab, unigram.Raw());
PositiveProbWarn warn(config.positive_log_probability);
Read1Grams(f, counts[0], vocab, unigram.Raw(), warn);
CheckSpecials(config, vocab);
try {
if (counts.size() > 2) {
ReadNGrams(f, 2, counts[1], vocab, middle, ActivateUnigram(unigram.Raw()), middle[0]);
ReadNGrams(f, 2, counts[1], vocab, middle, ActivateUnigram(unigram.Raw()), middle[0], warn);
}
for (unsigned int n = 3; n < counts.size(); ++n) {
ReadNGrams(f, n, counts[n-1], vocab, middle, ActivateLowerMiddle<Middle>(middle[n-3]), middle[n-2]);
ReadNGrams(f, n, counts[n-1], vocab, middle, ActivateLowerMiddle<Middle>(middle[n-3]), middle[n-2], warn);
}
if (counts.size() > 2) {
ReadNGrams(f, counts.size(), counts[counts.size() - 1], vocab, middle, ActivateLowerMiddle<Middle>(middle.back()), longest);
ReadNGrams(f, counts.size(), counts[counts.size() - 1], vocab, middle, ActivateLowerMiddle<Middle>(middle.back()), longest, warn);
} else {
ReadNGrams(f, counts.size(), counts[counts.size() - 1], vocab, middle, ActivateUnigram(unigram.Raw()), longest);
ReadNGrams(f, counts.size(), counts[counts.size() - 1], vocab, middle, ActivateUnigram(unigram.Raw()), longest, warn);
}
} catch (util::ProbingSizeException &e) {
UTIL_THROW(util::ProbingSizeException, "Avoid pruning n-grams like \"bar baz quux\" when \"foo bar baz quux\" is still in the model. KenLM will work when this pruning happens, but the probing model assumes these events are rare enough that using blank space in the probing hash table will cover all of them. Increase probing_multiplier (-p to build_binary) to add more blank spaces.\n");
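
The exception text points at probing_multiplier; the programmatic equivalent of -p would be along these lines (sketch; file name hypothetical):

lm::ngram::Config config;
config.probing_multiplier = 2.0;  // more blank space in the probing table; must be > 1.0
lm::ngram::Model model("pruned.arpa", config);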

View File

@@ -480,7 +480,7 @@ void MergeContextFiles(const std::string &first_base, const std::string &second_
CopyRestOrThrow(remaining.GetFile(), out.get());
}
void ConvertToSorted(util::FilePiece &f, const SortedVocabulary &vocab, const std::vector<uint64_t> &counts, util::scoped_memory &mem, const std::string &file_prefix, unsigned char order) {
void ConvertToSorted(util::FilePiece &f, const SortedVocabulary &vocab, const std::vector<uint64_t> &counts, util::scoped_memory &mem, const std::string &file_prefix, unsigned char order, PositiveProbWarn &warn) {
ReadNGramHeader(f, order);
const size_t count = counts[order - 1];
// Size of weights. Does it include backoff?
@@ -495,11 +495,11 @@ void ConvertToSorted(util::FilePiece &f, const SortedVocabulary &vocab, const st
uint8_t *out_end = out + std::min(count - done, batch_size) * entry_size;
if (order == counts.size()) {
for (; out != out_end; out += entry_size) {
ReadNGram(f, order, vocab, reinterpret_cast<WordIndex*>(out), *reinterpret_cast<Prob*>(out + words_size));
ReadNGram(f, order, vocab, reinterpret_cast<WordIndex*>(out), *reinterpret_cast<Prob*>(out + words_size), warn);
}
} else {
for (; out != out_end; out += entry_size) {
ReadNGram(f, order, vocab, reinterpret_cast<WordIndex*>(out), *reinterpret_cast<ProbBackoff*>(out + words_size));
ReadNGram(f, order, vocab, reinterpret_cast<WordIndex*>(out), *reinterpret_cast<ProbBackoff*>(out + words_size), warn);
}
}
// Sort full records by full n-gram.
@@ -536,13 +536,14 @@ void ConvertToSorted(util::FilePiece &f, const SortedVocabulary &vocab, const st
}
void ARPAToSortedFiles(const Config &config, util::FilePiece &f, std::vector<uint64_t> &counts, size_t buffer, const std::string &file_prefix, SortedVocabulary &vocab) {
PositiveProbWarn warn(config.positive_log_probability);
{
std::string unigram_name = file_prefix + "unigrams";
util::scoped_fd unigram_file;
// In case <unk> appears.
size_t extra_count = counts[0] + 1;
util::scoped_mmap unigram_mmap(util::MapZeroedWrite(unigram_name.c_str(), extra_count * sizeof(ProbBackoff), unigram_file), extra_count * sizeof(ProbBackoff));
Read1Grams(f, counts[0], vocab, reinterpret_cast<ProbBackoff*>(unigram_mmap.get()));
Read1Grams(f, counts[0], vocab, reinterpret_cast<ProbBackoff*>(unigram_mmap.get()), warn);
CheckSpecials(config, vocab);
if (!vocab.SawUnk()) ++counts[0];
}
@@ -560,7 +561,7 @@ void ARPAToSortedFiles(const Config &config, util::FilePiece &f, std::vector<uin
if (!mem.get()) UTIL_THROW(util::ErrnoException, "malloc failed for sort buffer size " << buffer);
for (unsigned char order = 2; order <= counts.size(); ++order) {
ConvertToSorted(f, vocab, counts, mem, file_prefix, order);
ConvertToSorted(f, vocab, counts, mem, file_prefix, order, warn);
}
ReadEnd(f);
}
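
For this trie build path, sort-buffer memory also comes from Config. A sketch using the fields shown in the getopt cases earlier (file name hypothetical):

lm::ngram::Config config;
config.building_memory = 512 * 1048576;  // equivalent of build_binary -m 512
lm::ngram::TrieModel model("big.arpa", config);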

View File

@@ -93,18 +93,12 @@ FullScoreReturn Model::FullScore(const State &in_state, const WordIndex new_word
const_history = local_history;
}
FullScoreReturn ret;
if (new_word != not_found_) {
ret.ngram_length = MatchedLength(*sri_, new_word, const_history);
out_state.history_[0] = new_word;
out_state.valid_length_ = std::min<unsigned char>(ret.ngram_length, Order() - 1);
std::copy(const_history, const_history + out_state.valid_length_ - 1, out_state.history_ + 1);
if (out_state.valid_length_ < kMaxOrder - 1) {
out_state.history_[out_state.valid_length_] = Vocab_None;
}
} else {
ret.ngram_length = 0;
if (kMaxOrder > 1) out_state.history_[0] = Vocab_None;
out_state.valid_length_ = 0;
ret.ngram_length = MatchedLength(*sri_, new_word, const_history);
out_state.history_[0] = new_word;
out_state.valid_length_ = std::min<unsigned char>(ret.ngram_length, Order() - 1);
std::copy(const_history, const_history + out_state.valid_length_ - 1, out_state.history_ + 1);
if (out_state.valid_length_ < kMaxOrder - 1) {
out_state.history_[out_state.valid_length_] = Vocab_None;
}
ret.prob = sri_->wordProb(new_word, const_history);
return ret;

View File

@@ -189,24 +189,24 @@ void ProbingVocabulary::LoadedBinary(int fd, EnumerateVocab *to) {
void MissingUnknown(const Config &config) throw(SpecialWordMissingException) {
switch(config.unknown_missing) {
case Config::SILENT:
case NOTHING:
return;
case Config::COMPLAIN:
case COMPLAIN:
if (config.messages) *config.messages << "The ARPA file is missing <unk>. Substituting log10 probability " << config.unknown_missing_logprob << "." << std::endl;
break;
case Config::THROW_UP:
case THROW_UP:
UTIL_THROW(SpecialWordMissingException, "The ARPA file is missing <unk> and the model is configured to throw an exception.");
}
}
void MissingSentenceMarker(const Config &config, const char *str) throw(SpecialWordMissingException) {
switch (config.sentence_marker_missing) {
case Config::SILENT:
case NOTHING:
return;
case Config::COMPLAIN:
case COMPLAIN:
if (config.messages) *config.messages << "Missing special word " << str << "; will treat it as <unk>.";
break;
case Config::THROW_UP:
case THROW_UP:
UTIL_THROW(SpecialWordMissingException, "The ARPA file is missing " << str << " and the model is configured to reject these models. Run build_binary -s to disable this check.");
}
}
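
Both functions route their COMPLAIN-level diagnostics through config.messages and skip them when it is NULL, so callers can redirect or silence loader chatter. A sketch with a hypothetical log file and model path:

#include <fstream>
#include "lm/model.hh"

int main() {
  std::ofstream log_stream("load.log");
  lm::ngram::Config config;
  config.messages = &log_stream;  // COMPLAIN diagnostics go to load.log; NULL would silence them
  lm::ngram::Model model("model.arpa", config);
  return 0;
}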