add nplm wrapper from Kenneth's kenlm repo

2024-12-27 05:55:02 +03:00 · 2014-07-17 16:10:30 +01:00 · 2014-07-17 16:10:30 +01:00 · 0d8d77e3da
commit 0d8d77e3da
parent 8ef136a57c
3 changed files with 176 additions and 0 deletions
--- a/lm/wrappers/README
+++ b/lm/wrappers/README
@ -0,0 +1,3 @@
 This directory is for wrappers around other people's LMs, presenting an interface similar to KenLM's.  You will need to have their LM installed.
 NPLM is a work in progress.  
--- a/lm/wrappers/nplm.cc
+++ b/lm/wrappers/nplm.cc
@ -0,0 +1,90 @@
 #include "lm/wrappers/nplm.hh"
 #include "util/exception.hh"
 #include "util/file.hh"
 #include <algorithm>
 #include <string.h>
 #include "neuralLM.h"
 namespace lm {
 namespace np {
 Vocabulary::Vocabulary(const nplm::vocabulary &vocab) 
  : base::Vocabulary(vocab.lookup_word("<s>"), vocab.lookup_word("</s>"), vocab.lookup_word("<unk>")),
    vocab_(vocab), null_word_(vocab.lookup_word("<null>")) {}
 Vocabulary::~Vocabulary() {}
 WordIndex Vocabulary::Index(const std::string &str) const {
  return vocab_.lookup_word(str);
 }
 bool Model::Recognize(const std::string &name) {
  try {
    util::scoped_fd file(util::OpenReadOrThrow(name.c_str()));
    char magic_check[16];
    util::ReadOrThrow(file.get(), magic_check, sizeof(magic_check));
    const char nnlm_magic[] = "\\config\nversion ";
    return !memcmp(magic_check, nnlm_magic, 16);
  } catch (const util::Exception &) {
    return false;
  }
 } 
 Model::Model(const std::string &file, std::size_t cache) 
  : base_instance_(new nplm::neuralLM(file)), vocab_(base_instance_->get_vocabulary()), cache_size_(cache) {
  UTIL_THROW_IF(base_instance_->get_order() > NPLM_MAX_ORDER, util::Exception, "This NPLM has order " << (unsigned int)base_instance_->get_order() << " but the KenLM wrapper was compiled with " << NPLM_MAX_ORDER << ".  Change the defintion of NPLM_MAX_ORDER and recompile.");
  // log10 compatible with backoff models.
  base_instance_->set_log_base(10.0);
  State begin_sentence, null_context;
  std::fill(begin_sentence.words, begin_sentence.words + NPLM_MAX_ORDER - 1, base_instance_->lookup_word("<s>"));
  null_word_ = base_instance_->lookup_word("<null>");
  std::fill(null_context.words, null_context.words + NPLM_MAX_ORDER - 1, null_word_);
  Init(begin_sentence, null_context, vocab_, base_instance_->get_order());
 }
 Model::~Model() {}
 FullScoreReturn Model::FullScore(const State &from, const WordIndex new_word, State &out_state) const {
  nplm::neuralLM *lm = backend_.get();
  if (!lm) {
    lm = new nplm::neuralLM(*base_instance_);
    backend_.reset(lm);
    lm->set_cache(cache_size_);
  }
  // State is in natural word order.
  FullScoreReturn ret;
  for (int i = 0; i < lm->get_order() - 1; ++i) {
    lm->staging_ngram()(i) = from.words[i];
  }
  lm->staging_ngram()(lm->get_order() - 1) = new_word;
  ret.prob = lm->lookup_from_staging();
  // Always say full order.
  ret.ngram_length = lm->get_order();
  // Shift everything down by one.
  memcpy(out_state.words, from.words + 1, sizeof(WordIndex) * (lm->get_order() - 2));
  out_state.words[lm->get_order() - 2] = new_word;
  // Fill in trailing words with zeros so state comparison works.
  memset(out_state.words + lm->get_order() - 1, 0, sizeof(WordIndex) * (NPLM_MAX_ORDER - lm->get_order()));
  return ret;
 }
 // TODO: optimize with direct call?
 FullScoreReturn Model::FullScoreForgotState(const WordIndex *context_rbegin, const WordIndex *context_rend, const WordIndex new_word, State &out_state) const {
  // State is in natural word order.  The API here specifies reverse order.
  std::size_t state_length = std::min<std::size_t>(Order() - 1, context_rend - context_rbegin);
  State state;
  // Pad with null words.
  for (lm::WordIndex *i = state.words; i < state.words + Order() - 1 - state_length; ++i) {
    *i = null_word_;
  }
  // Put new words at the end.
  std::reverse_copy(context_rbegin, context_rbegin + state_length, state.words + Order() - 1 - state_length);
  return FullScore(state, new_word, out_state);
 }
 } // namespace np
 } // namespace lm
--- a/lm/wrappers/nplm.hh
+++ b/lm/wrappers/nplm.hh
@ -0,0 +1,83 @@
 #ifndef LM_WRAPPERS_NPLM_H
 #define LM_WRAPPERS_NPLM_H
 #include "lm/facade.hh"
 #include "lm/max_order.hh"
 #include "util/string_piece.hh"
 #include <boost/thread/tss.hpp>
 #include <boost/scoped_ptr.hpp>
 /* Wrapper to NPLM "by Ashish Vaswani, with contributions from David Chiang
 * and Victoria Fossum."  
 * http://nlg.isi.edu/software/nplm/
 */
 namespace nplm {
 class vocabulary;
 class neuralLM;
 } // namespace nplm
 namespace lm {
 namespace np {
 class Vocabulary : public base::Vocabulary {
  public:
    Vocabulary(const nplm::vocabulary &vocab);
    ~Vocabulary();
    WordIndex Index(const std::string &str) const;
    // TODO: lobby them to support StringPiece
    WordIndex Index(const StringPiece &str) const {
      return Index(std::string(str.data(), str.size()));
    }
    lm::WordIndex NullWord() const { return null_word_; }
  private:
    const nplm::vocabulary &vocab_;
    const lm::WordIndex null_word_;
 };
 // Sorry for imposing my limitations on your code.
 #define NPLM_MAX_ORDER 7
 struct State {
  WordIndex words[NPLM_MAX_ORDER - 1];
 };
 class Model : public lm::base::ModelFacade<Model, State, Vocabulary> {
  private:
    typedef lm::base::ModelFacade<Model, State, Vocabulary> P;
  public:
    // Does this look like an NPLM?
    static bool Recognize(const std::string &file);
    explicit Model(const std::string &file, std::size_t cache_size = 1 << 20);
    ~Model();
    FullScoreReturn FullScore(const State &from, const WordIndex new_word, State &out_state) const;
    FullScoreReturn FullScoreForgotState(const WordIndex *context_rbegin, const WordIndex *context_rend, const WordIndex new_word, State &out_state) const;
  private:
    boost::scoped_ptr<nplm::neuralLM> base_instance_;
    mutable boost::thread_specific_ptr<nplm::neuralLM> backend_;
    Vocabulary vocab_;
    lm::WordIndex null_word_;
    const std::size_t cache_size_;
 };
 } // namespace np
 } // namespace lm
 #endif // LM_WRAPPERS_NPLM_H
		`@ -0,0 +1,3 @@`
							`This directory is for wrappers around other people's LMs, presenting an interface similar to KenLM's. You will need to have their LM installed.`

							`NPLM is a work in progress.`