mosesdecoder/lm/ngram_query.hh

#ifndef LM_NGRAM_QUERY__
#define LM_NGRAM_QUERY__

#include "lm/enumerate_vocab.hh"
#include "lm/model.hh"
#include "util/usage.hh"

#include <cstdlib>
#include <iostream>
#include <ostream>
#include <istream>
#include <string>

#include <math.h>

namespace lm {
namespace ngram {

/* Score whitespace-separated text from in_stream with model and write a
 * report to out_stream.
 *
 * Input is processed one line at a time.  For every word the function writes
 *   word=<vocab index> <matched n-gram length> <prob><TAB>
 * and, once the line is finished,
 *   Total: <sum of prob over the line> OOV: <count><NEWLINE>
 * After all input has been consumed a single "Perplexity <value>" line is
 * written and the stream is flushed.
 *
 * model            Language model; must provide BeginSentenceState(),
 *                  NullContextState(), GetVocabulary() and FullScore().
 * sentence_context When true every line starts from the begin-of-sentence
 *                  state and an end-of-sentence token is scored (and counted
 *                  as a token) after the last word.  When false every line
 *                  starts from the null-context state and nothing is appended.
 * in_stream        Text to score.
 * out_stream       Destination of the report.
 *
 * Notes:
 *  - A word whose vocabulary index is 0 is counted as out-of-vocabulary.
 *  - Words are read with operator>>, which (with the default skipws flag)
 *    skips leading whitespace including newlines, so blank input lines do not
 *    produce a "Total:" line of their own.
 *  - The perplexity formula treats prob as a base-10 log probability.  If no
 *    token was scored at all the average is 0/0, so the printed value is NaN.
 */
template <class Model> void Query(const Model &model, bool sentence_context, std::istream &in_stream, std::ostream &out_stream) {
  typename Model::State state, out;
  lm::FullScoreReturn ret;
  std::string word;

  // Corpus-wide accumulators feeding the final perplexity figure.
  double corpus_total = 0.0;
  uint64_t corpus_tokens = 0;

  while (in_stream) {
    // Every line is scored independently of the previous one.
    state = sentence_context ? model.BeginSentenceState() : model.NullContextState();
    float total = 0.0;
    bool got = false;
    uint64_t oov = 0;
    while (in_stream >> word) {
      got = true;
      lm::WordIndex vocab = model.GetVocabulary().Index(word);
      if (vocab == 0) ++oov;
      ret = model.FullScore(state, vocab, out);
      total += ret.prob;
      out_stream << word << '=' << vocab << ' ' << static_cast<unsigned int>(ret.ngram_length) << ' ' << ret.prob << '\t';
      ++corpus_tokens;
      state = out;

      // Consume the whitespace that follows the word so that a newline can be
      // recognised as the end of the current line before operator>> silently
      // skips over it.
      bool end_of_line = false;
      while (true) {
        // Keep the result as int: get() yields either EOF or the byte as an
        // unsigned char value, which is exactly the domain isspace() accepts.
        // Narrowing to a (possibly signed) char first would hand isspace() a
        // negative value for bytes >= 0x80, which is undefined behaviour.
        const int c = in_stream.get();
        if (!in_stream) break;
        if (c == '\n') {
          end_of_line = true;
          break;
        }
        if (!isspace(c)) {
          // First character of the next word: put it back for operator>>.
          in_stream.unget();
          break;
        }
      }
      if (end_of_line) break;
    }
    // Nothing was read and the stream is exhausted: no line left to report.
    if (!got && !in_stream) break;
    if (sentence_context) {
      ret = model.FullScore(state, model.GetVocabulary().EndSentence(), out);
      total += ret.prob;
      ++corpus_tokens;
      out_stream << "</s>=" << model.GetVocabulary().EndSentence() << ' ' << static_cast<unsigned int>(ret.ngram_length) << ' ' << ret.prob << '\t';
    }
    out_stream << "Total: " << total << " OOV: " << oov << '\n';
    corpus_total += total;
  }
  out_stream << "Perplexity " << pow(10.0, -(corpus_total / static_cast<double>(corpus_tokens))) << std::endl;
}

template <class M> void Query(const char *file, bool sentence_context, std::istream &in_stream, std::ostream &out_stream) {
  Config config;
  M model(file, config);
  Query(model, sentence_context, in_stream, out_stream);
}

} // namespace ngram
} // namespace lm

#endif // LM_NGRAM_QUERY__
move some functions to .hh file for easier integration with other products 2011-12-20 13:44:45 +04:00			`#ifndef LM_NGRAM_QUERY__`
			`#define LM_NGRAM_QUERY__`

			`#include "lm/enumerate_vocab.hh"`
			`#include "lm/model.hh"`
KenLM e3b5c55910 including rest costs for probing 2012-06-28 18:58:59 +04:00			`#include "util/usage.hh"`
move some functions to .hh file for easier integration with other products 2011-12-20 13:44:45 +04:00
			`#include <cstdlib>`
			`#include <iostream>`
KenLM e3b5c55910 including rest costs for probing 2012-06-28 18:58:59 +04:00			`#include <ostream>`
			`#include <istream>`
move some functions to .hh file for easier integration with other products 2011-12-20 13:44:45 +04:00			`#include <string>`

KenLM 85c82bd, revamp Moses timer to have more precision 2014-01-02 01:19:06 +04:00			`#include <math.h>`

Make lm constructor explicit 2012-01-14 21:07:31 +04:00			`namespace lm {`
			`namespace ngram {`

			`template <class Model> void Query(const Model &model, bool sentence_context, std::istream &in_stream, std::ostream &out_stream) {`
move some functions to .hh file for easier integration with other products 2011-12-20 13:44:45 +04:00			`typename Model::State state, out;`
			`lm::FullScoreReturn ret;`
			`std::string word;`

KenLM 85c82bd, revamp Moses timer to have more precision 2014-01-02 01:19:06 +04:00			`double corpus_total = 0.0;`
			`uint64_t corpus_oov = 0;`
			`uint64_t corpus_tokens = 0;`

Make lm constructor explicit 2012-01-14 21:07:31 +04:00			`while (in_stream) {`
move some functions to .hh file for easier integration with other products 2011-12-20 13:44:45 +04:00			`state = sentence_context ? model.BeginSentenceState() : model.NullContextState();`
			`float total = 0.0;`
			`bool got = false;`
KenLM 85c82bd, revamp Moses timer to have more precision 2014-01-02 01:19:06 +04:00			`uint64_t oov = 0;`
Make lm constructor explicit 2012-01-14 21:07:31 +04:00			`while (in_stream >> word) {`
move some functions to .hh file for easier integration with other products 2011-12-20 13:44:45 +04:00			`got = true;`
			`lm::WordIndex vocab = model.GetVocabulary().Index(word);`
			`if (vocab == 0) ++oov;`
			`ret = model.FullScore(state, vocab, out);`
			`total += ret.prob;`
Make lm constructor explicit 2012-01-14 21:07:31 +04:00			`out_stream << word << '=' << vocab << ' ' << static_cast<unsigned int>(ret.ngram_length) << ' ' << ret.prob << '\t';`
KenLM 85c82bd, revamp Moses timer to have more precision 2014-01-02 01:19:06 +04:00			`++corpus_tokens;`
move some functions to .hh file for easier integration with other products 2011-12-20 13:44:45 +04:00			`state = out;`
			`char c;`
			`while (true) {`
Make lm constructor explicit 2012-01-14 21:07:31 +04:00			`c = in_stream.get();`
			`if (!in_stream) break;`
move some functions to .hh file for easier integration with other products 2011-12-20 13:44:45 +04:00			`if (c == '\n') break;`
			`if (!isspace(c)) {`
Make lm constructor explicit 2012-01-14 21:07:31 +04:00			`in_stream.unget();`
move some functions to .hh file for easier integration with other products 2011-12-20 13:44:45 +04:00			`break;`
			`}`
			`}`
			`if (c == '\n') break;`
			`}`
Make lm constructor explicit 2012-01-14 21:07:31 +04:00			`if (!got && !in_stream) break;`
move some functions to .hh file for easier integration with other products 2011-12-20 13:44:45 +04:00			`if (sentence_context) {`
			`ret = model.FullScore(state, model.GetVocabulary().EndSentence(), out);`
			`total += ret.prob;`
KenLM 85c82bd, revamp Moses timer to have more precision 2014-01-02 01:19:06 +04:00			`++corpus_tokens;`
Make lm constructor explicit 2012-01-14 21:07:31 +04:00			`out_stream << "</s>=" << model.GetVocabulary().EndSentence() << ' ' << static_cast<unsigned int>(ret.ngram_length) << ' ' << ret.prob << '\t';`
move some functions to .hh file for easier integration with other products 2011-12-20 13:44:45 +04:00			`}`
Make lm constructor explicit 2012-01-14 21:07:31 +04:00			`out_stream << "Total: " << total << " OOV: " << oov << '\n';`
KenLM 85c82bd, revamp Moses timer to have more precision 2014-01-02 01:19:06 +04:00			`corpus_total += total;`
			`corpus_oov += oov;`
KenLM e3b5c55910 including rest costs for probing 2012-06-28 18:58:59 +04:00			`}`
KenLM 85c82bd, revamp Moses timer to have more precision 2014-01-02 01:19:06 +04:00			`out_stream << "Perplexity " << pow(10.0, -(corpus_total / static_cast<double>(corpus_tokens))) << std::endl;`
move some functions to .hh file for easier integration with other products 2011-12-20 13:44:45 +04:00			`}`

Make lm constructor explicit 2012-01-14 21:07:31 +04:00			`template <class M> void Query(const char *file, bool sentence_context, std::istream &in_stream, std::ostream &out_stream) {`
			`Config config;`
			`M model(file, config);`
			`Query(model, sentence_context, in_stream, out_stream);`
			`}`

			`} // namespace ngram`
			`} // namespace lm`
move some functions to .hh file for easier integration with other products 2011-12-20 13:44:45 +04:00
			`#endif // LM_NGRAM_QUERY__`