mosesdecoder/lm/kenlm_benchmark_main.cc

#include "lm/model.hh"
#include "util/file_stream.hh"
#include "util/file.hh"
#include "util/file_piece.hh"
#include "util/usage.hh"

#include <stdint.h>

namespace {

template <class Model, class Width> void ConvertToBytes(const Model &model, int fd_in) {
  util::FilePiece in(fd_in);
  util::FileStream out(1);
  Width width;
  StringPiece word;
  const Width end_sentence = (Width)model.GetVocabulary().EndSentence();
  while (true) {
    while (in.ReadWordSameLine(word)) {
      width = (Width)model.GetVocabulary().Index(word);
      out.write(&width, sizeof(Width));
    }
    if (!in.ReadLineOrEOF(word)) break;
    out.write(&end_sentence, sizeof(Width));
  }
}

template <class Model, class Width> void QueryFromBytes(const Model &model, int fd_in) {
  lm::ngram::State state[3];
  const lm::ngram::State *const begin_state = &model.BeginSentenceState();
  const lm::ngram::State *next_state = begin_state;
  Width kEOS = model.GetVocabulary().EndSentence();
  Width buf[4096];

  uint64_t completed = 0;
  double loaded = util::CPUTime();

  std::cout << "CPU_to_load: " << loaded << std::endl;

  // Numerical precision: batch sums.
  double total = 0.0;
  while (std::size_t got = util::ReadOrEOF(fd_in, buf, sizeof(buf))) {
    float sum = 0.0;
    UTIL_THROW_IF2(got % sizeof(Width), "File size not a multiple of vocab id size " << sizeof(Width));
    got /= sizeof(Width);
    completed += got;
    // Do even stuff first.
    const Width *even_end = buf + (got & ~1);
    // Alternating states
    const Width *i;
    for (i = buf; i != even_end;) {
      sum += model.FullScore(*next_state, *i, state[1]).prob;
      next_state = (*i++ == kEOS) ? begin_state : &state[1];
      sum += model.FullScore(*next_state, *i, state[0]).prob;
      next_state = (*i++ == kEOS) ? begin_state : &state[0];
    }
    // Odd corner case.
    if (got & 1) {
      sum += model.FullScore(*next_state, *i, state[2]).prob;
      next_state = (*i++ == kEOS) ? begin_state : &state[2];
    }
    total += sum;
  }
  double after = util::CPUTime();
  std::cerr << "Probability sum is " << total << std::endl;
  std::cout << "Queries: " << completed << std::endl;
  std::cout << "CPU_excluding_load: " << (after - loaded) << "\nCPU_per_query: " << ((after - loaded) / static_cast<double>(completed)) << std::endl;
  std::cout << "RSSMax: " << util::RSSMax() << std::endl;
}

template <class Model, class Width> void DispatchFunction(const Model &model, bool query) {
  if (query) {
    QueryFromBytes<Model, Width>(model, 0);
  } else {
    ConvertToBytes<Model, Width>(model, 0);
  }
}

template <class Model> void DispatchWidth(const char *file, bool query) {
  lm::ngram::Config config;
  config.load_method = util::READ;
  std::cerr << "Using load_method = READ." << std::endl;
  Model model(file, config);
  lm::WordIndex bound = model.GetVocabulary().Bound();
  if (bound <= 256) {
    DispatchFunction<Model, uint8_t>(model, query);
  } else if (bound <= 65536) {
    DispatchFunction<Model, uint16_t>(model, query);
  } else if (bound <= (1ULL << 32)) {
    DispatchFunction<Model, uint32_t>(model, query);
  } else {
    DispatchFunction<Model, uint64_t>(model, query);
  }
}

void Dispatch(const char *file, bool query) {
  using namespace lm::ngram;
  lm::ngram::ModelType model_type;
  if (lm::ngram::RecognizeBinary(file, model_type)) {
    switch(model_type) {
      case PROBING:
        DispatchWidth<lm::ngram::ProbingModel>(file, query);
        break;
      case REST_PROBING:
        DispatchWidth<lm::ngram::RestProbingModel>(file, query);
        break;
      case TRIE:
        DispatchWidth<lm::ngram::TrieModel>(file, query);
        break;
      case QUANT_TRIE:
        DispatchWidth<lm::ngram::QuantTrieModel>(file, query);
        break;
      case ARRAY_TRIE:
        DispatchWidth<lm::ngram::ArrayTrieModel>(file, query);
        break;
      case QUANT_ARRAY_TRIE:
        DispatchWidth<lm::ngram::QuantArrayTrieModel>(file, query);
        break;
      default:
        UTIL_THROW(util::Exception, "Unrecognized kenlm model type " << model_type);
    }
  } else {
    UTIL_THROW(util::Exception, "Binarize before running benchmarks.");
  }
}

} // namespace

int main(int argc, char *argv[]) {
  if (argc != 3 || (strcmp(argv[1], "vocab") && strcmp(argv[1], "query"))) {
    std::cerr
      << "Benchmark program for KenLM.  Intended usage:\n"
      << "#Convert text to vocabulary ids offline.  These ids are tied to a model.\n"
      << argv[0] << " vocab $model <$text >$text.vocab\n"
      << "#Ensure files are in RAM.\n"
      << "cat $text.vocab $model >/dev/null\n"
      << "#Timed query against the model.\n"
      << argv[0] << " query $model <$text.vocab\n";
    return 1;
  }
  Dispatch(argv[2], !strcmp(argv[1], "query"));
  return 0;
}
KenLM 7408730be415db9b650560a8b2bd3e4e3af49ec9. unistd.hh is dead. 2015-05-19 22:27:30 +03:00			`#include "lm/model.hh"`
KenLM a590a3a4dadf516a1cff28c8f1c06aa89766f519 including StringStream TODO: kill istream 2015-09-29 18:58:02 +03:00			`#include "util/file_stream.hh"`
KenLM 7408730be415db9b650560a8b2bd3e4e3af49ec9. unistd.hh is dead. 2015-05-19 22:27:30 +03:00			`#include "util/file.hh"`
			`#include "util/file_piece.hh"`
			`#include "util/usage.hh"`

			`#include <stdint.h>`

			`namespace {`

			`template <class Model, class Width> void ConvertToBytes(const Model &model, int fd_in) {`
			`util::FilePiece in(fd_in);`
KenLM a590a3a4dadf516a1cff28c8f1c06aa89766f519 including StringStream TODO: kill istream 2015-09-29 18:58:02 +03:00			`util::FileStream out(1);`
KenLM 7408730be415db9b650560a8b2bd3e4e3af49ec9. unistd.hh is dead. 2015-05-19 22:27:30 +03:00			`Width width;`
			`StringPiece word;`
			`const Width end_sentence = (Width)model.GetVocabulary().EndSentence();`
			`while (true) {`
			`while (in.ReadWordSameLine(word)) {`
			`width = (Width)model.GetVocabulary().Index(word);`
			`out.write(&width, sizeof(Width));`
			`}`
			`if (!in.ReadLineOrEOF(word)) break;`
			`out.write(&end_sentence, sizeof(Width));`
			`}`
			`}`

			`template <class Model, class Width> void QueryFromBytes(const Model &model, int fd_in) {`
			`lm::ngram::State state[3];`
			`const lm::ngram::State *const begin_state = &model.BeginSentenceState();`
			`const lm::ngram::State *next_state = begin_state;`
			`Width kEOS = model.GetVocabulary().EndSentence();`
			`Width buf[4096];`
KenLM a590a3a4dadf516a1cff28c8f1c06aa89766f519 including StringStream TODO: kill istream 2015-09-29 18:58:02 +03:00
			`uint64_t completed = 0;`
			`double loaded = util::CPUTime();`

			`std::cout << "CPU_to_load: " << loaded << std::endl;`

			`// Numerical precision: batch sums.`
			`double total = 0.0;`
Update kenlm 2015-09-10 18:04:09 +03:00			`while (std::size_t got = util::ReadOrEOF(fd_in, buf, sizeof(buf))) {`
KenLM a590a3a4dadf516a1cff28c8f1c06aa89766f519 including StringStream TODO: kill istream 2015-09-29 18:58:02 +03:00			`float sum = 0.0;`
KenLM 7408730be415db9b650560a8b2bd3e4e3af49ec9. unistd.hh is dead. 2015-05-19 22:27:30 +03:00			`UTIL_THROW_IF2(got % sizeof(Width), "File size not a multiple of vocab id size " << sizeof(Width));`
			`got /= sizeof(Width);`
KenLM a590a3a4dadf516a1cff28c8f1c06aa89766f519 including StringStream TODO: kill istream 2015-09-29 18:58:02 +03:00			`completed += got;`
KenLM 7408730be415db9b650560a8b2bd3e4e3af49ec9. unistd.hh is dead. 2015-05-19 22:27:30 +03:00			`// Do even stuff first.`
			`const Width *even_end = buf + (got & ~1);`
			`// Alternating states`
			`const Width *i;`
			`for (i = buf; i != even_end;) {`
			`sum += model.FullScore(next_state, i, state[1]).prob;`
			`next_state = (*i++ == kEOS) ? begin_state : &state[1];`
			`sum += model.FullScore(next_state, i, state[0]).prob;`
			`next_state = (*i++ == kEOS) ? begin_state : &state[0];`
			`}`
			`// Odd corner case.`
			`if (got & 1) {`
			`sum += model.FullScore(next_state, i, state[2]).prob;`
			`next_state = (*i++ == kEOS) ? begin_state : &state[2];`
			`}`
KenLM a590a3a4dadf516a1cff28c8f1c06aa89766f519 including StringStream TODO: kill istream 2015-09-29 18:58:02 +03:00			`total += sum;`
KenLM 7408730be415db9b650560a8b2bd3e4e3af49ec9. unistd.hh is dead. 2015-05-19 22:27:30 +03:00			`}`
KenLM a590a3a4dadf516a1cff28c8f1c06aa89766f519 including StringStream TODO: kill istream 2015-09-29 18:58:02 +03:00			`double after = util::CPUTime();`
			`std::cerr << "Probability sum is " << total << std::endl;`
			`std::cout << "Queries: " << completed << std::endl;`
			`std::cout << "CPU_excluding_load: " << (after - loaded) << "\nCPU_per_query: " << ((after - loaded) / static_cast<double>(completed)) << std::endl;`
			`std::cout << "RSSMax: " << util::RSSMax() << std::endl;`
KenLM 7408730be415db9b650560a8b2bd3e4e3af49ec9. unistd.hh is dead. 2015-05-19 22:27:30 +03:00			`}`

			`template <class Model, class Width> void DispatchFunction(const Model &model, bool query) {`
			`if (query) {`
			`QueryFromBytes<Model, Width>(model, 0);`
			`} else {`
			`ConvertToBytes<Model, Width>(model, 0);`
			`}`
			`}`

			`template <class Model> void DispatchWidth(const char *file, bool query) {`
KenLM a590a3a4dadf516a1cff28c8f1c06aa89766f519 including StringStream TODO: kill istream 2015-09-29 18:58:02 +03:00			`lm::ngram::Config config;`
			`config.load_method = util::READ;`
			`std::cerr << "Using load_method = READ." << std::endl;`
			`Model model(file, config);`
KenLM 7408730be415db9b650560a8b2bd3e4e3af49ec9. unistd.hh is dead. 2015-05-19 22:27:30 +03:00			`lm::WordIndex bound = model.GetVocabulary().Bound();`
			`if (bound <= 256) {`
			`DispatchFunction<Model, uint8_t>(model, query);`
			`} else if (bound <= 65536) {`
			`DispatchFunction<Model, uint16_t>(model, query);`
			`} else if (bound <= (1ULL << 32)) {`
			`DispatchFunction<Model, uint32_t>(model, query);`
			`} else {`
			`DispatchFunction<Model, uint64_t>(model, query);`
			`}`
			`}`

			`void Dispatch(const char *file, bool query) {`
			`using namespace lm::ngram;`
			`lm::ngram::ModelType model_type;`
			`if (lm::ngram::RecognizeBinary(file, model_type)) {`
			`switch(model_type) {`
			`case PROBING:`
			`DispatchWidth<lm::ngram::ProbingModel>(file, query);`
			`break;`
			`case REST_PROBING:`
			`DispatchWidth<lm::ngram::RestProbingModel>(file, query);`
			`break;`
			`case TRIE:`
			`DispatchWidth<lm::ngram::TrieModel>(file, query);`
			`break;`
			`case QUANT_TRIE:`
			`DispatchWidth<lm::ngram::QuantTrieModel>(file, query);`
			`break;`
			`case ARRAY_TRIE:`
			`DispatchWidth<lm::ngram::ArrayTrieModel>(file, query);`
			`break;`
			`case QUANT_ARRAY_TRIE:`
			`DispatchWidth<lm::ngram::QuantArrayTrieModel>(file, query);`
			`break;`
			`default:`
			`UTIL_THROW(util::Exception, "Unrecognized kenlm model type " << model_type);`
			`}`
			`} else {`
			`UTIL_THROW(util::Exception, "Binarize before running benchmarks.");`
			`}`
			`}`

			`} // namespace`

			`int main(int argc, char *argv[]) {`
			`if (argc != 3 \|\| (strcmp(argv[1], "vocab") && strcmp(argv[1], "query"))) {`
			`std::cerr`
			`<< "Benchmark program for KenLM. Intended usage:\n"`
			`<< "#Convert text to vocabulary ids offline. These ids are tied to a model.\n"`
			`<< argv[0] << " vocab $model <$text >$text.vocab\n"`
			`<< "#Ensure files are in RAM.\n"`
			`<< "cat $text.vocab $model >/dev/null\n"`
KenLM a590a3a4dadf516a1cff28c8f1c06aa89766f519 including StringStream TODO: kill istream 2015-09-29 18:58:02 +03:00			`<< "#Timed query against the model.\n"`
			`<< argv[0] << " query $model <$text.vocab\n";`
KenLM 7408730be415db9b650560a8b2bd3e4e3af49ec9. unistd.hh is dead. 2015-05-19 22:27:30 +03:00			`return 1;`
			`}`
			`Dispatch(argv[2], !strcmp(argv[1], "query"));`
			`return 0;`
			`}`