sentence-bleu-nbest

This commit is contained in:
Matthias Huck 2015-04-30 19:44:29 +01:00
parent e98a2fc980
commit 34d1d3a904
9 changed files with 130 additions and 93 deletions

View File

@ -45,14 +45,14 @@ BleuScorer::BleuScorer(const string& config)
} else if (reflen == REFLEN_CLOSEST) {
m_ref_length_type = CLOSEST;
} else {
throw runtime_error("Unknown reference length strategy: " + reflen);
UTIL_THROW2("Unknown reference length strategy: " + reflen);
}
}
BleuScorer::~BleuScorer() {}
size_t BleuScorer::CountNgrams(const string& line, NgramCounts& counts,
unsigned int n, bool is_testing)
unsigned int n, bool is_testing) const
{
assert(n > 0);
vector<int> encoded_tokens;
@ -94,25 +94,18 @@ void BleuScorer::setReferenceFiles(const vector<string>& referenceFiles)
mert::VocabularyFactory::GetVocabulary()->clear();
//load reference data
for (size_t i = 0; i < referenceFiles.size(); ++i) {
for (size_t i = 0; i < referenceFiles.size(); ++i)
{
TRACE_ERR("Loading reference from " << referenceFiles[i] << endl);
if (!OpenReference(referenceFiles[i].c_str(), i)) {
throw runtime_error("Unable to open " + referenceFiles[i]);
ifstream ifs(referenceFiles[i].c_str());
UTIL_THROW_IF2(!ifs, "Cannot open " << referenceFiles[i]);
if (!OpenReferenceStream(&ifs, i)) {
UTIL_THROW2("Unable to open " + referenceFiles[i]);
}
}
}
// Open the reference file named `filename` and hand its contents to
// OpenReferenceStream for parsing under reference set `file_id`.
// Reports on stderr and returns false when the file cannot be opened.
bool BleuScorer::OpenReference(const char* filename, size_t file_id)
{
  ifstream ifs(filename);
  if (ifs) {
    return OpenReferenceStream(&ifs, file_id);
  }
  cerr << "Cannot open " << filename << endl;
  return false;
}
bool BleuScorer::OpenReferenceStream(istream* is, size_t file_id)
{
if (is == NULL) return false;
@ -120,15 +113,27 @@ bool BleuScorer::OpenReferenceStream(istream* is, size_t file_id)
string line;
size_t sid = 0;
while (getline(*is, line)) {
// TODO: rather than loading the whole reference corpus into memory, can we stream it line by line?
// (loading the whole reference corpus can take gigabytes of RAM if done with millions of sentences)
line = preprocessSentence(line);
if (file_id == 0) {
Reference* ref = new Reference;
m_references.push_back(ref); // Take ownership of the Reference object.
}
if (m_references.size() <= sid) {
cerr << "Reference " << file_id << "has too many sentences." << endl;
return false;
UTIL_THROW_IF2(m_references.size() <= sid, "Reference " << file_id << "has too many sentences.");
ProcessReferenceLine(line, m_references[sid]);
if (sid > 0 && sid % 100 == 0) {
TRACE_ERR(".");
}
++sid;
}
return true;
}
void BleuScorer::ProcessReferenceLine(const std::string& line, Reference* ref) const
{
NgramCounts counts;
size_t length = CountNgrams(line, counts, kBleuNgramOrder);
@ -138,35 +143,30 @@ bool BleuScorer::OpenReferenceStream(istream* is, size_t file_id)
const NgramCounts::Value newcount = ci->second;
NgramCounts::Value oldcount = 0;
m_references[sid]->get_counts()->Lookup(ngram, &oldcount);
ref->get_counts()->Lookup(ngram, &oldcount);
if (newcount > oldcount) {
m_references[sid]->get_counts()->operator[](ngram) = newcount;
ref->get_counts()->operator[](ngram) = newcount;
}
}
//add in the length
m_references[sid]->push_back(length);
if (sid > 0 && sid % 100 == 0) {
TRACE_ERR(".");
}
++sid;
}
return true;
ref->push_back(length);
}
void BleuScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
{
if (sid >= m_references.size()) {
stringstream msg;
msg << "Sentence id (" << sid << ") not found in reference set";
throw runtime_error(msg.str());
}
UTIL_THROW_IF2(sid >= m_references.size(), "Sentence id (" << sid << ") not found in reference set");
CalcBleuStats(m_references[sid], text, entry);
}
void BleuScorer::CalcBleuStats(const Reference* ref, const std::string& text, ScoreStats& entry) const
{
NgramCounts testcounts;
// stats for this line
vector<ScoreStatsType> stats(kBleuNgramOrder * 2);
string sentence = preprocessSentence(text);
const size_t length = CountNgrams(sentence, testcounts, kBleuNgramOrder, true);
const int reference_len = CalcReferenceLength(sid, length);
const int reference_len = CalcReferenceLength(ref, length);
stats.push_back(reference_len);
//precision on each ngram type
@ -177,7 +177,7 @@ void BleuScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
NgramCounts::Value correct = 0;
NgramCounts::Value v = 0;
if (m_references[sid]->get_counts()->Lookup(testcounts_it->first, &v)) {
if (ref->get_counts()->Lookup(testcounts_it->first, &v)) {
correct = min(v, guess);
}
stats[len * 2 - 2] += correct;
@ -207,21 +207,20 @@ statscore_t BleuScorer::calculateScore(const vector<ScoreStatsType>& comps) cons
return exp(logbleu);
}
int BleuScorer::CalcReferenceLength(size_t sentence_id, size_t length)
int BleuScorer::CalcReferenceLength(const Reference* ref, std::size_t length) const
{
switch (m_ref_length_type) {
case AVERAGE:
return m_references[sentence_id]->CalcAverage();
return ref->CalcAverage();
break;
case CLOSEST:
return m_references[sentence_id]->CalcClosest(length);
return ref->CalcClosest(length);
break;
case SHORTEST:
return m_references[sentence_id]->CalcShortest();
return ref->CalcShortest();
break;
default:
cerr << "unknown reference types." << endl;
exit(1);
UTIL_THROW2("Unknown reference types");
}
}
@ -298,29 +297,23 @@ vector<float> BleuScorer::ScoreNbestList(const string& scoreFile, const string&
vector<FeatureDataIterator> featureDataIters;
vector<ScoreDataIterator> scoreDataIters;
for (size_t i = 0; i < featureFiles.size(); ++i) {
for (size_t i = 0; i < featureFiles.size(); ++i)
{
featureDataIters.push_back(FeatureDataIterator(featureFiles[i]));
scoreDataIters.push_back(ScoreDataIterator(scoreFiles[i]));
}
vector<pair<size_t,size_t> > hypotheses;
if (featureDataIters[0] == FeatureDataIterator::end()) {
cerr << "Error: at the end of feature data iterator" << endl;
exit(1);
}
for (size_t i = 0; i < featureFiles.size(); ++i) {
if (featureDataIters[i] == FeatureDataIterator::end()) {
cerr << "Error: Feature file " << i << " ended prematurely" << endl;
exit(1);
}
if (scoreDataIters[i] == ScoreDataIterator::end()) {
cerr << "Error: Score file " << i << " ended prematurely" << endl;
exit(1);
}
if (featureDataIters[i]->size() != scoreDataIters[i]->size()) {
cerr << "Error: features and scores have different size" << endl;
exit(1);
}
UTIL_THROW_IF2(featureDataIters[0] == FeatureDataIterator::end(),
"At the end of feature data iterator");
for (size_t i = 0; i < featureFiles.size(); ++i)
{
UTIL_THROW_IF2(featureDataIters[i] == FeatureDataIterator::end(),
"Feature file " << i << " ended prematurely");
UTIL_THROW_IF2(scoreDataIters[i] == ScoreDataIterator::end(),
"Score file " << i << " ended prematurely");
UTIL_THROW_IF2(featureDataIters[i]->size() != scoreDataIters[i]->size(),
"Features and scores have different size");
for (size_t j = 0; j < featureDataIters[i]->size(); ++j) {
hypotheses.push_back(pair<size_t,size_t>(i,j));
}

View File

@ -42,11 +42,14 @@ public:
return 2 * kBleuNgramOrder + 1;
}
int CalcReferenceLength(std::size_t sentence_id, std::size_t length);
void CalcBleuStats(const Reference* ref, const std::string& text, ScoreStats& entry) const;
int CalcReferenceLength(const Reference* ref, std::size_t length) const;
ReferenceLengthType GetReferenceLengthType() const {
return m_ref_length_type;
}
void SetReferenceLengthType(ReferenceLengthType type) {
m_ref_length_type = type;
}
@ -62,14 +65,14 @@ public:
/**
* Count the ngrams of each type, up to the given length in the input line.
*/
std::size_t CountNgrams(const std::string& line, NgramCounts& counts, unsigned int n, bool is_testing=false);
std::size_t CountNgrams(const std::string& line, NgramCounts& counts, unsigned int n, bool is_testing=false) const;
void DumpCounts(std::ostream* os, const NgramCounts& counts) const;
bool OpenReference(const char* filename, std::size_t file_id);
// NOTE: this function is used for unit testing.
virtual bool OpenReferenceStream(std::istream* is, std::size_t file_id);
bool OpenReferenceStream(std::istream* is, std::size_t file_id);
void ProcessReferenceLine(const std::string& line, Reference* ref) const;
//private:
protected:

View File

@ -66,11 +66,13 @@ exe evaluator : evaluator.cpp mert_lib ;
exe sentence-bleu : sentence-bleu.cpp mert_lib ;
exe sentence-bleu-nbest : sentence-bleu-nbest.cpp mert_lib ;
exe pro : pro.cpp mert_lib ..//boost_program_options ;
exe kbmira : kbmira.cpp mert_lib ..//boost_program_options ..//boost_filesystem ;
alias programs : mert extractor evaluator pro kbmira sentence-bleu ;
alias programs : mert extractor evaluator pro kbmira sentence-bleu sentence-bleu-nbest ;
unit-test bleu_scorer_test : BleuScorerTest.cpp mert_lib ..//boost_unit_test_framework ;
unit-test feature_data_test : FeatureDataTest.cpp mert_lib ..//boost_unit_test_framework ;

View File

@ -64,7 +64,7 @@ void Scorer::InitConfig(const string& config)
}
}
void Scorer::TokenizeAndEncode(const string& line, vector<int>& encoded)
void Scorer::TokenizeAndEncode(const string& line, vector<int>& encoded) const
{
for (util::TokenIter<util::AnyCharacter, true> it(line, util::AnyCharacter(" "));
it; ++it) {
@ -81,7 +81,7 @@ void Scorer::TokenizeAndEncode(const string& line, vector<int>& encoded)
}
}
void Scorer::TokenizeAndEncodeTesting(const string& line, vector<int>& encoded)
void Scorer::TokenizeAndEncodeTesting(const string& line, vector<int>& encoded) const
{
for (util::TokenIter<util::AnyCharacter, true> it(line, util::AnyCharacter(" "));
it; ++it) {

View File

@ -187,12 +187,12 @@ protected:
* Tokenise line and encode.
* Note: We assume that all tokens are separated by whitespaces.
*/
void TokenizeAndEncode(const std::string& line, std::vector<int>& encoded);
void TokenizeAndEncode(const std::string& line, std::vector<int>& encoded) const;
/*
* Tokenize functions for testing only.
*/
void TokenizeAndEncodeTesting(const std::string& line, std::vector<int>& encoded);
void TokenizeAndEncodeTesting(const std::string& line, std::vector<int>& encoded) const;
/**
* Every inherited scorer should call this function for each sentence

View File

@ -0,0 +1,44 @@
#include <iostream>
#include <vector>
#include <string>
#include "BleuScorer.h"
#include "moses/Util.h"
using namespace MosesTuning;
// Read a Moses n-best list ("sid ||| hypothesis ||| ...") from stdin and
// print one smoothed sentence-level BLEU score per hypothesis, computed
// against the reference files given on the command line.
// Returns 1 (with a usage message) when no reference file is supplied.
int main(int argc, char **argv)
{
  if (argc == 1) {
    std::cerr << "Usage: ./sentence-bleu-nbest ref1 [ref2 ...] < plain-nbest > bleu-scores" << std::endl;
    return 1;
  }

  std::vector<std::string> refFiles(argv + 1, argv + argc);

  // TODO all of these are empty for now
  std::string config;
  std::string factors;
  std::string filter;

  BleuScorer scorer(config);
  scorer.setFactors(factors);
  scorer.setFilter(filter);

  scorer.setReferenceFiles(refFiles); // TODO: we don't need to load the whole reference corpus into memory (this can take gigabytes of RAM if done with millions of sentences)

  // Loading sentences and preparing statistics
  std::string nbestLine;
  while ( getline(std::cin, nbestLine) )
  {
    std::vector<std::string> items;
    Moses::TokenizeMultiCharSeparator(items, nbestLine, " ||| ");
    // Guard against malformed input: indexing items[0]/items[1] below is
    // undefined behavior unless the line has at least "sid ||| hypothesis".
    if (items.size() < 2) {
      std::cerr << "Warning: skipping malformed n-best line: " << nbestLine << std::endl;
      continue;
    }
    size_t sid = Moses::Scan<size_t>(items[0]);

    ScoreStats scoreStats;
    scorer.prepareStats(sid, items[1], scoreStats);

    std::vector<float> stats(scoreStats.getArray(), scoreStats.getArray() + scoreStats.size());
    std::cout << smoothedSentenceBleu(stats) << std::endl;
  }

  return 0;
}

View File

@ -23,22 +23,19 @@ int main(int argc, char **argv)
BleuScorer scorer(config);
scorer.setFactors(factors);
scorer.setFilter(filter);
scorer.setReferenceFiles(refFiles);
vector<ScoreStats> entries;
scorer.setReferenceFiles(refFiles); // TODO: we don't need to load the whole reference corpus into memory (this can take gigabytes of RAM if done with millions of sentences)
// Loading sentences and preparing statistics
ScoreStats scoreentry;
string line;
while (getline(cin, line)) {
scorer.prepareStats(entries.size(), line, scoreentry);
entries.push_back(scoreentry);
string hypothesisLine;
size_t sid = 0;
while (getline(std::cin, hypothesisLine))
{
ScoreStats scoreStats;
scorer.prepareStats(sid, hypothesisLine, scoreStats);
vector<float> stats(scoreStats.getArray(), scoreStats.getArray() + scoreStats.size());
std::cout << smoothedSentenceBleu(stats) << std::endl;
++sid;
}
vector<ScoreStats>::const_iterator sentIt;
for (sentIt = entries.begin(); sentIt != entries.end(); sentIt++) {
vector<float> stats(sentIt->getArray(), sentIt->getArray() + sentIt->size());
cout << smoothedSentenceBleu(stats) << "\n";
}
return 0;
}

View File

@ -90,13 +90,6 @@ bool FileExists(const std::string& filePath)
return !ifs.fail();
}
// Return a copy of `str` with any leading and trailing characters that
// occur in `dropChars` removed; a string consisting entirely of such
// characters trims down to the empty string.
const std::string Trim(const std::string& str, const std::string dropChars)
{
  const std::string::size_type first = str.find_first_not_of(dropChars);
  if (first == std::string::npos) {
    return std::string();
  }
  const std::string::size_type last = str.find_last_not_of(dropChars);
  return str.substr(first, last - first + 1);
}
void ResetUserTime()
{
g_timer.start();

View File

@ -19,8 +19,7 @@ License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#ifndef moses_Util_h
#define moses_Util_h
#pragma once
#include <iostream>
#include <fstream>
@ -89,10 +88,17 @@ namespace Moses
#define NTH_ELEMENT4(begin, middle, end, orderer) std::nth_element(begin, middle, end, orderer)
#endif
//! delete white spaces at beginning and end of string
const std::string Trim(const std::string& str, const std::string dropChars = " \t\n\r");
const std::string ToLower(const std::string& str);
//! Return a copy of `str` with leading and trailing characters from
//! `dropChars` (common whitespace by default) removed. A string made up
//! entirely of such characters trims to the empty string.
// NOTE(review): dropChars is now taken by const reference — the by-value
// parameter copied a std::string on every call of this header-inline
// utility for no benefit; callers are unaffected.
inline std::string Trim(const std::string& str, const std::string& dropChars = " \t\n\r")
{
  std::string res = str;
  // Drop the tail first, then the head (use `res` consistently; the old
  // body mixed `str` and `res`, which only worked because res == str here).
  res.erase(res.find_last_not_of(dropChars) + 1);
  return res.erase(0, res.find_first_not_of(dropChars));
}
//! get string representation of any object/variable, as long as it can pipe to a stream
template<typename T>
inline std::string SPrint(const T &input)
@ -533,4 +539,3 @@ void ShowWeights();
} // namespace
#endif