Make sentence-bleu less greedy with memory

Don't load all references into memory up front; read them line by line, in step with the hypotheses.
Corpora with millions of sentences can now be evaluated without consuming gigabytes of RAM.
This commit is contained in:
Matthias Huck 2015-04-30 22:26:30 +01:00
parent 1d86b8fde7
commit 4ee8f2dec1
11 changed files with 98 additions and 43 deletions

View File

@ -174,7 +174,7 @@ statscore_t BleuDocScorer::calculateScore(const vector<int>& comps) const
UTIL_THROW_IF(comps.size() != kBleuNgramOrder * 2 + 1, util::Exception, "Error"); UTIL_THROW_IF(comps.size() != kBleuNgramOrder * 2 + 1, util::Exception, "Error");
float logbleu = 0.0; float logbleu = 0.0;
for (int i = 0; i < kBleuNgramOrder; ++i) { for (size_t i = 0; i < kBleuNgramOrder; ++i) {
if (comps[2*i] == 0) { if (comps[2*i] == 0) {
return 0.0; return 0.0;
} }

View File

@ -1,5 +1,4 @@
#ifndef MERT_BLEU_DOC_SCORER_H_ #pragma once
#define MERT_BLEU_DOC_SCORER_H_
#include <ostream> #include <ostream>
#include <string> #include <string>
@ -64,4 +63,3 @@ private:
} }
#endif // MERT_BLEU_DOC_SCORER_H_

View File

@ -99,9 +99,8 @@ void BleuScorer::setReferenceFiles(const vector<string>& referenceFiles)
TRACE_ERR("Loading reference from " << referenceFiles[i] << endl); TRACE_ERR("Loading reference from " << referenceFiles[i] << endl);
ifstream ifs(referenceFiles[i].c_str()); ifstream ifs(referenceFiles[i].c_str());
UTIL_THROW_IF2(!ifs, "Cannot open " << referenceFiles[i]);
if (!OpenReferenceStream(&ifs, i)) { if (!OpenReferenceStream(&ifs, i)) {
UTIL_THROW2("Unable to open " + referenceFiles[i]); UTIL_THROW2("Cannot open " + referenceFiles[i]);
} }
} }
} }
@ -152,13 +151,26 @@ void BleuScorer::ProcessReferenceLine(const std::string& line, Reference* ref) c
ref->push_back(length); ref->push_back(length);
} }
bool BleuScorer::GetNextReferenceFromStreams(std::vector<boost::shared_ptr<std::ifstream> >& referenceStreams, Reference& ref) const
{
for (vector<boost::shared_ptr<ifstream> >::iterator ifs=referenceStreams.begin(); ifs!=referenceStreams.end(); ++ifs)
{
if (!(*ifs)) return false;
string line;
if (!getline(**ifs, line)) return false;
line = preprocessSentence(line);
ProcessReferenceLine(line, &ref);
}
return true;
}
void BleuScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry) void BleuScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
{ {
UTIL_THROW_IF2(sid >= m_references.size(), "Sentence id (" << sid << ") not found in reference set"); UTIL_THROW_IF2(sid >= m_references.size(), "Sentence id (" << sid << ") not found in reference set");
CalcBleuStats(m_references[sid], text, entry); CalcBleuStats(*(m_references[sid]), text, entry);
} }
void BleuScorer::CalcBleuStats(const Reference* ref, const std::string& text, ScoreStats& entry) const void BleuScorer::CalcBleuStats(const Reference& ref, const std::string& text, ScoreStats& entry) const
{ {
NgramCounts testcounts; NgramCounts testcounts;
// stats for this line // stats for this line
@ -177,7 +189,7 @@ void BleuScorer::CalcBleuStats(const Reference* ref, const std::string& text, Sc
NgramCounts::Value correct = 0; NgramCounts::Value correct = 0;
NgramCounts::Value v = 0; NgramCounts::Value v = 0;
if (ref->get_counts()->Lookup(testcounts_it->first, &v)) { if (ref.get_counts()->Lookup(testcounts_it->first, &v)) {
correct = min(v, guess); correct = min(v, guess);
} }
stats[len * 2 - 2] += correct; stats[len * 2 - 2] += correct;
@ -207,17 +219,17 @@ statscore_t BleuScorer::calculateScore(const vector<ScoreStatsType>& comps) cons
return exp(logbleu); return exp(logbleu);
} }
int BleuScorer::CalcReferenceLength(const Reference* ref, std::size_t length) const int BleuScorer::CalcReferenceLength(const Reference& ref, std::size_t length) const
{ {
switch (m_ref_length_type) { switch (m_ref_length_type) {
case AVERAGE: case AVERAGE:
return ref->CalcAverage(); return ref.CalcAverage();
break; break;
case CLOSEST: case CLOSEST:
return ref->CalcClosest(length); return ref.CalcClosest(length);
break; break;
case SHORTEST: case SHORTEST:
return ref->CalcShortest(); return ref.CalcShortest();
break; break;
default: default:
UTIL_THROW2("Unknown reference types"); UTIL_THROW2("Unknown reference types");

View File

@ -1,23 +1,23 @@
#ifndef MERT_BLEU_SCORER_H_ #pragma once
#define MERT_BLEU_SCORER_H_
#include <ostream> #include <fstream>
#include <string> #include <string>
#include <vector> #include <vector>
#include "Types.h" #include <boost/shared_ptr.hpp>
#include "Ngram.h"
#include "Reference.h"
#include "ScopedVector.h"
#include "ScoreData.h" #include "ScoreData.h"
#include "StatisticsBasedScorer.h" #include "StatisticsBasedScorer.h"
#include "ScopedVector.h" #include "Types.h"
namespace MosesTuning namespace MosesTuning
{ {
const size_t kBleuNgramOrder = 4; const size_t kBleuNgramOrder = 4;
class NgramCounts;
class Reference;
/** /**
* Bleu scoring * Bleu scoring
*/ */
@ -42,9 +42,9 @@ public:
return 2 * kBleuNgramOrder + 1; return 2 * kBleuNgramOrder + 1;
} }
void CalcBleuStats(const Reference* ref, const std::string& text, ScoreStats& entry) const; void CalcBleuStats(const Reference& ref, const std::string& text, ScoreStats& entry) const;
int CalcReferenceLength(const Reference* ref, std::size_t length) const; int CalcReferenceLength(const Reference& ref, std::size_t length) const;
ReferenceLengthType GetReferenceLengthType() const { ReferenceLengthType GetReferenceLengthType() const {
return m_ref_length_type; return m_ref_length_type;
@ -65,7 +65,7 @@ public:
/** /**
* Count the ngrams of each type, up to the given length in the input line. * Count the ngrams of each type, up to the given length in the input line.
*/ */
std::size_t CountNgrams(const std::string& line, NgramCounts& counts, unsigned int n, bool is_testing=false) const; size_t CountNgrams(const std::string& line, NgramCounts& counts, unsigned int n, bool is_testing=false) const;
void DumpCounts(std::ostream* os, const NgramCounts& counts) const; void DumpCounts(std::ostream* os, const NgramCounts& counts) const;
@ -74,6 +74,8 @@ public:
void ProcessReferenceLine(const std::string& line, Reference* ref) const; void ProcessReferenceLine(const std::string& line, Reference* ref) const;
bool GetNextReferenceFromStreams(std::vector<boost::shared_ptr<std::ifstream> >& referenceStreams, Reference& ref) const;
//private: //private:
protected: protected:
ReferenceLengthType m_ref_length_type; ReferenceLengthType m_ref_length_type;
@ -102,4 +104,3 @@ float sentenceLevelBackgroundBleu(const std::vector<float>& sent, const std::vec
} }
#endif // MERT_BLEU_SCORER_H_

View File

@ -98,7 +98,7 @@ void NbestHopeFearDecoder::HopeFear(
size_t hope_index=0, fear_index=0, model_index=0; size_t hope_index=0, fear_index=0, model_index=0;
ValType hope_score=0, fear_score=0, model_score=0; ValType hope_score=0, fear_score=0, model_score=0;
for(size_t safe_loop=0; safe_loop<2; safe_loop++) { for(size_t safe_loop=0; safe_loop<2; safe_loop++) {
ValType hope_bleu, hope_model; ValType hope_bleu=0, hope_model=0;
for(size_t i=0; i< train_->cur_size(); i++) { for(size_t i=0; i< train_->cur_size(); i++) {
const MiraFeatureVector& vec=train_->featuresAt(i); const MiraFeatureVector& vec=train_->featuresAt(i);
ValType score = wv.score(vec); ValType score = wv.score(vec);

View File

@ -16,8 +16,7 @@ You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/ ***********************************************************************/
#ifndef MERT_HOPEFEARDECODER_H #pragma once
#define MERT_HOPEFEARDECODER_H
#include <vector> #include <vector>
@ -160,5 +159,3 @@ private:
}; };
#endif

View File

@ -1,5 +1,4 @@
#ifndef MERT_NGRAM_H_ #pragma once
#define MERT_NGRAM_H_
#include <vector> #include <vector>
#include <string> #include <string>
@ -121,4 +120,3 @@ private:
} }
#endif // MERT_NGRAM_H_

View File

@ -59,6 +59,11 @@ public:
int CalcClosest(std::size_t length) const; int CalcClosest(std::size_t length) const;
int CalcShortest() const; int CalcShortest() const;
void clear() {
m_length.clear();
m_counts->clear();
}
private: private:
NgramCounts* m_counts; NgramCounts* m_counts;

View File

@ -1,5 +1,4 @@
#ifndef MERT_SCORER_H_ #pragma once
#define MERT_SCORER_H_
#include <iostream> #include <iostream>
#include <sstream> #include <sstream>
@ -236,4 +235,3 @@ inline float score_average(const statscores_t& scores, size_t start, size_t end)
} }
#endif // MERT_SCORER_H_

View File

@ -1,9 +1,14 @@
#include <fstream>
#include <iostream> #include <iostream>
#include <vector> #include <vector>
#include <string> #include <string>
#include <boost/shared_ptr.hpp>
#include "BleuScorer.h" #include "BleuScorer.h"
#include "Reference.h"
#include "moses/Util.h" #include "moses/Util.h"
#include "util/exception.hh"
using namespace MosesTuning; using namespace MosesTuning;
@ -24,21 +29,40 @@ int main(int argc, char **argv)
BleuScorer scorer(config); BleuScorer scorer(config);
scorer.setFactors(factors); scorer.setFactors(factors);
scorer.setFilter(filter); scorer.setFilter(filter);
scorer.setReferenceFiles(refFiles); // TODO: we don't need to load the whole reference corpus into memory (this can take gigabytes of RAM if done with millions of sentences)
// Loading sentences and preparing statistics // initialize reference streams
std::vector<boost::shared_ptr<std::ifstream> > refStreams;
for (std::vector<std::string>::const_iterator refFile=refFiles.begin(); refFile!=refFiles.end(); ++refFile)
{
TRACE_ERR("Loading reference from " << *refFile << std::endl);
boost::shared_ptr<std::ifstream> ifs(new std::ifstream(refFile->c_str()));
UTIL_THROW_IF2(!ifs, "Cannot open " << *refFile);
refStreams.push_back(ifs);
}
// load sentences, preparing statistics, score
std::string nbestLine; std::string nbestLine;
int sid = -1;
Reference ref;
while ( getline(std::cin, nbestLine) ) while ( getline(std::cin, nbestLine) )
{ {
std::vector<std::string> items; std::vector<std::string> items;
Moses::TokenizeMultiCharSeparator(items, nbestLine, " ||| "); Moses::TokenizeMultiCharSeparator(items, nbestLine, " ||| ");
size_t sid = Moses::Scan<size_t>(items[0]); int sidCurrent = Moses::Scan<int>(items[0]);
if (sidCurrent != sid) {
ref.clear();
if (!scorer.GetNextReferenceFromStreams(refStreams, ref)) {
UTIL_THROW2("Missing references");
}
sid = sidCurrent;
}
ScoreStats scoreStats; ScoreStats scoreStats;
scorer.prepareStats(sid, items[1], scoreStats); scorer.CalcBleuStats(ref, items[1], scoreStats);
std::vector<float> stats(scoreStats.getArray(), scoreStats.getArray() + scoreStats.size()); std::vector<float> stats(scoreStats.getArray(), scoreStats.getArray() + scoreStats.size());
std::cout << smoothedSentenceBleu(stats) << std::endl; std::cout << smoothedSentenceBleu(stats) << std::endl;
} }
return 0; return 0;
} }

View File

@ -1,18 +1,26 @@
#include <fstream>
#include <iostream> #include <iostream>
#include <vector> #include <vector>
#include <string> #include <string>
#include <boost/shared_ptr.hpp>
#include "BleuScorer.h" #include "BleuScorer.h"
#include "Reference.h"
#include "moses/Util.h"
#include "util/exception.hh"
using namespace std; using namespace std;
using namespace MosesTuning; using namespace MosesTuning;
int main(int argc, char **argv) int main(int argc, char **argv)
{ {
if (argc == 1) { if (argc == 1) {
cerr << "Usage: ./sentence-bleu ref1 [ref2 ...] < candidate > bleu-scores" << endl; cerr << "Usage: ./sentence-bleu ref1 [ref2 ...] < candidate > bleu-scores" << endl;
return 1; return 1;
} }
vector<string> refFiles(argv + 1, argv + argc); vector<string> refFiles(argv + 1, argv + argc);
// TODO all of these are empty for now // TODO all of these are empty for now
@ -23,15 +31,28 @@ int main(int argc, char **argv)
BleuScorer scorer(config); BleuScorer scorer(config);
scorer.setFactors(factors); scorer.setFactors(factors);
scorer.setFilter(filter); scorer.setFilter(filter);
scorer.setReferenceFiles(refFiles); // TODO: we don't need to load the whole reference corpus into memory (this can take gigabytes of RAM if done with millions of sentences)
// Loading sentences and preparing statistics // initialize reference streams
vector<boost::shared_ptr<ifstream> > refStreams;
for (vector<string>::const_iterator refFile=refFiles.begin(); refFile!=refFiles.end(); ++refFile)
{
TRACE_ERR("Loading reference from " << *refFile << endl);
boost::shared_ptr<ifstream> ifs(new ifstream(refFile->c_str()));
UTIL_THROW_IF2(!ifs, "Cannot open " << *refFile);
refStreams.push_back(ifs);
}
// load sentences, preparing statistics, score
string hypothesisLine; string hypothesisLine;
size_t sid = 0; size_t sid = 0;
while (getline(std::cin, hypothesisLine)) while (getline(std::cin, hypothesisLine))
{ {
Reference ref;
if (!scorer.GetNextReferenceFromStreams(refStreams, ref)) {
UTIL_THROW2("Missing references");
}
ScoreStats scoreStats; ScoreStats scoreStats;
scorer.prepareStats(sid, hypothesisLine, scoreStats); scorer.CalcBleuStats(ref, hypothesisLine, scoreStats);
vector<float> stats(scoreStats.getArray(), scoreStats.getArray() + scoreStats.size()); vector<float> stats(scoreStats.getArray(), scoreStats.getArray() + scoreStats.size());
std::cout << smoothedSentenceBleu(stats) << std::endl; std::cout << smoothedSentenceBleu(stats) << std::endl;
++sid; ++sid;
@ -39,3 +60,4 @@ int main(int argc, char **argv)
return 0; return 0;
} }