sentence-bleu less greedy regarding memory

Don't load all references, read them line by line.
Corpora with millions of sentences can now be evaluated without consuming gigabytes of RAM.
This commit is contained in:
Matthias Huck 2015-04-30 22:26:30 +01:00
parent 1d86b8fde7
commit 4ee8f2dec1
11 changed files with 98 additions and 43 deletions

View File

@ -174,7 +174,7 @@ statscore_t BleuDocScorer::calculateScore(const vector<int>& comps) const
UTIL_THROW_IF(comps.size() != kBleuNgramOrder * 2 + 1, util::Exception, "Error");
float logbleu = 0.0;
for (int i = 0; i < kBleuNgramOrder; ++i) {
for (size_t i = 0; i < kBleuNgramOrder; ++i) {
if (comps[2*i] == 0) {
return 0.0;
}

View File

@ -1,5 +1,4 @@
#ifndef MERT_BLEU_DOC_SCORER_H_
#define MERT_BLEU_DOC_SCORER_H_
#pragma once
#include <ostream>
#include <string>
@ -64,4 +63,3 @@ private:
}
#endif // MERT_BLEU_DOC_SCORER_H_

View File

@ -99,9 +99,8 @@ void BleuScorer::setReferenceFiles(const vector<string>& referenceFiles)
TRACE_ERR("Loading reference from " << referenceFiles[i] << endl);
ifstream ifs(referenceFiles[i].c_str());
UTIL_THROW_IF2(!ifs, "Cannot open " << referenceFiles[i]);
if (!OpenReferenceStream(&ifs, i)) {
UTIL_THROW2("Unable to open " + referenceFiles[i]);
UTIL_THROW2("Cannot open " + referenceFiles[i]);
}
}
}
@ -152,13 +151,26 @@ void BleuScorer::ProcessReferenceLine(const std::string& line, Reference* ref) c
ref->push_back(length);
}
// Pulls the next reference sentence from every stream in referenceStreams.
//
// For each stream, in order, one line is read, run through
// preprocessSentence() and handed to ProcessReferenceLine() together with
// ref.
//
// Returns true when every stream supplied a line. Returns false as soon as a
// stream pointer is null or a line cannot be read; in that case lines from
// earlier streams have already been passed to ProcessReferenceLine() for ref.
bool BleuScorer::GetNextReferenceFromStreams(std::vector<boost::shared_ptr<std::ifstream> >& referenceStreams, Reference& ref) const
{
  for (std::size_t streamIdx = 0; streamIdx < referenceStreams.size(); ++streamIdx) {
    const boost::shared_ptr<std::ifstream>& stream = referenceStreams[streamIdx];
    std::string rawLine;
    // The null check comes first so the stream is only dereferenced when set.
    if (!stream || !getline(*stream, rawLine)) {
      return false;
    }
    const std::string sentence = preprocessSentence(rawLine);
    ProcessReferenceLine(sentence, &ref);
  }
  return true;
}
void BleuScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
{
UTIL_THROW_IF2(sid >= m_references.size(), "Sentence id (" << sid << ") not found in reference set");
CalcBleuStats(m_references[sid], text, entry);
CalcBleuStats(*(m_references[sid]), text, entry);
}
void BleuScorer::CalcBleuStats(const Reference* ref, const std::string& text, ScoreStats& entry) const
void BleuScorer::CalcBleuStats(const Reference& ref, const std::string& text, ScoreStats& entry) const
{
NgramCounts testcounts;
// stats for this line
@ -177,7 +189,7 @@ void BleuScorer::CalcBleuStats(const Reference* ref, const std::string& text, Sc
NgramCounts::Value correct = 0;
NgramCounts::Value v = 0;
if (ref->get_counts()->Lookup(testcounts_it->first, &v)) {
if (ref.get_counts()->Lookup(testcounts_it->first, &v)) {
correct = min(v, guess);
}
stats[len * 2 - 2] += correct;
@ -207,17 +219,17 @@ statscore_t BleuScorer::calculateScore(const vector<ScoreStatsType>& comps) cons
return exp(logbleu);
}
int BleuScorer::CalcReferenceLength(const Reference* ref, std::size_t length) const
int BleuScorer::CalcReferenceLength(const Reference& ref, std::size_t length) const
{
switch (m_ref_length_type) {
case AVERAGE:
return ref->CalcAverage();
return ref.CalcAverage();
break;
case CLOSEST:
return ref->CalcClosest(length);
return ref.CalcClosest(length);
break;
case SHORTEST:
return ref->CalcShortest();
return ref.CalcShortest();
break;
default:
UTIL_THROW2("Unknown reference types");

View File

@ -1,23 +1,23 @@
#ifndef MERT_BLEU_SCORER_H_
#define MERT_BLEU_SCORER_H_
#pragma once
#include <ostream>
#include <fstream>
#include <string>
#include <vector>
#include "Types.h"
#include <boost/shared_ptr.hpp>
#include "Ngram.h"
#include "Reference.h"
#include "ScopedVector.h"
#include "ScoreData.h"
#include "StatisticsBasedScorer.h"
#include "ScopedVector.h"
#include "Types.h"
namespace MosesTuning
{
const size_t kBleuNgramOrder = 4;
class NgramCounts;
class Reference;
/**
* Bleu scoring
*/
@ -42,9 +42,9 @@ public:
return 2 * kBleuNgramOrder + 1;
}
void CalcBleuStats(const Reference* ref, const std::string& text, ScoreStats& entry) const;
void CalcBleuStats(const Reference& ref, const std::string& text, ScoreStats& entry) const;
int CalcReferenceLength(const Reference* ref, std::size_t length) const;
int CalcReferenceLength(const Reference& ref, std::size_t length) const;
ReferenceLengthType GetReferenceLengthType() const {
return m_ref_length_type;
@ -65,7 +65,7 @@ public:
/**
* Count the ngrams of each type, up to the given length in the input line.
*/
std::size_t CountNgrams(const std::string& line, NgramCounts& counts, unsigned int n, bool is_testing=false) const;
size_t CountNgrams(const std::string& line, NgramCounts& counts, unsigned int n, bool is_testing=false) const;
void DumpCounts(std::ostream* os, const NgramCounts& counts) const;
@ -74,6 +74,8 @@ public:
void ProcessReferenceLine(const std::string& line, Reference* ref) const;
bool GetNextReferenceFromStreams(std::vector<boost::shared_ptr<std::ifstream> >& referenceStreams, Reference& ref) const;
//private:
protected:
ReferenceLengthType m_ref_length_type;
@ -102,4 +104,3 @@ float sentenceLevelBackgroundBleu(const std::vector<float>& sent, const std::vec
}
#endif // MERT_BLEU_SCORER_H_

View File

@ -98,7 +98,7 @@ void NbestHopeFearDecoder::HopeFear(
size_t hope_index=0, fear_index=0, model_index=0;
ValType hope_score=0, fear_score=0, model_score=0;
for(size_t safe_loop=0; safe_loop<2; safe_loop++) {
ValType hope_bleu, hope_model;
ValType hope_bleu=0, hope_model=0;
for(size_t i=0; i< train_->cur_size(); i++) {
const MiraFeatureVector& vec=train_->featuresAt(i);
ValType score = wv.score(vec);

View File

@ -16,8 +16,7 @@ You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#ifndef MERT_HOPEFEARDECODER_H
#define MERT_HOPEFEARDECODER_H
#pragma once
#include <vector>
@ -160,5 +159,3 @@ private:
};
#endif

View File

@ -1,5 +1,4 @@
#ifndef MERT_NGRAM_H_
#define MERT_NGRAM_H_
#pragma once
#include <vector>
#include <string>
@ -121,4 +120,3 @@ private:
}
#endif // MERT_NGRAM_H_

View File

@ -59,6 +59,11 @@ public:
int CalcClosest(std::size_t length) const;
int CalcShortest() const;
// Empties this reference so the object can be reused: clears m_length and
// calls clear() on the NgramCounts object that m_counts points to.
// NOTE(review): m_counts is dereferenced without a null check - presumably it
// is always allocated before clear() can be called; confirm.
void clear() {
m_length.clear();
m_counts->clear();
}
private:
NgramCounts* m_counts;

View File

@ -1,5 +1,4 @@
#ifndef MERT_SCORER_H_
#define MERT_SCORER_H_
#pragma once
#include <iostream>
#include <sstream>
@ -236,4 +235,3 @@ inline float score_average(const statscores_t& scores, size_t start, size_t end)
}
#endif // MERT_SCORER_H_

View File

@ -1,9 +1,14 @@
#include <fstream>
#include <iostream>
#include <vector>
#include <string>
#include <boost/shared_ptr.hpp>
#include "BleuScorer.h"
#include "Reference.h"
#include "moses/Util.h"
#include "util/exception.hh"
using namespace MosesTuning;
@ -24,21 +29,40 @@ int main(int argc, char **argv)
BleuScorer scorer(config);
scorer.setFactors(factors);
scorer.setFilter(filter);
scorer.setReferenceFiles(refFiles); // TODO: we don't need to load the whole reference corpus into memory (this can take gigabytes of RAM if done with millions of sentences)
// Loading sentences and preparing statistics
// Open one input stream per reference file. The streams are kept open and
// consumed incrementally instead of loading the references up front.
std::vector<boost::shared_ptr<std::ifstream> > refStreams;
for (std::vector<std::string>::const_iterator refFile=refFiles.begin(); refFile!=refFiles.end(); ++refFile)
{
  TRACE_ERR("Loading reference from " << *refFile << std::endl);
  boost::shared_ptr<std::ifstream> ifs(new std::ifstream(refFile->c_str()));
  // Test the stream itself, not the smart pointer: `!ifs` is true only for a
  // null pointer, which `new` never yields, so a file that failed to open
  // would not be reported here.
  UTIL_THROW_IF2(!*ifs, "Cannot open " << *refFile);
  refStreams.push_back(ifs);
}
// load sentences, preparing statistics, score
std::string nbestLine;
int sid = -1;
Reference ref;
while ( getline(std::cin, nbestLine) )
{
std::vector<std::string> items;
Moses::TokenizeMultiCharSeparator(items, nbestLine, " ||| ");
size_t sid = Moses::Scan<size_t>(items[0]);
int sidCurrent = Moses::Scan<int>(items[0]);
if (sidCurrent != sid) {
ref.clear();
if (!scorer.GetNextReferenceFromStreams(refStreams, ref)) {
UTIL_THROW2("Missing references");
}
sid = sidCurrent;
}
ScoreStats scoreStats;
scorer.prepareStats(sid, items[1], scoreStats);
scorer.CalcBleuStats(ref, items[1], scoreStats);
std::vector<float> stats(scoreStats.getArray(), scoreStats.getArray() + scoreStats.size());
std::cout << smoothedSentenceBleu(stats) << std::endl;
}
return 0;
}

View File

@ -1,18 +1,26 @@
#include <fstream>
#include <iostream>
#include <vector>
#include <string>
#include <boost/shared_ptr.hpp>
#include "BleuScorer.h"
#include "Reference.h"
#include "moses/Util.h"
#include "util/exception.hh"
using namespace std;
using namespace MosesTuning;
int main(int argc, char **argv)
{
if (argc == 1) {
cerr << "Usage: ./sentence-bleu ref1 [ref2 ...] < candidate > bleu-scores" << endl;
return 1;
}
vector<string> refFiles(argv + 1, argv + argc);
// TODO all of these are empty for now
@ -23,15 +31,28 @@ int main(int argc, char **argv)
BleuScorer scorer(config);
scorer.setFactors(factors);
scorer.setFilter(filter);
scorer.setReferenceFiles(refFiles); // TODO: we don't need to load the whole reference corpus into memory (this can take gigabytes of RAM if done with millions of sentences)
// Loading sentences and preparing statistics
// Open one input stream per reference file. The streams are kept open and
// consumed incrementally instead of loading the references up front.
vector<boost::shared_ptr<ifstream> > refStreams;
for (vector<string>::const_iterator refFile=refFiles.begin(); refFile!=refFiles.end(); ++refFile)
{
  TRACE_ERR("Loading reference from " << *refFile << endl);
  boost::shared_ptr<ifstream> ifs(new ifstream(refFile->c_str()));
  // Test the stream itself, not the smart pointer: `!ifs` is true only for a
  // null pointer, which `new` never yields, so a file that failed to open
  // would not be reported here.
  UTIL_THROW_IF2(!*ifs, "Cannot open " << *refFile);
  refStreams.push_back(ifs);
}
// load sentences, preparing statistics, score
string hypothesisLine;
size_t sid = 0;
while (getline(std::cin, hypothesisLine))
{
Reference ref;
if (!scorer.GetNextReferenceFromStreams(refStreams, ref)) {
UTIL_THROW2("Missing references");
}
ScoreStats scoreStats;
scorer.prepareStats(sid, hypothesisLine, scoreStats);
scorer.CalcBleuStats(ref, hypothesisLine, scoreStats);
vector<float> stats(scoreStats.getArray(), scoreStats.getArray() + scoreStats.size());
std::cout << smoothedSentenceBleu(stats) << std::endl;
++sid;
@ -39,3 +60,4 @@ int main(int argc, char **argv)
return 0;
}