mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-08-16 06:50:32 +03:00
sentence-bleu less greedy regarding memory
Don't load all references into memory at once; read them line by line instead. Corpora with millions of sentences can now be evaluated without consuming gigabytes of RAM.
This commit is contained in:
parent
1d86b8fde7
commit
4ee8f2dec1
@ -174,7 +174,7 @@ statscore_t BleuDocScorer::calculateScore(const vector<int>& comps) const
|
||||
UTIL_THROW_IF(comps.size() != kBleuNgramOrder * 2 + 1, util::Exception, "Error");
|
||||
|
||||
float logbleu = 0.0;
|
||||
for (int i = 0; i < kBleuNgramOrder; ++i) {
|
||||
for (size_t i = 0; i < kBleuNgramOrder; ++i) {
|
||||
if (comps[2*i] == 0) {
|
||||
return 0.0;
|
||||
}
|
||||
|
@ -1,5 +1,4 @@
|
||||
#ifndef MERT_BLEU_DOC_SCORER_H_
|
||||
#define MERT_BLEU_DOC_SCORER_H_
|
||||
#pragma once
|
||||
|
||||
#include <ostream>
|
||||
#include <string>
|
||||
@ -64,4 +63,3 @@ private:
|
||||
|
||||
}
|
||||
|
||||
#endif // MERT_BLEU_DOC_SCORER_H_
|
||||
|
@ -99,9 +99,8 @@ void BleuScorer::setReferenceFiles(const vector<string>& referenceFiles)
|
||||
TRACE_ERR("Loading reference from " << referenceFiles[i] << endl);
|
||||
|
||||
ifstream ifs(referenceFiles[i].c_str());
|
||||
UTIL_THROW_IF2(!ifs, "Cannot open " << referenceFiles[i]);
|
||||
if (!OpenReferenceStream(&ifs, i)) {
|
||||
UTIL_THROW2("Unable to open " + referenceFiles[i]);
|
||||
UTIL_THROW2("Cannot open " + referenceFiles[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -152,13 +151,26 @@ void BleuScorer::ProcessReferenceLine(const std::string& line, Reference* ref) c
|
||||
ref->push_back(length);
|
||||
}
|
||||
|
||||
/**
 * Pull one line from each reference stream and add it to ref.
 *
 * For every stream, in order, the next line is read, run through
 * preprocessSentence() and handed to ProcessReferenceLine() together
 * with ref.
 *
 * @param referenceStreams  one open input stream per reference file
 * @param ref               reference object that receives the processed lines
 * @return true when a line was read from every stream; false as soon as a
 *         stream pointer is empty or a stream has no further line. In the
 *         false case ref may already contain data from earlier streams.
 */
bool BleuScorer::GetNextReferenceFromStreams(std::vector<boost::shared_ptr<std::ifstream> >& referenceStreams, Reference& ref) const
{
  typedef std::vector<boost::shared_ptr<std::ifstream> > StreamList;

  const StreamList::iterator last = referenceStreams.end();
  for (StreamList::iterator cur = referenceStreams.begin(); cur != last; ++cur) {
    const boost::shared_ptr<std::ifstream>& stream = *cur;
    if (!stream) {
      return false;
    }

    std::string raw;
    if (!std::getline(*stream, raw)) {
      return false;
    }

    std::string cleaned = preprocessSentence(raw);
    ProcessReferenceLine(cleaned, &ref);
  }
  return true;
}
|
||||
|
||||
void BleuScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
|
||||
{
|
||||
UTIL_THROW_IF2(sid >= m_references.size(), "Sentence id (" << sid << ") not found in reference set");
|
||||
CalcBleuStats(m_references[sid], text, entry);
|
||||
CalcBleuStats(*(m_references[sid]), text, entry);
|
||||
}
|
||||
|
||||
void BleuScorer::CalcBleuStats(const Reference* ref, const std::string& text, ScoreStats& entry) const
|
||||
void BleuScorer::CalcBleuStats(const Reference& ref, const std::string& text, ScoreStats& entry) const
|
||||
{
|
||||
NgramCounts testcounts;
|
||||
// stats for this line
|
||||
@ -177,7 +189,7 @@ void BleuScorer::CalcBleuStats(const Reference* ref, const std::string& text, Sc
|
||||
NgramCounts::Value correct = 0;
|
||||
|
||||
NgramCounts::Value v = 0;
|
||||
if (ref->get_counts()->Lookup(testcounts_it->first, &v)) {
|
||||
if (ref.get_counts()->Lookup(testcounts_it->first, &v)) {
|
||||
correct = min(v, guess);
|
||||
}
|
||||
stats[len * 2 - 2] += correct;
|
||||
@ -207,17 +219,17 @@ statscore_t BleuScorer::calculateScore(const vector<ScoreStatsType>& comps) cons
|
||||
return exp(logbleu);
|
||||
}
|
||||
|
||||
int BleuScorer::CalcReferenceLength(const Reference* ref, std::size_t length) const
|
||||
int BleuScorer::CalcReferenceLength(const Reference& ref, std::size_t length) const
|
||||
{
|
||||
switch (m_ref_length_type) {
|
||||
case AVERAGE:
|
||||
return ref->CalcAverage();
|
||||
return ref.CalcAverage();
|
||||
break;
|
||||
case CLOSEST:
|
||||
return ref->CalcClosest(length);
|
||||
return ref.CalcClosest(length);
|
||||
break;
|
||||
case SHORTEST:
|
||||
return ref->CalcShortest();
|
||||
return ref.CalcShortest();
|
||||
break;
|
||||
default:
|
||||
UTIL_THROW2("Unknown reference types");
|
||||
|
@ -1,23 +1,23 @@
|
||||
#ifndef MERT_BLEU_SCORER_H_
|
||||
#define MERT_BLEU_SCORER_H_
|
||||
#pragma once
|
||||
|
||||
#include <ostream>
|
||||
#include <fstream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "Types.h"
|
||||
#include <boost/shared_ptr.hpp>
|
||||
|
||||
#include "Ngram.h"
|
||||
#include "Reference.h"
|
||||
#include "ScopedVector.h"
|
||||
#include "ScoreData.h"
|
||||
#include "StatisticsBasedScorer.h"
|
||||
#include "ScopedVector.h"
|
||||
#include "Types.h"
|
||||
|
||||
namespace MosesTuning
|
||||
{
|
||||
|
||||
const size_t kBleuNgramOrder = 4;
|
||||
|
||||
class NgramCounts;
|
||||
class Reference;
|
||||
|
||||
/**
|
||||
* Bleu scoring
|
||||
*/
|
||||
@ -42,9 +42,9 @@ public:
|
||||
return 2 * kBleuNgramOrder + 1;
|
||||
}
|
||||
|
||||
void CalcBleuStats(const Reference* ref, const std::string& text, ScoreStats& entry) const;
|
||||
void CalcBleuStats(const Reference& ref, const std::string& text, ScoreStats& entry) const;
|
||||
|
||||
int CalcReferenceLength(const Reference* ref, std::size_t length) const;
|
||||
int CalcReferenceLength(const Reference& ref, std::size_t length) const;
|
||||
|
||||
ReferenceLengthType GetReferenceLengthType() const {
|
||||
return m_ref_length_type;
|
||||
@ -65,7 +65,7 @@ public:
|
||||
/**
|
||||
* Count the ngrams of each type, up to the given length in the input line.
|
||||
*/
|
||||
std::size_t CountNgrams(const std::string& line, NgramCounts& counts, unsigned int n, bool is_testing=false) const;
|
||||
size_t CountNgrams(const std::string& line, NgramCounts& counts, unsigned int n, bool is_testing=false) const;
|
||||
|
||||
void DumpCounts(std::ostream* os, const NgramCounts& counts) const;
|
||||
|
||||
@ -74,6 +74,8 @@ public:
|
||||
|
||||
void ProcessReferenceLine(const std::string& line, Reference* ref) const;
|
||||
|
||||
bool GetNextReferenceFromStreams(std::vector<boost::shared_ptr<std::ifstream> >& referenceStreams, Reference& ref) const;
|
||||
|
||||
//private:
|
||||
protected:
|
||||
ReferenceLengthType m_ref_length_type;
|
||||
@ -102,4 +104,3 @@ float sentenceLevelBackgroundBleu(const std::vector<float>& sent, const std::vec
|
||||
|
||||
}
|
||||
|
||||
#endif // MERT_BLEU_SCORER_H_
|
||||
|
@ -98,7 +98,7 @@ void NbestHopeFearDecoder::HopeFear(
|
||||
size_t hope_index=0, fear_index=0, model_index=0;
|
||||
ValType hope_score=0, fear_score=0, model_score=0;
|
||||
for(size_t safe_loop=0; safe_loop<2; safe_loop++) {
|
||||
ValType hope_bleu, hope_model;
|
||||
ValType hope_bleu=0, hope_model=0;
|
||||
for(size_t i=0; i< train_->cur_size(); i++) {
|
||||
const MiraFeatureVector& vec=train_->featuresAt(i);
|
||||
ValType score = wv.score(vec);
|
||||
|
@ -16,8 +16,7 @@ You should have received a copy of the GNU Lesser General Public
|
||||
License along with this library; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
***********************************************************************/
|
||||
#ifndef MERT_HOPEFEARDECODER_H
|
||||
#define MERT_HOPEFEARDECODER_H
|
||||
#pragma once
|
||||
|
||||
#include <vector>
|
||||
|
||||
@ -160,5 +159,3 @@ private:
|
||||
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
|
@ -1,5 +1,4 @@
|
||||
#ifndef MERT_NGRAM_H_
|
||||
#define MERT_NGRAM_H_
|
||||
#pragma once
|
||||
|
||||
#include <vector>
|
||||
#include <string>
|
||||
@ -121,4 +120,3 @@ private:
|
||||
|
||||
}
|
||||
|
||||
#endif // MERT_NGRAM_H_
|
||||
|
@ -59,6 +59,11 @@ public:
|
||||
int CalcClosest(std::size_t length) const;
|
||||
int CalcShortest() const;
|
||||
|
||||
/// Empties this reference by clearing both the n-gram counts held in
/// m_counts and the entries stored in m_length.
void clear() {
  m_counts->clear();
  m_length.clear();
}
|
||||
|
||||
private:
|
||||
NgramCounts* m_counts;
|
||||
|
||||
|
@ -1,5 +1,4 @@
|
||||
#ifndef MERT_SCORER_H_
|
||||
#define MERT_SCORER_H_
|
||||
#pragma once
|
||||
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
@ -236,4 +235,3 @@ inline float score_average(const statscores_t& scores, size_t start, size_t end)
|
||||
|
||||
}
|
||||
|
||||
#endif // MERT_SCORER_H_
|
||||
|
@ -1,9 +1,14 @@
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
|
||||
#include <boost/shared_ptr.hpp>
|
||||
|
||||
#include "BleuScorer.h"
|
||||
#include "Reference.h"
|
||||
#include "moses/Util.h"
|
||||
#include "util/exception.hh"
|
||||
|
||||
using namespace MosesTuning;
|
||||
|
||||
@ -24,21 +29,40 @@ int main(int argc, char **argv)
|
||||
BleuScorer scorer(config);
|
||||
scorer.setFactors(factors);
|
||||
scorer.setFilter(filter);
|
||||
scorer.setReferenceFiles(refFiles); // TODO: we don't need to load the whole reference corpus into memory (this can take gigabytes of RAM if done with millions of sentences)
|
||||
|
||||
// Loading sentences and preparing statistics
|
||||
// initialize reference streams
|
||||
std::vector<boost::shared_ptr<std::ifstream> > refStreams;
|
||||
for (std::vector<std::string>::const_iterator refFile=refFiles.begin(); refFile!=refFiles.end(); ++refFile)
|
||||
{
|
||||
TRACE_ERR("Loading reference from " << *refFile << std::endl);
|
||||
boost::shared_ptr<std::ifstream> ifs(new std::ifstream(refFile->c_str()));
|
||||
UTIL_THROW_IF2(!ifs, "Cannot open " << *refFile);
|
||||
refStreams.push_back(ifs);
|
||||
}
|
||||
|
||||
// load sentences, preparing statistics, score
|
||||
std::string nbestLine;
|
||||
int sid = -1;
|
||||
Reference ref;
|
||||
while ( getline(std::cin, nbestLine) )
|
||||
{
|
||||
std::vector<std::string> items;
|
||||
Moses::TokenizeMultiCharSeparator(items, nbestLine, " ||| ");
|
||||
size_t sid = Moses::Scan<size_t>(items[0]);
|
||||
int sidCurrent = Moses::Scan<int>(items[0]);
|
||||
|
||||
if (sidCurrent != sid) {
|
||||
ref.clear();
|
||||
if (!scorer.GetNextReferenceFromStreams(refStreams, ref)) {
|
||||
UTIL_THROW2("Missing references");
|
||||
}
|
||||
sid = sidCurrent;
|
||||
}
|
||||
ScoreStats scoreStats;
|
||||
scorer.prepareStats(sid, items[1], scoreStats);
|
||||
scorer.CalcBleuStats(ref, items[1], scoreStats);
|
||||
std::vector<float> stats(scoreStats.getArray(), scoreStats.getArray() + scoreStats.size());
|
||||
std::cout << smoothedSentenceBleu(stats) << std::endl;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -1,18 +1,26 @@
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
|
||||
#include <boost/shared_ptr.hpp>
|
||||
|
||||
#include "BleuScorer.h"
|
||||
#include "Reference.h"
|
||||
#include "moses/Util.h"
|
||||
#include "util/exception.hh"
|
||||
|
||||
using namespace std;
|
||||
using namespace MosesTuning;
|
||||
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
if (argc == 1) {
|
||||
cerr << "Usage: ./sentence-bleu ref1 [ref2 ...] < candidate > bleu-scores" << endl;
|
||||
return 1;
|
||||
}
|
||||
|
||||
vector<string> refFiles(argv + 1, argv + argc);
|
||||
|
||||
// TODO all of these are empty for now
|
||||
@ -23,15 +31,28 @@ int main(int argc, char **argv)
|
||||
BleuScorer scorer(config);
|
||||
scorer.setFactors(factors);
|
||||
scorer.setFilter(filter);
|
||||
scorer.setReferenceFiles(refFiles); // TODO: we don't need to load the whole reference corpus into memory (this can take gigabytes of RAM if done with millions of sentences)
|
||||
|
||||
// Loading sentences and preparing statistics
|
||||
// initialize reference streams
|
||||
vector<boost::shared_ptr<ifstream> > refStreams;
|
||||
for (vector<string>::const_iterator refFile=refFiles.begin(); refFile!=refFiles.end(); ++refFile)
|
||||
{
|
||||
TRACE_ERR("Loading reference from " << *refFile << endl);
|
||||
boost::shared_ptr<ifstream> ifs(new ifstream(refFile->c_str()));
|
||||
UTIL_THROW_IF2(!ifs, "Cannot open " << *refFile);
|
||||
refStreams.push_back(ifs);
|
||||
}
|
||||
|
||||
// load sentences, preparing statistics, score
|
||||
string hypothesisLine;
|
||||
size_t sid = 0;
|
||||
while (getline(std::cin, hypothesisLine))
|
||||
{
|
||||
Reference ref;
|
||||
if (!scorer.GetNextReferenceFromStreams(refStreams, ref)) {
|
||||
UTIL_THROW2("Missing references");
|
||||
}
|
||||
ScoreStats scoreStats;
|
||||
scorer.prepareStats(sid, hypothesisLine, scoreStats);
|
||||
scorer.CalcBleuStats(ref, hypothesisLine, scoreStats);
|
||||
vector<float> stats(scoreStats.getArray(), scoreStats.getArray() + scoreStats.size());
|
||||
std::cout << smoothedSentenceBleu(stats) << std::endl;
|
||||
++sid;
|
||||
@ -39,3 +60,4 @@ int main(int argc, char **argv)
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user