mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-27 22:14:57 +03:00
Incremental interface for scorer
git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@1678 1f5c12ca-751b-0410-a591-d2e778427230
This commit is contained in:
parent
69c6164b82
commit
e2921af063
@ -176,25 +176,7 @@ void BleuScorer::prepareStats(int sid, const string& text, ScoreStats& entry) {
|
|||||||
entry.set(stats_str);
|
entry.set(stats_str);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
float BleuScorer::bleu(const vector<int>& comps) {
|
||||||
float BleuScorer::score(const std::vector<unsigned int>& candidates) {
|
|
||||||
if (!_scoreData) {
|
|
||||||
throw std::runtime_error("score data not loaded");
|
|
||||||
}
|
|
||||||
vector<int> comps(LENGTH*2+1);
|
|
||||||
for (size_t i = 0; i < candidates.size(); ++i) {
|
|
||||||
ScoreStats stats = _scoreData->get(i,candidates[i]);
|
|
||||||
if (stats.size() != comps.size()) {
|
|
||||||
stringstream msg;
|
|
||||||
msg << "Bleu statistics for (" << "," << candidates[i] << ") have incorrect "
|
|
||||||
<< "number of fields. Found: " << stats.size() << " Expected: "
|
|
||||||
<< comps.size();
|
|
||||||
throw runtime_error(msg.str());
|
|
||||||
}
|
|
||||||
for (size_t k = 0; k < comps.size(); ++k) {
|
|
||||||
comps[k] += stats.get(k);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
float logbleu = 0.0;
|
float logbleu = 0.0;
|
||||||
for (int i = 0; i < LENGTH; ++i) {
|
for (int i = 0; i < LENGTH; ++i) {
|
||||||
if (comps[2*i] == 0) {
|
if (comps[2*i] == 0) {
|
||||||
@ -212,6 +194,49 @@ float BleuScorer::score(const std::vector<unsigned int>& candidates) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
void BleuScorer::score(const candidates_t& candidates, const diffs_t& diffs,
|
||||||
|
scores_t& scores) {
|
||||||
|
if (!_scoreData) {
|
||||||
|
throw runtime_error("score data not loaded");
|
||||||
|
}
|
||||||
|
//calculate the score for the candidates
|
||||||
|
vector<int> comps(LENGTH*2+1);
|
||||||
|
for (size_t i = 0; i < candidates.size(); ++i) {
|
||||||
|
ScoreStats stats = _scoreData->get(i,candidates[i]);
|
||||||
|
if (stats.size() != comps.size()) {
|
||||||
|
stringstream msg;
|
||||||
|
msg << "Bleu statistics for (" << "," << candidates[i] << ") have incorrect "
|
||||||
|
<< "number of fields. Found: " << stats.size() << " Expected: "
|
||||||
|
<< comps.size();
|
||||||
|
throw runtime_error(msg.str());
|
||||||
|
}
|
||||||
|
for (size_t k = 0; k < comps.size(); ++k) {
|
||||||
|
comps[k] += stats.get(k);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
scores.push_back(bleu(comps));
|
||||||
|
|
||||||
|
candidates_t last_candidates(candidates);
|
||||||
|
//apply each of the diffs, and get new scores
|
||||||
|
for (size_t i = 0; i < diffs.size(); ++i) {
|
||||||
|
for (size_t j = 0; j < diffs[i].size(); ++j) {
|
||||||
|
size_t sid = diffs[i][j].first;
|
||||||
|
size_t nid = diffs[i][j].second;
|
||||||
|
size_t last_nid = last_candidates[sid];
|
||||||
|
for (size_t k = 0; k < comps.size(); ++k) {
|
||||||
|
int diff = _scoreData->get(sid,nid).get(k)
|
||||||
|
- _scoreData->get(sid,last_nid).get(k);
|
||||||
|
comps[k] += diff;
|
||||||
|
}
|
||||||
|
last_candidates[sid] = nid;
|
||||||
|
}
|
||||||
|
scores.push_back(bleu(comps));
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
void BleuScorer::prepare(const vector<string>& referencefiles, const string& nbestfile) {
|
void BleuScorer::prepare(const vector<string>& referencefiles, const string& nbestfile) {
|
||||||
//processReferences(referencefiles, refcounts,reflengths,encodings);
|
//processReferences(referencefiles, refcounts,reflengths,encodings);
|
||||||
|
@ -31,7 +31,8 @@ class BleuScorer: public Scorer {
|
|||||||
virtual void setReferenceFiles(const vector<string>& referenceFiles);
|
virtual void setReferenceFiles(const vector<string>& referenceFiles);
|
||||||
virtual void prepareStats(int sid, const string& text, ScoreStats& entry);
|
virtual void prepareStats(int sid, const string& text, ScoreStats& entry);
|
||||||
|
|
||||||
virtual float score(const std::vector<unsigned int>& candidates);
|
virtual void score(const candidates_t& candidates, const diffs_t& diffs,
|
||||||
|
scores_t& scores);
|
||||||
|
|
||||||
static const int LENGTH;
|
static const int LENGTH;
|
||||||
|
|
||||||
@ -83,6 +84,7 @@ class BleuScorer: public Scorer {
|
|||||||
|
|
||||||
void encode(const string& line, vector<int>& encoded);
|
void encode(const string& line, vector<int>& encoded);
|
||||||
size_t countNgrams(const string& line, counts_t& counts, unsigned int n);
|
size_t countNgrams(const string& line, counts_t& counts, unsigned int n);
|
||||||
|
float bleu(const vector<int>& comps);
|
||||||
|
|
||||||
void dump_counts(counts_t& counts) {
|
void dump_counts(counts_t& counts) {
|
||||||
for (counts_it i = counts.begin(); i != counts.end(); ++i) {
|
for (counts_it i = counts.begin(); i != counts.end(); ++i) {
|
||||||
|
@ -9,24 +9,73 @@
|
|||||||
|
|
||||||
#include "ScoreData.h"
|
#include "ScoreData.h"
|
||||||
|
|
||||||
|
using namespace std;
|
||||||
|
|
||||||
|
typedef vector<pair<unsigned int, unsigned int> > diff_t;
|
||||||
|
typedef vector<diff_t> diffs_t;
|
||||||
|
typedef vector<unsigned int> candidates_t;
|
||||||
|
typedef vector<float> scores_t;
|
||||||
|
|
||||||
class ScoreStats;
|
class ScoreStats;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Superclass of all scorers and dummy implementation. In order to add a new
|
||||||
|
* scorer it should be sufficient to override prepareStats(), setReferenceFiles()
|
||||||
|
* and score()
|
||||||
|
**/
|
||||||
class Scorer {
|
class Scorer {
|
||||||
|
|
||||||
public:
|
public:
|
||||||
|
|
||||||
Scorer(const std::string& name): _name(name), _scoreData(0) {}
|
Scorer(const string& name): _name(name), _scoreData(0) {}
|
||||||
|
|
||||||
const std::string& getName() const {return _name;}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* set the reference files. This must be called before prepareStats.
|
* set the reference files. This must be called before prepareStats.
|
||||||
**/
|
**/
|
||||||
void setReferenceFiles(const std::vector<std::string>& referenceFiles) {
|
virtual void setReferenceFiles(const vector<string>& referenceFiles) {
|
||||||
//do nothing
|
//do nothing
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Process the given guessed text, corresponding to the given reference sindex
|
||||||
|
* and add the appropriate statistics to the entry.
|
||||||
|
**/
|
||||||
|
virtual void prepareStats(int sindex, const string& text, ScoreStats& entry) {
|
||||||
|
//cerr << text << std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Score using each of the candidate index, then go through the diffs
|
||||||
|
* applying each in turn, and calculating a new score each time.
|
||||||
|
**/
|
||||||
|
virtual void score(const candidates_t& candidates, const diffs_t& diffs,
|
||||||
|
scores_t& scores) {
|
||||||
|
//dummy impl
|
||||||
|
if (!_scoreData) {
|
||||||
|
throw runtime_error("score data not loaded");
|
||||||
|
}
|
||||||
|
scores.push_back(0);
|
||||||
|
for (size_t i = 0; i < diffs.size(); ++i) {
|
||||||
|
scores.push_back(0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Calculate the score of the sentences corresponding to the list of candidate
|
||||||
|
* indices. Each index indicates the 1-best choice from the n-best list.
|
||||||
|
**/
|
||||||
|
float score(const candidates_t& candidates) {
|
||||||
|
diffs_t diffs;
|
||||||
|
scores_t scores;
|
||||||
|
score(candidates, diffs, scores);
|
||||||
|
return scores[0];
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Returns the name this scorer was constructed with. */
const string& getName() const {
  return _name;
}
|
||||||
|
|
||||||
size_t getReferenceSize() {
|
size_t getReferenceSize() {
|
||||||
if (_scoreData) {
|
if (_scoreData) {
|
||||||
return _scoreData->size();
|
return _scoreData->size();
|
||||||
@ -34,13 +83,6 @@ class Scorer {
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Process the given guessed text, corresponding to the given reference sindex
|
|
||||||
* and add the appropriate statistics to the entry.
|
|
||||||
**/
|
|
||||||
virtual void prepareStats(int sindex, const std::string& text, ScoreStats& entry) {
|
|
||||||
//std::cerr << text << std::endl;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Set the score data, prior to scoring.
|
* Set the score data, prior to scoring.
|
||||||
@ -49,23 +91,11 @@ class Scorer {
|
|||||||
_scoreData = scoreData;
|
_scoreData = scoreData;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Calculate the score of the sentences corresponding to the list of candidate
|
|
||||||
* indices. Each index indicates the 1-best choice from the n-best list.
|
|
||||||
**/
|
|
||||||
virtual float score(const std::vector<unsigned int>& candidates) {
|
|
||||||
if (!_scoreData) {
|
|
||||||
throw std::runtime_error("score data not loaded");
|
|
||||||
}
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
ScoreData* _scoreData;
|
ScoreData* _scoreData;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
std::string _name;
|
string _name;
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -12,25 +12,27 @@ int main(int argc, char** argv) {
|
|||||||
vector<string> references;
|
vector<string> references;
|
||||||
references.push_back("test_scorer_data/reference.txt");
|
references.push_back("test_scorer_data/reference.txt");
|
||||||
//bs.prepare(references, "test-scorer-data/nbest.out");
|
//bs.prepare(references, "test-scorer-data/nbest.out");
|
||||||
BleuScorer scorer;
|
Scorer* scorer = new BleuScorer();;
|
||||||
scorer.setReferenceFiles(references);
|
scorer->setReferenceFiles(references);
|
||||||
ScoreData sd(scorer);
|
ScoreData sd(*scorer);
|
||||||
sd.loadnbest("test_scorer_data/nbest.out");
|
sd.loadnbest("test_scorer_data/nbest.out");
|
||||||
//sd.savetxt();
|
//sd.savetxt();
|
||||||
|
|
||||||
//calculate a bleu scores
|
//calculate two bleu scores, nbest and a diff
|
||||||
scorer.setScoreData(&sd);
|
scorer->setScoreData(&sd);
|
||||||
unsigned int index = 0;
|
candidates_t candidates(sd.size());;
|
||||||
vector<unsigned int> candidates;
|
|
||||||
for (size_t i = 0; i < sd.size(); ++i) {
|
for (size_t i = 0; i < sd.size(); ++i) {
|
||||||
sd.get(i,index).savetxt("/dev/stdout");
|
sd.get(i,0).savetxt("/dev/stdout");
|
||||||
candidates.push_back(index++);
|
|
||||||
if (index == 10) {
|
|
||||||
index = 0;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
cout << "Bleu ";
|
diffs_t diffs;
|
||||||
float bleu = scorer.score(candidates);
|
diff_t diff;
|
||||||
cout << bleu << endl;
|
diff.push_back(make_pair(1,2));
|
||||||
|
diff.push_back(make_pair(7,8));
|
||||||
|
diffs.push_back(diff);
|
||||||
|
|
||||||
|
scores_t scores;
|
||||||
|
scorer->score(candidates,diffs,scores);
|
||||||
|
|
||||||
|
cout << "Bleus: " << scores[0] << " " << scores[1] << endl;
|
||||||
}
|
}
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
#/usr/bin/python
|
#!/usr/bin/python
|
||||||
|
|
||||||
#
|
#
|
||||||
# Calculate bleu score for test files using old (python) script
|
# Calculate bleu score for test files using old (python) script
|
||||||
@ -38,21 +38,30 @@ def main():
|
|||||||
tests[-1].append(text)
|
tests[-1].append(text)
|
||||||
nbest_fh.close()
|
nbest_fh.close()
|
||||||
|
|
||||||
# pick sentences to score with
|
# score with first best
|
||||||
index = 0
|
|
||||||
cookedtests = []
|
cookedtests = []
|
||||||
for i in range(len(tests)):
|
for i in range(len(tests)):
|
||||||
sentence = tests[i][index]
|
sentence = tests[i][0]
|
||||||
cookedtest = (bleu.cook_test(sentence, cookedrefs[i]))
|
cookedtest = (bleu.cook_test(sentence, cookedrefs[i]))
|
||||||
stats = " ".join(["%d %d" % (c,g) for (c,g) in zip(cookedtest['correct'], cookedtest['guess'])])
|
stats = " ".join(["%d %d" % (c,g) for (c,g) in zip(cookedtest['correct'], cookedtest['guess'])])
|
||||||
print " %s %d" % (stats ,cookedtest['reflen'])
|
print " %s %d" % (stats ,cookedtest['reflen'])
|
||||||
cookedtests.append(cookedtest)
|
cookedtests.append(cookedtest)
|
||||||
index = index + 1
|
bleu1 = bleu.score_cooked(cookedtests)
|
||||||
if index == 10:
|
|
||||||
index = 0
|
|
||||||
|
|
||||||
bleu = bleu.score_cooked(cookedtests)
|
# vary, and score again
|
||||||
print "Bleu: ", bleu
|
cookedtests = []
|
||||||
|
for i in range(len(tests)):
|
||||||
|
sentence = tests[i][0]
|
||||||
|
if i == 7:
|
||||||
|
sentence = tests[i][8]
|
||||||
|
elif i == 1:
|
||||||
|
sentences = tests[i][2]
|
||||||
|
cookedtest = (bleu.cook_test(sentence, cookedrefs[i]))
|
||||||
|
cookedtests.append(cookedtest)
|
||||||
|
bleu2 = bleu.score_cooked(cookedtests)
|
||||||
|
|
||||||
|
|
||||||
|
print "Bleus: ", bleu1,bleu2
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
main()
|
||||||
|
Loading…
Reference in New Issue
Block a user