Incremental interface for scorer

git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@1678 1f5c12ca-751b-0410-a591-d2e778427230
This commit is contained in:
bhaddow 2008-05-14 19:47:34 +00:00
parent 69c6164b82
commit e2921af063
5 changed files with 135 additions and 67 deletions

View File

@ -176,25 +176,7 @@ void BleuScorer::prepareStats(int sid, const string& text, ScoreStats& entry) {
entry.set(stats_str); entry.set(stats_str);
} }
float BleuScorer::bleu(const vector<int>& comps) {
float BleuScorer::score(const std::vector<unsigned int>& candidates) {
if (!_scoreData) {
throw std::runtime_error("score data not loaded");
}
vector<int> comps(LENGTH*2+1);
for (size_t i = 0; i < candidates.size(); ++i) {
ScoreStats stats = _scoreData->get(i,candidates[i]);
if (stats.size() != comps.size()) {
stringstream msg;
msg << "Bleu statistics for (" << "," << candidates[i] << ") have incorrect "
<< "number of fields. Found: " << stats.size() << " Expected: "
<< comps.size();
throw runtime_error(msg.str());
}
for (size_t k = 0; k < comps.size(); ++k) {
comps[k] += stats.get(k);
}
}
float logbleu = 0.0; float logbleu = 0.0;
for (int i = 0; i < LENGTH; ++i) { for (int i = 0; i < LENGTH; ++i) {
if (comps[2*i] == 0) { if (comps[2*i] == 0) {
@ -212,6 +194,49 @@ float BleuScorer::score(const std::vector<unsigned int>& candidates) {
} }
/**
 * Score the given candidate selection, then apply each diff in turn and score
 * again after every diff.
 *
 * candidates[i] is the n-best index chosen for sentence i. Each diff is a list
 * of (sentence index, new n-best index) pairs; diffs are cumulative, i.e. each
 * one is applied on top of the selection left by the previous one.
 *
 * One score is appended to scores for the initial selection, followed by one
 * score per diff. Existing contents of scores are kept.
 *
 * Throws runtime_error if no score data has been set, if the statistics of a
 * candidate do not have LENGTH*2+1 fields, or if a diff refers to a sentence
 * index outside the candidate list.
 **/
void BleuScorer::score(const candidates_t& candidates, const diffs_t& diffs,
                       scores_t& scores) {
  if (!_scoreData) {
    throw runtime_error("score data not loaded");
  }
  //accumulate the statistics of the initial candidates
  vector<int> comps(LENGTH*2+1);
  for (size_t i = 0; i < candidates.size(); ++i) {
    ScoreStats stats = _scoreData->get(i,candidates[i]);
    if (stats.size() != comps.size()) {
      stringstream msg;
      //report both the sentence index and the n-best index
      msg << "Bleu statistics for (" << i << "," << candidates[i] << ") have incorrect "
          << "number of fields. Found: " << stats.size() << " Expected: "
          << comps.size();
      throw runtime_error(msg.str());
    }
    for (size_t k = 0; k < comps.size(); ++k) {
      comps[k] += stats.get(k);
    }
  }
  scores.push_back(bleu(comps));

  //remember the current choice per sentence so each diff can be applied
  //incrementally: subtract the old statistics, add the new ones
  candidates_t last_candidates(candidates);
  for (size_t i = 0; i < diffs.size(); ++i) {
    for (size_t j = 0; j < diffs[i].size(); ++j) {
      size_t sid = diffs[i][j].first;
      size_t nid = diffs[i][j].second;
      if (sid >= last_candidates.size()) {
        stringstream msg;
        msg << "Diff " << i << " refers to sentence " << sid
            << " but only " << last_candidates.size()
            << " candidates were given";
        throw runtime_error(msg.str());
      }
      size_t last_nid = last_candidates[sid];
      //look the two entries up once, not once per statistic
      ScoreStats new_stats = _scoreData->get(sid,nid);
      ScoreStats old_stats = _scoreData->get(sid,last_nid);
      for (size_t k = 0; k < comps.size(); ++k) {
        int diff = new_stats.get(k) - old_stats.get(k);
        comps[k] += diff;
      }
      last_candidates[sid] = nid;
    }
    scores.push_back(bleu(comps));
  }
}
/* /*
void BleuScorer::prepare(const vector<string>& referencefiles, const string& nbestfile) { void BleuScorer::prepare(const vector<string>& referencefiles, const string& nbestfile) {
//processReferences(referencefiles, refcounts,reflengths,encodings); //processReferences(referencefiles, refcounts,reflengths,encodings);

View File

@ -31,7 +31,8 @@ class BleuScorer: public Scorer {
virtual void setReferenceFiles(const vector<string>& referenceFiles); virtual void setReferenceFiles(const vector<string>& referenceFiles);
virtual void prepareStats(int sid, const string& text, ScoreStats& entry); virtual void prepareStats(int sid, const string& text, ScoreStats& entry);
virtual float score(const std::vector<unsigned int>& candidates); virtual void score(const candidates_t& candidates, const diffs_t& diffs,
scores_t& scores);
static const int LENGTH; static const int LENGTH;
@ -83,6 +84,7 @@ class BleuScorer: public Scorer {
void encode(const string& line, vector<int>& encoded); void encode(const string& line, vector<int>& encoded);
size_t countNgrams(const string& line, counts_t& counts, unsigned int n); size_t countNgrams(const string& line, counts_t& counts, unsigned int n);
float bleu(const vector<int>& comps);
void dump_counts(counts_t& counts) { void dump_counts(counts_t& counts) {
for (counts_it i = counts.begin(); i != counts.end(); ++i) { for (counts_it i = counts.begin(); i != counts.end(); ++i) {

View File

@ -9,24 +9,73 @@
#include "ScoreData.h" #include "ScoreData.h"
using namespace std;
// A single change to a candidate selection: a list of
// (sentence index, new n-best index) pairs.
typedef vector<pair<unsigned int, unsigned int> > diff_t;
// A sequence of diffs, each scored after it has been applied.
typedef vector<diff_t> diffs_t;
// candidates[i] is the n-best index selected for sentence i.
typedef vector<unsigned int> candidates_t;
// Scores produced by a scorer: one for the initial candidates, then one per diff.
typedef vector<float> scores_t;
class ScoreStats; class ScoreStats;
/**
* Superclass of all scorers and dummy implementation. In order to add a new
* scorer it should be sufficient to override prepareStats(), setReferenceFiles()
* and score()
**/
class Scorer { class Scorer {
public: public:
Scorer(const std::string& name): _name(name), _scoreData(0) {} Scorer(const string& name): _name(name), _scoreData(0) {}
const std::string& getName() const {return _name;}
/** /**
* set the reference files. This must be called before prepareStats. * set the reference files. This must be called before prepareStats.
**/ **/
void setReferenceFiles(const std::vector<std::string>& referenceFiles) { virtual void setReferenceFiles(const vector<string>& referenceFiles) {
//do nothing //do nothing
} }
/**
* Process the given guessed text, corresponding to the given reference sindex
* and add the appropriate statistics to the entry.
*
* This base-class implementation does nothing and leaves entry untouched.
**/
virtual void prepareStats(int sindex, const string& text, ScoreStats& entry) {
//cerr << text << std::endl;
}
/**
* Score using each of the candidate index, then go through the diffs
* applying each in turn, and calculating a new score each time.
**/
virtual void score(const candidates_t& candidates, const diffs_t& diffs,
scores_t& scores) {
//dummy impl
if (!_scoreData) {
throw runtime_error("score data not loaded");
}
scores.push_back(0);
for (size_t i = 0; i < diffs.size(); ++i) {
scores.push_back(0);
}
}
/**
* Calculate the score of the sentences corresponding to the list of candidate
* indices. Each index indicates the 1-best choice from the n-best list.
**/
float score(const candidates_t& candidates) {
diffs_t diffs;
scores_t scores;
score(candidates, diffs, scores);
return scores[0];
}
const string& getName() const {return _name;}
size_t getReferenceSize() { size_t getReferenceSize() {
if (_scoreData) { if (_scoreData) {
return _scoreData->size(); return _scoreData->size();
@ -34,13 +83,6 @@ class Scorer {
return 0; return 0;
} }
/**
* Process the given guessed text, corresponding to the given reference sindex
* and add the appropriate statistics to the entry.
**/
virtual void prepareStats(int sindex, const std::string& text, ScoreStats& entry) {
//std::cerr << text << std::endl;
}
/** /**
* Set the score data, prior to scoring. * Set the score data, prior to scoring.
@ -49,23 +91,11 @@ class Scorer {
_scoreData = scoreData; _scoreData = scoreData;
} }
/**
* Calculate the score of the sentences corresponding to the list of candidate
* indices. Each index indicates the 1-best choice from the n-best list.
**/
virtual float score(const std::vector<unsigned int>& candidates) {
if (!_scoreData) {
throw std::runtime_error("score data not loaded");
}
return 0;
}
protected: protected:
ScoreData* _scoreData; ScoreData* _scoreData;
private: private:
std::string _name; string _name;
}; };

View File

@ -12,25 +12,27 @@ int main(int argc, char** argv) {
vector<string> references; vector<string> references;
references.push_back("test_scorer_data/reference.txt"); references.push_back("test_scorer_data/reference.txt");
//bs.prepare(references, "test-scorer-data/nbest.out"); //bs.prepare(references, "test-scorer-data/nbest.out");
BleuScorer scorer; Scorer* scorer = new BleuScorer();;
scorer.setReferenceFiles(references); scorer->setReferenceFiles(references);
ScoreData sd(scorer); ScoreData sd(*scorer);
sd.loadnbest("test_scorer_data/nbest.out"); sd.loadnbest("test_scorer_data/nbest.out");
//sd.savetxt(); //sd.savetxt();
//calculate a bleu scores //calculate two bleu scores, nbest and a diff
scorer.setScoreData(&sd); scorer->setScoreData(&sd);
unsigned int index = 0; candidates_t candidates(sd.size());;
vector<unsigned int> candidates;
for (size_t i = 0; i < sd.size(); ++i) { for (size_t i = 0; i < sd.size(); ++i) {
sd.get(i,index).savetxt("/dev/stdout"); sd.get(i,0).savetxt("/dev/stdout");
candidates.push_back(index++);
if (index == 10) {
index = 0;
}
} }
cout << "Bleu "; diffs_t diffs;
float bleu = scorer.score(candidates); diff_t diff;
cout << bleu << endl; diff.push_back(make_pair(1,2));
diff.push_back(make_pair(7,8));
diffs.push_back(diff);
scores_t scores;
scorer->score(candidates,diffs,scores);
cout << "Bleus: " << scores[0] << " " << scores[1] << endl;
} }

View File

@ -1,4 +1,4 @@
#/usr/bin/python #!/usr/bin/python
# #
# Calculate bleu score for test files using old (python) script # Calculate bleu score for test files using old (python) script
@ -38,21 +38,30 @@ def main():
tests[-1].append(text) tests[-1].append(text)
nbest_fh.close() nbest_fh.close()
# pick sentences to score with # score with first best
index = 0
cookedtests = [] cookedtests = []
for i in range(len(tests)): for i in range(len(tests)):
sentence = tests[i][index] sentence = tests[i][0]
cookedtest = (bleu.cook_test(sentence, cookedrefs[i])) cookedtest = (bleu.cook_test(sentence, cookedrefs[i]))
stats = " ".join(["%d %d" % (c,g) for (c,g) in zip(cookedtest['correct'], cookedtest['guess'])]) stats = " ".join(["%d %d" % (c,g) for (c,g) in zip(cookedtest['correct'], cookedtest['guess'])])
print " %s %d" % (stats ,cookedtest['reflen']) print " %s %d" % (stats ,cookedtest['reflen'])
cookedtests.append(cookedtest) cookedtests.append(cookedtest)
index = index + 1 bleu1 = bleu.score_cooked(cookedtests)
if index == 10:
index = 0
bleu = bleu.score_cooked(cookedtests) # vary, and score again
print "Bleu: ", bleu cookedtests = []
for i in range(len(tests)):
    # Start from the first-best hypothesis, then vary two sentences:
    # sentence 7 uses n-best entry 8 and sentence 1 uses n-best entry 2.
    sentence = tests[i][0]
    if i == 7:
        sentence = tests[i][8]
    elif i == 1:
        # was assigned to a misspelt name ("sentences"), so the change
        # for sentence 1 never took effect
        sentence = tests[i][2]
    cookedtest = (bleu.cook_test(sentence, cookedrefs[i]))
    cookedtests.append(cookedtest)
bleu2 = bleu.score_cooked(cookedtests)
print "Bleus: ", bleu1,bleu2
if __name__ == "__main__": if __name__ == "__main__":
main() main()