Added CDER metric to use in MERT.

git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@4152 1f5c12ca-751b-0410-a591-d2e778427230
This commit is contained in:
machacekmatous 2011-08-18 21:35:16 +00:00
parent 96417949c2
commit 63fd490a51
4 changed files with 174 additions and 2 deletions

107
mert/CderScorer.cpp Normal file
View File

@ -0,0 +1,107 @@
#include "CderScorer.h"
#include <stdexcept>
#include <iostream>
#include <algorithm>
// Creates a CDER scorer. The configuration string is passed through unchanged
// to the StatisticsBasedScorer base class together with the metric name "CDER".
CderScorer::CderScorer(const string& config)
  : StatisticsBasedScorer("CDER", config) {}
void CderScorer::setReferenceFiles(const vector<string>& referenceFiles)
{
//make sure reference data is clear
ref_sentences.clear();
//load reference data
for (size_t rid = 0; rid < referenceFiles.size(); ++rid) {
ifstream refin(referenceFiles[rid].c_str());
if (!refin) {
throw runtime_error("Unable to open: " + referenceFiles[rid]);
}
ref_sentences.push_back(vector<sent_t>());
string line;
while (getline(refin,line)) {
sent_t encoded;
encode(line, encoded);
ref_sentences[rid].push_back(encoded);
}
}
}
void CderScorer::prepareStatsVector(size_t sid, const string& text, vector<int>& stats)
{
sent_t cand;
encode(text, cand);
float max = -2;
for (size_t rid = 0; rid < ref_sentences.size(); ++rid)
{
sent_t& ref = ref_sentences[rid][sid];
vector<int> tmp = computeCD(cand, ref);
if (calculateScore(tmp) > max)
{
stats = tmp;
}
}
}
// Converts CDER statistics into a score where higher is better.
//
// comps - exactly two values: comps[0] is the edit cost and comps[1] is the
//         reference length.
//
// Returns 1 - cost / length. When the reference length is 0 the ratio is
// undefined, so 1.0 is returned instead of NaN or -infinity.
//
// Throws runtime_error if `comps` does not hold exactly two values.
float CderScorer::calculateScore(const vector<int>& comps)
{
  if (comps.size() != 2) {
    throw runtime_error("Size of stat vector for CDER is not 2");
  }

  const int cost = comps[0];
  const int refLength = comps[1];

  // Avoid dividing by zero for an empty reference.
  if (refLength == 0) {
    return 1.0f;
  }

  return 1.0f - (cost / static_cast<float>(refLength));
}
// Computes the CD distance between a candidate and a reference sentence
// with a row-by-row dynamic program over the CDER alignment grid.
//
// cand - encoded candidate sentence
// ref  - encoded reference sentence
//
// Returns a two-element vector: [0] is the cost of the cheapest path from
// (0,0) to the last cell of the grid, [1] is the reference length.
//
// Only two rows of the grid are kept alive at any time. They are ordinary
// vectors (no manual new/delete), so nothing leaks if an allocation throws.
vector<int> CderScorer::computeCD(const sent_t& cand, const sent_t& ref)
{
  const int I = cand.size() + 1; // Number of inter-word positions in candidate sentence
  const int L = ref.size() + 1;  // Number of inter-word positions in reference sentence

  // row[i] stores the cost of the cheapest path from (0,0) to (i,l) in the
  // CDER alignment grid. First row: 0 at the origin, 1 everywhere else.
  vector<int> row(I, 1);
  row[0] = 0;

  // Scratch row, reused for every l instead of being reallocated.
  vector<int> nextRow(I);

  // Calculate the costs of each row from the costs of the previous one.
  for (int l = 1; l < L; ++l) {
    for (int i = 0; i < I; ++i) {
      int best = row[i] + 1;                                               // Insertion
      if (i > 0) {
        best = min(best, nextRow[i-1] + 1);                                // Deletion
        best = min(best, row[i-1] + distance(ref[l-1], cand[i-1]));        // Substitution/Identity
      }
      nextRow[i] = best;
    }

    // The cost of a long jump is the same for every cell of the row:
    // one more than the cheapest cell.
    const int LJ = 1 + *min_element(nextRow.begin(), nextRow.end());
    for (int i = 0; i < I; ++i) {
      nextRow[i] = min(nextRow[i], LJ);                                    // LongJumps
    }

    // The freshly computed row becomes the "previous" row of the next pass.
    row.swap(nextRow);
  }

  vector<int> stats(2);
  stats[0] = row.back(); // CD distance is the cost of the path from (0,0) to (I,L)
  stats[1] = ref.size();
  return stats;
}

59
mert/CderScorer.h Normal file
View File

@ -0,0 +1,59 @@
#ifndef __CDERSCORER_H__
#define __CDERSCORER_H__
#include <algorithm>
#include <iostream>
#include <string>
#include <vector>
#include "Types.h"
#include "ScoreData.h"
#include "Scorer.h"
using namespace std;
// Scorer for the CDER metric.
//
// For every candidate sentence it produces two integer statistics (see
// NumberOfScores): the CD distance to a reference and that reference's
// length. calculateScore() turns such a pair into a score.
class CderScorer: public StatisticsBasedScorer
{
public:
  // config is forwarded to the StatisticsBasedScorer base class.
  CderScorer(const string& config);

  // Loads the reference translations, one sentence per line in each file.
  virtual void setReferenceFiles(const vector<string>& referenceFiles);

  // Computes the statistics for candidate `text` of sentence `sid` and stores
  // them in `entry` as a space-separated string.
  virtual void prepareStats(size_t sid, const string& text, ScoreStats& entry)
  {
    vector<int> stats;
    prepareStatsVector(sid, text, stats);
    stringstream sout;
    // The statistics are integers, so write them as integers. Formatting them
    // as float would switch to scientific notation for large counts.
    copy(stats.begin(), stats.end(), ostream_iterator<int>(sout, " "));
    string stats_str = sout.str();
    entry.set(stats_str);
  }

  // Same as prepareStats, but returns the raw statistics in `stats`.
  virtual void prepareStatsVector(size_t sid, const string& text, vector<int>& stats);

  // Number of statistics produced per sentence.
  size_t NumberOfScores() {
    return 2;
  };

  // Turns a two-element statistics vector into a score.
  float calculateScore(const vector<int>& comps);

private:
  // A sentence in encoded form: one int per word.
  typedef vector<int> sent_t;

  // ref_sentences[r][s] is sentence s of reference file r.
  vector<vector<sent_t> > ref_sentences;

  // Computes the statistics of `cand` against a single reference.
  vector<int> computeCD(const sent_t& cand, const sent_t& ref);

  // Substitution cost between two encoded words: 0 if equal, 1 otherwise.
  int distance(int word1, int word2)
  {
    if (word1 == word2)
      return 0;
    else
      return 1;
  }

  //no copy
  CderScorer(const CderScorer&);
  // NOTE(review): the destructor is private as well, so instances cannot be
  // created on the stack or deleted through a CderScorer pointer. Presumably
  // they are destroyed through a base-class pointer - confirm this is intended.
  ~CderScorer() {};
  CderScorer& operator=(const CderScorer&);
};
#endif

View File

@ -27,7 +27,8 @@ TERsrc/tercalc.cpp \
TERsrc/tinystr.cpp \
TERsrc/tinyxmlerror.cpp \
TERsrc/tools.cpp \
TerScorer.cpp
TerScorer.cpp \
CderScorer.cpp
extractor_SOURCES = Util.cpp \
Timer.cpp \
@ -57,7 +58,8 @@ TERsrc/tercalc.cpp \
TERsrc/tinystr.cpp \
TERsrc/tinyxmlerror.cpp \
TERsrc/tools.cpp \
TerScorer.cpp
TerScorer.cpp \
CderScorer.cpp
mert_CPPFLAGS = -W -Wall -Wno-unused -ffor-scope -DTRACE_ENABLE
extractor_CPPFLAGS = -W -Wall -Wno-unused -ffor-scope -DTRACE_ENABLE

View File

@ -15,6 +15,7 @@
#include "BleuScorer.h"
#include "PerScorer.h"
#include "TerScorer.h"
#include "CderScorer.h"
using namespace std;
@ -27,6 +28,7 @@ public:
types.push_back(string("BLEU"));
types.push_back(string("PER"));
types.push_back(string("TER"));
types.push_back(string("CDER"));
return types;
}
@ -37,6 +39,8 @@ public:
return (PerScorer*) new PerScorer(config);
} else if (type == "TER") {
return (TerScorer*) new TerScorer(config);
} else if (type == "CDER") {
return (CderScorer*) new CderScorer(config);
} else {
throw runtime_error("Unknown scorer type: " + type);
}