Ability to use multiple metrics and weight them

git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/branches/mert-mtm5@3535 1f5c12ca-751b-0410-a591-d2e778427230
This commit is contained in:
lexibirch 2010-09-17 16:13:35 +00:00
parent 1f2296626e
commit 8b97df9367
29 changed files with 1319 additions and 72 deletions

View File

@ -1,6 +1,30 @@
#include "BleuScorer.h"
const int BleuScorer::LENGTH = 4;
/**
 * Builds a BLEU scorer from a scorer config string.
 *
 * Keys read through getConfig():
 *   reflen   - "average", "shortest" or "closest" (default "closest")
 *   ngramlen - maximum n-gram order, a positive integer (default "4")
 *
 * Throws runtime_error for an unknown reflen value or when ngramlen does not
 * start with a positive integer.
 */
BleuScorer::BleuScorer(const string& config = "") : StatisticsBasedScorer("BLEU",config),_refLengthStrategy(BLEU_CLOSEST) {
  //configure the reference length strategy
  static string KEY_REFLEN = "reflen";
  static string REFLEN_AVERAGE = "average";
  static string REFLEN_SHORTEST = "shortest";
  static string REFLEN_CLOSEST = "closest";
  string reflen = getConfig(KEY_REFLEN,REFLEN_CLOSEST);
  if (reflen == REFLEN_AVERAGE) {
    _refLengthStrategy = BLEU_AVERAGE;
  } else if (reflen == REFLEN_SHORTEST) {
    _refLengthStrategy = BLEU_SHORTEST;
  } else if (reflen == REFLEN_CLOSEST) {
    _refLengthStrategy = BLEU_CLOSEST;
  } else {
    throw runtime_error("Unknown reference length strategy: " + reflen);
  }
  cerr << "Using reference length strategy: " << reflen << endl;

  //configure the n-gram order
  static string KEY_NGRAMS = "ngramlen";
  string ngramlen = getConfig(KEY_NGRAMS,"4");
  char* parseEnd = NULL;
  const long order = strtol(ngramlen.c_str(), &parseEnd, 10);
  // strtol leaves parseEnd at the start when no digits were consumed; the
  // round trip through int rejects values that do not fit in LENGTH.
  if (parseEnd == ngramlen.c_str() || order <= 0 ||
      order != static_cast<long>(static_cast<int>(order))) {
    throw runtime_error("Invalid n-gram length: " + ngramlen);
  }
  LENGTH = static_cast<int>(order);
}
/**
@ -84,6 +108,7 @@ void BleuScorer::setReferenceFiles(const vector<string>& referenceFiles) {
}
++sid;
}
refin.close();
TRACE_ERR(endl);
}
}
@ -153,7 +178,7 @@ void BleuScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
entry.set(stats_str);
}
float BleuScorer::calculateScore(const vector<int>& comps) {
float BleuScorer::calculateScore(const vector<float>& comps) {
//cerr << "BLEU: ";
//copy(comps.begin(),comps.end(), ostream_iterator<int>(cerr," "));
float logbleu = 0.0;
@ -172,4 +197,3 @@ float BleuScorer::calculateScore(const vector<int>& comps) {
//cerr << " " << exp(logbleu) << endl;
return exp(logbleu);
}

View File

@ -25,35 +25,24 @@ enum BleuReferenceLengthStrategy { BLEU_AVERAGE, BLEU_SHORTEST, BLEU_CLOSEST };
**/
class BleuScorer: public StatisticsBasedScorer {
public:
BleuScorer(const string& config = "") : StatisticsBasedScorer("BLEU",config),_refLengthStrategy(BLEU_CLOSEST) {
//configure regularisation
static string KEY_REFLEN = "reflen";
static string REFLEN_AVERAGE = "average";
static string REFLEN_SHORTEST = "shortest";
static string REFLEN_CLOSEST = "closest";
string reflen = getConfig(KEY_REFLEN,REFLEN_CLOSEST);
if (reflen == REFLEN_AVERAGE) {
_refLengthStrategy = BLEU_AVERAGE;
} else if (reflen == REFLEN_SHORTEST) {
_refLengthStrategy = BLEU_SHORTEST;
} else if (reflen == REFLEN_CLOSEST) {
_refLengthStrategy = BLEU_CLOSEST;
} else {
throw runtime_error("Unknown reference length strategy: " + reflen);
}
cerr << "Using reference length strategy: " << reflen << endl;
}
BleuScorer(const string& config);
virtual void setReferenceFiles(const vector<string>& referenceFiles);
virtual void prepareStats(size_t sid, const string& text, ScoreStats& entry);
static const int LENGTH;
int LENGTH;
size_t NumberOfScores(){ cerr << "BleuScorer: " << (2 * LENGTH + 1) << endl; return (2 * LENGTH + 1); };
size_t NumberOfScores() const {
//cerr << "BleuScorer: " << (2 * LENGTH + 1) << endl;
return (2 * LENGTH + 1);
};
bool useAlignment() const {
//cout << "BleuScorer::useAlignment returning false" << endl;
return false;
};
protected:
float calculateScore(const vector<int>& comps);
float calculateScore(const vector<float>& comps);
private:
//no copy
@ -90,9 +79,9 @@ class BleuScorer: public StatisticsBasedScorer {
void dump_counts(counts_t& counts) {
for (counts_it i = counts.begin(); i != counts.end(); ++i) {
cerr << "(";
copy(i->first.begin(), i->first.end(), ostream_iterator<int>(cerr," "));
cerr << ") " << i->second << ", ";
cerr << "(";
copy(i->first.begin(), i->first.end(), ostream_iterator<int>(cerr," "));
cerr << ") " << i->second << ", ";
}
cerr << endl;
}

View File

@ -38,6 +38,8 @@ void Data::loadnbest(const std::string &file)
std::string substring, subsubstring, stringBuf;
std::string theSentence;
std::string theFeatures;
std::string theAlignment;
std::string::size_type loc;
@ -57,14 +59,32 @@ void Data::loadnbest(const std::string &file)
scoreentry.clear();
theScorer->prepareStats(sentence_index, theSentence, scoreentry);
getNextPound(stringBuf, substring, "|||"); //third field
theFeatures = substring;
if (stringBuf.length() > 0) {
getNextPound(stringBuf, substring, "|||"); //fourth field sentence score
if (stringBuf.length() > 0) {
getNextPound(stringBuf, substring, "|||"); //fourth field only there if alignment scorer
theAlignment = substring;
}
}
//TODO check alignment exists if scorers need it
if (!theScorer->useAlignment()) {
theScorer->prepareStats(sentence_index, theSentence, scoreentry);
} else {
//an interpolated score would need both sentence and alignment
theSentence += "|||";
theSentence += theAlignment;
theScorer->prepareStats(sentence_index, theSentence, scoreentry);
}
scoredata->add(scoreentry, sentence_index);
getNextPound(stringBuf, substring, "|||"); //third field
if (!existsFeatureNames()){
std::string stringsupport=substring;
std::string stringsupport=theFeatures;
// adding feature names
std::string features="";
std::string tmpname="";
@ -89,9 +109,9 @@ void Data::loadnbest(const std::string &file)
}
// adding features
while (!substring.empty()){
// TRACE_ERR("Decompounding: " << substring << std::endl);
getNextPound(substring, subsubstring);
while (!theFeatures.empty()){
// TRACE_ERR("Decompounding: " << theFeatures << std::endl);
getNextPound(theFeatures, subsubstring);
// string ending with ":" are skipped, because they are the names of the features
if ((loc = subsubstring.find(":")) != subsubstring.length()-1){

View File

@ -95,8 +95,8 @@ public:
};
/**
 * Returns the index stored in featname2idx_ for the feature called `name`.
 * Throws runtime_error when `name` has no entry.
 */
size_t getFeatureIndex(const std::string& name){
  // The lookup must fail for names that are NOT registered. Checking first
  // also keeps the indexing below from being reached for an unknown name
  // (presumably featname2idx_ is map-like and operator[] would insert a
  // default entry -- confirm against its declaration).
  if (featname2idx_.find(name)==featname2idx_.end())
    throw runtime_error("Error: feature " + name +" is unknown");
  return featname2idx_[name];
};

200
mert/InterpolatedScorer.cpp Normal file
View File

@ -0,0 +1,200 @@
#include "Scorer.h"
#include "ScorerFactory.h"
#include "InterpolatedScorer.h"
using namespace std;
/**
 * Builds a scorer that combines several sub-scorers.
 *
 * name   - comma separated list of scorer types, e.g. "HAMMING,BLEU"; one
 *          sub-scorer is requested from ScorerFactory per entry
 * config - scorer config string; handed unchanged to every sub-scorer and
 *          read here for the keys:
 *            regtype - "none" | "average" | "min"  (default "none")
 *            regwin  - regularisation window        (default "0")
 *            case    - "true" | "false"             (default "true")
 *            weights - '+' separated weights, one per sub-scorer, that sum
 *                      to 1 (default: uniform weights)
 *
 * Throws runtime_error for an unknown regtype, when no sub-scorer was
 * created, when the number of weights differs from the number of
 * sub-scorers, or when the weights do not sum to 1.
 *
 * NOTE(review): the sub-scorers created here are not released if a later
 * check throws; whether this class owns them is not visible from here.
 */
InterpolatedScorer::InterpolatedScorer (const string& name, const string& config): Scorer(name,config) {
  //configure regularisation
  static string KEY_WEIGHTS = "weights";
  static string KEY_TYPE = "regtype";
  static string KEY_WINDOW = "regwin";
  static string KEY_CASE = "case";
  static string TYPE_NONE = "none";
  static string TYPE_AVERAGE = "average";
  static string TYPE_MINIMUM = "min";
  // Not named TRUE/FALSE: those identifiers are macros on some platforms.
  static string CASE_TRUE = "true";
  static string CASE_FALSE = "false";

  string type = getConfig(KEY_TYPE,TYPE_NONE);
  if (type == TYPE_NONE) {
    _regularisationStrategy = REG_NONE;
  } else if (type == TYPE_AVERAGE) {
    _regularisationStrategy = REG_AVERAGE;
  } else if (type == TYPE_MINIMUM) {
    _regularisationStrategy = REG_MINIMUM;
  } else {
    throw runtime_error("Unknown scorer regularisation strategy: " + type);
  }
  cerr << "Using scorer regularisation strategy: " << type << endl;

  string window = getConfig(KEY_WINDOW,"0");
  _regularisationWindow = atoi(window.c_str());
  cerr << "Using scorer regularisation window: " << _regularisationWindow << endl;

  string preservecase = getConfig(KEY_CASE,CASE_TRUE);
  if (preservecase == CASE_TRUE) {
    _preserveCase = true;
  } else if (preservecase == CASE_FALSE) {
    _preserveCase = false;
  } else {
    // Always assign the flag: fall back to the default instead of leaving
    // it with whatever value it had before.
    cerr << "Unknown case preservation setting: " << preservecase
         << ", using " << CASE_TRUE << endl;
    _preserveCase = true;
  }
  cerr << "Using case preservation: " << _preserveCase << endl;

  // name would be: HAMMING,BLEU or similar
  string scorers = name;
  while (scorers.length() > 0) {
    string scorertype = "";
    getNextPound(scorers,scorertype,",");
    ScorerFactory SF;
    Scorer *theScorer=SF.getScorer(scorertype,config);
    _scorers.push_back(theScorer);
  }
  if (_scorers.size() == 0) {
    throw runtime_error("There are no scorers");
  }
  cout << "Number of scorers: " << _scorers.size() << endl;

  string wtype = getConfig(KEY_WEIGHTS,"");
  //Default weights set to uniform ie. if two weights 0.5 each
  //weights should add to 1
  if (wtype.length() == 0) {
    float weight = 1.0/_scorers.size() ;
    for (size_t i = 0; i < _scorers.size(); i ++) {
      _scorerWeights.push_back(weight);
    }
  } else {
    float tot=0;
    while (wtype.length() > 0) {
      string scoreweight = "";
      getNextPound(wtype,scoreweight,"+");
      float weight = atof(scoreweight.c_str());
      _scorerWeights.push_back(weight);
      tot += weight;
    }
    // score() reads one weight per sub-scorer, so the counts must agree.
    if (_scorerWeights.size() != _scorers.size()) {
      throw runtime_error("The number of interpolated scorer weights does not match the number of scorers");
    }
    // Compare with a tolerance: decimal weights such as 0.7 and 0.3 are not
    // exactly representable and need not add up to exactly 1.0f.
    float drift = tot - 1.0f;
    if (drift < 0) {
      drift = -drift;
    }
    if (drift > 1e-4f) {
      throw runtime_error("The interpolated scorers weights do not sum to 1");
    }
  }

  cout << "The weights for the interpolated scorers are: " << endl;
  for (size_t i = 0; i < _scorerWeights.size(); i ++) {
    cout << _scorerWeights[i] << " " ;
  }
  cout <<endl;
}
/**
 * Stores `data` as this scorer's score data and gives every sub-scorer its
 * own newly allocated ScoreData holding only that sub-scorer's columns.
 *
 * The statistics of the sub-scorers are laid out one after the other in
 * each ScoreStats entry; a sub-scorer reporting NumberOfScores() == w takes
 * the next w values.
 */
void InterpolatedScorer::setScoreData(ScoreData* data) {
  _scoreData = data;
  size_t offset = 0;  // first column belonging to the current sub-scorer
  for (vector<Scorer*>::iterator scorerIt = _scorers.begin(); scorerIt != _scorers.end(); ++scorerIt) {
    Scorer* const sub = *scorerIt;
    const size_t width = sub->NumberOfScores();
    ScoreData* slice = new ScoreData(*sub);

    for (size_t sent = 0; sent < data->size(); ++sent) {
      ScoreArray combined = data->get(sent);
      ScoreArray sliced;

      // The new array is indexed by the sentence number written as text.
      std::stringstream label;
      label << sent;
      std::string labelText = label.str();

      const size_t nbestCount = combined.size();
      for (size_t cand = 0; cand < nbestCount; ++cand) {
        ScoreStats full = data->get(sent, cand);
        ScoreStats part;
        for (size_t col = offset; col < offset + width; ++col) {
          ScoreStatsType value = full.get(col);
          part.add(value);
        }
        sliced.add(part);
      }

      sliced.setIndex(labelText);
      slice->add(sliced);
    }

    sub->setScoreData(slice);
    offset += width;
  }
}
/** The interpolated scorer calls a vector of scorers and combines them with
weights **/
void InterpolatedScorer::score(const candidates_t& candidates, const diffs_t& diffs,
statscores_t& scores) {
//cout << "*******InterpolatedScorer::score" << endl;
size_t scorerNum = 0;
for (vector<Scorer*>::iterator itsc = _scorers.begin(); itsc!=_scorers.end();itsc++){
int numScores = (*itsc)->NumberOfScores();
statscores_t tscores;
(*itsc)->score(candidates,diffs,tscores);
size_t inc = 0;
for (statscores_t::iterator itstatsc = tscores.begin(); itstatsc!=tscores.end();itstatsc++){
//cout << "Scores " << (*itstatsc) << endl;
float weight = _scorerWeights[scorerNum];
if (weight == 0) {
stringstream msg;
msg << "No weights for scorer" << scorerNum ;
throw runtime_error(msg.str());
}
if (scorerNum == 0) {
scores.push_back(weight * (*itstatsc));
} else {
scores[inc] += weight * (*itstatsc);
}
//cout << "Scorer:" << scorerNum << " scoreNum:" << inc << " score: " << (*itstatsc) << " weight:" << weight << endl;
inc++;
}
scorerNum++;
}
}
void InterpolatedScorer::setReferenceFiles(const vector<string>& referenceFiles) {
for (vector<Scorer *>::iterator itsc = _scorers.begin(); itsc!=_scorers.end();itsc++){
//the scorers that use alignments use the reference files in the constructor through config
(*itsc)->setReferenceFiles(referenceFiles);
}
}
// Text can be:
// Reference sentence ||| Reference sentence alignment information (as given by MOSES -include-alignment-in-n-best)
// If a permutation distance scorer, send alignment info
// Else if other scorer, remove the alignment info and then send reference as usual
void InterpolatedScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry) {
stringstream buff;
string align = text;
string sentence = "";
size_t alignmentData = text.find("|||");
//Get sentence and alignment parts
if(alignmentData != string::npos) {
getNextPound(align,sentence, "|||");
}
int i=0;
for (vector<Scorer*>::iterator itsc = _scorers.begin(); itsc!=_scorers.end();itsc++){
ScoreStats tempEntry;
if ((*itsc)->useAlignment()) {
(*itsc)->prepareStats(sid, text, tempEntry);
} else {
(*itsc)->prepareStats(sid, sentence, tempEntry);
}
if (i > 0) buff << " ";
buff << tempEntry;
i++;
}
//cout << " Scores for interpolated: " << buff << endl;
string str = buff.str();
entry.set(str);
}

66
mert/InterpolatedScorer.h Normal file
View File

@ -0,0 +1,66 @@
#ifndef __INTERPOLATED_SCORER_H__
#define __INTERPOLATED_SCORER_H__
#include <algorithm>
#include <cmath>
#include <iostream>
#include <iterator>
#include <limits>
#include <set>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>
#include "Types.h"
#include "ScoreData.h"
#include "Scorer.h"
/**
 * Scorer that wraps a list of other scorers and combines their results,
 * e.g. an interpolation of the HAMMING and BLEU scorers.
 **/
class InterpolatedScorer : public Scorer {

public:
  // name is a comma separated list of scorer types: "HAMMING,BLEU" or similar
  InterpolatedScorer(const string& name, const string& config);

  // NOTE(review): the destructor does nothing, so the Scorer pointers kept
  // in _scorers are not released here -- confirm who owns them.
  ~InterpolatedScorer() {}

  void score(const candidates_t& candidates, const diffs_t& diffs,
             statscores_t& scores);

  void setReferenceFiles(const vector<string>& referenceFiles);

  void prepareStats(size_t sid, const string& text, ScoreStats& entry);

  // Sum of the NumberOfScores() of all wrapped scorers.
  size_t NumberOfScores() const {
    size_t total = 0;
    for (size_t idx = 0; idx != _scorers.size(); ++idx) {
      total += _scorers[idx]->NumberOfScores();
    }
    return total;
  }

  // True as soon as one wrapped scorer uses alignments.
  bool useAlignment() const {
    bool needed = false;
    for (size_t idx = 0; idx != _scorers.size() && !needed; ++idx) {
      needed = _scorers[idx]->useAlignment();
    }
    return needed;
  }

  //calculate the actual score - this gets done in the individual scorers
  //statscore_t calculateScore(const vector<statscore_t>& totals);
  void setScoreData(ScoreData* data);

protected:
  //regularisation
  ScorerRegularisationStrategy _regularisationStrategy;
  size_t _regularisationWindow;

  // wrapped scorers and the weight applied to each of them
  vector<Scorer*> _scorers;
  vector<float> _scorerWeights;
};
#endif //__INTERPOLATED_SCORER_H__

View File

@ -1,14 +1,18 @@
bin_PROGRAMS = mert extractor
bin_PROGRAMS = mert extractor
mert_SOURCES = Util.cpp \
Timer.cpp \
ScoreStats.cpp ScoreArray.cpp ScoreData.cpp \
FeatureStats.cpp FeatureArray.cpp FeatureData.cpp \
Data.cpp \
BleuScorer.cpp \
PermutationScorer.cpp \
Permutation.cpp \
Point.cpp \
PerScorer.cpp \
Scorer.cpp \
Optimizer.cpp \
InterpolatedScorer.cpp \
mert.cpp
extractor_SOURCES = Util.cpp \
@ -17,10 +21,13 @@ ScoreStats.cpp ScoreArray.cpp ScoreData.cpp \
FeatureStats.cpp FeatureArray.cpp FeatureData.cpp \
Data.cpp \
BleuScorer.cpp \
PermutationScorer.cpp \
Permutation.cpp \
Point.cpp \
PerScorer.cpp \
Scorer.cpp \
Optimizer.cpp \
InterpolatedScorer.cpp \
extractor.cpp

View File

@ -57,9 +57,9 @@ void PerScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
entry.set(stats_str);
}
float PerScorer::calculateScore(const vector<int>& comps) {
float PerScorer::calculateScore(const vector<float>& comps) {
float denom = comps[2];
float num = comps[0] - max(0,comps[1]-comps[2]);
float num = comps[0] - max(float(0),comps[1]-comps[2]);
if (denom == 0) {
//shouldn't happen!
return 0.0;

View File

@ -32,11 +32,13 @@ class PerScorer: public StatisticsBasedScorer {
cerr << "I AM PerScorer" << std::endl;
}
size_t NumberOfScores(){ cerr << "PerScorer: 3" << endl; return 3; };
size_t NumberOfScores() const { cerr << "PerScorer: 3" << endl; return 3; };
bool useAlignment() const {return false;};
protected:
virtual float calculateScore(const vector<int>& comps) ;
virtual float calculateScore(const vector<float>& comps) ;
private:

335
mert/Permutation.cpp Normal file
View File

@ -0,0 +1,335 @@
/*
* Permutation.cpp
* met - Minimum Error Training
*
* Created by Alexandra Birch 18/11/09.
*
*/
#include <fstream>
#include <sstream>
#include <math.h>
#include "Permutation.h"
#include "Util.h"
using namespace std;
// Builds the permutation from an alignment string when sourceLength is
// positive; otherwise the permutation array stays empty. The target length
// is stored in both cases.
Permutation::Permutation(const string &alignment, const int sourceLength, const int targetLength )
  : m_targetLength(targetLength)
{
  if (sourceLength <= 0)
  {
    return;
  }
  set(alignment, sourceLength);
}
// Number of positions in the permutation.
size_t Permutation::getLength() const
{
  // Returned as size_t directly; no detour through int, which would narrow
  // the value before widening it again.
  return m_array.size();
}
// Prints the permutation to cout as "(position:value), " pairs on one line.
void Permutation::dump() const
{
  for (size_t pos = 0; pos < m_array.size(); ++pos) {
    cout << "(" << pos << ":" << m_array[pos] << "), ";
  }
  cout << endl;
}
//Sent alignment string
//Eg: "0-1 0-0 1-2 3-0 4-5 6-7 "
// Inidiviual word alignments which can be one-one,
// or null aligned, or many-many. The format is sourcepos - targetpos
//Its the output of the berkley aligner subtracting 1 from each number
//sourceLength needed because last source words might not be aligned
void Permutation::set(const string & alignment,const int sourceLength)
{
//cout << "******** Permutation::set :" << alignment << ": len : " << sourceLength <<endl;
if(sourceLength <= 0)
{
//not found
cerr << "Source sentence length not positive:"<< sourceLength << endl;
exit(0);
}
if (alignment.length() <= 0)
{
//alignment empty - could happen but not good
cerr << "Alignment string empty:"<< alignment << endl;
}
//Tokenise on whitespace
string buf; // Have a buffer string
stringstream ss(alignment); // Insert the string into a stream
vector<string> tokens; // Create vector to hold our words
while (ss >> buf)
tokens.push_back(buf);
vector<int> tempPerm(sourceLength, -1);
//Set tempPerm to have one target position per source position
for (size_t i=0; i<tokens.size(); i++) {
string temp = tokens[i];
int posDelimeter = temp.find("-");
if(posDelimeter == int(string::npos)) {
cerr << "Delimiter not found - :"<< tokens[i] << endl;
exit(1);
}
int sourcePos = atoi((temp.substr(0, posDelimeter)).c_str());
int targetPos = atoi((temp.substr(posDelimeter+1)).c_str());
//cout << "SP:" << sourcePos << " TP:" << targetPos << endl;
if (sourcePos > sourceLength) {
cerr << "Source sentence length:" << sourceLength << " is smaller than alignment source position:" << sourcePos << endl;
exit(1);
}
//If have multiple target pos aligned to one source,
// then ignore all but first alignment
if (tempPerm[sourcePos] == -1 || tempPerm[sourcePos] > targetPos)
{
tempPerm[sourcePos] = targetPos;
}
}
//TODO
//Set final permutation in m_array
//Take care of: source - null
// multiple_source - one target
// unaligned target
// Input: 1-9 2-1 4-3 4-4 5-6 6-6 7-6 8-8
// Convert source: 1 2 3 4 5 6 7 8
// target: 9 1 -1 3 6 6 6 8 -> 8 1 2 3 4 5 6 7
// 1st step: Add null aligned source to previous alignment
// target: 9 1 -1 3 6 6 6 8 -> 9 1 1 3 6 6 6 8
int last=0;
m_array.assign(sourceLength, -1);
//get a searcheable index
multimap<int, int> invMap;
multimap<int, int>::iterator it;
//cout << " SourceP -> TargetP " << endl;
for (size_t i=0; i<tempPerm.size(); i++)
{
if (tempPerm[i] == -1) {
tempPerm[i] = last;
} else {
last = tempPerm[i];
}
//cout << i << " -> " << tempPerm[i] << endl;
//Key is target pos, value is source pos
invMap.insert(pair<int,int>(tempPerm[i],int(i)));
}
// 2nd step: Get target into index of multimap and sort
// Convert source: 1 2 3 4 5 6 7 8
// target: 9 1 0 3 6 6 6 8 -> 0 1 3 6 6 6 8 9
// source: 3 2 4 5 6 7 8 1
int i=0;
//cout << " TargetP => SourceP : TargetIndex " << endl;
for ( it=invMap.begin() ; it != invMap.end(); it++ )
{
//cout << (*it).first << " => " << (*it).second << " : " << i << endl;
//find source position
m_array[(*it).second] = i;
i++;
}
bool ok = checkValidPermutation(m_array);
//dump();
if (!ok) {
throw runtime_error(" Created invalid permutation");
}
}
//Static
// Returns the inverse mapping: outVector[inVector[i]] == i for every i.
// Throws runtime_error when a value lies outside 0 .. inVector.size()-1,
// because such a value cannot be used as an index into the result.
vector<int> Permutation::invert(const vector<int> & inVector)
{
  vector<int> outVector(inVector.size());
  for (size_t i=0; i<inVector.size(); i++){
    const int target = inVector[i];
    if (target < 0 || size_t(target) >= inVector.size()) {
      throw runtime_error("Cannot invert: value outside permutation range");
    }
    outVector[target] = int(i);
  }
  return outVector;
}
//Static
//Permutations start at 0
// Returns true when inVector holds each of the values 0 .. size-1 exactly
// once. Otherwise prints the reason to cerr and returns false.
bool Permutation::checkValidPermutation(vector<int> const & inVector)
{
  vector<int> seen(inVector.size(), 0);
  for (size_t i=0; i< inVector.size(); i++){
    const int value = inVector[i];
    // A value outside 0 .. size-1 means some value inside that range is
    // missing. It also must not be used as an index below.
    if (value < 0 || size_t(value) >= inVector.size()) {
      cerr << "Permutation error: missing values\n" << endl;
      return false;
    }
    //No multiple entries of same value allowed
    if (seen[value] != 0){
      cerr << "Permutation error: multiple entries of same value\n" << endl;
      return false;
    }
    seen[value] = 1;
  }
  // size distinct values, all inside 0 .. size-1, leave no holes, so no
  // second pass is needed.
  return true;
}
//TODO default to HAMMING
//Note: it returns the distance that is not normalised
// Scores this permutation against permCompare with the requested metric
// and scales the result down when permCompare has the larger target length.
// Throws runtime_error for a metric other than HAMMING_DISTANCE or
// KENDALL_DISTANCE.
float Permutation::distance(const Permutation &permCompare, const distanceMetric_t &type) const
{
  const bool useHamming = (type == HAMMING_DISTANCE);
  if (!useHamming && !(type == KENDALL_DISTANCE)) {
    throw runtime_error("Distance type not valid");
  }

  float result = 0;
  if (useHamming) {
    result = calculateHamming(permCompare);
  } else {
    result = calculateKendall(permCompare);
  }

  // 1 - (target length of permCompare / own target length); negative when
  // permCompare is the longer one
  float lengthPenalty = 1.0 - (float) permCompare.getTargetLength()/getTargetLength() ;
  if (lengthPenalty < 0.0) {
    result = result * exp(lengthPenalty);
  }
  return result;
}
float Permutation::calculateHamming(const Permutation & compare) const
{
float score=0;
vector<int> compareArray = compare.getArray();
if (getLength() != compare.getLength()) {
cerr << "1stperm: " << getLength() << " 2ndperm: " << compare.getLength() << endl;
throw runtime_error("Length of permutations not equal");
}
if (getLength() == 0) {
cerr << "Empty permutation" << endl;
return 0;
}
for (size_t i=0; i<getLength(); i++)
{
if (m_array[i] != compareArray[i])
{
score++;
}
}
score = 1 - (score / getLength());
return score;
}
float Permutation::calculateKendall(const Permutation & compare) const
{
float score=0;
vector<int> compareArray = compare.getArray();
if (getLength() != compare.getLength()) {
cerr << "1stperm: " << getLength() << " 2ndperm: " << compare.getLength() << endl;
throw runtime_error("Length of permutations not equal");
}
if (getLength() == 0) {
cerr << "Empty permutation" << endl;
return 0;
}
for (size_t i=0; i<getLength(); i++)
{
for (size_t j=0; j<getLength(); j++)
{
if ((m_array[i] < m_array[j]) && (compareArray[i] > compareArray[j]))
{
score++;
}
}
}
score = (score / ((getLength()*getLength() - getLength()) /2 ) );
//Adjusted Kendall's tau correlates better with human judgements
score = sqrt (score);
score = 1 - score;
return score;
}
// Returns a copy of the permutation array.
vector<int> Permutation::getArray() const
{
  return m_array;
}
//Static
//This function is called with test which is
// the 5th field in moses nbest output when called with -include-alignment-in-n-best
//eg. 0=0 1-2=1-2 3=3 4=4 5=5 6=6 7-9=7-8 10=9 11-13=10-11 (source-target)
//
// Every "sourceRange=targetRange" token is expanded into all
// "source-target " pairs of the two ranges, e.g. "1-2=1-2" becomes
// "1-1 1-2 2-1 2-2 ". Exits the process with status 1 on a token without '='.
string Permutation::convertMosesToStandard(string const & alignment)
{
  if (alignment.length() == 0)
  {
    cerr << "Alignment input string empty" << endl;
  }
  string working = alignment;
  stringstream oss;
  while (working.length() > 0)
  {
    string align;
    getNextPound(working,align," ");
    //If found an alignment
    if (align.length() > 0)
    {
      size_t posDelimeter = align.find("=");
      if(posDelimeter == string::npos)
      {
        cerr << "Delimiter not found = :"<< align << endl;
        // malformed input is an error, so do not exit with status 0
        exit(1);
      }
      int firstSourcePos,lastSourcePos,firstTargetPos,lastTargetPos;
      string sourcePoss = align.substr(0, posDelimeter);
      string targetPoss = align.substr(posDelimeter+1);

      // a side is either a single position or a "first-last" range
      posDelimeter = sourcePoss.find("-");
      if(posDelimeter != string::npos) {
        firstSourcePos = atoi((sourcePoss.substr(0, posDelimeter)).c_str());
        lastSourcePos = atoi((sourcePoss.substr(posDelimeter+1)).c_str());
      } else {
        firstSourcePos = atoi(sourcePoss.c_str());
        lastSourcePos = firstSourcePos;
      }
      posDelimeter = targetPoss.find("-");
      if(posDelimeter != string::npos) {
        firstTargetPos = atoi((targetPoss.substr(0, posDelimeter)).c_str());
        lastTargetPos = atoi((targetPoss.substr(posDelimeter+1)).c_str());
      } else {
        firstTargetPos = atoi(targetPoss.c_str());
        lastTargetPos = firstTargetPos;
      }
      for (int i = firstSourcePos; i <= lastSourcePos; i++) {
        for (int j = firstTargetPos; j <= lastTargetPos; j++) {
          oss << i << "-" << j << " ";
        }
      }
    } //else case where two spaces ?
  }
  return oss.str();
}

62
mert/Permutation.h Normal file
View File

@ -0,0 +1,62 @@
/*
* Permutation.h
* met - Minimum Error Training
*
* Created by Alexandra Birch 18 Nov 2009.
*
*/
#ifndef PERMUTATION_H
#define PERMUTATION_H
#include <limits>
#include <vector>
#include <iostream>
#include <fstream>
#include "Util.h"
/**
 * A permutation of source word positions, built from a word alignment
 * string, together with the length of the target sentence. Two
 * permutations can be compared with a Hamming or a Kendall based score.
 */
class Permutation
{
public:
  // With sourceLength <= 0 no permutation is built and the array is empty.
  Permutation(const std::string &alignment = std::string(), const int sourceLength = 0, const int targetLength = 0 );
  ~Permutation(){};

  inline void clear() { m_array.clear(); }
  // const so that it can be called on the const references used by distance()
  inline size_t size() const { return m_array.size(); }

  // alignment is a whitespace separated list of "sourcepos-targetpos" pairs
  void set(const std::string &alignment,const int sourceLength);

  //strategy can be HAMMING_DISTANCE or KENDALL_DISTANCE
  float distance(const Permutation &permCompare, const distanceMetric_t &strategy = HAMMING_DISTANCE) const;

  //Const
  void dump() const;
  size_t getLength() const;
  // std:: is spelled out so that this header does not depend on a
  // using-directive from another header
  std::vector<int> getArray() const;
  int getTargetLength() const {
    return m_targetLength;
  }

  //Static
  static std::string convertMosesToStandard(std::string const & alignment);
  static std::vector<int> invert(std::vector<int> const & inVector);
  static bool checkValidPermutation(std::vector<int> const & inVector);

protected:
  std::vector<int> m_array;
  int m_targetLength;
  float calculateHamming(const Permutation & compare) const;
  float calculateKendall(const Permutation & compare) const;

private:
};
#endif

218
mert/PermutationScorer.cpp Normal file
View File

@ -0,0 +1,218 @@
#include "PermutationScorer.h"
using namespace std;
// Stream precision used by prepareStats() when it writes the distance value
// into the score statistics.
const int PermutationScorer::SCORE_PRECISION = 5;
/**
 * Builds a permutation distance scorer.
 *
 * distanceMetric - "HAMMING" or "KENDALL"
 * config         - scorer config string, read for the keys:
 *   refchoice - "average" | "closest" (default "closest")
 *   refalign  - '+' separated list of reference alignment files
 *   source    - source text file; the number of words of every line is
 *               stored as the source sentence length
 *
 * Throws runtime_error for an unknown refchoice or distance metric and
 * when the source file cannot be opened.
 */
PermutationScorer::PermutationScorer(const string &distanceMetric, const string &config)
  :SentenceLevelScorer(distanceMetric,config)
{
  //configure the reference choice strategy
  static string KEY_REFCHOICE = "refchoice";
  static string REFCHOICE_AVERAGE = "average";
  static string REFCHOICE_CLOSEST = "closest";

  string refchoice = getConfig(KEY_REFCHOICE,REFCHOICE_CLOSEST);
  if (refchoice == REFCHOICE_AVERAGE) {
    m_refChoiceStrategy = REFERENCE_CHOICE_AVERAGE;
  } else if (refchoice == REFCHOICE_CLOSEST) {
    m_refChoiceStrategy = REFERENCE_CHOICE_CLOSEST;
  } else {
    throw runtime_error("Unknown reference choice strategy: " + refchoice);
  }
  cerr << "Using reference choice strategy: " << refchoice << endl;

  if (distanceMetric.compare("HAMMING") == 0) {
    m_distanceMetric = HAMMING_DISTANCE;
  } else if (distanceMetric.compare("KENDALL") == 0) {
    m_distanceMetric = KENDALL_DISTANCE;
  } else {
    // never leave m_distanceMetric unset: prepareStats() passes it on to
    // Permutation::distance()
    throw runtime_error("Unknown permutation distance metric: " + distanceMetric);
  }
  cerr << "Using permutation distance metric: " << distanceMetric << endl;

  //Get reference alignments from scconfig refalign option
  static string KEY_ALIGNMENT_FILES = "refalign";
  string refalign = getConfig(KEY_ALIGNMENT_FILES,"");
  if (refalign.length() > 0){
    string substring;
    while (!refalign.empty()){
      getNextPound(refalign, substring, "+");
      m_referenceAlignments.push_back(substring);
    }
  }

  //Get length of source sentences read in from scconfig source option
  // this is essential for extractor but unneccesary for mert executable
  static string KEY_SOURCE_FILE = "source";
  string sourceFile = getConfig(KEY_SOURCE_FILE,"");
  if (sourceFile.length() > 0) {
    cerr << "Loading source sentence lengths from " << sourceFile << endl;
    ifstream sourcein(sourceFile.c_str());
    if (!sourcein) {
      throw runtime_error("Unable to open: " + sourceFile);
    }
    string line;
    while (getline(sourcein,line)) {
      // count the space separated pieces of the line
      size_t wordNumber = 0;
      string word;
      while(!line.empty()){
        getNextPound(line, word, " ");
        wordNumber++;
      }
      m_sourceLengths.push_back(wordNumber);
    }
    sourcein.close();
  }
}
void PermutationScorer::setReferenceFiles(const vector<string>& referenceFiles) {
cout << "*******setReferenceFiles" << endl;
//make sure reference data is clear
m_referencePerms.clear();
vector< vector< int> > targetLengths;
//Just getting target length from reference text file
for (size_t i = 0; i < referenceFiles.size(); ++i)
{
vector <int> lengths;
cout << "Loading reference from " << referenceFiles[i] << endl;
ifstream refin(referenceFiles[i].c_str());
if (!refin)
{
cerr << "Unable to open: " << referenceFiles[i] << endl;
throw runtime_error("Unable to open alignment file");
}
string line;
while (getline(refin,line))
{
int count = getNumberWords(line);
lengths.push_back(count);
}
targetLengths.push_back(lengths);
}
//load reference data
//NOTE ignoring normal reference file, only using previously saved alignment reference files
for (size_t i = 0; i < m_referenceAlignments.size(); ++i)
{
vector<Permutation> referencePerms;
cout << "Loading reference from " << m_referenceAlignments[i] << endl;
ifstream refin(m_referenceAlignments[i].c_str());
if (!refin)
{
cerr << "Unable to open: " << m_referenceAlignments[i] << endl;
throw runtime_error("Unable to open alignment file");
}
string line;
size_t sid = 0; //sentence counter
while (getline(refin,line))
{
//cout << line << endl;
//Line needs to be of the format: 0-0 1-1 1-2 etc source-target
Permutation perm(line, m_sourceLengths[sid],targetLengths[i][sid]);
//perm.dump();
referencePerms.push_back(perm);
//check the source sentence length is the same for previous file
if (perm.getLength() != m_sourceLengths[sid])
{
cerr << "Permutation Length: " << perm.getLength() << endl;
cerr << "Source length: " << m_sourceLengths[sid] << " for sid " << sid << endl;
throw runtime_error("Source sentence lengths not the same: ");
}
sid++;
}
m_referencePerms.push_back(referencePerms);
}
}
int PermutationScorer::getNumberWords (const string& text) const {
int count = 0;
string line = trimStr(text);
if (line.length()>0) {
int pos = line.find(" ");
while (pos!=int(string::npos)){
count++;
pos = line.find(" ",pos+1);
}
count++;
}
return count;
}
void PermutationScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry) {
//cout << "*******prepareStats" ;
//cout << text << endl;
//cout << sid << endl;
//cout << "Reference0align:" << endl;
//m_referencePerms[0][sid].dump();
string sentence = "";
string align = text;
size_t alignmentData = text.find("|||");
//Get sentence and alignment parts
if(alignmentData != string::npos) {
getNextPound(align,sentence, "|||");
} else {
align = text;
}
int translationLength = getNumberWords(sentence);
//A vector of Permutations for each sentence
vector< vector<Permutation> > nBestPerms;
float distanceValue;
//need to create permutations for each nbest line
string standardFormat = Permutation::convertMosesToStandard(align);
Permutation perm(standardFormat, m_sourceLengths[sid],translationLength);
//perm.dump();
if (m_refChoiceStrategy == REFERENCE_CHOICE_AVERAGE) {
float total = 0;
for (size_t i = 0; i < m_referencePerms.size(); ++i) {
float dist = perm.distance(m_referencePerms[i][sid], m_distanceMetric);
total += dist;
//cout << "Ref number: " << i << " distance: " << dist << endl;
}
float mean = (float)total/m_referencePerms.size();
//cout << "MultRef strategy AVERAGE: total " << total << " mean " << mean << " number " << m_referencePerms.size() << endl;
distanceValue = mean;
} else if (m_refChoiceStrategy == REFERENCE_CHOICE_CLOSEST) {
float max_val = 0;
for (size_t i = 0; i < m_referencePerms.size(); ++i) {
//look for the closest reference
float value = perm.distance(m_referencePerms[i][sid], m_distanceMetric);
//cout << "Ref number: " << i << " distance: " << value << endl;
if (value > max_val) {
max_val = value;
}
}
distanceValue = max_val;
//cout << "MultRef strategy CLOSEST: max_val " << distanceValue << endl;
} else {
throw runtime_error("Unsupported reflength strategy");
}
//SCOREROUT eg: 0.04546
ostringstream tempStream;
tempStream.precision(SCORE_PRECISION);
tempStream << distanceValue;
string str = tempStream.str();
entry.set(str);
//cout << tempStream.str();
}
//Will just be final score
// The first component already is the score, so it is returned as is.
statscore_t PermutationScorer::calculateScore(const vector<statscore_t>& comps) {
  return comps.front();
}

63
mert/PermutationScorer.h Normal file
View File

@ -0,0 +1,63 @@
#ifndef __PERMUTATIONSCORER_H__
#define __PERMUTATIONSCORER_H__
#include <algorithm>
#include <cmath>
#include <iostream>
#include <iterator>
#include <set>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>
#include <limits.h>
#include "Types.h"
#include "ScoreData.h"
#include "Scorer.h"
#include "Permutation.h"
/**
* Sentence-level scorer based on a permutation distance between the alignment
* of a candidate translation and the alignments of the references.
*
* Only the interface is declared here; apart from the two trivial inline
* accessors, the member functions are defined outside this header.
* The scorer produces exactly one statistic per sentence (NumberOfScores()
* returns 1) and reports that it needs alignment data (useAlignment()
* returns true).
**/
class PermutationScorer: public SentenceLevelScorer
{
public:
// distanceMetric: name of the distance metric to use, "HAMMING" by default.
// config: scorer configuration string, empty by default.
PermutationScorer(const string &distanceMetric = "HAMMING",
const string &config = string());
// Loads the reference data from the given files.
void setReferenceFiles(const vector<string>& referenceFiles);
// Computes the statistics for candidate `text` of sentence `sid` and
// stores them in `entry`.
void prepareStats(size_t sid, const string& text, ScoreStats& entry);
// Output precision used when the distance value is written as text.
static const int SCORE_PRECISION;
// This scorer stores a single value per sentence.
size_t NumberOfScores() const {
//cerr << "PermutationScorer number of scores: 1" << endl;
return 1;
};
// Always true: this scorer works on alignment information.
bool useAlignment() const {
//cout << "PermutationScorer::useAlignment returning true" << endl;
return true;
};
protected:
// Turns the aggregated statistics into the final score.
statscore_t calculateScore(const vector<statscore_t>& scores);
// Copy construction and assignment are declared but not defined in this
// header - presumably to prevent copying; confirm no definition exists elsewhere.
PermutationScorer(const PermutationScorer&);
~PermutationScorer(){};
PermutationScorer& operator=(const PermutationScorer&);
// Returns the number of words in `line`.
int getNumberWords (const string & line) const;
// How the distances to multiple references are combined
// (REFERENCE_CHOICE_AVERAGE or REFERENCE_CHOICE_CLOSEST).
distanceMetricReferenceChoice_t m_refChoiceStrategy;
// Which distance is computed between permutations (HAMMING_DISTANCE or KENDALL_DISTANCE).
distanceMetric_t m_distanceMetric;
// data extracted from reference files
// A vector of permutations for each reference file:
// indexed as m_referencePerms[reference][sentence id]
vector< vector<Permutation> > m_referencePerms;
// Source sentence length, indexed by sentence id.
vector<size_t> m_sourceLengths;
vector<string> m_referenceAlignments;
private:
};
//TODO need to read in floats for scores - necessary for selecting mean reference strategy and for BLEU?
#endif //__PERMUTATIONSCORER_H__

View File

@ -16,11 +16,27 @@ ScoreData::ScoreData(Scorer& ptr):
theScorer(&ptr)
{
score_type = theScorer->getName();
theScorer->setScoreData(this);//this is not dangerous: we dont use the this pointer in SetScoreData
//theScorer->setScoreData(this);//this is not dangerous: we dont use the this pointer in SetScoreData
number_of_scores = theScorer->NumberOfScores();
TRACE_ERR("ScoreData: number_of_scores: " << number_of_scores << std::endl);
};
void ScoreData::dump()
{
for (vector<ScoreArray>::iterator it = array_.begin(); it !=array_.end(); it++){
cout << "scorearray: " << endl;
for (size_t i = 0; i < (*it).size(); i++) {
ScoreStats scoreStats = (*it).get(i);
cout << "scorestats: " ;
for (size_t j = 0; j < scoreStats.size(); j ++ ){
ScoreStatsType scoreStatsType = scoreStats.get(j);
cout << scoreStatsType << " " ;
}
cout << endl;
}
}
}
void ScoreData::save(std::ofstream& outFile, bool bin)
{
for (scoredata_t::iterator i = array_.begin(); i !=array_.end(); i++){
@ -60,6 +76,7 @@ void ScoreData::load(ifstream& inFile)
}
add(entry);
}
theScorer->setScoreData(this);
}

View File

@ -81,6 +81,8 @@ public:
throw runtime_error("there is no entry at index " + idx);
return i->second;
}
void dump();
};

View File

@ -26,9 +26,10 @@ static float score_average(const statscores_t& scores, size_t start, size_t end)
void StatisticsBasedScorer::score(const candidates_t& candidates, const diffs_t& diffs,
statscores_t& scores) {
if (!_scoreData) {
throw runtime_error("Score data not loaded");
}
//cout << "*******StatisticsBasedScorer::score" << endl;
if (!_scoreData) {
throw runtime_error("Score data not loaded");
}
//calculate the score for the candidates
if (_scoreData->size() == 0) {
throw runtime_error("Score data is empty");
@ -37,20 +38,21 @@ void StatisticsBasedScorer::score(const candidates_t& candidates, const diffs_t
throw runtime_error("No candidates supplied");
}
int numCounts = _scoreData->get(0,candidates[0]).size();
vector<int> totals(numCounts);
for (size_t i = 0; i < candidates.size(); ++i) {
ScoreStats stats = _scoreData->get(i,candidates[i]);
if (stats.size() != totals.size()) {
stringstream msg;
msg << "Statistics for (" << "," << candidates[i] << ") have incorrect "
<< "number of fields. Found: " << stats.size() << " Expected: "
<< totals.size();
throw runtime_error(msg.str());
vector<float> totals(numCounts);
for (size_t i = 0; i < candidates.size(); ++i) {
//cout << " i " << i << " candidates[i] " << candidates[i] << endl;
ScoreStats stats = _scoreData->get(i,candidates[i]);
if (stats.size() != totals.size()) {
stringstream msg;
msg << "Statistics for (" << "," << candidates[i] << ") have incorrect "
<< "number of fields. Found: " << stats.size() << " Expected: "
<< totals.size();
throw runtime_error(msg.str());
}
for (size_t k = 0; k < totals.size(); ++k) {
totals[k] += stats.get(k);
}
}
for (size_t k = 0; k < totals.size(); ++k) {
totals[k] += stats.get(k);
}
}
scores.push_back(calculateScore(totals));
candidates_t last_candidates(candidates);
@ -59,11 +61,18 @@ void StatisticsBasedScorer::score(const candidates_t& candidates, const diffs_t
for (size_t j = 0; j < diffs[i].size(); ++j) {
size_t sid = diffs[i][j].first;
size_t nid = diffs[i][j].second;
//cout << "STSC:sid = " << sid << endl;
//cout << "STSC:nid = " << nid << endl;
size_t last_nid = last_candidates[sid];
//cout << "STSC:oid = " << last_nid << endl;
for (size_t k = 0; k < totals.size(); ++k) {
int diff = _scoreData->get(sid,nid).get(k)
float diff = _scoreData->get(sid,nid).get(k)
- _scoreData->get(sid,last_nid).get(k);
totals[k] += diff;
//cout << "STSC:nid = " << _scoreData->get(sid,nid).get(k) << endl;
//cout << "STSC:oid = " << _scoreData->get(sid,last_nid).get(k) << endl;
//cout << "STSC:diff = " << diff << endl;
//cout << "STSC:totals = " << totals[k] << endl;
}
last_candidates[sid] = nid;
}
@ -95,3 +104,95 @@ void StatisticsBasedScorer::score(const candidates_t& candidates, const diffs_t
/**
 * Scores candidate selections from sentence-level statistics that have already
 * been calculated; the statistics may be floats.
 *
 * The statistics of the selected entry of every sentence are summed and
 * divided by the number of sentences; the averages are handed to
 * calculateScore() and the result is appended to scores. Every element of
 * diffs is then applied on top of the previous state, updating the running
 * averages incrementally, and one further score is appended per element.
 * Finally the appended scores are optionally regularised over a window.
 *
 * @param candidates  candidates[i] is the index of the entry selected for
 *                    sentence i
 * @param diffs       each element is a list of (sentence id, new entry index)
 *                    pairs to apply to the current selection
 * @param scores      output; receives one score for candidates followed by one
 *                    score per element of diffs
 * @throws runtime_error if no score data is loaded, the score data is empty,
 *         no candidates are supplied, or an entry has an unexpected number of
 *         statistics
 */
void SentenceLevelScorer::score(const candidates_t& candidates, const diffs_t& diffs,
                                statscores_t& scores) {
  if (!_scoreData) {
    throw runtime_error("Score data not loaded");
  }
  //calculate the score for the candidates
  if (_scoreData->size() == 0) {
    throw runtime_error("Score data is empty");
  }
  if (candidates.size() == 0) {
    throw runtime_error("No candidates supplied");
  }

  const size_t numCounts = _scoreData->get(0,candidates[0]).size();
  vector<float> totals(numCounts);
  for (size_t i = 0; i < candidates.size(); ++i) {
    ScoreStats stats = _scoreData->get(i,candidates[i]);
    if (stats.size() != totals.size()) {
      stringstream msg;
      // Report both the sentence index and the entry index of the bad record.
      msg << "Statistics for (" << i << "," << candidates[i] << ") have incorrect "
          << "number of fields. Found: " << stats.size() << " Expected: "
          << totals.size();
      throw runtime_error(msg.str());
    }
    //Add up scores for all sentences, would normally be just one score
    for (size_t k = 0; k < totals.size(); ++k) {
      totals[k] += stats.get(k);
    }
  }

  //take average
  for (size_t k = 0; k < totals.size(); ++k) {
    totals[k] /= candidates.size();
  }
  scores.push_back(calculateScore(totals));

  candidates_t last_candidates(candidates);
  //apply each of the diffs, and get new scores
  for (size_t i = 0; i < diffs.size(); ++i) {
    for (size_t j = 0; j < diffs[i].size(); ++j) {
      size_t sid = diffs[i][j].first;
      size_t nid = diffs[i][j].second;
      size_t last_nid = last_candidates[sid];
      for (size_t k = 0; k < totals.size(); ++k) {
        float diff = _scoreData->get(sid,nid).get(k)
                     - _scoreData->get(sid,last_nid).get(k);
        // totals holds averages, so the change is scaled by the sentence count
        totals[k] += diff/candidates.size();
      }
      last_candidates[sid] = nid;
    }
    scores.push_back(calculateScore(totals));
  }

  //regularisation. This can either be none, or the min or average as described in
  //Cer, Jurafsky and Manning at WMT08
  if (_regularisationStrategy == REG_NONE || _regularisationWindow <= 0) {
    //no regularisation
    return;
  }

  //window size specifies the +/- in each direction
  statscores_t raw_scores(scores);//copy scores
  for (size_t i = 0; i < scores.size(); ++i) {
    size_t start = 0;
    if (i >= _regularisationWindow) {
      start = i - _regularisationWindow;
    }
    size_t end = min(scores.size(), i + _regularisationWindow+1);
    if (_regularisationStrategy == REG_AVERAGE) {
      scores[i] = score_average(raw_scores,start,end);
    } else {
      scores[i] = score_min(raw_scores,start,end);
    }
  }
}

View File

@ -58,7 +58,7 @@ class Scorer {
/**
* returns the number of statistics needed for the computation of the score
**/
virtual size_t NumberOfScores(){ cerr << "Scorer: 0" << endl; return 0; };
virtual size_t NumberOfScores() const { cerr << "Scorer: 0" << endl; return 0; };
/**
* set the reference files. This must be called before prepareStats.
@ -123,9 +123,19 @@ class Scorer {
/**
* Set the score data, prior to scoring.
**/
void setScoreData(ScoreData* data) {
virtual void setScoreData(ScoreData* data) {
_scoreData = data;
}
/**
* Reports whether the scorer uses the reference alignment data
* (needed for permutation distance scores).
* This default implementation always returns false; scorers that need
* alignments override it.
**/
virtual bool useAlignment() const {
//cout << "Scorer::useAlignment returning false " << endl;
return false;
};
//calculate the actual score from the aggregated statistics in totals.
//This default implementation ignores totals and returns 0.
virtual statscore_t calculateScore(const vector<statscore_t>& totals){return 0;};
protected:
typedef map<string,int> encodings_t;
@ -233,10 +243,70 @@ class StatisticsBasedScorer : public Scorer {
~StatisticsBasedScorer(){};
virtual void score(const candidates_t& candidates, const diffs_t& diffs,
statscores_t& scores);
//calculate the actual score from the aggregated statistics in totals.
//This default implementation ignores totals and returns 0.
virtual statscore_t calculateScore(const vector<statscore_t>& totals){return 0;};
protected:
//regularisation
ScorerRegularisationStrategy _regularisationStrategy;
size_t _regularisationWindow;
};
/**
* Abstract base class for scorers that work by using sentence level
* statistics eg. permutation distance metrics **/
class SentenceLevelScorer : public Scorer {
public:
SentenceLevelScorer(const string& name, const string& config): Scorer(name,config) {
//configure regularisation
static string KEY_TYPE = "regtype";
static string KEY_WINDOW = "regwin";
static string KEY_CASE = "case";
static string TYPE_NONE = "none";
static string TYPE_AVERAGE = "average";
static string TYPE_MINIMUM = "min";
static string TRUE = "true";
static string FALSE = "false";
string type = getConfig(KEY_TYPE,TYPE_NONE);
if (type == TYPE_NONE) {
_regularisationStrategy = REG_NONE;
} else if (type == TYPE_AVERAGE) {
_regularisationStrategy = REG_AVERAGE;
} else if (type == TYPE_MINIMUM) {
_regularisationStrategy = REG_MINIMUM;
} else {
throw runtime_error("Unknown scorer regularisation strategy: " + type);
}
cerr << "Using scorer regularisation strategy: " << type << endl;
string window = getConfig(KEY_WINDOW,"0");
_regularisationWindow = atoi(window.c_str());
cerr << "Using scorer regularisation window: " << _regularisationWindow << endl;
string preservecase = getConfig(KEY_CASE,TRUE);
if (preservecase == TRUE) {
_preserveCase = true;
}else if (preservecase == FALSE) {
_preserveCase = false;
}
cerr << "Using case preservation: " << _preserveCase << endl;
}
~SentenceLevelScorer(){};
virtual void score(const candidates_t& candidates, const diffs_t& diffs,
statscores_t& scores);
//calculate the actual score
virtual statscore_t calculateScore(const vector<int>& totals) = 0;
virtual statscore_t calculateScore(const vector<statscore_t>& totals){return 0;};
protected:
//regularisation
ScorerRegularisationStrategy _regularisationStrategy;

View File

@ -13,6 +13,8 @@
#include "Types.h"
#include "Scorer.h"
#include "BleuScorer.h"
#include "PermutationScorer.h"
#include "InterpolatedScorer.h"
#include "PerScorer.h"
using namespace std;
@ -22,18 +24,36 @@ class ScorerFactory {
public:
/**
 * Returns the names of the scorer types, in a fixed order:
 * BLEU1, BLEU, PER, HAMMING, KENDALL.
 **/
vector<string> getTypes() {
  static const char* const typeNames[] = {
    "BLEU1", "BLEU", "PER", "HAMMING", "KENDALL"
  };
  const size_t typeCount = sizeof(typeNames) / sizeof(typeNames[0]);
  return vector<string>(typeNames, typeNames + typeCount);
}
Scorer* getScorer(const string& type, const string& config = "") {
if (type == "BLEU") {
return (BleuScorer*) new BleuScorer(config);
} else if (type == "PER") {
return (PerScorer*) new PerScorer(config);
size_t scorerTypes = type.find(",");
if(scorerTypes == string::npos) {
if (type == "BLEU1") {
string conf;
if (config.length() > 0) {
conf = config + ",ngramlen:1";
} else {
conf = config + "ngramlen:1";
}
return (BleuScorer*) new BleuScorer(conf);
} else if (type == "BLEU") {
return (BleuScorer*) new BleuScorer(config);
} else if (type == "PER") {
return (PerScorer*) new PerScorer(config);
} else if ((type == "HAMMING") || (type == "KENDALL")) {
return (PermutationScorer*) new PermutationScorer(type, config);
} else {
throw runtime_error("Unknown scorer type: " + type);
}
} else {
throw runtime_error("Unknown scorer type: " + type);
return (InterpolatedScorer*) new InterpolatedScorer(type, config);
}
}
};

View File

@ -30,12 +30,15 @@ typedef FeatureStatsType* featstats_t;
typedef vector<FeatureStats> featarray_t;
typedef vector<FeatureArray> featdata_t;
typedef int ScoreStatsType;
typedef float ScoreStatsType;
typedef ScoreStatsType* scorestats_t;
//typedef vector<ScoreStatsType> scorestats_t;
typedef vector<ScoreStats> scorearray_t;
typedef vector<ScoreArray> scoredata_t;
typedef enum { HAMMING_DISTANCE=0, KENDALL_DISTANCE } distanceMetric_t;
typedef enum { REFERENCE_CHOICE_AVERAGE=0, REFERENCE_CHOICE_CLOSEST } distanceMetricReferenceChoice_t;
typedef map<size_t, std::string> idx2name;
typedef map<std::string, size_t> name2idx;

View File

@ -97,5 +97,15 @@ void ResetUserTime();
void PrintUserTime(const std::string &message);
double GetUserTime();
/**
 * Returns a copy of Src with leading and trailing characters removed.
 *
 * @param Src  string to trim
 * @param c    set of characters to strip from both ends
 *             (space, carriage return and newline by default)
 * @return the trimmed string; empty if Src is empty or consists only of
 *         characters from c
 */
inline std::string trimStr(const std::string& Src, const std::string& c = " \r\n")
{
  // Positions must be size_type: narrowing them to unsigned int makes the
  // comparison with npos fail where size_t is wider than unsigned int.
  const std::string::size_type last = Src.find_last_not_of(c);
  if (last == std::string::npos) return std::string();
  // At least one character survives, so the forward search cannot fail.
  const std::string::size_type first = Src.find_first_not_of(c);
  return Src.substr(first, (last - first) + 1);
}
#endif

View File

@ -0,0 +1 @@
1 1

View File

@ -0,0 +1,4 @@
0 ||| yet a étape critical for the balkan ||| d: -1 -5 ||| -217.357 ||| 0-1=0-1 1=1 2=2 3=3 4-5=4-5 6=6
0 ||| yet critical a étape for the balkans ||| d: -3 -1 ||| -217.357 ||| 0=0 3=1 1=2 2=3 4-5=4-5 6=6
1 ||| the public will fall the largess of tourner ||| d: -1 -5 ||| -10 ||| 0=2 6=1 5=0 4=4 5=6 7=7
1 ||| the province 's fate is looming of tourner ||| d: -3 -1 ||| -10 ||| 0-1=0-1 2=2 3=3 4=4 5=5 6=6 7=7

View File

@ -0,0 +1,2 @@
yet a étape critical for the balkan
the public will soon in the eye

View File

@ -0,0 +1,2 @@
0-0 3-1 1-2 2-3 4-4 5-5 6-6
1-6 2-1 2-2 4-7 5-5 6-7

View File

@ -0,0 +1,8 @@
SCORES_TXT_BEGIN_0 0 2 9 BLEU
7 7 6 6 5 5 4 4 7
6 7 2 6 0 5 0 4 7
SCORES_TXT_END_0
SCORES_TXT_BEGIN_0 1 2 9 BLEU
4 8 2 7 1 6 0 5 7
1 8 0 7 0 6 0 5 7
SCORES_TXT_END_0

View File

@ -0,0 +1,8 @@
SCORES_TXT_BEGIN_0 0 2 1 HAMMING
0.57143
1
SCORES_TXT_END_0
SCORES_TXT_BEGIN_0 1 2 1 HAMMING
0.125
0.375
SCORES_TXT_END_0

View File

@ -0,0 +1,2 @@
encore une étape cruciale pour les balkans
le public aura bientôt l' occasion de tourner

10
mert/example/run.sh Normal file
View File

@ -0,0 +1,10 @@
../extractor --nbest NBEST --reference REF --ffile FEATSTAT --scfile SCORESTAT --sctype BLEU
../mert --ifile init.opt --scfile SCORESTAT --ffile FEATSTAT -d 15 --verbose 4 -n 5 --sctype BLEU
../extractor --sctype HAMMING,BLEU --nbest interpolated/tinyNBestAlign --reference interpolated/tinyRef --scconfig refalign:interpolated/tinyRefAlign,source:interpolated/tinySource --ffile interpolated/tinyFeat --scfile interpolated/tinyScoreHB
../mert --ifile interpolated/init.opt --scfile interpolated/tinyScoreHB --ffile interpolated/tinyFeat -d 2 --verbose 4 -n 5 --sctype HAMMING,BLEU --scconfig weights:0.9+0.1
results in orig_BLEU_output

View File

@ -20,7 +20,7 @@ using namespace std;
void usage() {
cerr<<"usage: extractor [options])"<<endl;
cerr<<"[--sctype|-s] the scorer type (default BLEU)"<<endl;
cerr<<"[--sctype|-s] the scorer type (default BLEU), possibly comma separated list of interpolated types"<<endl;
cerr<<"[--scconfig|-c] configuration string passed to scorer"<<endl;
cerr<<"\tThis is of the form NAME1:VAL1,NAME2:VAL2 etc "<<endl;
cerr<<"[--reference|-r] comma separated list of reference files"<<endl;
@ -75,7 +75,7 @@ int main(int argc, char** argv) {
bool binmode = false;
int verbosity = 0;
int c;
while ((c=getopt_long (argc,argv, "s:r:n:S:F:R:E:v:hb", long_options, &option_index)) != -1) {
while ((c=getopt_long (argc,argv, "s:w:r:a:n:S:F:R:E:v:hb", long_options, &option_index)) != -1) {
switch(c) {
case 's':
scorerType = string(optarg);
@ -173,6 +173,7 @@ int main(int argc, char** argv) {
if (binmode) cerr << "Binary write mode is selected" << endl;
else cerr << "Binary write mode is NOT selected" << endl;
//TODO is comma separated list? split and create a scorer with multiple parts
TRACE_ERR("Scorer type: " << scorerType << endl);
ScorerFactory sfactory;
Scorer* scorer = sfactory.getScorer(scorerType,scorerConfig);