run beautify.perl. Consistent formatting for .h & .cpp files in Mert directory

git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/branches/mert-mtm5@4167 1f5c12ca-751b-0410-a591-d2e778427230
This commit is contained in:
machacekmatous 2011-08-29 14:08:17 +00:00
parent 8b97df9367
commit 0a44787f55
39 changed files with 3122 additions and 2863 deletions

View File

@ -1,199 +1,204 @@
#include "BleuScorer.h" #include "BleuScorer.h"
BleuScorer::BleuScorer(const string& config = "") : StatisticsBasedScorer("BLEU",config),_refLengthStrategy(BLEU_CLOSEST) { BleuScorer::BleuScorer(const string& config = "") : StatisticsBasedScorer("BLEU",config),_refLengthStrategy(BLEU_CLOSEST)
//configure regularisation {
static string KEY_REFLEN = "reflen"; //configure regularisation
static string REFLEN_AVERAGE = "average"; static string KEY_REFLEN = "reflen";
static string REFLEN_SHORTEST = "shortest"; static string REFLEN_AVERAGE = "average";
static string REFLEN_CLOSEST = "closest"; static string REFLEN_SHORTEST = "shortest";
static string REFLEN_CLOSEST = "closest";
string reflen = getConfig(KEY_REFLEN,REFLEN_CLOSEST); string reflen = getConfig(KEY_REFLEN,REFLEN_CLOSEST);
if (reflen == REFLEN_AVERAGE) { if (reflen == REFLEN_AVERAGE) {
_refLengthStrategy = BLEU_AVERAGE; _refLengthStrategy = BLEU_AVERAGE;
} else if (reflen == REFLEN_SHORTEST) { } else if (reflen == REFLEN_SHORTEST) {
_refLengthStrategy = BLEU_SHORTEST; _refLengthStrategy = BLEU_SHORTEST;
} else if (reflen == REFLEN_CLOSEST) { } else if (reflen == REFLEN_CLOSEST) {
_refLengthStrategy = BLEU_CLOSEST; _refLengthStrategy = BLEU_CLOSEST;
} else { } else {
throw runtime_error("Unknown reference length strategy: " + reflen); throw runtime_error("Unknown reference length strategy: " + reflen);
} }
cerr << "Using reference length strategy: " << reflen << endl; cerr << "Using reference length strategy: " << reflen << endl;
static string KEY_NGRAMS = "ngramlen"; static string KEY_NGRAMS = "ngramlen";
string ngramlen = getConfig(KEY_NGRAMS,"4"); string ngramlen = getConfig(KEY_NGRAMS,"4");
LENGTH = strtol(ngramlen.c_str(), NULL, 10); LENGTH = strtol(ngramlen.c_str(), NULL, 10);
} }
/** /**
* count the ngrams of each type, up to the given length in the input line. * count the ngrams of each type, up to the given length in the input line.
**/ **/
size_t BleuScorer::countNgrams(const string& line, counts_t& counts, unsigned int n) { size_t BleuScorer::countNgrams(const string& line, counts_t& counts, unsigned int n)
vector<int> encoded_tokens; {
//cerr << line << endl; vector<int> encoded_tokens;
encode(line,encoded_tokens); //cerr << line << endl;
//copy(encoded_tokens.begin(), encoded_tokens.end(), ostream_iterator<int>(cerr," ")); encode(line,encoded_tokens);
//cerr << endl; //copy(encoded_tokens.begin(), encoded_tokens.end(), ostream_iterator<int>(cerr," "));
for (size_t k = 1; k <= n; ++k) { //cerr << endl;
//ngram order longer than sentence - no point for (size_t k = 1; k <= n; ++k) {
if (k > encoded_tokens.size()) { //ngram order longer than sentence - no point
continue; if (k > encoded_tokens.size()) {
} continue;
for (size_t i = 0; i < encoded_tokens.size()-k+1; ++i) { }
vector<int> ngram; for (size_t i = 0; i < encoded_tokens.size()-k+1; ++i) {
for (size_t j = i; j < i+k && j < encoded_tokens.size(); ++j) { vector<int> ngram;
ngram.push_back(encoded_tokens[j]); for (size_t j = i; j < i+k && j < encoded_tokens.size(); ++j) {
} ngram.push_back(encoded_tokens[j]);
int count = 1; }
counts_it oldcount = counts.find(ngram); int count = 1;
if (oldcount != counts.end()) { counts_it oldcount = counts.find(ngram);
count = (oldcount->second) + 1; if (oldcount != counts.end()) {
} count = (oldcount->second) + 1;
//cerr << count << endl; }
counts[ngram] = count; //cerr << count << endl;
//cerr << endl; counts[ngram] = count;
} //cerr << endl;
} }
//cerr << "counted ngrams" << endl; }
//dump_counts(counts); //cerr << "counted ngrams" << endl;
return encoded_tokens.size(); //dump_counts(counts);
return encoded_tokens.size();
} }
void BleuScorer::setReferenceFiles(const vector<string>& referenceFiles) { void BleuScorer::setReferenceFiles(const vector<string>& referenceFiles)
//make sure reference data is clear {
_refcounts.clear(); //make sure reference data is clear
_reflengths.clear(); _refcounts.clear();
_encodings.clear(); _reflengths.clear();
_encodings.clear();
//load reference data //load reference data
for (size_t i = 0; i < referenceFiles.size(); ++i) { for (size_t i = 0; i < referenceFiles.size(); ++i) {
TRACE_ERR("Loading reference from " << referenceFiles[i] << endl); TRACE_ERR("Loading reference from " << referenceFiles[i] << endl);
ifstream refin(referenceFiles[i].c_str()); ifstream refin(referenceFiles[i].c_str());
if (!refin) { if (!refin) {
throw runtime_error("Unable to open: " + referenceFiles[i]); throw runtime_error("Unable to open: " + referenceFiles[i]);
} }
string line; string line;
size_t sid = 0; //sentence counter size_t sid = 0; //sentence counter
while (getline(refin,line)) { while (getline(refin,line)) {
//cerr << line << endl; //cerr << line << endl;
if (i == 0) { if (i == 0) {
counts_t* counts = new counts_t(); //these get leaked counts_t* counts = new counts_t(); //these get leaked
_refcounts.push_back(counts); _refcounts.push_back(counts);
vector<size_t> lengths; vector<size_t> lengths;
_reflengths.push_back(lengths); _reflengths.push_back(lengths);
} }
if (_refcounts.size() <= sid) { if (_refcounts.size() <= sid) {
throw runtime_error("File " + referenceFiles[i] + " has too many sentences"); throw runtime_error("File " + referenceFiles[i] + " has too many sentences");
} }
counts_t counts; counts_t counts;
size_t length = countNgrams(line,counts,LENGTH); size_t length = countNgrams(line,counts,LENGTH);
//for any counts larger than those already there, merge them in //for any counts larger than those already there, merge them in
for (counts_it ci = counts.begin(); ci != counts.end(); ++ci) { for (counts_it ci = counts.begin(); ci != counts.end(); ++ci) {
counts_it oldcount_it = _refcounts[sid]->find(ci->first); counts_it oldcount_it = _refcounts[sid]->find(ci->first);
int oldcount = 0; int oldcount = 0;
if (oldcount_it != _refcounts[sid]->end()) { if (oldcount_it != _refcounts[sid]->end()) {
oldcount = oldcount_it->second; oldcount = oldcount_it->second;
} }
int newcount = ci->second; int newcount = ci->second;
if (newcount > oldcount) { if (newcount > oldcount) {
_refcounts[sid]->operator[](ci->first) = newcount; _refcounts[sid]->operator[](ci->first) = newcount;
} }
} }
//add in the length //add in the length
_reflengths[sid].push_back(length); _reflengths[sid].push_back(length);
if (sid > 0 && sid % 100 == 0) { if (sid > 0 && sid % 100 == 0) {
TRACE_ERR("."); TRACE_ERR(".");
} }
++sid; ++sid;
} }
refin.close(); refin.close();
TRACE_ERR(endl); TRACE_ERR(endl);
} }
} }
void BleuScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry) { void BleuScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
{
// cerr << text << endl; // cerr << text << endl;
// cerr << sid << endl; // cerr << sid << endl;
//dump_counts(*_refcounts[sid]); //dump_counts(*_refcounts[sid]);
if (sid >= _refcounts.size()) { if (sid >= _refcounts.size()) {
stringstream msg; stringstream msg;
msg << "Sentence id (" << sid << ") not found in reference set"; msg << "Sentence id (" << sid << ") not found in reference set";
throw runtime_error(msg.str()); throw runtime_error(msg.str());
} }
counts_t testcounts; counts_t testcounts;
//stats for this line //stats for this line
vector<float> stats(LENGTH*2);; vector<float> stats(LENGTH*2);;
size_t length = countNgrams(text,testcounts,LENGTH); size_t length = countNgrams(text,testcounts,LENGTH);
//dump_counts(testcounts); //dump_counts(testcounts);
if (_refLengthStrategy == BLEU_SHORTEST) { if (_refLengthStrategy == BLEU_SHORTEST) {
//cerr << reflengths.size() << " " << sid << endl; //cerr << reflengths.size() << " " << sid << endl;
int shortest = *min_element(_reflengths[sid].begin(),_reflengths[sid].end()); int shortest = *min_element(_reflengths[sid].begin(),_reflengths[sid].end());
stats.push_back(shortest); stats.push_back(shortest);
} else if (_refLengthStrategy == BLEU_AVERAGE) { } else if (_refLengthStrategy == BLEU_AVERAGE) {
int total = 0; int total = 0;
for (size_t i = 0; i < _reflengths[sid].size(); ++i) { for (size_t i = 0; i < _reflengths[sid].size(); ++i) {
total += _reflengths[sid][i]; total += _reflengths[sid][i];
} }
float mean = (float)total/_reflengths[sid].size(); float mean = (float)total/_reflengths[sid].size();
stats.push_back(mean); stats.push_back(mean);
} else if (_refLengthStrategy == BLEU_CLOSEST) { } else if (_refLengthStrategy == BLEU_CLOSEST) {
int min_diff = INT_MAX; int min_diff = INT_MAX;
int min_idx = 0; int min_idx = 0;
for (size_t i = 0; i < _reflengths[sid].size(); ++i) { for (size_t i = 0; i < _reflengths[sid].size(); ++i) {
int reflength = _reflengths[sid][i]; int reflength = _reflengths[sid][i];
if (abs(reflength-(int)length) < abs(min_diff)) { //look for the closest reference if (abs(reflength-(int)length) < abs(min_diff)) { //look for the closest reference
min_diff = reflength-length; min_diff = reflength-length;
min_idx = i; min_idx = i;
}else if (abs(reflength-(int)length) == abs(min_diff)) { // if two references has the same closest length, take the shortest } else if (abs(reflength-(int)length) == abs(min_diff)) { // if two references has the same closest length, take the shortest
if (reflength < (int)_reflengths[sid][min_idx]){ if (reflength < (int)_reflengths[sid][min_idx]) {
min_idx = i; min_idx = i;
} }
} }
} }
stats.push_back(_reflengths[sid][min_idx]); stats.push_back(_reflengths[sid][min_idx]);
} else { } else {
throw runtime_error("Unsupported reflength strategy"); throw runtime_error("Unsupported reflength strategy");
} }
//cerr << "computed length" << endl; //cerr << "computed length" << endl;
//precision on each ngram type //precision on each ngram type
for (counts_it testcounts_it = testcounts.begin(); for (counts_it testcounts_it = testcounts.begin();
testcounts_it != testcounts.end(); ++testcounts_it) { testcounts_it != testcounts.end(); ++testcounts_it) {
counts_it refcounts_it = _refcounts[sid]->find(testcounts_it->first); counts_it refcounts_it = _refcounts[sid]->find(testcounts_it->first);
int correct = 0; int correct = 0;
int guess = testcounts_it->second; int guess = testcounts_it->second;
if (refcounts_it != _refcounts[sid]->end()) { if (refcounts_it != _refcounts[sid]->end()) {
correct = min(refcounts_it->second,guess); correct = min(refcounts_it->second,guess);
} }
size_t len = testcounts_it->first.size(); size_t len = testcounts_it->first.size();
stats[len*2-2] += correct; stats[len*2-2] += correct;
stats[len*2-1] += guess; stats[len*2-1] += guess;
} }
stringstream sout; stringstream sout;
copy(stats.begin(),stats.end(),ostream_iterator<float>(sout," ")); copy(stats.begin(),stats.end(),ostream_iterator<float>(sout," "));
//TRACE_ERR(sout.str() << endl); //TRACE_ERR(sout.str() << endl);
string stats_str = sout.str(); string stats_str = sout.str();
entry.set(stats_str); entry.set(stats_str);
} }
float BleuScorer::calculateScore(const vector<float>& comps) { float BleuScorer::calculateScore(const vector<float>& comps)
//cerr << "BLEU: "; {
//copy(comps.begin(),comps.end(), ostream_iterator<int>(cerr," ")); //cerr << "BLEU: ";
float logbleu = 0.0; //copy(comps.begin(),comps.end(), ostream_iterator<int>(cerr," "));
for (int i = 0; i < LENGTH; ++i) { float logbleu = 0.0;
if (comps[2*i] == 0) { for (int i = 0; i < LENGTH; ++i) {
return 0.0; if (comps[2*i] == 0) {
} return 0.0;
logbleu += log(comps[2*i]) - log(comps[2*i+1]); }
logbleu += log(comps[2*i]) - log(comps[2*i+1]);
}
logbleu /= LENGTH; }
float brevity = 1.0 - (float)comps[LENGTH*2]/comps[1];//reflength divided by test length logbleu /= LENGTH;
if (brevity < 0.0) { float brevity = 1.0 - (float)comps[LENGTH*2]/comps[1];//reflength divided by test length
logbleu += brevity; if (brevity < 0.0) {
} logbleu += brevity;
//cerr << " " << exp(logbleu) << endl; }
return exp(logbleu); //cerr << " " << exp(logbleu) << endl;
return exp(logbleu);
} }

View File

@ -23,73 +23,74 @@ enum BleuReferenceLengthStrategy { BLEU_AVERAGE, BLEU_SHORTEST, BLEU_CLOSEST };
/** /**
* Bleu scoring * Bleu scoring
**/ **/
class BleuScorer: public StatisticsBasedScorer { class BleuScorer: public StatisticsBasedScorer
public: {
BleuScorer(const string& config); public:
virtual void setReferenceFiles(const vector<string>& referenceFiles); BleuScorer(const string& config);
virtual void prepareStats(size_t sid, const string& text, ScoreStats& entry); virtual void setReferenceFiles(const vector<string>& referenceFiles);
int LENGTH; virtual void prepareStats(size_t sid, const string& text, ScoreStats& entry);
int LENGTH;
size_t NumberOfScores() const {
//cerr << "BleuScorer: " << (2 * LENGTH + 1) << endl;
return (2 * LENGTH + 1);
};
bool useAlignment() const {
//cout << "BleuScorer::useAlignment returning false" << endl;
return false;
};
size_t NumberOfScores() const {
//cerr << "BleuScorer: " << (2 * LENGTH + 1) << endl;
protected: return (2 * LENGTH + 1);
float calculateScore(const vector<float>& comps); };
bool useAlignment() const {
private: //cout << "BleuScorer::useAlignment returning false" << endl;
//no copy return false;
BleuScorer(const BleuScorer&); };
~BleuScorer(){};
BleuScorer& operator=(const BleuScorer&);
//Used to construct the ngram map
struct CompareNgrams {
int operator() (const vector<int>& a, const vector<int>& b) {
size_t i;
size_t as = a.size();
size_t bs = b.size();
for (i = 0; i < as && i < bs; ++i) {
if (a[i] < b[i]) {
//cerr << "true" << endl;
return true;
}
if (a[i] > b[i]) {
//cerr << "false" << endl;
return false;
}
}
//entries are equal, shortest wins
return as < bs;;
}
};
typedef map<vector<int>,int,CompareNgrams> counts_t;
typedef map<vector<int>,int,CompareNgrams>::iterator counts_it;
typedef vector<counts_t*> refcounts_t;
size_t countNgrams(const string& line, counts_t& counts, unsigned int n); protected:
float calculateScore(const vector<float>& comps);
void dump_counts(counts_t& counts) { private:
for (counts_it i = counts.begin(); i != counts.end(); ++i) { //no copy
cerr << "("; BleuScorer(const BleuScorer&);
copy(i->first.begin(), i->first.end(), ostream_iterator<int>(cerr," ")); ~BleuScorer() {};
cerr << ") " << i->second << ", "; BleuScorer& operator=(const BleuScorer&);
} //Used to construct the ngram map
cerr << endl; struct CompareNgrams {
} int operator() (const vector<int>& a, const vector<int>& b) {
BleuReferenceLengthStrategy _refLengthStrategy; size_t i;
size_t as = a.size();
// data extracted from reference files size_t bs = b.size();
refcounts_t _refcounts; for (i = 0; i < as && i < bs; ++i) {
vector<vector<size_t> > _reflengths; if (a[i] < b[i]) {
//cerr << "true" << endl;
return true;
}
if (a[i] > b[i]) {
//cerr << "false" << endl;
return false;
}
}
//entries are equal, shortest wins
return as < bs;;
}
};
typedef map<vector<int>,int,CompareNgrams> counts_t;
typedef map<vector<int>,int,CompareNgrams>::iterator counts_it;
typedef vector<counts_t*> refcounts_t;
size_t countNgrams(const string& line, counts_t& counts, unsigned int n);
void dump_counts(counts_t& counts) {
for (counts_it i = counts.begin(); i != counts.end(); ++i) {
cerr << "(";
copy(i->first.begin(), i->first.end(), ostream_iterator<int>(cerr," "));
cerr << ") " << i->second << ", ";
}
cerr << endl;
}
BleuReferenceLengthStrategy _refLengthStrategy;
// data extracted from reference files
refcounts_t _refcounts;
vector<vector<size_t> > _reflengths;
}; };

View File

@ -13,114 +13,113 @@
Data::Data(Scorer& ptr): Data::Data(Scorer& ptr):
theScorer(&ptr) theScorer(&ptr)
{ {
score_type = (*theScorer).getName(); score_type = (*theScorer).getName();
TRACE_ERR("Data::score_type " << score_type << std::endl); TRACE_ERR("Data::score_type " << score_type << std::endl);
TRACE_ERR("Data::Scorer type from Scorer: " << theScorer->getName() << endl); TRACE_ERR("Data::Scorer type from Scorer: " << theScorer->getName() << endl);
featdata=new FeatureData; featdata=new FeatureData;
scoredata=new ScoreData(*theScorer); scoredata=new ScoreData(*theScorer);
}; };
void Data::loadnbest(const std::string &file) void Data::loadnbest(const std::string &file)
{ {
TRACE_ERR("loading nbest from " << file << std::endl); TRACE_ERR("loading nbest from " << file << std::endl);
FeatureStats featentry; FeatureStats featentry;
ScoreStats scoreentry; ScoreStats scoreentry;
std::string sentence_index; std::string sentence_index;
inputfilestream inp(file); // matches a stream with a file. Opens the file inputfilestream inp(file); // matches a stream with a file. Opens the file
if (!inp.good()) if (!inp.good())
throw runtime_error("Unable to open: " + file); throw runtime_error("Unable to open: " + file);
std::string substring, subsubstring, stringBuf; std::string substring, subsubstring, stringBuf;
std::string theSentence; std::string theSentence;
std::string theFeatures; std::string theFeatures;
std::string theAlignment; std::string theAlignment;
std::string::size_type loc; std::string::size_type loc;
while (getline(inp,stringBuf,'\n')){ while (getline(inp,stringBuf,'\n')) {
if (stringBuf.empty()) continue; if (stringBuf.empty()) continue;
// TRACE_ERR("stringBuf: " << stringBuf << std::endl); // TRACE_ERR("stringBuf: " << stringBuf << std::endl);
getNextPound(stringBuf, substring, "|||"); //first field getNextPound(stringBuf, substring, "|||"); //first field
sentence_index = substring; sentence_index = substring;
getNextPound(stringBuf, substring, "|||"); //second field getNextPound(stringBuf, substring, "|||"); //second field
theSentence = substring; theSentence = substring;
// adding statistics for error measures // adding statistics for error measures
featentry.reset(); featentry.reset();
scoreentry.clear(); scoreentry.clear();
getNextPound(stringBuf, substring, "|||"); //third field getNextPound(stringBuf, substring, "|||"); //third field
theFeatures = substring; theFeatures = substring;
if (stringBuf.length() > 0) { if (stringBuf.length() > 0) {
getNextPound(stringBuf, substring, "|||"); //fourth field sentence score getNextPound(stringBuf, substring, "|||"); //fourth field sentence score
if (stringBuf.length() > 0) { if (stringBuf.length() > 0) {
getNextPound(stringBuf, substring, "|||"); //fourth field only there if alignment scorer getNextPound(stringBuf, substring, "|||"); //fourth field only there if alignment scorer
theAlignment = substring; theAlignment = substring;
} }
} }
//TODO check alignment exists if scorers need it //TODO check alignment exists if scorers need it
if (!theScorer->useAlignment()) { if (!theScorer->useAlignment()) {
theScorer->prepareStats(sentence_index, theSentence, scoreentry); theScorer->prepareStats(sentence_index, theSentence, scoreentry);
} else { } else {
//an interpolated score would need both sentence and alignment //an interpolated score would need both sentence and alignment
theSentence += "|||"; theSentence += "|||";
theSentence += theAlignment; theSentence += theAlignment;
theScorer->prepareStats(sentence_index, theSentence, scoreentry); theScorer->prepareStats(sentence_index, theSentence, scoreentry);
} }
scoredata->add(scoreentry, sentence_index); scoredata->add(scoreentry, sentence_index);
if (!existsFeatureNames()){ if (!existsFeatureNames()) {
std::string stringsupport=theFeatures; std::string stringsupport=theFeatures;
// adding feature names // adding feature names
std::string features=""; std::string features="";
std::string tmpname=""; std::string tmpname="";
size_t tmpidx=0;
while (!stringsupport.empty()) {
// TRACE_ERR("Decompounding: " << substring << std::endl);
getNextPound(stringsupport, subsubstring);
// string ending with ":" are skipped, because they are the names of the features
if ((loc = subsubstring.find(":")) != subsubstring.length()-1) {
features+=tmpname+"_"+stringify(tmpidx)+" ";
tmpidx++;
} else {
tmpidx=0;
tmpname=subsubstring.substr(0,subsubstring.size() - 1);
}
}
featdata->setFeatureMap(features);
}
size_t tmpidx=0;
while (!stringsupport.empty()){
// TRACE_ERR("Decompounding: " << substring << std::endl);
getNextPound(stringsupport, subsubstring);
// string ending with ":" are skipped, because they are the names of the features
if ((loc = subsubstring.find(":")) != subsubstring.length()-1){
features+=tmpname+"_"+stringify(tmpidx)+" ";
tmpidx++;
}
else{
tmpidx=0;
tmpname=subsubstring.substr(0,subsubstring.size() - 1);
}
}
featdata->setFeatureMap(features);
}
// adding features // adding features
while (!theFeatures.empty()){ while (!theFeatures.empty()) {
// TRACE_ERR("Decompounding: " << theFeatures << std::endl); // TRACE_ERR("Decompounding: " << theFeatures << std::endl);
getNextPound(theFeatures, subsubstring); getNextPound(theFeatures, subsubstring);
// string ending with ":" are skipped, because they are the names of the features // string ending with ":" are skipped, because they are the names of the features
if ((loc = subsubstring.find(":")) != subsubstring.length()-1){ if ((loc = subsubstring.find(":")) != subsubstring.length()-1) {
featentry.add(ATOFST(subsubstring.c_str())); featentry.add(ATOFST(subsubstring.c_str()));
} }
} }
featdata->add(featentry,sentence_index); featdata->add(featentry,sentence_index);
} }
inp.close(); inp.close();
} }

View File

@ -24,49 +24,70 @@ class Scorer;
class Data class Data
{ {
protected: protected:
ScoreData* scoredata; ScoreData* scoredata;
FeatureData* featdata; FeatureData* featdata;
private: private:
Scorer* theScorer; Scorer* theScorer;
std::string score_type; std::string score_type;
size_t number_of_scores; //number of scores size_t number_of_scores; //number of scores
public: public:
Data(Scorer& sc); Data(Scorer& sc);
~Data(){};
inline void clear() { scoredata->clear(); featdata->clear(); }
ScoreData* getScoreData() { return scoredata; };
FeatureData* getFeatureData() { return featdata; };
inline size_t NumberOfFeatures() const{ return featdata->NumberOfFeatures(); }
inline void NumberOfFeatures(size_t v){ featdata->NumberOfFeatures(v); }
inline std::string Features() const{ return featdata->Features(); }
inline void Features(const std::string f){ featdata->Features(f); }
void loadnbest(const std::string &file); ~Data() {};
void load(const std::string &featfile,const std::string &scorefile){ inline void clear() {
featdata->load(featfile); scoredata->clear();
scoredata->load(scorefile); featdata->clear();
} }
void save(const std::string &featfile,const std::string &scorefile, bool bin=false){
if (bin) cerr << "Binary write mode is selected" << endl;
else cerr << "Binary write mode is NOT selected" << endl;
featdata->save(featfile, bin);
scoredata->save(scorefile, bin);
}
inline bool existsFeatureNames(){ return featdata->existsFeatureNames(); }; ScoreData* getScoreData() {
return scoredata;
inline std::string getFeatureName(size_t idx){ return featdata->getFeatureName(idx); }; };
inline size_t getFeatureIndex(const std::string& name){ return featdata->getFeatureIndex(name); }; FeatureData* getFeatureData() {
return featdata;
};
inline size_t NumberOfFeatures() const {
return featdata->NumberOfFeatures();
}
inline void NumberOfFeatures(size_t v) {
featdata->NumberOfFeatures(v);
}
inline std::string Features() const {
return featdata->Features();
}
inline void Features(const std::string f) {
featdata->Features(f);
}
void loadnbest(const std::string &file);
void load(const std::string &featfile,const std::string &scorefile) {
featdata->load(featfile);
scoredata->load(scorefile);
}
void save(const std::string &featfile,const std::string &scorefile, bool bin=false) {
if (bin) cerr << "Binary write mode is selected" << endl;
else cerr << "Binary write mode is NOT selected" << endl;
featdata->save(featfile, bin);
scoredata->save(scorefile, bin);
}
inline bool existsFeatureNames() {
return featdata->existsFeatureNames();
};
inline std::string getFeatureName(size_t idx) {
return featdata->getFeatureName(idx);
};
inline size_t getFeatureIndex(const std::string& name) {
return featdata->getFeatureIndex(name);
};
}; };

View File

@ -16,137 +16,137 @@ FeatureArray::FeatureArray(): idx("")
void FeatureArray::savetxt(std::ofstream& outFile) void FeatureArray::savetxt(std::ofstream& outFile)
{ {
outFile << FEATURES_TXT_BEGIN << " " << idx << " " << array_.size() outFile << FEATURES_TXT_BEGIN << " " << idx << " " << array_.size()
<< " " << number_of_features << " " << features << std::endl; << " " << number_of_features << " " << features << std::endl;
for (featarray_t::iterator i = array_.begin(); i !=array_.end(); i++){ for (featarray_t::iterator i = array_.begin(); i !=array_.end(); i++) {
i->savetxt(outFile); i->savetxt(outFile);
outFile << std::endl; outFile << std::endl;
} }
outFile << FEATURES_TXT_END << std::endl; outFile << FEATURES_TXT_END << std::endl;
} }
void FeatureArray::savebin(std::ofstream& outFile) void FeatureArray::savebin(std::ofstream& outFile)
{ {
outFile << FEATURES_BIN_BEGIN << " " << idx << " " << array_.size() outFile << FEATURES_BIN_BEGIN << " " << idx << " " << array_.size()
<< " " << number_of_features << " " << features << std::endl; << " " << number_of_features << " " << features << std::endl;
for (featarray_t::iterator i = array_.begin(); i !=array_.end(); i++) for (featarray_t::iterator i = array_.begin(); i !=array_.end(); i++)
i->savebin(outFile); i->savebin(outFile);
outFile << FEATURES_BIN_END << std::endl; outFile << FEATURES_BIN_END << std::endl;
} }
void FeatureArray::save(std::ofstream& inFile, bool bin) void FeatureArray::save(std::ofstream& inFile, bool bin)
{ {
if (size()>0) if (size()>0)
(bin)?savebin(inFile):savetxt(inFile); (bin)?savebin(inFile):savetxt(inFile);
} }
void FeatureArray::save(const std::string &file, bool bin) void FeatureArray::save(const std::string &file, bool bin)
{ {
std::ofstream outFile(file.c_str(), std::ios::out); // matches a stream with a file. Opens the file std::ofstream outFile(file.c_str(), std::ios::out); // matches a stream with a file. Opens the file
save(outFile); save(outFile);
outFile.close(); outFile.close();
} }
void FeatureArray::loadbin(ifstream& inFile, size_t n) void FeatureArray::loadbin(ifstream& inFile, size_t n)
{ {
FeatureStats entry(number_of_features); FeatureStats entry(number_of_features);
for (size_t i=0 ; i < n; i++){ for (size_t i=0 ; i < n; i++) {
entry.loadbin(inFile); entry.loadbin(inFile);
add(entry); add(entry);
} }
} }
void FeatureArray::loadtxt(ifstream& inFile, size_t n) void FeatureArray::loadtxt(ifstream& inFile, size_t n)
{ {
FeatureStats entry(number_of_features); FeatureStats entry(number_of_features);
for (size_t i=0 ; i < n; i++){ for (size_t i=0 ; i < n; i++) {
entry.loadtxt(inFile); entry.loadtxt(inFile);
add(entry); add(entry);
} }
} }
void FeatureArray::load(ifstream& inFile) void FeatureArray::load(ifstream& inFile)
{ {
size_t number_of_entries=0; size_t number_of_entries=0;
bool binmode=false; bool binmode=false;
std::string substring, stringBuf; std::string substring, stringBuf;
std::string::size_type loc; std::string::size_type loc;
std::getline(inFile, stringBuf); std::getline(inFile, stringBuf);
if (!inFile.good()){ if (!inFile.good()) {
return; return;
} }
if (!stringBuf.empty()){ if (!stringBuf.empty()) {
if ((loc = stringBuf.find(FEATURES_TXT_BEGIN)) == 0){ if ((loc = stringBuf.find(FEATURES_TXT_BEGIN)) == 0) {
binmode=false; binmode=false;
}else if ((loc = stringBuf.find(FEATURES_BIN_BEGIN)) == 0){ } else if ((loc = stringBuf.find(FEATURES_BIN_BEGIN)) == 0) {
binmode=true; binmode=true;
}else{ } else {
TRACE_ERR("ERROR: FeatureArray::load(): Wrong header"); TRACE_ERR("ERROR: FeatureArray::load(): Wrong header");
return; return;
} }
getNextPound(stringBuf, substring); getNextPound(stringBuf, substring);
getNextPound(stringBuf, substring); getNextPound(stringBuf, substring);
idx = substring; idx = substring;
getNextPound(stringBuf, substring); getNextPound(stringBuf, substring);
number_of_entries = atoi(substring.c_str()); number_of_entries = atoi(substring.c_str());
getNextPound(stringBuf, substring); getNextPound(stringBuf, substring);
number_of_features = atoi(substring.c_str()); number_of_features = atoi(substring.c_str());
features = stringBuf; features = stringBuf;
} }
(binmode)?loadbin(inFile, number_of_entries):loadtxt(inFile, number_of_entries); (binmode)?loadbin(inFile, number_of_entries):loadtxt(inFile, number_of_entries);
std::getline(inFile, stringBuf); std::getline(inFile, stringBuf);
if (!stringBuf.empty()){ if (!stringBuf.empty()) {
if ((loc = stringBuf.find(FEATURES_TXT_END)) != 0 && (loc = stringBuf.find(FEATURES_BIN_END)) != 0){ if ((loc = stringBuf.find(FEATURES_TXT_END)) != 0 && (loc = stringBuf.find(FEATURES_BIN_END)) != 0) {
TRACE_ERR("ERROR: FeatureArray::load(): Wrong footer"); TRACE_ERR("ERROR: FeatureArray::load(): Wrong footer");
return; return;
} }
} }
} }
void FeatureArray::load(const std::string &file) void FeatureArray::load(const std::string &file)
{ {
TRACE_ERR("loading data from " << file << std::endl); TRACE_ERR("loading data from " << file << std::endl);
inputfilestream inFile(file); // matches a stream with a file. Opens the file inputfilestream inFile(file); // matches a stream with a file. Opens the file
load((ifstream&) inFile); load((ifstream&) inFile);
inFile.close(); inFile.close();
} }
void FeatureArray::merge(FeatureArray& e) void FeatureArray::merge(FeatureArray& e)
{ {
//dummy implementation //dummy implementation
for (size_t i=0; i<e.size(); i++) for (size_t i=0; i<e.size(); i++)
add(e.get(i)); add(e.get(i));
} }
bool FeatureArray::check_consistency() bool FeatureArray::check_consistency()
{ {
size_t sz = NumberOfFeatures(); size_t sz = NumberOfFeatures();
if (sz == 0) if (sz == 0)
return true; return true;
for (featarray_t::iterator i=array_.begin(); i!=array_.end(); i++) for (featarray_t::iterator i=array_.begin(); i!=array_.end(); i++)
if (i->size()!=sz) if (i->size()!=sz)
return false; return false;
return true; return true;
} }

View File

@ -27,47 +27,71 @@ using namespace std;
class FeatureArray class FeatureArray
{ {
protected: protected:
featarray_t array_; featarray_t array_;
size_t number_of_features; size_t number_of_features;
std::string features; std::string features;
private: private:
std::string idx; // idx to identify the utterance, it can differ from the index inside the vector std::string idx; // idx to identify the utterance, it can differ from the index inside the vector
public: public:
FeatureArray(); FeatureArray();
~FeatureArray(){};
inline void clear() { array_.clear(); }
inline std::string getIndex(){ return idx; }
inline void setIndex(const std::string & value){ idx=value; }
inline FeatureStats& get(size_t i){ return array_.at(i); } ~FeatureArray() {};
inline const FeatureStats& get(size_t i)const{ return array_.at(i); }
void add(FeatureStats e){ array_.push_back(e); }
void merge(FeatureArray& e); inline void clear() {
array_.clear();
}
inline size_t size(){ return array_.size(); } inline std::string getIndex() {
inline size_t NumberOfFeatures() const{ return number_of_features; } return idx;
inline void NumberOfFeatures(size_t v){ number_of_features = v; } }
inline std::string Features() const{ return features; } inline void setIndex(const std::string & value) {
inline void Features(const std::string f){ features = f; } idx=value;
}
void savetxt(ofstream& outFile);
void savebin(ofstream& outFile);
void save(ofstream& outFile, bool bin=false);
void save(const std::string &file, bool bin=false);
inline void save(bool bin=false){ save("/dev/stdout",bin); }
void loadtxt(ifstream& inFile, size_t n); inline FeatureStats& get(size_t i) {
void loadbin(ifstream& inFile, size_t n); return array_.at(i);
void load(ifstream& inFile); }
void load(const std::string &file); inline const FeatureStats& get(size_t i)const {
return array_.at(i);
bool check_consistency(); }
void add(FeatureStats e) {
array_.push_back(e);
}
void merge(FeatureArray& e);
inline size_t size() {
return array_.size();
}
inline size_t NumberOfFeatures() const {
return number_of_features;
}
inline void NumberOfFeatures(size_t v) {
number_of_features = v;
}
inline std::string Features() const {
return features;
}
inline void Features(const std::string f) {
features = f;
}
void savetxt(ofstream& outFile);
void savebin(ofstream& outFile);
void save(ofstream& outFile, bool bin=false);
void save(const std::string &file, bool bin=false);
inline void save(bool bin=false) {
save("/dev/stdout",bin);
}
void loadtxt(ifstream& inFile, size_t n);
void loadbin(ifstream& inFile, size_t n);
void load(ifstream& inFile);
void load(const std::string &file);
bool check_consistency();
}; };

View File

@ -18,127 +18,127 @@ FeatureData::FeatureData() {};
void FeatureData::save(std::ofstream& outFile, bool bin) void FeatureData::save(std::ofstream& outFile, bool bin)
{ {
for (featdata_t::iterator i = array_.begin(); i !=array_.end(); i++) for (featdata_t::iterator i = array_.begin(); i !=array_.end(); i++)
i->save(outFile, bin); i->save(outFile, bin);
} }
void FeatureData::save(const std::string &file, bool bin) void FeatureData::save(const std::string &file, bool bin)
{ {
if (file.empty()) return; if (file.empty()) return;
TRACE_ERR("saving the array into " << file << std::endl); TRACE_ERR("saving the array into " << file << std::endl);
std::ofstream outFile(file.c_str(), std::ios::out); // matches a stream with a file. Opens the file std::ofstream outFile(file.c_str(), std::ios::out); // matches a stream with a file. Opens the file
save(outFile, bin); save(outFile, bin);
outFile.close(); outFile.close();
} }
/**
 * Read FeatureArray entries from inFile until end of file or until an entry
 * loads as empty. The first loaded entry also defines the feature map.
 */
void FeatureData::load(ifstream& inFile)
{
  FeatureArray current;

  for (;;) {
    if (inFile.eof()) break;

    // A bad stream is only reported; reading is still attempted below.
    if (!inFile.good()) {
      std::cerr << "ERROR FeatureData::load inFile.good()" << std::endl;
    }

    current.clear();
    current.load(inFile);

    if (current.size() == 0) break;

    if (size() == 0) setFeatureMap(current.Features());

    add(current);
  }
}
void FeatureData::load(const std::string &file) void FeatureData::load(const std::string &file)
{ {
TRACE_ERR("loading feature data from " << file << std::endl); TRACE_ERR("loading feature data from " << file << std::endl);
inputfilestream inFile(file); // matches a stream with a file. Opens the file inputfilestream inFile(file); // matches a stream with a file. Opens the file
if (!inFile) { if (!inFile) {
throw runtime_error("Unable to open feature file: " + file); throw runtime_error("Unable to open feature file: " + file);
} }
load((ifstream&) inFile); load((ifstream&) inFile);
inFile.close(); inFile.close();
} }
/**
 * Add an array: append it when its identifier is new, otherwise merge it
 * into the stored array that has the same identifier.
 */
void FeatureData::add(FeatureArray& e)
{
  if (!exists(e.getIndex())) {
    // unseen identifier: store the array and rebuild the index maps
    array_.push_back(e);
    setIndex();
    return;
  }

  // known identifier: enlarge the array already stored for it
  size_t where = getIndex(e.getIndex());
  array_.at(where).merge(e);
}
/**
 * Add one FeatureStats entry under identifier sent_idx, creating the
 * containing FeatureArray first when that identifier is not stored yet.
 */
void FeatureData::add(FeatureStats& e, const std::string & sent_idx)
{
  if (!exists(sent_idx)) {
    // no array for sent_idx yet: build one carrying this object's feature
    // description, then register it through add(FeatureArray&)
    FeatureArray fresh;
    fresh.NumberOfFeatures(number_of_features);
    fresh.Features(features);
    fresh.setIndex(sent_idx);
    fresh.add(e);
    add(fresh);
    return;
  }

  // an array for sent_idx already exists: append the entry to it
  size_t where = getIndex(sent_idx);
  array_.at(where).add(e);
}
bool FeatureData::check_consistency() bool FeatureData::check_consistency()
{ {
if (array_.size() == 0) if (array_.size() == 0)
return true; return true;
for (featdata_t::iterator i = array_.begin(); i !=array_.end(); i++)
if (!i->check_consistency()) return false;
return true; for (featdata_t::iterator i = array_.begin(); i !=array_.end(); i++)
if (!i->check_consistency()) return false;
return true;
} }
void FeatureData::setIndex() void FeatureData::setIndex()
{ {
size_t j=0; size_t j=0;
for (featdata_t::iterator i = array_.begin(); i !=array_.end(); i++){ for (featdata_t::iterator i = array_.begin(); i !=array_.end(); i++) {
idx2arrayname_[j]=(*i).getIndex(); idx2arrayname_[j]=(*i).getIndex();
arrayname2idx_[(*i).getIndex()] = j; arrayname2idx_[(*i).getIndex()] = j;
j++; j++;
} }
} }
void FeatureData::setFeatureMap(const std::string feat) void FeatureData::setFeatureMap(const std::string feat)
{ {
number_of_features = 0; number_of_features = 0;
features=feat; features=feat;
std::string substring, stringBuf; std::string substring, stringBuf;
stringBuf=features; stringBuf=features;
while (!stringBuf.empty()){ while (!stringBuf.empty()) {
getNextPound(stringBuf, substring); getNextPound(stringBuf, substring);
featname2idx_[substring]=idx2featname_.size(); featname2idx_[substring]=idx2featname_.size();
idx2featname_[idx2featname_.size()]=substring; idx2featname_[idx2featname_.size()]=substring;
number_of_features++; number_of_features++;
} }
} }

View File

@ -20,86 +20,116 @@ using namespace std;
class FeatureData class FeatureData
{ {
protected: protected:
featdata_t array_; featdata_t array_;
idx2name idx2arrayname_; //map from index to name of array idx2name idx2arrayname_; //map from index to name of array
name2idx arrayname2idx_; //map from name to index of array name2idx arrayname2idx_; //map from name to index of array
private: private:
size_t number_of_features; size_t number_of_features;
std::string features; std::string features;
map<std::string, size_t> featname2idx_; //map from name to index of features
map<size_t, std::string> idx2featname_; //map from index to name of features
map<std::string, size_t> featname2idx_; //map from name to index of features
map<size_t, std::string> idx2featname_; //map from index to name of features
public: public:
FeatureData(); FeatureData();
~FeatureData(){};
inline void clear() { array_.clear(); }
inline FeatureArray get(const std::string& idx){ return array_.at(getIndex(idx)); }
inline FeatureArray& get(size_t idx){ return array_.at(idx); }
inline const FeatureArray& get(size_t idx) const{ return array_.at(idx); }
inline bool exists(const std::string & sent_idx){ return exists(getIndex(sent_idx)); } ~FeatureData() {};
inline bool exists(int sent_idx){ return (sent_idx>-1 && sent_idx<(int) array_.size())?true:false; }
inline FeatureStats& get(size_t i, size_t j){ return array_.at(i).get(j); } inline void clear() {
inline const FeatureStats& get(size_t i, size_t j) const { return array_.at(i).get(j); } array_.clear();
void add(FeatureArray& e);
void add(FeatureStats& e, const std::string& sent_idx);
inline size_t size(){ return array_.size(); }
inline size_t NumberOfFeatures() const{ return number_of_features; }
inline void NumberOfFeatures(size_t v){ number_of_features = v; }
inline std::string Features() const{ return features; }
inline void Features(const std::string f){ features = f; }
void save(const std::string &file, bool bin=false);
void save(ofstream& outFile, bool bin=false);
inline void save(bool bin=false){ save("/dev/stdout", bin); }
void load(ifstream& inFile);
void load(const std::string &file);
bool check_consistency();
void setIndex();
inline int getIndex(const std::string& idx){
name2idx::iterator i = arrayname2idx_.find(idx);
if (i!=arrayname2idx_.end())
return i->second;
else
return -1;
} }
inline std::string getIndex(size_t idx){ inline FeatureArray get(const std::string& idx) {
idx2name::iterator i = idx2arrayname_.find(idx); return array_.at(getIndex(idx));
if (i!=idx2arrayname_.end()) }
throw runtime_error("there is no entry at index " + idx); inline FeatureArray& get(size_t idx) {
return i->second; return array_.at(idx);
} }
inline const FeatureArray& get(size_t idx) const {
return array_.at(idx);
bool existsFeatureNames(){ return (idx2featname_.size() > 0)?true:false; }; }
std::string getFeatureName(size_t idx){ inline bool exists(const std::string & sent_idx) {
if (idx >= idx2featname_.size()) return exists(getIndex(sent_idx));
throw runtime_error("Error: you required an too big index"); }
return idx2featname_[idx]; inline bool exists(int sent_idx) {
}; return (sent_idx>-1 && sent_idx<(int) array_.size())?true:false;
}
size_t getFeatureIndex(const std::string& name){
if (featname2idx_.find(name)!=featname2idx_.end()) inline FeatureStats& get(size_t i, size_t j) {
throw runtime_error("Error: feature is unknown"); return array_.at(i).get(j);
return featname2idx_[name]; }
}; inline const FeatureStats& get(size_t i, size_t j) const {
return array_.at(i).get(j);
}
void add(FeatureArray& e);
void add(FeatureStats& e, const std::string& sent_idx);
inline size_t size() {
return array_.size();
}
inline size_t NumberOfFeatures() const {
return number_of_features;
}
inline void NumberOfFeatures(size_t v) {
number_of_features = v;
}
inline std::string Features() const {
return features;
}
inline void Features(const std::string f) {
features = f;
}
void save(const std::string &file, bool bin=false);
void save(ofstream& outFile, bool bin=false);
inline void save(bool bin=false) {
save("/dev/stdout", bin);
}
void load(ifstream& inFile);
void load(const std::string &file);
bool check_consistency();
void setIndex();
inline int getIndex(const std::string& idx) {
name2idx::iterator i = arrayname2idx_.find(idx);
if (i!=arrayname2idx_.end())
return i->second;
else
return -1;
}
inline std::string getIndex(size_t idx) {
idx2name::iterator i = idx2arrayname_.find(idx);
if (i!=idx2arrayname_.end())
throw runtime_error("there is no entry at index " + idx);
return i->second;
}
bool existsFeatureNames() {
return (idx2featname_.size() > 0)?true:false;
};
std::string getFeatureName(size_t idx) {
if (idx >= idx2featname_.size())
throw runtime_error("Error: you required an too big index");
return idx2featname_[idx];
};
size_t getFeatureIndex(const std::string& name) {
if (featname2idx_.find(name)!=featname2idx_.end())
throw runtime_error("Error: feature is unknown");
return featname2idx_[name];
};
void setFeatureMap(const std::string feat); void setFeatureMap(const std::string feat);
}; };

View File

@ -14,123 +14,124 @@
/** Create an empty object with capacity AVAILABLE_ and no entries. */
FeatureStats::FeatureStats()
  : array_(new FeatureStatsType[AVAILABLE_]),
    entries_(0),
    available_(AVAILABLE_)
{
}
/** Release the value buffer; it is allocated with new[], so delete[] is required. */
FeatureStats::~FeatureStats()
{
  delete[] array_;
}
/**
 * Copy constructor: allocate a buffer with the source's capacity and copy
 * featbytes_ bytes from the source buffer into it.
 */
FeatureStats::FeatureStats(const FeatureStats &stats)
  : array_(new FeatureStatsType[stats.available()]),
    entries_(stats.size()),
    available_(stats.available())
{
  memcpy(array_, stats.getArray(), featbytes_);
}
/**
 * Create an object whose capacity and entry count are both size, then
 * zero featbytes_ bytes of the buffer.
 */
FeatureStats::FeatureStats(const size_t size)
  : array_(new FeatureStatsType[size]),
    entries_(size),
    available_(size)
{
  memset(array_, 0, featbytes_);
}
/**
 * Build the object from a string of values (parsed by set()).
 *
 * The members are initialised exactly as in the default constructor before
 * set() runs; set() resets and appends to the buffer, so it must not be
 * called while array_, entries_ and available_ are still uninitialised.
 */
FeatureStats::FeatureStats(std::string &theString)
  : array_(new FeatureStatsType[AVAILABLE_]),
    entries_(0),
    available_(AVAILABLE_)
{
  set(theString);
}
void FeatureStats::expand() void FeatureStats::expand()
{ {
available_*=2; available_*=2;
featstats_t t_ = new FeatureStatsType[available_]; featstats_t t_ = new FeatureStatsType[available_];
memcpy(t_,array_,featbytes_); memcpy(t_,array_,featbytes_);
delete array_; delete array_;
array_=t_; array_=t_;
} }
/** Append v, growing the buffer first when it is full. */
void FeatureStats::add(FeatureStatsType v)
{
  if (isfull()) {
    expand();
  }
  array_[entries_] = v;
  ++entries_;
}
void FeatureStats::set(std::string &theString) void FeatureStats::set(std::string &theString)
{ {
std::string substring, stringBuf; std::string substring, stringBuf;
reset(); reset();
while (!theString.empty()){ while (!theString.empty()) {
getNextPound(theString, substring); getNextPound(theString, substring);
add(ATOFST(substring.c_str())); add(ATOFST(substring.c_str()));
} }
} }
void FeatureStats::loadbin(std::ifstream& inFile) void FeatureStats::loadbin(std::ifstream& inFile)
{ {
inFile.read((char*) array_, featbytes_); inFile.read((char*) array_, featbytes_);
} }
void FeatureStats::loadtxt(std::ifstream& inFile) void FeatureStats::loadtxt(std::ifstream& inFile)
{ {
std::string theString; std::string theString;
std::getline(inFile, theString); std::getline(inFile, theString);
set(theString); set(theString);
} }
void FeatureStats::loadtxt(const std::string &file) void FeatureStats::loadtxt(const std::string &file)
{ {
// TRACE_ERR("loading the stats from " << file << std::endl); // TRACE_ERR("loading the stats from " << file << std::endl);
std::ifstream inFile(file.c_str(), std::ios::in); // matches a stream with a file. Opens the file std::ifstream inFile(file.c_str(), std::ios::in); // matches a stream with a file. Opens the file
loadtxt(inFile); loadtxt(inFile);
} }
void FeatureStats::savetxt(const std::string &file) void FeatureStats::savetxt(const std::string &file)
{ {
// TRACE_ERR("saving the stats into " << file << std::endl); // TRACE_ERR("saving the stats into " << file << std::endl);
std::ofstream outFile(file.c_str(), std::ios::out); // matches a stream with a file. Opens the file std::ofstream outFile(file.c_str(), std::ios::out); // matches a stream with a file. Opens the file
savetxt(outFile); savetxt(outFile);
} }
void FeatureStats::savetxt(std::ofstream& outFile) void FeatureStats::savetxt(std::ofstream& outFile)
{ {
// TRACE_ERR("saving the stats" << std::endl); // TRACE_ERR("saving the stats" << std::endl);
outFile << *this; outFile << *this;
} }
void FeatureStats::savebin(std::ofstream& outFile) void FeatureStats::savebin(std::ofstream& outFile)
{ {
outFile.write((char*) array_, featbytes_); outFile.write((char*) array_, featbytes_);
} }
/**
 * Copy assignment.
 *
 * Self-assignment is a no-op; without that guard the buffer would be freed
 * and then copied from. The replacement buffer is allocated before the old
 * one is released, and the old one is released with delete[] to match new[].
 */
FeatureStats& FeatureStats::operator=(const FeatureStats &stats)
{
  if (this == &stats) {
    return *this;
  }

  featstats_t fresh = new FeatureStatsType[stats.available()];

  delete[] array_;
  array_ = fresh;
  available_ = stats.available();
  entries_ = stats.size();

  memcpy(array_, stats.getArray(), featbytes_);

  return *this;
}
/** Write every entry of e to the stream, each followed by a single space. */
ostream& operator<<(ostream& o, const FeatureStats& e)
{
  const size_t count = e.size();
  size_t pos = 0;
  while (pos < count) {
    o << e.get(pos) << " ";
    ++pos;
  }
  return o;
}

View File

@ -25,46 +25,67 @@ using namespace std;
class FeatureStats class FeatureStats
{ {
private: private:
featstats_t array_; featstats_t array_;
size_t entries_; size_t entries_;
size_t available_; size_t available_;
public: public:
FeatureStats(); FeatureStats();
FeatureStats(const size_t size); FeatureStats(const size_t size);
FeatureStats(const FeatureStats &stats); FeatureStats(const FeatureStats &stats);
FeatureStats(std::string &theString); FeatureStats(std::string &theString);
FeatureStats& operator=(const FeatureStats &stats); FeatureStats& operator=(const FeatureStats &stats);
~FeatureStats();
bool isfull(){return (entries_ < available_)?0:1; }
void expand();
void add(FeatureStatsType v);
inline void clear() { memset((void*) array_,0,featbytes_); }
inline FeatureStatsType get(size_t i){ return array_[i]; }
inline FeatureStatsType get(size_t i)const{ return array_[i]; }
inline featstats_t getArray() const { return array_; }
void set(std::string &theString); ~FeatureStats();
inline size_t bytes() const{ return featbytes_; } bool isfull() {
inline size_t size() const{ return entries_; } return (entries_ < available_)?0:1;
inline size_t available() const{ return available_; } }
void expand();
void savetxt(const std::string &file); void add(FeatureStatsType v);
void savetxt(ofstream& outFile);
void savebin(ofstream& outFile); inline void clear() {
inline void savetxt(){ savetxt("/dev/stdout"); } memset((void*) array_,0,featbytes_);
}
void loadtxt(const std::string &file);
void loadtxt(ifstream& inFile); inline FeatureStatsType get(size_t i) {
void loadbin(ifstream& inFile); return array_[i];
}
inline FeatureStatsType get(size_t i)const {
return array_[i];
}
inline featstats_t getArray() const {
return array_;
}
void set(std::string &theString);
inline size_t bytes() const {
return featbytes_;
}
inline size_t size() const {
return entries_;
}
inline size_t available() const {
return available_;
}
void savetxt(const std::string &file);
void savetxt(ofstream& outFile);
void savebin(ofstream& outFile);
inline void savetxt() {
savetxt("/dev/stdout");
}
void loadtxt(const std::string &file);
void loadtxt(ifstream& inFile);
void loadbin(ifstream& inFile);
inline void reset() {
entries_ = 0;
clear();
}
inline void reset(){ entries_ = 0; clear(); }
/**write the whole object to a stream*/ /**write the whole object to a stream*/
friend ostream& operator<<(ostream& o, const FeatureStats& e); friend ostream& operator<<(ostream& o, const FeatureStats& e);
}; };

View File

@ -5,196 +5,201 @@
using namespace std; using namespace std;
/**
 * Build an interpolated scorer.
 *
 * name   comma-separated list of scorer types (e.g. "HAMMING,BLEU"); one
 *        scorer is created through ScorerFactory for each entry.
 * config configuration string; the keys read here are "regtype", "regwin",
 *        "case" and "weights" ('+'-separated weights, one per scorer).
 *
 * Throws runtime_error for an unknown regularisation type, an empty scorer
 * list, fewer weights than scorers, or weights that do not sum to 1.
 */
InterpolatedScorer::InterpolatedScorer (const string& name, const string& config): Scorer(name,config)
{
  //configure regularisation
  static string KEY_WEIGHTS = "weights";
  static string KEY_TYPE = "regtype";
  static string KEY_WINDOW = "regwin";
  static string KEY_CASE = "case";
  static string TYPE_NONE = "none";
  static string TYPE_AVERAGE = "average";
  static string TYPE_MINIMUM = "min";
  static string TRUE = "true";
  static string FALSE = "false";

  string type = getConfig(KEY_TYPE,TYPE_NONE);
  if (type == TYPE_NONE) {
    _regularisationStrategy = REG_NONE;
  } else if (type == TYPE_AVERAGE) {
    _regularisationStrategy = REG_AVERAGE;
  } else if (type == TYPE_MINIMUM) {
    _regularisationStrategy = REG_MINIMUM;
  } else {
    throw runtime_error("Unknown scorer regularisation strategy: " + type);
  }
  cerr << "Using scorer regularisation strategy: " << type << endl;

  string window = getConfig(KEY_WINDOW,"0");
  _regularisationWindow = atoi(window.c_str());
  cerr << "Using scorer regularisation window: " << _regularisationWindow << endl;

  // Any value other than "true"/"false" leaves _preserveCase as it was.
  string preservecase = getConfig(KEY_CASE,TRUE);
  if (preservecase == TRUE) {
    _preserveCase = true;
  } else if (preservecase == FALSE) {
    _preserveCase = false;
  }
  cerr << "Using case preservation: " << _preserveCase << endl;

  // name would be: HAMMING,BLEU or similar
  string scorers = name;
  while (scorers.length() > 0) {
    string scorertype = "";
    getNextPound(scorers,scorertype,",");
    ScorerFactory SF;
    Scorer *theScorer=SF.getScorer(scorertype,config);
    _scorers.push_back(theScorer);
  }
  if (_scorers.size() == 0) {
    throw runtime_error("There are no scorers");
  }
  cout << "Number of scorers: " << _scorers.size() << endl;

  //TODO debug this
  string wtype = getConfig(KEY_WEIGHTS,"");
  //Default weights set to uniform ie. if two weights 0.5 each
  //weights should add to 1
  if (wtype.length() == 0) {
    float weight = 1.0/_scorers.size() ;
    //cout << " Default weights:" << weight << endl;
    for (size_t i = 0; i < _scorers.size(); i ++) {
      _scorerWeights.push_back(weight);
    }
  } else {
    float tot=0;
    //cout << "Defined weights:" << endl;
    while (wtype.length() > 0) {
      string scoreweight = "";
      getNextPound(wtype,scoreweight,"+");
      float weight = atof(scoreweight.c_str());
      _scorerWeights.push_back(weight);
      tot += weight;
      //cout << " :" << weight ;
    }
    //cout << endl;

    // score() reads one weight per scorer, so a short list would be read
    // out of bounds later; reject it here instead.
    if (_scorerWeights.size() < _scorers.size()) {
      throw runtime_error("Fewer interpolation weights than scorers");
    }

    // Compare with a tolerance: sums of decimal fractions such as
    // 0.1+0.2+0.7 are generally not exactly 1 in floating point.
    const float tolerance = 1e-4f;
    const float deviation = tot - 1.0f;
    if (deviation > tolerance || deviation < -tolerance) {
      throw runtime_error("The interpolated scorers weights do not sum to 1");
    }
  }

  cout << "The weights for the interpolated scorers are: " << endl;
  for (vector<float>::iterator it = _scorerWeights.begin(); it < _scorerWeights.end(); it++) {
    cout << *it << " " ;
  }
  cout <<endl;
}
/**
 * Store data and hand each wrapped scorer its own ScoreData, built from the
 * consecutive run of score columns that belongs to that scorer. The run
 * width is the scorer's NumberOfScores(); runs follow the scorer order.
 */
void InterpolatedScorer::setScoreData(ScoreData* data)
{
  _scoreData = data;

  size_t offset = 0;  // first column belonging to the current scorer
  for (vector<Scorer*>::iterator sc = _scorers.begin(); sc != _scorers.end(); ++sc) {
    const int width = (*sc)->NumberOfScores();
    ScoreData* slice = new ScoreData(**sc);

    for (size_t sent = 0; sent < data->size(); ++sent) {
      // the new array is indexed by the decimal form of its position
      std::stringstream label;
      label << sent;
      std::string istr = label.str();

      ScoreArray sliceArray;
      const size_t numNBest = data->get(sent).size();

      for (size_t hyp = 0; hyp < numNBest; ++hyp) {
        ScoreStats whole = data->get(sent, hyp);
        ScoreStats part;
        for (size_t k = offset; k < size_t(width + offset); ++k) {
          part.add(whole.get(k));
        }
        sliceArray.add(part);
      }

      sliceArray.setIndex(istr);
      slice->add(sliceArray);
    }

    (*sc)->setScoreData(slice);
    offset += width;
  }
}
/** The interpolated scorer calls a vector of scorers and combines them with /** The interpolated scorer calls a vector of scorers and combines them with
weights **/ weights **/
void InterpolatedScorer::score(const candidates_t& candidates, const diffs_t& diffs, void InterpolatedScorer::score(const candidates_t& candidates, const diffs_t& diffs,
statscores_t& scores) { statscores_t& scores)
{
//cout << "*******InterpolatedScorer::score" << endl; //cout << "*******InterpolatedScorer::score" << endl;
size_t scorerNum = 0; size_t scorerNum = 0;
for (vector<Scorer*>::iterator itsc = _scorers.begin(); itsc!=_scorers.end();itsc++){ for (vector<Scorer*>::iterator itsc = _scorers.begin(); itsc!=_scorers.end(); itsc++) {
int numScores = (*itsc)->NumberOfScores(); int numScores = (*itsc)->NumberOfScores();
statscores_t tscores; statscores_t tscores;
(*itsc)->score(candidates,diffs,tscores); (*itsc)->score(candidates,diffs,tscores);
size_t inc = 0; size_t inc = 0;
for (statscores_t::iterator itstatsc = tscores.begin(); itstatsc!=tscores.end();itstatsc++){ for (statscores_t::iterator itstatsc = tscores.begin(); itstatsc!=tscores.end(); itstatsc++) {
//cout << "Scores " << (*itstatsc) << endl; //cout << "Scores " << (*itstatsc) << endl;
float weight = _scorerWeights[scorerNum]; float weight = _scorerWeights[scorerNum];
if (weight == 0) { if (weight == 0) {
stringstream msg; stringstream msg;
msg << "No weights for scorer" << scorerNum ; msg << "No weights for scorer" << scorerNum ;
throw runtime_error(msg.str()); throw runtime_error(msg.str());
} }
if (scorerNum == 0) { if (scorerNum == 0) {
scores.push_back(weight * (*itstatsc)); scores.push_back(weight * (*itstatsc));
} else { } else {
scores[inc] += weight * (*itstatsc); scores[inc] += weight * (*itstatsc);
} }
//cout << "Scorer:" << scorerNum << " scoreNum:" << inc << " score: " << (*itstatsc) << " weight:" << weight << endl; //cout << "Scorer:" << scorerNum << " scoreNum:" << inc << " score: " << (*itstatsc) << " weight:" << weight << endl;
inc++; inc++;
} }
scorerNum++; scorerNum++;
} }
} }
void InterpolatedScorer::setReferenceFiles(const vector<string>& referenceFiles) { void InterpolatedScorer::setReferenceFiles(const vector<string>& referenceFiles)
for (vector<Scorer *>::iterator itsc = _scorers.begin(); itsc!=_scorers.end();itsc++){ {
//the scorers that use alignments use the reference files in the constructor through config for (vector<Scorer *>::iterator itsc = _scorers.begin(); itsc!=_scorers.end(); itsc++) {
(*itsc)->setReferenceFiles(referenceFiles); //the scorers that use alignments use the reference files in the constructor through config
} (*itsc)->setReferenceFiles(referenceFiles);
}
} }
// Text can be: // Text can be:
// Reference sentence ||| Reference sentence alignment information (as given by MOSES -include-alignment-in-n-best) // Reference sentence ||| Reference sentence alignment information (as given by MOSES -include-alignment-in-n-best)
// If a permutation distance scorer, send alignment info // If a permutation distance scorer, send alignment info
// Else if other scorer, remove the alignment info and then send reference as usual // Else if other scorer, remove the alignment info and then send reference as usual
void InterpolatedScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry) { void InterpolatedScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
stringstream buff; {
string align = text; stringstream buff;
string sentence = ""; string align = text;
size_t alignmentData = text.find("|||"); string sentence = "";
//Get sentence and alignment parts size_t alignmentData = text.find("|||");
if(alignmentData != string::npos) { //Get sentence and alignment parts
getNextPound(align,sentence, "|||"); if(alignmentData != string::npos) {
} getNextPound(align,sentence, "|||");
int i=0; }
for (vector<Scorer*>::iterator itsc = _scorers.begin(); itsc!=_scorers.end();itsc++){ int i=0;
ScoreStats tempEntry; for (vector<Scorer*>::iterator itsc = _scorers.begin(); itsc!=_scorers.end(); itsc++) {
if ((*itsc)->useAlignment()) { ScoreStats tempEntry;
(*itsc)->prepareStats(sid, text, tempEntry); if ((*itsc)->useAlignment()) {
} else { (*itsc)->prepareStats(sid, text, tempEntry);
(*itsc)->prepareStats(sid, sentence, tempEntry); } else {
} (*itsc)->prepareStats(sid, sentence, tempEntry);
if (i > 0) buff << " ";
buff << tempEntry;
i++;
} }
//cout << " Scores for interpolated: " << buff << endl; if (i > 0) buff << " ";
string str = buff.str(); buff << tempEntry;
entry.set(str); i++;
}
//cout << " Scores for interpolated: " << buff << endl;
string str = buff.str();
entry.set(str);
} }

View File

@ -18,48 +18,49 @@
/** /**
* Abstract base class for scorers that include other scorers eg. * Abstract base class for scorers that include other scorers eg.
* Interpolated HAMMING and BLEU scorer **/ * Interpolated HAMMING and BLEU scorer **/
class InterpolatedScorer : public Scorer { class InterpolatedScorer : public Scorer
{
public: public:
// name would be: "HAMMING,BLEU" or similar // name would be: "HAMMING,BLEU" or similar
InterpolatedScorer(const string& name, const string& config); InterpolatedScorer(const string& name, const string& config);
~InterpolatedScorer(){}; ~InterpolatedScorer() {};
void score(const candidates_t& candidates, const diffs_t& diffs, void score(const candidates_t& candidates, const diffs_t& diffs,
statscores_t& scores); statscores_t& scores);
void setReferenceFiles(const vector<string>& referenceFiles); void setReferenceFiles(const vector<string>& referenceFiles);
void prepareStats(size_t sid, const string& text, ScoreStats& entry); void prepareStats(size_t sid, const string& text, ScoreStats& entry);
size_t NumberOfScores() const { size_t NumberOfScores() const {
size_t sz=0; size_t sz=0;
for (vector<Scorer*>::const_iterator itsc = _scorers.begin(); itsc < _scorers.end();itsc++){ for (vector<Scorer*>::const_iterator itsc = _scorers.begin(); itsc < _scorers.end(); itsc++) {
sz += (*itsc)->NumberOfScores(); sz += (*itsc)->NumberOfScores();
} }
return sz; return sz;
}; };
bool useAlignment() const {
//cout << "InterpolatedScorer::useAlignment" << endl;
for (vector<Scorer*>::const_iterator itsc = _scorers.begin(); itsc < _scorers.end();itsc++){
if ((*itsc)->useAlignment()){
//cout <<"InterpolatedScorer::useAlignment Returning true"<<endl;
return true;
}
}
return false;
};
//calculate the actual score - this gets done in the individual scorers bool useAlignment() const {
//statscore_t calculateScore(const vector<statscore_t>& totals); //cout << "InterpolatedScorer::useAlignment" << endl;
void setScoreData(ScoreData* data); for (vector<Scorer*>::const_iterator itsc = _scorers.begin(); itsc < _scorers.end(); itsc++) {
if ((*itsc)->useAlignment()) {
//cout <<"InterpolatedScorer::useAlignment Returning true"<<endl;
return true;
}
}
return false;
};
protected: //calculate the actual score - this gets done in the individual scorers
//statscore_t calculateScore(const vector<statscore_t>& totals);
void setScoreData(ScoreData* data);
//regularisation protected:
ScorerRegularisationStrategy _regularisationStrategy;
size_t _regularisationWindow;
vector<Scorer*> _scorers; //regularisation
vector<float> _scorerWeights; ScorerRegularisationStrategy _regularisationStrategy;
size_t _regularisationWindow;
vector<Scorer*> _scorers;
vector<float> _scorerWeights;
}; };

View File

@ -14,31 +14,34 @@ static const float MAX_FLOAT=numeric_limits<float>::max();
void Optimizer::SetScorer(Scorer *S){ void Optimizer::SetScorer(Scorer *S)
{
if(scorer) if(scorer)
delete scorer; delete scorer;
scorer=S; scorer=S;
} }
void Optimizer::SetFData(FeatureData *F){ void Optimizer::SetFData(FeatureData *F)
{
if(FData) if(FData)
delete FData; delete FData;
FData=F; FData=F;
}; };
Optimizer::Optimizer(unsigned Pd,vector<unsigned> i2O,vector<parameter_t> start):scorer(NULL),FData(NULL){ Optimizer::Optimizer(unsigned Pd,vector<unsigned> i2O,vector<parameter_t> start):scorer(NULL),FData(NULL)
{
//warning: the init vector is a full set of parameters, of dimension pdim! //warning: the init vector is a full set of parameters, of dimension pdim!
Point::pdim=Pd; Point::pdim=Pd;
assert(start.size()==Pd); assert(start.size()==Pd);
Point::dim=i2O.size(); Point::dim=i2O.size();
Point::optindices=i2O; Point::optindices=i2O;
if (Point::pdim>Point::dim){ if (Point::pdim>Point::dim) {
for (unsigned int i=0;i<Point::pdim;i++){ for (unsigned int i=0; i<Point::pdim; i++) {
unsigned int j = 0; unsigned int j = 0;
while (j<Point::dim && i!=i2O[j]) while (j<Point::dim && i!=i2O[j])
j++; j++;
if (j==Point::dim)//the index i wasnt found on optindices, it is a fixed index, we use the value of the start vector if (j==Point::dim)//the index i wasnt found on optindices, it is a fixed index, we use the value of the start vector
Point::fixedweights[i]=start[i]; Point::fixedweights[i]=start[i];
@ -46,12 +49,14 @@ Optimizer::Optimizer(unsigned Pd,vector<unsigned> i2O,vector<parameter_t> start)
} }
}; };
Optimizer::~Optimizer(){ Optimizer::~Optimizer()
{
delete scorer; delete scorer;
delete FData; delete FData;
} }
statscore_t Optimizer::GetStatScore(const Point& param)const{ statscore_t Optimizer::GetStatScore(const Point& param)const
{
vector<unsigned> bests; vector<unsigned> bests;
Get1bests(param,bests); Get1bests(param,bests);
//copy(bests.begin(),bests.end(),ostream_iterator<unsigned>(cerr," ")); //copy(bests.begin(),bests.end(),ostream_iterator<unsigned>(cerr," "));
@ -60,23 +65,25 @@ statscore_t Optimizer::GetStatScore(const Point& param)const{
}; };
/**compute the intersection of 2 lines*/ /**compute the intersection of 2 lines*/
float intersect (float m1, float b1,float m2,float b2){ float intersect (float m1, float b1,float m2,float b2)
{
float isect = ((b2-b1)/(m1-m2)); float isect = ((b2-b1)/(m1-m2));
if (!isfinite(isect)) { if (!isfinite(isect)) {
isect = MAX_FLOAT; isect = MAX_FLOAT;
} }
return isect; return isect;
} }
map<float,diff_t >::iterator AddThreshold(map<float,diff_t >& thresholdmap,float newt,pair<unsigned,unsigned> newdiff){ map<float,diff_t >::iterator AddThreshold(map<float,diff_t >& thresholdmap,float newt,pair<unsigned,unsigned> newdiff)
{
map<float,diff_t>::iterator it=thresholdmap.find(newt); map<float,diff_t>::iterator it=thresholdmap.find(newt);
if(it!=thresholdmap.end()){ if(it!=thresholdmap.end()) {
//the threshold already exists!! this is very unlikely //the threshold already exists!! this is very unlikely
if(it->second.back().first==newdiff.first) if(it->second.back().first==newdiff.first)
it->second.back().second=newdiff.second;//there was already a diff for this sentence, we change the 1 best; it->second.back().second=newdiff.second;//there was already a diff for this sentence, we change the 1 best;
else else
it->second.push_back(newdiff); it->second.push_back(newdiff);
}else{ } else {
//normal case //normal case
pair< map<float,diff_t >::iterator,bool > ins=thresholdmap.insert(threshold(newt,diff_t(1,newdiff))); pair< map<float,diff_t >::iterator,bool > ins=thresholdmap.insert(threshold(newt,diff_t(1,newdiff)));
assert(ins.second);//we really inserted something assert(ins.second);//we really inserted something
@ -86,244 +93,247 @@ map<float,diff_t >::iterator AddThreshold(map<float,diff_t >& thresholdmap,float
}; };
statscore_t Optimizer::LineOptimize(const Point& origin,const Point& direction,Point& bestpoint)const{ statscore_t Optimizer::LineOptimize(const Point& origin,const Point& direction,Point& bestpoint)const
{
// we are looking for the best Point on the line y=Origin+x*direction // we are looking for the best Point on the line y=Origin+x*direction
float min_int=0.0001; float min_int=0.0001;
//typedef pair<unsigned,unsigned> diff;//first the sentence that changes, second is the new 1best for this sentence //typedef pair<unsigned,unsigned> diff;//first the sentence that changes, second is the new 1best for this sentence
//list<threshold> thresholdlist; //list<threshold> thresholdlist;
map<float,diff_t> thresholdmap; map<float,diff_t> thresholdmap;
thresholdmap[MIN_FLOAT]=diff_t(); thresholdmap[MIN_FLOAT]=diff_t();
vector<unsigned> first1best;//the vector of nbests for x=-inf vector<unsigned> first1best;//the vector of nbests for x=-inf
for(unsigned int S=0;S<size();S++){ for(unsigned int S=0; S<size(); S++) {
map<float,diff_t >::iterator previnserted=thresholdmap.begin(); map<float,diff_t >::iterator previnserted=thresholdmap.begin();
//first we determine the translation with the best feature score for each sentence and each value of x //first we determine the translation with the best feature score for each sentence and each value of x
//cerr << "Sentence " << S << endl; //cerr << "Sentence " << S << endl;
multimap<float,unsigned> gradient; multimap<float,unsigned> gradient;
vector<float> f0; vector<float> f0;
f0.resize(FData->get(S).size()); f0.resize(FData->get(S).size());
for(unsigned j=0;j<FData->get(S).size();j++){ for(unsigned j=0; j<FData->get(S).size(); j++) {
gradient.insert(pair<float,unsigned>(direction*(FData->get(S,j)),j));//gradient of the feature function for this particular target sentence gradient.insert(pair<float,unsigned>(direction*(FData->get(S,j)),j));//gradient of the feature function for this particular target sentence
f0[j]=origin*FData->get(S,j);//compute the feature function at the origin point f0[j]=origin*FData->get(S,j);//compute the feature function at the origin point
} }
//now lets compute the 1best for each value of x //now lets compute the 1best for each value of x
// vector<pair<float,unsigned> > onebest; // vector<pair<float,unsigned> > onebest;
multimap<float,unsigned>::iterator gradientit=gradient.begin(); multimap<float,unsigned>::iterator gradientit=gradient.begin();
multimap<float,unsigned>::iterator highest_f0=gradient.begin(); multimap<float,unsigned>::iterator highest_f0=gradient.begin();
float smallest=gradientit->first;//smallest gradient float smallest=gradientit->first;//smallest gradient
//several candidates can have the lowest slope (eg for word penalty where the gradient is an integer ) //several candidates can have the lowest slope (eg for word penalty where the gradient is an integer )
gradientit++; gradientit++;
while(gradientit!=gradient.end()&&gradientit->first==smallest){ while(gradientit!=gradient.end()&&gradientit->first==smallest) {
// cerr<<"ni"<<gradientit->second<<endl;; // cerr<<"ni"<<gradientit->second<<endl;;
//cerr<<"fos"<<f0[gradientit->second]<<" "<<f0[index]<<" "<<index<<endl; //cerr<<"fos"<<f0[gradientit->second]<<" "<<f0[index]<<" "<<index<<endl;
if(f0[gradientit->second]>f0[highest_f0->second]) if(f0[gradientit->second]>f0[highest_f0->second])
highest_f0=gradientit;//the highest line is the one with he highest f0 highest_f0=gradientit;//the highest line is the one with he highest f0
gradientit++; gradientit++;
} }
gradientit = highest_f0; gradientit = highest_f0;
first1best.push_back(highest_f0->second); first1best.push_back(highest_f0->second);
//now we look for the intersections points indicating a change of 1 best //now we look for the intersections points indicating a change of 1 best
//we use the fact that the function is convex, which means that the gradient can only go up //we use the fact that the function is convex, which means that the gradient can only go up
while(gradientit!=gradient.end()){ while(gradientit!=gradient.end()) {
map<float,unsigned>::iterator leftmost=gradientit; map<float,unsigned>::iterator leftmost=gradientit;
float m=gradientit->first; float m=gradientit->first;
float b=f0[gradientit->second]; float b=f0[gradientit->second];
multimap<float,unsigned>::iterator gradientit2=gradientit; multimap<float,unsigned>::iterator gradientit2=gradientit;
gradientit2++; gradientit2++;
float leftmostx=MAX_FLOAT; float leftmostx=MAX_FLOAT;
for(;gradientit2!=gradient.end();gradientit2++){ for(; gradientit2!=gradient.end(); gradientit2++) {
//cerr<<"--"<<d++<<' '<<gradientit2->first<<' '<<gradientit2->second<<endl; //cerr<<"--"<<d++<<' '<<gradientit2->first<<' '<<gradientit2->second<<endl;
//look for all candidate with a gradient bigger than the current one and find the one with the leftmost intersection //look for all candidate with a gradient bigger than the current one and find the one with the leftmost intersection
float curintersect; float curintersect;
if(m!=gradientit2->first){ if(m!=gradientit2->first) {
curintersect=intersect(m,b,gradientit2->first,f0[gradientit2->second]); curintersect=intersect(m,b,gradientit2->first,f0[gradientit2->second]);
//cerr << "curintersect: " << curintersect << " leftmostx: " << leftmostx << endl; //cerr << "curintersect: " << curintersect << " leftmostx: " << leftmostx << endl;
if(curintersect<=leftmostx){ if(curintersect<=leftmostx) {
//we have found an intersection to the left of the leftmost we had so far. //we have found an intersection to the left of the leftmost we had so far.
//we might have curintersect==leftmostx for example is 2 candidates are the same //we might have curintersect==leftmostx for example is 2 candidates are the same
//in that case its better its better to update leftmost to gradientit2 to avoid some recomputing later //in that case its better its better to update leftmost to gradientit2 to avoid some recomputing later
leftmostx=curintersect; leftmostx=curintersect;
leftmost=gradientit2;//this is the new reference leftmost=gradientit2;//this is the new reference
} }
} }
} }
if (leftmost == gradientit) { if (leftmost == gradientit) {
//we didn't find any more intersections //we didn't find any more intersections
//the rightmost bestindex is the one with the highest slope. //the rightmost bestindex is the one with the highest slope.
assert(abs(leftmost->first-gradient.rbegin()->first)<0.0001);//they should be egal but there might be assert(abs(leftmost->first-gradient.rbegin()->first)<0.0001);//they should be egal but there might be
//a small difference due to rounding error //a small difference due to rounding error
break; break;
} }
//we have found the next intersection! //we have found the next intersection!
pair<unsigned,unsigned> newd(S,leftmost->second);//new onebest for Sentence S is leftmost->second pair<unsigned,unsigned> newd(S,leftmost->second);//new onebest for Sentence S is leftmost->second
if(leftmostx-previnserted->first<min_int){ if(leftmostx-previnserted->first<min_int) {
/* Require that the intersection Point be at least min_int /* Require that the intersection Point be at least min_int
to the right of the previous one(for this sentence). If not, we replace the to the right of the previous one(for this sentence). If not, we replace the
previous intersection Point with this one. Yes, it can even previous intersection Point with this one. Yes, it can even
happen that the new intersection Point is slightly to the happen that the new intersection Point is slightly to the
left of the old one, because of numerical imprecision. left of the old one, because of numerical imprecision.
we do not check that we are to the right of the penultimate point also. it this happen the 1best the inteval will be wrong we do not check that we are to the right of the penultimate point also. it this happen the 1best the inteval will be wrong
we are going to replace previnsert by the new one because we do not want to keep we are going to replace previnsert by the new one because we do not want to keep
2 very close threshold: if the minima is there it could be an artifact 2 very close threshold: if the minima is there it could be an artifact
*/ */
map<float,diff_t>::iterator tit=thresholdmap.find(leftmostx); map<float,diff_t>::iterator tit=thresholdmap.find(leftmostx);
if(tit==previnserted){ if(tit==previnserted) {
//the threshold is the same as before can happen if 2 candidates are the same for example //the threshold is the same as before can happen if 2 candidates are the same for example
assert(previnserted->second.back().first==newd.first); assert(previnserted->second.back().first==newd.first);
previnserted->second.back()=newd;//just replace the 1 best fors sentence S previnserted->second.back()=newd;//just replace the 1 best fors sentence S
//previnsert doesnt change //previnsert doesnt change
}else{ } else {
if(tit==thresholdmap.end()){ if(tit==thresholdmap.end()) {
thresholdmap[leftmostx]=previnserted->second;//We keep the diffs at previnsert thresholdmap[leftmostx]=previnserted->second;//We keep the diffs at previnsert
thresholdmap.erase(previnserted);//erase old previnsert thresholdmap.erase(previnserted);//erase old previnsert
previnserted=thresholdmap.find(leftmostx);//point previnsert to the new threshold previnserted=thresholdmap.find(leftmostx);//point previnsert to the new threshold
previnserted->second.back()=newd;//we update the diff for sentence S previnserted->second.back()=newd;//we update the diff for sentence S
}else{//threshold already exists but is not the previous one. } else { //threshold already exists but is not the previous one.
//we append the diffs in previnsert to tit before destroying previnsert //we append the diffs in previnsert to tit before destroying previnsert
tit->second.insert(tit->second.end(),previnserted->second.begin(),previnserted->second.end()); tit->second.insert(tit->second.end(),previnserted->second.begin(),previnserted->second.end());
assert(tit->second.back().first==newd.first); assert(tit->second.back().first==newd.first);
tit->second.back()=newd;//change diff for sentence S tit->second.back()=newd;//change diff for sentence S
thresholdmap.erase(previnserted);//erase old previnsert thresholdmap.erase(previnserted);//erase old previnsert
previnserted=tit;//point previnsert to the new threshold previnserted=tit;//point previnsert to the new threshold
} }
} }
assert(previnserted != thresholdmap.end()); assert(previnserted != thresholdmap.end());
}else{//normal insertion process } else { //normal insertion process
previnserted=AddThreshold(thresholdmap,leftmostx,newd); previnserted=AddThreshold(thresholdmap,leftmostx,newd);
} }
gradientit=leftmost; gradientit=leftmost;
} //while(gradientit!=gradient.end()){ } //while(gradientit!=gradient.end()){
} //loop on S } //loop on S
//now the thresholdlist is up to date: //now the thresholdlist is up to date:
//it contains a list of all the parameter_ts where the function changed its value, along with the nbest list for the interval after each threshold //it contains a list of all the parameter_ts where the function changed its value, along with the nbest list for the interval after each threshold
map<float,diff_t >::iterator thrit; map<float,diff_t >::iterator thrit;
if(verboselevel()>6){ if(verboselevel()>6) {
cerr << "Thresholds:(" <<thresholdmap.size()<<")"<< endl; cerr << "Thresholds:(" <<thresholdmap.size()<<")"<< endl;
for (thrit = thresholdmap.begin();thrit!=thresholdmap.end();thrit++){ for (thrit = thresholdmap.begin(); thrit!=thresholdmap.end(); thrit++) {
cerr << "x: " << thrit->first << " diffs"; cerr << "x: " << thrit->first << " diffs";
for (size_t j = 0; j < thrit->second.size(); ++j) { for (size_t j = 0; j < thrit->second.size(); ++j) {
cerr << " " <<thrit->second[j].first << "," << thrit->second[j].second; cerr << " " <<thrit->second[j].first << "," << thrit->second[j].second;
} }
cerr << endl; cerr << endl;
} }
} }
//last thing to do is compute the Stat score (ie BLEU) and find the minimum //last thing to do is compute the Stat score (ie BLEU) and find the minimum
thrit=thresholdmap.begin(); thrit=thresholdmap.begin();
++thrit;//first diff corrrespond to MIN_FLOAT and first1best ++thrit;//first diff corrrespond to MIN_FLOAT and first1best
diffs_t diffs; diffs_t diffs;
for(;thrit!=thresholdmap.end();thrit++) for(; thrit!=thresholdmap.end(); thrit++)
diffs.push_back(thrit->second); diffs.push_back(thrit->second);
vector<statscore_t> scores=GetIncStatScore(first1best,diffs); vector<statscore_t> scores=GetIncStatScore(first1best,diffs);
thrit=thresholdmap.begin(); thrit=thresholdmap.begin();
statscore_t bestscore=MIN_FLOAT; statscore_t bestscore=MIN_FLOAT;
float bestx=MIN_FLOAT; float bestx=MIN_FLOAT;
assert(scores.size()==thresholdmap.size());//we skipped the first el of thresholdlist but GetIncStatScore return 1 more for first1best assert(scores.size()==thresholdmap.size());//we skipped the first el of thresholdlist but GetIncStatScore return 1 more for first1best
for(unsigned int sc=0;sc!=scores.size();sc++){ for(unsigned int sc=0; sc!=scores.size(); sc++) {
//cerr << "x=" << thrit->first << " => " << scores[sc] << endl; //cerr << "x=" << thrit->first << " => " << scores[sc] << endl;
if (scores[sc] > bestscore) { if (scores[sc] > bestscore) {
//This is the score for the interval [lit2->first, (lit2+1)->first] //This is the score for the interval [lit2->first, (lit2+1)->first]
//unless we're at the last score, when it's the score //unless we're at the last score, when it's the score
//for the interval [lit2->first,+inf] //for the interval [lit2->first,+inf]
bestscore = scores[sc]; bestscore = scores[sc];
//if we're not in [-inf,x1] or [xn,+inf] then just take the value //if we're not in [-inf,x1] or [xn,+inf] then just take the value
//if x which splits the interval in half. For the rightmost interval, //if x which splits the interval in half. For the rightmost interval,
//take x to be the last interval boundary + 0.1, and for the leftmost //take x to be the last interval boundary + 0.1, and for the leftmost
//interval, take x to be the first interval boundary - 1000. //interval, take x to be the first interval boundary - 1000.
//These values are taken from cmert. //These values are taken from cmert.
float leftx = thrit->first; float leftx = thrit->first;
if (thrit == thresholdmap.begin()) { if (thrit == thresholdmap.begin()) {
leftx = MIN_FLOAT; leftx = MIN_FLOAT;
} }
++thrit; ++thrit;
float rightx = MAX_FLOAT; float rightx = MAX_FLOAT;
if (thrit != thresholdmap.end()) { if (thrit != thresholdmap.end()) {
rightx = thrit->first; rightx = thrit->first;
} }
--thrit; --thrit;
//cerr << "leftx: " << leftx << " rightx: " << rightx << endl; //cerr << "leftx: " << leftx << " rightx: " << rightx << endl;
if (leftx == MIN_FLOAT) { if (leftx == MIN_FLOAT) {
bestx = rightx-1000; bestx = rightx-1000;
} else if (rightx == MAX_FLOAT) { } else if (rightx == MAX_FLOAT) {
bestx = leftx+0.1; bestx = leftx+0.1;
} else { } else {
bestx = 0.5 * (rightx + leftx); bestx = 0.5 * (rightx + leftx);
} }
//cerr << "x = " << "set new bestx to: " << bestx << endl; //cerr << "x = " << "set new bestx to: " << bestx << endl;
} }
++thrit; ++thrit;
} }
if(abs(bestx)<0.00015){ if(abs(bestx)<0.00015) {
bestx=0.0;//the origin of the line is the best point!we put it back at 0 so we do not propagate rounding erros bestx=0.0;//the origin of the line is the best point!we put it back at 0 so we do not propagate rounding erros
//finally! we manage to extract the best score; //finally! we manage to extract the best score;
//now we convert bestx (position on the line) to a point! //now we convert bestx (position on the line) to a point!
if(verboselevel()>4) if(verboselevel()>4)
cerr<<"best point on line at origin"<<endl; cerr<<"best point on line at origin"<<endl;
} }
if(verboselevel()>3){ if(verboselevel()>3) {
// cerr<<"end Lineopt, bestx="<<bestx<<endl; // cerr<<"end Lineopt, bestx="<<bestx<<endl;
} }
bestpoint=direction*bestx+origin; bestpoint=direction*bestx+origin;
bestpoint.score=bestscore; bestpoint.score=bestscore;
return bestscore; return bestscore;
}; };
void Optimizer::Get1bests(const Point& P,vector<unsigned>& bests)const{ void Optimizer::Get1bests(const Point& P,vector<unsigned>& bests)const
{
assert(FData); assert(FData);
bests.clear(); bests.clear();
bests.resize(size()); bests.resize(size());
for(unsigned i=0;i<size();i++){ for(unsigned i=0; i<size(); i++) {
float bestfs=MIN_FLOAT; float bestfs=MIN_FLOAT;
unsigned idx=0; unsigned idx=0;
unsigned j; unsigned j;
for(j=0;j<FData->get(i).size();j++){ for(j=0; j<FData->get(i).size(); j++) {
float curfs=P*FData->get(i,j); float curfs=P*FData->get(i,j);
if(curfs>bestfs){ if(curfs>bestfs) {
bestfs=curfs; bestfs=curfs;
idx=j; idx=j;
} }
} }
bests[i]=idx; bests[i]=idx;
} }
} }
statscore_t Optimizer::Run(Point& P)const{ statscore_t Optimizer::Run(Point& P)const
if(!FData){ {
if(!FData) {
cerr<<"error trying to optimize without Features loaded"<<endl; cerr<<"error trying to optimize without Features loaded"<<endl;
exit(2); exit(2);
} }
if(!scorer){ if(!scorer) {
cerr<<"error trying to optimize without a Scorer loaded"<<endl; cerr<<"error trying to optimize without a Scorer loaded"<<endl;
exit(2); exit(2);
} }
if (scorer->getReferenceSize()!=FData->size()){ if (scorer->getReferenceSize()!=FData->size()) {
cerr<<"error size mismatch between FeatureData and Scorer"<<endl; cerr<<"error size mismatch between FeatureData and Scorer"<<endl;
exit(2); exit(2);
} }
statscore_t score=GetStatScore(P); statscore_t score=GetStatScore(P);
P.score=score; P.score=score;
if(verboselevel()>2) if(verboselevel()>2)
cerr<<"Starting point: "<< P << " => "<< P.score << endl; cerr<<"Starting point: "<< P << " => "<< P.score << endl;
statscore_t s=TrueRun(P); statscore_t s=TrueRun(P);
P.score=s;//just in case its not done in TrueRun P.score=s;//just in case its not done in TrueRun
@ -331,9 +341,10 @@ statscore_t Optimizer::Run(Point& P)const{
cerr<<"Ending point: "<< P <<" => "<< s << endl; cerr<<"Ending point: "<< P <<" => "<< s << endl;
return s; return s;
} }
vector<statscore_t> Optimizer::GetIncStatScore(vector<unsigned> thefirst,vector<vector <pair<unsigned,unsigned> > > thediffs)const{
vector<statscore_t> Optimizer::GetIncStatScore(vector<unsigned> thefirst,vector<vector <pair<unsigned,unsigned> > > thediffs)const
{
assert(scorer); assert(scorer);
vector<statscore_t> theres; vector<statscore_t> theres;
@ -347,61 +358,62 @@ vector<statscore_t> Optimizer::GetIncStatScore(vector<unsigned> thefirst,vector<
//---------------- code for the powell optimizer //---------------- code for the powell optimizer
float SimpleOptimizer::eps=0.0001; float SimpleOptimizer::eps=0.0001;
statscore_t SimpleOptimizer::TrueRun(Point& P)const{ statscore_t SimpleOptimizer::TrueRun(Point& P)const
{
statscore_t prevscore=0; statscore_t prevscore=0;
statscore_t bestscore=MIN_FLOAT; statscore_t bestscore=MIN_FLOAT;
Point best; Point best;
//If P is already defined and provides a score //If P is already defined and provides a score
//we must improve over this score //we must improve over this score
if(P.score>bestscore){ if(P.score>bestscore) {
bestscore=P.score; bestscore=P.score;
best=P; best=P;
} }
int nrun=0; int nrun=0;
do{ do {
++nrun; ++nrun;
if(verboselevel()>2&&nrun>1) if(verboselevel()>2&&nrun>1)
cerr<<"last diff="<<bestscore-prevscore<<" nrun "<<nrun<<endl; cerr<<"last diff="<<bestscore-prevscore<<" nrun "<<nrun<<endl;
prevscore=bestscore; prevscore=bestscore;
Point linebest; Point linebest;
for(unsigned int d=0;d<Point::getdim();d++){ for(unsigned int d=0; d<Point::getdim(); d++) {
if(verboselevel()>4){ if(verboselevel()>4) {
// cerr<<"minimizing along direction "<<d<<endl; // cerr<<"minimizing along direction "<<d<<endl;
cerr<<"starting point: " << P << " => " << prevscore << endl; cerr<<"starting point: " << P << " => " << prevscore << endl;
} }
Point direction; Point direction;
for(unsigned int i=0;i<Point::getdim();i++) for(unsigned int i=0; i<Point::getdim(); i++)
direction[i]; direction[i];
direction[d]=1.0; direction[d]=1.0;
statscore_t curscore=LineOptimize(P,direction,linebest);//find the minimum on the line statscore_t curscore=LineOptimize(P,direction,linebest);//find the minimum on the line
if(verboselevel()>5){ if(verboselevel()>5) {
cerr<<"direction: "<< d << " => " << curscore << endl; cerr<<"direction: "<< d << " => " << curscore << endl;
cerr<<"\tending point: "<< linebest << " => " << curscore << endl; cerr<<"\tending point: "<< linebest << " => " << curscore << endl;
} }
if(curscore>bestscore){ if(curscore>bestscore) {
bestscore=curscore; bestscore=curscore;
best=linebest; best=linebest;
if(verboselevel()>3){ if(verboselevel()>3) {
cerr<<"new best dir:"<<d<<" ("<<nrun<<")"<<endl; cerr<<"new best dir:"<<d<<" ("<<nrun<<")"<<endl;
cerr<<"new best Point "<<best<< " => " <<curscore<<endl; cerr<<"new best Point "<<best<< " => " <<curscore<<endl;
} }
} }
} }
P=best;//update the current vector with the best point on all line tested P=best;//update the current vector with the best point on all line tested
if(verboselevel()>3) if(verboselevel()>3)
cerr<<nrun<<"\t"<<P<<endl; cerr<<nrun<<"\t"<<P<<endl;
}while(bestscore-prevscore>eps); } while(bestscore-prevscore>eps);
if(verboselevel()>2){ if(verboselevel()>2) {
cerr<<"end Powell Algo, nrun="<<nrun<<endl; cerr<<"end Powell Algo, nrun="<<nrun<<endl;
cerr<<"last diff="<<bestscore-prevscore<<endl; cerr<<"last diff="<<bestscore-prevscore<<endl;
cerr<<"\t"<<P<<endl; cerr<<"\t"<<P<<endl;
} }
return bestscore; return bestscore;
} }
@ -409,58 +421,63 @@ statscore_t SimpleOptimizer::TrueRun(Point& P)const{
/**RandomOptimizer to use as beaseline and test.\n /**RandomOptimizer to use as beaseline and test.\n
Just return a random point*/ Just return a random point*/
statscore_t RandomOptimizer::TrueRun(Point& P)const{ statscore_t RandomOptimizer::TrueRun(Point& P)const
{
vector<parameter_t> min(Point::getdim()); vector<parameter_t> min(Point::getdim());
vector<parameter_t> max(Point::getdim()); vector<parameter_t> max(Point::getdim());
for(unsigned int d=0;d<Point::getdim();d++){ for(unsigned int d=0; d<Point::getdim(); d++) {
min[d]=0.0; min[d]=0.0;
max[d]=1.0; max[d]=1.0;
} }
P.Randomize(min,max); P.Randomize(min,max);
statscore_t score=GetStatScore(P); statscore_t score=GetStatScore(P);
P.score=score; P.score=score;
return score; return score;
} }
//-------------------------------------- //--------------------------------------
vector<string> OptimizerFactory::typenames; vector<string> OptimizerFactory::typenames;
void OptimizerFactory::SetTypeNames(){ void OptimizerFactory::SetTypeNames()
if(typenames.empty()){ {
if(typenames.empty()) {
typenames.resize(NOPTIMIZER); typenames.resize(NOPTIMIZER);
typenames[POWELL]="powell"; typenames[POWELL]="powell";
typenames[RANDOM]="random"; typenames[RANDOM]="random";
//add new type there //add new type there
} }
} }
vector<string> OptimizerFactory::GetTypeNames(){ vector<string> OptimizerFactory::GetTypeNames()
{
if(typenames.empty()) if(typenames.empty())
SetTypeNames(); SetTypeNames();
return typenames; return typenames;
} }
OptimizerFactory::OptType OptimizerFactory::GetOType(string type){ OptimizerFactory::OptType OptimizerFactory::GetOType(string type)
{
unsigned int thetype; unsigned int thetype;
if(typenames.empty()) if(typenames.empty())
SetTypeNames(); SetTypeNames();
for(thetype=0;thetype<typenames.size();thetype++) for(thetype=0; thetype<typenames.size(); thetype++)
if(typenames[thetype]==type) if(typenames[thetype]==type)
break; break;
return((OptType)thetype); return((OptType)thetype);
}; };
Optimizer* OptimizerFactory::BuildOptimizer(unsigned dim,vector<unsigned> i2o,vector<parameter_t> start,string type){ Optimizer* OptimizerFactory::BuildOptimizer(unsigned dim,vector<unsigned> i2o,vector<parameter_t> start,string type)
{
OptType T=GetOType(type); OptType T=GetOType(type);
if(T==NOPTIMIZER){ if(T==NOPTIMIZER) {
cerr<<"Error: unknown Optimizer type "<<type<<endl; cerr<<"Error: unknown Optimizer type "<<type<<endl;
cerr<<"Known Algorithm are:"<<endl; cerr<<"Known Algorithm are:"<<endl;
unsigned int thetype; unsigned int thetype;
for(thetype=0;thetype<typenames.size();thetype++) for(thetype=0; thetype<typenames.size(); thetype++)
cerr<<typenames[thetype]<<endl; cerr<<typenames[thetype]<<endl;
throw ("unknown Optimizer Type"); throw ("unknown Optimizer Type");
} }
switch((OptType)T){ switch((OptType)T) {
case POWELL: case POWELL:
return new SimpleOptimizer(dim,i2o,start); return new SimpleOptimizer(dim,i2o,start);
break; break;
@ -469,6 +486,6 @@ Optimizer* OptimizerFactory::BuildOptimizer(unsigned dim,vector<unsigned> i2o,ve
break; break;
default: default:
cerr<<"Error: unknown optimizer"<<type<<endl; cerr<<"Error: unknown optimizer"<<type<<endl;
return NULL; return NULL;
} }
} }

View File

@ -15,61 +15,69 @@ typedef float featurescore;
using namespace std; using namespace std;
/**abstract virtual class*/ /**abstract virtual class*/
class Optimizer{ class Optimizer
protected: {
Scorer * scorer; //no accessor for them only child can use them protected:
FeatureData * FData;//no accessor for them only child can use them Scorer * scorer; //no accessor for them only child can use them
public: FeatureData * FData;//no accessor for them only child can use them
public:
Optimizer(unsigned Pd,vector<unsigned> i2O,vector<parameter_t> start); Optimizer(unsigned Pd,vector<unsigned> i2O,vector<parameter_t> start);
void SetScorer(Scorer *S); void SetScorer(Scorer *S);
void SetFData(FeatureData *F); void SetFData(FeatureData *F);
virtual ~Optimizer(); virtual ~Optimizer();
unsigned size()const{return (FData?FData->size():0);} unsigned size()const {
return (FData?FData->size():0);
}
/**Generic wrapper around TrueRun to check a few things. Non virtual*/ /**Generic wrapper around TrueRun to check a few things. Non virtual*/
statscore_t Run(Point&)const; statscore_t Run(Point&)const;
/**main function that perform an optimization*/ /**main function that perform an optimization*/
virtual statscore_t TrueRun(Point&)const=0; virtual statscore_t TrueRun(Point&)const=0;
/**given a set of lambdas, get the nbest for each sentence*/ /**given a set of lambdas, get the nbest for each sentence*/
void Get1bests(const Point& param,vector<unsigned>& bests)const; void Get1bests(const Point& param,vector<unsigned>& bests)const;
/**given a set of nbests, get the Statistical score*/ /**given a set of nbests, get the Statistical score*/
statscore_t GetStatScore(const vector<unsigned>& nbests)const{return scorer->score(nbests);}; statscore_t GetStatScore(const vector<unsigned>& nbests)const {
return scorer->score(nbests);
};
/**given a set of lambdas, get the total statistical score*/ /**given a set of lambdas, get the total statistical score*/
statscore_t GetStatScore(const Point& param)const; statscore_t GetStatScore(const Point& param)const;
vector<statscore_t > GetIncStatScore(vector<unsigned> ref,vector<vector <pair<unsigned,unsigned> > >)const; vector<statscore_t > GetIncStatScore(vector<unsigned> ref,vector<vector <pair<unsigned,unsigned> > >)const;
statscore_t LineOptimize(const Point& start,const Point& direction,Point& best)const;//Get the optimal Lambda and the best score in a particular direction from a given Point statscore_t LineOptimize(const Point& start,const Point& direction,Point& best)const;//Get the optimal Lambda and the best score in a particular direction from a given Point
}; };
/**default basic optimizer*/ /**default basic optimizer*/
class SimpleOptimizer: public Optimizer{ class SimpleOptimizer: public Optimizer
{
private: private:
static float eps; static float eps;
public: public:
SimpleOptimizer(unsigned dim,vector<unsigned> i2O,vector<parameter_t> start):Optimizer(dim,i2O,start){}; SimpleOptimizer(unsigned dim,vector<unsigned> i2O,vector<parameter_t> start):Optimizer(dim,i2O,start) {};
virtual statscore_t TrueRun(Point&)const; virtual statscore_t TrueRun(Point&)const;
}; };
class RandomOptimizer: public Optimizer{ class RandomOptimizer: public Optimizer
{
public: public:
RandomOptimizer(unsigned dim,vector<unsigned> i2O,vector<parameter_t> start):Optimizer(dim,i2O,start){}; RandomOptimizer(unsigned dim,vector<unsigned> i2O,vector<parameter_t> start):Optimizer(dim,i2O,start) {};
virtual statscore_t TrueRun(Point&)const; virtual statscore_t TrueRun(Point&)const;
}; };
class OptimizerFactory{ class OptimizerFactory
public: {
public:
// unsigned dim; // unsigned dim;
//Point Start; //Point Start;
static vector<string> GetTypeNames(); static vector<string> GetTypeNames();
static Optimizer* BuildOptimizer(unsigned dim,vector<unsigned>tooptimize,vector<parameter_t> start,string type); static Optimizer* BuildOptimizer(unsigned dim,vector<unsigned>tooptimize,vector<parameter_t> start,string type);
private: private:
enum OptType{POWELL=0,RANDOM,NOPTIMIZER};//Add new optimizer here BEFORE NOPTIMZER enum OptType {POWELL=0,RANDOM,NOPTIMIZER}; //Add new optimizer here BEFORE NOPTIMZER
static OptType GetOType(string); static OptType GetOType(string);
static vector<string> typenames; static vector<string> typenames;
static void SetTypeNames(); static void SetTypeNames();
}; };

View File

@ -1,69 +1,72 @@
#include "PerScorer.h" #include "PerScorer.h"
void PerScorer::setReferenceFiles(const vector<string>& referenceFiles) { void PerScorer::setReferenceFiles(const vector<string>& referenceFiles)
// for each line in the reference file, create a multiset of the {
// word ids // for each line in the reference file, create a multiset of the
if (referenceFiles.size() != 1) { // word ids
throw runtime_error("PER only supports a single reference"); if (referenceFiles.size() != 1) {
throw runtime_error("PER only supports a single reference");
}
_reftokens.clear();
_reflengths.clear();
ifstream in(referenceFiles[0].c_str());
if (!in) {
throw runtime_error("Unable to open " + referenceFiles[0]);
}
string line;
int sid = 0;
while (getline(in,line)) {
vector<int> tokens;
encode(line,tokens);
_reftokens.push_back(multiset<int>());
for (size_t i = 0; i < tokens.size(); ++i) {
_reftokens.back().insert(tokens[i]);
} }
_reftokens.clear(); _reflengths.push_back(tokens.size());
_reflengths.clear(); if (sid > 0 && sid % 100 == 0) {
ifstream in(referenceFiles[0].c_str()); TRACE_ERR(".");
if (!in) {
throw runtime_error("Unable to open " + referenceFiles[0]);
} }
string line; ++sid;
int sid = 0; }
while (getline(in,line)) { TRACE_ERR(endl);
vector<int> tokens;
encode(line,tokens);
_reftokens.push_back(multiset<int>());
for (size_t i = 0; i < tokens.size(); ++i) {
_reftokens.back().insert(tokens[i]);
}
_reflengths.push_back(tokens.size());
if (sid > 0 && sid % 100 == 0) {
TRACE_ERR(".");
}
++sid;
}
TRACE_ERR(endl);
} }
void PerScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry) { void PerScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
if (sid >= _reflengths.size()) { {
stringstream msg; if (sid >= _reflengths.size()) {
msg << "Sentence id (" << sid << ") not found in reference set"; stringstream msg;
throw runtime_error(msg.str()); msg << "Sentence id (" << sid << ") not found in reference set";
} throw runtime_error(msg.str());
//calculate correct, output_length and ref_length for }
//the line and store it in entry //calculate correct, output_length and ref_length for
vector<int> testtokens; //the line and store it in entry
encode(text,testtokens); vector<int> testtokens;
multiset<int> testtokens_all(testtokens.begin(),testtokens.end()); encode(text,testtokens);
set<int> testtokens_unique(testtokens.begin(),testtokens.end()); multiset<int> testtokens_all(testtokens.begin(),testtokens.end());
int correct = 0; set<int> testtokens_unique(testtokens.begin(),testtokens.end());
for (set<int>::iterator i = testtokens_unique.begin(); int correct = 0;
i != testtokens_unique.end(); ++i) { for (set<int>::iterator i = testtokens_unique.begin();
int token = *i; i != testtokens_unique.end(); ++i) {
correct += min(_reftokens[sid].count(token), testtokens_all.count(token)); int token = *i;
} correct += min(_reftokens[sid].count(token), testtokens_all.count(token));
}
ostringstream stats;
stats << correct << " " << testtokens.size() << " " << _reflengths[sid] << " " ; ostringstream stats;
string stats_str = stats.str(); stats << correct << " " << testtokens.size() << " " << _reflengths[sid] << " " ;
entry.set(stats_str); string stats_str = stats.str();
entry.set(stats_str);
} }
float PerScorer::calculateScore(const vector<float>& comps) { float PerScorer::calculateScore(const vector<float>& comps)
float denom = comps[2]; {
float num = comps[0] - max(float(0),comps[1]-comps[2]); float denom = comps[2];
if (denom == 0) { float num = comps[0] - max(float(0),comps[1]-comps[2]);
//shouldn't happen! if (denom == 0) {
return 0.0; //shouldn't happen!
} else { return 0.0;
return num/denom; } else {
} return num/denom;
}
} }

View File

@ -22,34 +22,40 @@ using namespace std;
* as 1 - (correct - max(0,output_length - ref_length)) / ref_length * as 1 - (correct - max(0,output_length - ref_length)) / ref_length
* In fact, we ignore the " 1 - " so that it can be maximised. * In fact, we ignore the " 1 - " so that it can be maximised.
**/ **/
class PerScorer: public StatisticsBasedScorer { class PerScorer: public StatisticsBasedScorer
public: {
PerScorer(const string& config = "") : StatisticsBasedScorer("PER",config) {} public:
virtual void setReferenceFiles(const vector<string>& referenceFiles); PerScorer(const string& config = "") : StatisticsBasedScorer("PER",config) {}
virtual void prepareStats(size_t sid, const string& text, ScoreStats& entry); virtual void setReferenceFiles(const vector<string>& referenceFiles);
virtual void prepareStats(size_t sid, const string& text, ScoreStats& entry);
virtual void whoami() {
cerr << "I AM PerScorer" << std::endl;
}
size_t NumberOfScores() const { cerr << "PerScorer: 3" << endl; return 3; };
bool useAlignment() const {return false;};
virtual void whoami() {
protected: cerr << "I AM PerScorer" << std::endl;
}
virtual float calculateScore(const vector<float>& comps) ;
size_t NumberOfScores() const {
private: cerr << "PerScorer: 3" << endl;
return 3;
//no copy };
PerScorer(const PerScorer&); bool useAlignment() const {
~PerScorer(){}; return false;
PerScorer& operator=(const PerScorer&); };
// data extracted from reference files
vector<size_t> _reflengths; protected:
vector<multiset<int> > _reftokens;
virtual float calculateScore(const vector<float>& comps) ;
private:
//no copy
PerScorer(const PerScorer&);
~PerScorer() {};
PerScorer& operator=(const PerScorer&);
// data extracted from reference files
vector<size_t> _reflengths;
vector<multiset<int> > _reftokens;
}; };
#endif //__PERSCORER_H #endif //__PERSCORER_H

View File

@ -16,27 +16,26 @@ using namespace std;
Permutation::Permutation(const string &alignment, const int sourceLength, const int targetLength ) Permutation::Permutation(const string &alignment, const int sourceLength, const int targetLength )
{ {
if (sourceLength > 0) if (sourceLength > 0) {
{ set(alignment, sourceLength);
set(alignment, sourceLength); }
} m_targetLength = targetLength;
m_targetLength = targetLength;
} }
size_t Permutation::getLength() const size_t Permutation::getLength() const
{ {
return int(m_array.size()); return int(m_array.size());
} }
void Permutation::dump() const void Permutation::dump() const
{ {
int j=0; int j=0;
for (vector<int>::const_iterator i = m_array.begin(); i !=m_array.end(); i++){ for (vector<int>::const_iterator i = m_array.begin(); i !=m_array.end(); i++) {
cout << "("; cout << "(";
cout << j << ":" << *i ; cout << j << ":" << *i ;
cout << "), "; cout << "), ";
j++; j++;
} }
cout << endl; cout << endl;
} }
@ -49,286 +48,272 @@ void Permutation::dump() const
void Permutation::set(const string & alignment,const int sourceLength) void Permutation::set(const string & alignment,const int sourceLength)
{ {
//cout << "******** Permutation::set :" << alignment << ": len : " << sourceLength <<endl; //cout << "******** Permutation::set :" << alignment << ": len : " << sourceLength <<endl;
if(sourceLength <= 0) if(sourceLength <= 0) {
{ //not found
//not found cerr << "Source sentence length not positive:"<< sourceLength << endl;
cerr << "Source sentence length not positive:"<< sourceLength << endl; exit(0);
exit(0); }
if (alignment.length() <= 0) {
//alignment empty - could happen but not good
cerr << "Alignment string empty:"<< alignment << endl;
}
//Tokenise on whitespace
string buf; // Have a buffer string
stringstream ss(alignment); // Insert the string into a stream
vector<string> tokens; // Create vector to hold our words
while (ss >> buf)
tokens.push_back(buf);
vector<int> tempPerm(sourceLength, -1);
//Set tempPerm to have one target position per source position
for (size_t i=0; i<tokens.size(); i++) {
string temp = tokens[i];
int posDelimeter = temp.find("-");
if(posDelimeter == int(string::npos)) {
cerr << "Delimiter not found - :"<< tokens[i] << endl;
exit(1);
} }
int sourcePos = atoi((temp.substr(0, posDelimeter)).c_str());
if (alignment.length() <= 0) int targetPos = atoi((temp.substr(posDelimeter+1)).c_str());
{ //cout << "SP:" << sourcePos << " TP:" << targetPos << endl;
//alignment empty - could happen but not good if (sourcePos > sourceLength) {
cerr << "Alignment string empty:"<< alignment << endl; cerr << "Source sentence length:" << sourceLength << " is smaller than alignment source position:" << sourcePos << endl;
exit(1);
} }
//If have multiple target pos aligned to one source,
//Tokenise on whitespace // then ignore all but first alignment
string buf; // Have a buffer string if (tempPerm[sourcePos] == -1 || tempPerm[sourcePos] > targetPos) {
stringstream ss(alignment); // Insert the string into a stream tempPerm[sourcePos] = targetPos;
vector<string> tokens; // Create vector to hold our words
while (ss >> buf)
tokens.push_back(buf);
vector<int> tempPerm(sourceLength, -1);
//Set tempPerm to have one target position per source position
for (size_t i=0; i<tokens.size(); i++) {
string temp = tokens[i];
int posDelimeter = temp.find("-");
if(posDelimeter == int(string::npos)) {
cerr << "Delimiter not found - :"<< tokens[i] << endl;
exit(1);
}
int sourcePos = atoi((temp.substr(0, posDelimeter)).c_str());
int targetPos = atoi((temp.substr(posDelimeter+1)).c_str());
//cout << "SP:" << sourcePos << " TP:" << targetPos << endl;
if (sourcePos > sourceLength) {
cerr << "Source sentence length:" << sourceLength << " is smaller than alignment source position:" << sourcePos << endl;
exit(1);
}
//If have multiple target pos aligned to one source,
// then ignore all but first alignment
if (tempPerm[sourcePos] == -1 || tempPerm[sourcePos] > targetPos)
{
tempPerm[sourcePos] = targetPos;
}
} }
}
//TODO //TODO
//Set final permutation in m_array //Set final permutation in m_array
//Take care of: source - null //Take care of: source - null
// multiple_source - one target // multiple_source - one target
// unaligned target // unaligned target
// Input: 1-9 2-1 4-3 4-4 5-6 6-6 7-6 8-8 // Input: 1-9 2-1 4-3 4-4 5-6 6-6 7-6 8-8
// Convert source: 1 2 3 4 5 6 7 8 // Convert source: 1 2 3 4 5 6 7 8
// target: 9 1 -1 3 6 6 6 8 -> 8 1 2 3 4 5 6 7 // target: 9 1 -1 3 6 6 6 8 -> 8 1 2 3 4 5 6 7
// 1st step: Add null aligned source to previous alignment // 1st step: Add null aligned source to previous alignment
// target: 9 1 -1 3 6 6 6 8 -> 9 1 1 3 6 6 6 8 // target: 9 1 -1 3 6 6 6 8 -> 9 1 1 3 6 6 6 8
int last=0; int last=0;
m_array.assign(sourceLength, -1); m_array.assign(sourceLength, -1);
//get a searcheable index //get a searcheable index
multimap<int, int> invMap; multimap<int, int> invMap;
multimap<int, int>::iterator it; multimap<int, int>::iterator it;
//cout << " SourceP -> TargetP " << endl; //cout << " SourceP -> TargetP " << endl;
for (size_t i=0; i<tempPerm.size(); i++) for (size_t i=0; i<tempPerm.size(); i++) {
{ if (tempPerm[i] == -1) {
if (tempPerm[i] == -1) { tempPerm[i] = last;
tempPerm[i] = last; } else {
} else { last = tempPerm[i];
last = tempPerm[i];
}
//cout << i << " -> " << tempPerm[i] << endl;
//Key is target pos, value is source pos
invMap.insert(pair<int,int>(tempPerm[i],int(i)));
} }
//cout << i << " -> " << tempPerm[i] << endl;
//Key is target pos, value is source pos
invMap.insert(pair<int,int>(tempPerm[i],int(i)));
}
// 2nd step: Get target into index of multimap and sort // 2nd step: Get target into index of multimap and sort
// Convert source: 1 2 3 4 5 6 7 8 // Convert source: 1 2 3 4 5 6 7 8
// target: 9 1 0 3 6 6 6 8 -> 0 1 3 6 6 6 8 9 // target: 9 1 0 3 6 6 6 8 -> 0 1 3 6 6 6 8 9
// source: 3 2 4 5 6 7 8 1 // source: 3 2 4 5 6 7 8 1
int i=0; int i=0;
//cout << " TargetP => SourceP : TargetIndex " << endl; //cout << " TargetP => SourceP : TargetIndex " << endl;
for ( it=invMap.begin() ; it != invMap.end(); it++ ) for ( it=invMap.begin() ; it != invMap.end(); it++ ) {
{ //cout << (*it).first << " => " << (*it).second << " : " << i << endl;
//cout << (*it).first << " => " << (*it).second << " : " << i << endl; //find source position
//find source position m_array[(*it).second] = i;
m_array[(*it).second] = i; i++;
i++; }
}
bool ok = checkValidPermutation(m_array); bool ok = checkValidPermutation(m_array);
//dump(); //dump();
if (!ok) { if (!ok) {
throw runtime_error(" Created invalid permutation"); throw runtime_error(" Created invalid permutation");
} }
} }
//Static //Static
vector<int> Permutation::invert(const vector<int> & inVector) vector<int> Permutation::invert(const vector<int> & inVector)
{ {
vector<int> outVector(inVector.size()); vector<int> outVector(inVector.size());
for (size_t i=0; i<inVector.size(); i++){ for (size_t i=0; i<inVector.size(); i++) {
outVector[inVector[i]] = int(i); outVector[inVector[i]] = int(i);
} }
return outVector; return outVector;
} }
//Static //Static
//Permutations start at 0 //Permutations start at 0
bool Permutation::checkValidPermutation(vector<int> const & inVector) bool Permutation::checkValidPermutation(vector<int> const & inVector)
{ {
vector<int> test(inVector.size(),-1); vector<int> test(inVector.size(),-1);
for (size_t i=0; i< inVector.size(); i++){ for (size_t i=0; i< inVector.size(); i++) {
//No multiple entries of same value allowed //No multiple entries of same value allowed
if (test[inVector[i]] > -1){ if (test[inVector[i]] > -1) {
cerr << "Permutation error: multiple entries of same value\n" << endl; cerr << "Permutation error: multiple entries of same value\n" << endl;
return false; return false;
}
test[inVector[i]] ++;
} }
for (size_t i=0; i<inVector.size(); i++){ test[inVector[i]] ++;
//No holes allowed }
if (test[inVector[i]] == -1) { for (size_t i=0; i<inVector.size(); i++) {
cerr << "Permutation error: missing values\n" << endl; //No holes allowed
return false; if (test[inVector[i]] == -1) {
} cerr << "Permutation error: missing values\n" << endl;
return false;
} }
return true; }
return true;
} }
//TODO default to HAMMING //TODO default to HAMMING
//Note: it returns the distance that is not normalised //Note: it returns the distance that is not normalised
float Permutation::distance(const Permutation &permCompare, const distanceMetric_t &type) const float Permutation::distance(const Permutation &permCompare, const distanceMetric_t &type) const
{ {
float score=0; float score=0;
//cout << "*****Permutation::distance" <<endl;
//cout << "Ref:" << endl;
//dump();
//cout << "Comp:" << endl;
//permCompare.dump();
if (type == HAMMING_DISTANCE) { //cout << "*****Permutation::distance" <<endl;
score = calculateHamming(permCompare); //cout << "Ref:" << endl;
} else if (type == KENDALL_DISTANCE) { //dump();
score = calculateKendall(permCompare); //cout << "Comp:" << endl;
} else { //permCompare.dump();
throw runtime_error("Distance type not valid");
}
float brevityPenalty = 1.0 - (float) permCompare.getTargetLength()/getTargetLength() ;//reflength divided by trans length
if (brevityPenalty < 0.0) {
score = score * exp(brevityPenalty);
}
//cout << "Distance type:" << type << endl; if (type == HAMMING_DISTANCE) {
//cout << "Score: "<< score << endl; score = calculateHamming(permCompare);
return score; } else if (type == KENDALL_DISTANCE) {
score = calculateKendall(permCompare);
} else {
throw runtime_error("Distance type not valid");
}
float brevityPenalty = 1.0 - (float) permCompare.getTargetLength()/getTargetLength() ;//reflength divided by trans length
if (brevityPenalty < 0.0) {
score = score * exp(brevityPenalty);
}
//cout << "Distance type:" << type << endl;
//cout << "Score: "<< score << endl;
return score;
} }
float Permutation::calculateHamming(const Permutation & compare) const float Permutation::calculateHamming(const Permutation & compare) const
{ {
float score=0; float score=0;
vector<int> compareArray = compare.getArray(); vector<int> compareArray = compare.getArray();
if (getLength() != compare.getLength()) { if (getLength() != compare.getLength()) {
cerr << "1stperm: " << getLength() << " 2ndperm: " << compare.getLength() << endl; cerr << "1stperm: " << getLength() << " 2ndperm: " << compare.getLength() << endl;
throw runtime_error("Length of permutations not equal"); throw runtime_error("Length of permutations not equal");
}
if (getLength() == 0) {
cerr << "Empty permutation" << endl;
return 0;
}
for (size_t i=0; i<getLength(); i++) {
if (m_array[i] != compareArray[i]) {
score++;
} }
if (getLength() == 0) {
cerr << "Empty permutation" << endl;
return 0;
}
for (size_t i=0; i<getLength(); i++)
{
if (m_array[i] != compareArray[i])
{
score++;
}
} }
score = 1 - (score / getLength()); score = 1 - (score / getLength());
return score; return score;
} }
float Permutation::calculateKendall(const Permutation & compare) const float Permutation::calculateKendall(const Permutation & compare) const
{ {
float score=0; float score=0;
vector<int> compareArray = compare.getArray(); vector<int> compareArray = compare.getArray();
if (getLength() != compare.getLength()) { if (getLength() != compare.getLength()) {
cerr << "1stperm: " << getLength() << " 2ndperm: " << compare.getLength() << endl; cerr << "1stperm: " << getLength() << " 2ndperm: " << compare.getLength() << endl;
throw runtime_error("Length of permutations not equal"); throw runtime_error("Length of permutations not equal");
}
if (getLength() == 0) {
cerr << "Empty permutation" << endl;
return 0;
}
for (size_t i=0; i<getLength(); i++) {
for (size_t j=0; j<getLength(); j++) {
if ((m_array[i] < m_array[j]) && (compareArray[i] > compareArray[j])) {
score++;
}
} }
if (getLength() == 0) { }
cerr << "Empty permutation" << endl; score = (score / ((getLength()*getLength() - getLength()) /2 ) );
return 0; //Adjusted Kendall's tau correlates better with human judgements
} score = sqrt (score);
for (size_t i=0; i<getLength(); i++) score = 1 - score;
{
for (size_t j=0; j<getLength(); j++)
{
if ((m_array[i] < m_array[j]) && (compareArray[i] > compareArray[j]))
{
score++;
}
}
}
score = (score / ((getLength()*getLength() - getLength()) /2 ) );
//Adjusted Kendall's tau correlates better with human judgements
score = sqrt (score);
score = 1 - score;
return score; return score;
} }
vector<int> Permutation::getArray() const vector<int> Permutation::getArray() const
{ {
vector<int> ret = m_array; vector<int> ret = m_array;
return ret; return ret;
} }
//Static //Static
//This function is called with test which is //This function is called with test which is
// the 5th field in moses nbest output when called with -include-alignment-in-n-best // the 5th field in moses nbest output when called with -include-alignment-in-n-best
//eg. 0=0 1-2=1-2 3=3 4=4 5=5 6=6 7-9=7-8 10=9 11-13=10-11 (source-target) //eg. 0=0 1-2=1-2 3=3 4=4 5=5 6=6 7-9=7-8 10=9 11-13=10-11 (source-target)
string Permutation::convertMosesToStandard(string const & alignment) string Permutation::convertMosesToStandard(string const & alignment)
{ {
if (alignment.length() == 0) if (alignment.length() == 0) {
{
cerr << "Alignment input string empty" << endl; cerr << "Alignment input string empty" << endl;
} }
string working = alignment; string working = alignment;
string out; string out;
stringstream oss; stringstream oss;
while (working.length() > 0) while (working.length() > 0) {
{ string align;
string align; getNextPound(working,align," ");
getNextPound(working,align," ");
//If found an alignment //If found an alignment
if (align.length() > 0) if (align.length() > 0) {
{ size_t posDelimeter = align.find("=");
size_t posDelimeter = align.find("="); if(posDelimeter== string::npos) {
if(posDelimeter== string::npos) cerr << "Delimiter not found = :"<< align << endl;
{ exit(0);
cerr << "Delimiter not found = :"<< align << endl; }
exit(0); int firstSourcePos,lastSourcePos,firstTargetPos,lastTargetPos;
} string sourcePoss = align.substr(0, posDelimeter);
int firstSourcePos,lastSourcePos,firstTargetPos,lastTargetPos; string targetPoss = align.substr(posDelimeter+1);
string sourcePoss = align.substr(0, posDelimeter); posDelimeter = sourcePoss.find("-");
string targetPoss = align.substr(posDelimeter+1); if(posDelimeter < string::npos) {
posDelimeter = sourcePoss.find("-"); firstSourcePos = atoi((sourcePoss.substr(0, posDelimeter)).c_str());
if(posDelimeter < string::npos) { lastSourcePos = atoi((sourcePoss.substr(posDelimeter+1)).c_str());
firstSourcePos = atoi((sourcePoss.substr(0, posDelimeter)).c_str()); } else {
lastSourcePos = atoi((sourcePoss.substr(posDelimeter+1)).c_str()); firstSourcePos = atoi(sourcePoss.c_str());
} else { lastSourcePos = firstSourcePos;
firstSourcePos = atoi(sourcePoss.c_str()); }
lastSourcePos = firstSourcePos; posDelimeter = targetPoss.find("-");
} if(posDelimeter < string::npos) {
posDelimeter = targetPoss.find("-"); firstTargetPos = atoi((targetPoss.substr(0, posDelimeter)).c_str());
if(posDelimeter < string::npos) { lastTargetPos = atoi((targetPoss.substr(posDelimeter+1)).c_str());
firstTargetPos = atoi((targetPoss.substr(0, posDelimeter)).c_str()); } else {
lastTargetPos = atoi((targetPoss.substr(posDelimeter+1)).c_str()); firstTargetPos = atoi(targetPoss.c_str());
} else { lastTargetPos = firstTargetPos;
firstTargetPos = atoi(targetPoss.c_str()); }
lastTargetPos = firstTargetPos; for (int i = firstSourcePos; i <= lastSourcePos; i++) {
} for (int j = firstTargetPos; j <= lastTargetPos; j++) {
for (int i = firstSourcePos; i <= lastSourcePos; i++) { oss << i << "-" << j << " ";
for (int j = firstTargetPos; j <= lastTargetPos; j++) { }
oss << i << "-" << j << " "; }
}
}
} //else case where two spaces ? } //else case where two spaces ?
} }
out = oss.str(); out = oss.str();
//cout << "ConverttoStandard: " << out << endl; //cout << "ConverttoStandard: " << out << endl;
return out; return out;
} }

View File

@ -20,41 +20,45 @@
class Permutation class Permutation
{ {
public: public:
//Can be HAMMING_DISTANCE or KENDALLS_DISTANCE //Can be HAMMING_DISTANCE or KENDALLS_DISTANCE
Permutation(const std::string &alignment = std::string(), const int sourceLength = 0, const int targetLength = 0 ); Permutation(const std::string &alignment = std::string(), const int sourceLength = 0, const int targetLength = 0 );
~Permutation(){}; ~Permutation() {};
inline void clear() { m_array.clear(); } inline void clear() {
inline size_t size(){ return m_array.size(); } m_array.clear();
}
inline size_t size() {
return m_array.size();
}
void set(const std::string &alignment,const int sourceLength); void set(const std::string &alignment,const int sourceLength);
float distance(const Permutation &permCompare, const distanceMetric_t &strategy = HAMMING_DISTANCE) const; float distance(const Permutation &permCompare, const distanceMetric_t &strategy = HAMMING_DISTANCE) const;
//Const //Const
void dump() const; void dump() const;
size_t getLength() const; size_t getLength() const;
vector<int> getArray() const; vector<int> getArray() const;
int getTargetLength() const { int getTargetLength() const {
return m_targetLength; return m_targetLength;
} }
//Static //Static
static std::string convertMosesToStandard(std::string const & alignment); static std::string convertMosesToStandard(std::string const & alignment);
static vector<int> invert(vector<int> const & inVector); static vector<int> invert(vector<int> const & inVector);
static bool checkValidPermutation(vector<int> const & inVector); static bool checkValidPermutation(vector<int> const & inVector);
protected: protected:
vector<int> m_array; vector<int> m_array;
int m_targetLength; int m_targetLength;
float calculateHamming(const Permutation & compare) const; float calculateHamming(const Permutation & compare) const;
float calculateKendall(const Permutation & compare) const; float calculateKendall(const Permutation & compare) const;
private: private:
}; };

View File

@ -4,215 +4,212 @@ using namespace std;
const int PermutationScorer::SCORE_PRECISION = 5; const int PermutationScorer::SCORE_PRECISION = 5;
PermutationScorer::PermutationScorer(const string &distanceMetric, const string &config) PermutationScorer::PermutationScorer(const string &distanceMetric, const string &config)
:SentenceLevelScorer(distanceMetric,config) :SentenceLevelScorer(distanceMetric,config)
{ {
//configure regularisation //configure regularisation
static string KEY_REFCHOICE = "refchoice"; static string KEY_REFCHOICE = "refchoice";
static string REFCHOICE_AVERAGE = "average"; static string REFCHOICE_AVERAGE = "average";
static string REFCHOICE_CLOSEST = "closest"; static string REFCHOICE_CLOSEST = "closest";
string refchoice = getConfig(KEY_REFCHOICE,REFCHOICE_CLOSEST);
if (refchoice == REFCHOICE_AVERAGE) {
m_refChoiceStrategy = REFERENCE_CHOICE_AVERAGE;
} else if (refchoice == REFCHOICE_CLOSEST) {
m_refChoiceStrategy = REFERENCE_CHOICE_CLOSEST;
} else {
throw runtime_error("Unknown reference choice strategy: " + refchoice);
}
cerr << "Using reference choice strategy: " << refchoice << endl;
if (distanceMetric.compare("HAMMING") == 0) { string refchoice = getConfig(KEY_REFCHOICE,REFCHOICE_CLOSEST);
m_distanceMetric = HAMMING_DISTANCE; if (refchoice == REFCHOICE_AVERAGE) {
} else if (distanceMetric.compare("KENDALL") == 0) { m_refChoiceStrategy = REFERENCE_CHOICE_AVERAGE;
m_distanceMetric = KENDALL_DISTANCE; } else if (refchoice == REFCHOICE_CLOSEST) {
} m_refChoiceStrategy = REFERENCE_CHOICE_CLOSEST;
cerr << "Using permutation distance metric: " << distanceMetric << endl; } else {
throw runtime_error("Unknown reference choice strategy: " + refchoice);
}
cerr << "Using reference choice strategy: " << refchoice << endl;
//Get reference alignments from scconfig refalign option if (distanceMetric.compare("HAMMING") == 0) {
static string KEY_ALIGNMENT_FILES = "refalign"; m_distanceMetric = HAMMING_DISTANCE;
string refalign = getConfig(KEY_ALIGNMENT_FILES,""); } else if (distanceMetric.compare("KENDALL") == 0) {
//cout << refalign << endl; m_distanceMetric = KENDALL_DISTANCE;
if (refalign.length() > 0){ }
string substring; cerr << "Using permutation distance metric: " << distanceMetric << endl;
while (!refalign.empty()){
getNextPound(refalign, substring, "+");
m_referenceAlignments.push_back(substring);
}
}
//Get length of source sentences read in from scconfig source option //Get reference alignments from scconfig refalign option
// this is essential for extractor but unneccesary for mert executable static string KEY_ALIGNMENT_FILES = "refalign";
static string KEY_SOURCE_FILE = "source"; string refalign = getConfig(KEY_ALIGNMENT_FILES,"");
string sourceFile = getConfig(KEY_SOURCE_FILE,""); //cout << refalign << endl;
if (sourceFile.length() > 0) { if (refalign.length() > 0) {
cerr << "Loading source sentence lengths from " << sourceFile << endl; string substring;
ifstream sourcein(sourceFile.c_str()); while (!refalign.empty()) {
if (!sourcein) { getNextPound(refalign, substring, "+");
throw runtime_error("Unable to open: " + sourceFile); m_referenceAlignments.push_back(substring);
}
string line;
while (getline(sourcein,line)) {
size_t wordNumber = 0;
string word;
while(!line.empty()){
getNextPound(line, word, " ");
wordNumber++;
}
m_sourceLengths.push_back(wordNumber);
}
sourcein.close();
} }
}
//Get length of source sentences read in from scconfig source option
// this is essential for extractor but unneccesary for mert executable
static string KEY_SOURCE_FILE = "source";
string sourceFile = getConfig(KEY_SOURCE_FILE,"");
if (sourceFile.length() > 0) {
cerr << "Loading source sentence lengths from " << sourceFile << endl;
ifstream sourcein(sourceFile.c_str());
if (!sourcein) {
throw runtime_error("Unable to open: " + sourceFile);
}
string line;
while (getline(sourcein,line)) {
size_t wordNumber = 0;
string word;
while(!line.empty()) {
getNextPound(line, word, " ");
wordNumber++;
}
m_sourceLengths.push_back(wordNumber);
}
sourcein.close();
}
} }
void PermutationScorer::setReferenceFiles(const vector<string>& referenceFiles) { void PermutationScorer::setReferenceFiles(const vector<string>& referenceFiles)
cout << "*******setReferenceFiles" << endl; {
//make sure reference data is clear cout << "*******setReferenceFiles" << endl;
m_referencePerms.clear(); //make sure reference data is clear
m_referencePerms.clear();
vector< vector< int> > targetLengths; vector< vector< int> > targetLengths;
//Just getting target length from reference text file //Just getting target length from reference text file
for (size_t i = 0; i < referenceFiles.size(); ++i) for (size_t i = 0; i < referenceFiles.size(); ++i) {
{ vector <int> lengths;
vector <int> lengths; cout << "Loading reference from " << referenceFiles[i] << endl;
cout << "Loading reference from " << referenceFiles[i] << endl; ifstream refin(referenceFiles[i].c_str());
ifstream refin(referenceFiles[i].c_str()); if (!refin) {
if (!refin) cerr << "Unable to open: " << referenceFiles[i] << endl;
{ throw runtime_error("Unable to open alignment file");
cerr << "Unable to open: " << referenceFiles[i] << endl;
throw runtime_error("Unable to open alignment file");
}
string line;
while (getline(refin,line))
{
int count = getNumberWords(line);
lengths.push_back(count);
}
targetLengths.push_back(lengths);
} }
string line;
while (getline(refin,line)) {
int count = getNumberWords(line);
lengths.push_back(count);
}
targetLengths.push_back(lengths);
}
//load reference data //load reference data
//NOTE ignoring normal reference file, only using previously saved alignment reference files //NOTE ignoring normal reference file, only using previously saved alignment reference files
for (size_t i = 0; i < m_referenceAlignments.size(); ++i) for (size_t i = 0; i < m_referenceAlignments.size(); ++i) {
{ vector<Permutation> referencePerms;
vector<Permutation> referencePerms; cout << "Loading reference from " << m_referenceAlignments[i] << endl;
cout << "Loading reference from " << m_referenceAlignments[i] << endl; ifstream refin(m_referenceAlignments[i].c_str());
ifstream refin(m_referenceAlignments[i].c_str()); if (!refin) {
if (!refin) cerr << "Unable to open: " << m_referenceAlignments[i] << endl;
{ throw runtime_error("Unable to open alignment file");
cerr << "Unable to open: " << m_referenceAlignments[i] << endl;
throw runtime_error("Unable to open alignment file");
}
string line;
size_t sid = 0; //sentence counter
while (getline(refin,line))
{
//cout << line << endl;
//Line needs to be of the format: 0-0 1-1 1-2 etc source-target
Permutation perm(line, m_sourceLengths[sid],targetLengths[i][sid]);
//perm.dump();
referencePerms.push_back(perm);
//check the source sentence length is the same for previous file
if (perm.getLength() != m_sourceLengths[sid])
{
cerr << "Permutation Length: " << perm.getLength() << endl;
cerr << "Source length: " << m_sourceLengths[sid] << " for sid " << sid << endl;
throw runtime_error("Source sentence lengths not the same: ");
}
sid++;
}
m_referencePerms.push_back(referencePerms);
} }
string line;
size_t sid = 0; //sentence counter
while (getline(refin,line)) {
//cout << line << endl;
//Line needs to be of the format: 0-0 1-1 1-2 etc source-target
Permutation perm(line, m_sourceLengths[sid],targetLengths[i][sid]);
//perm.dump();
referencePerms.push_back(perm);
//check the source sentence length is the same for previous file
if (perm.getLength() != m_sourceLengths[sid]) {
cerr << "Permutation Length: " << perm.getLength() << endl;
cerr << "Source length: " << m_sourceLengths[sid] << " for sid " << sid << endl;
throw runtime_error("Source sentence lengths not the same: ");
}
sid++;
}
m_referencePerms.push_back(referencePerms);
}
} }
int PermutationScorer::getNumberWords (const string& text) const { int PermutationScorer::getNumberWords (const string& text) const
int count = 0; {
string line = trimStr(text); int count = 0;
if (line.length()>0) { string line = trimStr(text);
int pos = line.find(" "); if (line.length()>0) {
while (pos!=int(string::npos)){ int pos = line.find(" ");
count++; while (pos!=int(string::npos)) {
pos = line.find(" ",pos+1); count++;
} pos = line.find(" ",pos+1);
count++;
} }
return count; count++;
}
return count;
} }
void PermutationScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry) { void PermutationScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
//cout << "*******prepareStats" ; {
//cout << text << endl; //cout << "*******prepareStats" ;
//cout << sid << endl; //cout << text << endl;
//cout << "Reference0align:" << endl; //cout << sid << endl;
//m_referencePerms[0][sid].dump(); //cout << "Reference0align:" << endl;
//m_referencePerms[0][sid].dump();
string sentence = ""; string sentence = "";
string align = text; string align = text;
size_t alignmentData = text.find("|||"); size_t alignmentData = text.find("|||");
//Get sentence and alignment parts //Get sentence and alignment parts
if(alignmentData != string::npos) { if(alignmentData != string::npos) {
getNextPound(align,sentence, "|||"); getNextPound(align,sentence, "|||");
} else { } else {
align = text; align = text;
}
int translationLength = getNumberWords(sentence);
//A vector of Permutations for each sentence
vector< vector<Permutation> > nBestPerms;
float distanceValue;
//need to create permutations for each nbest line
string standardFormat = Permutation::convertMosesToStandard(align);
Permutation perm(standardFormat, m_sourceLengths[sid],translationLength);
//perm.dump();
if (m_refChoiceStrategy == REFERENCE_CHOICE_AVERAGE) {
float total = 0;
for (size_t i = 0; i < m_referencePerms.size(); ++i) {
float dist = perm.distance(m_referencePerms[i][sid], m_distanceMetric);
total += dist;
//cout << "Ref number: " << i << " distance: " << dist << endl;
} }
int translationLength = getNumberWords(sentence); float mean = (float)total/m_referencePerms.size();
//cout << "MultRef strategy AVERAGE: total " << total << " mean " << mean << " number " << m_referencePerms.size() << endl;
distanceValue = mean;
} else if (m_refChoiceStrategy == REFERENCE_CHOICE_CLOSEST) {
float max_val = 0;
for (size_t i = 0; i < m_referencePerms.size(); ++i) {
//A vector of Permutations for each sentence //look for the closest reference
vector< vector<Permutation> > nBestPerms; float value = perm.distance(m_referencePerms[i][sid], m_distanceMetric);
float distanceValue; //cout << "Ref number: " << i << " distance: " << value << endl;
if (value > max_val) {
//need to create permutations for each nbest line max_val = value;
string standardFormat = Permutation::convertMosesToStandard(align); }
Permutation perm(standardFormat, m_sourceLengths[sid],translationLength);
//perm.dump();
if (m_refChoiceStrategy == REFERENCE_CHOICE_AVERAGE) {
float total = 0;
for (size_t i = 0; i < m_referencePerms.size(); ++i) {
float dist = perm.distance(m_referencePerms[i][sid], m_distanceMetric);
total += dist;
//cout << "Ref number: " << i << " distance: " << dist << endl;
}
float mean = (float)total/m_referencePerms.size();
//cout << "MultRef strategy AVERAGE: total " << total << " mean " << mean << " number " << m_referencePerms.size() << endl;
distanceValue = mean;
} else if (m_refChoiceStrategy == REFERENCE_CHOICE_CLOSEST) {
float max_val = 0;
for (size_t i = 0; i < m_referencePerms.size(); ++i) {
//look for the closest reference
float value = perm.distance(m_referencePerms[i][sid], m_distanceMetric);
//cout << "Ref number: " << i << " distance: " << value << endl;
if (value > max_val) {
max_val = value;
}
}
distanceValue = max_val;
//cout << "MultRef strategy CLOSEST: max_val " << distanceValue << endl;
} else {
throw runtime_error("Unsupported reflength strategy");
} }
distanceValue = max_val;
//cout << "MultRef strategy CLOSEST: max_val " << distanceValue << endl;
} else {
throw runtime_error("Unsupported reflength strategy");
}
//SCOREROUT eg: 0.04546 //SCOREROUT eg: 0.04546
ostringstream tempStream; ostringstream tempStream;
tempStream.precision(SCORE_PRECISION); tempStream.precision(SCORE_PRECISION);
tempStream << distanceValue; tempStream << distanceValue;
string str = tempStream.str(); string str = tempStream.str();
entry.set(str); entry.set(str);
//cout << tempStream.str(); //cout << tempStream.str();
} }
//Will just be final score //Will just be final score
statscore_t PermutationScorer::calculateScore(const vector<statscore_t>& comps) { statscore_t PermutationScorer::calculateScore(const vector<statscore_t>& comps)
//cerr << "*******PermutationScorer::calculateScore" ; {
//cerr << " " << comps[0] << endl; //cerr << "*******PermutationScorer::calculateScore" ;
return comps[0]; //cerr << " " << comps[0] << endl;
return comps[0];
} }

View File

@ -17,44 +17,44 @@
#include "Permutation.h" #include "Permutation.h"
/** /**
* Permutation * Permutation
**/ **/
class PermutationScorer: public SentenceLevelScorer class PermutationScorer: public SentenceLevelScorer
{ {
public: public:
PermutationScorer(const string &distanceMetric = "HAMMING", PermutationScorer(const string &distanceMetric = "HAMMING",
const string &config = string()); const string &config = string());
void setReferenceFiles(const vector<string>& referenceFiles); void setReferenceFiles(const vector<string>& referenceFiles);
void prepareStats(size_t sid, const string& text, ScoreStats& entry); void prepareStats(size_t sid, const string& text, ScoreStats& entry);
static const int SCORE_PRECISION; static const int SCORE_PRECISION;
size_t NumberOfScores() const {
//cerr << "PermutationScorer number of scores: 1" << endl;
return 1;
};
bool useAlignment() const {
//cout << "PermutationScorer::useAlignment returning true" << endl;
return true;
};
protected:
statscore_t calculateScore(const vector<statscore_t>& scores);
PermutationScorer(const PermutationScorer&);
~PermutationScorer(){};
PermutationScorer& operator=(const PermutationScorer&);
int getNumberWords (const string & line) const;
distanceMetricReferenceChoice_t m_refChoiceStrategy; size_t NumberOfScores() const {
distanceMetric_t m_distanceMetric; //cerr << "PermutationScorer number of scores: 1" << endl;
return 1;
// data extracted from reference files };
// A vector of permutations for each reference file bool useAlignment() const {
vector< vector<Permutation> > m_referencePerms; //cout << "PermutationScorer::useAlignment returning true" << endl;
vector<size_t> m_sourceLengths; return true;
vector<string> m_referenceAlignments; };
private: protected:
statscore_t calculateScore(const vector<statscore_t>& scores);
PermutationScorer(const PermutationScorer&);
~PermutationScorer() {};
PermutationScorer& operator=(const PermutationScorer&);
int getNumberWords (const string & line) const;
distanceMetricReferenceChoice_t m_refChoiceStrategy;
distanceMetric_t m_distanceMetric;
// data extracted from reference files
// A vector of permutations for each reference file
vector< vector<Permutation> > m_referencePerms;
vector<size_t> m_sourceLengths;
vector<string> m_referenceAlignments;
private:
}; };
//TODO need to read in floats for scores - necessary for selecting mean reference strategy and for BLEU? //TODO need to read in floats for scores - necessary for selecting mean reference strategy and for BLEU?

View File

@ -10,22 +10,24 @@ vector<unsigned> Point::optindices;
unsigned Point::dim=0; unsigned Point::dim=0;
map<unsigned,statscore_t> Point::fixedweights; map<unsigned,statscore_t> Point::fixedweights;
unsigned Point::pdim=0; unsigned Point::pdim=0;
unsigned Point::ncall=0; unsigned Point::ncall=0;
void Point::Randomize(const vector<parameter_t>& min,const vector<parameter_t>& max){ void Point::Randomize(const vector<parameter_t>& min,const vector<parameter_t>& max)
{
assert(min.size()==Point::dim); assert(min.size()==Point::dim);
assert(max.size()==Point::dim); assert(max.size()==Point::dim);
for (unsigned int i=0; i<size(); i++) for (unsigned int i=0; i<size(); i++)
operator[](i)= min[i] + (float)random()/(float)RAND_MAX * (float)(max[i]-min[i]); operator[](i)= min[i] + (float)random()/(float)RAND_MAX * (float)(max[i]-min[i]);
} }
void Point::NormalizeL2(){ void Point::NormalizeL2()
{
parameter_t norm=0.0; parameter_t norm=0.0;
for (unsigned int i=0; i<size(); i++) for (unsigned int i=0; i<size(); i++)
norm+= operator[](i)*operator[](i); norm+= operator[](i)*operator[](i);
if(norm!=0.0){ if(norm!=0.0) {
norm=sqrt(norm); norm=sqrt(norm);
for (unsigned int i=0; i<size(); i++) for (unsigned int i=0; i<size(); i++)
operator[](i)/=norm; operator[](i)/=norm;
@ -33,22 +35,24 @@ void Point::NormalizeL2(){
} }
void Point::NormalizeL1(){ void Point::NormalizeL1()
{
parameter_t norm=0.0; parameter_t norm=0.0;
for (unsigned int i=0; i<size(); i++) for (unsigned int i=0; i<size(); i++)
norm+= abs(operator[](i)); norm+= abs(operator[](i));
if(norm!=0.0){ if(norm!=0.0) {
for (unsigned int i=0; i<size(); i++) for (unsigned int i=0; i<size(); i++)
operator[](i)/=norm; operator[](i)/=norm;
} }
} }
//Can initialize from a vector of dim or pdim //Can initialize from a vector of dim or pdim
Point::Point(const vector<parameter_t>& init):vector<parameter_t>(Point::dim){ Point::Point(const vector<parameter_t>& init):vector<parameter_t>(Point::dim)
if(init.size()==dim){ {
if(init.size()==dim) {
for (unsigned int i=0; i<Point::dim; i++) for (unsigned int i=0; i<Point::dim; i++)
operator[](i)=init[i]; operator[](i)=init[i];
}else{ } else {
assert(init.size()==pdim); assert(init.size()==pdim);
for (unsigned int i=0; i<Point::dim; i++) for (unsigned int i=0; i<Point::dim; i++)
operator[](i)=init[optindices[i]]; operator[](i)=init[optindices[i]];
@ -56,59 +60,64 @@ Point::Point(const vector<parameter_t>& init):vector<parameter_t>(Point::dim){
}; };
double Point::operator*(const FeatureStats& F)const{ double Point::operator*(const FeatureStats& F)const
{
ncall++;//to track performance ncall++;//to track performance
double prod=0.0; double prod=0.0;
if(OptimizeAll()) if(OptimizeAll())
for (unsigned i=0; i<size(); i++) for (unsigned i=0; i<size(); i++)
prod+= operator[](i)*F.get(i); prod+= operator[](i)*F.get(i);
else{ else {
for (unsigned i=0; i<size(); i++) for (unsigned i=0; i<size(); i++)
prod+= operator[](i)*F.get(optindices[i]); prod+= operator[](i)*F.get(optindices[i]);
for(map<unsigned,float >::iterator it=fixedweights.begin();it!=fixedweights.end();it++) for(map<unsigned,float >::iterator it=fixedweights.begin(); it!=fixedweights.end(); it++)
prod+=it->second*F.get(it->first); prod+=it->second*F.get(it->first);
} }
return prod; return prod;
} }
Point Point::operator+(const Point& p2)const{ Point Point::operator+(const Point& p2)const
{
assert(p2.size()==size()); assert(p2.size()==size());
Point Res(*this); Point Res(*this);
for(unsigned i=0;i<size();i++) for(unsigned i=0; i<size(); i++)
Res[i]+=p2[i]; Res[i]+=p2[i];
Res.score=numeric_limits<statscore_t>::max(); Res.score=numeric_limits<statscore_t>::max();
return Res; return Res;
}; };
Point Point::operator*(float l)const{ Point Point::operator*(float l)const
{
Point Res(*this); Point Res(*this);
for(unsigned i=0;i<size();i++) for(unsigned i=0; i<size(); i++)
Res[i]*=l; Res[i]*=l;
Res.score=numeric_limits<statscore_t>::max(); Res.score=numeric_limits<statscore_t>::max();
return Res; return Res;
}; };
ostream& operator<<(ostream& o,const Point& P){ ostream& operator<<(ostream& o,const Point& P)
vector<parameter_t> w=P.GetAllWeights(); {
vector<parameter_t> w=P.GetAllWeights();
// o << "[" << Point::pdim << "] "; // o << "[" << Point::pdim << "] ";
for(unsigned int i=0;i<Point::pdim;i++) for(unsigned int i=0; i<Point::pdim; i++)
o << w[i] << " "; o << w[i] << " ";
// o << "=> " << P.GetScore(); // o << "=> " << P.GetScore();
return o; return o;
}; };
vector<parameter_t> Point::GetAllWeights()const{ vector<parameter_t> Point::GetAllWeights()const
{
vector<parameter_t> w; vector<parameter_t> w;
if(OptimizeAll()){ if(OptimizeAll()) {
w=*this; w=*this;
}else{ } else {
w.resize(pdim); w.resize(pdim);
for (unsigned int i=0; i<size(); i++) for (unsigned int i=0; i<size(); i++)
w[optindices[i]]=operator[](i); w[optindices[i]]=operator[](i);
for(map<unsigned,float >::iterator it=fixedweights.begin();it!=fixedweights.end();it++) for(map<unsigned,float >::iterator it=fixedweights.begin(); it!=fixedweights.end(); it++)
w[it->first]=it->second; w[it->first]=it->second;
} }
return w; return w;
}; };

View File

@ -10,9 +10,10 @@ class Optimizer;
/**class that handle the subset of the Feature weight on which we run the optimization*/ /**class that handle the subset of the Feature weight on which we run the optimization*/
class Point:public vector<parameter_t>{ class Point:public vector<parameter_t>
{
friend class Optimizer; friend class Optimizer;
private: private:
/**The indices over which we optimize*/ /**The indices over which we optimize*/
static vector<unsigned int> optindices; static vector<unsigned int> optindices;
/**dimension of optindices and of the parent vector*/ /**dimension of optindices and of the parent vector*/
@ -22,12 +23,18 @@ class Point:public vector<parameter_t>{
/**total size of the parameter space; we have pdim=FixedWeight.size()+optinidices.size()*/ /**total size of the parameter space; we have pdim=FixedWeight.size()+optinidices.size()*/
static unsigned int pdim; static unsigned int pdim;
static unsigned int ncall; static unsigned int ncall;
public: public:
static unsigned int getdim(){return dim;} static unsigned int getdim() {
static unsigned int getpdim(){return pdim;} return dim;
static bool OptimizeAll(){return fixedweights.empty();}; }
static unsigned int getpdim() {
return pdim;
}
static bool OptimizeAll() {
return fixedweights.empty();
};
statscore_t score; statscore_t score;
Point():vector<parameter_t>(dim){}; Point():vector<parameter_t>(dim) {};
Point(const vector<parameter_t>& init); Point(const vector<parameter_t>& init);
void Randomize(const vector<parameter_t>& min,const vector<parameter_t>& max); void Randomize(const vector<parameter_t>& min,const vector<parameter_t>& max);
@ -36,12 +43,16 @@ class Point:public vector<parameter_t>{
Point operator*(float)const; Point operator*(float)const;
/**write the Whole featureweight to a stream (ie pdim float)*/ /**write the Whole featureweight to a stream (ie pdim float)*/
friend ostream& operator<<(ostream& o,const Point& P); friend ostream& operator<<(ostream& o,const Point& P);
void Normalize(){ NormalizeL2(); }; void Normalize() {
NormalizeL2();
};
void NormalizeL2(); void NormalizeL2();
void NormalizeL1(); void NormalizeL1();
/**return a vector of size pdim where all weights have been put(including fixed ones)*/ /**return a vector of size pdim where all weights have been put(including fixed ones)*/
vector<parameter_t> GetAllWeights()const; vector<parameter_t> GetAllWeights()const;
statscore_t GetScore()const { return score; }; statscore_t GetScore()const {
return score;
};
}; };
#endif #endif

View File

@ -15,134 +15,134 @@ ScoreArray::ScoreArray(): idx("")
void ScoreArray::savetxt(std::ofstream& outFile, const std::string& sctype) void ScoreArray::savetxt(std::ofstream& outFile, const std::string& sctype)
{ {
outFile << SCORES_TXT_BEGIN << " " << idx << " " << array_.size() outFile << SCORES_TXT_BEGIN << " " << idx << " " << array_.size()
<< " " << number_of_scores << " " << sctype << std::endl; << " " << number_of_scores << " " << sctype << std::endl;
for (scorearray_t::iterator i = array_.begin(); i !=array_.end(); i++){ for (scorearray_t::iterator i = array_.begin(); i !=array_.end(); i++) {
i->savetxt(outFile); i->savetxt(outFile);
outFile << std::endl; outFile << std::endl;
} }
outFile << SCORES_TXT_END << std::endl; outFile << SCORES_TXT_END << std::endl;
} }
void ScoreArray::savebin(std::ofstream& outFile, const std::string& sctype) void ScoreArray::savebin(std::ofstream& outFile, const std::string& sctype)
{ {
outFile << SCORES_BIN_BEGIN << " " << idx << " " << array_.size() outFile << SCORES_BIN_BEGIN << " " << idx << " " << array_.size()
<< " " << number_of_scores << " " << sctype << std::endl; << " " << number_of_scores << " " << sctype << std::endl;
for (scorearray_t::iterator i = array_.begin(); i !=array_.end(); i++) for (scorearray_t::iterator i = array_.begin(); i !=array_.end(); i++)
i->savebin(outFile); i->savebin(outFile);
outFile << SCORES_BIN_END << std::endl; outFile << SCORES_BIN_END << std::endl;
} }
void ScoreArray::save(std::ofstream& inFile, const std::string& sctype, bool bin) void ScoreArray::save(std::ofstream& inFile, const std::string& sctype, bool bin)
{ {
if (size()>0) if (size()>0)
(bin)?savebin(inFile, sctype):savetxt(inFile, sctype); (bin)?savebin(inFile, sctype):savetxt(inFile, sctype);
} }
void ScoreArray::save(const std::string &file, const std::string& sctype, bool bin) void ScoreArray::save(const std::string &file, const std::string& sctype, bool bin)
{ {
std::ofstream outFile(file.c_str(), std::ios::out); // matches a stream with a file. Opens the file std::ofstream outFile(file.c_str(), std::ios::out); // matches a stream with a file. Opens the file
save(outFile, sctype, bin); save(outFile, sctype, bin);
outFile.close(); outFile.close();
} }
void ScoreArray::loadbin(ifstream& inFile, size_t n) void ScoreArray::loadbin(ifstream& inFile, size_t n)
{ {
ScoreStats entry(number_of_scores); ScoreStats entry(number_of_scores);
for (size_t i=0 ; i < n; i++){ for (size_t i=0 ; i < n; i++) {
entry.loadbin(inFile); entry.loadbin(inFile);
add(entry); add(entry);
} }
} }
void ScoreArray::loadtxt(ifstream& inFile, size_t n) void ScoreArray::loadtxt(ifstream& inFile, size_t n)
{ {
ScoreStats entry(number_of_scores); ScoreStats entry(number_of_scores);
for (size_t i=0 ; i < n; i++){ for (size_t i=0 ; i < n; i++) {
entry.loadtxt(inFile); entry.loadtxt(inFile);
add(entry); add(entry);
} }
} }
void ScoreArray::load(ifstream& inFile) void ScoreArray::load(ifstream& inFile)
{ {
size_t number_of_entries=0; size_t number_of_entries=0;
bool binmode=false; bool binmode=false;
std::string substring, stringBuf; std::string substring, stringBuf;
std::string::size_type loc; std::string::size_type loc;
std::getline(inFile, stringBuf); std::getline(inFile, stringBuf);
if (!inFile.good()){ if (!inFile.good()) {
return; return;
} }
if (!stringBuf.empty()){ if (!stringBuf.empty()) {
if ((loc = stringBuf.find(SCORES_TXT_BEGIN)) == 0){ if ((loc = stringBuf.find(SCORES_TXT_BEGIN)) == 0) {
binmode=false; binmode=false;
}else if ((loc = stringBuf.find(SCORES_BIN_BEGIN)) == 0){ } else if ((loc = stringBuf.find(SCORES_BIN_BEGIN)) == 0) {
binmode=true; binmode=true;
}else{ } else {
TRACE_ERR("ERROR: ScoreArray::load(): Wrong header"); TRACE_ERR("ERROR: ScoreArray::load(): Wrong header");
return; return;
} }
getNextPound(stringBuf, substring); getNextPound(stringBuf, substring);
getNextPound(stringBuf, substring); getNextPound(stringBuf, substring);
idx = substring; idx = substring;
getNextPound(stringBuf, substring); getNextPound(stringBuf, substring);
number_of_entries = atoi(substring.c_str()); number_of_entries = atoi(substring.c_str());
getNextPound(stringBuf, substring); getNextPound(stringBuf, substring);
number_of_scores = atoi(substring.c_str()); number_of_scores = atoi(substring.c_str());
getNextPound(stringBuf, substring); getNextPound(stringBuf, substring);
score_type = substring; score_type = substring;
} }
(binmode)?loadbin(inFile, number_of_entries):loadtxt(inFile, number_of_entries); (binmode)?loadbin(inFile, number_of_entries):loadtxt(inFile, number_of_entries);
std::getline(inFile, stringBuf); std::getline(inFile, stringBuf);
if (!stringBuf.empty()){ if (!stringBuf.empty()) {
if ((loc = stringBuf.find(SCORES_TXT_END)) != 0 && (loc = stringBuf.find(SCORES_BIN_END)) != 0){ if ((loc = stringBuf.find(SCORES_TXT_END)) != 0 && (loc = stringBuf.find(SCORES_BIN_END)) != 0) {
TRACE_ERR("ERROR: ScoreArray::load(): Wrong footer"); TRACE_ERR("ERROR: ScoreArray::load(): Wrong footer");
return; return;
} }
} }
} }
void ScoreArray::load(const std::string &file) void ScoreArray::load(const std::string &file)
{ {
TRACE_ERR("loading data from " << file << std::endl); TRACE_ERR("loading data from " << file << std::endl);
inputfilestream inFile(file); // matches a stream with a file. Opens the file inputfilestream inFile(file); // matches a stream with a file. Opens the file
load((ifstream&) inFile); load((ifstream&) inFile);
inFile.close(); inFile.close();
} }
void ScoreArray::merge(ScoreArray& e) void ScoreArray::merge(ScoreArray& e)
{ {
//dummy implementation //dummy implementation
for (size_t i=0; i<e.size(); i++) for (size_t i=0; i<e.size(); i++)
add(e.get(i)); add(e.get(i));
} }
bool ScoreArray::check_consistency() bool ScoreArray::check_consistency()
{ {
size_t sz = NumberOfScores(); size_t sz = NumberOfScores();
if (sz == 0) if (sz == 0)
return true; return true;
for (scorearray_t::iterator i=array_.begin(); i!=array_.end(); i++) for (scorearray_t::iterator i=array_.begin(); i!=array_.end(); i++)
if (i->size()!=sz) if (i->size()!=sz)
return false; return false;
return true; return true;
} }

View File

@ -27,52 +27,76 @@ using namespace std;
class ScoreArray class ScoreArray
{ {
protected: protected:
scorearray_t array_; scorearray_t array_;
std::string score_type; std::string score_type;
size_t number_of_scores; size_t number_of_scores;
private: private:
std::string idx; // idx to identify the utterance, it can differ from the index inside the vector std::string idx; // idx to identify the utterance, it can differ from the index inside the vector
public: public:
ScoreArray(); ScoreArray();
~ScoreArray(){}; ~ScoreArray() {};
inline void clear() { array_.clear(); } inline void clear() {
array_.clear();
inline std::string getIndex(){ return idx; } }
inline void setIndex(const std::string& value){ idx=value; }
inline std::string getIndex() {
return idx;
}
inline void setIndex(const std::string& value) {
idx=value;
}
// inline ScoreStats get(size_t i){ return array_.at(i); } // inline ScoreStats get(size_t i){ return array_.at(i); }
inline ScoreStats& get(size_t i){ return array_.at(i); }
inline const ScoreStats& get(size_t i)const{ return array_.at(i); }
void add(const ScoreStats& e){ array_.push_back(e); } inline ScoreStats& get(size_t i) {
return array_.at(i);
}
inline const ScoreStats& get(size_t i)const {
return array_.at(i);
}
void merge(ScoreArray& e); void add(const ScoreStats& e) {
array_.push_back(e);
}
inline std::string name() const{ return score_type; }; void merge(ScoreArray& e);
inline void name(std::string &sctype){ score_type = sctype; };
inline size_t size(){ return array_.size(); } inline std::string name() const {
inline size_t NumberOfScores() const{ return number_of_scores; } return score_type;
inline void NumberOfScores(size_t v){ number_of_scores = v; } };
inline void name(std::string &sctype) {
void savetxt(ofstream& outFile, const std::string& sctype); score_type = sctype;
void savebin(ofstream& outFile, const std::string& sctype); };
void save(ofstream& outFile, const std::string& sctype, bool bin=false);
void save(const std::string &file, const std::string& sctype, bool bin=false); inline size_t size() {
inline void save(const std::string& sctype, bool bin=false){ save("/dev/stdout", sctype, bin); } return array_.size();
}
void loadtxt(ifstream& inFile, size_t n); inline size_t NumberOfScores() const {
void loadbin(ifstream& inFile, size_t n); return number_of_scores;
void load(ifstream& inFile); }
void load(const std::string &file); inline void NumberOfScores(size_t v) {
number_of_scores = v;
bool check_consistency(); }
void savetxt(ofstream& outFile, const std::string& sctype);
void savebin(ofstream& outFile, const std::string& sctype);
void save(ofstream& outFile, const std::string& sctype, bool bin=false);
void save(const std::string &file, const std::string& sctype, bool bin=false);
inline void save(const std::string& sctype, bool bin=false) {
save("/dev/stdout", sctype, bin);
}
void loadtxt(ifstream& inFile, size_t n);
void loadbin(ifstream& inFile, size_t n);
void load(ifstream& inFile);
void load(const std::string &file);
bool check_consistency();
}; };

View File

@ -13,138 +13,138 @@
ScoreData::ScoreData(Scorer& ptr): ScoreData::ScoreData(Scorer& ptr):
theScorer(&ptr) theScorer(&ptr)
{ {
score_type = theScorer->getName(); score_type = theScorer->getName();
//theScorer->setScoreData(this);//this is not dangerous: we dont use the this pointer in SetScoreData //theScorer->setScoreData(this);//this is not dangerous: we dont use the this pointer in SetScoreData
number_of_scores = theScorer->NumberOfScores(); number_of_scores = theScorer->NumberOfScores();
TRACE_ERR("ScoreData: number_of_scores: " << number_of_scores << std::endl); TRACE_ERR("ScoreData: number_of_scores: " << number_of_scores << std::endl);
}; };
void ScoreData::dump() void ScoreData::dump()
{ {
for (vector<ScoreArray>::iterator it = array_.begin(); it !=array_.end(); it++){ for (vector<ScoreArray>::iterator it = array_.begin(); it !=array_.end(); it++) {
cout << "scorearray: " << endl; cout << "scorearray: " << endl;
for (size_t i = 0; i < (*it).size(); i++) { for (size_t i = 0; i < (*it).size(); i++) {
ScoreStats scoreStats = (*it).get(i); ScoreStats scoreStats = (*it).get(i);
cout << "scorestats: " ; cout << "scorestats: " ;
for (size_t j = 0; j < scoreStats.size(); j ++ ){ for (size_t j = 0; j < scoreStats.size(); j ++ ) {
ScoreStatsType scoreStatsType = scoreStats.get(j); ScoreStatsType scoreStatsType = scoreStats.get(j);
cout << scoreStatsType << " " ; cout << scoreStatsType << " " ;
} }
cout << endl; cout << endl;
} }
} }
} }
void ScoreData::save(std::ofstream& outFile, bool bin) void ScoreData::save(std::ofstream& outFile, bool bin)
{ {
for (scoredata_t::iterator i = array_.begin(); i !=array_.end(); i++){ for (scoredata_t::iterator i = array_.begin(); i !=array_.end(); i++) {
i->save(outFile, score_type, bin); i->save(outFile, score_type, bin);
} }
} }
void ScoreData::save(const std::string &file, bool bin) void ScoreData::save(const std::string &file, bool bin)
{ {
if (file.empty()) return; if (file.empty()) return;
TRACE_ERR("saving the array into " << file << std::endl); TRACE_ERR("saving the array into " << file << std::endl);
std::ofstream outFile(file.c_str(), std::ios::out); // matches a stream with a file. Opens the file std::ofstream outFile(file.c_str(), std::ios::out); // matches a stream with a file. Opens the file
ScoreStats entry; ScoreStats entry;
save(outFile, bin); save(outFile, bin);
outFile.close(); outFile.close();
} }
void ScoreData::load(ifstream& inFile) void ScoreData::load(ifstream& inFile)
{ {
ScoreArray entry; ScoreArray entry;
while (!inFile.eof()){ while (!inFile.eof()) {
if (!inFile.good()){
std::cerr << "ERROR ScoreData::load inFile.good()" << std::endl;
}
entry.clear();
entry.load(inFile);
if (entry.size() == 0){ if (!inFile.good()) {
break; std::cerr << "ERROR ScoreData::load inFile.good()" << std::endl;
} }
add(entry);
} entry.clear();
theScorer->setScoreData(this); entry.load(inFile);
if (entry.size() == 0) {
break;
}
add(entry);
}
theScorer->setScoreData(this);
} }
void ScoreData::load(const std::string &file) void ScoreData::load(const std::string &file)
{ {
TRACE_ERR("loading score data from " << file << std::endl); TRACE_ERR("loading score data from " << file << std::endl);
inputfilestream inFile(file); // matches a stream with a file. Opens the file inputfilestream inFile(file); // matches a stream with a file. Opens the file
if (!inFile) { if (!inFile) {
throw runtime_error("Unable to open score file: " + file); throw runtime_error("Unable to open score file: " + file);
} }
load((ifstream&) inFile); load((ifstream&) inFile);
inFile.close(); inFile.close();
} }
void ScoreData::add(ScoreArray& e){ void ScoreData::add(ScoreArray& e)
if (exists(e.getIndex())){ // array at position e.getIndex() already exists {
//enlarge array at position e.getIndex() if (exists(e.getIndex())) { // array at position e.getIndex() already exists
size_t pos = getIndex(e.getIndex()); //enlarge array at position e.getIndex()
array_.at(pos).merge(e); size_t pos = getIndex(e.getIndex());
} array_.at(pos).merge(e);
else{ } else {
array_.push_back(e); array_.push_back(e);
setIndex(); setIndex();
} }
} }
void ScoreData::add(const ScoreStats& e, const std::string& sent_idx){ void ScoreData::add(const ScoreStats& e, const std::string& sent_idx)
if (exists(sent_idx)){ // array at position e.getIndex() already exists {
//enlarge array at position e.getIndex() if (exists(sent_idx)) { // array at position e.getIndex() already exists
size_t pos = getIndex(sent_idx); //enlarge array at position e.getIndex()
// TRACE_ERR("Inserting in array " << sent_idx << std::endl); size_t pos = getIndex(sent_idx);
array_.at(pos).add(e); // TRACE_ERR("Inserting in array " << sent_idx << std::endl);
// TRACE_ERR("size: " << size() << " -> " << a.size() << std::endl); array_.at(pos).add(e);
} // TRACE_ERR("size: " << size() << " -> " << a.size() << std::endl);
else{ } else {
// TRACE_ERR("Creating a new entry in the array" << std::endl); // TRACE_ERR("Creating a new entry in the array" << std::endl);
ScoreArray a; ScoreArray a;
a.NumberOfScores(number_of_scores); a.NumberOfScores(number_of_scores);
a.add(e); a.add(e);
a.setIndex(sent_idx); a.setIndex(sent_idx);
add(a); add(a);
// TRACE_ERR("size: " << size() << " -> " << a.size() << std::endl); // TRACE_ERR("size: " << size() << " -> " << a.size() << std::endl);
} }
} }
bool ScoreData::check_consistency() bool ScoreData::check_consistency()
{ {
if (array_.size() == 0) if (array_.size() == 0)
return true; return true;
for (scoredata_t::iterator i = array_.begin(); i !=array_.end(); i++) for (scoredata_t::iterator i = array_.begin(); i !=array_.end(); i++)
if (!i->check_consistency()) return false; if (!i->check_consistency()) return false;
return true; return true;
} }
void ScoreData::setIndex() void ScoreData::setIndex()
{ {
size_t j=0; size_t j=0;
for (scoredata_t::iterator i = array_.begin(); i !=array_.end(); i++){ for (scoredata_t::iterator i = array_.begin(); i !=array_.end(); i++) {
idx2arrayname_[j]=i->getIndex(); idx2arrayname_[j]=i->getIndex();
arrayname2idx_[i->getIndex()]=j; arrayname2idx_[i->getIndex()]=j;
j++; j++;
} }
} }

View File

@ -23,66 +23,92 @@ class Scorer;
class ScoreData class ScoreData
{ {
protected: protected:
scoredata_t array_; scoredata_t array_;
idx2name idx2arrayname_; //map from index to name of array idx2name idx2arrayname_; //map from index to name of array
name2idx arrayname2idx_; //map from name to index of array name2idx arrayname2idx_; //map from name to index of array
private: private:
Scorer* theScorer; Scorer* theScorer;
std::string score_type; std::string score_type;
size_t number_of_scores; size_t number_of_scores;
public: public:
ScoreData(Scorer& sc); ScoreData(Scorer& sc);
~ScoreData(){};
inline void clear() { array_.clear(); }
inline ScoreArray get(const std::string& idx){ return array_.at(getIndex(idx)); }
inline ScoreArray& get(size_t idx){ return array_.at(idx); }
inline const ScoreArray& get(size_t idx) const { return array_.at(idx); }
inline bool exists(const std::string & sent_idx){ return exists(getIndex(sent_idx)); }
inline bool exists(int sent_idx){ return (sent_idx>-1 && sent_idx<(int)array_.size())?true:false; }
inline ScoreStats& get(size_t i, size_t j){ return array_.at(i).get(j); }
inline const ScoreStats& get(size_t i, size_t j) const { return array_.at(i).get(j); }
inline std::string name(){ return score_type; };
inline std::string name(std::string &sctype){ return score_type = sctype; };
void add(ScoreArray& e); ~ScoreData() {};
void add(const ScoreStats& e, const std::string& sent_idx);
inline size_t NumberOfScores(){ return number_of_scores; }
inline size_t size(){ return array_.size(); }
void save(const std::string &file, bool bin=false);
void save(ofstream& outFile, bool bin=false);
inline void save(bool bin=false){ save("/dev/stdout", bin); }
void load(ifstream& inFile); inline void clear() {
void load(const std::string &file); array_.clear();
bool check_consistency();
void setIndex();
inline int getIndex(const std::string& idx){
name2idx::iterator i = arrayname2idx_.find(idx);
if (i!=arrayname2idx_.end())
return i->second;
else
return -1;
} }
inline std::string getIndex(size_t idx){
idx2name::iterator i = idx2arrayname_.find(idx);
if (i!=idx2arrayname_.end())
throw runtime_error("there is no entry at index " + idx);
return i->second;
}
void dump(); inline ScoreArray get(const std::string& idx) {
return array_.at(getIndex(idx));
}
inline ScoreArray& get(size_t idx) {
return array_.at(idx);
}
inline const ScoreArray& get(size_t idx) const {
return array_.at(idx);
}
inline bool exists(const std::string & sent_idx) {
return exists(getIndex(sent_idx));
}
inline bool exists(int sent_idx) {
return (sent_idx>-1 && sent_idx<(int)array_.size())?true:false;
}
inline ScoreStats& get(size_t i, size_t j) {
return array_.at(i).get(j);
}
inline const ScoreStats& get(size_t i, size_t j) const {
return array_.at(i).get(j);
}
inline std::string name() {
return score_type;
};
inline std::string name(std::string &sctype) {
return score_type = sctype;
};
void add(ScoreArray& e);
void add(const ScoreStats& e, const std::string& sent_idx);
inline size_t NumberOfScores() {
return number_of_scores;
}
inline size_t size() {
return array_.size();
}
void save(const std::string &file, bool bin=false);
void save(ofstream& outFile, bool bin=false);
inline void save(bool bin=false) {
save("/dev/stdout", bin);
}
void load(ifstream& inFile);
void load(const std::string &file);
bool check_consistency();
void setIndex();
inline int getIndex(const std::string& idx) {
name2idx::iterator i = arrayname2idx_.find(idx);
if (i!=arrayname2idx_.end())
return i->second;
else
return -1;
}
inline std::string getIndex(size_t idx) {
idx2name::iterator i = idx2arrayname_.find(idx);
if (i!=idx2arrayname_.end())
throw runtime_error("there is no entry at index " + idx);
return i->second;
}
void dump();
}; };

View File

@ -14,123 +14,124 @@
/**
 * Default constructor: no entries yet, storage allocated for AVAILABLE_
 * values. The initialiser order follows the member declaration order.
 */
ScoreStats::ScoreStats()
  : array_(new ScoreStatsType[AVAILABLE_]), entries_(0), available_(AVAILABLE_)
{
}
/**
 * Release the value buffer. It is always allocated with new[], so it must be
 * released with delete[]; plain delete on it is undefined behaviour.
 */
ScoreStats::~ScoreStats()
{
  delete[] array_;
}
/**
 * Copy constructor: takes over the source's entry count and capacity,
 * allocates a buffer of that capacity and copies scorebytes_ bytes into it.
 * NOTE(review): scorebytes_ is defined outside this block; the counters are
 * set before the memcpy in case it is derived from them - confirm.
 */
ScoreStats::ScoreStats(const ScoreStats &stats)
{
  entries_ = stats.size();
  available_ = stats.available();
  array_ = new ScoreStatsType[available_];
  memcpy(array_, stats.getArray(), scorebytes_);
}
/**
 * Construct statistics holding `size` entries (capacity equals the entry
 * count) and zero-fill scorebytes_ bytes of the new buffer.
 */
ScoreStats::ScoreStats(const size_t size)
{
  entries_ = size;
  available_ = size;
  array_ = new ScoreStatsType[available_];
  memset(array_, 0, scorebytes_);
}
/**
 * Construct statistics by parsing theString via set().
 *
 * set() calls reset() and add(), which read entries_ and available_ and
 * write through array_. All three must therefore be valid before set() runs,
 * so the object is first put into the same state as a default-constructed one.
 */
ScoreStats::ScoreStats(std::string &theString)
{
  available_ = AVAILABLE_;
  entries_ = 0;
  array_ = new ScoreStatsType[available_];
  set(theString);
}
void ScoreStats::expand() void ScoreStats::expand()
{ {
available_*=2; available_*=2;
scorestats_t t_ = new ScoreStatsType[available_]; scorestats_t t_ = new ScoreStatsType[available_];
memcpy(t_,array_,scorebytes_); memcpy(t_,array_,scorebytes_);
delete array_; delete array_;
array_=t_; array_=t_;
} }
/** Append one value, growing the buffer first when it is already full. */
void ScoreStats::add(ScoreStatsType v)
{
  if (isfull()) {
    expand();
  }
  array_[entries_] = v;
  ++entries_;
}
void ScoreStats::set(std::string &theString) void ScoreStats::set(std::string &theString)
{ {
std::string substring, stringBuf; std::string substring, stringBuf;
reset(); reset();
while (!theString.empty()){ while (!theString.empty()) {
getNextPound(theString, substring); getNextPound(theString, substring);
add(ATOSST(substring.c_str())); add(ATOSST(substring.c_str()));
} }
} }
void ScoreStats::loadbin(std::ifstream& inFile) void ScoreStats::loadbin(std::ifstream& inFile)
{ {
inFile.read((char*) array_, scorebytes_); inFile.read((char*) array_, scorebytes_);
} }
void ScoreStats::loadtxt(std::ifstream& inFile) void ScoreStats::loadtxt(std::ifstream& inFile)
{ {
std::string theString; std::string theString;
std::getline(inFile, theString); std::getline(inFile, theString);
set(theString); set(theString);
} }
void ScoreStats::loadtxt(const std::string &file) void ScoreStats::loadtxt(const std::string &file)
{ {
// TRACE_ERR("loading the stats from " << file << std::endl); // TRACE_ERR("loading the stats from " << file << std::endl);
std::ifstream inFile(file.c_str(), std::ios::in); // matches a stream with a file. Opens the file std::ifstream inFile(file.c_str(), std::ios::in); // matches a stream with a file. Opens the file
loadtxt(inFile); loadtxt(inFile);
} }
void ScoreStats::savetxt(const std::string &file) void ScoreStats::savetxt(const std::string &file)
{ {
// TRACE_ERR("saving the stats into " << file << std::endl); // TRACE_ERR("saving the stats into " << file << std::endl);
std::ofstream outFile(file.c_str(), std::ios::out); // matches a stream with a file. Opens the file std::ofstream outFile(file.c_str(), std::ios::out); // matches a stream with a file. Opens the file
savetxt(outFile); savetxt(outFile);
} }
void ScoreStats::savetxt(std::ofstream& outFile) void ScoreStats::savetxt(std::ofstream& outFile)
{ {
outFile << *this; outFile << *this;
} }
void ScoreStats::savebin(std::ofstream& outFile) void ScoreStats::savebin(std::ofstream& outFile)
{ {
outFile.write((char*) array_, scorebytes_); outFile.write((char*) array_, scorebytes_);
} }
/**
 * Copy assignment: replace this object's buffer and counters with a copy of
 * those in stats. Returns *this.
 */
ScoreStats& ScoreStats::operator=(const ScoreStats &stats)
{
  // Self-assignment must not free the buffer that is about to be copied.
  if (this == &stats) {
    return *this;
  }

  // Allocate first so this object stays intact if the allocation throws.
  scorestats_t fresh = new ScoreStatsType[stats.available()];
  delete[] array_;  // allocated with new[], so delete[] is required
  array_ = fresh;
  available_ = stats.available();
  entries_ = stats.size();
  memcpy(array_, stats.getArray(), scorebytes_);
  return *this;
}
/** Write every stored value to the stream, each followed by one space. */
ostream& operator<<(ostream& o, const ScoreStats& e)
{
  size_t pos = 0;
  while (pos < e.size()) {
    o << e.get(pos);
    o << " ";
    ++pos;
  }
  return o;
}

View File

@ -26,51 +26,72 @@ using namespace std;
class ScoreStats class ScoreStats
{ {
private: private:
scorestats_t array_; scorestats_t array_;
size_t entries_; size_t entries_;
size_t available_; size_t available_;
public: public:
ScoreStats(); ScoreStats();
ScoreStats(const size_t size); ScoreStats(const size_t size);
ScoreStats(const ScoreStats &stats); ScoreStats(const ScoreStats &stats);
ScoreStats(std::string &theString); ScoreStats(std::string &theString);
ScoreStats& operator=(const ScoreStats &stats); ScoreStats& operator=(const ScoreStats &stats);
~ScoreStats();
bool isfull(){return (entries_ < available_)?0:1; }
void expand();
void add(ScoreStatsType v);
inline void clear() { memset((void*) array_,0,scorebytes_); }
inline ScoreStatsType get(size_t i){ return array_[i]; }
inline ScoreStatsType get(size_t i)const{ return array_[i]; }
inline scorestats_t getArray() const { return array_; }
void set(std::string &theString);
inline size_t bytes() const{ return scorebytes_; } ~ScoreStats();
inline size_t size() const{ return entries_; }
inline size_t available() const{ return available_; }
void savetxt(const std::string &file);
void savetxt(ofstream& outFile);
void savebin(ofstream& outFile);
inline void savetxt(){ savetxt("/dev/stdout"); }
bool isfull() {
return (entries_ < available_)?0:1;
void loadtxt(const std::string &file); }
void loadtxt(ifstream& inFile); void expand();
void loadbin(ifstream& inFile); void add(ScoreStatsType v);
inline void reset(){ entries_ = 0; clear(); }
/**write the whole object to a stream*/ inline void clear() {
friend ostream& operator<<(ostream& o, const ScoreStats& e); memset((void*) array_,0,scorebytes_);
}
inline ScoreStatsType get(size_t i) {
return array_[i];
}
inline ScoreStatsType get(size_t i)const {
return array_[i];
}
inline scorestats_t getArray() const {
return array_;
}
void set(std::string &theString);
inline size_t bytes() const {
return scorebytes_;
}
inline size_t size() const {
return entries_;
}
inline size_t available() const {
return available_;
}
void savetxt(const std::string &file);
void savetxt(ofstream& outFile);
void savebin(ofstream& outFile);
inline void savetxt() {
savetxt("/dev/stdout");
}
void loadtxt(const std::string &file);
void loadtxt(ifstream& inFile);
void loadbin(ifstream& inFile);
inline void reset() {
entries_ = 0;
clear();
}
/**write the whole object to a stream*/
friend ostream& operator<<(ostream& o, const ScoreStats& e);
}; };

View File

@ -1,105 +1,108 @@
#include "Scorer.h" #include "Scorer.h"
//regularisation strategies //regularisation strategies
/**
 * Regularisation helper: smallest value of scores in the half-open range
 * [start, end). Returns numeric_limits<float>::max() when the range is empty.
 */
static float score_min(const statscores_t& scores, size_t start, size_t end)
{
  float lowest = numeric_limits<float>::max();
  size_t pos = start;
  while (pos < end) {
    if (scores[pos] < lowest) {
      lowest = scores[pos];
    }
    ++pos;
  }
  return lowest;
}
/**
 * Regularisation helper: arithmetic mean of scores over the half-open range
 * [start, end). Returns 0 when start equals end.
 */
static float score_average(const statscores_t& scores, size_t start, size_t end)
{
  // For unsigned operands "(end - start) < 1" can only mean end == start.
  if (end == start) {
    //shouldn't happen
    return 0;
  }

  float sum = 0;
  size_t pos = start;
  while (pos < end) {
    sum += scores[pos];
    ++pos;
  }
  return sum / (end - start);
}
void StatisticsBasedScorer::score(const candidates_t& candidates, const diffs_t& diffs, void StatisticsBasedScorer::score(const candidates_t& candidates, const diffs_t& diffs,
statscores_t& scores) { statscores_t& scores)
//cout << "*******StatisticsBasedScorer::score" << endl; {
if (!_scoreData) { //cout << "*******StatisticsBasedScorer::score" << endl;
throw runtime_error("Score data not loaded"); if (!_scoreData) {
} throw runtime_error("Score data not loaded");
//calculate the score for the candidates }
if (_scoreData->size() == 0) { //calculate the score for the candidates
throw runtime_error("Score data is empty"); if (_scoreData->size() == 0) {
throw runtime_error("Score data is empty");
}
if (candidates.size() == 0) {
throw runtime_error("No candidates supplied");
}
int numCounts = _scoreData->get(0,candidates[0]).size();
vector<float> totals(numCounts);
for (size_t i = 0; i < candidates.size(); ++i) {
//cout << " i " << i << " candidates[i] " << candidates[i] << endl;
ScoreStats stats = _scoreData->get(i,candidates[i]);
if (stats.size() != totals.size()) {
stringstream msg;
msg << "Statistics for (" << "," << candidates[i] << ") have incorrect "
<< "number of fields. Found: " << stats.size() << " Expected: "
<< totals.size();
throw runtime_error(msg.str());
} }
if (candidates.size() == 0) { for (size_t k = 0; k < totals.size(); ++k) {
throw runtime_error("No candidates supplied"); totals[k] += stats.get(k);
} }
int numCounts = _scoreData->get(0,candidates[0]).size(); }
vector<float> totals(numCounts); scores.push_back(calculateScore(totals));
for (size_t i = 0; i < candidates.size(); ++i) {
//cout << " i " << i << " candidates[i] " << candidates[i] << endl;
ScoreStats stats = _scoreData->get(i,candidates[i]);
if (stats.size() != totals.size()) {
stringstream msg;
msg << "Statistics for (" << "," << candidates[i] << ") have incorrect "
<< "number of fields. Found: " << stats.size() << " Expected: "
<< totals.size();
throw runtime_error(msg.str());
}
for (size_t k = 0; k < totals.size(); ++k) {
totals[k] += stats.get(k);
}
}
scores.push_back(calculateScore(totals));
candidates_t last_candidates(candidates); candidates_t last_candidates(candidates);
//apply each of the diffs, and get new scores //apply each of the diffs, and get new scores
for (size_t i = 0; i < diffs.size(); ++i) { for (size_t i = 0; i < diffs.size(); ++i) {
for (size_t j = 0; j < diffs[i].size(); ++j) { for (size_t j = 0; j < diffs[i].size(); ++j) {
size_t sid = diffs[i][j].first; size_t sid = diffs[i][j].first;
size_t nid = diffs[i][j].second; size_t nid = diffs[i][j].second;
//cout << "STSC:sid = " << sid << endl; //cout << "STSC:sid = " << sid << endl;
//cout << "STSC:nid = " << nid << endl; //cout << "STSC:nid = " << nid << endl;
size_t last_nid = last_candidates[sid]; size_t last_nid = last_candidates[sid];
//cout << "STSC:oid = " << last_nid << endl; //cout << "STSC:oid = " << last_nid << endl;
for (size_t k = 0; k < totals.size(); ++k) { for (size_t k = 0; k < totals.size(); ++k) {
float diff = _scoreData->get(sid,nid).get(k) float diff = _scoreData->get(sid,nid).get(k)
- _scoreData->get(sid,last_nid).get(k); - _scoreData->get(sid,last_nid).get(k);
totals[k] += diff; totals[k] += diff;
//cout << "STSC:nid = " << _scoreData->get(sid,nid).get(k) << endl; //cout << "STSC:nid = " << _scoreData->get(sid,nid).get(k) << endl;
//cout << "STSC:oid = " << _scoreData->get(sid,last_nid).get(k) << endl; //cout << "STSC:oid = " << _scoreData->get(sid,last_nid).get(k) << endl;
//cout << "STSC:diff = " << diff << endl; //cout << "STSC:diff = " << diff << endl;
//cout << "STSC:totals = " << totals[k] << endl; //cout << "STSC:totals = " << totals[k] << endl;
} }
last_candidates[sid] = nid; last_candidates[sid] = nid;
}
scores.push_back(calculateScore(totals));
} }
scores.push_back(calculateScore(totals));
}
//regularisation. This can either be none, or the min or average as described in //regularisation. This can either be none, or the min or average as described in
//Cer, Jurafsky and Manning at WMT08 //Cer, Jurafsky and Manning at WMT08
if (_regularisationStrategy == REG_NONE || _regularisationWindow <= 0) { if (_regularisationStrategy == REG_NONE || _regularisationWindow <= 0) {
//no regularisation //no regularisation
return; return;
} }
//window size specifies the +/- in each direction //window size specifies the +/- in each direction
statscores_t raw_scores(scores);//copy scores statscores_t raw_scores(scores);//copy scores
for (size_t i = 0; i < scores.size(); ++i) { for (size_t i = 0; i < scores.size(); ++i) {
size_t start = 0; size_t start = 0;
if (i >= _regularisationWindow) { if (i >= _regularisationWindow) {
start = i - _regularisationWindow; start = i - _regularisationWindow;
}
size_t end = min(scores.size(), i + _regularisationWindow+1);
if (_regularisationStrategy == REG_AVERAGE) {
scores[i] = score_average(raw_scores,start,end);
} else {
scores[i] = score_min(raw_scores,start,end);
}
} }
size_t end = min(scores.size(), i + _regularisationWindow+1);
if (_regularisationStrategy == REG_AVERAGE) {
scores[i] = score_average(raw_scores,start,end);
} else {
scores[i] = score_min(raw_scores,start,end);
}
}
} }
@ -110,89 +113,90 @@ void StatisticsBasedScorer::score(const candidates_t& candidates, const diffs_t
/** The sentence level scores have already been calculated, just need to average them /** The sentence level scores have already been calculated, just need to average them
and include the differences. Allows scores which are floats **/ and include the differences. Allows scores which are floats **/
void SentenceLevelScorer::score(const candidates_t& candidates, const diffs_t& diffs, void SentenceLevelScorer::score(const candidates_t& candidates, const diffs_t& diffs,
statscores_t& scores) { statscores_t& scores)
//cout << "*******SentenceLevelScorer::score" << endl; {
if (!_scoreData) { //cout << "*******SentenceLevelScorer::score" << endl;
throw runtime_error("Score data not loaded"); if (!_scoreData) {
throw runtime_error("Score data not loaded");
}
//calculate the score for the candidates
if (_scoreData->size() == 0) {
throw runtime_error("Score data is empty");
}
if (candidates.size() == 0) {
throw runtime_error("No candidates supplied");
}
int numCounts = _scoreData->get(0,candidates[0]).size();
vector<float> totals(numCounts);
for (size_t i = 0; i < candidates.size(); ++i) {
//cout << " i " << i << " candi " << candidates[i] ;
ScoreStats stats = _scoreData->get(i,candidates[i]);
if (stats.size() != totals.size()) {
stringstream msg;
msg << "Statistics for (" << "," << candidates[i] << ") have incorrect "
<< "number of fields. Found: " << stats.size() << " Expected: "
<< totals.size();
throw runtime_error(msg.str());
} }
//calculate the score for the candidates //Add up scores for all sentences, would normally be just one score
if (_scoreData->size() == 0) {
throw runtime_error("Score data is empty");
}
if (candidates.size() == 0) {
throw runtime_error("No candidates supplied");
}
int numCounts = _scoreData->get(0,candidates[0]).size();
vector<float> totals(numCounts);
for (size_t i = 0; i < candidates.size(); ++i) {
//cout << " i " << i << " candi " << candidates[i] ;
ScoreStats stats = _scoreData->get(i,candidates[i]);
if (stats.size() != totals.size()) {
stringstream msg;
msg << "Statistics for (" << "," << candidates[i] << ") have incorrect "
<< "number of fields. Found: " << stats.size() << " Expected: "
<< totals.size();
throw runtime_error(msg.str());
}
//Add up scores for all sentences, would normally be just one score
for (size_t k = 0; k < totals.size(); ++k) {
totals[k] += stats.get(k);
//cout << " stats " << stats.get(k) ;
}
//cout << endl;
}
//take average
for (size_t k = 0; k < totals.size(); ++k) { for (size_t k = 0; k < totals.size(); ++k) {
totals[k] += stats.get(k);
//cout << " stats " << stats.get(k) ;
}
//cout << endl;
}
//take average
for (size_t k = 0; k < totals.size(); ++k) {
//cout << "totals = " << totals[k] << endl; //cout << "totals = " << totals[k] << endl;
//cout << "cand = " << candidates.size() << endl; //cout << "cand = " << candidates.size() << endl;
totals[k] /= candidates.size(); totals[k] /= candidates.size();
//cout << "finaltotals = " << totals[k] << endl; //cout << "finaltotals = " << totals[k] << endl;
} }
scores.push_back(calculateScore(totals)); scores.push_back(calculateScore(totals));
candidates_t last_candidates(candidates); candidates_t last_candidates(candidates);
//apply each of the diffs, and get new scores //apply each of the diffs, and get new scores
for (size_t i = 0; i < diffs.size(); ++i) { for (size_t i = 0; i < diffs.size(); ++i) {
for (size_t j = 0; j < diffs[i].size(); ++j) { for (size_t j = 0; j < diffs[i].size(); ++j) {
size_t sid = diffs[i][j].first; size_t sid = diffs[i][j].first;
size_t nid = diffs[i][j].second; size_t nid = diffs[i][j].second;
//cout << "sid = " << sid << endl; //cout << "sid = " << sid << endl;
//cout << "nid = " << nid << endl; //cout << "nid = " << nid << endl;
size_t last_nid = last_candidates[sid]; size_t last_nid = last_candidates[sid];
for (size_t k = 0; k < totals.size(); ++k) { for (size_t k = 0; k < totals.size(); ++k) {
float diff = _scoreData->get(sid,nid).get(k) float diff = _scoreData->get(sid,nid).get(k)
- _scoreData->get(sid,last_nid).get(k); - _scoreData->get(sid,last_nid).get(k);
//cout << "diff = " << diff << endl; //cout << "diff = " << diff << endl;
totals[k] += diff/candidates.size(); totals[k] += diff/candidates.size();
//cout << "totals = " << totals[k] << endl; //cout << "totals = " << totals[k] << endl;
} }
last_candidates[sid] = nid; last_candidates[sid] = nid;
}
scores.push_back(calculateScore(totals));
} }
scores.push_back(calculateScore(totals));
}
//regularisation. This can either be none, or the min or average as described in //regularisation. This can either be none, or the min or average as described in
//Cer, Jurafsky and Manning at WMT08 //Cer, Jurafsky and Manning at WMT08
if (_regularisationStrategy == REG_NONE || _regularisationWindow <= 0) { if (_regularisationStrategy == REG_NONE || _regularisationWindow <= 0) {
//no regularisation //no regularisation
return; return;
} }
//window size specifies the +/- in each direction //window size specifies the +/- in each direction
statscores_t raw_scores(scores);//copy scores statscores_t raw_scores(scores);//copy scores
for (size_t i = 0; i < scores.size(); ++i) { for (size_t i = 0; i < scores.size(); ++i) {
size_t start = 0; size_t start = 0;
if (i >= _regularisationWindow) { if (i >= _regularisationWindow) {
start = i - _regularisationWindow; start = i - _regularisationWindow;
}
size_t end = min(scores.size(), i + _regularisationWindow+1);
if (_regularisationStrategy == REG_AVERAGE) {
scores[i] = score_average(raw_scores,start,end);
} else {
scores[i] = score_min(raw_scores,start,end);
}
} }
size_t end = min(scores.size(), i + _regularisationWindow+1);
if (_regularisationStrategy == REG_AVERAGE) {
scores[i] = score_average(raw_scores,start,end);
} else {
scores[i] = score_min(raw_scores,start,end);
}
}
} }

View File

@ -23,173 +23,180 @@ class ScoreStats;
/** /**
* Superclass of all scorers and dummy implementation. In order to add a new * Superclass of all scorers and dummy implementation. In order to add a new
* scorer it should be sufficient to override prepareStats(), setReferenceFiles() * scorer it should be sufficient to override prepareStats(), setReferenceFiles()
* and score() (or calculateScore()). * and score() (or calculateScore()).
**/ **/
class Scorer { class Scorer
private: {
string _name; private:
string _name;
public:
Scorer(const string& name, const string& config): _name(name), _scoreData(0), _preserveCase(true){
cerr << "Scorer config string: " << config << endl;
size_t start = 0;
while (start < config.size()) {
size_t end = config.find(",",start);
if (end == string::npos) {
end = config.size();
}
string nv = config.substr(start,end-start);
size_t split = nv.find(":");
if (split == string::npos) {
throw runtime_error("Missing colon when processing scorer config: " + config);
}
string name = nv.substr(0,split);
string value = nv.substr(split+1,nv.size()-split-1);
cerr << "name: " << name << " value: " << value << endl;
_config[name] = value;
start = end+1;
}
}; public:
virtual ~Scorer(){};
Scorer(const string& name, const string& config): _name(name), _scoreData(0), _preserveCase(true) {
/** cerr << "Scorer config string: " << config << endl;
* returns the number of statistics needed for the computation of the score size_t start = 0;
**/ while (start < config.size()) {
virtual size_t NumberOfScores() const { cerr << "Scorer: 0" << endl; return 0; }; size_t end = config.find(",",start);
if (end == string::npos) {
/** end = config.size();
* set the reference files. This must be called before prepareStats.
**/
virtual void setReferenceFiles(const vector<string>& referenceFiles) {
//do nothing
}
/**
* Process the given guessed text, corresponding to the given reference sindex
* and add the appropriate statistics to the entry.
**/
virtual void prepareStats(size_t sindex, const string& text, ScoreStats& entry)
{}
virtual void prepareStats(const string& sindex, const string& text, ScoreStats& entry)
{
// cerr << sindex << endl;
this->prepareStats((size_t) atoi(sindex.c_str()), text, entry);
//cerr << text << std::endl;
}
/**
* Score using each of the candidate index, then go through the diffs
* applying each in turn, and calculating a new score each time.
**/
virtual void score(const candidates_t& candidates, const diffs_t& diffs,
statscores_t& scores) {
//dummy impl
if (!_scoreData) {
throw runtime_error("score data not loaded");
}
scores.push_back(0);
for (size_t i = 0; i < diffs.size(); ++i) {
scores.push_back(0);
}
}
/**
* Calculate the score of the sentences corresponding to the list of candidate
* indices. Each index indicates the 1-best choice from the n-best list.
**/
float score(const candidates_t& candidates) {
diffs_t diffs;
statscores_t scores;
score(candidates, diffs, scores);
return scores[0];
}
const string& getName() const {return _name;}
size_t getReferenceSize() {
if (_scoreData) {
return _scoreData->size();
}
return 0;
}
/**
* Set the score data, prior to scoring.
**/
virtual void setScoreData(ScoreData* data) {
_scoreData = data;
}
/**
* The scorer returns if it uses the reference alignment data
* for permutation distance scores
**/
virtual bool useAlignment() const {
//cout << "Scorer::useAlignment returning false " << endl;
return false;
};
//calculate the actual score
virtual statscore_t calculateScore(const vector<statscore_t>& totals){return 0;};
protected:
typedef map<string,int> encodings_t;
typedef map<string,int>::iterator encodings_it;
ScoreData* _scoreData;
encodings_t _encodings;
bool _preserveCase;
/**
* Value of config variable. If not provided, return default.
**/
string getConfig(const string& key, const string& def="") {
map<string,string>::iterator i = _config.find(key);
if (i == _config.end()) {
return def;
} else {
return i->second;
}
} }
string nv = config.substr(start,end-start);
size_t split = nv.find(":");
/** if (split == string::npos) {
* Tokenise line and encode. throw runtime_error("Missing colon when processing scorer config: " + config);
* Note: We assume that all tokens are separated by single spaces }
**/ string name = nv.substr(0,split);
void encode(const string& line, vector<int>& encoded) { string value = nv.substr(split+1,nv.size()-split-1);
//cerr << line << endl; cerr << "name: " << name << " value: " << value << endl;
istringstream in (line); _config[name] = value;
string token; start = end+1;
while (in >> token) {
if (!_preserveCase) {
for (string::iterator i = token.begin(); i != token.end(); ++i) {
*i = tolower(*i);
}
}
encodings_it encoding = _encodings.find(token);
int encoded_token;
if (encoding == _encodings.end()) {
encoded_token = (int)_encodings.size();
_encodings[token] = encoded_token;
//cerr << encoded_token << "(n) ";
} else {
encoded_token = encoding->second;
//cerr << encoded_token << " ";
}
encoded.push_back(encoded_token);
}
//cerr << endl;
} }
private: };
map<string,string> _config; virtual ~Scorer() {};
/**
* returns the number of statistics needed for the computation of the score
**/
virtual size_t NumberOfScores() const {
cerr << "Scorer: 0" << endl;
return 0;
};
/**
* set the reference files. This must be called before prepareStats.
**/
virtual void setReferenceFiles(const vector<string>& referenceFiles) {
//do nothing
}
/**
* Process the given guessed text, corresponding to the given reference sindex
* and add the appropriate statistics to the entry.
**/
virtual void prepareStats(size_t sindex, const string& text, ScoreStats& entry)
{}
virtual void prepareStats(const string& sindex, const string& text, ScoreStats& entry) {
// cerr << sindex << endl;
this->prepareStats((size_t) atoi(sindex.c_str()), text, entry);
//cerr << text << std::endl;
}
/**
* Score using each of the candidate index, then go through the diffs
* applying each in turn, and calculating a new score each time.
**/
virtual void score(const candidates_t& candidates, const diffs_t& diffs,
statscores_t& scores) {
//dummy impl
if (!_scoreData) {
throw runtime_error("score data not loaded");
}
scores.push_back(0);
for (size_t i = 0; i < diffs.size(); ++i) {
scores.push_back(0);
}
}
/**
* Calculate the score of the sentences corresponding to the list of candidate
* indices. Each index indicates the 1-best choice from the n-best list.
**/
float score(const candidates_t& candidates) {
diffs_t diffs;
statscores_t scores;
score(candidates, diffs, scores);
return scores[0];
}
const string& getName() const {
return _name;
}
size_t getReferenceSize() {
if (_scoreData) {
return _scoreData->size();
}
return 0;
}
/**
* Set the score data, prior to scoring.
**/
virtual void setScoreData(ScoreData* data) {
_scoreData = data;
}
/**
* The scorer returns if it uses the reference alignment data
* for permutation distance scores
**/
virtual bool useAlignment() const {
//cout << "Scorer::useAlignment returning false " << endl;
return false;
};
//calculate the actual score
virtual statscore_t calculateScore(const vector<statscore_t>& totals) {
return 0;
};
protected:
typedef map<string,int> encodings_t;
typedef map<string,int>::iterator encodings_it;
ScoreData* _scoreData;
encodings_t _encodings;
bool _preserveCase;
/**
* Value of config variable. If not provided, return default.
**/
string getConfig(const string& key, const string& def="") {
map<string,string>::iterator i = _config.find(key);
if (i == _config.end()) {
return def;
} else {
return i->second;
}
}
/**
* Tokenise line and encode.
* Note: We assume that all tokens are separated by single spaces
**/
void encode(const string& line, vector<int>& encoded) {
//cerr << line << endl;
istringstream in (line);
string token;
while (in >> token) {
if (!_preserveCase) {
for (string::iterator i = token.begin(); i != token.end(); ++i) {
*i = tolower(*i);
}
}
encodings_it encoding = _encodings.find(token);
int encoded_token;
if (encoding == _encodings.end()) {
encoded_token = (int)_encodings.size();
_encodings[token] = encoded_token;
//cerr << encoded_token << "(n) ";
} else {
encoded_token = encoding->second;
//cerr << encoded_token << " ";
}
encoded.push_back(encoded_token);
}
//cerr << endl;
}
private:
map<string,string> _config;
}; };
@ -197,11 +204,12 @@ class Scorer {
/** /**
* Abstract base class for scorers that work by adding statistics across all * Abstract base class for scorers that work by adding statistics across all
* outout sentences, then apply some formula, e.g. bleu, per. **/ * outout sentences, then apply some formula, e.g. bleu, per. **/
class StatisticsBasedScorer : public Scorer { class StatisticsBasedScorer : public Scorer
{
public: public:
StatisticsBasedScorer(const string& name, const string& config): Scorer(name,config) { StatisticsBasedScorer(const string& name, const string& config): Scorer(name,config) {
//configure regularisation //configure regularisation
static string KEY_TYPE = "regtype"; static string KEY_TYPE = "regtype";
@ -212,105 +220,110 @@ class StatisticsBasedScorer : public Scorer {
static string TYPE_MINIMUM = "min"; static string TYPE_MINIMUM = "min";
static string TRUE = "true"; static string TRUE = "true";
static string FALSE = "false"; static string FALSE = "false";
string type = getConfig(KEY_TYPE,TYPE_NONE); string type = getConfig(KEY_TYPE,TYPE_NONE);
if (type == TYPE_NONE) { if (type == TYPE_NONE) {
_regularisationStrategy = REG_NONE; _regularisationStrategy = REG_NONE;
} else if (type == TYPE_AVERAGE) { } else if (type == TYPE_AVERAGE) {
_regularisationStrategy = REG_AVERAGE; _regularisationStrategy = REG_AVERAGE;
} else if (type == TYPE_MINIMUM) { } else if (type == TYPE_MINIMUM) {
_regularisationStrategy = REG_MINIMUM; _regularisationStrategy = REG_MINIMUM;
} else { } else {
throw runtime_error("Unknown scorer regularisation strategy: " + type); throw runtime_error("Unknown scorer regularisation strategy: " + type);
} }
cerr << "Using scorer regularisation strategy: " << type << endl; cerr << "Using scorer regularisation strategy: " << type << endl;
string window = getConfig(KEY_WINDOW,"0"); string window = getConfig(KEY_WINDOW,"0");
_regularisationWindow = atoi(window.c_str()); _regularisationWindow = atoi(window.c_str());
cerr << "Using scorer regularisation window: " << _regularisationWindow << endl; cerr << "Using scorer regularisation window: " << _regularisationWindow << endl;
string preservecase = getConfig(KEY_CASE,TRUE); string preservecase = getConfig(KEY_CASE,TRUE);
if (preservecase == TRUE) { if (preservecase == TRUE) {
_preserveCase = true; _preserveCase = true;
}else if (preservecase == FALSE) { } else if (preservecase == FALSE) {
_preserveCase = false; _preserveCase = false;
} }
cerr << "Using case preservation: " << _preserveCase << endl; cerr << "Using case preservation: " << _preserveCase << endl;
} }
~StatisticsBasedScorer(){}; ~StatisticsBasedScorer() {};
virtual void score(const candidates_t& candidates, const diffs_t& diffs, virtual void score(const candidates_t& candidates, const diffs_t& diffs,
statscores_t& scores); statscores_t& scores);
//calculate the actual score //calculate the actual score
virtual statscore_t calculateScore(const vector<statscore_t>& totals){return 0;}; virtual statscore_t calculateScore(const vector<statscore_t>& totals) {
return 0;
};
protected: protected:
//regularisation //regularisation
ScorerRegularisationStrategy _regularisationStrategy; ScorerRegularisationStrategy _regularisationStrategy;
size_t _regularisationWindow; size_t _regularisationWindow;
}; };
/** /**
* Abstract base class for scorers that work by using sentence level * Abstract base class for scorers that work by using sentence level
* statistics eg. permutation distance metrics **/ * statistics eg. permutation distance metrics **/
class SentenceLevelScorer : public Scorer { class SentenceLevelScorer : public Scorer
{
public: public:
SentenceLevelScorer(const string& name, const string& config): Scorer(name,config) { SentenceLevelScorer(const string& name, const string& config): Scorer(name,config) {
//configure regularisation //configure regularisation
static string KEY_TYPE = "regtype"; static string KEY_TYPE = "regtype";
static string KEY_WINDOW = "regwin"; static string KEY_WINDOW = "regwin";
static string KEY_CASE = "case"; static string KEY_CASE = "case";
static string TYPE_NONE = "none"; static string TYPE_NONE = "none";
static string TYPE_AVERAGE = "average"; static string TYPE_AVERAGE = "average";
static string TYPE_MINIMUM = "min"; static string TYPE_MINIMUM = "min";
static string TRUE = "true"; static string TRUE = "true";
static string FALSE = "false"; static string FALSE = "false";
string type = getConfig(KEY_TYPE,TYPE_NONE);
if (type == TYPE_NONE) {
_regularisationStrategy = REG_NONE;
} else if (type == TYPE_AVERAGE) {
_regularisationStrategy = REG_AVERAGE;
} else if (type == TYPE_MINIMUM) {
_regularisationStrategy = REG_MINIMUM;
} else {
throw runtime_error("Unknown scorer regularisation strategy: " + type);
}
cerr << "Using scorer regularisation strategy: " << type << endl;
string window = getConfig(KEY_WINDOW,"0"); string type = getConfig(KEY_TYPE,TYPE_NONE);
_regularisationWindow = atoi(window.c_str()); if (type == TYPE_NONE) {
cerr << "Using scorer regularisation window: " << _regularisationWindow << endl; _regularisationStrategy = REG_NONE;
} else if (type == TYPE_AVERAGE) {
string preservecase = getConfig(KEY_CASE,TRUE); _regularisationStrategy = REG_AVERAGE;
if (preservecase == TRUE) { } else if (type == TYPE_MINIMUM) {
_preserveCase = true; _regularisationStrategy = REG_MINIMUM;
}else if (preservecase == FALSE) { } else {
_preserveCase = false; throw runtime_error("Unknown scorer regularisation strategy: " + type);
} }
cerr << "Using case preservation: " << _preserveCase << endl; cerr << "Using scorer regularisation strategy: " << type << endl;
string window = getConfig(KEY_WINDOW,"0");
_regularisationWindow = atoi(window.c_str());
cerr << "Using scorer regularisation window: " << _regularisationWindow << endl;
string preservecase = getConfig(KEY_CASE,TRUE);
if (preservecase == TRUE) {
_preserveCase = true;
} else if (preservecase == FALSE) {
_preserveCase = false;
}
cerr << "Using case preservation: " << _preserveCase << endl;
} }
~SentenceLevelScorer(){}; ~SentenceLevelScorer() {};
virtual void score(const candidates_t& candidates, const diffs_t& diffs, virtual void score(const candidates_t& candidates, const diffs_t& diffs,
statscores_t& scores); statscores_t& scores);
//calculate the actual score //calculate the actual score
virtual statscore_t calculateScore(const vector<statscore_t>& totals){return 0;}; virtual statscore_t calculateScore(const vector<statscore_t>& totals) {
return 0;
};
protected: protected:
//regularisation //regularisation
ScorerRegularisationStrategy _regularisationStrategy; ScorerRegularisationStrategy _regularisationStrategy;
size_t _regularisationWindow; size_t _regularisationWindow;
}; };

View File

@ -19,43 +19,44 @@
using namespace std; using namespace std;
class ScorerFactory { class ScorerFactory
{
public: public:
vector<string> getTypes() { vector<string> getTypes() {
vector<string> types; vector<string> types;
types.push_back(string("BLEU1")); types.push_back(string("BLEU1"));
types.push_back(string("BLEU")); types.push_back(string("BLEU"));
types.push_back(string("PER")); types.push_back(string("PER"));
types.push_back(string("HAMMING")); types.push_back(string("HAMMING"));
types.push_back(string("KENDALL")); types.push_back(string("KENDALL"));
return types; return types;
}
Scorer* getScorer(const string& type, const string& config = "") {
size_t scorerTypes = type.find(",");
if(scorerTypes == string::npos) {
if (type == "BLEU1") {
string conf;
if (config.length() > 0) {
conf = config + ",ngramlen:1";
} else {
conf = config + "ngramlen:1";
} }
return (BleuScorer*) new BleuScorer(conf);
Scorer* getScorer(const string& type, const string& config = "") { } else if (type == "BLEU") {
size_t scorerTypes = type.find(","); return (BleuScorer*) new BleuScorer(config);
if(scorerTypes == string::npos) { } else if (type == "PER") {
if (type == "BLEU1") { return (PerScorer*) new PerScorer(config);
string conf; } else if ((type == "HAMMING") || (type == "KENDALL")) {
if (config.length() > 0) { return (PermutationScorer*) new PermutationScorer(type, config);
conf = config + ",ngramlen:1"; } else {
} else { throw runtime_error("Unknown scorer type: " + type);
conf = config + "ngramlen:1"; }
} } else {
return (BleuScorer*) new BleuScorer(conf); return (InterpolatedScorer*) new InterpolatedScorer(type, config);
} else if (type == "BLEU") { }
return (BleuScorer*) new BleuScorer(config); }
} else if (type == "PER") {
return (PerScorer*) new PerScorer(config);
} else if ((type == "HAMMING") || (type == "KENDALL")) {
return (PermutationScorer*) new PermutationScorer(type, config);
} else {
throw runtime_error("Unknown scorer type: " + type);
}
} else {
return (InterpolatedScorer*) new InterpolatedScorer(type, config);
}
}
}; };
#endif //__SCORER_FACTORY_H #endif //__SCORER_FACTORY_H

View File

@ -12,8 +12,8 @@
*/ */
double Timer::elapsed_time() double Timer::elapsed_time()
{ {
time_t now; time_t now;
time(&now); time(&now);
return difftime(now, start_time); return difftime(now, start_time);
} }
@ -36,7 +36,7 @@ double Timer::get_elapsed_time()
void Timer::start(const char* msg) void Timer::start(const char* msg)
{ {
// Print an optional message, something like "Starting timer t"; // Print an optional message, something like "Starting timer t";
if (msg) TRACE_ERR( msg << std::endl); if (msg) TRACE_ERR( msg << std::endl);
// Return immediately if the timer is already running // Return immediately if the timer is already running
if (running) return; if (running) return;

View File

@ -8,16 +8,16 @@
class Timer class Timer
{ {
friend std::ostream& operator<<(std::ostream& os, Timer& t); friend std::ostream& operator<<(std::ostream& os, Timer& t);
private: private:
bool running; bool running;
time_t start_time; time_t start_time;
//TODO in seconds? //TODO in seconds?
double elapsed_time(); double elapsed_time();
public: public:
/*** /***
* 'running' is initially false. A timer needs to be explicitly started * 'running' is initially false. A timer needs to be explicitly started
* using 'start' or 'restart' * using 'start' or 'restart'

View File

@ -1,7 +1,7 @@
/* /*
* Util.cpp * Util.cpp
* met - Minimum Error Training * met - Minimum Error Training
* *
* Created by Nicola Bertoldi on 13/05/08. * Created by Nicola Bertoldi on 13/05/08.
* *
*/ */
@ -18,47 +18,47 @@ Timer g_timer;
int verbose=0; int verbose=0;
int verboselevel(){ int verboselevel()
{
return verbose; return verbose;
} }
int setverboselevel(int v){ int setverboselevel(int v)
{
verbose=v; verbose=v;
return verbose; return verbose;
} }
int getNextPound(std::string &theString, std::string &substring, const std::string delimiter) int getNextPound(std::string &theString, std::string &substring, const std::string delimiter)
{ {
unsigned int pos = 0; unsigned int pos = 0;
//skip all occurrences of delimiter //skip all occurrences of delimiter
while ( pos == 0 ) while ( pos == 0 ) {
{ if ((pos = theString.find(delimiter)) != std::string::npos) {
if ((pos = theString.find(delimiter)) != std::string::npos){ substring.assign(theString, 0, pos);
substring.assign(theString, 0, pos); theString.erase(0,pos + delimiter.size());
theString.erase(0,pos + delimiter.size()); } else {
} substring.assign(theString);
else{ theString.assign("");
substring.assign(theString); }
theString.assign(""); }
} return (pos);
}
return (pos);
}; };
inputfilestream::inputfilestream(const std::string &filePath) inputfilestream::inputfilestream(const std::string &filePath)
: std::istream(0), : std::istream(0),
m_streambuf(0) m_streambuf(0)
{ {
//check if file is readable //check if file is readable
std::filebuf* fb = new std::filebuf(); std::filebuf* fb = new std::filebuf();
_good=(fb->open(filePath.c_str(), std::ios::in)!=NULL); _good=(fb->open(filePath.c_str(), std::ios::in)!=NULL);
if (filePath.size() > 3 && if (filePath.size() > 3 &&
filePath.substr(filePath.size() - 3, 3) == ".gz") filePath.substr(filePath.size() - 3, 3) == ".gz") {
{ fb->close();
fb->close(); delete fb; delete fb;
m_streambuf = new gzfilebuf(filePath.c_str()); m_streambuf = new gzfilebuf(filePath.c_str());
} else { } else {
m_streambuf = fb; m_streambuf = fb;
} }
@ -67,7 +67,8 @@ m_streambuf(0)
inputfilestream::~inputfilestream() inputfilestream::~inputfilestream()
{ {
delete m_streambuf; m_streambuf = 0; delete m_streambuf;
m_streambuf = 0;
} }
void inputfilestream::close() void inputfilestream::close()
@ -75,16 +76,15 @@ void inputfilestream::close()
} }
outputfilestream::outputfilestream(const std::string &filePath) outputfilestream::outputfilestream(const std::string &filePath)
: std::ostream(0), : std::ostream(0),
m_streambuf(0) m_streambuf(0)
{ {
//check if file is readable //check if file is readable
std::filebuf* fb = new std::filebuf(); std::filebuf* fb = new std::filebuf();
_good=(fb->open(filePath.c_str(), std::ios::out)!=NULL); _good=(fb->open(filePath.c_str(), std::ios::out)!=NULL);
if (filePath.size() > 3 && filePath.substr(filePath.size() - 3, 3) == ".gz") if (filePath.size() > 3 && filePath.substr(filePath.size() - 3, 3) == ".gz") {
{ throw runtime_error("Output to a zipped file not supported!");
throw runtime_error("Output to a zipped file not supported!");
} else { } else {
m_streambuf = fb; m_streambuf = fb;
} }
@ -93,7 +93,8 @@ m_streambuf(0)
outputfilestream::~outputfilestream() outputfilestream::~outputfilestream()
{ {
delete m_streambuf; m_streambuf = 0; delete m_streambuf;
m_streambuf = 0;
} }
void outputfilestream::close() void outputfilestream::close()
@ -103,10 +104,14 @@ void outputfilestream::close()
int swapbytes(char *p, int sz, int n) int swapbytes(char *p, int sz, int n)
{ {
char c, *l, *h; char c, *l, *h;
if((n<1) || (sz<2)) return 0; if((n<1) || (sz<2)) return 0;
for(; n--; p+=sz) for(h=(l=p)+sz; --h>l; l++) { c=*h; *h=*l; *l=c; } for(; n--; p+=sz) for(h=(l=p)+sz; --h>l; l++) {
return 0; c=*h;
*h=*l;
*l=c;
}
return 0;
}; };
@ -116,12 +121,12 @@ void ResetUserTime()
}; };
void PrintUserTime(const std::string &message) void PrintUserTime(const std::string &message)
{ {
g_timer.check(message.c_str()); g_timer.check(message.c_str());
} }
double GetUserTime() double GetUserTime()
{ {
return g_timer.get_elapsed_time(); return g_timer.get_elapsed_time();
} }

View File

@ -51,45 +51,49 @@ int getNextPound(std::string &theString, std::string &substring, const std::stri
template<typename T> template<typename T>
inline T Scan(const std::string &input) inline T Scan(const std::string &input)
{ {
std::stringstream stream(input); std::stringstream stream(input);
T ret; T ret;
stream >> ret; stream >> ret;
return ret; return ret;
}; };
class inputfilestream : public std::istream class inputfilestream : public std::istream
{ {
protected: protected:
std::streambuf *m_streambuf; std::streambuf *m_streambuf;
bool _good; bool _good;
public: public:
inputfilestream(const std::string &filePath); inputfilestream(const std::string &filePath);
~inputfilestream(); ~inputfilestream();
bool good(){return _good;} bool good() {
void close(); return _good;
}
void close();
}; };
class outputfilestream : public std::ostream class outputfilestream : public std::ostream
{ {
protected: protected:
std::streambuf *m_streambuf; std::streambuf *m_streambuf;
bool _good; bool _good;
public: public:
outputfilestream(const std::string &filePath); outputfilestream(const std::string &filePath);
~outputfilestream(); ~outputfilestream();
bool good(){return _good;} bool good() {
void close(); return _good;
}
void close();
}; };
template<typename T> template<typename T>
inline std::string stringify(T x) inline std::string stringify(T x)
{ {
std::ostringstream o; std::ostringstream o;
if (!(o << x)) if (!(o << x))
throw std::runtime_error("stringify(template<typename T>)"); throw std::runtime_error("stringify(template<typename T>)");
return o.str(); return o.str();
} }
// Utilities to measure decoding time // Utilities to measure decoding time
@ -99,11 +103,11 @@ double GetUserTime();
inline std::string trimStr(const std::string& Src, const std::string& c = " \r\n") inline std::string trimStr(const std::string& Src, const std::string& c = " \r\n")
{ {
unsigned int p2 = Src.find_last_not_of(c); unsigned int p2 = Src.find_last_not_of(c);
if (p2 == std::string::npos) return std::string(); if (p2 == std::string::npos) return std::string();
unsigned int p1 = Src.find_first_not_of(c); unsigned int p1 = Src.find_first_not_of(c);
if (p1 == std::string::npos) p1 = 0; if (p1 == std::string::npos) p1 = 0;
return Src.substr(p1, (p2-p1)+1); return Src.substr(p1, (p2-p1)+1);
} }

View File

@ -18,7 +18,8 @@
using namespace std; using namespace std;
void usage() { void usage()
{
cerr<<"usage: extractor [options])"<<endl; cerr<<"usage: extractor [options])"<<endl;
cerr<<"[--sctype|-s] the scorer type (default BLEU), possibly comma separated list of interpolated types"<<endl; cerr<<"[--sctype|-s] the scorer type (default BLEU), possibly comma separated list of interpolated types"<<endl;
cerr<<"[--scconfig|-c] configuration string passed to scorer"<<endl; cerr<<"[--scconfig|-c] configuration string passed to scorer"<<endl;
@ -28,7 +29,7 @@ void usage() {
cerr<<"[--nbest|-n] the nbest file"<<endl; cerr<<"[--nbest|-n] the nbest file"<<endl;
cerr<<"[--scfile|-S] the scorer data output file"<<endl; cerr<<"[--scfile|-S] the scorer data output file"<<endl;
cerr<<"[--ffile|-F] the feature data output file"<<endl; cerr<<"[--ffile|-F] the feature data output file"<<endl;
cerr<<"[--prev-ffile|-E] comma separated list of previous feature data" <<endl; cerr<<"[--prev-ffile|-E] comma separated list of previous feature data" <<endl;
cerr<<"[--prev-scfile|-R] comma separated list of previous scorer data"<<endl; cerr<<"[--prev-scfile|-R] comma separated list of previous scorer data"<<endl;
cerr<<"[-v] verbose level"<<endl; cerr<<"[-v] verbose level"<<endl;
cerr<<"[--help|-h] print this message and exit"<<endl; cerr<<"[--help|-h] print this message and exit"<<endl;
@ -36,185 +37,185 @@ cerr<<"[--prev-ffile|-E] comma separated list of previous feature data" <<endl;
} }
static struct option long_options[] = static struct option long_options[] = {
{ {"sctype",required_argument,0,'s'},
{"sctype",required_argument,0,'s'}, {"scconfig",required_argument,0,'c'},
{"scconfig",required_argument,0,'c'}, {"reference",required_argument,0,'r'},
{"reference",required_argument,0,'r'}, {"binary",no_argument,0,'b'},
{"binary",no_argument,0,'b'}, {"nbest",required_argument,0,'n'},
{"nbest",required_argument,0,'n'}, {"scfile",required_argument,0,'S'},
{"scfile",required_argument,0,'S'}, {"ffile",required_argument,0,'F'},
{"ffile",required_argument,0,'F'}, {"prev-scfile",required_argument,0,'R'},
{"prev-scfile",required_argument,0,'R'}, {"prev-ffile",required_argument,0,'E'},
{"prev-ffile",required_argument,0,'E'}, {"verbose",required_argument,0,'v'},
{"verbose",required_argument,0,'v'}, {"help",no_argument,0,'h'},
{"help",no_argument,0,'h'}, {0, 0, 0, 0}
{0, 0, 0, 0} };
};
int option_index; int option_index;
int main(int argc, char** argv) { int main(int argc, char** argv)
{
ResetUserTime();
ResetUserTime();
/*
Timer timer; /*
timer.start("Starting..."); Timer timer;
*/ timer.start("Starting...");
*/
//defaults
string scorerType("BLEU"); //defaults
string scorerConfig(""); string scorerType("BLEU");
string referenceFile(""); string scorerConfig("");
string nbestFile(""); string referenceFile("");
string scoreDataFile("statscore.data"); string nbestFile("");
string featureDataFile("features.data"); string scoreDataFile("statscore.data");
string prevScoreDataFile(""); string featureDataFile("features.data");
string prevFeatureDataFile(""); string prevScoreDataFile("");
bool binmode = false; string prevFeatureDataFile("");
int verbosity = 0; bool binmode = false;
int c; int verbosity = 0;
while ((c=getopt_long (argc,argv, "s:w:r:a:n:S:F:R:E:v:hb", long_options, &option_index)) != -1) { int c;
switch(c) { while ((c=getopt_long (argc,argv, "s:w:r:a:n:S:F:R:E:v:hb", long_options, &option_index)) != -1) {
case 's': switch(c) {
scorerType = string(optarg); case 's':
break; scorerType = string(optarg);
case 'c': break;
scorerConfig = string(optarg); case 'c':
break; scorerConfig = string(optarg);
case 'r': break;
referenceFile = string(optarg); case 'r':
break; referenceFile = string(optarg);
case 'b': break;
binmode = true; case 'b':
break; binmode = true;
case 'n': break;
nbestFile = string(optarg); case 'n':
break; nbestFile = string(optarg);
case 'S': break;
scoreDataFile = string(optarg); case 'S':
break; scoreDataFile = string(optarg);
case 'F': break;
featureDataFile = string(optarg); case 'F':
break; featureDataFile = string(optarg);
case 'E': break;
prevFeatureDataFile = string(optarg); case 'E':
break; prevFeatureDataFile = string(optarg);
case 'R': break;
prevScoreDataFile = string(optarg); case 'R':
break; prevScoreDataFile = string(optarg);
case 'v': break;
verbosity = atoi(optarg); case 'v':
break; verbosity = atoi(optarg);
default: break;
usage(); default:
} usage();
} }
try { }
try {
//check whether score statistics file is specified //check whether score statistics file is specified
if (scoreDataFile.length() == 0){ if (scoreDataFile.length() == 0) {
throw runtime_error("Error: output score statistics file is not specified"); throw runtime_error("Error: output score statistics file is not specified");
} }
//check whether feature file is specified //check whether feature file is specified
if (featureDataFile.length() == 0){ if (featureDataFile.length() == 0) {
throw runtime_error("Error: output feature file is not specified"); throw runtime_error("Error: output feature file is not specified");
} }
//check whether reference file is specified when nbest is specified //check whether reference file is specified when nbest is specified
if ((nbestFile.length() > 0 && referenceFile.length() == 0)){ if ((nbestFile.length() > 0 && referenceFile.length() == 0)) {
throw runtime_error("Error: reference file is not specified; you can not score the nbest"); throw runtime_error("Error: reference file is not specified; you can not score the nbest");
} }
vector<string> nbestFiles; vector<string> nbestFiles;
if (nbestFile.length() > 0){ if (nbestFile.length() > 0) {
std::string substring; std::string substring;
while (!nbestFile.empty()){ while (!nbestFile.empty()) {
getNextPound(nbestFile, substring, ","); getNextPound(nbestFile, substring, ",");
nbestFiles.push_back(substring); nbestFiles.push_back(substring);
} }
} }
vector<string> referenceFiles; vector<string> referenceFiles;
if (referenceFile.length() > 0){ if (referenceFile.length() > 0) {
std::string substring; std::string substring;
while (!referenceFile.empty()){ while (!referenceFile.empty()) {
getNextPound(referenceFile, substring, ","); getNextPound(referenceFile, substring, ",");
referenceFiles.push_back(substring); referenceFiles.push_back(substring);
} }
} }
vector<string> prevScoreDataFiles; vector<string> prevScoreDataFiles;
if (prevScoreDataFile.length() > 0){ if (prevScoreDataFile.length() > 0) {
std::string substring; std::string substring;
while (!prevScoreDataFile.empty()){ while (!prevScoreDataFile.empty()) {
getNextPound(prevScoreDataFile, substring, ","); getNextPound(prevScoreDataFile, substring, ",");
prevScoreDataFiles.push_back(substring); prevScoreDataFiles.push_back(substring);
} }
} }
vector<string> prevFeatureDataFiles; vector<string> prevFeatureDataFiles;
if (prevFeatureDataFile.length() > 0){ if (prevFeatureDataFile.length() > 0) {
std::string substring; std::string substring;
while (!prevFeatureDataFile.empty()){ while (!prevFeatureDataFile.empty()) {
getNextPound(prevFeatureDataFile, substring, ","); getNextPound(prevFeatureDataFile, substring, ",");
prevFeatureDataFiles.push_back(substring); prevFeatureDataFiles.push_back(substring);
} }
} }
if (prevScoreDataFiles.size() != prevFeatureDataFiles.size()){ if (prevScoreDataFiles.size() != prevFeatureDataFiles.size()) {
throw runtime_error("Error: there is a different number of previous score and feature files"); throw runtime_error("Error: there is a different number of previous score and feature files");
} }
if (binmode) cerr << "Binary write mode is selected" << endl;
else cerr << "Binary write mode is NOT selected" << endl;
//TODO is comma separated list? split and create a scorer with multiple parts
TRACE_ERR("Scorer type: " << scorerType << endl);
ScorerFactory sfactory;
Scorer* scorer = sfactory.getScorer(scorerType,scorerConfig);
//load references
if (referenceFiles.size() > 0)
scorer->setReferenceFiles(referenceFiles);
PrintUserTime("References loaded"); if (binmode) cerr << "Binary write mode is selected" << endl;
else cerr << "Binary write mode is NOT selected" << endl;
Data data(*scorer);
//load old data
for (size_t i=0;i < prevScoreDataFiles.size(); i++){
data.load(prevFeatureDataFiles.at(i), prevScoreDataFiles.at(i));
}
PrintUserTime("Previous data loaded");
//computing score statistics of each nbest file
for (size_t i=0;i < nbestFiles.size(); i++){
data.loadnbest(nbestFiles.at(i));
}
PrintUserTime("Nbest entries loaded and scored"); //TODO is comma separated list? split and create a scorer with multiple parts
TRACE_ERR("Scorer type: " << scorerType << endl);
if (binmode) ScorerFactory sfactory;
cerr << "Binary write mode is selected" << endl; Scorer* scorer = sfactory.getScorer(scorerType,scorerConfig);
else
cerr << "Binary write mode is NOT selected" << endl; //load references
if (referenceFiles.size() > 0)
data.save(featureDataFile, scoreDataFile, binmode); scorer->setReferenceFiles(referenceFiles);
PrintUserTime("Stopping...");
/* PrintUserTime("References loaded");
timer.stop("Stopping...");
*/ Data data(*scorer);
return EXIT_SUCCESS; //load old data
} catch (const exception& e) { for (size_t i=0; i < prevScoreDataFiles.size(); i++) {
cerr << "Exception: " << e.what() << endl; data.load(prevFeatureDataFiles.at(i), prevScoreDataFiles.at(i));
return EXIT_FAILURE;
} }
PrintUserTime("Previous data loaded");
//computing score statistics of each nbest file
for (size_t i=0; i < nbestFiles.size(); i++) {
data.loadnbest(nbestFiles.at(i));
}
PrintUserTime("Nbest entries loaded and scored");
if (binmode)
cerr << "Binary write mode is selected" << endl;
else
cerr << "Binary write mode is NOT selected" << endl;
data.save(featureDataFile, scoreDataFile, binmode);
PrintUserTime("Stopping...");
/*
timer.stop("Stopping...");
*/
return EXIT_SUCCESS;
} catch (const exception& e) {
cerr << "Exception: " << e.what() << endl;
return EXIT_FAILURE;
}
} }

View File

@ -4,66 +4,70 @@
#include <streambuf> #include <streambuf>
#include <zlib.h> #include <zlib.h>
class gzfilebuf : public std::streambuf { class gzfilebuf : public std::streambuf
{
public: public:
gzfilebuf(const char *filename) gzfilebuf(const char *filename) {
{ _gzf = gzopen(filename, "rb"); _gzf = gzopen(filename, "rb");
setg (_buff+sizeof(int), // beginning of putback area setg (_buff+sizeof(int), // beginning of putback area
_buff+sizeof(int), // read position _buff+sizeof(int), // read position
_buff+sizeof(int)); // end position _buff+sizeof(int)); // end position
} }
~gzfilebuf() { gzclose(_gzf); } ~gzfilebuf() {
gzclose(_gzf);
}
protected: protected:
virtual int_type overflow (int_type c) { virtual int_type overflow (int_type c) {
throw; throw;
} }
// write multiple characters // write multiple characters
virtual virtual
std::streamsize xsputn (const char* s, std::streamsize xsputn (const char* s,
std::streamsize num) { std::streamsize num) {
throw; throw;
} }
virtual std::streampos seekpos ( std::streampos sp, std::ios_base::openmode which = std::ios_base::in | std::ios_base::out ){ throw; virtual std::streampos seekpos ( std::streampos sp, std::ios_base::openmode which = std::ios_base::in | std::ios_base::out ) {
throw;
} }
//read one character //read one character
virtual int_type underflow () { virtual int_type underflow () {
// is read position before end of _buff? // is read position before end of _buff?
if (gptr() < egptr()) { if (gptr() < egptr()) {
return traits_type::to_int_type(*gptr()); return traits_type::to_int_type(*gptr());
} }
/* process size of putback area /* process size of putback area
* - use number of characters read * - use number of characters read
* - but at most four * - but at most four
*/ */
unsigned int numPutback = gptr() - eback(); unsigned int numPutback = gptr() - eback();
if (numPutback > sizeof(int)) { if (numPutback > sizeof(int)) {
numPutback = sizeof(int); numPutback = sizeof(int);
} }
/* copy up to four characters previously read into /* copy up to four characters previously read into
* the putback _buff (area of first four characters) * the putback _buff (area of first four characters)
*/ */
std::memmove (_buff+(sizeof(int)-numPutback), gptr()-numPutback, std::memmove (_buff+(sizeof(int)-numPutback), gptr()-numPutback,
numPutback); numPutback);
// read new characters // read new characters
int num = gzread(_gzf, _buff+sizeof(int), _buffsize-sizeof(int)); int num = gzread(_gzf, _buff+sizeof(int), _buffsize-sizeof(int));
if (num <= 0) { if (num <= 0) {
// ERROR or EOF // ERROR or EOF
return EOF; return EOF;
} }
// reset _buff pointers // reset _buff pointers
setg (_buff+(sizeof(int)-numPutback), // beginning of putback area setg (_buff+(sizeof(int)-numPutback), // beginning of putback area
_buff+sizeof(int), // read position _buff+sizeof(int), // read position
_buff+sizeof(int)+num); // end of buffer _buff+sizeof(int)+num); // end of buffer
// return next character // return next character
return traits_type::to_int_type(*gptr()); return traits_type::to_int_type(*gptr());
} }
std::streamsize xsgetn (char* s, std::streamsize xsgetn (char* s,

View File

@ -28,7 +28,8 @@ float min_interval = 1e-3;
using namespace std; using namespace std;
void usage(void) { void usage(void)
{
cerr<<"usage: mert -d <dimensions> (mandatory )"<<endl; cerr<<"usage: mert -d <dimensions> (mandatory )"<<endl;
cerr<<"[-n retry ntimes (default 1)]"<<endl; cerr<<"[-n retry ntimes (default 1)]"<<endl;
cerr<<"[-o\tthe indexes to optimize(default all)]"<<endl; cerr<<"[-o\tthe indexes to optimize(default all)]"<<endl;
@ -44,34 +45,34 @@ void usage(void) {
exit(1); exit(1);
} }
static struct option long_options[] = static struct option long_options[] = {
{ {"pdim", 1, 0, 'd'},
{"pdim", 1, 0, 'd'}, {"ntry",1,0,'n'},
{"ntry",1,0,'n'}, {"rseed",required_argument,0,'r'},
{"rseed",required_argument,0,'r'}, {"optimize",1,0,'o'},
{"optimize",1,0,'o'}, {"type",1,0,'t'},
{"type",1,0,'t'}, {"sctype",1,0,'s'},
{"sctype",1,0,'s'}, {"scconfig",required_argument,0,'c'},
{"scconfig",required_argument,0,'c'}, {"scfile",1,0,'S'},
{"scfile",1,0,'S'}, {"ffile",1,0,'F'},
{"ffile",1,0,'F'}, {"ifile",1,0,'i'},
{"ifile",1,0,'i'}, {"verbose",1,0,'v'},
{"verbose",1,0,'v'}, {"help",no_argument,0,'h'},
{"help",no_argument,0,'h'}, {0, 0, 0, 0}
{0, 0, 0, 0} };
};
int option_index; int option_index;
int main (int argc, char **argv) { int main (int argc, char **argv)
{
ResetUserTime();
ResetUserTime();
/*
Timer timer; /*
timer.start("Starting..."); Timer timer;
*/ timer.start("Starting...");
*/
int c,pdim,i; int c,pdim,i;
pdim=-1; pdim=-1;
int ntry=1; int ntry=1;
@ -132,23 +133,23 @@ int main (int argc, char **argv) {
usage(); usage();
if (hasSeed) { if (hasSeed) {
cerr << "Seeding random numbers with " << seed << endl; cerr << "Seeding random numbers with " << seed << endl;
srandom(seed); srandom(seed);
} else { } else {
cerr << "Seeding random numbers with system clock " << endl; cerr << "Seeding random numbers with system clock " << endl;
srandom(time(NULL)); srandom(time(NULL));
} }
ifstream opt(initfile.c_str()); ifstream opt(initfile.c_str());
if(opt.fail()){ if(opt.fail()) {
cerr<<"could not open initfile: " << initfile << endl; cerr<<"could not open initfile: " << initfile << endl;
exit(3); exit(3);
} }
start.resize(pdim);//to do:read from file start.resize(pdim);//to do:read from file
int j; int j;
for( j=0;j<pdim&&!opt.fail();j++) for( j=0; j<pdim&&!opt.fail(); j++)
opt>>start[j]; opt>>start[j];
if(j<pdim){ if(j<pdim) {
cerr<<"error could not initialize start point with " << initfile << endl; cerr<<"error could not initialize start point with " << initfile << endl;
exit(3); exit(3);
} }
@ -156,24 +157,24 @@ int main (int argc, char **argv) {
opt.close(); opt.close();
vector<string> ScoreDataFiles; vector<string> ScoreDataFiles;
if (scorerfile.length() > 0){ if (scorerfile.length() > 0) {
std::string substring; std::string substring;
while (!scorerfile.empty()){ while (!scorerfile.empty()) {
getNextPound(scorerfile, substring, ","); getNextPound(scorerfile, substring, ",");
ScoreDataFiles.push_back(substring); ScoreDataFiles.push_back(substring);
} }
} }
vector<string> FeatureDataFiles; vector<string> FeatureDataFiles;
if (featurefile.length() > 0){ if (featurefile.length() > 0) {
std::string substring; std::string substring;
while (!featurefile.empty()){ while (!featurefile.empty()) {
getNextPound(featurefile, substring, ","); getNextPound(featurefile, substring, ",");
FeatureDataFiles.push_back(substring); FeatureDataFiles.push_back(substring);
} }
} }
if (ScoreDataFiles.size() != FeatureDataFiles.size()){ if (ScoreDataFiles.size() != FeatureDataFiles.size()) {
throw runtime_error("Error: there is a different number of previous score and feature files"); throw runtime_error("Error: there is a different number of previous score and feature files");
} }
@ -183,32 +184,37 @@ int main (int argc, char **argv) {
//load data //load data
Data D(*TheScorer); Data D(*TheScorer);
for (size_t i=0;i < ScoreDataFiles.size(); i++){ for (size_t i=0; i < ScoreDataFiles.size(); i++) {
cerr<<"Loading Data from: "<< ScoreDataFiles.at(i) << " and " << FeatureDataFiles.at(i) << endl; cerr<<"Loading Data from: "<< ScoreDataFiles.at(i) << " and " << FeatureDataFiles.at(i) << endl;
D.load(FeatureDataFiles.at(i), ScoreDataFiles.at(i)); D.load(FeatureDataFiles.at(i), ScoreDataFiles.at(i));
} }
PrintUserTime("Data loaded"); PrintUserTime("Data loaded");
if (tooptimizestr.length() > 0){ if (tooptimizestr.length() > 0) {
cerr << "Weights to optimize: " << tooptimizestr << endl; cerr << "Weights to optimize: " << tooptimizestr << endl;
//parse string to get weights to optimize //parse string to get weights to optimize
//and set them as active //and set them as active
std::string substring; std::string substring;
int index; int index;
while (!tooptimizestr.empty()){ while (!tooptimizestr.empty()) {
getNextPound(tooptimizestr, substring, ","); getNextPound(tooptimizestr, substring, ",");
index = D.getFeatureIndex(substring); index = D.getFeatureIndex(substring);
cerr << "FeatNameIndex:" << index << " to insert" << endl; cerr << "FeatNameIndex:" << index << " to insert" << endl;
//index = strtol(substring.c_str(), NULL, 10); //index = strtol(substring.c_str(), NULL, 10);
if (index >= 0 && index < pdim){ tooptimize.push_back(index); } if (index >= 0 && index < pdim) {
else{ cerr << "Index " << index << " is out of bounds. Allowed indexes are [0," << (pdim-1) << "]." << endl; } tooptimize.push_back(index);
} else {
cerr << "Index " << index << " is out of bounds. Allowed indexes are [0," << (pdim-1) << "]." << endl;
}
} }
}else{ } else {
//set all weights as active //set all weights as active
tooptimize.resize(pdim);//We'll optimize on everything tooptimize.resize(pdim);//We'll optimize on everything
for(int i=0;i<pdim;i++){ tooptimize[i]=1; } for(int i=0; i<pdim; i++) {
tooptimize[i]=1;
}
} }
Optimizer *O=OptimizerFactory::BuildOptimizer(pdim,tooptimize,start,type); Optimizer *O=OptimizerFactory::BuildOptimizer(pdim,tooptimize,start,type);
@ -216,51 +222,51 @@ int main (int argc, char **argv) {
O->SetFData(D.getFeatureData()); O->SetFData(D.getFeatureData());
Point P(start);//Generate from the full feature set. Warning: must be done after Optimizer initialization Point P(start);//Generate from the full feature set. Warning: must be done after Optimizer initialization
statscore_t best=O->Run(P); statscore_t best=O->Run(P);
Point bestP=P; Point bestP=P;
statscore_t mean=best; statscore_t mean=best;
statscore_t var=best*best; statscore_t var=best*best;
stringstream oss; stringstream oss;
oss << "Try number 1"; oss << "Try number 1";
PrintUserTime(oss.str()); PrintUserTime(oss.str());
vector<parameter_t> min(Point::getdim()); vector<parameter_t> min(Point::getdim());
vector<parameter_t> max(Point::getdim()); vector<parameter_t> max(Point::getdim());
for(unsigned int d=0;d<Point::getdim();d++){ for(unsigned int d=0; d<Point::getdim(); d++) {
min[d]=0.0; min[d]=0.0;
max[d]=1.0; max[d]=1.0;
} }
//note: those mins and max are the bound for the starting points of the algorithm, not strict bound on the result! //note: those mins and max are the bound for the starting points of the algorithm, not strict bound on the result!
for(int i=1;i<ntry;i++){
P.Randomize(min,max);
statscore_t score=O->Run(P);
if(score>best){
best=score;
bestP=P;
}
mean+=score;
var+=(score*score);
oss.str("");
oss << "Try number " << (i+1);
PrintUserTime(oss.str());
}
mean/=(float)ntry;
var/=(float)ntry;
var=sqrt(abs(var-mean*mean));
if (verboselevel()>1)
cerr<<"best score: "<< best << " variance of the score (for "<<ntry<<" try): "<<var<<endl;
//L1-Normalization of the best Point
bestP.NormalizeL1(); for(int i=1; i<ntry; i++) {
P.Randomize(min,max);
cerr << "Best point: " << bestP << " => " << best << endl; statscore_t score=O->Run(P);
ofstream res("weights.txt"); if(score>best) {
res<<bestP<<endl; best=score;
bestP=P;
PrintUserTime("Stopping..."); }
mean+=score;
var+=(score*score);
oss.str("");
oss << "Try number " << (i+1);
PrintUserTime(oss.str());
}
mean/=(float)ntry;
var/=(float)ntry;
var=sqrt(abs(var-mean*mean));
if (verboselevel()>1)
cerr<<"best score: "<< best << " variance of the score (for "<<ntry<<" try): "<<var<<endl;
//L1-Normalization of the best Point
bestP.NormalizeL1();
cerr << "Best point: " << bestP << " => " << best << endl;
ofstream res("weights.txt");
res<<bestP<<endl;
PrintUserTime("Stopping...");
} }

View File

@ -8,52 +8,53 @@
using namespace std; using namespace std;
int main(int argc, char** argv) { int main(int argc, char** argv)
cout << "Testing the scorer" << endl; {
//BleuScorer bs("test-scorer-data/cppstats.feats.opt");; cout << "Testing the scorer" << endl;
vector<string> references; //BleuScorer bs("test-scorer-data/cppstats.feats.opt");;
references.push_back("test_scorer_data/reference.txt"); vector<string> references;
//bs.prepare(references, "test-scorer-data/nbest.out"); references.push_back("test_scorer_data/reference.txt");
Scorer* scorer = new BleuScorer();; //bs.prepare(references, "test-scorer-data/nbest.out");
scorer->setReferenceFiles(references); Scorer* scorer = new BleuScorer();;
Data d(*scorer); scorer->setReferenceFiles(references);
d.loadnbest("test_scorer_data/nbest.out"); Data d(*scorer);
//sd.savetxt(); d.loadnbest("test_scorer_data/nbest.out");
//sd.savetxt();
//calculate two bleu scores, nbest and a diff //calculate two bleu scores, nbest and a diff
ScoreData* sd=d.getScoreData(); ScoreData* sd=d.getScoreData();
scorer->setScoreData(sd); scorer->setScoreData(sd);
candidates_t candidates(sd->size());; candidates_t candidates(sd->size());;
for (size_t i = 0; i < sd->size(); ++i) { for (size_t i = 0; i < sd->size(); ++i) {
sd->get(i,0).savetxt("/dev/stdout"); sd->get(i,0).savetxt("/dev/stdout");
} }
diffs_t diffs; diffs_t diffs;
diff_t diff; diff_t diff;
diff.push_back(make_pair(1,2)); diff.push_back(make_pair(1,2));
diff.push_back(make_pair(7,8)); diff.push_back(make_pair(7,8));
diffs.push_back(diff); diffs.push_back(diff);
statscores_t scores;
scorer->score(candidates,diffs,scores);
cout << "Bleus: " << scores[0] << " " << scores[1] << endl; statscores_t scores;
scorer->score(candidates,diffs,scores);
//try the per cout << "Bleus: " << scores[0] << " " << scores[1] << endl;
scorer = new PerScorer();
Data pd(*scorer);
scorer->setReferenceFiles(references);
pd.loadnbest("test_scorer_data/nbest.out"); //try the per
//sd.savetxt(); scorer = new PerScorer();
Data pd(*scorer);
scorer->setReferenceFiles(references);
ScoreData* psd=pd.getScoreData(); pd.loadnbest("test_scorer_data/nbest.out");
scorer->setScoreData(psd); //sd.savetxt();
for (size_t i = 0; i < psd->size(); ++i) {
psd->get(i,0).savetxt("/dev/stdout"); ScoreData* psd=pd.getScoreData();
} scorer->setScoreData(psd);
for (size_t i = 0; i < psd->size(); ++i) {
psd->get(i,0).savetxt("/dev/stdout");
}
cout << "PER: " << scorer->score(candidates) << endl; cout << "PER: " << scorer->score(candidates) << endl;
} }