run beautify.perl. Consistent formatting for .h & .cpp files in Mert directory

git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/branches/mert-mtm5@4167 1f5c12ca-751b-0410-a591-d2e778427230
This commit is contained in:
machacekmatous 2011-08-29 14:08:17 +00:00
parent 8b97df9367
commit 0a44787f55
39 changed files with 3122 additions and 2863 deletions

View File

@ -1,199 +1,204 @@
#include "BleuScorer.h"
/**
 * Construct a BLEU scorer from a configuration string.
 *
 * Two settings are read through getConfig():
 *  - "reflen":   reference length strategy, one of "average", "shortest"
 *                or "closest" (the default);
 *  - "ngramlen": maximum ngram order, default "4".
 *
 * Throws runtime_error for an unknown strategy or an unusable ngram order.
 **/
BleuScorer::BleuScorer(const string& config = "") : StatisticsBasedScorer("BLEU",config),_refLengthStrategy(BLEU_CLOSEST)
{
  //configure regularisation
  static string KEY_REFLEN = "reflen";
  static string REFLEN_AVERAGE = "average";
  static string REFLEN_SHORTEST = "shortest";
  static string REFLEN_CLOSEST = "closest";

  string reflen = getConfig(KEY_REFLEN,REFLEN_CLOSEST);
  if (reflen == REFLEN_AVERAGE) {
    _refLengthStrategy = BLEU_AVERAGE;
  } else if (reflen == REFLEN_SHORTEST) {
    _refLengthStrategy = BLEU_SHORTEST;
  } else if (reflen == REFLEN_CLOSEST) {
    _refLengthStrategy = BLEU_CLOSEST;
  } else {
    throw runtime_error("Unknown reference length strategy: " + reflen);
  }
  cerr << "Using reference length strategy: " << reflen << endl;

  static string KEY_NGRAMS = "ngramlen";
  string ngramlen = getConfig(KEY_NGRAMS,"4");

  // LENGTH sizes the statistics vector (2*LENGTH+1 entries) and is the
  // divisor of the log-precision average, so refuse anything strtol could
  // not parse, anything below 1, and anything that would overflow 2*LENGTH+1.
  char* parse_end = NULL;
  const long order = strtol(ngramlen.c_str(), &parse_end, 10);
  if (parse_end == ngramlen.c_str() || order < 1 || order > (INT_MAX - 1) / 2) {
    throw runtime_error("Invalid ngram length: " + ngramlen);
  }
  LENGTH = static_cast<int>(order);
}
/**
* count the ngrams of each type, up to the given length in the input line.
**/
size_t BleuScorer::countNgrams(const string& line, counts_t& counts, unsigned int n) {
vector<int> encoded_tokens;
//cerr << line << endl;
encode(line,encoded_tokens);
//copy(encoded_tokens.begin(), encoded_tokens.end(), ostream_iterator<int>(cerr," "));
//cerr << endl;
for (size_t k = 1; k <= n; ++k) {
//ngram order longer than sentence - no point
if (k > encoded_tokens.size()) {
continue;
}
for (size_t i = 0; i < encoded_tokens.size()-k+1; ++i) {
vector<int> ngram;
for (size_t j = i; j < i+k && j < encoded_tokens.size(); ++j) {
ngram.push_back(encoded_tokens[j]);
}
int count = 1;
counts_it oldcount = counts.find(ngram);
if (oldcount != counts.end()) {
count = (oldcount->second) + 1;
}
//cerr << count << endl;
counts[ngram] = count;
//cerr << endl;
}
}
//cerr << "counted ngrams" << endl;
//dump_counts(counts);
return encoded_tokens.size();
size_t BleuScorer::countNgrams(const string& line, counts_t& counts, unsigned int n)
{
vector<int> encoded_tokens;
//cerr << line << endl;
encode(line,encoded_tokens);
//copy(encoded_tokens.begin(), encoded_tokens.end(), ostream_iterator<int>(cerr," "));
//cerr << endl;
for (size_t k = 1; k <= n; ++k) {
//ngram order longer than sentence - no point
if (k > encoded_tokens.size()) {
continue;
}
for (size_t i = 0; i < encoded_tokens.size()-k+1; ++i) {
vector<int> ngram;
for (size_t j = i; j < i+k && j < encoded_tokens.size(); ++j) {
ngram.push_back(encoded_tokens[j]);
}
int count = 1;
counts_it oldcount = counts.find(ngram);
if (oldcount != counts.end()) {
count = (oldcount->second) + 1;
}
//cerr << count << endl;
counts[ngram] = count;
//cerr << endl;
}
}
//cerr << "counted ngrams" << endl;
//dump_counts(counts);
return encoded_tokens.size();
}
void BleuScorer::setReferenceFiles(const vector<string>& referenceFiles) {
//make sure reference data is clear
_refcounts.clear();
_reflengths.clear();
_encodings.clear();
void BleuScorer::setReferenceFiles(const vector<string>& referenceFiles)
{
//make sure reference data is clear
_refcounts.clear();
_reflengths.clear();
_encodings.clear();
//load reference data
for (size_t i = 0; i < referenceFiles.size(); ++i) {
TRACE_ERR("Loading reference from " << referenceFiles[i] << endl);
ifstream refin(referenceFiles[i].c_str());
if (!refin) {
throw runtime_error("Unable to open: " + referenceFiles[i]);
}
string line;
size_t sid = 0; //sentence counter
while (getline(refin,line)) {
//cerr << line << endl;
if (i == 0) {
counts_t* counts = new counts_t(); //these get leaked
_refcounts.push_back(counts);
vector<size_t> lengths;
_reflengths.push_back(lengths);
}
if (_refcounts.size() <= sid) {
throw runtime_error("File " + referenceFiles[i] + " has too many sentences");
}
counts_t counts;
size_t length = countNgrams(line,counts,LENGTH);
//for any counts larger than those already there, merge them in
for (counts_it ci = counts.begin(); ci != counts.end(); ++ci) {
counts_it oldcount_it = _refcounts[sid]->find(ci->first);
int oldcount = 0;
if (oldcount_it != _refcounts[sid]->end()) {
oldcount = oldcount_it->second;
}
int newcount = ci->second;
if (newcount > oldcount) {
_refcounts[sid]->operator[](ci->first) = newcount;
}
}
//add in the length
_reflengths[sid].push_back(length);
if (sid > 0 && sid % 100 == 0) {
TRACE_ERR(".");
}
++sid;
}
refin.close();
TRACE_ERR(endl);
}
//load reference data
for (size_t i = 0; i < referenceFiles.size(); ++i) {
TRACE_ERR("Loading reference from " << referenceFiles[i] << endl);
ifstream refin(referenceFiles[i].c_str());
if (!refin) {
throw runtime_error("Unable to open: " + referenceFiles[i]);
}
string line;
size_t sid = 0; //sentence counter
while (getline(refin,line)) {
//cerr << line << endl;
if (i == 0) {
counts_t* counts = new counts_t(); //these get leaked
_refcounts.push_back(counts);
vector<size_t> lengths;
_reflengths.push_back(lengths);
}
if (_refcounts.size() <= sid) {
throw runtime_error("File " + referenceFiles[i] + " has too many sentences");
}
counts_t counts;
size_t length = countNgrams(line,counts,LENGTH);
//for any counts larger than those already there, merge them in
for (counts_it ci = counts.begin(); ci != counts.end(); ++ci) {
counts_it oldcount_it = _refcounts[sid]->find(ci->first);
int oldcount = 0;
if (oldcount_it != _refcounts[sid]->end()) {
oldcount = oldcount_it->second;
}
int newcount = ci->second;
if (newcount > oldcount) {
_refcounts[sid]->operator[](ci->first) = newcount;
}
}
//add in the length
_reflengths[sid].push_back(length);
if (sid > 0 && sid % 100 == 0) {
TRACE_ERR(".");
}
++sid;
}
refin.close();
TRACE_ERR(endl);
}
}
void BleuScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry) {
void BleuScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
{
// cerr << text << endl;
// cerr << sid << endl;
//dump_counts(*_refcounts[sid]);
if (sid >= _refcounts.size()) {
stringstream msg;
msg << "Sentence id (" << sid << ") not found in reference set";
throw runtime_error(msg.str());
}
counts_t testcounts;
//stats for this line
vector<float> stats(LENGTH*2);;
size_t length = countNgrams(text,testcounts,LENGTH);
//dump_counts(testcounts);
if (_refLengthStrategy == BLEU_SHORTEST) {
//cerr << reflengths.size() << " " << sid << endl;
int shortest = *min_element(_reflengths[sid].begin(),_reflengths[sid].end());
stats.push_back(shortest);
} else if (_refLengthStrategy == BLEU_AVERAGE) {
int total = 0;
for (size_t i = 0; i < _reflengths[sid].size(); ++i) {
total += _reflengths[sid][i];
}
float mean = (float)total/_reflengths[sid].size();
stats.push_back(mean);
} else if (_refLengthStrategy == BLEU_CLOSEST) {
int min_diff = INT_MAX;
int min_idx = 0;
for (size_t i = 0; i < _reflengths[sid].size(); ++i) {
int reflength = _reflengths[sid][i];
if (abs(reflength-(int)length) < abs(min_diff)) { //look for the closest reference
min_diff = reflength-length;
min_idx = i;
}else if (abs(reflength-(int)length) == abs(min_diff)) { // if two references has the same closest length, take the shortest
if (reflength < (int)_reflengths[sid][min_idx]){
min_idx = i;
}
}
}
stats.push_back(_reflengths[sid][min_idx]);
} else {
throw runtime_error("Unsupported reflength strategy");
}
//cerr << "computed length" << endl;
//precision on each ngram type
for (counts_it testcounts_it = testcounts.begin();
testcounts_it != testcounts.end(); ++testcounts_it) {
counts_it refcounts_it = _refcounts[sid]->find(testcounts_it->first);
int correct = 0;
int guess = testcounts_it->second;
if (refcounts_it != _refcounts[sid]->end()) {
correct = min(refcounts_it->second,guess);
}
size_t len = testcounts_it->first.size();
stats[len*2-2] += correct;
stats[len*2-1] += guess;
}
stringstream sout;
copy(stats.begin(),stats.end(),ostream_iterator<float>(sout," "));
//TRACE_ERR(sout.str() << endl);
string stats_str = sout.str();
entry.set(stats_str);
//dump_counts(*_refcounts[sid]);
if (sid >= _refcounts.size()) {
stringstream msg;
msg << "Sentence id (" << sid << ") not found in reference set";
throw runtime_error(msg.str());
}
counts_t testcounts;
//stats for this line
vector<float> stats(LENGTH*2);;
size_t length = countNgrams(text,testcounts,LENGTH);
//dump_counts(testcounts);
if (_refLengthStrategy == BLEU_SHORTEST) {
//cerr << reflengths.size() << " " << sid << endl;
int shortest = *min_element(_reflengths[sid].begin(),_reflengths[sid].end());
stats.push_back(shortest);
} else if (_refLengthStrategy == BLEU_AVERAGE) {
int total = 0;
for (size_t i = 0; i < _reflengths[sid].size(); ++i) {
total += _reflengths[sid][i];
}
float mean = (float)total/_reflengths[sid].size();
stats.push_back(mean);
} else if (_refLengthStrategy == BLEU_CLOSEST) {
int min_diff = INT_MAX;
int min_idx = 0;
for (size_t i = 0; i < _reflengths[sid].size(); ++i) {
int reflength = _reflengths[sid][i];
if (abs(reflength-(int)length) < abs(min_diff)) { //look for the closest reference
min_diff = reflength-length;
min_idx = i;
} else if (abs(reflength-(int)length) == abs(min_diff)) { // if two references has the same closest length, take the shortest
if (reflength < (int)_reflengths[sid][min_idx]) {
min_idx = i;
}
}
}
stats.push_back(_reflengths[sid][min_idx]);
} else {
throw runtime_error("Unsupported reflength strategy");
}
//cerr << "computed length" << endl;
//precision on each ngram type
for (counts_it testcounts_it = testcounts.begin();
testcounts_it != testcounts.end(); ++testcounts_it) {
counts_it refcounts_it = _refcounts[sid]->find(testcounts_it->first);
int correct = 0;
int guess = testcounts_it->second;
if (refcounts_it != _refcounts[sid]->end()) {
correct = min(refcounts_it->second,guess);
}
size_t len = testcounts_it->first.size();
stats[len*2-2] += correct;
stats[len*2-1] += guess;
}
stringstream sout;
copy(stats.begin(),stats.end(),ostream_iterator<float>(sout," "));
//TRACE_ERR(sout.str() << endl);
string stats_str = sout.str();
entry.set(stats_str);
}
float BleuScorer::calculateScore(const vector<float>& comps) {
//cerr << "BLEU: ";
//copy(comps.begin(),comps.end(), ostream_iterator<int>(cerr," "));
float logbleu = 0.0;
for (int i = 0; i < LENGTH; ++i) {
if (comps[2*i] == 0) {
return 0.0;
}
logbleu += log(comps[2*i]) - log(comps[2*i+1]);
}
logbleu /= LENGTH;
float brevity = 1.0 - (float)comps[LENGTH*2]/comps[1];//reflength divided by test length
if (brevity < 0.0) {
logbleu += brevity;
}
//cerr << " " << exp(logbleu) << endl;
return exp(logbleu);
float BleuScorer::calculateScore(const vector<float>& comps)
{
//cerr << "BLEU: ";
//copy(comps.begin(),comps.end(), ostream_iterator<int>(cerr," "));
float logbleu = 0.0;
for (int i = 0; i < LENGTH; ++i) {
if (comps[2*i] == 0) {
return 0.0;
}
logbleu += log(comps[2*i]) - log(comps[2*i+1]);
}
logbleu /= LENGTH;
float brevity = 1.0 - (float)comps[LENGTH*2]/comps[1];//reflength divided by test length
if (brevity < 0.0) {
logbleu += brevity;
}
//cerr << " " << exp(logbleu) << endl;
return exp(logbleu);
}

View File

@ -23,73 +23,74 @@ enum BleuReferenceLengthStrategy { BLEU_AVERAGE, BLEU_SHORTEST, BLEU_CLOSEST };
/**
* Bleu scoring
**/
class BleuScorer: public StatisticsBasedScorer {
public:
BleuScorer(const string& config);
virtual void setReferenceFiles(const vector<string>& referenceFiles);
virtual void prepareStats(size_t sid, const string& text, ScoreStats& entry);
int LENGTH;
size_t NumberOfScores() const {
//cerr << "BleuScorer: " << (2 * LENGTH + 1) << endl;
return (2 * LENGTH + 1);
};
bool useAlignment() const {
//cout << "BleuScorer::useAlignment returning false" << endl;
return false;
};
class BleuScorer: public StatisticsBasedScorer
{
public:
BleuScorer(const string& config);
virtual void setReferenceFiles(const vector<string>& referenceFiles);
virtual void prepareStats(size_t sid, const string& text, ScoreStats& entry);
int LENGTH;
protected:
float calculateScore(const vector<float>& comps);
private:
//no copy
BleuScorer(const BleuScorer&);
~BleuScorer(){};
BleuScorer& operator=(const BleuScorer&);
//Used to construct the ngram map
struct CompareNgrams {
int operator() (const vector<int>& a, const vector<int>& b) {
size_t i;
size_t as = a.size();
size_t bs = b.size();
for (i = 0; i < as && i < bs; ++i) {
if (a[i] < b[i]) {
//cerr << "true" << endl;
return true;
}
if (a[i] > b[i]) {
//cerr << "false" << endl;
return false;
}
}
//entries are equal, shortest wins
return as < bs;;
}
};
size_t NumberOfScores() const {
//cerr << "BleuScorer: " << (2 * LENGTH + 1) << endl;
return (2 * LENGTH + 1);
};
bool useAlignment() const {
//cout << "BleuScorer::useAlignment returning false" << endl;
return false;
};
typedef map<vector<int>,int,CompareNgrams> counts_t;
typedef map<vector<int>,int,CompareNgrams>::iterator counts_it;
typedef vector<counts_t*> refcounts_t;
size_t countNgrams(const string& line, counts_t& counts, unsigned int n);
protected:
float calculateScore(const vector<float>& comps);
void dump_counts(counts_t& counts) {
for (counts_it i = counts.begin(); i != counts.end(); ++i) {
cerr << "(";
copy(i->first.begin(), i->first.end(), ostream_iterator<int>(cerr," "));
cerr << ") " << i->second << ", ";
}
cerr << endl;
}
BleuReferenceLengthStrategy _refLengthStrategy;
// data extracted from reference files
refcounts_t _refcounts;
vector<vector<size_t> > _reflengths;
private:
//no copy
BleuScorer(const BleuScorer&);
~BleuScorer() {};
BleuScorer& operator=(const BleuScorer&);
//Used to construct the ngram map
struct CompareNgrams {
int operator() (const vector<int>& a, const vector<int>& b) {
size_t i;
size_t as = a.size();
size_t bs = b.size();
for (i = 0; i < as && i < bs; ++i) {
if (a[i] < b[i]) {
//cerr << "true" << endl;
return true;
}
if (a[i] > b[i]) {
//cerr << "false" << endl;
return false;
}
}
//entries are equal, shortest wins
return as < bs;;
}
};
typedef map<vector<int>,int,CompareNgrams> counts_t;
typedef map<vector<int>,int,CompareNgrams>::iterator counts_it;
typedef vector<counts_t*> refcounts_t;
size_t countNgrams(const string& line, counts_t& counts, unsigned int n);
void dump_counts(counts_t& counts) {
for (counts_it i = counts.begin(); i != counts.end(); ++i) {
cerr << "(";
copy(i->first.begin(), i->first.end(), ostream_iterator<int>(cerr," "));
cerr << ") " << i->second << ", ";
}
cerr << endl;
}
BleuReferenceLengthStrategy _refLengthStrategy;
// data extracted from reference files
refcounts_t _refcounts;
vector<vector<size_t> > _reflengths;
};

View File

@ -13,114 +13,113 @@
/**
 * Bind the data container to a scorer and create empty feature and score
 * stores. The scorer is kept by pointer and is not owned.
 **/
Data::Data(Scorer& ptr):
  scoredata(NULL),
  featdata(NULL),
  theScorer(&ptr),
  number_of_scores(0) // was left uninitialized
{
  score_type = theScorer->getName();
  TRACE_ERR("Data::score_type " << score_type << std::endl);

  TRACE_ERR("Data::Scorer type from Scorer: " << theScorer->getName() << endl);
  featdata=new FeatureData;
  scoredata=new ScoreData(*theScorer);
}
/**
 * Read an n-best list and fill the score and feature stores.
 *
 * Each non-empty line is split on "|||" into sentence index, translation and
 * feature string, optionally followed by a sentence score and an alignment.
 * The scorer computes the statistics of the translation; the feature values
 * are parsed from the feature string, whose tokens ending in ":" are feature
 * names rather than values.
 *
 * Throws runtime_error if the file cannot be opened.
 **/
void Data::loadnbest(const std::string &file)
{
  TRACE_ERR("loading nbest from " << file << std::endl);

  FeatureStats featentry;
  ScoreStats scoreentry;
  std::string sentence_index;

  inputfilestream inp(file); // matches a stream with a file. Opens the file

  if (!inp.good())
    throw runtime_error("Unable to open: " + file);

  std::string substring, subsubstring, stringBuf;
  std::string theSentence;
  std::string theFeatures;
  std::string theAlignment;
  std::string::size_type loc;

  while (getline(inp,stringBuf,'\n')) {
    if (stringBuf.empty()) continue;

    // a line without an alignment field must not reuse the alignment of
    // the previous line
    theAlignment.clear();

    getNextPound(stringBuf, substring, "|||"); //first field
    sentence_index = substring;

    getNextPound(stringBuf, substring, "|||"); //second field
    theSentence = substring;

    // adding statistics for error measures
    featentry.reset();
    scoreentry.clear();

    getNextPound(stringBuf, substring, "|||"); //third field
    theFeatures = substring;

    if (stringBuf.length() > 0) {
      getNextPound(stringBuf, substring, "|||"); //fourth field sentence score
      if (stringBuf.length() > 0) {
        getNextPound(stringBuf, substring, "|||"); //fourth field only there if alignment scorer
        theAlignment = substring;
      }
    }
    //TODO check alignment exists if scorers need it

    if (!theScorer->useAlignment()) {
      theScorer->prepareStats(sentence_index, theSentence, scoreentry);
    } else {
      //an interpolated score would need both sentence and alignment
      theSentence += "|||";
      theSentence += theAlignment;
      theScorer->prepareStats(sentence_index, theSentence, scoreentry);
    }

    scoredata->add(scoreentry, sentence_index);

    if (!existsFeatureNames()) {
      // the feature names are derived once, from the first line read
      std::string stringsupport=theFeatures;
      // adding feature names
      std::string features="";
      std::string tmpname="";

      size_t tmpidx=0;
      while (!stringsupport.empty()) {
        getNextPound(stringsupport, subsubstring);

        // string ending with ":" are skipped, because they are the names of the features
        if ((loc = subsubstring.find(":")) != subsubstring.length()-1) {
          // a value: name it after the last label plus a running index
          features+=tmpname+"_"+stringify(tmpidx)+" ";
          tmpidx++;
        } else {
          // a label: restart the index and remember the name without ":"
          tmpidx=0;
          tmpname=subsubstring.substr(0,subsubstring.size() - 1);
        }
      }

      featdata->setFeatureMap(features);
    }

    // adding features
    while (!theFeatures.empty()) {
      getNextPound(theFeatures, subsubstring);

      // string ending with ":" are skipped, because they are the names of the features
      if ((loc = subsubstring.find(":")) != subsubstring.length()-1) {
        featentry.add(ATOFST(subsubstring.c_str()));
      }
    }
    featdata->add(featentry,sentence_index);
  }

  inp.close();
}

View File

@ -24,49 +24,70 @@ class Scorer;
class Data
{
protected:
ScoreData* scoredata;
FeatureData* featdata;
ScoreData* scoredata;
FeatureData* featdata;
private:
Scorer* theScorer;
Scorer* theScorer;
std::string score_type;
size_t number_of_scores; //number of scores
size_t number_of_scores; //number of scores
public:
Data(Scorer& sc);
~Data(){};
inline void clear() { scoredata->clear(); featdata->clear(); }
ScoreData* getScoreData() { return scoredata; };
FeatureData* getFeatureData() { return featdata; };
inline size_t NumberOfFeatures() const{ return featdata->NumberOfFeatures(); }
inline void NumberOfFeatures(size_t v){ featdata->NumberOfFeatures(v); }
inline std::string Features() const{ return featdata->Features(); }
inline void Features(const std::string f){ featdata->Features(f); }
Data(Scorer& sc);
void loadnbest(const std::string &file);
~Data() {};
void load(const std::string &featfile,const std::string &scorefile){
featdata->load(featfile);
scoredata->load(scorefile);
inline void clear() {
scoredata->clear();
featdata->clear();
}
void save(const std::string &featfile,const std::string &scorefile, bool bin=false){
if (bin) cerr << "Binary write mode is selected" << endl;
else cerr << "Binary write mode is NOT selected" << endl;
featdata->save(featfile, bin);
scoredata->save(scorefile, bin);
}
inline bool existsFeatureNames(){ return featdata->existsFeatureNames(); };
inline std::string getFeatureName(size_t idx){ return featdata->getFeatureName(idx); };
inline size_t getFeatureIndex(const std::string& name){ return featdata->getFeatureIndex(name); };
ScoreData* getScoreData() {
return scoredata;
};
FeatureData* getFeatureData() {
return featdata;
};
inline size_t NumberOfFeatures() const {
return featdata->NumberOfFeatures();
}
inline void NumberOfFeatures(size_t v) {
featdata->NumberOfFeatures(v);
}
inline std::string Features() const {
return featdata->Features();
}
inline void Features(const std::string f) {
featdata->Features(f);
}
void loadnbest(const std::string &file);
void load(const std::string &featfile,const std::string &scorefile) {
featdata->load(featfile);
scoredata->load(scorefile);
}
void save(const std::string &featfile,const std::string &scorefile, bool bin=false) {
if (bin) cerr << "Binary write mode is selected" << endl;
else cerr << "Binary write mode is NOT selected" << endl;
featdata->save(featfile, bin);
scoredata->save(scorefile, bin);
}
inline bool existsFeatureNames() {
return featdata->existsFeatureNames();
};
inline std::string getFeatureName(size_t idx) {
return featdata->getFeatureName(idx);
};
inline size_t getFeatureIndex(const std::string& name) {
return featdata->getFeatureIndex(name);
};
};

View File

@ -16,137 +16,137 @@ FeatureArray::FeatureArray(): idx("")
void FeatureArray::savetxt(std::ofstream& outFile)
{
outFile << FEATURES_TXT_BEGIN << " " << idx << " " << array_.size()
<< " " << number_of_features << " " << features << std::endl;
for (featarray_t::iterator i = array_.begin(); i !=array_.end(); i++){
i->savetxt(outFile);
outFile << std::endl;
}
outFile << FEATURES_TXT_END << std::endl;
outFile << FEATURES_TXT_BEGIN << " " << idx << " " << array_.size()
<< " " << number_of_features << " " << features << std::endl;
for (featarray_t::iterator i = array_.begin(); i !=array_.end(); i++) {
i->savetxt(outFile);
outFile << std::endl;
}
outFile << FEATURES_TXT_END << std::endl;
}
void FeatureArray::savebin(std::ofstream& outFile)
{
outFile << FEATURES_BIN_BEGIN << " " << idx << " " << array_.size()
<< " " << number_of_features << " " << features << std::endl;
outFile << FEATURES_BIN_BEGIN << " " << idx << " " << array_.size()
<< " " << number_of_features << " " << features << std::endl;
for (featarray_t::iterator i = array_.begin(); i !=array_.end(); i++)
i->savebin(outFile);
i->savebin(outFile);
outFile << FEATURES_BIN_END << std::endl;
outFile << FEATURES_BIN_END << std::endl;
}
/**
 * Write the array to an open stream, in binary format if bin is set and in
 * text format otherwise. An empty array writes nothing.
 **/
void FeatureArray::save(std::ofstream& inFile, bool bin)
{
  if (size() == 0) {
    return;
  }

  if (bin) {
    savebin(inFile);
  } else {
    savetxt(inFile);
  }
}
/**
 * Write the array to the named file, in binary format if bin is set and in
 * text format otherwise.
 **/
void FeatureArray::save(const std::string &file, bool bin)
{
  std::ofstream outFile(file.c_str(), std::ios::out); // matches a stream with a file. Opens the file

  // pass the requested format on; without it the stream overload falls back
  // to its default and always writes text
  save(outFile, bin);

  outFile.close();
}
/**
 * Read n binary entries from the stream and append them to the array.
 **/
void FeatureArray::loadbin(ifstream& inFile, size_t n)
{
  FeatureStats entry(number_of_features);

  for (size_t remaining = n; remaining > 0; --remaining) {
    entry.loadbin(inFile);
    add(entry);
  }
}
/**
 * Read n text entries from the stream and append them to the array.
 **/
void FeatureArray::loadtxt(ifstream& inFile, size_t n)
{
  FeatureStats entry(number_of_features);

  for (size_t remaining = n; remaining > 0; --remaining) {
    entry.loadtxt(inFile);
    add(entry);
  }
}
/**
 * Read one array from an open stream.
 *
 * The header line selects text or binary mode and carries the index, the
 * number of entries, the number of features and the feature names. The
 * entries follow, then a footer line. A wrong header or footer is reported
 * through TRACE_ERR and ends the load.
 **/
void FeatureArray::load(ifstream& inFile)
{
  size_t number_of_entries=0;
  bool binmode=false;

  std::string header;
  std::getline(inFile, header);
  if (!inFile.good()) {
    return;
  }

  if (!header.empty()) {
    // the begin marker must be at the very start of the line
    const bool is_txt = (header.find(FEATURES_TXT_BEGIN) == 0);
    if (is_txt) {
      binmode=false;
    } else if (header.find(FEATURES_BIN_BEGIN) == 0) {
      binmode=true;
    } else {
      TRACE_ERR("ERROR: FeatureArray::load(): Wrong header");
      return;
    }

    std::string field;
    getNextPound(header, field); // begin marker, discarded
    getNextPound(header, field);
    idx = field;
    getNextPound(header, field);
    number_of_entries = atoi(field.c_str());
    getNextPound(header, field);
    number_of_features = atoi(field.c_str());
    // whatever is left of the header is the feature names
    features = header;
  }

  if (binmode) {
    loadbin(inFile, number_of_entries);
  } else {
    loadtxt(inFile, number_of_entries);
  }

  std::string footer;
  std::getline(inFile, footer);
  if (!footer.empty()) {
    const bool txt_end = (footer.find(FEATURES_TXT_END) == 0);
    if (!txt_end && footer.find(FEATURES_BIN_END) != 0) {
      TRACE_ERR("ERROR: FeatureArray::load(): Wrong footer");
      return;
    }
  }
}
/**
 * Open the named file and read one array from it.
 **/
void FeatureArray::load(const std::string &file)
{
  TRACE_ERR("loading data from " << file << std::endl);

  // opening happens in the constructor of the stream
  inputfilestream inFile(file);
  load((ifstream&) inFile);
  inFile.close();
}
/**
 * Append every entry of e to this array.
 **/
void FeatureArray::merge(FeatureArray& e)
{
  //dummy implementation
  size_t pos = 0;
  while (pos < e.size()) {
    add(e.get(pos));
    ++pos;
  }
}
bool FeatureArray::check_consistency()
{
size_t sz = NumberOfFeatures();
if (sz == 0)
return true;
for (featarray_t::iterator i=array_.begin(); i!=array_.end(); i++)
if (i->size()!=sz)
return false;
return true;
size_t sz = NumberOfFeatures();
if (sz == 0)
return true;
for (featarray_t::iterator i=array_.begin(); i!=array_.end(); i++)
if (i->size()!=sz)
return false;
return true;
}

View File

@ -27,47 +27,71 @@ using namespace std;
class FeatureArray
{
protected:
featarray_t array_;
size_t number_of_features;
std::string features;
featarray_t array_;
size_t number_of_features;
std::string features;
private:
std::string idx; // idx to identify the utterance, it can differ from the index inside the vector
std::string idx; // idx to identify the utterance, it can differ from the index inside the vector
public:
FeatureArray();
~FeatureArray(){};
inline void clear() { array_.clear(); }
inline std::string getIndex(){ return idx; }
inline void setIndex(const std::string & value){ idx=value; }
FeatureArray();
inline FeatureStats& get(size_t i){ return array_.at(i); }
inline const FeatureStats& get(size_t i)const{ return array_.at(i); }
void add(FeatureStats e){ array_.push_back(e); }
~FeatureArray() {};
void merge(FeatureArray& e);
inline void clear() {
array_.clear();
}
inline size_t size(){ return array_.size(); }
inline size_t NumberOfFeatures() const{ return number_of_features; }
inline void NumberOfFeatures(size_t v){ number_of_features = v; }
inline std::string Features() const{ return features; }
inline void Features(const std::string f){ features = f; }
void savetxt(ofstream& outFile);
void savebin(ofstream& outFile);
void save(ofstream& outFile, bool bin=false);
void save(const std::string &file, bool bin=false);
inline void save(bool bin=false){ save("/dev/stdout",bin); }
inline std::string getIndex() {
return idx;
}
inline void setIndex(const std::string & value) {
idx=value;
}
void loadtxt(ifstream& inFile, size_t n);
void loadbin(ifstream& inFile, size_t n);
void load(ifstream& inFile);
void load(const std::string &file);
bool check_consistency();
inline FeatureStats& get(size_t i) {
return array_.at(i);
}
inline const FeatureStats& get(size_t i)const {
return array_.at(i);
}
void add(FeatureStats e) {
array_.push_back(e);
}
void merge(FeatureArray& e);
inline size_t size() {
return array_.size();
}
inline size_t NumberOfFeatures() const {
return number_of_features;
}
inline void NumberOfFeatures(size_t v) {
number_of_features = v;
}
inline std::string Features() const {
return features;
}
inline void Features(const std::string f) {
features = f;
}
void savetxt(ofstream& outFile);
void savebin(ofstream& outFile);
void save(ofstream& outFile, bool bin=false);
void save(const std::string &file, bool bin=false);
inline void save(bool bin=false) {
save("/dev/stdout",bin);
}
void loadtxt(ifstream& inFile, size_t n);
void loadbin(ifstream& inFile, size_t n);
void load(ifstream& inFile);
void load(const std::string &file);
bool check_consistency();
};

View File

@ -18,127 +18,127 @@ FeatureData::FeatureData() {};
void FeatureData::save(std::ofstream& outFile, bool bin)
{
for (featdata_t::iterator i = array_.begin(); i !=array_.end(); i++)
i->save(outFile, bin);
for (featdata_t::iterator i = array_.begin(); i !=array_.end(); i++)
i->save(outFile, bin);
}
/**
 * Write every array to the named file, in binary format if bin is set.
 * An empty file name means there is nothing to do.
 **/
void FeatureData::save(const std::string &file, bool bin)
{
  if (file.empty()) {
    return;
  }

  TRACE_ERR("saving the array into " << file << std::endl);

  // opening happens in the constructor of the stream
  std::ofstream outFile(file.c_str(), std::ios::out);
  save(outFile, bin);
  outFile.close();
}
// Read FeatureArray entries from inFile until EOF is flagged or an entry
// loads with size 0. While this container is still empty, the entry's
// Features() string is used to build the feature map; every entry is then
// handed to add().
void FeatureData::load(ifstream& inFile)
{
FeatureArray entry;
while (!inFile.eof()){
while (!inFile.eof()) {
// A stream that is not good() is only reported on std::cerr; loading continues.
if (!inFile.good()){
std::cerr << "ERROR FeatureData::load inFile.good()" << std::endl;
}
if (!inFile.good()) {
std::cerr << "ERROR FeatureData::load inFile.good()" << std::endl;
}
entry.clear();
entry.load(inFile);
entry.clear();
entry.load(inFile);
// An empty entry ends the loop.
if (entry.size() == 0)
break;
if (entry.size() == 0)
break;
if (size() == 0){
setFeatureMap(entry.Features());
}
add(entry);
}
if (size() == 0) {
setFeatureMap(entry.Features());
}
add(entry);
}
}
// Open `file` through inputfilestream and load all feature arrays from it.
// Throws runtime_error when the stream cannot be opened.
void FeatureData::load(const std::string &file)
{
TRACE_ERR("loading feature data from " << file << std::endl);
TRACE_ERR("loading feature data from " << file << std::endl);
inputfilestream inFile(file); // matches a stream with a file. Opens the file
inputfilestream inFile(file); // matches a stream with a file. Opens the file
if (!inFile) {
throw runtime_error("Unable to open feature file: " + file);
}
if (!inFile) {
throw runtime_error("Unable to open feature file: " + file);
}
// NOTE(review): C-style cast to ifstream&; inputfilestream is declared
// elsewhere - confirm it really derives from ifstream.
load((ifstream&) inFile);
load((ifstream&) inFile);
inFile.close();
inFile.close();
}
// Add a whole FeatureArray. If an array with the same index name is already
// stored, e is merged into it; otherwise e is appended and both lookup maps
// are rebuilt by setIndex() (a full pass over array_ on every append).
void FeatureData::add(FeatureArray& e){
if (exists(e.getIndex())){ // array at position e.getIndex() already exists
//enlarge array at position e.getIndex()
size_t pos = getIndex(e.getIndex());
array_.at(pos).merge(e);
}
else{
array_.push_back(e);
setIndex();
}
void FeatureData::add(FeatureArray& e)
{
if (exists(e.getIndex())) { // array at position e.getIndex() already exists
//enlarge array at position e.getIndex()
size_t pos = getIndex(e.getIndex());
array_.at(pos).merge(e);
} else {
array_.push_back(e);
setIndex();
}
}
// Add one FeatureStats to the array named sent_idx. If no such array exists
// yet, a new FeatureArray is created, given this container's feature count,
// feature-names string and the name sent_idx, and then added via
// add(FeatureArray&).
void FeatureData::add(FeatureStats& e, const std::string & sent_idx){
if (exists(sent_idx)){ // array at position e.getIndex() already exists
//enlarge array at position e.getIndex()
size_t pos = getIndex(sent_idx);
// TRACE_ERR("Inserting " << e << " in array " << sent_idx << std::endl);
array_.at(pos).add(e);
}
else{
// TRACE_ERR("Creating a new entry in the array and inserting " << e << std::endl);
FeatureArray a;
a.NumberOfFeatures(number_of_features);
a.Features(features);
a.setIndex(sent_idx);
a.add(e);
add(a);
}
}
void FeatureData::add(FeatureStats& e, const std::string & sent_idx)
{
if (exists(sent_idx)) { // array at position e.getIndex() already exists
//enlarge array at position e.getIndex()
size_t pos = getIndex(sent_idx);
// TRACE_ERR("Inserting " << e << " in array " << sent_idx << std::endl);
array_.at(pos).add(e);
} else {
// TRACE_ERR("Creating a new entry in the array and inserting " << e << std::endl);
FeatureArray a;
a.NumberOfFeatures(number_of_features);
a.Features(features);
a.setIndex(sent_idx);
a.add(e);
add(a);
}
}
// Returns true when the container is empty or when check_consistency() of
// every stored FeatureArray returns true; returns false at the first array
// that fails.
bool FeatureData::check_consistency()
{
if (array_.size() == 0)
return true;
for (featdata_t::iterator i = array_.begin(); i !=array_.end(); i++)
if (!i->check_consistency()) return false;
if (array_.size() == 0)
return true;
return true;
for (featdata_t::iterator i = array_.begin(); i !=array_.end(); i++)
if (!i->check_consistency()) return false;
return true;
}
// Rebuild the position<->name lookup maps from the current order of array_.
// Entries are written with operator[]; neither map is cleared first, so
// entries for names no longer present are not removed here.
void FeatureData::setIndex()
{
size_t j=0;
for (featdata_t::iterator i = array_.begin(); i !=array_.end(); i++){
idx2arrayname_[j]=(*i).getIndex();
arrayname2idx_[(*i).getIndex()] = j;
j++;
}
size_t j=0;
for (featdata_t::iterator i = array_.begin(); i !=array_.end(); i++) {
idx2arrayname_[j]=(*i).getIndex();
arrayname2idx_[(*i).getIndex()] = j;
j++;
}
}
// Store feat as the feature-names string and register each token that
// getNextPound() extracts from it, assigning indices in order of appearance.
// NOTE(review): number_of_features is reset to 0 but featname2idx_ and
// idx2featname_ are not cleared, so a second call continues numbering after
// the entries of the first call.
// NOTE(review): the loop ends only once stringBuf is empty; getNextPound is
// defined elsewhere and presumably consumes the extracted token - confirm.
void FeatureData::setFeatureMap(const std::string feat)
{
number_of_features = 0;
features=feat;
number_of_features = 0;
features=feat;
std::string substring, stringBuf;
stringBuf=features;
while (!stringBuf.empty()){
getNextPound(stringBuf, substring);
featname2idx_[substring]=idx2featname_.size();
idx2featname_[idx2featname_.size()]=substring;
number_of_features++;
}
std::string substring, stringBuf;
stringBuf=features;
while (!stringBuf.empty()) {
getNextPound(stringBuf, substring);
featname2idx_[substring]=idx2featname_.size();
idx2featname_[idx2featname_.size()]=substring;
number_of_features++;
}
}

View File

@ -20,86 +20,116 @@ using namespace std;
// Container of FeatureArray objects addressable by position or by name,
// plus a two-way map between feature names and feature indices.
class FeatureData
{
protected:
featdata_t array_;
idx2name idx2arrayname_; //map from index to name of array
name2idx arrayname2idx_; //map from name to index of array
featdata_t array_;
idx2name idx2arrayname_; //map from index to name of array
name2idx arrayname2idx_; //map from name to index of array
private:
size_t number_of_features;
std::string features;
size_t number_of_features;
std::string features;
map<std::string, size_t> featname2idx_; //map from name to index of features
map<size_t, std::string> idx2featname_; //map from index to name of features
map<std::string, size_t> featname2idx_; //map from name to index of features
map<size_t, std::string> idx2featname_; //map from index to name of features
public:
FeatureData();
~FeatureData(){};
// clear() empties array_ only; the lookup maps are left as they are.
inline void clear() { array_.clear(); }
inline FeatureArray get(const std::string& idx){ return array_.at(getIndex(idx)); }
inline FeatureArray& get(size_t idx){ return array_.at(idx); }
inline const FeatureArray& get(size_t idx) const{ return array_.at(idx); }
FeatureData();
inline bool exists(const std::string & sent_idx){ return exists(getIndex(sent_idx)); }
inline bool exists(int sent_idx){ return (sent_idx>-1 && sent_idx<(int) array_.size())?true:false; }
~FeatureData() {};
inline FeatureStats& get(size_t i, size_t j){ return array_.at(i).get(j); }
inline const FeatureStats& get(size_t i, size_t j) const { return array_.at(i).get(j); }
void add(FeatureArray& e);
void add(FeatureStats& e, const std::string& sent_idx);
inline size_t size(){ return array_.size(); }
inline size_t NumberOfFeatures() const{ return number_of_features; }
inline void NumberOfFeatures(size_t v){ number_of_features = v; }
inline std::string Features() const{ return features; }
inline void Features(const std::string f){ features = f; }
void save(const std::string &file, bool bin=false);
void save(ofstream& outFile, bool bin=false);
inline void save(bool bin=false){ save("/dev/stdout", bin); }
void load(ifstream& inFile);
void load(const std::string &file);
bool check_consistency();
void setIndex();
// Position of the array named idx, or -1 when the name is unknown.
inline int getIndex(const std::string& idx){
name2idx::iterator i = arrayname2idx_.find(idx);
if (i!=arrayname2idx_.end())
return i->second;
else
return -1;
inline void clear() {
array_.clear();
}
// NOTE(review): the test below is inverted - it throws when the entry IS
// found and dereferences end() when it is not. Also, "literal" + idx is
// pointer arithmetic on the literal, not string concatenation.
inline std::string getIndex(size_t idx){
idx2name::iterator i = idx2arrayname_.find(idx);
if (i!=idx2arrayname_.end())
throw runtime_error("there is no entry at index " + idx);
return i->second;
}
bool existsFeatureNames(){ return (idx2featname_.size() > 0)?true:false; };
std::string getFeatureName(size_t idx){
if (idx >= idx2featname_.size())
throw runtime_error("Error: you required an too big index");
return idx2featname_[idx];
};
// NOTE(review): the test below is inverted - it throws "feature is unknown"
// exactly when the feature name is present in featname2idx_.
size_t getFeatureIndex(const std::string& name){
if (featname2idx_.find(name)!=featname2idx_.end())
throw runtime_error("Error: feature is unknown");
return featname2idx_[name];
};
// Return a copy of the array named idx. getIndex(idx) yields -1 for an
// unknown name; converted to at()'s unsigned index type that is out of
// range, so at() throws.
inline FeatureArray get(const std::string& idx) {
return array_.at(getIndex(idx));
}
// Bounds-checked access to the array at position idx.
inline FeatureArray& get(size_t idx) {
return array_.at(idx);
}
inline const FeatureArray& get(size_t idx) const {
return array_.at(idx);
}
// True when an array with this name is registered and its position is valid.
inline bool exists(const std::string & sent_idx) {
return exists(getIndex(sent_idx));
}
// True when sent_idx is a valid position in array_ (0 <= sent_idx < size).
inline bool exists(int sent_idx) {
return (sent_idx>-1 && sent_idx<(int) array_.size())?true:false;
}
// Bounds-checked access to the j-th FeatureStats of the i-th array.
inline FeatureStats& get(size_t i, size_t j) {
return array_.at(i).get(j);
}
inline const FeatureStats& get(size_t i, size_t j) const {
return array_.at(i).get(j);
}
void add(FeatureArray& e);
void add(FeatureStats& e, const std::string& sent_idx);
// Number of arrays stored.
inline size_t size() {
return array_.size();
}
// Getter/setter pair for the stored feature count.
inline size_t NumberOfFeatures() const {
return number_of_features;
}
inline void NumberOfFeatures(size_t v) {
number_of_features = v;
}
// Getter/setter pair for the stored feature-names string.
inline std::string Features() const {
return features;
}
inline void Features(const std::string f) {
features = f;
}
void save(const std::string &file, bool bin=false);
void save(ofstream& outFile, bool bin=false);
// Convenience overload: saves to the path "/dev/stdout".
inline void save(bool bin=false) {
save("/dev/stdout", bin);
}
void load(ifstream& inFile);
void load(const std::string &file);
bool check_consistency();
void setIndex();
// Position of the array named idx, or -1 when the name is unknown.
inline int getIndex(const std::string& idx) {
name2idx::iterator i = arrayname2idx_.find(idx);
if (i!=arrayname2idx_.end())
return i->second;
else
return -1;
}
/**
 * Return the name of the array stored at position idx.
 * @param idx position of the array, as registered in idx2arrayname_
 * @return the array name registered for idx
 * @throws runtime_error if no name is registered for idx
 */
inline std::string getIndex(size_t idx) {
  idx2name::iterator i = idx2arrayname_.find(idx);
  // Fail when the entry is MISSING; the previous test was inverted and
  // dereferenced end() for unknown indices.
  if (i == idx2arrayname_.end()) {
    // Render idx as decimal text by hand: "literal" + idx is pointer
    // arithmetic on the literal, not concatenation, and this header's
    // include list is not known to provide a stream-based formatter.
    std::string digits;
    size_t rest = idx;
    do {
      digits.insert(digits.begin(), static_cast<char>('0' + rest % 10));
      rest /= 10;
    } while (rest != 0);
    throw runtime_error("there is no entry at index " + digits);
  }
  return i->second;
}
/** True when at least one feature name has been registered. */
bool existsFeatureNames() {
  if (idx2featname_.size() > 0) {
    return true;
  }
  return false;
};
/**
 * Name of the feature registered under index idx.
 * @throws runtime_error when idx is not smaller than the number of
 *         registered feature names
 */
std::string getFeatureName(size_t idx) {
  const bool inRange = idx < idx2featname_.size();
  if (!inRange) {
    throw runtime_error("Error: you required an too big index");
  }
  return idx2featname_[idx];
};
size_t getFeatureIndex(const std::string& name) {
if (featname2idx_.find(name)!=featname2idx_.end())
throw runtime_error("Error: feature is unknown");
return featname2idx_[name];
};
void setFeatureMap(const std::string feat);
};

View File

@ -14,123 +14,124 @@
// Default constructor: allocates AVAILABLE_ slots and starts with no entries.
FeatureStats::FeatureStats()
{
available_ = AVAILABLE_;
entries_ = 0;
array_ = new FeatureStatsType[available_];
available_ = AVAILABLE_;
entries_ = 0;
array_ = new FeatureStatsType[available_];
};
// Releases the value buffer.
// NOTE(review): array_ is allocated with new[] in the constructors, so it
// must be released with delete[] rather than scalar delete.
FeatureStats::~FeatureStats()
{
delete array_;
delete array_;
};
// Copy constructor: allocates stats.available() slots and copies
// featbytes_ bytes from the source buffer.
// NOTE(review): featbytes_ is defined outside this view - presumably the
// byte size of the entries_ stored values; confirm.
FeatureStats::FeatureStats(const FeatureStats &stats)
{
available_ = stats.available();
entries_ = stats.size();
array_ = new FeatureStatsType[available_];
memcpy(array_,stats.getArray(),featbytes_);
available_ = stats.available();
entries_ = stats.size();
array_ = new FeatureStatsType[available_];
memcpy(array_,stats.getArray(),featbytes_);
};
// Construct with `size` entries: capacity and entry count are both set to
// size, and featbytes_ bytes of the buffer are zeroed.
FeatureStats::FeatureStats(const size_t size)
{
available_ = size;
entries_ = size;
array_ = new FeatureStatsType[available_];
memset(array_,0,featbytes_);
available_ = size;
entries_ = size;
array_ = new FeatureStatsType[available_];
memset(array_,0,featbytes_);
};
// Construct from a string of values by delegating to set().
// NOTE(review): array_, entries_ and available_ are not initialised here,
// yet set() calls reset() (memset on array_) and add() (isfull()/expand());
// this constructor therefore works on indeterminate members.
FeatureStats::FeatureStats(std::string &theString)
{
set(theString);
set(theString);
}
// Double the capacity: allocate a buffer twice as large, copy featbytes_
// bytes of the old contents into it and release the old buffer.
// NOTE(review): the old buffer comes from new[] and should be released with
// delete[].
// NOTE(review): a capacity of 0 stays 0 after doubling - confirm callers
// never expand an object built with size 0.
void FeatureStats::expand()
{
available_*=2;
featstats_t t_ = new FeatureStatsType[available_];
memcpy(t_,array_,featbytes_);
delete array_;
array_=t_;
available_*=2;
featstats_t t_ = new FeatureStatsType[available_];
memcpy(t_,array_,featbytes_);
delete array_;
array_=t_;
}
// Append one value, growing the buffer first when it is full.
void FeatureStats::add(FeatureStatsType v)
{
if (isfull()) expand();
array_[entries_++]=v;
if (isfull()) expand();
array_[entries_++]=v;
}
// Replace the contents with the values parsed from theString: the object is
// reset, then each token extracted by getNextPound() is converted with
// ATOFST and appended. theString is taken by non-const reference and is
// passed to getNextPound on every iteration until it is empty.
// stringBuf is declared but not used.
void FeatureStats::set(std::string &theString)
{
std::string substring, stringBuf;
reset();
while (!theString.empty()){
getNextPound(theString, substring);
add(ATOFST(substring.c_str()));
}
reset();
while (!theString.empty()) {
getNextPound(theString, substring);
add(ATOFST(substring.c_str()));
}
}
// Read featbytes_ raw bytes from inFile straight into the value buffer.
void FeatureStats::loadbin(std::ifstream& inFile)
{
inFile.read((char*) array_, featbytes_);
}
inFile.read((char*) array_, featbytes_);
}
// Read one line from inFile and parse it with set().
void FeatureStats::loadtxt(std::ifstream& inFile)
{
std::string theString;
std::getline(inFile, theString);
set(theString);
std::string theString;
std::getline(inFile, theString);
set(theString);
}
// Open `file` for reading and load one line of values from it.
// NOTE(review): the open result is not checked.
void FeatureStats::loadtxt(const std::string &file)
{
// TRACE_ERR("loading the stats from " << file << std::endl);
// TRACE_ERR("loading the stats from " << file << std::endl);
std::ifstream inFile(file.c_str(), std::ios::in); // matches a stream with a file. Opens the file
std::ifstream inFile(file.c_str(), std::ios::in); // matches a stream with a file. Opens the file
loadtxt(inFile);
loadtxt(inFile);
}
// Open `file` for writing and write the values as text.
// NOTE(review): the open result is not checked.
void FeatureStats::savetxt(const std::string &file)
{
// TRACE_ERR("saving the stats into " << file << std::endl);
// TRACE_ERR("saving the stats into " << file << std::endl);
std::ofstream outFile(file.c_str(), std::ios::out); // matches a stream with a file. Opens the file
std::ofstream outFile(file.c_str(), std::ios::out); // matches a stream with a file. Opens the file
savetxt(outFile);
savetxt(outFile);
}
// Write the values as text using operator<< for FeatureStats.
void FeatureStats::savetxt(std::ofstream& outFile)
{
// TRACE_ERR("saving the stats" << std::endl);
outFile << *this;
// TRACE_ERR("saving the stats" << std::endl);
outFile << *this;
}
// Write featbytes_ raw bytes of the value buffer to outFile.
void FeatureStats::savebin(std::ofstream& outFile)
{
outFile.write((char*) array_, featbytes_);
}
outFile.write((char*) array_, featbytes_);
}
// Copy assignment: releases the current buffer, then allocates
// stats.available() slots and copies featbytes_ bytes from stats.
// NOTE(review): there is no self-assignment guard - for `x = x` the buffer
// is released before memcpy reads from it.
// NOTE(review): the buffer comes from new[] and should be released with
// delete[].
FeatureStats& FeatureStats::operator=(const FeatureStats &stats)
{
delete array_;
available_ = stats.available();
entries_ = stats.size();
array_ = new FeatureStatsType[available_];
memcpy(array_,stats.getArray(),featbytes_);
return *this;
delete array_;
available_ = stats.available();
entries_ = stats.size();
array_ = new FeatureStatsType[available_];
memcpy(array_,stats.getArray(),featbytes_);
return *this;
}
/**write the whole object to a stream*/
// Each of the e.size() values is followed by a single space, including the
// last one.
ostream& operator<<(ostream& o, const FeatureStats& e){
for (size_t i=0; i< e.size(); i++)
o << e.get(i) << " ";
return o;
ostream& operator<<(ostream& o, const FeatureStats& e)
{
for (size_t i=0; i< e.size(); i++)
o << e.get(i) << " ";
return o;
}

View File

@ -25,46 +25,67 @@ using namespace std;
// Growable buffer of FeatureStatsType values with text and binary
// load/save helpers. entries_ counts the stored values, available_ is the
// allocated capacity.
class FeatureStats
{
private:
featstats_t array_;
size_t entries_;
size_t available_;
featstats_t array_;
size_t entries_;
size_t available_;
public:
FeatureStats();
FeatureStats(const size_t size);
FeatureStats(const FeatureStats &stats);
FeatureStats(std::string &theString);
FeatureStats& operator=(const FeatureStats &stats);
~FeatureStats();
bool isfull(){return (entries_ < available_)?0:1; }
void expand();
void add(FeatureStatsType v);
inline void clear() { memset((void*) array_,0,featbytes_); }
inline FeatureStatsType get(size_t i){ return array_[i]; }
inline FeatureStatsType get(size_t i)const{ return array_[i]; }
inline featstats_t getArray() const { return array_; }
FeatureStats();
FeatureStats(const size_t size);
FeatureStats(const FeatureStats &stats);
FeatureStats(std::string &theString);
FeatureStats& operator=(const FeatureStats &stats);
void set(std::string &theString);
~FeatureStats();
inline size_t bytes() const{ return featbytes_; }
inline size_t size() const{ return entries_; }
inline size_t available() const{ return available_; }
void savetxt(const std::string &file);
void savetxt(ofstream& outFile);
void savebin(ofstream& outFile);
inline void savetxt(){ savetxt("/dev/stdout"); }
void loadtxt(const std::string &file);
void loadtxt(ifstream& inFile);
void loadbin(ifstream& inFile);
// True when no free slot is left (entries_ >= available_).
bool isfull() {
return (entries_ < available_)?0:1;
}
void expand();
void add(FeatureStatsType v);
// Zero featbytes_ bytes of the buffer; entries_ is not changed.
inline void clear() {
memset((void*) array_,0,featbytes_);
}
// Unchecked element access: no bounds test against entries_ or available_.
inline FeatureStatsType get(size_t i) {
return array_[i];
}
inline FeatureStatsType get(size_t i)const {
return array_[i];
}
// Raw pointer to the internal buffer (still owned by this object).
inline featstats_t getArray() const {
return array_;
}
void set(std::string &theString);
inline size_t bytes() const {
return featbytes_;
}
// Number of values stored.
inline size_t size() const {
return entries_;
}
// Allocated capacity, in values.
inline size_t available() const {
return available_;
}
void savetxt(const std::string &file);
void savetxt(ofstream& outFile);
void savebin(ofstream& outFile);
// Convenience overload: writes to the path "/dev/stdout".
inline void savetxt() {
savetxt("/dev/stdout");
}
void loadtxt(const std::string &file);
void loadtxt(ifstream& inFile);
void loadbin(ifstream& inFile);
// Drop all entries: entries_ is set to 0 before clear() runs.
inline void reset() {
entries_ = 0;
clear();
}
inline void reset(){ entries_ = 0; clear(); }
/**write the whole object to a stream*/
friend ostream& operator<<(ostream& o, const FeatureStats& e);
};

View File

@ -5,196 +5,201 @@
using namespace std;
// Build an interpolated scorer.
//  name   - comma-separated list of scorer types; one sub-scorer is created
//           per token through ScorerFactory, all sharing `config`
//  config - configuration read through getConfig(); keys used here:
//           "regtype" (none|average|min), "regwin", "case" (true|false),
//           "weights" ('+'-separated floats)
// Throws runtime_error for an unknown regularisation type, when no scorer
// was created, or when explicit weights do not sum to 1.
// NOTE(review): the sub-scorers are stored as raw pointers and the class
// destructor is empty - confirm who owns them.
InterpolatedScorer::InterpolatedScorer (const string& name, const string& config): Scorer(name,config) {
//configure regularisation
static string KEY_WEIGHTS = "weights";
static string KEY_TYPE = "regtype";
static string KEY_WINDOW = "regwin";
static string KEY_CASE = "case";
static string TYPE_NONE = "none";
static string TYPE_AVERAGE = "average";
static string TYPE_MINIMUM = "min";
static string TRUE = "true";
static string FALSE = "false";
InterpolatedScorer::InterpolatedScorer (const string& name, const string& config): Scorer(name,config)
{
//configure regularisation
static string KEY_WEIGHTS = "weights";
static string KEY_TYPE = "regtype";
static string KEY_WINDOW = "regwin";
static string KEY_CASE = "case";
static string TYPE_NONE = "none";
static string TYPE_AVERAGE = "average";
static string TYPE_MINIMUM = "min";
static string TRUE = "true";
static string FALSE = "false";
string type = getConfig(KEY_TYPE,TYPE_NONE);
if (type == TYPE_NONE) {
_regularisationStrategy = REG_NONE;
} else if (type == TYPE_AVERAGE) {
_regularisationStrategy = REG_AVERAGE;
} else if (type == TYPE_MINIMUM) {
_regularisationStrategy = REG_MINIMUM;
} else {
throw runtime_error("Unknown scorer regularisation strategy: " + type);
}
cerr << "Using scorer regularisation strategy: " << type << endl;
string type = getConfig(KEY_TYPE,TYPE_NONE);
if (type == TYPE_NONE) {
_regularisationStrategy = REG_NONE;
} else if (type == TYPE_AVERAGE) {
_regularisationStrategy = REG_AVERAGE;
} else if (type == TYPE_MINIMUM) {
_regularisationStrategy = REG_MINIMUM;
} else {
throw runtime_error("Unknown scorer regularisation strategy: " + type);
}
cerr << "Using scorer regularisation strategy: " << type << endl;
string window = getConfig(KEY_WINDOW,"0");
_regularisationWindow = atoi(window.c_str());
cerr << "Using scorer regularisation window: " << _regularisationWindow << endl;
string window = getConfig(KEY_WINDOW,"0");
_regularisationWindow = atoi(window.c_str());
cerr << "Using scorer regularisation window: " << _regularisationWindow << endl;
// NOTE(review): a "case" value other than "true"/"false" leaves
// _preserveCase unassigned by this constructor - confirm the base class
// initialises it.
string preservecase = getConfig(KEY_CASE,TRUE);
if (preservecase == TRUE) {
_preserveCase = true;
}else if (preservecase == FALSE) {
_preserveCase = false;
}
cerr << "Using case preservation: " << _preserveCase << endl;
string preservecase = getConfig(KEY_CASE,TRUE);
if (preservecase == TRUE) {
_preserveCase = true;
} else if (preservecase == FALSE) {
_preserveCase = false;
}
cerr << "Using case preservation: " << _preserveCase << endl;
// name would be: HAMMING,BLEU or similar
// name would be: HAMMING,BLEU or similar
string scorers = name;
while (scorers.length() > 0) {
string scorertype = "";
getNextPound(scorers,scorertype,",");
ScorerFactory SF;
Scorer *theScorer=SF.getScorer(scorertype,config);
_scorers.push_back(theScorer);
}
if (_scorers.size() == 0) {
throw runtime_error("There are no scorers");
}
cout << "Number of scorers: " << _scorers.size() << endl;
string scorers = name;
while (scorers.length() > 0) {
string scorertype = "";
getNextPound(scorers,scorertype,",");
ScorerFactory SF;
Scorer *theScorer=SF.getScorer(scorertype,config);
_scorers.push_back(theScorer);
}
if (_scorers.size() == 0) {
throw runtime_error("There are no scorers");
}
cout << "Number of scorers: " << _scorers.size() << endl;
//TODO debug this
string wtype = getConfig(KEY_WEIGHTS,"");
//Default weights set to uniform ie. if two weights 0.5 each
//weights should add to 1
if (wtype.length() == 0) {
float weight = 1.0/_scorers.size() ;
//cout << " Default weights:" << weight << endl;
for (size_t i = 0; i < _scorers.size(); i ++) {
_scorerWeights.push_back(weight);
}
}else{
float tot=0;
//cout << "Defined weights:" << endl;
while (wtype.length() > 0) {
string scoreweight = "";
getNextPound(wtype,scoreweight,"+");
float weight = atof(scoreweight.c_str());
_scorerWeights.push_back(weight);
tot += weight;
//cout << " :" << weight ;
}
//cout << endl;
// NOTE(review): exact float comparison - weights such as 0.1+0.2+0.7 may
// not sum to exactly 1 in float arithmetic and would be rejected.
// NOTE(review): the number of explicit weights is not compared with
// _scorers.size(), although score() indexes _scorerWeights per scorer.
if (tot != float(1)) {
throw runtime_error("The interpolated scorers weights do not sum to 1");
}
//TODO debug this
string wtype = getConfig(KEY_WEIGHTS,"");
//Default weights set to uniform ie. if two weights 0.5 each
//weights should add to 1
if (wtype.length() == 0) {
float weight = 1.0/_scorers.size() ;
//cout << " Default weights:" << weight << endl;
for (size_t i = 0; i < _scorers.size(); i ++) {
_scorerWeights.push_back(weight);
}
cout << "The weights for the interpolated scorers are: " << endl;
for (vector<float>::iterator it = _scorerWeights.begin(); it < _scorerWeights.end(); it++) {
cout << *it << " " ;
} else {
float tot=0;
//cout << "Defined weights:" << endl;
while (wtype.length() > 0) {
string scoreweight = "";
getNextPound(wtype,scoreweight,"+");
float weight = atof(scoreweight.c_str());
_scorerWeights.push_back(weight);
tot += weight;
//cout << " :" << weight ;
}
cout <<endl;
//cout << endl;
if (tot != float(1)) {
throw runtime_error("The interpolated scorers weights do not sum to 1");
}
}
cout << "The weights for the interpolated scorers are: " << endl;
for (vector<float>::iterator it = _scorerWeights.begin(); it < _scorerWeights.end(); it++) {
cout << *it << " " ;
}
cout <<endl;
}
// Store `data` and give every sub-scorer its own ScoreData. For each
// sub-scorer, the slice [last, last + NumberOfScores()) of every ScoreStats
// in `data` is copied into a new ScoreData, which is then passed to the
// sub-scorer's setScoreData(). `last` advances by that scorer's score count.
// Each new ScoreArray is named with the decimal text of its position i.
// NOTE(review): newData is allocated with new and no matching delete is
// visible here - confirm the sub-scorer takes ownership.
void InterpolatedScorer::setScoreData(ScoreData* data) {
size_t last = 0;
_scoreData = data;
for (vector<Scorer*>::iterator itsc = _scorers.begin(); itsc!=_scorers.end();itsc++){
int numScoresScorer = (*itsc)->NumberOfScores();
ScoreData* newData =new ScoreData(**itsc);
for (size_t i = 0; i < data->size(); i++){
ScoreArray scoreArray = data->get(i);
ScoreArray newScoreArray;
std::string istr;
std::stringstream out;
out << i;
istr = out.str();
size_t numNBest = scoreArray.size();
//cout << " Datasize " << data->size() << " NumNBest " << numNBest << endl ;
for (size_t j = 0; j < numNBest ; j++){
ScoreStats scoreStats = data->get(i, j);
//cout << "Scorestats " << scoreStats << " i " << i << " j " << j << endl;
ScoreStats newScoreStats;
for (size_t k = last; k < size_t(numScoresScorer + last); k++) {
ScoreStatsType score = scoreStats.get(k);
newScoreStats.add(score);
}
//cout << " last " << last << " NumScores " << numScoresScorer << "newScorestats " << newScoreStats << endl;
newScoreArray.add(newScoreStats);
}
newScoreArray.setIndex(istr);
newData->add(newScoreArray);
}
//newData->dump();
(*itsc)->setScoreData(newData);
last += numScoresScorer;
}
void InterpolatedScorer::setScoreData(ScoreData* data)
{
size_t last = 0;
_scoreData = data;
for (vector<Scorer*>::iterator itsc = _scorers.begin(); itsc!=_scorers.end(); itsc++) {
int numScoresScorer = (*itsc)->NumberOfScores();
ScoreData* newData =new ScoreData(**itsc);
for (size_t i = 0; i < data->size(); i++) {
ScoreArray scoreArray = data->get(i);
ScoreArray newScoreArray;
std::string istr;
std::stringstream out;
out << i;
istr = out.str();
size_t numNBest = scoreArray.size();
//cout << " Datasize " << data->size() << " NumNBest " << numNBest << endl ;
for (size_t j = 0; j < numNBest ; j++) {
ScoreStats scoreStats = data->get(i, j);
//cout << "Scorestats " << scoreStats << " i " << i << " j " << j << endl;
ScoreStats newScoreStats;
for (size_t k = last; k < size_t(numScoresScorer + last); k++) {
ScoreStatsType score = scoreStats.get(k);
newScoreStats.add(score);
}
//cout << " last " << last << " NumScores " << numScoresScorer << "newScorestats " << newScoreStats << endl;
newScoreArray.add(newScoreStats);
}
newScoreArray.setIndex(istr);
newData->add(newScoreArray);
}
//newData->dump();
(*itsc)->setScoreData(newData);
last += numScoresScorer;
}
}
/** The interpolated scorer calls a vector of scorers and combines them with
/** The interpolated scorer calls a vector of scorers and combines them with
weights **/
// Every sub-scorer scores the same candidates/diffs into a temporary vector;
// the first scorer's weighted values are appended to `scores`, later scorers
// add their weighted values to scores[inc] position by position.
// Throws runtime_error when a scorer's weight is exactly 0.
// scores[inc] is indexed without a bounds check, so this relies on every
// scorer producing no more values than the first one did.
// numScores is computed but not used.
void InterpolatedScorer::score(const candidates_t& candidates, const diffs_t& diffs,
statscores_t& scores) {
statscores_t& scores)
{
//cout << "*******InterpolatedScorer::score" << endl;
size_t scorerNum = 0;
for (vector<Scorer*>::iterator itsc = _scorers.begin(); itsc!=_scorers.end();itsc++){
int numScores = (*itsc)->NumberOfScores();
statscores_t tscores;
(*itsc)->score(candidates,diffs,tscores);
size_t inc = 0;
for (statscores_t::iterator itstatsc = tscores.begin(); itstatsc!=tscores.end();itstatsc++){
//cout << "Scores " << (*itstatsc) << endl;
float weight = _scorerWeights[scorerNum];
if (weight == 0) {
stringstream msg;
msg << "No weights for scorer" << scorerNum ;
throw runtime_error(msg.str());
}
if (scorerNum == 0) {
scores.push_back(weight * (*itstatsc));
} else {
scores[inc] += weight * (*itstatsc);
}
//cout << "Scorer:" << scorerNum << " scoreNum:" << inc << " score: " << (*itstatsc) << " weight:" << weight << endl;
inc++;
}
scorerNum++;
}
//cout << "*******InterpolatedScorer::score" << endl;
size_t scorerNum = 0;
for (vector<Scorer*>::iterator itsc = _scorers.begin(); itsc!=_scorers.end(); itsc++) {
int numScores = (*itsc)->NumberOfScores();
statscores_t tscores;
(*itsc)->score(candidates,diffs,tscores);
size_t inc = 0;
for (statscores_t::iterator itstatsc = tscores.begin(); itstatsc!=tscores.end(); itstatsc++) {
//cout << "Scores " << (*itstatsc) << endl;
float weight = _scorerWeights[scorerNum];
if (weight == 0) {
stringstream msg;
msg << "No weights for scorer" << scorerNum ;
throw runtime_error(msg.str());
}
if (scorerNum == 0) {
scores.push_back(weight * (*itstatsc));
} else {
scores[inc] += weight * (*itstatsc);
}
//cout << "Scorer:" << scorerNum << " scoreNum:" << inc << " score: " << (*itstatsc) << " weight:" << weight << endl;
inc++;
}
scorerNum++;
}
}
// Forward the same list of reference files to every sub-scorer.
void InterpolatedScorer::setReferenceFiles(const vector<string>& referenceFiles) {
for (vector<Scorer *>::iterator itsc = _scorers.begin(); itsc!=_scorers.end();itsc++){
//the scorers that use alignments use the reference files in the constructor through config
(*itsc)->setReferenceFiles(referenceFiles);
}
void InterpolatedScorer::setReferenceFiles(const vector<string>& referenceFiles)
{
for (vector<Scorer *>::iterator itsc = _scorers.begin(); itsc!=_scorers.end(); itsc++) {
//the scorers that use alignments use the reference files in the constructor through config
(*itsc)->setReferenceFiles(referenceFiles);
}
}
// Text can be:
// Reference sentence ||| Reference sentence alignment information (as given by MOSES -include-alignment-in-n-best)
// If a permutation distance scorer, send alignment info
// Else if other scorer, remove the alignment info and then send reference as usual
//
// Collects the statistics of all sub-scorers for sentence sid: each scorer
// fills a temporary ScoreStats, these are streamed into one buffer separated
// by a single space, and the resulting string is parsed into `entry`.
// Scorers whose useAlignment() is true receive the full `text`; the others
// receive `sentence`.
// NOTE(review): `sentence` is only filled when text contains "|||". Without
// that separator it stays empty, so non-alignment scorers are given an empty
// string instead of text - confirm this is intended.
void InterpolatedScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry) {
stringstream buff;
string align = text;
string sentence = "";
size_t alignmentData = text.find("|||");
//Get sentence and alignment parts
if(alignmentData != string::npos) {
getNextPound(align,sentence, "|||");
}
int i=0;
for (vector<Scorer*>::iterator itsc = _scorers.begin(); itsc!=_scorers.end();itsc++){
ScoreStats tempEntry;
if ((*itsc)->useAlignment()) {
(*itsc)->prepareStats(sid, text, tempEntry);
} else {
(*itsc)->prepareStats(sid, sentence, tempEntry);
}
if (i > 0) buff << " ";
buff << tempEntry;
i++;
void InterpolatedScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
{
stringstream buff;
string align = text;
string sentence = "";
size_t alignmentData = text.find("|||");
//Get sentence and alignment parts
if(alignmentData != string::npos) {
getNextPound(align,sentence, "|||");
}
int i=0;
for (vector<Scorer*>::iterator itsc = _scorers.begin(); itsc!=_scorers.end(); itsc++) {
ScoreStats tempEntry;
if ((*itsc)->useAlignment()) {
(*itsc)->prepareStats(sid, text, tempEntry);
} else {
(*itsc)->prepareStats(sid, sentence, tempEntry);
}
//cout << " Scores for interpolated: " << buff << endl;
string str = buff.str();
entry.set(str);
if (i > 0) buff << " ";
buff << tempEntry;
i++;
}
//cout << " Scores for interpolated: " << buff << endl;
string str = buff.str();
entry.set(str);
}

View File

@ -18,48 +18,49 @@
/**
* Abstract base class for scorers that include other scorers eg.
* Interpolated HAMMING and BLEU scorer **/
// Holds a vector of sub-scorers (_scorers) and one weight per sub-scorer
// (_scorerWeights).
// NOTE(review): the destructor is empty although _scorers holds raw pointers
// obtained in the constructor - confirm ownership.
class InterpolatedScorer : public Scorer {
class InterpolatedScorer : public Scorer
{
public:
// name would be: "HAMMING,BLEU" or similar
InterpolatedScorer(const string& name, const string& config);
~InterpolatedScorer(){};
void score(const candidates_t& candidates, const diffs_t& diffs,
statscores_t& scores);
public:
// name would be: "HAMMING,BLEU" or similar
InterpolatedScorer(const string& name, const string& config);
~InterpolatedScorer() {};
void score(const candidates_t& candidates, const diffs_t& diffs,
statscores_t& scores);
void setReferenceFiles(const vector<string>& referenceFiles);
void prepareStats(size_t sid, const string& text, ScoreStats& entry);
size_t NumberOfScores() const {
size_t sz=0;
for (vector<Scorer*>::const_iterator itsc = _scorers.begin(); itsc < _scorers.end();itsc++){
sz += (*itsc)->NumberOfScores();
}
return sz;
};
bool useAlignment() const {
//cout << "InterpolatedScorer::useAlignment" << endl;
for (vector<Scorer*>::const_iterator itsc = _scorers.begin(); itsc < _scorers.end();itsc++){
if ((*itsc)->useAlignment()){
//cout <<"InterpolatedScorer::useAlignment Returning true"<<endl;
return true;
}
}
return false;
};
void setReferenceFiles(const vector<string>& referenceFiles);
void prepareStats(size_t sid, const string& text, ScoreStats& entry);
// Total number of scores: the sum of NumberOfScores() over all sub-scorers.
size_t NumberOfScores() const {
size_t sz=0;
for (vector<Scorer*>::const_iterator itsc = _scorers.begin(); itsc < _scorers.end(); itsc++) {
sz += (*itsc)->NumberOfScores();
}
return sz;
};
//calculate the actual score - this gets done in the individual scorers
//statscore_t calculateScore(const vector<statscore_t>& totals);
void setScoreData(ScoreData* data);
// True as soon as any sub-scorer reports useAlignment().
bool useAlignment() const {
//cout << "InterpolatedScorer::useAlignment" << endl;
for (vector<Scorer*>::const_iterator itsc = _scorers.begin(); itsc < _scorers.end(); itsc++) {
if ((*itsc)->useAlignment()) {
//cout <<"InterpolatedScorer::useAlignment Returning true"<<endl;
return true;
}
}
return false;
};
protected:
//calculate the actual score - this gets done in the individual scorers
//statscore_t calculateScore(const vector<statscore_t>& totals);
void setScoreData(ScoreData* data);
//regularisation
ScorerRegularisationStrategy _regularisationStrategy;
size_t _regularisationWindow;
protected:
vector<Scorer*> _scorers;
vector<float> _scorerWeights;
//regularisation
ScorerRegularisationStrategy _regularisationStrategy;
size_t _regularisationWindow;
vector<Scorer*> _scorers;
vector<float> _scorerWeights;
};

View File

@ -14,31 +14,34 @@ static const float MAX_FLOAT=numeric_limits<float>::max();
// Replace the current scorer with S. A previously set scorer is deleted
// first, and the destructor deletes the stored pointer, so the Optimizer
// owns S after this call.
void Optimizer::SetScorer(Scorer *S){
void Optimizer::SetScorer(Scorer *S)
{
if(scorer)
delete scorer;
scorer=S;
}
// Replace the current feature data with F. Previously set feature data is
// deleted first, and the destructor deletes the stored pointer, so the
// Optimizer owns F after this call.
void Optimizer::SetFData(FeatureData *F){
void Optimizer::SetFData(FeatureData *F)
{
if(FData)
delete FData;
FData=F;
};
Optimizer::Optimizer(unsigned Pd,vector<unsigned> i2O,vector<parameter_t> start):scorer(NULL),FData(NULL){
Optimizer::Optimizer(unsigned Pd,vector<unsigned> i2O,vector<parameter_t> start):scorer(NULL),FData(NULL)
{
//warning: the init vector is a full set of parameters, of dimension pdim!
Point::pdim=Pd;
assert(start.size()==Pd);
Point::dim=i2O.size();
Point::optindices=i2O;
if (Point::pdim>Point::dim){
for (unsigned int i=0;i<Point::pdim;i++){
if (Point::pdim>Point::dim) {
for (unsigned int i=0; i<Point::pdim; i++) {
unsigned int j = 0;
while (j<Point::dim && i!=i2O[j])
j++;
j++;
if (j==Point::dim)//the index i wasnt found on optindices, it is a fixed index, we use the value of the start vector
Point::fixedweights[i]=start[i];
@ -46,12 +49,14 @@ Optimizer::Optimizer(unsigned Pd,vector<unsigned> i2O,vector<parameter_t> start)
}
};
// Deletes the scorer and feature data pointers held by this Optimizer.
Optimizer::~Optimizer(){
Optimizer::~Optimizer()
{
delete scorer;
delete FData;
}
statscore_t Optimizer::GetStatScore(const Point& param)const{
statscore_t Optimizer::GetStatScore(const Point& param)const
{
vector<unsigned> bests;
Get1bests(param,bests);
//copy(bests.begin(),bests.end(),ostream_iterator<unsigned>(cerr," "));
@ -60,23 +65,25 @@ statscore_t Optimizer::GetStatScore(const Point& param)const{
};
/**compute the intersection of 2 lines*/
// Returns the x at which y = m1*x + b1 and y = m2*x + b2 meet, computed as
// (b2-b1)/(m1-m2). A non-finite result (for example when m1 == m2) is
// replaced by MAX_FLOAT.
float intersect (float m1, float b1,float m2,float b2){
float intersect (float m1, float b1,float m2,float b2)
{
float isect = ((b2-b1)/(m1-m2));
if (!isfinite(isect)) {
isect = MAX_FLOAT;
isect = MAX_FLOAT;
}
return isect;
}
map<float,diff_t >::iterator AddThreshold(map<float,diff_t >& thresholdmap,float newt,pair<unsigned,unsigned> newdiff){
map<float,diff_t >::iterator AddThreshold(map<float,diff_t >& thresholdmap,float newt,pair<unsigned,unsigned> newdiff)
{
map<float,diff_t>::iterator it=thresholdmap.find(newt);
if(it!=thresholdmap.end()){
if(it!=thresholdmap.end()) {
//the threshold already exists!! this is very unlikely
if(it->second.back().first==newdiff.first)
it->second.back().second=newdiff.second;//there was already a diff for this sentence, we change the 1 best;
else
it->second.push_back(newdiff);
}else{
} else {
//normal case
pair< map<float,diff_t >::iterator,bool > ins=thresholdmap.insert(threshold(newt,diff_t(1,newdiff)));
assert(ins.second);//we really inserted something
@ -86,244 +93,247 @@ map<float,diff_t >::iterator AddThreshold(map<float,diff_t >& thresholdmap,float
};
/**
 * Search for the best point on the line  origin + x*direction.
 *
 * For every sentence the candidate translations are treated as lines in x
 * (slope = direction*features, intercept = origin*features). The x values at
 * which the 1-best candidate of some sentence changes are collected in
 * thresholdmap, the statistic score of every interval between two consecutive
 * thresholds is obtained from GetIncStatScore, and a point inside the highest
 * scoring interval is returned.
 *
 * @param origin    starting point of the line
 * @param direction direction of the line
 * @param bestpoint receives direction*bestx+origin, with its score field set
 * @return the best score found on the line
 *
 * NOTE(review): gradient.begin() is dereferenced unconditionally, so every
 * sentence is assumed to have at least one candidate - confirm with callers.
 */
statscore_t Optimizer::LineOptimize(const Point& origin,const Point& direction,Point& bestpoint)const
{
  // we are looking for the best Point on the line y=Origin+x*direction
  // two thresholds of the same sentence closer than min_int are merged
  float min_int=0.0001;
  //typedef pair<unsigned,unsigned> diff;//first the sentence that changes, second is the new 1best for this sentence
  // threshold x -> list of (sentence, new 1best) changes happening at x
  map<float,diff_t> thresholdmap;
  // sentinel entry standing for x=-inf; its diff list stays empty
  thresholdmap[MIN_FLOAT]=diff_t();
  vector<unsigned> first1best;//the vector of nbests for x=-inf
  for(unsigned int S=0; S<size(); S++) {
    map<float,diff_t >::iterator previnserted=thresholdmap.begin();
    //first we determine the translation with the best feature score for each sentence and each value of x
    //cerr << "Sentence " << S << endl;
    // candidates of sentence S ordered by slope; value is the candidate index
    multimap<float,unsigned> gradient;
    vector<float> f0;
    f0.resize(FData->get(S).size());
    for(unsigned j=0; j<FData->get(S).size(); j++) {
      gradient.insert(pair<float,unsigned>(direction*(FData->get(S,j)),j));//gradient of the feature function for this particular target sentence
      f0[j]=origin*FData->get(S,j);//compute the feature function at the origin point
    }
    //now lets compute the 1best for each value of x
    multimap<float,unsigned>::iterator gradientit=gradient.begin();
    multimap<float,unsigned>::iterator highest_f0=gradient.begin();
    float smallest=gradientit->first;//smallest gradient
    //several candidates can have the lowest slope (eg for word penalty where the gradient is an integer )
    gradientit++;
    while(gradientit!=gradient.end()&&gradientit->first==smallest) {
      if(f0[gradientit->second]>f0[highest_f0->second])
        highest_f0=gradientit;//among equal slopes the highest line is the one with the highest f0
      gradientit++;
    }
    gradientit = highest_f0;
    first1best.push_back(highest_f0->second);
    //now we look for the intersection points indicating a change of 1 best
    //we use the fact that the function is convex, which means that the gradient can only go up
    while(gradientit!=gradient.end()) {
      // same iterator type as gradientit (the container is a multimap)
      multimap<float,unsigned>::iterator leftmost=gradientit;
      float m=gradientit->first;
      float b=f0[gradientit->second];
      multimap<float,unsigned>::iterator gradientit2=gradientit;
      gradientit2++;
      float leftmostx=MAX_FLOAT;
      for(; gradientit2!=gradient.end(); gradientit2++) {
        //look for all candidates with a gradient bigger than the current one and find the one with the leftmost intersection
        float curintersect;
        if(m!=gradientit2->first) {
          curintersect=intersect(m,b,gradientit2->first,f0[gradientit2->second]);
          //cerr << "curintersect: " << curintersect << " leftmostx: " << leftmostx << endl;
          if(curintersect<=leftmostx) {
            //we have found an intersection to the left of the leftmost we had so far.
            //we might have curintersect==leftmostx, for example if 2 candidates are the same;
            //in that case it is better to update leftmost to gradientit2 to avoid some recomputing later
            leftmostx=curintersect;
            leftmost=gradientit2;//this is the new reference
          }
        }
      }
      if (leftmost == gradientit) {
        //we didn't find any more intersections
        //the rightmost bestindex is the one with the highest slope.
        //they should be equal but there might be a small difference due to rounding error
        // NOTE(review): relies on a floating-point overload of abs being visible here - verify
        assert(abs(leftmost->first-gradient.rbegin()->first)<0.0001);
        break;
      }
      //we have found the next intersection!
      pair<unsigned,unsigned> newd(S,leftmost->second);//new onebest for Sentence S is leftmost->second
      if(leftmostx-previnserted->first<min_int) {
        /* Require that the intersection Point be at least min_int
           to the right of the previous one (for this sentence). If not, we replace the
           previous intersection Point with this one. Yes, it can even
           happen that the new intersection Point is slightly to the
           left of the old one, because of numerical imprecision.
           We do not check that we are also to the right of the penultimate point;
           if that happens the 1best of the interval will be wrong.
           We replace previnserted by the new one because we do not want to keep
           2 very close thresholds: if the minimum is there it could be an artifact.
        */
        map<float,diff_t>::iterator tit=thresholdmap.find(leftmostx);
        if(tit==previnserted) {
          //the threshold is the same as before; can happen if 2 candidates are the same for example
          assert(previnserted->second.back().first==newd.first);
          previnserted->second.back()=newd;//just replace the 1 best for sentence S
          //previnserted doesn't change
        } else {
          if(tit==thresholdmap.end()) {
            thresholdmap[leftmostx]=previnserted->second;//We keep the diffs at previnserted
            thresholdmap.erase(previnserted);//erase old previnserted
            previnserted=thresholdmap.find(leftmostx);//point previnserted to the new threshold
            previnserted->second.back()=newd;//we update the diff for sentence S
          } else { //threshold already exists but is not the previous one.
            //we append the diffs in previnserted to tit before destroying previnserted
            tit->second.insert(tit->second.end(),previnserted->second.begin(),previnserted->second.end());
            assert(tit->second.back().first==newd.first);
            tit->second.back()=newd;//change diff for sentence S
            thresholdmap.erase(previnserted);//erase old previnserted
            previnserted=tit;//point previnserted to the new threshold
          }
        }
        assert(previnserted != thresholdmap.end());
      } else { //normal insertion process
        previnserted=AddThreshold(thresholdmap,leftmostx,newd);
      }
      gradientit=leftmost;
    } //while(gradientit!=gradient.end())
  } //loop on S
  //now the threshold map is up to date:
  //it contains all the x values where the function changed its value, along with the 1best changes taking effect after each threshold
  map<float,diff_t >::iterator thrit;
  if(verboselevel()>6) {
    cerr << "Thresholds:(" <<thresholdmap.size()<<")"<< endl;
    for (thrit = thresholdmap.begin(); thrit!=thresholdmap.end(); thrit++) {
      cerr << "x: " << thrit->first << " diffs";
      for (size_t j = 0; j < thrit->second.size(); ++j) {
        cerr << " " <<thrit->second[j].first << "," << thrit->second[j].second;
      }
      cerr << endl;
    }
  }
  //last thing to do is compute the Stat score (ie BLEU) and find the best interval
  thrit=thresholdmap.begin();
  ++thrit;//first entry corresponds to MIN_FLOAT and first1best
  diffs_t diffs;
  for(; thrit!=thresholdmap.end(); thrit++)
    diffs.push_back(thrit->second);
  vector<statscore_t> scores=GetIncStatScore(first1best,diffs);
  thrit=thresholdmap.begin();
  statscore_t bestscore=MIN_FLOAT;
  float bestx=MIN_FLOAT;
  assert(scores.size()==thresholdmap.size());//we skipped the first element of the map but GetIncStatScore returns 1 more for first1best
  // thrit walks the thresholds in step with sc
  for(unsigned int sc=0; sc!=scores.size(); sc++) {
    //cerr << "x=" << thrit->first << " => " << scores[sc] << endl;
    if (scores[sc] > bestscore) {
      //This is the score for the interval [thrit->first, (thrit+1)->first]
      //unless we're at the last score, when it's the score
      //for the interval [thrit->first,+inf]
      bestscore = scores[sc];

      //if we're not in [-inf,x1] or [xn,+inf] then just take the value
      //of x which splits the interval in half. For the rightmost interval,
      //take x to be the last interval boundary + 0.1, and for the leftmost
      //interval, take x to be the first interval boundary - 1000.
      //These values are taken from cmert.
      float leftx = thrit->first;
      if (thrit == thresholdmap.begin()) {
        leftx = MIN_FLOAT;
      }
      ++thrit;
      float rightx = MAX_FLOAT;
      if (thrit != thresholdmap.end()) {
        rightx = thrit->first;
      }
      --thrit;
      //cerr << "leftx: " << leftx << " rightx: " << rightx << endl;
      if (leftx == MIN_FLOAT) {
        bestx = rightx-1000;
      } else if (rightx == MAX_FLOAT) {
        bestx = leftx+0.1;
      } else {
        bestx = 0.5 * (rightx + leftx);
      }
      //cerr << "x = " << "set new bestx to: " << bestx << endl;
    }
    ++thrit;
  }

  if(abs(bestx)<0.00015) {
    bestx=0.0;//the origin of the line is the best point! we put it back at 0 so we do not propagate rounding errors
    if(verboselevel()>4)
      cerr<<"best point on line at origin"<<endl;
  }
  if(verboselevel()>3) {
//    cerr<<"end Lineopt, bestx="<<bestx<<endl;
  }
  //finally we convert bestx (position on the line) to a point
  bestpoint=direction*bestx+origin;
  bestpoint.score=bestscore;
  return bestscore;
}
void Optimizer::Get1bests(const Point& P,vector<unsigned>& bests)const{
void Optimizer::Get1bests(const Point& P,vector<unsigned>& bests)const
{
assert(FData);
bests.clear();
bests.resize(size());
for(unsigned i=0;i<size();i++){
for(unsigned i=0; i<size(); i++) {
float bestfs=MIN_FLOAT;
unsigned idx=0;
unsigned j;
for(j=0;j<FData->get(i).size();j++){
for(j=0; j<FData->get(i).size(); j++) {
float curfs=P*FData->get(i,j);
if(curfs>bestfs){
bestfs=curfs;
idx=j;
if(curfs>bestfs) {
bestfs=curfs;
idx=j;
}
}
bests[i]=idx;
}
}
statscore_t Optimizer::Run(Point& P)const{
if(!FData){
statscore_t Optimizer::Run(Point& P)const
{
if(!FData) {
cerr<<"error trying to optimize without Features loaded"<<endl;
exit(2);
}
if(!scorer){
if(!scorer) {
cerr<<"error trying to optimize without a Scorer loaded"<<endl;
exit(2);
}
if (scorer->getReferenceSize()!=FData->size()){
if (scorer->getReferenceSize()!=FData->size()) {
cerr<<"error size mismatch between FeatureData and Scorer"<<endl;
exit(2);
}
statscore_t score=GetStatScore(P);
P.score=score;
if(verboselevel()>2)
statscore_t score=GetStatScore(P);
P.score=score;
if(verboselevel()>2)
cerr<<"Starting point: "<< P << " => "<< P.score << endl;
statscore_t s=TrueRun(P);
P.score=s;//just in case its not done in TrueRun
@ -331,9 +341,10 @@ statscore_t Optimizer::Run(Point& P)const{
cerr<<"Ending point: "<< P <<" => "<< s << endl;
return s;
}
vector<statscore_t> Optimizer::GetIncStatScore(vector<unsigned> thefirst,vector<vector <pair<unsigned,unsigned> > > thediffs)const{
vector<statscore_t> Optimizer::GetIncStatScore(vector<unsigned> thefirst,vector<vector <pair<unsigned,unsigned> > > thediffs)const
{
assert(scorer);
vector<statscore_t> theres;
@ -347,61 +358,62 @@ vector<statscore_t> Optimizer::GetIncStatScore(vector<unsigned> thefirst,vector<
//---------------- code for the powell optimizer
float SimpleOptimizer::eps=0.0001;
/**
 * Coordinate-wise line search.
 *
 * In every round each of the Point::getdim() axis directions is optimized
 * with LineOptimize starting from P; the best point found over all
 * directions becomes the new P. Rounds are repeated while the best score
 * improves by more than eps.
 *
 * @param P in: starting point (its score, if above MIN_FLOAT, must be beaten);
 *          out: the best point found.
 * @return the best score found
 */
statscore_t SimpleOptimizer::TrueRun(Point& P)const
{
  statscore_t prevscore=0;
  statscore_t bestscore=MIN_FLOAT;
  Point best;

  //If P is already defined and provides a score
  //we must improve over this score
  if(P.score>bestscore) {
    bestscore=P.score;
    best=P;
  }

  int nrun=0;
  do {
    ++nrun;
    if(verboselevel()>2&&nrun>1)
      cerr<<"last diff="<<bestscore-prevscore<<" nrun "<<nrun<<endl;
    prevscore=bestscore;

    Point linebest;

    for(unsigned int d=0; d<Point::getdim(); d++) {
      if(verboselevel()>4) {
        //	cerr<<"minimizing along direction "<<d<<endl;
        cerr<<"starting point: " << P << " => " << prevscore << endl;
      }
      // build the unit vector along axis d
      Point direction;
      for(unsigned int i=0; i<Point::getdim(); i++)
        direction[i]=0.0; // was the no-op expression "direction[i];"
      direction[d]=1.0;
      statscore_t curscore=LineOptimize(P,direction,linebest);//find the best point on the line
      if(verboselevel()>5) {
        cerr<<"direction: "<< d << " => " << curscore << endl;
        cerr<<"\tending point: "<< linebest << " => " << curscore << endl;
      }
      if(curscore>bestscore) {
        bestscore=curscore;
        best=linebest;
        if(verboselevel()>3) {
          cerr<<"new best dir:"<<d<<" ("<<nrun<<")"<<endl;
          cerr<<"new best Point "<<best<< " => " <<curscore<<endl;
        }
      }
    }
    P=best;//update the current vector with the best point on all lines tested
    if(verboselevel()>3)
      cerr<<nrun<<"\t"<<P<<endl;
  } while(bestscore-prevscore>eps);

  if(verboselevel()>2) {
    cerr<<"end Powell Algo, nrun="<<nrun<<endl;
    cerr<<"last diff="<<bestscore-prevscore<<endl;
    cerr<<"\t"<<P<<endl;
  }
  return bestscore;
}
@ -409,58 +421,63 @@ statscore_t SimpleOptimizer::TrueRun(Point& P)const{
/**RandomOptimizer to use as baseline and test.\n
Randomizes P with per-dimension bounds [0,1], scores it, stores the score in
P.score and returns it.*/
statscore_t RandomOptimizer::TrueRun(Point& P)const
{
  // per-dimension bounds handed to Point::Randomize
  vector<parameter_t> lower(Point::getdim());
  vector<parameter_t> upper(Point::getdim());
  for(unsigned int d=0; d<Point::getdim(); d++) {
    lower[d]=0.0;
    upper[d]=1.0;
  }
  P.Randomize(lower,upper);
  statscore_t score=GetStatScore(P);
  P.score=score;
  return score;
}
//--------------------------------------
vector<string> OptimizerFactory::typenames;
void OptimizerFactory::SetTypeNames(){
if(typenames.empty()){
void OptimizerFactory::SetTypeNames()
{
if(typenames.empty()) {
typenames.resize(NOPTIMIZER);
typenames[POWELL]="powell";
typenames[RANDOM]="random";
//add new type there
}
}
}
/**
 * @return a copy of the table of known optimizer type names, initializing
 *         the table first if it is still empty.
 */
vector<string> OptimizerFactory::GetTypeNames()
{
  if(typenames.empty())
    SetTypeNames();
  return typenames;
}
/**
 * Map an optimizer type name to its OptType value.
 *
 * @param type name to look up (exact match against the typenames table)
 * @return the index of the matching name cast to OptType; when no name
 *         matches, the size of the table cast to OptType.
 */
OptimizerFactory::OptType OptimizerFactory::GetOType(string type)
{
  unsigned int thetype;
  if(typenames.empty())
    SetTypeNames();
  // linear scan; thetype ends at typenames.size() when nothing matches
  for(thetype=0; thetype<typenames.size(); thetype++)
    if(typenames[thetype]==type)
      break;
  return((OptType)thetype);
}
Optimizer* OptimizerFactory::BuildOptimizer(unsigned dim,vector<unsigned> i2o,vector<parameter_t> start,string type){
Optimizer* OptimizerFactory::BuildOptimizer(unsigned dim,vector<unsigned> i2o,vector<parameter_t> start,string type)
{
OptType T=GetOType(type);
if(T==NOPTIMIZER){
if(T==NOPTIMIZER) {
cerr<<"Error: unknown Optimizer type "<<type<<endl;
cerr<<"Known Algorithm are:"<<endl;
unsigned int thetype;
for(thetype=0;thetype<typenames.size();thetype++)
for(thetype=0; thetype<typenames.size(); thetype++)
cerr<<typenames[thetype]<<endl;
throw ("unknown Optimizer Type");
}
switch((OptType)T){
switch((OptType)T) {
case POWELL:
return new SimpleOptimizer(dim,i2o,start);
break;
@ -469,6 +486,6 @@ Optimizer* OptimizerFactory::BuildOptimizer(unsigned dim,vector<unsigned> i2o,ve
break;
default:
cerr<<"Error: unknown optimizer"<<type<<endl;
return NULL;
}
return NULL;
}
}

View File

@ -15,61 +15,69 @@ typedef float featurescore;
using namespace std;
/**abstract virtual class*/
class Optimizer{
protected:
Scorer * scorer; //no accessor for them only child can use them
FeatureData * FData;//no accessor for them only child can use them
public:
class Optimizer
{
protected:
Scorer * scorer; //no accessor for them only child can use them
FeatureData * FData;//no accessor for them only child can use them
public:
Optimizer(unsigned Pd,vector<unsigned> i2O,vector<parameter_t> start);
void SetScorer(Scorer *S);
void SetFData(FeatureData *F);
virtual ~Optimizer();
unsigned size()const{return (FData?FData->size():0);}
unsigned size()const {
return (FData?FData->size():0);
}
/**Generic wrapper around TrueRun to check a few things. Non virtual*/
statscore_t Run(Point&)const;
/**main function that perform an optimization*/
/**main function that perform an optimization*/
virtual statscore_t TrueRun(Point&)const=0;
/**given a set of lambdas, get the nbest for each sentence*/
void Get1bests(const Point& param,vector<unsigned>& bests)const;
/**given a set of nbests, get the Statistical score*/
statscore_t GetStatScore(const vector<unsigned>& nbests)const{return scorer->score(nbests);};
statscore_t GetStatScore(const vector<unsigned>& nbests)const {
return scorer->score(nbests);
};
/**given a set of lambdas, get the total statistical score*/
statscore_t GetStatScore(const Point& param)const;
statscore_t GetStatScore(const Point& param)const;
vector<statscore_t > GetIncStatScore(vector<unsigned> ref,vector<vector <pair<unsigned,unsigned> > >)const;
statscore_t LineOptimize(const Point& start,const Point& direction,Point& best)const;//Get the optimal Lambda and the best score in a particular direction from a given Point
};
/**default basic optimizer*/
class SimpleOptimizer: public Optimizer{
class SimpleOptimizer: public Optimizer
{
private:
static float eps;
static float eps;
public:
SimpleOptimizer(unsigned dim,vector<unsigned> i2O,vector<parameter_t> start):Optimizer(dim,i2O,start){};
SimpleOptimizer(unsigned dim,vector<unsigned> i2O,vector<parameter_t> start):Optimizer(dim,i2O,start) {};
virtual statscore_t TrueRun(Point&)const;
};
class RandomOptimizer: public Optimizer{
class RandomOptimizer: public Optimizer
{
public:
RandomOptimizer(unsigned dim,vector<unsigned> i2O,vector<parameter_t> start):Optimizer(dim,i2O,start){};
RandomOptimizer(unsigned dim,vector<unsigned> i2O,vector<parameter_t> start):Optimizer(dim,i2O,start) {};
virtual statscore_t TrueRun(Point&)const;
};
class OptimizerFactory{
public:
class OptimizerFactory
{
public:
// unsigned dim;
//Point Start;
static vector<string> GetTypeNames();
static Optimizer* BuildOptimizer(unsigned dim,vector<unsigned>tooptimize,vector<parameter_t> start,string type);
private:
enum OptType{POWELL=0,RANDOM,NOPTIMIZER};//Add new optimizer here BEFORE NOPTIMZER
private:
enum OptType {POWELL=0,RANDOM,NOPTIMIZER}; //Add new optimizer here BEFORE NOPTIMZER
static OptType GetOType(string);
static vector<string> typenames;
static void SetTypeNames();
};

View File

@ -1,69 +1,72 @@
#include "PerScorer.h"
void PerScorer::setReferenceFiles(const vector<string>& referenceFiles) {
// for each line in the reference file, create a multiset of the
// word ids
if (referenceFiles.size() != 1) {
throw runtime_error("PER only supports a single reference");
void PerScorer::setReferenceFiles(const vector<string>& referenceFiles)
{
// for each line in the reference file, create a multiset of the
// word ids
if (referenceFiles.size() != 1) {
throw runtime_error("PER only supports a single reference");
}
_reftokens.clear();
_reflengths.clear();
ifstream in(referenceFiles[0].c_str());
if (!in) {
throw runtime_error("Unable to open " + referenceFiles[0]);
}
string line;
int sid = 0;
while (getline(in,line)) {
vector<int> tokens;
encode(line,tokens);
_reftokens.push_back(multiset<int>());
for (size_t i = 0; i < tokens.size(); ++i) {
_reftokens.back().insert(tokens[i]);
}
_reftokens.clear();
_reflengths.clear();
ifstream in(referenceFiles[0].c_str());
if (!in) {
throw runtime_error("Unable to open " + referenceFiles[0]);
_reflengths.push_back(tokens.size());
if (sid > 0 && sid % 100 == 0) {
TRACE_ERR(".");
}
string line;
int sid = 0;
while (getline(in,line)) {
vector<int> tokens;
encode(line,tokens);
_reftokens.push_back(multiset<int>());
for (size_t i = 0; i < tokens.size(); ++i) {
_reftokens.back().insert(tokens[i]);
}
_reflengths.push_back(tokens.size());
if (sid > 0 && sid % 100 == 0) {
TRACE_ERR(".");
}
++sid;
}
TRACE_ERR(endl);
++sid;
}
TRACE_ERR(endl);
}
/**
 * Compute the PER statistics of one hypothesis and store them in entry.
 *
 * The statistics written are, space separated: the number of correct tokens
 * (for each distinct hypothesis token, the smaller of its count in the
 * reference and in the hypothesis), the hypothesis length, and the
 * reference length.
 *
 * @param sid   index of the sentence in the loaded reference set
 * @param text  hypothesis text
 * @param entry receives the statistics string via entry.set()
 * @throws runtime_error if sid is not below the number of loaded references
 */
void PerScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
{
  if (sid >= _reflengths.size()) {
    stringstream msg;
    msg << "Sentence id (" << sid << ") not found in reference set";
    throw runtime_error(msg.str());
  }
  //calculate correct, output_length and ref_length for
  //the line and store it in entry
  vector<int> testtokens;
  encode(text,testtokens);
  multiset<int> testtokens_all(testtokens.begin(),testtokens.end());
  set<int> testtokens_unique(testtokens.begin(),testtokens.end());
  int correct = 0;
  for (set<int>::iterator i = testtokens_unique.begin();
       i != testtokens_unique.end(); ++i) {
    int token = *i;
    correct += min(_reftokens[sid].count(token), testtokens_all.count(token));
  }

  ostringstream stats;
  stats << correct << " " << testtokens.size() << " " << _reflengths[sid] << " " ;
  string stats_str = stats.str();
  entry.set(stats_str);
}
/**
 * Turn the accumulated statistics into the score
 *   (comps[0] - max(0, comps[1]-comps[2])) / comps[2].
 *
 * @param comps statistics vector; elements 0, 1 and 2 are read, so it must
 *              hold at least three values
 * @return the score, or 0 when comps[2] is 0
 */
float PerScorer::calculateScore(const vector<float>& comps)
{
  float denom = comps[2];
  float num = comps[0] - max(float(0),comps[1]-comps[2]);
  if (denom == 0) {
    //shouldn't happen!
    return 0.0;
  } else {
    return num/denom;
  }
}

View File

@ -22,34 +22,40 @@ using namespace std;
* as 1 - (correct - max(0,output_length - ref_length)) / ref_length
* In fact, we ignore the " 1 - " so that it can be maximised.
**/
class PerScorer: public StatisticsBasedScorer
{
public:
  PerScorer(const string& config = "") : StatisticsBasedScorer("PER",config) {}
  virtual void setReferenceFiles(const vector<string>& referenceFiles);
  virtual void prepareStats(size_t sid, const string& text, ScoreStats& entry);

  virtual void whoami() {
    cerr << "I AM PerScorer" << std::endl;
  }

  // number of statistics per sentence; also logs it to cerr on every call
  size_t NumberOfScores() const {
    cerr << "PerScorer: 3" << endl;
    return 3;
  };
  bool useAlignment() const {
    return false;
  };

protected:

  virtual float calculateScore(const vector<float>& comps) ;

private:

  //no copy
  PerScorer(const PerScorer&);
  ~PerScorer() {};
  PerScorer& operator=(const PerScorer&);
  // data extracted from reference files
  vector<size_t> _reflengths;
  vector<multiset<int> > _reftokens;
};
#endif //__PERSCORER_H

View File

@ -16,27 +16,26 @@ using namespace std;
/**
 * Build a permutation from an alignment string.
 *
 * @param alignment    alignment string forwarded to set()
 * @param sourceLength source sentence length; set() is only called when it
 *                     is positive
 * @param targetLength stored in m_targetLength
 */
Permutation::Permutation(const string &alignment, const int sourceLength, const int targetLength )
{
  if (sourceLength > 0) {
    set(alignment, sourceLength);
  }
  m_targetLength = targetLength;
}
/** @return the number of entries in m_array. */
size_t Permutation::getLength() const
{
  // returned as size_t directly; the former int(...) cast could truncate
  return m_array.size();
}
/** Print every entry of m_array to cout as "(index:value), " followed by a newline. */
void Permutation::dump() const
{
  int j=0;
  for (vector<int>::const_iterator i = m_array.begin(); i !=m_array.end(); ++i) {
    cout << "(";
    cout << j << ":" << *i ;
    cout << "), ";
    j++;
  }
  cout << endl;
}
@ -49,286 +48,272 @@ void Permutation::dump() const
void Permutation::set(const string & alignment,const int sourceLength)
{
//cout << "******** Permutation::set :" << alignment << ": len : " << sourceLength <<endl;
if(sourceLength <= 0)
{
//not found
cerr << "Source sentence length not positive:"<< sourceLength << endl;
exit(0);
//cout << "******** Permutation::set :" << alignment << ": len : " << sourceLength <<endl;
if(sourceLength <= 0) {
//not found
cerr << "Source sentence length not positive:"<< sourceLength << endl;
exit(0);
}
if (alignment.length() <= 0) {
//alignment empty - could happen but not good
cerr << "Alignment string empty:"<< alignment << endl;
}
//Tokenise on whitespace
string buf; // Have a buffer string
stringstream ss(alignment); // Insert the string into a stream
vector<string> tokens; // Create vector to hold our words
while (ss >> buf)
tokens.push_back(buf);
vector<int> tempPerm(sourceLength, -1);
//Set tempPerm to have one target position per source position
for (size_t i=0; i<tokens.size(); i++) {
string temp = tokens[i];
int posDelimeter = temp.find("-");
if(posDelimeter == int(string::npos)) {
cerr << "Delimiter not found - :"<< tokens[i] << endl;
exit(1);
}
if (alignment.length() <= 0)
{
//alignment empty - could happen but not good
cerr << "Alignment string empty:"<< alignment << endl;
int sourcePos = atoi((temp.substr(0, posDelimeter)).c_str());
int targetPos = atoi((temp.substr(posDelimeter+1)).c_str());
//cout << "SP:" << sourcePos << " TP:" << targetPos << endl;
if (sourcePos > sourceLength) {
cerr << "Source sentence length:" << sourceLength << " is smaller than alignment source position:" << sourcePos << endl;
exit(1);
}
//Tokenise on whitespace
string buf; // Have a buffer string
stringstream ss(alignment); // Insert the string into a stream
vector<string> tokens; // Create vector to hold our words
while (ss >> buf)
tokens.push_back(buf);
vector<int> tempPerm(sourceLength, -1);
//Set tempPerm to have one target position per source position
for (size_t i=0; i<tokens.size(); i++) {
string temp = tokens[i];
int posDelimeter = temp.find("-");
if(posDelimeter == int(string::npos)) {
cerr << "Delimiter not found - :"<< tokens[i] << endl;
exit(1);
}
int sourcePos = atoi((temp.substr(0, posDelimeter)).c_str());
int targetPos = atoi((temp.substr(posDelimeter+1)).c_str());
//cout << "SP:" << sourcePos << " TP:" << targetPos << endl;
if (sourcePos > sourceLength) {
cerr << "Source sentence length:" << sourceLength << " is smaller than alignment source position:" << sourcePos << endl;
exit(1);
}
//If have multiple target pos aligned to one source,
// then ignore all but first alignment
if (tempPerm[sourcePos] == -1 || tempPerm[sourcePos] > targetPos)
{
tempPerm[sourcePos] = targetPos;
}
//If have multiple target pos aligned to one source,
// then ignore all but first alignment
if (tempPerm[sourcePos] == -1 || tempPerm[sourcePos] > targetPos) {
tempPerm[sourcePos] = targetPos;
}
}
//TODO
//Set final permutation in m_array
//Take care of: source - null
// multiple_source - one target
// unaligned target
// Input: 1-9 2-1 4-3 4-4 5-6 6-6 7-6 8-8
// Convert source: 1 2 3 4 5 6 7 8
// target: 9 1 -1 3 6 6 6 8 -> 8 1 2 3 4 5 6 7
//TODO
//Set final permutation in m_array
//Take care of: source - null
// multiple_source - one target
// unaligned target
// Input: 1-9 2-1 4-3 4-4 5-6 6-6 7-6 8-8
// Convert source: 1 2 3 4 5 6 7 8
// target: 9 1 -1 3 6 6 6 8 -> 8 1 2 3 4 5 6 7
// 1st step: Add null aligned source to previous alignment
// target: 9 1 -1 3 6 6 6 8 -> 9 1 1 3 6 6 6 8
int last=0;
m_array.assign(sourceLength, -1);
//get a searcheable index
multimap<int, int> invMap;
multimap<int, int>::iterator it;
//cout << " SourceP -> TargetP " << endl;
for (size_t i=0; i<tempPerm.size(); i++)
{
if (tempPerm[i] == -1) {
tempPerm[i] = last;
} else {
last = tempPerm[i];
}
//cout << i << " -> " << tempPerm[i] << endl;
//Key is target pos, value is source pos
invMap.insert(pair<int,int>(tempPerm[i],int(i)));
// 1st step: Add null aligned source to previous alignment
// target: 9 1 -1 3 6 6 6 8 -> 9 1 1 3 6 6 6 8
int last=0;
m_array.assign(sourceLength, -1);
//get a searcheable index
multimap<int, int> invMap;
multimap<int, int>::iterator it;
//cout << " SourceP -> TargetP " << endl;
for (size_t i=0; i<tempPerm.size(); i++) {
if (tempPerm[i] == -1) {
tempPerm[i] = last;
} else {
last = tempPerm[i];
}
//cout << i << " -> " << tempPerm[i] << endl;
//Key is target pos, value is source pos
invMap.insert(pair<int,int>(tempPerm[i],int(i)));
}
// 2nd step: Get target into index of multimap and sort
// Convert source: 1 2 3 4 5 6 7 8
// target: 9 1 0 3 6 6 6 8 -> 0 1 3 6 6 6 8 9
// source: 3 2 4 5 6 7 8 1
int i=0;
//cout << " TargetP => SourceP : TargetIndex " << endl;
for ( it=invMap.begin() ; it != invMap.end(); it++ )
{
//cout << (*it).first << " => " << (*it).second << " : " << i << endl;
//find source position
m_array[(*it).second] = i;
i++;
}
// 2nd step: Get target into index of multimap and sort
// Convert source: 1 2 3 4 5 6 7 8
// target: 9 1 0 3 6 6 6 8 -> 0 1 3 6 6 6 8 9
// source: 3 2 4 5 6 7 8 1
int i=0;
//cout << " TargetP => SourceP : TargetIndex " << endl;
for ( it=invMap.begin() ; it != invMap.end(); it++ ) {
//cout << (*it).first << " => " << (*it).second << " : " << i << endl;
//find source position
m_array[(*it).second] = i;
i++;
}
bool ok = checkValidPermutation(m_array);
//dump();
if (!ok) {
throw runtime_error(" Created invalid permutation");
}
bool ok = checkValidPermutation(m_array);
//dump();
if (!ok) {
throw runtime_error(" Created invalid permutation");
}
}
//Static
/**
 * Build the inverse of a permutation.
 *
 * For every position i of inVector, the result satisfies
 * result[inVector[i]] == i.
 *
 * @param inVector permutation whose values must lie in [0, inVector.size()).
 * @return the inverse mapping, same length as inVector.
 * @throws runtime_error if a value lies outside [0, inVector.size()); such a
 *         value would otherwise be used as an unchecked index.
 */
vector<int> Permutation::invert(const vector<int> & inVector)
{
  vector<int> outVector(inVector.size());
  for (size_t i=0; i<inVector.size(); i++) {
    const int value = inVector[i];
    if (value < 0 || size_t(value) >= inVector.size()) {
      throw runtime_error("Cannot invert: permutation value out of range");
    }
    outVector[value] = int(i);
  }
  return outVector;
}
//Static
//Permutations start at 0
/**
 * Check that inVector is a permutation of 0 .. inVector.size()-1.
 *
 * @param inVector candidate permutation (values start at 0).
 * @return true when every value is in range and occurs exactly once;
 *         false otherwise, after writing a diagnostic to cerr.
 */
bool Permutation::checkValidPermutation(vector<int> const & inVector)
{
  // test[v] stays -1 until value v has been seen once.
  vector<int> test(inVector.size(),-1);
  for (size_t i=0; i< inVector.size(); i++) {
    const int value = inVector[i];
    // A value outside [0, size) cannot be used as an index into test. With
    // size() entries it also means some in-range value is missing (a hole).
    if (value < 0 || size_t(value) >= inVector.size()) {
      cerr << "Permutation error: missing values\n" << endl;
      return false;
    }
    //No multiple entries of same value allowed
    if (test[value] > -1) {
      cerr << "Permutation error: multiple entries of same value\n" << endl;
      return false;
    }
    test[value] ++;
  }
  // size() in-range values with no repeats cover every slot, so no separate
  // scan for holes is needed.
  return true;
}
//TODO default to HAMMING
//Note: it returns the distance that is not normalised
float Permutation::distance(const Permutation &permCompare, const distanceMetric_t &type) const
float Permutation::distance(const Permutation &permCompare, const distanceMetric_t &type) const
{
float score=0;
//cout << "*****Permutation::distance" <<endl;
//cout << "Ref:" << endl;
//dump();
//cout << "Comp:" << endl;
//permCompare.dump();
float score=0;
if (type == HAMMING_DISTANCE) {
score = calculateHamming(permCompare);
} else if (type == KENDALL_DISTANCE) {
score = calculateKendall(permCompare);
} else {
throw runtime_error("Distance type not valid");
}
float brevityPenalty = 1.0 - (float) permCompare.getTargetLength()/getTargetLength() ;//reflength divided by trans length
if (brevityPenalty < 0.0) {
score = score * exp(brevityPenalty);
}
//cout << "*****Permutation::distance" <<endl;
//cout << "Ref:" << endl;
//dump();
//cout << "Comp:" << endl;
//permCompare.dump();
//cout << "Distance type:" << type << endl;
//cout << "Score: "<< score << endl;
return score;
if (type == HAMMING_DISTANCE) {
score = calculateHamming(permCompare);
} else if (type == KENDALL_DISTANCE) {
score = calculateKendall(permCompare);
} else {
throw runtime_error("Distance type not valid");
}
float brevityPenalty = 1.0 - (float) permCompare.getTargetLength()/getTargetLength() ;//reflength divided by trans length
if (brevityPenalty < 0.0) {
score = score * exp(brevityPenalty);
}
//cout << "Distance type:" << type << endl;
//cout << "Score: "<< score << endl;
return score;
}
float Permutation::calculateHamming(const Permutation & compare) const
{
float score=0;
vector<int> compareArray = compare.getArray();
if (getLength() != compare.getLength()) {
cerr << "1stperm: " << getLength() << " 2ndperm: " << compare.getLength() << endl;
throw runtime_error("Length of permutations not equal");
float score=0;
vector<int> compareArray = compare.getArray();
if (getLength() != compare.getLength()) {
cerr << "1stperm: " << getLength() << " 2ndperm: " << compare.getLength() << endl;
throw runtime_error("Length of permutations not equal");
}
if (getLength() == 0) {
cerr << "Empty permutation" << endl;
return 0;
}
for (size_t i=0; i<getLength(); i++) {
if (m_array[i] != compareArray[i]) {
score++;
}
if (getLength() == 0) {
cerr << "Empty permutation" << endl;
return 0;
}
for (size_t i=0; i<getLength(); i++)
{
if (m_array[i] != compareArray[i])
{
score++;
}
}
score = 1 - (score / getLength());
return score;
}
score = 1 - (score / getLength());
return score;
}
float Permutation::calculateKendall(const Permutation & compare) const
{
float score=0;
vector<int> compareArray = compare.getArray();
if (getLength() != compare.getLength()) {
cerr << "1stperm: " << getLength() << " 2ndperm: " << compare.getLength() << endl;
throw runtime_error("Length of permutations not equal");
float score=0;
vector<int> compareArray = compare.getArray();
if (getLength() != compare.getLength()) {
cerr << "1stperm: " << getLength() << " 2ndperm: " << compare.getLength() << endl;
throw runtime_error("Length of permutations not equal");
}
if (getLength() == 0) {
cerr << "Empty permutation" << endl;
return 0;
}
for (size_t i=0; i<getLength(); i++) {
for (size_t j=0; j<getLength(); j++) {
if ((m_array[i] < m_array[j]) && (compareArray[i] > compareArray[j])) {
score++;
}
}
if (getLength() == 0) {
cerr << "Empty permutation" << endl;
return 0;
}
for (size_t i=0; i<getLength(); i++)
{
for (size_t j=0; j<getLength(); j++)
{
if ((m_array[i] < m_array[j]) && (compareArray[i] > compareArray[j]))
{
score++;
}
}
}
score = (score / ((getLength()*getLength() - getLength()) /2 ) );
//Adjusted Kendall's tau correlates better with human judgements
score = sqrt (score);
score = 1 - score;
}
score = (score / ((getLength()*getLength() - getLength()) /2 ) );
//Adjusted Kendall's tau correlates better with human judgements
score = sqrt (score);
score = 1 - score;
return score;
return score;
}
/**
 * @return a copy of the internal permutation array (m_array).
 */
vector<int> Permutation::getArray() const
{
  return m_array;
}
//Static
/**
 * Convert a Moses phrase-alignment string into word-pair format.
 *
 * The input is the 5th field in moses nbest output when called with
 * -include-alignment-in-n-best, e.g.
 *   0=0 1-2=1-2 3=3 4=4 5=5 6=6 7-9=7-8 10=9 11-13=10-11 (source=target)
 * Each token is "source=target", where either side is a single position or
 * a "first-last" range. Every source position in the range is paired with
 * every target position in the range, and the pairs are written as
 * "s-t " (note the trailing space after each pair).
 *
 * Tokens are pulled off with getNextPound(working, align, " "); empty tokens
 * are skipped. NOTE(review): getNextPound is defined elsewhere; it is
 * presumed to consume the text up to the delimiter from its first argument.
 *
 * @param alignment Moses alignment string; an empty string is reported on
 *                  cerr and yields an empty result.
 * @return the converted alignment string.
 *
 * A token without '=' is fatal: the process exits with status 1 (the same
 * status Permutation::set uses for malformed alignments).
 */
string Permutation::convertMosesToStandard(string const & alignment)
{
  if (alignment.length() == 0) {
    cerr << "Alignment input string empty" << endl;
  }
  string working = alignment;
  stringstream oss;
  while (working.length() > 0) {
    string align;
    getNextPound(working,align," ");

    //If found an alignment
    if (align.length() > 0) {
      size_t posDelimeter = align.find("=");
      if(posDelimeter== string::npos) {
        cerr << "Delimiter not found = :"<< align << endl;
        exit(1);
      }
      int firstSourcePos,lastSourcePos,firstTargetPos,lastTargetPos;
      string sourcePoss = align.substr(0, posDelimeter);
      string targetPoss = align.substr(posDelimeter+1);

      // Source side: "first-last" range or a single position.
      posDelimeter = sourcePoss.find("-");
      if(posDelimeter < string::npos) {
        firstSourcePos = atoi((sourcePoss.substr(0, posDelimeter)).c_str());
        lastSourcePos = atoi((sourcePoss.substr(posDelimeter+1)).c_str());
      } else {
        firstSourcePos = atoi(sourcePoss.c_str());
        lastSourcePos = firstSourcePos;
      }

      // Target side: "first-last" range or a single position.
      posDelimeter = targetPoss.find("-");
      if(posDelimeter < string::npos) {
        firstTargetPos = atoi((targetPoss.substr(0, posDelimeter)).c_str());
        lastTargetPos = atoi((targetPoss.substr(posDelimeter+1)).c_str());
      } else {
        firstTargetPos = atoi(targetPoss.c_str());
        lastTargetPos = firstTargetPos;
      }

      // Emit the full cross product of the two ranges.
      for (int i = firstSourcePos; i <= lastSourcePos; i++) {
        for (int j = firstTargetPos; j <= lastTargetPos; j++) {
          oss << i << "-" << j << " ";
        }
      }

    } //else case where two spaces ?
  }
  //cout << "ConverttoStandard: " << oss.str() << endl;
  return oss.str();
}

View File

@ -20,41 +20,45 @@
/**
 * A permutation of source positions derived from a word alignment, together
 * with the length of the target sentence it was built for.
 */
class Permutation
{

public:
  /** Construct from an alignment string and the source/target lengths. */
  Permutation(const std::string &alignment = std::string(), const int sourceLength = 0, const int targetLength = 0 );
  ~Permutation() {}

  /** Drop all entries of the permutation array. */
  inline void clear() {
    m_array.clear();
  }
  /** @return number of entries in the permutation array. */
  inline size_t size() const {
    return m_array.size();
  }

  /** (Re)build the permutation from an alignment string. */
  void set(const std::string &alignment,const int sourceLength);

  /** Score against permCompare; strategy can be HAMMING_DISTANCE or KENDALL_DISTANCE. */
  float distance(const Permutation &permCompare, const distanceMetric_t &strategy = HAMMING_DISTANCE) const;

  //Const
  void dump() const;
  size_t getLength() const;
  /** @return a copy of the permutation array. */
  vector<int> getArray() const;
  /** @return the target sentence length stored in m_targetLength. */
  int getTargetLength() const {
    return m_targetLength;
  }

  //Static
  static std::string convertMosesToStandard(std::string const & alignment);
  static vector<int> invert(vector<int> const & inVector);
  static bool checkValidPermutation(vector<int> const & inVector);

protected:
  vector<int> m_array;   // the permutation itself
  int m_targetLength;    // length of the associated target sentence
  float calculateHamming(const Permutation & compare) const;
  float calculateKendall(const Permutation & compare) const;

};

View File

@ -4,215 +4,212 @@ using namespace std;
const int PermutationScorer::SCORE_PRECISION = 5;
/**
 * Configure the scorer from its config string.
 *
 * Config keys read through getConfig():
 *   refchoice - "average" or "closest" (default "closest"): how scores
 *               against several references are combined.
 *   refalign  - '+'-separated list of reference alignment file names.
 *   source    - source text file; its per-line word counts are stored in
 *               m_sourceLengths.
 *
 * @param distanceMetric "HAMMING" or "KENDALL".
 * @param config scorer configuration string, passed to SentenceLevelScorer.
 * @throws runtime_error for an unknown refchoice value, an unknown distance
 *         metric, or an unreadable source file.
 */
PermutationScorer::PermutationScorer(const string &distanceMetric, const string &config)
  :SentenceLevelScorer(distanceMetric,config)
{
  //configure regularisation
  static string KEY_REFCHOICE = "refchoice";
  static string REFCHOICE_AVERAGE = "average";
  static string REFCHOICE_CLOSEST = "closest";

  string refchoice = getConfig(KEY_REFCHOICE,REFCHOICE_CLOSEST);
  if (refchoice == REFCHOICE_AVERAGE) {
    m_refChoiceStrategy = REFERENCE_CHOICE_AVERAGE;
  } else if (refchoice == REFCHOICE_CLOSEST) {
    m_refChoiceStrategy = REFERENCE_CHOICE_CLOSEST;
  } else {
    throw runtime_error("Unknown reference choice strategy: " + refchoice);
  }
  cerr << "Using reference choice strategy: " << refchoice << endl;

  if (distanceMetric.compare("HAMMING") == 0) {
    m_distanceMetric = HAMMING_DISTANCE;
  } else if (distanceMetric.compare("KENDALL") == 0) {
    m_distanceMetric = KENDALL_DISTANCE;
  } else {
    // Without this branch m_distanceMetric would be left uninitialised.
    throw runtime_error("Unknown permutation distance metric: " + distanceMetric);
  }
  cerr << "Using permutation distance metric: " << distanceMetric << endl;

  //Get reference alignments from scconfig refalign option
  static string KEY_ALIGNMENT_FILES = "refalign";
  string refalign = getConfig(KEY_ALIGNMENT_FILES,"");
  //cout << refalign << endl;
  if (refalign.length() > 0) {
    string substring;
    while (!refalign.empty()) {
      getNextPound(refalign, substring, "+");
      m_referenceAlignments.push_back(substring);
    }
  }

  //Get length of source sentences read in from scconfig source option
  // this is essential for extractor but unnecessary for mert executable
  static string KEY_SOURCE_FILE = "source";
  string sourceFile = getConfig(KEY_SOURCE_FILE,"");
  if (sourceFile.length() > 0) {
    cerr << "Loading source sentence lengths from " << sourceFile << endl;
    ifstream sourcein(sourceFile.c_str());
    if (!sourcein) {
      throw runtime_error("Unable to open: " + sourceFile);
    }
    string line;
    while (getline(sourcein,line)) {
      // One getNextPound call per token; the loop ends once line is consumed.
      size_t wordNumber = 0;
      string word;
      while(!line.empty()) {
        getNextPound(line, word, " ");
        wordNumber++;
      }
      m_sourceLengths.push_back(wordNumber);
    }
    sourcein.close();
  }
}
void PermutationScorer::setReferenceFiles(const vector<string>& referenceFiles) {
cout << "*******setReferenceFiles" << endl;
//make sure reference data is clear
m_referencePerms.clear();
void PermutationScorer::setReferenceFiles(const vector<string>& referenceFiles)
{
cout << "*******setReferenceFiles" << endl;
//make sure reference data is clear
m_referencePerms.clear();
vector< vector< int> > targetLengths;
//Just getting target length from reference text file
for (size_t i = 0; i < referenceFiles.size(); ++i)
{
vector <int> lengths;
cout << "Loading reference from " << referenceFiles[i] << endl;
ifstream refin(referenceFiles[i].c_str());
if (!refin)
{
cerr << "Unable to open: " << referenceFiles[i] << endl;
throw runtime_error("Unable to open alignment file");
}
string line;
while (getline(refin,line))
{
int count = getNumberWords(line);
lengths.push_back(count);
}
targetLengths.push_back(lengths);
vector< vector< int> > targetLengths;
//Just getting target length from reference text file
for (size_t i = 0; i < referenceFiles.size(); ++i) {
vector <int> lengths;
cout << "Loading reference from " << referenceFiles[i] << endl;
ifstream refin(referenceFiles[i].c_str());
if (!refin) {
cerr << "Unable to open: " << referenceFiles[i] << endl;
throw runtime_error("Unable to open alignment file");
}
string line;
while (getline(refin,line)) {
int count = getNumberWords(line);
lengths.push_back(count);
}
targetLengths.push_back(lengths);
}
//load reference data
//NOTE ignoring normal reference file, only using previously saved alignment reference files
for (size_t i = 0; i < m_referenceAlignments.size(); ++i)
{
vector<Permutation> referencePerms;
cout << "Loading reference from " << m_referenceAlignments[i] << endl;
ifstream refin(m_referenceAlignments[i].c_str());
if (!refin)
{
cerr << "Unable to open: " << m_referenceAlignments[i] << endl;
throw runtime_error("Unable to open alignment file");
}
string line;
size_t sid = 0; //sentence counter
while (getline(refin,line))
{
//cout << line << endl;
//Line needs to be of the format: 0-0 1-1 1-2 etc source-target
Permutation perm(line, m_sourceLengths[sid],targetLengths[i][sid]);
//perm.dump();
referencePerms.push_back(perm);
//check the source sentence length is the same for previous file
if (perm.getLength() != m_sourceLengths[sid])
{
cerr << "Permutation Length: " << perm.getLength() << endl;
cerr << "Source length: " << m_sourceLengths[sid] << " for sid " << sid << endl;
throw runtime_error("Source sentence lengths not the same: ");
}
sid++;
}
m_referencePerms.push_back(referencePerms);
//load reference data
//NOTE ignoring normal reference file, only using previously saved alignment reference files
for (size_t i = 0; i < m_referenceAlignments.size(); ++i) {
vector<Permutation> referencePerms;
cout << "Loading reference from " << m_referenceAlignments[i] << endl;
ifstream refin(m_referenceAlignments[i].c_str());
if (!refin) {
cerr << "Unable to open: " << m_referenceAlignments[i] << endl;
throw runtime_error("Unable to open alignment file");
}
string line;
size_t sid = 0; //sentence counter
while (getline(refin,line)) {
//cout << line << endl;
//Line needs to be of the format: 0-0 1-1 1-2 etc source-target
Permutation perm(line, m_sourceLengths[sid],targetLengths[i][sid]);
//perm.dump();
referencePerms.push_back(perm);
//check the source sentence length is the same for previous file
if (perm.getLength() != m_sourceLengths[sid]) {
cerr << "Permutation Length: " << perm.getLength() << endl;
cerr << "Source length: " << m_sourceLengths[sid] << " for sid " << sid << endl;
throw runtime_error("Source sentence lengths not the same: ");
}
sid++;
}
m_referencePerms.push_back(referencePerms);
}
}
/**
 * Count the words of a line as (number of single spaces + 1) after the line
 * has been passed through trimStr().
 *
 * @param text the line to count.
 * @return 0 when the trimmed line is empty, otherwise spaces + 1. Runs of
 *         consecutive spaces are counted space by space.
 */
int PermutationScorer::getNumberWords (const string& text) const
{
  int count = 0;
  string line = trimStr(text);
  if (line.length()>0) {
    // Keep the position as size_t so it compares cleanly with string::npos.
    size_t pos = line.find(" ");
    while (pos != string::npos) {
      count++;
      pos = line.find(" ",pos+1);
    }
    count++;
  }
  return count;
}
void PermutationScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry) {
//cout << "*******prepareStats" ;
//cout << text << endl;
//cout << sid << endl;
//cout << "Reference0align:" << endl;
//m_referencePerms[0][sid].dump();
void PermutationScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
{
//cout << "*******prepareStats" ;
//cout << text << endl;
//cout << sid << endl;
//cout << "Reference0align:" << endl;
//m_referencePerms[0][sid].dump();
string sentence = "";
string align = text;
size_t alignmentData = text.find("|||");
//Get sentence and alignment parts
if(alignmentData != string::npos) {
getNextPound(align,sentence, "|||");
} else {
align = text;
string sentence = "";
string align = text;
size_t alignmentData = text.find("|||");
//Get sentence and alignment parts
if(alignmentData != string::npos) {
getNextPound(align,sentence, "|||");
} else {
align = text;
}
int translationLength = getNumberWords(sentence);
//A vector of Permutations for each sentence
vector< vector<Permutation> > nBestPerms;
float distanceValue;
//need to create permutations for each nbest line
string standardFormat = Permutation::convertMosesToStandard(align);
Permutation perm(standardFormat, m_sourceLengths[sid],translationLength);
//perm.dump();
if (m_refChoiceStrategy == REFERENCE_CHOICE_AVERAGE) {
float total = 0;
for (size_t i = 0; i < m_referencePerms.size(); ++i) {
float dist = perm.distance(m_referencePerms[i][sid], m_distanceMetric);
total += dist;
//cout << "Ref number: " << i << " distance: " << dist << endl;
}
int translationLength = getNumberWords(sentence);
float mean = (float)total/m_referencePerms.size();
//cout << "MultRef strategy AVERAGE: total " << total << " mean " << mean << " number " << m_referencePerms.size() << endl;
distanceValue = mean;
} else if (m_refChoiceStrategy == REFERENCE_CHOICE_CLOSEST) {
float max_val = 0;
//A vector of Permutations for each sentence
vector< vector<Permutation> > nBestPerms;
float distanceValue;
//need to create permutations for each nbest line
string standardFormat = Permutation::convertMosesToStandard(align);
Permutation perm(standardFormat, m_sourceLengths[sid],translationLength);
//perm.dump();
if (m_refChoiceStrategy == REFERENCE_CHOICE_AVERAGE) {
float total = 0;
for (size_t i = 0; i < m_referencePerms.size(); ++i) {
float dist = perm.distance(m_referencePerms[i][sid], m_distanceMetric);
total += dist;
//cout << "Ref number: " << i << " distance: " << dist << endl;
}
float mean = (float)total/m_referencePerms.size();
//cout << "MultRef strategy AVERAGE: total " << total << " mean " << mean << " number " << m_referencePerms.size() << endl;
distanceValue = mean;
} else if (m_refChoiceStrategy == REFERENCE_CHOICE_CLOSEST) {
float max_val = 0;
for (size_t i = 0; i < m_referencePerms.size(); ++i) {
//look for the closest reference
float value = perm.distance(m_referencePerms[i][sid], m_distanceMetric);
//cout << "Ref number: " << i << " distance: " << value << endl;
if (value > max_val) {
max_val = value;
}
}
distanceValue = max_val;
//cout << "MultRef strategy CLOSEST: max_val " << distanceValue << endl;
} else {
throw runtime_error("Unsupported reflength strategy");
for (size_t i = 0; i < m_referencePerms.size(); ++i) {
//look for the closest reference
float value = perm.distance(m_referencePerms[i][sid], m_distanceMetric);
//cout << "Ref number: " << i << " distance: " << value << endl;
if (value > max_val) {
max_val = value;
}
}
distanceValue = max_val;
//cout << "MultRef strategy CLOSEST: max_val " << distanceValue << endl;
} else {
throw runtime_error("Unsupported reflength strategy");
}
//SCOREROUT eg: 0.04546
ostringstream tempStream;
tempStream.precision(SCORE_PRECISION);
tempStream << distanceValue;
string str = tempStream.str();
entry.set(str);
//cout << tempStream.str();
//SCOREROUT eg: 0.04546
ostringstream tempStream;
tempStream.precision(SCORE_PRECISION);
tempStream << distanceValue;
string str = tempStream.str();
entry.set(str);
//cout << tempStream.str();
}
//Will just be final score
/**
 * Return the final score, which is simply the first (and, since
 * NumberOfScores() is 1, only) component.
 *
 * @param comps score components; must not be empty.
 * @throws runtime_error if comps is empty.
 */
statscore_t PermutationScorer::calculateScore(const vector<statscore_t>& comps)
{
  //cerr << "*******PermutationScorer::calculateScore" ;
  if (comps.empty()) {
    throw runtime_error("PermutationScorer::calculateScore: no score components");
  }
  //cerr << " " << comps[0] << endl;
  return comps[0];
}

View File

@ -17,44 +17,44 @@
#include "Permutation.h"
/**
* Permutation
* Permutation
**/
class PermutationScorer: public SentenceLevelScorer
{

public:
  /** @param distanceMetric "HAMMING" or "KENDALL"; @param config scorer config string. */
  PermutationScorer(const string &distanceMetric = "HAMMING",
                    const string &config = string());
  /** Load target lengths and reference permutations. */
  void setReferenceFiles(const vector<string>& referenceFiles);
  /** Score one n-best entry for sentence sid and store it in entry. */
  void prepareStats(size_t sid, const string& text, ScoreStats& entry);
  /** Number of significant digits used when a score is written out. */
  static const int SCORE_PRECISION;

  /** This scorer produces a single score per sentence. */
  size_t NumberOfScores() const {
    //cerr << "PermutationScorer number of scores: 1" << endl;
    return 1;
  };
  /** This scorer needs alignment information. */
  bool useAlignment() const {
    //cout << "PermutationScorer::useAlignment returning true" << endl;
    return true;
  };


protected:
  statscore_t calculateScore(const vector<statscore_t>& scores);
  // Copy constructor and assignment are declared here but not defined in
  // this class body.
  PermutationScorer(const PermutationScorer&);
  ~PermutationScorer() {};
  PermutationScorer& operator=(const PermutationScorer&);
  int getNumberWords (const string & line) const;

  distanceMetricReferenceChoice_t m_refChoiceStrategy;
  distanceMetric_t m_distanceMetric;

  // data extracted from reference files
  // A vector of permutations for each reference file
  vector< vector<Permutation> > m_referencePerms;
  vector<size_t> m_sourceLengths;
  vector<string> m_referenceAlignments;

private:
};
//TODO need to read in floats for scores - necessary for selecting mean reference strategy and for BLEU?

View File

@ -10,22 +10,24 @@ vector<unsigned> Point::optindices;
unsigned Point::dim=0;
map<unsigned,statscore_t> Point::fixedweights;
unsigned Point::pdim=0;
unsigned Point::ncall=0;
void Point::Randomize(const vector<parameter_t>& min,const vector<parameter_t>& max){
void Point::Randomize(const vector<parameter_t>& min,const vector<parameter_t>& max)
{
assert(min.size()==Point::dim);
assert(max.size()==Point::dim);
for (unsigned int i=0; i<size(); i++)
operator[](i)= min[i] + (float)random()/(float)RAND_MAX * (float)(max[i]-min[i]);
}
void Point::NormalizeL2(){
void Point::NormalizeL2()
{
parameter_t norm=0.0;
for (unsigned int i=0; i<size(); i++)
norm+= operator[](i)*operator[](i);
if(norm!=0.0){
if(norm!=0.0) {
norm=sqrt(norm);
for (unsigned int i=0; i<size(); i++)
operator[](i)/=norm;
@ -33,22 +35,24 @@ void Point::NormalizeL2(){
}
void Point::NormalizeL1(){
void Point::NormalizeL1()
{
parameter_t norm=0.0;
for (unsigned int i=0; i<size(); i++)
norm+= abs(operator[](i));
if(norm!=0.0){
for (unsigned int i=0; i<size(); i++)
operator[](i)/=norm;
}
if(norm!=0.0) {
for (unsigned int i=0; i<size(); i++)
operator[](i)/=norm;
}
}
//Can initialize from a vector of dim or pdim
Point::Point(const vector<parameter_t>& init):vector<parameter_t>(Point::dim){
if(init.size()==dim){
Point::Point(const vector<parameter_t>& init):vector<parameter_t>(Point::dim)
{
if(init.size()==dim) {
for (unsigned int i=0; i<Point::dim; i++)
operator[](i)=init[i];
}else{
} else {
assert(init.size()==pdim);
for (unsigned int i=0; i<Point::dim; i++)
operator[](i)=init[optindices[i]];
@ -56,59 +60,64 @@ Point::Point(const vector<parameter_t>& init):vector<parameter_t>(Point::dim){
};
/**
 * Dot product of the full weight vector with the feature values in F.
 *
 * When all weights are optimised (OptimizeAll()), coordinate i pairs with
 * F.get(i). Otherwise coordinate i pairs with F.get(optindices[i]) and the
 * entries of fixedweights are added on top, each paired with F.get(key).
 * Every call increments the static counter ncall.
 *
 * @param F feature statistics to score.
 * @return the weighted sum.
 */
double Point::operator*(const FeatureStats& F)const
{
  ncall++;//to track performance
  double prod=0.0;
  if(OptimizeAll())
    for (unsigned i=0; i<size(); i++)
      prod+= operator[](i)*F.get(i);
  else {
    for (unsigned i=0; i<size(); i++)
      prod+= operator[](i)*F.get(optindices[i]);
    for(map<unsigned,float >::iterator it=fixedweights.begin(); it!=fixedweights.end(); it++)
      prod+=it->second*F.get(it->first);
  }
  return prod;
}
/**
 * Coordinate-wise sum of two points of equal size.
 *
 * @param p2 point to add; its size must equal this point's size.
 * @return the sum, with its score reset to numeric_limits<statscore_t>::max().
 */
Point Point::operator+(const Point& p2)const
{
  assert(p2.size()==size());
  Point Res(*this);
  for(unsigned i=0; i<size(); i++)
    Res[i]+=p2[i];
  // The score copied from *this no longer describes the new point.
  Res.score=numeric_limits<statscore_t>::max();
  return Res;
}
/**
 * Scale every coordinate by l.
 *
 * @param l scaling factor.
 * @return the scaled point, with its score reset to
 *         numeric_limits<statscore_t>::max().
 */
Point Point::operator*(float l)const
{
  Point Res(*this);
  for(unsigned i=0; i<size(); i++)
    Res[i]*=l;
  // The score copied from *this no longer describes the new point.
  Res.score=numeric_limits<statscore_t>::max();
  return Res;
}
ostream& operator<<(ostream& o,const Point& P){
vector<parameter_t> w=P.GetAllWeights();
ostream& operator<<(ostream& o,const Point& P)
{
vector<parameter_t> w=P.GetAllWeights();
// o << "[" << Point::pdim << "] ";
for(unsigned int i=0;i<Point::pdim;i++)
o << w[i] << " ";
for(unsigned int i=0; i<Point::pdim; i++)
o << w[i] << " ";
// o << "=> " << P.GetScore();
return o;
return o;
};
vector<parameter_t> Point::GetAllWeights()const{
vector<parameter_t> Point::GetAllWeights()const
{
vector<parameter_t> w;
if(OptimizeAll()){
if(OptimizeAll()) {
w=*this;
}else{
} else {
w.resize(pdim);
for (unsigned int i=0; i<size(); i++)
w[optindices[i]]=operator[](i);
for(map<unsigned,float >::iterator it=fixedweights.begin();it!=fixedweights.end();it++)
w[it->first]=it->second;
for(map<unsigned,float >::iterator it=fixedweights.begin(); it!=fixedweights.end(); it++)
w[it->first]=it->second;
}
return w;
};

View File

@ -10,9 +10,10 @@ class Optimizer;
/**class that handles the subset of the feature weights on which we run the optimization*/
class Point:public vector<parameter_t>{
class Point:public vector<parameter_t>
{
friend class Optimizer;
private:
private:
/**The indices over which we optimize*/
static vector<unsigned int> optindices;
/**dimension of optindices and of the parent vector*/
@ -22,12 +23,18 @@ class Point:public vector<parameter_t>{
/**total size of the parameter space; we have pdim=fixedweights.size()+optindices.size()*/
static unsigned int pdim;
static unsigned int ncall;
public:
static unsigned int getdim(){return dim;}
static unsigned int getpdim(){return pdim;}
static bool OptimizeAll(){return fixedweights.empty();};
public:
static unsigned int getdim() {
return dim;
}
static unsigned int getpdim() {
return pdim;
}
static bool OptimizeAll() {
return fixedweights.empty();
};
statscore_t score;
Point():vector<parameter_t>(dim){};
Point():vector<parameter_t>(dim) {};
Point(const vector<parameter_t>& init);
void Randomize(const vector<parameter_t>& min,const vector<parameter_t>& max);
@ -36,12 +43,16 @@ class Point:public vector<parameter_t>{
Point operator*(float)const;
/**write the Whole featureweight to a stream (ie pdim float)*/
friend ostream& operator<<(ostream& o,const Point& P);
void Normalize(){ NormalizeL2(); };
void Normalize() {
NormalizeL2();
};
void NormalizeL2();
void NormalizeL1();
/**return a vector of size pdim where all weights have been put(including fixed ones)*/
vector<parameter_t> GetAllWeights()const;
statscore_t GetScore()const { return score; };
statscore_t GetScore()const {
return score;
};
};
#endif

View File

@ -15,134 +15,134 @@ ScoreArray::ScoreArray(): idx("")
void ScoreArray::savetxt(std::ofstream& outFile, const std::string& sctype)
{
outFile << SCORES_TXT_BEGIN << " " << idx << " " << array_.size()
<< " " << number_of_scores << " " << sctype << std::endl;
for (scorearray_t::iterator i = array_.begin(); i !=array_.end(); i++){
i->savetxt(outFile);
outFile << std::endl;
}
outFile << SCORES_TXT_END << std::endl;
outFile << SCORES_TXT_BEGIN << " " << idx << " " << array_.size()
<< " " << number_of_scores << " " << sctype << std::endl;
for (scorearray_t::iterator i = array_.begin(); i !=array_.end(); i++) {
i->savetxt(outFile);
outFile << std::endl;
}
outFile << SCORES_TXT_END << std::endl;
}
void ScoreArray::savebin(std::ofstream& outFile, const std::string& sctype)
{
outFile << SCORES_BIN_BEGIN << " " << idx << " " << array_.size()
<< " " << number_of_scores << " " << sctype << std::endl;
for (scorearray_t::iterator i = array_.begin(); i !=array_.end(); i++)
i->savebin(outFile);
outFile << SCORES_BIN_END << std::endl;
outFile << SCORES_BIN_BEGIN << " " << idx << " " << array_.size()
<< " " << number_of_scores << " " << sctype << std::endl;
for (scorearray_t::iterator i = array_.begin(); i !=array_.end(); i++)
i->savebin(outFile);
outFile << SCORES_BIN_END << std::endl;
}
void ScoreArray::save(std::ofstream& inFile, const std::string& sctype, bool bin)
{
if (size()>0)
(bin)?savebin(inFile, sctype):savetxt(inFile, sctype);
if (size()>0)
(bin)?savebin(inFile, sctype):savetxt(inFile, sctype);
}
void ScoreArray::save(const std::string &file, const std::string& sctype, bool bin)
{
std::ofstream outFile(file.c_str(), std::ios::out); // matches a stream with a file. Opens the file
std::ofstream outFile(file.c_str(), std::ios::out); // matches a stream with a file. Opens the file
save(outFile, sctype, bin);
save(outFile, sctype, bin);
outFile.close();
outFile.close();
}
void ScoreArray::loadbin(ifstream& inFile, size_t n)
{
ScoreStats entry(number_of_scores);
for (size_t i=0 ; i < n; i++){
entry.loadbin(inFile);
add(entry);
}
ScoreStats entry(number_of_scores);
for (size_t i=0 ; i < n; i++) {
entry.loadbin(inFile);
add(entry);
}
}
void ScoreArray::loadtxt(ifstream& inFile, size_t n)
{
ScoreStats entry(number_of_scores);
for (size_t i=0 ; i < n; i++){
entry.loadtxt(inFile);
add(entry);
}
ScoreStats entry(number_of_scores);
for (size_t i=0 ; i < n; i++) {
entry.loadtxt(inFile);
add(entry);
}
}
void ScoreArray::load(ifstream& inFile)
{
size_t number_of_entries=0;
bool binmode=false;
std::string substring, stringBuf;
bool binmode=false;
std::string substring, stringBuf;
std::string::size_type loc;
std::getline(inFile, stringBuf);
if (!inFile.good()){
return;
}
if (!stringBuf.empty()){
if ((loc = stringBuf.find(SCORES_TXT_BEGIN)) == 0){
binmode=false;
}else if ((loc = stringBuf.find(SCORES_BIN_BEGIN)) == 0){
binmode=true;
}else{
TRACE_ERR("ERROR: ScoreArray::load(): Wrong header");
return;
}
getNextPound(stringBuf, substring);
getNextPound(stringBuf, substring);
idx = substring;
getNextPound(stringBuf, substring);
std::getline(inFile, stringBuf);
if (!inFile.good()) {
return;
}
if (!stringBuf.empty()) {
if ((loc = stringBuf.find(SCORES_TXT_BEGIN)) == 0) {
binmode=false;
} else if ((loc = stringBuf.find(SCORES_BIN_BEGIN)) == 0) {
binmode=true;
} else {
TRACE_ERR("ERROR: ScoreArray::load(): Wrong header");
return;
}
getNextPound(stringBuf, substring);
getNextPound(stringBuf, substring);
idx = substring;
getNextPound(stringBuf, substring);
number_of_entries = atoi(substring.c_str());
getNextPound(stringBuf, substring);
getNextPound(stringBuf, substring);
number_of_scores = atoi(substring.c_str());
getNextPound(stringBuf, substring);
score_type = substring;
}
(binmode)?loadbin(inFile, number_of_entries):loadtxt(inFile, number_of_entries);
std::getline(inFile, stringBuf);
if (!stringBuf.empty()){
if ((loc = stringBuf.find(SCORES_TXT_END)) != 0 && (loc = stringBuf.find(SCORES_BIN_END)) != 0){
TRACE_ERR("ERROR: ScoreArray::load(): Wrong footer");
return;
}
}
getNextPound(stringBuf, substring);
score_type = substring;
}
(binmode)?loadbin(inFile, number_of_entries):loadtxt(inFile, number_of_entries);
std::getline(inFile, stringBuf);
if (!stringBuf.empty()) {
if ((loc = stringBuf.find(SCORES_TXT_END)) != 0 && (loc = stringBuf.find(SCORES_BIN_END)) != 0) {
TRACE_ERR("ERROR: ScoreArray::load(): Wrong footer");
return;
}
}
}
void ScoreArray::load(const std::string &file)
{
TRACE_ERR("loading data from " << file << std::endl);
TRACE_ERR("loading data from " << file << std::endl);
inputfilestream inFile(file); // matches a stream with a file. Opens the file
inputfilestream inFile(file); // matches a stream with a file. Opens the file
load((ifstream&) inFile);
load((ifstream&) inFile);
inFile.close();
inFile.close();
}
void ScoreArray::merge(ScoreArray& e)
{
//dummy implementation
for (size_t i=0; i<e.size(); i++)
add(e.get(i));
//dummy implementation
for (size_t i=0; i<e.size(); i++)
add(e.get(i));
}
bool ScoreArray::check_consistency()
{
size_t sz = NumberOfScores();
if (sz == 0)
return true;
for (scorearray_t::iterator i=array_.begin(); i!=array_.end(); i++)
if (i->size()!=sz)
return false;
return true;
size_t sz = NumberOfScores();
if (sz == 0)
return true;
for (scorearray_t::iterator i=array_.begin(); i!=array_.end(); i++)
if (i->size()!=sz)
return false;
return true;
}

View File

@ -27,52 +27,76 @@ using namespace std;
class ScoreArray
{
protected:
scorearray_t array_;
std::string score_type;
size_t number_of_scores;
private:
std::string idx; // idx to identify the utterance, it can differ from the index inside the vector
scorearray_t array_;
std::string score_type;
size_t number_of_scores;
private:
std::string idx; // idx to identify the utterance, it can differ from the index inside the vector
public:
ScoreArray();
~ScoreArray(){};
inline void clear() { array_.clear(); }
inline std::string getIndex(){ return idx; }
inline void setIndex(const std::string& value){ idx=value; }
ScoreArray();
~ScoreArray() {};
inline void clear() {
array_.clear();
}
inline std::string getIndex() {
return idx;
}
inline void setIndex(const std::string& value) {
idx=value;
}
// inline ScoreStats get(size_t i){ return array_.at(i); }
inline ScoreStats& get(size_t i){ return array_.at(i); }
inline const ScoreStats& get(size_t i)const{ return array_.at(i); }
void add(const ScoreStats& e){ array_.push_back(e); }
inline ScoreStats& get(size_t i) {
return array_.at(i);
}
inline const ScoreStats& get(size_t i)const {
return array_.at(i);
}
void merge(ScoreArray& e);
void add(const ScoreStats& e) {
array_.push_back(e);
}
inline std::string name() const{ return score_type; };
inline void name(std::string &sctype){ score_type = sctype; };
void merge(ScoreArray& e);
inline size_t size(){ return array_.size(); }
inline size_t NumberOfScores() const{ return number_of_scores; }
inline void NumberOfScores(size_t v){ number_of_scores = v; }
void savetxt(ofstream& outFile, const std::string& sctype);
void savebin(ofstream& outFile, const std::string& sctype);
void save(ofstream& outFile, const std::string& sctype, bool bin=false);
void save(const std::string &file, const std::string& sctype, bool bin=false);
inline void save(const std::string& sctype, bool bin=false){ save("/dev/stdout", sctype, bin); }
void loadtxt(ifstream& inFile, size_t n);
void loadbin(ifstream& inFile, size_t n);
void load(ifstream& inFile);
void load(const std::string &file);
bool check_consistency();
inline std::string name() const {
return score_type;
};
inline void name(std::string &sctype) {
score_type = sctype;
};
inline size_t size() {
return array_.size();
}
inline size_t NumberOfScores() const {
return number_of_scores;
}
inline void NumberOfScores(size_t v) {
number_of_scores = v;
}
void savetxt(ofstream& outFile, const std::string& sctype);
void savebin(ofstream& outFile, const std::string& sctype);
void save(ofstream& outFile, const std::string& sctype, bool bin=false);
void save(const std::string &file, const std::string& sctype, bool bin=false);
inline void save(const std::string& sctype, bool bin=false) {
save("/dev/stdout", sctype, bin);
}
void loadtxt(ifstream& inFile, size_t n);
void loadbin(ifstream& inFile, size_t n);
void load(ifstream& inFile);
void load(const std::string &file);
bool check_consistency();
};

View File

@ -13,138 +13,138 @@
ScoreData::ScoreData(Scorer& ptr):
theScorer(&ptr)
theScorer(&ptr)
{
score_type = theScorer->getName();
//theScorer->setScoreData(this);//this is not dangerous: we dont use the this pointer in SetScoreData
number_of_scores = theScorer->NumberOfScores();
TRACE_ERR("ScoreData: number_of_scores: " << number_of_scores << std::endl);
score_type = theScorer->getName();
//theScorer->setScoreData(this);//this is not dangerous: we dont use the this pointer in SetScoreData
number_of_scores = theScorer->NumberOfScores();
TRACE_ERR("ScoreData: number_of_scores: " << number_of_scores << std::endl);
};
void ScoreData::dump()
void ScoreData::dump()
{
for (vector<ScoreArray>::iterator it = array_.begin(); it !=array_.end(); it++){
cout << "scorearray: " << endl;
for (size_t i = 0; i < (*it).size(); i++) {
ScoreStats scoreStats = (*it).get(i);
cout << "scorestats: " ;
for (size_t j = 0; j < scoreStats.size(); j ++ ){
ScoreStatsType scoreStatsType = scoreStats.get(j);
cout << scoreStatsType << " " ;
}
cout << endl;
}
}
for (vector<ScoreArray>::iterator it = array_.begin(); it !=array_.end(); it++) {
cout << "scorearray: " << endl;
for (size_t i = 0; i < (*it).size(); i++) {
ScoreStats scoreStats = (*it).get(i);
cout << "scorestats: " ;
for (size_t j = 0; j < scoreStats.size(); j ++ ) {
ScoreStatsType scoreStatsType = scoreStats.get(j);
cout << scoreStatsType << " " ;
}
cout << endl;
}
}
}
void ScoreData::save(std::ofstream& outFile, bool bin)
{
for (scoredata_t::iterator i = array_.begin(); i !=array_.end(); i++){
i->save(outFile, score_type, bin);
}
for (scoredata_t::iterator i = array_.begin(); i !=array_.end(); i++) {
i->save(outFile, score_type, bin);
}
}
void ScoreData::save(const std::string &file, bool bin)
{
if (file.empty()) return;
TRACE_ERR("saving the array into " << file << std::endl);
if (file.empty()) return;
TRACE_ERR("saving the array into " << file << std::endl);
std::ofstream outFile(file.c_str(), std::ios::out); // matches a stream with a file. Opens the file
std::ofstream outFile(file.c_str(), std::ios::out); // matches a stream with a file. Opens the file
ScoreStats entry;
save(outFile, bin);
save(outFile, bin);
outFile.close();
outFile.close();
}
void ScoreData::load(ifstream& inFile)
{
ScoreArray entry;
while (!inFile.eof()){
if (!inFile.good()){
std::cerr << "ERROR ScoreData::load inFile.good()" << std::endl;
}
entry.clear();
entry.load(inFile);
while (!inFile.eof()) {
if (entry.size() == 0){
break;
}
add(entry);
}
theScorer->setScoreData(this);
if (!inFile.good()) {
std::cerr << "ERROR ScoreData::load inFile.good()" << std::endl;
}
entry.clear();
entry.load(inFile);
if (entry.size() == 0) {
break;
}
add(entry);
}
theScorer->setScoreData(this);
}
void ScoreData::load(const std::string &file)
{
TRACE_ERR("loading score data from " << file << std::endl);
TRACE_ERR("loading score data from " << file << std::endl);
inputfilestream inFile(file); // matches a stream with a file. Opens the file
inputfilestream inFile(file); // matches a stream with a file. Opens the file
if (!inFile) {
throw runtime_error("Unable to open score file: " + file);
}
if (!inFile) {
throw runtime_error("Unable to open score file: " + file);
}
load((ifstream&) inFile);
load((ifstream&) inFile);
inFile.close();
inFile.close();
}
void ScoreData::add(ScoreArray& e){
if (exists(e.getIndex())){ // array at position e.getIndex() already exists
//enlarge array at position e.getIndex()
size_t pos = getIndex(e.getIndex());
array_.at(pos).merge(e);
}
else{
array_.push_back(e);
setIndex();
}
/**
 * Insert a whole ScoreArray.
 * If an array with the same name (e.getIndex()) is already stored, the
 * entries of e are merged into it; otherwise e is appended and the
 * name/position lookup is refreshed through setIndex().
 */
void ScoreData::add(ScoreArray& e)
{
  // Name not seen yet: append and refresh the lookup.
  if (!exists(e.getIndex())) {
    array_.push_back(e);
    setIndex();
    return;
  }

  // Name already present: enlarge the stored array with the new entries.
  size_t slot = getIndex(e.getIndex());
  array_.at(slot).merge(e);
}
void ScoreData::add(const ScoreStats& e, const std::string& sent_idx){
if (exists(sent_idx)){ // array at position e.getIndex() already exists
//enlarge array at position e.getIndex()
size_t pos = getIndex(sent_idx);
// TRACE_ERR("Inserting in array " << sent_idx << std::endl);
array_.at(pos).add(e);
// TRACE_ERR("size: " << size() << " -> " << a.size() << std::endl);
}
else{
// TRACE_ERR("Creating a new entry in the array" << std::endl);
ScoreArray a;
a.NumberOfScores(number_of_scores);
a.add(e);
a.setIndex(sent_idx);
add(a);
// TRACE_ERR("size: " << size() << " -> " << a.size() << std::endl);
}
}
void ScoreData::add(const ScoreStats& e, const std::string& sent_idx)
{
if (exists(sent_idx)) { // array at position e.getIndex() already exists
//enlarge array at position e.getIndex()
size_t pos = getIndex(sent_idx);
// TRACE_ERR("Inserting in array " << sent_idx << std::endl);
array_.at(pos).add(e);
// TRACE_ERR("size: " << size() << " -> " << a.size() << std::endl);
} else {
// TRACE_ERR("Creating a new entry in the array" << std::endl);
ScoreArray a;
a.NumberOfScores(number_of_scores);
a.add(e);
a.setIndex(sent_idx);
add(a);
// TRACE_ERR("size: " << size() << " -> " << a.size() << std::endl);
}
}
bool ScoreData::check_consistency()
{
if (array_.size() == 0)
return true;
for (scoredata_t::iterator i = array_.begin(); i !=array_.end(); i++)
if (!i->check_consistency()) return false;
return true;
if (array_.size() == 0)
return true;
for (scoredata_t::iterator i = array_.begin(); i !=array_.end(); i++)
if (!i->check_consistency()) return false;
return true;
}
void ScoreData::setIndex()
{
size_t j=0;
for (scoredata_t::iterator i = array_.begin(); i !=array_.end(); i++){
idx2arrayname_[j]=i->getIndex();
arrayname2idx_[i->getIndex()]=j;
j++;
}
size_t j=0;
for (scoredata_t::iterator i = array_.begin(); i !=array_.end(); i++) {
idx2arrayname_[j]=i->getIndex();
arrayname2idx_[i->getIndex()]=j;
j++;
}
}

View File

@ -23,66 +23,92 @@ class Scorer;
class ScoreData
{
protected:
scoredata_t array_;
idx2name idx2arrayname_; //map from index to name of array
name2idx arrayname2idx_; //map from name to index of array
scoredata_t array_;
idx2name idx2arrayname_; //map from index to name of array
name2idx arrayname2idx_; //map from name to index of array
private:
Scorer* theScorer;
std::string score_type;
size_t number_of_scores;
Scorer* theScorer;
std::string score_type;
size_t number_of_scores;
public:
ScoreData(Scorer& sc);
~ScoreData(){};
inline void clear() { array_.clear(); }
inline ScoreArray get(const std::string& idx){ return array_.at(getIndex(idx)); }
inline ScoreArray& get(size_t idx){ return array_.at(idx); }
inline const ScoreArray& get(size_t idx) const { return array_.at(idx); }
inline bool exists(const std::string & sent_idx){ return exists(getIndex(sent_idx)); }
inline bool exists(int sent_idx){ return (sent_idx>-1 && sent_idx<(int)array_.size())?true:false; }
inline ScoreStats& get(size_t i, size_t j){ return array_.at(i).get(j); }
inline const ScoreStats& get(size_t i, size_t j) const { return array_.at(i).get(j); }
inline std::string name(){ return score_type; };
inline std::string name(std::string &sctype){ return score_type = sctype; };
ScoreData(Scorer& sc);
void add(ScoreArray& e);
void add(const ScoreStats& e, const std::string& sent_idx);
inline size_t NumberOfScores(){ return number_of_scores; }
inline size_t size(){ return array_.size(); }
void save(const std::string &file, bool bin=false);
void save(ofstream& outFile, bool bin=false);
inline void save(bool bin=false){ save("/dev/stdout", bin); }
~ScoreData() {};
void load(ifstream& inFile);
void load(const std::string &file);
bool check_consistency();
void setIndex();
inline int getIndex(const std::string& idx){
name2idx::iterator i = arrayname2idx_.find(idx);
if (i!=arrayname2idx_.end())
return i->second;
else
return -1;
inline void clear() {
array_.clear();
}
inline std::string getIndex(size_t idx){
idx2name::iterator i = idx2arrayname_.find(idx);
if (i!=idx2arrayname_.end())
throw runtime_error("there is no entry at index " + idx);
return i->second;
}
void dump();
inline ScoreArray get(const std::string& idx) {
return array_.at(getIndex(idx));
}
inline ScoreArray& get(size_t idx) {
return array_.at(idx);
}
inline const ScoreArray& get(size_t idx) const {
return array_.at(idx);
}
inline bool exists(const std::string & sent_idx) {
return exists(getIndex(sent_idx));
}
/** True when sent_idx is a valid position in array_ (0 <= sent_idx < array_.size()). */
inline bool exists(int sent_idx) {
  if (sent_idx < 0)
    return false;
  return sent_idx < (int)array_.size();
}
inline ScoreStats& get(size_t i, size_t j) {
return array_.at(i).get(j);
}
inline const ScoreStats& get(size_t i, size_t j) const {
return array_.at(i).get(j);
}
inline std::string name() {
return score_type;
};
inline std::string name(std::string &sctype) {
return score_type = sctype;
};
void add(ScoreArray& e);
void add(const ScoreStats& e, const std::string& sent_idx);
inline size_t NumberOfScores() {
return number_of_scores;
}
inline size_t size() {
return array_.size();
}
void save(const std::string &file, bool bin=false);
void save(ofstream& outFile, bool bin=false);
inline void save(bool bin=false) {
save("/dev/stdout", bin);
}
void load(ifstream& inFile);
void load(const std::string &file);
bool check_consistency();
void setIndex();
inline int getIndex(const std::string& idx) {
name2idx::iterator i = arrayname2idx_.find(idx);
if (i!=arrayname2idx_.end())
return i->second;
else
return -1;
}
/**
 * Name of the array stored at position idx.
 * Throws runtime_error when no name is registered for idx.
 */
inline std::string getIndex(size_t idx) {
  idx2name::iterator i = idx2arrayname_.find(idx);
  if (i == idx2arrayname_.end()) {
    // Spell the index out by hand: adding an integer to a string literal
    // is pointer arithmetic on the literal, not concatenation.
    std::string digits;
    size_t rest = idx;
    do {
      digits.insert(digits.begin(), static_cast<char>('0' + rest % 10));
      rest /= 10;
    } while (rest != 0);
    throw runtime_error("there is no entry at index " + digits);
  }
  return i->second;
}
void dump();
};

View File

@ -14,123 +14,124 @@
ScoreStats::ScoreStats()
{
available_ = AVAILABLE_;
entries_ = 0;
array_ = new ScoreStatsType[available_];
available_ = AVAILABLE_;
entries_ = 0;
array_ = new ScoreStatsType[available_];
};
ScoreStats::~ScoreStats()
{
delete array_;
delete array_;
};
ScoreStats::ScoreStats(const ScoreStats &stats)
ScoreStats::ScoreStats(const ScoreStats &stats)
{
available_ = stats.available();
entries_ = stats.size();
array_ = new ScoreStatsType[available_];
memcpy(array_,stats.getArray(),scorebytes_);
available_ = stats.available();
entries_ = stats.size();
array_ = new ScoreStatsType[available_];
memcpy(array_,stats.getArray(),scorebytes_);
};
ScoreStats::ScoreStats(const size_t size)
{
available_ = size;
entries_ = size;
array_ = new ScoreStatsType[available_];
memset(array_,0,scorebytes_);
available_ = size;
entries_ = size;
array_ = new ScoreStatsType[available_];
memset(array_,0,scorebytes_);
};
ScoreStats::ScoreStats(std::string &theString)
{
set(theString);
set(theString);
}
void ScoreStats::expand()
{
available_*=2;
scorestats_t t_ = new ScoreStatsType[available_];
memcpy(t_,array_,scorebytes_);
delete array_;
array_=t_;
available_*=2;
scorestats_t t_ = new ScoreStatsType[available_];
memcpy(t_,array_,scorebytes_);
delete array_;
array_=t_;
}
void ScoreStats::add(ScoreStatsType v)
{
if (isfull()) expand();
array_[entries_++]=v;
if (isfull()) expand();
array_[entries_++]=v;
}
void ScoreStats::set(std::string &theString)
{
std::string substring, stringBuf;
reset();
while (!theString.empty()){
getNextPound(theString, substring);
add(ATOSST(substring.c_str()));
}
reset();
while (!theString.empty()) {
getNextPound(theString, substring);
add(ATOSST(substring.c_str()));
}
}
void ScoreStats::loadbin(std::ifstream& inFile)
{
inFile.read((char*) array_, scorebytes_);
}
inFile.read((char*) array_, scorebytes_);
}
void ScoreStats::loadtxt(std::ifstream& inFile)
{
std::string theString;
std::getline(inFile, theString);
set(theString);
std::getline(inFile, theString);
set(theString);
}
void ScoreStats::loadtxt(const std::string &file)
{
// TRACE_ERR("loading the stats from " << file << std::endl);
// TRACE_ERR("loading the stats from " << file << std::endl);
std::ifstream inFile(file.c_str(), std::ios::in); // matches a stream with a file. Opens the file
std::ifstream inFile(file.c_str(), std::ios::in); // matches a stream with a file. Opens the file
loadtxt(inFile);
loadtxt(inFile);
}
void ScoreStats::savetxt(const std::string &file)
{
// TRACE_ERR("saving the stats into " << file << std::endl);
// TRACE_ERR("saving the stats into " << file << std::endl);
std::ofstream outFile(file.c_str(), std::ios::out); // matches a stream with a file. Opens the file
std::ofstream outFile(file.c_str(), std::ios::out); // matches a stream with a file. Opens the file
savetxt(outFile);
savetxt(outFile);
}
void ScoreStats::savetxt(std::ofstream& outFile)
{
outFile << *this;
outFile << *this;
}
void ScoreStats::savebin(std::ofstream& outFile)
{
outFile.write((char*) array_, scorebytes_);
}
outFile.write((char*) array_, scorebytes_);
}
ScoreStats& ScoreStats::operator=(const ScoreStats &stats)
{
delete array_;
available_ = stats.available();
entries_ = stats.size();
array_ = new ScoreStatsType[available_];
memcpy(array_,stats.getArray(),scorebytes_);
return *this;
delete array_;
available_ = stats.available();
entries_ = stats.size();
array_ = new ScoreStatsType[available_];
memcpy(array_,stats.getArray(),scorebytes_);
return *this;
}
/**write the whole object to a stream*/
ostream& operator<<(ostream& o, const ScoreStats& e){
for (size_t i=0; i< e.size(); i++)
o << e.get(i) << " ";
return o;
/** Stream every stored score of e, each one followed by a single blank. */
ostream& operator<<(ostream& o, const ScoreStats& e)
{
  const size_t count = e.size();
  size_t pos = 0;
  while (pos < count) {
    o << e.get(pos) << " ";
    ++pos;
  }
  return o;
}

View File

@ -26,51 +26,72 @@ using namespace std;
class ScoreStats
{
private:
scorestats_t array_;
size_t entries_;
size_t available_;
scorestats_t array_;
size_t entries_;
size_t available_;
public:
ScoreStats();
ScoreStats();
ScoreStats(const size_t size);
ScoreStats(const ScoreStats &stats);
ScoreStats(std::string &theString);
ScoreStats& operator=(const ScoreStats &stats);
~ScoreStats();
bool isfull(){return (entries_ < available_)?0:1; }
void expand();
void add(ScoreStatsType v);
inline void clear() { memset((void*) array_,0,scorebytes_); }
inline ScoreStatsType get(size_t i){ return array_[i]; }
inline ScoreStatsType get(size_t i)const{ return array_[i]; }
inline scorestats_t getArray() const { return array_; }
void set(std::string &theString);
ScoreStats(const ScoreStats &stats);
ScoreStats(std::string &theString);
ScoreStats& operator=(const ScoreStats &stats);
inline size_t bytes() const{ return scorebytes_; }
inline size_t size() const{ return entries_; }
inline size_t available() const{ return available_; }
void savetxt(const std::string &file);
void savetxt(ofstream& outFile);
void savebin(ofstream& outFile);
inline void savetxt(){ savetxt("/dev/stdout"); }
~ScoreStats();
void loadtxt(const std::string &file);
void loadtxt(ifstream& inFile);
void loadbin(ifstream& inFile);
inline void reset(){ entries_ = 0; clear(); }
/** True once the number of stored entries has reached the allocated capacity. */
bool isfull() {
  return !(entries_ < available_);
}
void expand();
void add(ScoreStatsType v);
/**write the whole object to a stream*/
friend ostream& operator<<(ostream& o, const ScoreStats& e);
inline void clear() {
memset((void*) array_,0,scorebytes_);
}
inline ScoreStatsType get(size_t i) {
return array_[i];
}
inline ScoreStatsType get(size_t i)const {
return array_[i];
}
inline scorestats_t getArray() const {
return array_;
}
void set(std::string &theString);
inline size_t bytes() const {
return scorebytes_;
}
inline size_t size() const {
return entries_;
}
inline size_t available() const {
return available_;
}
void savetxt(const std::string &file);
void savetxt(ofstream& outFile);
void savebin(ofstream& outFile);
inline void savetxt() {
savetxt("/dev/stdout");
}
void loadtxt(const std::string &file);
void loadtxt(ifstream& inFile);
void loadbin(ifstream& inFile);
/** Forget all stored entries: zero the entry counter, then call clear(). */
// NOTE(review): clear() memsets scorebytes_ bytes; scorebytes_ is not defined
// in this view. If it is derived from entries_, it is evaluated after the
// counter was zeroed here -- confirm this ordering is intended.
inline void reset() {
entries_ = 0;
clear();
}
/**write the whole object to a stream*/
friend ostream& operator<<(ostream& o, const ScoreStats& e);
};

View File

@ -1,105 +1,108 @@
#include "Scorer.h"
//regularisation strategies
static float score_min(const statscores_t& scores, size_t start, size_t end) {
float min = numeric_limits<float>::max();
for (size_t i = start; i < end; ++i) {
if (scores[i] < min) {
min = scores[i];
}
}
return min;
/**
 * Regularisation helper: smallest value of scores in the half-open range
 * [start, end). Returns numeric_limits<float>::max() when the range is empty.
 */
static float score_min(const statscores_t& scores, size_t start, size_t end)
{
  float lowest = numeric_limits<float>::max();
  size_t pos = start;
  while (pos < end) {
    if (scores[pos] < lowest)
      lowest = scores[pos];
    ++pos;
  }
  return lowest;
}
static float score_average(const statscores_t& scores, size_t start, size_t end) {
if ((end - start) < 1) {
//shouldn't happen
return 0;
}
float total = 0;
for (size_t j = start; j < end; ++j) {
total += scores[j];
}
static float score_average(const statscores_t& scores, size_t start, size_t end)
{
if ((end - start) < 1) {
//shouldn't happen
return 0;
}
float total = 0;
for (size_t j = start; j < end; ++j) {
total += scores[j];
}
return total / (end - start);
return total / (end - start);
}
void StatisticsBasedScorer::score(const candidates_t& candidates, const diffs_t& diffs,
statscores_t& scores) {
//cout << "*******StatisticsBasedScorer::score" << endl;
if (!_scoreData) {
throw runtime_error("Score data not loaded");
}
//calculate the score for the candidates
if (_scoreData->size() == 0) {
throw runtime_error("Score data is empty");
statscores_t& scores)
{
//cout << "*******StatisticsBasedScorer::score" << endl;
if (!_scoreData) {
throw runtime_error("Score data not loaded");
}
//calculate the score for the candidates
if (_scoreData->size() == 0) {
throw runtime_error("Score data is empty");
}
if (candidates.size() == 0) {
throw runtime_error("No candidates supplied");
}
int numCounts = _scoreData->get(0,candidates[0]).size();
vector<float> totals(numCounts);
for (size_t i = 0; i < candidates.size(); ++i) {
//cout << " i " << i << " candidates[i] " << candidates[i] << endl;
ScoreStats stats = _scoreData->get(i,candidates[i]);
if (stats.size() != totals.size()) {
stringstream msg;
msg << "Statistics for (" << "," << candidates[i] << ") have incorrect "
<< "number of fields. Found: " << stats.size() << " Expected: "
<< totals.size();
throw runtime_error(msg.str());
}
if (candidates.size() == 0) {
throw runtime_error("No candidates supplied");
for (size_t k = 0; k < totals.size(); ++k) {
totals[k] += stats.get(k);
}
int numCounts = _scoreData->get(0,candidates[0]).size();
vector<float> totals(numCounts);
for (size_t i = 0; i < candidates.size(); ++i) {
//cout << " i " << i << " candidates[i] " << candidates[i] << endl;
ScoreStats stats = _scoreData->get(i,candidates[i]);
if (stats.size() != totals.size()) {
stringstream msg;
msg << "Statistics for (" << "," << candidates[i] << ") have incorrect "
<< "number of fields. Found: " << stats.size() << " Expected: "
<< totals.size();
throw runtime_error(msg.str());
}
for (size_t k = 0; k < totals.size(); ++k) {
totals[k] += stats.get(k);
}
}
scores.push_back(calculateScore(totals));
}
scores.push_back(calculateScore(totals));
candidates_t last_candidates(candidates);
//apply each of the diffs, and get new scores
for (size_t i = 0; i < diffs.size(); ++i) {
for (size_t j = 0; j < diffs[i].size(); ++j) {
size_t sid = diffs[i][j].first;
size_t nid = diffs[i][j].second;
candidates_t last_candidates(candidates);
//apply each of the diffs, and get new scores
for (size_t i = 0; i < diffs.size(); ++i) {
for (size_t j = 0; j < diffs[i].size(); ++j) {
size_t sid = diffs[i][j].first;
size_t nid = diffs[i][j].second;
//cout << "STSC:sid = " << sid << endl;
//cout << "STSC:nid = " << nid << endl;
size_t last_nid = last_candidates[sid];
size_t last_nid = last_candidates[sid];
//cout << "STSC:oid = " << last_nid << endl;
for (size_t k = 0; k < totals.size(); ++k) {
float diff = _scoreData->get(sid,nid).get(k)
- _scoreData->get(sid,last_nid).get(k);
totals[k] += diff;
for (size_t k = 0; k < totals.size(); ++k) {
float diff = _scoreData->get(sid,nid).get(k)
- _scoreData->get(sid,last_nid).get(k);
totals[k] += diff;
//cout << "STSC:nid = " << _scoreData->get(sid,nid).get(k) << endl;
//cout << "STSC:oid = " << _scoreData->get(sid,last_nid).get(k) << endl;
//cout << "STSC:diff = " << diff << endl;
//cout << "STSC:totals = " << totals[k] << endl;
}
last_candidates[sid] = nid;
}
scores.push_back(calculateScore(totals));
}
last_candidates[sid] = nid;
}
scores.push_back(calculateScore(totals));
}
//regularisation. This can either be none, or the min or average as described in
//Cer, Jurafsky and Manning at WMT08
if (_regularisationStrategy == REG_NONE || _regularisationWindow <= 0) {
//no regularisation
return;
}
//regularisation. This can either be none, or the min or average as described in
//Cer, Jurafsky and Manning at WMT08
if (_regularisationStrategy == REG_NONE || _regularisationWindow <= 0) {
//no regularisation
return;
}
//window size specifies the +/- in each direction
statscores_t raw_scores(scores);//copy scores
for (size_t i = 0; i < scores.size(); ++i) {
size_t start = 0;
if (i >= _regularisationWindow) {
start = i - _regularisationWindow;
}
size_t end = min(scores.size(), i + _regularisationWindow+1);
if (_regularisationStrategy == REG_AVERAGE) {
scores[i] = score_average(raw_scores,start,end);
} else {
scores[i] = score_min(raw_scores,start,end);
}
//window size specifies the +/- in each direction
statscores_t raw_scores(scores);//copy scores
for (size_t i = 0; i < scores.size(); ++i) {
size_t start = 0;
if (i >= _regularisationWindow) {
start = i - _regularisationWindow;
}
size_t end = min(scores.size(), i + _regularisationWindow+1);
if (_regularisationStrategy == REG_AVERAGE) {
scores[i] = score_average(raw_scores,start,end);
} else {
scores[i] = score_min(raw_scores,start,end);
}
}
}
@ -110,89 +113,90 @@ void StatisticsBasedScorer::score(const candidates_t& candidates, const diffs_t
/** The sentence level scores have already been calculated, just need to average them
and include the differences. Allows scores which are floats **/
void SentenceLevelScorer::score(const candidates_t& candidates, const diffs_t& diffs,
statscores_t& scores) {
//cout << "*******SentenceLevelScorer::score" << endl;
if (!_scoreData) {
throw runtime_error("Score data not loaded");
statscores_t& scores)
{
//cout << "*******SentenceLevelScorer::score" << endl;
if (!_scoreData) {
throw runtime_error("Score data not loaded");
}
//calculate the score for the candidates
if (_scoreData->size() == 0) {
throw runtime_error("Score data is empty");
}
if (candidates.size() == 0) {
throw runtime_error("No candidates supplied");
}
int numCounts = _scoreData->get(0,candidates[0]).size();
vector<float> totals(numCounts);
for (size_t i = 0; i < candidates.size(); ++i) {
//cout << " i " << i << " candi " << candidates[i] ;
ScoreStats stats = _scoreData->get(i,candidates[i]);
if (stats.size() != totals.size()) {
stringstream msg;
msg << "Statistics for (" << "," << candidates[i] << ") have incorrect "
<< "number of fields. Found: " << stats.size() << " Expected: "
<< totals.size();
throw runtime_error(msg.str());
}
//calculate the score for the candidates
if (_scoreData->size() == 0) {
throw runtime_error("Score data is empty");
}
if (candidates.size() == 0) {
throw runtime_error("No candidates supplied");
}
int numCounts = _scoreData->get(0,candidates[0]).size();
vector<float> totals(numCounts);
for (size_t i = 0; i < candidates.size(); ++i) {
//cout << " i " << i << " candi " << candidates[i] ;
ScoreStats stats = _scoreData->get(i,candidates[i]);
if (stats.size() != totals.size()) {
stringstream msg;
msg << "Statistics for (" << "," << candidates[i] << ") have incorrect "
<< "number of fields. Found: " << stats.size() << " Expected: "
<< totals.size();
throw runtime_error(msg.str());
}
//Add up scores for all sentences, would normally be just one score
for (size_t k = 0; k < totals.size(); ++k) {
totals[k] += stats.get(k);
//cout << " stats " << stats.get(k) ;
}
//cout << endl;
}
//take average
//Add up scores for all sentences, would normally be just one score
for (size_t k = 0; k < totals.size(); ++k) {
totals[k] += stats.get(k);
//cout << " stats " << stats.get(k) ;
}
//cout << endl;
}
//take average
for (size_t k = 0; k < totals.size(); ++k) {
//cout << "totals = " << totals[k] << endl;
//cout << "cand = " << candidates.size() << endl;
totals[k] /= candidates.size();
totals[k] /= candidates.size();
//cout << "finaltotals = " << totals[k] << endl;
}
}
scores.push_back(calculateScore(totals));
scores.push_back(calculateScore(totals));
candidates_t last_candidates(candidates);
//apply each of the diffs, and get new scores
for (size_t i = 0; i < diffs.size(); ++i) {
for (size_t j = 0; j < diffs[i].size(); ++j) {
size_t sid = diffs[i][j].first;
size_t nid = diffs[i][j].second;
candidates_t last_candidates(candidates);
//apply each of the diffs, and get new scores
for (size_t i = 0; i < diffs.size(); ++i) {
for (size_t j = 0; j < diffs[i].size(); ++j) {
size_t sid = diffs[i][j].first;
size_t nid = diffs[i][j].second;
//cout << "sid = " << sid << endl;
//cout << "nid = " << nid << endl;
size_t last_nid = last_candidates[sid];
for (size_t k = 0; k < totals.size(); ++k) {
float diff = _scoreData->get(sid,nid).get(k)
- _scoreData->get(sid,last_nid).get(k);
size_t last_nid = last_candidates[sid];
for (size_t k = 0; k < totals.size(); ++k) {
float diff = _scoreData->get(sid,nid).get(k)
- _scoreData->get(sid,last_nid).get(k);
//cout << "diff = " << diff << endl;
totals[k] += diff/candidates.size();
totals[k] += diff/candidates.size();
//cout << "totals = " << totals[k] << endl;
}
last_candidates[sid] = nid;
}
scores.push_back(calculateScore(totals));
}
last_candidates[sid] = nid;
}
scores.push_back(calculateScore(totals));
}
//regularisation. This can either be none, or the min or average as described in
//Cer, Jurafsky and Manning at WMT08
if (_regularisationStrategy == REG_NONE || _regularisationWindow <= 0) {
//no regularisation
return;
}
//regularisation. This can either be none, or the min or average as described in
//Cer, Jurafsky and Manning at WMT08
if (_regularisationStrategy == REG_NONE || _regularisationWindow <= 0) {
//no regularisation
return;
}
//window size specifies the +/- in each direction
statscores_t raw_scores(scores);//copy scores
for (size_t i = 0; i < scores.size(); ++i) {
size_t start = 0;
if (i >= _regularisationWindow) {
start = i - _regularisationWindow;
}
size_t end = min(scores.size(), i + _regularisationWindow+1);
if (_regularisationStrategy == REG_AVERAGE) {
scores[i] = score_average(raw_scores,start,end);
} else {
scores[i] = score_min(raw_scores,start,end);
}
//window size specifies the +/- in each direction
statscores_t raw_scores(scores);//copy scores
for (size_t i = 0; i < scores.size(); ++i) {
size_t start = 0;
if (i >= _regularisationWindow) {
start = i - _regularisationWindow;
}
size_t end = min(scores.size(), i + _regularisationWindow+1);
if (_regularisationStrategy == REG_AVERAGE) {
scores[i] = score_average(raw_scores,start,end);
} else {
scores[i] = score_min(raw_scores,start,end);
}
}
}

View File

@ -23,173 +23,180 @@ class ScoreStats;
/**
* Superclass of all scorers and dummy implementation. In order to add a new
* scorer it should be sufficient to override prepareStats(), setReferenceFiles()
* and score() (or calculateScore()).
* and score() (or calculateScore()).
**/
class Scorer {
private:
string _name;
public:
Scorer(const string& name, const string& config): _name(name), _scoreData(0), _preserveCase(true){
cerr << "Scorer config string: " << config << endl;
size_t start = 0;
while (start < config.size()) {
size_t end = config.find(",",start);
if (end == string::npos) {
end = config.size();
}
string nv = config.substr(start,end-start);
size_t split = nv.find(":");
if (split == string::npos) {
throw runtime_error("Missing colon when processing scorer config: " + config);
}
string name = nv.substr(0,split);
string value = nv.substr(split+1,nv.size()-split-1);
cerr << "name: " << name << " value: " << value << endl;
_config[name] = value;
start = end+1;
}
class Scorer
{
private:
string _name;
};
virtual ~Scorer(){};
public:
/**
* returns the number of statistics needed for the computation of the score
**/
virtual size_t NumberOfScores() const { cerr << "Scorer: 0" << endl; return 0; };
/**
* set the reference files. This must be called before prepareStats.
**/
virtual void setReferenceFiles(const vector<string>& referenceFiles) {
//do nothing
}
/**
* Process the given guessed text, corresponding to the given reference sindex
* and add the appropriate statistics to the entry.
**/
virtual void prepareStats(size_t sindex, const string& text, ScoreStats& entry)
{}
virtual void prepareStats(const string& sindex, const string& text, ScoreStats& entry)
{
// cerr << sindex << endl;
this->prepareStats((size_t) atoi(sindex.c_str()), text, entry);
//cerr << text << std::endl;
}
/**
* Score using each of the candidate index, then go through the diffs
* applying each in turn, and calculating a new score each time.
**/
virtual void score(const candidates_t& candidates, const diffs_t& diffs,
statscores_t& scores) {
//dummy impl
if (!_scoreData) {
throw runtime_error("score data not loaded");
}
scores.push_back(0);
for (size_t i = 0; i < diffs.size(); ++i) {
scores.push_back(0);
}
}
/**
* Calculate the score of the sentences corresponding to the list of candidate
* indices. Each index indicates the 1-best choice from the n-best list.
**/
float score(const candidates_t& candidates) {
diffs_t diffs;
statscores_t scores;
score(candidates, diffs, scores);
return scores[0];
}
const string& getName() const {return _name;}
size_t getReferenceSize() {
if (_scoreData) {
return _scoreData->size();
}
return 0;
}
/**
* Set the score data, prior to scoring.
**/
virtual void setScoreData(ScoreData* data) {
_scoreData = data;
}
/**
* The scorer returns if it uses the reference alignment data
* for permutation distance scores
**/
virtual bool useAlignment() const {
//cout << "Scorer::useAlignment returning false " << endl;
return false;
};
//calculate the actual score
virtual statscore_t calculateScore(const vector<statscore_t>& totals){return 0;};
protected:
typedef map<string,int> encodings_t;
typedef map<string,int>::iterator encodings_it;
ScoreData* _scoreData;
encodings_t _encodings;
bool _preserveCase;
/**
* Value of config variable. If not provided, return default.
**/
string getConfig(const string& key, const string& def="") {
map<string,string>::iterator i = _config.find(key);
if (i == _config.end()) {
return def;
} else {
return i->second;
}
Scorer(const string& name, const string& config): _name(name), _scoreData(0), _preserveCase(true) {
cerr << "Scorer config string: " << config << endl;
size_t start = 0;
while (start < config.size()) {
size_t end = config.find(",",start);
if (end == string::npos) {
end = config.size();
}
/**
* Tokenise line and encode.
* Note: We assume that all tokens are separated by single spaces
**/
void encode(const string& line, vector<int>& encoded) {
//cerr << line << endl;
istringstream in (line);
string token;
while (in >> token) {
if (!_preserveCase) {
for (string::iterator i = token.begin(); i != token.end(); ++i) {
*i = tolower(*i);
}
}
encodings_it encoding = _encodings.find(token);
int encoded_token;
if (encoding == _encodings.end()) {
encoded_token = (int)_encodings.size();
_encodings[token] = encoded_token;
//cerr << encoded_token << "(n) ";
} else {
encoded_token = encoding->second;
//cerr << encoded_token << " ";
}
encoded.push_back(encoded_token);
}
//cerr << endl;
string nv = config.substr(start,end-start);
size_t split = nv.find(":");
if (split == string::npos) {
throw runtime_error("Missing colon when processing scorer config: " + config);
}
string name = nv.substr(0,split);
string value = nv.substr(split+1,nv.size()-split-1);
cerr << "name: " << name << " value: " << value << endl;
_config[name] = value;
start = end+1;
}
private:
map<string,string> _config;
};
virtual ~Scorer() {};
/**
* returns the number of statistics needed for the computation of the score
**/
virtual size_t NumberOfScores() const {
cerr << "Scorer: 0" << endl;
return 0;
};
/**
* set the reference files. This must be called before prepareStats.
**/
virtual void setReferenceFiles(const vector<string>& referenceFiles) {
//do nothing
}
/**
* Process the given guessed text, corresponding to the given reference sindex
* and add the appropriate statistics to the entry.
**/
virtual void prepareStats(size_t sindex, const string& text, ScoreStats& entry)
{}
virtual void prepareStats(const string& sindex, const string& text, ScoreStats& entry) {
// cerr << sindex << endl;
this->prepareStats((size_t) atoi(sindex.c_str()), text, entry);
//cerr << text << std::endl;
}
/**
* Score using each of the candidate index, then go through the diffs
* applying each in turn, and calculating a new score each time.
**/
virtual void score(const candidates_t& candidates, const diffs_t& diffs,
statscores_t& scores) {
//dummy impl
if (!_scoreData) {
throw runtime_error("score data not loaded");
}
scores.push_back(0);
for (size_t i = 0; i < diffs.size(); ++i) {
scores.push_back(0);
}
}
/**
* Calculate the score of the sentences corresponding to the list of candidate
* indices. Each index indicates the 1-best choice from the n-best list.
**/
float score(const candidates_t& candidates) {
diffs_t diffs;
statscores_t scores;
score(candidates, diffs, scores);
return scores[0];
}
const string& getName() const {
return _name;
}
size_t getReferenceSize() {
if (_scoreData) {
return _scoreData->size();
}
return 0;
}
/**
* Set the score data, prior to scoring.
**/
virtual void setScoreData(ScoreData* data) {
_scoreData = data;
}
/**
* The scorer returns if it uses the reference alignment data
* for permutation distance scores
**/
virtual bool useAlignment() const {
//cout << "Scorer::useAlignment returning false " << endl;
return false;
};
//calculate the actual score
virtual statscore_t calculateScore(const vector<statscore_t>& totals) {
return 0;
};
protected:
typedef map<string,int> encodings_t;
typedef map<string,int>::iterator encodings_it;
ScoreData* _scoreData;
encodings_t _encodings;
bool _preserveCase;
/**
* Value of config variable. If not provided, return default.
**/
string getConfig(const string& key, const string& def="") {
map<string,string>::iterator i = _config.find(key);
if (i == _config.end()) {
return def;
} else {
return i->second;
}
}
/**
* Tokenise line and encode.
* Note: We assume that all tokens are separated by single spaces
**/
void encode(const string& line, vector<int>& encoded) {
//cerr << line << endl;
istringstream in (line);
string token;
while (in >> token) {
if (!_preserveCase) {
for (string::iterator i = token.begin(); i != token.end(); ++i) {
*i = tolower(*i);
}
}
encodings_it encoding = _encodings.find(token);
int encoded_token;
if (encoding == _encodings.end()) {
encoded_token = (int)_encodings.size();
_encodings[token] = encoded_token;
//cerr << encoded_token << "(n) ";
} else {
encoded_token = encoding->second;
//cerr << encoded_token << " ";
}
encoded.push_back(encoded_token);
}
//cerr << endl;
}
private:
map<string,string> _config;
};
@ -197,11 +204,12 @@ class Scorer {
/**
* Abstract base class for scorers that work by adding statistics across all
* Abstract base class for scorers that work by adding statistics across all
* outout sentences, then apply some formula, e.g. bleu, per. **/
class StatisticsBasedScorer : public Scorer {
class StatisticsBasedScorer : public Scorer
{
public:
public:
StatisticsBasedScorer(const string& name, const string& config): Scorer(name,config) {
//configure regularisation
static string KEY_TYPE = "regtype";
@ -212,105 +220,110 @@ class StatisticsBasedScorer : public Scorer {
static string TYPE_MINIMUM = "min";
static string TRUE = "true";
static string FALSE = "false";
string type = getConfig(KEY_TYPE,TYPE_NONE);
if (type == TYPE_NONE) {
_regularisationStrategy = REG_NONE;
_regularisationStrategy = REG_NONE;
} else if (type == TYPE_AVERAGE) {
_regularisationStrategy = REG_AVERAGE;
_regularisationStrategy = REG_AVERAGE;
} else if (type == TYPE_MINIMUM) {
_regularisationStrategy = REG_MINIMUM;
_regularisationStrategy = REG_MINIMUM;
} else {
throw runtime_error("Unknown scorer regularisation strategy: " + type);
throw runtime_error("Unknown scorer regularisation strategy: " + type);
}
cerr << "Using scorer regularisation strategy: " << type << endl;
string window = getConfig(KEY_WINDOW,"0");
_regularisationWindow = atoi(window.c_str());
cerr << "Using scorer regularisation window: " << _regularisationWindow << endl;
string preservecase = getConfig(KEY_CASE,TRUE);
if (preservecase == TRUE) {
_preserveCase = true;
}else if (preservecase == FALSE) {
_preserveCase = false;
_preserveCase = true;
} else if (preservecase == FALSE) {
_preserveCase = false;
}
cerr << "Using case preservation: " << _preserveCase << endl;
}
~StatisticsBasedScorer(){};
virtual void score(const candidates_t& candidates, const diffs_t& diffs,
statscores_t& scores);
//calculate the actual score
virtual statscore_t calculateScore(const vector<statscore_t>& totals){return 0;};
~StatisticsBasedScorer() {};
virtual void score(const candidates_t& candidates, const diffs_t& diffs,
statscores_t& scores);
//calculate the actual score
virtual statscore_t calculateScore(const vector<statscore_t>& totals) {
return 0;
};
protected:
protected:
//regularisation
ScorerRegularisationStrategy _regularisationStrategy;
size_t _regularisationWindow;
//regularisation
ScorerRegularisationStrategy _regularisationStrategy;
size_t _regularisationWindow;
};
/**
* Abstract base class for scorers that work by using sentence level
* statistics eg. permutation distance metrics **/
class SentenceLevelScorer : public Scorer {
class SentenceLevelScorer : public Scorer
{
public:
SentenceLevelScorer(const string& name, const string& config): Scorer(name,config) {
//configure regularisation
static string KEY_TYPE = "regtype";
static string KEY_WINDOW = "regwin";
static string KEY_CASE = "case";
static string TYPE_NONE = "none";
static string TYPE_AVERAGE = "average";
static string TYPE_MINIMUM = "min";
static string TRUE = "true";
static string FALSE = "false";
string type = getConfig(KEY_TYPE,TYPE_NONE);
if (type == TYPE_NONE) {
_regularisationStrategy = REG_NONE;
} else if (type == TYPE_AVERAGE) {
_regularisationStrategy = REG_AVERAGE;
} else if (type == TYPE_MINIMUM) {
_regularisationStrategy = REG_MINIMUM;
} else {
throw runtime_error("Unknown scorer regularisation strategy: " + type);
}
cerr << "Using scorer regularisation strategy: " << type << endl;
public:
SentenceLevelScorer(const string& name, const string& config): Scorer(name,config) {
//configure regularisation
static string KEY_TYPE = "regtype";
static string KEY_WINDOW = "regwin";
static string KEY_CASE = "case";
static string TYPE_NONE = "none";
static string TYPE_AVERAGE = "average";
static string TYPE_MINIMUM = "min";
static string TRUE = "true";
static string FALSE = "false";
string window = getConfig(KEY_WINDOW,"0");
_regularisationWindow = atoi(window.c_str());
cerr << "Using scorer regularisation window: " << _regularisationWindow << endl;
string preservecase = getConfig(KEY_CASE,TRUE);
if (preservecase == TRUE) {
_preserveCase = true;
}else if (preservecase == FALSE) {
_preserveCase = false;
}
cerr << "Using case preservation: " << _preserveCase << endl;
string type = getConfig(KEY_TYPE,TYPE_NONE);
if (type == TYPE_NONE) {
_regularisationStrategy = REG_NONE;
} else if (type == TYPE_AVERAGE) {
_regularisationStrategy = REG_AVERAGE;
} else if (type == TYPE_MINIMUM) {
_regularisationStrategy = REG_MINIMUM;
} else {
throw runtime_error("Unknown scorer regularisation strategy: " + type);
}
cerr << "Using scorer regularisation strategy: " << type << endl;
string window = getConfig(KEY_WINDOW,"0");
_regularisationWindow = atoi(window.c_str());
cerr << "Using scorer regularisation window: " << _regularisationWindow << endl;
string preservecase = getConfig(KEY_CASE,TRUE);
if (preservecase == TRUE) {
_preserveCase = true;
} else if (preservecase == FALSE) {
_preserveCase = false;
}
cerr << "Using case preservation: " << _preserveCase << endl;
}
~SentenceLevelScorer(){};
virtual void score(const candidates_t& candidates, const diffs_t& diffs,
statscores_t& scores);
}
~SentenceLevelScorer() {};
virtual void score(const candidates_t& candidates, const diffs_t& diffs,
statscores_t& scores);
//calculate the actual score
virtual statscore_t calculateScore(const vector<statscore_t>& totals){return 0;};
//calculate the actual score
virtual statscore_t calculateScore(const vector<statscore_t>& totals) {
return 0;
};
protected:
protected:
//regularisation
ScorerRegularisationStrategy _regularisationStrategy;
size_t _regularisationWindow;
//regularisation
ScorerRegularisationStrategy _regularisationStrategy;
size_t _regularisationWindow;
};

View File

@ -19,43 +19,44 @@
using namespace std;
class ScorerFactory {
class ScorerFactory
{
public:
vector<string> getTypes() {
vector<string> types;
types.push_back(string("BLEU1"));
types.push_back(string("BLEU"));
types.push_back(string("PER"));
types.push_back(string("HAMMING"));
types.push_back(string("KENDALL"));
return types;
public:
vector<string> getTypes() {
vector<string> types;
types.push_back(string("BLEU1"));
types.push_back(string("BLEU"));
types.push_back(string("PER"));
types.push_back(string("HAMMING"));
types.push_back(string("KENDALL"));
return types;
}
Scorer* getScorer(const string& type, const string& config = "") {
size_t scorerTypes = type.find(",");
if(scorerTypes == string::npos) {
if (type == "BLEU1") {
string conf;
if (config.length() > 0) {
conf = config + ",ngramlen:1";
} else {
conf = config + "ngramlen:1";
}
Scorer* getScorer(const string& type, const string& config = "") {
size_t scorerTypes = type.find(",");
if(scorerTypes == string::npos) {
if (type == "BLEU1") {
string conf;
if (config.length() > 0) {
conf = config + ",ngramlen:1";
} else {
conf = config + "ngramlen:1";
}
return (BleuScorer*) new BleuScorer(conf);
} else if (type == "BLEU") {
return (BleuScorer*) new BleuScorer(config);
} else if (type == "PER") {
return (PerScorer*) new PerScorer(config);
} else if ((type == "HAMMING") || (type == "KENDALL")) {
return (PermutationScorer*) new PermutationScorer(type, config);
} else {
throw runtime_error("Unknown scorer type: " + type);
}
} else {
return (InterpolatedScorer*) new InterpolatedScorer(type, config);
}
}
return (BleuScorer*) new BleuScorer(conf);
} else if (type == "BLEU") {
return (BleuScorer*) new BleuScorer(config);
} else if (type == "PER") {
return (PerScorer*) new PerScorer(config);
} else if ((type == "HAMMING") || (type == "KENDALL")) {
return (PermutationScorer*) new PermutationScorer(type, config);
} else {
throw runtime_error("Unknown scorer type: " + type);
}
} else {
return (InterpolatedScorer*) new InterpolatedScorer(type, config);
}
}
};
#endif //__SCORER_FACTORY_H

View File

@ -12,8 +12,8 @@
*/
double Timer::elapsed_time()
{
  // Difference between the current calendar time and start_time, as computed
  // by difftime().
  time_t now;
  time(&now);
  return difftime(now, start_time);
}
@ -36,7 +36,7 @@ double Timer::get_elapsed_time()
void Timer::start(const char* msg)
{
// Print an optional message, something like "Starting timer t";
if (msg) TRACE_ERR( msg << std::endl);
if (msg) TRACE_ERR( msg << std::endl);
// Return immediately if the timer is already running
if (running) return;

View File

@ -8,16 +8,16 @@
class Timer
{
friend std::ostream& operator<<(std::ostream& os, Timer& t);
friend std::ostream& operator<<(std::ostream& os, Timer& t);
private:
private:
bool running;
time_t start_time;
//TODO in seconds?
//TODO in seconds?
double elapsed_time();
public:
public:
/***
* 'running' is initially false. A timer needs to be explicitly started
* using 'start' or 'restart'

View File

@ -1,7 +1,7 @@
/*
* Util.cpp
* met - Minimum Error Training
*
*
* Created by Nicola Bertoldi on 13/05/08.
*
*/
@ -18,47 +18,47 @@ Timer g_timer;
// Global verbosity level; starts at 0.
int verbose=0;

// Returns the current global verbosity level.
int verboselevel()
{
  return verbose;
}

// Sets the global verbosity level to v and returns the new level.
int setverboselevel(int v)
{
  verbose=v;
  return verbose;
}
/**
 * Split the first non-empty field off the front of theString.
 *
 * Leading delimiters are skipped. The text before the next delimiter is stored
 * in substring, and that text plus the delimiter is removed from theString. If
 * no delimiter is left, substring receives the whole remaining text and
 * theString becomes empty.
 *
 * @param theString text to consume from; modified in place
 * @param substring receives the extracted field
 * @param delimiter field separator; an empty delimiter is treated as "not found"
 * @return position at which the delimiter was found, or -1 if none was found
 */
int getNextPound(std::string &theString, std::string &substring, const std::string delimiter)
{
  // std::string::find() on an empty needle always returns 0, which would keep
  // the loop below spinning forever.
  if (delimiter.empty()) {
    substring.assign(theString);
    theString.assign("");
    return -1;
  }

  // size_type, not unsigned int: a narrower type can never compare equal to npos
  std::string::size_type pos = 0;
  //skip all occurrences of delimiter
  while ( pos == 0 ) {
    pos = theString.find(delimiter);
    if (pos != std::string::npos) {
      substring.assign(theString, 0, pos);
      theString.erase(0,pos + delimiter.size());
    } else {
      substring.assign(theString);
      theString.assign("");
    }
  }
  return (pos == std::string::npos) ? -1 : static_cast<int>(pos);
}
inputfilestream::inputfilestream(const std::string &filePath)
: std::istream(0),
m_streambuf(0)
: std::istream(0),
m_streambuf(0)
{
//check if file is readable
std::filebuf* fb = new std::filebuf();
_good=(fb->open(filePath.c_str(), std::ios::in)!=NULL);
if (filePath.size() > 3 &&
filePath.substr(filePath.size() - 3, 3) == ".gz")
{
fb->close(); delete fb;
m_streambuf = new gzfilebuf(filePath.c_str());
filePath.substr(filePath.size() - 3, 3) == ".gz") {
fb->close();
delete fb;
m_streambuf = new gzfilebuf(filePath.c_str());
} else {
m_streambuf = fb;
}
@ -67,7 +67,8 @@ m_streambuf(0)
inputfilestream::~inputfilestream()
{
  // Free the stream buffer exactly once and clear the pointer afterwards.
  delete m_streambuf;
  m_streambuf = 0;
}
void inputfilestream::close()
@ -75,16 +76,15 @@ void inputfilestream::close()
}
outputfilestream::outputfilestream(const std::string &filePath)
: std::ostream(0),
m_streambuf(0)
: std::ostream(0),
m_streambuf(0)
{
//check if file is readable
std::filebuf* fb = new std::filebuf();
_good=(fb->open(filePath.c_str(), std::ios::out)!=NULL);
if (filePath.size() > 3 && filePath.substr(filePath.size() - 3, 3) == ".gz")
{
throw runtime_error("Output to a zipped file not supported!");
_good=(fb->open(filePath.c_str(), std::ios::out)!=NULL);
if (filePath.size() > 3 && filePath.substr(filePath.size() - 3, 3) == ".gz") {
throw runtime_error("Output to a zipped file not supported!");
} else {
m_streambuf = fb;
}
@ -93,7 +93,8 @@ m_streambuf(0)
outputfilestream::~outputfilestream()
{
  // Free the stream buffer exactly once and clear the pointer afterwards.
  delete m_streambuf;
  m_streambuf = 0;
}
void outputfilestream::close()
@ -103,10 +104,14 @@ void outputfilestream::close()
/**
 * Reverse the byte order of n consecutive elements of sz bytes each, in place.
 *
 * @param p  start of the buffer holding the elements
 * @param sz size of one element in bytes; nothing is done if sz < 2
 * @param n  number of elements; nothing is done if n < 1
 * @return always 0
 */
int swapbytes(char *p, int sz, int n)
{
  if((n<1) || (sz<2)) return 0;
  for(; n--; p+=sz) {
    // Swap bytes pairwise, walking from both ends of the element to the middle.
    char *l = p;
    char *h = p + sz - 1;
    while (l < h) {
      const char c = *h;
      *h = *l;
      *l = c;
      ++l;
      --h;
    }
  }
  return 0;
}
@ -116,12 +121,12 @@ void ResetUserTime()
};
// Passes the message to the global timer's check().
void PrintUserTime(const std::string &message)
{
  g_timer.check(message.c_str());
}
// Returns the elapsed time reported by the global timer.
double GetUserTime()
{
  return g_timer.get_elapsed_time();
}

View File

@ -51,45 +51,49 @@ int getNextPound(std::string &theString, std::string &substring, const std::stri
/**
 * Convert text to a value of type T using stream extraction (operator>>).
 *
 * The result starts out value-initialised, so a failed extraction never
 * returns an indeterminate value.
 */
template<typename T>
inline T Scan(const std::string &input)
{
  std::stringstream stream(input);
  T ret = T();
  stream >> ret;
  return ret;
}
/**
 * std::istream that is constructed from a file path.
 */
class inputfilestream : public std::istream
{
protected:
  std::streambuf *m_streambuf;  // stream buffer held by this object
  bool _good;                   // value reported by good()
public:
  inputfilestream(const std::string &filePath);
  ~inputfilestream();
  // Returns the stored flag. NOTE: this hides std::istream::good() rather than
  // overriding it.
  bool good() {
    return _good;
  }
  void close();
};
/**
 * std::ostream that is constructed from a file path.
 */
class outputfilestream : public std::ostream
{
protected:
  std::streambuf *m_streambuf;  // stream buffer held by this object
  bool _good;                   // value reported by good()
public:
  outputfilestream(const std::string &filePath);
  ~outputfilestream();
  // Returns the stored flag. NOTE: this hides std::ostream::good() rather than
  // overriding it.
  bool good() {
    return _good;
  }
  void close();
};
/**
 * Format a value as text using stream insertion (operator<<).
 *
 * @throws std::runtime_error if the insertion leaves the stream in a failed state
 */
template<typename T>
inline std::string stringify(T x)
{
  std::ostringstream o;
  if (!(o << x))
    throw std::runtime_error("stringify(template<typename T>)");
  return o.str();
}
// Utilities to measure decoding time
@ -99,11 +103,11 @@ double GetUserTime();
/**
 * Return a copy of Src with leading and trailing characters from c removed.
 *
 * @param Src text to trim
 * @param c   set of characters to strip (space, CR and LF by default)
 * @return the trimmed text; empty if Src is empty or consists only of
 *         characters from c
 */
inline std::string trimStr(const std::string& Src, const std::string& c = " \r\n")
{
  // size_type, not unsigned int: where size_t is wider than unsigned int, a
  // truncated npos never compares equal to std::string::npos and the
  // "nothing left" case would fall through to an out-of-range substr().
  const std::string::size_type last = Src.find_last_not_of(c);
  if (last == std::string::npos) return std::string();
  // A last kept character exists, so a first one exists as well.
  const std::string::size_type first = Src.find_first_not_of(c);
  return Src.substr(first, (last-first)+1);
}

View File

@ -18,7 +18,8 @@
using namespace std;
void usage() {
void usage()
{
cerr<<"usage: extractor [options])"<<endl;
cerr<<"[--sctype|-s] the scorer type (default BLEU), possibly comma separated list of interpolated types"<<endl;
cerr<<"[--scconfig|-c] configuration string passed to scorer"<<endl;
@ -28,7 +29,7 @@ void usage() {
cerr<<"[--nbest|-n] the nbest file"<<endl;
cerr<<"[--scfile|-S] the scorer data output file"<<endl;
cerr<<"[--ffile|-F] the feature data output file"<<endl;
cerr<<"[--prev-ffile|-E] comma separated list of previous feature data" <<endl;
cerr<<"[--prev-ffile|-E] comma separated list of previous feature data" <<endl;
cerr<<"[--prev-scfile|-R] comma separated list of previous scorer data"<<endl;
cerr<<"[-v] verbose level"<<endl;
cerr<<"[--help|-h] print this message and exit"<<endl;
@ -36,185 +37,185 @@ cerr<<"[--prev-ffile|-E] comma separated list of previous feature data" <<endl;
}
// Long command line options handed to getopt_long(). Each entry gives the long
// name, whether it takes an argument, and the short option character stored as
// its value. The all-zero entry terminates the table.
static struct option long_options[] = {
  {"sctype",required_argument,0,'s'},
  {"scconfig",required_argument,0,'c'},
  {"reference",required_argument,0,'r'},
  {"binary",no_argument,0,'b'},
  {"nbest",required_argument,0,'n'},
  {"scfile",required_argument,0,'S'},
  {"ffile",required_argument,0,'F'},
  {"prev-scfile",required_argument,0,'R'},
  {"prev-ffile",required_argument,0,'E'},
  {"verbose",required_argument,0,'v'},
  {"help",no_argument,0,'h'},
  {0, 0, 0, 0}
};
int option_index;
int main(int argc, char** argv) {
ResetUserTime();
/*
Timer timer;
timer.start("Starting...");
*/
//defaults
string scorerType("BLEU");
string scorerConfig("");
string referenceFile("");
string nbestFile("");
string scoreDataFile("statscore.data");
string featureDataFile("features.data");
string prevScoreDataFile("");
string prevFeatureDataFile("");
bool binmode = false;
int verbosity = 0;
int c;
while ((c=getopt_long (argc,argv, "s:w:r:a:n:S:F:R:E:v:hb", long_options, &option_index)) != -1) {
switch(c) {
case 's':
scorerType = string(optarg);
break;
case 'c':
scorerConfig = string(optarg);
break;
case 'r':
referenceFile = string(optarg);
break;
case 'b':
binmode = true;
break;
case 'n':
nbestFile = string(optarg);
break;
case 'S':
scoreDataFile = string(optarg);
break;
case 'F':
featureDataFile = string(optarg);
break;
case 'E':
prevFeatureDataFile = string(optarg);
break;
case 'R':
prevScoreDataFile = string(optarg);
break;
case 'v':
verbosity = atoi(optarg);
break;
default:
usage();
}
int main(int argc, char** argv)
{
ResetUserTime();
/*
Timer timer;
timer.start("Starting...");
*/
//defaults
string scorerType("BLEU");
string scorerConfig("");
string referenceFile("");
string nbestFile("");
string scoreDataFile("statscore.data");
string featureDataFile("features.data");
string prevScoreDataFile("");
string prevFeatureDataFile("");
bool binmode = false;
int verbosity = 0;
int c;
while ((c=getopt_long (argc,argv, "s:w:r:a:n:S:F:R:E:v:hb", long_options, &option_index)) != -1) {
switch(c) {
case 's':
scorerType = string(optarg);
break;
case 'c':
scorerConfig = string(optarg);
break;
case 'r':
referenceFile = string(optarg);
break;
case 'b':
binmode = true;
break;
case 'n':
nbestFile = string(optarg);
break;
case 'S':
scoreDataFile = string(optarg);
break;
case 'F':
featureDataFile = string(optarg);
break;
case 'E':
prevFeatureDataFile = string(optarg);
break;
case 'R':
prevScoreDataFile = string(optarg);
break;
case 'v':
verbosity = atoi(optarg);
break;
default:
usage();
}
try {
}
try {
//check whether score statistics file is specified
if (scoreDataFile.length() == 0){
throw runtime_error("Error: output score statistics file is not specified");
if (scoreDataFile.length() == 0) {
throw runtime_error("Error: output score statistics file is not specified");
}
//check wheter feature file is specified
if (featureDataFile.length() == 0){
throw runtime_error("Error: output feature file is not specified");
if (featureDataFile.length() == 0) {
throw runtime_error("Error: output feature file is not specified");
}
//check whether reference file is specified when nbest is specified
if ((nbestFile.length() > 0 && referenceFile.length() == 0)){
throw runtime_error("Error: reference file is not specified; you can not score the nbest");
if ((nbestFile.length() > 0 && referenceFile.length() == 0)) {
throw runtime_error("Error: reference file is not specified; you can not score the nbest");
}
vector<string> nbestFiles;
if (nbestFile.length() > 0){
std::string substring;
while (!nbestFile.empty()){
getNextPound(nbestFile, substring, ",");
nbestFiles.push_back(substring);
}
if (nbestFile.length() > 0) {
std::string substring;
while (!nbestFile.empty()) {
getNextPound(nbestFile, substring, ",");
nbestFiles.push_back(substring);
}
}
vector<string> referenceFiles;
if (referenceFile.length() > 0){
std::string substring;
while (!referenceFile.empty()){
getNextPound(referenceFile, substring, ",");
referenceFiles.push_back(substring);
}
if (referenceFile.length() > 0) {
std::string substring;
while (!referenceFile.empty()) {
getNextPound(referenceFile, substring, ",");
referenceFiles.push_back(substring);
}
}
vector<string> prevScoreDataFiles;
if (prevScoreDataFile.length() > 0){
std::string substring;
while (!prevScoreDataFile.empty()){
getNextPound(prevScoreDataFile, substring, ",");
prevScoreDataFiles.push_back(substring);
}
if (prevScoreDataFile.length() > 0) {
std::string substring;
while (!prevScoreDataFile.empty()) {
getNextPound(prevScoreDataFile, substring, ",");
prevScoreDataFiles.push_back(substring);
}
}
vector<string> prevFeatureDataFiles;
if (prevFeatureDataFile.length() > 0){
std::string substring;
while (!prevFeatureDataFile.empty()){
getNextPound(prevFeatureDataFile, substring, ",");
prevFeatureDataFiles.push_back(substring);
}
if (prevFeatureDataFile.length() > 0) {
std::string substring;
while (!prevFeatureDataFile.empty()) {
getNextPound(prevFeatureDataFile, substring, ",");
prevFeatureDataFiles.push_back(substring);
}
}
if (prevScoreDataFiles.size() != prevFeatureDataFiles.size()){
throw runtime_error("Error: there is a different number of previous score and feature files");
if (prevScoreDataFiles.size() != prevFeatureDataFiles.size()) {
throw runtime_error("Error: there is a different number of previous score and feature files");
}
if (binmode) cerr << "Binary write mode is selected" << endl;
else cerr << "Binary write mode is NOT selected" << endl;
//TODO is comma separated list? split and create a scorer with multiple parts
TRACE_ERR("Scorer type: " << scorerType << endl);
ScorerFactory sfactory;
Scorer* scorer = sfactory.getScorer(scorerType,scorerConfig);
//load references
if (referenceFiles.size() > 0)
scorer->setReferenceFiles(referenceFiles);
PrintUserTime("References loaded");
Data data(*scorer);
//load old data
for (size_t i=0;i < prevScoreDataFiles.size(); i++){
data.load(prevFeatureDataFiles.at(i), prevScoreDataFiles.at(i));
}
PrintUserTime("Previous data loaded");
//computing score statistics of each nbest file
for (size_t i=0;i < nbestFiles.size(); i++){
data.loadnbest(nbestFiles.at(i));
}
if (binmode) cerr << "Binary write mode is selected" << endl;
else cerr << "Binary write mode is NOT selected" << endl;
PrintUserTime("Nbest entries loaded and scored");
if (binmode)
cerr << "Binary write mode is selected" << endl;
else
cerr << "Binary write mode is NOT selected" << endl;
data.save(featureDataFile, scoreDataFile, binmode);
PrintUserTime("Stopping...");
/*
timer.stop("Stopping...");
*/
return EXIT_SUCCESS;
} catch (const exception& e) {
cerr << "Exception: " << e.what() << endl;
return EXIT_FAILURE;
//TODO is comma separated list? split and create a scorer with multiple parts
TRACE_ERR("Scorer type: " << scorerType << endl);
ScorerFactory sfactory;
Scorer* scorer = sfactory.getScorer(scorerType,scorerConfig);
//load references
if (referenceFiles.size() > 0)
scorer->setReferenceFiles(referenceFiles);
PrintUserTime("References loaded");
Data data(*scorer);
//load old data
for (size_t i=0; i < prevScoreDataFiles.size(); i++) {
data.load(prevFeatureDataFiles.at(i), prevScoreDataFiles.at(i));
}
PrintUserTime("Previous data loaded");
//computing score statistics of each nbest file
for (size_t i=0; i < nbestFiles.size(); i++) {
data.loadnbest(nbestFiles.at(i));
}
PrintUserTime("Nbest entries loaded and scored");
if (binmode)
cerr << "Binary write mode is selected" << endl;
else
cerr << "Binary write mode is NOT selected" << endl;
data.save(featureDataFile, scoreDataFile, binmode);
PrintUserTime("Stopping...");
/*
timer.stop("Stopping...");
*/
return EXIT_SUCCESS;
} catch (const exception& e) {
cerr << "Exception: " << e.what() << endl;
return EXIT_FAILURE;
}
}

View File

@ -4,66 +4,70 @@
#include <streambuf>
#include <zlib.h>
class gzfilebuf : public std::streambuf {
class gzfilebuf : public std::streambuf
{
public:
gzfilebuf(const char *filename)
{ _gzf = gzopen(filename, "rb");
gzfilebuf(const char *filename) {
_gzf = gzopen(filename, "rb");
setg (_buff+sizeof(int), // beginning of putback area
_buff+sizeof(int), // read position
_buff+sizeof(int)); // end position
}
~gzfilebuf() { gzclose(_gzf); }
~gzfilebuf() {
gzclose(_gzf);
}
protected:
virtual int_type overflow (int_type c) {
throw;
throw;
}
// write multiple characters
virtual
std::streamsize xsputn (const char* s,
std::streamsize num) {
throw;
throw;
}
virtual std::streampos seekpos ( std::streampos sp, std::ios_base::openmode which = std::ios_base::in | std::ios_base::out ){ throw;
virtual std::streampos seekpos ( std::streampos sp, std::ios_base::openmode which = std::ios_base::in | std::ios_base::out ) {
throw;
}
//read one character
virtual int_type underflow () {
// is read position before end of _buff?
if (gptr() < egptr()) {
return traits_type::to_int_type(*gptr());
}
if (gptr() < egptr()) {
return traits_type::to_int_type(*gptr());
}
/* process size of putback area
* - use number of characters read
* - but at most four
*/
unsigned int numPutback = gptr() - eback();
if (numPutback > sizeof(int)) {
numPutback = sizeof(int);
}
/* process size of putback area
* - use number of characters read
* - but at most four
*/
unsigned int numPutback = gptr() - eback();
if (numPutback > sizeof(int)) {
numPutback = sizeof(int);
}
/* copy up to four characters previously read into
* the putback _buff (area of first four characters)
*/
std::memmove (_buff+(sizeof(int)-numPutback), gptr()-numPutback,
numPutback);
/* copy up to four characters previously read into
* the putback _buff (area of first four characters)
*/
std::memmove (_buff+(sizeof(int)-numPutback), gptr()-numPutback,
numPutback);
// read new characters
int num = gzread(_gzf, _buff+sizeof(int), _buffsize-sizeof(int));
if (num <= 0) {
// ERROR or EOF
return EOF;
}
// read new characters
int num = gzread(_gzf, _buff+sizeof(int), _buffsize-sizeof(int));
if (num <= 0) {
// ERROR or EOF
return EOF;
}
// reset _buff pointers
setg (_buff+(sizeof(int)-numPutback), // beginning of putback area
_buff+sizeof(int), // read position
_buff+sizeof(int)+num); // end of buffer
// reset _buff pointers
setg (_buff+(sizeof(int)-numPutback), // beginning of putback area
_buff+sizeof(int), // read position
_buff+sizeof(int)+num); // end of buffer
// return next character
return traits_type::to_int_type(*gptr());
// return next character
return traits_type::to_int_type(*gptr());
}
std::streamsize xsgetn (char* s,

View File

@ -28,7 +28,8 @@ float min_interval = 1e-3;
using namespace std;
void usage(void) {
void usage(void)
{
cerr<<"usage: mert -d <dimensions> (mandatory )"<<endl;
cerr<<"[-n retry ntimes (default 1)]"<<endl;
cerr<<"[-o\tthe indexes to optimize(default all)]"<<endl;
@ -44,34 +45,34 @@ void usage(void) {
exit(1);
}
static struct option long_options[] =
{
{"pdim", 1, 0, 'd'},
{"ntry",1,0,'n'},
{"rseed",required_argument,0,'r'},
{"optimize",1,0,'o'},
{"type",1,0,'t'},
{"sctype",1,0,'s'},
{"scconfig",required_argument,0,'c'},
{"scfile",1,0,'S'},
{"ffile",1,0,'F'},
{"ifile",1,0,'i'},
{"verbose",1,0,'v'},
{"help",no_argument,0,'h'},
{0, 0, 0, 0}
};
static struct option long_options[] = {
{"pdim", 1, 0, 'd'},
{"ntry",1,0,'n'},
{"rseed",required_argument,0,'r'},
{"optimize",1,0,'o'},
{"type",1,0,'t'},
{"sctype",1,0,'s'},
{"scconfig",required_argument,0,'c'},
{"scfile",1,0,'S'},
{"ffile",1,0,'F'},
{"ifile",1,0,'i'},
{"verbose",1,0,'v'},
{"help",no_argument,0,'h'},
{0, 0, 0, 0}
};
int option_index;
int main (int argc, char **argv) {
ResetUserTime();
/*
Timer timer;
timer.start("Starting...");
*/
int main (int argc, char **argv)
{
ResetUserTime();
/*
Timer timer;
timer.start("Starting...");
*/
int c,pdim,i;
pdim=-1;
int ntry=1;
@ -132,23 +133,23 @@ int main (int argc, char **argv) {
usage();
if (hasSeed) {
cerr << "Seeding random numbers with " << seed << endl;
srandom(seed);
cerr << "Seeding random numbers with " << seed << endl;
srandom(seed);
} else {
cerr << "Seeding random numbers with system clock " << endl;
srandom(time(NULL));
cerr << "Seeding random numbers with system clock " << endl;
srandom(time(NULL));
}
ifstream opt(initfile.c_str());
if(opt.fail()){
if(opt.fail()) {
cerr<<"could not open initfile: " << initfile << endl;
exit(3);
}
start.resize(pdim);//to do:read from file
int j;
for( j=0;j<pdim&&!opt.fail();j++)
for( j=0; j<pdim&&!opt.fail(); j++)
opt>>start[j];
if(j<pdim){
if(j<pdim) {
cerr<<"error could not initialize start point with " << initfile << endl;
exit(3);
}
@ -156,24 +157,24 @@ int main (int argc, char **argv) {
opt.close();
vector<string> ScoreDataFiles;
if (scorerfile.length() > 0){
if (scorerfile.length() > 0) {
std::string substring;
while (!scorerfile.empty()){
while (!scorerfile.empty()) {
getNextPound(scorerfile, substring, ",");
ScoreDataFiles.push_back(substring);
}
}
vector<string> FeatureDataFiles;
if (featurefile.length() > 0){
if (featurefile.length() > 0) {
std::string substring;
while (!featurefile.empty()){
while (!featurefile.empty()) {
getNextPound(featurefile, substring, ",");
FeatureDataFiles.push_back(substring);
}
}
if (ScoreDataFiles.size() != FeatureDataFiles.size()){
if (ScoreDataFiles.size() != FeatureDataFiles.size()) {
throw runtime_error("Error: there is a different number of previous score and feature files");
}
@ -183,32 +184,37 @@ int main (int argc, char **argv) {
//load data
Data D(*TheScorer);
for (size_t i=0;i < ScoreDataFiles.size(); i++){
for (size_t i=0; i < ScoreDataFiles.size(); i++) {
cerr<<"Loading Data from: "<< ScoreDataFiles.at(i) << " and " << FeatureDataFiles.at(i) << endl;
D.load(FeatureDataFiles.at(i), ScoreDataFiles.at(i));
}
PrintUserTime("Data loaded");
if (tooptimizestr.length() > 0){
if (tooptimizestr.length() > 0) {
cerr << "Weights to optimize: " << tooptimizestr << endl;
//parse string to get weights to optimize
//and set them as active
std::string substring;
int index;
while (!tooptimizestr.empty()){
while (!tooptimizestr.empty()) {
getNextPound(tooptimizestr, substring, ",");
index = D.getFeatureIndex(substring);
cerr << "FeatNameIndex:" << index << " to insert" << endl;
//index = strtol(substring.c_str(), NULL, 10);
if (index >= 0 && index < pdim){ tooptimize.push_back(index); }
else{ cerr << "Index " << index << " is out of bounds. Allowed indexes are [0," << (pdim-1) << "]." << endl; }
if (index >= 0 && index < pdim) {
tooptimize.push_back(index);
} else {
cerr << "Index " << index << " is out of bounds. Allowed indexes are [0," << (pdim-1) << "]." << endl;
}
}
}else{
} else {
//set all weights as active
tooptimize.resize(pdim);//We'll optimize on everything
for(int i=0;i<pdim;i++){ tooptimize[i]=1; }
for(int i=0; i<pdim; i++) {
tooptimize[i]=1;
}
}
Optimizer *O=OptimizerFactory::BuildOptimizer(pdim,tooptimize,start,type);
@ -216,51 +222,51 @@ int main (int argc, char **argv) {
O->SetFData(D.getFeatureData());
Point P(start);//Generate from the full feature set. Warning: must be done after Optimizer initialization
statscore_t best=O->Run(P);
Point bestP=P;
Point bestP=P;
statscore_t mean=best;
statscore_t var=best*best;
stringstream oss;
stringstream oss;
oss << "Try number 1";
PrintUserTime(oss.str());
vector<parameter_t> min(Point::getdim());
vector<parameter_t> max(Point::getdim());
for(unsigned int d=0;d<Point::getdim();d++){
for(unsigned int d=0; d<Point::getdim(); d++) {
min[d]=0.0;
max[d]=1.0;
}
//note: these mins and maxes are the bounds for the starting points of the algorithm, not strict bounds on the result!
for(int i=1;i<ntry;i++){
P.Randomize(min,max);
statscore_t score=O->Run(P);
if(score>best){
best=score;
bestP=P;
}
mean+=score;
var+=(score*score);
oss.str("");
oss << "Try number " << (i+1);
PrintUserTime(oss.str());
}
mean/=(float)ntry;
var/=(float)ntry;
var=sqrt(abs(var-mean*mean));
if (verboselevel()>1)
cerr<<"best score: "<< best << " variance of the score (for "<<ntry<<" try): "<<var<<endl;
//L1-Normalization of the best Point
bestP.NormalizeL1();
cerr << "Best point: " << bestP << " => " << best << endl;
ofstream res("weights.txt");
res<<bestP<<endl;
PrintUserTime("Stopping...");
for(int i=1; i<ntry; i++) {
P.Randomize(min,max);
statscore_t score=O->Run(P);
if(score>best) {
best=score;
bestP=P;
}
mean+=score;
var+=(score*score);
oss.str("");
oss << "Try number " << (i+1);
PrintUserTime(oss.str());
}
mean/=(float)ntry;
var/=(float)ntry;
var=sqrt(abs(var-mean*mean));
if (verboselevel()>1)
cerr<<"best score: "<< best << " variance of the score (for "<<ntry<<" try): "<<var<<endl;
//L1-Normalization of the best Point
bestP.NormalizeL1();
cerr << "Best point: " << bestP << " => " << best << endl;
ofstream res("weights.txt");
res<<bestP<<endl;
PrintUserTime("Stopping...");
}

View File

@ -8,52 +8,53 @@
using namespace std;
int main(int argc, char** argv) {
cout << "Testing the scorer" << endl;
//BleuScorer bs("test-scorer-data/cppstats.feats.opt");;
vector<string> references;
references.push_back("test_scorer_data/reference.txt");
//bs.prepare(references, "test-scorer-data/nbest.out");
Scorer* scorer = new BleuScorer();;
scorer->setReferenceFiles(references);
Data d(*scorer);
d.loadnbest("test_scorer_data/nbest.out");
//sd.savetxt();
int main(int argc, char** argv)
{
cout << "Testing the scorer" << endl;
//BleuScorer bs("test-scorer-data/cppstats.feats.opt");;
vector<string> references;
references.push_back("test_scorer_data/reference.txt");
//bs.prepare(references, "test-scorer-data/nbest.out");
Scorer* scorer = new BleuScorer();;
scorer->setReferenceFiles(references);
Data d(*scorer);
d.loadnbest("test_scorer_data/nbest.out");
//sd.savetxt();
//calculate two bleu scores, nbest and a diff
ScoreData* sd=d.getScoreData();
scorer->setScoreData(sd);
candidates_t candidates(sd->size());;
for (size_t i = 0; i < sd->size(); ++i) {
sd->get(i,0).savetxt("/dev/stdout");
}
//calculate two bleu scores, nbest and a diff
ScoreData* sd=d.getScoreData();
scorer->setScoreData(sd);
candidates_t candidates(sd->size());;
for (size_t i = 0; i < sd->size(); ++i) {
sd->get(i,0).savetxt("/dev/stdout");
}
diffs_t diffs;
diff_t diff;
diff.push_back(make_pair(1,2));
diff.push_back(make_pair(7,8));
diffs.push_back(diff);
statscores_t scores;
scorer->score(candidates,diffs,scores);
diffs_t diffs;
diff_t diff;
diff.push_back(make_pair(1,2));
diff.push_back(make_pair(7,8));
diffs.push_back(diff);
cout << "Bleus: " << scores[0] << " " << scores[1] << endl;
statscores_t scores;
scorer->score(candidates,diffs,scores);
//try the per
scorer = new PerScorer();
Data pd(*scorer);
scorer->setReferenceFiles(references);
cout << "Bleus: " << scores[0] << " " << scores[1] << endl;
pd.loadnbest("test_scorer_data/nbest.out");
//sd.savetxt();
//try the per
scorer = new PerScorer();
Data pd(*scorer);
scorer->setReferenceFiles(references);
ScoreData* psd=pd.getScoreData();
scorer->setScoreData(psd);
for (size_t i = 0; i < psd->size(); ++i) {
psd->get(i,0).savetxt("/dev/stdout");
}
pd.loadnbest("test_scorer_data/nbest.out");
//sd.savetxt();
ScoreData* psd=pd.getScoreData();
scorer->setScoreData(psd);
for (size_t i = 0; i < psd->size(); ++i) {
psd->get(i,0).savetxt("/dev/stdout");
}
cout << "PER: " << scorer->score(candidates) << endl;
cout << "PER: " << scorer->score(candidates) << endl;
}