diff --git a/mert/BleuScorer.cpp b/mert/BleuScorer.cpp index 4993439e3..a865aa3ce 100644 --- a/mert/BleuScorer.cpp +++ b/mert/BleuScorer.cpp @@ -1,199 +1,204 @@ #include "BleuScorer.h" -BleuScorer::BleuScorer(const string& config = "") : StatisticsBasedScorer("BLEU",config),_refLengthStrategy(BLEU_CLOSEST) { - //configure regularisation - static string KEY_REFLEN = "reflen"; - static string REFLEN_AVERAGE = "average"; - static string REFLEN_SHORTEST = "shortest"; - static string REFLEN_CLOSEST = "closest"; +BleuScorer::BleuScorer(const string& config = "") : StatisticsBasedScorer("BLEU",config),_refLengthStrategy(BLEU_CLOSEST) +{ + //configure regularisation + static string KEY_REFLEN = "reflen"; + static string REFLEN_AVERAGE = "average"; + static string REFLEN_SHORTEST = "shortest"; + static string REFLEN_CLOSEST = "closest"; - string reflen = getConfig(KEY_REFLEN,REFLEN_CLOSEST); - if (reflen == REFLEN_AVERAGE) { - _refLengthStrategy = BLEU_AVERAGE; - } else if (reflen == REFLEN_SHORTEST) { - _refLengthStrategy = BLEU_SHORTEST; - } else if (reflen == REFLEN_CLOSEST) { - _refLengthStrategy = BLEU_CLOSEST; - } else { - throw runtime_error("Unknown reference length strategy: " + reflen); - } - cerr << "Using reference length strategy: " << reflen << endl; + string reflen = getConfig(KEY_REFLEN,REFLEN_CLOSEST); + if (reflen == REFLEN_AVERAGE) { + _refLengthStrategy = BLEU_AVERAGE; + } else if (reflen == REFLEN_SHORTEST) { + _refLengthStrategy = BLEU_SHORTEST; + } else if (reflen == REFLEN_CLOSEST) { + _refLengthStrategy = BLEU_CLOSEST; + } else { + throw runtime_error("Unknown reference length strategy: " + reflen); + } + cerr << "Using reference length strategy: " << reflen << endl; - static string KEY_NGRAMS = "ngramlen"; - string ngramlen = getConfig(KEY_NGRAMS,"4"); + static string KEY_NGRAMS = "ngramlen"; + string ngramlen = getConfig(KEY_NGRAMS,"4"); - LENGTH = strtol(ngramlen.c_str(), NULL, 10); + LENGTH = strtol(ngramlen.c_str(), NULL, 10); } /** * count the ngrams of each type, up to the given length in the input line. **/ -size_t BleuScorer::countNgrams(const string& line, counts_t& counts, unsigned int n) { - vector encoded_tokens; - //cerr << line << endl; - encode(line,encoded_tokens); - //copy(encoded_tokens.begin(), encoded_tokens.end(), ostream_iterator(cerr," ")); - //cerr << endl; - for (size_t k = 1; k <= n; ++k) { - //ngram order longer than sentence - no point - if (k > encoded_tokens.size()) { - continue; - } - for (size_t i = 0; i < encoded_tokens.size()-k+1; ++i) { - vector ngram; - for (size_t j = i; j < i+k && j < encoded_tokens.size(); ++j) { - ngram.push_back(encoded_tokens[j]); - } - int count = 1; - counts_it oldcount = counts.find(ngram); - if (oldcount != counts.end()) { - count = (oldcount->second) + 1; - } - //cerr << count << endl; - counts[ngram] = count; - //cerr << endl; - } - } - //cerr << "counted ngrams" << endl; - //dump_counts(counts); - return encoded_tokens.size(); +size_t BleuScorer::countNgrams(const string& line, counts_t& counts, unsigned int n) +{ + vector encoded_tokens; + //cerr << line << endl; + encode(line,encoded_tokens); + //copy(encoded_tokens.begin(), encoded_tokens.end(), ostream_iterator(cerr," ")); + //cerr << endl; + for (size_t k = 1; k <= n; ++k) { + //ngram order longer than sentence - no point + if (k > encoded_tokens.size()) { + continue; + } + for (size_t i = 0; i < encoded_tokens.size()-k+1; ++i) { + vector ngram; + for (size_t j = i; j < i+k && j < encoded_tokens.size(); ++j) { + ngram.push_back(encoded_tokens[j]); + } + int count = 1; + counts_it oldcount = counts.find(ngram); + if (oldcount != counts.end()) { + count = (oldcount->second) + 1; + } + //cerr << count << endl; + counts[ngram] = count; + //cerr << endl; + } + } + //cerr << "counted ngrams" << endl; + //dump_counts(counts); + return encoded_tokens.size(); } -void BleuScorer::setReferenceFiles(const vector& referenceFiles) { - //make sure reference data is clear - _refcounts.clear(); - _reflengths.clear(); - _encodings.clear(); +void BleuScorer::setReferenceFiles(const vector& referenceFiles) +{ + //make sure reference data is clear + _refcounts.clear(); + _reflengths.clear(); + _encodings.clear(); - //load reference data - for (size_t i = 0; i < referenceFiles.size(); ++i) { - TRACE_ERR("Loading reference from " << referenceFiles[i] << endl); - ifstream refin(referenceFiles[i].c_str()); - if (!refin) { - throw runtime_error("Unable to open: " + referenceFiles[i]); - } - string line; - size_t sid = 0; //sentence counter - while (getline(refin,line)) { - //cerr << line << endl; - if (i == 0) { - counts_t* counts = new counts_t(); //these get leaked - _refcounts.push_back(counts); - vector lengths; - _reflengths.push_back(lengths); - } - if (_refcounts.size() <= sid) { - throw runtime_error("File " + referenceFiles[i] + " has too many sentences"); - } - counts_t counts; - size_t length = countNgrams(line,counts,LENGTH); - //for any counts larger than those already there, merge them in - for (counts_it ci = counts.begin(); ci != counts.end(); ++ci) { - counts_it oldcount_it = _refcounts[sid]->find(ci->first); - int oldcount = 0; - if (oldcount_it != _refcounts[sid]->end()) { - oldcount = oldcount_it->second; - } - int newcount = ci->second; - if (newcount > oldcount) { - _refcounts[sid]->operator[](ci->first) = newcount; - } - } - //add in the length - _reflengths[sid].push_back(length); - if (sid > 0 && sid % 100 == 0) { - TRACE_ERR("."); - } - ++sid; - } - refin.close(); - TRACE_ERR(endl); - } + //load reference data + for (size_t i = 0; i < referenceFiles.size(); ++i) { + TRACE_ERR("Loading reference from " << referenceFiles[i] << endl); + ifstream refin(referenceFiles[i].c_str()); + if (!refin) { + throw runtime_error("Unable to open: " + referenceFiles[i]); + } + string line; + size_t sid = 0; //sentence counter + while (getline(refin,line)) { + //cerr << line << endl; + if (i == 0) { + counts_t* counts = new counts_t(); //these get leaked + _refcounts.push_back(counts); + vector lengths; + _reflengths.push_back(lengths); + } + if (_refcounts.size() <= sid) { + throw runtime_error("File " + referenceFiles[i] + " has too many sentences"); + } + counts_t counts; + size_t length = countNgrams(line,counts,LENGTH); + //for any counts larger than those already there, merge them in + for (counts_it ci = counts.begin(); ci != counts.end(); ++ci) { + counts_it oldcount_it = _refcounts[sid]->find(ci->first); + int oldcount = 0; + if (oldcount_it != _refcounts[sid]->end()) { + oldcount = oldcount_it->second; + } + int newcount = ci->second; + if (newcount > oldcount) { + _refcounts[sid]->operator[](ci->first) = newcount; + } + } + //add in the length + _reflengths[sid].push_back(length); + if (sid > 0 && sid % 100 == 0) { + TRACE_ERR("."); + } + ++sid; + } + refin.close(); + TRACE_ERR(endl); + } } -void BleuScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry) { +void BleuScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry) +{ // cerr << text << endl; // cerr << sid << endl; - //dump_counts(*_refcounts[sid]); - if (sid >= _refcounts.size()) { - stringstream msg; - msg << "Sentence id (" << sid << ") not found in reference set"; - throw runtime_error(msg.str()); - } - counts_t testcounts; - //stats for this line - vector stats(LENGTH*2);; - size_t length = countNgrams(text,testcounts,LENGTH); - //dump_counts(testcounts); - if (_refLengthStrategy == BLEU_SHORTEST) { - //cerr << reflengths.size() << " " << sid << endl; - int shortest = *min_element(_reflengths[sid].begin(),_reflengths[sid].end()); - stats.push_back(shortest); - } else if (_refLengthStrategy == BLEU_AVERAGE) { - int total = 0; - for (size_t i = 0; i < _reflengths[sid].size(); ++i) { - total += _reflengths[sid][i]; - } - float mean = (float)total/_reflengths[sid].size(); - stats.push_back(mean); - } else if (_refLengthStrategy == BLEU_CLOSEST) { - int min_diff = INT_MAX; - int min_idx = 0; - for (size_t i = 0; i < _reflengths[sid].size(); ++i) { - int reflength = _reflengths[sid][i]; - if (abs(reflength-(int)length) < abs(min_diff)) { //look for the closest reference - min_diff = reflength-length; - min_idx = i; - }else if (abs(reflength-(int)length) == abs(min_diff)) { // if two references has the same closest length, take the shortest - if (reflength < (int)_reflengths[sid][min_idx]){ - min_idx = i; - } - } - } - stats.push_back(_reflengths[sid][min_idx]); - } else { - throw runtime_error("Unsupported reflength strategy"); - } - //cerr << "computed length" << endl; - //precision on each ngram type - for (counts_it testcounts_it = testcounts.begin(); - testcounts_it != testcounts.end(); ++testcounts_it) { - counts_it refcounts_it = _refcounts[sid]->find(testcounts_it->first); - int correct = 0; - int guess = testcounts_it->second; - if (refcounts_it != _refcounts[sid]->end()) { - correct = min(refcounts_it->second,guess); - } - size_t len = testcounts_it->first.size(); - stats[len*2-2] += correct; - stats[len*2-1] += guess; - } - stringstream sout; - copy(stats.begin(),stats.end(),ostream_iterator(sout," ")); - //TRACE_ERR(sout.str() << endl); - string stats_str = sout.str(); - entry.set(stats_str); + //dump_counts(*_refcounts[sid]); + if (sid >= _refcounts.size()) { + stringstream msg; + msg << "Sentence id (" << sid << ") not found in reference set"; + throw runtime_error(msg.str()); + } + counts_t testcounts; + //stats for this line + vector stats(LENGTH*2);; + size_t length = countNgrams(text,testcounts,LENGTH); + //dump_counts(testcounts); + if (_refLengthStrategy == BLEU_SHORTEST) { + //cerr << reflengths.size() << " " << sid << endl; + int shortest = *min_element(_reflengths[sid].begin(),_reflengths[sid].end()); + stats.push_back(shortest); + } else if (_refLengthStrategy == BLEU_AVERAGE) { + int total = 0; + for (size_t i = 0; i < _reflengths[sid].size(); ++i) { + total += _reflengths[sid][i]; + } + float mean = (float)total/_reflengths[sid].size(); + stats.push_back(mean); + } else if (_refLengthStrategy == BLEU_CLOSEST) { + int min_diff = INT_MAX; + int min_idx = 0; + for (size_t i = 0; i < _reflengths[sid].size(); ++i) { + int reflength = _reflengths[sid][i]; + if (abs(reflength-(int)length) < abs(min_diff)) { //look for the closest reference + min_diff = reflength-length; + min_idx = i; + } else if (abs(reflength-(int)length) == abs(min_diff)) { // if two references has the same closest length, take the shortest + if (reflength < (int)_reflengths[sid][min_idx]) { + min_idx = i; + } + } + } + stats.push_back(_reflengths[sid][min_idx]); + } else { + throw runtime_error("Unsupported reflength strategy"); + } + //cerr << "computed length" << endl; + //precision on each ngram type + for (counts_it testcounts_it = testcounts.begin(); + testcounts_it != testcounts.end(); ++testcounts_it) { + counts_it refcounts_it = _refcounts[sid]->find(testcounts_it->first); + int correct = 0; + int guess = testcounts_it->second; + if (refcounts_it != _refcounts[sid]->end()) { + correct = min(refcounts_it->second,guess); + } + size_t len = testcounts_it->first.size(); + stats[len*2-2] += correct; + stats[len*2-1] += guess; + } + stringstream sout; + copy(stats.begin(),stats.end(),ostream_iterator(sout," ")); + //TRACE_ERR(sout.str() << endl); + string stats_str = sout.str(); + entry.set(stats_str); } -float BleuScorer::calculateScore(const vector& comps) { - //cerr << "BLEU: "; - //copy(comps.begin(),comps.end(), ostream_iterator(cerr," ")); - float logbleu = 0.0; - for (int i = 0; i < LENGTH; ++i) { - if (comps[2*i] == 0) { - return 0.0; - } - logbleu += log(comps[2*i]) - log(comps[2*i+1]); - - } - logbleu /= LENGTH; - float brevity = 1.0 - (float)comps[LENGTH*2]/comps[1];//reflength divided by test length - if (brevity < 0.0) { - logbleu += brevity; - } - //cerr << " " << exp(logbleu) << endl; - return exp(logbleu); +float BleuScorer::calculateScore(const vector& comps) +{ + //cerr << "BLEU: "; + //copy(comps.begin(),comps.end(), ostream_iterator(cerr," ")); + float logbleu = 0.0; + for (int i = 0; i < LENGTH; ++i) { + if (comps[2*i] == 0) { + return 0.0; + } + logbleu += log(comps[2*i]) - log(comps[2*i+1]); + + } + logbleu /= LENGTH; + float brevity = 1.0 - (float)comps[LENGTH*2]/comps[1];//reflength divided by test length + if (brevity < 0.0) { + logbleu += brevity; + } + //cerr << " " << exp(logbleu) << endl; + return exp(logbleu); } diff --git a/mert/BleuScorer.h b/mert/BleuScorer.h index 1a052edb3..e853856bf 100644 --- a/mert/BleuScorer.h +++ b/mert/BleuScorer.h @@ -23,73 +23,74 @@ enum BleuReferenceLengthStrategy { BLEU_AVERAGE, BLEU_SHORTEST, BLEU_CLOSEST }; /** * Bleu scoring **/ -class BleuScorer: public StatisticsBasedScorer { - public: - BleuScorer(const string& config); - virtual void setReferenceFiles(const vector& referenceFiles); - virtual void prepareStats(size_t sid, const string& text, ScoreStats& entry); - int LENGTH; - - size_t NumberOfScores() const { - //cerr << "BleuScorer: " << (2 * LENGTH + 1) << endl; - return (2 * LENGTH + 1); - }; - bool useAlignment() const { - //cout << "BleuScorer::useAlignment returning false" << endl; - return false; - }; +class BleuScorer: public StatisticsBasedScorer +{ +public: + BleuScorer(const string& config); + virtual void setReferenceFiles(const vector& referenceFiles); + virtual void prepareStats(size_t sid, const string& text, ScoreStats& entry); + int LENGTH; - - - protected: - float calculateScore(const vector& comps); - - private: - //no copy - BleuScorer(const BleuScorer&); - ~BleuScorer(){}; - BleuScorer& operator=(const BleuScorer&); - //Used to construct the ngram map - struct CompareNgrams { - int operator() (const vector& a, const vector& b) { - size_t i; - size_t as = a.size(); - size_t bs = b.size(); - for (i = 0; i < as && i < bs; ++i) { - if (a[i] < b[i]) { - //cerr << "true" << endl; - return true; - } - if (a[i] > b[i]) { - //cerr << "false" << endl; - return false; - } - } - //entries are equal, shortest wins - return as < bs;; - } - }; + size_t NumberOfScores() const { + //cerr << "BleuScorer: " << (2 * LENGTH + 1) << endl; + return (2 * LENGTH + 1); + }; + bool useAlignment() const { + //cout << "BleuScorer::useAlignment returning false" << endl; + return false; + }; - typedef map,int,CompareNgrams> counts_t; - typedef map,int,CompareNgrams>::iterator counts_it; - typedef vector refcounts_t; - size_t countNgrams(const string& line, counts_t& counts, unsigned int n); +protected: + float calculateScore(const vector& comps); - void dump_counts(counts_t& counts) { - for (counts_it i = counts.begin(); i != counts.end(); ++i) { - cerr << "("; - copy(i->first.begin(), i->first.end(), ostream_iterator(cerr," ")); - cerr << ") " << i->second << ", "; - } - cerr << endl; - } - BleuReferenceLengthStrategy _refLengthStrategy; - - // data extracted from reference files - refcounts_t _refcounts; - vector > _reflengths; +private: + //no copy + BleuScorer(const BleuScorer&); + ~BleuScorer() {}; + BleuScorer& operator=(const BleuScorer&); + //Used to construct the ngram map + struct CompareNgrams { + int operator() (const vector& a, const vector& b) { + size_t i; + size_t as = a.size(); + size_t bs = b.size(); + for (i = 0; i < as && i < bs; ++i) { + if (a[i] < b[i]) { + //cerr << "true" << endl; + return true; + } + if (a[i] > b[i]) { + //cerr << "false" << endl; + return false; + } + } + //entries are equal, shortest wins + return as < bs;; + } + }; + + typedef map,int,CompareNgrams> counts_t; + typedef map,int,CompareNgrams>::iterator counts_it; + + typedef vector refcounts_t; + + size_t countNgrams(const string& line, counts_t& counts, unsigned int n); + + void dump_counts(counts_t& counts) { + for (counts_it i = counts.begin(); i != counts.end(); ++i) { + cerr << "("; + copy(i->first.begin(), i->first.end(), ostream_iterator(cerr," ")); + cerr << ") " << i->second << ", "; + } + cerr << endl; + } + BleuReferenceLengthStrategy _refLengthStrategy; + + // data extracted from reference files + refcounts_t _refcounts; + vector > _reflengths; }; diff --git a/mert/Data.cpp b/mert/Data.cpp index b0aa08e32..0527a8851 100644 --- a/mert/Data.cpp +++ b/mert/Data.cpp @@ -13,114 +13,113 @@ Data::Data(Scorer& ptr): -theScorer(&ptr) + theScorer(&ptr) { - score_type = (*theScorer).getName(); - TRACE_ERR("Data::score_type " << score_type << std::endl); - - TRACE_ERR("Data::Scorer type from Scorer: " << theScorer->getName() << endl); + score_type = (*theScorer).getName(); + TRACE_ERR("Data::score_type " << score_type << std::endl); + + TRACE_ERR("Data::Scorer type from Scorer: " << theScorer->getName() << endl); featdata=new FeatureData; scoredata=new ScoreData(*theScorer); }; void Data::loadnbest(const std::string &file) { - TRACE_ERR("loading nbest from " << file << std::endl); + TRACE_ERR("loading nbest from " << file << std::endl); - FeatureStats featentry; - ScoreStats scoreentry; - std::string sentence_index; + FeatureStats featentry; + ScoreStats scoreentry; + std::string sentence_index; - inputfilestream inp(file); // matches a stream with a file. Opens the file + inputfilestream inp(file); // matches a stream with a file. Opens the file - if (!inp.good()) - throw runtime_error("Unable to open: " + file); + if (!inp.good()) + throw runtime_error("Unable to open: " + file); - std::string substring, subsubstring, stringBuf; - std::string theSentence; - std::string theFeatures; - std::string theAlignment; - std::string::size_type loc; + std::string substring, subsubstring, stringBuf; + std::string theSentence; + std::string theFeatures; + std::string theAlignment; + std::string::size_type loc; - while (getline(inp,stringBuf,'\n')){ - if (stringBuf.empty()) continue; + while (getline(inp,stringBuf,'\n')) { + if (stringBuf.empty()) continue; -// TRACE_ERR("stringBuf: " << stringBuf << std::endl); +// TRACE_ERR("stringBuf: " << stringBuf << std::endl); - getNextPound(stringBuf, substring, "|||"); //first field - sentence_index = substring; + getNextPound(stringBuf, substring, "|||"); //first field + sentence_index = substring; - getNextPound(stringBuf, substring, "|||"); //second field - theSentence = substring; + getNextPound(stringBuf, substring, "|||"); //second field + theSentence = substring; // adding statistics for error measures - featentry.reset(); - scoreentry.clear(); + featentry.reset(); + scoreentry.clear(); - getNextPound(stringBuf, substring, "|||"); //third field - theFeatures = substring; + getNextPound(stringBuf, substring, "|||"); //third field + theFeatures = substring; - if (stringBuf.length() > 0) { - getNextPound(stringBuf, substring, "|||"); //fourth field sentence score - if (stringBuf.length() > 0) { - getNextPound(stringBuf, substring, "|||"); //fourth field only there if alignment scorer - theAlignment = substring; - } - } - //TODO check alignment exists if scorers need it + if (stringBuf.length() > 0) { + getNextPound(stringBuf, substring, "|||"); //fourth field sentence score + if (stringBuf.length() > 0) { + getNextPound(stringBuf, substring, "|||"); //fourth field only there if alignment scorer + theAlignment = substring; + } + } + //TODO check alignment exists if scorers need it - if (!theScorer->useAlignment()) { - theScorer->prepareStats(sentence_index, theSentence, scoreentry); - } else { - //an interpolated score would need both sentence and alignment - theSentence += "|||"; - theSentence += theAlignment; - theScorer->prepareStats(sentence_index, theSentence, scoreentry); - } + if (!theScorer->useAlignment()) { + theScorer->prepareStats(sentence_index, theSentence, scoreentry); + } else { + //an interpolated score would need both sentence and alignment + theSentence += "|||"; + theSentence += theAlignment; + theScorer->prepareStats(sentence_index, theSentence, scoreentry); + } - scoredata->add(scoreentry, sentence_index); + scoredata->add(scoreentry, sentence_index); - if (!existsFeatureNames()){ - std::string stringsupport=theFeatures; - // adding feature names - std::string features=""; - std::string tmpname=""; + if (!existsFeatureNames()) { + std::string stringsupport=theFeatures; + // adding feature names + std::string features=""; + std::string tmpname=""; + + size_t tmpidx=0; + while (!stringsupport.empty()) { + // TRACE_ERR("Decompounding: " << substring << std::endl); + getNextPound(stringsupport, subsubstring); + + // string ending with ":" are skipped, because they are the names of the features + if ((loc = subsubstring.find(":")) != subsubstring.length()-1) { + features+=tmpname+"_"+stringify(tmpidx)+" "; + tmpidx++; + } else { + tmpidx=0; + tmpname=subsubstring.substr(0,subsubstring.size() - 1); + } + } + + featdata->setFeatureMap(features); + } - size_t tmpidx=0; - while (!stringsupport.empty()){ - // TRACE_ERR("Decompounding: " << substring << std::endl); - getNextPound(stringsupport, subsubstring); - - // string ending with ":" are skipped, because they are the names of the features - if ((loc = subsubstring.find(":")) != subsubstring.length()-1){ - features+=tmpname+"_"+stringify(tmpidx)+" "; - tmpidx++; - } - else{ - tmpidx=0; - tmpname=subsubstring.substr(0,subsubstring.size() - 1); - } - } - - featdata->setFeatureMap(features); - } - // adding features - while (!theFeatures.empty()){ -// TRACE_ERR("Decompounding: " << theFeatures << std::endl); - getNextPound(theFeatures, subsubstring); + while (!theFeatures.empty()) { +// TRACE_ERR("Decompounding: " << theFeatures << std::endl); + getNextPound(theFeatures, subsubstring); // string ending with ":" are skipped, because they are the names of the features - if ((loc = subsubstring.find(":")) != subsubstring.length()-1){ - featentry.add(ATOFST(subsubstring.c_str())); - } - } - featdata->add(featentry,sentence_index); - } - - inp.close(); + if ((loc = subsubstring.find(":")) != subsubstring.length()-1) { + featentry.add(ATOFST(subsubstring.c_str())); + } + } + featdata->add(featentry,sentence_index); + } + + inp.close(); } diff --git a/mert/Data.h b/mert/Data.h index 22996f69f..e46b7dc18 100644 --- a/mert/Data.h +++ b/mert/Data.h @@ -24,49 +24,70 @@ class Scorer; class Data { protected: - ScoreData* scoredata; - FeatureData* featdata; - + ScoreData* scoredata; + FeatureData* featdata; + private: - Scorer* theScorer; + Scorer* theScorer; std::string score_type; - size_t number_of_scores; //number of scores - + size_t number_of_scores; //number of scores + public: - Data(Scorer& sc); - - ~Data(){}; - - inline void clear() { scoredata->clear(); featdata->clear(); } - - ScoreData* getScoreData() { return scoredata; }; - FeatureData* getFeatureData() { return featdata; }; - - inline size_t NumberOfFeatures() const{ return featdata->NumberOfFeatures(); } - inline void NumberOfFeatures(size_t v){ featdata->NumberOfFeatures(v); } - inline std::string Features() const{ return featdata->Features(); } - inline void Features(const std::string f){ featdata->Features(f); } + Data(Scorer& sc); - void loadnbest(const std::string &file); + ~Data() {}; - void load(const std::string &featfile,const std::string &scorefile){ - featdata->load(featfile); - scoredata->load(scorefile); + inline void clear() { + scoredata->clear(); + featdata->clear(); } - - void save(const std::string &featfile,const std::string &scorefile, bool bin=false){ - - if (bin) cerr << "Binary write mode is selected" << endl; - else cerr << "Binary write mode is NOT selected" << endl; - - featdata->save(featfile, bin); - scoredata->save(scorefile, bin); - } - inline bool existsFeatureNames(){ return featdata->existsFeatureNames(); }; - - inline std::string getFeatureName(size_t idx){ return featdata->getFeatureName(idx); }; - inline size_t getFeatureIndex(const std::string& name){ return featdata->getFeatureIndex(name); }; + ScoreData* getScoreData() { + return scoredata; + }; + FeatureData* getFeatureData() { + return featdata; + }; + + inline size_t NumberOfFeatures() const { + return featdata->NumberOfFeatures(); + } + inline void NumberOfFeatures(size_t v) { + featdata->NumberOfFeatures(v); + } + inline std::string Features() const { + return featdata->Features(); + } + inline void Features(const std::string f) { + featdata->Features(f); + } + + void loadnbest(const std::string &file); + + void load(const std::string &featfile,const std::string &scorefile) { + featdata->load(featfile); + scoredata->load(scorefile); + } + + void save(const std::string &featfile,const std::string &scorefile, bool bin=false) { + + if (bin) cerr << "Binary write mode is selected" << endl; + else cerr << "Binary write mode is NOT selected" << endl; + + featdata->save(featfile, bin); + scoredata->save(scorefile, bin); + } + + inline bool existsFeatureNames() { + return featdata->existsFeatureNames(); + }; + + inline std::string getFeatureName(size_t idx) { + return featdata->getFeatureName(idx); + }; + inline size_t getFeatureIndex(const std::string& name) { + return featdata->getFeatureIndex(name); + }; }; diff --git a/mert/FeatureArray.cpp b/mert/FeatureArray.cpp index 4645e1aef..b5c6bc8d1 100644 --- a/mert/FeatureArray.cpp +++ b/mert/FeatureArray.cpp @@ -16,137 +16,137 @@ FeatureArray::FeatureArray(): idx("") void FeatureArray::savetxt(std::ofstream& outFile) { - outFile << FEATURES_TXT_BEGIN << " " << idx << " " << array_.size() - << " " << number_of_features << " " << features << std::endl; - for (featarray_t::iterator i = array_.begin(); i !=array_.end(); i++){ - i->savetxt(outFile); - outFile << std::endl; - } - outFile << FEATURES_TXT_END << std::endl; + outFile << FEATURES_TXT_BEGIN << " " << idx << " " << array_.size() + << " " << number_of_features << " " << features << std::endl; + for (featarray_t::iterator i = array_.begin(); i !=array_.end(); i++) { + i->savetxt(outFile); + outFile << std::endl; + } + outFile << FEATURES_TXT_END << std::endl; } void FeatureArray::savebin(std::ofstream& outFile) { - outFile << FEATURES_BIN_BEGIN << " " << idx << " " << array_.size() - << " " << number_of_features << " " << features << std::endl; + outFile << FEATURES_BIN_BEGIN << " " << idx << " " << array_.size() + << " " << number_of_features << " " << features << std::endl; for (featarray_t::iterator i = array_.begin(); i !=array_.end(); i++) - i->savebin(outFile); + i->savebin(outFile); - outFile << FEATURES_BIN_END << std::endl; + outFile << FEATURES_BIN_END << std::endl; } void FeatureArray::save(std::ofstream& inFile, bool bin) { - if (size()>0) - (bin)?savebin(inFile):savetxt(inFile); + if (size()>0) + (bin)?savebin(inFile):savetxt(inFile); } void FeatureArray::save(const std::string &file, bool bin) { - std::ofstream outFile(file.c_str(), std::ios::out); // matches a stream with a file. Opens the file + std::ofstream outFile(file.c_str(), std::ios::out); // matches a stream with a file. Opens the file - save(outFile); + save(outFile); - outFile.close(); + outFile.close(); } void FeatureArray::loadbin(ifstream& inFile, size_t n) { - FeatureStats entry(number_of_features); + FeatureStats entry(number_of_features); - for (size_t i=0 ; i < n; i++){ - entry.loadbin(inFile); - add(entry); - } + for (size_t i=0 ; i < n; i++) { + entry.loadbin(inFile); + add(entry); + } } void FeatureArray::loadtxt(ifstream& inFile, size_t n) { - FeatureStats entry(number_of_features); - - for (size_t i=0 ; i < n; i++){ - entry.loadtxt(inFile); - add(entry); - } + FeatureStats entry(number_of_features); + + for (size_t i=0 ; i < n; i++) { + entry.loadtxt(inFile); + add(entry); + } } void FeatureArray::load(ifstream& inFile) { size_t number_of_entries=0; - bool binmode=false; - - std::string substring, stringBuf; + bool binmode=false; + + std::string substring, stringBuf; std::string::size_type loc; - std::getline(inFile, stringBuf); - if (!inFile.good()){ - return; - } + std::getline(inFile, stringBuf); + if (!inFile.good()) { + return; + } - if (!stringBuf.empty()){ - if ((loc = stringBuf.find(FEATURES_TXT_BEGIN)) == 0){ - binmode=false; - }else if ((loc = stringBuf.find(FEATURES_BIN_BEGIN)) == 0){ - binmode=true; - }else{ - TRACE_ERR("ERROR: FeatureArray::load(): Wrong header"); - return; - } - getNextPound(stringBuf, substring); - getNextPound(stringBuf, substring); + if (!stringBuf.empty()) { + if ((loc = stringBuf.find(FEATURES_TXT_BEGIN)) == 0) { + binmode=false; + } else if ((loc = stringBuf.find(FEATURES_BIN_BEGIN)) == 0) { + binmode=true; + } else { + TRACE_ERR("ERROR: FeatureArray::load(): Wrong header"); + return; + } + getNextPound(stringBuf, substring); + getNextPound(stringBuf, substring); idx = substring; - getNextPound(stringBuf, substring); + getNextPound(stringBuf, substring); number_of_entries = atoi(substring.c_str()); - getNextPound(stringBuf, substring); + getNextPound(stringBuf, substring); number_of_features = atoi(substring.c_str()); - features = stringBuf; - } + features = stringBuf; + } - (binmode)?loadbin(inFile, number_of_entries):loadtxt(inFile, number_of_entries); + (binmode)?loadbin(inFile, number_of_entries):loadtxt(inFile, number_of_entries); - std::getline(inFile, stringBuf); - if (!stringBuf.empty()){ - if ((loc = stringBuf.find(FEATURES_TXT_END)) != 0 && (loc = stringBuf.find(FEATURES_BIN_END)) != 0){ - TRACE_ERR("ERROR: FeatureArray::load(): Wrong footer"); - return; - } - } + std::getline(inFile, stringBuf); + if (!stringBuf.empty()) { + if ((loc = stringBuf.find(FEATURES_TXT_END)) != 0 && (loc = stringBuf.find(FEATURES_BIN_END)) != 0) { + TRACE_ERR("ERROR: FeatureArray::load(): Wrong footer"); + return; + } + } } void FeatureArray::load(const std::string &file) { - TRACE_ERR("loading data from " << file << std::endl); + TRACE_ERR("loading data from " << file << std::endl); - inputfilestream inFile(file); // matches a stream with a file. Opens the file + inputfilestream inFile(file); // matches a stream with a file. Opens the file - load((ifstream&) inFile); + load((ifstream&) inFile); - inFile.close(); + inFile.close(); } void FeatureArray::merge(FeatureArray& e) { - //dummy implementation - for (size_t i=0; isize()!=sz) - return false; - - return true; + size_t sz = NumberOfFeatures(); + + if (sz == 0) + return true; + + for (featarray_t::iterator i=array_.begin(); i!=array_.end(); i++) + if (i->size()!=sz) + return false; + + return true; } diff --git a/mert/FeatureArray.h b/mert/FeatureArray.h index d08bf54db..f707d0d98 100644 --- a/mert/FeatureArray.h +++ b/mert/FeatureArray.h @@ -27,47 +27,71 @@ using namespace std; class FeatureArray { protected: - featarray_t array_; - size_t number_of_features; - std::string features; - + featarray_t array_; + size_t number_of_features; + std::string features; + private: - std::string idx; // idx to identify the utterance, it can differ from the index inside the vector - + std::string idx; // idx to identify the utterance, it can differ from the index inside the vector + public: - FeatureArray(); - - ~FeatureArray(){}; - - inline void clear() { array_.clear(); } - - inline std::string getIndex(){ return idx; } - inline void setIndex(const std::string & value){ idx=value; } + FeatureArray(); - inline FeatureStats& get(size_t i){ return array_.at(i); } - inline const FeatureStats& get(size_t i)const{ return array_.at(i); } - void add(FeatureStats e){ array_.push_back(e); } + ~FeatureArray() {}; - void merge(FeatureArray& e); + inline void clear() { + array_.clear(); + } - inline size_t size(){ return array_.size(); } - inline size_t NumberOfFeatures() const{ return number_of_features; } - inline void NumberOfFeatures(size_t v){ number_of_features = v; } - inline std::string Features() const{ return features; } - inline void Features(const std::string f){ features = f; } - - void savetxt(ofstream& outFile); - void savebin(ofstream& outFile); - void save(ofstream& outFile, bool bin=false); - void save(const std::string &file, bool bin=false); - inline void save(bool bin=false){ save("/dev/stdout",bin); } + inline std::string getIndex() { + return idx; + } + inline void setIndex(const std::string & value) { + idx=value; + } - void loadtxt(ifstream& inFile, size_t n); - void loadbin(ifstream& inFile, size_t n); - void load(ifstream& inFile); - void load(const std::string &file); - - bool check_consistency(); + inline FeatureStats& get(size_t i) { + return array_.at(i); + } + inline const FeatureStats& get(size_t i)const { + return array_.at(i); + } + void add(FeatureStats e) { + array_.push_back(e); + } + + void merge(FeatureArray& e); + + inline size_t size() { + return array_.size(); + } + inline size_t NumberOfFeatures() const { + return number_of_features; + } + inline void NumberOfFeatures(size_t v) { + number_of_features = v; + } + inline std::string Features() const { + return features; + } + inline void Features(const std::string f) { + features = f; + } + + void savetxt(ofstream& outFile); + void savebin(ofstream& outFile); + void save(ofstream& outFile, bool bin=false); + void save(const std::string &file, bool bin=false); + inline void save(bool bin=false) { + save("/dev/stdout",bin); + } + + void loadtxt(ifstream& inFile, size_t n); + void loadbin(ifstream& inFile, size_t n); + void load(ifstream& inFile); + void load(const std::string &file); + + bool check_consistency(); }; diff --git a/mert/FeatureData.cpp b/mert/FeatureData.cpp index 6fc390b42..9f8272b22 100644 --- a/mert/FeatureData.cpp +++ b/mert/FeatureData.cpp @@ -18,127 +18,127 @@ FeatureData::FeatureData() {}; void FeatureData::save(std::ofstream& outFile, bool bin) { - for (featdata_t::iterator i = array_.begin(); i !=array_.end(); i++) - i->save(outFile, bin); + for (featdata_t::iterator i = array_.begin(); i !=array_.end(); i++) + i->save(outFile, bin); } void FeatureData::save(const std::string &file, bool bin) { - if (file.empty()) return; + if (file.empty()) return; - TRACE_ERR("saving the array into " << file << std::endl); + TRACE_ERR("saving the array into " << file << std::endl); - std::ofstream outFile(file.c_str(), std::ios::out); // matches a stream with a file. Opens the file + std::ofstream outFile(file.c_str(), std::ios::out); // matches a stream with a file. Opens the file - save(outFile, bin); + save(outFile, bin); - outFile.close(); + outFile.close(); } void FeatureData::load(ifstream& inFile) { FeatureArray entry; - while (!inFile.eof()){ + while (!inFile.eof()) { - if (!inFile.good()){ - std::cerr << "ERROR FeatureData::load inFile.good()" << std::endl; - } + if (!inFile.good()) { + std::cerr << "ERROR FeatureData::load inFile.good()" << std::endl; + } - entry.clear(); - entry.load(inFile); + entry.clear(); + entry.load(inFile); - if (entry.size() == 0) - break; + if (entry.size() == 0) + break; - if (size() == 0){ - setFeatureMap(entry.Features()); - } - add(entry); - } + if (size() == 0) { + setFeatureMap(entry.Features()); + } + add(entry); + } } void FeatureData::load(const std::string &file) { - TRACE_ERR("loading feature data from " << file << std::endl); + TRACE_ERR("loading feature data from " << file << std::endl); - inputfilestream inFile(file); // matches a stream with a file. Opens the file + inputfilestream inFile(file); // matches a stream with a file. Opens the file - if (!inFile) { - throw runtime_error("Unable to open feature file: " + file); - } + if (!inFile) { + throw runtime_error("Unable to open feature file: " + file); + } - load((ifstream&) inFile); + load((ifstream&) inFile); - inFile.close(); + inFile.close(); } -void FeatureData::add(FeatureArray& e){ - if (exists(e.getIndex())){ // array at position e.getIndex() already exists - //enlarge array at position e.getIndex() - size_t pos = getIndex(e.getIndex()); - array_.at(pos).merge(e); - } - else{ - array_.push_back(e); - setIndex(); - } +void FeatureData::add(FeatureArray& e) +{ + if (exists(e.getIndex())) { // array at position e.getIndex() already exists + //enlarge array at position e.getIndex() + size_t pos = getIndex(e.getIndex()); + array_.at(pos).merge(e); + } else { + array_.push_back(e); + setIndex(); + } } -void FeatureData::add(FeatureStats& e, const std::string & sent_idx){ - if (exists(sent_idx)){ // array at position e.getIndex() already exists - //enlarge array at position e.getIndex() - size_t pos = getIndex(sent_idx); -// TRACE_ERR("Inserting " << e << " in array " << sent_idx << std::endl); - array_.at(pos).add(e); - } - else{ -// TRACE_ERR("Creating a new entry in the array and inserting " << e << std::endl); - FeatureArray a; - a.NumberOfFeatures(number_of_features); - a.Features(features); - a.setIndex(sent_idx); - a.add(e); - add(a); - } - } +void FeatureData::add(FeatureStats& e, const std::string & sent_idx) +{ + if (exists(sent_idx)) { // array at position e.getIndex() already exists + //enlarge array at position e.getIndex() + size_t pos = getIndex(sent_idx); +// TRACE_ERR("Inserting " << e << " in array " << sent_idx << std::endl); + array_.at(pos).add(e); + } else { +// TRACE_ERR("Creating a new entry in the array and inserting " << e << std::endl); + FeatureArray a; + a.NumberOfFeatures(number_of_features); + a.Features(features); + a.setIndex(sent_idx); + a.add(e); + add(a); + } +} bool FeatureData::check_consistency() { - if (array_.size() == 0) - return true; - - for (featdata_t::iterator i = array_.begin(); i !=array_.end(); i++) - if (!i->check_consistency()) return false; + if (array_.size() == 0) + return true; - return true; + for (featdata_t::iterator i = array_.begin(); i !=array_.end(); i++) + if (!i->check_consistency()) return false; + + return true; } void FeatureData::setIndex() { - size_t j=0; - for (featdata_t::iterator i = array_.begin(); i !=array_.end(); i++){ - idx2arrayname_[j]=(*i).getIndex(); - arrayname2idx_[(*i).getIndex()] = j; - j++; - } + size_t j=0; + for (featdata_t::iterator i = array_.begin(); i !=array_.end(); i++) { + idx2arrayname_[j]=(*i).getIndex(); + arrayname2idx_[(*i).getIndex()] = j; + j++; + } } void FeatureData::setFeatureMap(const std::string feat) { - number_of_features = 0; - features=feat; + number_of_features = 0; + features=feat; - std::string substring, stringBuf; - stringBuf=features; - while (!stringBuf.empty()){ - getNextPound(stringBuf, substring); - - featname2idx_[substring]=idx2featname_.size(); - idx2featname_[idx2featname_.size()]=substring; - number_of_features++; - } + std::string substring, stringBuf; + stringBuf=features; + while (!stringBuf.empty()) { + getNextPound(stringBuf, substring); + + featname2idx_[substring]=idx2featname_.size(); + idx2featname_[idx2featname_.size()]=substring; + number_of_features++; + } } diff --git a/mert/FeatureData.h b/mert/FeatureData.h index b0b2f1f88..cf0038bb7 100644 --- a/mert/FeatureData.h +++ b/mert/FeatureData.h @@ -20,86 +20,116 @@ using namespace std; class FeatureData { - + protected: - featdata_t array_; - idx2name idx2arrayname_; //map from index to name of array - name2idx arrayname2idx_; //map from name to index of array - - + featdata_t array_; + idx2name idx2arrayname_; //map from index to name of array + name2idx arrayname2idx_; //map from name to index of array + + private: - size_t number_of_features; - std::string features; + size_t number_of_features; + std::string features; + + map featname2idx_; //map from name to index of features + map idx2featname_; //map from index to name of features - map featname2idx_; //map from name to index of features - map idx2featname_; //map from index to name of features - public: - FeatureData(); - - ~FeatureData(){}; - - inline void clear() { array_.clear(); } - - inline FeatureArray get(const std::string& idx){ return array_.at(getIndex(idx)); } - inline FeatureArray& get(size_t idx){ return array_.at(idx); } - inline const FeatureArray& get(size_t idx) const{ return array_.at(idx); } + FeatureData(); - inline bool exists(const std::string & sent_idx){ return exists(getIndex(sent_idx)); } - inline bool exists(int sent_idx){ return (sent_idx>-1 && sent_idx<(int) array_.size())?true:false; } + ~FeatureData() {}; - inline FeatureStats& get(size_t i, size_t j){ return array_.at(i).get(j); } - inline const FeatureStats& get(size_t i, size_t j) const { return array_.at(i).get(j); } - - void add(FeatureArray& e); - void add(FeatureStats& e, const std::string& sent_idx); - - inline size_t size(){ return array_.size(); } - inline size_t NumberOfFeatures() const{ return number_of_features; } - inline void NumberOfFeatures(size_t v){ number_of_features = v; } - inline std::string Features() const{ return features; } - inline void Features(const std::string f){ features = f; } - - void save(const std::string &file, bool bin=false); - void save(ofstream& outFile, bool bin=false); - inline void save(bool bin=false){ save("/dev/stdout", bin); } - - void load(ifstream& inFile); - void load(const std::string &file); - - bool check_consistency(); - void setIndex(); - - inline int getIndex(const std::string& idx){ - name2idx::iterator i = arrayname2idx_.find(idx); - if (i!=arrayname2idx_.end()) - return i->second; - else - return -1; + inline void clear() { + array_.clear(); } - - inline std::string getIndex(size_t idx){ - idx2name::iterator i = idx2arrayname_.find(idx); - if (i!=idx2arrayname_.end()) - throw runtime_error("there is no entry at index " + idx); - return i->second; - } - - - bool existsFeatureNames(){ return (idx2featname_.size() > 0)?true:false; }; - - std::string getFeatureName(size_t idx){ - if (idx >= idx2featname_.size()) - throw runtime_error("Error: you required an too big index"); - return idx2featname_[idx]; - }; - - size_t getFeatureIndex(const std::string& name){ - if (featname2idx_.find(name)!=featname2idx_.end()) - throw runtime_error("Error: feature is unknown"); - return featname2idx_[name]; - }; - + + inline FeatureArray get(const std::string& idx) { + return array_.at(getIndex(idx)); + } + inline FeatureArray& get(size_t idx) { + return array_.at(idx); + } + inline const FeatureArray& get(size_t idx) const { + return array_.at(idx); + } + + inline bool exists(const std::string & sent_idx) { + return exists(getIndex(sent_idx)); + } + inline bool exists(int sent_idx) { + return (sent_idx>-1 && sent_idx<(int) array_.size())?true:false; + } + + inline FeatureStats& get(size_t i, size_t j) { + return array_.at(i).get(j); + } + inline const FeatureStats& get(size_t i, size_t j) const { + return array_.at(i).get(j); + } + + void add(FeatureArray& e); + void add(FeatureStats& e, const std::string& sent_idx); + + inline size_t size() { + return array_.size(); + } + inline size_t NumberOfFeatures() const { + return number_of_features; + } + inline void NumberOfFeatures(size_t v) { + number_of_features = v; + } + inline std::string Features() const { + return features; + } + inline void Features(const std::string f) { + features = f; + } + + void save(const std::string &file, bool bin=false); + void save(ofstream& outFile, bool bin=false); + inline void save(bool bin=false) { + save("/dev/stdout", bin); + } + + void load(ifstream& inFile); + void load(const std::string &file); + + bool check_consistency(); + void setIndex(); + + inline int getIndex(const std::string& idx) { + name2idx::iterator i = arrayname2idx_.find(idx); + if (i!=arrayname2idx_.end()) + return i->second; + else + return -1; + } + + inline std::string getIndex(size_t idx) { + idx2name::iterator i = idx2arrayname_.find(idx); + if (i!=idx2arrayname_.end()) + throw runtime_error("there is no entry at index " + idx); + return i->second; + } + + + bool existsFeatureNames() { + return (idx2featname_.size() > 0)?true:false; + }; + + std::string getFeatureName(size_t idx) { + if (idx >= idx2featname_.size()) + throw runtime_error("Error: you required an too big index"); + return idx2featname_[idx]; + }; + + size_t getFeatureIndex(const std::string& name) { + if (featname2idx_.find(name)!=featname2idx_.end()) + throw runtime_error("Error: feature is unknown"); + return featname2idx_[name]; + }; + void setFeatureMap(const std::string feat); }; diff --git a/mert/FeatureStats.cpp b/mert/FeatureStats.cpp index d9a80ce7d..74f1ff8a9 100644 --- a/mert/FeatureStats.cpp +++ b/mert/FeatureStats.cpp @@ -14,123 +14,124 @@ FeatureStats::FeatureStats() { - available_ = AVAILABLE_; - entries_ = 0; - array_ = new FeatureStatsType[available_]; + available_ = AVAILABLE_; + entries_ = 0; + array_ = new FeatureStatsType[available_]; }; FeatureStats::~FeatureStats() { - delete array_; + delete array_; }; FeatureStats::FeatureStats(const FeatureStats &stats) { - available_ = stats.available(); - entries_ = stats.size(); - array_ = new FeatureStatsType[available_]; - memcpy(array_,stats.getArray(),featbytes_); + available_ = stats.available(); + entries_ = stats.size(); + array_ = new FeatureStatsType[available_]; + memcpy(array_,stats.getArray(),featbytes_); }; FeatureStats::FeatureStats(const size_t size) { - available_ = size; - entries_ = size; - array_ = new FeatureStatsType[available_]; - memset(array_,0,featbytes_); + available_ = size; + entries_ = size; + array_ = new FeatureStatsType[available_]; + memset(array_,0,featbytes_); }; FeatureStats::FeatureStats(std::string &theString) { - set(theString); + set(theString); } void FeatureStats::expand() { - available_*=2; - featstats_t t_ = new FeatureStatsType[available_]; - memcpy(t_,array_,featbytes_); - delete array_; - array_=t_; + available_*=2; + featstats_t t_ = new FeatureStatsType[available_]; + memcpy(t_,array_,featbytes_); + delete array_; + array_=t_; } void FeatureStats::add(FeatureStatsType v) { - if (isfull()) expand(); - array_[entries_++]=v; + if (isfull()) expand(); + array_[entries_++]=v; } void FeatureStats::set(std::string &theString) { std::string substring, stringBuf; - reset(); - - while (!theString.empty()){ - getNextPound(theString, substring); - add(ATOFST(substring.c_str())); - } + reset(); + + while (!theString.empty()) { + getNextPound(theString, substring); + add(ATOFST(substring.c_str())); + } } void FeatureStats::loadbin(std::ifstream& inFile) { - inFile.read((char*) array_, featbytes_); -} + inFile.read((char*) array_, featbytes_); +} void FeatureStats::loadtxt(std::ifstream& inFile) { - std::string theString; - std::getline(inFile, theString); - set(theString); + std::string theString; + std::getline(inFile, theString); + set(theString); } void FeatureStats::loadtxt(const std::string &file) { - // TRACE_ERR("loading the stats from " << file << std::endl); + // TRACE_ERR("loading the stats from " << file << std::endl); - std::ifstream inFile(file.c_str(), std::ios::in); // matches a stream with a file. Opens the file + std::ifstream inFile(file.c_str(), std::ios::in); // matches a stream with a file. Opens the file - loadtxt(inFile); + loadtxt(inFile); } void FeatureStats::savetxt(const std::string &file) { -// TRACE_ERR("saving the stats into " << file << std::endl); +// TRACE_ERR("saving the stats into " << file << std::endl); - std::ofstream outFile(file.c_str(), std::ios::out); // matches a stream with a file. Opens the file + std::ofstream outFile(file.c_str(), std::ios::out); // matches a stream with a file. Opens the file - savetxt(outFile); + savetxt(outFile); } void FeatureStats::savetxt(std::ofstream& outFile) { -// TRACE_ERR("saving the stats" << std::endl); - outFile << *this; +// TRACE_ERR("saving the stats" << std::endl); + outFile << *this; } void FeatureStats::savebin(std::ofstream& outFile) { - outFile.write((char*) array_, featbytes_); -} + outFile.write((char*) array_, featbytes_); +} FeatureStats& FeatureStats::operator=(const FeatureStats &stats) { - delete array_; - available_ = stats.available(); - entries_ = stats.size(); - array_ = new FeatureStatsType[available_]; - memcpy(array_,stats.getArray(),featbytes_); - - return *this; + delete array_; + available_ = stats.available(); + entries_ = stats.size(); + array_ = new FeatureStatsType[available_]; + memcpy(array_,stats.getArray(),featbytes_); + + return *this; } /**write the whole object to a stream*/ -ostream& operator<<(ostream& o, const FeatureStats& e){ - for (size_t i=0; i< e.size(); i++) - o << e.get(i) << " "; - return o; +ostream& operator<<(ostream& o, const FeatureStats& e) +{ + for (size_t i=0; i< e.size(); i++) + o << e.get(i) << " "; + return o; } diff --git a/mert/FeatureStats.h b/mert/FeatureStats.h index 4e9ca84fa..93620e6d4 100644 --- a/mert/FeatureStats.h +++ b/mert/FeatureStats.h @@ -25,46 +25,67 @@ using namespace std; class FeatureStats { private: - featstats_t array_; - size_t entries_; - size_t available_; - + featstats_t array_; + size_t entries_; + size_t available_; + public: - FeatureStats(); - FeatureStats(const size_t size); - FeatureStats(const FeatureStats &stats); - FeatureStats(std::string &theString); - FeatureStats& operator=(const FeatureStats &stats); - - ~FeatureStats(); - - bool isfull(){return (entries_ < available_)?0:1; } - void expand(); - void add(FeatureStatsType v); - - inline void clear() { memset((void*) array_,0,featbytes_); } - - inline FeatureStatsType get(size_t i){ return array_[i]; } - inline FeatureStatsType get(size_t i)const{ return array_[i]; } - inline featstats_t getArray() const { return array_; } + FeatureStats(); + FeatureStats(const size_t size); + FeatureStats(const FeatureStats &stats); + FeatureStats(std::string &theString); + FeatureStats& operator=(const FeatureStats &stats); - void set(std::string &theString); + ~FeatureStats(); - inline size_t bytes() const{ return featbytes_; } - inline size_t size() const{ return entries_; } - inline size_t available() const{ return available_; } - - void savetxt(const std::string &file); - void savetxt(ofstream& outFile); - void savebin(ofstream& outFile); - inline void savetxt(){ savetxt("/dev/stdout"); } - - void loadtxt(const std::string &file); - void loadtxt(ifstream& inFile); - void loadbin(ifstream& inFile); + bool isfull() { + return (entries_ < available_)?0:1; + } + void expand(); + void add(FeatureStatsType v); + + inline void clear() { + memset((void*) array_,0,featbytes_); + } + + inline FeatureStatsType get(size_t i) { + return array_[i]; + } + inline FeatureStatsType get(size_t i)const { + return array_[i]; + } + inline featstats_t getArray() const { + return array_; + } + + void set(std::string &theString); + + inline size_t bytes() const { + return featbytes_; + } + inline size_t size() const { + return entries_; + } + inline size_t available() const { + return available_; + } + + void savetxt(const std::string &file); + void savetxt(ofstream& outFile); + void savebin(ofstream& outFile); + inline void savetxt() { + savetxt("/dev/stdout"); + } + + void loadtxt(const std::string &file); + void loadtxt(ifstream& inFile); + void loadbin(ifstream& inFile); + + inline void reset() { + entries_ = 0; + clear(); + } - inline void reset(){ entries_ = 0; clear(); } - /**write the whole object to a stream*/ friend ostream& operator<<(ostream& o, const FeatureStats& e); }; diff --git a/mert/InterpolatedScorer.cpp b/mert/InterpolatedScorer.cpp index 1907a19fa..0a8f26faf 100644 --- a/mert/InterpolatedScorer.cpp +++ b/mert/InterpolatedScorer.cpp @@ -5,196 +5,201 @@ using namespace std; -InterpolatedScorer::InterpolatedScorer (const string& name, const string& config): Scorer(name,config) { - //configure regularisation - static string KEY_WEIGHTS = "weights"; - static string KEY_TYPE = "regtype"; - static string KEY_WINDOW = "regwin"; - static string KEY_CASE = "case"; - static string TYPE_NONE = "none"; - static string TYPE_AVERAGE = "average"; - static string TYPE_MINIMUM = "min"; - static string TRUE = "true"; - static string FALSE = "false"; +InterpolatedScorer::InterpolatedScorer (const string& name, const string& config): Scorer(name,config) +{ + //configure regularisation + static string KEY_WEIGHTS = "weights"; + static string KEY_TYPE = "regtype"; + static string KEY_WINDOW = "regwin"; + static string KEY_CASE = "case"; + static string TYPE_NONE = "none"; + static string TYPE_AVERAGE = "average"; + static string TYPE_MINIMUM = "min"; + static string TRUE = "true"; + static string FALSE = "false"; - string type = getConfig(KEY_TYPE,TYPE_NONE); - if (type == TYPE_NONE) { - _regularisationStrategy = REG_NONE; - } else if (type == TYPE_AVERAGE) { - _regularisationStrategy = REG_AVERAGE; - } else if (type == TYPE_MINIMUM) { - _regularisationStrategy = REG_MINIMUM; - } else { - throw runtime_error("Unknown scorer regularisation strategy: " + type); - } - cerr << "Using scorer regularisation strategy: " << type << endl; + string type = getConfig(KEY_TYPE,TYPE_NONE); + if (type == TYPE_NONE) { + _regularisationStrategy = REG_NONE; + } else if (type == TYPE_AVERAGE) { + _regularisationStrategy = REG_AVERAGE; + } else if (type == TYPE_MINIMUM) { + _regularisationStrategy = REG_MINIMUM; + } else { + throw runtime_error("Unknown scorer regularisation strategy: " + type); + } + cerr << "Using scorer regularisation strategy: " << type << endl; - string window = getConfig(KEY_WINDOW,"0"); - _regularisationWindow = atoi(window.c_str()); - cerr << "Using scorer regularisation window: " << _regularisationWindow << endl; + string window = getConfig(KEY_WINDOW,"0"); + _regularisationWindow = atoi(window.c_str()); + cerr << "Using scorer regularisation window: " << _regularisationWindow << endl; - string preservecase = getConfig(KEY_CASE,TRUE); - if (preservecase == TRUE) { - _preserveCase = true; - }else if (preservecase == FALSE) { - _preserveCase = false; - } - cerr << "Using case preservation: " << _preserveCase << endl; + string preservecase = getConfig(KEY_CASE,TRUE); + if (preservecase == TRUE) { + _preserveCase = true; + } else if (preservecase == FALSE) { + _preserveCase = false; + } + cerr << "Using case preservation: " << _preserveCase << endl; - // name would be: HAMMING,BLEU or similar + // name would be: HAMMING,BLEU or similar - string scorers = name; - while (scorers.length() > 0) { - string scorertype = ""; - getNextPound(scorers,scorertype,","); - ScorerFactory SF; - Scorer *theScorer=SF.getScorer(scorertype,config); - _scorers.push_back(theScorer); - } - if (_scorers.size() == 0) { - throw runtime_error("There are no scorers"); - } - cout << "Number of scorers: " << _scorers.size() << endl; + string scorers = name; + while (scorers.length() > 0) { + string scorertype = ""; + getNextPound(scorers,scorertype,","); + ScorerFactory SF; + Scorer *theScorer=SF.getScorer(scorertype,config); + _scorers.push_back(theScorer); + } + if (_scorers.size() == 0) { + throw runtime_error("There are no scorers"); + } + cout << "Number of scorers: " << _scorers.size() << endl; - //TODO debug this - string wtype = getConfig(KEY_WEIGHTS,""); - //Default weights set to uniform ie. if two weights 0.5 each - //weights should add to 1 - if (wtype.length() == 0) { - float weight = 1.0/_scorers.size() ; - //cout << " Default weights:" << weight << endl; - for (size_t i = 0; i < _scorers.size(); i ++) { - _scorerWeights.push_back(weight); - } - }else{ - float tot=0; - //cout << "Defined weights:" << endl; - while (wtype.length() > 0) { - string scoreweight = ""; - getNextPound(wtype,scoreweight,"+"); - float weight = atof(scoreweight.c_str()); - _scorerWeights.push_back(weight); - tot += weight; - //cout << " :" << weight ; - } - //cout << endl; - if (tot != float(1)) { - throw runtime_error("The interpolated scorers weights do not sum to 1"); - } + //TODO debug this + string wtype = getConfig(KEY_WEIGHTS,""); + //Default weights set to uniform ie. if two weights 0.5 each + //weights should add to 1 + if (wtype.length() == 0) { + float weight = 1.0/_scorers.size() ; + //cout << " Default weights:" << weight << endl; + for (size_t i = 0; i < _scorers.size(); i ++) { + _scorerWeights.push_back(weight); } - cout << "The weights for the interpolated scorers are: " << endl; - for (vector::iterator it = _scorerWeights.begin(); it < _scorerWeights.end(); it++) { - cout << *it << " " ; + } else { + float tot=0; + //cout << "Defined weights:" << endl; + while (wtype.length() > 0) { + string scoreweight = ""; + getNextPound(wtype,scoreweight,"+"); + float weight = atof(scoreweight.c_str()); + _scorerWeights.push_back(weight); + tot += weight; + //cout << " :" << weight ; } - cout <::iterator it = _scorerWeights.begin(); it < _scorerWeights.end(); it++) { + cout << *it << " " ; + } + cout <::iterator itsc = _scorers.begin(); itsc!=_scorers.end();itsc++){ - int numScoresScorer = (*itsc)->NumberOfScores(); - ScoreData* newData =new ScoreData(**itsc); - for (size_t i = 0; i < data->size(); i++){ - ScoreArray scoreArray = data->get(i); - ScoreArray newScoreArray; - std::string istr; - std::stringstream out; - out << i; - istr = out.str(); - size_t numNBest = scoreArray.size(); - //cout << " Datasize " << data->size() << " NumNBest " << numNBest << endl ; - for (size_t j = 0; j < numNBest ; j++){ - ScoreStats scoreStats = data->get(i, j); - //cout << "Scorestats " << scoreStats << " i " << i << " j " << j << endl; - ScoreStats newScoreStats; - for (size_t k = last; k < size_t(numScoresScorer + last); k++) { - ScoreStatsType score = scoreStats.get(k); - newScoreStats.add(score); - } - //cout << " last " << last << " NumScores " << numScoresScorer << "newScorestats " << newScoreStats << endl; - newScoreArray.add(newScoreStats); - } - newScoreArray.setIndex(istr); - newData->add(newScoreArray); - } - //newData->dump(); - (*itsc)->setScoreData(newData); - last += numScoresScorer; - } +void InterpolatedScorer::setScoreData(ScoreData* data) +{ + size_t last = 0; + _scoreData = data; + for (vector::iterator itsc = _scorers.begin(); itsc!=_scorers.end(); itsc++) { + int numScoresScorer = (*itsc)->NumberOfScores(); + ScoreData* newData =new ScoreData(**itsc); + for (size_t i = 0; i < data->size(); i++) { + ScoreArray scoreArray = data->get(i); + ScoreArray newScoreArray; + std::string istr; + std::stringstream out; + out << i; + istr = out.str(); + size_t numNBest = scoreArray.size(); + //cout << " Datasize " << data->size() << " NumNBest " << numNBest << endl ; + for (size_t j = 0; j < numNBest ; j++) { + ScoreStats scoreStats = data->get(i, j); + //cout << "Scorestats " << scoreStats << " i " << i << " j " << j << endl; + ScoreStats newScoreStats; + for (size_t k = last; k < size_t(numScoresScorer + last); k++) { + ScoreStatsType score = scoreStats.get(k); + newScoreStats.add(score); + } + //cout << " last " << last << " NumScores " << numScoresScorer << "newScorestats " << newScoreStats << endl; + newScoreArray.add(newScoreStats); + } + newScoreArray.setIndex(istr); + newData->add(newScoreArray); + } + //newData->dump(); + (*itsc)->setScoreData(newData); + last += numScoresScorer; + } } -/** The interpolated scorer calls a vector of scorers and combines them with +/** The interpolated scorer calls a vector of scorers and combines them with weights **/ void InterpolatedScorer::score(const candidates_t& candidates, const diffs_t& diffs, - statscores_t& scores) { + statscores_t& scores) +{ - //cout << "*******InterpolatedScorer::score" << endl; - size_t scorerNum = 0; - for (vector::iterator itsc = _scorers.begin(); itsc!=_scorers.end();itsc++){ - int numScores = (*itsc)->NumberOfScores(); - statscores_t tscores; - (*itsc)->score(candidates,diffs,tscores); - size_t inc = 0; - for (statscores_t::iterator itstatsc = tscores.begin(); itstatsc!=tscores.end();itstatsc++){ - //cout << "Scores " << (*itstatsc) << endl; - float weight = _scorerWeights[scorerNum]; - if (weight == 0) { - stringstream msg; - msg << "No weights for scorer" << scorerNum ; - throw runtime_error(msg.str()); - } - if (scorerNum == 0) { - scores.push_back(weight * (*itstatsc)); - } else { - scores[inc] += weight * (*itstatsc); - } - //cout << "Scorer:" << scorerNum << " scoreNum:" << inc << " score: " << (*itstatsc) << " weight:" << weight << endl; - inc++; - - } - scorerNum++; - } + //cout << "*******InterpolatedScorer::score" << endl; + size_t scorerNum = 0; + for (vector::iterator itsc = _scorers.begin(); itsc!=_scorers.end(); itsc++) { + int numScores = (*itsc)->NumberOfScores(); + statscores_t tscores; + (*itsc)->score(candidates,diffs,tscores); + size_t inc = 0; + for (statscores_t::iterator itstatsc = tscores.begin(); itstatsc!=tscores.end(); itstatsc++) { + //cout << "Scores " << (*itstatsc) << endl; + float weight = _scorerWeights[scorerNum]; + if (weight == 0) { + stringstream msg; + msg << "No weights for scorer" << scorerNum ; + throw runtime_error(msg.str()); + } + if (scorerNum == 0) { + scores.push_back(weight * (*itstatsc)); + } else { + scores[inc] += weight * (*itstatsc); + } + //cout << "Scorer:" << scorerNum << " scoreNum:" << inc << " score: " << (*itstatsc) << " weight:" << weight << endl; + inc++; + + } + scorerNum++; + } } -void InterpolatedScorer::setReferenceFiles(const vector& referenceFiles) { - for (vector::iterator itsc = _scorers.begin(); itsc!=_scorers.end();itsc++){ - //the scorers that use alignments use the reference files in the constructor through config - (*itsc)->setReferenceFiles(referenceFiles); - } +void InterpolatedScorer::setReferenceFiles(const vector& referenceFiles) +{ + for (vector::iterator itsc = _scorers.begin(); itsc!=_scorers.end(); itsc++) { + //the scorers that use alignments use the reference files in the constructor through config + (*itsc)->setReferenceFiles(referenceFiles); + } } // Text can be: // Reference sentence ||| Reference sentence alignment information (as given by MOSES -include-alignment-in-n-best) // If a permutation distance scorer, send alignment info // Else if other scorer, remove the alignment info and then send reference as usual -void InterpolatedScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry) { - stringstream buff; - string align = text; - string sentence = ""; - size_t alignmentData = text.find("|||"); - //Get sentence and alignment parts - if(alignmentData != string::npos) { - getNextPound(align,sentence, "|||"); - } - int i=0; - for (vector::iterator itsc = _scorers.begin(); itsc!=_scorers.end();itsc++){ - ScoreStats tempEntry; - if ((*itsc)->useAlignment()) { - (*itsc)->prepareStats(sid, text, tempEntry); - } else { - (*itsc)->prepareStats(sid, sentence, tempEntry); - } - if (i > 0) buff << " "; - buff << tempEntry; - i++; +void InterpolatedScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry) +{ + stringstream buff; + string align = text; + string sentence = ""; + size_t alignmentData = text.find("|||"); + //Get sentence and alignment parts + if(alignmentData != string::npos) { + getNextPound(align,sentence, "|||"); + } + int i=0; + for (vector::iterator itsc = _scorers.begin(); itsc!=_scorers.end(); itsc++) { + ScoreStats tempEntry; + if ((*itsc)->useAlignment()) { + (*itsc)->prepareStats(sid, text, tempEntry); + } else { + (*itsc)->prepareStats(sid, sentence, tempEntry); } - //cout << " Scores for interpolated: " << buff << endl; - string str = buff.str(); - entry.set(str); + if (i > 0) buff << " "; + buff << tempEntry; + i++; + } + //cout << " Scores for interpolated: " << buff << endl; + string str = buff.str(); + entry.set(str); } diff --git a/mert/InterpolatedScorer.h b/mert/InterpolatedScorer.h index e057f5748..517f82263 100644 --- a/mert/InterpolatedScorer.h +++ b/mert/InterpolatedScorer.h @@ -18,48 +18,49 @@ /** * Abstract base class for scorers that include other scorers eg. * Interpolated HAMMING and BLEU scorer **/ -class InterpolatedScorer : public Scorer { +class InterpolatedScorer : public Scorer +{ - public: - // name would be: "HAMMING,BLEU" or similar - InterpolatedScorer(const string& name, const string& config); - ~InterpolatedScorer(){}; - void score(const candidates_t& candidates, const diffs_t& diffs, - statscores_t& scores); +public: + // name would be: "HAMMING,BLEU" or similar + InterpolatedScorer(const string& name, const string& config); + ~InterpolatedScorer() {}; + void score(const candidates_t& candidates, const diffs_t& diffs, + statscores_t& scores); - void setReferenceFiles(const vector& referenceFiles); - void prepareStats(size_t sid, const string& text, ScoreStats& entry); - size_t NumberOfScores() const { - size_t sz=0; - for (vector::const_iterator itsc = _scorers.begin(); itsc < _scorers.end();itsc++){ - sz += (*itsc)->NumberOfScores(); - } - return sz; - }; - - bool useAlignment() const { - //cout << "InterpolatedScorer::useAlignment" << endl; - for (vector::const_iterator itsc = _scorers.begin(); itsc < _scorers.end();itsc++){ - if ((*itsc)->useAlignment()){ - //cout <<"InterpolatedScorer::useAlignment Returning true"<& referenceFiles); + void prepareStats(size_t sid, const string& text, ScoreStats& entry); + size_t NumberOfScores() const { + size_t sz=0; + for (vector::const_iterator itsc = _scorers.begin(); itsc < _scorers.end(); itsc++) { + sz += (*itsc)->NumberOfScores(); + } + return sz; + }; - //calculate the actual score - this gets done in the individual scorers - //statscore_t calculateScore(const vector& totals); - void setScoreData(ScoreData* data); + bool useAlignment() const { + //cout << "InterpolatedScorer::useAlignment" << endl; + for (vector::const_iterator itsc = _scorers.begin(); itsc < _scorers.end(); itsc++) { + if ((*itsc)->useAlignment()) { + //cout <<"InterpolatedScorer::useAlignment Returning true"<& totals); + void setScoreData(ScoreData* data); - //regularisation - ScorerRegularisationStrategy _regularisationStrategy; - size_t _regularisationWindow; +protected: - vector _scorers; - vector _scorerWeights; + //regularisation + ScorerRegularisationStrategy _regularisationStrategy; + size_t _regularisationWindow; + + vector _scorers; + vector _scorerWeights; }; diff --git a/mert/Optimizer.cpp b/mert/Optimizer.cpp index 9edfc4be7..63265de25 100644 --- a/mert/Optimizer.cpp +++ b/mert/Optimizer.cpp @@ -14,31 +14,34 @@ static const float MAX_FLOAT=numeric_limits::max(); -void Optimizer::SetScorer(Scorer *S){ +void Optimizer::SetScorer(Scorer *S) +{ if(scorer) delete scorer; scorer=S; } -void Optimizer::SetFData(FeatureData *F){ +void Optimizer::SetFData(FeatureData *F) +{ if(FData) delete FData; FData=F; }; -Optimizer::Optimizer(unsigned Pd,vector i2O,vector start):scorer(NULL),FData(NULL){ +Optimizer::Optimizer(unsigned Pd,vector i2O,vector start):scorer(NULL),FData(NULL) +{ //warning: the init vector is a full set of parameters, of dimension pdim! - + Point::pdim=Pd; - + assert(start.size()==Pd); Point::dim=i2O.size(); Point::optindices=i2O; - if (Point::pdim>Point::dim){ - for (unsigned int i=0;iPoint::dim) { + for (unsigned int i=0; i i2O,vector start) } }; -Optimizer::~Optimizer(){ +Optimizer::~Optimizer() +{ delete scorer; delete FData; } -statscore_t Optimizer::GetStatScore(const Point& param)const{ +statscore_t Optimizer::GetStatScore(const Point& param)const +{ vector bests; Get1bests(param,bests); //copy(bests.begin(),bests.end(),ostream_iterator(cerr," ")); @@ -60,23 +65,25 @@ statscore_t Optimizer::GetStatScore(const Point& param)const{ }; /**compute the intersection of 2 lines*/ -float intersect (float m1, float b1,float m2,float b2){ +float intersect (float m1, float b1,float m2,float b2) +{ float isect = ((b2-b1)/(m1-m2)); if (!isfinite(isect)) { - isect = MAX_FLOAT; + isect = MAX_FLOAT; } return isect; } -map::iterator AddThreshold(map& thresholdmap,float newt,pair newdiff){ +map::iterator AddThreshold(map& thresholdmap,float newt,pair newdiff) +{ map::iterator it=thresholdmap.find(newt); - if(it!=thresholdmap.end()){ + if(it!=thresholdmap.end()) { //the threshold already exists!! this is very unlikely if(it->second.back().first==newdiff.first) it->second.back().second=newdiff.second;//there was already a diff for this sentence, we change the 1 best; else it->second.push_back(newdiff); - }else{ + } else { //normal case pair< map::iterator,bool > ins=thresholdmap.insert(threshold(newt,diff_t(1,newdiff))); assert(ins.second);//we really inserted something @@ -86,244 +93,247 @@ map::iterator AddThreshold(map& thresholdmap,float }; -statscore_t Optimizer::LineOptimize(const Point& origin,const Point& direction,Point& bestpoint)const{ +statscore_t Optimizer::LineOptimize(const Point& origin,const Point& direction,Point& bestpoint)const +{ // we are looking for the best Point on the line y=Origin+x*direction float min_int=0.0001; //typedef pair diff;//first the sentence that changes, second is the new 1best for this sentence //list thresholdlist; - + map thresholdmap; thresholdmap[MIN_FLOAT]=diff_t(); vector first1best;//the vector of nbests for x=-inf - for(unsigned int S=0;S::iterator previnserted=thresholdmap.begin(); //first we determine the translation with the best feature score for each sentence and each value of x //cerr << "Sentence " << S << endl; multimap gradient; vector f0; f0.resize(FData->get(S).size()); - for(unsigned j=0;jget(S).size();j++){ + for(unsigned j=0; jget(S).size(); j++) { gradient.insert(pair(direction*(FData->get(S,j)),j));//gradient of the feature function for this particular target sentence f0[j]=origin*FData->get(S,j);//compute the feature function at the origin point } //now lets compute the 1best for each value of x - + // vector > onebest; - + multimap::iterator gradientit=gradient.begin(); multimap::iterator highest_f0=gradient.begin(); - + float smallest=gradientit->first;//smallest gradient //several candidates can have the lowest slope (eg for word penalty where the gradient is an integer ) gradientit++; - while(gradientit!=gradient.end()&&gradientit->first==smallest){ + while(gradientit!=gradient.end()&&gradientit->first==smallest) { // cerr<<"ni"<second<second]<<" "<second]>f0[highest_f0->second]) - highest_f0=gradientit;//the highest line is the one with he highest f0 + highest_f0=gradientit;//the highest line is the one with he highest f0 gradientit++; } - + gradientit = highest_f0; - first1best.push_back(highest_f0->second); + first1best.push_back(highest_f0->second); //now we look for the intersections points indicating a change of 1 best - //we use the fact that the function is convex, which means that the gradient can only go up - while(gradientit!=gradient.end()){ + //we use the fact that the function is convex, which means that the gradient can only go up + while(gradientit!=gradient.end()) { map::iterator leftmost=gradientit; float m=gradientit->first; float b=f0[gradientit->second]; multimap::iterator gradientit2=gradientit; gradientit2++; float leftmostx=MAX_FLOAT; - for(;gradientit2!=gradient.end();gradientit2++){ - //cerr<<"--"<first<<' '<second<first){ - curintersect=intersect(m,b,gradientit2->first,f0[gradientit2->second]); + for(; gradientit2!=gradient.end(); gradientit2++) { + //cerr<<"--"<first<<' '<second<first) { + curintersect=intersect(m,b,gradientit2->first,f0[gradientit2->second]); //cerr << "curintersect: " << curintersect << " leftmostx: " << leftmostx << endl; - if(curintersect<=leftmostx){ - //we have found an intersection to the left of the leftmost we had so far. - //we might have curintersect==leftmostx for example is 2 candidates are the same - //in that case its better its better to update leftmost to gradientit2 to avoid some recomputing later - leftmostx=curintersect; - leftmost=gradientit2;//this is the new reference - } - } + if(curintersect<=leftmostx) { + //we have found an intersection to the left of the leftmost we had so far. + //we might have curintersect==leftmostx for example is 2 candidates are the same + //in that case its better its better to update leftmost to gradientit2 to avoid some recomputing later + leftmostx=curintersect; + leftmost=gradientit2;//this is the new reference + } + } } if (leftmost == gradientit) { - //we didn't find any more intersections - //the rightmost bestindex is the one with the highest slope. - assert(abs(leftmost->first-gradient.rbegin()->first)<0.0001);//they should be egal but there might be - //a small difference due to rounding error - break; + //we didn't find any more intersections + //the rightmost bestindex is the one with the highest slope. + assert(abs(leftmost->first-gradient.rbegin()->first)<0.0001);//they should be egal but there might be + //a small difference due to rounding error + break; } //we have found the next intersection! pair newd(S,leftmost->second);//new onebest for Sentence S is leftmost->second - if(leftmostx-previnserted->first::iterator tit=thresholdmap.find(leftmostx); - if(tit==previnserted){ - //the threshold is the same as before can happen if 2 candidates are the same for example - assert(previnserted->second.back().first==newd.first); - previnserted->second.back()=newd;//just replace the 1 best fors sentence S - //previnsert doesnt change - }else{ + if(leftmostx-previnserted->first::iterator tit=thresholdmap.find(leftmostx); + if(tit==previnserted) { + //the threshold is the same as before can happen if 2 candidates are the same for example + assert(previnserted->second.back().first==newd.first); + previnserted->second.back()=newd;//just replace the 1 best fors sentence S + //previnsert doesnt change + } else { - if(tit==thresholdmap.end()){ - thresholdmap[leftmostx]=previnserted->second;//We keep the diffs at previnsert - thresholdmap.erase(previnserted);//erase old previnsert - previnserted=thresholdmap.find(leftmostx);//point previnsert to the new threshold - previnserted->second.back()=newd;//we update the diff for sentence S - }else{//threshold already exists but is not the previous one. - //we append the diffs in previnsert to tit before destroying previnsert - tit->second.insert(tit->second.end(),previnserted->second.begin(),previnserted->second.end()); - assert(tit->second.back().first==newd.first); - tit->second.back()=newd;//change diff for sentence S - thresholdmap.erase(previnserted);//erase old previnsert - previnserted=tit;//point previnsert to the new threshold - } - } + if(tit==thresholdmap.end()) { + thresholdmap[leftmostx]=previnserted->second;//We keep the diffs at previnsert + thresholdmap.erase(previnserted);//erase old previnsert + previnserted=thresholdmap.find(leftmostx);//point previnsert to the new threshold + previnserted->second.back()=newd;//we update the diff for sentence S + } else { //threshold already exists but is not the previous one. + //we append the diffs in previnsert to tit before destroying previnsert + tit->second.insert(tit->second.end(),previnserted->second.begin(),previnserted->second.end()); + assert(tit->second.back().first==newd.first); + tit->second.back()=newd;//change diff for sentence S + thresholdmap.erase(previnserted);//erase old previnsert + previnserted=tit;//point previnsert to the new threshold + } + } - assert(previnserted != thresholdmap.end()); - }else{//normal insertion process - previnserted=AddThreshold(thresholdmap,leftmostx,newd); + assert(previnserted != thresholdmap.end()); + } else { //normal insertion process + previnserted=AddThreshold(thresholdmap,leftmostx,newd); } gradientit=leftmost; } //while(gradientit!=gradient.end()){ } //loop on S - //now the thresholdlist is up to date: + //now the thresholdlist is up to date: //it contains a list of all the parameter_ts where the function changed its value, along with the nbest list for the interval after each threshold - + map::iterator thrit; - if(verboselevel()>6){ + if(verboselevel()>6) { cerr << "Thresholds:(" <first << " diffs"; for (size_t j = 0; j < thrit->second.size(); ++j) { - cerr << " " <second[j].first << "," << thrit->second[j].second; + cerr << " " <second[j].first << "," << thrit->second[j].second; } cerr << endl; } } - + //last thing to do is compute the Stat score (ie BLEU) and find the minimum thrit=thresholdmap.begin(); ++thrit;//first diff corrrespond to MIN_FLOAT and first1best diffs_t diffs; - for(;thrit!=thresholdmap.end();thrit++) + for(; thrit!=thresholdmap.end(); thrit++) diffs.push_back(thrit->second); vector scores=GetIncStatScore(first1best,diffs); - + thrit=thresholdmap.begin(); statscore_t bestscore=MIN_FLOAT; float bestx=MIN_FLOAT; assert(scores.size()==thresholdmap.size());//we skipped the first el of thresholdlist but GetIncStatScore return 1 more for first1best - for(unsigned int sc=0;sc!=scores.size();sc++){ + for(unsigned int sc=0; sc!=scores.size(); sc++) { //cerr << "x=" << thrit->first << " => " << scores[sc] << endl; if (scores[sc] > bestscore) { - //This is the score for the interval [lit2->first, (lit2+1)->first] - //unless we're at the last score, when it's the score - //for the interval [lit2->first,+inf] - bestscore = scores[sc]; + //This is the score for the interval [lit2->first, (lit2+1)->first] + //unless we're at the last score, when it's the score + //for the interval [lit2->first,+inf] + bestscore = scores[sc]; - //if we're not in [-inf,x1] or [xn,+inf] then just take the value - //if x which splits the interval in half. For the rightmost interval, - //take x to be the last interval boundary + 0.1, and for the leftmost - //interval, take x to be the first interval boundary - 1000. - //These values are taken from cmert. - float leftx = thrit->first; - if (thrit == thresholdmap.begin()) { - leftx = MIN_FLOAT; - } - ++thrit; - float rightx = MAX_FLOAT; - if (thrit != thresholdmap.end()) { - rightx = thrit->first; - } - --thrit; - //cerr << "leftx: " << leftx << " rightx: " << rightx << endl; - if (leftx == MIN_FLOAT) { - bestx = rightx-1000; - } else if (rightx == MAX_FLOAT) { - bestx = leftx+0.1; - } else { - bestx = 0.5 * (rightx + leftx); - } - //cerr << "x = " << "set new bestx to: " << bestx << endl; + //if we're not in [-inf,x1] or [xn,+inf] then just take the value + //if x which splits the interval in half. For the rightmost interval, + //take x to be the last interval boundary + 0.1, and for the leftmost + //interval, take x to be the first interval boundary - 1000. + //These values are taken from cmert. + float leftx = thrit->first; + if (thrit == thresholdmap.begin()) { + leftx = MIN_FLOAT; + } + ++thrit; + float rightx = MAX_FLOAT; + if (thrit != thresholdmap.end()) { + rightx = thrit->first; + } + --thrit; + //cerr << "leftx: " << leftx << " rightx: " << rightx << endl; + if (leftx == MIN_FLOAT) { + bestx = rightx-1000; + } else if (rightx == MAX_FLOAT) { + bestx = leftx+0.1; + } else { + bestx = 0.5 * (rightx + leftx); + } + //cerr << "x = " << "set new bestx to: " << bestx << endl; } ++thrit; } - if(abs(bestx)<0.00015){ + if(abs(bestx)<0.00015) { bestx=0.0;//the origin of the line is the best point!we put it back at 0 so we do not propagate rounding erros - //finally! we manage to extract the best score; - //now we convert bestx (position on the line) to a point! + //finally! we manage to extract the best score; + //now we convert bestx (position on the line) to a point! if(verboselevel()>4) cerr<<"best point on line at origin"<3){ + if(verboselevel()>3) { // cerr<<"end Lineopt, bestx="<& bests)const{ +void Optimizer::Get1bests(const Point& P,vector& bests)const +{ assert(FData); bests.clear(); bests.resize(size()); - - for(unsigned i=0;iget(i).size();j++){ + for(j=0; jget(i).size(); j++) { float curfs=P*FData->get(i,j); - if(curfs>bestfs){ - bestfs=curfs; - idx=j; + if(curfs>bestfs) { + bestfs=curfs; + idx=j; } } bests[i]=idx; } - + } -statscore_t Optimizer::Run(Point& P)const{ - if(!FData){ +statscore_t Optimizer::Run(Point& P)const +{ + if(!FData) { cerr<<"error trying to optimize without Features loaded"<getReferenceSize()!=FData->size()){ + if (scorer->getReferenceSize()!=FData->size()) { cerr<<"error size mismatch between FeatureData and Scorer"<2) + statscore_t score=GetStatScore(P); + P.score=score; + + if(verboselevel()>2) cerr<<"Starting point: "<< P << " => "<< P.score << endl; statscore_t s=TrueRun(P); P.score=s;//just in case its not done in TrueRun @@ -331,9 +341,10 @@ statscore_t Optimizer::Run(Point& P)const{ cerr<<"Ending point: "<< P <<" => "<< s << endl; return s; } - -vector Optimizer::GetIncStatScore(vector thefirst,vector > > thediffs)const{ + +vector Optimizer::GetIncStatScore(vector thefirst,vector > > thediffs)const +{ assert(scorer); vector theres; @@ -347,61 +358,62 @@ vector Optimizer::GetIncStatScore(vector thefirst,vector< //---------------- code for the powell optimizer float SimpleOptimizer::eps=0.0001; -statscore_t SimpleOptimizer::TrueRun(Point& P)const{ - +statscore_t SimpleOptimizer::TrueRun(Point& P)const +{ + statscore_t prevscore=0; statscore_t bestscore=MIN_FLOAT; Point best; - - //If P is already defined and provides a score + + //If P is already defined and provides a score //we must improve over this score - if(P.score>bestscore){ - bestscore=P.score; - best=P; - } - + if(P.score>bestscore) { + bestscore=P.score; + best=P; + } + int nrun=0; - do{ - ++nrun; + do { + ++nrun; if(verboselevel()>2&&nrun>1) cerr<<"last diff="<4){ - // cerr<<"minimizing along direction "< " << prevscore << endl; + + for(unsigned int d=0; d4) { + // cerr<<"minimizing along direction "< " << prevscore << endl; } Point direction; - for(unsigned int i=0;i5){ - cerr<<"direction: "<< d << " => " << curscore << endl; - cerr<<"\tending point: "<< linebest << " => " << curscore << endl; - } - if(curscore>bestscore){ - bestscore=curscore; - best=linebest; - if(verboselevel()>3){ - cerr<<"new best dir:"< " <5) { + cerr<<"direction: "<< d << " => " << curscore << endl; + cerr<<"\tending point: "<< linebest << " => " << curscore << endl; + } + if(curscore>bestscore) { + bestscore=curscore; + best=linebest; + if(verboselevel()>3) { + cerr<<"new best dir:"< " <3) - cerr<eps); - - if(verboselevel()>2){ + if(verboselevel()>3) + cerr<eps); + + if(verboselevel()>2) { cerr<<"end Powell Algo, nrun="< min(Point::getdim()); vector max(Point::getdim()); - for(unsigned int d=0;d OptimizerFactory::typenames; -void OptimizerFactory::SetTypeNames(){ - if(typenames.empty()){ +void OptimizerFactory::SetTypeNames() +{ + if(typenames.empty()) { typenames.resize(NOPTIMIZER); typenames[POWELL]="powell"; typenames[RANDOM]="random"; //add new type there - } + } } -vector OptimizerFactory::GetTypeNames(){ +vector OptimizerFactory::GetTypeNames() +{ if(typenames.empty()) SetTypeNames(); return typenames; } -OptimizerFactory::OptType OptimizerFactory::GetOType(string type){ +OptimizerFactory::OptType OptimizerFactory::GetOType(string type) +{ unsigned int thetype; if(typenames.empty()) SetTypeNames(); - for(thetype=0;thetype i2o,vector start,string type){ +Optimizer* OptimizerFactory::BuildOptimizer(unsigned dim,vector i2o,vector start,string type) +{ OptType T=GetOType(type); - if(T==NOPTIMIZER){ + if(T==NOPTIMIZER) { cerr<<"Error: unknown Optimizer type "< i2o,ve break; default: cerr<<"Error: unknown optimizer"< i2O,vector start); void SetScorer(Scorer *S); void SetFData(FeatureData *F); virtual ~Optimizer(); - unsigned size()const{return (FData?FData->size():0);} + unsigned size()const { + return (FData?FData->size():0); + } /**Generic wrapper around TrueRun to check a few things. Non virtual*/ statscore_t Run(Point&)const; -/**main function that perform an optimization*/ + /**main function that perform an optimization*/ virtual statscore_t TrueRun(Point&)const=0; /**given a set of lambdas, get the nbest for each sentence*/ void Get1bests(const Point& param,vector& bests)const; /**given a set of nbests, get the Statistical score*/ - statscore_t GetStatScore(const vector& nbests)const{return scorer->score(nbests);}; + statscore_t GetStatScore(const vector& nbests)const { + return scorer->score(nbests); + }; /**given a set of lambdas, get the total statistical score*/ - statscore_t GetStatScore(const Point& param)const; + statscore_t GetStatScore(const Point& param)const; vector GetIncStatScore(vector ref,vector > >)const; statscore_t LineOptimize(const Point& start,const Point& direction,Point& best)const;//Get the optimal Lambda and the best score in a particular direction from a given Point }; /**default basic optimizer*/ -class SimpleOptimizer: public Optimizer{ +class SimpleOptimizer: public Optimizer +{ private: -static float eps; + static float eps; public: - SimpleOptimizer(unsigned dim,vector i2O,vector start):Optimizer(dim,i2O,start){}; + SimpleOptimizer(unsigned dim,vector i2O,vector start):Optimizer(dim,i2O,start) {}; virtual statscore_t TrueRun(Point&)const; }; -class RandomOptimizer: public Optimizer{ +class RandomOptimizer: public Optimizer +{ public: - RandomOptimizer(unsigned dim,vector i2O,vector start):Optimizer(dim,i2O,start){}; + RandomOptimizer(unsigned dim,vector i2O,vector start):Optimizer(dim,i2O,start) {}; virtual statscore_t TrueRun(Point&)const; }; -class OptimizerFactory{ - public: +class OptimizerFactory +{ +public: // unsigned dim; //Point Start; static vector GetTypeNames(); static Optimizer* BuildOptimizer(unsigned dim,vectortooptimize,vector start,string type); - private: - enum OptType{POWELL=0,RANDOM,NOPTIMIZER};//Add new optimizer here BEFORE NOPTIMZER +private: + enum OptType {POWELL=0,RANDOM,NOPTIMIZER}; //Add new optimizer here BEFORE NOPTIMZER static OptType GetOType(string); static vector typenames; static void SetTypeNames(); - + }; diff --git a/mert/PerScorer.cpp b/mert/PerScorer.cpp index 0fbb367c0..072ba636d 100644 --- a/mert/PerScorer.cpp +++ b/mert/PerScorer.cpp @@ -1,69 +1,72 @@ #include "PerScorer.h" -void PerScorer::setReferenceFiles(const vector& referenceFiles) { - // for each line in the reference file, create a multiset of the - // word ids - if (referenceFiles.size() != 1) { - throw runtime_error("PER only supports a single reference"); +void PerScorer::setReferenceFiles(const vector& referenceFiles) +{ + // for each line in the reference file, create a multiset of the + // word ids + if (referenceFiles.size() != 1) { + throw runtime_error("PER only supports a single reference"); + } + _reftokens.clear(); + _reflengths.clear(); + ifstream in(referenceFiles[0].c_str()); + if (!in) { + throw runtime_error("Unable to open " + referenceFiles[0]); + } + string line; + int sid = 0; + while (getline(in,line)) { + vector tokens; + encode(line,tokens); + _reftokens.push_back(multiset()); + for (size_t i = 0; i < tokens.size(); ++i) { + _reftokens.back().insert(tokens[i]); } - _reftokens.clear(); - _reflengths.clear(); - ifstream in(referenceFiles[0].c_str()); - if (!in) { - throw runtime_error("Unable to open " + referenceFiles[0]); + _reflengths.push_back(tokens.size()); + if (sid > 0 && sid % 100 == 0) { + TRACE_ERR("."); } - string line; - int sid = 0; - while (getline(in,line)) { - vector tokens; - encode(line,tokens); - _reftokens.push_back(multiset()); - for (size_t i = 0; i < tokens.size(); ++i) { - _reftokens.back().insert(tokens[i]); - } - _reflengths.push_back(tokens.size()); - if (sid > 0 && sid % 100 == 0) { - TRACE_ERR("."); - } - ++sid; - } - TRACE_ERR(endl); + ++sid; + } + TRACE_ERR(endl); } -void PerScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry) { - if (sid >= _reflengths.size()) { - stringstream msg; - msg << "Sentence id (" << sid << ") not found in reference set"; - throw runtime_error(msg.str()); - } - //calculate correct, output_length and ref_length for - //the line and store it in entry - vector testtokens; - encode(text,testtokens); - multiset testtokens_all(testtokens.begin(),testtokens.end()); - set testtokens_unique(testtokens.begin(),testtokens.end()); - int correct = 0; - for (set::iterator i = testtokens_unique.begin(); - i != testtokens_unique.end(); ++i) { - int token = *i; - correct += min(_reftokens[sid].count(token), testtokens_all.count(token)); - } - - ostringstream stats; - stats << correct << " " << testtokens.size() << " " << _reflengths[sid] << " " ; - string stats_str = stats.str(); - entry.set(stats_str); +void PerScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry) +{ + if (sid >= _reflengths.size()) { + stringstream msg; + msg << "Sentence id (" << sid << ") not found in reference set"; + throw runtime_error(msg.str()); + } + //calculate correct, output_length and ref_length for + //the line and store it in entry + vector testtokens; + encode(text,testtokens); + multiset testtokens_all(testtokens.begin(),testtokens.end()); + set testtokens_unique(testtokens.begin(),testtokens.end()); + int correct = 0; + for (set::iterator i = testtokens_unique.begin(); + i != testtokens_unique.end(); ++i) { + int token = *i; + correct += min(_reftokens[sid].count(token), testtokens_all.count(token)); + } + + ostringstream stats; + stats << correct << " " << testtokens.size() << " " << _reflengths[sid] << " " ; + string stats_str = stats.str(); + entry.set(stats_str); } -float PerScorer::calculateScore(const vector& comps) { - float denom = comps[2]; - float num = comps[0] - max(float(0),comps[1]-comps[2]); - if (denom == 0) { - //shouldn't happen! - return 0.0; - } else { - return num/denom; - } +float PerScorer::calculateScore(const vector& comps) +{ + float denom = comps[2]; + float num = comps[0] - max(float(0),comps[1]-comps[2]); + if (denom == 0) { + //shouldn't happen! + return 0.0; + } else { + return num/denom; + } } diff --git a/mert/PerScorer.h b/mert/PerScorer.h index 023f87907..801efbc8c 100644 --- a/mert/PerScorer.h +++ b/mert/PerScorer.h @@ -22,34 +22,40 @@ using namespace std; * as 1 - (correct - max(0,output_length - ref_length)) / ref_length * In fact, we ignore the " 1 - " so that it can be maximised. **/ -class PerScorer: public StatisticsBasedScorer { - public: - PerScorer(const string& config = "") : StatisticsBasedScorer("PER",config) {} - virtual void setReferenceFiles(const vector& referenceFiles); - virtual void prepareStats(size_t sid, const string& text, ScoreStats& entry); - - virtual void whoami() { - cerr << "I AM PerScorer" << std::endl; - } - - size_t NumberOfScores() const { cerr << "PerScorer: 3" << endl; return 3; }; - bool useAlignment() const {return false;}; +class PerScorer: public StatisticsBasedScorer +{ +public: + PerScorer(const string& config = "") : StatisticsBasedScorer("PER",config) {} + virtual void setReferenceFiles(const vector& referenceFiles); + virtual void prepareStats(size_t sid, const string& text, ScoreStats& entry); - - protected: - - virtual float calculateScore(const vector& comps) ; - - private: - - //no copy - PerScorer(const PerScorer&); - ~PerScorer(){}; - PerScorer& operator=(const PerScorer&); - - // data extracted from reference files - vector _reflengths; - vector > _reftokens; + virtual void whoami() { + cerr << "I AM PerScorer" << std::endl; + } + + size_t NumberOfScores() const { + cerr << "PerScorer: 3" << endl; + return 3; + }; + bool useAlignment() const { + return false; + }; + + +protected: + + virtual float calculateScore(const vector& comps) ; + +private: + + //no copy + PerScorer(const PerScorer&); + ~PerScorer() {}; + PerScorer& operator=(const PerScorer&); + + // data extracted from reference files + vector _reflengths; + vector > _reftokens; }; #endif //__PERSCORER_H diff --git a/mert/Permutation.cpp b/mert/Permutation.cpp index 6f7d442cb..f03d54477 100644 --- a/mert/Permutation.cpp +++ b/mert/Permutation.cpp @@ -16,27 +16,26 @@ using namespace std; Permutation::Permutation(const string &alignment, const int sourceLength, const int targetLength ) { - if (sourceLength > 0) - { - set(alignment, sourceLength); - } - m_targetLength = targetLength; + if (sourceLength > 0) { + set(alignment, sourceLength); + } + m_targetLength = targetLength; } size_t Permutation::getLength() const { - return int(m_array.size()); + return int(m_array.size()); } void Permutation::dump() const { - int j=0; - for (vector::const_iterator i = m_array.begin(); i !=m_array.end(); i++){ - cout << "("; - cout << j << ":" << *i ; - cout << "), "; - j++; - } - cout << endl; + int j=0; + for (vector::const_iterator i = m_array.begin(); i !=m_array.end(); i++) { + cout << "("; + cout << j << ":" << *i ; + cout << "), "; + j++; + } + cout << endl; } @@ -49,286 +48,272 @@ void Permutation::dump() const void Permutation::set(const string & alignment,const int sourceLength) { - //cout << "******** Permutation::set :" << alignment << ": len : " << sourceLength < tokens; // Create vector to hold our words + while (ss >> buf) + tokens.push_back(buf); + + vector tempPerm(sourceLength, -1); + //Set tempPerm to have one target position per source position + for (size_t i=0; i sourceLength) { + cerr << "Source sentence length:" << sourceLength << " is smaller than alignment source position:" << sourcePos << endl; + exit(1); } - - //Tokenise on whitespace - string buf; // Have a buffer string - stringstream ss(alignment); // Insert the string into a stream - vector tokens; // Create vector to hold our words - while (ss >> buf) - tokens.push_back(buf); - - vector tempPerm(sourceLength, -1); - //Set tempPerm to have one target position per source position - for (size_t i=0; i sourceLength) { - cerr << "Source sentence length:" << sourceLength << " is smaller than alignment source position:" << sourcePos << endl; - exit(1); - } - //If have multiple target pos aligned to one source, - // then ignore all but first alignment - if (tempPerm[sourcePos] == -1 || tempPerm[sourcePos] > targetPos) - { - tempPerm[sourcePos] = targetPos; - } + //If have multiple target pos aligned to one source, + // then ignore all but first alignment + if (tempPerm[sourcePos] == -1 || tempPerm[sourcePos] > targetPos) { + tempPerm[sourcePos] = targetPos; } + } - //TODO - //Set final permutation in m_array - //Take care of: source - null - // multiple_source - one target - // unaligned target - // Input: 1-9 2-1 4-3 4-4 5-6 6-6 7-6 8-8 - // Convert source: 1 2 3 4 5 6 7 8 - // target: 9 1 -1 3 6 6 6 8 -> 8 1 2 3 4 5 6 7 + //TODO + //Set final permutation in m_array + //Take care of: source - null + // multiple_source - one target + // unaligned target + // Input: 1-9 2-1 4-3 4-4 5-6 6-6 7-6 8-8 + // Convert source: 1 2 3 4 5 6 7 8 + // target: 9 1 -1 3 6 6 6 8 -> 8 1 2 3 4 5 6 7 - // 1st step: Add null aligned source to previous alignment - // target: 9 1 -1 3 6 6 6 8 -> 9 1 1 3 6 6 6 8 - int last=0; - m_array.assign(sourceLength, -1); - //get a searcheable index - multimap invMap; - multimap::iterator it; - //cout << " SourceP -> TargetP " << endl; - for (size_t i=0; i " << tempPerm[i] << endl; - //Key is target pos, value is source pos - invMap.insert(pair(tempPerm[i],int(i))); + // 1st step: Add null aligned source to previous alignment + // target: 9 1 -1 3 6 6 6 8 -> 9 1 1 3 6 6 6 8 + int last=0; + m_array.assign(sourceLength, -1); + //get a searcheable index + multimap invMap; + multimap::iterator it; + //cout << " SourceP -> TargetP " << endl; + for (size_t i=0; i " << tempPerm[i] << endl; + //Key is target pos, value is source pos + invMap.insert(pair(tempPerm[i],int(i))); + } - // 2nd step: Get target into index of multimap and sort - // Convert source: 1 2 3 4 5 6 7 8 - // target: 9 1 0 3 6 6 6 8 -> 0 1 3 6 6 6 8 9 - // source: 3 2 4 5 6 7 8 1 - int i=0; - //cout << " TargetP => SourceP : TargetIndex " << endl; - for ( it=invMap.begin() ; it != invMap.end(); it++ ) - { - //cout << (*it).first << " => " << (*it).second << " : " << i << endl; - //find source position - m_array[(*it).second] = i; - i++; - } + // 2nd step: Get target into index of multimap and sort + // Convert source: 1 2 3 4 5 6 7 8 + // target: 9 1 0 3 6 6 6 8 -> 0 1 3 6 6 6 8 9 + // source: 3 2 4 5 6 7 8 1 + int i=0; + //cout << " TargetP => SourceP : TargetIndex " << endl; + for ( it=invMap.begin() ; it != invMap.end(); it++ ) { + //cout << (*it).first << " => " << (*it).second << " : " << i << endl; + //find source position + m_array[(*it).second] = i; + i++; + } - bool ok = checkValidPermutation(m_array); - //dump(); - if (!ok) { - throw runtime_error(" Created invalid permutation"); - } + bool ok = checkValidPermutation(m_array); + //dump(); + if (!ok) { + throw runtime_error(" Created invalid permutation"); + } } //Static -vector Permutation::invert(const vector & inVector) +vector Permutation::invert(const vector & inVector) { - vector outVector(inVector.size()); - for (size_t i=0; i outVector(inVector.size()); + for (size_t i=0; i const & inVector) +bool Permutation::checkValidPermutation(vector const & inVector) { - vector test(inVector.size(),-1); - for (size_t i=0; i< inVector.size(); i++){ - //No multiple entries of same value allowed - if (test[inVector[i]] > -1){ - cerr << "Permutation error: multiple entries of same value\n" << endl; - return false; - } - test[inVector[i]] ++; + vector test(inVector.size(),-1); + for (size_t i=0; i< inVector.size(); i++) { + //No multiple entries of same value allowed + if (test[inVector[i]] > -1) { + cerr << "Permutation error: multiple entries of same value\n" << endl; + return false; } - for (size_t i=0; i compareArray = compare.getArray(); - if (getLength() != compare.getLength()) { - cerr << "1stperm: " << getLength() << " 2ndperm: " << compare.getLength() << endl; - throw runtime_error("Length of permutations not equal"); + float score=0; + vector compareArray = compare.getArray(); + if (getLength() != compare.getLength()) { + cerr << "1stperm: " << getLength() << " 2ndperm: " << compare.getLength() << endl; + throw runtime_error("Length of permutations not equal"); + } + if (getLength() == 0) { + cerr << "Empty permutation" << endl; + return 0; + } + for (size_t i=0; i compareArray = compare.getArray(); - if (getLength() != compare.getLength()) { - cerr << "1stperm: " << getLength() << " 2ndperm: " << compare.getLength() << endl; - throw runtime_error("Length of permutations not equal"); + float score=0; + vector compareArray = compare.getArray(); + if (getLength() != compare.getLength()) { + cerr << "1stperm: " << getLength() << " 2ndperm: " << compare.getLength() << endl; + throw runtime_error("Length of permutations not equal"); + } + if (getLength() == 0) { + cerr << "Empty permutation" << endl; + return 0; + } + for (size_t i=0; i compareArray[j])) { + score++; + } } - if (getLength() == 0) { - cerr << "Empty permutation" << endl; - return 0; - } - for (size_t i=0; i compareArray[j])) - { - score++; - } - } - } - score = (score / ((getLength()*getLength() - getLength()) /2 ) ); - //Adjusted Kendall's tau correlates better with human judgements - score = sqrt (score); - score = 1 - score; + } + score = (score / ((getLength()*getLength() - getLength()) /2 ) ); + //Adjusted Kendall's tau correlates better with human judgements + score = sqrt (score); + score = 1 - score; - return score; + return score; } vector Permutation::getArray() const { - vector ret = m_array; - return ret; + vector ret = m_array; + return ret; } //Static -//This function is called with test which is +//This function is called with test which is // the 5th field in moses nbest output when called with -include-alignment-in-n-best //eg. 0=0 1-2=1-2 3=3 4=4 5=5 6=6 7-9=7-8 10=9 11-13=10-11 (source-target) string Permutation::convertMosesToStandard(string const & alignment) { - if (alignment.length() == 0) - { + if (alignment.length() == 0) { cerr << "Alignment input string empty" << endl; } string working = alignment; string out; stringstream oss; - while (working.length() > 0) - { - string align; - getNextPound(working,align," "); + while (working.length() > 0) { + string align; + getNextPound(working,align," "); - //If found an alignment - if (align.length() > 0) - { - size_t posDelimeter = align.find("="); - if(posDelimeter== string::npos) - { - cerr << "Delimiter not found = :"<< align << endl; - exit(0); - } - int firstSourcePos,lastSourcePos,firstTargetPos,lastTargetPos; - string sourcePoss = align.substr(0, posDelimeter); - string targetPoss = align.substr(posDelimeter+1); - posDelimeter = sourcePoss.find("-"); - if(posDelimeter < string::npos) { - firstSourcePos = atoi((sourcePoss.substr(0, posDelimeter)).c_str()); - lastSourcePos = atoi((sourcePoss.substr(posDelimeter+1)).c_str()); - } else { - firstSourcePos = atoi(sourcePoss.c_str()); - lastSourcePos = firstSourcePos; - } - posDelimeter = targetPoss.find("-"); - if(posDelimeter < string::npos) { - firstTargetPos = atoi((targetPoss.substr(0, posDelimeter)).c_str()); - lastTargetPos = atoi((targetPoss.substr(posDelimeter+1)).c_str()); - } else { - firstTargetPos = atoi(targetPoss.c_str()); - lastTargetPos = firstTargetPos; - } - for (int i = firstSourcePos; i <= lastSourcePos; i++) { - for (int j = firstTargetPos; j <= lastTargetPos; j++) { - oss << i << "-" << j << " "; - } - } + //If found an alignment + if (align.length() > 0) { + size_t posDelimeter = align.find("="); + if(posDelimeter== string::npos) { + cerr << "Delimiter not found = :"<< align << endl; + exit(0); + } + int firstSourcePos,lastSourcePos,firstTargetPos,lastTargetPos; + string sourcePoss = align.substr(0, posDelimeter); + string targetPoss = align.substr(posDelimeter+1); + posDelimeter = sourcePoss.find("-"); + if(posDelimeter < string::npos) { + firstSourcePos = atoi((sourcePoss.substr(0, posDelimeter)).c_str()); + lastSourcePos = atoi((sourcePoss.substr(posDelimeter+1)).c_str()); + } else { + firstSourcePos = atoi(sourcePoss.c_str()); + lastSourcePos = firstSourcePos; + } + posDelimeter = targetPoss.find("-"); + if(posDelimeter < string::npos) { + firstTargetPos = atoi((targetPoss.substr(0, posDelimeter)).c_str()); + lastTargetPos = atoi((targetPoss.substr(posDelimeter+1)).c_str()); + } else { + firstTargetPos = atoi(targetPoss.c_str()); + lastTargetPos = firstTargetPos; + } + for (int i = firstSourcePos; i <= lastSourcePos; i++) { + for (int j = firstTargetPos; j <= lastTargetPos; j++) { + oss << i << "-" << j << " "; + } + } - } //else case where two spaces ? + } //else case where two spaces ? } out = oss.str(); - //cout << "ConverttoStandard: " << out << endl; + //cout << "ConverttoStandard: " << out << endl; return out; } diff --git a/mert/Permutation.h b/mert/Permutation.h index df5610133..7c7828387 100644 --- a/mert/Permutation.h +++ b/mert/Permutation.h @@ -20,41 +20,45 @@ class Permutation { - + public: - //Can be HAMMING_DISTANCE or KENDALLS_DISTANCE - Permutation(const std::string &alignment = std::string(), const int sourceLength = 0, const int targetLength = 0 ); - - ~Permutation(){}; - - inline void clear() { m_array.clear(); } - inline size_t size(){ return m_array.size(); } + //Can be HAMMING_DISTANCE or KENDALLS_DISTANCE + Permutation(const std::string &alignment = std::string(), const int sourceLength = 0, const int targetLength = 0 ); + + ~Permutation() {}; + + inline void clear() { + m_array.clear(); + } + inline size_t size() { + return m_array.size(); + } - void set(const std::string &alignment,const int sourceLength); + void set(const std::string &alignment,const int sourceLength); - float distance(const Permutation &permCompare, const distanceMetric_t &strategy = HAMMING_DISTANCE) const; + float distance(const Permutation &permCompare, const distanceMetric_t &strategy = HAMMING_DISTANCE) const; - //Const - void dump() const; - size_t getLength() const; - vector getArray() const; - int getTargetLength() const { - return m_targetLength; - } + //Const + void dump() const; + size_t getLength() const; + vector getArray() const; + int getTargetLength() const { + return m_targetLength; + } - //Static - static std::string convertMosesToStandard(std::string const & alignment); - static vector invert(vector const & inVector); - static bool checkValidPermutation(vector const & inVector); + //Static + static std::string convertMosesToStandard(std::string const & alignment); + static vector invert(vector const & inVector); + static bool checkValidPermutation(vector const & inVector); protected: - vector m_array; - int m_targetLength; - float calculateHamming(const Permutation & compare) const; - float calculateKendall(const Permutation & compare) const; - + vector m_array; + int m_targetLength; + float calculateHamming(const Permutation & compare) const; + float calculateKendall(const Permutation & compare) const; + private: }; diff --git a/mert/PermutationScorer.cpp b/mert/PermutationScorer.cpp index 0c172e87a..0dfe7eba5 100644 --- a/mert/PermutationScorer.cpp +++ b/mert/PermutationScorer.cpp @@ -4,215 +4,212 @@ using namespace std; const int PermutationScorer::SCORE_PRECISION = 5; -PermutationScorer::PermutationScorer(const string &distanceMetric, const string &config) -:SentenceLevelScorer(distanceMetric,config) +PermutationScorer::PermutationScorer(const string &distanceMetric, const string &config) + :SentenceLevelScorer(distanceMetric,config) { - //configure regularisation + //configure regularisation - static string KEY_REFCHOICE = "refchoice"; - static string REFCHOICE_AVERAGE = "average"; - static string REFCHOICE_CLOSEST = "closest"; - - string refchoice = getConfig(KEY_REFCHOICE,REFCHOICE_CLOSEST); - if (refchoice == REFCHOICE_AVERAGE) { - m_refChoiceStrategy = REFERENCE_CHOICE_AVERAGE; - } else if (refchoice == REFCHOICE_CLOSEST) { - m_refChoiceStrategy = REFERENCE_CHOICE_CLOSEST; - } else { - throw runtime_error("Unknown reference choice strategy: " + refchoice); - } - cerr << "Using reference choice strategy: " << refchoice << endl; + static string KEY_REFCHOICE = "refchoice"; + static string REFCHOICE_AVERAGE = "average"; + static string REFCHOICE_CLOSEST = "closest"; - if (distanceMetric.compare("HAMMING") == 0) { - m_distanceMetric = HAMMING_DISTANCE; - } else if (distanceMetric.compare("KENDALL") == 0) { - m_distanceMetric = KENDALL_DISTANCE; - } - cerr << "Using permutation distance metric: " << distanceMetric << endl; + string refchoice = getConfig(KEY_REFCHOICE,REFCHOICE_CLOSEST); + if (refchoice == REFCHOICE_AVERAGE) { + m_refChoiceStrategy = REFERENCE_CHOICE_AVERAGE; + } else if (refchoice == REFCHOICE_CLOSEST) { + m_refChoiceStrategy = REFERENCE_CHOICE_CLOSEST; + } else { + throw runtime_error("Unknown reference choice strategy: " + refchoice); + } + cerr << "Using reference choice strategy: " << refchoice << endl; - //Get reference alignments from scconfig refalign option - static string KEY_ALIGNMENT_FILES = "refalign"; - string refalign = getConfig(KEY_ALIGNMENT_FILES,""); - //cout << refalign << endl; - if (refalign.length() > 0){ - string substring; - while (!refalign.empty()){ - getNextPound(refalign, substring, "+"); - m_referenceAlignments.push_back(substring); - } - } + if (distanceMetric.compare("HAMMING") == 0) { + m_distanceMetric = HAMMING_DISTANCE; + } else if (distanceMetric.compare("KENDALL") == 0) { + m_distanceMetric = KENDALL_DISTANCE; + } + cerr << "Using permutation distance metric: " << distanceMetric << endl; - //Get length of source sentences read in from scconfig source option - // this is essential for extractor but unneccesary for mert executable - static string KEY_SOURCE_FILE = "source"; - string sourceFile = getConfig(KEY_SOURCE_FILE,""); - if (sourceFile.length() > 0) { - cerr << "Loading source sentence lengths from " << sourceFile << endl; - ifstream sourcein(sourceFile.c_str()); - if (!sourcein) { - throw runtime_error("Unable to open: " + sourceFile); - } - string line; - while (getline(sourcein,line)) { - size_t wordNumber = 0; - string word; - while(!line.empty()){ - getNextPound(line, word, " "); - wordNumber++; - } - m_sourceLengths.push_back(wordNumber); - } - sourcein.close(); + //Get reference alignments from scconfig refalign option + static string KEY_ALIGNMENT_FILES = "refalign"; + string refalign = getConfig(KEY_ALIGNMENT_FILES,""); + //cout << refalign << endl; + if (refalign.length() > 0) { + string substring; + while (!refalign.empty()) { + getNextPound(refalign, substring, "+"); + m_referenceAlignments.push_back(substring); } + } + + //Get length of source sentences read in from scconfig source option + // this is essential for extractor but unneccesary for mert executable + static string KEY_SOURCE_FILE = "source"; + string sourceFile = getConfig(KEY_SOURCE_FILE,""); + if (sourceFile.length() > 0) { + cerr << "Loading source sentence lengths from " << sourceFile << endl; + ifstream sourcein(sourceFile.c_str()); + if (!sourcein) { + throw runtime_error("Unable to open: " + sourceFile); + } + string line; + while (getline(sourcein,line)) { + size_t wordNumber = 0; + string word; + while(!line.empty()) { + getNextPound(line, word, " "); + wordNumber++; + } + m_sourceLengths.push_back(wordNumber); + } + sourcein.close(); + } } -void PermutationScorer::setReferenceFiles(const vector& referenceFiles) { - cout << "*******setReferenceFiles" << endl; - //make sure reference data is clear - m_referencePerms.clear(); +void PermutationScorer::setReferenceFiles(const vector& referenceFiles) +{ + cout << "*******setReferenceFiles" << endl; + //make sure reference data is clear + m_referencePerms.clear(); - vector< vector< int> > targetLengths; - //Just getting target length from reference text file - for (size_t i = 0; i < referenceFiles.size(); ++i) - { - vector lengths; - cout << "Loading reference from " << referenceFiles[i] << endl; - ifstream refin(referenceFiles[i].c_str()); - if (!refin) - { - cerr << "Unable to open: " << referenceFiles[i] << endl; - throw runtime_error("Unable to open alignment file"); - } - string line; - while (getline(refin,line)) - { - int count = getNumberWords(line); - lengths.push_back(count); - } - targetLengths.push_back(lengths); + vector< vector< int> > targetLengths; + //Just getting target length from reference text file + for (size_t i = 0; i < referenceFiles.size(); ++i) { + vector lengths; + cout << "Loading reference from " << referenceFiles[i] << endl; + ifstream refin(referenceFiles[i].c_str()); + if (!refin) { + cerr << "Unable to open: " << referenceFiles[i] << endl; + throw runtime_error("Unable to open alignment file"); } + string line; + while (getline(refin,line)) { + int count = getNumberWords(line); + lengths.push_back(count); + } + targetLengths.push_back(lengths); + } - //load reference data - //NOTE ignoring normal reference file, only using previously saved alignment reference files - for (size_t i = 0; i < m_referenceAlignments.size(); ++i) - { - vector referencePerms; - cout << "Loading reference from " << m_referenceAlignments[i] << endl; - ifstream refin(m_referenceAlignments[i].c_str()); - if (!refin) - { - cerr << "Unable to open: " << m_referenceAlignments[i] << endl; - throw runtime_error("Unable to open alignment file"); - } - string line; - size_t sid = 0; //sentence counter - while (getline(refin,line)) - { - //cout << line << endl; - - //Line needs to be of the format: 0-0 1-1 1-2 etc source-target - Permutation perm(line, m_sourceLengths[sid],targetLengths[i][sid]); - //perm.dump(); - referencePerms.push_back(perm); - //check the source sentence length is the same for previous file - if (perm.getLength() != m_sourceLengths[sid]) - { - cerr << "Permutation Length: " << perm.getLength() << endl; - cerr << "Source length: " << m_sourceLengths[sid] << " for sid " << sid << endl; - throw runtime_error("Source sentence lengths not the same: "); - } - - sid++; - } - m_referencePerms.push_back(referencePerms); + //load reference data + //NOTE ignoring normal reference file, only using previously saved alignment reference files + for (size_t i = 0; i < m_referenceAlignments.size(); ++i) { + vector referencePerms; + cout << "Loading reference from " << m_referenceAlignments[i] << endl; + ifstream refin(m_referenceAlignments[i].c_str()); + if (!refin) { + cerr << "Unable to open: " << m_referenceAlignments[i] << endl; + throw runtime_error("Unable to open alignment file"); } + string line; + size_t sid = 0; //sentence counter + while (getline(refin,line)) { + //cout << line << endl; + + //Line needs to be of the format: 0-0 1-1 1-2 etc source-target + Permutation perm(line, m_sourceLengths[sid],targetLengths[i][sid]); + //perm.dump(); + referencePerms.push_back(perm); + //check the source sentence length is the same for previous file + if (perm.getLength() != m_sourceLengths[sid]) { + cerr << "Permutation Length: " << perm.getLength() << endl; + cerr << "Source length: " << m_sourceLengths[sid] << " for sid " << sid << endl; + throw runtime_error("Source sentence lengths not the same: "); + } + + sid++; + } + m_referencePerms.push_back(referencePerms); + } } -int PermutationScorer::getNumberWords (const string& text) const { - int count = 0; - string line = trimStr(text); - if (line.length()>0) { - int pos = line.find(" "); - while (pos!=int(string::npos)){ - count++; - pos = line.find(" ",pos+1); - } - count++; +int PermutationScorer::getNumberWords (const string& text) const +{ + int count = 0; + string line = trimStr(text); + if (line.length()>0) { + int pos = line.find(" "); + while (pos!=int(string::npos)) { + count++; + pos = line.find(" ",pos+1); } - return count; + count++; + } + return count; } -void PermutationScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry) { - //cout << "*******prepareStats" ; - //cout << text << endl; - //cout << sid << endl; - //cout << "Reference0align:" << endl; - //m_referencePerms[0][sid].dump(); +void PermutationScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry) +{ + //cout << "*******prepareStats" ; + //cout << text << endl; + //cout << sid << endl; + //cout << "Reference0align:" << endl; + //m_referencePerms[0][sid].dump(); - string sentence = ""; - string align = text; - size_t alignmentData = text.find("|||"); - //Get sentence and alignment parts - if(alignmentData != string::npos) { - getNextPound(align,sentence, "|||"); - } else { - align = text; + string sentence = ""; + string align = text; + size_t alignmentData = text.find("|||"); + //Get sentence and alignment parts + if(alignmentData != string::npos) { + getNextPound(align,sentence, "|||"); + } else { + align = text; + } + int translationLength = getNumberWords(sentence); + + + //A vector of Permutations for each sentence + vector< vector > nBestPerms; + float distanceValue; + + //need to create permutations for each nbest line + string standardFormat = Permutation::convertMosesToStandard(align); + Permutation perm(standardFormat, m_sourceLengths[sid],translationLength); + //perm.dump(); + + if (m_refChoiceStrategy == REFERENCE_CHOICE_AVERAGE) { + float total = 0; + for (size_t i = 0; i < m_referencePerms.size(); ++i) { + float dist = perm.distance(m_referencePerms[i][sid], m_distanceMetric); + total += dist; + //cout << "Ref number: " << i << " distance: " << dist << endl; } - int translationLength = getNumberWords(sentence); + float mean = (float)total/m_referencePerms.size(); + //cout << "MultRef strategy AVERAGE: total " << total << " mean " << mean << " number " << m_referencePerms.size() << endl; + distanceValue = mean; + } else if (m_refChoiceStrategy == REFERENCE_CHOICE_CLOSEST) { + float max_val = 0; - - //A vector of Permutations for each sentence - vector< vector > nBestPerms; - float distanceValue; - - //need to create permutations for each nbest line - string standardFormat = Permutation::convertMosesToStandard(align); - Permutation perm(standardFormat, m_sourceLengths[sid],translationLength); - //perm.dump(); - - if (m_refChoiceStrategy == REFERENCE_CHOICE_AVERAGE) { - float total = 0; - for (size_t i = 0; i < m_referencePerms.size(); ++i) { - float dist = perm.distance(m_referencePerms[i][sid], m_distanceMetric); - total += dist; - //cout << "Ref number: " << i << " distance: " << dist << endl; - } - float mean = (float)total/m_referencePerms.size(); - //cout << "MultRef strategy AVERAGE: total " << total << " mean " << mean << " number " << m_referencePerms.size() << endl; - distanceValue = mean; - } else if (m_refChoiceStrategy == REFERENCE_CHOICE_CLOSEST) { - float max_val = 0; - - for (size_t i = 0; i < m_referencePerms.size(); ++i) { - //look for the closest reference - float value = perm.distance(m_referencePerms[i][sid], m_distanceMetric); - //cout << "Ref number: " << i << " distance: " << value << endl; - if (value > max_val) { - max_val = value; - } - } - distanceValue = max_val; - //cout << "MultRef strategy CLOSEST: max_val " << distanceValue << endl; - } else { - throw runtime_error("Unsupported reflength strategy"); + for (size_t i = 0; i < m_referencePerms.size(); ++i) { + //look for the closest reference + float value = perm.distance(m_referencePerms[i][sid], m_distanceMetric); + //cout << "Ref number: " << i << " distance: " << value << endl; + if (value > max_val) { + max_val = value; + } } + distanceValue = max_val; + //cout << "MultRef strategy CLOSEST: max_val " << distanceValue << endl; + } else { + throw runtime_error("Unsupported reflength strategy"); + } - //SCOREROUT eg: 0.04546 - ostringstream tempStream; - tempStream.precision(SCORE_PRECISION); - tempStream << distanceValue; - string str = tempStream.str(); - entry.set(str); - - //cout << tempStream.str(); + //SCOREROUT eg: 0.04546 + ostringstream tempStream; + tempStream.precision(SCORE_PRECISION); + tempStream << distanceValue; + string str = tempStream.str(); + entry.set(str); + + //cout << tempStream.str(); } //Will just be final score -statscore_t PermutationScorer::calculateScore(const vector& comps) { - //cerr << "*******PermutationScorer::calculateScore" ; - //cerr << " " << comps[0] << endl; - return comps[0]; +statscore_t PermutationScorer::calculateScore(const vector& comps) +{ + //cerr << "*******PermutationScorer::calculateScore" ; + //cerr << " " << comps[0] << endl; + return comps[0]; } diff --git a/mert/PermutationScorer.h b/mert/PermutationScorer.h index dc0814d6d..d307b57fe 100644 --- a/mert/PermutationScorer.h +++ b/mert/PermutationScorer.h @@ -17,44 +17,44 @@ #include "Permutation.h" /** - * Permutation + * Permutation **/ -class PermutationScorer: public SentenceLevelScorer +class PermutationScorer: public SentenceLevelScorer { - public: - PermutationScorer(const string &distanceMetric = "HAMMING", - const string &config = string()); - void setReferenceFiles(const vector& referenceFiles); - void prepareStats(size_t sid, const string& text, ScoreStats& entry); - static const int SCORE_PRECISION; - - size_t NumberOfScores() const { - //cerr << "PermutationScorer number of scores: 1" << endl; - return 1; - }; - bool useAlignment() const { - //cout << "PermutationScorer::useAlignment returning true" << endl; - return true; - }; - - protected: - statscore_t calculateScore(const vector& scores); - PermutationScorer(const PermutationScorer&); - ~PermutationScorer(){}; - PermutationScorer& operator=(const PermutationScorer&); - int getNumberWords (const string & line) const; +public: + PermutationScorer(const string &distanceMetric = "HAMMING", + const string &config = string()); + void setReferenceFiles(const vector& referenceFiles); + void prepareStats(size_t sid, const string& text, ScoreStats& entry); + static const int SCORE_PRECISION; - distanceMetricReferenceChoice_t m_refChoiceStrategy; - distanceMetric_t m_distanceMetric; - - // data extracted from reference files - // A vector of permutations for each reference file - vector< vector > m_referencePerms; - vector m_sourceLengths; - vector m_referenceAlignments; - - private: + size_t NumberOfScores() const { + //cerr << "PermutationScorer number of scores: 1" << endl; + return 1; + }; + bool useAlignment() const { + //cout << "PermutationScorer::useAlignment returning true" << endl; + return true; + }; + +protected: + statscore_t calculateScore(const vector& scores); + PermutationScorer(const PermutationScorer&); + ~PermutationScorer() {}; + PermutationScorer& operator=(const PermutationScorer&); + int getNumberWords (const string & line) const; + + distanceMetricReferenceChoice_t m_refChoiceStrategy; + distanceMetric_t m_distanceMetric; + + // data extracted from reference files + // A vector of permutations for each reference file + vector< vector > m_referencePerms; + vector m_sourceLengths; + vector m_referenceAlignments; + +private: }; //TODO need to read in floats for scores - necessary for selecting mean reference strategy and for BLEU? diff --git a/mert/Point.cpp b/mert/Point.cpp index dc88cba20..af57aaa98 100644 --- a/mert/Point.cpp +++ b/mert/Point.cpp @@ -10,22 +10,24 @@ vector Point::optindices; unsigned Point::dim=0; map Point::fixedweights; - + unsigned Point::pdim=0; unsigned Point::ncall=0; -void Point::Randomize(const vector& min,const vector& max){ +void Point::Randomize(const vector& min,const vector& max) +{ assert(min.size()==Point::dim); assert(max.size()==Point::dim); for (unsigned int i=0; i& init):vector(Point::dim){ - if(init.size()==dim){ +Point::Point(const vector& init):vector(Point::dim) +{ + if(init.size()==dim) { for (unsigned int i=0; i& init):vector(Point::dim){ }; -double Point::operator*(const FeatureStats& F)const{ +double Point::operator*(const FeatureStats& F)const +{ ncall++;//to track performance double prod=0.0; if(OptimizeAll()) for (unsigned i=0; i::iterator it=fixedweights.begin();it!=fixedweights.end();it++) + for(map::iterator it=fixedweights.begin(); it!=fixedweights.end(); it++) prod+=it->second*F.get(it->first); } return prod; } -Point Point::operator+(const Point& p2)const{ +Point Point::operator+(const Point& p2)const +{ assert(p2.size()==size()); Point Res(*this); - for(unsigned i=0;i::max(); return Res; }; -Point Point::operator*(float l)const{ +Point Point::operator*(float l)const +{ Point Res(*this); - for(unsigned i=0;i::max(); return Res; }; - ostream& operator<<(ostream& o,const Point& P){ - vector w=P.GetAllWeights(); +ostream& operator<<(ostream& o,const Point& P) +{ + vector w=P.GetAllWeights(); // o << "[" << Point::pdim << "] "; - for(unsigned int i=0;i " << P.GetScore(); - return o; + return o; }; -vector Point::GetAllWeights()const{ +vector Point::GetAllWeights()const +{ vector w; - if(OptimizeAll()){ + if(OptimizeAll()) { w=*this; - }else{ + } else { w.resize(pdim); for (unsigned int i=0; i::iterator it=fixedweights.begin();it!=fixedweights.end();it++) - w[it->first]=it->second; + for(map::iterator it=fixedweights.begin(); it!=fixedweights.end(); it++) + w[it->first]=it->second; } return w; }; - + diff --git a/mert/Point.h b/mert/Point.h index 00dc7968f..9a4d3b5b0 100644 --- a/mert/Point.h +++ b/mert/Point.h @@ -10,9 +10,10 @@ class Optimizer; /**class that handle the subset of the Feature weight on which we run the optimization*/ -class Point:public vector{ +class Point:public vector +{ friend class Optimizer; - private: +private: /**The indices over which we optimize*/ static vector optindices; /**dimension of optindices and of the parent vector*/ @@ -22,12 +23,18 @@ class Point:public vector{ /**total size of the parameter space; we have pdim=FixedWeight.size()+optinidices.size()*/ static unsigned int pdim; static unsigned int ncall; - public: - static unsigned int getdim(){return dim;} - static unsigned int getpdim(){return pdim;} - static bool OptimizeAll(){return fixedweights.empty();}; +public: + static unsigned int getdim() { + return dim; + } + static unsigned int getpdim() { + return pdim; + } + static bool OptimizeAll() { + return fixedweights.empty(); + }; statscore_t score; - Point():vector(dim){}; + Point():vector(dim) {}; Point(const vector& init); void Randomize(const vector& min,const vector& max); @@ -36,12 +43,16 @@ class Point:public vector{ Point operator*(float)const; /**write the Whole featureweight to a stream (ie pdim float)*/ friend ostream& operator<<(ostream& o,const Point& P); - void Normalize(){ NormalizeL2(); }; + void Normalize() { + NormalizeL2(); + }; void NormalizeL2(); void NormalizeL1(); /**return a vector of size pdim where all weights have been put(including fixed ones)*/ vector GetAllWeights()const; - statscore_t GetScore()const { return score; }; + statscore_t GetScore()const { + return score; + }; }; #endif diff --git a/mert/ScoreArray.cpp b/mert/ScoreArray.cpp index 5f10131c0..92824c818 100644 --- a/mert/ScoreArray.cpp +++ b/mert/ScoreArray.cpp @@ -15,134 +15,134 @@ ScoreArray::ScoreArray(): idx("") void ScoreArray::savetxt(std::ofstream& outFile, const std::string& sctype) { - outFile << SCORES_TXT_BEGIN << " " << idx << " " << array_.size() - << " " << number_of_scores << " " << sctype << std::endl; - for (scorearray_t::iterator i = array_.begin(); i !=array_.end(); i++){ - i->savetxt(outFile); - outFile << std::endl; - } - outFile << SCORES_TXT_END << std::endl; + outFile << SCORES_TXT_BEGIN << " " << idx << " " << array_.size() + << " " << number_of_scores << " " << sctype << std::endl; + for (scorearray_t::iterator i = array_.begin(); i !=array_.end(); i++) { + i->savetxt(outFile); + outFile << std::endl; + } + outFile << SCORES_TXT_END << std::endl; } void ScoreArray::savebin(std::ofstream& outFile, const std::string& sctype) { - outFile << SCORES_BIN_BEGIN << " " << idx << " " << array_.size() - << " " << number_of_scores << " " << sctype << std::endl; - for (scorearray_t::iterator i = array_.begin(); i !=array_.end(); i++) - i->savebin(outFile); - - outFile << SCORES_BIN_END << std::endl; + outFile << SCORES_BIN_BEGIN << " " << idx << " " << array_.size() + << " " << number_of_scores << " " << sctype << std::endl; + for (scorearray_t::iterator i = array_.begin(); i !=array_.end(); i++) + i->savebin(outFile); + + outFile << SCORES_BIN_END << std::endl; } void ScoreArray::save(std::ofstream& inFile, const std::string& sctype, bool bin) { - if (size()>0) - (bin)?savebin(inFile, sctype):savetxt(inFile, sctype); + if (size()>0) + (bin)?savebin(inFile, sctype):savetxt(inFile, sctype); } void ScoreArray::save(const std::string &file, const std::string& sctype, bool bin) { - std::ofstream outFile(file.c_str(), std::ios::out); // matches a stream with a file. Opens the file + std::ofstream outFile(file.c_str(), std::ios::out); // matches a stream with a file. Opens the file - save(outFile, sctype, bin); + save(outFile, sctype, bin); - outFile.close(); + outFile.close(); } void ScoreArray::loadbin(ifstream& inFile, size_t n) { - ScoreStats entry(number_of_scores); - - for (size_t i=0 ; i < n; i++){ - entry.loadbin(inFile); - add(entry); - } + ScoreStats entry(number_of_scores); + + for (size_t i=0 ; i < n; i++) { + entry.loadbin(inFile); + add(entry); + } } void ScoreArray::loadtxt(ifstream& inFile, size_t n) { - ScoreStats entry(number_of_scores); - - for (size_t i=0 ; i < n; i++){ - entry.loadtxt(inFile); - add(entry); - } + ScoreStats entry(number_of_scores); + + for (size_t i=0 ; i < n; i++) { + entry.loadtxt(inFile); + add(entry); + } } void ScoreArray::load(ifstream& inFile) { size_t number_of_entries=0; - bool binmode=false; - - std::string substring, stringBuf; + bool binmode=false; + + std::string substring, stringBuf; std::string::size_type loc; - - std::getline(inFile, stringBuf); - if (!inFile.good()){ - return; - } - - if (!stringBuf.empty()){ - if ((loc = stringBuf.find(SCORES_TXT_BEGIN)) == 0){ - binmode=false; - }else if ((loc = stringBuf.find(SCORES_BIN_BEGIN)) == 0){ - binmode=true; - }else{ - TRACE_ERR("ERROR: ScoreArray::load(): Wrong header"); - return; - } - getNextPound(stringBuf, substring); - getNextPound(stringBuf, substring); - idx = substring; - getNextPound(stringBuf, substring); + + std::getline(inFile, stringBuf); + if (!inFile.good()) { + return; + } + + if (!stringBuf.empty()) { + if ((loc = stringBuf.find(SCORES_TXT_BEGIN)) == 0) { + binmode=false; + } else if ((loc = stringBuf.find(SCORES_BIN_BEGIN)) == 0) { + binmode=true; + } else { + TRACE_ERR("ERROR: ScoreArray::load(): Wrong header"); + return; + } + getNextPound(stringBuf, substring); + getNextPound(stringBuf, substring); + idx = substring; + getNextPound(stringBuf, substring); number_of_entries = atoi(substring.c_str()); - getNextPound(stringBuf, substring); + getNextPound(stringBuf, substring); number_of_scores = atoi(substring.c_str()); - getNextPound(stringBuf, substring); - score_type = substring; - } - - (binmode)?loadbin(inFile, number_of_entries):loadtxt(inFile, number_of_entries); - - std::getline(inFile, stringBuf); - if (!stringBuf.empty()){ - if ((loc = stringBuf.find(SCORES_TXT_END)) != 0 && (loc = stringBuf.find(SCORES_BIN_END)) != 0){ - TRACE_ERR("ERROR: ScoreArray::load(): Wrong footer"); - return; - } - } + getNextPound(stringBuf, substring); + score_type = substring; + } + + (binmode)?loadbin(inFile, number_of_entries):loadtxt(inFile, number_of_entries); + + std::getline(inFile, stringBuf); + if (!stringBuf.empty()) { + if ((loc = stringBuf.find(SCORES_TXT_END)) != 0 && (loc = stringBuf.find(SCORES_BIN_END)) != 0) { + TRACE_ERR("ERROR: ScoreArray::load(): Wrong footer"); + return; + } + } } void ScoreArray::load(const std::string &file) { - TRACE_ERR("loading data from " << file << std::endl); + TRACE_ERR("loading data from " << file << std::endl); - inputfilestream inFile(file); // matches a stream with a file. Opens the file + inputfilestream inFile(file); // matches a stream with a file. Opens the file - load((ifstream&) inFile); + load((ifstream&) inFile); - inFile.close(); + inFile.close(); } void ScoreArray::merge(ScoreArray& e) { - //dummy implementation - for (size_t i=0; isize()!=sz) - return false; - return true; + size_t sz = NumberOfScores(); + + if (sz == 0) + return true; + + for (scorearray_t::iterator i=array_.begin(); i!=array_.end(); i++) + if (i->size()!=sz) + return false; + return true; } diff --git a/mert/ScoreArray.h b/mert/ScoreArray.h index b3294b56f..71ea2b51a 100644 --- a/mert/ScoreArray.h +++ b/mert/ScoreArray.h @@ -27,52 +27,76 @@ using namespace std; class ScoreArray { protected: - scorearray_t array_; - std::string score_type; - size_t number_of_scores; - -private: - std::string idx; // idx to identify the utterance, it can differ from the index inside the vector + scorearray_t array_; + std::string score_type; + size_t number_of_scores; + +private: + std::string idx; // idx to identify the utterance, it can differ from the index inside the vector + - public: - ScoreArray(); - - ~ScoreArray(){}; - - inline void clear() { array_.clear(); } - - inline std::string getIndex(){ return idx; } - inline void setIndex(const std::string& value){ idx=value; } + ScoreArray(); + + ~ScoreArray() {}; + + inline void clear() { + array_.clear(); + } + + inline std::string getIndex() { + return idx; + } + inline void setIndex(const std::string& value) { + idx=value; + } // inline ScoreStats get(size_t i){ return array_.at(i); } - - inline ScoreStats& get(size_t i){ return array_.at(i); } - inline const ScoreStats& get(size_t i)const{ return array_.at(i); } - void add(const ScoreStats& e){ array_.push_back(e); } + inline ScoreStats& get(size_t i) { + return array_.at(i); + } + inline const ScoreStats& get(size_t i)const { + return array_.at(i); + } - void merge(ScoreArray& e); + void add(const ScoreStats& e) { + array_.push_back(e); + } - inline std::string name() const{ return score_type; }; - inline void name(std::string &sctype){ score_type = sctype; }; + void merge(ScoreArray& e); - inline size_t size(){ return array_.size(); } - inline size_t NumberOfScores() const{ return number_of_scores; } - inline void NumberOfScores(size_t v){ number_of_scores = v; } - - void savetxt(ofstream& outFile, const std::string& sctype); - void savebin(ofstream& outFile, const std::string& sctype); - void save(ofstream& outFile, const std::string& sctype, bool bin=false); - void save(const std::string &file, const std::string& sctype, bool bin=false); - inline void save(const std::string& sctype, bool bin=false){ save("/dev/stdout", sctype, bin); } - - void loadtxt(ifstream& inFile, size_t n); - void loadbin(ifstream& inFile, size_t n); - void load(ifstream& inFile); - void load(const std::string &file); - - bool check_consistency(); + inline std::string name() const { + return score_type; + }; + inline void name(std::string &sctype) { + score_type = sctype; + }; + + inline size_t size() { + return array_.size(); + } + inline size_t NumberOfScores() const { + return number_of_scores; + } + inline void NumberOfScores(size_t v) { + number_of_scores = v; + } + + void savetxt(ofstream& outFile, const std::string& sctype); + void savebin(ofstream& outFile, const std::string& sctype); + void save(ofstream& outFile, const std::string& sctype, bool bin=false); + void save(const std::string &file, const std::string& sctype, bool bin=false); + inline void save(const std::string& sctype, bool bin=false) { + save("/dev/stdout", sctype, bin); + } + + void loadtxt(ifstream& inFile, size_t n); + void loadbin(ifstream& inFile, size_t n); + void load(ifstream& inFile); + void load(const std::string &file); + + bool check_consistency(); }; diff --git a/mert/ScoreData.cpp b/mert/ScoreData.cpp index ba11994f1..561116a38 100644 --- a/mert/ScoreData.cpp +++ b/mert/ScoreData.cpp @@ -13,138 +13,138 @@ ScoreData::ScoreData(Scorer& ptr): -theScorer(&ptr) + theScorer(&ptr) { - score_type = theScorer->getName(); - //theScorer->setScoreData(this);//this is not dangerous: we dont use the this pointer in SetScoreData - number_of_scores = theScorer->NumberOfScores(); - TRACE_ERR("ScoreData: number_of_scores: " << number_of_scores << std::endl); + score_type = theScorer->getName(); + //theScorer->setScoreData(this);//this is not dangerous: we dont use the this pointer in SetScoreData + number_of_scores = theScorer->NumberOfScores(); + TRACE_ERR("ScoreData: number_of_scores: " << number_of_scores << std::endl); }; -void ScoreData::dump() +void ScoreData::dump() { - for (vector::iterator it = array_.begin(); it !=array_.end(); it++){ - cout << "scorearray: " << endl; - for (size_t i = 0; i < (*it).size(); i++) { - ScoreStats scoreStats = (*it).get(i); - cout << "scorestats: " ; - for (size_t j = 0; j < scoreStats.size(); j ++ ){ - ScoreStatsType scoreStatsType = scoreStats.get(j); - cout << scoreStatsType << " " ; - } - cout << endl; - } - } + for (vector::iterator it = array_.begin(); it !=array_.end(); it++) { + cout << "scorearray: " << endl; + for (size_t i = 0; i < (*it).size(); i++) { + ScoreStats scoreStats = (*it).get(i); + cout << "scorestats: " ; + for (size_t j = 0; j < scoreStats.size(); j ++ ) { + ScoreStatsType scoreStatsType = scoreStats.get(j); + cout << scoreStatsType << " " ; + } + cout << endl; + } + } } void ScoreData::save(std::ofstream& outFile, bool bin) { - for (scoredata_t::iterator i = array_.begin(); i !=array_.end(); i++){ - i->save(outFile, score_type, bin); - } + for (scoredata_t::iterator i = array_.begin(); i !=array_.end(); i++) { + i->save(outFile, score_type, bin); + } } void ScoreData::save(const std::string &file, bool bin) { - if (file.empty()) return; - TRACE_ERR("saving the array into " << file << std::endl); + if (file.empty()) return; + TRACE_ERR("saving the array into " << file << std::endl); - std::ofstream outFile(file.c_str(), std::ios::out); // matches a stream with a file. Opens the file + std::ofstream outFile(file.c_str(), std::ios::out); // matches a stream with a file. Opens the file ScoreStats entry; - save(outFile, bin); + save(outFile, bin); - outFile.close(); + outFile.close(); } void ScoreData::load(ifstream& inFile) { ScoreArray entry; - while (!inFile.eof()){ - - if (!inFile.good()){ - std::cerr << "ERROR ScoreData::load inFile.good()" << std::endl; - } - - entry.clear(); - entry.load(inFile); + while (!inFile.eof()) { - if (entry.size() == 0){ - break; - } - add(entry); - } - theScorer->setScoreData(this); + if (!inFile.good()) { + std::cerr << "ERROR ScoreData::load inFile.good()" << std::endl; + } + + entry.clear(); + entry.load(inFile); + + if (entry.size() == 0) { + break; + } + add(entry); + } + theScorer->setScoreData(this); } void ScoreData::load(const std::string &file) { - TRACE_ERR("loading score data from " << file << std::endl); + TRACE_ERR("loading score data from " << file << std::endl); - inputfilestream inFile(file); // matches a stream with a file. Opens the file + inputfilestream inFile(file); // matches a stream with a file. Opens the file - if (!inFile) { - throw runtime_error("Unable to open score file: " + file); - } + if (!inFile) { + throw runtime_error("Unable to open score file: " + file); + } - load((ifstream&) inFile); + load((ifstream&) inFile); - inFile.close(); + inFile.close(); } -void ScoreData::add(ScoreArray& e){ - if (exists(e.getIndex())){ // array at position e.getIndex() already exists - //enlarge array at position e.getIndex() - size_t pos = getIndex(e.getIndex()); - array_.at(pos).merge(e); - } - else{ - array_.push_back(e); - setIndex(); - } +void ScoreData::add(ScoreArray& e) +{ + if (exists(e.getIndex())) { // array at position e.getIndex() already exists + //enlarge array at position e.getIndex() + size_t pos = getIndex(e.getIndex()); + array_.at(pos).merge(e); + } else { + array_.push_back(e); + setIndex(); + } } -void ScoreData::add(const ScoreStats& e, const std::string& sent_idx){ - if (exists(sent_idx)){ // array at position e.getIndex() already exists - //enlarge array at position e.getIndex() - size_t pos = getIndex(sent_idx); - // TRACE_ERR("Inserting in array " << sent_idx << std::endl); - array_.at(pos).add(e); - // TRACE_ERR("size: " << size() << " -> " << a.size() << std::endl); - } - else{ - // TRACE_ERR("Creating a new entry in the array" << std::endl); - ScoreArray a; - a.NumberOfScores(number_of_scores); - a.add(e); - a.setIndex(sent_idx); - add(a); - // TRACE_ERR("size: " << size() << " -> " << a.size() << std::endl); - } - } +void ScoreData::add(const ScoreStats& e, const std::string& sent_idx) +{ + if (exists(sent_idx)) { // array at position e.getIndex() already exists + //enlarge array at position e.getIndex() + size_t pos = getIndex(sent_idx); + // TRACE_ERR("Inserting in array " << sent_idx << std::endl); + array_.at(pos).add(e); + // TRACE_ERR("size: " << size() << " -> " << a.size() << std::endl); + } else { + // TRACE_ERR("Creating a new entry in the array" << std::endl); + ScoreArray a; + a.NumberOfScores(number_of_scores); + a.add(e); + a.setIndex(sent_idx); + add(a); + // TRACE_ERR("size: " << size() << " -> " << a.size() << std::endl); + } +} bool ScoreData::check_consistency() { - if (array_.size() == 0) - return true; - - for (scoredata_t::iterator i = array_.begin(); i !=array_.end(); i++) - if (!i->check_consistency()) return false; - - return true; + if (array_.size() == 0) + return true; + + for (scoredata_t::iterator i = array_.begin(); i !=array_.end(); i++) + if (!i->check_consistency()) return false; + + return true; } void ScoreData::setIndex() { - size_t j=0; - for (scoredata_t::iterator i = array_.begin(); i !=array_.end(); i++){ - idx2arrayname_[j]=i->getIndex(); - arrayname2idx_[i->getIndex()]=j; - j++; - } + size_t j=0; + for (scoredata_t::iterator i = array_.begin(); i !=array_.end(); i++) { + idx2arrayname_[j]=i->getIndex(); + arrayname2idx_[i->getIndex()]=j; + j++; + } } diff --git a/mert/ScoreData.h b/mert/ScoreData.h index 34ac740e6..ee2172dbb 100644 --- a/mert/ScoreData.h +++ b/mert/ScoreData.h @@ -23,66 +23,92 @@ class Scorer; class ScoreData { protected: - scoredata_t array_; - idx2name idx2arrayname_; //map from index to name of array - name2idx arrayname2idx_; //map from name to index of array - + scoredata_t array_; + idx2name idx2arrayname_; //map from index to name of array + name2idx arrayname2idx_; //map from name to index of array + private: - Scorer* theScorer; - std::string score_type; - size_t number_of_scores; - + Scorer* theScorer; + std::string score_type; + size_t number_of_scores; + public: - ScoreData(Scorer& sc); - - ~ScoreData(){}; - - inline void clear() { array_.clear(); } - - inline ScoreArray get(const std::string& idx){ return array_.at(getIndex(idx)); } - inline ScoreArray& get(size_t idx){ return array_.at(idx); } - inline const ScoreArray& get(size_t idx) const { return array_.at(idx); } - - inline bool exists(const std::string & sent_idx){ return exists(getIndex(sent_idx)); } - inline bool exists(int sent_idx){ return (sent_idx>-1 && sent_idx<(int)array_.size())?true:false; } - - inline ScoreStats& get(size_t i, size_t j){ return array_.at(i).get(j); } - inline const ScoreStats& get(size_t i, size_t j) const { return array_.at(i).get(j); } - - inline std::string name(){ return score_type; }; - inline std::string name(std::string &sctype){ return score_type = sctype; }; + ScoreData(Scorer& sc); - void add(ScoreArray& e); - void add(const ScoreStats& e, const std::string& sent_idx); - - inline size_t NumberOfScores(){ return number_of_scores; } - inline size_t size(){ return array_.size(); } - - void save(const std::string &file, bool bin=false); - void save(ofstream& outFile, bool bin=false); - inline void save(bool bin=false){ save("/dev/stdout", bin); } + ~ScoreData() {}; - void load(ifstream& inFile); - void load(const std::string &file); - - bool check_consistency(); - void setIndex(); - - inline int getIndex(const std::string& idx){ - name2idx::iterator i = arrayname2idx_.find(idx); - if (i!=arrayname2idx_.end()) - return i->second; - else - return -1; + inline void clear() { + array_.clear(); } - inline std::string getIndex(size_t idx){ - idx2name::iterator i = idx2arrayname_.find(idx); - if (i!=idx2arrayname_.end()) - throw runtime_error("there is no entry at index " + idx); - return i->second; - } - void dump(); + inline ScoreArray get(const std::string& idx) { + return array_.at(getIndex(idx)); + } + inline ScoreArray& get(size_t idx) { + return array_.at(idx); + } + inline const ScoreArray& get(size_t idx) const { + return array_.at(idx); + } + + inline bool exists(const std::string & sent_idx) { + return exists(getIndex(sent_idx)); + } + inline bool exists(int sent_idx) { + return (sent_idx>-1 && sent_idx<(int)array_.size())?true:false; + } + + inline ScoreStats& get(size_t i, size_t j) { + return array_.at(i).get(j); + } + inline const ScoreStats& get(size_t i, size_t j) const { + return array_.at(i).get(j); + } + + inline std::string name() { + return score_type; + }; + inline std::string name(std::string &sctype) { + return score_type = sctype; + }; + + void add(ScoreArray& e); + void add(const ScoreStats& e, const std::string& sent_idx); + + inline size_t NumberOfScores() { + return number_of_scores; + } + inline size_t size() { + return array_.size(); + } + + void save(const std::string &file, bool bin=false); + void save(ofstream& outFile, bool bin=false); + inline void save(bool bin=false) { + save("/dev/stdout", bin); + } + + void load(ifstream& inFile); + void load(const std::string &file); + + bool check_consistency(); + void setIndex(); + + inline int getIndex(const std::string& idx) { + name2idx::iterator i = arrayname2idx_.find(idx); + if (i!=arrayname2idx_.end()) + return i->second; + else + return -1; + } + inline std::string getIndex(size_t idx) { + idx2name::iterator i = idx2arrayname_.find(idx); + if (i!=idx2arrayname_.end()) + throw runtime_error("there is no entry at index " + idx); + return i->second; + } + + void dump(); }; diff --git a/mert/ScoreStats.cpp b/mert/ScoreStats.cpp index 0af6dee4c..bbfc38743 100644 --- a/mert/ScoreStats.cpp +++ b/mert/ScoreStats.cpp @@ -14,123 +14,124 @@ ScoreStats::ScoreStats() { - available_ = AVAILABLE_; - entries_ = 0; - array_ = new ScoreStatsType[available_]; + available_ = AVAILABLE_; + entries_ = 0; + array_ = new ScoreStatsType[available_]; }; ScoreStats::~ScoreStats() { - delete array_; + delete array_; }; - ScoreStats::ScoreStats(const ScoreStats &stats) +ScoreStats::ScoreStats(const ScoreStats &stats) { - available_ = stats.available(); - entries_ = stats.size(); - array_ = new ScoreStatsType[available_]; - memcpy(array_,stats.getArray(),scorebytes_); + available_ = stats.available(); + entries_ = stats.size(); + array_ = new ScoreStatsType[available_]; + memcpy(array_,stats.getArray(),scorebytes_); }; ScoreStats::ScoreStats(const size_t size) { - available_ = size; - entries_ = size; - array_ = new ScoreStatsType[available_]; - memset(array_,0,scorebytes_); + available_ = size; + entries_ = size; + array_ = new ScoreStatsType[available_]; + memset(array_,0,scorebytes_); }; ScoreStats::ScoreStats(std::string &theString) { - set(theString); + set(theString); } void ScoreStats::expand() { - available_*=2; - scorestats_t t_ = new ScoreStatsType[available_]; - memcpy(t_,array_,scorebytes_); - delete array_; - array_=t_; + available_*=2; + scorestats_t t_ = new ScoreStatsType[available_]; + memcpy(t_,array_,scorebytes_); + delete array_; + array_=t_; } void ScoreStats::add(ScoreStatsType v) { - if (isfull()) expand(); - array_[entries_++]=v; + if (isfull()) expand(); + array_[entries_++]=v; } void ScoreStats::set(std::string &theString) { std::string substring, stringBuf; - reset(); - - while (!theString.empty()){ - getNextPound(theString, substring); - add(ATOSST(substring.c_str())); - } + reset(); + + while (!theString.empty()) { + getNextPound(theString, substring); + add(ATOSST(substring.c_str())); + } } void ScoreStats::loadbin(std::ifstream& inFile) { - inFile.read((char*) array_, scorebytes_); -} + inFile.read((char*) array_, scorebytes_); +} void ScoreStats::loadtxt(std::ifstream& inFile) { std::string theString; - std::getline(inFile, theString); - set(theString); + std::getline(inFile, theString); + set(theString); } void ScoreStats::loadtxt(const std::string &file) { -// TRACE_ERR("loading the stats from " << file << std::endl); +// TRACE_ERR("loading the stats from " << file << std::endl); - std::ifstream inFile(file.c_str(), std::ios::in); // matches a stream with a file. Opens the file + std::ifstream inFile(file.c_str(), std::ios::in); // matches a stream with a file. Opens the file - loadtxt(inFile); + loadtxt(inFile); } void ScoreStats::savetxt(const std::string &file) { -// TRACE_ERR("saving the stats into " << file << std::endl); +// TRACE_ERR("saving the stats into " << file << std::endl); - std::ofstream outFile(file.c_str(), std::ios::out); // matches a stream with a file. Opens the file + std::ofstream outFile(file.c_str(), std::ios::out); // matches a stream with a file. Opens the file - savetxt(outFile); + savetxt(outFile); } void ScoreStats::savetxt(std::ofstream& outFile) { - outFile << *this; + outFile << *this; } void ScoreStats::savebin(std::ofstream& outFile) { - outFile.write((char*) array_, scorebytes_); -} + outFile.write((char*) array_, scorebytes_); +} ScoreStats& ScoreStats::operator=(const ScoreStats &stats) { - delete array_; - available_ = stats.available(); - entries_ = stats.size(); - array_ = new ScoreStatsType[available_]; - memcpy(array_,stats.getArray(),scorebytes_); - - return *this; + delete array_; + available_ = stats.available(); + entries_ = stats.size(); + array_ = new ScoreStatsType[available_]; + memcpy(array_,stats.getArray(),scorebytes_); + + return *this; } /**write the whole object to a stream*/ -ostream& operator<<(ostream& o, const ScoreStats& e){ - for (size_t i=0; i< e.size(); i++) - o << e.get(i) << " "; - return o; +ostream& operator<<(ostream& o, const ScoreStats& e) +{ + for (size_t i=0; i< e.size(); i++) + o << e.get(i) << " "; + return o; } diff --git a/mert/ScoreStats.h b/mert/ScoreStats.h index 147fcef18..9ceee4c6b 100644 --- a/mert/ScoreStats.h +++ b/mert/ScoreStats.h @@ -26,51 +26,72 @@ using namespace std; class ScoreStats { private: - scorestats_t array_; - size_t entries_; - size_t available_; - + scorestats_t array_; + size_t entries_; + size_t available_; + public: - ScoreStats(); + ScoreStats(); ScoreStats(const size_t size); - ScoreStats(const ScoreStats &stats); - ScoreStats(std::string &theString); - ScoreStats& operator=(const ScoreStats &stats); - - ~ScoreStats(); - - bool isfull(){return (entries_ < available_)?0:1; } - void expand(); - void add(ScoreStatsType v); - - inline void clear() { memset((void*) array_,0,scorebytes_); } - - inline ScoreStatsType get(size_t i){ return array_[i]; } - inline ScoreStatsType get(size_t i)const{ return array_[i]; } - inline scorestats_t getArray() const { return array_; } - - void set(std::string &theString); + ScoreStats(const ScoreStats &stats); + ScoreStats(std::string &theString); + ScoreStats& operator=(const ScoreStats &stats); - inline size_t bytes() const{ return scorebytes_; } - inline size_t size() const{ return entries_; } - inline size_t available() const{ return available_; } - - void savetxt(const std::string &file); - void savetxt(ofstream& outFile); - void savebin(ofstream& outFile); - inline void savetxt(){ savetxt("/dev/stdout"); } + ~ScoreStats(); - - - void loadtxt(const std::string &file); - void loadtxt(ifstream& inFile); - void loadbin(ifstream& inFile); - - - inline void reset(){ entries_ = 0; clear(); } + bool isfull() { + return (entries_ < available_)?0:1; + } + void expand(); + void add(ScoreStatsType v); - /**write the whole object to a stream*/ - friend ostream& operator<<(ostream& o, const ScoreStats& e); + inline void clear() { + memset((void*) array_,0,scorebytes_); + } + + inline ScoreStatsType get(size_t i) { + return array_[i]; + } + inline ScoreStatsType get(size_t i)const { + return array_[i]; + } + inline scorestats_t getArray() const { + return array_; + } + + void set(std::string &theString); + + inline size_t bytes() const { + return scorebytes_; + } + inline size_t size() const { + return entries_; + } + inline size_t available() const { + return available_; + } + + void savetxt(const std::string &file); + void savetxt(ofstream& outFile); + void savebin(ofstream& outFile); + inline void savetxt() { + savetxt("/dev/stdout"); + } + + + + void loadtxt(const std::string &file); + void loadtxt(ifstream& inFile); + void loadbin(ifstream& inFile); + + + inline void reset() { + entries_ = 0; + clear(); + } + + /**write the whole object to a stream*/ + friend ostream& operator<<(ostream& o, const ScoreStats& e); }; diff --git a/mert/Scorer.cpp b/mert/Scorer.cpp index dfe468c44..5e7bc49e5 100644 --- a/mert/Scorer.cpp +++ b/mert/Scorer.cpp @@ -1,105 +1,108 @@ #include "Scorer.h" //regularisation strategies -static float score_min(const statscores_t& scores, size_t start, size_t end) { - float min = numeric_limits::max(); - for (size_t i = start; i < end; ++i) { - if (scores[i] < min) { - min = scores[i]; - } - } - return min; +static float score_min(const statscores_t& scores, size_t start, size_t end) +{ + float min = numeric_limits::max(); + for (size_t i = start; i < end; ++i) { + if (scores[i] < min) { + min = scores[i]; + } + } + return min; } -static float score_average(const statscores_t& scores, size_t start, size_t end) { - if ((end - start) < 1) { - //shouldn't happen - return 0; - } - float total = 0; - for (size_t j = start; j < end; ++j) { - total += scores[j]; - } +static float score_average(const statscores_t& scores, size_t start, size_t end) +{ + if ((end - start) < 1) { + //shouldn't happen + return 0; + } + float total = 0; + for (size_t j = start; j < end; ++j) { + total += scores[j]; + } - return total / (end - start); + return total / (end - start); } void StatisticsBasedScorer::score(const candidates_t& candidates, const diffs_t& diffs, - statscores_t& scores) { - //cout << "*******StatisticsBasedScorer::score" << endl; - if (!_scoreData) { - throw runtime_error("Score data not loaded"); - } - //calculate the score for the candidates - if (_scoreData->size() == 0) { - throw runtime_error("Score data is empty"); + statscores_t& scores) +{ + //cout << "*******StatisticsBasedScorer::score" << endl; + if (!_scoreData) { + throw runtime_error("Score data not loaded"); + } + //calculate the score for the candidates + if (_scoreData->size() == 0) { + throw runtime_error("Score data is empty"); + } + if (candidates.size() == 0) { + throw runtime_error("No candidates supplied"); + } + int numCounts = _scoreData->get(0,candidates[0]).size(); + vector totals(numCounts); + for (size_t i = 0; i < candidates.size(); ++i) { + //cout << " i " << i << " candidates[i] " << candidates[i] << endl; + ScoreStats stats = _scoreData->get(i,candidates[i]); + if (stats.size() != totals.size()) { + stringstream msg; + msg << "Statistics for (" << "," << candidates[i] << ") have incorrect " + << "number of fields. Found: " << stats.size() << " Expected: " + << totals.size(); + throw runtime_error(msg.str()); } - if (candidates.size() == 0) { - throw runtime_error("No candidates supplied"); + for (size_t k = 0; k < totals.size(); ++k) { + totals[k] += stats.get(k); } - int numCounts = _scoreData->get(0,candidates[0]).size(); - vector totals(numCounts); - for (size_t i = 0; i < candidates.size(); ++i) { - //cout << " i " << i << " candidates[i] " << candidates[i] << endl; - ScoreStats stats = _scoreData->get(i,candidates[i]); - if (stats.size() != totals.size()) { - stringstream msg; - msg << "Statistics for (" << "," << candidates[i] << ") have incorrect " - << "number of fields. Found: " << stats.size() << " Expected: " - << totals.size(); - throw runtime_error(msg.str()); - } - for (size_t k = 0; k < totals.size(); ++k) { - totals[k] += stats.get(k); - } - } - scores.push_back(calculateScore(totals)); + } + scores.push_back(calculateScore(totals)); - candidates_t last_candidates(candidates); - //apply each of the diffs, and get new scores - for (size_t i = 0; i < diffs.size(); ++i) { - for (size_t j = 0; j < diffs[i].size(); ++j) { - size_t sid = diffs[i][j].first; - size_t nid = diffs[i][j].second; + candidates_t last_candidates(candidates); + //apply each of the diffs, and get new scores + for (size_t i = 0; i < diffs.size(); ++i) { + for (size_t j = 0; j < diffs[i].size(); ++j) { + size_t sid = diffs[i][j].first; + size_t nid = diffs[i][j].second; //cout << "STSC:sid = " << sid << endl; //cout << "STSC:nid = " << nid << endl; - size_t last_nid = last_candidates[sid]; + size_t last_nid = last_candidates[sid]; //cout << "STSC:oid = " << last_nid << endl; - for (size_t k = 0; k < totals.size(); ++k) { - float diff = _scoreData->get(sid,nid).get(k) - - _scoreData->get(sid,last_nid).get(k); - totals[k] += diff; + for (size_t k = 0; k < totals.size(); ++k) { + float diff = _scoreData->get(sid,nid).get(k) + - _scoreData->get(sid,last_nid).get(k); + totals[k] += diff; //cout << "STSC:nid = " << _scoreData->get(sid,nid).get(k) << endl; //cout << "STSC:oid = " << _scoreData->get(sid,last_nid).get(k) << endl; //cout << "STSC:diff = " << diff << endl; //cout << "STSC:totals = " << totals[k] << endl; - } - last_candidates[sid] = nid; - } - scores.push_back(calculateScore(totals)); + } + last_candidates[sid] = nid; } + scores.push_back(calculateScore(totals)); + } - //regularisation. This can either be none, or the min or average as described in - //Cer, Jurafsky and Manning at WMT08 - if (_regularisationStrategy == REG_NONE || _regularisationWindow <= 0) { - //no regularisation - return; - } + //regularisation. This can either be none, or the min or average as described in + //Cer, Jurafsky and Manning at WMT08 + if (_regularisationStrategy == REG_NONE || _regularisationWindow <= 0) { + //no regularisation + return; + } - //window size specifies the +/- in each direction - statscores_t raw_scores(scores);//copy scores - for (size_t i = 0; i < scores.size(); ++i) { - size_t start = 0; - if (i >= _regularisationWindow) { - start = i - _regularisationWindow; - } - size_t end = min(scores.size(), i + _regularisationWindow+1); - if (_regularisationStrategy == REG_AVERAGE) { - scores[i] = score_average(raw_scores,start,end); - } else { - scores[i] = score_min(raw_scores,start,end); - } + //window size specifies the +/- in each direction + statscores_t raw_scores(scores);//copy scores + for (size_t i = 0; i < scores.size(); ++i) { + size_t start = 0; + if (i >= _regularisationWindow) { + start = i - _regularisationWindow; } + size_t end = min(scores.size(), i + _regularisationWindow+1); + if (_regularisationStrategy == REG_AVERAGE) { + scores[i] = score_average(raw_scores,start,end); + } else { + scores[i] = score_min(raw_scores,start,end); + } + } } @@ -110,89 +113,90 @@ void StatisticsBasedScorer::score(const candidates_t& candidates, const diffs_t /** The sentence level scores have already been calculated, just need to average them and include the differences. Allows scores which are floats **/ void SentenceLevelScorer::score(const candidates_t& candidates, const diffs_t& diffs, - statscores_t& scores) { - //cout << "*******SentenceLevelScorer::score" << endl; - if (!_scoreData) { - throw runtime_error("Score data not loaded"); + statscores_t& scores) +{ + //cout << "*******SentenceLevelScorer::score" << endl; + if (!_scoreData) { + throw runtime_error("Score data not loaded"); + } + //calculate the score for the candidates + if (_scoreData->size() == 0) { + throw runtime_error("Score data is empty"); + } + if (candidates.size() == 0) { + throw runtime_error("No candidates supplied"); + } + int numCounts = _scoreData->get(0,candidates[0]).size(); + vector totals(numCounts); + for (size_t i = 0; i < candidates.size(); ++i) { + //cout << " i " << i << " candi " << candidates[i] ; + ScoreStats stats = _scoreData->get(i,candidates[i]); + if (stats.size() != totals.size()) { + stringstream msg; + msg << "Statistics for (" << "," << candidates[i] << ") have incorrect " + << "number of fields. Found: " << stats.size() << " Expected: " + << totals.size(); + throw runtime_error(msg.str()); } - //calculate the score for the candidates - if (_scoreData->size() == 0) { - throw runtime_error("Score data is empty"); - } - if (candidates.size() == 0) { - throw runtime_error("No candidates supplied"); - } - int numCounts = _scoreData->get(0,candidates[0]).size(); - vector totals(numCounts); - for (size_t i = 0; i < candidates.size(); ++i) { - //cout << " i " << i << " candi " << candidates[i] ; - ScoreStats stats = _scoreData->get(i,candidates[i]); - if (stats.size() != totals.size()) { - stringstream msg; - msg << "Statistics for (" << "," << candidates[i] << ") have incorrect " - << "number of fields. Found: " << stats.size() << " Expected: " - << totals.size(); - throw runtime_error(msg.str()); - } - //Add up scores for all sentences, would normally be just one score - for (size_t k = 0; k < totals.size(); ++k) { - totals[k] += stats.get(k); - //cout << " stats " << stats.get(k) ; - } - //cout << endl; - } - //take average + //Add up scores for all sentences, would normally be just one score for (size_t k = 0; k < totals.size(); ++k) { + totals[k] += stats.get(k); + //cout << " stats " << stats.get(k) ; + } + //cout << endl; + } + //take average + for (size_t k = 0; k < totals.size(); ++k) { //cout << "totals = " << totals[k] << endl; //cout << "cand = " << candidates.size() << endl; - totals[k] /= candidates.size(); + totals[k] /= candidates.size(); //cout << "finaltotals = " << totals[k] << endl; - } + } - scores.push_back(calculateScore(totals)); + scores.push_back(calculateScore(totals)); - candidates_t last_candidates(candidates); - //apply each of the diffs, and get new scores - for (size_t i = 0; i < diffs.size(); ++i) { - for (size_t j = 0; j < diffs[i].size(); ++j) { - size_t sid = diffs[i][j].first; - size_t nid = diffs[i][j].second; + candidates_t last_candidates(candidates); + //apply each of the diffs, and get new scores + for (size_t i = 0; i < diffs.size(); ++i) { + for (size_t j = 0; j < diffs[i].size(); ++j) { + size_t sid = diffs[i][j].first; + size_t nid = diffs[i][j].second; //cout << "sid = " << sid << endl; //cout << "nid = " << nid << endl; - size_t last_nid = last_candidates[sid]; - for (size_t k = 0; k < totals.size(); ++k) { - float diff = _scoreData->get(sid,nid).get(k) - - _scoreData->get(sid,last_nid).get(k); + size_t last_nid = last_candidates[sid]; + for (size_t k = 0; k < totals.size(); ++k) { + float diff = _scoreData->get(sid,nid).get(k) + - _scoreData->get(sid,last_nid).get(k); //cout << "diff = " << diff << endl; - totals[k] += diff/candidates.size(); + totals[k] += diff/candidates.size(); //cout << "totals = " << totals[k] << endl; - } - last_candidates[sid] = nid; - } - scores.push_back(calculateScore(totals)); + } + last_candidates[sid] = nid; } + scores.push_back(calculateScore(totals)); + } - //regularisation. This can either be none, or the min or average as described in - //Cer, Jurafsky and Manning at WMT08 - if (_regularisationStrategy == REG_NONE || _regularisationWindow <= 0) { - //no regularisation - return; - } + //regularisation. This can either be none, or the min or average as described in + //Cer, Jurafsky and Manning at WMT08 + if (_regularisationStrategy == REG_NONE || _regularisationWindow <= 0) { + //no regularisation + return; + } - //window size specifies the +/- in each direction - statscores_t raw_scores(scores);//copy scores - for (size_t i = 0; i < scores.size(); ++i) { - size_t start = 0; - if (i >= _regularisationWindow) { - start = i - _regularisationWindow; - } - size_t end = min(scores.size(), i + _regularisationWindow+1); - if (_regularisationStrategy == REG_AVERAGE) { - scores[i] = score_average(raw_scores,start,end); - } else { - scores[i] = score_min(raw_scores,start,end); - } + //window size specifies the +/- in each direction + statscores_t raw_scores(scores);//copy scores + for (size_t i = 0; i < scores.size(); ++i) { + size_t start = 0; + if (i >= _regularisationWindow) { + start = i - _regularisationWindow; } + size_t end = min(scores.size(), i + _regularisationWindow+1); + if (_regularisationStrategy == REG_AVERAGE) { + scores[i] = score_average(raw_scores,start,end); + } else { + scores[i] = score_min(raw_scores,start,end); + } + } } diff --git a/mert/Scorer.h b/mert/Scorer.h index 83eca27ba..e52a04859 100644 --- a/mert/Scorer.h +++ b/mert/Scorer.h @@ -23,173 +23,180 @@ class ScoreStats; /** * Superclass of all scorers and dummy implementation. In order to add a new * scorer it should be sufficient to override prepareStats(), setReferenceFiles() - * and score() (or calculateScore()). + * and score() (or calculateScore()). **/ -class Scorer { - private: - string _name; - - public: - - Scorer(const string& name, const string& config): _name(name), _scoreData(0), _preserveCase(true){ - cerr << "Scorer config string: " << config << endl; - size_t start = 0; - while (start < config.size()) { - size_t end = config.find(",",start); - if (end == string::npos) { - end = config.size(); - } - string nv = config.substr(start,end-start); - size_t split = nv.find(":"); - if (split == string::npos) { - throw runtime_error("Missing colon when processing scorer config: " + config); - } - string name = nv.substr(0,split); - string value = nv.substr(split+1,nv.size()-split-1); - cerr << "name: " << name << " value: " << value << endl; - _config[name] = value; - start = end+1; - } +class Scorer +{ +private: + string _name; - }; - virtual ~Scorer(){}; +public: - - /** - * returns the number of statistics needed for the computation of the score - **/ - virtual size_t NumberOfScores() const { cerr << "Scorer: 0" << endl; return 0; }; - - /** - * set the reference files. This must be called before prepareStats. - **/ - virtual void setReferenceFiles(const vector& referenceFiles) { - //do nothing - } - - /** - * Process the given guessed text, corresponding to the given reference sindex - * and add the appropriate statistics to the entry. - **/ - virtual void prepareStats(size_t sindex, const string& text, ScoreStats& entry) - {} - - virtual void prepareStats(const string& sindex, const string& text, ScoreStats& entry) - { - -// cerr << sindex << endl; - this->prepareStats((size_t) atoi(sindex.c_str()), text, entry); - //cerr << text << std::endl; - } - - /** - * Score using each of the candidate index, then go through the diffs - * applying each in turn, and calculating a new score each time. - **/ - virtual void score(const candidates_t& candidates, const diffs_t& diffs, - statscores_t& scores) { - //dummy impl - if (!_scoreData) { - throw runtime_error("score data not loaded"); - } - scores.push_back(0); - for (size_t i = 0; i < diffs.size(); ++i) { - scores.push_back(0); - } - } - - - /** - * Calculate the score of the sentences corresponding to the list of candidate - * indices. Each index indicates the 1-best choice from the n-best list. - **/ - float score(const candidates_t& candidates) { - diffs_t diffs; - statscores_t scores; - score(candidates, diffs, scores); - return scores[0]; - } - - const string& getName() const {return _name;} - - size_t getReferenceSize() { - if (_scoreData) { - return _scoreData->size(); - } - return 0; - } - - - /** - * Set the score data, prior to scoring. - **/ - virtual void setScoreData(ScoreData* data) { - _scoreData = data; - } - /** - * The scorer returns if it uses the reference alignment data - * for permutation distance scores - **/ - virtual bool useAlignment() const { - //cout << "Scorer::useAlignment returning false " << endl; - return false; - }; - //calculate the actual score - virtual statscore_t calculateScore(const vector& totals){return 0;}; - - protected: - typedef map encodings_t; - typedef map::iterator encodings_it; - - ScoreData* _scoreData; - encodings_t _encodings; - - bool _preserveCase; - - /** - * Value of config variable. If not provided, return default. - **/ - string getConfig(const string& key, const string& def="") { - map::iterator i = _config.find(key); - if (i == _config.end()) { - return def; - } else { - return i->second; - } + Scorer(const string& name, const string& config): _name(name), _scoreData(0), _preserveCase(true) { + cerr << "Scorer config string: " << config << endl; + size_t start = 0; + while (start < config.size()) { + size_t end = config.find(",",start); + if (end == string::npos) { + end = config.size(); } - - - /** - * Tokenise line and encode. - * Note: We assume that all tokens are separated by single spaces - **/ - void encode(const string& line, vector& encoded) { - //cerr << line << endl; - istringstream in (line); - string token; - while (in >> token) { - if (!_preserveCase) { - for (string::iterator i = token.begin(); i != token.end(); ++i) { - *i = tolower(*i); - } - } - encodings_it encoding = _encodings.find(token); - int encoded_token; - if (encoding == _encodings.end()) { - encoded_token = (int)_encodings.size(); - _encodings[token] = encoded_token; - //cerr << encoded_token << "(n) "; - } else { - encoded_token = encoding->second; - //cerr << encoded_token << " "; - } - encoded.push_back(encoded_token); - } - //cerr << endl; + string nv = config.substr(start,end-start); + size_t split = nv.find(":"); + if (split == string::npos) { + throw runtime_error("Missing colon when processing scorer config: " + config); + } + string name = nv.substr(0,split); + string value = nv.substr(split+1,nv.size()-split-1); + cerr << "name: " << name << " value: " << value << endl; + _config[name] = value; + start = end+1; } - private: - map _config; + }; + virtual ~Scorer() {}; + + + /** + * returns the number of statistics needed for the computation of the score + **/ + virtual size_t NumberOfScores() const { + cerr << "Scorer: 0" << endl; + return 0; + }; + + /** + * set the reference files. This must be called before prepareStats. + **/ + virtual void setReferenceFiles(const vector& referenceFiles) { + //do nothing + } + + /** + * Process the given guessed text, corresponding to the given reference sindex + * and add the appropriate statistics to the entry. + **/ + virtual void prepareStats(size_t sindex, const string& text, ScoreStats& entry) + {} + + virtual void prepareStats(const string& sindex, const string& text, ScoreStats& entry) { + +// cerr << sindex << endl; + this->prepareStats((size_t) atoi(sindex.c_str()), text, entry); + //cerr << text << std::endl; + } + + /** + * Score using each of the candidate index, then go through the diffs + * applying each in turn, and calculating a new score each time. + **/ + virtual void score(const candidates_t& candidates, const diffs_t& diffs, + statscores_t& scores) { + //dummy impl + if (!_scoreData) { + throw runtime_error("score data not loaded"); + } + scores.push_back(0); + for (size_t i = 0; i < diffs.size(); ++i) { + scores.push_back(0); + } + } + + + /** + * Calculate the score of the sentences corresponding to the list of candidate + * indices. Each index indicates the 1-best choice from the n-best list. + **/ + float score(const candidates_t& candidates) { + diffs_t diffs; + statscores_t scores; + score(candidates, diffs, scores); + return scores[0]; + } + + const string& getName() const { + return _name; + } + + size_t getReferenceSize() { + if (_scoreData) { + return _scoreData->size(); + } + return 0; + } + + + /** + * Set the score data, prior to scoring. + **/ + virtual void setScoreData(ScoreData* data) { + _scoreData = data; + } + /** + * The scorer returns if it uses the reference alignment data + * for permutation distance scores + **/ + virtual bool useAlignment() const { + //cout << "Scorer::useAlignment returning false " << endl; + return false; + }; + //calculate the actual score + virtual statscore_t calculateScore(const vector& totals) { + return 0; + }; + +protected: + typedef map encodings_t; + typedef map::iterator encodings_it; + + ScoreData* _scoreData; + encodings_t _encodings; + + bool _preserveCase; + + /** + * Value of config variable. If not provided, return default. + **/ + string getConfig(const string& key, const string& def="") { + map::iterator i = _config.find(key); + if (i == _config.end()) { + return def; + } else { + return i->second; + } + } + + + /** + * Tokenise line and encode. + * Note: We assume that all tokens are separated by single spaces + **/ + void encode(const string& line, vector& encoded) { + //cerr << line << endl; + istringstream in (line); + string token; + while (in >> token) { + if (!_preserveCase) { + for (string::iterator i = token.begin(); i != token.end(); ++i) { + *i = tolower(*i); + } + } + encodings_it encoding = _encodings.find(token); + int encoded_token; + if (encoding == _encodings.end()) { + encoded_token = (int)_encodings.size(); + _encodings[token] = encoded_token; + //cerr << encoded_token << "(n) "; + } else { + encoded_token = encoding->second; + //cerr << encoded_token << " "; + } + encoded.push_back(encoded_token); + } + //cerr << endl; + } + +private: + map _config; }; @@ -197,11 +204,12 @@ class Scorer { /** - * Abstract base class for scorers that work by adding statistics across all + * Abstract base class for scorers that work by adding statistics across all * outout sentences, then apply some formula, e.g. bleu, per. **/ -class StatisticsBasedScorer : public Scorer { +class StatisticsBasedScorer : public Scorer +{ - public: +public: StatisticsBasedScorer(const string& name, const string& config): Scorer(name,config) { //configure regularisation static string KEY_TYPE = "regtype"; @@ -212,105 +220,110 @@ class StatisticsBasedScorer : public Scorer { static string TYPE_MINIMUM = "min"; static string TRUE = "true"; static string FALSE = "false"; - - + + string type = getConfig(KEY_TYPE,TYPE_NONE); if (type == TYPE_NONE) { - _regularisationStrategy = REG_NONE; + _regularisationStrategy = REG_NONE; } else if (type == TYPE_AVERAGE) { - _regularisationStrategy = REG_AVERAGE; + _regularisationStrategy = REG_AVERAGE; } else if (type == TYPE_MINIMUM) { - _regularisationStrategy = REG_MINIMUM; + _regularisationStrategy = REG_MINIMUM; } else { - throw runtime_error("Unknown scorer regularisation strategy: " + type); + throw runtime_error("Unknown scorer regularisation strategy: " + type); } cerr << "Using scorer regularisation strategy: " << type << endl; string window = getConfig(KEY_WINDOW,"0"); _regularisationWindow = atoi(window.c_str()); cerr << "Using scorer regularisation window: " << _regularisationWindow << endl; - + string preservecase = getConfig(KEY_CASE,TRUE); if (preservecase == TRUE) { - _preserveCase = true; - }else if (preservecase == FALSE) { - _preserveCase = false; + _preserveCase = true; + } else if (preservecase == FALSE) { + _preserveCase = false; } cerr << "Using case preservation: " << _preserveCase << endl; } - ~StatisticsBasedScorer(){}; - virtual void score(const candidates_t& candidates, const diffs_t& diffs, - statscores_t& scores); - //calculate the actual score - virtual statscore_t calculateScore(const vector& totals){return 0;}; + ~StatisticsBasedScorer() {}; + virtual void score(const candidates_t& candidates, const diffs_t& diffs, + statscores_t& scores); + //calculate the actual score + virtual statscore_t calculateScore(const vector& totals) { + return 0; + }; - protected: +protected: - //regularisation - ScorerRegularisationStrategy _regularisationStrategy; - size_t _regularisationWindow; + //regularisation + ScorerRegularisationStrategy _regularisationStrategy; + size_t _regularisationWindow; }; /** * Abstract base class for scorers that work by using sentence level * statistics eg. permutation distance metrics **/ -class SentenceLevelScorer : public Scorer { +class SentenceLevelScorer : public Scorer +{ - public: - SentenceLevelScorer(const string& name, const string& config): Scorer(name,config) { - //configure regularisation - static string KEY_TYPE = "regtype"; - static string KEY_WINDOW = "regwin"; - static string KEY_CASE = "case"; - static string TYPE_NONE = "none"; - static string TYPE_AVERAGE = "average"; - static string TYPE_MINIMUM = "min"; - static string TRUE = "true"; - static string FALSE = "false"; - - string type = getConfig(KEY_TYPE,TYPE_NONE); - if (type == TYPE_NONE) { - _regularisationStrategy = REG_NONE; - } else if (type == TYPE_AVERAGE) { - _regularisationStrategy = REG_AVERAGE; - } else if (type == TYPE_MINIMUM) { - _regularisationStrategy = REG_MINIMUM; - } else { - throw runtime_error("Unknown scorer regularisation strategy: " + type); - } - cerr << "Using scorer regularisation strategy: " << type << endl; +public: + SentenceLevelScorer(const string& name, const string& config): Scorer(name,config) { + //configure regularisation + static string KEY_TYPE = "regtype"; + static string KEY_WINDOW = "regwin"; + static string KEY_CASE = "case"; + static string TYPE_NONE = "none"; + static string TYPE_AVERAGE = "average"; + static string TYPE_MINIMUM = "min"; + static string TRUE = "true"; + static string FALSE = "false"; - string window = getConfig(KEY_WINDOW,"0"); - _regularisationWindow = atoi(window.c_str()); - cerr << "Using scorer regularisation window: " << _regularisationWindow << endl; - - string preservecase = getConfig(KEY_CASE,TRUE); - if (preservecase == TRUE) { - _preserveCase = true; - }else if (preservecase == FALSE) { - _preserveCase = false; - } - cerr << "Using case preservation: " << _preserveCase << endl; + string type = getConfig(KEY_TYPE,TYPE_NONE); + if (type == TYPE_NONE) { + _regularisationStrategy = REG_NONE; + } else if (type == TYPE_AVERAGE) { + _regularisationStrategy = REG_AVERAGE; + } else if (type == TYPE_MINIMUM) { + _regularisationStrategy = REG_MINIMUM; + } else { + throw runtime_error("Unknown scorer regularisation strategy: " + type); + } + cerr << "Using scorer regularisation strategy: " << type << endl; + + string window = getConfig(KEY_WINDOW,"0"); + _regularisationWindow = atoi(window.c_str()); + cerr << "Using scorer regularisation window: " << _regularisationWindow << endl; + + string preservecase = getConfig(KEY_CASE,TRUE); + if (preservecase == TRUE) { + _preserveCase = true; + } else if (preservecase == FALSE) { + _preserveCase = false; + } + cerr << "Using case preservation: " << _preserveCase << endl; - } - ~SentenceLevelScorer(){}; - virtual void score(const candidates_t& candidates, const diffs_t& diffs, - statscores_t& scores); + } + ~SentenceLevelScorer() {}; + virtual void score(const candidates_t& candidates, const diffs_t& diffs, + statscores_t& scores); - //calculate the actual score - virtual statscore_t calculateScore(const vector& totals){return 0;}; + //calculate the actual score + virtual statscore_t calculateScore(const vector& totals) { + return 0; + }; - protected: +protected: - //regularisation - ScorerRegularisationStrategy _regularisationStrategy; - size_t _regularisationWindow; + //regularisation + ScorerRegularisationStrategy _regularisationStrategy; + size_t _regularisationWindow; }; diff --git a/mert/ScorerFactory.h b/mert/ScorerFactory.h index 56a825579..2d3eb22a7 100644 --- a/mert/ScorerFactory.h +++ b/mert/ScorerFactory.h @@ -19,43 +19,44 @@ using namespace std; -class ScorerFactory { +class ScorerFactory +{ - public: - vector getTypes() { - vector types; - types.push_back(string("BLEU1")); - types.push_back(string("BLEU")); - types.push_back(string("PER")); - types.push_back(string("HAMMING")); - types.push_back(string("KENDALL")); - return types; +public: + vector getTypes() { + vector types; + types.push_back(string("BLEU1")); + types.push_back(string("BLEU")); + types.push_back(string("PER")); + types.push_back(string("HAMMING")); + types.push_back(string("KENDALL")); + return types; + } + + Scorer* getScorer(const string& type, const string& config = "") { + size_t scorerTypes = type.find(","); + if(scorerTypes == string::npos) { + if (type == "BLEU1") { + string conf; + if (config.length() > 0) { + conf = config + ",ngramlen:1"; + } else { + conf = config + "ngramlen:1"; } - - Scorer* getScorer(const string& type, const string& config = "") { - size_t scorerTypes = type.find(","); - if(scorerTypes == string::npos) { - if (type == "BLEU1") { - string conf; - if (config.length() > 0) { - conf = config + ",ngramlen:1"; - } else { - conf = config + "ngramlen:1"; - } - return (BleuScorer*) new BleuScorer(conf); - } else if (type == "BLEU") { - return (BleuScorer*) new BleuScorer(config); - } else if (type == "PER") { - return (PerScorer*) new PerScorer(config); - } else if ((type == "HAMMING") || (type == "KENDALL")) { - return (PermutationScorer*) new PermutationScorer(type, config); - } else { - throw runtime_error("Unknown scorer type: " + type); - } - } else { - return (InterpolatedScorer*) new InterpolatedScorer(type, config); - } - } + return (BleuScorer*) new BleuScorer(conf); + } else if (type == "BLEU") { + return (BleuScorer*) new BleuScorer(config); + } else if (type == "PER") { + return (PerScorer*) new PerScorer(config); + } else if ((type == "HAMMING") || (type == "KENDALL")) { + return (PermutationScorer*) new PermutationScorer(type, config); + } else { + throw runtime_error("Unknown scorer type: " + type); + } + } else { + return (InterpolatedScorer*) new InterpolatedScorer(type, config); + } + } }; #endif //__SCORER_FACTORY_H diff --git a/mert/Timer.cpp b/mert/Timer.cpp index 793ea659d..74db1b1e2 100644 --- a/mert/Timer.cpp +++ b/mert/Timer.cpp @@ -12,8 +12,8 @@ */ double Timer::elapsed_time() { - time_t now; - time(&now); + time_t now; + time(&now); return difftime(now, start_time); } @@ -36,7 +36,7 @@ double Timer::get_elapsed_time() void Timer::start(const char* msg) { // Print an optional message, something like "Starting timer t"; - if (msg) TRACE_ERR( msg << std::endl); + if (msg) TRACE_ERR( msg << std::endl); // Return immediately if the timer is already running if (running) return; diff --git a/mert/Timer.h b/mert/Timer.h index b02dd14bb..a9ec890c5 100644 --- a/mert/Timer.h +++ b/mert/Timer.h @@ -8,16 +8,16 @@ class Timer { - friend std::ostream& operator<<(std::ostream& os, Timer& t); + friend std::ostream& operator<<(std::ostream& os, Timer& t); - private: +private: bool running; time_t start_time; - //TODO in seconds? + //TODO in seconds? double elapsed_time(); - public: +public: /*** * 'running' is initially false. A timer needs to be explicitly started * using 'start' or 'restart' diff --git a/mert/Util.cpp b/mert/Util.cpp index 93e2231b2..250455bd2 100644 --- a/mert/Util.cpp +++ b/mert/Util.cpp @@ -1,7 +1,7 @@ /* * Util.cpp * met - Minimum Error Training - * + * * Created by Nicola Bertoldi on 13/05/08. * */ @@ -18,47 +18,47 @@ Timer g_timer; int verbose=0; -int verboselevel(){ +int verboselevel() +{ return verbose; } -int setverboselevel(int v){ +int setverboselevel(int v) +{ verbose=v; return verbose; } int getNextPound(std::string &theString, std::string &substring, const std::string delimiter) { - unsigned int pos = 0; - - //skip all occurrences of delimiter - while ( pos == 0 ) - { - if ((pos = theString.find(delimiter)) != std::string::npos){ - substring.assign(theString, 0, pos); - theString.erase(0,pos + delimiter.size()); - } - else{ - substring.assign(theString); - theString.assign(""); - } - } - return (pos); + unsigned int pos = 0; + + //skip all occurrences of delimiter + while ( pos == 0 ) { + if ((pos = theString.find(delimiter)) != std::string::npos) { + substring.assign(theString, 0, pos); + theString.erase(0,pos + delimiter.size()); + } else { + substring.assign(theString); + theString.assign(""); + } + } + return (pos); }; inputfilestream::inputfilestream(const std::string &filePath) -: std::istream(0), -m_streambuf(0) + : std::istream(0), + m_streambuf(0) { //check if file is readable std::filebuf* fb = new std::filebuf(); _good=(fb->open(filePath.c_str(), std::ios::in)!=NULL); - + if (filePath.size() > 3 && - filePath.substr(filePath.size() - 3, 3) == ".gz") - { - fb->close(); delete fb; - m_streambuf = new gzfilebuf(filePath.c_str()); + filePath.substr(filePath.size() - 3, 3) == ".gz") { + fb->close(); + delete fb; + m_streambuf = new gzfilebuf(filePath.c_str()); } else { m_streambuf = fb; } @@ -67,7 +67,8 @@ m_streambuf(0) inputfilestream::~inputfilestream() { - delete m_streambuf; m_streambuf = 0; + delete m_streambuf; + m_streambuf = 0; } void inputfilestream::close() @@ -75,16 +76,15 @@ void inputfilestream::close() } outputfilestream::outputfilestream(const std::string &filePath) -: std::ostream(0), -m_streambuf(0) + : std::ostream(0), + m_streambuf(0) { //check if file is readable std::filebuf* fb = new std::filebuf(); - _good=(fb->open(filePath.c_str(), std::ios::out)!=NULL); - - if (filePath.size() > 3 && filePath.substr(filePath.size() - 3, 3) == ".gz") - { - throw runtime_error("Output to a zipped file not supported!"); + _good=(fb->open(filePath.c_str(), std::ios::out)!=NULL); + + if (filePath.size() > 3 && filePath.substr(filePath.size() - 3, 3) == ".gz") { + throw runtime_error("Output to a zipped file not supported!"); } else { m_streambuf = fb; } @@ -93,7 +93,8 @@ m_streambuf(0) outputfilestream::~outputfilestream() { - delete m_streambuf; m_streambuf = 0; + delete m_streambuf; + m_streambuf = 0; } void outputfilestream::close() @@ -103,10 +104,14 @@ void outputfilestream::close() int swapbytes(char *p, int sz, int n) { char c, *l, *h; - + if((n<1) || (sz<2)) return 0; - for(; n--; p+=sz) for(h=(l=p)+sz; --h>l; l++) { c=*h; *h=*l; *l=c; } - return 0; + for(; n--; p+=sz) for(h=(l=p)+sz; --h>l; l++) { + c=*h; + *h=*l; + *l=c; + } + return 0; }; @@ -116,12 +121,12 @@ void ResetUserTime() }; void PrintUserTime(const std::string &message) -{ - g_timer.check(message.c_str()); +{ + g_timer.check(message.c_str()); } double GetUserTime() { - return g_timer.get_elapsed_time(); + return g_timer.get_elapsed_time(); } diff --git a/mert/Util.h b/mert/Util.h index d0f1528af..1f9bf0470 100644 --- a/mert/Util.h +++ b/mert/Util.h @@ -51,45 +51,49 @@ int getNextPound(std::string &theString, std::string &substring, const std::stri template inline T Scan(const std::string &input) { - std::stringstream stream(input); - T ret; - stream >> ret; - return ret; + std::stringstream stream(input); + T ret; + stream >> ret; + return ret; }; class inputfilestream : public std::istream { protected: - std::streambuf *m_streambuf; - bool _good; + std::streambuf *m_streambuf; + bool _good; public: - - inputfilestream(const std::string &filePath); - ~inputfilestream(); - bool good(){return _good;} - void close(); + + inputfilestream(const std::string &filePath); + ~inputfilestream(); + bool good() { + return _good; + } + void close(); }; class outputfilestream : public std::ostream { protected: - std::streambuf *m_streambuf; - bool _good; + std::streambuf *m_streambuf; + bool _good; public: - - outputfilestream(const std::string &filePath); - ~outputfilestream(); - bool good(){return _good;} - void close(); + + outputfilestream(const std::string &filePath); + ~outputfilestream(); + bool good() { + return _good; + } + void close(); }; template inline std::string stringify(T x) { - std::ostringstream o; - if (!(o << x)) - throw std::runtime_error("stringify(template)"); - return o.str(); + std::ostringstream o; + if (!(o << x)) + throw std::runtime_error("stringify(template)"); + return o.str(); } // Utilities to measure decoding time @@ -99,11 +103,11 @@ double GetUserTime(); inline std::string trimStr(const std::string& Src, const std::string& c = " \r\n") { - unsigned int p2 = Src.find_last_not_of(c); - if (p2 == std::string::npos) return std::string(); - unsigned int p1 = Src.find_first_not_of(c); - if (p1 == std::string::npos) p1 = 0; - return Src.substr(p1, (p2-p1)+1); + unsigned int p2 = Src.find_last_not_of(c); + if (p2 == std::string::npos) return std::string(); + unsigned int p1 = Src.find_first_not_of(c); + if (p1 == std::string::npos) p1 = 0; + return Src.substr(p1, (p2-p1)+1); } diff --git a/mert/extractor.cpp b/mert/extractor.cpp index ddcfd7dca..3a5e652b0 100644 --- a/mert/extractor.cpp +++ b/mert/extractor.cpp @@ -18,7 +18,8 @@ using namespace std; -void usage() { +void usage() +{ cerr<<"usage: extractor [options])"< 0 && referenceFile.length() == 0)){ - throw runtime_error("Error: reference file is not specified; you can not score the nbest"); + if ((nbestFile.length() > 0 && referenceFile.length() == 0)) { + throw runtime_error("Error: reference file is not specified; you can not score the nbest"); } - + vector nbestFiles; - if (nbestFile.length() > 0){ - std::string substring; - while (!nbestFile.empty()){ - getNextPound(nbestFile, substring, ","); - nbestFiles.push_back(substring); - } + if (nbestFile.length() > 0) { + std::string substring; + while (!nbestFile.empty()) { + getNextPound(nbestFile, substring, ","); + nbestFiles.push_back(substring); + } } vector referenceFiles; - if (referenceFile.length() > 0){ - std::string substring; - while (!referenceFile.empty()){ - getNextPound(referenceFile, substring, ","); - referenceFiles.push_back(substring); - } + if (referenceFile.length() > 0) { + std::string substring; + while (!referenceFile.empty()) { + getNextPound(referenceFile, substring, ","); + referenceFiles.push_back(substring); + } } vector prevScoreDataFiles; - if (prevScoreDataFile.length() > 0){ - std::string substring; - while (!prevScoreDataFile.empty()){ - getNextPound(prevScoreDataFile, substring, ","); - prevScoreDataFiles.push_back(substring); - } + if (prevScoreDataFile.length() > 0) { + std::string substring; + while (!prevScoreDataFile.empty()) { + getNextPound(prevScoreDataFile, substring, ","); + prevScoreDataFiles.push_back(substring); + } } vector prevFeatureDataFiles; - if (prevFeatureDataFile.length() > 0){ - std::string substring; - while (!prevFeatureDataFile.empty()){ - getNextPound(prevFeatureDataFile, substring, ","); - prevFeatureDataFiles.push_back(substring); - } + if (prevFeatureDataFile.length() > 0) { + std::string substring; + while (!prevFeatureDataFile.empty()) { + getNextPound(prevFeatureDataFile, substring, ","); + prevFeatureDataFiles.push_back(substring); + } } - if (prevScoreDataFiles.size() != prevFeatureDataFiles.size()){ - throw runtime_error("Error: there is a different number of previous score and feature files"); + if (prevScoreDataFiles.size() != prevFeatureDataFiles.size()) { + throw runtime_error("Error: there is a different number of previous score and feature files"); } - - if (binmode) cerr << "Binary write mode is selected" << endl; - else cerr << "Binary write mode is NOT selected" << endl; - - //TODO is comma separated list? split and create a scorer with multiple parts - TRACE_ERR("Scorer type: " << scorerType << endl); - ScorerFactory sfactory; - Scorer* scorer = sfactory.getScorer(scorerType,scorerConfig); - - //load references - if (referenceFiles.size() > 0) - scorer->setReferenceFiles(referenceFiles); - PrintUserTime("References loaded"); - - Data data(*scorer); - - //load old data - for (size_t i=0;i < prevScoreDataFiles.size(); i++){ - data.load(prevFeatureDataFiles.at(i), prevScoreDataFiles.at(i)); - } - - PrintUserTime("Previous data loaded"); - - //computing score statistics of each nbest file - for (size_t i=0;i < nbestFiles.size(); i++){ - data.loadnbest(nbestFiles.at(i)); - } + if (binmode) cerr << "Binary write mode is selected" << endl; + else cerr << "Binary write mode is NOT selected" << endl; - PrintUserTime("Nbest entries loaded and scored"); - - if (binmode) - cerr << "Binary write mode is selected" << endl; - else - cerr << "Binary write mode is NOT selected" << endl; - - data.save(featureDataFile, scoreDataFile, binmode); - PrintUserTime("Stopping..."); -/* - timer.stop("Stopping..."); - */ - - return EXIT_SUCCESS; - } catch (const exception& e) { - cerr << "Exception: " << e.what() << endl; - return EXIT_FAILURE; + //TODO is comma separated list? split and create a scorer with multiple parts + TRACE_ERR("Scorer type: " << scorerType << endl); + ScorerFactory sfactory; + Scorer* scorer = sfactory.getScorer(scorerType,scorerConfig); + + //load references + if (referenceFiles.size() > 0) + scorer->setReferenceFiles(referenceFiles); + + PrintUserTime("References loaded"); + + Data data(*scorer); + + //load old data + for (size_t i=0; i < prevScoreDataFiles.size(); i++) { + data.load(prevFeatureDataFiles.at(i), prevScoreDataFiles.at(i)); } + PrintUserTime("Previous data loaded"); + + //computing score statistics of each nbest file + for (size_t i=0; i < nbestFiles.size(); i++) { + data.loadnbest(nbestFiles.at(i)); + } + + PrintUserTime("Nbest entries loaded and scored"); + + if (binmode) + cerr << "Binary write mode is selected" << endl; + else + cerr << "Binary write mode is NOT selected" << endl; + + data.save(featureDataFile, scoreDataFile, binmode); + PrintUserTime("Stopping..."); + /* + timer.stop("Stopping..."); + */ + + return EXIT_SUCCESS; + } catch (const exception& e) { + cerr << "Exception: " << e.what() << endl; + return EXIT_FAILURE; + } + } diff --git a/mert/gzfilebuf.h b/mert/gzfilebuf.h index 5e0b38c6a..48b0ab036 100644 --- a/mert/gzfilebuf.h +++ b/mert/gzfilebuf.h @@ -4,66 +4,70 @@ #include #include -class gzfilebuf : public std::streambuf { +class gzfilebuf : public std::streambuf +{ public: - gzfilebuf(const char *filename) - { _gzf = gzopen(filename, "rb"); + gzfilebuf(const char *filename) { + _gzf = gzopen(filename, "rb"); setg (_buff+sizeof(int), // beginning of putback area _buff+sizeof(int), // read position _buff+sizeof(int)); // end position } - ~gzfilebuf() { gzclose(_gzf); } + ~gzfilebuf() { + gzclose(_gzf); + } protected: virtual int_type overflow (int_type c) { - throw; + throw; } // write multiple characters virtual std::streamsize xsputn (const char* s, std::streamsize num) { - throw; + throw; } - virtual std::streampos seekpos ( std::streampos sp, std::ios_base::openmode which = std::ios_base::in | std::ios_base::out ){ throw; + virtual std::streampos seekpos ( std::streampos sp, std::ios_base::openmode which = std::ios_base::in | std::ios_base::out ) { + throw; } //read one character virtual int_type underflow () { // is read position before end of _buff? - if (gptr() < egptr()) { - return traits_type::to_int_type(*gptr()); - } + if (gptr() < egptr()) { + return traits_type::to_int_type(*gptr()); + } - /* process size of putback area - * - use number of characters read - * - but at most four - */ - unsigned int numPutback = gptr() - eback(); - if (numPutback > sizeof(int)) { - numPutback = sizeof(int); - } + /* process size of putback area + * - use number of characters read + * - but at most four + */ + unsigned int numPutback = gptr() - eback(); + if (numPutback > sizeof(int)) { + numPutback = sizeof(int); + } - /* copy up to four characters previously read into - * the putback _buff (area of first four characters) - */ - std::memmove (_buff+(sizeof(int)-numPutback), gptr()-numPutback, - numPutback); + /* copy up to four characters previously read into + * the putback _buff (area of first four characters) + */ + std::memmove (_buff+(sizeof(int)-numPutback), gptr()-numPutback, + numPutback); - // read new characters - int num = gzread(_gzf, _buff+sizeof(int), _buffsize-sizeof(int)); - if (num <= 0) { - // ERROR or EOF - return EOF; - } + // read new characters + int num = gzread(_gzf, _buff+sizeof(int), _buffsize-sizeof(int)); + if (num <= 0) { + // ERROR or EOF + return EOF; + } - // reset _buff pointers - setg (_buff+(sizeof(int)-numPutback), // beginning of putback area - _buff+sizeof(int), // read position - _buff+sizeof(int)+num); // end of buffer + // reset _buff pointers + setg (_buff+(sizeof(int)-numPutback), // beginning of putback area + _buff+sizeof(int), // read position + _buff+sizeof(int)+num); // end of buffer - // return next character - return traits_type::to_int_type(*gptr()); + // return next character + return traits_type::to_int_type(*gptr()); } std::streamsize xsgetn (char* s, diff --git a/mert/mert.cpp b/mert/mert.cpp index 8b457ef81..30665e368 100755 --- a/mert/mert.cpp +++ b/mert/mert.cpp @@ -28,7 +28,8 @@ float min_interval = 1e-3; using namespace std; -void usage(void) { +void usage(void) +{ cerr<<"usage: mert -d (mandatory )"<>start[j]; - if(j ScoreDataFiles; - if (scorerfile.length() > 0){ + if (scorerfile.length() > 0) { std::string substring; - while (!scorerfile.empty()){ + while (!scorerfile.empty()) { getNextPound(scorerfile, substring, ","); ScoreDataFiles.push_back(substring); } } vector FeatureDataFiles; - if (featurefile.length() > 0){ + if (featurefile.length() > 0) { std::string substring; - while (!featurefile.empty()){ + while (!featurefile.empty()) { getNextPound(featurefile, substring, ","); FeatureDataFiles.push_back(substring); } } - if (ScoreDataFiles.size() != FeatureDataFiles.size()){ + if (ScoreDataFiles.size() != FeatureDataFiles.size()) { throw runtime_error("Error: there is a different number of previous score and feature files"); } @@ -183,32 +184,37 @@ int main (int argc, char **argv) { //load data Data D(*TheScorer); - for (size_t i=0;i < ScoreDataFiles.size(); i++){ + for (size_t i=0; i < ScoreDataFiles.size(); i++) { cerr<<"Loading Data from: "<< ScoreDataFiles.at(i) << " and " << FeatureDataFiles.at(i) << endl; D.load(FeatureDataFiles.at(i), ScoreDataFiles.at(i)); } PrintUserTime("Data loaded"); - - if (tooptimizestr.length() > 0){ + + if (tooptimizestr.length() > 0) { cerr << "Weights to optimize: " << tooptimizestr << endl; //parse string to get weights to optimize //and set them as active std::string substring; int index; - while (!tooptimizestr.empty()){ + while (!tooptimizestr.empty()) { getNextPound(tooptimizestr, substring, ","); index = D.getFeatureIndex(substring); cerr << "FeatNameIndex:" << index << " to insert" << endl; //index = strtol(substring.c_str(), NULL, 10); - if (index >= 0 && index < pdim){ tooptimize.push_back(index); } - else{ cerr << "Index " << index << " is out of bounds. Allowed indexes are [0," << (pdim-1) << "]." << endl; } + if (index >= 0 && index < pdim) { + tooptimize.push_back(index); + } else { + cerr << "Index " << index << " is out of bounds. Allowed indexes are [0," << (pdim-1) << "]." << endl; + } } - }else{ + } else { //set all weights as active tooptimize.resize(pdim);//We'll optimize on everything - for(int i=0;iSetFData(D.getFeatureData()); Point P(start);//Generate from the full feature set. Warning: must be done after Optimizer initialization statscore_t best=O->Run(P); - Point bestP=P; + Point bestP=P; statscore_t mean=best; statscore_t var=best*best; - stringstream oss; + stringstream oss; oss << "Try number 1"; - + PrintUserTime(oss.str()); - + vector min(Point::getdim()); vector max(Point::getdim()); - - for(unsigned int d=0;dRun(P); - if(score>best){ - best=score; - bestP=P; - } - mean+=score; - var+=(score*score); - - oss.str(""); - oss << "Try number " << (i+1); - PrintUserTime(oss.str()); - } - mean/=(float)ntry; - var/=(float)ntry; - var=sqrt(abs(var-mean*mean)); - if (verboselevel()>1) - cerr<<"best score: "<< best << " variance of the score (for "< " << best << endl; - ofstream res("weights.txt"); - res<Run(P); + if(score>best) { + best=score; + bestP=P; + } + mean+=score; + var+=(score*score); + + oss.str(""); + oss << "Try number " << (i+1); + PrintUserTime(oss.str()); + } + mean/=(float)ntry; + var/=(float)ntry; + var=sqrt(abs(var-mean*mean)); + if (verboselevel()>1) + cerr<<"best score: "<< best << " variance of the score (for "< " << best << endl; + ofstream res("weights.txt"); + res< references; - references.push_back("test_scorer_data/reference.txt"); - //bs.prepare(references, "test-scorer-data/nbest.out"); - Scorer* scorer = new BleuScorer();; - scorer->setReferenceFiles(references); - Data d(*scorer); - d.loadnbest("test_scorer_data/nbest.out"); - //sd.savetxt(); +int main(int argc, char** argv) +{ + cout << "Testing the scorer" << endl; + //BleuScorer bs("test-scorer-data/cppstats.feats.opt");; + vector references; + references.push_back("test_scorer_data/reference.txt"); + //bs.prepare(references, "test-scorer-data/nbest.out"); + Scorer* scorer = new BleuScorer();; + scorer->setReferenceFiles(references); + Data d(*scorer); + d.loadnbest("test_scorer_data/nbest.out"); + //sd.savetxt(); - //calculate two bleu scores, nbest and a diff - ScoreData* sd=d.getScoreData(); - scorer->setScoreData(sd); - candidates_t candidates(sd->size());; - for (size_t i = 0; i < sd->size(); ++i) { - sd->get(i,0).savetxt("/dev/stdout"); - } + //calculate two bleu scores, nbest and a diff + ScoreData* sd=d.getScoreData(); + scorer->setScoreData(sd); + candidates_t candidates(sd->size());; + for (size_t i = 0; i < sd->size(); ++i) { + sd->get(i,0).savetxt("/dev/stdout"); + } - diffs_t diffs; - diff_t diff; - diff.push_back(make_pair(1,2)); - diff.push_back(make_pair(7,8)); - diffs.push_back(diff); - - statscores_t scores; - scorer->score(candidates,diffs,scores); + diffs_t diffs; + diff_t diff; + diff.push_back(make_pair(1,2)); + diff.push_back(make_pair(7,8)); + diffs.push_back(diff); - cout << "Bleus: " << scores[0] << " " << scores[1] << endl; + statscores_t scores; + scorer->score(candidates,diffs,scores); - //try the per - scorer = new PerScorer(); - Data pd(*scorer); - scorer->setReferenceFiles(references); + cout << "Bleus: " << scores[0] << " " << scores[1] << endl; - pd.loadnbest("test_scorer_data/nbest.out"); - //sd.savetxt(); + //try the per + scorer = new PerScorer(); + Data pd(*scorer); + scorer->setReferenceFiles(references); - ScoreData* psd=pd.getScoreData(); - scorer->setScoreData(psd); - for (size_t i = 0; i < psd->size(); ++i) { - psd->get(i,0).savetxt("/dev/stdout"); - } + pd.loadnbest("test_scorer_data/nbest.out"); + //sd.savetxt(); + + ScoreData* psd=pd.getScoreData(); + scorer->setScoreData(psd); + for (size_t i = 0; i < psd->size(); ++i) { + psd->get(i,0).savetxt("/dev/stdout"); + } - cout << "PER: " << scorer->score(candidates) << endl; - + cout << "PER: " << scorer->score(candidates) << endl; + }