diff --git a/mert/BleuScorer.cpp b/mert/BleuScorer.cpp index baf8a0f8b..ac02138b8 100644 --- a/mert/BleuScorer.cpp +++ b/mert/BleuScorer.cpp @@ -85,7 +85,6 @@ class BleuScorer::NgramCounts { BleuScorer::BleuScorer(const string& config) : StatisticsBasedScorer("BLEU", config), - kLENGTH(4), m_ref_length_type(CLOSEST) { const string reflen = getConfig(KEY_REFLEN, REFLEN_CLOSEST); if (reflen == REFLEN_AVERAGE) { @@ -150,7 +149,7 @@ void BleuScorer::setReferenceFiles(const vector& referenceFiles) throw runtime_error("File " + referenceFiles[i] + " has too many sentences"); } NgramCounts counts; - size_t length = countNgrams(line, counts, kLENGTH); + size_t length = countNgrams(line, counts, kBleuNgramOrder); //for any counts larger than those already there, merge them in for (NgramCounts::const_iterator ci = counts.begin(); ci != counts.end(); ++ci) { @@ -184,9 +183,9 @@ void BleuScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry) } NgramCounts testcounts; // stats for this line - vector stats(kLENGTH * 2); + vector stats(kBleuNgramOrder * 2); string sentence = this->applyFactors(text); - const size_t length = countNgrams(sentence, testcounts, kLENGTH); + const size_t length = countNgrams(sentence, testcounts, kBleuNgramOrder); // Calculate effective reference length. switch (m_ref_length_type) { @@ -222,15 +221,16 @@ void BleuScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry) float BleuScorer::calculateScore(const vector& comps) const { float logbleu = 0.0; - for (int i = 0; i < kLENGTH; ++i) { + for (int i = 0; i < kBleuNgramOrder; ++i) { if (comps[2*i] == 0) { return 0.0; } logbleu += log(comps[2*i]) - log(comps[2*i+1]); } - logbleu /= kLENGTH; - const float brevity = 1.0 - static_cast(comps[kLENGTH*2]) / comps[1];//reflength divided by test length + logbleu /= kBleuNgramOrder; + // reflength divided by test length + const float brevity = 1.0 - static_cast(comps[kBleuNgramOrder * 2]) / comps[1]; if (brevity < 0.0) { logbleu += brevity; } diff --git a/mert/BleuScorer.h b/mert/BleuScorer.h index 7ae19fa5f..f3513e135 100644 --- a/mert/BleuScorer.h +++ b/mert/BleuScorer.h @@ -12,6 +12,8 @@ using namespace std; +const int kBleuNgramOrder = 4; + /** * Bleu scoring */ @@ -24,7 +26,7 @@ public: virtual void setReferenceFiles(const vector& referenceFiles); virtual void prepareStats(size_t sid, const string& text, ScoreStats& entry); virtual float calculateScore(const vector& comps) const; - virtual size_t NumberOfScores() const { return 2 * kLENGTH + 1; } + virtual size_t NumberOfScores() const { return 2 * kBleuNgramOrder + 1; } private: enum ReferenceLengthType { @@ -55,7 +57,6 @@ private: void CalcShortest(size_t sentence_id, vector& stats) const; - const int kLENGTH; ReferenceLengthType m_ref_length_type; // data extracted from reference files diff --git a/mert/Data.cpp b/mert/Data.cpp index c4a35b9b2..b1950ea4e 100644 --- a/mert/Data.cpp +++ b/mert/Data.cpp @@ -7,7 +7,6 @@ */ #include -#include "util/check.hh" #include #include @@ -16,36 +15,37 @@ #include "Scorer.h" #include "ScorerFactory.h" #include "Util.h" +#include "util/check.hh" Data::Data() - : theScorer(NULL), - number_of_scores(0), - _sparse_flag(false), - scoredata(), - featdata() {} + : m_scorer(NULL), + m_num_scores(0), + m_sparse_flag(false), + m_score_data(), + m_feature_data() {} -Data::Data(Scorer& ptr) - : theScorer(&ptr), - score_type(theScorer->getName()), - number_of_scores(0), - _sparse_flag(false), - scoredata(new ScoreData(*theScorer)), - featdata(new FeatureData) +Data::Data(Scorer* scorer) + : m_scorer(scorer), + m_score_type(m_scorer->getName()), + m_num_scores(0), + m_sparse_flag(false), + m_score_data(new ScoreData(m_scorer)), + m_feature_data(new FeatureData) { - TRACE_ERR("Data::score_type " << score_type << endl); - TRACE_ERR("Data::Scorer type from Scorer: " << theScorer->getName() << endl); + TRACE_ERR("Data::m_score_type " << m_score_type << endl); + TRACE_ERR("Data::Scorer type from Scorer: " << m_scorer->getName() << endl); } //ADDED BY TS -void Data::remove_duplicates() { +// TODO: This is too long; consider creating additional functions to +// reduce the lines of this function. +void Data::removeDuplicates() { + size_t nSentences = m_feature_data->size(); + assert(m_score_data->size() == nSentences); - size_t nSentences = featdata->size(); - assert(scoredata->size() == nSentences); - - for (size_t s=0; s < nSentences; s++) { - - FeatureArray& feat_array = featdata->get(s); - ScoreArray& score_array = scoredata->get(s); + for (size_t s = 0; s < nSentences; s++) { + FeatureArray& feat_array = m_feature_data->get(s); + ScoreArray& score_array = m_score_data->get(s); assert(feat_array.size() == score_array.size()); @@ -55,48 +55,42 @@ void Data::remove_duplicates() { size_t end_pos = feat_array.size() - 1; size_t nRemoved = 0; - for (size_t k=0; k <= end_pos; k++) { + for (size_t k = 0; k <= end_pos; k++) { const FeatureStats& cur_feats = feat_array.get(k); - double sum = 0.0; - for (size_t l=0; l < cur_feats.size(); l++) - sum += cur_feats.get(l); + for (size_t l = 0; l < cur_feats.size(); l++) + sum += cur_feats.get(l); if (lookup.find(sum) != lookup.end()) { - //cerr << "hit" << endl; + //cerr << "hit" << endl; + vector& cur_list = lookup[sum]; - vector& cur_list = lookup[sum]; + // TODO: Make sure this is correct because we have already used 'l'. + // If this does not impact on the removing duplicates, it is better + // to change + size_t l = 0; + for (l = 0; l < cur_list.size(); l++) { + size_t j = cur_list[l]; - size_t l=0; - for (l=0; l < cur_list.size(); l++) { - - size_t j=cur_list[l]; - - if (cur_feats == feat_array.get(j) - && score_array.get(k) == score_array.get(j)) { - - if (k < end_pos) { - - feat_array.swap(k,end_pos); - score_array.swap(k,end_pos); - - k--; - } - - end_pos--; - nRemoved++; - break; - } - } - - if (l == lookup[sum].size()) - cur_list.push_back(k); + if (cur_feats == feat_array.get(j) + && score_array.get(k) == score_array.get(j)) { + if (k < end_pos) { + feat_array.swap(k,end_pos); + score_array.swap(k,end_pos); + k--; + } + end_pos--; + nRemoved++; + break; + } + } + if (l == lookup[sum].size()) + cur_list.push_back(k); + } else { + lookup[sum].push_back(k); } - else - lookup[sum].push_back(k); - // for (size_t j=0; j < k; j++) { // if (feat_array.get(k) == feat_array.get(j) @@ -115,11 +109,9 @@ void Data::remove_duplicates() { // break; // } // } - } - + } // end for k if (nRemoved > 0) { - feat_array.resize(end_pos+1); score_array.resize(end_pos+1); } @@ -127,8 +119,14 @@ void Data::remove_duplicates() { } //END_ADDED +void Data::load(const std::string &featfile, const std::string &scorefile) { + m_feature_data->load(featfile); + m_score_data->load(scorefile); + if (m_feature_data->hasSparseFeatures()) + m_sparse_flag = true; +} -void Data::loadnbest(const string &file) +void Data::loadNBest(const string &file) { TRACE_ERR("loading nbest from " << file << endl); inputfilestream inp(file); // matches a stream with a file. Opens the file @@ -147,8 +145,8 @@ void Data::loadnbest(const string &file) getNextPound(line, sentence, "|||"); // second field getNextPound(line, feature_str, "|||"); // third field - theScorer->prepareStats(sentence_index, sentence, scoreentry); - scoredata->add(scoreentry, sentence_index); + m_scorer->prepareStats(sentence_index, sentence, scoreentry); + m_score_data->add(scoreentry, sentence_index); // examine first line for name of features if (!existsFeatureNames()) { @@ -159,6 +157,16 @@ void Data::loadnbest(const string &file) inp.close(); } +void Data::save(const std::string &featfile, const std::string &scorefile, bool bin) { + if (bin) + cerr << "Binary write mode is selected" << endl; + else + cerr << "Binary write mode is NOT selected" << endl; + + m_feature_data->save(featfile, bin); + m_score_data->save(scorefile, bin); +} + void Data::InitFeatureMap(const string& str) { string buf = str; string substr; @@ -185,7 +193,7 @@ void Data::InitFeatureMap(const string& str) { tmp_name = substr.substr(0, substr.size() - 1); } } - featdata->setFeatureMap(features); + m_feature_data->setFeatureMap(features); } void Data::AddFeatures(const string& str, @@ -207,10 +215,10 @@ void Data::AddFeatures(const string& str, string name = substr; getNextPound(buf, substr); feature_entry.addSparse(name, atof(substr.c_str())); - _sparse_flag = true; + m_sparse_flag = true; } } - featdata->add(feature_entry, sentence_index); + m_feature_data->add(feature_entry, sentence_index); } // TODO @@ -226,8 +234,8 @@ void Data::createShards(size_t shard_count, float shard_size, const string& scor CHECK(shard_size >= 0); CHECK(shard_size <= 1); - size_t data_size = scoredata->size(); - CHECK(data_size == featdata->size()); + size_t data_size = m_score_data->size(); + CHECK(data_size == m_feature_data->size()); shard_size *= data_size; const float coeff = static_cast(data_size) / shard_count; @@ -248,15 +256,15 @@ void Data::createShards(size_t shard_count, float shard_size, const string& scor } } - Scorer* scorer = ScorerFactory::getScorer(score_type, scorerconfig); + Scorer* scorer = ScorerFactory::getScorer(m_score_type, scorerconfig); - shards.push_back(Data(*scorer)); - shards.back().score_type = score_type; - shards.back().number_of_scores = number_of_scores; - shards.back()._sparse_flag = _sparse_flag; + shards.push_back(Data(scorer)); + shards.back().m_score_type = m_score_type; + shards.back().m_num_scores = m_num_scores; + shards.back().m_sparse_flag = m_sparse_flag; for (size_t i = 0; i < shard_contents.size(); ++i) { - shards.back().featdata->add(featdata->get(shard_contents[i])); - shards.back().scoredata->add(scoredata->get(shard_contents[i])); + shards.back().m_feature_data->add(m_feature_data->get(shard_contents[i])); + shards.back().m_score_data->add(m_score_data->get(shard_contents[i])); } //cerr << endl; } diff --git a/mert/Data.h b/mert/Data.h index dbd7c753b..376367d4c 100644 --- a/mert/Data.h +++ b/mert/Data.h @@ -11,11 +11,8 @@ using namespace std; -#include #include -#include - -#include +#include #include "Util.h" #include "FeatureData.h" @@ -26,95 +23,70 @@ class Scorer; typedef boost::shared_ptr ScoreDataHandle; typedef boost::shared_ptr FeatureDataHandle; +// NOTE: there is no copy constructor implemented, so only the +// compiler synthesised shallow copy is available. class Data { private: - Scorer* theScorer; - std::string score_type; - size_t number_of_scores; - bool _sparse_flag; + Scorer* m_scorer; + std::string m_score_type; + size_t m_num_scores; + bool m_sparse_flag; + ScoreDataHandle m_score_data; + FeatureDataHandle m_feature_data; // Helper functions for loadnbest(); void InitFeatureMap(const std::string& str); void AddFeatures(const std::string& str, const std::string& sentence_index); -protected: - ScoreDataHandle scoredata; - FeatureDataHandle featdata; - public: - explicit Data(Scorer& sc); + explicit Data(Scorer* scorer); Data(); - //Note that there is no copy constructor implemented, so only the - //compiler synthesised shallow copy is available - - inline void clear() { - scoredata->clear(); - featdata->clear(); + void clear() { + m_score_data->clear(); + m_feature_data->clear(); } - ScoreDataHandle getScoreData() { - return scoredata; + ScoreDataHandle getScoreData() { return m_score_data; } + + FeatureDataHandle getFeatureData() { return m_feature_data; } + + Scorer* getScorer() { return m_scorer; } + + size_t NumberOfFeatures() const { + return m_feature_data->NumberOfFeatures(); } - FeatureDataHandle getFeatureData() { - return featdata; - } + void NumberOfFeatures(size_t v) { m_feature_data->NumberOfFeatures(v); } - Scorer* getScorer() { - return theScorer; - } + std::string Features() const { return m_feature_data->Features(); } + void Features(const std::string &f) { m_feature_data->Features(f); } - inline size_t NumberOfFeatures() const { - return featdata->NumberOfFeatures(); - } - inline void NumberOfFeatures(size_t v) { - featdata->NumberOfFeatures(v); - } - inline std::string Features() const { - return featdata->Features(); - } - inline void Features(const std::string &f) { - featdata->Features(f); - } - - inline bool hasSparseFeatures() const { return _sparse_flag; } + bool hasSparseFeatures() const { return m_sparse_flag; } void mergeSparseFeatures(); - void loadnbest(const std::string &file); + void loadNBest(const std::string &file); - void load(const std::string &featfile,const std::string &scorefile) { - featdata->load(featfile); - scoredata->load(scorefile); - if (featdata->hasSparseFeatures()) - _sparse_flag = true; - } + void load(const std::string &featfile, const std::string &scorefile); + + void save(const std::string &featfile, const std::string &scorefile, bool bin=false); //ADDED BY TS - void remove_duplicates(); + void removeDuplicates(); //END_ADDED - void save(const std::string &featfile,const std::string &scorefile, bool bin=false) { - - if (bin) cerr << "Binary write mode is selected" << endl; - else cerr << "Binary write mode is NOT selected" << endl; - - featdata->save(featfile, bin); - scoredata->save(scorefile, bin); - } - inline bool existsFeatureNames() const { - return featdata->existsFeatureNames(); + return m_feature_data->existsFeatureNames(); } inline std::string getFeatureName(size_t idx) const { - return featdata->getFeatureName(idx); + return m_feature_data->getFeatureName(idx); } inline size_t getFeatureIndex(const std::string& name) const { - return featdata->getFeatureIndex(name); + return m_feature_data->getFeatureIndex(name); } /** diff --git a/mert/DataTest.cpp b/mert/DataTest.cpp index 6d48a46eb..bf644fe1a 100644 --- a/mert/DataTest.cpp +++ b/mert/DataTest.cpp @@ -10,7 +10,7 @@ //very basic test of sharding BOOST_AUTO_TEST_CASE(shard_basic) { boost::scoped_ptr scorer(ScorerFactory::getScorer("BLEU", "")); - Data data(*scorer); + Data data(scorer.get()); FeatureArray fa1, fa2, fa3, fa4; ScoreArray sa1, sa2, sa3, sa4; fa1.setIndex("1"); diff --git a/mert/FeatureArray.cpp b/mert/FeatureArray.cpp index 594411998..62f9ceda5 100644 --- a/mert/FeatureArray.cpp +++ b/mert/FeatureArray.cpp @@ -6,135 +6,147 @@ * */ +#include #include "FeatureArray.h" #include "FileStream.h" #include "Util.h" - FeatureArray::FeatureArray() - : idx(""), number_of_features(0), _sparse_flag(false) {} + : m_index(""), m_num_features(0), m_sparse_flag(false) {} FeatureArray::~FeatureArray() {} -void FeatureArray::savetxt(std::ofstream& outFile) +void FeatureArray::savetxt(ostream* os) { - outFile << FEATURES_TXT_BEGIN << " " << idx << " " << array_.size() - << " " << number_of_features << " " << features << std::endl; - for (featarray_t::iterator i = array_.begin(); i !=array_.end(); i++) { - i->savetxt(outFile); - outFile << std::endl; + *os << FEATURES_TXT_BEGIN << " " << m_index << " " << m_array.size() + << " " << m_num_features << " " << m_features << endl; + for (featarray_t::iterator i = m_array.begin(); i != m_array.end(); ++i) { + i->savetxt(os); + *os << endl; } - outFile << FEATURES_TXT_END << std::endl; + *os << FEATURES_TXT_END << endl; } -void FeatureArray::savebin(std::ofstream& outFile) +void FeatureArray::savebin(ostream* os) { - outFile << FEATURES_BIN_BEGIN << " " << idx << " " << array_.size() - << " " << number_of_features << " " << features << std::endl; - for (featarray_t::iterator i = array_.begin(); i !=array_.end(); i++) - i->savebin(outFile); + *os << FEATURES_BIN_BEGIN << " " << m_index << " " << m_array.size() + << " " << m_num_features << " " << m_features << endl; + for (featarray_t::iterator i = m_array.begin(); i != m_array.end(); ++i) + i->savebin(os); - outFile << FEATURES_BIN_END << std::endl; + *os << FEATURES_BIN_END << endl; } -void FeatureArray::save(std::ofstream& inFile, bool bin) +void FeatureArray::save(ostream* os, bool bin) { - if (size()>0) - (bin)?savebin(inFile):savetxt(inFile); + if (size() <= 0) return; + if (bin) { + savebin(os); + } else { + savetxt(os); + } } -void FeatureArray::save(const std::string &file, bool bin) +void FeatureArray::save(const string &file, bool bin) { - - std::ofstream outFile(file.c_str(), std::ios::out); // matches a stream with a file. Opens the file - - save(outFile); - - outFile.close(); + ofstream ofs(file.c_str(), ios::out); + if (!ofs) { + cerr << "Failed to open " << file << endl; + exit(1); + } + ostream *os = &ofs; + save(os, bin); + ofs.close(); } -void FeatureArray::loadbin(ifstream& inFile, size_t n) +void FeatureArray::save(bool bin) { - FeatureStats entry(number_of_features); + save(&cout, bin); +} - for (size_t i=0 ; i < n; i++) { - entry.loadbin(inFile); +void FeatureArray::loadbin(istream* is, size_t n) +{ + FeatureStats entry(m_num_features); + for (size_t i = 0 ; i < n; i++) { + entry.loadbin(is); add(entry); } } -void FeatureArray::loadtxt(ifstream& inFile, size_t n) +void FeatureArray::loadtxt(istream* is, size_t n) { - FeatureStats entry(number_of_features); + FeatureStats entry(m_num_features); - for (size_t i=0 ; i < n; i++) { - entry.loadtxt(inFile); + for (size_t i = 0; i < n; i++) { + entry.loadtxt(is); add(entry); if (entry.getSparse().size()>0) - _sparse_flag = true; + m_sparse_flag = true; } } -void FeatureArray::load(ifstream& inFile) +void FeatureArray::load(istream* is) { - size_t number_of_entries=0; - bool binmode=false; + size_t number_of_entries = 0; + bool binmode = false; - std::string substring, stringBuf; - std::string::size_type loc; + string substring, stringBuf; + string::size_type loc; - std::getline(inFile, stringBuf); - if (!inFile.good()) { + getline(*is, stringBuf); + if (!is->good()) { return; } if (!stringBuf.empty()) { if ((loc = stringBuf.find(FEATURES_TXT_BEGIN)) == 0) { - binmode=false; + binmode = false; } else if ((loc = stringBuf.find(FEATURES_BIN_BEGIN)) == 0) { - binmode=true; + binmode = true; } else { TRACE_ERR("ERROR: FeatureArray::load(): Wrong header"); return; } getNextPound(stringBuf, substring); getNextPound(stringBuf, substring); - idx = substring; + m_index = substring; getNextPound(stringBuf, substring); number_of_entries = atoi(substring.c_str()); getNextPound(stringBuf, substring); - number_of_features = atoi(substring.c_str()); - features = stringBuf; + m_num_features = atoi(substring.c_str()); + m_features = stringBuf; } - (binmode)?loadbin(inFile, number_of_entries):loadtxt(inFile, number_of_entries); + if (binmode) { + loadbin(is, number_of_entries); + } else { + loadtxt(is, number_of_entries); + } - std::getline(inFile, stringBuf); + getline(*is, stringBuf); if (!stringBuf.empty()) { - if ((loc = stringBuf.find(FEATURES_TXT_END)) != 0 && (loc = stringBuf.find(FEATURES_BIN_END)) != 0) { + if ((loc = stringBuf.find(FEATURES_TXT_END)) != 0 && + (loc = stringBuf.find(FEATURES_BIN_END)) != 0) { TRACE_ERR("ERROR: FeatureArray::load(): Wrong footer"); return; } } } -void FeatureArray::load(const std::string &file) +void FeatureArray::load(const string &file) { - TRACE_ERR("loading data from " << file << std::endl); - - inputfilestream inFile(file); // matches a stream with a file. Opens the file - - load((ifstream&) inFile); - - inFile.close(); - + TRACE_ERR("loading data from " << file << endl); + inputfilestream input_stream(file); // matches a stream with a file. Opens the file + istream* is = &input_stream; + load(is); + input_stream.close(); } void FeatureArray::merge(FeatureArray& e) { //dummy implementation - for (size_t i=0; isize() != sz) return false; } return true; } - diff --git a/mert/FeatureArray.h b/mert/FeatureArray.h index b4b305e39..25ebbe866 100644 --- a/mert/FeatureArray.h +++ b/mert/FeatureArray.h @@ -11,7 +11,6 @@ #include #include -#include #include "FeatureStats.h" using namespace std; @@ -26,82 +25,57 @@ class FeatureArray private: // idx to identify the utterance. It can differ from // the index inside the vector. - std::string idx; - -protected: - featarray_t array_; - size_t number_of_features; - std::string features; - bool _sparse_flag; + std::string m_index; + featarray_t m_array; + size_t m_num_features; + std::string m_features; + bool m_sparse_flag; public: FeatureArray(); ~FeatureArray(); - inline void clear() { - array_.clear(); - } + void clear() { m_array.clear(); } - inline bool hasSparseFeatures() const { - return _sparse_flag; - } + bool hasSparseFeatures() const { return m_sparse_flag; } - inline std::string getIndex() const { - return idx; - } - inline void setIndex(const std::string& value) { - idx = value; - } + std::string getIndex() const { return m_index; } + void setIndex(const std::string& value) { m_index = value; } - inline FeatureStats& get(size_t i) { - return array_.at(i); - } - inline const FeatureStats& get(size_t i)const { - return array_.at(i); - } - void add(FeatureStats& e) { - array_.push_back(e); - } + FeatureStats& get(size_t i) { return m_array.at(i); } + const FeatureStats& get(size_t i) const { return m_array.at(i); } + + void add(FeatureStats& e) { m_array.push_back(e); } //ADDED BY TS void swap(size_t i, size_t j) { - std::swap(array_[i],array_[j]); + std::swap(m_array[i], m_array[j]); } - + void resize(size_t new_size) { - array_.resize(std::min(new_size,array_.size())); + m_array.resize(std::min(new_size, m_array.size())); } //END_ADDED void merge(FeatureArray& e); - inline size_t size() const { - return array_.size(); - } - inline size_t NumberOfFeatures() const { - return number_of_features; - } - inline void NumberOfFeatures(size_t v) { - number_of_features = v; - } - inline std::string Features() const { - return features; - } - inline void Features(const std::string& f) { - features = f; - } + size_t size() const { return m_array.size(); } - void savetxt(ofstream& outFile); - void savebin(ofstream& outFile); - void save(ofstream& outFile, bool bin=false); + size_t NumberOfFeatures() const { return m_num_features; } + void NumberOfFeatures(size_t v) { m_num_features = v; } + + std::string Features() const { return m_features; } + void Features(const std::string& f) { m_features = f; } + + void savetxt(std::ostream* os); + void savebin(std::ostream* os); + void save(std::ostream* os, bool bin=false); void save(const std::string &file, bool bin=false); - inline void save(bool bin=false) { - save("/dev/stdout",bin); - } + void save(bool bin=false); - void loadtxt(ifstream& inFile, size_t n); - void loadbin(ifstream& inFile, size_t n); - void load(ifstream& inFile); + void loadtxt(std::istream* is, size_t n); + void loadbin(std::istream* is, size_t n); + void load(std::istream* is); void load(const std::string &file); bool check_consistency() const; diff --git a/mert/FeatureData.cpp b/mert/FeatureData.cpp index 081f7ab32..7e46f1803 100644 --- a/mert/FeatureData.cpp +++ b/mert/FeatureData.cpp @@ -13,44 +13,45 @@ #include "Util.h" #include -static const float MIN_FLOAT=-1.0*numeric_limits::max(); -static const float MAX_FLOAT=numeric_limits::max(); +static const float MIN_FLOAT = -1.0 * numeric_limits::max(); +static const float MAX_FLOAT = numeric_limits::max(); FeatureData::FeatureData() - : number_of_features(0), - _sparse_flag(false) {} + : m_num_features(0), + m_sparse_flag(false) {} -void FeatureData::save(std::ofstream& outFile, bool bin) +void FeatureData::save(ostream* os, bool bin) { - for (featdata_t::iterator i = array_.begin(); i !=array_.end(); i++) - i->save(outFile, bin); + for (featdata_t::iterator i = m_array.begin(); i != m_array.end(); i++) + i->save(os, bin); } -void FeatureData::save(const std::string &file, bool bin) +void FeatureData::save(const string &file, bool bin) { if (file.empty()) return; - - TRACE_ERR("saving the array into " << file << std::endl); - - std::ofstream outFile(file.c_str(), std::ios::out); // matches a stream with a file. Opens the file - - save(outFile, bin); - - outFile.close(); + TRACE_ERR("saving the array into " << file << endl); + ofstream ofs(file.c_str(), ios::out); // matches a stream with a file. Opens the file + ostream* os = &ofs; + save(os, bin); + ofs.close(); } -void FeatureData::load(ifstream& inFile) +void FeatureData::save(bool bin) { + save(&cout, bin); +} + +void FeatureData::load(istream* is) { FeatureArray entry; - while (!inFile.eof()) { + while (!is->eof()) { - if (!inFile.good()) { - std::cerr << "ERROR FeatureData::load inFile.good()" << std::endl; + if (!is->good()) { + cerr << "ERROR FeatureData::load inFile.good()" << endl; } entry.clear(); - entry.load(inFile); + entry.load(is); if (entry.size() == 0) break; @@ -59,26 +60,23 @@ void FeatureData::load(ifstream& inFile) setFeatureMap(entry.Features()); if (entry.hasSparseFeatures()) - _sparse_flag = true; + m_sparse_flag = true; add(entry); } } -void FeatureData::load(const std::string &file) +void FeatureData::load(const string &file) { - TRACE_ERR("loading feature data from " << file << std::endl); - - inputfilestream inFile(file); // matches a stream with a file. Opens the file - - if (!inFile) { + TRACE_ERR("loading feature data from " << file << endl); + inputfilestream input_stream(file); // matches a stream with a file. Opens the file + if (!input_stream) { throw runtime_error("Unable to open feature file: " + file); } - - load((ifstream&) inFile); - - inFile.close(); + istream* is = &input_stream; + load(is); + input_stream.close(); } void FeatureData::add(FeatureArray& e) @@ -86,25 +84,25 @@ void FeatureData::add(FeatureArray& e) if (exists(e.getIndex())) { // array at position e.getIndex() already exists //enlarge array at position e.getIndex() size_t pos = getIndex(e.getIndex()); - array_.at(pos).merge(e); + m_array.at(pos).merge(e); } else { - array_.push_back(e); + m_array.push_back(e); setIndex(); } } -void FeatureData::add(FeatureStats& e, const std::string& sent_idx) +void FeatureData::add(FeatureStats& e, const string& sent_idx) { if (exists(sent_idx)) { // array at position e.getIndex() already exists //enlarge array at position e.getIndex() size_t pos = getIndex(sent_idx); // TRACE_ERR("Inserting " << e << " in array " << sent_idx << std::endl); - array_.at(pos).add(e); + m_array.at(pos).add(e); } else { // TRACE_ERR("Creating a new entry in the array and inserting " << e << std::endl); FeatureArray a; - a.NumberOfFeatures(number_of_features); - a.Features(features); + a.NumberOfFeatures(m_num_features); + a.Features(m_features); a.setIndex(sent_idx); a.add(e); add(a); @@ -113,10 +111,10 @@ void FeatureData::add(FeatureStats& e, const std::string& sent_idx) bool FeatureData::check_consistency() const { - if (array_.size() == 0) + if (m_array.size() == 0) return true; - for (featdata_t::const_iterator i = array_.begin(); i != array_.end(); i++) + for (featdata_t::const_iterator i = m_array.begin(); i != m_array.end(); i++) if (!i->check_consistency()) return false; return true; @@ -125,26 +123,26 @@ bool FeatureData::check_consistency() const void FeatureData::setIndex() { size_t j=0; - for (featdata_t::iterator i = array_.begin(); i !=array_.end(); i++) { - idx2arrayname_[j]=(*i).getIndex(); - arrayname2idx_[(*i).getIndex()] = j; + for (featdata_t::iterator i = m_array.begin(); i !=m_array.end(); i++) { + m_index_to_array_name[j]=(*i).getIndex(); + m_array_name_to_index[(*i).getIndex()] = j; j++; } } -void FeatureData::setFeatureMap(const std::string& feat) +void FeatureData::setFeatureMap(const string& feat) { - number_of_features = 0; - features = feat; + m_num_features = 0; + m_features = feat; vector buf; Tokenize(feat.c_str(), ' ', &buf); for (vector::const_iterator it = buf.begin(); it != buf.end(); ++it) { - const size_t size = idx2featname_.size(); - featname2idx_[*it] = size; - idx2featname_[size] = *it; - ++number_of_features; + const size_t size = m_index_to_feature_name.size(); + m_feature_name_to_index[*it] = size; + m_index_to_feature_name[size] = *it; + ++m_num_features; } } @@ -152,26 +150,23 @@ string FeatureData::ToString() const { string res; char buf[100]; - snprintf(buf, sizeof(buf), "number of features: %lu, ", number_of_features); + snprintf(buf, sizeof(buf), "number of features: %lu, ", m_num_features); res.append(buf); - snprintf(buf, sizeof(buf), "features: "); - res.append(buf); - res.append(features); + res.append("features: "); + res.append(m_features); - snprintf(buf, sizeof(buf), ", sparse flag: %s, ", (_sparse_flag) ? "yes" : "no"); + snprintf(buf, sizeof(buf), ", sparse flag: %s, ", (m_sparse_flag) ? "yes" : "no"); res.append(buf); - snprintf(buf, sizeof(buf), "feature_id_map = { "); - res.append(buf); - for (map::const_iterator it = featname2idx_.begin(); - it != featname2idx_.end(); ++it) { + res.append("feature_id_map = { "); + for (map::const_iterator it = m_feature_name_to_index.begin(); + it != m_feature_name_to_index.end(); ++it) { snprintf(buf, sizeof(buf), "%s => %lu, ", it->first.c_str(), it->second); res.append(buf); } - snprintf(buf, sizeof(buf), "}"); - res.append(buf); + res.append("}"); return res; } diff --git a/mert/FeatureData.h b/mert/FeatureData.h index 09fb8e9be..347221ea6 100644 --- a/mert/FeatureData.h +++ b/mert/FeatureData.h @@ -19,109 +19,92 @@ using namespace std; class FeatureData { private: - size_t number_of_features; - std::string features; - bool _sparse_flag; - - map featname2idx_; // map from name to index of features - map idx2featname_; // map from index to name of features - -protected: - featdata_t array_; - idx2name idx2arrayname_; // map from index to name of array - name2idx arrayname2idx_; // map from name to index of array + size_t m_num_features; + std::string m_features; + bool m_sparse_flag; + map m_feature_name_to_index; // map from name to index of features + map m_index_to_feature_name; // map from index to name of features + featdata_t m_array; + idx2name m_index_to_array_name; // map from index to name of array + name2idx m_array_name_to_index; // map from name to index of array public: FeatureData(); ~FeatureData() {} - inline void clear() { - array_.clear(); + void clear() { m_array.clear(); } + + bool hasSparseFeatures() const { return m_sparse_flag; } + + FeatureArray get(const std::string& idx) { + return m_array.at(getIndex(idx)); } - inline bool hasSparseFeatures() const { - return _sparse_flag; - } - inline FeatureArray get(const std::string& idx) { - return array_.at(getIndex(idx)); - } - inline FeatureArray& get(size_t idx) { - return array_.at(idx); - } - inline const FeatureArray& get(size_t idx) const { - return array_.at(idx); - } + FeatureArray& get(size_t idx) { return m_array.at(idx); } + const FeatureArray& get(size_t idx) const { return m_array.at(idx); } inline bool exists(const std::string& sent_idx) const { return exists(getIndex(sent_idx)); } inline bool exists(int sent_idx) const { - return (sent_idx > -1 && sent_idx < static_cast(array_.size())) ? true : false; + return (sent_idx > -1 && sent_idx < static_cast(m_array.size())) ? true : false; } inline FeatureStats& get(size_t i, size_t j) { - return array_.at(i).get(j); + return m_array.at(i).get(j); } - inline const FeatureStats& get(size_t i, size_t j) const { - return array_.at(i).get(j); + + inline const FeatureStats& get(size_t i, size_t j) const { + return m_array.at(i).get(j); } void add(FeatureArray& e); void add(FeatureStats& e, const std::string& sent_idx); - inline size_t size() const { - return array_.size(); - } - inline size_t NumberOfFeatures() const { - return number_of_features; - } - inline void NumberOfFeatures(size_t v) { - number_of_features = v; - } - inline std::string Features() const { - return features; - } - inline void Features(const std::string& f) { - features = f; - } + size_t size() const { return m_array.size(); } + + size_t NumberOfFeatures() const { return m_num_features; } + void NumberOfFeatures(size_t v) { m_num_features = v; } + + std::string Features() const { return m_features; } + void Features(const std::string& f) { m_features = f; } void save(const std::string &file, bool bin=false); - void save(ofstream& outFile, bool bin=false); - inline void save(bool bin=false) { - save("/dev/stdout", bin); - } + void save(std::ostream* os, bool bin=false); + void save(bool bin=false); - void load(ifstream& inFile); + void load(std::istream* is); void load(const std::string &file); bool check_consistency() const; + void setIndex(); inline int getIndex(const std::string& idx) const { - name2idx::const_iterator i = arrayname2idx_.find(idx); - if (i != arrayname2idx_.end()) + name2idx::const_iterator i = m_array_name_to_index.find(idx); + if (i != m_array_name_to_index.end()) return i->second; else return -1; } inline std::string getIndex(size_t idx) const { - idx2name::const_iterator i = idx2arrayname_.find(idx); - if (i != idx2arrayname_.end()) + idx2name::const_iterator i = m_index_to_array_name.find(idx); + if (i != m_index_to_array_name.end()) throw runtime_error("there is no entry at index " + idx); return i->second; } bool existsFeatureNames() const { - return (idx2featname_.size() > 0) ? true : false; + return (m_index_to_feature_name.size() > 0) ? true : false; } std::string getFeatureName(size_t idx) const { - if (idx >= idx2featname_.size()) + if (idx >= m_index_to_feature_name.size()) throw runtime_error("Error: you required an too big index"); - map::const_iterator it = idx2featname_.find(idx); - if (it == idx2featname_.end()) { + map::const_iterator it = m_index_to_feature_name.find(idx); + if (it == m_index_to_feature_name.end()) { throw runtime_error("Error: specified id is unknown: " + idx); } else { return it->second; @@ -129,8 +112,8 @@ public: } size_t getFeatureIndex(const std::string& name) const { - map::const_iterator it = featname2idx_.find(name); - if (it == featname2idx_.end()) + map::const_iterator it = m_feature_name_to_index.find(name); + if (it == m_feature_name_to_index.end()) throw runtime_error("Error: feature " + name + " is unknown"); return it->second; } diff --git a/mert/FeatureStats.cpp b/mert/FeatureStats.cpp index 73d7ec13a..38aa31328 100644 --- a/mert/FeatureStats.cpp +++ b/mert/FeatureStats.cpp @@ -8,6 +8,7 @@ #include "FeatureStats.h" +#include #include #include "Util.h" @@ -15,58 +16,58 @@ namespace { const int kAvailableSize = 8; } // namespace -SparseVector::name2id_t SparseVector::name2id_; -SparseVector::id2name_t SparseVector::id2name_; +SparseVector::name2id_t SparseVector::m_name_to_id; +SparseVector::id2name_t SparseVector::m_id_to_name; FeatureStatsType SparseVector::get(const string& name) const { - name2id_t::const_iterator name2id_iter = name2id_.find(name); - if (name2id_iter == name2id_.end()) return 0; + name2id_t::const_iterator name2id_iter = m_name_to_id.find(name); + if (name2id_iter == m_name_to_id.end()) return 0; size_t id = name2id_iter->second; return get(id); } FeatureStatsType SparseVector::get(size_t id) const { - fvector_t::const_iterator fvector_iter = fvector_.find(id); - if (fvector_iter == fvector_.end()) return 0; + fvector_t::const_iterator fvector_iter = m_fvector.find(id); + if (fvector_iter == m_fvector.end()) return 0; return fvector_iter->second; } void SparseVector::set(const string& name, FeatureStatsType value) { - name2id_t::const_iterator name2id_iter = name2id_.find(name); + name2id_t::const_iterator name2id_iter = m_name_to_id.find(name); size_t id = 0; - if (name2id_iter == name2id_.end()) { - id = id2name_.size(); - id2name_.push_back(name); - name2id_[name] = id; + if (name2id_iter == m_name_to_id.end()) { + id = m_id_to_name.size(); + m_id_to_name.push_back(name); + m_name_to_id[name] = id; } else { id = name2id_iter->second; } - fvector_[id] = value; + m_fvector[id] = value; } void SparseVector::write(ostream& out, const string& sep) const { - for (fvector_t::const_iterator i = fvector_.begin(); i != fvector_.end(); ++i) { + for (fvector_t::const_iterator i = m_fvector.begin(); i != m_fvector.end(); ++i) { if (abs(i->second) < 0.00001) continue; - string name = id2name_[i->first]; + string name = m_id_to_name[i->first]; out << name << sep << i->second << " "; } } void SparseVector::clear() { - fvector_.clear(); + m_fvector.clear(); } SparseVector& SparseVector::operator-=(const SparseVector& rhs) { //All the elements that have values in *this - for (fvector_t::iterator i = fvector_.begin(); i != fvector_.end(); ++i) { - fvector_[i->first] = i->second - rhs.get(i->first); + for (fvector_t::iterator i = m_fvector.begin(); i != m_fvector.end(); ++i) { + m_fvector[i->first] = i->second - rhs.get(i->first); } //Any elements in rhs, that have no value in *this - for (fvector_t::const_iterator i = rhs.fvector_.begin(); - i != rhs.fvector_.end(); ++i) { - if (fvector_.find(i->first) == fvector_.end()) { - fvector_[i->first] = -(i->second); + for (fvector_t::const_iterator i = rhs.m_fvector.begin(); + i != rhs.m_fvector.end(); ++i) { + if (m_fvector.find(i->first) == m_fvector.end()) { + m_fvector[i->first] = -(i->second); } } return *this; @@ -79,37 +80,37 @@ SparseVector operator-(const SparseVector& lhs, const SparseVector& rhs) { } FeatureStats::FeatureStats() - : available_(kAvailableSize), entries_(0), - array_(new FeatureStatsType[available_]) {} + : m_available_size(kAvailableSize), m_entries(0), + m_array(new FeatureStatsType[m_available_size]) {} FeatureStats::FeatureStats(const size_t size) - : available_(size), entries_(size), - array_(new FeatureStatsType[available_]) + : m_available_size(size), m_entries(size), + m_array(new FeatureStatsType[m_available_size]) { - memset(array_, 0, GetArraySizeWithBytes()); + memset(m_array, 0, GetArraySizeWithBytes()); } -FeatureStats::FeatureStats(std::string &theString) - : available_(0), entries_(0), array_(NULL) +FeatureStats::FeatureStats(string &theString) + : m_available_size(0), m_entries(0), m_array(NULL) { set(theString); } FeatureStats::~FeatureStats() { - if (array_) { - delete [] array_; - array_ = NULL; + if (m_array) { + delete [] m_array; + m_array = NULL; } } void FeatureStats::Copy(const FeatureStats &stats) { - available_ = stats.available(); - entries_ = stats.size(); - array_ = new FeatureStatsType[available_]; - memcpy(array_, stats.getArray(), GetArraySizeWithBytes()); - map_ = stats.getSparse(); + m_available_size = stats.available(); + m_entries = stats.size(); + m_array = new FeatureStatsType[m_available_size]; + memcpy(m_array, stats.getArray(), GetArraySizeWithBytes()); + m_map = stats.getSparse(); } FeatureStats::FeatureStats(const FeatureStats &stats) @@ -119,34 +120,34 @@ FeatureStats::FeatureStats(const FeatureStats &stats) FeatureStats& FeatureStats::operator=(const FeatureStats &stats) { - delete [] array_; + delete [] m_array; Copy(stats); return *this; } void FeatureStats::expand() { - available_ *= 2; - featstats_t t_ = new FeatureStatsType[available_]; - memcpy(t_, array_, GetArraySizeWithBytes()); - delete [] array_; - array_ = t_; + m_available_size *= 2; + featstats_t t_ = new FeatureStatsType[m_available_size]; + memcpy(t_, m_array, GetArraySizeWithBytes()); + delete [] m_array; + m_array = t_; } void FeatureStats::add(FeatureStatsType v) { if (isfull()) expand(); - array_[entries_++]=v; + m_array[m_entries++]=v; } void FeatureStats::addSparse(const string& name, FeatureStatsType v) { - map_.set(name,v); + m_map.set(name,v); } -void FeatureStats::set(std::string &theString) +void FeatureStats::set(string &theString) { - std::string substring, stringBuf; + string substring, stringBuf; reset(); while (!theString.empty()) { @@ -163,48 +164,50 @@ void FeatureStats::set(std::string &theString) } } - -void FeatureStats::loadbin(std::ifstream& inFile) +void FeatureStats::loadbin(istream* is) { - inFile.read((char*) array_, GetArraySizeWithBytes()); + is->read(reinterpret_cast(m_array), + static_cast(GetArraySizeWithBytes())); } -void FeatureStats::loadtxt(std::ifstream& inFile) +void FeatureStats::loadtxt(istream* is) { - std::string theString; - std::getline(inFile, theString); - set(theString); + string line; + getline(*is, line); + set(line); } -void FeatureStats::loadtxt(const std::string &file) +void FeatureStats::loadtxt(const string &file) { - // TRACE_ERR("loading the stats from " << file << std::endl); - - std::ifstream inFile(file.c_str(), std::ios::in); // matches a stream with a file. Opens the file - - loadtxt(inFile); + ifstream ifs(file.c_str(), ios::in); + if (!ifs) { + cerr << "Failed to open " << file << endl; + exit(1); + } + istream* is = &ifs; + loadtxt(is); } - -void FeatureStats::savetxt(const std::string &file) +void FeatureStats::savetxt(const string &file) { -// TRACE_ERR("saving the stats into " << file << std::endl); - - std::ofstream outFile(file.c_str(), std::ios::out); // matches a stream with a file. Opens the file - - savetxt(outFile); + ofstream ofs(file.c_str(), ios::out); + ostream* os = &ofs; + savetxt(os); } - -void FeatureStats::savetxt(std::ofstream& outFile) +void FeatureStats::savetxt(ostream* os) { -// TRACE_ERR("saving the stats" << std::endl); - outFile << *this; + *os << *this; } -void FeatureStats::savebin(std::ofstream& outFile) +void FeatureStats::savetxt() { + savetxt(&cout); +} + +void FeatureStats::savebin(ostream* os) { - outFile.write((char*) array_, GetArraySizeWithBytes()); + os->write(reinterpret_cast(m_array), + static_cast(GetArraySizeWithBytes())); } ostream& operator<<(ostream& o, const FeatureStats& e) @@ -230,7 +233,7 @@ bool operator==(const FeatureStats& f1, const FeatureStats& f2) { if (f1.get(k) != f2.get(k)) return false; } - + return true; } //END_ADDED diff --git a/mert/FeatureStats.h b/mert/FeatureStats.h index f29862c00..e2e63a714 100644 --- a/mert/FeatureStats.h +++ b/mert/FeatureStats.h @@ -10,7 +10,6 @@ #define MERT_FEATURE_STATS_H_ #include -#include #include #include #include @@ -30,18 +29,16 @@ public: FeatureStatsType get(size_t id) const; void set(const std::string& name, FeatureStatsType value); void clear(); - size_t size() const { - return fvector_.size(); - } + size_t size() const { return m_fvector.size(); } void write(std::ostream& out, const std::string& sep = " ") const; SparseVector& operator-=(const SparseVector& rhs); private: - static name2id_t name2id_; - static id2name_t id2name_; - fvector_t fvector_; + static name2id_t m_name_to_id; + static id2name_t m_id_to_name; + fvector_t m_fvector; }; SparseVector operator-(const SparseVector& lhs, const SparseVector& rhs); @@ -49,12 +46,12 @@ SparseVector operator-(const SparseVector& lhs, const SparseVector& rhs); class FeatureStats { private: - size_t available_; - size_t entries_; + size_t m_available_size; + size_t m_entries; // TODO: Use smart pointer for exceptional-safety. - featstats_t array_; - SparseVector map_; + featstats_t m_array; + SparseVector m_map; public: FeatureStats(); @@ -69,64 +66,47 @@ public: void Copy(const FeatureStats &stats); - bool isfull() const { - return (entries_ < available_) ? 0 : 1; - } + bool isfull() const { return (m_entries < m_available_size) ? 0 : 1; } void expand(); void add(FeatureStatsType v); void addSparse(const string& name, FeatureStatsType v); void clear() { - memset((void*)array_, 0, GetArraySizeWithBytes()); - map_.clear(); + memset((void*)m_array, 0, GetArraySizeWithBytes()); + m_map.clear(); } void reset() { - entries_ = 0; + m_entries = 0; clear(); } - inline FeatureStatsType get(size_t i) { - return array_[i]; - } - inline FeatureStatsType get(size_t i)const { - return array_[i]; - } - inline featstats_t getArray() const { - return array_; - } - inline const SparseVector& getSparse() const { - return map_; - } + FeatureStatsType get(size_t i) { return m_array[i]; } + FeatureStatsType get(size_t i)const { return m_array[i]; } + featstats_t getArray() const { return m_array; } + + const SparseVector& getSparse() const { return m_map; } void set(std::string &theString); - inline size_t bytes() const { - return GetArraySizeWithBytes(); - } + inline size_t bytes() const { return GetArraySizeWithBytes(); } size_t GetArraySizeWithBytes() const { - return entries_ * sizeof(FeatureStatsType); + return m_entries * sizeof(FeatureStatsType); } - inline size_t size() const { - return entries_; - } + size_t size() const { return m_entries; } - inline size_t available() const { - return available_; - } + size_t available() const { return m_available_size; } void savetxt(const std::string &file); - void savetxt(ofstream& outFile); - void savebin(ofstream& outFile); - inline void savetxt() { - savetxt("/dev/stdout"); - } + void savetxt(std::ostream* os); + void savebin(std::ostream* os); + void savetxt(); void loadtxt(const std::string &file); - void loadtxt(ifstream& inFile); - void loadbin(ifstream& inFile); + void loadtxt(std::istream* is); + void loadbin(std::istream* is); /** * Write the whole object to a stream. diff --git a/mert/FileStream.cpp b/mert/FileStream.cpp index 11fd58e26..93b7138b2 100644 --- a/mert/FileStream.cpp +++ b/mert/FileStream.cpp @@ -13,11 +13,11 @@ bool IsGzipFile(const std::string &filename) { } // namespace inputfilestream::inputfilestream(const std::string &filePath) - : std::istream(0), m_streambuf(0), is_good(false) + : std::istream(0), m_streambuf(0), m_is_good(false) { // check if file is readable std::filebuf* fb = new std::filebuf(); - is_good = (fb->open(filePath.c_str(), std::ios::in) != NULL); + m_is_good = (fb->open(filePath.c_str(), std::ios::in) != NULL); if (IsGzipFile(filePath)) { fb->close(); @@ -40,11 +40,11 @@ void inputfilestream::close() } outputfilestream::outputfilestream(const std::string &filePath) - : std::ostream(0), m_streambuf(0), is_good(false) + : std::ostream(0), m_streambuf(0), m_is_good(false) { // check if file is readable std::filebuf* fb = new std::filebuf(); - is_good = (fb->open(filePath.c_str(), std::ios::out) != NULL); + m_is_good = (fb->open(filePath.c_str(), std::ios::out) != NULL); if (IsGzipFile(filePath)) { throw runtime_error("Output to a zipped file not supported!"); diff --git a/mert/FileStream.h b/mert/FileStream.h index 78b6ccb88..3fd489cd7 100644 --- a/mert/FileStream.h +++ b/mert/FileStream.h @@ -2,6 +2,7 @@ #define MERT_FILE_STREAM_H_ #include +#include #include #include @@ -9,13 +10,13 @@ class inputfilestream : public std::istream { protected: std::streambuf *m_streambuf; - bool is_good; + bool m_is_good; public: explicit inputfilestream(const std::string &filePath); virtual ~inputfilestream(); - bool good() const { return is_good; } + bool good() const { return m_is_good; } void close(); }; @@ -23,13 +24,13 @@ class outputfilestream : public std::ostream { protected: std::streambuf *m_streambuf; - bool is_good; + bool m_is_good; public: explicit outputfilestream(const std::string &filePath); virtual ~outputfilestream(); - bool good() const { return is_good; } + bool good() const { return m_is_good; } void close(); }; diff --git a/mert/InterpolatedScorer.cpp b/mert/InterpolatedScorer.cpp index 1951e4234..25d29b42a 100644 --- a/mert/InterpolatedScorer.cpp +++ b/mert/InterpolatedScorer.cpp @@ -1,35 +1,36 @@ -#include "ScorerFactory.h" #include "InterpolatedScorer.h" +#include "ScorerFactory.h" #include "Util.h" using namespace std; - -InterpolatedScorer::InterpolatedScorer (const string& name, const string& config): Scorer(name,config) +// TODO: This is too long. Consider creating a function for +// initialization such as Init(). +InterpolatedScorer::InterpolatedScorer(const string& name, const string& config) + : Scorer(name,config) { - // name would be: HAMMING,BLEU or similar string scorers = name; while (scorers.length() > 0) { string scorertype = ""; - getNextPound(scorers,scorertype,","); - Scorer *theScorer=ScorerFactory::getScorer(scorertype,config); - _scorers.push_back(theScorer); + getNextPound(scorers, scorertype,","); + Scorer *scorer = ScorerFactory::getScorer(scorertype,config); + m_scorers.push_back(scorer); } - if (_scorers.size() == 0) { + if (m_scorers.size() == 0) { throw runtime_error("There are no scorers"); } - cerr << "Number of scorers: " << _scorers.size() << endl; + cerr << "Number of scorers: " << m_scorers.size() << endl; //TODO debug this string wtype = getConfig("weights",""); //Default weights set to uniform ie. if two weights 0.5 each //weights should add to 1 if (wtype.length() == 0) { - float weight = 1.0/_scorers.size() ; + float weight = 1.0 / m_scorers.size() ; //cout << " Default weights:" << weight << endl; - for (size_t i = 0; i < _scorers.size(); i ++) { - _scorerWeights.push_back(weight); + for (size_t i = 0; i < m_scorers.size(); i ++) { + m_scorer_weights.push_back(weight); } } else { float tot=0; @@ -38,24 +39,24 @@ InterpolatedScorer::InterpolatedScorer (const string& name, const string& config string scoreweight = ""; getNextPound(wtype,scoreweight,"+"); float weight = atof(scoreweight.c_str()); - _scorerWeights.push_back(weight); + m_scorer_weights.push_back(weight); tot += weight; //cout << " :" << weight ; } //cout << endl; - if (tot != float(1)) { - for (vector::iterator it = _scorerWeights.begin(); it != _scorerWeights.end(); ++it) - { + if (tot != float(1)) { // TODO: fix this checking in terms of readability. + for (vector::iterator it = m_scorer_weights.begin(); + it != m_scorer_weights.end(); ++it) { *it /= tot; } } - if (_scorers.size() != _scorerWeights.size()) { + if (m_scorers.size() != m_scorer_weights.size()) { throw runtime_error("The number of weights does not equal the number of scorers!"); } } cerr << "The weights for the interpolated scorers are: " << endl; - for (vector::iterator it = _scorerWeights.begin(); it < _scorerWeights.end(); it++) { + for (vector::iterator it = m_scorer_weights.begin(); it < m_scorer_weights.end(); it++) { cerr << *it << " " ; } cerr <::iterator itsc = _scorers.begin(); itsc!=_scorers.end(); itsc++) { + for (ScopedVector::iterator itsc = m_scorers.begin(); + itsc != m_scorers.end(); ++itsc) { int numScoresScorer = (*itsc)->NumberOfScores(); - ScoreData* newData =new ScoreData(**itsc); + ScoreData* newData =new ScoreData(*itsc); for (size_t i = 0; i < data->size(); i++) { ScoreArray scoreArray = data->get(i); ScoreArray newScoreArray; @@ -110,14 +112,16 @@ void InterpolatedScorer::score(const candidates_t& candidates, const diffs_t& di { //cout << "*******InterpolatedScorer::score" << endl; size_t scorerNum = 0; - for (ScopedVector::const_iterator itsc = _scorers.begin(); itsc!=_scorers.end(); itsc++) { + for (ScopedVector::const_iterator itsc = m_scorers.begin(); + itsc != m_scorers.end(); ++itsc) { //int numScores = (*itsc)->NumberOfScores(); statscores_t tscores; (*itsc)->score(candidates,diffs,tscores); size_t inc = 0; - for (statscores_t::iterator itstatsc = tscores.begin(); itstatsc!=tscores.end(); itstatsc++) { + for (statscores_t::iterator itstatsc = tscores.begin(); + itstatsc != tscores.end(); ++itstatsc) { //cout << "Scores " << (*itstatsc) << endl; - float weight = _scorerWeights[scorerNum]; + float weight = m_scorer_weights[scorerNum]; if (weight == 0) { stringstream msg; msg << "No weights for scorer" << scorerNum ; @@ -139,7 +143,8 @@ void InterpolatedScorer::score(const candidates_t& candidates, const diffs_t& di void InterpolatedScorer::setReferenceFiles(const vector& referenceFiles) { - for (ScopedVector::iterator itsc = _scorers.begin(); itsc!=_scorers.end(); itsc++) { + for (ScopedVector::iterator itsc = m_scorers.begin(); + itsc != m_scorers.end(); ++itsc) { (*itsc)->setReferenceFiles(referenceFiles); } } @@ -147,8 +152,9 @@ void InterpolatedScorer::setReferenceFiles(const vector& referenceFiles) void InterpolatedScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry) { stringstream buff; - int i=0; - for (ScopedVector::iterator itsc = _scorers.begin(); itsc!=_scorers.end(); itsc++) { + int i = 0; + for (ScopedVector::iterator itsc = m_scorers.begin(); + itsc != m_scorers.end(); ++itsc) { ScoreStats tempEntry; (*itsc)->prepareStats(sid, text, tempEntry); if (i > 0) buff << " "; @@ -167,16 +173,10 @@ void InterpolatedScorer::setFactors(const string& factors) vector fsplit; split(factors, ',', fsplit); - if (fsplit.size() != _scorers.size()) throw runtime_error("Number of factor specifications does not equal number of interpolated scorers."); - - for (size_t i = 0; i < _scorers.size(); ++i) - { - _scorers[i]->setFactors(fsplit[i]); + if (fsplit.size() != m_scorers.size()) + throw runtime_error("Number of factor specifications does not equal number of interpolated scorers."); + + for (size_t i = 0; i < m_scorers.size(); ++i) { + m_scorers[i]->setFactors(fsplit[i]); } } - - - - - - diff --git a/mert/InterpolatedScorer.h b/mert/InterpolatedScorer.h index 2a538bc39..5f76be538 100644 --- a/mert/InterpolatedScorer.h +++ b/mert/InterpolatedScorer.h @@ -1,14 +1,6 @@ -#ifndef __INTERPOLATED_SCORER_H__ -#define __INTERPOLATED_SCORER_H__ +#ifndef MERT_INTERPOLATED_SCORER_H_ +#define MERT_INTERPOLATED_SCORER_H_ -#include -#include -#include -#include -#include -#include -#include -#include #include #include #include "Types.h" @@ -33,12 +25,13 @@ public: virtual void prepareStats(size_t sid, const string& text, ScoreStats& entry); virtual size_t NumberOfScores() const { - size_t sz=0; - for (ScopedVector::const_iterator itsc = _scorers.begin(); itsc != _scorers.end(); itsc++) { + size_t sz = 0; + for (ScopedVector::const_iterator itsc = m_scorers.begin(); + itsc != m_scorers.end(); ++itsc) { sz += (*itsc)->NumberOfScores(); } return sz; - }; + } virtual void setScoreData(ScoreData* data); @@ -48,13 +41,13 @@ public: virtual void setFactors(const string& factors); protected: - ScopedVector _scorers; + ScopedVector m_scorers; // Take the ownership of the heap-allocated the objects // by Scorer objects. ScopedVector m_scorers_score_data; - vector _scorerWeights; + vector m_scorer_weights; }; -#endif //__INTERPOLATED_SCORER_H +#endif // MERT_INTERPOLATED_SCORER_H_ diff --git a/mert/MergeScorer.cpp b/mert/MergeScorer.cpp index 1dbae600f..7a80f1477 100644 --- a/mert/MergeScorer.cpp +++ b/mert/MergeScorer.cpp @@ -14,7 +14,8 @@ using namespace TERCpp; MergeScorer::MergeScorer(const string& config) - : StatisticsBasedScorer("MERGE",config), kLENGTH(4) {} + : StatisticsBasedScorer("MERGE", config) {} + MergeScorer::~MergeScorer() {} void MergeScorer::setReferenceFiles(const vector& referenceFiles) diff --git a/mert/MergeScorer.h b/mert/MergeScorer.h index 579703412..2d7030421 100644 --- a/mert/MergeScorer.h +++ b/mert/MergeScorer.h @@ -13,6 +13,8 @@ using namespace std; class PerScorer; class ScoreStats; +const int kMergeScorerLength = 4; + /** * Merge scoring. */ @@ -23,23 +25,13 @@ public: virtual void setReferenceFiles(const vector& referenceFiles); virtual void prepareStats(size_t sid, const string& text, ScoreStats& entry); - - virtual size_t NumberOfScores() const - { - return 0; - } - - void whoami() const { - cerr << "I AM MergeScorer" << endl; - } + virtual size_t NumberOfScores() const { return 0; } protected: friend class PerScorer; virtual float calculateScore(const vector& comps) const; private: - const int kLENGTH; - // no copying allowed MergeScorer(const MergeScorer&); MergeScorer& operator=(const MergeScorer&); diff --git a/mert/Optimizer.cpp b/mert/Optimizer.cpp index 0e2ce9312..ca5af3736 100644 --- a/mert/Optimizer.cpp +++ b/mert/Optimizer.cpp @@ -32,36 +32,25 @@ inline float intersect(float m1, float b1, float m2, float b2) } // namespace - -void Optimizer::SetScorer(Scorer *_scorer) -{ - scorer = _scorer; -} - -void Optimizer::SetFData(FeatureDataHandle _FData) -{ - FData = _FData; -} - Optimizer::Optimizer(unsigned Pd, vector i2O, vector start, unsigned int nrandom) - : scorer(NULL), FData(), number_of_random_directions(nrandom) + : m_scorer(NULL), m_feature_data(), m_num_random_directions(nrandom) { - // Warning: the init vector is a full set of parameters, of dimension pdim! - Point::pdim = Pd; + // Warning: the init vector is a full set of parameters, of dimension m_pdim! + Point::m_pdim = Pd; CHECK(start.size() == Pd); - Point::dim = i2O.size(); - Point::optindices = i2O; - if (Point::pdim > Point::dim) { - for (unsigned int i = 0; i < Point::pdim; i++) { + Point::m_dim = i2O.size(); + Point::m_opt_indices = i2O; + if (Point::m_pdim > Point::m_dim) { + for (unsigned int i = 0; i < Point::m_pdim; i++) { unsigned int j = 0; - while (j < Point::dim && i != i2O[j]) + while (j < Point::m_dim && i != i2O[j]) j++; - // The index i wasnt found on optindices, it is a fixed index, + // The index i wasnt found on m_opt_indices, it is a fixed index, // we use the value of the start vector. - if (j == Point::dim) - Point::fixedweights[i] = start[i]; + if (j == Point::m_dim) + Point::m_fixed_weights[i] = start[i]; } } } @@ -112,12 +101,12 @@ statscore_t Optimizer::LineOptimize(const Point& origin, const Point& direction, //cerr << "Sentence " << S << endl; multimap gradient; vector f0; - f0.resize(FData->get(S).size()); - for (unsigned j = 0; j < FData->get(S).size(); j++) { + f0.resize(m_feature_data->get(S).size()); + for (unsigned j = 0; j < m_feature_data->get(S).size(); j++) { // gradient of the feature function for this particular target sentence - gradient.insert(pair(direction * (FData->get(S,j)), j)); + gradient.insert(pair(direction * (m_feature_data->get(S,j)), j)); // compute the feature function at the origin point - f0[j] = origin * FData->get(S, j); + f0[j] = origin * m_feature_data->get(S, j); } // Now let's compute the 1best for each value of x. @@ -308,7 +297,7 @@ statscore_t Optimizer::LineOptimize(const Point& origin, const Point& direction, void Optimizer::Get1bests(const Point& P, vector& bests) const { - CHECK(FData); + CHECK(m_feature_data); bests.clear(); bests.resize(size()); @@ -316,8 +305,8 @@ void Optimizer::Get1bests(const Point& P, vector& bests) const float bestfs = MIN_FLOAT; unsigned idx = 0; unsigned j; - for (j = 0; j < FData->get(i).size(); j++) { - float curfs = P * FData->get(i, j); + for (j = 0; j < m_feature_data->get(i).size(); j++) { + float curfs = P * m_feature_data->get(i, j); if (curfs > bestfs) { bestfs = curfs; idx = j; @@ -330,15 +319,15 @@ void Optimizer::Get1bests(const Point& P, vector& bests) const statscore_t Optimizer::Run(Point& P) const { - if (!FData) { + if (!m_feature_data) { cerr << "error trying to optimize without Features loaded" << endl; exit(2); } - if (!scorer) { + if (!m_scorer) { cerr << "error trying to optimize without a Scorer loaded" << endl; exit(2); } - if (scorer->getReferenceSize() != FData->size()) { + if (m_scorer->getReferenceSize() != m_feature_data->size()) { cerr << "error length mismatch between feature file and score file" << endl; exit(2); } @@ -361,11 +350,11 @@ statscore_t Optimizer::Run(Point& P) const vector Optimizer::GetIncStatScore(vector thefirst, vector > > thediffs) const { - CHECK(scorer); + CHECK(m_scorer); vector theres; - scorer->score(thefirst, thediffs, theres); + m_scorer->score(thefirst, thediffs, theres); return theres; } @@ -392,7 +381,7 @@ statscore_t SimpleOptimizer::TrueRun(Point& P) const Point linebest; - for (unsigned int d = 0; d < Point::getdim()+number_of_random_directions; d++) { + for (unsigned int d = 0; d < Point::getdim() + m_num_random_directions; d++) { if (verboselevel() > 4) { // cerr<<"minimizing along direction "< " << prevscore << endl; @@ -440,7 +429,7 @@ statscore_t RandomDirectionOptimizer::TrueRun(Point& P) const // do specified number of random direction optimizations unsigned int nrun = 0; unsigned int nrun_no_change = 0; - for (; nrun_no_change < number_of_random_directions; nrun++, nrun_no_change++) + for (; nrun_no_change < m_num_random_directions; nrun++, nrun_no_change++) { // choose a random direction in which to optimize Point direction; @@ -476,32 +465,32 @@ statscore_t RandomOptimizer::TrueRun(Point& P) const //-------------------------------------- -vector OptimizerFactory::typenames; +vector OptimizerFactory::m_type_names; void OptimizerFactory::SetTypeNames() { - if (typenames.empty()) { - typenames.resize(NOPTIMIZER); - typenames[POWELL]="powell"; - typenames[RANDOM_DIRECTION]="random-direction"; - typenames[RANDOM]="random"; + if (m_type_names.empty()) { + m_type_names.resize(NOPTIMIZER); + m_type_names[POWELL]="powell"; + m_type_names[RANDOM_DIRECTION]="random-direction"; + m_type_names[RANDOM]="random"; // Add new type there } } vector OptimizerFactory::GetTypeNames() { - if (typenames.empty()) + if (m_type_names.empty()) SetTypeNames(); - return typenames; + return m_type_names; } OptimizerFactory::OptType OptimizerFactory::GetOType(const string& type) { unsigned int thetype; - if (typenames.empty()) + if (m_type_names.empty()) SetTypeNames(); - for (thetype = 0; thetype < typenames.size(); thetype++) - if (typenames[thetype] == type) + for (thetype = 0; thetype < m_type_names.size(); thetype++) + if (m_type_names[thetype] == type) break; return((OptType)thetype); } @@ -513,8 +502,8 @@ Optimizer* OptimizerFactory::BuildOptimizer(unsigned dim, vector i2o, cerr << "Error: unknown Optimizer type " << type << endl; cerr << "Known Algorithm are:" << endl; unsigned int thetype; - for (thetype = 0; thetype < typenames.size(); thetype++) - cerr << typenames[thetype] << endl; + for (thetype = 0; thetype < m_type_names.size(); thetype++) + cerr << m_type_names[thetype] << endl; throw ("unknown Optimizer Type"); } diff --git a/mert/Optimizer.h b/mert/Optimizer.h index 4a964665d..519e9ecbc 100644 --- a/mert/Optimizer.h +++ b/mert/Optimizer.h @@ -20,18 +20,19 @@ class Point; class Optimizer { protected: - Scorer *scorer; // no accessor for them only child can use them - FeatureDataHandle FData; // no accessor for them only child can use them - unsigned int number_of_random_directions; + Scorer *m_scorer; // no accessor for them only child can use them + FeatureDataHandle m_feature_data; // no accessor for them only child can use them + unsigned int m_num_random_directions; public: Optimizer(unsigned Pd, vector i2O, vector start, unsigned int nrandom); - void SetScorer(Scorer *_scorer); - void SetFData(FeatureDataHandle _FData); + + void SetScorer(Scorer *scorer) { m_scorer = scorer; } + void SetFeatureData(FeatureDataHandle feature_data) { m_feature_data = feature_data; } virtual ~Optimizer(); unsigned size() const { - return FData ? FData->size() : 0; + return m_feature_data ? m_feature_data->size() : 0; } /** @@ -53,7 +54,7 @@ public: * Given a set of nbests, get the Statistical score. */ statscore_t GetStatScore(const vector& nbests) const { - return scorer->score(nbests); + return m_scorer->score(nbests); } statscore_t GetStatScore(const Point& param) const; @@ -129,7 +130,7 @@ private: // Setup optimization types. static void SetTypeNames(); - static vector typenames; + static vector m_type_names; }; #endif // OPTIMIZER_H diff --git a/mert/PerScorer.h b/mert/PerScorer.h index f42974357..d32e14029 100644 --- a/mert/PerScorer.h +++ b/mert/PerScorer.h @@ -1,9 +1,7 @@ #ifndef MERT_PER_SCORER_H_ #define MERT_PER_SCORER_H_ -#include #include -#include #include #include #include "Types.h" @@ -27,18 +25,9 @@ public: virtual void setReferenceFiles(const vector& referenceFiles); virtual void prepareStats(size_t sid, const string& text, ScoreStats& entry); - - virtual size_t NumberOfScores() const { - // cerr << "PerScorer: 3" << endl; - return 3; - } - + virtual size_t NumberOfScores() const { return 3; } virtual float calculateScore(const vector& comps) const; - void whoami() const { - cerr << "I AM PerScorer" << std::endl; - } - private: // no copying allowed PerScorer(const PerScorer&); diff --git a/mert/Point.cpp b/mert/Point.cpp index fe371ef53..ae5dbc21b 100644 --- a/mert/Point.cpp +++ b/mert/Point.cpp @@ -8,41 +8,41 @@ using namespace std; -vector Point::optindices; +vector Point::m_opt_indices; -unsigned Point::dim = 0; +unsigned Point::m_dim = 0; -map Point::fixedweights; +map Point::m_fixed_weights; -unsigned Point::pdim = 0; -unsigned Point::ncall = 0; +unsigned Point::m_pdim = 0; +unsigned Point::m_ncall = 0; vector Point::m_min; vector Point::m_max; -Point::Point() : vector(dim), score_(0.0) {} +Point::Point() : vector(m_dim), m_score(0.0) {} -//Can initialize from a vector of dim or pdim +//Can initialize from a vector of dim or m_pdim Point::Point(const vector& init, const vector& min, const vector& max) - : vector(Point::dim), score_(0.0) + : vector(Point::m_dim), m_score(0.0) { - m_min.resize(Point::dim); - m_max.resize(Point::dim); - if(init.size()==dim) { - for (unsigned int i=0; i(random()) / static_cast(RAND_MAX) * (m_max[i] - m_min[i]); } @@ -61,16 +61,17 @@ void Point::Randomize() double Point::operator*(const FeatureStats& F) const { - ncall++; // to track performance - double prod=0.0; - if(OptimizeAll()) + m_ncall++; // to track performance + double prod = 0.0; + if (OptimizeAll()) for (unsigned i=0; i::iterator it=fixedweights.begin(); it!=fixedweights.end(); it++) - prod+=it->second*F.get(it->first); + for (unsigned i = 0; i < size(); i++) + prod += operator[](i) * F.get(m_opt_indices[i]); + for(map::iterator it = m_fixed_weights.begin(); + it != m_fixed_weights.end(); ++it) + prod += it->second * F.get(it->first); } return prod; } @@ -83,7 +84,7 @@ Point Point::operator+(const Point& p2) const Res[i] += p2[i]; } - Res.score_ = numeric_limits::max(); + Res.m_score = numeric_limits::max(); return Res; } @@ -93,7 +94,7 @@ void Point::operator+=(const Point& p2) for (unsigned i = 0; i < size(); i++) { operator[](i) += p2[i]; } - score_ = numeric_limits::max(); + m_score = numeric_limits::max(); } Point Point::operator*(float l) const @@ -102,14 +103,14 @@ Point Point::operator*(float l) const for (unsigned i = 0; i < size(); i++) { Res[i] *= l; } - Res.score_ = numeric_limits::max(); + Res.m_score = numeric_limits::max(); return Res; } ostream& operator<<(ostream& o, const Point& P) { vector w = P.GetAllWeights(); - for (unsigned int i = 0; i < Point::pdim; i++) { + for (unsigned int i = 0; i < Point::m_pdim; i++) { o << w[i] << " "; } return o; @@ -118,24 +119,24 @@ ostream& operator<<(ostream& o, const Point& P) void Point::NormalizeL2() { parameter_t norm=0.0; - for (unsigned int i=0; i Point::GetAllWeights()const { vector w; - if(OptimizeAll()) { - w=*this; + if (OptimizeAll()) { + w = *this; } else { - w.resize(pdim); - for (unsigned int i=0; i::iterator it=fixedweights.begin(); it!=fixedweights.end(); it++) + w.resize(m_pdim); + for (unsigned int i = 0; i < size(); i++) + w[m_opt_indices[i]] = operator[](i); + for (map::iterator it = m_fixed_weights.begin(); + it != m_fixed_weights.end(); ++it) { w[it->first]=it->second; + } } return w; } diff --git a/mert/Point.h b/mert/Point.h index 5f55c7a7c..9bfaff156 100644 --- a/mert/Point.h +++ b/mert/Point.h @@ -1,7 +1,7 @@ #ifndef MERT_POINT_H_ #define MERT_POINT_H_ -#include +#include #include #include #include "Types.h" @@ -16,61 +16,55 @@ class Optimizer; class Point : public vector { friend class Optimizer; + private: /** * The indices over which we optimize. */ - static vector optindices; + static vector m_opt_indices; /** - * Dimension of optindices and of the parent vector. + * Dimension of m_opt_indices and of the parent vector. */ - static unsigned int dim; + static unsigned int m_dim; /** * Fixed weights in case of partial optimzation. */ - static map fixedweights; + static map m_fixed_weights; /** * Total size of the parameter space; we have - * pdim = FixedWeight.size() + optinidices.size(). + * m_pdim = FixedWeight.size() + optinidices.size(). */ - static unsigned int pdim; - static unsigned int ncall; + static unsigned int m_pdim; + static unsigned int m_ncall; /** - * The limits for randomization, both vectors are of full length, pdim. + * The limits for randomization, both vectors are of full length, m_pdim. */ static vector m_min; static vector m_max; - statscore_t score_; + statscore_t m_score; public: - static unsigned int getdim() { - return dim; - } - static unsigned int getpdim() { - return pdim; - } - static void setpdim(size_t pd) { - pdim = pd; - } - static void setdim(size_t d) { - dim = d; - } + static unsigned int getdim() { return m_dim; } + static void setdim(size_t d) { m_dim = d; } + + static unsigned int getpdim() { return m_pdim; } + static void setpdim(size_t pd) { m_pdim = pd; } static void set_optindices(const vector& indices) { - optindices = indices; + m_opt_indices = indices; } static const vector& get_optindices() { - return optindices; + return m_opt_indices; } static bool OptimizeAll() { - return fixedweights.empty(); + return m_fixed_weights.empty(); } Point(); @@ -88,7 +82,7 @@ public: Point operator*(float) const; /** - * Write the Whole featureweight to a stream (ie pdim float). + * Write the Whole featureweight to a stream (ie m_pdim float). */ friend ostream& operator<<(ostream& o,const Point& P); @@ -97,16 +91,13 @@ public: void NormalizeL1(); /** - * Return a vector of size pdim where all weights have been + * Return a vector of size m_pdim where all weights have been * put (including fixed ones). */ vector GetAllWeights() const; - statscore_t GetScore() const { - return score_; - } - - void SetScore(statscore_t score) { score_ = score; } + statscore_t GetScore() const { return m_score; } + void SetScore(statscore_t score) { m_score = score; } }; #endif // MERT_POINT_H diff --git a/mert/ScopedVector.h b/mert/ScopedVector.h index 6bd84e8c5..a2f0e7066 100644 --- a/mert/ScopedVector.h +++ b/mert/ScopedVector.h @@ -12,39 +12,39 @@ class ScopedVector { ScopedVector() {} virtual ~ScopedVector() { reset(); } - bool empty() const { return vec_.empty(); } + bool empty() const { return m_vec.empty(); } - void push_back(T *e) { vec_.push_back(e); } + void push_back(T *e) { m_vec.push_back(e); } void reset() { - for (iterator it = vec_.begin(); it != vec_.end(); ++it) { + for (iterator it = m_vec.begin(); it != m_vec.end(); ++it) { delete *it; } - vec_.clear(); + m_vec.clear(); } - void reserve(size_t capacity) { vec_.reserve(capacity); } - void resize(size_t size) { vec_.resize(size); } + void reserve(size_t capacity) { m_vec.reserve(capacity); } + void resize(size_t size) { m_vec.resize(size); } - size_t size() const {return vec_.size(); } + size_t size() const {return m_vec.size(); } - iterator begin() { return vec_.begin(); } - const_iterator begin() const { return vec_.begin(); } + iterator begin() { return m_vec.begin(); } + const_iterator begin() const { return m_vec.begin(); } - iterator end() { return vec_.end(); } - const_iterator end() const { return vec_.end(); } + iterator end() { return m_vec.end(); } + const_iterator end() const { return m_vec.end(); } - std::vector& get() { return vec_; } - const std::vector& get() const { return vec_; } + std::vector& get() { return m_vec; } + const std::vector& get() const { return m_vec; } - std::vector* operator->() { return &vec_; } - const std::vector* operator->() const { return &vec_; } + std::vector* operator->() { return &m_vec; } + const std::vector* operator->() const { return &m_vec; } - T*& operator[](size_t i) { return vec_[i]; } - const T* operator[](size_t i) const { return vec_[i]; } + T*& operator[](size_t i) { return m_vec[i]; } + const T* operator[](size_t i) const { return m_vec[i]; } private: - std::vector vec_; + std::vector m_vec; // no copying allowed. ScopedVector(const ScopedVector&); diff --git a/mert/ScoreArray.cpp b/mert/ScoreArray.cpp index 16952f976..972bca0e7 100644 --- a/mert/ScoreArray.cpp +++ b/mert/ScoreArray.cpp @@ -10,76 +10,85 @@ #include "Util.h" #include "FileStream.h" - ScoreArray::ScoreArray() - : number_of_scores(0), idx("") {} + : m_num_scores(0), m_index("") {} -void ScoreArray::savetxt(std::ofstream& outFile, const std::string& sctype) +void ScoreArray::savetxt(ostream* os, const string& sctype) { - outFile << SCORES_TXT_BEGIN << " " << idx << " " << array_.size() - << " " << number_of_scores << " " << sctype << std::endl; - for (scorearray_t::iterator i = array_.begin(); i !=array_.end(); i++) { - i->savetxt(outFile); - outFile << std::endl; + *os << SCORES_TXT_BEGIN << " " << m_index << " " << m_array.size() + << " " << m_num_scores << " " << sctype << endl; + for (scorearray_t::iterator i = m_array.begin(); i !=m_array.end(); i++) { + i->savetxt(os); + *os << endl; } - outFile << SCORES_TXT_END << std::endl; + *os << SCORES_TXT_END << endl; } -void ScoreArray::savebin(std::ofstream& outFile, const std::string& sctype) +void ScoreArray::savebin(ostream* os, const string& score_type) { - outFile << SCORES_BIN_BEGIN << " " << idx << " " << array_.size() - << " " << number_of_scores << " " << sctype << std::endl; - for (scorearray_t::iterator i = array_.begin(); i !=array_.end(); i++) - i->savebin(outFile); - - outFile << SCORES_BIN_END << std::endl; + *os << SCORES_BIN_BEGIN << " " << m_index << " " << m_array.size() + << " " << m_num_scores << " " << score_type << endl; + for (scorearray_t::iterator i = m_array.begin(); + i != m_array.end(); i++) { + i->savebin(os); + } + *os << SCORES_BIN_END << endl; } -void ScoreArray::save(std::ofstream& inFile, const std::string& sctype, bool bin) +void ScoreArray::save(ostream* os, const string& score_type, bool bin) { - if (size()>0) - (bin)?savebin(inFile, sctype):savetxt(inFile, sctype); + if (size() <= 0) return; + if (bin) { + savebin(os, score_type); + } else { + savetxt(os, score_type); + } } -void ScoreArray::save(const std::string &file, const std::string& sctype, bool bin) +void ScoreArray::save(const string &file, const string& score_type, bool bin) { - std::ofstream outFile(file.c_str(), std::ios::out); // matches a stream with a file. Opens the file - - save(outFile, sctype, bin); - - outFile.close(); + ofstream ofs(file.c_str(), ios::out); + if (!ofs) { + cerr << "Failed to open " << file << endl; + exit(1); + } + ostream* os = &ofs; + save(os, score_type, bin); + ofs.close(); } -void ScoreArray::loadbin(ifstream& inFile, size_t n) -{ - ScoreStats entry(number_of_scores); +void ScoreArray::save(const string& score_type, bool bin) { + save(&cout, score_type, bin); +} - for (size_t i=0 ; i < n; i++) { - entry.loadbin(inFile); +void ScoreArray::loadbin(istream* is, size_t n) +{ + ScoreStats entry(m_num_scores); + for (size_t i = 0; i < n; i++) { + entry.loadbin(is); add(entry); } } -void ScoreArray::loadtxt(ifstream& inFile, size_t n) +void ScoreArray::loadtxt(istream* is, size_t n) { - ScoreStats entry(number_of_scores); - - for (size_t i=0 ; i < n; i++) { - entry.loadtxt(inFile); + ScoreStats entry(m_num_scores); + for (size_t i = 0; i < n; i++) { + entry.loadtxt(is); add(entry); } } -void ScoreArray::load(ifstream& inFile) +void ScoreArray::load(istream* is) { - size_t number_of_entries=0; - bool binmode=false; + size_t number_of_entries = 0; + bool binmode = false; - std::string substring, stringBuf; - std::string::size_type loc; + string substring, stringBuf; + string::size_type loc; - std::getline(inFile, stringBuf); - if (!inFile.good()) { + getline(*is, stringBuf); + if (!is->good()) { return; } @@ -94,35 +103,38 @@ void ScoreArray::load(ifstream& inFile) } getNextPound(stringBuf, substring); getNextPound(stringBuf, substring); - idx = substring; + m_index = substring; getNextPound(stringBuf, substring); number_of_entries = atoi(substring.c_str()); getNextPound(stringBuf, substring); - number_of_scores = atoi(substring.c_str()); + m_num_scores = atoi(substring.c_str()); getNextPound(stringBuf, substring); - score_type = substring; + m_score_type = substring; } - (binmode)?loadbin(inFile, number_of_entries):loadtxt(inFile, number_of_entries); + if (binmode) { + loadbin(is, number_of_entries); + } else { + loadtxt(is, number_of_entries); + } - std::getline(inFile, stringBuf); + getline(*is, stringBuf); if (!stringBuf.empty()) { - if ((loc = stringBuf.find(SCORES_TXT_END)) != 0 && (loc = stringBuf.find(SCORES_BIN_END)) != 0) { + if ((loc = stringBuf.find(SCORES_TXT_END)) != 0 && + (loc = stringBuf.find(SCORES_BIN_END)) != 0) { TRACE_ERR("ERROR: ScoreArray::load(): Wrong footer"); return; } } } -void ScoreArray::load(const std::string &file) +void ScoreArray::load(const string &file) { - TRACE_ERR("loading data from " << file << std::endl); - - inputfilestream inFile(file); // matches a stream with a file. Opens the file - - load((ifstream&) inFile); - - inFile.close(); + TRACE_ERR("loading data from " << file << endl); + inputfilestream input_stream(file); // matches a stream with a file. Opens the file + istream* is = &input_stream; + load(is); + input_stream.close(); } @@ -139,7 +151,8 @@ bool ScoreArray::check_consistency() const if (sz == 0) return true; - for (scorearray_t::const_iterator i = array_.begin(); i != array_.end(); ++i) { + for (scorearray_t::const_iterator i = m_array.begin(); + i != m_array.end(); ++i) { if (i->size() != sz) return false; } diff --git a/mert/ScoreArray.h b/mert/ScoreArray.h index 5b84e020f..384fdfff3 100644 --- a/mert/ScoreArray.h +++ b/mert/ScoreArray.h @@ -24,85 +24,62 @@ const char SCORES_BIN_END[] = "SCORES_BIN_END_0"; class ScoreArray { -protected: - scorearray_t array_; - std::string score_type; - size_t number_of_scores; + private: + scorearray_t m_array; + std::string m_score_type; + size_t m_num_scores; -private: - // idx to identify the utterance. + // indexx to identify the utterance. // It can differ from the index inside the vector. - std::string idx; + std::string m_index; public: ScoreArray(); ~ScoreArray() {} - inline void clear() { - array_.clear(); - } + void clear() { m_array.clear(); } - inline std::string getIndex() const { - return idx; - } - inline void setIndex(const std::string& value) { - idx=value; - } + std::string getIndex() const { return m_index; } -// inline ScoreStats get(size_t i){ return array_.at(i); } + void setIndex(const std::string& value) { m_index = value; } - inline ScoreStats& get(size_t i) { - return array_.at(i); - } - inline const ScoreStats& get(size_t i)const { - return array_.at(i); - } + ScoreStats& get(size_t i) { return m_array.at(i); } - void add(const ScoreStats& e) { - array_.push_back(e); - } + const ScoreStats& get(size_t i) const { return m_array.at(i); } + + void add(const ScoreStats& e) { m_array.push_back(e); } //ADDED BY TS void swap(size_t i, size_t j) { - std::swap(array_[i],array_[j]); + std::swap(m_array[i], m_array[j]); } void resize(size_t new_size) { - array_.resize(std::min(new_size,array_.size())); + m_array.resize(std::min(new_size, m_array.size())); } //END_ADDED void merge(ScoreArray& e); - inline std::string name() const { - return score_type; - } + std::string name() const { return m_score_type; } - inline void name(std::string &sctype) { - score_type = sctype; - } + void name(std::string &score_type) { m_score_type = score_type; } - inline size_t size() const { - return array_.size(); - } - inline size_t NumberOfScores() const { - return number_of_scores; - } - inline void NumberOfScores(size_t v) { - number_of_scores = v; - } + size_t size() const { return m_array.size(); } - void savetxt(ofstream& outFile, const std::string& sctype); - void savebin(ofstream& outFile, const std::string& sctype); - void save(ofstream& outFile, const std::string& sctype, bool bin=false); - void save(const std::string &file, const std::string& sctype, bool bin=false); - inline void save(const std::string& sctype, bool bin=false) { - save("/dev/stdout", sctype, bin); - } + size_t NumberOfScores() const { return m_num_scores; } - void loadtxt(ifstream& inFile, size_t n); - void loadbin(ifstream& inFile, size_t n); - void load(ifstream& inFile); + void NumberOfScores(size_t v) { m_num_scores = v; } + + void savetxt(std::ostream* os, const std::string& score_type); + void savebin(std::ostream* os, const std::string& score_type); + void save(std::ostream* os, const std::string& score_type, bool bin=false); + void save(const std::string &file, const std::string& score_type, bool bin=false); + void save(const std::string& score_type, bool bin=false); + + void loadtxt(std::istream* is, size_t n); + void loadbin(std::istream* is, size_t n); + void load(std::istream* is); void load(const std::string &file); bool check_consistency() const; diff --git a/mert/ScoreData.cpp b/mert/ScoreData.cpp index 2852e413f..b4454dc4e 100644 --- a/mert/ScoreData.cpp +++ b/mert/ScoreData.cpp @@ -7,55 +7,56 @@ */ #include "ScoreData.h" + +#include #include "Scorer.h" #include "Util.h" #include "FileStream.h" -ScoreData::ScoreData(Scorer& ptr): - theScorer(&ptr) +ScoreData::ScoreData(Scorer* scorer) : + m_scorer(scorer) { - score_type = theScorer->getName(); + m_score_type = m_scorer->getName(); // This is not dangerous: we don't use the this pointer in SetScoreData. - theScorer->setScoreData(this); - number_of_scores = theScorer->NumberOfScores(); - // TRACE_ERR("ScoreData: number_of_scores: " << number_of_scores << std::endl); + m_scorer->setScoreData(this); + m_num_scores = m_scorer->NumberOfScores(); + // TRACE_ERR("ScoreData: m_num_scores: " << m_num_scores << std::endl); } -void ScoreData::save(std::ofstream& outFile, bool bin) +void ScoreData::save(ostream* os, bool bin) { - for (scoredata_t::iterator i = array_.begin(); i !=array_.end(); i++) { - i->save(outFile, score_type, bin); + for (scoredata_t::iterator i = m_array.begin(); + i != m_array.end(); ++i) { + i->save(os, m_score_type, bin); } } -void ScoreData::save(const std::string &file, bool bin) +void ScoreData::save(const string &file, bool bin) { if (file.empty()) return; - TRACE_ERR("saving the array into " << file << std::endl); + TRACE_ERR("saving the array into " << file << endl); // matches a stream with a file. Opens the file. - std::ofstream outFile(file.c_str(), std::ios::out); - - ScoreStats entry; - - save(outFile, bin); - - outFile.close(); + ofstream ofs(file.c_str(), ios::out); + ostream* os = &ofs; + save(os, bin); + ofs.close(); } -void ScoreData::load(ifstream& inFile) +void ScoreData::save(bool bin) { + save(&cout, bin); +} + +void ScoreData::load(istream* is) { ScoreArray entry; - while (!inFile.eof()) { - - if (!inFile.good()) { - std::cerr << "ERROR ScoreData::load inFile.good()" << std::endl; + while (!is->eof()) { + if (!is->good()) { + cerr << "ERROR ScoreData::load inFile.good()" << endl; } - entry.clear(); - entry.load(inFile); - + entry.load(is); if (entry.size() == 0) { break; } @@ -63,63 +64,58 @@ void ScoreData::load(ifstream& inFile) } } - -void ScoreData::load(const std::string &file) +void ScoreData::load(const string &file) { - TRACE_ERR("loading score data from " << file << std::endl); - - inputfilestream inFile(file); // matches a stream with a file. Opens the file - - if (!inFile) { + TRACE_ERR("loading score data from " << file << endl); + inputfilestream input_stream(file); // matches a stream with a file. Opens the file + if (!input_stream) { throw runtime_error("Unable to open score file: " + file); } - - load((ifstream&) inFile); - - inFile.close(); + istream* is = &input_stream; + load(is); + input_stream.close(); } - void ScoreData::add(ScoreArray& e) { if (exists(e.getIndex())) { // array at position e.getIndex() already exists //enlarge array at position e.getIndex() size_t pos = getIndex(e.getIndex()); - array_.at(pos).merge(e); + m_array.at(pos).merge(e); } else { - array_.push_back(e); + m_array.push_back(e); setIndex(); } } -void ScoreData::add(const ScoreStats& e, const std::string& sent_idx) +void ScoreData::add(const ScoreStats& e, const string& sent_idx) { if (exists(sent_idx)) { // array at position e.getIndex() already exists // Enlarge array at position e.getIndex() size_t pos = getIndex(sent_idx); // TRACE_ERR("Inserting in array " << sent_idx << std::endl); - array_.at(pos).add(e); + m_array.at(pos).add(e); // TRACE_ERR("size: " << size() << " -> " << a.size() << std::endl); } else { // TRACE_ERR("Creating a new entry in the array" << std::endl); ScoreArray a; - a.NumberOfScores(number_of_scores); + a.NumberOfScores(m_num_scores); a.add(e); a.setIndex(sent_idx); - size_t idx = array_.size(); - array_.push_back(a); - idx2arrayname_[idx] = sent_idx; - arrayname2idx_[sent_idx]=idx; + size_t idx = m_array.size(); + m_array.push_back(a); + m_index_to_array_name[idx] = sent_idx; + m_array_name_to_index[sent_idx]=idx; // TRACE_ERR("size: " << size() << " -> " << a.size() << std::endl); } } bool ScoreData::check_consistency() const { - if (array_.size() == 0) + if (m_array.size() == 0) return true; - for (scoredata_t::const_iterator i = array_.begin(); i != array_.end(); ++i) + for (scoredata_t::const_iterator i = m_array.begin(); i != m_array.end(); ++i) if (!i->check_consistency()) return false; return true; @@ -127,10 +123,10 @@ bool ScoreData::check_consistency() const void ScoreData::setIndex() { - size_t j=0; - for (scoredata_t::iterator i = array_.begin(); i !=array_.end(); i++) { - idx2arrayname_[j]=i->getIndex(); - arrayname2idx_[i->getIndex()]=j; + size_t j = 0; + for (scoredata_t::iterator i = m_array.begin(); i != m_array.end(); ++i) { + m_index_to_array_name[j] = i->getIndex(); + m_array_name_to_index[i->getIndex()]=j; j++; } } diff --git a/mert/ScoreData.h b/mert/ScoreData.h index d1a635e9e..70d7b9ab1 100644 --- a/mert/ScoreData.h +++ b/mert/ScoreData.h @@ -9,9 +9,8 @@ #ifndef MERT_SCORE_DATA_H_ #define MERT_SCORE_DATA_H_ -#include -#include #include +#include #include #include #include "ScoreArray.h" @@ -23,35 +22,34 @@ class Scorer; class ScoreData { -protected: - scoredata_t array_; - idx2name idx2arrayname_; // map from index to name of array - name2idx arrayname2idx_; // map from name to index of array - private: // Do not allow the user to instanciate without arguments. ScoreData() {} - Scorer* theScorer; - std::string score_type; - size_t number_of_scores; + scoredata_t m_array; + idx2name m_index_to_array_name; // map from index to name of array + name2idx m_array_name_to_index; // map from name to index of array + + Scorer* m_scorer; + std::string m_score_type; + size_t m_num_scores; public: - ScoreData(Scorer& sc); + ScoreData(Scorer* scorer); ~ScoreData() {} - inline void clear() { - array_.clear(); - } + void clear() { m_array.clear(); } inline ScoreArray get(const std::string& idx) { - return array_.at(getIndex(idx)); + return m_array.at(getIndex(idx)); } + inline ScoreArray& get(size_t idx) { - return array_.at(idx); + return m_array.at(idx); } + inline const ScoreArray& get(size_t idx) const { - return array_.at(idx); + return m_array.at(idx); } inline bool exists(const std::string& sent_idx) const { @@ -59,56 +57,51 @@ public: } inline bool exists(int sent_idx) const { - return (sent_idx > -1 && sent_idx < static_cast(array_.size())) ? true : false; + return (sent_idx > -1 && sent_idx < static_cast(m_array.size())) ? true : false; } inline ScoreStats& get(size_t i, size_t j) { - return array_.at(i).get(j); - } - inline const ScoreStats& get(size_t i, size_t j) const { - return array_.at(i).get(j); + return m_array.at(i).get(j); } - inline std::string name() const { - return score_type; + inline const ScoreStats& get(size_t i, size_t j) const { + return m_array.at(i).get(j); } - inline std::string name(const std::string &sctype) { - return score_type = sctype; + std::string name() const { return m_score_type; } + + std::string name(const std::string &score_type) { + return m_score_type = score_type; } void add(ScoreArray& e); void add(const ScoreStats& e, const std::string& sent_idx); - inline size_t NumberOfScores() const { - return number_of_scores; - } - inline size_t size() const { - return array_.size(); - } + size_t NumberOfScores() const { return m_num_scores; } + size_t size() const { return m_array.size(); } void save(const std::string &file, bool bin=false); - void save(ofstream& outFile, bool bin=false); - inline void save(bool bin=false) { - save("/dev/stdout", bin); - } + void save(std::ostream* os, bool bin=false); + void save(bool bin=false); - void load(ifstream& inFile); + void load(std::istream* is); void load(const std::string &file); bool check_consistency() const; + void setIndex(); inline int getIndex(const std::string& idx) const { - name2idx::const_iterator i = arrayname2idx_.find(idx); - if (i != arrayname2idx_.end()) + name2idx::const_iterator i = m_array_name_to_index.find(idx); + if (i != m_array_name_to_index.end()) return i->second; else return -1; } + inline std::string getIndex(size_t idx) const { - idx2name::const_iterator i = idx2arrayname_.find(idx); - if (i != idx2arrayname_.end()) + idx2name::const_iterator i = m_index_to_array_name.find(idx); + if (i != m_index_to_array_name.end()) throw runtime_error("there is no entry at index " + idx); return i->second; } diff --git a/mert/ScoreStats.cpp b/mert/ScoreStats.cpp index 0f49e8edc..e6c111d5d 100644 --- a/mert/ScoreStats.cpp +++ b/mert/ScoreStats.cpp @@ -14,30 +14,30 @@ const int kAvailableSize = 8; } // namespace ScoreStats::ScoreStats() - : available_(kAvailableSize), entries_(0), - array_(new ScoreStatsType[available_]) {} + : m_available_size(kAvailableSize), m_entries(0), + m_array(new ScoreStatsType[m_available_size]) {} ScoreStats::ScoreStats(const size_t size) - : available_(size), entries_(size), - array_(new ScoreStatsType[available_]) + : m_available_size(size), m_entries(size), + m_array(new ScoreStatsType[m_available_size]) { - memset(array_, 0, GetArraySizeWithBytes()); + memset(m_array, 0, GetArraySizeWithBytes()); } ScoreStats::~ScoreStats() { - if (array_) { - delete [] array_; - array_ = NULL; + if (m_array) { + delete [] m_array; + m_array = NULL; } } void ScoreStats::Copy(const ScoreStats &stats) { - available_ = stats.available(); - entries_ = stats.size(); - array_ = new ScoreStatsType[available_]; - memcpy(array_, stats.getArray(), GetArraySizeWithBytes()); + m_available_size = stats.available(); + m_entries = stats.size(); + m_array = new ScoreStatsType[m_available_size]; + memcpy(m_array, stats.getArray(), GetArraySizeWithBytes()); } ScoreStats::ScoreStats(const ScoreStats &stats) @@ -47,27 +47,27 @@ ScoreStats::ScoreStats(const ScoreStats &stats) ScoreStats& ScoreStats::operator=(const ScoreStats &stats) { - delete [] array_; + delete [] m_array; Copy(stats); return *this; } void ScoreStats::expand() { - available_ *= 2; - scorestats_t buf = new ScoreStatsType[available_]; - memcpy(buf, array_, GetArraySizeWithBytes()); - delete [] array_; - array_ = buf; + m_available_size *= 2; + scorestats_t buf = new ScoreStatsType[m_available_size]; + memcpy(buf, m_array, GetArraySizeWithBytes()); + delete [] m_array; + m_array = buf; } void ScoreStats::add(ScoreStatsType v) { if (isfull()) expand(); - array_[entries_++]=v; + m_array[m_entries++]=v; } -void ScoreStats::set(const std::string& str) +void ScoreStats::set(const string& str) { reset(); vector out; @@ -78,46 +78,51 @@ void ScoreStats::set(const std::string& str) } } -void ScoreStats::loadbin(std::ifstream& inFile) +void ScoreStats::loadbin(istream* is) { - inFile.read((char*)array_, GetArraySizeWithBytes()); + is->read(reinterpret_cast(m_array), + static_cast(GetArraySizeWithBytes())); } -void ScoreStats::loadtxt(std::ifstream& inFile) +void ScoreStats::loadtxt(istream* is) { - std::string theString; - std::getline(inFile, theString); - set(theString); + string line; + getline(*is, line); + set(line); } -void ScoreStats::loadtxt(const std::string &file) +void ScoreStats::loadtxt(const string &file) { -// TRACE_ERR("loading the stats from " << file << std::endl); - - std::ifstream inFile(file.c_str(), std::ios::in); // matches a stream with a file. Opens the file - - loadtxt(inFile); + ifstream ifs(file.c_str(), ios::in); // matches a stream with a file. Opens the file + if (!ifs) { + cerr << "Failed to open " << file << endl; + exit(1); + } + istream* is = &ifs; + loadtxt(is); } -void ScoreStats::savetxt(const std::string &file) +void ScoreStats::savetxt(const string &file) { -// TRACE_ERR("saving the stats into " << file << std::endl); - - std::ofstream outFile(file.c_str(), std::ios::out); // matches a stream with a file. Opens the file - - savetxt(outFile); + ofstream ofs(file.c_str(), ios::out); // matches a stream with a file. Opens the file + ostream* os = &ofs; + savetxt(os); } - -void ScoreStats::savetxt(std::ofstream& outFile) +void ScoreStats::savetxt(ostream* os) { - outFile << *this; + *os << *this; } -void ScoreStats::savebin(std::ofstream& outFile) +void ScoreStats::savetxt() { + savetxt(&cout); +} + +void ScoreStats::savebin(ostream* os) { - outFile.write((char*)array_, GetArraySizeWithBytes()); + os->write(reinterpret_cast(m_array), + static_cast(GetArraySizeWithBytes())); } ostream& operator<<(ostream& o, const ScoreStats& e) diff --git a/mert/ScoreStats.h b/mert/ScoreStats.h index cbdf1c8c2..e8d4543ce 100644 --- a/mert/ScoreStats.h +++ b/mert/ScoreStats.h @@ -22,11 +22,11 @@ using namespace std; class ScoreStats { private: - size_t available_; - size_t entries_; + size_t m_available_size; + size_t m_entries; // TODO: Use smart pointer for exceptional-safety. - scorestats_t array_; + scorestats_t m_array; public: ScoreStats(); @@ -40,31 +40,23 @@ public: void Copy(const ScoreStats &stats); - bool isfull() const { - return (entries_ < available_) ? 0 : 1; - } + bool isfull() const { return (m_entries < m_available_size) ? 0 : 1; } void expand(); void add(ScoreStatsType v); void clear() { - memset((void*)array_, 0, GetArraySizeWithBytes()); + memset((void*)m_array, 0, GetArraySizeWithBytes()); } void reset() { - entries_ = 0; + m_entries = 0; clear(); } - inline ScoreStatsType get(size_t i) { - return array_[i]; - } - inline ScoreStatsType get(size_t i)const { - return array_[i]; - } - inline scorestats_t getArray() const { - return array_; - } + ScoreStatsType get(size_t i) { return m_array[i]; } + ScoreStatsType get(size_t i) const { return m_array[i]; } + scorestats_t getArray() const { return m_array; } void set(const std::string& str); @@ -76,31 +68,24 @@ public: } } - inline size_t bytes() const { - return GetArraySizeWithBytes(); - } + size_t bytes() const { return GetArraySizeWithBytes(); } size_t GetArraySizeWithBytes() const { - return entries_ * sizeof(ScoreStatsType); + return m_entries * sizeof(ScoreStatsType); } - inline size_t size() const { - return entries_; - } - inline size_t available() const { - return available_; - } + size_t size() const { return m_entries; } + + size_t available() const { return m_available_size; } void savetxt(const std::string &file); - void savetxt(ofstream& outFile); - void savebin(ofstream& outFile); - inline void savetxt() { - savetxt("/dev/stdout"); - } + void savetxt(ostream* os); + void savebin(ostream* os); + void savetxt(); void loadtxt(const std::string &file); - void loadtxt(ifstream& inFile); - void loadbin(ifstream& inFile); + void loadtxt(istream* is); + void loadbin(istream* is); /** * Write the whole object to a stream. diff --git a/mert/evaluator.cpp b/mert/evaluator.cpp index 3b2e0d61f..eb8a9be95 100644 --- a/mert/evaluator.cpp +++ b/mert/evaluator.cpp @@ -55,7 +55,7 @@ void EvaluatorUtil::evaluate(const string& candFile, int bootstrap) for (int i = 0; i < bootstrap; ++i) { // TODO: Use smart pointer for exceptional-safety. - ScoreData* scoredata = new ScoreData(*g_scorer); + ScoreData* scoredata = new ScoreData(g_scorer); for (int j = 0; j < n; ++j) { int randomIndex = random() % n; @@ -89,7 +89,7 @@ void EvaluatorUtil::evaluate(const string& candFile, int bootstrap) else { // TODO: Use smart pointer for exceptional-safety. - ScoreData* scoredata = new ScoreData(*g_scorer); + ScoreData* scoredata = new ScoreData(g_scorer); for (int sid = 0; sid < n; ++sid) { string str_sid = int2string(sid); diff --git a/mert/extractor.cpp b/mert/extractor.cpp index 99567281a..3442ed36b 100644 --- a/mert/extractor.cpp +++ b/mert/extractor.cpp @@ -197,7 +197,7 @@ int main(int argc, char** argv) PrintUserTime("References loaded"); - Data data(*scorer); + Data data(scorer.get()); // load old data for (size_t i = 0; i < prevScoreDataFiles.size(); i++) { @@ -208,13 +208,13 @@ int main(int argc, char** argv) // computing score statistics of each nbest file for (size_t i = 0; i < nbestFiles.size(); i++) { - data.loadnbest(nbestFiles.at(i)); + data.loadNBest(nbestFiles.at(i)); } PrintUserTime("Nbest entries loaded and scored"); //ADDED_BY_TS - data.remove_duplicates(); + data.removeDuplicates(); //END_ADDED data.save(option.featureDataFile, option.scoreDataFile, option.binmode); diff --git a/mert/mert.cpp b/mert/mert.cpp index 694b48e8e..2455aa39f 100755 --- a/mert/mert.cpp +++ b/mert/mert.cpp @@ -338,7 +338,7 @@ int main(int argc, char **argv) ScorerFactory::getScorer(option.scorer_type, option.scorer_config)); //load data - Data data(*scorer); + Data data(scorer.get()); for (size_t i = 0; i < ScoreDataFiles.size(); i++) { cerr<<"Loading Data from: "<< ScoreDataFiles.at(i) << " and " << FeatureDataFiles.at(i) << endl; @@ -348,7 +348,7 @@ int main(int argc, char **argv) scorer->setScoreData(data.getScoreData().get()); //ADDED_BY_TS - data.remove_duplicates(); + data.removeDuplicates(); //END_ADDED PrintUserTime("Data loaded"); @@ -434,7 +434,7 @@ int main(int argc, char **argv) vector& tasks = allTasks[i]; Optimizer *optimizer = OptimizerFactory::BuildOptimizer(option.pdim, to_optimize, start_list[0], option.optimize_type, option.nrandom); optimizer->SetScorer(data_ref.getScorer()); - optimizer->SetFData(data_ref.getFeatureData()); + optimizer->SetFeatureData(data_ref.getFeatureData()); // A task for each start point for (size_t j = 0; j < startingPoints.size(); ++j) { OptimizationTask* task = new OptimizationTask(optimizer, startingPoints[j]); diff --git a/mert/pro.cpp b/mert/pro.cpp index a18e7a117..b29bbe052 100644 --- a/mert/pro.cpp +++ b/mert/pro.cpp @@ -21,8 +21,8 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ***********************************************************************/ -/** - * This is part of the PRO implementation. It converts the features and scores +/** + * This is part of the PRO implementation. It converts the features and scores * files into a form suitable for input into the megam maxent trainer. * * For details of PRO, refer to Hopkins & May (EMNLP 2011) @@ -34,9 +34,11 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #include #include #include +#include #include +#include "BleuScorer.h" #include "FeatureDataIterator.h" #include "ScoreDataIterator.h" @@ -46,49 +48,49 @@ namespace po = boost::program_options; class SampledPair { private: - pair translation1; - pair translation2; - float scoreDiff; + pair m_translation1; + pair m_translation2; + float m_score_diff; + public: - SampledPair(const pair& t1, const pair& t2, float diff ) { - if (diff > 0) { - translation1 = t1; - translation2 = t2; - scoreDiff = diff; - } - else { - translation1 = t2; - translation2 = t1; - scoreDiff = -diff; - } - } - float getDiff() const { return scoreDiff; } - const pair& getTranslation1() const { return translation1; } - const pair& getTranslation2() const { return translation2; } + SampledPair(const pair& t1, const pair& t2, float diff ) { + if (diff > 0) { + m_translation1 = t1; + m_translation2 = t2; + m_score_diff = diff; + } else { + m_translation1 = t2; + m_translation2 = t1; + m_score_diff = -diff; + } + } + + float getDiff() const { return m_score_diff; } + const pair& getTranslation1() const { return m_translation1; } + const pair& getTranslation2() const { return m_translation2; } }; static float sentenceLevelBleuPlusOne(const vector& stats) { - float logbleu = 0.0; - const unsigned int bleu_order = 4; - for (unsigned int j=0; j(stats[(bleu_order*2)]) / stats[1]; - if (brevity < 0.0) { - logbleu += brevity; - } - //cerr << brevity << " -> " << exp(logbleu) << endl; - return exp(logbleu); + float logbleu = 0.0; + for (unsigned int j=0; j(stats[(kBleuNgramOrder * 2)]) / stats[1]; + if (brevity < 0.0) { + logbleu += brevity; + } + //cerr << brevity << " -> " << exp(logbleu) << endl; + return exp(logbleu); } static void outputSample(ostream& out, const FeatureDataItem& f1, const FeatureDataItem& f2) { // difference in score in regular features - for(unsigned int j=0; j 0.00001) - out << " F" << j << " " << (f1.dense[j]-f2.dense[j]); + for(unsigned int j=0; j 0.00001) + out << " F" << j << " " << (f1.dense[j]-f2.dense[j]); if (f1.sparse.size() || f2.sparse.size()) { out << " "; @@ -101,27 +103,27 @@ static void outputSample(ostream& out, const FeatureDataItem& f1, const FeatureD } } - -int main(int argc, char** argv) + +int main(int argc, char** argv) { bool help; vector scoreFiles; vector featureFiles; int seed; string outputFile; - //TODO: options - const unsigned int n_candidates = 5000; // Gamma, in Hopkins & May - const unsigned int n_samples = 50; // Xi, in Hopkins & May - const float min_diff = 0.05; + // TODO: Add these constants to options + const unsigned int n_candidates = 5000; // Gamma, in Hopkins & May + const unsigned int n_samples = 50; // Xi, in Hopkins & May + const float min_diff = 0.05; po::options_description desc("Allowed options"); desc.add_options() - ("help,h", po::value(&help)->zero_tokens()->default_value(false), "Print this help message and exit") - ("scfile,S", po::value >(&scoreFiles), "Scorer data files") - ("ffile,F", po::value > (&featureFiles), "Feature data files") - ("random-seed,r", po::value(&seed), "Seed for random number generation") - ("output-file,o", po::value(&outputFile), "Output file") - ; + ("help,h", po::value(&help)->zero_tokens()->default_value(false), "Print this help message and exit") + ("scfile,S", po::value >(&scoreFiles), "Scorer data files") + ("ffile,F", po::value > (&featureFiles), "Feature data files") + ("random-seed,r", po::value(&seed), "Seed for random number generation") + ("output-file,o", po::value(&outputFile), "Output file") + ; po::options_description cmdline_options; cmdline_options.add(desc); @@ -134,7 +136,7 @@ int main(int argc, char** argv) cout << desc << endl; exit(0); } - + if (vm.count("random-seed")) { cerr << "Initialising random seed to " << seed << endl; srand(seed); @@ -167,7 +169,7 @@ int main(int argc, char** argv) out = &cout; } - + vector featureDataIters; vector scoreDataIters; for (size_t i = 0; i < featureFiles.size(); ++i) { @@ -179,7 +181,7 @@ int main(int argc, char** argv) size_t sentenceId = 0; while(1) { vector > hypotheses; - //TODO: de-deuping. Collect hashes of score,feature pairs and + //TODO: de-deuping. Collect hashes of score,feature pairs and //only add index if it's unique. if (featureDataIters[0] == FeatureDataIterator::end()) { break; @@ -214,7 +216,7 @@ int main(int argc, char** argv) size_t rand2 = rand() % n_translations; pair translation2 = hypotheses[rand2]; float bleu2 = sentenceLevelBleuPlusOne(scoreDataIters[translation2.first]->operator[](translation2.second)); - + /* cerr << "t(" << translation1.first << "," << translation1.second << ") = " << bleu1 << " t(" << translation2.first << "," << translation2.second << ") = " << @@ -222,7 +224,7 @@ int main(int argc, char** argv) */ if (abs(bleu1-bleu2) < min_diff) continue; - + samples.push_back(SampledPair(translation1, translation2, bleu1-bleu2)); scores.push_back(1.0-abs(bleu1-bleu2)); } @@ -261,4 +263,3 @@ int main(int argc, char** argv) outFile.close(); } -