mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-28 14:32:38 +03:00
Merge branch 'master' of github.com:moses-smt/mosesdecoder
This commit is contained in:
commit
d5efa27be9
@ -85,7 +85,6 @@ class BleuScorer::NgramCounts {
|
||||
|
||||
BleuScorer::BleuScorer(const string& config)
|
||||
: StatisticsBasedScorer("BLEU", config),
|
||||
kLENGTH(4),
|
||||
m_ref_length_type(CLOSEST) {
|
||||
const string reflen = getConfig(KEY_REFLEN, REFLEN_CLOSEST);
|
||||
if (reflen == REFLEN_AVERAGE) {
|
||||
@ -150,7 +149,7 @@ void BleuScorer::setReferenceFiles(const vector<string>& referenceFiles)
|
||||
throw runtime_error("File " + referenceFiles[i] + " has too many sentences");
|
||||
}
|
||||
NgramCounts counts;
|
||||
size_t length = countNgrams(line, counts, kLENGTH);
|
||||
size_t length = countNgrams(line, counts, kBleuNgramOrder);
|
||||
|
||||
//for any counts larger than those already there, merge them in
|
||||
for (NgramCounts::const_iterator ci = counts.begin(); ci != counts.end(); ++ci) {
|
||||
@ -184,9 +183,9 @@ void BleuScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
|
||||
}
|
||||
NgramCounts testcounts;
|
||||
// stats for this line
|
||||
vector<ScoreStatsType> stats(kLENGTH * 2);
|
||||
vector<ScoreStatsType> stats(kBleuNgramOrder * 2);
|
||||
string sentence = this->applyFactors(text);
|
||||
const size_t length = countNgrams(sentence, testcounts, kLENGTH);
|
||||
const size_t length = countNgrams(sentence, testcounts, kBleuNgramOrder);
|
||||
|
||||
// Calculate effective reference length.
|
||||
switch (m_ref_length_type) {
|
||||
@ -222,15 +221,16 @@ void BleuScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
|
||||
float BleuScorer::calculateScore(const vector<int>& comps) const
|
||||
{
|
||||
float logbleu = 0.0;
|
||||
for (int i = 0; i < kLENGTH; ++i) {
|
||||
for (int i = 0; i < kBleuNgramOrder; ++i) {
|
||||
if (comps[2*i] == 0) {
|
||||
return 0.0;
|
||||
}
|
||||
logbleu += log(comps[2*i]) - log(comps[2*i+1]);
|
||||
|
||||
}
|
||||
logbleu /= kLENGTH;
|
||||
const float brevity = 1.0 - static_cast<float>(comps[kLENGTH*2]) / comps[1];//reflength divided by test length
|
||||
logbleu /= kBleuNgramOrder;
|
||||
// reflength divided by test length
|
||||
const float brevity = 1.0 - static_cast<float>(comps[kBleuNgramOrder * 2]) / comps[1];
|
||||
if (brevity < 0.0) {
|
||||
logbleu += brevity;
|
||||
}
|
||||
|
@ -12,6 +12,8 @@
|
||||
|
||||
using namespace std;
|
||||
|
||||
const int kBleuNgramOrder = 4;
|
||||
|
||||
/**
|
||||
* Bleu scoring
|
||||
*/
|
||||
@ -24,7 +26,7 @@ public:
|
||||
virtual void setReferenceFiles(const vector<string>& referenceFiles);
|
||||
virtual void prepareStats(size_t sid, const string& text, ScoreStats& entry);
|
||||
virtual float calculateScore(const vector<int>& comps) const;
|
||||
virtual size_t NumberOfScores() const { return 2 * kLENGTH + 1; }
|
||||
virtual size_t NumberOfScores() const { return 2 * kBleuNgramOrder + 1; }
|
||||
|
||||
private:
|
||||
enum ReferenceLengthType {
|
||||
@ -55,7 +57,6 @@ private:
|
||||
void CalcShortest(size_t sentence_id,
|
||||
vector<ScoreStatsType>& stats) const;
|
||||
|
||||
const int kLENGTH;
|
||||
ReferenceLengthType m_ref_length_type;
|
||||
|
||||
// data extracted from reference files
|
||||
|
156
mert/Data.cpp
156
mert/Data.cpp
@ -7,7 +7,6 @@
|
||||
*/
|
||||
|
||||
#include <algorithm>
|
||||
#include "util/check.hh"
|
||||
#include <cmath>
|
||||
#include <fstream>
|
||||
|
||||
@ -16,36 +15,37 @@
|
||||
#include "Scorer.h"
|
||||
#include "ScorerFactory.h"
|
||||
#include "Util.h"
|
||||
#include "util/check.hh"
|
||||
|
||||
Data::Data()
|
||||
: theScorer(NULL),
|
||||
number_of_scores(0),
|
||||
_sparse_flag(false),
|
||||
scoredata(),
|
||||
featdata() {}
|
||||
: m_scorer(NULL),
|
||||
m_num_scores(0),
|
||||
m_sparse_flag(false),
|
||||
m_score_data(),
|
||||
m_feature_data() {}
|
||||
|
||||
Data::Data(Scorer& ptr)
|
||||
: theScorer(&ptr),
|
||||
score_type(theScorer->getName()),
|
||||
number_of_scores(0),
|
||||
_sparse_flag(false),
|
||||
scoredata(new ScoreData(*theScorer)),
|
||||
featdata(new FeatureData)
|
||||
Data::Data(Scorer* scorer)
|
||||
: m_scorer(scorer),
|
||||
m_score_type(m_scorer->getName()),
|
||||
m_num_scores(0),
|
||||
m_sparse_flag(false),
|
||||
m_score_data(new ScoreData(m_scorer)),
|
||||
m_feature_data(new FeatureData)
|
||||
{
|
||||
TRACE_ERR("Data::score_type " << score_type << endl);
|
||||
TRACE_ERR("Data::Scorer type from Scorer: " << theScorer->getName() << endl);
|
||||
TRACE_ERR("Data::m_score_type " << m_score_type << endl);
|
||||
TRACE_ERR("Data::Scorer type from Scorer: " << m_scorer->getName() << endl);
|
||||
}
|
||||
|
||||
//ADDED BY TS
|
||||
void Data::remove_duplicates() {
|
||||
// TODO: This is too long; consider creating additional functions to
|
||||
// reduce the lines of this function.
|
||||
void Data::removeDuplicates() {
|
||||
size_t nSentences = m_feature_data->size();
|
||||
assert(m_score_data->size() == nSentences);
|
||||
|
||||
size_t nSentences = featdata->size();
|
||||
assert(scoredata->size() == nSentences);
|
||||
|
||||
for (size_t s=0; s < nSentences; s++) {
|
||||
|
||||
FeatureArray& feat_array = featdata->get(s);
|
||||
ScoreArray& score_array = scoredata->get(s);
|
||||
for (size_t s = 0; s < nSentences; s++) {
|
||||
FeatureArray& feat_array = m_feature_data->get(s);
|
||||
ScoreArray& score_array = m_score_data->get(s);
|
||||
|
||||
assert(feat_array.size() == score_array.size());
|
||||
|
||||
@ -55,48 +55,42 @@ void Data::remove_duplicates() {
|
||||
size_t end_pos = feat_array.size() - 1;
|
||||
|
||||
size_t nRemoved = 0;
|
||||
for (size_t k=0; k <= end_pos; k++) {
|
||||
|
||||
for (size_t k = 0; k <= end_pos; k++) {
|
||||
const FeatureStats& cur_feats = feat_array.get(k);
|
||||
|
||||
double sum = 0.0;
|
||||
for (size_t l=0; l < cur_feats.size(); l++)
|
||||
sum += cur_feats.get(l);
|
||||
for (size_t l = 0; l < cur_feats.size(); l++)
|
||||
sum += cur_feats.get(l);
|
||||
|
||||
if (lookup.find(sum) != lookup.end()) {
|
||||
|
||||
//cerr << "hit" << endl;
|
||||
//cerr << "hit" << endl;
|
||||
vector<size_t>& cur_list = lookup[sum];
|
||||
|
||||
vector<size_t>& cur_list = lookup[sum];
|
||||
// TODO: Make sure this is correct because we have already used 'l'.
|
||||
// If this does not impact on the removing duplicates, it is better
|
||||
// to change
|
||||
size_t l = 0;
|
||||
for (l = 0; l < cur_list.size(); l++) {
|
||||
size_t j = cur_list[l];
|
||||
|
||||
size_t l=0;
|
||||
for (l=0; l < cur_list.size(); l++) {
|
||||
|
||||
size_t j=cur_list[l];
|
||||
|
||||
if (cur_feats == feat_array.get(j)
|
||||
&& score_array.get(k) == score_array.get(j)) {
|
||||
|
||||
if (k < end_pos) {
|
||||
|
||||
feat_array.swap(k,end_pos);
|
||||
score_array.swap(k,end_pos);
|
||||
|
||||
k--;
|
||||
}
|
||||
|
||||
end_pos--;
|
||||
nRemoved++;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (l == lookup[sum].size())
|
||||
cur_list.push_back(k);
|
||||
if (cur_feats == feat_array.get(j)
|
||||
&& score_array.get(k) == score_array.get(j)) {
|
||||
if (k < end_pos) {
|
||||
feat_array.swap(k,end_pos);
|
||||
score_array.swap(k,end_pos);
|
||||
k--;
|
||||
}
|
||||
end_pos--;
|
||||
nRemoved++;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (l == lookup[sum].size())
|
||||
cur_list.push_back(k);
|
||||
} else {
|
||||
lookup[sum].push_back(k);
|
||||
}
|
||||
else
|
||||
lookup[sum].push_back(k);
|
||||
|
||||
// for (size_t j=0; j < k; j++) {
|
||||
|
||||
// if (feat_array.get(k) == feat_array.get(j)
|
||||
@ -115,11 +109,9 @@ void Data::remove_duplicates() {
|
||||
// break;
|
||||
// }
|
||||
// }
|
||||
}
|
||||
|
||||
} // end for k
|
||||
|
||||
if (nRemoved > 0) {
|
||||
|
||||
feat_array.resize(end_pos+1);
|
||||
score_array.resize(end_pos+1);
|
||||
}
|
||||
@ -127,8 +119,14 @@ void Data::remove_duplicates() {
|
||||
}
|
||||
//END_ADDED
|
||||
|
||||
void Data::load(const std::string &featfile, const std::string &scorefile) {
|
||||
m_feature_data->load(featfile);
|
||||
m_score_data->load(scorefile);
|
||||
if (m_feature_data->hasSparseFeatures())
|
||||
m_sparse_flag = true;
|
||||
}
|
||||
|
||||
void Data::loadnbest(const string &file)
|
||||
void Data::loadNBest(const string &file)
|
||||
{
|
||||
TRACE_ERR("loading nbest from " << file << endl);
|
||||
inputfilestream inp(file); // matches a stream with a file. Opens the file
|
||||
@ -147,8 +145,8 @@ void Data::loadnbest(const string &file)
|
||||
getNextPound(line, sentence, "|||"); // second field
|
||||
getNextPound(line, feature_str, "|||"); // third field
|
||||
|
||||
theScorer->prepareStats(sentence_index, sentence, scoreentry);
|
||||
scoredata->add(scoreentry, sentence_index);
|
||||
m_scorer->prepareStats(sentence_index, sentence, scoreentry);
|
||||
m_score_data->add(scoreentry, sentence_index);
|
||||
|
||||
// examine first line for name of features
|
||||
if (!existsFeatureNames()) {
|
||||
@ -159,6 +157,16 @@ void Data::loadnbest(const string &file)
|
||||
inp.close();
|
||||
}
|
||||
|
||||
void Data::save(const std::string &featfile, const std::string &scorefile, bool bin) {
|
||||
if (bin)
|
||||
cerr << "Binary write mode is selected" << endl;
|
||||
else
|
||||
cerr << "Binary write mode is NOT selected" << endl;
|
||||
|
||||
m_feature_data->save(featfile, bin);
|
||||
m_score_data->save(scorefile, bin);
|
||||
}
|
||||
|
||||
void Data::InitFeatureMap(const string& str) {
|
||||
string buf = str;
|
||||
string substr;
|
||||
@ -185,7 +193,7 @@ void Data::InitFeatureMap(const string& str) {
|
||||
tmp_name = substr.substr(0, substr.size() - 1);
|
||||
}
|
||||
}
|
||||
featdata->setFeatureMap(features);
|
||||
m_feature_data->setFeatureMap(features);
|
||||
}
|
||||
|
||||
void Data::AddFeatures(const string& str,
|
||||
@ -207,10 +215,10 @@ void Data::AddFeatures(const string& str,
|
||||
string name = substr;
|
||||
getNextPound(buf, substr);
|
||||
feature_entry.addSparse(name, atof(substr.c_str()));
|
||||
_sparse_flag = true;
|
||||
m_sparse_flag = true;
|
||||
}
|
||||
}
|
||||
featdata->add(feature_entry, sentence_index);
|
||||
m_feature_data->add(feature_entry, sentence_index);
|
||||
}
|
||||
|
||||
// TODO
|
||||
@ -226,8 +234,8 @@ void Data::createShards(size_t shard_count, float shard_size, const string& scor
|
||||
CHECK(shard_size >= 0);
|
||||
CHECK(shard_size <= 1);
|
||||
|
||||
size_t data_size = scoredata->size();
|
||||
CHECK(data_size == featdata->size());
|
||||
size_t data_size = m_score_data->size();
|
||||
CHECK(data_size == m_feature_data->size());
|
||||
|
||||
shard_size *= data_size;
|
||||
const float coeff = static_cast<float>(data_size) / shard_count;
|
||||
@ -248,15 +256,15 @@ void Data::createShards(size_t shard_count, float shard_size, const string& scor
|
||||
}
|
||||
}
|
||||
|
||||
Scorer* scorer = ScorerFactory::getScorer(score_type, scorerconfig);
|
||||
Scorer* scorer = ScorerFactory::getScorer(m_score_type, scorerconfig);
|
||||
|
||||
shards.push_back(Data(*scorer));
|
||||
shards.back().score_type = score_type;
|
||||
shards.back().number_of_scores = number_of_scores;
|
||||
shards.back()._sparse_flag = _sparse_flag;
|
||||
shards.push_back(Data(scorer));
|
||||
shards.back().m_score_type = m_score_type;
|
||||
shards.back().m_num_scores = m_num_scores;
|
||||
shards.back().m_sparse_flag = m_sparse_flag;
|
||||
for (size_t i = 0; i < shard_contents.size(); ++i) {
|
||||
shards.back().featdata->add(featdata->get(shard_contents[i]));
|
||||
shards.back().scoredata->add(scoredata->get(shard_contents[i]));
|
||||
shards.back().m_feature_data->add(m_feature_data->get(shard_contents[i]));
|
||||
shards.back().m_score_data->add(m_score_data->get(shard_contents[i]));
|
||||
}
|
||||
//cerr << endl;
|
||||
}
|
||||
|
94
mert/Data.h
94
mert/Data.h
@ -11,11 +11,8 @@
|
||||
|
||||
using namespace std;
|
||||
|
||||
#include <limits>
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
|
||||
#include<boost/shared_ptr.hpp>
|
||||
#include <boost/shared_ptr.hpp>
|
||||
|
||||
#include "Util.h"
|
||||
#include "FeatureData.h"
|
||||
@ -26,95 +23,70 @@ class Scorer;
|
||||
typedef boost::shared_ptr<ScoreData> ScoreDataHandle;
|
||||
typedef boost::shared_ptr<FeatureData> FeatureDataHandle;
|
||||
|
||||
// NOTE: there is no copy constructor implemented, so only the
|
||||
// compiler synthesised shallow copy is available.
|
||||
class Data
|
||||
{
|
||||
private:
|
||||
Scorer* theScorer;
|
||||
std::string score_type;
|
||||
size_t number_of_scores;
|
||||
bool _sparse_flag;
|
||||
Scorer* m_scorer;
|
||||
std::string m_score_type;
|
||||
size_t m_num_scores;
|
||||
bool m_sparse_flag;
|
||||
ScoreDataHandle m_score_data;
|
||||
FeatureDataHandle m_feature_data;
|
||||
|
||||
// Helper functions for loadnbest();
|
||||
void InitFeatureMap(const std::string& str);
|
||||
void AddFeatures(const std::string& str,
|
||||
const std::string& sentence_index);
|
||||
|
||||
protected:
|
||||
ScoreDataHandle scoredata;
|
||||
FeatureDataHandle featdata;
|
||||
|
||||
public:
|
||||
explicit Data(Scorer& sc);
|
||||
explicit Data(Scorer* scorer);
|
||||
Data();
|
||||
|
||||
//Note that there is no copy constructor implemented, so only the
|
||||
//compiler synthesised shallow copy is available
|
||||
|
||||
inline void clear() {
|
||||
scoredata->clear();
|
||||
featdata->clear();
|
||||
void clear() {
|
||||
m_score_data->clear();
|
||||
m_feature_data->clear();
|
||||
}
|
||||
|
||||
ScoreDataHandle getScoreData() {
|
||||
return scoredata;
|
||||
ScoreDataHandle getScoreData() { return m_score_data; }
|
||||
|
||||
FeatureDataHandle getFeatureData() { return m_feature_data; }
|
||||
|
||||
Scorer* getScorer() { return m_scorer; }
|
||||
|
||||
size_t NumberOfFeatures() const {
|
||||
return m_feature_data->NumberOfFeatures();
|
||||
}
|
||||
|
||||
FeatureDataHandle getFeatureData() {
|
||||
return featdata;
|
||||
}
|
||||
void NumberOfFeatures(size_t v) { m_feature_data->NumberOfFeatures(v); }
|
||||
|
||||
Scorer* getScorer() {
|
||||
return theScorer;
|
||||
}
|
||||
std::string Features() const { return m_feature_data->Features(); }
|
||||
void Features(const std::string &f) { m_feature_data->Features(f); }
|
||||
|
||||
inline size_t NumberOfFeatures() const {
|
||||
return featdata->NumberOfFeatures();
|
||||
}
|
||||
inline void NumberOfFeatures(size_t v) {
|
||||
featdata->NumberOfFeatures(v);
|
||||
}
|
||||
inline std::string Features() const {
|
||||
return featdata->Features();
|
||||
}
|
||||
inline void Features(const std::string &f) {
|
||||
featdata->Features(f);
|
||||
}
|
||||
|
||||
inline bool hasSparseFeatures() const { return _sparse_flag; }
|
||||
bool hasSparseFeatures() const { return m_sparse_flag; }
|
||||
void mergeSparseFeatures();
|
||||
|
||||
void loadnbest(const std::string &file);
|
||||
void loadNBest(const std::string &file);
|
||||
|
||||
void load(const std::string &featfile,const std::string &scorefile) {
|
||||
featdata->load(featfile);
|
||||
scoredata->load(scorefile);
|
||||
if (featdata->hasSparseFeatures())
|
||||
_sparse_flag = true;
|
||||
}
|
||||
void load(const std::string &featfile, const std::string &scorefile);
|
||||
|
||||
void save(const std::string &featfile, const std::string &scorefile, bool bin=false);
|
||||
|
||||
//ADDED BY TS
|
||||
void remove_duplicates();
|
||||
void removeDuplicates();
|
||||
//END_ADDED
|
||||
|
||||
void save(const std::string &featfile,const std::string &scorefile, bool bin=false) {
|
||||
|
||||
if (bin) cerr << "Binary write mode is selected" << endl;
|
||||
else cerr << "Binary write mode is NOT selected" << endl;
|
||||
|
||||
featdata->save(featfile, bin);
|
||||
scoredata->save(scorefile, bin);
|
||||
}
|
||||
|
||||
inline bool existsFeatureNames() const {
|
||||
return featdata->existsFeatureNames();
|
||||
return m_feature_data->existsFeatureNames();
|
||||
}
|
||||
|
||||
inline std::string getFeatureName(size_t idx) const {
|
||||
return featdata->getFeatureName(idx);
|
||||
return m_feature_data->getFeatureName(idx);
|
||||
}
|
||||
|
||||
inline size_t getFeatureIndex(const std::string& name) const {
|
||||
return featdata->getFeatureIndex(name);
|
||||
return m_feature_data->getFeatureIndex(name);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -10,7 +10,7 @@
|
||||
//very basic test of sharding
|
||||
BOOST_AUTO_TEST_CASE(shard_basic) {
|
||||
boost::scoped_ptr<Scorer> scorer(ScorerFactory::getScorer("BLEU", ""));
|
||||
Data data(*scorer);
|
||||
Data data(scorer.get());
|
||||
FeatureArray fa1, fa2, fa3, fa4;
|
||||
ScoreArray sa1, sa2, sa3, sa4;
|
||||
fa1.setIndex("1");
|
||||
|
@ -6,135 +6,147 @@
|
||||
*
|
||||
*/
|
||||
|
||||
#include <fstream>
|
||||
#include "FeatureArray.h"
|
||||
#include "FileStream.h"
|
||||
#include "Util.h"
|
||||
|
||||
|
||||
FeatureArray::FeatureArray()
|
||||
: idx(""), number_of_features(0), _sparse_flag(false) {}
|
||||
: m_index(""), m_num_features(0), m_sparse_flag(false) {}
|
||||
|
||||
FeatureArray::~FeatureArray() {}
|
||||
|
||||
void FeatureArray::savetxt(std::ofstream& outFile)
|
||||
void FeatureArray::savetxt(ostream* os)
|
||||
{
|
||||
outFile << FEATURES_TXT_BEGIN << " " << idx << " " << array_.size()
|
||||
<< " " << number_of_features << " " << features << std::endl;
|
||||
for (featarray_t::iterator i = array_.begin(); i !=array_.end(); i++) {
|
||||
i->savetxt(outFile);
|
||||
outFile << std::endl;
|
||||
*os << FEATURES_TXT_BEGIN << " " << m_index << " " << m_array.size()
|
||||
<< " " << m_num_features << " " << m_features << endl;
|
||||
for (featarray_t::iterator i = m_array.begin(); i != m_array.end(); ++i) {
|
||||
i->savetxt(os);
|
||||
*os << endl;
|
||||
}
|
||||
outFile << FEATURES_TXT_END << std::endl;
|
||||
*os << FEATURES_TXT_END << endl;
|
||||
}
|
||||
|
||||
void FeatureArray::savebin(std::ofstream& outFile)
|
||||
void FeatureArray::savebin(ostream* os)
|
||||
{
|
||||
outFile << FEATURES_BIN_BEGIN << " " << idx << " " << array_.size()
|
||||
<< " " << number_of_features << " " << features << std::endl;
|
||||
for (featarray_t::iterator i = array_.begin(); i !=array_.end(); i++)
|
||||
i->savebin(outFile);
|
||||
*os << FEATURES_BIN_BEGIN << " " << m_index << " " << m_array.size()
|
||||
<< " " << m_num_features << " " << m_features << endl;
|
||||
for (featarray_t::iterator i = m_array.begin(); i != m_array.end(); ++i)
|
||||
i->savebin(os);
|
||||
|
||||
outFile << FEATURES_BIN_END << std::endl;
|
||||
*os << FEATURES_BIN_END << endl;
|
||||
}
|
||||
|
||||
|
||||
void FeatureArray::save(std::ofstream& inFile, bool bin)
|
||||
void FeatureArray::save(ostream* os, bool bin)
|
||||
{
|
||||
if (size()>0)
|
||||
(bin)?savebin(inFile):savetxt(inFile);
|
||||
if (size() <= 0) return;
|
||||
if (bin) {
|
||||
savebin(os);
|
||||
} else {
|
||||
savetxt(os);
|
||||
}
|
||||
}
|
||||
|
||||
void FeatureArray::save(const std::string &file, bool bin)
|
||||
void FeatureArray::save(const string &file, bool bin)
|
||||
{
|
||||
|
||||
std::ofstream outFile(file.c_str(), std::ios::out); // matches a stream with a file. Opens the file
|
||||
|
||||
save(outFile);
|
||||
|
||||
outFile.close();
|
||||
ofstream ofs(file.c_str(), ios::out);
|
||||
if (!ofs) {
|
||||
cerr << "Failed to open " << file << endl;
|
||||
exit(1);
|
||||
}
|
||||
ostream *os = &ofs;
|
||||
save(os, bin);
|
||||
ofs.close();
|
||||
}
|
||||
|
||||
void FeatureArray::loadbin(ifstream& inFile, size_t n)
|
||||
void FeatureArray::save(bool bin)
|
||||
{
|
||||
FeatureStats entry(number_of_features);
|
||||
save(&cout, bin);
|
||||
}
|
||||
|
||||
for (size_t i=0 ; i < n; i++) {
|
||||
entry.loadbin(inFile);
|
||||
void FeatureArray::loadbin(istream* is, size_t n)
|
||||
{
|
||||
FeatureStats entry(m_num_features);
|
||||
for (size_t i = 0 ; i < n; i++) {
|
||||
entry.loadbin(is);
|
||||
add(entry);
|
||||
}
|
||||
}
|
||||
|
||||
void FeatureArray::loadtxt(ifstream& inFile, size_t n)
|
||||
void FeatureArray::loadtxt(istream* is, size_t n)
|
||||
{
|
||||
FeatureStats entry(number_of_features);
|
||||
FeatureStats entry(m_num_features);
|
||||
|
||||
for (size_t i=0 ; i < n; i++) {
|
||||
entry.loadtxt(inFile);
|
||||
for (size_t i = 0; i < n; i++) {
|
||||
entry.loadtxt(is);
|
||||
add(entry);
|
||||
if (entry.getSparse().size()>0)
|
||||
_sparse_flag = true;
|
||||
m_sparse_flag = true;
|
||||
}
|
||||
}
|
||||
|
||||
void FeatureArray::load(ifstream& inFile)
|
||||
void FeatureArray::load(istream* is)
|
||||
{
|
||||
size_t number_of_entries=0;
|
||||
bool binmode=false;
|
||||
size_t number_of_entries = 0;
|
||||
bool binmode = false;
|
||||
|
||||
std::string substring, stringBuf;
|
||||
std::string::size_type loc;
|
||||
string substring, stringBuf;
|
||||
string::size_type loc;
|
||||
|
||||
std::getline(inFile, stringBuf);
|
||||
if (!inFile.good()) {
|
||||
getline(*is, stringBuf);
|
||||
if (!is->good()) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (!stringBuf.empty()) {
|
||||
if ((loc = stringBuf.find(FEATURES_TXT_BEGIN)) == 0) {
|
||||
binmode=false;
|
||||
binmode = false;
|
||||
} else if ((loc = stringBuf.find(FEATURES_BIN_BEGIN)) == 0) {
|
||||
binmode=true;
|
||||
binmode = true;
|
||||
} else {
|
||||
TRACE_ERR("ERROR: FeatureArray::load(): Wrong header");
|
||||
return;
|
||||
}
|
||||
getNextPound(stringBuf, substring);
|
||||
getNextPound(stringBuf, substring);
|
||||
idx = substring;
|
||||
m_index = substring;
|
||||
getNextPound(stringBuf, substring);
|
||||
number_of_entries = atoi(substring.c_str());
|
||||
getNextPound(stringBuf, substring);
|
||||
number_of_features = atoi(substring.c_str());
|
||||
features = stringBuf;
|
||||
m_num_features = atoi(substring.c_str());
|
||||
m_features = stringBuf;
|
||||
}
|
||||
|
||||
(binmode)?loadbin(inFile, number_of_entries):loadtxt(inFile, number_of_entries);
|
||||
if (binmode) {
|
||||
loadbin(is, number_of_entries);
|
||||
} else {
|
||||
loadtxt(is, number_of_entries);
|
||||
}
|
||||
|
||||
std::getline(inFile, stringBuf);
|
||||
getline(*is, stringBuf);
|
||||
if (!stringBuf.empty()) {
|
||||
if ((loc = stringBuf.find(FEATURES_TXT_END)) != 0 && (loc = stringBuf.find(FEATURES_BIN_END)) != 0) {
|
||||
if ((loc = stringBuf.find(FEATURES_TXT_END)) != 0 &&
|
||||
(loc = stringBuf.find(FEATURES_BIN_END)) != 0) {
|
||||
TRACE_ERR("ERROR: FeatureArray::load(): Wrong footer");
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void FeatureArray::load(const std::string &file)
|
||||
void FeatureArray::load(const string &file)
|
||||
{
|
||||
TRACE_ERR("loading data from " << file << std::endl);
|
||||
|
||||
inputfilestream inFile(file); // matches a stream with a file. Opens the file
|
||||
|
||||
load((ifstream&) inFile);
|
||||
|
||||
inFile.close();
|
||||
|
||||
TRACE_ERR("loading data from " << file << endl);
|
||||
inputfilestream input_stream(file); // matches a stream with a file. Opens the file
|
||||
istream* is = &input_stream;
|
||||
load(is);
|
||||
input_stream.close();
|
||||
}
|
||||
|
||||
void FeatureArray::merge(FeatureArray& e)
|
||||
{
|
||||
//dummy implementation
|
||||
for (size_t i=0; i<e.size(); i++)
|
||||
for (size_t i = 0; i < e.size(); i++)
|
||||
add(e.get(i));
|
||||
}
|
||||
|
||||
@ -144,10 +156,9 @@ bool FeatureArray::check_consistency() const
|
||||
if (sz == 0)
|
||||
return true;
|
||||
|
||||
for (featarray_t::const_iterator i = array_.begin(); i != array_.end(); i++) {
|
||||
for (featarray_t::const_iterator i = m_array.begin(); i != m_array.end(); i++) {
|
||||
if (i->size() != sz)
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@ -11,7 +11,6 @@
|
||||
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
#include <fstream>
|
||||
#include "FeatureStats.h"
|
||||
|
||||
using namespace std;
|
||||
@ -26,82 +25,57 @@ class FeatureArray
|
||||
private:
|
||||
// idx to identify the utterance. It can differ from
|
||||
// the index inside the vector.
|
||||
std::string idx;
|
||||
|
||||
protected:
|
||||
featarray_t array_;
|
||||
size_t number_of_features;
|
||||
std::string features;
|
||||
bool _sparse_flag;
|
||||
std::string m_index;
|
||||
featarray_t m_array;
|
||||
size_t m_num_features;
|
||||
std::string m_features;
|
||||
bool m_sparse_flag;
|
||||
|
||||
public:
|
||||
FeatureArray();
|
||||
~FeatureArray();
|
||||
|
||||
inline void clear() {
|
||||
array_.clear();
|
||||
}
|
||||
void clear() { m_array.clear(); }
|
||||
|
||||
inline bool hasSparseFeatures() const {
|
||||
return _sparse_flag;
|
||||
}
|
||||
bool hasSparseFeatures() const { return m_sparse_flag; }
|
||||
|
||||
inline std::string getIndex() const {
|
||||
return idx;
|
||||
}
|
||||
inline void setIndex(const std::string& value) {
|
||||
idx = value;
|
||||
}
|
||||
std::string getIndex() const { return m_index; }
|
||||
void setIndex(const std::string& value) { m_index = value; }
|
||||
|
||||
inline FeatureStats& get(size_t i) {
|
||||
return array_.at(i);
|
||||
}
|
||||
inline const FeatureStats& get(size_t i)const {
|
||||
return array_.at(i);
|
||||
}
|
||||
void add(FeatureStats& e) {
|
||||
array_.push_back(e);
|
||||
}
|
||||
FeatureStats& get(size_t i) { return m_array.at(i); }
|
||||
const FeatureStats& get(size_t i) const { return m_array.at(i); }
|
||||
|
||||
void add(FeatureStats& e) { m_array.push_back(e); }
|
||||
|
||||
//ADDED BY TS
|
||||
void swap(size_t i, size_t j) {
|
||||
std::swap(array_[i],array_[j]);
|
||||
std::swap(m_array[i], m_array[j]);
|
||||
}
|
||||
|
||||
|
||||
void resize(size_t new_size) {
|
||||
array_.resize(std::min(new_size,array_.size()));
|
||||
m_array.resize(std::min(new_size, m_array.size()));
|
||||
}
|
||||
//END_ADDED
|
||||
|
||||
void merge(FeatureArray& e);
|
||||
|
||||
inline size_t size() const {
|
||||
return array_.size();
|
||||
}
|
||||
inline size_t NumberOfFeatures() const {
|
||||
return number_of_features;
|
||||
}
|
||||
inline void NumberOfFeatures(size_t v) {
|
||||
number_of_features = v;
|
||||
}
|
||||
inline std::string Features() const {
|
||||
return features;
|
||||
}
|
||||
inline void Features(const std::string& f) {
|
||||
features = f;
|
||||
}
|
||||
size_t size() const { return m_array.size(); }
|
||||
|
||||
void savetxt(ofstream& outFile);
|
||||
void savebin(ofstream& outFile);
|
||||
void save(ofstream& outFile, bool bin=false);
|
||||
size_t NumberOfFeatures() const { return m_num_features; }
|
||||
void NumberOfFeatures(size_t v) { m_num_features = v; }
|
||||
|
||||
std::string Features() const { return m_features; }
|
||||
void Features(const std::string& f) { m_features = f; }
|
||||
|
||||
void savetxt(std::ostream* os);
|
||||
void savebin(std::ostream* os);
|
||||
void save(std::ostream* os, bool bin=false);
|
||||
void save(const std::string &file, bool bin=false);
|
||||
inline void save(bool bin=false) {
|
||||
save("/dev/stdout",bin);
|
||||
}
|
||||
void save(bool bin=false);
|
||||
|
||||
void loadtxt(ifstream& inFile, size_t n);
|
||||
void loadbin(ifstream& inFile, size_t n);
|
||||
void load(ifstream& inFile);
|
||||
void loadtxt(std::istream* is, size_t n);
|
||||
void loadbin(std::istream* is, size_t n);
|
||||
void load(std::istream* is);
|
||||
void load(const std::string &file);
|
||||
|
||||
bool check_consistency() const;
|
||||
|
@ -13,44 +13,45 @@
|
||||
#include "Util.h"
|
||||
#include <cstdio>
|
||||
|
||||
static const float MIN_FLOAT=-1.0*numeric_limits<float>::max();
|
||||
static const float MAX_FLOAT=numeric_limits<float>::max();
|
||||
static const float MIN_FLOAT = -1.0 * numeric_limits<float>::max();
|
||||
static const float MAX_FLOAT = numeric_limits<float>::max();
|
||||
|
||||
FeatureData::FeatureData()
|
||||
: number_of_features(0),
|
||||
_sparse_flag(false) {}
|
||||
: m_num_features(0),
|
||||
m_sparse_flag(false) {}
|
||||
|
||||
void FeatureData::save(std::ofstream& outFile, bool bin)
|
||||
void FeatureData::save(ostream* os, bool bin)
|
||||
{
|
||||
for (featdata_t::iterator i = array_.begin(); i !=array_.end(); i++)
|
||||
i->save(outFile, bin);
|
||||
for (featdata_t::iterator i = m_array.begin(); i != m_array.end(); i++)
|
||||
i->save(os, bin);
|
||||
}
|
||||
|
||||
void FeatureData::save(const std::string &file, bool bin)
|
||||
void FeatureData::save(const string &file, bool bin)
|
||||
{
|
||||
if (file.empty()) return;
|
||||
|
||||
TRACE_ERR("saving the array into " << file << std::endl);
|
||||
|
||||
std::ofstream outFile(file.c_str(), std::ios::out); // matches a stream with a file. Opens the file
|
||||
|
||||
save(outFile, bin);
|
||||
|
||||
outFile.close();
|
||||
TRACE_ERR("saving the array into " << file << endl);
|
||||
ofstream ofs(file.c_str(), ios::out); // matches a stream with a file. Opens the file
|
||||
ostream* os = &ofs;
|
||||
save(os, bin);
|
||||
ofs.close();
|
||||
}
|
||||
|
||||
void FeatureData::load(ifstream& inFile)
|
||||
void FeatureData::save(bool bin) {
|
||||
save(&cout, bin);
|
||||
}
|
||||
|
||||
void FeatureData::load(istream* is)
|
||||
{
|
||||
FeatureArray entry;
|
||||
|
||||
while (!inFile.eof()) {
|
||||
while (!is->eof()) {
|
||||
|
||||
if (!inFile.good()) {
|
||||
std::cerr << "ERROR FeatureData::load inFile.good()" << std::endl;
|
||||
if (!is->good()) {
|
||||
cerr << "ERROR FeatureData::load inFile.good()" << endl;
|
||||
}
|
||||
|
||||
entry.clear();
|
||||
entry.load(inFile);
|
||||
entry.load(is);
|
||||
|
||||
if (entry.size() == 0)
|
||||
break;
|
||||
@ -59,26 +60,23 @@ void FeatureData::load(ifstream& inFile)
|
||||
setFeatureMap(entry.Features());
|
||||
|
||||
if (entry.hasSparseFeatures())
|
||||
_sparse_flag = true;
|
||||
m_sparse_flag = true;
|
||||
|
||||
add(entry);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void FeatureData::load(const std::string &file)
|
||||
void FeatureData::load(const string &file)
|
||||
{
|
||||
TRACE_ERR("loading feature data from " << file << std::endl);
|
||||
|
||||
inputfilestream inFile(file); // matches a stream with a file. Opens the file
|
||||
|
||||
if (!inFile) {
|
||||
TRACE_ERR("loading feature data from " << file << endl);
|
||||
inputfilestream input_stream(file); // matches a stream with a file. Opens the file
|
||||
if (!input_stream) {
|
||||
throw runtime_error("Unable to open feature file: " + file);
|
||||
}
|
||||
|
||||
load((ifstream&) inFile);
|
||||
|
||||
inFile.close();
|
||||
istream* is = &input_stream;
|
||||
load(is);
|
||||
input_stream.close();
|
||||
}
|
||||
|
||||
void FeatureData::add(FeatureArray& e)
|
||||
@ -86,25 +84,25 @@ void FeatureData::add(FeatureArray& e)
|
||||
if (exists(e.getIndex())) { // array at position e.getIndex() already exists
|
||||
//enlarge array at position e.getIndex()
|
||||
size_t pos = getIndex(e.getIndex());
|
||||
array_.at(pos).merge(e);
|
||||
m_array.at(pos).merge(e);
|
||||
} else {
|
||||
array_.push_back(e);
|
||||
m_array.push_back(e);
|
||||
setIndex();
|
||||
}
|
||||
}
|
||||
|
||||
void FeatureData::add(FeatureStats& e, const std::string& sent_idx)
|
||||
void FeatureData::add(FeatureStats& e, const string& sent_idx)
|
||||
{
|
||||
if (exists(sent_idx)) { // array at position e.getIndex() already exists
|
||||
//enlarge array at position e.getIndex()
|
||||
size_t pos = getIndex(sent_idx);
|
||||
// TRACE_ERR("Inserting " << e << " in array " << sent_idx << std::endl);
|
||||
array_.at(pos).add(e);
|
||||
m_array.at(pos).add(e);
|
||||
} else {
|
||||
// TRACE_ERR("Creating a new entry in the array and inserting " << e << std::endl);
|
||||
FeatureArray a;
|
||||
a.NumberOfFeatures(number_of_features);
|
||||
a.Features(features);
|
||||
a.NumberOfFeatures(m_num_features);
|
||||
a.Features(m_features);
|
||||
a.setIndex(sent_idx);
|
||||
a.add(e);
|
||||
add(a);
|
||||
@ -113,10 +111,10 @@ void FeatureData::add(FeatureStats& e, const std::string& sent_idx)
|
||||
|
||||
bool FeatureData::check_consistency() const
|
||||
{
|
||||
if (array_.size() == 0)
|
||||
if (m_array.size() == 0)
|
||||
return true;
|
||||
|
||||
for (featdata_t::const_iterator i = array_.begin(); i != array_.end(); i++)
|
||||
for (featdata_t::const_iterator i = m_array.begin(); i != m_array.end(); i++)
|
||||
if (!i->check_consistency()) return false;
|
||||
|
||||
return true;
|
||||
@ -125,26 +123,26 @@ bool FeatureData::check_consistency() const
|
||||
void FeatureData::setIndex()
|
||||
{
|
||||
size_t j=0;
|
||||
for (featdata_t::iterator i = array_.begin(); i !=array_.end(); i++) {
|
||||
idx2arrayname_[j]=(*i).getIndex();
|
||||
arrayname2idx_[(*i).getIndex()] = j;
|
||||
for (featdata_t::iterator i = m_array.begin(); i !=m_array.end(); i++) {
|
||||
m_index_to_array_name[j]=(*i).getIndex();
|
||||
m_array_name_to_index[(*i).getIndex()] = j;
|
||||
j++;
|
||||
}
|
||||
}
|
||||
|
||||
void FeatureData::setFeatureMap(const std::string& feat)
|
||||
void FeatureData::setFeatureMap(const string& feat)
|
||||
{
|
||||
number_of_features = 0;
|
||||
features = feat;
|
||||
m_num_features = 0;
|
||||
m_features = feat;
|
||||
|
||||
vector<string> buf;
|
||||
Tokenize(feat.c_str(), ' ', &buf);
|
||||
for (vector<string>::const_iterator it = buf.begin();
|
||||
it != buf.end(); ++it) {
|
||||
const size_t size = idx2featname_.size();
|
||||
featname2idx_[*it] = size;
|
||||
idx2featname_[size] = *it;
|
||||
++number_of_features;
|
||||
const size_t size = m_index_to_feature_name.size();
|
||||
m_feature_name_to_index[*it] = size;
|
||||
m_index_to_feature_name[size] = *it;
|
||||
++m_num_features;
|
||||
}
|
||||
}
|
||||
|
||||
@ -152,26 +150,23 @@ string FeatureData::ToString() const {
|
||||
string res;
|
||||
char buf[100];
|
||||
|
||||
snprintf(buf, sizeof(buf), "number of features: %lu, ", number_of_features);
|
||||
snprintf(buf, sizeof(buf), "number of features: %lu, ", m_num_features);
|
||||
res.append(buf);
|
||||
|
||||
snprintf(buf, sizeof(buf), "features: ");
|
||||
res.append(buf);
|
||||
res.append(features);
|
||||
res.append("features: ");
|
||||
res.append(m_features);
|
||||
|
||||
snprintf(buf, sizeof(buf), ", sparse flag: %s, ", (_sparse_flag) ? "yes" : "no");
|
||||
snprintf(buf, sizeof(buf), ", sparse flag: %s, ", (m_sparse_flag) ? "yes" : "no");
|
||||
res.append(buf);
|
||||
|
||||
snprintf(buf, sizeof(buf), "feature_id_map = { ");
|
||||
res.append(buf);
|
||||
for (map<string, size_t>::const_iterator it = featname2idx_.begin();
|
||||
it != featname2idx_.end(); ++it) {
|
||||
res.append("feature_id_map = { ");
|
||||
for (map<string, size_t>::const_iterator it = m_feature_name_to_index.begin();
|
||||
it != m_feature_name_to_index.end(); ++it) {
|
||||
snprintf(buf, sizeof(buf), "%s => %lu, ",
|
||||
it->first.c_str(), it->second);
|
||||
res.append(buf);
|
||||
}
|
||||
snprintf(buf, sizeof(buf), "}");
|
||||
res.append(buf);
|
||||
res.append("}");
|
||||
|
||||
return res;
|
||||
}
|
||||
|
@ -19,109 +19,92 @@ using namespace std;
|
||||
class FeatureData
|
||||
{
|
||||
private:
|
||||
size_t number_of_features;
|
||||
std::string features;
|
||||
bool _sparse_flag;
|
||||
|
||||
map<std::string, size_t> featname2idx_; // map from name to index of features
|
||||
map<size_t, std::string> idx2featname_; // map from index to name of features
|
||||
|
||||
protected:
|
||||
featdata_t array_;
|
||||
idx2name idx2arrayname_; // map from index to name of array
|
||||
name2idx arrayname2idx_; // map from name to index of array
|
||||
size_t m_num_features;
|
||||
std::string m_features;
|
||||
bool m_sparse_flag;
|
||||
map<std::string, size_t> m_feature_name_to_index; // map from name to index of features
|
||||
map<size_t, std::string> m_index_to_feature_name; // map from index to name of features
|
||||
featdata_t m_array;
|
||||
idx2name m_index_to_array_name; // map from index to name of array
|
||||
name2idx m_array_name_to_index; // map from name to index of array
|
||||
|
||||
public:
|
||||
FeatureData();
|
||||
~FeatureData() {}
|
||||
|
||||
inline void clear() {
|
||||
array_.clear();
|
||||
void clear() { m_array.clear(); }
|
||||
|
||||
bool hasSparseFeatures() const { return m_sparse_flag; }
|
||||
|
||||
FeatureArray get(const std::string& idx) {
|
||||
return m_array.at(getIndex(idx));
|
||||
}
|
||||
|
||||
inline bool hasSparseFeatures() const {
|
||||
return _sparse_flag;
|
||||
}
|
||||
inline FeatureArray get(const std::string& idx) {
|
||||
return array_.at(getIndex(idx));
|
||||
}
|
||||
inline FeatureArray& get(size_t idx) {
|
||||
return array_.at(idx);
|
||||
}
|
||||
inline const FeatureArray& get(size_t idx) const {
|
||||
return array_.at(idx);
|
||||
}
|
||||
FeatureArray& get(size_t idx) { return m_array.at(idx); }
|
||||
const FeatureArray& get(size_t idx) const { return m_array.at(idx); }
|
||||
|
||||
inline bool exists(const std::string& sent_idx) const {
|
||||
return exists(getIndex(sent_idx));
|
||||
}
|
||||
|
||||
inline bool exists(int sent_idx) const {
|
||||
return (sent_idx > -1 && sent_idx < static_cast<int>(array_.size())) ? true : false;
|
||||
return (sent_idx > -1 && sent_idx < static_cast<int>(m_array.size())) ? true : false;
|
||||
}
|
||||
|
||||
inline FeatureStats& get(size_t i, size_t j) {
|
||||
return array_.at(i).get(j);
|
||||
return m_array.at(i).get(j);
|
||||
}
|
||||
inline const FeatureStats& get(size_t i, size_t j) const {
|
||||
return array_.at(i).get(j);
|
||||
|
||||
inline const FeatureStats& get(size_t i, size_t j) const {
|
||||
return m_array.at(i).get(j);
|
||||
}
|
||||
|
||||
void add(FeatureArray& e);
|
||||
void add(FeatureStats& e, const std::string& sent_idx);
|
||||
|
||||
inline size_t size() const {
|
||||
return array_.size();
|
||||
}
|
||||
inline size_t NumberOfFeatures() const {
|
||||
return number_of_features;
|
||||
}
|
||||
inline void NumberOfFeatures(size_t v) {
|
||||
number_of_features = v;
|
||||
}
|
||||
inline std::string Features() const {
|
||||
return features;
|
||||
}
|
||||
inline void Features(const std::string& f) {
|
||||
features = f;
|
||||
}
|
||||
size_t size() const { return m_array.size(); }
|
||||
|
||||
size_t NumberOfFeatures() const { return m_num_features; }
|
||||
void NumberOfFeatures(size_t v) { m_num_features = v; }
|
||||
|
||||
std::string Features() const { return m_features; }
|
||||
void Features(const std::string& f) { m_features = f; }
|
||||
|
||||
void save(const std::string &file, bool bin=false);
|
||||
void save(ofstream& outFile, bool bin=false);
|
||||
inline void save(bool bin=false) {
|
||||
save("/dev/stdout", bin);
|
||||
}
|
||||
void save(std::ostream* os, bool bin=false);
|
||||
void save(bool bin=false);
|
||||
|
||||
void load(ifstream& inFile);
|
||||
void load(std::istream* is);
|
||||
void load(const std::string &file);
|
||||
|
||||
bool check_consistency() const;
|
||||
|
||||
void setIndex();
|
||||
|
||||
inline int getIndex(const std::string& idx) const {
|
||||
name2idx::const_iterator i = arrayname2idx_.find(idx);
|
||||
if (i != arrayname2idx_.end())
|
||||
name2idx::const_iterator i = m_array_name_to_index.find(idx);
|
||||
if (i != m_array_name_to_index.end())
|
||||
return i->second;
|
||||
else
|
||||
return -1;
|
||||
}
|
||||
|
||||
inline std::string getIndex(size_t idx) const {
|
||||
idx2name::const_iterator i = idx2arrayname_.find(idx);
|
||||
if (i != idx2arrayname_.end())
|
||||
idx2name::const_iterator i = m_index_to_array_name.find(idx);
|
||||
if (i != m_index_to_array_name.end())
|
||||
throw runtime_error("there is no entry at index " + idx);
|
||||
return i->second;
|
||||
}
|
||||
|
||||
bool existsFeatureNames() const {
|
||||
return (idx2featname_.size() > 0) ? true : false;
|
||||
return (m_index_to_feature_name.size() > 0) ? true : false;
|
||||
}
|
||||
|
||||
std::string getFeatureName(size_t idx) const {
|
||||
if (idx >= idx2featname_.size())
|
||||
if (idx >= m_index_to_feature_name.size())
|
||||
throw runtime_error("Error: you required an too big index");
|
||||
map<size_t, std::string>::const_iterator it = idx2featname_.find(idx);
|
||||
if (it == idx2featname_.end()) {
|
||||
map<size_t, std::string>::const_iterator it = m_index_to_feature_name.find(idx);
|
||||
if (it == m_index_to_feature_name.end()) {
|
||||
throw runtime_error("Error: specified id is unknown: " + idx);
|
||||
} else {
|
||||
return it->second;
|
||||
@ -129,8 +112,8 @@ public:
|
||||
}
|
||||
|
||||
size_t getFeatureIndex(const std::string& name) const {
|
||||
map<std::string, size_t>::const_iterator it = featname2idx_.find(name);
|
||||
if (it == featname2idx_.end())
|
||||
map<std::string, size_t>::const_iterator it = m_feature_name_to_index.find(name);
|
||||
if (it == m_feature_name_to_index.end())
|
||||
throw runtime_error("Error: feature " + name + " is unknown");
|
||||
return it->second;
|
||||
}
|
||||
|
@ -8,6 +8,7 @@
|
||||
|
||||
#include "FeatureStats.h"
|
||||
|
||||
#include <fstream>
|
||||
#include <cmath>
|
||||
#include "Util.h"
|
||||
|
||||
@ -15,58 +16,58 @@ namespace {
|
||||
const int kAvailableSize = 8;
|
||||
} // namespace
|
||||
|
||||
SparseVector::name2id_t SparseVector::name2id_;
|
||||
SparseVector::id2name_t SparseVector::id2name_;
|
||||
SparseVector::name2id_t SparseVector::m_name_to_id;
|
||||
SparseVector::id2name_t SparseVector::m_id_to_name;
|
||||
|
||||
FeatureStatsType SparseVector::get(const string& name) const {
|
||||
name2id_t::const_iterator name2id_iter = name2id_.find(name);
|
||||
if (name2id_iter == name2id_.end()) return 0;
|
||||
name2id_t::const_iterator name2id_iter = m_name_to_id.find(name);
|
||||
if (name2id_iter == m_name_to_id.end()) return 0;
|
||||
size_t id = name2id_iter->second;
|
||||
return get(id);
|
||||
}
|
||||
|
||||
FeatureStatsType SparseVector::get(size_t id) const {
|
||||
fvector_t::const_iterator fvector_iter = fvector_.find(id);
|
||||
if (fvector_iter == fvector_.end()) return 0;
|
||||
fvector_t::const_iterator fvector_iter = m_fvector.find(id);
|
||||
if (fvector_iter == m_fvector.end()) return 0;
|
||||
return fvector_iter->second;
|
||||
}
|
||||
|
||||
void SparseVector::set(const string& name, FeatureStatsType value) {
|
||||
name2id_t::const_iterator name2id_iter = name2id_.find(name);
|
||||
name2id_t::const_iterator name2id_iter = m_name_to_id.find(name);
|
||||
size_t id = 0;
|
||||
if (name2id_iter == name2id_.end()) {
|
||||
id = id2name_.size();
|
||||
id2name_.push_back(name);
|
||||
name2id_[name] = id;
|
||||
if (name2id_iter == m_name_to_id.end()) {
|
||||
id = m_id_to_name.size();
|
||||
m_id_to_name.push_back(name);
|
||||
m_name_to_id[name] = id;
|
||||
} else {
|
||||
id = name2id_iter->second;
|
||||
}
|
||||
fvector_[id] = value;
|
||||
m_fvector[id] = value;
|
||||
}
|
||||
|
||||
void SparseVector::write(ostream& out, const string& sep) const {
|
||||
for (fvector_t::const_iterator i = fvector_.begin(); i != fvector_.end(); ++i) {
|
||||
for (fvector_t::const_iterator i = m_fvector.begin(); i != m_fvector.end(); ++i) {
|
||||
if (abs(i->second) < 0.00001) continue;
|
||||
string name = id2name_[i->first];
|
||||
string name = m_id_to_name[i->first];
|
||||
out << name << sep << i->second << " ";
|
||||
}
|
||||
}
|
||||
|
||||
void SparseVector::clear() {
|
||||
fvector_.clear();
|
||||
m_fvector.clear();
|
||||
}
|
||||
|
||||
SparseVector& SparseVector::operator-=(const SparseVector& rhs) {
|
||||
//All the elements that have values in *this
|
||||
for (fvector_t::iterator i = fvector_.begin(); i != fvector_.end(); ++i) {
|
||||
fvector_[i->first] = i->second - rhs.get(i->first);
|
||||
for (fvector_t::iterator i = m_fvector.begin(); i != m_fvector.end(); ++i) {
|
||||
m_fvector[i->first] = i->second - rhs.get(i->first);
|
||||
}
|
||||
|
||||
//Any elements in rhs, that have no value in *this
|
||||
for (fvector_t::const_iterator i = rhs.fvector_.begin();
|
||||
i != rhs.fvector_.end(); ++i) {
|
||||
if (fvector_.find(i->first) == fvector_.end()) {
|
||||
fvector_[i->first] = -(i->second);
|
||||
for (fvector_t::const_iterator i = rhs.m_fvector.begin();
|
||||
i != rhs.m_fvector.end(); ++i) {
|
||||
if (m_fvector.find(i->first) == m_fvector.end()) {
|
||||
m_fvector[i->first] = -(i->second);
|
||||
}
|
||||
}
|
||||
return *this;
|
||||
@ -79,37 +80,37 @@ SparseVector operator-(const SparseVector& lhs, const SparseVector& rhs) {
|
||||
}
|
||||
|
||||
FeatureStats::FeatureStats()
|
||||
: available_(kAvailableSize), entries_(0),
|
||||
array_(new FeatureStatsType[available_]) {}
|
||||
: m_available_size(kAvailableSize), m_entries(0),
|
||||
m_array(new FeatureStatsType[m_available_size]) {}
|
||||
|
||||
FeatureStats::FeatureStats(const size_t size)
|
||||
: available_(size), entries_(size),
|
||||
array_(new FeatureStatsType[available_])
|
||||
: m_available_size(size), m_entries(size),
|
||||
m_array(new FeatureStatsType[m_available_size])
|
||||
{
|
||||
memset(array_, 0, GetArraySizeWithBytes());
|
||||
memset(m_array, 0, GetArraySizeWithBytes());
|
||||
}
|
||||
|
||||
FeatureStats::FeatureStats(std::string &theString)
|
||||
: available_(0), entries_(0), array_(NULL)
|
||||
FeatureStats::FeatureStats(string &theString)
|
||||
: m_available_size(0), m_entries(0), m_array(NULL)
|
||||
{
|
||||
set(theString);
|
||||
}
|
||||
|
||||
FeatureStats::~FeatureStats()
|
||||
{
|
||||
if (array_) {
|
||||
delete [] array_;
|
||||
array_ = NULL;
|
||||
if (m_array) {
|
||||
delete [] m_array;
|
||||
m_array = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
void FeatureStats::Copy(const FeatureStats &stats)
|
||||
{
|
||||
available_ = stats.available();
|
||||
entries_ = stats.size();
|
||||
array_ = new FeatureStatsType[available_];
|
||||
memcpy(array_, stats.getArray(), GetArraySizeWithBytes());
|
||||
map_ = stats.getSparse();
|
||||
m_available_size = stats.available();
|
||||
m_entries = stats.size();
|
||||
m_array = new FeatureStatsType[m_available_size];
|
||||
memcpy(m_array, stats.getArray(), GetArraySizeWithBytes());
|
||||
m_map = stats.getSparse();
|
||||
}
|
||||
|
||||
FeatureStats::FeatureStats(const FeatureStats &stats)
|
||||
@ -119,34 +120,34 @@ FeatureStats::FeatureStats(const FeatureStats &stats)
|
||||
|
||||
FeatureStats& FeatureStats::operator=(const FeatureStats &stats)
|
||||
{
|
||||
delete [] array_;
|
||||
delete [] m_array;
|
||||
Copy(stats);
|
||||
return *this;
|
||||
}
|
||||
|
||||
void FeatureStats::expand()
|
||||
{
|
||||
available_ *= 2;
|
||||
featstats_t t_ = new FeatureStatsType[available_];
|
||||
memcpy(t_, array_, GetArraySizeWithBytes());
|
||||
delete [] array_;
|
||||
array_ = t_;
|
||||
m_available_size *= 2;
|
||||
featstats_t t_ = new FeatureStatsType[m_available_size];
|
||||
memcpy(t_, m_array, GetArraySizeWithBytes());
|
||||
delete [] m_array;
|
||||
m_array = t_;
|
||||
}
|
||||
|
||||
void FeatureStats::add(FeatureStatsType v)
|
||||
{
|
||||
if (isfull()) expand();
|
||||
array_[entries_++]=v;
|
||||
m_array[m_entries++]=v;
|
||||
}
|
||||
|
||||
void FeatureStats::addSparse(const string& name, FeatureStatsType v)
|
||||
{
|
||||
map_.set(name,v);
|
||||
m_map.set(name,v);
|
||||
}
|
||||
|
||||
void FeatureStats::set(std::string &theString)
|
||||
void FeatureStats::set(string &theString)
|
||||
{
|
||||
std::string substring, stringBuf;
|
||||
string substring, stringBuf;
|
||||
reset();
|
||||
|
||||
while (!theString.empty()) {
|
||||
@ -163,48 +164,50 @@ void FeatureStats::set(std::string &theString)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void FeatureStats::loadbin(std::ifstream& inFile)
|
||||
void FeatureStats::loadbin(istream* is)
|
||||
{
|
||||
inFile.read((char*) array_, GetArraySizeWithBytes());
|
||||
is->read(reinterpret_cast<char*>(m_array),
|
||||
static_cast<streamsize>(GetArraySizeWithBytes()));
|
||||
}
|
||||
|
||||
void FeatureStats::loadtxt(std::ifstream& inFile)
|
||||
void FeatureStats::loadtxt(istream* is)
|
||||
{
|
||||
std::string theString;
|
||||
std::getline(inFile, theString);
|
||||
set(theString);
|
||||
string line;
|
||||
getline(*is, line);
|
||||
set(line);
|
||||
}
|
||||
|
||||
void FeatureStats::loadtxt(const std::string &file)
|
||||
void FeatureStats::loadtxt(const string &file)
|
||||
{
|
||||
// TRACE_ERR("loading the stats from " << file << std::endl);
|
||||
|
||||
std::ifstream inFile(file.c_str(), std::ios::in); // matches a stream with a file. Opens the file
|
||||
|
||||
loadtxt(inFile);
|
||||
ifstream ifs(file.c_str(), ios::in);
|
||||
if (!ifs) {
|
||||
cerr << "Failed to open " << file << endl;
|
||||
exit(1);
|
||||
}
|
||||
istream* is = &ifs;
|
||||
loadtxt(is);
|
||||
}
|
||||
|
||||
|
||||
void FeatureStats::savetxt(const std::string &file)
|
||||
void FeatureStats::savetxt(const string &file)
|
||||
{
|
||||
// TRACE_ERR("saving the stats into " << file << std::endl);
|
||||
|
||||
std::ofstream outFile(file.c_str(), std::ios::out); // matches a stream with a file. Opens the file
|
||||
|
||||
savetxt(outFile);
|
||||
ofstream ofs(file.c_str(), ios::out);
|
||||
ostream* os = &ofs;
|
||||
savetxt(os);
|
||||
}
|
||||
|
||||
|
||||
void FeatureStats::savetxt(std::ofstream& outFile)
|
||||
void FeatureStats::savetxt(ostream* os)
|
||||
{
|
||||
// TRACE_ERR("saving the stats" << std::endl);
|
||||
outFile << *this;
|
||||
*os << *this;
|
||||
}
|
||||
|
||||
void FeatureStats::savebin(std::ofstream& outFile)
|
||||
void FeatureStats::savetxt() {
|
||||
savetxt(&cout);
|
||||
}
|
||||
|
||||
void FeatureStats::savebin(ostream* os)
|
||||
{
|
||||
outFile.write((char*) array_, GetArraySizeWithBytes());
|
||||
os->write(reinterpret_cast<char*>(m_array),
|
||||
static_cast<streamsize>(GetArraySizeWithBytes()));
|
||||
}
|
||||
|
||||
ostream& operator<<(ostream& o, const FeatureStats& e)
|
||||
@ -230,7 +233,7 @@ bool operator==(const FeatureStats& f1, const FeatureStats& f2) {
|
||||
if (f1.get(k) != f2.get(k))
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
return true;
|
||||
}
|
||||
//END_ADDED
|
||||
|
@ -10,7 +10,6 @@
|
||||
#define MERT_FEATURE_STATS_H_
|
||||
|
||||
#include <cstring>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <map>
|
||||
#include <string>
|
||||
@ -30,18 +29,16 @@ public:
|
||||
FeatureStatsType get(size_t id) const;
|
||||
void set(const std::string& name, FeatureStatsType value);
|
||||
void clear();
|
||||
size_t size() const {
|
||||
return fvector_.size();
|
||||
}
|
||||
size_t size() const { return m_fvector.size(); }
|
||||
|
||||
void write(std::ostream& out, const std::string& sep = " ") const;
|
||||
|
||||
SparseVector& operator-=(const SparseVector& rhs);
|
||||
|
||||
private:
|
||||
static name2id_t name2id_;
|
||||
static id2name_t id2name_;
|
||||
fvector_t fvector_;
|
||||
static name2id_t m_name_to_id;
|
||||
static id2name_t m_id_to_name;
|
||||
fvector_t m_fvector;
|
||||
};
|
||||
|
||||
SparseVector operator-(const SparseVector& lhs, const SparseVector& rhs);
|
||||
@ -49,12 +46,12 @@ SparseVector operator-(const SparseVector& lhs, const SparseVector& rhs);
|
||||
class FeatureStats
|
||||
{
|
||||
private:
|
||||
size_t available_;
|
||||
size_t entries_;
|
||||
size_t m_available_size;
|
||||
size_t m_entries;
|
||||
|
||||
// TODO: Use smart pointer for exceptional-safety.
|
||||
featstats_t array_;
|
||||
SparseVector map_;
|
||||
featstats_t m_array;
|
||||
SparseVector m_map;
|
||||
|
||||
public:
|
||||
FeatureStats();
|
||||
@ -69,64 +66,47 @@ public:
|
||||
|
||||
void Copy(const FeatureStats &stats);
|
||||
|
||||
bool isfull() const {
|
||||
return (entries_ < available_) ? 0 : 1;
|
||||
}
|
||||
bool isfull() const { return (m_entries < m_available_size) ? 0 : 1; }
|
||||
void expand();
|
||||
void add(FeatureStatsType v);
|
||||
void addSparse(const string& name, FeatureStatsType v);
|
||||
|
||||
void clear() {
|
||||
memset((void*)array_, 0, GetArraySizeWithBytes());
|
||||
map_.clear();
|
||||
memset((void*)m_array, 0, GetArraySizeWithBytes());
|
||||
m_map.clear();
|
||||
}
|
||||
|
||||
void reset() {
|
||||
entries_ = 0;
|
||||
m_entries = 0;
|
||||
clear();
|
||||
}
|
||||
|
||||
inline FeatureStatsType get(size_t i) {
|
||||
return array_[i];
|
||||
}
|
||||
inline FeatureStatsType get(size_t i)const {
|
||||
return array_[i];
|
||||
}
|
||||
inline featstats_t getArray() const {
|
||||
return array_;
|
||||
}
|
||||
inline const SparseVector& getSparse() const {
|
||||
return map_;
|
||||
}
|
||||
FeatureStatsType get(size_t i) { return m_array[i]; }
|
||||
FeatureStatsType get(size_t i)const { return m_array[i]; }
|
||||
featstats_t getArray() const { return m_array; }
|
||||
|
||||
const SparseVector& getSparse() const { return m_map; }
|
||||
|
||||
void set(std::string &theString);
|
||||
|
||||
inline size_t bytes() const {
|
||||
return GetArraySizeWithBytes();
|
||||
}
|
||||
inline size_t bytes() const { return GetArraySizeWithBytes(); }
|
||||
|
||||
size_t GetArraySizeWithBytes() const {
|
||||
return entries_ * sizeof(FeatureStatsType);
|
||||
return m_entries * sizeof(FeatureStatsType);
|
||||
}
|
||||
|
||||
inline size_t size() const {
|
||||
return entries_;
|
||||
}
|
||||
size_t size() const { return m_entries; }
|
||||
|
||||
inline size_t available() const {
|
||||
return available_;
|
||||
}
|
||||
size_t available() const { return m_available_size; }
|
||||
|
||||
void savetxt(const std::string &file);
|
||||
void savetxt(ofstream& outFile);
|
||||
void savebin(ofstream& outFile);
|
||||
inline void savetxt() {
|
||||
savetxt("/dev/stdout");
|
||||
}
|
||||
void savetxt(std::ostream* os);
|
||||
void savebin(std::ostream* os);
|
||||
void savetxt();
|
||||
|
||||
void loadtxt(const std::string &file);
|
||||
void loadtxt(ifstream& inFile);
|
||||
void loadbin(ifstream& inFile);
|
||||
void loadtxt(std::istream* is);
|
||||
void loadbin(std::istream* is);
|
||||
|
||||
/**
|
||||
* Write the whole object to a stream.
|
||||
|
@ -13,11 +13,11 @@ bool IsGzipFile(const std::string &filename) {
|
||||
} // namespace
|
||||
|
||||
inputfilestream::inputfilestream(const std::string &filePath)
|
||||
: std::istream(0), m_streambuf(0), is_good(false)
|
||||
: std::istream(0), m_streambuf(0), m_is_good(false)
|
||||
{
|
||||
// check if file is readable
|
||||
std::filebuf* fb = new std::filebuf();
|
||||
is_good = (fb->open(filePath.c_str(), std::ios::in) != NULL);
|
||||
m_is_good = (fb->open(filePath.c_str(), std::ios::in) != NULL);
|
||||
|
||||
if (IsGzipFile(filePath)) {
|
||||
fb->close();
|
||||
@ -40,11 +40,11 @@ void inputfilestream::close()
|
||||
}
|
||||
|
||||
outputfilestream::outputfilestream(const std::string &filePath)
|
||||
: std::ostream(0), m_streambuf(0), is_good(false)
|
||||
: std::ostream(0), m_streambuf(0), m_is_good(false)
|
||||
{
|
||||
// check if file is readable
|
||||
std::filebuf* fb = new std::filebuf();
|
||||
is_good = (fb->open(filePath.c_str(), std::ios::out) != NULL);
|
||||
m_is_good = (fb->open(filePath.c_str(), std::ios::out) != NULL);
|
||||
|
||||
if (IsGzipFile(filePath)) {
|
||||
throw runtime_error("Output to a zipped file not supported!");
|
||||
|
@ -2,6 +2,7 @@
|
||||
#define MERT_FILE_STREAM_H_
|
||||
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <streambuf>
|
||||
#include <string>
|
||||
|
||||
@ -9,13 +10,13 @@ class inputfilestream : public std::istream
|
||||
{
|
||||
protected:
|
||||
std::streambuf *m_streambuf;
|
||||
bool is_good;
|
||||
bool m_is_good;
|
||||
|
||||
public:
|
||||
explicit inputfilestream(const std::string &filePath);
|
||||
virtual ~inputfilestream();
|
||||
|
||||
bool good() const { return is_good; }
|
||||
bool good() const { return m_is_good; }
|
||||
void close();
|
||||
};
|
||||
|
||||
@ -23,13 +24,13 @@ class outputfilestream : public std::ostream
|
||||
{
|
||||
protected:
|
||||
std::streambuf *m_streambuf;
|
||||
bool is_good;
|
||||
bool m_is_good;
|
||||
|
||||
public:
|
||||
explicit outputfilestream(const std::string &filePath);
|
||||
virtual ~outputfilestream();
|
||||
|
||||
bool good() const { return is_good; }
|
||||
bool good() const { return m_is_good; }
|
||||
void close();
|
||||
};
|
||||
|
||||
|
@ -1,35 +1,36 @@
|
||||
#include "ScorerFactory.h"
|
||||
#include "InterpolatedScorer.h"
|
||||
#include "ScorerFactory.h"
|
||||
#include "Util.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
|
||||
InterpolatedScorer::InterpolatedScorer (const string& name, const string& config): Scorer(name,config)
|
||||
// TODO: This is too long. Consider creating a function for
|
||||
// initialization such as Init().
|
||||
InterpolatedScorer::InterpolatedScorer(const string& name, const string& config)
|
||||
: Scorer(name,config)
|
||||
{
|
||||
|
||||
// name would be: HAMMING,BLEU or similar
|
||||
string scorers = name;
|
||||
while (scorers.length() > 0) {
|
||||
string scorertype = "";
|
||||
getNextPound(scorers,scorertype,",");
|
||||
Scorer *theScorer=ScorerFactory::getScorer(scorertype,config);
|
||||
_scorers.push_back(theScorer);
|
||||
getNextPound(scorers, scorertype,",");
|
||||
Scorer *scorer = ScorerFactory::getScorer(scorertype,config);
|
||||
m_scorers.push_back(scorer);
|
||||
}
|
||||
if (_scorers.size() == 0) {
|
||||
if (m_scorers.size() == 0) {
|
||||
throw runtime_error("There are no scorers");
|
||||
}
|
||||
cerr << "Number of scorers: " << _scorers.size() << endl;
|
||||
cerr << "Number of scorers: " << m_scorers.size() << endl;
|
||||
|
||||
//TODO debug this
|
||||
string wtype = getConfig("weights","");
|
||||
//Default weights set to uniform ie. if two weights 0.5 each
|
||||
//weights should add to 1
|
||||
if (wtype.length() == 0) {
|
||||
float weight = 1.0/_scorers.size() ;
|
||||
float weight = 1.0 / m_scorers.size() ;
|
||||
//cout << " Default weights:" << weight << endl;
|
||||
for (size_t i = 0; i < _scorers.size(); i ++) {
|
||||
_scorerWeights.push_back(weight);
|
||||
for (size_t i = 0; i < m_scorers.size(); i ++) {
|
||||
m_scorer_weights.push_back(weight);
|
||||
}
|
||||
} else {
|
||||
float tot=0;
|
||||
@ -38,24 +39,24 @@ InterpolatedScorer::InterpolatedScorer (const string& name, const string& config
|
||||
string scoreweight = "";
|
||||
getNextPound(wtype,scoreweight,"+");
|
||||
float weight = atof(scoreweight.c_str());
|
||||
_scorerWeights.push_back(weight);
|
||||
m_scorer_weights.push_back(weight);
|
||||
tot += weight;
|
||||
//cout << " :" << weight ;
|
||||
}
|
||||
//cout << endl;
|
||||
if (tot != float(1)) {
|
||||
for (vector<float>::iterator it = _scorerWeights.begin(); it != _scorerWeights.end(); ++it)
|
||||
{
|
||||
if (tot != float(1)) { // TODO: fix this checking in terms of readability.
|
||||
for (vector<float>::iterator it = m_scorer_weights.begin();
|
||||
it != m_scorer_weights.end(); ++it) {
|
||||
*it /= tot;
|
||||
}
|
||||
}
|
||||
|
||||
if (_scorers.size() != _scorerWeights.size()) {
|
||||
if (m_scorers.size() != m_scorer_weights.size()) {
|
||||
throw runtime_error("The number of weights does not equal the number of scorers!");
|
||||
}
|
||||
}
|
||||
cerr << "The weights for the interpolated scorers are: " << endl;
|
||||
for (vector<float>::iterator it = _scorerWeights.begin(); it < _scorerWeights.end(); it++) {
|
||||
for (vector<float>::iterator it = m_scorer_weights.begin(); it < m_scorer_weights.end(); it++) {
|
||||
cerr << *it << " " ;
|
||||
}
|
||||
cerr <<endl;
|
||||
@ -65,9 +66,10 @@ void InterpolatedScorer::setScoreData(ScoreData* data)
|
||||
{
|
||||
size_t last = 0;
|
||||
m_score_data = data;
|
||||
for (ScopedVector<Scorer>::iterator itsc = _scorers.begin(); itsc!=_scorers.end(); itsc++) {
|
||||
for (ScopedVector<Scorer>::iterator itsc = m_scorers.begin();
|
||||
itsc != m_scorers.end(); ++itsc) {
|
||||
int numScoresScorer = (*itsc)->NumberOfScores();
|
||||
ScoreData* newData =new ScoreData(**itsc);
|
||||
ScoreData* newData =new ScoreData(*itsc);
|
||||
for (size_t i = 0; i < data->size(); i++) {
|
||||
ScoreArray scoreArray = data->get(i);
|
||||
ScoreArray newScoreArray;
|
||||
@ -110,14 +112,16 @@ void InterpolatedScorer::score(const candidates_t& candidates, const diffs_t& di
|
||||
{
|
||||
//cout << "*******InterpolatedScorer::score" << endl;
|
||||
size_t scorerNum = 0;
|
||||
for (ScopedVector<Scorer>::const_iterator itsc = _scorers.begin(); itsc!=_scorers.end(); itsc++) {
|
||||
for (ScopedVector<Scorer>::const_iterator itsc = m_scorers.begin();
|
||||
itsc != m_scorers.end(); ++itsc) {
|
||||
//int numScores = (*itsc)->NumberOfScores();
|
||||
statscores_t tscores;
|
||||
(*itsc)->score(candidates,diffs,tscores);
|
||||
size_t inc = 0;
|
||||
for (statscores_t::iterator itstatsc = tscores.begin(); itstatsc!=tscores.end(); itstatsc++) {
|
||||
for (statscores_t::iterator itstatsc = tscores.begin();
|
||||
itstatsc != tscores.end(); ++itstatsc) {
|
||||
//cout << "Scores " << (*itstatsc) << endl;
|
||||
float weight = _scorerWeights[scorerNum];
|
||||
float weight = m_scorer_weights[scorerNum];
|
||||
if (weight == 0) {
|
||||
stringstream msg;
|
||||
msg << "No weights for scorer" << scorerNum ;
|
||||
@ -139,7 +143,8 @@ void InterpolatedScorer::score(const candidates_t& candidates, const diffs_t& di
|
||||
|
||||
void InterpolatedScorer::setReferenceFiles(const vector<string>& referenceFiles)
|
||||
{
|
||||
for (ScopedVector<Scorer>::iterator itsc = _scorers.begin(); itsc!=_scorers.end(); itsc++) {
|
||||
for (ScopedVector<Scorer>::iterator itsc = m_scorers.begin();
|
||||
itsc != m_scorers.end(); ++itsc) {
|
||||
(*itsc)->setReferenceFiles(referenceFiles);
|
||||
}
|
||||
}
|
||||
@ -147,8 +152,9 @@ void InterpolatedScorer::setReferenceFiles(const vector<string>& referenceFiles)
|
||||
void InterpolatedScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
|
||||
{
|
||||
stringstream buff;
|
||||
int i=0;
|
||||
for (ScopedVector<Scorer>::iterator itsc = _scorers.begin(); itsc!=_scorers.end(); itsc++) {
|
||||
int i = 0;
|
||||
for (ScopedVector<Scorer>::iterator itsc = m_scorers.begin();
|
||||
itsc != m_scorers.end(); ++itsc) {
|
||||
ScoreStats tempEntry;
|
||||
(*itsc)->prepareStats(sid, text, tempEntry);
|
||||
if (i > 0) buff << " ";
|
||||
@ -167,16 +173,10 @@ void InterpolatedScorer::setFactors(const string& factors)
|
||||
vector<string> fsplit;
|
||||
split(factors, ',', fsplit);
|
||||
|
||||
if (fsplit.size() != _scorers.size()) throw runtime_error("Number of factor specifications does not equal number of interpolated scorers.");
|
||||
|
||||
for (size_t i = 0; i < _scorers.size(); ++i)
|
||||
{
|
||||
_scorers[i]->setFactors(fsplit[i]);
|
||||
if (fsplit.size() != m_scorers.size())
|
||||
throw runtime_error("Number of factor specifications does not equal number of interpolated scorers.");
|
||||
|
||||
for (size_t i = 0; i < m_scorers.size(); ++i) {
|
||||
m_scorers[i]->setFactors(fsplit[i]);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
@ -1,14 +1,6 @@
|
||||
#ifndef __INTERPOLATED_SCORER_H__
|
||||
#define __INTERPOLATED_SCORER_H__
|
||||
#ifndef MERT_INTERPOLATED_SCORER_H_
|
||||
#define MERT_INTERPOLATED_SCORER_H_
|
||||
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <iostream>
|
||||
#include <iterator>
|
||||
#include <limits>
|
||||
#include <set>
|
||||
#include <sstream>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include "Types.h"
|
||||
@ -33,12 +25,13 @@ public:
|
||||
virtual void prepareStats(size_t sid, const string& text, ScoreStats& entry);
|
||||
|
||||
virtual size_t NumberOfScores() const {
|
||||
size_t sz=0;
|
||||
for (ScopedVector<Scorer>::const_iterator itsc = _scorers.begin(); itsc != _scorers.end(); itsc++) {
|
||||
size_t sz = 0;
|
||||
for (ScopedVector<Scorer>::const_iterator itsc = m_scorers.begin();
|
||||
itsc != m_scorers.end(); ++itsc) {
|
||||
sz += (*itsc)->NumberOfScores();
|
||||
}
|
||||
return sz;
|
||||
};
|
||||
}
|
||||
|
||||
virtual void setScoreData(ScoreData* data);
|
||||
|
||||
@ -48,13 +41,13 @@ public:
|
||||
virtual void setFactors(const string& factors);
|
||||
|
||||
protected:
|
||||
ScopedVector<Scorer> _scorers;
|
||||
ScopedVector<Scorer> m_scorers;
|
||||
|
||||
// Take the ownership of the heap-allocated the objects
|
||||
// by Scorer objects.
|
||||
ScopedVector<ScoreData> m_scorers_score_data;
|
||||
|
||||
vector<float> _scorerWeights;
|
||||
vector<float> m_scorer_weights;
|
||||
};
|
||||
|
||||
#endif //__INTERPOLATED_SCORER_H
|
||||
#endif // MERT_INTERPOLATED_SCORER_H_
|
||||
|
@ -14,7 +14,8 @@
|
||||
using namespace TERCpp;
|
||||
|
||||
MergeScorer::MergeScorer(const string& config)
|
||||
: StatisticsBasedScorer("MERGE",config), kLENGTH(4) {}
|
||||
: StatisticsBasedScorer("MERGE", config) {}
|
||||
|
||||
MergeScorer::~MergeScorer() {}
|
||||
|
||||
void MergeScorer::setReferenceFiles(const vector<string>& referenceFiles)
|
||||
|
@ -13,6 +13,8 @@ using namespace std;
|
||||
class PerScorer;
|
||||
class ScoreStats;
|
||||
|
||||
const int kMergeScorerLength = 4;
|
||||
|
||||
/**
|
||||
* Merge scoring.
|
||||
*/
|
||||
@ -23,23 +25,13 @@ public:
|
||||
|
||||
virtual void setReferenceFiles(const vector<string>& referenceFiles);
|
||||
virtual void prepareStats(size_t sid, const string& text, ScoreStats& entry);
|
||||
|
||||
virtual size_t NumberOfScores() const
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
void whoami() const {
|
||||
cerr << "I AM MergeScorer" << endl;
|
||||
}
|
||||
virtual size_t NumberOfScores() const { return 0; }
|
||||
|
||||
protected:
|
||||
friend class PerScorer;
|
||||
virtual float calculateScore(const vector<int>& comps) const;
|
||||
|
||||
private:
|
||||
const int kLENGTH;
|
||||
|
||||
// no copying allowed
|
||||
MergeScorer(const MergeScorer&);
|
||||
MergeScorer& operator=(const MergeScorer&);
|
||||
|
@ -32,36 +32,25 @@ inline float intersect(float m1, float b1, float m2, float b2)
|
||||
|
||||
} // namespace
|
||||
|
||||
|
||||
void Optimizer::SetScorer(Scorer *_scorer)
|
||||
{
|
||||
scorer = _scorer;
|
||||
}
|
||||
|
||||
void Optimizer::SetFData(FeatureDataHandle _FData)
|
||||
{
|
||||
FData = _FData;
|
||||
}
|
||||
|
||||
Optimizer::Optimizer(unsigned Pd, vector<unsigned> i2O, vector<parameter_t> start, unsigned int nrandom)
|
||||
: scorer(NULL), FData(), number_of_random_directions(nrandom)
|
||||
: m_scorer(NULL), m_feature_data(), m_num_random_directions(nrandom)
|
||||
{
|
||||
// Warning: the init vector is a full set of parameters, of dimension pdim!
|
||||
Point::pdim = Pd;
|
||||
// Warning: the init vector is a full set of parameters, of dimension m_pdim!
|
||||
Point::m_pdim = Pd;
|
||||
|
||||
CHECK(start.size() == Pd);
|
||||
Point::dim = i2O.size();
|
||||
Point::optindices = i2O;
|
||||
if (Point::pdim > Point::dim) {
|
||||
for (unsigned int i = 0; i < Point::pdim; i++) {
|
||||
Point::m_dim = i2O.size();
|
||||
Point::m_opt_indices = i2O;
|
||||
if (Point::m_pdim > Point::m_dim) {
|
||||
for (unsigned int i = 0; i < Point::m_pdim; i++) {
|
||||
unsigned int j = 0;
|
||||
while (j < Point::dim && i != i2O[j])
|
||||
while (j < Point::m_dim && i != i2O[j])
|
||||
j++;
|
||||
|
||||
// The index i wasn't found in optindices, so it is a fixed index;
|
||||
// The index i wasn't found in m_opt_indices, so it is a fixed index;
|
||||
// we use the value of the start vector.
|
||||
if (j == Point::dim)
|
||||
Point::fixedweights[i] = start[i];
|
||||
if (j == Point::m_dim)
|
||||
Point::m_fixed_weights[i] = start[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -112,12 +101,12 @@ statscore_t Optimizer::LineOptimize(const Point& origin, const Point& direction,
|
||||
//cerr << "Sentence " << S << endl;
|
||||
multimap<float, unsigned> gradient;
|
||||
vector<float> f0;
|
||||
f0.resize(FData->get(S).size());
|
||||
for (unsigned j = 0; j < FData->get(S).size(); j++) {
|
||||
f0.resize(m_feature_data->get(S).size());
|
||||
for (unsigned j = 0; j < m_feature_data->get(S).size(); j++) {
|
||||
// gradient of the feature function for this particular target sentence
|
||||
gradient.insert(pair<float, unsigned>(direction * (FData->get(S,j)), j));
|
||||
gradient.insert(pair<float, unsigned>(direction * (m_feature_data->get(S,j)), j));
|
||||
// compute the feature function at the origin point
|
||||
f0[j] = origin * FData->get(S, j);
|
||||
f0[j] = origin * m_feature_data->get(S, j);
|
||||
}
|
||||
// Now let's compute the 1best for each value of x.
|
||||
|
||||
@ -308,7 +297,7 @@ statscore_t Optimizer::LineOptimize(const Point& origin, const Point& direction,
|
||||
|
||||
void Optimizer::Get1bests(const Point& P, vector<unsigned>& bests) const
|
||||
{
|
||||
CHECK(FData);
|
||||
CHECK(m_feature_data);
|
||||
bests.clear();
|
||||
bests.resize(size());
|
||||
|
||||
@ -316,8 +305,8 @@ void Optimizer::Get1bests(const Point& P, vector<unsigned>& bests) const
|
||||
float bestfs = MIN_FLOAT;
|
||||
unsigned idx = 0;
|
||||
unsigned j;
|
||||
for (j = 0; j < FData->get(i).size(); j++) {
|
||||
float curfs = P * FData->get(i, j);
|
||||
for (j = 0; j < m_feature_data->get(i).size(); j++) {
|
||||
float curfs = P * m_feature_data->get(i, j);
|
||||
if (curfs > bestfs) {
|
||||
bestfs = curfs;
|
||||
idx = j;
|
||||
@ -330,15 +319,15 @@ void Optimizer::Get1bests(const Point& P, vector<unsigned>& bests) const
|
||||
|
||||
statscore_t Optimizer::Run(Point& P) const
|
||||
{
|
||||
if (!FData) {
|
||||
if (!m_feature_data) {
|
||||
cerr << "error trying to optimize without Features loaded" << endl;
|
||||
exit(2);
|
||||
}
|
||||
if (!scorer) {
|
||||
if (!m_scorer) {
|
||||
cerr << "error trying to optimize without a Scorer loaded" << endl;
|
||||
exit(2);
|
||||
}
|
||||
if (scorer->getReferenceSize() != FData->size()) {
|
||||
if (m_scorer->getReferenceSize() != m_feature_data->size()) {
|
||||
cerr << "error length mismatch between feature file and score file" << endl;
|
||||
exit(2);
|
||||
}
|
||||
@ -361,11 +350,11 @@ statscore_t Optimizer::Run(Point& P) const
|
||||
|
||||
vector<statscore_t> Optimizer::GetIncStatScore(vector<unsigned> thefirst, vector<vector <pair<unsigned,unsigned> > > thediffs) const
|
||||
{
|
||||
CHECK(scorer);
|
||||
CHECK(m_scorer);
|
||||
|
||||
vector<statscore_t> theres;
|
||||
|
||||
scorer->score(thefirst, thediffs, theres);
|
||||
m_scorer->score(thefirst, thediffs, theres);
|
||||
return theres;
|
||||
}
|
||||
|
||||
@ -392,7 +381,7 @@ statscore_t SimpleOptimizer::TrueRun(Point& P) const
|
||||
|
||||
Point linebest;
|
||||
|
||||
for (unsigned int d = 0; d < Point::getdim()+number_of_random_directions; d++) {
|
||||
for (unsigned int d = 0; d < Point::getdim() + m_num_random_directions; d++) {
|
||||
if (verboselevel() > 4) {
|
||||
// cerr<<"minimizing along direction "<<d<<endl;
|
||||
cerr << "starting point: " << P << " => " << prevscore << endl;
|
||||
@ -440,7 +429,7 @@ statscore_t RandomDirectionOptimizer::TrueRun(Point& P) const
|
||||
// do specified number of random direction optimizations
|
||||
unsigned int nrun = 0;
|
||||
unsigned int nrun_no_change = 0;
|
||||
for (; nrun_no_change < number_of_random_directions; nrun++, nrun_no_change++)
|
||||
for (; nrun_no_change < m_num_random_directions; nrun++, nrun_no_change++)
|
||||
{
|
||||
// choose a random direction in which to optimize
|
||||
Point direction;
|
||||
@ -476,32 +465,32 @@ statscore_t RandomOptimizer::TrueRun(Point& P) const
|
||||
|
||||
//--------------------------------------
|
||||
|
||||
vector<string> OptimizerFactory::typenames;
|
||||
vector<string> OptimizerFactory::m_type_names;
|
||||
|
||||
void OptimizerFactory::SetTypeNames()
|
||||
{
|
||||
if (typenames.empty()) {
|
||||
typenames.resize(NOPTIMIZER);
|
||||
typenames[POWELL]="powell";
|
||||
typenames[RANDOM_DIRECTION]="random-direction";
|
||||
typenames[RANDOM]="random";
|
||||
if (m_type_names.empty()) {
|
||||
m_type_names.resize(NOPTIMIZER);
|
||||
m_type_names[POWELL]="powell";
|
||||
m_type_names[RANDOM_DIRECTION]="random-direction";
|
||||
m_type_names[RANDOM]="random";
|
||||
// Add new type there
|
||||
}
|
||||
}
|
||||
vector<string> OptimizerFactory::GetTypeNames()
|
||||
{
|
||||
if (typenames.empty())
|
||||
if (m_type_names.empty())
|
||||
SetTypeNames();
|
||||
return typenames;
|
||||
return m_type_names;
|
||||
}
|
||||
|
||||
OptimizerFactory::OptType OptimizerFactory::GetOType(const string& type)
|
||||
{
|
||||
unsigned int thetype;
|
||||
if (typenames.empty())
|
||||
if (m_type_names.empty())
|
||||
SetTypeNames();
|
||||
for (thetype = 0; thetype < typenames.size(); thetype++)
|
||||
if (typenames[thetype] == type)
|
||||
for (thetype = 0; thetype < m_type_names.size(); thetype++)
|
||||
if (m_type_names[thetype] == type)
|
||||
break;
|
||||
return((OptType)thetype);
|
||||
}
|
||||
@ -513,8 +502,8 @@ Optimizer* OptimizerFactory::BuildOptimizer(unsigned dim, vector<unsigned> i2o,
|
||||
cerr << "Error: unknown Optimizer type " << type << endl;
|
||||
cerr << "Known Algorithm are:" << endl;
|
||||
unsigned int thetype;
|
||||
for (thetype = 0; thetype < typenames.size(); thetype++)
|
||||
cerr << typenames[thetype] << endl;
|
||||
for (thetype = 0; thetype < m_type_names.size(); thetype++)
|
||||
cerr << m_type_names[thetype] << endl;
|
||||
throw ("unknown Optimizer Type");
|
||||
}
|
||||
|
||||
|
@ -20,18 +20,19 @@ class Point;
|
||||
class Optimizer
|
||||
{
|
||||
protected:
|
||||
Scorer *scorer; // no accessor for them only child can use them
|
||||
FeatureDataHandle FData; // no accessor for them only child can use them
|
||||
unsigned int number_of_random_directions;
|
||||
Scorer *m_scorer; // no accessor for them only child can use them
|
||||
FeatureDataHandle m_feature_data; // no accessor for them only child can use them
|
||||
unsigned int m_num_random_directions;
|
||||
|
||||
public:
|
||||
Optimizer(unsigned Pd, vector<unsigned> i2O, vector<parameter_t> start, unsigned int nrandom);
|
||||
void SetScorer(Scorer *_scorer);
|
||||
void SetFData(FeatureDataHandle _FData);
|
||||
|
||||
void SetScorer(Scorer *scorer) { m_scorer = scorer; }
|
||||
void SetFeatureData(FeatureDataHandle feature_data) { m_feature_data = feature_data; }
|
||||
virtual ~Optimizer();
|
||||
|
||||
unsigned size() const {
|
||||
return FData ? FData->size() : 0;
|
||||
return m_feature_data ? m_feature_data->size() : 0;
|
||||
}
|
||||
|
||||
/**
|
||||
@ -53,7 +54,7 @@ public:
|
||||
* Given a set of nbests, get the Statistical score.
|
||||
*/
|
||||
statscore_t GetStatScore(const vector<unsigned>& nbests) const {
|
||||
return scorer->score(nbests);
|
||||
return m_scorer->score(nbests);
|
||||
}
|
||||
|
||||
statscore_t GetStatScore(const Point& param) const;
|
||||
@ -129,7 +130,7 @@ private:
|
||||
// Setup optimization types.
|
||||
static void SetTypeNames();
|
||||
|
||||
static vector<string> typenames;
|
||||
static vector<string> m_type_names;
|
||||
};
|
||||
|
||||
#endif // OPTIMIZER_H
|
||||
|
@ -1,9 +1,7 @@
|
||||
#ifndef MERT_PER_SCORER_H_
|
||||
#define MERT_PER_SCORER_H_
|
||||
|
||||
#include <iostream>
|
||||
#include <set>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include "Types.h"
|
||||
@ -27,18 +25,9 @@ public:
|
||||
|
||||
virtual void setReferenceFiles(const vector<string>& referenceFiles);
|
||||
virtual void prepareStats(size_t sid, const string& text, ScoreStats& entry);
|
||||
|
||||
virtual size_t NumberOfScores() const {
|
||||
// cerr << "PerScorer: 3" << endl;
|
||||
return 3;
|
||||
}
|
||||
|
||||
virtual size_t NumberOfScores() const { return 3; }
|
||||
virtual float calculateScore(const vector<int>& comps) const;
|
||||
|
||||
void whoami() const {
|
||||
cerr << "I AM PerScorer" << std::endl;
|
||||
}
|
||||
|
||||
private:
|
||||
// no copying allowed
|
||||
PerScorer(const PerScorer&);
|
||||
|
107
mert/Point.cpp
107
mert/Point.cpp
@ -8,41 +8,41 @@
|
||||
|
||||
using namespace std;
|
||||
|
||||
vector<unsigned> Point::optindices;
|
||||
vector<unsigned> Point::m_opt_indices;
|
||||
|
||||
unsigned Point::dim = 0;
|
||||
unsigned Point::m_dim = 0;
|
||||
|
||||
map<unsigned,statscore_t> Point::fixedweights;
|
||||
map<unsigned,statscore_t> Point::m_fixed_weights;
|
||||
|
||||
unsigned Point::pdim = 0;
|
||||
unsigned Point::ncall = 0;
|
||||
unsigned Point::m_pdim = 0;
|
||||
unsigned Point::m_ncall = 0;
|
||||
|
||||
vector<parameter_t> Point::m_min;
|
||||
vector<parameter_t> Point::m_max;
|
||||
|
||||
Point::Point() : vector<parameter_t>(dim), score_(0.0) {}
|
||||
Point::Point() : vector<parameter_t>(m_dim), m_score(0.0) {}
|
||||
|
||||
//Can initialize from a vector of dim or pdim
|
||||
//Can initialize from a vector of dim or m_pdim
|
||||
Point::Point(const vector<parameter_t>& init,
|
||||
const vector<parameter_t>& min,
|
||||
const vector<parameter_t>& max)
|
||||
: vector<parameter_t>(Point::dim), score_(0.0)
|
||||
: vector<parameter_t>(Point::m_dim), m_score(0.0)
|
||||
{
|
||||
m_min.resize(Point::dim);
|
||||
m_max.resize(Point::dim);
|
||||
if(init.size()==dim) {
|
||||
for (unsigned int i=0; i<Point::dim; i++) {
|
||||
operator[](i)=init[i];
|
||||
m_min.resize(Point::m_dim);
|
||||
m_max.resize(Point::m_dim);
|
||||
if (init.size() == m_dim) {
|
||||
for (unsigned int i = 0; i < Point::m_dim; i++) {
|
||||
operator[](i) = init[i];
|
||||
m_min[i] = min[i];
|
||||
m_max[i] = max[i];
|
||||
}
|
||||
} else {
|
||||
CHECK(init.size()==pdim);
|
||||
CHECK(optindices.size() == Point::dim);
|
||||
for (unsigned int i=0; i<Point::dim; i++) {
|
||||
operator[](i)=init[optindices[i]];
|
||||
m_min[i] = min[optindices[i]];
|
||||
m_max[i] = max[optindices[i]];
|
||||
CHECK(init.size() == m_pdim);
|
||||
CHECK(m_opt_indices.size() == Point::m_dim);
|
||||
for (unsigned int i = 0; i < Point::m_dim; i++) {
|
||||
operator[](i) = init[m_opt_indices[i]];
|
||||
m_min[i] = min[m_opt_indices[i]];
|
||||
m_max[i] = max[m_opt_indices[i]];
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -51,9 +51,9 @@ Point::~Point() {}
|
||||
|
||||
void Point::Randomize()
|
||||
{
|
||||
CHECK(m_min.size()==Point::dim);
|
||||
CHECK(m_max.size()==Point::dim);
|
||||
for (unsigned int i=0; i<size(); i++) {
|
||||
CHECK(m_min.size() == Point::m_dim);
|
||||
CHECK(m_max.size() == Point::m_dim);
|
||||
for (unsigned int i = 0; i < size(); i++) {
|
||||
operator[](i) = m_min[i] +
|
||||
static_cast<float>(random()) / static_cast<float>(RAND_MAX) * (m_max[i] - m_min[i]);
|
||||
}
|
||||
@ -61,16 +61,17 @@ void Point::Randomize()
|
||||
|
||||
double Point::operator*(const FeatureStats& F) const
|
||||
{
|
||||
ncall++; // to track performance
|
||||
double prod=0.0;
|
||||
if(OptimizeAll())
|
||||
m_ncall++; // to track performance
|
||||
double prod = 0.0;
|
||||
if (OptimizeAll())
|
||||
for (unsigned i=0; i<size(); i++)
|
||||
prod+= operator[](i)*F.get(i);
|
||||
prod += operator[](i) * F.get(i);
|
||||
else {
|
||||
for (unsigned i=0; i<size(); i++)
|
||||
prod+= operator[](i)*F.get(optindices[i]);
|
||||
for(map<unsigned,float >::iterator it=fixedweights.begin(); it!=fixedweights.end(); it++)
|
||||
prod+=it->second*F.get(it->first);
|
||||
for (unsigned i = 0; i < size(); i++)
|
||||
prod += operator[](i) * F.get(m_opt_indices[i]);
|
||||
for(map<unsigned, float>::iterator it = m_fixed_weights.begin();
|
||||
it != m_fixed_weights.end(); ++it)
|
||||
prod += it->second * F.get(it->first);
|
||||
}
|
||||
return prod;
|
||||
}
|
||||
@ -83,7 +84,7 @@ Point Point::operator+(const Point& p2) const
|
||||
Res[i] += p2[i];
|
||||
}
|
||||
|
||||
Res.score_ = numeric_limits<statscore_t>::max();
|
||||
Res.m_score = numeric_limits<statscore_t>::max();
|
||||
return Res;
|
||||
}
|
||||
|
||||
@ -93,7 +94,7 @@ void Point::operator+=(const Point& p2)
|
||||
for (unsigned i = 0; i < size(); i++) {
|
||||
operator[](i) += p2[i];
|
||||
}
|
||||
score_ = numeric_limits<statscore_t>::max();
|
||||
m_score = numeric_limits<statscore_t>::max();
|
||||
}
|
||||
|
||||
Point Point::operator*(float l) const
|
||||
@ -102,14 +103,14 @@ Point Point::operator*(float l) const
|
||||
for (unsigned i = 0; i < size(); i++) {
|
||||
Res[i] *= l;
|
||||
}
|
||||
Res.score_ = numeric_limits<statscore_t>::max();
|
||||
Res.m_score = numeric_limits<statscore_t>::max();
|
||||
return Res;
|
||||
}
|
||||
|
||||
ostream& operator<<(ostream& o, const Point& P)
|
||||
{
|
||||
vector<parameter_t> w = P.GetAllWeights();
|
||||
for (unsigned int i = 0; i < Point::pdim; i++) {
|
||||
for (unsigned int i = 0; i < Point::m_pdim; i++) {
|
||||
o << w[i] << " ";
|
||||
}
|
||||
return o;
|
||||
@ -118,24 +119,24 @@ ostream& operator<<(ostream& o, const Point& P)
|
||||
void Point::NormalizeL2()
|
||||
{
|
||||
parameter_t norm=0.0;
|
||||
for (unsigned int i=0; i<size(); i++)
|
||||
norm+= operator[](i)*operator[](i);
|
||||
if(norm!=0.0) {
|
||||
norm=sqrt(norm);
|
||||
for (unsigned int i=0; i<size(); i++)
|
||||
operator[](i)/=norm;
|
||||
for (unsigned int i = 0; i < size(); i++)
|
||||
norm += operator[](i) * operator[](i);
|
||||
if (norm != 0.0) {
|
||||
norm = sqrt(norm);
|
||||
for (unsigned int i = 0; i < size(); i++)
|
||||
operator[](i) /= norm;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void Point::NormalizeL1()
|
||||
{
|
||||
parameter_t norm=0.0;
|
||||
for (unsigned int i=0; i<size(); i++)
|
||||
norm+= abs(operator[](i));
|
||||
if(norm!=0.0) {
|
||||
for (unsigned int i=0; i<size(); i++)
|
||||
operator[](i)/=norm;
|
||||
parameter_t norm = 0.0;
|
||||
for (unsigned int i = 0; i < size(); i++)
|
||||
norm += abs(operator[](i));
|
||||
if (norm != 0.0) {
|
||||
for (unsigned int i = 0; i < size(); i++)
|
||||
operator[](i) /= norm;
|
||||
}
|
||||
}
|
||||
|
||||
@ -143,14 +144,16 @@ void Point::NormalizeL1()
|
||||
vector<parameter_t> Point::GetAllWeights()const
|
||||
{
|
||||
vector<parameter_t> w;
|
||||
if(OptimizeAll()) {
|
||||
w=*this;
|
||||
if (OptimizeAll()) {
|
||||
w = *this;
|
||||
} else {
|
||||
w.resize(pdim);
|
||||
for (unsigned int i=0; i<size(); i++)
|
||||
w[optindices[i]]=operator[](i);
|
||||
for(map<unsigned,float >::iterator it=fixedweights.begin(); it!=fixedweights.end(); it++)
|
||||
w.resize(m_pdim);
|
||||
for (unsigned int i = 0; i < size(); i++)
|
||||
w[m_opt_indices[i]] = operator[](i);
|
||||
for (map<unsigned, float>::iterator it = m_fixed_weights.begin();
|
||||
it != m_fixed_weights.end(); ++it) {
|
||||
w[it->first]=it->second;
|
||||
}
|
||||
}
|
||||
return w;
|
||||
}
|
||||
|
55
mert/Point.h
55
mert/Point.h
@ -1,7 +1,7 @@
|
||||
#ifndef MERT_POINT_H_
|
||||
#define MERT_POINT_H_
|
||||
|
||||
#include <fstream>
|
||||
#include <ostream>
|
||||
#include <map>
|
||||
#include <vector>
|
||||
#include "Types.h"
|
||||
@ -16,61 +16,55 @@ class Optimizer;
|
||||
class Point : public vector<parameter_t>
|
||||
{
|
||||
friend class Optimizer;
|
||||
|
||||
private:
|
||||
/**
|
||||
* The indices over which we optimize.
|
||||
*/
|
||||
static vector<unsigned int> optindices;
|
||||
static vector<unsigned int> m_opt_indices;
|
||||
|
||||
/**
|
||||
* Dimension of optindices and of the parent vector.
|
||||
* Dimension of m_opt_indices and of the parent vector.
|
||||
*/
|
||||
static unsigned int dim;
|
||||
static unsigned int m_dim;
|
||||
|
||||
/**
|
||||
* Fixed weights in case of partial optimization.
|
||||
*/
|
||||
static map<unsigned int,parameter_t> fixedweights;
|
||||
static map<unsigned int,parameter_t> m_fixed_weights;
|
||||
|
||||
/**
|
||||
* Total size of the parameter space; we have
|
||||
* pdim = fixedweights.size() + optindices.size().
|
||||
* m_pdim = m_fixed_weights.size() + m_opt_indices.size().
|
||||
*/
|
||||
static unsigned int pdim;
|
||||
static unsigned int ncall;
|
||||
static unsigned int m_pdim;
|
||||
static unsigned int m_ncall;
|
||||
|
||||
/**
|
||||
* The limits for randomization, both vectors are of full length, pdim.
|
||||
* The limits for randomization, both vectors are of full length, m_pdim.
|
||||
*/
|
||||
static vector<parameter_t> m_min;
|
||||
static vector<parameter_t> m_max;
|
||||
|
||||
statscore_t score_;
|
||||
statscore_t m_score;
|
||||
|
||||
public:
|
||||
static unsigned int getdim() {
|
||||
return dim;
|
||||
}
|
||||
static unsigned int getpdim() {
|
||||
return pdim;
|
||||
}
|
||||
static void setpdim(size_t pd) {
|
||||
pdim = pd;
|
||||
}
|
||||
static void setdim(size_t d) {
|
||||
dim = d;
|
||||
}
|
||||
static unsigned int getdim() { return m_dim; }
|
||||
static void setdim(size_t d) { m_dim = d; }
|
||||
|
||||
static unsigned int getpdim() { return m_pdim; }
|
||||
static void setpdim(size_t pd) { m_pdim = pd; }
|
||||
|
||||
static void set_optindices(const vector<unsigned int>& indices) {
|
||||
optindices = indices;
|
||||
m_opt_indices = indices;
|
||||
}
|
||||
|
||||
static const vector<unsigned int>& get_optindices() {
|
||||
return optindices;
|
||||
return m_opt_indices;
|
||||
}
|
||||
|
||||
static bool OptimizeAll() {
|
||||
return fixedweights.empty();
|
||||
return m_fixed_weights.empty();
|
||||
}
|
||||
|
||||
Point();
|
||||
@ -88,7 +82,7 @@ public:
|
||||
Point operator*(float) const;
|
||||
|
||||
/**
|
||||
* Write the whole feature-weight vector to a stream (i.e. pdim floats).
|
||||
* Write the whole feature-weight vector to a stream (i.e. m_pdim floats).
|
||||
*/
|
||||
friend ostream& operator<<(ostream& o,const Point& P);
|
||||
|
||||
@ -97,16 +91,13 @@ public:
|
||||
void NormalizeL1();
|
||||
|
||||
/**
|
||||
* Return a vector of size pdim where all weights have been
|
||||
* Return a vector of size m_pdim where all weights have been
|
||||
* put (including fixed ones).
|
||||
*/
|
||||
vector<parameter_t> GetAllWeights() const;
|
||||
|
||||
statscore_t GetScore() const {
|
||||
return score_;
|
||||
}
|
||||
|
||||
void SetScore(statscore_t score) { score_ = score; }
|
||||
statscore_t GetScore() const { return m_score; }
|
||||
void SetScore(statscore_t score) { m_score = score; }
|
||||
};
|
||||
|
||||
#endif // MERT_POINT_H
|
||||
|
@ -12,39 +12,39 @@ class ScopedVector {
|
||||
ScopedVector() {}
|
||||
virtual ~ScopedVector() { reset(); }
|
||||
|
||||
bool empty() const { return vec_.empty(); }
|
||||
bool empty() const { return m_vec.empty(); }
|
||||
|
||||
void push_back(T *e) { vec_.push_back(e); }
|
||||
void push_back(T *e) { m_vec.push_back(e); }
|
||||
|
||||
void reset() {
|
||||
for (iterator it = vec_.begin(); it != vec_.end(); ++it) {
|
||||
for (iterator it = m_vec.begin(); it != m_vec.end(); ++it) {
|
||||
delete *it;
|
||||
}
|
||||
vec_.clear();
|
||||
m_vec.clear();
|
||||
}
|
||||
|
||||
void reserve(size_t capacity) { vec_.reserve(capacity); }
|
||||
void resize(size_t size) { vec_.resize(size); }
|
||||
void reserve(size_t capacity) { m_vec.reserve(capacity); }
|
||||
void resize(size_t size) { m_vec.resize(size); }
|
||||
|
||||
size_t size() const {return vec_.size(); }
|
||||
size_t size() const {return m_vec.size(); }
|
||||
|
||||
iterator begin() { return vec_.begin(); }
|
||||
const_iterator begin() const { return vec_.begin(); }
|
||||
iterator begin() { return m_vec.begin(); }
|
||||
const_iterator begin() const { return m_vec.begin(); }
|
||||
|
||||
iterator end() { return vec_.end(); }
|
||||
const_iterator end() const { return vec_.end(); }
|
||||
iterator end() { return m_vec.end(); }
|
||||
const_iterator end() const { return m_vec.end(); }
|
||||
|
||||
std::vector<T*>& get() { return vec_; }
|
||||
const std::vector<T*>& get() const { return vec_; }
|
||||
std::vector<T*>& get() { return m_vec; }
|
||||
const std::vector<T*>& get() const { return m_vec; }
|
||||
|
||||
std::vector<T*>* operator->() { return &vec_; }
|
||||
const std::vector<T*>* operator->() const { return &vec_; }
|
||||
std::vector<T*>* operator->() { return &m_vec; }
|
||||
const std::vector<T*>* operator->() const { return &m_vec; }
|
||||
|
||||
T*& operator[](size_t i) { return vec_[i]; }
|
||||
const T* operator[](size_t i) const { return vec_[i]; }
|
||||
T*& operator[](size_t i) { return m_vec[i]; }
|
||||
const T* operator[](size_t i) const { return m_vec[i]; }
|
||||
|
||||
private:
|
||||
std::vector<T*> vec_;
|
||||
std::vector<T*> m_vec;
|
||||
|
||||
// no copying allowed.
|
||||
ScopedVector<T>(const ScopedVector<T>&);
|
||||
|
@ -10,76 +10,85 @@
|
||||
#include "Util.h"
|
||||
#include "FileStream.h"
|
||||
|
||||
|
||||
ScoreArray::ScoreArray()
|
||||
: number_of_scores(0), idx("") {}
|
||||
: m_num_scores(0), m_index("") {}
|
||||
|
||||
void ScoreArray::savetxt(std::ofstream& outFile, const std::string& sctype)
|
||||
void ScoreArray::savetxt(ostream* os, const string& sctype)
|
||||
{
|
||||
outFile << SCORES_TXT_BEGIN << " " << idx << " " << array_.size()
|
||||
<< " " << number_of_scores << " " << sctype << std::endl;
|
||||
for (scorearray_t::iterator i = array_.begin(); i !=array_.end(); i++) {
|
||||
i->savetxt(outFile);
|
||||
outFile << std::endl;
|
||||
*os << SCORES_TXT_BEGIN << " " << m_index << " " << m_array.size()
|
||||
<< " " << m_num_scores << " " << sctype << endl;
|
||||
for (scorearray_t::iterator i = m_array.begin(); i !=m_array.end(); i++) {
|
||||
i->savetxt(os);
|
||||
*os << endl;
|
||||
}
|
||||
outFile << SCORES_TXT_END << std::endl;
|
||||
*os << SCORES_TXT_END << endl;
|
||||
}
|
||||
|
||||
void ScoreArray::savebin(std::ofstream& outFile, const std::string& sctype)
|
||||
void ScoreArray::savebin(ostream* os, const string& score_type)
|
||||
{
|
||||
outFile << SCORES_BIN_BEGIN << " " << idx << " " << array_.size()
|
||||
<< " " << number_of_scores << " " << sctype << std::endl;
|
||||
for (scorearray_t::iterator i = array_.begin(); i !=array_.end(); i++)
|
||||
i->savebin(outFile);
|
||||
|
||||
outFile << SCORES_BIN_END << std::endl;
|
||||
*os << SCORES_BIN_BEGIN << " " << m_index << " " << m_array.size()
|
||||
<< " " << m_num_scores << " " << score_type << endl;
|
||||
for (scorearray_t::iterator i = m_array.begin();
|
||||
i != m_array.end(); i++) {
|
||||
i->savebin(os);
|
||||
}
|
||||
*os << SCORES_BIN_END << endl;
|
||||
}
|
||||
|
||||
void ScoreArray::save(std::ofstream& inFile, const std::string& sctype, bool bin)
|
||||
void ScoreArray::save(ostream* os, const string& score_type, bool bin)
|
||||
{
|
||||
if (size()>0)
|
||||
(bin)?savebin(inFile, sctype):savetxt(inFile, sctype);
|
||||
if (size() <= 0) return;
|
||||
if (bin) {
|
||||
savebin(os, score_type);
|
||||
} else {
|
||||
savetxt(os, score_type);
|
||||
}
|
||||
}
|
||||
|
||||
void ScoreArray::save(const std::string &file, const std::string& sctype, bool bin)
|
||||
void ScoreArray::save(const string &file, const string& score_type, bool bin)
|
||||
{
|
||||
std::ofstream outFile(file.c_str(), std::ios::out); // matches a stream with a file. Opens the file
|
||||
|
||||
save(outFile, sctype, bin);
|
||||
|
||||
outFile.close();
|
||||
ofstream ofs(file.c_str(), ios::out);
|
||||
if (!ofs) {
|
||||
cerr << "Failed to open " << file << endl;
|
||||
exit(1);
|
||||
}
|
||||
ostream* os = &ofs;
|
||||
save(os, score_type, bin);
|
||||
ofs.close();
|
||||
}
|
||||
|
||||
void ScoreArray::loadbin(ifstream& inFile, size_t n)
|
||||
{
|
||||
ScoreStats entry(number_of_scores);
|
||||
void ScoreArray::save(const string& score_type, bool bin) {
|
||||
save(&cout, score_type, bin);
|
||||
}
|
||||
|
||||
for (size_t i=0 ; i < n; i++) {
|
||||
entry.loadbin(inFile);
|
||||
void ScoreArray::loadbin(istream* is, size_t n)
|
||||
{
|
||||
ScoreStats entry(m_num_scores);
|
||||
for (size_t i = 0; i < n; i++) {
|
||||
entry.loadbin(is);
|
||||
add(entry);
|
||||
}
|
||||
}
|
||||
|
||||
void ScoreArray::loadtxt(ifstream& inFile, size_t n)
|
||||
void ScoreArray::loadtxt(istream* is, size_t n)
|
||||
{
|
||||
ScoreStats entry(number_of_scores);
|
||||
|
||||
for (size_t i=0 ; i < n; i++) {
|
||||
entry.loadtxt(inFile);
|
||||
ScoreStats entry(m_num_scores);
|
||||
for (size_t i = 0; i < n; i++) {
|
||||
entry.loadtxt(is);
|
||||
add(entry);
|
||||
}
|
||||
}
|
||||
|
||||
void ScoreArray::load(ifstream& inFile)
|
||||
void ScoreArray::load(istream* is)
|
||||
{
|
||||
size_t number_of_entries=0;
|
||||
bool binmode=false;
|
||||
size_t number_of_entries = 0;
|
||||
bool binmode = false;
|
||||
|
||||
std::string substring, stringBuf;
|
||||
std::string::size_type loc;
|
||||
string substring, stringBuf;
|
||||
string::size_type loc;
|
||||
|
||||
std::getline(inFile, stringBuf);
|
||||
if (!inFile.good()) {
|
||||
getline(*is, stringBuf);
|
||||
if (!is->good()) {
|
||||
return;
|
||||
}
|
||||
|
||||
@ -94,35 +103,38 @@ void ScoreArray::load(ifstream& inFile)
|
||||
}
|
||||
getNextPound(stringBuf, substring);
|
||||
getNextPound(stringBuf, substring);
|
||||
idx = substring;
|
||||
m_index = substring;
|
||||
getNextPound(stringBuf, substring);
|
||||
number_of_entries = atoi(substring.c_str());
|
||||
getNextPound(stringBuf, substring);
|
||||
number_of_scores = atoi(substring.c_str());
|
||||
m_num_scores = atoi(substring.c_str());
|
||||
getNextPound(stringBuf, substring);
|
||||
score_type = substring;
|
||||
m_score_type = substring;
|
||||
}
|
||||
|
||||
(binmode)?loadbin(inFile, number_of_entries):loadtxt(inFile, number_of_entries);
|
||||
if (binmode) {
|
||||
loadbin(is, number_of_entries);
|
||||
} else {
|
||||
loadtxt(is, number_of_entries);
|
||||
}
|
||||
|
||||
std::getline(inFile, stringBuf);
|
||||
getline(*is, stringBuf);
|
||||
if (!stringBuf.empty()) {
|
||||
if ((loc = stringBuf.find(SCORES_TXT_END)) != 0 && (loc = stringBuf.find(SCORES_BIN_END)) != 0) {
|
||||
if ((loc = stringBuf.find(SCORES_TXT_END)) != 0 &&
|
||||
(loc = stringBuf.find(SCORES_BIN_END)) != 0) {
|
||||
TRACE_ERR("ERROR: ScoreArray::load(): Wrong footer");
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void ScoreArray::load(const std::string &file)
|
||||
void ScoreArray::load(const string &file)
|
||||
{
|
||||
TRACE_ERR("loading data from " << file << std::endl);
|
||||
|
||||
inputfilestream inFile(file); // matches a stream with a file. Opens the file
|
||||
|
||||
load((ifstream&) inFile);
|
||||
|
||||
inFile.close();
|
||||
TRACE_ERR("loading data from " << file << endl);
|
||||
inputfilestream input_stream(file); // matches a stream with a file. Opens the file
|
||||
istream* is = &input_stream;
|
||||
load(is);
|
||||
input_stream.close();
|
||||
}
|
||||
|
||||
|
||||
@ -139,7 +151,8 @@ bool ScoreArray::check_consistency() const
|
||||
if (sz == 0)
|
||||
return true;
|
||||
|
||||
for (scorearray_t::const_iterator i = array_.begin(); i != array_.end(); ++i) {
|
||||
for (scorearray_t::const_iterator i = m_array.begin();
|
||||
i != m_array.end(); ++i) {
|
||||
if (i->size() != sz)
|
||||
return false;
|
||||
}
|
||||
|
@ -24,85 +24,62 @@ const char SCORES_BIN_END[] = "SCORES_BIN_END_0";
|
||||
|
||||
class ScoreArray
|
||||
{
|
||||
protected:
|
||||
scorearray_t array_;
|
||||
std::string score_type;
|
||||
size_t number_of_scores;
|
||||
private:
|
||||
scorearray_t m_array;
|
||||
std::string m_score_type;
|
||||
size_t m_num_scores;
|
||||
|
||||
private:
|
||||
// idx to identify the utterance.
|
||||
// indexx to identify the utterance.
|
||||
// It can differ from the index inside the vector.
|
||||
std::string idx;
|
||||
std::string m_index;
|
||||
|
||||
public:
|
||||
ScoreArray();
|
||||
~ScoreArray() {}
|
||||
|
||||
inline void clear() {
|
||||
array_.clear();
|
||||
}
|
||||
void clear() { m_array.clear(); }
|
||||
|
||||
inline std::string getIndex() const {
|
||||
return idx;
|
||||
}
|
||||
inline void setIndex(const std::string& value) {
|
||||
idx=value;
|
||||
}
|
||||
std::string getIndex() const { return m_index; }
|
||||
|
||||
// inline ScoreStats get(size_t i){ return array_.at(i); }
|
||||
void setIndex(const std::string& value) { m_index = value; }
|
||||
|
||||
inline ScoreStats& get(size_t i) {
|
||||
return array_.at(i);
|
||||
}
|
||||
inline const ScoreStats& get(size_t i)const {
|
||||
return array_.at(i);
|
||||
}
|
||||
ScoreStats& get(size_t i) { return m_array.at(i); }
|
||||
|
||||
void add(const ScoreStats& e) {
|
||||
array_.push_back(e);
|
||||
}
|
||||
const ScoreStats& get(size_t i) const { return m_array.at(i); }
|
||||
|
||||
void add(const ScoreStats& e) { m_array.push_back(e); }
|
||||
|
||||
//ADDED BY TS
|
||||
void swap(size_t i, size_t j) {
|
||||
std::swap(array_[i],array_[j]);
|
||||
std::swap(m_array[i], m_array[j]);
|
||||
}
|
||||
|
||||
void resize(size_t new_size) {
|
||||
array_.resize(std::min(new_size,array_.size()));
|
||||
m_array.resize(std::min(new_size, m_array.size()));
|
||||
}
|
||||
//END_ADDED
|
||||
|
||||
void merge(ScoreArray& e);
|
||||
|
||||
inline std::string name() const {
|
||||
return score_type;
|
||||
}
|
||||
std::string name() const { return m_score_type; }
|
||||
|
||||
inline void name(std::string &sctype) {
|
||||
score_type = sctype;
|
||||
}
|
||||
void name(std::string &score_type) { m_score_type = score_type; }
|
||||
|
||||
inline size_t size() const {
|
||||
return array_.size();
|
||||
}
|
||||
inline size_t NumberOfScores() const {
|
||||
return number_of_scores;
|
||||
}
|
||||
inline void NumberOfScores(size_t v) {
|
||||
number_of_scores = v;
|
||||
}
|
||||
size_t size() const { return m_array.size(); }
|
||||
|
||||
void savetxt(ofstream& outFile, const std::string& sctype);
|
||||
void savebin(ofstream& outFile, const std::string& sctype);
|
||||
void save(ofstream& outFile, const std::string& sctype, bool bin=false);
|
||||
void save(const std::string &file, const std::string& sctype, bool bin=false);
|
||||
inline void save(const std::string& sctype, bool bin=false) {
|
||||
save("/dev/stdout", sctype, bin);
|
||||
}
|
||||
size_t NumberOfScores() const { return m_num_scores; }
|
||||
|
||||
void loadtxt(ifstream& inFile, size_t n);
|
||||
void loadbin(ifstream& inFile, size_t n);
|
||||
void load(ifstream& inFile);
|
||||
void NumberOfScores(size_t v) { m_num_scores = v; }
|
||||
|
||||
void savetxt(std::ostream* os, const std::string& score_type);
|
||||
void savebin(std::ostream* os, const std::string& score_type);
|
||||
void save(std::ostream* os, const std::string& score_type, bool bin=false);
|
||||
void save(const std::string &file, const std::string& score_type, bool bin=false);
|
||||
void save(const std::string& score_type, bool bin=false);
|
||||
|
||||
void loadtxt(std::istream* is, size_t n);
|
||||
void loadbin(std::istream* is, size_t n);
|
||||
void load(std::istream* is);
|
||||
void load(const std::string &file);
|
||||
|
||||
bool check_consistency() const;
|
||||
|
@ -7,55 +7,56 @@
|
||||
*/
|
||||
|
||||
#include "ScoreData.h"
|
||||
|
||||
#include <fstream>
|
||||
#include "Scorer.h"
|
||||
#include "Util.h"
|
||||
#include "FileStream.h"
|
||||
|
||||
ScoreData::ScoreData(Scorer& ptr):
|
||||
theScorer(&ptr)
|
||||
ScoreData::ScoreData(Scorer* scorer) :
|
||||
m_scorer(scorer)
|
||||
{
|
||||
score_type = theScorer->getName();
|
||||
m_score_type = m_scorer->getName();
|
||||
// This is not dangerous: we don't use the this pointer in SetScoreData.
|
||||
theScorer->setScoreData(this);
|
||||
number_of_scores = theScorer->NumberOfScores();
|
||||
// TRACE_ERR("ScoreData: number_of_scores: " << number_of_scores << std::endl);
|
||||
m_scorer->setScoreData(this);
|
||||
m_num_scores = m_scorer->NumberOfScores();
|
||||
// TRACE_ERR("ScoreData: m_num_scores: " << m_num_scores << std::endl);
|
||||
}
|
||||
|
||||
void ScoreData::save(std::ofstream& outFile, bool bin)
|
||||
void ScoreData::save(ostream* os, bool bin)
|
||||
{
|
||||
for (scoredata_t::iterator i = array_.begin(); i !=array_.end(); i++) {
|
||||
i->save(outFile, score_type, bin);
|
||||
for (scoredata_t::iterator i = m_array.begin();
|
||||
i != m_array.end(); ++i) {
|
||||
i->save(os, m_score_type, bin);
|
||||
}
|
||||
}
|
||||
|
||||
void ScoreData::save(const std::string &file, bool bin)
|
||||
void ScoreData::save(const string &file, bool bin)
|
||||
{
|
||||
if (file.empty()) return;
|
||||
TRACE_ERR("saving the array into " << file << std::endl);
|
||||
TRACE_ERR("saving the array into " << file << endl);
|
||||
|
||||
// matches a stream with a file. Opens the file.
|
||||
std::ofstream outFile(file.c_str(), std::ios::out);
|
||||
|
||||
ScoreStats entry;
|
||||
|
||||
save(outFile, bin);
|
||||
|
||||
outFile.close();
|
||||
ofstream ofs(file.c_str(), ios::out);
|
||||
ostream* os = &ofs;
|
||||
save(os, bin);
|
||||
ofs.close();
|
||||
}
|
||||
|
||||
void ScoreData::load(ifstream& inFile)
|
||||
void ScoreData::save(bool bin) {
|
||||
save(&cout, bin);
|
||||
}
|
||||
|
||||
void ScoreData::load(istream* is)
|
||||
{
|
||||
ScoreArray entry;
|
||||
|
||||
while (!inFile.eof()) {
|
||||
|
||||
if (!inFile.good()) {
|
||||
std::cerr << "ERROR ScoreData::load inFile.good()" << std::endl;
|
||||
while (!is->eof()) {
|
||||
if (!is->good()) {
|
||||
cerr << "ERROR ScoreData::load inFile.good()" << endl;
|
||||
}
|
||||
|
||||
entry.clear();
|
||||
entry.load(inFile);
|
||||
|
||||
entry.load(is);
|
||||
if (entry.size() == 0) {
|
||||
break;
|
||||
}
|
||||
@ -63,63 +64,58 @@ void ScoreData::load(ifstream& inFile)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void ScoreData::load(const std::string &file)
|
||||
void ScoreData::load(const string &file)
|
||||
{
|
||||
TRACE_ERR("loading score data from " << file << std::endl);
|
||||
|
||||
inputfilestream inFile(file); // matches a stream with a file. Opens the file
|
||||
|
||||
if (!inFile) {
|
||||
TRACE_ERR("loading score data from " << file << endl);
|
||||
inputfilestream input_stream(file); // matches a stream with a file. Opens the file
|
||||
if (!input_stream) {
|
||||
throw runtime_error("Unable to open score file: " + file);
|
||||
}
|
||||
|
||||
load((ifstream&) inFile);
|
||||
|
||||
inFile.close();
|
||||
istream* is = &input_stream;
|
||||
load(is);
|
||||
input_stream.close();
|
||||
}
|
||||
|
||||
|
||||
void ScoreData::add(ScoreArray& e)
|
||||
{
|
||||
if (exists(e.getIndex())) { // array at position e.getIndex() already exists
|
||||
//enlarge array at position e.getIndex()
|
||||
size_t pos = getIndex(e.getIndex());
|
||||
array_.at(pos).merge(e);
|
||||
m_array.at(pos).merge(e);
|
||||
} else {
|
||||
array_.push_back(e);
|
||||
m_array.push_back(e);
|
||||
setIndex();
|
||||
}
|
||||
}
|
||||
|
||||
void ScoreData::add(const ScoreStats& e, const std::string& sent_idx)
|
||||
void ScoreData::add(const ScoreStats& e, const string& sent_idx)
|
||||
{
|
||||
if (exists(sent_idx)) { // array at position e.getIndex() already exists
|
||||
// Enlarge array at position e.getIndex()
|
||||
size_t pos = getIndex(sent_idx);
|
||||
// TRACE_ERR("Inserting in array " << sent_idx << std::endl);
|
||||
array_.at(pos).add(e);
|
||||
m_array.at(pos).add(e);
|
||||
// TRACE_ERR("size: " << size() << " -> " << a.size() << std::endl);
|
||||
} else {
|
||||
// TRACE_ERR("Creating a new entry in the array" << std::endl);
|
||||
ScoreArray a;
|
||||
a.NumberOfScores(number_of_scores);
|
||||
a.NumberOfScores(m_num_scores);
|
||||
a.add(e);
|
||||
a.setIndex(sent_idx);
|
||||
size_t idx = array_.size();
|
||||
array_.push_back(a);
|
||||
idx2arrayname_[idx] = sent_idx;
|
||||
arrayname2idx_[sent_idx]=idx;
|
||||
size_t idx = m_array.size();
|
||||
m_array.push_back(a);
|
||||
m_index_to_array_name[idx] = sent_idx;
|
||||
m_array_name_to_index[sent_idx]=idx;
|
||||
// TRACE_ERR("size: " << size() << " -> " << a.size() << std::endl);
|
||||
}
|
||||
}
|
||||
|
||||
bool ScoreData::check_consistency() const
|
||||
{
|
||||
if (array_.size() == 0)
|
||||
if (m_array.size() == 0)
|
||||
return true;
|
||||
|
||||
for (scoredata_t::const_iterator i = array_.begin(); i != array_.end(); ++i)
|
||||
for (scoredata_t::const_iterator i = m_array.begin(); i != m_array.end(); ++i)
|
||||
if (!i->check_consistency()) return false;
|
||||
|
||||
return true;
|
||||
@ -127,10 +123,10 @@ bool ScoreData::check_consistency() const
|
||||
|
||||
void ScoreData::setIndex()
|
||||
{
|
||||
size_t j=0;
|
||||
for (scoredata_t::iterator i = array_.begin(); i !=array_.end(); i++) {
|
||||
idx2arrayname_[j]=i->getIndex();
|
||||
arrayname2idx_[i->getIndex()]=j;
|
||||
size_t j = 0;
|
||||
for (scoredata_t::iterator i = m_array.begin(); i != m_array.end(); ++i) {
|
||||
m_index_to_array_name[j] = i->getIndex();
|
||||
m_array_name_to_index[i->getIndex()]=j;
|
||||
j++;
|
||||
}
|
||||
}
|
||||
|
@ -9,9 +9,8 @@
|
||||
#ifndef MERT_SCORE_DATA_H_
|
||||
#define MERT_SCORE_DATA_H_
|
||||
|
||||
#include <fstream>
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
#include "ScoreArray.h"
|
||||
@ -23,35 +22,34 @@ class Scorer;
|
||||
|
||||
class ScoreData
|
||||
{
|
||||
protected:
|
||||
scoredata_t array_;
|
||||
idx2name idx2arrayname_; // map from index to name of array
|
||||
name2idx arrayname2idx_; // map from name to index of array
|
||||
|
||||
private:
|
||||
// Do not allow the user to instantiate without arguments.
|
||||
ScoreData() {}
|
||||
|
||||
Scorer* theScorer;
|
||||
std::string score_type;
|
||||
size_t number_of_scores;
|
||||
scoredata_t m_array;
|
||||
idx2name m_index_to_array_name; // map from index to name of array
|
||||
name2idx m_array_name_to_index; // map from name to index of array
|
||||
|
||||
Scorer* m_scorer;
|
||||
std::string m_score_type;
|
||||
size_t m_num_scores;
|
||||
|
||||
public:
|
||||
ScoreData(Scorer& sc);
|
||||
ScoreData(Scorer* scorer);
|
||||
~ScoreData() {}
|
||||
|
||||
inline void clear() {
|
||||
array_.clear();
|
||||
}
|
||||
void clear() { m_array.clear(); }
|
||||
|
||||
inline ScoreArray get(const std::string& idx) {
|
||||
return array_.at(getIndex(idx));
|
||||
return m_array.at(getIndex(idx));
|
||||
}
|
||||
|
||||
inline ScoreArray& get(size_t idx) {
|
||||
return array_.at(idx);
|
||||
return m_array.at(idx);
|
||||
}
|
||||
|
||||
inline const ScoreArray& get(size_t idx) const {
|
||||
return array_.at(idx);
|
||||
return m_array.at(idx);
|
||||
}
|
||||
|
||||
inline bool exists(const std::string& sent_idx) const {
|
||||
@ -59,56 +57,51 @@ public:
|
||||
}
|
||||
|
||||
inline bool exists(int sent_idx) const {
|
||||
return (sent_idx > -1 && sent_idx < static_cast<int>(array_.size())) ? true : false;
|
||||
return (sent_idx > -1 && sent_idx < static_cast<int>(m_array.size())) ? true : false;
|
||||
}
|
||||
|
||||
inline ScoreStats& get(size_t i, size_t j) {
|
||||
return array_.at(i).get(j);
|
||||
}
|
||||
inline const ScoreStats& get(size_t i, size_t j) const {
|
||||
return array_.at(i).get(j);
|
||||
return m_array.at(i).get(j);
|
||||
}
|
||||
|
||||
inline std::string name() const {
|
||||
return score_type;
|
||||
inline const ScoreStats& get(size_t i, size_t j) const {
|
||||
return m_array.at(i).get(j);
|
||||
}
|
||||
|
||||
inline std::string name(const std::string &sctype) {
|
||||
return score_type = sctype;
|
||||
std::string name() const { return m_score_type; }
|
||||
|
||||
std::string name(const std::string &score_type) {
|
||||
return m_score_type = score_type;
|
||||
}
|
||||
|
||||
void add(ScoreArray& e);
|
||||
void add(const ScoreStats& e, const std::string& sent_idx);
|
||||
|
||||
inline size_t NumberOfScores() const {
|
||||
return number_of_scores;
|
||||
}
|
||||
inline size_t size() const {
|
||||
return array_.size();
|
||||
}
|
||||
size_t NumberOfScores() const { return m_num_scores; }
|
||||
size_t size() const { return m_array.size(); }
|
||||
|
||||
void save(const std::string &file, bool bin=false);
|
||||
void save(ofstream& outFile, bool bin=false);
|
||||
inline void save(bool bin=false) {
|
||||
save("/dev/stdout", bin);
|
||||
}
|
||||
void save(std::ostream* os, bool bin=false);
|
||||
void save(bool bin=false);
|
||||
|
||||
void load(ifstream& inFile);
|
||||
void load(std::istream* is);
|
||||
void load(const std::string &file);
|
||||
|
||||
bool check_consistency() const;
|
||||
|
||||
void setIndex();
|
||||
|
||||
inline int getIndex(const std::string& idx) const {
|
||||
name2idx::const_iterator i = arrayname2idx_.find(idx);
|
||||
if (i != arrayname2idx_.end())
|
||||
name2idx::const_iterator i = m_array_name_to_index.find(idx);
|
||||
if (i != m_array_name_to_index.end())
|
||||
return i->second;
|
||||
else
|
||||
return -1;
|
||||
}
|
||||
|
||||
inline std::string getIndex(size_t idx) const {
|
||||
idx2name::const_iterator i = idx2arrayname_.find(idx);
|
||||
if (i != idx2arrayname_.end())
|
||||
idx2name::const_iterator i = m_index_to_array_name.find(idx);
|
||||
if (i != m_index_to_array_name.end())
|
||||
throw runtime_error("there is no entry at index " + idx);
|
||||
return i->second;
|
||||
}
|
||||
|
@ -14,30 +14,30 @@ const int kAvailableSize = 8;
|
||||
} // namespace
|
||||
|
||||
ScoreStats::ScoreStats()
|
||||
: available_(kAvailableSize), entries_(0),
|
||||
array_(new ScoreStatsType[available_]) {}
|
||||
: m_available_size(kAvailableSize), m_entries(0),
|
||||
m_array(new ScoreStatsType[m_available_size]) {}
|
||||
|
||||
ScoreStats::ScoreStats(const size_t size)
|
||||
: available_(size), entries_(size),
|
||||
array_(new ScoreStatsType[available_])
|
||||
: m_available_size(size), m_entries(size),
|
||||
m_array(new ScoreStatsType[m_available_size])
|
||||
{
|
||||
memset(array_, 0, GetArraySizeWithBytes());
|
||||
memset(m_array, 0, GetArraySizeWithBytes());
|
||||
}
|
||||
|
||||
ScoreStats::~ScoreStats()
|
||||
{
|
||||
if (array_) {
|
||||
delete [] array_;
|
||||
array_ = NULL;
|
||||
if (m_array) {
|
||||
delete [] m_array;
|
||||
m_array = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
void ScoreStats::Copy(const ScoreStats &stats)
|
||||
{
|
||||
available_ = stats.available();
|
||||
entries_ = stats.size();
|
||||
array_ = new ScoreStatsType[available_];
|
||||
memcpy(array_, stats.getArray(), GetArraySizeWithBytes());
|
||||
m_available_size = stats.available();
|
||||
m_entries = stats.size();
|
||||
m_array = new ScoreStatsType[m_available_size];
|
||||
memcpy(m_array, stats.getArray(), GetArraySizeWithBytes());
|
||||
}
|
||||
|
||||
ScoreStats::ScoreStats(const ScoreStats &stats)
|
||||
@ -47,27 +47,27 @@ ScoreStats::ScoreStats(const ScoreStats &stats)
|
||||
|
||||
// Copy-assign from stats: release the buffer currently held and take a fresh
// copy of the other object's contents via Copy().
// Returns a reference to this object.
ScoreStats& ScoreStats::operator=(const ScoreStats &stats)
{
  // Self-assignment must be a no-op: deleting m_array first would leave
  // Copy() reading from the buffer that was just freed.
  if (this != &stats) {
    delete [] m_array;
    Copy(stats);
  }
  return *this;
}
|
||||
|
||||
void ScoreStats::expand()
|
||||
{
|
||||
available_ *= 2;
|
||||
scorestats_t buf = new ScoreStatsType[available_];
|
||||
memcpy(buf, array_, GetArraySizeWithBytes());
|
||||
delete [] array_;
|
||||
array_ = buf;
|
||||
m_available_size *= 2;
|
||||
scorestats_t buf = new ScoreStatsType[m_available_size];
|
||||
memcpy(buf, m_array, GetArraySizeWithBytes());
|
||||
delete [] m_array;
|
||||
m_array = buf;
|
||||
}
|
||||
|
||||
void ScoreStats::add(ScoreStatsType v)
|
||||
{
|
||||
if (isfull()) expand();
|
||||
array_[entries_++]=v;
|
||||
m_array[m_entries++]=v;
|
||||
}
|
||||
|
||||
void ScoreStats::set(const std::string& str)
|
||||
void ScoreStats::set(const string& str)
|
||||
{
|
||||
reset();
|
||||
vector<string> out;
|
||||
@ -78,46 +78,51 @@ void ScoreStats::set(const std::string& str)
|
||||
}
|
||||
}
|
||||
|
||||
void ScoreStats::loadbin(std::ifstream& inFile)
|
||||
void ScoreStats::loadbin(istream* is)
|
||||
{
|
||||
inFile.read((char*)array_, GetArraySizeWithBytes());
|
||||
is->read(reinterpret_cast<char*>(m_array),
|
||||
static_cast<streamsize>(GetArraySizeWithBytes()));
|
||||
}
|
||||
|
||||
void ScoreStats::loadtxt(std::ifstream& inFile)
|
||||
void ScoreStats::loadtxt(istream* is)
|
||||
{
|
||||
std::string theString;
|
||||
std::getline(inFile, theString);
|
||||
set(theString);
|
||||
string line;
|
||||
getline(*is, line);
|
||||
set(line);
|
||||
}
|
||||
|
||||
void ScoreStats::loadtxt(const std::string &file)
|
||||
void ScoreStats::loadtxt(const string &file)
|
||||
{
|
||||
// TRACE_ERR("loading the stats from " << file << std::endl);
|
||||
|
||||
std::ifstream inFile(file.c_str(), std::ios::in); // matches a stream with a file. Opens the file
|
||||
|
||||
loadtxt(inFile);
|
||||
ifstream ifs(file.c_str(), ios::in); // matches a stream with a file. Opens the file
|
||||
if (!ifs) {
|
||||
cerr << "Failed to open " << file << endl;
|
||||
exit(1);
|
||||
}
|
||||
istream* is = &ifs;
|
||||
loadtxt(is);
|
||||
}
|
||||
|
||||
|
||||
void ScoreStats::savetxt(const std::string &file)
|
||||
void ScoreStats::savetxt(const string &file)
|
||||
{
|
||||
// TRACE_ERR("saving the stats into " << file << std::endl);
|
||||
|
||||
std::ofstream outFile(file.c_str(), std::ios::out); // matches a stream with a file. Opens the file
|
||||
|
||||
savetxt(outFile);
|
||||
ofstream ofs(file.c_str(), ios::out); // matches a stream with a file. Opens the file
|
||||
ostream* os = &ofs;
|
||||
savetxt(os);
|
||||
}
|
||||
|
||||
|
||||
void ScoreStats::savetxt(std::ofstream& outFile)
|
||||
void ScoreStats::savetxt(ostream* os)
|
||||
{
|
||||
outFile << *this;
|
||||
*os << *this;
|
||||
}
|
||||
|
||||
void ScoreStats::savebin(std::ofstream& outFile)
|
||||
void ScoreStats::savetxt() {
|
||||
savetxt(&cout);
|
||||
}
|
||||
|
||||
void ScoreStats::savebin(ostream* os)
|
||||
{
|
||||
outFile.write((char*)array_, GetArraySizeWithBytes());
|
||||
os->write(reinterpret_cast<char*>(m_array),
|
||||
static_cast<streamsize>(GetArraySizeWithBytes()));
|
||||
}
|
||||
|
||||
ostream& operator<<(ostream& o, const ScoreStats& e)
|
||||
|
@ -22,11 +22,11 @@ using namespace std;
|
||||
class ScoreStats
|
||||
{
|
||||
private:
|
||||
size_t available_;
|
||||
size_t entries_;
|
||||
size_t m_available_size;
|
||||
size_t m_entries;
|
||||
|
||||
// TODO: Use smart pointer for exceptional-safety.
|
||||
scorestats_t array_;
|
||||
scorestats_t m_array;
|
||||
|
||||
public:
|
||||
ScoreStats();
|
||||
@ -40,31 +40,23 @@ public:
|
||||
|
||||
void Copy(const ScoreStats &stats);
|
||||
|
||||
bool isfull() const {
|
||||
return (entries_ < available_) ? 0 : 1;
|
||||
}
|
||||
bool isfull() const { return (m_entries < m_available_size) ? 0 : 1; }
|
||||
|
||||
void expand();
|
||||
void add(ScoreStatsType v);
|
||||
|
||||
void clear() {
|
||||
memset((void*)array_, 0, GetArraySizeWithBytes());
|
||||
memset((void*)m_array, 0, GetArraySizeWithBytes());
|
||||
}
|
||||
|
||||
void reset() {
|
||||
entries_ = 0;
|
||||
m_entries = 0;
|
||||
clear();
|
||||
}
|
||||
|
||||
inline ScoreStatsType get(size_t i) {
|
||||
return array_[i];
|
||||
}
|
||||
inline ScoreStatsType get(size_t i)const {
|
||||
return array_[i];
|
||||
}
|
||||
inline scorestats_t getArray() const {
|
||||
return array_;
|
||||
}
|
||||
ScoreStatsType get(size_t i) { return m_array[i]; }
|
||||
ScoreStatsType get(size_t i) const { return m_array[i]; }
|
||||
scorestats_t getArray() const { return m_array; }
|
||||
|
||||
void set(const std::string& str);
|
||||
|
||||
@ -76,31 +68,24 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
inline size_t bytes() const {
|
||||
return GetArraySizeWithBytes();
|
||||
}
|
||||
size_t bytes() const { return GetArraySizeWithBytes(); }
|
||||
|
||||
size_t GetArraySizeWithBytes() const {
|
||||
return entries_ * sizeof(ScoreStatsType);
|
||||
return m_entries * sizeof(ScoreStatsType);
|
||||
}
|
||||
|
||||
inline size_t size() const {
|
||||
return entries_;
|
||||
}
|
||||
inline size_t available() const {
|
||||
return available_;
|
||||
}
|
||||
size_t size() const { return m_entries; }
|
||||
|
||||
size_t available() const { return m_available_size; }
|
||||
|
||||
void savetxt(const std::string &file);
|
||||
void savetxt(ofstream& outFile);
|
||||
void savebin(ofstream& outFile);
|
||||
inline void savetxt() {
|
||||
savetxt("/dev/stdout");
|
||||
}
|
||||
void savetxt(ostream* os);
|
||||
void savebin(ostream* os);
|
||||
void savetxt();
|
||||
|
||||
void loadtxt(const std::string &file);
|
||||
void loadtxt(ifstream& inFile);
|
||||
void loadbin(ifstream& inFile);
|
||||
void loadtxt(istream* is);
|
||||
void loadbin(istream* is);
|
||||
|
||||
/**
|
||||
* Write the whole object to a stream.
|
||||
|
@ -55,7 +55,7 @@ void EvaluatorUtil::evaluate(const string& candFile, int bootstrap)
|
||||
for (int i = 0; i < bootstrap; ++i)
|
||||
{
|
||||
// TODO: Use smart pointer for exceptional-safety.
|
||||
ScoreData* scoredata = new ScoreData(*g_scorer);
|
||||
ScoreData* scoredata = new ScoreData(g_scorer);
|
||||
for (int j = 0; j < n; ++j)
|
||||
{
|
||||
int randomIndex = random() % n;
|
||||
@ -89,7 +89,7 @@ void EvaluatorUtil::evaluate(const string& candFile, int bootstrap)
|
||||
else
|
||||
{
|
||||
// TODO: Use smart pointer for exceptional-safety.
|
||||
ScoreData* scoredata = new ScoreData(*g_scorer);
|
||||
ScoreData* scoredata = new ScoreData(g_scorer);
|
||||
for (int sid = 0; sid < n; ++sid)
|
||||
{
|
||||
string str_sid = int2string(sid);
|
||||
|
@ -197,7 +197,7 @@ int main(int argc, char** argv)
|
||||
|
||||
PrintUserTime("References loaded");
|
||||
|
||||
Data data(*scorer);
|
||||
Data data(scorer.get());
|
||||
|
||||
// load old data
|
||||
for (size_t i = 0; i < prevScoreDataFiles.size(); i++) {
|
||||
@ -208,13 +208,13 @@ int main(int argc, char** argv)
|
||||
|
||||
// computing score statistics of each nbest file
|
||||
for (size_t i = 0; i < nbestFiles.size(); i++) {
|
||||
data.loadnbest(nbestFiles.at(i));
|
||||
data.loadNBest(nbestFiles.at(i));
|
||||
}
|
||||
|
||||
PrintUserTime("Nbest entries loaded and scored");
|
||||
|
||||
//ADDED_BY_TS
|
||||
data.remove_duplicates();
|
||||
data.removeDuplicates();
|
||||
//END_ADDED
|
||||
|
||||
data.save(option.featureDataFile, option.scoreDataFile, option.binmode);
|
||||
|
@ -338,7 +338,7 @@ int main(int argc, char **argv)
|
||||
ScorerFactory::getScorer(option.scorer_type, option.scorer_config));
|
||||
|
||||
//load data
|
||||
Data data(*scorer);
|
||||
Data data(scorer.get());
|
||||
|
||||
for (size_t i = 0; i < ScoreDataFiles.size(); i++) {
|
||||
cerr<<"Loading Data from: "<< ScoreDataFiles.at(i) << " and " << FeatureDataFiles.at(i) << endl;
|
||||
@ -348,7 +348,7 @@ int main(int argc, char **argv)
|
||||
scorer->setScoreData(data.getScoreData().get());
|
||||
|
||||
//ADDED_BY_TS
|
||||
data.remove_duplicates();
|
||||
data.removeDuplicates();
|
||||
//END_ADDED
|
||||
|
||||
PrintUserTime("Data loaded");
|
||||
@ -434,7 +434,7 @@ int main(int argc, char **argv)
|
||||
vector<OptimizationTask*>& tasks = allTasks[i];
|
||||
Optimizer *optimizer = OptimizerFactory::BuildOptimizer(option.pdim, to_optimize, start_list[0], option.optimize_type, option.nrandom);
|
||||
optimizer->SetScorer(data_ref.getScorer());
|
||||
optimizer->SetFData(data_ref.getFeatureData());
|
||||
optimizer->SetFeatureData(data_ref.getFeatureData());
|
||||
// A task for each start point
|
||||
for (size_t j = 0; j < startingPoints.size(); ++j) {
|
||||
OptimizationTask* task = new OptimizationTask(optimizer, startingPoints[j]);
|
||||
|
109
mert/pro.cpp
109
mert/pro.cpp
@ -21,8 +21,8 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
***********************************************************************/
|
||||
|
||||
|
||||
/**
|
||||
* This is part of the PRO implementation. It converts the features and scores
|
||||
/**
|
||||
* This is part of the PRO implementation. It converts the features and scores
|
||||
* files into a form suitable for input into the megam maxent trainer.
|
||||
*
|
||||
* For details of PRO, refer to Hopkins & May (EMNLP 2011)
|
||||
@ -34,9 +34,11 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <utility>
|
||||
|
||||
#include <boost/program_options.hpp>
|
||||
|
||||
#include "BleuScorer.h"
|
||||
#include "FeatureDataIterator.h"
|
||||
#include "ScoreDataIterator.h"
|
||||
|
||||
@ -46,49 +48,49 @@ namespace po = boost::program_options;
|
||||
|
||||
class SampledPair {
|
||||
private:
|
||||
pair<size_t,size_t> translation1;
|
||||
pair<size_t,size_t> translation2;
|
||||
float scoreDiff;
|
||||
pair<size_t,size_t> m_translation1;
|
||||
pair<size_t,size_t> m_translation2;
|
||||
float m_score_diff;
|
||||
|
||||
public:
|
||||
SampledPair(const pair<size_t,size_t>& t1, const pair<size_t,size_t>& t2, float diff ) {
|
||||
if (diff > 0) {
|
||||
translation1 = t1;
|
||||
translation2 = t2;
|
||||
scoreDiff = diff;
|
||||
}
|
||||
else {
|
||||
translation1 = t2;
|
||||
translation2 = t1;
|
||||
scoreDiff = -diff;
|
||||
}
|
||||
}
|
||||
float getDiff() const { return scoreDiff; }
|
||||
const pair<size_t,size_t>& getTranslation1() const { return translation1; }
|
||||
const pair<size_t,size_t>& getTranslation2() const { return translation2; }
|
||||
SampledPair(const pair<size_t,size_t>& t1, const pair<size_t,size_t>& t2, float diff ) {
|
||||
if (diff > 0) {
|
||||
m_translation1 = t1;
|
||||
m_translation2 = t2;
|
||||
m_score_diff = diff;
|
||||
} else {
|
||||
m_translation1 = t2;
|
||||
m_translation2 = t1;
|
||||
m_score_diff = -diff;
|
||||
}
|
||||
}
|
||||
|
||||
float getDiff() const { return m_score_diff; }
|
||||
const pair<size_t,size_t>& getTranslation1() const { return m_translation1; }
|
||||
const pair<size_t,size_t>& getTranslation2() const { return m_translation2; }
|
||||
};
|
||||
|
||||
|
||||
static float sentenceLevelBleuPlusOne(const vector<float>& stats) {
|
||||
float logbleu = 0.0;
|
||||
const unsigned int bleu_order = 4;
|
||||
for (unsigned int j=0; j<bleu_order; j++) {
|
||||
//cerr << (stats.get(2*j)+1) << "/" << (stats.get(2*j+1)+1) << " ";
|
||||
logbleu += log(stats[2*j]+1) - log(stats[2*j+1]+1);
|
||||
}
|
||||
logbleu /= bleu_order;
|
||||
const float brevity = 1.0 - static_cast<float>(stats[(bleu_order*2)]) / stats[1];
|
||||
if (brevity < 0.0) {
|
||||
logbleu += brevity;
|
||||
}
|
||||
//cerr << brevity << " -> " << exp(logbleu) << endl;
|
||||
return exp(logbleu);
|
||||
float logbleu = 0.0;
|
||||
for (unsigned int j=0; j<kBleuNgramOrder; j++) {
|
||||
//cerr << (stats.get(2*j)+1) << "/" << (stats.get(2*j+1)+1) << " ";
|
||||
logbleu += log(stats[2*j]+1) - log(stats[2*j+1]+1);
|
||||
}
|
||||
logbleu /= kBleuNgramOrder;
|
||||
const float brevity = 1.0 - static_cast<float>(stats[(kBleuNgramOrder * 2)]) / stats[1];
|
||||
if (brevity < 0.0) {
|
||||
logbleu += brevity;
|
||||
}
|
||||
//cerr << brevity << " -> " << exp(logbleu) << endl;
|
||||
return exp(logbleu);
|
||||
}
|
||||
|
||||
static void outputSample(ostream& out, const FeatureDataItem& f1, const FeatureDataItem& f2) {
|
||||
// difference in score in regular features
|
||||
for(unsigned int j=0; j<f1.dense.size(); j++)
|
||||
if (abs(f1.dense[j]-f2.dense[j]) > 0.00001)
|
||||
out << " F" << j << " " << (f1.dense[j]-f2.dense[j]);
|
||||
for(unsigned int j=0; j<f1.dense.size(); j++)
|
||||
if (abs(f1.dense[j]-f2.dense[j]) > 0.00001)
|
||||
out << " F" << j << " " << (f1.dense[j]-f2.dense[j]);
|
||||
|
||||
if (f1.sparse.size() || f2.sparse.size()) {
|
||||
out << " ";
|
||||
@ -101,27 +103,27 @@ static void outputSample(ostream& out, const FeatureDataItem& f1, const FeatureD
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
int main(int argc, char** argv)
|
||||
|
||||
int main(int argc, char** argv)
|
||||
{
|
||||
bool help;
|
||||
vector<string> scoreFiles;
|
||||
vector<string> featureFiles;
|
||||
int seed;
|
||||
string outputFile;
|
||||
//TODO: options
|
||||
const unsigned int n_candidates = 5000; // Gamma, in Hopkins & May
|
||||
const unsigned int n_samples = 50; // Xi, in Hopkins & May
|
||||
const float min_diff = 0.05;
|
||||
// TODO: Add these constants to options
|
||||
const unsigned int n_candidates = 5000; // Gamma, in Hopkins & May
|
||||
const unsigned int n_samples = 50; // Xi, in Hopkins & May
|
||||
const float min_diff = 0.05;
|
||||
|
||||
po::options_description desc("Allowed options");
|
||||
desc.add_options()
|
||||
("help,h", po::value(&help)->zero_tokens()->default_value(false), "Print this help message and exit")
|
||||
("scfile,S", po::value<vector<string> >(&scoreFiles), "Scorer data files")
|
||||
("ffile,F", po::value<vector<string> > (&featureFiles), "Feature data files")
|
||||
("random-seed,r", po::value<int>(&seed), "Seed for random number generation")
|
||||
("output-file,o", po::value<string>(&outputFile), "Output file")
|
||||
;
|
||||
("help,h", po::value(&help)->zero_tokens()->default_value(false), "Print this help message and exit")
|
||||
("scfile,S", po::value<vector<string> >(&scoreFiles), "Scorer data files")
|
||||
("ffile,F", po::value<vector<string> > (&featureFiles), "Feature data files")
|
||||
("random-seed,r", po::value<int>(&seed), "Seed for random number generation")
|
||||
("output-file,o", po::value<string>(&outputFile), "Output file")
|
||||
;
|
||||
|
||||
po::options_description cmdline_options;
|
||||
cmdline_options.add(desc);
|
||||
@ -134,7 +136,7 @@ int main(int argc, char** argv)
|
||||
cout << desc << endl;
|
||||
exit(0);
|
||||
}
|
||||
|
||||
|
||||
if (vm.count("random-seed")) {
|
||||
cerr << "Initialising random seed to " << seed << endl;
|
||||
srand(seed);
|
||||
@ -167,7 +169,7 @@ int main(int argc, char** argv)
|
||||
out = &cout;
|
||||
}
|
||||
|
||||
|
||||
|
||||
vector<FeatureDataIterator> featureDataIters;
|
||||
vector<ScoreDataIterator> scoreDataIters;
|
||||
for (size_t i = 0; i < featureFiles.size(); ++i) {
|
||||
@ -179,7 +181,7 @@ int main(int argc, char** argv)
|
||||
size_t sentenceId = 0;
|
||||
while(1) {
|
||||
vector<pair<size_t,size_t> > hypotheses;
|
||||
//TODO: de-duping. Collect hashes of score,feature pairs and
|
||||
//TODO: de-duping. Collect hashes of score,feature pairs and
|
||||
//only add index if it's unique.
|
||||
if (featureDataIters[0] == FeatureDataIterator::end()) {
|
||||
break;
|
||||
@ -214,7 +216,7 @@ int main(int argc, char** argv)
|
||||
size_t rand2 = rand() % n_translations;
|
||||
pair<size_t,size_t> translation2 = hypotheses[rand2];
|
||||
float bleu2 = sentenceLevelBleuPlusOne(scoreDataIters[translation2.first]->operator[](translation2.second));
|
||||
|
||||
|
||||
/*
|
||||
cerr << "t(" << translation1.first << "," << translation1.second << ") = " << bleu1 <<
|
||||
" t(" << translation2.first << "," << translation2.second << ") = " <<
|
||||
@ -222,7 +224,7 @@ int main(int argc, char** argv)
|
||||
*/
|
||||
if (abs(bleu1-bleu2) < min_diff)
|
||||
continue;
|
||||
|
||||
|
||||
samples.push_back(SampledPair(translation1, translation2, bleu1-bleu2));
|
||||
scores.push_back(1.0-abs(bleu1-bleu2));
|
||||
}
|
||||
@ -261,4 +263,3 @@ int main(int argc, char** argv)
|
||||
outFile.close();
|
||||
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user