commit 753eebd959
parent 8327cce73b
Hieu Hoang, 2011-12-12 20:48:42 +07:00
11 changed files with 169 additions and 3 deletions

View File

@@ -47,6 +47,99 @@ Data::~Data() {
  }
}

//ADDED BY TS
void Data::remove_duplicates() {

  uint nSentences = featdata->size();
  assert(scoredata->size() == nSentences);

  for (uint s = 0; s < nSentences; s++) {

    FeatureArray& feat_array = featdata->get(s);
    ScoreArray& score_array = scoredata->get(s);

    assert(feat_array.size() == score_array.size());

    //serves as a hash-map: bucket entries by the sum of their feature values
    std::map<double, std::vector<uint> > lookup;

    uint end_pos = feat_array.size() - 1;

    uint nRemoved = 0;
    for (uint k = 0; k <= end_pos; k++) {

      const FeatureStats& cur_feats = feat_array.get(k);

      double sum = 0.0;
      for (uint l = 0; l < cur_feats.size(); l++)
        sum += cur_feats.get(l);

      if (lookup.find(sum) != lookup.end()) {

        //std::cerr << "hit" << std::endl;
        std::vector<uint>& cur_list = lookup[sum];

        //compare only against earlier entries in the same bucket
        uint l = 0;
        for (l = 0; l < cur_list.size(); l++) {
          uint j = cur_list[l];

          if (cur_feats == feat_array.get(j)
              && score_array.get(k) == score_array.get(j)) {

            //duplicate found: swap it to the tail and re-examine the
            //entry that was swapped into position k
            if (k < end_pos) {
              feat_array.swap(k, end_pos);
              score_array.swap(k, end_pos);
              k--;
            }
            end_pos--;
            nRemoved++;
            break;
          }
        }
        if (l == lookup[sum].size())
          cur_list.push_back(k);
      }
      else
        lookup[sum].push_back(k);

      //previous quadratic check, kept commented out:
      // for (uint j = 0; j < k; j++) {
      //   if (feat_array.get(k) == feat_array.get(j)
      //       && score_array.get(k) == score_array.get(j)) {
      //     if (k < end_pos) {
      //       feat_array.swap(k, end_pos);
      //       score_array.swap(k, end_pos);
      //       k--;
      //     }
      //     end_pos--;
      //     nRemoved++;
      //     break;
      //   }
      // }
    }

    std::cerr << "removed " << nRemoved << "/" << feat_array.size() << std::endl;

    if (nRemoved > 0) {
      feat_array.resize(end_pos + 1);
      score_array.resize(end_pos + 1);
    }
  }
}
//END_ADDED

void Data::loadnbest(const std::string &file)
{
  TRACE_ERR("loading nbest from " << file << std::endl);
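
The function above removes duplicate n-best entries per sentence with a cheap bucketing trick: candidates are first grouped by the sum of their feature values, only candidates in the same bucket are compared field by field, confirmed duplicates are swapped to the tail, and the arrays are truncated once at the end. A minimal, self-contained sketch of the same swap-to-end idea on plain std::vector rows (illustrative only; the names and types below are not from the Moses code):

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <map>
#include <vector>

// Remove duplicate rows from `rows` (order of survivors is not preserved).
// Rows are bucketed by the sum of their entries -- a cheap hash -- and only
// rows within the same bucket are compared element-wise, mirroring the
// `lookup` map in Data::remove_duplicates() above.
static void remove_duplicate_rows(std::vector<std::vector<double> > &rows) {
  std::map<double, std::vector<std::size_t> > buckets; // sum -> indices of kept rows
  std::size_t end = rows.size();                       // rows[end..) hold duplicates
  for (std::size_t k = 0; k < end; ++k) {
    double sum = 0.0;
    for (std::size_t l = 0; l < rows[k].size(); ++l)
      sum += rows[k][l];
    std::vector<std::size_t> &bucket = buckets[sum];
    bool dup = false;
    for (std::size_t l = 0; l < bucket.size(); ++l) {
      if (rows[bucket[l]] == rows[k]) { dup = true; break; }
    }
    if (dup) {
      std::swap(rows[k], rows[--end]); // move the duplicate to the tail
      --k;                             // re-examine the row swapped into k
    } else {
      bucket.push_back(k);
    }
  }
  rows.resize(end);                    // drop the tail in one shot
}

int main() {
  std::vector<std::vector<double> > rows;
  double a[] = {1.0, 2.0};
  double b[] = {3.0, 4.0};
  rows.push_back(std::vector<double>(a, a + 2));
  rows.push_back(std::vector<double>(b, b + 2));
  rows.push_back(std::vector<double>(a, a + 2)); // duplicate of row 0
  remove_duplicate_rows(rows);
  std::cout << "kept " << rows.size() << " of 3 rows" << std::endl; // kept 2
  return 0;
}

As in the commit, rows are compared with exact floating-point equality, which is reasonable when duplicates come from identical n-best entries rather than from numerically close ones.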

View File

@@ -73,7 +73,7 @@ public:
  void mergeSparseFeatures();

  void loadnbest(const std::string &file);

  void load(const std::string &featfile,const std::string &scorefile) {
    featdata->load(featfile);
    scoredata->load(scorefile);
@@ -81,6 +81,10 @@ public:
    _sparse_flag = true;
  }

  //ADDED BY TS
  void remove_duplicates();
  //END_ADDED

  void save(const std::string &featfile,const std::string &scorefile, bool bin=false) {
    if (bin) cerr << "Binary write mode is selected" << endl;
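
With the declaration in place, deduplication slots in between loading and saving. A hedged usage sketch (the surrounding driver and file names are hypothetical; only load(), remove_duplicates() and save() are taken from the header above, and Data's construction is not shown in this diff):

// Hypothetical driver: deduplicate previously extracted feature/score files.
void deduplicate(Data &data) {
  data.load("features.dat", "scores.dat");     // read previously extracted data
  data.remove_duplicates();                    // drop repeated (feature, score) pairs per sentence
  data.save("features.dedup.dat", "scores.dedup.dat", false);  // text mode
}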

View File

@@ -63,6 +63,16 @@ public:
    array_.push_back(e);
  }

  //ADDED BY TS
  void swap(size_t i, size_t j) {
    std::swap(array_[i],array_[j]);
  }

  //shrink only: the array is never grown past its current size
  void resize(size_t new_size) {
    array_.resize(std::min(new_size,array_.size()));
  }
  //END_ADDED

  void merge(FeatureArray& e);

  inline size_t size() const {
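
The two helpers only shuffle and shrink the underlying vector: swap() exchanges two entries, and resize() clamps to the current size, so it can truncate after deduplication but never grow the array. A small stand-alone illustration of that clamping behaviour (plain std::vector, not the Moses classes):

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <vector>

int main() {
  std::vector<int> v;
  for (int i = 0; i < 5; ++i) v.push_back(i);   // v = {0, 1, 2, 3, 4}

  std::swap(v[1], v[4]);                        // v = {0, 4, 2, 3, 1}

  // Clamped resize, as in the helpers above: shrinking is allowed, growing is not.
  v.resize(std::min<std::size_t>(3, v.size())); // v now has 3 entries
  v.resize(std::min<std::size_t>(9, v.size())); // no-op: never grows
  assert(v.size() == 3);
  return 0;
}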

View File

@@ -218,3 +218,19 @@ ostream& operator<<(ostream& o, const FeatureStats& e)
  return o;
}

//ADDED_BY_TS
//element-wise (exact) comparison of two feature vectors
bool operator==(const FeatureStats& f1, const FeatureStats& f2) {

  size_t size = f1.size();

  if (size != f2.size())
    return false;

  for (size_t k = 0; k < size; k++) {
    if (f1.get(k) != f2.get(k))
      return false;
  }

  return true;
}
//END_ADDED

View File

@@ -134,4 +134,8 @@ public:
  friend ostream& operator<<(ostream& o, const FeatureStats& e);
};

//ADDED_BY_TS
bool operator==(const FeatureStats& f1, const FeatureStats& f2);
//END_ADDED

#endif // FEATURE_STATS_H

View File

@@ -62,6 +62,16 @@ public:
    array_.push_back(e);
  }

  //ADDED BY TS
  void swap(size_t i, size_t j) {
    std::swap(array_[i],array_[j]);
  }

  void resize(size_t new_size) {
    array_.resize(std::min(new_size,array_.size()));
  }
  //END_ADDED

  void merge(ScoreArray& e);

  inline std::string name() const {

View File

@@ -132,3 +132,19 @@ ostream& operator<<(ostream& o, const ScoreStats& e)
    o << e.get(i) << " ";
  return o;
}

//ADDED_BY_TS
bool operator==(const ScoreStats& s1, const ScoreStats& s2) {

  size_t size = s1.size();

  if (size != s2.size())
    return false;

  for (size_t k = 0; k < size; k++) {
    if (s1.get(k) != s2.get(k))
      return false;
  }

  return true;
}
//END_ADDED

View File

@@ -100,4 +100,8 @@ public:
  friend ostream& operator<<(ostream& o, const ScoreStats& e);
};

//ADDED_BY_TS
bool operator==(const ScoreStats& s1, const ScoreStats& s2);
//END_ADDED

#endif // SCORE_STATS_H

View File

@@ -182,6 +182,10 @@ int main(int argc, char** argv)
  PrintUserTime("Nbest entries loaded and scored");

  //ADDED_BY_TS
  data.remove_duplicates();
  //END_ADDED

  if (binmode)
    cerr << "Binary write mode is selected" << endl;
  else

View File

@@ -1,5 +1,5 @@
/**
 * \description The is the main for the new version of the mert algorithm developed during the 2nd MT marathon
 * \description This is the main for the new version of the mert algorithm developed during the 2nd MT marathon
 */
#include <limits>
@@ -260,6 +260,7 @@ int main (int argc, char **argv)
  if(j<pdim) {
    cerr<<initfile<<":Too few minimum weights." << endl;
    cerr<<"error could not initialize start point with " << initfile << endl;
    std::cerr << "j: " << j << ", pdim: " << pdim << std::endl;
    exit(3);
  }
  max.resize(pdim);
@@ -297,6 +298,10 @@ int main (int argc, char **argv)
    D.load(FeatureDataFiles.at(i), ScoreDataFiles.at(i));
  }

  //ADDED_BY_TS
  D.remove_duplicates();
  //END_ADDED

  PrintUserTime("Data loaded");

  // starting point score over latest n-best, accumulative n-best

View File

@@ -3,7 +3,7 @@
# Compatible with sri LM-creating script, eg.
# ngram-count -order 5 -interpolate -wbdiscount -unk -text corpus.txt -lm lm.txt
# To use it in the EMS, add this to the [LM] section
# lm-training = "$moses-script-dir/generic/trainlm.irst.perl -cores $cores -irst-dir $irst-dir"
# lm-training = "$moses-script-dir/generic/trainlm-irst.perl -cores $cores -irst-dir $irst-dir"
# settings = ""
# Also, make sure that $irst-dir is defined (in the [LM] or [GENERAL] section).
# It should point to the root of the LM toolkit, eg