Merge branch 'master' of github.com:moses-smt/mosesdecoder
This commit is contained in:
commit c85b96a6c6

.gitignore (vendored)
@ -17,6 +17,7 @@ mert/extractor
 mert/mert
 mert/megam_i686.opt
 mert/pro
+mert/kbmira
 misc/processLexicalTable
 misc/processPhraseTable
 misc/queryLexicalTable

mert/BleuScorer.cpp
@ -232,3 +232,44 @@ float sentenceLevelBleuPlusOne(const vector<float>& stats) {
   }
   return exp(logbleu);
 }
+
+float sentenceLevelBackgroundBleu(const std::vector<float>& sent, const std::vector<float>& bg)
+{
+  // Sum sent and background
+  std::vector<float> stats;
+  CHECK(sent.size()==bg.size());
+  CHECK(sent.size()==kBleuNgramOrder*2+1);
+  for(size_t i=0;i<sent.size();i++)
+    stats.push_back(sent[i]+bg[i]);
+
+  // Calculate BLEU
+  float logbleu = 0.0;
+  for (int j = 0; j < kBleuNgramOrder; j++) {
+    logbleu += log(stats[2 * j]) - log(stats[2 * j + 1]);
+  }
+  logbleu /= kBleuNgramOrder;
+  const float brevity = 1.0 - stats[(kBleuNgramOrder * 2)] / stats[1];
+
+  if (brevity < 0.0) {
+    logbleu += brevity;
+  }
+
+  // Exponentiate and scale by reference length (as per Chiang et al 08)
+  return exp(logbleu) * stats[kBleuNgramOrder*2];
+}
+
+float unsmoothedBleu(const std::vector<float>& stats) {
+  CHECK(stats.size() == kBleuNgramOrder * 2 + 1);
+
+  float logbleu = 0.0;
+  for (int j = 0; j < kBleuNgramOrder; j++) {
+    logbleu += log(stats[2 * j]) - log(stats[2 * j + 1]);
+  }
+  logbleu /= kBleuNgramOrder;
+  const float brevity = 1.0 - stats[(kBleuNgramOrder * 2)] / stats[1];
+
+  if (brevity < 0.0) {
+    logbleu += brevity;
+  }
+  return exp(logbleu);
+}
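Both functions above read the same stats layout: kBleuNgramOrder pairs of (matched n-grams, hypothesis n-grams), followed by the reference length, for 2*kBleuNgramOrder+1 entries in total. A standalone sketch of that layout and the unsmoothed computation on a toy stats vector (illustration only, not part of the commit; it assumes kBleuNgramOrder is 4, as in BleuScorer):

// Standalone illustration of the BLEU stats layout used above.
// Assumes kBleuNgramOrder == 4: stats = [match_1, total_1, ..., match_4, total_4, ref_len].
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
  const int kBleuNgramOrder = 4;
  // Toy counts: 5/6 unigrams, 4/5 bigrams, 3/4 trigrams, 2/3 4-grams; reference length 6.
  const float s[] = {5, 6, 4, 5, 3, 4, 2, 3, 6};
  std::vector<float> stats(s, s + 9);

  float logbleu = 0.0;
  for (int j = 0; j < kBleuNgramOrder; j++)
    logbleu += std::log(stats[2 * j]) - std::log(stats[2 * j + 1]);
  logbleu /= kBleuNgramOrder;
  // Brevity penalty: reference length (stats[8]) against the hypothesis
  // length, which is the unigram total stats[1].
  const float brevity = 1.0 - stats[kBleuNgramOrder * 2] / stats[1];
  if (brevity < 0.0) logbleu += brevity;
  std::printf("BLEU = %f\n", std::exp(logbleu));
  return 0;
}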
mert/BleuScorer.h

@ -70,4 +70,14 @@ private:
  */
 float sentenceLevelBleuPlusOne(const std::vector<float>& stats);
 
+/** Computes sentence-level BLEU score given a background corpus.
+ *  This function is used in batch MIRA.
+ */
+float sentenceLevelBackgroundBleu(const std::vector<float>& sent, const std::vector<float>& bg);
+
+/**
+ * Computes plain old BLEU from a vector of stats
+ */
+float unsmoothedBleu(const std::vector<float>& stats);
+
 #endif // MERT_BLEU_SCORER_H_
mert/BleuScorerTest.cpp

@ -152,10 +152,10 @@ BOOST_AUTO_TEST_CASE(bleu_count_ngrams) {
   // "girl with a telescope", "with a telescope ."
   NgramCounts counts;
   BOOST_REQUIRE(scorer.CountNgrams(line, counts, kBleuNgramOrder) == 8);
-  BOOST_CHECK_EQUAL(25, counts.size());
+  BOOST_CHECK_EQUAL((std::size_t)25, counts.size());
 
   mert::Vocabulary* vocab = scorer.GetVocab();
-  BOOST_CHECK_EQUAL(7, vocab->size());
+  BOOST_CHECK_EQUAL((std::size_t)7, vocab->size());
 
   std::vector<std::string> res;
   Tokenize(line.c_str(), ' ', &res);
@ -203,7 +203,7 @@ BOOST_AUTO_TEST_CASE(bleu_clipped_counts) {
   ScoreStats entry;
   scorer.prepareStats(0, line, entry);
 
-  BOOST_CHECK_EQUAL(entry.size(), 2 * kBleuNgramOrder + 1);
+  BOOST_CHECK_EQUAL(entry.size(), (std::size_t)(2 * kBleuNgramOrder + 1));
 
   // Test hypothesis ngram counts
   BOOST_CHECK_EQUAL(entry.get(0), 5); // unigram
mert/DataTest.cpp

@ -33,8 +33,8 @@ BOOST_AUTO_TEST_CASE(shard_basic) {
   std::vector<Data> shards;
   data.createShards(2,0,"",shards);
 
-  BOOST_CHECK_EQUAL(shards.size(),2);
-  BOOST_CHECK_EQUAL(shards[1].getFeatureData()->size(),2);
+  BOOST_CHECK_EQUAL(shards.size(),(std::size_t)2);
+  BOOST_CHECK_EQUAL(shards[1].getFeatureData()->size(),(std::size_t)2);
 }
 
 BOOST_AUTO_TEST_CASE(init_feature_map_test) {
mert/FeatureDataIterator.cpp

@ -18,6 +18,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ***********************************************************************/
 #include <iostream>
 #include <sstream>
+#include <boost/functional/hash.hpp>
 
 #include "util/tokenize_piece.hh"
 
@ -47,6 +48,16 @@ float ParseFloat(const StringPiece& str) {
   return value;
 }
 
+bool operator==(FeatureDataItem const& item1, FeatureDataItem const& item2) {
+  return item1.dense==item2.dense && item1.sparse==item2.sparse;
+}
+
+size_t hash_value(FeatureDataItem const& item) {
+  size_t seed = 0;
+  boost::hash_combine(seed,item.dense);
+  boost::hash_combine(seed,item.sparse);
+  return seed;
+}
+
 FeatureDataIterator::FeatureDataIterator() {}
mert/FeatureDataIterator.h

@ -61,6 +61,9 @@ class FeatureDataItem
   SparseVector sparse;
 };
 
+bool operator==(FeatureDataItem const& item1, FeatureDataItem const& item2);
+std::size_t hash_value(FeatureDataItem const& item);
+
 class FeatureDataIterator :
   public boost::iterator_facade<FeatureDataIterator,
                                 const std::vector<FeatureDataItem>,
mert/FeatureDataTest.cpp

@ -13,7 +13,7 @@ void CheckFeatureMap(const FeatureData* feature_data,
     std::stringstream ss;
     ss << str << "_" << i;
     const std::string& s = ss.str();
-    BOOST_CHECK_EQUAL(feature_data->getFeatureIndex(s), *cnt);
+    BOOST_CHECK_EQUAL(feature_data->getFeatureIndex(s), (std::size_t)(*cnt));
     BOOST_CHECK_EQUAL(feature_data->getFeatureName(*cnt).c_str(), s);
     ++(*cnt);
   }
@ -35,6 +35,6 @@ BOOST_AUTO_TEST_CASE(set_feature_map) {
   CheckFeatureMap(&feature_data, "lm", 2, &cnt);
   CheckFeatureMap(&feature_data, "tm", 5, &cnt);
 
-  BOOST_CHECK_EQUAL(feature_data.getFeatureIndex("w_0"), cnt);
+  BOOST_CHECK_EQUAL(feature_data.getFeatureIndex("w_0"), (std::size_t)cnt);
   BOOST_CHECK_EQUAL(feature_data.getFeatureName(cnt).c_str(), "w_0");
 }
mert/FeatureStats.cpp

@ -10,6 +10,8 @@
 
 #include <fstream>
+#include <cmath>
+#include <boost/functional/hash.hpp>
 
 #include "Util.h"
 
 using namespace std;
@ -81,6 +83,43 @@ SparseVector operator-(const SparseVector& lhs, const SparseVector& rhs) {
   return res;
 }
 
+std::vector<std::size_t> SparseVector::feats() const {
+  std::vector<std::size_t> toRet;
+  for(fvector_t::const_iterator iter = m_fvector.begin();
+      iter!=m_fvector.end();
+      iter++) {
+    toRet.push_back(iter->first);
+  }
+  return toRet;
+}
+
+std::size_t SparseVector::encode(const std::string& name) {
+  name2id_t::const_iterator name2id_iter = m_name_to_id.find(name);
+  size_t id = 0;
+  if (name2id_iter == m_name_to_id.end()) {
+    id = m_id_to_name.size();
+    m_id_to_name.push_back(name);
+    m_name_to_id[name] = id;
+  } else {
+    id = name2id_iter->second;
+  }
+  return id;
+}
+
+std::string SparseVector::decode(std::size_t id) {
+  return m_id_to_name[id];
+}
+
+bool operator==(SparseVector const& item1, SparseVector const& item2) {
+  return item1.m_fvector==item2.m_fvector;
+}
+
+std::size_t hash_value(SparseVector const& item) {
+  boost::hash<SparseVector::fvector_t> hasher;
+  return hasher(item.m_fvector);
+}
+
 FeatureStats::FeatureStats()
   : m_available_size(kAvailableSize), m_entries(0),
     m_array(new FeatureStatsType[m_available_size]) {}
mert/FeatureStats.h

@ -28,11 +28,19 @@ public:
   void set(const std::string& name, FeatureStatsType value);
   void clear();
   std::size_t size() const { return m_fvector.size(); }
 
   void write(std::ostream& out, const std::string& sep = " ") const;
 
   SparseVector& operator-=(const SparseVector& rhs);
 
+  // Added by cherryc
+  std::vector<std::size_t> feats() const;
+  friend bool operator==(SparseVector const& item1, SparseVector const& item2);
+  friend std::size_t hash_value(SparseVector const& item);
+  static std::size_t encode(const std::string& feat);
+  static std::string decode(std::size_t feat);
+  // End added by cherryc
+
 private:
+  static name2id_t m_name_to_id;
+  static id2name_t m_id_to_name;
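encode and decode implement a global string-interning table: feature names map to dense integer ids in first-seen order, and decode is an index into the reverse vector. A minimal standalone sketch of the same scheme (illustration only; Moses' SparseVector keeps these tables as static members shared by all instances):

// Minimal sketch of the name<->id interning behind SparseVector::encode/decode.
#include <cassert>
#include <map>
#include <string>
#include <vector>

static std::map<std::string, size_t> name_to_id;
static std::vector<std::string> id_to_name;

size_t encode(const std::string& name) {
  std::map<std::string, size_t>::const_iterator it = name_to_id.find(name);
  if (it != name_to_id.end()) return it->second;
  size_t id = id_to_name.size();   // ids are dense, assigned in first-seen order
  id_to_name.push_back(name);
  name_to_id[name] = id;
  return id;
}

std::string decode(size_t id) { return id_to_name[id]; }

int main() {
  size_t a = encode("tm_pt_0");
  size_t b = encode("lm_0");
  assert(encode("tm_pt_0") == a);   // re-encoding an existing name is stable
  assert(decode(b) == "lm_0");
  return 0;
}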
mert/HypPackEnumerator.cpp (new file, 187 lines)

@ -0,0 +1,187 @@
#include "HypPackEnumerator.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <algorithm>
|
||||
#include <boost/unordered_set.hpp>
|
||||
|
||||
StreamingHypPackEnumerator::StreamingHypPackEnumerator
|
||||
(
|
||||
vector<std::string> const& featureFiles,
|
||||
vector<std::string> const& scoreFiles
|
||||
)
|
||||
: m_featureFiles(featureFiles),
|
||||
m_scoreFiles(scoreFiles)
|
||||
{
|
||||
if (scoreFiles.size() == 0 || featureFiles.size() == 0) {
|
||||
cerr << "No data to process" << endl;
|
||||
exit(0);
|
||||
}
|
||||
|
||||
if (featureFiles.size() != scoreFiles.size()) {
|
||||
cerr << "Error: Number of feature files (" << featureFiles.size() <<
|
||||
") does not match number of score files (" << scoreFiles.size() << ")" << endl;
|
||||
exit(1);
|
||||
}
|
||||
|
||||
m_num_lists = scoreFiles.size();
|
||||
m_primed = false;
|
||||
m_iNumDense = -1;
|
||||
}
|
||||
|
||||
size_t StreamingHypPackEnumerator::num_dense() const {
|
||||
if(m_iNumDense<0) {
|
||||
cerr << "Error: Requested num_dense() for an unprimed StreamingHypPackEnumerator" << endl;
|
||||
exit(1);
|
||||
}
|
||||
return (size_t) m_iNumDense;
|
||||
}
|
||||
|
||||
void StreamingHypPackEnumerator::prime(){
|
||||
m_current_indexes.clear();
|
||||
boost::unordered_set<FeatureDataItem> seen;
|
||||
m_primed = true;
|
||||
|
||||
for (size_t i = 0; i < m_num_lists; ++i) {
|
||||
if (m_featureDataIters[i] == FeatureDataIterator::end()) {
|
||||
cerr << "Error: Feature file " << i << " ended prematurely" << endl;
|
||||
exit(1);
|
||||
}
|
||||
if (m_scoreDataIters[i] == ScoreDataIterator::end()) {
|
||||
cerr << "Error: Score file " << i << " ended prematurely" << endl;
|
||||
exit(1);
|
||||
}
|
||||
if (m_featureDataIters[i]->size() != m_scoreDataIters[i]->size()) {
|
||||
cerr << "Error: For sentence " << m_sentenceId << " features and scores have different size" << endl;
|
||||
exit(1);
|
||||
}
|
||||
for (size_t j = 0; j < m_featureDataIters[i]->size(); ++j) {
|
||||
FeatureDataItem item = m_featureDataIters[i]->operator[](j);
|
||||
// Dedup
|
||||
if(seen.find(item)==seen.end()) {
|
||||
seen.insert(item);
|
||||
// Confirm dense features are always the same
|
||||
int iDense = item.dense.size();
|
||||
if(m_iNumDense != iDense) {
|
||||
if(m_iNumDense==-1) m_iNumDense = iDense;
|
||||
else {
|
||||
cerr << "Error: expecting constant number of dense features: "
|
||||
<< m_iNumDense << " != " << iDense << endl;
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
// Store item for retrieval
|
||||
m_current_indexes.push_back(pair<size_t,size_t>(i,j));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void StreamingHypPackEnumerator::reset(){
|
||||
m_featureDataIters.clear();
|
||||
m_scoreDataIters.clear();
|
||||
for (size_t i = 0; i < m_num_lists; ++i) {
|
||||
m_featureDataIters.push_back(FeatureDataIterator(m_featureFiles[i]));
|
||||
m_scoreDataIters.push_back(ScoreDataIterator(m_scoreFiles[i]));
|
||||
}
|
||||
m_sentenceId=0;
|
||||
prime();
|
||||
}
|
||||
|
||||
bool StreamingHypPackEnumerator::finished(){
|
||||
return m_featureDataIters[0]==FeatureDataIterator::end();
|
||||
}
|
||||
|
||||
void StreamingHypPackEnumerator::next(){
|
||||
if(!m_primed) {
|
||||
cerr << "Enumerating an unprimed HypPackEnumerator" << endl;
|
||||
exit(1);
|
||||
}
|
||||
for (size_t i = 0; i < m_num_lists; ++i) {
|
||||
++m_featureDataIters[i];
|
||||
++m_scoreDataIters[i];
|
||||
}
|
||||
m_sentenceId++;
|
||||
if(!finished()) prime();
|
||||
}
|
||||
|
||||
size_t StreamingHypPackEnumerator::cur_size(){
|
||||
if(!m_primed) {
|
||||
cerr << "Querying size from an unprimed HypPackEnumerator" << endl;
|
||||
exit(1);
|
||||
}
|
||||
return m_current_indexes.size();
|
||||
}
|
||||
|
||||
const FeatureDataItem& StreamingHypPackEnumerator::featuresAt(size_t index){
|
||||
if(!m_primed) {
|
||||
cerr << "Querying features from an unprimed HypPackEnumerator" << endl;
|
||||
exit(1);
|
||||
}
|
||||
const pair<size_t,size_t>& pij = m_current_indexes[index];
|
||||
return m_featureDataIters[pij.first]->operator[](pij.second);
|
||||
}
|
||||
|
||||
const ScoreDataItem& StreamingHypPackEnumerator::scoresAt(size_t index) {
|
||||
if(!m_primed) {
|
||||
cerr << "Querying scores from an unprimed HypPackEnumerator" << endl;
|
||||
exit(1);
|
||||
}
|
||||
const pair<size_t,size_t>& pij = m_current_indexes[index];
|
||||
return m_scoreDataIters[pij.first]->operator[](pij.second);
|
||||
}
|
||||
|
||||
/* --------- RandomAccessHypPackEnumerator ------------- */
|
||||
|
||||
RandomAccessHypPackEnumerator::RandomAccessHypPackEnumerator(vector<string> const& featureFiles,
|
||||
vector<string> const& scoreFiles,
|
||||
bool no_shuffle)
|
||||
{
|
||||
StreamingHypPackEnumerator train(featureFiles,scoreFiles);
|
||||
size_t index=0;
|
||||
for(train.reset(); !train.finished(); train.next()) {
|
||||
m_features.push_back(vector<FeatureDataItem>());
|
||||
m_scores.push_back(vector<ScoreDataItem>());
|
||||
for(size_t j=0;j<train.cur_size();j++) {
|
||||
m_features.back().push_back(train.featuresAt(j));
|
||||
m_scores.back().push_back(train.scoresAt(j));
|
||||
}
|
||||
m_indexes.push_back(index++);
|
||||
}
|
||||
|
||||
m_cur_index = 0;
|
||||
m_no_shuffle = no_shuffle;
|
||||
m_num_dense = train.num_dense();
|
||||
}
|
||||
|
||||
size_t RandomAccessHypPackEnumerator::num_dense() const {
|
||||
return m_num_dense;
|
||||
}
|
||||
|
||||
void RandomAccessHypPackEnumerator::reset() {
|
||||
m_cur_index = 0;
|
||||
if(!m_no_shuffle) random_shuffle(m_indexes.begin(),m_indexes.end());
|
||||
}
|
||||
bool RandomAccessHypPackEnumerator::finished() {
|
||||
return m_cur_index >= m_indexes.size();
|
||||
}
|
||||
void RandomAccessHypPackEnumerator::next() {
|
||||
m_cur_index++;
|
||||
}
|
||||
|
||||
size_t RandomAccessHypPackEnumerator::cur_size() {
|
||||
assert(m_features[m_indexes[m_cur_index]].size()==m_scores[m_indexes[m_cur_index]].size());
|
||||
return m_features[m_indexes[m_cur_index]].size();
|
||||
}
|
||||
const FeatureDataItem& RandomAccessHypPackEnumerator::featuresAt(size_t i) {
|
||||
return m_features[m_indexes[m_cur_index]][i];
|
||||
}
|
||||
const ScoreDataItem& RandomAccessHypPackEnumerator::scoresAt(size_t i) {
|
||||
return m_scores[m_indexes[m_cur_index]][i];
|
||||
}
|
||||
|
||||
|
||||
// --Emacs trickery--
|
||||
// Local Variables:
|
||||
// mode:c++
|
||||
// c-basic-offset:2
|
||||
// End:
|
mert/HypPackEnumerator.h (new file, 101 lines)

@ -0,0 +1,101 @@
/*
 * HypPackEnumerator.h
 * kbmira - k-best Batch MIRA
 *
 * Abstracts away the mess of iterating through multiple
 * collections of k-best lists, as well as deduping
 */

#ifndef MERT_HYP_PACK_COLLECTION_H
#define MERT_HYP_PACK_COLLECTION_H

#include <string>
#include <vector>
#include <utility>

#include "FeatureDataIterator.h"
#include "ScoreDataIterator.h"

using namespace std;

// Start with these abstract classes

class HypPackEnumerator {
public:
  virtual void reset() = 0;
  virtual bool finished() = 0;
  virtual void next() = 0;

  virtual size_t cur_size() = 0;
  virtual size_t num_dense() const = 0;
  virtual const FeatureDataItem& featuresAt(size_t i) = 0;
  virtual const ScoreDataItem& scoresAt(size_t i) = 0;
};

// Instantiation that streams from disk
// Low-memory, low-speed, sequential access
class StreamingHypPackEnumerator : public HypPackEnumerator {
public:
  StreamingHypPackEnumerator(vector<string> const& featureFiles,
                             vector<string> const& scoreFiles);

  virtual size_t num_dense() const;

  virtual void reset();
  virtual bool finished();
  virtual void next();

  virtual size_t cur_size();
  virtual const FeatureDataItem& featuresAt(size_t i);
  virtual const ScoreDataItem& scoresAt(size_t i);

private:
  void prime();
  size_t m_num_lists;
  size_t m_sentenceId;
  vector<string> m_featureFiles;
  vector<string> m_scoreFiles;

  bool m_primed;
  int m_iNumDense;
  vector<FeatureDataIterator> m_featureDataIters;
  vector<ScoreDataIterator> m_scoreDataIters;
  vector<pair<size_t,size_t> > m_current_indexes;
};

// Instantiation that reads into memory
// High-memory, high-speed, random access
// (Actually randomizes with each call to reset)
class RandomAccessHypPackEnumerator : public HypPackEnumerator {
public:
  RandomAccessHypPackEnumerator(vector<string> const& featureFiles,
                                vector<string> const& scoreFiles,
                                bool no_shuffle);

  virtual size_t num_dense() const;

  virtual void reset();
  virtual bool finished();
  virtual void next();

  virtual size_t cur_size();
  virtual const FeatureDataItem& featuresAt(size_t i);
  virtual const ScoreDataItem& scoresAt(size_t i);

private:
  bool m_no_shuffle;
  size_t m_cur_index;
  size_t m_num_dense;
  vector<size_t> m_indexes;
  vector<vector<FeatureDataItem> > m_features;
  vector<vector<ScoreDataItem> > m_scores;
};

#endif // MERT_HYP_PACK_COLLECTION_H

// --Emacs trickery--
// Local Variables:
// mode:c++
// c-basic-offset:2
// End:
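Both concrete enumerators share the reset/finished/next protocol declared above. A minimal driver that walks every deduped hypothesis (hypothetical sketch: it would need to link against mert_lib, and the file names are placeholders):

// Hypothetical driver for HypPackEnumerator (links against mert_lib;
// "features.dat" and "scores.dat" are placeholder file names).
#include <cstddef>
#include <string>
#include <vector>
#include "HypPackEnumerator.h"

int main() {
  std::vector<std::string> ffiles(1, "features.dat");
  std::vector<std::string> sfiles(1, "scores.dat");
  // Streaming variant: low memory, strictly sequential, never shuffles.
  StreamingHypPackEnumerator train(ffiles, sfiles);
  for (train.reset(); !train.finished(); train.next()) {
    for (size_t i = 0; i < train.cur_size(); i++) {
      const FeatureDataItem& feats = train.featuresAt(i);
      const ScoreDataItem& scores = train.scoresAt(i);
      // ... score the hypothesis, accumulate BLEU stats, etc.
      (void)feats; (void)scores;
    }
  }
  return 0;
}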
mert/Jamfile

@ -15,6 +15,9 @@ FeatureStats.cpp
 FeatureArray.cpp
 FeatureData.cpp
 FeatureDataIterator.cpp
+MiraFeatureVector.cpp
+MiraWeightVector.cpp
+HypPackEnumerator.cpp
 Data.cpp
 BleuScorer.cpp
 SemposScorer.cpp
@ -52,7 +55,9 @@ exe evaluator : evaluator.cpp mert_lib ;
 
 exe pro : pro.cpp mert_lib ..//boost_program_options ;
 
-alias programs : mert extractor evaluator pro ;
+exe kbmira : kbmira.cpp mert_lib ..//boost_program_options ;
+
+alias programs : mert extractor evaluator pro kbmira ;
 
 unit-test bleu_scorer_test : BleuScorerTest.cpp mert_lib ..//boost_unit_test_framework ;
 unit-test feature_data_test : FeatureDataTest.cpp mert_lib ..//boost_unit_test_framework ;
mert/MiraFeatureVector.cpp (new file, 144 lines)

@ -0,0 +1,144 @@
#include <cmath>

#include "MiraFeatureVector.h"

MiraFeatureVector::MiraFeatureVector(const FeatureDataItem& vec)
  : m_dense(vec.dense)
{
  vector<size_t> sparseFeats = vec.sparse.feats();
  bool bFirst = true;
  size_t lastFeat = 0;
  for(size_t i=0;i<sparseFeats.size();i++)
  {
    size_t feat = m_dense.size() + sparseFeats[i];
    m_sparseFeats.push_back(feat);
    m_sparseVals.push_back(vec.sparse.get(sparseFeats[i]));

    // Check ordered property
    if(bFirst) {
      bFirst = false;
    }
    else {
      if(lastFeat>=feat) {
        cerr << "Error: Feature indices must be strictly ascending coming out of SparseVector" << endl;
        exit(1);
      }
    }
    lastFeat = feat;
  }
}

MiraFeatureVector::MiraFeatureVector(const MiraFeatureVector& other)
  : m_dense(other.m_dense),
    m_sparseFeats(other.m_sparseFeats),
    m_sparseVals(other.m_sparseVals)
{
  if(m_sparseVals.size()!=m_sparseFeats.size()) {
    cerr << "Error: mismatching sparse feat and val sizes" << endl;
    exit(1);
  }
}

MiraFeatureVector::MiraFeatureVector(const vector<ValType>& dense,
                                     const vector<size_t>& sparseFeats,
                                     const vector<ValType>& sparseVals)
  : m_dense(dense),
    m_sparseFeats(sparseFeats),
    m_sparseVals(sparseVals)
{
  if(m_sparseVals.size()!=m_sparseFeats.size()) {
    cerr << "Error: mismatching sparse feat and val sizes" << endl;
    exit(1);
  }
}

ValType MiraFeatureVector::val(size_t index) const {
  if(index < m_dense.size())
    return m_dense[index];
  else
    return m_sparseVals[index-m_dense.size()];
}

size_t MiraFeatureVector::feat(size_t index) const {
  if(index < m_dense.size())
    return index;
  else
    return m_sparseFeats[index-m_dense.size()];
}

size_t MiraFeatureVector::size() const {
  return m_dense.size() + m_sparseVals.size();
}

ValType MiraFeatureVector::sqrNorm() const {
  ValType toRet = 0.0;
  for(size_t i=0;i<m_dense.size();i++)
    toRet += m_dense[i]*m_dense[i];
  for(size_t i=0;i<m_sparseVals.size();i++)
    toRet += m_sparseVals[i] * m_sparseVals[i];
  return toRet;
}

MiraFeatureVector operator-(const MiraFeatureVector& a, const MiraFeatureVector& b)
{
  // Dense subtraction
  vector<ValType> dense;
  if(a.m_dense.size()!=b.m_dense.size()) {
    cerr << "Mismatching dense vectors passed to MiraFeatureVector subtraction" << endl;
    exit(1);
  }
  for(size_t i=0;i<a.m_dense.size();i++) {
    dense.push_back(a.m_dense[i] - b.m_dense[i]);
  }

  // Sparse subtraction: merge the two sorted feature lists
  size_t i=0;
  size_t j=0;
  vector<ValType> sparseVals;
  vector<size_t> sparseFeats;
  while(i < a.m_sparseFeats.size() && j < b.m_sparseFeats.size()) {

    if(a.m_sparseFeats[i] < b.m_sparseFeats[j]) {
      sparseFeats.push_back(a.m_sparseFeats[i]);
      sparseVals.push_back(a.m_sparseVals[i]);
      i++;
    }

    else if(b.m_sparseFeats[j] < a.m_sparseFeats[i]) {
      sparseFeats.push_back(b.m_sparseFeats[j]);
      sparseVals.push_back(-b.m_sparseVals[j]);
      j++;
    }

    else {
      ValType newVal = a.m_sparseVals[i] - b.m_sparseVals[j];
      if(abs(newVal)>1e-6) {
        sparseFeats.push_back(a.m_sparseFeats[i]);
        sparseVals.push_back(newVal);
      }
      i++;
      j++;
    }
  }

  while(i<a.m_sparseFeats.size()) {
    sparseFeats.push_back(a.m_sparseFeats[i]);
    sparseVals.push_back(a.m_sparseVals[i]);
    i++;
  }

  while(j<b.m_sparseFeats.size()) {
    sparseFeats.push_back(b.m_sparseFeats[j]);
    sparseVals.push_back(-b.m_sparseVals[j]);
    j++;
  }

  // Create and return vector
  return MiraFeatureVector(dense,sparseFeats,sparseVals);
}

// --Emacs trickery--
// Local Variables:
// mode:c++
// c-basic-offset:2
// End:
mert/MiraFeatureVector.h (new file, 51 lines)

@ -0,0 +1,51 @@
/*
 * MiraFeatureVector.h
 * kbmira - k-best Batch MIRA
 *
 * An alternative to the existing SparseVector
 * and FeatureDataItem combo. Should be as memory
 * efficient, and a little more time efficient,
 * and should save me from constantly hacking
 * SparseVector
 */

#ifndef MERT_MIRA_FEATURE_VECTOR_H
#define MERT_MIRA_FEATURE_VECTOR_H

#include <vector>

#include "FeatureDataIterator.h"

using namespace std;

typedef FeatureStatsType ValType;

class MiraFeatureVector {
public:
  MiraFeatureVector(const FeatureDataItem& vec);
  MiraFeatureVector(const MiraFeatureVector& other);
  MiraFeatureVector(const vector<ValType>& dense,
                    const vector<size_t>& sparseFeats,
                    const vector<ValType>& sparseVals);

  ValType val(size_t index) const;
  size_t feat(size_t index) const;
  size_t size() const;
  ValType sqrNorm() const;

  friend MiraFeatureVector operator-(const MiraFeatureVector& a,
                                     const MiraFeatureVector& b);

private:
  vector<ValType> m_dense;
  vector<size_t> m_sparseFeats;
  vector<ValType> m_sparseVals;
};

#endif // MERT_MIRA_FEATURE_VECTOR_H

// --Emacs trickery--
// Local Variables:
// mode:c++
// c-basic-offset:2
// End:
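val and feat present the dense and sparse parts through one flat index space: positions below the dense size read the dense array directly, higher positions are offset into the sparse arrays, and sparse feature ids were already shifted up by the dense size at construction. A small helper exploiting that (hypothetical sketch, linked against mert_lib) can therefore dot any MiraFeatureVector against a plain weight array without knowing which part a feature came from, which is exactly how MiraWeightVector::score works:

// Hypothetical helper (links against mert_lib): dot product over the flat
// dense+sparse index space exposed by MiraFeatureVector::feat()/val().
#include <cstddef>
#include <vector>
#include "MiraFeatureVector.h"

ValType dot(const MiraFeatureVector& fv, const std::vector<ValType>& w) {
  ValType sum = 0.0;
  for (size_t i = 0; i < fv.size(); i++) {
    size_t f = fv.feat(i);                       // global feature id (dense ids first)
    if (f < w.size()) sum += w[f] * fv.val(i);   // features beyond w count as weight 0
  }
  return sum;
}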
mert/MiraWeightVector.cpp (new file, 143 lines)

@ -0,0 +1,143 @@
#include "MiraWeightVector.h"
|
||||
|
||||
/**
|
||||
* Constructor, initializes to the zero vector
|
||||
*/
|
||||
MiraWeightVector::MiraWeightVector()
|
||||
: m_weights(),
|
||||
m_totals(),
|
||||
m_lastUpdated()
|
||||
{
|
||||
m_numUpdates = 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructor with provided initial vector
|
||||
* \param init Initial feature values
|
||||
*/
|
||||
MiraWeightVector::MiraWeightVector(const vector<ValType>& init)
|
||||
: m_weights(init),
|
||||
m_totals(init),
|
||||
m_lastUpdated(init.size(), 0)
|
||||
{
|
||||
m_numUpdates = 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Update a the model
|
||||
* \param fv Feature vector to be added to the weights
|
||||
* \param tau FV will be scaled by this value before update
|
||||
*/
|
||||
void MiraWeightVector::update(const MiraFeatureVector& fv, float tau) {
|
||||
m_numUpdates++;
|
||||
for(size_t i=0;i<fv.size();i++) {
|
||||
update(fv.feat(i), fv.val(i)*tau);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Perform an empty update (affects averaging)
|
||||
*/
|
||||
void MiraWeightVector::tick() {
|
||||
m_numUpdates++;
|
||||
}
|
||||
|
||||
/**
|
||||
* Score a feature vector according to the model
|
||||
* \param fv Feature vector to be scored
|
||||
*/
|
||||
ValType MiraWeightVector::score(const MiraFeatureVector& fv) const {
|
||||
ValType toRet = 0.0;
|
||||
for(size_t i=0; i<fv.size(); i++) {
|
||||
toRet += weight(fv.feat(i)) * fv.val(i);
|
||||
}
|
||||
return toRet;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return an averaged view of this weight vector
|
||||
*/
|
||||
AvgWeightVector MiraWeightVector::avg() {
|
||||
this->fixTotals();
|
||||
return AvgWeightVector(*this);
|
||||
}
|
||||
|
||||
/**
|
||||
* Updates a weight and lazily updates its total
|
||||
*/
|
||||
void MiraWeightVector::update(size_t index, ValType delta) {
|
||||
|
||||
// Handle previously unseen weights
|
||||
while(index>=m_weights.size()) {
|
||||
m_weights.push_back(0.0);
|
||||
m_totals.push_back(0.0);
|
||||
m_lastUpdated.push_back(0);
|
||||
}
|
||||
|
||||
// Book keeping for w = w + delta
|
||||
m_totals[index] += (m_numUpdates - m_lastUpdated[index]) * m_weights[index] + delta;
|
||||
m_weights[index] += delta;
|
||||
m_lastUpdated[index] = m_numUpdates;
|
||||
}
|
||||
|
||||
/**
|
||||
* Make sure everyone's total is up-to-date
|
||||
*/
|
||||
void MiraWeightVector::fixTotals() {
|
||||
for(size_t i=0; i<m_weights.size(); i++) update(i,0);
|
||||
}
|
||||
|
||||
/**
|
||||
* Helper to handle out of range weights
|
||||
*/
|
||||
ValType MiraWeightVector::weight(size_t index) const {
|
||||
if(index < m_weights.size()) {
|
||||
return m_weights[index];
|
||||
}
|
||||
else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
ValType MiraWeightVector::sqrNorm() const {
|
||||
ValType toRet = 0;
|
||||
for(size_t i=0;i<m_weights.size();i++) {
|
||||
toRet += weight(i) * weight(i);
|
||||
}
|
||||
return toRet;
|
||||
}
|
||||
|
||||
AvgWeightVector::AvgWeightVector(const MiraWeightVector& wv)
|
||||
:m_wv(wv)
|
||||
{}
|
||||
|
||||
ValType AvgWeightVector::weight(size_t index) const
|
||||
{
|
||||
if(m_wv.m_numUpdates==0) return m_wv.weight(index);
|
||||
else {
|
||||
if(index < m_wv.m_totals.size()) {
|
||||
return m_wv.m_totals[index] / m_wv.m_numUpdates;
|
||||
}
|
||||
else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ValType AvgWeightVector::score(const MiraFeatureVector& fv) const {
|
||||
ValType toRet = 0.0;
|
||||
for(size_t i=0; i<fv.size(); i++) {
|
||||
toRet += weight(fv.feat(i)) * fv.val(i);
|
||||
}
|
||||
return toRet;
|
||||
}
|
||||
|
||||
size_t AvgWeightVector::size() const {
|
||||
return m_wv.m_weights.size();
|
||||
}
|
||||
|
||||
// --Emacs trickery--
|
||||
// Local Variables:
|
||||
// mode:c++
|
||||
// c-basic-offset:2
|
||||
// End:
|
mert/MiraWeightVector.h (new file, 106 lines)

@ -0,0 +1,106 @@
/*
 * MiraWeightVector.h
 * kbmira - k-best Batch MIRA
 *
 * A self-averaging weight-vector. Good for
 * perceptron learning as well.
 *
 */

#ifndef MERT_MIRA_WEIGHT_VECTOR_H
#define MERT_MIRA_WEIGHT_VECTOR_H

#include <vector>

#include "MiraFeatureVector.h"

using namespace std;

class AvgWeightVector;

class MiraWeightVector {
public:
  /**
   * Constructor, initializes to the zero vector
   */
  MiraWeightVector();

  /**
   * Constructor with provided initial vector
   * \param init Initial feature values
   */
  MiraWeightVector(const vector<ValType>& init);

  /**
   * Update the model
   * \param fv  Feature vector to be added to the weights
   * \param tau FV will be scaled by this value before update
   */
  void update(const MiraFeatureVector& fv, float tau);

  /**
   * Perform an empty update (affects averaging)
   */
  void tick();

  /**
   * Score a feature vector according to the model
   * \param fv Feature vector to be scored
   */
  ValType score(const MiraFeatureVector& fv) const;

  /**
   * Squared norm of the weight vector
   */
  ValType sqrNorm() const;

  /**
   * Return an averaged view of this weight vector
   */
  AvgWeightVector avg();

  friend class AvgWeightVector;

private:
  /**
   * Updates a weight and lazily updates its total
   */
  void update(size_t index, ValType delta);

  /**
   * Make sure everyone's total is up-to-date
   */
  void fixTotals();

  /**
   * Helper to handle out-of-range weights
   */
  ValType weight(size_t index) const;

  vector<ValType> m_weights;
  vector<ValType> m_totals;
  vector<size_t> m_lastUpdated;
  size_t m_numUpdates;
};

/**
 * Averaged view of a weight vector
 */
class AvgWeightVector {
public:
  AvgWeightVector(const MiraWeightVector& wv);
  ValType score(const MiraFeatureVector& fv) const;
  ValType weight(size_t index) const;
  size_t size() const;
private:
  const MiraWeightVector& m_wv;
};

#endif // MERT_MIRA_WEIGHT_VECTOR_H

// --Emacs trickery--
// Local Variables:
// mode:c++
// c-basic-offset:2
// End:
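The averaging trick above: m_totals[i] holds the running sum of m_weights[i] over all updates so far, filled in lazily from m_lastUpdated, so the averaged weight is simply total divided by the number of updates. A worked check of that bookkeeping (hypothetical test, linked against mert_lib):

// Hypothetical check of the lazy averaging in MiraWeightVector (links against mert_lib).
#include <cassert>
#include <vector>
#include "MiraWeightVector.h"

int main() {
  MiraWeightVector wv;  // zero vector
  // One dense feature with value 1.0, no sparse part.
  std::vector<ValType> dense(1, 1.0);
  std::vector<size_t> sfeats;
  std::vector<ValType> svals;
  MiraFeatureVector fv(dense, sfeats, svals);

  wv.update(fv, 1.0);   // after update 1: w = [1]
  wv.update(fv, 1.0);   // after update 2: w = [2]
  AvgWeightVector avg = wv.avg();
  // Average over the two post-update states: (1 + 2) / 2 = 1.5
  assert(avg.weight(0) == 1.5);
  return 0;
}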
mert/kbmira.cpp (new file, 298 lines)

@ -0,0 +1,298 @@
// $Id$
// vim:tabstop=2
/***********************************************************************

***********************************************************************/

/**
 * k-best Batch Mira, as described in:
 *
 * Colin Cherry and George Foster
 * Batch Tuning Strategies for Statistical Machine Translation
 * NAACL 2012
 *
 * Implemented by colin.cherry@nrc-cnrc.gc.ca
 *
 * To license implementations of any of the other tuners in that paper,
 * please get in touch with any member of NRC Canada's Portage project
 *
 * Input is a set of n-best lists, encoded as feature and score files.
 *
 * Output is a weight file that results from running MIRA on these
 * n-best lists for J iterations. Will return the set that maximizes
 * training BLEU.
 **/

#include <cmath>
#include <cstddef>
#include <cstdlib>
#include <ctime>
#include <cassert>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>
#include <utility>
#include <algorithm>

#include <boost/program_options.hpp>
#include <boost/scoped_ptr.hpp>

#include "BleuScorer.h"
#include "HypPackEnumerator.h"
#include "MiraFeatureVector.h"
#include "MiraWeightVector.h"

using namespace std;

namespace po = boost::program_options;

ValType evaluate(HypPackEnumerator* train, const AvgWeightVector& wv) {
  vector<ValType> stats(kBleuNgramOrder*2+1,0);
  for(train->reset(); !train->finished(); train->next()) {
    // Find max model
    size_t max_index=0;
    ValType max_score=0;
    for(size_t i=0;i<train->cur_size();i++) {
      MiraFeatureVector vec(train->featuresAt(i));
      ValType score = wv.score(vec);
      if(i==0 || score > max_score) {
        max_index = i;
        max_score = score;
      }
    }
    // Update stats
    const vector<float>& sent = train->scoresAt(max_index);
    for(size_t i=0;i<sent.size();i++) {
      stats[i]+=sent[i];
    }
  }
  return unsmoothedBleu(stats);
}

int main(int argc, char** argv)
{
  bool help;
  string denseInitFile;
  string sparseInitFile;
  vector<string> scoreFiles;
  vector<string> featureFiles;
  int seed;
  string outputFile;
  float c = 0.01;          // Step-size cap C
  float decay = 0.999;     // Pseudo-corpus decay \gamma
  int n_iters = 60;        // Max epochs J
  bool streaming = false;  // Stream all k-best lists?
  bool no_shuffle = false; // Don't shuffle, even for in memory version
  bool model_bg = false;   // Use model for background corpus

  // Command-line processing follows pro.cpp
  po::options_description desc("Allowed options");
  desc.add_options()
  ("help,h", po::value(&help)->zero_tokens()->default_value(false), "Print this help message and exit")
  ("scfile,S", po::value<vector<string> >(&scoreFiles), "Scorer data files")
  ("ffile,F", po::value<vector<string> > (&featureFiles), "Feature data files")
  ("random-seed,r", po::value<int>(&seed), "Seed for random number generation")
  ("output-file,o", po::value<string>(&outputFile), "Output file")
  ("cparam,C", po::value<float>(&c), "MIRA C-parameter, lower for more regularization (default 0.01)")
  ("decay,D", po::value<float>(&decay), "BLEU background corpus decay rate (default 0.999)")
  ("iters,J", po::value<int>(&n_iters), "Number of MIRA iterations to run (default 60)")
  ("dense-init,d", po::value<string>(&denseInitFile), "Weight file for dense features")
  ("sparse-init,s", po::value<string>(&sparseInitFile), "Weight file for sparse features")
  ("streaming", po::value(&streaming)->zero_tokens()->default_value(false), "Stream n-best lists to save memory, implies --no-shuffle")
  ("no-shuffle", po::value(&no_shuffle)->zero_tokens()->default_value(false), "Don't shuffle hypotheses before each epoch")
  ("model-bg", po::value(&model_bg)->zero_tokens()->default_value(false), "Use model instead of hope for BLEU background")
  ;

  po::options_description cmdline_options;
  cmdline_options.add(desc);
  po::variables_map vm;
  po::store(po::command_line_parser(argc,argv).
            options(cmdline_options).run(), vm);
  po::notify(vm);
  if (help) {
    cout << "Usage: " + string(argv[0]) + " [options]" << endl;
    cout << desc << endl;
    exit(0);
  }

  if (vm.count("random-seed")) {
    cerr << "Initialising random seed to " << seed << endl;
    srand(seed);
  } else {
    cerr << "Initialising random seed from system clock" << endl;
    srand(time(NULL));
  }

  // Initialize weights
  //
  // Dense
  vector<parameter_t> initParams;
  if(!denseInitFile.empty()) {
    ifstream opt(denseInitFile.c_str());
    if (opt.fail()) {
      cerr << "could not open dense initfile: " << denseInitFile << endl;
      exit(3);
    }
    string buffer;
    getline(opt,buffer);
    istringstream strstrm(buffer);
    parameter_t val;
    while(strstrm >> val) initParams.push_back(val);
    opt.close();
  }
  size_t initDenseSize = initParams.size();
  // Sparse
  if(!sparseInitFile.empty()) {
    if(initDenseSize==0) {
      cerr << "sparse initialization requires dense initialization" << endl;
      exit(3);
    }
    ifstream opt(sparseInitFile.c_str());
    if(opt.fail()) {
      cerr << "could not open sparse initfile: " << sparseInitFile << endl;
      exit(3);
    }
    int sparseCount=0;
    parameter_t val; std::string name;
    while(opt >> name >> val) {
      size_t id = SparseVector::encode(name) + initDenseSize;
      while(initParams.size()<=id) initParams.push_back(0.0);
      initParams[id] = val;
      sparseCount++;
    }
    cerr << "Found " << sparseCount << " initial sparse features" << endl;
    opt.close();
  }

  MiraWeightVector wv(initParams);

  // Initialize background corpus
  vector<ValType> bg;
  for(int j=0;j<kBleuNgramOrder;j++){
    bg.push_back(kBleuNgramOrder-j);
    bg.push_back(kBleuNgramOrder-j);
  }
  bg.push_back(kBleuNgramOrder);

  // Training loop
  boost::scoped_ptr<HypPackEnumerator> train;
  if(streaming)
    train.reset(new StreamingHypPackEnumerator(featureFiles, scoreFiles));
  else
    train.reset(new RandomAccessHypPackEnumerator(featureFiles, scoreFiles, no_shuffle));
  cerr << "Initial BLEU = " << evaluate(train.get(), wv.avg()) << endl;
  ValType bestBleu = 0;
  for(int j=0;j<n_iters;j++)
  {
    // MIRA train for one epoch
    int iNumHyps = 0;
    int iNumExamples = 0;
    int iNumUpdates = 0;
    ValType totalLoss = 0.0;
    for(train->reset(); !train->finished(); train->next()) {

      // Hope / fear decode
      size_t hope_index=0, fear_index=0, model_index=0;
      ValType hope_score=0, fear_score=0, model_score=0;
      for(size_t i=0; i< train->cur_size(); i++) {
        MiraFeatureVector vec(train->featuresAt(i));
        ValType score = wv.score(vec);
        ValType bleu = sentenceLevelBackgroundBleu(train->scoresAt(i),bg);
        // Hope
        if(i==0 || (score + bleu) > hope_score) {
          hope_score = score + bleu;
          hope_index = i;
        }
        // Fear
        if(i==0 || (score - bleu) > fear_score) {
          fear_score = score - bleu;
          fear_index = i;
        }
        // Model
        if(i==0 || score > model_score) {
          model_score = score;
          model_index = i;
        }
        iNumHyps++;
      }
      // Update weights
      if(hope_index!=fear_index) {
        // Vector difference
        MiraFeatureVector hope(train->featuresAt(hope_index));
        MiraFeatureVector fear(train->featuresAt(fear_index));
        MiraFeatureVector diff = hope - fear;
        // Bleu difference
        const vector<float>& hope_stats = train->scoresAt(hope_index);
        ValType hopeBleu = sentenceLevelBackgroundBleu(hope_stats, bg);
        const vector<float>& fear_stats = train->scoresAt(fear_index);
        ValType fearBleu = sentenceLevelBackgroundBleu(fear_stats, bg);
        assert(hopeBleu > fearBleu);
        ValType delta = hopeBleu - fearBleu;
        // Loss and update
        ValType diff_score = wv.score(diff);
        ValType loss = delta - diff_score;
        if(loss > 0) {
          ValType eta = min(c, loss / diff.sqrNorm());
          wv.update(diff,eta);
          totalLoss+=loss;
          iNumUpdates++;
        }
        // Update BLEU statistics
        const vector<float>& model_stats = train->scoresAt(model_index);
        for(size_t k=0;k<bg.size();k++) {
          bg[k]*=decay;
          if(model_bg)
            bg[k]+=model_stats[k];
          else
            bg[k]+=hope_stats[k];
        }
      }
      iNumExamples++;
    }
    // Training Epoch summary
    cerr << iNumUpdates << "/" << iNumExamples << " updates"
         << ", avg loss = " << (totalLoss / iNumExamples);

    // Evaluate current average weights
    AvgWeightVector avg = wv.avg();
    ValType bleu = evaluate(train.get(), avg);
    cerr << ", BLEU = " << bleu << endl;
    if(bleu > bestBleu) {
      size_t num_dense = train->num_dense();
      if(initDenseSize>0 && initDenseSize!=num_dense) {
        cerr << "Error: Initial dense feature count and dense feature count from n-best do not match: "
             << initDenseSize << "!=" << num_dense << endl;
        exit(1);
      }
      // Write to a file
      ostream* out;
      ofstream outFile;
      if (!outputFile.empty() ) {
        outFile.open(outputFile.c_str());
        if (!(outFile)) {
          cerr << "Error: Failed to open " << outputFile << endl;
          exit(1);
        }
        out = &outFile;
      } else {
        out = &cout;
      }
      for(size_t i=0;i<avg.size();i++) {
        if(i<num_dense)
          *out << "F" << i << " " << avg.weight(i) << endl;
        else {
          if(abs(avg.weight(i))>1e-8)
            *out << SparseVector::decode(i-num_dense) << " " << avg.weight(i) << endl;
        }
      }
      outFile.close();
      bestBleu = bleu;
    }
  }
  cerr << "Best BLEU = " << bestBleu << endl;
}
// --Emacs trickery--
// Local Variables:
// mode:c++
// c-basic-offset:2
// End:
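The inner update above is the MIRA step from the paper: the hinge loss is the BLEU gap between hope and fear minus the model's current separation of them, and the step size is that loss over the squared norm of the feature difference, capped by C. A standalone restatement with a worked number (illustration only, not the commit's code):

// Standalone restatement of the kbmira step size (illustration only).
// loss = (BLEU margin) - (model separation); step capped by C,
// as in Cherry & Foster, NAACL 2012.
#include <algorithm>
#include <cstdio>

float miraStepSize(float hopeBleu, float fearBleu,
                   float hopeScore, float fearScore,
                   float sqrNormDiff, float C) {
  float loss = (hopeBleu - fearBleu) - (hopeScore - fearScore);
  if (loss <= 0.0f || sqrNormDiff <= 0.0f) return 0.0f;  // no violation: no update
  return std::min(C, loss / sqrNormDiff);
}

int main() {
  // Hope beats fear by 0.2 BLEU but trails it by 0.1 model score: loss = 0.3,
  // uncapped step 0.3/4.0 = 0.075, clipped to C = 0.01.
  std::printf("eta = %f\n", miraStepSize(0.5f, 0.3f, 1.0f, 1.1f, 4.0f, 0.01f));
  return 0;
}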
scripts/training/mert-moses.pl

@ -117,6 +117,9 @@ my $___HISTORIC_INTERPOLATION = 0; # interpolate optimize weights with previous
 # TODO: Should we also add these values to options of this script?
 my $megam_default_options = "-fvals -maxi 30 -nobias binary";
 
+# Flags related to Batch MIRA (Cherry & Foster, 2012)
+my $___BATCH_MIRA = 0; # flag to enable batch MIRA
+
 my $__THREADS = 0;
 
 # Parameter for effective reference length when computing BLEU score
@ -206,6 +209,7 @@ GetOptions(
   "pairwise-ranked" => \$___PAIRWISE_RANKED_OPTIMIZER,
   "pro-starting-point" => \$___PRO_STARTING_POINT,
   "historic-interpolation=f" => \$___HISTORIC_INTERPOLATION,
+  "batch-mira" => \$___BATCH_MIRA,
   "threads=i" => \$__THREADS
 ) or exit(1);
 
@ -324,10 +328,12 @@ if (!defined $mertdir) {
 my $mert_extract_cmd = File::Spec->catfile($mertdir, "extractor");
 my $mert_mert_cmd = File::Spec->catfile($mertdir, "mert");
 my $mert_pro_cmd = File::Spec->catfile($mertdir, "pro");
+my $mert_mira_cmd = File::Spec->catfile($mertdir, "kbmira");
 
 die "Not executable: $mert_extract_cmd" if ! -x $mert_extract_cmd;
 die "Not executable: $mert_mert_cmd" if ! -x $mert_mert_cmd;
 die "Not executable: $mert_pro_cmd" if ! -x $mert_pro_cmd;
+die "Not executable: $mert_mira_cmd" if ! -x $mert_mira_cmd;
 
 my $pro_optimizer = File::Spec->catfile($mertdir, "megam_i686.opt"); # or set to your installation
 
@ -727,6 +733,11 @@ while (1) {
     $scfiles = "$score_file";
   }
 
+  my $mira_settings = "";
+  $mira_settings .= " --dense-init run$run.$weights_in_file";
+  if (-e "run$run.sparse-weights") {
+    $mira_settings .= " --sparse-init run$run.sparse-weights";
+  }
   my $file_settings = " --ffile $ffiles --scfile $scfiles";
   my $pro_file_settings = "--ffile " . join(" --ffile ", split(/,/, $ffiles)) .
                           " --scfile " . join(" --scfile ", split(/,/, $scfiles));
@ -759,6 +770,10 @@ while (1) {
     # ... and run mert
     $cmd =~ s/(--ifile \S+)/$1,run$run.init.pro/;
     &submit_or_exec($cmd . $mert_settings, $mert_outfile, $mert_logfile);
+  } elsif ($___BATCH_MIRA) { # batch MIRA optimization
+    safesystem("echo 'not used' > $weights_out_file") or die;
+    $cmd = "$mert_mira_cmd $mira_settings $seed_settings $pro_file_settings -o $mert_outfile";
+    &submit_or_exec($cmd, "run$run.mira.out", $mert_logfile);
   } else { # just mert
     &submit_or_exec($cmd . $mert_settings, $mert_outfile, $mert_logfile);
   }
@ -906,7 +921,7 @@ chdir($cwd);
 sub get_weights_from_mert {
   my ($outfile, $logfile, $weight_count, $sparse_weights) = @_;
   my ($bestpoint, $devbleu);
-  if ($___PAIRWISE_RANKED_OPTIMIZER || ($___PRO_STARTING_POINT && $logfile =~ /pro/)) {
+  if ($___PAIRWISE_RANKED_OPTIMIZER || ($___PRO_STARTING_POINT && $logfile =~ /pro/) || $___BATCH_MIRA) {
     open my $fh, '<', $outfile or die "Can't open $outfile: $!";
     my (@WEIGHT, $sum);
     for (my $i = 0; $i < $weight_count; $i++) { push @WEIGHT, 0; }
@ -923,6 +938,14 @@ sub get_weights_from_mert {
     foreach (keys %{$sparse_weights}) { $$sparse_weights{$_} /= $sum; }
     $bestpoint = join(" ", @WEIGHT);
     close $fh;
+    if($___BATCH_MIRA) {
+      open my $fh2, '<', $logfile or die "Can't open $logfile: $!";
+      while(<$fh2>) {
+        if(/Best BLEU = ([\-\d\.]+)/) {
+          $devbleu = $1;
+        }
+      }
+    }
   } else {
     open my $fh, '<', $logfile or die "Can't open $logfile: $!";
     while (<$fh>) {