Merge branch 'master' of github.com:moses-smt/mosesdecoder

Hieu Hoang 2012-05-29 19:14:49 +01:00
commit c85b96a6c6
19 changed files with 1181 additions and 10 deletions

1
.gitignore vendored
View File

@@ -17,6 +17,7 @@ mert/extractor
mert/mert
mert/megam_i686.opt
mert/pro
mert/kbmira
misc/processLexicalTable
misc/processPhraseTable
misc/queryLexicalTable

mert/BleuScorer.cpp
View File

@@ -232,3 +232,44 @@ float sentenceLevelBleuPlusOne(const vector<float>& stats) {
}
return exp(logbleu);
}
float sentenceLevelBackgroundBleu(const std::vector<float>& sent, const std::vector<float>& bg)
{
// Sum sent and background
std::vector<float> stats;
CHECK(sent.size()==bg.size());
CHECK(sent.size()==kBleuNgramOrder*2+1);
for(size_t i=0;i<sent.size();i++)
stats.push_back(sent[i]+bg[i]);
// Calculate BLEU
float logbleu = 0.0;
for (int j = 0; j < kBleuNgramOrder; j++) {
logbleu += log(stats[2 * j]) - log(stats[2 * j + 1]);
}
logbleu /= kBleuNgramOrder;
const float brevity = 1.0 - stats[(kBleuNgramOrder * 2)] / stats[1];
if (brevity < 0.0) {
logbleu += brevity;
}
// Exponentiate and scale by reference length (as per Chiang et al 08)
return exp(logbleu) * stats[kBleuNgramOrder*2];
}
float unsmoothedBleu(const std::vector<float>& stats) {
CHECK(stats.size() == kBleuNgramOrder * 2 + 1);
float logbleu = 0.0;
for (int j = 0; j < kBleuNgramOrder; j++) {
logbleu += log(stats[2 * j]) - log(stats[2 * j + 1]);
}
logbleu /= kBleuNgramOrder;
const float brevity = 1.0 - stats[(kBleuNgramOrder * 2)] / stats[1];
if (brevity < 0.0) {
logbleu += brevity;
}
return exp(logbleu);
}
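Read together, the two new functions differ only in the background counts and the reference-length scaling. A sketch of the quantity sentenceLevelBackgroundBleu computes, in my own notation (stats lays out matched/total n-gram counts m_n, t_n for n = 1..N, followed by the reference length r):

$$\mathrm{BLEU}_{bg} \;=\; r\cdot\exp\!\left(\frac{1}{N}\sum_{n=1}^{N}\log\frac{m_n}{t_n} \;+\; \min\!\left(0,\;1-\frac{r}{t_1}\right)\right)$$

where each m_n, t_n and r is the sum of the sentence and background entries, and t_1 doubles as the hypothesis length. unsmoothedBleu is the same expression without the background sum and without the leading factor of r (the reference-length scaling from Chiang et al. 2008 that the comment cites).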

mert/BleuScorer.h
View File

@@ -70,4 +70,14 @@ private:
*/
float sentenceLevelBleuPlusOne(const std::vector<float>& stats);
/** Computes sentence-level BLEU score given a background corpus.
* This function is used in batch MIRA.
*/
float sentenceLevelBackgroundBleu(const std::vector<float>& sent, const std::vector<float>& bg);
/**
* Computes plain old BLEU from a vector of stats
*/
float unsmoothedBleu(const std::vector<float>& stats);
#endif // MERT_BLEU_SCORER_H_

mert/BleuScorerTest.cpp
View File

@@ -152,10 +152,10 @@ BOOST_AUTO_TEST_CASE(bleu_count_ngrams) {
// "girl with a telescope", "with a telescope ."
NgramCounts counts;
BOOST_REQUIRE(scorer.CountNgrams(line, counts, kBleuNgramOrder) == 8);
BOOST_CHECK_EQUAL(25, counts.size());
BOOST_CHECK_EQUAL((std::size_t)25, counts.size());
mert::Vocabulary* vocab = scorer.GetVocab();
BOOST_CHECK_EQUAL(7, vocab->size());
BOOST_CHECK_EQUAL((std::size_t)7, vocab->size());
std::vector<std::string> res;
Tokenize(line.c_str(), ' ', &res);
@@ -203,7 +203,7 @@ BOOST_AUTO_TEST_CASE(bleu_clipped_counts) {
ScoreStats entry;
scorer.prepareStats(0, line, entry);
BOOST_CHECK_EQUAL(entry.size(), 2 * kBleuNgramOrder + 1);
BOOST_CHECK_EQUAL(entry.size(), (std::size_t)(2 * kBleuNgramOrder + 1));
// Test hypothesis ngram counts
BOOST_CHECK_EQUAL(entry.get(0), 5); // unigram

mert/DataTest.cpp
View File

@@ -33,8 +33,8 @@ BOOST_AUTO_TEST_CASE(shard_basic) {
std::vector<Data> shards;
data.createShards(2,0,"",shards);
BOOST_CHECK_EQUAL(shards.size(),2);
BOOST_CHECK_EQUAL(shards[1].getFeatureData()->size(),2);
BOOST_CHECK_EQUAL(shards.size(),(std::size_t)2);
BOOST_CHECK_EQUAL(shards[1].getFeatureData()->size(),(std::size_t)2);
}
BOOST_AUTO_TEST_CASE(init_feature_map_test) {

mert/FeatureDataIterator.cpp
View File

@@ -18,6 +18,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include <iostream>
#include <sstream>
#include <boost/functional/hash.hpp>
#include "util/tokenize_piece.hh"
@@ -47,6 +48,16 @@ float ParseFloat(const StringPiece& str) {
return value;
}
bool operator==(FeatureDataItem const& item1, FeatureDataItem const& item2) {
return item1.dense==item2.dense && item1.sparse==item2.sparse;
}
size_t hash_value(FeatureDataItem const& item) {
size_t seed = 0;
boost::hash_combine(seed,item.dense);
boost::hash_combine(seed,item.sparse);
return seed;
}
FeatureDataIterator::FeatureDataIterator() {}
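The operator== and hash_value pair is what lets FeatureDataItem sit in a boost::unordered_set (Boost picks up the free hash_value overload by argument-dependent lookup); the k-best deduplication in HypPackEnumerator.cpp further down relies on exactly this. A minimal self-contained sketch of the pattern, with Item as a hypothetical stand-in for FeatureDataItem:

```cpp
#include <cstddef>
#include <vector>
#include <boost/functional/hash.hpp>
#include <boost/unordered_set.hpp>

struct Item {
  std::vector<float> dense;
};

bool operator==(Item const& a, Item const& b) {
  return a.dense == b.dense;
}

// boost::hash<Item> finds this overload via ADL.
std::size_t hash_value(Item const& item) {
  std::size_t seed = 0;
  boost::hash_combine(seed, item.dense);
  return seed;
}

int main() {
  boost::unordered_set<Item> seen;
  Item a;
  a.dense.push_back(1.0f);
  seen.insert(a);
  seen.insert(a);                 // duplicate: the set still has one element
  return seen.size() == 1 ? 0 : 1;
}
```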

mert/FeatureDataIterator.h
View File

@@ -61,6 +61,9 @@ class FeatureDataItem
SparseVector sparse;
};
bool operator==(FeatureDataItem const& item1, FeatureDataItem const& item2);
std::size_t hash_value(FeatureDataItem const& item);
class FeatureDataIterator :
public boost::iterator_facade<FeatureDataIterator,
const std::vector<FeatureDataItem>,

mert/FeatureDataTest.cpp
View File

@@ -13,7 +13,7 @@ void CheckFeatureMap(const FeatureData* feature_data,
std::stringstream ss;
ss << str << "_" << i;
const std::string& s = ss.str();
BOOST_CHECK_EQUAL(feature_data->getFeatureIndex(s), *cnt);
BOOST_CHECK_EQUAL(feature_data->getFeatureIndex(s), (std::size_t)(*cnt));
BOOST_CHECK_EQUAL(feature_data->getFeatureName(*cnt).c_str(), s);
++(*cnt);
}
@@ -35,6 +35,6 @@ BOOST_AUTO_TEST_CASE(set_feature_map) {
CheckFeatureMap(&feature_data, "lm", 2, &cnt);
CheckFeatureMap(&feature_data, "tm", 5, &cnt);
BOOST_CHECK_EQUAL(feature_data.getFeatureIndex("w_0"), cnt);
BOOST_CHECK_EQUAL(feature_data.getFeatureIndex("w_0"), (std::size_t)cnt);
BOOST_CHECK_EQUAL(feature_data.getFeatureName(cnt).c_str(), "w_0");
}

mert/FeatureStats.cpp
View File

@@ -10,6 +10,8 @@
#include <fstream>
#include <cmath>
#include <boost/functional/hash.hpp>
#include "Util.h"
using namespace std;
@@ -81,6 +83,43 @@ SparseVector operator-(const SparseVector& lhs, const SparseVector& rhs) {
return res;
}
std::vector<std::size_t> SparseVector::feats() const {
std::vector<std::size_t> toRet;
for(fvector_t::const_iterator iter = m_fvector.begin();
iter!=m_fvector.end();
iter++) {
toRet.push_back(iter->first);
}
return toRet;
}
std::size_t SparseVector::encode(const std::string& name) {
name2id_t::const_iterator name2id_iter = m_name_to_id.find(name);
size_t id = 0;
if (name2id_iter == m_name_to_id.end()) {
id = m_id_to_name.size();
m_id_to_name.push_back(name);
m_name_to_id[name] = id;
} else {
id = name2id_iter->second;
}
return id;
}
std::string SparseVector::decode(std::size_t id) {
return m_id_to_name[id];
}
bool operator==(SparseVector const& item1, SparseVector const& item2) {
return item1.m_fvector==item2.m_fvector;
}
std::size_t hash_value(SparseVector const& item) {
boost::hash<SparseVector::fvector_t> hasher;
return hasher(item.m_fvector);
}
FeatureStats::FeatureStats()
: m_available_size(kAvailableSize), m_entries(0),
m_array(new FeatureStatsType[m_available_size]) {}
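encode and decode are static, so the name/id mapping is one process-wide interning table shared by every SparseVector; kbmira.cpp below relies on that when it encodes sparse feature names from the init file and decodes ids when writing weights back out. A small usage sketch, assuming only the two methods shown above:

```cpp
#include <cassert>
#include "FeatureStats.h"  // declares SparseVector

int main() {
  std::size_t a = SparseVector::encode("lm_oov");  // interns the name
  std::size_t b = SparseVector::encode("lm_oov");  // same name, same id
  assert(a == b);
  assert(SparseVector::decode(a) == "lm_oov");     // round-trips
  return 0;
}
```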

mert/FeatureStats.h
View File

@@ -28,11 +28,19 @@ public:
void set(const std::string& name, FeatureStatsType value);
void clear();
std::size_t size() const { return m_fvector.size(); }
void write(std::ostream& out, const std::string& sep = " ") const;
SparseVector& operator-=(const SparseVector& rhs);
// Added by cherryc
std::vector<std::size_t> feats() const;
friend bool operator==(SparseVector const& item1, SparseVector const& item2);
friend std::size_t hash_value(SparseVector const& item);
static std::size_t encode(const std::string& feat);
static std::string decode(std::size_t feat);
// End added by cherryc
private:
static name2id_t m_name_to_id;
static id2name_t m_id_to_name;

187
mert/HypPackEnumerator.cpp Normal file
View File

@@ -0,0 +1,187 @@
#include "HypPackEnumerator.h"
#include <cassert>
#include <algorithm>
#include <boost/unordered_set.hpp>
StreamingHypPackEnumerator::StreamingHypPackEnumerator
(
vector<std::string> const& featureFiles,
vector<std::string> const& scoreFiles
)
: m_featureFiles(featureFiles),
m_scoreFiles(scoreFiles)
{
if (scoreFiles.size() == 0 || featureFiles.size() == 0) {
cerr << "No data to process" << endl;
exit(0);
}
if (featureFiles.size() != scoreFiles.size()) {
cerr << "Error: Number of feature files (" << featureFiles.size() <<
") does not match number of score files (" << scoreFiles.size() << ")" << endl;
exit(1);
}
m_num_lists = scoreFiles.size();
m_primed = false;
m_iNumDense = -1;
}
size_t StreamingHypPackEnumerator::num_dense() const {
if(m_iNumDense<0) {
cerr << "Error: Requested num_dense() for an unprimed StreamingHypPackEnumerator" << endl;
exit(1);
}
return (size_t) m_iNumDense;
}
void StreamingHypPackEnumerator::prime(){
m_current_indexes.clear();
boost::unordered_set<FeatureDataItem> seen;
m_primed = true;
for (size_t i = 0; i < m_num_lists; ++i) {
if (m_featureDataIters[i] == FeatureDataIterator::end()) {
cerr << "Error: Feature file " << i << " ended prematurely" << endl;
exit(1);
}
if (m_scoreDataIters[i] == ScoreDataIterator::end()) {
cerr << "Error: Score file " << i << " ended prematurely" << endl;
exit(1);
}
if (m_featureDataIters[i]->size() != m_scoreDataIters[i]->size()) {
cerr << "Error: For sentence " << m_sentenceId << " features and scores have different size" << endl;
exit(1);
}
for (size_t j = 0; j < m_featureDataIters[i]->size(); ++j) {
FeatureDataItem item = m_featureDataIters[i]->operator[](j);
// Dedup
if(seen.find(item)==seen.end()) {
seen.insert(item);
// Confirm dense features are always the same
int iDense = item.dense.size();
if(m_iNumDense != iDense) {
if(m_iNumDense==-1) m_iNumDense = iDense;
else {
cerr << "Error: expecting constant number of dense features: "
<< m_iNumDense << " != " << iDense << endl;
exit(1);
}
}
// Store item for retrieval
m_current_indexes.push_back(pair<size_t,size_t>(i,j));
}
}
}
}
void StreamingHypPackEnumerator::reset(){
m_featureDataIters.clear();
m_scoreDataIters.clear();
for (size_t i = 0; i < m_num_lists; ++i) {
m_featureDataIters.push_back(FeatureDataIterator(m_featureFiles[i]));
m_scoreDataIters.push_back(ScoreDataIterator(m_scoreFiles[i]));
}
m_sentenceId=0;
prime();
}
bool StreamingHypPackEnumerator::finished(){
return m_featureDataIters[0]==FeatureDataIterator::end();
}
void StreamingHypPackEnumerator::next(){
if(!m_primed) {
cerr << "Enumerating an unprimed HypPackEnumerator" << endl;
exit(1);
}
for (size_t i = 0; i < m_num_lists; ++i) {
++m_featureDataIters[i];
++m_scoreDataIters[i];
}
m_sentenceId++;
if(!finished()) prime();
}
size_t StreamingHypPackEnumerator::cur_size(){
if(!m_primed) {
cerr << "Querying size from an unprimed HypPackEnumerator" << endl;
exit(1);
}
return m_current_indexes.size();
}
const FeatureDataItem& StreamingHypPackEnumerator::featuresAt(size_t index){
if(!m_primed) {
cerr << "Querying features from an unprimed HypPackEnumerator" << endl;
exit(1);
}
const pair<size_t,size_t>& pij = m_current_indexes[index];
return m_featureDataIters[pij.first]->operator[](pij.second);
}
const ScoreDataItem& StreamingHypPackEnumerator::scoresAt(size_t index) {
if(!m_primed) {
cerr << "Querying scores from an unprimed HypPackEnumerator" << endl;
exit(1);
}
const pair<size_t,size_t>& pij = m_current_indexes[index];
return m_scoreDataIters[pij.first]->operator[](pij.second);
}
/* --------- RandomAccessHypPackEnumerator ------------- */
RandomAccessHypPackEnumerator::RandomAccessHypPackEnumerator(vector<string> const& featureFiles,
vector<string> const& scoreFiles,
bool no_shuffle)
{
StreamingHypPackEnumerator train(featureFiles,scoreFiles);
size_t index=0;
for(train.reset(); !train.finished(); train.next()) {
m_features.push_back(vector<FeatureDataItem>());
m_scores.push_back(vector<ScoreDataItem>());
for(size_t j=0;j<train.cur_size();j++) {
m_features.back().push_back(train.featuresAt(j));
m_scores.back().push_back(train.scoresAt(j));
}
m_indexes.push_back(index++);
}
m_cur_index = 0;
m_no_shuffle = no_shuffle;
m_num_dense = train.num_dense();
}
size_t RandomAccessHypPackEnumerator::num_dense() const {
return m_num_dense;
}
void RandomAccessHypPackEnumerator::reset() {
m_cur_index = 0;
if(!m_no_shuffle) random_shuffle(m_indexes.begin(),m_indexes.end());
}
bool RandomAccessHypPackEnumerator::finished() {
return m_cur_index >= m_indexes.size();
}
void RandomAccessHypPackEnumerator::next() {
m_cur_index++;
}
size_t RandomAccessHypPackEnumerator::cur_size() {
assert(m_features[m_indexes[m_cur_index]].size()==m_scores[m_indexes[m_cur_index]].size());
return m_features[m_indexes[m_cur_index]].size();
}
const FeatureDataItem& RandomAccessHypPackEnumerator::featuresAt(size_t i) {
return m_features[m_indexes[m_cur_index]][i];
}
const ScoreDataItem& RandomAccessHypPackEnumerator::scoresAt(size_t i) {
return m_scores[m_indexes[m_cur_index]][i];
}
// --Emacs trickery--
// Local Variables:
// mode:c++
// c-basic-offset:2
// End:

101
mert/HypPackEnumerator.h Normal file
View File

@@ -0,0 +1,101 @@
/*
* HypPackCollection.h
* kbmira - k-best Batch MIRA
*
* Abstracts away the mess of iterating through multiple
* collections of k-best lists, as well as deduping
*/
#ifndef MERT_HYP_PACK_COLLECTION_H
#define MERT_HYP_PACK_COLLECTION_H
#include <string>
#include <vector>
#include <utility>
#include "FeatureDataIterator.h"
#include "ScoreDataIterator.h"
using namespace std;
// Start with these abstract classes
class HypPackEnumerator {
public:
virtual void reset() = 0;
virtual bool finished() = 0;
virtual void next() = 0;
virtual size_t cur_size() = 0;
virtual size_t num_dense() const = 0;
virtual const FeatureDataItem& featuresAt(size_t i) = 0;
virtual const ScoreDataItem& scoresAt(size_t i) = 0;
};
// Instantiation that streams from disk
// Low-memory, low-speed, sequential access
class StreamingHypPackEnumerator : public HypPackEnumerator {
public:
StreamingHypPackEnumerator(vector<string> const& featureFiles,
vector<string> const& scoreFiles
);
virtual size_t num_dense() const;
virtual void reset();
virtual bool finished();
virtual void next();
virtual size_t cur_size();
virtual const FeatureDataItem& featuresAt(size_t i);
virtual const ScoreDataItem& scoresAt(size_t i);
private:
void prime();
size_t m_num_lists;
size_t m_sentenceId;
vector<string> m_featureFiles;
vector<string> m_scoreFiles;
bool m_primed;
int m_iNumDense;
vector<FeatureDataIterator> m_featureDataIters;
vector<ScoreDataIterator> m_scoreDataIters;
vector<pair<size_t,size_t> > m_current_indexes;
};
// Instantiation that reads into memory
// High-memory, high-speed, random access
// (Actually randomizes with each call to reset)
class RandomAccessHypPackEnumerator : public HypPackEnumerator {
public:
RandomAccessHypPackEnumerator(vector<string> const& featureFiles,
vector<string> const& scoreFiles,
bool no_shuffle);
virtual size_t num_dense() const;
virtual void reset();
virtual bool finished();
virtual void next();
virtual size_t cur_size();
virtual const FeatureDataItem& featuresAt(size_t i);
virtual const ScoreDataItem& scoresAt(size_t i);
private:
bool m_no_shuffle;
size_t m_cur_index;
size_t m_num_dense;
vector<size_t> m_indexes;
vector<vector<FeatureDataItem> > m_features;
vector<vector<ScoreDataItem> > m_scores;
};
#endif // MERT_HYP_PACK_COLLECTION_H
// --Emacs trickery--
// Local Variables:
// mode:c++
// c-basic-offset:2
// End:
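Both concrete enumerators are driven through the same reset/finished/next protocol; evaluate() in kbmira.cpp below is the canonical caller. A minimal sketch of the intended access pattern:

```cpp
#include <cstddef>
#include "HypPackEnumerator.h"

// One full epoch over every sentence's deduplicated k-best pack.
void onePass(HypPackEnumerator* train) {
  for (train->reset(); !train->finished(); train->next()) {
    for (size_t i = 0; i < train->cur_size(); ++i) {
      const FeatureDataItem& feats = train->featuresAt(i);
      const ScoreDataItem& scores = train->scoresAt(i);
      (void)feats; (void)scores;  // score the hypothesis, pick hope/fear, ...
    }
  }
}
```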

mert/Jamfile
View File

@@ -15,6 +15,9 @@ FeatureStats.cpp
FeatureArray.cpp
FeatureData.cpp
FeatureDataIterator.cpp
MiraFeatureVector.cpp
MiraWeightVector.cpp
HypPackEnumerator.cpp
Data.cpp
BleuScorer.cpp
SemposScorer.cpp
@@ -52,7 +55,9 @@ exe evaluator : evaluator.cpp mert_lib ;
exe pro : pro.cpp mert_lib ..//boost_program_options ;
alias programs : mert extractor evaluator pro ;
exe kbmira : kbmira.cpp mert_lib ..//boost_program_options ;
alias programs : mert extractor evaluator pro kbmira ;
unit-test bleu_scorer_test : BleuScorerTest.cpp mert_lib ..//boost_unit_test_framework ;
unit-test feature_data_test : FeatureDataTest.cpp mert_lib ..//boost_unit_test_framework ;

144
mert/MiraFeatureVector.cpp Normal file
View File

@@ -0,0 +1,144 @@
#include <cmath>
#include "MiraFeatureVector.h"
MiraFeatureVector::MiraFeatureVector(const FeatureDataItem& vec)
: m_dense(vec.dense)
{
vector<size_t> sparseFeats = vec.sparse.feats();
bool bFirst = true;
size_t lastFeat = 0;
for(size_t i=0;i<sparseFeats.size();i++)
{
size_t feat = m_dense.size() + sparseFeats[i];
m_sparseFeats.push_back(feat);
m_sparseVals.push_back(vec.sparse.get(sparseFeats[i]));
// Check ordered property
if(bFirst) {
bFirst = false;
}
else {
if(lastFeat>=feat) {
cerr << "Error: Feature indeces must be strictly ascending coming out of SparseVector" << endl;
exit(1);
}
}
lastFeat = feat;
}
}
MiraFeatureVector::MiraFeatureVector(const MiraFeatureVector& other)
: m_dense(other.m_dense),
m_sparseFeats(other.m_sparseFeats),
m_sparseVals(other.m_sparseVals)
{
if(m_sparseVals.size()!=m_sparseFeats.size()) {
cerr << "Error: mismatching sparse feat and val sizes" << endl;
exit(1);
}
}
MiraFeatureVector::MiraFeatureVector(const vector<ValType>& dense,
const vector<size_t>& sparseFeats,
const vector<ValType>& sparseVals)
: m_dense(dense),
m_sparseFeats(sparseFeats),
m_sparseVals(sparseVals)
{
if(m_sparseVals.size()!=m_sparseFeats.size()) {
cerr << "Error: mismatching sparse feat and val sizes" << endl;
exit(1);
}
}
ValType MiraFeatureVector::val(size_t index) const {
if(index < m_dense.size())
return m_dense[index];
else
return m_sparseVals[index-m_dense.size()];
}
size_t MiraFeatureVector::feat(size_t index) const {
if(index < m_dense.size())
return index;
else
return m_sparseFeats[index-m_dense.size()];
}
size_t MiraFeatureVector::size() const {
return m_dense.size() + m_sparseVals.size();
}
ValType MiraFeatureVector::sqrNorm() const {
ValType toRet = 0.0;
for(size_t i=0;i<m_dense.size();i++)
toRet += m_dense[i]*m_dense[i];
for(size_t i=0;i<m_sparseVals.size();i++)
toRet += m_sparseVals[i] * m_sparseVals[i];
return toRet;
}
MiraFeatureVector operator-(const MiraFeatureVector& a, const MiraFeatureVector& b)
{
// Dense subtraction
vector<ValType> dense;
if(a.m_dense.size()!=b.m_dense.size()) {
cerr << "Mismatching dense vectors passed to MiraFeatureVector subtraction" << endl;
exit(1);
}
for(size_t i=0;i<a.m_dense.size();i++) {
dense.push_back(a.m_dense[i] - b.m_dense[i]);
}
// Sparse subtraction
size_t i=0;
size_t j=0;
vector<ValType> sparseVals;
vector<size_t> sparseFeats;
while(i < a.m_sparseFeats.size() && j < b.m_sparseFeats.size()) {
if(a.m_sparseFeats[i] < b.m_sparseFeats[j]) {
sparseFeats.push_back(a.m_sparseFeats[i]);
sparseVals.push_back(a.m_sparseVals[i]);
i++;
}
else if(b.m_sparseFeats[j] < a.m_sparseFeats[i]) {
sparseFeats.push_back(b.m_sparseFeats[j]);
sparseVals.push_back(-b.m_sparseVals[j]);
j++;
}
else {
ValType newVal = a.m_sparseVals[i] - b.m_sparseVals[j];
if(abs(newVal)>1e-6) {
sparseFeats.push_back(a.m_sparseFeats[i]);
sparseVals.push_back(newVal);
}
i++;
j++;
}
}
while(i<a.m_sparseFeats.size()) {
sparseFeats.push_back(a.m_sparseFeats[i]);
sparseVals.push_back(a.m_sparseVals[i]);
i++;
}
while(j<b.m_sparseFeats.size()) {
sparseFeats.push_back(b.m_sparseFeats[j]);
sparseVals.push_back(-b.m_sparseVals[j]);
j++;
}
// Create and return vector
return MiraFeatureVector(dense,sparseFeats,sparseVals);
}
// --Emacs trickery--
// Local Variables:
// mode:c++
// c-basic-offset:2
// End:
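One detail worth calling out: a MiraFeatureVector exposes a single flat index space. Positions below m_dense.size() are dense features addressed by their own index; positions at or above it walk the sparse arrays, whose stored feature ids were already shifted by the dense size in the constructor. A short illustrative sketch of consuming that layout:

```cpp
#include <cstdio>
#include "MiraFeatureVector.h"

// Prints every (feature id, value) pair; ids below the dense count are
// dense features, larger ids are shifted sparse ids.
void dump(const MiraFeatureVector& v) {
  for (size_t i = 0; i < v.size(); ++i)
    std::printf("feat=%zu val=%f\n", v.feat(i), static_cast<double>(v.val(i)));
}
```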

51
mert/MiraFeatureVector.h Normal file
View File

@@ -0,0 +1,51 @@
/*
* MiraFeatureVector.h
* kbmira - k-best Batch MIRA
*
* An alternative to the existing SparseVector
* and FeatureDataItem combo. Should be as memory
* efficient, and a little more time efficient,
* and should save me from constantly hacking
* SparseVector
*/
#ifndef MERT_MIRA_FEATURE_VECTOR_H
#define MERT_MIRA_FEATURE_VECTOR_H
#include <vector>
#include "FeatureDataIterator.h"
using namespace std;
typedef FeatureStatsType ValType;
class MiraFeatureVector {
public:
MiraFeatureVector(const FeatureDataItem& vec);
MiraFeatureVector(const MiraFeatureVector& other);
MiraFeatureVector(const vector<ValType>& dense,
const vector<size_t>& sparseFeats,
const vector<ValType>& sparseVals);
ValType val(size_t index) const;
size_t feat(size_t index) const;
size_t size() const;
ValType sqrNorm() const;
friend MiraFeatureVector operator-(const MiraFeatureVector& a,
const MiraFeatureVector& b);
private:
vector<ValType> m_dense;
vector<size_t> m_sparseFeats;
vector<ValType> m_sparseVals;
};
#endif // MERT_MIRA_FEATURE_VECTOR_H
// --Emacs trickery--
// Local Variables:
// mode:c++
// c-basic-offset:2
// End:

143
mert/MiraWeightVector.cpp Normal file
View File

@@ -0,0 +1,143 @@
#include "MiraWeightVector.h"
/**
* Constructor, initializes to the zero vector
*/
MiraWeightVector::MiraWeightVector()
: m_weights(),
m_totals(),
m_lastUpdated()
{
m_numUpdates = 0;
}
/**
* Constructor with provided initial vector
* \param init Initial feature values
*/
MiraWeightVector::MiraWeightVector(const vector<ValType>& init)
: m_weights(init),
m_totals(init),
m_lastUpdated(init.size(), 0)
{
m_numUpdates = 0;
}
/**
* Update the model
* \param fv Feature vector to be added to the weights
* \param tau FV will be scaled by this value before update
*/
void MiraWeightVector::update(const MiraFeatureVector& fv, float tau) {
m_numUpdates++;
for(size_t i=0;i<fv.size();i++) {
update(fv.feat(i), fv.val(i)*tau);
}
}
/**
* Perform an empty update (affects averaging)
*/
void MiraWeightVector::tick() {
m_numUpdates++;
}
/**
* Score a feature vector according to the model
* \param fv Feature vector to be scored
*/
ValType MiraWeightVector::score(const MiraFeatureVector& fv) const {
ValType toRet = 0.0;
for(size_t i=0; i<fv.size(); i++) {
toRet += weight(fv.feat(i)) * fv.val(i);
}
return toRet;
}
/**
* Return an averaged view of this weight vector
*/
AvgWeightVector MiraWeightVector::avg() {
this->fixTotals();
return AvgWeightVector(*this);
}
/**
* Updates a weight and lazily updates its total
*/
void MiraWeightVector::update(size_t index, ValType delta) {
// Handle previously unseen weights
while(index>=m_weights.size()) {
m_weights.push_back(0.0);
m_totals.push_back(0.0);
m_lastUpdated.push_back(0);
}
// Book keeping for w = w + delta
m_totals[index] += (m_numUpdates - m_lastUpdated[index]) * m_weights[index] + delta;
m_weights[index] += delta;
m_lastUpdated[index] = m_numUpdates;
}
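The bookkeeping line keeps averaging O(1) per touched weight: a weight that sat unchanged for several updates contributes that many copies of its old value to its total the next time it is touched (or when fixTotals sweeps before avg()). In symbols, after T calls that advance m_numUpdates, the invariant maintained is

$$\text{m\_totals}[i] \;=\; \sum_{t=1}^{T} w_i^{(t)}, \qquad \bar{w}_i \;=\; \frac{\text{m\_totals}[i]}{T},$$

which is the averaged-perceptron-style view that AvgWeightVector::weight returns below.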
/**
* Make sure everyone's total is up-to-date
*/
void MiraWeightVector::fixTotals() {
for(size_t i=0; i<m_weights.size(); i++) update(i,0);
}
/**
* Helper to handle out of range weights
*/
ValType MiraWeightVector::weight(size_t index) const {
if(index < m_weights.size()) {
return m_weights[index];
}
else {
return 0;
}
}
ValType MiraWeightVector::sqrNorm() const {
ValType toRet = 0;
for(size_t i=0;i<m_weights.size();i++) {
toRet += weight(i) * weight(i);
}
return toRet;
}
AvgWeightVector::AvgWeightVector(const MiraWeightVector& wv)
:m_wv(wv)
{}
ValType AvgWeightVector::weight(size_t index) const
{
if(m_wv.m_numUpdates==0) return m_wv.weight(index);
else {
if(index < m_wv.m_totals.size()) {
return m_wv.m_totals[index] / m_wv.m_numUpdates;
}
else {
return 0;
}
}
}
ValType AvgWeightVector::score(const MiraFeatureVector& fv) const {
ValType toRet = 0.0;
for(size_t i=0; i<fv.size(); i++) {
toRet += weight(fv.feat(i)) * fv.val(i);
}
return toRet;
}
size_t AvgWeightVector::size() const {
return m_wv.m_weights.size();
}
// --Emacs trickery--
// Local Variables:
// mode:c++
// c-basic-offset:2
// End:

106
mert/MiraWeightVector.h Normal file
View File

@@ -0,0 +1,106 @@
/*
* MiraWeightVector.h
* kbmira - k-best Batch MIRA
*
* A self-averaging weight-vector. Good for
* perceptron learning as well.
*
*/
#ifndef MERT_MIRA_WEIGHT_VECTOR_H
#define MERT_MIRA_WEIGHT_VECTOR_H
#include <vector>
#include "MiraFeatureVector.h"
using namespace std;
class AvgWeightVector;
class MiraWeightVector {
public:
/**
* Constructor, initializes to the zero vector
*/
MiraWeightVector();
/**
* Constructor with provided initial vector
* \param init Initial feature values
*/
MiraWeightVector(const vector<ValType>& init);
/**
* Update the model
* \param fv Feature vector to be added to the weights
* \param tau FV will be scaled by this value before update
*/
void update(const MiraFeatureVector& fv, float tau);
/**
* Perform an empty update (affects averaging)
*/
void tick();
/**
* Score a feature vector according to the model
* \param fv Feature vector to be scored
*/
ValType score(const MiraFeatureVector& fv) const;
/**
* Squared norm of the weight vector
*/
ValType sqrNorm() const;
/**
* Return an averaged view of this weight vector
*/
AvgWeightVector avg();
friend class AvgWeightVector;
private:
/**
* Updates a weight and lazily updates its total
*/
void update(size_t index, ValType delta);
/**
* Make sure everyone's total is up-to-date
*/
void fixTotals();
/**
* Helper to handle out-of-range weights
*/
ValType weight(size_t index) const;
vector<ValType> m_weights;
vector<ValType> m_totals;
vector<size_t> m_lastUpdated;
size_t m_numUpdates;
};
/**
* Averaged view of a weight vector
*/
class AvgWeightVector {
public:
AvgWeightVector(const MiraWeightVector& wv);
ValType score(const MiraFeatureVector& fv) const;
ValType weight(size_t index) const;
size_t size() const;
private:
const MiraWeightVector& m_wv;
};
#endif // MERT_MIRA_WEIGHT_VECTOR_H
// --Emacs trickery--
// Local Variables:
// mode:c++
// c-basic-offset:2
// End:

298
mert/kbmira.cpp Normal file
View File

@@ -0,0 +1,298 @@
// $Id$
// vim:tabstop=2
/***********************************************************************
***********************************************************************/
/**
* k-best Batch Mira, as described in:
*
* Colin Cherry and George Foster
* Batch Tuning Strategies for Statistical Machine Translation
* NAACL 2012
*
* Implemented by colin.cherry@nrc-cnrc.gc.ca
*
* To license implementations of any of the other tuners in that paper,
* please get in touch with any member of NRC Canada's Portage project
*
* Input is a set of n-best lists, encoded as feature and score files.
*
* Output is a weight file that results from running MIRA on these
* n-best lists for J iterations. Will return the set that maximizes
* training BLEU.
**/
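In the paper's terms, each sentence yields a hope hypothesis (maximum model score plus BLEU) and a fear hypothesis (maximum model score minus BLEU), and the weights take a capped step toward hope. My transcription of the update that main() implements below:

$$x^{+}=\arg\max_{x}\,\big(w\cdot\phi(x)+\mathrm{BLEU}(x)\big),\qquad x^{-}=\arg\max_{x}\,\big(w\cdot\phi(x)-\mathrm{BLEU}(x)\big)$$

$$\Delta\phi=\phi(x^{+})-\phi(x^{-}),\qquad \ell=\big(\mathrm{BLEU}(x^{+})-\mathrm{BLEU}(x^{-})\big)-w\cdot\Delta\phi$$

$$\text{if } \ell>0:\quad w \;\leftarrow\; w+\eta\,\Delta\phi,\qquad \eta=\min\!\big(C,\;\ell/\lVert\Delta\phi\rVert^{2}\big)$$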
#include <cmath>
#include <cstddef>
#include <cstdlib>
#include <ctime>
#include <cassert>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>
#include <utility>
#include <algorithm>
#include <boost/program_options.hpp>
#include <boost/scoped_ptr.hpp>
#include "BleuScorer.h"
#include "HypPackEnumerator.h"
#include "MiraFeatureVector.h"
#include "MiraWeightVector.h"
using namespace std;
namespace po = boost::program_options;
ValType evaluate(HypPackEnumerator* train, const AvgWeightVector& wv) {
vector<ValType> stats(kBleuNgramOrder*2+1,0);
for(train->reset(); !train->finished(); train->next()) {
// Find max model
size_t max_index=0;
ValType max_score=0;
for(size_t i=0;i<train->cur_size();i++) {
MiraFeatureVector vec(train->featuresAt(i));
ValType score = wv.score(vec);
if(i==0 || score > max_score) {
max_index = i;
max_score = score;
}
}
// Update stats
const vector<float>& sent = train->scoresAt(max_index);
for(size_t i=0;i<sent.size();i++) {
stats[i]+=sent[i];
}
}
return unsmoothedBleu(stats);
}
int main(int argc, char** argv)
{
bool help;
string denseInitFile;
string sparseInitFile;
vector<string> scoreFiles;
vector<string> featureFiles;
int seed;
string outputFile;
float c = 0.01; // Step-size cap C
float decay = 0.999; // Pseudo-corpus decay \gamma
int n_iters = 60; // Max epochs J
bool streaming = false; // Stream all k-best lists?
bool no_shuffle = false; // Don't shuffle, even for in memory version
bool model_bg = false; // Use model for background corpus
// Command-line processing follows pro.cpp
po::options_description desc("Allowed options");
desc.add_options()
("help,h", po::value(&help)->zero_tokens()->default_value(false), "Print this help message and exit")
("scfile,S", po::value<vector<string> >(&scoreFiles), "Scorer data files")
("ffile,F", po::value<vector<string> > (&featureFiles), "Feature data files")
("random-seed,r", po::value<int>(&seed), "Seed for random number generation")
("output-file,o", po::value<string>(&outputFile), "Output file")
("cparam,C", po::value<float>(&c), "MIRA C-parameter, lower for more regularization (default 0.01)")
("decay,D", po::value<float>(&decay), "BLEU background corpus decay rate (default 0.999)")
("iters,J", po::value<int>(&n_iters), "Number of MIRA iterations to run (default 60)")
("dense-init,d", po::value<string>(&denseInitFile), "Weight file for dense features")
("sparse-init,s", po::value<string>(&sparseInitFile), "Weight file for sparse features")
("streaming", po::value(&streaming)->zero_tokens()->default_value(false), "Stream n-best lists to save memory, implies --no-shuffle")
("no-shuffle", po::value(&no_shuffle)->zero_tokens()->default_value(false), "Don't shuffle hypotheses before each epoch")
("model-bg", po::value(&model_bg)->zero_tokens()->default_value(false), "Use model instead of hope for BLEU background");
;
po::options_description cmdline_options;
cmdline_options.add(desc);
po::variables_map vm;
po::store(po::command_line_parser(argc,argv).
options(cmdline_options).run(), vm);
po::notify(vm);
if (help) {
cout << "Usage: " + string(argv[0]) + " [options]" << endl;
cout << desc << endl;
exit(0);
}
if (vm.count("random-seed")) {
cerr << "Initialising random seed to " << seed << endl;
srand(seed);
} else {
cerr << "Initialising random seed from system clock" << endl;
srand(time(NULL));
}
// Initialize weights
///
// Dense
vector<parameter_t> initParams;
if(!denseInitFile.empty()) {
ifstream opt(denseInitFile.c_str());
string buffer;
if (opt.fail()) {
cerr << "could not open dense initfile: " << denseInitFile << endl;
exit(3);
}
parameter_t val;
getline(opt,buffer);
istringstream strstrm(buffer);
while(strstrm >> val) initParams.push_back(val);
opt.close();
}
size_t initDenseSize = initParams.size();
// Sparse
if(!sparseInitFile.empty()) {
if(initDenseSize==0) {
cerr << "sparse initialization requires dense initialization" << endl;
exit(3);
}
ifstream opt(sparseInitFile.c_str());
if(opt.fail()) {
cerr << "could not open sparse initfile: " << sparseInitFile << endl;
exit(3);
}
int sparseCount=0;
parameter_t val; std::string name;
while(opt >> name >> val) {
size_t id = SparseVector::encode(name) + initDenseSize;
while(initParams.size()<=id) initParams.push_back(0.0);
initParams[id] = val;
sparseCount++;
}
cerr << "Found " << sparseCount << " initial sparse features" << endl;
opt.close();
}
MiraWeightVector wv(initParams);
// Initialize background corpus
vector<ValType> bg;
for(int j=0;j<kBleuNgramOrder;j++){
bg.push_back(kBleuNgramOrder-j);
bg.push_back(kBleuNgramOrder-j);
}
bg.push_back(kBleuNgramOrder);
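This is the exponentially decayed pseudo-corpus from the paper: the initial pseudo-counts (kBleuNgramOrder−j matched and total n-grams for each order j, plus a pseudo reference length of kBleuNgramOrder) keep early sentence-level BLEU well-defined, and after every sentence the loop further down applies, with hope stats by default or model stats under --model-bg:

$$b \;\leftarrow\; \gamma\,b + \mathrm{stats}(x_{\text{chosen}}),\qquad \gamma=0.999 \text{ (the --decay default)}$$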
// Training loop
boost::scoped_ptr<HypPackEnumerator> train;
if(streaming)
train.reset(new StreamingHypPackEnumerator(featureFiles, scoreFiles));
else
train.reset(new RandomAccessHypPackEnumerator(featureFiles, scoreFiles, no_shuffle));
cerr << "Initial BLEU = " << evaluate(train.get(), wv.avg()) << endl;
ValType bestBleu = 0;
for(int j=0;j<n_iters;j++)
{
// MIRA train for one epoch
int iNumHyps = 0;
int iNumExamples = 0;
int iNumUpdates = 0;
ValType totalLoss = 0.0;
for(train->reset(); !train->finished(); train->next()) {
// Hope / fear decode
size_t hope_index=0, fear_index=0, model_index=0;
ValType hope_score=0, fear_score=0, model_score=0;
for(size_t i=0; i< train->cur_size(); i++) {
MiraFeatureVector vec(train->featuresAt(i));
ValType score = wv.score(vec);
ValType bleu = sentenceLevelBackgroundBleu(train->scoresAt(i),bg);
// Hope
if(i==0 || (score + bleu) > hope_score) {
hope_score = score + bleu;
hope_index = i;
}
// Fear
if(i==0 || (score - bleu) > fear_score) {
fear_score = score - bleu;
fear_index = i;
}
// Model
if(i==0 || score > model_score) {
model_score = score;
model_index = i;
}
iNumHyps++;
}
// Update weights
if(hope_index!=fear_index) {
// Vector difference
MiraFeatureVector hope(train->featuresAt(hope_index));
MiraFeatureVector fear(train->featuresAt(fear_index));
MiraFeatureVector diff = hope - fear;
// Bleu difference
const vector<float>& hope_stats = train->scoresAt(hope_index);
ValType hopeBleu = sentenceLevelBackgroundBleu(hope_stats, bg);
const vector<float>& fear_stats = train->scoresAt(fear_index);
ValType fearBleu = sentenceLevelBackgroundBleu(fear_stats, bg);
assert(hopeBleu > fearBleu);
ValType delta = hopeBleu - fearBleu;
// Loss and update
ValType diff_score = wv.score(diff);
ValType loss = delta - diff_score;
if(loss > 0) {
ValType eta = min(c, loss / diff.sqrNorm());
wv.update(diff,eta);
totalLoss+=loss;
iNumUpdates++;
}
// Update BLEU statistics
const vector<float>& model_stats = train->scoresAt(model_index);
for(size_t k=0;k<bg.size();k++) {
bg[k]*=decay;
if(model_bg)
bg[k]+=model_stats[k];
else
bg[k]+=hope_stats[k];
}
}
iNumExamples++;
}
// Training Epoch summary
cerr << iNumUpdates << "/" << iNumExamples << " updates"
<< ", avg loss = " << (totalLoss / iNumExamples);
// Evaluate current average weights
AvgWeightVector avg = wv.avg();
ValType bleu = evaluate(train.get(), avg);
cerr << ", BLEU = " << bleu << endl;
if(bleu > bestBleu) {
size_t num_dense = train->num_dense();
if(initDenseSize>0 && initDenseSize!=num_dense) {
cerr << "Error: Initial dense feature count and dense feature count from n-best do not match: "
<< initDenseSize << "!=" << num_dense << endl;
exit(1);
}
// Write to a file
ostream* out;
ofstream outFile;
if (!outputFile.empty() ) {
outFile.open(outputFile.c_str());
if (!(outFile)) {
cerr << "Error: Failed to open " << outputFile << endl;
exit(1);
}
out = &outFile;
} else {
out = &cout;
}
for(size_t i=0;i<avg.size();i++) {
if(i<num_dense)
*out << "F" << i << " " << avg.weight(i) << endl;
else {
if(abs(avg.weight(i))>1e-8)
*out << SparseVector::decode(i-num_dense) << " " << avg.weight(i) << endl;
}
}
outFile.close();
bestBleu = bleu;
}
}
cerr << "Best BLEU = " << bestBleu << endl;
}
// --Emacs trickery--
// Local Variables:
// mode:c++
// c-basic-offset:2
// End:

scripts/training/mert-moses.pl
View File

@@ -117,6 +117,9 @@ my $___HISTORIC_INTERPOLATION = 0; # interpolate optimize weights with previous
# TODO: Should we also add these values to options of this script?
my $megam_default_options = "-fvals -maxi 30 -nobias binary";
# Flags related to Batch MIRA (Cherry & Foster, 2012)
my $___BATCH_MIRA = 0; # flag to enable batch MIRA
my $__THREADS = 0;
# Parameter for effective reference length when computing BLEU score
@@ -206,6 +209,7 @@ GetOptions(
"pairwise-ranked" => \$___PAIRWISE_RANKED_OPTIMIZER,
"pro-starting-point" => \$___PRO_STARTING_POINT,
"historic-interpolation=f" => \$___HISTORIC_INTERPOLATION,
"batch-mira" => \$___BATCH_MIRA,
"threads=i" => \$__THREADS
) or exit(1);
@@ -324,10 +328,12 @@ if (!defined $mertdir) {
my $mert_extract_cmd = File::Spec->catfile($mertdir, "extractor");
my $mert_mert_cmd = File::Spec->catfile($mertdir, "mert");
my $mert_pro_cmd = File::Spec->catfile($mertdir, "pro");
my $mert_mira_cmd = File::Spec->catfile($mertdir, "kbmira");
die "Not executable: $mert_extract_cmd" if ! -x $mert_extract_cmd;
die "Not executable: $mert_mert_cmd" if ! -x $mert_mert_cmd;
die "Not executable: $mert_pro_cmd" if ! -x $mert_pro_cmd;
die "Not executable: $mert_mira_cmd" if ! -x $mert_mira_cmd;
my $pro_optimizer = File::Spec->catfile($mertdir, "megam_i686.opt"); # or set to your installation
@@ -727,6 +733,11 @@ while (1) {
$scfiles = "$score_file";
}
my $mira_settings = "";
$mira_settings .= " --dense-init run$run.$weights_in_file";
if (-e "run$run.sparse-weights") {
$mira_settings .= " --sparse-init run$run.sparse-weights";
}
my $file_settings = " --ffile $ffiles --scfile $scfiles";
my $pro_file_settings = "--ffile " . join(" --ffile ", split(/,/, $ffiles)) .
" --scfile " . join(" --scfile ", split(/,/, $scfiles));
@@ -759,6 +770,10 @@ while (1) {
# ... and run mert
$cmd =~ s/(--ifile \S+)/$1,run$run.init.pro/;
&submit_or_exec($cmd . $mert_settings, $mert_outfile, $mert_logfile);
} elsif ($___BATCH_MIRA) { # batch MIRA optimization
safesystem("echo 'not used' > $weights_out_file") or die;
$cmd = "$mert_mira_cmd $mira_settings $seed_settings $pro_file_settings -o $mert_outfile";
&submit_or_exec($cmd, "run$run.mira.out", $mert_logfile);
} else { # just mert
&submit_or_exec($cmd . $mert_settings, $mert_outfile, $mert_logfile);
}
@@ -906,7 +921,7 @@ chdir($cwd);
sub get_weights_from_mert {
my ($outfile, $logfile, $weight_count, $sparse_weights) = @_;
my ($bestpoint, $devbleu);
if ($___PAIRWISE_RANKED_OPTIMIZER || ($___PRO_STARTING_POINT && $logfile =~ /pro/)) {
if ($___PAIRWISE_RANKED_OPTIMIZER || ($___PRO_STARTING_POINT && $logfile =~ /pro/) || $___BATCH_MIRA) {
open my $fh, '<', $outfile or die "Can't open $outfile: $!";
my (@WEIGHT, $sum);
for (my $i = 0; $i < $weight_count; $i++) { push @WEIGHT, 0; }
@@ -923,6 +938,14 @@ sub get_weights_from_mert {
foreach (keys %{$sparse_weights}) { $$sparse_weights{$_} /= $sum; }
$bestpoint = join(" ", @WEIGHT);
close $fh;
if($___BATCH_MIRA) {
open my $fh2, '<', $logfile or die "Can't open $logfile: $!";
while(<$fh2>) {
if(/Best BLEU = ([\-\d\.]+)/) {
$devbleu = $1;
}
}
}
} else {
open my $fh, '<', $logfile or die "Can't open $logfile: $!";
while (<$fh>) {