code clean-up, step 1

git-svn-id: http://svn.statmt.org/repository/mira@3918 cc96ff50-19ce-11e0-b349-13d7f0bd23df

commit 120be1df4f, parent 8e6c963041
@@ -32,8 +32,6 @@ using namespace Moses;
 
 namespace Mira {
 
-//Decoder::~Decoder() {}
-
 /**
  * Allocates a char* and copies string into it.
 **/
@@ -70,7 +68,7 @@ namespace Mira {
 
 MosesDecoder::MosesDecoder(bool scaleByInputLength, float historySmoothing)
   : m_manager(NULL) {
-  // force initialisation of the phrase dictionary (TODO: what for?)
+  // force initialisation of the phrase dictionary (TODO: why?)
   const StaticData &staticData = StaticData::Instance();
   m_sentence = new Sentence(Input);
   stringstream in("Initialising decoder..\n");
@@ -176,61 +174,6 @@ namespace Mira {
   return best;
 }
 
-vector<float> MosesDecoder::getBleuAndScore(const std::string& source,
-    size_t sentenceid,
-    float bleuObjectiveWeight,
-    float bleuScoreWeight,
-    bool distinct,
-    size_t rank,
-    size_t epoch)
-{
-  StaticData &staticData = StaticData::InstanceNonConst();
-
-  m_sentence = new Sentence(Input);
-  stringstream in(source + "\n");
-  const std::vector<FactorType> &inputFactorOrder = staticData.GetInputFactorOrder();
-  m_sentence->Read(in,inputFactorOrder);
-  const TranslationSystem& system = staticData.GetTranslationSystem(TranslationSystem::DEFAULT);
-
-  // set the weight for the bleu feature
-  ostringstream bleuWeightStr;
-  bleuWeightStr << (bleuObjectiveWeight * bleuScoreWeight);
-  PARAM_VEC bleuWeight(1,bleuWeightStr.str());
-
-  staticData.GetParameter()->OverwriteParam("weight-bl", bleuWeight);
-  staticData.ReLoadBleuScoreFeatureParameter();
-
-  m_bleuScoreFeature->SetCurrentSourceLength((*m_sentence).GetSize());
-  m_bleuScoreFeature->SetCurrentReference(sentenceid);
-
-  //run the decoder
-  m_manager = new Moses::Manager(*m_sentence, staticData.GetSearchAlgorithm(), &system);
-  m_manager->ProcessSentence();
-  TrellisPathList sentences;
-  m_manager->CalcNBest(1, sentences, distinct);
-
-  // read off the feature values and bleu scores for each sentence in the nbest list
-  Moses::TrellisPathList::const_iterator iter = sentences.begin();
-  vector<float> bleuAndScore;
-  const Moses::TrellisPath &path = **iter;
-  float bleuScore = getBleuScore(path.GetScoreBreakdown());
-  float scoreWithoutBleu = path.GetTotalScore() - (bleuObjectiveWeight * bleuScoreWeight * bleuScore);
-  bleuAndScore.push_back(bleuScore);
-  bleuAndScore.push_back(scoreWithoutBleu);
-
-  VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", 1best translation: ");
-  Phrase phrase = path.GetTargetPhrase();
-  for (size_t pos = 0; pos < phrase.GetSize(); ++pos) {
-    const Word &word = phrase.GetWord(pos);
-    Word *newWord = new Word(word);
-    VERBOSE(1, *newWord);
-  }
-
-  VERBOSE(1, endl);
-
-  return bleuAndScore;
-}
-
 size_t MosesDecoder::getCurrentInputLength() {
   return (*m_sentence).GetSize();
 }
@@ -270,27 +213,5 @@ namespace Mira {
 void MosesDecoder::printReferenceLength(const vector<size_t>& ref_ids) {
   m_bleuScoreFeature->PrintReferenceLength(ref_ids);
 }
-
-vector<float> MosesDecoder::calculateBleuOfCorpus(const vector< vector< const Word*> >& words, vector<size_t>& ref_ids, size_t epoch, size_t rank) {
-  vector<float> bleu = m_bleuScoreFeature->CalculateBleuOfCorpus(words, ref_ids);
-  if (bleu.size() > 0) {
-    cerr << "\nRank " << rank << ", BLEU after epoch " << epoch << ": " << bleu[4]*100 << ", "
-        << bleu[0]*100 << "/" << bleu[1]*100 << "/" << bleu[2]*100 << "/" << bleu[3]*100 << " "
-        << "(BP=" << bleu[5] << ", " << "ratio=" << bleu[6] << ", "
-        << "hyp_len=" << bleu[7] << ", ref_len=" << bleu[8] << ")" << endl;
-    vector<float> bleuAndRatio(2);
-    bleuAndRatio[0] = bleu[4]*100;
-    bleuAndRatio[1] = bleu[6];
-    return bleuAndRatio;
-  }
-  else {
-    cerr << "\nRank " << rank << ", BLEU after epoch " << epoch << ": 0" << endl;
-    vector<float> bleuAndRatio(2);
-    bleuAndRatio[0] = 0;
-    bleuAndRatio[1] = 0;
-    return bleuAndRatio;
-  }
-}
-
 }
 
@@ -64,20 +64,12 @@ class MosesDecoder {
       bool distinct,
       size_t rank,
       size_t epoch);
-  std::vector<float> getBleuAndScore(const std::string& source,
-      size_t sentenceid,
-      float bleuObjectiveWeight,
-      float bleuScoreWeight,
-      bool distinct,
-      size_t rank,
-      size_t epoch);
   size_t getCurrentInputLength();
   void updateHistory(const std::vector<const Moses::Word*>& words);
   void updateHistory(const std::vector< std::vector< const Moses::Word*> >& words, std::vector<size_t>& sourceLengths, std::vector<size_t>& ref_ids, size_t rank, size_t epoch);
   void loadReferenceSentences(const std::vector<std::vector<std::string> >& refs);
   void printBleuFeatureHistory(std::ostream& out);
   void printReferenceLength(const std::vector<size_t>& ref_ids);
-  std::vector<float> calculateBleuOfCorpus(const std::vector< std::vector< const Moses::Word*> >& words, std::vector<size_t>& ref_ids, size_t epoch, size_t rank);
   Moses::ScoreComponentCollection getWeights();
   void setWeights(const Moses::ScoreComponentCollection& weights);
   void cleanup();
@@ -5,187 +5,6 @@ using namespace std;
 
 namespace Mira {
 
-vector<FValue> Hildreth::optimise (const vector<FVector>& a, const vector<FValue>& b) {
-
-  size_t i;
-  int max_iter = 10000;
-  float eps = 0.00000001;
-  float zero = 0.000000000001;
-
-  vector<FValue> alpha ( b.size() );
-  vector<FValue> F ( b.size() );
-  vector<FValue> kkt ( b.size() );
-
-  float max_kkt = -1e100;
-
-  size_t K = b.size();
-
-  float A[K][K];
-  bool is_computed[K];
-  for ( i = 0; i < K; i++ )
-  {
-    A[i][i] = a[i].inner_product(a[i]);
-    is_computed[i] = false;
-  }
-
-  int max_kkt_i = -1;
-
-
-  for ( i = 0; i < b.size(); i++ )
-  {
-    F[i] = b[i];
-    kkt[i] = F[i];
-    if ( kkt[i] > max_kkt )
-    {
-      max_kkt = kkt[i];
-      max_kkt_i = i;
-    }
-  }
-
-  int iter = 0;
-  FValue diff_alpha;
-  FValue try_alpha;
-  FValue add_alpha;
-
-  while ( max_kkt >= eps && iter < max_iter )
-  {
-
-    diff_alpha = A[max_kkt_i][max_kkt_i] <= zero ? 0.0 : F[max_kkt_i]/A[max_kkt_i][max_kkt_i];
-    try_alpha = alpha[max_kkt_i] + diff_alpha;
-    add_alpha = 0.0;
-
-    if ( try_alpha < 0.0 )
-      add_alpha = -1.0 * alpha[max_kkt_i];
-    else
-      add_alpha = diff_alpha;
-
-    alpha[max_kkt_i] = alpha[max_kkt_i] + add_alpha;
-
-    if ( !is_computed[max_kkt_i] )
-    {
-      for ( i = 0; i < K; i++ )
-      {
-        A[i][max_kkt_i] = a[i].inner_product(a[max_kkt_i] ); // for version 1
-        //A[i][max_kkt_i] = 0; // for version 1
-        is_computed[max_kkt_i] = true;
-      }
-    }
-
-    for ( i = 0; i < F.size(); i++ )
-    {
-      F[i] -= add_alpha * A[i][max_kkt_i];
-      kkt[i] = F[i];
-      if ( alpha[i] > zero )
-        kkt[i] = abs ( F[i] );
-    }
-    max_kkt = -1e100;
-    max_kkt_i = -1;
-    for ( i = 0; i < F.size(); i++ )
-      if ( kkt[i] > max_kkt )
-      {
-        max_kkt = kkt[i];
-        max_kkt_i = i;
-      }
-
-    iter++;
-  }
-
-  return alpha;
-}
-
-vector<FValue> Hildreth::optimise (const vector<FVector>& a, const vector<FValue>& b, FValue C) {
-
-  size_t i;
-  int max_iter = 10000;
-  FValue eps = 0.00000001;
-  FValue zero = 0.000000000001;
-
-  vector<FValue> alpha ( b.size() );
-  vector<FValue> F ( b.size() );
-  vector<FValue> kkt ( b.size() );
-
-  float max_kkt = -1e100;
-
-  size_t K = b.size();
-
-  float A[K][K];
-  bool is_computed[K];
-  for ( i = 0; i < K; i++ )
-  {
-    A[i][i] = a[i].inner_product(a[i]);
-    is_computed[i] = false;
-  }
-
-  int max_kkt_i = -1;
-
-
-  for ( i = 0; i < b.size(); i++ )
-  {
-    F[i] = b[i];
-    kkt[i] = F[i];
-    if ( kkt[i] > max_kkt )
-    {
-      max_kkt = kkt[i];
-      max_kkt_i = i;
-    }
-  }
-
-  int iter = 0;
-  FValue diff_alpha;
-  FValue try_alpha;
-  FValue add_alpha;
-
-  while ( max_kkt >= eps && iter < max_iter )
-  {
-
-    diff_alpha = A[max_kkt_i][max_kkt_i] <= zero ? 0.0 : F[max_kkt_i]/A[max_kkt_i][max_kkt_i];
-    try_alpha = alpha[max_kkt_i] + diff_alpha;
-    add_alpha = 0.0;
-
-    if ( try_alpha < 0.0 )
-      add_alpha = -1.0 * alpha[max_kkt_i];
-    else if (try_alpha > C)
-      add_alpha = C - alpha[max_kkt_i];
-    else
-      add_alpha = diff_alpha;
-
-    alpha[max_kkt_i] = alpha[max_kkt_i] + add_alpha;
-
-    if ( !is_computed[max_kkt_i] )
-    {
-      for ( i = 0; i < K; i++ )
-      {
-        A[i][max_kkt_i] = a[i].inner_product(a[max_kkt_i] ); // for version 1
-        //A[i][max_kkt_i] = 0; // for version 1
-        is_computed[max_kkt_i] = true;
-      }
-    }
-
-    for ( i = 0; i < F.size(); i++ )
-    {
-      F[i] -= add_alpha * A[i][max_kkt_i];
-      kkt[i] = F[i];
-      if (alpha[i] > C - zero)
-        kkt[i]=-kkt[i];
-      else if (alpha[i] > zero)
-        kkt[i] = abs(F[i]);
-
-    }
-    max_kkt = -1e100;
-    max_kkt_i = -1;
-    for ( i = 0; i < F.size(); i++ )
-      if ( kkt[i] > max_kkt )
-      {
-        max_kkt = kkt[i];
-        max_kkt_i = i;
-      }
-
-    iter++;
-  }
-
-  return alpha;
-}
-
 vector<FValue> Hildreth::optimise (const vector<ScoreComponentCollection>& a, const vector<FValue>& b) {
 
   size_t i;
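
[Note, not part of the commit: the two deleted FVector overloads of
Hildreth::optimise duplicate the ScoreComponentCollection overloads the commit
keeps, all implementing Hildreth's coordinate-ascent procedure for the dual of
the MIRA quadratic program. My reading of the removed loop, with a_k the
constraint (feature-difference) vectors and b_k the required margins, is that
it solves

    \max_{\alpha}\ \sum_k \alpha_k b_k - \tfrac{1}{2}\Bigl\lVert \sum_k \alpha_k a_k \Bigr\rVert^{2}
    \quad\text{s.t.}\quad \alpha_k \ge 0
    \ \ (\text{and } \alpha_k \le C \text{ in the slack-capped overload}),

where F[i] tracks the dual residual F_i = b_i - \sum_j \alpha_j (a_i \cdot a_j),
and each iteration updates the alpha with the largest KKT violation.]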
@@ -5,8 +5,6 @@ namespace Mira {
 
 class Hildreth {
 public :
-  static std::vector<Moses::FValue> optimise (const std::vector<Moses::FVector>& a, const std::vector<Moses::FValue>& b );
-  static std::vector<Moses::FValue> optimise (const std::vector<Moses::FVector>& a, const std::vector<Moses::FValue>& b, Moses::FValue C);
   static std::vector<Moses::FValue> optimise (const std::vector<Moses::ScoreComponentCollection>& a, const std::vector<Moses::FValue>& b );
   static std::vector<Moses::FValue> optimise (const std::vector<Moses::ScoreComponentCollection>& a, const std::vector<Moses::FValue>& b, Moses::FValue C);
 };
@@ -166,7 +166,6 @@ int main(int argc, char** argv) {
   string decoder_settings;
   float min_weight_change;
   float decrease_learning_rate;
-  bool devBleu;
   bool normaliseWeights;
   bool print_feature_values;
   bool historyOf1best;
@@ -178,7 +177,6 @@ int main(int argc, char** argv) {
   float bleuScoreWeight;
   float margin_slack;
   float margin_slack_incr;
-  bool analytical_update;
   bool perceptron_update;
   bool hope_fear;
   bool model_hope_fear;
@@ -189,7 +187,6 @@ int main(int argc, char** argv) {
   desc.add_options()
     ("accumulate-weights", po::value<bool>(&accumulateWeights)->default_value(false), "Accumulate and average weights over all epochs")
     ("adapt-after-epoch", po::value<size_t>(&adapt_after_epoch)->default_value(0), "Index of epoch after which adaptive parameters will be adapted")
-    ("analytical-update", po::value<bool>(&analytical_update)->default_value(0), "Use one best lists and compute the update analytically")
     ("average-weights", po::value<bool>(&averageWeights)->default_value(false), "Set decoder weights to average weights after each update")
     ("base-of-log", po::value<size_t>(&baseOfLog)->default_value(10), "Base for log-ing feature values")
     ("batch-size,b", po::value<size_t>(&batchSize)->default_value(1), "Size of batch that is send to optimiser for weight adjustments")
@@ -201,9 +198,7 @@ int main(int argc, char** argv) {
     ("core-weights", po::value<string>(&coreWeightFile), "Weight file containing the core weights (already tuned, have to be non-zero)")
     ("decoder-settings", po::value<string>(&decoder_settings)->default_value(""), "Decoder settings for tuning runs")
     ("decr-learning-rate", po::value<float>(&decrease_learning_rate)->default_value(0),"Decrease learning rate by the given value after every epoch")
-    ("dev-bleu", po::value<bool>(&devBleu)->default_value(true), "Compute BLEU score of oracle translations of the whole tuning set")
     ("distinct-nbest", po::value<bool>(&distinctNbest)->default_value(true), "Use nbest list with distinct translations in inference step")
-    ("weight-dump-frequency", po::value<size_t>(&weightDumpFrequency)->default_value(1), "How often per epoch to dump weights, when using mpi")
     ("epochs,e", po::value<size_t>(&epochs)->default_value(10), "Number of epochs")
     ("fear-n", po::value<int>(&fear_n)->default_value(-1), "Number of fear translations used")
     ("help", po::value(&help)->zero_tokens()->default_value(false), "Print this help message and exit")
@@ -214,12 +209,12 @@ int main(int argc, char** argv) {
     ("hope-n", po::value<int>(&hope_n)->default_value(-1), "Number of hope translations used")
     ("input-file,i", po::value<string>(&inputFile), "Input file containing tokenised source")
     ("learner,l", po::value<string>(&learner)->default_value("mira"), "Learning algorithm")
-    ("margin-slack", po::value<float>(&margin_slack)->default_value(0), "Slack when comparing left and right hand side of constraints")
-    ("margin-incr", po::value<float>(&margin_slack_incr)->default_value(0), "Increment margin slack after every epoch by this amount")
-    ("mira-learning-rate", po::value<float>(&mira_learning_rate)->default_value(1), "Learning rate for MIRA (fixed or flexible)")
     ("log-feature-values", po::value<bool>(&logFeatureValues)->default_value(false), "Take log of feature values according to the given base.")
+    ("margin-incr", po::value<float>(&margin_slack_incr)->default_value(0), "Increment margin slack after every epoch by this amount")
+    ("margin-slack", po::value<float>(&margin_slack)->default_value(0), "Slack when comparing left and right hand side of constraints")
     ("min-learning-rate", po::value<float>(&min_learning_rate)->default_value(0), "Set a minimum learning rate")
     ("min-weight-change", po::value<float>(&min_weight_change)->default_value(0.01), "Set minimum weight change for stopping criterion")
+    ("mira-learning-rate", po::value<float>(&mira_learning_rate)->default_value(1), "Learning rate for MIRA (fixed or flexible)")
     ("mixing-frequency", po::value<size_t>(&mixingFrequency)->default_value(5), "How often per epoch to mix weights, when using mpi")
     ("model-hope-fear", po::value<bool>(&model_hope_fear)->default_value(false), "Use model, hope and fear translations for optimization")
     ("nbest,n", po::value<size_t>(&n)->default_value(1), "Number of translations in nbest list")
@@ -229,6 +224,8 @@ int main(int argc, char** argv) {
     ("print-feature-values", po::value<bool>(&print_feature_values)->default_value(false), "Print out feature values")
     ("reference-files,r", po::value<vector<string> >(&referenceFiles), "Reference translation files for training")
     ("scale-by-input-length", po::value<bool>(&scaleByInputLength)->default_value(true), "Scale the BLEU score by a history of the input lengths")
+    ("scale-margin", po::value<size_t>(&scale_margin)->default_value(0), "Scale the margin by the Bleu score of the oracle translation")
+    ("scale-update", po::value<bool>(&scale_update)->default_value(false), "Scale the update by the Bleu score of the oracle translation")
     ("sentence-level-bleu", po::value<bool>(&sentenceLevelBleu)->default_value(true), "Use a sentences level bleu scoring function")
     ("shuffle", po::value<bool>(&shuffle)->default_value(false), "Shuffle input sentences before processing")
     ("slack", po::value<float>(&slack)->default_value(0.01), "Use slack in optimizer")
@@ -236,8 +233,7 @@ int main(int argc, char** argv) {
     ("slack-step", po::value<float>(&slack_step)->default_value(0), "Increase slack from epoch to epoch by the value provided")
     ("stop-weights", po::value<bool>(&weightConvergence)->default_value(true), "Stop when weights converge")
     ("verbosity,v", po::value<int>(&verbosity)->default_value(0), "Verbosity level")
-    ("scale-margin", po::value<size_t>(&scale_margin)->default_value(0), "Scale the margin by the Bleu score of the oracle translation")
-    ("scale-update", po::value<bool>(&scale_update)->default_value(false), "Scale the update by the Bleu score of the oracle translation")
+    ("weight-dump-frequency", po::value<size_t>(&weightDumpFrequency)->default_value(1), "How often per epoch to dump weights, when using mpi")
     ("weight-dump-stem", po::value<string>(&weightDumpStem)->default_value("weights"), "Stem of filename to use for dumping weights");
 
   po::options_description cmdline_options;
@@ -355,42 +351,31 @@ int main(int argc, char** argv) {
     perceptron_update = true;
     model_hope_fear = false; // mira only
     hope_fear = false; // mira only
-    analytical_update = false; // mira only
   } else {
     cerr << "Error: Unknown optimiser: " << learner << endl;
     return 1;
   }
 
   // resolve parameter dependencies
-  if (perceptron_update || analytical_update) {
+  if (batchSize > 1 && perceptron_update) {
     batchSize = 1;
-    cerr << "Info: Setting batch size to 1 for perceptron/analytical update" << endl;
+    cerr << "Info: Setting batch size to 1 for perceptron update" << endl;
   }
 
   if (hope_n == -1 && fear_n == -1) {
     hope_n = n;
     fear_n = n;
   }
-  if ((model_hope_fear || analytical_update) && hope_fear) {
+  if (model_hope_fear && hope_fear) {
     hope_fear = false; // is true by default
   }
-  if (!hope_fear && !analytical_update) {
+  if (!hope_fear) {
     model_hope_fear = true;
   }
 
-  if (model_hope_fear && analytical_update) {
-    cerr << "Error: Must choose between model-hope-fear and analytical update" << endl;
-    return 1;
-  }
-
   if (!sentenceLevelBleu) {
     if (!historyOf1best && !historyOfOracles) {
       historyOf1best = true;
     }
   }
 
   if (burnIn && sentenceLevelBleu) {
     burnIn = false;
     cerr << "Info: Burn-in not needed when using sentence-level BLEU, deactivating burn-in." << endl;
@@ -545,7 +530,6 @@ int main(int argc, char** argv) {
   int sumStillViolatedConstraints_lastEpoch = 0;
   int sumConstraintChangeAbs;
   int sumConstraintChangeAbs_lastEpoch = 0;
-  // size_t sumBleuChangeAbs;
   float *sendbuf, *recvbuf;
   sendbuf = (float *) malloc(sizeof(float));
   recvbuf = (float *) malloc(sizeof(float));
@@ -553,7 +537,6 @@ int main(int argc, char** argv) {
     // sum of violated constraints
     sumStillViolatedConstraints = 0;
     sumConstraintChangeAbs = 0;
-    // sumBleuChangeAbs = 0;
 
     numberOfUpdatesThisEpoch = 0;
     // Sum up weights over one epoch, final average uses weights from last epoch
@@ -619,7 +602,7 @@ int main(int argc, char** argv) {
       dummyFeatureValues.push_back(newFeatureValues);
       dummyBleuScores.push_back(newBleuScores);
 
-      if (perceptron_update || analytical_update) {
+      if (perceptron_update) {
         if (historyOf1best) {
           // MODEL (for updating the history)
           cerr << "Rank " << rank << ", run decoder to get 1best wrt model score (for history)" << endl;
@@ -778,15 +761,6 @@ int main(int argc, char** argv) {
         }
       }
 
-      /* // get 1best model results with old weights
-      vector< vector <float > > bestModelOld_batch;
-      for (size_t i = 0; i < actualBatchSize; ++i) {
-        string& input = inputSentences[*current_sid_start + i];
-        vector <float> bestModelOld = decoder->getBleuAndScore(input, *current_sid_start + i, 0.0, bleuScoreWeight, distinctNbest, rank, epoch);
-        bestModelOld_batch.push_back(bestModelOld);
-        decoder->cleanup();
-      }*/
-
       // optionally print out the feature values
       if (print_feature_values) {
         cerr << "\nRank " << rank << ", epoch " << epoch << ", feature values: " << endl;
@@ -826,11 +800,6 @@ int main(int argc, char** argv) {
             featureValuesHope, featureValuesFear, dummy1, dummy1, dummy2,
             learning_rate, rank, epoch);
       }
-      else if (analytical_update) {
-        update_status = ((MiraOptimiser*) optimiser)->updateWeightsAnalytically(mosesWeights,
-            featureValuesHope[0][0], featureValuesFear[0][0], bleuScoresHope[0][0], bleuScoresFear[0][0],
-            ref_ids[0], learning_rate, rank, epoch);
-      }
       else {
         if (hope_fear) {
           if (coreWeightMap.size() > 0) {
@@ -900,17 +869,6 @@ int main(int argc, char** argv) {
       weightDifference.MinusEquals(oldWeights);
       VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", weight difference: " << weightDifference << endl);
 
-      /* // get 1best model results with new weights (for each sentence in batch)
-      vector<float> bestModelNew;
-      for (size_t i = 0; i < actualBatchSize; ++i) {
-        string& input = inputSentences[*current_sid_start + i];
-        bestModelNew = decoder->getBleuAndScore(input, *current_sid_start + i, 0.0, bleuScoreWeight, distinctNbest, rank, epoch);
-        decoder->cleanup();
-        sumBleuChangeAbs += abs(bestModelOld_batch[i][0] - bestModelNew[0]);
-        VERBOSE(2, "Rank " << rank << ", epoch " << epoch << ", 1best model bleu, old: " << bestModelOld_batch[i][0] << ", new: " << bestModelNew[0] << endl);
-        VERBOSE(2, "Rank " << rank << ", epoch " << epoch << ", 1best model score, old: " << bestModelOld_batch[i][1] << ", new: " << bestModelNew[1] << endl);
-      }*/
-
       // update history (for approximate document Bleu)
       if (sentenceLevelBleu) {
         for (size_t i = 0; i < oracles.size(); ++i) {
@@ -110,7 +110,7 @@ vector<int> MiraOptimiser::updateWeights(ScoreComponentCollection& currWeights,
       ScoreComponentCollection update(featureValueDiffs[k]);
       update.MultiplyEquals(alpha);
 
-      // sum up update
+      // sum updates
       summedUpdate.PlusEquals(update);
     }
   }
@@ -122,24 +122,6 @@ vector<int> MiraOptimiser::updateWeights(ScoreComponentCollection& currWeights,
     return status;
   }
 
-  ScoreComponentCollection newWeights(currWeights);
-  newWeights.PlusEquals(summedUpdate);
-
-  // Sanity check: are there still violated constraints after optimisation?
-  int violatedConstraintsAfter = 0;
-  float newDistanceFromOptimum = 0;
-  for (size_t i = 0; i < featureValueDiffs.size(); ++i) {
-    float modelScoreDiff = featureValueDiffs[i].InnerProduct(newWeights);
-    float loss = all_losses[i];
-    float diff = loss - (modelScoreDiff + m_margin_slack);
-    if (diff > epsilon) {
-      ++violatedConstraintsAfter;
-      newDistanceFromOptimum += diff;
-    }
-  }
-  VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", check, violated constraint before: " << violatedConstraintsBefore << ", after: " << violatedConstraintsAfter << ", change: " << violatedConstraintsBefore - violatedConstraintsAfter << endl);
-  VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", check, error before: " << oldDistanceFromOptimum << ", after: " << newDistanceFromOptimum << ", change: " << oldDistanceFromOptimum - newDistanceFromOptimum << endl);
-
   // apply learning rate
   if (learning_rate != 1) {
     VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", update before applying learning rate: " << summedUpdate << endl);
@@ -158,6 +140,21 @@ vector<int> MiraOptimiser::updateWeights(ScoreComponentCollection& currWeights,
   currWeights.PlusEquals(summedUpdate);
   VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", weights after update: " << currWeights << endl);
 
+  // Sanity check: are there still violated constraints after optimisation?
+  int violatedConstraintsAfter = 0;
+  float newDistanceFromOptimum = 0;
+  for (size_t i = 0; i < featureValueDiffs.size(); ++i) {
+    float modelScoreDiff = featureValueDiffs[i].InnerProduct(currWeights);
+    float loss = all_losses[i];
+    float diff = loss - (modelScoreDiff + m_margin_slack);
+    if (diff > epsilon) {
+      ++violatedConstraintsAfter;
+      newDistanceFromOptimum += diff;
+    }
+  }
+  VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", violated constraint before: " << violatedConstraintsBefore << ", after: " << violatedConstraintsAfter << ", change: " << violatedConstraintsBefore - violatedConstraintsAfter << endl);
+  VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", error before: " << oldDistanceFromOptimum << ", after: " << newDistanceFromOptimum << ", change: " << oldDistanceFromOptimum - newDistanceFromOptimum << endl);
+
   vector<int> status(2);
   status[0] = violatedConstraintsBefore;
   status[1] = violatedConstraintsAfter;
@@ -291,25 +288,7 @@ vector<int> MiraOptimiser::updateWeightsHopeFear(Moses::ScoreComponentCollection
     return status;
   }
 
-  ScoreComponentCollection newWeights(currWeights);
-  newWeights.PlusEquals(summedUpdate);
-
-  // Sanity check: are there still violated constraints after optimisation?
-  int violatedConstraintsAfter = 0;
-  float newDistanceFromOptimum = 0;
-  for (size_t i = 0; i < featureValueDiffs.size(); ++i) {
-    float modelScoreDiff = featureValueDiffs[i].InnerProduct(newWeights);
-    float loss = all_losses[i];
-    float diff = loss - (modelScoreDiff + m_margin_slack);
-    if (diff > epsilon) {
-      ++violatedConstraintsAfter;
-      newDistanceFromOptimum += diff;
-    }
-  }
-  VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", check, violated constraint before: " << violatedConstraintsBefore << ", after: " << violatedConstraintsAfter << ", change: " << violatedConstraintsBefore - violatedConstraintsAfter << endl);
-  VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", check, error before: " << oldDistanceFromOptimum << ", after: " << newDistanceFromOptimum << ", change: " << oldDistanceFromOptimum - newDistanceFromOptimum << endl);
-
-  // Apply learning rate (fixed or flexible)
+  // apply learning rate
   if (learning_rate != 1) {
     VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", update before applying learning rate: " << summedUpdate << endl);
     summedUpdate.MultiplyEquals(learning_rate);
@@ -321,107 +300,27 @@ vector<int> MiraOptimiser::updateWeightsHopeFear(Moses::ScoreComponentCollection
   currWeights.PlusEquals(summedUpdate);
   VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", weights after update: " << currWeights << endl);
 
+  // Sanity check: are there still violated constraints after optimisation?
+  int violatedConstraintsAfter = 0;
+  float newDistanceFromOptimum = 0;
+  for (size_t i = 0; i < featureValueDiffs.size(); ++i) {
+    float modelScoreDiff = featureValueDiffs[i].InnerProduct(currWeights);
+    float loss = all_losses[i];
+    float diff = loss - (modelScoreDiff + m_margin_slack);
+    if (diff > epsilon) {
+      ++violatedConstraintsAfter;
+      newDistanceFromOptimum += diff;
+    }
+  }
+  VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", check, violated constraint before: " << violatedConstraintsBefore << ", after: " << violatedConstraintsAfter << ", change: " << violatedConstraintsBefore - violatedConstraintsAfter << endl);
+  VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", check, error before: " << oldDistanceFromOptimum << ", after: " << newDistanceFromOptimum << ", change: " << oldDistanceFromOptimum - newDistanceFromOptimum << endl);
+
   vector<int> statusPlus(2);
   statusPlus[0] = violatedConstraintsBefore;
   statusPlus[1] = violatedConstraintsAfter;
   return statusPlus;
 }
 
-vector<int> MiraOptimiser::updateWeightsAnalytically(ScoreComponentCollection& currWeights,
-    ScoreComponentCollection& featureValuesHope,
-    ScoreComponentCollection& featureValuesFear,
-    float bleuScoreHope,
-    float bleuScoreFear,
-    size_t sentenceId,
-    float learning_rate,
-    size_t rank,
-    size_t epoch) {
-
-  float epsilon = 0.0001;
-  float oldDistanceFromOptimum = 0;
-  bool constraintViolatedBefore = false;
-  ScoreComponentCollection weightUpdate;
-
-  // cerr << "Rank " << rank << ", epoch " << epoch << ", hope: " << featureValuesHope << endl;
-  // cerr << "Rank " << rank << ", epoch " << epoch << ", fear: " << featureValuesFear << endl;
-  ScoreComponentCollection featureValueDiff = featureValuesHope;
-  featureValueDiff.MinusEquals(featureValuesFear);
-  cerr << "Rank " << rank << ", epoch " << epoch << ", hope - fear: " << featureValueDiff << endl;
-  float modelScoreDiff = featureValueDiff.InnerProduct(currWeights);
-  float loss = bleuScoreHope - bleuScoreFear;
-  float diff = 0;
-  if (loss > (modelScoreDiff + m_margin_slack)) {
-    diff = loss - (modelScoreDiff + m_margin_slack);
-  }
-  cerr << "Rank " << rank << ", epoch " << epoch << ", constraint: " << modelScoreDiff << " + " << m_margin_slack << " >= " << loss << " (current violation: " << diff << ")" << endl;
-
-  if (diff > epsilon) {
-    // constraint violated
-    oldDistanceFromOptimum += diff;
-    constraintViolatedBefore = true;
-
-    // compute alpha for given constraint: (loss - model score diff) / || feature value diff ||^2
-    // featureValueDiff.GetL2Norm() * featureValueDiff.GetL2Norm() == featureValueDiff.InnerProduct(featureValueDiff)
-    // from Crammer&Singer 2006: alpha = min {C , l_t/ ||x||^2}
-    float squaredNorm = featureValueDiff.GetL2Norm() * featureValueDiff.GetL2Norm();
-
-    if (squaredNorm > 0) {
-      float alpha = diff / squaredNorm;
-      if (m_slack > 0 ) {
-        if (alpha > m_slack) {
-          alpha = m_slack;
-        }
-        else if (alpha < m_slack*(-1)) {
-          alpha = m_slack*(-1);
-        }
-      }
-
-      cerr << "Rank " << rank << ", epoch " << epoch << ", alpha: " << alpha << endl;
-      featureValueDiff.MultiplyEquals(alpha);
-      weightUpdate.PlusEquals(featureValueDiff);
-    }
-    else {
-      VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", no update because squared norm is 0" << endl);
-    }
-  }
-
-  if (!constraintViolatedBefore) {
-    // constraint satisfied, nothing to do
-    cerr << "Rank " << rank << ", epoch " << epoch << ", constraint already satisfied" << endl;
-    vector<int> status(2);
-    status[0] = 0;
-    status[1] = 0;
-    return status;
-  }
-
-  // sanity check: constraint still violated after optimisation?
-  ScoreComponentCollection newWeights(currWeights);
-  newWeights.PlusEquals(weightUpdate);
-  bool constraintViolatedAfter = false;
-  float newDistanceFromOptimum = 0;
-  featureValueDiff = featureValuesHope;
-  featureValueDiff.MinusEquals(featureValuesFear);
-  modelScoreDiff = featureValueDiff.InnerProduct(newWeights);
-  diff = loss - (modelScoreDiff + m_margin_slack);
-  // approximate comparison between floats!
-  if (diff > epsilon) {
-    constraintViolatedAfter = true;
-    newDistanceFromOptimum += (loss - modelScoreDiff);
-  }
-
-  VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", check, constraint violated before? " << constraintViolatedBefore << ", after? " << constraintViolatedAfter << endl);
-  VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", check, error before: " << oldDistanceFromOptimum << ", after: " << newDistanceFromOptimum << ", change: " << oldDistanceFromOptimum - newDistanceFromOptimum << endl);
-
-  // apply update to weight vector
-  VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", weights before update: " << currWeights << endl);
-  currWeights.PlusEquals(weightUpdate);
-  VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", weights after update: " << currWeights << endl);
-
-  vector<int> status(2);
-  status[0] = 1;
-  status[1] = constraintViolatedAfter ? 1 : 0;
-  return status;
-}
-
 }
 
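
[Note, not part of the commit: the deleted updateWeightsAnalytically performed
the single-constraint MIRA step of Crammer & Singer (2006), cited in the
removed comments. With x the hope-minus-fear feature difference,
\ell = BLEU(hope) - BLEU(fear), and C the cap held in m_slack, the update it
computed when the margin constraint was violated is

    \alpha = \min\!\Bigl(C,\ \frac{\ell - (w \cdot x + \mathrm{margin\_slack})}{\lVert x \rVert^{2}}\Bigr),
    \qquad w \leftarrow w + \alpha\, x .]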
@@ -67,15 +67,6 @@ namespace Mira {
       m_scale_update(scale_update),
       m_margin_slack(margin_slack) { }
 
-    std::vector<int> updateWeightsAnalytically(Moses::ScoreComponentCollection& currWeights,
-        Moses::ScoreComponentCollection& featureValuesHope,
-        Moses::ScoreComponentCollection& featureValuesFear,
-        float bleuScoresHope,
-        float bleuScoresFear,
-        size_t sentenceId,
-        float learning_rate,
-        size_t rank,
-        size_t epoch);
     std::vector<int> updateWeights(Moses::ScoreComponentCollection& currWeights,
         const std::vector<std::vector<Moses::ScoreComponentCollection> >& featureValues,
         const std::vector<std::vector<float> >& losses,
@@ -117,6 +108,7 @@ namespace Mira {
     // scale update with log 10 of oracle BLEU score
     bool m_scale_update;
 
+    // slack when comparing losses to model scores
     float m_margin_slack;
   };
 }
@@ -45,7 +45,6 @@ vector<int> Perceptron::updateWeightsHopeFear(ScoreComponentCollection& currWeig
   vector<int> update_status;
   update_status.push_back(0);
   update_status.push_back(0);
-  update_status.push_back(0);
   return update_status;
 }
 
@@ -94,14 +94,12 @@ void BleuScoreFeature::LoadReferences(const std::vector< std::vector< std::strin
   for (size_t order = 1; order <= BleuScoreState::bleu_order; order++) {
     for (size_t end_idx = order; end_idx <= refTokens.size(); end_idx++) {
       Phrase ngram(Output);
-      //cerr << "start: " << end_idx-order << " end: " << end_idx << endl;
       for (size_t s_idx = end_idx - order; s_idx < end_idx; s_idx++) {
         const Factor* f = fc.AddFactor(Output, 0, refTokens[s_idx]);
         Word w;
         w.SetFactor(0, f);
         ngram.AddWord(w);
       }
-      //cerr << "Ref: " << ngram << endl;
       ref_pair.second[ngram] += 1;
     }
   }
@@ -120,10 +118,10 @@ void BleuScoreFeature::SetCurrentReference(size_t ref_id) {
 }
 
 /*
- * Update the pseudo-document big_O after each translation of a source sentence.
- * (big_O is an exponentially-weighted moving average of vectors c(e;{r_k}))
- * big_O = 0.9 * (big_O + c(e_oracle))
- * big_O_f = 0.9 * (big_O_f + |f|) input length of document big_O
+ * Update the pseudo-document O after each translation of a source sentence.
+ * (O is an exponentially-weighted moving average of vectors c(e;{r_k}))
+ * O = m_historySmoothing * (O + c(e_oracle))
+ * O_f = m_historySmoothing * (O_f + |f|) input length of pseudo-document
  */
 void BleuScoreFeature::UpdateHistory(const vector< const Word* >& hypo) {
   Phrase phrase(Output, hypo);
|
|||||||
for (size_t i = 0; i < BleuScoreState::bleu_order; i++) {
|
for (size_t i = 0; i < BleuScoreState::bleu_order; i++) {
|
||||||
m_count_history[i] = m_historySmoothing * (m_count_history[i] + ngram_counts[i]);
|
m_count_history[i] = m_historySmoothing * (m_count_history[i] + ngram_counts[i]);
|
||||||
m_match_history[i] = m_historySmoothing * (m_match_history[i] + ngram_matches[i]);
|
m_match_history[i] = m_historySmoothing * (m_match_history[i] + ngram_matches[i]);
|
||||||
//cerr << "precisionHistory " << i + 1 << ": " << (m_match_history[i]/m_count_history[i]) << " (" << m_match_history[i] << "/" << m_count_history[i] << ")" << endl;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// update counts for reference and target length
|
// update counts for reference and target length
|
||||||
@ -148,7 +145,7 @@ void BleuScoreFeature::UpdateHistory(const vector< const Word* >& hypo) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Update history with a batch of oracle translations
|
* Update history with a batch of translations
|
||||||
*/
|
*/
|
||||||
void BleuScoreFeature::UpdateHistory(const vector< vector< const Word* > >& hypos, vector<size_t>& sourceLengths, vector<size_t>& ref_ids, size_t rank, size_t epoch) {
|
void BleuScoreFeature::UpdateHistory(const vector< vector< const Word* > >& hypos, vector<size_t>& sourceLengths, vector<size_t>& ref_ids, size_t rank, size_t epoch) {
|
||||||
for (size_t batchPosition = 0; batchPosition < hypos.size(); ++batchPosition){
|
for (size_t batchPosition = 0; batchPosition < hypos.size(); ++batchPosition){
|
||||||
@ -195,7 +192,7 @@ void BleuScoreFeature::UpdateHistory(const vector< vector< const Word* > >& hypo
|
|||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Update history with a batch of oracle translations
|
* Print batch of reference translations
|
||||||
*/
|
*/
|
||||||
void BleuScoreFeature::PrintReferenceLength(const vector<size_t>& ref_ids) {
|
void BleuScoreFeature::PrintReferenceLength(const vector<size_t>& ref_ids) {
|
||||||
for (size_t batchPosition = 0; batchPosition < ref_ids.size(); ++batchPosition){
|
for (size_t batchPosition = 0; batchPosition < ref_ids.size(); ++batchPosition){
|
||||||
@ -325,7 +322,6 @@ FFState* BleuScoreFeature::Evaluate(const Hypothesis& cur_hypo,
|
|||||||
}
|
}
|
||||||
|
|
||||||
new_state->m_source_length = cur_hypo.GetWordsBitmap().GetSize();
|
new_state->m_source_length = cur_hypo.GetWordsBitmap().GetSize();
|
||||||
new_state->m_source_phrase_length = cur_hypo.GetCurrSourceWordsRange().GetNumWordsCovered(); // todo: delete
|
|
||||||
new_state->m_words = new_words.GetSubString(WordsRange(ctx_start_idx,
|
new_state->m_words = new_words.GetSubString(WordsRange(ctx_start_idx,
|
||||||
ctx_end_idx));
|
ctx_end_idx));
|
||||||
new_state->m_target_length += cur_hypo.GetTargetPhrase().GetSize();
|
new_state->m_target_length += cur_hypo.GetTargetPhrase().GetSize();
|
||||||
@ -337,7 +333,6 @@ FFState* BleuScoreFeature::Evaluate(const Hypothesis& cur_hypo,
|
|||||||
|
|
||||||
// Calculate new bleu.
|
// Calculate new bleu.
|
||||||
new_bleu = CalculateBleu(new_state);
|
new_bleu = CalculateBleu(new_state);
|
||||||
//cerr << "NS: " << *new_state << " NB " << new_bleu << endl;
|
|
||||||
|
|
||||||
// Set score to new Bleu score
|
// Set score to new Bleu score
|
||||||
accumulator->PlusEquals(this, new_bleu - old_bleu);
|
accumulator->PlusEquals(this, new_bleu - old_bleu);
|
||||||
@ -396,82 +391,6 @@ float BleuScoreFeature::CalculateBleu(BleuScoreState* state) const {
|
|||||||
return precision;
|
return precision;
|
||||||
}
|
}
|
||||||
|
|
||||||
vector<float> BleuScoreFeature::CalculateBleuOfCorpus(const vector< vector< const Word* > >& oracles, const vector<size_t>& ref_ids) {
|
|
||||||
// get ngram matches and counts for all oracle sentences and their references
|
|
||||||
vector<size_t> sumOfClippedNgramMatches(BleuScoreState::bleu_order);
|
|
||||||
vector<size_t> sumOfNgramCounts(BleuScoreState::bleu_order);
|
|
||||||
size_t ref_length = 0;
|
|
||||||
size_t target_length = 0;
|
|
||||||
|
|
||||||
for (size_t batchPosition = 0; batchPosition < oracles.size(); ++batchPosition){
|
|
||||||
Phrase phrase(Output, oracles[batchPosition]);
|
|
||||||
size_t ref_id = ref_ids[batchPosition];
|
|
||||||
size_t cur_ref_length = m_refs[ref_id].first;
|
|
||||||
NGrams cur_ref_ngrams = m_refs[ref_id].second;
|
|
||||||
|
|
||||||
ref_length += cur_ref_length;
|
|
||||||
target_length += oracles[batchPosition].size();
|
|
||||||
|
|
||||||
std::vector< size_t > ngram_counts(BleuScoreState::bleu_order);
|
|
||||||
std::vector< size_t > clipped_ngram_matches(BleuScoreState::bleu_order);
|
|
||||||
GetClippedNgramMatchesAndCounts(phrase, cur_ref_ngrams, ngram_counts, clipped_ngram_matches, 0);
|
|
||||||
|
|
||||||
// add clipped ngram matches and ngram counts to corpus sums
|
|
||||||
for (size_t i = 0; i < BleuScoreState::bleu_order; i++) {
|
|
||||||
sumOfClippedNgramMatches[i] += clipped_ngram_matches[i];
|
|
||||||
sumOfNgramCounts[i] += ngram_counts[i];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!sumOfNgramCounts[0]) {
|
|
||||||
vector<float> empty(0);
|
|
||||||
return empty;
|
|
||||||
}
|
|
||||||
if (!sumOfClippedNgramMatches[0]) {
|
|
||||||
vector<float> empty(0);
|
|
||||||
return empty; // if we have no unigram matches, score should be 0
|
|
||||||
}
|
|
||||||
|
|
||||||
// calculate bleu score
|
|
||||||
float precision = 1.0;
|
|
||||||
|
|
||||||
vector<float> bleu;
|
|
||||||
// Calculate geometric mean of modified ngram precisions
|
|
||||||
// BLEU = BP * exp(SUM_1_4 1/4 * log p_n)
|
|
||||||
// = BP * 4th root(PRODUCT_1_4 p_n)
|
|
||||||
for (size_t i = 0; i < BleuScoreState::bleu_order; i++) {
|
|
||||||
if (sumOfNgramCounts[i]) {
|
|
||||||
precision *= 1.0*sumOfClippedNgramMatches[i] / sumOfNgramCounts[i];
|
|
||||||
bleu.push_back(1.0*sumOfClippedNgramMatches[i] / sumOfNgramCounts[i]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// take geometric mean
|
|
||||||
precision = pow(precision, (float)1/4);
|
|
||||||
|
|
||||||
// Apply brevity penalty if applicable.
|
|
||||||
// BP = 1 if c > r
|
|
||||||
// BP = e^(1- r/c)) if c <= r
|
|
||||||
// where
|
|
||||||
// c: length of the candidate translation
|
|
||||||
// r: effective reference length (sum of best match lengths for each candidate sentence)
|
|
||||||
float BP;
|
|
||||||
if (target_length < ref_length) {
|
|
||||||
precision *= exp(1 - (1.0*ref_length/target_length));
|
|
||||||
BP = exp(1 - (1.0*ref_length/target_length));
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
BP = 1.0;
|
|
||||||
}
|
|
||||||
|
|
||||||
bleu.push_back(precision);
|
|
||||||
bleu.push_back(BP);
|
|
||||||
bleu.push_back(1.0*target_length/ref_length);
|
|
||||||
bleu.push_back(target_length);
|
|
||||||
bleu.push_back(ref_length);
|
|
||||||
return bleu;
|
|
||||||
}
|
|
||||||
|
|
||||||
const FFState* BleuScoreFeature::EmptyHypothesisState(const InputType& input) const
|
const FFState* BleuScoreFeature::EmptyHypothesisState(const InputType& input) const
|
||||||
{
|
{
|
||||||
return new BleuScoreState();
|
return new BleuScoreState();
|
||||||
|
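
[Note, not part of the commit: the deleted CalculateBleuOfCorpus computed
corpus-level BLEU for a batch of oracle translations. Restated from the
comments in the removed code, with p_n the clipped n-gram precisions, c the
candidate length, and r the effective reference length:

    \mathrm{BLEU} = \mathrm{BP}\cdot\exp\Bigl(\sum_{n=1}^{4}\tfrac{1}{4}\log p_n\Bigr)
                  = \mathrm{BP}\cdot\Bigl(\prod_{n=1}^{4} p_n\Bigr)^{1/4},
    \qquad
    \mathrm{BP} = \begin{cases} 1 & c > r \\ e^{\,1-r/c} & c \le r \end{cases}

The returned vector held p_1..p_4, the final score, BP, the ratio c/r and the
two lengths, which is what the bleu[0]..bleu[8] indices in the removed
MosesDecoder::calculateBleuOfCorpus refer to.]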
@@ -29,8 +29,6 @@ private:
   size_t m_source_length;
   size_t m_target_length;
 
-  size_t m_source_phrase_length; // todo: delete
-
   // scaled reference length is needed for scoring incomplete hypotheses against reference translation
   float m_scaled_ref_length;
 
|
|||||||
m_target_length_history(0),
|
m_target_length_history(0),
|
||||||
m_ref_length_history(0),
|
m_ref_length_history(0),
|
||||||
m_scale_by_input_length(true),
|
m_scale_by_input_length(true),
|
||||||
m_historySmoothing(0.9) {}
|
m_historySmoothing(0.7) {}
|
||||||
|
|
||||||
BleuScoreFeature(bool scaleByInputLength, float historySmoothing):
|
BleuScoreFeature(bool scaleByInputLength, float historySmoothing):
|
||||||
StatefulFeatureFunction("BleuScore"),
|
StatefulFeatureFunction("BleuScore"),
|
||||||
@ -101,11 +99,10 @@ public:
|
|||||||
const FFState* prev_state,
|
const FFState* prev_state,
|
||||||
ScoreComponentCollection* accumulator) const;
|
ScoreComponentCollection* accumulator) const;
|
||||||
float CalculateBleu(BleuScoreState*) const;
|
float CalculateBleu(BleuScoreState*) const;
|
||||||
std::vector<float> CalculateBleuOfCorpus(const std::vector< std::vector< const Word* > >& hypos, const std::vector<size_t>& ref_ids);
|
|
||||||
const FFState* EmptyHypothesisState(const InputType&) const;
|
const FFState* EmptyHypothesisState(const InputType&) const;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
// counts for pseudo-document big_O
|
// counts for pseudo-document
|
||||||
std::vector< float > m_count_history;
|
std::vector< float > m_count_history;
|
||||||
std::vector< float > m_match_history;
|
std::vector< float > m_match_history;
|
||||||
float m_source_length_history;
|
float m_source_length_history;
|
||||||
@ -117,9 +114,10 @@ private:
|
|||||||
NGrams m_cur_ref_ngrams;
|
NGrams m_cur_ref_ngrams;
|
||||||
size_t m_cur_ref_length;
|
size_t m_cur_ref_length;
|
||||||
|
|
||||||
// whether or not to scale the BLEU score by a history of the input size
|
// scale BLEU score by history of input size
|
||||||
bool m_scale_by_input_length;
|
bool m_scale_by_input_length;
|
||||||
|
|
||||||
|
// smoothing factor for history counts
|
||||||
float m_historySmoothing;
|
float m_historySmoothing;
|
||||||
};
|
};
|
||||||
|
|
||||||
|