mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-09-20 07:42:21 +03:00
change verbosity for cerr messages, remove some unwanted options, introduce --margin-slack, --margin-incr
git-svn-id: http://svn.statmt.org/repository/mira@3913 cc96ff50-19ce-11e0-b349-13d7f0bd23df
This commit is contained in:
parent
0585646b2d
commit
42333388b4
@ -68,7 +68,7 @@ namespace Mira {
|
||||
delete[] mosesargv;
|
||||
}
|
||||
|
||||
MosesDecoder::MosesDecoder(bool useScaledReference, bool scaleByInputLength, float historySmoothing)
|
||||
MosesDecoder::MosesDecoder(bool scaleByInputLength, float historySmoothing)
|
||||
: m_manager(NULL) {
|
||||
// force initialisation of the phrase dictionary (TODO: what for?)
|
||||
const StaticData &staticData = StaticData::Instance();
|
||||
@ -82,7 +82,7 @@ namespace Mira {
|
||||
m_manager->ProcessSentence();
|
||||
|
||||
// Add the bleu feature
|
||||
m_bleuScoreFeature = new BleuScoreFeature(useScaledReference, scaleByInputLength, historySmoothing);
|
||||
m_bleuScoreFeature = new BleuScoreFeature(scaleByInputLength, historySmoothing);
|
||||
(const_cast<TranslationSystem&>(system)).AddFeatureFunction(m_bleuScoreFeature);
|
||||
}
|
||||
|
||||
@ -100,7 +100,8 @@ namespace Mira {
|
||||
vector< float>& bleuScores,
|
||||
bool oracle,
|
||||
bool distinct,
|
||||
size_t rank)
|
||||
size_t rank,
|
||||
size_t epoch)
|
||||
{
|
||||
StaticData &staticData = StaticData::InstanceNonConst();
|
||||
|
||||
@ -137,11 +138,11 @@ namespace Mira {
|
||||
|
||||
//std::cout << "Score breakdown: " << path.GetScoreBreakdown() << endl;
|
||||
float scoreWithoutBleu = path.GetTotalScore() - (bleuObjectiveWeight * bleuScoreWeight * bleuScore);
|
||||
cerr << "Rank " << rank << ", total score: " << path.GetTotalScore() << ", Score w/o bleu: " << scoreWithoutBleu << ", Bleu: " << bleuScore << endl;
|
||||
cerr << "Rank " << rank << ", epoch " << epoch << ", total score: " << path.GetTotalScore() << ", Score w/o bleu: " << scoreWithoutBleu << ", Bleu: " << bleuScore << endl;
|
||||
|
||||
Phrase bestPhrase = path.GetTargetPhrase();
|
||||
|
||||
cerr << "Rank " << rank << ": ";
|
||||
cerr << "Rank " << rank << ", epoch " << epoch << ": ";
|
||||
Phrase phrase = path.GetTargetPhrase();
|
||||
for (size_t pos = 0; pos < phrase.GetSize(); ++pos) {
|
||||
const Word &word = phrase.GetWord(pos);
|
||||
@ -179,7 +180,9 @@ namespace Mira {
|
||||
size_t sentenceid,
|
||||
float bleuObjectiveWeight,
|
||||
float bleuScoreWeight,
|
||||
bool distinct)
|
||||
bool distinct,
|
||||
size_t rank,
|
||||
size_t epoch)
|
||||
{
|
||||
StaticData &staticData = StaticData::InstanceNonConst();
|
||||
|
||||
@ -215,15 +218,15 @@ namespace Mira {
|
||||
bleuAndScore.push_back(bleuScore);
|
||||
bleuAndScore.push_back(scoreWithoutBleu);
|
||||
|
||||
cerr << "1best translation: ";
|
||||
VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", 1best translation: ");
|
||||
Phrase phrase = path.GetTargetPhrase();
|
||||
for (size_t pos = 0; pos < phrase.GetSize(); ++pos) {
|
||||
const Word &word = phrase.GetWord(pos);
|
||||
Word *newWord = new Word(word);
|
||||
cerr << *newWord;
|
||||
VERBOSE(1, *newWord);
|
||||
}
|
||||
|
||||
cerr << endl;
|
||||
VERBOSE(1, endl);
|
||||
|
||||
return bleuAndScore;
|
||||
}
|
||||
@ -245,7 +248,6 @@ namespace Mira {
|
||||
}
|
||||
|
||||
void MosesDecoder::setWeights(const ScoreComponentCollection& weights) {
|
||||
//cerr << "New weights: " << weights << endl;
|
||||
StaticData::InstanceNonConst().SetAllWeights(weights);
|
||||
}
|
||||
|
||||
|
@ -50,7 +50,7 @@ void initMoses(const std::string& inifile, int debuglevel, int argc, std::vecto
|
||||
**/
|
||||
class MosesDecoder {
|
||||
public:
|
||||
MosesDecoder(bool useScaledReference, bool scaleByInputLength, float historySmoothing);
|
||||
MosesDecoder(bool scaleByInputLength, float historySmoothing);
|
||||
|
||||
//returns the best sentence
|
||||
std::vector<const Moses::Word*> getNBest(const std::string& source,
|
||||
@ -62,12 +62,15 @@ class MosesDecoder {
|
||||
std::vector< float>& scores,
|
||||
bool oracle,
|
||||
bool distinct,
|
||||
size_t rank);
|
||||
size_t rank,
|
||||
size_t epoch);
|
||||
std::vector<float> getBleuAndScore(const std::string& source,
|
||||
size_t sentenceid,
|
||||
float bleuObjectiveWeight,
|
||||
float bleuScoreWeight,
|
||||
bool distinct);
|
||||
bool distinct,
|
||||
size_t rank,
|
||||
size_t epoch);
|
||||
size_t getCurrentInputLength();
|
||||
void updateHistory(const std::vector<const Moses::Word*>& words);
|
||||
void updateHistory(const std::vector< std::vector< const Moses::Word*> >& words, std::vector<size_t>& sourceLengths, std::vector<size_t>& ref_ids, size_t rank, size_t epoch);
|
||||
@ -77,13 +80,13 @@ class MosesDecoder {
|
||||
std::vector<float> calculateBleuOfCorpus(const std::vector< std::vector< const Moses::Word*> >& words, std::vector<size_t>& ref_ids, size_t epoch, size_t rank);
|
||||
Moses::ScoreComponentCollection getWeights();
|
||||
void setWeights(const Moses::ScoreComponentCollection& weights);
|
||||
void cleanup();
|
||||
void cleanup();
|
||||
|
||||
private:
|
||||
float getBleuScore(const Moses::ScoreComponentCollection& scores);
|
||||
void setBleuScore(Moses::ScoreComponentCollection& scores, float bleu);
|
||||
Moses::Manager *m_manager;
|
||||
Moses::Sentence *m_sentence;
|
||||
Moses::Manager *m_manager;
|
||||
Moses::Sentence *m_sentence;
|
||||
Moses::BleuScoreFeature *m_bleuScoreFeature;
|
||||
|
||||
|
||||
|
409
mira/Main.cpp
409
mira/Main.cpp
@ -144,7 +144,6 @@ int main(int argc, char** argv) {
|
||||
size_t weightDumpFrequency;
|
||||
string weightDumpStem;
|
||||
float min_learning_rate;
|
||||
float min_sentence_update;
|
||||
size_t scale_margin;
|
||||
bool scale_update;
|
||||
size_t n;
|
||||
@ -153,14 +152,12 @@ int main(int argc, char** argv) {
|
||||
bool onlyViolatedConstraints;
|
||||
bool accumulateWeights;
|
||||
float historySmoothing;
|
||||
bool useScaledReference;
|
||||
bool scaleByInputLength;
|
||||
float slack;
|
||||
float slack_step;
|
||||
float slack_min;
|
||||
bool averageWeights;
|
||||
bool weightConvergence;
|
||||
bool controlUpdates;
|
||||
float learning_rate;
|
||||
float mira_learning_rate;
|
||||
float perceptron_learning_rate;
|
||||
@ -168,24 +165,18 @@ int main(int argc, char** argv) {
|
||||
size_t baseOfLog;
|
||||
string decoder_settings;
|
||||
float min_weight_change;
|
||||
float max_sentence_update;
|
||||
float decrease_learning_rate;
|
||||
float decrease_sentence_update;
|
||||
bool devBleu;
|
||||
bool normaliseWeights;
|
||||
bool print_feature_values;
|
||||
bool stop_dev_bleu;
|
||||
bool stop_approx_dev_bleu;
|
||||
bool train_linear_classifier;
|
||||
bool multiplyA;
|
||||
bool historyOf1best;
|
||||
bool burnIn;
|
||||
string burnInInputFile;
|
||||
vector<string> burnInReferenceFiles;
|
||||
bool sentenceLevelBleu;
|
||||
float bleuScoreWeight;
|
||||
float precision;
|
||||
float min_bleu_change;
|
||||
float margin_slack;
|
||||
float margin_slack_incr;
|
||||
bool analytical_update;
|
||||
bool perceptron_update;
|
||||
bool hope_fear;
|
||||
@ -204,49 +195,42 @@ int main(int argc, char** argv) {
|
||||
("burn-in-input-file", po::value<string>(&burnInInputFile), "Input file for burn-in phase of BLEU history")
|
||||
("burn-in-reference-files", po::value<vector<string> >(&burnInReferenceFiles), "Reference file for burn-in phase of BLEU history")
|
||||
("config,f", po::value<string>(&mosesConfigFile), "Moses ini file")
|
||||
("control-updates", po::value<bool>(&controlUpdates)->default_value(true), "Ignore updates that increase number of violated constraints AND increase the error")
|
||||
("core-weights", po::value<string>(&coreWeightFile), "Weight file containing the core weights (already tuned, have to be non-zero)")
|
||||
("decoder-settings", po::value<string>(&decoder_settings)->default_value(""), "Decoder settings for tuning runs")
|
||||
("decr-learning-rate", po::value<float>(&decrease_learning_rate)->default_value(0),"Decrease learning rate by the given value after every epoch")
|
||||
("decr-sentence-update", po::value<float>(&decrease_sentence_update)->default_value(0), "Decrease maximum weight update by the given value after every epoch")
|
||||
("dev-bleu", po::value<bool>(&devBleu)->default_value(true), "Compute BLEU score of oracle translations of the whole tuning set")
|
||||
("distinct-nbest", po::value<bool>(&distinctNbest)->default_value(true), "Use nbest list with distinct translations in inference step")
|
||||
("weight-dump-frequency", po::value<size_t>(&weightDumpFrequency)->default_value(1), "How often per epoch to dump weights, when using mpi")
|
||||
("epochs,e", po::value<size_t>(&epochs)->default_value(5), "Number of epochs")
|
||||
("epochs,e", po::value<size_t>(&epochs)->default_value(10), "Number of epochs")
|
||||
("fear-n", po::value<int>(&fear_n)->default_value(-1), "Number of fear translations used")
|
||||
("help", po::value(&help)->zero_tokens()->default_value(false), "Print this help message and exit")
|
||||
("history-of-1best", po::value<bool>(&historyOf1best)->default_value(0), "Use the 1best translation to update the history")
|
||||
("history-smoothing", po::value<float>(&historySmoothing)->default_value(0.9), "Adjust the factor for history smoothing")
|
||||
("history-of-1best", po::value<bool>(&historyOf1best)->default_value(false), "Use the 1best translation to update the history")
|
||||
("history-smoothing", po::value<float>(&historySmoothing)->default_value(0.7), "Adjust the factor for history smoothing")
|
||||
("hope-fear", po::value<bool>(&hope_fear)->default_value(true), "Use only hope and fear translations for optimization (not model)")
|
||||
("hope-n", po::value<int>(&hope_n)->default_value(-1), "Number of hope translations used")
|
||||
("input-file,i", po::value<string>(&inputFile), "Input file containing tokenised source")
|
||||
("learner,l", po::value<string>(&learner)->default_value("mira"), "Learning algorithm")
|
||||
("margin-slack", po::value<float>(&margin_slack)->default_value(0), "Slack when comparing left and right hand side of constraints")
|
||||
("margin-incr", po::value<float>(&margin_slack_incr)->default_value(0), "Increment margin slack after every epoch by this amount")
|
||||
("mira-learning-rate", po::value<float>(&mira_learning_rate)->default_value(1), "Learning rate for MIRA (fixed or flexible)")
|
||||
("log-feature-values", po::value<bool>(&logFeatureValues)->default_value(false), "Take log of feature values according to the given base.")
|
||||
("min-bleu-change", po::value<float>(&min_bleu_change)->default_value(0), "Minimum BLEU change of 1best translations of one epoch")
|
||||
("min-sentence-update", po::value<float>(&min_sentence_update)->default_value(0), "Set a minimum weight update per sentence")
|
||||
("min-learning-rate", po::value<float>(&min_learning_rate)->default_value(0), "Set a minimum learning rate")
|
||||
("max-sentence-update", po::value<float>(&max_sentence_update)->default_value(-1), "Set a maximum weight update per sentence")
|
||||
("min-weight-change", po::value<float>(&min_weight_change)->default_value(0.01), "Set minimum weight change for stopping criterion")
|
||||
("mixing-frequency", po::value<size_t>(&mixingFrequency)->default_value(1), "How often per epoch to mix weights, when using mpi")
|
||||
("mixing-frequency", po::value<size_t>(&mixingFrequency)->default_value(5), "How often per epoch to mix weights, when using mpi")
|
||||
("model-hope-fear", po::value<bool>(&model_hope_fear)->default_value(false), "Use model, hope and fear translations for optimization")
|
||||
("nbest,n", po::value<size_t>(&n)->default_value(10), "Number of translations in nbest list")
|
||||
("nbest,n", po::value<size_t>(&n)->default_value(1), "Number of translations in nbest list")
|
||||
("normalise", po::value<bool>(&normaliseWeights)->default_value(false), "Whether to normalise the updated weights before passing them to the decoder")
|
||||
("only-violated-constraints", po::value<bool>(&onlyViolatedConstraints)->default_value(false), "Add only violated constraints to the optimisation problem")
|
||||
("perceptron-learning-rate", po::value<float>(&perceptron_learning_rate)->default_value(0.01), "Perceptron learning rate")
|
||||
("precision", po::value<float>(&precision)->default_value(0), "Precision when comparing left and right hand side of constraints")
|
||||
("print-feature-values", po::value<bool>(&print_feature_values)->default_value(false), "Print out feature values")
|
||||
("reference-files,r", po::value<vector<string> >(&referenceFiles), "Reference translation files for training")
|
||||
("scale-by-input-length", po::value<bool>(&scaleByInputLength)->default_value(true), "Scale the BLEU score by a history of the input lengths")
|
||||
("sentence-level-bleu", po::value<bool>(&sentenceLevelBleu)->default_value(false), "Use a sentences level bleu scoring function")
|
||||
("sentence-level-bleu", po::value<bool>(&sentenceLevelBleu)->default_value(true), "Use a sentences level bleu scoring function")
|
||||
("shuffle", po::value<bool>(&shuffle)->default_value(false), "Shuffle input sentences before processing")
|
||||
("slack", po::value<float>(&slack)->default_value(0.01), "Use slack in optimizer")
|
||||
("slack-min", po::value<float>(&slack_min)->default_value(0.01), "Minimum slack used")
|
||||
("slack-step", po::value<float>(&slack_step)->default_value(0), "Increase slack from epoch to epoch by the value provided")
|
||||
("stop-dev-bleu", po::value<bool>(&stop_dev_bleu)->default_value(false), "Stop when average Bleu (dev) decreases (or no more increases)")
|
||||
("stop-approx-dev-bleu", po::value<bool>(&stop_approx_dev_bleu)->default_value(false), "Stop when average approx. sentence Bleu (dev) decreases (or no more increases)")
|
||||
("stop-weights", po::value<bool>(&weightConvergence)->default_value(true), "Stop when weights converge")
|
||||
("use-scaled-reference", po::value<bool>(&useScaledReference)->default_value(true), "Use scaled reference length for comparing target and reference length of phrases")
|
||||
("verbosity,v", po::value<int>(&verbosity)->default_value(0), "Verbosity level")
|
||||
("scale-margin", po::value<size_t>(&scale_margin)->default_value(0), "Scale the margin by the Bleu score of the oracle translation")
|
||||
("scale-update", po::value<bool>(&scale_update)->default_value(false), "Scale the update by the Bleu score of the oracle translation")
|
||||
@ -255,8 +239,7 @@ int main(int argc, char** argv) {
|
||||
po::options_description cmdline_options;
|
||||
cmdline_options.add(desc);
|
||||
po::variables_map vm;
|
||||
po::store(
|
||||
po::command_line_parser(argc, argv). options(cmdline_options).run(), vm);
|
||||
po::store(po::command_line_parser(argc, argv). options(cmdline_options).run(), vm);
|
||||
po::notify(vm);
|
||||
|
||||
if (help) {
|
||||
@ -329,7 +312,7 @@ int main(int argc, char** argv) {
|
||||
vector<string> decoder_params;
|
||||
boost::split(decoder_params, decoder_settings, boost::is_any_of("\t "));
|
||||
initMoses(mosesConfigFile, verbosity, decoder_params.size(), decoder_params);
|
||||
MosesDecoder* decoder = new MosesDecoder(useScaledReference, scaleByInputLength, historySmoothing);
|
||||
MosesDecoder* decoder = new MosesDecoder(scaleByInputLength, historySmoothing);
|
||||
if (normaliseWeights) {
|
||||
ScoreComponentCollection startWeights = decoder->getWeights();
|
||||
startWeights.L1Normalise();
|
||||
@ -353,12 +336,16 @@ int main(int argc, char** argv) {
|
||||
// initialise optimizer
|
||||
Optimiser* optimiser = NULL;
|
||||
if (learner == "mira") {
|
||||
cerr << "Optimising using Mira" << endl;
|
||||
optimiser = new MiraOptimiser(onlyViolatedConstraints, slack, scale_margin, scale_update, precision);
|
||||
if (rank == 0) {
|
||||
cerr << "Optimising using Mira" << endl;
|
||||
}
|
||||
optimiser = new MiraOptimiser(onlyViolatedConstraints, slack, scale_margin, scale_update, margin_slack);
|
||||
learning_rate = mira_learning_rate;
|
||||
perceptron_update = false;
|
||||
} else if (learner == "perceptron") {
|
||||
cerr << "Optimising using Perceptron" << endl;
|
||||
if (rank == 0) {
|
||||
cerr << "Optimising using Perceptron" << endl;
|
||||
}
|
||||
optimiser = new Perceptron();
|
||||
learning_rate = perceptron_learning_rate;
|
||||
perceptron_update = true;
|
||||
@ -373,7 +360,7 @@ int main(int argc, char** argv) {
|
||||
// resolve parameter dependencies
|
||||
if (perceptron_update || analytical_update) {
|
||||
batchSize = 1;
|
||||
cerr << "Setting batch size to 1 for perceptron/analytical update" << endl;
|
||||
cerr << "Info: Setting batch size to 1 for perceptron/analytical update" << endl;
|
||||
}
|
||||
|
||||
if (hope_n == -1 && fear_n == -1) {
|
||||
@ -385,14 +372,18 @@ int main(int argc, char** argv) {
|
||||
hope_fear = false; // is true by default
|
||||
}
|
||||
|
||||
if (!hope_fear && !analytical_update) {
|
||||
model_hope_fear = true;
|
||||
}
|
||||
|
||||
if (model_hope_fear && analytical_update) {
|
||||
cerr << "Error: must choose between model-hope-fear and analytical update" << endl;
|
||||
cerr << "Error: Must choose between model-hope-fear and analytical update" << endl;
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (burnIn && sentenceLevelBleu) {
|
||||
burnIn = false;
|
||||
cerr << "Burn-in not needed when using sentence-level BLEU, deactivating burn-in." << endl;
|
||||
cerr << "Info: Burn-in not needed when using sentence-level BLEU, deactivating burn-in." << endl;
|
||||
}
|
||||
|
||||
if (burnIn) {
|
||||
@ -436,7 +427,7 @@ int main(int argc, char** argv) {
|
||||
order.push_back(i);
|
||||
}
|
||||
|
||||
cerr << "Rank " << rank << ", starting burn-in phase for approx. BLEU history.." << endl;
|
||||
VERBOSE(1, "Rank " << rank << ", starting burn-in phase for approx. BLEU history.." << endl);
|
||||
if (historyOf1best) {
|
||||
// get 1best translations for the burn-in sentences
|
||||
vector<size_t>::const_iterator sid = order.begin();
|
||||
@ -444,7 +435,7 @@ int main(int argc, char** argv) {
|
||||
string& input = burnInInputSentences[*sid];
|
||||
vector<const Word*> bestModel = decoder->getNBest(input, *sid, 1, 0.0, bleuScoreWeight,
|
||||
featureValues[0], bleuScores[0], true,
|
||||
distinctNbest, rank);
|
||||
distinctNbest, rank, -1);
|
||||
inputLengths.push_back(decoder->getCurrentInputLength());
|
||||
ref_ids.push_back(*sid);
|
||||
decoder->cleanup();
|
||||
@ -468,8 +459,7 @@ int main(int argc, char** argv) {
|
||||
while (sid != order.end()) {
|
||||
string& input = burnInInputSentences[*sid];
|
||||
vector<const Word*> oracle = decoder->getNBest(input, *sid, 1, 1.0, bleuScoreWeight,
|
||||
featureValues[0], bleuScores[0], true,
|
||||
distinctNbest, rank);
|
||||
featureValues[0], bleuScores[0], true, distinctNbest, rank, -1);
|
||||
inputLengths.push_back(decoder->getCurrentInputLength());
|
||||
ref_ids.push_back(*sid);
|
||||
decoder->cleanup();
|
||||
@ -488,7 +478,7 @@ int main(int argc, char** argv) {
|
||||
}
|
||||
}
|
||||
|
||||
cerr << "Bleu feature history after burn-in: " << endl;
|
||||
VERBOSE(1, "Bleu feature history after burn-in: " << endl);
|
||||
decoder->printBleuFeatureHistory(cerr);
|
||||
decoder->loadReferenceSentences(referenceSentences);
|
||||
}
|
||||
@ -532,44 +522,28 @@ int main(int argc, char** argv) {
|
||||
size_t numberOfUpdates = 0;
|
||||
size_t numberOfUpdatesThisEpoch = 0;
|
||||
|
||||
time_t now = time(0); // get current time
|
||||
struct tm* tm = localtime(&now); // get struct filled out
|
||||
cerr << "Start date/time: " << tm->tm_mon + 1 << "/" << tm->tm_mday << "/"
|
||||
<< tm->tm_year + 1900 << ", " << tm->tm_hour << ":" << tm->tm_min << ":"
|
||||
<< tm->tm_sec << endl;
|
||||
time_t now;
|
||||
time(&now);
|
||||
cerr << "Rank " << rank << ", " << ctime(&now) << endl;
|
||||
|
||||
ScoreComponentCollection mixedAverageWeights;
|
||||
ScoreComponentCollection mixedAverageWeightsPrevious;
|
||||
ScoreComponentCollection mixedAverageWeightsBeforePrevious;
|
||||
|
||||
/* float averageRatio = 0;
|
||||
float averageBleu = 0;
|
||||
float prevAverageBleu = 0;
|
||||
float beforePrevAverageBleu = 0;
|
||||
float summedApproxBleu = 0;
|
||||
float averageApproxBleu = 0;
|
||||
float prevAverageApproxBleu = 0;
|
||||
float beforePrevAverageApproxBleu = 0;*/
|
||||
bool stop = false;
|
||||
int sumStillViolatedConstraints;
|
||||
int sumStillViolatedConstraints_lastEpoch = 0;
|
||||
int sumConstraintChangeAbs;
|
||||
int sumConstraintChangeAbs_lastEpoch = 0;
|
||||
size_t sumBleuChangeAbs;
|
||||
// size_t sumBleuChangeAbs;
|
||||
float *sendbuf, *recvbuf;
|
||||
sendbuf = (float *) malloc(sizeof(float));
|
||||
recvbuf = (float *) malloc(sizeof(float));
|
||||
// Note: make sure that the variable mosesWeights always holds the current decoder weights
|
||||
for (size_t epoch = 0; epoch < epochs && !stop; ++epoch) {
|
||||
cerr << "\nRank " << rank << ", epoch " << epoch << endl;
|
||||
|
||||
// sum of violated constraints
|
||||
sumStillViolatedConstraints = 0;
|
||||
sumConstraintChangeAbs = 0;
|
||||
sumBleuChangeAbs = 0;
|
||||
|
||||
// sum of approx. sentence bleu scores per epoch
|
||||
// summedApproxBleu = 0;
|
||||
// sumBleuChangeAbs = 0;
|
||||
|
||||
numberOfUpdatesThisEpoch = 0;
|
||||
// Sum up weights over one epoch, final average uses weights from last epoch
|
||||
@ -601,8 +575,7 @@ int main(int argc, char** argv) {
|
||||
|
||||
// get moses weights
|
||||
ScoreComponentCollection mosesWeights = decoder->getWeights();
|
||||
cerr << "\nRank " << rank << ", next batch" << endl;
|
||||
cerr << "Rank " << rank << ", weights: " << mosesWeights << endl;
|
||||
VERBOSE(1, "\nRank " << rank << ", epoch " << epoch << ", weights: " << mosesWeights << endl);
|
||||
|
||||
// BATCHING: produce nbest lists for all input sentences in batch
|
||||
vector<float> oracleBleuScores;
|
||||
@ -618,8 +591,7 @@ int main(int argc, char** argv) {
|
||||
!= shard.end(); ++batchPosition) {
|
||||
string& input = inputSentences[*sid];
|
||||
const vector<string>& refs = referenceSentences[*sid];
|
||||
cerr << "Rank " << rank << ", batch position " << batchPosition << endl;
|
||||
cerr << "Rank " << rank << ", input sentence " << *sid << ": \"" << input << "\"" << endl;
|
||||
cerr << "\nRank " << rank << ", epoch " << epoch << ", input sentence " << *sid << ": \"" << input << "\"" << " (batch pos " << batchPosition << ")" << endl;
|
||||
|
||||
vector<ScoreComponentCollection> newFeatureValues;
|
||||
vector<float> newBleuScores;
|
||||
@ -640,13 +612,13 @@ int main(int argc, char** argv) {
|
||||
if (perceptron_update || analytical_update) {
|
||||
if (historyOf1best) {
|
||||
// MODEL (for updating the history)
|
||||
cerr << "Rank " << rank << ", run decoder to get " << 1 << "best wrt model score" << endl;
|
||||
cerr << "Rank " << rank << ", run decoder to get 1best wrt model score (for history)" << endl;
|
||||
vector<const Word*> bestModel = decoder->getNBest(input, *sid, 1, 0.0, bleuScoreWeight,
|
||||
dummyFeatureValues[batchPosition], dummyBleuScores[batchPosition], true,
|
||||
distinctNbest, rank);
|
||||
distinctNbest, rank, epoch);
|
||||
decoder->cleanup();
|
||||
oneBests.push_back(bestModel);
|
||||
cerr << "Rank " << rank << ", model length: " << bestModel.size() << " Bleu: " << dummyBleuScores[batchPosition][0] << endl;
|
||||
VERBOSE(1, "Rank " << rank << ", model length: " << bestModel.size() << " Bleu: " << dummyBleuScores[batchPosition][0] << endl);
|
||||
}
|
||||
|
||||
// clear dummies
|
||||
@ -658,22 +630,22 @@ int main(int argc, char** argv) {
|
||||
size_t oraclePos = 0;
|
||||
vector<const Word*> oracle = decoder->getNBest(input, *sid, 1, 1.0, bleuScoreWeight,
|
||||
featureValuesHope[batchPosition], bleuScoresHope[batchPosition], true,
|
||||
distinctNbest, rank);
|
||||
distinctNbest, rank, epoch);
|
||||
// needed for history
|
||||
inputLengths.push_back(decoder->getCurrentInputLength());
|
||||
ref_ids.push_back(*sid);
|
||||
decoder->cleanup();
|
||||
oracles.push_back(oracle);
|
||||
cerr << "Rank " << rank << ", oracle length: " << oracle.size() << " Bleu: " << bleuScoresHope[batchPosition][oraclePos] << endl;
|
||||
VERBOSE(1, "Rank " << rank << ", oracle length: " << oracle.size() << " Bleu: " << bleuScoresHope[batchPosition][oraclePos] << endl);
|
||||
|
||||
// FEAR
|
||||
cerr << "Rank " << rank << ", run decoder to get 1best fear translations" << endl;
|
||||
size_t fearPos = 0;
|
||||
vector<const Word*> fear = decoder->getNBest(input, *sid, 1, -1.0, bleuScoreWeight,
|
||||
featureValuesFear[batchPosition], bleuScoresFear[batchPosition], true,
|
||||
distinctNbest, rank);
|
||||
distinctNbest, rank, epoch);
|
||||
decoder->cleanup();
|
||||
cerr << "Rank " << rank << ", fear length: " << fear.size() << " Bleu: " << bleuScoresFear[batchPosition][fearPos] << endl;
|
||||
VERBOSE(1, "Rank " << rank << ", fear length: " << fear.size() << " Bleu: " << bleuScoresFear[batchPosition][fearPos] << endl);
|
||||
for (size_t i = 0; i < fear.size(); ++i) {
|
||||
delete fear[i];
|
||||
}
|
||||
@ -682,37 +654,34 @@ int main(int argc, char** argv) {
|
||||
if (hope_fear) {
|
||||
if (historyOf1best) {
|
||||
// MODEL (for updating the history only, using dummy vectors)
|
||||
cerr << "Rank " << rank << ", run decoder to get " << 1 << "best wrt model score" << endl;
|
||||
cerr << "dummyFeatureValues.size: " << dummyFeatureValues.size() << endl;
|
||||
cerr << "batch position: " << batchPosition << endl;
|
||||
cerr << "Rank " << rank << ", run decoder to get 1best wrt model score (for history)" << endl;
|
||||
vector<const Word*> bestModel = decoder->getNBest(input, *sid, 1, 0.0, bleuScoreWeight,
|
||||
dummyFeatureValues[batchPosition], dummyBleuScores[batchPosition], true,
|
||||
distinctNbest, rank);
|
||||
cerr << "finished decoding." << endl;
|
||||
distinctNbest, rank, epoch);
|
||||
decoder->cleanup();
|
||||
oneBests.push_back(bestModel);
|
||||
cerr << "Rank " << rank << ", model length: " << bestModel.size() << " Bleu: " << dummyBleuScores[batchPosition][0] << endl;
|
||||
VERBOSE(1, "Rank " << rank << ", model length: " << bestModel.size() << " Bleu: " << dummyBleuScores[batchPosition][0] << endl);
|
||||
}
|
||||
|
||||
// HOPE
|
||||
cerr << "Rank " << rank << ", run decoder to get " << hope_n << "best hope translations" << endl;
|
||||
vector<const Word*> oracle = decoder->getNBest(input, *sid, hope_n, 1.0, bleuScoreWeight,
|
||||
featureValuesHope[batchPosition], bleuScoresHope[batchPosition], true,
|
||||
distinctNbest, rank);
|
||||
distinctNbest, rank, epoch);
|
||||
// needed for history
|
||||
inputLengths.push_back(decoder->getCurrentInputLength());
|
||||
ref_ids.push_back(*sid);
|
||||
decoder->cleanup();
|
||||
oracles.push_back(oracle);
|
||||
cerr << "Rank " << rank << ", oracle length: " << oracle.size() << " Bleu: " << bleuScoresHope[batchPosition][0] << endl;
|
||||
VERBOSE(1, "Rank " << rank << ", oracle length: " << oracle.size() << " Bleu: " << bleuScoresHope[batchPosition][0] << endl);
|
||||
|
||||
// FEAR
|
||||
cerr << "Rank " << rank << ", run decoder to get " << fear_n << "best fear translations" << endl;
|
||||
vector<const Word*> fear = decoder->getNBest(input, *sid, fear_n, -1.0, bleuScoreWeight,
|
||||
featureValuesFear[batchPosition], bleuScoresFear[batchPosition], true,
|
||||
distinctNbest, rank);
|
||||
distinctNbest, rank, epoch);
|
||||
decoder->cleanup();
|
||||
cerr << "Rank " << rank << ", fear length: " << fear.size() << " Bleu: " << bleuScoresFear[batchPosition][0] << endl;
|
||||
VERBOSE(1, "Rank " << rank << ", fear length: " << fear.size() << " Bleu: " << bleuScoresFear[batchPosition][0] << endl);
|
||||
for (size_t i = 0; i < fear.size(); ++i) {
|
||||
delete fear[i];
|
||||
}
|
||||
@ -722,26 +691,26 @@ int main(int argc, char** argv) {
|
||||
cerr << "Rank " << rank << ", run decoder to get " << n << "best wrt model score" << endl;
|
||||
vector<const Word*> bestModel = decoder->getNBest(input, *sid, n, 0.0, bleuScoreWeight,
|
||||
featureValues[batchPosition], bleuScores[batchPosition], true,
|
||||
distinctNbest, rank);
|
||||
distinctNbest, rank, epoch);
|
||||
decoder->cleanup();
|
||||
oneBests.push_back(bestModel);
|
||||
// needed for calculating bleu of dev (1best translations) // todo:
|
||||
all_ref_ids.push_back(*sid);
|
||||
allBestModelScore.push_back(bestModel);
|
||||
cerr << "Rank " << rank << ", model length: " << bestModel.size() << " Bleu: " << bleuScores[batchPosition][0] << endl;
|
||||
VERBOSE(1, "Rank " << rank << ", model length: " << bestModel.size() << " Bleu: " << bleuScores[batchPosition][0] << endl);
|
||||
|
||||
// HOPE
|
||||
cerr << "Rank " << rank << ", run decoder to get " << n << "best hope translations" << endl;
|
||||
size_t oraclePos = featureValues[batchPosition].size();
|
||||
vector<const Word*> oracle = decoder->getNBest(input, *sid, n, 1.0, bleuScoreWeight,
|
||||
featureValues[batchPosition], bleuScores[batchPosition], true,
|
||||
distinctNbest, rank);
|
||||
distinctNbest, rank, epoch);
|
||||
// needed for history
|
||||
inputLengths.push_back(decoder->getCurrentInputLength());
|
||||
ref_ids.push_back(*sid);
|
||||
decoder->cleanup();
|
||||
oracles.push_back(oracle);
|
||||
cerr << "Rank " << rank << ", oracle length: " << oracle.size() << " Bleu: " << bleuScores[batchPosition][oraclePos] << endl;
|
||||
VERBOSE(1, "Rank " << rank << ", oracle length: " << oracle.size() << " Bleu: " << bleuScores[batchPosition][oraclePos] << endl);
|
||||
|
||||
oracleFeatureValues.push_back(featureValues[batchPosition][oraclePos]);
|
||||
oracleBleuScores.push_back(bleuScores[batchPosition][oraclePos]);
|
||||
@ -751,18 +720,15 @@ int main(int argc, char** argv) {
|
||||
size_t fearPos = featureValues[batchPosition].size();
|
||||
vector<const Word*> fear = decoder->getNBest(input, *sid, n, -1.0, bleuScoreWeight,
|
||||
featureValues[batchPosition], bleuScores[batchPosition], true,
|
||||
distinctNbest, rank);
|
||||
distinctNbest, rank, epoch);
|
||||
decoder->cleanup();
|
||||
cerr << "Rank " << rank << ", fear length: " << fear.size() << " Bleu: " << bleuScores[batchPosition][fearPos] << endl;
|
||||
VERBOSE(1, "Rank " << rank << ", fear length: " << fear.size() << " Bleu: " << bleuScores[batchPosition][fearPos] << endl);
|
||||
for (size_t i = 0; i < fear.size(); ++i) {
|
||||
delete fear[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// cerr << "Rank " << rank << ", sentence " << *sid << ", best model Bleu (approximate sentence bleu): " << bleuScores[batchPosition][0] << endl;
|
||||
// summedApproxBleu += bleuScores[batchPosition][0];
|
||||
|
||||
// next input sentence
|
||||
++sid;
|
||||
++actualBatchSize;
|
||||
@ -802,14 +768,14 @@ int main(int argc, char** argv) {
|
||||
}
|
||||
}
|
||||
|
||||
// get 1best model results with old weights
|
||||
/* // get 1best model results with old weights
|
||||
vector< vector <float > > bestModelOld_batch;
|
||||
for (size_t i = 0; i < actualBatchSize; ++i) {
|
||||
string& input = inputSentences[*current_sid_start + i];
|
||||
vector <float> bestModelOld = decoder->getBleuAndScore(input, *current_sid_start + i, 0.0, bleuScoreWeight, distinctNbest);
|
||||
vector <float> bestModelOld = decoder->getBleuAndScore(input, *current_sid_start + i, 0.0, bleuScoreWeight, distinctNbest, rank, epoch);
|
||||
bestModelOld_batch.push_back(bestModelOld);
|
||||
decoder->cleanup();
|
||||
}
|
||||
}*/
|
||||
|
||||
// optionally print out the feature values
|
||||
if (print_feature_values) {
|
||||
@ -840,7 +806,7 @@ int main(int argc, char** argv) {
|
||||
}
|
||||
|
||||
// Run optimiser on batch:
|
||||
cerr << "\nRank " << rank << ", epoch " << epoch << ", run optimiser:" << endl;
|
||||
VERBOSE(1, "\nRank " << rank << ", epoch " << epoch << ", run optimiser:" << endl);
|
||||
ScoreComponentCollection oldWeights(mosesWeights);
|
||||
vector<int> update_status;
|
||||
if (perceptron_update) {
|
||||
@ -848,12 +814,12 @@ int main(int argc, char** argv) {
|
||||
vector<size_t> dummy2;
|
||||
update_status = optimiser->updateWeightsHopeFear(mosesWeights,
|
||||
featureValuesHope, featureValuesFear, dummy1, dummy1, dummy2,
|
||||
learning_rate, 0, rank, epoch, 0);
|
||||
learning_rate, rank, epoch);
|
||||
}
|
||||
else if (analytical_update) {
|
||||
update_status = ((MiraOptimiser*) optimiser)->updateWeightsAnalytically(mosesWeights,
|
||||
featureValuesHope[0][0], featureValuesFear[0][0], bleuScoresHope[0][0], bleuScoresFear[0][0],
|
||||
ref_ids[0], learning_rate, max_sentence_update, rank, epoch, controlUpdates);
|
||||
ref_ids[0], learning_rate, rank, epoch);
|
||||
}
|
||||
else {
|
||||
if (hope_fear) {
|
||||
@ -884,74 +850,64 @@ int main(int argc, char** argv) {
|
||||
|
||||
update_status = optimiser->updateWeightsHopeFear(mosesWeights,
|
||||
featureValuesHope, featureValuesFear, bleuScoresHope, bleuScoresFear, ref_ids,
|
||||
learning_rate, max_sentence_update, rank, epoch, controlUpdates);
|
||||
learning_rate, rank, epoch);
|
||||
}
|
||||
else {
|
||||
// model_hope_fear
|
||||
update_status = ((MiraOptimiser*) optimiser)->updateWeights(mosesWeights, featureValues,
|
||||
losses, bleuScores, oracleFeatureValues, oracleBleuScores, ref_ids,
|
||||
learning_rate, max_sentence_update, rank, epoch, controlUpdates);
|
||||
learning_rate, rank, epoch);
|
||||
}
|
||||
}
|
||||
|
||||
if (update_status[0] == 1) {
|
||||
cerr << "Rank " << rank << ", epoch " << epoch << ", no update for batch" << endl;
|
||||
sumConstraintChangeAbs += abs(update_status[0] - update_status[1]);
|
||||
sumStillViolatedConstraints += update_status[1];
|
||||
|
||||
// pass new weights to decoder
|
||||
if (normaliseWeights) {
|
||||
mosesWeights.L1Normalise();
|
||||
}
|
||||
else if (update_status[0] == -1) {
|
||||
cerr << "Rank " << rank << ", epoch " << epoch << ", update ignored" << endl;
|
||||
|
||||
cumulativeWeights.PlusEquals(mosesWeights);
|
||||
++numberOfUpdates;
|
||||
++numberOfUpdatesThisEpoch;
|
||||
if (averageWeights) {
|
||||
ScoreComponentCollection averageWeights(cumulativeWeights);
|
||||
if (accumulateWeights) {
|
||||
averageWeights.DivideEquals(numberOfUpdates);
|
||||
} else {
|
||||
averageWeights.DivideEquals(numberOfUpdatesThisEpoch);
|
||||
}
|
||||
|
||||
mosesWeights = averageWeights;
|
||||
}
|
||||
else {
|
||||
sumConstraintChangeAbs += abs(update_status[1] - update_status[2]);
|
||||
sumStillViolatedConstraints += update_status[2];
|
||||
|
||||
// pass new weights to decoder
|
||||
if (normaliseWeights) {
|
||||
mosesWeights.L1Normalise();
|
||||
}
|
||||
// set new Moses weights (averaged or not)
|
||||
decoder->setWeights(mosesWeights);
|
||||
|
||||
cumulativeWeights.PlusEquals(mosesWeights);
|
||||
++numberOfUpdates;
|
||||
++numberOfUpdatesThisEpoch;
|
||||
if (averageWeights) {
|
||||
ScoreComponentCollection averageWeights(cumulativeWeights);
|
||||
if (accumulateWeights) {
|
||||
averageWeights.DivideEquals(numberOfUpdates);
|
||||
} else {
|
||||
averageWeights.DivideEquals(numberOfUpdatesThisEpoch);
|
||||
}
|
||||
// compute difference to old weights
|
||||
ScoreComponentCollection weightDifference(mosesWeights);
|
||||
weightDifference.MinusEquals(oldWeights);
|
||||
VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", weight difference: " << weightDifference << endl);
|
||||
|
||||
mosesWeights = averageWeights;
|
||||
cerr << "Rank " << rank << ", epoch " << epoch << ", set new average weights: " << mosesWeights << endl;
|
||||
}
|
||||
else {
|
||||
cerr << "Rank " << rank << ", epoch " << epoch << ", set new weights: " << mosesWeights << endl;
|
||||
}
|
||||
|
||||
// set new Moses weights (averaged or not)
|
||||
decoder->setWeights(mosesWeights);
|
||||
|
||||
// compute difference to old weights
|
||||
ScoreComponentCollection weightDifference(mosesWeights);
|
||||
weightDifference.MinusEquals(oldWeights);
|
||||
cerr << "Rank " << rank << ", epoch " << epoch << ", weight difference: " << weightDifference << endl;
|
||||
|
||||
// get 1best model results with new weights (for each sentence in batch)
|
||||
vector<float> bestModelNew;
|
||||
for (size_t i = 0; i < actualBatchSize; ++i) {
|
||||
string& input = inputSentences[*current_sid_start + i];
|
||||
bestModelNew = decoder->getBleuAndScore(input, *current_sid_start + i, 0.0, bleuScoreWeight, distinctNbest);
|
||||
decoder->cleanup();
|
||||
sumBleuChangeAbs += abs(bestModelOld_batch[i][0] - bestModelNew[0]);
|
||||
cerr << "Rank " << rank << ", epoch " << epoch << ", 1best model bleu, old: " << bestModelOld_batch[i][0] << ", new: " << bestModelNew[0] << endl;
|
||||
cerr << "Rank " << rank << ", epoch " << epoch << ", 1best model score, old: " << bestModelOld_batch[i][1] << ", new: " << bestModelNew[1] << endl;
|
||||
}
|
||||
}
|
||||
/* // get 1best model results with new weights (for each sentence in batch)
|
||||
vector<float> bestModelNew;
|
||||
for (size_t i = 0; i < actualBatchSize; ++i) {
|
||||
string& input = inputSentences[*current_sid_start + i];
|
||||
bestModelNew = decoder->getBleuAndScore(input, *current_sid_start + i, 0.0, bleuScoreWeight, distinctNbest, rank, epoch);
|
||||
decoder->cleanup();
|
||||
sumBleuChangeAbs += abs(bestModelOld_batch[i][0] - bestModelNew[0]);
|
||||
VERBOSE(2, "Rank " << rank << ", epoch " << epoch << ", 1best model bleu, old: " << bestModelOld_batch[i][0] << ", new: " << bestModelNew[0] << endl);
|
||||
VERBOSE(2, "Rank " << rank << ", epoch " << epoch << ", 1best model score, old: " << bestModelOld_batch[i][1] << ", new: " << bestModelNew[1] << endl);
|
||||
}*/
|
||||
|
||||
// update history (for approximate document Bleu)
|
||||
if (sentenceLevelBleu) {
|
||||
for (size_t i = 0; i < oracles.size(); ++i) {
|
||||
cerr << "Rank " << rank << ", epoch " << epoch << ", oracle length: " << oracles[i].size() << " ";
|
||||
decoder->printReferenceLength(ref_ids);
|
||||
VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", oracle length: " << oracles[i].size() << " ");
|
||||
if (verbosity > 0) {
|
||||
decoder->printReferenceLength(ref_ids);
|
||||
}
|
||||
}
|
||||
}
|
||||
else {
|
||||
@ -1058,16 +1014,17 @@ int main(int argc, char** argv) {
|
||||
cerr << "\nMixed average weights during epoch " << epoch << ": " << mixedAverageWeights << endl;
|
||||
}
|
||||
|
||||
cerr << "Dumping mixed average weights during epoch " << epoch << " to " << filename.str() << endl;
|
||||
cerr << "Dumping mixed average weights during epoch " << epoch << " to " << filename.str() << endl << endl;
|
||||
mixedAverageWeights.Save(filename.str());
|
||||
++weightEpochDump;
|
||||
}
|
||||
}// end dumping
|
||||
} // end of shard loop, end of this epoch
|
||||
|
||||
|
||||
cerr << "Bleu feature history after epoch " << epoch << endl;
|
||||
decoder->printBleuFeatureHistory(cerr);
|
||||
if (verbosity > 0) {
|
||||
cerr << "Bleu feature history after epoch " << epoch << endl;
|
||||
decoder->printBleuFeatureHistory(cerr);
|
||||
}
|
||||
|
||||
// Check whether there were any weight updates during this epoch
|
||||
size_t sumUpdates;
|
||||
@ -1094,131 +1051,30 @@ int main(int argc, char** argv) {
|
||||
|
||||
if (epoch > 0) {
|
||||
if ((sumConstraintChangeAbs_lastEpoch == sumConstraintChangeAbs) && (sumStillViolatedConstraints_lastEpoch == sumStillViolatedConstraints)) {
|
||||
cerr << "Rank " << rank << ", epoch " << epoch << ", sum of violated constraints and constraint changes has stayed the same: " << sumStillViolatedConstraints << ", " << sumConstraintChangeAbs << endl;
|
||||
VERBOSE(2, "Rank " << rank << ", epoch " << epoch << ", sum of violated constraints and constraint changes has stayed the same: " << sumStillViolatedConstraints << ", " << sumConstraintChangeAbs << endl);
|
||||
}
|
||||
else {
|
||||
cerr << "Rank " << rank << ", epoch " << epoch << ", sum of violated constraints: " << sumStillViolatedConstraints << ", sum of constraint changes " << sumConstraintChangeAbs << endl;
|
||||
VERBOSE(2, "Rank " << rank << ", epoch " << epoch << ", sum of violated constraints: " << sumStillViolatedConstraints << ", sum of constraint changes " << sumConstraintChangeAbs << endl);
|
||||
}
|
||||
}
|
||||
else {
|
||||
cerr << "Rank " << rank << ", epoch " << epoch << ", sum of violated constraints: " << sumStillViolatedConstraints << endl;
|
||||
VERBOSE(2, "Rank " << rank << ", epoch " << epoch << ", sum of violated constraints: " << sumStillViolatedConstraints << endl);
|
||||
}
|
||||
|
||||
sumConstraintChangeAbs_lastEpoch = sumConstraintChangeAbs;
|
||||
sumStillViolatedConstraints_lastEpoch = sumStillViolatedConstraints;
|
||||
|
||||
if (min_bleu_change > 0) {
|
||||
if (sumBleuChangeAbs < min_bleu_change) {
|
||||
cerr << "Rank " << rank << ", epoch " << epoch << ", sum of BLEU score changes was smaller than " << min_bleu_change << " (" << sumBleuChangeAbs << ")." << endl;
|
||||
stop = true;
|
||||
}
|
||||
else {
|
||||
cerr << "Rank " << rank << ", epoch " << epoch << ", sum of BLEU score changes: " << sumBleuChangeAbs << "." << endl;
|
||||
}
|
||||
}
|
||||
|
||||
if (!stop) {
|
||||
/* if (devBleu) {
|
||||
// calculate bleu score of dev set
|
||||
vector<float> bleuAndRatio = decoder->calculateBleuOfCorpus(allBestModelScore, all_ref_ids, epoch, rank);
|
||||
float bleu = bleuAndRatio[0];
|
||||
float ratio = bleuAndRatio[1];
|
||||
|
||||
for (size_t i = 0; i < allBestModelScore.size(); ++i) {
|
||||
for (size_t j = 0; j < allBestModelScore[i].size(); ++j) {
|
||||
delete allBestModelScore[i][j];
|
||||
}
|
||||
}
|
||||
|
||||
if (rank == 0) {
|
||||
beforePrevAverageBleu = prevAverageBleu;
|
||||
beforePrevAverageApproxBleu = prevAverageApproxBleu;
|
||||
prevAverageBleu = averageBleu;
|
||||
prevAverageApproxBleu = averageApproxBleu;
|
||||
}
|
||||
|
||||
#ifdef MPI_ENABLE
|
||||
// average bleu across processes
|
||||
sendbuf[0] = bleu;
|
||||
recvbuf[0] = 0;
|
||||
MPI_Reduce(sendbuf, recvbuf, 1, MPI_FLOAT, MPI_SUM, 0, world);
|
||||
if (rank == 0) {
|
||||
averageBleu = recvbuf[0];
|
||||
|
||||
// divide by number of processes
|
||||
averageBleu /= size;
|
||||
cerr << "Average Bleu (dev) after epoch " << epoch << ": " << averageBleu << endl;
|
||||
}
|
||||
|
||||
// average ratio across processes
|
||||
sendbuf[0] = ratio;
|
||||
recvbuf[0] = 0;
|
||||
MPI_Reduce(sendbuf, recvbuf, 1, MPI_FLOAT, MPI_SUM, 0, world);
|
||||
if (rank == 0) {
|
||||
averageRatio = recvbuf[0];
|
||||
|
||||
// divide by number of processes
|
||||
averageRatio /= size;
|
||||
cerr << "Average ratio (dev) after epoch " << epoch << ": " << averageRatio << endl;
|
||||
}
|
||||
|
||||
// average approximate sentence bleu across processes
|
||||
sendbuf[0] = summedApproxBleu/numberOfUpdatesThisEpoch;
|
||||
recvbuf[0] = 0;
|
||||
MPI_Reduce(sendbuf, recvbuf, 1, MPI_FLOAT, MPI_SUM, 0, world);
|
||||
if (rank == 0) {
|
||||
averageApproxBleu = recvbuf[0];
|
||||
|
||||
// divide by number of processes
|
||||
averageApproxBleu /= size;
|
||||
cerr << "Average approx. sentence Bleu (dev) after epoch " << epoch << ": " << averageApproxBleu << endl;
|
||||
}
|
||||
#endif
|
||||
#ifndef MPI_ENABLE
|
||||
averageBleu = bleu;
|
||||
cerr << "Average Bleu (dev) after epoch " << epoch << ": " << averageBleu << endl;
|
||||
averageApproxBleu = summedApproxBleu / numberOfUpdatesThisEpoch;
|
||||
cerr << "Average approx. sentence Bleu (dev) after epoch " << epoch << ": " << averageApproxBleu << endl;
|
||||
#endif
|
||||
if (rank == 0) {
|
||||
if (stop_dev_bleu) {
|
||||
if (averageBleu <= prevAverageBleu && prevAverageBleu <= beforePrevAverageBleu) {
|
||||
stop = true;
|
||||
cerr << "Average Bleu (dev) is decreasing or no more increasing.. stop tuning." << endl;
|
||||
ScoreComponentCollection dummy;
|
||||
ostringstream endfilename;
|
||||
endfilename << "stopping";
|
||||
dummy.Save(endfilename.str());
|
||||
}
|
||||
}
|
||||
|
||||
if (stop_approx_dev_bleu) {
|
||||
if (averageApproxBleu <= prevAverageApproxBleu && prevAverageApproxBleu <= beforePrevAverageApproxBleu) {
|
||||
stop = true;
|
||||
cerr << "Average approx. sentence Bleu (dev) is decreasing or no more increasing.. stop tuning." << endl;
|
||||
ScoreComponentCollection dummy;
|
||||
ostringstream endfilename;
|
||||
endfilename << "stopping";
|
||||
dummy.Save(endfilename.str());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef MPI_ENABLE
|
||||
mpi::broadcast(world, stop, 0);
|
||||
#endif
|
||||
} // end if (dev_bleu) */
|
||||
|
||||
// Test if weights have converged
|
||||
if (weightConvergence) {
|
||||
bool reached = true;
|
||||
if (rank == 0 && (epoch >= 2)) {
|
||||
ScoreComponentCollection firstDiff(mixedAverageWeights);
|
||||
firstDiff.MinusEquals(mixedAverageWeightsPrevious);
|
||||
cerr << "Average weight changes since previous epoch: " << firstDiff << endl;
|
||||
VERBOSE(1, "Average weight changes since previous epoch: " << firstDiff << endl);
|
||||
ScoreComponentCollection secondDiff(mixedAverageWeights);
|
||||
secondDiff.MinusEquals(mixedAverageWeightsBeforePrevious);
|
||||
cerr << "Average weight changes since before previous epoch: " << secondDiff << endl << endl;
|
||||
VERBOSE(1, "Average weight changes since before previous epoch: " << secondDiff << endl << endl);
|
||||
|
||||
// check whether stopping criterion has been reached
|
||||
// (both difference vectors must have all weight changes smaller than min_weight_change)
|
||||
@ -1240,7 +1096,7 @@ int main(int argc, char** argv) {
|
||||
if (reached) {
|
||||
// stop MIRA
|
||||
stop = true;
|
||||
cerr << "Stopping criterion has been reached after epoch " << epoch << ".. stopping MIRA." << endl;
|
||||
cerr << "\nWeights have converged after epoch " << epoch << ".. stopping MIRA." << endl;
|
||||
ScoreComponentCollection dummy;
|
||||
ostringstream endfilename;
|
||||
endfilename << "stopping";
|
||||
@ -1255,17 +1111,26 @@ int main(int argc, char** argv) {
|
||||
#endif
|
||||
} //end if (weightConvergence)
|
||||
|
||||
// if using flexible regularization, decrease regularization parameter for next epoch
|
||||
// if using flexible slack, decrease slack parameter for next epoch
|
||||
if (slack_step > 0) {
|
||||
if (slack - slack_step >= slack_min) {
|
||||
if (typeid(*optimiser) == typeid(MiraOptimiser)) {
|
||||
slack -= slack_step;
|
||||
cerr << "Change slack to: " << slack << endl;
|
||||
VERBOSE(1, "Change slack to: " << slack << endl);
|
||||
((MiraOptimiser*) optimiser)->setSlack(slack);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// if using flexible margin slack, increase margin slack parameter by margin_slack_incr for next epoch
|
||||
if (margin_slack_incr > 0.0001) {
|
||||
if (typeid(*optimiser) == typeid(MiraOptimiser)) {
|
||||
margin_slack += margin_slack_incr;
|
||||
VERBOSE(1, "Change margin slack to: " << margin_slack << endl);
|
||||
((MiraOptimiser*) optimiser)->setMarginSlack(margin_slack);
|
||||
}
|
||||
}
|
||||
|
||||
// change learning rate
|
||||
if ((decrease_learning_rate > 0) && (learning_rate - decrease_learning_rate >= min_learning_rate)) {
|
||||
learning_rate -= decrease_learning_rate;
|
||||
@ -1276,20 +1141,7 @@ int main(int argc, char** argv) {
|
||||
mpi::broadcast(world, stop, 0);
|
||||
#endif
|
||||
}
|
||||
cerr << "Change learning rate to " << learning_rate << endl;
|
||||
}
|
||||
|
||||
// change maximum sentence update
|
||||
if ((decrease_sentence_update > 0) && (max_sentence_update - decrease_sentence_update >= min_sentence_update)) {
|
||||
max_sentence_update -= decrease_sentence_update;
|
||||
if (max_sentence_update <= 0.0001) {
|
||||
max_sentence_update = 0;
|
||||
stop = true;
|
||||
#ifdef MPI_ENABLE
|
||||
mpi::broadcast(world, stop, 0);
|
||||
#endif
|
||||
}
|
||||
cerr << "Change maximum sentence update to " << max_sentence_update << endl;
|
||||
VERBOSE(1, "Change learning rate to " << learning_rate << endl);
|
||||
}
|
||||
}
|
||||
} // end of epoch loop
|
||||
@ -1298,11 +1150,8 @@ int main(int argc, char** argv) {
|
||||
MPI_Finalize();
|
||||
#endif
|
||||
|
||||
now = time(0); // get current time
|
||||
tm = localtime(&now); // get struct filled out
|
||||
cerr << "\nEnd date/time: " << tm->tm_mon + 1 << "/" << tm->tm_mday
|
||||
<< "/" << tm->tm_year + 1900 << ", " << tm->tm_hour << ":"
|
||||
<< tm->tm_min << ":" << tm->tm_sec << endl;
|
||||
time(&now);
|
||||
cerr << "Rank " << rank << ", " << ctime(&now);
|
||||
|
||||
delete decoder;
|
||||
exit(0);
|
||||
|
@ -1,5 +1,6 @@
|
||||
#include "Optimiser.h"
|
||||
#include "Hildreth.h"
|
||||
#include "StaticData.h"
|
||||
|
||||
using namespace Moses;
|
||||
using namespace std;
|
||||
@ -14,10 +15,8 @@ vector<int> MiraOptimiser::updateWeights(ScoreComponentCollection& currWeights,
|
||||
const vector<float> oracleBleuScores,
|
||||
const vector<size_t> sentenceIds,
|
||||
float learning_rate,
|
||||
float max_sentence_update,
|
||||
size_t rank,
|
||||
size_t epoch,
|
||||
bool controlUpdates) {
|
||||
size_t epoch) {
|
||||
|
||||
// vector of feature value differences for all created constraints
|
||||
vector<ScoreComponentCollection> featureValueDiffs;
|
||||
@ -40,41 +39,44 @@ vector<int> MiraOptimiser::updateWeights(ScoreComponentCollection& currWeights,
|
||||
ScoreComponentCollection featureValueDiff = oracleFeatureValues[i];
|
||||
featureValueDiff.MinusEquals(featureValues[i][j]);
|
||||
|
||||
cerr << "feature value diff: " << featureValueDiff << endl;
|
||||
cerr << "Rank " << rank << ", epoch " << epoch << ", feature value diff: " << featureValueDiff << endl;
|
||||
if (featureValueDiff.GetL1Norm() == 0) {
|
||||
cerr << "Equal feature values, constraint skipped.." << endl;
|
||||
// skip constraint
|
||||
continue;
|
||||
}
|
||||
|
||||
float loss = losses[i][j];
|
||||
if (m_scale_margin == 1) {
|
||||
loss *= oracleBleuScores[i];
|
||||
cerr << "Scaling margin with oracle bleu score " << oracleBleuScores[i] << endl;
|
||||
VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", scaling margin with oracle bleu score " << oracleBleuScores[i] << endl);
|
||||
}
|
||||
else if (m_scale_margin == 2) {
|
||||
loss *= log2(oracleBleuScores[i]);
|
||||
cerr << "Scaling margin with log2 oracle bleu score " << log2(oracleBleuScores[i]) << endl;
|
||||
VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", scaling margin with log2 oracle bleu score " << log2(oracleBleuScores[i]) << endl);
|
||||
}
|
||||
else if (m_scale_margin == 10) {
|
||||
loss *= log10(oracleBleuScores[i]);
|
||||
cerr << "Scaling margin with log10 oracle bleu score " << log10(oracleBleuScores[i]) << endl;
|
||||
VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", scaling margin with log10 oracle bleu score " << log10(oracleBleuScores[i]) << endl);
|
||||
}
|
||||
|
||||
// check if constraint is violated
|
||||
bool violated = false;
|
||||
bool addConstraint = true;
|
||||
float modelScoreDiff = featureValueDiff.InnerProduct(currWeights);
|
||||
float diff = loss - (modelScoreDiff + m_precision);
|
||||
cerr << "constraint: " << (modelScoreDiff + m_precision) << " >= " << loss << endl;
|
||||
float diff = 0;
|
||||
if (loss > (modelScoreDiff + m_margin_slack)) {
|
||||
diff = loss - (modelScoreDiff + m_margin_slack);
|
||||
}
|
||||
cerr << "Rank " << rank << ", epoch " << epoch << ", constraint: " << modelScoreDiff << " + " << m_margin_slack << " >= " << loss << " (current violation: " << diff << ")" << endl;
|
||||
|
||||
if (diff > epsilon) {
|
||||
violated = true;
|
||||
cerr << "Rank " << rank << ", epoch " << epoch << ", current violation: " << diff << endl;
|
||||
}
|
||||
else if (m_onlyViolatedConstraints) {
|
||||
addConstraint = false;
|
||||
}
|
||||
|
||||
float lossMinusModelScoreDiff = loss - (modelScoreDiff + m_precision);
|
||||
float lossMinusModelScoreDiff = loss - modelScoreDiff;
|
||||
if (addConstraint) {
|
||||
featureValueDiffs.push_back(featureValueDiff);
|
||||
lossMinusModelScoreDiffs.push_back(lossMinusModelScoreDiff);
|
||||
@ -92,8 +94,8 @@ vector<int> MiraOptimiser::updateWeights(ScoreComponentCollection& currWeights,
|
||||
vector<float> alphas;
|
||||
ScoreComponentCollection summedUpdate;
|
||||
if (violatedConstraintsBefore > 0) {
|
||||
cerr << "Rank " << rank << ", epoch " << epoch << ", number of constraints passed to optimizer: " << featureValueDiffs.size() << endl;
|
||||
cerr << "Rank " << rank << ", epoch " << epoch << ", number of violated constraints passed to optimizer: " << violatedConstraintsBefore << endl;
|
||||
cerr << "Rank " << rank << ", epoch " << epoch << ", number of constraints passed to optimizer: " <<
|
||||
featureValueDiffs.size() << " (of which violated: " << violatedConstraintsBefore << ")" << endl;
|
||||
if (m_slack != 0) {
|
||||
alphas = Hildreth::optimise(featureValueDiffs, lossMinusModelScoreDiffs, m_slack);
|
||||
} else {
|
||||
@ -104,7 +106,7 @@ vector<int> MiraOptimiser::updateWeights(ScoreComponentCollection& currWeights,
|
||||
// * w' = w' + SUM alpha_i * (h_i(oracle) - h_i(hypothesis))
|
||||
for (size_t k = 0; k < featureValueDiffs.size(); ++k) {
|
||||
float alpha = alphas[k];
|
||||
cerr << "alpha: " << alpha << endl;
|
||||
VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", alpha: " << alpha << endl);
|
||||
ScoreComponentCollection update(featureValueDiffs[k]);
|
||||
update.MultiplyEquals(alpha);
|
||||
|
||||
@ -113,11 +115,10 @@ vector<int> MiraOptimiser::updateWeights(ScoreComponentCollection& currWeights,
|
||||
}
|
||||
}
|
||||
else {
|
||||
cerr << "Rank " << rank << ", epoch " << epoch << ", check, no constraint violated for this batch" << endl;
|
||||
vector<int> status(3);
|
||||
status[0] = 1;
|
||||
cerr << "Rank " << rank << ", epoch " << epoch << ", check, no constraint violated for this batch" << endl;
|
||||
vector<int> status(2);
|
||||
status[0] = 0;
|
||||
status[1] = 0;
|
||||
status[2] = 0;
|
||||
return status;
|
||||
}
|
||||
|
||||
@ -130,56 +131,37 @@ vector<int> MiraOptimiser::updateWeights(ScoreComponentCollection& currWeights,
|
||||
for (size_t i = 0; i < featureValueDiffs.size(); ++i) {
|
||||
float modelScoreDiff = featureValueDiffs[i].InnerProduct(newWeights);
|
||||
float loss = all_losses[i];
|
||||
float diff = loss - (modelScoreDiff + m_precision);
|
||||
float diff = loss - (modelScoreDiff + m_margin_slack);
|
||||
if (diff > epsilon) {
|
||||
++violatedConstraintsAfter;
|
||||
newDistanceFromOptimum += diff;
|
||||
}
|
||||
}
|
||||
cerr << "Rank " << rank << ", epoch " << epoch << ", check, violated constraint before: " << violatedConstraintsBefore << ", after: " << violatedConstraintsAfter << ", change: " << violatedConstraintsBefore - violatedConstraintsAfter << endl;
|
||||
cerr << "Rank " << rank << ", epoch " << epoch << ", check, error before: " << oldDistanceFromOptimum << ", after: " << newDistanceFromOptimum << ", change: " << oldDistanceFromOptimum - newDistanceFromOptimum << endl;
|
||||
|
||||
if (controlUpdates && violatedConstraintsAfter > 0) {
|
||||
float distanceChange = oldDistanceFromOptimum - newDistanceFromOptimum;
|
||||
if ((violatedConstraintsBefore - violatedConstraintsAfter) <= 0 && distanceChange < 0) {
|
||||
vector<int> statusPlus(3);
|
||||
statusPlus[0] = -1;
|
||||
statusPlus[1] = -1;
|
||||
statusPlus[2] = -1;
|
||||
return statusPlus;
|
||||
}
|
||||
}
|
||||
VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", check, violated constraint before: " << violatedConstraintsBefore << ", after: " << violatedConstraintsAfter << ", change: " << violatedConstraintsBefore - violatedConstraintsAfter << endl);
|
||||
VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", check, error before: " << oldDistanceFromOptimum << ", after: " << newDistanceFromOptimum << ", change: " << oldDistanceFromOptimum - newDistanceFromOptimum << endl);
|
||||
|
||||
// apply learning rate
|
||||
if (learning_rate != 1) {
|
||||
cerr << "Rank " << rank << ", epoch " << epoch << ", update before applying learning rate: " << summedUpdate << endl;
|
||||
VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", update before applying learning rate: " << summedUpdate << endl);
|
||||
summedUpdate.MultiplyEquals(learning_rate);
|
||||
cerr << "Rank " << rank << ", epoch " << epoch << ", update after applying learning rate: " << summedUpdate << endl;
|
||||
}
|
||||
|
||||
// apply threshold scaling
|
||||
if (max_sentence_update != -1) {
|
||||
cerr << "Rank " << rank << ", epoch " << epoch << ", update before scaling to max-sentence-update: " << summedUpdate << endl;
|
||||
summedUpdate.ThresholdScaling(max_sentence_update);
|
||||
cerr << "Rank " << rank << ", epoch " << epoch << ", update after scaling to max-sentence-update: " << summedUpdate << endl;
|
||||
VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", update after applying learning rate: " << summedUpdate << endl);
|
||||
}
|
||||
|
||||
// scale update by BLEU of oracle
|
||||
if (oracleBleuScores.size() == 1 && m_scale_update) {
|
||||
cerr << "Scaling summed update with log10 oracle bleu score " << log10(oracleBleuScores[0]) << endl;
|
||||
VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", scaling summed update with log10 oracle bleu score " << log10(oracleBleuScores[0]) << endl);
|
||||
summedUpdate.MultiplyEquals(log10(oracleBleuScores[0]));
|
||||
}
|
||||
|
||||
// apply update to weight vector
|
||||
cerr << "Rank " << rank << ", epoch " << epoch << ", weights before update: " << currWeights << endl;
|
||||
VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", weights before update: " << currWeights << endl);
|
||||
currWeights.PlusEquals(summedUpdate);
|
||||
cerr << "Rank " << rank << ", epoch " << epoch << ", weights after update: " << currWeights << endl;
|
||||
VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", weights after update: " << currWeights << endl);
|
||||
|
||||
vector<int> statusPlus(3);
|
||||
statusPlus[0] = 0;
|
||||
statusPlus[1] = violatedConstraintsBefore;
|
||||
statusPlus[2] = violatedConstraintsAfter;
|
||||
return statusPlus;
|
||||
vector<int> status(2);
|
||||
status[0] = violatedConstraintsBefore;
|
||||
status[1] = violatedConstraintsAfter;
|
||||
return status;
|
||||
}
|
||||
|
||||
vector<int> MiraOptimiser::updateWeightsHopeFear(Moses::ScoreComponentCollection& currWeights,
|
||||
@ -189,10 +171,8 @@ vector<int> MiraOptimiser::updateWeightsHopeFear(Moses::ScoreComponentCollection
|
||||
const std::vector<std::vector<float> >& bleuScoresFear,
|
||||
const std::vector< size_t> sentenceIds,
|
||||
float learning_rate,
|
||||
float max_sentence_update,
|
||||
size_t rank,
|
||||
size_t epoch,
|
||||
bool controlUpdates) {
|
||||
size_t epoch) {
|
||||
|
||||
// vector of feature value differences for all created constraints
|
||||
vector<ScoreComponentCollection> featureValueDiffs;
|
||||
@ -216,41 +196,44 @@ vector<int> MiraOptimiser::updateWeightsHopeFear(Moses::ScoreComponentCollection
|
||||
for (size_t k = 0; k < featureValuesFear[i].size(); ++k) {
|
||||
ScoreComponentCollection featureValueDiff = featureValuesHope[i][j];
|
||||
featureValueDiff.MinusEquals(featureValuesFear[i][k]);
|
||||
cerr << "feature value diff: " << featureValueDiff << endl;
|
||||
cerr << "Rank " << rank << ", epoch " << epoch << ", feature value diff: " << featureValueDiff << endl;
|
||||
if (featureValueDiff.GetL1Norm() == 0) {
|
||||
cerr << "Equal feature values, constraint skipped.." << endl;
|
||||
// skip constraint
|
||||
continue;
|
||||
}
|
||||
|
||||
float loss = bleuScoresHope[i][j] - bleuScoresFear[i][k];
|
||||
if (m_scale_margin == 1) {
|
||||
loss *= bleuScoresHope[i][j];
|
||||
cerr << "Scaling margin with oracle bleu score " << bleuScoresHope[i][j] << endl;
|
||||
VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", scaling margin with oracle bleu score " << bleuScoresHope[i][j] << endl);
|
||||
}
|
||||
else if (m_scale_margin == 2) {
|
||||
loss *= log2(bleuScoresHope[i][j]);
|
||||
cerr << "Scaling margin with log2 oracle bleu score " << log2(bleuScoresHope[i][j]) << endl;
|
||||
VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", scaling margin with log2 oracle bleu score " << log2(bleuScoresHope[i][j]) << endl);
|
||||
}
|
||||
else if (m_scale_margin == 10) {
|
||||
loss *= log10(bleuScoresHope[i][j]);
|
||||
cerr << "Scaling margin with log10 oracle bleu score " << log10(bleuScoresHope[i][j]) << endl;
|
||||
VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", scaling margin with log10 oracle bleu score " << log10(bleuScoresHope[i][j]) << endl);
|
||||
}
|
||||
|
||||
// check if constraint is violated
|
||||
bool violated = false;
|
||||
bool addConstraint = true;
|
||||
float modelScoreDiff = featureValueDiff.InnerProduct(currWeights);
|
||||
float diff = loss - (modelScoreDiff + m_precision);
|
||||
cerr << "constraint: " << (modelScoreDiff + m_precision) << " >= " << loss << endl;
|
||||
float diff = 0;
|
||||
if (loss > (modelScoreDiff + m_margin_slack)) {
|
||||
diff = loss - (modelScoreDiff + m_margin_slack);
|
||||
}
|
||||
cerr << "Rank " << rank << ", epoch " << epoch << ", constraint: " << modelScoreDiff << " + " << m_margin_slack << " >= " << loss << " (current violation: " << diff << ")" << endl;
|
||||
|
||||
if (diff > epsilon) {
|
||||
violated = true;
|
||||
cerr << "Rank " << rank << ", epoch " << epoch << ", current violation: " << diff << endl;
|
||||
}
|
||||
else if (m_onlyViolatedConstraints) {
|
||||
addConstraint = false;
|
||||
}
|
||||
|
||||
float lossMinusModelScoreDiff = loss - (modelScoreDiff + m_precision);
|
||||
float lossMinusModelScoreDiff = loss - modelScoreDiff;
|
||||
if (addConstraint) {
|
||||
featureValueDiffs.push_back(featureValueDiff);
|
||||
lossMinusModelScoreDiffs.push_back(lossMinusModelScoreDiff);
|
||||
@ -269,8 +252,8 @@ vector<int> MiraOptimiser::updateWeightsHopeFear(Moses::ScoreComponentCollection
|
||||
vector<float> alphas;
|
||||
ScoreComponentCollection summedUpdate;
|
||||
if (violatedConstraintsBefore > 0) {
|
||||
cerr << "Rank " << rank << ", epoch " << epoch << ", number of constraints passed to optimizer: " << featureValueDiffs.size() << endl;
|
||||
cerr << "Rank " << rank << ", epoch " << epoch << ", number of violated constraints passed to optimizer: " << violatedConstraintsBefore << endl;
|
||||
cerr << "Rank " << rank << ", epoch " << epoch << ", number of constraints passed to optimizer: " <<
|
||||
featureValueDiffs.size() << " (of which violated: " << violatedConstraintsBefore << ")" << endl;
|
||||
if (m_slack != 0) {
|
||||
alphas = Hildreth::optimise(featureValueDiffs, lossMinusModelScoreDiffs, m_slack);
|
||||
} else {
|
||||
@ -281,17 +264,17 @@ vector<int> MiraOptimiser::updateWeightsHopeFear(Moses::ScoreComponentCollection
|
||||
// * w' = w' + SUM alpha_i * (h_i(oracle) - h_i(hypothesis))
|
||||
for (size_t k = 0; k < featureValueDiffs.size(); ++k) {
|
||||
float alpha = alphas[k];
|
||||
cerr << "alpha: " << alpha << endl;
|
||||
VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", alpha: " << alpha << endl);
|
||||
ScoreComponentCollection update(featureValueDiffs[k]);
|
||||
update.MultiplyEquals(alpha);
|
||||
|
||||
// scale update by BLEU of hope translation (only two cases defined at the moment)
|
||||
if (featureValuesHope.size() == 1 && m_scale_update) { // only defined for batch size 1)
|
||||
if (featureValuesHope[0].size() == 1) {
|
||||
cerr << "Scaling update with log10 oracle bleu score " << log10(bleuScoresHope[0][0]) << endl; // only 1 oracle
|
||||
VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", scaling update with log10 oracle bleu score " << log10(bleuScoresHope[0][0]) << endl); // only 1 oracle
|
||||
update.MultiplyEquals(log10(bleuScoresHope[0][0]));
|
||||
} else if (featureValuesFear[0].size() == 1) {
|
||||
cerr << "Scaling update with log10 oracle bleu score " << log10(bleuScoresHope[0][k]) << endl; // k oracles
|
||||
VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", scaling update with log10 oracle bleu score " << log10(bleuScoresHope[0][k]) << endl); // k oracles
|
||||
update.MultiplyEquals(log10(bleuScoresHope[0][k]));
|
||||
}
|
||||
}
|
||||
@ -301,11 +284,10 @@ vector<int> MiraOptimiser::updateWeightsHopeFear(Moses::ScoreComponentCollection
|
||||
}
|
||||
}
|
||||
else {
|
||||
cerr << "Rank " << rank << ", epoch " << epoch << ", check, no constraint violated for this batch" << endl;
|
||||
vector<int> status(3);
|
||||
status[0] = 1;
|
||||
cerr << "Rank " << rank << ", epoch " << epoch << ", check, no constraint violated for this batch" << endl;
|
||||
vector<int> status(2);
|
||||
status[0] = 0;
|
||||
status[1] = 0;
|
||||
status[2] = 0;
|
||||
return status;
|
||||
}
|
||||
|
||||
@ -318,49 +300,30 @@ vector<int> MiraOptimiser::updateWeightsHopeFear(Moses::ScoreComponentCollection
|
||||
for (size_t i = 0; i < featureValueDiffs.size(); ++i) {
|
||||
float modelScoreDiff = featureValueDiffs[i].InnerProduct(newWeights);
|
||||
float loss = all_losses[i];
|
||||
float diff = loss - (modelScoreDiff + m_precision);
|
||||
float diff = loss - (modelScoreDiff + m_margin_slack);
|
||||
if (diff > epsilon) {
|
||||
++violatedConstraintsAfter;
|
||||
newDistanceFromOptimum += diff;
|
||||
}
|
||||
}
|
||||
cerr << "Rank " << rank << ", epoch " << epoch << ", check, violated constraint before: " << violatedConstraintsBefore << ", after: " << violatedConstraintsAfter << ", change: " << violatedConstraintsBefore - violatedConstraintsAfter << endl;
|
||||
cerr << "Rank " << rank << ", epoch " << epoch << ", check, error before: " << oldDistanceFromOptimum << ", after: " << newDistanceFromOptimum << ", change: " << oldDistanceFromOptimum - newDistanceFromOptimum << endl;
|
||||
|
||||
if (controlUpdates && violatedConstraintsAfter > 0) {
|
||||
float distanceChange = oldDistanceFromOptimum - newDistanceFromOptimum;
|
||||
if ((violatedConstraintsBefore - violatedConstraintsAfter) <= 0 && distanceChange < 0) {
|
||||
vector<int> statusPlus(3);
|
||||
statusPlus[0] = -1;
|
||||
statusPlus[1] = -1;
|
||||
statusPlus[2] = -1;
|
||||
return statusPlus;
|
||||
}
|
||||
}
|
||||
VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", check, violated constraint before: " << violatedConstraintsBefore << ", after: " << violatedConstraintsAfter << ", change: " << violatedConstraintsBefore - violatedConstraintsAfter << endl);
|
||||
VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", check, error before: " << oldDistanceFromOptimum << ", after: " << newDistanceFromOptimum << ", change: " << oldDistanceFromOptimum - newDistanceFromOptimum << endl);
|
||||
|
||||
// Apply learning rate (fixed or flexible)
|
||||
if (learning_rate != 1) {
|
||||
cerr << "Rank " << rank << ", epoch " << epoch << ", update before applying learning rate: " << summedUpdate << endl;
|
||||
VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", update before applying learning rate: " << summedUpdate << endl);
|
||||
summedUpdate.MultiplyEquals(learning_rate);
|
||||
cerr << "Rank " << rank << ", epoch " << epoch << ", update after applying learning rate: " << summedUpdate << endl;
|
||||
}
|
||||
|
||||
// Apply threshold scaling
|
||||
if (max_sentence_update != -1) {
|
||||
cerr << "Rank " << rank << ", epoch " << epoch << ", update before scaling to max-sentence-update: " << summedUpdate << endl;
|
||||
summedUpdate.ThresholdScaling(max_sentence_update);
|
||||
cerr << "Rank " << rank << ", epoch " << epoch << ", update after scaling to max-sentence-update: " << summedUpdate << endl;
|
||||
VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", update after applying learning rate: " << summedUpdate << endl);
|
||||
}
|
||||
|
||||
// apply update to weight vector
|
||||
cerr << "Rank " << rank << ", epoch " << epoch << ", weights before update: " << currWeights << endl;
|
||||
VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", weights before update: " << currWeights << endl);
|
||||
currWeights.PlusEquals(summedUpdate);
|
||||
cerr << "Rank " << rank << ", epoch " << epoch << ", weights after update: " << currWeights << endl;
|
||||
VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", weights after update: " << currWeights << endl);
|
||||
|
||||
vector<int> statusPlus(3);
|
||||
statusPlus[0] = 0;
|
||||
statusPlus[1] = violatedConstraintsBefore;
|
||||
statusPlus[2] = violatedConstraintsAfter;
|
||||
vector<int> statusPlus(2);
|
||||
statusPlus[0] = violatedConstraintsBefore;
|
||||
statusPlus[1] = violatedConstraintsAfter;
|
||||
return statusPlus;
|
||||
}
|
||||
|
||||
@ -371,26 +334,27 @@ vector<int> MiraOptimiser::updateWeightsAnalytically(ScoreComponentCollection& c
|
||||
float bleuScoreFear,
|
||||
size_t sentenceId,
|
||||
float learning_rate,
|
||||
float max_sentence_update,
|
||||
size_t rank,
|
||||
size_t epoch,
|
||||
bool controlUpdates) {
|
||||
size_t epoch) {
|
||||
|
||||
float epsilon = 0.0001;
|
||||
float oldDistanceFromOptimum = 0;
|
||||
bool constraintViolatedBefore = false;
|
||||
ScoreComponentCollection weightUpdate;
|
||||
|
||||
cerr << "hope: " << featureValuesHope << endl;
|
||||
cerr << "fear: " << featureValuesFear << endl;
|
||||
// cerr << "Rank " << rank << ", epoch " << epoch << ", hope: " << featureValuesHope << endl;
|
||||
// cerr << "Rank " << rank << ", epoch " << epoch << ", fear: " << featureValuesFear << endl;
|
||||
ScoreComponentCollection featureValueDiff = featureValuesHope;
|
||||
featureValueDiff.MinusEquals(featureValuesFear);
|
||||
cerr << "hope - fear: " << featureValueDiff << endl;
|
||||
cerr << "Rank " << rank << ", epoch " << epoch << ", hope - fear: " << featureValueDiff << endl;
|
||||
float modelScoreDiff = featureValueDiff.InnerProduct(currWeights);
|
||||
float loss = bleuScoreHope - bleuScoreFear;
|
||||
float diff = loss - (modelScoreDiff + m_precision);
|
||||
// approximate comparison between floats
|
||||
cerr << "constraint: " << (modelScoreDiff + m_precision) << " >= " << loss << endl;
|
||||
float diff = 0;
|
||||
if (loss > (modelScoreDiff + m_margin_slack)) {
|
||||
diff = loss - (modelScoreDiff + m_margin_slack);
|
||||
}
|
||||
cerr << "Rank " << rank << ", epoch " << epoch << ", constraint: " << modelScoreDiff << " + " << m_margin_slack << " >= " << loss << " (current violation: " << diff << ")" << endl;
|
||||
|
||||
if (diff > epsilon) {
|
||||
// constraint violated
|
||||
oldDistanceFromOptimum += diff;
|
||||
@ -417,17 +381,16 @@ vector<int> MiraOptimiser::updateWeightsAnalytically(ScoreComponentCollection& c
|
||||
weightUpdate.PlusEquals(featureValueDiff);
|
||||
}
|
||||
else {
|
||||
cerr << "Rank " << rank << ", epoch " << epoch << ", no update because squared norm is 0, can only happen if oracle == hypothesis, are bleu scores equal as well?" << endl;
|
||||
VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", no update because squared norm is 0" << endl);
|
||||
}
|
||||
}
|
||||
|
||||
if (!constraintViolatedBefore) {
|
||||
// constraint satisfied, nothing to do
|
||||
cerr << "Rank " << rank << ", epoch " << epoch << ", check, constraint already satisfied" << endl;
|
||||
vector<int> status(3);
|
||||
status[0] = 1;
|
||||
cerr << "Rank " << rank << ", epoch " << epoch << ", constraint already satisfied" << endl;
|
||||
vector<int> status(2);
|
||||
status[0] = 0;
|
||||
status[1] = 0;
|
||||
status[2] = 0;
|
||||
return status;
|
||||
}
|
||||
|
||||
@ -439,35 +402,25 @@ vector<int> MiraOptimiser::updateWeightsAnalytically(ScoreComponentCollection& c
|
||||
featureValueDiff = featureValuesHope;
|
||||
featureValueDiff.MinusEquals(featureValuesFear);
|
||||
modelScoreDiff = featureValueDiff.InnerProduct(newWeights);
|
||||
diff = loss - (modelScoreDiff + m_precision);
|
||||
diff = loss - (modelScoreDiff + m_margin_slack);
|
||||
// approximate comparison between floats!
|
||||
if (diff > epsilon) {
|
||||
constraintViolatedAfter = true;
|
||||
newDistanceFromOptimum += (loss - (modelScoreDiff + m_precision));
|
||||
newDistanceFromOptimum += (loss - modelScoreDiff);
|
||||
}
|
||||
|
||||
cerr << "Rank " << rank << ", epoch " << epoch << ", check, constraint violated before? " << constraintViolatedBefore << ", after? " << constraintViolatedAfter << endl;
|
||||
cerr << "Rank " << rank << ", epoch " << epoch << ", check, error before: " << oldDistanceFromOptimum << ", after: " << newDistanceFromOptimum << ", change: " << oldDistanceFromOptimum - newDistanceFromOptimum << endl;
|
||||
|
||||
float distanceChange = oldDistanceFromOptimum - newDistanceFromOptimum;
|
||||
if (controlUpdates && constraintViolatedAfter && distanceChange < 0) {
|
||||
vector<int> statusPlus(3);
|
||||
statusPlus[0] = -1;
|
||||
statusPlus[1] = 1;
|
||||
statusPlus[2] = 1;
|
||||
return statusPlus;
|
||||
}
|
||||
VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", check, constraint violated before? " << constraintViolatedBefore << ", after? " << constraintViolatedAfter << endl);
|
||||
VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", check, error before: " << oldDistanceFromOptimum << ", after: " << newDistanceFromOptimum << ", change: " << oldDistanceFromOptimum - newDistanceFromOptimum << endl);
|
||||
|
||||
// apply update to weight vector
|
||||
cerr << "Rank " << rank << ", weights before update: " << currWeights << endl;
|
||||
VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", weights before update: " << currWeights << endl);
|
||||
currWeights.PlusEquals(weightUpdate);
|
||||
cerr << "Rank " << rank << ", weights after update: " << currWeights << endl;
|
||||
VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", weights after update: " << currWeights << endl);
|
||||
|
||||
vector<int> statusPlus(3);
|
||||
statusPlus[0] = 0;
|
||||
statusPlus[1] = 1;
|
||||
statusPlus[2] = constraintViolatedAfter ? 1 : 0;
|
||||
return statusPlus;
|
||||
vector<int> status(2);
|
||||
status[0] = 1;
|
||||
status[1] = constraintViolatedAfter ? 1 : 0;
|
||||
return status;
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -36,11 +36,9 @@ namespace Mira {
|
||||
const std::vector<std::vector<float> >& bleuScoresHope,
|
||||
const std::vector<std::vector<float> >& bleuScoresFear,
|
||||
const std::vector< size_t> sentenceIds,
|
||||
float learning_rate,
|
||||
float max_sentence_update,
|
||||
size_t rank,
|
||||
size_t epoch,
|
||||
bool controlUpdates) = 0;
|
||||
float learning_rate,
|
||||
size_t rank,
|
||||
size_t epoch) = 0;
|
||||
};
|
||||
|
||||
class Perceptron : public Optimiser {
|
||||
@ -52,10 +50,8 @@ namespace Mira {
|
||||
const std::vector<std::vector<float> >& bleuScoresFear,
|
||||
const std::vector< size_t> sentenceIds,
|
||||
float learning_rate,
|
||||
float max_sentence_update,
|
||||
size_t rank,
|
||||
size_t epoch,
|
||||
bool controlUpdates);
|
||||
size_t rank,
|
||||
size_t epoch);
|
||||
};
|
||||
|
||||
class MiraOptimiser : public Optimiser {
|
||||
@ -63,13 +59,13 @@ namespace Mira {
|
||||
// Default constructor: only invokes the Optimiser base constructor; no data
// members are set in this initialiser list.
MiraOptimiser() :
|
||||
Optimiser() { }
|
||||
|
||||
MiraOptimiser(bool onlyViolatedConstraints, float slack, size_t scale_margin, bool scale_update, float precision) :
|
||||
MiraOptimiser(bool onlyViolatedConstraints, float slack, size_t scale_margin, bool scale_update, float margin_slack) :
|
||||
Optimiser(),
|
||||
m_onlyViolatedConstraints(onlyViolatedConstraints),
|
||||
m_slack(slack),
|
||||
m_scale_margin(scale_margin),
|
||||
m_scale_update(scale_update),
|
||||
m_precision(precision) { }
|
||||
m_margin_slack(margin_slack) { }
|
||||
|
||||
std::vector<int> updateWeightsAnalytically(Moses::ScoreComponentCollection& currWeights,
|
||||
Moses::ScoreComponentCollection& featureValuesHope,
|
||||
@ -78,10 +74,8 @@ namespace Mira {
|
||||
float bleuScoresFear,
|
||||
size_t sentenceId,
|
||||
float learning_rate,
|
||||
float max_sentence_update,
|
||||
size_t rank,
|
||||
size_t epoch,
|
||||
bool controlUpdates);
|
||||
size_t epoch);
|
||||
std::vector<int> updateWeights(Moses::ScoreComponentCollection& currWeights,
|
||||
const std::vector<std::vector<Moses::ScoreComponentCollection> >& featureValues,
|
||||
const std::vector<std::vector<float> >& losses,
|
||||
@ -89,27 +83,27 @@ namespace Mira {
|
||||
const std::vector< Moses::ScoreComponentCollection>& oracleFeatureValues,
|
||||
const std::vector< float> oracleBleuScores,
|
||||
const std::vector< size_t> sentenceIds,
|
||||
float learning_rate,
|
||||
float max_sentence_update,
|
||||
size_t rank,
|
||||
size_t epoch,
|
||||
bool controlUpdates);
|
||||
float learning_rate,
|
||||
size_t rank,
|
||||
size_t epoch);
|
||||
virtual std::vector<int> updateWeightsHopeFear(Moses::ScoreComponentCollection& currWeights,
|
||||
const std::vector<std::vector<Moses::ScoreComponentCollection> >& featureValuesHope,
|
||||
const std::vector<std::vector<Moses::ScoreComponentCollection> >& featureValuesFear,
|
||||
const std::vector<std::vector<float> >& bleuScoresHope,
|
||||
const std::vector<std::vector<float> >& bleuScoresFear,
|
||||
const std::vector< size_t> sentenceIds,
|
||||
float learning_rate,
|
||||
float max_sentence_update,
|
||||
size_t rank,
|
||||
size_t epoch,
|
||||
bool controlUpdates);
|
||||
float learning_rate,
|
||||
size_t rank,
|
||||
size_t epoch);
|
||||
|
||||
// Overwrites m_slack with the given value. In the visible optimiser code a
// non-zero m_slack is forwarded as the third argument to Hildreth::optimise.
void setSlack(float slack) {
|
||||
m_slack = slack;
|
||||
}
|
||||
|
||||
// Overwrites m_margin_slack with the given value. The update routines add
// m_margin_slack to the model score difference before comparing it with the
// loss, i.e. a constraint's violation is loss - (modelScoreDiff + m_margin_slack).
void setMarginSlack(float margin_slack) {
|
||||
m_margin_slack = margin_slack;
|
||||
}
|
||||
|
||||
private:
|
||||
|
||||
// add only violated constraints to the optimisation problem
|
||||
@ -123,7 +117,7 @@ namespace Mira {
|
||||
// scale update with log 10 of oracle BLEU score
|
||||
bool m_scale_update;
|
||||
|
||||
float m_precision;
|
||||
float m_margin_slack;
|
||||
};
|
||||
}
|
||||
|
||||
|
@ -31,10 +31,8 @@ vector<int> Perceptron::updateWeightsHopeFear(ScoreComponentCollection& currWeig
|
||||
const vector< vector<float> >& dummy2,
|
||||
const vector< size_t> dummy3,
|
||||
float perceptron_learning_rate,
|
||||
float dummy4,
|
||||
size_t rank,
|
||||
size_t epoch,
|
||||
bool dummy5)
|
||||
size_t epoch)
|
||||
{
|
||||
cerr << "hope: " << featureValuesHope[0][0] << endl;
|
||||
cerr << "fear: " << featureValuesFear[0][0] << endl;
|
||||
|
Loading…
Reference in New Issue
Block a user