Remove caching of WP weight and translation weights; clean up Mira code

Author: Eva Hasler 2012-04-29 05:37:48 +01:00
parent f09c962f76
commit ef552fe91a
22 changed files with 335 additions and 319 deletions

View File

@ -369,12 +369,10 @@ namespace Mira {
void MosesDecoder::setBleuParameters(bool sentenceBleu, bool scaleByInputLength, bool scaleByAvgInputLength,
bool scaleByInverseLength, bool scaleByAvgInverseLength,
float scaleByX, float historySmoothing, size_t scheme, float relax_BP,
bool useSourceLengthHistory) {
float scaleByX, float historySmoothing, size_t scheme) {
m_bleuScoreFeature->SetBleuParameters(sentenceBleu, scaleByInputLength, scaleByAvgInputLength,
scaleByInverseLength, scaleByAvgInverseLength,
scaleByX, historySmoothing, scheme, relax_BP,
useSourceLengthHistory);
scaleByX, historySmoothing, scheme);
}
}

View File

@ -110,8 +110,7 @@ class MosesDecoder {
size_t getShortestReferenceIndex(size_t ref_id);
void setBleuParameters(bool sentenceBleu, bool scaleByInputLength, bool scaleByAvgInputLength,
bool scaleByInverseLength, bool scaleByAvgInverseLength,
float scaleByX, float historySmoothing, size_t scheme, float relax_BP,
bool useSourceLengthHistory);
float scaleByX, float historySmoothing, size_t scheme);
void setAvgInputLength (float l) { m_bleuScoreFeature->SetAvgInputLength(l); }
Moses::ScoreComponentCollection getWeights();
void setWeights(const Moses::ScoreComponentCollection& weights);

View File

@ -41,6 +41,7 @@ namespace mpi = boost::mpi;
#include "Optimiser.h"
#include "Hildreth.h"
#include "ThreadPool.h"
#include "DummyScoreProducers.h"
using namespace Mira;
using namespace std;
@ -70,21 +71,17 @@ int main(int argc, char** argv) {
size_t mixingFrequency;
size_t weightDumpFrequency;
string weightDumpStem;
float min_learning_rate;
bool scale_margin, scale_margin_precision;
bool scale_update, scale_update_precision;
size_t n;
size_t batchSize;
bool distinctNbest;
bool onlyViolatedConstraints;
bool accumulateWeights;
float historySmoothing;
bool scaleByInputLength, scaleByAvgInputLength;
bool scaleByInverseLength, scaleByAvgInverseLength;
float scaleByX;
float slack, dummy;
float slack_step;
float slack_min;
float slack;
bool averageWeights;
bool weightConvergence;
float learning_rate;
@ -94,75 +91,67 @@ int main(int argc, char** argv) {
size_t baseOfLog;
string decoder_settings;
float min_weight_change;
float decrease_learning_rate;
bool normaliseWeights, normaliseMargin;
bool print_feature_values;
bool historyOf1best;
bool historyOfOracles;
bool historyBleu ;
bool sentenceLevelBleu;
float bleuWeight, bleuWeight_hope, bleuWeight_fear;
float margin_slack;
float margin_slack_incr;
bool perceptron_update;
bool hope_fear, hope_fear_rank, hope_model;
bool model_hope_fear, rank_only;
int hope_n, fear_n, rank_n;
int threadcount;
size_t adapt_after_epoch;
size_t bleu_smoothing_scheme;
float max_length_dev_all;
float max_length_dev_hypos;
float max_length_dev_hope_ref;
float max_length_dev_fear_ref;
float relax_BP;
float min_oracle_bleu;
float minBleuRatio, maxBleuRatio;
bool boost;
bool decode_hope, decode_fear, decode_model;
string decode_filename;
size_t update_scheme;
bool separateUpdates, batchEqualsShard;
bool batchEqualsShard;
bool sparseAverage, dumpMixedWeights, sparseNoAverage;
bool useSourceLengthHistory;
int featureCutoff;
bool pruneZeroWeights;
bool megam;
bool printFeatureCounts, printNbestWithFeatures;
bool avgRefLength;
bool print_weights;
bool print_weights, print_core_weights, clear_static, debug_model, scale_lm, bleu_weight_lm, bleu_weight_lm_adjust, scale_wp;
float scale_lm_factor, bleu_weight_lm_factor, scale_wp_factor;
po::options_description desc("Allowed options");
desc.add_options()
("slack", po::value<float>(&slack)->default_value(0.01), "Use slack in optimiser")
("dummy", po::value<float>(&dummy)->default_value(-1), "Dummy variable for slack")
("accumulate-weights", po::value<bool>(&accumulateWeights)->default_value(false), "Accumulate and average weights over all epochs")
("adapt-after-epoch", po::value<size_t>(&adapt_after_epoch)->default_value(0), "Index of epoch after which adaptive parameters will be adapted")
("average-weights", po::value<bool>(&averageWeights)->default_value(false), "Set decoder weights to average weights after each update")
("avg-ref-length", po::value<bool>(&avgRefLength)->default_value(false), "Use average reference length instead of shortest for BLEU score feature")
("base-of-log", po::value<size_t>(&baseOfLog)->default_value(10), "Base for taking logs of feature values")
("batch-equals-shard", po::value<bool>(&batchEqualsShard)->default_value(false), "Batch size is equal to shard size (purely batch)")
("batch-size,b", po::value<size_t>(&batchSize)->default_value(1), "Size of batch that is send to optimiser for weight adjustments")
("bleu-score-weight", po::value<float>(&bleuWeight)->default_value(1.0), "Bleu score weight used in the decoder objective function (on top of the Bleu objective weight)")
("bleu-score-weight-hope", po::value<float>(&bleuWeight_hope)->default_value(-1), "Bleu score weight used in the decoder objective function for hope translations")
("bleu-score-weight-fear", po::value<float>(&bleuWeight_fear)->default_value(-1), "Bleu score weight used in the decoder objective function for fear translations")
("bleu-weight", po::value<float>(&bleuWeight)->default_value(1.0), "Bleu score weight used in the decoder objective function (on top of the Bleu objective weight)")
("bleu-weight-hope", po::value<float>(&bleuWeight_hope)->default_value(-1), "Bleu score weight used in the decoder objective function for hope translations")
("bleu-weight-fear", po::value<float>(&bleuWeight_fear)->default_value(-1), "Bleu score weight used in the decoder objective function for fear translations")
("bleu-weight-lm", po::value<bool>(&bleu_weight_lm)->default_value(false), "Make bleu weight depend on lm weight")
("bleu-weight-factor-lm", po::value<float>(&bleu_weight_lm_factor)->default_value(2.0), "Make bleu weight depend on lm weight by this factor")
("bleu-weight-adjust-lm", po::value<bool>(&bleu_weight_lm_adjust)->default_value(false), "Adjust bleu weight when lm weight changes")
("bleu-smoothing-scheme", po::value<size_t>(&bleu_smoothing_scheme)->default_value(1), "Set a smoothing scheme for sentence-Bleu: +1 (1), +0.1 (2), papineni (3) (default:1)")
("boost", po::value<bool>(&boost)->default_value(false), "Apply boosting factor to updates on misranked candidates")
("clear-static", po::value<bool>(&clear_static)->default_value(false), "Clear static data before every translation")
("config,f", po::value<string>(&mosesConfigFile), "Moses ini-file")
("configs-folds", po::value<vector<string> >(&mosesConfigFilesFolds), "Moses ini-files, one for each fold")
("core-weights", po::value<string>(&coreWeightFile), "Weight file containing the core weights (already tuned, have to be non-zero)")
("debug-model", po::value<bool>(&debug_model)->default_value(false), "Get best model translation for debugging purposes")
("decode-hope", po::value<bool>(&decode_hope)->default_value(false), "Decode dev input set according to hope objective")
("decode-fear", po::value<bool>(&decode_fear)->default_value(false), "Decode dev input set according to fear objective")
("decode-model", po::value<bool>(&decode_model)->default_value(false), "Decode dev input set according to normal objective")
("decode-filename", po::value<string>(&decode_filename), "Filename for Bleu objective translations")
("decoder-settings", po::value<string>(&decoder_settings)->default_value(""), "Decoder settings for tuning runs")
("decr-learning-rate", po::value<float>(&decrease_learning_rate)->default_value(0),"Decrease learning rate by the given value after every epoch")
("distinct-nbest", po::value<bool>(&distinctNbest)->default_value(true), "Use n-best list with distinct translations in inference step")
("dump-mixed-weights", po::value<bool>(&dumpMixedWeights)->default_value(false), "Dump mixed weights instead of averaged weights")
("epochs,e", po::value<size_t>(&epochs)->default_value(10), "Number of epochs")
("feature-cutoff", po::value<int>(&featureCutoff)->default_value(-1), "Feature cutoff as additional regularization for sparse features")
("fear-n", po::value<int>(&fear_n)->default_value(-1), "Number of fear translations used")
("help", po::value(&help)->zero_tokens()->default_value(false), "Print this help message and exit")
("history-of-1best", po::value<bool>(&historyOf1best)->default_value(false), "Use 1best translations to update the history")
("history-of-oracles", po::value<bool>(&historyOfOracles)->default_value(false), "Use oracle translations to update the history")
("history-of-1best", po::value<bool>(&historyBleu)->default_value(false), "Use 1best translations to update the history")
("history-smoothing", po::value<float>(&historySmoothing)->default_value(0.9), "Adjust the factor for history smoothing")
("hope-fear", po::value<bool>(&hope_fear)->default_value(true), "Use only hope and fear translations for optimisation (not model)")
("hope-fear-rank", po::value<bool>(&hope_fear_rank)->default_value(false), "Use hope and fear translations for optimisation, use model for ranking")
@ -172,56 +161,51 @@ int main(int argc, char** argv) {
("input-files-folds", po::value<vector<string> >(&inputFilesFolds), "Input files containing tokenised source, one for each fold")
("learner,l", po::value<string>(&learner)->default_value("mira"), "Learning algorithm")
("log-feature-values", po::value<bool>(&logFeatureValues)->default_value(false), "Take log of feature values according to the given base.")
("margin-incr", po::value<float>(&margin_slack_incr)->default_value(0), "Increment margin slack after every epoch by this amount")
("margin-slack", po::value<float>(&margin_slack)->default_value(0), "Slack when comparing left and right hand side of constraints")
("max-length-dev-all", po::value<float>(&max_length_dev_all)->default_value(-1), "Make use of all 3 following options")
("max-length-dev-hypos", po::value<float>(&max_length_dev_hypos)->default_value(-1), "Number between 0 and 1 specifying the percentage of admissible length deviation between hope and fear translations")
("max-length-dev-hope-ref", po::value<float>(&max_length_dev_hope_ref)->default_value(-1), "Number between 0 and 1 specifying the percentage of admissible length deviation between hope and reference translations")
("max-length-dev-fear-ref", po::value<float>(&max_length_dev_fear_ref)->default_value(-1), "Number between 0 and 1 specifying the percentage of admissible length deviation between fear and reference translations")
("min-bleu-ratio", po::value<float>(&minBleuRatio)->default_value(-1), "Set a minimum BLEU ratio between hope and fear")
("max-bleu-ratio", po::value<float>(&maxBleuRatio)->default_value(-1), "Set a maximum BLEU ratio between hope and fear")
("megam", po::value<bool>(&megam)->default_value(false), "Use megam for optimization step")
("min-learning-rate", po::value<float>(&min_learning_rate)->default_value(0), "Set a minimum learning rate")
("megam", po::value<bool>(&megam)->default_value(false), "Use megam for optimization step")
("min-oracle-bleu", po::value<float>(&min_oracle_bleu)->default_value(0), "Set a minimum oracle BLEU score")
("min-weight-change", po::value<float>(&min_weight_change)->default_value(0.01), "Set minimum weight change for stopping criterion")
("mira-learning-rate", po::value<float>(&mira_learning_rate)->default_value(1), "Learning rate for MIRA (fixed or flexible)")
("mixing-frequency", po::value<size_t>(&mixingFrequency)->default_value(1), "How often per epoch to mix weights, when using mpi")
("model-hope-fear", po::value<bool>(&model_hope_fear)->default_value(false), "Use model, hope and fear translations for optimisation")
("nbest,n", po::value<size_t>(&n)->default_value(1), "Number of translations in n-best list")
("normalise-weights", po::value<bool>(&normaliseWeights)->default_value(true), "Whether to normalise the updated weights before passing them to the decoder")
("normalise-weights", po::value<bool>(&normaliseWeights)->default_value(false), "Whether to normalise the updated weights before passing them to the decoder")
("normalise-margin", po::value<bool>(&normaliseMargin)->default_value(false), "Normalise the margin: squash between 0 and 1")
("only-violated-constraints", po::value<bool>(&onlyViolatedConstraints)->default_value(false), "Add only violated constraints to the optimisation problem")
("perceptron-learning-rate", po::value<float>(&perceptron_learning_rate)->default_value(0.01), "Perceptron learning rate")
("print-feature-values", po::value<bool>(&print_feature_values)->default_value(false), "Print out feature values")
("print-feature-counts", po::value<bool>(&printFeatureCounts)->default_value(false), "Print out feature values, print feature list with hope counts after 1st epoch")
("print-nbest-with-features", po::value<bool>(&printNbestWithFeatures)->default_value(false), "Print out feature values, print feature list with hope counts after 1st epoch")
("print-weights", po::value<bool>(&print_weights)->default_value(false), "Print out new weights")
("print-weights", po::value<bool>(&print_weights)->default_value(false), "Print out current weights")
("print-core-weights", po::value<bool>(&print_core_weights)->default_value(false), "Print out current core weights")
("prune-zero-weights", po::value<bool>(&pruneZeroWeights)->default_value(false), "Prune zero-valued sparse feature weights")
("rank-n", po::value<int>(&rank_n)->default_value(-1), "Number of translations used for ranking")
("rank-only", po::value<bool>(&rank_only)->default_value(false), "Use only model translations for optimisation")
("reference-files,r", po::value<vector<string> >(&referenceFiles), "Reference translation files for training")
("reference-files-folds", po::value<vector<string> >(&referenceFilesFolds), "Reference translation files for training, one for each fold")
("relax-BP", po::value<float>(&relax_BP)->default_value(1), "Relax the BP by setting this value between 0 and 1")
("reference-files-folds", po::value<vector<string> >(&referenceFilesFolds), "Reference translation files for training, one for each fold")
("scale-by-inverse-length", po::value<bool>(&scaleByInverseLength)->default_value(false), "Scale the BLEU score by (a history of) the inverse input length")
("scale-by-input-length", po::value<bool>(&scaleByInputLength)->default_value(true), "Scale the BLEU score by (a history of) the input length")
("scale-by-avg-input-length", po::value<bool>(&scaleByAvgInputLength)->default_value(false), "Scale BLEU by an average of the input length")
("scale-by-avg-inverse-length", po::value<bool>(&scaleByAvgInverseLength)->default_value(false), "Scale BLEU by an average of the inverse input length")
("scale-by-x", po::value<float>(&scaleByX)->default_value(1), "Scale the BLEU score by value x")
("scale-margin", po::value<bool>(&scale_margin)->default_value(0), "Scale the margin by the Bleu score of the oracle translation")
("scale-lm", po::value<bool>(&scale_lm)->default_value(false), "Scale the language model feature")
("scale-factor-lm", po::value<float>(&scale_lm_factor)->default_value(2), "Scale the language model feature by this factor")
("scale-wp", po::value<bool>(&scale_wp)->default_value(false), "Scale the word penalty feature")
("scale-factor-wp", po::value<float>(&scale_wp_factor)->default_value(2), "Scale the word penalty feature by this factor")
("scale-margin", po::value<bool>(&scale_margin)->default_value(0), "Scale the margin by the Bleu score of the oracle translation")
("scale-margin-precision", po::value<bool>(&scale_margin_precision)->default_value(0), "Scale the margin by the precision of the oracle translation")
("scale-update", po::value<bool>(&scale_update)->default_value(0), "Scale the update by the Bleu score of the oracle translation")
("scale-update-precision", po::value<bool>(&scale_update_precision)->default_value(0), "Scale the update by the precision of the oracle translation")
("sentence-level-bleu", po::value<bool>(&sentenceLevelBleu)->default_value(true), "Use a sentences level Bleu scoring function")
("separate-updates", po::value<bool>(&separateUpdates)->default_value(false), "Compute separate updates for each sentence in a batch")
("shuffle", po::value<bool>(&shuffle)->default_value(false), "Shuffle input sentences before processing")
("slack-min", po::value<float>(&slack_min)->default_value(0.01), "Minimum slack used")
("slack-step", po::value<float>(&slack_step)->default_value(0), "Increase slack from epoch to epoch by the value provided")
("slack", po::value<float>(&slack)->default_value(0.01), "Use slack in optimiser")
("sparse-average", po::value<bool>(&sparseAverage)->default_value(false), "Average weights by the number of processes")
("sparse-no-average", po::value<bool>(&sparseNoAverage)->default_value(false), "Don't average sparse weights, just sum")
("stop-weights", po::value<bool>(&weightConvergence)->default_value(true), "Stop when weights converge")
("threads", po::value<int>(&threadcount)->default_value(1), "Number of threads used")
("update-scheme", po::value<size_t>(&update_scheme)->default_value(1), "Update scheme, default: 1")
("use-source-length-history", po::value<bool>(&useSourceLengthHistory)->default_value(false), "Use history of source length instead of target length for history Bleu")
("verbosity,v", po::value<int>(&verbosity)->default_value(0), "Verbosity level")
("weight-dump-frequency", po::value<size_t>(&weightDumpFrequency)->default_value(1), "How often per epoch to dump weights, when using mpi")
("weight-dump-stem", po::value<string>(&weightDumpStem)->default_value("weights"), "Stem of filename to use for dumping weights");
@ -264,9 +248,6 @@ int main(int argc, char** argv) {
cerr << "Training with " << mosesConfigFilesFolds.size() << " folds" << endl;
trainWithMultipleFolds = true;
}
if (dummy != -1)
slack = dummy;
if (dumpMixedWeights && (mixingFrequency != weightDumpFrequency)) {
cerr << "Set mixing frequency = weight dump frequency for dumping mixed weights!" << endl;
@ -380,12 +361,10 @@ int main(int argc, char** argv) {
if (scaleByAvgInputLength || scaleByInverseLength || scaleByAvgInverseLength)
scaleByInputLength = false;
if (historyOf1best || historyOfOracles)
if (historyBleu)
sentenceLevelBleu = false;
if (!sentenceLevelBleu) {
if (!historyOf1best && !historyOfOracles) {
historyOf1best = true;
}
historyBleu = true;
}
// initialise Moses
@ -411,17 +390,10 @@ int main(int argc, char** argv) {
MosesDecoder* decoder = new MosesDecoder(configFile, verbosity, decoder_params.size(), decoder_params);
decoder->setBleuParameters(sentenceLevelBleu, scaleByInputLength, scaleByAvgInputLength,
scaleByInverseLength, scaleByAvgInverseLength,
scaleByX, historySmoothing, bleu_smoothing_scheme, relax_BP, useSourceLengthHistory);
scaleByX, historySmoothing, bleu_smoothing_scheme);
SearchAlgorithm searchAlgorithm = staticData.GetSearchAlgorithm();
bool chartDecoding = (searchAlgorithm == ChartDecoding);
cerr << "Normalise weights? " << normaliseWeights << endl;
if (normaliseWeights) {
ScoreComponentCollection startWeights = decoder->getWeights();
startWeights.L1Normalise();
decoder->setWeights(startWeights);
}
if (decode_hope || decode_fear || decode_model) {
size_t decode = 1;
if (decode_fear) decode = 2;
@ -463,8 +435,8 @@ int main(int argc, char** argv) {
cerr << "Optimising using Mira" << endl;
cerr << "slack: " << slack << ", learning rate: " << mira_learning_rate << endl;
}
optimiser = new MiraOptimiser(onlyViolatedConstraints, slack, scale_margin, scale_margin_precision,
scale_update, scale_update_precision, margin_slack, boost, update_scheme, normaliseMargin);
optimiser = new MiraOptimiser(slack, scale_margin, scale_margin_precision,
scale_update, scale_update_precision, boost, normaliseMargin);
learning_rate = mira_learning_rate;
perceptron_update = false;
} else if (learner == "perceptron") {
@ -507,13 +479,6 @@ int main(int argc, char** argv) {
return 1;
}
if (bleuWeight_hope == -1) {
bleuWeight_hope = bleuWeight;
}
if (bleuWeight_fear == -1) {
bleuWeight_fear = bleuWeight;
}
if (max_length_dev_all != -1) {
max_length_dev_hypos = max_length_dev_all;
max_length_dev_hope_ref = max_length_dev_all;
@ -575,16 +540,37 @@ int main(int argc, char** argv) {
// set core weights
ScoreComponentCollection initialWeights = decoder->getWeights();
cerr << "Rank " << rank << ", initial weights: " << initialWeights << endl;
if (coreWeightMap.size() > 0) {
ProducerWeightMap::iterator p;
for(p = coreWeightMap.begin(); p!=coreWeightMap.end(); ++p)
initialWeights.Assign(p->first, p->second);
}
cerr << "Normalise weights? " << normaliseWeights << endl;
if (normaliseWeights)
initialWeights.L1Normalise();
decoder->setWeights(initialWeights);
//Main loop:
// print initial weights
cerr << "Rank " << rank << ", initial weights: " << initialWeights << endl;
// set bleu weight to twice the size of the language model weight(s)
const LMList& lmList = staticData.GetLMList();
if (bleu_weight_lm) {
float lmSum = 0;
for (LMList::const_iterator i = lmList.begin(); i != lmList.end(); ++i)
lmSum += abs(initialWeights.GetScoreForProducer(*i));
bleuWeight = lmSum * bleu_weight_lm_factor;
if (bleuWeight_hope == -1) {
bleuWeight_hope = bleuWeight;
}
if (bleuWeight_fear == -1) {
bleuWeight_fear = bleuWeight;
}
}
cerr << "Bleu weight: " << bleuWeight << endl;
//Main loop:
cerr << "Rank " << rank << ", start weights: " << initialWeights << endl;
ScoreComponentCollection cumulativeWeights; // collect weights per epoch to produce an average
ScoreComponentCollection cumulativeWeightsBinary;
size_t numberOfUpdates = 0;
@ -707,7 +693,7 @@ int main(int argc, char** argv) {
bleuScoresFear.push_back(newScores);
modelScoresHope.push_back(newScores);
modelScoresFear.push_back(newScores);
if (historyOf1best) {
if (historyBleu || debug_model) {
dummyFeatureValues.push_back(newFeatureValues);
dummyBleuScores.push_back(newScores);
dummyModelScores.push_back(newScores);
@ -761,12 +747,30 @@ int main(int argc, char** argv) {
}
if (hope_fear || hope_fear_rank || perceptron_update) {
if (print_weights)
cerr << "Rank " << rank << ", epoch " << epoch << ", current weights: " << mosesWeights << endl;
if (print_core_weights) {
cerr << "Rank " << rank << ", epoch " << epoch << ", current weights: ";
mosesWeights.PrintCoreFeatures();
cerr << endl;
}
// check LM weight
for (LMList::const_iterator i = lmList.begin(); i != lmList.end(); ++i) {
float lmWeight = mosesWeights.GetScoreForProducer(*i);
cerr << "Rank " << rank << ", epoch " << epoch << ", lm weight: " << lmWeight << endl;
if (lmWeight <= 0)
cerr << "ERROR: language model weight should never be <= 0." << endl;
}
// HOPE
/*delete decoder;
StaticData::ClearDataStatic();
decoder = new MosesDecoder(configFile, verbosity, decoder_params.size(), decoder_params);
decoder->setBleuParameters(sentenceLevelBleu, scaleByInputLength, scaleByAvgInputLength, scaleByInverseLength, scaleByAvgInverseLength, scaleByX, historySmoothing, bleu_smoothing_scheme, relax_BP, useSourceLengthHistory);
decoder->setWeights(mosesWeights);*/
if (clear_static) {
delete decoder;
StaticData::ClearDataStatic();
decoder = new MosesDecoder(configFile, verbosity, decoder_params.size(), decoder_params);
decoder->setBleuParameters(sentenceLevelBleu, scaleByInputLength, scaleByAvgInputLength, scaleByInverseLength, scaleByAvgInverseLength, scaleByX, historySmoothing, bleu_smoothing_scheme);
decoder->setWeights(mosesWeights);
}
//cerr << "Rank " << rank << ", epoch " << epoch << ", using weights: " << decoder->getWeights() << endl;
@ -795,9 +799,8 @@ int main(int argc, char** argv) {
float precision = bleuScoresHope[batchPosition][0];
if (historyOf1best) {
if (useSourceLengthHistory) precision /= decoder->getSourceLengthHistory();
else precision /= decoder->getTargetLengthHistory();
if (historyBleu) {
precision /= decoder->getTargetLengthHistory();
}
else {
if (scaleByAvgInputLength) precision /= decoder->getAverageInputLength();
@ -805,13 +808,12 @@ int main(int argc, char** argv) {
precision /= scaleByX;
}
if (scale_margin_precision || scale_update_precision) {
if (historyOf1best || scaleByAvgInputLength || scaleByAvgInverseLength) {
if (historyBleu || scaleByAvgInputLength || scaleByAvgInverseLength) {
cerr << "Rank " << rank << ", epoch " << epoch << ", set hope precision: " << precision << endl;
((MiraOptimiser*) optimiser)->setPrecision(precision);
}
}
// exit(0);
bool skip = false;
// Length-related example selection
@ -820,9 +822,17 @@ int main(int argc, char** argv) {
skip = true;
vector<const Word*> bestModel;
if (historyOf1best && !skip) {
if (debug_model || (historyBleu && !skip)) {
// MODEL (for updating the history only, using dummy vectors)
cerr << "Rank " << rank << ", epoch " << epoch << ", 1best wrt model score (for history or length stabilisation)" << endl;
if (clear_static) {
delete decoder;
StaticData::ClearDataStatic();
decoder = new MosesDecoder(configFile, verbosity, decoder_params.size(), decoder_params);
decoder->setBleuParameters(sentenceLevelBleu, scaleByInputLength, scaleByAvgInputLength, scaleByInverseLength, scaleByAvgInverseLength, scaleByX, historySmoothing, bleu_smoothing_scheme);
decoder->setWeights(mosesWeights);
}
cerr << "Rank " << rank << ", epoch " << epoch << ", 1best wrt model score " << endl;
vector< vector<const Word*> > outputModel = decoder->getNBest(input, *sid, 1, 0.0, bleuWeight,
dummyFeatureValues[batchPosition], dummyBleuScores[batchPosition], dummyModelScores[batchPosition],
1, distinctNbest, avgRefLength, rank, epoch);
@ -837,11 +847,13 @@ int main(int argc, char** argv) {
float bleuRatioHopeFear = 0;
int fearSize = 0;
if (!skip) {
/*delete decoder;
StaticData::ClearDataStatic();
decoder = new MosesDecoder(configFile, verbosity, decoder_params.size(), decoder_params);
decoder->setBleuParameters(sentenceLevelBleu, scaleByInputLength, scaleByAvgInputLength, scaleByInverseLength, scaleByAvgInverseLength, scaleByX, historySmoothing, bleu_smoothing_scheme, relax_BP, useSourceLengthHistory);
decoder->setWeights(mosesWeights);*/
if (clear_static) {
delete decoder;
StaticData::ClearDataStatic();
decoder = new MosesDecoder(configFile, verbosity, decoder_params.size(), decoder_params);
decoder->setBleuParameters(sentenceLevelBleu, scaleByInputLength, scaleByAvgInputLength, scaleByInverseLength, scaleByAvgInverseLength, scaleByX, historySmoothing, bleu_smoothing_scheme);
decoder->setWeights(mosesWeights);
}
cerr << "Rank " << rank << ", epoch " << epoch << ", " << fear_n << "best fear translations" << endl;
vector< vector<const Word*> > outputFear = decoder->getNBest(input, *sid, fear_n, -1.0, bleuWeight_fear,
@ -888,7 +900,7 @@ int main(int argc, char** argv) {
// sanity check
float epsilon = 0.0001;
if (historyOf1best) {
if (historyBleu) {
if (dummyBleuScores[batchPosition][0] > bleuScoresHope[batchPosition][0] &&
dummyModelScores[batchPosition][0] > modelScoresHope[batchPosition][0]) {
if (abs(dummyBleuScores[batchPosition][0] - bleuScoresHope[batchPosition][0]) > epsilon &&
@ -908,8 +920,14 @@ int main(int argc, char** argv) {
}
if (bleuScoresFear[batchPosition][0] > bleuScoresHope[batchPosition][0]) {
if (abs(bleuScoresFear[batchPosition][0] - bleuScoresHope[batchPosition][0]) > epsilon) {
cerr << "Rank " << rank << ", epoch " << epoch << ", ERROR: FEAR translation better than HOPE translation. (abs-diff: " << abs(bleuScoresFear[batchPosition][0] - bleuScoresHope[batchPosition][0]) << " > " << epsilon << ")" <<endl;
// check if it's an error or a warning
skip = true;
if (modelScoresFear[batchPosition][0] > modelScoresHope[batchPosition][0] && abs(modelScoresFear[batchPosition][0] - modelScoresHope[batchPosition][0]) > epsilon) {
cerr << "Rank " << rank << ", epoch " << epoch << ", ERROR: FEAR translation better than HOPE translation. (abs-diff: " << abs(bleuScoresFear[batchPosition][0] - bleuScoresHope[batchPosition][0]) << ")" <<endl;
}
else {
cerr << "Rank " << rank << ", epoch " << epoch << ", WARNING: FEAR translation has better Bleu than HOPE translation. (abs-diff: " << abs(bleuScoresFear[batchPosition][0] - bleuScoresHope[batchPosition][0]) << ")" <<endl;
}
}
}
@ -919,7 +937,7 @@ int main(int argc, char** argv) {
featureValuesFear[batchPosition].clear();
bleuScoresHope[batchPosition].clear();
bleuScoresFear[batchPosition].clear();
if (historyOf1best) {
if (historyBleu) {
dummyFeatureValues[batchPosition].clear();
dummyBleuScores[batchPosition].clear();
}
@ -1143,6 +1161,35 @@ int main(int argc, char** argv) {
break;
}
}
// scale LM feature (to avoid rapid changes)
if (scale_lm) {
for (LMList::const_iterator iter = lmList.begin(); iter != lmList.end(); ++iter) {
// scale up weight
float lmWeight = mosesWeights.GetScoreForProducer(*iter);
mosesWeights.Assign(*iter, lmWeight*scale_lm_factor);
cerr << "Rank " << rank << ", epoch " << epoch << ", lm weight scaled from " << lmWeight << " to " << lmWeight*scale_lm_factor << endl;
// scale down score
scaleFeatureScore(*iter, scale_lm_factor, featureValuesHope, rank, epoch);
scaleFeatureScore(*iter, scale_lm_factor, featureValuesFear, rank, epoch);
scaleFeatureScore(*iter, scale_lm_factor, featureValues, rank, epoch);
}
}
// scale WP
if (scale_wp) {
// scale up weight
WordPenaltyProducer *wp = staticData.GetWordPenaltyProducer();
float wpWeight = mosesWeights.GetScoreForProducer(wp);
mosesWeights.Assign(wp, wpWeight*scale_wp_factor);
cerr << "Rank " << rank << ", epoch " << epoch << ", wp weight scaled from " << wpWeight << " to " << wpWeight*scale_wp_factor << endl;
// scale down score
scaleFeatureScore(wp, scale_wp_factor, featureValuesHope, rank, epoch);
scaleFeatureScore(wp, scale_wp_factor, featureValuesFear, rank, epoch);
scaleFeatureScore(wp, scale_wp_factor, featureValues, rank, epoch);
}
// take logs of feature values
if (logFeatureValues) {
@ -1189,25 +1236,7 @@ int main(int argc, char** argv) {
featureValuesHope[0][0], featureValuesFear[0][0], bleuScoresHope[0][0], bleuScoresFear[0][0],
modelScoresHope[0][0], modelScoresFear[0][0], learning_rate, rank, epoch);
else {
if (batchSize > 1 && separateUpdates) {
// separate updates for all input sentences
ScoreComponentCollection tmpWeights(mosesWeights);
for (size_t i = 0; i < batchSize; ++i) {
// use only the specified batch position to compute the update
int updatePosition = i;
ScoreComponentCollection partialWeightUpdate;
size_t partial_update_status = optimiser->updateWeightsHopeFear(tmpWeights, partialWeightUpdate,
featureValuesHope, featureValuesFear, bleuScoresHope, bleuScoresFear,
modelScoresHope, modelScoresFear, learning_rate, rank, epoch, updatePosition);
if (partial_update_status == 0) {
update_status = 0;
weightUpdate.PlusEquals(partialWeightUpdate);
tmpWeights.PlusEquals(partialWeightUpdate);
}
}
}
else
update_status = optimiser->updateWeightsHopeFear(mosesWeights, weightUpdate,
update_status = optimiser->updateWeightsHopeFear(mosesWeights, weightUpdate,
featureValuesHope, featureValuesFear, bleuScoresHope, bleuScoresFear,
modelScoresHope, modelScoresFear, learning_rate, rank, epoch);
}
@ -1233,6 +1262,25 @@ int main(int argc, char** argv) {
// sumStillViolatedConstraints += update_status;
// rescale LM feature
if (scale_lm) {
for (LMList::const_iterator iter = lmList.begin(); iter != lmList.end(); ++iter) {
// scale weight back down
float lmWeight = mosesWeights.GetScoreForProducer(*iter);
mosesWeights.Assign(*iter, lmWeight/scale_lm_factor);
cerr << "Rank " << rank << ", epoch " << epoch << ", lm weight rescaled from " << lmWeight << " to " << lmWeight/scale_lm_factor << endl;
}
}
// rescale WP feature
if (scale_wp) {
// scale weight back down
WordPenaltyProducer *wp = staticData.GetWordPenaltyProducer();
float wpWeight = mosesWeights.GetScoreForProducer(wp);
mosesWeights.Assign(wp, wpWeight/scale_wp_factor);
cerr << "Rank " << rank << ", epoch " << epoch << ", wp weight rescaled from " << wpWeight << " to " << wpWeight/scale_wp_factor << endl;
}
if (update_status == 0) { // if weights were updated
// apply weight update
cerr << "Rank " << rank << ", epoch " << epoch << ", applying update.." << endl;
@ -1263,23 +1311,31 @@ int main(int argc, char** argv) {
// set new Moses weights
decoder->setWeights(mosesWeights);
if (print_weights)
cerr << "Rank " << rank << ", epoch " << epoch << ", new weights: " << mosesWeights << endl;
// adjust bleu weight
if (bleu_weight_lm_adjust) {
float lmSum = 0;
for (LMList::const_iterator i = lmList.begin(); i != lmList.end(); ++i)
lmSum += abs(mosesWeights.GetScoreForProducer(*i));
bleuWeight = lmSum * bleu_weight_lm_factor;
cerr << "Rank " << rank << ", epoch " << epoch << ", adjusting Bleu weight to " << bleuWeight << " (factor " << bleu_weight_lm_factor << ")" << endl;
if (bleuWeight_hope == -1) {
bleuWeight_hope = bleuWeight;
}
if (bleuWeight_fear == -1) {
bleuWeight_fear = bleuWeight;
}
}
}
// update history (for approximate document Bleu)
if (historyOf1best) {
if (historyBleu) {
for (size_t i = 0; i < oneBests.size(); ++i) {
cerr << "Rank " << rank << ", epoch " << epoch << ", update history with 1best length: " << oneBests[i].size() << " ";
}
decoder->updateHistory(oneBests, inputLengths, ref_ids, rank, epoch);
}
else if (historyOfOracles) {
for (size_t i = 0; i < oracles.size(); ++i) {
cerr << "Rank " << rank << ", epoch " << epoch << ", update history with oracle length: " << oracles[i].size() << " ";
}
decoder->updateHistory(oracles, inputLengths, ref_ids, rank, epoch);
}
deleteTranslations(oracles);
deleteTranslations(oneBests);
} // END TRANSLATE AND UPDATE BATCH
@ -1462,7 +1518,7 @@ int main(int argc, char** argv) {
fearPlusFeatures.close();
}
if (verbosity > 0) {
if (historyBleu) {
cerr << "Bleu feature history after epoch " << epoch << endl;
decoder->printBleuFeatureHistory(cerr);
}
@ -1537,42 +1593,6 @@ int main(int argc, char** argv) {
mpi::broadcast(world, stop, 0);
#endif
} //end if (weightConvergence)
// adjust flexible parameters
if (!stop && epoch >= adapt_after_epoch) {
// if using flexible slack, decrease slack parameter for next epoch
if (slack_step > 0) {
if (slack - slack_step >= slack_min) {
if (typeid(*optimiser) == typeid(MiraOptimiser)) {
slack -= slack_step;
VERBOSE(1, "Change slack to: " << slack << endl);
((MiraOptimiser*) optimiser)->setSlack(slack);
}
}
}
// if using flexible margin slack, decrease margin slack parameter for next epoch
if (margin_slack_incr > 0.0001) {
if (typeid(*optimiser) == typeid(MiraOptimiser)) {
margin_slack += margin_slack_incr;
VERBOSE(1, "Change margin slack to: " << margin_slack << endl);
((MiraOptimiser*) optimiser)->setMarginSlack(margin_slack);
}
}
// change learning rate
if ((decrease_learning_rate > 0) && (learning_rate - decrease_learning_rate >= min_learning_rate)) {
learning_rate -= decrease_learning_rate;
if (learning_rate <= 0.0001) {
learning_rate = 0;
stop = true;
#ifdef MPI_ENABLE
mpi::broadcast(world, stop, 0);
#endif
}
VERBOSE(1, "Change learning rate to " << learning_rate << endl);
}
}
}
} // end of epoch loop
@ -1806,3 +1826,17 @@ void decodeHopeOrFear(size_t rank, size_t size, size_t decode, string filename,
delete decoder;
exit(0);
}
void scaleFeatureScore(ScoreProducer *sp, float scaling_factor, vector<vector<ScoreComponentCollection> > &featureValues, size_t rank, size_t epoch) {
string name = sp->GetScoreProducerWeightShortName();
// scale down score
float featureScore;
for (size_t i=0; i<featureValues.size(); ++i) { // each item in batch
for (size_t j=0; j<featureValues[i].size(); ++j) { // each item in nbest
featureScore = featureValues[i][j].GetScoreForProducer(sp);
featureValues[i][j].Assign(sp, featureScore/scaling_factor);
cerr << "Rank " << rank << ", epoch " << epoch << ", " << name << " score scaled from " << featureScore << " to " << featureScore/scaling_factor << endl;
}
}
}
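For context on the scale-lm/scale-wp logic above: multiplying a weight by a factor while dividing the corresponding feature score by the same factor leaves the decoder's model score (the weight-feature inner product) unchanged, so the rescaling only shifts magnitude between weight and score around the update. A minimal standalone sketch of that invariant, using plain vectors rather than the Moses ScoreComponentCollection API:

#include <cmath>
#include <cstdio>
#include <vector>

// model score = dot(weights, featureScores)
static float Dot(const std::vector<float>& w, const std::vector<float>& f) {
  float s = 0;
  for (size_t i = 0; i < w.size(); ++i) s += w[i] * f[i];
  return s;
}

int main() {
  // index 0 plays the role of the LM (or WP) feature; values are made up
  std::vector<float> weights(3);  weights[0] = 0.5f;  weights[1] = -0.2f; weights[2] = 0.1f;
  std::vector<float> scores(3);   scores[0] = -12.0f; scores[1] = 4.0f;   scores[2] = -3.0f;

  float before = Dot(weights, scores);
  const float factor = 2.0f;   // analogue of scale_lm_factor / scale_wp_factor
  weights[0] *= factor;        // scale the weight up ...
  scores[0]  /= factor;        // ... and the matching feature score down
  float after = Dot(weights, scores);

  // the decoder's model score is unchanged
  std::printf("before=%f after=%f diff=%g\n", before, after, std::fabs(before - after));
  return 0;
}

The same cancellation is what scaleFeatureScore relies on when it divides the hope/fear/model feature scores by the factor the corresponding weight was multiplied with.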

View File

@ -51,5 +51,6 @@ void ignoreCoreFeatures(std::vector<std::vector<Moses::ScoreComponentCollection>
void takeLogs(std::vector<std::vector<Moses::ScoreComponentCollection> > &featureValues, size_t base);
void deleteTranslations(std::vector<std::vector<const Moses::Word*> > &translations);
void decodeHopeOrFear(size_t rank, size_t size, size_t decode, std::string decode_filename, std::vector<std::string> &inputSentences, Mira::MosesDecoder* decoder, size_t n);
void scaleFeatureScore(Moses::ScoreProducer *sp, float scaling_factor, std::vector<std::vector<Moses::ScoreComponentCollection> > &featureValues, size_t rank, size_t epoch);
#endif /* MAIN_H_ */

View File

@ -28,7 +28,6 @@ size_t MiraOptimiser::updateWeights(
// most violated constraint in batch
ScoreComponentCollection max_batch_featureValueDiff;
float max_batch_lossMinusModelScoreDiff = -1;
// Make constraints for new hypothesis translations
float epsilon = 0.0001;
@ -56,19 +55,16 @@ size_t MiraOptimiser::updateWeights(
// float modelScoreDiff = featureValueDiff.InnerProduct(currWeights);
float modelScoreDiff = oracleModelScores[i] - modelScores[i][j];
float diff = 0;
if (loss > (modelScoreDiff + m_margin_slack)) {
diff = loss - (modelScoreDiff + m_margin_slack);
if (loss > modelScoreDiff) {
diff = loss - modelScoreDiff;
}
cerr << "Rank " << rank << ", epoch " << epoch << ", constraint: " << modelScoreDiff << " + " << m_margin_slack << " >= " << loss << " (current violation: " << diff << ")" << endl;
cerr << "Rank " << rank << ", epoch " << epoch << ", constraint: " << modelScoreDiff << " >= " << loss << " (current violation: " << diff << ")" << endl;
if (diff > epsilon) {
violated = true;
}
else if (m_onlyViolatedConstraints) {
addConstraint = false;
}
float lossMinusModelScoreDiff = loss - (modelScoreDiff + m_margin_slack);
float lossMinusModelScoreDiff = loss - modelScoreDiff;
if (addConstraint) {
if (m_normaliseMargin)
lossMinusModelScoreDiff = (2/(1 + exp(- lossMinusModelScoreDiff))) - 1;
@ -146,7 +142,7 @@ size_t MiraOptimiser::updateWeights(
for (size_t i = 0; i < featureValueDiffs.size(); ++i) {
float modelScoreDiff = featureValueDiffs[i].InnerProduct(currWeights);
float loss = all_losses[i];
float diff = loss - (modelScoreDiff + m_margin_slack);
float diff = loss - modelScoreDiff;
if (diff > epsilon) {
++violatedConstraintsAfter;
newDistanceFromOptimum += diff;
@ -180,7 +176,6 @@ size_t MiraOptimiser::updateWeightsHopeFear(
// most violated constraint in batch
ScoreComponentCollection max_batch_featureValueDiff;
float max_batch_lossMinusModelScoreDiff = -1;
// Make constraints for new hypothesis translations
float epsilon = 0.0001;
@ -215,19 +210,16 @@ size_t MiraOptimiser::updateWeightsHopeFear(
// float modelScoreDiff = featureValueDiff.InnerProduct(currWeights);
float modelScoreDiff = modelScoresHope[i][j] - modelScoresFear[i][k];
float diff = 0;
if (loss > (modelScoreDiff + m_margin_slack)) {
diff = loss - (modelScoreDiff + m_margin_slack);
if (loss > modelScoreDiff) {
diff = loss - modelScoreDiff;
}
cerr << "Rank " << rank << ", epoch " << epoch << ", constraint: " << modelScoreDiff << " + " << m_margin_slack << " >= " << loss << " (current violation: " << diff << ")" << endl;
cerr << "Rank " << rank << ", epoch " << epoch << ", constraint: " << modelScoreDiff << " >= " << loss << " (current violation: " << diff << ")" << endl;
if (diff > epsilon) {
violated = true;
}
else if (m_onlyViolatedConstraints) {
addConstraint = false;
}
float lossMinusModelScoreDiff = loss - (modelScoreDiff + m_margin_slack);
float lossMinusModelScoreDiff = loss - modelScoreDiff;
if (addConstraint) {
if (m_normaliseMargin)
lossMinusModelScoreDiff = (2/(1 + exp(- lossMinusModelScoreDiff))) - 1;
@ -318,7 +310,7 @@ size_t MiraOptimiser::updateWeightsHopeFear(
for (size_t i = 0; i < featureValueDiffs.size(); ++i) {
float modelScoreDiff = featureValueDiffs[i].InnerProduct(currWeights);
float loss = all_losses[i];
float diff = loss - (modelScoreDiff + m_margin_slack);
float diff = loss - modelScoreDiff;
if (diff > epsilon) {
++violatedConstraintsAfter;
newDistanceFromOptimum += diff;
@ -349,86 +341,29 @@ size_t MiraOptimiser::updateWeightsAnalytically(
// cerr << "Rank " << rank << ", epoch " << epoch << ", hope: " << featureValuesHope << endl;
// cerr << "Rank " << rank << ", epoch " << epoch << ", fear: " << featureValuesFear << endl;
// scenario 1: reward only-hope, penalize only-fear
// scenario 2: reward all-hope, penalize only-fear
// scenario 3: reward all-hope
// scenario 4: reward strongly only-hope, reward mildly all-hope
// scenario 5: reward strongly only-hope, reward mildly all-hope, penalize only-fear
// scenario 6: reward only-hope
// scenario 7: penalize only-fear
ScoreComponentCollection featureValueDiff;
switch (m_update_scheme) {
case 2:
// values: 1: all-hope, -1: only-fear
featureValueDiff = featureValuesHope;
featureValueDiff.MinusEquals(featureValuesFear);
featureValueDiff.SparsePlusEquals(featureValuesHope);
//max: 1 (set all 2 to 1)
featureValueDiff.CapMax(1);
break;
case 3:
// values: 1: all-hope
featureValueDiff = featureValuesHope;
break;
case 4:
// values: 2: only-hope, 1: both
featureValueDiff = featureValuesHope;
featureValueDiff.MinusEquals(featureValuesFear);
featureValueDiff.SparsePlusEquals(featureValuesHope);
// min: 0 (set all -1 to 0)
featureValueDiff.CapMin(0);
break;
case 5:
// values: 2: only-hope, 1: both, -1: only-fear
featureValueDiff = featureValuesHope;
featureValueDiff.MinusEquals(featureValuesFear);
featureValueDiff.SparsePlusEquals(featureValuesHope);
break;
case 6:
// values: 1: only-hope
featureValueDiff = featureValuesHope;
featureValueDiff.MinusEquals(featureValuesFear);
// min: 0 (set all -1 to 0)
featureValueDiff.CapMin(0);
break;
case 7:
// values: -1: only-fear
featureValueDiff = featureValuesHope;
featureValueDiff.MinusEquals(featureValuesFear);
// max: 0 (set all 1 to 0)
featureValueDiff.CapMax(0);
break;
case 1:
default:
// values: 1: only-hope, -1: only-fear
featureValueDiff = featureValuesHope;
featureValueDiff.MinusEquals(featureValuesFear);
break;
ScoreComponentCollection featureValueDiff = featureValuesHope;
featureValueDiff.MinusEquals(featureValuesFear);
if (featureValueDiff.GetL1Norm() == 0) {
cerr << "Rank " << rank << ", epoch " << epoch << ", features equal --> skip" << endl;
return 1;
}
if (featureValueDiff.GetL1Norm() == 0) {
cerr << "Rank " << rank << ", epoch " << epoch << ", features equal --> skip" << endl;
return 1;
}
// cerr << "Rank " << rank << ", epoch " << epoch << ", hope - fear: " << featureValueDiff << endl;
// float modelScoreDiff = featureValueDiff.InnerProduct(currWeights);
float modelScoreDiff = modelScoreHope - modelScoreFear;
float loss = bleuScoreHope - bleuScoreFear;
float diff = 0;
if (loss > (modelScoreDiff + m_margin_slack)) {
diff = loss - (modelScoreDiff + m_margin_slack);
if (loss > modelScoreDiff) {
diff = loss - modelScoreDiff;
}
cerr << "Rank " << rank << ", epoch " << epoch << ", constraint: " << modelScoreDiff << " + " << m_margin_slack << " >= " << loss << " (current violation: " << diff << ")" << endl;
cerr << "Rank " << rank << ", epoch " << epoch << ", constraint: " << modelScoreDiff << " >= " << loss << " (current violation: " << diff << ")" << endl;
if (m_normaliseMargin) {
modelScoreDiff = (2/(1 + exp(-modelScoreDiff))) - 1;
loss = (2/(1 + exp(-loss))) - 1;
if (loss > (modelScoreDiff + m_margin_slack)) {
diff = loss - (modelScoreDiff + m_margin_slack);
if (loss > modelScoreDiff) {
diff = loss - modelScoreDiff;
}
cerr << "Rank " << rank << ", epoch " << epoch << ", normalised constraint: " << modelScoreDiff << " + " << m_margin_slack << " >= " << loss << " (current violation: " << diff << ")" << endl;
cerr << "Rank " << rank << ", epoch " << epoch << ", normalised constraint: " << modelScoreDiff << " >= " << loss << " (current violation: " << diff << ")" << endl;
}
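// Note: 2/(1+exp(-x)) - 1 equals tanh(x/2), so the loss and the model-score
// difference are both squashed into (-1, 1) before the violation is re-checked.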
if (m_scale_margin) {
@ -512,7 +447,7 @@ size_t MiraOptimiser::updateWeightsAnalytically(
featureValueDiff = featureValuesHope;
featureValueDiff.MinusEquals(featureValuesFear);
modelScoreDiff = featureValueDiff.InnerProduct(newWeights);
diff = loss - (modelScoreDiff + m_margin_slack);
diff = loss - modelScoreDiff;
// approximate comparison between floats!
if (diff > epsilon) {
constraintViolatedAfter = true;
@ -581,9 +516,6 @@ size_t MiraOptimiser::updateWeightsRankModel(
if (diff > epsilon) {
violated = true;
}
else if (m_onlyViolatedConstraints) {
addConstraint = false;
}
float lossMinusModelScoreDiff = loss - modelScoreDiff;
if (addConstraint) {
@ -663,7 +595,7 @@ size_t MiraOptimiser::updateWeightsRankModel(
for (size_t i = 0; i < featureValueDiffs.size(); ++i) {
float modelScoreDiff = featureValueDiffs[i].InnerProduct(currWeights);
float loss = all_losses[i];
float diff = loss - (modelScoreDiff + m_margin_slack);
float diff = loss - modelScoreDiff;
if (diff > epsilon) {
++violatedConstraintsAfter;
newDistanceFromOptimum += diff;
@ -731,9 +663,6 @@ size_t MiraOptimiser::updateWeightsHopeFearAndRankModel(
if (diff > epsilon) {
violated = true;
}
else if (m_onlyViolatedConstraints) {
addConstraint = false;
}
float lossMinusModelScoreDiff = loss - modelScoreDiff;
if (addConstraint) {
@ -787,9 +716,6 @@ size_t MiraOptimiser::updateWeightsHopeFearAndRankModel(
if (diff > epsilon) {
violated = true;
}
else if (m_onlyViolatedConstraints) {
addConstraint = false;
}
float lossMinusModelScoreDiff = loss - modelScoreDiff;
if (addConstraint) {
@ -869,7 +795,7 @@ size_t MiraOptimiser::updateWeightsHopeFearAndRankModel(
for (size_t i = 0; i < featureValueDiffs.size(); ++i) {
float modelScoreDiff = featureValueDiffs[i].InnerProduct(currWeights);
float loss = all_losses[i];
float diff = loss - (modelScoreDiff + m_margin_slack);
float diff = loss - modelScoreDiff;
if (diff > epsilon) {
++violatedConstraintsAfter;
newDistanceFromOptimum += diff;
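With the margin slack and the only-violated-constraints option removed, each constraint above reduces to the plain MIRA form: the hope-fear model-score difference should be at least the BLEU loss, and the recorded violation is max(0, loss - modelScoreDiff). For the single-constraint (analytical) case this leads to the familiar clipped update; a rough sketch under that reading, not the exact Moses implementation:

#include <algorithm>
#include <vector>

// Sketch of the analytical MIRA step for one hope/fear constraint:
//   violation = max(0, loss - modelScoreDiff)
//   step      = min(slack, violation / ||delta||^2),  delta = featuresHope - featuresFear
//   weights  += step * delta
std::vector<float> MiraStep(std::vector<float> weights,
                            const std::vector<float>& delta,
                            float loss, float modelScoreDiff, float slack) {
  float violation = std::max(0.0f, loss - modelScoreDiff);
  float sqNorm = 0.0f;
  for (size_t i = 0; i < delta.size(); ++i) sqNorm += delta[i] * delta[i];
  if (violation <= 0.0f || sqNorm <= 0.0f) return weights;  // satisfied, or no direction
  float step = std::min(slack, violation / sqNorm);         // slack caps the step size
  for (size_t i = 0; i < delta.size(); ++i) weights[i] += step * delta[i];
  return weights;
}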

View File

@ -67,20 +67,16 @@ namespace Mira {
MiraOptimiser() :
Optimiser() { }
MiraOptimiser(bool onlyViolatedConstraints, float slack, bool scale_margin, bool scale_margin_precision,
bool scale_update, bool scale_update_precision, float margin_slack, bool boost,
size_t update_scheme, bool normaliseMargin) :
MiraOptimiser(float slack, bool scale_margin, bool scale_margin_precision,
bool scale_update, bool scale_update_precision, bool boost, bool normaliseMargin) :
Optimiser(),
m_onlyViolatedConstraints(onlyViolatedConstraints),
m_slack(slack),
m_scale_margin(scale_margin),
m_scale_margin_precision(scale_margin_precision),
m_scale_update(scale_update),
m_scale_update_precision(scale_update_precision),
m_precision(1),
m_margin_slack(margin_slack),
m_boost(boost),
m_update_scheme(update_scheme),
m_normaliseMargin(normaliseMargin) { }
size_t updateWeights(Moses::ScoreComponentCollection& currWeights,
@ -145,10 +141,6 @@ namespace Mira {
void setSlack(float slack) {
m_slack = slack;
}
void setMarginSlack(float margin_slack) {
m_margin_slack = margin_slack;
}
void setPrecision(float precision) {
m_precision = precision;
@ -156,15 +148,9 @@ namespace Mira {
private:
// add only violated constraints to the optimisation problem
bool m_onlyViolatedConstraints;
// regularise Hildreth updates
float m_slack;
// slack when comparing losses to model scores
float m_margin_slack;
// scale margin with BLEU score or precision
bool m_scale_margin, m_scale_margin_precision;
@ -176,9 +162,6 @@ namespace Mira {
// boosting of updates on misranked candidates
bool m_boost;
// select 1 of 5 different update schemes
size_t m_update_scheme;
// squash margin between 0 and 1
bool m_normaliseMargin;
};

View File

@ -92,8 +92,7 @@ void BleuScoreFeature::PrintHistory(std::ostream& out) const {
void BleuScoreFeature::SetBleuParameters(bool sentenceBleu, bool scaleByInputLength, bool scaleByAvgInputLength,
bool scaleByInverseLength, bool scaleByAvgInverseLength,
float scaleByX, float historySmoothing, size_t scheme, float relaxBP,
bool useSourceLengthHistory) {
float scaleByX, float historySmoothing, size_t scheme) {
m_sentence_bleu = sentenceBleu;
m_scale_by_input_length = scaleByInputLength;
m_scale_by_avg_input_length = scaleByAvgInputLength;
@ -102,8 +101,6 @@ void BleuScoreFeature::SetBleuParameters(bool sentenceBleu, bool scaleByInputLen
m_scale_by_x = scaleByX;
m_historySmoothing = historySmoothing;
m_smoothing_scheme = (SmoothingScheme)scheme;
m_relax_BP = relaxBP;
m_useSourceLengthHistory = useSourceLengthHistory;
}
// Incoming references (refs) are stored as refs[file_id][[sent_id][reference]]
@ -633,7 +630,7 @@ float BleuScoreFeature::CalculateBleu(BleuScoreState* state) const {
smoothed_count += 1;
}
break;
case LIGHT:
case PLUS_POINT_ONE:
if (i > 0) {
// smoothing for all n > 1
smoothed_matches += 0.1;
@ -662,9 +659,9 @@ float BleuScoreFeature::CalculateBleu(BleuScoreState* state) const {
// where
// c: length of the candidate translation
// r: effective reference length (sum of best match lengths for each candidate sentence)
if (state->m_target_length < (state->m_scaled_ref_length * m_relax_BP)) {
if (state->m_target_length < state->m_scaled_ref_length) {
float smoothed_target_length = m_target_length_history + state->m_target_length;
float smoothed_ref_length = m_ref_length_history + (state->m_scaled_ref_length * m_relax_BP);
float smoothed_ref_length = m_ref_length_history + state->m_scaled_ref_length;
precision *= exp(1 - (smoothed_ref_length/ smoothed_target_length));
}
@ -696,8 +693,8 @@ float BleuScoreFeature::CalculateBleu(BleuScoreState* state) const {
// = BP * 4th root(PRODUCT_1_4 p_n)
for (size_t i = 0; i < BleuScoreState::bleu_order; i++) {
if (state->m_ngram_counts[i]) {
smoothed_matches = m_match_history[i] + state->m_ngram_matches[i];
smoothed_count = m_count_history[i] + state->m_ngram_counts[i];
smoothed_matches = m_match_history[i] + state->m_ngram_matches[i] + 0.1;
smoothed_count = m_count_history[i] + state->m_ngram_counts[i] + 0.1;
precision *= smoothed_matches / smoothed_count;
}
}
@ -705,18 +702,18 @@ float BleuScoreFeature::CalculateBleu(BleuScoreState* state) const {
// take geometric mean
precision = pow(precision, (float)1/4);
// BP
// Apply brevity penalty if applicable.
if (m_target_length_history + state->m_target_length < m_ref_length_history + state->m_scaled_ref_length)
precision *= exp(1 - (m_ref_length_history + state->m_scaled_ref_length/m_target_length_history + state->m_target_length));
// cerr << "precision: " << precision << endl;
//cerr << "\nprecision: " << precision << endl;
// **BLEU score of pseudo-document**
float precision_pd = 1.0;
if (m_target_length_history > 0) {
for (size_t i = 0; i < BleuScoreState::bleu_order; i++)
if (m_count_history[i] != 0)
precision_pd *= m_match_history[i]/m_count_history[i];
precision_pd *= (m_match_history[i] + 0.1)/(m_count_history[i] + 0.1);
// take geometric mean
precision_pd = pow(precision_pd, (float)1/4);
@ -729,18 +726,16 @@ float BleuScoreFeature::CalculateBleu(BleuScoreState* state) const {
precision_pd = 0;
// **end BLEU of pseudo-document**
// cerr << "precision pd: " << precision_pd << endl;
//cerr << "precision pd: " << precision_pd << endl;
float sentence_impact;
if (m_target_length_history > 0) {
if (m_source_length_history)
sentence_impact = m_source_length_history * (precision - precision_pd);
else
sentence_impact = m_target_length_history * (precision - precision_pd);
}
if (m_target_length_history > 0)
sentence_impact = m_target_length_history * (precision - precision_pd);
else
sentence_impact = precision;
sentence_impact = precision;
sentence_impact *= 10;
//cerr << "sentence impact: " << sentence_impact << endl;
return sentence_impact;
}
}
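A compact summary of the quantity computed above, as a reading of the code: with M_n, C_n the n-gram matches/counts accumulated in the history, m_n, c_n those of the current sentence, and c, r the history-augmented candidate and (scaled) reference lengths,

\[
p_n^{(+s)} = \frac{M_n + m_n + 0.1}{C_n + c_n + 0.1}, \qquad
p_n^{(hist)} = \frac{M_n + 0.1}{C_n + 0.1}, \qquad
\mathrm{BLEU} = \mathrm{BP}\cdot\Bigl(\prod_{n=1}^{4} p_n\Bigr)^{1/4}, \qquad
\mathrm{BP} = \min\bigl(1,\; e^{\,1 - r/c}\bigr),
\]
\[
\mathrm{sentence\_impact} = 10 \cdot c_{hist} \cdot \bigl(\mathrm{BLEU}^{(+s)} - \mathrm{BLEU}^{(hist)}\bigr),
\]

i.e. the feature value is ten times the history-weighted change in pseudo-document BLEU caused by adding the current sentence (or simply ten times the sentence's smoothed BLEU while the history is still empty).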

View File

@ -66,7 +66,7 @@ public:
m_scale_by_x(1),
m_historySmoothing(0.9),
m_useSourceLengthHistory(0),
m_smoothing_scheme(PLUS_ONE),
m_smoothing_scheme(PLUS_POINT_ONE),
m_relax_BP(1) {}
std::string GetScoreProducerDescription() const
@ -93,8 +93,7 @@ public:
void PrintRefLength(const std::vector<size_t>& ref_ids);
void SetBleuParameters(bool sentenceBleu, bool scaleByInputLength, bool scaleByAvgInputLength,
bool scaleByInverseLength, bool scaleByAvgInverseLength,
float scaleByX, float historySmoothing, size_t scheme, float relaxBP,
bool useSourceLengthHistory);
float scaleByX, float historySmoothing, size_t scheme);
void GetNgramMatchCounts(Phrase&,
const NGrams&,
@ -163,7 +162,7 @@ private:
float m_historySmoothing;
bool m_useSourceLengthHistory;
enum SmoothingScheme { PLUS_ONE = 1, LIGHT = 2, PAPINENI = 3 };
enum SmoothingScheme { PLUS_ONE = 1, PLUS_POINT_ONE = 2, PAPINENI = 3 };
SmoothingScheme m_smoothing_scheme;
// relax application of the BP by setting a value between 0 and 1

View File

@ -242,7 +242,6 @@ void ChartRuleLookupManagerOnDisk::GetChartRuleCollection(
const OnDiskPt::TargetPhraseCollection *tpcollBerkeleyDb = node->GetTargetPhraseCollection(m_dictionary.GetTableLimit(), m_dbWrapper);
std::vector<float> weightT = staticData.GetTranslationSystem(TranslationSystem::DEFAULT).GetTranslationWeights();
cerr << "Read weightT from translation sytem.. " << std::endl;
targetPhraseCollection
= tpcollBerkeleyDb->ConvertToMoses(m_inputFactorsVec
,m_outputFactorsVec

View File

@ -323,6 +323,17 @@ namespace Moses {
m_coreFeatures[i] = logOfValue;
}
}
void FVector::printCoreFeatures() {
cerr << "core=(";
for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
cerr << m_coreFeatures[i];
if (i + 1 < m_coreFeatures.size()) {
cerr << ",";
}
}
cerr << ") ";
}
FVector& FVector::operator+= (const FVector& rhs) {
if (rhs.m_coreFeatures.size() > m_coreFeatures.size())
@ -519,9 +530,9 @@ namespace Moses {
FValue FVector::l1norm_coreFeatures() const {
FValue norm = 0;
for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
// ignore Bleu score feature (last feature)
for (size_t i = 0; i < m_coreFeatures.size()-1; ++i)
norm += abs(m_coreFeatures[i]);
}
return norm;
}

View File

@ -197,6 +197,7 @@ namespace Moses {
/** additional */
void logCoreFeatures(size_t baseOfLog);
void printCoreFeatures();
//scale so that abs. value is less than maxvalue
void thresholdScale(float maxValue );

View File

@ -447,14 +447,12 @@ public:
//tally up
std::vector<float> weightT = system.GetTranslationWeights();
std::cerr << "Read weightT from translation sytem.. " << std::endl;
//float score=std::inner_product(nscores.begin(), nscores.end(), m_weights.begin(), 0.0f);
float score=std::inner_product(nscores.begin(), nscores.end(), weightT.begin(), 0.0f);
//count word penalty
float weightWP = system.GetWeightWordPenalty();
std::cerr << "Read weightWP from translation sytem: " << weightWP << std::endl;
//score-=tcands[i].tokens.size() * m_weightWP;
//score-=tcands[i].tokens.size() * m_weightWP;
score-=tcands[i].tokens.size() * weightWP;
std::pair<E2Costs::iterator,bool> p=e2costs.insert(std::make_pair(tcands[i].tokens,TScores()));

View File

@ -79,7 +79,6 @@ PhraseDictionary* PhraseDictionaryFeature::LoadPhraseTable(const TranslationSyst
{
const StaticData& staticData = StaticData::Instance();
std::vector<float> weightT = system->GetTranslationWeights();
cerr << "Read weightT from translation sytem.. " << std::endl;
if (m_implementation == Memory) {
// memory phrase table

View File

@ -57,7 +57,6 @@ void PhraseDictionaryALSuffixArray::InitializeForInput(InputType const& source)
std::auto_ptr<RuleTableLoader> loader =
RuleTableLoaderFactory::Create(grammarFile);
std::vector<float> weightT = StaticData::Instance().GetTranslationSystem(TranslationSystem::DEFAULT).GetTranslationWeights();
cerr << "Read weightT from translation sytem.. " << std::endl;
//bool ret = loader->Load(*m_input, *m_output, inFile, *m_weight, m_tableLimit,
bool ret = loader->Load(*m_input, *m_output, inFile, weightT, m_tableLimit,
*m_languageModels, m_wpProducer, *this);

View File

@ -95,7 +95,6 @@ ChartRuleLookupManager *PhraseDictionaryOnDisk::CreateRuleLookupManager(
const ChartCellCollection &cellCollection)
{
std::vector<float> weightT = StaticData::Instance().GetTranslationSystem(TranslationSystem::DEFAULT).GetTranslationWeights();
cerr << "Read weightT from translation sytem.. " << std::endl;
return new ChartRuleLookupManagerOnDisk(sentence, cellCollection, *this,
m_dbWrapper, m_languageModels,
m_wpProducer, m_inputFactorsVec,

View File

@ -78,6 +78,10 @@ private:
}
public:
static void ResetCounter() {
s_denseVectorSize = 0;
}
//! Create a new score collection with all values set to 0.0
ScoreComponentCollection();
@ -314,6 +318,10 @@ public:
void LogCoreFeatures(size_t baseOfLog) {
m_scores.logCoreFeatures(baseOfLog);
}
void PrintCoreFeatures() {
m_scores.printCoreFeatures();
}
void ThresholdScaling(float maxValue)
{

View File

@ -34,6 +34,10 @@ public:
static const size_t unlimited;
static void ResetDescriptionCounts() {
description_counts.clear();
}
//! returns the number of scores that a subclass produces.
//! For example, a language model conventionally produces 1, a translation table some arbitrary number, etc
//! sparse features returned unlimited

View File

@ -145,7 +145,7 @@ void SearchCubePruning::ProcessSentence()
stackNo++;
}
PrintBitmapContainerGraph();
//PrintBitmapContainerGraph();
// some more logging
IFVERBOSE(2) {

View File

@ -116,6 +116,49 @@ StaticData::StaticData()
Phrase::InitializeMemPool();
}
void StaticData::ClearData() {
for (size_t i=0; i < m_decodeGraphs.size(); ++i)
delete m_decodeGraphs[i];
m_decodeGraphs.clear();
m_decodeGraphBackoff.clear();
m_translationSystems.clear();
for (size_t i=0; i < m_wordPenaltyProducers.size(); ++i) {
ScoreComponentCollection::UnregisterScoreProducer(m_wordPenaltyProducers[i]);
delete m_wordPenaltyProducers[i];
}
m_wordPenaltyProducers.clear();
for (size_t i=0; i < m_distortionScoreProducers.size(); ++i) {
ScoreComponentCollection::UnregisterScoreProducer(m_distortionScoreProducers[i]);
delete m_distortionScoreProducers[i];
}
m_distortionScoreProducers.clear();
for (size_t i=0; i < m_phraseDictionary.size(); ++i) {
ScoreComponentCollection::UnregisterScoreProducer(m_phraseDictionary[i]);
delete m_phraseDictionary[i];
}
m_phraseDictionary.clear();
for (size_t i=0; i < m_reorderModels.size(); ++i) {
ScoreComponentCollection::UnregisterScoreProducer(m_reorderModels[i]);
delete m_reorderModels[i];
}
m_reorderModels.clear();
for (LMList::const_iterator k = m_languageModel.begin(); k != m_languageModel.end(); ++k) {
ScoreComponentCollection::UnregisterScoreProducer(*k);
// delete *k;
}
m_languageModel.CleanUp();
ScoreComponentCollection::UnregisterScoreProducer(m_bleuScoreFeature);
ScoreComponentCollection::UnregisterScoreProducer(m_unknownWordPenaltyProducer);
m_inputFactorOrder.clear();
m_outputFactorOrder.clear();
ScoreComponentCollection::ResetCounter();
ScoreProducer::ResetDescriptionCounts();
}
bool StaticData::LoadData(Parameter *parameter)
{
ResetUserTime();
@ -282,6 +325,8 @@ bool StaticData::LoadData(Parameter *parameter)
m_useTransOptCache = false;
}
std::cerr << "transOptCache: " << m_useTransOptCache << std::endl;
std::cerr << "transOptCache max size: " << m_transOptCacheMaxSize << std::endl;
//input factors
const vector<string> &inputFactorVector = m_parameter->GetParam("input-factors");
@ -343,6 +388,7 @@ bool StaticData::LoadData(Parameter *parameter)
// settings for pruning
m_maxHypoStackSize = (m_parameter->GetParam("stack").size() > 0)
? Scan<size_t>(m_parameter->GetParam("stack")[0]) : DEFAULT_MAX_HYPOSTACK_SIZE;
std::cerr << "max stack size: " << m_maxHypoStackSize << std::endl;
m_minHypoStackDiversity = 0;
if (m_parameter->GetParam("stack-diversity").size() > 0) {
if (m_maxDistortion > 15) {
@ -366,6 +412,10 @@ bool StaticData::LoadData(Parameter *parameter)
TransformScore(Scan<float>(m_parameter->GetParam("translation-option-threshold")[0]))
: TransformScore(DEFAULT_TRANSLATION_OPTION_THRESHOLD);
std::cerr << "beamwidth: " << m_beamWidth << std::endl;
std::cerr << "early discarding threshold: " << m_earlyDiscardingThreshold << std::endl;
std::cerr << "translOptThreshold: " << m_translationOptionThreshold << std::endl;
m_maxNoTransOptPerCoverage = (m_parameter->GetParam("max-trans-opt-per-coverage").size() > 0)
? Scan<size_t>(m_parameter->GetParam("max-trans-opt-per-coverage")[0]) : DEFAULT_MAX_TRANS_OPT_SIZE;
@ -1697,7 +1747,7 @@ bool StaticData::LoadPhrasePairFeature()
size_t sourceFactorId = Scan<size_t>(factors[0]);
size_t targetFactorId = Scan<size_t>(factors[1]);
bool simple = true, sourceContext = false, ignorePunctuation = true;
bool simple = true, sourceContext = false, ignorePunctuation = false;
if (tokens.size() >= 3) {
simple = Scan<size_t>(tokens[1]);
sourceContext = Scan<size_t>(tokens[2]);
@ -1837,7 +1887,7 @@ bool StaticData::LoadWordTranslationFeature()
FactorType factorIdSource = Scan<size_t>(factors[0]);
FactorType factorIdTarget = Scan<size_t>(factors[1]);
bool simple = true, sourceTrigger = false, targetTrigger = false, ignorePunctuation = true;
bool simple = true, sourceTrigger = false, targetTrigger = false, ignorePunctuation = false;
if (tokens.size() >= 4) {
simple = Scan<size_t>(tokens[1]);
sourceTrigger = Scan<size_t>(tokens[2]);

View File

@ -302,11 +302,19 @@ public:
//! Load data into static instance. This function is required as LoadData() is not const
static bool LoadDataStatic(Parameter *parameter) {
std::cerr << "Load static data.." << std::endl;
return s_instance.LoadData(parameter);
std::cerr << "done.." << std::endl;
}
static void ClearDataStatic() {
std::cerr << "Clear static data.." << std::endl;
s_instance.ClearData();
std::cerr << "done.." << std::endl;
}
//! Main function to load everything. Also initialize the Parameter object
bool LoadData(Parameter *parameter);
void ClearData();
const PARAM_VEC &GetParam(const std::string &paramName) const {
return m_parameter->GetParam(paramName);
@ -477,6 +485,10 @@ public:
LMList GetLMList() const {
return m_languageModel;
}
WordPenaltyProducer* GetWordPenaltyProducer() const {
assert(m_wordPenaltyProducers.size() >= 1);
return m_wordPenaltyProducers[0];
}
size_t GetNumInputScores() const {
return m_numInputScores;
}

View File

@ -145,7 +145,6 @@ void TargetPhrase::SetScore(const ScoreProducer* translationScoreProducer,
const TranslationSystem& system = StaticData::Instance().GetTranslationSystem(TranslationSystem::DEFAULT);
std::vector<float> weightsT = system.GetTranslationWeights();
weightWP = system.GetWeightWordPenalty();
VERBOSE(1, cerr << "weightWP: " << weightWP << std::endl);
//m_transScore = std::inner_product(scoreVector.begin(), scoreVector.end(), weightT.begin(), 0.0f);
m_transScore = std::inner_product(scoreVector.begin(), scoreVector.end(), weightsT.begin(), 0.0f);

View File

@ -161,7 +161,9 @@ namespace Moses {
}
float TranslationSystem::GetWeightWordPenalty() const {
return StaticData::Instance().GetWeight(m_wpProducer);
float weightWP = StaticData::Instance().GetWeight(m_wpProducer);
VERBOSE(1, "Read weightWP from translation sytem: " << weightWP << std::endl);
return weightWP;
}
float TranslationSystem::GetWeightUnknownWordPenalty() const {
@ -175,10 +177,10 @@ namespace Moses {
std::vector<float> TranslationSystem::GetTranslationWeights() const {
std::vector<float> weights = StaticData::Instance().GetWeights(GetTranslationScoreProducer());
VERBOSE(1, cerr << "Read weightT from translation sytem.. ");
VERBOSE(1, "Read weightT from translation sytem.. ");
for (size_t i = 0; i < weights.size(); ++i)
VERBOSE(1, std::cerr << weights[i] << " ");
VERBOSE(1, std::cerr << std::endl);
VERBOSE(1, weights[i] << " ");
VERBOSE(1, std::endl);
return weights;
}
};
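This last hunk ties back to the "remove caching" part of the commit message: instead of keeping the word-penalty weight and translation weights in cached members, the accessors read them from StaticData on every call, which matters during online (MIRA) tuning because the weights change between updates. A minimal sketch of the difference, with a hypothetical global weight store standing in for StaticData (not the Moses API):

#include <iostream>
#include <map>
#include <string>

// Hypothetical weight store standing in for StaticData (not the Moses API).
std::map<std::string, float> g_weights;

// Cached style: the weight is copied into a member at construction time and
// goes stale as soon as the optimiser writes a new value into the store.
struct CachedAccess {
  float m_weightWP;
  CachedAccess() : m_weightWP(g_weights["WordPenalty"]) {}
  float GetWeightWordPenalty() const { return m_weightWP; }
};

// Fresh style (the behaviour suggested by this commit): read the current weight on every call.
struct FreshAccess {
  float GetWeightWordPenalty() const { return g_weights["WordPenalty"]; }
};

int main() {
  g_weights["WordPenalty"] = -1.0f;
  CachedAccess cached;
  FreshAccess fresh;
  g_weights["WordPenalty"] = -0.5f;  // an online weight update
  std::cout << "cached: " << cached.GetWeightWordPenalty()   // still -1.0
            << ", fresh: " << fresh.GetWeightWordPenalty()   // -0.5
            << std::endl;
  return 0;
}

With the cached accessor the update made after construction is invisible; the fresh accessor always reflects the current weights, which is what the tuning loop needs.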