mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-09-20 07:42:21 +03:00
remove multiple oracles, remove accumulating constraints
git-svn-id: http://svn.statmt.org/repository/mira@3908 cc96ff50-19ce-11e0-b349-13d7f0bd23df
This commit is contained in:
parent
72da32f0cb
commit
4e0f848d50
138
mira/Main.cpp
138
mira/Main.cpp
@ -120,34 +120,6 @@ struct RandomIndex {
|
||||
}
|
||||
};
|
||||
|
||||
/**
 * Reorders the list of input example indices in place.
 *
 * Despite the name, no random shuffle is performed: the random_shuffle call is
 * commented out below, and the function only rotates the sequence left by one
 * position (the first element is moved to the back).
 *
 * @param order      indices of the input examples; modified in place
 * @param size       unused by this function (kept for interface compatibility)
 * @param inputSize  unused by this function (kept for interface compatibility)
 */
void shuffleInput(std::vector<size_t>& order, size_t size, size_t inputSize) {
  std::cerr << "Shuffling input examples.." << std::endl;
  // RandomIndex rindex;
  // random_shuffle(order.begin(), order.end(), rindex);

  // The parameters are intentionally unused; silence unused-parameter warnings.
  (void) size;
  (void) inputSize;

  // Nothing to rotate for an empty list (previously order.at(0) threw
  // std::out_of_range here).
  if (order.empty())
    return;

  // remove first element and put it in the back
  const size_t first = order.front();
  order.erase(order.begin());
  order.push_back(first);
}
|
||||
|
||||
/**
 * Extracts the slice of `order` that belongs to one process.
 *
 * The list is split into `size` contiguous shards of (fractional) length
 * order.size() / size; the shard with index `rank` is written to `shard` and
 * echoed to stderr. The last rank always receives everything up to the end of
 * `order`.
 *
 * @param order  full list of example indices (not modified here)
 * @param size   number of shards; NOTE(review): assumed to be >= 1 — confirm at call sites
 * @param rank   index of the shard to extract
 * @param shard  output; previous contents are replaced by the selected slice
 */
void createShard(std::vector<size_t>& order, size_t size, size_t rank, std::vector<size_t>& shard) {
  // Create the shards according to the number of processes used
  const float shardSize = static_cast<float>(order.size()) / size;
  size_t shardStart = static_cast<size_t>(shardSize * rank);
  size_t shardEnd = static_cast<size_t>(shardSize * (rank + 1));

  // The last shard absorbs the remainder; also clamp so that float rounding
  // can never produce a range that lies outside of `order`.
  if (rank == size - 1 || shardEnd > order.size())
    shardEnd = order.size();
  if (shardStart > shardEnd)
    shardStart = shardEnd;

  // Size the shard from the range that is actually copied. The previous
  // shard.resize(shardSize) truncated the fractional shard size, so the copy
  // could write past the end of `shard` (e.g. 5 items, 2 processes, rank 1).
  shard.assign(order.begin() + shardStart, order.begin() + shardEnd);

  std::cerr << "order: ";
  for (size_t i = 0; i < shard.size(); ++i) {
    std::cerr << shard[i] << " ";
  }
  std::cerr << std::endl;
}
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
size_t rank = 0;
|
||||
size_t size = 1;
|
||||
@ -186,10 +158,7 @@ int main(int argc, char** argv) {
|
||||
float slack;
|
||||
float slack_step;
|
||||
float slack_min;
|
||||
size_t maxNumberOracles;
|
||||
bool accumulateMostViolatedConstraints;
|
||||
bool averageWeights;
|
||||
bool pastAndCurrentConstraints;
|
||||
bool weightConvergence;
|
||||
bool controlUpdates;
|
||||
float learning_rate;
|
||||
@ -225,62 +194,58 @@ int main(int argc, char** argv) {
|
||||
int fear_n;
|
||||
po::options_description desc("Allowed options");
|
||||
desc.add_options()
|
||||
("accumulate-most-violated-constraints", po::value<bool>(&accumulateMostViolatedConstraints)->default_value(false),"Accumulate most violated constraint per example")
|
||||
("accumulate-weights", po::value<bool>(&accumulateWeights)->default_value(false), "Accumulate and average weights over all epochs")
|
||||
("analytical-update", po::value<bool>(&analytical_update)->default_value(0), "Use one best lists and compute the update analytically")
|
||||
("average-weights", po::value<bool>(&averageWeights)->default_value(false), "Set decoder weights to average weights after each update")
|
||||
("base-of-log", po::value<size_t>(&baseOfLog)->default_value(10), "Base for log-ing feature values")
|
||||
("batch-size,b", po::value<size_t>(&batchSize)->default_value(1), "Size of batch that is send to optimiser for weight adjustments")
|
||||
("bleu-score-weight", po::value<float>(&bleuScoreWeight)->default_value(1.0), "Bleu score weight used in the decoder objective function (on top of the bleu objective weight)")
|
||||
("burn-in", po::value<bool>(&burnIn)->default_value(false), "Do a burn-in of the BLEU history before training")
|
||||
("burn-in-input-file", po::value<string>(&burnInInputFile), "Input file for burn-in phase of BLEU history")
|
||||
("burn-in-reference-files", po::value<vector<string> >(&burnInReferenceFiles), "Reference file for burn-in phase of BLEU history")
|
||||
("config,f", po::value<string>(&mosesConfigFile), "Moses ini file")
|
||||
("control-updates", po::value<bool>(&controlUpdates)->default_value(true), "Ignore updates that increase number of violated constraints AND increase the error")
|
||||
("core-weights", po::value<string>(&coreWeightFile), "Weight file containing the core weights (already tuned, have to be non-zero)")
|
||||
("decoder-settings", po::value<string>(&decoder_settings)->default_value(""), "Decoder settings for tuning runs")
|
||||
("decr-learning-rate", po::value<float>(&decrease_learning_rate)->default_value(0),"Decrease learning rate by the given value after every epoch")
|
||||
("decr-sentence-update", po::value<float>(&decrease_sentence_update)->default_value(0), "Decrease maximum weight update by the given value after every epoch")
|
||||
("dev-bleu", po::value<bool>(&devBleu)->default_value(true), "Compute BLEU score of oracle translations of the whole tuning set")
|
||||
("distinct-nbest", po::value<bool>(&distinctNbest)->default_value(true), "Use nbest list with distinct translations in inference step")
|
||||
("weight-dump-frequency", po::value<size_t>(&weightDumpFrequency)->default_value(1), "How often per epoch to dump weights, when using mpi")
|
||||
("epochs,e", po::value<size_t>(&epochs)->default_value(5), "Number of epochs")
|
||||
("fear-n", po::value<int>(&fear_n)->default_value(-1), "Number of fear translations used")
|
||||
("help", po::value(&help)->zero_tokens()->default_value(false), "Print this help message and exit")
|
||||
("history-of-1best", po::value<bool>(&historyOf1best)->default_value(0), "Use the 1best translation to update the history")
|
||||
("history-smoothing", po::value<float>(&historySmoothing)->default_value(0.9), "Adjust the factor for history smoothing")
|
||||
("hope-fear", po::value<bool>(&hope_fear)->default_value(true), "Use only hope and fear translations for optimization (not model)")
|
||||
("hope-n", po::value<int>(&hope_n)->default_value(-1), "Number of hope translations used")
|
||||
("input-file,i", po::value<string>(&inputFile), "Input file containing tokenised source")
|
||||
("learner,l", po::value<string>(&learner)->default_value("mira"), "Learning algorithm")
|
||||
("mira-learning-rate", po::value<float>(&mira_learning_rate)->default_value(1), "Learning rate for MIRA (fixed or flexible)")
|
||||
("log-feature-values", po::value<bool>(&logFeatureValues)->default_value(false), "Take log of feature values according to the given base.")
|
||||
("max-number-oracles", po::value<size_t>(&maxNumberOracles)->default_value(1), "Set a maximum number of oracles to use per example")
|
||||
("min-bleu-change", po::value<float>(&min_bleu_change)->default_value(0), "Minimum BLEU change of 1best translations of one epoch")
|
||||
("min-sentence-update", po::value<float>(&min_sentence_update)->default_value(0), "Set a minimum weight update per sentence")
|
||||
("min-learning-rate", po::value<float>(&min_learning_rate)->default_value(0), "Set a minimum learning rate")
|
||||
("max-sentence-update", po::value<float>(&max_sentence_update)->default_value(-1), "Set a maximum weight update per sentence")
|
||||
("min-weight-change", po::value<float>(&min_weight_change)->default_value(0.01), "Set minimum weight change for stopping criterion")
|
||||
("mixing-frequency", po::value<size_t>(&mixingFrequency)->default_value(1), "How often per epoch to mix weights, when using mpi")
|
||||
("model-hope-fear", po::value<bool>(&model_hope_fear)->default_value(false), "Use model, hope and fear translations for optimization")
|
||||
("nbest,n", po::value<size_t>(&n)->default_value(10), "Number of translations in nbest list")
|
||||
("normalise", po::value<bool>(&normaliseWeights)->default_value(false), "Whether to normalise the updated weights before passing them to the decoder")
|
||||
("only-violated-constraints", po::value<bool>(&onlyViolatedConstraints)->default_value(false), "Add only violated constraints to the optimisation problem")
|
||||
("past-and-current-constraints", po::value<bool>(&pastAndCurrentConstraints)->default_value(false), "Accumulate most violated constraint per example and use them along all current constraints")
|
||||
("perceptron-learning-rate", po::value<float>(&perceptron_learning_rate)->default_value(0.01), "Perceptron learning rate")
|
||||
("precision", po::value<float>(&precision)->default_value(0), "Precision when comparing left and right hand side of constraints")
|
||||
("print-feature-values", po::value<bool>(&print_feature_values)->default_value(false), "Print out feature values")
|
||||
("reference-files,r", po::value<vector<string> >(&referenceFiles), "Reference translation files for training")
|
||||
("scale-by-input-length", po::value<bool>(&scaleByInputLength)->default_value(true), "Scale the BLEU score by a history of the input lengths")
|
||||
("sentence-level-bleu", po::value<bool>(&sentenceLevelBleu)->default_value(false), "Use a sentences level bleu scoring function")
|
||||
("shuffle", po::value<bool>(&shuffle)->default_value(false), "Shuffle input sentences before processing")
|
||||
("accumulate-weights", po::value<bool>(&accumulateWeights)->default_value(false), "Accumulate and average weights over all epochs")
|
||||
("analytical-update", po::value<bool>(&analytical_update)->default_value(0), "Use one best lists and compute the update analytically")
|
||||
("average-weights", po::value<bool>(&averageWeights)->default_value(false), "Set decoder weights to average weights after each update")
|
||||
("base-of-log", po::value<size_t>(&baseOfLog)->default_value(10), "Base for log-ing feature values")
|
||||
("batch-size,b", po::value<size_t>(&batchSize)->default_value(1), "Size of batch that is send to optimiser for weight adjustments")
|
||||
("bleu-score-weight", po::value<float>(&bleuScoreWeight)->default_value(1.0), "Bleu score weight used in the decoder objective function (on top of the bleu objective weight)")
|
||||
("burn-in", po::value<bool>(&burnIn)->default_value(false), "Do a burn-in of the BLEU history before training")
|
||||
("burn-in-input-file", po::value<string>(&burnInInputFile), "Input file for burn-in phase of BLEU history")
|
||||
("burn-in-reference-files", po::value<vector<string> >(&burnInReferenceFiles), "Reference file for burn-in phase of BLEU history")
|
||||
("config,f", po::value<string>(&mosesConfigFile), "Moses ini file")
|
||||
("control-updates", po::value<bool>(&controlUpdates)->default_value(true), "Ignore updates that increase number of violated constraints AND increase the error")
|
||||
("core-weights", po::value<string>(&coreWeightFile), "Weight file containing the core weights (already tuned, have to be non-zero)")
|
||||
("decoder-settings", po::value<string>(&decoder_settings)->default_value(""), "Decoder settings for tuning runs")
|
||||
("decr-learning-rate", po::value<float>(&decrease_learning_rate)->default_value(0),"Decrease learning rate by the given value after every epoch")
|
||||
("decr-sentence-update", po::value<float>(&decrease_sentence_update)->default_value(0), "Decrease maximum weight update by the given value after every epoch")
|
||||
("dev-bleu", po::value<bool>(&devBleu)->default_value(true), "Compute BLEU score of oracle translations of the whole tuning set")
|
||||
("distinct-nbest", po::value<bool>(&distinctNbest)->default_value(true), "Use nbest list with distinct translations in inference step")
|
||||
("weight-dump-frequency", po::value<size_t>(&weightDumpFrequency)->default_value(1), "How often per epoch to dump weights, when using mpi")
|
||||
("epochs,e", po::value<size_t>(&epochs)->default_value(5), "Number of epochs")
|
||||
("fear-n", po::value<int>(&fear_n)->default_value(-1), "Number of fear translations used")
|
||||
("help", po::value(&help)->zero_tokens()->default_value(false), "Print this help message and exit")
|
||||
("history-of-1best", po::value<bool>(&historyOf1best)->default_value(0), "Use the 1best translation to update the history")
|
||||
("history-smoothing", po::value<float>(&historySmoothing)->default_value(0.9), "Adjust the factor for history smoothing")
|
||||
("hope-fear", po::value<bool>(&hope_fear)->default_value(true), "Use only hope and fear translations for optimization (not model)")
|
||||
("hope-n", po::value<int>(&hope_n)->default_value(-1), "Number of hope translations used")
|
||||
("input-file,i", po::value<string>(&inputFile), "Input file containing tokenised source")
|
||||
("learner,l", po::value<string>(&learner)->default_value("mira"), "Learning algorithm")
|
||||
("mira-learning-rate", po::value<float>(&mira_learning_rate)->default_value(1), "Learning rate for MIRA (fixed or flexible)")
|
||||
("log-feature-values", po::value<bool>(&logFeatureValues)->default_value(false), "Take log of feature values according to the given base.")
|
||||
("min-bleu-change", po::value<float>(&min_bleu_change)->default_value(0), "Minimum BLEU change of 1best translations of one epoch")
|
||||
("min-sentence-update", po::value<float>(&min_sentence_update)->default_value(0), "Set a minimum weight update per sentence")
|
||||
("min-learning-rate", po::value<float>(&min_learning_rate)->default_value(0), "Set a minimum learning rate")
|
||||
("max-sentence-update", po::value<float>(&max_sentence_update)->default_value(-1), "Set a maximum weight update per sentence")
|
||||
("min-weight-change", po::value<float>(&min_weight_change)->default_value(0.01), "Set minimum weight change for stopping criterion")
|
||||
("mixing-frequency", po::value<size_t>(&mixingFrequency)->default_value(1), "How often per epoch to mix weights, when using mpi")
|
||||
("model-hope-fear", po::value<bool>(&model_hope_fear)->default_value(false), "Use model, hope and fear translations for optimization")
|
||||
("nbest,n", po::value<size_t>(&n)->default_value(10), "Number of translations in nbest list")
|
||||
("normalise", po::value<bool>(&normaliseWeights)->default_value(false), "Whether to normalise the updated weights before passing them to the decoder")
|
||||
("only-violated-constraints", po::value<bool>(&onlyViolatedConstraints)->default_value(false), "Add only violated constraints to the optimisation problem")
|
||||
("perceptron-learning-rate", po::value<float>(&perceptron_learning_rate)->default_value(0.01), "Perceptron learning rate")
|
||||
("precision", po::value<float>(&precision)->default_value(0), "Precision when comparing left and right hand side of constraints")
|
||||
("print-feature-values", po::value<bool>(&print_feature_values)->default_value(false), "Print out feature values")
|
||||
("reference-files,r", po::value<vector<string> >(&referenceFiles), "Reference translation files for training")
|
||||
("scale-by-input-length", po::value<bool>(&scaleByInputLength)->default_value(true), "Scale the BLEU score by a history of the input lengths")
|
||||
("sentence-level-bleu", po::value<bool>(&sentenceLevelBleu)->default_value(false), "Use a sentences level bleu scoring function")
|
||||
("shuffle", po::value<bool>(&shuffle)->default_value(false), "Shuffle input sentences before processing")
|
||||
("slack", po::value<float>(&slack)->default_value(0.01), "Use slack in optimizer")
|
||||
("slack-min", po::value<float>(&slack_min)->default_value(0.01), "Minimum slack used")
|
||||
("slack-step", po::value<float>(&slack_step)->default_value(0), "Increase slack from epoch to epoch by the value provided")
|
||||
("stop-dev-bleu", po::value<bool>(&stop_dev_bleu)->default_value(false), "Stop when average Bleu (dev) decreases (or no more increases)")
|
||||
("stop-approx-dev-bleu", po::value<bool>(&stop_approx_dev_bleu)->default_value(false), "Stop when average approx. sentence Bleu (dev) decreases (or no more increases)")
|
||||
("stop-weights", po::value<bool>(&weightConvergence)->default_value(true), "Stop when weights converge")
|
||||
("train-linear-classifier", po::value<bool>(&train_linear_classifier)->default_value(false), "Test algorithm for linear classification")
|
||||
("use-scaled-reference", po::value<bool>(&useScaledReference)->default_value(true), "Use scaled reference length for comparing target and reference length of phrases")
|
||||
("verbosity,v", po::value<int>(&verbosity)->default_value(0), "Verbosity level")
|
||||
("scale-margin", po::value<size_t>(&scale_margin)->default_value(0), "Scale the margin by the Bleu score of the oracle translation")
|
||||
@ -389,8 +354,7 @@ int main(int argc, char** argv) {
|
||||
Optimiser* optimiser = NULL;
|
||||
if (learner == "mira") {
|
||||
cerr << "Optimising using Mira" << endl;
|
||||
optimiser = new MiraOptimiser(onlyViolatedConstraints, slack, scale_margin, scale_update,
|
||||
maxNumberOracles, accumulateMostViolatedConstraints, pastAndCurrentConstraints, order.size(), precision);
|
||||
optimiser = new MiraOptimiser(onlyViolatedConstraints, slack, scale_margin, scale_update, precision);
|
||||
learning_rate = mira_learning_rate;
|
||||
perceptron_update = false;
|
||||
} else if (learner == "perceptron") {
|
||||
@ -407,12 +371,6 @@ int main(int argc, char** argv) {
|
||||
}
|
||||
|
||||
// resolve parameter dependencies
|
||||
|
||||
if (accumulateMostViolatedConstraints && pastAndCurrentConstraints) {
|
||||
cerr << "Error: the parameters --accumulate-most-violated-constraints and --past-and-current-constraints are mutually exclusive" << endl;
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (perceptron_update || analytical_update) {
|
||||
batchSize = 1;
|
||||
cerr << "Setting batch size to 1 for perceptron/analytical update" << endl;
|
||||
@ -542,7 +500,7 @@ int main(int argc, char** argv) {
|
||||
mpi::broadcast(world, order, 0);
|
||||
#endif
|
||||
|
||||
// Create the shards according to the number of processes used
|
||||
// Create shards according to the number of processes used
|
||||
vector<size_t> shard;
|
||||
float shardSize = (float) (order.size()) / size;
|
||||
VERBOSE(1, "Shard size: " << shardSize << endl);
|
||||
|
@ -19,43 +19,6 @@ vector<int> MiraOptimiser::updateWeights(ScoreComponentCollection& currWeights,
|
||||
size_t epoch,
|
||||
bool controlUpdates) {
|
||||
|
||||
// add every oracle in batch to list of oracles (under certain conditions)
|
||||
for (size_t i = 0; i < oracleFeatureValues.size(); ++i) {
|
||||
float newWeightedScore = oracleFeatureValues[i].InnerProduct(currWeights);
|
||||
size_t sentenceId = sentenceIds[i];
|
||||
|
||||
// compare new oracle with existing oracles:
|
||||
// if same translation exists, just update the bleu score
|
||||
// if not, add the oracle
|
||||
bool updated = false;
|
||||
size_t indexOfWorst = 0;
|
||||
float worstWeightedScore = 0;
|
||||
for (size_t j = 0; j < m_oracles[sentenceId].size(); ++j) {
|
||||
float currentWeightedScore = m_oracles[sentenceId][j].InnerProduct(currWeights);
|
||||
if (currentWeightedScore == newWeightedScore) {
|
||||
cerr << "Rank " << rank << ", epoch " << epoch << ", bleu score of oracle updated at batch position " << i << ", " << m_bleu_of_oracles[sentenceId][j] << " --> " << oracleBleuScores[j] << endl;
|
||||
m_bleu_of_oracles[sentenceId][j] = oracleBleuScores[j];
|
||||
updated = true;
|
||||
break;
|
||||
} else if (worstWeightedScore == 0 || currentWeightedScore
|
||||
> worstWeightedScore) {
|
||||
worstWeightedScore = currentWeightedScore;
|
||||
indexOfWorst = j;
|
||||
}
|
||||
}
|
||||
|
||||
if (!updated) {
|
||||
// add if number of maximum oracles not exceeded, otherwise override the worst
|
||||
if (m_max_number_oracles > m_oracles[sentenceId].size()) {
|
||||
m_oracles[sentenceId].push_back(oracleFeatureValues[i]);
|
||||
m_bleu_of_oracles[sentenceId].push_back(oracleBleuScores[i]);
|
||||
} else {
|
||||
m_oracles[sentenceId][indexOfWorst] = oracleFeatureValues[i];
|
||||
m_bleu_of_oracles[sentenceId][indexOfWorst] = oracleBleuScores[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// vector of feature values differences for all created constraints
|
||||
vector<ScoreComponentCollection> featureValueDiffs;
|
||||
vector<float> lossMinusModelScoreDiffs;
|
||||
@ -63,7 +26,6 @@ vector<int> MiraOptimiser::updateWeights(ScoreComponentCollection& currWeights,
|
||||
|
||||
// most violated constraint in batch
|
||||
ScoreComponentCollection max_batch_featureValueDiff;
|
||||
float max_batch_loss = -1;
|
||||
float max_batch_lossMinusModelScoreDiff = -1;
|
||||
|
||||
// Make constraints for new hypothesis translations
|
||||
@ -72,21 +34,19 @@ vector<int> MiraOptimiser::updateWeights(ScoreComponentCollection& currWeights,
|
||||
float oldDistanceFromOptimum = 0;
|
||||
// iterate over input sentences (1 (online) or more (batch))
|
||||
for (size_t i = 0; i < featureValues.size(); ++i) {
|
||||
size_t sentenceId = sentenceIds[i];
|
||||
if (m_oracles[sentenceId].size() > 1)
|
||||
cerr << "Rank " << rank << ", available oracles for source sentence " << sentenceId << ": " << m_oracles[sentenceId].size() << endl;
|
||||
//size_t sentenceId = sentenceIds[i];
|
||||
// iterate over hypothesis translations for one input sentence
|
||||
for (size_t j = 0; j < featureValues[i].size(); ++j) {
|
||||
for (size_t k = 0; k < m_oracles[sentenceId].size(); ++k) {
|
||||
ScoreComponentCollection featureValueDiff = m_oracles[sentenceId][k];
|
||||
featureValueDiff.MinusEquals(featureValues[i][j]);
|
||||
cerr << "feature value diff: " << featureValueDiff << endl;
|
||||
if (featureValueDiff.GetL1Norm() == 0) {
|
||||
cerr << "Equal feature values, constraint skipped.." << endl;
|
||||
continue;
|
||||
}
|
||||
ScoreComponentCollection featureValueDiff = oracleFeatureValues[i];
|
||||
featureValueDiff.MinusEquals(featureValues[i][j]);
|
||||
|
||||
float loss = losses[i][j];
|
||||
cerr << "feature value diff: " << featureValueDiff << endl;
|
||||
if (featureValueDiff.GetL1Norm() == 0) {
|
||||
cerr << "Equal feature values, constraint skipped.." << endl;
|
||||
continue;
|
||||
}
|
||||
|
||||
float loss = losses[i][j];
|
||||
if (m_scale_margin == 1) {
|
||||
loss *= oracleBleuScores[i];
|
||||
cerr << "Scaling margin with oracle bleu score " << oracleBleuScores[i] << endl;
|
||||
@ -101,128 +61,31 @@ vector<int> MiraOptimiser::updateWeights(ScoreComponentCollection& currWeights,
|
||||
}
|
||||
|
||||
// check if constraint is violated
|
||||
bool violated = false;
|
||||
bool addConstraint = true;
|
||||
float modelScoreDiff = featureValueDiff.InnerProduct(currWeights);
|
||||
float diff = loss - (modelScoreDiff + m_precision);
|
||||
cerr << "constraint: " << (modelScoreDiff + m_precision) << " >= " << loss << endl;
|
||||
if (diff > epsilon) {
|
||||
violated = true;
|
||||
cerr << "Rank " << rank << ", epoch " << epoch << ", current violation: " << diff << endl;
|
||||
}
|
||||
else if (m_onlyViolatedConstraints) {
|
||||
addConstraint = false;
|
||||
}
|
||||
|
||||
float lossMinusModelScoreDiff = loss - (modelScoreDiff + m_precision);
|
||||
if (violated) {
|
||||
if (m_accumulateMostViolatedConstraints || m_pastAndCurrentConstraints) {
|
||||
// find the most violated constraint per batch
|
||||
if (lossMinusModelScoreDiff > max_batch_lossMinusModelScoreDiff) {
|
||||
max_batch_lossMinusModelScoreDiff = lossMinusModelScoreDiff;
|
||||
max_batch_featureValueDiff = featureValueDiff;
|
||||
max_batch_loss = loss;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (addConstraint && !m_accumulateMostViolatedConstraints) {
|
||||
featureValueDiffs.push_back(featureValueDiff);
|
||||
lossMinusModelScoreDiffs.push_back(lossMinusModelScoreDiff);
|
||||
all_losses.push_back(loss);
|
||||
|
||||
if (violated) {
|
||||
++violatedConstraintsBefore;
|
||||
oldDistanceFromOptimum += diff;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (m_pastAndCurrentConstraints || m_accumulateMostViolatedConstraints) {
|
||||
cerr << "Rank " << rank << ", epoch " << epoch << ", number of current constraints: " << featureValueDiffs.size() << endl;
|
||||
cerr << "Rank " << rank << ", epoch " << epoch << ", number of current violated constraints: " << violatedConstraintsBefore << endl;
|
||||
}
|
||||
|
||||
if (m_max_number_oracles == 1) {
|
||||
for (size_t k = 0; k < sentenceIds.size(); ++k) {
|
||||
size_t sentenceId = sentenceIds[k];
|
||||
m_oracles[sentenceId].clear();
|
||||
}
|
||||
}
|
||||
|
||||
size_t pastViolatedConstraints = 0;
|
||||
// Add constraints from past iterations (BEFORE updating that list)
|
||||
if (m_pastAndCurrentConstraints || m_accumulateMostViolatedConstraints) {
|
||||
// add all past (most violated) constraints to the list of current constraints, computed with current weights!
|
||||
for (size_t i = 0; i < m_featureValueDiffs.size(); ++i) {
|
||||
float modelScoreDiff = m_featureValueDiffs[i].InnerProduct(currWeights);
|
||||
|
||||
// check if constraint is violated
|
||||
bool violated = false;
|
||||
bool addConstraint = true;
|
||||
float diff = m_losses[i] - (modelScoreDiff + m_precision);
|
||||
if (diff > epsilon) {
|
||||
violated = true;
|
||||
cerr << "Rank " << rank << ", epoch " << epoch << ", past violation: " << diff << endl;
|
||||
}
|
||||
else if (m_onlyViolatedConstraints) {
|
||||
addConstraint = false;
|
||||
bool violated = false;
|
||||
bool addConstraint = true;
|
||||
float modelScoreDiff = featureValueDiff.InnerProduct(currWeights);
|
||||
float diff = loss - (modelScoreDiff + m_precision);
|
||||
cerr << "constraint: " << (modelScoreDiff + m_precision) << " >= " << loss << endl;
|
||||
if (diff > epsilon) {
|
||||
violated = true;
|
||||
cerr << "Rank " << rank << ", epoch " << epoch << ", current violation: " << diff << endl;
|
||||
}
|
||||
else if (m_onlyViolatedConstraints) {
|
||||
addConstraint = false;
|
||||
}
|
||||
|
||||
if (addConstraint) {
|
||||
featureValueDiffs.push_back(m_featureValueDiffs[i]);
|
||||
lossMinusModelScoreDiffs.push_back(m_losses[i] - (modelScoreDiff + m_precision));
|
||||
all_losses.push_back(m_losses[i]);
|
||||
// cerr << "old constraint: " << (modelScoreDiff + m_precision) << " >= " << m_losses[i] << endl;
|
||||
float lossMinusModelScoreDiff = loss - (modelScoreDiff + m_precision);
|
||||
if (addConstraint) {
|
||||
featureValueDiffs.push_back(featureValueDiff);
|
||||
lossMinusModelScoreDiffs.push_back(lossMinusModelScoreDiff);
|
||||
all_losses.push_back(loss);
|
||||
|
||||
if (violated) {
|
||||
++violatedConstraintsBefore;
|
||||
++pastViolatedConstraints;
|
||||
oldDistanceFromOptimum += diff;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (m_pastAndCurrentConstraints || m_accumulateMostViolatedConstraints) {
|
||||
cerr << "Rank " << rank << ", epoch " << epoch << ", number of past constraints: " << m_featureValueDiffs.size() << endl;
|
||||
cerr << "Rank " << rank << ", epoch " << epoch << ", number of past violated constraints: " << pastViolatedConstraints << endl;
|
||||
}
|
||||
|
||||
// Add new most violated constraint to the list of current constraints
|
||||
if (m_accumulateMostViolatedConstraints) {
|
||||
if (max_batch_loss != -1) {
|
||||
float modelScoreDiff = max_batch_featureValueDiff.InnerProduct(currWeights);
|
||||
float diff = max_batch_loss - (modelScoreDiff + m_precision);
|
||||
++violatedConstraintsBefore;
|
||||
oldDistanceFromOptimum += diff;
|
||||
|
||||
featureValueDiffs.push_back(max_batch_featureValueDiff);
|
||||
lossMinusModelScoreDiffs.push_back(max_batch_loss - (modelScoreDiff + m_precision));
|
||||
all_losses.push_back(max_batch_loss);
|
||||
// cerr << "new constraint: " << (modelScoreDiff + m_precision) << " !>= " << max_batch_loss << endl;
|
||||
}
|
||||
}
|
||||
|
||||
// Update the list of accumulated most violated constraints
|
||||
if (max_batch_loss != -1) {
|
||||
bool updated = false;
|
||||
for (size_t i = 0; i < m_featureValueDiffs.size(); ++i) {
|
||||
float oldScore = m_featureValueDiffs[i].InnerProduct(currWeights);
|
||||
float newScore = max_batch_featureValueDiff.InnerProduct(currWeights);
|
||||
if (abs(oldScore-newScore) < epsilon) {
|
||||
m_losses[i] = max_batch_loss;
|
||||
updated = true;
|
||||
break;
|
||||
if (violated) {
|
||||
++violatedConstraintsBefore;
|
||||
oldDistanceFromOptimum += diff;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!updated) {
|
||||
m_featureValueDiffs.push_back(max_batch_featureValueDiff);
|
||||
m_losses.push_back(max_batch_loss);
|
||||
}
|
||||
}
|
||||
|
||||
// run optimisation: compute alphas for all given constraints
|
||||
@ -284,7 +147,7 @@ vector<int> MiraOptimiser::updateWeights(ScoreComponentCollection& currWeights,
|
||||
statusPlus[1] = -1;
|
||||
statusPlus[2] = -1;
|
||||
return statusPlus;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// apply learning rate
|
||||
@ -302,7 +165,7 @@ vector<int> MiraOptimiser::updateWeights(ScoreComponentCollection& currWeights,
|
||||
}
|
||||
|
||||
// scale update by BLEU of oracle
|
||||
if (oracleBleuScores.size() == 1 && m_max_number_oracles == 1 && m_scale_update) { // scale only if just 1 oracle is used
|
||||
if (oracleBleuScores.size() == 1 && m_scale_update) {
|
||||
cerr << "Scaling summed update with log10 oracle bleu score " << log10(oracleBleuScores[0]) << endl;
|
||||
summedUpdate.MultiplyEquals(log10(oracleBleuScores[0]));
|
||||
}
|
||||
@ -338,7 +201,6 @@ vector<int> MiraOptimiser::updateWeightsHopeFear(Moses::ScoreComponentCollection
|
||||
|
||||
// most violated constraint in batch
|
||||
ScoreComponentCollection max_batch_featureValueDiff;
|
||||
float max_batch_loss = -1;
|
||||
float max_batch_lossMinusModelScoreDiff = -1;
|
||||
|
||||
// Make constraints for new hypothesis translations
|
||||
@ -348,8 +210,7 @@ vector<int> MiraOptimiser::updateWeightsHopeFear(Moses::ScoreComponentCollection
|
||||
|
||||
// iterate over input sentences (1 (online) or more (batch))
|
||||
for (size_t i = 0; i < featureValuesHope.size(); ++i) {
|
||||
size_t sentenceId = sentenceIds[i];
|
||||
|
||||
size_t sentenceId = sentenceIds[i]; // keep sentenceId for storing more than 1 oracle..
|
||||
// Pair all hope translations with all fear translations for one input sentence
|
||||
for (size_t j = 0; j < featureValuesHope[i].size(); ++j) {
|
||||
for (size_t k = 0; k < featureValuesFear[i].size(); ++k) {
|
||||
@ -362,20 +223,20 @@ vector<int> MiraOptimiser::updateWeightsHopeFear(Moses::ScoreComponentCollection
|
||||
}
|
||||
|
||||
float loss = bleuScoresHope[i][j] - bleuScoresFear[i][k];
|
||||
if (m_scale_margin == 1) {
|
||||
loss *= bleuScoresHope[i][j];
|
||||
cerr << "Scaling margin with oracle bleu score " << bleuScoresHope[i][j] << endl;
|
||||
}
|
||||
else if (m_scale_margin == 2) {
|
||||
loss *= log2(bleuScoresHope[i][j]);
|
||||
cerr << "Scaling margin with log2 oracle bleu score " << log2(bleuScoresHope[i][j]) << endl;
|
||||
}
|
||||
else if (m_scale_margin == 10) {
|
||||
loss *= log10(bleuScoresHope[i][j]);
|
||||
cerr << "Scaling margin with log10 oracle bleu score " << log10(bleuScoresHope[i][j]) << endl;
|
||||
}
|
||||
if (m_scale_margin == 1) {
|
||||
loss *= bleuScoresHope[i][j];
|
||||
cerr << "Scaling margin with oracle bleu score " << bleuScoresHope[i][j] << endl;
|
||||
}
|
||||
else if (m_scale_margin == 2) {
|
||||
loss *= log2(bleuScoresHope[i][j]);
|
||||
cerr << "Scaling margin with log2 oracle bleu score " << log2(bleuScoresHope[i][j]) << endl;
|
||||
}
|
||||
else if (m_scale_margin == 10) {
|
||||
loss *= log10(bleuScoresHope[i][j]);
|
||||
cerr << "Scaling margin with log10 oracle bleu score " << log10(bleuScoresHope[i][j]) << endl;
|
||||
}
|
||||
|
||||
// check if constraint is violated
|
||||
// check if constraint is violated
|
||||
bool violated = false;
|
||||
bool addConstraint = true;
|
||||
float modelScoreDiff = featureValueDiff.InnerProduct(currWeights);
|
||||
@ -390,18 +251,7 @@ vector<int> MiraOptimiser::updateWeightsHopeFear(Moses::ScoreComponentCollection
|
||||
}
|
||||
|
||||
float lossMinusModelScoreDiff = loss - (modelScoreDiff + m_precision);
|
||||
if (violated) {
|
||||
if (m_accumulateMostViolatedConstraints || m_pastAndCurrentConstraints) {
|
||||
// find the most violated constraint per batch
|
||||
if (lossMinusModelScoreDiff > max_batch_lossMinusModelScoreDiff) {
|
||||
max_batch_lossMinusModelScoreDiff = lossMinusModelScoreDiff;
|
||||
max_batch_featureValueDiff = featureValueDiff;
|
||||
max_batch_loss = loss;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (addConstraint && !m_accumulateMostViolatedConstraints) {
|
||||
if (addConstraint) {
|
||||
featureValueDiffs.push_back(featureValueDiff);
|
||||
lossMinusModelScoreDiffs.push_back(lossMinusModelScoreDiff);
|
||||
all_losses.push_back(loss);
|
||||
@ -415,84 +265,6 @@ vector<int> MiraOptimiser::updateWeightsHopeFear(Moses::ScoreComponentCollection
|
||||
}
|
||||
}
|
||||
|
||||
if (m_pastAndCurrentConstraints || m_accumulateMostViolatedConstraints) {
|
||||
cerr << "Rank " << rank << ", epoch " << epoch << ", number of current constraints: " << featureValueDiffs.size() << endl;
|
||||
cerr << "Rank " << rank << ", epoch " << epoch << ", number of current violated constraints: " << violatedConstraintsBefore << endl;
|
||||
}
|
||||
|
||||
size_t pastViolatedConstraints = 0;
|
||||
// Add constraints from past iterations (BEFORE updating that list)
|
||||
if (m_pastAndCurrentConstraints || m_accumulateMostViolatedConstraints) {
|
||||
// add all past (most violated) constraints to the list of current constraints, computed with current weights!
|
||||
for (size_t i = 0; i < m_featureValueDiffs.size(); ++i) {
|
||||
float modelScoreDiff = m_featureValueDiffs[i].InnerProduct(currWeights);
|
||||
|
||||
// check if constraint is violated
|
||||
bool violated = false;
|
||||
bool addConstraint = true;
|
||||
float diff = m_losses[i] - (modelScoreDiff + m_precision);
|
||||
if (diff > epsilon) {
|
||||
violated = true;
|
||||
cerr << "Rank " << rank << ", epoch " << epoch << ", past violation: " << diff << endl;
|
||||
}
|
||||
else if (m_onlyViolatedConstraints) {
|
||||
addConstraint = false;
|
||||
}
|
||||
|
||||
if (addConstraint) {
|
||||
featureValueDiffs.push_back(m_featureValueDiffs[i]);
|
||||
lossMinusModelScoreDiffs.push_back(m_losses[i] - (modelScoreDiff + m_precision));
|
||||
all_losses.push_back(m_losses[i]);
|
||||
// cerr << "old constraint: " << (modelScoreDiff + m_precision) << " >= " << m_losses[i] << endl;
|
||||
|
||||
if (violated) {
|
||||
++violatedConstraintsBefore;
|
||||
++pastViolatedConstraints;
|
||||
oldDistanceFromOptimum += diff;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (m_pastAndCurrentConstraints || m_accumulateMostViolatedConstraints) {
|
||||
cerr << "Rank " << rank << ", epoch " << epoch << ", number of past constraints: " << m_featureValueDiffs.size() << endl;
|
||||
cerr << "Rank " << rank << ", epoch " << epoch << ", number of past violated constraints: " << pastViolatedConstraints << endl;
|
||||
}
|
||||
|
||||
// Add new most violated constraint to the list of current constraints
|
||||
if (m_accumulateMostViolatedConstraints) {
|
||||
if (max_batch_loss != -1) {
|
||||
float modelScoreDiff = max_batch_featureValueDiff.InnerProduct(currWeights);
|
||||
float diff = max_batch_loss - (modelScoreDiff + m_precision);
|
||||
++violatedConstraintsBefore;
|
||||
oldDistanceFromOptimum += diff;
|
||||
|
||||
featureValueDiffs.push_back(max_batch_featureValueDiff);
|
||||
lossMinusModelScoreDiffs.push_back(max_batch_loss - (modelScoreDiff + m_precision));
|
||||
all_losses.push_back(max_batch_loss);
|
||||
// cerr << "new constraint: " << (modelScoreDiff + m_precision) << " !>= " << max_batch_loss << endl;
|
||||
}
|
||||
}
|
||||
|
||||
// Update the list of accumulated most violated constraints
|
||||
if (max_batch_loss != -1) {
|
||||
bool updated = false;
|
||||
for (size_t i = 0; i < m_featureValueDiffs.size(); ++i) {
|
||||
float oldScore = m_featureValueDiffs[i].InnerProduct(currWeights);
|
||||
float newScore = max_batch_featureValueDiff.InnerProduct(currWeights);
|
||||
if (abs(oldScore-newScore) < epsilon) {
|
||||
m_losses[i] = max_batch_loss;
|
||||
updated = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!updated) {
|
||||
m_featureValueDiffs.push_back(max_batch_featureValueDiff);
|
||||
m_losses.push_back(max_batch_loss);
|
||||
}
|
||||
}
|
||||
|
||||
// run optimisation: compute alphas for all given constraints
|
||||
vector<float> alphas;
|
||||
ScoreComponentCollection summedUpdate;
|
||||
@ -515,14 +287,14 @@ vector<int> MiraOptimiser::updateWeightsHopeFear(Moses::ScoreComponentCollection
|
||||
|
||||
// scale update by BLEU of hope translation (only two cases defined at the moment)
|
||||
if (featureValuesHope.size() == 1 && m_scale_update) { // only defined for batch size 1)
|
||||
if (featureValuesHope[0].size() == 1) {
|
||||
cerr << "Scaling update with log10 oracle bleu score " << log10(bleuScoresHope[0][0]) << endl; // only 1 oracle
|
||||
update.MultiplyEquals(log10(bleuScoresHope[0][0]));
|
||||
} else if (featureValuesFear[0].size() == 1) {
|
||||
cerr << "Scaling update with log10 oracle bleu score " << log10(bleuScoresHope[0][k]) << endl; // k oracles
|
||||
update.MultiplyEquals(log10(bleuScoresHope[0][k]));
|
||||
}
|
||||
if (featureValuesHope[0].size() == 1) {
|
||||
cerr << "Scaling update with log10 oracle bleu score " << log10(bleuScoresHope[0][0]) << endl; // only 1 oracle
|
||||
update.MultiplyEquals(log10(bleuScoresHope[0][0]));
|
||||
} else if (featureValuesFear[0].size() == 1) {
|
||||
cerr << "Scaling update with log10 oracle bleu score " << log10(bleuScoresHope[0][k]) << endl; // k oracles
|
||||
update.MultiplyEquals(log10(bleuScoresHope[0][k]));
|
||||
}
|
||||
}
|
||||
|
||||
// sum up update
|
||||
summedUpdate.PlusEquals(update);
|
||||
@ -593,7 +365,7 @@ vector<int> MiraOptimiser::updateWeightsHopeFear(Moses::ScoreComponentCollection
|
||||
}
|
||||
|
||||
vector<int> MiraOptimiser::updateWeightsAnalytically(ScoreComponentCollection& currWeights,
|
||||
ScoreComponentCollection& featureValuesHope,
|
||||
ScoreComponentCollection& featureValuesHope,
|
||||
ScoreComponentCollection& featureValuesFear,
|
||||
float bleuScoreHope,
|
||||
float bleuScoreFear,
|
||||
|
@ -63,17 +63,12 @@ namespace Mira {
|
||||
MiraOptimiser() :
|
||||
Optimiser() { }
|
||||
|
||||
MiraOptimiser(bool onlyViolatedConstraints, float slack, size_t scale_margin, bool scale_update, size_t maxNumberOracles, bool accumulateMostViolatedConstraints, bool pastAndCurrentConstraints, size_t exampleSize, float precision) :
|
||||
MiraOptimiser(bool onlyViolatedConstraints, float slack, size_t scale_margin, bool scale_update, float precision) :
|
||||
Optimiser(),
|
||||
m_onlyViolatedConstraints(onlyViolatedConstraints),
|
||||
m_slack(slack),
|
||||
m_scale_margin(scale_margin),
|
||||
m_scale_update(scale_update),
|
||||
m_max_number_oracles(maxNumberOracles),
|
||||
m_accumulateMostViolatedConstraints(accumulateMostViolatedConstraints),
|
||||
m_pastAndCurrentConstraints(pastAndCurrentConstraints),
|
||||
m_oracles(exampleSize),
|
||||
m_bleu_of_oracles(exampleSize),
|
||||
m_precision(precision) { }
|
||||
|
||||
std::vector<int> updateWeightsAnalytically(Moses::ScoreComponentCollection& currWeights,
|
||||
@ -125,21 +120,6 @@ namespace Mira {
|
||||
|
||||
size_t m_scale_margin;
|
||||
|
||||
// keep a list of oracle translations over epochs
|
||||
std::vector < std::vector< Moses::ScoreComponentCollection> > m_oracles;
|
||||
|
||||
std::vector < std::vector< float> > m_bleu_of_oracles;
|
||||
|
||||
size_t m_max_number_oracles;
|
||||
|
||||
// accumulate most violated constraints for every example
|
||||
std::vector< Moses::ScoreComponentCollection> m_featureValueDiffs;
|
||||
std::vector< float> m_losses;
|
||||
|
||||
bool m_accumulateMostViolatedConstraints;
|
||||
|
||||
bool m_pastAndCurrentConstraints;
|
||||
|
||||
float m_precision;
|
||||
|
||||
// scale update with log 10 of oracle BLEU score
|
||||
|
Loading…
Reference in New Issue
Block a user