remove multiple oracles, remove accumulating constraints

git-svn-id: http://svn.statmt.org/repository/mira@3908 cc96ff50-19ce-11e0-b349-13d7f0bd23df
ehasler 2011-06-10 14:14:40 +00:00 committed by Ondrej Bojar
parent 72da32f0cb
commit 4e0f848d50
3 changed files with 104 additions and 394 deletions


@@ -120,34 +120,6 @@ struct RandomIndex {
}
};
void shuffleInput(vector<size_t>& order, size_t size, size_t inputSize) {
cerr << "Shuffling input examples.." << endl;
// RandomIndex rindex;
// random_shuffle(order.begin(), order.end(), rindex);
// remove first element and put it in the back
size_t first = order.at(0);
size_t index = 0;
order.erase(order.begin());
order.push_back(first);
}
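Note that the random_shuffle call above is commented out, so despite the log message this helper currently only rotates the first index to the back of the order. A re-enabled shuffle would look roughly like the sketch below (RandomIndex is the functor from the struct at the top of this hunk, assumed to return a random index below its argument):

// Hypothetical re-enabled shuffle, using the RandomIndex functor defined above.
void shuffleInputRandomly(vector<size_t>& order) {
	RandomIndex rindex;                                  // assumed: operator()(n) returns a value in [0, n)
	random_shuffle(order.begin(), order.end(), rindex);
}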
void createShard(vector<size_t>& order, size_t size, size_t rank, vector<size_t>& shard) {
// Create the shards according to the number of processes used
float shardSize = (float) (order.size()) / size;
size_t shardStart = (size_t) (shardSize * rank);
size_t shardEnd = (size_t) (shardSize * (rank + 1));
if (rank == size - 1)
shardEnd = order.size();
shard.resize(shardEnd - shardStart); // the last rank may receive more than floor(shardSize) elements
copy(order.begin() + shardStart, order.begin() + shardEnd, shard.begin());
cerr << "order: ";
for (size_t i = 0; i < shard.size(); ++i) {
cerr << shard[i] << " ";
}
cerr << endl;
}
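As a quick illustration of the partitioning above, here is a standalone sketch that splits ten indices over three ranks exactly as createShard does (the sizes are made up; note how the last rank absorbs the remainder):

#include <iostream>
#include <vector>
using namespace std;

int main() {
	vector<size_t> order;
	for (size_t i = 0; i < 10; ++i) order.push_back(i);   // e.g. 10 input sentences
	size_t size = 3;                                       // e.g. 3 MPI processes
	for (size_t rank = 0; rank < size; ++rank) {
		float shardSize = (float) order.size() / size;
		size_t shardStart = (size_t) (shardSize * rank);
		size_t shardEnd = (rank == size - 1) ? order.size() : (size_t) (shardSize * (rank + 1));
		vector<size_t> shard(order.begin() + shardStart, order.begin() + shardEnd);
		cout << "rank " << rank << ": " << shard.size() << " sentences" << endl;   // prints 3, 3, 4
	}
	return 0;
}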
int main(int argc, char** argv) {
size_t rank = 0;
size_t size = 1;
@@ -186,10 +158,7 @@ int main(int argc, char** argv) {
float slack;
float slack_step;
float slack_min;
size_t maxNumberOracles;
bool accumulateMostViolatedConstraints;
bool averageWeights;
bool pastAndCurrentConstraints;
bool weightConvergence;
bool controlUpdates;
float learning_rate;
@@ -225,62 +194,58 @@ int main(int argc, char** argv) {
int fear_n;
po::options_description desc("Allowed options");
desc.add_options()
("accumulate-most-violated-constraints", po::value<bool>(&accumulateMostViolatedConstraints)->default_value(false),"Accumulate most violated constraint per example")
("accumulate-weights", po::value<bool>(&accumulateWeights)->default_value(false), "Accumulate and average weights over all epochs")
("analytical-update", po::value<bool>(&analytical_update)->default_value(0), "Use one best lists and compute the update analytically")
("average-weights", po::value<bool>(&averageWeights)->default_value(false), "Set decoder weights to average weights after each update")
("base-of-log", po::value<size_t>(&baseOfLog)->default_value(10), "Base for log-ing feature values")
("batch-size,b", po::value<size_t>(&batchSize)->default_value(1), "Size of batch that is send to optimiser for weight adjustments")
("bleu-score-weight", po::value<float>(&bleuScoreWeight)->default_value(1.0), "Bleu score weight used in the decoder objective function (on top of the bleu objective weight)")
("burn-in", po::value<bool>(&burnIn)->default_value(false), "Do a burn-in of the BLEU history before training")
("burn-in-input-file", po::value<string>(&burnInInputFile), "Input file for burn-in phase of BLEU history")
("burn-in-reference-files", po::value<vector<string> >(&burnInReferenceFiles), "Reference file for burn-in phase of BLEU history")
("config,f", po::value<string>(&mosesConfigFile), "Moses ini file")
("control-updates", po::value<bool>(&controlUpdates)->default_value(true), "Ignore updates that increase number of violated constraints AND increase the error")
("core-weights", po::value<string>(&coreWeightFile), "Weight file containing the core weights (already tuned, have to be non-zero)")
("decoder-settings", po::value<string>(&decoder_settings)->default_value(""), "Decoder settings for tuning runs")
("decr-learning-rate", po::value<float>(&decrease_learning_rate)->default_value(0),"Decrease learning rate by the given value after every epoch")
("decr-sentence-update", po::value<float>(&decrease_sentence_update)->default_value(0), "Decrease maximum weight update by the given value after every epoch")
("dev-bleu", po::value<bool>(&devBleu)->default_value(true), "Compute BLEU score of oracle translations of the whole tuning set")
("distinct-nbest", po::value<bool>(&distinctNbest)->default_value(true), "Use nbest list with distinct translations in inference step")
("weight-dump-frequency", po::value<size_t>(&weightDumpFrequency)->default_value(1), "How often per epoch to dump weights, when using mpi")
("epochs,e", po::value<size_t>(&epochs)->default_value(5), "Number of epochs")
("fear-n", po::value<int>(&fear_n)->default_value(-1), "Number of fear translations used")
("help", po::value(&help)->zero_tokens()->default_value(false), "Print this help message and exit")
("history-of-1best", po::value<bool>(&historyOf1best)->default_value(0), "Use the 1best translation to update the history")
("history-smoothing", po::value<float>(&historySmoothing)->default_value(0.9), "Adjust the factor for history smoothing")
("hope-fear", po::value<bool>(&hope_fear)->default_value(true), "Use only hope and fear translations for optimization (not model)")
("hope-n", po::value<int>(&hope_n)->default_value(-1), "Number of hope translations used")
("input-file,i", po::value<string>(&inputFile), "Input file containing tokenised source")
("learner,l", po::value<string>(&learner)->default_value("mira"), "Learning algorithm")
("mira-learning-rate", po::value<float>(&mira_learning_rate)->default_value(1), "Learning rate for MIRA (fixed or flexible)")
("log-feature-values", po::value<bool>(&logFeatureValues)->default_value(false), "Take log of feature values according to the given base.")
("max-number-oracles", po::value<size_t>(&maxNumberOracles)->default_value(1), "Set a maximum number of oracles to use per example")
("min-bleu-change", po::value<float>(&min_bleu_change)->default_value(0), "Minimum BLEU change of 1best translations of one epoch")
("min-sentence-update", po::value<float>(&min_sentence_update)->default_value(0), "Set a minimum weight update per sentence")
("min-learning-rate", po::value<float>(&min_learning_rate)->default_value(0), "Set a minimum learning rate")
("max-sentence-update", po::value<float>(&max_sentence_update)->default_value(-1), "Set a maximum weight update per sentence")
("min-weight-change", po::value<float>(&min_weight_change)->default_value(0.01), "Set minimum weight change for stopping criterion")
("mixing-frequency", po::value<size_t>(&mixingFrequency)->default_value(1), "How often per epoch to mix weights, when using mpi")
("model-hope-fear", po::value<bool>(&model_hope_fear)->default_value(false), "Use model, hope and fear translations for optimization")
("nbest,n", po::value<size_t>(&n)->default_value(10), "Number of translations in nbest list")
("normalise", po::value<bool>(&normaliseWeights)->default_value(false), "Whether to normalise the updated weights before passing them to the decoder")
("only-violated-constraints", po::value<bool>(&onlyViolatedConstraints)->default_value(false), "Add only violated constraints to the optimisation problem")
("past-and-current-constraints", po::value<bool>(&pastAndCurrentConstraints)->default_value(false), "Accumulate most violated constraint per example and use them along all current constraints")
("perceptron-learning-rate", po::value<float>(&perceptron_learning_rate)->default_value(0.01), "Perceptron learning rate")
("precision", po::value<float>(&precision)->default_value(0), "Precision when comparing left and right hand side of constraints")
("print-feature-values", po::value<bool>(&print_feature_values)->default_value(false), "Print out feature values")
("reference-files,r", po::value<vector<string> >(&referenceFiles), "Reference translation files for training")
("scale-by-input-length", po::value<bool>(&scaleByInputLength)->default_value(true), "Scale the BLEU score by a history of the input lengths")
("sentence-level-bleu", po::value<bool>(&sentenceLevelBleu)->default_value(false), "Use a sentences level bleu scoring function")
("shuffle", po::value<bool>(&shuffle)->default_value(false), "Shuffle input sentences before processing")
("accumulate-weights", po::value<bool>(&accumulateWeights)->default_value(false), "Accumulate and average weights over all epochs")
("analytical-update", po::value<bool>(&analytical_update)->default_value(0), "Use one best lists and compute the update analytically")
("average-weights", po::value<bool>(&averageWeights)->default_value(false), "Set decoder weights to average weights after each update")
("base-of-log", po::value<size_t>(&baseOfLog)->default_value(10), "Base for log-ing feature values")
("batch-size,b", po::value<size_t>(&batchSize)->default_value(1), "Size of batch that is send to optimiser for weight adjustments")
("bleu-score-weight", po::value<float>(&bleuScoreWeight)->default_value(1.0), "Bleu score weight used in the decoder objective function (on top of the bleu objective weight)")
("burn-in", po::value<bool>(&burnIn)->default_value(false), "Do a burn-in of the BLEU history before training")
("burn-in-input-file", po::value<string>(&burnInInputFile), "Input file for burn-in phase of BLEU history")
("burn-in-reference-files", po::value<vector<string> >(&burnInReferenceFiles), "Reference file for burn-in phase of BLEU history")
("config,f", po::value<string>(&mosesConfigFile), "Moses ini file")
("control-updates", po::value<bool>(&controlUpdates)->default_value(true), "Ignore updates that increase number of violated constraints AND increase the error")
("core-weights", po::value<string>(&coreWeightFile), "Weight file containing the core weights (already tuned, have to be non-zero)")
("decoder-settings", po::value<string>(&decoder_settings)->default_value(""), "Decoder settings for tuning runs")
("decr-learning-rate", po::value<float>(&decrease_learning_rate)->default_value(0),"Decrease learning rate by the given value after every epoch")
("decr-sentence-update", po::value<float>(&decrease_sentence_update)->default_value(0), "Decrease maximum weight update by the given value after every epoch")
("dev-bleu", po::value<bool>(&devBleu)->default_value(true), "Compute BLEU score of oracle translations of the whole tuning set")
("distinct-nbest", po::value<bool>(&distinctNbest)->default_value(true), "Use nbest list with distinct translations in inference step")
("weight-dump-frequency", po::value<size_t>(&weightDumpFrequency)->default_value(1), "How often per epoch to dump weights, when using mpi")
("epochs,e", po::value<size_t>(&epochs)->default_value(5), "Number of epochs")
("fear-n", po::value<int>(&fear_n)->default_value(-1), "Number of fear translations used")
("help", po::value(&help)->zero_tokens()->default_value(false), "Print this help message and exit")
("history-of-1best", po::value<bool>(&historyOf1best)->default_value(0), "Use the 1best translation to update the history")
("history-smoothing", po::value<float>(&historySmoothing)->default_value(0.9), "Adjust the factor for history smoothing")
("hope-fear", po::value<bool>(&hope_fear)->default_value(true), "Use only hope and fear translations for optimization (not model)")
("hope-n", po::value<int>(&hope_n)->default_value(-1), "Number of hope translations used")
("input-file,i", po::value<string>(&inputFile), "Input file containing tokenised source")
("learner,l", po::value<string>(&learner)->default_value("mira"), "Learning algorithm")
("mira-learning-rate", po::value<float>(&mira_learning_rate)->default_value(1), "Learning rate for MIRA (fixed or flexible)")
("log-feature-values", po::value<bool>(&logFeatureValues)->default_value(false), "Take log of feature values according to the given base.")
("min-bleu-change", po::value<float>(&min_bleu_change)->default_value(0), "Minimum BLEU change of 1best translations of one epoch")
("min-sentence-update", po::value<float>(&min_sentence_update)->default_value(0), "Set a minimum weight update per sentence")
("min-learning-rate", po::value<float>(&min_learning_rate)->default_value(0), "Set a minimum learning rate")
("max-sentence-update", po::value<float>(&max_sentence_update)->default_value(-1), "Set a maximum weight update per sentence")
("min-weight-change", po::value<float>(&min_weight_change)->default_value(0.01), "Set minimum weight change for stopping criterion")
("mixing-frequency", po::value<size_t>(&mixingFrequency)->default_value(1), "How often per epoch to mix weights, when using mpi")
("model-hope-fear", po::value<bool>(&model_hope_fear)->default_value(false), "Use model, hope and fear translations for optimization")
("nbest,n", po::value<size_t>(&n)->default_value(10), "Number of translations in nbest list")
("normalise", po::value<bool>(&normaliseWeights)->default_value(false), "Whether to normalise the updated weights before passing them to the decoder")
("only-violated-constraints", po::value<bool>(&onlyViolatedConstraints)->default_value(false), "Add only violated constraints to the optimisation problem")
("perceptron-learning-rate", po::value<float>(&perceptron_learning_rate)->default_value(0.01), "Perceptron learning rate")
("precision", po::value<float>(&precision)->default_value(0), "Precision when comparing left and right hand side of constraints")
("print-feature-values", po::value<bool>(&print_feature_values)->default_value(false), "Print out feature values")
("reference-files,r", po::value<vector<string> >(&referenceFiles), "Reference translation files for training")
("scale-by-input-length", po::value<bool>(&scaleByInputLength)->default_value(true), "Scale the BLEU score by a history of the input lengths")
("sentence-level-bleu", po::value<bool>(&sentenceLevelBleu)->default_value(false), "Use a sentences level bleu scoring function")
("shuffle", po::value<bool>(&shuffle)->default_value(false), "Shuffle input sentences before processing")
("slack", po::value<float>(&slack)->default_value(0.01), "Use slack in optimizer")
("slack-min", po::value<float>(&slack_min)->default_value(0.01), "Minimum slack used")
("slack-step", po::value<float>(&slack_step)->default_value(0), "Increase slack from epoch to epoch by the value provided")
("stop-dev-bleu", po::value<bool>(&stop_dev_bleu)->default_value(false), "Stop when average Bleu (dev) decreases (or no more increases)")
("stop-approx-dev-bleu", po::value<bool>(&stop_approx_dev_bleu)->default_value(false), "Stop when average approx. sentence Bleu (dev) decreases (or no more increases)")
("stop-weights", po::value<bool>(&weightConvergence)->default_value(true), "Stop when weights converge")
("train-linear-classifier", po::value<bool>(&train_linear_classifier)->default_value(false), "Test algorithm for linear classification")
("use-scaled-reference", po::value<bool>(&useScaledReference)->default_value(true), "Use scaled reference length for comparing target and reference length of phrases")
("verbosity,v", po::value<int>(&verbosity)->default_value(0), "Verbosity level")
("scale-margin", po::value<size_t>(&scale_margin)->default_value(0), "Scale the margin by the Bleu score of the oracle translation")
@@ -389,8 +354,7 @@ int main(int argc, char** argv) {
Optimiser* optimiser = NULL;
if (learner == "mira") {
cerr << "Optimising using Mira" << endl;
optimiser = new MiraOptimiser(onlyViolatedConstraints, slack, scale_margin, scale_update,
maxNumberOracles, accumulateMostViolatedConstraints, pastAndCurrentConstraints, order.size(), precision);
optimiser = new MiraOptimiser(onlyViolatedConstraints, slack, scale_margin, scale_update, precision);
learning_rate = mira_learning_rate;
perceptron_update = false;
} else if (learner == "perceptron") {
@@ -407,12 +371,6 @@ int main(int argc, char** argv) {
}
// resolve parameter dependencies
if (accumulateMostViolatedConstraints && pastAndCurrentConstraints) {
cerr << "Error: the parameters --accumulate-most-violated-constraints and --past-and-current-constraints are mutually exclusive" << endl;
return 1;
}
if (perceptron_update || analytical_update) {
batchSize = 1;
cerr << "Setting batch size to 1 for perceptron/analytical update" << endl;
@@ -542,7 +500,7 @@ int main(int argc, char** argv) {
mpi::broadcast(world, order, 0);
#endif
// Create the shards according to the number of processes used
// Create shards according to the number of processes used
vector<size_t> shard;
float shardSize = (float) (order.size()) / size;
VERBOSE(1, "Shard size: " << shardSize << endl);
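The surrounding code follows the usual master/worker pattern: rank 0 prepares (and optionally rotates or shuffles) the index order, broadcasts it, and every rank then cuts out its own contiguous shard. A condensed, standalone sketch of that flow, assuming Boost.MPI as used here (the vector needs the Boost serialization header to be broadcastable; the sizes are illustrative):

#include <boost/mpi.hpp>
#include <boost/serialization/vector.hpp>
#include <vector>
namespace mpi = boost::mpi;

int main(int argc, char** argv) {
	mpi::environment env(argc, argv);
	mpi::communicator world;
	std::vector<size_t> order;
	if (world.rank() == 0)
		for (size_t i = 0; i < 100; ++i) order.push_back(i);   // illustrative tuning-set size
	mpi::broadcast(world, order, 0);                           // all ranks now hold the same order
	// each rank then slices out [shardSize*rank, shardSize*(rank+1)), as in createShard above
	float shardSize = (float) order.size() / world.size();
	size_t start = (size_t) (shardSize * world.rank());
	size_t end = (world.rank() == world.size() - 1) ? order.size() : (size_t) (shardSize * (world.rank() + 1));
	std::vector<size_t> shard(order.begin() + start, order.begin() + end);
	return 0;
}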


@@ -19,43 +19,6 @@ vector<int> MiraOptimiser::updateWeights(ScoreComponentCollection& currWeights,
size_t epoch,
bool controlUpdates) {
// add every oracle in batch to list of oracles (under certain conditions)
for (size_t i = 0; i < oracleFeatureValues.size(); ++i) {
float newWeightedScore = oracleFeatureValues[i].InnerProduct(currWeights);
size_t sentenceId = sentenceIds[i];
// compare new oracle with existing oracles:
// if same translation exists, just update the bleu score
// if not, add the oracle
bool updated = false;
size_t indexOfWorst = 0;
float worstWeightedScore = 0;
for (size_t j = 0; j < m_oracles[sentenceId].size(); ++j) {
float currentWeightedScore = m_oracles[sentenceId][j].InnerProduct(currWeights);
if (currentWeightedScore == newWeightedScore) {
cerr << "Rank " << rank << ", epoch " << epoch << ", bleu score of oracle updated at batch position " << i << ", " << m_bleu_of_oracles[sentenceId][j] << " --> " << oracleBleuScores[j] << endl;
m_bleu_of_oracles[sentenceId][j] = oracleBleuScores[j];
updated = true;
break;
} else if (worstWeightedScore == 0 || currentWeightedScore
> worstWeightedScore) {
worstWeightedScore = currentWeightedScore;
indexOfWorst = j;
}
}
if (!updated) {
// add if number of maximum oracles not exceeded, otherwise override the worst
if (m_max_number_oracles > m_oracles[sentenceId].size()) {
m_oracles[sentenceId].push_back(oracleFeatureValues[i]);
m_bleu_of_oracles[sentenceId].push_back(oracleBleuScores[i]);
} else {
m_oracles[sentenceId][indexOfWorst] = oracleFeatureValues[i];
m_bleu_of_oracles[sentenceId][indexOfWorst] = oracleBleuScores[i];
}
}
}
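The block above (removed by this commit) implemented a small per-sentence cache of oracles: an incoming oracle either refreshes the BLEU score of an equal-scoring stored oracle or, once the cache is full, overwrites the entry with the "worst" weighted score. A standalone sketch of that replacement policy, with plain floats standing in for feature vectors (illustrative only, since the logic is gone after this change):

#include <cstddef>
#include <iostream>
#include <vector>
using namespace std;

// Toy version: scores[] plays the role of the oracles' weighted model scores,
// bleus[] the role of m_bleu_of_oracles, and maxOracles of m_max_number_oracles.
void addOracle(vector<float>& scores, vector<float>& bleus, float newScore, float newBleu, size_t maxOracles) {
	size_t indexOfWorst = 0;
	float worstScore = 0;
	for (size_t j = 0; j < scores.size(); ++j) {
		if (scores[j] == newScore) {                       // same weighted score: treat as the same translation, refresh its BLEU
			bleus[j] = newBleu;
			return;
		}
		if (worstScore == 0 || scores[j] > worstScore) {   // "worst" is the highest weighted score, as in the removed code
			worstScore = scores[j];
			indexOfWorst = j;
		}
	}
	if (scores.size() < maxOracles) {                      // room left: append
		scores.push_back(newScore);
		bleus.push_back(newBleu);
	} else {                                               // cache full: overwrite the "worst" entry
		scores[indexOfWorst] = newScore;
		bleus[indexOfWorst] = newBleu;
	}
}

int main() {
	vector<float> scores, bleus;
	addOracle(scores, bleus, 1.5f, 22.0f, 2);
	addOracle(scores, bleus, 0.8f, 20.0f, 2);
	addOracle(scores, bleus, 2.1f, 25.0f, 2);              // cache full: replaces the entry with score 1.5
	cout << scores.size() << " oracles kept" << endl;      // prints "2 oracles kept"
	return 0;
}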
// vector of feature values differences for all created constraints
vector<ScoreComponentCollection> featureValueDiffs;
vector<float> lossMinusModelScoreDiffs;
@@ -63,7 +26,6 @@ vector<int> MiraOptimiser::updateWeights(ScoreComponentCollection& currWeights,
// most violated constraint in batch
ScoreComponentCollection max_batch_featureValueDiff;
float max_batch_loss = -1;
float max_batch_lossMinusModelScoreDiff = -1;
// Make constraints for new hypothesis translations
@@ -72,21 +34,19 @@ vector<int> MiraOptimiser::updateWeights(ScoreComponentCollection& currWeights,
float oldDistanceFromOptimum = 0;
// iterate over input sentences (1 (online) or more (batch))
for (size_t i = 0; i < featureValues.size(); ++i) {
size_t sentenceId = sentenceIds[i];
if (m_oracles[sentenceId].size() > 1)
cerr << "Rank " << rank << ", available oracles for source sentence " << sentenceId << ": " << m_oracles[sentenceId].size() << endl;
//size_t sentenceId = sentenceIds[i];
// iterate over hypothesis translations for one input sentence
for (size_t j = 0; j < featureValues[i].size(); ++j) {
for (size_t k = 0; k < m_oracles[sentenceId].size(); ++k) {
ScoreComponentCollection featureValueDiff = m_oracles[sentenceId][k];
featureValueDiff.MinusEquals(featureValues[i][j]);
cerr << "feature value diff: " << featureValueDiff << endl;
if (featureValueDiff.GetL1Norm() == 0) {
cerr << "Equal feature values, constraint skipped.." << endl;
continue;
}
ScoreComponentCollection featureValueDiff = oracleFeatureValues[i];
featureValueDiff.MinusEquals(featureValues[i][j]);
float loss = losses[i][j];
cerr << "feature value diff: " << featureValueDiff << endl;
if (featureValueDiff.GetL1Norm() == 0) {
cerr << "Equal feature values, constraint skipped.." << endl;
continue;
}
float loss = losses[i][j];
if (m_scale_margin == 1) {
loss *= oracleBleuScores[i];
cerr << "Scaling margin with oracle bleu score " << oracleBleuScores[i] << endl;
@@ -101,128 +61,31 @@ vector<int> MiraOptimiser::updateWeights(ScoreComponentCollection& currWeights,
}
// check if constraint is violated
bool violated = false;
bool addConstraint = true;
float modelScoreDiff = featureValueDiff.InnerProduct(currWeights);
float diff = loss - (modelScoreDiff + m_precision);
cerr << "constraint: " << (modelScoreDiff + m_precision) << " >= " << loss << endl;
if (diff > epsilon) {
violated = true;
cerr << "Rank " << rank << ", epoch " << epoch << ", current violation: " << diff << endl;
}
else if (m_onlyViolatedConstraints) {
addConstraint = false;
}
float lossMinusModelScoreDiff = loss - (modelScoreDiff + m_precision);
if (violated) {
if (m_accumulateMostViolatedConstraints || m_pastAndCurrentConstraints) {
// find the most violated constraint per batch
if (lossMinusModelScoreDiff > max_batch_lossMinusModelScoreDiff) {
max_batch_lossMinusModelScoreDiff = lossMinusModelScoreDiff;
max_batch_featureValueDiff = featureValueDiff;
max_batch_loss = loss;
}
}
}
if (addConstraint && !m_accumulateMostViolatedConstraints) {
featureValueDiffs.push_back(featureValueDiff);
lossMinusModelScoreDiffs.push_back(lossMinusModelScoreDiff);
all_losses.push_back(loss);
if (violated) {
++violatedConstraintsBefore;
oldDistanceFromOptimum += diff;
}
}
}
}
}
if (m_pastAndCurrentConstraints || m_accumulateMostViolatedConstraints) {
cerr << "Rank " << rank << ", epoch " << epoch << ", number of current constraints: " << featureValueDiffs.size() << endl;
cerr << "Rank " << rank << ", epoch " << epoch << ", number of current violated constraints: " << violatedConstraintsBefore << endl;
}
if (m_max_number_oracles == 1) {
for (size_t k = 0; k < sentenceIds.size(); ++k) {
size_t sentenceId = sentenceIds[k];
m_oracles[sentenceId].clear();
}
}
size_t pastViolatedConstraints = 0;
// Add constraints from past iterations (BEFORE updating that list)
if (m_pastAndCurrentConstraints || m_accumulateMostViolatedConstraints) {
// add all past (most violated) constraints to the list of current constraints, computed with current weights!
for (size_t i = 0; i < m_featureValueDiffs.size(); ++i) {
float modelScoreDiff = m_featureValueDiffs[i].InnerProduct(currWeights);
// check if constraint is violated
bool violated = false;
bool addConstraint = true;
float diff = m_losses[i] - (modelScoreDiff + m_precision);
if (diff > epsilon) {
violated = true;
cerr << "Rank " << rank << ", epoch " << epoch << ", past violation: " << diff << endl;
}
else if (m_onlyViolatedConstraints) {
addConstraint = false;
bool violated = false;
bool addConstraint = true;
float modelScoreDiff = featureValueDiff.InnerProduct(currWeights);
float diff = loss - (modelScoreDiff + m_precision);
cerr << "constraint: " << (modelScoreDiff + m_precision) << " >= " << loss << endl;
if (diff > epsilon) {
violated = true;
cerr << "Rank " << rank << ", epoch " << epoch << ", current violation: " << diff << endl;
}
else if (m_onlyViolatedConstraints) {
addConstraint = false;
}
if (addConstraint) {
featureValueDiffs.push_back(m_featureValueDiffs[i]);
lossMinusModelScoreDiffs.push_back(m_losses[i] - (modelScoreDiff + m_precision));
all_losses.push_back(m_losses[i]);
// cerr << "old constraint: " << (modelScoreDiff + m_precision) << " >= " << m_losses[i] << endl;
float lossMinusModelScoreDiff = loss - (modelScoreDiff + m_precision);
if (addConstraint) {
featureValueDiffs.push_back(featureValueDiff);
lossMinusModelScoreDiffs.push_back(lossMinusModelScoreDiff);
all_losses.push_back(loss);
if (violated) {
++violatedConstraintsBefore;
++pastViolatedConstraints;
oldDistanceFromOptimum += diff;
}
}
}
}
if (m_pastAndCurrentConstraints || m_accumulateMostViolatedConstraints) {
cerr << "Rank " << rank << ", epoch " << epoch << ", number of past constraints: " << m_featureValueDiffs.size() << endl;
cerr << "Rank " << rank << ", epoch " << epoch << ", number of past violated constraints: " << pastViolatedConstraints << endl;
}
// Add new most violated constraint to the list of current constraints
if (m_accumulateMostViolatedConstraints) {
if (max_batch_loss != -1) {
float modelScoreDiff = max_batch_featureValueDiff.InnerProduct(currWeights);
float diff = max_batch_loss - (modelScoreDiff + m_precision);
++violatedConstraintsBefore;
oldDistanceFromOptimum += diff;
featureValueDiffs.push_back(max_batch_featureValueDiff);
lossMinusModelScoreDiffs.push_back(max_batch_loss - (modelScoreDiff + m_precision));
all_losses.push_back(max_batch_loss);
// cerr << "new constraint: " << (modelScoreDiff + m_precision) << " !>= " << max_batch_loss << endl;
}
}
// Update the list of accumulated most violated constraints
if (max_batch_loss != -1) {
bool updated = false;
for (size_t i = 0; i < m_featureValueDiffs.size(); ++i) {
float oldScore = m_featureValueDiffs[i].InnerProduct(currWeights);
float newScore = max_batch_featureValueDiff.InnerProduct(currWeights);
if (abs(oldScore-newScore) < epsilon) {
m_losses[i] = max_batch_loss;
updated = true;
break;
if (violated) {
++violatedConstraintsBefore;
oldDistanceFromOptimum += diff;
}
}
}
if (!updated) {
m_featureValueDiffs.push_back(max_batch_featureValueDiff);
m_losses.push_back(max_batch_loss);
}
}
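With the accumulated-constraint bookkeeping above removed, every constraint now comes from a single oracle/hypothesis pair, and the violation test used while collecting constraints reduces to one comparison. A self-contained restatement with illustrative numbers:

#include <iostream>

// For one hypothesis h and its oracle o, the constraint collected above is
//   w . (features(o) - features(h)) + precision  >=  loss,
// where loss is the corresponding BLEU-based loss (losses[i][j] above); it is
// treated as violated when  loss - (modelScoreDiff + precision) > epsilon.
int main() {
	const float epsilon = 0.0001f;   // small tolerance (illustrative; the optimiser defines its own)
	float modelScoreDiff = 0.2f;     // w . (features(o) - features(h)), illustrative
	float precision = 0.0f;          // matches the --precision default above
	float loss = 0.5f;               // BLEU-based loss of the pair, illustrative
	float diff = loss - (modelScoreDiff + precision);
	if (diff > epsilon)
		std::cout << "constraint violated by " << diff << std::endl;   // this pair enters the optimisation
	return 0;
}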
// run optimisation: compute alphas for all given constraints
@@ -284,7 +147,7 @@ vector<int> MiraOptimiser::updateWeights(ScoreComponentCollection& currWeights,
statusPlus[1] = -1;
statusPlus[2] = -1;
return statusPlus;
}
}
}
// apply learning rate
@@ -302,7 +165,7 @@ vector<int> MiraOptimiser::updateWeights(ScoreComponentCollection& currWeights,
}
// scale update by BLEU of oracle
if (oracleBleuScores.size() == 1 && m_max_number_oracles == 1 && m_scale_update) { // scale only if just 1 oracle is used
if (oracleBleuScores.size() == 1 && m_scale_update) {
cerr << "Scaling summed update with log10 oracle bleu score " << log10(oracleBleuScores[0]) << endl;
summedUpdate.MultiplyEquals(log10(oracleBleuScores[0]));
}
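A side note on the scaling kept above: multiplying the summed update by log10 of the oracle BLEU only yields a positive factor when that score is greater than 1 (plausible here, since the BLEU objective can be scaled by the input-length history, see --scale-by-input-length above), whereas for a raw BLEU in (0,1) the factor would be negative and reverse the update. A minimal illustration:

#include <cmath>
#include <iostream>

int main() {
	// The optimiser above multiplies the summed update by log10(oracle BLEU).
	float scaledBleu = 25.0f;   // e.g. a history-scaled BLEU greater than 1 (illustrative)
	float rawBleu = 0.25f;      // a raw BLEU in (0,1)
	std::cout << std::log10(scaledBleu) << std::endl;   // ~1.4: keeps the update direction
	std::cout << std::log10(rawBleu) << std::endl;      // ~-0.6: would flip the update direction
	return 0;
}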
@@ -338,7 +201,6 @@ vector<int> MiraOptimiser::updateWeightsHopeFear(Moses::ScoreComponentCollection
// most violated constraint in batch
ScoreComponentCollection max_batch_featureValueDiff;
float max_batch_loss = -1;
float max_batch_lossMinusModelScoreDiff = -1;
// Make constraints for new hypothesis translations
@@ -348,8 +210,7 @@ vector<int> MiraOptimiser::updateWeightsHopeFear(Moses::ScoreComponentCollection
// iterate over input sentences (1 (online) or more (batch))
for (size_t i = 0; i < featureValuesHope.size(); ++i) {
size_t sentenceId = sentenceIds[i];
size_t sentenceId = sentenceIds[i]; // keep sentenceId for storing more than 1 oracle..
// Pair all hope translations with all fear translations for one input sentence
for (size_t j = 0; j < featureValuesHope[i].size(); ++j) {
for (size_t k = 0; k < featureValuesFear[i].size(); ++k) {
@@ -362,20 +223,20 @@ vector<int> MiraOptimiser::updateWeightsHopeFear(Moses::ScoreComponentCollection
}
float loss = bleuScoresHope[i][j] - bleuScoresFear[i][k];
if (m_scale_margin == 1) {
loss *= bleuScoresHope[i][j];
cerr << "Scaling margin with oracle bleu score " << bleuScoresHope[i][j] << endl;
}
else if (m_scale_margin == 2) {
loss *= log2(bleuScoresHope[i][j]);
cerr << "Scaling margin with log2 oracle bleu score " << log2(bleuScoresHope[i][j]) << endl;
}
else if (m_scale_margin == 10) {
loss *= log10(bleuScoresHope[i][j]);
cerr << "Scaling margin with log10 oracle bleu score " << log10(bleuScoresHope[i][j]) << endl;
}
if (m_scale_margin == 1) {
loss *= bleuScoresHope[i][j];
cerr << "Scaling margin with oracle bleu score " << bleuScoresHope[i][j] << endl;
}
else if (m_scale_margin == 2) {
loss *= log2(bleuScoresHope[i][j]);
cerr << "Scaling margin with log2 oracle bleu score " << log2(bleuScoresHope[i][j]) << endl;
}
else if (m_scale_margin == 10) {
loss *= log10(bleuScoresHope[i][j]);
cerr << "Scaling margin with log10 oracle bleu score " << log10(bleuScoresHope[i][j]) << endl;
}
// check if constraint is violated
// check if constraint is violated
bool violated = false;
bool addConstraint = true;
float modelScoreDiff = featureValueDiff.InnerProduct(currWeights);
@@ -390,18 +251,7 @@ vector<int> MiraOptimiser::updateWeightsHopeFear(Moses::ScoreComponentCollection
}
float lossMinusModelScoreDiff = loss - (modelScoreDiff + m_precision);
if (violated) {
if (m_accumulateMostViolatedConstraints || m_pastAndCurrentConstraints) {
// find the most violated constraint per batch
if (lossMinusModelScoreDiff > max_batch_lossMinusModelScoreDiff) {
max_batch_lossMinusModelScoreDiff = lossMinusModelScoreDiff;
max_batch_featureValueDiff = featureValueDiff;
max_batch_loss = loss;
}
}
}
if (addConstraint && !m_accumulateMostViolatedConstraints) {
if (addConstraint) {
featureValueDiffs.push_back(featureValueDiff);
lossMinusModelScoreDiffs.push_back(lossMinusModelScoreDiff);
all_losses.push_back(loss);
@@ -415,84 +265,6 @@ vector<int> MiraOptimiser::updateWeightsHopeFear(Moses::ScoreComponentCollection
}
}
if (m_pastAndCurrentConstraints || m_accumulateMostViolatedConstraints) {
cerr << "Rank " << rank << ", epoch " << epoch << ", number of current constraints: " << featureValueDiffs.size() << endl;
cerr << "Rank " << rank << ", epoch " << epoch << ", number of current violated constraints: " << violatedConstraintsBefore << endl;
}
size_t pastViolatedConstraints = 0;
// Add constraints from past iterations (BEFORE updating that list)
if (m_pastAndCurrentConstraints || m_accumulateMostViolatedConstraints) {
// add all past (most violated) constraints to the list of current constraints, computed with current weights!
for (size_t i = 0; i < m_featureValueDiffs.size(); ++i) {
float modelScoreDiff = m_featureValueDiffs[i].InnerProduct(currWeights);
// check if constraint is violated
bool violated = false;
bool addConstraint = true;
float diff = m_losses[i] - (modelScoreDiff + m_precision);
if (diff > epsilon) {
violated = true;
cerr << "Rank " << rank << ", epoch " << epoch << ", past violation: " << diff << endl;
}
else if (m_onlyViolatedConstraints) {
addConstraint = false;
}
if (addConstraint) {
featureValueDiffs.push_back(m_featureValueDiffs[i]);
lossMinusModelScoreDiffs.push_back(m_losses[i] - (modelScoreDiff + m_precision));
all_losses.push_back(m_losses[i]);
// cerr << "old constraint: " << (modelScoreDiff + m_precision) << " >= " << m_losses[i] << endl;
if (violated) {
++violatedConstraintsBefore;
++pastViolatedConstraints;
oldDistanceFromOptimum += diff;
}
}
}
}
if (m_pastAndCurrentConstraints || m_accumulateMostViolatedConstraints) {
cerr << "Rank " << rank << ", epoch " << epoch << ", number of past constraints: " << m_featureValueDiffs.size() << endl;
cerr << "Rank " << rank << ", epoch " << epoch << ", number of past violated constraints: " << pastViolatedConstraints << endl;
}
// Add new most violated constraint to the list of current constraints
if (m_accumulateMostViolatedConstraints) {
if (max_batch_loss != -1) {
float modelScoreDiff = max_batch_featureValueDiff.InnerProduct(currWeights);
float diff = max_batch_loss - (modelScoreDiff + m_precision);
++violatedConstraintsBefore;
oldDistanceFromOptimum += diff;
featureValueDiffs.push_back(max_batch_featureValueDiff);
lossMinusModelScoreDiffs.push_back(max_batch_loss - (modelScoreDiff + m_precision));
all_losses.push_back(max_batch_loss);
// cerr << "new constraint: " << (modelScoreDiff + m_precision) << " !>= " << max_batch_loss << endl;
}
}
// Update the list of accumulated most violated constraints
if (max_batch_loss != -1) {
bool updated = false;
for (size_t i = 0; i < m_featureValueDiffs.size(); ++i) {
float oldScore = m_featureValueDiffs[i].InnerProduct(currWeights);
float newScore = max_batch_featureValueDiff.InnerProduct(currWeights);
if (abs(oldScore-newScore) < epsilon) {
m_losses[i] = max_batch_loss;
updated = true;
break;
}
}
if (!updated) {
m_featureValueDiffs.push_back(max_batch_featureValueDiff);
m_losses.push_back(max_batch_loss);
}
}
// run optimisation: compute alphas for all given constraints
vector<float> alphas;
ScoreComponentCollection summedUpdate;
@@ -515,14 +287,14 @@ vector<int> MiraOptimiser::updateWeightsHopeFear(Moses::ScoreComponentCollection
// scale update by BLEU of hope translation (only two cases defined at the moment)
if (featureValuesHope.size() == 1 && m_scale_update) { // only defined for batch size 1
if (featureValuesHope[0].size() == 1) {
cerr << "Scaling update with log10 oracle bleu score " << log10(bleuScoresHope[0][0]) << endl; // only 1 oracle
update.MultiplyEquals(log10(bleuScoresHope[0][0]));
} else if (featureValuesFear[0].size() == 1) {
cerr << "Scaling update with log10 oracle bleu score " << log10(bleuScoresHope[0][k]) << endl; // k oracles
update.MultiplyEquals(log10(bleuScoresHope[0][k]));
}
if (featureValuesHope[0].size() == 1) {
cerr << "Scaling update with log10 oracle bleu score " << log10(bleuScoresHope[0][0]) << endl; // only 1 oracle
update.MultiplyEquals(log10(bleuScoresHope[0][0]));
} else if (featureValuesFear[0].size() == 1) {
cerr << "Scaling update with log10 oracle bleu score " << log10(bleuScoresHope[0][k]) << endl; // k oracles
update.MultiplyEquals(log10(bleuScoresHope[0][k]));
}
}
// sum up update
summedUpdate.PlusEquals(update);
@@ -593,7 +365,7 @@ vector<int> MiraOptimiser::updateWeightsHopeFear(Moses::ScoreComponentCollection
}
vector<int> MiraOptimiser::updateWeightsAnalytically(ScoreComponentCollection& currWeights,
ScoreComponentCollection& featureValuesHope,
ScoreComponentCollection& featureValuesHope,
ScoreComponentCollection& featureValuesFear,
float bleuScoreHope,
float bleuScoreFear,


@@ -63,17 +63,12 @@ namespace Mira {
MiraOptimiser() :
Optimiser() { }
MiraOptimiser(bool onlyViolatedConstraints, float slack, size_t scale_margin, bool scale_update, size_t maxNumberOracles, bool accumulateMostViolatedConstraints, bool pastAndCurrentConstraints, size_t exampleSize, float precision) :
MiraOptimiser(bool onlyViolatedConstraints, float slack, size_t scale_margin, bool scale_update, float precision) :
Optimiser(),
m_onlyViolatedConstraints(onlyViolatedConstraints),
m_slack(slack),
m_scale_margin(scale_margin),
m_scale_update(scale_update),
m_max_number_oracles(maxNumberOracles),
m_accumulateMostViolatedConstraints(accumulateMostViolatedConstraints),
m_pastAndCurrentConstraints(pastAndCurrentConstraints),
m_oracles(exampleSize),
m_bleu_of_oracles(exampleSize),
m_precision(precision) { }
std::vector<int> updateWeightsAnalytically(Moses::ScoreComponentCollection& currWeights,
@@ -125,21 +120,6 @@ namespace Mira {
size_t m_scale_margin;
// keep a list of oracle translations over epochs
std::vector < std::vector< Moses::ScoreComponentCollection> > m_oracles;
std::vector < std::vector< float> > m_bleu_of_oracles;
size_t m_max_number_oracles;
// accumulate most violated constraints for every example
std::vector< Moses::ScoreComponentCollection> m_featureValueDiffs;
std::vector< float> m_losses;
bool m_accumulateMostViolatedConstraints;
bool m_pastAndCurrentConstraints;
float m_precision;
// scale update with log 10 of oracle BLEU score