Merge remote branch 'github/miramerge' into bjam

Conflicts:
	mira/Decoder.h
	mira/Main.cpp
	mira/Main.h
	moses-chart-cmd/src/IOWrapper.cpp
	moses-chart-cmd/src/Main.cpp
	moses-cmd/src/Main.cpp
	moses/src/BleuScoreFeature.cpp
	moses/src/BleuScoreFeature.h
	moses/src/TargetNgramFeature.h
Barry Haddow 2012-02-01 10:12:16 +00:00
commit 5a17ef82b3
12 changed files with 407 additions and 149 deletions

mira/Decoder.cpp

@@ -103,7 +103,7 @@ namespace Mira {
staticData.ReLoadBleuScoreFeatureParameter(bleuObjectiveWeight*bleuScoreWeight);
m_bleuScoreFeature->SetCurrentSourceLength((*m_sentence).GetSize());
m_bleuScoreFeature->SetCurrentReference(sentenceid);
m_bleuScoreFeature->SetCurrentShortestReference(sentenceid);
//run the decoder
m_manager = new Moses::Manager(*m_sentence, staticData.GetSearchAlgorithm(), &system);
@@ -124,6 +124,8 @@ namespace Mira {
Phrase bestPhrase = path.GetTargetPhrase();
if (iter != sentences.begin())
cerr << endl;
cerr << "Rank " << rank << ", epoch " << epoch << ", \"";
Phrase phrase = path.GetTargetPhrase();
for (size_t pos = 0; pos < phrase.GetSize(); ++pos) {
@@ -192,12 +194,12 @@ namespace Mira {
m_bleuScoreFeature->PrintHistory(out);
}
void MosesDecoder::printReferenceLength(const vector<size_t>& ref_ids) {
/* void MosesDecoder::printReferenceLength(const vector<size_t>& ref_ids) {
m_bleuScoreFeature->PrintReferenceLength(ref_ids);
}
}*/
size_t MosesDecoder::getReferenceLength(size_t ref_id) {
return m_bleuScoreFeature->GetReferenceLength(ref_id);
size_t MosesDecoder::getClosestReferenceLength(size_t ref_id, int hypoLength) {
return m_bleuScoreFeature->GetClosestReferenceLength(ref_id, hypoLength);
}
void MosesDecoder::setBleuParameters(bool scaleByInputLength, bool scaleByRefLength, bool scaleByAvgLength,

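The interface change above (getReferenceLength giving way to getClosestReferenceLength) follows from the move to multiple references per sentence: length ratios are now computed against the reference whose length is closest to the hypothesis, as in multi-reference BLEU. A minimal sketch of that selection, mirroring the BleuScoreFeature implementation further down (closestRefLength, refLens and hypoLen are illustrative names, not part of the commit):

#include <cstdlib>
#include <vector>

// Pick the reference length closest to the hypothesis length.
// Ties keep the earlier reference, matching GetClosestReferenceLength below.
size_t closestRefLength(const std::vector<size_t>& refLens, int hypoLen) {
    size_t best = refLens[0];
    int bestDist = std::abs(hypoLen - (int)refLens[0]);
    for (size_t i = 1; i < refLens.size(); ++i) {
        int d = std::abs(hypoLen - (int)refLens[i]);
        if (d < bestDist) { best = refLens[i]; bestDist = d; }
    }
    return best;
}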
mira/Decoder.h

@@ -68,12 +68,18 @@ class MosesDecoder {
void printBleuFeatureHistory(std::ostream& out);
void printReferenceLength(const std::vector<size_t>& ref_ids);
size_t getReferenceLength(size_t ref_id);
// void printReferenceLength(const std::vector<size_t>& ref_ids);
size_t getClosestReferenceLength(size_t ref_id, int hypoLength);
void setBleuParameters(bool scaleByInputLength, bool scaleByRefLength, bool scaleByAvgLength,
bool scaleByTargetLengthLinear, bool scaleByTargetLengthTrend,
float scaleByX, float historySmoothing, size_t scheme, float relax_BP);
Moses::ScoreComponentCollection getWeights();
void setWeights(const Moses::ScoreComponentCollection& weights);
void cleanup();
void cleanup();
void setCorrection(float correction) {
m_bleuScoreFeature->SetCorrection(correction);
}
private:
float getBleuScore(const Moses::ScoreComponentCollection& scores);

mira/Main.cpp

@@ -116,12 +116,15 @@ int main(int argc, char** argv) {
int threadcount;
size_t adapt_after_epoch;
size_t bleu_smoothing_scheme;
float max_length_deviation;
float max_length_dev_all;
float max_length_dev_hypos;
float max_length_dev_reference;
float max_length_dev_hope_ref;
float max_length_dev_fear_ref;
float relax_BP;
bool stabiliseLength;
bool delayUpdates;
float min_oracle_bleu;
bool correctScaling;
po::options_description desc("Allowed options");
desc.add_options()
("accumulate-weights", po::value<bool>(&accumulateWeights)->default_value(false), "Accumulate and average weights over all epochs")
@@ -134,6 +137,7 @@ int main(int argc, char** argv) {
("bleu-smoothing-scheme", po::value<size_t>(&bleu_smoothing_scheme)->default_value(1), "Set a smoothing scheme for sentence-Bleu: +1 (1), +0.1 (2), papineni (3) (default:1)")
("config,f", po::value<string>(&mosesConfigFile), "Moses ini-file")
("core-weights", po::value<string>(&coreWeightFile), "Weight file containing the core weights (already tuned, have to be non-zero)")
("correct-scaling", po::value<bool>(&correctScaling)->default_value(false), "Try to correct for scaling issues")
("decoder-settings", po::value<string>(&decoder_settings)->default_value(""), "Decoder settings for tuning runs")
("decr-learning-rate", po::value<float>(&decrease_learning_rate)->default_value(0),"Decrease learning rate by the given value after every epoch")
("delay-updates", po::value<bool>(&delayUpdates)->default_value(false), "Delay all updates until the end of an epoch")
@@ -151,11 +155,13 @@ int main(int argc, char** argv) {
("log-feature-values", po::value<bool>(&logFeatureValues)->default_value(false), "Take log of feature values according to the given base.")
("margin-incr", po::value<float>(&margin_slack_incr)->default_value(0), "Increment margin slack after every epoch by this amount")
("margin-slack", po::value<float>(&margin_slack)->default_value(0), "Slack when comparing left and right hand side of constraints")
("max-length-deviation", po::value<float>(&max_length_deviation)->default_value(-1), "Number between 0 and 1 specifying the percentage of admissible length deviation between hope/fear translations and w.r.t. reference translations")
("max-length-dev-hypos", po::value<float>(&max_length_dev_hypos)->default_value(-1), "Number between 0 and 1 specifying the percentage of admissible length deviation between hop/fear translations")
("max-length-dev-reference", po::value<float>(&max_length_dev_reference)->default_value(-1), "Number between 0 and 1 specifying the percentage of admissible length deviation of hope/fear translations w.r.t. reference translations")
("max-length-dev-all", po::value<float>(&max_length_dev_all)->default_value(-1), "Make use of all 3 following options")
("max-length-dev-hypos", po::value<float>(&max_length_dev_hypos)->default_value(-1), "Number between 0 and 1 specifying the percentage of admissible length deviation between hope and fear translations")
("max-length-dev-hope-ref", po::value<float>(&max_length_dev_hope_ref)->default_value(-1), "Number between 0 and 1 specifying the percentage of admissible length deviation between hope and reference translations")
("max-length-dev-fear-ref", po::value<float>(&max_length_dev_fear_ref)->default_value(-1), "Number between 0 and 1 specifying the percentage of admissible length deviation between fear and reference translations")
("min-learning-rate", po::value<float>(&min_learning_rate)->default_value(0), "Set a minimum learning rate")
("min-weight-change", po::value<float>(&min_weight_change)->default_value(0.01), "Set minimum weight change for stopping criterion")
("min-oracle-bleu", po::value<float>(&min_oracle_bleu)->default_value(0), "Set a minimum oracle BLEU score")
("min-weight-change", po::value<float>(&min_weight_change)->default_value(0.01), "Set minimum weight change for stopping criterion")
("mira-learning-rate", po::value<float>(&mira_learning_rate)->default_value(1), "Learning rate for MIRA (fixed or flexible)")
("mixing-frequency", po::value<size_t>(&mixingFrequency)->default_value(5), "How often per epoch to mix weights, when using mpi")
("model-hope-fear", po::value<bool>(&model_hope_fear)->default_value(false), "Use model, hope and fear translations for optimisation")
@@ -352,6 +358,8 @@ int main(int argc, char** argv) {
cerr << "Error: Need to select an one of parameters --hope-fear/--model-hope-fear for mira update." << endl;
return 1;
}
if (historyOf1best || historyOfOracles)
sentenceLevelBleu = false;
if (!sentenceLevelBleu) {
if (!historyOf1best && !historyOfOracles) {
historyOf1best = true;
@@ -361,9 +369,10 @@ int main(int argc, char** argv) {
bleuScoreWeight_hope = bleuScoreWeight;
}
if (max_length_deviation != -1) {
max_length_dev_reference = max_length_deviation;
max_length_dev_hypos = max_length_deviation;
if (max_length_dev_all != -1) {
max_length_dev_hypos = max_length_dev_all;
max_length_dev_hope_ref = max_length_dev_all;
max_length_dev_fear_ref = max_length_dev_all;
}
#ifdef MPI_ENABLE
@@ -429,9 +438,10 @@ int main(int argc, char** argv) {
numberOfUpdatesThisEpoch = 0;
// Sum up weights over one epoch, final average uses weights from last epoch
if (!accumulateWeights) {
if (!accumulateWeights)
cumulativeWeights.ZeroAll();
}
delayedWeightUpdates.ZeroAll();
// number of weight dumps this epoch
size_t weightEpochDump = 0;
@@ -495,7 +505,8 @@ int main(int argc, char** argv) {
}
}
size_t reference_length = decoder->getReferenceLength(*sid);
size_t ref_length;
float avg_ref_length;
if (hope_fear || perceptron_update) {
// HOPE
cerr << "Rank " << rank << ", epoch " << epoch << ", " << hope_n << "best hope translations" << endl;
@@ -504,8 +515,11 @@ int main(int argc, char** argv) {
distinctNbest, rank, epoch);
size_t current_input_length = decoder->getCurrentInputLength();
decoder->cleanup();
float hope_length_ratio = (float)oracle.size()/reference_length;
ref_length = decoder->getClosestReferenceLength(*sid, oracle.size());
avg_ref_length = ref_length;
float hope_length_ratio = (float)oracle.size()/ref_length;
cerr << ", l-ratio hope: " << hope_length_ratio << endl;
cerr << "Rank " << rank << ", epoch " << epoch << ", current input length: " << current_input_length << endl;
vector<const Word*> bestModel;
if (historyOf1best || stabiliseLength) {
@@ -516,8 +530,9 @@ int main(int argc, char** argv) {
distinctNbest, rank, epoch);
decoder->cleanup();
cerr << endl;
ref_length = decoder->getClosestReferenceLength(*sid, bestModel.size());
dev_hypothesis_length += bestModel.size();
dev_reference_length += reference_length;
dev_reference_length += ref_length;
}
// FEAR
@@ -526,7 +541,10 @@ int main(int argc, char** argv) {
featureValuesFear[batchPosition], bleuScoresFear[batchPosition], true,
distinctNbest, rank, epoch);
decoder->cleanup();
float fear_length_ratio = (float)fear.size()/reference_length;
ref_length = decoder->getClosestReferenceLength(*sid, fear.size());
avg_ref_length += ref_length;
avg_ref_length /= 2;
float fear_length_ratio = (float)fear.size()/ref_length;
cerr << ", l-ratio fear: " << fear_length_ratio << endl;
for (size_t i = 0; i < fear.size(); ++i) {
delete fear[i];
@@ -537,11 +555,12 @@ int main(int argc, char** argv) {
float length_diff_fear = abs(1 - fear_length_ratio);
size_t length_diff_hope_fear = abs((int)oracle.size() - (int)fear.size());
cerr << "Rank " << rank << ", epoch " << epoch << ", abs-length hope-fear: " << length_diff_hope_fear << ", BLEU hope-fear: " << bleuScoresHope[batchPosition][0] - bleuScoresFear[batchPosition][0] << endl;
bool skip = false;
if (max_length_dev_reference != -1 && (length_diff_hope > max_length_dev_reference || length_diff_fear > max_length_dev_reference))
if (max_length_dev_hypos != -1 && (length_diff_hope_fear > avg_ref_length * max_length_dev_hypos))
skip = true;
if (max_length_dev_hypos != -1 && (length_diff_hope_fear > reference_length * max_length_dev_hypos))
if (max_length_dev_hope_ref != -1 && length_diff_hope > max_length_dev_hope_ref)
skip = true;
if (max_length_dev_fear_ref != -1 && length_diff_fear > max_length_dev_fear_ref)
skip = true;
if (skip) {
cerr << "Rank " << rank << ", epoch " << epoch << ", skip example (" << hope_length_ratio << ", " << fear_length_ratio << ", " << length_diff_hope_fear << ").. " << endl;
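To make the three checks concrete with illustrative numbers (not taken from the commit): with avg_ref_length = 20 and --max-length-dev-hypos 0.2, an example is skipped once hope and fear differ by more than 20 * 0.2 = 4 words; the reference-relative checks instead compare |1 - hypo_length/ref_length| directly against their thresholds, so --max-length-dev-hope-ref 0.1 skips examples whose hope translation deviates from the closest reference length by more than 10%.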
@@ -579,7 +598,8 @@ int main(int argc, char** argv) {
ref_ids.push_back(*sid);
decoder->cleanup();
oracles.push_back(oracle);
float hope_length_ratio = (float)oracle.size()/reference_length;
ref_length = decoder->getClosestReferenceLength(*sid, oracle.size());
float hope_length_ratio = (float)oracle.size()/ref_length;
cerr << ", l-ratio hope: " << hope_length_ratio << endl;
oracleFeatureValues.push_back(featureValues[batchPosition][oraclePos]);
@@ -592,11 +612,12 @@ int main(int argc, char** argv) {
distinctNbest, rank, epoch);
decoder->cleanup();
oneBests.push_back(bestModel);
float model_length_ratio = (float)bestModel.size()/reference_length;
ref_length = decoder->getClosestReferenceLength(*sid, bestModel.size());
float model_length_ratio = (float)bestModel.size()/ref_length;
cerr << ", l-ratio model: " << model_length_ratio << endl;
if (stabiliseLength) {
dev_hypothesis_length += bestModel.size();
dev_reference_length += reference_length;
dev_reference_length += ref_length;
}
// FEAR
@@ -606,7 +627,8 @@ int main(int argc, char** argv) {
featureValues[batchPosition], bleuScores[batchPosition], true,
distinctNbest, rank, epoch);
decoder->cleanup();
float fear_length_ratio = (float)fear.size()/reference_length;
ref_length = decoder->getClosestReferenceLength(*sid, fear.size());
float fear_length_ratio = (float)fear.size()/ref_length;
cerr << ", l-ratio fear: " << fear_length_ratio << endl;
for (size_t i = 0; i < fear.size(); ++i) {
delete fear[i];
@@ -649,10 +671,9 @@ int main(int argc, char** argv) {
iter = featureFunctions.begin();
for (; iter != featureFunctions.end(); ++iter) {
if ((*iter)->GetScoreProducerWeightShortName() == "w") {
ignoreWPFeature(featureValues, (*iter));
ignoreWPFeature(featureValuesHope, (*iter));
ignoreWPFeature(featureValuesFear, (*iter));
break;
ignoreFeature(featureValues, (*iter));
ignoreFeature(featureValuesHope, (*iter));
ignoreFeature(featureValuesFear, (*iter));
}
}
}
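Alongside the rename, the loop above also drops the break statement, so every score producer whose weight short name is "w" now has its values zeroed rather than only the first match. An illustrative call, with wordPenaltyProducer standing in for one of the iterated feature functions:

// zero the word-penalty dimension in all stored hope n-best feature vectors (sketch)
ignoreFeature(featureValuesHope, wordPenaltyProducer);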
@@ -696,8 +717,16 @@ int main(int argc, char** argv) {
featureValuesHope, featureValuesFear, dummy1, dummy1, learning_rate, rank, epoch);
}
else if (hope_fear) {
update_status = optimiser->updateWeightsHopeFear(mosesWeights, weightUpdate,
featureValuesHope, featureValuesFear, bleuScoresHope, bleuScoresFear, learning_rate, rank, epoch);
if (bleuScoresHope[0][0] >= min_oracle_bleu)
if (hope_n == 1 && fear_n ==1)
update_status = ((MiraOptimiser*) optimiser)->updateWeightsAnalytically(mosesWeights, weightUpdate,
featureValuesHope[0][0], featureValuesFear[0][0], bleuScoresHope[0][0], bleuScoresFear[0][0],
learning_rate, rank, epoch);
else
update_status = optimiser->updateWeightsHopeFear(mosesWeights, weightUpdate,
featureValuesHope, featureValuesFear, bleuScoresHope, bleuScoresFear, learning_rate, rank, epoch);
else
update_status = -1;
}
else {
// model_hope_fear
@@ -709,31 +738,34 @@ int main(int argc, char** argv) {
if (update_status == 0) { // if weights were updated
// apply weight update
mosesWeights.PlusEquals(weightUpdate);
if (normaliseWeights) {
mosesWeights.L1Normalise();
if (delayUpdates) {
delayedWeightUpdates.PlusEquals(weightUpdate);
cerr << "\nRank " << rank << ", epoch " << epoch << ", keeping update: " << weightUpdate << endl;
++numberOfUpdatesThisEpoch;
}
else {
mosesWeights.PlusEquals(weightUpdate);
if (normaliseWeights)
mosesWeights.L1Normalise();
cumulativeWeights.PlusEquals(mosesWeights);
++numberOfUpdates;
++numberOfUpdatesThisEpoch;
if (averageWeights) {
ScoreComponentCollection averageWeights(cumulativeWeights);
if (accumulateWeights) {
averageWeights.DivideEquals(numberOfUpdates);
} else {
averageWeights.DivideEquals(numberOfUpdatesThisEpoch);
cumulativeWeights.PlusEquals(mosesWeights);
++numberOfUpdates;
++numberOfUpdatesThisEpoch;
if (averageWeights) {
ScoreComponentCollection averageWeights(cumulativeWeights);
if (accumulateWeights) {
averageWeights.DivideEquals(numberOfUpdates);
} else {
averageWeights.DivideEquals(numberOfUpdatesThisEpoch);
}
mosesWeights = averageWeights;
}
mosesWeights = averageWeights;
if (!delayUpdates)
// set new Moses weights
decoder->setWeights(mosesWeights);
}
if (delayUpdates)
delayedWeightUpdates.PlusEquals(weightUpdate);
else
// set new Moses weights
decoder->setWeights(mosesWeights);
}
// update history (for approximate document Bleu)
@@ -788,7 +820,7 @@ int main(int argc, char** argv) {
} // end mixing
// Dump weights?
if (evaluateModulo(shardPosition, dumping_base, actualBatchSize)) {
if (!delayUpdates && evaluateModulo(shardPosition, dumping_base, actualBatchSize)) {
ScoreComponentCollection tmpAverageWeights(cumulativeWeights);
bool proceed = false;
if (accumulateWeights) {
@@ -817,25 +849,25 @@ int main(int argc, char** argv) {
// normalise weights after averaging
if (normaliseWeights) {
mixedAverageWeights.L1Normalise();
mixedAverageWeights.L1Normalise();
}
// dump final average weights
ostringstream filename;
if (epoch < 10) {
filename << weightDumpStem << "_0" << epoch;
filename << weightDumpStem << "_0" << epoch;
} else {
filename << weightDumpStem << "_" << epoch;
filename << weightDumpStem << "_" << epoch;
}
if (weightDumpFrequency > 1) {
filename << "_" << weightEpochDump;
filename << "_" << weightEpochDump;
}
if (accumulateWeights) {
cerr << "\nMixed average weights (cumulative) during epoch " << epoch << ": " << mixedAverageWeights << endl;
cerr << "\nMixed average weights (cumulative) during epoch " << epoch << ": " << mixedAverageWeights << endl;
} else {
cerr << "\nMixed average weights during epoch " << epoch << ": " << mixedAverageWeights << endl;
cerr << "\nMixed average weights during epoch " << epoch << ": " << mixedAverageWeights << endl;
}
cerr << "Dumping mixed average weights during epoch " << epoch << " to " << filename.str() << endl << endl;
@@ -847,12 +879,102 @@ int main(int argc, char** argv) {
} // end of shard loop, end of this epoch
if (correctScaling && epoch == 0) {
float averageRatio = ((MiraOptimiser*) optimiser)->getSumRatios();
averageRatio /= numberOfUpdatesThisEpoch;
cerr << "Rank " << rank << ", epoch " << epoch << ", average ratio: " << averageRatio << endl;
float correctionFactor = 0.9;
float mixedAverageRatio = 0;
float *sendbuf_float, *recvbuf_float;
sendbuf_float = (float *) malloc(sizeof(float));
recvbuf_float = (float *) malloc(sizeof(float));
#ifdef MPI_ENABLE
// average across processes
// mpi::reduce(world, averageRatio, mixedAverageRatio, SCCPlus(), 0);
sendbuf_float[0] = averageRatio;
recvbuf_float[0] = 0;
MPI_Reduce(sendbuf_float, recvbuf_float, 1, MPI_FLOAT, MPI_SUM, 0, world);
mixedAverageRatio = recvbuf_float[0];
if (rank == 0) {
mixedAverageRatio /= size;
mixedAverageRatio *= correctionFactor;
mpi::broadcast(world, mixedAverageRatio, 0);
}
#endif
#ifndef MPI_ENABLE
mixedAverageRatio = averageRatio;
mixedAverageRatio *= correctionFactor;
#endif
decoder->setCorrection(mixedAverageRatio);
cerr << "Rank " << rank << ", epoch " << epoch << ", setting scaling correction to " << mixedAverageRatio << "." << endl;
decoder->setWeights(initialWeights);
cerr << "Rank " << rank << ", epoch " << epoch << ", resetting decoder weights to initial weights." << endl;
}
if (delayUpdates) {
// apply all updates from this epoch to the weight vector
ScoreComponentCollection mosesWeights = decoder->getWeights();
cerr << "Rank " << rank << ", epoch " << epoch << ", delayed update, old moses weights: " << mosesWeights << endl;
mosesWeights.PlusEquals(delayedWeightUpdates);
cumulativeWeights.PlusEquals(mosesWeights);
decoder->setWeights(mosesWeights);
cerr << "Rank " << rank << ", epoch " << epoch << ", delayed update, new moses weights: " << mosesWeights << endl;
ScoreComponentCollection tmpAverageWeights(cumulativeWeights);
bool proceed = false;
if (accumulateWeights) {
if (numberOfUpdatesThisEpoch > 0) {
tmpAverageWeights.DivideEquals(epoch+1);
proceed = true;
}
}
else {
if (numberOfUpdatesThisEpoch > 0)
proceed = true;
}
if (proceed) {
#ifdef MPI_ENABLE
// average across processes
mpi::reduce(world, tmpAverageWeights, mixedAverageWeights, SCCPlus(), 0);
#endif
#ifndef MPI_ENABLE
mixedAverageWeights = tmpAverageWeights;
#endif
if (rank == 0 && !weightDumpStem.empty()) {
// divide by number of processes
mixedAverageWeights.DivideEquals(size);
// normalise weights after averaging
if (normaliseWeights) {
mixedAverageWeights.L1Normalise();
}
// dump final average weights
ostringstream filename;
if (epoch < 10) {
filename << weightDumpStem << "_0" << epoch;
} else {
filename << weightDumpStem << "_" << epoch;
}
if (weightDumpFrequency > 1) {
filename << "_" << weightEpochDump;
}
if (accumulateWeights) {
cerr << "\nMixed average weights (cumulative) during epoch " << epoch << ": " << mixedAverageWeights << endl;
} else {
cerr << "\nMixed average weights during epoch " << epoch << ": " << mixedAverageWeights << endl;
}
cerr << "Dumping mixed average weights during epoch " << epoch << " to " << filename.str() << endl << endl;
mixedAverageWeights.Save(filename.str());
++weightEpochDump;
}
}
}
if (stabiliseLength && !fixLength) {
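Stepping back to the correct-scaling block above: MPI reductions and broadcasts are collective operations, so the conventional pattern has every rank enter both calls, whereas the committed code invokes mpi::broadcast only on rank 0. A minimal sketch of the usual reduce-then-broadcast shape, assuming Boost.MPI as already used elsewhere in this file (an illustration, not the committed code):

#ifdef MPI_ENABLE
float mixedAverageRatio = 0;
// sum the per-process ratios onto rank 0
mpi::reduce(world, averageRatio, mixedAverageRatio, std::plus<float>(), 0);
if (rank == 0)
    mixedAverageRatio = mixedAverageRatio / size * correctionFactor;
// collective: every rank calls this, receiving rank 0's value
mpi::broadcast(world, mixedAverageRatio, 0);
#endif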
@@ -1050,10 +1172,10 @@ void ignoreCoreFeatures(vector<vector<ScoreComponentCollection> > &featureValues
}
}
void ignoreWPFeature(vector<vector<ScoreComponentCollection> > &featureValues, const ScoreProducer* sp) {
void ignoreFeature(vector<vector<ScoreComponentCollection> > &featureValues, const ScoreProducer* sp) {
for (size_t i = 0; i < featureValues.size(); ++i)
for (size_t j = 0; j < featureValues[i].size(); ++j)
// set WP feature to 0
// set feature to 0
featureValues[i][j].Assign(sp, 0);
}

mira/Main.h

@@ -47,7 +47,7 @@ bool loadWeights(const std::string& filename, StrFloatMap& coreWeightMap);
bool evaluateModulo(size_t shard_position, size_t mix_or_dump_base, size_t actual_batch_size);
void printFeatureValues(std::vector<std::vector<Moses::ScoreComponentCollection> > &featureValues);
void ignoreCoreFeatures(std::vector<std::vector<Moses::ScoreComponentCollection> > &featureValues, StrFloatMap &coreWeightMap);
void ignoreWPFeature(std::vector<std::vector<Moses::ScoreComponentCollection> > &featureValues, const Moses::ScoreProducer* sp);
void ignoreFeature(std::vector<std::vector<Moses::ScoreComponentCollection> > &featureValues, const Moses::ScoreProducer* sp);
void takeLogs(std::vector<std::vector<Moses::ScoreComponentCollection> > &featureValues, size_t base);
void deleteTranslations(std::vector<std::vector<const Moses::Word*> > &translations);

mira/MiraOptimiser.cpp

@@ -107,7 +107,7 @@ size_t MiraOptimiser::updateWeights(
// * w' = w' + SUM alpha_i * (h_i(oracle) - h_i(hypothesis))
for (size_t k = 0; k < featureValueDiffs.size(); ++k) {
float alpha = alphas[k];
VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", alpha: " << alpha << endl);
cerr << "Rank " << rank << ", epoch " << epoch << ", alpha: " << alpha << endl;
ScoreComponentCollection update(featureValueDiffs[k]);
update.MultiplyEquals(alpha);
@@ -233,7 +233,7 @@ size_t MiraOptimiser::updateWeightsHopeFear(
addConstraint = false;
}
float lossMinusModelScoreDiff = loss - modelScoreDiff;
float lossMinusModelScoreDiff = loss - (modelScoreDiff + m_margin_slack);
if (addConstraint) {
featureValueDiffs.push_back(featureValueDiff);
lossMinusModelScoreDiffs.push_back(lossMinusModelScoreDiff);
@@ -264,7 +264,7 @@ size_t MiraOptimiser::updateWeightsHopeFear(
// * w' = w' + SUM alpha_i * (h_i(oracle) - h_i(hypothesis))
for (size_t k = 0; k < featureValueDiffs.size(); ++k) {
float alpha = alphas[k];
VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", alpha: " << alpha << endl);
cerr << "Rank " << rank << ", epoch " << epoch << ", alpha: " << alpha << endl;
ScoreComponentCollection update(featureValueDiffs[k]);
update.MultiplyEquals(alpha);
@@ -321,5 +321,96 @@ size_t MiraOptimiser::updateWeightsHopeFear(
return 0;
}
size_t MiraOptimiser::updateWeightsAnalytically(
ScoreComponentCollection& currWeights,
ScoreComponentCollection& weightUpdate,
ScoreComponentCollection& featureValuesHope,
ScoreComponentCollection& featureValuesFear,
float bleuScoreHope,
float bleuScoreFear,
float learning_rate,
size_t rank,
size_t epoch) {
float epsilon = 0.0001;
float oldDistanceFromOptimum = 0;
bool constraintViolatedBefore = false;
// cerr << "Rank " << rank << ", epoch " << epoch << ", hope: " << featureValuesHope << endl;
// cerr << "Rank " << rank << ", epoch " << epoch << ", fear: " << featureValuesFear << endl;
ScoreComponentCollection featureValueDiff = featureValuesHope;
featureValueDiff.MinusEquals(featureValuesFear);
cerr << "Rank " << rank << ", epoch " << epoch << ", hope - fear: " << featureValueDiff << endl;
float modelScoreDiff = featureValueDiff.InnerProduct(currWeights);
float loss = bleuScoreHope - bleuScoreFear;
float diff = 0;
float ratio = (modelScoreDiff == 0) ? 0 : loss / modelScoreDiff;
cerr << "Rank " << rank << ", epoch " << epoch << ", ratio model/loss: " << ratio << endl;
if (loss > (modelScoreDiff + m_margin_slack)) {
diff = loss - (modelScoreDiff + m_margin_slack);
}
cerr << "Rank " << rank << ", epoch " << epoch << ", constraint: " << modelScoreDiff << " + " << m_margin_slack << " >= " << loss << " (current violation: " << diff << ")" << endl;
if (epoch == 0)
m_sum_ratios += ratio;
if (diff > epsilon) {
// constraint violated
oldDistanceFromOptimum += diff;
constraintViolatedBefore = true;
// compute alpha for given constraint: (loss - model score diff) / || feature value diff ||^2
// featureValueDiff.GetL2Norm() * featureValueDiff.GetL2Norm() == featureValueDiff.InnerProduct(featureValueDiff)
// from Crammer&Singer 2006: alpha = min {C , l_t/ ||x||^2}
float squaredNorm = featureValueDiff.GetL2Norm() * featureValueDiff.GetL2Norm();
if (squaredNorm > 0) {
float alpha = diff / squaredNorm;
if (m_slack > 0 ) {
if (alpha > m_slack) {
alpha = m_slack;
}
else if (alpha < m_slack*(-1)) {
alpha = m_slack*(-1);
}
}
cerr << "Rank " << rank << ", epoch " << epoch << ", alpha: " << alpha << endl;
featureValueDiff.MultiplyEquals(alpha);
weightUpdate.PlusEquals(featureValueDiff);
}
else {
VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", no update because squared norm is 0" << endl);
}
}
if (!constraintViolatedBefore) {
// constraint satisfied, nothing to do
cerr << "Rank " << rank << ", epoch " << epoch << ", constraint already satisfied" << endl;
return 1;
}
// sanity check: constraint still violated after optimisation?
/* ScoreComponentCollection newWeights(currWeights);
newWeights.PlusEquals(weightUpdate);
bool constraintViolatedAfter = false;
float newDistanceFromOptimum = 0;
featureValueDiff = featureValuesHope;
featureValueDiff.MinusEquals(featureValuesFear);
modelScoreDiff = featureValueDiff.InnerProduct(newWeights);
diff = loss - (modelScoreDiff + m_margin_slack);
// approximate comparison between floats!
if (diff > epsilon) {
constraintViolatedAfter = true;
newDistanceFromOptimum += (loss - modelScoreDiff);
}
VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", check, constraint violated before? " << constraintViolatedBefore << ", after? " << constraintViolatedAfter << endl);
VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", check, error before: " << oldDistanceFromOptimum << ", after: " << newDistanceFromOptimum << ", change: " << oldDistanceFromOptimum - newDistanceFromOptimum << endl);
*/
return 0;
}
}

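The updateWeightsAnalytically method added above solves the single-constraint MIRA problem in closed form. In the notation of the code, with \(\Delta h = h(\text{hope}) - h(\text{fear})\), \(\ell = \mathrm{BLEU}(\text{hope}) - \mathrm{BLEU}(\text{fear})\), margin slack \(m\) (m_margin_slack) and cap \(C\) (m_slack), the step is

\[ \alpha = \min\!\left(C,\; \frac{\big[\ell - (w^{\top}\Delta h + m)\big]_{+}}{\lVert \Delta h \rVert^{2}}\right), \qquad w \leftarrow w + \alpha\,\Delta h, \]

where \([x]_{+} = \max(0, x)\); this is the Crammer and Singer (2006) update referenced in the comments, applied to a single hope/fear pair.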
mira/Optimiser.h

@@ -67,7 +67,8 @@ namespace Mira {
m_slack(slack),
m_scale_margin(scale_margin),
m_scale_update(scale_update),
m_margin_slack(margin_slack) { }
m_margin_slack(margin_slack),
m_sum_ratios(0) { }
size_t updateWeights(Moses::ScoreComponentCollection& currWeights,
Moses::ScoreComponentCollection& weightUpdate,
@@ -88,6 +89,15 @@ namespace Mira {
float learning_rate,
size_t rank,
size_t epoch);
size_t updateWeightsAnalytically(Moses::ScoreComponentCollection& currWeights,
Moses::ScoreComponentCollection& weightUpdate,
Moses::ScoreComponentCollection& featureValuesHope,
Moses::ScoreComponentCollection& featureValuesFear,
float bleuScoresHope,
float bleuScoresFear,
float learning_rate,
size_t rank,
size_t epoch);
void setSlack(float slack) {
m_slack = slack;
@@ -97,6 +107,10 @@ namespace Mira {
m_margin_slack = margin_slack;
}
float getSumRatios() {
return m_sum_ratios;
}
private:
// add only violated constraints to the optimisation problem
@@ -112,6 +126,9 @@ namespace Mira {
// scale update with log 10 of oracle BLEU score
size_t m_scale_update;
// collect (loss diff/model score diff) ratios from first epoch
float m_sum_ratios;
};
}

moses-chart-cmd/src/IOWrapper.cpp

@@ -367,6 +367,12 @@ void IOWrapper::OutputNBestList(const ChartTrellisPathList &nBestList, const Cha
std::string lastName = "";
// output stateful sparse features
const vector<const StatefulFeatureFunction*>& sff = system->GetStatefulFeatureFunctions();
for( size_t i=0; i<sff.size(); i++ )
if (sff[i]->GetNumScoreComponents() == ScoreProducer::unlimited)
OutputSparseFeatureScores( out, path, sff[i], lastName );
// translation components
const vector<PhraseDictionaryFeature*>& pds = system->GetPhraseDictionaries();
if (pds.size() > 0) {
@@ -408,19 +414,14 @@
}
}
// output sparse features
// output stateless sparse features
lastName = "";
const vector<const StatefulFeatureFunction*>& sff = system->GetStatefulFeatureFunctions();
for( size_t i=0; i<sff.size(); i++ )
if (sff[i]->GetNumScoreComponents() == ScoreProducer::unlimited)
OutputSparseFeatureScores( out, path, sff[i], lastName );
const vector<const StatelessFeatureFunction*>& slf = system->GetStatelessFeatureFunctions();
for( size_t i=0; i<slf.size(); i++ )
if (sff[i]->GetNumScoreComponents() == ScoreProducer::unlimited)
OutputSparseFeatureScores( out, path, slf[i], lastName );
// total
out << " ||| " << path.GetTotalScore();

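One detail worth flagging in the relocated stateless block above: the loop iterates over slf but tests sff[i]->GetNumScoreComponents(); presumably slf[i] was intended. A corrected sketch of that loop (an illustration, not the committed code):

for( size_t i=0; i<slf.size(); i++ )
if (slf[i]->GetNumScoreComponents() == ScoreProducer::unlimited)
OutputSparseFeatureScores( out, path, slf[i], lastName );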
moses-chart-cmd/src/Main.cpp

@@ -173,17 +173,13 @@ static void PrintFeatureWeight(const FeatureFunction* ff)
<< ff->GetScoreProducerWeightShortName() << " "
<< values[i] << endl;
}
}
static void PrintSparseFeatureWeight(const FeatureFunction* ff)
{
if (ff->GetNumScoreComponents() == ScoreProducer::unlimited) {
if (ff->GetSparseProducerWeight() == 1)
cout << ff->GetScoreProducerDescription() << " " <<
ff->GetScoreProducerWeightShortName() << " sparse" << endl;
else
cout << ff->GetScoreProducerDescription() << " " <<
ff->GetScoreProducerWeightShortName() << " " << ff->GetSparseProducerWeight() << endl;
else {
if (ff->GetSparseProducerWeight() == 1)
cout << ff->GetScoreProducerDescription() << " " <<
ff->GetScoreProducerWeightShortName() << " sparse" << endl;
else
cout << ff->GetScoreProducerDescription() << " " <<
ff->GetScoreProducerWeightShortName() << " " << ff->GetSparseProducerWeight() << endl;
}
}
@@ -208,9 +204,6 @@ static void ShowWeights()
for (size_t i = 0; i < slf.size(); ++i) {
PrintFeatureWeight(slf[i]);
}
for (size_t i = 0; i < sff.size(); ++i) {
PrintSparseFeatureWeight(sff[i]);
}
}

moses-cmd/src/Main.cpp

@@ -295,17 +295,13 @@ static void PrintFeatureWeight(const FeatureFunction* ff)
<< ff->GetScoreProducerWeightShortName() << " "
<< values[i] << endl;
}
}
static void PrintSparseFeatureWeight(const FeatureFunction* ff)
{
if (ff->GetNumScoreComponents() == ScoreProducer::unlimited) {
if (ff->GetSparseProducerWeight() == 1)
cout << ff->GetScoreProducerDescription() << " " <<
ff->GetScoreProducerWeightShortName() << " sparse" << endl;
else
cout << ff->GetScoreProducerDescription() << " " <<
ff->GetScoreProducerWeightShortName() << " " << ff->GetSparseProducerWeight() << endl;
else {
if (ff->GetSparseProducerWeight() == 1)
cout << ff->GetScoreProducerDescription() << " " <<
ff->GetScoreProducerWeightShortName() << " sparse" << endl;
else
cout << ff->GetScoreProducerDescription() << " " <<
ff->GetScoreProducerWeightShortName() << " " << ff->GetSparseProducerWeight() << endl;
}
}
@@ -322,7 +318,7 @@ static void ShowWeights()
PrintFeatureWeight(sff[i]);
}
for (size_t i = 0; i < slf.size(); ++i) {
PrintFeatureWeight(slf[i]);
PrintFeatureWeight(slf[i]);
}
for (size_t i = 0; i < pds.size(); ++i) {
PrintFeatureWeight(pds[i]);
@@ -330,9 +326,6 @@
for (size_t i = 0; i < gds.size(); ++i) {
PrintFeatureWeight(gds[i]);
}
for (size_t i = 0; i < sff.size(); ++i) {
PrintSparseFeatureWeight(sff[i]);
}
}
/** main function of the command line version of the decoder **/

moses/src/BleuScoreFeature.cpp

@@ -98,37 +98,47 @@ void BleuScoreFeature::SetBleuParameters(bool scaleByInputLength, bool scaleByRe
void BleuScoreFeature::LoadReferences(const std::vector< std::vector< std::string > >& refs)
{
m_refs.clear();
FactorCollection& fc = FactorCollection::Instance();
cerr << "Number of reference files: " << refs.size() << endl;
for (size_t file_id = 0; file_id < refs.size(); file_id++) {
for (size_t ref_id = 0; ref_id < refs[file_id].size(); ref_id++) {
const string& ref = refs[file_id][ref_id];
vector<string> refTokens = Tokenize(ref);
m_refs[ref_id] = pair<size_t,NGrams>();
pair<size_t,NGrams>& ref_pair = m_refs[ref_id];
ref_pair.first = refTokens.size();
for (size_t order = 1; order <= BleuScoreState::bleu_order; order++) {
for (size_t end_idx = order; end_idx <= refTokens.size(); end_idx++) {
Phrase ngram(1);
for (size_t s_idx = end_idx - order; s_idx < end_idx; s_idx++) {
const Factor* f = fc.AddFactor(Output, 0, refTokens[s_idx]);
Word w;
w.SetFactor(0, f);
ngram.AddWord(w);
}
ref_pair.second[ngram] += 1;
}
}
}
}
FactorCollection& fc = FactorCollection::Instance();
for (size_t file_id = 0; file_id < refs.size(); file_id++) {
for (size_t ref_id = 0; ref_id < refs[file_id].size(); ref_id++) {
const string& ref = refs[file_id][ref_id];
vector<string> refTokens = Tokenize(ref);
if (file_id == 0)
m_refs[ref_id] = pair<vector<size_t>,NGrams>();
pair<vector<size_t>,NGrams>& ref_pair = m_refs[ref_id];
(ref_pair.first).push_back(refTokens.size());
for (size_t order = 1; order <= BleuScoreState::bleu_order; order++) {
for (size_t end_idx = order; end_idx <= refTokens.size(); end_idx++) {
Phrase ngram(1);
for (size_t s_idx = end_idx - order; s_idx < end_idx; s_idx++) {
const Factor* f = fc.AddFactor(Output, 0, refTokens[s_idx]);
Word w;
w.SetFactor(0, f);
ngram.AddWord(w);
}
ref_pair.second[ngram] += 1;
}
}
}
}
// for (size_t i = 0; i < m_refs.size(); ++i) {
// cerr << "ref id " << i << ", number of entries: " << (m_refs[i].first).size() << endl;
// }
}
void BleuScoreFeature::SetCurrentSourceLength(size_t source_length) {
m_cur_source_length = source_length;
}
void BleuScoreFeature::SetCurrentReference(size_t ref_id) {
m_cur_ref_length = m_refs[ref_id].first;
void BleuScoreFeature::SetCurrentShortestReference(size_t ref_id) {
// look for shortest reference
int shortestRef = -1;
for (size_t i = 0; i < (m_refs[ref_id].first).size(); ++i) {
if (shortestRef == -1 || (m_refs[ref_id].first)[i] < shortestRef)
shortestRef = (m_refs[ref_id].first)[i];
}
m_cur_ref_length = shortestRef;
m_cur_ref_ngrams = m_refs[ref_id].second;
}
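After this change each sentence id in m_refs carries one length per reference file plus an n-gram table pooled over all of its references, and SetCurrentShortestReference picks the smallest of those lengths for the brevity side of the feature. Schematically, with illustrative values:

// m_refs : unordered_map<size_t, pair<vector<size_t>, NGrams> >
// m_refs[sid].first   one length per reference file, e.g. {18, 20, 25}
// m_refs[sid].second  n-gram counts pooled over all references of sid
// SetCurrentShortestReference(sid) then sets m_cur_ref_length to 18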
@@ -163,15 +173,16 @@ void BleuScoreFeature::UpdateHistory(const vector< const Word* >& hypo) {
* Update history with a batch of translations
*/
void BleuScoreFeature::UpdateHistory(const vector< vector< const Word* > >& hypos, vector<size_t>& sourceLengths, vector<size_t>& ref_ids, size_t rank, size_t epoch) {
for (size_t batchPosition = 0; batchPosition < hypos.size(); ++batchPosition){
Phrase phrase(hypos[batchPosition]);
for (size_t ref_id = 0; ref_id < hypos.size(); ++ref_id){
Phrase phrase(hypos[ref_id]);
std::vector< size_t > ngram_counts(BleuScoreState::bleu_order);
std::vector< size_t > ngram_matches(BleuScoreState::bleu_order);
// set current source and reference information for each oracle in the batch
size_t cur_source_length = sourceLengths[batchPosition];
size_t cur_ref_length = m_refs[ref_ids[batchPosition]].first;
NGrams cur_ref_ngrams = m_refs[ref_ids[batchPosition]].second;
size_t cur_source_length = sourceLengths[ref_id];
size_t hypo_length = hypos[ref_id].size();
size_t cur_ref_length = GetClosestReferenceLength(ref_ids[ref_id], hypo_length);
NGrams cur_ref_ngrams = m_refs[ref_ids[ref_id]].second;
cerr << "reference length: " << cur_ref_length << endl;
// compute vector c(e;{r_k}):
@@ -184,7 +195,7 @@ void BleuScoreFeature::UpdateHistory(const vector< vector< const Word* > >& hypo
m_match_history[i] += ngram_matches[i];
// do this for last position in batch
if (batchPosition == hypos.size() - 1) {
if (ref_id == hypos.size() - 1) {
m_count_history[i] *= m_historySmoothing;
m_match_history[i] *= m_historySmoothing;
}
@@ -192,11 +203,11 @@ void BleuScoreFeature::UpdateHistory(const vector< vector< const Word* > >& hypo
// update counts for reference and target length
m_source_length_history += cur_source_length;
m_target_length_history += hypos[batchPosition].size();
m_target_length_history += hypos[ref_id].size();
m_ref_length_history += cur_ref_length;
// do this for last position in batch
if (batchPosition == hypos.size() - 1) {
if (ref_id == hypos.size() - 1) {
cerr << "Rank " << rank << ", epoch " << epoch << " ,source length history: " << m_source_length_history << " --> " << m_source_length_history * m_historySmoothing << endl;
cerr << "Rank " << rank << ", epoch " << epoch << " ,target length history: " << m_target_length_history << " --> " << m_target_length_history * m_historySmoothing << endl;
m_source_length_history *= m_historySmoothing;
@@ -209,15 +220,24 @@ void BleuScoreFeature::UpdateHistory(const vector< vector< const Word* > >& hypo
/*
* Print batch of reference translations
*/
void BleuScoreFeature::PrintReferenceLength(const vector<size_t>& ref_ids) {
for (size_t batchPosition = 0; batchPosition < ref_ids.size(); ++batchPosition){
size_t cur_ref_length = m_refs[ref_ids[batchPosition]].first;
/*void BleuScoreFeature::PrintReferenceLength(const vector<size_t>& ref_ids) {
for (size_t ref_id = 0; ref_id < ref_ids.size(); ++ref_id){
size_t cur_ref_length = (m_refs[ref_ids[ref_id]].first)[0]; // TODO!!
cerr << "reference length: " << cur_ref_length << endl;
}
}
}*/
size_t BleuScoreFeature::GetReferenceLength(size_t ref_id) {
size_t cur_ref_length = m_refs[ref_id].first;
size_t BleuScoreFeature::GetClosestReferenceLength(size_t ref_id, int hypoLength) {
// look for closest reference
int currentDist = -1;
int closestRef = -1;
for (size_t i = 0; i < (m_refs[ref_id].first).size(); ++i) {
if (closestRef == -1 || abs(hypoLength - (int)(m_refs[ref_id].first)[i]) < currentDist) {
closestRef = (m_refs[ref_id].first)[i];
currentDist = abs(hypoLength - (int)(m_refs[ref_id].first)[i]);
}
}
size_t cur_ref_length = closestRef;
return cur_ref_length;
}
@@ -446,7 +466,7 @@ float BleuScoreFeature::CalculateBleu(BleuScoreState* state) const {
else if (m_scale_by_avg_length) {
precision *= (m_source_length_history + m_ref_length_history + m_cur_source_length + + m_cur_ref_length) / 2;
}
return precision*m_scale_by_x;
return (precision*m_scale_by_x)/m_correction;
}
const FFState* BleuScoreFeature::EmptyHypothesisState(const InputType& input) const

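As a worked example of the closest-length lookup above: with reference lengths {18, 20, 25} and a 19-word hypothesis, both 18 and 20 are one word away; because the comparison uses strict less-than, the earlier candidate wins and GetClosestReferenceLength returns 18 (illustrative numbers).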
moses/src/BleuScoreFeature.h

@@ -44,7 +44,7 @@ class BleuScoreFeature : public StatefulFeatureFunction {
public:
typedef boost::unordered_map< Phrase, size_t > NGrams;
typedef boost::unordered_map<size_t, std::pair<size_t,NGrams> > RefCounts;
typedef boost::unordered_map<size_t, std::pair<std::vector<size_t>,NGrams> > RefCounts;
typedef boost::unordered_map<size_t, NGrams> Matches;
BleuScoreFeature():
@@ -53,14 +53,15 @@ public:
m_match_history(BleuScoreState::bleu_order),
m_source_length_history(0),
m_target_length_history(0),
m_ref_length_history(0),
m_scale_by_input_length(true),
m_scale_by_ref_length(false),
m_scale_by_avg_length(false),
m_scale_by_x(1),
m_historySmoothing(0.7),
m_smoothing_scheme(PLUS_ONE) {}
m_smoothing_scheme(PLUS_ONE),
m_relax_BP(1),
m_correction(1) {}
std::string GetScoreProducerDescription() const
{
@@ -75,11 +76,12 @@ public:
void PrintHistory(std::ostream& out) const;
void LoadReferences(const std::vector< std::vector< std::string > > &);
void SetCurrentSourceLength(size_t);
void SetCurrentReference(size_t);
void SetCurrentShortestReference(size_t);
void UpdateHistory(const std::vector< const Word* >&);
void UpdateHistory(const std::vector< std::vector< const Word* > >& hypos, std::vector<size_t>& sourceLengths, std::vector<size_t>& ref_ids, size_t rank, size_t epoch);
void PrintReferenceLength(const std::vector<size_t>& ref_ids);
size_t GetReferenceLength(size_t ref_id);
size_t GetClosestReferenceLength(size_t ref_id, int hypoLength);
void SetBleuParameters(bool scaleByInputLength, bool scaleByRefLength, bool scaleByAvgLength,
bool scaleByTargetLengthLinear, bool scaleByTargetLengthTrend,
float scaleByX, float historySmoothing, size_t scheme, float relaxBP);
@@ -107,6 +109,14 @@ public:
float CalculateBleu(BleuScoreState*) const;
const FFState* EmptyHypothesisState(const InputType&) const;
void SetCorrection(float correction) {
m_correction = correction;
}
float GetCorrection() {
return m_correction;
}
private:
// counts for pseudo-document
std::vector< float > m_count_history;
@@ -145,6 +155,9 @@ private:
// relax application of the BP by setting a value between 0 and 1
float m_relax_BP;
// correct scaling issues
float m_correction;
};
} // Namespace.

moses/src/WordTranslationFeature.cpp

@@ -60,9 +60,9 @@ void WordTranslationFeature::Evaluate(const TargetPhrase& targetPhrase,
if (m_unrestricted || sourceExists || targetExists) {
// construct feature name
stringstream featureName;
featureName << (sourceExists ? sourceWord : "OTHER");
featureName << ((sourceExists||m_unrestricted) ? sourceWord : "OTHER");
featureName << "|";
featureName << (targetExists ? targetWord : "OTHER");
featureName << ((targetExists||m_unrestricted) ? targetWord : "OTHER");
accumulator->PlusEquals(this,featureName.str(),1);
}
}
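The effect of the change above: in unrestricted mode the feature name now uses the actual word pair even when neither word appears in the restricted source/target lists. For example (hypothetical words), a pair maison|house that previously fired as "OTHER|OTHER" now fires as "maison|house", since (sourceExists || m_unrestricted) and (targetExists || m_unrestricted) both hold.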