refactorings, remove burn-in code

git-svn-id: http://svn.statmt.org/repository/mira@3922 cc96ff50-19ce-11e0-b349-13d7f0bd23df
This commit is contained in:
ehasler 2011-06-29 18:06:28 +00:00 committed by Ondrej Bojar
parent 290e38fb73
commit f658840f1a
4 changed files with 58 additions and 260 deletions

View File

@ -97,9 +97,6 @@ int main(int argc, char** argv) {
bool print_feature_values;
bool historyOf1best;
bool historyOfOracles;
bool burnIn;
string burnInInputFile;
vector<string> burnInReferenceFiles;
bool sentenceLevelBleu;
float bleuScoreWeight;
float margin_slack;
@ -118,9 +115,6 @@ int main(int argc, char** argv) {
("base-of-log", po::value<size_t>(&baseOfLog)->default_value(10), "Base for log-ing feature values")
("batch-size,b", po::value<size_t>(&batchSize)->default_value(1), "Size of batch that is send to optimiser for weight adjustments")
("bleu-score-weight", po::value<float>(&bleuScoreWeight)->default_value(1.0), "Bleu score weight used in the decoder objective function (on top of the bleu objective weight)")
("burn-in", po::value<bool>(&burnIn)->default_value(false), "Do a burn-in of the BLEU history before training")
("burn-in-input-file", po::value<string>(&burnInInputFile), "Input file for burn-in phase of BLEU history")
("burn-in-reference-files", po::value<vector<string> >(&burnInReferenceFiles), "Reference file for burn-in phase of BLEU history")
("config,f", po::value<string>(&mosesConfigFile), "Moses ini file")
("core-weights", po::value<string>(&coreWeightFile), "Weight file containing the core weights (already tuned, have to be non-zero)")
("decoder-settings", po::value<string>(&decoder_settings)->default_value(""), "Decoder settings for tuning runs")
@ -307,110 +301,8 @@ int main(int argc, char** argv) {
historyOf1best = true;
}
}
if (burnIn && sentenceLevelBleu) {
burnIn = false;
cerr << "Info: Burn-in not needed when using sentence-level BLEU, deactivating burn-in." << endl;
}
if (burnIn) {
// load burn-in input and references
vector<string> burnInInputSentences;
if (!loadSentences(burnInInputFile, burnInInputSentences)) {
cerr << "Error: Failed to load burn-in input sentences from " << burnInInputFile << endl;
return 1;
}
vector<vector<string> > burnInReferenceSentences(burnInReferenceFiles.size());
for (size_t i = 0; i < burnInReferenceFiles.size(); ++i) {
if (!loadSentences(burnInReferenceFiles[i], burnInReferenceSentences[i])) {
cerr << "Error: Failed to load burn-in reference sentences from "
<< burnInReferenceFiles[i] << endl;
return 1;
}
if (burnInReferenceSentences[i].size() != burnInInputSentences.size()) {
cerr << "Error: Burn-in input file length (" << burnInInputSentences.size() << ") != ("
<< burnInReferenceSentences[i].size() << ") length of burn-in reference file " << i
<< endl;
return 1;
}
}
decoder->loadReferenceSentences(burnInReferenceSentences);
vector<size_t> inputLengths;
vector<size_t> ref_ids;
vector<vector<const Word*> > oracles;
vector<vector<const Word*> > oneBests;
vector<vector<ScoreComponentCollection> > featureValues;
vector<vector<float> > bleuScores;
vector<ScoreComponentCollection> newFeatureValues;
vector<float> newBleuScores;
featureValues.push_back(newFeatureValues);
bleuScores.push_back(newBleuScores);
vector<size_t> order;
for (size_t i = 0; i < burnInInputSentences.size(); ++i) {
order.push_back(i);
}
VERBOSE(1, "Rank " << rank << ", starting burn-in phase for approx. BLEU history.." << endl);
if (historyOf1best) {
// get 1best translations for the burn-in sentences
vector<size_t>::const_iterator sid = order.begin();
while (sid != order.end()) {
string& input = burnInInputSentences[*sid];
vector<const Word*> bestModel = decoder->getNBest(input, *sid, 1, 0.0, bleuScoreWeight,
featureValues[0], bleuScores[0], true,
distinctNbest, rank, -1);
inputLengths.push_back(decoder->getCurrentInputLength());
ref_ids.push_back(*sid);
decoder->cleanup();
oneBests.push_back(bestModel);
++sid;
}
// update history
decoder->updateHistory(oneBests, inputLengths, ref_ids, rank, 0);
// clean up 1best translations after updating history
for (size_t i = 0; i < oracles.size(); ++i) {
for (size_t j = 0; j < oracles[i].size(); ++j) {
delete oracles[i][j];
}
}
}
else {
// get oracle translations for the burn-in sentences
vector<size_t>::const_iterator sid = order.begin();
while (sid != order.end()) {
string& input = burnInInputSentences[*sid];
vector<const Word*> oracle = decoder->getNBest(input, *sid, 1, 1.0, bleuScoreWeight,
featureValues[0], bleuScores[0], true, distinctNbest, rank, -1);
inputLengths.push_back(decoder->getCurrentInputLength());
ref_ids.push_back(*sid);
decoder->cleanup();
oracles.push_back(oracle);
++sid;
}
// update history
decoder->updateHistory(oracles, inputLengths, ref_ids, rank, 0);
// clean up oracle translations after updating history
for (size_t i = 0; i < oracles.size(); ++i) {
for (size_t j = 0; j < oracles[i].size(); ++j) {
delete oracles[i][j];
}
}
}
VERBOSE(1, "Bleu feature history after burn-in: " << endl);
decoder->printBleuFeatureHistory(cerr);
decoder->loadReferenceSentences(referenceSentences);
}
else {
decoder->loadReferenceSentences(referenceSentences);
}
decoder->loadReferenceSentences(referenceSentences);
#ifdef MPI_ENABLE
mpi::broadcast(world, order, 0);
@ -458,16 +350,12 @@ int main(int argc, char** argv) {
bool stop = false;
int sumStillViolatedConstraints;
int sumStillViolatedConstraints_lastEpoch = 0;
int sumConstraintChangeAbs;
int sumConstraintChangeAbs_lastEpoch = 0;
float *sendbuf, *recvbuf;
sendbuf = (float *) malloc(sizeof(float));
recvbuf = (float *) malloc(sizeof(float));
for (size_t epoch = 0; epoch < epochs && !stop; ++epoch) {
// sum of violated constraints
// sum of violated constraints in an epoch
sumStillViolatedConstraints = 0;
sumConstraintChangeAbs = 0;
numberOfUpdatesThisEpoch = 0;
// Sum up weights over one epoch, final average uses weights from last epoch
@ -517,10 +405,6 @@ int main(int argc, char** argv) {
const vector<string>& refs = referenceSentences[*sid];
cerr << "\nRank " << rank << ", epoch " << epoch << ", input sentence " << *sid << ": \"" << input << "\"" << " (batch pos " << batchPosition << ")" << endl;
cerr << "model-hope-fear: " << model_hope_fear << endl;
cerr << "hope-fear: " << hope_fear << endl;
cerr << "perceptron: " << perceptron_update << endl;
vector<ScoreComponentCollection> newFeatureValues;
vector<float> newBleuScores;
if (model_hope_fear) {
@ -634,58 +518,39 @@ int main(int argc, char** argv) {
// set weight for bleu feature to 0
mosesWeights.Assign(featureFunctions.back(), 0);
// take logs of feature values
if (logFeatureValues) {
for (size_t i = 0; i < featureValues.size(); ++i) {
if (hope_fear) {
for (size_t j = 0; j < featureValuesHope[i].size(); ++j) {
featureValuesHope[i][j].ApplyLog(baseOfLog);
}
for (size_t j = 0; j < featureValuesFear[i].size(); ++j) {
featureValuesFear[i][j].ApplyLog(baseOfLog);
}
}
else {
for (size_t j = 0; j < featureValues[i].size(); ++j) {
featureValues[i][j].ApplyLog(baseOfLog);
}
oracleFeatureValues[i].ApplyLog(baseOfLog);
}
takeLogs(featureValuesHope, baseOfLog);
takeLogs(featureValuesFear, baseOfLog);
takeLogs(featureValues, baseOfLog);
for (size_t i = 0; i < oracleFeatureValues.size(); ++i) {
oracleFeatureValues[i].ApplyLog(baseOfLog);
}
}
// optionally print out the feature values
// print out the feature values
if (print_feature_values) {
cerr << "\nRank " << rank << ", epoch " << epoch << ", feature values: " << endl;
if (model_hope_fear) {
for (size_t i = 0; i < featureValues.size(); ++i) {
for (size_t j = 0; j < featureValues[i].size(); ++j) {
cerr << featureValues[i][j] << endl;
}
}
cerr << endl;
}
if (model_hope_fear) printFeatureValues(featureValues);
else {
cerr << "hope: " << endl;
for (size_t i = 0; i < featureValuesHope.size(); ++i) {
for (size_t j = 0; j < featureValuesHope[i].size(); ++j) {
cerr << featureValuesHope[i][j] << endl;
}
}
printFeatureValues(featureValuesHope);
cerr << "fear: " << endl;
for (size_t i = 0; i < featureValuesFear.size(); ++i) {
for (size_t j = 0; j < featureValuesFear[i].size(); ++j) {
cerr << featureValuesFear[i][j] << endl;
}
}
cerr << endl;
printFeatureValues(featureValuesFear);
}
}
// set core features to 0 to avoid updating the feature weights
if (coreWeightMap.size() > 0) {
ignoreCoreFeatures(featureValues, coreWeightMap);
ignoreCoreFeatures(featureValuesHope, coreWeightMap);
ignoreCoreFeatures(featureValuesFear, coreWeightMap);
}
// Run optimiser on batch:
VERBOSE(1, "\nRank " << rank << ", epoch " << epoch << ", run optimiser:" << endl);
ScoreComponentCollection oldWeights(mosesWeights);
vector<int> update_status;
size_t update_status;
if (perceptron_update) {
vector<vector<float> > dummy1;
vector<size_t> dummy2;
@ -693,47 +558,19 @@ int main(int argc, char** argv) {
featureValuesHope, featureValuesFear, dummy1, dummy1, dummy2,
learning_rate, rank, epoch);
}
else if (hope_fear) {
update_status = optimiser->updateWeightsHopeFear(mosesWeights,
featureValuesHope, featureValuesFear, bleuScoresHope, bleuScoresFear, ref_ids,
learning_rate, rank, epoch);
}
else {
if (hope_fear) {
if (coreWeightMap.size() > 0) {
// set core features to 0 to avoid updating the feature weights
for (size_t i = 0; i < featureValuesHope.size(); ++i) {
for (size_t j = 0; j < featureValuesHope[i].size(); ++j) {
// set all core features to 0
StrFloatMap::iterator p;
for(p = coreWeightMap.begin(); p!=coreWeightMap.end(); ++p)
{
featureValuesHope[i][j].Assign(p->first, 0);
}
}
}
for (size_t i = 0; i < featureValuesFear.size(); ++i) {
for (size_t j = 0; j < featureValuesFear[i].size(); ++j) {
// set all core features to 0
StrFloatMap::iterator p;
for(p = coreWeightMap.begin(); p!=coreWeightMap.end(); ++p)
{
featureValuesFear[i][j].Assign(p->first, 0);
}
}
}
}
update_status = optimiser->updateWeightsHopeFear(mosesWeights,
featureValuesHope, featureValuesFear, bleuScoresHope, bleuScoresFear, ref_ids,
learning_rate, rank, epoch);
}
else {
// model_hope_fear
update_status = ((MiraOptimiser*) optimiser)->updateWeights(mosesWeights, featureValues,
losses, bleuScores, oracleFeatureValues, oracleBleuScores, ref_ids,
learning_rate, rank, epoch);
}
// model_hope_fear
update_status = ((MiraOptimiser*) optimiser)->updateWeights(mosesWeights,
featureValues, losses, bleuScores, oracleFeatureValues, oracleBleuScores, ref_ids,
learning_rate, rank, epoch);
}
sumConstraintChangeAbs += abs(update_status[0] - update_status[1]);
sumStillViolatedConstraints += update_status[1];
sumStillViolatedConstraints += update_status;
// pass new weights to decoder
if (normaliseWeights) {
@ -754,7 +591,7 @@ int main(int argc, char** argv) {
mosesWeights = averageWeights;
}
// set new Moses weights (averaged or not)
// set new Moses weights
decoder->setWeights(mosesWeights);
// compute difference to old weights
@ -764,9 +601,9 @@ int main(int argc, char** argv) {
// update history (for approximate document Bleu)
if (sentenceLevelBleu) {
for (size_t i = 0; i < oracles.size(); ++i) {
VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", oracle length: " << oracles[i].size() << " ");
if (verbosity > 0) {
if (verbosity > 0) {
for (size_t i = 0; i < oracles.size(); ++i) {
cerr << "Rank " << rank << ", epoch " << epoch << ", oracle length: " << oracles[i].size() << " ";
decoder->printReferenceLength(ref_ids);
}
}
@ -787,16 +624,8 @@ int main(int argc, char** argv) {
}
// clean up oracle and 1best translations after updating history
for (size_t i = 0; i < oracles.size(); ++i) {
for (size_t j = 0; j < oracles[i].size(); ++j) {
delete oracles[i][j];
}
}
for (size_t i = 0; i < oneBests.size(); ++i) {
for (size_t j = 0; j < oneBests[i].size(); ++j) {
delete oneBests[i][j];
}
}
deleteTranslations(oracles);
deleteTranslations(oneBests);
size_t mixing_base = mixingFrequency == 0 ? 0 : shard.size() / mixingFrequency;
size_t dumping_base = weightDumpFrequency ==0 ? 0 : shard.size() / weightDumpFrequency;
@ -886,6 +715,7 @@ int main(int argc, char** argv) {
cerr << "Bleu feature history after epoch " << epoch << endl;
decoder->printBleuFeatureHistory(cerr);
}
cerr << "Rank " << rank << ", epoch " << epoch << ", sum of violated constraints: " << sumStillViolatedConstraints << endl;
// Check whether there were any weight updates during this epoch
size_t sumUpdates;
@ -910,21 +740,6 @@ int main(int argc, char** argv) {
#endif
}
if (epoch > 0) {
if ((sumConstraintChangeAbs_lastEpoch == sumConstraintChangeAbs) && (sumStillViolatedConstraints_lastEpoch == sumStillViolatedConstraints)) {
VERBOSE(2, "Rank " << rank << ", epoch " << epoch << ", sum of violated constraints and constraint changes has stayed the same: " << sumStillViolatedConstraints << ", " << sumConstraintChangeAbs << endl);
}
else {
VERBOSE(2, "Rank " << rank << ", epoch " << epoch << ", sum of violated constraints: " << sumStillViolatedConstraints << ", sum of constraint changes " << sumConstraintChangeAbs << endl);
}
}
else {
VERBOSE(2, "Rank " << rank << ", epoch " << epoch << ", sum of violated constraints: " << sumStillViolatedConstraints << endl);
}
sumConstraintChangeAbs_lastEpoch = sumConstraintChangeAbs;
sumStillViolatedConstraints_lastEpoch = sumStillViolatedConstraints;
if (!stop) {
// Test if weights have converged
if (weightConvergence) {
@ -1110,4 +925,3 @@ void deleteTranslations(vector<vector<const Word*> > &translations) {
}
}
}

View File

@ -7,7 +7,7 @@ using namespace std;
namespace Mira {
vector<int> MiraOptimiser::updateWeights(ScoreComponentCollection& currWeights,
size_t MiraOptimiser::updateWeights(ScoreComponentCollection& currWeights,
const vector<vector<ScoreComponentCollection> >& featureValues,
const vector<vector<float> >& losses,
const vector<vector<float> >& bleuScores,
@ -115,18 +115,14 @@ vector<int> MiraOptimiser::updateWeights(ScoreComponentCollection& currWeights,
}
}
else {
cerr << "Rank " << rank << ", epoch " << epoch << ", check, no constraint violated for this batch" << endl;
vector<int> status(2);
status[0] = 0;
status[1] = 0;
return status;
cerr << "Rank " << rank << ", epoch " << epoch << ", no constraint violated for this batch" << endl;
return 0;
}
// apply learning rate
if (learning_rate != 1) {
VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", update before applying learning rate: " << summedUpdate << endl);
summedUpdate.MultiplyEquals(learning_rate);
VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", update after applying learning rate: " << summedUpdate << endl);
}
// scale update by BLEU of oracle
@ -135,6 +131,8 @@ vector<int> MiraOptimiser::updateWeights(ScoreComponentCollection& currWeights,
summedUpdate.MultiplyEquals(log10(oracleBleuScores[0]));
}
cerr << "Rank " << rank << ", epoch " << epoch << ", update: " << summedUpdate << endl;
// apply update to weight vector
VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", weights before update: " << currWeights << endl);
currWeights.PlusEquals(summedUpdate);
@ -154,14 +152,10 @@ vector<int> MiraOptimiser::updateWeights(ScoreComponentCollection& currWeights,
}
VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", violated constraint before: " << violatedConstraintsBefore << ", after: " << violatedConstraintsAfter << ", change: " << violatedConstraintsBefore - violatedConstraintsAfter << endl);
VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", error before: " << oldDistanceFromOptimum << ", after: " << newDistanceFromOptimum << ", change: " << oldDistanceFromOptimum - newDistanceFromOptimum << endl);
vector<int> status(2);
status[0] = violatedConstraintsBefore;
status[1] = violatedConstraintsAfter;
return status;
return violatedConstraintsAfter;
}
vector<int> MiraOptimiser::updateWeightsHopeFear(Moses::ScoreComponentCollection& currWeights,
size_t MiraOptimiser::updateWeightsHopeFear(Moses::ScoreComponentCollection& currWeights,
const std::vector< std::vector<Moses::ScoreComponentCollection> >& featureValuesHope,
const std::vector< std::vector<Moses::ScoreComponentCollection> >& featureValuesFear,
const std::vector<std::vector<float> >& bleuScoresHope,
@ -282,19 +276,17 @@ vector<int> MiraOptimiser::updateWeightsHopeFear(Moses::ScoreComponentCollection
}
else {
cerr << "Rank " << rank << ", epoch " << epoch << ", check, no constraint violated for this batch" << endl;
vector<int> status(2);
status[0] = 0;
status[1] = 0;
return status;
return 0;
}
// apply learning rate
if (learning_rate != 1) {
VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", update before applying learning rate: " << summedUpdate << endl);
summedUpdate.MultiplyEquals(learning_rate);
VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", update after applying learning rate: " << summedUpdate << endl);
}
cerr << "Rank " << rank << ", epoch " << epoch << ", update: " << summedUpdate << endl;
// apply update to weight vector
VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", weights before update: " << currWeights << endl);
currWeights.PlusEquals(summedUpdate);
@ -314,12 +306,7 @@ vector<int> MiraOptimiser::updateWeightsHopeFear(Moses::ScoreComponentCollection
}
VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", check, violated constraint before: " << violatedConstraintsBefore << ", after: " << violatedConstraintsAfter << ", change: " << violatedConstraintsBefore - violatedConstraintsAfter << endl);
VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", check, error before: " << oldDistanceFromOptimum << ", after: " << newDistanceFromOptimum << ", change: " << oldDistanceFromOptimum - newDistanceFromOptimum << endl);
vector<int> statusPlus(2);
statusPlus[0] = violatedConstraintsBefore;
statusPlus[1] = violatedConstraintsAfter;
return statusPlus;
return violatedConstraintsAfter;
}
}

View File

@ -30,7 +30,7 @@ namespace Mira {
public:
Optimiser() {}
virtual std::vector<int> updateWeightsHopeFear(Moses::ScoreComponentCollection& currWeights,
virtual size_t updateWeightsHopeFear(Moses::ScoreComponentCollection& currWeights,
const std::vector<std::vector<Moses::ScoreComponentCollection> >& featureValuesHope,
const std::vector<std::vector<Moses::ScoreComponentCollection> >& featureValuesFear,
const std::vector<std::vector<float> >& bleuScoresHope,
@ -43,7 +43,7 @@ namespace Mira {
class Perceptron : public Optimiser {
public:
virtual std::vector<int> updateWeightsHopeFear(Moses::ScoreComponentCollection& currWeights,
virtual size_t updateWeightsHopeFear(Moses::ScoreComponentCollection& currWeights,
const std::vector<std::vector<Moses::ScoreComponentCollection> >& featureValuesHope,
const std::vector<std::vector<Moses::ScoreComponentCollection> >& featureValuesFear,
const std::vector<std::vector<float> >& bleuScoresHope,
@ -67,7 +67,7 @@ namespace Mira {
m_scale_update(scale_update),
m_margin_slack(margin_slack) { }
std::vector<int> updateWeights(Moses::ScoreComponentCollection& currWeights,
size_t updateWeights(Moses::ScoreComponentCollection& currWeights,
const std::vector<std::vector<Moses::ScoreComponentCollection> >& featureValues,
const std::vector<std::vector<float> >& losses,
const std::vector<std::vector<float> >& bleuScores,
@ -77,7 +77,7 @@ namespace Mira {
float learning_rate,
size_t rank,
size_t epoch);
virtual std::vector<int> updateWeightsHopeFear(Moses::ScoreComponentCollection& currWeights,
virtual size_t updateWeightsHopeFear(Moses::ScoreComponentCollection& currWeights,
const std::vector<std::vector<Moses::ScoreComponentCollection> >& featureValuesHope,
const std::vector<std::vector<Moses::ScoreComponentCollection> >& featureValuesFear,
const std::vector<std::vector<float> >& bleuScoresHope,

View File

@ -24,7 +24,7 @@ using namespace std;
namespace Mira {
vector<int> Perceptron::updateWeightsHopeFear(ScoreComponentCollection& currWeights,
size_t Perceptron::updateWeightsHopeFear(ScoreComponentCollection& currWeights,
const vector< vector<ScoreComponentCollection> >& featureValuesHope,
const vector< vector<ScoreComponentCollection> >& featureValuesFear,
const vector< vector<float> >& dummy1,
@ -34,18 +34,15 @@ vector<int> Perceptron::updateWeightsHopeFear(ScoreComponentCollection& currWeig
size_t rank,
size_t epoch)
{
cerr << "hope: " << featureValuesHope[0][0] << endl;
cerr << "fear: " << featureValuesFear[0][0] << endl;
cerr << "Rank " << rank << ", epoch " << epoch << ", hope: " << featureValuesHope[0][0] << endl;
cerr << "Rank " << rank << ", epoch " << epoch << ", fear: " << featureValuesFear[0][0] << endl;
ScoreComponentCollection featureValueDiff = featureValuesHope[0][0];
featureValueDiff.MinusEquals(featureValuesFear[0][0]);
cerr << "hope - fear: " << featureValueDiff << endl;
cerr << "Rank " << rank << ", epoch " << epoch << ", hope - fear: " << featureValueDiff << endl;
featureValueDiff.MultiplyEquals(perceptron_learning_rate);
currWeights.PlusEquals(featureValueDiff);
vector<int> update_status;
update_status.push_back(0);
update_status.push_back(0);
return update_status;
cerr << "Rank " << rank << ", epoch " << epoch << ", update: " << featureValueDiff << endl;
return 0;
}
}