code clean-up, step 1

git-svn-id: http://svn.statmt.org/repository/mira@3918 cc96ff50-19ce-11e0-b349-13d7f0bd23df
ehasler 2011-06-28 11:35:59 +00:00 committed by Ondrej Bojar
parent 8e6c963041
commit 120be1df4f
10 changed files with 59 additions and 564 deletions

View File

@ -32,8 +32,6 @@ using namespace Moses;
namespace Mira {
//Decoder::~Decoder() {}
/**
* Allocates a char* and copies string into it.
**/
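The helper this comment documents lies outside the hunk; a minimal sketch of what such a function typically looks like (name and body assumed, not necessarily the repository's actual code):

#include <cstring>
#include <string>

char* strToChar(const std::string& s) {
  char* c = new char[s.size() + 1];
  std::strcpy(c, s.c_str());  // copy the bytes plus the trailing '\0'
  return c;
}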
@ -70,8 +68,8 @@ namespace Mira {
MosesDecoder::MosesDecoder(bool scaleByInputLength, float historySmoothing)
: m_manager(NULL) {
// force initialisation of the phrase dictionary (TODO: what for?)
const StaticData &staticData = StaticData::Instance();
// force initialisation of the phrase dictionary (TODO: why?)
const StaticData &staticData = StaticData::Instance();
m_sentence = new Sentence(Input);
stringstream in("Initialising decoder..\n");
const std::vector<FactorType> &inputFactorOrder = staticData.GetInputFactorOrder();
@ -176,61 +174,6 @@ namespace Mira {
return best;
}
vector<float> MosesDecoder::getBleuAndScore(const std::string& source,
size_t sentenceid,
float bleuObjectiveWeight,
float bleuScoreWeight,
bool distinct,
size_t rank,
size_t epoch)
{
StaticData &staticData = StaticData::InstanceNonConst();
m_sentence = new Sentence(Input);
stringstream in(source + "\n");
const std::vector<FactorType> &inputFactorOrder = staticData.GetInputFactorOrder();
m_sentence->Read(in,inputFactorOrder);
const TranslationSystem& system = staticData.GetTranslationSystem(TranslationSystem::DEFAULT);
// set the weight for the bleu feature
ostringstream bleuWeightStr;
bleuWeightStr << (bleuObjectiveWeight * bleuScoreWeight);
PARAM_VEC bleuWeight(1,bleuWeightStr.str());
staticData.GetParameter()->OverwriteParam("weight-bl", bleuWeight);
staticData.ReLoadBleuScoreFeatureParameter();
m_bleuScoreFeature->SetCurrentSourceLength((*m_sentence).GetSize());
m_bleuScoreFeature->SetCurrentReference(sentenceid);
//run the decoder
m_manager = new Moses::Manager(*m_sentence, staticData.GetSearchAlgorithm(), &system);
m_manager->ProcessSentence();
TrellisPathList sentences;
m_manager->CalcNBest(1, sentences, distinct);
// read off the feature values and bleu scores for each sentence in the nbest list
Moses::TrellisPathList::const_iterator iter = sentences.begin();
vector<float> bleuAndScore;
const Moses::TrellisPath &path = **iter;
float bleuScore = getBleuScore(path.GetScoreBreakdown());
float scoreWithoutBleu = path.GetTotalScore() - (bleuObjectiveWeight * bleuScoreWeight * bleuScore);
bleuAndScore.push_back(bleuScore);
bleuAndScore.push_back(scoreWithoutBleu);
VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", 1best translation: ");
Phrase phrase = path.GetTargetPhrase();
for (size_t pos = 0; pos < phrase.GetSize(); ++pos) {
const Word &word = phrase.GetWord(pos);
Word *newWord = new Word(word);
VERBOSE(1, *newWord);
}
VERBOSE(1, endl);
return bleuAndScore;
}
size_t MosesDecoder::getCurrentInputLength() {
return (*m_sentence).GetSize();
}
@ -270,27 +213,5 @@ namespace Mira {
void MosesDecoder::printReferenceLength(const vector<size_t>& ref_ids) {
m_bleuScoreFeature->PrintReferenceLength(ref_ids);
}
vector<float> MosesDecoder::calculateBleuOfCorpus(const vector< vector< const Word*> >& words, vector<size_t>& ref_ids, size_t epoch, size_t rank) {
vector<float> bleu = m_bleuScoreFeature->CalculateBleuOfCorpus(words, ref_ids);
if (bleu.size() > 0) {
cerr << "\nRank " << rank << ", BLEU after epoch " << epoch << ": " << bleu[4]*100 << ", "
<< bleu[0]*100 << "/" << bleu[1]*100 << "/" << bleu[2]*100 << "/" << bleu[3]*100 << " "
<< "(BP=" << bleu[5] << ", " << "ratio=" << bleu[6] << ", "
<< "hyp_len=" << bleu[7] << ", ref_len=" << bleu[8] << ")" << endl;
vector<float> bleuAndRatio(2);
bleuAndRatio[0] = bleu[4]*100;
bleuAndRatio[1] = bleu[6];
return bleuAndRatio;
}
else {
cerr << "\nRank " << rank << ", BLEU after epoch " << epoch << ": 0" << endl;
vector<float> bleuAndRatio(2);
bleuAndRatio[0] = 0;
bleuAndRatio[1] = 0;
return bleuAndRatio;
}
}
}

View File

@ -64,20 +64,12 @@ class MosesDecoder {
bool distinct,
size_t rank,
size_t epoch);
std::vector<float> getBleuAndScore(const std::string& source,
size_t sentenceid,
float bleuObjectiveWeight,
float bleuScoreWeight,
bool distinct,
size_t rank,
size_t epoch);
size_t getCurrentInputLength();
void updateHistory(const std::vector<const Moses::Word*>& words);
void updateHistory(const std::vector< std::vector< const Moses::Word*> >& words, std::vector<size_t>& sourceLengths, std::vector<size_t>& ref_ids, size_t rank, size_t epoch);
void loadReferenceSentences(const std::vector<std::vector<std::string> >& refs);
void printBleuFeatureHistory(std::ostream& out);
void printReferenceLength(const std::vector<size_t>& ref_ids);
std::vector<float> calculateBleuOfCorpus(const std::vector< std::vector< const Moses::Word*> >& words, std::vector<size_t>& ref_ids, size_t epoch, size_t rank);
Moses::ScoreComponentCollection getWeights();
void setWeights(const Moses::ScoreComponentCollection& weights);
void cleanup();
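A hedged sketch of how a training loop might drive this interface, using only members visible above (construction arguments, variable names, and call order are assumptions):

MosesDecoder decoder(/*scaleByInputLength=*/true, /*historySmoothing=*/0.9);
decoder.loadReferenceSentences(refs);   // refs: tokenised reference sentences
decoder.setWeights(initialWeights);     // a Moses::ScoreComponentCollection
// ... decode one sentence elsewhere and collect its words ...
decoder.updateHistory(bestWords);       // fold the 1-best into the BLEU history
decoder.printBleuFeatureHistory(std::cerr);
decoder.cleanup();                      // free manager and sentence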

View File

@ -5,187 +5,6 @@ using namespace std;
namespace Mira {
vector<FValue> Hildreth::optimise (const vector<FVector>& a, const vector<FValue>& b) {
size_t i;
int max_iter = 10000;
float eps = 0.00000001;
float zero = 0.000000000001;
vector<FValue> alpha ( b.size() );
vector<FValue> F ( b.size() );
vector<FValue> kkt ( b.size() );
float max_kkt = -1e100;
size_t K = b.size();
float A[K][K];
bool is_computed[K];
for ( i = 0; i < K; i++ )
{
A[i][i] = a[i].inner_product(a[i]);
is_computed[i] = false;
}
int max_kkt_i = -1;
for ( i = 0; i < b.size(); i++ )
{
F[i] = b[i];
kkt[i] = F[i];
if ( kkt[i] > max_kkt )
{
max_kkt = kkt[i];
max_kkt_i = i;
}
}
int iter = 0;
FValue diff_alpha;
FValue try_alpha;
FValue add_alpha;
while ( max_kkt >= eps && iter < max_iter )
{
diff_alpha = A[max_kkt_i][max_kkt_i] <= zero ? 0.0 : F[max_kkt_i]/A[max_kkt_i][max_kkt_i];
try_alpha = alpha[max_kkt_i] + diff_alpha;
add_alpha = 0.0;
if ( try_alpha < 0.0 )
add_alpha = -1.0 * alpha[max_kkt_i];
else
add_alpha = diff_alpha;
alpha[max_kkt_i] = alpha[max_kkt_i] + add_alpha;
if ( !is_computed[max_kkt_i] )
{
for ( i = 0; i < K; i++ )
{
A[i][max_kkt_i] = a[i].inner_product(a[max_kkt_i] ); // for version 1
//A[i][max_kkt_i] = 0; // for version 1
is_computed[max_kkt_i] = true;
}
}
for ( i = 0; i < F.size(); i++ )
{
F[i] -= add_alpha * A[i][max_kkt_i];
kkt[i] = F[i];
if ( alpha[i] > zero )
kkt[i] = abs ( F[i] );
}
max_kkt = -1e100;
max_kkt_i = -1;
for ( i = 0; i < F.size(); i++ )
if ( kkt[i] > max_kkt )
{
max_kkt = kkt[i];
max_kkt_i = i;
}
iter++;
}
return alpha;
}
vector<FValue> Hildreth::optimise (const vector<FVector>& a, const vector<FValue>& b, FValue C) {
size_t i;
int max_iter = 10000;
FValue eps = 0.00000001;
FValue zero = 0.000000000001;
vector<FValue> alpha ( b.size() );
vector<FValue> F ( b.size() );
vector<FValue> kkt ( b.size() );
float max_kkt = -1e100;
size_t K = b.size();
float A[K][K];
bool is_computed[K];
for ( i = 0; i < K; i++ )
{
A[i][i] = a[i].inner_product(a[i]);
is_computed[i] = false;
}
int max_kkt_i = -1;
for ( i = 0; i < b.size(); i++ )
{
F[i] = b[i];
kkt[i] = F[i];
if ( kkt[i] > max_kkt )
{
max_kkt = kkt[i];
max_kkt_i = i;
}
}
int iter = 0;
FValue diff_alpha;
FValue try_alpha;
FValue add_alpha;
while ( max_kkt >= eps && iter < max_iter )
{
diff_alpha = A[max_kkt_i][max_kkt_i] <= zero ? 0.0 : F[max_kkt_i]/A[max_kkt_i][max_kkt_i];
try_alpha = alpha[max_kkt_i] + diff_alpha;
add_alpha = 0.0;
if ( try_alpha < 0.0 )
add_alpha = -1.0 * alpha[max_kkt_i];
else if (try_alpha > C)
add_alpha = C - alpha[max_kkt_i];
else
add_alpha = diff_alpha;
alpha[max_kkt_i] = alpha[max_kkt_i] + add_alpha;
if ( !is_computed[max_kkt_i] )
{
for ( i = 0; i < K; i++ )
{
A[i][max_kkt_i] = a[i].inner_product(a[max_kkt_i] ); // for version 1
//A[i][max_kkt_i] = 0; // for version 1
is_computed[max_kkt_i] = true;
}
}
for ( i = 0; i < F.size(); i++ )
{
F[i] -= add_alpha * A[i][max_kkt_i];
kkt[i] = F[i];
if (alpha[i] > C - zero)
kkt[i]=-kkt[i];
else if (alpha[i] > zero)
kkt[i] = abs(F[i]);
}
max_kkt = -1e100;
max_kkt_i = -1;
for ( i = 0; i < F.size(); i++ )
if ( kkt[i] > max_kkt )
{
max_kkt = kkt[i];
max_kkt_i = i;
}
iter++;
}
return alpha;
}
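For reference, the two overloads removed above implement Hildreth's iterative method for the small quadratic program behind each MIRA update. A self-contained sketch of the same algorithm over plain vectors (Moses types replaced by std::vector; tolerances assumed):

#include <algorithm>
#include <cmath>
#include <vector>

// Maximise sum_i b_i*alpha_i - 0.5*||sum_i alpha_i*a_i||^2
// subject to 0 <= alpha_i <= C, by repeatedly fixing the most
// violated KKT condition (as in the removed code).
std::vector<float> hildreth(const std::vector<std::vector<float> >& a,
                            const std::vector<float>& b, float C) {
  const size_t K = b.size();
  const float eps = 1e-8f, zero = 1e-12f;
  std::vector<float> alpha(K, 0.0f), F(b);
  std::vector<std::vector<float> > A(K, std::vector<float>(K, 0.0f));
  for (size_t i = 0; i < K; ++i)          // Gram matrix A[i][j] = a_i . a_j
    for (size_t j = 0; j < K; ++j)
      for (size_t d = 0; d < a[i].size(); ++d)
        A[i][j] += a[i][d] * a[j][d];
  for (int iter = 0; iter < 10000; ++iter) {
    size_t k = 0;
    float max_kkt = -1e30f;
    for (size_t i = 0; i < K; ++i) {      // find the most violated condition
      float kkt = (alpha[i] > C - zero) ? -F[i]
                : (alpha[i] > zero)     ? std::fabs(F[i]) : F[i];
      if (kkt > max_kkt) { max_kkt = kkt; k = i; }
    }
    if (max_kkt < eps) break;             // converged
    float step = (A[k][k] <= zero) ? 0.0f : F[k] / A[k][k];
    float clipped = std::min(std::max(alpha[k] + step, 0.0f), C);
    float add = clipped - alpha[k];
    alpha[k] = clipped;
    for (size_t i = 0; i < K; ++i)        // keep gradients F consistent
      F[i] -= add * A[i][k];
  }
  return alpha;
}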
vector<FValue> Hildreth::optimise (const vector<ScoreComponentCollection>& a, const vector<FValue>& b) {
size_t i;

View File

@ -5,8 +5,6 @@ namespace Mira {
class Hildreth {
public :
static std::vector<Moses::FValue> optimise (const std::vector<Moses::FVector>& a, const std::vector<Moses::FValue>& b );
static std::vector<Moses::FValue> optimise (const std::vector<Moses::FVector>& a, const std::vector<Moses::FValue>& b, Moses::FValue C);
static std::vector<Moses::FValue> optimise (const std::vector<Moses::ScoreComponentCollection>& a, const std::vector<Moses::FValue>& b );
static std::vector<Moses::FValue> optimise (const std::vector<Moses::ScoreComponentCollection>& a, const std::vector<Moses::FValue>& b, Moses::FValue C);
};

View File

@ -166,7 +166,6 @@ int main(int argc, char** argv) {
string decoder_settings;
float min_weight_change;
float decrease_learning_rate;
bool devBleu;
bool normaliseWeights;
bool print_feature_values;
bool historyOf1best;
@ -178,7 +177,6 @@ int main(int argc, char** argv) {
float bleuScoreWeight;
float margin_slack;
float margin_slack_incr;
bool analytical_update;
bool perceptron_update;
bool hope_fear;
bool model_hope_fear;
@ -189,7 +187,6 @@ int main(int argc, char** argv) {
desc.add_options()
("accumulate-weights", po::value<bool>(&accumulateWeights)->default_value(false), "Accumulate and average weights over all epochs")
("adapt-after-epoch", po::value<size_t>(&adapt_after_epoch)->default_value(0), "Index of epoch after which adaptive parameters will be adapted")
("analytical-update", po::value<bool>(&analytical_update)->default_value(0), "Use one best lists and compute the update analytically")
("average-weights", po::value<bool>(&averageWeights)->default_value(false), "Set decoder weights to average weights after each update")
("base-of-log", po::value<size_t>(&baseOfLog)->default_value(10), "Base for log-ing feature values")
("batch-size,b", po::value<size_t>(&batchSize)->default_value(1), "Size of batch that is send to optimiser for weight adjustments")
@ -201,9 +198,7 @@ int main(int argc, char** argv) {
("core-weights", po::value<string>(&coreWeightFile), "Weight file containing the core weights (already tuned, have to be non-zero)")
("decoder-settings", po::value<string>(&decoder_settings)->default_value(""), "Decoder settings for tuning runs")
("decr-learning-rate", po::value<float>(&decrease_learning_rate)->default_value(0),"Decrease learning rate by the given value after every epoch")
("dev-bleu", po::value<bool>(&devBleu)->default_value(true), "Compute BLEU score of oracle translations of the whole tuning set")
("distinct-nbest", po::value<bool>(&distinctNbest)->default_value(true), "Use nbest list with distinct translations in inference step")
("weight-dump-frequency", po::value<size_t>(&weightDumpFrequency)->default_value(1), "How often per epoch to dump weights, when using mpi")
("epochs,e", po::value<size_t>(&epochs)->default_value(10), "Number of epochs")
("fear-n", po::value<int>(&fear_n)->default_value(-1), "Number of fear translations used")
("help", po::value(&help)->zero_tokens()->default_value(false), "Print this help message and exit")
@ -214,12 +209,12 @@ int main(int argc, char** argv) {
("hope-n", po::value<int>(&hope_n)->default_value(-1), "Number of hope translations used")
("input-file,i", po::value<string>(&inputFile), "Input file containing tokenised source")
("learner,l", po::value<string>(&learner)->default_value("mira"), "Learning algorithm")
("margin-slack", po::value<float>(&margin_slack)->default_value(0), "Slack when comparing left and right hand side of constraints")
("margin-incr", po::value<float>(&margin_slack_incr)->default_value(0), "Increment margin slack after every epoch by this amount")
("mira-learning-rate", po::value<float>(&mira_learning_rate)->default_value(1), "Learning rate for MIRA (fixed or flexible)")
("log-feature-values", po::value<bool>(&logFeatureValues)->default_value(false), "Take log of feature values according to the given base.")
("margin-incr", po::value<float>(&margin_slack_incr)->default_value(0), "Increment margin slack after every epoch by this amount")
("margin-slack", po::value<float>(&margin_slack)->default_value(0), "Slack when comparing left and right hand side of constraints")
("min-learning-rate", po::value<float>(&min_learning_rate)->default_value(0), "Set a minimum learning rate")
("min-weight-change", po::value<float>(&min_weight_change)->default_value(0.01), "Set minimum weight change for stopping criterion")
("mira-learning-rate", po::value<float>(&mira_learning_rate)->default_value(1), "Learning rate for MIRA (fixed or flexible)")
("mixing-frequency", po::value<size_t>(&mixingFrequency)->default_value(5), "How often per epoch to mix weights, when using mpi")
("model-hope-fear", po::value<bool>(&model_hope_fear)->default_value(false), "Use model, hope and fear translations for optimization")
("nbest,n", po::value<size_t>(&n)->default_value(1), "Number of translations in nbest list")
@ -229,6 +224,8 @@ int main(int argc, char** argv) {
("print-feature-values", po::value<bool>(&print_feature_values)->default_value(false), "Print out feature values")
("reference-files,r", po::value<vector<string> >(&referenceFiles), "Reference translation files for training")
("scale-by-input-length", po::value<bool>(&scaleByInputLength)->default_value(true), "Scale the BLEU score by a history of the input lengths")
("scale-margin", po::value<size_t>(&scale_margin)->default_value(0), "Scale the margin by the Bleu score of the oracle translation")
("scale-update", po::value<bool>(&scale_update)->default_value(false), "Scale the update by the Bleu score of the oracle translation")
("sentence-level-bleu", po::value<bool>(&sentenceLevelBleu)->default_value(true), "Use a sentences level bleu scoring function")
("shuffle", po::value<bool>(&shuffle)->default_value(false), "Shuffle input sentences before processing")
("slack", po::value<float>(&slack)->default_value(0.01), "Use slack in optimizer")
@ -236,8 +233,7 @@ int main(int argc, char** argv) {
("slack-step", po::value<float>(&slack_step)->default_value(0), "Increase slack from epoch to epoch by the value provided")
("stop-weights", po::value<bool>(&weightConvergence)->default_value(true), "Stop when weights converge")
("verbosity,v", po::value<int>(&verbosity)->default_value(0), "Verbosity level")
("scale-margin", po::value<size_t>(&scale_margin)->default_value(0), "Scale the margin by the Bleu score of the oracle translation")
("scale-update", po::value<bool>(&scale_update)->default_value(false), "Scale the update by the Bleu score of the oracle translation")
("weight-dump-frequency", po::value<size_t>(&weightDumpFrequency)->default_value(1), "How often per epoch to dump weights, when using mpi")
("weight-dump-stem", po::value<string>(&weightDumpStem)->default_value("weights"), "Stem of filename to use for dumping weights");
po::options_description cmdline_options;
@ -355,42 +351,31 @@ int main(int argc, char** argv) {
perceptron_update = true;
model_hope_fear = false; // mira only
hope_fear = false; // mira only
analytical_update = false; // mira only
} else {
cerr << "Error: Unknown optimiser: " << learner << endl;
return 1;
}
// resolve parameter dependencies
if (perceptron_update || analytical_update) {
if (batchSize > 1 && perceptron_update) {
batchSize = 1;
cerr << "Info: Setting batch size to 1 for perceptron/analytical update" << endl;
cerr << "Info: Setting batch size to 1 for perceptron update" << endl;
}
if (hope_n == -1 && fear_n == -1) {
hope_n = n;
fear_n = n;
}
if ((model_hope_fear || analytical_update) && hope_fear) {
if (model_hope_fear && hope_fear) {
hope_fear = false; // is true by default
}
if (!hope_fear && !analytical_update) {
if (!hope_fear) {
model_hope_fear = true;
}
if (model_hope_fear && analytical_update) {
cerr << "Error: Must choose between model-hope-fear and analytical update" << endl;
return 1;
}
if (!sentenceLevelBleu) {
if (!historyOf1best && !historyOfOracles) {
historyOf1best = true;
}
}
if (burnIn && sentenceLevelBleu) {
burnIn = false;
cerr << "Info: Burn-in not needed when using sentence-level BLEU, deactivating burn-in." << endl;
@ -545,7 +530,6 @@ int main(int argc, char** argv) {
int sumStillViolatedConstraints_lastEpoch = 0;
int sumConstraintChangeAbs;
int sumConstraintChangeAbs_lastEpoch = 0;
// size_t sumBleuChangeAbs;
float *sendbuf, *recvbuf;
sendbuf = (float *) malloc(sizeof(float));
recvbuf = (float *) malloc(sizeof(float));
@ -553,7 +537,6 @@ int main(int argc, char** argv) {
// sum of violated constraints
sumStillViolatedConstraints = 0;
sumConstraintChangeAbs = 0;
// sumBleuChangeAbs = 0;
numberOfUpdatesThisEpoch = 0;
// Sum up weights over one epoch, final average uses weights from last epoch
@ -619,7 +602,7 @@ int main(int argc, char** argv) {
dummyFeatureValues.push_back(newFeatureValues);
dummyBleuScores.push_back(newBleuScores);
if (perceptron_update || analytical_update) {
if (perceptron_update) {
if (historyOf1best) {
// MODEL (for updating the history)
cerr << "Rank " << rank << ", run decoder to get 1best wrt model score (for history)" << endl;
@ -778,15 +761,6 @@ int main(int argc, char** argv) {
}
}
/* // get 1best model results with old weights
vector< vector <float > > bestModelOld_batch;
for (size_t i = 0; i < actualBatchSize; ++i) {
string& input = inputSentences[*current_sid_start + i];
vector <float> bestModelOld = decoder->getBleuAndScore(input, *current_sid_start + i, 0.0, bleuScoreWeight, distinctNbest, rank, epoch);
bestModelOld_batch.push_back(bestModelOld);
decoder->cleanup();
}*/
// optionally print out the feature values
if (print_feature_values) {
cerr << "\nRank " << rank << ", epoch " << epoch << ", feature values: " << endl;
@ -823,14 +797,9 @@ int main(int argc, char** argv) {
vector<vector<float> > dummy1;
vector<size_t> dummy2;
update_status = optimiser->updateWeightsHopeFear(mosesWeights,
featureValuesHope, featureValuesFear, dummy1, dummy1, dummy2,
featureValuesHope, featureValuesFear, dummy1, dummy1, dummy2,
learning_rate, rank, epoch);
}
else if (analytical_update) {
update_status = ((MiraOptimiser*) optimiser)->updateWeightsAnalytically(mosesWeights,
featureValuesHope[0][0], featureValuesFear[0][0], bleuScoresHope[0][0], bleuScoresFear[0][0],
ref_ids[0], learning_rate, rank, epoch);
}
else {
if (hope_fear) {
if (coreWeightMap.size() > 0) {
@ -859,7 +828,7 @@ int main(int argc, char** argv) {
}
update_status = optimiser->updateWeightsHopeFear(mosesWeights,
featureValuesHope, featureValuesFear, bleuScoresHope, bleuScoresFear, ref_ids,
featureValuesHope, featureValuesFear, bleuScoresHope, bleuScoresFear, ref_ids,
learning_rate, rank, epoch);
}
else {
@ -900,17 +869,6 @@ int main(int argc, char** argv) {
weightDifference.MinusEquals(oldWeights);
VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", weight difference: " << weightDifference << endl);
/* // get 1best model results with new weights (for each sentence in batch)
vector<float> bestModelNew;
for (size_t i = 0; i < actualBatchSize; ++i) {
string& input = inputSentences[*current_sid_start + i];
bestModelNew = decoder->getBleuAndScore(input, *current_sid_start + i, 0.0, bleuScoreWeight, distinctNbest, rank, epoch);
decoder->cleanup();
sumBleuChangeAbs += abs(bestModelOld_batch[i][0] - bestModelNew[0]);
VERBOSE(2, "Rank " << rank << ", epoch " << epoch << ", 1best model bleu, old: " << bestModelOld_batch[i][0] << ", new: " << bestModelNew[0] << endl);
VERBOSE(2, "Rank " << rank << ", epoch " << epoch << ", 1best model score, old: " << bestModelOld_batch[i][1] << ", new: " << bestModelNew[1] << endl);
}*/
// update history (for approximate document Bleu)
if (sentenceLevelBleu) {
for (size_t i = 0; i < oracles.size(); ++i) {

View File

@ -110,7 +110,7 @@ vector<int> MiraOptimiser::updateWeights(ScoreComponentCollection& currWeights,
ScoreComponentCollection update(featureValueDiffs[k]);
update.MultiplyEquals(alpha);
// sum up update
// sum updates
summedUpdate.PlusEquals(update);
}
}
@ -122,24 +122,6 @@ vector<int> MiraOptimiser::updateWeights(ScoreComponentCollection& currWeights,
return status;
}
ScoreComponentCollection newWeights(currWeights);
newWeights.PlusEquals(summedUpdate);
// Sanity check: are there still violated constraints after optimisation?
int violatedConstraintsAfter = 0;
float newDistanceFromOptimum = 0;
for (size_t i = 0; i < featureValueDiffs.size(); ++i) {
float modelScoreDiff = featureValueDiffs[i].InnerProduct(newWeights);
float loss = all_losses[i];
float diff = loss - (modelScoreDiff + m_margin_slack);
if (diff > epsilon) {
++violatedConstraintsAfter;
newDistanceFromOptimum += diff;
}
}
VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", check, violated constraint before: " << violatedConstraintsBefore << ", after: " << violatedConstraintsAfter << ", change: " << violatedConstraintsBefore - violatedConstraintsAfter << endl);
VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", check, error before: " << oldDistanceFromOptimum << ", after: " << newDistanceFromOptimum << ", change: " << oldDistanceFromOptimum - newDistanceFromOptimum << endl);
// apply learning rate
if (learning_rate != 1) {
VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", update before applying learning rate: " << summedUpdate << endl);
@ -158,6 +140,21 @@ vector<int> MiraOptimiser::updateWeights(ScoreComponentCollection& currWeights,
currWeights.PlusEquals(summedUpdate);
VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", weights after update: " << currWeights << endl);
// Sanity check: are there still violated constraints after optimisation?
int violatedConstraintsAfter = 0;
float newDistanceFromOptimum = 0;
for (size_t i = 0; i < featureValueDiffs.size(); ++i) {
float modelScoreDiff = featureValueDiffs[i].InnerProduct(currWeights);
float loss = all_losses[i];
float diff = loss - (modelScoreDiff + m_margin_slack);
if (diff > epsilon) {
++violatedConstraintsAfter;
newDistanceFromOptimum += diff;
}
}
VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", violated constraint before: " << violatedConstraintsBefore << ", after: " << violatedConstraintsAfter << ", change: " << violatedConstraintsBefore - violatedConstraintsAfter << endl);
VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", error before: " << oldDistanceFromOptimum << ", after: " << newDistanceFromOptimum << ", change: " << oldDistanceFromOptimum - newDistanceFromOptimum << endl);
vector<int> status(2);
status[0] = violatedConstraintsBefore;
status[1] = violatedConstraintsAfter;
@ -291,25 +288,7 @@ vector<int> MiraOptimiser::updateWeightsHopeFear(Moses::ScoreComponentCollection
return status;
}
ScoreComponentCollection newWeights(currWeights);
newWeights.PlusEquals(summedUpdate);
// Sanity check: are there still violated constraints after optimisation?
int violatedConstraintsAfter = 0;
float newDistanceFromOptimum = 0;
for (size_t i = 0; i < featureValueDiffs.size(); ++i) {
float modelScoreDiff = featureValueDiffs[i].InnerProduct(newWeights);
float loss = all_losses[i];
float diff = loss - (modelScoreDiff + m_margin_slack);
if (diff > epsilon) {
++violatedConstraintsAfter;
newDistanceFromOptimum += diff;
}
}
VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", check, violated constraint before: " << violatedConstraintsBefore << ", after: " << violatedConstraintsAfter << ", change: " << violatedConstraintsBefore - violatedConstraintsAfter << endl);
VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", check, error before: " << oldDistanceFromOptimum << ", after: " << newDistanceFromOptimum << ", change: " << oldDistanceFromOptimum - newDistanceFromOptimum << endl);
// Apply learning rate (fixed or flexible)
// apply learning rate
if (learning_rate != 1) {
VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", update before applying learning rate: " << summedUpdate << endl);
summedUpdate.MultiplyEquals(learning_rate);
@ -321,107 +300,27 @@ vector<int> MiraOptimiser::updateWeightsHopeFear(Moses::ScoreComponentCollection
currWeights.PlusEquals(summedUpdate);
VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", weights after update: " << currWeights << endl);
// Sanity check: are there still violated constraints after optimisation?
int violatedConstraintsAfter = 0;
float newDistanceFromOptimum = 0;
for (size_t i = 0; i < featureValueDiffs.size(); ++i) {
float modelScoreDiff = featureValueDiffs[i].InnerProduct(currWeights);
float loss = all_losses[i];
float diff = loss - (modelScoreDiff + m_margin_slack);
if (diff > epsilon) {
++violatedConstraintsAfter;
newDistanceFromOptimum += diff;
}
}
VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", check, violated constraint before: " << violatedConstraintsBefore << ", after: " << violatedConstraintsAfter << ", change: " << violatedConstraintsBefore - violatedConstraintsAfter << endl);
VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", check, error before: " << oldDistanceFromOptimum << ", after: " << newDistanceFromOptimum << ", change: " << oldDistanceFromOptimum - newDistanceFromOptimum << endl);
vector<int> statusPlus(2);
statusPlus[0] = violatedConstraintsBefore;
statusPlus[1] = violatedConstraintsAfter;
return statusPlus;
}
vector<int> MiraOptimiser::updateWeightsAnalytically(ScoreComponentCollection& currWeights,
ScoreComponentCollection& featureValuesHope,
ScoreComponentCollection& featureValuesFear,
float bleuScoreHope,
float bleuScoreFear,
size_t sentenceId,
float learning_rate,
size_t rank,
size_t epoch) {
float epsilon = 0.0001;
float oldDistanceFromOptimum = 0;
bool constraintViolatedBefore = false;
ScoreComponentCollection weightUpdate;
// cerr << "Rank " << rank << ", epoch " << epoch << ", hope: " << featureValuesHope << endl;
// cerr << "Rank " << rank << ", epoch " << epoch << ", fear: " << featureValuesFear << endl;
ScoreComponentCollection featureValueDiff = featureValuesHope;
featureValueDiff.MinusEquals(featureValuesFear);
cerr << "Rank " << rank << ", epoch " << epoch << ", hope - fear: " << featureValueDiff << endl;
float modelScoreDiff = featureValueDiff.InnerProduct(currWeights);
float loss = bleuScoreHope - bleuScoreFear;
float diff = 0;
if (loss > (modelScoreDiff + m_margin_slack)) {
diff = loss - (modelScoreDiff + m_margin_slack);
}
cerr << "Rank " << rank << ", epoch " << epoch << ", constraint: " << modelScoreDiff << " + " << m_margin_slack << " >= " << loss << " (current violation: " << diff << ")" << endl;
if (diff > epsilon) {
// constraint violated
oldDistanceFromOptimum += diff;
constraintViolatedBefore = true;
// compute alpha for given constraint: (loss - model score diff) / || feature value diff ||^2
// featureValueDiff.GetL2Norm() * featureValueDiff.GetL2Norm() == featureValueDiff.InnerProduct(featureValueDiff)
// from Crammer&Singer 2006: alpha = min {C , l_t/ ||x||^2}
float squaredNorm = featureValueDiff.GetL2Norm() * featureValueDiff.GetL2Norm();
if (squaredNorm > 0) {
float alpha = diff / squaredNorm;
if (m_slack > 0 ) {
if (alpha > m_slack) {
alpha = m_slack;
}
else if (alpha < m_slack*(-1)) {
alpha = m_slack*(-1);
}
}
cerr << "Rank " << rank << ", epoch " << epoch << ", alpha: " << alpha << endl;
featureValueDiff.MultiplyEquals(alpha);
weightUpdate.PlusEquals(featureValueDiff);
}
else {
VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", no update because squared norm is 0" << endl);
}
}
if (!constraintViolatedBefore) {
// constraint satisfied, nothing to do
cerr << "Rank " << rank << ", epoch " << epoch << ", constraint already satisfied" << endl;
vector<int> status(2);
status[0] = 0;
status[1] = 0;
return status;
}
// sanity check: constraint still violated after optimisation?
ScoreComponentCollection newWeights(currWeights);
newWeights.PlusEquals(weightUpdate);
bool constraintViolatedAfter = false;
float newDistanceFromOptimum = 0;
featureValueDiff = featureValuesHope;
featureValueDiff.MinusEquals(featureValuesFear);
modelScoreDiff = featureValueDiff.InnerProduct(newWeights);
diff = loss - (modelScoreDiff + m_margin_slack);
// approximate comparison between floats!
if (diff > epsilon) {
constraintViolatedAfter = true;
newDistanceFromOptimum += (loss - modelScoreDiff);
}
VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", check, constraint violated before? " << constraintViolatedBefore << ", after? " << constraintViolatedAfter << endl);
VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", check, error before: " << oldDistanceFromOptimum << ", after: " << newDistanceFromOptimum << ", change: " << oldDistanceFromOptimum - newDistanceFromOptimum << endl);
// apply update to weight vector
VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", weights before update: " << currWeights << endl);
currWeights.PlusEquals(weightUpdate);
VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", weights after update: " << currWeights << endl);
vector<int> status(2);
status[0] = 1;
status[1] = constraintViolatedAfter ? 1 : 0;
return status;
}
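For intuition, the closed-form step above with assumed numbers (a hedged example, not output of this code):

// loss = bleuHope - bleuFear = 0.30, modelScoreDiff = 0.10,
// m_margin_slack = 0, ||hope - fear||^2 = 4.0
float diff  = 0.30f - (0.10f + 0.0f);  // violation: 0.20
float alpha = diff / 4.0f;             // raw step: 0.05
// with m_slack = 0.01 the step is clipped to 0.01; the (hope - fear)
// feature vector scaled by alpha is then added to the weights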
}

View File

@ -67,15 +67,6 @@ namespace Mira {
m_scale_update(scale_update),
m_margin_slack(margin_slack) { }
std::vector<int> updateWeightsAnalytically(Moses::ScoreComponentCollection& currWeights,
Moses::ScoreComponentCollection& featureValuesHope,
Moses::ScoreComponentCollection& featureValuesFear,
float bleuScoresHope,
float bleuScoresFear,
size_t sentenceId,
float learning_rate,
size_t rank,
size_t epoch);
std::vector<int> updateWeights(Moses::ScoreComponentCollection& currWeights,
const std::vector<std::vector<Moses::ScoreComponentCollection> >& featureValues,
const std::vector<std::vector<float> >& losses,
@ -117,6 +108,7 @@ namespace Mira {
// scale update with log 10 of oracle BLEU score
bool m_scale_update;
// slack when comparing losses to model scores
float m_margin_slack;
};
}

View File

@ -45,7 +45,6 @@ vector<int> Perceptron::updateWeightsHopeFear(ScoreComponentCollection& currWeig
vector<int> update_status;
update_status.push_back(0);
update_status.push_back(0);
update_status.push_back(0);
return update_status;
}

View File

@ -94,14 +94,12 @@ void BleuScoreFeature::LoadReferences(const std::vector< std::vector< std::strin
for (size_t order = 1; order <= BleuScoreState::bleu_order; order++) {
for (size_t end_idx = order; end_idx <= refTokens.size(); end_idx++) {
Phrase ngram(Output);
//cerr << "start: " << end_idx-order << " end: " << end_idx << endl;
for (size_t s_idx = end_idx - order; s_idx < end_idx; s_idx++) {
const Factor* f = fc.AddFactor(Output, 0, refTokens[s_idx]);
Word w;
w.SetFactor(0, f);
ngram.AddWord(w);
}
//cerr << "Ref: " << ngram << endl;
ref_pair.second[ngram] += 1;
}
}
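// Illustration (tokens assumed): for refTokens = {"the","black","cat"}
// and order = 2, end_idx runs over 2..3, so the loop above inserts
// the bigrams "the black" and "black cat" into ref_pair.second.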
@ -120,10 +118,10 @@ void BleuScoreFeature::SetCurrentReference(size_t ref_id) {
}
/*
* Update the pseudo-document big_O after each translation of a source sentence.
* (big_O is an exponentially-weighted moving average of vectors c(e;{r_k}))
* big_O = 0.9 * (big_O + c(e_oracle))
* big_O_f = 0.9 * (big_O_f + |f|) input length of document big_O
* Update the pseudo-document O after each translation of a source sentence.
* (O is an exponentially-weighted moving average of vectors c(e;{r_k}))
* O = m_historySmoothing * (O + c(e_oracle))
O_f = m_historySmoothing * (O_f + |f|), the input length of the pseudo-document
*/
void BleuScoreFeature::UpdateHistory(const vector< const Word* >& hypo) {
Phrase phrase(Output, hypo);
@ -138,7 +136,6 @@ void BleuScoreFeature::UpdateHistory(const vector< const Word* >& hypo) {
for (size_t i = 0; i < BleuScoreState::bleu_order; i++) {
m_count_history[i] = m_historySmoothing * (m_count_history[i] + ngram_counts[i]);
m_match_history[i] = m_historySmoothing * (m_match_history[i] + ngram_matches[i]);
//cerr << "precisionHistory " << i + 1 << ": " << (m_match_history[i]/m_count_history[i]) << " (" << m_match_history[i] << "/" << m_count_history[i] << ")" << endl;
}
// update counts for reference and target length
@ -148,7 +145,7 @@ void BleuScoreFeature::UpdateHistory(const vector< const Word* >& hypo) {
}
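The recurrence described before UpdateHistory above is a plain exponentially-weighted moving average; a minimal standalone sketch (types simplified, smoothing factor assumed):

#include <vector>

// Fold one sentence's n-gram counts into the decaying history,
// exactly O = smoothing * (O + c(e)):
void updateEma(std::vector<float>& history,
               const std::vector<float>& counts, float smoothing) {
  for (size_t i = 0; i < history.size(); ++i)
    history[i] = smoothing * (history[i] + counts[i]);
}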
/*
* Update history with a batch of oracle translations
* Update history with a batch of translations
*/
void BleuScoreFeature::UpdateHistory(const vector< vector< const Word* > >& hypos, vector<size_t>& sourceLengths, vector<size_t>& ref_ids, size_t rank, size_t epoch) {
for (size_t batchPosition = 0; batchPosition < hypos.size(); ++batchPosition){
@ -195,7 +192,7 @@ void BleuScoreFeature::UpdateHistory(const vector< vector< const Word* > >& hypo
}
/*
* Update history with a batch of oracle translations
Print the reference lengths for a batch of sentences
*/
void BleuScoreFeature::PrintReferenceLength(const vector<size_t>& ref_ids) {
for (size_t batchPosition = 0; batchPosition < ref_ids.size(); ++batchPosition){
@ -325,7 +322,6 @@ FFState* BleuScoreFeature::Evaluate(const Hypothesis& cur_hypo,
}
new_state->m_source_length = cur_hypo.GetWordsBitmap().GetSize();
new_state->m_source_phrase_length = cur_hypo.GetCurrSourceWordsRange().GetNumWordsCovered(); // todo: delete
new_state->m_words = new_words.GetSubString(WordsRange(ctx_start_idx,
ctx_end_idx));
new_state->m_target_length += cur_hypo.GetTargetPhrase().GetSize();
@ -337,7 +333,6 @@ FFState* BleuScoreFeature::Evaluate(const Hypothesis& cur_hypo,
// Calculate new bleu.
new_bleu = CalculateBleu(new_state);
//cerr << "NS: " << *new_state << " NB " << new_bleu << endl;
// Set score to new Bleu score
accumulator->PlusEquals(this, new_bleu - old_bleu);
@ -396,82 +391,6 @@ float BleuScoreFeature::CalculateBleu(BleuScoreState* state) const {
return precision;
}
vector<float> BleuScoreFeature::CalculateBleuOfCorpus(const vector< vector< const Word* > >& oracles, const vector<size_t>& ref_ids) {
// get ngram matches and counts for all oracle sentences and their references
vector<size_t> sumOfClippedNgramMatches(BleuScoreState::bleu_order);
vector<size_t> sumOfNgramCounts(BleuScoreState::bleu_order);
size_t ref_length = 0;
size_t target_length = 0;
for (size_t batchPosition = 0; batchPosition < oracles.size(); ++batchPosition){
Phrase phrase(Output, oracles[batchPosition]);
size_t ref_id = ref_ids[batchPosition];
size_t cur_ref_length = m_refs[ref_id].first;
NGrams cur_ref_ngrams = m_refs[ref_id].second;
ref_length += cur_ref_length;
target_length += oracles[batchPosition].size();
std::vector< size_t > ngram_counts(BleuScoreState::bleu_order);
std::vector< size_t > clipped_ngram_matches(BleuScoreState::bleu_order);
GetClippedNgramMatchesAndCounts(phrase, cur_ref_ngrams, ngram_counts, clipped_ngram_matches, 0);
// add clipped ngram matches and ngram counts to corpus sums
for (size_t i = 0; i < BleuScoreState::bleu_order; i++) {
sumOfClippedNgramMatches[i] += clipped_ngram_matches[i];
sumOfNgramCounts[i] += ngram_counts[i];
}
}
if (!sumOfNgramCounts[0]) {
vector<float> empty(0);
return empty;
}
if (!sumOfClippedNgramMatches[0]) {
vector<float> empty(0);
return empty; // if we have no unigram matches, score should be 0
}
// calculate bleu score
float precision = 1.0;
vector<float> bleu;
// Calculate geometric mean of modified ngram precisions
// BLEU = BP * exp(SUM_1_4 1/4 * log p_n)
// = BP * 4th root(PRODUCT_1_4 p_n)
for (size_t i = 0; i < BleuScoreState::bleu_order; i++) {
if (sumOfNgramCounts[i]) {
precision *= 1.0*sumOfClippedNgramMatches[i] / sumOfNgramCounts[i];
bleu.push_back(1.0*sumOfClippedNgramMatches[i] / sumOfNgramCounts[i]);
}
}
// take geometric mean
precision = pow(precision, (float)1/4);
// Apply brevity penalty if applicable.
// BP = 1 if c > r
// BP = e^(1 - r/c) if c <= r
// where
// c: length of the candidate translation
// r: effective reference length (sum of best match lengths for each candidate sentence)
float BP;
if (target_length < ref_length) {
precision *= exp(1 - (1.0*ref_length/target_length));
BP = exp(1 - (1.0*ref_length/target_length));
}
else {
BP = 1.0;
}
bleu.push_back(precision);
bleu.push_back(BP);
bleu.push_back(1.0*target_length/ref_length);
bleu.push_back(target_length);
bleu.push_back(ref_length);
return bleu;
}
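As a sanity check of the formula in the comments above, a worked example with assumed counts (not produced by this code):

// clipped matches 8/5/3/2 over counts 10/9/8/7, hyp_len 10, ref_len 12
float p    = (8.f/10) * (5.f/9) * (3.f/8) * (2.f/7); // product of precisions
float bleu = std::pow(p, 0.25f);                     // geometric mean ~ 0.47
float bp   = std::exp(1.f - 12.f/10.f);              // c <= r, so BP ~ 0.82
bleu *= bp;                                          // final score ~ 0.38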
const FFState* BleuScoreFeature::EmptyHypothesisState(const InputType& input) const
{
return new BleuScoreState();

View File

@ -29,8 +29,6 @@ private:
size_t m_source_length;
size_t m_target_length;
size_t m_source_phrase_length; // todo: delete
// scaled reference length is needed for scoring incomplete hypotheses against reference translation
float m_scaled_ref_length;
@ -52,7 +50,7 @@ public:
m_target_length_history(0),
m_ref_length_history(0),
m_scale_by_input_length(true),
m_historySmoothing(0.9) {}
m_historySmoothing(0.7) {}
BleuScoreFeature(bool scaleByInputLength, float historySmoothing):
StatefulFeatureFunction("BleuScore"),
@ -101,11 +99,10 @@ public:
const FFState* prev_state,
ScoreComponentCollection* accumulator) const;
float CalculateBleu(BleuScoreState*) const;
std::vector<float> CalculateBleuOfCorpus(const std::vector< std::vector< const Word* > >& hypos, const std::vector<size_t>& ref_ids);
const FFState* EmptyHypothesisState(const InputType&) const;
private:
// counts for pseudo-document big_O
// counts for pseudo-document
std::vector< float > m_count_history;
std::vector< float > m_match_history;
float m_source_length_history;
@ -117,9 +114,10 @@ private:
NGrams m_cur_ref_ngrams;
size_t m_cur_ref_length;
// whether or not to scale the BLEU score by a history of the input size
// scale BLEU score by history of input size
bool m_scale_by_input_length;
// smoothing factor for history counts
float m_historySmoothing;
};