2010-09-17 13:24:46 +04:00
|
|
|
#include "Optimiser.h"
|
2010-10-29 19:41:37 +04:00
|
|
|
#include "Hildreth.h"
|
2010-09-17 13:24:46 +04:00
|
|
|
|
2010-09-17 18:54:35 +04:00
|
|
|
using namespace Moses;
|
2010-09-17 13:24:46 +04:00
|
|
|
using namespace std;
|
|
|
|
|
|
|
|
namespace Mira {
|
2010-10-25 16:22:35 +04:00
|
|
|
|
2010-11-30 20:26:34 +03:00
|
|
|
/*
 * Perform one MIRA weight update for a batch of input sentences.
 *
 * currWeights         - current weight vector; updated in place.
 * featureValues       - per sentence i, the feature vectors of its n-best hypotheses j.
 * losses              - per sentence i, the loss of each hypothesis j (scaled below by m_marginScaleFactor).
 * oracleFeatureValues - per sentence i, the feature vector of its oracle translation.
 *
 * Two modes, selected by m_hildreth:
 *  - Hildreth: collect one constraint per (i,j) against the oracle and solve the
 *    QP jointly via Hildreth::optimise; returns the reduction in the number of
 *    violated constraints (violatedConstraintsBefore - violatedConstraintsAfter).
 *  - SMO: pairwise updates over all hypothesis pairs (j,k) per sentence; returns 0.
 */
int MiraOptimiser::updateWeights(ScoreComponentCollection& currWeights,
		const vector< vector<ScoreComponentCollection> >& featureValues,
		const vector< vector<float> >& losses,
		const vector< ScoreComponentCollection>& oracleFeatureValues) {

	if (m_hildreth) {
		size_t violatedConstraintsBefore = 0;
		// One entry per constraint passed to the optimiser:
		// the feature-value difference (oracle - hypothesis) ...
		vector< ScoreComponentCollection> featureValueDiffs;
		// ... and the corresponding margin violation (loss - model score difference).
		vector< float> lossMarginDistances;
		for (size_t i = 0; i < featureValues.size(); ++i) {
			for (size_t j = 0; j < featureValues[i].size(); ++j) {
				// check if optimisation criterion is violated for one hypothesis and the oracle
				// h(e*) >= h(e_ij) + loss(e_ij)
				// h(e*) - h(e_ij) >= loss(e_ij)
				ScoreComponentCollection featureValueDiff = oracleFeatureValues[i];
				featureValueDiff.MinusEquals(featureValues[i][j]);
				float modelScoreDiff = featureValueDiff.InnerProduct(currWeights);
				//cerr << "loss of hypothesis: " << losses[i][j] << endl;
				//cerr << "model score difference: " << modelScoreDiff << endl;
				float loss = losses[i][j] * m_marginScaleFactor;

				bool addConstraint = true;
				if (modelScoreDiff < loss) {
					// constraint violated
					++violatedConstraintsBefore;
				}
				else if (m_onlyViolatedConstraints) {
					// constraint not violated: skip it when only violated
					// constraints are requested
					addConstraint = false;
				}

				if (addConstraint) {
					// Objective: 1/2 * ||w' - w||^2 + C * SUM_1_m[ max_1_n (l_ij - Delta_h_ij.w')]
					// To add a constraint for the optimiser for each sentence i and hypothesis j, we need:
					// 1. vector Delta_h_ij of the feature value differences (oracle - hypothesis)
					// 2. loss_ij - difference in model scores (Delta_h_ij.w') (oracle - hypothesis)
					featureValueDiffs.push_back(featureValueDiff);
					float lossMarginDistance = loss - modelScoreDiff;
					lossMarginDistances.push_back(lossMarginDistance);
				}
			}
		}

		//cerr << "Number of constraints passed to optimiser: " << featureValueDiffs.size() << endl;

		if (violatedConstraintsBefore > 0) {
			// TODO: slack?
			// run optimisation
			//cerr << "\nNumber of violated constraints: " << violatedConstraintsBefore << endl;
			// compute deltas for all given constraints
			vector< float> alphas;
			if (m_regulariseHildrethUpdates) {
				// regularised variant: pass the slack/clipping constant C
				alphas = Hildreth::optimise(featureValueDiffs, lossMarginDistances, m_c);
			}
			else {
				alphas = Hildreth::optimise(featureValueDiffs, lossMarginDistances);
			}

			// Update the weight vector according to the deltas and the feature value differences
			// * w' = w' + delta * Dh_ij ---> w' = w' + delta * (h(e*) - h(e_ij))
			float sumOfAlphas = 0;
			for (size_t k = 0; k < featureValueDiffs.size(); ++k) {
				//cerr << "alpha " << k << ": " << alphas[k] << endl;
				sumOfAlphas += alphas[k];

				// compute update (scales featureValueDiffs[k] in place; the
				// vector is not reused afterwards)
				featureValueDiffs[k].MultiplyEquals(alphas[k]);

				// apply update to weight vector
				currWeights.PlusEquals(featureValueDiffs[k]);
			}
			//cerr << "sum of alphas: " << sumOfAlphas << endl;

			// sanity check: how many constraints violated after optimisation?
			size_t violatedConstraintsAfter = 0;
			for (size_t i = 0; i < featureValues.size(); ++i) {
				for (size_t j = 0; j < featureValues[i].size(); ++j) {
					ScoreComponentCollection featureValueDiff = oracleFeatureValues[i];
					featureValueDiff.MinusEquals(featureValues[i][j]);
					float modelScoreDiff = featureValueDiff.InnerProduct(currWeights);
					float loss = losses[i][j] * m_marginScaleFactor;
					if (modelScoreDiff < loss) {
						++violatedConstraintsAfter;
					}
					//cerr << "New model score difference: " << modelScoreDiff << endl;
				}
			}

			//cerr << "Number of violated constraints after optimisation: " << violatedConstraintsAfter << endl;
			if (violatedConstraintsAfter > violatedConstraintsBefore) {
				cerr << "Increase: " << violatedConstraintsAfter - violatedConstraintsBefore << endl << endl;
			}

			// NOTE(review): both operands are size_t, so this subtraction wraps
			// around to a huge unsigned value (then converted to int) whenever
			// violatedConstraintsAfter > violatedConstraintsBefore — the case
			// warned about just above. Verify callers tolerate this.
			return violatedConstraintsBefore - violatedConstraintsAfter;
		}
		else {
			cerr << "No constraint violated for this batch" << endl;
		}
	}
	else {
		// SMO:
		for (size_t i = 0; i < featureValues.size(); ++i) {
			vector< float> alphas(featureValues[i].size()); // TODO: dont pass alphas if not needed
			if (!m_fixedClipping) {
				// initialise alphas for each source (alpha for oracle translation = C, all other alphas = 0)
				for (size_t j = 0; j < featureValues[i].size(); ++j) {
					if (j == m_oracleIndices[i]) {
						// oracle
						alphas[j] = m_c;
					}
					else {
						alphas[j] = 0;
					}
				}
			}

			// consider all pairs of hypotheses
			size_t violatedConstraintsBefore = 0;
			size_t pairs = 0;
			for (size_t j = 0; j < featureValues[i].size(); ++j) {
				for (size_t k = 0; k < featureValues[i].size(); ++k) {
					// each unordered pair once (including j == k, whose
					// feature difference is zero and yields delta == 0)
					if (j <= k) {
						++pairs;
						ScoreComponentCollection featureValueDiff = featureValues[i][k];
						featureValueDiff.MinusEquals(featureValues[i][j]);
						float modelScoreDiff = featureValueDiff.InnerProduct(currWeights);
						float loss_jk = (losses[i][j] - losses[i][k]) * m_marginScaleFactor;

						if (m_onlyViolatedConstraints) {
							// check if optimisation criterion is violated for current hypothesis pair
							// (oracle - hypothesis j) - (oracle - hypothesis_k) = hypothesis_k - hypothesis_j
							bool addConstraint = true;
							if (modelScoreDiff < loss_jk) {
								// constraint violated
								++violatedConstraintsBefore;
							}
							// NOTE(review): this nested test is redundant — we
							// are already inside if (m_onlyViolatedConstraints),
							// so this branch is always taken when the constraint
							// is not violated.
							else if (m_onlyViolatedConstraints) {
								// constraint not violated
								addConstraint = false;
							}

							if (addConstraint) {
								// Compute delta:
								float delta = computeDelta(currWeights, featureValueDiff, loss_jk, j, k, alphas);

								// update weight vector:
								if (delta != 0) {
									update(currWeights, featureValueDiff, delta);
									cerr << "\nComparing pair" << j << "," << k << endl;
									cerr << "Update with delta: " << delta << endl;
								}
							}
						}
						else {
							// add all constraints
							// Compute delta:
							float delta = computeDelta(currWeights, featureValueDiff, loss_jk, j, k, alphas);

							// update weight vector:
							if (delta != 0) {
								update(currWeights, featureValueDiff, delta);
								cerr << "\nComparing pair" << j << "," << k << endl;
								cerr << "Update with delta: " << delta << endl;
							}
						}
					}
				}
			}

			cerr << "number of pairs: " << pairs << endl;
		}
	}

	// SMO path (and Hildreth path with no violated constraints) reports no change.
	return 0;
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Compute delta for weight update.
|
|
|
|
* As part of this compute feature value differences
|
|
|
|
* Dh_ij - Dh_ij' ---> h(e_ij') - h(e_ij)) --> h(hope) - h(fear)
|
|
|
|
* which are used in the delta term and in the weight update term.
|
|
|
|
*/
|
|
|
|
float MiraOptimiser::computeDelta(ScoreComponentCollection& currWeights,
|
2010-11-19 14:35:16 +03:00
|
|
|
const ScoreComponentCollection featureValueDiff,
|
|
|
|
float loss_jk,
|
|
|
|
float j,
|
|
|
|
float k,
|
|
|
|
vector< float>& alphas) {
|
2010-11-18 19:24:51 +03:00
|
|
|
|
|
|
|
// compute delta
|
|
|
|
float delta = 0.0;
|
2010-11-19 14:35:16 +03:00
|
|
|
float modelScoreDiff = featureValueDiff.InnerProduct(currWeights);
|
|
|
|
float squaredNorm = featureValueDiff.InnerProduct(featureValueDiff);
|
2010-11-18 19:24:51 +03:00
|
|
|
if (squaredNorm == 0.0) {
|
|
|
|
delta = 0.0;
|
|
|
|
}
|
|
|
|
else {
|
2010-11-19 14:35:16 +03:00
|
|
|
delta = (loss_jk - modelScoreDiff) / squaredNorm;
|
2010-11-18 19:24:51 +03:00
|
|
|
|
|
|
|
// clipping
|
|
|
|
if (m_fixedClipping) {
|
|
|
|
if (delta > m_c) {
|
|
|
|
delta = m_c;
|
|
|
|
}
|
|
|
|
else if (delta < -1 * m_c) {
|
|
|
|
delta = -1 * m_c;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
// alpha_ij = alpha_ij + delta
|
|
|
|
// alpha_ij' = alpha_ij' - delta
|
|
|
|
// clipping interval: [-alpha_ij, alpha_ij']
|
|
|
|
// clip delta
|
|
|
|
if (delta > alphas[j]) {
|
|
|
|
delta = alphas[j];
|
|
|
|
}
|
|
|
|
else if (delta < (-1 * alphas[k])) {
|
|
|
|
delta = (-1 * alphas[k]);
|
|
|
|
}
|
|
|
|
|
|
|
|
// update alphas
|
|
|
|
alphas[j] -= delta;
|
|
|
|
alphas[k] += delta;
|
|
|
|
}
|
2010-10-25 16:22:35 +04:00
|
|
|
}
|
|
|
|
|
2010-11-18 19:24:51 +03:00
|
|
|
return delta;
|
2010-10-25 16:22:35 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Update the weight vector according to delta and the feature value difference
 * w' = w' + delta * (Dh_ij - Dh_ij') ---> w' = w' + delta * (h(e_ij') - h(e_ij)))
 *
 * NOTE: destructive — featureValueDiffs is scaled in place by delta
 * (MultiplyEquals) before being added into currWeights, so the caller's
 * argument is modified by this call.
 */
void MiraOptimiser::update(ScoreComponentCollection& currWeights, ScoreComponentCollection& featureValueDiffs, const float delta) {
	// scale the difference vector by the step size (in place)
	featureValueDiffs.MultiplyEquals(delta);
	// apply the scaled difference to the weight vector
	currWeights.PlusEquals(featureValueDiffs);
}
|
2010-10-29 19:41:37 +04:00
|
|
|
|
2010-09-17 13:24:46 +04:00
|
|
|
}
|
|
|
|
|
2010-10-29 19:41:37 +04:00
|
|
|
|