// mosesdecoder/mira/Main.cpp

/***********************************************************************
 Moses - factored phrase-based language decoder
 Copyright (C) 2010 University of Edinburgh

 This library is free software; you can redistribute it and/or
 modify it under the terms of the GNU Lesser General Public
 License as published by the Free Software Foundation; either
 version 2.1 of the License, or (at your option) any later version.

 This library is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 Lesser General Public License for more details.

 You should have received a copy of the GNU Lesser General Public
 License along with this library; if not, write to the Free Software
 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 ***********************************************************************/

#include <algorithm>
#include <cstdlib>
#include <ctime>
#include <string>
#include <vector>
#include <map>

#include <boost/program_options.hpp>
#include <boost/algorithm/string.hpp>

#ifdef MPI_ENABLE
#include <boost/mpi.hpp>
namespace mpi = boost::mpi;
#endif

#include "Main.h"
#include "Optimiser.h"
#include "Hildreth.h"
#include "HypothesisQueue.h"
#include "moses/FeatureVector.h"
#include "moses/StaticData.h"
#include "moses/ChartTrellisPathList.h"
#include "moses/ChartTrellisPath.h"
#include "moses/ScoreComponentCollection.h"
#include "moses/ThreadPool.h"
#include "moses/DummyScoreProducers.h"
#include "moses/LexicalReordering.h"
#include "moses/WordTranslationFeature.h"
#include "moses/PhrasePairFeature.h"
#include "mert/BleuScorer.h"

using namespace Mira;
using namespace std;
using namespace Moses;
namespace po = boost::program_options;

int main(int argc, char** argv) {
  size_t rank = 0;
  size_t size = 1;
#ifdef MPI_ENABLE
  mpi::environment env(argc,argv);
  mpi::communicator world;
  rank = world.rank();
  size = world.size();
#endif

  bool help;
  int verbosity;
  string mosesConfigFile;
  string inputFile;
  vector<string> referenceFiles;
  vector<string> mosesConfigFilesFolds, inputFilesFolds, referenceFilesFolds;
  //  string coreWeightFile, startWeightFile;
  size_t epochs;
  string learner;
  bool shuffle;
  size_t mixingFrequency;
  size_t weightDumpFrequency;
  string weightDumpStem;
  bool scale_margin, scale_margin_precision;
  bool scale_update, scale_update_precision;
  size_t n;
  size_t batchSize;
  bool distinctNbest;
  bool accumulateWeights;
  float historySmoothing;
  bool scaleByInputLength, scaleByAvgInputLength;
  bool scaleByInverseLength, scaleByAvgInverseLength;
  float scaleByX;
  float slack;
  bool averageWeights;
  bool weightConvergence;
  float learning_rate;
  float mira_learning_rate;
  float perceptron_learning_rate;
  string decoder_settings;
  float min_weight_change;
  bool normaliseWeights, normaliseMargin;
  bool print_feature_values;
  bool historyBleu   ;
  bool sentenceBleu;
  bool perceptron_update;
  bool hope_fear;
  bool model_hope_fear;
  int hope_n, fear_n;
  size_t bleu_smoothing_scheme;
  float min_oracle_bleu;
  float minBleuRatio, maxBleuRatio;
  bool boost;
  bool decode_hope, decode_fear, decode_model;
  string decode_filename;
  bool batchEqualsShard;
  bool sparseAverage, dumpMixedWeights, sparseNoAverage;
  int featureCutoff;
  bool pruneZeroWeights;
  bool printFeatureCounts, printNbestWithFeatures;
  bool avgRefLength;
  bool print_weights, print_core_weights, debug_model, scale_lm, scale_wp;
  float scale_lm_factor, scale_wp_factor;
  bool kbest;
  string moses_src;
  float sigmoidParam;
  float bleuWeight, bleuWeight_hope, bleuWeight_fear;
  bool bleu_weight_lm, bleu_weight_lm_adjust;
  float bleu_weight_lm_factor;
  bool l1_regularize, l2_regularize, l1_reg_sparse, l2_reg_sparse;
  float l1_lambda, l2_lambda;
  bool most_violated, most_violated_reg, all_violated, max_bleu_diff, one_against_all;
  bool feature_confidence, signed_counts;
  float decay_core, decay_sparse, core_r0, sparse_r0;
  bool selective, summed;
  float bleu_weight_fear_factor;
  bool hildreth;
  float add2lm;
  bool realBleu, disableBleuFeature;
  bool rescaleSlack;
  bool makePairs;
  bool debug;
  bool reg_on_every_mix;
  size_t continue_epoch;
  bool modelPlusBleu,  simpleHistoryBleu;
  po::options_description desc("Allowed options");
  desc.add_options()
    ("continue-epoch", po::value<size_t>(&continue_epoch)->default_value(0), "Continue an interrupted experiment from this epoch on")
    ("freq-reg", po::value<bool>(&reg_on_every_mix)->default_value(false), "Regularize after every weight mixing")
    ("l1sparse", po::value<bool>(&l1_reg_sparse)->default_value(true), "L1-regularization for sparse weights only")
    ("l2sparse", po::value<bool>(&l2_reg_sparse)->default_value(true), "L2-regularization for sparse weights only")
    ("mv-reg", po::value<bool>(&most_violated_reg)->default_value(false), "Regularize most violated constraint")
    ("dbg", po::value<bool>(&debug)->default_value(true), "More debug output")
    ("make-pairs", po::value<bool>(&makePairs)->default_value(true), "Make pairs of hypotheses for 1slack")
    ("debug", po::value<bool>(&debug)->default_value(true), "More debug output")
    ("rescale-slack", po::value<bool>(&rescaleSlack)->default_value(false), "Rescale slack in 1-slack formulation")
    ("disable-bleu-feature", po::value<bool>(&disableBleuFeature)->default_value(false), "Disable the Bleu feature")
    ("real-bleu", po::value<bool>(&realBleu)->default_value(false), "Compute real sentence Bleu on complete translations")
    ("add2lm", po::value<float>(&add2lm)->default_value(0.0), "Add the specified amount to all LM weights")
    ("hildreth", po::value<bool>(&hildreth)->default_value(false), "Prefer Hildreth over analytical update")
    ("selective", po::value<bool>(&selective)->default_value(false), "Build constraints for every feature")
    ("summed", po::value<bool>(&summed)->default_value(false), "Sum up all constraints")
    ("model-plus-bleu", po::value<bool>(&modelPlusBleu)->default_value(false), "Use the sum of model score and +/- bleu to select hope and fear translations")
    ("simple-history-bleu", po::value<bool>(&simpleHistoryBleu)->default_value(false), "Simple history Bleu")

    ("bleu-weight", po::value<float>(&bleuWeight)->default_value(1.0), "Bleu weight used in decoder objective")
    ("bw-hope", po::value<float>(&bleuWeight_hope)->default_value(-1.0), "Bleu weight used in decoder objective for hope")
    ("bw-fear", po::value<float>(&bleuWeight_fear)->default_value(-1.0), "Bleu weight used in decoder objective for fear")

    ("core-r0", po::value<float>(&core_r0)->default_value(1.0), "Start learning rate for core features")
    ("sparse-r0", po::value<float>(&sparse_r0)->default_value(1.0), "Start learning rate for sparse features")

    ("tie-bw-to-lm", po::value<bool>(&bleu_weight_lm)->default_value(false), "Make bleu weight depend on lm weight")
    ("adjust-bw", po::value<bool>(&bleu_weight_lm_adjust)->default_value(false), "Adjust bleu weight when lm weight changes")
    ("bw-lm-factor", po::value<float>(&bleu_weight_lm_factor)->default_value(2.0), "Make bleu weight depend on lm weight by this factor")
    ("bw-factor-fear", po::value<float>(&bleu_weight_fear_factor)->default_value(1.0), "Multiply fear weight by this factor")
    ("accumulate-weights", po::value<bool>(&accumulateWeights)->default_value(false), "Accumulate and average weights over all epochs")
    ("average-weights", po::value<bool>(&averageWeights)->default_value(false), "Set decoder weights to average weights after each update")
    ("avg-ref-length", po::value<bool>(&avgRefLength)->default_value(false), "Use average reference length instead of shortest for BLEU score feature")
    ("batch-equals-shard", po::value<bool>(&batchEqualsShard)->default_value(false), "Batch size is equal to shard size (purely batch)")
    ("batch-size,b", po::value<size_t>(&batchSize)->default_value(1), "Size of batch that is send to optimiser for weight adjustments")
    ("bleu-smoothing-scheme", po::value<size_t>(&bleu_smoothing_scheme)->default_value(1), "Set a smoothing scheme for sentence-Bleu: +1 (1), +0.1 (2), papineni (3) (default:1)")
    ("boost", po::value<bool>(&boost)->default_value(false), "Apply boosting factor to updates on misranked candidates")
    ("config,f", po::value<string>(&mosesConfigFile), "Moses ini-file")
    ("configs-folds", po::value<vector<string> >(&mosesConfigFilesFolds), "Moses ini-files, one for each fold")
    ("debug-model", po::value<bool>(&debug_model)->default_value(false), "Get best model translation for debugging purposes")
    ("decode-hope", po::value<bool>(&decode_hope)->default_value(false), "Decode dev input set according to hope objective")
    ("decode-fear", po::value<bool>(&decode_fear)->default_value(false), "Decode dev input set according to fear objective")
    ("decode-model", po::value<bool>(&decode_model)->default_value(false), "Decode dev input set according to normal objective")
    ("decode-filename", po::value<string>(&decode_filename), "Filename for Bleu objective translations")
    ("decoder-settings", po::value<string>(&decoder_settings)->default_value(""), "Decoder settings for tuning runs")
    ("distinct-nbest", po::value<bool>(&distinctNbest)->default_value(true), "Use n-best list with distinct translations in inference step")
    ("dump-mixed-weights", po::value<bool>(&dumpMixedWeights)->default_value(false), "Dump mixed weights instead of averaged weights")
    ("epochs,e", po::value<size_t>(&epochs)->default_value(10), "Number of epochs")
    ("feature-cutoff", po::value<int>(&featureCutoff)->default_value(-1), "Feature cutoff as additional regularization for sparse features")
    ("fear-n", po::value<int>(&fear_n)->default_value(1), "Number of fear translations used")
    ("help", po::value(&help)->zero_tokens()->default_value(false), "Print this help message and exit")
    ("history-bleu", po::value<bool>(&historyBleu)->default_value(false), "Use 1best translations to update the history")
    ("history-smoothing", po::value<float>(&historySmoothing)->default_value(0.9), "Adjust the factor for history smoothing")
    ("hope-fear", po::value<bool>(&hope_fear)->default_value(true), "Use only hope and fear translations for optimisation (not model)")
    ("hope-n", po::value<int>(&hope_n)->default_value(2), "Number of hope translations used")
    ("input-file,i", po::value<string>(&inputFile), "Input file containing tokenised source")
    ("input-files-folds", po::value<vector<string> >(&inputFilesFolds), "Input files containing tokenised source, one for each fold")
    ("learner,l", po::value<string>(&learner)->default_value("mira"), "Learning algorithm")
    ("l1-lambda", po::value<float>(&l1_lambda)->default_value(0.0001), "Lambda for l1-regularization (w_i +/- lambda)")
    ("l2-lambda", po::value<float>(&l2_lambda)->default_value(0.01), "Lambda for l2-regularization (w_i * (1 - lambda))")
    ("l1-reg", po::value<bool>(&l1_regularize)->default_value(false), "L1-regularization")
    ("l2-reg", po::value<bool>(&l2_regularize)->default_value(false), "L2-regularization")
    ("min-bleu-ratio", po::value<float>(&minBleuRatio)->default_value(-1), "Set a minimum BLEU ratio between hope and fear")
    ("max-bleu-ratio", po::value<float>(&maxBleuRatio)->default_value(-1), "Set a maximum BLEU ratio between hope and fear")
    ("max-bleu-diff", po::value<bool>(&max_bleu_diff)->default_value(true), "Select hope/fear with maximum Bleu difference")
    ("min-oracle-bleu", po::value<float>(&min_oracle_bleu)->default_value(0), "Set a minimum oracle BLEU score")
    ("min-weight-change", po::value<float>(&min_weight_change)->default_value(0.0001), "Set minimum weight change for stopping criterion")
    ("mira-learning-rate", po::value<float>(&mira_learning_rate)->default_value(1), "Learning rate for MIRA (fixed or flexible)")
    ("mixing-frequency", po::value<size_t>(&mixingFrequency)->default_value(1), "How often per epoch to mix weights, when using mpi")
    ("model-hope-fear", po::value<bool>(&model_hope_fear)->default_value(false), "Use model, hope and fear translations for optimisation")
    ("moses-src", po::value<string>(&moses_src)->default_value(""), "Moses source directory")
    ("nbest,n", po::value<size_t>(&n)->default_value(1), "Number of translations in n-best list")
    ("normalise-weights", po::value<bool>(&normaliseWeights)->default_value(false), "Whether to normalise the updated weights before passing them to the decoder")
    ("normalise-margin", po::value<bool>(&normaliseMargin)->default_value(false), "Normalise the margin: squash between 0 and 1")
    ("perceptron-learning-rate", po::value<float>(&perceptron_learning_rate)->default_value(0.01), "Perceptron learning rate")
    ("print-feature-values", po::value<bool>(&print_feature_values)->default_value(false), "Print out feature values")
    ("print-feature-counts", po::value<bool>(&printFeatureCounts)->default_value(false), "Print out feature values, print feature list with hope counts after 1st epoch")
    ("print-nbest-with-features", po::value<bool>(&printNbestWithFeatures)->default_value(false), "Print out feature values, print feature list with hope counts after 1st epoch")
    ("print-weights", po::value<bool>(&print_weights)->default_value(false), "Print out current weights")
    ("print-core-weights", po::value<bool>(&print_core_weights)->default_value(true), "Print out current core weights")
    ("prune-zero-weights", po::value<bool>(&pruneZeroWeights)->default_value(false), "Prune zero-valued sparse feature weights")
    ("reference-files,r", po::value<vector<string> >(&referenceFiles), "Reference translation files for training")
    ("reference-files-folds", po::value<vector<string> >(&referenceFilesFolds), "Reference translation files for training, one for each fold")
    ("kbest", po::value<bool>(&kbest)->default_value(false), "Select hope/fear pairs from a list of nbest translations")

    ("scale-by-inverse-length", po::value<bool>(&scaleByInverseLength)->default_value(false), "Scale BLEU by (history of) inverse input length")
    ("scale-by-input-length", po::value<bool>(&scaleByInputLength)->default_value(false), "Scale BLEU by (history of) input length")
    ("scale-by-avg-input-length", po::value<bool>(&scaleByAvgInputLength)->default_value(false), "Scale BLEU by average input length")
    ("scale-by-avg-inverse-length", po::value<bool>(&scaleByAvgInverseLength)->default_value(false), "Scale BLEU by average inverse input length")
    ("scale-by-x", po::value<float>(&scaleByX)->default_value(1), "Scale the BLEU score by value x")
    ("scale-lm", po::value<bool>(&scale_lm)->default_value(false), "Scale the language model feature")
    ("scale-factor-lm", po::value<float>(&scale_lm_factor)->default_value(2), "Scale the language model feature by this factor")
    ("scale-wp", po::value<bool>(&scale_wp)->default_value(false), "Scale the word penalty feature")
    ("scale-factor-wp", po::value<float>(&scale_wp_factor)->default_value(2), "Scale the word penalty feature by this factor")
    ("scale-margin", po::value<bool>(&scale_margin)->default_value(0), "Scale the margin by the Bleu score of the oracle translation")
    ("scale-margin-precision", po::value<bool>(&scale_margin_precision)->default_value(0), "Scale margin by precision of oracle")
    ("scale-update", po::value<bool>(&scale_update)->default_value(0), "Scale update by Bleu score of oracle")
    ("scale-update-precision", po::value<bool>(&scale_update_precision)->default_value(0), "Scale update by precision of oracle")
    ("sentence-level-bleu", po::value<bool>(&sentenceBleu)->default_value(true), "Use a sentences level Bleu scoring function")
    ("shuffle", po::value<bool>(&shuffle)->default_value(false), "Shuffle input sentences before processing")
    ("sigmoid-param", po::value<float>(&sigmoidParam)->default_value(1), "y=sigmoidParam is the axis that this sigmoid approaches")
    ("slack", po::value<float>(&slack)->default_value(0.01), "Use slack in optimiser")
    ("sparse-average", po::value<bool>(&sparseAverage)->default_value(false), "Average weights by the number of processes")
    ("sparse-no-average", po::value<bool>(&sparseNoAverage)->default_value(false), "Don't average sparse weights, just sum")
    ("stop-weights", po::value<bool>(&weightConvergence)->default_value(true), "Stop when weights converge")
    ("verbosity,v", po::value<int>(&verbosity)->default_value(0), "Verbosity level")
    ("weight-dump-frequency", po::value<size_t>(&weightDumpFrequency)->default_value(1), "How often per epoch to dump weights (mpi)")
    ("weight-dump-stem", po::value<string>(&weightDumpStem)->default_value("weights"), "Stem of filename to use for dumping weights");

  po::options_description cmdline_options;
  cmdline_options.add(desc);
  po::variables_map vm;
  po::store(po::command_line_parser(argc, argv). options(cmdline_options).run(), vm);
  po::notify(vm);

  if (help) {
    std::cout << "Usage: " + string(argv[0])
      + " -f mosesini-file -i input-file -r reference-file(s) [options]" << std::endl;
    std::cout << desc << std::endl;
    return 0;
  }

  const StaticData &staticData = StaticData::Instance();

  bool trainWithMultipleFolds = false;
  if (mosesConfigFilesFolds.size() > 0 || inputFilesFolds.size() > 0 || referenceFilesFolds.size() > 0) {
    if (rank == 0)
      cerr << "Training with " << mosesConfigFilesFolds.size() << " folds" << endl;
    trainWithMultipleFolds = true;
  }

  if (dumpMixedWeights && (mixingFrequency != weightDumpFrequency)) {
    cerr << "Set mixing frequency = weight dump frequency for dumping mixed weights!" << endl;
    exit(1);
  }

  if ((sparseAverage || sparseNoAverage) && averageWeights) {
    cerr << "Parameters --sparse-average 1/--sparse-no-average 1 and --average-weights 1 are incompatible (not implemented)" << endl;
    exit(1);
  }

  if (trainWithMultipleFolds) {
    if (!mosesConfigFilesFolds.size()) {
      cerr << "Error: No moses ini files specified for training with folds" << endl;
      exit(1);
    }

    if (!inputFilesFolds.size()) {
      cerr << "Error: No input files specified for training with folds" << endl;
      exit(1);
    }

    if (!referenceFilesFolds.size()) {
      cerr << "Error: No reference files specified for training with folds" << endl;
      exit(1);
    }
  }
  else {
    if (mosesConfigFile.empty()) {
      cerr << "Error: No moses ini file specified" << endl;
      return 1;
    }

    if (inputFile.empty()) {
      cerr << "Error: No input file specified" << endl;
      return 1;
    }

    if (!referenceFiles.size()) {
      cerr << "Error: No reference files specified" << endl;
      return 1;
    }
  }

  // load input and references
  vector<string> inputSentences;
  size_t inputSize = trainWithMultipleFolds? inputFilesFolds.size(): 0;
  size_t refSize = trainWithMultipleFolds? referenceFilesFolds.size(): referenceFiles.size();
  vector<vector<string> > inputSentencesFolds(inputSize);
  vector<vector<string> > referenceSentences(refSize);

  // number of cores for each fold
  size_t coresPerFold = 0, myFold = 0;
  if (trainWithMultipleFolds) {
    if (mosesConfigFilesFolds.size() > size) {
      cerr << "Number of cores has to be a multiple of the number of folds" << endl;
      exit(1);
    }
    coresPerFold = size/mosesConfigFilesFolds.size();
    if (size % coresPerFold > 0) {
      cerr << "Number of cores has to be a multiple of the number of folds" << endl;
      exit(1);
    }

    if (rank == 0)
      cerr << "Number of cores per fold: " << coresPerFold << endl;
    myFold = rank/coresPerFold;
    cerr << "Rank " << rank << ", my fold: " << myFold << endl;
  }

  // NOTE: we do not actually need the references here, because we are reading them in from StaticData
  if (trainWithMultipleFolds) {
    if (!loadSentences(inputFilesFolds[myFold], inputSentencesFolds[myFold])) {
      cerr << "Error: Failed to load input sentences from " << inputFilesFolds[myFold] << endl;
      exit(1);
    }
    VERBOSE(1, "Rank " << rank << " reading inputs from " << inputFilesFolds[myFold] << endl);

    if (!loadSentences(referenceFilesFolds[myFold], referenceSentences[myFold])) {
      cerr << "Error: Failed to load reference sentences from " << referenceFilesFolds[myFold] << endl;
      exit(1);
    }
    if (referenceSentences[myFold].size() != inputSentencesFolds[myFold].size()) {
      cerr << "Error: Input file length (" << inputSentencesFolds[myFold].size() << ") != ("
	   << referenceSentences[myFold].size() << ") reference file length (rank " << rank << ")" << endl;
      exit(1);
    }
    VERBOSE(1, "Rank " << rank << " reading references from " << referenceFilesFolds[myFold] << endl);
  }
  else {
    if (!loadSentences(inputFile, inputSentences)) {
      cerr << "Error: Failed to load input sentences from " << inputFile << endl;
      return 1;
    }

    for (size_t i = 0; i < referenceFiles.size(); ++i) {
      if (!loadSentences(referenceFiles[i], referenceSentences[i])) {
	cerr << "Error: Failed to load reference sentences from "
	     << referenceFiles[i] << endl;
	return 1;
      }
      if (referenceSentences[i].size() != inputSentences.size()) {
	cerr << "Error: Input file length (" << inputSentences.size() << ") != ("
	     << referenceSentences[i].size() << ") length of reference file " << i
	     << endl;
	return 1;
      }
    }
  }

  if (scaleByAvgInputLength ||  scaleByInverseLength || scaleByAvgInverseLength)
    scaleByInputLength = false;

  if (historyBleu || simpleHistoryBleu) {
    sentenceBleu = false;
    cerr << "Using history Bleu. " << endl;
  }

  if (kbest) {
    realBleu = true;
    disableBleuFeature = true;
    cerr << "Use kbest lists and real Bleu scores, disable Bleu feature.." << endl;
  }

  // initialise Moses
  // add references to initialize Bleu feature
  boost::trim(decoder_settings);
  decoder_settings += " -mira -distinct-nbest -references";
  if (trainWithMultipleFolds) {
    decoder_settings += " ";
    decoder_settings += referenceFilesFolds[myFold];
  }
  else {
    for (size_t i=0; i < referenceFiles.size(); ++i) {
      decoder_settings += " ";
      decoder_settings += referenceFiles[i];
    }
  }

  vector<string> decoder_params;
  boost::split(decoder_params, decoder_settings, boost::is_any_of("\t "));

  string configFile = trainWithMultipleFolds? mosesConfigFilesFolds[myFold] : mosesConfigFile;
  VERBOSE(1, "Rank " << rank << " reading config file from " << configFile << endl);
  MosesDecoder* decoder = new MosesDecoder(configFile, verbosity, decoder_params.size(), decoder_params);
  decoder->setBleuParameters(disableBleuFeature, sentenceBleu, scaleByInputLength, scaleByAvgInputLength,
			     scaleByInverseLength, scaleByAvgInverseLength,
			     scaleByX, historySmoothing, bleu_smoothing_scheme, simpleHistoryBleu);
  SearchAlgorithm searchAlgorithm = staticData.GetSearchAlgorithm();
  bool chartDecoding = (searchAlgorithm == ChartDecoding);

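  // Build the initial sentence order (0, 1, 2, ...); if --shuffle is set, the
  // order is randomly permuted at the start of each epoch further below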
  vector<size_t> order;
  if (trainWithMultipleFolds) {
    for (size_t i = 0; i < inputSentencesFolds[myFold].size(); ++i) {
      order.push_back(i);
    }
  }
  else {
    if (rank == 0) {
      for (size_t i = 0; i < inputSentences.size(); ++i) {
	order.push_back(i);
      }
    }
  }

  // initialise optimizer
  Optimiser* optimiser = NULL;
  if (learner == "mira") {
    if (rank == 0) {
      cerr << "Optimising using Mira" << endl;
      cerr << "slack: " << slack << ", learning rate: " << mira_learning_rate << endl;
      cerr << "selective: " << selective << endl;
      if (normaliseMargin)
	cerr << "sigmoid parameter: " << sigmoidParam << endl;
    }
    optimiser = new MiraOptimiser(slack, scale_margin, scale_margin_precision,
				  scale_update, scale_update_precision, boost, normaliseMargin, sigmoidParam);
    learning_rate = mira_learning_rate;
    perceptron_update = false;
  } else if (learner == "perceptron") {
    if (rank == 0) {
      cerr << "Optimising using Perceptron" << endl;
    }
    optimiser = new Perceptron();
    learning_rate = perceptron_learning_rate;
    perceptron_update = true;
    model_hope_fear = false; // mira only
    hope_fear = false; // mira only
    n = 1;
    hope_n = 1;
    fear_n = 1;
  } else {
    cerr << "Error: Unknown optimiser: " << learner << endl;
    return 1;
  }

  // resolve parameter dependencies
  if (batchSize > 1 && perceptron_update) {
    batchSize = 1;
    cerr << "Info: Setting batch size to 1 for perceptron update" << endl;
  }

  if (hope_n == -1)
    hope_n = n;
  if (fear_n == -1)
    fear_n = n;

  if (model_hope_fear || kbest)
    hope_fear = false; // is true by default
  if (learner == "mira" && !(hope_fear || model_hope_fear || kbest)) {
    cerr << "Error: Need to select one of parameters --hope-fear/--model-hope-fear/--kbest for mira update." << endl;
    return 1;
  }

#ifdef MPI_ENABLE
  if (!trainWithMultipleFolds)
    mpi::broadcast(world, order, 0);
#endif

  // Create shards according to the number of processes used
  vector<size_t> shard;
  if (trainWithMultipleFolds) {
    size_t shardSize = order.size()/coresPerFold;
    size_t shardStart = (size_t) (shardSize * (rank % coresPerFold));
    size_t shardEnd = shardStart + shardSize;
    if (rank % coresPerFold == coresPerFold - 1) { // last rank of each fold
      shardEnd = order.size();
      shardSize = shardEnd - shardStart;
    }
    VERBOSE(1, "Rank: " << rank << ", shard size: " << shardSize << endl);
    VERBOSE(1, "Rank: " << rank << ", shard start: " << shardStart << " shard end: " << shardEnd << endl);
    shard.resize(shardSize);
    copy(order.begin() + shardStart, order.begin() + shardEnd, shard.begin());
    batchSize = 1;
  }
  else {
    size_t shardSize = order.size() / size;
    size_t shardStart = (size_t) (shardSize * rank);
    size_t shardEnd = (size_t) (shardSize * (rank + 1));
    if (rank == size - 1) {
      shardEnd = order.size();
      shardSize = shardEnd - shardStart;
    }
    VERBOSE(1, "Rank: " << rank << " Shard size: " << shardSize << endl);
    VERBOSE(1, "Rank: " << rank << " Shard start: " << shardStart << " Shard end: " << shardEnd << endl);
    shard.resize(shardSize);
    copy(order.begin() + shardStart, order.begin() + shardEnd, shard.begin());
    if (batchEqualsShard)
      batchSize = shardSize;
  }

  // get reference to feature functions
  const vector<const ScoreProducer*> featureFunctions =
    staticData.GetTranslationSystem(TranslationSystem::DEFAULT).GetFeatureFunctions();
  ScoreComponentCollection initialWeights = decoder->getWeights();

  bool tuneMetaFeature = false;
  const vector<const FeatureFunction*>& sparseProducers = staticData.GetTranslationSystem(TranslationSystem::DEFAULT).GetSparseProducers();
  for (unsigned i = 0; i < sparseProducers.size(); ++i) {
    float spWeight = sparseProducers[i]->GetSparseProducerWeight();
    if (spWeight != 1.0) {
      tuneMetaFeature = true;
      cerr << "Rank " << rank << ", sparse Producer " <<
	sparseProducers[i]->GetScoreProducerWeightShortName()
	   << " weight: " << spWeight << endl;
    }
  }

  if (add2lm != 0) {
    const LMList& lmList_new = staticData.GetLMList();
    for (LMList::const_iterator i = lmList_new.begin(); i != lmList_new.end(); ++i) {
      float lmWeight = initialWeights.GetScoreForProducer(*i) + add2lm;
      initialWeights.Assign(*i, lmWeight);
      cerr << "Rank " << rank << ", add " << add2lm << " to lm weight." << endl;
    }
  }

  if (normaliseWeights) {
    initialWeights.L1Normalise();
    cerr << "Rank " << rank << ", normalised initial weights: " << initialWeights << endl;
  }

  decoder->setWeights(initialWeights);

  // if requested, tie the bleu weight to the language model weight(s):
  // bleu weight = (sum of absolute LM weights) * bleu_weight_lm_factor
  const LMList& lmList = staticData.GetLMList();
  if (bleu_weight_lm) {
    float lmSum = 0;
    for (LMList::const_iterator i = lmList.begin(); i != lmList.end(); ++i)
      lmSum += abs(initialWeights.GetScoreForProducer(*i));
    bleuWeight = lmSum * bleu_weight_lm_factor;
    cerr << "Set bleu weight to lm weight * " << bleu_weight_lm_factor << endl;
  }

  if (bleuWeight_hope == -1) {
    bleuWeight_hope = bleuWeight;
  }
  if (bleuWeight_fear == -1) {
    bleuWeight_fear = bleuWeight;
  }
  bleuWeight_fear *= bleu_weight_fear_factor;
  cerr << "Bleu weight: " << bleuWeight << endl;
  cerr << "Bleu weight fear: " << bleuWeight_fear << endl;

  if (decode_hope || decode_fear || decode_model) {
    size_t decode = 1;
    if (decode_fear) decode = 2;
    if (decode_model) decode = 3;
    decodeHopeOrFear(rank, size, decode, decode_filename, inputSentences, decoder, n, bleuWeight);
  }

  //Main loop:
  ScoreComponentCollection cumulativeWeights; // collect weights per epoch to produce an average
  ScoreComponentCollection cumulativeWeightsBinary;
  size_t numberOfUpdates = 0;
  size_t numberOfUpdatesThisEpoch = 0;

  time_t now;
  time(&now);
  cerr << "Rank " << rank << ", " << ctime(&now);

  float avgInputLength = 0;
  float sumOfInputs = 0;
  size_t numberOfInputs = 0;

  ScoreComponentCollection mixedWeights;
  ScoreComponentCollection mixedWeightsPrevious;
  ScoreComponentCollection mixedWeightsBeforePrevious;
  ScoreComponentCollection mixedAverageWeights;
  ScoreComponentCollection mixedAverageWeightsPrevious;
  ScoreComponentCollection mixedAverageWeightsBeforePrevious;

  bool stop = false;
//	int sumStillViolatedConstraints;
  float epsilon = 0.0001;

  // Variables for feature confidence
  ScoreComponentCollection confidenceCounts, mixedConfidenceCounts, featureLearningRates;
  featureLearningRates.UpdateLearningRates(decay_core, decay_sparse, confidenceCounts, core_r0, sparse_r0); //initialise core learning rates
  cerr << "Initial learning rates, core: " << core_r0 << ", sparse: " << sparse_r0 << endl;

  for (size_t epoch = continue_epoch; epoch < epochs && !stop; ++epoch) {
    if (shuffle) {
      if (trainWithMultipleFolds || rank == 0) {
	cerr << "Rank " << rank << ", epoch " << epoch << ", shuffling input sentences.." << endl;
	RandomIndex rindex;
	random_shuffle(order.begin(), order.end(), rindex);
      }

#ifdef MPI_ENABLE
      if (!trainWithMultipleFolds)
	mpi::broadcast(world, order, 0);
#endif

      // redo shards
      if (trainWithMultipleFolds) {
	size_t shardSize = order.size()/coresPerFold;
	size_t shardStart = (size_t) (shardSize * (rank % coresPerFold));
	size_t shardEnd = shardStart + shardSize;
	if (rank % coresPerFold == coresPerFold - 1) { // last rank of each fold
	  shardEnd = order.size();
	  shardSize = shardEnd - shardStart;
	}
	VERBOSE(1, "Rank: " << rank << ", shard size: " << shardSize << endl);
	VERBOSE(1, "Rank: " << rank << ", shard start: " << shardStart << " shard end: " << shardEnd << endl);
	shard.resize(shardSize);
	copy(order.begin() + shardStart, order.begin() + shardEnd, shard.begin());
	batchSize = 1;
      }
      else {
	size_t shardSize = order.size()/size;
	size_t shardStart = (size_t) (shardSize * rank);
	size_t shardEnd = (size_t) (shardSize * (rank + 1));
	if (rank == size - 1) {
	  shardEnd = order.size();
	  shardSize = shardEnd - shardStart;
	}
	VERBOSE(1, "Shard size: " << shardSize << endl);
	VERBOSE(1, "Rank: " << rank << " Shard start: " << shardStart << " Shard end: " << shardEnd << endl);
	shard.resize(shardSize);
	copy(order.begin() + shardStart, order.begin() + shardEnd, shard.begin());
	if (batchEqualsShard)
	  batchSize = shardSize;
      }
    }

    // sum of violated constraints in an epoch
    // sumStillViolatedConstraints = 0;

    numberOfUpdatesThisEpoch = 0;
    // Sum up weights over one epoch, final average uses weights from last epoch
    if (!accumulateWeights) {
      cumulativeWeights.ZeroAll();
      cumulativeWeightsBinary.ZeroAll();
    }

    // number of weight dumps this epoch
    size_t weightMixingThisEpoch = 0;
    size_t weightEpochDump = 0;

    size_t shardPosition = 0;
    vector<size_t>::const_iterator sid = shard.begin();
    while (sid != shard.end()) {
      // feature values for hypotheses i,j (matrix: batchSize x 3*n x featureValues)
      vector<vector<ScoreComponentCollection> > featureValues;
      vector<vector<float> > bleuScores;
      vector<vector<float> > modelScores;

      // variables for hope-fear/perceptron setting
      vector<vector<ScoreComponentCollection> > featureValuesHope;
      vector<vector<ScoreComponentCollection> > featureValuesFear;
      vector<vector<float> > bleuScoresHope;
      vector<vector<float> > bleuScoresFear;
      vector<vector<float> > modelScoresHope;
      vector<vector<float> > modelScoresFear;

      // get moses weights
      ScoreComponentCollection mosesWeights = decoder->getWeights();
      VERBOSE(1, "\nRank " << rank << ", epoch " << epoch << ", weights: " << mosesWeights << endl);

      if (historyBleu || simpleHistoryBleu) {
	decoder->printBleuFeatureHistory(cerr);
      }

      if (tuneMetaFeature) {
	// initialise meta feature
	MetaFeatureProducer *m = staticData.GetMetaFeatureProducer();
	FeatureFunction* ff = const_cast<FeatureFunction*>(sparseProducers[0]);
	if (sparseProducers[0]->GetScoreProducerWeightShortName().compare("wt") == 0) {
	  WordTranslationFeature* wt =
	    static_cast<WordTranslationFeature*>(ff);
	  mosesWeights.Assign(m, wt->GetSparseProducerWeight());
	}
	else if (sparseProducers[0]->GetScoreProducerWeightShortName().compare("pp") == 0) {
	  PhrasePairFeature* pp =
	    static_cast<PhrasePairFeature*>(ff);
	  mosesWeights.Assign(m, pp->GetSparseProducerWeight());
	}
      }

      // BATCHING: produce nbest lists for all input sentences in batch
      vector<float> oracleBleuScores;
      vector<float> oracleModelScores;
      vector<vector<const Word*> > oneBests;
      vector<ScoreComponentCollection> oracleFeatureValues;
      vector<size_t> inputLengths;
      vector<size_t> ref_ids;
      size_t actualBatchSize = 0;

      vector<size_t>::const_iterator current_sid_start = sid;
      size_t examples_in_batch = 0;
      bool skip_example = false;
      for (size_t batchPosition = 0; batchPosition < batchSize && sid
	     != shard.end(); ++batchPosition) {
	string input;
	if (trainWithMultipleFolds)
	  input = inputSentencesFolds[myFold][*sid];
	else
	  input = inputSentences[*sid];

	Moses::Sentence *sentence = new Sentence();
	stringstream in(input + "\n");
	const vector<FactorType> inputFactorOrder = staticData.GetInputFactorOrder();
	sentence->Read(in,inputFactorOrder);
	cerr << "\nRank " << rank << ", epoch " << epoch << ", input sentence " << *sid << ": \"";
	sentence->Print(cerr);
	cerr << "\"" << " (batch pos " << batchPosition << ")" << endl;
	size_t current_input_length = (*sentence).GetSize();

	if (epoch == 0 && (scaleByAvgInputLength || scaleByAvgInverseLength)) {
	  sumOfInputs += current_input_length;
	  ++numberOfInputs;
	  avgInputLength = sumOfInputs/numberOfInputs;
	  decoder->setAvgInputLength(avgInputLength);
	  cerr << "Rank " << rank << ", epoch 0, average input length: " << avgInputLength << endl;
	}

	vector<ScoreComponentCollection> newFeatureValues;
	vector<float> newScores;
	if (model_hope_fear) {
	  featureValues.push_back(newFeatureValues);
	  bleuScores.push_back(newScores);
	  modelScores.push_back(newScores);
	}
	if (hope_fear || perceptron_update) {
	  featureValuesHope.push_back(newFeatureValues);
	  featureValuesFear.push_back(newFeatureValues);
	  bleuScoresHope.push_back(newScores);
	  bleuScoresFear.push_back(newScores);
	  modelScoresHope.push_back(newScores);
	  modelScoresFear.push_back(newScores);
	  if (historyBleu || simpleHistoryBleu || debug_model) {
	    featureValues.push_back(newFeatureValues);
	    bleuScores.push_back(newScores);
	    modelScores.push_back(newScores);
	  }
	}
	if (kbest) {
	  // for decoding
	  featureValues.push_back(newFeatureValues);
	  bleuScores.push_back(newScores);
	  modelScores.push_back(newScores);

	  // for storing selected examples
	  featureValuesHope.push_back(newFeatureValues);
	  featureValuesFear.push_back(newFeatureValues);
	  bleuScoresHope.push_back(newScores);
	  bleuScoresFear.push_back(newScores);
	  modelScoresHope.push_back(newScores);
	  modelScoresFear.push_back(newScores);
	}

	size_t ref_length;
	float avg_ref_length;

	if (print_weights)
	  cerr << "Rank " << rank << ", epoch " << epoch << ", current weights: " << mosesWeights << endl;
	if (print_core_weights) {
	  cerr << "Rank " << rank << ", epoch " << epoch << ", current weights: ";
	  mosesWeights.PrintCoreFeatures();
	  cerr << endl;
	}

	// check LM weight
	const LMList& lmList_new = staticData.GetLMList();
	for (LMList::const_iterator i = lmList_new.begin(); i != lmList_new.end(); ++i) {
	  float lmWeight = mosesWeights.GetScoreForProducer(*i);
	  cerr << "Rank " << rank << ", epoch " << epoch << ", lm weight: " << lmWeight << endl;
	  if (lmWeight <= 0) {
	    cerr << "Rank " << rank << ", epoch " << epoch << ", ERROR: language model weight should never be <= 0." << endl;
	    mosesWeights.Assign(*i, 0.1);
	    cerr << "Rank " << rank << ", epoch " << epoch << ", assign lm weights of 0.1" << endl;
	  }
	}

	// select inference scheme
	cerr << "Rank " << rank << ", epoch " << epoch << ", real Bleu? " << realBleu << endl;
	if (hope_fear || perceptron_update) {
	  // HOPE
	  cerr << "Rank " << rank << ", epoch " << epoch << ", " << hope_n <<
	    "best hope translations" << endl;
	  vector< vector<const Word*> > outputHope = decoder->getNBest(input, *sid, hope_n, 1.0, bleuWeight_hope,
	       featureValuesHope[batchPosition], bleuScoresHope[batchPosition], modelScoresHope[batchPosition],
	       1, realBleu, distinctNbest, avgRefLength, rank, epoch, "");
	  vector<const Word*> oracle = outputHope[0];
	  decoder->cleanup(chartDecoding);
	  ref_length = decoder->getClosestReferenceLength(*sid, oracle.size());
	  avg_ref_length = ref_length;
	  float hope_length_ratio = (float)oracle.size()/ref_length;
	  int oracleSize = (int)oracle.size();
	  cerr << endl;

	  // count sparse features occurring in hope translation
	  featureValuesHope[batchPosition][0].IncrementSparseHopeFeatures();

	  float precision = bleuScoresHope[batchPosition][0];
	  if (historyBleu || simpleHistoryBleu) {
	    precision /= decoder->getTargetLengthHistory();
	  }
	  else {
	    if (scaleByAvgInputLength) precision /= decoder->getAverageInputLength();
	    else if (scaleByAvgInverseLength) precision /= (100/decoder->getAverageInputLength());
	    precision /= scaleByX;
	  }
	  if (scale_margin_precision || scale_update_precision) {
	    if (historyBleu || simpleHistoryBleu || scaleByAvgInputLength || scaleByAvgInverseLength) {
	      cerr << "Rank " << rank << ", epoch " << epoch << ", set hope precision: " << precision << endl;
	      ((MiraOptimiser*) optimiser)->setPrecision(precision);
	    }
	  }

	  vector<const Word*> bestModel;
	  if (debug_model || historyBleu || simpleHistoryBleu) {
	    // MODEL (for updating the history only, using dummy vectors)
	    cerr << "Rank " << rank << ", epoch " << epoch << ", 1best wrt model score (debug or history)" << endl;
	    vector< vector<const Word*> > outputModel = decoder->getNBest(input, *sid, n, 0.0, bleuWeight,
		featureValues[batchPosition], bleuScores[batchPosition], modelScores[batchPosition],
		1, realBleu, distinctNbest, avgRefLength, rank, epoch, "");
	    bestModel = outputModel[0];
	    decoder->cleanup(chartDecoding);
	    cerr << endl;
	    ref_length = decoder->getClosestReferenceLength(*sid, bestModel.size());
	  }

	  // FEAR
	  float fear_length_ratio = 0;
	  float bleuRatioHopeFear = 0;
	  int fearSize = 0;
	  cerr << "Rank " << rank << ", epoch " << epoch << ", " << fear_n << "best fear translations" << endl;
	  vector< vector<const Word*> > outputFear = decoder->getNBest(input, *sid, fear_n, -1.0, bleuWeight_fear,
	      featureValuesFear[batchPosition], bleuScoresFear[batchPosition], modelScoresFear[batchPosition],
	      1, realBleu, distinctNbest, avgRefLength, rank, epoch, "");
	  vector<const Word*> fear = outputFear[0];
	  decoder->cleanup(chartDecoding);
	  ref_length = decoder->getClosestReferenceLength(*sid, fear.size());
	  avg_ref_length += ref_length;
	  avg_ref_length /= 2;
	  fear_length_ratio = (float)fear.size()/ref_length;
	  fearSize = (int)fear.size();
	  cerr << endl;
	  for (size_t i = 0; i < fear.size(); ++i)
	    delete fear[i];

	  // count sparse features occurring in fear translation
	  featureValuesFear[batchPosition][0].IncrementSparseFearFeatures();

	  // Bleu-related example selection
	  bool skip = false;
	  bleuRatioHopeFear = bleuScoresHope[batchPosition][0] / bleuScoresFear[batchPosition][0];
	  if (minBleuRatio != -1 && bleuRatioHopeFear < minBleuRatio)
	    skip = true;
	  if(maxBleuRatio != -1 && bleuRatioHopeFear > maxBleuRatio)
	    skip = true;

	  // sanity check
	  if (historyBleu || simpleHistoryBleu) {
	    if (bleuScores[batchPosition][0] > bleuScoresHope[batchPosition][0] &&
		modelScores[batchPosition][0] > modelScoresHope[batchPosition][0]) {
	      if (abs(bleuScores[batchPosition][0] - bleuScoresHope[batchPosition][0]) > epsilon &&
		  abs(modelScores[batchPosition][0] - modelScoresHope[batchPosition][0]) > epsilon) {
		cerr << "Rank " << rank << ", epoch " << epoch << ", ERROR: MODEL translation better than HOPE translation." << endl;
		skip = true;
	      }
	    }
	    if (bleuScoresFear[batchPosition][0] > bleuScores[batchPosition][0] &&
		modelScoresFear[batchPosition][0] > modelScores[batchPosition][0]) {
	      if (abs(bleuScoresFear[batchPosition][0] - bleuScores[batchPosition][0]) > epsilon &&
		  abs(modelScoresFear[batchPosition][0] - modelScores[batchPosition][0]) > epsilon) {
		cerr << "Rank " << rank << ", epoch " << epoch << ", ERROR: FEAR translation better than MODEL translation." << endl;
		skip = true;
	      }
	    }
	  }
	  if (bleuScoresFear[batchPosition][0] > bleuScoresHope[batchPosition][0]) {
	    if (abs(bleuScoresFear[batchPosition][0] - bleuScoresHope[batchPosition][0]) > epsilon) {
	      // check if it's an error or a warning
	      skip = true;
	      if (modelScoresFear[batchPosition][0] > modelScoresHope[batchPosition][0] && abs(modelScoresFear[batchPosition][0] - modelScoresHope[batchPosition][0]) > epsilon) {
		cerr << "Rank " << rank << ", epoch " << epoch << ", ERROR: FEAR translation better than HOPE translation. (abs-diff: " << abs(bleuScoresFear[batchPosition][0] - bleuScoresHope[batchPosition][0]) << ")" <<endl;
	      }
	      else {
		cerr << "Rank " << rank << ", epoch " << epoch << ", WARNING: FEAR translation has better Bleu than HOPE translation. (abs-diff: " << abs(bleuScoresFear[batchPosition][0] - bleuScoresHope[batchPosition][0]) << ")" <<endl;
	      }
	    }
	  }

	  if (skip) {
	    cerr << "Rank " << rank << ", epoch " << epoch << ", skip example (" << hope_length_ratio << ", " << bleuRatioHopeFear << ").. " << endl;
	    featureValuesHope[batchPosition].clear();
	    featureValuesFear[batchPosition].clear();
	    bleuScoresHope[batchPosition].clear();
	    bleuScoresFear[batchPosition].clear();
	    if (historyBleu || simpleHistoryBleu || debug_model) {
	      featureValues[batchPosition].clear();
	      bleuScores[batchPosition].clear();
	    }
	  }
	  else {
	    examples_in_batch++;

	    // needed for history
	    if (historyBleu || simpleHistoryBleu)  {
	      inputLengths.push_back(current_input_length);
	      ref_ids.push_back(*sid);
	      oneBests.push_back(bestModel);
	    }
	  }
	}
	if (model_hope_fear) {
	  cerr << "Rank " << rank << ", epoch " << epoch << ", " << n << "best hope translations" << endl;
	  size_t oraclePos = featureValues[batchPosition].size();
	  decoder->getNBest(input, *sid, n, 1.0, bleuWeight_hope,
	    featureValues[batchPosition], bleuScores[batchPosition], modelScores[batchPosition],
	    0, realBleu, distinctNbest, avgRefLength, rank, epoch, "");
	  //vector<const Word*> oracle = outputHope[0];
	  // needed for history
	  inputLengths.push_back(current_input_length);
	  ref_ids.push_back(*sid);
	  decoder->cleanup(chartDecoding);
	  //ref_length = decoder->getClosestReferenceLength(*sid, oracle.size());
	  //float hope_length_ratio = (float)oracle.size()/ref_length;
	  cerr << endl;

	  oracleFeatureValues.push_back(featureValues[batchPosition][oraclePos]);
	  oracleBleuScores.push_back(bleuScores[batchPosition][oraclePos]);
	  oracleModelScores.push_back(modelScores[batchPosition][oraclePos]);

	  // MODEL
	  cerr << "Rank " << rank << ", epoch " << epoch << ", " << n << "best wrt model score" << endl;
	  if (historyBleu || simpleHistoryBleu) {
	    vector< vector<const Word*> > outputModel = decoder->getNBest(input, *sid, n, 0.0,
	       bleuWeight, featureValues[batchPosition], bleuScores[batchPosition],
	       modelScores[batchPosition], 1, realBleu, distinctNbest, avgRefLength, rank, epoch, "");
	    vector<const Word*> bestModel = outputModel[0];
	    oneBests.push_back(bestModel);
	    inputLengths.push_back(current_input_length);
	    ref_ids.push_back(*sid);
	  }
	  else {
	    decoder->getNBest(input, *sid, n, 0.0, bleuWeight,
	      featureValues[batchPosition], bleuScores[batchPosition], modelScores[batchPosition],
	      0, realBleu, distinctNbest, avgRefLength, rank, epoch, "");
	  }
	  decoder->cleanup(chartDecoding);
	  //ref_length = decoder->getClosestReferenceLength(*sid, bestModel.size());
	  //float model_length_ratio = (float)bestModel.size()/ref_length;
	  cerr << endl;

	  // FEAR
	  cerr << "Rank " << rank << ", epoch " << epoch << ", " << n << "best fear translations" << endl;
	  decoder->getNBest(input, *sid, n, -1.0, bleuWeight_fear,
	    featureValues[batchPosition], bleuScores[batchPosition], modelScores[batchPosition],
	    0, realBleu, distinctNbest, avgRefLength, rank, epoch, "");
	  decoder->cleanup(chartDecoding);
	  //ref_length = decoder->getClosestReferenceLength(*sid, fear.size());
	  //float fear_length_ratio = (float)fear.size()/ref_length;

	  examples_in_batch++;
	}
	if (kbest) {
	  // MODEL
	  cerr << "Rank " << rank << ", epoch " << epoch << ", " << n << "best wrt model score" << endl;
	  if (historyBleu || simpleHistoryBleu) {
	    vector< vector<const Word*> > outputModel = decoder->getNBest(input, *sid, n, 0.0,
		  bleuWeight, featureValues[batchPosition], bleuScores[batchPosition],
		  modelScores[batchPosition], 1, realBleu, distinctNbest, avgRefLength,	rank, epoch, "");
	    vector<const Word*> bestModel = outputModel[0];
	    oneBests.push_back(bestModel);
	    inputLengths.push_back(current_input_length);
	    ref_ids.push_back(*sid);
	  }
	  else {
	    decoder->getNBest(input, *sid, n, 0.0, bleuWeight,
	      featureValues[batchPosition], bleuScores[batchPosition],
	      modelScores[batchPosition], 0, realBleu, distinctNbest, avgRefLength, rank, epoch, "");
	  }
	  decoder->cleanup(chartDecoding);
	  //ref_length = decoder->getClosestReferenceLength(*sid, bestModel.size());
	  //float model_length_ratio = (float)bestModel.size()/ref_length;
	  cerr << endl;

	  examples_in_batch++;

	  HypothesisQueue queueHope(hope_n);
	  HypothesisQueue queueFear(fear_n);
	  cerr << endl;
	  if (most_violated || all_violated || one_against_all) {
	    float bleuHope = -1000;
	    float bleuFear = 1000;
	    size_t indexHope = -1;
	    size_t indexFear = -1;

	    vector<float> bleuHopeList;
	    vector<float> bleuFearList;
	    vector<float> indexHopeList;
	    vector<float> indexFearList;

	    if (most_violated)
	      cerr << "Rank " << rank << ", epoch " << epoch << ", pick pair with most violated constraint" << endl;
	    else if (all_violated)
	      cerr << "Rank " << rank << ", epoch " << epoch << ", pick all pairs with violated constraints";
	    else
	      cerr << "Rank " << rank << ", epoch " << epoch << ", pick all pairs with hope";

	    // find best hope, then find fear that violates our constraint most
	    for (size_t i=0; i<bleuScores[batchPosition].size(); ++i) {
	      if (abs(bleuScores[batchPosition][i] - bleuHope) < epsilon) { // equal bleu scores
		if (modelScores[batchPosition][i] > modelScores[batchPosition][indexHope]) {
		  if (abs(modelScores[batchPosition][i] - modelScores[batchPosition][indexHope]) > epsilon) {
		    // better model score
		    bleuHope = bleuScores[batchPosition][i];
		    indexHope = i;
		  }
		}
	      }
	      else if (bleuScores[batchPosition][i] > bleuHope) { // better than current best
		bleuHope = bleuScores[batchPosition][i];
		indexHope = i;
	      }
	    }

	    float currentViolation = 0;
	    float minimum_bleu_diff = 0.01;
	    for (size_t i=0; i<bleuScores[batchPosition].size(); ++i) {
	      float bleuDiff = bleuHope - bleuScores[batchPosition][i];
	      float modelDiff = modelScores[batchPosition][indexHope] - modelScores[batchPosition][i];
	      if (bleuDiff > epsilon) {
		if (one_against_all && bleuDiff > minimum_bleu_diff) {
		  cerr << ".. adding pair";
		  bleuHopeList.push_back(bleuHope);
		  bleuFearList.push_back(bleuScores[batchPosition][i]);
		  indexHopeList.push_back(indexHope);
		  indexFearList.push_back(i);
		}
		else if (modelDiff < bleuDiff) {
		  float diff = bleuDiff - modelDiff;
		  if (diff > epsilon) {
		    if (all_violated) {
		      cerr << ".. adding pair";
		      bleuHopeList.push_back(bleuHope);
		      bleuFearList.push_back(bleuScores[batchPosition][i]);
		      indexHopeList.push_back(indexHope);
		      indexFearList.push_back(i);
		    }
		    else if (most_violated && diff > currentViolation) {
		      currentViolation = diff;
		      bleuFear = bleuScores[batchPosition][i];
		      indexFear = i;
		      cerr << "Rank " << rank << ", epoch " << epoch << ", current violation: " << currentViolation << " (" << modelDiff << " >= " << bleuDiff << ")" << endl;
		    }
		  }
		}
	      }
	    }

	    if (most_violated) {
	      if (currentViolation > 0) {
		cerr << "Rank " << rank << ", epoch " << epoch << ", adding pair with violation " << currentViolation << endl;
		cerr << "Rank " << rank << ", epoch " << epoch << ", hope: " << bleuHope << " (" << indexHope  << "), fear: " << bleuFear << " (" << indexFear << ")" << endl;
		bleuScoresHope[batchPosition].push_back(bleuHope);
		bleuScoresFear[batchPosition].push_back(bleuFear);
		featureValuesHope[batchPosition].push_back(featureValues[batchPosition][indexHope]);
		featureValuesFear[batchPosition].push_back(featureValues[batchPosition][indexFear]);
		float modelScoreHope = modelScores[batchPosition][indexHope];
		float modelScoreFear = modelScores[batchPosition][indexFear];
		if (most_violated_reg) {
		  // reduce model score difference by factor ~0.5
		  float reg = currentViolation/4;
		  modelScoreHope += abs(reg);
		  modelScoreFear -= abs(reg);
		  float newViolation = (bleuHope - bleuFear) - (modelScoreHope - modelScoreFear);
		  cerr << "Rank " << rank << ", epoch " << epoch << ", regularized violation: " << newViolation << endl;
		}
		modelScoresHope[batchPosition].push_back(modelScoreHope);
		modelScoresFear[batchPosition].push_back(modelScoreFear);

		featureValues[batchPosition][indexHope].IncrementSparseHopeFeatures();
		featureValues[batchPosition][indexFear].IncrementSparseFearFeatures();
	      }
	      else {
		cerr << "Rank " << rank << ", epoch " << epoch << ", no violated constraint found." << endl;
		skip_example = 1;
	      }
	    }
	    else cerr << endl;
	  }
	  if (max_bleu_diff) {
	    cerr << "Rank " << rank << ", epoch " << epoch << ", pick pair with max Bleu diff from list: " << bleuScores[batchPosition].size() << endl;
	    for (size_t i=0; i<bleuScores[batchPosition].size(); ++i) {
	      float hopeScore = bleuScores[batchPosition][i];
	      if (modelPlusBleu) hopeScore += modelScores[batchPosition][i];
	      BleuIndexPair hope(hopeScore, i);
	      queueHope.Push(hope);

	      float fearScore = -1*(bleuScores[batchPosition][i]);
	      if (modelPlusBleu) fearScore += modelScores[batchPosition][i];
	      BleuIndexPair fear(fearScore, i);
	      queueFear.Push(fear);
	    }
	    skip_example = 0;
	  }
	  cerr << endl;

	  vector<BleuIndexPair> hopeList, fearList;
	  for (size_t i=0; i<hope_n && !queueHope.Empty(); ++i) hopeList.push_back(queueHope.Pop());
	  for (size_t i=0; i<fear_n && !queueFear.Empty(); ++i) fearList.push_back(queueFear.Pop());
	  for (size_t i=0; i<hopeList.size(); ++i) {
	    //float bleuHope = hopeList[i].first;
	    size_t indexHope = hopeList[i].second;
	    float bleuHope = bleuScores[batchPosition][indexHope];
	    for (size_t j=0; j<fearList.size(); ++j) {
	      //float bleuFear = -1*(fearList[j].first);
	      size_t indexFear = fearList[j].second;
	      float bleuFear = bleuScores[batchPosition][indexFear];
	      cerr << "Rank " << rank << ", epoch " << epoch << ", hope: " << bleuHope << " (" << indexHope  << "), fear: " << bleuFear << " (" << indexFear << ")" << endl;
	      bleuScoresHope[batchPosition].push_back(bleuHope);
	      bleuScoresFear[batchPosition].push_back(bleuFear);
	      featureValuesHope[batchPosition].push_back(featureValues[batchPosition][indexHope]);
	      featureValuesFear[batchPosition].push_back(featureValues[batchPosition][indexFear]);
	      float modelScoreHope = modelScores[batchPosition][indexHope];
	      float modelScoreFear = modelScores[batchPosition][indexFear];

	      modelScoresHope[batchPosition].push_back(modelScoreHope);
	      modelScoresFear[batchPosition].push_back(modelScoreFear);

	      featureValues[batchPosition][indexHope].IncrementSparseHopeFeatures();
	      featureValues[batchPosition][indexFear].IncrementSparseFearFeatures();
	    }
	  }
	  if (!makePairs)
	    cerr << "Rank " << rank << ", epoch " << epoch << "summing up hope and fear vectors, no pairs" << endl;
	}

	// next input sentence
	++sid;
	++actualBatchSize;
	++shardPosition;
      } // end of batch loop

      if (examples_in_batch == 0 || (kbest && skip_example)) {
	cerr << "Rank " << rank << ", epoch " << epoch << ", batch is empty." << endl;
      }
      else {
	vector<vector<float> > losses(actualBatchSize);
	if (model_hope_fear) {
	  // Set loss for each sentence as BLEU(oracle) - BLEU(hypothesis)
	  for (size_t batchPosition = 0; batchPosition < actualBatchSize; ++batchPosition) {
	    for (size_t j = 0; j < bleuScores[batchPosition].size(); ++j) {
	      losses[batchPosition].push_back(oracleBleuScores[batchPosition] - bleuScores[batchPosition][j]);
	    }
	  }
	}

	// set weight for bleu feature to 0 before optimizing
	vector<const ScoreProducer*>::const_iterator iter;
	const vector<const ScoreProducer*> featureFunctions2 = staticData.GetTranslationSystem(TranslationSystem::DEFAULT).GetFeatureFunctions();
	for (iter = featureFunctions2.begin(); iter != featureFunctions2.end(); ++iter) {
	  if ((*iter)->GetScoreProducerWeightShortName() == "bl") {
	    mosesWeights.Assign(*iter, 0);
	    break;
	  }
	}

	// scale LM feature (to avoid rapid changes)
	if (scale_lm) {
	  cerr << "scale lm" << endl;
	  const LMList& lmList_new = staticData.GetLMList();
	  for (LMList::const_iterator iter = lmList_new.begin(); iter != lmList_new.end(); ++iter) {
	    // scale down score
	    if (model_hope_fear) {
	      scaleFeatureScore(*iter, scale_lm_factor, featureValues, rank, epoch);
	    }
	    else {
	      scaleFeatureScore(*iter, scale_lm_factor, featureValuesHope, rank, epoch);
	      scaleFeatureScore(*iter, scale_lm_factor, featureValuesFear, rank, epoch);
	    }
	  }
	}

	// scale WP
	if (scale_wp) {
	  // scale up weight
	  WordPenaltyProducer *wp = staticData.GetFirstWordPenaltyProducer();

	  // scale down score
	  if (model_hope_fear) {
	    scaleFeatureScore(wp, scale_wp_factor, featureValues, rank, epoch);
	  }
	  else {
	    scaleFeatureScore(wp, scale_wp_factor, featureValuesHope, rank, epoch);
	    scaleFeatureScore(wp, scale_wp_factor, featureValuesFear, rank, epoch);
	  }
	}

	// print out the feature values
	if (print_feature_values) {
	  cerr << "\nRank " << rank << ", epoch " << epoch << ", feature values: " << endl;
	  if (model_hope_fear) printFeatureValues(featureValues);
	  else {
	    cerr << "hope: " << endl;
	    printFeatureValues(featureValuesHope);
	    cerr << "fear: " << endl;
	    printFeatureValues(featureValuesFear);
	  }
	}

	// apply learning rates to feature vectors before optimization
	if (feature_confidence) {
	  cerr << "Rank " << rank << ", epoch " << epoch << ", apply feature learning rates with decays " << decay_core << "/" << decay_sparse << ": " << featureLearningRates << endl;
	  if (model_hope_fear) {
	    applyPerFeatureLearningRates(featureValues, featureLearningRates, sparse_r0);
	  }
	  else {
	    applyPerFeatureLearningRates(featureValuesHope, featureLearningRates, sparse_r0);
	    applyPerFeatureLearningRates(featureValuesFear, featureLearningRates, sparse_r0);
	  }
	}
	else {
	  // apply fixed learning rates
	  cerr << "Rank " << rank << ", epoch " << epoch << ", apply fixed learning rates, core: " << core_r0 << ", sparse: " << sparse_r0 << endl;
	  if (core_r0 != 1.0 || sparse_r0 != 1.0) {
	    if (model_hope_fear) {
	      applyLearningRates(featureValues, core_r0, sparse_r0);
	    }
	    else {
	      applyLearningRates(featureValuesHope, core_r0, sparse_r0);
	      applyLearningRates(featureValuesFear, core_r0, sparse_r0);
	    }
	  }
	}

	if (kbest) {
	  // If we are tuning a global weight for a sparse producer,
	  // we must collapse the sparse features first (report weighted aggregate)
	  if (tuneMetaFeature) {
	    for (unsigned i = 0; i < sparseProducers.size(); ++i) {
	      float spWeight = sparseProducers[i]->GetSparseProducerWeight();
	      if (spWeight != 1.0) {
		MetaFeatureProducer *m = staticData.GetMetaFeatureProducer();
		for (size_t i=0; i < featureValuesHope.size(); ++i) {
		  for (size_t j=0; j < featureValuesHope[i].size(); ++j) {
		    // multiply sparse feature values with weights
		    const FVector scores =
		      featureValuesHope[i][j].GetVectorForProducer(sparseProducers[i]);
		    const FVector &weights = staticData.GetAllWeights().GetScoresVector();
		    float aggregate = scores.inner_product(weights);
		    //cerr << "Rank " << rank << ", epoch " << epoch << ", sparse Producer " <<
		    //sparseProducers[i]->GetScoreProducerWeightShortName()
		    //<< " aggregate: " << aggregate << endl;
		    aggregate *= spWeight;
		    //cerr << "Rank " << rank << ", epoch " << epoch << ", sparse Producer " <<
		    //sparseProducers[i]->GetScoreProducerWeightShortName()
		    //<< " weighted aggregate: " << aggregate << endl;

		    // copy core features to a new collection, then assign aggregated sparse feature
		    ScoreComponentCollection scoresAggregate;
		    scoresAggregate.CoreAssign(featureValuesHope[i][j]);
		    scoresAggregate.Assign(m, aggregate);
		    featureValuesHope[i][j] = scoresAggregate;
		  }
		}
		for (size_t i=0; i < featureValuesFear.size(); ++i) {
		  for (size_t j=0; j < featureValuesFear[i].size(); ++j) {
		    // multiply sparse feature values with weights
		    const FVector scores =
		      featureValuesFear[i][j].GetVectorForProducer(sparseProducers[i]);
		    const FVector &weights = staticData.GetAllWeights().GetScoresVector();
		    float aggregate = scores.inner_product(weights);
		    aggregate *= spWeight;

		    // copy core features to a new collection, then assign aggregated sparse feature
		    ScoreComponentCollection scoresAggregate;
		    scoresAggregate.CoreAssign(featureValuesFear[i][j]);
		    scoresAggregate.Assign(m, aggregate);
		    featureValuesFear[i][j] = scoresAggregate;
		  }
		}

		cerr << "Rank " << rank << ", epoch " << epoch << ", new hope feature vector: " <<
		  featureValuesHope[0][0] << endl;
		cerr << "Rank " << rank << ", epoch " << epoch << ", new fear feature vector: " <<
		  featureValuesFear[0][0] << endl;
	      }
	    }
	  }
	}

	// Run optimiser on batch:
	VERBOSE(1, "\nRank " << rank << ", epoch " << epoch << ", run optimiser:" << endl);
	size_t update_status = 1;
	ScoreComponentCollection weightUpdate;
	if (perceptron_update) {
	  vector<vector<float> > dummy1;
	  update_status = optimiser->updateWeightsHopeFear( weightUpdate, featureValuesHope,
	    featureValuesFear, dummy1, dummy1, dummy1, dummy1, learning_rate, rank, epoch);
	}
	else if (hope_fear) {
	  if (bleuScoresHope[0][0] >= min_oracle_bleu) {
	    if (hope_n == 1 && fear_n ==1 && batchSize == 1 && !hildreth) {
	      update_status = ((MiraOptimiser*) optimiser)->updateWeightsAnalytically(weightUpdate,
	      featureValuesHope[0][0], featureValuesFear[0][0], bleuScoresHope[0][0],
		 bleuScoresFear[0][0], modelScoresHope[0][0], modelScoresFear[0][0], learning_rate, rank, epoch);
	    }
	    else
	      update_status = optimiser->updateWeightsHopeFear(weightUpdate, featureValuesHope,
	       featureValuesFear, bleuScoresHope, bleuScoresFear, modelScoresHope,
	       modelScoresFear, learning_rate, rank, epoch);
	  }
	  else
	    update_status = 1;
	}
	else if (kbest) {
	  if (selective)
	    update_status = ((MiraOptimiser*)optimiser)->updateWeightsHopeFearSelective(
		weightUpdate, featureValuesHope, featureValuesFear, bleuScoresHope, bleuScoresFear,
		modelScoresHope, modelScoresFear, learning_rate, rank, epoch);
	  else if (summed)
	    update_status = ((MiraOptimiser*)optimiser)->updateWeightsHopeFearSummed(
	     weightUpdate, featureValuesHope, featureValuesFear, bleuScoresHope, bleuScoresFear,
	     modelScoresHope, modelScoresFear, learning_rate, rank, epoch, rescaleSlack, makePairs);
	  else {
	    if (batchSize == 1 && featureValuesHope[0].size() == 1 && !hildreth) {
	      cerr << "Rank " << rank << ", epoch " << epoch << ", model score hope: " << modelScoresHope[0][0] << endl;
	      cerr << "Rank " << rank << ", epoch " << epoch << ", model score fear: " << modelScoresFear[0][0] << endl;
	      update_status = ((MiraOptimiser*) optimiser)->updateWeightsAnalytically(
		      weightUpdate, featureValuesHope[0][0], featureValuesFear[0][0],
		      bleuScoresHope[0][0], bleuScoresFear[0][0], modelScoresHope[0][0],
		      modelScoresFear[0][0], learning_rate, rank, epoch);
	    }
	    else {
	      cerr << "Rank " << rank << ", epoch " << epoch << ", model score hope: " << modelScoresHope[0][0] << endl;
	      cerr << "Rank " << rank << ", epoch " << epoch << ", model score fear: " << modelScoresFear[0][0] << endl;
	      update_status = optimiser->updateWeightsHopeFear(weightUpdate, featureValuesHope,
		  featureValuesFear, bleuScoresHope, bleuScoresFear, modelScoresHope,
		       modelScoresFear, learning_rate, rank, epoch);
	    }
	  }
	}
	else {
	  // model_hope_fear
	  update_status = ((MiraOptimiser*) optimiser)->updateWeights(weightUpdate,
	      featureValues, losses, bleuScores, modelScores, oracleFeatureValues,
	      oracleBleuScores, oracleModelScores, learning_rate, rank, epoch);
	}

	// sumStillViolatedConstraints += update_status;

	if (update_status == 0) {	 // if weights were updated
	  // apply weight update
	  if (debug)
	    cerr << "Rank " << rank << ", epoch " << epoch << ", update: " << weightUpdate << endl;

	  if (tuneMetaFeature) {
	    MetaFeatureProducer *m = staticData.GetMetaFeatureProducer();
	    // update sparse producer weight
	    // (NOTE: this currently doesn't work for more than one sparse producer)
	    float metaWeightUpdate = weightUpdate.GetScoreForProducer(m);

	    const vector<const FeatureFunction*> sparseProducers =
	      staticData.GetTranslationSystem(TranslationSystem::DEFAULT).GetSparseProducers();
	    FeatureFunction* ff = const_cast<FeatureFunction*>(sparseProducers[0]);
	    if (sparseProducers[0]->GetScoreProducerWeightShortName().compare("wt") == 0) {
	      WordTranslationFeature* wt =
		static_cast<WordTranslationFeature*>(ff);
	      float newWeight = wt->GetSparseProducerWeight();
	      cerr << "Rank " << rank << ", epoch " << epoch << ", old meta weight: " << newWeight << endl;
	      newWeight += metaWeightUpdate;
	      wt->SetSparseProducerWeight(newWeight);
	      cerr << "Rank " << rank << ", epoch " << epoch << ", new meta weight: " << newWeight << endl;
	    }
	    else if (sparseProducers[0]->GetScoreProducerWeightShortName().compare("pp") == 0) {
	      PhrasePairFeature* pp =
		static_cast<PhrasePairFeature*>(ff);
	      float newWeight = pp->GetSparseProducerWeight();
	      cerr << "Rank " << rank << ", epoch " << epoch << ", old meta weight: " << newWeight << endl;
	      newWeight += metaWeightUpdate;
	      pp->SetSparseProducerWeight(newWeight);
	      cerr << "Rank " << rank << ", epoch " << epoch << ", new meta weight: " << newWeight << endl;
	    }
	  }

	  if (feature_confidence) {
	    // update confidence counts based on weight update
	    confidenceCounts.UpdateConfidenceCounts(weightUpdate, signed_counts);

	    // update feature learning rates
	    featureLearningRates.UpdateLearningRates(decay_core, decay_sparse, confidenceCounts, core_r0, sparse_r0);
	  }

	  // apply weight update to Moses weights
	  mosesWeights.PlusEquals(weightUpdate);

	  if (normaliseWeights && !tuneMetaFeature)
	    mosesWeights.L1Normalise();

	  cumulativeWeights.PlusEquals(mosesWeights);
	  if (sparseAverage) {
	    ScoreComponentCollection binary;
	    binary.SetToBinaryOf(mosesWeights);
	    cumulativeWeightsBinary.PlusEquals(binary);
	  }

	  ++numberOfUpdates;
	  ++numberOfUpdatesThisEpoch;
	  if (averageWeights && !tuneMetaFeature) {
	    ScoreComponentCollection averageWeights(cumulativeWeights);
	    if (accumulateWeights) {
	      averageWeights.DivideEquals(numberOfUpdates);
	    } else {
	      averageWeights.DivideEquals(numberOfUpdatesThisEpoch);
	    }

	    mosesWeights = averageWeights;
	  }

	  // set new Moses weights
	  decoder->setWeights(mosesWeights);
	  //cerr << "Rank " << rank << ", epoch " << epoch << ", new weights: " << mosesWeights << endl;
	}

	// update history (for approximate document Bleu)
	if (historyBleu || simpleHistoryBleu) {
	  for (size_t i = 0; i < oneBests.size(); ++i)
	    cerr << "Rank " << rank << ", epoch " << epoch << ", update history with 1best length: " << oneBests[i].size() << " ";
	  decoder->updateHistory(oneBests, inputLengths, ref_ids, rank, epoch);
	  deleteTranslations(oneBests);
	}
      } // END TRANSLATE AND UPDATE BATCH

      // size of all shards except for the last one
      size_t generalShardSize;
      if (trainWithMultipleFolds)
	generalShardSize = order.size()/coresPerFold;
      else
	generalShardSize = order.size()/size;

      size_t mixing_base = mixingFrequency == 0 ? 0 : generalShardSize / mixingFrequency;
      size_t dumping_base = weightDumpFrequency == 0 ? 0 : generalShardSize / weightDumpFrequency;
      bool mix = evaluateModulo(shardPosition, mixing_base, actualBatchSize);

      // mix weights?
      if (mix) {
#ifdef MPI_ENABLE
	cerr << "Rank " << rank << ", epoch " << epoch << ", mixing weights.. " << endl;
	// collect all weights in mixedWeights and divide by number of processes
	mpi::reduce(world, mosesWeights, mixedWeights, SCCPlus(), 0);

	// mix confidence counts
	//mpi::reduce(world, confidenceCounts, mixedConfidenceCounts, SCCPlus(), 0);
	ScoreComponentCollection totalBinary;
	if (sparseAverage) {
	  ScoreComponentCollection binary;
	  binary.SetToBinaryOf(mosesWeights);
	  mpi::reduce(world, binary, totalBinary, SCCPlus(), 0);
	}
	if (rank == 0) {
	  // divide by number of processes
	  if (sparseNoAverage)
	    mixedWeights.CoreDivideEquals(size); // average only core weights
	  else if (sparseAverage)
	    mixedWeights.DivideEquals(totalBinary);
	  else
	    mixedWeights.DivideEquals(size);

	  // divide confidence counts
	  //mixedConfidenceCounts.DivideEquals(size);

	  // normalise weights after averaging
	  if (normaliseWeights) {
	    mixedWeights.L1Normalise();
	  }

	  ++weightMixingThisEpoch;

	  if (pruneZeroWeights) {
	    size_t pruned = mixedWeights.PruneZeroWeightFeatures();
	    cerr << "Rank " << rank << ", epoch " << epoch << ", "
		 << pruned << " zero-weighted features pruned from mixedWeights." << endl;

	    pruned = cumulativeWeights.PruneZeroWeightFeatures();
	    cerr << "Rank " << rank << ", epoch " << epoch << ", "
		 << pruned << " zero-weighted features pruned from cumulativeWeights." << endl;
	  }

	  if (featureCutoff != -1 && weightMixingThisEpoch == mixingFrequency) {
	    size_t pruned = mixedWeights.PruneSparseFeatures(featureCutoff);
	    cerr << "Rank " << rank << ", epoch " << epoch << ", "
		 << pruned << " features pruned from mixedWeights." << endl;

	    pruned = cumulativeWeights.PruneSparseFeatures(featureCutoff);
	    cerr << "Rank " << rank << ", epoch " << epoch << ", "
		 << pruned << " features pruned from cumulativeWeights." << endl;
	  }

	  if (weightMixingThisEpoch == mixingFrequency || reg_on_every_mix) {
	    if (l1_regularize) {
	      size_t pruned;
	      if (l1_reg_sparse)
		pruned = mixedWeights.SparseL1Regularize(l1_lambda);
	      else
		pruned = mixedWeights.L1Regularize(l1_lambda);
	      cerr << "Rank " << rank << ", epoch " << epoch << ", "
		   << "l1-reg. on mixedWeights with lambda=" << l1_lambda << ", pruned: " << pruned << endl;
	    }
	    if (l2_regularize) {
	      if (l2_reg_sparse)
		mixedWeights.SparseL2Regularize(l2_lambda);
	      else
		mixedWeights.L2Regularize(l2_lambda);
	      cerr << "Rank " << rank << ", epoch " << epoch << ", "
		   << "l2-reg. on mixedWeights with lambda=" << l2_lambda << endl;
	    }
	  }
	}

	// broadcast average weights from process 0
	mpi::broadcast(world, mixedWeights, 0);
	decoder->setWeights(mixedWeights);
	mosesWeights = mixedWeights;

	// broadcast summed confidence counts
	//mpi::broadcast(world, mixedConfidenceCounts, 0);
	//confidenceCounts = mixedConfidenceCounts;
#endif
#ifndef MPI_ENABLE
	//cerr << "\nRank " << rank << ", no mixing, weights: " << mosesWeights << endl;
	mixedWeights = mosesWeights;
#endif
      } // end mixing

      // Dump weights?
      if (trainWithMultipleFolds || weightEpochDump == weightDumpFrequency) {
	// dump mixed weights at end of every epoch to enable continuing a crashed experiment
	// (for jackknife every time the weights are mixed)
	ostringstream filename;
	if (epoch < 10)
	  filename << weightDumpStem << "_mixed_0" << epoch;
	else
	  filename << weightDumpStem << "_mixed_" << epoch;

	if (weightDumpFrequency > 1)
	  filename << "_" << weightEpochDump;

	mixedWeights.Save(filename.str());
	cerr << "Dumping mixed weights during epoch " << epoch << " to " << filename.str() << endl << endl;
      }
      if (dumpMixedWeights) {
	if (mix && rank == 0 && !weightDumpStem.empty()) {
	  // dump mixed weights instead of average weights
	  ostringstream filename;
	  if (epoch < 10)
	    filename << weightDumpStem << "_0" << epoch;
	  else
	    filename << weightDumpStem << "_" << epoch;

	  if (weightDumpFrequency > 1)
	    filename << "_" << weightEpochDump;

	  cerr << "Dumping mixed weights during epoch " << epoch << " to " << filename.str() << endl << endl;
	  mixedWeights.Save(filename.str());
	  ++weightEpochDump;
	}
      }
      else {
	if (evaluateModulo(shardPosition, dumping_base, actualBatchSize)) {
	  cerr << "Rank " << rank << ", epoch " << epoch << ", dump weights.. (pos: " << shardPosition << ", base: " << dumping_base << ")" << endl;
	  ScoreComponentCollection tmpAverageWeights(cumulativeWeights);
	  bool proceed = false;
	  if (accumulateWeights) {
	    if (numberOfUpdates > 0) {
	      tmpAverageWeights.DivideEquals(numberOfUpdates);
	      proceed = true;
	    }
	  } else {
	    if (numberOfUpdatesThisEpoch > 0) {
	      if (sparseNoAverage) // average only core weights
		tmpAverageWeights.CoreDivideEquals(numberOfUpdatesThisEpoch);
	      else if (sparseAverage)
		tmpAverageWeights.DivideEquals(cumulativeWeightsBinary);
	      else
		tmpAverageWeights.DivideEquals(numberOfUpdatesThisEpoch);
	      proceed = true;
	    }
	  }

	  if (proceed) {
#ifdef MPI_ENABLE
	    // average across processes
	    mpi::reduce(world, tmpAverageWeights, mixedAverageWeights, SCCPlus(), 0);
	    ScoreComponentCollection totalBinary;
	    if (sparseAverage) {
	      ScoreComponentCollection binary;
	      binary.SetToBinaryOf(mosesWeights);
	      mpi::reduce(world, binary, totalBinary, SCCPlus(), 0);
	    }
#endif
#ifndef MPI_ENABLE
	    mixedAverageWeights = tmpAverageWeights;
      //FIXME: What do to for non-mpi version
	    ScoreComponentCollection totalBinary;
#endif
	    if (rank == 0 && !weightDumpStem.empty()) {
	      // divide by number of processes
	      if (sparseNoAverage)
		mixedAverageWeights.CoreDivideEquals(size); // average only core weights
	      else if (sparseAverage)
		mixedAverageWeights.DivideEquals(totalBinary);
	      else
		mixedAverageWeights.DivideEquals(size);

	      // normalise weights after averaging
	      if (normaliseWeights) {
		mixedAverageWeights.L1Normalise();
	      }

	      // dump final average weights
	      ostringstream filename;
	      if (epoch < 10) {
		filename << weightDumpStem << "_0" << epoch;
	      } else {
		filename << weightDumpStem << "_" << epoch;
	      }

	      if (weightDumpFrequency > 1) {
		filename << "_" << weightEpochDump;
	      }

	      /*if (accumulateWeights) {
		cerr << "\nMixed average weights (cumulative) during epoch "	<< epoch << ": " << mixedAverageWeights << endl;
		} else {
		cerr << "\nMixed average weights during epoch " << epoch << ": " << mixedAverageWeights << endl;
		}*/

	      cerr << "Dumping mixed average weights during epoch " << epoch << " to " << filename.str() << endl << endl;
	      mixedAverageWeights.Save(filename.str());
	      ++weightEpochDump;

	      if (weightEpochDump == weightDumpFrequency) {
		if (l1_regularize) {
		  size_t pruned = mixedAverageWeights.SparseL1Regularize(l1_lambda);
		  cerr << "Rank " << rank << ", epoch " << epoch << ", "
		       << "l1-reg. on mixedAverageWeights with lambda=" << l1_lambda << ", pruned: " << pruned << endl;

		}
		if (l2_regularize) {
		  mixedAverageWeights.SparseL2Regularize(l2_lambda);
		  cerr << "Rank " << rank << ", epoch " << epoch << ", "
		       << "l2-reg. on mixedAverageWeights with lambda=" << l2_lambda << endl;
		}

		if (l1_regularize || l2_regularize) {
		  filename << "_reg";
		  cerr << "Dumping regularized mixed average weights during epoch " << epoch << " to " << filename.str() << endl << endl;
		  mixedAverageWeights.Save(filename.str());
		}
	      }

	      if (weightEpochDump == weightDumpFrequency && printFeatureCounts) {
		// print out all features with counts
		stringstream s1, s2;
		s1 << "sparse_feature_hope_counts" << "_" << epoch;
		s2 << "sparse_feature_fear_counts" << "_" << epoch;
		ofstream sparseFeatureCountsHope(s1.str().c_str());
		ofstream sparseFeatureCountsFear(s2.str().c_str());

		mixedAverageWeights.PrintSparseHopeFeatureCounts(sparseFeatureCountsHope);
		mixedAverageWeights.PrintSparseFearFeatureCounts(sparseFeatureCountsFear);
		sparseFeatureCountsHope.close();
		sparseFeatureCountsFear.close();
	      }
	    }
	  }
	}// end dumping
      } // end if dump
    } // end of shard loop, end of this epoch
    cerr << "Rank " << rank << ", epoch " << epoch << ", end of epoch.." << endl;

    if (historyBleu || simpleHistoryBleu) {
      cerr << "Bleu feature history after epoch " <<  epoch << endl;
      decoder->printBleuFeatureHistory(cerr);
    }
    //		cerr << "Rank " << rank << ", epoch " << epoch << ", sum of violated constraints: " << sumStillViolatedConstraints << endl;

    // Check whether there were any weight updates during this epoch
    size_t sumUpdates;
    size_t *sendbuf_uint, *recvbuf_uint;
    sendbuf_uint = (size_t *) malloc(sizeof(size_t));
    recvbuf_uint = (size_t *) malloc(sizeof(size_t));
#ifdef MPI_ENABLE
    sendbuf_uint[0] = numberOfUpdatesThisEpoch;
    recvbuf_uint[0] = 0;
    MPI_Reduce(sendbuf_uint, recvbuf_uint, 1, MPI_UNSIGNED, MPI_SUM, 0, world);
    sumUpdates = recvbuf_uint[0];
#endif
#ifndef MPI_ENABLE
    sumUpdates = numberOfUpdatesThisEpoch;
#endif
    if (rank == 0 && sumUpdates == 0) {
      cerr << "\nNo weight updates during this epoch.. stopping." << endl;
      stop = true;
#ifdef MPI_ENABLE
      mpi::broadcast(world, stop, 0);
#endif
    }

    if (!stop) {
      // Test if weights have converged
      if (weightConvergence) {
	bool reached = true;
	if (rank == 0 && (epoch >= 2)) {
	  ScoreComponentCollection firstDiff, secondDiff;
	  if (dumpMixedWeights) {
	    firstDiff = mixedWeights;
	    firstDiff.MinusEquals(mixedWeightsPrevious);
	    secondDiff = mixedWeights;
	    secondDiff.MinusEquals(mixedWeightsBeforePrevious);
	  }
	  else {
	    firstDiff = mixedAverageWeights;
	    firstDiff.MinusEquals(mixedAverageWeightsPrevious);
	    secondDiff = mixedAverageWeights;
	    secondDiff.MinusEquals(mixedAverageWeightsBeforePrevious);
	  }
	  VERBOSE(1, "Average weight changes since previous epoch: " << firstDiff << " (max: " << firstDiff.GetLInfNorm() << ")" << endl);
	  VERBOSE(1, "Average weight changes since before previous epoch: " << secondDiff << " (max: " << secondDiff.GetLInfNorm() << ")" << endl << endl);

	  // check whether stopping criterion has been reached
	  // (both difference vectors must have all weight changes smaller than min_weight_change)
	  if (firstDiff.GetLInfNorm() >= min_weight_change)
	    reached = false;
	  if (secondDiff.GetLInfNorm() >= min_weight_change)
	    reached = false;
	  if (reached) {
	    // stop MIRA
	    stop = true;
	    cerr << "\nWeights have converged after epoch " << epoch << ".. stopping MIRA." << endl;
	    ScoreComponentCollection dummy;
	    ostringstream endfilename;
	    endfilename << "stopping";
	    dummy.Save(endfilename.str());
	  }
	}

	mixedWeightsBeforePrevious = mixedWeightsPrevious;
	mixedWeightsPrevious = mixedWeights;
	mixedAverageWeightsBeforePrevious = mixedAverageWeightsPrevious;
	mixedAverageWeightsPrevious = mixedAverageWeights;
#ifdef MPI_ENABLE
	mpi::broadcast(world, stop, 0);
#endif
      } //end if (weightConvergence)
    }
  } // end of epoch loop

#ifdef MPI_ENABLE
  MPI_Finalize();
#endif

  time(&now);
  cerr << "Rank " << rank << ", " << ctime(&now);

  if (rank == 0) {
    ScoreComponentCollection dummy;
    ostringstream endfilename;
    endfilename << "finished";
    dummy.Save(endfilename.str());
  }

  delete decoder;
  exit(0);
}

// Read the file `filename` line by line and append each line to `sentences`
// (existing entries are kept). Returns false if the file cannot be opened,
// true otherwise.
bool loadSentences(const string& filename, vector<string>& sentences) {
  ifstream input(filename.c_str());
  if (input.fail())
    return false;

  // getline strips the line terminator; empty lines are stored as empty strings
  for (string current; getline(input, current); )
    sentences.push_back(current);

  return true;
}

// Decide whether a mixing/dumping point was reached by the batch that ends at
// `shard_position`.
//
// Returns true if any of the last `actual_batch_size` positions
// (shard_position, shard_position-1, ...) is a multiple of `mix_or_dump_base`.
// A batch size of 0 or 1 checks `shard_position` only. A base of 0 means
// "never" and always yields false.
bool evaluateModulo(size_t shard_position, size_t mix_or_dump_base, size_t actual_batch_size) {
  if (mix_or_dump_base == 0)
    return false;

  const size_t window = actual_batch_size > 1 ? actual_batch_size : 1;
  for (size_t back = 0; back < window; ++back) {
    // No unsigned wrap-around is possible: position 0 is a multiple of every
    // base, so the loop returns before `back` can exceed `shard_position`.
    if ((shard_position - back) % mix_or_dump_base == 0)
      return true;
  }
  return false;
}

void printFeatureValues(vector<vector<ScoreComponentCollection> > &featureValues) {
  for (size_t i = 0; i < featureValues.size(); ++i) {
    for (size_t j = 0; j < featureValues[i].size(); ++j) {
      cerr << featureValues[i][j] << endl;
    }
  }
  cerr << endl;
}

void deleteTranslations(vector<vector<const Word*> > &translations) {
  for (size_t i = 0; i < translations.size(); ++i) {
    for (size_t j = 0; j < translations[i].size(); ++j) {
      delete translations[i][j];
    }
  }
}

void decodeHopeOrFear(size_t rank, size_t size, size_t decode, string filename, vector<string> &inputSentences, MosesDecoder* decoder, size_t n, float bleuWeight) {
  if (decode == 1)
    cerr << "Rank " << rank << ", decoding dev input set according to hope objective.. " << endl;
  else if (decode == 2)
    cerr << "Rank " << rank << ", decoding dev input set according to fear objective.. " << endl;
  else
    cerr << "Rank " << rank << ", decoding dev input set according to normal objective.. " << endl;

  // Create shards according to the number of processes used
  vector<size_t> order;
  for (size_t i = 0; i < inputSentences.size(); ++i)
    order.push_back(i);

  vector<size_t> shard;
  float shardSize = (float) (order.size()) / size;
  size_t shardStart = (size_t) (shardSize * rank);
  size_t shardEnd = (size_t) (shardSize * (rank + 1));
  if (rank == size - 1) {
    shardEnd = inputSentences.size();
    shardSize = shardEnd - shardStart;
  }
  VERBOSE(1, "Rank " << rank << ", shard start: " << shardStart << " Shard end: " << shardEnd << endl);
  VERBOSE(1, "Rank " << rank << ", shard size: " << shardSize << endl);
  shard.resize(shardSize);
  copy(order.begin() + shardStart, order.begin() + shardEnd, shard.begin());

  // open files for writing
  stringstream fname;
  fname << filename << ".rank" << rank;
  filename = fname.str();
  ostringstream filename_nbest;
  filename_nbest << filename << "." << n << "best";
  ofstream out(filename.c_str());
  ofstream nbest_out((filename_nbest.str()).c_str());
  if (!out) {
    ostringstream msg;
    msg << "Unable to open " << fname.str();
    throw runtime_error(msg.str());
  }
  if (!nbest_out) {
    ostringstream msg;
    msg << "Unable to open " << filename_nbest;
    throw runtime_error(msg.str());
  }

  for (size_t i = 0; i < shard.size(); ++i) {
    size_t sid = shard[i];
    string& input = inputSentences[sid];

    vector<vector<ScoreComponentCollection> > dummyFeatureValues;
    vector<vector<float> > dummyBleuScores;
    vector<vector<float> > dummyModelScores;

    vector<ScoreComponentCollection> newFeatureValues;
    vector<float> newScores;
    dummyFeatureValues.push_back(newFeatureValues);
    dummyBleuScores.push_back(newScores);
    dummyModelScores.push_back(newScores);

    float factor = 0.0;
    if (decode == 1) factor = 1.0;
    if (decode == 2) factor = -1.0;
    cerr << "Rank " << rank << ", translating sentence " << sid << endl;
    bool realBleu = false;
    vector< vector<const Word*> > nbestOutput = decoder->getNBest(input, sid, n, factor, bleuWeight, dummyFeatureValues[0],
								  dummyBleuScores[0], dummyModelScores[0], n, realBleu, true, false, rank, 0, "");
    cerr << endl;
    decoder->cleanup(StaticData::Instance().GetSearchAlgorithm() == ChartDecoding);

    for (size_t i = 0; i < nbestOutput.size(); ++i) {
      vector<const Word*> output = nbestOutput[i];
      stringstream translation;
      for (size_t k = 0; k < output.size(); ++k) {
	Word* w = const_cast<Word*>(output[k]);
	translation << w->GetString(0);
	translation << " ";
      }

      if (i == 0)
	out << translation.str() << endl;
      nbest_out << sid << " ||| " << translation.str() << " ||| " << dummyFeatureValues[0][i] <<
	" ||| " << dummyModelScores[0][i] << " ||| sBleu=" << dummyBleuScores[0][i] << endl;
    }
  }

  out.close();
  nbest_out.close();
  cerr << "Closing files " << filename << " and " << filename_nbest.str() << endl;

#ifdef MPI_ENABLE
  MPI_Finalize();
#endif

  time_t now;
  time(&now);
  cerr << "Rank " << rank << ", " << ctime(&now);

  delete decoder;
  exit(0);
}

void applyLearningRates(vector<vector<ScoreComponentCollection> > &featureValues, float core_r0, float sparse_r0) {
  for (size_t i=0; i<featureValues.size(); ++i) // each item in batch
    for (size_t j=0; j<featureValues[i].size(); ++j) // each item in nbest
      featureValues[i][j].MultiplyEquals(core_r0, sparse_r0);
}

void applyPerFeatureLearningRates(vector<vector<ScoreComponentCollection> > &featureValues, ScoreComponentCollection featureLearningRates, float sparse_r0) {
  for (size_t i=0; i<featureValues.size(); ++i) // each item in batch
    for (size_t j=0; j<featureValues[i].size(); ++j) // each item in nbest
      featureValues[i][j].MultiplyEqualsBackoff(featureLearningRates, sparse_r0);
}

// Multiply the single score that producer `sp` contributes to each feature
// vector by `scaling_factor`, in place, for every n-best entry of every batch
// item. `rank` and `epoch` are only referenced by the disabled debug output.
void scaleFeatureScore(ScoreProducer *sp, float scaling_factor, vector<vector<ScoreComponentCollection> > &featureValues, size_t rank, size_t epoch) {
  // short name of the producer, kept for the disabled debug output below
  string name = sp->GetScoreProducerWeightShortName();

  for (size_t item = 0; item < featureValues.size(); ++item) {
    vector<ScoreComponentCollection> &nbest = featureValues[item];
    for (size_t hyp = 0; hyp < nbest.size(); ++hyp) {
      const float unscaled = nbest[hyp].GetScoreForProducer(sp);
      nbest[hyp].Assign(sp, unscaled * scaling_factor);
      //cerr << "Rank " << rank << ", epoch " << epoch << ", " << name << " score scaled from " << unscaled << " to " << unscaled*scaling_factor << endl;
    }
  }
}

// Multi-score variant of scaleFeatureScore: multiply every score that producer
// `sp` contributes to each feature vector by `scaling_factor`, in place, for
// every n-best entry of every batch item. `rank` and `epoch` are not used.
void scaleFeatureScores(ScoreProducer *sp, float scaling_factor, vector<vector<ScoreComponentCollection> > &featureValues, size_t rank, size_t epoch) {
  // short name of the producer; currently not used by any active code
  string name = sp->GetScoreProducerWeightShortName();

  for (size_t item = 0; item < featureValues.size(); ++item) {
    vector<ScoreComponentCollection> &nbest = featureValues[item];
    for (size_t hyp = 0; hyp < nbest.size(); ++hyp) {
      // fetch the producer's scores, scale the copy, then write it back
      vector<float> scaled = nbest[hyp].GetScoresForProducer(sp);
      for (vector<float>::iterator score = scaled.begin(); score != scaled.end(); ++score)
	*score *= scaling_factor;
      nbest[hyp].Assign(sp, scaled);
    }
  }
}