Merge branch 'vw_tgtcontext' of github.com:moses-smt/mosesdecoder

Ales Tamchyna 2016-06-07 17:14:16 +02:00
commit 44c9e6db17
30 changed files with 1415 additions and 491 deletions

View File

@@ -77,6 +77,9 @@
#ifdef HAVE_VW
#include "moses/FF/VW/VW.h"
#include "moses/FF/VW/VWFeatureContextBigrams.h"
#include "moses/FF/VW/VWFeatureContextBilingual.h"
#include "moses/FF/VW/VWFeatureContextWindow.h"
#include "moses/FF/VW/VWFeatureSourceBagOfWords.h"
#include "moses/FF/VW/VWFeatureSourceBigrams.h"
#include "moses/FF/VW/VWFeatureSourceIndicator.h"
@@ -300,6 +303,9 @@ FeatureRegistry::FeatureRegistry()
#ifdef HAVE_VW
MOSES_FNAME(VW);
MOSES_FNAME(VWFeatureContextBigrams);
MOSES_FNAME(VWFeatureContextBilingual);
MOSES_FNAME(VWFeatureContextWindow);
MOSES_FNAME(VWFeatureSourceBagOfWords);
MOSES_FNAME(VWFeatureSourceBigrams);
MOSES_FNAME(VWFeatureSourceIndicator);

View File

@@ -0,0 +1,40 @@
#pragma once
#include <limits>
namespace Moses
{
/**
* Helper class for storing alignment constraints.
*/
class AlignmentConstraint
{
public:
AlignmentConstraint() : m_min(std::numeric_limits<int>::max()), m_max(-1) {}
AlignmentConstraint(int min, int max) : m_min(min), m_max(max) {}
/**
* We are aligned to point => our min cannot be larger, our max cannot be smaller.
*/
void Update(int point) {
if (m_min > point) m_min = point;
if (m_max < point) m_max = point;
}
bool IsSet() const {
return m_max != -1;
}
int GetMin() const {
return m_min;
}
int GetMax() const {
return m_max;
}
private:
int m_min, m_max;
};
}
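A minimal usage sketch of this helper (not part of the commit; it assumes the header is available as moses/FF/VW/AlignmentConstraint.h):

#include <iostream>
#include "moses/FF/VW/AlignmentConstraint.h"
int main() {
Moses::AlignmentConstraint c; // initially unset: min = INT_MAX, max = -1
c.Update(3); // aligned to point 3 => min = max = 3
c.Update(5); // => min = 3, max = 5
c.Update(1); // => min = 1, max = 5
std::cout << c.IsSet() << " " << c.GetMin() << " " << c.GetMax() << std::endl; // prints: 1 1 5
return 0;
}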

moses/FF/VW/VW.cpp (new file, 627 lines)
View File

@@ -0,0 +1,627 @@
#include <string>
#include <map>
#include <set>
#include <limits>
#include <vector>
#include <algorithm>
#include <boost/unordered_map.hpp>
#include <boost/functional/hash.hpp>
#include <boost/foreach.hpp>
#include "moses/FF/StatefulFeatureFunction.h"
#include "moses/PP/CountsPhraseProperty.h"
#include "moses/TranslationOptionList.h"
#include "moses/TranslationOption.h"
#include "moses/Util.h"
#include "moses/TypeDef.h"
#include "moses/StaticData.h"
#include "moses/Phrase.h"
#include "moses/AlignmentInfo.h"
#include "moses/AlignmentInfoCollection.h"
#include "moses/Word.h"
#include "moses/FactorCollection.h"
#include "Normalizer.h"
#include "Classifier.h"
#include "VWFeatureBase.h"
#include "TabbedSentence.h"
#include "ThreadLocalByFeatureStorage.h"
#include "TrainingLoss.h"
#include "VWTargetSentence.h"
#include "VWState.h"
#include "VW.h"
namespace Moses
{
VW::VW(const std::string &line)
: StatefulFeatureFunction(1, line)
, TLSTargetSentence(this)
, m_train(false)
, m_sentenceStartWord(Word()) {
ReadParameters();
Discriminative::ClassifierFactory *classifierFactory = m_train
? new Discriminative::ClassifierFactory(m_modelPath)
: new Discriminative::ClassifierFactory(m_modelPath, m_vwOptions);
m_tlsClassifier = new TLSClassifier(this, *classifierFactory);
m_tlsFutureScores = new TLSFloatHashMap(this);
m_tlsComputedStateExtensions = new TLSStateExtensions(this);
m_tlsTranslationOptionFeatures = new TLSFeatureVectorMap(this);
m_tlsTargetContextFeatures = new TLSFeatureVectorMap(this);
if (! m_normalizer) {
VERBOSE(1, "VW :: No loss function specified, assuming logistic loss.\n");
m_normalizer = (Discriminative::Normalizer *) new Discriminative::LogisticLossNormalizer();
}
if (! m_trainingLoss) {
VERBOSE(1, "VW :: Using basic 1/0 loss calculation in training.\n");
m_trainingLoss = (TrainingLoss *) new TrainingLossBasic();
}
// create a virtual beginning-of-sentence word with all factors replaced by <S>
const Factor *bosFactor = FactorCollection::Instance().AddFactor(BOS_);
for (size_t i = 0; i < MAX_NUM_FACTORS; i++)
m_sentenceStartWord.SetFactor(i, bosFactor);
}
VW::~VW() {
delete m_tlsClassifier;
delete m_normalizer;
// TODO delete more stuff
}
FFState* VW::EvaluateWhenApplied(
const Hypothesis& curHypo,
const FFState* prevState,
ScoreComponentCollection* accumulator) const
{
VERBOSE(3, "VW :: Evaluating translation options\n");
const VWState& prevVWState = *static_cast<const VWState *>(prevState);
const std::vector<VWFeatureBase*>& contextFeatures =
VWFeatureBase::GetTargetContextFeatures(GetScoreProducerDescription());
if (contextFeatures.empty()) {
// no target context features => we already evaluated everything in
// EvaluateTranslationOptionListWithSourceContext(). Nothing to do now,
// no state information to track.
return new VWState();
}
size_t spanStart = curHypo.GetTranslationOption().GetStartPos();
size_t spanEnd = curHypo.GetTranslationOption().GetEndPos();
// compute our current key
size_t cacheKey = MakeCacheKey(prevState, spanStart, spanEnd);
boost::unordered_map<size_t, FloatHashMap> &computedStateExtensions
= *m_tlsComputedStateExtensions->GetStored();
if (computedStateExtensions.find(cacheKey) == computedStateExtensions.end()) {
// we have not computed this set of translation options yet
const TranslationOptionList *topts =
curHypo.GetManager().getSntTranslationOptions()->GetTranslationOptionList(spanStart, spanEnd);
const InputType& input = curHypo.GetManager().GetSource();
Discriminative::Classifier &classifier = *m_tlsClassifier->GetStored();
// extract target context features
size_t contextHash = prevVWState.hash();
FeatureVectorMap &contextFeaturesCache = *m_tlsTargetContextFeatures->GetStored();
FeatureVectorMap::const_iterator contextIt = contextFeaturesCache.find(contextHash);
if (contextIt == contextFeaturesCache.end()) {
// we have not extracted features for this context yet
const Phrase &targetContext = prevVWState.GetPhrase();
Discriminative::FeatureVector contextVector;
const AlignmentInfo *alignInfo = TransformAlignmentInfo(curHypo, targetContext.GetSize());
for(size_t i = 0; i < contextFeatures.size(); ++i)
(*contextFeatures[i])(input, targetContext, *alignInfo, classifier, contextVector);
contextFeaturesCache[contextHash] = contextVector;
VERBOSE(3, "VW :: context cache miss\n");
} else {
// context already in cache, simply put feature IDs in the classifier object
classifier.AddLabelIndependentFeatureVector(contextIt->second);
VERBOSE(3, "VW :: context cache hit\n");
}
std::vector<float> losses(topts->size());
for (size_t toptIdx = 0; toptIdx < topts->size(); toptIdx++) {
const TranslationOption *topt = topts->Get(toptIdx);
const TargetPhrase &targetPhrase = topt->GetTargetPhrase();
size_t toptHash = hash_value(*topt);
// start with pre-computed source-context-only VW scores
losses[toptIdx] = m_tlsFutureScores->GetStored()->find(toptHash)->second;
// add all features associated with this translation option
// (pre-computed when evaluated with source context)
const Discriminative::FeatureVector &targetFeatureVector =
m_tlsTranslationOptionFeatures->GetStored()->find(toptHash)->second;
classifier.AddLabelDependentFeatureVector(targetFeatureVector);
// add classifier score with context+target features only to the total loss
losses[toptIdx] += classifier.Predict(MakeTargetLabel(targetPhrase));
}
// normalize classifier scores to get a probability distribution
(*m_normalizer)(losses);
// fill our cache with the results
FloatHashMap &toptScores = computedStateExtensions[cacheKey];
for (size_t toptIdx = 0; toptIdx < topts->size(); toptIdx++) {
const TranslationOption *topt = topts->Get(toptIdx);
size_t toptHash = hash_value(*topt);
toptScores[toptHash] = FloorScore(TransformScore(losses[toptIdx]));
}
VERBOSE(3, "VW :: cache miss\n");
} else {
VERBOSE(3, "VW :: cache hit\n");
}
// now our cache is guaranteed to contain the required score, simply look it up
std::vector<float> newScores(m_numScoreComponents);
size_t toptHash = hash_value(curHypo.GetTranslationOption());
newScores[0] = computedStateExtensions[cacheKey][toptHash];
VERBOSE(3, "VW :: adding score: " << newScores[0] << "\n");
accumulator->PlusEquals(this, newScores);
return new VWState(prevVWState, curHypo);
}
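// A minimal sketch (an assumption for illustration, not the actual Moses code;
// see Normalizer.h) of what the logistic-loss normalizer invoked above via
// (*m_normalizer)(losses) might do:
//
//   void NormalizeLogistic(std::vector<float> &losses) {
//     float sum = 0.0f;
//     for (size_t i = 0; i < losses.size(); i++) {
//       losses[i] = 1.0f / (1.0f + std::exp(losses[i])); // lower VW loss => higher score
//       sum += losses[i];
//     }
//     for (size_t i = 0; i < losses.size(); i++)
//       losses[i] /= sum; // make the scores sum to 1
//   }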
const FFState* VW::EmptyHypothesisState(const InputType &input) const {
size_t maxContextSize = VWFeatureBase::GetMaximumContextSize(GetScoreProducerDescription());
Phrase initialPhrase;
for (size_t i = 0; i < maxContextSize; i++)
initialPhrase.AddWord(m_sentenceStartWord);
return new VWState(initialPhrase);
}
void VW::EvaluateTranslationOptionListWithSourceContext(const InputType &input
, const TranslationOptionList &translationOptionList) const {
Discriminative::Classifier &classifier = *m_tlsClassifier->GetStored();
if (translationOptionList.size() == 0)
return; // nothing to do
VERBOSE(3, "VW :: Evaluating translation options\n");
// which feature functions do we use (on the source and target side)
const std::vector<VWFeatureBase*>& sourceFeatures =
VWFeatureBase::GetSourceFeatures(GetScoreProducerDescription());
const std::vector<VWFeatureBase*>& contextFeatures =
VWFeatureBase::GetTargetContextFeatures(GetScoreProducerDescription());
const std::vector<VWFeatureBase*>& targetFeatures =
VWFeatureBase::GetTargetFeatures(GetScoreProducerDescription());
size_t maxContextSize = VWFeatureBase::GetMaximumContextSize(GetScoreProducerDescription());
// only use stateful score computation when needed
bool haveTargetContextFeatures = ! contextFeatures.empty();
const Range &sourceRange = translationOptionList.Get(0)->GetSourceWordsRange();
if (m_train) {
//
// extract features for training the classifier (only call this when using vwtrainer, not in Moses!)
//
// find which topts are correct
std::vector<bool> correct(translationOptionList.size());
std::vector<int> startsAt(translationOptionList.size());
std::set<int> uncoveredStartingPositions;
for (size_t i = 0; i < translationOptionList.size(); i++) {
std::pair<bool, int> isCorrect = IsCorrectTranslationOption(* translationOptionList.Get(i));
correct[i] = isCorrect.first;
startsAt[i] = isCorrect.second;
if (isCorrect.first) {
uncoveredStartingPositions.insert(isCorrect.second);
}
}
// optionally update translation options using leave-one-out
std::vector<bool> keep = (m_leaveOneOut.size() > 0)
? LeaveOneOut(translationOptionList, correct)
: std::vector<bool>(translationOptionList.size(), true);
while (! uncoveredStartingPositions.empty()) {
int currentStart = *uncoveredStartingPositions.begin();
uncoveredStartingPositions.erase(uncoveredStartingPositions.begin());
// check whether we (still) have some correct translation
int firstCorrect = -1;
for (size_t i = 0; i < translationOptionList.size(); i++) {
if (keep[i] && correct[i] && startsAt[i] == currentStart) {
firstCorrect = i;
break;
}
}
// do not train if there are no positive examples
if (firstCorrect == -1) {
VERBOSE(3, "VW :: skipping topt collection, no correct translation for span at current tgt start position\n");
continue;
}
// the first correct topt can be used by some loss functions
const TargetPhrase &correctPhrase = translationOptionList.Get(firstCorrect)->GetTargetPhrase();
// feature extraction *at prediction time* outputs feature hashes which can be cached;
// this is training time, simply store everything in this dummyVector
Discriminative::FeatureVector dummyVector;
// extract source side features
for(size_t i = 0; i < sourceFeatures.size(); ++i)
(*sourceFeatures[i])(input, sourceRange, classifier, dummyVector);
// build target-side context
Phrase targetContext;
for (size_t i = 0; i < maxContextSize; i++)
targetContext.AddWord(m_sentenceStartWord);
const Phrase *targetSent = GetStored()->m_sentence;
// word alignment info shifted by context size
AlignmentInfo contextAlignment = TransformAlignmentInfo(*GetStored()->m_alignment, maxContextSize, currentStart);
if (currentStart > 0)
targetContext.Append(targetSent->GetSubString(Range(0, currentStart - 1)));
// extract target-context features
for(size_t i = 0; i < contextFeatures.size(); ++i)
(*contextFeatures[i])(input, targetContext, contextAlignment, classifier, dummyVector);
// go over topts, extract target side features and train the classifier
for (size_t toptIdx = 0; toptIdx < translationOptionList.size(); toptIdx++) {
// this topt was discarded by leaving one out
if (! keep[toptIdx])
continue;
// extract target-side features for each topt
const TargetPhrase &targetPhrase = translationOptionList.Get(toptIdx)->GetTargetPhrase();
for(size_t i = 0; i < targetFeatures.size(); ++i)
(*targetFeatures[i])(input, targetPhrase, classifier, dummyVector);
bool isCorrect = correct[toptIdx] && startsAt[toptIdx] == currentStart;
float loss = (*m_trainingLoss)(targetPhrase, correctPhrase, isCorrect);
// train classifier on current example
classifier.Train(MakeTargetLabel(targetPhrase), loss);
}
}
} else {
//
// predict using a trained classifier, use this in decoding (=at test time)
//
std::vector<float> losses(translationOptionList.size());
Discriminative::FeatureVector outFeaturesSourceNamespace;
// extract source side features
for(size_t i = 0; i < sourceFeatures.size(); ++i)
(*sourceFeatures[i])(input, sourceRange, classifier, outFeaturesSourceNamespace);
for (size_t toptIdx = 0; toptIdx < translationOptionList.size(); toptIdx++) {
const TranslationOption *topt = translationOptionList.Get(toptIdx);
const TargetPhrase &targetPhrase = topt->GetTargetPhrase();
Discriminative::FeatureVector outFeaturesTargetNamespace;
// extract target-side features for each topt
for(size_t i = 0; i < targetFeatures.size(); ++i)
(*targetFeatures[i])(input, targetPhrase, classifier, outFeaturesTargetNamespace);
// cache the extracted target features (i.e. features associated with given topt)
// for future use at decoding time
size_t toptHash = hash_value(*topt);
m_tlsTranslationOptionFeatures->GetStored()->insert(
std::make_pair(toptHash, outFeaturesTargetNamespace));
// get classifier score
losses[toptIdx] = classifier.Predict(MakeTargetLabel(targetPhrase));
}
// normalize classifier scores to get a probability distribution
std::vector<float> rawLosses = losses;
(*m_normalizer)(losses);
// update scores of topts
for (size_t toptIdx = 0; toptIdx < translationOptionList.size(); toptIdx++) {
TranslationOption *topt = *(translationOptionList.begin() + toptIdx);
if (! haveTargetContextFeatures) {
// no target context features; evaluate the FF now
std::vector<float> newScores(m_numScoreComponents);
newScores[0] = FloorScore(TransformScore(losses[toptIdx]));
ScoreComponentCollection &scoreBreakDown = topt->GetScoreBreakdown();
scoreBreakDown.PlusEquals(this, newScores);
topt->UpdateScore();
} else {
// We have target context features => this is just a partial score,
// do not add it to the score component collection.
size_t toptHash = hash_value(*topt);
// Subtract the score contribution of target-only features, otherwise it would
// be included twice.
Discriminative::FeatureVector emptySource;
const Discriminative::FeatureVector &targetFeatureVector =
m_tlsTranslationOptionFeatures->GetStored()->find(toptHash)->second;
classifier.AddLabelIndependentFeatureVector(emptySource);
classifier.AddLabelDependentFeatureVector(targetFeatureVector);
float targetOnlyLoss = classifier.Predict(VW_DUMMY_LABEL);
float futureScore = rawLosses[toptIdx] - targetOnlyLoss;
m_tlsFutureScores->GetStored()->insert(std::make_pair(toptHash, futureScore));
}
}
}
}
void VW::SetParameter(const std::string& key, const std::string& value) {
if (key == "train") {
m_train = Scan<bool>(value);
} else if (key == "path") {
m_modelPath = value;
} else if (key == "vw-options") {
m_vwOptions = value;
} else if (key == "leave-one-out-from") {
m_leaveOneOut = value;
} else if (key == "training-loss") {
// which type of loss to use for training
if (value == "basic") {
m_trainingLoss = (TrainingLoss *) new TrainingLossBasic();
} else if (value == "bleu") {
m_trainingLoss = (TrainingLoss *) new TrainingLossBLEU();
} else {
UTIL_THROW2("Unknown training loss type:" << value);
}
} else if (key == "loss") {
// which normalizer to use (theoretically this depends on the loss function used for training
// the classifier: squared/logistic/hinge/..., hence the name "loss")
if (value == "logistic") {
m_normalizer = (Discriminative::Normalizer *) new Discriminative::LogisticLossNormalizer();
} else if (value == "squared") {
m_normalizer = (Discriminative::Normalizer *) new Discriminative::SquaredLossNormalizer();
} else {
UTIL_THROW2("Unknown loss type:" << value);
}
} else {
StatefulFeatureFunction::SetParameter(key, value);
}
}
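// The keys handled above map directly to options on the VW feature line in
// moses.ini. Hypothetical examples (paths and feature names are made up):
//
//   decoding:  VW name=VW0 path=/path/to/model.vw vw-options="--quiet" loss=logistic
//   training:  VW name=VW0 train=true path=extracted-features.gz training-loss=bleu
//              leave-one-out-from=TranslationModel0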
void VW::InitializeForInput(ttasksptr const& ttask) {
// do not keep future cost estimates across sentences!
m_tlsFutureScores->GetStored()->clear();
// invalidate our caches after each sentence
m_tlsComputedStateExtensions->GetStored()->clear();
// clearing these caches is not strictly required for correctness; we do it
// so that they cannot grow indefinitely large, even though target contexts
// and translation options would produce identical features the next time
// we extract them...
m_tlsTargetContextFeatures->GetStored()->clear();
m_tlsTranslationOptionFeatures->GetStored()->clear();
InputType const& source = *(ttask->GetSource().get());
// tabbed sentence is assumed only in training
if (! m_train)
return;
UTIL_THROW_IF2(source.GetType() != TabbedSentenceInput,
"This feature function requires the TabbedSentence input type");
const TabbedSentence& tabbedSentence = static_cast<const TabbedSentence&>(source);
UTIL_THROW_IF2(tabbedSentence.GetColumns().size() < 2,
"TabbedSentence must contain target<tab>alignment");
// target sentence represented as a phrase
Phrase *target = new Phrase();
target->CreateFromString(
Output
, StaticData::Instance().options()->output.factor_order
, tabbedSentence.GetColumns()[0]
, NULL);
// word alignment between source and target sentence
// we don't store alignment info in AlignmentInfoCollection because we keep alignments of whole
// sentences, not phrases
AlignmentInfo *alignment = new AlignmentInfo(tabbedSentence.GetColumns()[1]);
VWTargetSentence &targetSent = *GetStored();
targetSent.Clear();
targetSent.m_sentence = target;
targetSent.m_alignment = alignment;
// pre-compute max- and min- aligned points for faster translation option checking
targetSent.SetConstraints(source.GetSize());
}
/*************************************************************************************
* private methods
************************************************************************************/
const AlignmentInfo *VW::TransformAlignmentInfo(const Hypothesis &curHypo, size_t contextSize) const {
std::set<std::pair<size_t, size_t> > alignmentPoints;
const Hypothesis *contextHypo = curHypo.GetPrevHypo();
int idxInContext = contextSize - 1;
int processedWordsInHypo = 0;
while (idxInContext >= 0 && contextHypo) {
int idxInHypo = contextHypo->GetCurrTargetLength() - 1 - processedWordsInHypo;
if (idxInHypo >= 0) {
const AlignmentInfo &hypoAlign = contextHypo->GetCurrTargetPhrase().GetAlignTerm();
std::set<size_t> alignedToTgt = hypoAlign.GetAlignmentsForTarget(idxInHypo);
size_t srcOffset = contextHypo->GetCurrSourceWordsRange().GetStartPos();
BOOST_FOREACH(size_t srcIdx, alignedToTgt) {
alignmentPoints.insert(std::make_pair(srcOffset + srcIdx, idxInContext));
}
processedWordsInHypo++;
idxInContext--;
} else {
processedWordsInHypo = 0;
contextHypo = contextHypo->GetPrevHypo();
}
}
return AlignmentInfoCollection::Instance().Add(alignmentPoints);
}
AlignmentInfo VW::TransformAlignmentInfo(const AlignmentInfo &alignInfo, size_t contextSize, int currentStart) const {
std::set<std::pair<size_t, size_t> > alignmentPoints;
for (int i = std::max(0, currentStart - (int)contextSize); i < currentStart; i++) {
std::set<size_t> alignedToTgt = alignInfo.GetAlignmentsForTarget(i);
BOOST_FOREACH(size_t srcIdx, alignedToTgt) {
alignmentPoints.insert(std::make_pair(srcIdx, i + contextSize));
}
}
return AlignmentInfo(alignmentPoints);
}
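// Worked example (indices made up for illustration): with contextSize = 2 and
// currentStart = 3, the loop above visits sentence positions i in {1, 2}; an
// alignment point (src, 1) becomes (src, 3) and (src, 2) becomes (src, 4).
// This matches the context phrase built at training time: contextSize <S>
// tokens followed by target words 0..currentStart-1, so sentence word i sits
// at context position i + contextSize.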
std::pair<bool, int> VW::IsCorrectTranslationOption(const TranslationOption &topt) const {
//std::cerr << topt.GetSourceWordsRange() << std::endl;
int sourceStart = topt.GetSourceWordsRange().GetStartPos();
int sourceEnd = topt.GetSourceWordsRange().GetEndPos();
const VWTargetSentence &targetSentence = *GetStored();
// [targetStart, targetEnd] spans aligned target words
int targetStart = targetSentence.m_sentence->GetSize();
int targetEnd = -1;
// get the left-most and right-most alignment point within source span
for(int i = sourceStart; i <= sourceEnd; ++i) {
if(targetSentence.m_sourceConstraints[i].IsSet()) {
if(targetStart > targetSentence.m_sourceConstraints[i].GetMin())
targetStart = targetSentence.m_sourceConstraints[i].GetMin();
if(targetEnd < targetSentence.m_sourceConstraints[i].GetMax())
targetEnd = targetSentence.m_sourceConstraints[i].GetMax();
}
}
// there was no alignment
if(targetEnd == -1)
return std::make_pair(false, -1);
//std::cerr << "Shorter: " << targetStart << " " << targetEnd << std::endl;
// [targetStart2, targetEnd2] spans unaligned words left and right of [targetStart, targetEnd]
int targetStart2 = targetStart;
// start searching one position outside the aligned span: the boundary words
// themselves are aligned, so starting at them would never expand the span
for(int i = targetStart2 - 1; i >= 0 && !targetSentence.m_targetConstraints[i].IsSet(); --i)
targetStart2 = i;
int targetEnd2 = targetEnd;
for(int i = targetEnd2 + 1;
i < (int)targetSentence.m_sentence->GetSize() && !targetSentence.m_targetConstraints[i].IsSet();
++i)
targetEnd2 = i;
//std::cerr << "Longer: " << targetStart2 << " " << targetEnd2 << std::endl;
const TargetPhrase &tphrase = topt.GetTargetPhrase();
//std::cerr << tphrase << std::endl;
// if target phrase is shorter than inner span return false
if(tphrase.GetSize() < targetEnd - targetStart + 1)
return std::make_pair(false, -1);
// if target phrase is longer than outer span return false
if(tphrase.GetSize() > targetEnd2 - targetStart2 + 1)
return std::make_pair(false, -1);
// for each possible starting point
for(int tempStart = targetStart2; tempStart <= targetStart; tempStart++) {
bool found = true;
// check if the target phrase is within longer span
for(int i = tempStart; i <= targetEnd2 && i < tphrase.GetSize() + tempStart; ++i) {
if(tphrase.GetWord(i - tempStart) != targetSentence.m_sentence->GetWord(i)) {
found = false;
break;
}
}
// return true if there was a match
if(found) {
//std::cerr << "Found" << std::endl;
return std::make_pair(true, tempStart);
}
}
return std::make_pair(false, -1);
}
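// Worked example (alignment made up for illustration): if the source span is
// aligned to target words 4-5 (the inner span [targetStart, targetEnd]), word 3
// is unaligned and word 6 is aligned, the outer span [targetStart2, targetEnd2]
// becomes [3, 5]. A 2-word target phrase matching words 4-5 or a 3-word phrase
// matching words 3-5 is accepted (with start position 4 or 3, respectively);
// anything shorter than 2 or longer than 3 words is rejected.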
std::vector<bool> VW::LeaveOneOut(const TranslationOptionList &topts, const std::vector<bool> &correct) const {
UTIL_THROW_IF2(m_leaveOneOut.size() == 0 || ! m_train, "LeaveOneOut called in wrong setting!");
float sourceRawCount = 0.0;
const float ONE = 1.0001; // slightly above 1 to stay on the safe side of floating-point rounding when discounting counts
std::vector<bool> keepOpt;
for (size_t i = 0; i < topts.size(); i++) {
TranslationOption *topt = *(topts.begin() + i);
const TargetPhrase &targetPhrase = topt->GetTargetPhrase();
// extract raw counts from phrase-table property
const CountsPhraseProperty *property =
static_cast<const CountsPhraseProperty *>(targetPhrase.GetProperty("Counts"));
if (! property) {
VERBOSE(2, "VW :: Counts not found for topt! Is this an OOV?\n");
// keep all translation opts without updating, this is either OOV or bad usage...
keepOpt.assign(topts.size(), true);
return keepOpt;
}
if (sourceRawCount == 0.0) {
sourceRawCount = property->GetSourceMarginal() - ONE; // discount one occurrence of the source phrase
if (sourceRawCount <= 0) {
// no translation options survived, source phrase was a singleton
keepOpt.assign(topts.size(), false);
return keepOpt;
}
}
float discount = correct[i] ? ONE : 0.0;
float target = property->GetTargetMarginal() - discount;
float joint = property->GetJointCount() - discount;
if (discount != 0.0) VERBOSE(3, "VW :: leaving one out!\n");
if (joint > 0) {
// topt survived leaving one out, update its scores
const FeatureFunction *feature = &FindFeatureFunction(m_leaveOneOut);
std::vector<float> scores = targetPhrase.GetScoreBreakdown().GetScoresForProducer(feature);
UTIL_THROW_IF2(scores.size() != 4, "Unexpected number of scores in feature " << m_leaveOneOut);
scores[0] = TransformScore(joint / target); // P(f|e)
scores[2] = TransformScore(joint / sourceRawCount); // P(e|f)
ScoreComponentCollection &scoreBreakDown = topt->GetScoreBreakdown();
scoreBreakDown.Assign(feature, scores);
topt->UpdateScore();
keepOpt.push_back(true);
} else {
// they only occurred together once, discard topt
VERBOSE(2, "VW :: discarded topt when leaving one out\n");
keepOpt.push_back(false);
}
}
return keepOpt;
}
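// Worked example (counts made up for illustration): for a phrase pair with
// joint count 3, target marginal 10 and source marginal 7, leaving one out for
// a correct option gives joint of roughly 2, target of roughly 9 and
// sourceRawCount of roughly 6, so the recomputed scores are about P(f|e) = 2/9
// and P(e|f) = 2/6 instead of 3/10 and 3/7. A pair seen only once (joint count
// 1) drops just below zero and the option is discarded.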
} // namespace Moses

View File

@@ -3,8 +3,12 @@
#include <string>
#include <map>
#include <limits>
#include <vector>
#include "moses/FF/StatelessFeatureFunction.h"
#include <boost/unordered_map.hpp>
#include <boost/functional/hash.hpp>
#include "moses/FF/StatefulFeatureFunction.h"
#include "moses/PP/CountsPhraseProperty.h"
#include "moses/TranslationOptionList.h"
#include "moses/TranslationOption.h"
@@ -13,6 +17,8 @@
#include "moses/StaticData.h"
#include "moses/Phrase.h"
#include "moses/AlignmentInfo.h"
#include "moses/Word.h"
#include "moses/FactorCollection.h"
#include "Normalizer.h"
#include "Classifier.h"
@@ -20,119 +26,50 @@
#include "TabbedSentence.h"
#include "ThreadLocalByFeatureStorage.h"
#include "TrainingLoss.h"
#include "VWTargetSentence.h"
/*
* VW classifier feature. See vw/README.md for further information.
*
* TODO: say which paper to cite.
*/
namespace Moses
{
const std::string VW_DUMMY_LABEL = "1111"; // VW does not use the actual label, other classifiers might
/**
* Helper class for storing alignment constraints.
*/
class Constraint
{
public:
Constraint() : m_min(std::numeric_limits<int>::max()), m_max(-1) {}
Constraint(int min, int max) : m_min(min), m_max(max) {}
/**
* We are aligned to point => our min cannot be larger, our max cannot be smaller.
*/
void Update(int point) {
if (m_min > point) m_min = point;
if (m_max < point) m_max = point;
}
bool IsSet() const {
return m_max != -1;
}
int GetMin() const {
return m_min;
}
int GetMax() const {
return m_max;
}
private:
int m_min, m_max;
};
/**
* VW thread-specific data about target sentence.
*/
struct VWTargetSentence {
VWTargetSentence() : m_sentence(NULL), m_alignment(NULL) {}
void Clear() {
if (m_sentence) delete m_sentence;
if (m_alignment) delete m_alignment;
}
~VWTargetSentence() {
Clear();
}
void SetConstraints(size_t sourceSize) {
// initialize to unconstrained
m_sourceConstraints.assign(sourceSize, Constraint());
m_targetConstraints.assign(m_sentence->GetSize(), Constraint());
// set constraints according to alignment points
AlignmentInfo::const_iterator it;
for (it = m_alignment->begin(); it != m_alignment->end(); it++) {
int src = it->first;
int tgt = it->second;
if (src >= m_sourceConstraints.size() || tgt >= m_targetConstraints.size()) {
UTIL_THROW2("VW :: alignment point out of bounds: " << src << "-" << tgt);
}
m_sourceConstraints[src].Update(tgt);
m_targetConstraints[tgt].Update(src);
}
}
Phrase *m_sentence;
AlignmentInfo *m_alignment;
std::vector<Constraint> m_sourceConstraints, m_targetConstraints;
};
// dummy class label; VW does not use the actual label, other classifiers might
const std::string VW_DUMMY_LABEL = "1111";
// thread-specific classifier instance
typedef ThreadLocalByFeatureStorage<Discriminative::Classifier, Discriminative::ClassifierFactory &> TLSClassifier;
// current target sentence, used in VW training (vwtrainer), not in decoding (prediction time)
typedef ThreadLocalByFeatureStorage<VWTargetSentence> TLSTargetSentence;
class VW : public StatelessFeatureFunction, public TLSTargetSentence
// hash table of feature vectors
typedef boost::unordered_map<size_t, Discriminative::FeatureVector> FeatureVectorMap;
// thread-specific feature vector hash
typedef ThreadLocalByFeatureStorage<FeatureVectorMap> TLSFeatureVectorMap;
// hash table of partial scores
typedef boost::unordered_map<size_t, float> FloatHashMap;
// thread-specific score hash table, used for caching
typedef ThreadLocalByFeatureStorage<FloatHashMap> TLSFloatHashMap;
// thread-specific hash table for caching full classifier outputs
typedef ThreadLocalByFeatureStorage<boost::unordered_map<size_t, FloatHashMap> > TLSStateExtensions;
/*
* VW feature function. A discriminative classifier with source and target context features.
*/
class VW : public StatefulFeatureFunction, public TLSTargetSentence
{
public:
VW(const std::string &line)
: StatelessFeatureFunction(1, line)
, TLSTargetSentence(this)
, m_train(false) {
ReadParameters();
Discriminative::ClassifierFactory *classifierFactory = m_train
? new Discriminative::ClassifierFactory(m_modelPath)
: new Discriminative::ClassifierFactory(m_modelPath, m_vwOptions);
VW(const std::string &line);
m_tlsClassifier = new TLSClassifier(this, *classifierFactory);
if (! m_normalizer) {
VERBOSE(1, "VW :: No loss function specified, assuming logistic loss.\n");
m_normalizer = (Discriminative::Normalizer *) new Discriminative::LogisticLossNormalizer();
}
if (! m_trainingLoss) {
VERBOSE(1, "VW :: Using basic 1/0 loss calculation in training.\n");
m_trainingLoss = (TrainingLoss *) new TrainingLossBasic();
}
}
virtual ~VW() {
delete m_tlsClassifier;
delete m_normalizer;
}
virtual ~VW();
bool IsUseable(const FactorMask &mask) const {
return true;
@@ -152,335 +89,89 @@ public:
, ScoreComponentCollection *estimatedFutureScore = NULL) const {
}
void EvaluateTranslationOptionListWithSourceContext(const InputType &input
, const TranslationOptionList &translationOptionList) const {
Discriminative::Classifier &classifier = *m_tlsClassifier->GetStored();
// The behavior of this method depends on whether it is called during VW
// training (feature extraction) by vwtrainer or during decoding (prediction
// time) by Moses.
//
// When predicting, it evaluates all translation options with the VW model;
// if no target-context features are defined, this is the final score and it
// is added directly to the TranslationOption score. If there are target
// context features, the score is a partial score and it is only stored in
// cache; the final score is computed based on target context in
// EvaluateWhenApplied().
//
// This method is also used in training by vwtrainer, in which case features
// are written to a file and no classifier predictions take place. Target-side
// context is constant at training time (we know the true target sentence),
// so target-context features are extracted here as well.
virtual void EvaluateTranslationOptionListWithSourceContext(const InputType &input
, const TranslationOptionList &translationOptionList) const;
if (translationOptionList.size() == 0)
return; // nothing to do
// Evaluate VW during decoding. This is only used at prediction time (not in training).
// When no target-context features are defined, VW predictions were already fully calculated
// in EvaluateTranslationOptionListWithSourceContext() and the scores were already added to
// the translation options.
// If there are target-context features, we compute the context-dependent part of the
// classifier score and combine it with the source-context only partial score which was computed
// in EvaluateTranslationOptionListWithSourceContext(). Various caches are used to make this
// method more efficient.
virtual FFState* EvaluateWhenApplied(
const Hypothesis& curHypo,
const FFState* prevState,
ScoreComponentCollection* accumulator) const;
VERBOSE(2, "VW :: Evaluating translation options\n");
// which feature functions do we use (on the source and target side)
const std::vector<VWFeatureBase*>& sourceFeatures =
VWFeatureBase::GetSourceFeatures(GetScoreProducerDescription());
const std::vector<VWFeatureBase*>& targetFeatures =
VWFeatureBase::GetTargetFeatures(GetScoreProducerDescription());
const Range &sourceRange = translationOptionList.Get(0)->GetSourceWordsRange();
const InputPath &inputPath = translationOptionList.Get(0)->GetInputPath();
if (m_train) {
//
// extract features for training the classifier (only call this when using vwtrainer, not in Moses!)
//
// find which topts are correct
std::vector<bool> correct(translationOptionList.size());
for (size_t i = 0; i < translationOptionList.size(); i++)
correct[i] = IsCorrectTranslationOption(* translationOptionList.Get(i));
// optionally update translation options using leave-one-out
std::vector<bool> keep = (m_leaveOneOut.size() > 0)
? LeaveOneOut(translationOptionList, correct)
: std::vector<bool>(translationOptionList.size(), true);
// check whether we (still) have some correct translation
int firstCorrect = -1;
for (size_t i = 0; i < translationOptionList.size(); i++) {
if (keep[i] && correct[i]) {
firstCorrect = i;
break;
}
}
// do not train if there are no positive examples
if (firstCorrect == -1) {
VERBOSE(2, "VW :: skipping topt collection, no correct translation for span\n");
return;
}
// the first correct topt can be used by some loss functions
const TargetPhrase &correctPhrase = translationOptionList.Get(firstCorrect)->GetTargetPhrase();
// extract source side features
for(size_t i = 0; i < sourceFeatures.size(); ++i)
(*sourceFeatures[i])(input, inputPath, sourceRange, classifier);
// go over topts, extract target side features and train the classifier
for (size_t toptIdx = 0; toptIdx < translationOptionList.size(); toptIdx++) {
// this topt was discarded by leaving one out
if (! keep[toptIdx])
continue;
// extract target-side features for each topt
const TargetPhrase &targetPhrase = translationOptionList.Get(toptIdx)->GetTargetPhrase();
for(size_t i = 0; i < targetFeatures.size(); ++i)
(*targetFeatures[i])(input, inputPath, targetPhrase, classifier);
float loss = (*m_trainingLoss)(targetPhrase, correctPhrase, correct[toptIdx]);
// train classifier on current example
classifier.Train(MakeTargetLabel(targetPhrase), loss);
}
} else {
//
// predict using a trained classifier, use this in decoding (=at test time)
//
std::vector<float> losses(translationOptionList.size());
// extract source side features
for(size_t i = 0; i < sourceFeatures.size(); ++i)
(*sourceFeatures[i])(input, inputPath, sourceRange, classifier);
for (size_t toptIdx = 0; toptIdx < translationOptionList.size(); toptIdx++) {
const TranslationOption *topt = translationOptionList.Get(toptIdx);
const TargetPhrase &targetPhrase = topt->GetTargetPhrase();
// extract target-side features for each topt
for(size_t i = 0; i < targetFeatures.size(); ++i)
(*targetFeatures[i])(input, inputPath, targetPhrase, classifier);
// get classifier score
losses[toptIdx] = classifier.Predict(MakeTargetLabel(targetPhrase));
}
// normalize classifier scores to get a probability distribution
(*m_normalizer)(losses);
// update scores of topts
for (size_t toptIdx = 0; toptIdx < translationOptionList.size(); toptIdx++) {
TranslationOption *topt = *(translationOptionList.begin() + toptIdx);
std::vector<float> newScores(m_numScoreComponents);
newScores[0] = FloorScore(TransformScore(losses[toptIdx]));
ScoreComponentCollection &scoreBreakDown = topt->GetScoreBreakdown();
scoreBreakDown.PlusEquals(this, newScores);
topt->UpdateScore();
}
}
virtual FFState* EvaluateWhenApplied(
const ChartHypothesis&,
int,
ScoreComponentCollection* accumulator) const {
throw std::logic_error("hierarchical/syntax not supported");
}
void EvaluateWhenApplied(const Hypothesis& hypo,
ScoreComponentCollection* accumulator) const {
}
// Initial VW state; contains unaligned BOS symbols.
const FFState* EmptyHypothesisState(const InputType &input) const;
void EvaluateWhenApplied(const ChartHypothesis &hypo,
ScoreComponentCollection* accumulator) const {
}
void SetParameter(const std::string& key, const std::string& value) {
if (key == "train") {
m_train = Scan<bool>(value);
} else if (key == "path") {
m_modelPath = value;
} else if (key == "vw-options") {
m_vwOptions = value;
} else if (key == "leave-one-out-from") {
m_leaveOneOut = value;
} else if (key == "training-loss") {
// which type of loss to use for training
if (value == "basic") {
m_trainingLoss = (TrainingLoss *) new TrainingLossBasic();
} else if (value == "bleu") {
m_trainingLoss = (TrainingLoss *) new TrainingLossBLEU();
} else {
UTIL_THROW2("Unknown training loss type:" << value);
}
} else if (key == "loss") {
// which normalizer to use (theoretically depends on the loss function used for training the
// classifier (squared/logistic/hinge/...), hence the name "loss"
if (value == "logistic") {
m_normalizer = (Discriminative::Normalizer *) new Discriminative::LogisticLossNormalizer();
} else if (value == "squared") {
m_normalizer = (Discriminative::Normalizer *) new Discriminative::SquaredLossNormalizer();
} else {
UTIL_THROW2("Unknown loss type:" << value);
}
} else {
StatelessFeatureFunction::SetParameter(key, value);
}
}
virtual void InitializeForInput(ttasksptr const& ttask) {
InputType const& source = *(ttask->GetSource().get());
// tabbed sentence is assumed only in training
if (! m_train)
return;
UTIL_THROW_IF2(source.GetType() != TabbedSentenceInput,
"This feature function requires the TabbedSentence input type");
const TabbedSentence& tabbedSentence = static_cast<const TabbedSentence&>(source);
UTIL_THROW_IF2(tabbedSentence.GetColumns().size() < 2,
"TabbedSentence must contain target<tab>alignment");
// target sentence represented as a phrase
Phrase *target = new Phrase();
target->CreateFromString(
Output
, StaticData::Instance().options()->output.factor_order
, tabbedSentence.GetColumns()[0]
, NULL);
// word alignment between source and target sentence
// we don't store alignment info in AlignmentInfoCollection because we keep alignments of whole
// sentences, not phrases
AlignmentInfo *alignment = new AlignmentInfo(tabbedSentence.GetColumns()[1]);
VWTargetSentence &targetSent = *GetStored();
targetSent.Clear();
targetSent.m_sentence = target;
targetSent.m_alignment = alignment;
// pre-compute max- and min- aligned points for faster translation option checking
targetSent.SetConstraints(source.GetSize());
}
void SetParameter(const std::string& key, const std::string& value);
// At prediction time, this clears our caches. At training time, we load the next sentence, its
// translation and word alignment.
virtual void InitializeForInput(ttasksptr const& ttask);
private:
std::string MakeTargetLabel(const TargetPhrase &targetPhrase) const {
return VW_DUMMY_LABEL;
inline std::string MakeTargetLabel(const TargetPhrase &targetPhrase) const {
return VW_DUMMY_LABEL; // VW does not care about class labels in our setting (--csoaa_ldf mc).
}
bool IsCorrectTranslationOption(const TranslationOption &topt) const {
//std::cerr << topt.GetSourceWordsRange() << std::endl;
int sourceStart = topt.GetSourceWordsRange().GetStartPos();
int sourceEnd = topt.GetSourceWordsRange().GetEndPos();
const VWTargetSentence &targetSentence = *GetStored();
// [targetStart, targetEnd] spans aligned target words
int targetStart = targetSentence.m_sentence->GetSize();
int targetEnd = -1;
// get the left-most and right-most alignment point within source span
for(int i = sourceStart; i <= sourceEnd; ++i) {
if(targetSentence.m_sourceConstraints[i].IsSet()) {
if(targetStart > targetSentence.m_sourceConstraints[i].GetMin())
targetStart = targetSentence.m_sourceConstraints[i].GetMin();
if(targetEnd < targetSentence.m_sourceConstraints[i].GetMax())
targetEnd = targetSentence.m_sourceConstraints[i].GetMax();
}
}
// there was no alignment
if(targetEnd == -1)
return false;
//std::cerr << "Shorter: " << targetStart << " " << targetEnd << std::endl;
// [targetStart2, targetEnd2] spans unaligned words left and right of [targetStart, targetEnd]
int targetStart2 = targetStart;
for(int i = targetStart2; i >= 0 && !targetSentence.m_targetConstraints[i].IsSet(); --i)
targetStart2 = i;
int targetEnd2 = targetEnd;
for(int i = targetEnd2;
i < targetSentence.m_sentence->GetSize() && !targetSentence.m_targetConstraints[i].IsSet();
++i)
targetEnd2 = i;
//std::cerr << "Longer: " << targetStart2 << " " << targetEnd2 << std::endl;
const TargetPhrase &tphrase = topt.GetTargetPhrase();
//std::cerr << tphrase << std::endl;
// if target phrase is shorter than inner span return false
if(tphrase.GetSize() < targetEnd - targetStart + 1)
return false;
// if target phrase is longer than outer span return false
if(tphrase.GetSize() > targetEnd2 - targetStart2 + 1)
return false;
// for each possible starting point
for(int tempStart = targetStart2; tempStart <= targetStart; tempStart++) {
bool found = true;
// check if the target phrase is within longer span
for(int i = tempStart; i <= targetEnd2 && i < tphrase.GetSize() + tempStart; ++i) {
if(tphrase.GetWord(i - tempStart) != targetSentence.m_sentence->GetWord(i)) {
found = false;
break;
}
}
// return true if there was a match
if(found) {
//std::cerr << "Found" << std::endl;
return true;
}
}
return false;
inline size_t MakeCacheKey(const FFState *prevState, size_t spanStart, size_t spanEnd) const {
size_t key = 0;
boost::hash_combine(key, prevState);
boost::hash_combine(key, spanStart);
boost::hash_combine(key, spanEnd);
return key;
}
std::vector<bool> LeaveOneOut(const TranslationOptionList &topts, const std::vector<bool> &correct) const {
UTIL_THROW_IF2(m_leaveOneOut.size() == 0 || ! m_train, "LeaveOneOut called in wrong setting!");
// used in decoding to transform the global word alignment information into
// context-phrase internal alignment information (i.e., with target indices corresponding
// to positions in contextPhrase)
const AlignmentInfo *TransformAlignmentInfo(const Hypothesis &curHypo, size_t contextSize) const;
float sourceRawCount = 0.0;
const float ONE = 1.0001; // I don't understand floating point numbers
// used during training to extract relevant alignment points from the full sentence alignment
// and shift them by target context size
AlignmentInfo TransformAlignmentInfo(const AlignmentInfo &alignInfo, size_t contextSize, int currentStart) const;
std::vector<bool> keepOpt;
// At training time, determine whether a translation option is correct for the current target sentence
// based on word alignment. This is a bit complicated because we need to handle various corner-cases
// where some word(s) on phrase borders are unaligned.
std::pair<bool, int> IsCorrectTranslationOption(const TranslationOption &topt) const;
for (size_t i = 0; i < topts.size(); i++) {
TranslationOption *topt = *(topts.begin() + i);
const TargetPhrase &targetPhrase = topt->GetTargetPhrase();
// extract raw counts from phrase-table property
const CountsPhraseProperty *property =
static_cast<const CountsPhraseProperty *>(targetPhrase.GetProperty("Counts"));
if (! property) {
VERBOSE(1, "VW :: Counts not found for topt! Is this an OOV?\n");
// keep all translation opts without updating, this is either OOV or bad usage...
keepOpt.assign(topts.size(), true);
return keepOpt;
}
if (sourceRawCount == 0.0) {
sourceRawCount = property->GetSourceMarginal() - ONE; // discount one occurrence of the source phrase
if (sourceRawCount <= 0) {
// no translation options survived, source phrase was a singleton
keepOpt.assign(topts.size(), false);
return keepOpt;
}
}
float discount = correct[i] ? ONE : 0.0;
float target = property->GetTargetMarginal() - discount;
float joint = property->GetJointCount() - discount;
if (discount != 0.0) VERBOSE(2, "VW :: leaving one out!\n");
if (joint > 0) {
// topt survived leaving one out, update its scores
const FeatureFunction *feature = &FindFeatureFunction(m_leaveOneOut);
std::vector<float> scores = targetPhrase.GetScoreBreakdown().GetScoresForProducer(feature);
UTIL_THROW_IF2(scores.size() != 4, "Unexpected number of scores in feature " << m_leaveOneOut);
scores[0] = TransformScore(joint / target); // P(f|e)
scores[2] = TransformScore(joint / sourceRawCount); // P(e|f)
ScoreComponentCollection &scoreBreakDown = topt->GetScoreBreakdown();
scoreBreakDown.Assign(feature, scores);
topt->UpdateScore();
keepOpt.push_back(true);
} else {
// they only occurred together once, discard topt
VERBOSE(2, "VW :: discarded topt when leaving one out\n");
keepOpt.push_back(false);
}
}
return keepOpt;
}
// At training time, optionally discount occurrences of phrase pairs from the current
// sentence; this helps prevent over-fitting.
std::vector<bool> LeaveOneOut(const TranslationOptionList &topts, const std::vector<bool> &correct) const;
bool m_train; // false means predict
std::string m_modelPath;
std::string m_vwOptions;
std::string m_modelPath; // path to the VW model file; at training time, this is where extracted features are stored
std::string m_vwOptions; // options for Vowpal Wabbit
// BOS token, all factors
Word m_sentenceStartWord;
// calculator of training loss
TrainingLoss *m_trainingLoss = NULL;
@@ -488,9 +179,16 @@ private:
// optionally contains feature name of a phrase table where we recompute scores with leaving one out
std::string m_leaveOneOut;
// normalizer, typically this means softmax
Discriminative::Normalizer *m_normalizer = NULL;
// thread-specific classifier instance
TLSClassifier *m_tlsClassifier;
// caches for partial scores and feature vectors
TLSFloatHashMap *m_tlsFutureScores;
TLSStateExtensions *m_tlsComputedStateExtensions;
TLSFeatureVectorMap *m_tlsTranslationOptionFeatures, *m_tlsTargetContextFeatures;
};
}

View File

@@ -2,11 +2,25 @@
#include <string>
#include <algorithm>
#include "VWFeatureBase.h"
#include "VWFeatureContext.h"
namespace Moses
{
std::map<std::string, std::vector<VWFeatureBase*> > VWFeatureBase::s_features;
std::map<std::string, std::vector<VWFeatureBase*> > VWFeatureBase::s_sourceFeatures;
std::map<std::string, std::vector<VWFeatureBase*> > VWFeatureBase::s_targetContextFeatures;
std::map<std::string, std::vector<VWFeatureBase*> > VWFeatureBase::s_targetFeatures;
std::map<std::string, size_t> VWFeatureBase::s_targetContextLength;
void VWFeatureBase::UpdateContextSize(const std::string &usedBy) {
// using the standard map behavior here: if the entry does not
// exist, it will be added and initialized to zero
size_t currentSize = s_targetContextLength[usedBy];
size_t newSize = static_cast<VWFeatureContext *>(this)->GetContextSize();
s_targetContextLength[usedBy] = std::max(currentSize, newSize);
}
}
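// Example (sizes made up for illustration): if a classifier named VW0 uses a
// target-context feature with size=2 and another with size=3, both register
// through UpdateRegister() and call UpdateContextSize("VW0"), so
// GetMaximumContextSize("VW0") returns 3 and the decoder keeps three words of
// target context in its VW state.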

View File

@@ -12,11 +12,17 @@
namespace Moses
{
enum VWFeatureType {
vwft_source,
vwft_target,
vwft_targetContext
};
class VWFeatureBase : public StatelessFeatureFunction
{
public:
VWFeatureBase(const std::string &line, bool isSource = true)
: StatelessFeatureFunction(0, line), m_usedBy(1, "VW0"), m_isSource(isSource) {
VWFeatureBase(const std::string &line, VWFeatureType featureType = vwft_source)
: StatelessFeatureFunction(0, line), m_usedBy(1, "VW0"), m_featureType(featureType) {
// defaults
m_sourceFactors.push_back(0);
m_targetFactors.push_back(0);
@@ -71,26 +77,47 @@ public:
return s_sourceFeatures[name];
}
// Return only target-context classifier features
static const std::vector<VWFeatureBase*>& GetTargetContextFeatures(std::string name = "VW0") {
// don't throw an exception when there are no target-context features; this feature type is not mandatory
return s_targetContextFeatures[name];
}
// Return only target-dependent classifier features
static const std::vector<VWFeatureBase*>& GetTargetFeatures(std::string name = "VW0") {
UTIL_THROW_IF2(s_targetFeatures.count(name) == 0, "No target features registered for parent classifier: " + name);
return s_targetFeatures[name];
}
// Required context length (the maximum context size over all defined target-context features)
static size_t GetMaximumContextSize(std::string name = "VW0") {
return s_targetContextLength[name]; // 0 by default
}
// Overload to process source-dependent data, create features once for every
// source sentence word range.
virtual void operator()(const InputType &input
, const InputPath &inputPath
, const Range &sourceRange
, Discriminative::Classifier &classifier) const = 0;
, Discriminative::Classifier &classifier
, Discriminative::FeatureVector &outFeatures) const = 0;
// Overload to process target-dependent features, create features once for
// every target phrase. One source word range will have at leat one target
// every target phrase. One source word range will have at least one target
// phrase, but may have more.
virtual void operator()(const InputType &input
, const InputPath &inputPath
, const TargetPhrase &targetPhrase
, Discriminative::Classifier &classifier) const = 0;
, Discriminative::Classifier &classifier
, Discriminative::FeatureVector &outFeatures) const = 0;
// Overload to process target-context dependent features, these features are
// evaluated during decoding. For efficiency, features are not fed directly into
// the classifier object but instead output in the vector "outFeatures" and managed
// separately in VW.h.
virtual void operator()(const InputType &input
, const Phrase &contextPhrase
, const AlignmentInfo &alignmentInfo
, Discriminative::Classifier &classifier
, Discriminative::FeatureVector &outFeatures) const = 0;
protected:
std::vector<FactorType> m_sourceFactors, m_targetFactors;
@@ -99,10 +126,15 @@ protected:
for(std::vector<std::string>::const_iterator it = m_usedBy.begin();
it != m_usedBy.end(); it++) {
s_features[*it].push_back(this);
if(m_isSource)
if(m_featureType == vwft_source) {
s_sourceFeatures[*it].push_back(this);
else
} else if (m_featureType == vwft_targetContext) {
s_targetContextFeatures[*it].push_back(this);
UpdateContextSize(*it);
} else {
s_targetFeatures[*it].push_back(this);
}
}
}
@@ -112,11 +144,16 @@ private:
Tokenize(m_usedBy, usedBy, ",");
}
void UpdateContextSize(const std::string &usedBy);
std::vector<std::string> m_usedBy;
bool m_isSource;
VWFeatureType m_featureType;
static std::map<std::string, std::vector<VWFeatureBase*> > s_features;
static std::map<std::string, std::vector<VWFeatureBase*> > s_sourceFeatures;
static std::map<std::string, std::vector<VWFeatureBase*> > s_targetContextFeatures;
static std::map<std::string, std::vector<VWFeatureBase*> > s_targetFeatures;
static std::map<std::string, size_t> s_targetContextLength;
};
}

View File

@@ -0,0 +1,116 @@
#pragma once
#include <string>
#include <set>
#include <vector>
#include <boost/foreach.hpp>
#include "VWFeatureBase.h"
#include "moses/InputType.h"
#include "moses/TypeDef.h"
#include "moses/Word.h"
namespace Moses
{
// Inherit from this for target-context classifier features. They will
// automatically register with the classifier class named VW0 or with one or more
// names specified by the used-by=name1,name2,... parameter.
//
// The classifier gets a full list by calling
// VWFeatureBase::GetTargetContextFeatures(GetScoreProducerDescription())
class VWFeatureContext : public VWFeatureBase
{
public:
VWFeatureContext(const std::string &line, size_t contextSize)
: VWFeatureBase(line, vwft_targetContext), m_contextSize(contextSize) {
}
// Gets its pure virtual functions from VWFeatureBase
virtual void operator()(const InputType &input
, const TargetPhrase &targetPhrase
, Discriminative::Classifier &classifier
, Discriminative::FeatureVector &outFeatures) const {
}
virtual void operator()(const InputType &input
, const Range &sourceRange
, Discriminative::Classifier &classifier
, Discriminative::FeatureVector &outFeatures) const {
}
virtual void SetParameter(const std::string& key, const std::string& value) {
if (key == "size") {
m_contextSize = Scan<size_t>(value);
} else if (key == "factor-positions") {
// factor positions: assuming a factor such as positional morphological tag, use this
// option to select only certain positions; this assumes that only a single
// target-side factor is defined
Tokenize<size_t>(m_factorPositions, value, ",");
} else {
VWFeatureBase::SetParameter(key, value);
}
}
size_t GetContextSize() {
return m_contextSize;
}
protected:
// Get word with the correct subset of factors as string. Because we're target
// context features, we look at a limited number of words to the left of the
// current translation. posFromEnd is interpreted like this:
// 0 = last word of the hypothesis
// 1 = next to last word
// ...etc.
inline std::string GetWord(const Phrase &phrase, size_t posFromEnd) const {
const Word &word = phrase.GetWord(phrase.GetSize() - posFromEnd - 1);
if (m_factorPositions.empty()) {
return word.GetString(m_targetFactors, false);
} else {
if (m_targetFactors.size() != 1)
UTIL_THROW2("You can only use factor-positions when a single target-side factor is defined.");
const std::string &fullFactor = word.GetFactor(m_targetFactors[0])->GetString().as_string();
// corner cases: at sentence beginning/end, we don't have the correct factors set up
// similarly for UNK
if (fullFactor == BOS_ || fullFactor == EOS_ || fullFactor == UNKNOWN_FACTOR)
return fullFactor;
std::string subFactor(m_factorPositions.size(), 'x'); // initialize string with correct size and placeholder chars
for (size_t i = 0; i < m_factorPositions.size(); i++)
subFactor[i] = fullFactor[m_factorPositions[i]];
return subFactor;
}
}
// some target-context feature functions also look at the source
inline std::string GetSourceWord(const InputType &input, size_t pos) const {
return input.GetWord(pos).GetString(m_sourceFactors, false);
}
// get source words aligned to a particular context word
std::vector<std::string> GetAlignedSourceWords(const Phrase &contextPhrase
, const InputType &input
, const AlignmentInfo &alignInfo
, size_t posFromEnd) const {
size_t idx = contextPhrase.GetSize() - posFromEnd - 1;
std::set<size_t> alignedToTarget = alignInfo.GetAlignmentsForTarget(idx);
std::vector<std::string> out;
out.reserve(alignedToTarget.size());
BOOST_FOREACH(size_t srcIdx, alignedToTarget) {
out.push_back(GetSourceWord(input, srcIdx));
}
return out;
}
// required context size
size_t m_contextSize;
// factor positions: assuming a factor such as positional morphological tag, use this
// option to select only certain positions
std::vector<size_t> m_factorPositions;
};
}
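A small self-contained sketch (illustrative only; the factor string and positions are made up) of the index arithmetic in GetWord and the factor-positions extraction above:

#include <cassert>
#include <string>
#include <vector>
// mirrors the posFromEnd indexing in VWFeatureContext::GetWord
size_t PhrasePos(size_t phraseSize, size_t posFromEnd) {
return phraseSize - posFromEnd - 1;
}
// mirrors the factor-positions sub-string extraction
std::string SubFactor(const std::string &fullFactor, const std::vector<size_t> &positions) {
std::string out(positions.size(), 'x');
for (size_t i = 0; i < positions.size(); i++)
out[i] = fullFactor[positions[i]];
return out;
}
int main() {
assert(PhrasePos(5, 0) == 4); // last word of the context phrase
assert(PhrasePos(5, 1) == 3); // next-to-last word
std::vector<size_t> positions;
positions.push_back(0);
positions.push_back(2);
assert(SubFactor("N4sf", positions) == "Ns"); // made-up positional morphological tag
return 0;
}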

View File

@@ -0,0 +1,40 @@
#pragma once
#include <string>
#include <algorithm>
#include "VWFeatureContext.h"
#include "moses/Util.h"
namespace Moses
{
class VWFeatureContextBigrams : public VWFeatureContext
{
public:
VWFeatureContextBigrams(const std::string &line)
: VWFeatureContext(line, DEFAULT_WINDOW_SIZE) {
ReadParameters();
// Call this last
VWFeatureBase::UpdateRegister();
}
virtual void operator()(const InputType &input
, const Phrase &contextPhrase
, const AlignmentInfo &alignmentInfo
, Discriminative::Classifier &classifier
, Discriminative::FeatureVector &outFeatures) const {
for (size_t i = 1; i < m_contextSize; i++)
outFeatures.push_back(classifier.AddLabelIndependentFeature("tcbigram^-" + SPrint(i + 1)
+ "^" + GetWord(contextPhrase, i - 1) + "^" + GetWord(contextPhrase, i)));
}
virtual void SetParameter(const std::string& key, const std::string& value) {
VWFeatureContext::SetParameter(key, value);
}
private:
static const int DEFAULT_WINDOW_SIZE = 1;
};
}
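For illustration (context words made up): with size=3 and a target context ending in "the black cat", the loop above emits the label-independent features:

tcbigram^-2^cat^black
tcbigram^-3^black^the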

View File

@@ -0,0 +1,45 @@
#pragma once
#include <string>
#include <boost/foreach.hpp>
#include <algorithm>
#include "VWFeatureContext.h"
#include "moses/Util.h"
namespace Moses
{
class VWFeatureContextBilingual : public VWFeatureContext
{
public:
VWFeatureContextBilingual(const std::string &line)
: VWFeatureContext(line, DEFAULT_WINDOW_SIZE) {
ReadParameters();
// Call this last
VWFeatureBase::UpdateRegister();
}
virtual void operator()(const InputType &input
, const Phrase &contextPhrase
, const AlignmentInfo &alignmentInfo
, Discriminative::Classifier &classifier
, Discriminative::FeatureVector &outFeatures) const {
for (size_t i = 0; i < m_contextSize; i++) {
std::string tgtWord = GetWord(contextPhrase, i);
std::vector<std::string> alignedTo = GetAlignedSourceWords(contextPhrase, input, alignmentInfo, i);
BOOST_FOREACH(const std::string &srcWord, alignedTo) {
outFeatures.push_back(classifier.AddLabelIndependentFeature("tcblng^-" + SPrint(i + 1) + "^" + tgtWord + "^" + srcWord));
}
}
}
virtual void SetParameter(const std::string& key, const std::string& value) {
VWFeatureContext::SetParameter(key, value);
}
private:
static const int DEFAULT_WINDOW_SIZE = 1;
};
}

View File

@@ -0,0 +1,39 @@
#pragma once
#include <string>
#include <algorithm>
#include "VWFeatureContext.h"
#include "moses/Util.h"
namespace Moses
{
class VWFeatureContextWindow : public VWFeatureContext
{
public:
VWFeatureContextWindow(const std::string &line)
: VWFeatureContext(line, DEFAULT_WINDOW_SIZE) {
ReadParameters();
// Call this last
VWFeatureBase::UpdateRegister();
}
virtual void operator()(const InputType &input
, const Phrase &contextPhrase
, const AlignmentInfo &alignmentInfo
, Discriminative::Classifier &classifier
, Discriminative::FeatureVector &outFeatures) const {
for (size_t i = 0; i < m_contextSize; i++)
outFeatures.push_back(classifier.AddLabelIndependentFeature("tcwin^-" + SPrint(i + 1) + "^" + GetWord(contextPhrase, i)));
}
virtual void SetParameter(const std::string& key, const std::string& value) {
VWFeatureContext::SetParameter(key, value);
}
private:
static const int DEFAULT_WINDOW_SIZE = 1;
};
}

View File

@@ -19,15 +19,22 @@ class VWFeatureSource : public VWFeatureBase
{
public:
VWFeatureSource(const std::string &line)
: VWFeatureBase(line, true) {
: VWFeatureBase(line, vwft_source) {
}
// Gets its pure virtual functions from VWFeatureBase
virtual void operator()(const InputType &input
, const InputPath &inputPath
, const TargetPhrase &targetPhrase
, Discriminative::Classifier &classifier) const {
, Discriminative::Classifier &classifier
, Discriminative::FeatureVector &outFeatures) const {
}
virtual void operator()(const InputType &input
, const Phrase &contextPhrase
, const AlignmentInfo &alignmentInfo
, Discriminative::Classifier &classifier
, Discriminative::FeatureVector &outFeatures) const {
}
virtual void SetParameter(const std::string& key, const std::string& value) {


@ -18,11 +18,11 @@ public:
}
void operator()(const InputType &input
, const InputPath &inputPath
, const Range &sourceRange
, Discriminative::Classifier &classifier) const {
, Discriminative::Classifier &classifier
, Discriminative::FeatureVector &outFeatures) const {
for (size_t i = 0; i < input.GetSize(); i++) {
classifier.AddLabelIndependentFeature("bow^" + GetWord(input, i));
outFeatures.push_back(classifier.AddLabelIndependentFeature("bow^" + GetWord(input, i)));
}
}


@ -18,11 +18,11 @@ public:
}
void operator()(const InputType &input
, const InputPath &inputPath
, const Range &sourceRange
, Discriminative::Classifier &classifier) const {
, Discriminative::Classifier &classifier
, Discriminative::FeatureVector &outFeatures) const {
for (size_t i = 1; i < input.GetSize(); i++) {
classifier.AddLabelIndependentFeature("bigram^" + GetWord(input, i - 1) + "^" + GetWord(input, i));
outFeatures.push_back(classifier.AddLabelIndependentFeature("bigram^" + GetWord(input, i - 1) + "^" + GetWord(input, i)));
}
}


@ -23,12 +23,12 @@ public:
}
void operator()(const InputType &input
, const InputPath &inputPath
, const Range &sourceRange
, Discriminative::Classifier &classifier) const {
, Discriminative::Classifier &classifier
, Discriminative::FeatureVector &outFeatures) const {
const Features& features = *m_tls.GetStored();
for (size_t i = 0; i < features.size(); i++) {
classifier.AddLabelIndependentFeature("srcext^" + features[i]);
outFeatures.push_back(classifier.AddLabelIndependentFeature("srcext^" + features[i]));
}
}


@ -20,9 +20,9 @@ public:
}
void operator()(const InputType &input
, const InputPath &inputPath
, const Range &sourceRange
, Discriminative::Classifier &classifier) const {
, Discriminative::Classifier &classifier
, Discriminative::FeatureVector &outFeatures) const {
size_t begin = sourceRange.GetStartPos();
size_t end = sourceRange.GetEndPos() + 1;
@ -31,7 +31,7 @@ public:
for (size_t i = 0; i < end - begin; i++)
words[i] = GetWord(input, begin + i);
classifier.AddLabelIndependentFeature("sind^" + Join(" ", words));
outFeatures.push_back(classifier.AddLabelIndependentFeature("sind^" + Join(" ", words)));
}
virtual void SetParameter(const std::string& key, const std::string& value) {


@ -20,14 +20,14 @@ public:
}
void operator()(const InputType &input
, const InputPath &inputPath
, const Range &sourceRange
, Discriminative::Classifier &classifier) const {
, Discriminative::Classifier &classifier
, Discriminative::FeatureVector &outFeatures) const {
size_t begin = sourceRange.GetStartPos();
size_t end = sourceRange.GetEndPos() + 1;
while (begin < end) {
classifier.AddLabelIndependentFeature("sin^" + GetWord(input, begin++));
outFeatures.push_back(classifier.AddLabelIndependentFeature("sin^" + GetWord(input, begin++)));
}
}


@ -51,9 +51,9 @@ public:
}
void operator()(const InputType &input
, const InputPath &inputPath
, const Range &sourceRange
, Discriminative::Classifier &classifier) const {
, Discriminative::Classifier &classifier
, Discriminative::FeatureVector &outFeatures) const {
int begin = sourceRange.GetStartPos();
int end = sourceRange.GetEndPos() + 1;
int inputLen = input.GetSize();
@ -64,24 +64,24 @@ public:
// before current phrase
for (int i = std::max(0, begin - m_size); i < begin; i++) {
BOOST_FOREACH(const Sense &sense, senses[i]) {
classifier.AddLabelIndependentFeature("snsb^" + forms[i] + SPrint(i - begin) + "^" + sense.m_label, sense.m_prob);
classifier.AddLabelIndependentFeature("snsb^" + forms[i] + sense.m_label, sense.m_prob);
outFeatures.push_back(classifier.AddLabelIndependentFeature("snsb^" + forms[i] + SPrint(i - begin) + "^" + sense.m_label, sense.m_prob));
outFeatures.push_back(classifier.AddLabelIndependentFeature("snsb^" + forms[i] + sense.m_label, sense.m_prob));
}
}
// within current phrase
for (int i = begin; i < end; i++) {
BOOST_FOREACH(const Sense &sense, senses[i]) {
classifier.AddLabelIndependentFeature("snsin^" + forms[i] + SPrint(i - begin) + "^" + sense.m_label, sense.m_prob);
classifier.AddLabelIndependentFeature("snsin^" + forms[i] + sense.m_label, sense.m_prob);
outFeatures.push_back(classifier.AddLabelIndependentFeature("snsin^" + forms[i] + SPrint(i - begin) + "^" + sense.m_label, sense.m_prob));
outFeatures.push_back(classifier.AddLabelIndependentFeature("snsin^" + forms[i] + sense.m_label, sense.m_prob));
}
}
// after current phrase
for (int i = end; i < std::min(end + m_size, inputLen); i++) {
BOOST_FOREACH(const Sense &sense, senses[i]) {
classifier.AddLabelIndependentFeature("snsa^" + forms[i] + SPrint(i - begin) + "^" + sense.m_label, sense.m_prob);
classifier.AddLabelIndependentFeature("snsa^" + forms[i] + sense.m_label, sense.m_prob);
outFeatures.push_back(classifier.AddLabelIndependentFeature("snsa^" + forms[i] + SPrint(i - begin) + "^" + sense.m_label, sense.m_prob));
outFeatures.push_back(classifier.AddLabelIndependentFeature("snsa^" + forms[i] + sense.m_label, sense.m_prob));
}
}
}
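The sense-window extractor emits each sense twice per position, once anchored to the offset from the current phrase and once backed off to the bare form, both weighted by the sense probability, so sparse position-specific evidence can fall back on position-independent evidence. A toy sketch of the pair in VW's textual name:value form, using a hypothetical sense distribution for "bank":

#include <iostream>
#include <string>

int main() {
  // hypothetical disambiguation output: "bank" is river_bank with p = 0.7
  std::string form = "bank", label = "river_bank";
  float prob = 0.7f;
  int offset = -2; // position relative to the start of the current phrase
  // position-specific variant and backed-off variant, both weighted by prob
  std::cout << "snsb^" << form << offset << "^" << label << ":" << prob << "\n";
  std::cout << "snsb^" << form << label << ":" << prob << "\n";
  return 0;
}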


@ -20,19 +20,19 @@ public:
}
void operator()(const InputType &input
, const InputPath &inputPath
, const Range &sourceRange
, Discriminative::Classifier &classifier) const {
, Discriminative::Classifier &classifier
, Discriminative::FeatureVector &outFeatures) const {
int begin = sourceRange.GetStartPos();
int end = sourceRange.GetEndPos() + 1;
int inputLen = input.GetSize();
for (int i = std::max(0, begin - m_size); i < begin; i++) {
classifier.AddLabelIndependentFeature("c^" + SPrint(i - begin) + "^" + GetWord(input, i));
outFeatures.push_back(classifier.AddLabelIndependentFeature("c^" + SPrint(i - begin) + "^" + GetWord(input, i)));
}
for (int i = end; i < std::min(end + m_size, inputLen); i++) {
classifier.AddLabelIndependentFeature("c^" + SPrint(i - end + 1) + "^" + GetWord(input, i));
outFeatures.push_back(classifier.AddLabelIndependentFeature("c^" + SPrint(i - end + 1) + "^" + GetWord(input, i)));
}
}


@ -17,15 +17,22 @@ class VWFeatureTarget : public VWFeatureBase
{
public:
VWFeatureTarget(const std::string &line)
: VWFeatureBase(line, false) {
: VWFeatureBase(line, vwft_target) {
}
// Gets its pure virtual functions from VWFeatureBase
virtual void operator()(const InputType &input
, const InputPath &inputPath
, const Range &sourceRange
, Discriminative::Classifier &classifier) const {
, Discriminative::Classifier &classifier
, Discriminative::FeatureVector &outFeatures) const {
}
virtual void operator()(const InputType &input
, const Phrase &contextPhrase
, const AlignmentInfo &alignmentInfo
, Discriminative::Classifier &classifier
, Discriminative::FeatureVector &outFeatures) const {
}
virtual void SetParameter(const std::string& key, const std::string& value) {


@ -17,11 +17,11 @@ public:
}
void operator()(const InputType &input
, const InputPath &inputPath
, const TargetPhrase &targetPhrase
, Discriminative::Classifier &classifier) const {
, Discriminative::Classifier &classifier
, Discriminative::FeatureVector &outFeatures) const {
for (size_t i = 1; i < targetPhrase.GetSize(); i++) {
classifier.AddLabelDependentFeature("tbigram^" + GetWord(targetPhrase, i - 1) + "^" + GetWord(targetPhrase, i));
outFeatures.push_back(classifier.AddLabelDependentFeature("tbigram^" + GetWord(targetPhrase, i - 1) + "^" + GetWord(targetPhrase, i)));
}
}


@ -17,10 +17,10 @@ public:
}
void operator()(const InputType &input
, const InputPath &inputPath
, const TargetPhrase &targetPhrase
, Discriminative::Classifier &classifier) const {
classifier.AddLabelDependentFeature("tind^" + targetPhrase.GetStringRep(m_targetFactors));
, Discriminative::Classifier &classifier
, Discriminative::FeatureVector &outFeatures) const {
outFeatures.push_back(classifier.AddLabelDependentFeature("tind^" + targetPhrase.GetStringRep(m_targetFactors)));
}
virtual void SetParameter(const std::string& key, const std::string& value) {


@ -17,11 +17,11 @@ public:
}
void operator()(const InputType &input
, const InputPath &inputPath
, const TargetPhrase &targetPhrase
, Discriminative::Classifier &classifier) const {
, Discriminative::Classifier &classifier
, Discriminative::FeatureVector &outFeatures) const {
for (size_t i = 0; i < targetPhrase.GetSize(); i++) {
classifier.AddLabelDependentFeature("tin^" + GetWord(targetPhrase, i));
outFeatures.push_back(classifier.AddLabelDependentFeature("tin^" + GetWord(targetPhrase, i)));
}
}


@ -20,9 +20,9 @@ public:
}
void operator()(const InputType &input
, const InputPath &inputPath
, const TargetPhrase &targetPhrase
, Discriminative::Classifier &classifier) const {
, Discriminative::Classifier &classifier
, Discriminative::FeatureVector &outFeatures) const {
std::vector<FeatureFunction*> features = FeatureFunction::GetFeatureFunctions();
for (size_t i = 0; i < features.size(); i++) {
std::string fname = features[i]->GetScoreProducerDescription();
@ -31,7 +31,7 @@ public:
std::vector<float> scores = targetPhrase.GetScoreBreakdown().GetScoresForProducer(features[i]);
for(size_t j = 0; j < scores.size(); ++j)
classifier.AddLabelDependentFeature(fname + "^" + boost::lexical_cast<std::string>(j), scores[j]);
outFeatures.push_back(classifier.AddLabelDependentFeature(fname + "^" + boost::lexical_cast<std::string>(j), scores[j]));
}
}

moses/FF/VW/VWState.cpp Normal file

@ -0,0 +1,70 @@
#include "VWState.h"
#include "moses/FF/FFState.h"
#include "moses/Phrase.h"
#include "moses/Hypothesis.h"
#include "moses/Util.h"
#include "moses/TypeDef.h"
#include "moses/StaticData.h"
#include "moses/TranslationOption.h"
#include <boost/functional/hash.hpp>
namespace Moses {
VWState::VWState() : m_spanStart(0), m_spanEnd(0) {
ComputeHash();
}
VWState::VWState(const Phrase &phrase)
: m_phrase(phrase), m_spanStart(0), m_spanEnd(0) {
ComputeHash();
}
VWState::VWState(const VWState &prevState, const Hypothesis &curHypo) {
VERBOSE(3, "VW :: updating state\n>> previous state: " << prevState << "\n");
// copy phrase from previous state
Phrase phrase = prevState.GetPhrase();
size_t contextSize = phrase.GetSize(); // identical to VWFeatureBase::GetMaximumContextSize()
// add words from current hypothesis
phrase.Append(curHypo.GetCurrTargetPhrase());
VERBOSE(3, ">> current hypo: " << curHypo.GetCurrTargetPhrase() << "\n");
// get a slice of appropriate length
Range range(phrase.GetSize() - contextSize, phrase.GetSize() - 1);
m_phrase = phrase.GetSubString(range);
// set current span start/end
m_spanStart = curHypo.GetTranslationOption().GetStartPos();
m_spanEnd = curHypo.GetTranslationOption().GetEndPos();
// compute our hash
ComputeHash();
VERBOSE(3, ">> updated state: " << *this << "\n");
}
bool VWState::operator==(const FFState& o) const {
const VWState &other = static_cast<const VWState &>(o);
return m_phrase == other.GetPhrase()
&& m_spanStart == other.GetSpanStart()
&& m_spanEnd == other.GetSpanEnd();
}
void VWState::ComputeHash() {
m_hash = 0;
boost::hash_combine(m_hash, m_phrase);
boost::hash_combine(m_hash, m_spanStart);
boost::hash_combine(m_hash, m_spanEnd);
}
std::ostream &operator<<(std::ostream &out, const VWState &state) {
out << state.GetPhrase() << "::" << state.GetSpanStart() << "-" << state.GetSpanEnd();
return out;
}
}
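The state update is effectively a fixed-size sliding window over the growing target string: append the newly translated phrase, then keep only the last contextSize words. A minimal sketch of the same idea over plain strings (not the Moses Phrase API):

#include <iostream>
#include <string>
#include <vector>

// keep only the most recent contextSize words after appending a new phrase
std::vector<std::string> UpdateContext(std::vector<std::string> context,
                                       const std::vector<std::string> &newPhrase,
                                       size_t contextSize) {
  context.insert(context.end(), newPhrase.begin(), newPhrase.end());
  if (context.size() > contextSize)
    context.erase(context.begin(), context.end() - contextSize);
  return context;
}

int main() {
  std::vector<std::string> ctx(3, "<s>"); // initial state: all sentence-start words
  std::vector<std::string> hypo;
  hypo.push_back("the");
  hypo.push_back("green");
  ctx = UpdateContext(ctx, hypo, 3);
  for (size_t i = 0; i < ctx.size(); i++)
    std::cout << ctx[i] << " "; // prints: <s> the green
  std::cout << "\n";
  return 0;
}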

moses/FF/VW/VWState.h Normal file

@ -0,0 +1,54 @@
#pragma once
#include <ostream>
#include "moses/FF/FFState.h"
#include "moses/Phrase.h"
#include "moses/Hypothesis.h"
namespace Moses {
/**
* VW state, used in decoding (when target context is enabled).
*/
class VWState : public FFState {
public:
// empty state, used only when VWState is ignored
VWState();
// used for construction of the initial VW state
VWState(const Phrase &phrase);
// continue from previous VW state with a new hypothesis
VWState(const VWState &prevState, const Hypothesis &curHypo);
virtual bool operator==(const FFState& o) const;
inline virtual size_t hash() const {
return m_hash;
}
inline const Phrase &GetPhrase() const {
return m_phrase;
}
inline size_t GetSpanStart() const {
return m_spanStart;
}
inline size_t GetSpanEnd() const {
return m_spanEnd;
}
private:
void ComputeHash();
Phrase m_phrase;
size_t m_spanStart, m_spanEnd;
size_t m_hash;
};
// how to print a VW state
std::ostream &operator<<(std::ostream &out, const VWState &state);
}


@ -0,0 +1,54 @@
#pragma once
#include <vector>
#include "moses/AlignmentInfo.h"
#include "moses/Phrase.h"
#include "AlignmentConstraint.h"
namespace Moses
{
/**
* VW thread-specific data about target sentence.
*/
class VWTargetSentence {
public:
VWTargetSentence() : m_sentence(NULL), m_alignment(NULL) {}
void Clear() {
  delete m_sentence;
  delete m_alignment;
  // null out so a repeated Clear() (or the destructor) cannot double-delete
  m_sentence = NULL;
  m_alignment = NULL;
}
~VWTargetSentence() {
Clear();
}
void SetConstraints(size_t sourceSize) {
// initialize to unconstrained
m_sourceConstraints.assign(sourceSize, AlignmentConstraint());
m_targetConstraints.assign(m_sentence->GetSize(), AlignmentConstraint());
// set constraints according to alignment points
AlignmentInfo::const_iterator it;
for (it = m_alignment->begin(); it != m_alignment->end(); it++) {
int src = it->first;
int tgt = it->second;
if (src >= m_sourceConstraints.size() || tgt >= m_targetConstraints.size()) {
UTIL_THROW2("VW :: alignment point out of bounds: " << src << "-" << tgt);
}
m_sourceConstraints[src].Update(tgt);
m_targetConstraints[tgt].Update(src);
}
}
Phrase *m_sentence;
AlignmentInfo *m_alignment;
std::vector<AlignmentConstraint> m_sourceConstraints, m_targetConstraints;
};
}
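SetConstraints compresses the full alignment into one interval per word: every source word records the leftmost and rightmost target positions it touches, and symmetrically for target words. A compact, self-contained sketch with the hypothetical alignment {0-1, 2-1, 2-3}:

#include <algorithm>
#include <iostream>
#include <limits>
#include <utility>
#include <vector>

struct Constraint {
  int min, max;
  Constraint() : min(std::numeric_limits<int>::max()), max(-1) {}
  void Update(int p) { min = std::min(min, p); max = std::max(max, p); }
};

int main() {
  std::pair<int, int> points[3] = {
    std::make_pair(0, 1), std::make_pair(2, 1), std::make_pair(2, 3)
  };
  std::vector<Constraint> src(3), tgt(4);
  for (size_t i = 0; i < 3; i++) {
    src[points[i].first].Update(points[i].second);
    tgt[points[i].second].Update(points[i].first);
  }
  std::cout << "source word 2 spans targets [" << src[2].min << "," << src[2].max << "]\n"; // [1,3]
  std::cout << "target word 1 spans sources [" << tgt[1].min << "," << tgt[1].max << "]\n"; // [0,2]
  return 0;
}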


@ -24,6 +24,8 @@ class ezexample;
namespace Discriminative
{
typedef std::pair<uint32_t, float> FeatureType; // feature hash (=ID) and value
typedef std::vector<FeatureType> FeatureVector;
/**
* Abstract class to be implemented by classifiers.
@ -34,12 +36,22 @@ public:
/**
* Add a feature that does not depend on the class (label).
*/
virtual void AddLabelIndependentFeature(const StringPiece &name, float value) = 0;
virtual FeatureType AddLabelIndependentFeature(const StringPiece &name, float value) = 0;
/**
* Add a feature that is specific for the given class.
*/
virtual void AddLabelDependentFeature(const StringPiece &name, float value) = 0;
virtual FeatureType AddLabelDependentFeature(const StringPiece &name, float value) = 0;
/**
* Efficient addition of features when their IDs are already computed.
*/
virtual void AddLabelIndependentFeatureVector(const FeatureVector &features) = 0;
/**
* Efficient addition of features when their IDs are already computed.
*/
virtual void AddLabelDependentFeatureVector(const FeatureVector &features) = 0;
/**
* Train using current example. Use loss to distinguish positive and negative training examples.
@ -54,12 +66,12 @@ public:
virtual float Predict(const StringPiece &label) = 0;
// helper methods for indicator features
void AddLabelIndependentFeature(const StringPiece &name) {
AddLabelIndependentFeature(name, 1.0);
FeatureType AddLabelIndependentFeature(const StringPiece &name) {
return AddLabelIndependentFeature(name, 1.0);
}
void AddLabelDependentFeature(const StringPiece &name) {
AddLabelDependentFeature(name, 1.0);
FeatureType AddLabelDependentFeature(const StringPiece &name) {
return AddLabelDependentFeature(name, 1.0);
}
virtual ~Classifier() {}
@ -95,8 +107,10 @@ public:
VWTrainer(const std::string &outputFile);
virtual ~VWTrainer();
virtual void AddLabelIndependentFeature(const StringPiece &name, float value);
virtual void AddLabelDependentFeature(const StringPiece &name, float value);
virtual FeatureType AddLabelIndependentFeature(const StringPiece &name, float value);
virtual FeatureType AddLabelDependentFeature(const StringPiece &name, float value);
virtual void AddLabelIndependentFeatureVector(const FeatureVector &features);
virtual void AddLabelDependentFeatureVector(const FeatureVector &features);
virtual void Train(const StringPiece &label, float loss);
virtual float Predict(const StringPiece &label);
@ -121,15 +135,17 @@ public:
VWPredictor(const std::string &modelFile, const std::string &vwOptions);
virtual ~VWPredictor();
virtual void AddLabelIndependentFeature(const StringPiece &name, float value);
virtual void AddLabelDependentFeature(const StringPiece &name, float value);
virtual FeatureType AddLabelIndependentFeature(const StringPiece &name, float value);
virtual FeatureType AddLabelDependentFeature(const StringPiece &name, float value);
virtual void AddLabelIndependentFeatureVector(const FeatureVector &features);
virtual void AddLabelDependentFeatureVector(const FeatureVector &features);
virtual void Train(const StringPiece &label, float loss);
virtual float Predict(const StringPiece &label);
friend class ClassifierFactory;
protected:
void AddFeature(const StringPiece &name, float values);
FeatureType AddFeature(const StringPiece &name, float values);
::vw *m_VWInstance, *m_VWParser;
::ezexample *m_ex;
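Returning the (hash, value) pair from each Add*Feature call is what enables caching: label-independent context features can be hashed once, stored in a FeatureVector, and replayed cheaply through AddLabelIndependentFeatureVector for every translation option that shares the same context. A sketch of that usage against the interface above (the feature strings and the driving loop are illustrative, not the decoder's actual control flow):

#include <string>
#include <vector>
#include "Classifier.h" // the interface declared above

// sketch only: hash shared context features once, replay them afterwards
void ScoreOptions(Discriminative::Classifier &classifier,
                  const std::vector<std::string> &targetReprs) {
  Discriminative::FeatureVector cached;
  for (size_t i = 0; i < targetReprs.size(); i++) {
    if (i == 0) {
      // first option: hash the context features and remember their IDs
      cached.push_back(classifier.AddLabelIndependentFeature("tcwin^-1^house"));
      cached.push_back(classifier.AddLabelIndependentFeature("tcwin^-2^green"));
    } else {
      // remaining options: replay precomputed IDs, no string hashing
      classifier.AddLabelIndependentFeatureVector(cached);
    }
    classifier.AddLabelDependentFeature("tind^" + targetReprs[i]);
    classifier.Predict(targetReprs[i]);
  }
}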


@ -2,6 +2,7 @@
#define moses_Normalizer_h
#include <vector>
#include <algorithm>
#include "Util.h"
namespace Discriminative
@ -45,16 +46,25 @@ public:
virtual ~SquaredLossNormalizer() {}
};
// safe softmax
class LogisticLossNormalizer : public Normalizer
{
public:
virtual void operator()(std::vector<float> &losses) const {
float sum = 0;
std::vector<float>::iterator it;
float sum = 0;
float max = 0;
for (it = losses.begin(); it != losses.end(); it++) {
*it = exp(-*it);
*it = -*it;
max = std::max(max, *it);
}
for (it = losses.begin(); it != losses.end(); it++) {
*it = exp(*it - max);
sum += *it;
}
for (it = losses.begin(); it != losses.end(); it++) {
*it /= sum;
}
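The rewrite above is the standard max-shift softmax: instead of p_i = exp(-l_i) / sum_j exp(-l_j) it computes p_i = exp(-l_i - m) / sum_j exp(-l_j - m) with m = max_j(-l_j), which is algebraically identical but keeps exp() in a safe range. A standalone sketch of the same computation; note it initializes the running maximum to -infinity, a slight hardening over the committed code, which starts at 0 and can underflow the whole sum when every loss is large and positive:

#include <algorithm>
#include <cmath>
#include <iostream>
#include <limits>
#include <vector>

// convert losses to probabilities: p[i] = exp(-loss[i]) / sum_j exp(-loss[j]),
// shifted by the maximum to avoid overflow/underflow
void SafeSoftmax(std::vector<float> &losses) {
  float max = -std::numeric_limits<float>::infinity();
  for (size_t i = 0; i < losses.size(); i++) {
    losses[i] = -losses[i];
    max = std::max(max, losses[i]);
  }
  float sum = 0;
  for (size_t i = 0; i < losses.size(); i++) {
    losses[i] = std::exp(losses[i] - max);
    sum += losses[i];
  }
  for (size_t i = 0; i < losses.size(); i++)
    losses[i] /= sum;
}

int main() {
  std::vector<float> losses;
  losses.push_back(1.0f);
  losses.push_back(2.0f);
  losses.push_back(3.0f);
  SafeSoftmax(losses);
  for (size_t i = 0; i < losses.size(); i++)
    std::cout << losses[i] << " "; // ~0.665 0.245 0.090
  std::cout << "\n";
  return 0;
}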


@ -36,7 +36,7 @@ VWPredictor::~VWPredictor()
VW::finish(*m_VWInstance);
}
void VWPredictor::AddLabelIndependentFeature(const StringPiece &name, float value)
FeatureType VWPredictor::AddLabelIndependentFeature(const StringPiece &name, float value)
{
// label-independent features are kept in a different feature namespace ('s' = source)
@ -48,10 +48,10 @@ void VWPredictor::AddLabelIndependentFeature(const StringPiece &name, float valu
m_ex->addns('s');
if (DEBUG) std::cerr << "VW :: Setting source namespace\n";
}
AddFeature(name, value); // namespace 's' is set up, add the feature
return AddFeature(name, value); // namespace 's' is set up, add the feature
}
void VWPredictor::AddLabelDependentFeature(const StringPiece &name, float value)
FeatureType VWPredictor::AddLabelDependentFeature(const StringPiece &name, float value)
{
// VW does not use the label directly, instead, we do a Cartesian product between source and target feature
// namespaces, where the source namespace ('s') contains label-independent features and the target
@ -63,7 +63,37 @@ void VWPredictor::AddLabelDependentFeature(const StringPiece &name, float value)
m_ex->addns('t');
if (DEBUG) std::cerr << "VW :: Setting target namespace\n";
}
AddFeature(name, value);
return AddFeature(name, value);
}
void VWPredictor::AddLabelIndependentFeatureVector(const FeatureVector &features)
{
if (m_isFirstSource) {
// the first feature of a new example => create the source namespace for
// label-independent features to live in
m_isFirstSource = false;
m_ex->finish();
m_ex->addns('s');
if (DEBUG) std::cerr << "VW :: Setting source namespace\n";
}
// add each feature index using this "low level" call to VW
for (FeatureVector::const_iterator it = features.begin(); it != features.end(); it++)
m_ex->addf(it->first, it->second);
}
void VWPredictor::AddLabelDependentFeatureVector(const FeatureVector &features)
{
if (m_isFirstTarget) {
// the first target-side feature => create namespace 't'
m_isFirstTarget = false;
m_ex->addns('t');
if (DEBUG) std::cerr << "VW :: Setting target namespace\n";
}
// add each feature index using this "low level" call to VW
for (FeatureVector::const_iterator it = features.begin(); it != features.end(); it++)
m_ex->addf(it->first, it->second);
}
void VWPredictor::Train(const StringPiece &label, float loss)
@ -82,10 +112,10 @@ float VWPredictor::Predict(const StringPiece &label)
return loss;
}
void VWPredictor::AddFeature(const StringPiece &name, float value)
FeatureType VWPredictor::AddFeature(const StringPiece &name, float value)
{
if (DEBUG) std::cerr << "VW :: Adding feature: " << EscapeSpecialChars(name.as_string()) << ":" << value << "\n";
m_ex->addf(EscapeSpecialChars(name.as_string()), value);
return std::make_pair(m_ex->addf(EscapeSpecialChars(name.as_string()), value), value);
}
} // namespace Discriminative
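The Cartesian-product comment is the crux of the VW setup: the model never scores the label string itself; it scores the cross of every source-namespace ('s') feature with every target-namespace ('t') feature, i.e. VW's quadratic interactions between the two namespaces (enabled through the user-supplied vw-options, typically something like -q st). A toy sketch of what that cross looks like:

#include <iostream>
#include <string>
#include <vector>

int main() {
  // toy source ('s') and target ('t') namespace features
  std::vector<std::string> src, tgt;
  src.push_back("bow^house");
  src.push_back("c^-1^green");
  tgt.push_back("tind^Haus");
  // quadratic interaction: every source feature crossed with every target one;
  // the classifier learns one weight per pair, so target features only matter
  // through their combinations with the source side
  for (size_t i = 0; i < src.size(); i++)
    for (size_t j = 0; j < tgt.size(); j++)
      std::cout << "s^" << src[i] << " x t^" << tgt[j] << "\n";
  return 0;
}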


@ -25,7 +25,7 @@ VWTrainer::~VWTrainer()
close(m_bfos);
}
void VWTrainer::AddLabelIndependentFeature(const StringPiece &name, float value)
FeatureType VWTrainer::AddLabelIndependentFeature(const StringPiece &name, float value)
{
if (m_isFirstSource) {
if (m_isFirstExample) {
@ -43,9 +43,11 @@ void VWTrainer::AddLabelIndependentFeature(const StringPiece &name, float value)
}
AddFeature(name, value);
return std::make_pair(0, value); // we don't hash features
}
void VWTrainer::AddLabelDependentFeature(const StringPiece &name, float value)
FeatureType VWTrainer::AddLabelDependentFeature(const StringPiece &name, float value)
{
if (m_isFirstTarget) {
m_isFirstTarget = false;
@ -56,6 +58,18 @@ void VWTrainer::AddLabelDependentFeature(const StringPiece &name, float value)
}
AddFeature(name, value);
return std::make_pair(0, value); // we don't hash features
}
void VWTrainer::AddLabelIndependentFeatureVector(const FeatureVector &features)
{
throw logic_error("VW trainer does not support feature IDs.");
}
void VWTrainer::AddLabelDependentFeatureVector(const FeatureVector &features)
{
throw logic_error("VW trainer does not support feature IDs.");
}
void VWTrainer::Train(const StringPiece &label, float loss)