commenting and minor refactoring of beam search

2024-09-11 06:15:56 +03:00 · 2019-02-06 20:25:43 -08:00 · 2019-02-06 20:25:43 -08:00 · f88eb0d368
commit f88eb0d368
parent 7c7f94c416
15 changed files with 193 additions and 127 deletions
--- a/src/common/options.h
+++ b/src/common/options.h
@ -111,7 +111,7 @@ public:
    }
    try {
      return !options_[key].as<std::string>().empty();
-    } catch(const YAML::BadConversion& e) {
+    } catch(const YAML::BadConversion&) {
      ABORT("Option '{}' is neither a sequence nor a text");
    }
    return false;
--- a/src/graph/expression_operators.cpp
+++ b/src/graph/expression_operators.cpp
@ -135,31 +135,49 @@ Expr le(Expr a, float b) { return Expression<CmpNodeOp>(a, a->graph()->constant(
 /*********************************************************/

 Expr operator+(Expr a, float b) {
-  return Expression<ScalarAddNodeOp>(a, b);
+  if (b == 0)
+    return a;
+  else
+    return Expression<ScalarAddNodeOp>(a, b);
 }

 Expr operator+(float a, Expr b) {
-  return Expression<ScalarAddNodeOp>(b, a);
+  if (a == 0)
+    return b;
+  else
+    return Expression<ScalarAddNodeOp>(b, a);
 }

 Expr operator-(Expr a, float b) {
-  return Expression<ScalarAddNodeOp>(a, -b);
+  if (b == 0)
+    return a;
+  else
+    return Expression<ScalarAddNodeOp>(a, -b);
 }

 Expr operator-(float a, Expr b) {
-  return Expression<ScalarAddNodeOp>(-b, a);
+  if (a == 0)
+    return -b;
+  else
+    return Expression<ScalarAddNodeOp>(-b, a);
 }

 Expr operator*(float a, Expr b) {
-  return Expression<ScalarMultNodeOp>(b, a);
+  if (a == 1.0f)
+    return b;
+  else
+    return Expression<ScalarMultNodeOp>(b, a);
 }

 Expr operator*(Expr a, float b) {
-  return Expression<ScalarMultNodeOp>(a, b);
+  if (b == 1.0f)
+    return a;
+  else
+    return Expression<ScalarMultNodeOp>(a, b);
 }

 Expr operator/(Expr a, float b) {
-  return Expression<ScalarMultNodeOp>(a, 1.f / b);
+  return a * (1.f / b);
 }

 // TODO: efficient version of this without constant()
@ -254,7 +272,12 @@ Expr gather(Expr a, int axis, Expr indices) {
  return Expression<GatherNodeOp>(a, axis, indices);
 }

-// index_select() -- gather arbitrary elements along an axis; unbatched (indices are specified as a 1D vector)
+// index_select() -- gather arbitrary elements along an axis from an unbatched
+// input 'a'. Indices are specified as a 1D vector.
+// This is used e.g. for embedding lookup.
+// Note: To use a batch of index vectors, reshape them into a single vector,
+// call index_select(), then reshape the result back. Reshapes are cheap.
+// This function has the same semantics as the PyTorch operation of the same name.
 Expr index_select(Expr a, int axis, Expr indices) {
  ABORT_IF(indices->shape().size() != 1, "Indices must be a 1D tensor");
  // We have specialized kernels for non-batched indexing of first or last axis of a 2D tensor.
--- a/src/microsoft/quicksand.cpp
+++ b/src/microsoft/quicksand.cpp
@ -129,7 +129,7 @@ public:
    QSNBestBatch qsNbestBatch;
    for(const auto& history : histories) { // loop over batch entries
      QSNBest qsNbest;
-      NBestList nbestHyps = history->NBest(SIZE_MAX); // request as many N as we have
+      NBestList nbestHyps = history->nBest(SIZE_MAX); // request as many N as we have
      for (const Result& result : nbestHyps) { // loop over N-best entries
        // get hypothesis word sequence and normalized sentence score
        auto words = std::get<0>(result);
--- a/src/models/transformer.h
+++ b/src/models/transformer.h
@ -864,17 +864,6 @@ public:
  }
 };

-// factory functions
-Ptr<EncoderBase> NewEncoderTransformer(Ptr<Options> options)
-{
-  return New<EncoderTransformer>(options);
-}
-
-Ptr<DecoderBase> NewDecoderTransformer(Ptr<Options> options)
-{
-  return New<DecoderTransformer>(options);
-}
-
 // clang-format on

 }  // namespace marian
--- a/src/models/transformer_factory.h
+++ b/src/models/transformer_factory.h
@ -9,7 +9,6 @@
 //#include "layers/factory.h"

 namespace marian {
-// @TODO: find out why static is required here to get to compile
-static Ptr<EncoderBase> NewEncoderTransformer(Ptr<Options> options);
-static Ptr<DecoderBase> NewDecoderTransformer(Ptr<Options> options);
+Ptr<EncoderBase> NewEncoderTransformer(Ptr<Options> options);
+Ptr<DecoderBase> NewDecoderTransformer(Ptr<Options> options);
 }  // namespace marian
--- a/src/models/transformer_stub.cpp
+++ b/src/models/transformer_stub.cpp
@ -1,4 +1,14 @@
-// TODO: This is a wrapper around transformer.h. We kept the .H name to minimize confusing git, until this is code-reviewed.
-// This is meant to speed-up builds, and to support Ctrl-F7 to rebuild.
-
 #include "models/transformer.h"
+
+namespace marian {
+// factory functions
+Ptr<EncoderBase> NewEncoderTransformer(Ptr<Options> options)
+{
+  return New<EncoderTransformer>(options);
+}
+
+Ptr<DecoderBase> NewDecoderTransformer(Ptr<Options> options)
+{
+  return New<DecoderTransformer>(options);
+}
+}  // namespace marian
--- a/src/tensors/cpu/tensor_operators.cpp
+++ b/src/tensors/cpu/tensor_operators.cpp
@ -15,6 +15,7 @@ namespace marian {
 namespace cpu {

 void IsNan(const Tensor in, Ptr<Allocator> allocator, bool& isNan, bool& isInf, bool zero) {
+  isNan; isInf; zero;
  ABORT("Not implemented");
 }

--- a/src/training/validator.h
+++ b/src/training/validator.h
@ -535,7 +535,7 @@ public:
            std::stringstream best1;
            std::stringstream bestn;
            printer->print(history, best1, bestn);
-            collector->Write((long)history->GetLineNum(),
+            collector->Write((long)history->getLineNum(),
                             best1.str(),
                             bestn.str(),
                             options_->get<bool>("n-best"));
@ -677,14 +677,14 @@ public:
          size_t no = 0;
          std::lock_guard<std::mutex> statsLock(mutex_);
          for(auto history : histories) {
-            auto result = history->Top();
+            auto result = history->top();
            const auto& words = std::get<0>(result);
            updateStats(stats, words, batch, no, vocabs_.back()->getEosId());

            std::stringstream best1;
            std::stringstream bestn;
            printer->print(history, best1, bestn);
-            collector->Write((long)history->GetLineNum(),
+            collector->Write((long)history->getLineNum(),
                             best1.str(),
                             bestn.str(),
                             /*nbest=*/ false);
--- a/src/translator/beam_search.h
+++ b/src/translator/beam_search.h
@ -35,10 +35,10 @@ public:
               const std::vector<float> pathScores,
               size_t vocabSize,
               const Beams& beams,
-               std::vector<Ptr<ScorerState>>& states,
+               const std::vector<Ptr<ScorerState>>& states,
               size_t beamSize,
               bool first,
-               Ptr<data::CorpusBatch> batch) {
+               Ptr<data::CorpusBatch> batch) const {
    Beams newBeams(beams.size());

    std::vector<float> align;
@ -46,46 +46,49 @@ public:
      // Use alignments from the first scorer, even if ensemble
      align = scorers_[0]->getAlignment();

-    for(size_t i = 0; i < keys.size(); ++i) {
+    for(size_t i = 0; i < keys.size(); ++i) { // keys: [beamSize, ?] (flattened)
      // Keys contains indices to vocab items in the entire beam.
      // Values can be between 0 and beamSize * vocabSize.
-      Word embIdx = (Word)(keys[i] % vocabSize);
      auto beamIdx = i / beamSize;

-      // Retrieve short list for final softmax (based on words aligned
-      // to source sentences). If short list has been set, map the indices
-      // in the sub-selected vocabulary matrix back to their original positions.
-      auto shortlist = scorers_[0]->getShortlist();
-      if(shortlist)
-        embIdx = shortlist->reverseMap(embIdx); // @TODO: should reverseMap accept a size_t or a Word?
-
      if(newBeams[beamIdx].size() < beams[beamIdx].size()) {
-        auto& beam = beams[beamIdx];
+        Word wordIdx = (Word)(keys[i] % vocabSize);
+        // Retrieve short list for final softmax (based on words aligned
+        // to source sentences). If short list has been set, map the indices
+        // in the sub-selected vocabulary matrix back to their original positions.
+        auto shortlist = scorers_[0]->getShortlist();
+        if(shortlist)
+          wordIdx = shortlist->reverseMap(wordIdx); // @TODO: should reverseMap accept a size_t or a Word?
+
+        const auto& beam = beams[beamIdx];
        auto& newBeam = newBeams[beamIdx];

-        auto hypIdx = (IndexType)(keys[i] / vocabSize);
-        float pathScore = pathScores[i];
+        const float pathScore = pathScores[i];

-        auto hypIdxTrans
-            = IndexType((hypIdx / beamSize) + (hypIdx % beamSize) * beams.size());
+        // keys[i] = offset into row-major cube of dims [whatIsThis, beamSize, vocabSize]
+        // deconstruct into individual indices
+        const auto hypIdx = (IndexType)(keys[i] / vocabSize);
+        const auto whatIsThis = (hypIdx / beamSize); // @TODO: is this batchIdx?
+        size_t beamHypIdx = hypIdx % beamSize;
+
+        auto hypIdxTrans = IndexType(whatIsThis + beamHypIdx * beams.size());
        if(first)
          hypIdxTrans = hypIdx;

-        size_t beamHypIdx = hypIdx % beamSize;
-        if(beamHypIdx >= (int)beam.size())
+        if(beamHypIdx >= (int)beam.size())  // @TODO: What is this condition? Cf. beamHypIdx = hypIdx % beamSize
          beamHypIdx = beamHypIdx % beam.size();

        if(first)
          beamHypIdx = 0;

-        auto hyp = New<Hypothesis>(beam[beamHypIdx], embIdx, hypIdxTrans, pathScore);
+        auto hyp = New<Hypothesis>(beam[beamHypIdx], wordIdx, hypIdxTrans, pathScore);

        // Set score breakdown for n-best lists
        if(options_->get<bool>("n-best")) {
          std::vector<float> breakDown(states.size(), 0);
          beam[beamHypIdx]->GetScoreBreakdown().resize(states.size(), 0);
          for(size_t j = 0; j < states.size(); ++j) {
-            size_t key = embIdx + hypIdxTrans * vocabSize;
+            size_t key = wordIdx + hypIdxTrans * vocabSize;
            breakDown[j] = states[j]->breakDown(key)
                           + beam[beamHypIdx]->GetScoreBreakdown()[j];
          }
@ -108,7 +111,7 @@ public:
      const std::vector<float> alignAll,
      Ptr<data::CorpusBatch> batch,
      int beamHypIdx,
-      int beamIdx) {
+      int beamIdx) const {
    // Let's B be the beam size, N be the number of batched sentences,
    // and L the number of words in the longest sentence in the batch.
    // The alignment vector:
@ -140,12 +143,13 @@ public:
    return align;
  }

-  Beams pruneBeam(const Beams& beams) {
+  // remove all beam entries that have reached EOS
+  Beams purgeBeams(const Beams& beams) {
    Beams newBeams;
    for(auto beam : beams) {
      Beam newBeam;
      for(auto hyp : beam) {
-        if(hyp->GetWord() != trgEosId_) {
+        if(hyp->getWord() != trgEosId_) {
          newBeam.push_back(hyp);
        }
      }
@ -154,32 +158,29 @@ public:
    return newBeams;
  }

+  //**********************************************************************
  // main decoding function
  Histories search(Ptr<ExpressionGraph> graph, Ptr<data::CorpusBatch> batch) {
    int dimBatch = (int)batch->size();

-    Histories histories;
+    Histories histories(dimBatch);
    for(int i = 0; i < dimBatch; ++i) {
      size_t sentId = batch->getSentenceIds()[i];
-      auto history = New<History>(sentId,
+      histories[i] = New<History>(sentId,
                                  options_->get<float>("normalize"),
                                  options_->get<float>("word-penalty"));
-      histories.push_back(history);
    }

    size_t localBeamSize = beamSize_; // max over beam sizes of active sentence hypotheses

    auto getNBestList = createGetNBestListFn(localBeamSize, dimBatch, graph->getDeviceId());

-    Beams beams(dimBatch);        // [batchIndex][beamIndex] is one sentence hypothesis
+    Beams beams(dimBatch);        // array [dimBatch] of array [localBeamSize] of Hypothesis
    for(auto& beam : beams)
      beam.resize(localBeamSize, New<Hypothesis>());

-    bool first = true;
-    bool final = false;
-
    for(int i = 0; i < dimBatch; ++i)
-      histories[i]->Add(beams[i], trgEosId_);
+      histories[i]->add(beams[i], trgEosId_);

    std::vector<Ptr<ScorerState>> states;

@ -191,33 +192,48 @@ public:
      states.push_back(scorer->startState(graph, batch));
    }

-    // main loop over output tokens
-    do {
+    // the decoder maintains the following state:
+    //  - histories : array [dimBatch] of History
+    //    with History : vector [t] of array [localBeamSize] of Hypothesis
+    //    with Hypothesis : (last word, aggregate score, prev Hypothesis)
+    //     - search grid
+    //     - stores traceback information
+    //     - gets added to in each output time step
+    //     - the final version is the return value of this function
+    //  - beams : array [dimBatch] of array [localBeamSize] of Hypothesis
+    //     - current output time step's set of active hypotheses, aka active search space
+    //     - gets replaced at the end of each output time step
+    //  - states[.] : ScorerState
+    //     - NN state
+    //     - one per scorer, e.g. 2 for ensemble of 2
+
+    // main loop over output time steps
+    for(bool first = true; ; first = false) {
      //**********************************************************************
      // create constant containing previous path scores for current beam
-      // also create mapping of hyp indices, which are not 1:1 if sentences complete
-      std::vector<IndexType> hypIndices; // [beamIndex * activeBatchSize + batchIndex] backpointers, concatenated over beam positions. Used for reordering hypotheses
-      std::vector<IndexType> embIndices;
-      Expr prevPathScores; // [beam, 1, 1, 1]
+      // Also create mapping of hyp indices, which are not 1:1 if sentences complete.
+      std::vector<IndexType> hypIndices; // [localBeamSize, 1, dimBatch, 1] (flattened) index of beam index that each of the new top N originated from
+      std::vector<Word> prevWords;      // [localBeamSize, 1, dimBatch, 1] (flattened) predecessor word
+      Expr prevPathScores;               // [localBeamSize, 1, dimBatch, 1], where the last axis broadcasts into vocab size when adding pathScores
      if(first) {
        // no scores yet
        prevPathScores = graph->constant({1, 1, 1, 1}, inits::from_value(0));
      } else {
-        std::vector<float> beamScores;
-
        dimBatch = (int)batch->size();
+        ABORT_IF(dimBatch != beams.size(), "Dimensions mismatch??");

-        for(size_t i = 0; i < localBeamSize; ++i) {
-          for(size_t j = 0; j < beams.size(); ++j) { // loop over batch entries (active sentences)
-            auto& beam = beams[j];
-            if(i < beam.size()) {
-              auto hyp = beam[i];
-              hypIndices.push_back((IndexType)hyp->GetPrevStateIndex()); // backpointer
-              embIndices.push_back(hyp->GetWord());
-              beamScores.push_back(hyp->GetPathScore());
+        std::vector<float> beamScores;
+        for(size_t beamIndex = 0; beamIndex < localBeamSize; ++beamIndex) {
+          for(int batchIndex = 0; batchIndex < dimBatch; ++batchIndex) { // loop over batch entries (active sentences)
+            auto& beam = beams[batchIndex];
+            if(beamIndex < beam.size()) {
+              auto hyp = beam[beamIndex];
+              hypIndices.push_back((IndexType)hyp->getPrevStateIndex()); // backpointer
+              prevWords .push_back(hyp->getWord());
+              beamScores.push_back(hyp->getPathScore());
            } else {  // dummy hypothesis
              hypIndices.push_back(0);
-              embIndices.push_back(0);  // (unused)
+              prevWords .push_back(Word{});  // (unused)
              beamScores.push_back(-9999);
            }
          }
@ -232,18 +248,23 @@ public:
      auto pathScores = prevPathScores;

      for(size_t i = 0; i < scorers_.size(); ++i) {
-        states[i] = scorers_[i]->step(
-            graph, states[i], hypIndices, embIndices, dimBatch, (int)localBeamSize);
+        // compute output probabilities for current output time step
+        //  - uses hypIndices[index in beam, 1, batch index, 1] and prevWords[index in beam, 1, batch index, 1] to reorder hypotheses
+        //  - returns new NN state for use in next output time step
+        //  - returns vector of prediction probabilities over output vocab via newState
+        auto newState = scorers_[i]->step(
+            graph, states[i], hypIndices, prevWords, dimBatch, (int)localBeamSize);

-        if(scorers_[i]->getWeight() != 1.f)
-          pathScores = pathScores + scorers_[i]->getWeight() * states[i]->getLogProbs();
-        else
-          pathScores = pathScores + states[i]->getLogProbs();
+        // expand all hypotheses, [localBeamSize, 1, dimBatch, 1] -> [localBeamSize, 1, dimBatch, dimVocab]
+        pathScores = pathScores + scorers_[i]->getWeight() * newState->getLogProbs();
+
+        // update state in-place for next output time step
+        states[i] = newState;
      }

      // make beams continuous
      if(dimBatch > 1 && localBeamSize > 1)
-        pathScores = transpose(pathScores, {2, 1, 0, 3});
+        pathScores = transpose(pathScores, {2, 1, 0, 3}); // -> [dimBatch, 1, localBeamSize, dimVocab]

      if(first)
        graph->forward();
@ -260,12 +281,15 @@ public:

      //**********************************************************************
      // perform beam search and pruning
+
+      // find N best amongst the (localBeamSize * dimVocab) hypotheses
+      const std::vector<size_t> beamSizes(dimBatch, localBeamSize);
      std::vector<unsigned int> outKeys;
      std::vector<float> outPathScores;
-
-      std::vector<size_t> beamSizes(dimBatch, localBeamSize);
      getNBestList(beamSizes, pathScores->val(), outPathScores, outKeys, first);
+      // outPathScores and outKeys contain pathScores and their original indices in N-best order

+      // convert N-best sets to updated search space
      int dimTrgVoc = pathScores->shape()[-1];
      beams = toHyps(outKeys,
                     outPathScores,
@ -276,20 +300,28 @@ public:
                     first,
                     batch);

-      auto prunedBeams = pruneBeam(beams);
+      // remove all hyps that end in EOS
+      auto purgedBeams = purgeBeams(beams); // note: this is not pruning; it only drops hyps that have reached EOS
+
+      // add updated search space to search grid for traceback
+      bool maxLengthReached = false;
      for(int i = 0; i < dimBatch; ++i) {
+        // if this batch entry has surviving hyps then add them to the traceback grid
        if(!beams[i].empty()) {
-          final = final
-                  || histories[i]->size()
-                         >= options_->get<float>("max-length-factor")
-                                * batch->front()->batchWidth();
-          histories[i]->Add(
-              beams[i], trgEosId_, prunedBeams[i].empty() || final);
+          if (histories[i]->size() >= options_->get<float>("max-length-factor") * batch->front()->batchWidth())
+            maxLengthReached = true;
+          histories[i]->add(beams[i], trgEosId_, purgedBeams[i].empty() || maxLengthReached);
        }
      }
-      beams = prunedBeams;
+      if (maxLengthReached) // early exit if max length limit was reached
+        break;

-      // determine beam size for next sentence, as max over still-active sentences
+      // this is the search space for the next output time step
+      beams = purgedBeams;
+
+      // determine beam size for next output time step, as max over still-active sentences
+      // E.g. if all batch entries are down from beam 5 to no more than 4 surviving hyps, then
+      // switch to beam of 4 for all. If all are done, then beam ends up being 0, and we are done.
      if(!first) {
        size_t maxBeam = 0;
        for(auto& beam : beams)
@ -297,11 +329,11 @@ public:
            maxBeam = beam.size();
        localBeamSize = maxBeam;
      }
-      first = false;
+      if (localBeamSize == 0) // done if all batch entries have reached EOS on all beam entries
+        break;
+    } // end of main loop over output tokens

-    } while(localBeamSize != 0 && !final); // end of main loop over output tokens
-
-    return histories;
+    return histories; // [dimBatch][t][N best hyps]
  }
 };
 }  // namespace marian
--- a/src/translator/history.h
+++ b/src/translator/history.h
@ -19,18 +19,17 @@ private:
    float normalizedPathScore; // length-normalized sentence score
  };

+  float lengthPenalty(size_t length) { return std::pow((float)length, alpha_); }
+  float wordPenalty(size_t length) { return wp_ * (float)length; }
 public:
  History(size_t lineNo, float alpha = 1.f, float wp_ = 0.f);

-  float LengthPenalty(size_t length) { return std::pow((float)length, alpha_); }
-  float WordPenalty(size_t length) { return wp_ * (float)length; }
-
-  void Add(const Beam& beam, Word trgEosId, bool last = false) {
+  void add(const Beam& beam, Word trgEosId, bool last = false) {
    if(beam.back()->GetPrevHyp() != nullptr) {
      for(size_t j = 0; j < beam.size(); ++j)
-        if(beam[j]->GetWord() == trgEosId || last) {
-          float pathScore = (beam[j]->GetPathScore() - WordPenalty(history_.size()))
-                       / LengthPenalty(history_.size());
+        if(beam[j]->getWord() == trgEosId || last) {
+          float pathScore =
+              (beam[j]->getPathScore() - wordPenalty(history_.size())) / lengthPenalty(history_.size());
          topHyps_.push({history_.size(), j, pathScore});
          // std::cerr << "Add " << history_.size() << " " << j << " " << pathScore
          // << std::endl;
@ -41,7 +40,7 @@ public:

  size_t size() const { return history_.size(); } // number of time steps

-  NBestList NBest(size_t n) const {
+  NBestList nBest(size_t n) const {
    NBestList nbest;
    for (auto topHypsCopy = topHyps_; nbest.size() < n && !topHypsCopy.empty(); topHypsCopy.pop()) {
      auto bestHypCoord = topHypsCopy.top();
@ -55,15 +54,15 @@ public:
      // trace back best path
      Words targetWords = bestHyp->TracebackWords();

-      // note: bestHyp->GetPathScore() is not normalized, while bestHypCoord.normalizedPathScore is
+      // note: bestHyp->getPathScore() is not normalized, while bestHypCoord.normalizedPathScore is
      nbest.emplace_back(targetWords, bestHyp, bestHypCoord.normalizedPathScore);
    }
    return nbest;
  }

-  Result Top() const { return NBest(1)[0]; }
+  Result top() const { return nBest(1)[0]; }

-  size_t GetLineNum() const { return lineNo_; }
+  size_t getLineNum() const { return lineNo_; }

 private:
  std::vector<Beam> history_; // [time step][index into beam] search grid
@ -73,5 +72,5 @@ private:
  float wp_;
 };

-typedef std::vector<Ptr<History>> Histories;
+typedef std::vector<Ptr<History>> Histories; // [batchDim]
 }  // namespace marian
--- a/src/translator/hypothesis.h
+++ b/src/translator/hypothesis.h
@ -6,6 +6,11 @@

 namespace marian {

+// one single (possibly partial) hypothesis in beam search
+// key elements:
+//  - the word that this hyp ends with
+//  - the aggregate score up to and including the word
+//  - back pointer to previous hypothesis for traceback
 class Hypothesis {
 public:
  Hypothesis() : prevHyp_(nullptr), prevIndex_(0), word_(0), pathScore_(0.0) {}
@ -18,11 +23,11 @@ public:

  const Ptr<Hypothesis> GetPrevHyp() const { return prevHyp_; }

-  Word GetWord() const { return word_; }
+  Word getWord() const { return word_; }

-  IndexType GetPrevStateIndex() const { return prevIndex_; }
+  IndexType getPrevStateIndex() const { return prevIndex_; }

-  float GetPathScore() const { return pathScore_; }
+  float getPathScore() const { return pathScore_; }

  std::vector<float>& GetScoreBreakdown() { return scoreBreakdown_; }
  std::vector<float>& GetAlignment() { return alignment_; }
@ -34,8 +39,8 @@ public:
  {
      Words targetWords;
      for (auto hyp = this; hyp->GetPrevHyp(); hyp = hyp->GetPrevHyp().get()) {
-          targetWords.push_back(hyp->GetWord());
-          // std::cerr << hyp->GetWord() << " " << hyp << std::endl;
+          targetWords.push_back(hyp->getWord());
+          // std::cerr << hyp->getWord() << " " << hyp << std::endl;
      }
      std::reverse(targetWords.begin(), targetWords.end());
      return targetWords;
--- a/src/translator/output_printer.h
+++ b/src/translator/output_printer.h
@ -24,7 +24,7 @@ public:

  template <class OStream>
  void print(Ptr<History> history, OStream& best1, OStream& bestn) {
-    const auto& nbl = history->NBest(nbest_);
+    const auto& nbl = history->nBest(nbest_);

    for(size_t i = 0; i < nbl.size(); ++i) {
      const auto& result = nbl[i];
@ -35,14 +35,14 @@ public:
        std::reverse(words.begin(), words.end());

      std::string translation = vocab_->decode(words);
-      bestn << history->GetLineNum() << " ||| " << translation;
+      bestn << history->getLineNum() << " ||| " << translation;

      if(!alignment_.empty())
        bestn << " ||| " << getAlignment(hypo);

      bestn << " |||";
      if(hypo->GetScoreBreakdown().empty()) {
-        bestn << " F0=" << hypo->GetPathScore();
+        bestn << " F0=" << hypo->getPathScore();
      } else {
        for(size_t j = 0; j < hypo->GetScoreBreakdown().size(); ++j) {
          bestn << " F" << j << "= " << hypo->GetScoreBreakdown()[j];
@ -58,7 +58,7 @@ public:
        bestn << std::flush;
    }

-    auto result = history->Top();
+    auto result = history->top();
    auto words = std::get<0>(result);

    if(reverse_)
--- a/src/translator/translator.h
+++ b/src/translator/translator.h
@ -106,7 +106,7 @@ public:
          std::stringstream best1;
          std::stringstream bestn;
          printer->print(history, best1, bestn);
-          collector->Write((long)history->GetLineNum(),
+          collector->Write((long)history->getLineNum(),
                           best1.str(),
                           bestn.str(),
                           options_->get<bool>("n-best"));
@ -211,7 +211,7 @@ public:
            std::stringstream best1;
            std::stringstream bestn;
            printer->print(history, best1, bestn);
-            collector->add((long)history->GetLineNum(), best1.str(), bestn.str());
+            collector->add((long)history->getLineNum(), best1.str(), bestn.str());
          }
        };

--- a/vs/Marian.vcxproj
+++ b/vs/Marian.vcxproj
@ -580,6 +580,7 @@
    <ClCompile Include="..\src\microsoft\quicksand.cpp">
      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</ExcludedFromBuild>
    </ClCompile>
+    <ClCompile Include="..\src\models\transformer_stub.cpp" />
    <ClCompile Include="..\src\rescorer\score_collector.cpp" />
    <ClCompile Include="..\src\tensors\backend.cpp" />
    <ClCompile Include="..\src\tensors\cpu\device.cpp" />
@ -894,6 +895,7 @@
    <ClInclude Include="..\src\layers\word2vec_reader.h" />
    <ClInclude Include="..\src\microsoft\quicksand.h" />
    <ClInclude Include="..\src\models\amun.h" />
+    <ClInclude Include="..\src\models\bert.h" />
    <ClInclude Include="..\src\models\char_s2s.h" />
    <ClInclude Include="..\src\models\costs.h" />
    <ClInclude Include="..\src\models\decoder.h" />
@ -905,7 +907,7 @@
    <ClInclude Include="..\src\models\nematus.h" />
    <ClInclude Include="..\src\models\s2s.h" />
    <ClInclude Include="..\src\models\states.h" />
-    <ClCompile Include="..\src\models\transformer.h" />
+    <ClInclude Include="..\src\models\transformer.h" />
    <ClInclude Include="..\src\models\experimental\lex_probs.h" />
    <ClInclude Include="..\src\models\transformer_factory.h" />
    <ClInclude Include="..\src\optimizers\clippers.h" />
--- a/vs/Marian.vcxproj.filters
+++ b/vs/Marian.vcxproj.filters
@ -202,9 +202,6 @@
    <ClCompile Include="..\src\tensors\cpu\sharp\sse_gemm.cpp">
      <Filter>tensors\cpu\sharp</Filter>
    </ClCompile>
-    <ClCompile Include="..\src\models\transformer.h">
-      <Filter>models</Filter>
-    </ClCompile>
    <ClCompile Include="..\src\common\io.cpp">
      <Filter>common</Filter>
    </ClCompile>
@ -481,6 +478,9 @@
    <ClCompile Include="..\src\examples\iris\iris.cpp">
      <Filter>examples\iris</Filter>
    </ClCompile>
+    <ClCompile Include="..\src\models\transformer_stub.cpp">
+      <Filter>models</Filter>
+    </ClCompile>
  </ItemGroup>
  <ItemGroup>
    <ClInclude Include="..\src\marian.h" />
@ -1517,6 +1517,12 @@
    <ClInclude Include="..\src\examples\mnist\validator.h">
      <Filter>examples\mnist</Filter>
    </ClInclude>
+    <ClInclude Include="..\src\models\bert.h">
+      <Filter>models</Filter>
+    </ClInclude>
+    <ClInclude Include="..\src\models\transformer.h">
+      <Filter>models</Filter>
+    </ClInclude>
  </ItemGroup>
  <ItemGroup>
    <Filter Include="3rd_party">