Merged PR 10692: new factor conditioning, inline fixing suppression, nan suppression

This adds several new features; the first two relate to factored vocabs:
* a new conditioning mechanism that mimics a mini transformer layer between the emitted lemma and the factors. This affects only factored vocabs and must be enabled explicitly. Change is in `generic.cpp`. (A standalone sketch of the gating arithmetic follows this list.)
* in case of inline phrase-fixing, cross-attention is no longer allowed to look into the inline-fix source tokens. This only affects inputs with `|is` factors or `<IOPEN>` tags. Change is in `states.h`.
* Adam optimizer now skips the update if the gradient contains a NaN. Does not affect existing configs unless they produce NaNs. Change is in `optimizers.cpp`.
* reverts to the old `LayerNorm` routine. *TODO*: Is this change still needed?
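For orientation, below is a standalone, single-head sketch of the gating arithmetic behind the new conditioning. The actual implementation (in the `generic.cpp` diff further down) uses Marian expressions with multiple heads, layer normalization, and a feed-forward block; every name in this sketch is illustrative.

```cpp
#include <cmath>
#include <vector>

static float dotProd(const std::vector<float>& a, const std::vector<float>& b) {
  float s = 0.f;
  for (size_t i = 0; i < a.size(); ++i) s += a[i] * b[i];
  return s;
}

// r = sigmoid(q . k) * (ve - vi) + vi
// The factor input becomes an interpolation between the lemma value vector (ve)
// and the decoder-state value vector (vi), gated by a sigmoid of a query-key
// dot product (a sigmoid on the difference, rather than a softmax over positions).
std::vector<float> gatedMix(const std::vector<float>& q,   // projected query (from the lemma embedding)
                            const std::vector<float>& k,   // projected key (lemma minus decoder state)
                            const std::vector<float>& ve,  // lemma embedding projected as value
                            const std::vector<float>& vi)  // decoder state projected as value
{
  float gate = 1.f / (1.f + std::exp(-dotProd(q, k)));      // sigmoid gate in (0,1)
  std::vector<float> r(vi.size());
  for (size_t i = 0; i < r.size(); ++i)
    r[i] = gate * (ve[i] - vi[i]) + vi[i];                  // gate→1 picks ve, gate→0 picks vi
  return r;
}
```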

Additional changes:
* new method `locate()` for accessing batch data with array coordinates (see the usage sketch after this list)
* new overloads of `constant_like()` that take a vector directly (the most common use case)
* rvalue-ref version of `fromVector()`
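A hypothetical usage sketch of these helpers (declarations are in the `corpus_base.h` and `expression_operators.h` diffs below; the function name, indices, and shapes are made up):

```cpp
#include "data/corpus_base.h"            // SubBatch::locate()
#include "graph/expression_operators.h"  // constant_like() vector overloads

using namespace marian;

// Read one word and build a constant from the mask vector, without writing the
// "wordPos * batchSize + batchIdx" index arithmetic by hand. 'like' is assumed
// to have as many elements as the mask.
Expr maskConstantFor(Ptr<data::SubBatch> sb, Expr like) {
  size_t b = 0, s = 3;  // batch entry, word position (arbitrary for illustration)
  Word w = sb->data()[sb->locate(/*batchIdx=*/b, /*wordPos=*/s)];
  (void)w;
  // the temporary vector is an rvalue, so this goes through the new
  // rvalue-ref fromVector() and avoids an extra copy
  return constant_like(like, std::vector<float>(sb->mask()));
}
```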
Frank Seide 2019-12-20 01:46:37 +00:00 committed by Martin Junczys-Dowmunt
parent bab02e3b84
commit f882f27c09
24 changed files with 356 additions and 132 deletions

src/common/file_stream.cpp Normal file → Executable file

src/data/corpus.cpp Normal file → Executable file

@ -235,11 +235,12 @@ CorpusBase::batch_ptr Corpus::toBatch(const std::vector<Sample>& batchVector) {
}
std::vector<size_t> words(maxDims.size(), 0);
for(size_t i = 0; i < batchSize; ++i) {
for(size_t j = 0; j < maxDims.size(); ++j) {
for(size_t k = 0; k < batchVector[i][j].size(); ++k) {
subBatches[j]->data()[k * batchSize + i] = batchVector[i][j][k];
subBatches[j]->mask()[k * batchSize + i] = 1.f;
for(size_t b = 0; b < batchSize; ++b) { // loop over batch entries
for(size_t j = 0; j < maxDims.size(); ++j) { // loop over streams
auto subBatch = subBatches[j];
for(size_t s = 0; s < batchVector[b][j].size(); ++s) { // loop over word positions
subBatch->data()[subBatch->locate(/*batchIdx=*/b, /*wordPos=*/s)/*s * batchSize + b*/] = batchVector[b][j][s];
subBatch->mask()[subBatch->locate(/*batchIdx=*/b, /*wordPos=*/s)/*s * batchSize + b*/] = 1.f;
words[j]++;
}
}


@ -1,6 +1,7 @@
#include <random>
#include "data/corpus.h"
#include "data/factored_vocab.h"
namespace marian {
namespace data {
@ -330,5 +331,54 @@ void CorpusBase::initEOS(bool training = true) {
}
}
// experimental: hide inline-fix source tokens from cross attention
std::vector<float> SubBatch::crossMaskWithInlineFixSourceSuppressed() const
{
const auto& srcVocab = *vocab();
auto factoredVocab = vocab()->tryAs<FactoredVocab>();
size_t inlineFixGroupIndex = 0, inlineFixSrc = 0;
auto hasInlineFixFactors = factoredVocab && factoredVocab->tryGetFactor(FactoredVocab_INLINE_FIX_WHAT_serialized, /*out*/ inlineFixGroupIndex, /*out*/ inlineFixSrc);
auto fixSrcId = srcVocab[FactoredVocab_FIX_SRC_ID_TAG];
auto fixTgtId = srcVocab[FactoredVocab_FIX_TGT_ID_TAG];
auto fixEndId = srcVocab[FactoredVocab_FIX_END_ID_TAG];
auto unkId = srcVocab.getUnkId();
auto hasInlineFixTags = fixSrcId != unkId && fixTgtId != unkId && fixEndId != unkId;
auto m = mask(); // default return value, which we will modify in-place below in case we need to
if (hasInlineFixFactors || hasInlineFixTags) {
LOG_ONCE(info, "[data] Suppressing cross-attention into inline-fix source tokens");
// example: force French translation of name "frank" to always be "franck"
// - hasInlineFixFactors: "frank|is franck|it", "frank|is" cannot be cross-attended to
// - hasInlineFixTags: "<IOPEN> frank <IDELIM> franck <ICLOSE>", "frank" and all tags cannot be cross-attended to
auto dimBatch = batchSize(); // number of sentences in the batch
auto dimWidth = batchWidth(); // number of words in the longest sentence in the batch
const auto& d = data();
size_t numWords = 0;
for (size_t b = 0; b < dimBatch; b++) { // loop over batch entries
bool inside = false;
for (size_t s = 0; s < dimWidth; s++) { // loop over source positions
auto i = locate(/*batchIdx=*/b, /*wordPos=*/s);
if (!m[i])
break;
numWords++;
// keep track of entering/exiting the inline-fix source tags
auto w = d[i];
if (w == fixSrcId)
inside = true;
else if (w == fixTgtId)
inside = false;
bool wHasSrcIdFactor = hasInlineFixFactors && factoredVocab->getFactor(w, inlineFixGroupIndex) == inlineFixSrc;
if (inside || w == fixSrcId || w == fixTgtId || w == fixEndId || wHasSrcIdFactor)
m[i] = 0.0f; // decoder must not look at embedded source, nor the markup tokens
}
}
ABORT_IF(batchWords() != 0/*n/a*/ && numWords != batchWords(), "batchWords() inconsistency??");
}
return m;
}
} // namespace data
} // namespace marian
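As a standalone illustration of the tag-based case (hypothetical token sequence), this is the per-position decision the loop above makes: only the inline-fix target token and the surrounding sentence remain visible to cross-attention.

```cpp
#include <iostream>
#include <string>
#include <vector>

int main() {
  // force "frank" to be translated as "franck"
  std::vector<std::string> src = {"the", "<IOPEN>", "frank", "<IDELIM>", "franck", "<ICLOSE>", "here"};
  bool inside = false;
  for (const auto& w : src) {
    if (w == "<IOPEN>")       inside = true;   // entering the inline-fix source span
    else if (w == "<IDELIM>") inside = false;  // switching to the inline-fix target span
    bool suppressed = inside || w == "<IOPEN>" || w == "<IDELIM>" || w == "<ICLOSE>";
    std::cout << w << " -> mask " << (suppressed ? 0 : 1) << "\n";
  }
  // output: the 1, <IOPEN> 0, frank 0, <IDELIM> 0, franck 1, <ICLOSE> 0, here 1
}
```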

src/data/corpus_base.h Normal file → Executable file

@ -143,12 +143,19 @@ public:
* words (width) and \f$s\f$ is the number of sentences (size).
*/
Words& data() { return indices_; }
const Words& data() const { return indices_; }
/**
* @brief compute flat index into data() and mask() vectors for given batch index and word index in sentence
*/
size_t locate(size_t batchIdx, size_t wordPos) const { return locate(batchIdx, wordPos, size_); }
static size_t locate(size_t batchIdx, size_t wordPos, size_t batchSize) { return wordPos * batchSize + batchIdx; }
/**
* @brief Flat masking vector; 0 is used for masked words.
*
* @see data()
*/
std::vector<float>& mask() { return mask_; }
const std::vector<float>& mask() const { return mask_; }
/**
* @brief Accessors to the vocab_ field.
@ -158,15 +165,15 @@ public:
/**
* @brief The number of sentences in the batch.
*/
size_t batchSize() { return size_; }
size_t batchSize() const { return size_; }
/**
* @brief The number of words in the longest sentence in the batch.
*/
size_t batchWidth() { return width_; };
size_t batchWidth() const { return width_; };
/**
* @brief The total number of words in the batch (not counting masked-out words).
*/
size_t batchWords() { return words_; }
size_t batchWords() const { return words_; }
/**
* @brief Splits the stream into sub-batches of equal size (except for last).
@ -179,7 +186,7 @@ public:
*
* @see marian::data::Batch::split(size_t n)
*/
std::vector<Ptr<SubBatch>> split(size_t n, size_t sizeLimit /*or SIZE_MAX*/) {
std::vector<Ptr<SubBatch>> split(size_t n, size_t sizeLimit /*or SIZE_MAX*/) const {
ABORT_IF(size_ == 0, "Encountered sub-batch size of 0");
auto size = std::min(size_, sizeLimit); // if limit is given then pretend the batch only has that many sentences
@ -191,11 +198,11 @@ public:
// determine actual width (=max length) of this sub-batch, which may be smaller than the overall max length
size_t subWidth = 0;
for(size_t j = 0; j < width_; ++j) {
for(size_t i = 0; i < subSize; ++i) {
if(mask_[j * size_ + (pos + i)] != 0)
if (subWidth < j + 1)
subWidth = j + 1;
for(size_t s = 0; s < width_; ++s) {
for(size_t b = 0; b < subSize; ++b) {
if(mask_[locate(/*batchIdx=*/pos + b, /*wordPos=*/s)] != 0) // s * size_ + (pos + b)
if (subWidth < s + 1)
subWidth = s + 1;
}
}
@ -203,12 +210,12 @@ public:
auto sb = New<SubBatch>(subSize, subWidth, vocab_);
size_t words = 0;
for(size_t j = 0; j < subWidth; ++j) {
for(size_t i = 0; i < subSize; ++i) {
sb->data()[j * subSize + i] = indices_[j * size_ + (pos + i)];
sb->mask()[j * subSize + i] = mask_[j * size_ + (pos + i)];
for(size_t s = 0; s < subWidth; ++s) {
for(size_t b = 0; b < subSize; ++b) {
sb->data()[locate(/*batchIdx=*/b, /*wordPos=*/s, /*batchSize=*/subSize)/*s * subSize + b*/] = indices_[locate(/*batchIdx=*/pos + b, /*wordPos=*/s)]; // s * size_ + (pos + b)
sb->mask()[locate(/*batchIdx=*/b, /*wordPos=*/s, /*batchSize=*/subSize)/*s * subSize + b*/] = mask_[locate(/*batchIdx=*/pos + b, /*wordPos=*/s)]; // s * size_ + (pos + b)
if(mask_[j * size_ + (pos + i)] != 0)
if(mask_[locate(/*batchIdx=*/pos + b, /*wordPos=*/s)/*s * size_ + (pos + b)*/] != 0)
words++;
}
}
@ -220,6 +227,9 @@ public:
}
void setWords(size_t words) { words_ = words; }
// experimental: hide inline-fix source tokens from cross attention
std::vector<float> crossMaskWithInlineFixSourceSuppressed() const;
};
/**
@ -229,7 +239,7 @@ public:
class CorpusBatch : public Batch {
protected:
std::vector<Ptr<SubBatch>> subBatches_;
std::vector<float> guidedAlignment_;
std::vector<float> guidedAlignment_; // [max source len, batch size, max target len] flattened
std::vector<float> dataWeights_;
public:
@ -300,7 +310,8 @@ public:
/**
* @brief Creates a batch filled with fake data. Used to determine the size of
* the batch object.
* the batch object. With guided-alignments and multiple encoders, those
* multiple source streams are expected to have the same lengths.
*
* @param lengths List of subbatch sizes.
* @param batchSize Number of sentences in the batch.
@ -333,6 +344,7 @@ public:
return batch;
if(options->get("guided-alignment", std::string("none")) != "none") {
// @TODO: if > 1 encoder, verify that all encoders have the same sentence lengths
std::vector<float> alignment(batchSize * lengths.front() * lengths.back(),
0.f);
batch->setGuidedAlignment(std::move(alignment));
@ -406,7 +418,7 @@ public:
size_t bi = i + pos;
for(size_t sid = 0; sid < srcWords; ++sid) {
for(size_t tid = 0; tid < trgWords; ++tid) {
size_t bidx = sid * oldSize * oldTrgWords + bi * oldTrgWords + tid;
size_t bidx = sid * oldSize * oldTrgWords + bi * oldTrgWords + tid; // [sid, bi, tid]
size_t idx = sid * dimBatch * trgWords + i * trgWords + tid;
aligns[idx] = guidedAlignment_[bidx];
}
@ -432,9 +444,9 @@ public:
// this needs to be split along the batch dimension
// which is here the innermost dimension.
// Should work for sentence-based weights, too.
for(size_t j = 0; j < width; ++j) {
for(size_t i = 0; i < split->size(); ++i) {
ws[j * split->size() + i] = dataWeights_[j * oldSize + i + pos];
for(size_t s = 0; s < width; ++s) {
for(size_t b = 0; b < split->size(); ++b) {
ws[s * split->size() + b] = dataWeights_[s * oldSize + b + pos]; // @TODO: use locate() as well
}
}
split->setDataWeights(ws);
@ -445,9 +457,13 @@ public:
return splits;
}
std::vector<float>& getGuidedAlignment() { return guidedAlignment_; }
const std::vector<float>& getGuidedAlignment() const { return guidedAlignment_; } // [dimSrcWords, dimBatch, dimTrgWords] flattened
void setGuidedAlignment(std::vector<float>&& aln) override {
guidedAlignment_ = std::move(aln);
guidedAlignment_ = std::move(aln);
}
size_t locateInGuidedAlignments(size_t b, size_t s, size_t t) {
return ((s * size()) + b) * widthTrg() + t;
}
std::vector<float>& getDataWeights() { return dataWeights_; }
@ -469,15 +485,14 @@ public:
std::cerr << std::endl;
}
size_t b = 0;
size_t subBatchIndex = 0;
for(auto sb : subBatches_) {
std::cerr << "batch " << b++ << ": " << std::endl;
std::cerr << "stream " << subBatchIndex++ << ": " << std::endl;
const auto& vocab = sb->vocab();
for(size_t i = 0; i < sb->batchWidth(); i++) {
for(size_t s = 0; s < sb->batchWidth(); s++) {
std::cerr << "\t w: ";
for(size_t j = 0; j < sb->batchSize(); j++) {
size_t idx = i * sb->batchSize() + j;
Word w = sb->data()[idx];
for(size_t b = 0; b < sb->batchSize(); b++) {
Word w = sb->data()[sb->locate(/*batchIdx=*/b, /*wordPos=*/s)]; // s * sb->batchSize() + b;
if (vocab && !printIndices)
std::cerr << (*vocab)[w] << " ";
else


@ -400,7 +400,7 @@ std::string FactoredVocab::word2string(Word word) const {
res.append("?");
}
else
res.append(factorVocab_[(WordIndex)(index + groupRanges_[g].first)]);
res.append(getFactorName(g, index));
}
return res;
}
@ -431,6 +431,21 @@ Word FactoredVocab::string2word(const std::string& w) const {
return word;
}
// does a specific factor exist in the vocabulary
// Factor name must be given without separator. This function cannot be used for lemmas.
bool FactoredVocab::tryGetFactor(const std::string& factorName, size_t& groupIndex, size_t& factorIndex) const {
WordIndex u;
if (factorVocab_.tryFind(factorSeparator_ + factorName, u))
{
groupIndex = factorGroups_[u];
ABORT_IF(u < groupRanges_[groupIndex].first || u >= groupRanges_[groupIndex].second, "Invalid factorGroups_ entry??");
factorIndex = u - groupRanges_[groupIndex].first;
return true;
}
else
return false;
}
// extract the factor index of a given factor type from the 'Word' representation
size_t FactoredVocab::getFactor(Word word, size_t groupIndex) const {
size_t index = word.toWordIndex();
@ -565,12 +580,18 @@ void FactoredVocab::constructNormalizationInfoForVocab() {
// decode a 'Word' array into the external string representation of that token sequence, as written to output files
/*virtual*/ std::string FactoredVocab::decode(const Words& sentence, bool ignoreEOS /*= true*/) const /*override final*/ {
std::vector<std::string> decoded;
decoded.reserve(sentence.size());
for(auto w : sentence) {
std::vector<std::string> decoded; decoded.reserve(sentence.size());
for(auto w : sentence)
if((w != getEosId() || !ignoreEOS))
decoded.push_back((*this)[w]);
}
return utils::join(decoded, " ");
}
// diagnostics version of decode() that will not fail on partial words, will print EOS, and is a little slower
std::string FactoredVocab::decodeForDiagnostics(const Words& sentence) const {
std::vector<std::string> decoded; decoded.reserve(sentence.size());
for (auto w : sentence)
decoded.push_back(word2string(w));
return utils::join(decoded, " ");
}
@ -740,7 +761,7 @@ Ptr<IVocab> createFactoredVocab(const std::string& vocabPath) {
static std::map<std::string, Ptr<IVocab>> s_cache;
auto iter = s_cache.find(vocabPath);
if (iter != s_cache.end()) {
LOG(info, "[vocab] Reusing existing vocabulary object in memory (vocab size {})", iter->second->size());
LOG_ONCE(info, "[vocab] Reusing existing vocabulary object in memory (vocab size {})", iter->second->size());
return iter->second;
}
auto vocab = New<FactoredVocab>();

src/data/factored_vocab.h Normal file → Executable file

@ -66,6 +66,9 @@ public:
bool canExpandFactoredWord(Word word, size_t groupIndex) const { return lemmaHasFactorGroup(getFactor(word, 0), groupIndex); }
size_t getFactor(Word word, size_t groupIndex) const;
bool lemmaHasFactorGroup(size_t factor0Index, size_t g) const { return lemmaHasFactorGroup_[factor0Index][g]; }
const std::string& getFactorGroupPrefix(size_t groupIndex) const { return groupPrefixes_[groupIndex]; } // for diagnostics only
const std::string& getFactorName(size_t groupIndex, size_t factorIndex) const { return factorVocab_[(WordIndex)(factorIndex + groupRanges_[groupIndex].first)]; }
std::string decodeForDiagnostics(const Words& sentence) const;
static constexpr size_t FACTOR_NOT_APPLICABLE = (SIZE_MAX - 1);
static constexpr size_t FACTOR_NOT_SPECIFIED = (SIZE_MAX - 2);
@ -74,6 +77,17 @@ public:
static Ptr<FactoredVocab> tryCreateAndLoad(const std::string& path); // load from "vocab" option if it specifies a factored vocab
std::string word2string(Word word) const;
Word string2word(const std::string& w) const;
bool tryGetFactor(const std::string& factorGroupName, size_t& groupIndex, size_t& factorIndex) const; // note: factorGroupName given without separator
// some hard-coded constants from FactoredSegmenter
// The naming mimics the names in FactoredSegmenter.cs, and therefore intentionally does not follow Marian conventions.
// @TODO: We have more hard-coded constants throughout the code. Move them all here.
// @TODO: figure out how to do this with static const*/constexpr
#define FactoredVocab_INLINE_FIX_WHAT_serialized "is"
#define FactoredVocab_FIX_SRC_ID_TAG "<IOPEN>"
#define FactoredVocab_FIX_TGT_ID_TAG "<IDELIM>"
#define FactoredVocab_FIX_END_ID_TAG "<ICLOSE>"
private:
void constructGroupInfoFromFactorVocab();
void constructFactorIndexConversion();

src/data/vocab.cpp Normal file → Executable file

src/data/vocab_base.h Normal file → Executable file


@ -284,11 +284,6 @@ Expr stopGradient(Expr a) {
return res;
}
Expr constant_like(Expr a, const Ptr<inits::NodeInitializer>& init) {
auto graph = a->graph();
return graph->constant(a->shape(), init, a->value_type());
}
// gather() -- gather arbitrary elements along an axis; batched or non-batched
Expr gather(Expr a, int axis, Expr indices) {
return Expression<GatherNodeOp>(a, axis, indices);


@ -141,7 +141,17 @@ Expr atleast_4d(Expr a);
Expr atleast_nd(Expr a, size_t dims);
// create a constant of shape a->shape() and initialize with init
Expr constant_like(Expr a, const Ptr<inits::NodeInitializer>& init);
// @TODO: add a && version, to avoid a ref count. NodeInitializers are typically temps.
// @TODO: and/or make this a template on init
static inline Expr constant_like(Expr a, const Ptr<inits::NodeInitializer>& init) {
return a->graph()->constant(a->shape(), init, a->value_type());
}
// short-cut to init from std::vector, since we do this so often
template<typename ElementType>
Expr constant_like(Expr a, const std::vector<ElementType>& v) { return constant_like(a, inits::fromVector(std::move(v))); }
template<typename ElementType>
Expr constant_like(Expr a, std::vector<ElementType>&& v) { return constant_like(a, inits::fromVector(v)); }
Expr flatten(Expr a);
Expr flatten_2d(Expr a);


@ -145,10 +145,20 @@ Ptr<NodeInitializer> fromVector(const std::vector<T>& v) {
return fromLambda([v](Tensor t) { t->set(v.data(), v.data() + v.size()); }, typeId<T>());
}
template <typename T>
Ptr<NodeInitializer> fromVector(std::vector<T>&& v) {
return fromLambda([v](Tensor t) { t->set(v.data(), v.data() + v.size()); }, typeId<T>());
}
template Ptr<NodeInitializer> fromVector<float16>(const std::vector<float16>& v);
template Ptr<NodeInitializer> fromVector<float>(const std::vector<float>& v);
template Ptr<NodeInitializer> fromVector<IndexType>(const std::vector<IndexType>& v);
// @TODO: can we remove the const& ones above? They always make a copy anyways, and often from a temp
template Ptr<NodeInitializer> fromVector<float16> (std::vector<float16> && v);
template Ptr<NodeInitializer> fromVector<float> (std::vector<float> && v);
template Ptr<NodeInitializer> fromVector<IndexType>(std::vector<IndexType>&& v);
Ptr<NodeInitializer> fromSparseVector(std::pair<std::vector<size_t>, std::vector<float>>& v) {
return fromLambda([v](Tensor t) { t->set(1e-6); t->setSparse(v.first, v.second); });
}

src/graph/node_initializers.h Normal file → Executable file

@ -143,6 +143,8 @@ Ptr<NodeInitializer> gumbel(float eps = 1e-5f);
// @TODO: add documentation
template <typename T>
Ptr<NodeInitializer> fromVector(const std::vector<T>& v);
template <typename T>
Ptr<NodeInitializer> fromVector(std::vector<T>&& v);
// @TODO: add documentation
Ptr<NodeInitializer> fromSparseVector(std::pair<std::vector<size_t>, std::vector<float>>& v);

src/graph/node_operators_binary.h Normal file → Executable file

@ -859,7 +859,9 @@ struct MinimumNodeOp : public ElementBinaryNodeOp {
struct CmpNodeOp : public ElementBinaryNodeOp {
CmpNodeOp(Expr a, Expr b, int cmp_, bool not_) : ElementBinaryNodeOp(a, b), cmp_(cmp_), not_(not_) {
setTrainable(false); // has no gradient
//setTrainable(false); // has no gradient
// Note: ^^ Disabled because it currently causes Marian to choke, for unknown reasons.
// Not setting this will not change the result since the vector of gradient functions is empty.
}
NodeOps forwardOps() override {


@ -4,7 +4,8 @@
#include "layers/constructors.h"
#include "layers/loss.h"
#include "data/factored_vocab.h"
#include "rnn/types.h" // for State::select()
#include "rnn/types.h" // for State::select()
#include "models/states.h" // for EncoderState
//using std::size_t; // not sure why this is needed
@ -219,7 +220,7 @@ namespace marian {
factoredVocab_ = FactoredVocab::tryCreateAndLoad(options_->get<std::string>("vocab", ""));
if (factoredVocab_) {
numOutputClasses = (int)factoredVocab_->factorVocabSize();
LOG(info, "[embedding] Factored outputs enabled");
LOG_ONCE(info, "[embedding] Factored outputs enabled");
}
if(tiedParam_) {
@ -237,10 +238,10 @@ namespace marian {
/*const*/ int lemmaDimEmb = options_->get<int>("lemma-dim-emb", 0);
ABORT_IF(lemmaDimEmb && !factoredVocab_, "--lemma-dim-emb requires a factored vocabulary");
if (lemmaDimEmb > 0) {
if (lemmaDimEmb > 0) { // > 0 means to embed the (expected) word with a different embedding matrix
#define HARDMAX_HACK
#ifdef HARDMAX_HACK
lemmaDimEmb = lemmaDimEmb & 0xfffffffe;
lemmaDimEmb = lemmaDimEmb & 0xfffffffe; // hack to select hard-max: use an odd number
#endif
auto range = factoredVocab_->getGroupRange(0);
auto lemmaVocabDim = (int)(range.second - range.first);
@ -263,8 +264,9 @@ namespace marian {
// project each factor separately
auto numGroups = factoredVocab_->getNumGroups();
std::vector<Ptr<RationalLoss>> allLogits(numGroups, nullptr); // (note: null entries for absent factors)
Expr input1 = input;
Expr Plemma = nullptr;
Expr input1 = input; // [B... x D]
Expr Plemma = nullptr; // used for lemmaDimEmb=-1
Expr inputLemma = nullptr; // used for lemmaDimEmb=-2, -3
for (size_t g = 0; g < numGroups; g++) {
auto range = factoredVocab_->getGroupRange(g);
if (g > 0 && range.first == range.second) // empty entry
@ -280,6 +282,52 @@ namespace marian {
factorWt = slice(Wt_, isLegacyUntransposedW ? -1 : 0, Slice((int)range.first, (int)range.second));
factorB = slice(b_, -1, Slice((int)range.first, (int)range.second));
}
/*const*/ int lemmaDimEmb = options_->get<int>("lemma-dim-emb", 0);
if ((lemmaDimEmb == -2 || lemmaDimEmb == -3) && g > 0) { // -2/-3 means a gated transformer-like structure (-3 = hard-max)
LOG_ONCE(info, "[embedding] using lemma conditioning with gate");
// this mimics one transformer layer
// - attention over two inputs:
// - e = current lemma. We use the original embedding vector; specifically, expectation over all lemmas.
// - input = hidden state FF(h_enc+h_dec)
// - dot-prod attention to allow both sides to influence (unlike our recurrent self-attention)
// - multi-head to allow for multiple conditions to be modeled
// - add & norm, for gradient flow and scaling
// - FF layer --this is expensive; it is per-factor
// multi-head attention
int inputDim = input->shape()[-1];
int heads = 8;
auto name = options_->get<std::string>("prefix") + "_factor" + std::to_string(g);
auto Wq = graph_->param(name + "_Wq", { inputDim, inputDim }, inits::glorotUniform());
auto Wk = graph_->param(name + "_Wk", { inputDim, inputDim }, inits::glorotUniform());
auto Wv = graph_->param(name + "_Wv", { inputDim, inputDim }, inits::glorotUniform());
auto toMultiHead = [&](Expr x, int heads) {
const auto& shape = x->shape();
int inputDim = shape[-1];
int otherDim = shape.elements() / inputDim;
ABORT_IF(inputDim / heads * heads != inputDim, "inputDim ({}) must be multiple of number of heads ({})", inputDim, heads);
return reshape(x, { otherDim, heads, 1, inputDim / heads });
};
input1 = inputLemma;
auto qm = toMultiHead(dot(input1, Wq), heads); // [B... x H x D/H] projected query
auto kdm = toMultiHead(dot(input1 - input, Wk), heads); // [B... x H x D/H] the two data vectors projected as keys. Use diff and sigmoid, instead of softmax.
auto vem = toMultiHead(dot(input1, Wv), heads); // [B... x H x D/H] one of the two data vectors projected as values
auto vim = toMultiHead(dot( input, Wv), heads); // [B... x H x D/H] the other
auto zm = bdot(qm, kdm, false, true); // [B... x H x 1]
auto sm = sigmoid(zm); // [B... x H x 1]
auto rm = sm * (vem - vim) + vim; // [B... x H x D/H]
auto r = reshape(rm, input->shape()); // [B... x D]
// add & norm
input1 = r + input1;
input1 = layerNorm(input1, name + "_att");
// FF layer
auto ffnDropProb = 0.1f; // @TODO: get as a parameter
auto ffnDim = inputDim * 2; // @TODO: get as a parameter
auto f = denseInline(input1, name + "_ffn", /*suffix=*/"1", ffnDim, (ActivationFunction*)relu, ffnDropProb);
f = denseInline(f, name + "_ffn", /*suffix=*/"2", inputDim);
// add & norm
input1 = f + input1;
input1 = layerNorm(input1, name + "_ffn");
}
// @TODO: b_ should be a vector, not a matrix; but shotlists use cols() in, which requires a matrix
auto factorLogits = affine(input1, factorWt, factorB, false, /*transB=*/isLegacyUntransposedW ? false : true, /*scale=*/1.0f); // [B... x U] factor logits
// optionally add lemma-dependent bias
@ -294,15 +342,28 @@ namespace marian {
allLogits[g] = New<RationalLoss>(factorLogits, nullptr);
// optionally add a soft embedding of lemma back to create some lemma dependency
// @TODO: if this works, move it into lazyConstruct
/*const*/ int lemmaDimEmb = options_->get<int>("lemma-dim-emb", 0);
if (lemmaDimEmb < 0 && g == 0) {
ABORT_IF(shortlist_ && lemmaDimEmb != 0, "Lemma-dependent bias with short list is not yet implemented");
if (lemmaDimEmb == -2 && g == 0) { // -2 means a gated transformer-like structure
LOG_ONCE(info, "[embedding] using lemma conditioning with gate, soft-max version");
// get expected lemma embedding vector
auto factorLogSoftmax = logsoftmax(factorLogits); // [B... x U] note: with shortlist, this is not the full lemma set
auto factorSoftmax = exp(factorLogSoftmax);
inputLemma = dot(factorSoftmax, factorWt, false, /*transB=*/isLegacyUntransposedW ? true : false); // [B... x D]
}
else if (lemmaDimEmb == -3 && g == 0) { // same as -2 except with hard max
LOG_ONCE(info, "[embedding] using lemma conditioning with gate, hard-max version");
// get max-lemma embedding vector
auto maxVal = max(factorLogits, -1); // [B... x U] note: with shortlist, this is not the full lemma set
auto factorHardmax = eq(factorLogits, maxVal);
inputLemma = dot(factorHardmax, factorWt, false, /*transB=*/isLegacyUntransposedW ? true : false); // [B... x D]
}
else if (lemmaDimEmb == -1 && g == 0) { // -1 means learn a lemma-dependent bias
ABORT_IF(shortlist_, "Lemma-dependent bias with short list is not yet implemented");
LOG_ONCE(info, "[embedding] using lemma-dependent bias");
auto factorLogSoftmax = logsoftmax(factorLogits); // (we do that again later, CSE will kick in)
auto z = /*stopGradient*/(factorLogSoftmax);
Plemma = exp(z); // [B... x U]
}
if (lemmaDimEmb > 0 && g == 0) {
else if (lemmaDimEmb > 0 && g == 0) { // > 0 means learn a re-embedding matrix
LOG_ONCE(info, "[embedding] enabled re-embedding of lemma, at dim {}", lemmaDimEmb);
// compute softmax. We compute logsoftmax() separately because this way, computation will be reused later via CSE
auto factorLogSoftmax = logsoftmax(factorLogits);
@ -349,7 +410,7 @@ namespace marian {
factoredVocab_ = FactoredVocab::tryCreateAndLoad(options_->get<std::string>("vocab", ""));
if (factoredVocab_) {
dimVoc = (int)factoredVocab_->factorVocabSize();
LOG(info, "[embedding] Factored embeddings enabled");
LOG_ONCE(info, "[embedding] Factored embeddings enabled");
}
// Embedding layer initialization should depend only on embedding size, hence fanIn=false
@ -389,7 +450,7 @@ namespace marian {
auto graph = E_->graph();
int dimBatch = (int)subBatch->batchSize();
int dimEmb = E_->shape()[-1];
int dimWords = (int)subBatch->batchWidth();
int dimWidth = (int)subBatch->batchWidth();
// factored embeddings:
// - regular:
@ -419,9 +480,16 @@ namespace marian {
// - but forward pass weighs them down, so that all factors are in a similar numeric range
// - if it is required to be in a different range, the embeddings can still learn that, but more slowly
auto batchEmbeddings = apply(subBatch->data(), {dimWords, dimBatch, dimEmb});
auto batchMask = graph->constant({dimWords, dimBatch, 1},
auto batchEmbeddings = apply(subBatch->data(), {dimWidth, dimBatch, dimEmb});
#if 0
auto batchMask = graph->constant({dimWidth, dimBatch, 1},
inits::fromVector(subBatch->mask()));
#else
// experimental: hide inline-fix source tokens from cross attention
auto batchMask = graph->constant({dimWidth, dimBatch, 1},
inits::fromVector(subBatch->crossMaskWithInlineFixSourceSuppressed()));
#endif
return std::make_tuple(batchEmbeddings, batchMask);
}


@ -412,4 +412,32 @@ public:
ABORT("not implemented"); // @TODO: implement me
}
};
// --- a few layers with built-in parameters created on the fly, without proper object
// @TODO: change to a proper layer object
// like affine() but with built-in parameters, activation, and dropout
static inline
Expr denseInline(Expr x, std::string prefix, std::string suffix, int outDim, const std::function<Expr(Expr)>& actFn = nullptr, float dropProb = 0.0f)
{
auto graph = x->graph();
auto W = graph->param(prefix + "_W" + suffix, { x->shape()[-1], outDim }, inits::glorotUniform());
auto b = graph->param(prefix + "_b" + suffix, { 1, outDim }, inits::zeros());
x = affine(x, W, b);
if (actFn)
x = actFn(x);
x = dropout(x, dropProb);
return x;
}
static inline
Expr layerNorm(Expr x, std::string prefix, std::string suffix = std::string()) {
int dimModel = x->shape()[-1];
auto scale = x->graph()->param(prefix + "_ln_scale" + suffix, { 1, dimModel }, inits::ones());
auto bias = x->graph()->param(prefix + "_ln_bias" + suffix, { 1, dimModel }, inits::zeros());
return marian::layerNorm(x, scale, bias, 1e-6f);
}
} // namespace marian
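A hypothetical use of the two helpers above, mirroring how `generic.cpp` wires them into the lemma-conditioning block (prefix, dimensions, and dropout rate are illustrative):

```cpp
// residual feed-forward block built from denseInline() and layerNorm()
static Expr feedForwardBlock(Expr x, const std::string& prefix) {
  int dimModel = x->shape()[-1];
  auto f = denseInline(x, prefix + "_ffn", /*suffix=*/"1", dimModel * 2,
                       (ActivationFunction*)relu, /*dropProb=*/0.1f);
  f = denseInline(f, prefix + "_ffn", /*suffix=*/"2", dimModel);
  return layerNorm(f + x, prefix + "_ffn"); // add & norm
}
```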


@ -18,29 +18,36 @@ static inline RationalLoss guidedAlignmentCost(Ptr<ExpressionGraph> graph,
Expr alignmentLoss; // sum up loss over all attention/alignment positions
size_t numLabels;
if(guidedLossType == "ce") {
// ce requires normalized probabilities
// normalizedAlignment is multi-hot, but ce requires normalized probabilities, so need to normalize to P(s|t)
auto dimBatch = shape[-2];
auto dimTrgWords = shape[-1];
auto dimSrcWords = shape[-3];
ABORT_IF(shape[-4] != 1, "Guided alignments with beam??");
auto normalizedAlignment = batch->getGuidedAlignment();
auto normalizedAlignment = batch->getGuidedAlignment(); // [dimSrcWords, dimBatch, dimTrgWords] flattened, matches shape of 'attention'
auto srcBatch = batch->front();
const auto& srcMask = srcBatch->mask();
ABORT_IF(shape.elements() != normalizedAlignment.size(), "Attention-matrix and alignment shapes differ??");
ABORT_IF(dimBatch != batch->size() || dimTrgWords != batch->widthTrg() || dimSrcWords != batch->width(), "Attention-matrix and batch shapes differ??");
auto locate = [=](size_t s, size_t b, size_t t) { return ((s * dimBatch) + b) * dimTrgWords + t; };
for (size_t b = 0; b < dimBatch; b++) {
for (size_t t = 0; t < dimTrgWords; t++) {
for (size_t s = 0; s < dimSrcWords; s++)
ABORT_IF(locate(s, b, t) != batch->locateInGuidedAlignments(b, s, t), "locate() and locateInGuidedAlignments() differ??");
// renormalize the alignment such that it sums up to 1
float sum = 0;
for (size_t s = 0; s < dimSrcWords; s++)
sum += normalizedAlignment[locate(s, b, t)];
sum += srcMask[srcBatch->locate(b, s)] * normalizedAlignment[locate(s, b, t)]; // these values are 0 or 1
if (sum != 0 && sum != 1)
for (size_t s = 0; s < dimSrcWords; s++)
normalizedAlignment[locate(s, b, t)] /= sum;
}
}
auto alignment = constant_like(attention, inits::fromVector(normalizedAlignment));
auto alignment = constant_like(attention, std::move(normalizedAlignment));
alignmentLoss = -sum(flatten(alignment * log(attention + epsilon)));
numLabels = batch->back()->batchWords();
ABORT_IF(numLabels > shape.elements() / shape[-3], "Num labels of guided alignment cost is off??");
} else {
auto alignment = constant_like(attention, inits::fromVector(batch->getGuidedAlignment()));
auto alignment = constant_like(attention, batch->getGuidedAlignment());
if(guidedLossType == "mse")
alignmentLoss = sum(flatten(square(attention - alignment))) / 2.f;
else if(guidedLossType == "mult") // @TODO: I don't know what this criterion is for. Can we remove it?
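A minimal standalone sketch of the renormalization added above (values are hypothetical): for each target position, the multi-hot source alignments are masked by the source mask and rescaled so they sum to 1.

```cpp
#include <vector>

// alignments for one (batch entry, target position) over all source positions
void renormalize(std::vector<float>& align, const std::vector<float>& srcMask) {
  float sum = 0.f;
  for (size_t s = 0; s < align.size(); ++s)
    sum += srcMask[s] * align[s];   // masked-out source positions contribute nothing
  if (sum != 0.f && sum != 1.f)     // e.g. {1, 1, 0} with full mask -> {0.5, 0.5, 0}
    for (auto& a : align)
      a /= sum;
}
```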


@ -206,7 +206,7 @@ private:
virtual Expr accumulateLoss(const RationalLoss& current) override {
if(loss_) {
const auto& first = partialLosses_.front();
return loss_ + first.count() * (current.loss() / current.count()); // scale up/down to match scale of first loss
return loss_ + current.loss() * first.count() / current.count(); // scale up/down to match scale of first loss
} else {
return current.loss(); // first reference loss, keeps to scale with this one
}
@ -344,8 +344,8 @@ protected:
// for bert training or classification the time dimension is lost.
// Here safeguard against 2d classifier output, adds 1 on the left, non-op.
Expr ce = cast(cross_entropy(logits, indices), Type::float32);
if (inFactor) {
LOG_ONCE("scaling factor losses with weight {}", factorWeight_);
if (inFactor && factorWeight_ != 1.0f) {
LOG_ONCE(info, "scaling factor losses with weight {}", factorWeight_);
ce = ce * factorWeight_;
}
if (labelSmoothing_ > 0) {

src/models/states.h Normal file → Executable file

@ -9,7 +9,7 @@ namespace marian {
class EncoderState {
private:
Expr context_;
Expr mask_;
Expr mask_; // [beam depth=1, max length, batch size, vector dim=1] source mask
Ptr<data::CorpusBatch> batch_;
public:
@ -18,9 +18,9 @@ public:
EncoderState() {}
virtual Expr getContext() { return context_; }
virtual Expr getAttended() { return context_; }
virtual Expr getMask() { return mask_; }
virtual Expr getContext() const { return context_; }
virtual Expr getAttended() const { return context_; }
virtual Expr getMask() const { return mask_; } // source batch mask; may have additional positions suppressed
virtual const Words& getSourceWords() {
return batch_->front()->data();


@ -142,29 +142,6 @@ public:
return reshape(output, {dimBeam, dimBatch, dimSteps, dimModel});
}
// like affine() but with built-in parameters, activation, and dropout
static inline
Expr dense(Expr x, std::string prefix, std::string suffix, int outDim, const std::function<Expr(Expr)>& actFn = nullptr, float dropProb = 0.0f)
{
auto graph = x->graph();
auto W = graph->param(prefix + "_W" + suffix, { x->shape()[-1], outDim }, inits::glorotUniform());
auto b = graph->param(prefix + "_b" + suffix, { 1, outDim }, inits::zeros());
x = affine(x, W, b);
if (actFn)
x = actFn(x);
x = dropout(x, dropProb);
return x;
}
Expr layerNorm(Expr x, std::string prefix, std::string suffix = std::string()) const {
int dimModel = x->shape()[-1];
auto scale = graph_->param(prefix + "_ln_scale" + suffix, { 1, dimModel }, inits::ones());
auto bias = graph_->param(prefix + "_ln_bias" + suffix, { 1, dimModel }, inits::zeros());
return marian::layerNorm(x, scale, bias, 1e-6f);
}
Expr preProcess(std::string prefix, std::string ops, Expr input, float dropProb = 0.0f) const {
auto output = input;
for(auto op : ops) {
@ -192,7 +169,7 @@ public:
// highway connection
else if(op == 'h') {
int dimModel = input->shape()[-1];
auto t = dense(prevInput, prefix, /*suffix=*/"h", dimModel);
auto t = denseInline(prevInput, prefix, /*suffix=*/"h", dimModel);
output = highway(output, prevInput, t);
}
// layer normalization
@ -402,8 +379,8 @@ public:
// the stack of FF layers
for(int i = 1; i < depthFfn; ++i)
output = dense(output, prefix, /*suffix=*/std::to_string(i), dimFfn, actFn, ffnDropProb);
output = dense(output, prefix, /*suffix=*/std::to_string(depthFfn), dimModel);
output = denseInline(output, prefix, /*suffix=*/std::to_string(i), dimFfn, actFn, ffnDropProb);
output = denseInline(output, prefix, /*suffix=*/std::to_string(depthFfn), dimModel);
auto opsPost = opt<std::string>("transformer-postprocess");
output
@ -430,14 +407,14 @@ public:
// the stack of AAN layers
for(int i = 1; i < depthAan; ++i)
y = dense(y, prefix, /*suffix=*/std::to_string(i), dimAan, actFn, aanDropProb);
y = denseInline(y, prefix, /*suffix=*/std::to_string(i), dimAan, actFn, aanDropProb);
if(y->shape()[-1] != dimModel) // bring it back to the desired dimension if needed
y = dense(y, prefix, std::to_string(depthAan), dimModel);
y = denseInline(y, prefix, std::to_string(depthAan), dimModel);
bool noGate = opt<bool>("transformer-aan-nogate");
if(!noGate) {
auto gi = dense(x, prefix, /*suffix=*/"i", dimModel, (ActivationFunction*)sigmoid);
auto gf = dense(y, prefix, /*suffix=*/"f", dimModel, (ActivationFunction*)sigmoid);
auto gi = denseInline(x, prefix, /*suffix=*/"i", dimModel, (ActivationFunction*)sigmoid);
auto gf = denseInline(y, prefix, /*suffix=*/"f", dimModel, (ActivationFunction*)sigmoid);
y = gi * x + gf * y;
}
@ -533,28 +510,29 @@ public:
batchEmbeddings = addSpecialEmbeddings(batchEmbeddings, /*start=*/0, batch);
// reorganize batch and timestep
batchEmbeddings = atleast_nd(batchEmbeddings, 4);
batchMask = atleast_nd(batchMask, 4);
auto layer = transposeTimeBatch(batchEmbeddings); // [-4: beam depth=1, -3: batch size, -2: max length, -1: vector dim]
auto layerMask
= reshape(transposeTimeBatch(batchMask), {1, dimBatch, 1, dimSrcWords}); // [-4: beam depth=1, -3: batch size, -2: vector dim=1, -1: max length]
batchEmbeddings = atleast_nd(batchEmbeddings, 4); // [beam depth=1, max length, batch size, vector dim]
batchMask = atleast_nd(batchMask, 4); // [beam depth=1, max length, batch size, vector dim=1]
auto layer = transposeTimeBatch(batchEmbeddings); // [beam depth=1, batch size, max length, vector dim]
auto layerMask = transposeTimeBatch(batchMask); // [beam depth=1, batch size, max length, vector dim=1]
auto opsEmb = opt<std::string>("transformer-postprocess-emb");
float dropProb = inference_ ? 0 : opt<float>("transformer-dropout");
layer = preProcess(prefix_ + "_emb", opsEmb, layer, dropProb);
layerMask = transposedLogMask(layerMask); // [-4: batch size, -3: 1, -2: vector dim=1, -1: max length]
// LayerAttention expects mask in a different layout
layerMask = reshape(layerMask, {1, dimBatch, 1, dimSrcWords}); // [1, batch size, 1, max length]
layerMask = transposedLogMask(layerMask); // [batch size, num heads broadcast=1, max length broadcast=1, max length]
// apply encoder layers
// This is the Transformer Encoder stack.
auto encDepth = opt<int>("enc-depth");
for(int i = 1; i <= encDepth; ++i) {
layer = LayerAttention(prefix_ + "_l" + std::to_string(i) + "_self",
layer, // query
layer, // keys
layer, // values
layerMask);
layerMask); // [batch size, num heads broadcast=1, max length broadcast=1, max length]
layer = LayerFFN(prefix_ + "_l" + std::to_string(i) + "_ffn", layer);
}
@ -698,12 +676,14 @@ public:
std::vector<Expr> encoderContexts;
std::vector<Expr> encoderMasks;
for(auto encoderState : state->getEncoderStates()) {
auto encoderContext = encoderState->getContext();
auto encoderMask = encoderState->getMask();
auto encoderContext = encoderState->getContext(); // encoder output
auto encoderMask = encoderState->getMask(); // note: may differ from Encoder self-attention mask in that additional positions are banned for cross-attention
encoderMask = atleast_nd(encoderMask, 4);
encoderContext = transposeTimeBatch(encoderContext); // [beam depth=1, batch size, max length, vector dim]
encoderMask = transposeTimeBatch(encoderMask); // [beam depth=1, max length, batch size, vector dim=1]
encoderContext = transposeTimeBatch(encoderContext); // [-4: beam depth=1, -3: batch size, -2: max length, -1: vector dim]
int dimSrcWords = encoderContext->shape()[-2];
// This would happen if something goes wrong during batch pruning.
@ -712,10 +692,9 @@ public:
encoderContext->shape()[-3],
dimBatch);
encoderMask = atleast_nd(encoderMask, 4);
encoderMask = reshape(transposeTimeBatch(encoderMask),
{1, dimBatch, 1, dimSrcWords});
encoderMask = transposedLogMask(encoderMask);
// LayerAttention expects mask in a different layout
encoderMask = reshape(encoderMask, { 1, dimBatch, 1, dimSrcWords }); // [1, batch size, 1, max length]
encoderMask = transposedLogMask(encoderMask); // [batch size, num heads broadcast=1, max length broadcast=1, max length]
if(dimBeam > 1)
encoderMask = repeat(encoderMask, dimBeam, /*axis=*/ -4);

src/optimizers/optimizers.cpp Normal file → Executable file

src/tensors/gpu/tensor_operators.cu Normal file → Executable file

@ -980,7 +980,7 @@ __global__ void gPasteRows(T* out,
const IndexType* targetRowIdx,
size_t rows) {
for(int bid = 0; bid < rows; bid += gridDim.x) {
int j = bid + blockIdx.x;
int j = bid + blockIdx.x; // index into 'indices' vector
if(j < rows) {
size_t dstId = targetRowIdx[j];
size_t srcId = j;
@ -988,11 +988,15 @@ __global__ void gPasteRows(T* out,
T* rowOut = out + dstId * cols;
const T* rowIn = in + srcId * cols;
// aggregate the entire row
for(int tid = 0; tid < cols; tid += blockDim.x) {
int i = tid + threadIdx.x;
int i = tid + threadIdx.x; // column index --@TODO: column index should be called 'j'
if(i < cols) {
// @TODO: Do we need to get rid of this atomic add? It seems slow for fp16
atomics::atomicAdd(rowOut + i, rowIn[i]);
// Note: atomicAdd() not needed if number of blocks is 1. Avoid it because it is slow for fp16.
if (gridDim.x == 1)
rowOut[i] += rowIn[i];
else
atomics::atomicAdd(rowOut + i, rowIn[i]);
}
}
}
@ -1011,7 +1015,15 @@ void PasteRows(Tensor out,
size_t rowsToCopy = indices->size();
int threads = std::min(MAX_THREADS, (int)cols);
#if 1 // @TODO: make this configurable with a 'deterministic' flag
// If we only use one block, then each core operates on a different column,
// hence the summation becomes deterministic.
// However, we only use e.g. 512 cores out of possibly 3000+, so this will be
// 6 x slower in this example.
int blocks = 1;
#else
int blocks = std::min(MAX_BLOCKS, (int)rowsToCopy);
#endif
if(out->type() == Type::float32) {
gPasteRows<<<blocks, threads>>>(


@ -401,15 +401,20 @@ void SyncGraphGroup::update(std::vector<Ptr<data::Batch>> subBatches, size_t num
paramsAvg_[idx], curParam, scheduler_->numberOfBatches(), updateTrgWords);
};
comm_->scatterReduceAndResetGrads(); // reduce gradients across all devices (globally) into shards
comm_->foreach(update); // per-shard model-update
comm_->allGatherParams(); // distribute param value shards back
// cost across all local devices (scheduler will aggregate cross-process)
StaticLoss localLoss;
for(auto& l : localDeviceLosses) // localDeviceLosses is already summed up over delay steps
localLoss += l;
// model update
if (std::isfinite(localLoss.loss) || mpi_->numMPIProcesses() > 1) { // guard against NaN (except with MPI, as this simple way could hang it)
comm_->scatterReduceAndResetGrads(); // reduce gradients across all devices and MPI nodes into shards
comm_->foreach(update); // per-shard model-update
comm_->allGatherParams(); // distribute param value shards back
}
else
LOG(info, "[training] skipping {}-th update due to loss being {}", scheduler_->numberOfBatches(), localLoss.loss);
if(scheduler_) {
// track and log localLoss
scheduler_->update(localLoss, numReadBatches, effectiveBatchSize, effectiveBatchTrgWords, mpi_);

src/training/scheduler.h Normal file → Executable file

@ -358,7 +358,7 @@ public:
&& heartBeatTimer_.elapsed<std::chrono::minutes>() >= 10) {
printf("PROGRESS: %.2f%%\nEVALERR: %.7f%%\n",
(double)state_->epochs,
state_->costSum / state_->costCount / (mpi ? mpi->numMPIProcesses() : 1));
state_->costSum / (state_->costCount ? state_->costCount : 1) / (mpi ? mpi->numMPIProcesses() : 1));
fflush(stdout);
std::cout << "MBSIZE: " << batchLabels << " after " << state_->batches << " updates = " << state_->labelsTotal << " labels" << std::endl << std::flush;
heartBeatTimer_.start();


@ -99,12 +99,17 @@ public:
// starting with the lemma, then adding factors one by one.
if (factorGroup == 0) {
word = factoredVocab->lemma2Word(shortlist ? shortlist->reverseMap(wordIdx) : wordIdx); // @BUGBUG: reverseMap is only correct if factoredVocab_->getGroupRange(0).first == 0
//std::vector<size_t> factorIndices; factoredVocab->word2factors(word, factorIndices);
//LOG(info, "new lemma {},{}={} -> {}->{}", word.toWordIndex(), factorIndices[0], factoredVocab->word2string(word), prevHyp->getPathScore(), pathScore);
std::vector<size_t> factorIndices; factoredVocab->word2factors(word, factorIndices);
//LOG(info, "{} + {} ({}) -> {} -> {}",
// factoredVocab->decode(prevHyp->tracebackWords()),
// factoredVocab->word2string(word), factorIndices[0], prevHyp->getPathScore(), pathScore);
}
else {
//LOG(info, "expand word {}={} with factor[{}] {} -> {}->{}", beam[beamHypIdx]->getWord().toWordIndex(),
// factoredVocab->word2string(beam[beamHypIdx]->getWord()), factorGroup, wordIdx, prevHyp->getPathScore(), pathScore);
//LOG(info, "{} |{} ({}) = {} ({}) -> {} -> {}",
// factoredVocab->decodeForDiagnostics(beam[beamHypIdx]->tracebackWords()),
// factoredVocab->getFactorGroupPrefix(factorGroup), factorGroup,
// factoredVocab->getFactorName(factorGroup, wordIdx), wordIdx,
// prevHyp->getPathScore(), pathScore);
word = beam[beamHypIdx]->getWord();
ABORT_IF(!factoredVocab->canExpandFactoredWord(word, factorGroup),
"A word without this factor snuck through to here??");