From 8470c16bdd92797e75e7d5e7397ef7b29896538a Mon Sep 17 00:00:00 2001 From: Roman Grundkiewicz Date: Thu, 16 Sep 2021 02:00:00 +0000 Subject: [PATCH 01/11] Merged PR 20230: Add option for running regression tests only in Azure Pipelines This PR adds a checkbox which can be unchecked to skip running compilation checks when triggering them manually. It is useful for generating expected outputs on different CPUs for tests using 8-bit models. --- azure-pipelines.yml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 4f7ce02d..d4d0b2e5 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -6,6 +6,13 @@ # 3. Choose "Existing Azure Pipelines YAML file" and specify path to this file # 4. "More actions" > "Save" +parameters: +# Allow skipping the entire 'Build' stage +- name: runBuilds + displayName: Run builds? Uncheck to run regression tests only. + type: boolean + default: true + # The pipeline CI trigger is set on the branch master only and PR trigger on a # (non-draft) pull request to any branch trigger: @@ -45,6 +52,7 @@ stages: ###################################################################### - job: BuildWindows + condition: eq(${{ parameters.runBuilds }}, true) displayName: Windows strategy: @@ -180,6 +188,7 @@ stages: ###################################################################### - job: BuildUbuntu + condition: eq(${{ parameters.runBuilds }}, true) displayName: Ubuntu timeoutInMinutes: 90 @@ -322,6 +331,7 @@ stages: ###################################################################### - job: BuildUbuntuMinimal + condition: eq(${{ parameters.runBuilds }}, true) displayName: Ubuntu CPU+GPU gcc-5 cmake 3.5 pool: @@ -368,6 +378,7 @@ stages: ###################################################################### - job: BuildMacOS + condition: eq(${{ parameters.runBuilds }}, true) displayName: macOS CPU clang pool: @@ -416,6 +427,7 @@ stages: ###################################################################### - job: BuildInstall + condition: eq(${{ parameters.runBuilds }}, true) displayName: Linux CPU library install pool: From aa58ba8e239d228d539734e6be8266fbb3181044 Mon Sep 17 00:00:00 2001 From: Roman Grundkiewicz Date: Mon, 20 Sep 2021 13:14:24 +0000 Subject: [PATCH 02/11] Merged PR 20593: Fix and update Azure pipelines - Add `--allow-unauthenticated` to `apt` when installing CUDA on Ubuntu - Removing `ubuntu-16.04` image from Azure pipelines, which will become unavailable after September 20 --- azure-pipelines.yml | 26 +++++++------------------- scripts/ci/install_cuda_ubuntu.sh | 2 +- 2 files changed, 8 insertions(+), 20 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index d4d0b2e5..7953b282 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -246,17 +246,7 @@ stages: examples: true static: true ################################################################ - # Ubuntu 16.04 supports CUDA 8+ - "16.04 CUDA 9.2 gcc-7": - image: ubuntu-16.04 - boost: true - cpu: true - gpu: true - cuda: 9.2 - gcc: 7 - unit_tests: true - examples: true - static: false + # Ubuntu 16.04 is no longer available on Azure-hosted machines pool: vmImage: $(image) @@ -332,18 +322,16 @@ stages: ###################################################################### - job: BuildUbuntuMinimal condition: eq(${{ parameters.runBuilds }}, true) - displayName: Ubuntu CPU+GPU gcc-5 cmake 3.5 + displayName: Ubuntu CPU+GPU gcc-7 cmake 3.5 pool: - vmImage: ubuntu-16.04 + vmImage: ubuntu-18.04 steps: - checkout: self 
submodules: true # The script simplifies installation of different versions of CUDA. - # Ubuntu 16.04 on Azure-hosted VMs have GCC 5.5 as gcc-5, which is not compatible with CUDA 9. - # Downgrading to GCC 5.4 (the default gcc on Ubuntu 16.04) would be more work... - bash: ./scripts/ci/install_cuda_ubuntu.sh "10.0" displayName: Install CUDA @@ -356,10 +344,10 @@ stages: # GCC 5 is the minimum version supported - bash: | - /usr/bin/gcc-5 --version + /usr/bin/gcc-7 --version mkdir -p build cd build - CC=/usr/bin/gcc-5 CXX=/usr/bin/g++-5 CUDAHOSTCXX=/usr/bin/g++-5 \ + CC=/usr/bin/gcc-7 CXX=/usr/bin/g++-7 CUDAHOSTCXX=/usr/bin/g++-7 \ ../cmake-3.5.1-Linux-x86_64/bin/cmake .. \ -DCOMPILE_CPU=on \ -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-10.0 @@ -592,7 +580,7 @@ stages: # Avoid using $(Build.SourcesDirectory) in bash tasks because on Windows pools it uses '\' # instead of '/', which often breaks the job - - bash: MARIAN=../marian-dev/build bash ./run_mrt.sh '#cpu' '#basics' + - bash: MARIAN=../marian-dev/build TIMEOUT=10m bash ./run_mrt.sh '#cpu' '#basics' '#devops' continueOnError: true displayName: Run tests workingDirectory: marian-prod-tests @@ -689,7 +677,7 @@ stages: AWS_SECRET_SAS_TOKEN: $(blob-sas-token) workingDirectory: marian-prod-tests - - bash: MARIAN=../marian-dev/build bash ./run_mrt.sh '#cpu' '#basics' + - bash: MARIAN=../marian-dev/build bash ./run_mrt.sh '#cpu' '#basics' '#devops' continueOnError: true displayName: Run tests workingDirectory: marian-prod-tests diff --git a/scripts/ci/install_cuda_ubuntu.sh b/scripts/ci/install_cuda_ubuntu.sh index 8dc77eda..b058294a 100755 --- a/scripts/ci/install_cuda_ubuntu.sh +++ b/scripts/ci/install_cuda_ubuntu.sh @@ -91,7 +91,7 @@ sudo add-apt-repository "deb ${REPO_URL} /" sudo apt-get update echo "Installing CUDA packages ${CUDA_PACKAGES}" -sudo apt-get -y install ${CUDA_PACKAGES} +sudo apt-get -y --allow-unauthenticated install ${CUDA_PACKAGES} if [[ $? -ne 0 ]]; then echo "CUDA Installation Error." From d796a3c3b7779993660e672f2a47f5cdd685a174 Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Tue, 28 Sep 2021 17:17:12 +0000 Subject: [PATCH 03/11] Merged PR 20839: Do not ignore ignoreEOS for spm decoding With final space this eliminates trailing whitespace caused by appending EOS --- src/data/sentencepiece_vocab.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/data/sentencepiece_vocab.cpp b/src/data/sentencepiece_vocab.cpp index 090d478b..8f774c2b 100644 --- a/src/data/sentencepiece_vocab.cpp +++ b/src/data/sentencepiece_vocab.cpp @@ -236,18 +236,20 @@ public: return words; } - std::string decode(const Words& sentence, bool /*ignoreEOS*/) const override { + std::string decode(const Words& sentence, bool ignoreEOS) const override { std::string line; if(keepEncoded_) { // i.e. 
keep the sentence segmented into subword units for(const Word& id : sentence) - line += (*this)[id] + " "; + if(!ignoreEOS || id != getEosId()) + line += (*this)[id] + " "; line.pop_back(); // trim the trailing whitespace } else { // convert vector of Word to vector of int std::vector spmSentence; spmSentence.reserve(sentence.size()); for(auto&& word : sentence) - spmSentence.push_back(word.toWordIndex()); + if(!ignoreEOS || word != getEosId()) + spmSentence.push_back(word.toWordIndex()); spm_->Decode(spmSentence, &line); } return line; From 03fe1758763c99dd55bcf6c1c5e0e1dd60ae4e1a Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Tue, 28 Sep 2021 17:19:07 +0000 Subject: [PATCH 04/11] Merged PR 20879: Adjustable ffn width and depth in transformer decoder --- src/common/config_parser.cpp | 8 +++++++- src/models/encoder_decoder.cpp | 2 ++ src/models/transformer.h | 21 +++++++++++++++------ 3 files changed, 24 insertions(+), 7 deletions(-) diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp index d7818afb..b3e8950b 100644 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -255,10 +255,16 @@ void ConfigParser::addOptionsModel(cli::CLIWrapper& cli) { "Pool encoder states instead of using cross attention (selects first encoder state, best used with special token)"); cli.add("--transformer-dim-ffn", "Size of position-wise feed-forward network (transformer)", - 2048); + 2048); + cli.add("--transformer-decoder-dim-ffn", + "Size of position-wise feed-forward network in decoder (transformer). Uses --transformer-dim-ffn if 0.", + 0); cli.add("--transformer-ffn-depth", "Depth of filters (transformer)", 2); + cli.add("--transformer-decoder-ffn-depth", + "Depth of filters in decoder (transformer). Uses --transformer-ffn-depth if 0", + 0); cli.add("--transformer-ffn-activation", "Activation between filters: swish or relu (transformer)", "swish"); diff --git a/src/models/encoder_decoder.cpp b/src/models/encoder_decoder.cpp index 8fc9321a..a7a398e7 100644 --- a/src/models/encoder_decoder.cpp +++ b/src/models/encoder_decoder.cpp @@ -38,7 +38,9 @@ EncoderDecoder::EncoderDecoder(Ptr graph, Ptr options) modelFeatures_.insert("transformer-heads"); modelFeatures_.insert("transformer-no-projection"); modelFeatures_.insert("transformer-dim-ffn"); + modelFeatures_.insert("transformer-decoder-dim-ffn"); modelFeatures_.insert("transformer-ffn-depth"); + modelFeatures_.insert("transformer-decoder-ffn-depth"); modelFeatures_.insert("transformer-ffn-activation"); modelFeatures_.insert("transformer-dim-aan"); modelFeatures_.insert("transformer-aan-depth"); diff --git a/src/models/transformer.h b/src/models/transformer.h index a792de8b..2393ad73 100644 --- a/src/models/transformer.h +++ b/src/models/transformer.h @@ -400,7 +400,7 @@ public: opt("transformer-heads"), /*cache=*/false); } - Expr LayerFFN(std::string prefix, Expr input) const { + Expr LayerFFN(std::string prefix, Expr input, bool isDecoder=false) const { int dimModel = input->shape()[-1]; float dropProb = inference_ ? 0 : opt("transformer-dropout"); @@ -408,13 +408,22 @@ public: auto output = preProcess(prefix + "_ffn", opsPre, input, dropProb); auto actName = opt("transformer-ffn-activation"); + int dimFfn = opt("transformer-dim-ffn"); int depthFfn = opt("transformer-ffn-depth"); - float ffnDropProb - = inference_ ? 
0 : opt("transformer-dropout-ffn"); - + if(isDecoder) { + int decDimFfn = opt("transformer-decoder-dim-ffn", 0); + if(decDimFfn != 0) + dimFfn = decDimFfn; + + int decDepthFfn = opt("transformer-decoder-ffn-depth", 0); + if(decDepthFfn != 0) + depthFfn = decDepthFfn; + } + ABORT_IF(depthFfn < 1, "Filter depth {} is smaller than 1", depthFfn); - + + float ffnDropProb = inference_ ? 0 : opt("transformer-dropout-ffn"); auto initFn = inits::glorotUniform(true, true, depthScaling_ ? 1.f / sqrtf((float)depth_) : 1.f); // the stack of FF layers @@ -861,7 +870,7 @@ public: // remember decoder state decoderStates.push_back(decoderState); - query = LayerFFN(prefix_ + "_l" + layerNo + "_ffn", query); // [-4: beam depth=1, -3: batch size, -2: max length, -1: vector dim] + query = LayerFFN(prefix_ + "_l" + layerNo + "_ffn", query, /*isDecoder=*/true); // [-4: beam depth=1, -3: batch size, -2: max length, -1: vector dim] checkpoint(query); } From 2d79ad02bb66d7e0ba264defbf5ff9b47c70ba74 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Wed, 13 Oct 2021 20:20:14 +0000 Subject: [PATCH 05/11] Merged PR 20933: beam & batch works for n on-factored models --- src/layers/output.cpp | 22 ++++++++++++++++------ src/translator/beam_search.cpp | 5 +++-- src/translator/nth_element.cpp | 2 ++ 3 files changed, 21 insertions(+), 8 deletions(-) diff --git a/src/layers/output.cpp b/src/layers/output.cpp index 92cccdfb..af72b794 100644 --- a/src/layers/output.cpp +++ b/src/layers/output.cpp @@ -313,14 +313,24 @@ Logits Output::applyAsLogits(Expr input) /*override final*/ { } return Logits(std::move(allLogits), factoredVocab_); } else if(shortlist_) { - return Logits(affineOrDot(input, - shortlist_->getCachedShortWt(), - shortlist_->getCachedShortb(), + const Shape &inputShape = input->shape(); + assert(inputShape[1] == 1); // time dimension always 1 for decoding + input = reshape(input, {inputShape[0], inputShape[2], 1, inputShape[3]}); + + Expr Wt = shortlist_->getCachedShortWt(); + Expr b = shortlist_->getCachedShortb(); + Expr ret = affineShortlist(input, + Wt, + b, false, - /*transB=*/isLegacyUntransposedW ? false : true)); + /*transB=*/isLegacyUntransposedW ? false : true); + const Shape &retShape = ret->shape(); + assert(retShape[2] == 1); // time dimension always 1 for decoding + ret = reshape(ret, {retShape[0], 1, retShape[1], retShape[3]}); + return Logits(ret); } else { - return Logits( - affineOrDot(input, Wt_, b_, false, /*transB=*/isLegacyUntransposedW ? false : true)); + Expr ret = affineOrDot(input, Wt_, b_, false, /*transB=*/isLegacyUntransposedW ? false : true); + return Logits(ret); } } diff --git a/src/translator/beam_search.cpp b/src/translator/beam_search.cpp index 2a0d3947..580895f2 100644 --- a/src/translator/beam_search.cpp +++ b/src/translator/beam_search.cpp @@ -94,7 +94,7 @@ Beams BeamSearch::toHyps(const std::vector& nBestKeys, // [current // For factored decoding, the word is built over multiple decoding steps, // starting with the lemma, then adding factors one by one. if (factorGroup == 0) { - word = factoredVocab->lemma2Word(shortlist ? shortlist->reverseMap((int) prevBeamHypIdx, (int) currentBatchIdx, wordIdx) : wordIdx); // @BUGBUG: reverseMap is only correct if factoredVocab_->getGroupRange(0).first == 0 + word = factoredVocab->lemma2Word(shortlist ? 
         std::vector factorIndices;  factoredVocab->word2factors(word, factorIndices);
         //LOG(info, "{} + {} ({}) -> {} -> {}",
         //    factoredVocab->decode(prevHyp->tracebackWords()),
@@ -115,7 +115,7 @@ Beams BeamSearch::toHyps(const std::vector& nBestKeys, // [current
         }
       }
       else if (shortlist)
-        word = Word::fromWordIndex(shortlist->reverseMap((int) prevBeamHypIdx, (int) origBatchIdx, wordIdx));
+        word = Word::fromWordIndex(shortlist->reverseMap((int) prevBeamHypIdx, (int) currentBatchIdx, wordIdx));
       else
         word = Word::fromWordIndex(wordIdx);

@@ -330,6 +330,7 @@ Histories BeamSearch::search(Ptr graph, Ptr
   auto prevBatchIdxMap = batchIdxMap; // [origBatchIdx -> currentBatchIdx] but shifted by one time step
   // main loop over output time steps
   for (size_t t = 0; ; t++) {
+    //std::cerr << "\nstep=" << t << std::endl;
     ABORT_IF(origDimBatch != beams.size(), "Lost a batch entry??");
     // determine beam size for next output time step, as max over still-active sentences
     // E.g. if all batch entries are down from beam 5 to no more than 4 surviving hyps, then
diff --git a/src/translator/nth_element.cpp b/src/translator/nth_element.cpp
index 237d9b9d..dbcceec4 100644
--- a/src/translator/nth_element.cpp
+++ b/src/translator/nth_element.cpp
@@ -3,7 +3,9 @@
  * SPDX-License-Identifier: MIT
  */

+#include "common/utils.h"
 #include "translator/nth_element.h"
+
 #include 
 #include 
 #include 

From 7f06f3c5d2035dac0cb4349bf29fbfa3e6bb5448 Mon Sep 17 00:00:00 2001
From: Roman Grundkiewicz
Date: Tue, 26 Oct 2021 11:20:41 +0000
Subject: [PATCH 06/11] Merged PR 21166: Keep building on macOS-10.15

Marian does not compile on macOS 11.6, so the build has stopped working due to
an upgrade from macOS-10.15 to macOS 11.6 in Azure Pipelines:
https://github.com/actions/virtual-environments/issues/4060

This PR explicitly sets macOS 10.15 in the workflow.
---
 azure-pipelines.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 7953b282..bc76f85c 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -370,7 +370,7 @@ stages:
   displayName: macOS CPU clang

   pool:
-    vmImage: macos-latest
+    vmImage: macos-10.15

   steps:
   - checkout: self

From 1404201926b5b4e27993776d52dfac809e8556f4 Mon Sep 17 00:00:00 2001
From: Marcin Junczys-Dowmunt
Date: Tue, 26 Oct 2021 20:25:39 +0000
Subject: [PATCH 07/11] Merged PR 21151: Cleaning up fp16 behavior

This PR improves clipping and pruning behavior of NaNs and Infs during fp16
training, ultimately avoiding the underflow problems that we were facing so far.
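As a rough sketch of the resulting behavior (the values are the new defaults
wired into the `fp16` alias; the command line itself is only illustrative):

    # --fp16 now expands to:
    marian --precision float16 float32 --cost-scaling 256.f 1000 2.f 256.f

    # i.e. start with a scaling factor of 256, multiply it by 2 after every
    # 1000 NaN/Inf-free updates, and divide it by 2 when a NaN/Inf is seen,
    # but never go below 256. Infinities in the gradient are clipped to
    # +/-65504 (the largest finite fp16 value), and once the factor sits at
    # the minimum, NaNs are pruned to 0 instead of skipping the update.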
---
 src/common/aliases.cpp                 |   4 +-
 src/common/config_parser.cpp           |   6 +-
 src/common/definitions.h               |  10 +-
 src/models/transformer.h               |  15 ++-
 src/tensors/cpu/tensor_operators.cpp   |   4 +
 src/tensors/gpu/element.cu             |  12 +-
 src/tensors/gpu/tensor_operators.cu    | 147 ++++++++++++++++++-------
 src/tensors/tensor_operators.h         |  19 ++++
 src/training/graph_group.cpp           | 118 ++++++++++----------
 src/training/graph_group.h             |  17 ++-
 src/training/graph_group_async.cpp     |   6 +-
 src/training/graph_group_singleton.cpp |   8 +-
 src/training/graph_group_sync.cpp      |   8 +-
 13 files changed, 233 insertions(+), 141 deletions(-)

diff --git a/src/common/aliases.cpp b/src/common/aliases.cpp
index 0be26a8c..99574fe1 100644
--- a/src/common/aliases.cpp
+++ b/src/common/aliases.cpp
@@ -29,8 +29,8 @@ void ConfigParser::addAliases(cli::CLIWrapper& cli) {
   cli.alias("fp16", "true", [&](YAML::Node& config) {
     if(mode_ == cli::mode::training) {
       config["precision"] = std::vector({"float16", "float32"}); // inference type, optimization type, save type
-      // scaling factor (power of 2), frequency, multiplier at increase, tolerance, range, minium factor
-      config["cost-scaling"] = std::vector({"0", "1000", "2", "0.05", "10", "1e-5"});
+      // scaling factor, frequency, multiplier at increase, minimum scaling factor
+      config["cost-scaling"] = std::vector({"256.f", "1000", "2.f", "256.f"});
     } else {
       config["precision"] = std::vector({"float16"}); // for inference we do not need the other types
     }
diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp
index b3e8950b..51764cdc 100644
--- a/src/common/config_parser.cpp
+++ b/src/common/config_parser.cpp
@@ -522,15 +522,15 @@ void ConfigParser::addOptionsTraining(cli::CLIWrapper& cli) {
   // mixed precision training
   cli.add("--fp16",
       "Shortcut for mixed precision training with float16 and cost-scaling, "
-      "corresponds to: --precision float16 float32 --cost-scaling 0 1000 2 0.05 10 1e-5f");
+      "corresponds to: --precision float16 float32 --cost-scaling 256.f 1000 2.f 256.f");
   cli.add>("--precision",
      "Mixed precision training for forward/backward pass and optimizaton. "
      "Defines types for: forward/backward pass, optimization.",
      {"float32", "float32"});
   cli.add>("--cost-scaling",
      "Dynamic cost scaling for mixed precision training: "
-     "power of 2, scaling window, scaling factor, tolerance, range, minimum factor")
-    ->implicit_val("0.f 1000 2.f 0.05f 10 1e-5f");
+     "scaling factor, frequency, multiplier, minimum factor")
+    ->implicit_val("256.f 1000 2.f 256.f");
   cli.add("--gradient-norm-average-window",
      "Window size over which the exponential average of the gradient norm is recorded (for logging and scaling). "
      "After this many updates about 90% of the mass of the exponential average comes from these updates",
diff --git a/src/common/definitions.h b/src/common/definitions.h
index d2cf8aa4..d8a3ad46 100644
--- a/src/common/definitions.h
+++ b/src/common/definitions.h
@@ -106,24 +106,24 @@ using Weak = std::weak_ptr;
 /** @brief Creates shared_ptr of any type, passes all arguments to any available
  * constructor */
 template 
-Ptr New(Args&&... args) {
-  return Ptr(new T(std::forward(args)...));
+inline Ptr New(Args&&... args) {
+  return std::make_shared(std::forward(args)...);
 }

 template 
-Ptr New(Ptr p) {
+inline Ptr New(Ptr p) {
   return Ptr(p);
 }

 /** @brief Creates InstrusivePtr of any type, passes all arguments to any available
  * constructor */
 template 
-IPtr INew(Args&&... args) {
+inline IPtr INew(Args&&... 
args) { return IPtr(new T(std::forward(args)...)); } template -IPtr INew(Ptr p) { +inline IPtr INew(Ptr p) { return IPtr(p); } diff --git a/src/models/transformer.h b/src/models/transformer.h index 2393ad73..b2c0f6be 100644 --- a/src/models/transformer.h +++ b/src/models/transformer.h @@ -147,8 +147,7 @@ public: int dimDepth = dimModel / dimHeads; - auto output - = reshape(input, {dimBatch * dimBeam, dimSteps, dimHeads, dimDepth}); + auto output = reshape(input, {dimBatch * dimBeam, dimSteps, dimHeads, dimDepth}); return transpose(output, {0, 2, 1, 3}); // [dimBatch*dimBeam, dimHeads, dimSteps, dimDepth] } @@ -361,9 +360,9 @@ public: Expr LayerAttention(std::string prefix, Expr input, // [-4: beam depth, -3: batch size, -2: max length, -1: vector dim] - const Expr& keys, // [-4: beam depth=1, -3: batch size, -2: max length, -1: vector dim] - const Expr& values, // ...? - const Expr& mask, // [-4: batch size, -3: num heads broadcast=1, -2: max length broadcast=1, -1: max length] + Expr keys, // [-4: beam depth=1, -3: batch size, -2: max length, -1: vector dim] + Expr values, // ...? + Expr mask, // [-4: batch size, -3: num heads broadcast=1, -2: max length broadcast=1, -1: max length] int dimHeads, bool cache = false, bool saveAttentionWeights = false) { @@ -373,6 +372,12 @@ public: auto opsPre = opt("transformer-preprocess"); auto output = preProcess(prefix + "_Wo", opsPre, input, dropProb); + // fixes missing norm for keys and values in self-attention with pre-norm + if(input == keys) + keys = output; + if(input == values) + values = output; + // multi-head self-attention over previous input output = MultiHead(prefix, dimModel, dimHeads, output, keys, values, mask, cache, saveAttentionWeights); diff --git a/src/tensors/cpu/tensor_operators.cpp b/src/tensors/cpu/tensor_operators.cpp index 1afb8f64..f3964f91 100755 --- a/src/tensors/cpu/tensor_operators.cpp +++ b/src/tensors/cpu/tensor_operators.cpp @@ -24,6 +24,10 @@ void IsNaN(const Tensor /*in*/, Ptr /*allocator*/, bool& /*isNaN*/, b ABORT("Not implemented"); } +bool SanitizeGradient(marian::Tensor /*in*/, Ptr /*allocator*/, bool /*pruneNaN*/, bool /*clipInf*/) { + ABORT("Not implemented"); +} + template void CopyCastTo(To* out, const From* in, int length) { for(int i = 0; i < length; ++i) diff --git a/src/tensors/gpu/element.cu b/src/tensors/gpu/element.cu index 6790efd4..e9cbe081 100755 --- a/src/tensors/gpu/element.cu +++ b/src/tensors/gpu/element.cu @@ -29,7 +29,9 @@ __global__ void gElement( indices[i] = tensors[i].shape().bindex(dims); } - tensors[0].data()[index] = functional::apply(functor, tensors, indices); + // This performs the internal application of the functor in float32 regardless of the input type. + // It seems there are no speed penalties but improved precision. + tensors[0].data()[index] = (T)functional::applyWithCast(functor, tensors, indices); } } } @@ -65,13 +67,7 @@ void Element(Functor functor, Tensor out, Tensors... 
tensors) { ElementTyped(functor, out, tensors...); } else if(out->type() == Type::float16) { #if COMPILE_FP16 - std::vector ts({out, tensors...}); - bool div2 = std::all_of(ts.cbegin(), ts.cend(), [](marian::Tensor t){ return t->shape()[-1] % 2 == 0; }); - if(div2) { - ElementTyped(functor, out, tensors...); - } else { - ElementTyped(functor, out, tensors...); - } + ElementTyped(functor, out, tensors...); #else ABORT("FP16 not supported with chosen current hardware or CUDA version"); #endif diff --git a/src/tensors/gpu/tensor_operators.cu b/src/tensors/gpu/tensor_operators.cu index d55214bc..1347c3bb 100644 --- a/src/tensors/gpu/tensor_operators.cu +++ b/src/tensors/gpu/tensor_operators.cu @@ -16,15 +16,12 @@ namespace gpu { namespace atomics { static inline __device__ void atomicAdd(float *address, float val) { - //*address += val; ::atomicAdd(address, val); } #if COMPILE_FP16 // @TODO: copied from CuTorch, adapt this better, give credit. static inline __device__ void atomicAdd(half *address, half val) { - //*address += val; - #if __CUDA_ARCH__ >= 700 && CUDA_VERSION >= 10000 // compute capability 70 and higher with CUDA 10 ::atomicAdd(address, val); #else // __CUDA_ARCH__ < 700 @@ -50,7 +47,8 @@ static inline __device__ void atomicAdd(half *address, half val) { } while (assumed != old); #endif // __CUDA_ARCH__ } -#endif +#endif // COMPILE_FP16 + } @@ -96,6 +94,81 @@ void IsNaN(const Tensor in, Ptr allocator, bool& isNaN, bool& isInf) cudaStreamSynchronize(0); } +template +__global__ void gSanitizeGradient(T* in, int length, + bool* isNaN, bool* isInf, + bool pruneNaN, bool clipInf, + float forNaN = 0.f, float forInf = 65504.f, float forInfNeg = -65504.f) { + for(int bid = 0; bid < length; bid += blockDim.x * gridDim.x) { + int index = bid + blockDim.x * blockIdx.x + threadIdx.x; + if(index < length) { + float v = (float)in[index]; + // handle NaN + if(isnan(v)) { + if(pruneNaN) { + in[index] = (T)forNaN; + } else { + *isNaN = true; + } + } + // handle +/- Inf + if(isinf(v)) { + if(clipInf) { + in[index] = v > 0 ? (T)forInf : (T)forInfNeg; + } else { + *isInf = true; + } + } + } + } +} + +// This function is meant to clean gradients, i.e. clip infinities and prune NaNs if required. +// If all NaNs and Infs have been removed we return `true` for indicating a sane gradient. +// If `clipInf` is set, infinities are replaced with the maximum/minimum non-inf value for the tensor. +// In that case infinities do not result in a bad gradient, since they get clipped. +// If `pruneNaN` is set, NaNs are replaced with 0. Since NaNs get removed now they do not result +// in a bad gradient. +// If NaNs or infinities are detected but not removed (either because of `pruneNaN=false` or `clipInf=false`), +// we return `false` indicating a bad gradient. 
+bool SanitizeGradient(marian::Tensor in, Ptr allocator, bool pruneNaN, bool clipInf) { + cudaSetDevice(in->getDeviceId().no); + + int length = in->size(); + + int threads = std::min(MAX_THREADS, length); + int blocks = std::min(MAX_BLOCKS, length / threads + (length % threads != 0)); + + auto mem = allocator->alloc(2); + bool* dIsNaN = &mem->data()[0]; + bool* dIsInf = &mem->data()[1]; + fill(in->getBackend(), dIsNaN, dIsNaN + 2, false); + + float forNaN = 0.f; + float forInf = NumericLimits(in->type()).max; + float forInfNeg = NumericLimits(in->type()).lowest; + + if(in->type() == Type::float32) { + gSanitizeGradient<<>>(in->data(), length, dIsNaN, dIsInf, pruneNaN, clipInf, forNaN, forInf, forInfNeg); +#if COMPILE_FP16 + } else if(in->type() == Type::float16) { + gSanitizeGradient<<>>(in->data(), length, dIsNaN, dIsInf, pruneNaN, clipInf, forNaN, forInf, forInfNeg); +#endif + } else { + ABORT("gSanitizeGradient for type {} not implemented", in->type()); + } + + bool isNaN, isInf; + CudaCopy(dIsNaN, dIsNaN + 1, &isNaN); + CudaCopy(dIsInf, dIsInf + 1, &isInf); + + allocator->free(mem); + + cudaStreamSynchronize(0); + + return !isNaN && !isInf; +} + template __global__ void gCopyCastTo(To* out, const From* in, int length) { for(int bid = 0; bid < length; bid += blockDim.x * gridDim.x) { @@ -1090,7 +1163,7 @@ void PasteRows(Tensor out, size_t rowsToCopy = indices->size(); int threads = std::min(MAX_THREADS, (int)cols); -#if 1 // @TODO: make this configurable with a 'deterministic' flag +#if 0 // @TODO: make this configurable with a 'deterministic' flag // If we only use one block, then each core operates on a different column, // hence the summation becomes deterministic. // However, we only use e.g. 512 cores out of possibly 3000+, so this will be @@ -1355,7 +1428,7 @@ __global__ void gGRUFastForward(T* out, for(int bid = 0; bid < rows; bid += gridDim.x) { int j = bid + blockIdx.x; if(j < rows) { - T m = !mask || mask[j]; + float m = !mask || mask[j]; T* rowOut = out + j * cols; const T* rowState = state + j * cols; @@ -1365,21 +1438,21 @@ __global__ void gGRUFastForward(T* out, for(int tid = 0; tid < cols; tid += blockDim.x) { int i = tid + threadIdx.x; if(i < cols) { - T r = functional::Ops::sigmoid(xWrow[i] + sUrow[i] + b[i]); + float r = functional::Ops::sigmoid((float)xWrow[i] + (float)sUrow[i] + (float)b[i]); int k = i + cols; - T z = functional::Ops::sigmoid(xWrow[k] + sUrow[k] + b[k]); + float z = functional::Ops::sigmoid((float)xWrow[k] + (float)sUrow[k] + (float)b[k]); int l = i + 2 * cols; - T h; + float h; if(final) - h = functional::Ops::tanh(xWrow[l] + (sUrow[l] + b[l]) * r); + h = functional::Ops::tanh((float)xWrow[l] + ((float)sUrow[l] + (float)b[l]) * r); else - h = functional::Ops::tanh(xWrow[l] + sUrow[l] * r + b[l]); + h = functional::Ops::tanh((float)xWrow[l] + (float)sUrow[l] * r + (float)b[l]); - T out = ((T)1.f - z) * h + z * rowState[i]; - rowOut[i] = m * out + ((T)1.f - m) * rowState[i]; + float out = (1.f - z) * h + z * (float)rowState[i]; + rowOut[i] = (T)(m * out + (1.f - m) * (float)rowState[i]); } } } @@ -1441,7 +1514,7 @@ __global__ void gGRUFastBackward(T* outState, for(int bid = 0; bid < rows; bid += gridDim.x) { int j = bid + blockIdx.x; if(j < rows) { - T m = !mask || mask[j]; + float m = !mask || mask[j]; T* rowOutState = outState + j * cols; T* rowOutXW = outXW + j * cols * 3; @@ -1459,56 +1532,56 @@ __global__ void gGRUFastBackward(T* outState, int k = i + cols; int l = i + 2 * cols; - T r = functional::Ops::sigmoid(rowXW[i] + rowSU[i] + b[i]); - T z 
= functional::Ops::sigmoid(rowXW[k] + rowSU[k] + b[k]); + float r = functional::Ops::sigmoid((float)rowXW[i] + (float)rowSU[i] + (float)b[i]); + float z = functional::Ops::sigmoid((float)rowXW[k] + (float)rowSU[k] + (float)b[k]); - T h; + float h; if(final) - h = functional::Ops::tanh(rowXW[l] + (rowSU[l] + b[l]) * r); + h = functional::Ops::tanh((float)rowXW[l] + ((float)rowSU[l] + (float)b[l]) * r); else - h = functional::Ops::tanh(rowXW[l] + rowSU[l] * r + b[l]); + h = functional::Ops::tanh((float)rowXW[l] + (float)rowSU[l] * r + (float)b[l]); - T adj = rowAdj[i]; + float adj = rowAdj[i]; - T t = ((T)1.f - z) * ((T)1.f - h * h); + float t = (1.f - z) * (1.f - h * h); // df/ds if(outState) - rowOutState[i] += (m * z - m + (T)1.f) * adj; + rowOutState[i] += (T)((m * z - m + 1.f) * adj); // df/d(xW_r) ... - T dfdxW_r = m * r * ((T)1.f - r) * t * adj; + float dfdxW_r = m * r * (1.f - r) * t * adj; if(final) - dfdxW_r *= rowSU[l] + b[l]; + dfdxW_r *= (float)rowSU[l] + (float)b[l]; else - dfdxW_r *= rowSU[l]; + dfdxW_r *= (float)rowSU[l]; if(outXW) - rowOutXW[i] += dfdxW_r; + rowOutXW[i] += (T)dfdxW_r; if(outSU) - rowOutSU[i] += dfdxW_r; + rowOutSU[i] += (T)dfdxW_r; if(outB) - rowOutB[i] += dfdxW_r; + rowOutB[i] += (T)dfdxW_r; // df/d(xW_z) ... - T dfdxW_z = m * ((T)1.f - z) * z * (rowState[i] - h) * adj; + float dfdxW_z = m * (1.f - z) * z * ((float)rowState[i] - h) * adj; if(outXW) - rowOutXW[k] += dfdxW_z; + rowOutXW[k] += (T)dfdxW_z; if(outSU) - rowOutSU[k] += dfdxW_z; + rowOutSU[k] += (T)dfdxW_z; if(outB) - rowOutB[k] += dfdxW_z; + rowOutB[k] += (T)dfdxW_z; // df/d(xW_x) ... - T dfdxW_x = m * t * adj; + float dfdxW_x = m * t * adj; if(outXW) - rowOutXW[l] += dfdxW_x; + rowOutXW[l] += (T)dfdxW_x; if(outSU) - rowOutSU[l] += dfdxW_x * r; + rowOutSU[l] += (T)(dfdxW_x * r); if(outB) if(final) - rowOutB[l] += dfdxW_x * r; + rowOutB[l] += (T)(dfdxW_x * r); else - rowOutB[l] += dfdxW_x; + rowOutB[l] += (T)dfdxW_x; } } } diff --git a/src/tensors/tensor_operators.h b/src/tensors/tensor_operators.h index 6e587953..dc29bf35 100644 --- a/src/tensors/tensor_operators.h +++ b/src/tensors/tensor_operators.h @@ -41,6 +41,25 @@ DISPATCH2(CopyCast, marian::Tensor, const marian::Tensor); DISPATCH2(AddCast, marian::Tensor, const marian::Tensor); DISPATCH4(IsNaN, const Tensor, Ptr, bool&, bool&); +#ifdef CUDA_FOUND +namespace gpu { +bool SanitizeGradient(marian::Tensor in, Ptr allocator, bool pruneNaN, bool clipInf); +} +#endif + +namespace cpu { +bool SanitizeGradient(marian::Tensor in, Ptr allocator, bool pruneNaN, bool clipInf); +} + +static inline bool SanitizeGradient(marian::Tensor in, Ptr allocator, bool pruneNaN, bool clipInf) { +#ifdef CUDA_FOUND + if(in->getBackend()->getDeviceId().type == DeviceType::gpu) + return gpu::SanitizeGradient(in, allocator, pruneNaN, clipInf); + else +#endif + return cpu::SanitizeGradient(in, allocator, pruneNaN, clipInf); +} + template void Element(Functor functor, marian::Tensor out, Tensors... 
tensors) { #ifdef CUDA_FOUND diff --git a/src/training/graph_group.cpp b/src/training/graph_group.cpp index e9c977b9..03e5acf4 100644 --- a/src/training/graph_group.cpp +++ b/src/training/graph_group.cpp @@ -10,25 +10,19 @@ GraphGroup::GraphGroup(Ptr options, Ptr mpi) mbRoundUp_(options_->get("mini-batch-round-up", true)) { if(options_->hasAndNotEmpty("cost-scaling")) { auto vcs = options_->get>("cost-scaling"); - costScale_ = true; - float costExponent = std::stof(vcs[0]); - costScaleFactor_ = std::pow(2.0f, costExponent); - - if(vcs.size() > 1) costScaleFreq_ = std::stoul(vcs[1]); - if(vcs.size() > 2) costScaleMultiplier_ = std::stof(vcs[2]); - if(vcs.size() > 3) costScaleNanTolerance_ = std::stof(vcs[3]); - if(vcs.size() > 4) costScaleNanRange_ = std::stoul(vcs[4]); - if(vcs.size() > 5) costScaleFactorMinimum_ = std::stof(vcs[5]); + + costScaling_ = true; + costScalingFactor_ = std::stof( vcs[0]); + if(vcs.size() > 1) costScalingFreq_ = std::stoul(vcs[1]); + if(vcs.size() > 2) costScalingMultiplier_ = std::stof( vcs[2]); + if(vcs.size() > 3) costScalingFactorMinimum_ = std::stof( vcs[3]); LOG_ONCE(info, - "Training with cost scaling - factor: 2^{} = {}, frequency: {}, multiplier: {}, tolerance: {}, range: {}, minimum: {}", - costExponent, - costScaleFactor_, - costScaleFreq_, - costScaleMultiplier_, - costScaleNanTolerance_, - costScaleNanRange_, - costScaleFactorMinimum_); + "Training with cost scaling - factor: {}, frequency: {}, multiplier: {}, minimum: {}", + costScalingFactor_, + costScalingFreq_, + costScalingMultiplier_, + costScalingFactorMinimum_); } if(options_->hasAndNotEmpty("dynamic-gradient-scaling")) { @@ -96,21 +90,17 @@ void GraphGroup::initGraphsAndOpts() { // given number of iterations. Usually we increase by 2 which adds // one more bit for precision. void GraphGroup::increaseCostScaleFactor() { - if(!costScale_) + if(!costScaling_) return; noNanSeen_++; size_t total = nanSeen_ + noNanSeen_; - float nanPercent = noNanSeen_ == (float)nanSeen_ / (float)total; // total is at least 1 because of noNanSeen_++ - if(noNanSeen_ % costScaleFreq_ == 0) { - costScaleFactor_ *= costScaleMultiplier_; - LOG(debug, - "NaN/Inf percentage {:.2f} after {} gradient updates. Increasing cost-scaling factor to {}", - nanPercent, - total, - costScaleFactor_); + if(noNanSeen_ % costScalingFreq_ == 0) { + costScalingFactor_ *= costScalingMultiplier_; + if(isMainProcess()) + LOG(debug, "No NaN/Inf after {} gradient updates. Increasing cost-scaling factor to {}", total, costScalingFactor_); // Resetting counts after cost-scale change noNanSeen_ = 0; @@ -120,48 +110,56 @@ void GraphGroup::increaseCostScaleFactor() { // call when a NaN was seen to decrease cost-scaling factor void GraphGroup::decreaseCostScaleFactor() { - if(!costScale_) + if(!costScaling_) return; nanSeen_++; size_t total = nanSeen_ + noNanSeen_; - float nanPercent = (float)nanSeen_ / (float)total; // total is at least 1 because of nanSeen_++ - if(total >= costScaleNanRange_ && nanPercent > costScaleNanTolerance_) { - if(costScaleFactor_ > costScaleFactorMinimum_) { - costScaleFactor_ /= costScaleMultiplier_; - LOG(debug, - "NaN/Inf percentage {:.2f} in {} gradient updates, reducing cost-scaling factor to {}", - nanPercent, - total, - costScaleFactor_); - } else { - // @TODO: think if should this rather abort? 
-      LOG(warn,
-          "NaN/Inf percentage {:.2f} in {} gradient updates, but cost-scaling factor {} is already at minimum",
-          nanPercent,
-          total,
-          costScaleFactor_);
-    }
-    // Resetting counts after cost-scale change
-    noNanSeen_ = 0;
-    nanSeen_ = 0;
+  // do not reduce cost-scaling factor below minimum
+  if(costScalingFactor_ > costScalingFactorMinimum_)
+    costScalingFactor_ /= costScalingMultiplier_;
+
+  if(isMainProcess()) {
+    if(costScalingFactor_ > costScalingFactorMinimum_)
+      LOG(debug, "Seen NaN/Inf after {} gradient updates. Reduced cost-scaling factor to {}", total, costScalingFactor_);
+    else
+      LOG(debug, "Seen NaN/Inf after {} gradient updates. Reduced cost-scaling factor to minimum {}. Pruning NaNs now.", total, costScalingFactor_);
   }
+
+  // Resetting counts after cost-scale change
+  noNanSeen_ = 0;
+  nanSeen_ = 0;
 }
 
 float GraphGroup::checkNanOrNorm(size_t i, size_t begin, size_t end) {
   auto curGrad = graphs_[i]->params()->grads()->subtensor(begin, end-begin);
 
-  if(checkGradientNan_ || costScale_) {
-    bool hasNan = false, hasInf = false;
-    IsNaN(curGrad, graphs_[i]->allocator(), hasNan, hasInf); // @TODO: make safe with different compiler options
-    if(hasNan || hasInf) {
-      LOG(debug, "Found Nan ({}) or Inf ({})", hasNan, hasInf);
+  // If costScaling_ is enabled, check for NaN values if the costScalingFactor_ is larger than
+  // the minimum. If a NaN value is seen, we exit here; the factor is reduced next and this
+  // update is skipped.
+  // If costScalingFactor_ is already at the minimum, prune the NaN values away. This replaces
+  // NaNs with 0. Updates are not skipped any more.
+  // Regardless of NaNs, we clip +/-inf to the largest corresponding values for the gradient value type.
+  // This changes the gradient but seems to be quite stable. In effect, for fp16 this is equivalent
+  // to gradient clipping at (65504.f / costScalingFactor_) which in most cases is still large.
+  if(costScaling_ || checkGradientNan_) {
+    bool pruneNaN = !checkGradientNan_ && costScalingFactor_ == costScalingFactorMinimum_;
+    bool clipInf = !checkGradientNan_;
+    bool saneGradient = SanitizeGradient(curGrad, graphs_[i]->allocator(), pruneNaN, clipInf);
+
+    // This should never happen; if it does, something is wrong with the kernel above and needs to be fixed.
+    ABORT_IF(pruneNaN && clipInf && !saneGradient, "We are removing NaNs and clipping Infs, but gradient is still not sane??");
+
+    if(!saneGradient) {
+      LOG(debug, "Found NaN");
       return std::numeric_limits::quiet_NaN();
     }
   }
-
+
+  // The optional clipping above will affect the norm here. The norm can be non-finite despite the above
+  // gradient sanitization, hence check again and propagate a NaN.
   if(dynamicGradientScaling_) {
     auto gNorm = L2Norm(curGrad, graphs_[i]->allocator());
     if(isFinite(gNorm) && gNorm > 0.0)
@@ -197,8 +195,8 @@ float GraphGroup::executeAndCollectNorm(const std::functionget("normalize-gradient"))
     normalizationFactor *= updateTrgWords;
 
@@ -207,9 +205,9 @@ float GraphGroup::computeNormalizationFactor(float gNorm, size_t updateTrgWords)
     return normalizationFactor;
 
   if(dynamicGradientScaling_) {
-    // make gradient norm invariant to changes in costScaleFactor_, luckily norm(c * g) = c * norm(g)
-    if(costScale_)
-      gNorm = gNorm / costScaleFactor_;
+    // make gradient norm invariant to changes in costScalingFactor_, luckily norm(c * g) = c * norm(g)
+    if(costScaling_)
+      gNorm = gNorm / costScalingFactor_;
 
     // Normalize gradient norm w.r.t. 
number of labels in batch for statistics, // there should be no gradient normalization before this point, @TODO: check this @@ -288,9 +286,7 @@ void GraphGroup::load(const OptimizerBase::ScatterStateFunc& scatterFn) { restoreFromCheckpoint(modelFileName, scatterFn); } else if(options_->hasAndNotEmpty("pretrained-model")) { std::string nameInit = options_->get("pretrained-model"); - LOG(info, - "[training] Initializing model weights with pre-trained model {}", - nameInit); + LOG(info, "[training] Initializing model weights with pre-trained model {}", nameInit); size_t i = 0; for(auto graph : graphs_) diff --git a/src/training/graph_group.h b/src/training/graph_group.h index 422990b1..b7f2f7ef 100644 --- a/src/training/graph_group.h +++ b/src/training/graph_group.h @@ -60,22 +60,21 @@ protected: double typicalTrgBatchWords_{0}; // for dynamic batch sizing: typical batch size in words bool mbRoundUp_{true}; // round up batches for more efficient training but can make batch size less stable, disable with --mini-batch-round-up=false - bool costScale_{false}; - float costScaleFactor_{1.f}; // @TODO, add current costScaleFactor_ to trainingState for serialization - size_t costScaleFreq_{2000}; - float costScaleMultiplier_{2.f}; - float costScaleNanTolerance_{0.f}; - size_t costScaleNanRange_{1}; - float costScaleFactorMinimum_{1.f}; // @TODO make this configureable + bool costScaling_{false}; + float costScalingFactor_{1.f}; // @TODO, add current costScalingFactor_ to trainingState for serialization + size_t costScalingFreq_{2000}; + float costScalingMultiplier_{2.f}; + float costScalingFactorMinimum_{1.f}; + size_t noNanSeen_{0}; // @TODO, add current noNanSeen_ to trainingState for serialization size_t nanSeen_{0}; + bool checkGradientNan_{false}; + bool dynamicGradientScaling_{false}; float dynamicGradientScalingFactor_{2.f}; bool dynamicGradientScalingUseLogs_{false}; - bool checkGradientNan_{false}; - // determines the number of input streams (i.e. input files or fields in the TSV input) that need // to be included in the batch, i.e. 
without alignments and weights size_t numberOfInputFiles(); diff --git a/src/training/graph_group_async.cpp b/src/training/graph_group_async.cpp index 72b06e48..f85f9cf8 100644 --- a/src/training/graph_group_async.cpp +++ b/src/training/graph_group_async.cpp @@ -143,13 +143,13 @@ void AsyncGraphGroup::execute(Ptr batch) { thread_local Tensor accGradients; thread_local Ptr accAlloc; - ABORT_IF(costScale_ ,"Cost-scaling not implemented for AsyncSGD"); + ABORT_IF(costScaling_ ,"Cost-scaling not implemented for AsyncSGD"); auto graph = graphs_[tid]; Ptr dynamicLoss = models_[tid]->build(graph, batch); - if(costScaleFactor_ != 1.f) { + if(costScalingFactor_ != 1.f) { // it's ok to go out of scope, this will still insert the new top node into the graph - auto costNode = dynamicLoss->loss() * costScaleFactor_; + auto costNode = dynamicLoss->loss() * costScalingFactor_; } if(t % optimizerDelay_ == 0) { diff --git a/src/training/graph_group_singleton.cpp b/src/training/graph_group_singleton.cpp index 7dc86137..16261070 100644 --- a/src/training/graph_group_singleton.cpp +++ b/src/training/graph_group_singleton.cpp @@ -16,16 +16,16 @@ void SingletonGraph::execute(Ptr batch) { auto opt = optimizerShards_[0]; auto lossNode = model->build(graph, batch); - if(costScaleFactor_ != 1.f) { + if(costScalingFactor_ != 1.f) { // for fp16 training, it's ok to go out of scope, we do not use the scaled version for anything - auto scaledLoss = lossNode->loss() * costScaleFactor_; + auto scaledLoss = lossNode->loss() * costScalingFactor_; } graph->forward(); graph->backward(); bool noNanOrInf = true; - if(costScale_) { + if(costScaling_) { // Are there NaNs in the gradient? bool hasNan = false, hasInf = false; IsNaN(graph->params()->grads(), graph->allocator(), hasNan, hasInf); @@ -39,7 +39,7 @@ void SingletonGraph::execute(Ptr batch) { opt->update(graph->params()->vals(), graph->params()->grads(), batch->wordsTrg(), - costScaleFactor_); + costScalingFactor_); if(scheduler_) { scheduler_->update(*lossNode, batch); diff --git a/src/training/graph_group_sync.cpp b/src/training/graph_group_sync.cpp index 8c06761e..c90a384e 100644 --- a/src/training/graph_group_sync.cpp +++ b/src/training/graph_group_sync.cpp @@ -252,8 +252,8 @@ void SyncGraphGroup::update(std::vector> subBatches, size_t num { // let loss go out of scope, frees memory auto rationalLoss = models_[localDeviceIndex]->build(graph, subBatch); - if(costScaleFactor_ != 1.f) - rationalLoss->loss() * costScaleFactor_; + if(costScalingFactor_ != 1.f) + rationalLoss->loss() * costScalingFactor_; graph->forward(); localDeviceLosses[localDeviceIndex] += *rationalLoss; @@ -262,7 +262,7 @@ void SyncGraphGroup::update(std::vector> subBatches, size_t num graph->backward(/*zero=*/false); // (gradients are reset before we get here) } -#if 1 +#if 0 // @TODO: this can probably be removed now, keep around until confirmed. // experimental and should eventually be somewhere else // Handle local gradient explosion but only clip to largest possible value // given number of GPUs and type. Should clip rarely. 
Also clips inf @@ -284,7 +284,7 @@ void SyncGraphGroup::update(std::vector> subBatches, size_t num comm_->scatterReduceAndResetGrads(); // reduce gradients across all devices (globally) into shards float gradNorm = 0.f; - if(costScale_ || dynamicGradientScaling_ || checkGradientNan_) { + if(costScaling_ || dynamicGradientScaling_ || checkGradientNan_) { // Wrapping member function auto checkNanOrNorm = [&](size_t i, size_t begin, size_t end) { return GraphGroup::checkNanOrNorm(i, begin, end); From c85d0608483789d446361ea28d95f7d7c9545f2d Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Mon, 22 Nov 2021 03:32:54 +0000 Subject: [PATCH 08/11] Merged PR 20729: Add top-k sampling This adds Top-K sampling to Marian and extends the --output-sampling option to take arguments --- regression-tests | 2 +- src/common/config_parser.cpp | 7 ++-- src/graph/expression_operators.cpp | 7 ++++ src/graph/expression_operators.h | 15 ++++++- src/graph/node_operators_binary.h | 61 ++++++++++++++++++++++++++- src/graph/node_operators_tuple.h | 2 +- src/models/costs.cpp | 35 ++++++++++++++++ src/models/costs.h | 32 +++++++++----- src/models/model_factory.cpp | 21 ++++++++-- src/tensors/cpu/tensor_operators.cpp | 9 +++- src/tensors/gpu/tensor_operators.cu | 63 +++++++++++++++------------- src/tensors/tensor_operators.h | 23 +++++++++- src/translator/translator.h | 2 +- 13 files changed, 226 insertions(+), 53 deletions(-) diff --git a/regression-tests b/regression-tests index 7d612ca5..0aa7b6b7 160000 --- a/regression-tests +++ b/regression-tests @@ -1 +1 @@ -Subproject commit 7d612ca5e4b27a76f92584dad76d240e34f216d0 +Subproject commit 0aa7b6b7632732d1f22f3d8169d3262a7e6b1e9d diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp index 51764cdc..59b328e9 100644 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -695,9 +695,10 @@ void ConfigParser::addOptionsTranslation(cli::CLIWrapper& cli) { "Use softmax shortlist: path first best prune"); cli.add>("--weights", "Scorer weights"); - cli.add("--output-sampling", - "Noise output layer with gumbel noise", - false); + cli.add>("--output-sampling", + "Noise output layer with gumbel noise. Implicit default is 'full' for sampling from full distribution. " + " Also accepts 'topk num' (e.g. topk 100) for top-100 sampling.") + ->implicit_val("full"); cli.add>("--output-approx-knn", "Use approximate knn search in output layer (currently only in transformer)") ->implicit_val("100 1024"); diff --git a/src/graph/expression_operators.cpp b/src/graph/expression_operators.cpp index 560ab4e7..b26c2ae0 100644 --- a/src/graph/expression_operators.cpp +++ b/src/graph/expression_operators.cpp @@ -357,6 +357,13 @@ Expr gather(Expr a, int axis, Expr indices) { return Expression(a, axis, indices); } +// scatter() -- scatter arbitrary elements along an axis; batched or non-batched +// This is the reverse operation to gather. +Expr scatter(Expr a, int axis, Expr indices, Expr source) { + return Expression(a, axis, indices, source); +} + + // index_select() -- gather arbitrary elements along an axis from an unbatched // input 'a'. Indices are specified as a 1D vector. // This is used e.g. for embedding lookup. 
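To make the gather/scatter pair introduced above concrete, a toy sketch of the
documented semantics (values are illustrative only; unindexed positions keep
their values from `a`, and `indices`/`source` share one shape):

    a       = [[0, 0],
               [0, 0]]
    indices = [[1], [0]]
    source  = [[5], [7]]

    scatter(a, /*axis=*/-1, indices, source) == [[0, 5],
                                                 [7, 0]]

    // and, as the inverse relationship suggests:
    // gather(scatter(a, -1, indices, source), -1, indices) == source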
diff --git a/src/graph/expression_operators.h b/src/graph/expression_operators.h
index e34ddc8a..d032e8d3 100644
--- a/src/graph/expression_operators.h
+++ b/src/graph/expression_operators.h
@@ -707,10 +707,23 @@ Expr stopGradient(Expr a);
  * @param indices The indices to be gathered
  * @returns Gathered expression with the same shape as @p indices
  * @note @p a and @p indices must have the same rank
- * @note The non-target axes of @p a and @p indicies must have the same size, or be broadcastable.
+ * @note The non-target axes of @p a and @p indices must have the same size, or be broadcastable.
  */
 Expr gather(Expr a, int axis, Expr indices);
 
+/**
+ * Scatter elements from source along an axis into a. Unindexed elements from a remain unchanged.
+ * This is the reverse operation to gather.
+ * @param a The input expression
+ * @param axis The axis along which to index
+ * @param indices The indices to be scattered
+ * @param source Expression with values to scatter.
+ * @returns Scattered expression with the same shape as @p a now containing values from @p source in positions @p indices
+ * @note @p source and @p indices must have the same rank
+ * @note In this version @p source and @p indices must have the same shape
+ */
+Expr scatter(Expr a, int axis, Expr indices, Expr source);
+
 #if 0 // reverse operation to gather. a is expression into with values from b are inserted and positions indices along axis.
 // with broadcasting
diff --git a/src/graph/node_operators_binary.h b/src/graph/node_operators_binary.h
index a180bb5c..b2a646b1 100644
--- a/src/graph/node_operators_binary.h
+++ b/src/graph/node_operators_binary.h
@@ -1033,12 +1033,14 @@ struct GatherNodeOp : public NaryNodeOp {
 
   NodeOps forwardOps() override {
     return {NodeOp(
+      // @TODO: rename to gather
       Select(val_, child(0)->val(), child(1)->val(), axis_))};
   }
 
   NodeOps backwardOps() override {
     return {NodeOp(
-      Insert(child(0)->grad(), adj_, child(1)->val(), axis_))};
+      // @TODO: rename to scatter
+      Insert</*add=*/true>(child(0)->grad(), adj_, child(1)->val(), axis_))};
   }
 
   Shape newShape(Expr a, int axis, Expr indices) {
     Shape shape = a->shape();
     axis = shape.axis(axis);
     auto rank = shape.size();
     ABORT_IF(rank != indices->shape().size(), "Mismatching ranks for input ({}) and indices ({})", std::string(shape), std::string(indices->shape()));
-    axis = a->shape().axis(axis);
     shape.set(axis, indices->shape()[axis]);
     for (size_t i = 0; i < rank; ++i) {
       if (i != axis) {
@@ -1086,6 +1087,62 @@
 private:
   int axis_;
 };
 
+struct ScatterNodeOp : public NaryNodeOp {
+  ScatterNodeOp(Expr a, int axis, Expr indices, Expr source)
+      : NaryNodeOp({a, indices, source}, newShape(a, axis, indices, source), a->value_type()),
+        axis_(a->shape().axis(axis)) {
+    matchOrAbort(indices->value_type());
+  }
+
+  NodeOps forwardOps() override {
+    return {NodeOp(
+      CopyCast(val_, child(0)->val()); // @TODO: use normal copy
+      Insert</*add=*/false>(val_, child(2)->val(), child(1)->val(), axis_)
+    )};
+  }
+
+  NodeOps backwardOps() override {
+    ABORT("backward for ScatterNodeOp not yet implemented");
+  }
+
+  Shape newShape(Expr a, int axis, Expr indices, Expr source) {
+    ABORT_IF(axis != -1, "only last dimensions");
+    ABORT_IF(indices->shape() != source->shape(), "Shapes must match");
+
+    Shape shape = a->shape();
+    // @TODO: do proper checking
+    return shape;
+  }
+
+  const std::string type() override { return "scatter"; }
+
+  const std::string color() override { return "orange"; }
+
+  virtual size_t hash() override {
+    if(!hash_) {
+      size_t seed = NaryNodeOp::hash();
+      util::hash_combine(seed, axis_);
+      hash_ = seed;
+    }
+    return hash_;
+  }
+
+  virtual bool equal(Expr node) override {
+    if(!NaryNodeOp::equal(node))
+      return false;
+    auto cnode = std::dynamic_pointer_cast(node);
+    if(!cnode)
+      return false;
+    if(axis_ != cnode->axis_)
+      return false;
+    return true;
+  }
+
+private:
+  friend class SerializationHelpers;
+  int axis_;
+};
+
 struct ColsNodeOp : public NaryNodeOp {
   ColsNodeOp(Expr a, Expr indices)
       : NaryNodeOp({a, indices}, newShape(a, indices), a->value_type()) {
diff --git a/src/graph/node_operators_tuple.h b/src/graph/node_operators_tuple.h
index c7a9531a..8acb1bc8 100644
--- a/src/graph/node_operators_tuple.h
+++ b/src/graph/node_operators_tuple.h
@@ -133,7 +133,7 @@ public:
   }
 
   void backward() override {
-    Insert(/*out*/child(0)->grad(), adj_, val_, axis_);
+    Insert</*add=*/true>(/*out*/child(0)->grad(), adj_, val_, axis_);
   }
 
   const std::string type() override { return "topk"; }
diff --git a/src/models/costs.cpp b/src/models/costs.cpp
index c688b211..4b15bcb3 100644
--- a/src/models/costs.cpp
+++ b/src/models/costs.cpp
@@ -10,5 +10,40 @@ Ptr LogSoftmaxStep::apply(Ptr state) {
   return state;
 }
 
+Ptr GumbelSoftmaxStep::apply(Ptr state) {
+  state->setLogProbs(state->getLogProbs().applyUnaryFunctions(
+      [](Expr logits) { // lemma gets gumbelled
+        return logsoftmax(logits + constant_like(logits, inits::gumbel()));
+      },
+      logsoftmax)); // factors don't
+  return state;
+}
+
+TopkGumbelSoftmaxStep::TopkGumbelSoftmaxStep(int k) : k_{k} {}
+
+Ptr TopkGumbelSoftmaxStep::apply(Ptr state) {
+  state->setLogProbs(state->getLogProbs().applyUnaryFunctions(
+      [=](Expr logits) { // lemma gets gumbelled
+        // create logits-sized tensor consisting only of invalid path scores
+        float invalidPathScore = NumericLimits(logits->value_type()).lowest;
+        Expr invalidLogits = constant_like(logits, inits::fromValue(invalidPathScore));
+
+        // select top-k values
+        Expr val, idx;
+        std::tie(val, idx) = topk(logits, k_, /*axis=*/-1, /*descending=*/true);
+
+        // uncomment below to display probability mass in top-k selection
+        // debug(sum(gather(softmax(logits), -1, idx), -1), "sum");
+
+        // Add Gumbel noise to top-k values only and compute logsoftmax, used for argmax sampling later in beam-search
+        Expr gumbelVal = logsoftmax(val + constant_like(val, inits::gumbel()));
+
+        // Scatter gumbelled values back into logits to fill with usable values
+        return scatter(invalidLogits, -1, idx, gumbelVal);
+      },
+      logsoftmax)); // factors don't
+  return state;
+}
+
 } // namespace models
 } // namespace marian
diff --git a/src/models/costs.h b/src/models/costs.h
index e5463bfd..a087ed6a 100644
--- a/src/models/costs.h
+++ b/src/models/costs.h
@@ -297,20 +297,30 @@ public:
   virtual Ptr apply(Ptr state) override;
 };
 
-// Gumbel-max noising for sampling during beam-search
-// Seems to work well enough with beam-size=1. Turn on
-// with --output-sampling during translation with marian-decoder
+// Gumbel-max noising for sampling during translation.
+// Produces accurate sampling with beam=1. Turn on
+// with --output-sampling [full] during translation
+// with marian-decoder for sampling from the full
+// softmax distribution.
class GumbelSoftmaxStep : public ILogProbStep { public: virtual ~GumbelSoftmaxStep() {} - virtual Ptr apply(Ptr state) override { - state->setLogProbs(state->getLogProbs().applyUnaryFunctions( - [](Expr logits) { // lemma gets gumbelled - return logsoftmax(logits + constant_like(logits, inits::gumbel())); - }, - logsoftmax)); // factors don't - return state; - } + virtual Ptr apply(Ptr state) override; +}; + + +// Gumbel-max noising for top-k sampling during translation. +// Produces accurate sampling with beam=1. Turn on +// with --output-sampling topk [10] during translation +// with marian-decoder for top-10 sampling. +class TopkGumbelSoftmaxStep : public ILogProbStep { +private: + int k_{1}; + +public: + TopkGumbelSoftmaxStep(int k); + virtual ~TopkGumbelSoftmaxStep() {} + virtual Ptr apply(Ptr state) override; }; // class to wrap an IEncoderDecoder and a ILogProbStep that are executed in sequence, diff --git a/src/models/model_factory.cpp b/src/models/model_factory.cpp index e176e6a4..52a87e72 100644 --- a/src/models/model_factory.cpp +++ b/src/models/model_factory.cpp @@ -370,10 +370,25 @@ Ptr createModelFromOptions(Ptr options, usage use) { // add (log)softmax if requested if (use == usage::translation) { if(std::dynamic_pointer_cast(baseModel)) { - if(options->get("output-sampling", false)) - return New(std::dynamic_pointer_cast(baseModel), New()); - else + if(options->hasAndNotEmpty("output-sampling")) { + auto sampling = options->get>("output-sampling", {}); + std::string method = sampling.size() > 0 ? sampling[0] : "full"; + + if(method == "full" || method == "1" /*for backwards-compat when output-sampling: true in yaml file*/) { + LOG(info, "Output sampling from the full softmax distribution"); + return New(std::dynamic_pointer_cast(baseModel), New()); + } else if(method == "topk") { + int k = sampling.size() > 1 ? 
std::stoi(sampling[1]) : 10; + if(k == 1) + LOG(info, "Output sampling with k=1 is equivalent to beam search with beam size 1"); + LOG(info, "Output sampling via top-{} sampling", k); + return New(std::dynamic_pointer_cast(baseModel), New(k)); + } else { + ABORT("Unknown sampling method: {}", method); + } + } else { return New(std::dynamic_pointer_cast(baseModel), New()); + } } #ifdef COMPILE_EXAMPLES // note: 'usage::translation' here means 'inference' diff --git a/src/tensors/cpu/tensor_operators.cpp b/src/tensors/cpu/tensor_operators.cpp index f3964f91..1e1adc38 100755 --- a/src/tensors/cpu/tensor_operators.cpp +++ b/src/tensors/cpu/tensor_operators.cpp @@ -739,6 +739,7 @@ void Select(Tensor out, } } +template void Insert(Tensor out, const Tensor in, const Tensor indices, @@ -760,10 +761,16 @@ void Insert(Tensor out, int idxIndex = idxShape.bindex(dims); // broadcast index into indices tensor dims[axisCPU] = (int)indices->data()[idxIndex]; int outIndex = outShape.index(dims); - out->data()[outIndex] += in->data()[index]; + if(add) + out->data()[outIndex] += in->data()[index]; + else + out->data()[outIndex] = in->data()[index]; } } +template void Insert(Tensor out, const Tensor in, const Tensor indices, int axis); +template void Insert(Tensor out, const Tensor in, const Tensor indices, int axis); + void GRUFastForward(Tensor out_, std::vector inputs, bool final) { int rows = out_->shape().elements() / out_->shape().back(); int cols = out_->shape().back(); diff --git a/src/tensors/gpu/tensor_operators.cu b/src/tensors/gpu/tensor_operators.cu index 1347c3bb..2103ca9d 100644 --- a/src/tensors/gpu/tensor_operators.cu +++ b/src/tensors/gpu/tensor_operators.cu @@ -1309,7 +1309,7 @@ __global__ void gSelect(T* out, } } -template +template __global__ void gInsert(T* out, functional::Shape outShape, const T* in, @@ -1327,7 +1327,10 @@ __global__ void gInsert(T* out, int idxIndex = idxShape.bindex(dims); // broadcast index into indices tensor dims[axis] = (int)d_indices[idxIndex]; int outIndex = outShape.index(dims); - out[outIndex] += in[index]; // this is probably wrong, atomicAdd? + if(add) + out[outIndex] += in[index]; // this is probably wrong, atomicAdd? 
+ else + out[outIndex] = in[index]; } } } @@ -1349,21 +1352,21 @@ void Select(Tensor out, if(out->type() == Type::float32) { gSelect<<>>(out->data(), - out->shape(), - in->data(), - in->shape(), - axisGPU, - indices->data(), - indices->shape()); + out->shape(), + in->data(), + in->shape(), + axisGPU, + indices->data(), + indices->shape()); #if COMPILE_FP16 } else if (out->type() == Type::float16) { gSelect<<>>(out->data(), - out->shape(), - in->data(), - in->shape(), - axisGPU, - indices->data(), - indices->shape()); + out->shape(), + in->data(), + in->shape(), + axisGPU, + indices->data(), + indices->shape()); #endif } else if(out->type() == Type::uint32) { gSelect<<>>(out->data(), @@ -1378,6 +1381,7 @@ void Select(Tensor out, } } +template void Insert(Tensor out, const Tensor in, const Tensor indices, @@ -1393,28 +1397,31 @@ void Insert(Tensor out, int axisGPU = axis + functional::Shape::size() - out->shape().size(); if(out->type() == Type::float32) { - gInsert<<>>(out->data(), - out->shape(), - in->data(), - in->shape(), - axisGPU, - indices->data(), - indices->shape()); + gInsert<<>>(out->data(), + out->shape(), + in->data(), + in->shape(), + axisGPU, + indices->data(), + indices->shape()); #if COMPILE_FP16 } else if (out->type() == Type::float16) { - gInsert<<>>(out->data(), - out->shape(), - in->data(), - in->shape(), - axisGPU, - indices->data(), - indices->shape()); + gInsert<<>>(out->data(), + out->shape(), + in->data(), + in->shape(), + axisGPU, + indices->data(), + indices->shape()); #endif } else { ABORT("Insert not implemented for type {}", out->type()); } } +template void Insert(Tensor out, const Tensor in, const Tensor indices, int axis); +template void Insert(Tensor out, const Tensor in, const Tensor indices, int axis); + template __global__ void gGRUFastForward(T* out, const T* state, diff --git a/src/tensors/tensor_operators.h b/src/tensors/tensor_operators.h index dc29bf35..1fc4542d 100644 --- a/src/tensors/tensor_operators.h +++ b/src/tensors/tensor_operators.h @@ -297,7 +297,28 @@ DISPATCH3(CopyCols, marian::Tensor, const marian::Tensor, const marian::Tensor) DISPATCH3(PasteCols, marian::Tensor, const marian::Tensor, const marian::Tensor) DISPATCH4(Select, marian::Tensor, const marian::Tensor, const marian::Tensor, int) -DISPATCH4(Insert, marian::Tensor, const marian::Tensor, const marian::Tensor, int) + +#ifdef CUDA_FOUND +namespace gpu { + template + void Insert(Tensor out, const Tensor in, const Tensor indices, int axis); +} +#endif + +namespace cpu { + template + void Insert(Tensor out, const Tensor in, const Tensor indices, int axis); +} + +template +static inline void Insert(Tensor out, const Tensor in, const Tensor indices, int axis) { +#ifdef CUDA_FOUND + if(out->getBackend()->getDeviceId().type == DeviceType::gpu) + gpu::Insert(out, in, indices, axis); + else +#endif + cpu::Insert(out, in, indices, axis); +} DISPATCH7(TopK, marian::Tensor, marian::Tensor, Ptr, const marian::Tensor, int, int, bool); diff --git a/src/translator/translator.h b/src/translator/translator.h index db1f3d03..3e375f65 100644 --- a/src/translator/translator.h +++ b/src/translator/translator.h @@ -119,7 +119,7 @@ public: threadPool.enqueue(task, device, id++); } - if(options_->get("output-sampling", false)) { + if(options_->hasAndNotEmpty("output-sampling")) { if(options_->get("beam-size") > 1) LOG(warn, "[warning] Output sampling and beam search (beam-size > 1) are contradictory methods " From 8b8d1b11e28a421b348703d702c9c5206061df9d Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt 
Date: Thu, 25 Nov 2021 02:33:49 +0000
Subject: [PATCH 09/11] Merged PR 21553: Parallelize data reading for training

This parallelizes data reading. On very fast GPUs and with small models,
training speed can be starved by too slow batch creation. Use `--data-threads 8`
or more; the default is currently 1 for backward compatibility.
---
 src/common/config_parser.cpp |   7 ++
 src/common/utils.cpp         |   8 +-
 src/data/batch_generator.h   |  35 +++++---
 src/data/corpus.cpp          | 160 +++++++++++++++++++----------------
 src/data/corpus.h            |   3 +
 src/data/corpus_base.cpp     |  44 +++++-----
 src/data/corpus_base.h       | 105 +++++++++++++++++++++--
 src/data/corpus_nbest.cpp    |   7 +-
 src/data/corpus_sqlite.cpp   |   6 +-
 src/data/text_input.cpp      |   6 +-
 10 files changed, 255 insertions(+), 126 deletions(-)

diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp
index 59b328e9..3d79f8af 100644
--- a/src/common/config_parser.cpp
+++ b/src/common/config_parser.cpp
@@ -883,6 +883,10 @@ void ConfigParser::addSuboptionsBatching(cli::CLIWrapper& cli) {
   if(mode_ == cli::mode::training) {
     cli.add<bool>("--shuffle-in-ram",
         "Keep shuffled corpus in RAM, do not write to temp file");
+
+    cli.add<size_t>("--data-threads",
+        "Number of concurrent threads to use during data reading and processing", 1);
+
     // @TODO: Consider making the next two options options of the vocab instead, to make it more local in scope.
     cli.add<size_t>("--all-caps-every",
         "When forming minibatches, preprocess every Nth line on the fly to all-caps. Assumes UTF-8");
@@ -901,6 +905,9 @@ void ConfigParser::addSuboptionsBatching(cli::CLIWrapper& cli) {
     cli.add<bool>("--mini-batch-round-up",
         "Round up batch size to next power of 2 for more efficient training, but this can make batch size less stable. Disable with --mini-batch-round-up=false",
         true);
+  } else {
+    cli.add<size_t>("--data-threads",
+        "Number of concurrent threads to use during data reading and processing", 1);
   }
   // clang-format on
 }
diff --git a/src/common/utils.cpp b/src/common/utils.cpp
index 72624041..99fc790a 100644
--- a/src/common/utils.cpp
+++ b/src/common/utils.cpp
@@ -70,22 +70,20 @@ void split(const std::string& line,
 // the function guarantees that the output has as many elements as requested
 void splitTsv(const std::string& line, std::vector<std::string>& fields, size_t numFields) {
   fields.clear();
+  fields.resize(numFields); // make sure there are as many elements as requested
 
   size_t begin = 0;
   size_t pos = 0;
   for(size_t i = 0; i < numFields; ++i) {
     pos = line.find('\t', begin);
     if(pos == std::string::npos) {
-      fields.push_back(line.substr(begin));
+      fields[i] = line.substr(begin);
       break;
     }
-    fields.push_back(line.substr(begin, pos - begin));
+    fields[i] = line.substr(begin, pos - begin);
     begin = pos + 1;
   }
 
-  if(fields.size() < numFields) // make sure there is as many elements as requested
-    fields.resize(numFields);
-
   ABORT_IF(pos != std::string::npos, "Excessive field(s) in the tab-separated line: '{}'", line);
 }
diff --git a/src/data/batch_generator.h b/src/data/batch_generator.h
index a248db23..ea977468 100644
--- a/src/data/batch_generator.h
+++ b/src/data/batch_generator.h
@@ -2,6 +2,7 @@
 
 #include "common/options.h"
 #include "common/signal_handling.h"
+#include "common/timer.h"
 #include "data/batch_stats.h"
 #include "data/rng_engine.h"
 #include "training/training_state.h"
@@ -92,6 +93,8 @@ private:
   // this runs on a bg thread; sequencing is handled by caller, but locking is done in here
   std::deque<BatchPtr> fetchBatches() {
+    timer::Timer total;
+
     typedef typename Sample::value_type Item;
     auto itemCmp = [](const Item& sa, const Item& sb) { return
sa.size() < sb.size(); }; // sort by element length, not content @@ -135,19 +138,29 @@ private: if(current_ != data_->end()) ++current_; } - size_t sets = 0; - while(current_ != data_->end() && maxiBatch->size() < maxSize) { // loop over data + + Samples maxiBatchTemp; + while(current_ != data_->end() && maxiBatchTemp.size() < maxSize) { // loop over data if (saveAndExitRequested()) // stop generating batches return std::deque(); - maxiBatch->push(*current_); - sets = current_->size(); + + maxiBatchTemp.push_back(*current_); + // do not consume more than required for the maxi batch as this causes // that line-by-line translation is delayed by one sentence - bool last = maxiBatch->size() == maxSize; + bool last = maxiBatchTemp.size() == maxSize; if(!last) ++current_; // this actually reads the next line and pre-processes it } - size_t numSentencesRead = maxiBatch->size(); + size_t numSentencesRead = maxiBatchTemp.size(); + + size_t sets = 0; + for(auto&& s : maxiBatchTemp) { + if(!s.empty()) { + sets = s.size(); + maxiBatch->push(s); + } + } // construct the actual batches and place them in the queue Samples batchVector; @@ -163,6 +176,7 @@ private: BatchStats::const_iterator cachedStatsIter; if (stats_) cachedStatsIter = stats_->begin(); + while(!maxiBatch->empty()) { // while there are sentences in the queue if (saveAndExitRequested()) // stop generating batches return std::deque(); @@ -178,12 +192,7 @@ private: lengths[i] = batchVector.back()[i].size(); // record max lengths so far maxBatchSize = stats_->findBatchSize(lengths, cachedStatsIter); - // this optimization makes no difference indeed -#if 0 // sanity check: would we find the same entry if searching from the start? - auto it = stats_->lower_bound(lengths); - auto maxBatchSize1 = stats_->findBatchSize(lengths, it); - ABORT_IF(maxBatchSize != maxBatchSize1, "findBatchSize iter caching logic is borked"); -#endif + makeBatch = batchVector.size() >= maxBatchSize; // if last added sentence caused a bump then we likely have bad padding, so rather move it into the next batch if(batchVector.size() > maxBatchSize) { @@ -231,6 +240,8 @@ private: LOG(debug, "[data] fetched {} batches with {} sentences. 
Per batch: {} sentences, {} labels.", tempBatches.size(), numSentencesRead, (double)totalSent / (double)totalDenom, (double)totalLabels / (double)totalDenom); + LOG(debug, "[data] fetching batches took {:.2f} seconds, {:.2f} sents/s", total.elapsed(), (double)numSentencesRead / total.elapsed()); + return tempBatches; } diff --git a/src/data/corpus.cpp b/src/data/corpus.cpp index d8a364b2..643a7de9 100644 --- a/src/data/corpus.cpp +++ b/src/data/corpus.cpp @@ -14,18 +14,30 @@ namespace data { Corpus::Corpus(Ptr options, bool translate /*= false*/, size_t seed /*= Config:seed*/) : CorpusBase(options, translate, seed), - shuffleInRAM_(options_->get("shuffle-in-ram", false)), - allCapsEvery_(options_->get("all-caps-every", 0)), - titleCaseEvery_(options_->get("english-title-case-every", 0)) {} + shuffleInRAM_(options_->get("shuffle-in-ram", false)), + allCapsEvery_(options_->get("all-caps-every", 0)), + titleCaseEvery_(options_->get("english-title-case-every", 0)) { + + auto numThreads = options_->get("data-threads", 1); + if(numThreads > 1) + threadPool_.reset(new ThreadPool(numThreads)); + +} Corpus::Corpus(std::vector paths, std::vector> vocabs, Ptr options, size_t seed /*= Config:seed*/) : CorpusBase(paths, vocabs, options, seed), - shuffleInRAM_(options_->get("shuffle-in-ram", false)), - allCapsEvery_(options_->get("all-caps-every", 0)), - titleCaseEvery_(options_->get("english-title-case-every", 0)) {} + shuffleInRAM_(options_->get("shuffle-in-ram", false)), + allCapsEvery_(options_->get("all-caps-every", 0)), + titleCaseEvery_(options_->get("english-title-case-every", 0)) { + + auto numThreads = options_->get("data-threads", 1); + if(numThreads > 1) + threadPool_.reset(new ThreadPool(numThreads)); + +} void Corpus::preprocessLine(std::string& line, size_t streamId, bool& altered) { bool isFactoredVocab = vocabs_.back()->tryAs() != nullptr; @@ -52,16 +64,10 @@ void Corpus::preprocessLine(std::string& line, size_t streamId, bool& altered) { } SentenceTuple Corpus::next() { - // Used for handling TSV inputs - // Determine the total number of fields including alignments or weights - auto tsvNumAllFields = tsvNumInputFields_; - if(alignFileIdx_ > -1) - ++tsvNumAllFields; - if(weightFileIdx_ > -1) - ++tsvNumAllFields; - std::vector fields(tsvNumAllFields); + size_t numStreams = corpusInRAM_.empty() ? files_.size() : corpusInRAM_.size(); + std::vector fields(numStreams); - for(;;) { // (this is a retry loop for skipping invalid sentences) + while(true) { // retry loop // get index of the current sentence size_t curId = pos_; // note: at end, pos_ == total size // if corpus has been shuffled, ids_ contains sentence indexes @@ -69,83 +75,91 @@ SentenceTuple Corpus::next() { curId = ids_[pos_]; pos_++; - // fill up the sentence tuple with sentences from all input files - SentenceTuple tup(curId); size_t eofsHit = 0; - size_t numStreams = corpusInRAM_.empty() ? 
files_.size() : corpusInRAM_.size();
-    for(size_t i = 0; i < numStreams; ++i) {
-      std::string line;
-
+    for(size_t i = 0; i < numStreams; ++i) { // looping over all streams
       // fetch line, from cached copy in RAM or actual file
       if (!corpusInRAM_.empty()) {
         if (curId < corpusInRAM_[i].size())
-          line = corpusInRAM_[i][curId];
+          fields[i] = corpusInRAM_[i][curId];
         else {
           eofsHit++;
           continue;
         }
       } else {
-        bool gotLine = io::getline(*files_[i], line).good();
+        bool gotLine = io::getline(*files_[i], fields[i]).good();
         if(!gotLine) {
           eofsHit++;
           continue;
         }
       }
-
-      if(i > 0 && i == alignFileIdx_) {
-        addAlignmentToSentenceTuple(line, tup);
-      } else if(i > 0 && i == weightFileIdx_) {
-        addWeightsToSentenceTuple(line, tup);
-      } else {
-        if(tsv_) { // split TSV input and add each field into the sentence tuple
-          utils::splitTsv(line, fields, tsvNumAllFields);
-          size_t shift = 0;
-          for(size_t j = 0; j < tsvNumAllFields; ++j) {
-            // index j needs to be shifted to get the proper vocab index if guided-alignment or
-            // data-weighting are preceding source or target sequences in TSV input
-            if(j == alignFileIdx_ || j == weightFileIdx_) {
-              ++shift;
-            } else {
-              size_t vocabId = j - shift;
-              bool altered;
-              preprocessLine(fields[j], vocabId, /*out=*/altered);
-              if (altered)
-                tup.markAltered();
-              addWordsToSentenceTuple(fields[j], vocabId, tup);
-            }
-          }
-
-          // weights are added last to the sentence tuple, because this runs a validation that needs
-          // length of the target sequence
-          if(alignFileIdx_ > -1)
-            addAlignmentToSentenceTuple(fields[alignFileIdx_], tup);
-          if(weightFileIdx_ > -1)
-            addWeightsToSentenceTuple(fields[weightFileIdx_], tup);
-
-        } else {
-          bool altered;
-          preprocessLine(line, i, /*out=*/altered);
-          if (altered)
-            tup.markAltered();
-          addWordsToSentenceTuple(line, i, tup);
-        }
-      }
     }
 
-    if (eofsHit == numStreams)
-      return SentenceTuple(0);
+    if(eofsHit == numStreams)
+      return SentenceTuple(); // uninitialized SentenceTuple, which will be invalid when tested
+
     ABORT_IF(eofsHit != 0, "not all input files have the same number of lines");
 
-    // check if all streams are valid, that is, non-empty and no longer than maximum allowed length
-    if(std::all_of(tup.begin(), tup.end(), [=](const Words& words) {
-         return words.size() > 0 && words.size() <= maxLength_;
-       }))
-      return tup;
+    auto makeSentenceTuple = [this](size_t curId, std::vector<std::string> fields) {
+      if(tsv_) {
+        // with TSV input data, there is only one input stream, hence we only have one field
+        // which needs to be tokenized into tab-separated fields
+        ABORT_IF(fields.size() != 1, "Reading a TSV file, but we don't have exactly one stream??");
+        size_t numAllFields = tsvNumInputFields_;
+        if(alignFileIdx_ > -1)
+          ++numAllFields;
+        if(weightFileIdx_ > -1)
+          ++numAllFields;
+        // replace single-element fields array with extracted tsv fields
+        std::vector<std::string> tmpFields;
+        utils::splitTsv(fields[0], tmpFields, numAllFields); // this verifies the number of fields
+        fields.swap(tmpFields);
+      }
 
-    // otherwise skip this sentence and try the next one
-    // @TODO: tail recursion?
- } + // fill up the sentence tuple with sentences from all input files + SentenceTupleImpl tup(curId); + size_t shift = 0; + for(size_t i = 0; i < fields.size(); ++i) { + // index j needs to be shifted to get the proper vocab index if guided-alignment or + // data-weighting are preceding source or target sequences in TSV input + if(i == alignFileIdx_ || i == weightFileIdx_) { + ++shift; + } else { + size_t vocabId = i - shift; + bool altered; + preprocessLine(fields[i], vocabId, /*out=*/altered); + if (altered) + tup.markAltered(); + addWordsToSentenceTuple(fields[i], vocabId, tup); + } + + // weights are added last to the sentence tuple, because this runs a validation that needs + // length of the target sequence + if(alignFileIdx_ > -1) + addAlignmentToSentenceTuple(fields[alignFileIdx_], tup); + if(weightFileIdx_ > -1) + addWeightsToSentenceTuple(fields[weightFileIdx_], tup); + } + + // check if all streams are valid, that is, non-empty and no longer than maximum allowed length + if(std::all_of(tup.begin(), tup.end(), [=](const Words& words) { + return words.size() > 0 && words.size() <= maxLength_; + })) { + return tup; + } else { + return SentenceTupleImpl(); // return an empty tuple if above test does not pass + } + }; + + if(threadPool_) { // use thread pool if available + return SentenceTuple(threadPool_->enqueue(makeSentenceTuple, curId, fields)); + } else { // otherwise launch here and just pass the result into the wrapper + auto tup = makeSentenceTuple(curId, fields); + if(!tup.empty()) + return SentenceTuple(tup); + } + + } // end of retry loop } // reset and initialize shuffled reading @@ -167,6 +181,8 @@ void Corpus::reset() { pos_ = 0; for (size_t i = 0; i < paths_.size(); ++i) { if(paths_[i] == "stdin" || paths_[i] == "-") { + std::cin.tie(0); + std::ios_base::sync_with_stdio(false); files_[i].reset(new std::istream(std::cin.rdbuf())); // Probably not necessary, unless there are some buffers // that we want flushed. 
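For illustration, the pattern that `threadPool_->enqueue(makeSentenceTuple, curId, fields)` above relies on, and that the corpus.h/corpus_base.h changes below implement, distilled into a self-contained sketch. Here `std::async` stands in for marian's `ThreadPool::enqueue`, and the names WorkItem, LazyResult and makeItem are placeholders, not marian types:

// Minimal sketch of a future-backed, lazily materialized result wrapper.
#include <future>
#include <memory>
#include <string>
#include <utility>
#include <vector>

struct WorkItem {                         // stands in for SentenceTupleImpl
  std::vector<std::string> fields;
};

class LazyResult {                        // stands in for the new SentenceTuple
private:
  std::shared_ptr<std::future<WorkItem>> fut_;
  mutable std::shared_ptr<WorkItem> val_; // cached result after first access
public:
  LazyResult() {}
  explicit LazyResult(std::future<WorkItem>&& fut)
      : fut_(std::make_shared<std::future<WorkItem>>(std::move(fut))) {}

  bool valid() const { return fut_ || val_; }

  const WorkItem& get() const {           // blocks here, not at creation time
    if(!val_)
      val_ = std::make_shared<WorkItem>(fut_->get()); // future::get() may run only once
    return *val_;
  }
};

int main() {
  auto makeItem = [](std::string line) {  // the expensive part, e.g. tokenization
    return WorkItem{{std::move(line)}};
  };
  LazyResult res(std::async(std::launch::async, makeItem, std::string("a b c")));
  // batching code can copy LazyResult objects around cheaply; only the first
  // access to the contents synchronizes with the worker thread
  return res.get().fields.size() == 1 ? 0 : 1;
}

As in the patch, copies of the wrapper share one future through a shared_ptr, so the reader thread can hand results to the batch generator before preprocessing has finished.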
diff --git a/src/data/corpus.h b/src/data/corpus.h index e8e9a9fd..281d43a2 100644 --- a/src/data/corpus.h +++ b/src/data/corpus.h @@ -4,6 +4,7 @@ #include #include +#include "3rd_party/threadpool.h" #include "common/definitions.h" #include "common/file_stream.h" #include "common/options.h" @@ -20,6 +21,8 @@ class Corpus : public CorpusBase { private: std::vector> tempFiles_; std::vector ids_; + + UPtr threadPool_; // thread pool for parallelized data reading // for shuffle-in-ram bool shuffleInRAM_{false}; diff --git a/src/data/corpus_base.cpp b/src/data/corpus_base.cpp index 5f9a9ee3..bfce31bf 100644 --- a/src/data/corpus_base.cpp +++ b/src/data/corpus_base.cpp @@ -12,7 +12,24 @@ typedef std::vector MaskBatch; typedef std::pair WordMask; typedef std::vector SentBatch; -CorpusIterator::CorpusIterator() : pos_(-1), tup_(0) {} +void SentenceTupleImpl::setWeights(const std::vector& weights) { + if(weights.size() != 1) { // this assumes a single sentence-level weight is always fine + ABORT_IF(empty(), "Source and target sequences should be added to a tuple before data weights"); + auto numWeights = weights.size(); + auto numTrgWords = back().size(); + // word-level weights may or may not contain a weight for EOS tokens + if(numWeights != numTrgWords && numWeights != numTrgWords - 1) + LOG(warn, + "[warn] " + "Number of weights ({}) does not match the number of target words ({}) in line #{}", + numWeights, + numTrgWords, + id_); + } + weights_ = weights; +} + +CorpusIterator::CorpusIterator() : pos_(-1) {} CorpusIterator::CorpusIterator(CorpusBase* corpus) : corpus_(corpus), pos_(0), tup_(corpus_->next()) {} @@ -23,7 +40,7 @@ void CorpusIterator::increment() { } bool CorpusIterator::equal(CorpusIterator const& other) const { - return this->pos_ == other.pos_ || (this->tup_.empty() && other.tup_.empty()); + return this->pos_ == other.pos_ || (!this->tup_.valid() && !other.tup_.valid()); } const SentenceTuple& CorpusIterator::dereference() const { @@ -390,7 +407,7 @@ CorpusBase::CorpusBase(Ptr options, bool translate, size_t seed) void CorpusBase::addWordsToSentenceTuple(const std::string& line, size_t batchIndex, - SentenceTuple& tup) const { + SentenceTupleImpl& tup) const { // This turns a string in to a sequence of numerical word ids. Depending // on the vocabulary type, this can be non-trivial, e.g. when SentencePiece // is used. 
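The check moved into SentenceTupleImpl::setWeights above encodes a small contract worth spelling out: a single sentence-level weight always passes, while word-level weight vectors may either include or omit a weight for the implicit EOS token. A standalone illustration (checkWeights is a made-up helper, not marian code):

#include <cstddef>
#include <cstdio>

// Mirrors the accepted shapes in setWeights: 1 weight, numTrgWords weights,
// or numTrgWords - 1 weights (EOS weight omitted).
bool checkWeights(std::size_t numWeights, std::size_t numTrgWords) {
  if(numWeights == 1)                        // sentence-level weight
    return true;
  return numWeights == numTrgWords           // one weight per target word incl. EOS
         || numWeights == numTrgWords - 1;   // or excluding EOS
}

int main() {
  // a target sentence "a b c </s>" has 4 words including EOS
  std::printf("%d %d %d %d\n",
              checkWeights(1, 4),   // 1: sentence-level
              checkWeights(4, 4),   // 1: per-word incl. EOS
              checkWeights(3, 4),   // 1: per-word excl. EOS
              checkWeights(2, 4));  // 0: mismatch -> marian only logs a warning
  return 0;
}
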
@@ -411,7 +428,7 @@ void CorpusBase::addWordsToSentenceTuple(const std::string& line, } void CorpusBase::addAlignmentToSentenceTuple(const std::string& line, - SentenceTuple& tup) const { + SentenceTupleImpl& tup) const { ABORT_IF(rightLeft_, "Guided alignment and right-left model cannot be used " "together at the moment"); @@ -420,7 +437,7 @@ void CorpusBase::addAlignmentToSentenceTuple(const std::string& line, tup.setAlignment(align); } -void CorpusBase::addWeightsToSentenceTuple(const std::string& line, SentenceTuple& tup) const { +void CorpusBase::addWeightsToSentenceTuple(const std::string& line, SentenceTupleImpl& tup) const { auto elements = utils::split(line, " "); if(!elements.empty()) { @@ -549,23 +566,6 @@ size_t CorpusBase::getNumberOfTSVInputFields(Ptr options) { return 0; } -void SentenceTuple::setWeights(const std::vector& weights) { - if(weights.size() != 1) { // this assumes a single sentence-level weight is always fine - ABORT_IF(empty(), "Source and target sequences should be added to a tuple before data weights"); - auto numWeights = weights.size(); - auto numTrgWords = back().size(); - // word-level weights may or may not contain a weight for EOS tokens - if(numWeights != numTrgWords && numWeights != numTrgWords - 1) - LOG(warn, - "[warn] " - "Number of weights ({}) does not match the number of target words ({}) in line #{}", - numWeights, - numTrgWords, - id_); - } - weights_ = weights; -} - // experimental: hide inline-fix source tokens from cross attention std::vector SubBatch::crossMaskWithInlineFixSourceSuppressed() const { diff --git a/src/data/corpus_base.h b/src/data/corpus_base.h index 251df5bc..82a01286 100644 --- a/src/data/corpus_base.h +++ b/src/data/corpus_base.h @@ -11,6 +11,8 @@ #include "data/rng_engine.h" #include "data/vocab.h" +#include + namespace marian { namespace data { @@ -22,7 +24,7 @@ namespace data { * construction of marian::data::CorpusBatch objects. They are not a part of * marian::data::CorpusBatch. */ -class SentenceTuple { +class SentenceTupleImpl { private: size_t id_; std::vector tuple_; // [stream index][step index] @@ -33,12 +35,17 @@ private: public: typedef Words value_type; + /** + * @brief Creates an empty tuple with 0 id (default constructor). + */ + SentenceTupleImpl() : id_(0) {} + /** * @brief Creates an empty tuple with the given Id. */ - SentenceTuple(size_t id) : id_(id) {} + SentenceTupleImpl(size_t id) : id_(id) {} - ~SentenceTuple() { tuple_.clear(); } + ~SentenceTupleImpl() {} /** * @brief Returns the sentence's ID. @@ -114,6 +121,92 @@ public: void setAlignment(const WordAlignment& alignment) { alignment_ = alignment; } }; +class SentenceTuple { +private: + std::shared_ptr> fImpl_; + mutable std::shared_ptr impl_; + +public: + typedef Words value_type; + + /** + * @brief Creates an empty tuple with no associated future. + */ + SentenceTuple() {} + + SentenceTuple(const SentenceTupleImpl& tupImpl) + : impl_(std::make_shared(tupImpl)) {} + + SentenceTuple(std::future&& fImpl) + : fImpl_(new std::future(std::move(fImpl))) {} + + SentenceTupleImpl& get() const { + if(!impl_) { + ABORT_IF(!fImpl_ || !fImpl_->valid(), "No future tuple associated with SentenceTuple"); + impl_ = std::make_shared(fImpl_->get()); + } + return *impl_; + } + + /** + * @brief Returns the sentence's ID. + */ + size_t getId() const { return get().getId(); } + + /** + * @brief Returns whether this Tuple was altered or augmented from what + * was provided to Marian in input. 
+ */ + bool isAltered() const { return get().isAltered(); } + + /** + * @brief The size of the tuple, e.g. two for parallel data with a source and + * target sentences. + */ + size_t size() const { return get().size(); } + + /** + * @brief confirms that the tuple has been populated with data + */ + bool valid() const { + return fImpl_ || impl_; + } + + /** + * @brief The i-th tuple sentence. + * + * @param i Tuple's index. + */ + Words& operator[](size_t i) { return get()[i]; } + const Words& operator[](size_t i) const { return get()[i]; } + + /** + * @brief The last tuple sentence, i.e. the target sentence. + */ + Words& back() { return get().back(); } + const Words& back() const { return get().back(); } + + /** + * @brief Checks whether the tuple is empty. + */ + bool empty() const { return get().empty(); } + + auto begin() const -> decltype(get().begin()) { return get().begin(); } + auto end() const -> decltype(get().end()) { return get().end(); } + + auto rbegin() const -> decltype(get().rbegin()) { return get().rbegin(); } + auto rend() const -> decltype(get().rend()) { return get().rend(); } + + /** + * @brief Get sentence weights. + * + * For sentence-level weights the vector contains only one element. + */ + const std::vector& getWeights() const { return get().getWeights(); } + + const WordAlignment& getAlignment() const { return get().getAlignment(); } +}; + /** * @brief Batch of sentences represented as word indices with masking. */ @@ -586,17 +679,17 @@ protected: * @brief Helper function converting a line of text into words using the i-th * vocabulary and adding them to the sentence tuple. */ - void addWordsToSentenceTuple(const std::string& line, size_t batchIndex, SentenceTuple& tup) const; + void addWordsToSentenceTuple(const std::string& line, size_t batchIndex, SentenceTupleImpl& tup) const; /** * @brief Helper function parsing a line with word alignments and adding them * to the sentence tuple. */ - void addAlignmentToSentenceTuple(const std::string& line, SentenceTuple& tup) const; + void addAlignmentToSentenceTuple(const std::string& line, SentenceTupleImpl& tup) const; /** * @brief Helper function parsing a line of weights and adding them to the * sentence tuple. 
*/ - void addWeightsToSentenceTuple(const std::string& line, SentenceTuple& tup) const; + void addWeightsToSentenceTuple(const std::string& line, SentenceTupleImpl& tup) const; void addAlignmentsToBatch(Ptr batch, const std::vector& batchVector); diff --git a/src/data/corpus_nbest.cpp b/src/data/corpus_nbest.cpp index d5a48d8d..8029d351 100644 --- a/src/data/corpus_nbest.cpp +++ b/src/data/corpus_nbest.cpp @@ -43,7 +43,7 @@ SentenceTuple CorpusNBest::next() { pos_++; // fill up the sentence tuple with sentences from all input files - SentenceTuple tup(curId); + SentenceTupleImpl tup(curId); std::string line; lastLines_.resize(files_.size() - 1); @@ -74,9 +74,10 @@ SentenceTuple CorpusNBest::next() { if(cont && std::all_of(tup.begin(), tup.end(), [=](const Words& words) { return words.size() > 0 && words.size() <= maxLength_; })) - return tup; + return SentenceTuple(tup); } - return SentenceTuple(0); + + return SentenceTuple(); } void CorpusNBest::reset() { diff --git a/src/data/corpus_sqlite.cpp b/src/data/corpus_sqlite.cpp index 297847c0..f7c577f2 100644 --- a/src/data/corpus_sqlite.cpp +++ b/src/data/corpus_sqlite.cpp @@ -109,7 +109,7 @@ SentenceTuple CorpusSQLite::next() { while(select_->executeStep()) { // fill up the sentence tuple with sentences from all input files size_t curId = select_->getColumn(0).getInt(); - SentenceTuple tup(curId); + SentenceTupleImpl tup(curId); for(size_t i = 0; i < files_.size(); ++i) { auto line = select_->getColumn((int)(i + 1)); @@ -126,9 +126,9 @@ SentenceTuple CorpusSQLite::next() { if(std::all_of(tup.begin(), tup.end(), [=](const Words& words) { return words.size() > 0 && words.size() <= maxLength_; })) - return tup; + return SentenceTuple(tup); } - return SentenceTuple(0); + return SentenceTuple(); } void CorpusSQLite::shuffle() { diff --git a/src/data/text_input.cpp b/src/data/text_input.cpp index 958190fc..b1f4cdd4 100644 --- a/src/data/text_input.cpp +++ b/src/data/text_input.cpp @@ -40,7 +40,7 @@ SentenceTuple TextInput::next() { size_t curId = pos_++; // fill up the sentence tuple with source and/or target sentences - SentenceTuple tup(curId); + SentenceTupleImpl tup(curId); for(size_t i = 0; i < files_.size(); ++i) { std::string line; if(io::getline(*files_[i], line)) { @@ -57,9 +57,9 @@ SentenceTuple TextInput::next() { } if(tup.size() == files_.size()) // check if each input file provided an example - return tup; + return SentenceTuple(tup); else if(tup.size() == 0) // if no file provided examples we are done - return SentenceTuple(0); + return SentenceTuple(); else // neither all nor none => we have at least on missing entry ABORT("There are missing entries in the text tuples."); } From bbc673c50fbf2faa90bdc44003d15087632262bc Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Wed, 24 Nov 2021 18:42:14 -0800 Subject: [PATCH 10/11] update CHANGELOG and VERSION --- CHANGELOG.md | 6 +++++- VERSION | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 05658fe1..bce24cfc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,7 +9,11 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ## [Unreleased] ### Added -- Adds option --add-lsh to marian-conv which allows the LSH to be memory-mapped. +- Parallelized data reading with e.g. `--data-threads 8` +- Top-k sampling during decoding with e.g. `--output-sampling topk 10` +- Improved mixed precision training with `--fp16` +- Set FFN width in decoder independently from encoder with e.g. 
`--transformer-dim-ffn 4096 --transformer-decoder-dim-ffn 2048` +- Adds option `--add-lsh` to marian-conv which allows the LSH to be memory-mapped. - Early stopping based on first, all, or any validation metrics via `--early-stopping-on` - Compute 8.6 support if using CUDA>=11.1 - Support for RMSNorm as drop-in replace for LayerNorm from `Biao Zhang; Rico Sennrich (2019). Root Mean Square Layer Normalization`. Enabled in Transformer model via `--transformer-postprocess dar` instead of `dan`. diff --git a/VERSION b/VERSION index 3c40cf56..cf4bd774 100644 --- a/VERSION +++ b/VERSION @@ -1,2 +1,2 @@ -v1.10.24 +v1.10.42 From e8ea37cd5b85e3df817b9ced68bef9cc64b45d16 Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Mon, 6 Dec 2021 23:20:44 +0000 Subject: [PATCH 11/11] Merged PR 21648: Allow for dynamic gradient scaling to fade out after N updates Allow for dynamic gradient scaling to fade out after N updates --- src/tensors/gpu/prod.cpp | 6 +++++- src/training/graph_group.cpp | 17 ++++++++++++++--- src/training/graph_group.h | 1 + 3 files changed, 20 insertions(+), 4 deletions(-) diff --git a/src/tensors/gpu/prod.cpp b/src/tensors/gpu/prod.cpp index bf0d2395..c72af4db 100755 --- a/src/tensors/gpu/prod.cpp +++ b/src/tensors/gpu/prod.cpp @@ -562,7 +562,11 @@ void ProdBatchedLegacy(marian::Tensor C, ProdBatchedTypedLegacy(C, allocator, A, B, transA, transB, beta, scalar); #if COMPILE_FP16 } else if(C->type() == Type::float16) { // not a *.cu file - ProdBatchedTypedLegacy(C, allocator, A, B, transA, transB, __float2half(beta), __float2half(scalar)); + // we use computeType=float here for fp16 training as this seems more stable and roughly as fast + ProdBatchedTypedLegacy(C, allocator, A, B, transA, transB, beta, scalar); + + // original for reference: + // ProdBatchedTypedLegacy(C, allocator, A, B, transA, transB, __float2half(beta), __float2half(scalar)); #endif } else { ABORT("ProdBatchedLegacy not implemented for element type {}", C->type()); diff --git a/src/training/graph_group.cpp b/src/training/graph_group.cpp index 03e5acf4..59cd4b6d 100644 --- a/src/training/graph_group.cpp +++ b/src/training/graph_group.cpp @@ -31,11 +31,16 @@ GraphGroup::GraphGroup(Ptr options, Ptr mpi) if(vgc.size() > 0) dynamicGradientScalingFactor_ = std::stof(vgc[0]); if(vgc.size() > 1) dynamicGradientScalingUseLogs_ = vgc[1] == "log"; + if(vgc.size() > 2) dynamicGradientScalingFadeout_ = std::stoul(vgc[2]); LOG_ONCE(info, "Re-scaling gradient to have average gradient norm if (log={}) gradient norm diverges from average by {} sigmas", dynamicGradientScalingUseLogs_, dynamicGradientScalingFactor_); + if(dynamicGradientScalingFadeout_ > 0) + LOG_ONCE(info, + "Dynamic gradient re-scaling will fade out linearly after {} updates", + dynamicGradientScalingFadeout_); } if(options_->get("check-gradient-nan")) { @@ -229,11 +234,17 @@ float GraphGroup::computeNormalizationFactor(float gNorm, size_t updateTrgWords) auto deltaTransform = gNormTransform - gNormAvgTransform; // compute the difference between the current transformer gradient norm and the running average. auto gNormStdTransform = std::sqrt(gNormVarTransform); // compute STD for the running average of (log) gradient norms. 
+ float fadeoutMultiplier = 1.f; + if(dynamicGradientScalingFadeout_ > 0ul) // fade out linearly after that many updates @TODO: allow units other than updates + fadeoutMultiplier = (float)std::max(dynamicGradientScalingFadeout_, scheduler_->numberOfBatches()) / (float)dynamicGradientScalingFadeout_; + + float dynamicGradientScalingFactorWithFadeout = dynamicGradientScalingFactor_ * fadeoutMultiplier; // if fadeoutMultiplier increases dynamic gradient scaling becomes less and less likely to happen over time. // delta of (log) gradient norm vs (log) gradient norm average is larger than N standard deviations // hence rescale gradient using the average. - if(scheduler_->numberOfBatches() >= window && deltaTransform > dynamicGradientScalingFactor_ * gNormStdTransform) { - LOG(debug, "log gradient norms: {} :: {:.4f} - {:.4f} = {:.4f} > {:.4f} * {:.4f}", - dynamicGradientScalingUseLogs_, gNormTransform, gNormAvgTransform, deltaTransform, dynamicGradientScalingFactor_, gNormStdTransform); + if(scheduler_->numberOfBatches() >= window && deltaTransform > dynamicGradientScalingFactorWithFadeout * gNormStdTransform) { + if(isMainProcess()) + LOG(debug, "log gradient norms: {} :: {:.4f} - {:.4f} = {:.4f} > {:.4f} * {:.4f} - scaling gradient by {:.4f}", + dynamicGradientScalingUseLogs_, gNormTransform, gNormAvgTransform, deltaTransform, dynamicGradientScalingFactorWithFadeout, gNormStdTransform, gNormAvg / gNorm); normalizationFactor *= gNorm / gNormAvg; // since we later do gradient / normalizationFactor this divides by norm and multiplies by the average, rescaling to the average. } diff --git a/src/training/graph_group.h b/src/training/graph_group.h index b7f2f7ef..aa68922a 100644 --- a/src/training/graph_group.h +++ b/src/training/graph_group.h @@ -74,6 +74,7 @@ protected: bool dynamicGradientScaling_{false}; float dynamicGradientScalingFactor_{2.f}; bool dynamicGradientScalingUseLogs_{false}; + size_t dynamicGradientScalingFadeout_{0ul}; // determines the number of input streams (i.e. input files or fields in the TSV input) that need // to be included in the batch, i.e. without alignments and weights
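
To make the fade-out concrete: with, e.g., `--dynamic-gradient-scaling 2 log 10000` (factor 2, log-space norms, fade out after 10k updates, assuming the three positional arguments are parsed as vgc[0..2] in the hunk above), the effective threshold grows linearly once the update count passes the horizon, so re-scaling fires less and less often. A standalone numeric sketch, not marian code:

#include <algorithm>
#include <cstddef>
#include <cstdio>

int main() {
  const float factor = 2.f;          // vgc[0]: re-scale beyond this many sigmas
  const std::size_t fadeout = 10000; // vgc[2]: fade out after this many updates

  for(std::size_t batches : {1000u, 10000u, 20000u, 50000u}) {
    // before the horizon the multiplier is 1, afterwards it grows linearly
    float mult = (float)std::max(fadeout, batches) / (float)fadeout;
    // the gradient is re-scaled only if delta > (factor * mult) * sigma
    std::printf("update %6zu: threshold = %4.1f sigmas\n", batches, factor * mult);
  }
  return 0;
}

Before the horizon the behaviour is unchanged (a 2-sigma threshold); at 50k updates an outlier would have to be 10 sigmas away, so the heuristic has effectively faded out.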