Merged PR 21151: Cleaning up fp16 behavior

This PR improves clipping and pruning behavior of NaNs and Infs during fp16 training, ultimately avoiding the underflow problems that we were facing so far.
2024-09-17 09:47:34 +03:00 · 2021-10-26 20:25:39 +00:00 · 2021-10-26 20:25:39 +00:00 · 1404201926
commit 1404201926
parent 7f06f3c5d2
13 changed files with 233 additions and 141 deletions
--- a/src/common/aliases.cpp
+++ b/src/common/aliases.cpp
@ -29,8 +29,8 @@ void ConfigParser::addAliases(cli::CLIWrapper& cli) {
  cli.alias("fp16", "true", [&](YAML::Node& config) {
    if(mode_ == cli::mode::training) {
      config["precision"] = std::vector<std::string>({"float16", "float32"}); // inference type, optimization type, save type
-      // scaling factor (power of 2), frequency, multiplier at increase, tolerance, range, minium factor
-      config["cost-scaling"] = std::vector<std::string>({"0", "1000", "2", "0.05", "10", "1e-5"}); 
+      // scaling factor, frequency, multiplier at increase, minium scaling factor
+      config["cost-scaling"] = std::vector<std::string>({"256.f", "1000", "2.f", "256.f"});
    } else {
      config["precision"] = std::vector<std::string>({"float16"}); // for inference we do not need the other types
    }
--- a/src/common/config_parser.cpp
+++ b/src/common/config_parser.cpp
@ -522,15 +522,15 @@ void ConfigParser::addOptionsTraining(cli::CLIWrapper& cli) {
  // mixed precision training
  cli.add<bool>("--fp16",
      "Shortcut for mixed precision training with float16 and cost-scaling, "
-      "corresponds to: --precision float16 float32 --cost-scaling 0 1000 2 0.05 10 1e-5f");
+      "corresponds to: --precision float16 float32 --cost-scaling 256.f 1000 2.f 256.f");
  cli.add<std::vector<std::string>>("--precision",
      "Mixed precision training for forward/backward pass and optimizaton. "
      "Defines types for: forward/backward pass, optimization.",
      {"float32", "float32"});
  cli.add<std::vector<std::string>>("--cost-scaling",
      "Dynamic cost scaling for mixed precision training: "
-      "power of 2, scaling window, scaling factor, tolerance, range, minimum factor")
-      ->implicit_val("0.f 1000 2.f 0.05f 10 1e-5f");
+      "scaling factor, frequency, multiplier, minimum factor")
+      ->implicit_val("256.f 1000 2.f 256.f");
  cli.add<size_t>("--gradient-norm-average-window",
      "Window size over which the exponential average of the gradient norm is recorded (for logging and scaling). "
      "After this many updates about 90% of the mass of the exponential average comes from these updates",
--- a/src/common/definitions.h
+++ b/src/common/definitions.h
@ -106,24 +106,24 @@ using Weak = std::weak_ptr<T>;
 /** @brief Creates shared_ptr of any type, passes all arguments to any available
 * constructor */
 template <class T, typename... Args>
-Ptr<T> New(Args&&... args) {
-  return Ptr<T>(new T(std::forward<Args>(args)...));
+inline Ptr<T> New(Args&&... args) {
+  return std::make_shared<T>(std::forward<Args>(args)...);
 }

 template <class T>
-Ptr<T> New(Ptr<T> p) {
+inline Ptr<T> New(Ptr<T> p) {
  return Ptr<T>(p);
 }

 /** @brief Creates InstrusivePtr of any type, passes all arguments to any available
 * constructor */
 template <class T, typename... Args>
-IPtr<T> INew(Args&&... args) {
+inline IPtr<T> INew(Args&&... args) {
  return IPtr<T>(new T(std::forward<Args>(args)...));
 }

 template <class T>
-IPtr<T> INew(Ptr<T> p) {
+inline IPtr<T> INew(Ptr<T> p) {
  return IPtr<T>(p);
 }

--- a/src/models/transformer.h
+++ b/src/models/transformer.h
@ -147,8 +147,7 @@ public:

    int dimDepth = dimModel / dimHeads;

-    auto output
-        = reshape(input, {dimBatch * dimBeam, dimSteps, dimHeads, dimDepth});
+    auto output = reshape(input, {dimBatch * dimBeam, dimSteps, dimHeads, dimDepth});

    return transpose(output, {0, 2, 1, 3}); // [dimBatch*dimBeam, dimHeads, dimSteps, dimDepth]
  }
@ -361,9 +360,9 @@ public:

  Expr LayerAttention(std::string prefix,
                      Expr input,         // [-4: beam depth, -3: batch size, -2: max length, -1: vector dim]
-                      const Expr& keys,   // [-4: beam depth=1, -3: batch size, -2: max length, -1: vector dim]
-                      const Expr& values, // ...?
-                      const Expr& mask,   // [-4: batch size, -3: num heads broadcast=1, -2: max length broadcast=1, -1: max length]
+                      Expr keys,   // [-4: beam depth=1, -3: batch size, -2: max length, -1: vector dim]
+                      Expr values, // ...?
+                      Expr mask,   // [-4: batch size, -3: num heads broadcast=1, -2: max length broadcast=1, -1: max length]
                      int dimHeads,
                      bool cache = false,
                      bool saveAttentionWeights = false) {
@ -373,6 +372,12 @@ public:
    auto opsPre = opt<std::string>("transformer-preprocess");
    auto output = preProcess(prefix + "_Wo", opsPre, input, dropProb);

+    // fixes missing norm for keys and values in self-attention with pre-norm
+    if(input == keys)
+       keys = output;
+    if(input == values)
+       values = output;
+
    // multi-head self-attention over previous input
    output = MultiHead(prefix, dimModel, dimHeads, output, keys, values, mask, cache, saveAttentionWeights);
    
--- a/src/tensors/cpu/tensor_operators.cpp
+++ b/src/tensors/cpu/tensor_operators.cpp
@ -24,6 +24,10 @@ void IsNaN(const Tensor /*in*/, Ptr<Allocator> /*allocator*/, bool& /*isNaN*/, b
  ABORT("Not implemented");
 }

+bool SanitizeGradient(marian::Tensor /*in*/, Ptr<Allocator> /*allocator*/, bool /*pruneNaN*/, bool /*clipInf*/) {
+  ABORT("Not implemented");
+}
+
 template <bool add, typename To, typename From>
 void CopyCastTo(To* out, const From* in, int length) {
  for(int i = 0; i < length; ++i)
--- a/src/tensors/gpu/element.cu
+++ b/src/tensors/gpu/element.cu
@ -29,7 +29,9 @@ __global__ void gElement(
          indices[i] = tensors[i].shape().bindex(dims);
      }

-      tensors[0].data()[index] = functional::apply(functor, tensors, indices);
+      // This performs the internal application of the functor in float32 regardless of the input type.
+      // It seems there are no speed penalties but improved precision.
+      tensors[0].data()[index] = (T)functional::applyWithCast<float>(functor, tensors, indices);
    }
  }
 }
@ -65,13 +67,7 @@ void Element(Functor functor, Tensor out, Tensors... tensors) {
    ElementTyped<float>(functor, out, tensors...);
  } else if(out->type() == Type::float16) {
 #if COMPILE_FP16
-    std::vector<marian::Tensor> ts({out, tensors...});
-    bool div2 = std::all_of(ts.cbegin(), ts.cend(), [](marian::Tensor t){ return t->shape()[-1] % 2 == 0; });
-    if(div2) {
-      ElementTyped<halfx2>(functor, out, tensors...);
-    } else {
-      ElementTyped<half>(functor, out, tensors...);
-    }
+    ElementTyped<half>(functor, out, tensors...);
 #else
    ABORT("FP16 not supported with chosen current hardware or CUDA version");
 #endif
--- a/src/tensors/gpu/tensor_operators.cu
+++ b/src/tensors/gpu/tensor_operators.cu
@ -16,15 +16,12 @@ namespace gpu {
 namespace atomics {

 static inline  __device__ void atomicAdd(float *address, float val) {
-  //*address += val;
  ::atomicAdd(address, val);
 }

 #if COMPILE_FP16
 // @TODO: copied from CuTorch, adapt this better, give credit.
 static inline  __device__ void atomicAdd(half *address, half val) {
-  //*address += val;
-
 #if __CUDA_ARCH__ >= 700 && CUDA_VERSION >= 10000 // compute capability 70 and higher with CUDA 10
  ::atomicAdd(address, val);
 #else // __CUDA_ARCH__ < 700
@ -50,7 +47,8 @@ static inline  __device__ void atomicAdd(half *address, half val) {
    } while (assumed != old);
 #endif // __CUDA_ARCH__
 }
-#endif
+#endif // COMPILE_FP16
+

 }

@ -96,6 +94,81 @@ void IsNaN(const Tensor in, Ptr<Allocator> allocator, bool& isNaN, bool& isInf)
  cudaStreamSynchronize(0);
 }

+template <typename T>
+__global__ void gSanitizeGradient(T* in, int length,
+                                  bool* isNaN, bool* isInf,
+                                  bool pruneNaN, bool clipInf,
+                                  float forNaN = 0.f, float forInf = 65504.f, float forInfNeg = -65504.f) {
+  for(int bid = 0; bid < length; bid += blockDim.x * gridDim.x) {
+    int index = bid + blockDim.x * blockIdx.x + threadIdx.x;
+    if(index < length) {
+      float v = (float)in[index];
+      // handle NaN
+      if(isnan(v)) {
+        if(pruneNaN) {
+          in[index] = (T)forNaN;
+        } else {
+          *isNaN = true;
+        }
+      }
+      // handle +/- Inf
+      if(isinf(v)) {
+        if(clipInf) {
+          in[index] = v > 0 ? (T)forInf : (T)forInfNeg;
+        } else {
+         *isInf = true;
+        }
+      }
+    }
+  }
+}
+
+// This function is meant to clean gradients, i.e. clip infinities and prune NaNs if required. 
+// If all NaNs and Infs have been removed we return `true` for indicating a sane gradient. 
+// If `clipInf` is set, infinities are replaced with the maximum/minimum non-inf value for the tensor. 
+// In that case infinities do not result in a bad gradient, since they get clipped.
+// If `pruneNaN` is set, NaNs are replaced with 0. Since NaNs get removed now they do not result 
+// in a bad gradient.
+// If NaNs or infinities are detected but not removed (either because of `pruneNaN=false` or `clipInf=false`), 
+// we return `false` indicating a bad gradient. 
+bool SanitizeGradient(marian::Tensor in, Ptr<Allocator> allocator, bool pruneNaN, bool clipInf) {
+  cudaSetDevice(in->getDeviceId().no);
+
+  int length = in->size();
+
+  int threads = std::min(MAX_THREADS, length);
+  int blocks = std::min(MAX_BLOCKS, length / threads + (length % threads != 0));
+
+  auto mem = allocator->alloc<bool>(2);
+  bool* dIsNaN = &mem->data<bool>()[0];
+  bool* dIsInf = &mem->data<bool>()[1];
+  fill(in->getBackend(), dIsNaN, dIsNaN + 2, false);
+
+  float forNaN    = 0.f;
+  float forInf    = NumericLimits<float>(in->type()).max;
+  float forInfNeg = NumericLimits<float>(in->type()).lowest;
+
+  if(in->type() == Type::float32) {
+    gSanitizeGradient<<<blocks, threads>>>(in->data<float>(), length, dIsNaN, dIsInf, pruneNaN, clipInf, forNaN, forInf, forInfNeg);
+#if COMPILE_FP16
+  } else if(in->type() == Type::float16) {
+    gSanitizeGradient<<<blocks, threads>>>(in->data<half>(), length, dIsNaN, dIsInf, pruneNaN, clipInf, forNaN, forInf, forInfNeg);
+#endif
+  } else {
+    ABORT("gSanitizeGradient for type {} not implemented", in->type());
+  }
+
+  bool isNaN, isInf;
+  CudaCopy(dIsNaN, dIsNaN + 1, &isNaN);
+  CudaCopy(dIsInf, dIsInf + 1, &isInf);
+
+  allocator->free(mem);
+
+  cudaStreamSynchronize(0);
+
+  return !isNaN && !isInf;
+}
+
 template <bool add, typename To, typename From>
 __global__ void gCopyCastTo(To* out, const From* in, int length) {
  for(int bid = 0; bid < length; bid += blockDim.x * gridDim.x) {
@ -1090,7 +1163,7 @@ void PasteRows(Tensor out,
  size_t rowsToCopy = indices->size();

  int threads = std::min(MAX_THREADS, (int)cols);
-#if 1   // @TODO: make this configurable with a 'deterministic' flag
+#if 0   // @TODO: make this configurable with a 'deterministic' flag
  // If we only use one block, then each core operates on a different column,
  // hence the summation becomes deterministic.
  // However, we only use e.g. 512 cores out of possibly 3000+, so this will be
@ -1355,7 +1428,7 @@ __global__ void gGRUFastForward(T* out,
  for(int bid = 0; bid < rows; bid += gridDim.x) {
    int j = bid + blockIdx.x;
    if(j < rows) {
-      T m = !mask || mask[j];
+      float m = !mask || mask[j];
      T* rowOut = out + j * cols;
      const T* rowState = state + j * cols;

@ -1365,21 +1438,21 @@ __global__ void gGRUFastForward(T* out,
      for(int tid = 0; tid < cols; tid += blockDim.x) {
        int i = tid + threadIdx.x;
        if(i < cols) {
-          T r = functional::Ops<T>::sigmoid(xWrow[i] + sUrow[i] + b[i]);
+          float r = functional::Ops<float>::sigmoid((float)xWrow[i] + (float)sUrow[i] + (float)b[i]);

          int k = i + cols;

-          T z = functional::Ops<T>::sigmoid(xWrow[k] + sUrow[k] + b[k]);
+          float z = functional::Ops<float>::sigmoid((float)xWrow[k] + (float)sUrow[k] + (float)b[k]);

          int l = i + 2 * cols;
-          T h;
+          float h;
          if(final)
-            h = functional::Ops<T>::tanh(xWrow[l] + (sUrow[l] + b[l]) * r);
+            h = functional::Ops<float>::tanh((float)xWrow[l] + ((float)sUrow[l] + (float)b[l]) * r);
          else
-            h = functional::Ops<T>::tanh(xWrow[l] + sUrow[l] * r + b[l]);
+            h = functional::Ops<float>::tanh((float)xWrow[l] + (float)sUrow[l] * r + (float)b[l]);

-          T out = ((T)1.f - z) * h + z * rowState[i];
-          rowOut[i] = m * out + ((T)1.f - m) * rowState[i];
+          float out = (1.f - z) * h + z * (float)rowState[i];
+          rowOut[i] = (T)(m * out + (1.f - m) * (float)rowState[i]);
        }
      }
    }
@ -1441,7 +1514,7 @@ __global__ void gGRUFastBackward(T* outState,
  for(int bid = 0; bid < rows; bid += gridDim.x) {
    int j = bid + blockIdx.x;
    if(j < rows) {
-      T m = !mask || mask[j];
+      float m = !mask || mask[j];

      T* rowOutState = outState + j * cols;
      T* rowOutXW = outXW + j * cols * 3;
@ -1459,56 +1532,56 @@ __global__ void gGRUFastBackward(T* outState,
          int k = i + cols;
          int l = i + 2 * cols;

-          T r = functional::Ops<T>::sigmoid(rowXW[i] + rowSU[i] + b[i]);
-          T z = functional::Ops<T>::sigmoid(rowXW[k] + rowSU[k] + b[k]);
+          float r = functional::Ops<float>::sigmoid((float)rowXW[i] + (float)rowSU[i] + (float)b[i]);
+          float z = functional::Ops<float>::sigmoid((float)rowXW[k] + (float)rowSU[k] + (float)b[k]);

-          T h;
+          float h;
          if(final)
-            h = functional::Ops<T>::tanh(rowXW[l] + (rowSU[l] + b[l]) * r);
+            h = functional::Ops<float>::tanh((float)rowXW[l] + ((float)rowSU[l] + (float)b[l]) * r);
          else
-            h = functional::Ops<T>::tanh(rowXW[l] + rowSU[l] * r + b[l]);
+            h = functional::Ops<float>::tanh((float)rowXW[l] + (float)rowSU[l] * r + (float)b[l]);

-          T adj = rowAdj[i];
+          float adj = rowAdj[i];

-          T t = ((T)1.f - z) * ((T)1.f - h * h);
+          float t = (1.f - z) * (1.f - h * h);

          // df/ds
          if(outState)
-            rowOutState[i] += (m * z - m + (T)1.f) * adj;
+            rowOutState[i] += (T)((m * z - m + 1.f) * adj);

          // df/d(xW_r) ...
-          T dfdxW_r = m * r * ((T)1.f - r) * t * adj;
+          float dfdxW_r = m * r * (1.f - r) * t * adj;
          if(final)
-            dfdxW_r *= rowSU[l] + b[l];
+            dfdxW_r *= (float)rowSU[l] + (float)b[l];
          else
-            dfdxW_r *= rowSU[l];
+            dfdxW_r *= (float)rowSU[l];
          if(outXW)
-            rowOutXW[i] += dfdxW_r;
+            rowOutXW[i] += (T)dfdxW_r;
          if(outSU)
-            rowOutSU[i] += dfdxW_r;
+            rowOutSU[i] += (T)dfdxW_r;
          if(outB)
-            rowOutB[i] += dfdxW_r;
+            rowOutB[i] += (T)dfdxW_r;

          // df/d(xW_z) ...
-          T dfdxW_z = m * ((T)1.f - z) * z * (rowState[i] - h) * adj;
+          float dfdxW_z = m * (1.f - z) * z * ((float)rowState[i] - h) * adj;
          if(outXW)
-            rowOutXW[k] += dfdxW_z;
+            rowOutXW[k] += (T)dfdxW_z;
          if(outSU)
-            rowOutSU[k] += dfdxW_z;
+            rowOutSU[k] += (T)dfdxW_z;
          if(outB)
-            rowOutB[k] += dfdxW_z;
+            rowOutB[k] += (T)dfdxW_z;

          // df/d(xW_x) ...
-          T dfdxW_x = m * t * adj;
+          float dfdxW_x = m * t * adj;
          if(outXW)
-            rowOutXW[l] += dfdxW_x;
+            rowOutXW[l] += (T)dfdxW_x;
          if(outSU)
-            rowOutSU[l] += dfdxW_x * r;
+            rowOutSU[l] += (T)(dfdxW_x * r);
          if(outB)
            if(final)
-              rowOutB[l] += dfdxW_x * r;
+              rowOutB[l] += (T)(dfdxW_x * r);
            else
-              rowOutB[l] += dfdxW_x;
+              rowOutB[l] += (T)dfdxW_x;
        }
      }
    }
--- a/src/tensors/tensor_operators.h
+++ b/src/tensors/tensor_operators.h
@ -41,6 +41,25 @@ DISPATCH2(CopyCast, marian::Tensor, const marian::Tensor);
 DISPATCH2(AddCast, marian::Tensor, const marian::Tensor);
 DISPATCH4(IsNaN, const Tensor, Ptr<Allocator>, bool&, bool&);

+#ifdef CUDA_FOUND
+namespace gpu {
+bool SanitizeGradient(marian::Tensor in, Ptr<Allocator> allocator, bool pruneNaN, bool clipInf);
+}
+#endif
+
+namespace cpu {
+bool SanitizeGradient(marian::Tensor in, Ptr<Allocator> allocator, bool pruneNaN, bool clipInf);
+}
+
+static inline bool SanitizeGradient(marian::Tensor in, Ptr<Allocator> allocator, bool pruneNaN, bool clipInf) {
+#ifdef CUDA_FOUND
+  if(in->getBackend()->getDeviceId().type == DeviceType::gpu)
+    return gpu::SanitizeGradient(in, allocator, pruneNaN, clipInf);
+  else
+#endif
+    return cpu::SanitizeGradient(in, allocator, pruneNaN, clipInf);
+}
+
 template <class Functor, class... Tensors>
 void Element(Functor functor, marian::Tensor out, Tensors... tensors) {
 #ifdef CUDA_FOUND
--- a/src/training/graph_group.cpp
+++ b/src/training/graph_group.cpp
@ -10,25 +10,19 @@ GraphGroup::GraphGroup(Ptr<Options> options, Ptr<IMPIWrapper> mpi)
    mbRoundUp_(options_->get<bool>("mini-batch-round-up", true)) {
  if(options_->hasAndNotEmpty("cost-scaling")) {
    auto vcs = options_->get<std::vector<std::string>>("cost-scaling");
-    costScale_ = true;
-    float costExponent = std::stof(vcs[0]);
-    costScaleFactor_ = std::pow(2.0f, costExponent);
-    
-    if(vcs.size() > 1) costScaleFreq_ = std::stoul(vcs[1]);
-    if(vcs.size() > 2) costScaleMultiplier_ = std::stof(vcs[2]);
-    if(vcs.size() > 3) costScaleNanTolerance_ = std::stof(vcs[3]);
-    if(vcs.size() > 4) costScaleNanRange_ = std::stoul(vcs[4]);
-    if(vcs.size() > 5) costScaleFactorMinimum_ = std::stof(vcs[5]);
+
+    costScaling_                                 = true;
+    costScalingFactor_                           = std::stof( vcs[0]);
+    if(vcs.size() > 1) costScalingFreq_          = std::stoul(vcs[1]);
+    if(vcs.size() > 2) costScalingMultiplier_    = std::stof( vcs[2]);
+    if(vcs.size() > 3) costScalingFactorMinimum_ = std::stof( vcs[3]);
    
    LOG_ONCE(info,
-             "Training with cost scaling - factor: 2^{} = {}, frequency: {}, multiplier: {}, tolerance: {}, range: {}, minimum: {}",
-             costExponent,
-             costScaleFactor_,
-             costScaleFreq_,
-             costScaleMultiplier_,
-             costScaleNanTolerance_,
-             costScaleNanRange_,
-             costScaleFactorMinimum_);
+             "Training with cost scaling - factor: {}, frequency: {}, multiplier: {}, minimum: {}",
+             costScalingFactor_,
+             costScalingFreq_,
+             costScalingMultiplier_,
+             costScalingFactorMinimum_);
  }

  if(options_->hasAndNotEmpty("dynamic-gradient-scaling")) {
@ -96,21 +90,17 @@ void GraphGroup::initGraphsAndOpts() {
 // given number of iterations. Usually we increase by 2 which adds
 // one more bit for precision.
 void GraphGroup::increaseCostScaleFactor() {
-  if(!costScale_)
+  if(!costScaling_)
    return;

  noNanSeen_++;

  size_t total = nanSeen_ + noNanSeen_;
-  float nanPercent = noNanSeen_ == (float)nanSeen_ / (float)total; // total is at least 1 because of noNanSeen_++

-  if(noNanSeen_ % costScaleFreq_ == 0) {
-    costScaleFactor_ *= costScaleMultiplier_;
-    LOG(debug,
-        "NaN/Inf percentage {:.2f} after {} gradient updates. Increasing cost-scaling factor to {}",
-        nanPercent,
-        total,
-        costScaleFactor_);
+  if(noNanSeen_ % costScalingFreq_ == 0) {
+    costScalingFactor_ *= costScalingMultiplier_;
+    if(isMainProcess())
+      LOG(debug, "No NaN/Inf after {} gradient updates. Increasing cost-scaling factor to {}", total, costScalingFactor_);

    // Resetting counts after cost-scale change
    noNanSeen_ = 0;
@ -120,48 +110,56 @@ void GraphGroup::increaseCostScaleFactor() {

 // call when a NaN was seen to decrease cost-scaling factor
 void GraphGroup::decreaseCostScaleFactor() {
-  if(!costScale_)
+  if(!costScaling_)
    return;

  nanSeen_++;
  
  size_t total = nanSeen_ + noNanSeen_;
-  float nanPercent = (float)nanSeen_ / (float)total; // total is at least 1 because of nanSeen_++
-  if(total >= costScaleNanRange_ && nanPercent > costScaleNanTolerance_) {
-    if(costScaleFactor_ > costScaleFactorMinimum_) {
-      costScaleFactor_ /= costScaleMultiplier_;
-      LOG(debug,
-          "NaN/Inf percentage {:.2f} in {} gradient updates, reducing cost-scaling factor to {}",
-          nanPercent,
-          total,
-          costScaleFactor_);
-    } else {
-      // @TODO: think if should this rather abort?
-      LOG(warn,
-          "NaN/Inf percentage {:.2f} in {} gradient updates, but cost-scaling factor {} is already at minimum",
-          nanPercent,
-          total,
-          costScaleFactor_);
-    }

-    // Resetting counts after cost-scale change
-    noNanSeen_ = 0;
-    nanSeen_ = 0;
+  // do not reduce cost-scaling factor below minimum
+  if(costScalingFactor_ > costScalingFactorMinimum_)
+    costScalingFactor_ /= costScalingMultiplier_;
+
+  if(isMainProcess()) {
+    if(costScalingFactor_ > costScalingFactorMinimum_)
+      LOG(debug, "Seen NaN/Inf after {} gradient updates. Reduced cost-scaling factor to {}", total, costScalingFactor_);
+    else
+      LOG(debug, "Seen NaN/Inf after {} gradient updates, Reduced cost-scaling factor to minimum {}. Pruning NaNs now.", total, costScalingFactor_);
  }
+
+  // Resetting counts after cost-scale change
+  noNanSeen_ = 0;
+  nanSeen_ = 0;
 }

 float GraphGroup::checkNanOrNorm(size_t i, size_t begin, size_t end) {
  auto curGrad = graphs_[i]->params()->grads()->subtensor(begin, end-begin);
  
-  if(checkGradientNan_ || costScale_) {
-    bool hasNan = false, hasInf = false;
-    IsNaN(curGrad, graphs_[i]->allocator(), hasNan, hasInf); // @TODO: make safe with different compiler options
-    if(hasNan || hasInf) {
-      LOG(debug, "Found Nan ({}) or Inf ({})", hasNan, hasInf);
+  // If costScaling_ then check for NaN values if the costScalingFactor_ is larger than
+  // the minimum. If a NaN value is seen we exit here and will reduce the factor next and 
+  // this skips an update. 
+  // If costScalingFactor_ is already at the minimum, prune the NaN values away. This replaces 
+  // NaNs with 0. Updates are not skipped any more.
+  // Regardless of NaNs, we clip +/-inf to the largest corresponding values for the gradient value type.
+  // This changes the gradient but seems to be quite stable. In effect, for fp16 this is equivalent 
+  // to gradient clipping at (65504.f / costScalingFactor_) which in most cases is still large. 
+  if(costScaling_ || checkGradientNan_) {
+    bool pruneNaN = !checkGradientNan_ && costScalingFactor_ == costScalingFactorMinimum_;
+    bool clipInf  = !checkGradientNan_;
+    bool saneGradient = SanitizeGradient(curGrad, graphs_[i]->allocator(), pruneNaN, clipInf);
+
+    // This should never happen, if it does, something is wrong with the kernel above and needs to be fixed.
+    ABORT_IF(pruneNaN && clipInf && !saneGradient, "We are removing NaNs and clipping Infs, but gradient is still not sane??");
+
+    if(!saneGradient) {
+      LOG(debug, "Found NaN");
      return std::numeric_limits<float>::quiet_NaN();
    }
  }
-  
+
+  // The optional clipping above will affect the norm here. The norm can be non-finite despite the above
+  // gradient sanitization, hence check again and propagate a NaN.
  if(dynamicGradientScaling_) {
    auto gNorm = L2Norm(curGrad, graphs_[i]->allocator());
    if(isFinite(gNorm) && gNorm > 0.0)
@ -197,8 +195,8 @@ float GraphGroup::executeAndCollectNorm(const std::function<float(size_t, size_t
 float GraphGroup::computeNormalizationFactor(float gNorm, size_t updateTrgWords) {
  float normalizationFactor = 1.f;

-  if(costScale_)
-    normalizationFactor *= costScaleFactor_;
+  if(costScaling_)
+    normalizationFactor *= costScalingFactor_;

  if(options_->get<bool>("normalize-gradient"))
    normalizationFactor *= updateTrgWords;
@ -207,9 +205,9 @@ float GraphGroup::computeNormalizationFactor(float gNorm, size_t updateTrgWords)
    return normalizationFactor;
  
  if(dynamicGradientScaling_) {
-    // make gradient norm invariant to changes in costScaleFactor_, luckily norm(c * g) = c * norm(g)
-    if(costScale_)
-      gNorm = gNorm / costScaleFactor_;
+    // make gradient norm invariant to changes in costScalingFactor_, luckily norm(c * g) = c * norm(g)
+    if(costScaling_)
+      gNorm = gNorm / costScalingFactor_;
    
    // Normalize gradient norm w.r.t. number of labels in batch for statistics, 
    // there should be no gradient normalization before this point, @TODO: check this
@ -288,9 +286,7 @@ void GraphGroup::load(const OptimizerBase::ScatterStateFunc& scatterFn) {
      restoreFromCheckpoint(modelFileName, scatterFn);
    } else if(options_->hasAndNotEmpty("pretrained-model")) {
      std::string nameInit = options_->get<std::string>("pretrained-model");
-      LOG(info,
-          "[training] Initializing model weights with pre-trained model {}",
-          nameInit);
+      LOG(info, "[training] Initializing model weights with pre-trained model {}", nameInit);

      size_t i = 0;
      for(auto graph : graphs_)
--- a/src/training/graph_group.h
+++ b/src/training/graph_group.h
@ -60,22 +60,21 @@ protected:
  double typicalTrgBatchWords_{0}; // for dynamic batch sizing: typical batch size in words
  bool mbRoundUp_{true}; // round up batches for more efficient training but can make batch size less stable, disable with --mini-batch-round-up=false

-  bool costScale_{false};
-  float costScaleFactor_{1.f}; // @TODO, add current costScaleFactor_ to trainingState for serialization
-  size_t costScaleFreq_{2000};
-  float costScaleMultiplier_{2.f};
-  float costScaleNanTolerance_{0.f};
-  size_t costScaleNanRange_{1};
-  float costScaleFactorMinimum_{1.f}; // @TODO make this configureable
+  bool costScaling_{false};
+  float costScalingFactor_{1.f}; // @TODO, add current costScalingFactor_ to trainingState for serialization
+  size_t costScalingFreq_{2000};
+  float costScalingMultiplier_{2.f};
+  float costScalingFactorMinimum_{1.f};
+
  size_t noNanSeen_{0}; // @TODO, add current noNanSeen_ to trainingState for serialization
  size_t nanSeen_{0};

+  bool checkGradientNan_{false};
+
  bool dynamicGradientScaling_{false};
  float dynamicGradientScalingFactor_{2.f};
  bool dynamicGradientScalingUseLogs_{false};

-  bool checkGradientNan_{false};
-
  // determines the number of input streams (i.e. input files or fields in the TSV input) that need
  // to be included in the batch, i.e. without alignments and weights
  size_t numberOfInputFiles();
--- a/src/training/graph_group_async.cpp
+++ b/src/training/graph_group_async.cpp
@ -143,13 +143,13 @@ void AsyncGraphGroup::execute(Ptr<data::Batch> batch) {
    thread_local Tensor accGradients;
    thread_local Ptr<TensorAllocator> accAlloc;

-    ABORT_IF(costScale_ ,"Cost-scaling not implemented for AsyncSGD");
+    ABORT_IF(costScaling_ ,"Cost-scaling not implemented for AsyncSGD");

    auto graph = graphs_[tid];
    Ptr<RationalLoss> dynamicLoss = models_[tid]->build(graph, batch);
-    if(costScaleFactor_ != 1.f) {
+    if(costScalingFactor_ != 1.f) {
      // it's ok to go out of scope, this will still insert the new top node into the graph
-      auto costNode = dynamicLoss->loss() * costScaleFactor_;
+      auto costNode = dynamicLoss->loss() * costScalingFactor_;
    }

    if(t % optimizerDelay_ == 0) {
--- a/src/training/graph_group_singleton.cpp
+++ b/src/training/graph_group_singleton.cpp
@ -16,16 +16,16 @@ void SingletonGraph::execute(Ptr<data::Batch> batch) {
  auto opt   = optimizerShards_[0];

  auto lossNode = model->build(graph, batch);
-  if(costScaleFactor_ != 1.f) {
+  if(costScalingFactor_ != 1.f) {
    // for fp16 training, it's ok to go out of scope, we do not use the scaled version for anything
-    auto scaledLoss = lossNode->loss() * costScaleFactor_;
+    auto scaledLoss = lossNode->loss() * costScalingFactor_;
  }

  graph->forward();
  graph->backward();

  bool noNanOrInf = true;
-  if(costScale_) {
+  if(costScaling_) {
    // Are there NaNs in the gradient?
    bool hasNan = false, hasInf = false;
    IsNaN(graph->params()->grads(), graph->allocator(), hasNan, hasInf);
@ -39,7 +39,7 @@ void SingletonGraph::execute(Ptr<data::Batch> batch) {
    opt->update(graph->params()->vals(),
                graph->params()->grads(),
                batch->wordsTrg(),
-                costScaleFactor_);
+                costScalingFactor_);

  if(scheduler_) {
    scheduler_->update(*lossNode, batch);
--- a/src/training/graph_group_sync.cpp
+++ b/src/training/graph_group_sync.cpp
@ -252,8 +252,8 @@ void SyncGraphGroup::update(std::vector<Ptr<data::Batch>> subBatches, size_t num

      { // let loss go out of scope, frees memory
        auto rationalLoss = models_[localDeviceIndex]->build(graph, subBatch);
-        if(costScaleFactor_ != 1.f)
-          rationalLoss->loss() * costScaleFactor_;
+        if(costScalingFactor_ != 1.f)
+          rationalLoss->loss() * costScalingFactor_;
        graph->forward();

        localDeviceLosses[localDeviceIndex] += *rationalLoss;
@ -262,7 +262,7 @@ void SyncGraphGroup::update(std::vector<Ptr<data::Batch>> subBatches, size_t num
      graph->backward(/*zero=*/false); // (gradients are reset before we get here)
    }

-#if 1 
+#if 0 // @TODO: this can probably be removed now, keep around until confirmed.
    // experimental and should eventually be somewhere else
    // Handle local gradient explosion but only clip to largest possible value
    // given number of GPUs and type. Should clip rarely. Also clips inf
@ -284,7 +284,7 @@ void SyncGraphGroup::update(std::vector<Ptr<data::Batch>> subBatches, size_t num
  comm_->scatterReduceAndResetGrads(); // reduce gradients across all devices (globally) into shards
  
  float gradNorm = 0.f; 
-  if(costScale_ || dynamicGradientScaling_ || checkGradientNan_) {
+  if(costScaling_ || dynamicGradientScaling_ || checkGradientNan_) {
    // Wrapping member function
    auto checkNanOrNorm = [&](size_t i, size_t begin, size_t end) { 
      return GraphGroup::checkNanOrNorm(i, begin, end);