From 16bfa0c913959f44c65deb495ae8dbecd175d85f Mon Sep 17 00:00:00 2001
From: Marcin Junczys-Dowmunt
Date: Wed, 16 Mar 2022 14:44:17 +0000
Subject: [PATCH] Merged PR 23094: Adapt --cost-scaling to more stable setting

This PR sets the default parameters for cost-scaling to 8.f 10000 1.f 8.f,
i.e. when scaling, scale by 8 and do not try to automatically scale up or
down. This seems more stable than the variable cost-scaling with larger
numbers that was the default before.
---
 CHANGELOG.md                 | 1 +
 VERSION                      | 2 +-
 src/common/aliases.cpp       | 2 +-
 src/common/config_parser.cpp | 4 ++--
 4 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5cb28e12..1d2b4338 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -15,6 +15,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 - Scripts using PyYAML now use `safe_load`; see https://msg.pyyaml.org/load
 
 ### Changed
+- Set default parameters for cost-scaling to 8.f 10000 1.f 8.f, i.e. when scaling, scale by 8 and do not try to automatically scale up or down. This seems the most stable setting.
 - Make guided-alignment faster via sparse memory layout, add alignment points for EOS, remove losses other than ce.
 - Changed minimal C++ standard to C++-17
 - Faster LSH top-k search on CPU
diff --git a/VERSION b/VERSION
index f5f1545d..62e1a502 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-v1.11.4
+v1.11.5
diff --git a/src/common/aliases.cpp b/src/common/aliases.cpp
index b38ccc64..3db31e51 100644
--- a/src/common/aliases.cpp
+++ b/src/common/aliases.cpp
@@ -32,7 +32,7 @@ void ConfigParser::addAliases(cli::CLIWrapper& cli) {
     if(mode_ == cli::mode::training) {
       config["precision"] = std::vector<std::string>({"float16", "float32"}); // inference type, optimization type, save type
       // scaling factor, frequency, multiplier at increase, minimum scaling factor
-      config["cost-scaling"] = std::vector<std::string>({"256.f", "1000", "2.f", "256.f"});
+      config["cost-scaling"] = std::vector<std::string>({"8.f", "10000", "1.f", "8.f"});
     } else {
       config["precision"] = std::vector<std::string>({"float16"}); // for inference we do not need the other types
     }
diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp
index 0d956495..e3ac2108 100644
--- a/src/common/config_parser.cpp
+++ b/src/common/config_parser.cpp
@@ -534,7 +534,7 @@ void ConfigParser::addOptionsTraining(cli::CLIWrapper& cli) {
   // mixed precision training
   cli.add<bool>("--fp16",
       "Shortcut for mixed precision training with float16 and cost-scaling, "
-      "corresponds to: --precision float16 float32 --cost-scaling 256.f 1000 2.f 256.f");
+      "corresponds to: --precision float16 float32 --cost-scaling 8.f 10000 1.f 8.f");
   cli.add<std::vector<std::string>>("--precision",
       "Mixed precision training for forward/backward pass and optimization. "
       "Defines types for: forward/backward pass, optimization.",
@@ -542,7 +542,7 @@ void ConfigParser::addOptionsTraining(cli::CLIWrapper& cli) {
   cli.add<std::vector<std::string>>("--cost-scaling",
       "Dynamic cost scaling for mixed precision training: "
       "scaling factor, frequency, multiplier, minimum factor")
-      ->implicit_val("256.f 1000 2.f 256.f");
+      ->implicit_val("8.f 10000 1.f 8.f");
   cli.add<size_t>("--gradient-norm-average-window",
       "Window size over which the exponential average of the gradient norm is recorded (for logging and scaling). "
       "After this many updates about 90% of the mass of the exponential average comes from these updates",
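
Note on the four --cost-scaling values (scaling factor, frequency, multiplier at
increase, minimum scaling factor): the sketch below is a minimal, self-contained
illustration of how a standard dynamic loss scaler driven by such parameters
behaves; the CostScaler struct and its update() hook are hypothetical names for
illustration, not Marian's actual implementation.

    // Illustrative sketch only; not Marian's actual cost-scaling code.
    #include <algorithm>
    #include <cstdio>

    struct CostScaler {
      float scale;        // current cost-scaling factor      (new default: 8.f)
      int frequency;      // clean updates before an increase (new default: 10000)
      float multiplier;   // factor applied at an increase    (new default: 1.f)
      float minScale;     // lower bound when backing off     (new default: 8.f)
      int cleanUpdates = 0;

      // Called once per training update; `overflow` reports whether the
      // scaled gradients contained inf/nan.
      void update(bool overflow) {
        if(overflow) {
          // Back off, but never below the minimum scaling factor.
          scale = std::max(scale / 2.f, minScale);
          cleanUpdates = 0;
        } else if(++cleanUpdates >= frequency) {
          // After `frequency` overflow-free updates, attempt to scale up.
          scale *= multiplier; // multiplier == 1.f keeps the scale fixed
          cleanUpdates = 0;
        }
      }
    };

    int main() {
      CostScaler scaler{8.f, 10000, 1.f, 8.f}; // the defaults set by this PR
      for(int step = 0; step < 30000; ++step)
        scaler.update(/*overflow=*/false);
      std::printf("scale after 30000 clean updates: %.1f\n", scaler.scale); // 8.0
      return 0;
    }

With the new defaults the multiplier is 1 and the floor equals the starting
factor, so the scale stays pinned at 8; that is the "do not try to
automatically scale up or down" behavior described in the commit message.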