From 16bfa0c913959f44c65deb495ae8dbecd175d85f Mon Sep 17 00:00:00 2001
From: Marcin Junczys-Dowmunt
Date: Wed, 16 Mar 2022 14:44:17 +0000
Subject: [PATCH] Merged PR 23094: Adapt --cost-scaling to more stable setting

This PR sets the default parameters for cost-scaling to 8.f 10000 1.f 8.f,
i.e. when scaling, scale by 8 and do not try to automatically scale up or
down. This seems more stable than the variable cost-scaling with larger
numbers that was the default before.
---
 CHANGELOG.md                 | 1 +
 VERSION                      | 2 +-
 src/common/aliases.cpp       | 2 +-
 src/common/config_parser.cpp | 4 ++--
 4 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5cb28e12..1d2b4338 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -15,6 +15,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 - Scripts using PyYAML now use `safe_load`; see https://msg.pyyaml.org/load
 
 ### Changed
+- Set default parameters for cost-scaling to 8.f 10000 1.f 8.f, i.e. when scaling, scale by 8 and do not try to automatically scale up or down. This seems the most stable setting.
 - Make guided-alignment faster via sparse memory layout, add alignment points for EOS, remove losses other than ce.
 - Changed minimal C++ standard to C++-17
 - Faster LSH top-k search on CPU
diff --git a/VERSION b/VERSION
index f5f1545d..62e1a502 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-v1.11.4
+v1.11.5
diff --git a/src/common/aliases.cpp b/src/common/aliases.cpp
index b38ccc64..3db31e51 100644
--- a/src/common/aliases.cpp
+++ b/src/common/aliases.cpp
@@ -32,7 +32,7 @@ void ConfigParser::addAliases(cli::CLIWrapper& cli) {
     if(mode_ == cli::mode::training) {
       config["precision"] = std::vector<std::string>({"float16", "float32"}); // inference type, optimization type, save type
       // scaling factor, frequency, multiplier at increase, minimum scaling factor
-      config["cost-scaling"] = std::vector<std::string>({"256.f", "1000", "2.f", "256.f"});
+      config["cost-scaling"] = std::vector<std::string>({"8.f", "10000", "1.f", "8.f"});
     } else {
       config["precision"] = std::vector<std::string>({"float16"}); // for inference we do not need the other types
     }
diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp
index 0d956495..e3ac2108 100644
--- a/src/common/config_parser.cpp
+++ b/src/common/config_parser.cpp
@@ -534,7 +534,7 @@ void ConfigParser::addOptionsTraining(cli::CLIWrapper& cli) {
   // mixed precision training
   cli.add<bool>("--fp16",
       "Shortcut for mixed precision training with float16 and cost-scaling, "
-      "corresponds to: --precision float16 float32 --cost-scaling 256.f 1000 2.f 256.f");
+      "corresponds to: --precision float16 float32 --cost-scaling 8.f 10000 1.f 8.f");
   cli.add<std::vector<std::string>>("--precision",
       "Mixed precision training for forward/backward pass and optimization. "
       "Defines types for: forward/backward pass, optimization.",
@@ -542,7 +542,7 @@ void ConfigParser::addOptionsTraining(cli::CLIWrapper& cli) {
   cli.add<std::vector<std::string>>("--cost-scaling",
       "Dynamic cost scaling for mixed precision training: "
       "scaling factor, frequency, multiplier, minimum factor")
-      ->implicit_val("256.f 1000 2.f 256.f");
+      ->implicit_val("8.f 10000 1.f 8.f");
   cli.add<size_t>("--gradient-norm-average-window",
       "Window size over which the exponential average of the gradient norm is recorded (for logging and scaling). "
       "After this many updates about 90% of the mass of the exponential average comes from these updates",
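
Note on the four --cost-scaling values (scaling factor, frequency, multiplier at
increase, minimum scaling factor): the sketch below is a minimal, self-contained
illustration of how a standard dynamic loss scaler driven by such parameters
behaves; the CostScaler struct and its update() hook are hypothetical names for
illustration, not Marian's actual implementation.

    // Illustrative sketch only; not Marian's actual cost-scaling code.
    #include <algorithm>
    #include <cstdio>

    struct CostScaler {
      float scale;        // current cost-scaling factor      (new default: 8.f)
      int frequency;      // clean updates before an increase (new default: 10000)
      float multiplier;   // factor applied at an increase    (new default: 1.f)
      float minScale;     // lower bound when backing off     (new default: 8.f)
      int cleanUpdates = 0;

      // Called once per training update; `overflow` reports whether the
      // scaled gradients contained inf/nan.
      void update(bool overflow) {
        if(overflow) {
          // Back off, but never below the minimum scaling factor.
          scale = std::max(scale / 2.f, minScale);
          cleanUpdates = 0;
        } else if(++cleanUpdates >= frequency) {
          // After `frequency` overflow-free updates, attempt to scale up.
          scale *= multiplier; // multiplier == 1.f keeps the scale fixed
          cleanUpdates = 0;
        }
      }
    };

    int main() {
      CostScaler scaler{8.f, 10000, 1.f, 8.f}; // the defaults set by this PR
      for(int step = 0; step < 30000; ++step)
        scaler.update(/*overflow=*/false);
      std::printf("scale after 30000 clean updates: %.1f\n", scaler.scale); // 8.0
      return 0;
    }

With the new defaults the multiplier is 1 and the floor equals the starting
factor, so the scale stays pinned at 8; that is the "do not try to
automatically scale up or down" behavior described in the commit message.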