From 8470c16bdd92797e75e7d5e7397ef7b29896538a Mon Sep 17 00:00:00 2001 From: Roman Grundkiewicz Date: Thu, 16 Sep 2021 02:00:00 +0000 Subject: [PATCH 01/11] Merged PR 20230: Add option for running regression tests only in Azure Pipelines This PR adds a checkbox which can be unchecked to skip running compilation checks when triggering them manually. It is useful for generating expected outputs on different CPUs for tests using 8-bit models. --- azure-pipelines.yml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 4f7ce02d..d4d0b2e5 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -6,6 +6,13 @@ # 3. Choose "Existing Azure Pipelines YAML file" and specify path to this file # 4. "More actions" > "Save" +parameters: +# Allow skipping the entire 'Build' stage +- name: runBuilds + displayName: Run builds? Uncheck to run regression tests only. + type: boolean + default: true + # The pipeline CI trigger is set on the branch master only and PR trigger on a # (non-draft) pull request to any branch trigger: @@ -45,6 +52,7 @@ stages: ###################################################################### - job: BuildWindows + condition: eq(${{ parameters.runBuilds }}, true) displayName: Windows strategy: @@ -180,6 +188,7 @@ stages: ###################################################################### - job: BuildUbuntu + condition: eq(${{ parameters.runBuilds }}, true) displayName: Ubuntu timeoutInMinutes: 90 @@ -322,6 +331,7 @@ stages: ###################################################################### - job: BuildUbuntuMinimal + condition: eq(${{ parameters.runBuilds }}, true) displayName: Ubuntu CPU+GPU gcc-5 cmake 3.5 pool: @@ -368,6 +378,7 @@ stages: ###################################################################### - job: BuildMacOS + condition: eq(${{ parameters.runBuilds }}, true) displayName: macOS CPU clang pool: @@ -416,6 +427,7 @@ stages: ###################################################################### - job: BuildInstall + condition: eq(${{ parameters.runBuilds }}, true) displayName: Linux CPU library install pool: From aa58ba8e239d228d539734e6be8266fbb3181044 Mon Sep 17 00:00:00 2001 From: Roman Grundkiewicz Date: Mon, 20 Sep 2021 13:14:24 +0000 Subject: [PATCH 02/11] Merged PR 20593: Fix and update Azure pipelines - Add `--allow-unauthenticated` to `apt` when installing CUDA on Ubuntu - Removing `ubuntu-16.04` image from Azure pipelines, which will become unavailable after September 20 --- azure-pipelines.yml | 26 +++++++------------------- scripts/ci/install_cuda_ubuntu.sh | 2 +- 2 files changed, 8 insertions(+), 20 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index d4d0b2e5..7953b282 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -246,17 +246,7 @@ stages: examples: true static: true ################################################################ - # Ubuntu 16.04 supports CUDA 8+ - "16.04 CUDA 9.2 gcc-7": - image: ubuntu-16.04 - boost: true - cpu: true - gpu: true - cuda: 9.2 - gcc: 7 - unit_tests: true - examples: true - static: false + # Ubuntu 16.04 is no longer available on Azure-hosted machines pool: vmImage: $(image) @@ -332,18 +322,16 @@ stages: ###################################################################### - job: BuildUbuntuMinimal condition: eq(${{ parameters.runBuilds }}, true) - displayName: Ubuntu CPU+GPU gcc-5 cmake 3.5 + displayName: Ubuntu CPU+GPU gcc-7 cmake 3.5 pool: - vmImage: ubuntu-16.04 + vmImage: ubuntu-18.04 steps: - checkout: self 
submodules: true # The script simplifies installation of different versions of CUDA. - # Ubuntu 16.04 on Azure-hosted VMs have GCC 5.5 as gcc-5, which is not compatible with CUDA 9. - # Downgrading to GCC 5.4 (the default gcc on Ubuntu 16.04) would be more work... - bash: ./scripts/ci/install_cuda_ubuntu.sh "10.0" displayName: Install CUDA @@ -356,10 +344,10 @@ stages: # GCC 5 is the minimum version supported - bash: | - /usr/bin/gcc-5 --version + /usr/bin/gcc-7 --version mkdir -p build cd build - CC=/usr/bin/gcc-5 CXX=/usr/bin/g++-5 CUDAHOSTCXX=/usr/bin/g++-5 \ + CC=/usr/bin/gcc-7 CXX=/usr/bin/g++-7 CUDAHOSTCXX=/usr/bin/g++-7 \ ../cmake-3.5.1-Linux-x86_64/bin/cmake .. \ -DCOMPILE_CPU=on \ -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-10.0 @@ -592,7 +580,7 @@ stages: # Avoid using $(Build.SourcesDirectory) in bash tasks because on Windows pools it uses '\' # instead of '/', which often breaks the job - - bash: MARIAN=../marian-dev/build bash ./run_mrt.sh '#cpu' '#basics' + - bash: MARIAN=../marian-dev/build TIMEOUT=10m bash ./run_mrt.sh '#cpu' '#basics' '#devops' continueOnError: true displayName: Run tests workingDirectory: marian-prod-tests @@ -689,7 +677,7 @@ stages: AWS_SECRET_SAS_TOKEN: $(blob-sas-token) workingDirectory: marian-prod-tests - - bash: MARIAN=../marian-dev/build bash ./run_mrt.sh '#cpu' '#basics' + - bash: MARIAN=../marian-dev/build bash ./run_mrt.sh '#cpu' '#basics' '#devops' continueOnError: true displayName: Run tests workingDirectory: marian-prod-tests diff --git a/scripts/ci/install_cuda_ubuntu.sh b/scripts/ci/install_cuda_ubuntu.sh index 8dc77eda..b058294a 100755 --- a/scripts/ci/install_cuda_ubuntu.sh +++ b/scripts/ci/install_cuda_ubuntu.sh @@ -91,7 +91,7 @@ sudo add-apt-repository "deb ${REPO_URL} /" sudo apt-get update echo "Installing CUDA packages ${CUDA_PACKAGES}" -sudo apt-get -y install ${CUDA_PACKAGES} +sudo apt-get -y --allow-unauthenticated install ${CUDA_PACKAGES} if [[ $? -ne 0 ]]; then echo "CUDA Installation Error." From d796a3c3b7779993660e672f2a47f5cdd685a174 Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Tue, 28 Sep 2021 17:17:12 +0000 Subject: [PATCH 03/11] Merged PR 20839: Do not ignore ignoreEOS for spm decoding With final space this eliminates trailing whitespace caused by appending EOS --- src/data/sentencepiece_vocab.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/data/sentencepiece_vocab.cpp b/src/data/sentencepiece_vocab.cpp index 090d478b..8f774c2b 100644 --- a/src/data/sentencepiece_vocab.cpp +++ b/src/data/sentencepiece_vocab.cpp @@ -236,18 +236,20 @@ public: return words; } - std::string decode(const Words& sentence, bool /*ignoreEOS*/) const override { + std::string decode(const Words& sentence, bool ignoreEOS) const override { std::string line; if(keepEncoded_) { // i.e. 
keep the sentence segmented into subword units for(const Word& id : sentence) - line += (*this)[id] + " "; + if(!ignoreEOS || id != getEosId()) + line += (*this)[id] + " "; line.pop_back(); // trim the trailing whitespace } else { // convert vector of Word to vector of int std::vector spmSentence; spmSentence.reserve(sentence.size()); for(auto&& word : sentence) - spmSentence.push_back(word.toWordIndex()); + if(!ignoreEOS || word != getEosId()) + spmSentence.push_back(word.toWordIndex()); spm_->Decode(spmSentence, &line); } return line; From 03fe1758763c99dd55bcf6c1c5e0e1dd60ae4e1a Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Tue, 28 Sep 2021 17:19:07 +0000 Subject: [PATCH 04/11] Merged PR 20879: Adjustable ffn width and depth in transformer decoder --- src/common/config_parser.cpp | 8 +++++++- src/models/encoder_decoder.cpp | 2 ++ src/models/transformer.h | 21 +++++++++++++++------ 3 files changed, 24 insertions(+), 7 deletions(-) diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp index d7818afb..b3e8950b 100644 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -255,10 +255,16 @@ void ConfigParser::addOptionsModel(cli::CLIWrapper& cli) { "Pool encoder states instead of using cross attention (selects first encoder state, best used with special token)"); cli.add("--transformer-dim-ffn", "Size of position-wise feed-forward network (transformer)", - 2048); + 2048); + cli.add("--transformer-decoder-dim-ffn", + "Size of position-wise feed-forward network in decoder (transformer). Uses --transformer-dim-ffn if 0.", + 0); cli.add("--transformer-ffn-depth", "Depth of filters (transformer)", 2); + cli.add("--transformer-decoder-ffn-depth", + "Depth of filters in decoder (transformer). Uses --transformer-ffn-depth if 0", + 0); cli.add("--transformer-ffn-activation", "Activation between filters: swish or relu (transformer)", "swish"); diff --git a/src/models/encoder_decoder.cpp b/src/models/encoder_decoder.cpp index 8fc9321a..a7a398e7 100644 --- a/src/models/encoder_decoder.cpp +++ b/src/models/encoder_decoder.cpp @@ -38,7 +38,9 @@ EncoderDecoder::EncoderDecoder(Ptr graph, Ptr options) modelFeatures_.insert("transformer-heads"); modelFeatures_.insert("transformer-no-projection"); modelFeatures_.insert("transformer-dim-ffn"); + modelFeatures_.insert("transformer-decoder-dim-ffn"); modelFeatures_.insert("transformer-ffn-depth"); + modelFeatures_.insert("transformer-decoder-ffn-depth"); modelFeatures_.insert("transformer-ffn-activation"); modelFeatures_.insert("transformer-dim-aan"); modelFeatures_.insert("transformer-aan-depth"); diff --git a/src/models/transformer.h b/src/models/transformer.h index a792de8b..2393ad73 100644 --- a/src/models/transformer.h +++ b/src/models/transformer.h @@ -400,7 +400,7 @@ public: opt("transformer-heads"), /*cache=*/false); } - Expr LayerFFN(std::string prefix, Expr input) const { + Expr LayerFFN(std::string prefix, Expr input, bool isDecoder=false) const { int dimModel = input->shape()[-1]; float dropProb = inference_ ? 0 : opt("transformer-dropout"); @@ -408,13 +408,22 @@ public: auto output = preProcess(prefix + "_ffn", opsPre, input, dropProb); auto actName = opt("transformer-ffn-activation"); + int dimFfn = opt("transformer-dim-ffn"); int depthFfn = opt("transformer-ffn-depth"); - float ffnDropProb - = inference_ ? 
0 : opt("transformer-dropout-ffn"); - + if(isDecoder) { + int decDimFfn = opt("transformer-decoder-dim-ffn", 0); + if(decDimFfn != 0) + dimFfn = decDimFfn; + + int decDepthFfn = opt("transformer-decoder-ffn-depth", 0); + if(decDepthFfn != 0) + depthFfn = decDepthFfn; + } + ABORT_IF(depthFfn < 1, "Filter depth {} is smaller than 1", depthFfn); - + + float ffnDropProb = inference_ ? 0 : opt("transformer-dropout-ffn"); auto initFn = inits::glorotUniform(true, true, depthScaling_ ? 1.f / sqrtf((float)depth_) : 1.f); // the stack of FF layers @@ -861,7 +870,7 @@ public: // remember decoder state decoderStates.push_back(decoderState); - query = LayerFFN(prefix_ + "_l" + layerNo + "_ffn", query); // [-4: beam depth=1, -3: batch size, -2: max length, -1: vector dim] + query = LayerFFN(prefix_ + "_l" + layerNo + "_ffn", query, /*isDecoder=*/true); // [-4: beam depth=1, -3: batch size, -2: max length, -1: vector dim] checkpoint(query); } From 2d79ad02bb66d7e0ba264defbf5ff9b47c70ba74 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Wed, 13 Oct 2021 20:20:14 +0000 Subject: [PATCH 05/11] Merged PR 20933: beam & batch works for n on-factored models --- src/layers/output.cpp | 22 ++++++++++++++++------ src/translator/beam_search.cpp | 5 +++-- src/translator/nth_element.cpp | 2 ++ 3 files changed, 21 insertions(+), 8 deletions(-) diff --git a/src/layers/output.cpp b/src/layers/output.cpp index 92cccdfb..af72b794 100644 --- a/src/layers/output.cpp +++ b/src/layers/output.cpp @@ -313,14 +313,24 @@ Logits Output::applyAsLogits(Expr input) /*override final*/ { } return Logits(std::move(allLogits), factoredVocab_); } else if(shortlist_) { - return Logits(affineOrDot(input, - shortlist_->getCachedShortWt(), - shortlist_->getCachedShortb(), + const Shape &inputShape = input->shape(); + assert(inputShape[1] == 1); // time dimension always 1 for decoding + input = reshape(input, {inputShape[0], inputShape[2], 1, inputShape[3]}); + + Expr Wt = shortlist_->getCachedShortWt(); + Expr b = shortlist_->getCachedShortb(); + Expr ret = affineShortlist(input, + Wt, + b, false, - /*transB=*/isLegacyUntransposedW ? false : true)); + /*transB=*/isLegacyUntransposedW ? false : true); + const Shape &retShape = ret->shape(); + assert(retShape[2] == 1); // time dimension always 1 for decoding + ret = reshape(ret, {retShape[0], 1, retShape[1], retShape[3]}); + return Logits(ret); } else { - return Logits( - affineOrDot(input, Wt_, b_, false, /*transB=*/isLegacyUntransposedW ? false : true)); + Expr ret = affineOrDot(input, Wt_, b_, false, /*transB=*/isLegacyUntransposedW ? false : true); + return Logits(ret); } } diff --git a/src/translator/beam_search.cpp b/src/translator/beam_search.cpp index 2a0d3947..580895f2 100644 --- a/src/translator/beam_search.cpp +++ b/src/translator/beam_search.cpp @@ -94,7 +94,7 @@ Beams BeamSearch::toHyps(const std::vector& nBestKeys, // [current // For factored decoding, the word is built over multiple decoding steps, // starting with the lemma, then adding factors one by one. if (factorGroup == 0) { - word = factoredVocab->lemma2Word(shortlist ? shortlist->reverseMap((int) prevBeamHypIdx, (int) currentBatchIdx, wordIdx) : wordIdx); // @BUGBUG: reverseMap is only correct if factoredVocab_->getGroupRange(0).first == 0 + word = factoredVocab->lemma2Word(shortlist ? 
         std::vector factorIndices;  factoredVocab->word2factors(word, factorIndices);
         //LOG(info, "{} + {} ({}) -> {} -> {}",
         //    factoredVocab->decode(prevHyp->tracebackWords()),
@@ -115,7 +115,7 @@ Beams BeamSearch::toHyps(const std::vector& nBestKeys, // [current
         }
       }
       else if (shortlist)
-        word = Word::fromWordIndex(shortlist->reverseMap((int) prevBeamHypIdx, (int) origBatchIdx, wordIdx));
+        word = Word::fromWordIndex(shortlist->reverseMap((int) prevBeamHypIdx, (int) currentBatchIdx, wordIdx));
       else
         word = Word::fromWordIndex(wordIdx);

@@ -330,6 +330,7 @@ Histories BeamSearch::search(Ptr graph, Ptr
   auto prevBatchIdxMap = batchIdxMap; // [origBatchIdx -> currentBatchIdx] but shifted by one time step
   // main loop over output time steps
   for (size_t t = 0; ; t++) {
+    //std::cerr << "\nstep=" << t << std::endl;
     ABORT_IF(origDimBatch != beams.size(), "Lost a batch entry??");
     // determine beam size for next output time step, as max over still-active sentences
     // E.g. if all batch entries are down from beam 5 to no more than 4 surviving hyps, then
diff --git a/src/translator/nth_element.cpp b/src/translator/nth_element.cpp
index 237d9b9d..dbcceec4 100644
--- a/src/translator/nth_element.cpp
+++ b/src/translator/nth_element.cpp
@@ -3,7 +3,9 @@
  * SPDX-License-Identifier: MIT
  */

+#include "common/utils.h"
 #include "translator/nth_element.h"
+
 #include 
 #include 
 #include 

From 7f06f3c5d2035dac0cb4349bf29fbfa3e6bb5448 Mon Sep 17 00:00:00 2001
From: Roman Grundkiewicz
Date: Tue, 26 Oct 2021 11:20:41 +0000
Subject: [PATCH 06/11] Merged PR 21166: Keep building on macOS-10.15

Marian does not compile on macOS 11.6, so the build has stopped working due to
an upgrade from macOS-10.15 to macOS 11.6 in Azure Pipelines:
https://github.com/actions/virtual-environments/issues/4060

This PR explicitly sets macOS 10.15 in the workflow.
---
 azure-pipelines.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 7953b282..bc76f85c 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -370,7 +370,7 @@ stages:
   displayName: macOS CPU clang

   pool:
-    vmImage: macos-latest
+    vmImage: macos-10.15

   steps:
   - checkout: self

From 1404201926b5b4e27993776d52dfac809e8556f4 Mon Sep 17 00:00:00 2001
From: Marcin Junczys-Dowmunt
Date: Tue, 26 Oct 2021 20:25:39 +0000
Subject: [PATCH 07/11] Merged PR 21151: Cleaning up fp16 behavior

This PR improves clipping and pruning behavior of NaNs and Infs during fp16
training, ultimately avoiding the underflow problems that we were facing so far.
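As a rough sketch of the resulting behavior (the values are the new defaults
wired into the `fp16` alias; the command line itself is only illustrative):

    # --fp16 now expands to:
    marian --precision float16 float32 --cost-scaling 256.f 1000 2.f 256.f

    # i.e. start with a scaling factor of 256, multiply it by 2 after every
    # 1000 NaN/Inf-free updates, and divide it by 2 when a NaN/Inf is seen,
    # but never go below 256. Infinities in the gradient are clipped to
    # +/-65504 (the largest finite fp16 value), and once the factor sits at
    # the minimum, NaNs are pruned to 0 instead of skipping the update.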
---
 src/common/aliases.cpp                 |   4 +-
 src/common/config_parser.cpp           |   6 +-
 src/common/definitions.h               |  10 +-
 src/models/transformer.h               |  15 ++-
 src/tensors/cpu/tensor_operators.cpp   |   4 +
 src/tensors/gpu/element.cu             |  12 +-
 src/tensors/gpu/tensor_operators.cu    | 147 ++++++++++++++++++-------
 src/tensors/tensor_operators.h         |  19 ++++
 src/training/graph_group.cpp           | 118 ++++++++++----------
 src/training/graph_group.h             |  17 ++-
 src/training/graph_group_async.cpp     |   6 +-
 src/training/graph_group_singleton.cpp |   8 +-
 src/training/graph_group_sync.cpp      |   8 +-
 13 files changed, 233 insertions(+), 141 deletions(-)

diff --git a/src/common/aliases.cpp b/src/common/aliases.cpp
index 0be26a8c..99574fe1 100644
--- a/src/common/aliases.cpp
+++ b/src/common/aliases.cpp
@@ -29,8 +29,8 @@ void ConfigParser::addAliases(cli::CLIWrapper& cli) {
   cli.alias("fp16", "true", [&](YAML::Node& config) {
     if(mode_ == cli::mode::training) {
       config["precision"] = std::vector({"float16", "float32"}); // inference type, optimization type, save type
-      // scaling factor (power of 2), frequency, multiplier at increase, tolerance, range, minium factor
-      config["cost-scaling"] = std::vector({"0", "1000", "2", "0.05", "10", "1e-5"});
+      // scaling factor, frequency, multiplier at increase, minimum scaling factor
+      config["cost-scaling"] = std::vector({"256.f", "1000", "2.f", "256.f"});
     } else {
       config["precision"] = std::vector({"float16"}); // for inference we do not need the other types
     }
diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp
index b3e8950b..51764cdc 100644
--- a/src/common/config_parser.cpp
+++ b/src/common/config_parser.cpp
@@ -522,15 +522,15 @@ void ConfigParser::addOptionsTraining(cli::CLIWrapper& cli) {
   // mixed precision training
   cli.add("--fp16",
       "Shortcut for mixed precision training with float16 and cost-scaling, "
-      "corresponds to: --precision float16 float32 --cost-scaling 0 1000 2 0.05 10 1e-5f");
+      "corresponds to: --precision float16 float32 --cost-scaling 256.f 1000 2.f 256.f");
   cli.add>("--precision",
      "Mixed precision training for forward/backward pass and optimizaton. "
      "Defines types for: forward/backward pass, optimization.",
      {"float32", "float32"});
   cli.add>("--cost-scaling",
      "Dynamic cost scaling for mixed precision training: "
-     "power of 2, scaling window, scaling factor, tolerance, range, minimum factor")
-    ->implicit_val("0.f 1000 2.f 0.05f 10 1e-5f");
+     "scaling factor, frequency, multiplier, minimum factor")
+    ->implicit_val("256.f 1000 2.f 256.f");
   cli.add("--gradient-norm-average-window",
      "Window size over which the exponential average of the gradient norm is recorded (for logging and scaling). "
      "After this many updates about 90% of the mass of the exponential average comes from these updates",
diff --git a/src/common/definitions.h b/src/common/definitions.h
index d2cf8aa4..d8a3ad46 100644
--- a/src/common/definitions.h
+++ b/src/common/definitions.h
@@ -106,24 +106,24 @@ using Weak = std::weak_ptr;
 /** @brief Creates shared_ptr of any type, passes all arguments to any available
  * constructor */
 template 
-Ptr New(Args&&... args) {
-  return Ptr(new T(std::forward(args)...));
+inline Ptr New(Args&&... args) {
+  return std::make_shared(std::forward(args)...);
 }

 template 
-Ptr New(Ptr p) {
+inline Ptr New(Ptr p) {
   return Ptr(p);
 }

 /** @brief Creates InstrusivePtr of any type, passes all arguments to any available
  * constructor */
 template 
-IPtr INew(Args&&... args) {
+inline IPtr INew(Args&&... 
args) { return IPtr(new T(std::forward(args)...)); } template -IPtr INew(Ptr p) { +inline IPtr INew(Ptr p) { return IPtr(p); } diff --git a/src/models/transformer.h b/src/models/transformer.h index 2393ad73..b2c0f6be 100644 --- a/src/models/transformer.h +++ b/src/models/transformer.h @@ -147,8 +147,7 @@ public: int dimDepth = dimModel / dimHeads; - auto output - = reshape(input, {dimBatch * dimBeam, dimSteps, dimHeads, dimDepth}); + auto output = reshape(input, {dimBatch * dimBeam, dimSteps, dimHeads, dimDepth}); return transpose(output, {0, 2, 1, 3}); // [dimBatch*dimBeam, dimHeads, dimSteps, dimDepth] } @@ -361,9 +360,9 @@ public: Expr LayerAttention(std::string prefix, Expr input, // [-4: beam depth, -3: batch size, -2: max length, -1: vector dim] - const Expr& keys, // [-4: beam depth=1, -3: batch size, -2: max length, -1: vector dim] - const Expr& values, // ...? - const Expr& mask, // [-4: batch size, -3: num heads broadcast=1, -2: max length broadcast=1, -1: max length] + Expr keys, // [-4: beam depth=1, -3: batch size, -2: max length, -1: vector dim] + Expr values, // ...? + Expr mask, // [-4: batch size, -3: num heads broadcast=1, -2: max length broadcast=1, -1: max length] int dimHeads, bool cache = false, bool saveAttentionWeights = false) { @@ -373,6 +372,12 @@ public: auto opsPre = opt("transformer-preprocess"); auto output = preProcess(prefix + "_Wo", opsPre, input, dropProb); + // fixes missing norm for keys and values in self-attention with pre-norm + if(input == keys) + keys = output; + if(input == values) + values = output; + // multi-head self-attention over previous input output = MultiHead(prefix, dimModel, dimHeads, output, keys, values, mask, cache, saveAttentionWeights); diff --git a/src/tensors/cpu/tensor_operators.cpp b/src/tensors/cpu/tensor_operators.cpp index 1afb8f64..f3964f91 100755 --- a/src/tensors/cpu/tensor_operators.cpp +++ b/src/tensors/cpu/tensor_operators.cpp @@ -24,6 +24,10 @@ void IsNaN(const Tensor /*in*/, Ptr /*allocator*/, bool& /*isNaN*/, b ABORT("Not implemented"); } +bool SanitizeGradient(marian::Tensor /*in*/, Ptr /*allocator*/, bool /*pruneNaN*/, bool /*clipInf*/) { + ABORT("Not implemented"); +} + template void CopyCastTo(To* out, const From* in, int length) { for(int i = 0; i < length; ++i) diff --git a/src/tensors/gpu/element.cu b/src/tensors/gpu/element.cu index 6790efd4..e9cbe081 100755 --- a/src/tensors/gpu/element.cu +++ b/src/tensors/gpu/element.cu @@ -29,7 +29,9 @@ __global__ void gElement( indices[i] = tensors[i].shape().bindex(dims); } - tensors[0].data()[index] = functional::apply(functor, tensors, indices); + // This performs the internal application of the functor in float32 regardless of the input type. + // It seems there are no speed penalties but improved precision. + tensors[0].data()[index] = (T)functional::applyWithCast(functor, tensors, indices); } } } @@ -65,13 +67,7 @@ void Element(Functor functor, Tensor out, Tensors... 
tensors) { ElementTyped(functor, out, tensors...); } else if(out->type() == Type::float16) { #if COMPILE_FP16 - std::vector ts({out, tensors...}); - bool div2 = std::all_of(ts.cbegin(), ts.cend(), [](marian::Tensor t){ return t->shape()[-1] % 2 == 0; }); - if(div2) { - ElementTyped(functor, out, tensors...); - } else { - ElementTyped(functor, out, tensors...); - } + ElementTyped(functor, out, tensors...); #else ABORT("FP16 not supported with chosen current hardware or CUDA version"); #endif diff --git a/src/tensors/gpu/tensor_operators.cu b/src/tensors/gpu/tensor_operators.cu index d55214bc..1347c3bb 100644 --- a/src/tensors/gpu/tensor_operators.cu +++ b/src/tensors/gpu/tensor_operators.cu @@ -16,15 +16,12 @@ namespace gpu { namespace atomics { static inline __device__ void atomicAdd(float *address, float val) { - //*address += val; ::atomicAdd(address, val); } #if COMPILE_FP16 // @TODO: copied from CuTorch, adapt this better, give credit. static inline __device__ void atomicAdd(half *address, half val) { - //*address += val; - #if __CUDA_ARCH__ >= 700 && CUDA_VERSION >= 10000 // compute capability 70 and higher with CUDA 10 ::atomicAdd(address, val); #else // __CUDA_ARCH__ < 700 @@ -50,7 +47,8 @@ static inline __device__ void atomicAdd(half *address, half val) { } while (assumed != old); #endif // __CUDA_ARCH__ } -#endif +#endif // COMPILE_FP16 + } @@ -96,6 +94,81 @@ void IsNaN(const Tensor in, Ptr allocator, bool& isNaN, bool& isInf) cudaStreamSynchronize(0); } +template +__global__ void gSanitizeGradient(T* in, int length, + bool* isNaN, bool* isInf, + bool pruneNaN, bool clipInf, + float forNaN = 0.f, float forInf = 65504.f, float forInfNeg = -65504.f) { + for(int bid = 0; bid < length; bid += blockDim.x * gridDim.x) { + int index = bid + blockDim.x * blockIdx.x + threadIdx.x; + if(index < length) { + float v = (float)in[index]; + // handle NaN + if(isnan(v)) { + if(pruneNaN) { + in[index] = (T)forNaN; + } else { + *isNaN = true; + } + } + // handle +/- Inf + if(isinf(v)) { + if(clipInf) { + in[index] = v > 0 ? (T)forInf : (T)forInfNeg; + } else { + *isInf = true; + } + } + } + } +} + +// This function is meant to clean gradients, i.e. clip infinities and prune NaNs if required. +// If all NaNs and Infs have been removed we return `true` for indicating a sane gradient. +// If `clipInf` is set, infinities are replaced with the maximum/minimum non-inf value for the tensor. +// In that case infinities do not result in a bad gradient, since they get clipped. +// If `pruneNaN` is set, NaNs are replaced with 0. Since NaNs get removed now they do not result +// in a bad gradient. +// If NaNs or infinities are detected but not removed (either because of `pruneNaN=false` or `clipInf=false`), +// we return `false` indicating a bad gradient. 
+bool SanitizeGradient(marian::Tensor in, Ptr allocator, bool pruneNaN, bool clipInf) { + cudaSetDevice(in->getDeviceId().no); + + int length = in->size(); + + int threads = std::min(MAX_THREADS, length); + int blocks = std::min(MAX_BLOCKS, length / threads + (length % threads != 0)); + + auto mem = allocator->alloc(2); + bool* dIsNaN = &mem->data()[0]; + bool* dIsInf = &mem->data()[1]; + fill(in->getBackend(), dIsNaN, dIsNaN + 2, false); + + float forNaN = 0.f; + float forInf = NumericLimits(in->type()).max; + float forInfNeg = NumericLimits(in->type()).lowest; + + if(in->type() == Type::float32) { + gSanitizeGradient<<>>(in->data(), length, dIsNaN, dIsInf, pruneNaN, clipInf, forNaN, forInf, forInfNeg); +#if COMPILE_FP16 + } else if(in->type() == Type::float16) { + gSanitizeGradient<<>>(in->data(), length, dIsNaN, dIsInf, pruneNaN, clipInf, forNaN, forInf, forInfNeg); +#endif + } else { + ABORT("gSanitizeGradient for type {} not implemented", in->type()); + } + + bool isNaN, isInf; + CudaCopy(dIsNaN, dIsNaN + 1, &isNaN); + CudaCopy(dIsInf, dIsInf + 1, &isInf); + + allocator->free(mem); + + cudaStreamSynchronize(0); + + return !isNaN && !isInf; +} + template __global__ void gCopyCastTo(To* out, const From* in, int length) { for(int bid = 0; bid < length; bid += blockDim.x * gridDim.x) { @@ -1090,7 +1163,7 @@ void PasteRows(Tensor out, size_t rowsToCopy = indices->size(); int threads = std::min(MAX_THREADS, (int)cols); -#if 1 // @TODO: make this configurable with a 'deterministic' flag +#if 0 // @TODO: make this configurable with a 'deterministic' flag // If we only use one block, then each core operates on a different column, // hence the summation becomes deterministic. // However, we only use e.g. 512 cores out of possibly 3000+, so this will be @@ -1355,7 +1428,7 @@ __global__ void gGRUFastForward(T* out, for(int bid = 0; bid < rows; bid += gridDim.x) { int j = bid + blockIdx.x; if(j < rows) { - T m = !mask || mask[j]; + float m = !mask || mask[j]; T* rowOut = out + j * cols; const T* rowState = state + j * cols; @@ -1365,21 +1438,21 @@ __global__ void gGRUFastForward(T* out, for(int tid = 0; tid < cols; tid += blockDim.x) { int i = tid + threadIdx.x; if(i < cols) { - T r = functional::Ops::sigmoid(xWrow[i] + sUrow[i] + b[i]); + float r = functional::Ops::sigmoid((float)xWrow[i] + (float)sUrow[i] + (float)b[i]); int k = i + cols; - T z = functional::Ops::sigmoid(xWrow[k] + sUrow[k] + b[k]); + float z = functional::Ops::sigmoid((float)xWrow[k] + (float)sUrow[k] + (float)b[k]); int l = i + 2 * cols; - T h; + float h; if(final) - h = functional::Ops::tanh(xWrow[l] + (sUrow[l] + b[l]) * r); + h = functional::Ops::tanh((float)xWrow[l] + ((float)sUrow[l] + (float)b[l]) * r); else - h = functional::Ops::tanh(xWrow[l] + sUrow[l] * r + b[l]); + h = functional::Ops::tanh((float)xWrow[l] + (float)sUrow[l] * r + (float)b[l]); - T out = ((T)1.f - z) * h + z * rowState[i]; - rowOut[i] = m * out + ((T)1.f - m) * rowState[i]; + float out = (1.f - z) * h + z * (float)rowState[i]; + rowOut[i] = (T)(m * out + (1.f - m) * (float)rowState[i]); } } } @@ -1441,7 +1514,7 @@ __global__ void gGRUFastBackward(T* outState, for(int bid = 0; bid < rows; bid += gridDim.x) { int j = bid + blockIdx.x; if(j < rows) { - T m = !mask || mask[j]; + float m = !mask || mask[j]; T* rowOutState = outState + j * cols; T* rowOutXW = outXW + j * cols * 3; @@ -1459,56 +1532,56 @@ __global__ void gGRUFastBackward(T* outState, int k = i + cols; int l = i + 2 * cols; - T r = functional::Ops::sigmoid(rowXW[i] + rowSU[i] + b[i]); - T z 
= functional::Ops::sigmoid(rowXW[k] + rowSU[k] + b[k]); + float r = functional::Ops::sigmoid((float)rowXW[i] + (float)rowSU[i] + (float)b[i]); + float z = functional::Ops::sigmoid((float)rowXW[k] + (float)rowSU[k] + (float)b[k]); - T h; + float h; if(final) - h = functional::Ops::tanh(rowXW[l] + (rowSU[l] + b[l]) * r); + h = functional::Ops::tanh((float)rowXW[l] + ((float)rowSU[l] + (float)b[l]) * r); else - h = functional::Ops::tanh(rowXW[l] + rowSU[l] * r + b[l]); + h = functional::Ops::tanh((float)rowXW[l] + (float)rowSU[l] * r + (float)b[l]); - T adj = rowAdj[i]; + float adj = rowAdj[i]; - T t = ((T)1.f - z) * ((T)1.f - h * h); + float t = (1.f - z) * (1.f - h * h); // df/ds if(outState) - rowOutState[i] += (m * z - m + (T)1.f) * adj; + rowOutState[i] += (T)((m * z - m + 1.f) * adj); // df/d(xW_r) ... - T dfdxW_r = m * r * ((T)1.f - r) * t * adj; + float dfdxW_r = m * r * (1.f - r) * t * adj; if(final) - dfdxW_r *= rowSU[l] + b[l]; + dfdxW_r *= (float)rowSU[l] + (float)b[l]; else - dfdxW_r *= rowSU[l]; + dfdxW_r *= (float)rowSU[l]; if(outXW) - rowOutXW[i] += dfdxW_r; + rowOutXW[i] += (T)dfdxW_r; if(outSU) - rowOutSU[i] += dfdxW_r; + rowOutSU[i] += (T)dfdxW_r; if(outB) - rowOutB[i] += dfdxW_r; + rowOutB[i] += (T)dfdxW_r; // df/d(xW_z) ... - T dfdxW_z = m * ((T)1.f - z) * z * (rowState[i] - h) * adj; + float dfdxW_z = m * (1.f - z) * z * ((float)rowState[i] - h) * adj; if(outXW) - rowOutXW[k] += dfdxW_z; + rowOutXW[k] += (T)dfdxW_z; if(outSU) - rowOutSU[k] += dfdxW_z; + rowOutSU[k] += (T)dfdxW_z; if(outB) - rowOutB[k] += dfdxW_z; + rowOutB[k] += (T)dfdxW_z; // df/d(xW_x) ... - T dfdxW_x = m * t * adj; + float dfdxW_x = m * t * adj; if(outXW) - rowOutXW[l] += dfdxW_x; + rowOutXW[l] += (T)dfdxW_x; if(outSU) - rowOutSU[l] += dfdxW_x * r; + rowOutSU[l] += (T)(dfdxW_x * r); if(outB) if(final) - rowOutB[l] += dfdxW_x * r; + rowOutB[l] += (T)(dfdxW_x * r); else - rowOutB[l] += dfdxW_x; + rowOutB[l] += (T)dfdxW_x; } } } diff --git a/src/tensors/tensor_operators.h b/src/tensors/tensor_operators.h index 6e587953..dc29bf35 100644 --- a/src/tensors/tensor_operators.h +++ b/src/tensors/tensor_operators.h @@ -41,6 +41,25 @@ DISPATCH2(CopyCast, marian::Tensor, const marian::Tensor); DISPATCH2(AddCast, marian::Tensor, const marian::Tensor); DISPATCH4(IsNaN, const Tensor, Ptr, bool&, bool&); +#ifdef CUDA_FOUND +namespace gpu { +bool SanitizeGradient(marian::Tensor in, Ptr allocator, bool pruneNaN, bool clipInf); +} +#endif + +namespace cpu { +bool SanitizeGradient(marian::Tensor in, Ptr allocator, bool pruneNaN, bool clipInf); +} + +static inline bool SanitizeGradient(marian::Tensor in, Ptr allocator, bool pruneNaN, bool clipInf) { +#ifdef CUDA_FOUND + if(in->getBackend()->getDeviceId().type == DeviceType::gpu) + return gpu::SanitizeGradient(in, allocator, pruneNaN, clipInf); + else +#endif + return cpu::SanitizeGradient(in, allocator, pruneNaN, clipInf); +} + template void Element(Functor functor, marian::Tensor out, Tensors... 
tensors) { #ifdef CUDA_FOUND diff --git a/src/training/graph_group.cpp b/src/training/graph_group.cpp index e9c977b9..03e5acf4 100644 --- a/src/training/graph_group.cpp +++ b/src/training/graph_group.cpp @@ -10,25 +10,19 @@ GraphGroup::GraphGroup(Ptr options, Ptr mpi) mbRoundUp_(options_->get("mini-batch-round-up", true)) { if(options_->hasAndNotEmpty("cost-scaling")) { auto vcs = options_->get>("cost-scaling"); - costScale_ = true; - float costExponent = std::stof(vcs[0]); - costScaleFactor_ = std::pow(2.0f, costExponent); - - if(vcs.size() > 1) costScaleFreq_ = std::stoul(vcs[1]); - if(vcs.size() > 2) costScaleMultiplier_ = std::stof(vcs[2]); - if(vcs.size() > 3) costScaleNanTolerance_ = std::stof(vcs[3]); - if(vcs.size() > 4) costScaleNanRange_ = std::stoul(vcs[4]); - if(vcs.size() > 5) costScaleFactorMinimum_ = std::stof(vcs[5]); + + costScaling_ = true; + costScalingFactor_ = std::stof( vcs[0]); + if(vcs.size() > 1) costScalingFreq_ = std::stoul(vcs[1]); + if(vcs.size() > 2) costScalingMultiplier_ = std::stof( vcs[2]); + if(vcs.size() > 3) costScalingFactorMinimum_ = std::stof( vcs[3]); LOG_ONCE(info, - "Training with cost scaling - factor: 2^{} = {}, frequency: {}, multiplier: {}, tolerance: {}, range: {}, minimum: {}", - costExponent, - costScaleFactor_, - costScaleFreq_, - costScaleMultiplier_, - costScaleNanTolerance_, - costScaleNanRange_, - costScaleFactorMinimum_); + "Training with cost scaling - factor: {}, frequency: {}, multiplier: {}, minimum: {}", + costScalingFactor_, + costScalingFreq_, + costScalingMultiplier_, + costScalingFactorMinimum_); } if(options_->hasAndNotEmpty("dynamic-gradient-scaling")) { @@ -96,21 +90,17 @@ void GraphGroup::initGraphsAndOpts() { // given number of iterations. Usually we increase by 2 which adds // one more bit for precision. void GraphGroup::increaseCostScaleFactor() { - if(!costScale_) + if(!costScaling_) return; noNanSeen_++; size_t total = nanSeen_ + noNanSeen_; - float nanPercent = noNanSeen_ == (float)nanSeen_ / (float)total; // total is at least 1 because of noNanSeen_++ - if(noNanSeen_ % costScaleFreq_ == 0) { - costScaleFactor_ *= costScaleMultiplier_; - LOG(debug, - "NaN/Inf percentage {:.2f} after {} gradient updates. Increasing cost-scaling factor to {}", - nanPercent, - total, - costScaleFactor_); + if(noNanSeen_ % costScalingFreq_ == 0) { + costScalingFactor_ *= costScalingMultiplier_; + if(isMainProcess()) + LOG(debug, "No NaN/Inf after {} gradient updates. Increasing cost-scaling factor to {}", total, costScalingFactor_); // Resetting counts after cost-scale change noNanSeen_ = 0; @@ -120,48 +110,56 @@ void GraphGroup::increaseCostScaleFactor() { // call when a NaN was seen to decrease cost-scaling factor void GraphGroup::decreaseCostScaleFactor() { - if(!costScale_) + if(!costScaling_) return; nanSeen_++; size_t total = nanSeen_ + noNanSeen_; - float nanPercent = (float)nanSeen_ / (float)total; // total is at least 1 because of nanSeen_++ - if(total >= costScaleNanRange_ && nanPercent > costScaleNanTolerance_) { - if(costScaleFactor_ > costScaleFactorMinimum_) { - costScaleFactor_ /= costScaleMultiplier_; - LOG(debug, - "NaN/Inf percentage {:.2f} in {} gradient updates, reducing cost-scaling factor to {}", - nanPercent, - total, - costScaleFactor_); - } else { - // @TODO: think if should this rather abort? 
-      LOG(warn,
-          "NaN/Inf percentage {:.2f} in {} gradient updates, but cost-scaling factor {} is already at minimum",
-          nanPercent,
-          total,
-          costScaleFactor_);
-    }
-    // Resetting counts after cost-scale change
-    noNanSeen_ = 0;
-    nanSeen_ = 0;
+  // do not reduce cost-scaling factor below minimum
+  if(costScalingFactor_ > costScalingFactorMinimum_)
+    costScalingFactor_ /= costScalingMultiplier_;
+
+  if(isMainProcess()) {
+    if(costScalingFactor_ > costScalingFactorMinimum_)
+      LOG(debug, "Seen NaN/Inf after {} gradient updates. Reduced cost-scaling factor to {}", total, costScalingFactor_);
+    else
+      LOG(debug, "Seen NaN/Inf after {} gradient updates. Reduced cost-scaling factor to minimum {}. Pruning NaNs now.", total, costScalingFactor_);
   }
+
+  // Resetting counts after cost-scale change
+  noNanSeen_ = 0;
+  nanSeen_ = 0;
 }
 
 float GraphGroup::checkNanOrNorm(size_t i, size_t begin, size_t end) {
   auto curGrad = graphs_[i]->params()->grads()->subtensor(begin, end-begin);
 
-  if(checkGradientNan_ || costScale_) {
-    bool hasNan = false, hasInf = false;
-    IsNaN(curGrad, graphs_[i]->allocator(), hasNan, hasInf); // @TODO: make safe with different compiler options
-    if(hasNan || hasInf) {
-      LOG(debug, "Found Nan ({}) or Inf ({})", hasNan, hasInf);
+  // If costScaling_ is enabled, check for NaN values if the costScalingFactor_ is larger than
+  // the minimum. If a NaN value is seen, we exit here; the factor is reduced next and this
+  // update is skipped.
+  // If costScalingFactor_ is already at the minimum, prune the NaN values away. This replaces
+  // NaNs with 0. Updates are not skipped any more.
+  // Regardless of NaNs, we clip +/-inf to the largest corresponding values for the gradient value type.
+  // This changes the gradient but seems to be quite stable. In effect, for fp16 this is equivalent
+  // to gradient clipping at (65504.f / costScalingFactor_) which in most cases is still large.
+  if(costScaling_ || checkGradientNan_) {
+    bool pruneNaN = !checkGradientNan_ && costScalingFactor_ == costScalingFactorMinimum_;
+    bool clipInf = !checkGradientNan_;
+    bool saneGradient = SanitizeGradient(curGrad, graphs_[i]->allocator(), pruneNaN, clipInf);
+
+    // This should never happen; if it does, something is wrong with the kernel above and needs to be fixed.
+    ABORT_IF(pruneNaN && clipInf && !saneGradient, "We are removing NaNs and clipping Infs, but gradient is still not sane??");
+
+    if(!saneGradient) {
+      LOG(debug, "Found NaN");
       return std::numeric_limits::quiet_NaN();
     }
   }
-
+
+  // The optional clipping above will affect the norm here. The norm can be non-finite despite the above
+  // gradient sanitization, hence check again and propagate a NaN.
   if(dynamicGradientScaling_) {
     auto gNorm = L2Norm(curGrad, graphs_[i]->allocator());
     if(isFinite(gNorm) && gNorm > 0.0)
@@ -197,8 +195,8 @@ float GraphGroup::executeAndCollectNorm(const std::functionget("normalize-gradient"))
     normalizationFactor *= updateTrgWords;
 
@@ -207,9 +205,9 @@ float GraphGroup::computeNormalizationFactor(float gNorm, size_t updateTrgWords)
     return normalizationFactor;
 
   if(dynamicGradientScaling_) {
-    // make gradient norm invariant to changes in costScaleFactor_, luckily norm(c * g) = c * norm(g)
-    if(costScale_)
-      gNorm = gNorm / costScaleFactor_;
+    // make gradient norm invariant to changes in costScalingFactor_, luckily norm(c * g) = c * norm(g)
+    if(costScaling_)
+      gNorm = gNorm / costScalingFactor_;
 
     // Normalize gradient norm w.r.t. 
number of labels in batch for statistics, // there should be no gradient normalization before this point, @TODO: check this @@ -288,9 +286,7 @@ void GraphGroup::load(const OptimizerBase::ScatterStateFunc& scatterFn) { restoreFromCheckpoint(modelFileName, scatterFn); } else if(options_->hasAndNotEmpty("pretrained-model")) { std::string nameInit = options_->get("pretrained-model"); - LOG(info, - "[training] Initializing model weights with pre-trained model {}", - nameInit); + LOG(info, "[training] Initializing model weights with pre-trained model {}", nameInit); size_t i = 0; for(auto graph : graphs_) diff --git a/src/training/graph_group.h b/src/training/graph_group.h index 422990b1..b7f2f7ef 100644 --- a/src/training/graph_group.h +++ b/src/training/graph_group.h @@ -60,22 +60,21 @@ protected: double typicalTrgBatchWords_{0}; // for dynamic batch sizing: typical batch size in words bool mbRoundUp_{true}; // round up batches for more efficient training but can make batch size less stable, disable with --mini-batch-round-up=false - bool costScale_{false}; - float costScaleFactor_{1.f}; // @TODO, add current costScaleFactor_ to trainingState for serialization - size_t costScaleFreq_{2000}; - float costScaleMultiplier_{2.f}; - float costScaleNanTolerance_{0.f}; - size_t costScaleNanRange_{1}; - float costScaleFactorMinimum_{1.f}; // @TODO make this configureable + bool costScaling_{false}; + float costScalingFactor_{1.f}; // @TODO, add current costScalingFactor_ to trainingState for serialization + size_t costScalingFreq_{2000}; + float costScalingMultiplier_{2.f}; + float costScalingFactorMinimum_{1.f}; + size_t noNanSeen_{0}; // @TODO, add current noNanSeen_ to trainingState for serialization size_t nanSeen_{0}; + bool checkGradientNan_{false}; + bool dynamicGradientScaling_{false}; float dynamicGradientScalingFactor_{2.f}; bool dynamicGradientScalingUseLogs_{false}; - bool checkGradientNan_{false}; - // determines the number of input streams (i.e. input files or fields in the TSV input) that need // to be included in the batch, i.e. 
without alignments and weights size_t numberOfInputFiles(); diff --git a/src/training/graph_group_async.cpp b/src/training/graph_group_async.cpp index 72b06e48..f85f9cf8 100644 --- a/src/training/graph_group_async.cpp +++ b/src/training/graph_group_async.cpp @@ -143,13 +143,13 @@ void AsyncGraphGroup::execute(Ptr batch) { thread_local Tensor accGradients; thread_local Ptr accAlloc; - ABORT_IF(costScale_ ,"Cost-scaling not implemented for AsyncSGD"); + ABORT_IF(costScaling_ ,"Cost-scaling not implemented for AsyncSGD"); auto graph = graphs_[tid]; Ptr dynamicLoss = models_[tid]->build(graph, batch); - if(costScaleFactor_ != 1.f) { + if(costScalingFactor_ != 1.f) { // it's ok to go out of scope, this will still insert the new top node into the graph - auto costNode = dynamicLoss->loss() * costScaleFactor_; + auto costNode = dynamicLoss->loss() * costScalingFactor_; } if(t % optimizerDelay_ == 0) { diff --git a/src/training/graph_group_singleton.cpp b/src/training/graph_group_singleton.cpp index 7dc86137..16261070 100644 --- a/src/training/graph_group_singleton.cpp +++ b/src/training/graph_group_singleton.cpp @@ -16,16 +16,16 @@ void SingletonGraph::execute(Ptr batch) { auto opt = optimizerShards_[0]; auto lossNode = model->build(graph, batch); - if(costScaleFactor_ != 1.f) { + if(costScalingFactor_ != 1.f) { // for fp16 training, it's ok to go out of scope, we do not use the scaled version for anything - auto scaledLoss = lossNode->loss() * costScaleFactor_; + auto scaledLoss = lossNode->loss() * costScalingFactor_; } graph->forward(); graph->backward(); bool noNanOrInf = true; - if(costScale_) { + if(costScaling_) { // Are there NaNs in the gradient? bool hasNan = false, hasInf = false; IsNaN(graph->params()->grads(), graph->allocator(), hasNan, hasInf); @@ -39,7 +39,7 @@ void SingletonGraph::execute(Ptr batch) { opt->update(graph->params()->vals(), graph->params()->grads(), batch->wordsTrg(), - costScaleFactor_); + costScalingFactor_); if(scheduler_) { scheduler_->update(*lossNode, batch); diff --git a/src/training/graph_group_sync.cpp b/src/training/graph_group_sync.cpp index 8c06761e..c90a384e 100644 --- a/src/training/graph_group_sync.cpp +++ b/src/training/graph_group_sync.cpp @@ -252,8 +252,8 @@ void SyncGraphGroup::update(std::vector> subBatches, size_t num { // let loss go out of scope, frees memory auto rationalLoss = models_[localDeviceIndex]->build(graph, subBatch); - if(costScaleFactor_ != 1.f) - rationalLoss->loss() * costScaleFactor_; + if(costScalingFactor_ != 1.f) + rationalLoss->loss() * costScalingFactor_; graph->forward(); localDeviceLosses[localDeviceIndex] += *rationalLoss; @@ -262,7 +262,7 @@ void SyncGraphGroup::update(std::vector> subBatches, size_t num graph->backward(/*zero=*/false); // (gradients are reset before we get here) } -#if 1 +#if 0 // @TODO: this can probably be removed now, keep around until confirmed. // experimental and should eventually be somewhere else // Handle local gradient explosion but only clip to largest possible value // given number of GPUs and type. Should clip rarely. 
Also clips inf @@ -284,7 +284,7 @@ void SyncGraphGroup::update(std::vector> subBatches, size_t num comm_->scatterReduceAndResetGrads(); // reduce gradients across all devices (globally) into shards float gradNorm = 0.f; - if(costScale_ || dynamicGradientScaling_ || checkGradientNan_) { + if(costScaling_ || dynamicGradientScaling_ || checkGradientNan_) { // Wrapping member function auto checkNanOrNorm = [&](size_t i, size_t begin, size_t end) { return GraphGroup::checkNanOrNorm(i, begin, end); From c85d0608483789d446361ea28d95f7d7c9545f2d Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Mon, 22 Nov 2021 03:32:54 +0000 Subject: [PATCH 08/11] Merged PR 20729: Add top-k sampling This adds Top-K sampling to Marian and extends the --output-sampling option to take arguments --- regression-tests | 2 +- src/common/config_parser.cpp | 7 ++-- src/graph/expression_operators.cpp | 7 ++++ src/graph/expression_operators.h | 15 ++++++- src/graph/node_operators_binary.h | 61 ++++++++++++++++++++++++++- src/graph/node_operators_tuple.h | 2 +- src/models/costs.cpp | 35 ++++++++++++++++ src/models/costs.h | 32 +++++++++----- src/models/model_factory.cpp | 21 ++++++++-- src/tensors/cpu/tensor_operators.cpp | 9 +++- src/tensors/gpu/tensor_operators.cu | 63 +++++++++++++++------------- src/tensors/tensor_operators.h | 23 +++++++++- src/translator/translator.h | 2 +- 13 files changed, 226 insertions(+), 53 deletions(-) diff --git a/regression-tests b/regression-tests index 7d612ca5..0aa7b6b7 160000 --- a/regression-tests +++ b/regression-tests @@ -1 +1 @@ -Subproject commit 7d612ca5e4b27a76f92584dad76d240e34f216d0 +Subproject commit 0aa7b6b7632732d1f22f3d8169d3262a7e6b1e9d diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp index 51764cdc..59b328e9 100644 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -695,9 +695,10 @@ void ConfigParser::addOptionsTranslation(cli::CLIWrapper& cli) { "Use softmax shortlist: path first best prune"); cli.add>("--weights", "Scorer weights"); - cli.add("--output-sampling", - "Noise output layer with gumbel noise", - false); + cli.add>("--output-sampling", + "Noise output layer with gumbel noise. Implicit default is 'full' for sampling from full distribution. " + " Also accepts 'topk num' (e.g. topk 100) for top-100 sampling.") + ->implicit_val("full"); cli.add>("--output-approx-knn", "Use approximate knn search in output layer (currently only in transformer)") ->implicit_val("100 1024"); diff --git a/src/graph/expression_operators.cpp b/src/graph/expression_operators.cpp index 560ab4e7..b26c2ae0 100644 --- a/src/graph/expression_operators.cpp +++ b/src/graph/expression_operators.cpp @@ -357,6 +357,13 @@ Expr gather(Expr a, int axis, Expr indices) { return Expression(a, axis, indices); } +// scatter() -- scatter arbitrary elements along an axis; batched or non-batched +// This is the reverse operation to gather. +Expr scatter(Expr a, int axis, Expr indices, Expr source) { + return Expression(a, axis, indices, source); +} + + // index_select() -- gather arbitrary elements along an axis from an unbatched // input 'a'. Indices are specified as a 1D vector. // This is used e.g. for embedding lookup. 
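To make the gather/scatter pair introduced above concrete, a toy sketch of the
documented semantics (values are illustrative only; unindexed positions keep
their values from `a`, and `indices`/`source` share one shape):

    a       = [[0, 0],
               [0, 0]]
    indices = [[1], [0]]
    source  = [[5], [7]]

    scatter(a, /*axis=*/-1, indices, source) == [[0, 5],
                                                 [7, 0]]

    // and, as the inverse relationship suggests:
    // gather(scatter(a, -1, indices, source), -1, indices) == source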
diff --git a/src/graph/expression_operators.h b/src/graph/expression_operators.h
index e34ddc8a..d032e8d3 100644
--- a/src/graph/expression_operators.h
+++ b/src/graph/expression_operators.h
@@ -707,10 +707,23 @@ Expr stopGradient(Expr a);
  * @param indices The indices to be gathered
  * @returns Gathered expression with the same shape as @p indices
  * @note @p a and @p indices must have the same rank
- * @note The non-target axes of @p a and @p indicies must have the same size, or be broadcastable.
+ * @note The non-target axes of @p a and @p indices must have the same size, or be broadcastable.
  */
 Expr gather(Expr a, int axis, Expr indices);
 
+/**
+ * Scatter elements from source along an axis into a. Unindexed elements from a remain unchanged.
+ * This is the reverse operation to gather.
+ * @param a The input expression
+ * @param axis The axis along which to index
+ * @param indices The indices to be scattered
+ * @param source Expression with values to scatter.
+ * @returns Scattered expression with the same shape as @p a now containing values from @p source in positions @p indices
+ * @note @p source and @p indices must have the same rank
+ * @note In this version @p source and @p indices must have the same shape
+ */
+Expr scatter(Expr a, int axis, Expr indices, Expr source);
+
 #if 0 // reverse operation to gather. a is expression into with values from b are inserted and positions indices along axis.
 // with broadcasting
diff --git a/src/graph/node_operators_binary.h b/src/graph/node_operators_binary.h
index a180bb5c..b2a646b1 100644
--- a/src/graph/node_operators_binary.h
+++ b/src/graph/node_operators_binary.h
@@ -1033,12 +1033,14 @@ struct GatherNodeOp : public NaryNodeOp {
 
   NodeOps forwardOps() override {
     return {NodeOp(
+      // @TODO: rename to gather
       Select(val_, child(0)->val(), child(1)->val(), axis_))};
   }
 
   NodeOps backwardOps() override {
     return {NodeOp(
-      Insert(child(0)->grad(), adj_, child(1)->val(), axis_))};
+      // @TODO: rename to scatter
+      Insert</*add=*/true>(child(0)->grad(), adj_, child(1)->val(), axis_))};
   }
 
   Shape newShape(Expr a, int axis, Expr indices) {
     Shape shape = a->shape();
     axis = shape.axis(axis);
     auto rank = shape.size();
     ABORT_IF(rank != indices->shape().size(), "Mismatching ranks for input ({}) and indices ({})", std::string(shape), std::string(indices->shape()));
-    axis = a->shape().axis(axis);
     shape.set(axis, indices->shape()[axis]);
     for (size_t i = 0; i < rank; ++i) {
       if (i != axis) {
@@ -1086,6 +1087,62 @@
 private:
   int axis_;
 };
 
+struct ScatterNodeOp : public NaryNodeOp {
+  ScatterNodeOp(Expr a, int axis, Expr indices, Expr source)
+      : NaryNodeOp({a, indices, source}, newShape(a, axis, indices, source), a->value_type()),
+        axis_(a->shape().axis(axis)) {
+    matchOrAbort(indices->value_type());
+  }
+
+  NodeOps forwardOps() override {
+    return {NodeOp(
+      CopyCast(val_, child(0)->val()); // @TODO: use normal copy
+      Insert</*add=*/false>(val_, child(2)->val(), child(1)->val(), axis_)
+    )};
+  }
+
+  NodeOps backwardOps() override {
+    ABORT("backward for ScatterNodeOp not yet implemented");
+  }
+
+  Shape newShape(Expr a, int axis, Expr indices, Expr source) {
+    ABORT_IF(axis != -1, "only last dimensions");
+    ABORT_IF(indices->shape() != source->shape(), "Shapes must match");
+
+    Shape shape = a->shape();
+    // @TODO: do proper checking
+    return shape;
+  }
+
+  const std::string type() override { return "scatter"; }
+
+  const std::string color() override { return "orange"; }
+
+  virtual size_t hash() override {
+    if(!hash_) {
+      size_t seed = NaryNodeOp::hash();
+      util::hash_combine(seed, axis_);
+      hash_ = seed;
+    }
+    return hash_;
+  }
+
+  virtual bool equal(Expr node) override {
+    if(!NaryNodeOp::equal(node))
+      return false;
+    auto cnode = std::dynamic_pointer_cast(node);
+    if(!cnode)
+      return false;
+    if(axis_ != cnode->axis_)
+      return false;
+    return true;
+  }
+
+private:
+  friend class SerializationHelpers;
+  int axis_;
+};
+
 struct ColsNodeOp : public NaryNodeOp {
   ColsNodeOp(Expr a, Expr indices)
       : NaryNodeOp({a, indices}, newShape(a, indices), a->value_type()) {
diff --git a/src/graph/node_operators_tuple.h b/src/graph/node_operators_tuple.h
index c7a9531a..8acb1bc8 100644
--- a/src/graph/node_operators_tuple.h
+++ b/src/graph/node_operators_tuple.h
@@ -133,7 +133,7 @@ public:
   }
 
   void backward() override {
-    Insert(/*out*/child(0)->grad(), adj_, val_, axis_);
+    Insert</*add=*/true>(/*out*/child(0)->grad(), adj_, val_, axis_);
   }
 
   const std::string type() override { return "topk"; }
diff --git a/src/models/costs.cpp b/src/models/costs.cpp
index c688b211..4b15bcb3 100644
--- a/src/models/costs.cpp
+++ b/src/models/costs.cpp
@@ -10,5 +10,40 @@ Ptr LogSoftmaxStep::apply(Ptr state) {
   return state;
 }
 
+Ptr GumbelSoftmaxStep::apply(Ptr state) {
+  state->setLogProbs(state->getLogProbs().applyUnaryFunctions(
+      [](Expr logits) { // lemma gets gumbelled
+        return logsoftmax(logits + constant_like(logits, inits::gumbel()));
+      },
+      logsoftmax)); // factors don't
+  return state;
+}
+
+TopkGumbelSoftmaxStep::TopkGumbelSoftmaxStep(int k) : k_{k} {}
+
+Ptr TopkGumbelSoftmaxStep::apply(Ptr state) {
+  state->setLogProbs(state->getLogProbs().applyUnaryFunctions(
+      [=](Expr logits) { // lemma gets gumbelled
+        // create logits-sized tensor consisting only of invalid path scores
+        float invalidPathScore = NumericLimits(logits->value_type()).lowest;
+        Expr invalidLogits = constant_like(logits, inits::fromValue(invalidPathScore));
+
+        // select top-k values
+        Expr val, idx;
+        std::tie(val, idx) = topk(logits, k_, /*axis=*/-1, /*descending=*/true);
+
+        // uncomment below to display probability mass in top-k selection
+        // debug(sum(gather(softmax(logits), -1, idx), -1), "sum");
+
+        // Add Gumbel noise to top-k values only and compute logsoftmax, used for argmax sampling later in beam-search
+        Expr gumbelVal = logsoftmax(val + constant_like(val, inits::gumbel()));
+
+        // Scatter gumbelled values back into logits to fill with usable values
+        return scatter(invalidLogits, -1, idx, gumbelVal);
+      },
+      logsoftmax)); // factors don't
+  return state;
+}
+
 } // namespace models
 } // namespace marian
diff --git a/src/models/costs.h b/src/models/costs.h
index e5463bfd..a087ed6a 100644
--- a/src/models/costs.h
+++ b/src/models/costs.h
@@ -297,20 +297,30 @@ public:
   virtual Ptr apply(Ptr state) override;
 };
 
-// Gumbel-max noising for sampling during beam-search
-// Seems to work well enough with beam-size=1. Turn on
-// with --output-sampling during translation with marian-decoder
+// Gumbel-max noising for sampling during translation.
+// Produces accurate sampling with beam=1. Turn on
+// with --output-sampling [full] during translation
+// with marian-decoder for sampling from the full
+// softmax distribution.
class GumbelSoftmaxStep : public ILogProbStep { public: virtual ~GumbelSoftmaxStep() {} - virtual Ptr apply(Ptr state) override { - state->setLogProbs(state->getLogProbs().applyUnaryFunctions( - [](Expr logits) { // lemma gets gumbelled - return logsoftmax(logits + constant_like(logits, inits::gumbel())); - }, - logsoftmax)); // factors don't - return state; - } + virtual Ptr apply(Ptr state) override; +}; + + +// Gumbel-max noising for top-k sampling during translation. +// Produces accurate sampling with beam=1. Turn on +// with --output-sampling topk [10] during translation +// with marian-decoder for top-10 sampling. +class TopkGumbelSoftmaxStep : public ILogProbStep { +private: + int k_{1}; + +public: + TopkGumbelSoftmaxStep(int k); + virtual ~TopkGumbelSoftmaxStep() {} + virtual Ptr apply(Ptr state) override; }; // class to wrap an IEncoderDecoder and a ILogProbStep that are executed in sequence, diff --git a/src/models/model_factory.cpp b/src/models/model_factory.cpp index e176e6a4..52a87e72 100644 --- a/src/models/model_factory.cpp +++ b/src/models/model_factory.cpp @@ -370,10 +370,25 @@ Ptr createModelFromOptions(Ptr options, usage use) { // add (log)softmax if requested if (use == usage::translation) { if(std::dynamic_pointer_cast(baseModel)) { - if(options->get("output-sampling", false)) - return New(std::dynamic_pointer_cast(baseModel), New()); - else + if(options->hasAndNotEmpty("output-sampling")) { + auto sampling = options->get>("output-sampling", {}); + std::string method = sampling.size() > 0 ? sampling[0] : "full"; + + if(method == "full" || method == "1" /*for backwards-compat when output-sampling: true in yaml file*/) { + LOG(info, "Output sampling from the full softmax distribution"); + return New(std::dynamic_pointer_cast(baseModel), New()); + } else if(method == "topk") { + int k = sampling.size() > 1 ? 
std::stoi(sampling[1]) : 10; + if(k == 1) + LOG(info, "Output sampling with k=1 is equivalent to beam search with beam size 1"); + LOG(info, "Output sampling via top-{} sampling", k); + return New(std::dynamic_pointer_cast(baseModel), New(k)); + } else { + ABORT("Unknown sampling method: {}", method); + } + } else { return New(std::dynamic_pointer_cast(baseModel), New()); + } } #ifdef COMPILE_EXAMPLES // note: 'usage::translation' here means 'inference' diff --git a/src/tensors/cpu/tensor_operators.cpp b/src/tensors/cpu/tensor_operators.cpp index f3964f91..1e1adc38 100755 --- a/src/tensors/cpu/tensor_operators.cpp +++ b/src/tensors/cpu/tensor_operators.cpp @@ -739,6 +739,7 @@ void Select(Tensor out, } } +template void Insert(Tensor out, const Tensor in, const Tensor indices, @@ -760,10 +761,16 @@ void Insert(Tensor out, int idxIndex = idxShape.bindex(dims); // broadcast index into indices tensor dims[axisCPU] = (int)indices->data()[idxIndex]; int outIndex = outShape.index(dims); - out->data()[outIndex] += in->data()[index]; + if(add) + out->data()[outIndex] += in->data()[index]; + else + out->data()[outIndex] = in->data()[index]; } } +template void Insert(Tensor out, const Tensor in, const Tensor indices, int axis); +template void Insert(Tensor out, const Tensor in, const Tensor indices, int axis); + void GRUFastForward(Tensor out_, std::vector inputs, bool final) { int rows = out_->shape().elements() / out_->shape().back(); int cols = out_->shape().back(); diff --git a/src/tensors/gpu/tensor_operators.cu b/src/tensors/gpu/tensor_operators.cu index 1347c3bb..2103ca9d 100644 --- a/src/tensors/gpu/tensor_operators.cu +++ b/src/tensors/gpu/tensor_operators.cu @@ -1309,7 +1309,7 @@ __global__ void gSelect(T* out, } } -template +template __global__ void gInsert(T* out, functional::Shape outShape, const T* in, @@ -1327,7 +1327,10 @@ __global__ void gInsert(T* out, int idxIndex = idxShape.bindex(dims); // broadcast index into indices tensor dims[axis] = (int)d_indices[idxIndex]; int outIndex = outShape.index(dims); - out[outIndex] += in[index]; // this is probably wrong, atomicAdd? + if(add) + out[outIndex] += in[index]; // this is probably wrong, atomicAdd? 
+ else + out[outIndex] = in[index]; } } } @@ -1349,21 +1352,21 @@ void Select(Tensor out, if(out->type() == Type::float32) { gSelect<<>>(out->data(), - out->shape(), - in->data(), - in->shape(), - axisGPU, - indices->data(), - indices->shape()); + out->shape(), + in->data(), + in->shape(), + axisGPU, + indices->data(), + indices->shape()); #if COMPILE_FP16 } else if (out->type() == Type::float16) { gSelect<<>>(out->data(), - out->shape(), - in->data(), - in->shape(), - axisGPU, - indices->data(), - indices->shape()); + out->shape(), + in->data(), + in->shape(), + axisGPU, + indices->data(), + indices->shape()); #endif } else if(out->type() == Type::uint32) { gSelect<<>>(out->data(), @@ -1378,6 +1381,7 @@ void Select(Tensor out, } } +template void Insert(Tensor out, const Tensor in, const Tensor indices, @@ -1393,28 +1397,31 @@ void Insert(Tensor out, int axisGPU = axis + functional::Shape::size() - out->shape().size(); if(out->type() == Type::float32) { - gInsert<<>>(out->data(), - out->shape(), - in->data(), - in->shape(), - axisGPU, - indices->data(), - indices->shape()); + gInsert<<>>(out->data(), + out->shape(), + in->data(), + in->shape(), + axisGPU, + indices->data(), + indices->shape()); #if COMPILE_FP16 } else if (out->type() == Type::float16) { - gInsert<<>>(out->data(), - out->shape(), - in->data(), - in->shape(), - axisGPU, - indices->data(), - indices->shape()); + gInsert<<>>(out->data(), + out->shape(), + in->data(), + in->shape(), + axisGPU, + indices->data(), + indices->shape()); #endif } else { ABORT("Insert not implemented for type {}", out->type()); } } +template void Insert(Tensor out, const Tensor in, const Tensor indices, int axis); +template void Insert(Tensor out, const Tensor in, const Tensor indices, int axis); + template __global__ void gGRUFastForward(T* out, const T* state, diff --git a/src/tensors/tensor_operators.h b/src/tensors/tensor_operators.h index dc29bf35..1fc4542d 100644 --- a/src/tensors/tensor_operators.h +++ b/src/tensors/tensor_operators.h @@ -297,7 +297,28 @@ DISPATCH3(CopyCols, marian::Tensor, const marian::Tensor, const marian::Tensor) DISPATCH3(PasteCols, marian::Tensor, const marian::Tensor, const marian::Tensor) DISPATCH4(Select, marian::Tensor, const marian::Tensor, const marian::Tensor, int) -DISPATCH4(Insert, marian::Tensor, const marian::Tensor, const marian::Tensor, int) + +#ifdef CUDA_FOUND +namespace gpu { + template + void Insert(Tensor out, const Tensor in, const Tensor indices, int axis); +} +#endif + +namespace cpu { + template + void Insert(Tensor out, const Tensor in, const Tensor indices, int axis); +} + +template +static inline void Insert(Tensor out, const Tensor in, const Tensor indices, int axis) { +#ifdef CUDA_FOUND + if(out->getBackend()->getDeviceId().type == DeviceType::gpu) + gpu::Insert(out, in, indices, axis); + else +#endif + cpu::Insert(out, in, indices, axis); +} DISPATCH7(TopK, marian::Tensor, marian::Tensor, Ptr, const marian::Tensor, int, int, bool); diff --git a/src/translator/translator.h b/src/translator/translator.h index db1f3d03..3e375f65 100644 --- a/src/translator/translator.h +++ b/src/translator/translator.h @@ -119,7 +119,7 @@ public: threadPool.enqueue(task, device, id++); } - if(options_->get("output-sampling", false)) { + if(options_->hasAndNotEmpty("output-sampling")) { if(options_->get("beam-size") > 1) LOG(warn, "[warning] Output sampling and beam search (beam-size > 1) are contradictory methods " From 8b8d1b11e28a421b348703d702c9c5206061df9d Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt 
Date: Thu, 25 Nov 2021 02:33:49 +0000
Subject: [PATCH 09/11] Merged PR 21553: Parallelize data reading for training

This parallelizes data reading. On very fast GPUs and with small models,
training speed can be starved by too slow batch creation. Use `--data-threads 8`
or more; the default is currently 1 for backward compatibility.
---
 src/common/config_parser.cpp |   7 ++
 src/common/utils.cpp         |   8 +-
 src/data/batch_generator.h   |  35 +++++---
 src/data/corpus.cpp          | 160 +++++++++++++++++++----------------
 src/data/corpus.h            |   3 +
 src/data/corpus_base.cpp     |  44 +++++-----
 src/data/corpus_base.h       | 105 +++++++++++++++++++++--
 src/data/corpus_nbest.cpp    |   7 +-
 src/data/corpus_sqlite.cpp   |   6 +-
 src/data/text_input.cpp      |   6 +-
 10 files changed, 255 insertions(+), 126 deletions(-)

diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp
index 59b328e9..3d79f8af 100644
--- a/src/common/config_parser.cpp
+++ b/src/common/config_parser.cpp
@@ -883,6 +883,10 @@ void ConfigParser::addSuboptionsBatching(cli::CLIWrapper& cli) {
   if(mode_ == cli::mode::training) {
     cli.add<bool>("--shuffle-in-ram",
         "Keep shuffled corpus in RAM, do not write to temp file");
+
+    cli.add<size_t>("--data-threads",
+        "Number of concurrent threads to use during data reading and processing", 1);
+
     // @TODO: Consider making the next two options options of the vocab instead, to make it more local in scope.
     cli.add<size_t>("--all-caps-every",
         "When forming minibatches, preprocess every Nth line on the fly to all-caps. Assumes UTF-8");
@@ -901,6 +905,9 @@ void ConfigParser::addSuboptionsBatching(cli::CLIWrapper& cli) {
     cli.add<bool>("--mini-batch-round-up",
         "Round up batch size to next power of 2 for more efficient training, but this can make batch size less stable. Disable with --mini-batch-round-up=false",
         true);
+  } else {
+    cli.add<size_t>("--data-threads",
+        "Number of concurrent threads to use during data reading and processing", 1);
   }
   // clang-format on
 }
diff --git a/src/common/utils.cpp b/src/common/utils.cpp
index 72624041..99fc790a 100644
--- a/src/common/utils.cpp
+++ b/src/common/utils.cpp
@@ -70,22 +70,20 @@ void split(const std::string& line,
 // the function guarantees that the output has as many elements as requested
 void splitTsv(const std::string& line, std::vector<std::string>& fields, size_t numFields) {
   fields.clear();
+  fields.resize(numFields); // make sure there are as many elements as requested
 
   size_t begin = 0;
   size_t pos = 0;
   for(size_t i = 0; i < numFields; ++i) {
     pos = line.find('\t', begin);
     if(pos == std::string::npos) {
-      fields.push_back(line.substr(begin));
+      fields[i] = line.substr(begin);
       break;
     }
-    fields.push_back(line.substr(begin, pos - begin));
+    fields[i] = line.substr(begin, pos - begin);
     begin = pos + 1;
   }
 
-  if(fields.size() < numFields) // make sure there is as many elements as requested
-    fields.resize(numFields);
-
   ABORT_IF(pos != std::string::npos, "Excessive field(s) in the tab-separated line: '{}'", line);
 }
diff --git a/src/data/batch_generator.h b/src/data/batch_generator.h
index a248db23..ea977468 100644
--- a/src/data/batch_generator.h
+++ b/src/data/batch_generator.h
@@ -2,6 +2,7 @@
 
 #include "common/options.h"
 #include "common/signal_handling.h"
+#include "common/timer.h"
 #include "data/batch_stats.h"
 #include "data/rng_engine.h"
 #include "training/training_state.h"
@@ -92,6 +93,8 @@ private:
   // this runs on a bg thread; sequencing is handled by caller, but locking is done in here
   std::deque<BatchPtr> fetchBatches() {
+    timer::Timer total;
+
     typedef typename Sample::value_type Item;
     auto itemCmp = [](const Item& sa, const Item& sb) { return
sa.size() < sb.size(); }; // sort by element length, not content @@ -135,19 +138,29 @@ private: if(current_ != data_->end()) ++current_; } - size_t sets = 0; - while(current_ != data_->end() && maxiBatch->size() < maxSize) { // loop over data + + Samples maxiBatchTemp; + while(current_ != data_->end() && maxiBatchTemp.size() < maxSize) { // loop over data if (saveAndExitRequested()) // stop generating batches return std::deque(); - maxiBatch->push(*current_); - sets = current_->size(); + + maxiBatchTemp.push_back(*current_); + // do not consume more than required for the maxi batch as this causes // that line-by-line translation is delayed by one sentence - bool last = maxiBatch->size() == maxSize; + bool last = maxiBatchTemp.size() == maxSize; if(!last) ++current_; // this actually reads the next line and pre-processes it } - size_t numSentencesRead = maxiBatch->size(); + size_t numSentencesRead = maxiBatchTemp.size(); + + size_t sets = 0; + for(auto&& s : maxiBatchTemp) { + if(!s.empty()) { + sets = s.size(); + maxiBatch->push(s); + } + } // construct the actual batches and place them in the queue Samples batchVector; @@ -163,6 +176,7 @@ private: BatchStats::const_iterator cachedStatsIter; if (stats_) cachedStatsIter = stats_->begin(); + while(!maxiBatch->empty()) { // while there are sentences in the queue if (saveAndExitRequested()) // stop generating batches return std::deque(); @@ -178,12 +192,7 @@ private: lengths[i] = batchVector.back()[i].size(); // record max lengths so far maxBatchSize = stats_->findBatchSize(lengths, cachedStatsIter); - // this optimization makes no difference indeed -#if 0 // sanity check: would we find the same entry if searching from the start? - auto it = stats_->lower_bound(lengths); - auto maxBatchSize1 = stats_->findBatchSize(lengths, it); - ABORT_IF(maxBatchSize != maxBatchSize1, "findBatchSize iter caching logic is borked"); -#endif + makeBatch = batchVector.size() >= maxBatchSize; // if last added sentence caused a bump then we likely have bad padding, so rather move it into the next batch if(batchVector.size() > maxBatchSize) { @@ -231,6 +240,8 @@ private: LOG(debug, "[data] fetched {} batches with {} sentences. 
Per batch: {} sentences, {} labels.", tempBatches.size(), numSentencesRead, (double)totalSent / (double)totalDenom, (double)totalLabels / (double)totalDenom); + LOG(debug, "[data] fetching batches took {:.2f} seconds, {:.2f} sents/s", total.elapsed(), (double)numSentencesRead / total.elapsed()); + return tempBatches; } diff --git a/src/data/corpus.cpp b/src/data/corpus.cpp index d8a364b2..643a7de9 100644 --- a/src/data/corpus.cpp +++ b/src/data/corpus.cpp @@ -14,18 +14,30 @@ namespace data { Corpus::Corpus(Ptr options, bool translate /*= false*/, size_t seed /*= Config:seed*/) : CorpusBase(options, translate, seed), - shuffleInRAM_(options_->get("shuffle-in-ram", false)), - allCapsEvery_(options_->get("all-caps-every", 0)), - titleCaseEvery_(options_->get("english-title-case-every", 0)) {} + shuffleInRAM_(options_->get("shuffle-in-ram", false)), + allCapsEvery_(options_->get("all-caps-every", 0)), + titleCaseEvery_(options_->get("english-title-case-every", 0)) { + + auto numThreads = options_->get("data-threads", 1); + if(numThreads > 1) + threadPool_.reset(new ThreadPool(numThreads)); + +} Corpus::Corpus(std::vector paths, std::vector> vocabs, Ptr options, size_t seed /*= Config:seed*/) : CorpusBase(paths, vocabs, options, seed), - shuffleInRAM_(options_->get("shuffle-in-ram", false)), - allCapsEvery_(options_->get("all-caps-every", 0)), - titleCaseEvery_(options_->get("english-title-case-every", 0)) {} + shuffleInRAM_(options_->get("shuffle-in-ram", false)), + allCapsEvery_(options_->get("all-caps-every", 0)), + titleCaseEvery_(options_->get("english-title-case-every", 0)) { + + auto numThreads = options_->get("data-threads", 1); + if(numThreads > 1) + threadPool_.reset(new ThreadPool(numThreads)); + +} void Corpus::preprocessLine(std::string& line, size_t streamId, bool& altered) { bool isFactoredVocab = vocabs_.back()->tryAs() != nullptr; @@ -52,16 +64,10 @@ void Corpus::preprocessLine(std::string& line, size_t streamId, bool& altered) { } SentenceTuple Corpus::next() { - // Used for handling TSV inputs - // Determine the total number of fields including alignments or weights - auto tsvNumAllFields = tsvNumInputFields_; - if(alignFileIdx_ > -1) - ++tsvNumAllFields; - if(weightFileIdx_ > -1) - ++tsvNumAllFields; - std::vector fields(tsvNumAllFields); + size_t numStreams = corpusInRAM_.empty() ? files_.size() : corpusInRAM_.size(); + std::vector fields(numStreams); - for(;;) { // (this is a retry loop for skipping invalid sentences) + while(true) { // retry loop // get index of the current sentence size_t curId = pos_; // note: at end, pos_ == total size // if corpus has been shuffled, ids_ contains sentence indexes @@ -69,83 +75,91 @@ SentenceTuple Corpus::next() { curId = ids_[pos_]; pos_++; - // fill up the sentence tuple with sentences from all input files - SentenceTuple tup(curId); size_t eofsHit = 0; - size_t numStreams = corpusInRAM_.empty() ? 
files_.size() : corpusInRAM_.size();
-    for(size_t i = 0; i < numStreams; ++i) {
-      std::string line;
-
+    for(size_t i = 0; i < numStreams; ++i) { // looping over all streams
       // fetch line, from cached copy in RAM or actual file
       if (!corpusInRAM_.empty()) {
         if (curId < corpusInRAM_[i].size())
-          line = corpusInRAM_[i][curId];
+          fields[i] = corpusInRAM_[i][curId];
         else {
           eofsHit++;
           continue;
         }
       } else {
-        bool gotLine = io::getline(*files_[i], line).good();
+        bool gotLine = io::getline(*files_[i], fields[i]).good();
         if(!gotLine) {
           eofsHit++;
           continue;
         }
       }
-
-      if(i > 0 && i == alignFileIdx_) {
-        addAlignmentToSentenceTuple(line, tup);
-      } else if(i > 0 && i == weightFileIdx_) {
-        addWeightsToSentenceTuple(line, tup);
-      } else {
-        if(tsv_) { // split TSV input and add each field into the sentence tuple
-          utils::splitTsv(line, fields, tsvNumAllFields);
-          size_t shift = 0;
-          for(size_t j = 0; j < tsvNumAllFields; ++j) {
-            // index j needs to be shifted to get the proper vocab index if guided-alignment or
-            // data-weighting are preceding source or target sequences in TSV input
-            if(j == alignFileIdx_ || j == weightFileIdx_) {
-              ++shift;
-            } else {
-              size_t vocabId = j - shift;
-              bool altered;
-              preprocessLine(fields[j], vocabId, /*out=*/altered);
-              if (altered)
-                tup.markAltered();
-              addWordsToSentenceTuple(fields[j], vocabId, tup);
-            }
-          }
-
-          // weights are added last to the sentence tuple, because this runs a validation that needs
-          // length of the target sequence
-          if(alignFileIdx_ > -1)
-            addAlignmentToSentenceTuple(fields[alignFileIdx_], tup);
-          if(weightFileIdx_ > -1)
-            addWeightsToSentenceTuple(fields[weightFileIdx_], tup);
-
-        } else {
-          bool altered;
-          preprocessLine(line, i, /*out=*/altered);
-          if (altered)
-            tup.markAltered();
-          addWordsToSentenceTuple(line, i, tup);
-        }
-      }
     }
 
-    if (eofsHit == numStreams)
-      return SentenceTuple(0);
+    if(eofsHit == numStreams)
+      return SentenceTuple(); // uninitialized SentenceTuple, which will be invalid when tested
+
     ABORT_IF(eofsHit != 0, "not all input files have the same number of lines");
 
-    // check if all streams are valid, that is, non-empty and no longer than maximum allowed length
-    if(std::all_of(tup.begin(), tup.end(), [=](const Words& words) {
-         return words.size() > 0 && words.size() <= maxLength_;
-       }))
-      return tup;
+    auto makeSentenceTuple = [this](size_t curId, std::vector<std::string> fields) {
+      if(tsv_) {
+        // with TSV input data, there is only one input stream, hence we only have one field
+        // which needs to be tokenized into tab-separated fields
+        ABORT_IF(fields.size() != 1, "Reading a TSV file, but we don't have exactly one stream??");
+        size_t numAllFields = tsvNumInputFields_;
+        if(alignFileIdx_ > -1)
+          ++numAllFields;
+        if(weightFileIdx_ > -1)
+          ++numAllFields;
+        // replace single-element fields array with extracted tsv fields
+        std::vector<std::string> tmpFields;
+        utils::splitTsv(fields[0], tmpFields, numAllFields); // this verifies the number of fields
+        fields.swap(tmpFields);
+      }
 
-    // otherwise skip this sentence and try the next one
-    // @TODO: tail recursion?
- } + // fill up the sentence tuple with sentences from all input files + SentenceTupleImpl tup(curId); + size_t shift = 0; + for(size_t i = 0; i < fields.size(); ++i) { + // index j needs to be shifted to get the proper vocab index if guided-alignment or + // data-weighting are preceding source or target sequences in TSV input + if(i == alignFileIdx_ || i == weightFileIdx_) { + ++shift; + } else { + size_t vocabId = i - shift; + bool altered; + preprocessLine(fields[i], vocabId, /*out=*/altered); + if (altered) + tup.markAltered(); + addWordsToSentenceTuple(fields[i], vocabId, tup); + } + + // weights are added last to the sentence tuple, because this runs a validation that needs + // length of the target sequence + if(alignFileIdx_ > -1) + addAlignmentToSentenceTuple(fields[alignFileIdx_], tup); + if(weightFileIdx_ > -1) + addWeightsToSentenceTuple(fields[weightFileIdx_], tup); + } + + // check if all streams are valid, that is, non-empty and no longer than maximum allowed length + if(std::all_of(tup.begin(), tup.end(), [=](const Words& words) { + return words.size() > 0 && words.size() <= maxLength_; + })) { + return tup; + } else { + return SentenceTupleImpl(); // return an empty tuple if above test does not pass + } + }; + + if(threadPool_) { // use thread pool if available + return SentenceTuple(threadPool_->enqueue(makeSentenceTuple, curId, fields)); + } else { // otherwise launch here and just pass the result into the wrapper + auto tup = makeSentenceTuple(curId, fields); + if(!tup.empty()) + return SentenceTuple(tup); + } + + } // end of retry loop } // reset and initialize shuffled reading @@ -167,6 +181,8 @@ void Corpus::reset() { pos_ = 0; for (size_t i = 0; i < paths_.size(); ++i) { if(paths_[i] == "stdin" || paths_[i] == "-") { + std::cin.tie(0); + std::ios_base::sync_with_stdio(false); files_[i].reset(new std::istream(std::cin.rdbuf())); // Probably not necessary, unless there are some buffers // that we want flushed. 
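For illustration, the pattern that `threadPool_->enqueue(makeSentenceTuple, curId, fields)` above relies on, and that the corpus.h/corpus_base.h changes below implement, distilled into a self-contained sketch. Here `std::async` stands in for marian's `ThreadPool::enqueue`, and the names WorkItem, LazyResult and makeItem are placeholders, not marian types:

// Minimal sketch of a future-backed, lazily materialized result wrapper.
#include <future>
#include <memory>
#include <string>
#include <utility>
#include <vector>

struct WorkItem {                         // stands in for SentenceTupleImpl
  std::vector<std::string> fields;
};

class LazyResult {                        // stands in for the new SentenceTuple
private:
  std::shared_ptr<std::future<WorkItem>> fut_;
  mutable std::shared_ptr<WorkItem> val_; // cached result after first access
public:
  LazyResult() {}
  explicit LazyResult(std::future<WorkItem>&& fut)
      : fut_(std::make_shared<std::future<WorkItem>>(std::move(fut))) {}

  bool valid() const { return fut_ || val_; }

  const WorkItem& get() const {           // blocks here, not at creation time
    if(!val_)
      val_ = std::make_shared<WorkItem>(fut_->get()); // future::get() may run only once
    return *val_;
  }
};

int main() {
  auto makeItem = [](std::string line) {  // the expensive part, e.g. tokenization
    return WorkItem{{std::move(line)}};
  };
  LazyResult res(std::async(std::launch::async, makeItem, std::string("a b c")));
  // batching code can copy LazyResult objects around cheaply; only the first
  // access to the contents synchronizes with the worker thread
  return res.get().fields.size() == 1 ? 0 : 1;
}

As in the patch, copies of the wrapper share one future through a shared_ptr, so the reader thread can hand results to the batch generator before preprocessing has finished.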
diff --git a/src/data/corpus.h b/src/data/corpus.h index e8e9a9fd..281d43a2 100644 --- a/src/data/corpus.h +++ b/src/data/corpus.h @@ -4,6 +4,7 @@ #include #include +#include "3rd_party/threadpool.h" #include "common/definitions.h" #include "common/file_stream.h" #include "common/options.h" @@ -20,6 +21,8 @@ class Corpus : public CorpusBase { private: std::vector> tempFiles_; std::vector ids_; + + UPtr threadPool_; // thread pool for parallelized data reading // for shuffle-in-ram bool shuffleInRAM_{false}; diff --git a/src/data/corpus_base.cpp b/src/data/corpus_base.cpp index 5f9a9ee3..bfce31bf 100644 --- a/src/data/corpus_base.cpp +++ b/src/data/corpus_base.cpp @@ -12,7 +12,24 @@ typedef std::vector MaskBatch; typedef std::pair WordMask; typedef std::vector SentBatch; -CorpusIterator::CorpusIterator() : pos_(-1), tup_(0) {} +void SentenceTupleImpl::setWeights(const std::vector& weights) { + if(weights.size() != 1) { // this assumes a single sentence-level weight is always fine + ABORT_IF(empty(), "Source and target sequences should be added to a tuple before data weights"); + auto numWeights = weights.size(); + auto numTrgWords = back().size(); + // word-level weights may or may not contain a weight for EOS tokens + if(numWeights != numTrgWords && numWeights != numTrgWords - 1) + LOG(warn, + "[warn] " + "Number of weights ({}) does not match the number of target words ({}) in line #{}", + numWeights, + numTrgWords, + id_); + } + weights_ = weights; +} + +CorpusIterator::CorpusIterator() : pos_(-1) {} CorpusIterator::CorpusIterator(CorpusBase* corpus) : corpus_(corpus), pos_(0), tup_(corpus_->next()) {} @@ -23,7 +40,7 @@ void CorpusIterator::increment() { } bool CorpusIterator::equal(CorpusIterator const& other) const { - return this->pos_ == other.pos_ || (this->tup_.empty() && other.tup_.empty()); + return this->pos_ == other.pos_ || (!this->tup_.valid() && !other.tup_.valid()); } const SentenceTuple& CorpusIterator::dereference() const { @@ -390,7 +407,7 @@ CorpusBase::CorpusBase(Ptr options, bool translate, size_t seed) void CorpusBase::addWordsToSentenceTuple(const std::string& line, size_t batchIndex, - SentenceTuple& tup) const { + SentenceTupleImpl& tup) const { // This turns a string in to a sequence of numerical word ids. Depending // on the vocabulary type, this can be non-trivial, e.g. when SentencePiece // is used. 
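The check moved into SentenceTupleImpl::setWeights above encodes a small contract worth spelling out: a single sentence-level weight always passes, while word-level weight vectors may either include or omit a weight for the implicit EOS token. A standalone illustration (checkWeights is a made-up helper, not marian code):

#include <cstddef>
#include <cstdio>

// Mirrors the accepted shapes in setWeights: 1 weight, numTrgWords weights,
// or numTrgWords - 1 weights (EOS weight omitted).
bool checkWeights(std::size_t numWeights, std::size_t numTrgWords) {
  if(numWeights == 1)                        // sentence-level weight
    return true;
  return numWeights == numTrgWords           // one weight per target word incl. EOS
         || numWeights == numTrgWords - 1;   // or excluding EOS
}

int main() {
  // a target sentence "a b c </s>" has 4 words including EOS
  std::printf("%d %d %d %d\n",
              checkWeights(1, 4),   // 1: sentence-level
              checkWeights(4, 4),   // 1: per-word incl. EOS
              checkWeights(3, 4),   // 1: per-word excl. EOS
              checkWeights(2, 4));  // 0: mismatch -> marian only logs a warning
  return 0;
}
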
@@ -411,7 +428,7 @@ void CorpusBase::addWordsToSentenceTuple(const std::string& line, } void CorpusBase::addAlignmentToSentenceTuple(const std::string& line, - SentenceTuple& tup) const { + SentenceTupleImpl& tup) const { ABORT_IF(rightLeft_, "Guided alignment and right-left model cannot be used " "together at the moment"); @@ -420,7 +437,7 @@ void CorpusBase::addAlignmentToSentenceTuple(const std::string& line, tup.setAlignment(align); } -void CorpusBase::addWeightsToSentenceTuple(const std::string& line, SentenceTuple& tup) const { +void CorpusBase::addWeightsToSentenceTuple(const std::string& line, SentenceTupleImpl& tup) const { auto elements = utils::split(line, " "); if(!elements.empty()) { @@ -549,23 +566,6 @@ size_t CorpusBase::getNumberOfTSVInputFields(Ptr options) { return 0; } -void SentenceTuple::setWeights(const std::vector& weights) { - if(weights.size() != 1) { // this assumes a single sentence-level weight is always fine - ABORT_IF(empty(), "Source and target sequences should be added to a tuple before data weights"); - auto numWeights = weights.size(); - auto numTrgWords = back().size(); - // word-level weights may or may not contain a weight for EOS tokens - if(numWeights != numTrgWords && numWeights != numTrgWords - 1) - LOG(warn, - "[warn] " - "Number of weights ({}) does not match the number of target words ({}) in line #{}", - numWeights, - numTrgWords, - id_); - } - weights_ = weights; -} - // experimental: hide inline-fix source tokens from cross attention std::vector SubBatch::crossMaskWithInlineFixSourceSuppressed() const { diff --git a/src/data/corpus_base.h b/src/data/corpus_base.h index 251df5bc..82a01286 100644 --- a/src/data/corpus_base.h +++ b/src/data/corpus_base.h @@ -11,6 +11,8 @@ #include "data/rng_engine.h" #include "data/vocab.h" +#include + namespace marian { namespace data { @@ -22,7 +24,7 @@ namespace data { * construction of marian::data::CorpusBatch objects. They are not a part of * marian::data::CorpusBatch. */ -class SentenceTuple { +class SentenceTupleImpl { private: size_t id_; std::vector tuple_; // [stream index][step index] @@ -33,12 +35,17 @@ private: public: typedef Words value_type; + /** + * @brief Creates an empty tuple with 0 id (default constructor). + */ + SentenceTupleImpl() : id_(0) {} + /** * @brief Creates an empty tuple with the given Id. */ - SentenceTuple(size_t id) : id_(id) {} + SentenceTupleImpl(size_t id) : id_(id) {} - ~SentenceTuple() { tuple_.clear(); } + ~SentenceTupleImpl() {} /** * @brief Returns the sentence's ID. @@ -114,6 +121,92 @@ public: void setAlignment(const WordAlignment& alignment) { alignment_ = alignment; } }; +class SentenceTuple { +private: + std::shared_ptr> fImpl_; + mutable std::shared_ptr impl_; + +public: + typedef Words value_type; + + /** + * @brief Creates an empty tuple with no associated future. + */ + SentenceTuple() {} + + SentenceTuple(const SentenceTupleImpl& tupImpl) + : impl_(std::make_shared(tupImpl)) {} + + SentenceTuple(std::future&& fImpl) + : fImpl_(new std::future(std::move(fImpl))) {} + + SentenceTupleImpl& get() const { + if(!impl_) { + ABORT_IF(!fImpl_ || !fImpl_->valid(), "No future tuple associated with SentenceTuple"); + impl_ = std::make_shared(fImpl_->get()); + } + return *impl_; + } + + /** + * @brief Returns the sentence's ID. + */ + size_t getId() const { return get().getId(); } + + /** + * @brief Returns whether this Tuple was altered or augmented from what + * was provided to Marian in input. 
+ */ + bool isAltered() const { return get().isAltered(); } + + /** + * @brief The size of the tuple, e.g. two for parallel data with a source and + * target sentences. + */ + size_t size() const { return get().size(); } + + /** + * @brief confirms that the tuple has been populated with data + */ + bool valid() const { + return fImpl_ || impl_; + } + + /** + * @brief The i-th tuple sentence. + * + * @param i Tuple's index. + */ + Words& operator[](size_t i) { return get()[i]; } + const Words& operator[](size_t i) const { return get()[i]; } + + /** + * @brief The last tuple sentence, i.e. the target sentence. + */ + Words& back() { return get().back(); } + const Words& back() const { return get().back(); } + + /** + * @brief Checks whether the tuple is empty. + */ + bool empty() const { return get().empty(); } + + auto begin() const -> decltype(get().begin()) { return get().begin(); } + auto end() const -> decltype(get().end()) { return get().end(); } + + auto rbegin() const -> decltype(get().rbegin()) { return get().rbegin(); } + auto rend() const -> decltype(get().rend()) { return get().rend(); } + + /** + * @brief Get sentence weights. + * + * For sentence-level weights the vector contains only one element. + */ + const std::vector& getWeights() const { return get().getWeights(); } + + const WordAlignment& getAlignment() const { return get().getAlignment(); } +}; + /** * @brief Batch of sentences represented as word indices with masking. */ @@ -586,17 +679,17 @@ protected: * @brief Helper function converting a line of text into words using the i-th * vocabulary and adding them to the sentence tuple. */ - void addWordsToSentenceTuple(const std::string& line, size_t batchIndex, SentenceTuple& tup) const; + void addWordsToSentenceTuple(const std::string& line, size_t batchIndex, SentenceTupleImpl& tup) const; /** * @brief Helper function parsing a line with word alignments and adding them * to the sentence tuple. */ - void addAlignmentToSentenceTuple(const std::string& line, SentenceTuple& tup) const; + void addAlignmentToSentenceTuple(const std::string& line, SentenceTupleImpl& tup) const; /** * @brief Helper function parsing a line of weights and adding them to the * sentence tuple. 
*/ - void addWeightsToSentenceTuple(const std::string& line, SentenceTuple& tup) const; + void addWeightsToSentenceTuple(const std::string& line, SentenceTupleImpl& tup) const; void addAlignmentsToBatch(Ptr batch, const std::vector& batchVector); diff --git a/src/data/corpus_nbest.cpp b/src/data/corpus_nbest.cpp index d5a48d8d..8029d351 100644 --- a/src/data/corpus_nbest.cpp +++ b/src/data/corpus_nbest.cpp @@ -43,7 +43,7 @@ SentenceTuple CorpusNBest::next() { pos_++; // fill up the sentence tuple with sentences from all input files - SentenceTuple tup(curId); + SentenceTupleImpl tup(curId); std::string line; lastLines_.resize(files_.size() - 1); @@ -74,9 +74,10 @@ SentenceTuple CorpusNBest::next() { if(cont && std::all_of(tup.begin(), tup.end(), [=](const Words& words) { return words.size() > 0 && words.size() <= maxLength_; })) - return tup; + return SentenceTuple(tup); } - return SentenceTuple(0); + + return SentenceTuple(); } void CorpusNBest::reset() { diff --git a/src/data/corpus_sqlite.cpp b/src/data/corpus_sqlite.cpp index 297847c0..f7c577f2 100644 --- a/src/data/corpus_sqlite.cpp +++ b/src/data/corpus_sqlite.cpp @@ -109,7 +109,7 @@ SentenceTuple CorpusSQLite::next() { while(select_->executeStep()) { // fill up the sentence tuple with sentences from all input files size_t curId = select_->getColumn(0).getInt(); - SentenceTuple tup(curId); + SentenceTupleImpl tup(curId); for(size_t i = 0; i < files_.size(); ++i) { auto line = select_->getColumn((int)(i + 1)); @@ -126,9 +126,9 @@ SentenceTuple CorpusSQLite::next() { if(std::all_of(tup.begin(), tup.end(), [=](const Words& words) { return words.size() > 0 && words.size() <= maxLength_; })) - return tup; + return SentenceTuple(tup); } - return SentenceTuple(0); + return SentenceTuple(); } void CorpusSQLite::shuffle() { diff --git a/src/data/text_input.cpp b/src/data/text_input.cpp index 958190fc..b1f4cdd4 100644 --- a/src/data/text_input.cpp +++ b/src/data/text_input.cpp @@ -40,7 +40,7 @@ SentenceTuple TextInput::next() { size_t curId = pos_++; // fill up the sentence tuple with source and/or target sentences - SentenceTuple tup(curId); + SentenceTupleImpl tup(curId); for(size_t i = 0; i < files_.size(); ++i) { std::string line; if(io::getline(*files_[i], line)) { @@ -57,9 +57,9 @@ SentenceTuple TextInput::next() { } if(tup.size() == files_.size()) // check if each input file provided an example - return tup; + return SentenceTuple(tup); else if(tup.size() == 0) // if no file provided examples we are done - return SentenceTuple(0); + return SentenceTuple(); else // neither all nor none => we have at least on missing entry ABORT("There are missing entries in the text tuples."); } From bbc673c50fbf2faa90bdc44003d15087632262bc Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Wed, 24 Nov 2021 18:42:14 -0800 Subject: [PATCH 10/11] update CHANGELOG and VERSION --- CHANGELOG.md | 6 +++++- VERSION | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 05658fe1..bce24cfc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,7 +9,11 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ## [Unreleased] ### Added -- Adds option --add-lsh to marian-conv which allows the LSH to be memory-mapped. +- Parallelized data reading with e.g. `--data-threads 8` +- Top-k sampling during decoding with e.g. `--output-sampling topk 10` +- Improved mixed precision training with `--fp16` +- Set FFN width in decoder independently from encoder with e.g. 
`--transformer-dim-ffn 4096 --transformer-decoder-dim-ffn 2048` +- Adds option `--add-lsh` to marian-conv which allows the LSH to be memory-mapped. - Early stopping based on first, all, or any validation metrics via `--early-stopping-on` - Compute 8.6 support if using CUDA>=11.1 - Support for RMSNorm as drop-in replace for LayerNorm from `Biao Zhang; Rico Sennrich (2019). Root Mean Square Layer Normalization`. Enabled in Transformer model via `--transformer-postprocess dar` instead of `dan`. diff --git a/VERSION b/VERSION index 3c40cf56..cf4bd774 100644 --- a/VERSION +++ b/VERSION @@ -1,2 +1,2 @@ -v1.10.24 +v1.10.42 From e8ea37cd5b85e3df817b9ced68bef9cc64b45d16 Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Mon, 6 Dec 2021 23:20:44 +0000 Subject: [PATCH 11/11] Merged PR 21648: Allow for dynamic gradient scaling to fade out after N updates Allow for dynamic gradient scaling to fade out after N updates --- src/tensors/gpu/prod.cpp | 6 +++++- src/training/graph_group.cpp | 17 ++++++++++++++--- src/training/graph_group.h | 1 + 3 files changed, 20 insertions(+), 4 deletions(-) diff --git a/src/tensors/gpu/prod.cpp b/src/tensors/gpu/prod.cpp index bf0d2395..c72af4db 100755 --- a/src/tensors/gpu/prod.cpp +++ b/src/tensors/gpu/prod.cpp @@ -562,7 +562,11 @@ void ProdBatchedLegacy(marian::Tensor C, ProdBatchedTypedLegacy(C, allocator, A, B, transA, transB, beta, scalar); #if COMPILE_FP16 } else if(C->type() == Type::float16) { // not a *.cu file - ProdBatchedTypedLegacy(C, allocator, A, B, transA, transB, __float2half(beta), __float2half(scalar)); + // we use computeType=float here for fp16 training as this seems more stable and roughly as fast + ProdBatchedTypedLegacy(C, allocator, A, B, transA, transB, beta, scalar); + + // original for reference: + // ProdBatchedTypedLegacy(C, allocator, A, B, transA, transB, __float2half(beta), __float2half(scalar)); #endif } else { ABORT("ProdBatchedLegacy not implemented for element type {}", C->type()); diff --git a/src/training/graph_group.cpp b/src/training/graph_group.cpp index 03e5acf4..59cd4b6d 100644 --- a/src/training/graph_group.cpp +++ b/src/training/graph_group.cpp @@ -31,11 +31,16 @@ GraphGroup::GraphGroup(Ptr options, Ptr mpi) if(vgc.size() > 0) dynamicGradientScalingFactor_ = std::stof(vgc[0]); if(vgc.size() > 1) dynamicGradientScalingUseLogs_ = vgc[1] == "log"; + if(vgc.size() > 2) dynamicGradientScalingFadeout_ = std::stoul(vgc[2]); LOG_ONCE(info, "Re-scaling gradient to have average gradient norm if (log={}) gradient norm diverges from average by {} sigmas", dynamicGradientScalingUseLogs_, dynamicGradientScalingFactor_); + if(dynamicGradientScalingFadeout_ > 0) + LOG_ONCE(info, + "Dynamic gradient re-scaling will fade out linearly after {} updates", + dynamicGradientScalingFadeout_); } if(options_->get("check-gradient-nan")) { @@ -229,11 +234,17 @@ float GraphGroup::computeNormalizationFactor(float gNorm, size_t updateTrgWords) auto deltaTransform = gNormTransform - gNormAvgTransform; // compute the difference between the current transformer gradient norm and the running average. auto gNormStdTransform = std::sqrt(gNormVarTransform); // compute STD for the running average of (log) gradient norms. 
+ float fadeoutMultiplier = 1.f; + if(dynamicGradientScalingFadeout_ > 0ul) // fade out linearly after that many updates @TODO: allow units other than updates + fadeoutMultiplier = (float)std::max(dynamicGradientScalingFadeout_, scheduler_->numberOfBatches()) / (float)dynamicGradientScalingFadeout_; + + float dynamicGradientScalingFactorWithFadeout = dynamicGradientScalingFactor_ * fadeoutMultiplier; // if fadeoutMultiplier increases dynamic gradient scaling becomes less and less likely to happen over time. // delta of (log) gradient norm vs (log) gradient norm average is larger than N standard deviations // hence rescale gradient using the average. - if(scheduler_->numberOfBatches() >= window && deltaTransform > dynamicGradientScalingFactor_ * gNormStdTransform) { - LOG(debug, "log gradient norms: {} :: {:.4f} - {:.4f} = {:.4f} > {:.4f} * {:.4f}", - dynamicGradientScalingUseLogs_, gNormTransform, gNormAvgTransform, deltaTransform, dynamicGradientScalingFactor_, gNormStdTransform); + if(scheduler_->numberOfBatches() >= window && deltaTransform > dynamicGradientScalingFactorWithFadeout * gNormStdTransform) { + if(isMainProcess()) + LOG(debug, "log gradient norms: {} :: {:.4f} - {:.4f} = {:.4f} > {:.4f} * {:.4f} - scaling gradient by {:.4f}", + dynamicGradientScalingUseLogs_, gNormTransform, gNormAvgTransform, deltaTransform, dynamicGradientScalingFactorWithFadeout, gNormStdTransform, gNormAvg / gNorm); normalizationFactor *= gNorm / gNormAvg; // since we later do gradient / normalizationFactor this divides by norm and multiplies by the average, rescaling to the average. } diff --git a/src/training/graph_group.h b/src/training/graph_group.h index b7f2f7ef..aa68922a 100644 --- a/src/training/graph_group.h +++ b/src/training/graph_group.h @@ -74,6 +74,7 @@ protected: bool dynamicGradientScaling_{false}; float dynamicGradientScalingFactor_{2.f}; bool dynamicGradientScalingUseLogs_{false}; + size_t dynamicGradientScalingFadeout_{0ul}; // determines the number of input streams (i.e. input files or fields in the TSV input) that need // to be included in the batch, i.e. without alignments and weights
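
To make the fade-out concrete: with, e.g., `--dynamic-gradient-scaling 2 log 10000` (factor 2, log-space norms, fade out after 10k updates, assuming the three positional arguments are parsed as vgc[0..2] in the hunk above), the effective threshold grows linearly once the update count passes the horizon, so re-scaling fires less and less often. A standalone numeric sketch, not marian code:

#include <algorithm>
#include <cstddef>
#include <cstdio>

int main() {
  const float factor = 2.f;          // vgc[0]: re-scale beyond this many sigmas
  const std::size_t fadeout = 10000; // vgc[2]: fade out after this many updates

  for(std::size_t batches : {1000u, 10000u, 20000u, 50000u}) {
    // before the horizon the multiplier is 1, afterwards it grows linearly
    float mult = (float)std::max(fadeout, batches) / (float)fadeout;
    // the gradient is re-scaled only if delta > (factor * mult) * sigma
    std::printf("update %6zu: threshold = %4.1f sigmas\n", batches, factor * mult);
  }
  return 0;
}

Before the horizon the behaviour is unchanged (a 2-sigma threshold); at 50k updates an outlier would have to be 10 sigmas away, so the heuristic has effectively faded out.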