From d5c7372a67a6baf1df58ae5ef7240372cedf73c1 Mon Sep 17 00:00:00 2001
From: Marcin Junczys-Dowmunt
Date: Fri, 8 Apr 2022 16:00:04 +0000
Subject: [PATCH] Merged PR 23407: Fix incorrect/missing gradient accumulation
 for affine biases

This PR fixes incorrect/missing gradient accumulation for the biases of affine
operations when using delay > 1 or a large effective batch size.
---
 CHANGELOG.md                      | 1 +
 VERSION                           | 2 +-
 src/graph/node_operators_binary.h | 8 ++++----
 3 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1d2b4338..681fa59a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 ### Added
 
 ### Fixed
+- Fix incorrect/missing gradient accumulation with delay > 1 or large effective batch size of biases of affine operations.
 - Fixed case augmentation with multi-threaded reading.
 - Scripts using PyYAML now use `safe_load`; see https://msg.pyyaml.org/load

diff --git a/VERSION b/VERSION
index 62e1a502..a130ad69 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-v1.11.5
+v1.11.6

diff --git a/src/graph/node_operators_binary.h b/src/graph/node_operators_binary.h
index b2a646b1..f46e0b89 100644
--- a/src/graph/node_operators_binary.h
+++ b/src/graph/node_operators_binary.h
@@ -334,7 +334,7 @@ public:
                     false,
                     1.0,
                     scalar_, computeTypeB)),
-        NodeOp(Prod(child(2)->grad(), child(3)->val(), adj_, true, false, 0.f, 1.f, computeTypeC))
+        NodeOp(Prod(child(2)->grad(), child(3)->val(), adj_, true, false, 1.f, 1.f, computeTypeC))
       };
 
     if(transA_ && !transB_)
@@ -353,7 +353,7 @@ public:
                     false,
                     1.0,
                     scalar_, computeTypeB)),
-        NodeOp(Prod(child(2)->grad(), child(3)->val(), adj_, true, false, 0.f, 1.f, computeTypeC))
+        NodeOp(Prod(child(2)->grad(), child(3)->val(), adj_, true, false, 1.f, 1.f, computeTypeC))
       };
 
     if(transA_ && transB_)
@@ -372,7 +372,7 @@ public:
                     true,
                     1.0,
                     scalar_, computeTypeB)),
-        NodeOp(Prod(child(2)->grad(), child(3)->val(), adj_, true, false, 0.f, 1.f, computeTypeC))
+        NodeOp(Prod(child(2)->grad(), child(3)->val(), adj_, true, false, 1.f, 1.f, computeTypeC))
       };
 
     return {
@@ -390,7 +390,7 @@ public:
                     false,
                     1.0,
                     scalar_, computeTypeB)),
-        NodeOp(Prod(child(2)->grad(), child(3)->val(), adj_, true, false, 0.f, 1.f, computeTypeC))
+        NodeOp(Prod(child(2)->grad(), child(3)->val(), adj_, true, false, 1.f, 1.f, computeTypeC))
       };
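
The argument changed from 0.f to 1.f in each Prod call above is the beta factor that decides whether the bias-gradient buffer is overwritten or accumulated into. The following standalone C++ sketch illustrates that distinction; it is not Marian code, it only assumes Prod follows the usual GEMM convention C = alpha * op(A) * op(B) + beta * C (consistent with the other calls in the diff), and the names prodBiasGrad, adj, and biasGrad are illustrative.

// Minimal sketch (not Marian code): why beta = 1 matters for the bias gradient.
// Assumed convention: C = alpha * op(A) * op(B) + beta * C, with op(A) a row of
// ones here, so the call reduces the incoming gradient adj over rows into biasGrad.
#include <cstdio>
#include <vector>

// Toy stand-in for the bias-gradient Prod call (transA = true, transB = false):
// biasGrad(1 x cols) = alpha * ones(1 x rows) * adj(rows x cols) + beta * biasGrad
void prodBiasGrad(std::vector<float>& biasGrad,
                  const std::vector<float>& adj,
                  int rows, int cols, float beta, float alpha) {
  for(int j = 0; j < cols; ++j) {
    float sum = 0.f;
    for(int i = 0; i < rows; ++i)
      sum += adj[i * cols + j];
    biasGrad[j] = alpha * sum + beta * biasGrad[j];
  }
}

int main() {
  const int rows = 2, cols = 3;
  std::vector<float> adj(rows * cols, 1.f);  // pretend incoming gradient, all ones

  std::vector<float> gradBeta0(cols, 0.f);   // old behavior: beta = 0 overwrites
  std::vector<float> gradBeta1(cols, 0.f);   // fixed behavior: beta = 1 accumulates

  // Two backward passes writing into the same gradient buffers, as happens when
  // gradients are accumulated over several batches before an optimizer step.
  for(int step = 0; step < 2; ++step) {
    prodBiasGrad(gradBeta0, adj, rows, cols, /*beta=*/0.f, /*alpha=*/1.f);
    prodBiasGrad(gradBeta1, adj, rows, cols, /*beta=*/1.f, /*alpha=*/1.f);
  }

  std::printf("beta=0: %.1f   beta=1: %.1f\n", gradBeta0[0], gradBeta1[0]);
  return 0;
}

With the all-ones toy gradient and two backward passes into the same buffer, the beta = 0 variant ends at 2.0 (the second pass overwrites the first), while beta = 1 ends at 4.0, i.e. the contributions of both passes are summed, which is what gradient accumulation with delay > 1 or a large effective batch size requires.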