From af02867fb1cfab784d02519aad3b630be1bf5bf6 Mon Sep 17 00:00:00 2001
From: Martin Junczys-Dowmunt
Date: Sat, 11 Jan 2020 20:29:43 +0000
Subject: [PATCH] Merged PR 11103: Clear cache for RNN object between batches

* Clears the cache for the RNN object in the transformer, otherwise a stale tensor might be kept around.
* Add missing `hash()` and `equal()` functions everywhere.
* Fixes a bug from the deployment test.
---
 examples                             |   2 +-
 src/3rd_party/CMakeLists.txt         |   2 -
 src/graph/node_operators_binary.h    | 151 +++++++++++++++++++++++++--
 src/graph/node_operators_unary.h     |   2 +
 src/models/decoder.h                 |   5 +-
 src/models/transformer.h             |   6 +-
 src/rnn/cells.cpp                    |  17 +++
 src/tensors/cpu/tensor_operators.cpp |   6 +-
 src/tensors/gpu/tensor_operators.cu  |  12 +--
 src/training/graph_group.h           |   2 +
 10 files changed, 180 insertions(+), 25 deletions(-)

diff --git a/examples b/examples
index 33674006..c19b7814 160000
--- a/examples
+++ b/examples
@@ -1 +1 @@
-Subproject commit 336740065d9c23e53e912a1befff18981d9d27ab
+Subproject commit c19b7814d71febf1053bd93af6ac314b46204092
diff --git a/src/3rd_party/CMakeLists.txt b/src/3rd_party/CMakeLists.txt
index b40df585..e100a619 100644
--- a/src/3rd_party/CMakeLists.txt
+++ b/src/3rd_party/CMakeLists.txt
@@ -87,8 +87,6 @@ if(CUDA_FOUND)
     set(GENCODE "${GENCODE} -gencode=arch=compute_70,code=sm_70")
   endif(COMPILE_CUDA_SM70)

-  message(${GENCODE})
-
   # install nccl in ${CMAKE_BINARY_DIR}/local similar to /usr/local linux installation
   ExternalProject_Add(nccl_install
     SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/nccl
diff --git a/src/graph/node_operators_binary.h b/src/graph/node_operators_binary.h
index 3158ee7a..d12c2481 100755
--- a/src/graph/node_operators_binary.h
+++ b/src/graph/node_operators_binary.h
@@ -63,7 +63,6 @@ public:
     // df/dB += alpha * dot(op(A).T, D)
     // beta set to 1.0 in gemm, C = alpha * dot(op(A), op(B)) + beta * C
     // to sum gradients from different graph parts
-
     if(!transA_ && transB_)
       return {NodeOp(Prod(child(0)->grad(),
                           adj_,
@@ -130,6 +129,29 @@ public:

   const std::string type() override { return "dot"; }

+  virtual size_t hash() override {
+    size_t seed = NaryNodeOp::hash();
+    util::hash_combine(seed, transA_);
+    util::hash_combine(seed, transB_);
+    util::hash_combine(seed, scalar_);
+    return seed;
+  }
+
+  virtual bool equal(Expr node) override {
+    if(!NaryNodeOp::equal(node))
+      return false;
+    auto cnode = std::dynamic_pointer_cast<DotNodeOp>(node);
+    if(!cnode)
+      return false;
+    if(transA_ != cnode->transA_)
+      return false;
+    if(transB_ != cnode->transB_)
+      return false;
+    if(scalar_ != cnode->scalar_)
+      return false;
+    return true;
+  }
+
   const std::string color() override { return "orange"; }
 };

@@ -274,6 +296,30 @@ public:
   }

   const std::string type() override { return "affine"; }
+
+  virtual size_t hash() override {
+    size_t seed = NaryNodeOp::hash();
+    util::hash_combine(seed, transA_);
+    util::hash_combine(seed, transB_);
+    util::hash_combine(seed, scalar_);
+    return seed;
+  }
+
+  virtual bool equal(Expr node) override {
+    if(!NaryNodeOp::equal(node))
+      return false;
+    auto cnode = std::dynamic_pointer_cast<AffineNodeOp>(node);
+    if(!cnode)
+      return false;
+    if(transA_ != cnode->transA_)
+      return false;
+    if(transB_ != cnode->transB_)
+      return false;
+    if(scalar_ != cnode->scalar_)
+      return false;
+    return true;
+  }
+
 };

 class DotBatchedNodeOp : public NaryNodeOp {
@@ -402,6 +448,29 @@ public:

   const std::string type() override { return "bdot"; }

+  virtual size_t hash() override {
+    size_t seed = NaryNodeOp::hash();
+    util::hash_combine(seed, transA_);
+    util::hash_combine(seed, transB_);
+    util::hash_combine(seed, scalar_);
+    return seed;
+  }
+
+  virtual bool equal(Expr node) override {
+    if(!NaryNodeOp::equal(node))
+      return false;
+    auto cnode = std::dynamic_pointer_cast<DotBatchedNodeOp>(node);
+    if(!cnode)
+      return false;
+    if(transA_ != cnode->transA_)
+      return false;
+    if(transB_ != cnode->transB_)
+      return false;
+    if(scalar_ != cnode->scalar_)
+      return false;
+    return true;
+  }
+
   const std::string color() override { return "orange"; }
 };

@@ -443,18 +512,42 @@ public:
   }

   NodeOps backwardOps() override {
-    return {nullptr, // can't backprop into the sparse matrix (the gradient is dense)
-            nullptr,
-            nullptr,
-            NodeOp(CSRProd(child(3)->grad(), // child(3) = D
-                           graph()->allocator(),
-                           child(0)->val(), child(1)->val(), child(2)->val(), // children(0..2) = A
-                           adj_,
-                           /*transS=*/!transS_, /*swapOperands=*/swapOperands_, /*beta=*/1))};
+    return { nullptr, // can't backprop into the sparse matrix (the gradient is dense)
+             nullptr,
+             nullptr,
+             NodeOp(CSRProd(child(3)->grad(), // child(3) = D
+                            graph()->allocator(),
+                            child(0)->val(), child(1)->val(), child(2)->val(), // children(0..2) = A
+                            adj_,
+                            /*transS=*/!transS_, /*swapOperands=*/swapOperands_, /*beta=*/1))};
   }

   const std::string type() override { return "csr_dot"; }

+  virtual size_t hash() override {
+    size_t seed = NaryNodeOp::hash();
+    for(auto s : shape())
+      util::hash_combine(seed, s);
+    util::hash_combine(seed, transS_);
+    util::hash_combine(seed, swapOperands_);
+    return seed;
+  }
+
+  virtual bool equal(Expr node) override {
+    if(!NaryNodeOp::equal(node))
+      return false;
+    auto cnode = std::dynamic_pointer_cast<CSRDotNodeOp>(node);
+    if(!cnode)
+      return false;
+    if(transS_ != cnode->transS_)
+      return false;
+    if(shape() != cnode->shape())
+      return false;
+    if(swapOperands_ != cnode->swapOperands_)
+      return false;
+    return true;
+  }
+
   const std::string color() override { return "orange"; }
 };

@@ -883,6 +976,29 @@ struct CmpNodeOp : public ElementBinaryNodeOp {
     ABORT("Should not get here??");
   }

+  virtual size_t hash() override {
+    if(!hash_) {
+      size_t seed = NaryNodeOp::hash();
+      util::hash_combine(seed, cmp_);
+      util::hash_combine(seed, not_);
+      hash_ = seed;
+    }
+    return hash_;
+  }
+
+  virtual bool equal(Expr node) override {
+    if(!NaryNodeOp::equal(node))
+      return false;
+    auto cnode = std::dynamic_pointer_cast<CmpNodeOp>(node);
+    if(!cnode)
+      return false;
+    if(cmp_ != cnode->cmp_)
+      return false;
+    if(not_ != cnode->not_)
+      return false;
+    return true;
+  }
+
 private:
   int cmp_;  // -1: less; 0: equal; 1: greater
   bool not_; // invert result if true
@@ -1015,6 +1131,23 @@ public:

   const std::string type() override { return "layer_normalization"; }

+  virtual size_t hash() override {
+    size_t seed = NaryNodeOp::hash();
+    util::hash_combine(seed, eps_);
+    return seed;
+  }
+
+  virtual bool equal(Expr node) override {
+    if(!NaryNodeOp::equal(node))
+      return false;
+    auto cnode = std::dynamic_pointer_cast<LayerNormalizationOp>(node);
+    if(!cnode)
+      return false;
+    if(eps_ != cnode->eps_)
+      return false;
+    return true;
+  }
+
 private:
   float eps_;
 };
diff --git a/src/graph/node_operators_unary.h b/src/graph/node_operators_unary.h
index b3f67bd6..a52ecf0e 100755
--- a/src/graph/node_operators_unary.h
+++ b/src/graph/node_operators_unary.h
@@ -993,6 +993,8 @@ struct ShiftNodeOp : public UnaryNodeOp {
     if(!cnode)
       return false;
     if(shift_ != cnode->shift_)
+      return false;
+    if(padValue_ != cnode->padValue_)
       return false;
     return true;
   }
diff --git a/src/models/decoder.h b/src/models/decoder.h
index bbad32c3..725573c1 100755
--- a/src/models/decoder.h
+++ b/src/models/decoder.h
@@ -47,13 +47,12 @@ public:
     ABORT_IF(shortlist_, "How did a shortlist make it into training?");

-    const Words& data = subBatch->data();
-    Expr yData = graph_->indices(toWordIndexVector(data));
-
     auto yShifted = shift(y, {1, 0, 0});

     state->setTargetHistoryEmbeddings(yShifted);
     state->setTargetMask(yMask);
+
+    const Words& data = subBatch->data();
     state->setTargetWords(data);
   }
diff --git a/src/models/transformer.h b/src/models/transformer.h
index 2211892a..84f1cd65 100755
--- a/src/models/transformer.h
+++ b/src/models/transformer.h
@@ -459,7 +459,7 @@ public:
                          int /*startPos*/) const {
     float dropoutRnn = inference_ ? 0.f : opt<float>("dropout-rnn");

-    if(!perLayerRnn[prefix]) // lazily created and cache RNNs in the docoder to avoid costly recreation @TODO: turn this into class members
+    if(!perLayerRnn[prefix]) // lazily create and cache RNNs in the decoder to avoid costly recreation @TODO: turn this into class members
       perLayerRnn[prefix] = rnn::rnn(
           "type", opt<std::string>("dec-cell"),
           "prefix", prefix,
@@ -820,6 +820,10 @@ public:
     output_->clear();
     cache_.clear();
     alignments_.clear();
+    perLayerRnn_.clear(); // this needs to be cleared between batches.
+    // @TODO: figure out how to detect stale nodes i.e. nodes that are referenced,
+    // but where underlying memory has been deallocated by dropping all tensors
+    // from a TensorAllocator object. This can happen during ExpressionGraph::clear()
   }
 };
diff --git a/src/rnn/cells.cpp b/src/rnn/cells.cpp
index e9467544..3d7eee21 100644
--- a/src/rnn/cells.cpp
+++ b/src/rnn/cells.cpp
@@ -43,6 +43,23 @@ struct GRUFastNodeOp : public NaryNodeOp {
   const std::string type() override { return "GRU-ops"; }

   const std::string color() override { return "yellow"; }
+
+  virtual size_t hash() override {
+    size_t seed = NaryNodeOp::hash();
+    util::hash_combine(seed, final_);
+    return seed;
+  }
+
+  virtual bool equal(Expr node) override {
+    if(!NaryNodeOp::equal(node))
+      return false;
+    auto cnode = std::dynamic_pointer_cast<GRUFastNodeOp>(node);
+    if(!cnode)
+      return false;
+    if(final_ != cnode->final_)
+      return false;
+    return true;
+  }
 };

 Expr gruOps(const std::vector<Expr>& nodes, bool final) {
diff --git a/src/tensors/cpu/tensor_operators.cpp b/src/tensors/cpu/tensor_operators.cpp
index caad6337..91e058b8 100755
--- a/src/tensors/cpu/tensor_operators.cpp
+++ b/src/tensors/cpu/tensor_operators.cpp
@@ -1050,7 +1050,7 @@ void LayerNormalization(Tensor out_,
       sqSum += ex * ex;
     }

-    float sigma = std::sqrt(eps + sqSum / cols);
+    float sigma = std::sqrt(sqSum / cols + eps);

 #pragma omp simd
     for(int i = 0; i < cols; ++i) {
@@ -1112,7 +1112,7 @@ void LayerNormalizationGrad(Tensor gradX_,
         sum_sqr += ex * ex;
       }

-      float sigma = std::sqrt(eps + sum_sqr / cols);
+      float sigma = std::sqrt(sum_sqr / cols + eps);
 #pragma omp simd
       for(size_t i = 0; i < cols; ++i) {
         float grad_x = 0.f;
@@ -1154,7 +1154,7 @@ void LayerNormalizationGrad(Tensor gradX_,
         sum_sqr += ex * ex;
       }

-      float sigma = std::sqrt(eps + sum_sqr / cols);
+      float sigma = std::sqrt(sum_sqr / cols + eps);
 #pragma omp simd
       for(size_t i = 0; i < cols; ++i) {
         float grad_x = 0.f;
diff --git a/src/tensors/gpu/tensor_operators.cu b/src/tensors/gpu/tensor_operators.cu
index 32564350..eefcc405 100755
--- a/src/tensors/gpu/tensor_operators.cu
+++ b/src/tensors/gpu/tensor_operators.cu
@@ -1925,7 +1925,7 @@ __global__ void gLNormalization(T* out,
       len = (len + 1) >> 1;
     }
     __syncthreads();
-    AccType sigma = functional::Ops<AccType>::sqrt(_sqSum[0] / N); // all AccType
+    AccType sigma = functional::Ops<AccType>::sqrt(_sqSum[0] / N + eps); // all AccType
     __syncthreads();

     for(int tid = 0; tid < cols; tid += blockDim.x) {
@@ -1934,7 +1934,7 @@ __global__ void gLNormalization(T* out,
       AccType gammav = (AccType)gamma[id];
       AccType xv = (AccType)xRow[id];
       AccType betav = beta ? (AccType)beta[id] : (AccType)0.f;
-      AccType lv = (xv - mean) / (sigma + eps);
+      AccType lv = (xv - mean) / sigma;
       AccType y = gammav * lv + betav;
       yRow[id] = (T)y;
     }
@@ -2022,7 +2022,7 @@ __global__ void gLayerNormalizationGrad(T* gradX,
       AccType betav = beta ? (AccType)beta[id] : (AccType)0.f;
       AccType gammav = (AccType)gamma[id];
       AccType adjv = adjRow[id];
-      AccType lv = (yv - betav) / (gammav + eps); // go back to LN(x) from scaled and shifted version for accumulation
+      AccType lv = (yv - betav) / gammav; // go back to LN(x) from scaled and shifted version for accumulation

       sum_x[threadIdx.x] += xv;
       sum_adj_l[threadIdx.x] += adjv * lv;
@@ -2064,7 +2064,7 @@ __global__ void gLayerNormalizationGrad(T* gradX,
       len = (len + 1) >> 1;
     }
     __syncthreads();
-    AccType sigma = functional::Ops<AccType>::sqrt(sum_sqr[0] / N);
+    AccType sigma = functional::Ops<AccType>::sqrt(sum_sqr[0] / N + eps);
     __syncthreads();

     // Jacobian of layer norm
@@ -2078,10 +2078,10 @@ __global__ void gLayerNormalizationGrad(T* gradX,
       AccType xv = xRow[id];
       AccType gammav = (AccType)gamma[id];
       AccType adjv = adjRow[id];
-      AccType lv = (xv - mean) / (sigma + eps);
+      AccType lv = (xv - mean) / sigma;

       AccType gradLv = N * adjv - lv * sum_adj_l[0] - sum_adj[0];
-      gradLv /= N * (sigma + eps); // eps has to be inside parentheses for correct gradient
+      gradLv /= N * sigma;

       AccType gradXv = gammav * gradLv;
diff --git a/src/training/graph_group.h b/src/training/graph_group.h
index 37c848b6..265f9785 100644
--- a/src/training/graph_group.h
+++ b/src/training/graph_group.h
@@ -109,6 +109,8 @@ public:
       auto cost = model->build(graph, batch);
       fits = graph->fits();

+      LOG(debug, "[batching] length: {} - size: {} - fits: {}", lengths[0], current, fits);
+
       if(fits) {
         stats->add(batch, multiplier);
         start = current + 1;
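
For context on why every node type needs matching `hash()` and `equal()` overrides: Marian's expression graph reuses an existing node when a candidate node hashes and compares equal to it, so a subclass that leaves members such as `transA_`, `transB_` or `scalar_` out of these functions can be silently merged with a genuinely different operation. Below is a minimal standalone sketch of that lookup pattern; the names (`FakeNode`, the cache map) are illustrative and not part of Marian.

// Standalone sketch (not Marian code): node lookup by hash() plus equal(),
// the same pattern that the overrides added in this patch feed into.
#include <cstddef>
#include <functional>
#include <iostream>
#include <unordered_map>

struct FakeNode {                        // illustrative stand-in for a graph node
  bool transA, transB;
  float scalar;

  size_t hash() const {
    size_t seed = 0;
    auto combine = [&seed](size_t v) {   // same idea as util::hash_combine
      seed ^= v + 0x9e3779b9 + (seed << 6) + (seed >> 2);
    };
    combine(std::hash<bool>()(transA));
    combine(std::hash<bool>()(transB));
    combine(std::hash<float>()(scalar));
    return seed;
  }

  bool equal(const FakeNode& other) const {
    return transA == other.transA && transB == other.transB && scalar == other.scalar;
  }
};

int main() {
  // a and b differ only in transB; with complete hash()/equal() they stay
  // distinct, whereas omitting transB would let a cache silently merge them.
  FakeNode a{false, true, 1.f}, b{false, false, 1.f};
  std::unordered_multimap<size_t, FakeNode> cache;

  for(const FakeNode& n : {a, b}) {
    bool found = false;
    auto range = cache.equal_range(n.hash());
    for(auto it = range.first; it != range.second; ++it)
      found = found || it->second.equal(n);
    if(!found)
      cache.emplace(n.hash(), n);
  }
  std::cout << "distinct cached nodes: " << cache.size() << "\n"; // prints 2
}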