mirror of
https://github.com/marian-nmt/marian.git
synced 2024-11-04 14:04:24 +03:00
Improve checks on transformer cache (#881)
* Fix caching in transformer attention
* Move hash specialization
* Swap comments to doxygen
* Include string header
This commit is contained in:
parent
b64e258bda
commit
894a07ad5b
@ -30,6 +30,7 @@ set(MARIAN_SOURCES
|
||||
common/filesystem.cpp
|
||||
common/file_stream.cpp
|
||||
common/file_utils.cpp
|
||||
common/hash.cpp
|
||||
common/signal_handling.cpp
|
||||
common/types.cpp
|
||||
|
||||
|
12
src/common/hash.cpp
Normal file
12
src/common/hash.cpp
Normal file
@ -0,0 +1,12 @@
|
||||
#include <string>
|
||||
|
||||
#include "hash.h"
|
||||
#include "common/shape.h"
|
||||
|
||||
namespace std {
|
||||
size_t hash<pair<string, marian::Shape>>::operator()(pair<string, marian::Shape> const& k) const {
|
||||
size_t seed = hash<string>{}(k.first);
|
||||
marian::util::hash_combine(seed, k.second.hash());
|
||||
return seed;
|
||||
}
|
||||
} // namespace std
|
@ -7,16 +7,18 @@ namespace util {
|
||||
|
||||
template <class T> using hash = std::hash<T>;
|
||||
|
||||
// This combinator is based on boost::hash_combine, but uses
|
||||
// std::hash as the hash implementation. Used as a drop-in
|
||||
// replacement for boost::hash_combine.
|
||||
/**
|
||||
* Combine hash values.
|
||||
* This combinator is based on boost::hash_combine, but uses std::hash as the hash implementation.
|
||||
* Used as a drop-in replacement for boost::hash_combine.
|
||||
*/
|
||||
template <class T, class HashType = std::size_t>
|
||||
inline void hash_combine(HashType& seed, T const& v) {
|
||||
hash<T> hasher;
|
||||
seed ^= static_cast<HashType>(hasher(v)) + 0x9e3779b9 + (seed<<6) + (seed>>2);
|
||||
}
|
||||
|
||||
// Hash a whole chunk of memory, mostly used for diagnostics
|
||||
/** Hash a whole chunk of memory. */
|
||||
template <class T, class HashType = std::size_t>
|
||||
inline HashType hashMem(const T* beg, size_t len) {
|
||||
HashType seed = 0;
|
||||
@ -25,5 +27,17 @@ inline HashType hashMem(const T* beg, size_t len) {
|
||||
return seed;
|
||||
}
|
||||
|
||||
}
|
||||
} // namespace util
|
||||
|
||||
struct Shape; // Forward declaration
|
||||
} // namespace marian
|
||||
|
||||
namespace std {
|
||||
/**
|
||||
* std::hash specialization for the string-shape pair used as a cache key in transformer.h.
|
||||
*/
|
||||
template <>
|
||||
struct hash<pair<string, marian::Shape>> {
|
||||
size_t operator()(pair<string, marian::Shape> const& k) const;
|
||||
};
|
||||
}
|
||||
|
@ -5,6 +5,7 @@
|
||||
|
||||
#include "marian.h"
|
||||
|
||||
#include "common/hash.h"
|
||||
#include "layers/constructors.h"
|
||||
#include "models/decoder.h"
|
||||
#include "models/encoder.h"
|
||||
@ -28,7 +29,7 @@ class Transformer : public EncoderOrDecoderBase {
|
||||
|
||||
protected:
|
||||
using Base::options_; using Base::inference_; using Base::batchIndex_; using Base::graph_;
|
||||
std::unordered_map<std::string, Expr> cache_; // caching transformation of the encoder that should not be created again
|
||||
std::unordered_map<std::pair<std::string, Shape>, Expr> cache_; // caching transformation of the encoder that should not be created again
|
||||
mutable/*lazy*/ std::vector<float> sinusoidalEmbeddingsFreq_, sinusoidalEmbeddingsOffs_; // cached contributions to sinusoidal embeddings
|
||||
|
||||
bool depthScaling_{false}; // As recommended in the GPT-2 paper, down-scale layer weights by a factor of 1 / sqrt(depth);
|
||||
@ -289,26 +290,26 @@ public:
|
||||
// Caching transformation of the encoder that should not be created again.
|
||||
// @TODO: set this automatically by memoizing encoder context and
|
||||
// memoization propagation (short-term)
|
||||
if (cache // if caching
|
||||
&& cache_.count(prefix + "_keys") > 0 // and the keys expression has been seen
|
||||
&& cache_[prefix + "_keys"]->shape().elements() == keys->shape().elements()) { // and the underlying element size did not change
|
||||
kh = cache_[prefix + "_keys"]; // then return cached tensor
|
||||
}
|
||||
else {
|
||||
std::pair<std::unordered_map<std::pair<std::string, Shape>, Expr>::iterator, bool> cache_result;
|
||||
if (cache
|
||||
&& !((cache_result = cache_.insert(std::pair<std::pair<std::string, Shape>, Expr>({prefix + "_keys", keys->shape()}, kh))).second)
|
||||
) {
|
||||
kh = cache_result.first->second;
|
||||
} else {
|
||||
int dimKeys = keys->shape()[-1]; // different than dimModel when using lemma and factors combined with concatenation
|
||||
auto Wk = graph_->param(prefix + "_Wk", {dimKeys, dimModel}, inits::glorotUniform(true, true, depthScaling_ ? 1.f / sqrtf((float)depth_) : 1.f));
|
||||
auto bk = graph_->param(prefix + "_bk", {1, dimModel}, inits::zeros());
|
||||
|
||||
kh = affine(keys, Wk, bk); // [-4: beam depth, -3: batch size, -2: max length, -1: vector dim]
|
||||
kh = SplitHeads(kh, dimHeads); // [-4: batch size, -3: num heads, -2: max length, -1: split vector dim]
|
||||
cache_[prefix + "_keys"] = kh;
|
||||
if (cache) cache_result.first->second = kh;
|
||||
}
|
||||
|
||||
Expr vh;
|
||||
if (cache
|
||||
&& cache_.count(prefix + "_values") > 0
|
||||
&& cache_[prefix + "_values"]->shape().elements() == values->shape().elements()) {
|
||||
vh = cache_[prefix + "_values"];
|
||||
&& !((cache_result = cache_.insert(std::pair<std::pair<std::string, Shape>, Expr>({prefix + "_values", values->shape()}, vh))).second)
|
||||
) {
|
||||
vh = cache_result.first->second;
|
||||
} else {
|
||||
int dimValues = values->shape()[-1]; // different than dimModel when using lemma and factors combined with concatenation
|
||||
auto Wv = graph_->param(prefix + "_Wv", {dimValues, dimModel}, inits::glorotUniform(true, true, depthScaling_ ? 1.f / sqrtf((float)depth_) : 1.f));
|
||||
@ -316,7 +317,7 @@ public:
|
||||
|
||||
vh = affine(values, Wv, bv); // [-4: batch size, -3: num heads, -2: max length, -1: split vector dim]
|
||||
vh = SplitHeads(vh, dimHeads);
|
||||
cache_[prefix + "_values"] = vh;
|
||||
if (cache) cache_result.first->second = vh;
|
||||
}
|
||||
|
||||
int dimBeam = q->shape()[-4];
|
||||
|
Loading…
Reference in New Issue
Block a user