Merged PR 10996: A number of smaller changes and clean-up

* Downgrade NCCL to 2.3.7 as 2.4.2 is buggy (hangs with larger models)
* Actually enable gradient-checkpointing, previous option was inactive
* Clean up training-only options that should not be displayed for the decoder and scorer
* Re-enable conversion to FP16 if element types are compatible (belong to the same type class)
* Fix a few typos and make log messages more verbose
Martin Junczys-Dowmunt 2020-01-05 23:16:13 +00:00
parent 24f062cd27
commit 88d9980589
15 changed files with 72 additions and 54 deletions

View File

@@ -53,6 +53,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 - Compilation with CUDA 10.1
 ### Changed
+- Downgrade NCCL to 2.3.7 as 2.4.2 is buggy (hangs with larger models)
 - Return error signal on SIGTERM
 - Dropped support for CUDA 8.0, CUDA 9.0 is now minimal requirement
 - Removed autotuner for now, will be switched back on later

View File

@@ -1 +1 @@
-v1.8.36
+v1.8.37

src/3rd_party/nccl vendored

@@ -1 +1 @@
-Subproject commit 8e3a3f7c5b520babff49cec54a866fa3eda3a3b6
+Subproject commit b56650c7f59b8cd40d18809784a6d6be38ef8acb

View File

@@ -22,7 +22,7 @@ int main(int argc, char** argv) {
 " ./marian-conv -f model.npz -t model.bin --gemm-type packed16");
 cli->add<std::string>("--from,-f", "Input model", "model.npz");
 cli->add<std::string>("--to,-t", "Output model", "model.bin");
-cli->add<std::string>("--gemm-type,-g", "GEMM Type to be used: float32, packed16, packed8", "float32");
+cli->add<std::string>("--gemm-type,-g", "GEMM Type to be used: float32, packed16, packed8avx2, packed8avx512", "float32");
 cli->parse(argc, argv);
 options->merge(config);
 }
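Following the tool's own usage string above, a conversion to one of the new AVX2 packed 8-bit formats would look like ./marian-conv -f model.npz -t model.bin --gemm-type packed8avx2 (the model file names here are the placeholder defaults from the help text).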

View File

@@ -504,7 +504,7 @@ void ConfigParser::addOptionsTraining(cli::CLIWrapper& cli) {
 true);
 // add ULR settings
-addSuboptionsULR(cli);
+addSuboptionsULR(cli);
 cli.add<std::vector<std::string>>("--task",
 "Use predefined set of options. Possible values: transformer, transformer-big");
@@ -638,8 +638,10 @@ void ConfigParser::addOptionsTranslation(cli::CLIWrapper& cli) {
 "Noise output layer with gumbel noise",
 false);
+#if 0 // @TODO: Ask Hany if there are any decoding-time options
 // add ULR settings
 addSuboptionsULR(cli);
+#endif
 cli.switchGroup(previous_group);
 // clang-format on
@@ -749,29 +751,31 @@ void ConfigParser::addSuboptionsBatching(cli::CLIWrapper& cli) {
 "Sorting strategy for maxi-batch: none, src, trg (not available for decoder)",
 defaultMaxiBatchSort);
-cli.add<bool>("--shuffle-in-ram",
-"Keep shuffled corpus in RAM, do not write to temp file");
-// @TODO: Consider making the next two options options of the vocab instead, to make it more local in scope.
-cli.add<size_t>("--all-caps-every",
-"When forming minibatches, preprocess every Nth line on the fly to all-caps. Assumes UTF-8");
-cli.add<size_t>("--english-title-case-every",
-"When forming minibatches, preprocess every Nth line on the fly to title-case. Assumes English (ASCII only)");
-cli.add<int>("--mini-batch-words-ref",
-"If given, the following hyper parameters are adjusted as-if we had this mini-batch size: "
-"--learn-rate, --optimizer-params, --exponential-smoothing, --mini-batch-warmup");
-cli.add<std::string/*SchedulerPeriod*/>("--mini-batch-warmup",
-"Linear ramp-up of MB size, up to this #updates (append 't' for up to this #target labels). "
-"Auto-adjusted to --mini-batch-words-ref if given",
-{"0"});
-cli.add<bool>("--mini-batch-track-lr",
-"Dynamically track mini-batch size inverse to actual learning rate (not considering lr-warmup)");
-cli.add<size_t>("--mini-batch-overstuff",
-"[experimental] Stuff this much more data into a minibatch, but scale down the LR and progress counter",
-1);
-cli.add<size_t>("--mini-batch-understuff",
-"[experimental] Break each batch into this many updates",
-1);
+if(mode_ == cli::mode::training) {
+  cli.add<bool>("--shuffle-in-ram",
+      "Keep shuffled corpus in RAM, do not write to temp file");
+  // @TODO: Consider making the next two options options of the vocab instead, to make it more local in scope.
+  cli.add<size_t>("--all-caps-every",
+      "When forming minibatches, preprocess every Nth line on the fly to all-caps. Assumes UTF-8");
+  cli.add<size_t>("--english-title-case-every",
+      "When forming minibatches, preprocess every Nth line on the fly to title-case. Assumes English (ASCII only)");
+  cli.add<int>("--mini-batch-words-ref",
+      "If given, the following hyper parameters are adjusted as-if we had this mini-batch size: "
+      "--learn-rate, --optimizer-params, --exponential-smoothing, --mini-batch-warmup");
+  cli.add<std::string/*SchedulerPeriod*/>("--mini-batch-warmup",
+      "Linear ramp-up of MB size, up to this #updates (append 't' for up to this #target labels). "
+      "Auto-adjusted to --mini-batch-words-ref if given",
+      {"0"});
+  cli.add<bool>("--mini-batch-track-lr",
+      "Dynamically track mini-batch size inverse to actual learning rate (not considering lr-warmup)");
+  cli.add<size_t>("--mini-batch-overstuff",
+      "[experimental] Stuff this much more data into a minibatch, but scale down the LR and progress counter",
+      1);
+  cli.add<size_t>("--mini-batch-understuff",
+      "[experimental] Break each batch into this many updates",
+      1);
+}
 // clang-format on
 }

View File

@@ -223,7 +223,8 @@ enum class TypeClass : size_t {
 avx2_type = 0x1000, // processor-specific layout for avx2, currently used for FBGEMM only
 avx512_type = 0x2000, // processor-specific layout for avx512, currently used for FBGEMM only
-size_mask = 0x00FF
+size_mask = 0x00FF,
+class_mask = 0xFF00
 };
 constexpr inline size_t operator+(TypeClass typeClass, size_t val) {
@@ -260,6 +261,10 @@ static inline size_t operator&(TypeClass typeClass, Type type) {
 return (size_t)typeClass & (size_t)type;
 }
+static inline bool isSameTypeClass(Type type1, Type type2) {
+  return (TypeClass::class_mask & type1) == (TypeClass::class_mask & type2);
+}
 static inline size_t sizeOf(Type type) {
 return TypeClass::size_mask & type;
 }
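The mask arithmetic above is easiest to see with concrete values. Below is a small standalone sketch, not Marian source: the two example type encodings follow the class-in-high-byte, size-in-low-byte convention established by the enum, while everything else about the real types.h (more classes and types, operator overloads) is elided.

// toy_types.cpp -- illustrative sketch only, not Marian source
#include <cstddef>
#include <cstdio>

enum class TypeClass : size_t {
  float_type = 0x0100,
  size_mask  = 0x00FF, // low byte: width of the type in bytes
  class_mask = 0xFF00  // high byte: type class
};

// A Type packs its class into the high byte and its size into the low byte,
// e.g. a 2-byte and a 4-byte float:
enum class Type : size_t {
  float16 = (size_t)TypeClass::float_type + 2u,
  float32 = (size_t)TypeClass::float_type + 4u
};

static bool isSameTypeClass(Type a, Type b) {
  return ((size_t)a & (size_t)TypeClass::class_mask)
      == ((size_t)b & (size_t)TypeClass::class_mask);
}

static size_t sizeOf(Type t) {
  return (size_t)t & (size_t)TypeClass::size_mask;
}

int main() {
  // float16 and float32 share the float class, so conversion between them is allowed:
  std::printf("same class: %d\n", isSameTypeClass(Type::float16, Type::float32)); // prints 1
  std::printf("sizeOf(float16): %zu\n", sizeOf(Type::float16));                   // prints 2
}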

View File

@@ -13,17 +13,17 @@ namespace data {
 Corpus::Corpus(Ptr<Options> options, bool translate /*= false*/)
 : CorpusBase(options, translate),
-shuffleInRAM_(options_->get<bool>("shuffle-in-ram")),
-allCapsEvery_(options_->get<size_t>("all-caps-every")),
-titleCaseEvery_(options_->get<size_t>("english-title-case-every")) {}
+shuffleInRAM_(options_->get<bool>("shuffle-in-ram", false)),
+allCapsEvery_(options_->get<size_t>("all-caps-every", 0)),
+titleCaseEvery_(options_->get<size_t>("english-title-case-every", 0)) {}
 Corpus::Corpus(std::vector<std::string> paths,
 std::vector<Ptr<Vocab>> vocabs,
 Ptr<Options> options)
 : CorpusBase(paths, vocabs, options),
-shuffleInRAM_(options_->get<bool>("shuffle-in-ram")),
-allCapsEvery_(options_->get<size_t>("all-caps-every")),
-titleCaseEvery_(options_->get<size_t>("english-title-case-every")) {}
+shuffleInRAM_(options_->get<bool>("shuffle-in-ram", false)),
+allCapsEvery_(options_->get<size_t>("all-caps-every", 0)),
+titleCaseEvery_(options_->get<size_t>("english-title-case-every", 0)) {}
 void Corpus::preprocessLine(std::string& line, size_t streamId) {
 if (allCapsEvery_ != 0 && pos_ % allCapsEvery_ == 0 && !inference_) {
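The explicit fallback values matter because these three options are now registered only for training (see the config_parser.cpp change above): when the decoder or scorer constructs a Corpus, the keys are absent, and the defaults (false and 0) simply disable shuffle-in-RAM and the all-caps/title-case preprocessing.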

View File

@@ -354,7 +354,7 @@ public:
 const Ptr<inits::NodeInitializer>& init,
 const Type elementType,
 bool fixed = false) {
-// since this param is called with out a specified type, we assume defaultElementType but allow to check for a different type
+// this param is called with a specified type
 return param(pname, shape, init, elementType, fixed, /*typeSpecified=*/true);
 }
@@ -362,7 +362,7 @@ public:
 const Shape& shape,
 const Ptr<inits::NodeInitializer>& init,
 bool fixed = false) {
-// since this param is called with out a specified type, we assume defaultElementType but allow to check for a different type
+// since this param is called without a specified type, we assume defaultElementType but allow to check for a different type
 return param(pname, shape, init, defaultElementType_, fixed, /*typeSpecified=*/false);
 }
@@ -497,7 +497,12 @@ public:
 // skip over special parameters starting with "special:"
 if(pName.substr(0, 8) == "special:")
 continue;
-param(pName, item.shape, inits::fromItem(item), item.type, /*fixed=*/false);
+// if during loading the loaded type is of the same type class as the default element type, allow conversion;
+// otherwise keep the loaded type. This is used when e.g. loading a float32 model as a float16 model as both
+// have type class TypeClass::float_type.
+auto loadElementType = isSameTypeClass(item.type, defaultElementType_) ? defaultElementType_ : item.type;
+param(pName, item.shape, inits::fromItem(item), loadElementType, /*fixed=*/false);
 }
 if(markReloaded)
 setReloaded(true);
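Concretely, when the graph's default element type is float16 (e.g. set from the first --precision entry, as in the scorer change below), a stored float32 parameter is converted on load since both belong to TypeClass::float_type, while a parameter from a different class, such as the processor-specific packed layouts, keeps its stored type.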

View File

@@ -617,10 +617,10 @@ Expr unlikelihood(Expr logits, Expr indices) {
 int dimBatch = logits->shape()[-2];
 int dimTime = logits->shape()[-3];
-// @TODO: fix the outside of this function in decoder.h etc.
+// @TODO: fix this outside of this function in decoder.h etc.
 auto indicesWithLayout = reshape(indices, {1, dimTime, dimBatch, 1});
-// This is currently implemented with mutliple ops, might be worth doing a special operation like for cross_entropy
+// This is currently implemented with multiple ops, might be worth doing a special operation like for cross_entropy
 return -log(gather(1.f - softmax(logits), /*axis=*/-1, indicesWithLayout));
 }
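That is, the function returns the unlikelihood term -log(1 - p(y)) for each indexed token y, with p taken from the softmax over the logits.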

View File

@@ -24,7 +24,11 @@ namespace marian {
 ABORT_IF(empty(), "Attempted to read out logits on empty Logits object");
 auto firstLogits = logits_.front()->loss();
-ABORT_IF(labels.size() * firstLogits->shape()[-1] != firstLogits->shape().elements(), "Labels not matching logits shape??");
+ABORT_IF(labels.size() * firstLogits->shape()[-1] != firstLogits->shape().elements(),
+         "Labels not matching logits shape ({} != {}, {})??",
+         labels.size() * firstLogits->shape()[-1],
+         firstLogits->shape().elements(),
+         firstLogits->shape());
 // base case (no factors)
 if (!factoredVocab_) {
View File

@@ -534,6 +534,7 @@ public:
 layer, // values
 layerMask); // [batch size, num heads broadcast=1, max length broadcast=1, max length]
 layer = LayerFFN(prefix_ + "_l" + std::to_string(i) + "_ffn", layer);
+checkpoint(layer); // sets a manually specified checkpoint if gradient checkpointing is enabled, does nothing otherwise.
 }
 // restore organization of batch and time steps. This is currently required
@@ -700,6 +701,9 @@ public:
 encoderContexts.push_back(encoderContext);
 encoderMasks.push_back(encoderMask);
+checkpoint(encoderContext);
+checkpoint(encoderMask);
 }
 rnn::States prevDecoderStates = state->getStates();
@@ -735,6 +739,8 @@ public:
 ABORT("Unknown auto-regressive layer type in transformer decoder {}",
 layerType);
+checkpoint(query);
 // source-target attention
 // Iterate over multiple encoders and simply stack the attention blocks
 if(encoderContexts.size() > 0) {
@@ -771,10 +777,14 @@ public:
 }
 }
+checkpoint(query);
 // remember decoder state
 decoderStates.push_back(decoderState);
 query = LayerFFN(prefix_ + "_l" + layerNo + "_ffn", query); // [-4: beam depth=1, -3: batch size, -2: max length, -1: vector dim]
+checkpoint(query);
 }
 auto decoderContext = transposeTimeBatch(query); // [-4: beam depth=1, -3: max length, -2: batch size, -1: vector dim]
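The checkpoint(...) calls above only mark nodes; the memory saving comes from the backward pass keeping just the marked values and recomputing everything in between on demand. The following toy program, plain C++ and not Marian code, sketches that recompute-on-backward trade-off for a chain of scalar "stages" standing in for the transformer sub-layers:

// toy_checkpointing.cpp -- illustrative sketch only, not Marian source
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <functional>
#include <vector>

int main() {
  // A chain f = g3(g2(g1(g0(x)))) of scalar stages standing in for sub-layers.
  std::vector<std::function<double(double)>> stages = {
      [](double x) { return std::tanh(x); },
      [](double x) { return 2.0 * x; },
      [](double x) { return std::tanh(x); },
      [](double x) { return 0.5 * x; }};
  // derivative of each stage w.r.t. its input
  std::vector<std::function<double(double)>> grads = {
      [](double x) { double t = std::tanh(x); return 1.0 - t * t; },
      [](double)   { return 2.0; },
      [](double x) { double t = std::tanh(x); return 1.0 - t * t; },
      [](double)   { return 0.5; }};

  double x0 = 0.3;
  // Forward pass: keep only every 2nd intermediate value (the "checkpoints")
  // instead of all activations.
  std::vector<double> ckpt = {x0};
  double x = x0;
  for(size_t i = 0; i < stages.size(); ++i) {
    x = stages[i](x);
    if(i % 2 == 1 && i + 1 < stages.size())
      ckpt.push_back(x); // analogous to checkpoint(layer) above
  }

  // Backward pass: walk the checkpoints back to front; for each segment,
  // recompute its inner activations from the checkpoint, then apply the
  // chain rule through the segment.
  double grad = 1.0;
  for(size_t seg = ckpt.size(); seg-- > 0;) {
    size_t begin = seg * 2;
    size_t end = std::min(begin + 2, stages.size());
    std::vector<double> inputs = {ckpt[seg]}; // recomputed inputs of stages begin..end-1
    for(size_t i = begin; i + 1 < end; ++i)
      inputs.push_back(stages[i](inputs.back()));
    for(size_t i = end; i-- > begin;)
      grad *= grads[i](inputs[i - begin]);
  }
  std::printf("df/dx at x=%.2f: %.6f\n", x0, grad); // matches full backprop
}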

View File

@@ -69,11 +69,10 @@ public:
 for(auto device : devices) {
 auto graph = New<ExpressionGraph>(true);
 graph->setDevice(device);
 auto precison = options_->get<std::vector<std::string>>("precision", {"float32"});
 graph->setDefaultElementType(typeFromString(precison[0])); // only use first type, used for parameter type in graph
-graph->setDevice(device);
 graph->getBackend()->setClip(options_->get<float>("clip-gemm"));
 if (device.type == DeviceType::cpu) {
 graph->getBackend()->setOptimized(options_->get<bool>("optimize"));

View File

@@ -18,6 +18,7 @@ AsyncGraphGroup::AsyncGraphGroup(Ptr<Options> config, Ptr<IMPIWrapper> mpi)
 for(auto device : devices_) {
 auto graph = New<ExpressionGraph>();
 graph->setDevice(device);
+graph->setCheckpointing(options_->get<bool>("gradient-checkpointing"));
 graph->getBackend()->setClip(options_->get<float>("clip-gemm"));
 graph->reserveWorkspaceMB(options_->get<size_t>("workspace"));
 graphs_.push_back(graph);
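Together with the identical lines added to the singleton and sync graph groups below, this is the "actually enable gradient-checkpointing" part of the description: the --gradient-checkpointing option was accepted before but never propagated to the graphs, so it had no effect.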

View File

@@ -34,6 +34,7 @@ public:
 // Initialize graph
 graph_ = New<ExpressionGraph>();
 graph_->setDevice(deviceId);
+graph_->setCheckpointing(options_->get<bool>("gradient-checkpointing"));
 graph_->getBackend()->setClip(options_->get<float>("clip-gemm"));
 graph_->reserveWorkspaceMB(options_->get<size_t>("workspace"));
 opt_ = Optimizer(options_);

View File

@@ -10,6 +10,7 @@ SyncGraphGroup::SyncGraphGroup(Ptr<Options> config, Ptr<IMPIWrapper> mpi)
 for(auto device : devices_) {
 auto graph = New<ExpressionGraph>();
 graph->setDevice(device);
+graph->setCheckpointing(options_->get<bool>("gradient-checkpointing"));
 graph->reserveWorkspaceMB(options_->get<size_t>("workspace"));
 graph->getBackend()->setClip(options_->get<float>("clip-gemm"));
@@ -57,19 +58,6 @@ void SyncGraphGroup::initialize(const Ptr<data::Batch>& exampleBatch) {
 if (i > 0)
 graphs_[i]->params()->vals()->copyFrom(graphs_[0]->params()->vals());
 });
-//ThreadPool pool(graphs_.size() - 1, graphs_.size() - 1);
-//for(size_t i = 1; i < graphs_.size(); ++i) {
-//  auto init = [&](size_t i) {
-//    // initialize i-th graph and weights
-//    builders_[i]->build(graphs_[i], exampleBatch);
-//    graphs_[i]->forward();
-//    // overwrite weights of i-th graph with weights from 0-th graph
-//    graphs_[i]->params()->vals()->copyFrom(graphs_[0]->params()->vals());
-//  };
-//  pool.enqueue(init, i);
-//}
-//// ThreadPool destructor waits until completion of all tasks.
-//// @TODO: can we use comm_->foreach()?
 }
 void SyncGraphGroup::initializeAvg() {
void SyncGraphGroup::initializeAvg() {