minibatch-size warmup (manually merged over from fseide/covbias);

minibatches are now fed in GPU-sized chunks rather than a massive joint batch for all GPUs in the update;
Adam hyper-parameter adjustment limited to learning rate, as momentum adjustment is counterproductive for MB scaling;
log output now includes the last batch size;
log output now shows current best for stalled validation metrics;
bug fix: Adam optimizer should persist denominators;
bug fix: Adam and Adagrad should use correct element size when persisting;
min and max renamed to minimum and maximum, for consistency with other toolkits;
pathie now compiles in manual VS Project
This commit is contained in:
Frank Seide 2018-12-12 18:40:46 -08:00
parent 66f05527d9
commit 4cf97df51e
51 changed files with 1132 additions and 653 deletions

2
src/3rd_party/ExceptionWithCallStack.h vendored Normal file → Executable file
View File

@ -5,6 +5,8 @@
// ExceptionWithCallStack.h - debug util functions
//
#pragma once
#include <string>
namespace Microsoft { namespace MSR { namespace CNTK {

2
src/3rd_party/pathie-cpp/src/entry_iterator.cpp vendored Normal file → Executable file
View File

@ -178,7 +178,7 @@ entry_iterator& entry_iterator::operator++(int)
/// Same as the other operator++().
entry_iterator& entry_iterator::operator++()
{
return (operator++());
return (operator++(0));
}
/**

12
src/3rd_party/pathie-cpp/src/path.cpp vendored Normal file → Executable file
View File

@ -51,7 +51,7 @@
#include <shlwapi.h>
//#include <ntifs.h> // Currently not in msys2
// @TODO: This is a hack to make it compile under Windows, check if this is save.
// @TODO: This is a hack to make it compile under Windows, check if this is safe.
#define F_OK 0
#elif defined(_PATHIE_UNIX)
@ -1546,7 +1546,7 @@ bool Path::is_directory() const
throw(Pathie::ErrnoError(errsav));
}
return s.st_mode & S_IFDIR;
return (s.st_mode & S_IFDIR) != 0;
#else
#error Unsupported system.
#endif
@ -1590,7 +1590,7 @@ bool Path::is_file() const
throw(Pathie::ErrnoError(errno));
}
return s.st_mode & S_IFREG;
return (s.st_mode & S_IFREG) != 0;
#else
#error Unsupported system.
#endif
@ -1710,9 +1710,9 @@ void Path::remove() const
* function uses the appropriate native Win32API function
* calls accordingly therefore. */
if (is_directory())
result = RemoveDirectoryW(utf16.c_str());
result = RemoveDirectoryW(utf16.c_str()) != 0;
else
result = DeleteFileW(utf16.c_str());
result = DeleteFileW(utf16.c_str()) != 0;
if (!result) {
DWORD err = GetLastError();
@ -3282,7 +3282,7 @@ bool Path::fnmatch(const std::string& pattern, int flags /* = 0 */) const
#elif defined(_WIN32)
std::wstring utf16path = utf8_to_utf16(m_path);
std::wstring utf16pattern = utf8_to_utf16(pattern);
return PathMatchSpecW(utf16path.c_str(), utf16pattern.c_str());
return PathMatchSpecW(utf16path.c_str(), utf16pattern.c_str()) != 0;
#else
#error Unsupported system.
#endif

@ -1 +1 @@
Subproject commit 1a38d26a13cc67b1aae641d4983b624bef6d5305
Subproject commit 21309542e69e1821ff8e905fa60d8852ac12a73f

View File

@ -11,6 +11,8 @@
#include "training/graph_group_multinode.h"
#endif
#include "3rd_party/ExceptionWithCallStack.h"
int main(int argc, char** argv) {
using namespace marian;

View File

@ -335,9 +335,10 @@ void ConfigParser::addOptionsTraining(cli::CLIWrapper& cli) {
"Reset running statistics of optimizer whenever learning rate decays");
cli.add<bool>("--lr-decay-repeat-warmup",
"Repeat learning rate warmup when learning rate is decayed");
cli.add<std::string/*SchedulerPeriod*/>("--lr-decay-inv-sqrt",
"Decrease learning rate at arg / sqrt(no. batches) starting at arg (append 't' or 'e' for sqrt(target labels or epochs))",
"0");
cli.add<std::vector<std::string/*SchedulerPeriod*/>>("--lr-decay-inv-sqrt",
"Decrease learning rate at arg / sqrt(no. batches) starting at arg (append 't' or 'e' for sqrt(target labels or epochs)). "
"Add second argument to define the starting point",
{"0"});
cli.add<std::string/*SchedulerPeriod*/>("--lr-warmup",
"Increase learning rate linearly for arg first batches (append 't' for arg first target labels)",
@ -354,9 +355,10 @@ void ConfigParser::addOptionsTraining(cli::CLIWrapper& cli) {
cli.add<double>("--clip-norm",
"Clip gradient norm to arg (0 to disable)",
1.f);
cli.add<float>("--exponential-smoothing",
"Maintain smoothed version of parameters for validation and saving with smoothing factor. 0 to disable",
0)->implicit_val("1e-4");
cli.add<std::vector<float>>("--exponential-smoothing",
"Maintain smoothed version of parameters for validation and saving with smoothing factor. 0 to disable. "
"Add a second number to specify a reference batch size (in target words).",
{ 0.f })->implicit_val("1e-4");
cli.add<std::string>("--guided-alignment",
"Path to a file with word alignments. Use guided alignment to guide attention or 'none'",
"none");
@ -604,6 +606,13 @@ void ConfigParser::addSuboptionsBatching(cli::CLIWrapper& cli) {
cli.add<bool>("--shuffle-in-ram",
"Keep shuffled corpus in RAM, do not write to temp file");
cli.add<std::vector<std::string/*SchedulerPeriod*/>>("--mini-batch-warmup",
"linear ramp-up of MB size, up to this #updates (append 't' for up to this #target labels);"
"optional second number is reference batch size at which to stop scaling up (instead of full batch size)",
{"0"});
cli.add<bool>("--mini-batch-track-lr",
"Dynamically track mini-batch size inverse to actual learning rate (not considering lr-warmup)");
// clang-format on
}

View File

@ -51,8 +51,16 @@ struct DeviceId {
DeviceId() : no{0}, type{DeviceType::gpu} {}
DeviceId(size_t no_, DeviceType type_) : no(no_), type(type_) {}
std::string typeAsString() const {
return (type == DeviceType::gpu ? "gpu" : "cpu");
}
operator std::string() const {
return typeAsString() + std::to_string(no);
}
friend std::ostream& operator<<(std::ostream& out, DeviceId deviceId) {
out << (deviceId.type == DeviceType::gpu ? "gpu" : "cpu") << deviceId.no;
out << std::string(deviceId);
return out;
}

View File

@ -178,11 +178,9 @@ public:
bool empty() { return istream_->peek() == std::ifstream::traits_type::eof(); }
void setbufsize(size_t size) const {
#ifdef 0 // this is buggy, do nothing
istream_->rdbuf()->pubsetbuf(0, 0);
readBuf_.reset(new char[size]);
istream_->rdbuf()->pubsetbuf(readBuf_.get(), 0);
#endif
readBuf_.resize(size);
istream_->rdbuf()->pubsetbuf(readBuf_.data(), readBuf_.size());
}
template <typename T>
@ -206,9 +204,8 @@ private:
std::unique_ptr<std::istream> istream_;
boost::iostreams::file_descriptor_source fds_;
mutable std::vector<char> readBuf_; // for setbuf()
std::unique_ptr<boost::iostreams::stream_buffer<boost::iostreams::file_descriptor_source>> fdsBuffer_;
mutable UPtr<char[]> readBuf_; // for setbuf()
};
// wrapper around std::getline() that handles Windows input files with extra CR

View File

@ -12,7 +12,7 @@
#pragma GCC diagnostic ignored "-Wsuggest-override"
#endif
#include "3rd_party/pathie-cpp/include/path.hpp"
#include "3rd_party/pathie-cpp/include/path.hpp" // @TODO: update to latest Pathie
#include "3rd_party/pathie-cpp/include/errors.hpp"
#ifdef __GNUC__

View File

@ -128,10 +128,12 @@ void saveItemsNpz(const std::string& fileName, const std::vector<Item>& items) {
std::vector<cnpy::NpzItem> npzItems;
for(auto& item : items) {
std::vector<unsigned int> shape(item.shape.begin(), item.shape.end());
char type = 'f';
char type;
if(item.type == Type::float32)
type = cnpy::map_type(typeid(float));
else if(item.type == Type::float64)
type = cnpy::map_type(typeid(double));
else if(item.type == Type::int8)
type = cnpy::map_type(typeid(char));
else

View File

@ -84,7 +84,7 @@ void createLoggers(const marian::Config* options) {
bool quiet = options && options->get<bool>("quiet");
Logger general{
createStderrLogger("general", "[%Y-%m-%d %T] %v", generalLogs, quiet)};
createStderrLogger("general", "[%Y-%m-%d %T %t] %v", generalLogs, quiet)};
Logger valid{
createStderrLogger("valid", "[%Y-%m-%d %T] [valid] %v", validLogs, quiet)};
@ -115,7 +115,7 @@ static void unhandledException() {
throw; // rethrow so that we can get access to what()
}
catch (const std::exception& e) {
ABORT("Unhandled {}: {}", typeid(e).name(), e.what());
ABORT("Unhandled exception of type '{}': {}", typeid(e).name(), e.what());
}
catch (...) {
ABORT("Unhandled exception");
@ -145,7 +145,7 @@ static void setErrorHandlers() {
void switchtoMultinodeLogging(std::string nodeIdStr) {
Logger log = spdlog::get("general");
if (log)
log->set_pattern("[%Y-%m-%d %T " + nodeIdStr + "] %v");
log->set_pattern("[%Y-%m-%d %T " + nodeIdStr + ":%t] %v");
}

View File

@ -21,6 +21,16 @@ namespace marian {
*/
#define LOG(level, ...) checkedLog("general", #level, __VA_ARGS__)
// variant that prints the log message only upon the first time the call site is executed
// NOTE(review): the per-call-site 'logged' flag is written without synchronization, so two
// threads racing through the first execution may both log — presumably acceptable for
// logging; confirm if exactly-once semantics are ever required.
#define LOG_ONCE(level, ...) do { \
  static bool logged = false; \
  if (!logged) \
  { \
    logged = true; \
    LOG(level, __VA_ARGS__); \
  } \
} while(0)
/**
* Prints logging message regarding validation into stderr and a file specified
* with `--valid-log` option.

View File

@ -149,5 +149,40 @@ bool endsWith(const std::string& text, const std::string& suffix) {
&& !text.compare(text.size() - suffix.size(), suffix.size(), suffix);
}
// Uppercases a string, byte by byte, using the global default locale.
// @BUGBUG: This won't work with UTF-8 characters (multi-byte sequences are
// converted one byte at a time).
std::string toUpper(const std::string& s) {
  std::locale loc;
  std::string res;
  res.reserve(s.size()); // fix: reserve by length; s.capacity() may exceed (or understate) what we need
  for (auto c : s)
    res.push_back((char)std::toupper(c, loc));
  return res;
}
// Parses a string that must consist of exactly one floating-point number.
// Aborts on a malformed number or on trailing characters after the number.
double parseDouble(std::string s) {
  double value;
  char extra; // if sscanf manages to fill this as well, there was junk after the number
  int numConverted = sscanf(s.c_str(), "%lf%c", &value, &extra);
  ABORT_IF(numConverted != 1, "Mal-formed number: {}", s);
  return value;
}
// parses a user-friendly number that can have commas and (some) units
// e.g. "1,000k" -> 1e6. Aborts on unknown unit letters or malformed numbers.
double parseNumber(std::string param) {
  // get unit prefix
  double factor = 1.;
  if (!param.empty() && param.back() >= 'A') {
    switch (param.back()) {
      case 'k': factor = 1.e3; break;
      case 'M': factor = 1.e6; break;
      case 'G': factor = 1.e9; break;
      case 'T': factor = 1.e12; break;
      default: ABORT("Invalid or unsupported unit prefix '{}' in {}", param.back(), param);
    }
    param.pop_back();
  }
  // we allow users to place commas in numbers (note: we are not actually verifying that they are in the right place)
  // bug fix: std::remove_if only shifts the kept characters forward and returns the new
  // logical end; without the erase() the string keeps its old length (with stale bytes at
  // the tail), and parseDouble() would reject any number that contained a comma.
  param.erase(std::remove_if(param.begin(), param.end(), [](char c) { return c == ','; }),
              param.end());
  return factor * parseDouble(param);
}
} // namespace utils
} // namespace marian

View File

@ -38,5 +38,9 @@ std::pair<std::string, int> hostnameAndProcessId();
std::string withCommas(size_t n);
bool endsWith(const std::string& text, const std::string& suffix);
std::string toUpper(const std::string& s);
double parseDouble(std::string s);
double parseNumber(std::string s);
} // namespace utils
} // namespace marian

View File

@ -19,7 +19,7 @@ public:
virtual void debug(){};
virtual std::vector<Ptr<Batch>> split(size_t n) = 0;
virtual std::vector<Ptr<Batch>> split(size_t n, size_t sizeLimit = SIZE_MAX) = 0;
const std::vector<size_t>& getSentenceIds() const { return sentenceIds_; }
void setSentenceIds(const std::vector<size_t>& ids) { sentenceIds_ = ids; }

View File

@ -56,7 +56,7 @@ public:
typedef typename DataSet::batch_ptr BatchPtr;
typedef typename DataSet::Sample Sample;
typedef std::vector<Sample> Samples; // @TODO: type names should be capitalized
typedef std::vector<Sample> Samples;
typedef BatchIterator<BatchGenerator> iterator;
friend iterator;
@ -83,7 +83,6 @@ private:
// this runs on a bg thread; sequencing is handled by caller, but locking is done in here
std::deque<BatchPtr> fetchBatches() {
//LOG(info, "fillBatches entered");
typedef typename Sample::value_type Item;
auto itemCmp = [](const Item& sa, const Item& sb) { return sa.size() < sb.size(); }; // sort by element length, not content
@ -118,8 +117,6 @@ private:
size_t maxBatchSize = options_->get<int>("mini-batch");
size_t maxSize = maxBatchSize * options_->get<int>("maxi-batch");
// LOG(info, "Preloading batches");
// consume data from corpus into maxi-batch (single sentences)
// sorted into specified order (due to queue)
if(newlyPrepared_) {
@ -141,8 +138,6 @@ private:
}
size_t numSentencesRead = maxiBatch->size();
// LOG(info, "Turning samples into batches");
// construct the actual batches and place them in the queue
Samples batchVector;
size_t currentWords = 0;
@ -152,7 +147,6 @@ private:
// process all loaded sentences in order of increasing length
// @TODO: we could just use a vector and do a sort() here; would make the cost more explicit
//LOG(info, "begin form batches, #lines = {}", maxiBatch->size());
const size_t mbWords = options_->get<size_t>("mini-batch-words", 0);
const bool useDynamicBatching = options_->has("mini-batch-fit");
BatchStats::const_iterator cachedStatsIter;
@ -205,15 +199,25 @@ private:
}
// turn rest into batch
// @BUGBUG: This can create a very small batch, which with ce-mean-words can artificially
// inflate the contribution of the samples in the batch, causing instability.
// I think a good alternative would be to carry over the left-over sentences into the next round.
if(!batchVector.empty())
tempBatches.push_back(data_->toBatch(batchVector));
//LOG(info, "end form batches, #tempBatches = {}", tempBatches.size());
// Shuffle the batches
if(shuffle_) {
std::shuffle(tempBatches.begin(), tempBatches.end(), eng_);
}
LOG(debug, "[data] fetched {} batches with {} sentences.", tempBatches.size(), numSentencesRead);
double totalSent{}, totalLabels{};
for (auto& b : tempBatches) {
totalSent += (double)b->size();
totalLabels += (double)b->words(-1);
}
auto totalDenom = tempBatches.empty() ? 1 : tempBatches.size(); // (make 0/0 = 0)
LOG(info, "[data] fetched {} batches with {} sentences. Per batch: {} sentences, {} labels.",
tempBatches.size(), numSentencesRead,
(double)totalSent / (double)totalDenom, (double)totalLabels / (double)totalDenom);
return tempBatches;
}
@ -300,6 +304,18 @@ public:
return true;
}
// this is needed for dynamic MB scaling. Returns 0 if size is not known in words.
size_t estimateTypicalTrgBatchWords() const {
  const size_t mbWordsOption = options_->get<size_t>("mini-batch-words", 0);
  // dynamic batching with collected stats gives the best estimate
  if (options_->has("mini-batch-fit") && stats_)
    return stats_->estimateTypicalTrgWords();
  // otherwise fall back to the user-specified word budget, if any
  if (mbWordsOption != 0)
    return mbWordsOption;
  return 0; // batch size in words is not known
}
};
class CorpusBatchGenerator : public BatchGenerator<CorpusBase>,

View File

@ -49,6 +49,19 @@ public:
map_[lengths] = batchSize;
}
// return a rough minibatch size in labels
// We average over all (batch sizes * max trg length).
size_t estimateTypicalTrgWords() const {
  if (map_.empty()) // fix: avoid division by zero when no stats have been recorded yet
    return 0;
  size_t sum = 0;
  for (const auto& entry : map_) {
    auto maxTrgLength = entry.first.back(); // assumes the last length entry is the target length — TODO confirm against setter
    auto numSentences = entry.second;
    auto numLabels = numSentences * maxTrgLength; // rough label count of a batch with these lengths
    sum += numLabels;
  }
  return sum / map_.size();
}
// helpers for multi-node --note: presently unused, but keeping them around for later use
// serialize into a flat vector, for MPI data exchange
std::vector<size_t> flatten() const {

View File

@ -102,7 +102,7 @@ void Corpus::restore(Ptr<TrainingState> ts) {
}
void Corpus::shuffleData(const std::vector<std::string>& paths) {
LOG(info, "[data] Shuffling files");
LOG(info, "[data] Shuffling data");
size_t numStreams = paths.size();

View File

@ -63,8 +63,8 @@ public:
std::vector<size_t> sentenceIds;
std::vector<int> maxDims;
for(auto& ex : batchVector) {
std::vector<int> maxDims; // @TODO: What's this? widths? maxLengths?
for(auto& ex : batchVector) { // @TODO: rename 'ex' to 'sample' or 'sentenceTuple'
if(maxDims.size() < ex.size())
maxDims.resize(ex.size(), 0);
for(size_t i = 0; i < ex.size(); ++i) {

View File

@ -164,48 +164,51 @@ public:
*/
size_t batchWidth() { return width_; };
/**
* @brief The total number of words in the batch, considering the mask.
* @brief The total number of words in the batch (not counting masked-out words).
*/
size_t batchWords() { return words_; }
/**
* @brief Splits the subbatch into subbatches of equal size.
* @brief Splits the stream into sub-batches of equal size (except for last).
*
* @param n Number of splits
* @param n number of sub-batches to split into
*
* @return Vector of pointers to new subbatches.
* @param sizeLimit Pretend the batch only has this many sentences. Used for MB-size ramp-up.
*
* @return Vector of pointers to new sub-batches (or nullptrs where run out of sub-batches)
*
* @see marian::data::Batch::split(size_t n)
*/
std::vector<Ptr<SubBatch>> split(size_t n) {
ABORT_IF(size_ == 0, "Encoutered sub-batch size of 0");
std::vector<Ptr<SubBatch>> split(size_t n, size_t sizeLimit /*or SIZE_MAX*/) {
ABORT_IF(size_ == 0, "Encountered sub-batch size of 0");
size_t subSize = (size_t)(std::ceil(size_ / (float)n));
auto size = std::min(size_, sizeLimit); // if limit is given then pretend the batch only has that many sentences
size_t targetSubSize = (size_t)(std::ceil(size / (float)n)); // aim at forming sub-batches of this #sentences
std::vector<Ptr<SubBatch>> splits;
for(size_t pos = 0; pos < size_; pos += subSize) {
size_t size = std::min(subSize, size_ - pos);
for(size_t pos = 0; pos < size; pos += targetSubSize) { // loop over ranges of size targetSubSize to form sub-batches of this size
size_t subSize = std::min(targetSubSize, size - pos); // actual number of sentences can be smaller at the end
// determine actual width
// determine actual width (=max length) of this sub-batch, which may be smaller than the overall max length
size_t subWidth = 0;
for(size_t j = 0; j < width_; ++j) {
for(size_t i = 0; i < size; ++i) {
for(size_t i = 0; i < subSize; ++i) {
if(mask_[j * size_ + (pos + i)] != 0)
if (subWidth < j + 1)
subWidth = j + 1;
}
}
//if (subWidth < width_)
// LOG(info, "[data] sub-batch {} of {} wide batch has effective width of {}", pos / subSize, width_, subWidth);
// LOG(info, "[data] sub-batch {} of {} wide batch has effective width of {}", pos / targetSize, width_, subWidth);
// create sub-batch
auto sb = New<SubBatch>(size, subWidth, vocab_);
auto sb = New<SubBatch>(subSize, subWidth, vocab_);
size_t words = 0;
for(size_t j = 0; j < subWidth; ++j) {
for(size_t i = 0; i < size; ++i) {
sb->data()[j * size + i] = indices_[j * size_ + (pos + i)];
sb->mask()[j * size + i] = mask_[j * size_ + (pos + i)];
for(size_t i = 0; i < subSize; ++i) {
sb->data()[j * subSize + i] = indices_[j * size_ + (pos + i)];
sb->mask()[j * subSize + i] = mask_[j * size_ + (pos + i)];
if(mask_[j * size_ + (pos + i)] != 0)
words++;
@ -263,8 +266,8 @@ public:
size_t size() const override { return subBatches_[0]->batchSize(); }
/**
* @brief The total number of words for the longest sentence in the batch plus
* one. Pass which=0 for source and -1 for target.
* @brief The total number of words in the batch (not counting masked-out words).
* Pass which=0 for source words and -1 for target words.
*/
size_t words(int which = 0) const override {
return subBatches_[which >= 0 ? which
@ -349,25 +352,27 @@ public:
}
/**
* @brief Splits the batch into batches of equal size.
* @brief Splits the batch into batches of equal size (except for last).
*
* @param n number of splits
* @param n number of sub-batches to split into
*
* @return Vector of pointers to new batches.
* @param sizeLimit Clip batch content to the first sizeLimit sentences in the batch
*
* @return Vector of pointers to new sub-batches (or nullptrs where run out of sub-batches)
*
* @see marian::data::SubBatch::split(size_t n)
*/
std::vector<Ptr<Batch>> split(size_t n) override {
std::vector<Ptr<Batch>> split(size_t n, size_t sizeLimit /*=SIZE_MAX*/) override {
ABORT_IF(size() == 0, "Encountered batch size of 0");
std::vector<std::vector<Ptr<SubBatch>>> subs;
// split each subbatch separately
for(auto subBatch : subBatches_) {
size_t i = 0;
for(auto splitSubBatch : subBatch->split(n)) {
std::vector<std::vector<Ptr<SubBatch>>> subs; // [subBatchIndex][streamIndex]
// split each stream separately
for(auto batchStream : subBatches_) {
size_t i = 0; // index into split batch
for(auto splitSubBatch : batchStream->split(n, sizeLimit)) {
if(subs.size() <= i)
subs.resize(i + 1);
subs[i++].push_back(splitSubBatch);
subs[i++].push_back(splitSubBatch); // this forms tuples across streams
}
}

View File

@ -63,7 +63,7 @@ public:
void push_back(Input input) { inputs_.push_back(input); }
virtual std::vector<Ptr<Batch>> split(size_t /*n*/) override { ABORT("Not implemented"); }
virtual std::vector<Ptr<Batch>> split(size_t /*n*/, size_t /*sizeLimit*/) override { ABORT("Not implemented"); }
Data& features() { return inputs_[0].data(); }

21
src/functional/tmp.h Normal file → Executable file
View File

@ -81,6 +81,27 @@ struct FApply<4, Functor> {
}
};
// Specialization for functors of 5 tensor arguments; mirrors FApply<4,...> above.
template <class Functor>
struct FApply<5, Functor> {
  // Apply 'functor' to one element of each of the 5 tensors, each element
  // addressed by its own per-tensor index.
  __HDI__ static float apply(
      Functor functor,
      functional::Array<functional::Tensor<float>, 5>& in,
      const functional::Array<int, 5>& indices) {
    return functor(in[0][indices[0]],
                   in[1][indices[1]],
                   in[2][indices[2]],
                   in[3][indices[3]],
                   in[4][indices[4]]);
  }
  // Apply 'functor' to the same flat index of all 5 tensors.
  __HDI__ static float apply(
      Functor functor,
      functional::Array<functional::Tensor<float>, 5>& in,
      int index) {
    return functor(in[0][index], in[1][index], in[2][index], in[3][index], in[4][index]);
  }
};
template <size_t K, class Functor>
__HDI__ float apply(Functor functor,
functional::Array<functional::Tensor<float>, K>& in,

View File

@ -63,7 +63,7 @@ public:
tensors_->allocate(node->grad(), node->shape(), node->value_type());
}
void free(Tensor& tensor) { tensors_->free(tensor); }
void free(const Tensor& tensor) { tensors_->free(tensor); }
// @TODO: get rid of this, not really used or can be done better
Ptr<Allocator> allocator() { return tensors_->allocator(); }
@ -437,7 +437,7 @@ public:
tensors_->allocateBackward(node);
}
void free(Tensor& tensor) {
void free(const Tensor& tensor) {
if(tensors_)
tensors_->free(tensor);
}

4
src/graph/expression_operators.h Normal file → Executable file
View File

@ -66,9 +66,9 @@ Expr operator/(Expr a, float b);
Expr logaddexp(Expr a, Expr b);
Expr max(Expr a, Expr b); // TODO: haggle over the name (max vs. elementMax)
Expr maximum(Expr a, Expr b);
Expr min(Expr a, Expr b); // TODO: haggle over the name
Expr minimum(Expr a, Expr b);
Expr dot(Expr a,
Expr b,

2
src/graph/node_operators.h Normal file → Executable file
View File

@ -50,7 +50,7 @@ struct ParamNode : public Node {
~ParamNode() {}
virtual size_t allocate() override {
ABORT_IF(!val_, "Parameters should be allocated by their graph");
ABORT_IF(!val_, "Parameters should be allocated by their graph. Parameter {} was not", name_);
return 0;
}

View File

@ -42,7 +42,7 @@ public:
Shape outShape = shapeA;
outShape.set(outShape.size() - 1, shapeB[shapeB.size() - 1]);
ABORT_IF(shapeA[shapeA.size() - 1] != shapeB[shapeB.size() - 2],
"matrix product requires dimensions to match");
"Matrix product requires dimensions to match");
return outShape;
}
@ -165,7 +165,7 @@ public:
Shape outShape = shapeA;
outShape.set(outShape.size() - 1, shapeB[shapeB.size() - 1]);
ABORT_IF(shapeA[shapeA.size() - 1] != shapeB[shapeB.size() - 2],
"matrix product requires dimensions to match");
"Matrix product requires dimensions to match");
return outShape;
}
@ -309,7 +309,7 @@ public:
Shape outShape = shapeA;
outShape.set(-1, shapeB[-1]);
ABORT_IF(shapeA[-1] != shapeB[-2],
"matrix product requires dimensions to match");
"Batched matrix product requires dimensions to match");
return outShape;
}

0
src/graph/node_operators_unary.h Normal file → Executable file
View File

View File

@ -2,10 +2,12 @@
#include "common/io.h"
#include "tensors/tensor_operators.h"
#include <array>
namespace marian {
void Sgd::updateImpl(Tensor params, Tensor grads) {
void Sgd::updateImpl(Tensor params, Tensor grads, size_t actualMBSize, size_t refMBSize) {
actualMBSize, refMBSize; // (no correction for base update needed beyond using ce-sum)
using namespace functional;
Element(_1 -= eta_ * _2,
params,
@ -14,9 +16,10 @@ void Sgd::updateImpl(Tensor params, Tensor grads) {
params->getBackend()->synchronize();
}
// Aagrad
// Adagrad
void Adagrad::updateImpl(Tensor params, Tensor grads) {
void Adagrad::updateImpl(Tensor params, Tensor grads, size_t actualMBSize, size_t refMBSize) {
ABORT_IF(actualMBSize != refMBSize, "Adagrad does not support rational hyper-parameter adjustment");
if(!alloc_)
alloc_ = New<TensorAllocator>(params->getBackend());
@ -62,7 +65,7 @@ void Adagrad::load(const std::string& name,
if(item.name == "adagrad_gt") {
vGt.resize(totalSize);
std::copy(
(float*)item.data(), (float*)item.data() + totalSize, vGt.begin());
(float*)item.data(), ((float*)item.data()) + totalSize, vGt.begin());
}
}
if(vGt.empty()) {
@ -109,7 +112,7 @@ void Adagrad::save(const std::string& name,
item.type = Type::float32;
item.bytes.resize(vGt.size() * sizeOf(item.type));
std::copy(
(char*)vGt.data(), (char*)vGt.data() + vGt.size(), item.bytes.begin());
(char*)vGt.data(), (char*)(vGt.data() + vGt.size()), item.bytes.begin());
io::saveItems(name, {item});
}
@ -121,7 +124,8 @@ void Adagrad::resetStats() {
// Adam
void Adam::updateImpl(Tensor params, Tensor grads) {
void Adam::updateImpl(Tensor params, Tensor grads, size_t actualMBSize, size_t refMBSize) {
// lazy allocation
if(!alloc_)
alloc_ = New<TensorAllocator>(params->getBackend());
@ -130,29 +134,42 @@ void Adam::updateImpl(Tensor params, Tensor grads) {
alloc_->reserveExact(2 * params->memory()->size());
alloc_->allocate(mt_, {1, elements});
mt_->set(0.f);
alloc_->allocate(vt_, {1, elements});
vt_->set(0.f);
}
t_++;
float denom1 = 1 - (float)std::pow(beta1_, t_);
float denom2 = 1 - (float)std::pow(beta2_, t_);
double Tref = (double)refMBSize;
double T = (double)actualMBSize;
// adjust for minibatch-size changes if Adam parameters are given a reference size (else do nothing)
double eta = eta_ * (T/Tref);
double beta1 = beta1_;
double beta2 = beta2_;
double decay = w_ ;
// denominators. At steady state: =1. This recursion does the same as the Adam beta correction term.
denom1_ = (beta1 * denom1_) + (1 - beta1); // momentum smoothing
denom2_ = (beta2 * denom2_) + (1 - beta2); // RMS normalization
LOG_ONCE(info, "[adam] First update: Tref = {}, T = {}, eta = {} -> {}, beta = {}, {}", Tref, T, eta_, eta, beta1, beta2);
// numerators. Divide by T to convert ce-sum gradient to avg gradient.
using namespace functional;
Element(_1 = ((float)beta1 * _1) + float((1 - beta1) / T ) * _2, mt_, grads); // momentum smoothing. At steady state: =smoothed avg gradient
Element(_1 = ((float)beta2 * _1) + float((1 - beta2) / T / T) * (_2 * _2), vt_, grads); // RMS normalization. At steady state: =mean square of the avg gradients
Element(_1 = (beta1_ * _1) + ((1 - beta1_) * _2), mt_, grads);
Element(_1 = (beta2_ * _1) + ((1 - beta2_) * (_2 * _2)), vt_, grads);
// apply Adam normalization
float etaf = (float)eta, denom1f = (float)denom1_, denom2f = (float)denom2_, decayf = (float)decay; // (get casts out of Element expression for readability)
Element(_1 -= etaf // learning-rate: x_t = x_{t-1} - \eta * (...)
* (( ( _2 / denom1f) // momentum-smoothed per-sample gradient: m_{t-1}
/ (sqrt(_3 / denom2f) + eps_)) // normalize by RMS: \sqrt(v_{t-1})
+ decayf * _1), // weight-decay: w * x_{t-1}
params, // =_1
mt_, // =_2
vt_ // =_3
);
Element(_1 -= eta_ // learning-rate: x_t = x_{t-1} - \eta * (...)
* ((_2 / denom1) // 1st moment: m_{t-1}
/ (sqrt(_3 / denom2) + eps_) // 2nd moment: \sqrt(v_{t-1})
+ w_ * _1), // weight-decay: w * x_{t-1}
params,
mt_,
vt_);
params->getBackend()->synchronize();
params->getBackend()->synchronize(); // @TODO: This should not be in here. Maybe in the wrapper. Why is it needed at all?
}
void Adam::load(const std::string& name,
@ -168,6 +185,7 @@ void Adam::load(const std::string& name,
std::vector<float> vMt;
std::vector<float> vVt;
std::array<double, 2> vDenoms;
auto items = io::loadItems(name);
for(auto item : items) {
@ -178,12 +196,18 @@ void Adam::load(const std::string& name,
if(item.name == "adam_mt") {
vMt.resize(totalSize);
std::copy(
(float*)item.data(), (float*)item.data() + totalSize, vMt.begin());
(float*)item.data(), ((float*)item.data()) + totalSize, vMt.begin());
}
if(item.name == "adam_vt") {
else if(item.name == "adam_vt") {
vVt.resize(totalSize);
std::copy(
(float*)item.data(), (float*)item.data() + totalSize, vVt.begin());
(float*)item.data(), ((float*)item.data()) + totalSize, vVt.begin());
}
else if(item.name == "adam_denoms") {
ABORT_IF(totalSize != 2, "adam_denoms should have 2 entries");
std::copy(
(double*)item.data(), ((double*)item.data()) + totalSize, vDenoms.begin());
// Back compat note: Old files lacked "adam_denoms". For those, vDenoms will remain 0, which reproduces the old behavior.
}
}
if(vMt.empty() || vVt.empty()) {
@ -212,6 +236,9 @@ void Adam::load(const std::string& name,
auto opt = std::dynamic_pointer_cast<Adam>(opts[id]);
opt->vt_->set(std::vector<float>(begin, end));
});
denom1_ = vDenoms[0];
denom2_ = vDenoms[1];
//LOG(info, "done loading Adam params");
}
@ -248,7 +275,7 @@ void Adam::save(const std::string& name,
itemMt.type = Type::float32;
itemMt.bytes.resize(vMt.size() * sizeOf(itemMt.type));
std::copy(
(char*)vMt.data(), (char*)vMt.data() + vMt.size(), itemMt.bytes.begin());
(char*)vMt.data(), (char*)(vMt.data() + vMt.size()), itemMt.bytes.begin());
io::Item itemVt;
itemVt.name = "adam_vt";
@ -256,9 +283,19 @@ void Adam::save(const std::string& name,
itemVt.type = Type::float32;
itemVt.bytes.resize(vVt.size() * sizeOf(itemVt.type));
std::copy(
(char*)vVt.data(), (char*)vVt.data() + vVt.size(), itemVt.bytes.begin());
(char*)vVt.data(), (char*)(vVt.data() + vVt.size()), itemVt.bytes.begin());
io::saveItems(name, {itemMt, itemVt});
// @TODO: this pattern is duplicated several times; refactor it
std::array<double, 2> vDenoms{denom1_, denom2_};
io::Item itemDenoms;
itemDenoms.name = "adam_denoms";
itemDenoms.shape = Shape({1, (int)vDenoms.size()});
itemDenoms.type = Type::float64;
itemDenoms.bytes.resize(vDenoms.size() * sizeOf(itemDenoms.type));
std::copy(
(char*)vDenoms.data(), (char*)(vDenoms.data() + vDenoms.size()), itemDenoms.bytes.begin());
io::saveItems(name, {itemMt, itemVt, itemDenoms});
}
void Adam::resetStats() {
@ -267,6 +304,9 @@ void Adam::resetStats() {
if(vt_)
vt_->set(0.f);
denom1_ = 0; // @BUGBUG: or 1 or refMBSize if so specified. Fix once we have proper parameterization for that.
denom2_ = 0;
}
Ptr<OptimizerBase> Optimizer(Ptr<Options> options) {
@ -287,7 +327,7 @@ Ptr<OptimizerBase> Optimizer(Ptr<Options> options) {
} else if(opt == "adagrad") {
return Optimizer<Adagrad>(lrate, clipper, params);
} else if(opt == "adam") {
return Optimizer<Adam>(lrate, clipper, params);
return Optimizer<Adam>(lrate, clipper, params); // @TODO: parse the parameters here, or just pass the options object
} else {
ABORT("Unknown optimizer: {}", opt);
}

View File

@ -21,19 +21,29 @@ public:
OptimizerBase(float eta, Ptr<ClipperBase> clipper = nullptr)
: eta_(eta), clipper_(clipper) {}
void update(Ptr<ExpressionGraph> graph) {
static constexpr size_t mbSizeNotProvided = SIZE_MAX;
void update(Ptr<ExpressionGraph> graph, size_t mbSize = mbSizeNotProvided) {
Tensor p = graph->params()->vals();
Tensor g = graph->params()->grads();
update(p, g);
update(p, g, mbSize);
}
void update(Tensor params, Tensor grads) {
void update(Tensor params, Tensor grads, size_t mbSize = mbSizeNotProvided) {
if(clipper_)
clipper_->clip(grads);
// In case we want to add a multiply factor to our learning rate
updateImpl(params, grads);
size_t refMBSize = refMBSize_;
if (refMBSize == 0) { // optimizer not configured to use hyper-parameter auto-adjustment
refMBSize = mbSize = 1; // neutral settings that keep the standard behavior
}
else { // optimizer is configured to auto-adjust hyper-parameters
ABORT_IF(mbSize == mbSizeNotProvided, "Using rational optimizer auto-adjustment with trainer that does not provide MB size");
// note: this behavior is only meaningful if using the ce-sum criterion
}
updateImpl(params, grads, mbSize, refMBSize);
}
virtual void init(TrainingState& state) override {
@ -78,7 +88,7 @@ public:
bool /*isMainProcess*/ = true) {}
protected:
virtual void updateImpl(Tensor params, Tensor grads) = 0;
virtual void updateImpl(Tensor params, Tensor grads, size_t actualMBSize, size_t refMBSize) = 0;
virtual void parseParams(const std::vector<float>& params) = 0;
virtual void resetStats() = 0;
@ -86,6 +96,8 @@ protected:
float eta_;
// Clip gradient norm
Ptr<ClipperBase> clipper_;
// Reference MB size. This enables automatic adjustment of optimizer hyper-parameters to MB size.
size_t refMBSize_{0}; // 0 means no adjustment
};
/**
@ -97,7 +109,7 @@ public:
: OptimizerBase(eta, clipper) {}
private:
void updateImpl(Tensor params, Tensor grads) override;
void updateImpl(Tensor params, Tensor grads, size_t actualMBSize, size_t refMBSize) override;
virtual void parseParams(const std::vector<float>& /*params*/) override {}
virtual void resetStats() override {}
@ -123,7 +135,7 @@ public:
bool /*isMainProcess*/ = true) override;
private:
void updateImpl(Tensor params, Tensor grads) override;
void updateImpl(Tensor params, Tensor grads, size_t actualMBSize, size_t refMBSize) override;
void resetStats() override;
void parseParams(const std::vector<float>& params) override {
@ -140,11 +152,13 @@ private:
* @brief Adam optimizer
*
* https://arxiv.org/pdf/1412.6980v8.pdf
*
* with Frank's modifications for automatic hyper-parameter adjustment.
*/
class Adam : public OptimizerBase {
public:
Adam(float eta, Ptr<ClipperBase> clipper = nullptr)
: OptimizerBase(eta, clipper), t_(0) {}
: OptimizerBase(eta, clipper) {}
void load(const std::string& name,
const std::vector<Ptr<OptimizerBase>>& opts,
@ -156,9 +170,11 @@ public:
bool isMainProcess = true) override;
private:
void updateImpl(Tensor params, Tensor grads) override;
void updateImpl(Tensor params, Tensor grads, size_t actualMBSize, size_t refMBSize) override;
void resetStats() override;
// Adam parameters:
// [beta1, beta2, eps, w, refMBSize]
virtual void parseParams(const std::vector<float>& params) override {
if(params.size() > 0)
beta1_ = params[0];
@ -169,15 +185,29 @@ private:
// weighted decay for AdamW, to be explored, disabled by default
if(params.size() > 3)
w_ = params[3];
w_ = params[3]; // default (disabled): 0
// automatic learning-rate adjustment
// If users provide, in addition to the hyper-parameters, a reference minibatch size,
// that these hyper-parameters were originally tuned for, then the learning-rate gets
// adjusted accordingly. Note: Requires user to also use ce-sum criterion.
if(params.size() > 4) {
refMBSize_ = (size_t)params[4]; // default (disabled): 0
LOG(info, "Note: Modified Adam optimizer: automatically adjusting learning rate as if minibatch size was {}", refMBSize_);
}
}
// hyper-parameters
float beta1_ = 0.9f;
float beta2_ = 0.999f;
float eps_ = 1e-8f;
float w_ = 0.0f;
size_t t_;
// CPU-side running accumulators
double denom1_ = 0;
double denom2_ = 0;
// GPU-side running accumulators
Ptr<TensorAllocator> alloc_;
Tensor mt_;
Tensor vt_;

0
src/tensors/gpu/add.cu Normal file → Executable file
View File

6
src/tensors/gpu/element.inc Normal file → Executable file
View File

@ -48,3 +48,9 @@ template void Element<Assign<Var<1>, BinaryFunctor<elem::Div, BinaryFunctor<elem
template void Element<Assign<Var<1>, UnaryFunctor<elem::Neg, UnaryFunctor<elem::Log, UnaryFunctor<elem::Neg, UnaryFunctor<elem::Log, Assignee<1>>>>>>>(Assign<Var<1>, UnaryFunctor<elem::Neg, UnaryFunctor<elem::Log, UnaryFunctor<elem::Neg, UnaryFunctor<elem::Log, Assignee<1>>>>>>, marian::Tensor);
template void Element<Assign<Var<1>, UnaryFunctor<elem::Neg, UnaryFunctor<elem::Log, BinaryFunctor<elem::Plus, UnaryFunctor<elem::Neg, UnaryFunctor<elem::Log, BinaryFunctor<elem::Plus, Assignee<1>, Capture>>>, Capture>>>>>(Assign<Var<1>, UnaryFunctor<elem::Neg, UnaryFunctor<elem::Log, BinaryFunctor<elem::Plus, UnaryFunctor<elem::Neg, UnaryFunctor<elem::Log, BinaryFunctor<elem::Plus, Assignee<1>, Capture>>>, Capture>>>>, marian::Tensor);
template void Element<Assign<Var<1>, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Lt, Assignee<1>, Capture>, Capture>>>(Assign<Var<1>, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Lt, Assignee<1>, Capture>, Capture> >, marian::Tensor);
template void Element<Assign<Var<1>, BinaryFunctor<elem::Minus, Assignee<1>, BinaryFunctor<elem::Mult, Capture, BinaryFunctor<elem::Plus, BinaryFunctor<elem::Div, BinaryFunctor<elem::Div, Assignee<2>, Capture>, BinaryFunctor<elem::Plus, UnaryFunctor<elem::Sqrt, BinaryFunctor<elem::Plus, BinaryFunctor<elem::Div, UnaryFunctor<elem::sReLU, BinaryFunctor<elem::Minus, Assignee<3>, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Capture, Assignee<4> >, Assignee<4> > > >, Capture>, BinaryFunctor<elem::Mult, Assignee<4>, Assignee<4> > > >, Capture> >, BinaryFunctor<elem::Mult, Capture, Assignee<1> > > > > >, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase> >(Assign<Var<1>, BinaryFunctor<elem::Minus, Assignee<1>, BinaryFunctor<elem::Mult, Capture, BinaryFunctor<elem::Plus, BinaryFunctor<elem::Div, BinaryFunctor<elem::Div, Assignee<2>, Capture>, BinaryFunctor<elem::Plus, UnaryFunctor<elem::Sqrt, BinaryFunctor<elem::Plus, BinaryFunctor<elem::Div, UnaryFunctor<elem::sReLU, BinaryFunctor<elem::Minus, Assignee<3>, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Capture, Assignee<4> >, Assignee<4> > > >, Capture>, BinaryFunctor<elem::Mult, Assignee<4>, Assignee<4> > > >, Capture> >, BinaryFunctor<elem::Mult, Capture, Assignee<1> > > > > >, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>);
template void Element<Assign<Var<1>, BinaryFunctor<elem::Minus, Assignee<1>, BinaryFunctor<elem::Mult, Capture, BinaryFunctor<elem::Plus, BinaryFunctor<elem::Div, BinaryFunctor<elem::Div, Assignee<2>, Capture>, BinaryFunctor<elem::Plus, UnaryFunctor<elem::Sqrt, BinaryFunctor<elem::Plus, BinaryFunctor<elem::Div, UnaryFunctor<elem::sReLU, BinaryFunctor<elem::Minus, Assignee<3>, BinaryFunctor<elem::Div, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Capture, Assignee<4> >, Assignee<4> >, Capture> > >, Capture>, BinaryFunctor<elem::Div, BinaryFunctor<elem::Mult, Assignee<4>, Assignee<4> >, Capture> > >, Capture> >, BinaryFunctor<elem::Mult, Capture, Assignee<1> > > > > >, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase> >(Assign<Var<1>, BinaryFunctor<elem::Minus, Assignee<1>, BinaryFunctor<elem::Mult, Capture, BinaryFunctor<elem::Plus, BinaryFunctor<elem::Div, BinaryFunctor<elem::Div, Assignee<2>, Capture>, BinaryFunctor<elem::Plus, UnaryFunctor<elem::Sqrt, BinaryFunctor<elem::Plus, BinaryFunctor<elem::Div, UnaryFunctor<elem::sReLU, BinaryFunctor<elem::Minus, Assignee<3>, BinaryFunctor<elem::Div, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Capture, Assignee<4> >, Assignee<4> >, Capture> > >, Capture>, BinaryFunctor<elem::Div, BinaryFunctor<elem::Mult, Assignee<4>, Assignee<4> >, Capture> > >, Capture> >, BinaryFunctor<elem::Mult, Capture, Assignee<1> > > > > >, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>);
template void Element<Assign<Var<1>, BinaryFunctor<elem::Minus, Assignee<1>, BinaryFunctor<elem::Mult, Capture, BinaryFunctor<elem::Plus, BinaryFunctor<elem::Div, BinaryFunctor<elem::Div, Assignee<2>, Capture>, BinaryFunctor<elem::Plus, UnaryFunctor<elem::Sqrt, BinaryFunctor<elem::Div, BinaryFunctor<elem::Mult, Assignee<4>, Assignee<4> >, Capture> >, Capture> >, BinaryFunctor<elem::Mult, Capture, Assignee<1> > > > > >, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase> >(Assign<Var<1>, BinaryFunctor<elem::Minus, Assignee<1>, BinaryFunctor<elem::Mult, Capture, BinaryFunctor<elem::Plus, BinaryFunctor<elem::Div, BinaryFunctor<elem::Div, Assignee<2>, Capture>, BinaryFunctor<elem::Plus, UnaryFunctor<elem::Sqrt, BinaryFunctor<elem::Div, BinaryFunctor<elem::Mult, Assignee<4>, Assignee<4> >, Capture> >, Capture> >, BinaryFunctor<elem::Mult, Capture, Assignee<1> > > > > >, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>);
template void Element<Assign<Var<1>, BinaryFunctor<elem::Minus, Assignee<1>, BinaryFunctor<elem::Mult, Capture, BinaryFunctor<elem::Plus, BinaryFunctor<elem::Div, BinaryFunctor<elem::Div, Assignee<2>, Capture>, BinaryFunctor<elem::Plus, UnaryFunctor<elem::Sqrt, BinaryFunctor<elem::Div, Assignee<3>, Capture> >, Capture> >, BinaryFunctor<elem::Mult, Capture, Assignee<1> > > > > >, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase> >(Assign<Var<1>, BinaryFunctor<elem::Minus, Assignee<1>, BinaryFunctor<elem::Mult, Capture, BinaryFunctor<elem::Plus, BinaryFunctor<elem::Div, BinaryFunctor<elem::Div, Assignee<2>, Capture>, BinaryFunctor<elem::Plus, UnaryFunctor<elem::Sqrt, BinaryFunctor<elem::Div, Assignee<3>, Capture> >, Capture> >, BinaryFunctor<elem::Mult, Capture, Assignee<1> > > > > >, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>);
template void Element<Assign<Var<1>, BinaryFunctor<elem::Minus, Assignee<1>, BinaryFunctor<elem::Mult, Capture, BinaryFunctor<elem::Plus, BinaryFunctor<elem::Div, BinaryFunctor<elem::Div, Assignee<2>, Capture>, BinaryFunctor<elem::Plus, UnaryFunctor<elem::Sqrt, BinaryFunctor<elem::Plus, BinaryFunctor<elem::Div, UnaryFunctor<elem::sReLU, BinaryFunctor<elem::Plus, BinaryFunctor<elem::Minus, Assignee<3>, BinaryFunctor<elem::Div, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Capture, Assignee<5> >, Assignee<4> >, Capture> >, BinaryFunctor<elem::Div, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Capture, Assignee<4> >, Assignee<4> >, Capture> > >, Capture>, BinaryFunctor<elem::Div, BinaryFunctor<elem::Mult, Assignee<4>, Assignee<4> >, Capture> > >, Capture> >, BinaryFunctor<elem::Mult, Capture, Assignee<1> > > > > >, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase> >(Assign<Var<1>, BinaryFunctor<elem::Minus, Assignee<1>, BinaryFunctor<elem::Mult, Capture, BinaryFunctor<elem::Plus, BinaryFunctor<elem::Div, BinaryFunctor<elem::Div, Assignee<2>, Capture>, BinaryFunctor<elem::Plus, UnaryFunctor<elem::Sqrt, BinaryFunctor<elem::Plus, BinaryFunctor<elem::Div, UnaryFunctor<elem::sReLU, BinaryFunctor<elem::Plus, BinaryFunctor<elem::Minus, Assignee<3>, BinaryFunctor<elem::Div, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Capture, Assignee<5> >, Assignee<4> >, Capture> >, BinaryFunctor<elem::Div, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Capture, Assignee<4> >, Assignee<4> >, Capture> > >, Capture>, BinaryFunctor<elem::Div, BinaryFunctor<elem::Mult, Assignee<4>, Assignee<4> >, Capture> > >, Capture> >, BinaryFunctor<elem::Mult, Capture, Assignee<1> > > > > >, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>);
template void Element<Assign<Var<1>, BinaryFunctor<elem::Minus, Assignee<1>, BinaryFunctor<elem::Mult, Capture, BinaryFunctor<elem::Plus, BinaryFunctor<elem::Div, BinaryFunctor<elem::Div, Assignee<2>, Capture>, BinaryFunctor<elem::Plus, UnaryFunctor<elem::Sqrt, BinaryFunctor<elem::Div, Assignee<3>, Capture> >, Capture> >, BinaryFunctor<elem::Mult, Capture, Assignee<1> > > > > >, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase> >(Assign<Var<1>, BinaryFunctor<elem::Minus, Assignee<1>, BinaryFunctor<elem::Mult, Capture, BinaryFunctor<elem::Plus, BinaryFunctor<elem::Div, BinaryFunctor<elem::Div, Assignee<2>, Capture>, BinaryFunctor<elem::Plus, UnaryFunctor<elem::Sqrt, BinaryFunctor<elem::Div, Assignee<3>, Capture> >, Capture> >, BinaryFunctor<elem::Mult, Capture, Assignee<1> > > > > >, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>);

View File

@ -74,7 +74,7 @@ public:
}
}
void free(Tensor& t) { allocator_->free(t->memory()); }
void free(const Tensor& t) { allocator_->free(t->memory()); }
Tensor asTensor() {
auto mem = allocator_->memory();

View File

@ -72,7 +72,7 @@ class MPIWrapper : public IMPIWrapper
public:
MPIWrapper(bool multiThreaded) {
int requiredThreadingMode = multiThreaded ? MPI_THREAD_MULTIPLE : MPI_THREAD_SINGLE;
int requiredThreadingMode = multiThreaded ? MPI_THREAD_MULTIPLE : MPI_THREAD_FUNNELED; // FUNNELED means only one thread ever calls MPI
int argc = 1; char* argv[] = { const_cast<char*>("this.exe") }; char** argvp = argv; // dummy argc/argv since MPI_Init needs something here
int providedThreadingMode;
@ -124,6 +124,8 @@ public:
HANDLE_MPI_ERROR(MPI_Recv(buf, (int)count, datatype, (int)sourceRank, tag, comm, status));
}
// All-reduces 'count' elements of 'datatype' in 'sendbuf' into 'recvbuf'
// across 'comm', combining values with 'op'. Thin wrapper over MPI_Allreduce
// with error checking via HANDLE_MPI_ERROR.
virtual void allReduce(const void* sendbuf, void* recvbuf, size_t count, MPI_Datatype datatype, MPI_Op op, MPI_Comm comm) const override {
// The MPI standard mandates MPI_IN_PLACE instead of aliasing send and
// receive buffers; MS-MPI actually enforces this, so rewrite the aliased case.
if (sendbuf == recvbuf)
sendbuf = MPI_IN_PLACE; // MSMPI requires this
// note: 'count' is narrowed to int, as required by the MPI C API
HANDLE_MPI_ERROR(MPI_Allreduce(sendbuf, recvbuf, (int)count, datatype, op, comm));
}
virtual void finalize() override {

View File

@ -203,7 +203,6 @@ public:
void swapParams(const std::vector<Tensor>& paramShards) const override {
// Update all graphs with parameter shard
auto gather = [this, paramShards](size_t idx, size_t begin, size_t end) {
ABORT_IF(end - begin != paramShards[idx]->size(), "inconsistent shard size (swapParams, [{}], {} vs {})??", idx, end-begin, paramShards[idx]->size());
// Copy parameter shard to each graph, apart from last graph

View File

@ -4,10 +4,11 @@
#include "3rd_party/threadpool.h"
#include "tensors/gpu/cuda_helpers.h"
#include "common/timer.h"
// Generated by NCCL make files in build/nccl/include;
// include dir has been set in CMake files. NCCL add version number etc.
#include "nccl.h"
#include <cuda_runtime.h>
#if (NCCL_MAJOR<3 || NCCL_MINOR<2)
@ -44,6 +45,14 @@ private:
}
}
void synchronizeAllOnNullStream() const {
for (int i = 0; i < graphs_.size(); ++i) {
auto backend = graphs_[i]->params()->vals()->getBackend();
backend->setDevice();
backend->synchronize(); // note: synchronize() does not set the device by itself
}
}
// Identifier of this MPI process, for use in log messages.
// Returns the empty string when not running under MPI (mpi_ is null).
std::string mpiIdStr() const {
if (!mpi_)
return std::string();
return mpi_->idStr();
}
@ -150,6 +159,16 @@ public:
CUDA_CHECK(cudaStreamCreate(&streams_[i]));
}
// Note: due to a bug in NCCL 2.3.5, NCCL's allocation of shared memory intermittently fails with
// Failed, NCCL error 2 'unhandled system error' - ncclGroupEnd()
// include/shm.h:26 NCCL WARN Unable to allocate shared memory (4263936 bytes) : Interrupted system call
// This is caused by SIGPROF signals being raised, causing EINTR, which NCCL does not handle.
// Reported as Issue #137 on the NCCL Github, and supposedly fixed for 2.3.7 (to be verified).
// To work around, we disable the SIGPROF signal during NCCL initialization.
#define SIG_BAD 27 // SIGPROF
BlockSignal blockThread(SIG_BAD, pthread_sigmask); // Note: I don't know yet which of these two makes the difference.
BlockSignal blockProc(SIG_BAD, sigprocmask); // So for now just block both.
// set up NCCL
// Since we want to use MPI, we cannot use NCCL's handy convenience function. Instead, we must go the laborious route.
// cf. https://docs.nvidia.com/deeplearning/sdk/nccl-developer-guide/index.html#multidevprothrd
@ -160,35 +179,19 @@ public:
NCCL_CHECK(ncclGetUniqueId(&uniqueId));
if (mpi_) {
//LOG(info, "[{}] before bcast", mpiIdStr());
static_assert(sizeof(uniqueId) == NCCL_UNIQUE_ID_BYTES, "wrong NCCL_UNIQUE_ID_BYTES??"); // (this value is used in NVidia examples)
mpi_->bCast(&uniqueId, sizeof(uniqueId), MPI_BYTE, 0);
//LOG(info, "[{}] after bcast", mpiIdStr());
}
//mpiBarrier(); // should not be needed since bCast is a barrier
// Note: due to a bug in NCCL 2.3.5, NCCL's allocation of shared memory intermittently fails with
// Failed, NCCL error 2 'unhandled system error' - ncclGroupEnd()
// include/shm.h:26 NCCL WARN Unable to allocate shared memory (4263936 bytes) : Interrupted system call
// This is caused by SIGPROF signals being raised, causing EINTR, which NCCL does not handle.
// Reported as Issue #137 on the NCCL Github.
// To work around, we disable the SIGPROF signal during NCCL initialization.
#define SIG_BAD 27 // SIGPROF
BlockSignal blockThread(SIG_BAD, pthread_sigmask); // Note: I don't know yet which of these two makes the difference.
BlockSignal blockProc(SIG_BAD, sigprocmask); // So for now just block both.
groupStart();
for (int localDeviceIndex = 0; localDeviceIndex < devices_.size(); localDeviceIndex++) {
CUDA_CHECK(cudaSetDevice(devices_[localDeviceIndex]));
//LOG(info, "[{}] ncclCommInitRank {} out of {}: GPU[{}]", mpiIdStr(), myNcclRank(localDeviceIndex), numNcclRanks(), localDeviceIndex);
NCCL_CHECK(ncclCommInitRank(&comms_[localDeviceIndex], numNcclRanks(), uniqueId, myNcclRank(localDeviceIndex)));
//LOG(info, "[{}] done ncclCommInitRank {} out of {}, GPU[{}]", mpiIdStr(), myNcclRank(localDeviceIndex), numNcclRanks(), localDeviceIndex);
}
groupEnd();
mpiBarrier(); // (synchronize the log messages)
LOG(debug, "NCCLCommunicator constructed successfully for {}", mpiIdStr());
LOG(info, "NCCLCommunicator constructed successfully.");
mpiBarrier(); // (synchronize the log messages)
}
@ -206,61 +209,46 @@ public:
for(size_t i = 0; i < graphs_.size(); ++i) {
size_t begin, end; std::tie
(begin, end) = localShardRange(i);
//std::cerr << "[" << mpiIdStr() << "] foreach " << begin << " " << end << std::endl;
try{
if (parallel)
threadResults_[i] = threadPool_.enqueue(func, i, begin, end);
//group.emplace_back(func, i, begin, end);
//threadPool_.enqueue([&](size_t i){
// func(i, begin, end);
//}, i);
else
func(i, begin, end);
}
catch (const std::exception& e) // something leaks thread handles
{
// keeping this around, in case the error still happens --@TODO: remove once this has not been observed anymore
LOG(info, "caught exception in foreach {}", i);
system("ps -T -A");
throw;
}
}
if (parallel)
for(size_t i = 0; i < graphs_.size(); ++i)
threadResults_[i].wait();
//for(auto& t : group) // (note: group is empty is not parallel)
// t.join();
}
void scatterReduce() const override {
synchronizeAllOnNullStream();
groupStart();
for(int i = 0; i < graphs_.size(); ++i) {
size_t begin, end; std::tie
(begin, end) = localShardRange(i);
//std::cerr << "[" << mpiIdStr() << "] scatterReduce " << begin << " " << end << std::endl;
auto grads = graphs_[i]->params()->grads();
const auto* sendbuf = grads->data();
auto* recvbuf = grads->subtensor(begin, end-begin)->data();
size_t bufsize = shardSize();
ABORT_IF(grads->subtensor(begin, end-begin)->size() != bufsize, "unexpected subtensor size??");
NCCL_CHECK(ncclReduceScatter(sendbuf, recvbuf, bufsize, ncclFloat, ncclSum, comms_[i], streams_[i]));
}
groupEnd();
//std::cerr << "scatterReduce submitted" << std::endl;
synchronizeAll();
//std::cerr << "scatterReduce completed" << std::endl;
}
// This distributes all 64 model shards to all 64 GPUs.
// @TODO: For unknown reasons, this takes longer than any other operation incl. scatterReduce().
// But both should have the same number of data transfers of the same size.
void allGather() const override {
synchronizeAllOnNullStream();
groupStart();
for(int i = 0; i < graphs_.size(); ++i) {
size_t begin, end; std::tie
(begin, end) = localShardRange(i);
//std::cerr << "[" << mpiIdStr() << "] allGather " << begin << " " << end << std::endl;
auto vals = graphs_[i]->params()->vals();
const auto* sendbuf = vals->subtensor(begin, end-begin)->data();
@ -281,14 +269,12 @@ catch (const std::exception& e) // something leaks thread handles
auto distributedParams = gatherState([&](size_t localDeviceIndex) {
std::vector<float> tmp;
distributedParamShards[localDeviceIndex]->get(tmp);
//LOG(info, "[{}] swapParams.getFn({}) -> size {}, ({}, {}, {}, ...)", mpiIdStr(), localDeviceIndex, tmp.size(), tmp[0], tmp[1], tmp[2]);
return tmp;
});
// Now all MPI processes hold an identical copy of a concatenation of all distributedParamShards[] across local and remote devices.
std::vector<float> localParams;
graphs_[0]->params()->vals()->get(localParams);
// Now all MPI processes hold an identical copy of params() (remember, we assumed all devices hold the same params()).
//LOG(info, "[{}] swapParams: distributedParams.size = {}, localParams.size = {}", mpiIdStr(), distributedParams.size(), localParams.size());
ABORT_IF(distributedParams.size() != localParams.size(), "distributed sharded and local params have different size??");
// swap
@ -331,7 +317,6 @@ catch (const std::exception& e) // something leaks thread handles
tmp = getFn(localDeviceIndex);
localData.insert(localData.end(), tmp.begin(), tmp.end());
}
//LOG(info, "[{}] gatherState: localData.size = {}", mpiIdStr(), localData.size());
// second, concatenate across MPI processes
// Note that all local devices occupy consecutive ncclRanks in order.
std::vector<float> data;

View File

@ -3,6 +3,7 @@
#include "common/definitions.h"
#include "functional/functional.h"
#include "tensors/tensor_operators.h"
#include "optimizers/optimizers.h"
namespace marian {
@ -12,18 +13,33 @@ namespace marian {
*/
class ExponentialSmoothing {
public:
ExponentialSmoothing(float decay = 0.0f)
: mvAvg_{decay > 0}, mvDecay_{decay} {}
// Constructs smoothing parameters from the "exponential-smoothing" option,
// which must hold one or two numbers:
//  [0]: decay factor applied per update (0 disables smoothing)
//  [1]: optional reference batch size in target words that the decay factor
//       was specified for (enables batch-size correction of the decay)
ExponentialSmoothing(Ptr<Options> options) {
auto args = options->get<std::vector<float>>("exponential-smoothing");
ABORT_IF(args.size() < 1 || args.size() > 2, "exponential-smoothing parameter must be one or two numbers");
mvDecayBy_ = args[0];
if (args.size() > 1)
refBatchTrgWords_ = (size_t)args[1];
mvAvg_ = (mvDecayBy_ > 0); // smoothing is enabled iff a positive decay factor was given
}
protected:
void updateAvgParams(Tensor paramsAvg, Tensor params, size_t batches) {
void updateAvgParams(Tensor paramsAvg, Tensor params, size_t batches, size_t actualBatchTrgWords = OptimizerBase::mbSizeNotProvided) {
double beta = 1. - mvDecayBy_;
// correction term if batch size is different from what mvDecayBy_ was specified for
if (refBatchTrgWords_) {
ABORT_IF(actualBatchTrgWords == OptimizerBase::mbSizeNotProvided,
"This graph-group type does not support reference batch size specification for exponential-smoothing");
beta = pow(beta, (double)actualBatchTrgWords / (double)refBatchTrgWords_);
}
// reduce effect of decay parameter in early training stages
float decayBy = std::max(1.f - (float)beta,
1.f - (float)(batches + 1) / (float)(batches + 10));
using namespace functional;
float decay = std::max(mvDecay_,
1.f - (float)(batches + 1) / (float)(batches + 10));
Element(_1 = ((1.f - decay) * _1) + (decay * _2), paramsAvg, params);
Element(_1 = ((1.f - decayBy) * _1) + (decayBy * _2), paramsAvg, params);
}
bool mvAvg_{false};
float mvDecay_{1e-4f};
float mvDecayBy_{1e-4f}; // decay prior model by this factor
size_t refBatchTrgWords_{0}; // mvDecayBy_ is specified for this batch size (in target words)
};
} // namespace marian

View File

@ -33,6 +33,10 @@ public:
virtual void save(bool isFinal = false) = 0;
void validate() {
ABORT_IF(finalized_, "Training has already finished.");
}
virtual void finalize() {
finalized_ = true;
}
@ -48,6 +52,7 @@ public:
* The actual allowed size is then determined by multiplying it with the
* number of devices, which is passed in as the 'multiplier'.
*/
// @TODO: Can this be made const? It seems wrong to have a stateful method that still returns a result.
virtual Ptr<data::BatchStats> collectStats(Ptr<ExpressionGraph> graph,
Ptr<models::ModelBase> model,
size_t multiplier = 1) {
@ -194,10 +199,8 @@ public:
}
virtual void finalize() override {
if (mpi_) {
if (mpi_)
finalizeMPI(std::move(mpi_));
ABORT_IF(mpi_, "MPI not finalized??");
}
Base::finalize();
}
};

View File

@ -7,7 +7,7 @@ namespace marian {
AsyncGraphGroup::AsyncGraphGroup(Ptr<Options> config)
: GraphGroup(config),
ExponentialSmoothing{options_->get<float>("exponential-smoothing")},
ExponentialSmoothing(options_),
devices_{Config::getDevices(options_)},
shardSync_(devices_.size()),
optimizerDelay_{options_->get<size_t>("optimizer-delay")} {

View File

@ -55,7 +55,7 @@ public:
AsyncGraphGroup(Ptr<Options> config);
void update(Ptr<data::Batch> batch) override {
ABORT_IF(finalized_, "Training has already finished");
validate();
execute(batch);
}

View File

@ -376,7 +376,7 @@ public:
* Update any client model with given batch if batch is assigned to this node.
*/
void update(Ptr<data::Batch> batch) override {
ABORT_IF(finalized_, "Training has already finished");
validate();
// Only take batch assigned to this node
if(batchIter_ % mpi_->numMPIProcesses() == (size_t)mpi_->myMPIRank()) {
execute(batch);

View File

@ -143,7 +143,7 @@ public:
* Update any client model with given batch if batch is assigned to this node.
*/
void update(Ptr<data::Batch> batch) override {
ABORT_IF(finalized_, "Training has already finished");
validate();
if(batchIter_ % mpi_->numMPIProcesses() == mpi_->myMPIRank()) { // Only take batch assigned to this node
execute(batch);
}

View File

@ -25,7 +25,7 @@ private:
public:
SingletonGraph(Ptr<Options> config)
: GraphGroup(config),
ExponentialSmoothing(options_->get<float>("exponential-smoothing")) {
ExponentialSmoothing(config) {
// Get device ID
auto devices = Config::getDevices(options_);
ABORT_IF(devices.size() != 1, "Only one device ID should be provided for singleton training");
@ -40,7 +40,7 @@ public:
}
void update(Ptr<data::Batch> batch) override {
ABORT_IF(finalized_, "Training has already finished");
validate();
execute(batch);
}

View File

@ -4,7 +4,7 @@ namespace marian {
SyncGraphGroup::SyncGraphGroup(Ptr<Options> config)
: GraphGroup(config),
ExponentialSmoothing{options_->get<float>("exponential-smoothing")},
ExponentialSmoothing(config),
delay_{options_->get<size_t>("optimizer-delay")} { // @TODO: rename to something else; delay means delayed updated, not accumulation
mpi_ = initMPI(/*multiThreaded=*/false); // when not running under MPI, this will be a fake object that represents a one-MPI-process setup
@ -25,9 +25,16 @@ SyncGraphGroup::SyncGraphGroup(Ptr<Options> config)
// This part of the code will not special-case any of this here.
// Rather, it is assumed that the communicator knows to reduce unnecessary transfers to no-ops.
comm_ = createCommunicator(graphs_, /*noNccl=*/options_->get<bool>("no-nccl", false), /*mpi=*/mpi_);
auto type = utils::toUpper(devices_.front().typeAsString()) + "s";
if (mpi_->numMPIProcesses() > 1)
LOG(info, "[training] Using {} {}, distributed over {} MPI processes", mpi_->numMPIProcesses() * devices_.size(), type, mpi_->numMPIProcesses());
else
LOG(info, "[training] Using {} {}", devices_.size(), type);
}
void SyncGraphGroup::setScheduler(Ptr<Scheduler> scheduler) /*override*/ {
validate();
scheduler_ = scheduler;
// optimizer has to be registered last to see changes of learning rate
// @TODO: ^^Fix this comment. Either it refers to the scheduler, or it should be moved. Which one?
@ -101,31 +108,144 @@ void SyncGraphGroup::initializeAvg() {
}
Ptr<data::BatchStats> SyncGraphGroup::collectStats() {
// @TODO: This should only run on MPI process 0. Also we can share vv this vv expression with update().
size_t multiplier = devices_.size() * mpi_->numMPIProcesses() * delay_;
return GraphGroup::collectStats(graphs_[0], builders_[0], multiplier);
// @TODO: This is an incompatible change. Decide how to handle that.
//size_t multiplier = devices_.size() * mpi_->numMPIProcesses() * delay_;
return GraphGroup::collectStats(graphs_[0], builders_[0]/*, multiplier*/);
}
void SyncGraphGroup::update(Ptr<data::Batch> batch) /*override*/ {
ABORT_IF(finalized_, "Training has already finished");
// helper for MB scaling: quantize the ratio with a given error margin
// Rounds 'ratio' up to a multiple of the coarsest power of two for which the
// relative rounding error stays within 25%:
//  - using a 25% larger MB size should not break convergence
//  - @TODO: not using the first 25% of the next block is OK since those are dominated by data exchange
// Returns 'ratio' unchanged if it is 0 or if no quantization is acceptable.
static double roundUpRatio(double ratio) {
  if (ratio == 0)
    return ratio;
  // coarsest granularity to try: the largest power of two that fits into ratio
  double granularity = 1;
  for (; granularity * 2 < ratio; granularity *= 2)
    ;
  // try successively finer powers of two until rounding up to a multiple of
  // the granularity stays within the error margin
  const double maxError = 0.25;
  for (; granularity >= 1; granularity /= 2) {
    double candidate = std::ceil(ratio / granularity) * granularity;
    double relError = (candidate - ratio) / ratio;
    if (std::fabs(relError) <= maxError)
      return candidate;
  }
  return ratio; // no acceptable power-of-two quantization found
}
// distribute the batch over (delay, local device, MPI rank)
size_t numSubBatches = delay_ * devices_.size() * mpi_->numMPIProcesses();
auto subBatches = batch->split(numSubBatches);
subBatches.resize(numSubBatches); // pad with nullptrs if out of data
// helper routine that handles accumulation and load-balancing of sub-batches to fill all devices
// It adds 'newBatch' to 'pendingBatches_', and if sufficient batches have been queued, then
// returns 'pendingBatches_' in 'subBatches' and resets it. If not, it returns false.
// Accumulates 'newBatch' into pendingBatches_ and decides whether enough data
// has been queued to run one update across all local devices and MPI workers.
// Returns false while more data is needed. Once at least 'ratio' batches are
// pending, returns true; the queued batches are then shortened/re-split for
// MB-size warm-up and load balancing and moved into 'subBatches' (but see the
// NOTE below about the exact-match path).
bool SyncGraphGroup::tryGetSubBatches(Ptr<data::Batch> newBatch, std::vector<Ptr<data::Batch>>& subBatches) {
pendingBatches_.push_back(newBatch);
size_t warpSize = devices_.size() * mpi_->numMPIProcesses(); // warp := set of batches processed concurrently across GPUs and workers
size_t pendingTrgWords = 0; // diagnostics only: compute how many target labels are pending so far
for (const auto& batch : pendingBatches_)
pendingTrgWords += batch->wordsTrg();
// MB-size warm-up and dynamic scaling:
// 'ratio' is the number of unit batches the scheduler wants processed per update.
double ratio;
bool isDynamic = scheduler_->tryGetDynamicMBSizeMultiplier(ratio);
if (isDynamic)
ratio = roundUpRatio(ratio); // round up to full batches if within a certain error margin --@BUGBUG: Not invariant w.r.t. GPU size, as ratio is relative to what fits into 1 GPU
else // if dynamic scaling not enabled, then fill each GPU with a batch
ratio = (double)(delay_ * warpSize);
if (pendingBatches_.size() < ratio)
return false; // not enough data yet
// now we have enough to fill at least 'ratio' batches
// NOTE(review): this early return leaves 'subBatches' untouched and does not
// consume pendingBatches_ -- verify that the caller handles the exact-match
// case, or whether this path should also execute the std::move() below.
if (pendingBatches_.size() == ratio)
return true; // nothing to do, e.g. warm-up not enabled
// warm-up is happening
LOG_ONCE(info, "[training] Mini-batch-warmup enabled");
// shorten all batches a little to accurately reflect ratio
// e.g. ratio = 3.3 for 4 batches: Reduce each by 3.3/4
// Alternatively, we could just shorten the last 'warp', but that would not be invariant to warp size.
// NOTE(review): 'before'/'after' are accumulated but never read afterwards in
// this function -- presumably left over for diagnostics; confirm or remove.
size_t before = 0, after = 0;
for (auto& batch : pendingBatches_) {
// proportionally reduce each pending batch so that their total reflects 'ratio'
auto reducedBatchSize = (size_t)ceil((double)batch->size() * ratio / (double)pendingBatches_.size());
size_t minSize = 1;
if (pendingBatches_.size() == 1) { // enforce a minimum (only needed/correct if still in first batch)
size_t minTrgWords = 256; // don't go below this number of target words, as it seems excessive --@TODO: parameterize?
minSize = 1 + (minTrgWords * batch->size() - 1) / batch->wordsTrg(); // approximately convert minTrgWords into a #sentences
}
reducedBatchSize = std::max(reducedBatchSize, minSize);
before += batch->wordsTrg();
if (reducedBatchSize < batch->size())
batch = batch->split(/*numSubBatches=*/1, reducedBatchSize).front();
after += batch->wordsTrg();
}
// load-balance: distribute the last numWarps-group's batches over GPUs
// This is tricky since batches do not have the same length, therefore we can only split, but not merge.
auto numWarps = (pendingBatches_.size() - 1) / warpSize + 1; // = ceil(#buffers / (#GPUs * #workers))
auto availableBatches = numWarps * warpSize; // we got this many GPUs anyways, so we better make use of them
if (pendingBatches_.size() < availableBatches) {
// we are not using all available GPUs -> try to load-balance a bit better
auto fullBatches = (numWarps - 1) * warpSize;
auto expandLast = pendingBatches_.size() - fullBatches;
auto toLast = availableBatches - fullBatches;
LOG(info, "attempt to redistribute {} last batches over {}", expandLast, toLast);
auto splitInto = toLast / expandLast; // unfortunately we can only split in integer ratios
// @TODO: We can do better since the last batch is typically smaller.
if (splitInto > 1) {
// split each of last numWarps's batches into 'splitInto' batches
// pop them first
std::vector<Ptr<data::Batch>> batchesToSplit;
while (pendingBatches_.size() > fullBatches) {
batchesToSplit.push_back(pendingBatches_.back());
pendingBatches_.pop_back();
}
// now split them
for (auto& batchToSplit : batchesToSplit) {
LOG(info, "{}-way splitting batchToSplit with size {}", splitInto, batchToSplit->size());
auto splitBatches = batchToSplit->split(splitInto);
for (auto& splitBatch : splitBatches) {
LOG(info, " -> getting batchToSplit with size {}", splitBatch->size());
pendingBatches_.push_back(splitBatch);
}
}
}
ABORT_IF(pendingBatches_.size() > availableBatches, "somehow split into too many batches??");
}
// hand the accumulated batches to the caller and reset the queue
subBatches = std::move(pendingBatches_);
// @TODO: sort by width, so that in case of delay > 1, each GPU gets about the same size
return true;
}
void SyncGraphGroup::update(Ptr<data::Batch> newBatch) /*override*/ {
validate();
std::vector<Ptr<data::Batch>> subBatches;
bool gotSubBatches = tryGetSubBatches(newBatch, subBatches);
// not enough data yet: return right away
if (!gotSubBatches)
return;
// Helper to access the subBatches array
auto getSubBatch = [&](size_t t, size_t localDeviceIndex, size_t rank) {
auto getSubBatch = [&](size_t t, size_t localDeviceIndex, size_t rank) -> Ptr<data::Batch> {
// 't' (the delay) should be slowest changing dimension. If subBatches are sorted by
// length, then grouping sentences of similar length into the same delay step can
// reduce unnecessary time spent in padding.
return subBatches[(t * mpi_->numMPIProcesses() + rank) * devices_.size() + localDeviceIndex];
auto index = (t * mpi_->numMPIProcesses() + rank) * devices_.size() + localDeviceIndex;
if (index < subBatches.size())
return subBatches[index];
else
return nullptr;
};
// Upon very first execution, reset everything
if(first_) {
LOG(debug, "[{}] Processing first minibatch. Batches are processed as {} processes x {} GPUs/process x {} delay steps",
mpi_->idStr(), mpi_->numMPIProcesses(), devices_.size(), delay_);
LOG(info, "[training] Processing first minibatch. Batches are processed as {} processes x {} GPUs/process",
mpi_->numMPIProcesses(), devices_.size());
initialize(subBatches.front());
if(mvAvg_ && paramsAvg_.empty())
initializeAvg();
@ -133,33 +253,34 @@ void SyncGraphGroup::update(Ptr<data::Batch> batch) /*override*/ {
}
// Compute gradients
// This happens in multiple steps in case of delay_ > 1.
// This happens in multiple steps in case of delay > 1.
std::vector<float> localDeviceCosts(devices_.size(), 0.f); // [local device index] aggregate cost for each local device
for (size_t t = 0; t < delay_; t++) {
for (size_t t = 0; getSubBatch(t, 0, 0); t++) { // @TODO: rename 't' to 'delay'
// Execute single forward/backward step
auto forwardBackward = [&](size_t localDeviceIndex, size_t /*begin*/, size_t /*end*/) {
auto graph = graphs_[localDeviceIndex];
auto subBatch = getSubBatch(t, localDeviceIndex, mpi_->myMPIRank());
if(subBatch) {
timer::Timer timer;
auto costNode = builders_[localDeviceIndex]->build(graph, subBatch);
//LOG(info, timer.format(2, "after build: %ws"));
graph->forward();
//LOG(info, timer.format(2, "after forward (no sync): %ws"));
localDeviceCosts[localDeviceIndex] += costNode->scalar();
graph->backward(/*zero=*/t == 0); // only reset gradients to 0 if t = 0
//LOG(info, timer.format(2, "after backward (no sync): %ws"));
//localDeviceCosts[localDeviceIndex] += costNode->scalar(); // moved here for time measurements; @TODO: move this back
//LOG(info, timer.format(2, "after scalar() (that's a sync): %ws"));
}
else { // empty batch: execute do-nothing fw-bw step for proper inits and resets
#if 1 // @TODO: double-check whether the #else branch is the same; and if so, use it instead
graph->params()->allocateBackward();
if (t == 0) // these have already been sized
graph->params()->set_zero_adjoint();
#else
graph->clear(); // instead of build()
graph->forward();
graph->backward(/*zero=*/t == 0);
#endif
}
};
comm_->foreach(forwardBackward); // compute gradients in parallel on each device. Aggregate if delay_ > 1.
comm_->foreach(forwardBackward); // compute gradients in parallel on each device. Aggregate if delay > 1.
}
// At this point, each device on each MPI process has a gradient aggregated over a subset of the sub-batches.
@ -177,23 +298,25 @@ void SyncGraphGroup::update(Ptr<data::Batch> batch) /*override*/ {
Element(_1 = _1 / (float)div, curGrad);
}
// determine num words for dynamic hyper-parameter adjustment
size_t mbWords = OptimizerBase::mbSizeNotProvided;
if (options_->get<std::string>("cost-type") == "ce-sum") { // presently only supported for ce-sum
mbWords = 0;
for (const auto& batch : subBatches)
mbWords += batch->words(-1); // @TODO: use wordsTrg (it's the same)
}
// actual model update
shardOpt_[idx]->update(curParam, curGrad);
shardOpt_[idx]->update(curParam, curGrad, mbWords);
if(mvAvg_)
updateAvgParams(
paramsAvg_[idx], curParam, scheduler_->numberOfBatches());
paramsAvg_[idx], curParam, scheduler_->numberOfBatches(), mbWords);
};
timer::Timer timer;
comm_->scatterReduce(); // reduce gradients across all devices (globally) into shards
//LOG(info, timer.format(2, "after scatterReduce (has sync): %ws"));
comm_->foreach(update); // per-shard model-update
//LOG(info, timer.format(2, "after model update (no sync): %ws"));
//graphs_.front()->getBackend()->synchronize(); // @TODO: This is strictly for time measurement. Make sure it doesn't accidentally stay in here!!
//LOG(info, timer.format(2, "after model update sync (which is unnecessary except for time measurements): %ws"));
comm_->allGather(); // distribute param value shards back
//LOG(info, timer.format(2, "after allGather (has sync): %ws"));
// cost across all local devices (scheduler will aggregate cross-process)
float localCost = 0;
@ -202,7 +325,7 @@ void SyncGraphGroup::update(Ptr<data::Batch> batch) /*override*/ {
// if localCost is average-based, we need to turn the sum over devices into an average as well
if(options_->get<std::string>("cost-type") != "ce-sum")
localCost /= numSubBatches;
localCost /= subBatches.size();
if(scheduler_) {
// track and log localCost
@ -224,6 +347,7 @@ void SyncGraphGroup::update(Ptr<data::Batch> batch) /*override*/ {
}
void SyncGraphGroup::load() /*override*/ {
validate();
// This function loads the main parameters in the graphs.
// In case of exponential smoothing, we also need to restore paramsAvg_.
@ -253,10 +377,11 @@ void SyncGraphGroup::load() /*override*/ {
[&](const std::vector<float>& optimizerStateVector, const OptimizerBase::ScatterStateSetFunc& setShardFn) {
comm_->scatterState(optimizerStateVector, setShardFn);
});
LOG(info, "[training] Model reloaded from {}", name);
} else if(options_->has("pretrained-model")) {
std::string nameInit = options_->get<std::string>("pretrained-model");
LOG(info,
"Initialize model weights with the pre-trained model {}",
"[training] Initializing model weights with the pre-trained model {}",
nameInit);
size_t i = 0;
@ -267,44 +392,34 @@ void SyncGraphGroup::load() /*override*/ {
}
void SyncGraphGroup::save(bool final) /*override*/ {
validate();
barrier(); // (for better grouping of log messages)
//LOG(info, "[{}] save() line {}!", this->mpi_->idStr(), __LINE__);
// do final validation
if(final && scheduler_) {
// bring the smoothed model in
// Note that it is sharded. For multi-node, it is sharded over multiple machines, so this is a network access.
// Also note that the swap must run on all MPI processes concurrently, although only one actually validates.
//LOG(info, "[{}] save() line {}", this->mpi_->idStr(), __LINE__);
swapParamsAvg();
//LOG(info, "[{}] save() line {}", this->mpi_->idStr(), __LINE__);
if (isMainProcess()) // in multi-node, only first MPI process saves the model (they are all identical)
scheduler_->validate(graphs_, true);
//LOG(info, "[{}] save() line {}", this->mpi_->idStr(), __LINE__);
swapParamsAvg();
//LOG(info, "[{}] save() line {}", this->mpi_->idStr(), __LINE__);
}
std::string name = options_->get<std::string>("model");
//LOG(info, "[{}] save() line {}", this->mpi_->idStr(), __LINE__);
barrier(); // (for better grouping of log messages)
//LOG(info, "[{}] save() line {}", this->mpi_->idStr(), __LINE__);
// if smoothing then save original (unsmoothed) parameters as well
// @TODO: Check whether we are reloading the correct file (the unsmoothed one).
if(mvAvg_ && paramsAvg_.size() > 0 && isMainProcess()) // only save from one MPI process
// Save the original parameters in model.npz.orig.npz
builders_[0]->save(graphs_[0], name + ".orig.npz", true);
// Temporarily switch to the averaged parameters
// Note: the smoothed model is sharded across GPUs, and across MPI processes if applicable. This brings it into MPI process[*].device[*]
//LOG(info, "[{}] save() line {}", this->mpi_->idStr(), __LINE__);
swapParamsAvg();
//LOG(info, "[{}] save() line {}", this->mpi_->idStr(), __LINE__);
// save main model file
if (isMainProcess()) { // only save from one MPI process
// if not overwrite then save a copy with number of updates in the model pathname
//LOG(info, "[{}] save() line {}", this->mpi_->idStr(), __LINE__);
if(!options_->get<bool>("overwrite") && !final) {
std::string numberOfBatches
= scheduler_ ? std::to_string(scheduler_->numberOfBatches())
@ -313,40 +428,34 @@ void SyncGraphGroup::save(bool final) /*override*/ {
nameOverwrite.replace(name.size() - 4, 4, ".iter" + numberOfBatches + ".npz"); // @TODO: use insert?
builders_[0]->save(graphs_[0], nameOverwrite);
}
//LOG(info, "[{}] save() line {}", this->mpi_->idStr(), __LINE__);
// save main model file
builders_[0]->save(graphs_[0], name, true);
//LOG(info, "[{}] save() line {}", this->mpi_->idStr(), __LINE__);
// save scheduler-related state
if (scheduler_)
scheduler_->save(name);
//LOG(info, "[{}] save() line {}", this->mpi_->idStr(), __LINE__);
}
// Switch back to the original parameters
//LOG(info, "[{}] save() line {}", this->mpi_->idStr(), __LINE__);
swapParamsAvg();
//LOG(info, "[{}] save() line {}", this->mpi_->idStr(), __LINE__);
#if 0 // temporary, for testing of saving distributed models; must be identical to .orig.npz
if(mvAvg_ && paramsAvg_.size() > 0 && isMainProcess())
builders_[0]->save(graphs_[0], name + ".orig_after_swapping.npz", true);
#endif
//LOG(info, "[{}] save() line {}", this->mpi_->idStr(), __LINE__);
barrier(); // (for better grouping of log messages)
//LOG(info, "[{}] save() line {}", this->mpi_->idStr(), __LINE__);
// persist optimizer state
//LOG(info, "[{}] save() line {}", this->mpi_->idStr(), __LINE__);
LOG(info, "[{}] save() line {}", this->mpi_->idStr(), __LINE__);
shardOpt_[0]->save(name + ".optimizer.npz", shardOpt_,
[&](const OptimizerBase::GatherStateGetFunc& getShardFn) {
return comm_->gatherState(getShardFn);
},
isMainProcess());
//LOG(info, "[{}] save() line {}", this->mpi_->idStr(), __LINE__);
LOG(info, "[{}] save() line {}", this->mpi_->idStr(), __LINE__);
barrier(); // (for better grouping of log messages)
//LOG(info, "[{}] save() line {}", this->mpi_->idStr(), __LINE__);
}
// Shuts down this graph group at the end of training.
// Calls validate() first (internal sanity check), then hands the MPI handle
// over to finalizeMPI() -- note the std::move: mpi_ is relinquished here and
// must not be used afterwards -- and finally chains to the base-class
// finalize(). NOTE(review): finalizeMPI() presumably performs a collective
// shutdown and must run on all MPI ranks concurrently -- confirm against its
// definition.
void SyncGraphGroup::finalize() /*override*/ {
validate();
finalizeMPI(std::move(mpi_));
Base::finalize();
}
} // namespace marian

View File

@ -7,6 +7,7 @@
namespace marian {
class SyncGraphGroup : public GraphGroup, public ExponentialSmoothing {
using Base = GraphGroup;
const size_t delay_{ 1 }; // optimizer-delay parameter
Ptr<ICommunicator> comm_; // [not null] communicator, e.g. NCCLCommunicator
@ -23,7 +24,10 @@ class SyncGraphGroup : public GraphGroup, public ExponentialSmoothing {
std::vector<Ptr<TensorAllocator>> paramsAllocs_; // [deviceIndex] we must hold a reference to the memory until this class dies
// @TODO: move this nto ExponentialSmoothing, together with paramsAvg_?
bool first_{ true }; // gets interpreted and cleared by update()
// state for update()
bool first_{ true }; // gets interpreted and cleared by update()
std::vector<Ptr<data::Batch>> pendingBatches_; // in case of delay, multi-worker, and/or multi-GPU, we buffer up batches
size_t typicalTrgWords_{}; // typical batch size in words (labels); remembered from collectStats()
void initialize(const Ptr<data::Batch>& exampleBatch);
void initializeAvg();
@ -32,6 +36,8 @@ class SyncGraphGroup : public GraphGroup, public ExponentialSmoothing {
void barrier() const { mpi_->barrier(); } // (we need this several times)
void swapParamsAvg() { if (mvAvg_ && paramsAvg_.size() > 0) comm_->swapParams(paramsAvg_); } // note: must call this on all MPI ranks in parallel
bool tryGetSubBatches(Ptr<data::Batch> newBatch, std::vector<Ptr<data::Batch>>& subBatches);
public:
SyncGraphGroup(Ptr<Options> config);
@ -42,6 +48,8 @@ public:
void load() override;
void save(bool final = false) override;
void finalize() override;
Ptr<data::BatchStats> collectStats();
// @TODO: consider to make this a virtual as well? Currently it is a template dispatch
};

View File

@ -13,12 +13,34 @@ private:
std::vector<Ptr<ValidatorBase>> validators_;
bool first_{true};
size_t typicalTrgBatchWords_{0}; // for dynamic batch sizing
Ptr<TrainingState> state_;
timer::Timer timer_, heartBeatTimer_;
float getLearningRate(TrainingState& state) {
// determine LR decay factor from --lr-decay-inv-sqrt option
// Computes the multiplicative learning-rate decay factor from the
// --lr-decay-inv-sqrt option (inverse-square-root decay). The option takes
// one or two numbers with scheduling units: the first is the decay constant,
// the optional second shifts the point at which decay begins. While decay is
// disabled or progress has not yet passed the start point, the factor is 1.
float getLearningRateDecayFactor(const TrainingState& state) const {
  auto decayArgs = options_->get<std::vector<std::string>>("lr-decay-inv-sqrt");
  ABORT_IF(decayArgs.empty() || decayArgs.size() > 2, "--lr-decay-inv-sqrt argument must be one or two numbers with units");
  auto decaySpec = SchedulingParameter::parse(decayArgs[0]);
  size_t progress = state.getProgressIn(decaySpec.unit);
  size_t decayStartsAt = decaySpec.n; // by default, decay starts after 'n' units
  if (decayArgs.size() > 1) { // optional second argument overrides the start point
    auto startSpec = SchedulingParameter::parse(decayArgs[1]);
    ABORT_IF(startSpec && startSpec.unit != decaySpec.unit, "both --lr-decay-inv-sqrt arguments must have the same unit");
    decayStartsAt = startSpec.n;
  }
  if (!decaySpec || progress <= decayStartsAt)
    return 1.f; // decay disabled, or not yet reached the start point
  // shift progress so that the factor is exactly 1 at progress == decayStartsAt
  auto shiftedProgress = progress - decayStartsAt + decaySpec.n;
  return (float)std::sqrt((double)decaySpec.n / (double)shiftedProgress);
}
// determine the dynamically adjusted learning rate, incl. warm-up and decay
float getLearningRate(const TrainingState& state) const {
float baselr = options_->get<float>("learn-rate");
float mult1 = 1.f;
@ -29,11 +51,7 @@ private:
mult1 = std::min(1.f, (float)bno / (float)warmup.n);
}
float mult2 = 1.f;
auto decayGoogle = SchedulingParameter::parse(options_->get<std::string>("lr-decay-inv-sqrt"));
if(decayGoogle) {
mult2 = std::min(1.f, (float)(std::sqrt(decayGoogle.n) / std::sqrt(state.getProgressIn(decayGoogle.unit))));
}
float mult2 = getLearningRateDecayFactor(state);
baselr = baselr * mult1 * mult2;
@ -45,6 +63,54 @@ private:
}
public:
// Records the typical number of target words (labels) per batch, as estimated
// externally (e.g. by the batch generator); tryGetDynamicMBSizeMultiplier()
// uses this value as the reference actual batch size when scaling towards a
// user-given reference label count. Logs the estimate on every call.
void setTypicalTrgBatchWords(size_t typicalTrgBatchWords) { // needed for tryGetDynamicMBSizeMultiplier()
typicalTrgBatchWords_ = typicalTrgBatchWords;
LOG(info, "batch size estimate is {} target words", typicalTrgBatchWords_);
}
// determine dynamic MB size, if respective parameters are given (return false if not)
// Determines the dynamic mini-batch size multiplier 'ratio' (out-parameter),
// if dynamic mini-batch scaling is enabled via --mini-batch-warmup.
// Returns false if the option is not given (caller then uses the static size).
// The resulting ratio combines up to three factors:
//  1. warm-up progress within the --mini-batch-warmup period,
//  2. optional rescaling to a reference label count (second option argument),
//  3. optional growth inverse to the LR decay factor (--mini-batch-track-lr).
// NOTE(review): once progress exceeds the warm-up period, progressRatio > 1,
// so the returned ratio can exceed 1 -- presumably intended (batches keep
// growing with LR tracking), but confirm against the caller's clamping.
bool tryGetDynamicMBSizeMultiplier(double /*out*/ &ratio) const {
auto mbWarmupOpts = options_->get<std::vector<std::string>>("mini-batch-warmup");
ABORT_IF(mbWarmupOpts.empty() || mbWarmupOpts.size() > 2, "--mini-batch-warmup argument must be one or two numbers with units");
auto mbWarmup = SchedulingParameter::parse(mbWarmupOpts[0]);
if (!mbWarmup)
return false;
ratio = 1.0;
// mini-batch-warmup
LOG_ONCE(info, "[scheduler] Mini-batch size warmup {}", std::string(mbWarmup));
// This scales MB size up from the start.
// now scale batch size relative to progress within warm-up period
size_t progress = state_->getProgressIn(mbWarmup.unit); // number of updates/labels processed
auto progressRatio = (double)progress / (double)mbWarmup.n; // where are we relatively within target warm-up period
// if the warm-up target is given in labels, grow the batch with the sqrt of
// progress instead of linearly (total labels grow quadratically in updates)
if (mbWarmup.unit == SchedulingUnit::trgLabels)
progressRatio = std::sqrt(progressRatio);
// apply ratio to actual batch size
ratio *= progressRatio;
// adjust for reference batch size if given
// At progress == mbWarmup.n (ratio=1), we would like to have refBatchLabels instead of whichever
// the actual batch size is. We approximate the latter as typicalTrgBatchWords_, and scale ratio accordingly.
if (mbWarmupOpts.size() > 1) {
ABORT_IF(typicalTrgBatchWords_ == 0, "dynamic scaling with words target requires MB size to be known in words"); // happens if MB size is specified in sentences
auto refBatchLabels = (size_t)std::stoull(mbWarmupOpts[1]);
LOG_ONCE(info, "[scheduler] Scaling to {} reference labels. Typical actual batch words is {}", refBatchLabels, typicalTrgBatchWords_);
ratio *= (double)refBatchLabels / (double)typicalTrgBatchWords_;
}
// dynamic MB-size tracking with learning rate
// As LR goes down, MB gets ramped up by the same ratio, which has been found to be safe.
auto mbTracking = options_->get<bool>("mini-batch-track-lr");
if (mbTracking) {
auto lrFactor = getLearningRateDecayFactor(*state_);
if (lrFactor != 1)
LOG_ONCE(info, "[scheduler] Dynamic mini-batch size adjustment enabled and kicking in");
ratio /= lrFactor; // lrFactor <= 1, so this grows the batch as LR decays
}
return true;
}
Scheduler(Ptr<Options> options, Ptr<TrainingState> state)
: options_(options), state_(state) {
state_->eta = getLearningRate(*state);
@ -120,12 +186,13 @@ public:
float value = validator->validate(graphs);
if(validator->stalled() > 0) {
LOG_VALID(info,
"Ep. {} : Up. {} : {} : {} : stalled {} times",
"Ep. {} : Up. {} : {} : {} : stalled {} times (last best: {})",
state_->epochs,
state_->batches,
validator->type(),
value,
validator->stalled());
validator->stalled(),
validator->lastBest());
} else {
LOG_VALID(info,
"Ep. {} : Up. {} : {} : {} : new best",
@ -170,13 +237,12 @@ public:
size_t batchLabels = 0; // number of target words in batch
for(const auto& batch : batches) {
if (batch) { // (nullptr is allowed as result of split)
batchSize += batch->size();
batchLabels += batch->words(-1);
}
batchSize += batch->size();
batchLabels += batch->words(-1);
}
// extrapolate cost across MPI processes, so that we have numbers in the right range
// Since batchLabels is counted across all MPI processes, we also should temporarily
// extrapolate cost across MPI processes, to have numbers in the right range.
// When doing the actual log, we then aggregate across MPI processes to get the accurate number.
if (mpi)
cost *= mpi->numMPIProcesses(); // @BUGBUG: this is presently correct for ce-sum, but possibly not the av-based losses
@ -203,42 +269,42 @@ public:
state_->samplesEpoch += batchSize; // sentences processed in this epoch
state_->labelsTotal += batchLabels; // total labels processed
state_->newBatch();
state_->newUpdate(batches.size());
if(state_->enteredNewPeriodOf(options_->get<std::string>("disp-freq")) ||
state_->batches <= options_->get<size_t>("disp-first")) {
// if MPI then aggregate precise cost across workers
if (mpi) {
//LOG(info, "all-reducing cost from {}", state_->costSum);
state_->costSum /= mpi->numMPIProcesses(); // undo the extra scaling
mpi->allReduce(&state_->costSum, &state_->costSum, 1, MPI_FLOAT, MPI_SUM);
//LOG(info, "all-reduced cost to {}", state_->costSum);
}
if (mpi && mpi->myMPIRank() != 0)
; // skip the report on alternate worker processes
else if(dispLabelCounts) {
if(options_->get<bool>("lr-report")) { // if true then show the learning rate
LOG(info,
"Ep. {} : Up. {} : Sen. {} : Cost {:.8f} * {} after {} : Time {:.2f}s : {:.2f} "
"Ep. {} : Up. {} : Sen. {} : Cost {:.8f} * {} @ {} after {} : Time {:.2f}s : {:.2f} "
"words/s : L.r. {:.4e}",
state_->epochs,
state_->batches,
utils::withCommas(state_->samplesEpoch),
state_->costSum / state_->costCount,
utils::withCommas(state_->costCount), // show cost as "av * count"
batchLabels,
utils::withCommas(state_->labelsTotal),
timer_.elapsed(),
state_->wordsDisp / timer_.elapsed(),
state_->eta);
} else {
LOG(info,
"Ep. {} : Up. {} : Sen. {} : Cost {:.8f} * {} after {} : Time {:.2f}s : {:.2f} "
"Ep. {} : Up. {} : Sen. {} : Cost {:.8f} * {} @ {} after {} : Time {:.2f}s : {:.2f} "
"words/s",
state_->epochs,
state_->batches,
utils::withCommas(state_->samplesEpoch),
state_->costSum / state_->costCount,
utils::withCommas(state_->costCount),
batchLabels,
utils::withCommas(state_->labelsTotal),
timer_.elapsed(),
state_->wordsDisp / timer_.elapsed());
@ -272,12 +338,14 @@ public:
}
// progress heartbeat for MS-internal Philly compute cluster
// This environment variable exists when running on the cluster.
using namespace std::chrono;
if((!mpi || mpi->myMPIRank() == 0) && getenv("PHILLY_JOB_ID")
&& heartBeatTimer_.elapsed<std::chrono::minutes>() >= 10) {
printf("PROGRESS: %.2f%%\nEVALERR: %.7f\n", (double)state_->epochs, state_->costSum / state_->costCount), fflush(stdout);
#if 0
LOG(info, "heart beat after {} updates", state_->batches);
#endif
printf("PROGRESS: %.2f%%\nEVALERR: %.7f%%\n",
(double)state_->epochs,
state_->costSum / state_->costCount / (mpi ? mpi->numMPIProcesses() : 1));
fflush(stdout);
std::cout << "MBSIZE: " << batchLabels << " after " << state_->batches << " updates = " << state_->labelsTotal << " labels" << std::endl << std::flush;
heartBeatTimer_.start();
}
}

View File

@ -56,7 +56,9 @@ public:
}
auto batchGenerator = New<CorpusBatchGenerator>(dataset, options_, stats);
scheduler->registerTrainingObserver(batchGenerator);
scheduler->setTypicalTrgBatchWords(batchGenerator->estimateTypicalTrgBatchWords()); // needed for dynamic MB scaling
auto model = New<ModelWrapper>(options_);
model->setScheduler(scheduler);
@ -85,12 +87,14 @@ public:
}
scheduler->finished();
model->finalize();
// Avoid saving the model twice if it has been loaded and training did not
// progress
if(!trainState->loaded)
model->save(true);
// finalize, including communicating successful completion to MPI
// @BUGBUG: This is wrong for async, but needed for sync. How to solve it?
model->finalize();
}
};
} // namespace marian

View File

@ -2,6 +2,7 @@
#include "common/definitions.h"
#include "common/filesystem.h"
#include "common/utils.h"
#include <fstream>
#include <vector>
@ -42,7 +43,9 @@ struct SchedulingParameter {
}
param.pop_back();
}
res.n = (size_t)std::stoull(param);
double number = utils::parseNumber(param);
res.n = (size_t)number;
ABORT_IF(number != (double)res.n, "Scheduling parameters must be whole numbers");
return res;
}
@ -62,9 +65,9 @@ class TrainingState {
public:
// Current epoch
size_t epochs{1};
// The total number of batches (=updates) processed since beginning of training --@TODO: rename to 'updates'
// The total number of updates since beginning of training --@TODO: rename to 'updates'
size_t batches{0};
// The number of batches seen in this epoch --@TODO: rename to 'updatesEpoch' or 'updatesInCurrentEpoch'
// The number of batches seen in this epoch --note: not updates; an update can consist of multiple batches
size_t batchesEpoch{0};
// The number of sentences seen in this epoch --@TODO: rename to 'sentencesEpoch'
size_t samplesEpoch{0};
@ -172,9 +175,9 @@ public:
batchesEpoch = 0;
}
void newBatch() {
void newUpdate(size_t batchesInUpdate) {
++batches;
++batchesEpoch;
batchesEpoch += batchesInUpdate;
loaded = false;
validated = false;
for(auto observer : observers_)

View File

@ -137,7 +137,7 @@ protected:
lastBest_ = val;
if(options_->get<bool>("keep-best"))
keepBest(graphs);
} else {
} else if (lastBest_ != val) { // (special case 0 at start) @TODO: needed? Seems stall count gets reset each time it does improve. If not needed, remove "if(...)" again.
stalled_++;
}
}
@ -166,7 +166,6 @@ public:
protected:
virtual float validateBG(const std::vector<Ptr<ExpressionGraph>>& graphs) override {
auto ctype = options_->get<std::string>("cost-type");
options_->set("cost-type", "ce-sum");

View File

@ -1,377 +1,25 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 15
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ALL_BUILD", "ALL_BUILD.vcxproj", "{5216F769-E887-369E-AD1E-D6A1F69E834E}"
ProjectSection(ProjectDependencies) = postProject
{17E8F84B-76CD-326B-B50A-C4F3C3A8CE33} = {17E8F84B-76CD-326B-B50A-C4F3C3A8CE33}
{806A44E1-15D4-3368-B0B9-2A6CC352D505} = {806A44E1-15D4-3368-B0B9-2A6CC352D505}
{5AF43E07-5917-3D8F-9BF0-B41F698242EA} = {5AF43E07-5917-3D8F-9BF0-B41F698242EA}
{885D3D2B-7278-30EF-BB1B-50E83D1635C4} = {885D3D2B-7278-30EF-BB1B-50E83D1635C4}
{3CD61EAE-244E-33AB-8C7D-F5182481E033} = {3CD61EAE-244E-33AB-8C7D-F5182481E033}
{97131187-E592-3981-886F-222EE20FB669} = {97131187-E592-3981-886F-222EE20FB669}
{25A05D30-AFC2-3F0E-B475-0B2B81530151} = {25A05D30-AFC2-3F0E-B475-0B2B81530151}
{8A6B1F60-8E2D-3171-828B-07E732C8E7D7} = {8A6B1F60-8E2D-3171-828B-07E732C8E7D7}
{3784D69C-33A9-33A7-A557-F809EF2F4D34} = {3784D69C-33A9-33A7-A557-F809EF2F4D34}
{EA3973A2-F92E-3124-9817-81B2458EC8DC} = {EA3973A2-F92E-3124-9817-81B2458EC8DC}
{36953645-6D01-37E4-ACF7-D3F9BFFCA49D} = {36953645-6D01-37E4-ACF7-D3F9BFFCA49D}
{F4AD2C38-E6B9-3C4A-A281-4AB7440D6162} = {F4AD2C38-E6B9-3C4A-A281-4AB7440D6162}
{D9D20410-4011-370C-8E15-A6F5C311F337} = {D9D20410-4011-370C-8E15-A6F5C311F337}
{4A20AD5F-7334-31D3-B31D-9AAF53CC6678} = {4A20AD5F-7334-31D3-B31D-9AAF53CC6678}
{5857EF98-C87F-3197-A399-F0F9A20913FC} = {5857EF98-C87F-3197-A399-F0F9A20913FC}
{F6E7B14E-D9E6-343C-B58D-CA0381A3BB8F} = {F6E7B14E-D9E6-343C-B58D-CA0381A3BB8F}
{FBB107B9-523B-3094-95CF-A103E2388006} = {FBB107B9-523B-3094-95CF-A103E2388006}
{5B4A6D26-C638-3350-9E1A-0F987C448DEC} = {5B4A6D26-C638-3350-9E1A-0F987C448DEC}
{11AB9AE9-CF65-341B-B425-9EDFC4E2F22F} = {11AB9AE9-CF65-341B-B425-9EDFC4E2F22F}
{1134F859-3DE4-34B1-924F-82CA38D4D4F3} = {1134F859-3DE4-34B1-924F-82CA38D4D4F3}
EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "INSTALL", "INSTALL.vcxproj", "{9DAF8CA3-052E-3480-A332-34676CAE852B}"
ProjectSection(ProjectDependencies) = postProject
{5216F769-E887-369E-AD1E-D6A1F69E834E} = {5216F769-E887-369E-AD1E-D6A1F69E834E}
{806A44E1-15D4-3368-B0B9-2A6CC352D505} = {806A44E1-15D4-3368-B0B9-2A6CC352D505}
EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "PACKAGE", "PACKAGE.vcxproj", "{3A3C6EA5-65CD-324E-90F4-6B4D70DD5A37}"
ProjectSection(ProjectDependencies) = postProject
{5216F769-E887-369E-AD1E-D6A1F69E834E} = {5216F769-E887-369E-AD1E-D6A1F69E834E}
{806A44E1-15D4-3368-B0B9-2A6CC352D505} = {806A44E1-15D4-3368-B0B9-2A6CC352D505}
EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "SQLiteCpp", "src\3rd_party\SQLiteCpp\SQLiteCpp.vcxproj", "{17E8F84B-76CD-326B-B50A-C4F3C3A8CE33}"
ProjectSection(ProjectDependencies) = postProject
{806A44E1-15D4-3368-B0B9-2A6CC352D505} = {806A44E1-15D4-3368-B0B9-2A6CC352D505}
EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ZERO_CHECK", "ZERO_CHECK.vcxproj", "{806A44E1-15D4-3368-B0B9-2A6CC352D505}"
ProjectSection(ProjectDependencies) = postProject
EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "libyaml-cpp", "src\3rd_party\yaml-cpp\libyaml-cpp.vcxproj", "{5AF43E07-5917-3D8F-9BF0-B41F698242EA}"
ProjectSection(ProjectDependencies) = postProject
{806A44E1-15D4-3368-B0B9-2A6CC352D505} = {806A44E1-15D4-3368-B0B9-2A6CC352D505}
EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "marian", "src\marian.vcxproj", "{885D3D2B-7278-30EF-BB1B-50E83D1635C4}"
ProjectSection(ProjectDependencies) = postProject
{17E8F84B-76CD-326B-B50A-C4F3C3A8CE33} = {17E8F84B-76CD-326B-B50A-C4F3C3A8CE33}
{806A44E1-15D4-3368-B0B9-2A6CC352D505} = {806A44E1-15D4-3368-B0B9-2A6CC352D505}
{5AF43E07-5917-3D8F-9BF0-B41F698242EA} = {5AF43E07-5917-3D8F-9BF0-B41F698242EA}
{55A27783-64A4-3AA7-A4B1-49C4B628F18C} = {55A27783-64A4-3AA7-A4B1-49C4B628F18C}
{F4AD2C38-E6B9-3C4A-A281-4AB7440D6162} = {F4AD2C38-E6B9-3C4A-A281-4AB7440D6162}
{1134F859-3DE4-34B1-924F-82CA38D4D4F3} = {1134F859-3DE4-34B1-924F-82CA38D4D4F3}
EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "marian_conv", "src\marian_conv.vcxproj", "{3CD61EAE-244E-33AB-8C7D-F5182481E033}"
ProjectSection(ProjectDependencies) = postProject
{806A44E1-15D4-3368-B0B9-2A6CC352D505} = {806A44E1-15D4-3368-B0B9-2A6CC352D505}
{885D3D2B-7278-30EF-BB1B-50E83D1635C4} = {885D3D2B-7278-30EF-BB1B-50E83D1635C4}
{97131187-E592-3981-886F-222EE20FB669} = {97131187-E592-3981-886F-222EE20FB669}
{D9D20410-4011-370C-8E15-A6F5C311F337} = {D9D20410-4011-370C-8E15-A6F5C311F337}
{4A20AD5F-7334-31D3-B31D-9AAF53CC6678} = {4A20AD5F-7334-31D3-B31D-9AAF53CC6678}
EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "marian_cuda", "src\marian_cuda.vcxproj", "{97131187-E592-3981-886F-222EE20FB669}"
ProjectSection(ProjectDependencies) = postProject
{806A44E1-15D4-3368-B0B9-2A6CC352D505} = {806A44E1-15D4-3368-B0B9-2A6CC352D505}
EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "marian_decoder", "src\marian_decoder.vcxproj", "{25A05D30-AFC2-3F0E-B475-0B2B81530151}"
ProjectSection(ProjectDependencies) = postProject
{806A44E1-15D4-3368-B0B9-2A6CC352D505} = {806A44E1-15D4-3368-B0B9-2A6CC352D505}
{885D3D2B-7278-30EF-BB1B-50E83D1635C4} = {885D3D2B-7278-30EF-BB1B-50E83D1635C4}
{97131187-E592-3981-886F-222EE20FB669} = {97131187-E592-3981-886F-222EE20FB669}
{D9D20410-4011-370C-8E15-A6F5C311F337} = {D9D20410-4011-370C-8E15-A6F5C311F337}
{4A20AD5F-7334-31D3-B31D-9AAF53CC6678} = {4A20AD5F-7334-31D3-B31D-9AAF53CC6678}
EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "marian_scorer", "src\marian_scorer.vcxproj", "{8A6B1F60-8E2D-3171-828B-07E732C8E7D7}"
ProjectSection(ProjectDependencies) = postProject
{806A44E1-15D4-3368-B0B9-2A6CC352D505} = {806A44E1-15D4-3368-B0B9-2A6CC352D505}
{885D3D2B-7278-30EF-BB1B-50E83D1635C4} = {885D3D2B-7278-30EF-BB1B-50E83D1635C4}
{97131187-E592-3981-886F-222EE20FB669} = {97131187-E592-3981-886F-222EE20FB669}
{D9D20410-4011-370C-8E15-A6F5C311F337} = {D9D20410-4011-370C-8E15-A6F5C311F337}
{4A20AD5F-7334-31D3-B31D-9AAF53CC6678} = {4A20AD5F-7334-31D3-B31D-9AAF53CC6678}
EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "marian_server", "src\marian_server.vcxproj", "{3784D69C-33A9-33A7-A557-F809EF2F4D34}"
ProjectSection(ProjectDependencies) = postProject
{806A44E1-15D4-3368-B0B9-2A6CC352D505} = {806A44E1-15D4-3368-B0B9-2A6CC352D505}
{885D3D2B-7278-30EF-BB1B-50E83D1635C4} = {885D3D2B-7278-30EF-BB1B-50E83D1635C4}
{97131187-E592-3981-886F-222EE20FB669} = {97131187-E592-3981-886F-222EE20FB669}
{D9D20410-4011-370C-8E15-A6F5C311F337} = {D9D20410-4011-370C-8E15-A6F5C311F337}
{4A20AD5F-7334-31D3-B31D-9AAF53CC6678} = {4A20AD5F-7334-31D3-B31D-9AAF53CC6678}
EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "marian_train", "src\marian_train.vcxproj", "{EA3973A2-F92E-3124-9817-81B2458EC8DC}"
ProjectSection(ProjectDependencies) = postProject
{806A44E1-15D4-3368-B0B9-2A6CC352D505} = {806A44E1-15D4-3368-B0B9-2A6CC352D505}
{885D3D2B-7278-30EF-BB1B-50E83D1635C4} = {885D3D2B-7278-30EF-BB1B-50E83D1635C4}
{97131187-E592-3981-886F-222EE20FB669} = {97131187-E592-3981-886F-222EE20FB669}
{D9D20410-4011-370C-8E15-A6F5C311F337} = {D9D20410-4011-370C-8E15-A6F5C311F337}
{4A20AD5F-7334-31D3-B31D-9AAF53CC6678} = {4A20AD5F-7334-31D3-B31D-9AAF53CC6678}
EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "marian_version", "src\marian_version.vcxproj", "{55A27783-64A4-3AA7-A4B1-49C4B628F18C}"
ProjectSection(ProjectDependencies) = postProject
{806A44E1-15D4-3368-B0B9-2A6CC352D505} = {806A44E1-15D4-3368-B0B9-2A6CC352D505}
EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "marian_vocab", "src\marian_vocab.vcxproj", "{36953645-6D01-37E4-ACF7-D3F9BFFCA49D}"
ProjectSection(ProjectDependencies) = postProject
{806A44E1-15D4-3368-B0B9-2A6CC352D505} = {806A44E1-15D4-3368-B0B9-2A6CC352D505}
{885D3D2B-7278-30EF-BB1B-50E83D1635C4} = {885D3D2B-7278-30EF-BB1B-50E83D1635C4}
{97131187-E592-3981-886F-222EE20FB669} = {97131187-E592-3981-886F-222EE20FB669}
{D9D20410-4011-370C-8E15-A6F5C311F337} = {D9D20410-4011-370C-8E15-A6F5C311F337}
{4A20AD5F-7334-31D3-B31D-9AAF53CC6678} = {4A20AD5F-7334-31D3-B31D-9AAF53CC6678}
EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "pathie-cpp", "src\3rd_party\pathie-cpp\pathie-cpp.vcxproj", "{F4AD2C38-E6B9-3C4A-A281-4AB7440D6162}"
ProjectSection(ProjectDependencies) = postProject
{806A44E1-15D4-3368-B0B9-2A6CC352D505} = {806A44E1-15D4-3368-B0B9-2A6CC352D505}
EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "sentencepiece-static", "src\3rd_party\sentencepiece\src\sentencepiece-static.vcxproj", "{D9D20410-4011-370C-8E15-A6F5C311F337}"
ProjectSection(ProjectDependencies) = postProject
{806A44E1-15D4-3368-B0B9-2A6CC352D505} = {806A44E1-15D4-3368-B0B9-2A6CC352D505}
EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "sentencepiece_train-static", "src\3rd_party\sentencepiece\src\sentencepiece_train-static.vcxproj", "{4A20AD5F-7334-31D3-B31D-9AAF53CC6678}"
ProjectSection(ProjectDependencies) = postProject
{806A44E1-15D4-3368-B0B9-2A6CC352D505} = {806A44E1-15D4-3368-B0B9-2A6CC352D505}
EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "spm_decode", "src\3rd_party\sentencepiece\src\spm_decode.vcxproj", "{5857EF98-C87F-3197-A399-F0F9A20913FC}"
ProjectSection(ProjectDependencies) = postProject
{806A44E1-15D4-3368-B0B9-2A6CC352D505} = {806A44E1-15D4-3368-B0B9-2A6CC352D505}
{D9D20410-4011-370C-8E15-A6F5C311F337} = {D9D20410-4011-370C-8E15-A6F5C311F337}
EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "spm_encode", "src\3rd_party\sentencepiece\src\spm_encode.vcxproj", "{F6E7B14E-D9E6-343C-B58D-CA0381A3BB8F}"
ProjectSection(ProjectDependencies) = postProject
{806A44E1-15D4-3368-B0B9-2A6CC352D505} = {806A44E1-15D4-3368-B0B9-2A6CC352D505}
{D9D20410-4011-370C-8E15-A6F5C311F337} = {D9D20410-4011-370C-8E15-A6F5C311F337}
EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "spm_export_vocab", "src\3rd_party\sentencepiece\src\spm_export_vocab.vcxproj", "{FBB107B9-523B-3094-95CF-A103E2388006}"
ProjectSection(ProjectDependencies) = postProject
{806A44E1-15D4-3368-B0B9-2A6CC352D505} = {806A44E1-15D4-3368-B0B9-2A6CC352D505}
{D9D20410-4011-370C-8E15-A6F5C311F337} = {D9D20410-4011-370C-8E15-A6F5C311F337}
EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "spm_normalize", "src\3rd_party\sentencepiece\src\spm_normalize.vcxproj", "{5B4A6D26-C638-3350-9E1A-0F987C448DEC}"
ProjectSection(ProjectDependencies) = postProject
{806A44E1-15D4-3368-B0B9-2A6CC352D505} = {806A44E1-15D4-3368-B0B9-2A6CC352D505}
{D9D20410-4011-370C-8E15-A6F5C311F337} = {D9D20410-4011-370C-8E15-A6F5C311F337}
{4A20AD5F-7334-31D3-B31D-9AAF53CC6678} = {4A20AD5F-7334-31D3-B31D-9AAF53CC6678}
EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "spm_train", "src\3rd_party\sentencepiece\src\spm_train.vcxproj", "{11AB9AE9-CF65-341B-B425-9EDFC4E2F22F}"
ProjectSection(ProjectDependencies) = postProject
{806A44E1-15D4-3368-B0B9-2A6CC352D505} = {806A44E1-15D4-3368-B0B9-2A6CC352D505}
{D9D20410-4011-370C-8E15-A6F5C311F337} = {D9D20410-4011-370C-8E15-A6F5C311F337}
{4A20AD5F-7334-31D3-B31D-9AAF53CC6678} = {4A20AD5F-7334-31D3-B31D-9AAF53CC6678}
EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "zlib", "src\3rd_party\zlib\zlib.vcxproj", "{1134F859-3DE4-34B1-924F-82CA38D4D4F3}"
ProjectSection(ProjectDependencies) = postProject
{806A44E1-15D4-3368-B0B9-2A6CC352D505} = {806A44E1-15D4-3368-B0B9-2A6CC352D505}
EndProjectSection
VisualStudioVersion = 15.0.27703.2047
MinimumVisualStudioVersion = 10.0.40219.1
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "Marian", "Marian.vcxproj", "{E2F320FE-0C01-4C80-810C-3A92205A29DC}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
MinSizeRel|x64 = MinSizeRel|x64
RelWithDebInfo|x64 = RelWithDebInfo|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{5216F769-E887-369E-AD1E-D6A1F69E834E}.Debug|x64.ActiveCfg = Debug|x64
{5216F769-E887-369E-AD1E-D6A1F69E834E}.Debug|x64.Build.0 = Debug|x64
{5216F769-E887-369E-AD1E-D6A1F69E834E}.Release|x64.ActiveCfg = Release|x64
{5216F769-E887-369E-AD1E-D6A1F69E834E}.Release|x64.Build.0 = Release|x64
{5216F769-E887-369E-AD1E-D6A1F69E834E}.MinSizeRel|x64.ActiveCfg = MinSizeRel|x64
{5216F769-E887-369E-AD1E-D6A1F69E834E}.MinSizeRel|x64.Build.0 = MinSizeRel|x64
{5216F769-E887-369E-AD1E-D6A1F69E834E}.RelWithDebInfo|x64.ActiveCfg = RelWithDebInfo|x64
{5216F769-E887-369E-AD1E-D6A1F69E834E}.RelWithDebInfo|x64.Build.0 = RelWithDebInfo|x64
{9DAF8CA3-052E-3480-A332-34676CAE852B}.Debug|x64.ActiveCfg = Debug|x64
{9DAF8CA3-052E-3480-A332-34676CAE852B}.Release|x64.ActiveCfg = Release|x64
{9DAF8CA3-052E-3480-A332-34676CAE852B}.MinSizeRel|x64.ActiveCfg = MinSizeRel|x64
{9DAF8CA3-052E-3480-A332-34676CAE852B}.RelWithDebInfo|x64.ActiveCfg = RelWithDebInfo|x64
{3A3C6EA5-65CD-324E-90F4-6B4D70DD5A37}.Debug|x64.ActiveCfg = Debug|x64
{3A3C6EA5-65CD-324E-90F4-6B4D70DD5A37}.Release|x64.ActiveCfg = Release|x64
{3A3C6EA5-65CD-324E-90F4-6B4D70DD5A37}.MinSizeRel|x64.ActiveCfg = MinSizeRel|x64
{3A3C6EA5-65CD-324E-90F4-6B4D70DD5A37}.RelWithDebInfo|x64.ActiveCfg = RelWithDebInfo|x64
{17E8F84B-76CD-326B-B50A-C4F3C3A8CE33}.Debug|x64.ActiveCfg = Debug|x64
{17E8F84B-76CD-326B-B50A-C4F3C3A8CE33}.Debug|x64.Build.0 = Debug|x64
{17E8F84B-76CD-326B-B50A-C4F3C3A8CE33}.Release|x64.ActiveCfg = Release|x64
{17E8F84B-76CD-326B-B50A-C4F3C3A8CE33}.Release|x64.Build.0 = Release|x64
{17E8F84B-76CD-326B-B50A-C4F3C3A8CE33}.MinSizeRel|x64.ActiveCfg = MinSizeRel|x64
{17E8F84B-76CD-326B-B50A-C4F3C3A8CE33}.MinSizeRel|x64.Build.0 = MinSizeRel|x64
{17E8F84B-76CD-326B-B50A-C4F3C3A8CE33}.RelWithDebInfo|x64.ActiveCfg = RelWithDebInfo|x64
{17E8F84B-76CD-326B-B50A-C4F3C3A8CE33}.RelWithDebInfo|x64.Build.0 = RelWithDebInfo|x64
{806A44E1-15D4-3368-B0B9-2A6CC352D505}.Debug|x64.ActiveCfg = Debug|x64
{806A44E1-15D4-3368-B0B9-2A6CC352D505}.Debug|x64.Build.0 = Debug|x64
{806A44E1-15D4-3368-B0B9-2A6CC352D505}.Release|x64.ActiveCfg = Release|x64
{806A44E1-15D4-3368-B0B9-2A6CC352D505}.Release|x64.Build.0 = Release|x64
{806A44E1-15D4-3368-B0B9-2A6CC352D505}.MinSizeRel|x64.ActiveCfg = MinSizeRel|x64
{806A44E1-15D4-3368-B0B9-2A6CC352D505}.MinSizeRel|x64.Build.0 = MinSizeRel|x64
{806A44E1-15D4-3368-B0B9-2A6CC352D505}.RelWithDebInfo|x64.ActiveCfg = RelWithDebInfo|x64
{806A44E1-15D4-3368-B0B9-2A6CC352D505}.RelWithDebInfo|x64.Build.0 = RelWithDebInfo|x64
{5AF43E07-5917-3D8F-9BF0-B41F698242EA}.Debug|x64.ActiveCfg = Debug|x64
{5AF43E07-5917-3D8F-9BF0-B41F698242EA}.Debug|x64.Build.0 = Debug|x64
{5AF43E07-5917-3D8F-9BF0-B41F698242EA}.Release|x64.ActiveCfg = Release|x64
{5AF43E07-5917-3D8F-9BF0-B41F698242EA}.Release|x64.Build.0 = Release|x64
{5AF43E07-5917-3D8F-9BF0-B41F698242EA}.MinSizeRel|x64.ActiveCfg = MinSizeRel|x64
{5AF43E07-5917-3D8F-9BF0-B41F698242EA}.MinSizeRel|x64.Build.0 = MinSizeRel|x64
{5AF43E07-5917-3D8F-9BF0-B41F698242EA}.RelWithDebInfo|x64.ActiveCfg = RelWithDebInfo|x64
{5AF43E07-5917-3D8F-9BF0-B41F698242EA}.RelWithDebInfo|x64.Build.0 = RelWithDebInfo|x64
{885D3D2B-7278-30EF-BB1B-50E83D1635C4}.Debug|x64.ActiveCfg = Debug|x64
{885D3D2B-7278-30EF-BB1B-50E83D1635C4}.Debug|x64.Build.0 = Debug|x64
{885D3D2B-7278-30EF-BB1B-50E83D1635C4}.Release|x64.ActiveCfg = Release|x64
{885D3D2B-7278-30EF-BB1B-50E83D1635C4}.Release|x64.Build.0 = Release|x64
{885D3D2B-7278-30EF-BB1B-50E83D1635C4}.MinSizeRel|x64.ActiveCfg = MinSizeRel|x64
{885D3D2B-7278-30EF-BB1B-50E83D1635C4}.MinSizeRel|x64.Build.0 = MinSizeRel|x64
{885D3D2B-7278-30EF-BB1B-50E83D1635C4}.RelWithDebInfo|x64.ActiveCfg = RelWithDebInfo|x64
{885D3D2B-7278-30EF-BB1B-50E83D1635C4}.RelWithDebInfo|x64.Build.0 = RelWithDebInfo|x64
{3CD61EAE-244E-33AB-8C7D-F5182481E033}.Debug|x64.ActiveCfg = Debug|x64
{3CD61EAE-244E-33AB-8C7D-F5182481E033}.Debug|x64.Build.0 = Debug|x64
{3CD61EAE-244E-33AB-8C7D-F5182481E033}.Release|x64.ActiveCfg = Release|x64
{3CD61EAE-244E-33AB-8C7D-F5182481E033}.Release|x64.Build.0 = Release|x64
{3CD61EAE-244E-33AB-8C7D-F5182481E033}.MinSizeRel|x64.ActiveCfg = MinSizeRel|x64
{3CD61EAE-244E-33AB-8C7D-F5182481E033}.MinSizeRel|x64.Build.0 = MinSizeRel|x64
{3CD61EAE-244E-33AB-8C7D-F5182481E033}.RelWithDebInfo|x64.ActiveCfg = RelWithDebInfo|x64
{3CD61EAE-244E-33AB-8C7D-F5182481E033}.RelWithDebInfo|x64.Build.0 = RelWithDebInfo|x64
{97131187-E592-3981-886F-222EE20FB669}.Debug|x64.ActiveCfg = Debug|x64
{97131187-E592-3981-886F-222EE20FB669}.Debug|x64.Build.0 = Debug|x64
{97131187-E592-3981-886F-222EE20FB669}.Release|x64.ActiveCfg = Release|x64
{97131187-E592-3981-886F-222EE20FB669}.Release|x64.Build.0 = Release|x64
{97131187-E592-3981-886F-222EE20FB669}.MinSizeRel|x64.ActiveCfg = MinSizeRel|x64
{97131187-E592-3981-886F-222EE20FB669}.MinSizeRel|x64.Build.0 = MinSizeRel|x64
{97131187-E592-3981-886F-222EE20FB669}.RelWithDebInfo|x64.ActiveCfg = RelWithDebInfo|x64
{97131187-E592-3981-886F-222EE20FB669}.RelWithDebInfo|x64.Build.0 = RelWithDebInfo|x64
{25A05D30-AFC2-3F0E-B475-0B2B81530151}.Debug|x64.ActiveCfg = Debug|x64
{25A05D30-AFC2-3F0E-B475-0B2B81530151}.Debug|x64.Build.0 = Debug|x64
{25A05D30-AFC2-3F0E-B475-0B2B81530151}.Release|x64.ActiveCfg = Release|x64
{25A05D30-AFC2-3F0E-B475-0B2B81530151}.Release|x64.Build.0 = Release|x64
{25A05D30-AFC2-3F0E-B475-0B2B81530151}.MinSizeRel|x64.ActiveCfg = MinSizeRel|x64
{25A05D30-AFC2-3F0E-B475-0B2B81530151}.MinSizeRel|x64.Build.0 = MinSizeRel|x64
{25A05D30-AFC2-3F0E-B475-0B2B81530151}.RelWithDebInfo|x64.ActiveCfg = RelWithDebInfo|x64
{25A05D30-AFC2-3F0E-B475-0B2B81530151}.RelWithDebInfo|x64.Build.0 = RelWithDebInfo|x64
{8A6B1F60-8E2D-3171-828B-07E732C8E7D7}.Debug|x64.ActiveCfg = Debug|x64
{8A6B1F60-8E2D-3171-828B-07E732C8E7D7}.Debug|x64.Build.0 = Debug|x64
{8A6B1F60-8E2D-3171-828B-07E732C8E7D7}.Release|x64.ActiveCfg = Release|x64
{8A6B1F60-8E2D-3171-828B-07E732C8E7D7}.Release|x64.Build.0 = Release|x64
{8A6B1F60-8E2D-3171-828B-07E732C8E7D7}.MinSizeRel|x64.ActiveCfg = MinSizeRel|x64
{8A6B1F60-8E2D-3171-828B-07E732C8E7D7}.MinSizeRel|x64.Build.0 = MinSizeRel|x64
{8A6B1F60-8E2D-3171-828B-07E732C8E7D7}.RelWithDebInfo|x64.ActiveCfg = RelWithDebInfo|x64
{8A6B1F60-8E2D-3171-828B-07E732C8E7D7}.RelWithDebInfo|x64.Build.0 = RelWithDebInfo|x64
{3784D69C-33A9-33A7-A557-F809EF2F4D34}.Debug|x64.ActiveCfg = Debug|x64
{3784D69C-33A9-33A7-A557-F809EF2F4D34}.Debug|x64.Build.0 = Debug|x64
{3784D69C-33A9-33A7-A557-F809EF2F4D34}.Release|x64.ActiveCfg = Release|x64
{3784D69C-33A9-33A7-A557-F809EF2F4D34}.Release|x64.Build.0 = Release|x64
{3784D69C-33A9-33A7-A557-F809EF2F4D34}.MinSizeRel|x64.ActiveCfg = MinSizeRel|x64
{3784D69C-33A9-33A7-A557-F809EF2F4D34}.MinSizeRel|x64.Build.0 = MinSizeRel|x64
{3784D69C-33A9-33A7-A557-F809EF2F4D34}.RelWithDebInfo|x64.ActiveCfg = RelWithDebInfo|x64
{3784D69C-33A9-33A7-A557-F809EF2F4D34}.RelWithDebInfo|x64.Build.0 = RelWithDebInfo|x64
{EA3973A2-F92E-3124-9817-81B2458EC8DC}.Debug|x64.ActiveCfg = Debug|x64
{EA3973A2-F92E-3124-9817-81B2458EC8DC}.Debug|x64.Build.0 = Debug|x64
{EA3973A2-F92E-3124-9817-81B2458EC8DC}.Release|x64.ActiveCfg = Release|x64
{EA3973A2-F92E-3124-9817-81B2458EC8DC}.Release|x64.Build.0 = Release|x64
{EA3973A2-F92E-3124-9817-81B2458EC8DC}.MinSizeRel|x64.ActiveCfg = MinSizeRel|x64
{EA3973A2-F92E-3124-9817-81B2458EC8DC}.MinSizeRel|x64.Build.0 = MinSizeRel|x64
{EA3973A2-F92E-3124-9817-81B2458EC8DC}.RelWithDebInfo|x64.ActiveCfg = RelWithDebInfo|x64
{EA3973A2-F92E-3124-9817-81B2458EC8DC}.RelWithDebInfo|x64.Build.0 = RelWithDebInfo|x64
{55A27783-64A4-3AA7-A4B1-49C4B628F18C}.Debug|x64.ActiveCfg = Debug|x64
{55A27783-64A4-3AA7-A4B1-49C4B628F18C}.Debug|x64.Build.0 = Debug|x64
{55A27783-64A4-3AA7-A4B1-49C4B628F18C}.Release|x64.ActiveCfg = Release|x64
{55A27783-64A4-3AA7-A4B1-49C4B628F18C}.Release|x64.Build.0 = Release|x64
{55A27783-64A4-3AA7-A4B1-49C4B628F18C}.MinSizeRel|x64.ActiveCfg = MinSizeRel|x64
{55A27783-64A4-3AA7-A4B1-49C4B628F18C}.MinSizeRel|x64.Build.0 = MinSizeRel|x64
{55A27783-64A4-3AA7-A4B1-49C4B628F18C}.RelWithDebInfo|x64.ActiveCfg = RelWithDebInfo|x64
{55A27783-64A4-3AA7-A4B1-49C4B628F18C}.RelWithDebInfo|x64.Build.0 = RelWithDebInfo|x64
{36953645-6D01-37E4-ACF7-D3F9BFFCA49D}.Debug|x64.ActiveCfg = Debug|x64
{36953645-6D01-37E4-ACF7-D3F9BFFCA49D}.Debug|x64.Build.0 = Debug|x64
{36953645-6D01-37E4-ACF7-D3F9BFFCA49D}.Release|x64.ActiveCfg = Release|x64
{36953645-6D01-37E4-ACF7-D3F9BFFCA49D}.Release|x64.Build.0 = Release|x64
{36953645-6D01-37E4-ACF7-D3F9BFFCA49D}.MinSizeRel|x64.ActiveCfg = MinSizeRel|x64
{36953645-6D01-37E4-ACF7-D3F9BFFCA49D}.MinSizeRel|x64.Build.0 = MinSizeRel|x64
{36953645-6D01-37E4-ACF7-D3F9BFFCA49D}.RelWithDebInfo|x64.ActiveCfg = RelWithDebInfo|x64
{36953645-6D01-37E4-ACF7-D3F9BFFCA49D}.RelWithDebInfo|x64.Build.0 = RelWithDebInfo|x64
{F4AD2C38-E6B9-3C4A-A281-4AB7440D6162}.Debug|x64.ActiveCfg = Debug|x64
{F4AD2C38-E6B9-3C4A-A281-4AB7440D6162}.Debug|x64.Build.0 = Debug|x64
{F4AD2C38-E6B9-3C4A-A281-4AB7440D6162}.Release|x64.ActiveCfg = Release|x64
{F4AD2C38-E6B9-3C4A-A281-4AB7440D6162}.Release|x64.Build.0 = Release|x64
{F4AD2C38-E6B9-3C4A-A281-4AB7440D6162}.MinSizeRel|x64.ActiveCfg = MinSizeRel|x64
{F4AD2C38-E6B9-3C4A-A281-4AB7440D6162}.MinSizeRel|x64.Build.0 = MinSizeRel|x64
{F4AD2C38-E6B9-3C4A-A281-4AB7440D6162}.RelWithDebInfo|x64.ActiveCfg = RelWithDebInfo|x64
{F4AD2C38-E6B9-3C4A-A281-4AB7440D6162}.RelWithDebInfo|x64.Build.0 = RelWithDebInfo|x64
{D9D20410-4011-370C-8E15-A6F5C311F337}.Debug|x64.ActiveCfg = Debug|x64
{D9D20410-4011-370C-8E15-A6F5C311F337}.Debug|x64.Build.0 = Debug|x64
{D9D20410-4011-370C-8E15-A6F5C311F337}.Release|x64.ActiveCfg = Release|x64
{D9D20410-4011-370C-8E15-A6F5C311F337}.Release|x64.Build.0 = Release|x64
{D9D20410-4011-370C-8E15-A6F5C311F337}.MinSizeRel|x64.ActiveCfg = MinSizeRel|x64
{D9D20410-4011-370C-8E15-A6F5C311F337}.MinSizeRel|x64.Build.0 = MinSizeRel|x64
{D9D20410-4011-370C-8E15-A6F5C311F337}.RelWithDebInfo|x64.ActiveCfg = RelWithDebInfo|x64
{D9D20410-4011-370C-8E15-A6F5C311F337}.RelWithDebInfo|x64.Build.0 = RelWithDebInfo|x64
{4A20AD5F-7334-31D3-B31D-9AAF53CC6678}.Debug|x64.ActiveCfg = Debug|x64
{4A20AD5F-7334-31D3-B31D-9AAF53CC6678}.Debug|x64.Build.0 = Debug|x64
{4A20AD5F-7334-31D3-B31D-9AAF53CC6678}.Release|x64.ActiveCfg = Release|x64
{4A20AD5F-7334-31D3-B31D-9AAF53CC6678}.Release|x64.Build.0 = Release|x64
{4A20AD5F-7334-31D3-B31D-9AAF53CC6678}.MinSizeRel|x64.ActiveCfg = MinSizeRel|x64
{4A20AD5F-7334-31D3-B31D-9AAF53CC6678}.MinSizeRel|x64.Build.0 = MinSizeRel|x64
{4A20AD5F-7334-31D3-B31D-9AAF53CC6678}.RelWithDebInfo|x64.ActiveCfg = RelWithDebInfo|x64
{4A20AD5F-7334-31D3-B31D-9AAF53CC6678}.RelWithDebInfo|x64.Build.0 = RelWithDebInfo|x64
{5857EF98-C87F-3197-A399-F0F9A20913FC}.Debug|x64.ActiveCfg = Debug|x64
{5857EF98-C87F-3197-A399-F0F9A20913FC}.Debug|x64.Build.0 = Debug|x64
{5857EF98-C87F-3197-A399-F0F9A20913FC}.Release|x64.ActiveCfg = Release|x64
{5857EF98-C87F-3197-A399-F0F9A20913FC}.Release|x64.Build.0 = Release|x64
{5857EF98-C87F-3197-A399-F0F9A20913FC}.MinSizeRel|x64.ActiveCfg = MinSizeRel|x64
{5857EF98-C87F-3197-A399-F0F9A20913FC}.MinSizeRel|x64.Build.0 = MinSizeRel|x64
{5857EF98-C87F-3197-A399-F0F9A20913FC}.RelWithDebInfo|x64.ActiveCfg = RelWithDebInfo|x64
{5857EF98-C87F-3197-A399-F0F9A20913FC}.RelWithDebInfo|x64.Build.0 = RelWithDebInfo|x64
{F6E7B14E-D9E6-343C-B58D-CA0381A3BB8F}.Debug|x64.ActiveCfg = Debug|x64
{F6E7B14E-D9E6-343C-B58D-CA0381A3BB8F}.Debug|x64.Build.0 = Debug|x64
{F6E7B14E-D9E6-343C-B58D-CA0381A3BB8F}.Release|x64.ActiveCfg = Release|x64
{F6E7B14E-D9E6-343C-B58D-CA0381A3BB8F}.Release|x64.Build.0 = Release|x64
{F6E7B14E-D9E6-343C-B58D-CA0381A3BB8F}.MinSizeRel|x64.ActiveCfg = MinSizeRel|x64
{F6E7B14E-D9E6-343C-B58D-CA0381A3BB8F}.MinSizeRel|x64.Build.0 = MinSizeRel|x64
{F6E7B14E-D9E6-343C-B58D-CA0381A3BB8F}.RelWithDebInfo|x64.ActiveCfg = RelWithDebInfo|x64
{F6E7B14E-D9E6-343C-B58D-CA0381A3BB8F}.RelWithDebInfo|x64.Build.0 = RelWithDebInfo|x64
{FBB107B9-523B-3094-95CF-A103E2388006}.Debug|x64.ActiveCfg = Debug|x64
{FBB107B9-523B-3094-95CF-A103E2388006}.Debug|x64.Build.0 = Debug|x64
{FBB107B9-523B-3094-95CF-A103E2388006}.Release|x64.ActiveCfg = Release|x64
{FBB107B9-523B-3094-95CF-A103E2388006}.Release|x64.Build.0 = Release|x64
{FBB107B9-523B-3094-95CF-A103E2388006}.MinSizeRel|x64.ActiveCfg = MinSizeRel|x64
{FBB107B9-523B-3094-95CF-A103E2388006}.MinSizeRel|x64.Build.0 = MinSizeRel|x64
{FBB107B9-523B-3094-95CF-A103E2388006}.RelWithDebInfo|x64.ActiveCfg = RelWithDebInfo|x64
{FBB107B9-523B-3094-95CF-A103E2388006}.RelWithDebInfo|x64.Build.0 = RelWithDebInfo|x64
{5B4A6D26-C638-3350-9E1A-0F987C448DEC}.Debug|x64.ActiveCfg = Debug|x64
{5B4A6D26-C638-3350-9E1A-0F987C448DEC}.Debug|x64.Build.0 = Debug|x64
{5B4A6D26-C638-3350-9E1A-0F987C448DEC}.Release|x64.ActiveCfg = Release|x64
{5B4A6D26-C638-3350-9E1A-0F987C448DEC}.Release|x64.Build.0 = Release|x64
{5B4A6D26-C638-3350-9E1A-0F987C448DEC}.MinSizeRel|x64.ActiveCfg = MinSizeRel|x64
{5B4A6D26-C638-3350-9E1A-0F987C448DEC}.MinSizeRel|x64.Build.0 = MinSizeRel|x64
{5B4A6D26-C638-3350-9E1A-0F987C448DEC}.RelWithDebInfo|x64.ActiveCfg = RelWithDebInfo|x64
{5B4A6D26-C638-3350-9E1A-0F987C448DEC}.RelWithDebInfo|x64.Build.0 = RelWithDebInfo|x64
{11AB9AE9-CF65-341B-B425-9EDFC4E2F22F}.Debug|x64.ActiveCfg = Debug|x64
{11AB9AE9-CF65-341B-B425-9EDFC4E2F22F}.Debug|x64.Build.0 = Debug|x64
{11AB9AE9-CF65-341B-B425-9EDFC4E2F22F}.Release|x64.ActiveCfg = Release|x64
{11AB9AE9-CF65-341B-B425-9EDFC4E2F22F}.Release|x64.Build.0 = Release|x64
{11AB9AE9-CF65-341B-B425-9EDFC4E2F22F}.MinSizeRel|x64.ActiveCfg = MinSizeRel|x64
{11AB9AE9-CF65-341B-B425-9EDFC4E2F22F}.MinSizeRel|x64.Build.0 = MinSizeRel|x64
{11AB9AE9-CF65-341B-B425-9EDFC4E2F22F}.RelWithDebInfo|x64.ActiveCfg = RelWithDebInfo|x64
{11AB9AE9-CF65-341B-B425-9EDFC4E2F22F}.RelWithDebInfo|x64.Build.0 = RelWithDebInfo|x64
{1134F859-3DE4-34B1-924F-82CA38D4D4F3}.Debug|x64.ActiveCfg = Debug|x64
{1134F859-3DE4-34B1-924F-82CA38D4D4F3}.Debug|x64.Build.0 = Debug|x64
{1134F859-3DE4-34B1-924F-82CA38D4D4F3}.Release|x64.ActiveCfg = Release|x64
{1134F859-3DE4-34B1-924F-82CA38D4D4F3}.Release|x64.Build.0 = Release|x64
{1134F859-3DE4-34B1-924F-82CA38D4D4F3}.MinSizeRel|x64.ActiveCfg = MinSizeRel|x64
{1134F859-3DE4-34B1-924F-82CA38D4D4F3}.MinSizeRel|x64.Build.0 = MinSizeRel|x64
{1134F859-3DE4-34B1-924F-82CA38D4D4F3}.RelWithDebInfo|x64.ActiveCfg = RelWithDebInfo|x64
{1134F859-3DE4-34B1-924F-82CA38D4D4F3}.RelWithDebInfo|x64.Build.0 = RelWithDebInfo|x64
{E2F320FE-0C01-4C80-810C-3A92205A29DC}.Debug|x64.ActiveCfg = Debug|x64
{E2F320FE-0C01-4C80-810C-3A92205A29DC}.Debug|x64.Build.0 = Debug|x64
{E2F320FE-0C01-4C80-810C-3A92205A29DC}.Release|x64.ActiveCfg = Release|x64
{E2F320FE-0C01-4C80-810C-3A92205A29DC}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {A73289FB-DB51-3D6F-802E-B474CC102EDA}
EndGlobalSection
GlobalSection(ExtensibilityAddIns) = postSolution
SolutionGuid = {8CA1BE8F-87A9-4094-B549-E8C790F79D8C}
EndGlobalSection
EndGlobal

View File

@ -76,7 +76,7 @@
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies>zlib.lib; msmpi.lib; mkl_intel_ilp64.lib; mkl_sequential.lib; mkl_core.lib; kernel32.lib; user32.lib; gdi32.lib; winspool.lib; comdlg32.lib; advapi32.lib; shell32.lib; ole32.lib; oleaut32.lib; uuid.lib; odbc32.lib; odbccp32.lib; %(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies>zlib.lib;msmpi.lib;mkl_intel_ilp64.lib;mkl_sequential.lib;mkl_core.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
<StackReserveSize>100000000</StackReserveSize>
<TreatLinkerWarningAsErrors>true</TreatLinkerWarningAsErrors>
</Link>
@ -106,13 +106,20 @@
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
<AdditionalDependencies>zlib.lib; msmpi.lib; mkl_intel_ilp64.lib; mkl_sequential.lib; mkl_core.lib; kernel32.lib; user32.lib; gdi32.lib; winspool.lib; comdlg32.lib; advapi32.lib; shell32.lib; ole32.lib; oleaut32.lib; uuid.lib; odbc32.lib; odbccp32.lib; %(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies>zlib.lib;msmpi.lib;mkl_intel_ilp64.lib;mkl_sequential.lib;mkl_core.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
<StackReserveSize>100000000</StackReserveSize>
<TreatLinkerWarningAsErrors>true</TreatLinkerWarningAsErrors>
</Link>
</ItemDefinitionGroup>
<ItemGroup>
<ClCompile Include="..\src\3rd_party\ExceptionWithCallStack.cpp" />
<ClCompile Include="..\src\3rd_party\pathie-cpp\src\entry_iterator.cpp" />
<ClCompile Include="..\src\3rd_party\pathie-cpp\src\errors.cpp" />
<ClCompile Include="..\src\3rd_party\pathie-cpp\src\path.cpp" />
<ClCompile Include="..\src\3rd_party\pathie-cpp\src\pathie.cpp" />
<ClCompile Include="..\src\3rd_party\pathie-cpp\src\pathie_ifstream.cpp" />
<ClCompile Include="..\src\3rd_party\pathie-cpp\src\pathie_ofstream.cpp" />
<ClCompile Include="..\src\3rd_party\pathie-cpp\src\temp.cpp" />
<ClCompile Include="..\src\3rd_party\sentencepiece\src\bpe_model.cc">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
@ -316,6 +323,103 @@
<ClCompile Include="..\src\3rd_party\yaml-cpp\binary_renamed.cpp" />
<ClCompile Include="..\src\3rd_party\yaml-cpp\yaml-node.cpp" />
<ClInclude Include="..\src\3rd_party\ExceptionWithCallStack.h" />
<ClInclude Include="..\src\3rd_party\nccl\src\collectives\collectives.h">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</ClInclude>
<ClInclude Include="..\src\3rd_party\nccl\src\collectives\device\all_gather.h">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</ClInclude>
<ClInclude Include="..\src\3rd_party\nccl\src\collectives\device\all_reduce.h">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</ClInclude>
<ClInclude Include="..\src\3rd_party\nccl\src\collectives\device\broadcast.h">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</ClInclude>
<ClInclude Include="..\src\3rd_party\nccl\src\collectives\device\common.h">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</ClInclude>
<ClInclude Include="..\src\3rd_party\nccl\src\collectives\device\common_kernel.h">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</ClInclude>
<ClInclude Include="..\src\3rd_party\nccl\src\collectives\device\ll_kernel.h">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</ClInclude>
<ClInclude Include="..\src\3rd_party\nccl\src\collectives\device\primitives.h">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</ClInclude>
<ClInclude Include="..\src\3rd_party\nccl\src\collectives\device\reduce.h">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</ClInclude>
<ClInclude Include="..\src\3rd_party\nccl\src\collectives\device\reduce_kernel.h">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</ClInclude>
<ClInclude Include="..\src\3rd_party\nccl\src\collectives\device\reduce_scatter.h">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</ClInclude>
<ClInclude Include="..\src\3rd_party\nccl\src\include\bootstrap.h">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</ClInclude>
<ClInclude Include="..\src\3rd_party\nccl\src\include\common_coll.h">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</ClInclude>
<ClInclude Include="..\src\3rd_party\nccl\src\include\core.h">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</ClInclude>
<ClInclude Include="..\src\3rd_party\nccl\src\include\debug.h">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</ClInclude>
<ClInclude Include="..\src\3rd_party\nccl\src\include\enqueue.h">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</ClInclude>
<ClInclude Include="..\src\3rd_party\nccl\src\include\group.h">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</ClInclude>
<ClInclude Include="..\src\3rd_party\nccl\src\include\ibvwrap.h">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</ClInclude>
<ClInclude Include="..\src\3rd_party\nccl\src\include\nccl_net.h">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</ClInclude>
<ClInclude Include="..\src\3rd_party\nccl\src\include\net.h">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</ClInclude>
<ClInclude Include="..\src\3rd_party\nccl\src\include\nvlink.h">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</ClInclude>
<ClInclude Include="..\src\3rd_party\nccl\src\include\nvmlwrap.h">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</ClInclude>
<ClInclude Include="..\src\3rd_party\nccl\src\include\param.h">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</ClInclude>
<ClInclude Include="..\src\3rd_party\nccl\src\include\ring.h">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</ClInclude>
<ClInclude Include="..\src\3rd_party\nccl\src\include\rings.h">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</ClInclude>
<ClInclude Include="..\src\3rd_party\nccl\src\include\shm.h">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</ClInclude>
<ClInclude Include="..\src\3rd_party\nccl\src\include\socket.h">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</ClInclude>
<ClInclude Include="..\src\3rd_party\nccl\src\include\topo.h">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</ClInclude>
<ClInclude Include="..\src\3rd_party\nccl\src\include\transport.h">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</ClInclude>
<ClInclude Include="..\src\3rd_party\nccl\src\include\utils.h">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</ClInclude>
<ClInclude Include="..\src\3rd_party\pathie-cpp\include\entry_iterator.hpp" />
<ClInclude Include="..\src\3rd_party\pathie-cpp\include\errors.hpp" />
<ClInclude Include="..\src\3rd_party\pathie-cpp\include\path.hpp" />
<ClInclude Include="..\src\3rd_party\pathie-cpp\include\pathie.hpp" />
<ClInclude Include="..\src\3rd_party\pathie-cpp\include\pathie_ifstream.hpp" />
<ClInclude Include="..\src\3rd_party\pathie-cpp\include\pathie_ofstream.hpp" />
<ClInclude Include="..\src\3rd_party\pathie-cpp\include\temp.hpp" />
<ClInclude Include="..\src\3rd_party\sentencepiece\src\bpe_model.h">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
@ -459,7 +563,6 @@
<ClCompile Include="..\src\data\corpus_nbest.cpp" />
<ClCompile Include="..\src\data\text_input.cpp" />
<ClCompile Include="..\src\3rd_party\cnpy\cnpy.cpp" />
<ClCompile Include="..\src\3rd_party\svd\svd.cpp" />
<ClCompile Include="..\src\layers\loss.cpp" />
<ClCompile Include="..\src\layers\weight.cpp" />
<ClCompile Include="..\src\microsoft\quicksand.cpp">
@ -620,8 +723,6 @@
<ClInclude Include="..\src\3rd_party\spdlog\tests\catch.hpp" />
<ClInclude Include="..\src\3rd_party\spdlog\tests\includes.h" />
<ClInclude Include="..\src\3rd_party\spdlog\tests\utils.h" />
<ClInclude Include="..\src\3rd_party\svd\defs_and_types.h" />
<ClInclude Include="..\src\3rd_party\svd\svd.h" />
<ClInclude Include="..\src\3rd_party\yaml-cpp\anchor.h" />
<ClInclude Include="..\src\3rd_party\yaml-cpp\binary.h" />
<ClInclude Include="..\src\3rd_party\yaml-cpp\collectionstack.h" />
@ -738,7 +839,6 @@
<ClInclude Include="..\src\models\decoder.h" />
<ClInclude Include="..\src\models\encoder.h" />
<ClInclude Include="..\src\models\encoder_decoder.h" />
<ClInclude Include="..\src\models\hardatt.h" />
<ClInclude Include="..\src\models\model_base.h" />
<ClInclude Include="..\src\models\model_factory.h" />
<ClInclude Include="..\src\models\model_task.h" />
@ -808,6 +908,96 @@
<ClInclude Include="..\src\translator\translator.h" />
</ItemGroup>
<ItemGroup>
<None Include="..\src\3rd_party\nccl\src\bootstrap.cu">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</None>
<None Include="..\src\3rd_party\nccl\src\collectives\all_gather.cu">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</None>
<None Include="..\src\3rd_party\nccl\src\collectives\all_reduce.cu">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</None>
<None Include="..\src\3rd_party\nccl\src\collectives\broadcast.cu">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</None>
<None Include="..\src\3rd_party\nccl\src\collectives\device\all_gather.cu">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</None>
<None Include="..\src\3rd_party\nccl\src\collectives\device\all_reduce.cu">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</None>
<None Include="..\src\3rd_party\nccl\src\collectives\device\broadcast.cu">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</None>
<None Include="..\src\3rd_party\nccl\src\collectives\device\functions.cu">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</None>
<None Include="..\src\3rd_party\nccl\src\collectives\device\Makefile">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</None>
<None Include="..\src\3rd_party\nccl\src\collectives\device\reduce.cu">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</None>
<None Include="..\src\3rd_party\nccl\src\collectives\device\reduce_scatter.cu">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</None>
<None Include="..\src\3rd_party\nccl\src\collectives\reduce.cu">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</None>
<None Include="..\src\3rd_party\nccl\src\collectives\reduce_scatter.cu">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</None>
<None Include="..\src\3rd_party\nccl\src\init.cu">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</None>
<None Include="..\src\3rd_party\nccl\src\Makefile">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</None>
<None Include="..\src\3rd_party\nccl\src\misc\enqueue.cu">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</None>
<None Include="..\src\3rd_party\nccl\src\misc\group.cu">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</None>
<None Include="..\src\3rd_party\nccl\src\misc\ibvwrap.cu">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</None>
<None Include="..\src\3rd_party\nccl\src\misc\nvmlwrap.cu">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</None>
<None Include="..\src\3rd_party\nccl\src\misc\rings.cu">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</None>
<None Include="..\src\3rd_party\nccl\src\misc\utils.cu">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</None>
<None Include="..\src\3rd_party\nccl\src\nccl.h.in">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</None>
<None Include="..\src\3rd_party\nccl\src\ring.cu">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</None>
<None Include="..\src\3rd_party\nccl\src\transport.cu">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</None>
<None Include="..\src\3rd_party\nccl\src\transport\net.cu">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</None>
<None Include="..\src\3rd_party\nccl\src\transport\net_ib.cu">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</None>
<None Include="..\src\3rd_party\nccl\src\transport\net_socket.cu">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</None>
<None Include="..\src\3rd_party\nccl\src\transport\p2p.cu">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</None>
<None Include="..\src\3rd_party\nccl\src\transport\shm.cu">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</None>
<None Include="..\src\3rd_party\pathie-cpp\CHANGELOG" />
<None Include="..\src\3rd_party\pathie-cpp\LICENSE" />
<None Include="..\src\3rd_party\pathie-cpp\README.md" />
<None Include="..\src\3rd_party\sentencepiece\src\sentencepiece.proto">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
@ -846,6 +1036,7 @@
<None Include=".editorConfig" />
</ItemGroup>
<ItemGroup>
<Text Include="..\src\3rd_party\pathie-cpp\CMakeLists.txt" />
<Text Include="..\src\3rd_party\sentencepiece\src\CMakeLists.txt">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>

View File

@ -31,9 +31,6 @@
<ClCompile Include="..\src\3rd_party\cnpy\cnpy.cpp">
<Filter>3rd_party\cnpy</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\svd\svd.cpp">
<Filter>3rd_party\svd</Filter>
</ClCompile>
<ClCompile Include="..\src\tensors\backend.cpp">
<Filter>tensors</Filter>
</ClCompile>
@ -421,8 +418,26 @@
<ClCompile Include="..\src\rescorer\score_collector.cpp">
<Filter>rescorer</Filter>
</ClCompile>
<ClCompile Include="..\src\command\marian_train.cpp">
<Filter>command</Filter>
<ClCompile Include="..\src\3rd_party\pathie-cpp\src\entry_iterator.cpp">
<Filter>3rd_party\pathie-cpp\src</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\pathie-cpp\src\errors.cpp">
<Filter>3rd_party\pathie-cpp\src</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\pathie-cpp\src\path.cpp">
<Filter>3rd_party\pathie-cpp\src</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\pathie-cpp\src\pathie.cpp">
<Filter>3rd_party\pathie-cpp\src</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\pathie-cpp\src\pathie_ifstream.cpp">
<Filter>3rd_party\pathie-cpp\src</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\pathie-cpp\src\pathie_ofstream.cpp">
<Filter>3rd_party\pathie-cpp\src</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\pathie-cpp\src\temp.cpp">
<Filter>3rd_party\pathie-cpp\src</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
@ -640,12 +655,6 @@
<ClInclude Include="..\src\3rd_party\spdlog\tests\utils.h">
<Filter>3rd_party\spdlog\tests</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\svd\defs_and_types.h">
<Filter>3rd_party\svd</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\svd\svd.h">
<Filter>3rd_party\svd</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\yaml-cpp\anchor.h">
<Filter>3rd_party\yaml-cpp</Filter>
</ClInclude>
@ -988,9 +997,6 @@
<ClInclude Include="..\src\models\encoder_decoder.h">
<Filter>models</Filter>
</ClInclude>
<ClInclude Include="..\src\models\hardatt.h">
<Filter>models</Filter>
</ClInclude>
<ClInclude Include="..\src\models\model_base.h">
<Filter>models</Filter>
</ClInclude>
@ -1345,6 +1351,118 @@
<ClInclude Include="..\src\3rd_party\sentencepiece\src\word_model_trainer.h">
<Filter>3rd_party\sentencepiece\src</Filter>
</ClInclude>
    <ClCompile Include="..\src\command\marian_train.cpp">
      <Filter>command</Filter>
    </ClCompile>
<ClInclude Include="..\src\3rd_party\nccl\src\collectives\collectives.h">
<Filter>3rd_party\nccl\src\collectives</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\nccl\src\collectives\device\all_gather.h">
<Filter>3rd_party\nccl\src\collectives\device</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\nccl\src\collectives\device\all_reduce.h">
<Filter>3rd_party\nccl\src\collectives\device</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\nccl\src\collectives\device\broadcast.h">
<Filter>3rd_party\nccl\src\collectives\device</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\nccl\src\collectives\device\common.h">
<Filter>3rd_party\nccl\src\collectives\device</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\nccl\src\collectives\device\common_kernel.h">
<Filter>3rd_party\nccl\src\collectives\device</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\nccl\src\collectives\device\ll_kernel.h">
<Filter>3rd_party\nccl\src\collectives\device</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\nccl\src\collectives\device\primitives.h">
<Filter>3rd_party\nccl\src\collectives\device</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\nccl\src\collectives\device\reduce.h">
<Filter>3rd_party\nccl\src\collectives\device</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\nccl\src\collectives\device\reduce_kernel.h">
<Filter>3rd_party\nccl\src\collectives\device</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\nccl\src\collectives\device\reduce_scatter.h">
<Filter>3rd_party\nccl\src\collectives\device</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\nccl\src\include\bootstrap.h">
<Filter>3rd_party\nccl\src\include</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\nccl\src\include\common_coll.h">
<Filter>3rd_party\nccl\src\include</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\nccl\src\include\core.h">
<Filter>3rd_party\nccl\src\include</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\nccl\src\include\debug.h">
<Filter>3rd_party\nccl\src\include</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\nccl\src\include\enqueue.h">
<Filter>3rd_party\nccl\src\include</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\nccl\src\include\group.h">
<Filter>3rd_party\nccl\src\include</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\nccl\src\include\ibvwrap.h">
<Filter>3rd_party\nccl\src\include</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\nccl\src\include\nccl_net.h">
<Filter>3rd_party\nccl\src\include</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\nccl\src\include\net.h">
<Filter>3rd_party\nccl\src\include</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\nccl\src\include\nvlink.h">
<Filter>3rd_party\nccl\src\include</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\nccl\src\include\nvmlwrap.h">
<Filter>3rd_party\nccl\src\include</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\nccl\src\include\param.h">
<Filter>3rd_party\nccl\src\include</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\nccl\src\include\ring.h">
<Filter>3rd_party\nccl\src\include</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\nccl\src\include\rings.h">
<Filter>3rd_party\nccl\src\include</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\nccl\src\include\shm.h">
<Filter>3rd_party\nccl\src\include</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\nccl\src\include\socket.h">
<Filter>3rd_party\nccl\src\include</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\nccl\src\include\topo.h">
<Filter>3rd_party\nccl\src\include</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\nccl\src\include\transport.h">
<Filter>3rd_party\nccl\src\include</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\nccl\src\include\utils.h">
<Filter>3rd_party\nccl\src\include</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\pathie-cpp\include\entry_iterator.hpp">
<Filter>3rd_party\pathie-cpp\include</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\pathie-cpp\include\errors.hpp">
<Filter>3rd_party\pathie-cpp\include</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\pathie-cpp\include\path.hpp">
<Filter>3rd_party\pathie-cpp\include</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\pathie-cpp\include\pathie.hpp">
<Filter>3rd_party\pathie-cpp\include</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\pathie-cpp\include\pathie_ifstream.hpp">
<Filter>3rd_party\pathie-cpp\include</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\pathie-cpp\include\pathie_ofstream.hpp">
<Filter>3rd_party\pathie-cpp\include</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\pathie-cpp\include\temp.hpp">
<Filter>3rd_party\pathie-cpp\include</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<Filter Include="3rd_party">
@ -1386,9 +1504,6 @@
<Filter Include="3rd_party\spdlog\tests">
<UniqueIdentifier>{880c8f51-3306-4d80-a682-7242341b0041}</UniqueIdentifier>
</Filter>
<Filter Include="3rd_party\svd">
<UniqueIdentifier>{880c8f51-3306-4d80-a682-7242341b0044}</UniqueIdentifier>
</Filter>
<Filter Include="3rd_party\yaml-cpp">
<UniqueIdentifier>{880c8f51-3306-4d80-a682-7242341b0047}</UniqueIdentifier>
</Filter>
@ -1482,6 +1597,36 @@
<Filter Include="3rd_party\sentencepiece\src">
<UniqueIdentifier>{638bf0e1-4f83-4b37-9077-2be549d75909}</UniqueIdentifier>
</Filter>
<Filter Include="3rd_party\nccl">
<UniqueIdentifier>{0ba105eb-79fb-4e2a-8940-f1ecebbcd4fe}</UniqueIdentifier>
</Filter>
<Filter Include="3rd_party\nccl\src">
<UniqueIdentifier>{fbc17f5e-3f10-44a9-b3ad-66ce12573174}</UniqueIdentifier>
</Filter>
<Filter Include="3rd_party\nccl\src\collectives">
<UniqueIdentifier>{c6036c35-5848-4fd5-b1a0-59e2042cbb69}</UniqueIdentifier>
</Filter>
<Filter Include="3rd_party\nccl\src\misc">
<UniqueIdentifier>{7b9a131d-9e0a-4c28-8a51-08232ff2e35e}</UniqueIdentifier>
</Filter>
<Filter Include="3rd_party\nccl\src\transport">
<UniqueIdentifier>{0bd9cca8-660b-46f6-aac6-691fb50245f0}</UniqueIdentifier>
</Filter>
<Filter Include="3rd_party\nccl\src\include">
<UniqueIdentifier>{2beba56f-5dda-4994-bef0-16170b6552b4}</UniqueIdentifier>
</Filter>
<Filter Include="3rd_party\nccl\src\collectives\device">
<UniqueIdentifier>{ac585624-4e66-42cd-8e4e-62cb90029610}</UniqueIdentifier>
</Filter>
<Filter Include="3rd_party\pathie-cpp">
<UniqueIdentifier>{825beb7c-2997-408b-af81-34ab5f14593a}</UniqueIdentifier>
</Filter>
<Filter Include="3rd_party\pathie-cpp\include">
<UniqueIdentifier>{db1dd5a2-f331-495d-9e3b-6dc1c01528ab}</UniqueIdentifier>
</Filter>
<Filter Include="3rd_party\pathie-cpp\src">
<UniqueIdentifier>{5d5ee615-192f-4b7f-bdfd-fb8316ceabc8}</UniqueIdentifier>
</Filter>
</ItemGroup>
<ItemGroup>
<None Include=".editorConfig" />
@ -1524,10 +1669,109 @@
<None Include="..\src\3rd_party\sentencepiece\src\sentencepiece_model.proto">
<Filter>3rd_party\sentencepiece\src</Filter>
</None>
<None Include="..\src\3rd_party\nccl\src\bootstrap.cu">
<Filter>3rd_party\nccl\src</Filter>
</None>
<None Include="..\src\3rd_party\nccl\src\init.cu">
<Filter>3rd_party\nccl\src</Filter>
</None>
<None Include="..\src\3rd_party\nccl\src\Makefile">
<Filter>3rd_party\nccl\src</Filter>
</None>
<None Include="..\src\3rd_party\nccl\src\nccl.h.in">
<Filter>3rd_party\nccl\src</Filter>
</None>
<None Include="..\src\3rd_party\nccl\src\ring.cu">
<Filter>3rd_party\nccl\src</Filter>
</None>
<None Include="..\src\3rd_party\nccl\src\transport.cu">
<Filter>3rd_party\nccl\src</Filter>
</None>
<None Include="..\src\3rd_party\nccl\src\collectives\all_gather.cu">
<Filter>3rd_party\nccl\src\collectives</Filter>
</None>
<None Include="..\src\3rd_party\nccl\src\collectives\all_reduce.cu">
<Filter>3rd_party\nccl\src\collectives</Filter>
</None>
<None Include="..\src\3rd_party\nccl\src\collectives\broadcast.cu">
<Filter>3rd_party\nccl\src\collectives</Filter>
</None>
<None Include="..\src\3rd_party\nccl\src\collectives\reduce.cu">
<Filter>3rd_party\nccl\src\collectives</Filter>
</None>
<None Include="..\src\3rd_party\nccl\src\collectives\reduce_scatter.cu">
<Filter>3rd_party\nccl\src\collectives</Filter>
</None>
<None Include="..\src\3rd_party\nccl\src\collectives\device\all_gather.cu">
<Filter>3rd_party\nccl\src\collectives\device</Filter>
</None>
<None Include="..\src\3rd_party\nccl\src\collectives\device\all_reduce.cu">
<Filter>3rd_party\nccl\src\collectives\device</Filter>
</None>
<None Include="..\src\3rd_party\nccl\src\collectives\device\broadcast.cu">
<Filter>3rd_party\nccl\src\collectives\device</Filter>
</None>
<None Include="..\src\3rd_party\nccl\src\collectives\device\functions.cu">
<Filter>3rd_party\nccl\src\collectives\device</Filter>
</None>
<None Include="..\src\3rd_party\nccl\src\collectives\device\Makefile">
<Filter>3rd_party\nccl\src\collectives\device</Filter>
</None>
<None Include="..\src\3rd_party\nccl\src\collectives\device\reduce.cu">
<Filter>3rd_party\nccl\src\collectives\device</Filter>
</None>
<None Include="..\src\3rd_party\nccl\src\collectives\device\reduce_scatter.cu">
<Filter>3rd_party\nccl\src\collectives\device</Filter>
</None>
<None Include="..\src\3rd_party\nccl\src\misc\enqueue.cu">
<Filter>3rd_party\nccl\src\misc</Filter>
</None>
<None Include="..\src\3rd_party\nccl\src\misc\group.cu">
<Filter>3rd_party\nccl\src\misc</Filter>
</None>
<None Include="..\src\3rd_party\nccl\src\misc\ibvwrap.cu">
<Filter>3rd_party\nccl\src\misc</Filter>
</None>
<None Include="..\src\3rd_party\nccl\src\misc\nvmlwrap.cu">
<Filter>3rd_party\nccl\src\misc</Filter>
</None>
<None Include="..\src\3rd_party\nccl\src\misc\rings.cu">
<Filter>3rd_party\nccl\src\misc</Filter>
</None>
<None Include="..\src\3rd_party\nccl\src\misc\utils.cu">
<Filter>3rd_party\nccl\src\misc</Filter>
</None>
<None Include="..\src\3rd_party\nccl\src\transport\net.cu">
<Filter>3rd_party\nccl\src\transport</Filter>
</None>
<None Include="..\src\3rd_party\nccl\src\transport\net_ib.cu">
<Filter>3rd_party\nccl\src\transport</Filter>
</None>
<None Include="..\src\3rd_party\nccl\src\transport\net_socket.cu">
<Filter>3rd_party\nccl\src\transport</Filter>
</None>
<None Include="..\src\3rd_party\nccl\src\transport\p2p.cu">
<Filter>3rd_party\nccl\src\transport</Filter>
</None>
<None Include="..\src\3rd_party\nccl\src\transport\shm.cu">
<Filter>3rd_party\nccl\src\transport</Filter>
</None>
<None Include="..\src\3rd_party\pathie-cpp\CHANGELOG">
<Filter>3rd_party\pathie-cpp</Filter>
</None>
<None Include="..\src\3rd_party\pathie-cpp\LICENSE">
<Filter>3rd_party\pathie-cpp</Filter>
</None>
<None Include="..\src\3rd_party\pathie-cpp\README.md">
<Filter>3rd_party\pathie-cpp</Filter>
</None>
</ItemGroup>
<ItemGroup>
<Text Include="..\src\3rd_party\sentencepiece\src\CMakeLists.txt">
<Filter>3rd_party\sentencepiece\src</Filter>
</Text>
<Text Include="..\src\3rd_party\pathie-cpp\CMakeLists.txt">
<Filter>3rd_party\pathie-cpp</Filter>
</Text>
</ItemGroup>
</Project>