mirror of https://github.com/marian-nmt/marian.git, synced 2024-09-17 09:47:34 +03:00

commit dceb7185d8: fix merge

CHANGELOG.md (35 lines changed)
@@ -7,6 +7,41 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).

## [Unreleased]

### Added

- Faster training (20-30%) by optimizing gradient propagation of biases
- Returning Moses-style hard alignments during decoding single models, ensembles and n-best lists
- Hard alignment extraction strategy taking source words that have an attention value greater than the threshold
- Refactored sync SGD for easier communication and integration with NCCL
- Smaller memory overhead for sync SGD
- NCCL integration (version 2.2.13)

### Fixed

- A couple of bugs in "selection" operators (transpose, shift, cols, rows) during back-prop for a very specific case: when one of the operators is the first operator after a branch, gradient propagation might be interrupted. This did not affect any of the existing models, as such a case was not present, but might have caused future models to not train properly.
- Bug in mini-batch-fit: tied embeddings would result in identical embeddings in the fake source and target batch, causing under-estimation of memory usage and re-allocation.

## [1.5.0] - 2018-06-17

### Added

- Average Attention Networks for Transformer model
- 16-bit matrix multiplication on CPU
- Memoization for constant nodes for decoding
- Autotuning for decoding

### Fixed

- GPU decoding optimizations, about 2x faster decoding of transformer models
- Multi-node MPI-based training on GPUs

## [1.4.0] - 2018-03-13

### Added
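The threshold-based extraction strategy from the changelog is easy to picture in isolation. The sketch below is a self-contained illustration under assumed conventions (row-major attention matrix, one threshold for all positions); it is not Marian's implementation, whose decoder-side code is not part of this diff.

```cpp
// Standalone sketch of threshold-based hard-alignment extraction.
#include <cstdio>
#include <utility>
#include <vector>

// attn[t * srcLen + s] holds the soft attention of target word t over source word s.
std::vector<std::pair<size_t, size_t>> hardAlign(const std::vector<float>& attn,
                                                 size_t srcLen, size_t trgLen,
                                                 float threshold) {
  std::vector<std::pair<size_t, size_t>> pairs; // (source, target) index pairs
  for(size_t t = 0; t < trgLen; ++t)
    for(size_t s = 0; s < srcLen; ++s)
      if(attn[t * srcLen + s] > threshold) // keep source words above the threshold
        pairs.emplace_back(s, t);
  return pairs;
}

int main() {
  // 2 source words, 2 target words; row t is the attention distribution of target word t.
  std::vector<float> attn = {0.9f, 0.1f,
                             0.2f, 0.8f};
  for(auto& p : hardAlign(attn, 2, 2, 0.5f))
    std::printf("%zu-%zu\n", p.first, p.second); // Moses-style "src-trg" pairs
  return 0;
}
```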
@@ -13,6 +13,7 @@ option(COMPILE_CPU "Compile CPU version" ON)
 option(COMPILE_CUDA "Compile GPU version" ON)
 option(USE_STATIC_LIBS "Compile GPU version" OFF)
 option(USE_CUDNN "Use CUDNN library" OFF)
+option(USE_NCCL "Use NCCL library" ON)
 option(USE_MPI "Use MPI library" OFF)

 # Project versioning

@@ -49,6 +50,17 @@ if(CUDA_FOUND)
     LIST(APPEND CUDA_NVCC_FLAGS -DCUDNN; )
   endif(CUDNN_FOUND)
 endif(USE_CUDNN)

+if(USE_NCCL)
+  find_package(NCCL)
+  if(NCCL_FOUND)
+    include_directories(${NCCL_INCLUDE_DIR})
+    set(EXT_LIBS ${EXT_LIBS} ${NCCL_LIBRARIES})
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_NCCL")
+    LIST(APPEND CUDA_NVCC_FLAGS -DUSE_NCCL; )
+  endif(NCCL_FOUND)
+endif(USE_NCCL)

 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DCUDA_FOUND")
 list(APPEND CUDA_NVCC_FLAGS -DCUDA_FOUND; )
 else(CUDA_FOUND)
@@ -3,6 +3,7 @@ Marian

 [![Join the chat at https://gitter.im/marian-nmt](https://badges.gitter.im/amunmt/marian.svg)](https://gitter.im/marian-nmt?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
 [![Build Status](http://vali.inf.ed.ac.uk/jenkins/buildStatus/icon?job=marian-dev)](http://vali.inf.ed.ac.uk/jenkins/job/marian-dev/)
+[![CPU Build Status](http://vali.inf.ed.ac.uk/jenkins/buildStatus/icon?job=marian-dev)](http://vali.inf.ed.ac.uk/jenkins/job/marian-dev-cpu/)
 [![Tests Status](http://vali.inf.ed.ac.uk/jenkins/buildStatus/icon?job=marian-regression-tests)](http://vali.inf.ed.ac.uk/jenkins/job/marian-regression-tests/)
 [![Twitter](https://img.shields.io/twitter/follow/marian_nmt.svg?style=social&label=Follow)](https://twitter.com/intent/follow?screen_name=marian_nmt)

@@ -22,7 +23,7 @@ Named in honour of Marian Rejewski, a Polish mathematician and cryptologist.
 cd marian-dev
 mkdir -p build
 cd build
-cmake .. -DCMAKE_BUILD_TYPE=relwithdebinfo
+cmake .. -DCMAKE_BUILD_TYPE=Release
 make -j
 ```
cmake/FindNCCL.cmake (new file, 30 lines)
@@ -0,0 +1,30 @@
set(NCCL_INC_PATHS
    /usr/include
    /usr/local/include
    /usr/local/cuda/include
    $ENV{NCCL_DIR}/include
    $ENV{CUDA_TOOLKIT_ROOT_DIR}/include
    )

set(NCCL_LIB_PATHS
    /lib
    /lib64
    /usr/lib
    /usr/lib64
    /usr/local/lib
    /usr/local/lib64
    /usr/local/cuda/lib64
    $ENV{NCCL_DIR}/lib64
    $ENV{CUDA_TOOLKIT_ROOT_DIR}/lib64
    )

find_path(NCCL_INCLUDE_DIR NAMES nccl.h PATHS ${NCCL_INC_PATHS})
find_library(NCCL_LIBRARIES NAMES nccl PATHS ${NCCL_LIB_PATHS})

include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(NCCL DEFAULT_MSG NCCL_INCLUDE_DIR NCCL_LIBRARIES)

if (NCCL_FOUND)
  message(STATUS "Found NCCL (include: ${NCCL_INCLUDE_DIR}, library: ${NCCL_LIBRARIES})")
  mark_as_advanced(NCCL_INCLUDE_DIR NCCL_LIBRARIES)
endif ()
@@ -1,17 +0,0 @@ (deleted file)
-#!/usr/bin/env python
-
-import os
-import sys
-
-sys.path.insert(0, os.path.abspath(os.path.dirname(__file__) + "../../build"))
-import libmariannmt as nmt
-
-print >>sys.stderr, "marian-nmt version: ", nmt.version()
-
-if len(sys.argv) == 1:
-    print >>sys.stderr, "Specify s2s arguments"
-    exit(1)
-
-nmt.init(' '.join(sys.argv))
-for line in sys.stdin:
-    print nmt.translate([line.rstrip()])
@@ -1,53 +0,0 @@ (deleted file)
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-import sys
-import os
-import argparse
-
-sys.path.insert(0, os.path.abspath(os.path.dirname(__file__) + "./../build"))
-import libmariannmt as nmt
-
-from bottle import request, Bottle, abort
-
-app = Bottle()
-
-
-@app.route('/translate')
-def handle_websocket():
-    wsock = request.environ.get('wsgi.websocket')
-    if not wsock:
-        abort(400, 'Expected WebSocket request.')
-
-    while True:
-        try:
-            message = wsock.receive()
-            if message is not None:
-                # force potential unicode to str() for boost conversion
-                listSentences = str(message).split('\n')
-                numEle = len(listSentences)
-                if numEle > 0 and listSentences[numEle - 1] == "":
-                    del listSentences[numEle - 1]
-                trans = nmt.translate(listSentences)
-                wsock.send('\n'.join(trans))
-        except WebSocketError:
-            break
-
-
-def parse_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("-c", dest="config")
-    parser.add_argument('-p', dest="port", default=8080, type=int)
-    return parser.parse_args()
-
-
-if __name__ == "__main__":
-    args = parse_args()
-    nmt.init("-c {}".format(args.config))
-
-    from gevent.pywsgi import WSGIServer
-    from geventwebsocket import WebSocketError
-    from geventwebsocket.handler import WebSocketHandler
-    server = WSGIServer(
-        ("0.0.0.0", args.port), app, handler_class=WebSocketHandler)
-    server.serve_forever()
@@ -37,6 +37,8 @@ add_library(marian STATIC
   graph/node_initializers.cpp

   layers/convolution.cpp
+  layers/loss.cpp
+  layers/weight.cpp

   rnn/cells.cpp
   rnn/attention.cpp

@@ -49,6 +51,7 @@ add_library(marian STATIC

   translator/history.cpp
   translator/output_collector.cpp
+  translator/output_printer.cpp
   translator/nth_element.cpp
   translator/helpers.cpp
   translator/scorers.cpp

@@ -58,10 +61,13 @@ add_library(marian STATIC
   training/graph_group_sync.cpp
   training/graph_group_singleton.cpp
   training/graph_group_multinode.cpp
+  training/graph_group_multinode_sync.cpp
   training/validator.cpp
+  training/communicator.cpp

   $<TARGET_OBJECTS:libyaml-cpp>
-  $<TARGET_OBJECTS:SQLiteCpp>)
+  $<TARGET_OBJECTS:SQLiteCpp>
+)

 if(CUDA_FOUND)
 cuda_add_library(marian_cuda

@@ -77,6 +83,7 @@ cuda_add_library(marian_cuda
   translator/helpers.cu
   training/gradient_dropping/gpu/dropper.cu
   training/gradient_dropping/gpu/sparse_algorithm.cu
+  training/communicator.cu
   STATIC)
 endif(CUDA_FOUND)

@@ -98,14 +105,16 @@ set_target_properties(marian_vocab PROPERTIES OUTPUT_NAME marian-vocab)
 set(EXECUTABLES ${EXECUTABLES} marian_train marian_decoder marian_scorer marian_vocab)

 # marian.zip and marian.tgz
-# This combines marian, marian_decoder in a single ZIP or TAR file for execution in MSFT internal tools FLO and Philly.
-# For Philly submission, we need statically-linked versions to deal with library dependencies, so this target is only enabled for static builds.
+# This combines marian, marian_decoder in a single ZIP or TAR file for
+# execution in MSFT internal tools FLO and Philly.
+# For Philly submission, we need statically-linked versions to deal with
+# library dependencies, so this target is only enabled for static builds.
 if(USE_STATIC_LIBS)
   add_custom_command(
     OUTPUT "${CMAKE_BINARY_DIR}/marian.zip"
     COMMAND zip -v -0 -j "${CMAKE_BINARY_DIR}/marian.zip"
             "${CMAKE_BINARY_DIR}/marian"
             "${CMAKE_BINARY_DIR}/marian-decoder"
             "${CMAKE_BINARY_DIR}/marian-scorer"
             "${CMAKE_BINARY_DIR}/marian-vocab"
     DEPENDS marian_train marian_decoder marian_scorer marian_vocab)

@@ -146,10 +155,6 @@ endforeach(exec)

 #set_target_properties(align2steps PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}")

-if(PYTHONLIBS_FOUND)
-#  add_subdirectory(python)
-endif(PYTHONLIBS_FOUND)

 if(COMPILE_TESTS)
   set(CATCH_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/3rd_party)
   add_library(Catch INTERFACE)
@@ -1,16 +1,17 @@
 #include "marian.h"

 #include "training/graph_group_async.h"
-#include "training/graph_group_multinode.h"
+#include "training/graph_group_multinode_sync.h"
 #include "training/graph_group_singleton.h"
 #include "training/graph_group_sync.h"
 #include "training/training.h"

 #ifdef CUDA_FOUND
 #include "training/graph_group_async_drop.h"
+#include "training/graph_group_multinode.h"
 #endif

-bool configureMPI(int, char**);
+bool configureMPI(int, char**, bool);

 int main(int argc, char** argv) {
   using namespace marian;

@@ -19,38 +20,54 @@ int main(int argc, char** argv) {
   auto devices = options->getDevices();

   if(options->get<bool>("multi-node")) {
-    ABORT_IF(!configureMPI(argc, argv), "MPI not found.");
+    ABORT_IF(!configureMPI(argc, argv, options->get<bool>("sync-sgd")),
+             "MPI not found.");
     LOG(warn, "[experimental] Running multi-node training");
-    New<Train<MultiNodeGraphGroup>>(options)->run();
+
+    if(options->get<bool>("sync-sgd")) {
+      New<Train<MultiNodeGraphGroupSync>>(options)->run();
+    }
+    else {
+#ifdef CUDA_FOUND
+      New<Train<MultiNodeGraphGroup>>(options)->run();
+#else
+      ABORT("Asynchronous multi-node training requires CUDA");
+#endif
+    }
   } else {
     if(devices.size() == 1) {
       New<Train<SingletonGraph>>(options)->run();
     } else {
-      if(options->get<bool>("sync-sgd"))
+      if(options->get<bool>("sync-sgd")) {
         New<Train<SyncGraphGroup>>(options)->run();
+      }
+      else if(options->get<float>("grad-dropping-rate") > 0.0) {
 #ifdef CUDA_FOUND
-      else if(options->get<float>("grad-dropping-rate") > 0.0)
         New<Train<AsyncGraphGroupDrop>>(options)->run();
+#else
+        ABORT("Asynchronous training with gradient dropping requires CUDA");
 #endif
-      else
+      }
+      else {
         New<Train<AsyncGraphGroup>>(options)->run();
+      }
     }
   }

   return 0;
 }

-bool configureMPI(int argc, char** argv) {
+bool configureMPI(int argc, char** argv, bool sync) {
   bool enable = false;
 #if MPI_FOUND
+  int required_mode = sync ? MPI_THREAD_SERIALIZED : MPI_THREAD_MULTIPLE;
   int provided_thread_mode = 0;
   MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided_thread_mode);
   // Enable if occasional truncation errors
   MPI_Comm_set_errhandler(MPI_COMM_WORLD, MPI_ERRORS_RETURN);

   ABORT_IF(
-      provided_thread_mode < MPI_THREAD_MULTIPLE,
+      provided_thread_mode < required_mode,
       "Your version of MPI does not support multi-threaded communication.");

   enable = true;
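For context on the `required_mode` change: MPI orders its thread-support levels (SINGLE < FUNNELED < SERIALIZED < MULTIPLE), and synchronous SGD only needs one thread talking to MPI at a time, so SERIALIZED suffices, while asynchronous training needs MULTIPLE. Note the patched code still requests MPI_THREAD_MULTIPLE from MPI_Init_thread and only relaxes the check. A minimal standalone probe, assuming nothing beyond the standard MPI C API:

```cpp
// Minimal illustration of MPI thread-level negotiation (compile with mpic++).
#include <mpi.h>
#include <cstdio>

int main(int argc, char** argv) {
  // Ask for the strongest level; MPI reports what it can actually provide.
  int provided = 0;
  MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);

  // Sync SGD only needs SERIALIZED (one thread communicates at a time);
  // asynchronous training needs MULTIPLE (concurrent sends/receives).
  bool syncOk  = provided >= MPI_THREAD_SERIALIZED;
  bool asyncOk = provided >= MPI_THREAD_MULTIPLE;
  std::printf("serialized ok: %d, multiple ok: %d\n", syncOk, asyncOk);

  MPI_Finalize();
  return 0;
}
```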
@@ -490,6 +490,10 @@ void ConfigParser::addOptionsTraining(po::options_description& desc) {
         ->multitoken()
         ->default_value(std::vector<std::string>({"0"}), "0"),
      "GPU ID(s) to use for training")
+#ifdef USE_NCCL
+    ("no-nccl", po::value<bool>()->zero_tokens()->default_value(false),
+     "Disable inter-GPU communication via NCCL")
+#endif
 #ifdef CUDA_FOUND
     ("cpu-threads", po::value<size_t>()->default_value(0)->implicit_value(1),
      "Use CPU-based computation with this many independent threads, 0 means GPU-based computation")

@@ -607,10 +611,6 @@ void ConfigParser::addOptionsTraining(po::options_description& desc) {
     ("multi-node-overlap", po::value<bool>()
       ->default_value(true),
      "Overlap model computations with MPI communication")
-    ("multi-node-local-optimizers", po::value<bool>()
-      ->zero_tokens()
-      ->default_value(false),
-     "Enable local optimizers with multi-node. Requires optimizer delay to be turned on.")
     ;
   // clang-format on
   desc.add(training);

@@ -722,9 +722,10 @@ void ConfigParser::addOptionsTranslate(po::options_description& desc) {
      "Display n-best list")
     ("shortlist", po::value<std::vector<std::string>>()->multitoken(),
      "Use softmax shortlist: path first best prune")
-    ("weights", po::value<std::vector<float>>()
-      ->multitoken(),
+    ("weights", po::value<std::vector<float>>()->multitoken(),
      "Scorer weights")
     ("alignment", po::value<float>()->default_value(0.f)->implicit_value(1.f),
      "Return word alignments")
     // TODO: the options should be available only in server
     ("port,p", po::value<size_t>()->default_value(8080),
      "Port number for web socket server")

@@ -1006,7 +1007,10 @@ void ConfigParser::parseOptions(int argc, char** argv, bool doValidate) {

     SET_OPTION("multi-node", bool);
     SET_OPTION("multi-node-overlap", bool);
-    SET_OPTION("multi-node-local-optimizers", bool);
+
+#ifdef USE_NCCL
+    SET_OPTION("no-nccl", bool);
+#endif
   }

   if(mode_ == ConfigMode::rescoring) {

@@ -1031,6 +1035,7 @@ void ConfigParser::parseOptions(int argc, char** argv, bool doValidate) {
     SET_OPTION("mini-batch-words", int);
     SET_OPTION_NONDEFAULT("weights", std::vector<float>);
     SET_OPTION_NONDEFAULT("shortlist", std::vector<std::string>);
+    SET_OPTION("alignment", float);
     SET_OPTION("port", size_t);
     SET_OPTION("optimize", bool);
     SET_OPTION("max-length-factor", float);
|
@ -207,6 +207,13 @@ public:
|
||||
return currentBatch_;
|
||||
}
|
||||
|
||||
std::vector<BatchPtr> nextN(size_t num) {
|
||||
std::vector<BatchPtr> batches;
|
||||
for(int i = 0; i < num && *this; ++i)
|
||||
batches.push_back(next());
|
||||
return batches;
|
||||
}
|
||||
|
||||
void prepare(bool shuffle = true) {
|
||||
if(shuffle)
|
||||
data_->shuffle();
|
||||
|
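The new `nextN()` simply drains up to `num` batches and stops early when the generator runs dry. A toy reproduction of the pattern with placeholder types (nothing here is Marian API):

```cpp
// Generic sketch of the nextN() pattern: pull up to `num` items from a
// stateful generator, stopping early when it is exhausted.
#include <cstdio>
#include <vector>

struct Gen {
  int left; // how many items remain
  explicit Gen(int n) : left(n) {}
  explicit operator bool() const { return left > 0; }
  int next() { return left--; }
};

std::vector<int> nextN(Gen& g, size_t num) {
  std::vector<int> out;
  for(size_t i = 0; i < num && g; ++i) // stop at num items or when empty
    out.push_back(g.next());
  return out;
}

int main() {
  Gen g(3);
  auto batch = nextN(g, 5); // asks for 5, gets only the 3 available
  std::printf("%zu items\n", batch.size());
  return 0;
}
```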
@@ -172,33 +172,35 @@ public:
    * @see marian::data::Batch::split(size_t n)
    */
   std::vector<Ptr<SubBatch>> split(size_t n) {
+    ABORT_IF(size_ == 0, "Encountered sub-batch size of 0");
+
     std::vector<Ptr<SubBatch>> splits;

     size_t subSize = std::ceil(size_ / (float)n);
-    size_t totSize = size_;
+    size_t restSize = size_;

     int pos = 0;
     for(int k = 0; k < n; ++k) {
-      size_t __size__ = std::min(subSize, totSize);
+      size_t __size__ = std::min(subSize, restSize);
+      if(__size__ > 0) {
+        auto sb = New<SubBatch>(__size__, width_, vocab_);

-      auto sb = New<SubBatch>(__size__, width_, vocab_);
+        size_t __words__ = 0;
+        for(int j = 0; j < width_; ++j) {
+          for(int i = 0; i < __size__; ++i) {
+            sb->data()[j * __size__ + i] = indices_[j * size_ + pos + i];
+            sb->mask()[j * __size__ + i] = mask_[j * size_ + pos + i];

-      size_t __words__ = 0;
-      for(int j = 0; j < width_; ++j) {
-        for(int i = 0; i < __size__; ++i) {
-          sb->data()[j * __size__ + i] = indices_[j * size_ + pos + i];
-          sb->mask()[j * __size__ + i] = mask_[j * size_ + pos + i];

-          if(mask_[j * size_ + pos + i] != 0)
-            __words__++;
-        }
-      }
+            if(mask_[j * size_ + pos + i] != 0)
+              __words__++;
+          }
+        }

-      sb->setWords(__words__);
-      splits.push_back(sb);
+        sb->setWords(__words__);
+        splits.push_back(sb);

-      totSize -= __size__;
-      pos += __size__;
+        restSize -= __size__;
+        pos += __size__;
+      }
     }
     return splits;
   }

@@ -260,7 +262,7 @@ public:
   /**
    * @brief The number of sentences in the batch, target words.
    */
   size_t sizeTrg() const { return subBatches_.back()->batchSize(); }

   /**
    * @brief The number of words for the longest sentence in the batch plus one.
    */

@@ -291,17 +293,19 @@ public:
                                        Ptr<Options> options) {
     std::vector<Ptr<SubBatch>> batches;

+    size_t idx = 0;
     for(auto len : lengths) {
       auto vocab = New<Vocab>();
       vocab->createFake();
       // data: gets initialized to 0. No EOS symbol is distinguished.
       auto sb = New<SubBatch>(batchSize, len, vocab);
+      // set word indices to different values to avoid identical hashes
+      std::fill(sb->data().begin(), sb->data().end(), idx++);
       // mask: no items are masked out
       std::fill(sb->mask().begin(), sb->mask().end(), 1);

       batches.push_back(sb);
     }

     auto batch = New<CorpusBatch>(batches);

     if(!options) return batch;

     if(options->has("guided-alignment")) {

@@ -331,12 +335,17 @@ public:
    * @see marian::data::SubBatch::split(size_t n)
    */
   std::vector<Ptr<Batch>> split(size_t n) {
+    ABORT_IF(size() == 0, "Encountered batch size of 0");
+
     // split each subbatch separately
-    std::vector<std::vector<Ptr<SubBatch>>> subs;
+    std::vector<std::vector<Ptr<SubBatch>>> subs(n);
     for(auto subBatch : subBatches_) {
       size_t i = 0;
-      for(auto splitSubBatch : subBatch->split(n))
+      for(auto splitSubBatch : subBatch->split(n)) {
+        if(subs.size() <= i)
+          subs.resize(i + 1);
         subs[i++].push_back(splitSubBatch);
+      }
     }

     // create batches from split subbatches
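The arithmetic behind both `split()` fixes is ceil-division plus a shrinking remainder; the old `totSize` bookkeeping could push empty sub-batches. A standalone check of just the size computation (illustrative only, not Marian code):

```cpp
// Sketch of the sub-batch size arithmetic from split(): ceil-divide the batch
// into n chunks and track the remaining size, so the last chunk may be smaller
// and empty chunks are skipped, mirroring the restSize fix above.
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

std::vector<size_t> splitSizes(size_t size, size_t n) {
  size_t subSize = (size_t)std::ceil(size / (float)n);
  size_t restSize = size;
  std::vector<size_t> sizes;
  for(size_t k = 0; k < n; ++k) {
    size_t s = std::min(subSize, restSize);
    if(s > 0) // the old code would have pushed empty sub-batches here
      sizes.push_back(s);
    restSize -= s;
  }
  return sizes;
}

int main() {
  for(size_t s : splitSizes(10, 4)) // ceil(10/4) = 3 -> chunks 3, 3, 3, 1
    std::printf("%zu ", s);
  std::printf("\n");
  return 0;
}
```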
@@ -99,7 +99,7 @@ void CorpusSQLite::fillSQLite() {
     }
   }
   db_->exec("commit;");
-  LOG(info, "[sqlite] Inserted {} lines", lines);
+  LOG(info, "[sqlite] Inserted {} lines", lines - 1);
   LOG(info, "[sqlite] Creating primary index");
   db_->exec("create unique index idx_line on lines (_id);");
 }

@@ -1,5 +1,7 @@
 #pragma once

+#include <cmath>

 #include "functional/defs.h"
 #include "functional/operands.h"
@@ -244,12 +244,13 @@ public:
     }
   }

-  void backward() {
+  void backward(bool zero = true) {
     ABORT_IF(topNodes_.size() > 1,
              "There are more than one top most node for backward step");

     params_->allocateBackward();
-    params_->set_zero_adjoint();
+    if(zero)
+      params_->set_zero_adjoint();

     for(auto&& v : topNodes_)
       v->init_dependent();

@@ -264,7 +265,7 @@ public:
       nodesBackward_.pop_back();

       for(auto&& child : v->children()) {
-        if(child->trainable())
+        if(child->trainable() && child->type() != "param")
           child->set_zero_adjoint();
       }
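The new `zero` flag lets callers keep parameter gradients across several backward passes instead of clearing them each time, which is what sub-batch gradient accumulation needs. A toy illustration of the reset-vs-accumulate distinction (placeholder types, not Marian's graph API):

```cpp
// Generic illustration of backward(bool zero): with zero == false, new
// gradients are added on top of existing ones rather than replacing them.
#include <cstdio>

struct Param {
  float grad = 0.f;
  void backwardStep(float g, bool zero) {
    if(zero)
      grad = 0.f; // start from a fresh gradient buffer
    grad += g;    // adjoints always accumulate into the buffer
  }
};

int main() {
  Param p;
  p.backwardStep(0.5f, /*zero=*/true);   // first sub-batch: reset, then add
  p.backwardStep(0.25f, /*zero=*/false); // later sub-batches: only add
  std::printf("accumulated grad = %g\n", p.grad); // 0.75
  return 0;
}
```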
@@ -313,7 +313,9 @@ Expr affine(Expr a, Expr b, Expr bias, bool transA, bool transB, float scale) {
       if(bc != b)
         bc = rec2(bc);

-      std::vector<Expr> nodes = {ac, bc, bias};
+      int rows = ac->shape().elements() / ac->shape()[-1];
+      Expr ones = ac->graph()->ones({rows, 1});
+      std::vector<Expr> nodes = {ac, bc, bias, ones};
       return rec2(Expression<AffineNodeOp>(nodes, transA, transB, scale),
                   true);
     };

@@ -333,13 +335,16 @@ Expr affine(Expr a, Expr b, Expr bias, bool transA, bool transB, float scale) {
   }
   else {
     // general version, MKL, CBlas or CUDA

     // if clipValue > 0, the inputs will be clipped to range [-clipValue, clipValue]
     // This is meant to keep values at the same range as used during training when
     // optimizing for 8-bit integer products. Likely to be removed in the future
     // when we explore better ways to handle this.
-    std::vector<Expr> nodes = {clip(a, clipValue), clip(b, clipValue), bias};
+    int rows = a->shape().elements() / a->shape()[-1];
+    Expr ones = a->graph()->ones({rows, 1});
+    std::vector<Expr> nodes = {clip(a, clipValue), clip(b, clipValue), bias, ones};
     return Expression<AffineNodeOp>(nodes, transA, transB, scale);
   }
 }

@@ -462,6 +467,7 @@ Expr shift(Expr a, Shape shift, float padValue) {
 //}

 #ifdef CUDA_FOUND
+#ifdef CUDNN

 Expr avg_pooling(Expr x,
                  int height,

@@ -525,5 +531,6 @@ Expr pooling_with_masking(Expr x, Expr mask, int width, bool isEven) {
   return Expression<PoolingWithMaskingOp>(x, mask, width, isEven);
 }

+#endif
 #endif
 }
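Why append a `{rows, 1}` ones vector as a fourth child: for y = x*W + b, the bias gradient is the column-wise sum of the incoming gradient, which equals ones^T * adj, so both the forward bias broadcast and the backward bias reduction become plain GEMM calls. This appears to be the changelog's "faster training by optimizing gradient propagation of biases". A standalone numeric check with explicit loops:

```cpp
// For y = x*W + b, db = sum over rows of dy = ones^T * dy, i.e. one GEMM
// instead of a custom reduction. This check multiplies a 1xR ones row by an
// RxC matrix with plain loops; it is an illustration, not Marian code.
#include <cstdio>
#include <vector>

int main() {
  const int R = 3, C = 2;          // R rows (tokens), C columns (bias dim)
  std::vector<float> adj = {1, 2,
                            3, 4,
                            5, 6}; // row-major R x C "incoming gradient"
  std::vector<float> ones(R, 1.f); // the {rows, 1} ones child
  std::vector<float> dbias(C, 0.f);

  // dbias = ones^T * adj (1xR times RxC -> 1xC), i.e. column sums of adj.
  for(int c = 0; c < C; ++c)
    for(int r = 0; r < R; ++r)
      dbias[c] += ones[r] * adj[r * C + c];

  std::printf("dbias = [%g, %g]\n", dbias[0], dbias[1]); // [9, 12]
  return 0;
}
```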
@@ -106,7 +106,6 @@ Expr flatten_2d(Expr a);

 Expr rows(Expr a, const std::vector<size_t>& indices);
 Expr cols(Expr a, const std::vector<size_t>& indices);

 Expr select(Expr a, int axis, const std::vector<size_t>& indices);

 /*********************************************************/
@@ -4,9 +4,12 @@

 #include "functional/functional.h"
 #include "graph/node.h"
-#include "tensors/gpu/cudnn_wrappers.h"
 #include "tensors/tensor_operators.h"

+#ifdef CUDNN
+#include "tensors/gpu/cudnn_wrappers.h"
+#endif

 namespace marian {

 class DotNodeOp : public NaryNodeOp {

@@ -167,15 +170,17 @@ public:

   NodeOps forwardOps() {
     using namespace functional;

     return {
-      NodeOp(ProdWithBias(val_,
-                          child(0)->val(),
-                          child(1)->val(),
-                          child(2)->val(),
-                          transA_,
-                          transB_,
-                          0.f,
-                          scalar_))
+      NodeOp(Prod(val_,
+                  child(0)->val(),
+                  child(1)->val(),
+                  transA_, transB_, 0.f, scalar_);
+             Prod(val_,
+                  child(3)->val(),
+                  child(2)->val(),
+                  false, false, 1.f, 1.f))
     };
   }

@@ -202,7 +207,12 @@ public:
                           false,
                           1.0,
                           scalar_)),
-          NodeOp(Add(_1, child(2)->grad(), adj_))};
+          NodeOp(Prod(child(2)->grad(),
+                      child(3)->val(), adj_,
+                      true, false,
+                      0.f, 1.f))
+          //NodeOp(Add(_1, child(2)->grad(), adj_))
+          };

     if(transA_ && !transB_)
       return {NodeOp(Prod(child(0)->grad(),

@@ -219,7 +229,12 @@ public:
                           false,
                           1.0,
                           scalar_)),
-          NodeOp(Add(_1, child(2)->grad(), adj_))};
+          NodeOp(Prod(child(2)->grad(),
+                      child(3)->val(), adj_,
+                      true, false,
+                      0.f, 1.f))
+          //NodeOp(Add(_1, child(2)->grad(), adj_))
+          };

     if(transA_ && transB_)
       return {NodeOp(Prod(child(0)->grad(),

@@ -236,7 +251,12 @@ public:
                           true,
                           1.0,
                           scalar_)),
-          NodeOp(Add(_1, child(2)->grad(), adj_))};
+          NodeOp(Prod(child(2)->grad(),
+                      child(3)->val(), adj_,
+                      true, false,
+                      0.f, 1.f))
+          //NodeOp(Add(_1, child(2)->grad(), adj_))
+          };

     return {NodeOp(Prod(child(0)->grad(),
                         adj_,

@@ -252,7 +272,12 @@ public:
                           false,
                           1.0,
                           scalar_)),
-          NodeOp(Add(_1, child(2)->grad(), adj_))};
+          NodeOp(Prod(child(2)->grad(),
+                      child(3)->val(), adj_,
+                      true, false,
+                      0.f, 1.f))
+          //NodeOp(Add(_1, child(2)->grad(), adj_))
+          };
   }

   const std::string type() { return "affine"; }
@@ -294,6 +319,7 @@ public:
   NodeOps forwardOps() {
     // C = alpha * dot(op(A), op(B))
     return {NodeOp(ProdBatched(val_,
+                               graph()->allocator(),
                                child(0)->val(),
                                child(1)->val(),
                                transA_,

@@ -311,6 +337,7 @@ public:

     if(!transA_ && transB_)
       return {NodeOp(ProdBatched(child(0)->grad(),
+                                 graph()->allocator(),
                                  adj_,
                                  child(1)->val(),
                                  false,

@@ -318,6 +345,7 @@ public:
                                  1.0,
                                  scalar_)),
               NodeOp(ProdBatched(child(1)->grad(),
+                                 graph()->allocator(),
                                  adj_,
                                  child(0)->val(),
                                  true,

@@ -327,6 +355,7 @@ public:

     if(transA_ && !transB_)
       return {NodeOp(ProdBatched(child(0)->grad(),
+                                 graph()->allocator(),
                                  child(1)->val(),
                                  adj_,
                                  false,

@@ -334,6 +363,7 @@ public:
                                  1.0,
                                  scalar_)),
               NodeOp(ProdBatched(child(1)->grad(),
+                                 graph()->allocator(),
                                  child(0)->val(),
                                  adj_,
                                  false,

@@ -343,6 +373,7 @@ public:

     if(transA_ && transB_)
       return {NodeOp(ProdBatched(child(0)->grad(),
+                                 graph()->allocator(),
                                  child(1)->val(),
                                  adj_,
                                  true,

@@ -350,6 +381,7 @@ public:
                                  1.0,
                                  scalar_)),
               NodeOp(ProdBatched(child(1)->grad(),
+                                 graph()->allocator(),
                                  adj_,
                                  child(0)->val(),
                                  true,

@@ -358,6 +390,7 @@ public:
                                  scalar_))};

     return {NodeOp(ProdBatched(child(0)->grad(),
+                               graph()->allocator(),
                                adj_,
                                child(1)->val(),
                                false,

@@ -365,6 +398,7 @@ public:
                                1.0,
                                scalar_)),
             NodeOp(ProdBatched(child(1)->grad(),
+                               graph()->allocator(),
                                child(0)->val(),
                                adj_,
                                true,

@@ -766,6 +800,7 @@ struct HighwayNodeOp : public NaryNodeOp {
   const std::string type() { return "highway"; }
 };

+#ifdef CUDNN
 class ConvolutionOp : public NaryNodeOp {
 public:
   ConvolutionOp(const std::vector<Expr>& nodes,

@@ -802,4 +837,5 @@ public:
 protected:
   ConvolutionWrapper conv_;
 };
+#endif
 }
@@ -7,7 +7,9 @@
 #include "graph/node.h"
 #include "tensors/tensor_operators.h"

-//#include "tensors/gpu/cudnn_wrappers.h"
+#ifdef CUDNN
+#include "tensors/gpu/cudnn_wrappers.h"
+#endif

 namespace marian {

@@ -815,7 +817,7 @@ struct TransposeNodeOp : public UnaryNodeOp {
   }

   NodeOps backwardOps() {
-    return {NodeOp(TransposeND(child(0)->grad(), adj_, axes_))};
+    return {NodeOp(TransposeNDGrad(child(0)->grad(), adj_, axes_))};
   }

   template <class... Args>

@@ -1009,7 +1011,9 @@ struct ShiftNodeOp : public UnaryNodeOp {
   }

   NodeOps backwardOps() {
-    return {NodeOp(Shift(child(0)->grad(), adj_, shift_, /*padValue=*/0.f, /*invert=*/true))};
+    // last parameter beta=1 says to use += (out = in + beta * out)
+    // @TODO: check need for padValue_
+    return {NodeOp(ShiftGrad(child(0)->grad(), adj_, shift_, true))};
   }

   const std::string type() { return "shift"; }

@@ -1076,6 +1080,7 @@ struct ShiftNodeOp : public UnaryNodeOp {
 //  Ptr<sparse::CSR> lf_;
 //};

+#ifdef CUDNN
 class PoolingOp : public UnaryNodeOp {
 public:
   PoolingOp(Expr x,

@@ -1109,6 +1114,7 @@ public:
 protected:
   PoolingWrapper pooling_;
 };
+#endif

 class PoolingWithMaskingOp : public UnaryNodeOp {
 public:
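`TransposeNDGrad` and `ShiftGrad` exist because backward kernels must accumulate into a gradient buffer that other branches may already have written, while forward kernels can simply overwrite. A minimal standalone rendering of that `template <bool add>` convention (toy buffers, not Marian kernels):

```cpp
// "Accumulate vs. overwrite": gradient kernels use +=, forward kernels use =.
#include <cstdio>
#include <vector>

template <bool add>
void copyKernel(std::vector<float>& out, const std::vector<float>& in) {
  for(size_t i = 0; i < out.size(); ++i) {
    if(add)
      out[i] += in[i]; // gradient path: keep other branches' contributions
    else
      out[i] = in[i];  // forward path: plain overwrite
  }
}

int main() {
  std::vector<float> grad = {1.f, 1.f};
  std::vector<float> adj = {0.5f, 0.25f};
  copyKernel<true>(grad, adj);              // accumulate
  std::printf("%g %g\n", grad[0], grad[1]); // 1.5 1.25
  copyKernel<false>(grad, adj);             // overwrite
  std::printf("%g %g\n", grad[0], grad[1]); // 0.5 0.25
  return 0;
}
```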
@@ -2,6 +2,8 @@
 #include "graph/node_operators_binary.h"

 namespace marian {

+#ifdef CUDNN
 Convolution::Convolution(Ptr<ExpressionGraph> graph) : Factory(graph) {}

 Expr Convolution::apply(Expr x) {

@@ -29,4 +31,6 @@ Expr Convolution::apply(const std::vector<Expr>&) {
   ABORT("Can't apply convolution on many inputs at once");
   return nullptr;
 }
+#endif

 }

@@ -7,6 +7,7 @@

 namespace marian {

+#ifdef CUDNN
 class Convolution : public Factory {
 protected:
   Ptr<Options> getOptions() { return options_; }

@@ -82,4 +83,6 @@ protected:
   std::vector<int> kernelNums_;
   int stride_;
 };
+#endif

 }
@@ -2,11 +2,14 @@

 #include "marian.h"

-#include "layers/factory.h"
+#include "data/shortlist.h"
+#include "layers/factory.h"

 namespace marian {
 namespace mlp {
 /**
  * @brief Activation functions
  */
 enum struct act : int { linear, tanh, sigmoid, ReLU, LeakyReLU, PReLU, swish };
 }
 }

@@ -64,12 +67,9 @@ public:
       if(inputs.size() > 1)
         num = std::to_string(i);

-      Expr W = g->param(name + "_W" + num,
-                        {in->shape()[-1], dim},
-                        inits::glorot_uniform);
-      Expr b = g->param(name + "_b" + num,
-                        {1, dim},
-                        inits::zeros);
+      Expr W = g->param(
+          name + "_W" + num, {in->shape()[-1], dim}, inits::glorot_uniform);
+      Expr b = g->param(name + "_b" + num, {1, dim}, inits::zeros);

       if(useLayerNorm) {
         if(useNematusNorm) {

@@ -82,9 +82,8 @@ public:

           outputs.push_back(layerNorm(affine(in, W, b), ln_s, ln_b, NEMATUS_LN_EPS));
         } else {
-          auto gamma = g->param(name + "_gamma" + num,
-                                {1, dim},
-                                inits::from_value(1.0));
+          auto gamma = g->param(
+              name + "_gamma" + num, {1, dim}, inits::from_value(1.0));

           outputs.push_back(layerNorm(dot(in, W), gamma, b));
         }

@@ -107,9 +106,7 @@ public:
     }
   };

-  Expr apply(Expr input) {
-    return apply(std::vector<Expr>({input}));
-  }
+  Expr apply(Expr input) { return apply(std::vector<Expr>({input})); }
 };

 class Output : public Layer {

@@ -129,9 +126,7 @@ public:
     tiedParams_[param] = graph_->get(tied);
   }

-  void set_shortlist(Ptr<data::Shortlist> shortlist) {
-    shortlist_ = shortlist;
-  }
+  void set_shortlist(Ptr<data::Shortlist> shortlist) { shortlist_ = shortlist; }

   Expr apply(Expr input) {
     if(!W_) {

@@ -146,15 +141,13 @@ public:
         W_ = rows(W_, shortlist_->indices());
       } else {
         W_ = graph_->param(name + "_" + nameW,
                            {input->shape()[-1], dim},
                            inits::glorot_uniform);
         if(shortlist_)
           W_ = cols(W_, shortlist_->indices());
       }

-      b_ = graph_->param(name + "_b",
-                         {1, dim},
-                         inits::zeros);
+      b_ = graph_->param(name + "_b", {1, dim}, inits::zeros);
       if(shortlist_)
         b_ = cols(b_, shortlist_->indices());
     }

@@ -165,10 +158,8 @@ public:
   virtual Expr apply(const std::vector<Expr>& inputs) {
     ABORT("Not implemented");
   };
 };

 }  // namespace mlp

 struct EmbeddingFactory : public Factory {

@@ -195,51 +186,4 @@ struct EmbeddingFactory : public Factory {
 };

 typedef Accumulator<EmbeddingFactory> embedding;

-static inline Expr Cost(Expr logits,
-                        Expr indices,
-                        Expr mask,
-                        std::string costType = "cross-entropy",
-                        float smoothing = 0,
-                        Expr weights = nullptr) {
-  using namespace keywords;
-
-  auto ce = cross_entropy(logits, indices);
-
-  if(weights)
-    ce = weights * ce;
-
-  if(smoothing > 0) {
-    // @TODO: add this to CE kernels instead
-    auto ceq = mean(logsoftmax(logits), axis = -1);
-    ce = (1 - smoothing) * ce - smoothing * ceq;
-  }
-
-  if(mask)
-    ce = ce * mask;
-
-  auto costSum = sum(ce, axis = -3);
-
-  Expr cost;
-  // axes:
-  //  - time axis (words): -3
-  //  - batch axis (sentences): -2
-  if(costType == "ce-mean"
-     || costType == "cross-entropy") { // sum over words; average over sentences
-    cost = mean(costSum, axis = -2);
-  } else if(costType == "ce-mean-words") { // average over target tokens
-    cost = sum(costSum, axis = -2) / sum(sum(mask, axis = -3), axis = -2);
-  } else if(costType == "ce-sum") { // sum over target tokens
-    cost = sum(costSum, axis = -2);
-  } else if(costType == "perplexity") { // == exp('ce-mean-words')
-    cost = exp(sum(costSum, axis = -2) / sum(sum(mask, axis = -3), axis = -2));
-  } else if(costType == "ce-rescore") { // sum over words, keep batch axis
-    cost = -costSum;
-  } else { // same as ce-mean
-    cost = mean(costSum, axis = -2);
-  }
-
-  return cost;
-}
 }
src/layers/loss.cpp (new file, 94 lines)
@@ -0,0 +1,94 @@
#include "layers/loss.h"

namespace marian {

Ptr<LossBase> LossFactory(Ptr<Options> options, bool inference) {
  float smoothing = inference ? 0.f : options->get<float>("label-smoothing");
  std::string costType = options->get<std::string>("cost-type", "ce-mean");
  if(costType == "ce-mean" || costType == "cross-entropy") {
    return New<CrossEntropyMeanLoss>(smoothing);
  } else if(costType == "ce-mean-words") {
    return New<CrossEntropyMeanWordsLoss>(smoothing);
  } else if(costType == "ce-sum") {
    return New<CrossEntropySumLoss>(smoothing);
  } else if(costType == "perplexity") {
    return New<PerplexityLoss>(smoothing);
  } else if(costType == "ce-rescore") {
    return New<CrossEntropyRescoreLoss>(smoothing);
  } else {  // same as ce-mean
    return New<CrossEntropyMeanLoss>(smoothing);
  }
}

Expr LossBase::getCrossEntropy(Expr logits,
                               Expr indices,
                               Expr mask,
                               Expr weights) {
  using namespace keywords;

  auto ce = cross_entropy(logits, indices);

  if(smoothing_ > 0) {
    // @TODO: add this to CE kernels instead
    auto ceq = mean(logsoftmax(logits), axis = -1);
    ce = (1 - smoothing_) * ce - smoothing_ * ceq;
  }

  if(mask)
    ce = ce * mask;

  if(weights)
    ce = ce * weights;

  return ce;
}

Expr CrossEntropyMeanLoss::getCost(Expr logits,
                                   Expr indices,
                                   Expr mask,
                                   Expr weights) {
  using namespace keywords;
  auto ce = getCrossEntropy(logits, indices, mask, weights);
  // Time axis (words): -3
  // Batch axis (sentences): -2
  return mean(sum(ce, axis = -3), axis = -2);
}

Expr CrossEntropyMeanWordsLoss::getCost(Expr logits,
                                        Expr indices,
                                        Expr mask,
                                        Expr weights) {
  using namespace keywords;
  auto ce = getCrossEntropy(logits, indices, mask, weights);
  return sum(sum(ce, axis = -3), axis = -2)
         / sum(sum(mask, axis = -3), axis = -2);
}

Expr CrossEntropySumLoss::getCost(Expr logits,
                                  Expr indices,
                                  Expr mask,
                                  Expr weights) {
  using namespace keywords;
  auto ce = getCrossEntropy(logits, indices, mask, weights);
  return sum(sum(ce, axis = -3), axis = -2);
}

Expr PerplexityLoss::getCost(Expr logits,
                             Expr indices,
                             Expr mask,
                             Expr weights) {
  using namespace keywords;
  auto ce = getCrossEntropy(logits, indices, mask, weights);
  return exp(sum(sum(ce, axis = -3), axis = -2)
             / sum(sum(mask, axis = -3), axis = -2));
}

Expr CrossEntropyRescoreLoss::getCost(Expr logits,
                                      Expr indices,
                                      Expr mask,
                                      Expr weights) {
  using namespace keywords;
  auto ce = getCrossEntropy(logits, indices, mask, weights);
  return -sum(ce, axis = -3);
}
}
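`LossFactory` is a plain string-dispatched factory with a ce-mean fallback. A self-contained miniature of the same pattern (toy classes; only the dispatch logic mirrors the file above):

```cpp
// Miniature of the LossFactory dispatch: map a cost-type string to a
// polymorphic loss object, falling back to the ce-mean variant.
#include <cstdio>
#include <memory>
#include <string>

struct LossBase {
  virtual ~LossBase() = default;
  virtual const char* name() const = 0;
};
struct CeMean : LossBase { const char* name() const override { return "ce-mean"; } };
struct CeSum  : LossBase { const char* name() const override { return "ce-sum"; } };

std::unique_ptr<LossBase> lossFactory(const std::string& costType) {
  if(costType == "ce-sum")
    return std::make_unique<CeSum>();
  return std::make_unique<CeMean>(); // "ce-mean" and anything unknown
}

int main() {
  std::printf("%s\n", lossFactory("ce-sum")->name()); // ce-sum
  std::printf("%s\n", lossFactory("bogus")->name());  // ce-mean fallback
  return 0;
}
```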
src/layers/loss.h (new file, 70 lines)
@@ -0,0 +1,70 @@
#pragma once

#include "marian.h"

namespace marian {
class LossBase {
protected:
  float smoothing_;

public:
  explicit LossBase(float smoothing = 0) : smoothing_(smoothing){};

  Expr getCrossEntropy(Expr logits, Expr indices, Expr mask, Expr weights);
  virtual Expr getCost(Expr logits,
                       Expr indices,
                       Expr mask,
                       Expr weights = nullptr)
      = 0;
};

/*
 * @brief The cross entropy loss function
 *
 * A sum over words and average over sentences
 */
class CrossEntropyMeanLoss : public LossBase {
public:
  explicit CrossEntropyMeanLoss(float smoothing = 0) : LossBase(smoothing){};
  Expr getCost(Expr logits, Expr indices, Expr mask, Expr weights);
};

/*
 * @brief The cross entropy loss function as an average over target tokens
 */
class CrossEntropyMeanWordsLoss : public LossBase {
public:
  explicit CrossEntropyMeanWordsLoss(float smoothing = 0)
      : LossBase(smoothing){};
  Expr getCost(Expr logits, Expr indices, Expr mask, Expr weights);
};

/*
 * @brief The cross entropy loss function as a sum over target tokens
 */
class CrossEntropySumLoss : public LossBase {
public:
  explicit CrossEntropySumLoss(float smoothing = 0) : LossBase(smoothing){};
  Expr getCost(Expr logits, Expr indices, Expr mask, Expr weights);
};

/*
 * @brief The perplexity loss function
 */
class PerplexityLoss : public LossBase {
public:
  explicit PerplexityLoss(float smoothing = 0) : LossBase(smoothing){};
  Expr getCost(Expr logits, Expr indices, Expr mask, Expr weights);
};

/*
 * @brief The cross entropy loss function that keeps sentence-level costs
 */
class CrossEntropyRescoreLoss : public LossBase {
public:
  explicit CrossEntropyRescoreLoss(float smoothing = 0) : LossBase(smoothing){};
  Expr getCost(Expr logits, Expr indices, Expr mask, Expr weights);
};

Ptr<LossBase> LossFactory(Ptr<Options> options, bool inference);
}
src/layers/weight.cpp (new file, 21 lines)
@@ -0,0 +1,21 @@
#include "layers/weight.h"

namespace marian {

Ptr<WeightingBase> WeightingFactory(Ptr<Options> options) {
  if(options->has("data-weighting"))
    return New<DataWeighting>(options->get<std::string>("data-weighting-type"));
}

Expr DataWeighting::getWeights(Ptr<ExpressionGraph> graph,
                               Ptr<data::CorpusBatch> batch) {
  ABORT_IF(batch->getDataWeights().empty(),
           "Vector of weights is unexpectedly empty!");
  bool sentenceWeighting = weightingType_ == "sentence";
  int dimBatch = batch->size();
  int dimWords = sentenceWeighting ? 1 : batch->back()->batchWidth();
  auto weights = graph->constant({1, dimWords, dimBatch, 1},
                                 inits::from_vector(batch->getDataWeights()));
  return weights;
}
}
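One caveat worth noting: `WeightingFactory` above has no return statement when "data-weighting" is unset, which is undefined behavior if that path is ever taken. A hypothetical guarded variant, reusing the types declared in weight.h below (this is a suggestion, not part of the commit):

```cpp
// Hypothetical fix, not the committed code: return an explicit nullptr on
// the else-path so callers can test for "no weighting configured".
Ptr<WeightingBase> WeightingFactory(Ptr<Options> options) {
  if(options->has("data-weighting"))
    return New<DataWeighting>(options->get<std::string>("data-weighting-type"));
  return nullptr; // no weighting configured
}
```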
src/layers/weight.h (new file, 33 lines)
@@ -0,0 +1,33 @@
#pragma once

#include "common/options.h"
#include "data/corpus.h"
#include "graph/expression_graph.h"
#include "graph/expression_operators.h"
#include "graph/node_initializers.h"

namespace marian {

class WeightingBase {
public:
  WeightingBase(){};
  virtual Expr getWeights(Ptr<ExpressionGraph> graph,
                          Ptr<data::CorpusBatch> batch)
      = 0;
  virtual void debugWeighting(std::vector<float> weightedMask,
                              std::vector<float> freqMask,
                              Ptr<data::CorpusBatch> batch){};
};

class DataWeighting : public WeightingBase {
protected:
  std::string weightingType_;

public:
  DataWeighting(std::string weightingType)
      : WeightingBase(), weightingType_(weightingType){};
  Expr getWeights(Ptr<ExpressionGraph> graph, Ptr<data::CorpusBatch> batch);
};

Ptr<WeightingBase> WeightingFactory(Ptr<Options> options);
}
@@ -1,8 +1,10 @@
 #pragma once

-#include "models/encoder_decoder.h"
 #include "layers/generic.h"
 #include "layers/guided_alignment.h"
+#include "layers/loss.h"
+#include "layers/weight.h"
+#include "models/encoder_decoder.h"

 namespace marian {
 namespace models {

@@ -12,58 +14,57 @@ public:
   virtual Expr apply(Ptr<ModelBase> model,
                      Ptr<ExpressionGraph> graph,
                      Ptr<data::Batch> batch,
-                     bool clearGraph = true) = 0;
+                     bool clearGraph = true)
+      = 0;
 };

 class EncoderDecoderCE : public CostBase {
 protected:
   Ptr<Options> options_;

+  bool inference_{false};
+  bool toBeWeighted_{false};
+  Ptr<LossBase> loss_;
+  Ptr<WeightingBase> weighter_;

 public:
   EncoderDecoderCE(Ptr<Options> options)
-      : options_(options) {}
+      : options_(options), inference_(options->get<bool>("inference", false)) {
+    loss_ = LossFactory(options_, inference_);
+
+    toBeWeighted_ = (options_->has("data-weighting") && !inference_)
+                    || (options_->has("dynamic-weighting")
+                        && options_->get<bool>("dynamic-weighting")
+                        && !inference_);
+    if(toBeWeighted_)
+      weighter_ = WeightingFactory(options_);
+  }

   Expr apply(Ptr<ModelBase> model,
              Ptr<ExpressionGraph> graph,
              Ptr<data::Batch> batch,
              bool clearGraph = true) {
     auto encdec = std::static_pointer_cast<EncoderDecoder>(model);
     auto corpusBatch = std::static_pointer_cast<data::CorpusBatch>(batch);

     auto state = encdec->stepAll(graph, corpusBatch, clearGraph);

-    std::string costType = options_->get<std::string>("cost-type");
-    bool inference = options_->get<bool>("inference", false);
-    float ls = inference ? 0.f : options_->get<float>("label-smoothing");
+    float ls = inference_ ? 0.f : options_->get<float>("label-smoothing");

     Expr weights;
-    bool sentenceWeighting = false;
+    Expr cost;

-    if(options_->has("data-weighting") && !inference) {
-      ABORT_IF(corpusBatch->getDataWeights().empty(),
-               "Vector of weights is unexpectedly empty!");
-
-      sentenceWeighting
-          = options_->get<std::string>("data-weighting-type") == "sentence";
-      int dimBatch = corpusBatch->size();
-      int dimWords = sentenceWeighting ? 1 : corpusBatch->back()->batchWidth();
-
-      weights = graph->constant({1, dimWords, dimBatch, 1},
-                                inits::from_vector(corpusBatch->getDataWeights()));
+    if(toBeWeighted_) {
+      weights = weighter_->getWeights(graph, corpusBatch);
     }

-    auto cost
-        = Cost(state->getProbs(),
-               state->getTargetIndices(),
-               state->getTargetMask(),
-               costType,
-               ls,
-               weights);
+    cost = loss_->getCost(state->getProbs(),
+                          state->getTargetIndices(),
+                          state->getTargetMask(),
+                          weights);

-    if(options_->has("guided-alignment") && !inference) {
+    if(options_->has("guided-alignment") && !inference_) {
       auto alignments = encdec->getDecoders()[0]->getAlignments();
       ABORT_IF(alignments.empty(), "Model does not seem to support alignments");

@@ -73,8 +74,6 @@ public:
     } else {
       return cost;
     }
-
-    return cost;
   }
 };

@@ -85,7 +84,7 @@ protected:

 public:
   Trainer(Ptr<ModelBase> model, Ptr<CostBase> cost)
       : model_(model), cost_(cost) {}

   Ptr<ModelBase> getModel() { return model_; }

@@ -104,16 +103,10 @@ public:
   virtual Expr build(Ptr<ExpressionGraph> graph,
                      Ptr<data::Batch> batch,
                      bool clearGraph = true) {
-    return cost_->apply(model_,
-                        graph,
-                        batch,
-                        clearGraph);
-  };
-
-  virtual void clear(Ptr<ExpressionGraph> graph) {
-    model_->clear(graph);
+    return cost_->apply(model_, graph, batch, clearGraph);
   };

+  virtual void clear(Ptr<ExpressionGraph> graph) { model_->clear(graph); };
 };

 typedef Trainer Scorer;

@@ -138,11 +131,11 @@ protected:

 public:
   Stepwise(Ptr<EncoderDecoderBase> encdec, Ptr<CostStep> cost)
       : encdec_(encdec), cost_(cost) {}

   virtual void load(Ptr<ExpressionGraph> graph,
                     const std::string& name,
                     bool markedReloaded = true) {
     encdec_->load(graph, name, markedReloaded);
   }

@@ -152,9 +145,7 @@ public:
     encdec_->save(graph, name, saveTranslatorConfig);
   }

-  virtual void clear(Ptr<ExpressionGraph> graph) {
-    encdec_->clear(graph);
-  }
+  virtual void clear(Ptr<ExpressionGraph> graph) { encdec_->clear(graph); }

   virtual Expr build(Ptr<ExpressionGraph> graph,
                      Ptr<data::Batch> batch,

@@ -174,7 +165,8 @@ public:
                      const std::vector<size_t>& embIndices,
                      int dimBatch,
                      int beamSize) {
-    auto nextState = encdec_->step(graph, state, hypIndices, embIndices, dimBatch, beamSize);
+    auto nextState = encdec_->step(
+        graph, state, hypIndices, embIndices, dimBatch, beamSize);
     return cost_->apply(nextState);
   }

@@ -185,11 +177,10 @@ public:
     return nullptr;
   }

-  virtual Ptr<Options> getOptions() {
-    return encdec_->getOptions();
-  };
+  virtual Ptr<Options> getOptions() { return encdec_->getOptions(); };

-  virtual void setShortlistGenerator(Ptr<data::ShortlistGenerator> shortlistGenerator) {
+  virtual void setShortlistGenerator(
+      Ptr<data::ShortlistGenerator> shortlistGenerator) {
     encdec_->setShortlistGenerator(shortlistGenerator);
   };

@@ -197,10 +188,14 @@ public:
     return encdec_->getShortlist();
   };

+  virtual std::vector<float> getAlignment() {
+    return encdec_->getAlignment();
+  }
 };

-static Ptr<ModelBase> add_cost(Ptr<EncoderDecoder> encdec, Ptr<Options> options) {
-  switch (options->get<usage>("usage", usage::raw)) {
+static Ptr<ModelBase> add_cost(Ptr<EncoderDecoder> encdec,
+                               Ptr<Options> options) {
+  switch(options->get<usage>("usage", usage::raw)) {
   case usage::training:
     return New<Trainer>(encdec, New<EncoderDecoderCE>(options));
   case usage::scoring:

@@ -208,10 +203,8 @@ static Ptr<ModelBase> add_cost(Ptr<EncoderDecoder> encdec, Ptr<Options> options)
   case usage::translation:
     return New<Stepwise>(encdec, New<LogsoftmaxStep>());
   case usage::raw:
-  default:
-    return encdec;
+  default: return encdec;
   }
 }

 }
 }
@@ -44,6 +44,8 @@ public:
   virtual void setShortlistGenerator(Ptr<data::ShortlistGenerator> shortlistGenerator) = 0;

   virtual Ptr<data::Shortlist> getShortlist() = 0;

+  virtual std::vector<float> getAlignment() = 0;
 };

 class EncoderDecoder : public EncoderDecoderBase {

@@ -113,6 +115,12 @@ public:
     return decoders_[0]->getShortlist();
   };

+  virtual std::vector<float> getAlignment() {
+    std::vector<float> softAlign;
+    decoders_[0]->getAlignments()[0]->val()->get(softAlign);
+    return softAlign;
+  };

   /*********************************************************************/

   virtual Ptr<DecoderState> startState(Ptr<ExpressionGraph> graph,
@@ -21,6 +21,10 @@ public:
     ABORT_IF(options_->get<std::string>("dec-cell") != "gru-nematus",
              "--type nematus does not currently support other rnn cells "
              "than gru-nematus, use --type s2s");
+
+    ABORT_IF(options_->get<int>("dec-cell-high-depth") > 1,
+             "--type nematus does not currently support "
+             "--dec-cell-high-depth > 1, use --type s2s");
   }

   void load(Ptr<ExpressionGraph> graph,
@@ -1,42 +0,0 @@ (deleted file)
-cuda_add_library(pymarian SHARED
-  mariannmt.cpp
-  ../3rd_party/cnpy/cnpy.cpp
-  ../3rd_party/exception.cpp
-  ../3rd_party/svd/svd.cpp
-  ../graph/expression_graph.cpp
-  ../graph/expression_operators.cu
-  ../graph/node.cu
-  ../graph/node_operators.cu
-  ../tensors/tensor.cu
-  ../tensors/device.cpp
-  ../kernels/tensor_operators.cu
-  ../tensors/gpu/dropout.cu
-  ../tensors/cpu/dropout.cpp
-  ../kernels/sparse.cu
-  #../layers/param_initializers.cu
-  ../rnn/attention.cu
-  ../rnn/cells.cu
-  #../optimizers/clippers.cu
-  #../optimizers/optimizers.cu
-  ../common/utils.cpp
-  ../common/logging.cpp
-  ../common/config.cpp
-  ../common/config_parser.cpp
-  ../translator/history.cpp
-  ../translator/output_collector.cpp
-  ../translator/nth_element.cu
-  ../translator/helpers.cu
-  ../data/vocab.cpp
-  ../data/corpus.cpp
-  ../data/text_input.cpp
-  #../rescorer/score_collector.cpp
-  $<TARGET_OBJECTS:libyaml-cpp>
-)
-
-set_target_properties(pymarian PROPERTIES EXCLUDE_FROM_ALL 1)
-set_target_properties(pymarian PROPERTIES OUTPUT_NAME mariannmt)
-set_target_properties(pymarian PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}")
-set_target_properties(pymarian PROPERTIES LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}")
-
-target_link_libraries(pymarian ${EXT_LIBS} marian)
-cuda_add_cublas_to_target(pymarian)
@@ -1,44 +0,0 @@ (deleted file)
-#include <cstdlib>
-#include <iostream>
-#include <string>
-
-#include <boost/python.hpp>
-
-#include "common/utils.h"
-#include "common/version.h"
-#include "translator/beam_search.h"
-#include "translator/translator.h"
-
-using namespace marian;
-
-Ptr<TranslateServiceMultiGPU<BeamSearch>> task;
-
-void init(const std::string& argopts) {
-  auto options = New<Config>(argopts, ConfigMode::translating);
-  task = New<TranslateServiceMultiGPU<BeamSearch>>(options);
-  LOG(info, "Translator initialized");
-}
-
-boost::python::list translate(boost::python::list& pyinput) {
-  std::vector<std::string> input;
-  for(int i = 0; i < boost::python::len(pyinput); ++i) {
-    input.emplace_back(
-        boost::python::extract<std::string>(boost::python::object(pyinput[i])));
-  }
-
-  auto output = task->run(input);
-
-  boost::python::list pyoutput;
-  pyoutput.append(Join(output, "\n"));
-  return pyoutput;
-}
-
-std::string version() {
-  return PROJECT_VERSION;
-}
-
-BOOST_PYTHON_MODULE(libmariannmt) {
-  boost::python::def("init", init);
-  boost::python::def("translate", translate);
-  boost::python::def("version", version);
-}
@@ -16,9 +16,27 @@
 namespace marian {

 class AllocationException : public std::exception {
+private:
+  char* message_;
+
 public:
-  virtual const char* what() const throw() {
-    return "Memory re-allocation attempted";
+  AllocationException(size_t available, size_t asked) {
+    std::string mstr = "Attempted allocation of " + std::to_string(asked)
+                       + ", but only " + std::to_string(available) + " free";
+
+    message_ = new char[mstr.size() + 1];
+    std::copy(mstr.begin(), mstr.end(), message_);
+  }
+
+  ~AllocationException() { delete[] message_; }
+
+  virtual const char* what() const noexcept {
+    return message_;
   }
 };
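As committed, the exception copies `mstr` into a `new[]` buffer without writing the trailing '\0' (the buffer is one byte larger but that byte is uninitialized), and the class has no copy constructor even though thrown exceptions are copied by value. A hypothetical `std::string`-based variant that avoids both issues (a sketch, not the committed code):

```cpp
// Hypothetical alternative: a std::string member sidesteps the manual
// new[]/delete[] entirely, is always null-terminated via c_str(), and
// makes the exception safely copyable.
#include <cstddef>
#include <exception>
#include <string>

class AllocationException : public std::exception {
  std::string message_;

public:
  AllocationException(size_t available, size_t asked)
      : message_("Attempted allocation of " + std::to_string(asked)
                 + ", but only " + std::to_string(available) + " free") {}

  const char* what() const noexcept override { return message_.c_str(); }
};
```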
@@ -111,7 +129,7 @@ private:
     auto it = std::lower_bound(gaps_.begin(), gaps_.end(), Gap(nullptr, size));

     if(throw_ && it == gaps_.end()) {
-      throw AllocationException();
+      throw AllocationException(available_, size);
     }

     while(it == gaps_.end()) {

@@ -119,8 +137,11 @@ private:
       it = std::lower_bound(gaps_.begin(), gaps_.end(), Gap(nullptr, size));
     }

-    available_ -= it->size();
-    return *it;
+    Gap gap = *it;
+    gaps_.erase(it);
+
+    available_ -= gap.size();
+    return gap;
   }

   void insertGap(Gap gap, bool consolidate = true) {

@@ -186,7 +207,6 @@ public:
     bytes = align(bytes);
     Gap gap = getGap(bytes);

-    gaps_.erase(gap);
     if(gap.size() > bytes) {
       insertGap(gap.rest(bytes), false);
     }
@@ -23,9 +23,9 @@ template <size_t I = 0> struct E {
                       functional::Array<functional::Tensor<float>, K>& tensors,
                       functional::Array<int, K> indices) {

-    auto& shape = tensors[0].shape();
+    const auto& shape = tensors[0].shape();

-    // loop for outer-most dimension
+    // loop over outer-most dimension
     for(int i = 0; i < shape[I]; ++i) {

       // call loop for next-inner dimension

@@ -66,7 +66,7 @@ void Element(const Functor& functor, marian::Tensor out, Tensors... tensors) {

   // call elementwise operation going from outer-most dimension
   // to inner-most element.
-  E<>::element(functor, gTensors, indices);
+  E<0>::element(functor, gTensors, indices);
 }

 }
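`E<I>` unrolls one nested loop per tensor dimension at compile time; the fix spells the entry point as `E<0>` explicitly. A self-contained toy of the same recursion over a fixed two-dimensional shape:

```cpp
// Toy version of the E<I> recursion: one template instantiation per
// dimension, unrolled at compile time; the innermost level applies the functor.
#include <array>
#include <cstdio>

constexpr int kDims = 2;
using Shape = std::array<int, kDims>;

template <size_t I = 0>
struct E {
  template <class F>
  static void element(F f, const Shape& shape, Shape& idx) {
    for(idx[I] = 0; idx[I] < shape[I]; ++idx[I]) // loop for dimension I
      E<I + 1>::element(f, shape, idx);          // recurse into dimension I+1
  }
};

template <>
struct E<kDims> { // innermost level: all indices fixed, apply the functor
  template <class F>
  static void element(F f, const Shape&, Shape& idx) { f(idx); }
};

int main() {
  Shape shape = {2, 3}, idx = {0, 0};
  E<0>::element([](const Shape& i) { std::printf("(%d,%d) ", i[0], i[1]); },
                shape, idx);
  std::printf("\n");
  return 0;
}
```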
@@ -95,8 +95,9 @@ void Prod(marian::Tensor C,
 }

 void ProdBatched(marian::Tensor C,
-                 const marian::Tensor& A,
-                 const marian::Tensor& B,
+                 Ptr<Allocator> allocator,
+                 const marian::Tensor A,
+                 const marian::Tensor B,
                  bool transA,
                  bool transB,
                  float beta,

@@ -128,30 +129,21 @@ void ProdBatched(marian::Tensor C,
   auto strideA = batchA == 1 ? 0 : m * k;
   auto strideC = n * m;

-  int steps = std::max(batchA, batchB);
-
-  int offsetA = 0;
-  int offsetB = 0;
-  int offsetC = 0;
-
-  for(int i = 0; i < steps; ++i) {
+  int batchC = std::max(batchA, batchB);
+  for(int i = 0; i < batchC; ++i) {
     sgemm(transA,
           transB,
           m,
           n,
           k,
           alpha,
-          A->data() + offsetA,
+          A->data() + (i % batchA) * strideA,
           lda,
-          B->data() + offsetB,
+          B->data() + (i % batchB) * strideB,
           ldb,
           beta,
-          C->data() + offsetC,
+          C->data() + i * strideC,
           ldc);
-
-    offsetA += strideA;
-    offsetB += strideB;
-    offsetC += strideC;
   }
 #else
   ABORT("Not implemented!");
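The rewritten loop indexes operands with `(i % batch) * stride`, so an operand with batch size 1 is transparently broadcast against a larger one (i % 1 is always 0). A tiny standalone check of the indexing:

```cpp
// Check of the (i % batch) indexing above: batchA == 1 is broadcast against
// batchB == 3 because i % 1 always selects the single A matrix.
#include <cstdio>

int main() {
  int batchA = 1, batchB = 3;
  int batchC = batchB > batchA ? batchB : batchA;
  for(int i = 0; i < batchC; ++i)
    std::printf("C[%d] = A[%d] * B[%d]\n", i, i % batchA, i % batchB);
  // C[0] = A[0]*B[0], C[1] = A[0]*B[1], C[2] = A[0]*B[2]
  return 0;
}
```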
@ -50,12 +50,13 @@ inline void gInsertCols(float* out,
size_t cols_out,
size_t cols_in,
size_t offset_out,
size_t offset_in) {
size_t offset_in,
float beta) {
for(int j = 0; j < rows; ++j) {
float* rowOut = out + j * cols_out + offset_out;
const float* rowIn = in + j * cols_in + offset_in;
for(int i = 0; i < cols; ++i) {
rowOut[i] = rowIn[i];
rowOut[i] = rowIn[i] + beta * rowOut[i];
}
}
}
@ -71,7 +72,7 @@ void Concatenate1(Tensor out, const std::vector<Tensor>& inputs) {
"First dimension must be equal");
int cols_in = in->shape().back();
cpu::gInsertCols(
out->data(), in->data(), rows, cols_in, cols_out, cols_in, offset, 0);
out->data(), in->data(), rows, cols_in, cols_out, cols_in, offset, 0, 0);
offset += cols_in;
}
}
@ -91,8 +92,11 @@ void Split1(std::vector<Tensor>& outputs, const Tensor in) {
ABORT_IF(rows != out->shape().elements() / out->shape().back(),
"First dimension must be equal");
int cols_out = out->shape().back();

// set last parameter to 1 to enable += instead of =
// @TODO: do this in a more principled way across all/most kernels
cpu::gInsertCols(
out->data(), in->data(), rows, cols_out, cols_out, cols_in, 0, offset);
out->data(), in->data(), rows, cols_out, cols_out, cols_in, 0, offset, 1);
offset += cols_out;
}
}
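The new beta parameter turns the same kernel into either an overwrite (beta = 0, used by Concatenate1 in the forward pass) or an accumulation (beta = 1, used by Split1 in the backward pass). In isolation the update rule is just:

  // rowOut[i] = rowIn[i] + beta * rowOut[i]
  //   beta == 0.f : plain copy, previous contents discarded
  //   beta == 1.f : gradient accumulation, previous contents kept
  void insertCols(float* out, const float* in, int cols, float beta) {
    for(int i = 0; i < cols; ++i)
      out[i] = in[i] + beta * out[i];
  }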
@ -108,9 +112,17 @@ void SplitCont(std::vector<Tensor>& outputs, const Tensor in, int axis) {
size_t size = out->shape().elements() / step;
size_t offset2 = i * size;

std::copy(in->data() + offset1,
in->data() + offset1 + size,
out->data() + offset2);
// BUG: This overwrites gradients!
//std::copy(in->data() + offset1,
//          in->data() + offset1 + size,
//          out->data() + offset2);

// Fixes gradient problem, @TODO: check performance
std::transform(in->data() + offset1,
in->data() + offset1 + size,
out->data() + offset2,
out->data() + offset2,
[](float a, float b){ return a + b; });

offset1 += size;
}
@ -124,6 +136,7 @@ void Deconcatenate(std::vector<Tensor>& outputs, const Tensor in, int ax) {
SplitCont(outputs, in, ax);
}

template <bool add>
void Transpose0213(Tensor out, Tensor in) {
int cols = in->shape()[-1];
int rows = in->shape().elements() / in->shape()[-1];
@ -141,7 +154,15 @@ void Transpose0213(Tensor out, Tensor in) {
const float* inRow = in->data() + src * cols;
float* outRow = out->data() + dst * cols;

std::copy(inRow, inRow + cols, outRow);
if(!add) {
// mostly for fast forward computation
std::copy(inRow, inRow + cols, outRow);
}
else {
for(int i = 0; i < cols; ++i) {
outRow[i] += inRow[i];
}
}
}
}
}
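The std::copy-to-std::transform change matters because a node's gradient buffer can receive contributions from several backward paths; a plain copy silently drops whatever an earlier path already wrote. A tiny illustration:

  #include <algorithm>
  #include <cassert>

  int main() {
    float grad[2] = {1.f, 1.f};      // contribution from an earlier path
    float incoming[2] = {2.f, 2.f};  // contribution arriving now

    // wrong: overwrites, grad becomes {2, 2}, the first contribution is lost
    // std::copy(incoming, incoming + 2, grad);

    // right: accumulates, grad becomes {3, 3}
    std::transform(incoming, incoming + 2, grad, grad,
                   [](float a, float b) { return a + b; });
    assert(grad[0] == 3.f && grad[1] == 3.f);
  }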
@ -186,6 +207,7 @@ void Transpose10(Tensor out, const Tensor in) {
}

// @TODO: optimize this, currently it's quite horrible
template <bool add>
void TransposeGeneric(Tensor out, Tensor in, const std::vector<int>& vAxis) {
functional::Array<int, functional::Shape::size()> permute;
int diff = functional::Shape::size() - vAxis.size();
@ -207,19 +229,29 @@ void TransposeGeneric(Tensor out, Tensor in, const std::vector<int>& vAxis) {
gOut.shape().dims(index, oDims);
for(int i = 0; i < N; ++i)
pDims[permute[i]] = oDims[i];
gOut[index] = gIn[pDims];
if(add)
gOut[index] += gIn[pDims];
else
gOut[index] = gIn[pDims];
}
}

void TransposeND(Tensor out, Tensor in, const std::vector<int>& vAxis) {
if(vAxis == std::vector<int>({0, 2, 1, 3}))
Transpose0213(out, in);
else if(vAxis == std::vector<int>({1, 0})
&& in->shape()[-1] % 16 == 0
Transpose0213<false>(out, in);
else if(vAxis == std::vector<int>({1, 0})
&& in->shape()[-1] % 16 == 0
&& in->shape()[-2] % 16 == 0)
Transpose10(out, in);
else
TransposeGeneric(out, in, vAxis);
TransposeGeneric<false>(out, in, vAxis);
}

void TransposeNDGrad(Tensor out, Tensor in, const std::vector<int>& vAxis) {
if(vAxis == std::vector<int>({0, 2, 1, 3}))
Transpose0213<true>(out, in);
else
TransposeGeneric<true>(out, in, vAxis);
}
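Threading the add flag through as a template parameter rather than a runtime argument means each instantiation compiles to a branch-free inner loop: forward calls get the assigning version, gradient calls get the accumulating one. The dispatch pattern in miniature, not Marian's actual kernel:

  template <bool add>
  void transpose10(float* out, const float* in, int rows, int cols) {
    for(int j = 0; j < rows; ++j)
      for(int i = 0; i < cols; ++i) {
        // resolved at compile time; no per-element branch in the emitted code
        if(add)
          out[i * rows + j] += in[j * cols + i];
        else
          out[i * rows + j] = in[j * cols + i];
      }
  }

  // forward pass: transpose10<false>(...);   overwrite
  // backward pass: transpose10<true>(...);   accumulate into the gradient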
void Softmax(Tensor out_, Tensor in_, Tensor mask_) {
@ -412,9 +444,8 @@ void PasteCols(Tensor out_,
const float* rowIn = in + j * colsIn;
float* rowOut = out + j * colsOut;

// @TODO: should this be a sum?
for(int i = 0; i < colsIn; ++i) {
rowOut[indices[i]] = rowIn[i];
rowOut[indices[i]] += rowIn[i];
}
}
}
@ -458,7 +489,6 @@ void GRUFastForward(Tensor out_, std::vector<Tensor> inputs, bool final) {

#pragma omp simd
for(int i = 0; i < cols; ++i) {
// @TODO: stable sigmoid
float r = stableSigmoid(xWrow[i] + sUrow[i] + b[i]);

int k = i + cols;
@ -901,6 +931,26 @@ void Shift(Tensor out_, Tensor in_, marian::Shape shift, float padValue, bool in
}
}

void ShiftGrad(Tensor out_, Tensor in_, marian::Shape shift, bool invert) {
int offset = 0;
for(int i = 0; i < shift.size(); ++i)
offset += in_->shape().stride(i) * shift[i];

if(invert)
offset = -offset;

float* out = out_->data();
const float* in = in_->data();

int length = out_->shape().elements();
#pragma omp parallel for
for(int i = 0; i < length; ++i) {
if(i - offset >= 0 && i - offset < length) {
out[i] += in[i - offset];
}
}
}

void SetSparse(float* out,
const std::vector<size_t>& indices,
const std::vector<float>& values) {
@ -160,7 +160,6 @@ void Add(Functor functor, float scale, marian::Tensor out, Tensors... tensors) {
bool broadcast = false;
for(int i = 0; i < K; ++i)
broadcast = broadcast || gOut.shape() != gIns[i].shape();

gAddEqual<<<blocks, threads>>>(functor, gOut, gIns, scale, broadcast);
} else {
int threads = std::min(MAX_THREADS, length);

@ -41,3 +41,4 @@ template void Element<Assign<Var<1>, BinaryFunctor<elem::Clip, Assignee<2>, Capt
template void Element<Assign<Var<1>, BinaryFunctor<elem::LogAddExp, Assignee<2>, Assignee<3>>>, marian::Tensor, marian::Tensor>(Assign<Var<1>, BinaryFunctor<elem::LogAddExp, Assignee<2>, Assignee<3>>>, marian::Tensor, marian::Tensor, marian::Tensor);
template void Element<Assign<Var<1>, BinaryFunctor<elem::Maximum, Assignee<2>, Assignee<3>>>, marian::Tensor, marian::Tensor>(Assign<Var<1>, BinaryFunctor<elem::Maximum, Assignee<2>, Assignee<3>>>, marian::Tensor, marian::Tensor, marian::Tensor);
template void Element<Assign<Var<1>, BinaryFunctor<elem::Minimum, Assignee<2>, Assignee<3>>>, marian::Tensor, marian::Tensor>(Assign<Var<1>, BinaryFunctor<elem::Minimum, Assignee<2>, Assignee<3>>>, marian::Tensor, marian::Tensor, marian::Tensor);
template void Element<Assign<Var<1>, BinaryFunctor<elem::Div, Assignee<1>, Capture>>>(Assign<Var<1>, BinaryFunctor<elem::Div, Assignee<1>, Capture>>, marian::Tensor);
@ -67,6 +67,30 @@ void Prod(marian::Tensor C,
#endif
}

__global__ void gAddBias(float* out, const float* bias, size_t length, size_t cols) {
for(int bid = 0; bid < length; bid += blockDim.x * gridDim.x) {
int index = bid + blockDim.x * blockIdx.x + threadIdx.x;
if(index < length) {
size_t index2 = index % cols;
out[index] += bias[index2];
}
}
}

void AddBias(marian::Tensor C, const marian::Tensor bias) {
cudaSetDevice(C->getDevice().no);

int length = C->shape().elements();
int cols = bias->shape().elements();

int threads = std::min(MAX_THREADS, length);
int blocks = std::min(MAX_BLOCKS, length / threads + (length % threads != 0));

gAddBias<<<blocks, threads>>>(C->data(), bias->data(), length, cols);

cudaStreamSynchronize(0);
}

void ProdWithBias(marian::Tensor C,
const marian::Tensor& A,
const marian::Tensor& B,
@ -76,13 +100,13 @@ void ProdWithBias(marian::Tensor C,
float beta,
float scalar) {
marian::gpu::Prod(C, A, B, transA, transB, beta, scalar);
marian::gpu::Add(functional::_1, 1.f, C, bias);
marian::gpu::AddBias(C, bias);
}

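gAddBias broadcasts the length-cols bias over every row of C by reducing the flat element index modulo cols; replacing the generic Add functor call in ProdWithBias with this dedicated kernel is part of the bias fast path added in this commit. Its host-side semantics, written out as a plain C++ reference:

  // reference for gAddBias: C is treated as (length / cols) rows of width
  // cols, and bias (of size cols) is added to every row
  void addBiasRef(float* C, const float* bias, size_t length, size_t cols) {
    for(size_t index = 0; index < length; ++index)
      C[index] += bias[index % cols];
  }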
void ProdBatched(marian::Tensor C,
const marian::Tensor& A,
const marian::Tensor& B,
Ptr<Allocator> allocator,
const marian::Tensor A,
const marian::Tensor B,
bool transA,
bool transB,
float beta,
@ -116,30 +140,57 @@ void ProdBatched(marian::Tensor C,
auto cublasHandle = std::static_pointer_cast<gpu::Backend>(C->getBackend())
->getCublasHandle();

int strideA = batchA == 1 ? 0 : m * k;
int strideB = batchB == 1 ? 0 : n * k;
int strideC = n * m;
int batchC = std::max(batchA, batchB);

std::vector<const float*> aptr;
std::vector<const float*> bptr;
std::vector<float*> cptr;

for(int i = 0; i < batchC; i++) {
aptr.push_back(A->data() + (i % batchA) * strideA);
bptr.push_back(B->data() + (i % batchB) * strideB);
cptr.push_back(C->data() + i * strideC);
}

auto mp_aptr = allocator->alloc<const float*>(aptr.size());
CudaCopy(aptr.data(), aptr.data() + aptr.size(), mp_aptr->data<const float*>());

auto mp_bptr = allocator->alloc<const float*>(bptr.size());
CudaCopy(bptr.data(), bptr.data() + bptr.size(), mp_bptr->data<const float*>());

auto mp_cptr = allocator->alloc<float*>(cptr.size());
CudaCopy(cptr.data(), cptr.data() + cptr.size(), mp_cptr->data<float*>());

#if CUDA_VERSION >= 9000
cublasSetMathMode(cublasHandle, CUBLAS_TENSOR_OP_MATH);
#endif
cublasSgemmStridedBatched(cublasHandle,
opB,
opA,
n,
m,
k,
&alpha,
B->data(),
ldb,
batchB == 1 ? 0 : n * k,
A->data(),
lda,
batchA == 1 ? 0 : m * k,
&beta,
C->data(),
ldc,
n * m,
std::max(batchA, batchB));
cublasSgemmBatched(cublasHandle,
opB,
opA,
n,
m,
k,
&alpha,
mp_bptr->data<const float*>(),
ldb,
mp_aptr->data<const float*>(),
lda,
&beta,
mp_cptr->data<float*>(),
ldc,
batchC);
#if CUDA_VERSION >= 9000
cublasSetMathMode(cublasHandle, CUBLAS_DEFAULT_MATH);
#endif
}

allocator->free(mp_aptr);
allocator->free(mp_bptr);
allocator->free(mp_cptr);
}

}
}
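Unlike cublasSgemmStridedBatched, cublasSgemmBatched takes arrays of per-matrix pointers, and those pointer arrays must themselves live in device memory, which is why the hunk above builds host-side vectors and copies them through the allocator before the call. A minimal sketch of that staging step with raw CUDA calls, error handling omitted:

  #include <cuda_runtime.h>
  #include <vector>

  // stage a host-built pointer array into device memory for cublasSgemmBatched
  const float** stagePointers(const std::vector<const float*>& hostPtrs) {
    const float** devPtrs = nullptr;
    cudaMalloc(&devPtrs, hostPtrs.size() * sizeof(const float*));
    cudaMemcpy(devPtrs, hostPtrs.data(),
               hostPtrs.size() * sizeof(const float*),
               cudaMemcpyHostToDevice);
    return devPtrs;  // caller frees with cudaFree once the gemm has completed
  }

Using the graph's allocator instead of cudaMalloc, as the diff does, avoids a synchronizing allocation on every call; the broadcasting itself still comes from the (i % batchA) indexing when the pointer lists are built.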
@ -26,8 +26,9 @@ void ProdWithBias(marian::Tensor C,
float scalar = 1);

void ProdBatched(marian::Tensor C,
const marian::Tensor& A,
const marian::Tensor& B,
Ptr<Allocator> allocator,
const marian::Tensor A,
const marian::Tensor B,
bool transA,
bool transB,
float beta = 0,
@ -38,6 +38,8 @@ bool IsNan(Tensor in) {
}

void ConcatCont(Tensor out, const std::vector<Tensor>& inputs, int axis) {

cudaSetDevice(out->getDevice().no);
int step = 1;
for(int i = 0; i < axis; ++i)
@ -49,7 +51,7 @@ void ConcatCont(Tensor out, const std::vector<Tensor>& inputs, int axis) {
size_t size = in->shape().elements() / step;
size_t offset2 = i * size;

cudaMemcpyAsync(out->data() + offset1,
cudaMemcpy(out->data() + offset1,
in->data() + offset2,
size * sizeof(float),
cudaMemcpyDeviceToDevice);
@ -60,14 +62,15 @@ void ConcatCont(Tensor out, const std::vector<Tensor>& inputs, int axis) {
cudaStreamSynchronize(0);
}

template <bool add>
__global__ void gInsertCols(float* out,
const float* in,
size_t rows,
size_t cols,
size_t cols_out,
size_t cols_in,
size_t offset_out,
size_t offset_in) {
for(int bid = 0; bid < rows; bid += gridDim.x) {
int j = bid + blockIdx.x;
if(j < rows) {
@ -77,7 +80,10 @@ __global__ void gInsertCols(float* out,
for(int tid = 0; tid < cols; tid += blockDim.x) {
int i = tid + threadIdx.x;
if(i < cols)
rowOut[i] = rowIn[i];
if(add)
rowOut[i] += rowIn[i];
else
rowOut[i] = rowIn[i];
}
}
}
@ -99,16 +105,81 @@ void Concatenate1(Tensor out, const std::vector<Tensor>& inputs) {
int blocks = std::min(MAX_BLOCKS, rows);
int threads = std::min(MAX_THREADS, cols_in);

gInsertCols<<<blocks, threads>>>(
gInsertCols<false><<<blocks, threads>>>(
out->data(), in->data(), rows, cols_in, cols_out, cols_in, offset, 0);
offset += cols_in;
}
cudaStreamSynchronize(0);
}

__global__ void gJoin2(float* out, size_t rowBatch, size_t cols,
const float* in1, size_t inStride1,
const float* in2, size_t inStride2) {

int outStride = inStride1 + inStride2;
int rows = rowBatch * outStride;

for(int bid = 0; bid < rows; bid += gridDim.x) {
int j = bid + blockIdx.x;
if(j < rows) {

float* rowOut = out + j * cols;

int curBatch = j / outStride;
int curPos = j % outStride;

int jIn1 = (curBatch * inStride1) + curPos;
int jIn2 = (curBatch * inStride2) + curPos - inStride1;

const float* rowIn1 = in1 + jIn1 * cols;
const float* rowIn2 = in2 + jIn2 * cols;

for(int tid = 0; tid < cols; tid += blockDim.x) {
int i = tid + threadIdx.x;
if(i < cols) {
if(curPos < inStride1)
rowOut[i] = rowIn1[i];
else
rowOut[i] = rowIn2[i];
}
}

}
}
}

void Concatenate2(Tensor out, Tensor in1, Tensor in2) {
cudaSetDevice(out->getDevice().no);

size_t rows = out->shape().elements() / out->shape().back();
size_t cols = out->shape().back();

size_t rowStride1 = in1->shape()[-2];
size_t rowStride2 = in2->shape()[-2];

size_t rowBatch = rows / out->shape()[-2];

int blocks = std::min(MAX_BLOCKS, (int)rows);
int threads = std::min(MAX_THREADS, (int)cols);

gJoin2<<<blocks, threads>>>(out->data(),
rowBatch,
cols,
in1->data(),
rowStride1,
in2->data(),
rowStride2);

cudaStreamSynchronize(0);
}

void Concatenate(Tensor out, const std::vector<Tensor>& inputs, int ax) {
if(ax == out->shape().size() - 1)
Concatenate1(out, inputs);
else if(ax == out->shape().size() - 2 && inputs.size() == 2)
Concatenate2(out, inputs[0], inputs[1]);
else
ConcatCont(out, inputs, ax);
}
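The index arithmetic in gJoin2 interleaves the two inputs block-wise along dimension -2: each output group of outStride rows takes its first inStride1 rows from in1 and the remaining inStride2 rows from in2. A small host-side check of that mapping:

  #include <cstdio>

  int main() {
    int inStride1 = 2, inStride2 = 3, rowBatch = 2;
    int outStride = inStride1 + inStride2;
    for(int j = 0; j < rowBatch * outStride; ++j) {
      int curBatch = j / outStride;   // which group of rows we are in
      int curPos = j % outStride;     // position within the group
      if(curPos < inStride1)
        printf("out row %d <- in1 row %d\n", j, curBatch * inStride1 + curPos);
      else
        printf("out row %d <- in2 row %d\n",
               j, curBatch * inStride2 + curPos - inStride1);
    }
  }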
@ -127,13 +198,24 @@ void Split1(std::vector<Tensor>& outputs, const Tensor in) {
int blocks = std::min(MAX_BLOCKS, rows);
int threads = std::min(MAX_THREADS, cols_out);

gInsertCols<<<blocks, threads>>>(
gInsertCols<true><<<blocks, threads>>>(
out->data(), in->data(), rows, cols_out, cols_out, cols_in, 0, offset);
offset += cols_out;
}
cudaStreamSynchronize(0);
}

// @TODO: this function is just a temporary fix until I come up with
// something better for the situation below.
__global__ void gAddRow(float* out, const float* in, int length) {
for(int bid = 0; bid < length; bid += blockDim.x * gridDim.x) {
int index = bid + blockDim.x * blockIdx.x + threadIdx.x;
if(index < length) {
out[index] = in[index] + out[index];
}
}
}

void SplitCont(std::vector<Tensor>& outputs, const Tensor in, int axis) {
cudaSetDevice(in->getDevice().no);

@ -141,17 +223,25 @@ void SplitCont(std::vector<Tensor>& outputs, const Tensor in, int axis) {
for(int i = 0; i < axis; ++i)
step *= in->shape()[i];

size_t offset1 = 0;
int offset1 = 0;
for(int i = 0; i < step; ++i) {
for(auto out : outputs) {
size_t size = out->shape().elements() / step;
size_t offset2 = i * size;
int size = out->shape().elements() / step;
int offset2 = i * size;

cudaMemcpyAsync(out->data() + offset2,
in->data() + offset1,
size * sizeof(float),
cudaMemcpyDeviceToDevice);
// BUG: this does not add gradients
//cudaMemcpyAsync(out->data() + offset2,
//                in->data() + offset1,
//                size * sizeof(float),
//                cudaMemcpyDeviceToDevice);

// @TODO: this is a quick but bad fix for the above bug
int threads = std::min(MAX_THREADS, size);
int blocks = std::min(MAX_BLOCKS, size / threads + (size % threads != 0));

gAddRow<<<blocks, threads>>>(out->data() + offset2,
in->data() + offset1,
size);
offset1 += size;
}
}
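gAddRow and its neighbors follow the usual grid-stride idiom: since blocks times threads is capped by MAX_BLOCKS and MAX_THREADS and may be smaller than length, every thread strides over the array so that all elements are covered regardless of the launch configuration. The pattern in isolation:

  __global__ void gridStrideAdd(float* out, const float* in, int length) {
    // each thread handles indices id, id + stride, id + 2*stride, ...
    int stride = blockDim.x * gridDim.x;
    for(int index = blockDim.x * blockIdx.x + threadIdx.x; index < length;
        index += stride)
      out[index] += in[index];
  }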
@ -165,6 +255,7 @@ void Deconcatenate(std::vector<Tensor>& outputs, const Tensor in, int ax) {
SplitCont(outputs, in, ax);
}

template <bool add>
__global__ void gTransposeND(
functional::Tensor<float> out,
const functional::Tensor<float> in,
@ -180,27 +271,114 @@ __global__ void gTransposeND(
out.shape().dims(index, oDims);
for(int i = 0; i < N; ++i)
pDims[permute[i]] = oDims[i];
out[index] = in[pDims];
if(add)
out[index] += in[pDims];
else
out[index] = in[pDims];
}
}
}

template <bool add>
__global__ void gTranspose0213(float* out, const float* in,
int rows,
int cols,
int stride1,
int stride2) {

int stride = stride1 * stride2;
for(int bid = 0; bid < rows; bid += gridDim.x) {
int j = bid + blockIdx.x;
if(j < rows) {
float* rowOut = out + j * cols;

int z = j / stride;
int y = (j % stride) / stride1;
int x = (j % stride) % stride1;
int j2 = z * stride + x * stride2 + y;

const float* rowIn = in + j2 * cols;

for(int tid = 0; tid < cols; tid += blockDim.x) {
int i = tid + threadIdx.x;
if(i < cols) {
if(add)
rowOut[i] += rowIn[i];
else
rowOut[i] = rowIn[i];
}
}
}
}

}

void TransposeND(Tensor out, Tensor in, const std::vector<int>& vAxis) {
cudaSetDevice(out->getDevice().no);
if(vAxis == std::vector<int>({0, 2, 1, 3})) {

functional::Array<int, functional::Shape::size()> axes;
int diff = functional::Shape::size() - vAxis.size();
for(int i = 0; i < axes.size(); ++i)
if(i < diff)
axes[i] = i;
else
axes[i] = vAxis[i - diff] + diff;
int rows = out->shape().elements() / out->shape().back();
int cols = out->shape().back();

int length = out->shape().elements();
int threads = std::min(MAX_THREADS, length);
int blocks = std::min(MAX_BLOCKS, length / threads + (length % threads != 0));
int blocks = std::min(MAX_BLOCKS, rows);
int threads = std::min(MAX_THREADS, cols);

gTransposeND<<<blocks, threads>>>(out, in, axes);
int stride1 = out->shape()[-2];
int stride2 = out->shape()[-3];

gTranspose0213<false><<<blocks, threads>>>(out->data(), in->data(),
rows, cols, stride1, stride2);
}
else {

functional::Array<int, functional::Shape::size()> axes;
int diff = functional::Shape::size() - vAxis.size();
for(int i = 0; i < axes.size(); ++i)
if(i < diff)
axes[i] = i;
else
axes[i] = vAxis[i - diff] + diff;

int length = out->shape().elements();
int threads = std::min(MAX_THREADS, length);
int blocks = std::min(MAX_BLOCKS, length / threads + (length % threads != 0));

gTransposeND<false><<<blocks, threads>>>(out, in, axes);
}
}

void TransposeNDGrad(Tensor out, Tensor in, const std::vector<int>& vAxis) {
cudaSetDevice(out->getDevice().no);
if(vAxis == std::vector<int>({0, 2, 1, 3})) {

int rows = out->shape().elements() / out->shape().back();
int cols = out->shape().back();

int blocks = std::min(MAX_BLOCKS, rows);
int threads = std::min(MAX_THREADS, cols);

int stride1 = out->shape()[-2];
int stride2 = out->shape()[-3];

gTranspose0213<true><<<blocks, threads>>>(out->data(), in->data(),
rows, cols, stride1, stride2);
}
else {

functional::Array<int, functional::Shape::size()> axes;
int diff = functional::Shape::size() - vAxis.size();
for(int i = 0; i < axes.size(); ++i)
if(i < diff)
axes[i] = i;
else
axes[i] = vAxis[i - diff] + diff;

int length = out->shape().elements();
int threads = std::min(MAX_THREADS, length);
int blocks = std::min(MAX_BLOCKS, length / threads + (length % threads != 0));

gTransposeND<true><<<blocks, threads>>>(out, in, axes);
}
}

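The specialized 0213 kernel avoids the generic N-dimensional index machinery by decomposing the row number j into (z, y, x) coordinates and swapping the two middle axes arithmetically. A host loop that checks the permutation for a small shape:

  #include <cstdio>

  int main() {
    int stride1 = 2, stride2 = 3;              // sizes of the two swapped axes
    int stride = stride1 * stride2;
    for(int j = 0; j < 2 * stride; ++j) {      // leading axis of size 2
      int z = j / stride;
      int y = (j % stride) / stride1;
      int x = (j % stride) % stride1;
      int j2 = z * stride + x * stride2 + y;   // source row for output row j
      printf("out row %d <- in row %d\n", j, j2);
    }
  }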
__global__ void gSoftmax(float* out,
@ -697,7 +875,7 @@ __global__ void gPasteCols(float* out,
for(int tid = 0; tid < colsIn; tid += blockDim.x) {
int i = tid + threadIdx.x;
if(i < colsIn)
rowOut[targetColIdx[i]] = rowIn[i];
rowOut[targetColIdx[i]] += rowIn[i];
}
}
}
@ -764,7 +942,7 @@ __global__ void gInsert(float* out,
inShape.dims(index, dims);
dims[axis] = d_indices[dims[index]];
int outIndex = outShape.index(dims);
out[outIndex] = in[index];
out[outIndex] += in[index];
}
}
}
@ -1558,14 +1736,21 @@ void LayerNormalizationGrad(Tensor gradX,
eps);
}

template <bool add>
__global__ void gShift(float* out, const float* in, int length, int offset, float padValue) {
for(int bid = 0; bid < length; bid += blockDim.x * gridDim.x) {
int index = bid + blockDim.x * blockIdx.x + threadIdx.x;
if(index < length) {
if(index - offset < 0 || index - offset >= length)
out[index] = padValue;
else
out[index] = in[index - offset];
if(add) {
if(index - offset >= 0 && index - offset < length)
out[index] += in[index - offset];
}
else {
if(index - offset < 0 || index - offset >= length)
out[index] = padValue;
else
out[index] = in[index - offset];
}
}
}
}
@ -1588,7 +1773,28 @@ void Shift(Tensor out, Tensor in, marian::Shape shift, float padValue, bool inve
int threads = std::min(MAX_THREADS, length);
int blocks = std::min(MAX_BLOCKS, length / threads + (length % threads != 0));

gShift<<<blocks, threads>>>(out->data(), in->data(), length, offset, padValue);
gShift<false><<<blocks, threads>>>(out->data(), in->data(), length, offset, padValue);
}

void ShiftGrad(Tensor out, Tensor in, marian::Shape shift, bool invert) {
ABORT_IF(in->shape().size() != shift.size(), "bad dimensions");

// BUGBUG: This can only shift along the first axis. Shifting, e.g., along the last axis cannot be implemented this way.
int offset = 0;
for(int i = 0; i < shift.size(); ++i)
offset += in->shape().stride(i) * shift[i];

if(invert)
offset = -offset;

cudaSetDevice(out->getDevice().no);

int length = out->shape().elements();

int threads = std::min(MAX_THREADS, length);
int blocks = std::min(MAX_BLOCKS, length / threads + (length % threads != 0));

gShift<true><<<blocks, threads>>>(out->data(), in->data(), length, offset, 0.f);
}

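ShiftGrad mirrors Shift: the gradient of a shift by +offset is a shift by -offset, accumulated into the existing gradient, and the invert flag simply negates the offset at the call site. gShift<true> implements that accumulation with the padding branch dropped, since out-of-range positions contribute nothing. A scalar CPU illustration, assuming a 1-D shift:

  #include <cassert>

  int main() {
    const int n = 4, offset = 1;               // forward shifts right by one
    float x[n] = {1, 2, 3, 4}, y[n];
    for(int i = 0; i < n; ++i)                 // y[i] = x[i - offset], 0-padded
      y[i] = (i - offset >= 0) ? x[i - offset] : 0.f;

    float dy[n] = {1, 2, 3, 4}, dx[n] = {0, 0, 0, 0};
    int grad_offset = -offset;                 // adjoint: shift the other way
    for(int i = 0; i < n; ++i)
      if(i - grad_offset >= 0 && i - grad_offset < n)
        dx[i] += dy[i - grad_offset];          // i.e. dx[i] += dy[i + offset]
    assert(dx[0] == 2.f && dx[3] == 0.f);
    (void)y;
  }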
__global__ void gSetSparse(float* out,

@ -64,7 +64,8 @@ void Reduce(Functor functor, marian::Tensor out, Tensors... tensors) {
// clang-format off
DISPATCH7(Prod, marian::Tensor, const marian::Tensor&, const marian::Tensor&, bool, bool, float, float)
DISPATCH8(ProdWithBias, marian::Tensor, const marian::Tensor&, const marian::Tensor&, const marian::Tensor&, bool, bool, float, float)
DISPATCH7(ProdBatched, marian::Tensor, const marian::Tensor&, const marian::Tensor&, bool, bool, float, float)

DISPATCH8(ProdBatched, marian::Tensor, Ptr<Allocator>, const marian::Tensor, const marian::Tensor, bool, bool, float, float)

DISPATCH2(Dropout, marian::Tensor, float)

@ -78,7 +79,10 @@ void Reduce(Functor functor, marian::Tensor out, Tensors... tensors) {
DISPATCH4(CrossEntropyPickBackward, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor)

DISPATCH3(TransposeND, marian::Tensor, marian::Tensor, const std::vector<int>&)
DISPATCH3(TransposeNDGrad, marian::Tensor, marian::Tensor, const std::vector<int>&)

DISPATCH5(Shift, marian::Tensor, marian::Tensor, marian::Shape, float, bool)
DISPATCH4(ShiftGrad, marian::Tensor, marian::Tensor, marian::Shape, bool)

DISPATCH3(Concatenate, marian::Tensor, const std::vector<marian::Tensor>&, int)
// clang-format on
13
src/training/communicator.cpp
Normal file
@ -0,0 +1,13 @@
#include "training/communicator.h"

namespace marian {

// Compiled only when CUDA is not available.
// The version with CUDA and/or NCCL support is compiled in communicator.cu.
#ifndef CUDA_FOUND
Ptr<Communicator> createCommunicator(const std::vector<Ptr<ExpressionGraph>>& graphs, bool noNccl) {
  return New<DefaultCommunicator>(graphs);
}
#endif

}
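This CPU-only translation unit keeps linking sane when CUDA is off; otherwise the NCCL-backed factory in communicator.cu (next file) takes over. Whichever implementation is created, a synchronous update step drives it through the same cycle. A hedged sketch using only the interface shown in these files, where graphs_ (one graph per device) is assumed from the surrounding graph group:

  // one synchronous SGD step, schematically
  auto comm = createCommunicator(graphs_, /*noNccl=*/false);

  // forward/backward on each device has filled params()->grads()
  comm->scatterReduce();   // sum gradient shard i onto device i
  // ... each device now updates its own parameter shard with its optimizer ...
  comm->allGather();       // broadcast updated shards back to all devices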
239
src/training/communicator.cu
Normal file
@ -0,0 +1,239 @@
#include "training/communicator.h"
#include "functional/functional.h"
#include "tensors/tensor_operators.h"

#ifdef USE_NCCL
#include "cuda_runtime.h"
#include "nccl.h"
#endif

namespace marian {

#ifdef USE_NCCL
class NCCLCommunicator : public Communicator {
private:
std::vector<ncclComm_t> comms_;
std::vector<cudaStream_t> streams_;
std::vector<int> devices_;

void synchronizeAll() {
for(int i = 0; i < graphs_.size(); ++i) {
cudaSetDevice(devices_[i]);
cudaStreamSynchronize(streams_[i]);
}
}

public:
NCCLCommunicator(const std::vector<Ptr<ExpressionGraph>>& graphs)
: Communicator(graphs),
comms_(graphs.size()),
streams_(graphs.size()),
devices_(graphs.size())
{
LOG(info, "[comm] Using NCCL library for GPU communication");

for(int i = 0; i < graphs_.size(); ++i) {
auto device = graphs_[i]->getBackend()->getDevice();

ABORT_IF(device.type != DeviceType::gpu,
"NCCL communicator can only be used with GPUs");

devices_[i] = device.no;
cudaSetDevice(devices_[i]);
cudaStreamCreate(&streams_[i]);
}

ncclCommInitAll(comms_.data(), devices_.size(), devices_.data());
}

~NCCLCommunicator() override {
for(int i = 0; i < devices_.size(); ++i) {
cudaSetDevice(devices_[i]);
cudaStreamDestroy(streams_[i]);
ncclCommDestroy(comms_[i]);
}
}

void scatterReduce() override {
int totalSize = graphs_[0]->params()->vals()->size();
int shardSize = ceil(totalSize / (float)graphs_.size());

int pos = 0;

ncclGroupStart();
for(int i = 0; i < graphs_.size(); ++i) {
int size = std::min(shardSize, totalSize);

const void* sendbuff = (const void*)graphs_[i]->params()->grads()->data();
auto subgrad = graphs_[i]->params()->grads()->subtensor(pos, size);
void* recvbuff = subgrad->data();

ncclReduceScatter(sendbuff,
recvbuff,
shardSize,
ncclFloat,
ncclSum,
comms_[i],
streams_[i]);

pos += size;
totalSize -= size;
}
ncclGroupEnd();

synchronizeAll();
}

void allGather() override {
int totalSize = graphs_[0]->params()->vals()->size();
int shardSize = ceil(totalSize / (float)graphs_.size());

int pos = 0;

ncclGroupStart();
for(int i = 0; i < graphs_.size(); ++i) {
int size = std::min(shardSize, totalSize);

auto subparam = graphs_[i]->params()->vals()->subtensor(pos, size);
const void* sendbuff = (const void*)subparam->data();
void* recvbuff = (void*)graphs_[i]->params()->vals()->data();

ncclAllGather(sendbuff,
recvbuff,
shardSize,
ncclFloat,
comms_[i],
streams_[i]);

pos += size;
totalSize -= size;
}
ncclGroupEnd();

synchronizeAll();
}

void swapParams(const std::vector<Tensor>& params) override {
// Update all graphs with parameter shard
ABORT_IF(graphs_.size() < 2, "Swap requires at least two graphs");

auto gather = [this, params](size_t idx, int pos) {
// copy parameter shard to each graph, apart from last graph
for(int i = 0; i < graphs_.size() - 1; ++i) {
auto subParam = graphs_[i]->params()->vals()->subtensor(pos, params[idx]->size());
subParam->copyFrom(params[idx]);
}

// back-up shard from last graph
auto subParamLast = graphs_.back()->params()->vals()->subtensor(pos, params[idx]->size());
params[idx]->copyFrom(subParamLast);

auto subParamFirst = graphs_[0]->params()->vals()->subtensor(pos, params[idx]->size());
subParamLast->copyFrom(subParamFirst);
};

// execute for each shard
this->foreach(gather);
}

void pushParams(std::vector<Tensor>& params) override {
// Copy parameter shard from i-th graph to shard params[i].
// Graphs and shards with the same index live on the same device.

auto copy = [this, params](size_t idx, int pos) {
// copy parameter shard to each graph
auto subParam = graphs_[idx]->params()->vals()->subtensor(pos, params[idx]->size());
params[idx]->copyFrom(subParam);
};

this->foreach(copy);
}

void pullParams(const std::vector<Tensor>& params) override {
// Update all graphs with parameter shard

auto gather = [this, params](size_t idx, int pos) {
// copy parameter shard to each graph
for(auto graph : graphs_) {
auto subParam = graph->params()->vals()->subtensor(pos, params[idx]->size());
subParam->copyFrom(params[idx]);
}
};
this->foreach(gather);
}

// Doesn't work yet with NCCL
// void pushParams(std::vector<Tensor>& params) {
//   // Copy parameter shard from i-th graph to shard params[i].
//   // Graphs and shards with the same index live on the same device.

//   int pos = 0;
//   for(int i = 0; i < graphs_.size(); ++i) {
//     auto subParam = graphs_[i]->params()->vals()->subtensor(pos, params[i]->size());
//     ncclGroupStart();
//     ncclBroadcast((const void*)subParam->data(),
//                   (void*)params[i]->data(),
//                   params[i]->size(),
//                   ncclFloat,
//                   0,
//                   comms_[i],
//                   streams_[i]);
//     ncclGroupEnd();
//     pos += params[i]->size();
//   }
//   synchronizeAll();
// }

// void pullParams(const std::vector<Tensor>& params) {
//   // Update all graphs with parameter shard

//   int totalSize = graphs_[0]->params()->vals()->size();
//   int shardSize = ceil(totalSize / (float)graphs_.size());

//   ncclGroupStart();
//   for(int i = 0; i < graphs_.size(); ++i) {

//     const void* sendbuff = (const void*)params[i]->data();
//     void* recvbuff = (void*)graphs_[i]->params()->vals()->data();

//     ncclAllGather(sendbuff,
//                   recvbuff,
//                   shardSize,
//                   ncclFloat,
//                   comms_[i],
//                   streams_[i]);
//   }
//   ncclGroupEnd();

//   synchronizeAll();
// }
};
#endif

Ptr<Communicator> createCommunicator(const std::vector<Ptr<ExpressionGraph>>& graphs, bool noNccl) {
#ifdef USE_NCCL
if(noNccl) {
LOG(warn, "[comm] NCCL communicator overridden");
return New<DefaultCommunicator>(graphs);
}

// if at least one of the devices is not a GPU, fall back to the default communicator
for(auto& graph : graphs) {
if(graph->getBackend()->getDevice().type == DeviceType::cpu) {
return New<DefaultCommunicator>(graphs);
}
}

size_t d = graphs.size();
if((d & (d - 1)) != 0) {
LOG(warn, "[comm] Number of devices {} is not a power of 2 and communication might be slow with NCCL", d);
LOG(warn, "[comm] You can switch off NCCL with --no-nccl option", d);
}

return New<NCCLCommunicator>(graphs);
#else
return New<DefaultCommunicator>(graphs);
#endif
}

}
178
src/training/communicator.h
Normal file
@ -0,0 +1,178 @@
#include "graph/expression_graph.h"
#include "functional/functional.h"
#include "tensors/tensor_operators.h"

namespace marian {

class Communicator {
protected:
const std::vector<Ptr<ExpressionGraph>> graphs_;

public:
Communicator(const std::vector<Ptr<ExpressionGraph>>& graphs)
: graphs_(graphs) {}

virtual ~Communicator() {}

virtual void foreach(const std::function<void(size_t, int)>& func) {
int totalSize = graphs_[0]->params()->vals()->size();
int shardSize = ceil(totalSize / (float)graphs_.size());

int pos = 0;
std::vector<std::thread> group;
// iterate over all shards
for(int idx = 0; idx < graphs_.size(); ++idx) {
int size = std::min(shardSize, totalSize);

group.emplace_back(func, idx, pos);

pos += size;
totalSize -= size;
}
for(auto& t : group)
t.join();
}

virtual void scatterReduce() = 0;
virtual void allGather() = 0;

virtual void pushParams(std::vector<Tensor>& params) = 0;
virtual void pullParams(const std::vector<Tensor>& params) = 0;
virtual void swapParams(const std::vector<Tensor>& params) = 0;
};

class DefaultCommunicator : public Communicator {
private:
std::vector<Ptr<TensorAllocator>> paramsAllocs_;
std::vector<Tensor> tmpTensors_;

void init() {
if(tmpTensors_.size() == 0) {
int totalSize = graphs_[0]->params()->vals()->size();
int shardSize = ceil(totalSize / (float)graphs_.size());

int pos = 0;
for(auto graph : graphs_) {
int __size__ = std::min(shardSize, totalSize);

auto paramsAlloc = New<TensorAllocator>(graph->getBackend());
paramsAllocs_.push_back(paramsAlloc);

paramsAlloc->reserveExact(__size__ * sizeof(float));

Tensor tmp;

paramsAlloc->allocate(tmp, {1, __size__});
tmpTensors_.push_back(tmp);

// move to next shard
pos += __size__;
totalSize -= __size__;
}
}
}

public:
DefaultCommunicator(const std::vector<Ptr<ExpressionGraph>>& graphs)
: Communicator(graphs) {}

~DefaultCommunicator() override {}

void scatterReduce() override {
init();

int totalSize = graphs_[0]->params()->vals()->size();
int shardSize = ceil(totalSize / (float)graphs_.size());

// Gather gradients from different devices into current gradient shards
auto scatter = [this, shardSize](size_t idx, int pos) {
auto curGrad = graphs_[idx]->params()->grads()->subtensor(pos, shardSize);

// collect and sum gradients
// to be replaced with ncclScatterReduce
for(auto graph : graphs_) {
if(graph != graphs_[idx]) {
auto subGrad = graph->params()->grads()->subtensor(pos, shardSize);
tmpTensors_[idx]->copyFrom(subGrad);

using namespace functional;
Element(_1 = _1 + _2, curGrad, tmpTensors_[idx]);
}
}
};

this->foreach(scatter);
}

void allGather() override {
int totalSize = graphs_[0]->params()->vals()->size();
int shardSize = ceil(totalSize / (float)graphs_.size());

// Update all graphs with parameter shard
auto gather = [this, shardSize](size_t idx, int pos) {
auto curParam = graphs_[idx]->params()->vals()->subtensor(pos, shardSize);

// copy parameter shard to each graph
for(auto graph : graphs_) {
if(graph != graphs_[idx]) {
auto subParam = graph->params()->vals()->subtensor(pos, shardSize);
subParam->copyFrom(curParam);
}
}
};

this->foreach(gather);
}

void pushParams(std::vector<Tensor>& params) override {
// Copy parameter shard from i-th graph to shard params[i].
// Graphs and shards with the same index live on the same device.

auto copy = [this, params](size_t idx, int pos) {
// copy parameter shard to each graph
auto subParam = graphs_[idx]->params()->vals()->subtensor(pos, params[idx]->size());
params[idx]->copyFrom(subParam);
};

this->foreach(copy);
}

void pullParams(const std::vector<Tensor>& params) override {
// Update all graphs with parameter shard

auto gather = [this, params](size_t idx, int pos) {
// copy parameter shard to each graph
for(auto graph : graphs_) {
auto subParam = graph->params()->vals()->subtensor(pos, params[idx]->size());
subParam->copyFrom(params[idx]);
}
};
this->foreach(gather);
}

void swapParams(const std::vector<Tensor>& params) override {
// Update all graphs with parameter shard
ABORT_IF(graphs_.size() < 2, "Swap requires at least two graphs");

auto gather = [this, params](size_t idx, int pos) {
// copy parameter shard to each graph, apart from last graph
for(int i = 0; i < graphs_.size() - 1; ++i) {
auto subParam = graphs_[i]->params()->vals()->subtensor(pos, params[idx]->size());
subParam->copyFrom(params[idx]);
}

// back-up shard from last graph
auto subParamLast = graphs_.back()->params()->vals()->subtensor(pos, params[idx]->size());
params[idx]->copyFrom(subParamLast);

auto subParamFirst = graphs_[0]->params()->vals()->subtensor(pos, params[idx]->size());
subParamLast->copyFrom(subParamFirst);
};
// execute for each shard
this->foreach(gather);
}
};

Ptr<Communicator> createCommunicator(const std::vector<Ptr<ExpressionGraph>>& graphs, bool noNccl = false);

}
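foreach() is the scheduling core shared by both communicators: the parameter vector is cut into ceil(total/N) shards, the last shard possibly shorter, and func(idx, pos) runs on one thread per shard. The partition arithmetic on its own:

  #include <algorithm>
  #include <cmath>
  #include <cstdio>

  int main() {
    int totalSize = 10, nShards = 4;
    int shardSize = (int)ceil(totalSize / (float)nShards);
    int pos = 0;
    for(int idx = 0; idx < nShards; ++idx) {
      int size = std::min(shardSize, totalSize);  // last shard may be smaller
      printf("shard %d: pos=%d size=%d\n", idx, pos, size);
      pos += size;
      totalSize -= size;
    }  // prints shard sizes 3, 3, 3, 1
  }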
@ -1,99 +1,139 @@
#include "training/gradient_dropping/gpu/sparse_algorithm.h"

#include "tensors/gpu/algorithm.h"
#include "tensors/gpu/cuda_helpers.h"
#include <curand.h>
#include <curand_kernel.h>
#include <thrust/binary_search.h>
#include <thrust/copy.h>
#include <thrust/device_ptr.h>
#include <thrust/device_vector.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/copy.h>
#include <thrust/binary_search.h>
#include "tensors/gpu/algorithm.h"
#include "tensors/gpu/cuda_helpers.h"

namespace marian {
namespace gpu {
struct non_zero
{
__host__ __device__
bool operator()(const float x)
{
return x != 0;
}
};
namespace gpu {
struct non_zero {
__host__ __device__ bool operator()(const float x) { return x != 0; }
};

__global__ void copy_id(float* data,
int* indices,
float* out,
int size) {
int idx = blockDim.x * blockIdx.x + threadIdx.x;
if(idx >= size)
return;
out[idx] = data[indices[idx]];
}
__global__ void copy_id(float* data, int* indices, float* out, int size) {
int idx = blockDim.x * blockIdx.x + threadIdx.x;
if(idx >= size)
return;
out[idx] = data[indices[idx]];
}

__global__ void gScatterAdd(float* denseData,
float* sparseData,
int* sparseIndices,
int denseSize,
int sparseSize,
int offset) {
int idx = blockDim.x * blockIdx.x + threadIdx.x;
if(idx >= sparseSize)
return;
if(sparseIndices[idx] >= -offset
&& sparseIndices[idx] + offset < denseSize)
denseData[sparseIndices[idx] + offset] += sparseData[idx];
}

std::vector<int> lower_bounds(int* data, std::vector<int> values, int size, DeviceId device) {
cudaSetDevice(device.no);

thrust::device_ptr<int> data_ptr(data);
thrust::device_vector<int> d_values(values);
thrust::device_vector<int> d_output(values.size());

thrust::lower_bound(data_ptr, data_ptr + size,
d_values.begin(), d_values.end(),
d_output.begin());

std::vector<int> output(values.size());
thrust::copy(d_output.begin(), d_output.end(), output.begin());

return output;
}

int buildSparse(Tensor t, float* data, int* indices) {
cudaSetDevice(t->getDevice().no);
using namespace thrust;

device_ptr<float> grad_ptr(t->data());
device_ptr<float> sparse_grad_ptr(data);
device_ptr<int> indices_ptr(indices);

int sparse_size = copy_if(make_counting_iterator<int>(0),
make_counting_iterator<int>(t->size()),
grad_ptr,
indices_ptr,
non_zero()) - indices_ptr;

int threads = 512;
int blocks = 1 + t->size() / threads;
copy_id<<<blocks, threads>>>(t->data(), indices, data, sparse_size);

return sparse_size;
}

void scatterAdd(Tensor t, float* data, int *indices, int size, int offset) {
cudaSetDevice(t->getDevice().no);

int threads = 512;
int blocks = 1 + size / threads;
gScatterAdd<<<blocks, threads>>>(
t->data(), data, indices, t->size(), size, offset);
cudaStreamSynchronize(0);
}
}
int idx = blockDim.x * blockIdx.x + threadIdx.x;
if(idx >= sparseSize)
return;
if(sparseIndices[idx] >= -offset && sparseIndices[idx] + offset < denseSize)
denseData[sparseIndices[idx] + offset] += sparseData[idx];
}

__global__ void gScatterUpdate(float* denseData,
float* sparseData,
int* sparseIndices,
int denseSize,
int sparseSize,
int offset) {
int idx = blockDim.x * blockIdx.x + threadIdx.x;
if(idx >= sparseSize)
return;
if(sparseIndices[idx] >= -offset && sparseIndices[idx] + offset < denseSize)
denseData[sparseIndices[idx] + offset] = sparseData[idx];
}

__global__ void gGather(float* denseData,
float* sparseData,
int* sparseIndices,
int denseSize,
int sparseSize,
int offset) {
int idx = blockDim.x * blockIdx.x + threadIdx.x;
if(idx >= sparseSize)
return;
if(sparseIndices[idx] >= -offset && sparseIndices[idx] + offset < denseSize)
sparseData[idx] = denseData[sparseIndices[idx] + offset];
}

std::vector<int> lower_bounds(int* data,
std::vector<int> values,
int size,
DeviceId device) {
cudaSetDevice(device.no);

thrust::device_ptr<int> data_ptr(data);
thrust::device_vector<int> d_values(values);
thrust::device_vector<int> d_output(values.size());

thrust::lower_bound(data_ptr,
data_ptr + size,
d_values.begin(),
d_values.end(),
d_output.begin());

std::vector<int> output(values.size());
thrust::copy(d_output.begin(), d_output.end(), output.begin());

return output;
}

int buildSparse(Tensor t, float* data, int* indices) {
cudaSetDevice(t->getDevice().no);
using namespace thrust;

device_ptr<float> grad_ptr(t->data());
device_ptr<float> sparse_grad_ptr(data);
device_ptr<int> indices_ptr(indices);

int sparse_size = copy_if(make_counting_iterator<int>(0),
make_counting_iterator<int>(t->size()),
grad_ptr,
indices_ptr,
non_zero())
- indices_ptr;

int threads = 512;
int blocks = 1 + t->size() / threads;
copy_id<<<blocks, threads>>>(t->data(), indices, data, sparse_size);

return sparse_size;
}

void scatterAdd(Tensor t, float* data, int* indices, int size, int offset) {
cudaSetDevice(t->getDevice().no);

int threads = 512;
int blocks = 1 + size / threads;
gScatterAdd<<<blocks, threads>>>(
t->data(), data, indices, t->size(), size, offset);
cudaStreamSynchronize(0);
}

void scatterUpdate(Tensor t, float* data, int* indices, int size, int offset) {
cudaSetDevice(t->getDevice().no);

int threads = 512;
int blocks = 1 + size / threads;
gScatterUpdate<<<blocks, threads>>>(
t->data(), data, indices, t->size(), size, offset);
cudaStreamSynchronize(0);
}

void gather(Tensor t, float* data, int* indices, int size, int offset) {
cudaSetDevice(t->getDevice().no);

int threads = 512;
int blocks = 1 + size / threads;
gGather<<<blocks, threads>>>(
t->data(), data, indices, t->size(), size, offset);
cudaStreamSynchronize(0);
}
}
}
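The three kernels differ only in the direction and combine-mode of the move between the dense buffer and the (value, index) pairs: scatterAdd accumulates, scatterUpdate assigns, gather reads back. A CPU reference for their shared semantics, with the same offset and bounds convention:

  // dense[idx[i] + offset] += vals[i]      (scatterAdd)
  // dense[idx[i] + offset]  = vals[i]      (scatterUpdate)
  // vals[i] = dense[idx[i] + offset]       (gather)
  void scatterAddRef(float* dense, int denseSize,
                     const float* vals, const int* idx, int n, int offset) {
    for(int i = 0; i < n; ++i)
      if(idx[i] >= -offset && idx[i] + offset < denseSize)
        dense[idx[i] + offset] += vals[i];
  }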
@ -4,17 +4,24 @@
#include "tensors/backend.h"
#include "tensors/tensor.h"

namespace marian {
namespace gpu {
// output is a vector of size values.size. Output[i] is lower_bound of values[i] in data
std::vector<int> lower_bounds(int* data,
std::vector<int> values,
int size,
DeviceId device);
namespace gpu {
/**
 * @brief Output[i] is lower_bound of values[i] in data.
 *
 * @return A vector of size values.size
 */
std::vector<int> lower_bounds(int* data,
std::vector<int> values,
int size,
DeviceId device);

int buildSparse(Tensor t, float* data, int* indices);

void scatterAdd(Tensor t, float* data, int *indices, int size, int offset);
}
}
void scatterAdd(Tensor t, float* data, int* indices, int size, int offset);

void scatterUpdate(Tensor t, float* data, int* indices, int size, int offset);

void gather(Tensor t, float* data, int* indices, int size, int offset);
}
}
@ -1,14 +1,15 @@
#pragma once

#include <algorithm>
#include <memory>

#include "common/definitions.h"
#include "tensors/backend.h"

#include "tensors/tensor_operators.h"
#include "tensors/device.h"
#include "tensors/tensor_operators.h"

#ifdef CUDA_FOUND
#include "tensors/gpu/algorithm.h"
#include "training/gradient_dropping/gpu/sparse_algorithm.h"
#endif

@ -20,26 +21,26 @@ class SparseTensorBase : public std::enable_shared_from_this<SparseTensorBase> {
int size_;
int capacity_;
Ptr<Backend> backend_;

std::vector<Ptr<Device>> devices;

template<typename T>
template <typename T>
T* newData(int size, Ptr<Backend> backend) {
Ptr<Device> device = DispatchDevice(backend->getDevice());
device->reserve(size * sizeof(T));
devices.push_back(device);
return (T*)device->data();
}

public:
SparseTensorBase(int capacity, Ptr<Backend> backend)
: backend_(backend), capacity_(capacity) {
data_ = newData<float>(capacity, backend);
indices_ = newData<int>(capacity, backend);
}

SparseTensorBase(float* data, int* indices, int size, Ptr<Backend> backend)
: backend_(backend) {
data_ = data;
indices_ = indices;
size_ = size;
@ -60,6 +61,37 @@ public:

int* indices() { return indices_; }

// copy to cpu vector
void get(std::vector<float>& g, std::vector<int>& i) {
int s = std::min((int)g.size(), size());
if(backend_->getDevice().type == DeviceType::cpu) {
std::copy(data(), data() + s, g.data());
std::copy(indices(), indices() + s, i.data());
}
#ifdef CUDA_FOUND
else {
gpu::copy(backend_, data(), data() + s, g.data());
gpu::copy(backend_, indices(), indices() + s, i.data());
}
#endif
}

// copy from cpu vector
void set(const std::vector<float>& g, const std::vector<int>& i) {
int s = std::min((int)g.size(), capacity());
size_ = s;
if(backend_->getDevice().type == DeviceType::cpu) {
std::copy(g.data(), g.data() + s, data());
std::copy(i.data(), i.data() + s, indices());
}
#ifdef CUDA_FOUND
else {
gpu::copy(backend_, g.data(), g.data() + s, data());
gpu::copy(backend_, i.data(), i.data() + s, indices());
}
#endif
}

void copyFrom(float* ndata, int* nindices, int nsize) {
size_ = nsize;
if(backend_->getDevice().type == DeviceType::cpu) {
@ -77,11 +109,13 @@ public:
copyFrom(t->data(), t->indices(), t->size());
}

void toDense(Tensor t, int offset) {
// Convert sparseTensor into a Tensor
void toDense(Tensor t, int offset = 0) {
t->set(0);
scatterAdd(t, offset);
}

// Convert a tensor into a sparse tensor format
void fromDense(Tensor t) {
if(backend_->getDevice().type == DeviceType::cpu) {
ABORT("Gradient Dropping for CPU is not yet supported");
@ -94,6 +128,7 @@ public:
#endif
}

// Add t[indices[i]] += data[i]
void scatterAdd(Tensor t, int offset = 0) {
if(backend_->getDevice().type == DeviceType::cpu) {
ABORT("Gradient Dropping for CPU is not yet supported");
@ -105,6 +140,30 @@ public:
#endif
}

// Set t[indices[i]] = data[i]
void scatterUpdate(Tensor t, int offset = 0) {
if(backend_->getDevice().type == DeviceType::cpu) {
ABORT("Gradient Dropping for CPU is not yet supported");
}
#ifdef CUDA_FOUND
else {
gpu::scatterUpdate(t, data(), indices(), size(), offset);
}
#endif
}

// data[i] = t[indices[i]]
void gather(Tensor t, int offset = 0) {
if(backend_->getDevice().type == DeviceType::cpu) {
ABORT("Gradient Dropping for CPU is not yet supported");
}
#ifdef CUDA_FOUND
else {
gpu::gather(t, data(), indices(), size(), offset);
}
#endif
}

std::shared_ptr<SparseTensorBase> subtensor(int pos, int subsize) {
int startOffset = 0;
int endOffset = 0;
@ -118,8 +177,8 @@ public:
}
#ifdef CUDA_FOUND
else {
std::vector<int> outputs = gpu::lower_bounds(
indices(), values, size(), backend_->getDevice());
std::vector<int> outputs
= gpu::lower_bounds(indices(), values, size(), backend_->getDevice());

startOffset = outputs[0];
endOffset = outputs[1];
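Together these additions give SparseTensorBase the round trip used by the sparse-fetch path in graph_group_async_drop.cpp below: fromDense() compresses a tensor, gather() and scatterUpdate() move values against a dense buffer, toDense() reconstructs. A hedged usage sketch, where sparse is assumed to be an allocated SparseTensor and t a dense tensor on the same device:

  // compress: keep the non-zeros of t as (value, index) pairs
  sparse->fromDense(t);

  // refresh the stored values from the current contents of t,
  // reusing the indices selected above
  sparse->gather(t);

  // write the sparse values back: assignment, not accumulation
  sparse->scatterUpdate(t);

  // or rebuild a dense tensor from scratch (zero-fill, then scatterAdd)
  sparse->toDense(t, /*offset=*/0);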
@ -32,7 +32,7 @@ public:

virtual ~GraphGroup() {}

virtual void update(Ptr<data::Batch>) = 0;
virtual void update(Ptr<data::Batch> batch) = 0;

virtual void load() = 0;

@ -8,113 +8,74 @@
|
||||
|
||||
namespace marian {
|
||||
|
||||
Tensor AsyncGraphGroupDrop::newTensor(int size, Ptr<Backend> backend) {
|
||||
Tensor t;
|
||||
Ptr<TensorAllocator> allocator_ = New<TensorAllocator>(backend);
|
||||
allocator_->reserveExact(size * sizeof(float));
|
||||
allocator_->allocate(t, {1, size});
|
||||
allocators.push_back(allocator_);
|
||||
|
||||
return t;
|
||||
}
|
||||
|
||||
void AsyncGraphGroupDrop::fetchParams(Tensor oldParams,
|
||||
const std::vector<Tensor>& params,
|
||||
int device_id) {
|
||||
using namespace functional;
|
||||
// @TODO read guard on parameters
|
||||
int pos = 0;
|
||||
// Full fetch when fetching moving average OR still in warm-up period.
|
||||
if(¶ms == ¶msAvg_ || fetchStep_[device_id]++ <= dropping_warmup) {
|
||||
AsyncGraphGroup::fetchParams(oldParams, params, device_id);
|
||||
return;
|
||||
}
|
||||
|
||||
std::vector<std::thread> threads;
|
||||
for(int i = 0; i < devices_.size(); i++) {
|
||||
int pos = 0;
|
||||
for(int idx = 0; idx < devices_.size(); idx++) {
|
||||
threads.emplace_back(std::thread(
|
||||
[&](int idx, int pos) {
|
||||
[=](int idx, int pos) {
|
||||
auto sparseGrad = sparseGrads_[device_id][idx];
|
||||
auto sparseShard = sparseShards_[device_id][idx];
|
||||
|
||||
// individual mutex per-shard
|
||||
std::lock_guard<std::mutex> guard(shardSync_[idx]);
|
||||
|
||||
// normal fetch
|
||||
if(fetchStep_[device_id] <= dropping_warmup
|
||||
|| ¶ms == ¶msAvg_) { // Do not use sparse fetch when
|
||||
// fetching from paramsAvg
|
||||
oldParams->subtensor(pos, params[idx]->size())
|
||||
->copyFrom(params[idx]);
|
||||
paramsLocal_[device_id][idx]->copyFrom(params[idx]);
|
||||
return;
|
||||
}
|
||||
|
||||
// sparse fetch
|
||||
// get delta : params latest version - current param (locally)
|
||||
Element(_1 = _2 - _3,
|
||||
paramsDelta_[idx],
|
||||
params[idx],
|
||||
paramsLocal_[device_id][idx]);
|
||||
|
||||
// update current local param
|
||||
paramsLocal_[device_id][idx]->copyFrom(params[idx]);
|
||||
|
||||
// get sparse delta
|
||||
fetchDropper[device_id][idx]->dropGraph(paramsDelta_[idx],
|
||||
fetchSparseGradient_[idx],
|
||||
droping_rate,
|
||||
dropping_momentum);
|
||||
|
||||
// move sparse delta
|
||||
fetchShardedSparseGradient_[device_id][idx]->copyFrom(
|
||||
fetchSparseGradient_[idx]);
|
||||
|
||||
fetchShardedSparseGradient_[device_id][idx]->scatterAdd(
|
||||
sparseShard->gather(params[idx]);
|
||||
sparseGrad->copyFrom(sparseShard);
|
||||
sparseGrad->scatterUpdate(
|
||||
oldParams->subtensor(pos, params[idx]->size()));
|
||||
},
|
||||
i,
|
||||
idx,
|
||||
pos));
|
||||
|
||||
pos += shardSize_;
|
||||
}
|
||||
#if 0
|
||||
for(auto&& t : threads)
|
||||
t.join();
|
||||
// BUGBUG [compiler]: This fails to compile on VS 2015, for the comparison of the iterator with end()
|
||||
#else
|
||||
for (size_t i = 0; i < threads.size(); i++)
|
||||
threads[i].join();
|
||||
#endif
|
||||
fetchStep_[device_id]++;
|
||||
for(size_t i = 0; i < threads.size(); i++)
|
||||
threads[i].join();
|
||||
}
void AsyncGraphGroupDrop::pushGradients(Tensor newGrads,
size_t batch_words,
int device_id) {
if(pushStep_[device_id]++ <= dropping_warmup) {
if(pushStep_[device_id]++ < dropping_warmup) {
AsyncGraphGroup::pushGradients(newGrads, batch_words, device_id);
return;
}

// get the sparse gradient
pushDropper_[device_id]->dropGraph(newGrads,
pushSparseGradient_[device_id],
droping_rate,
dropping_momentum);

SparseTensor newSparseGrads = pushSparseGradient_[device_id];
// add instead of copy?
std::vector<std::thread> threads;
int pos = 0;
for(int idx = 0; idx < devices_.size(); idx++) {
threads.emplace_back(std::thread(
[=](int idx, int pos) {
auto dropper = droppers_[device_id][idx];
auto sparseGrad = sparseGrads_[device_id][idx];
auto sparseShard = sparseShards_[device_id][idx];
auto tensor = newGrads->subtensor(pos, grads_[idx]->size());
// individual mutex per-shard
std::lock_guard<std::mutex> guard(shardSync_[idx]);

// split to shard
SparseTensor subGrad
= newSparseGrads->subtensor(pos, grads_[idx]->size());
// drop the gradients
dropper->dropGraph(
tensor, sparseGrad, droping_rate, dropping_momentum);

// send the sharded sparse tensor
pushShardedSparseGradient_[idx]->copyFrom(subGrad);
sparseShard->copyFrom(sparseGrad);

// convert back to dense, store it in grads_[idx]
pushShardedSparseGradient_[idx]->toDense(grads_[idx], -pos);
// sparseShard indices is equal to the indices of the sparse gradient
// which will be used for sparse fetching
sparseShard->toDense(grads_[idx]);

// optimize
if(scaleLearningRate_) {
shardOpt_[idx]->update(
params_[idx], grads_[idx], batch_words / avgBatchWords_);
@@ -125,7 +86,6 @@ void AsyncGraphGroupDrop::pushGradients(Tensor newGrads,
if(movingAvg_)
updateMovingAverage(
paramsAvg_[idx], params_[idx], scheduler_->numberOfBatches());

},
idx,
pos));
@@ -140,51 +100,34 @@ void AsyncGraphGroupDrop::init(Ptr<data::Batch> batch) {
AsyncGraphGroup::init(batch);
// extra inits for gradient dropping
if(drop_first) {
int totalSize = graphs_[0]->params()->vals()->size();
int sparseCap = totalSize * 1.5 * (1.0 - droping_rate);
int shardSize = ceil(totalSize / devices_.size());

for(int i = 0; i < devices_.size(); i++)
paramsLocal_.push_back(std::vector<Tensor>());

for(int i = 0; i < devices_.size(); i++) {
// warm-up counter
fetchStep_.push_back(0);
pushStep_.push_back(0);
fetch_ready.push_back(false);

// temporary tensor to compute parameter delta before fetching
paramsDelta_.push_back(newTensor(shardSize, graphs_[i]->getBackend()));
// Size of the sparse tensor
int totalSize = graphs_[0]->params()->vals()->size();
int sparseCap = totalSize * 1.2 * (1.0 - droping_rate);

// tensors to store local params history
for(int h_id = 0; h_id < devices_.size(); h_id++) {
Tensor tmp = newTensor(params_[i]->size(), graphs_[i]->getBackend());
tmp->copyFrom(params_[i]);
paramsLocal_[h_id].push_back(tmp);
}

// individual Gradient dropper per-device
pushDropper_.push_back(PrepareGradientDrop(graphs_[i]->getDevice()));

// N-dropper for fetch
// prepare droppers
std::vector<GradientDrop> tmpDropper;
for(auto device : devices_)
tmpDropper.push_back(PrepareGradientDrop(graphs_[i]->getDevice()));
fetchDropper.push_back(tmpDropper);

// sparsetensor to store sparsified gradients per-device
pushSparseGradient_.push_back(SparseTensor(
new SparseTensorBase(sparseCap, graphs_[i]->getBackend())));

pushShardedSparseGradient_.push_back(SparseTensor(
new SparseTensorBase(sparseCap, graphs_[i]->getBackend())));
fetchSparseGradient_.push_back(SparseTensor(new SparseTensorBase(
sparseCap / devices_.size(), graphs_[i]->getBackend())));
droppers_.push_back(tmpDropper);

// sparsetensor to store sparsified gradients per-device per-shard
std::vector<SparseTensor> tmp;
for(int j = 0; j < devices_.size(); j++)
tmp.push_back(SparseTensor(new SparseTensorBase(
sparseCap / devices_.size(), graphs_[i]->getBackend())));
fetchShardedSparseGradient_.push_back(tmp);
sparseGrads_.push_back(tmp);

std::vector<SparseTensor> tmp2;
for(int j = 0; j < devices_.size(); j++)
tmp2.push_back(SparseTensor(new SparseTensorBase(
sparseCap / devices_.size(), graphs_[j]->getBackend())));
sparseShards_.push_back(tmp2);
}
drop_first = false;
}
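// Sizing note (illustrative numbers, not from the change itself): with a
// 100M-parameter model and droping_rate = 0.99, roughly 1% of gradient
// entries survive dropping, so the new sizing reserves about
// 100,000,000 * 1.2 * 0.01 = 1,200,000 sparse entries -- a 20% safety
// margin over the expected number of surviving values (down from the
// previous 1.5x margin).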
@@ -10,6 +10,7 @@ namespace marian {
class AsyncGraphGroupDrop : public AsyncGraphGroup {
std::vector<int> fetchStep_;
std::vector<int> pushStep_;
std::vector<bool> fetch_ready;

bool drop_first = 1;

@@ -17,21 +18,9 @@ class AsyncGraphGroupDrop : public AsyncGraphGroup {
float droping_rate;
float dropping_momentum;

std::vector<GradientDrop> pushDropper_;
std::vector<std::vector<GradientDrop>> fetchDropper;
std::vector<std::vector<GradientDrop>> droppers_;

std::vector<SparseTensor> pushSparseGradient_;
std::vector<SparseTensor> pushShardedSparseGradient_;

std::vector<SparseTensor> fetchSparseGradient_;
std::vector<std::vector<SparseTensor>> fetchShardedSparseGradient_;

std::vector<Tensor> paramsDelta_;
std::vector<std::vector<Tensor>> paramsLocal_;

std::vector<Ptr<TensorAllocator>> allocators;

Tensor newTensor(int size, Ptr<Backend> backend);
std::vector<std::vector<SparseTensor>> sparseGrads_, sparseShards_;

protected:
void init(Ptr<data::Batch> batch);

@@ -52,19 +52,19 @@ void MultiNodeGraphGroup::init(Ptr<data::Batch> batch) {
}

// setup delayed gradient storage
if (tau_ > 1) {
if(tau_ > 1) {
delay_count = std::vector<size_t>(mpi_comm_world_size_);
totalBatchWords = std::vector<int>(mpi_comm_world_size_);
optDelayMutex_ = std::vector<std::mutex>(mpi_comm_world_size_);

for (int i = 0;i < mpi_comm_world_size_; i++) {
for(int i = 0; i < mpi_comm_world_size_; i++) {
// Shard buffers across GPUs
auto backend = clientGraphs_[i % devices_.size()]->getBackend();
Tensor accGrad = newTensor(nodeSizes_[i], backend);
Tensor accGradBuff = newTensor(nodeSizes_[i], backend);
accGradients.push_back(accGrad);
accGradientBuffer.push_back(accGradBuff);
}
}
}
}
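// For intuition (assumed setting, not part of the change): with
// --optimizer-delay 4, delay_count lets each node accumulate the gradients
// of 4 consecutive mini-batches into accGradients before a single optimizer
// step is taken, which behaves like training with a roughly 4x larger batch
// at a quarter of the communication frequency.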
@@ -221,7 +221,7 @@ void MultiNodeGraphGroup::calculateShardSizes() {
*/
void MultiNodeGraphGroup::initShardGpuTensors() {
size_t offset = 0;
for (int i = 0; i < mpi_my_rank_; i++) {
for(int i = 0; i < mpi_my_rank_; i++) {
offset += nodeSizes_[i];
}
for(int shard = 0; shard < devices_.size(); shard++) {
@@ -242,7 +242,8 @@ void MultiNodeGraphGroup::initShardGpuTensors() {
* updated parameters.
*/
void MultiNodeGraphGroup::launchServerThread() {
#if MPI_FOUND
// @TODO: move CUDA stuff into separate .cu files and remove '&& CUDA_FOUND'
#if MPI_FOUND && CUDA_FOUND
serverShardThread_ = new std::thread([this] {
// keep track of number of nodes still communicating with this shard
int nCommunicatingNodes = mpi_comm_world_size_;
@@ -400,7 +401,8 @@ void MultiNodeGraphGroup::synchronizeWithServerShards(Tensor newGrads,
Tensor oldParams,
int gpu,
size_t batchWords) {
#if MPI_FOUND
// @TODO: move CUDA stuff into separate .cu files and remove '&& CUDA_FOUND'
#if MPI_FOUND && CUDA_FOUND
size_t offset = 0;
for(int node = 0; node < mpi_comm_world_size_; node++) {
size_t nodeSize = nodeSizes_[node];
@@ -410,9 +412,10 @@ void MultiNodeGraphGroup::synchronizeWithServerShards(Tensor newGrads,
Tensor gradient;

// Delayed Gradient Update
if (tau_ > 1) {
if(tau_ > 1) {
std::lock_guard<std::mutex> guard(optDelayMutex_[node]);
accGradientBuffer[node]->copyFrom(newGrads->subtensor(offset, nodeSize));
accGradientBuffer[node]->copyFrom(
newGrads->subtensor(offset, nodeSize));
// Accumulate the gradient
using namespace functional;
Element(_1 += _2, accGradients[node], accGradientBuffer[node]);
@@ -420,14 +423,14 @@ void MultiNodeGraphGroup::synchronizeWithServerShards(Tensor newGrads,
totalBatchWords[node] += batchWords;
delay_count[node]++;

if (delay_count[node] < tau_)
if(delay_count[node] < tau_)
continue;
delay_count[node] = 0;
gradient = accGradients[node];
batchWords = totalBatchWords[node];
} else {
} else {
gradient = newGrads->subtensor(offset, nodeSize);
}
}

// Copy grads from GPU to CPU (for MPI sending)
cudaMemcpy(clientCommBuffersCPU_[gpu].data(),
@@ -455,7 +458,7 @@ void MultiNodeGraphGroup::synchronizeWithServerShards(Tensor newGrads,
MPI_TAG_GRAD_PUSH_,
MPI_COMM_WORLD);
// Reset total gradient and batch words
if (tau_ > 1) {
if(tau_ > 1) {
std::lock_guard<std::mutex> guard(optDelayMutex_[node]);
accGradients[node]->set(0);
totalBatchWords[node] = 0;
@@ -554,9 +557,9 @@ void MultiNodeGraphGroup::execute(Ptr<data::Batch> batch) {
auto costNode = builder->build(graph, batch);

#if MPI_FOUND
if (t == 0) {
if(t == 0) {
MPI_Barrier(MPI_COMM_WORLD);
if (my_id != 0)
if(my_id != 0)
graph->params()->vals()->copyFrom(clientGraphs_[0]->params()->vals());
MPI_Barrier(MPI_COMM_WORLD);
}
@@ -628,20 +631,19 @@ void MultiNodeGraphGroup::execute(Ptr<data::Batch> batch) {
// Wait until the thread that wants to do validation is finished.
clientThreadPool_->wait_for_one(lock);

if (options_->get<std::string>("cost-type") != "ce-sum")
if(options_->get<std::string>("cost-type") != "ce-sum")
cost /= tau_;

if (tau_ > 1) {
if(tau_ > 1) {
std::vector<size_t> fakeLength = {1, 1};
auto fb = data::CorpusBatch::fakeBatch(fakeLength,
num_seen_sentences,
NULL);
auto fb = data::CorpusBatch::fakeBatch(
fakeLength, num_seen_sentences, NULL);
fb->front()->setWords(num_seen_words);
scheduler_->update(cost, fb);
} else {
scheduler_->update(cost, batch);
}

num_seen_words = 0;
num_seen_sentences = 0;
cost = 0;
@@ -653,11 +655,11 @@ void MultiNodeGraphGroup::execute(Ptr<data::Batch> batch) {
// a safe state.
clientThreadPool_->wait_for_others(lock);
#if MPI_FOUND
//wait until other nodes are ready
// wait until other nodes are ready
MPI_Barrier(MPI_COMM_WORLD);

// TODO: Saving is broken
//if(mpi_my_rank_ == 0 && scheduler_->saving())
// if(mpi_my_rank_ == 0 && scheduler_->saving())
//   this->save(graph);

if(mpi_my_rank_ == 0 && scheduler_->validating())
@@ -2,6 +2,9 @@

#if MPI_FOUND
#include "mpi.h"
#endif

#ifdef CUDA_FOUND
#include "cuda_runtime.h"
#endif

@@ -124,7 +127,7 @@ protected:
int mpi_comm_world_size_{1};

/**
* Flag to indicate that an MPI message contains message info
* Flag to indicate that an MPI message contains message info
* before sending the gradient (client -> server).
*/
static const int MPI_TAG_GRAD_PUSH_MSG_{0};
@@ -233,7 +236,7 @@ protected:
/**
* LocalOptimizers related variables
*/
bool useLocalOpt_;
// bool useLocalOpt_;

/**
* Allocate new tensor on given GPU and store allocator.
@@ -405,10 +408,10 @@ public:
MultiNodeGraphGroup(Ptr<Config> options)
: GraphGroup(options),
tau_{options_->get<size_t>("optimizer-delay")},
useLocalOpt_{options_->get<bool>("multi-node-local-optimizers")},
// useLocalOpt_{options_->get<bool>("multi-node-local-optimizers")},
clientCommOverlap{options_->get<bool>("multi-node-overlap")} {
// Set up devices for this node
setupMPI(); //Setup MPI before creating device vectors
setupMPI(); // Setup MPI before creating device vectors
std::vector<size_t> devices;
for(auto& d : options_->getDevices())
devices.push_back(d.no);
@@ -526,8 +529,6 @@ public:
return GraphGroup::collectStats(clientGraphs_[0], clientBuilders_[0]);
}

virtual void finalize() {
finalized_ = true;
}
virtual void finalize() { finalized_ = true; }
};
}
src/training/graph_group_multinode_sync.cpp (new file, 280 lines)
@@ -0,0 +1,280 @@
#include "training/graph_group_multinode_sync.h"
#include "functional/functional.h"
#include "tensors/tensor_operators.h"

namespace marian {

void MultiNodeGraphGroupSync::updateMovingAverage(Tensor paramsAvg,
Tensor params,
size_t batches) {
using namespace functional;
float decay
= std::max(mvDecay_, 1.f - (float)(batches + 1) / (float)(batches + 10));
Element(_1 = ((1.f - decay) * _1) + (decay * _2), paramsAvg, params);
}
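A minimal standalone sketch of the decay schedule above (illustrative, not part of the commit): early in training the max() picks the batch-dependent term, so the running average tracks the current parameters closely; after enough batches it falls back to the constant mvDecay_ and the average moves slowly.

#include <algorithm>
#include <cstdio>

int main() {
  const float mvDecay = 1e-4f;  // matches the default mvDecay_ above
  for (long batches : {0L, 100L, 10000L, 1000000L}) {
    float decay
        = std::max(mvDecay, 1.f - (float)(batches + 1) / (float)(batches + 10));
    std::printf("batches=%ld  weight on current params=%g\n", batches, decay);
  }
  return 0;
}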
/**
* Set given scheduler to register training observers on the shard optimizers.
*/
void MultiNodeGraphGroupSync::setScheduler(Ptr<Scheduler> scheduler) {
scheduler_ = scheduler;
// optimizer has to be registered last to see a change of learning rate
scheduler_->registerTrainingObserver(scheduler_);
scheduler_->registerTrainingObserver(syncOptimizer_);
}

/**
* Allocate new tensor on given GPU and store allocator.
*/
Tensor MultiNodeGraphGroupSync::newTensor(int size, Ptr<Backend> backend) {
Tensor t;
Ptr<TensorAllocator> allocator = New<TensorAllocator>(backend);
allocator->reserveExact(size * sizeof(float));
allocator->allocate(t, {1, size});
allocators_.push_back(allocator);
return t;
}

/**
* Setup training environment and launch server thread and (if enabled) client
* communication overlap threads.
* Includes setting up MPI, node and shard sizes, clients, server shards and
* communication overlap stuff.
*/
void MultiNodeGraphGroupSync::init(Ptr<data::Batch> batch) {
// Setup clients and shards
setupClients(batch);
int network_size = clientGraphs_[0]->params()->vals()->size();
LOG(info, "model size = {} float params", network_size);
if (movingAvg_)
paramsAvg_ = newTensor(network_size, clientGraphs_.back()->getBackend());

// setup sync sgd storage; we keep the summed gradient on node 0
sumGradientBuffer = newTensor(network_size, clientGraphs_[0]->getBackend());
accGradientsSync = newTensor(network_size, clientGraphs_[0]->getBackend());
}

/**
* Initialize the CPU arrays, with pinned memory for faster CudaMemCpy operations.
* Requires the graph to be initialized first so we know its size.
*/
void MultiNodeGraphGroupSync::initCPUArrays() {
accGradientsSync_cpu = std::vector<float>(clientGraphs_[0]->params()->vals()->size());
receiveBuffer_cpu = std::vector<float>(clientGraphs_[0]->params()->vals()->size());
}

/**
* Setup MPI world size and rank of this node.
*/
void MultiNodeGraphGroupSync::setupMPI() {
#if MPI_FOUND
MPI_Comm_size(MPI_COMM_WORLD, &mpi_comm_world_size_);
MPI_Comm_rank(MPI_COMM_WORLD, &mpi_my_rank_);
#endif
}

/**
* Setup clients that will compute gradients and communicate them with the
* server shards.
* There is one client per GPU.
*/
void MultiNodeGraphGroupSync::setupClients(Ptr<data::Batch> batch) {
runBatchThroughClientGraphs(batch);
initCPUArrays();
}

/**
* Initialize the graphs (models) of all clients on this node with the given
* batch.
*/
void MultiNodeGraphGroupSync::runBatchThroughClientGraphs(Ptr<data::Batch> batch) {
for(int i = 0; i < devices_.size(); i++) {
THREAD_GUARD(clientBuilders_[i]->build(clientGraphs_[i], batch);
clientGraphs_[i]->forward();
clientGraphs_[i]->getBackend()->synchronize(););
}
}

/**
* Sums the gradient from one client into the node-local accumulator,
* taking care of locking.
*/
void MultiNodeGraphGroupSync::sumGRAD(Tensor gradient) {
std::lock_guard<std::mutex> guard(sumGradientMutex_);
sumGradientBuffer->copyFrom(gradient);
using namespace functional; // @TODO: it may make more sense to do this on the CPU
Element(_1 += _2, accGradientsSync, sumGradientBuffer);
}

/**
* On rank 0 this is a local update; on other ranks it is a remote send and
* receive. Make sure to call this from device 0 only.
*/
void MultiNodeGraphGroupSync::sendReceiveUpdateSync() {
#if MPI_FOUND
int network_size = accGradientsSync_cpu.size();

// Copy the data to the CPU
accGradientsSync->get(accGradientsSync_cpu);

// Wait until all nodes are ready
MPI_Barrier(MPI_COMM_WORLD);

int reduce_result = MPI_Allreduce(accGradientsSync_cpu.data(), // CPU buffers
receiveBuffer_cpu.data(),
network_size,
MPI_FLOAT,
MPI_SUM,
MPI_COMM_WORLD);

// Copy the data back to the GPU and do optimizer update
// Do update with last GPU to distribute the memory
clientGraphs_.back()->params()->grads()->set(receiveBuffer_cpu);

// Perform optimizer step
syncOptimizer_->update(clientGraphs_.back());

if(movingAvg_)
updateMovingAverage(
paramsAvg_, clientGraphs_.back()->params()->vals(),
scheduler_->numberOfBatches());

// Distribute the updated parameters to the rest of the devices
std::vector<std::thread> threads;
for(int idx = 0; idx < devices_.size() - 1; idx++) {
threads.emplace_back(std::thread(
[=](int idx) {
clientGraphs_[idx]->params()->vals()->copyFrom(
clientGraphs_.back()->params()->vals());
},
idx));
}
for(auto&& t : threads) {
t.join();
}

// set the accumulating buffers to zero
accGradientsSync->set(0);
std::fill(accGradientsSync_cpu.begin(), accGradientsSync_cpu.end(), 0);
std::fill(receiveBuffer_cpu.begin(), receiveBuffer_cpu.end(), 0);
#endif
}
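The core of the exchange above is a single MPI_Allreduce over the flattened gradient. A minimal, self-contained sketch of that pattern (toy buffer size; error handling and the GPU transfers omitted):

#include <mpi.h>
#include <vector>

int main(int argc, char** argv) {
  MPI_Init(&argc, &argv);
  std::vector<float> localGrad(1024, 1.f);          // stand-in for accGradientsSync_cpu
  std::vector<float> summedGrad(localGrad.size());  // stand-in for receiveBuffer_cpu
  // Every rank contributes its buffer and receives the element-wise sum,
  // after which each node can apply the same optimizer step locally.
  MPI_Allreduce(localGrad.data(), summedGrad.data(), (int)localGrad.size(),
                MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD);
  MPI_Finalize();
  return 0;
}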
/**
* Execute given batch on this node, pushing/pulling the resulting
* gradients/parameters to/from the server shards
* or -- if comm. overlap enabled -- to/from the communication buffers, summing
* gradients locally if the communication thread is busy.
*
* @param batch Batch on which to perform forward and backward passes.
*/
void MultiNodeGraphGroupSync::execute(Ptr<data::Batch> fullBatch) {
if(!initialized_) {
init(fullBatch);
initialized_ = true;
}

std::vector<Ptr<data::Batch>> batches = fullBatch->split(devices_.size());

static int t = 0;

static float cost = 0;
static size_t num_seen_words = 0;
static size_t num_seen_sentences = 0;

{
auto task = [this, batches](int my_id) {
auto batch = batches[my_id];
auto graph = clientGraphs_[my_id];
auto builder = clientBuilders_[my_id];

auto costNode = builder->build(graph, batch);

if (t == 0) {
if (my_id != 0)
graph->params()->vals()->copyFrom(clientGraphs_[0]->params()->vals());
}

graph->forward();
{
std::lock_guard<std::mutex> guard(sumCostMutex_);
cost += costNode->scalar();
num_seen_words += batch->words();
num_seen_sentences += batch->size();
}
graph->backward();

graph->getBackend()->synchronize(); // @TODO: check why this synchronization is needed here

sumGRAD(graph->params()->grads());
};

ThreadPool pool(devices_.size(), devices_.size());
for(int idx = 0; idx < devices_.size(); ++idx)
pool.enqueue(task, idx);
}

if (t % tau_ == 0)
sendReceiveUpdateSync();

t++;

// Run scheduler (if enabled)
if(t % tau_ == 0 && scheduler_) {
if (options_->get<std::string>("cost-type") != "ce-sum")
cost /= (tau_ * devices_.size());

if (tau_ > 1) {
std::vector<size_t> fakeLength = {1, 1};
auto fb = data::CorpusBatch::fakeBatch(fakeLength,
num_seen_sentences,
NULL);
fb->front()->setWords(num_seen_words);
scheduler_->update(cost, fb);
} else {
scheduler_->update(cost, fullBatch);
}

num_seen_words = 0;
num_seen_sentences = 0;
cost = 0;

if((scheduler_->saving() || scheduler_->validating())) {
#if MPI_FOUND
// wait until other nodes are ready
MPI_Barrier(MPI_COMM_WORLD);

// TODO: Saving is broken
// if(mpi_my_rank_ == 0 && scheduler_->saving())
//   this->save(graph);

if(mpi_my_rank_ == 0 && scheduler_->validating()) {
// temporarily save current params
if(movingAvg_)
accGradientsSync->copyFrom(clientGraphs_[0]->params()->vals());

if(movingAvg_)
for(auto graph : clientGraphs_)
graph->params()->vals()->copyFrom(paramsAvg_);

scheduler_->validate(clientGraphs_);

if(movingAvg_)
for(auto graph : clientGraphs_)
graph->params()->vals()->copyFrom(accGradientsSync);
}

// inform other nodes to continue
MPI_Barrier(MPI_COMM_WORLD);
#endif
}
}
}
}
src/training/graph_group_multinode_sync.h (new file, 305 lines)
@@ -0,0 +1,305 @@
#pragma once

#if MPI_FOUND
#include "mpi.h"
#endif

#ifdef CUDA_FOUND
#include "cuda_runtime.h"
#endif

#include <condition_variable>
#include <future>
#include <thread>

#include <boost/filesystem.hpp>
#include <boost/thread/locks.hpp>
#include <boost/thread/shared_mutex.hpp>

#include "3rd_party/threadpool.h"
#include "training/graph_group.h"

namespace marian {

/**
* Multi-node graph group for synchronous training over multiple
* machines, each with one or multiple GPUs.
*/
class MultiNodeGraphGroupSync : public GraphGroup {
public:
virtual void setScheduler(Ptr<Scheduler> scheduler);

protected:
////////////////////////////////////////////////////////////////////////////
// General variables.

/** Number of clients on nodes in MPI world (cluster). */
std::vector<int> numberClientsOfNodes_; // @TODO: not used for now, but might be useful later

/** Whether graph group has been properly initialized with a first batch. */
bool initialized_{false};

/** Memory allocators for tensors (GPUs). */
std::vector<Ptr<TensorAllocator>> allocators_;

////////////////////////////////////////////////////////////////////////////
// Client variables.

/** Graph builders for clients (which run forward and backward passes). */
std::vector<Ptr<models::ModelBase>> clientBuilders_;

/** Graphs of clients. */
std::vector<Ptr<ExpressionGraph>> clientGraphs_;

/** Devices (GPUs) on this node. */
std::vector<size_t> devices_;

/** Mutex to ensure clients are uniquely assigned to graphs and builders. */
std::mutex mutexClientInit_;

/** Mutex to avoid race conditions in scheduler. */
std::mutex schedulerMutex_;

/**
* Batch number counter used for evenly distributing mini-batches across
* nodes.
*/
size_t batchIter_ = 0;

////////////////////////////////////////////////////////////////////////////
// Communication variables.

/** MPI rank of this node. */
int mpi_my_rank_{0};

/** Number of nodes in MPI world (cluster). */
int mpi_comm_world_size_{1};

/**
* Variables for optimizer delay and synchronous SGD
*/
size_t tau_{1};
std::mutex sumGradientMutex_;
std::mutex updateParamsMutex_;
std::mutex sumCostMutex_;
Tensor accGradientsSync;
Tensor sumGradientBuffer;
Tensor paramsAvg_;
std::vector<float> accGradientsSync_cpu;
std::vector<float> receiveBuffer_cpu;
bool synchronization_happened{false};

Ptr<OptimizerBase> syncOptimizer_;

std::vector<std::mutex> optDelayMutex_;
std::vector<size_t> delay_count;
std::vector<int> totalBatchWords;
std::vector<Tensor> accGradients, accGradientBuffer;

bool movingAvg_{false};
float mvDecay_{1e-4};

/**
* Allocate new tensor on given GPU and store allocator.
*/
Tensor newTensor(int size, Ptr<Backend> backend);

/*
* exponential smoothing
*/
void updateMovingAverage(Tensor paramsAvg, Tensor params, size_t batches);

/**
* Setup training environment and launch server thread and (if enabled) client
* communication overlap threads.
* Includes setting up MPI, node and shard sizes, clients, server shards and
* communication overlap stuff.
*/
virtual void init(Ptr<data::Batch> batch);

/**
* Setup MPI world size and rank of this node.
*/
void setupMPI();

/**
* Setup clients that will compute gradients and communicate them with the
* server shards.
* There is one client per GPU.
*/
void setupClients(Ptr<data::Batch> batch);

/**
* Initialize the graphs (models) of all clients on this node with the given
* batch.
*/
void runBatchThroughClientGraphs(Ptr<data::Batch> batch);

/**
* Initialize the CPU arrays, with pinned memory for faster CudaMemCpy
* operations.
*/
void initCPUArrays();

/**
* Sums the gradients from a node, taking care of locking.
* @param gradient - the gradient
*/
void sumGRAD(Tensor gradient);

/**
* Does the MPI communication, parameter update and copying back of parameters.
* @TODO: this function does a lot at once; consider splitting it up.
*/
void sendReceiveUpdateSync();

void execute(Ptr<data::Batch> batch);

/**
* Load the GPU configuration of this node (i.e. which GPUs to use) and the
* number of GPUs on the other nodes.
*/
void loadDeviceConfig(std::vector<size_t> deviceConfig) {
size_t index = 0, node = 0, nClientsSeen = 0;
numberClientsOfNodes_ = std::vector<int>(mpi_comm_world_size_, 0);
while(index < deviceConfig.size()) {
if(numberClientsOfNodes_[node] == 0) {
numberClientsOfNodes_[node] = deviceConfig[index];
nClientsSeen = 0;
} else if(nClientsSeen < numberClientsOfNodes_[node]) {
if(node == mpi_my_rank_) {
devices_.push_back(deviceConfig[index]);
}
nClientsSeen++;
} else {
node++;
index--;
}
index++;
}
}
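// For illustration (hypothetical numbers, not from the commit): the flat
// config {2, 0, 1, 1, 3} encodes "node 0 has 2 clients on GPUs 0 and 1;
// node 1 has 1 client on GPU 3". Each entry is either a per-node client
// count or, while nClientsSeen is below that count, a GPU id; only the ids
// belonging to mpi_my_rank_ end up in devices_:
//
//   loadDeviceConfig({2, 0, 1, 1, 3});
//   // rank 0 -> devices_ == {0, 1};  rank 1 -> devices_ == {3}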
public:
/**
* (Constructor) Call super class and initialize client graphs and builders.
*/
MultiNodeGraphGroupSync(Ptr<Config> options)
: GraphGroup(options),
tau_{options_->get<size_t>("optimizer-delay")},
movingAvg_{options_->get<float>("exponential-smoothing") > 0},
mvDecay_{options_->get<float>("exponential-smoothing")},
syncOptimizer_{Optimizer(options_)} {
// Set up devices for this node
setupMPI(); // Setup MPI before creating device vectors
std::vector<size_t> devices;
for(auto& d : options_->getDevices())
devices.push_back(d.no);
loadDeviceConfig(devices);

// Create builders and graphs for clients.
for(size_t i = 0; i < devices_.size(); i++) {
clientGraphs_.push_back(New<ExpressionGraph>());
clientGraphs_[i]->setDevice({devices_[i], DeviceType::gpu});
clientGraphs_[i]->reserveWorkspaceMB(options_->get<size_t>("workspace"));
clientBuilders_.push_back(
models::from_config(options_, models::usage::training));
}
}

/**
* Update any client model with given batch if batch is assigned to this node.
*/
void update(Ptr<data::Batch> batch) {
ABORT_IF(finalized_, "Training has already finished.");
if(batchIter_ % mpi_comm_world_size_
== mpi_my_rank_) { // Only take batch assigned to this node
execute(batch);
}
batchIter_++;
}

/**
* Load models from disk if file exists and setting is not disabled
*/
void load() {
if(!options_->get<bool>("no-reload")) {
std::string name = options_->get<std::string>("model");

if(boost::filesystem::exists(name)) {
if(scheduler_)
scheduler_->load(name);
size_t i = 0;
for(auto graph : clientGraphs_)
clientBuilders_[i++]->load(graph, name);
} else if(options_->has("pretrained-model")) {
std::string init = options_->get<std::string>("pretrained-model");
LOG(info,
"Initialize model weights with the pre-trained model {}",
init);
size_t i = 0;
for(auto graph : clientGraphs_)
clientBuilders_[i++]->load(graph, init, false);
}
}
}

/**
* Save model of first client's graph to disk
*/
void save(bool final = false) { save(clientGraphs_[0], final); }

/**
* Save model of given graph to disk.
*/
void save(Ptr<ExpressionGraph> graph, bool final = false) {
int idx = 0;
for(int i = 0; i < clientGraphs_.size(); ++i) {
if(graph == clientGraphs_[i]) {
idx = i;
break;
}
}

if(options_->get<bool>("overwrite")) {
std::string name = options_->get<std::string>("model");

clientBuilders_[idx]->save(clientGraphs_[idx], name, true);
if(scheduler_)
scheduler_->save(name);
} else {
std::string name = options_->get<std::string>("model");

if(!final) {
std::string numberOfBatches
= scheduler_ ? std::to_string(scheduler_->numberOfBatches())
: "unknown";
std::string nameOverwrite = name;
nameOverwrite.replace(
name.size() - 4, 4, ".iter" + numberOfBatches + ".npz");
clientBuilders_[idx]->save(clientGraphs_[idx], nameOverwrite);
}

clientBuilders_[idx]->save(clientGraphs_[idx], name, true);
if(scheduler_)
scheduler_->save(name);
}
}

/**
* Collect statistics from first client's graph.
*/
Ptr<data::BatchStats> collectStats() {
return GraphGroup::collectStats(
clientGraphs_[0], clientBuilders_[0], devices_.size());
}

virtual void finalize() {
finalized_ = true;
#if MPI_FOUND
MPI_Finalize();
#endif
}
};
}
@@ -4,6 +4,27 @@

namespace marian {

SyncGraphGroup::SyncGraphGroup(Ptr<Config> config)
: GraphGroup(config),
devices_{options_->getDevices()},
movingAvg_{options_->get<float>("exponential-smoothing") > 0},
mvDecay_{options_->get<float>("exponential-smoothing")},
delay_{options_->get<size_t>("optimizer-delay")} {

for(auto device : devices_) {
auto graph = New<ExpressionGraph>();
graph->setDevice(device);
graph->reserveWorkspaceMB(options_->get<size_t>("workspace"));
graph->getBackend()->setClip(options_->get<float>("clip-gemm"));

graphs_.push_back(graph);
shardOpt_.push_back(Optimizer(options_));
builders_.push_back(models::from_config(options_, models::usage::training));
}

comm_ = createCommunicator(graphs_, options_->get<bool>("no-nccl", false));
}

void SyncGraphGroup::setScheduler(Ptr<Scheduler> scheduler) {
scheduler_ = scheduler;
// optimizer has to be registered last to see changes of learning rate
@@ -22,169 +43,133 @@ void SyncGraphGroup::updateMovingAverage(Tensor paramsAvg,
Element(_1 = ((1.f - decay) * _1) + (decay * _2), paramsAvg, params);
}

void SyncGraphGroup::fetchParams(Tensor oldParams,
const std::vector<Tensor>& params) {
// @TODO read guard on parameters
int pos = 0;
std::vector<std::thread> threads;
for(int idx = 0; idx < devices_.size(); idx++) {
threads.emplace_back(std::thread(
[=](int idx, int pos) {
oldParams->subtensor(pos, params[idx]->size())->copyFrom(params[idx]);
},
idx,
pos));
pos += shardSize_;
void SyncGraphGroup::initialize(const std::vector<Ptr<data::Batch>>& batches) {
// Initialize 0th graph with random weights in one forward step
{
THREAD_GUARD(builders_[0]->build(graphs_[0], batches[0]);
graphs_[0]->forward(););

// Copy weights from 0th graph to all other graphs
// to have equal weights across devices
ThreadPool pool(graphs_.size() - 1, graphs_.size() - 1);
for(size_t i = 1; i < graphs_.size(); ++i) {
auto init = [&](size_t i) {
// initialize i-th graph and weights
builders_[i]->build(graphs_[i], batches[0]);
graphs_[i]->forward();
// overwrite weights of i-th graph with weights from 0th graph
graphs_[i]->params()->vals()->copyFrom(graphs_[0]->params()->vals());
};
pool.enqueue(init, i);
}
}
for(auto&& t : threads) {
t.join();

if(movingAvg_ && paramsAvg_.size() == 0) {
int totalSize = graphs_[0]->params()->vals()->size();
shardSize_ = ceil(totalSize / (float)devices_.size());

int pos = 0;
for(auto graph : graphs_) {
int __size__ = std::min(shardSize_, totalSize);

auto paramsAlloc = New<TensorAllocator>(graph->getBackend());
paramsAllocs_.push_back(paramsAlloc);

paramsAlloc->reserveExact(__size__ * sizeof(float));

Tensor paramAvg;
paramsAlloc->allocate(paramAvg, {1, __size__});
paramsAvg_.push_back(paramAvg);

paramAvg->copyFrom(graphs_[0]->params()->vals()->subtensor(pos, __size__));

// move to next shard
pos += __size__;
totalSize -= __size__;
}
}
}

void SyncGraphGroup::execute(Ptr<data::Batch> fullBatch) {
std::vector<Ptr<data::Batch>> delayedBatches =
delay_ > 1 ?
fullBatch->split(delay_) :
std::vector<Ptr<data::Batch>>({ fullBatch });
void SyncGraphGroup::execute(Ptr<data::Batch> batch) {
size_t devs = devices_.size();
auto batches = batch->split(delay_ * devs);

float div = batches.size(); // no. of batches
// do not average gradients if cost type is sum.
if(options_->get<std::string>("cost-type") == "ce-sum")
div = 1;

std::vector<std::vector<Ptr<data::Batch>>> delayedBatches;

for(int i = 0; i < delay_; ++i) {
if(i * devs < batches.size()) {
delayedBatches.emplace_back();
for(int j = 0; j < devs; ++j) {
size_t index = i * devs + j;
if(index < batches.size())
delayedBatches.back().push_back(batches[i * devs + j]);
else
delayedBatches.back().push_back(nullptr);
}
}
}
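A self-contained sketch of the grouping above (hypothetical sizes; ints stand in for Ptr<data::Batch> and -1 for nullptr): with delay_ = 2 and 4 devices, up to 8 sub-batches are arranged into 2 groups of 4, with padding when the split produced fewer pieces.

#include <cstddef>
#include <vector>

std::vector<std::vector<int>> groupByDelay(const std::vector<int>& batches,
                                           size_t delay, size_t devs) {
  std::vector<std::vector<int>> grouped;
  for (size_t i = 0; i < delay; ++i) {
    if (i * devs < batches.size()) {
      grouped.emplace_back();
      for (size_t j = 0; j < devs; ++j) {
        size_t index = i * devs + j;
        // pad with -1 (the stand-in for nullptr) when the split came up short
        grouped.back().push_back(index < batches.size() ? batches[index] : -1);
      }
    }
  }
  return grouped;  // e.g. groupByDelay({0,1,2,3,4,5}, 2, 4) -> {{0,1,2,3}, {4,5,-1,-1}}
}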
std::vector<float> costs(devices_.size(), 0.f);

size_t t = 1;
for(auto batch : delayedBatches) {
std::vector<Ptr<data::Batch>> batches = batch->split(devices_.size());

for(const auto& curBatches : delayedBatches) {
if(first_) {
{
THREAD_GUARD(builders_[0]->build(graphs_[0], batches[0]);
graphs_[0]->forward(););

ThreadPool pool(graphs_.size() - 1, graphs_.size() - 1);
for(size_t i = 1; i < graphs_.size(); ++i) {
auto init = [&](size_t i) {
builders_[i]->build(graphs_[i], batches[0]);
graphs_[i]->forward();
graphs_[i]->params()->vals()->copyFrom(graphs_[0]->params()->vals());
};
pool.enqueue(init, i);
}
}

if(params_.size() == 0) {
int totalSize = graphs_[0]->params()->vals()->size();
shardSize_ = ceil(totalSize / (float)devices_.size());

int pos = 0;
for(auto graph : graphs_) {
int __size__ = std::min(shardSize_, totalSize);

auto paramsAlloc = New<TensorAllocator>(graph->getBackend());
paramsAllocs_.push_back(paramsAlloc);

paramsAlloc->reserveExact(3 * __size__ * sizeof(float));

Tensor param, grad, tmp;
paramsAlloc->allocate(param, {1, __size__});
paramsAlloc->allocate(grad, {1, __size__});
paramsAlloc->allocate(tmp, {1, __size__});
params_.push_back(param);

grad->set(0.f);
grads_.push_back(grad);

tmpTensors_.push_back(tmp);

param->copyFrom(graphs_[0]->params()->vals()->subtensor(pos, __size__));
pos += __size__;
totalSize -= __size__;
}
}

if(movingAvg_ && paramsAvg_.size() == 0) {
int totalSize = graphs_[0]->params()->vals()->size();

int i = 0;
for(auto graph : graphs_) {
int __size__ = std::min(shardSize_, totalSize);
totalSize -= __size__;
Tensor paramAvg;
auto allocator = New<TensorAllocator>(graph->getBackend());

allocator->reserveExact(__size__ * sizeof(float));
allocator->allocate(paramAvg, {1, __size__});

paramAvg->copyFrom(params_[i++]);

paramsAllocAvg_.push_back(allocator);
paramsAvg_.push_back(paramAvg);
}
}

initialize(curBatches);
first_ = false;
}

{
auto task = [this, &costs, batches](size_t idx) {
auto graph = graphs_[idx];
auto batch = batches[idx];
// Execute single forward/backward step
auto forwardBackward = [this, &costs, curBatches, t](size_t idx, int pos) {
auto graph = graphs_[idx];
auto batch = curBatches[idx];

if(batch->size() > 0) {
auto costNode = builders_[idx]->build(graph, batch);
graph->forward();
costs[idx] += costNode->scalar();
graph->backward();
}
};
if(batch) {
auto costNode = builders_[idx]->build(graph, batch);
graph->forward();
costs[idx] += costNode->scalar();

ThreadPool pool(devices_.size(), devices_.size());
for(int idx = 0; idx < batches.size(); ++idx)
pool.enqueue(task, idx);
}

{
auto task = [this, batches](size_t idx, int pos, bool update) {
int size = params_[idx]->size();
int i = 0;

float div = devices_.size(); // no. of GPUs

// do not average gradients if cost type is sum.
if(options_->get<std::string>("cost-type") == "ce-sum") {
div = 1;
}

for(auto graph : graphs_) {
if(batches[i]->size() > 0) {
auto subGrad = graph->params()->grads()->subtensor(pos, size);
tmpTensors_[idx]->copyFrom(subGrad);

using namespace functional;
Element(_1 = _1 + (_2 / div), grads_[idx], tmpTensors_[idx]);
}
i++;
}

if(update) {
shardOpt_[idx]->update(params_[idx], grads_[idx]);
grads_[idx]->set(0.f);

if(movingAvg_)
updateMovingAverage(
paramsAvg_[idx], params_[idx], scheduler_->numberOfBatches());

for(auto graph : graphs_) {
auto subParam = graph->params()->vals()->subtensor(pos, size);
subParam->copyFrom(params_[idx]);
}
}

};

ThreadPool pool(devices_.size(), devices_.size());
int pos = 0;
for(int idx = 0; idx < devices_.size(); ++idx) {
pool.enqueue(task, idx, pos, t == delay_);
pos += params_[idx]->size();
// only reset gradients to 0 if t == 1
graph->backward(t == 1);
}
else {
// handle case of empty batch, execute do-nothing fw-bw step for
// proper inits and resets.
graph->forward();
graph->backward(t == 1);
}
};

// Update parameter shard with gradient shard
auto update = [this, div](size_t idx, int pos) {
int totalSize = graphs_[0]->params()->vals()->size();
int shardSize = ceil(totalSize / (float)devices_.size());

int size = std::min(totalSize - pos, shardSize);

auto curGrad = graphs_[idx]->params()->grads()->subtensor(pos, size);
auto curParam = graphs_[idx]->params()->vals()->subtensor(pos, size);

if(div != 1) {
using namespace functional;
Element(_1 = _1 / div, curGrad);
}

shardOpt_[idx]->update(curParam, curGrad);

if(movingAvg_)
updateMovingAverage(paramsAvg_[idx], curParam, scheduler_->numberOfBatches());
};

comm_->foreach(forwardBackward);
if(t == delayedBatches.size()) {
comm_->scatterReduce();
comm_->foreach(update);
comm_->allGather();
}

t++;
@@ -202,24 +187,108 @@ void SyncGraphGroup::execute(Ptr<data::Batch> fullBatch) {
}

if(scheduler_) {
scheduler_->update(cost, fullBatch);
scheduler_->update(cost, batches);

if(scheduler_->saving()) {
this->save();
}

if(scheduler_->validating()) {
if(movingAvg_)
for(auto graph : graphs_)
fetchParams(graph->params()->vals(), paramsAvg_);
if(movingAvg_) {
comm_->swapParams(paramsAvg_);
}

// safe, because all graphs are idle during validation with sync sgd
scheduler_->validate(graphs_);

if(movingAvg_)
for(auto graph : graphs_)
fetchParams(graph->params()->vals(), params_);
if(movingAvg_) {
comm_->swapParams(paramsAvg_);
}
}
}
}

void SyncGraphGroup::load() {
if(!options_->get<bool>("no-reload")) {
std::string name = options_->get<std::string>("model");

if(boost::filesystem::exists(name)) {
size_t i = 0;
if(scheduler_)
scheduler_->load(name);
for(auto graph : graphs_)
builders_[i++]->load(graph, name);

// @TODO: probably we want to have the list of DeviceIds as an attribute
std::vector<Ptr<Backend>> backends;
for(auto graph : graphs_)
backends.push_back(graph->getBackend());
shardOpt_[0]->load(name + ".optimizer.npz", shardOpt_, backends);

} else if(options_->has("pretrained-model")) {
std::string init = options_->get<std::string>("pretrained-model");
LOG(info,
"Initialize model weights with the pre-trained model {}",
init);
size_t i = 0;
for(auto graph : graphs_)
builders_[i++]->load(graph, init, false);
}
}
}

void SyncGraphGroup::save(bool final) {
if(final && scheduler_) {
if(movingAvg_ && paramsAvg_.size() > 0)
comm_->swapParams(paramsAvg_);

scheduler_->validate(graphs_, true);

if(movingAvg_ && paramsAvg_.size() > 0)
comm_->swapParams(paramsAvg_);
}
save(graphs_[0], final);
}

void SyncGraphGroup::save(Ptr<ExpressionGraph> graph, bool final) {
int idx = 0;
for(int i = 0; i < graphs_.size(); ++i) {
if(graph == graphs_[i]) {
idx = i;
break;
}
}

if(movingAvg_ && paramsAvg_.size() > 0)
comm_->swapParams(paramsAvg_);

std::string name = options_->get<std::string>("model");

if(options_->get<bool>("overwrite")) {
builders_[idx]->save(graphs_[idx], name, true);
if(scheduler_)
scheduler_->save(name);
} else {
if(!final) {
std::string numberOfBatches
= scheduler_ ? std::to_string(scheduler_->numberOfBatches())
: "unknown";
std::string nameOverwrite = name;
nameOverwrite.replace(
name.size() - 4, 4, ".iter" + numberOfBatches + ".npz");
builders_[idx]->save(graphs_[idx], nameOverwrite);
}

builders_[idx]->save(graphs_[idx], name, true);
if(scheduler_)
scheduler_->save(name);
}

if(movingAvg_ && paramsAvg_.size() > 0)
comm_->swapParams(paramsAvg_);

size_t totalSize = graphs_[idx]->params()->vals()->size();
shardOpt_[idx]->save(name + ".optimizer.npz", shardOpt_, totalSize);
}

}
@@ -4,6 +4,7 @@

#include "3rd_party/threadpool.h"
#include "training/graph_group.h"
#include "training/communicator.h"

namespace marian {

@@ -12,26 +13,27 @@ public:
virtual void setScheduler(Ptr<Scheduler> scheduler);

private:
Ptr<Communicator> comm_;

std::vector<Ptr<models::ModelBase>> builders_;
std::vector<Ptr<ExpressionGraph>> graphs_;
std::vector<DeviceId> devices_;

std::vector<Tensor> params_;
std::vector<Tensor> grads_;
std::vector<Tensor> tmpTensors_;
std::vector<Ptr<TensorAllocator>> paramsAllocs_;

std::vector<Ptr<OptimizerBase>> shardOpt_;

int shardSize_;
bool first_{true};

std::vector<Tensor> paramsAvg_;
std::vector<Ptr<TensorAllocator>> paramsAllocAvg_;
std::vector<Ptr<TensorAllocator>> paramsAllocs_;

bool movingAvg_{false};
float mvDecay_{1e-4f};
size_t delay_{1};

void initialize(const std::vector<Ptr<data::Batch>>& batches);

void updateMovingAverage(Tensor paramsAvg, Tensor params, size_t batches);

void fetchParams(Tensor oldParams, const std::vector<Tensor>& params);
@@ -39,117 +41,23 @@ private:
void execute(Ptr<data::Batch> batch);

public:
SyncGraphGroup(Ptr<Config> config)
: GraphGroup(config),
devices_{options_->getDevices()},
movingAvg_{options_->get<float>("exponential-smoothing") > 0},
mvDecay_{options_->get<float>("exponential-smoothing")},
delay_{options_->get<size_t>("optimizer-delay")} {
for(auto device : devices_) {
auto graph = New<ExpressionGraph>();
graph->setDevice(device);
graph->reserveWorkspaceMB(options_->get<size_t>("workspace"));
graph->getBackend()->setClip(options_->get<float>("clip-gemm"));

graphs_.push_back(graph);
shardOpt_.push_back(Optimizer(options_));
builders_.push_back(models::from_config(options_, models::usage::training));
}
}
SyncGraphGroup(Ptr<Config> config);

void update(Ptr<data::Batch> batch) {
ABORT_IF(finalized_, "Training has already finished.");
execute(batch);
}

void load() {
if(!options_->get<bool>("no-reload")) {
std::string name = options_->get<std::string>("model");

if(boost::filesystem::exists(name)) {
size_t i = 0;
if(scheduler_)
scheduler_->load(name);
for(auto graph : graphs_)
builders_[i++]->load(graph, name);

// @TODO: probably we want to have the list of DeviceIds as an attribute
std::vector<Ptr<Backend>> backends;
for(auto graph : graphs_)
backends.push_back(graph->getBackend());
shardOpt_[0]->load(name + ".optimizer.npz", shardOpt_, backends);

} else if(options_->has("pretrained-model")) {
std::string init = options_->get<std::string>("pretrained-model");
LOG(info,
"Initialize model weights with the pre-trained model {}",
init);
size_t i = 0;
for(auto graph : graphs_)
builders_[i++]->load(graph, init, false);
}
}
}

void save(bool final = false) {
if(final && scheduler_) {
if(movingAvg_ && paramsAvg_.size() > 0)
for(auto graph : graphs_)
fetchParams(graph->params()->vals(), paramsAvg_);

scheduler_->validate(graphs_, true);

if(movingAvg_ && paramsAvg_.size() > 0)
for(auto graph : graphs_)
fetchParams(graph->params()->vals(), params_);
}

save(graphs_[0], final);
}

void save(Ptr<ExpressionGraph> graph, bool final = false) {
int idx = 0;
for(int i = 0; i < graphs_.size(); ++i) {
if(graph == graphs_[i]) {
idx = i;
break;
}
}

if(movingAvg_ && paramsAvg_.size() > 0)
fetchParams(graphs_[idx]->params()->vals(), paramsAvg_);

std::string name = options_->get<std::string>("model");

if(options_->get<bool>("overwrite")) {
builders_[idx]->save(graphs_[idx], name, true);
if(scheduler_)
scheduler_->save(name);
} else {
if(!final) {
std::string numberOfBatches
= scheduler_ ? std::to_string(scheduler_->numberOfBatches())
: "unknown";
std::string nameOverwrite = name;
nameOverwrite.replace(
name.size() - 4, 4, ".iter" + numberOfBatches + ".npz");
builders_[idx]->save(graphs_[idx], nameOverwrite);
}

builders_[idx]->save(graphs_[idx], name, true);
if(scheduler_)
scheduler_->save(name);
}

if(movingAvg_ && paramsAvg_.size() > 0)
fetchParams(graphs_[idx]->params()->vals(), params_);

size_t totalSize = graphs_[idx]->params()->vals()->size();
shardOpt_[idx]->save(name + ".optimizer.npz", shardOpt_, totalSize);
}
void load();
void save(bool final = false);
void save(Ptr<ExpressionGraph> graph, bool final = false);

Ptr<data::BatchStats> collectStats() {
return GraphGroup::collectStats(graphs_[0], builders_[0], devices_.size() * delay_);
return GraphGroup::collectStats(graphs_[0], builders_[0], numBatches());
}

size_t numBatches() {
return devices_.size() * delay_;
}

virtual void finalize() {
@@ -154,10 +154,20 @@ public:
}

void update(float cost, Ptr<data::Batch> batch) {
update(cost, std::vector<Ptr<data::Batch>>({batch}));
}

void update(float cost, const std::vector<Ptr<data::Batch>>& batches) {
state_->validated = false;

auto batchSize = batch->size(); // number of sentences in batch
auto batchLabels = batch->words(-1); // number of target words in batch
auto batchSize = 0; // number of sentences in batch
auto batchLabels = 0; // number of target words in batch

for(const auto& batch : batches) {
batchSize += batch->size();
batchLabels += batch->words(-1);
}

// reconstruct sum cost, for displaying epoch-level averages instead of minibatch-level
auto costType = options_->get<std::string>("cost-type");
auto dispLabelCounts = options_->get<bool>("disp-label-counts"); // if true then show as "cost per label * number of labels"
@@ -178,6 +188,7 @@ public:
state_->wordsDisp += batchLabels; // target words processed since last display, for speed display
state_->samplesEpoch += batchSize; // sentences processed in this epoch
state_->labelsTotal += batchLabels; // total labels processed

state_->newBatch();

if(state_->batches % options_->get<size_t>("disp-freq") == 0) {
@@ -14,7 +14,7 @@
#include "translator/beam_search.h"
#include "translator/history.h"
#include "translator/output_collector.h"
#include "translator/printer.h"
#include "translator/output_printer.h"
#include "translator/scorers.h"

namespace marian {
@@ -294,9 +294,11 @@ public:

boost::timer::cpu_timer timer;
{
auto printer = New<OutputPrinter>(options_, vocabs_.back());
auto collector = options_->has("valid-translation-output")
? New<OutputCollector>(fileName)
: New<OutputCollector>(*tempFile);

if(quiet_)
collector->setPrintingStrategy(New<QuietPrinting>());
else
@@ -329,7 +331,7 @@ public:
for(auto history : histories) {
std::stringstream best1;
std::stringstream bestn;
Printer(options_, vocabs_.back(), history, best1, bestn);
printer->print(history, best1, bestn);
collector->Write(history->GetLineNum(),
best1.str(),
bestn.str(),
@ -36,16 +36,22 @@ public:
const Beams& beams,
std::vector<Ptr<ScorerState>>& states,
size_t beamSize,
bool first) {
bool first,
Ptr<data::CorpusBatch> batch) {
Beams newBeams(beams.size());
for(int i = 0; i < keys.size(); ++i) {

// keys is contains indices to vocab items in the entire beam.
// values can be between 0 and beamSize * vocabSize.
std::vector<float> alignments;
if(options_->get<float>("alignment", 0.f))
// Use alignments from the first scorer, even if ensemble
alignments = scorers_[0]->getAlignment();

for(int i = 0; i < keys.size(); ++i) {
// Keys contains indices to vocab items in the entire beam.
// Values can be between 0 and beamSize * vocabSize.
int embIdx = keys[i] % vocabSize;
int beamIdx = i / beamSize;

// retrieve short list for final softmax (based on words aligned
// Retrieve short list for final softmax (based on words aligned
// to source sentences). If short list has been set, map the indices
// in the sub-selected vocabulary matrix back to their original positions.
auto shortlist = scorers_[0]->getShortlist();
@ -72,6 +78,8 @@ public:
beamHypIdx = 0;

auto hyp = New<Hypothesis>(beam[beamHypIdx], embIdx, hypIdxTrans, cost);

// Set cost breakdown for n-best lists
if(options_->get<bool>("n-best")) {
std::vector<float> breakDown(states.size(), 0);
beam[beamHypIdx]->GetCostBreakdown().resize(states.size(), 0);
@ -82,12 +90,57 @@ public:
}
hyp->GetCostBreakdown() = breakDown;
}

// Set alignments
if(!alignments.empty()) {
auto align = getHardAlignmentsForHypothesis(
alignments, batch, beamSize, beamHypIdx, beamIdx);
hyp->SetAlignment(align);
}

newBeam.push_back(hyp);
}
}
return newBeams;
}

std::vector<float> getHardAlignmentsForHypothesis(
const std::vector<float> alignments,
Ptr<data::CorpusBatch> batch,
int beamSize,
int beamHypIdx,
int beamIdx) {
// Let B be the beam size, N be the number of batched sentences,
// and L the number of words in the longest sentence in the batch.
// The alignment vector:
//
// if(first)
// * has length of N x L if it's the first beam
// * stores elements in the following order:
// beam1 = [word1-batch1, word1-batch2, ..., word2-batch1, ...]
// else
// * has length of N x L x B
// * stores elements in the following order:
// beams = [beam1, beam2, ..., beam_n]
//
// The mask vector is always of length N x L and has 1/0s stored like
// in a single beam, i.e.:
// * [word1-batch1, word1-batch2, ..., word2-batch1, ...]
//
size_t batchSize = batch->size();
size_t batchWidth = batch->width() * batchSize;
std::vector<float> align;

for(size_t w = 0; w < batchWidth / batchSize; ++w) {
size_t a = ((batchWidth * beamHypIdx) + beamIdx) + (batchSize * w);
size_t m = a % batchWidth;
if(batch->front()->mask()[m] != 0)
align.emplace_back(alignments[a]);
}

return align;
}

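To make the indexing in getHardAlignmentsForHypothesis above concrete: with N sentences per batch, source width L, and the layout described in the comment, the attention weight for source word w of sentence beamIdx under beam hypothesis beamHypIdx sits at flat offset (N*L)*beamHypIdx + beamIdx + N*w. A standalone sketch of that arithmetic (illustrative names, not part of the class):

#include <cstddef>

// Flat index into the alignment vector described above: beams are stored
// one after another, each beam word-major as
// [word1-batch1, word1-batch2, ..., word2-batch1, ...].
size_t alignIdx(size_t batchSize, size_t srcWords,
                size_t beamHypIdx, size_t beamIdx, size_t w) {
  size_t batchWidth = srcWords * batchSize;  // N x L entries per beam
  return batchWidth * beamHypIdx + beamIdx + batchSize * w;
}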
Beams pruneBeam(const Beams& beams) {
Beams newBeams;
for(auto beam : beams) {
@ -108,7 +161,9 @@ public:
Histories histories;
for(int i = 0; i < dimBatch; ++i) {
size_t sentId = batch->getSentenceIds()[i];
auto history = New<History>(sentId, options_->get<float>("normalize"), options_->get<float>("word-penalty"));
auto history = New<History>(sentId,
options_->get<float>("normalize"),
options_->get<float>("word-penalty"));
histories.push_back(history);
}

@ -183,8 +238,12 @@ public:
// BUGBUG: it's not cost but score (higher=better)

for(int i = 0; i < scorers_.size(); ++i) {
states[i] = scorers_[i]->step(
graph, states[i], hypIndices, embIndices, dimBatch, localBeamSize);
states[i] = scorers_[i]->step(graph,
states[i],
hypIndices,
embIndices,
dimBatch,
localBeamSize);

if(scorers_[i]->getWeight() != 1.f)
totalCosts
@ -219,14 +278,22 @@ public:
nth->getNBestList(beamSizes, totalCosts->val(), outCosts, outKeys, first);

int dimTrgVoc = totalCosts->shape()[-1];
beams = toHyps(
outKeys, outCosts, dimTrgVoc, beams, states, localBeamSize, first);
beams = toHyps(outKeys,
outCosts,
dimTrgVoc,
beams,
states,
localBeamSize,
first,
batch);

auto prunedBeams = pruneBeam(beams);
for(int i = 0; i < dimBatch; ++i) {
if(!beams[i].empty()) {
final = final
|| histories[i]->size() >= options_->get<float>("max-length-factor") * batch->front()->batchWidth();
|| histories[i]->size()
>= options_->get<float>("max-length-factor")
* batch->front()->batchWidth();
histories[i]->Add(beams[i], trgEosId_, prunedBeams[i].empty() || final);
}
}
@ -24,6 +24,9 @@ public:
float GetCost() const { return cost_; }

std::vector<float>& GetCostBreakdown() { return costBreakdown_; }
std::vector<float>& GetAlignment() { return alignment_; }

void SetAlignment(const std::vector<float>& align) { alignment_ = align; };

private:
const Ptr<Hypothesis> prevHyp_;
@ -32,6 +35,7 @@ private:
const float cost_;

std::vector<float> costBreakdown_;
std::vector<float> alignment_;
};

typedef std::vector<Ptr<Hypothesis>> Beam;
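With the new alignment_ member, each Hypothesis carries one attention row (one value per source word) for the single target word it emits; a full translation's soft alignment is recovered by walking the prevHyp_ back-pointers from the final hypothesis. A minimal sketch of that traversal, mirroring what OutputPrinter::getAlignment in the new file below does and assuming Marian's Ptr and Hypothesis types:

#include <vector>

// Collect the per-target-word soft alignments of a finished hypothesis by
// following its back-pointers; the EOS step is skipped and entries come
// out newest target word first.
std::vector<std::vector<float>> collectAlignments(const Ptr<Hypothesis>& hyp) {
  std::vector<std::vector<float>> alignSoft;
  auto last = hyp->GetPrevHyp();  // skip EOS
  while(last->GetPrevHyp().get() != nullptr) {
    alignSoft.push_back(last->GetAlignment());
    last = last->GetPrevHyp();
  }
  return alignSoft;
}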
64
src/translator/output_printer.cpp
Normal file
@ -0,0 +1,64 @@
#include "output_printer.h"

namespace marian {

std::vector<HardAlignment> OutputPrinter::getAlignment(
const Ptr<Hypothesis>& hyp,
float threshold) {
std::vector<SoftAlignment> alignSoft;
// Skip EOS
auto last = hyp->GetPrevHyp();
// Get soft alignments for each target word
while(last->GetPrevHyp().get() != nullptr) {
alignSoft.push_back(last->GetAlignment());
last = last->GetPrevHyp();
}

std::vector<HardAlignment> align;
// Alignments by maximum value
if(threshold == 1.f) {
for(size_t t = 0; t < alignSoft.size(); ++t) {
// Retrieved alignments are in reversed order
size_t rev = alignSoft.size() - t - 1;
size_t maxArg = 0;
for(size_t s = 0; s < alignSoft[0].size(); ++s) {
if(alignSoft[rev][maxArg] < alignSoft[rev][s]) {
maxArg = s;
}
}
align.push_back(std::make_pair(maxArg, t));
}
} else {
// Alignments by greater-than-threshold
for(size_t t = 0; t < alignSoft.size(); ++t) {
// Retrieved alignments are in reversed order
size_t rev = alignSoft.size() - t - 1;
for(size_t s = 0; s < alignSoft[0].size(); ++s) {
if(alignSoft[rev][s] > threshold) {
align.push_back(std::make_pair(s, t));
}
}
}
}

// Sort alignment pairs in ascending order
std::sort(align.begin(),
align.end(),
[](const HardAlignment& a, const HardAlignment& b) {
return (a.first == b.first) ? a.second < b.second
: a.first < b.first;
});

return align;
}

std::string OutputPrinter::getAlignmentString(
const std::vector<HardAlignment>& align) {
std::stringstream alignStr;
alignStr << " |||";
for(auto p = align.begin(); p != align.end(); ++p) {
alignStr << " " << p->first << "-" << p->second;
}
return alignStr.str();
}
}
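The threshold argument above selects between two extraction strategies: exactly 1.0 keeps only the argmax source position per target word, while a value in (0, 1) keeps every source position whose attention exceeds it. A self-contained toy example with made-up attention rows (not Marian code):

#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
  // One attention row per target word, one column per source word,
  // already in forward target order.
  std::vector<std::vector<float>> soft = {{0.7f, 0.2f, 0.1f},
                                          {0.1f, 0.3f, 0.6f}};
  float threshold = 0.25f;  // with 1.0 the argmax branch would emit " 0-0 2-1"
  for(size_t t = 0; t < soft.size(); ++t)
    for(size_t s = 0; s < soft[t].size(); ++s)
      if(soft[t][s] > threshold)
        std::printf(" %zu-%zu", s, t);  // prints " 0-0 1-1 2-1"
  std::printf("\n");
}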
86
src/translator/output_printer.h
Normal file
@ -0,0 +1,86 @@
#pragma once

#include <vector>

#include "common/config.h"
#include "common/utils.h"
#include "data/vocab.h"
#include "translator/history.h"
#include "translator/hypothesis.h"

namespace marian {

typedef std::vector<float> SoftAlignment;
typedef std::pair<size_t, size_t> HardAlignment;

class OutputPrinter {
public:
OutputPrinter(Ptr<Config> options, Ptr<Vocab> vocab)
: vocab_(vocab),
reverse_(options->get<bool>("right-left")),
nbest_(options->get<bool>("n-best", false)
? options->get<size_t>("beam-size")
: 0),
alignment_(options->get<float>("alignment", 0.f)) {}

template <class OStream>
void print(Ptr<History> history, OStream& best1, OStream& bestn) {
const auto& nbl = history->NBest(nbest_);

for(size_t i = 0; i < nbl.size(); ++i) {
const auto& result = nbl[i];
const auto& words = std::get<0>(result);
const auto& hypo = std::get<1>(result);

std::string translation = Join((*vocab_)(words), " ", reverse_);
bestn << history->GetLineNum() << " ||| " << translation;

if(alignment_ > 0.f) {
auto align = getAlignment(hypo, alignment_);
bestn << getAlignmentString(align);
}

bestn << " |||";

if(hypo->GetCostBreakdown().empty()) {
bestn << " F0=" << hypo->GetCost();
} else {
for(size_t j = 0; j < hypo->GetCostBreakdown().size(); ++j) {
bestn << " F" << j << "= " << hypo->GetCostBreakdown()[j];
}
}

float realCost = std::get<2>(result);
bestn << " ||| " << realCost;

if(i < nbl.size() - 1)
bestn << std::endl;
else
bestn << std::flush;
}

auto result = history->Top();
const auto& words = std::get<0>(result);

std::string translation = Join((*vocab_)(words), " ", reverse_);

best1 << translation;
if(alignment_ > 0.f) {
const auto& hypo = std::get<1>(result);
auto align = getAlignment(hypo, alignment_);
best1 << getAlignmentString(align);
}
best1 << std::flush;
}

private:
Ptr<Vocab> vocab_;
bool reverse_{false};
size_t nbest_{0};
float alignment_{0.f};

std::vector<HardAlignment> getAlignment(const Ptr<Hypothesis>& hyp,
float threshold);
std::string getAlignmentString(const std::vector<HardAlignment>& align);
};
}
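For reference, a single n-best entry written to bestn by the print loop above has this Moses-style shape; the line number, tokens, alignment pairs, and scores below are invented for illustration:

42 ||| ein kleines Haus ||| 0-0 1-1 2-2 ||| F0=-2.41 ||| -2.41

The alignment field only appears when the alignment option is set to a non-zero threshold; best1 receives just the translation, plus the alignment field if enabled.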
@ -1,35 +0,0 @@
#include "printer.h"

namespace marian {

std::vector<size_t> GetAlignment(const HypothesisPtr& hypothesis) {
std::vector<SoftAlignment> aligns;
HypothesisPtr last = hypothesis->GetPrevHyp();
while(last->GetPrevHyp().get() != nullptr) {
aligns.push_back(*(last->GetAlignment(0)));
last = last->GetPrevHyp();
}

std::vector<size_t> alignment;
for(auto it = aligns.rbegin(); it != aligns.rend(); ++it) {
size_t maxArg = 0;
for(size_t i = 0; i < it->size(); ++i) {
if((*it)[maxArg] < (*it)[i]) {
maxArg = i;
}
}
alignment.push_back(maxArg);
}

return alignment;
}

std::string GetAlignmentString(const std::vector<size_t>& alignment) {
std::stringstream alignString;
alignString << " |||";
for(size_t wordIdx = 0; wordIdx < alignment.size(); ++wordIdx) {
alignString << " " << wordIdx << "-" << alignment[wordIdx];
}
return alignString.str();
}
}
@ -1,56 +0,0 @@
#pragma once

#include <vector>

#include "common/utils.h"
#include "data/vocab.h"
#include "translator/history.h"

namespace marian {

template <class OStream>
void Printer(Ptr<Config> options,
Ptr<Vocab> vocab,
Ptr<History> history,
OStream& best1,
OStream& bestn) {
bool reverse = options->get<bool>("right-left");

if(options->has("n-best") && options->get<bool>("n-best")) {
const auto& nbl = history->NBest(options->get<size_t>("beam-size"));

for(size_t i = 0; i < nbl.size(); ++i) {
const auto& result = nbl[i];
const auto& words = std::get<0>(result);
const auto& hypo = std::get<1>(result);

float realCost = std::get<2>(result);

std::string translation = Join((*vocab)(words), " ", reverse);

bestn << history->GetLineNum() << " ||| " << translation << " |||";

if(hypo->GetCostBreakdown().empty()) {
bestn << " F0=" << hypo->GetCost();
} else {
for(size_t j = 0; j < hypo->GetCostBreakdown().size(); ++j) {
bestn << " F" << j << "= " << hypo->GetCostBreakdown()[j];
}
}

bestn << " ||| " << realCost;

if(i < nbl.size() - 1)
bestn << std::endl;
else
bestn << std::flush;
}
}

auto bestTranslation = history->Top();

std::string translation
= Join((*vocab)(std::get<0>(bestTranslation)), " ", reverse);
best1 << translation << std::flush;
}
}
@ -1,8 +1,9 @@
#pragma once

#include "marian.h"
#include "models/model_factory.h"

#include "data/shortlist.h"
#include "models/model_factory.h"

namespace marian {

@ -41,8 +42,10 @@ public:

virtual void init(Ptr<ExpressionGraph> graph) {}

virtual void setShortlistGenerator(Ptr<data::ShortlistGenerator> shortlistGenerator) {};
virtual void setShortlistGenerator(
Ptr<data::ShortlistGenerator> shortlistGenerator){};
virtual Ptr<data::Shortlist> getShortlist() { return nullptr; };
virtual std::vector<float> getAlignment() { return {}; };
};

class ScorerWrapperState : public ScorerState {
@ -104,97 +107,19 @@ public:
graph, wrappedState, hypIndices, embIndices, dimBatch, beamSize));
}

virtual void setShortlistGenerator(Ptr<data::ShortlistGenerator> shortlistGenerator) {
virtual void setShortlistGenerator(
Ptr<data::ShortlistGenerator> shortlistGenerator) {
encdec_->setShortlistGenerator(shortlistGenerator);
};

virtual Ptr<data::Shortlist> getShortlist() {
return encdec_->getShortlist();
};
};

//class WordPenaltyState : public ScorerState {
//private:
// int dimVocab_;
// Expr penalties_;
//
//public:
// WordPenaltyState(int dimVocab, Expr penalties)
// : dimVocab_(dimVocab), penalties_(penalties) {}
//
// virtual Expr getProbs() { return penalties_; };
//
// virtual float breakDown(size_t i) {
// return getProbs()->val()->get(i % dimVocab_);
// }
//};
//
//class WordPenalty : public Scorer {
//private:
// int dimVocab_;
// Expr penalties_;
//
//public:
// WordPenalty(const std::string& name, float weight, int dimVocab)
// : Scorer(name, weight), dimVocab_(dimVocab) {}
//
// virtual void clear(Ptr<ExpressionGraph> graph) {}
//
// virtual Ptr<ScorerState> startState(Ptr<ExpressionGraph> graph,
// Ptr<data::CorpusBatch> batch) {
// std::vector<float> p(dimVocab_, 1);
// p[0] = 0;
// p[2] = 0;
//
// penalties_ = graph->constant({1, dimVocab_}, inits::from_vector(p));
// return New<WordPenaltyState>(dimVocab_, penalties_);
// }
//
// virtual Ptr<ScorerState> step(Ptr<ExpressionGraph> graph,
// Ptr<ScorerState> state,
// const std::vector<size_t>& hypIndices,
// const std::vector<size_t>& embIndices,
// int dimBatch,
// int beamSize) {
// return state;
// }
//};
//
//class UnseenWordPenalty : public Scorer {
//private:
// int batchIndex_;
// int dimVocab_;
// Expr penalties_;
//
//public:
// UnseenWordPenalty(const std::string& name,
// float weight,
// int dimVocab,
// int batchIndex)
// : Scorer(name, weight), dimVocab_(dimVocab), batchIndex_(batchIndex) {}
//
// virtual void clear(Ptr<ExpressionGraph> graph) {}
//
// virtual Ptr<ScorerState> startState(Ptr<ExpressionGraph> graph,
// Ptr<data::CorpusBatch> batch) {
// std::vector<float> p(dimVocab_, -1);
// for(auto i : (*batch)[batchIndex_]->data())
// p[i] = 0;
// p[2] = 0;
//
// penalties_ = graph->constant({1, dimVocab_}, inits::from_vector(p));
// return New<WordPenaltyState>(dimVocab_, penalties_);
// }
//
// virtual Ptr<ScorerState> step(Ptr<ExpressionGraph> graph,
// Ptr<ScorerState> state,
// const std::vector<size_t>& hypIndices,
// const std::vector<size_t>& embIndices,
// int dimBatch,
// int beamSize) {
// return state;
// }
//};
virtual std::vector<float> getAlignment() {
return encdec_->getAlignment();
}
};

Ptr<Scorer> scorerByType(std::string fname,
float weight,
@ -8,7 +8,7 @@
#include "3rd_party/threadpool.h"
#include "translator/history.h"
#include "translator/output_collector.h"
#include "translator/printer.h"
#include "translator/output_printer.h"

#include "models/model_task.h"
#include "translator/scorers.h"
@ -83,6 +83,7 @@ public:

size_t batchId = 0;
auto collector = New<OutputCollector>();
auto printer = New<OutputPrinter>(options_, trgVocab_);
if(options_->get<bool>("quiet-translation"))
collector->setPrintingStrategy(New<QuietPrinting>());

@ -111,7 +112,7 @@ public:
for(auto history : histories) {
std::stringstream best1;
std::stringstream bestn;
Printer(options_, trgVocab_, history, best1, bestn);
printer->print(history, best1, bestn);
collector->Write(history->GetLineNum(),
best1.str(),
bestn.str(),
@ -176,6 +177,7 @@ public:
data::BatchGenerator<data::TextInput> bg(corpus_, options_);

auto collector = New<StringCollector>();
auto printer = New<OutputPrinter>(options_, trgVocab_);
size_t batchId = 0;

// @TODO: unify this and get rid of Config object.
@ -205,7 +207,7 @@ public:
for(auto history : histories) {
std::stringstream best1;
std::stringstream bestn;
Printer(options_, trgVocab_, history, best1, bestn);
printer->print(history, best1, bestn);
collector->add(history->GetLineNum(), best1.str(), bestn.str());
}
};