mirror of https://github.com/marian-nmt/marian.git (synced 2024-09-17 09:47:34 +03:00)

Autoformat files

This commit is contained in:
  parent 5f2eedc6e5
  commit 6d0c75cf48
@@ -1,3 +1,3 @@
#!/bin/bash

find ./src \( -path ./src/3rd_party -o -path ./src/tests -o -path ./src/models/experimental \) -prune -o -iname *.h -o -iname *.cpp -o -iname *.cu | xargs clang-format-3.8 -i
find ./src \( -path ./src/3rd_party -o -path ./src/tests -o -path ./src/models/experimental \) -prune -o -iname *.h -o -iname *.cpp -o -iname *.cu | xargs clang-format -i

@@ -10,7 +10,6 @@
#include "training/graph_group_async_drop.h"
#endif

bool configureMPI(int, char**);

int main(int argc, char** argv) {
@@ -18,7 +17,7 @@ int main(int argc, char** argv) {

auto options = New<Config>(argc, argv);
auto devices = options->getDevices();

if(options->get<bool>("multi-node")) {
ABORT_IF(!configureMPI(argc, argv), "MPI not found.");

@@ -65,5 +65,4 @@ void Config::AddYamlToNpz(const YAML::Node& yaml,
unsigned shape = out.size() + 1;
cnpy::npz_save(fName, varName, out.c_str(), &shape, 1, "a");
}

}

@@ -76,7 +76,9 @@ public:
log();

if(has("version"))
LOG(info, "[config] Model created with Marian {}", get("version").as<std::string>());
LOG(info,
"[config] Model created with Marian {}",
get("version").as<std::string>());
}

Config(const Config& other) : config_(YAML::Clone(other.config_)) {}
@@ -110,9 +112,7 @@ public:
YAML::Node getModelParameters();
void loadModelParameters(const std::string& name);

const std::vector<DeviceId>& getDevices() {
return devices_;
}
const std::vector<DeviceId>& getDevices() { return devices_; }

void save(const std::string& name) {
OutputFileStream out(name);

@@ -2,8 +2,8 @@
#include <boost/algorithm/string.hpp>
#include <boost/regex.hpp>
#include <set>
#include <string>
#include <stdexcept>
#include <string>

#if MKL_FOUND
//#include <omp.h>
@@ -15,9 +15,9 @@
#endif
#endif

#include "3rd_party/cnpy/cnpy.h"
#include "common/definitions.h"

#include "common/config.h"
#include "common/config_parser.h"
#include "common/file_stream.h"
@@ -146,13 +146,14 @@ bool ConfigParser::has(const std::string& key) const {

void ConfigParser::validateOptions() const {
if(mode_ == ConfigMode::translating) {
UTIL_THROW_IF2(!has("vocabs") || get<std::vector<std::string>>("vocabs").empty(),
UTIL_THROW_IF2(
!has("vocabs") || get<std::vector<std::string>>("vocabs").empty(),
"Translating, but vocabularies are not given!");

for(const auto& modelFile : get<std::vector<std::string>>("models")) {
boost::filesystem::path modelPath(modelFile);
UTIL_THROW_IF2(!boost::filesystem::exists(modelPath),
"Model file does not exist: " + modelFile);
"Model file does not exist: " + modelFile);
}

return;
@@ -177,9 +178,10 @@ void ConfigParser::validateOptions() const {

if(mode_ == ConfigMode::rescoring) {
UTIL_THROW_IF2(!boost::filesystem::exists(modelPath),
"Model file does not exist: " + modelPath.string());
"Model file does not exist: " + modelPath.string());

UTIL_THROW_IF2(!has("vocabs") || get<std::vector<std::string>>("vocabs").empty(),
UTIL_THROW_IF2(
!has("vocabs") || get<std::vector<std::string>>("vocabs").empty(),
"Scoring, but vocabularies are not given!");

return;
@@ -193,8 +195,9 @@ void ConfigParser::validateOptions() const {
!modelDir.empty() && !boost::filesystem::is_directory(modelDir),
"Model directory does not exist");

UTIL_THROW_IF2(!modelDir.empty() && !(boost::filesystem::status(modelDir).permissions()
& boost::filesystem::owner_write),
UTIL_THROW_IF2(!modelDir.empty()
&& !(boost::filesystem::status(modelDir).permissions()
& boost::filesystem::owner_write),
"No write permission in model directory");

UTIL_THROW_IF2(
@@ -835,7 +838,6 @@ void ConfigParser::parseOptions(int argc, char** argv, bool doValidate) {
SET_OPTION("transformer-dim-ffn", int);
SET_OPTION("transformer-ffn-activation", std::string);

#ifdef CUDNN
SET_OPTION("char-stride", int);
SET_OPTION("char-highway", int);
@@ -976,7 +978,7 @@ void ConfigParser::parseOptions(int argc, char** argv, bool doValidate) {
SET_OPTION("relative-paths", bool);
SET_OPTION("devices", std::vector<std::string>);
SET_OPTION("cpu-threads", size_t);
//SET_OPTION("omp-threads", size_t);
// SET_OPTION("omp-threads", size_t);

SET_OPTION("mini-batch", int);
SET_OPTION("maxi-batch", int);
@@ -1021,24 +1023,22 @@ void ConfigParser::parseOptions(int argc, char** argv, bool doValidate) {
exit(0);
}

// @TODO: this should probably be in processOptionDevices()
//#ifdef BLAS_FOUND
// //omp_set_num_threads(vm_["omp-threads"].as<size_t>());
//#ifdef MKL_FOUND
// mkl_set_num_threads(vm_["omp-threads"].as<size_t>());
//#endif
//#endif
// @TODO: this should probably be in processOptionDevices()
//#ifdef BLAS_FOUND
// //omp_set_num_threads(vm_["omp-threads"].as<size_t>());
//#ifdef MKL_FOUND
// mkl_set_num_threads(vm_["omp-threads"].as<size_t>());
//#endif
//#endif
}

std::vector<DeviceId> ConfigParser::getDevices() {
std::vector<DeviceId> devices;

try {

std::string devicesStr
= Join(config_["devices"].as<std::vector<std::string>>());

if(mode_ == ConfigMode::training && get<bool>("multi-node")) {
auto parts = Split(devicesStr, ":");
for(size_t i = 1; i < parts.size(); ++i) {
@@ -1061,11 +1061,10 @@ std::vector<DeviceId> ConfigParser::getDevices() {
if(config_["cpu-threads"].as<size_t>() > 0) {
devices.clear();
for(size_t i = 0; i < config_["cpu-threads"].as<size_t>(); ++i)
devices.push_back({i, DeviceType::cpu});
devices.push_back({i, DeviceType::cpu});
}

}
catch(...) {
} catch(...) {
ABORT("Problem parsing devices, please report an issue on github");
}

@@ -1,10 +1,10 @@
#pragma once

#include <functional>
#include <iostream>
#include <memory>
#include <string>
#include <vector>
#include <iostream>

#include "common/logging.h"
#include "shape.h"
@@ -57,7 +57,6 @@ struct DeviceId {
friend bool operator==(DeviceId id1, DeviceId id2) {
return id1.no == id2.no && id1.type == id2.type;
}

};

class TensorBase;

@@ -1,195 +1,191 @@
#pragma once

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <string>
#include <sstream>
#include <string>
#include <vector>
#include <algorithm>

#include "common/logging.h"

namespace marian {

struct Shape {
public:
std::vector<int> shape_;
public:
std::vector<int> shape_;

public:
Shape() : shape_{1} {}
public:
Shape() : shape_{1} {}

Shape(std::initializer_list<int> il) : Shape() {
shape_.resize(il.size());
std::copy(il.begin(), il.end(), begin());
Shape(std::initializer_list<int> il) : Shape() {
shape_.resize(il.size());
std::copy(il.begin(), il.end(), begin());
}

void resize(size_t n) { shape_.resize(n, 1); }

const int* data() const { return shape_.data(); }

int* data() { return shape_.data(); }

Shape(const Shape& shape) : Shape() {
shape_.resize(shape.size());
std::copy(shape.begin(), shape.end(), begin());
}

inline void set(int i, int val) { dim(i) = val; }

inline int& dim(int i) {
if(i >= 0) {
ABORT_IF(i >= size(),
"Index {} is out of bounds, shape has {} dimension",
i,
size());
return shape_[i];
} else {
ABORT_IF((int)size() + i < 0,
"Negative index {} is out of bounds, shape has {} dimension",
i,
size());
return shape_[size() + i];
}
}

void resize(size_t n) {
shape_.resize(n, 1);
}
inline const int& dim(int i) const {
return const_cast<Shape&>(*this).dim(i);
}

const int* data() const {
return shape_.data();
}
inline int operator[](int i) { return dim(i); }

int* data() {
return shape_.data();
}
inline int operator[](int i) const { return dim(i); }

Shape(const Shape& shape) : Shape() {
shape_.resize(shape.size());
std::copy(shape.begin(), shape.end(), begin());
}
inline int& back() { return shape_.back(); }

inline void set(int i, int val) {
dim(i) = val;
}
inline int stride(int i) const {
std::vector<int> stride(shape_.size(), 1);
for(int j = shape_.size() - 2; j >= 0; --j)
stride[j] = stride[j + 1] * shape_[j + 1];

inline int& dim(int i) {
if(i >= 0) {
ABORT_IF(i >= size(),
"Index {} is out of bounds, shape has {} dimension", i, size());
return shape_[i];
}
else {
ABORT_IF((int)size() + i < 0,
"Negative index {} is out of bounds, shape has {} dimension", i, size());
return shape_[size() + i];
if(i >= 0)
return stride[i];
else
return stride[size() + i];
}

inline size_t size() const { return shape_.size(); }

inline int elements() const {
int el = 1;
for(auto s : shape_)
el *= s;
return el;
}

inline void dims(int i, std::vector<int>& d) const {
d.resize(shape_.size());

std::vector<int> stride(shape_.size(), 1);
for(int j = shape_.size() - 2; j >= 0; --j)
stride[j] = stride[j + 1] * shape_[j + 1];

for(int j = 0; j < d.size(); ++j)
d[j] = (i / stride[j]) % shape_[j];
}

auto begin() -> decltype(shape_.begin()) { return shape_.begin(); }
auto begin() const -> decltype(shape_.begin()) { return shape_.begin(); }

auto end() -> decltype(shape_.end()) { return shape_.end(); }
auto end() const -> decltype(shape_.end()) { return shape_.end(); }

auto rbegin() -> decltype(shape_.rbegin()) { return shape_.rbegin(); }
auto rbegin() const -> decltype(shape_.rbegin()) { return shape_.rbegin(); }

auto rend() -> decltype(shape_.rend()) { return shape_.rend(); }
auto rend() const -> decltype(shape_.rend()) { return shape_.rend(); }

bool operator==(const Shape& other) const {
return size() == other.size() && std::equal(begin(), end(), other.begin());
}

bool operator!=(const Shape& other) const { return !(*this == other); }

std::string toString() const {
std::stringstream strm;
strm << "shape=" << (*this)[0];
for(int i = 1; i < size(); ++i)
strm << "x" << (*this)[i];
strm << " size=" << elements() << " (" << elements() * sizeof(float)
<< "B)";
return strm.str();
}

friend std::ostream& operator<<(std::ostream& strm, const Shape& shape) {
strm << shape.toString();
return strm;
}

operator std::string() const {
std::stringstream ss;
ss << *this;
return ss.str();
}

int axis(int ax) {
if(ax < 0)
return size() + ax;
else
return ax;
}

static Shape broadcast(const std::vector<Shape>& shapes) {
int maxDims = 0;
for(auto& s : shapes)
if(s.size() > maxDims)
maxDims = s.size();

Shape shape;
shape.resize(maxDims);

for(auto& s : shapes) {
for(int i = 0; i < s.size(); ++i) {
ABORT_IF(shape[-i] != s[-i] && shape[-i] != 1 && s[-i] != 1,
"Shapes {} and {} cannot be broadcasted",
(std::string)shape,
(std::string)s);
shape.set(-i, std::max(shape[-i], s[-i]));
}
}
return shape;
}

inline const int& dim(int i) const { return const_cast<Shape&>(*this).dim(i); }
template <typename T>
static Shape broadcast(const std::initializer_list<T>& il) {
return broadcast(std::vector<T>(il));
}

inline int operator[](int i) { return dim(i); }
template <typename T>
static Shape broadcast(const std::vector<T>& nodes) {
int maxDims = 0;
for(auto& n : nodes)
if(n->shape().size() > maxDims)
maxDims = n->shape().size();

inline int operator[](int i) const { return dim(i); }
Shape shape;
shape.resize(maxDims);

inline int& back() { return shape_.back(); }

inline int stride(int i) const {
std::vector<int> stride(shape_.size(), 1);
for(int j = shape_.size() - 2; j >= 0; --j)
stride[j] = stride[j + 1] * shape_[j + 1];

if(i >= 0)
return stride[i];
else
return stride[size() + i];
}

inline size_t size() const { return shape_.size(); }

inline int elements() const {
int el = 1;
for(auto s : shape_)
el *= s;
return el;
}

inline void dims(int i, std::vector<int>& d) const {
d.resize(shape_.size());

std::vector<int> stride(shape_.size(), 1);
for(int j = shape_.size() - 2; j >= 0; --j)
stride[j] = stride[j + 1] * shape_[j + 1];

for(int j = 0; j < d.size(); ++j)
d[j] = (i / stride[j]) % shape_[j];
}

auto begin() -> decltype(shape_.begin()) { return shape_.begin(); }
auto begin() const -> decltype(shape_.begin()) { return shape_.begin(); }

auto end() -> decltype(shape_.end()) { return shape_.end(); }
auto end() const -> decltype(shape_.end()) { return shape_.end(); }

auto rbegin() -> decltype(shape_.rbegin()) { return shape_.rbegin(); }
auto rbegin() const -> decltype(shape_.rbegin()) { return shape_.rbegin(); }

auto rend() -> decltype(shape_.rend()) { return shape_.rend(); }
auto rend() const -> decltype(shape_.rend()) { return shape_.rend(); }

bool operator==(const Shape& other) const {
return size() == other.size() && std::equal(begin(), end(), other.begin());
}

bool operator!=(const Shape& other) const { return !(*this == other); }

std::string toString() const {
std::stringstream strm;
strm << "shape=" << (*this)[0];
for(int i = 1; i < size(); ++i)
strm << "x" << (*this)[i];
strm << " size=" << elements() << " ("
<< elements() * sizeof(float) << "B)";
return strm.str();
}

friend std::ostream& operator<<(std::ostream& strm, const Shape& shape) {
strm << shape.toString();
return strm;
}

operator std::string() const {
std::stringstream ss;
ss << *this;
return ss.str();
}

int axis(int ax) {
if(ax < 0)
return size() + ax;
else
return ax;
}

static Shape broadcast(const std::vector<Shape>& shapes) {
int maxDims = 0;
for(auto& s : shapes)
if(s.size() > maxDims)
maxDims = s.size();

Shape shape;
shape.resize(maxDims);

for(auto& s : shapes) {
for(int i = 0; i < s.size(); ++i) {
ABORT_IF(shape[-i] != s[-i] && shape[-i] != 1 && s[-i] != 1,
"Shapes {} and {} cannot be broadcasted",
(std::string)shape,
(std::string)s);
shape.set(-i, std::max(shape[-i], s[-i]));
}
for(auto& node : nodes) {
const Shape& shapen = node->shape();
for(int i = 1; i <= shapen.size(); ++i) {
ABORT_IF(shape[-i] != shapen[-i] && shape[-i] != 1 && shapen[-i] != 1,
"Shapes {} and {} cannot be broadcasted",
(std::string)shape,
(std::string)shapen);
shape.set(-i, std::max(shape[-i], shapen[-i]));
}
return shape;
}

template <typename T>
static Shape broadcast(const std::initializer_list<T>& il) {
return broadcast(std::vector<T>(il));
}

template <typename T>
static Shape broadcast(const std::vector<T>& nodes) {
int maxDims = 0;
for(auto& n : nodes)
if(n->shape().size() > maxDims)
maxDims = n->shape().size();

Shape shape;
shape.resize(maxDims);

for(auto& node : nodes) {
const Shape& shapen = node->shape();
for(int i = 1; i <= shapen.size(); ++i) {
ABORT_IF(shape[-i] != shapen[-i] && shape[-i] != 1 && shapen[-i] != 1,
"Shapes {} and {} cannot be broadcasted",
(std::string)shape,
(std::string)shapen);
shape.set(-i, std::max(shape[-i], shapen[-i]));
}
}
return shape;
}
return shape;
}
};

}

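For orientation (an illustration added here, not part of the commit): Shape::broadcast above aligns dimensions from the right and lets a dimension of 1 stretch to match the other shape. A minimal standalone C++ sketch of that rule, with hypothetical names:

#include <algorithm>
#include <cassert>
#include <vector>

// Hypothetical helper mirroring the broadcast rule shown above: dimensions are
// aligned from the right, a dimension of 1 stretches, anything else must match.
std::vector<int> broadcastDims(std::vector<int> a, const std::vector<int>& b) {
  if(a.size() < b.size())
    a.insert(a.begin(), b.size() - a.size(), 1);  // pad the shorter shape with 1s
  for(size_t i = 1; i <= b.size(); ++i) {
    int& da = a[a.size() - i];
    int db = b[b.size() - i];
    assert(da == db || da == 1 || db == 1);  // otherwise not broadcastable
    da = std::max(da, db);
  }
  return a;
}

// Example: broadcastDims({4, 1, 3}, {5, 1}) yields {4, 5, 3}.
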
@@ -1,11 +1,11 @@
#pragma once

#include <boost/timer/timer.hpp>
#include <condition_variable>
#include <deque>
#include <functional>
#include <queue>
#include <mutex>
#include <condition_variable>
#include <boost/timer/timer.hpp>
#include <queue>

#include "common/config.h"
#include "data/batch_stats.h"
@@ -47,16 +47,17 @@ private:

void fillBatches(bool shuffle = true) {
typedef typename sample::value_type Item;
auto itemCmp = [](const Item& sa, const Item& sb) {
return sa.size() < sb.size();
};
auto itemCmp
= [](const Item& sa, const Item& sb) { return sa.size() < sb.size(); };

auto cmpSrc = [itemCmp](const sample& a, const sample& b) {
return std::lexicographical_compare(a.begin(), a.end(), b.begin(), b.end(), itemCmp);
return std::lexicographical_compare(
a.begin(), a.end(), b.begin(), b.end(), itemCmp);
};

auto cmpTrg = [itemCmp](const sample& a, const sample& b) {
return std::lexicographical_compare(a.rbegin(), a.rend(), b.rbegin(), b.rend(), itemCmp);
return std::lexicographical_compare(
a.rbegin(), a.rend(), b.rbegin(), b.rend(), itemCmp);
};

auto cmpNone = [](const sample& a, const sample& b) { return &a < &b; };
@@ -168,9 +169,8 @@ public:
operator bool() const {
// wait if empty but loading
std::unique_lock<std::mutex> lock(loadMutex_);
loadCondition_.wait(lock, [this]{
return loadReady_ || !bufferedBatches_.empty();
});
loadCondition_.wait(
lock, [this] { return loadReady_ || !bufferedBatches_.empty(); });

return !bufferedBatches_.empty();
}
@@ -178,15 +178,16 @@ public:
BatchPtr next() {
{
std::unique_lock<std::mutex> lock(loadMutex_);
loadCondition_.wait(lock, [this]{
return loadReady_ || !bufferedBatches_.empty();
});
loadCondition_.wait(
lock, [this] { return loadReady_ || !bufferedBatches_.empty(); });
}

ABORT_IF(bufferedBatches_.empty(), "No batches to fetch, run prepare()");
currentBatch_ = bufferedBatches_.front();

if(loadReady_ && bufferedBatches_.size() <= std::max(options_->get<int>("maxi-batch") / 5, 1)) {
if(loadReady_
&& bufferedBatches_.size()
<= std::max(options_->get<int>("maxi-batch") / 5, 1)) {
{
std::unique_lock<std::mutex> lock(loadMutex_);
loadReady_ = false;
@@ -400,7 +400,6 @@ public:
std::cerr << std::endl;
}
}

};

class CorpusIterator;

@@ -1,7 +1,7 @@
#include <random>

#include "data/corpus_nbest.h"
#include "common/utils.h"
#include "data/corpus_nbest.h"

namespace marian {
namespace data {
@@ -10,8 +10,8 @@ CorpusNBest::CorpusNBest(Ptr<Config> options, bool translate /*= false*/)
: CorpusBase(options, translate) {}

CorpusNBest::CorpusNBest(std::vector<std::string> paths,
std::vector<Ptr<Vocab>> vocabs,
Ptr<Config> options)
std::vector<Ptr<Vocab>> vocabs,
Ptr<Config> options)
: CorpusBase(paths, vocabs, options) {}

int numFromNbest(const std::string& line) {
@@ -19,7 +19,8 @@ int numFromNbest(const std::string& line) {
Split(line, fields, " ||| ", true);
ABORT_IF(fields.size() < 4,
"Too few fields ({}) in line \"{}\", is this a correct n-best list?",
fields.size(), line);
fields.size(),
line);
return std::stoi(fields[0]);
}

@@ -28,7 +29,8 @@ std::string lineFromNbest(const std::string& line) {
Split(line, fields, " ||| ", true);
ABORT_IF(fields.size() < 4,
"Too few fields ({}) in line \"{}\", is this a correct n-best list?",
fields.size(), line);
fields.size(),
line);
return fields[1];
}

@@ -56,7 +58,8 @@ SentenceTuple CorpusNBest::next() {
for(size_t i = 0; i < last; ++i) {
if(curr_num > lastNum_) {
ABORT_IF(!std::getline((std::istream&)*files_[i], lastLines_[i]),
"Too few lines in input {}", i);
"Too few lines in input {}",
i);
}
addWordsToSentenceTuple(lastLines_[i], i, tup);
}
@@ -90,6 +93,5 @@ void CorpusNBest::reset() {
files_.emplace_back(new InputFileStream(path));
}
}

}
}

@@ -154,6 +154,5 @@ void CorpusSQLite::restore(Ptr<TrainingState> ts) {
reset();
}
}

}
}

@@ -19,7 +19,6 @@
#include <SQLiteCpp/SQLiteCpp.h>
#include <SQLiteCpp/sqlite3/sqlite3.h>

static void SQLiteRandomSeed(sqlite3_context* context,
int argc,
sqlite3_value** argv) {

@@ -138,8 +138,9 @@ void Vocab::create(const std::string& vocabPath, const std::string& trainPath) {
"Specified vocab directory {} does not exist",
dir);

ABORT_IF(!dir.empty() && !(boost::filesystem::status(dir).permissions()
& boost::filesystem::owner_write),
ABORT_IF(!dir.empty()
&& !(boost::filesystem::status(dir).permissions()
& boost::filesystem::owner_write),
"No write permission in vocab directory {}",
dir);

@@ -32,7 +32,5 @@ struct Array {
data_[i] = val;
}
};

}

}

@@ -4,93 +4,90 @@
#include "functional/operands.h"

namespace marian {
namespace functional {
namespace functional {

namespace float2unsigned {
constexpr float abs(float x) { return x < 0 ? -x : x; }
namespace float2unsigned {
constexpr float abs(float x) {
return x < 0 ? -x : x;
}

constexpr int exponent(float x) {
return abs(x) >= 2 ? exponent(x / 2) + 1 :
abs(x) < 1 ? exponent(x * 2) - 1 :
0;
}
// clang-format off
constexpr int exponent(float x) {
return abs(x) >= 2 ? exponent(x / 2) + 1 :
abs(x) < 1 ? exponent(x * 2) - 1 :
0;
}

constexpr float scalbn(float value, int exponent) {
return exponent == 0 ? value :
exponent > 0 ? scalbn(value * 2, exponent - 1) :
scalbn(value / 2, exponent + 1);
}
constexpr float scalbn(float value, int exponent) {
return exponent == 0 ? value :
exponent > 0 ? scalbn(value * 2, exponent - 1) :
scalbn(value / 2, exponent + 1);
}
// clang-format on

constexpr unsigned mantissa(float x, int exp) {
// remove hidden 1 and bias the exponent to get integer
return abs(x) < std::numeric_limits<float>::infinity() ?
scalbn(scalbn(abs(x), -exp) - 1, 23) : 0;
}
constexpr unsigned mantissa(float x, int exp) {
// remove hidden 1 and bias the exponent to get integer
return abs(x) < std::numeric_limits<float>::infinity()
? scalbn(scalbn(abs(x), -exp) - 1, 23)
: 0;
}

constexpr unsigned to_binary(float x, unsigned sign, int exp) {
return sign * (1u << 31)
+ (exp + 127) * (1u << 23)
+ mantissa(x, exp);
}
constexpr unsigned to_binary(float x, unsigned sign, int exp) {
return sign * (1u << 31) + (exp + 127) * (1u << 23) + mantissa(x, exp);
}

constexpr unsigned to_binary(float x) {
return x == 0 ? 0 : to_binary(x, x < 0, exponent(x));
}
constexpr unsigned to_binary(float x) {
return x == 0 ? 0 : to_binary(x, x < 0, exponent(x));
}
}

}
namespace unsigned2float {

namespace unsigned2float {
constexpr float sign(unsigned i) {
return (i & (1u << 31)) ? -1.f : 1.f;
}

constexpr float sign(unsigned i) {
return (i & (1u << 31)) ? -1.f : 1.f;
}
constexpr int exponent(unsigned i) {
return int((i >> 23) & 255u) - 127;
}

constexpr int exponent(unsigned i) {
return int((i >> 23) & 255u) - 127;
}
constexpr float sig(unsigned i, unsigned shift) {
return ((i >> shift) & 1u) * 1.f / (1u << (23 - shift))
+ (shift > 0 ? sig(i, shift - 1) : 0);
}

constexpr float sig(unsigned i, unsigned shift) {
return ((i >> shift) & 1u) * 1.f / (1u << (23 - shift))
+ (shift > 0 ? sig(i, shift - 1) : 0);
}
constexpr float powr(int exp) {
return exp > 0 ? 2.f * powr(exp - 1) : 1.f;
}

constexpr float powr(int exp) {
return exp > 0 ? 2.f * powr(exp - 1) : 1.f;
}
constexpr float pow(int exp) {
return exp < 0 ? 1.f / powr(-exp) : powr(exp);
}

constexpr float pow(int exp) {
return exp < 0 ? 1.f / powr(-exp) : powr(exp);
}
constexpr float from_binary(unsigned i) {
return (1.f + sig(i, 22u)) * pow(exponent(i)) * sign(i);
}
}

constexpr float from_binary(unsigned i) {
return (1.f + sig(i, 22u))
* pow(exponent(i))
* sign(i);
}
}
constexpr unsigned f2i(float x) {
return float2unsigned::to_binary(x);
}

constexpr unsigned f2i(float x) {
return float2unsigned::to_binary(x);
}
constexpr float i2f(float x) {
return unsigned2float::from_binary(x);
}

constexpr float i2f(float x) {
return unsigned2float::from_binary(x);
}

template <unsigned V>
struct F {
static constexpr auto value = i2f(V);
static constexpr auto binary = V;

template <typename ...Args>
__HDI__ constexpr float operator()(Args&&... args) const {
return value;
}

std::string to_string() {
return "F<" + std::to_string(value) + ">";
}
};
template <unsigned V>
struct F {
static constexpr auto value = i2f(V);
static constexpr auto binary = V;

template <typename... Args>
__HDI__ constexpr float operator()(Args&&... args) const {
return value;
}
}

std::string to_string() { return "F<" + std::to_string(value) + ">"; }
};
}
}

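As a sanity check (an illustration added here, not from the commit): to_binary/from_binary above reimplement the IEEE-754 single-precision encoding so it can be evaluated in constexpr context; a plain memcpy-based reference produces the same bit pattern for representable values.

#include <cstdio>
#include <cstring>

// Non-constexpr reference conversion via memcpy, for comparison with the
// constexpr to_binary()/from_binary() pair in the header above.
unsigned floatBits(float x) {
  unsigned u;
  std::memcpy(&u, &x, sizeof(u));
  return u;
}

int main() {
  // 3.5f = 1.75 * 2^1: sign 0, biased exponent 128, mantissa 0.75 * 2^23,
  // i.e. 0x40600000 on an IEEE-754 platform.
  std::printf("%08x\n", floatBits(3.5f));
  return 0;
}
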
@@ -4,25 +4,25 @@
#include "functional/predicates.h"

namespace marian {
namespace functional {
namespace functional {

template <int N>
using ref = Assignee<N>;
template <int N>
using ref = Assignee<N>;

static ref<1> _1;
static ref<2> _2;
static ref<3> _3;
static ref<4> _4;
static ref<5> _5;
static ref<6> _6;
static ref<7> _7;
static ref<8> _8;
static ref<9> _9;
static ref<1> _1;
static ref<2> _2;
static ref<3> _3;
static ref<4> _4;
static ref<5> _5;
static ref<6> _6;
static ref<7> _7;
static ref<8> _8;
static ref<9> _9;

static C<0> _0c;
static C<1> _1c;
static C<2> _2c;
static C<-1> _1cneg;
static C<-2> _2cneg;
}
static C<0> _0c;
static C<1> _1c;
static C<2> _2c;
static C<-1> _1cneg;
static C<-2> _2cneg;
}
}

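A hypothetical usage sketch (not part of the diff): _1 through _9 are Assignee placeholders, so composing them with the overloaded operators from predicates.h builds an expression-template functor that is later evaluated element-wise. This assumes the headers above compile on the host with __HDI__ expanding to nothing.

#include "functional/functional.h"

using namespace marian::functional;

float demo() {
  float a = 1.f, b = 3.f;
  auto f = _1 + _2 * _2;  // builds a nested BinaryFunctor expression tree
  return f(a, b);         // evaluates element-wise to 1 + 3 * 3 = 10
}
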
@@ -5,71 +5,69 @@
#include "functional/defs.h"

namespace marian {
namespace functional {
namespace functional {

template <class C>
using IsClass = typename std::enable_if<std::is_class<C>::value, C>::type;

template <int N>
struct Select {
template <typename T, typename ...Args>
__HDI__ static auto apply(T&& arg, Args&&... args) -> decltype(Select<N-1>::apply(args...)) {
return Select<N-1>::apply(args...);
}
};

template <>
struct Select<0> {
template <typename T, typename ...Args>
__HDI__ static T apply(T&& arg, Args&&... args) {
return arg;
}
};

/******************************************************************************/

template <int V>
struct C {
static constexpr auto value = V;

template <typename ...Args>
__HDI__ float operator()(Args&&... args) { return V; }

std::string to_string() {
return "C<" + std::to_string(V) + ">";
}
};

/******************************************************************************/

struct Capture {
float value;

Capture(float val) : value(val) {};

template <typename ...Args>
__HDI__ float operator()(Args&&... args) { return value; }

std::string to_string() {
return "Cap(" + std::to_string(value) + ")";
}
};

/******************************************************************************/

template <int N>
struct Var {
static constexpr auto index = N;

template <typename ...Args>
__HDI__ float& operator()(Args&&... args) {
return Select<N-1>::apply(args...);
}

std::string to_string() {
return "Var<" + std::to_string(N) + ">";
}
};
template <class C>
using IsClass = typename std::enable_if<std::is_class<C>::value, C>::type;

template <int N>
struct Select {
template <typename T, typename... Args>
__HDI__ static auto apply(T&& arg, Args&&... args)
-> decltype(Select<N - 1>::apply(args...)) {
return Select<N - 1>::apply(args...);
}
};

template <>
struct Select<0> {
template <typename T, typename... Args>
__HDI__ static T apply(T&& arg, Args&&... args) {
return arg;
}
};

/******************************************************************************/

template <int V>
struct C {
static constexpr auto value = V;

template <typename... Args>
__HDI__ float operator()(Args&&... args) {
return V;
}

std::string to_string() { return "C<" + std::to_string(V) + ">"; }
};

/******************************************************************************/

struct Capture {
float value;

Capture(float val) : value(val){};

template <typename... Args>
__HDI__ float operator()(Args&&... args) {
return value;
}

std::string to_string() { return "Cap(" + std::to_string(value) + ")"; }
};

/******************************************************************************/

template <int N>
struct Var {
static constexpr auto index = N;

template <typename... Args>
__HDI__ float& operator()(Args&&... args) {
return Select<N - 1>::apply(args...);
}

std::string to_string() { return "Var<" + std::to_string(N) + ">"; }
};
}
}

@@ -4,237 +4,230 @@
#include "functional/operands.h"

namespace marian {
namespace functional {
namespace functional {

template <typename Function, typename X>
struct UnaryFunctor {
X x;
template <typename Function, typename X>
struct UnaryFunctor {
X x;

template <class Arg>
UnaryFunctor(Arg a) : x(a) {}
template <class Arg>
UnaryFunctor(Arg a) : x(a) {}

template <typename ...Args>
__HDI__ float operator()(Args&&... args) {
return Function::apply(x(args...));
}
template <typename... Args>
__HDI__ float operator()(Args&&... args) {
return Function::apply(x(args...));
}

std::string to_string() {
return Function::n() + "<" + x.to_string() + ">";
}
};
std::string to_string() { return Function::n() + "<" + x.to_string() + ">"; }
};

template <class Function, class X, class Y>
struct BinaryFunctor {
X x;
Y y;
template <class Function, class X, class Y>
struct BinaryFunctor {
X x;
Y y;

template <class Arg1, class Arg2>
BinaryFunctor(Arg1 arg1, Arg2 arg2) : x(arg1), y(arg2) {}
template <class Arg1, class Arg2>
BinaryFunctor(Arg1 arg1, Arg2 arg2) : x(arg1), y(arg2) {}

template <typename ...Args>
__HDI__ float operator()(Args&&... args) {
return Function::apply(x(args...), y(args...));
}
template <typename... Args>
__HDI__ float operator()(Args&&... args) {
return Function::apply(x(args...), y(args...));
}

std::string to_string() {
return Function::n() +
"<" + x.to_string() +
"," + y.to_string() + ">";
}
};
std::string to_string() {
return Function::n() + "<" + x.to_string() + "," + y.to_string() + ">";
}
};

#define UNARY(name, name2, func) \
namespace elem { \
struct name { \
__HDI__ static float apply(float x) { return func; } \
static std::string n() { return #name; }\
}; \
}\
template <class X> using name = UnaryFunctor<elem::name, X>;\
template <typename X>\
name<IsClass<X>> name2(X x) {\
return name<X>(x);\
}\
static name<Capture> name2(Capture x) {\
return name<Capture>(x);\
}
#define UNARY(name, name2, func) \
namespace elem { \
struct name { \
__HDI__ static float apply(float x) { return func; } \
static std::string n() { return #name; } \
}; \
} \
template <class X> \
using name = UnaryFunctor<elem::name, X>; \
template <typename X> \
name<IsClass<X>> name2(X x) { \
return name<X>(x); \
} \
static name<Capture> name2(Capture x) { return name<Capture>(x); }

#define BINARY(name, name2, func) \
namespace elem { \
struct name { \
__HDI__ static float apply(float x, float y) { return func; } \
static std::string n() { return #name; }\
}; \
}\
template <class X, class Y> using name = BinaryFunctor<elem::name, X, Y>;\
template <class X, class Y>\
name<IsClass<X>, IsClass<Y>> name2(X x, Y y) {\
return name<X, Y>(x, y);\
}\
template <class Y>\
name<Capture, IsClass<Y>> name2(Capture x, Y y) {\
return name<Capture, Y>(x, y);\
}\
template <class X>\
name<IsClass<X>, Capture> name2(X x, Capture y) {\
return name<X, Capture>(x, y);\
}
#define BINARY(name, name2, func) \
namespace elem { \
struct name { \
__HDI__ static float apply(float x, float y) { return func; } \
static std::string n() { return #name; } \
}; \
} \
template <class X, class Y> \
using name = BinaryFunctor<elem::name, X, Y>; \
template <class X, class Y> \
name<IsClass<X>, IsClass<Y>> name2(X x, Y y) { \
return name<X, Y>(x, y); \
} \
template <class Y> \
name<Capture, IsClass<Y>> name2(Capture x, Y y) { \
return name<Capture, Y>(x, y); \
} \
template <class X> \
name<IsClass<X>, Capture> name2(X x, Capture y) { \
return name<X, Capture>(x, y); \
}

UNARY(Tanh, tanh, tanhf(x));
UNARY(Sin, sin, sinf(x));
UNARY(Cos, cos, cosf(x));
UNARY(Tan, tan, tanf(x));
UNARY(Log, log, logf(x));
UNARY(Exp, exp, expf(x));
UNARY(Abs, abs, fabs(x));
UNARY(Sqrt, sqrt, sqrtf(x));
UNARY(Neg, operator-, -x);
UNARY(Logit, logit, x > 0 ? (1.f / (1.f + expf(-x))) : (expf(x) / (1.f + expf(x))));
UNARY(Tanh, tanh, tanhf(x));
UNARY(Sin, sin, sinf(x));
UNARY(Cos, cos, cosf(x));
UNARY(Tan, tan, tanf(x));
UNARY(Log, log, logf(x));
UNARY(Exp, exp, expf(x));
UNARY(Abs, abs, fabs(x));
UNARY(Sqrt, sqrt, sqrtf(x));
UNARY(Neg, operator-, -x);
UNARY(Logit,
logit,
x > 0 ? (1.f / (1.f + expf(-x))) : (expf(x) / (1.f + expf(x))));

BINARY(Plus, operator+, x + y);
BINARY(Minus, operator-, x - y);
BINARY(Mult, operator*, x * y);
BINARY(Div, operator/, x / y);
BINARY(Plus, operator+, x + y);
BINARY(Minus, operator-, x - y);
BINARY(Mult, operator*, x* y);
BINARY(Div, operator/, x / y);

UNARY(Negate, operator!, !x);
BINARY(Eq, operator==, x == y);
BINARY(NEq, operator!=, x != y);
BINARY(Gt, operator>, x > y);
BINARY(Lt, operator<, x < y);
BINARY(Geq, operator>=, x >= y);
BINARY(Leq, operator<=, x <= y);
BINARY(And, operator&&, x && y);
BINARY(Or, operator||, x || y);
UNARY(Negate, operator!, !x);
BINARY(Eq, operator==, x == y);
BINARY(NEq, operator!=, x != y);
BINARY(Gt, operator>, x> y);
BINARY(Lt, operator<, x<y);
BINARY(Geq, operator>=, x >= y);
BINARY(Leq, operator<=, x <= y);
BINARY(And, operator&&, x&& y);
BINARY(Or, operator||, x || y);

template <typename T>
__HDI__ T sgn(T val) {
return (float(0) < val) - (val < float(0));
}
template <typename T>
__HDI__ T sgn(T val) {
return (float(0) < val) - (val < float(0));
}

UNARY(Sgn, sgn, sgn(x));
UNARY(Sgn, sgn, sgn(x));

BINARY(Pow, pow, pow(x, y));
BINARY(Pow, pow, pow(x, y));

BINARY(Clip, clip, fabs(x) >= y ? sgn(x) * y : x);
BINARY(Clip, clip, fabs(x) >= y ? sgn(x) * y : x);

UNARY(sReLU, ReLU, x > 0.f ? x : 0.f);
UNARY(sReLUBack, ReLUback, x > 0.f ? 1.f : 0.f);
BINARY(sPReLU, PReLU, x > 0.f ? x : x * y);
BINARY(sPReLUBack, PReLUback, x > 0.f ? 1.f : y);
UNARY(sReLU, ReLU, x > 0.f ? x : 0.f);
UNARY(sReLUBack, ReLUback, x > 0.f ? 1.f : 0.f);
BINARY(sPReLU, PReLU, x > 0.f ? x : x * y);
BINARY(sPReLUBack, PReLUback, x > 0.f ? 1.f : y);

template <class Function, class X, class Y, class Z>
struct TernaryFunctor {
X x;
Y y;
Z z;
template <class Function, class X, class Y, class Z>
struct TernaryFunctor {
X x;
Y y;
Z z;

template <class Arg1, class Arg2, class Arg3>
TernaryFunctor(Arg1 arg1, Arg2 arg2, Arg3 arg3)
: x(arg1), y(arg2), z(arg3) {}
template <class Arg1, class Arg2, class Arg3>
TernaryFunctor(Arg1 arg1, Arg2 arg2, Arg3 arg3) : x(arg1), y(arg2), z(arg3) {}

template <typename ...Args>
__HDI__ float operator()(Args&&... args) {
return Function::apply(x(args...), y(args...), z(args...));
}
};
template <typename... Args>
__HDI__ float operator()(Args&&... args) {
return Function::apply(x(args...), y(args...), z(args...));
}
};

#define TERNARY(name, name2, func) \
namespace elem { \
struct name { \
__HDI__ static float apply(float x, float y, float z) { return func; } \
}; \
}\
template <class X, class Y, class Z> using name = TernaryFunctor<elem::name, X, Y, Z>;\
template <typename X, typename Y, typename Z>\
name<IsClass<X>, IsClass<Y>, IsClass<Z>> name2(X x, Y y, Z z) {\
return name<X, Y, Z>(x, y, z);\
}\
template <typename X, typename Z>\
name<IsClass<X>, Capture, IsClass<Z>> name2(X x, Capture y, Z z) {\
return name<X, Capture, Z>(x, y, z);\
}\
template <typename Y, typename Z>\
name<Capture, IsClass<Y>, IsClass<Z>> name2(Capture x, Y y, Z z) {\
return name<Capture, Y, Z>(x, y, z);\
}\
template <typename X>\
name<IsClass<X>, Capture, Capture> name2(X x, Capture y, Capture z) {\
return name<X, Capture, Capture>(x, y, z);\
}\
template <typename Y>\
name<Capture, IsClass<Y>, Capture> name2(Capture x, Y y, Capture z) {\
return name<Capture, Y, Capture>(x, y, z);\
}\
template <typename Z>\
name<Capture, Capture, IsClass<Z>> name2(Capture x, Capture y, Z z) {\
return name<Capture, Capture, Z>(x, y, z);\
}
#define TERNARY(name, name2, func) \
namespace elem { \
struct name { \
__HDI__ static float apply(float x, float y, float z) { return func; } \
}; \
} \
template <class X, class Y, class Z> \
using name = TernaryFunctor<elem::name, X, Y, Z>; \
template <typename X, typename Y, typename Z> \
name<IsClass<X>, IsClass<Y>, IsClass<Z>> name2(X x, Y y, Z z) { \
return name<X, Y, Z>(x, y, z); \
} \
template <typename X, typename Z> \
name<IsClass<X>, Capture, IsClass<Z>> name2(X x, Capture y, Z z) { \
return name<X, Capture, Z>(x, y, z); \
} \
template <typename Y, typename Z> \
name<Capture, IsClass<Y>, IsClass<Z>> name2(Capture x, Y y, Z z) { \
return name<Capture, Y, Z>(x, y, z); \
} \
template <typename X> \
name<IsClass<X>, Capture, Capture> name2(X x, Capture y, Capture z) { \
return name<X, Capture, Capture>(x, y, z); \
} \
template <typename Y> \
name<Capture, IsClass<Y>, Capture> name2(Capture x, Y y, Capture z) { \
return name<Capture, Y, Capture>(x, y, z); \
} \
template <typename Z> \
name<Capture, Capture, IsClass<Z>> name2(Capture x, Capture y, Z z) { \
return name<Capture, Capture, Z>(x, y, z); \
}

TERNARY(IfThenElse, if_then_else, x ? y : z);
TERNARY(IfThenElse, if_then_else, x ? y : z);

template <class X, class Y>
struct Assign {
X x;
Y y;

template <class Arg1, class Arg2>
Assign(Arg1 arg1, Arg2 arg2) : x(arg1), y(arg2) {}

template <class X, class Y>
struct Assign {
X x;
Y y;
template <typename... Args>
__HDI__ float operator()(Args&&... args) {
return x(args...) = y(args...);
}
};

template <class Arg1, class Arg2>
Assign(Arg1 arg1, Arg2 arg2) : x(arg1), y(arg2) {}
template <int N>
struct Assignee {
Var<N> var;

template <typename ...Args>
__HDI__ float operator()(Args&&... args) {
return x(args...) = y(args...);
}
};
Assignee() {}
Assignee(Var<N> v) : var(v) {}

template <int N>
struct Assignee {
Var<N> var;
template <typename... Args>
__HDI__ float& operator()(Args&&... args) {
return var(args...);
}

Assignee() {}
Assignee(Var<N> v) : var(v) {}
template <class X>
Assign<Var<N>, IsClass<X>> operator=(X x) {
return Assign<Var<N>, X>(var, x);
}

template <typename ...Args>
__HDI__ float& operator()(Args&&... args) {
return var(args...);
}
Assign<Var<N>, Capture> operator=(Capture x) {
return Assign<Var<N>, Capture>(var, x);
}

template <class X>
Assign<Var<N>, IsClass<X>> operator=(X x) {
return Assign<Var<N>, X>(var, x);
}
template <class X>
auto operator+=(X x) -> decltype(*this = *this + x) {
return *this = *this + x;
}

Assign<Var<N>, Capture> operator=(Capture x) {
return Assign<Var<N>, Capture>(var, x);
}
template <class X>
auto operator-=(X x) -> decltype(*this = *this - x) {
return *this = *this - x;
}

template <class X>
auto operator+=(X x)->decltype(*this = *this + x) {
return *this = *this + x;
}
template <class X>
auto operator*=(X x) -> decltype(*this = *this * x) {
return *this = *this * x;
}

template <class X>
auto operator-=(X x)->decltype(*this = *this - x) {
return *this = *this - x;
}
template <class X>
auto operator/=(X x) -> decltype(*this = *this / x) {
return *this = *this / x;
}

template <class X>
auto operator*=(X x)->decltype(*this = *this * x) {
return *this = *this * x;
}

template <class X>
auto operator/=(X x)->decltype(*this = *this / x) {
return *this = *this / x;
}

std::string to_string() {
return var.to_string();
}
};
std::string to_string() { return var.to_string(); }
};

/******************************************************************************/

}
}
}

@@ -17,7 +17,6 @@ namespace functional {
* @brief Represents the size of each dimension in a tensor.
*/

template <const int N>
struct ConstantShape {
Array<int, N> shape_;
@@ -32,10 +31,10 @@ struct ConstantShape {
}

__HD__ ConstantShape(const ConstantShape& shape)
: shape_(shape.shape_),
stride_(shape.stride_),
bstride_(shape.bstride_),
elements_(shape.elements_) {}
: shape_(shape.shape_),
stride_(shape.stride_),
bstride_(shape.bstride_),
elements_(shape.elements_) {}

ConstantShape(const Shape& shape) {
size_t filled = shape.size();
@@ -43,7 +42,8 @@ struct ConstantShape {
ABORT_IF(filled > N,
"Recompile with CONST_SHAPE_DIMS >= " + std::to_string(filled));

std::copy(shape.shape_.begin(), shape.shape_.end(), shape_.begin() + N - filled);
std::copy(
shape.shape_.begin(), shape.shape_.end(), shape_.begin() + N - filled);
if(N - filled)
std::fill_n(shape_.begin(), N - filled, 1);
updateStrides();
@@ -51,7 +51,6 @@ struct ConstantShape {
}

__HDI__ void updateStrides() {

stride_[N - 1] = 1;
bstride_[N - 1] = shape_[N - 1] == 1 ? 0 : stride_[N - 1];

@@ -73,7 +72,6 @@ struct ConstantShape {
updateElements();
}

__HDI__ int dim(int i) { return shape_[i]; }

__HDI__ int dim(int i) const {
@@ -92,9 +90,7 @@ struct ConstantShape {

__HDI__ static constexpr size_t size() { return N; }

__HDI__ int elements() const {
return elements_;
}
__HDI__ int elements() const { return elements_; }

__HDI__ int index(const Array<int, N>& d) const {
int i = 0;
@@ -113,7 +109,7 @@ struct ConstantShape {
__HDI__ void dims(int i, Array<int, N>& d) const {
for(int j = 0; j < N; ++j)
d[j] = (i / stride_[j]) % shape_[j];
}
}

__HDI__ bool operator==(const ConstantShape& other) const {
for(int i = 0; i < N; ++i)
@@ -128,7 +124,5 @@ struct ConstantShape {
};

typedef ConstantShape<CONST_SHAPE_DIMS> Shape;

}

}

@@ -7,7 +7,7 @@
namespace marian {
namespace functional {

template<typename T>
template <typename T>
struct Tensor {
T* data_;
functional::Shape shape_;
@@ -15,19 +15,20 @@ struct Tensor {
__HD__ Tensor() {}

__HD__ Tensor(T* ptr, const functional::Shape& shape)
: data_(ptr), shape_(shape) {}
: data_(ptr), shape_(shape) {}

__H__ Tensor(marian::Tensor t)
: data_(t->data()), shape_(t->shape()) {}
__H__ Tensor(marian::Tensor t) : data_(t->data()), shape_(t->shape()) {}

__HDI__ float& operator[](size_t i) { return data_[i]; }
__HDI__ const float& operator[](size_t i) const { return data_[i]; }

__HDI__ float& operator[](const functional::Array<int, functional::Shape::size()>& indices) {
__HDI__ float& operator[](
const functional::Array<int, functional::Shape::size()>& indices) {
return data_[shape_.index(indices)];
}

__HDI__ const float& operator[](const functional::Array<int, functional::Shape::size()>& indices) const {
__HDI__ const float& operator[](
const functional::Array<int, functional::Shape::size()>& indices) const {
return data_[shape_.index(indices)];
}

@@ -37,6 +38,5 @@ struct Tensor {
__HDI__ Shape& shape() { return shape_; }
__HDI__ const Shape& shape() const { return shape_; }
};

}
}

@ -12,82 +12,86 @@ struct FApply {};
|
||||
|
||||
template <class Functor>
|
||||
struct FApply<1, Functor> {
|
||||
__HDI__ static float apply(Functor functor,
|
||||
functional::Array<functional::Tensor<float>, 1>& in,
|
||||
const functional::Array<int, 1>& indices) {
|
||||
__HDI__ static float apply(
|
||||
Functor functor,
|
||||
functional::Array<functional::Tensor<float>, 1>& in,
|
||||
const functional::Array<int, 1>& indices) {
|
||||
return functor(in[0][indices[0]]);
|
||||
}
|
||||
|
||||
__HDI__ static float apply(Functor functor,
|
||||
functional::Array<functional::Tensor<float>, 1>& in,
|
||||
int index) {
|
||||
__HDI__ static float apply(
|
||||
Functor functor,
|
||||
functional::Array<functional::Tensor<float>, 1>& in,
|
||||
int index) {
|
||||
return functor(in[0][index]);
|
||||
}
|
||||
};
|
||||
|
||||
template <class Functor>
|
||||
struct FApply<2, Functor> {
|
||||
__HDI__ static float apply(Functor functor,
|
||||
functional::Array<functional::Tensor<float>, 2>& in,
|
||||
const functional::Array<int, 2>& indices) {
|
||||
__HDI__ static float apply(
|
||||
Functor functor,
|
||||
functional::Array<functional::Tensor<float>, 2>& in,
|
||||
const functional::Array<int, 2>& indices) {
|
||||
return functor(in[0][indices[0]], in[1][indices[1]]);
|
||||
}
|
||||
|
||||
__HDI__ static float apply(Functor functor,
|
||||
functional::Array<functional::Tensor<float>, 2>& in,
|
||||
int index) {
|
||||
__HDI__ static float apply(
|
||||
Functor functor,
|
||||
functional::Array<functional::Tensor<float>, 2>& in,
|
||||
int index) {
|
||||
return functor(in[0][index], in[1][index]);
|
||||
}
|
||||
};
|
||||
|
||||
template <class Functor>
|
||||
struct FApply<3, Functor> {
|
||||
__HDI__ static float apply(Functor functor,
|
||||
functional::Array<functional::Tensor<float>, 3>& in,
|
||||
const functional::Array<int, 3>& indices) {
|
||||
__HDI__ static float apply(
|
||||
Functor functor,
|
||||
functional::Array<functional::Tensor<float>, 3>& in,
|
||||
const functional::Array<int, 3>& indices) {
|
||||
return functor(in[0][indices[0]], in[1][indices[1]], in[2][indices[2]]);
|
||||
}
|
||||
|
||||
__HDI__ static float apply(Functor functor,
|
||||
functional::Array<functional::Tensor<float>, 3>& in,
|
||||
int index) {
|
||||
__HDI__ static float apply(
|
||||
Functor functor,
|
||||
functional::Array<functional::Tensor<float>, 3>& in,
|
||||
int index) {
|
||||
return functor(in[0][index], in[1][index], in[2][index]);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
template <class Functor>
|
||||
struct FApply<4, Functor> {
|
||||
__HDI__ static float apply(Functor functor,
|
||||
functional::Array<functional::Tensor<float>, 4>& in,
|
||||
const functional::Array<int, 4>& indices) {
|
||||
__HDI__ static float apply(
|
||||
Functor functor,
|
||||
functional::Array<functional::Tensor<float>, 4>& in,
|
||||
const functional::Array<int, 4>& indices) {
|
||||
return functor(in[0][indices[0]],
|
||||
in[1][indices[1]],
|
||||
in[2][indices[2]],
|
||||
in[3][indices[3]]);
|
||||
}
|
||||
|
||||
__HDI__ static float apply(Functor functor,
|
||||
functional::Array<functional::Tensor<float>, 4>& in,
|
||||
int index) {
|
||||
return functor(in[0][index],
|
||||
in[1][index],
|
||||
in[2][index],
|
||||
in[3][index]);
|
||||
__HDI__ static float apply(
|
||||
Functor functor,
|
||||
functional::Array<functional::Tensor<float>, 4>& in,
|
||||
int index) {
|
||||
return functor(in[0][index], in[1][index], in[2][index], in[3][index]);
|
||||
}
|
||||
};
|
||||
|
||||
template <size_t K, class Functor>
|
||||
__HDI__ float apply(Functor functor,
|
||||
functional::Array<functional::Tensor<float>, K>& in,
|
||||
const functional::Array<int, K>& indices) {
|
||||
__HDI__ float apply(Functor functor,
|
||||
functional::Array<functional::Tensor<float>, K>& in,
|
||||
const functional::Array<int, K>& indices) {
|
||||
return FApply<K, Functor>::apply(functor, in, indices);
|
||||
}
|
||||
|
||||
template <size_t K, class Functor>
|
||||
__HDI__ float apply(Functor functor,
|
||||
functional::Array<functional::Tensor<float>, K>& in,
|
||||
int index) {
|
||||
__HDI__ float apply(Functor functor,
|
||||
functional::Array<functional::Tensor<float>, K>& in,
|
||||
int index) {
|
||||
return FApply<K, Functor>::apply(functor, in, index);
|
||||
}
|
||||
|
||||
@ -96,11 +100,12 @@ template <size_t K, class Functor>
template <size_t n, size_t N, size_t K>
struct Loop {
template <class Functor>
__HDI__ static float result(Functor functor,
functional::Array<functional::Tensor<float>, K>& in,
const functional::Array<int, K>& pAcc,
const functional::Array<int, N>& length,
const functional::Array<int, N>& dim) {
__HDI__ static float result(
Functor functor,
functional::Array<functional::Tensor<float>, K>& in,
const functional::Array<int, K>& pAcc,
const functional::Array<int, N>& length,
const functional::Array<int, N>& dim) {
float sum = 0;
functional::Array<int, K> acc;
for(int i = 0; i < length[N - n]; ++i) {
@ -116,11 +121,12 @@ struct Loop {
template <size_t N, size_t K>
struct Loop<1, N, K> {
template <class Functor>
__HDI__ static float result(Functor functor,
functional::Array<functional::Tensor<float>, K>& in,
const functional::Array<int, K>& pAcc,
const functional::Array<int, N>& length,
const functional::Array<int, N>& dim) {
__HDI__ static float result(
Functor functor,
functional::Array<functional::Tensor<float>, K>& in,
const functional::Array<int, K>& pAcc,
const functional::Array<int, N>& length,
const functional::Array<int, N>& dim) {
float sum = 0;
functional::Array<int, K> acc;
for(int i = 0; i < length[N - 1]; ++i) {
@ -141,6 +147,5 @@ __HDI__ float loops(Functor functor,
functional::Array<int, K> acc = {0};
return Loop<N, N, K>::result(functor, in, acc, length, dim);
}

}
}
}
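The FApply and Loop templates above implement elementwise application of a functor over K input tensors, with the arity resolved at compile time through one FApply specialization per K. A minimal standalone sketch of the same dispatch pattern follows; it uses std::array and raw float pointers instead of functional::Tensor, and all names here are illustrative only, not the marian API.

#include <array>
#include <cstdio>

// Simplified analogue of FApply<K, Functor>: one specialization per arity,
// each forwarding element `index` of every input to the functor.
template <size_t K, class Functor>
struct FApply;

template <class Functor>
struct FApply<2, Functor> {
  static float apply(Functor f, std::array<const float*, 2>& in, int index) {
    return f(in[0][index], in[1][index]);
  }
};

template <class Functor>
struct FApply<3, Functor> {
  static float apply(Functor f, std::array<const float*, 3>& in, int index) {
    return f(in[0][index], in[1][index], in[2][index]);
  }
};

// Free function that dispatches on K at compile time, as in the header above.
template <size_t K, class Functor>
float apply(Functor f, std::array<const float*, K>& in, int index) {
  return FApply<K, Functor>::apply(f, in, index);
}

int main() {
  float a[] = {1.f, 2.f, 3.f};
  float b[] = {10.f, 20.f, 30.f};
  std::array<const float*, 2> in = {a, b};
  auto add = [](float x, float y) { return x + y; };
  for(int i = 0; i < 3; ++i)
    std::printf("%f\n", apply<2>(add, in, i));  // prints 11, 22, 33
  return 0;
}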
@ -1,5 +1,5 @@
#include <sstream>
#include "graph/expression_graph.h"
#include <sstream>

#include "tensors/tensor_operators.h"

@ -18,15 +18,12 @@ void ExpressionGraph::setDevice(DeviceId deviceId) {
}

Expr ExpressionGraph::dropout(float prob, const Shape& shape) {
return Expression<ConstantNode>(shared_from_this(),
shape,
[prob, this](Tensor t) {
Dropout(t, prob);
});
return Expression<ConstantNode>(
shared_from_this(), shape, [prob, this](Tensor t) { Dropout(t, prob); });
}

void ExpressionGraph::checkNan(Tensor t) {
ABORT_IF(throwNaN_, "Not implemented");
//ABORT_IF(throwNaN_ && IsNan(t), "Tensor has NaN");
// ABORT_IF(throwNaN_ && IsNan(t), "Tensor has NaN");
}
}
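The dropout change above keeps the existing idea: a constant node is built with a NodeInitializer callback that fills its tensor (here via Dropout) when the node is first used. Below is a stripped-down standalone sketch of that lazy-initialization pattern, assuming a plain std::vector in place of Tensor; the class and function names are made up for illustration and are not the marian classes.

#include <functional>
#include <iostream>
#include <vector>

using Tensor = std::vector<float>;
using NodeInitializer = std::function<void(Tensor&)>;

// Simplified constant node: remembers its initializer and runs it lazily.
class ConstantNode {
  Tensor value_;
  NodeInitializer init_;
  bool initialized_ = false;

public:
  ConstantNode(size_t size, NodeInitializer init)
      : value_(size), init_(std::move(init)) {}

  const Tensor& val() {
    if(!initialized_) {  // initialize on first access, as graph nodes do
      init_(value_);
      initialized_ = true;
    }
    return value_;
  }
};

int main() {
  float prob = 0.5f;
  // Analogue of: Expression<ConstantNode>(graph, shape,
  //                 [prob](Tensor t) { Dropout(t, prob); });
  ConstantNode mask(4, [prob](Tensor& t) {
    for(auto& v : t)
      v = prob;  // stand-in for a real dropout fill
  });
  for(float v : mask.val())
    std::cout << v << " ";
  std::cout << "\n";
  return 0;
}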
@ -215,7 +215,9 @@ public:
ABORT_IF(shape != p->shape(),
"Requested shape {} for existing parameter '{}' does not match "
"original shape {}",
shape, name, p->shape());
shape,
name,
p->shape());

p->setTrainable(!fixed);
add(p);
@ -239,10 +241,8 @@ public:
return p;
}

Expr constant(const Shape& shape,
const NodeInitializer& init) {
return Expression<ConstantNode>(
shared_from_this(), shape, init);
Expr constant(const Shape& shape, const NodeInitializer& init) {
return Expression<ConstantNode>(shared_from_this(), shape, init);
}

Expr ones(const Shape& shape) {
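The parameter hunk above shows the reuse path of ExpressionGraph::param: an existing parameter is returned by name, but only if the requested shape matches, otherwise the call aborts. A toy stand-in for that check, with invented names and a std::map registry, might look as follows.

#include <cassert>
#include <map>
#include <memory>
#include <string>
#include <vector>

// Toy stand-in for the shape check done when a named parameter already exists.
struct Param {
  std::vector<int> shape;
  bool trainable = true;
};

class ParamStore {
  std::map<std::string, std::shared_ptr<Param>> params_;

public:
  std::shared_ptr<Param> param(const std::string& name,
                               const std::vector<int>& shape,
                               bool fixed = false) {
    auto it = params_.find(name);
    if(it != params_.end()) {
      // Same role as ABORT_IF(shape != p->shape(), ...) in the graph code.
      assert(it->second->shape == shape && "shape mismatch for existing parameter");
      it->second->trainable = !fixed;
      return it->second;
    }
    auto p = std::make_shared<Param>();
    p->shape = shape;
    p->trainable = !fixed;
    params_[name] = p;
    return p;
  }
};

int main() {
  ParamStore graph;
  auto w1 = graph.param("Wemb", {32000, 512});
  auto w2 = graph.param("Wemb", {32000, 512});  // reused, same object
  return w1 == w2 ? 0 : 1;
}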
@ -126,7 +126,6 @@ Expr repeat(Expr a, size_t repeats, keywords::axis_k ax) {
|
||||
return concatenate(std::vector<Expr>(repeats, a), ax);
|
||||
}
|
||||
|
||||
|
||||
Expr reshape(Expr a, Shape shape) {
|
||||
return Expression<ReshapeNodeOp>(a, shape);
|
||||
}
|
||||
@ -165,10 +164,7 @@ Expr flatten(Expr a) {
|
||||
}
|
||||
|
||||
Expr flatten_2d(Expr a) {
|
||||
Shape shape = {
|
||||
a->shape().elements() / a->shape()[-1],
|
||||
a->shape()[-1]
|
||||
};
|
||||
Shape shape = {a->shape().elements() / a->shape()[-1], a->shape()[-1]};
|
||||
|
||||
return Expression<ReshapeNodeOp>(a, shape);
|
||||
}
|
||||
@ -232,17 +228,16 @@ Expr step(Expr a, int step, int axis) {
|
||||
}
|
||||
|
||||
Expr cross_entropy(Expr a, Expr b) {
|
||||
//auto sOrig = a->shape();
|
||||
//auto sOut = a->shape();
|
||||
//Shape sTemp({sOrig[0] * sOrig[2] * sOrig[3], sOrig[1], 1, 1});
|
||||
//sOut.set(1, 1);
|
||||
//return reshape(Expression<CrossEntropyNodeOp>(reshape(a, sTemp), b), sOut);
|
||||
// auto sOrig = a->shape();
|
||||
// auto sOut = a->shape();
|
||||
// Shape sTemp({sOrig[0] * sOrig[2] * sOrig[3], sOrig[1], 1, 1});
|
||||
// sOut.set(1, 1);
|
||||
// return reshape(Expression<CrossEntropyNodeOp>(reshape(a, sTemp), b), sOut);
|
||||
|
||||
return Expression<CrossEntropyNodeOp>(a, b);
|
||||
}
|
||||
|
||||
Expr affine(Expr a, Expr b, Expr c,
|
||||
bool transA, bool transB, float scalar) {
|
||||
Expr affine(Expr a, Expr b, Expr c, bool transA, bool transB, float scalar) {
|
||||
std::vector<Expr> nodes = {a, b, c};
|
||||
return Expression<AffineNodeOp>(nodes, transA, transB, scalar);
|
||||
}
|
||||
@ -299,6 +294,7 @@ Expr highway(Expr y, Expr x, Expr t) {
|
||||
}
|
||||
|
||||
Expr highway(const std::string prefix, Expr x) {
|
||||
// clang-format off
|
||||
size_t outDim = x->shape()[-1];
|
||||
auto g = mlp::dense(x->graph())
|
||||
("prefix", prefix + "_highway_d1")
|
||||
@ -311,6 +307,7 @@ Expr highway(const std::string prefix, Expr x) {
|
||||
("activation", mlp::act::ReLU)
|
||||
.construct()->apply(x);
|
||||
return (g * relued) + ((1 - g) * x);
|
||||
// clang-format on
|
||||
}
|
||||
|
||||
// Expr batch_norm(Expr x, Expr gamma, Expr beta) {
|
||||
@ -334,41 +331,26 @@ Expr shift(Expr a, Shape shift) {
|
||||
|
||||
#ifdef CUDA_FOUND
|
||||
|
||||
Expr avg_pooling(
|
||||
Expr x,
|
||||
int height,
|
||||
int width,
|
||||
int padHeight,
|
||||
int padWidth,
|
||||
int strideHeight,
|
||||
int strideWidth) {
|
||||
return Expression<PoolingOp>(x,
|
||||
height,
|
||||
width,
|
||||
padHeight,
|
||||
padWidth,
|
||||
strideHeight,
|
||||
strideWidth,
|
||||
"avg");
|
||||
Expr avg_pooling(Expr x,
|
||||
int height,
|
||||
int width,
|
||||
int padHeight,
|
||||
int padWidth,
|
||||
int strideHeight,
|
||||
int strideWidth) {
|
||||
return Expression<PoolingOp>(
|
||||
x, height, width, padHeight, padWidth, strideHeight, strideWidth, "avg");
|
||||
}
|
||||
|
||||
Expr max_pooling(
|
||||
Expr x,
|
||||
int height,
|
||||
int width,
|
||||
int padHeight,
|
||||
int padWidth,
|
||||
int strideHeight,
|
||||
int strideWidth)
|
||||
{
|
||||
return Expression<PoolingOp>(x,
|
||||
height,
|
||||
width,
|
||||
padHeight,
|
||||
padWidth,
|
||||
strideHeight,
|
||||
strideWidth,
|
||||
"max");
|
||||
Expr max_pooling(Expr x,
|
||||
int height,
|
||||
int width,
|
||||
int padHeight,
|
||||
int padWidth,
|
||||
int strideHeight,
|
||||
int strideWidth) {
|
||||
return Expression<PoolingOp>(
|
||||
x, height, width, padHeight, padWidth, strideHeight, strideWidth, "max");
|
||||
}
|
||||
|
||||
Expr convert2cudnnFormat(Expr x) {
|
||||
@ -377,13 +359,13 @@ Expr convert2cudnnFormat(Expr x) {
|
||||
int embSize = x->shape()[2];
|
||||
|
||||
std::vector<size_t> newIndeces;
|
||||
for (int b = 0; b < numExamples; ++b) {
|
||||
for (int t = 0; t < numWords; ++t) {
|
||||
for(int b = 0; b < numExamples; ++b) {
|
||||
for(int t = 0; t < numWords; ++t) {
|
||||
newIndeces.push_back((t * numExamples) + b);
|
||||
}
|
||||
}
|
||||
|
||||
auto xRows = reshape(x, {x->shape()[0] * x ->shape()[1], x->shape()[2]});
|
||||
auto xRows = reshape(x, {x->shape()[0] * x->shape()[1], x->shape()[2]});
|
||||
|
||||
Shape outShape({numExamples, 1, numWords, embSize});
|
||||
return reshape(rows(xRows, newIndeces), outShape);
|
||||
@ -397,8 +379,8 @@ Expr convertFromcudnnFormat(Expr x) {
|
||||
auto reshapedX = reshape(x, {batchDim * sentenceDim, embSize});
|
||||
|
||||
std::vector<size_t> newIndeces;
|
||||
for (int t = 0; t < sentenceDim; ++t) {
|
||||
for (int b = 0; b < batchDim; ++b) {
|
||||
for(int t = 0; t < sentenceDim; ++t) {
|
||||
for(int b = 0; b < batchDim; ++b) {
|
||||
newIndeces.push_back(b * sentenceDim + t);
|
||||
}
|
||||
}
|
||||
@ -412,5 +394,4 @@ Expr pooling_with_masking(Expr x, Expr mask, int width, bool isEven) {
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
}
|
||||
|
@ -110,7 +110,6 @@ Expr mean(Expr a, keywords::axis_k ax = 0);
|
||||
|
||||
Expr cross_entropy(Expr a, Expr b);
|
||||
|
||||
|
||||
Expr scalar_product(Expr a, Expr b, keywords::axis_k ax = 0);
|
||||
|
||||
Expr weighted_average(Expr in, Expr weights, keywords::axis_k ax = 0);
|
||||
@ -161,6 +160,5 @@ Expr max_pooling(Expr x,
|
||||
int strideHeight = 1,
|
||||
int strideWidth = 1);
|
||||
|
||||
Expr pooling_with_masking(Expr x, Expr mask, int width, bool isEven=false);
|
||||
|
||||
Expr pooling_with_masking(Expr x, Expr mask, int width, bool isEven = false);
|
||||
}
|
||||
|
@ -1,4 +1,5 @@
|
||||
#include "tensors/backend.h"
|
||||
|
||||
#include "graph/expression_graph.h"
|
||||
#include "graph/node.h"
|
||||
|
||||
|
@ -33,8 +33,7 @@ protected:
|
||||
|
||||
public:
|
||||
Node(Ptr<ExpressionGraph> graph, Shape shape)
|
||||
: graph_(graph),
|
||||
shape_(shape) {}
|
||||
: graph_(graph), shape_(shape) {}
|
||||
|
||||
virtual ~Node() {
|
||||
if(destroy_) {
|
||||
@ -152,7 +151,7 @@ struct NaryNodeOp : public Node {
|
||||
}
|
||||
|
||||
NaryNodeOp(const std::vector<Expr>& nodes)
|
||||
: NaryNodeOp(nodes, nodes[0]->shape()) {}
|
||||
: NaryNodeOp(nodes, nodes[0]->shape()) {}
|
||||
|
||||
virtual ~NaryNodeOp() {}
|
||||
|
||||
|
@ -109,9 +109,8 @@ void ortho(Tensor t) {
|
||||
|
||||
NodeInitializer from_vector(const std::vector<float>& v) {
|
||||
auto vPtr = New<std::vector<float>>(v.begin(), v.end());
|
||||
return [vPtr](Tensor t) {
|
||||
t->set(vPtr->data(), vPtr->data() + vPtr->size());
|
||||
};
|
||||
return
|
||||
[vPtr](Tensor t) { t->set(vPtr->data(), vPtr->data() + vPtr->size()); };
|
||||
}
|
||||
|
||||
NodeInitializer from_vector(const std::vector<size_t>& v) {
|
||||
@ -138,9 +137,9 @@ NodeInitializer from_numpy(const cnpy::NpyArrayPtr& np) {
|
||||
|
||||
// move this somewhere else
|
||||
NodeInitializer from_word2vec(const std::string& file,
|
||||
int dimVoc,
|
||||
int dimEmb,
|
||||
bool normalize /*= false*/) {
|
||||
int dimVoc,
|
||||
int dimEmb,
|
||||
bool normalize /*= false*/) {
|
||||
return [file, dimVoc, dimEmb, normalize](Tensor t) {
|
||||
auto embs = Word2VecReader().read(file, dimVoc, dimEmb);
|
||||
|
||||
|
@ -70,9 +70,9 @@ NodeInitializer from_sparse_vector(
|
||||
NodeInitializer from_numpy(const cnpy::NpyArrayPtr& np);
|
||||
|
||||
NodeInitializer from_word2vec(const std::string& file,
|
||||
int dimVoc,
|
||||
int dimEmb,
|
||||
bool normalize = false);
|
||||
int dimVoc,
|
||||
int dimEmb,
|
||||
bool normalize = false);
|
||||
}
|
||||
|
||||
} // namespace marian
|
||||
|
@ -7,11 +7,12 @@
|
||||
namespace marian {
|
||||
|
||||
struct ConstantNode : public Node {
|
||||
ConstantNode(Ptr<ExpressionGraph> graph, const Shape& shape, const NodeInitializer& init)
|
||||
ConstantNode(Ptr<ExpressionGraph> graph,
|
||||
const Shape& shape,
|
||||
const NodeInitializer& init)
|
||||
: Node(graph, shape),
|
||||
init_(new NodeInitializer(init)),
|
||||
initialized_(false) {
|
||||
|
||||
setTrainable(false);
|
||||
}
|
||||
|
||||
@ -41,11 +42,13 @@ private:
|
||||
};
|
||||
|
||||
struct ParamNode : public Node {
|
||||
ParamNode(Ptr<ExpressionGraph> graph, const Shape& shape, const NodeInitializer& init, bool fixed = false)
|
||||
ParamNode(Ptr<ExpressionGraph> graph,
|
||||
const Shape& shape,
|
||||
const NodeInitializer& init,
|
||||
bool fixed = false)
|
||||
: Node(graph, shape),
|
||||
init_(new NodeInitializer(init)),
|
||||
initialized_(false) {
|
||||
|
||||
setTrainable(!fixed);
|
||||
}
|
||||
|
||||
|
@ -16,13 +16,8 @@ private:
|
||||
float scalar_;
|
||||
|
||||
public:
|
||||
DotNodeOp(Expr a,
|
||||
Expr b,
|
||||
bool transA,
|
||||
bool transB,
|
||||
float scalar)
|
||||
: NaryNodeOp({a, b},
|
||||
newShape(a, b, transA, transB)),
|
||||
DotNodeOp(Expr a, Expr b, bool transA, bool transB, float scalar)
|
||||
: NaryNodeOp({a, b}, newShape(a, b, transA, transB)),
|
||||
transA_(transA),
|
||||
transB_(transB),
|
||||
scalar_(scalar) {}
|
||||
@ -49,14 +44,13 @@ public:
|
||||
|
||||
NodeOps forwardOps() {
|
||||
// C = alpha * dot(op(A), op(B))
|
||||
return {NodeOp(Prod(
|
||||
val_,
|
||||
child(0)->val(),
|
||||
child(1)->val(),
|
||||
transA_,
|
||||
transB_,
|
||||
0.f,
|
||||
scalar_))};
|
||||
return {NodeOp(Prod(val_,
|
||||
child(0)->val(),
|
||||
child(1)->val(),
|
||||
transA_,
|
||||
transB_,
|
||||
0.f,
|
||||
scalar_))};
|
||||
}
|
||||
|
||||
NodeOps backwardOps() {
|
||||
@ -149,7 +143,7 @@ public:
|
||||
: NaryNodeOp(nodes, newShape(nodes[0], nodes[1], transA, transB)),
|
||||
transA_(transA),
|
||||
transB_(transB),
|
||||
scalar_(scalar){}
|
||||
scalar_(scalar) {}
|
||||
|
||||
Shape newShape(Expr a, Expr b, bool transA, bool transB) {
|
||||
auto shapeA = a->shape();
|
||||
@ -171,19 +165,17 @@ public:
|
||||
return outShape;
|
||||
}
|
||||
|
||||
|
||||
NodeOps forwardOps() {
|
||||
using namespace functional;
|
||||
return {
|
||||
NodeOp(Prod(
|
||||
val_,
|
||||
child(0)->val(),
|
||||
child(1)->val(),
|
||||
transA_,
|
||||
transB_,
|
||||
0.f,
|
||||
scalar_);
|
||||
Add(_1, val_, child(2)->val()))
|
||||
NodeOp(Prod(val_,
|
||||
child(0)->val(),
|
||||
child(1)->val(),
|
||||
transA_,
|
||||
transB_,
|
||||
0.f,
|
||||
scalar_);
|
||||
Add(_1, val_, child(2)->val()))
|
||||
};
|
||||
}
|
||||
|
||||
@ -266,7 +258,6 @@ public:
|
||||
const std::string type() { return "affine"; }
|
||||
};
|
||||
|
||||
|
||||
class DotBatchedNodeOp : public NaryNodeOp {
|
||||
private:
|
||||
bool transA_;
|
||||
@ -274,13 +265,8 @@ private:
|
||||
float scalar_;
|
||||
|
||||
public:
|
||||
DotBatchedNodeOp(Expr a,
|
||||
Expr b,
|
||||
bool transA,
|
||||
bool transB,
|
||||
float scalar)
|
||||
: NaryNodeOp({a, b},
|
||||
newShape(a, b, transA, transB)),
|
||||
DotBatchedNodeOp(Expr a, Expr b, bool transA, bool transB, float scalar)
|
||||
: NaryNodeOp({a, b}, newShape(a, b, transA, transB)),
|
||||
transA_(transA),
|
||||
transB_(transB),
|
||||
scalar_(scalar) {}
|
||||
@ -307,14 +293,13 @@ public:
|
||||
|
||||
NodeOps forwardOps() {
|
||||
// C = alpha * dot(op(A), op(B))
|
||||
return {NodeOp(ProdBatched(
|
||||
val_,
|
||||
child(0)->val(),
|
||||
child(1)->val(),
|
||||
transA_,
|
||||
transB_,
|
||||
0.f,
|
||||
scalar_))};
|
||||
return {NodeOp(ProdBatched(val_,
|
||||
child(0)->val(),
|
||||
child(1)->val(),
|
||||
transA_,
|
||||
transB_,
|
||||
0.f,
|
||||
scalar_))};
|
||||
}
|
||||
|
||||
NodeOps backwardOps() {
|
||||
@ -325,71 +310,67 @@ public:
|
||||
// to sum gradients from different graph parts
|
||||
|
||||
if(!transA_ && transB_)
|
||||
return {
|
||||
NodeOp(ProdBatched(child(0)->grad(),
|
||||
adj_,
|
||||
child(1)->val(),
|
||||
false,
|
||||
false,
|
||||
1.0,
|
||||
scalar_)),
|
||||
NodeOp(ProdBatched(child(1)->grad(),
|
||||
adj_,
|
||||
child(0)->val(),
|
||||
true,
|
||||
false,
|
||||
1.0,
|
||||
scalar_))};
|
||||
return {NodeOp(ProdBatched(child(0)->grad(),
|
||||
adj_,
|
||||
child(1)->val(),
|
||||
false,
|
||||
false,
|
||||
1.0,
|
||||
scalar_)),
|
||||
NodeOp(ProdBatched(child(1)->grad(),
|
||||
adj_,
|
||||
child(0)->val(),
|
||||
true,
|
||||
false,
|
||||
1.0,
|
||||
scalar_))};
|
||||
|
||||
if(transA_ && !transB_)
|
||||
return {
|
||||
NodeOp(ProdBatched(child(0)->grad(),
|
||||
child(1)->val(),
|
||||
adj_,
|
||||
false,
|
||||
true,
|
||||
1.0,
|
||||
scalar_)),
|
||||
NodeOp(ProdBatched(child(1)->grad(),
|
||||
child(0)->val(),
|
||||
adj_,
|
||||
false,
|
||||
false,
|
||||
1.0,
|
||||
scalar_))};
|
||||
return {NodeOp(ProdBatched(child(0)->grad(),
|
||||
child(1)->val(),
|
||||
adj_,
|
||||
false,
|
||||
true,
|
||||
1.0,
|
||||
scalar_)),
|
||||
NodeOp(ProdBatched(child(1)->grad(),
|
||||
child(0)->val(),
|
||||
adj_,
|
||||
false,
|
||||
false,
|
||||
1.0,
|
||||
scalar_))};
|
||||
|
||||
if(transA_ && transB_)
|
||||
return {
|
||||
NodeOp(ProdBatched(child(0)->grad(),
|
||||
child(1)->val(),
|
||||
adj_,
|
||||
true,
|
||||
true,
|
||||
1.0,
|
||||
scalar_)),
|
||||
NodeOp(ProdBatched(child(1)->grad(),
|
||||
adj_,
|
||||
child(0)->val(),
|
||||
true,
|
||||
true,
|
||||
1.0,
|
||||
scalar_))};
|
||||
return {NodeOp(ProdBatched(child(0)->grad(),
|
||||
child(1)->val(),
|
||||
adj_,
|
||||
true,
|
||||
true,
|
||||
1.0,
|
||||
scalar_)),
|
||||
NodeOp(ProdBatched(child(1)->grad(),
|
||||
adj_,
|
||||
child(0)->val(),
|
||||
true,
|
||||
true,
|
||||
1.0,
|
||||
scalar_))};
|
||||
|
||||
return {
|
||||
NodeOp(ProdBatched(child(0)->grad(),
|
||||
adj_,
|
||||
child(1)->val(),
|
||||
false,
|
||||
true,
|
||||
1.0,
|
||||
scalar_)),
|
||||
NodeOp(ProdBatched(child(1)->grad(),
|
||||
child(0)->val(),
|
||||
adj_,
|
||||
true,
|
||||
false,
|
||||
1.0,
|
||||
scalar_))};
|
||||
return {NodeOp(ProdBatched(child(0)->grad(),
|
||||
adj_,
|
||||
child(1)->val(),
|
||||
false,
|
||||
true,
|
||||
1.0,
|
||||
scalar_)),
|
||||
NodeOp(ProdBatched(child(1)->grad(),
|
||||
child(0)->val(),
|
||||
adj_,
|
||||
true,
|
||||
false,
|
||||
1.0,
|
||||
scalar_))};
|
||||
}
|
||||
|
||||
const std::string type() { return "•"; }
|
||||
@ -400,8 +381,7 @@ public:
|
||||
struct ScalarProductNodeOp : public NaryNodeOp {
|
||||
template <typename... Args>
|
||||
ScalarProductNodeOp(Expr a, Expr b, Args... args)
|
||||
: NaryNodeOp({a, b}, newShape(a, b, args...)) {
|
||||
}
|
||||
: NaryNodeOp({a, b}, newShape(a, b, args...)) {}
|
||||
|
||||
template <typename... Args>
|
||||
Shape newShape(Expr a, Expr b, Args... args) {
|
||||
@ -433,12 +413,9 @@ struct ScalarProductNodeOp : public NaryNodeOp {
|
||||
};
|
||||
|
||||
struct ElementBinaryNodeOp : public NaryNodeOp {
|
||||
ElementBinaryNodeOp(Expr a, Expr b)
|
||||
: NaryNodeOp({a, b}, newShape(a, b)) {}
|
||||
ElementBinaryNodeOp(Expr a, Expr b) : NaryNodeOp({a, b}, newShape(a, b)) {}
|
||||
|
||||
Shape newShape(Expr a, Expr b) {
|
||||
return Shape::broadcast({a, b});
|
||||
}
|
||||
Shape newShape(Expr a, Expr b) { return Shape::broadcast({a, b}); }
|
||||
|
||||
const std::string color() { return "yellow"; }
|
||||
};
|
||||
@ -553,8 +530,7 @@ struct DivNodeOp : public ElementBinaryNodeOp {
|
||||
|
||||
// Cross-entropy node. It computes -b*log(softmax(a)), summing rowwise.
|
||||
struct CrossEntropyNodeOp : public NaryNodeOp {
|
||||
CrossEntropyNodeOp(Expr a, Expr b)
|
||||
: NaryNodeOp({a, b}, newShape(a)) {}
|
||||
CrossEntropyNodeOp(Expr a, Expr b) : NaryNodeOp({a, b}, newShape(a)) {}
|
||||
|
||||
Shape newShape(Expr a) {
|
||||
Shape shape1 = a->shape();
|
||||
@ -578,7 +554,9 @@ struct CrossEntropyNodeOp : public NaryNodeOp {
|
||||
struct ConcatenateNodeOp : public NaryNodeOp {
|
||||
template <typename... Args>
|
||||
ConcatenateNodeOp(const std::vector<Expr>& nodes, Args... args)
|
||||
: NaryNodeOp(nodes, newShape(nodes, keywords::Get(keywords::axis, 0, args...))) {}
|
||||
: NaryNodeOp(nodes,
|
||||
newShape(nodes, keywords::Get(keywords::axis, 0, args...))) {
|
||||
}
|
||||
|
||||
Shape newShape(const std::vector<Expr>& nodes, int ax) {
|
||||
Shape shape = nodes.back()->shape();
|
||||
@ -730,38 +708,33 @@ struct HighwayNodeOp : public NaryNodeOp {
|
||||
|
||||
class ConvolutionOp : public NaryNodeOp {
|
||||
public:
|
||||
ConvolutionOp(
|
||||
const std::vector<Expr>& nodes,
|
||||
int hPad = 0,
|
||||
int wPad = 0,
|
||||
int hStride = 1,
|
||||
int wStride = 1)
|
||||
: NaryNodeOp(nodes),
|
||||
conv_(nodes[1]->shape(),
|
||||
nodes[2]->shape(),
|
||||
hPad,
|
||||
wPad,
|
||||
hStride,
|
||||
wStride) {
|
||||
ConvolutionOp(const std::vector<Expr>& nodes,
|
||||
int hPad = 0,
|
||||
int wPad = 0,
|
||||
int hStride = 1,
|
||||
int wStride = 1)
|
||||
: NaryNodeOp(nodes),
|
||||
conv_(nodes[1]->shape(),
|
||||
nodes[2]->shape(),
|
||||
hPad,
|
||||
wPad,
|
||||
hStride,
|
||||
wStride) {
|
||||
conv_.getOutputShape(nodes[0]->shape(), shape_);
|
||||
}
|
||||
|
||||
NodeOps forwardOps() {
|
||||
return {NodeOp(conv_.forward(
|
||||
child(0)->val(),
|
||||
child(1)->val(),
|
||||
child(2)->val(),
|
||||
val_))};
|
||||
child(0)->val(), child(1)->val(), child(2)->val(), val_))};
|
||||
}
|
||||
|
||||
NodeOps backwardOps() {
|
||||
return {NodeOp(conv_.backward(
|
||||
child(0)->val(),
|
||||
child(0)->grad(),
|
||||
child(1)->val(),
|
||||
child(1)->grad(),
|
||||
child(2)->grad(),
|
||||
adj_))};
|
||||
return {NodeOp(conv_.backward(child(0)->val(),
|
||||
child(0)->grad(),
|
||||
child(1)->val(),
|
||||
child(1)->grad(),
|
||||
child(2)->grad(),
|
||||
adj_))};
|
||||
}
|
||||
|
||||
const std::string type() { return "layer_convolution"; }
|
||||
@ -769,5 +742,4 @@ public:
|
||||
protected:
|
||||
ConvolutionWrapper conv_;
|
||||
};
|
||||
|
||||
}
|
||||
|
@ -12,11 +12,9 @@
|
||||
namespace marian {
|
||||
|
||||
struct UnaryNodeOp : public NaryNodeOp {
|
||||
UnaryNodeOp(Expr a, Shape shape)
|
||||
: NaryNodeOp({a}, shape) {}
|
||||
UnaryNodeOp(Expr a, Shape shape) : NaryNodeOp({a}, shape) {}
|
||||
|
||||
UnaryNodeOp(Expr a)
|
||||
: NaryNodeOp({a}, a->shape()) {}
|
||||
UnaryNodeOp(Expr a) : NaryNodeOp({a}, a->shape()) {}
|
||||
|
||||
const std::string color() { return "yellow"; }
|
||||
};
|
||||
@ -26,9 +24,7 @@ private:
|
||||
float scalar_{0};
|
||||
|
||||
public:
|
||||
ScalarAddNodeOp(Expr a, float scalar)
|
||||
: UnaryNodeOp(a),
|
||||
scalar_{scalar} {}
|
||||
ScalarAddNodeOp(Expr a, float scalar) : UnaryNodeOp(a), scalar_{scalar} {}
|
||||
|
||||
NodeOps forwardOps() {
|
||||
using namespace functional;
|
||||
@ -67,8 +63,7 @@ private:
|
||||
float scalar_{0};
|
||||
|
||||
public:
|
||||
ScalarMultNodeOp(Expr a, float scalar)
|
||||
: UnaryNodeOp(a), scalar_{scalar} {}
|
||||
ScalarMultNodeOp(Expr a, float scalar) : UnaryNodeOp(a), scalar_{scalar} {}
|
||||
|
||||
NodeOps forwardOps() {
|
||||
using namespace functional;
|
||||
@ -210,7 +205,6 @@ struct TanhNodeOp : public NaryNodeOp {
|
||||
const std::string type() { return "tanh"; }
|
||||
};
|
||||
|
||||
|
||||
struct ReLUNodeOp : public UnaryNodeOp {
|
||||
ReLUNodeOp(Expr a) : UnaryNodeOp(a) {}
|
||||
|
||||
@ -262,8 +256,7 @@ struct ReLUNodeOp : public UnaryNodeOp {
|
||||
* \f]
|
||||
*/
|
||||
struct PReLUNodeOp : public UnaryNodeOp {
|
||||
PReLUNodeOp(float alpha, Expr a)
|
||||
: UnaryNodeOp(a), alpha_(alpha) {}
|
||||
PReLUNodeOp(float alpha, Expr a) : UnaryNodeOp(a), alpha_(alpha) {}
|
||||
|
||||
NodeOps forwardOps() {
|
||||
using namespace functional;
|
||||
@ -334,11 +327,9 @@ struct SwishNodeOp : public UnaryNodeOp {
|
||||
};
|
||||
|
||||
struct SoftmaxNodeOp : public UnaryNodeOp {
|
||||
SoftmaxNodeOp(Expr a)
|
||||
: UnaryNodeOp(a), mask_(nullptr) {}
|
||||
SoftmaxNodeOp(Expr a) : UnaryNodeOp(a), mask_(nullptr) {}
|
||||
|
||||
SoftmaxNodeOp(Expr a, Expr mask)
|
||||
: UnaryNodeOp(a), mask_(mask) {}
|
||||
SoftmaxNodeOp(Expr a, Expr mask) : UnaryNodeOp(a), mask_(mask) {}
|
||||
|
||||
Expr mask_;
|
||||
|
||||
@ -407,17 +398,18 @@ struct SumNodeOp : public UnaryNodeOp {
|
||||
int ax_;
|
||||
|
||||
template <typename... Args>
|
||||
SumNodeOp(Expr a, Args... args)
|
||||
: UnaryNodeOp(a, newShape(a, args...)) {}
|
||||
SumNodeOp(Expr a, Args... args) : UnaryNodeOp(a, newShape(a, args...)) {}
|
||||
|
||||
NodeOps forwardOps() {
|
||||
using namespace functional;
|
||||
|
||||
return {NodeOp(Reduce(_1, val_, child(0)->val()))}; }
|
||||
return {NodeOp(Reduce(_1, val_, child(0)->val()))};
|
||||
}
|
||||
|
||||
NodeOps backwardOps() {
|
||||
using namespace functional;
|
||||
return {NodeOp(Add(_1, child(0)->grad(), adj_))}; }
|
||||
return {NodeOp(Add(_1, child(0)->grad(), adj_))};
|
||||
}
|
||||
|
||||
template <class... Args>
|
||||
Shape newShape(Expr a, Args... args) {
|
||||
@ -456,8 +448,7 @@ struct MeanNodeOp : public UnaryNodeOp {
|
||||
int ax_;
|
||||
|
||||
template <typename... Args>
|
||||
MeanNodeOp(Expr a, Args... args)
|
||||
: UnaryNodeOp(a, newShape(a, args...)) {}
|
||||
MeanNodeOp(Expr a, Args... args) : UnaryNodeOp(a, newShape(a, args...)) {}
|
||||
|
||||
NodeOps forwardOps() {
|
||||
using namespace functional;
|
||||
@ -543,8 +534,7 @@ struct ExpNodeOp : public UnaryNodeOp {
|
||||
struct SqrtNodeOp : public UnaryNodeOp {
|
||||
float epsilon_;
|
||||
|
||||
SqrtNodeOp(Expr a, float epsilon)
|
||||
: UnaryNodeOp(a), epsilon_(epsilon) {}
|
||||
SqrtNodeOp(Expr a, float epsilon) : UnaryNodeOp(a), epsilon_(epsilon) {}
|
||||
|
||||
NodeOps forwardOps() {
|
||||
using namespace functional;
|
||||
@ -614,8 +604,7 @@ struct NegNodeOp : public UnaryNodeOp {
|
||||
|
||||
struct RowsNodeOp : public UnaryNodeOp {
|
||||
RowsNodeOp(Expr a, const std::vector<size_t>& indeces)
|
||||
: UnaryNodeOp(a, newShape(a, indeces)),
|
||||
indices_(indeces) {}
|
||||
: UnaryNodeOp(a, newShape(a, indeces)), indices_(indeces) {}
|
||||
|
||||
NodeOps forwardOps() {
|
||||
// @TODO: solve this with a tensor!
|
||||
@ -666,8 +655,7 @@ struct RowsNodeOp : public UnaryNodeOp {
|
||||
|
||||
struct ColsNodeOp : public UnaryNodeOp {
|
||||
ColsNodeOp(Expr a, const std::vector<size_t>& indeces)
|
||||
: UnaryNodeOp(a, newShape(a, indeces)),
|
||||
indices_(indeces) {}
|
||||
: UnaryNodeOp(a, newShape(a, indeces)), indices_(indeces) {}
|
||||
|
||||
NodeOps forwardOps() {
|
||||
// @TODO: solve this with a tensor!
|
||||
@ -716,8 +704,7 @@ struct ColsNodeOp : public UnaryNodeOp {
|
||||
|
||||
struct SelectNodeOp : public UnaryNodeOp {
|
||||
SelectNodeOp(Expr a, int axis, const std::vector<size_t>& indeces)
|
||||
: UnaryNodeOp(a, newShape(a, axis, indeces)),
|
||||
indices_(indeces) {}
|
||||
: UnaryNodeOp(a, newShape(a, axis, indeces)), indices_(indeces) {}
|
||||
|
||||
NodeOps forwardOps() {
|
||||
return {NodeOp(
|
||||
@ -772,8 +759,7 @@ struct TransposeNodeOp : public UnaryNodeOp {
|
||||
std::vector<int> axes_;
|
||||
|
||||
TransposeNodeOp(Expr a, const std::vector<int>& axes)
|
||||
: UnaryNodeOp(a, newShape(a, axes)),
|
||||
axes_{axes} {}
|
||||
: UnaryNodeOp(a, newShape(a, axes)), axes_{axes} {}
|
||||
|
||||
NodeOps forwardOps() {
|
||||
return {NodeOp(TransposeND(val_, child(0)->val(), axes_))};
|
||||
@ -788,7 +774,7 @@ struct TransposeNodeOp : public UnaryNodeOp {
|
||||
Shape shape = a->shape();
|
||||
|
||||
ABORT_IF(shape.size() != axes.size(),
|
||||
"Shape and transpose axes have different number of dimensions");
|
||||
"Shape and transpose axes have different number of dimensions");
|
||||
|
||||
for(int i = 0; i < shape.size(); ++i)
|
||||
shape.set(i, a->shape()[axes[i]]);
|
||||
@ -829,8 +815,7 @@ private:
|
||||
|
||||
public:
|
||||
template <typename... Args>
|
||||
ReshapeNodeOp(Expr a, Shape shape)
|
||||
: UnaryNodeOp(a, shape), reshapee_(a) {
|
||||
ReshapeNodeOp(Expr a, Shape shape) : UnaryNodeOp(a, shape), reshapee_(a) {
|
||||
Node::destroy_ = false;
|
||||
}
|
||||
|
||||
@ -894,9 +879,7 @@ private:
|
||||
|
||||
public:
|
||||
StepNodeOp(Expr a, int step, int axis)
|
||||
: UnaryNodeOp(a, newShape(a, axis)),
|
||||
stepNode_(a),
|
||||
step_(step) {
|
||||
: UnaryNodeOp(a, newShape(a, axis)), stepNode_(a), step_(step) {
|
||||
Node::destroy_ = false;
|
||||
}
|
||||
|
||||
@ -1056,67 +1039,54 @@ public:
|
||||
padWidth,
|
||||
strideHeight,
|
||||
strideWidth,
|
||||
mode) {
|
||||
}
|
||||
mode) {}
|
||||
|
||||
NodeOps forwardOps() {
|
||||
return {NodeOp(pooling_.forward(child(0)->val(), val_))};
|
||||
}
|
||||
|
||||
NodeOps backwardOps() {
|
||||
return {NodeOp(pooling_.backward(
|
||||
child(0)->val(),
|
||||
child(0)->grad(),
|
||||
val_,
|
||||
adj_))};
|
||||
return {NodeOp(
|
||||
pooling_.backward(child(0)->val(), child(0)->grad(), val_, adj_))};
|
||||
}
|
||||
|
||||
const std::string type() { return "layer_pooling"; }
|
||||
|
||||
|
||||
protected:
|
||||
PoolingWrapper pooling_;
|
||||
};
|
||||
|
||||
class PoolingWithMaskingOp : public UnaryNodeOp {
|
||||
public:
|
||||
PoolingWithMaskingOp( Expr x, Expr mask, int width, bool isEven=false)
|
||||
: UnaryNodeOp(x),
|
||||
mask_(mask),
|
||||
width_(width),
|
||||
isEven_(isEven)
|
||||
{
|
||||
auto xShape = x->shape();
|
||||
int dimBatch = xShape[0];
|
||||
int dimWord = xShape[1];
|
||||
int cols = (isEven_) ? xShape[2] - 1 : xShape[2];
|
||||
int dimSentence = (cols / width_) + (cols % width_ != 0);
|
||||
shape_ = {dimBatch, dimWord, dimSentence};
|
||||
}
|
||||
public:
|
||||
PoolingWithMaskingOp(Expr x, Expr mask, int width, bool isEven = false)
|
||||
: UnaryNodeOp(x), mask_(mask), width_(width), isEven_(isEven) {
|
||||
auto xShape = x->shape();
|
||||
int dimBatch = xShape[0];
|
||||
int dimWord = xShape[1];
|
||||
int cols = (isEven_) ? xShape[2] - 1 : xShape[2];
|
||||
int dimSentence = (cols / width_) + (cols % width_ != 0);
|
||||
shape_ = {dimBatch, dimWord, dimSentence};
|
||||
}
|
||||
|
||||
NodeOps forwardOps() {
|
||||
return {NodeOp(PoolingWithMaskingForward(val_,
|
||||
NodeOps forwardOps() {
|
||||
return {NodeOp(PoolingWithMaskingForward(
|
||||
val_, child(0)->val(), mask_->val(), width_, isEven_))};
|
||||
}
|
||||
|
||||
NodeOps backwardOps() {
|
||||
return {NodeOp(PoolingWithMaskingBackward(adj_,
|
||||
child(0)->grad(),
|
||||
child(0)->val(),
|
||||
mask_->val(),
|
||||
width_,
|
||||
isEven_))};
|
||||
}
|
||||
}
|
||||
|
||||
NodeOps backwardOps() {
|
||||
return {NodeOp(PoolingWithMaskingBackward(adj_,
|
||||
child(0)->grad(),
|
||||
child(0)->val(),
|
||||
mask_->val(),
|
||||
width_,
|
||||
isEven_))};
|
||||
}
|
||||
const std::string type() { return "layer_pooling"; }
|
||||
|
||||
const std::string type() {return "layer_pooling";}
|
||||
|
||||
protected:
|
||||
Expr mask_;
|
||||
int width_;
|
||||
bool isEven_;
|
||||
protected:
|
||||
Expr mask_;
|
||||
int width_;
|
||||
bool isEven_;
|
||||
};
|
||||
|
||||
}
|
||||
|
@ -23,18 +23,18 @@ struct LayerFactory : public Factory {
|
||||
return as<Cast>() != nullptr;
|
||||
}
|
||||
|
||||
virtual Ptr<Layer> construct() = 0;
|
||||
virtual Ptr<Layer> construct() = 0;
|
||||
};
|
||||
|
||||
class DenseFactory : public LayerFactory {
|
||||
protected:
|
||||
//std::vector<std::pair<std::string, std::string>> tiedParams_;
|
||||
// std::vector<std::pair<std::string, std::string>> tiedParams_;
|
||||
std::vector<std::pair<std::string, std::string>> tiedParamsTransposed_;
|
||||
|
||||
public:
|
||||
DenseFactory(Ptr<ExpressionGraph> graph) : LayerFactory(graph) {}
|
||||
|
||||
//Accumulator<DenseFactory> tie(const std::string& param,
|
||||
// Accumulator<DenseFactory> tie(const std::string& param,
|
||||
// const std::string& tied) {
|
||||
// tiedParams_.push_back({param, tied});
|
||||
// return Accumulator<DenseFactory>(*this);
|
||||
@ -48,21 +48,20 @@ public:
|
||||
|
||||
Ptr<Layer> construct() {
|
||||
auto dense = New<Dense>(graph_, options_);
|
||||
//for(auto& p : tiedParams_)
|
||||
// for(auto& p : tiedParams_)
|
||||
// dense->tie(p.first, p.second);
|
||||
for(auto& p : tiedParamsTransposed_)
|
||||
dense->tie_transposed(p.first, p.second);
|
||||
return dense;
|
||||
}
|
||||
|
||||
|
||||
DenseFactory clone() {
|
||||
DenseFactory aClone(graph_);
|
||||
aClone.options_->merge(options_);
|
||||
//aClone.tiedParams_ = tiedParams_;
|
||||
// aClone.tiedParams_ = tiedParams_;
|
||||
aClone.tiedParamsTransposed_ = tiedParamsTransposed_;
|
||||
return aClone;
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
typedef Accumulator<DenseFactory> dense;
|
||||
@ -95,7 +94,6 @@ public:
|
||||
}
|
||||
|
||||
void push_back(Ptr<Layer> layer) { layers_.push_back(layer); }
|
||||
|
||||
};
|
||||
|
||||
class MLPFactory : public Factory {
|
||||
@ -121,7 +119,7 @@ public:
|
||||
layers_.push_back(New<LF>(lf));
|
||||
return Accumulator<MLPFactory>(*this);
|
||||
}
|
||||
|
||||
|
||||
MLPFactory clone() {
|
||||
MLPFactory aClone(graph_);
|
||||
aClone.options_->merge(options_);
|
||||
@ -129,7 +127,6 @@ public:
|
||||
aClone.push_back(lf->clone());
|
||||
return aClone;
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
typedef Accumulator<MLPFactory> mlp;
|
||||
|
@ -2,8 +2,7 @@
|
||||
#include "graph/node_operators_binary.h"
|
||||
|
||||
namespace marian {
|
||||
Convolution::Convolution(Ptr<ExpressionGraph> graph)
|
||||
: Factory(graph) {}
|
||||
Convolution::Convolution(Ptr<ExpressionGraph> graph) : Factory(graph) {}
|
||||
|
||||
Expr Convolution::apply(Expr x) {
|
||||
auto prefix = opt<std::string>("prefix");
|
||||
@ -13,28 +12,21 @@ Expr Convolution::apply(Expr x) {
|
||||
auto strides = opt<std::pair<int, int>>("strides", std::make_pair(1, 1));
|
||||
|
||||
int layerIn = x->shape()[1];
|
||||
auto kernel = graph_->param(prefix + "_conv_kernels",
|
||||
{layerIn,
|
||||
kernelNum,
|
||||
kernelDims.first,
|
||||
kernelDims.second},
|
||||
inits::glorot_uniform);
|
||||
auto kernel
|
||||
= graph_->param(prefix + "_conv_kernels",
|
||||
{layerIn, kernelNum, kernelDims.first, kernelDims.second},
|
||||
inits::glorot_uniform);
|
||||
|
||||
auto bias = graph_->param(prefix + "_conv_bias",
|
||||
{1, kernelNum, 1, 1},
|
||||
inits::zeros);
|
||||
auto bias = graph_->param(
|
||||
prefix + "_conv_bias", {1, kernelNum, 1, 1}, inits::zeros);
|
||||
|
||||
std::vector<Expr> nodes = {x, kernel, bias};
|
||||
return Expression<ConvolutionOp>(nodes,
|
||||
paddings.first,
|
||||
paddings.second,
|
||||
strides.first,
|
||||
strides.second);
|
||||
return Expression<ConvolutionOp>(
|
||||
nodes, paddings.first, paddings.second, strides.first, strides.second);
|
||||
}
|
||||
|
||||
Expr Convolution::apply(const std::vector<Expr>&) {
|
||||
ABORT("Can't apply convolution on many inputs at once");
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -33,6 +33,7 @@ public:
|
||||
template <class BaseFactory>
|
||||
class Accumulator : public BaseFactory {
|
||||
typedef BaseFactory Factory;
|
||||
|
||||
public:
|
||||
Accumulator() : Factory(nullptr) {}
|
||||
Accumulator(Ptr<ExpressionGraph> graph) : Factory(graph) {}
|
||||
|
@ -1,6 +1,7 @@
|
||||
#pragma once
|
||||
|
||||
#include "marian.h"
|
||||
|
||||
#include "layers/factory.h"
|
||||
|
||||
namespace marian {
|
||||
@ -75,11 +76,9 @@ public:
|
||||
if(tiedParams_.count(nameW)) {
|
||||
W = tiedParams_[nameW];
|
||||
transposeW = true;
|
||||
}
|
||||
else {
|
||||
W = g->param(name + "_" + nameW,
|
||||
{in->shape()[-1], dim},
|
||||
inits::glorot_uniform);
|
||||
} else {
|
||||
W = g->param(
|
||||
name + "_" + nameW, {in->shape()[-1], dim}, inits::glorot_uniform);
|
||||
}
|
||||
|
||||
Expr b;
|
||||
@ -87,8 +86,7 @@ public:
|
||||
if(tiedParams_.count(nameB))
|
||||
b = tiedParams_[nameB];
|
||||
else
|
||||
b = g->param(
|
||||
name + "_" + nameB, {1, dim}, inits::zeros);
|
||||
b = g->param(name + "_" + nameB, {1, dim}, inits::zeros);
|
||||
|
||||
params_.push_back(W);
|
||||
params_.push_back(b);
|
||||
@ -98,19 +96,19 @@ public:
|
||||
auto ln_s = g->param(name + "_ln_s" + std::to_string(i),
|
||||
{1, dim},
|
||||
inits::from_value(1.f));
|
||||
auto ln_b = g->param(name + "_ln_b" + std::to_string(i),
|
||||
{1, dim},
|
||||
inits::zeros);
|
||||
auto ln_b = g->param(
|
||||
name + "_ln_b" + std::to_string(i), {1, dim}, inits::zeros);
|
||||
|
||||
outputs.push_back(
|
||||
layer_norm(affine(in, W, b, false, transposeW), ln_s, ln_b, NEMATUS_LN_EPS));
|
||||
outputs.push_back(layer_norm(
|
||||
affine(in, W, b, false, transposeW), ln_s, ln_b, NEMATUS_LN_EPS));
|
||||
} else {
|
||||
auto gamma = g->param(name + "_gamma" + std::to_string(i),
|
||||
{1, dim},
|
||||
inits::from_value(1.0));
|
||||
|
||||
params_.push_back(gamma);
|
||||
outputs.push_back(layer_norm(dot(in, W, false, transposeW), gamma, b));
|
||||
outputs.push_back(
|
||||
layer_norm(dot(in, W, false, transposeW), gamma, b));
|
||||
}
|
||||
|
||||
} else {
|
||||
@ -147,11 +145,9 @@ public:
|
||||
if(tiedParams_.count(nameW)) {
|
||||
transposeW = true;
|
||||
W = tiedParams_[nameW];
|
||||
}
|
||||
else {
|
||||
W = g->param(name + "_" + nameW,
|
||||
{input->shape()[-1], dim},
|
||||
inits::glorot_uniform);
|
||||
} else {
|
||||
W = g->param(
|
||||
name + "_" + nameW, {input->shape()[-1], dim}, inits::glorot_uniform);
|
||||
}
|
||||
Expr b;
|
||||
std::string nameB = "b";
|
||||
@ -165,16 +161,14 @@ public:
|
||||
Expr out;
|
||||
if(layerNorm) {
|
||||
if(nematusNorm) {
|
||||
auto ln_s = g->param(
|
||||
name + "_ln_s", {1, dim}, inits::from_value(1.f));
|
||||
auto ln_b
|
||||
= g->param(name + "_ln_b", {1, dim}, inits::zeros);
|
||||
auto ln_s = g->param(name + "_ln_s", {1, dim}, inits::from_value(1.f));
|
||||
auto ln_b = g->param(name + "_ln_b", {1, dim}, inits::zeros);
|
||||
|
||||
out = layer_norm(affine(input, W, b, false, transposeW),
|
||||
ln_s, ln_b, NEMATUS_LN_EPS);
|
||||
out = layer_norm(
|
||||
affine(input, W, b, false, transposeW), ln_s, ln_b, NEMATUS_LN_EPS);
|
||||
} else {
|
||||
auto gamma = g->param(
|
||||
name + "_gamma", {1, dim}, inits::from_value(1.0));
|
||||
auto gamma
|
||||
= g->param(name + "_gamma", {1, dim}, inits::from_value(1.0));
|
||||
|
||||
params_.push_back(gamma);
|
||||
out = layer_norm(dot(input, W, false, transposeW), gamma, b);
|
||||
@ -217,22 +211,18 @@ struct EmbeddingFactory : public Factory {
|
||||
}
|
||||
}
|
||||
|
||||
return graph_->param(name,
|
||||
{dimVoc, dimEmb},
|
||||
initFunc,
|
||||
fixed);
|
||||
return graph_->param(name, {dimVoc, dimEmb}, initFunc, fixed);
|
||||
}
|
||||
};
|
||||
|
||||
typedef Accumulator<EmbeddingFactory> embedding;
|
||||
|
||||
static inline
|
||||
Expr Cost(Expr logits,
|
||||
Expr indices,
|
||||
Expr mask,
|
||||
std::string costType = "cross-entropy",
|
||||
float smoothing = 0,
|
||||
Expr weights = nullptr) {
|
||||
static inline Expr Cost(Expr logits,
|
||||
Expr indices,
|
||||
Expr mask,
|
||||
std::string costType = "cross-entropy",
|
||||
float smoothing = 0,
|
||||
Expr weights = nullptr) {
|
||||
using namespace keywords;
|
||||
|
||||
auto ce = cross_entropy(logits, indices);
|
||||
@ -255,15 +245,17 @@ Expr Cost(Expr logits,
|
||||
// axes:
|
||||
// - time axis (words): -3
|
||||
// - batch axis (sentences): -2
|
||||
if(costType == "ce-mean" || costType == "cross-entropy") { // sum over words; average over sentences
|
||||
if(costType == "ce-mean"
|
||||
|| costType
|
||||
== "cross-entropy") { // sum over words; average over sentences
|
||||
cost = mean(costSum, axis = -2);
|
||||
} else if(costType == "ce-mean-words") { // average over target tokens
|
||||
} else if(costType == "ce-mean-words") { // average over target tokens
|
||||
cost = sum(costSum, axis = -2) / sum(sum(mask, axis = -3), axis = -2);
|
||||
} else if(costType == "ce-sum") { // sum over target tokens
|
||||
} else if(costType == "ce-sum") { // sum over target tokens
|
||||
cost = sum(costSum, axis = -2);
|
||||
} else if(costType == "perplexity") { // ==exp('ce-mean-words')
|
||||
} else if(costType == "perplexity") { // ==exp('ce-mean-words')
|
||||
cost = exp(sum(costSum, axis = -2) / sum(sum(mask, axis = -3), axis = -2));
|
||||
} else if(costType == "ce-rescore") { // sum over words, keep batch axis
|
||||
} else if(costType == "ce-rescore") { // sum over words, keep batch axis
|
||||
cost = -costSum;
|
||||
} else { // same as ce-mean
|
||||
cost = mean(costSum, axis = -2);
|
||||
|
@ -4,20 +4,18 @@
|
||||
|
||||
namespace marian {
|
||||
|
||||
static inline
|
||||
Expr guidedAlignmentCost(Ptr<ExpressionGraph> graph,
|
||||
Ptr<data::CorpusBatch> batch,
|
||||
Ptr<Options> options,
|
||||
Expr att) {
|
||||
static inline Expr guidedAlignmentCost(Ptr<ExpressionGraph> graph,
|
||||
Ptr<data::CorpusBatch> batch,
|
||||
Ptr<Options> options,
|
||||
Expr att) {
|
||||
using namespace keywords;
|
||||
|
||||
int dimBatch = att->shape()[0];
|
||||
int dimSrc = att->shape()[2];
|
||||
int dimTrg = att->shape()[3];
|
||||
|
||||
auto aln = graph->constant(
|
||||
{dimBatch, 1, dimSrc, dimTrg},
|
||||
inits::from_vector(batch->getGuidedAlignment()));
|
||||
auto aln = graph->constant({dimBatch, 1, dimSrc, dimTrg},
|
||||
inits::from_vector(batch->getGuidedAlignment()));
|
||||
|
||||
std::string guidedCostType
|
||||
= options->get<std::string>("guided-alignment-cost");
|
||||
|
@ -2,6 +2,7 @@
|
||||
|
||||
#include "marian.h"
|
||||
#include "models/s2s.h"
|
||||
|
||||
#include "layers/convolution.h"
|
||||
|
||||
namespace marian {
|
||||
@ -29,21 +30,18 @@ public:
|
||||
|
||||
int dimEmb = opt<int>("dim-emb");
|
||||
auto convSizes = options_->get<std::vector<int>>("char-conv-filters-num");
|
||||
auto convWidths = options_->get<std::vector<int>>("char-conv-filters-widths");
|
||||
auto convWidths
|
||||
= options_->get<std::vector<int>>("char-conv-filters-widths");
|
||||
int stride = opt<int>("char-stride");
|
||||
int highwayNum = opt<int>("char-highway");
|
||||
|
||||
auto conved = CharConvPooling(
|
||||
prefix_ + "conv_pooling",
|
||||
dimEmb,
|
||||
convWidths,
|
||||
convSizes,
|
||||
stride)
|
||||
(batchEmbeddings, batchMask);
|
||||
prefix_ + "conv_pooling", dimEmb, convWidths, convSizes, stride)(
|
||||
batchEmbeddings, batchMask);
|
||||
|
||||
auto inHighway = conved;
|
||||
for (int i = 0; i < highwayNum; ++i) {
|
||||
inHighway = highway(prefix_ +"_" + std::to_string(i), inHighway);
|
||||
for(int i = 0; i < highwayNum; ++i) {
|
||||
inHighway = highway(prefix_ + "_" + std::to_string(i), inHighway);
|
||||
}
|
||||
|
||||
Expr stridedMask = getStridedMask(graph, batch, stride);
|
||||
@ -52,24 +50,26 @@ public:
|
||||
|
||||
return New<EncoderState>(context, stridedMask, batch);
|
||||
}
|
||||
|
||||
protected:
|
||||
Expr getStridedMask(Ptr<ExpressionGraph> graph, Ptr<data::CorpusBatch> batch,
|
||||
Expr getStridedMask(Ptr<ExpressionGraph> graph,
|
||||
Ptr<data::CorpusBatch> batch,
|
||||
int stride) {
|
||||
auto subBatch = (*batch)[batchIndex_];
|
||||
|
||||
int dimBatch = subBatch->batchSize();
|
||||
|
||||
std::vector<float> strided;
|
||||
for (size_t wordIdx = 0; wordIdx < subBatch->mask().size(); wordIdx += stride * dimBatch) {
|
||||
for (size_t j = wordIdx; j < wordIdx + dimBatch; ++j) {
|
||||
for(size_t wordIdx = 0; wordIdx < subBatch->mask().size();
|
||||
wordIdx += stride * dimBatch) {
|
||||
for(size_t j = wordIdx; j < wordIdx + dimBatch; ++j) {
|
||||
strided.push_back(subBatch->mask()[j]);
|
||||
}
|
||||
}
|
||||
int dimWords = strided.size() / dimBatch;
|
||||
auto stridedMask = graph->constant({dimWords, dimBatch, 1},
|
||||
inits::from_vector(strided));
|
||||
auto stridedMask
|
||||
= graph->constant({dimWords, dimBatch, 1}, inits::from_vector(strided));
|
||||
return stridedMask;
|
||||
}
|
||||
};
|
||||
|
||||
}
|
||||
|
@ -1,6 +1,7 @@
|
||||
#pragma once
|
||||
|
||||
#include "marian.h"
|
||||
|
||||
#include "layers/generic.h"
|
||||
#include "layers/guided_alignment.h"
|
||||
#include "model_base.h"
|
||||
@ -30,8 +31,8 @@ protected:
|
||||
|
||||
auto batchEmbeddings
|
||||
= reshape(chosenEmbeddings, {dimWords, dimBatch, dimEmb});
|
||||
auto batchMask = graph->constant(
|
||||
{dimWords, dimBatch, 1}, inits::from_vector(subBatch->mask()));
|
||||
auto batchMask = graph->constant({dimWords, dimBatch, 1},
|
||||
inits::from_vector(subBatch->mask()));
|
||||
|
||||
return std::make_tuple(batchEmbeddings, batchMask);
|
||||
}
|
||||
@ -129,7 +130,8 @@ public:
|
||||
virtual void selectEmbeddings(Ptr<ExpressionGraph> graph,
|
||||
Ptr<DecoderState> state,
|
||||
const std::vector<size_t>& embIdx,
|
||||
int dimBatch, int dimBeam) {
|
||||
int dimBatch,
|
||||
int dimBeam) {
|
||||
using namespace keywords;
|
||||
|
||||
int dimTrgEmb = opt<int>("dim-emb");
|
||||
@ -149,12 +151,10 @@ public:
|
||||
|
||||
Expr selectedEmbs;
|
||||
if(embIdx.empty()) {
|
||||
selectedEmbs = graph->constant({1, 1, dimBatch, dimTrgEmb},
|
||||
inits::zeros);
|
||||
selectedEmbs = graph->constant({1, 1, dimBatch, dimTrgEmb}, inits::zeros);
|
||||
} else {
|
||||
selectedEmbs = rows(yEmb, embIdx);
|
||||
selectedEmbs
|
||||
= reshape(selectedEmbs, {dimBeam, 1, dimBatch, dimTrgEmb});
|
||||
selectedEmbs = reshape(selectedEmbs, {dimBeam, 1, dimBatch, dimTrgEmb});
|
||||
}
|
||||
state->setTargetEmbeddings(selectedEmbs);
|
||||
}
|
||||
@ -174,14 +174,16 @@ public:
|
||||
virtual void selectEmbeddings(Ptr<ExpressionGraph> graph,
|
||||
Ptr<DecoderState> state,
|
||||
const std::vector<size_t>&,
|
||||
int dimBatch, int beamSize)
|
||||
int dimBatch,
|
||||
int beamSize)
|
||||
= 0;
|
||||
|
||||
virtual Ptr<DecoderState> step(Ptr<ExpressionGraph> graph,
|
||||
Ptr<DecoderState>,
|
||||
const std::vector<size_t>&,
|
||||
const std::vector<size_t>&,
|
||||
int dimBatch, int beamSize)
|
||||
int dimBatch,
|
||||
int beamSize)
|
||||
= 0;
|
||||
|
||||
virtual Ptr<DecoderState> step(Ptr<ExpressionGraph>, Ptr<DecoderState>) = 0;
|
||||
@ -224,7 +226,8 @@ protected:
|
||||
|
||||
decoder["mini-batch"] = opt<size_t>("valid-mini-batch");
|
||||
decoder["maxi-batch"] = opt<size_t>("valid-mini-batch") > 1 ? 100 : 1;
|
||||
decoder["maxi-batch-sort"] = opt<size_t>("valid-mini-batch") > 1 ? "trg" : "none";
|
||||
decoder["maxi-batch-sort"]
|
||||
= opt<size_t>("valid-mini-batch") > 1 ? "trg" : "none";
|
||||
|
||||
decoder["relative-paths"] = false;
|
||||
|
||||
@ -320,8 +323,10 @@ public:
|
||||
Ptr<DecoderState> state,
|
||||
const std::vector<size_t>& hypIndices,
|
||||
const std::vector<size_t>& embIndices,
|
||||
int dimBatch, int beamSize) {
|
||||
auto selectedState = hypIndices.empty() ? state : state->select(hypIndices, beamSize);
|
||||
int dimBatch,
|
||||
int beamSize) {
|
||||
auto selectedState
|
||||
= hypIndices.empty() ? state : state->select(hypIndices, beamSize);
|
||||
selectEmbeddings(graph, selectedState, embIndices, dimBatch, beamSize);
|
||||
selectedState->setSingleStep(true);
|
||||
auto nextState = step(graph, selectedState);
|
||||
@ -332,7 +337,8 @@ public:
|
||||
virtual void selectEmbeddings(Ptr<ExpressionGraph> graph,
|
||||
Ptr<DecoderState> state,
|
||||
const std::vector<size_t>& embIdx,
|
||||
int dimBatch, int beamSize) {
|
||||
int dimBatch,
|
||||
int beamSize) {
|
||||
decoders_[0]->selectEmbeddings(graph, state, embIdx, dimBatch, beamSize);
|
||||
}
|
||||
|
||||
@ -366,17 +372,12 @@ public:
|
||||
int dimBatch = batch->size();
|
||||
int dimWords = sentenceWeighting ? 1 : batch->back()->batchWidth();
|
||||
|
||||
weights = graph->constant(
|
||||
{1, dimWords, dimBatch, 1},
|
||||
inits::from_vector(batch->getDataWeights()));
|
||||
weights = graph->constant({1, dimWords, dimBatch, 1},
|
||||
inits::from_vector(batch->getDataWeights()));
|
||||
}
|
||||
|
||||
auto cost = Cost(nextState->getProbs(),
|
||||
trgIdx,
|
||||
trgMask,
|
||||
costType,
|
||||
ls,
|
||||
weights);
|
||||
auto cost
|
||||
= Cost(nextState->getProbs(), trgIdx, trgMask, costType, ls, weights);
|
||||
|
||||
if(options_->has("guided-alignment") && !inference_) {
|
||||
auto alignments = decoders_[0]->getAlignments();
|
||||
@ -428,7 +429,7 @@ public:
|
||||
|
||||
do {
|
||||
size_t current = (start + end) / 2;
|
||||
//std::cerr << i << " " << current << std::endl;
|
||||
// std::cerr << i << " " << current << std::endl;
|
||||
auto batch = data::CorpusBatch::fakeBatch(lengths, current, options_);
|
||||
build(graph, batch);
|
||||
fits = graph->fits();
|
||||
@ -436,8 +437,7 @@ public:
|
||||
if(fits) {
|
||||
stats->add(batch, multiplier);
|
||||
start = current + 1;
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
end = current - 1;
|
||||
}
|
||||
} while(end - start > step);
|
||||
|
@ -1,9 +1,10 @@
|
||||
#pragma once
|
||||
|
||||
#include "marian.h"
|
||||
|
||||
#include "layers/generic.h"
|
||||
#include "rnn/types.h"
|
||||
#include "rnn/attention_constructors.h"
|
||||
#include "rnn/types.h"
|
||||
|
||||
#include <numeric>
|
||||
|
||||
@ -21,13 +22,16 @@ public:
|
||||
: DecoderState(states, probs, encStates),
|
||||
attentionIndices_(attentionIndices) {}
|
||||
|
||||
virtual Ptr<DecoderState> select(const std::vector<size_t>& selIdx, int beamSize) {
|
||||
virtual Ptr<DecoderState> select(const std::vector<size_t>& selIdx,
|
||||
int beamSize) {
|
||||
std::vector<size_t> selectedAttentionIndices;
|
||||
for(auto i : selIdx)
|
||||
selectedAttentionIndices.push_back(attentionIndices_[i]);
|
||||
|
||||
return New<DecoderStateHardAtt>(
|
||||
states_.select(selIdx, beamSize), probs_, encStates_, selectedAttentionIndices);
|
||||
return New<DecoderStateHardAtt>(states_.select(selIdx, beamSize),
|
||||
probs_,
|
||||
encStates_,
|
||||
selectedAttentionIndices);
|
||||
}
|
||||
|
||||
virtual void setAttentionIndices(
|
||||
@ -85,11 +89,11 @@ public:
|
||||
Expr start;
|
||||
if(!meanContexts.empty()) {
|
||||
// apply single layer network to mean to map into decoder space
|
||||
auto mlp = mlp::mlp(graph) //
|
||||
.push_back(mlp::dense(graph) //
|
||||
("prefix", prefix_ + "_ff_state") //
|
||||
("dim", opt<int>("dim-rnn")) //
|
||||
("activation", (int)mlp::act::tanh)//
|
||||
auto mlp = mlp::mlp(graph) //
|
||||
.push_back(mlp::dense(graph) //
|
||||
("prefix", prefix_ + "_ff_state") //
|
||||
("dim", opt<int>("dim-rnn")) //
|
||||
("activation", (int)mlp::act::tanh) //
|
||||
("layer-normalization",
|
||||
opt<bool>("layer-normalization")));
|
||||
start = mlp->apply(meanContexts);
|
||||
@ -133,7 +137,8 @@ public:
|
||||
int dimBeam = trgEmbeddings->shape()[-4];
|
||||
|
||||
if(dropoutTrg) {
|
||||
trgEmbeddings = dropout(trgEmbeddings, dropoutTrg, {dimTrgWords, dimBatch, 1});
|
||||
trgEmbeddings
|
||||
= dropout(trgEmbeddings, dropoutTrg, {dimTrgWords, dimBatch, 1});
|
||||
}
|
||||
|
||||
auto flatContext = reshape(context, {dimBatch * dimSrcWords, dimContext});
|
||||
|
@ -1,7 +1,7 @@
#pragma once

#include "marian.h"
#include <string>
#include "marian.h"

namespace marian {
namespace models {
@ -1,12 +1,13 @@
#include "marian.h"

#include "models/model_factory.h"

#include "models/amun.h"
#include "models/encdec.h"
#include "models/hardatt.h"
#include "models/nematus.h"
#include "models/s2s.h"
#include "models/transformer.h"
#include "models/hardatt.h"
#include "models/amun.h"
#include "models/nematus.h"
#include "models/encdec.h"

#ifdef CUDNN
#include "models/char_s2s.h"
@ -1,8 +1,9 @@
#pragma once

#include "marian.h"
#include "layers/factory.h"

#include "encdec.h"
#include "layers/factory.h"

namespace marian {
@ -1,9 +1,10 @@
#pragma once

#include "marian.h"

#include "layers/constructors.h"
#include "rnn/constructors.h"
#include "rnn/attention_constructors.h"
#include "rnn/constructors.h"

namespace marian {
@ -48,8 +48,10 @@ public:
|
||||
virtual Expr getProbs() { return probs_; }
|
||||
virtual void setProbs(Expr probs) { probs_ = probs; }
|
||||
|
||||
virtual Ptr<DecoderState> select(const std::vector<size_t>& selIdx, int beamSize) {
|
||||
return New<DecoderState>(states_.select(selIdx, beamSize), probs_, encStates_);
|
||||
virtual Ptr<DecoderState> select(const std::vector<size_t>& selIdx,
|
||||
int beamSize) {
|
||||
return New<DecoderState>(
|
||||
states_.select(selIdx, beamSize), probs_, encStates_);
|
||||
}
|
||||
|
||||
virtual const rnn::States& getStates() { return states_; }
|
||||
|
@ -1,11 +1,12 @@
|
||||
#pragma once
|
||||
|
||||
#include "marian.h"
|
||||
#include "layers/factory.h"
|
||||
|
||||
#include "encdec.h"
|
||||
#include "layers/constructors.h"
|
||||
#include "layers/factory.h"
|
||||
#include "model_base.h"
|
||||
#include "model_factory.h"
|
||||
#include "encdec.h"
|
||||
|
||||
namespace marian {
|
||||
|
||||
@ -34,8 +35,8 @@ public:
|
||||
}
|
||||
|
||||
// shared across batch entries
|
||||
auto signal = graph->constant({dimWords, 1, dimEmb},
|
||||
inits::from_vector(vPos));
|
||||
auto signal
|
||||
= graph->constant({dimWords, 1, dimEmb}, inits::from_vector(vPos));
|
||||
return input + signal;
|
||||
}
|
||||
|
||||
@ -47,15 +48,14 @@ public:
|
||||
for(int i = 0; i < length; ++i)
|
||||
for(int j = 0; j <= i; ++j)
|
||||
vMask[i * length + j] = 1.f;
|
||||
return graph->constant({1, length, length},
|
||||
inits::from_vector(vMask));
|
||||
return graph->constant({1, length, length}, inits::from_vector(vMask));
|
||||
}
|
||||
|
||||
Expr InverseMask(Expr mask) {
|
||||
// convert 0/1 mask to transformer style -inf mask
|
||||
auto ms = mask->shape();
|
||||
mask = (1 - mask) * -99999999.f;
|
||||
return reshape(mask, {ms[-3], 1, ms[-2], ms[-1]}) ;
|
||||
return reshape(mask, {ms[-3], 1, ms[-2], ms[-1]});
|
||||
}
|
||||
|
||||
Expr SplitHeads(Expr input, int dimHeads) {
|
||||
@ -135,18 +135,17 @@ public:
|
||||
if(op == 'h') {
|
||||
auto Wh = graph->param(
|
||||
prefix + "_Wh", {dimModel, dimModel}, inits::glorot_uniform);
|
||||
auto bh
|
||||
= graph->param(prefix + "_bh", {1, dimModel}, inits::zeros);
|
||||
auto bh = graph->param(prefix + "_bh", {1, dimModel}, inits::zeros);
|
||||
|
||||
auto t = affine(prevInput, Wh, bh);
|
||||
output = highway(output, prevInput, t);
|
||||
}
|
||||
// layer normalization
|
||||
if(op == 'n') {
|
||||
auto scale = graph->param(
|
||||
prefix + "_ln_scale", {1, dimModel}, inits::ones);
|
||||
auto bias = graph->param(
|
||||
prefix + "_ln_bias", {1, dimModel}, inits::zeros);
|
||||
auto scale
|
||||
= graph->param(prefix + "_ln_scale", {1, dimModel}, inits::ones);
|
||||
auto bias
|
||||
= graph->param(prefix + "_ln_bias", {1, dimModel}, inits::zeros);
|
||||
output = layer_norm(output, scale, bias, 1e-6);
|
||||
}
|
||||
}
|
||||
@ -219,17 +218,13 @@ public:
|
||||
if(i > 0)
|
||||
prefixProj += "_enc" + std::to_string(i + 1);
|
||||
|
||||
auto Wk = graph->param(prefixProj + "_Wk",
|
||||
{dimModel, dimModel},
|
||||
inits::glorot_uniform);
|
||||
auto bk = graph->param(
|
||||
prefixProj + "_bk", {1, dimModel}, inits::zeros);
|
||||
auto Wk = graph->param(
|
||||
prefixProj + "_Wk", {dimModel, dimModel}, inits::glorot_uniform);
|
||||
auto bk = graph->param(prefixProj + "_bk", {1, dimModel}, inits::zeros);
|
||||
|
||||
auto Wv = graph->param(prefixProj + "_Wv",
|
||||
{dimModel, dimModel},
|
||||
inits::glorot_uniform);
|
||||
auto bv = graph->param(
|
||||
prefixProj + "_bv", {1, dimModel}, inits::zeros);
|
||||
auto Wv = graph->param(
|
||||
prefixProj + "_Wv", {dimModel, dimModel}, inits::glorot_uniform);
|
||||
auto bv = graph->param(prefixProj + "_bv", {1, dimModel}, inits::zeros);
|
||||
|
||||
auto kh = affine(keys[i], Wk, bk);
|
||||
auto vh = affine(values[i], Wv, bv);
|
||||
@ -254,8 +249,8 @@ public:
|
||||
|
||||
int dimAtt = output->shape()[-1];
|
||||
|
||||
auto Wo = graph->param(
|
||||
prefix + "_Wo", {dimAtt, dimOut}, inits::glorot_uniform);
|
||||
auto Wo
|
||||
= graph->param(prefix + "_Wo", {dimAtt, dimOut}, inits::glorot_uniform);
|
||||
auto bo = graph->param(prefix + "_bo", {1, dimOut}, inits::zeros);
|
||||
output = affine(output, Wo, bo);
|
||||
|
||||
@ -468,11 +463,12 @@ public:
|
||||
std::vector<Ptr<EncoderState>> &encStates)
|
||||
: DecoderState(states, probs, encStates) {}
|
||||
|
||||
virtual Ptr<DecoderState> select(const std::vector<size_t> &selIdx, int beamSize) {
|
||||
virtual Ptr<DecoderState> select(const std::vector<size_t> &selIdx,
|
||||
int beamSize) {
|
||||
rnn::States selectedStates;
|
||||
|
||||
int dimDepth = states_[0].output->shape()[-1];
|
||||
int dimTime = states_[0].output->shape()[-2];
|
||||
int dimTime = states_[0].output->shape()[-2];
|
||||
int dimBatch = selIdx.size() / beamSize;
|
||||
|
||||
std::vector<size_t> selIdx2;
|
||||
@ -553,7 +549,7 @@ public:
|
||||
decoderMask = reshape(TransposeTimeBatch(decoderMask),
|
||||
{1, dimBatch, 1, dimTrgWords});
|
||||
selfMask = selfMask * decoderMask;
|
||||
//if(dimBeam > 1)
|
||||
// if(dimBeam > 1)
|
||||
// selfMask = repeat(selfMask, dimBeam, axis = -4);
|
||||
}
|
||||
|
||||
@ -586,7 +582,8 @@ public:
|
||||
for(int i = 1; i <= opt<int>("dec-depth"); ++i) {
|
||||
auto values = query;
|
||||
if(prevDecoderStates.size() > 0)
|
||||
values = concatenate({prevDecoderStates[i - 1].output, query}, axis = -2);
|
||||
values
|
||||
= concatenate({prevDecoderStates[i - 1].output, query}, axis = -2);
|
||||
|
||||
decoderStates.push_back({values, nullptr});
|
||||
|
||||
|
@ -15,5 +15,4 @@ void Norm::clip(Tensor t) {
|
||||
if(l2Norm >= c_)
|
||||
Element(_1 = (c_ / l2Norm) * _1, t);
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -58,7 +58,8 @@ void Adagrad::load(const std::string& name,
|
||||
// extract data into vectors
|
||||
if(name == "adagrad_gt") {
|
||||
vGt.resize(totalSize);
|
||||
std::copy((float*)np->data(), (float*)np->data() + totalSize, vGt.begin());
|
||||
std::copy(
|
||||
(float*)np->data(), (float*)np->data() + totalSize, vGt.begin());
|
||||
}
|
||||
}
|
||||
|
||||
@ -178,11 +179,13 @@ void Adam::load(const std::string& name,
|
||||
// extract data into vectors
|
||||
if(name == "adam_mt") {
|
||||
vMt.resize(totalSize);
|
||||
std::copy((float*)np->data(), (float*)np->data() + totalSize, vMt.begin());
|
||||
std::copy(
|
||||
(float*)np->data(), (float*)np->data() + totalSize, vMt.begin());
|
||||
}
|
||||
if(name == "adam_vt") {
|
||||
vVt.resize(totalSize);
|
||||
std::copy((float*)np->data(), (float*)np->data() + totalSize, vVt.begin());
|
||||
std::copy(
|
||||
(float*)np->data(), (float*)np->data() + totalSize, vVt.begin());
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1,8 +1,8 @@
#pragma once

#include <algorithm>
#include <map>
#include <memory>
#include <algorithm>

#include "common/config.h"
#include "graph/expression_graph.h"
@ -42,9 +42,11 @@ private:
|
||||
public:
|
||||
Rescore(Ptr<Config> options)
|
||||
: options_(options),
|
||||
corpus_(options_->get<bool>("n-best") ?
|
||||
std::static_pointer_cast<CorpusBase>(New<CorpusNBest>(options_)) :
|
||||
std::static_pointer_cast<CorpusBase>(New<Corpus>(options_))) {
|
||||
corpus_(
|
||||
options_->get<bool>("n-best")
|
||||
? std::static_pointer_cast<CorpusBase>(
|
||||
New<CorpusNBest>(options_))
|
||||
: std::static_pointer_cast<CorpusBase>(New<Corpus>(options_))) {
|
||||
corpus_->prepare();
|
||||
|
||||
auto devices = options_->getDevices();
|
||||
@ -66,12 +68,12 @@ public:
|
||||
models_.resize(graphs_.size());
|
||||
ThreadPool pool(graphs_.size(), graphs_.size());
|
||||
for(int i = 0; i < graphs_.size(); ++i) {
|
||||
|
||||
pool.enqueue([=](int j) {
|
||||
models_[j] = New<Model>(temp);
|
||||
models_[j]->load(graphs_[j], modelFile);
|
||||
}, i);
|
||||
|
||||
pool.enqueue(
|
||||
[=](int j) {
|
||||
models_[j] = New<Model>(temp);
|
||||
models_[j]->load(graphs_[j], modelFile);
|
||||
},
|
||||
i);
|
||||
}
|
||||
}
|
||||
|
||||
@ -81,9 +83,10 @@ public:
|
||||
auto batchGenerator = New<BatchGenerator<CorpusBase>>(corpus_, options_);
|
||||
batchGenerator->prepare(false);
|
||||
|
||||
Ptr<ScoreCollector> output = options_->get<bool>("n-best") ?
|
||||
std::static_pointer_cast<ScoreCollector>(New<ScoreCollectorNBest>(options_)) :
|
||||
New<ScoreCollector>();
|
||||
Ptr<ScoreCollector> output = options_->get<bool>("n-best")
|
||||
? std::static_pointer_cast<ScoreCollector>(
|
||||
New<ScoreCollectorNBest>(options_))
|
||||
: New<ScoreCollector>();
|
||||
|
||||
bool summarize = options_->has("summary");
|
||||
std::string summary
|
||||
|
@ -13,8 +13,7 @@ namespace marian {
|
||||
|
||||
class ScoreCollector {
|
||||
public:
|
||||
ScoreCollector()
|
||||
: nextId_(0), outStrm_(new OutputFileStream(std::cout)) {};
|
||||
ScoreCollector() : nextId_(0), outStrm_(new OutputFileStream(std::cout)){};
|
||||
|
||||
virtual void Write(long id, const std::string& message) {
|
||||
boost::mutex::scoped_lock lock(mutex_);
|
||||
@ -52,9 +51,7 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
virtual void Write(long id, float value) {
|
||||
Write(id, std::to_string(value));
|
||||
}
|
||||
virtual void Write(long id, float value) { Write(id, std::to_string(value)); }
|
||||
|
||||
protected:
|
||||
long nextId_{0};
|
||||
@ -86,7 +83,9 @@ public:
|
||||
|
||||
ScoreCollectorNBest(const ScoreCollectorNBest&) = delete;
|
||||
|
||||
std::string addToNBest(const std::string nbest, const std::string feature, float score) {
|
||||
std::string addToNBest(const std::string nbest,
|
||||
const std::string feature,
|
||||
float score) {
|
||||
std::vector<std::string> fields;
|
||||
Split(nbest, fields, "|||");
|
||||
std::stringstream ss;
|
||||
@ -96,13 +95,15 @@ public:
|
||||
}
|
||||
|
||||
virtual void Write(long id, float score) {
|
||||
|
||||
std::string line;
|
||||
{
|
||||
boost::mutex::scoped_lock lock(mutex_);
|
||||
auto iter = buffer_.find(id);
|
||||
if(iter == buffer_.end()) {
|
||||
ABORT_IF(lastRead_ >= id, "Entry {} < {} already read but not in buffer", id, lastRead_);
|
||||
ABORT_IF(lastRead_ >= id,
|
||||
"Entry {} < {} already read but not in buffer",
|
||||
id,
|
||||
lastRead_);
|
||||
std::string line;
|
||||
while(lastRead_ < id && std::getline((std::istream&)*file_, line)) {
|
||||
lastRead_++;
|
||||
@ -116,6 +117,5 @@ public:
|
||||
|
||||
ScoreCollector::Write(id, addToNBest(line, fname_, score));
|
||||
}
|
||||
|
||||
};
|
||||
}
|
||||
|
@ -1,10 +1,11 @@
|
||||
#pragma once
|
||||
|
||||
#include "marian.h"
|
||||
|
||||
#include "layers/factory.h"
|
||||
#include "rnn/types.h"
|
||||
#include "rnn/constructors.h"
|
||||
#include "rnn/attention.h"
|
||||
#include "rnn/constructors.h"
|
||||
#include "rnn/types.h"
|
||||
|
||||
namespace marian {
|
||||
namespace rnn {
|
||||
|
@ -52,8 +52,7 @@ Expr gruOps(const std::vector<Expr>& nodes, bool final) {
|
||||
/******************************************************************************/
|
||||
|
||||
struct LSTMCellNodeOp : public NaryNodeOp {
|
||||
LSTMCellNodeOp(const std::vector<Expr>& nodes)
|
||||
: NaryNodeOp(nodes) {}
|
||||
LSTMCellNodeOp(const std::vector<Expr>& nodes) : NaryNodeOp(nodes) {}
|
||||
|
||||
NodeOps forwardOps() {
|
||||
std::vector<Tensor> inputs;
|
||||
@ -89,8 +88,7 @@ struct LSTMCellNodeOp : public NaryNodeOp {
|
||||
};
|
||||
|
||||
struct LSTMOutputNodeOp : public NaryNodeOp {
|
||||
LSTMOutputNodeOp(const std::vector<Expr>& nodes)
|
||||
: NaryNodeOp(nodes) {}
|
||||
LSTMOutputNodeOp(const std::vector<Expr>& nodes) : NaryNodeOp(nodes) {}
|
||||
|
||||
NodeOps forwardOps() {
|
||||
std::vector<Tensor> inputs;
|
||||
|
@ -43,8 +43,7 @@ public:
|
||||
{dimInput, dimState},
|
||||
inits::glorot_uniform);
|
||||
|
||||
b_ = graph->param(
|
||||
prefix + "_b", {1, dimState}, inits::zeros);
|
||||
b_ = graph->param(prefix + "_b", {1, dimState}, inits::zeros);
|
||||
|
||||
if(dropout_ > 0.0f) {
|
||||
if(dimInput)
|
||||
@ -520,8 +519,7 @@ public:
|
||||
{dimInput, 4 * dimState},
|
||||
inits::glorot_uniform);
|
||||
|
||||
b_ = graph->param(
|
||||
prefix + "_b", {1, 4 * dimState}, inits::zeros);
|
||||
b_ = graph->param(prefix + "_b", {1, 4 * dimState}, inits::zeros);
|
||||
|
||||
if(dropout_ > 0.0f) {
|
||||
if(dimInput)
|
||||
|
@ -1,7 +1,7 @@
|
||||
#pragma once
|
||||
|
||||
#include "marian.h"
|
||||
#include "layers/factory.h"
|
||||
#include "marian.h"
|
||||
#include "rnn/rnn.h"
|
||||
|
||||
namespace marian {
|
||||
|
@ -1,9 +1,9 @@
|
||||
#pragma once
|
||||
|
||||
#include "marian.h"
|
||||
#include "layers/generic.h"
|
||||
#include "rnn/types.h"
|
||||
#include "marian.h"
|
||||
#include "rnn/cells.h"
|
||||
#include "rnn/types.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <chrono>
|
||||
@ -74,10 +74,9 @@ private:
|
||||
j = timeSteps - i - 1;
|
||||
|
||||
std::vector<Expr> steps(xWs.size());
|
||||
std::transform(xWs.begin(),
|
||||
xWs.end(),
|
||||
steps.begin(),
|
||||
[j](Expr e) { return step(e, j, -3); });
|
||||
std::transform(xWs.begin(), xWs.end(), steps.begin(), [j](Expr e) {
|
||||
return step(e, j, -3);
|
||||
});
|
||||
|
||||
if(mask)
|
||||
state = cell_->applyState(steps, state, step(mask, j, -3));
|
||||
|
@ -9,8 +9,8 @@
|
||||
#include <vector>
|
||||
|
||||
#include "common/definitions.h"
|
||||
#include "tensors/memory_piece.h"
|
||||
#include "tensors/device.h"
|
||||
#include "tensors/memory_piece.h"
|
||||
|
||||
namespace marian {
|
||||
|
||||
@ -92,8 +92,8 @@ private:
|
||||
gaps_.swap(oldGaps);
|
||||
|
||||
for(auto gap : oldGaps)
|
||||
gaps_.insert(
|
||||
Gap(device_->data() + std::distance(oldData, gap.data()), gap.size()));
|
||||
gaps_.insert(Gap(device_->data() + std::distance(oldData, gap.data()),
|
||||
gap.size()));
|
||||
insertGap(Gap(device_->data() + oldSize, add));
|
||||
|
||||
std::unordered_map<uint8_t*, Ptr<MemoryPiece>> oldAllocated;
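
A side note on the gap rebasing above: when the device buffer grows it may move, so every recorded gap keeps its offset from the old base (std::distance(oldData, gap.data())) and is re-anchored on the new base pointer. A minimal sketch of that idea with raw pointers (names here are illustrative, not Marian's):

#include <cstdint>
#include <iterator>

// Re-anchor a pointer from a moved buffer: keep the offset, swap the base.
inline uint8_t* rebase(uint8_t* oldPtr, uint8_t* oldBase, uint8_t* newBase) {
  return newBase + std::distance(oldBase, oldPtr);
}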
|
||||
@ -141,7 +141,10 @@ private:
|
||||
}
|
||||
|
||||
public:
|
||||
Allocator(DeviceId deviceId, size_t bytes, size_t step, size_t alignment = 256)
|
||||
Allocator(DeviceId deviceId,
|
||||
size_t bytes,
|
||||
size_t step,
|
||||
size_t alignment = 256)
|
||||
: device_(DispatchDevice(deviceId, alignment)),
|
||||
step_(step),
|
||||
available_(0),
|
||||
|
@ -16,5 +16,4 @@ Ptr<Backend> BackendByDevice(DeviceId deviceId, size_t seed) {
|
||||
#endif
|
||||
return New<cpu::Backend>(deviceId, seed);
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -10,8 +10,7 @@ protected:
|
||||
size_t seed_;
|
||||
|
||||
public:
|
||||
Backend(DeviceId deviceId, size_t seed)
|
||||
: deviceId_(deviceId), seed_(seed) {}
|
||||
Backend(DeviceId deviceId, size_t seed) : deviceId_(deviceId), seed_(seed) {}
|
||||
|
||||
virtual DeviceId getDevice() { return deviceId_; };
|
||||
virtual void setDevice() = 0;
|
||||
@ -19,5 +18,4 @@ public:
|
||||
};
|
||||
|
||||
Ptr<Backend> BackendByDevice(DeviceId deviceId, size_t seed);
|
||||
|
||||
}
|
||||
|
@ -5,24 +5,22 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "tensors/tensor.h"
|
||||
#include "functional/functional.h"
|
||||
#include "functional/shape.h"
|
||||
#include "functional/tmp.h"
|
||||
#include "functional/tensor.h"
|
||||
#include "functional/tmp.h"
|
||||
#include "tensors/tensor.h"
|
||||
|
||||
namespace marian {
|
||||
|
||||
namespace cpu {
|
||||
|
||||
|
||||
template <size_t K, class Functor>
|
||||
void gAddGeneric(Functor functor,
|
||||
const functional::Shape full,
|
||||
functional::Tensor<float> out,
|
||||
functional::Array<functional::Tensor<float>, K> ins,
|
||||
float scale = 1.0) {
|
||||
|
||||
int outLength = out.shape().elements();
|
||||
bool same = outLength == full.elements();
|
||||
for(int i = 0; i < K; ++i)
|
||||
@ -73,7 +71,6 @@ void gAddReduce(Functor functor,
|
||||
functional::Tensor<float> out,
|
||||
functional::Array<functional::Tensor<float>, K> ins,
|
||||
float scale = 1.0) {
|
||||
|
||||
int rows = full.elements() / full.back();
|
||||
int cols = full.back();
|
||||
|
||||
@ -100,12 +97,8 @@ void gAddReduce(Functor functor,
|
||||
}
|
||||
}
|
||||
|
||||
template <class Functor, class ...Tensors>
|
||||
void Add(Functor functor,
|
||||
float scale,
|
||||
marian::Tensor out,
|
||||
Tensors... tensors) {
|
||||
|
||||
template <class Functor, class... Tensors>
|
||||
void Add(Functor functor, float scale, marian::Tensor out, Tensors... tensors) {
|
||||
auto full = marian::Shape::broadcast({out, tensors...});
|
||||
|
||||
int length = out->shape().elements();
|
||||
@ -113,7 +106,7 @@ void Add(Functor functor,
|
||||
constexpr size_t K = sizeof...(Tensors);
|
||||
|
||||
functional::Tensor<float> gOut = out;
|
||||
functional::Array<functional::Tensor<float>, K> gIns = {tensors ...};
|
||||
functional::Array<functional::Tensor<float>, K> gIns = {tensors...};
|
||||
|
||||
if(full.back() != 1 && out->shape().back() == 1) {
|
||||
size_t m = full.elements() / length;
|
||||
@ -128,8 +121,5 @@ void Add(Functor functor,
|
||||
cpu::gAddGeneric(functor, full, gOut, gIns, scale);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -15,17 +15,13 @@ private:
|
||||
|
||||
public:
|
||||
Backend(DeviceId deviceId, size_t seed)
|
||||
: marian::Backend(deviceId, seed),
|
||||
gen_(seed_) {}
|
||||
: marian::Backend(deviceId, seed), gen_(seed_) {}
|
||||
|
||||
void setDevice() { }
|
||||
void setDevice() {}
|
||||
|
||||
void synchronize() {}
|
||||
|
||||
std::default_random_engine& getRandomGenerator() {
|
||||
return gen_;
|
||||
}
|
||||
std::default_random_engine& getRandomGenerator() { return gen_; }
|
||||
};
|
||||
|
||||
}
|
||||
}
|
||||
|
@ -1,31 +1,31 @@
|
||||
#include <iostream>
|
||||
#include "tensors/device.h"
|
||||
#include <iostream>
|
||||
|
||||
#include <stdlib.h>
|
||||
|
||||
namespace marian {
|
||||
namespace cpu {
|
||||
|
||||
Device::~Device() {
|
||||
Device::~Device() {
|
||||
free(data_);
|
||||
data_ = nullptr;
|
||||
size_ = 0;
|
||||
}
|
||||
|
||||
void Device::reserve(size_t size) {
|
||||
size = align(size);
|
||||
ABORT_IF(size < size_ || size == 0,
|
||||
"New size must be larger than old size and larger than 0");
|
||||
|
||||
if(data_) {
|
||||
uint8_t *temp = static_cast<uint8_t *>(aligned_alloc(alignment_, size));
|
||||
std::copy(data_, data_ + size_, temp);
|
||||
free(data_);
|
||||
data_ = nullptr;
|
||||
size_ = 0;
|
||||
data_ = temp;
|
||||
} else {
|
||||
data_ = static_cast<uint8_t *>(aligned_alloc(alignment_, size));
|
||||
}
|
||||
|
||||
void Device::reserve(size_t size) {
|
||||
size = align(size);
|
||||
ABORT_IF(size < size_ || size == 0, "New size must be larger than old size and larger than 0");
|
||||
|
||||
if(data_) {
|
||||
uint8_t *temp = static_cast<uint8_t*>(aligned_alloc(alignment_, size));
|
||||
std::copy(data_, data_ + size_, temp);
|
||||
free(data_);
|
||||
data_ = temp;
|
||||
} else {
|
||||
data_ = static_cast<uint8_t*>(aligned_alloc(alignment_, size));
|
||||
}
|
||||
size_ = size;
|
||||
}
|
||||
|
||||
size_ = size;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1,19 +1,20 @@
|
||||
#include <algorithm>
|
||||
#include <random>
|
||||
|
||||
#include "tensors/tensor_operators.h"
|
||||
#include "tensors/cpu/backend.h"
|
||||
#include "tensors/tensor_operators.h"
|
||||
|
||||
namespace marian {
|
||||
namespace cpu {
|
||||
namespace cpu {
|
||||
|
||||
void Dropout(Tensor tensor, float p) {
|
||||
auto cpuBackend = std::static_pointer_cast<cpu::Backend>(tensor->getBackend());
|
||||
auto &gen = cpuBackend->getRandomGenerator();
|
||||
std::bernoulli_distribution dist(1.f - p);
|
||||
std::generate(tensor->data(), tensor->data() + tensor->size(),
|
||||
[&]() { return dist(gen) / (1.f - p); });
|
||||
}
|
||||
|
||||
}
|
||||
void Dropout(Tensor tensor, float p) {
|
||||
auto cpuBackend
|
||||
= std::static_pointer_cast<cpu::Backend>(tensor->getBackend());
|
||||
auto &gen = cpuBackend->getRandomGenerator();
|
||||
std::bernoulli_distribution dist(1.f - p);
|
||||
std::generate(tensor->data(), tensor->data() + tensor->size(), [&]() {
|
||||
return dist(gen) / (1.f - p);
|
||||
});
|
||||
}
|
||||
}
|
||||
}
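
For reference, a minimal standalone sketch of the inverted dropout the CPU kernel above implements: keep each value with probability 1 - p and scale survivors by 1/(1 - p), so no rescaling is needed at inference time (plain std::vector and std::mt19937 stand in for Marian's Tensor and backend RNG):

#include <random>
#include <vector>

// Fill `mask` with 0 or 1/(1 - p); this is the same quantity the
// generate-lambda in the kernel above writes into the tensor.
void dropoutMask(std::vector<float>& mask, float p, std::mt19937& gen) {
  std::bernoulli_distribution keep(1.f - p);
  for(auto& v : mask)
    v = keep(gen) ? 1.f / (1.f - p) : 0.f;
}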
|
||||
|
@ -13,12 +13,11 @@ namespace cpu {
|
||||
template <size_t K, bool broadcast, class Functor>
|
||||
void gElement(Functor functor,
|
||||
functional::Array<functional::Tensor<float>, K> tensors) {
|
||||
|
||||
int length = tensors[0].shape().elements();
|
||||
functional::Array<int, functional::Shape::size()> dims;
|
||||
functional::Array<int, K> indices;
|
||||
|
||||
#pragma omp parallel for simd
|
||||
#pragma omp parallel for simd
|
||||
for(int index = 0; index < length; ++index) {
|
||||
indices.fill(index);
|
||||
if(broadcast) {
|
||||
@ -30,8 +29,8 @@ void gElement(Functor functor,
|
||||
}
|
||||
}
|
||||
|
||||
template <class Functor, class ...Tensors>
|
||||
void Element(Functor functor, marian::Tensor out, Tensors ...tensors) {
|
||||
template <class Functor, class... Tensors>
|
||||
void Element(Functor functor, marian::Tensor out, Tensors... tensors) {
|
||||
constexpr size_t K = sizeof...(tensors) + 1;
|
||||
functional::Array<functional::Tensor<float>, K> gTensors = {out, tensors...};
|
||||
|
||||
@ -46,6 +45,5 @@ void Element(Functor functor, marian::Tensor out, Tensors ...tensors) {
|
||||
else
|
||||
cpu::gElement<K, false>(functor, gTensors);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
@ -1,15 +1,15 @@
|
||||
/* All or part of this file was contributed by Intel under license:
|
||||
* Copyright (C) 2017-2018 Intel Corporation
|
||||
* SPDX-License-Identifier: MIT
|
||||
*/
|
||||
/* All or part of this file was contributed by Intel under license:
|
||||
* Copyright (C) 2017-2018 Intel Corporation
|
||||
* SPDX-License-Identifier: MIT
|
||||
*/
|
||||
|
||||
#include "tensors/tensor.h"
|
||||
#include "tensors/cpu/backend.h"
|
||||
#include "tensors/tensor.h"
|
||||
|
||||
#if MKL_FOUND
|
||||
#include <mkl.h>
|
||||
#else
|
||||
#if BLAS_FOUND
|
||||
#if BLAS_FOUND
|
||||
#include <cblas.h>
|
||||
#endif
|
||||
#endif
|
||||
@ -25,7 +25,6 @@ void Prod(marian::Tensor C,
|
||||
bool transB,
|
||||
float beta,
|
||||
float scalar) {
|
||||
|
||||
#if BLAS_FOUND
|
||||
float alpha = scalar;
|
||||
|
||||
@ -46,19 +45,20 @@ void Prod(marian::Tensor C,
|
||||
if(transB)
|
||||
ldc = B->shape().elements() / B->shape()[-1];
|
||||
|
||||
cblas_sgemm(
|
||||
CblasColMajor,
|
||||
transB ? CblasTrans : CblasNoTrans,
|
||||
transA ? CblasTrans : CblasNoTrans,
|
||||
n, m, k,
|
||||
alpha,
|
||||
B->data(),
|
||||
ldb,
|
||||
A->data(),
|
||||
lda,
|
||||
beta,
|
||||
C->data(),
|
||||
ldc);
|
||||
cblas_sgemm(CblasColMajor,
|
||||
transB ? CblasTrans : CblasNoTrans,
|
||||
transA ? CblasTrans : CblasNoTrans,
|
||||
n,
|
||||
m,
|
||||
k,
|
||||
alpha,
|
||||
B->data(),
|
||||
ldb,
|
||||
A->data(),
|
||||
lda,
|
||||
beta,
|
||||
C->data(),
|
||||
ldc);
|
||||
#else
|
||||
ABORT("Not implemented!");
|
||||
#endif
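
A note on the cblas_sgemm call above: the tensors are row-major but CBLAS is driven in column-major mode, so the code passes B before A (and swaps the transpose flags) to compute C^T = op(B)^T op(A)^T, which is exactly C in row-major memory. A minimal sketch of the same trick for the plain non-transposed case (an illustrative helper, not part of the source):

#include <cblas.h>

// Row-major C[m x n] = A[m x k] * B[k x n] using a column-major sgemm:
// the row-major buffers, read column-major, are the transposed matrices,
// and C^T = B^T * A^T, so B goes first and the leading dimensions are n and k.
void gemmRowMajor(const float* A, const float* B, float* C, int m, int n, int k) {
  cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
              n, m, k, 1.0f, B, n, A, k, 0.0f, C, n);
}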
|
||||
@ -73,7 +73,7 @@ void ProdBatched(marian::Tensor C,
|
||||
float scalar) {
|
||||
#if BLAS_FOUND
|
||||
float alpha = scalar;
|
||||
|
||||
|
||||
size_t batchA = A->shape().elements() / (A->shape()[-1] * A->shape()[-2]);
|
||||
size_t batchB = B->shape().elements() / (B->shape()[-1] * B->shape()[-2]);
|
||||
|
||||
@ -95,33 +95,34 @@ void ProdBatched(marian::Tensor C,
|
||||
ldc = B->shape()[-2];
|
||||
|
||||
auto opA = transA ? CblasTrans : CblasNoTrans;
|
||||
auto opB = transB ? CblasTrans : CblasNoTrans;
|
||||
|
||||
auto opB = transB ? CblasTrans : CblasNoTrans;
|
||||
|
||||
auto strideB = batchB == 1 ? 0 : n * k;
|
||||
auto strideA = batchA == 1 ? 0 : m * k;
|
||||
auto strideC = n * m;
|
||||
|
||||
|
||||
int steps = std::max(batchA, batchB);
|
||||
|
||||
|
||||
int offsetA = 0;
|
||||
int offsetB = 0;
|
||||
int offsetC = 0;
|
||||
|
||||
|
||||
for(int i = 0; i < steps; ++i) {
|
||||
cblas_sgemm(
|
||||
CblasColMajor,
|
||||
opB,
|
||||
opA,
|
||||
n, m, k,
|
||||
alpha,
|
||||
B->data() + offsetB,
|
||||
ldb,
|
||||
A->data() + offsetA,
|
||||
lda,
|
||||
beta,
|
||||
C->data() + offsetC,
|
||||
ldc);
|
||||
|
||||
cblas_sgemm(CblasColMajor,
|
||||
opB,
|
||||
opA,
|
||||
n,
|
||||
m,
|
||||
k,
|
||||
alpha,
|
||||
B->data() + offsetB,
|
||||
ldb,
|
||||
A->data() + offsetA,
|
||||
lda,
|
||||
beta,
|
||||
C->data() + offsetC,
|
||||
ldc);
|
||||
|
||||
offsetA += strideA;
|
||||
offsetB += strideB;
|
||||
offsetC += strideC;
|
||||
@ -130,6 +131,5 @@ void ProdBatched(marian::Tensor C,
|
||||
ABORT("Not implemented!");
|
||||
#endif
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
@ -44,13 +44,13 @@ void ConcatCont(Tensor out, const std::vector<Tensor>& inputs, int axis) {
|
||||
}
|
||||
|
||||
inline void gInsertCols(float* out,
|
||||
const float* in,
|
||||
size_t rows,
|
||||
size_t cols,
|
||||
size_t cols_out,
|
||||
size_t cols_in,
|
||||
size_t offset_out,
|
||||
size_t offset_in) {
|
||||
const float* in,
|
||||
size_t rows,
|
||||
size_t cols,
|
||||
size_t cols_out,
|
||||
size_t cols_in,
|
||||
size_t offset_out,
|
||||
size_t offset_in) {
|
||||
for(int j = 0; j < rows; ++j) {
|
||||
float* rowOut = out + j * cols_out + offset_out;
|
||||
const float* rowIn = in + j * cols_in + offset_in;
|
||||
@ -68,9 +68,10 @@ void Concatenate1(Tensor out, const std::vector<Tensor>& inputs) {
|
||||
|
||||
for(auto in : inputs) {
|
||||
ABORT_IF(rows != in->shape().elements() / in->shape().back(),
|
||||
"First dimension must be equal");
|
||||
"First dimension must be equal");
|
||||
int cols_in = in->shape().back();
|
||||
cpu::gInsertCols(out->data(), in->data(), rows, cols_in, cols_out, cols_in, offset, 0);
|
||||
cpu::gInsertCols(
|
||||
out->data(), in->data(), rows, cols_in, cols_out, cols_in, offset, 0);
|
||||
offset += cols_in;
|
||||
}
|
||||
}
|
||||
@ -88,11 +89,10 @@ void Split1(std::vector<Tensor>& outputs, const Tensor in) {
|
||||
int cols_in = in->shape().back();
|
||||
for(auto out : outputs) {
|
||||
ABORT_IF(rows != out->shape().elements() / out->shape().back(),
|
||||
"First dimension must be equal");
|
||||
"First dimension must be equal");
|
||||
int cols_out = out->shape().back();
|
||||
cpu::gInsertCols(out->data(), in->data(),
|
||||
rows, cols_out, cols_out, cols_in,
|
||||
0, offset);
|
||||
cpu::gInsertCols(
|
||||
out->data(), in->data(), rows, cols_out, cols_out, cols_in, 0, offset);
|
||||
offset += cols_out;
|
||||
}
|
||||
}
|
||||
@ -158,24 +158,24 @@ void Softmax(Tensor out_, Tensor in_, Tensor mask_) {
|
||||
int rows = out_->shape().elements() / out_->shape().back();
|
||||
int cols = out_->shape().back();
|
||||
|
||||
for (int j = 0; j < rows; ++j) {
|
||||
float* so = out + j*cols;
|
||||
const float* sp = in + j*cols;
|
||||
const float* mp = mask ? mask + j*cols : nullptr;
|
||||
for(int j = 0; j < rows; ++j) {
|
||||
float* so = out + j * cols;
|
||||
const float* sp = in + j * cols;
|
||||
const float* mp = mask ? mask + j * cols : nullptr;
|
||||
|
||||
float max = sp[0];
|
||||
for (int i = 1; i < cols; ++i) {
|
||||
for(int i = 1; i < cols; ++i) {
|
||||
max = std::max(max, sp[i]);
|
||||
}
|
||||
|
||||
float sum = 0.f;
|
||||
for (int i = 0; i < cols; ++i) {
|
||||
for(int i = 0; i < cols; ++i) {
|
||||
float ex = !mask || mp[i] ? std::exp(sp[i] - max) : 0.f;
|
||||
so[i] = ex;
|
||||
sum += ex;
|
||||
}
|
||||
|
||||
for (int i = 0; i < cols; ++i) {
|
||||
for(int i = 0; i < cols; ++i) {
|
||||
so[i] /= sum;
|
||||
}
|
||||
}
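
The loop above is the usual max-shifted, optionally masked softmax; as a formula (a summary of the code, not a quotation from it):

softmax(x)_i = m_i * exp(x_i - max_j x_j) / sum_k m_k * exp(x_k - max_j x_j)

with m the 0/1 mask (all ones when mask_ is null). Subtracting the row maximum cancels between numerator and denominator, so it changes nothing mathematically but keeps std::exp from overflowing.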
|
||||
@ -188,24 +188,24 @@ void LogSoftmax(Tensor out_, Tensor in_) {
|
||||
int rows = out_->shape().elements() / out_->shape().back();
|
||||
int cols = out_->shape().back();
|
||||
|
||||
for (int j = 0; j < rows; ++j) {
|
||||
for(int j = 0; j < rows; ++j) {
|
||||
float* so = out + j * cols;
|
||||
const float* sp = in + j*cols;
|
||||
const float* sp = in + j * cols;
|
||||
|
||||
float max = sp[0];
|
||||
for (int i = 1; i < cols; ++i) {
|
||||
for(int i = 1; i < cols; ++i) {
|
||||
max = std::max(max, sp[i]);
|
||||
}
|
||||
|
||||
float sum = 0.f;
|
||||
for (int i = 0; i < cols; ++i) {
|
||||
for(int i = 0; i < cols; ++i) {
|
||||
float sm = sp[i] - max;
|
||||
float ex = std::exp(sm);
|
||||
so[i] = sm;
|
||||
sum += ex;
|
||||
}
|
||||
|
||||
for (int i = 0; i < cols; ++i) {
|
||||
for(int i = 0; i < cols; ++i) {
|
||||
so[i] -= std::log(sum);
|
||||
}
|
||||
}
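
Likewise for the LogSoftmax rows above (again a summary, not source text):

log softmax(x)_i = (x_i - max_j x_j) - log sum_k exp(x_k - max_j x_j)

which is why the loop stores the shifted value sm first and subtracts log(sum) in a second pass.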
|
||||
@ -219,17 +219,17 @@ void SoftmaxGrad(Tensor grad_, Tensor adj_, Tensor val_) {
|
||||
const float* adj = adj_->data();
|
||||
const float* val = val_->data();
|
||||
|
||||
for (size_t j = 0; j < rows; ++j) {
|
||||
float* gradRow = grad + j*cols;
|
||||
const float* adjRow = adj + j*cols;
|
||||
const float* valRow = val + j*cols;
|
||||
for(size_t j = 0; j < rows; ++j) {
|
||||
float* gradRow = grad + j * cols;
|
||||
const float* adjRow = adj + j * cols;
|
||||
const float* valRow = val + j * cols;
|
||||
|
||||
float sum = 0.f;
|
||||
for (size_t i = 0; i < cols; ++i) {
|
||||
for(size_t i = 0; i < cols; ++i) {
|
||||
sum += valRow[i] * adjRow[i];
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < cols; ++i) {
|
||||
for(size_t i = 0; i < cols; ++i) {
|
||||
gradRow[i] += valRow[i] * (adjRow[i] - sum);
|
||||
}
|
||||
}
|
||||
@ -243,62 +243,68 @@ void LogSoftmaxGrad(Tensor grad_, Tensor adj_, Tensor val_) {
|
||||
const float* adj = adj_->data();
|
||||
const float* val = val_->data();
|
||||
|
||||
for (int j = 0; j < rows; ++j) {
|
||||
float* gradRow = grad + j*cols;
|
||||
const float* adjRow = adj + j*cols;
|
||||
const float* valRow = val + j*cols;
|
||||
for(int j = 0; j < rows; ++j) {
|
||||
float* gradRow = grad + j * cols;
|
||||
const float* adjRow = adj + j * cols;
|
||||
const float* valRow = val + j * cols;
|
||||
|
||||
float sum = 0.f;
|
||||
for (int i = 0; i < cols; ++i) {
|
||||
for(int i = 0; i < cols; ++i) {
|
||||
sum += adjRow[i];
|
||||
}
|
||||
|
||||
for (int i = 0; i < cols; ++i) {
|
||||
gradRow[i] += adjRow[i] - sum*std::exp(valRow[i]);
|
||||
for(int i = 0; i < cols; ++i) {
|
||||
gradRow[i] += adjRow[i] - sum * std::exp(valRow[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void CopyRows(Tensor out_, const Tensor in_, const std::vector<size_t>& indices) {
|
||||
void CopyRows(Tensor out_,
|
||||
const Tensor in_,
|
||||
const std::vector<size_t>& indices) {
|
||||
size_t cols = in_->shape()[1];
|
||||
size_t rows = indices.size();
|
||||
|
||||
float* out = out_->data();
|
||||
const float* in = in_->data();
|
||||
|
||||
#pragma omp parallel for
|
||||
for (int j = 0; j < rows; ++j) {
|
||||
#pragma omp parallel for
|
||||
for(int j = 0; j < rows; ++j) {
|
||||
size_t dst = j;
|
||||
size_t src = indices[j];
|
||||
|
||||
float* rowOut = out + dst*cols;
|
||||
const float* rowIn = in + src*cols;
|
||||
float* rowOut = out + dst * cols;
|
||||
const float* rowIn = in + src * cols;
|
||||
|
||||
std::copy(rowIn, rowIn + cols, rowOut);
|
||||
}
|
||||
}
|
||||
|
||||
void PasteRows(Tensor out_, const Tensor in_, const std::vector<size_t>& indices) {
|
||||
void PasteRows(Tensor out_,
|
||||
const Tensor in_,
|
||||
const std::vector<size_t>& indices) {
|
||||
size_t cols = in_->shape()[-1];
|
||||
size_t rows = indices.size();
|
||||
|
||||
float* out = out_->data();
|
||||
const float* in = in_->data();
|
||||
|
||||
for (int j = 0; j < rows; ++j) {
|
||||
size_t dst = indices[j]; // not a permutation - may alias, unlike PasteCols
|
||||
for(int j = 0; j < rows; ++j) {
|
||||
size_t dst = indices[j]; // not a permutation - may alias, unlike PasteCols
|
||||
size_t src = j;
|
||||
|
||||
float* rowOut = out + dst*cols;
|
||||
const float* rowIn = in + src*cols;
|
||||
float* rowOut = out + dst * cols;
|
||||
const float* rowIn = in + src * cols;
|
||||
|
||||
for (int i = 0; i < cols; ++i) {
|
||||
for(int i = 0; i < cols; ++i) {
|
||||
rowOut[i] += rowIn[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void CopyCols(Tensor out_, const Tensor in_, const std::vector<size_t>& indices) {
|
||||
void CopyCols(Tensor out_,
|
||||
const Tensor in_,
|
||||
const std::vector<size_t>& indices) {
|
||||
size_t rows = in_->shape().elements() / in_->shape()[-1];
|
||||
size_t colsIn = in_->shape()[-1];
|
||||
size_t colsOut = indices.size();
|
||||
@ -306,18 +312,20 @@ void CopyCols(Tensor out_, const Tensor in_, const std::vector<size_t>& indices)
|
||||
float* out = out_->data();
|
||||
const float* in = in_->data();
|
||||
|
||||
#pragma omp parallel for
|
||||
for (int j = 0; j < rows; ++j) {
|
||||
const float* rowIn = in + j*colsIn;
|
||||
float* rowOut = out + j*colsOut;
|
||||
#pragma omp parallel for
|
||||
for(int j = 0; j < rows; ++j) {
|
||||
const float* rowIn = in + j * colsIn;
|
||||
float* rowOut = out + j * colsOut;
|
||||
|
||||
for (int i = 0; i < colsOut; ++i) {
|
||||
for(int i = 0; i < colsOut; ++i) {
|
||||
rowOut[i] = rowIn[indices[i]];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void PasteCols(Tensor out_, const Tensor in_, const std::vector<size_t>& indices) {
|
||||
void PasteCols(Tensor out_,
|
||||
const Tensor in_,
|
||||
const std::vector<size_t>& indices) {
|
||||
size_t rows = out_->shape().elements() / out_->shape()[-1];
|
||||
size_t colsOut = out_->shape()[-1];
|
||||
size_t colsIn = indices.size();
|
||||
@ -328,12 +336,12 @@ void PasteCols(Tensor out_, const Tensor in_, const std::vector<size_t>& indices
|
||||
/* n.b. Unlike PasteRows, currently appears safe to assume indices[i] is a
|
||||
* permutation i.e. no racy aliases, and no need to sum vs. just assign.
|
||||
*/
|
||||
for (int j = 0; j < rows; ++j) {
|
||||
const float* rowIn = in + j*colsIn;
|
||||
float* rowOut = out + j*colsOut;
|
||||
for(int j = 0; j < rows; ++j) {
|
||||
const float* rowIn = in + j * colsIn;
|
||||
float* rowOut = out + j * colsOut;
|
||||
|
||||
// @TODO: should this be a sum?
|
||||
for (int i = 0; i < colsIn; ++i) {
|
||||
for(int i = 0; i < colsIn; ++i) {
|
||||
rowOut[indices[i]] = rowIn[i];
|
||||
}
|
||||
}
|
||||
@ -367,8 +375,8 @@ void GRUFastForward(Tensor out_, std::vector<Tensor> inputs, bool final) {
|
||||
const float* b = inputs[3]->data();
|
||||
const float* mask = inputs.size() > 4 ? inputs[4]->data() : nullptr;
|
||||
|
||||
#pragma omp parallel for
|
||||
for (int j = 0; j < rows; ++j) {
|
||||
#pragma omp parallel for
|
||||
for(int j = 0; j < rows; ++j) {
|
||||
float m = !mask || mask[j];
|
||||
float* rowOut = out + j * cols;
|
||||
const float* rowState = state + j * cols;
|
||||
@ -376,8 +384,8 @@ void GRUFastForward(Tensor out_, std::vector<Tensor> inputs, bool final) {
|
||||
const float* xWrow = xW + j * cols * 3;
|
||||
const float* sUrow = sU + j * cols * 3;
|
||||
|
||||
#pragma omp simd
|
||||
for (int i = 0; i < cols; ++i) {
|
||||
#pragma omp simd
|
||||
for(int i = 0; i < cols; ++i) {
|
||||
// @TODO: stable logit
|
||||
float r = stableLogit(xWrow[i] + sUrow[i] + b[i]);
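
The stableLogit call above (flagged by the @TODO) is used as a sigmoid here. One common numerically safe way to evaluate the logistic function, given only as a general illustration rather than a claim about the helper's current implementation, branches on the sign so exp never overflows:

#include <cmath>

// Sketch of a stable sigmoid; assumed behaviour, the real helper is elsewhere.
inline float stableSigmoidSketch(float x) {
  return x >= 0.f ? 1.f / (1.f + std::exp(-x))
                  : std::exp(x) / (1.f + std::exp(x));
}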
|
||||
|
||||
@ -417,8 +425,8 @@ void GRUFastBackward(std::vector<Tensor> outputs,
|
||||
const float* mask = inputs.size() > 4 ? inputs[4]->data() : 0;
|
||||
const float* adj = adj_->data();
|
||||
|
||||
#pragma omp parallel
|
||||
for (int j = 0; j < rows; ++j) {
|
||||
#pragma omp parallel
|
||||
for(int j = 0; j < rows; ++j) {
|
||||
float m = !mask || mask[j];
|
||||
|
||||
float* rowOutState = outState + j * cols;
|
||||
@ -430,8 +438,8 @@ void GRUFastBackward(std::vector<Tensor> outputs,
|
||||
const float* rowSU = sU + j * cols * 3;
|
||||
const float* rowAdj = adj + j * cols;
|
||||
|
||||
#pragma omp for simd nowait
|
||||
for (int i = 0; i < cols; ++i) {
|
||||
#pragma omp for simd nowait
|
||||
for(int i = 0; i < cols; ++i) {
|
||||
int k = i + cols;
|
||||
int l = i + 2 * cols;
|
||||
|
||||
@ -446,10 +454,11 @@ void GRUFastBackward(std::vector<Tensor> outputs,
|
||||
|
||||
float adj = rowAdj[i];
|
||||
|
||||
float t = (1-z)*(1-h*h);
|
||||
float t = (1 - z) * (1 - h * h);
|
||||
|
||||
// df/ds
|
||||
if(outState) rowOutState[i] += (m * z - m + 1) * adj;
|
||||
if(outState)
|
||||
rowOutState[i] += (m * z - m + 1) * adj;
|
||||
|
||||
// df/d(xW_r) ...
|
||||
float dfdxW_r = m * r * (1 - r) * t * adj;
|
||||
@ -457,20 +466,28 @@ void GRUFastBackward(std::vector<Tensor> outputs,
|
||||
dfdxW_r *= rowSU[l] + b[l];
|
||||
else
|
||||
dfdxW_r *= rowSU[l];
|
||||
if(outXW) rowOutXW[i] += dfdxW_r;
|
||||
if(outSU) rowOutSU[i] += dfdxW_r;
|
||||
if(outB) outB[i] += dfdxW_r;
|
||||
if(outXW)
|
||||
rowOutXW[i] += dfdxW_r;
|
||||
if(outSU)
|
||||
rowOutSU[i] += dfdxW_r;
|
||||
if(outB)
|
||||
outB[i] += dfdxW_r;
|
||||
|
||||
// df/d(xW_z) ...
|
||||
float dfdxW_z = m * (1 - z) * z * (rowState[i] - h) * adj;
|
||||
if(outXW) rowOutXW[k] += dfdxW_z;
|
||||
if(outSU) rowOutSU[k] += dfdxW_z;
|
||||
if(outB) outB[k] += dfdxW_z;
|
||||
if(outXW)
|
||||
rowOutXW[k] += dfdxW_z;
|
||||
if(outSU)
|
||||
rowOutSU[k] += dfdxW_z;
|
||||
if(outB)
|
||||
outB[k] += dfdxW_z;
|
||||
|
||||
// df/d(xW_x) ...
|
||||
float dfdxW_x = m * t * adj;
|
||||
if(outXW) rowOutXW[l] += dfdxW_x;
|
||||
if(outSU) rowOutSU[l] += dfdxW_x * r;
|
||||
if(outXW)
|
||||
rowOutXW[l] += dfdxW_x;
|
||||
if(outSU)
|
||||
rowOutSU[l] += dfdxW_x * r;
|
||||
if(outB)
|
||||
if(final)
|
||||
outB[l] += dfdxW_x * r;
|
||||
@ -490,18 +507,18 @@ void CrossEntropyPick(Tensor out_, Tensor in_, Tensor pick_) {
|
||||
int rows = inShape.elements() / inShape.back();
|
||||
int cols = inShape.back();
|
||||
|
||||
#pragma omp parallel for
|
||||
for (int j = 0; j < rows; ++j) {
|
||||
const float* sp = in + j*cols;
|
||||
#pragma omp parallel for
|
||||
for(int j = 0; j < rows; ++j) {
|
||||
const float* sp = in + j * cols;
|
||||
float max = sp[0];
|
||||
#pragma omp simd reduction(max:max)
|
||||
for (int i = 1; i < cols; ++i) {
|
||||
#pragma omp simd reduction(max : max)
|
||||
for(int i = 1; i < cols; ++i) {
|
||||
max = std::max(max, sp[i]);
|
||||
}
|
||||
|
||||
float sum = 0.f;
|
||||
#pragma omp simd reduction(+:sum)
|
||||
for (int i = 0; i < cols; ++i) {
|
||||
#pragma omp simd reduction(+ : sum)
|
||||
for(int i = 0; i < cols; ++i) {
|
||||
sum += std::exp(sp[i] - max);
|
||||
}
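
The max and sum loops above are the two ingredients of a numerically stable cross-entropy from logits; for a picked class y of a logit row x (a general identity, not a quote of the elided remainder of this function):

CE(x, y) = -log softmax(x)_y = log sum_k exp(x_k - max_j x_j) - (x_y - max_j x_j)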
|
||||
|
||||
@ -512,7 +529,10 @@ void CrossEntropyPick(Tensor out_, Tensor in_, Tensor pick_) {
|
||||
}
|
||||
}
|
||||
|
||||
void CrossEntropyPickBackward(Tensor out_, Tensor adj_, Tensor a, Tensor pick_) {
|
||||
void CrossEntropyPickBackward(Tensor out_,
|
||||
Tensor adj_,
|
||||
Tensor a,
|
||||
Tensor pick_) {
|
||||
float* out = out_->data();
|
||||
Shape& outShape = out_->shape();
|
||||
const float* adj = adj_->data();
|
||||
@ -522,23 +542,23 @@ void CrossEntropyPickBackward(Tensor out_, Tensor adj_, Tensor a, Tensor pick_)
|
||||
int rows = outShape.elements() / outShape.back();
|
||||
int cols = outShape.back();
|
||||
|
||||
#pragma omp parallel for
|
||||
for (int j = 0; j < rows; ++j) {
|
||||
const float* sp = in + j*cols;
|
||||
float* so = out + j*cols;
|
||||
#pragma omp parallel for
|
||||
for(int j = 0; j < rows; ++j) {
|
||||
const float* sp = in + j * cols;
|
||||
float* so = out + j * cols;
|
||||
|
||||
float max = sp[0];
|
||||
for (int i = 1; i < cols; ++i) {
|
||||
for(int i = 1; i < cols; ++i) {
|
||||
max = std::max(max, sp[i]);
|
||||
}
|
||||
|
||||
float sum = 0.f;
|
||||
for (int i = 0; i < cols; ++i) {
|
||||
for(int i = 0; i < cols; ++i) {
|
||||
sum += std::exp(sp[i] - max);
|
||||
}
|
||||
|
||||
// cross-entropy
for (int i = 0; i < cols; ++i) {
for(int i = 0; i < cols; ++i) {
float sub = (float)(i == (int)pick[j]);
so[i] += adj[j] * (std::exp(sp[i] - max) / sum - sub);
}
@ -549,8 +569,8 @@ float L2Norm(Tensor in) {
float sum = 0.f;
size_t size = in->size();
const float* data = in->data();
#pragma omp parallel for simd reduction(+:sum)
for (size_t i = 0; i < size; ++i) {
#pragma omp parallel for simd reduction(+ : sum)
for(size_t i = 0; i < size; ++i) {
sum += data[i] * data[i];
}
return std::sqrt(sum);
@ -570,15 +590,15 @@ void Att(Tensor out_, Tensor va_, Tensor context_, Tensor state_) {
|
||||
int rows = m;
|
||||
int cols = k;
|
||||
|
||||
#pragma omp parallel for
|
||||
for (size_t j = 0; j < rows; ++j) {
|
||||
#pragma omp parallel for
|
||||
for(size_t j = 0; j < rows; ++j) {
|
||||
const float* vaRow = va;
|
||||
const float* ctxRow = ctx + (j % (b * t)) * cols;
|
||||
const float* stateRow = state + ((j / (b * t)) * b + j % b) * cols;
|
||||
|
||||
float sum = 0.f;
|
||||
#pragma omp simd reduction(+:sum)
|
||||
for (size_t i = 0; i < cols; ++i) {
|
||||
#pragma omp simd reduction(+ : sum)
|
||||
for(size_t i = 0; i < cols; ++i) {
|
||||
float z = ctxRow[i] + stateRow[i];
|
||||
sum += std::tanh(z) * vaRow[i];
|
||||
}
|
||||
@ -587,8 +607,12 @@ void Att(Tensor out_, Tensor va_, Tensor context_, Tensor state_) {
|
||||
}
|
||||
}
|
||||
|
||||
void AttBack(Tensor gVa_, Tensor gContext_, Tensor gState_,
|
||||
Tensor va_, Tensor context_, Tensor state_,
|
||||
void AttBack(Tensor gVa_,
|
||||
Tensor gContext_,
|
||||
Tensor gState_,
|
||||
Tensor va_,
|
||||
Tensor context_,
|
||||
Tensor state_,
|
||||
Tensor adj_) {
|
||||
float* gVa = gVa_->data();
|
||||
float* gContext = gContext_->data();
|
||||
@ -603,8 +627,8 @@ void AttBack(Tensor gVa_, Tensor gContext_, Tensor gState_,
|
||||
size_t k = context_->shape()[-1];
|
||||
size_t n = context_->shape()[-2];
|
||||
|
||||
#pragma omp parallel for reduction(+:gState[:n*k], gVa[:k])
|
||||
for (size_t j = 0; j < m; ++j) {
|
||||
#pragma omp parallel for reduction(+ : gState[ : n* k], gVa[ : k])
|
||||
for(size_t j = 0; j < m; ++j) {
|
||||
float* gcRow = gContext + j * k;
|
||||
float* gsRow = gState + (j % n) * k;
|
||||
|
||||
@ -613,8 +637,8 @@ void AttBack(Tensor gVa_, Tensor gContext_, Tensor gState_,
|
||||
|
||||
float adj_j = adj[j];
|
||||
|
||||
#pragma omp simd
|
||||
for (size_t i = 0; i < k; ++i) {
|
||||
#pragma omp simd
|
||||
for(size_t i = 0; i < k; ++i) {
|
||||
float z = cRow[i] + sRow[i];
|
||||
|
||||
float t = std::tanh(z);
|
||||
@ -642,31 +666,31 @@ void LayerNormalization(Tensor out_,
|
||||
int rows = in_->shape().elements() / in_->shape().back();
|
||||
int cols = in_->shape().back();
|
||||
|
||||
#pragma omp parallel for
|
||||
for (int j = 0; j < rows; ++j) {
|
||||
float* so = out + j*cols;
|
||||
const float* sp = in + j*cols;
|
||||
#pragma omp parallel for
|
||||
for(int j = 0; j < rows; ++j) {
|
||||
float* so = out + j * cols;
|
||||
const float* sp = in + j * cols;
|
||||
|
||||
float sum = 0.f;
|
||||
#pragma omp simd reduction(+:sum)
|
||||
for (int i = 0; i < cols; ++i) {
|
||||
#pragma omp simd reduction(+ : sum)
|
||||
for(int i = 0; i < cols; ++i) {
|
||||
sum += sp[i];
|
||||
}
|
||||
|
||||
float mean = sum / cols;
|
||||
float sqSum = 0.f;
|
||||
#pragma omp simd reduction(+:sqSum)
|
||||
for (int i = 0; i < cols; ++i) {
|
||||
#pragma omp simd reduction(+ : sqSum)
|
||||
for(int i = 0; i < cols; ++i) {
|
||||
float ex = sp[i] - mean;
|
||||
sqSum += ex*ex;
|
||||
sqSum += ex * ex;
|
||||
}
|
||||
|
||||
float sigma = std::sqrt(eps + sqSum / cols);
|
||||
|
||||
#pragma omp simd
|
||||
for (int i = 0; i < cols; ++i) {
|
||||
#pragma omp simd
|
||||
for(int i = 0; i < cols; ++i) {
|
||||
float t = alpha[i] * ((sp[i] - mean) / sigma);
|
||||
if (beta != nullptr) {
|
||||
if(beta != nullptr) {
|
||||
t += beta[i];
|
||||
}
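
In formula form, the row-wise normalization above computes (summary only):

y_i = gamma_i * (x_i - mu) / sqrt(eps + var) + beta_i,   mu = (1/n) sum_i x_i,   var = (1/n) sum_i (x_i - mu)^2

with n = cols, gamma = alpha in the code, and the beta term applied only when a bias tensor is given.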
|
||||
|
||||
@ -696,36 +720,36 @@ void LayerNormalizationGrad(Tensor gradX_,
|
||||
size_t rows = y_->shape().elements() / y_->shape()[-1];
|
||||
size_t cols = y_->shape()[-1];
|
||||
|
||||
if (beta) {
|
||||
#pragma omp parallel for reduction(+:gradGamma[:cols], gradBeta[:cols])
|
||||
for (size_t j = 0; j < rows; ++j) {
|
||||
const float* xRow = x + j*cols;
|
||||
const float* yRow = y + j*cols;
|
||||
const float* adjRow = adj + j*cols;
|
||||
float* gradXRow = gradX + j*cols;
|
||||
if(beta) {
|
||||
#pragma omp parallel for reduction(+ : gradGamma[ : cols], gradBeta[ : cols])
|
||||
for(size_t j = 0; j < rows; ++j) {
|
||||
const float* xRow = x + j * cols;
|
||||
const float* yRow = y + j * cols;
|
||||
const float* adjRow = adj + j * cols;
|
||||
float* gradXRow = gradX + j * cols;
|
||||
|
||||
float sum_x = 0.f;
|
||||
float sum_adj = 0.f;
|
||||
float sum_adj_x = 0.f;
|
||||
float sum_sqr = 0.f;
|
||||
|
||||
#pragma omp simd reduction(+:sum_x, sum_adj_x, sum_adj)
|
||||
for (size_t i = 0; i < cols; ++i) {
|
||||
#pragma omp simd reduction(+ : sum_x, sum_adj_x, sum_adj)
|
||||
for(size_t i = 0; i < cols; ++i) {
|
||||
sum_x += xRow[i];
|
||||
sum_adj_x += adjRow[i] * (yRow[i] - (beta ? beta[i] : 0.f)) / gamma[i];
|
||||
sum_adj += adjRow[i];
|
||||
}
|
||||
|
||||
float mean = sum_x / cols;
|
||||
#pragma omp simd reduction(+:sum_sqr)
|
||||
for (size_t i = 0; i < cols; ++i) {
|
||||
#pragma omp simd reduction(+ : sum_sqr)
|
||||
for(size_t i = 0; i < cols; ++i) {
|
||||
float ex = xRow[i] - mean;
|
||||
sum_sqr += ex*ex;
|
||||
sum_sqr += ex * ex;
|
||||
}
|
||||
|
||||
float sigma = std::sqrt(eps + sum_sqr / cols);
|
||||
#pragma omp simd
|
||||
for (size_t i = 0; i < cols; ++i) {
|
||||
#pragma omp simd
|
||||
for(size_t i = 0; i < cols; ++i) {
|
||||
float grad_x = 0.f;
|
||||
float x_hat = (yRow[i] - beta[i]) / gamma[i];
|
||||
grad_x += cols * adjRow[i];
|
||||
@ -739,35 +763,35 @@ void LayerNormalizationGrad(Tensor gradX_,
|
||||
}
|
||||
}
|
||||
} else {
|
||||
#pragma omp parallel for reduction(+:gradGamma[:cols])
|
||||
for (size_t j = 0; j < rows; ++j) {
|
||||
const float* xRow = x + j*cols;
|
||||
const float* yRow = y + j*cols;
|
||||
const float* adjRow = adj + j*cols;
|
||||
float* gradXRow = gradX + j*cols;
|
||||
#pragma omp parallel for reduction(+ : gradGamma[ : cols])
|
||||
for(size_t j = 0; j < rows; ++j) {
|
||||
const float* xRow = x + j * cols;
|
||||
const float* yRow = y + j * cols;
|
||||
const float* adjRow = adj + j * cols;
|
||||
float* gradXRow = gradX + j * cols;
|
||||
|
||||
float sum_x = 0.f;
|
||||
float sum_adj = 0.f;
|
||||
float sum_adj_x = 0.f;
|
||||
float sum_sqr = 0.f;
|
||||
|
||||
#pragma omp simd reduction(+:sum_x, sum_adj_x, sum_adj)
|
||||
for (size_t i = 0; i < cols; ++i) {
|
||||
#pragma omp simd reduction(+ : sum_x, sum_adj_x, sum_adj)
|
||||
for(size_t i = 0; i < cols; ++i) {
|
||||
sum_x += xRow[i];
|
||||
sum_adj_x += adjRow[i] * (yRow[i] - (beta ? beta[i] : 0.f)) / gamma[i];
|
||||
sum_adj += adjRow[i];
|
||||
}
|
||||
|
||||
float mean = sum_x / cols;
|
||||
#pragma omp simd reduction(+:sum_sqr)
|
||||
for (size_t i = 0; i < cols; ++i) {
|
||||
#pragma omp simd reduction(+ : sum_sqr)
|
||||
for(size_t i = 0; i < cols; ++i) {
|
||||
float ex = xRow[i] - mean;
|
||||
sum_sqr += ex*ex;
|
||||
sum_sqr += ex * ex;
|
||||
}
|
||||
|
||||
float sigma = std::sqrt(eps + sum_sqr / cols);
|
||||
#pragma omp simd
|
||||
for (size_t i = 0; i < cols; ++i) {
|
||||
#pragma omp simd
|
||||
for(size_t i = 0; i < cols; ++i) {
|
||||
float grad_x = 0.f;
|
||||
float x_hat = yRow[i] / gamma[i];
|
||||
grad_x += cols * adjRow[i];
|
||||
@ -794,9 +818,9 @@ void Shift(Tensor out_, Tensor in_, marian::Shape shift, bool invert) {
|
||||
const float* in = in_->data();
|
||||
|
||||
int length = out_->shape().elements();
|
||||
#pragma omp parallel for
|
||||
for (int i = 0; i < length; ++i) {
|
||||
if (i - offset < 0 || i - offset >= length) {
|
||||
#pragma omp parallel for
|
||||
for(int i = 0; i < length; ++i) {
|
||||
if(i - offset < 0 || i - offset >= length) {
|
||||
out[i] = 0.f;
|
||||
} else {
|
||||
out[i] = in[i - offset];
|
||||
@ -808,7 +832,7 @@ void SetSparse(float* out,
|
||||
const std::vector<size_t>& indices,
|
||||
const std::vector<float>& values) {
|
||||
int length = indices.size();
|
||||
for (int index = 0; index < length; ++index) {
|
||||
for(int index = 0; index < length; ++index) {
|
||||
out[indices[index]] = values[index];
|
||||
}
|
||||
}
|
||||
@ -824,26 +848,26 @@ void LSTMCellForward(Tensor out_, std::vector<Tensor> inputs) {
|
||||
const float* b = inputs[3]->data();
|
||||
const float* mask = inputs.size() > 4 ? inputs[4]->data() : nullptr;
|
||||
|
||||
for (int j = 0; j < rows; ++j) {
|
||||
for(int j = 0; j < rows; ++j) {
|
||||
float m = !mask || mask[j];
|
||||
|
||||
float* rowOut = out + j*cols;
|
||||
const float* rowCell = cell + j*cols;
|
||||
float* rowOut = out + j * cols;
|
||||
const float* rowCell = cell + j * cols;
|
||||
|
||||
const float* xWrow = xW + j*cols*4;
|
||||
const float* sUrow = sU + j*cols*4;
|
||||
const float* xWrow = xW + j * cols * 4;
|
||||
const float* sUrow = sU + j * cols * 4;
|
||||
|
||||
for (int i = 0; i < cols; ++i) {
|
||||
for(int i = 0; i < cols; ++i) {
|
||||
float gf = stableLogit(xWrow[i] + sUrow[i] + b[i]);
|
||||
|
||||
int k = i + cols;
|
||||
float gi = stableLogit(xWrow[k] + sUrow[k] + b[k]);
|
||||
|
||||
int l = i + 2*cols;
|
||||
int l = i + 2 * cols;
|
||||
float gc = std::tanh(xWrow[l] + sUrow[l] + b[l]);
|
||||
|
||||
float cout = gf*rowCell[i] + gi*gc;
|
||||
rowOut[i] = m*cout + (1-m)*rowCell[i];
|
||||
float cout = gf * rowCell[i] + gi * gc;
|
||||
rowOut[i] = m * cout + (1 - m) * rowCell[i];
|
||||
}
|
||||
}
|
||||
}
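
Written out, the cell update in the loop above and the output step in the following hunk are the standard LSTM equations (a summary of the code, with stableLogit read as the sigmoid):

f = sigmoid(xW_f + sU_f + b_f),  i = sigmoid(xW_i + sU_i + b_i),  c~ = tanh(xW_c + sU_c + b_c)
c_t = f * c_{t-1} + i * c~        (blended back to c_{t-1} where the batch mask m is 0)
o = sigmoid(xW_o + sU_o + b_o),  h_t = o * tanh(c_t)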
|
||||
@ -858,15 +882,15 @@ void LSTMOutputForward(Tensor out_, std::vector<Tensor> inputs) {
|
||||
const float* sU = inputs[2]->data();
|
||||
const float* b = inputs[3]->data();
|
||||
|
||||
for (int j = 0; j <rows; ++j) {
|
||||
float* rowOut = out + j*cols;
|
||||
const float* rowCell = cell + j*cols;
|
||||
for(int j = 0; j < rows; ++j) {
|
||||
float* rowOut = out + j * cols;
|
||||
const float* rowCell = cell + j * cols;
|
||||
|
||||
const float* xWrow = xW + j*cols*4;
|
||||
const float* sUrow = sU + j*cols*4;
|
||||
const float* xWrow = xW + j * cols * 4;
|
||||
const float* sUrow = sU + j * cols * 4;
|
||||
|
||||
for (int i = 0; i < cols; ++i) {
|
||||
int k = i + 3*cols;
|
||||
for(int i = 0; i < cols; ++i) {
|
||||
int k = i + 3 * cols;
|
||||
float go = stableLogit(xWrow[k] + sUrow[k] + b[k]);
|
||||
|
||||
rowOut[i] = go * std::tanh(rowCell[i]);
|
||||
@ -893,52 +917,70 @@ void LSTMCellBackward(std::vector<Tensor> outputs,
|
||||
const float* mask = inputs.size() > 4 ? inputs[4]->data() : nullptr;
|
||||
const float* adj = adj_->data();
|
||||
|
||||
for (int j = 0; j <rows; ++j) {
|
||||
for(int j = 0; j < rows; ++j) {
|
||||
float m = !mask || mask[j];
|
||||
|
||||
float* rowOutCell = outCell + j*cols;
|
||||
float* rowOutXW = outXW + j*cols*4;
|
||||
float* rowOutSU = outSU + j*cols*4;
|
||||
float* rowOutCell = outCell + j * cols;
|
||||
float* rowOutXW = outXW + j * cols * 4;
|
||||
float* rowOutSU = outSU + j * cols * 4;
|
||||
|
||||
const float* rowCell = cell + j*cols;
|
||||
const float* xWrow = xW + j*cols*4;
|
||||
const float* sUrow = sU + j*cols*4;
|
||||
const float* rowCell = cell + j * cols;
|
||||
const float* xWrow = xW + j * cols * 4;
|
||||
const float* sUrow = sU + j * cols * 4;
|
||||
|
||||
const float* rowAdj = adj + j*cols;
|
||||
const float* rowAdj = adj + j * cols;
|
||||
|
||||
for (int i = 0; i < cols; ++i) {
|
||||
for(int i = 0; i < cols; ++i) {
|
||||
float gf = stableLogit(xWrow[i] + sUrow[i] + b[i]);
|
||||
|
||||
int k = i + cols;
|
||||
float gi = stableLogit(xWrow[k] + sUrow[k] + b[k]);
|
||||
|
||||
int l = i + 2*cols;
|
||||
int l = i + 2 * cols;
|
||||
float gc = std::tanh(xWrow[l] + sUrow[l] + b[l]);
|
||||
|
||||
float adj = rowAdj[i];
|
||||
|
||||
// dc/dx_{t-1}
|
||||
if (outCell) {
|
||||
rowOutCell[i] += (m*gf - m + 1)*adj;
|
||||
if(outCell) {
|
||||
rowOutCell[i] += (m * gf - m + 1) * adj;
|
||||
}
|
||||
|
||||
// dc/d(b_f) = dc/d(xW_f) ...
|
||||
float dcdxf = m*rowCell[i] * gf*(1-gf) * adj;
|
||||
if (outXW) { rowOutXW[i] += dcdxf; }
|
||||
if (outSU) { rowOutSU[i] += dcdxf; }
|
||||
if (outB) { outB[i] += dcdxf; }
|
||||
float dcdxf = m * rowCell[i] * gf * (1 - gf) * adj;
|
||||
if(outXW) {
|
||||
rowOutXW[i] += dcdxf;
|
||||
}
|
||||
if(outSU) {
|
||||
rowOutSU[i] += dcdxf;
|
||||
}
|
||||
if(outB) {
|
||||
outB[i] += dcdxf;
|
||||
}
|
||||
|
||||
// dc/d(b_i) ...
|
||||
float dcdb_i = m * gc * gi*(1-gi) * adj;
|
||||
if (outXW) { rowOutXW[k] += dcdb_i; }
|
||||
if (outSU) { rowOutSU[k] += dcdb_i; }
|
||||
if (outB) { outB[k] += dcdb_i; }
|
||||
float dcdb_i = m * gc * gi * (1 - gi) * adj;
|
||||
if(outXW) {
|
||||
rowOutXW[k] += dcdb_i;
|
||||
}
|
||||
if(outSU) {
|
||||
rowOutSU[k] += dcdb_i;
|
||||
}
|
||||
if(outB) {
|
||||
outB[k] += dcdb_i;
|
||||
}
|
||||
|
||||
// dc/d(b_c) ...
|
||||
float dcdxc = m * gi * (1 - gc*gc) * adj;
|
||||
if (outXW) { rowOutXW[l] += dcdxc; }
|
||||
if (outSU) { rowOutSU[l] += dcdxc; }
|
||||
if (outB) { outB[l] += dcdxc; }
|
||||
float dcdxc = m * gi * (1 - gc * gc) * adj;
|
||||
if(outXW) {
|
||||
rowOutXW[l] += dcdxc;
|
||||
}
|
||||
if(outSU) {
|
||||
rowOutSU[l] += dcdxc;
|
||||
}
|
||||
if(outB) {
|
||||
outB[l] += dcdxc;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -961,19 +1003,19 @@ void LSTMOutputBackward(std::vector<Tensor> outputs,
|
||||
|
||||
const float* adj = adj_->data();
|
||||
|
||||
for (int j = 0; j < rows; ++j) {
|
||||
float* rowOutCell = outCell + j*cols;
|
||||
float* rowOutXW = outXW + j*cols*4;
|
||||
float* rowOutSU = outSU + j*cols*4;
|
||||
for(int j = 0; j < rows; ++j) {
|
||||
float* rowOutCell = outCell + j * cols;
|
||||
float* rowOutXW = outXW + j * cols * 4;
|
||||
float* rowOutSU = outSU + j * cols * 4;
|
||||
|
||||
const float* rowCell = cell + j*cols;
|
||||
const float* xWrow = xW + j*cols*4;
|
||||
const float* sUrow = sU + j*cols*4;
|
||||
const float* rowCell = cell + j * cols;
|
||||
const float* xWrow = xW + j * cols * 4;
|
||||
const float* sUrow = sU + j * cols * 4;
|
||||
|
||||
const float* rowAdj = adj + j*cols;
|
||||
const float* rowAdj = adj + j * cols;
|
||||
|
||||
for (int i = 0; i < cols; ++i) {
|
||||
int k = i + 3*cols;
|
||||
for(int i = 0; i < cols; ++i) {
|
||||
int k = i + 3 * cols;
|
||||
float go = stableLogit(xWrow[k] + sUrow[k] + b[k]);
|
||||
|
||||
float t = std::tanh(rowCell[i]);
|
||||
@ -981,15 +1023,21 @@ void LSTMOutputBackward(std::vector<Tensor> outputs,
|
||||
float adj = rowAdj[i];
|
||||
|
||||
// dc/dc_{t-1}
|
||||
if (outCell) {
|
||||
rowOutCell[i] += go * (1 - t*t) * adj;
|
||||
if(outCell) {
|
||||
rowOutCell[i] += go * (1 - t * t) * adj;
|
||||
}
|
||||
|
||||
// dc/d(b_o) = dc/d(xW_f) ...
|
||||
float dcdxo = t * go*(1-go) * adj;
|
||||
if (outXW) { rowOutXW[k] += dcdxo; }
|
||||
if (outSU) { rowOutSU[k] += dcdxo; }
|
||||
if (outB) { outB[k] += dcdxo; }
|
||||
float dcdxo = t * go * (1 - go) * adj;
|
||||
if(outXW) {
|
||||
rowOutXW[k] += dcdxo;
|
||||
}
|
||||
if(outSU) {
|
||||
rowOutSU[k] += dcdxo;
|
||||
}
|
||||
if(outB) {
|
||||
outB[k] += dcdxo;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1027,6 +1075,5 @@ void PoolingWithMaskingBackward(Tensor adj,
|
||||
bool isEven) {
|
||||
ABORT("Not implemented!");
|
||||
}
|
||||
|
||||
}
|
||||
} // namespace marian
|
||||
|
@ -23,7 +23,7 @@ public:
Device(DeviceId deviceId, size_t alignment = 256)
: deviceId_(deviceId), data_(0), size_(0), alignment_(alignment) {}

virtual ~Device() {};
virtual ~Device(){};

virtual void reserve(size_t size) = 0;

@ -35,30 +35,31 @@ public:
|
||||
};
|
||||
|
||||
namespace gpu {
|
||||
class Device : public marian::Device {
|
||||
public:
|
||||
Device(DeviceId deviceId, size_t alignment = 256)
|
||||
class Device : public marian::Device {
|
||||
public:
|
||||
Device(DeviceId deviceId, size_t alignment = 256)
|
||||
: marian::Device(deviceId, alignment) {}
|
||||
|
||||
~Device();
|
||||
~Device();
|
||||
|
||||
void reserve(size_t size);
|
||||
};
|
||||
void reserve(size_t size);
|
||||
};
|
||||
}
|
||||
|
||||
namespace cpu {
|
||||
class Device : public marian::Device {
|
||||
public:
|
||||
Device(DeviceId deviceId, size_t alignment = 256)
|
||||
class Device : public marian::Device {
|
||||
public:
|
||||
Device(DeviceId deviceId, size_t alignment = 256)
|
||||
: marian::Device(deviceId, alignment) {}
|
||||
|
||||
~Device();
|
||||
~Device();
|
||||
|
||||
void reserve(size_t size);
|
||||
};
|
||||
void reserve(size_t size);
|
||||
};
|
||||
}
|
||||
|
||||
static inline Ptr<Device> DispatchDevice(DeviceId deviceId, size_t alignment = 256) {
|
||||
static inline Ptr<Device> DispatchDevice(DeviceId deviceId,
|
||||
size_t alignment = 256) {
|
||||
#ifdef CUDA_FOUND
|
||||
if(deviceId.type == DeviceType::gpu)
|
||||
return New<gpu::Device>(deviceId, alignment);
|
||||
@ -71,5 +72,4 @@ static inline Ptr<Device> DispatchDevice(DeviceId deviceId, size_t alignment = 2
|
||||
return New<cpu::Device>(deviceId, alignment);
|
||||
#endif
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -2,204 +2,250 @@
|
||||
|
||||
#ifdef CUDA_FOUND
|
||||
|
||||
#define DISPATCH1(Function, Arg1) \
|
||||
namespace gpu { \
|
||||
void Function(Arg1); \
|
||||
} \
|
||||
namespace cpu { \
|
||||
void Function(Arg1); \
|
||||
} \
|
||||
void Function(Arg1 arg1) { \
|
||||
#define DISPATCH1(Function, Arg1) \
|
||||
namespace gpu { \
|
||||
void Function(Arg1); \
|
||||
} \
|
||||
namespace cpu { \
|
||||
void Function(Arg1); \
|
||||
} \
|
||||
void Function(Arg1 arg1) { \
|
||||
if(arg1->getBackend()->getDevice().type == DeviceType::gpu) \
|
||||
gpu::Function(arg1); \
|
||||
else \
|
||||
cpu::Function(arg1); \
|
||||
gpu::Function(arg1); \
|
||||
else \
|
||||
cpu::Function(arg1); \
|
||||
}
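
As a usage illustration (the call is hypothetical, not taken from this diff), DISPATCH1(Foo, marian::Tensor) under CUDA_FOUND declares gpu::Foo and cpu::Foo and defines a front-end Foo that routes on the first argument's backend, roughly:

namespace gpu { void Foo(marian::Tensor); }
namespace cpu { void Foo(marian::Tensor); }
void Foo(marian::Tensor arg1) {
  // Route to the GPU or CPU implementation depending on the tensor's device.
  if(arg1->getBackend()->getDevice().type == DeviceType::gpu)
    gpu::Foo(arg1);
  else
    cpu::Foo(arg1);
}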
|
||||
|
||||
#define DISPATCH2(Function, Arg1, Arg2) \
|
||||
namespace gpu { \
|
||||
void Function(Arg1, Arg2); \
|
||||
} \
|
||||
namespace cpu { \
|
||||
void Function(Arg1, Arg2); \
|
||||
} \
|
||||
static inline void Function(Arg1 arg1, Arg2 arg2) { \
|
||||
#define DISPATCH2(Function, Arg1, Arg2) \
|
||||
namespace gpu { \
|
||||
void Function(Arg1, Arg2); \
|
||||
} \
|
||||
namespace cpu { \
|
||||
void Function(Arg1, Arg2); \
|
||||
} \
|
||||
static inline void Function(Arg1 arg1, Arg2 arg2) { \
|
||||
if(arg1->getBackend()->getDevice().type == DeviceType::gpu) \
|
||||
gpu::Function(arg1, arg2); \
|
||||
else \
|
||||
cpu::Function(arg1, arg2); \
|
||||
gpu::Function(arg1, arg2); \
|
||||
else \
|
||||
cpu::Function(arg1, arg2); \
|
||||
}
|
||||
|
||||
#define DISPATCH3(Function, Arg1, Arg2, Arg3) \
|
||||
namespace gpu { \
|
||||
void Function(Arg1, Arg2, Arg3); \
|
||||
} \
|
||||
namespace cpu { \
|
||||
void Function(Arg1, Arg2, Arg3); \
|
||||
} \
|
||||
#define DISPATCH3(Function, Arg1, Arg2, Arg3) \
|
||||
namespace gpu { \
|
||||
void Function(Arg1, Arg2, Arg3); \
|
||||
} \
|
||||
namespace cpu { \
|
||||
void Function(Arg1, Arg2, Arg3); \
|
||||
} \
|
||||
static inline void Function(Arg1 arg1, Arg2 arg2, Arg3 arg3) { \
|
||||
if(arg1->getBackend()->getDevice().type == DeviceType::gpu) \
|
||||
gpu::Function(arg1, arg2, arg3); \
|
||||
else \
|
||||
cpu::Function(arg1, arg2, arg3); \
|
||||
if(arg1->getBackend()->getDevice().type == DeviceType::gpu) \
|
||||
gpu::Function(arg1, arg2, arg3); \
|
||||
else \
|
||||
cpu::Function(arg1, arg2, arg3); \
|
||||
}
|
||||
|
||||
#define DISPATCH4(Function, Arg1, Arg2, Arg3, Arg4) \
|
||||
namespace gpu { \
|
||||
void Function(Arg1, Arg2, Arg3, Arg4); \
|
||||
} \
|
||||
namespace cpu { \
|
||||
void Function(Arg1, Arg2, Arg3, Arg4); \
|
||||
} \
|
||||
#define DISPATCH4(Function, Arg1, Arg2, Arg3, Arg4) \
|
||||
namespace gpu { \
|
||||
void Function(Arg1, Arg2, Arg3, Arg4); \
|
||||
} \
|
||||
namespace cpu { \
|
||||
void Function(Arg1, Arg2, Arg3, Arg4); \
|
||||
} \
|
||||
static inline void Function(Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4) { \
|
||||
if(arg1->getBackend()->getDevice().type == DeviceType::gpu) \
|
||||
gpu::Function(arg1, arg2, arg3, arg4); \
|
||||
else \
|
||||
cpu::Function(arg1, arg2, arg3, arg4); \
|
||||
if(arg1->getBackend()->getDevice().type == DeviceType::gpu) \
|
||||
gpu::Function(arg1, arg2, arg3, arg4); \
|
||||
else \
|
||||
cpu::Function(arg1, arg2, arg3, arg4); \
|
||||
}
|
||||
|
||||
#define DISPATCH5(Function, Arg1, Arg2, Arg3, Arg4, Arg5) \
|
||||
namespace gpu { \
|
||||
void Function(Arg1, Arg2, Arg3, Arg4, Arg5); \
|
||||
} \
|
||||
namespace cpu { \
|
||||
void Function(Arg1, Arg2, Arg3, Arg4, Arg5); \
|
||||
} \
|
||||
static inline void Function(Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5) { \
|
||||
#define DISPATCH5(Function, Arg1, Arg2, Arg3, Arg4, Arg5) \
|
||||
namespace gpu { \
|
||||
void Function(Arg1, Arg2, Arg3, Arg4, Arg5); \
|
||||
} \
|
||||
namespace cpu { \
|
||||
void Function(Arg1, Arg2, Arg3, Arg4, Arg5); \
|
||||
} \
|
||||
static inline void Function( \
|
||||
Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5) { \
|
||||
if(arg1->getBackend()->getDevice().type == DeviceType::gpu) \
|
||||
gpu::Function(arg1, arg2, arg3, arg4, arg5); \
|
||||
else \
|
||||
cpu::Function(arg1, arg2, arg3, arg4, arg5); \
|
||||
gpu::Function(arg1, arg2, arg3, arg4, arg5); \
|
||||
else \
|
||||
cpu::Function(arg1, arg2, arg3, arg4, arg5); \
|
||||
}
|
||||
|
||||
#define DISPATCH6(Function, Arg1, Arg2, Arg3, Arg4, Arg5, Arg6) \
|
||||
namespace gpu { \
|
||||
void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6); \
|
||||
} \
|
||||
namespace cpu { \
|
||||
void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6); \
|
||||
} \
|
||||
static inline void Function(Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5, Arg6 arg6) { \
|
||||
if(arg1->getBackend()->getDevice().type == DeviceType::gpu) \
|
||||
gpu::Function(arg1, arg2, arg3, arg4, arg5, arg6); \
|
||||
else \
|
||||
cpu::Function(arg1, arg2, arg3, arg4, arg5, arg6); \
|
||||
#define DISPATCH6(Function, Arg1, Arg2, Arg3, Arg4, Arg5, Arg6) \
|
||||
namespace gpu { \
|
||||
void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6); \
|
||||
} \
|
||||
namespace cpu { \
|
||||
void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6); \
|
||||
} \
|
||||
static inline void Function( \
|
||||
Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5, Arg6 arg6) { \
|
||||
if(arg1->getBackend()->getDevice().type == DeviceType::gpu) \
|
||||
gpu::Function(arg1, arg2, arg3, arg4, arg5, arg6); \
|
||||
else \
|
||||
cpu::Function(arg1, arg2, arg3, arg4, arg5, arg6); \
|
||||
}
|
||||
|
||||
#define DISPATCH7(Function, Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7) \
|
||||
namespace gpu { \
|
||||
void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7); \
|
||||
} \
|
||||
namespace cpu { \
|
||||
void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7); \
|
||||
} \
|
||||
static inline void Function(Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5, Arg6 arg6, Arg7 arg7) { \
|
||||
if(arg1->getBackend()->getDevice().type == DeviceType::gpu) \
|
||||
gpu::Function(arg1, arg2, arg3, arg4, arg5, arg6, arg7); \
|
||||
else \
|
||||
cpu::Function(arg1, arg2, arg3, arg4, arg5, arg6, arg7); \
|
||||
namespace gpu { \
|
||||
void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7); \
|
||||
} \
|
||||
namespace cpu { \
|
||||
void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7); \
|
||||
} \
|
||||
static inline void Function(Arg1 arg1, \
|
||||
Arg2 arg2, \
|
||||
Arg3 arg3, \
|
||||
Arg4 arg4, \
|
||||
Arg5 arg5, \
|
||||
Arg6 arg6, \
|
||||
Arg7 arg7) { \
|
||||
if(arg1->getBackend()->getDevice().type == DeviceType::gpu) \
|
||||
gpu::Function(arg1, arg2, arg3, arg4, arg5, arg6, arg7); \
|
||||
else \
|
||||
cpu::Function(arg1, arg2, arg3, arg4, arg5, arg6, arg7); \
|
||||
}
|
||||
|
||||
#define DISPATCH8(Function, Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8) \
namespace gpu { \
void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8); \
} \
namespace cpu { \
void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8); \
} \
static inline void Function(Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5, Arg6 arg6, Arg7 arg7, Arg8 arg8) { \
if(arg1->getBackend()->getDevice().type == DeviceType::gpu) \
gpu::Function(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8); \
else \
cpu::Function(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8); \
namespace gpu { \
void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8); \
} \
namespace cpu { \
void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8); \
} \
static inline void Function(Arg1 arg1, \
Arg2 arg2, \
Arg3 arg3, \
Arg4 arg4, \
Arg5 arg5, \
Arg6 arg6, \
Arg7 arg7, \
Arg8 arg8) { \
if(arg1->getBackend()->getDevice().type == DeviceType::gpu) \
gpu::Function(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8); \
else \
cpu::Function(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8); \
}

#define DISPATCH9(Function, Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8, Arg9) \
namespace gpu { \
void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8, Arg9); \
} \
namespace cpu { \
void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8, Arg9); \
} \
static inline void Function(Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5, Arg6 arg6, Arg7 arg7, Arg8 arg8, Arg9 arg9) { \
if(arg1->getBackend()->getDevice().type == DeviceType::gpu) \
#define DISPATCH9( \
Function, Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8, Arg9) \
namespace gpu { \
void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8, Arg9); \
} \
namespace cpu { \
void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8, Arg9); \
} \
static inline void Function(Arg1 arg1, \
Arg2 arg2, \
Arg3 arg3, \
Arg4 arg4, \
Arg5 arg5, \
Arg6 arg6, \
Arg7 arg7, \
Arg8 arg8, \
Arg9 arg9) { \
if(arg1->getBackend()->getDevice().type == DeviceType::gpu) \
gpu::Function(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9); \
else \
else \
cpu::Function(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9); \
}

#else

#define DISPATCH1(Function, Arg1) \
namespace cpu { \
void Function(Arg1); \
} \
void Function(Arg1 arg1) { \
cpu::Function(arg1); \
}
namespace cpu { \
void Function(Arg1); \
} \
void Function(Arg1 arg1) { cpu::Function(arg1); }

#define DISPATCH2(Function, Arg1, Arg2) \
namespace cpu { \
void Function(Arg1, Arg2); \
} \
#define DISPATCH2(Function, Arg1, Arg2) \
namespace cpu { \
void Function(Arg1, Arg2); \
} \
static inline void Function(Arg1 arg1, Arg2 arg2) { \
cpu::Function(arg1, arg2); \
cpu::Function(arg1, arg2); \
}

#define DISPATCH3(Function, Arg1, Arg2, Arg3) \
namespace cpu { \
void Function(Arg1, Arg2, Arg3); \
} \
#define DISPATCH3(Function, Arg1, Arg2, Arg3) \
namespace cpu { \
void Function(Arg1, Arg2, Arg3); \
} \
static inline void Function(Arg1 arg1, Arg2 arg2, Arg3 arg3) { \
cpu::Function(arg1, arg2, arg3); \
cpu::Function(arg1, arg2, arg3); \
}

#define DISPATCH4(Function, Arg1, Arg2, Arg3, Arg4) \
namespace cpu { \
void Function(Arg1, Arg2, Arg3, Arg4); \
} \
#define DISPATCH4(Function, Arg1, Arg2, Arg3, Arg4) \
namespace cpu { \
void Function(Arg1, Arg2, Arg3, Arg4); \
} \
static inline void Function(Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4) { \
cpu::Function(arg1, arg2, arg3, arg4); \
cpu::Function(arg1, arg2, arg3, arg4); \
}

#define DISPATCH5(Function, Arg1, Arg2, Arg3, Arg4, Arg5) \
namespace cpu { \
void Function(Arg1, Arg2, Arg3, Arg4, Arg5); \
} \
static inline void Function(Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5) { \
cpu::Function(arg1, arg2, arg3, arg4, arg5); \
#define DISPATCH5(Function, Arg1, Arg2, Arg3, Arg4, Arg5) \
namespace cpu { \
void Function(Arg1, Arg2, Arg3, Arg4, Arg5); \
} \
static inline void Function( \
Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5) { \
cpu::Function(arg1, arg2, arg3, arg4, arg5); \
}

#define DISPATCH6(Function, Arg1, Arg2, Arg3, Arg4, Arg5, Arg6) \
namespace cpu { \
void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6); \
} \
static inline void Function(Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5, Arg6 arg6) { \
cpu::Function(arg1, arg2, arg3, arg4, arg5, arg6); \
#define DISPATCH6(Function, Arg1, Arg2, Arg3, Arg4, Arg5, Arg6) \
namespace cpu { \
void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6); \
} \
static inline void Function( \
Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5, Arg6 arg6) { \
cpu::Function(arg1, arg2, arg3, arg4, arg5, arg6); \
}

#define DISPATCH7(Function, Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7) \
namespace cpu { \
void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7); \
} \
static inline void Function(Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5, Arg6 arg6, Arg7 arg7) { \
cpu::Function(arg1, arg2, arg3, arg4, arg5, arg6, arg7); \
namespace cpu { \
void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7); \
} \
static inline void Function(Arg1 arg1, \
Arg2 arg2, \
Arg3 arg3, \
Arg4 arg4, \
Arg5 arg5, \
Arg6 arg6, \
Arg7 arg7) { \
cpu::Function(arg1, arg2, arg3, arg4, arg5, arg6, arg7); \
}

#define DISPATCH8(Function, Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8) \
namespace cpu { \
void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8); \
} \
static inline void Function(Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5, Arg6 arg6, Arg7 arg7, Arg8 arg8) { \
cpu::Function(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8); \
namespace cpu { \
void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8); \
} \
static inline void Function(Arg1 arg1, \
Arg2 arg2, \
Arg3 arg3, \
Arg4 arg4, \
Arg5 arg5, \
Arg6 arg6, \
Arg7 arg7, \
Arg8 arg8) { \
cpu::Function(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8); \
}

#define DISPATCH9(Function, Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8, Arg9) \
namespace cpu { \
void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8, Arg9); \
} \
static inline void Function(Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5, Arg6 arg6, Arg7 arg7, Arg8 arg8, Arg9 arg9) { \
cpu::Function(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9); \
#define DISPATCH9( \
Function, Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8, Arg9) \
namespace cpu { \
void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8, Arg9); \
} \
static inline void Function(Arg1 arg1, \
Arg2 arg2, \
Arg3 arg3, \
Arg4 arg4, \
Arg5 arg5, \
Arg6 arg6, \
Arg7 arg7, \
Arg8 arg8, \
Arg9 arg9) { \
cpu::Function(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9); \
}

#endif

@ -9,8 +9,8 @@

#include "functional/functional.h"
#include "functional/shape.h"
#include "functional/tmp.h"
#include "functional/tensor.h"
#include "functional/tmp.h"

namespace marian {

@ -22,7 +22,6 @@ __global__ void gAddGeneric(Functor functor,
functional::Tensor<float> out,
functional::Array<functional::Tensor<float>, K> ins,
float scale = 1.0) {

int outLength = out.shape().elements();
bool same = outLength == full.elements();
for(int i = 0; i < K; ++i)
@ -37,14 +36,12 @@ __global__ void gAddGeneric(Functor functor,
for(int bid = 0; bid < outLength; bid += blockDim.x * gridDim.x) {
int index = bid + blockDim.x * blockIdx.x + threadIdx.x;
if(index < outLength) {

if(same) {
out[index] += functional::apply(functor, ins, index) * scale;
} else {
out.shape().dims(index, dims);
out[index] += functional::loops(functor, ins, len, dims) * scale;
}

}
}
}
@ -81,7 +78,6 @@ __global__ void gAddReduce(Functor functor,
functional::Tensor<float> out,
functional::Array<functional::Tensor<float>, K> ins,
float scale = 1.0) {

int rows = full.elements() / full.back();
int cols = full.back();

@ -133,12 +129,8 @@ __global__ void gAddReduce(Functor functor,
}
}

template <class Functor, class ...Tensors>
void Add(Functor functor,
float scale,
marian::Tensor out,
Tensors... tensors) {

template <class Functor, class... Tensors>
void Add(Functor functor, float scale, marian::Tensor out, Tensors... tensors) {
cudaSetDevice(out->getDevice().no);

auto full = marian::Shape::broadcast({out, tensors...});
@ -148,7 +140,7 @@ void Add(Functor functor,
constexpr size_t K = sizeof...(Tensors);

functional::Tensor<float> gOut = out;
functional::Array<functional::Tensor<float>, K> gIns = {tensors ...};
functional::Array<functional::Tensor<float>, K> gIns = {tensors...};

if(full.back() != 1 && out->shape().back() == 1) {
size_t m = full.elements() / length;
@ -180,6 +172,5 @@ void Add(Functor functor,
}

#include "tensors/gpu/add.inc"

}
}

@ -6,11 +6,7 @@ namespace marian {

namespace gpu {

template <class Functor, class ...Tensors>
void Add(Functor functor,
float scale,
marian::Tensor out,
Tensors... tensors);

template <class Functor, class... Tensors>
void Add(Functor functor, float scale, marian::Tensor out, Tensors... tensors);
}
}

@ -4,45 +4,49 @@
|
||||
#include "tensors/gpu/cuda_helpers.h"
|
||||
|
||||
namespace marian {
|
||||
namespace gpu {
|
||||
template <typename T>
|
||||
void copy(Ptr<Backend> backend, const T* begin, const T* end, T* dest) {
|
||||
CUDA_CHECK(cudaSetDevice(backend->getDevice().no));
|
||||
CudaCopy(begin, end, dest);
|
||||
CUDA_CHECK(cudaStreamSynchronize(0));
|
||||
namespace gpu {
|
||||
template <typename T>
|
||||
void copy(Ptr<Backend> backend, const T* begin, const T* end, T* dest) {
|
||||
CUDA_CHECK(cudaSetDevice(backend->getDevice().no));
|
||||
CudaCopy(begin, end, dest);
|
||||
CUDA_CHECK(cudaStreamSynchronize(0));
|
||||
}
|
||||
|
||||
template void copy<float>(Ptr<Backend> backend,
|
||||
const float* begin,
|
||||
const float* end,
|
||||
float* dest);
|
||||
template void copy<int>(Ptr<Backend> backend,
|
||||
const int* begin,
|
||||
const int* end,
|
||||
int* dest);
|
||||
|
||||
__global__ void gFill(float* d_in, int size, float val) {
|
||||
for(int bid = 0; bid < size; bid += blockDim.x * gridDim.x) {
|
||||
int index = bid + threadIdx.x + blockDim.x * blockIdx.x;
|
||||
if(index < size) {
|
||||
d_in[index] = val;
|
||||
}
|
||||
|
||||
template void copy<float>(Ptr<Backend> backend, const float* begin, const float* end, float* dest);
|
||||
template void copy<int>(Ptr<Backend> backend, const int* begin, const int* end, int* dest);
|
||||
|
||||
|
||||
__global__ void gFill(float *d_in, int size, float val) {
|
||||
for(int bid = 0; bid < size; bid += blockDim.x * gridDim.x) {
|
||||
int index = bid + threadIdx.x + blockDim.x * blockIdx.x;
|
||||
if(index < size) {
|
||||
d_in[index] = val;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void fill(Ptr<Backend> backend, float* begin, float* end, float value) {
|
||||
CUDA_CHECK(cudaSetDevice(backend->getDevice().no));
|
||||
int size = end - begin;
|
||||
int threads = std::min(512, size);
|
||||
int blocks = (size / threads) + (size % threads != 0);
|
||||
gFill<<<blocks, threads>>>(begin, size, value);
|
||||
CUDA_CHECK(cudaStreamSynchronize(0));
|
||||
}
|
||||
|
||||
void setSparse(Ptr<Backend> backend,
|
||||
const std::vector<size_t>& keys,
|
||||
const std::vector<float>& values,
|
||||
float* data) {
|
||||
CUDA_CHECK(cudaSetDevice(backend->getDevice().no));
|
||||
ABORT("no SetSparse");
|
||||
//gpu::SetSparse(data, keys, values);
|
||||
CUDA_CHECK(cudaStreamSynchronize(0));
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
void fill(Ptr<Backend> backend, float* begin, float* end, float value) {
|
||||
CUDA_CHECK(cudaSetDevice(backend->getDevice().no));
|
||||
int size = end - begin;
|
||||
int threads = std::min(512, size);
|
||||
int blocks = (size / threads) + (size % threads != 0);
|
||||
gFill<<<blocks, threads>>>(begin, size, value);
|
||||
CUDA_CHECK(cudaStreamSynchronize(0));
|
||||
}
|
||||
|
||||
void setSparse(Ptr<Backend> backend,
|
||||
const std::vector<size_t>& keys,
|
||||
const std::vector<float>& values,
|
||||
float* data) {
|
||||
CUDA_CHECK(cudaSetDevice(backend->getDevice().no));
|
||||
ABORT("no SetSparse");
|
||||
// gpu::SetSparse(data, keys, values);
|
||||
CUDA_CHECK(cudaStreamSynchronize(0));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -3,12 +3,15 @@
|
||||
#include "tensors/backend.h"
|
||||
|
||||
namespace marian {
|
||||
namespace gpu {
|
||||
template <typename T>
|
||||
void copy(Ptr<Backend> backend, const T* begin, const T* end, T* dest);
|
||||
|
||||
void fill(Ptr<Backend> backend, float* begin, float* end, float value);
|
||||
namespace gpu {
|
||||
template <typename T>
|
||||
void copy(Ptr<Backend> backend, const T* begin, const T* end, T* dest);
|
||||
|
||||
void setSparse(Ptr<Backend> backend, const std::vector<size_t>&, const std::vector<float>&, float*);
|
||||
}
|
||||
void fill(Ptr<Backend> backend, float* begin, float* end, float value);
|
||||
|
||||
void setSparse(Ptr<Backend> backend,
|
||||
const std::vector<size_t>&,
|
||||
const std::vector<float>&,
|
||||
float*);
|
||||
}
|
||||
}
|
||||
|
@ -25,13 +25,9 @@ public:
|
||||
setHandles();
|
||||
}
|
||||
|
||||
void setDevice() {
|
||||
cudaSetDevice(deviceId_.no);
|
||||
}
|
||||
void setDevice() { cudaSetDevice(deviceId_.no); }
|
||||
|
||||
void synchronize() {
|
||||
cudaStreamSynchronize(0);
|
||||
}
|
||||
void synchronize() { cudaStreamSynchronize(0); }
|
||||
|
||||
cublasHandle_t getCublasHandle() { return cublasHandle_; }
|
||||
|
||||
@ -41,13 +37,11 @@ private:
|
||||
cublasHandle_t cublasHandle_;
|
||||
curandGenerator_t curandGenerator_;
|
||||
|
||||
|
||||
void setHandles() {
|
||||
cublasHandle_ = create_handle();
|
||||
curandGenerator_ = createCurandGenerator();
|
||||
}
|
||||
|
||||
|
||||
curandGenerator_t createCurandGenerator() {
|
||||
cudaSetDevice(deviceId_.no);
|
||||
curandGenerator_t generator;
|
||||
@ -67,6 +61,5 @@ private:
|
||||
return cublasHandle;
|
||||
}
|
||||
};
|
||||
|
||||
}
|
||||
}
|
||||
|
@ -13,9 +13,8 @@ const int MAX_BLOCKS = 65535;
|
||||
#define CUDA_CHECK(ans) \
|
||||
{ gpuAssert((ans), __FILE__, __LINE__); }
|
||||
|
||||
|
||||
inline void gpuAssert(cudaError_t code,
|
||||
const char *file,
|
||||
const char* file,
|
||||
int line,
|
||||
bool abort = true) {
|
||||
if(code != cudaSuccess) {
|
||||
@ -26,8 +25,8 @@ inline void gpuAssert(cudaError_t code,
|
||||
|
||||
template <typename T>
|
||||
void CudaCopy(const T* start, const T* end, T* dest) {
|
||||
CUDA_CHECK(cudaMemcpy((void*)dest, (void*)start, (end - start) * sizeof(T),
|
||||
cudaMemcpyDefault));
|
||||
CUDA_CHECK(cudaMemcpy(
|
||||
(void*)dest, (void*)start, (end - start) * sizeof(T), cudaMemcpyDefault));
|
||||
}
|
||||
|
||||
#define CUSPARSE_CHECK(x) \
|
||||
|
@ -7,34 +7,34 @@
|
||||
namespace marian {
|
||||
namespace gpu {
|
||||
|
||||
Device::~Device() {
|
||||
cudaSetDevice(deviceId_.no);
|
||||
if(data_) {
|
||||
CUDA_CHECK(cudaFree(data_));
|
||||
}
|
||||
cudaDeviceSynchronize();
|
||||
}
|
||||
|
||||
void Device::reserve(size_t size) {
|
||||
size = align(size);
|
||||
cudaSetDevice(deviceId_.no);
|
||||
|
||||
ABORT_IF(size < size_ || size == 0, "New size must be larger than old size and larger than 0");
|
||||
|
||||
if(data_) {
|
||||
// Allocate memory by going through host memory
|
||||
uint8_t *temp = new uint8_t[size_];
|
||||
CUDA_CHECK(cudaMemcpy(temp, data_, size_, cudaMemcpyDeviceToHost));
|
||||
CUDA_CHECK(cudaFree(data_));
|
||||
CUDA_CHECK(cudaMalloc(&data_, size));
|
||||
CUDA_CHECK(cudaMemcpy(data_, temp, size_, cudaMemcpyHostToDevice));
|
||||
delete[] temp;
|
||||
} else {
|
||||
CUDA_CHECK(cudaMalloc(&data_, size));
|
||||
}
|
||||
|
||||
size_ = size;
|
||||
Device::~Device() {
|
||||
cudaSetDevice(deviceId_.no);
|
||||
if(data_) {
|
||||
CUDA_CHECK(cudaFree(data_));
|
||||
}
|
||||
cudaDeviceSynchronize();
|
||||
}
|
||||
|
||||
void Device::reserve(size_t size) {
|
||||
size = align(size);
|
||||
cudaSetDevice(deviceId_.no);
|
||||
|
||||
ABORT_IF(size < size_ || size == 0,
|
||||
"New size must be larger than old size and larger than 0");
|
||||
|
||||
if(data_) {
|
||||
// Allocate memory by going through host memory
|
||||
uint8_t *temp = new uint8_t[size_];
|
||||
CUDA_CHECK(cudaMemcpy(temp, data_, size_, cudaMemcpyDeviceToHost));
|
||||
CUDA_CHECK(cudaFree(data_));
|
||||
CUDA_CHECK(cudaMalloc(&data_, size));
|
||||
CUDA_CHECK(cudaMemcpy(data_, temp, size_, cudaMemcpyHostToDevice));
|
||||
delete[] temp;
|
||||
} else {
|
||||
CUDA_CHECK(cudaMalloc(&data_, size));
|
||||
}
|
||||
|
||||
size_ = size;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -22,31 +22,29 @@
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
|
||||
namespace marian {
|
||||
namespace gpu {
|
||||
|
||||
__global__ void gScale(float* data, int n, float p) {
|
||||
int index = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
|
||||
while(index < n) {
|
||||
data[index] = (data[index] < p) / p;
|
||||
index += gridDim.x * blockDim.x;
|
||||
}
|
||||
}
|
||||
|
||||
void Dropout(Tensor tensor, float p) {
|
||||
auto gpuBackend = std::static_pointer_cast<gpu::Backend>(tensor->getBackend());
|
||||
curandGenerator_t gen = gpuBackend->getCurandGenerator();
|
||||
int n = tensor->size();
|
||||
CURAND_CALL(curandGenerateUniform(gen, tensor->data(), n));
|
||||
|
||||
int numThreads = std::min(n, 512);
|
||||
int numBlocks = n / numThreads + (n % numThreads != 0);
|
||||
|
||||
gScale<<<numBlocks, numThreads>>>(tensor->data(), n, 1.f - p);
|
||||
}
|
||||
namespace gpu {
|
||||
|
||||
__global__ void gScale(float* data, int n, float p) {
|
||||
int index = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
|
||||
while(index < n) {
|
||||
data[index] = (data[index] < p) / p;
|
||||
index += gridDim.x * blockDim.x;
|
||||
}
|
||||
}
|
||||
|
||||
void Dropout(Tensor tensor, float p) {
|
||||
auto gpuBackend
|
||||
= std::static_pointer_cast<gpu::Backend>(tensor->getBackend());
|
||||
curandGenerator_t gen = gpuBackend->getCurandGenerator();
|
||||
int n = tensor->size();
|
||||
CURAND_CALL(curandGenerateUniform(gen, tensor->data(), n));
|
||||
|
||||
int numThreads = std::min(n, 512);
|
||||
int numBlocks = n / numThreads + (n % numThreads != 0);
|
||||
|
||||
gScale<<<numBlocks, numThreads>>>(tensor->data(), n, 1.f - p);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1,6 +1,5 @@
|
||||
|
||||
|
||||
#include "tensors/gpu/element.h"
|
||||
|
||||
#include "tensors/gpu/cuda_helpers.h"
|
||||
#include "functional/array.h"
|
||||
#include "functional/tensor.h"
|
||||
@ -11,9 +10,9 @@ namespace marian {
|
||||
namespace gpu {
|
||||
|
||||
template <size_t K, bool broadcast, class Functor>
|
||||
__global__ void gElement(Functor functor,
|
||||
functional::Array<functional::Tensor<float>, K> tensors) {
|
||||
|
||||
__global__ void gElement(
|
||||
Functor functor,
|
||||
functional::Array<functional::Tensor<float>, K> tensors) {
|
||||
int length = tensors[0].shape().elements();
|
||||
functional::Array<int, functional::Shape::size()> dims;
|
||||
functional::Array<int, K> indices;
|
||||
@ -21,7 +20,6 @@ __global__ void gElement(Functor functor,
|
||||
for(int bid = 0; bid < length; bid += blockDim.x * gridDim.x) {
|
||||
int index = bid + blockDim.x * blockIdx.x + threadIdx.x;
|
||||
if(index < length) {
|
||||
|
||||
indices.fill(index);
|
||||
|
||||
if(broadcast) {
|
||||
@ -35,8 +33,8 @@ __global__ void gElement(Functor functor,
|
||||
}
|
||||
}
|
||||
|
||||
template <class Functor, class ...Tensors>
|
||||
void Element(Functor functor, Tensor out, Tensors ...tensors) {
|
||||
template <class Functor, class... Tensors>
|
||||
void Element(Functor functor, Tensor out, Tensors... tensors) {
|
||||
cudaSetDevice(out->getDevice().no);
|
||||
|
||||
constexpr size_t K = sizeof...(tensors) + 1;
|
||||
@ -57,8 +55,5 @@ void Element(Functor functor, Tensor out, Tensors ...tensors) {
|
||||
}
|
||||
|
||||
#include "tensors/gpu/element.inc"
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -5,8 +5,7 @@
|
||||
namespace marian {
|
||||
namespace gpu {
|
||||
|
||||
template <class Functor, class ...Tensors>
|
||||
void Element(Functor functor, Tensor out, Tensors ...tensors);
|
||||
|
||||
template <class Functor, class... Tensors>
|
||||
void Element(Functor functor, Tensor out, Tensors... tensors);
|
||||
}
|
||||
}
|
||||
|
@ -38,10 +38,11 @@ void Prod(marian::Tensor C,
|
||||
cublasOperation_t opA = transA ? CUBLAS_OP_T : CUBLAS_OP_N;
|
||||
cublasOperation_t opB = transB ? CUBLAS_OP_T : CUBLAS_OP_N;
|
||||
|
||||
auto cublasHandle = std::static_pointer_cast<gpu::Backend>(C->getBackend())->getCublasHandle();
|
||||
auto cublasHandle = std::static_pointer_cast<gpu::Backend>(C->getBackend())
|
||||
->getCublasHandle();
|
||||
|
||||
#if CUDA_VERSION >= 9000
|
||||
//cublasSetMathMode(cublasHandle, CUBLAS_TENSOR_OP_MATH);
|
||||
// cublasSetMathMode(cublasHandle, CUBLAS_TENSOR_OP_MATH);
|
||||
#endif
|
||||
|
||||
cublasSgemm(cublasHandle,
|
||||
@ -59,7 +60,7 @@ void Prod(marian::Tensor C,
|
||||
C->data(),
|
||||
ldc);
|
||||
#if CUDA_VERSION >= 9000
|
||||
//cublasSetMathMode(cublasHandle, CUBLAS_DEFAULT_MATH);
|
||||
// cublasSetMathMode(cublasHandle, CUBLAS_DEFAULT_MATH);
|
||||
#endif
|
||||
}
|
||||
|
||||
@ -96,10 +97,11 @@ void ProdBatched(marian::Tensor C,
|
||||
cublasOperation_t opA = transA ? CUBLAS_OP_T : CUBLAS_OP_N;
|
||||
cublasOperation_t opB = transB ? CUBLAS_OP_T : CUBLAS_OP_N;
|
||||
|
||||
auto cublasHandle = std::static_pointer_cast<gpu::Backend>(C->getBackend())->getCublasHandle();
|
||||
auto cublasHandle = std::static_pointer_cast<gpu::Backend>(C->getBackend())
|
||||
->getCublasHandle();
|
||||
|
||||
#if CUDA_VERSION >= 9000
|
||||
//cublasSetMathMode(cublasHandle, CUBLAS_TENSOR_OP_MATH);
|
||||
// cublasSetMathMode(cublasHandle, CUBLAS_TENSOR_OP_MATH);
|
||||
#endif
|
||||
cublasSgemmStridedBatched(cublasHandle,
|
||||
opB,
|
||||
@ -120,10 +122,8 @@ void ProdBatched(marian::Tensor C,
|
||||
n * m,
|
||||
std::max(batchA, batchB));
|
||||
#if CUDA_VERSION >= 9000
|
||||
//cublasSetMathMode(cublasHandle, CUBLAS_DEFAULT_MATH);
|
||||
// cublasSetMathMode(cublasHandle, CUBLAS_DEFAULT_MATH);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
|
@ -21,6 +21,5 @@ void ProdBatched(marian::Tensor C,
bool transB,
float beta = 0,
float scalar = 1);

}
}

@ -2,8 +2,8 @@
|
||||
#include <cusparse_v2.h>
|
||||
|
||||
#include "common/definitions.h"
|
||||
#include "tensors/tensor.h"
|
||||
#include "kernels/cuda_helpers.h"
|
||||
#include "tensors/tensor.h"
|
||||
|
||||
namespace marian {
|
||||
|
||||
|
@ -28,11 +28,11 @@ __device__ inline float stableLogit(float x) {
|
||||
}
|
||||
|
||||
bool IsNan(Tensor in) {
|
||||
//cudaSetDevice(in->getDevice().no);
|
||||
//thrust::device_ptr<float> begin = thrust::device_pointer_cast(in->data());
|
||||
//thrust::device_ptr<float> end
|
||||
// cudaSetDevice(in->getDevice().no);
|
||||
// thrust::device_ptr<float> begin = thrust::device_pointer_cast(in->data());
|
||||
// thrust::device_ptr<float> end
|
||||
// = thrust::device_pointer_cast(in->data() + in->size());
|
||||
//return thrust::transform_reduce(
|
||||
// return thrust::transform_reduce(
|
||||
// begin, end, isnan_test(), 0, thrust::plus<bool>());
|
||||
return false;
|
||||
}
|
||||
@ -93,10 +93,9 @@ void Concatenate1(Tensor out, const std::vector<Tensor>& inputs) {
|
||||
|
||||
for(auto in : inputs) {
|
||||
ABORT_IF(rows != in->shape().elements() / in->shape().back(),
|
||||
"First dimension must be equal");
|
||||
"First dimension must be equal");
|
||||
int cols_in = in->shape().back();
|
||||
|
||||
|
||||
int blocks = std::min(MAX_BLOCKS, rows);
|
||||
int threads = std::min(MAX_THREADS, cols_in);
|
||||
|
||||
@ -122,7 +121,7 @@ void Split1(std::vector<Tensor>& outputs, const Tensor in) {
|
||||
int cols_in = in->shape().back();
|
||||
for(auto out : outputs) {
|
||||
ABORT_IF(rows != out->shape().elements() / out->shape().back(),
|
||||
"First dimension must be equal");
|
||||
"First dimension must be equal");
|
||||
int cols_out = out->shape().back();
|
||||
|
||||
int blocks = std::min(MAX_BLOCKS, rows);
|
||||
@ -166,10 +165,10 @@ void Deconcatenate(std::vector<Tensor>& outputs, const Tensor in, int ax) {
|
||||
SplitCont(outputs, in, ax);
|
||||
}
|
||||
|
||||
__global__ void gTransposeND(functional::Tensor<float> out,
|
||||
const functional::Tensor<float> in,
|
||||
const functional::Array<int, functional::Shape::size()> permute) {
|
||||
|
||||
__global__ void gTransposeND(
|
||||
functional::Tensor<float> out,
|
||||
const functional::Tensor<float> in,
|
||||
const functional::Array<int, functional::Shape::size()> permute) {
|
||||
constexpr size_t N = functional::Shape::size();
|
||||
functional::Array<int, N> oDims;
|
||||
functional::Array<int, N> pDims;
|
||||
@ -458,7 +457,6 @@ void SoftmaxGrad(Tensor grad, Tensor adj, Tensor val) {
|
||||
int m = grad->shape().elements() / grad->shape().back();
|
||||
int k = grad->shape().back();
|
||||
|
||||
|
||||
int blocks = std::min(MAX_BLOCKS, m);
|
||||
int threads = std::min(MAX_THREADS, k);
|
||||
int shared = sizeof(float) * threads * 2;
|
||||
@ -784,7 +782,9 @@ void Select(Tensor out,
|
||||
int blocks = std::min(MAX_BLOCKS, length / threads + (length % threads != 0));
|
||||
|
||||
auto mp_indices = allocator->alloc<size_t>(indices.size());
|
||||
CudaCopy(indices.data(), indices.data() + indices.size(), mp_indices->data<size_t>());
|
||||
CudaCopy(indices.data(),
|
||||
indices.data() + indices.size(),
|
||||
mp_indices->data<size_t>());
|
||||
|
||||
int axisGPU = axis + functional::Shape::size() - out->shape().size();
|
||||
gSelect<<<blocks, threads>>>(out->data(),
|
||||
@ -810,7 +810,9 @@ void Insert(Tensor out,
|
||||
int blocks = std::min(MAX_BLOCKS, length / threads + (length % threads != 0));
|
||||
|
||||
auto mp_indices = allocator->alloc<size_t>(indices.size());
|
||||
CudaCopy(indices.data(), indices.data() + indices.size(), mp_indices->data<size_t>());
|
||||
CudaCopy(indices.data(),
|
||||
indices.data() + indices.size(),
|
||||
mp_indices->data<size_t>());
|
||||
|
||||
int axisGPU = axis + functional::Shape::size() - out->shape().size();
|
||||
gInsert<<<blocks, threads>>>(out->data(),
|
||||
@ -1174,19 +1176,18 @@ void CrossEntropyPickBackward(Tensor out, Tensor adj, Tensor a, Tensor pick) {
|
||||
out->data(), out->shape(), adj->data(), a->data(), pick->data());
|
||||
}
|
||||
|
||||
|
||||
float L2Norm(Tensor in) {
|
||||
|
||||
cudaSetDevice(in->getDevice().no);
|
||||
|
||||
int size = in->shape().elements();
|
||||
int threads = std::min(MAX_THREADS, size);
|
||||
int blocks = std::min(MAX_BLOCKS, size / threads + (size % threads != 0));
|
||||
int blocks = std::min(MAX_BLOCKS, size / threads + (size % threads != 0));
|
||||
|
||||
uint8_t* data;
|
||||
cudaMalloc(&data, blocks * sizeof(float));
|
||||
Tensor out(new TensorBase(
|
||||
New<MemoryPiece>(data, blocks * sizeof(float)), {1, blocks}, in->getBackend()));
|
||||
Tensor out(new TensorBase(New<MemoryPiece>(data, blocks * sizeof(float)),
|
||||
{1, blocks},
|
||||
in->getBackend()));
|
||||
|
||||
using namespace functional;
|
||||
ReduceAll(_1 * _1, out, in);
|
||||
@ -1203,7 +1204,7 @@ __global__ void gAtt(float* out,
|
||||
int m, // total rows (batch x time x beam)
|
||||
int k, // depth
|
||||
int b, // batch size
|
||||
int t // time of ctx
|
||||
int t // time of ctx
|
||||
) {
|
||||
int rows = m;
|
||||
int cols = k;
|
||||
@ -1255,14 +1256,8 @@ void Att(Tensor out, Tensor va, Tensor context, Tensor state) {
|
||||
int threads = std::min(MAX_THREADS, (int)k);
|
||||
int shared = sizeof(float) * threads * 2;
|
||||
|
||||
gAtt<<<blocks, threads, shared>>>(out->data(),
|
||||
va->data(),
|
||||
context->data(),
|
||||
state->data(),
|
||||
m,
|
||||
k,
|
||||
b,
|
||||
t);
|
||||
gAtt<<<blocks, threads, shared>>>(
|
||||
out->data(), va->data(), context->data(), state->data(), m, k, b, t);
|
||||
}
|
||||
|
||||
__global__ void gAttBack(float* gVa,
|
||||
@ -1576,7 +1571,6 @@ __global__ void gShift(float* out, const float* in, int length, int offset) {
|
||||
}
|
||||
|
||||
void Shift(Tensor out, Tensor in, marian::Shape shift, bool invert) {
|
||||
|
||||
ABORT_IF(in->shape().size() != shift.size(), "bad dimensions");
|
||||
|
||||
int offset = 0;
|
||||
@ -2006,21 +2000,22 @@ __global__ void gMaxPoolingForward(float* out,
|
||||
int lastWidth) {
|
||||
int tid = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
|
||||
if (tid >= outRows * outCols) return;
|
||||
if(tid >= outRows * outCols)
|
||||
return;
|
||||
|
||||
int rowId = tid / outRows;
|
||||
int colId = tid % outRows;
|
||||
|
||||
float* b = in + (rowId * inCols) + (colId * width);
|
||||
float* localMask = mask + (rowId / numKernels) * maskCols + colId * width;
|
||||
float* localMask = mask + (rowId / numKernels) * maskCols + colId * width;
|
||||
|
||||
if (colId == outRows - 1) {
|
||||
if(colId == outRows - 1) {
|
||||
width = lastWidth;
|
||||
}
|
||||
|
||||
float currentMax = b[0] * localMask[0];
|
||||
for (int i = 1; i < width; ++i) {
|
||||
if (b[i] * localMask[i] > currentMax) {
|
||||
for(int i = 1; i < width; ++i) {
|
||||
if(b[i] * localMask[i] > currentMax) {
|
||||
currentMax = b[i] * localMask[i];
|
||||
}
|
||||
}
|
||||
@ -2045,15 +2040,20 @@ void PoolingWithMaskingForward(Tensor out,
|
||||
int outRows = outShape[2];
|
||||
int outCols = outShape[0] * outShape[1];
|
||||
|
||||
int lastWidth = ((inCols - isEven) % width == 0)
|
||||
? width
|
||||
: (inCols - isEven) % width;
|
||||
int lastWidth
|
||||
= ((inCols - isEven) % width == 0) ? width : (inCols - isEven) % width;
|
||||
|
||||
gMaxPoolingForward<<<blocks, threads>>>(
|
||||
out->data(), outRows, outCols,
|
||||
in->data(), inRows, inCols,
|
||||
mask->data(), outShape[1], mask->shape()[2],
|
||||
width, lastWidth);
|
||||
gMaxPoolingForward<<<blocks, threads>>>(out->data(),
|
||||
outRows,
|
||||
outCols,
|
||||
in->data(),
|
||||
inRows,
|
||||
inCols,
|
||||
mask->data(),
|
||||
outShape[1],
|
||||
mask->shape()[2],
|
||||
width,
|
||||
lastWidth);
|
||||
}
|
||||
|
||||
__global__ void gMaxPoolingBackward(float* adj,
|
||||
@ -2067,30 +2067,31 @@ __global__ void gMaxPoolingBackward(float* adj,
|
||||
int numKernels,
|
||||
int maskCols,
|
||||
int width,
|
||||
int lastWidth)
|
||||
{
|
||||
int lastWidth) {
|
||||
int tid = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
|
||||
if (tid >= adjRows * adjCols) return;
|
||||
if(tid >= adjRows * adjCols)
|
||||
return;
|
||||
|
||||
int rowId = tid / adjRows;
|
||||
int colId = tid % adjRows;
|
||||
|
||||
float* b = in + (rowId * inCols) + (colId * width);
|
||||
|
||||
if (colId == adjRows - 1) {
|
||||
if(colId == adjRows - 1) {
|
||||
width = lastWidth;
|
||||
}
|
||||
|
||||
float* localMask = mask + (rowId / numKernels) * maskCols + colId * width;
|
||||
size_t currentMaxIdx = 0;
|
||||
for (int i = 1; i < width; ++i) {
|
||||
if (b[i] * localMask[i] > b[currentMaxIdx] * localMask[currentMaxIdx]) {
|
||||
for(int i = 1; i < width; ++i) {
|
||||
if(b[i] * localMask[i] > b[currentMaxIdx] * localMask[currentMaxIdx]) {
|
||||
currentMaxIdx = i;
|
||||
}
|
||||
}
|
||||
|
||||
adjIn[(rowId * inCols) + (colId * width) + currentMaxIdx] += adj[rowId + (colId * adjCols)];
|
||||
adjIn[(rowId * inCols) + (colId * width) + currentMaxIdx]
|
||||
+= adj[rowId + (colId * adjCols)];
|
||||
}
|
||||
|
||||
void PoolingWithMaskingBackward(Tensor adj,
|
||||
@ -2111,16 +2112,21 @@ void PoolingWithMaskingBackward(Tensor adj,
|
||||
int adjRows = adjShape[2];
|
||||
int adjCols = adjShape[0] * adjShape[1];
|
||||
|
||||
int lastWidth = ((inCols - isEven) % width == 0)
|
||||
? width
|
||||
: (inCols - isEven) % width;
|
||||
int lastWidth
|
||||
= ((inCols - isEven) % width == 0) ? width : (inCols - isEven) % width;
|
||||
|
||||
gMaxPoolingBackward<<<blocks, threads>>>(
|
||||
adj->data(), adjRows, adjCols,
|
||||
in->data(), adjIn->data(), inRows, inCols,
|
||||
mask->data(), adjShape[1], mask->shape()[2],
|
||||
width, lastWidth);
|
||||
gMaxPoolingBackward<<<blocks, threads>>>(adj->data(),
|
||||
adjRows,
|
||||
adjCols,
|
||||
in->data(),
|
||||
adjIn->data(),
|
||||
inRows,
|
||||
inCols,
|
||||
mask->data(),
|
||||
adjShape[1],
|
||||
mask->shape()[2],
|
||||
width,
|
||||
lastWidth);
|
||||
}
|
||||
|
||||
}
|
||||
} // namespace marian
|
||||
|
@ -78,7 +78,7 @@ public:
|
||||
#endif
|
||||
}
|
||||
|
||||
void get(std::vector<float> &v) {
|
||||
void get(std::vector<float>& v) {
|
||||
v.resize(size());
|
||||
if(backend_->getDevice().type == DeviceType::cpu) {
|
||||
std::copy(data(), data() + size(), v.data());
|
||||
@ -101,9 +101,7 @@ public:
|
||||
#endif
|
||||
}
|
||||
|
||||
void set(const std::vector<float> &v) {
|
||||
set(v.data(), v.data() + v.size());
|
||||
}
|
||||
void set(const std::vector<float>& v) { set(v.data(), v.data() + v.size()); }
|
||||
|
||||
void set(float value) {
|
||||
if(backend_->getDevice().type == DeviceType::cpu) {
|
||||
@ -116,8 +114,7 @@ public:
|
||||
#endif
|
||||
}
|
||||
|
||||
void setSparse(const std::vector<size_t> &k,
|
||||
const std::vector<float> &v) {
|
||||
void setSparse(const std::vector<size_t>& k, const std::vector<float>& v) {
|
||||
if(backend_->getDevice().type == DeviceType::cpu) {
|
||||
for(int i = 0; i < k.size(); ++i)
|
||||
data()[k[i]] = v[i];
|
||||
@ -130,8 +127,8 @@ public:
|
||||
}
|
||||
|
||||
void copyFrom(Tensor in) {
|
||||
if(in->getBackend()->getDevice().type == DeviceType::cpu &&
|
||||
backend_->getDevice().type == DeviceType::cpu) {
|
||||
if(in->getBackend()->getDevice().type == DeviceType::cpu
|
||||
&& backend_->getDevice().type == DeviceType::cpu) {
|
||||
std::copy(in->data(), in->data() + in->size(), data());
|
||||
}
|
||||
#ifdef CUDA_FOUND
|
||||
@ -167,7 +164,6 @@ public:
|
||||
disp = disp && (dims[j] < dispCols || dims[j] >= shape()[j] - dispCols);
|
||||
|
||||
if(disp) {
|
||||
|
||||
if(dims.back() == 0) {
|
||||
bool par = true;
|
||||
std::vector<std::string> p;
|
||||
@ -182,9 +178,7 @@ public:
|
||||
strm << " ";
|
||||
}
|
||||
|
||||
strm << std::setw(12)
|
||||
<< values[i]
|
||||
<< " ";
|
||||
strm << std::setw(12) << values[i] << " ";
|
||||
|
||||
if(dims.back() + 1 == shape().back()) {
|
||||
for(int j = dims.size() - 1; j >= 0; --j) {
|
||||
@ -214,9 +208,7 @@ public:
|
||||
strm << std::endl;
|
||||
return strm.str();
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
typedef std::shared_ptr<TensorBase> Tensor;
|
||||
|
||||
}
|
||||
|
@ -21,53 +21,47 @@
|
||||
|
||||
namespace marian {
|
||||
|
||||
template <class Functor, class ...Tensors>
|
||||
void Element(Functor functor, marian::Tensor out, Tensors ...tensors) {
|
||||
template <class Functor, class... Tensors>
|
||||
void Element(Functor functor, marian::Tensor out, Tensors... tensors) {
|
||||
#ifdef CUDA_FOUND
|
||||
if(out->getBackend()->getDevice().type == DeviceType::gpu)
|
||||
gpu::Element(functor, out, tensors...);
|
||||
else
|
||||
if(out->getBackend()->getDevice().type == DeviceType::gpu)
|
||||
gpu::Element(functor, out, tensors...);
|
||||
else
|
||||
#endif
|
||||
cpu::Element(functor, out, tensors...);
|
||||
}
|
||||
cpu::Element(functor, out, tensors...);
|
||||
}
|
||||
|
||||
template <class Functor, class ...Tensors>
|
||||
void Add(Functor functor,
|
||||
float scale,
|
||||
marian::Tensor out,
|
||||
Tensors... tensors) {
|
||||
template <class Functor, class... Tensors>
|
||||
void Add(Functor functor, float scale, marian::Tensor out, Tensors... tensors) {
|
||||
#ifdef CUDA_FOUND
|
||||
if(out->getBackend()->getDevice().type == DeviceType::gpu)
|
||||
gpu::Add(functor, scale, out, tensors...);
|
||||
else
|
||||
if(out->getBackend()->getDevice().type == DeviceType::gpu)
|
||||
gpu::Add(functor, scale, out, tensors...);
|
||||
else
|
||||
#endif
|
||||
cpu::Add(functor, scale, out, tensors...);
|
||||
}
|
||||
cpu::Add(functor, scale, out, tensors...);
|
||||
}
|
||||
|
||||
template <class Functor, class ...Tensors>
|
||||
void Add(Functor functor,
|
||||
marian::Tensor out,
|
||||
Tensors... tensors) {
|
||||
Add(functor, 1, out, tensors...);
|
||||
}
|
||||
template <class Functor, class... Tensors>
|
||||
void Add(Functor functor, marian::Tensor out, Tensors... tensors) {
|
||||
Add(functor, 1, out, tensors...);
|
||||
}
|
||||
|
||||
template <class Functor, class ...Tensors>
|
||||
void Reduce(Functor functor,
|
||||
float scale,
|
||||
marian::Tensor out,
|
||||
Tensors... tensors) {
|
||||
out->set(0);
|
||||
Add(functor, scale, out, tensors...);
|
||||
}
|
||||
template <class Functor, class... Tensors>
|
||||
void Reduce(Functor functor,
|
||||
float scale,
|
||||
marian::Tensor out,
|
||||
Tensors... tensors) {
|
||||
out->set(0);
|
||||
Add(functor, scale, out, tensors...);
|
||||
}
|
||||
|
||||
template <class Functor, class ...Tensors>
|
||||
void Reduce(Functor functor,
|
||||
marian::Tensor out,
|
||||
Tensors... tensors) {
|
||||
out->set(0);
|
||||
Add(functor, out, tensors...);
|
||||
}
|
||||
template <class Functor, class... Tensors>
|
||||
void Reduce(Functor functor, marian::Tensor out, Tensors... tensors) {
|
||||
out->set(0);
|
||||
Add(functor, out, tensors...);
|
||||
}
|
||||
|
||||
// clang-format off
|
||||
DISPATCH7(Prod, marian::Tensor, const marian::Tensor, const marian::Tensor, bool, bool, float, float)
|
||||
DISPATCH7(ProdBatched, marian::Tensor, const marian::Tensor, const marian::Tensor, bool, bool, float, float)
|
||||
|
||||
@ -86,26 +80,34 @@ namespace marian {
|
||||
DISPATCH4(Shift, marian::Tensor, marian::Tensor, marian::Shape, bool)
|
||||
|
||||
DISPATCH3(Concatenate, marian::Tensor, const std::vector<marian::Tensor>&, int)
|
||||
// clang-format on
|
||||
|
||||
#ifdef CUDA_FOUND
|
||||
namespace gpu {
|
||||
void Deconcatenate(std::vector<marian::Tensor>& outputs, const marian::Tensor in, int ax);
|
||||
}
|
||||
namespace gpu {
|
||||
void Deconcatenate(std::vector<marian::Tensor>& outputs,
|
||||
const marian::Tensor in,
|
||||
int ax);
|
||||
}
|
||||
#endif
|
||||
|
||||
namespace cpu {
|
||||
void Deconcatenate(std::vector<marian::Tensor>& outputs, const marian::Tensor in, int ax);
|
||||
}
|
||||
namespace cpu {
|
||||
void Deconcatenate(std::vector<marian::Tensor>& outputs,
|
||||
const marian::Tensor in,
|
||||
int ax);
|
||||
}
|
||||
|
||||
static inline void Deconcatenate(std::vector<marian::Tensor>& outputs, const marian::Tensor in, int ax) {
|
||||
static inline void Deconcatenate(std::vector<marian::Tensor>& outputs,
|
||||
const marian::Tensor in,
|
||||
int ax) {
|
||||
#ifdef CUDA_FOUND
|
||||
if(in->getBackend()->getDevice().type == DeviceType::gpu)
|
||||
gpu::Deconcatenate(outputs, in, ax);
|
||||
else
|
||||
if(in->getBackend()->getDevice().type == DeviceType::gpu)
|
||||
gpu::Deconcatenate(outputs, in, ax);
|
||||
else
|
||||
#endif
|
||||
cpu::Deconcatenate(outputs, in, ax);
|
||||
}
|
||||
cpu::Deconcatenate(outputs, in, ax);
|
||||
}
|
||||
|
||||
// clang-format off
|
||||
DISPATCH5(LayerNormalization, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, float)
|
||||
DISPATCH9(LayerNormalizationGrad, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, float)
|
||||
|
||||
@ -120,113 +122,116 @@ namespace marian {
|
||||
DISPATCH5(Select, marian::Tensor, marian::Tensor, int, const std::vector<size_t>&, Ptr<Allocator>)
|
||||
DISPATCH5(Insert, marian::Tensor, marian::Tensor, int, const std::vector<size_t>&, Ptr<Allocator>)
|
||||
|
||||
|
||||
DISPATCH2(LSTMCellForward, marian::Tensor, std::vector<marian::Tensor>)
|
||||
DISPATCH2(LSTMOutputForward, marian::Tensor, std::vector<marian::Tensor>);
|
||||
// clang-format on
|
||||
|
||||
#ifdef CUDA_FOUND
|
||||
namespace gpu {
|
||||
void LSTMCellBackward(std::vector<marian::Tensor> outputs,
|
||||
std::vector<marian::Tensor> inputs,
|
||||
marian::Tensor adj);
|
||||
}
|
||||
namespace gpu {
|
||||
void LSTMCellBackward(std::vector<marian::Tensor> outputs,
|
||||
std::vector<marian::Tensor> inputs,
|
||||
marian::Tensor adj);
|
||||
}
|
||||
#endif
|
||||
|
||||
namespace cpu {
|
||||
void LSTMCellBackward(std::vector<marian::Tensor> outputs,
|
||||
std::vector<marian::Tensor> inputs,
|
||||
marian::Tensor adj);
|
||||
}
|
||||
namespace cpu {
|
||||
void LSTMCellBackward(std::vector<marian::Tensor> outputs,
|
||||
std::vector<marian::Tensor> inputs,
|
||||
marian::Tensor adj);
|
||||
}
|
||||
|
||||
static inline void LSTMCellBackward(std::vector<marian::Tensor> outputs,
|
||||
static inline void LSTMCellBackward(std::vector<marian::Tensor> outputs,
|
||||
std::vector<marian::Tensor> inputs,
|
||||
marian::Tensor adj) {
|
||||
#ifdef CUDA_FOUND
|
||||
if(adj->getBackend()->getDevice().type == DeviceType::gpu)
|
||||
gpu::LSTMCellBackward(outputs, inputs, adj);
|
||||
else
|
||||
#endif
|
||||
cpu::LSTMCellBackward(outputs, inputs, adj);
|
||||
}
|
||||
|
||||
#ifdef CUDA_FOUND
|
||||
namespace gpu {
|
||||
void LSTMOutputBackward(std::vector<marian::Tensor> outputs,
|
||||
std::vector<marian::Tensor> inputs,
|
||||
marian::Tensor adj);
|
||||
}
|
||||
#endif
|
||||
|
||||
namespace cpu {
|
||||
void LSTMOutputBackward(std::vector<marian::Tensor> outputs,
|
||||
std::vector<marian::Tensor> inputs,
|
||||
marian::Tensor adj);
|
||||
}
|
||||
|
||||
static inline void LSTMOutputBackward(std::vector<marian::Tensor> outputs,
|
||||
std::vector<marian::Tensor> inputs,
|
||||
marian::Tensor adj) {
|
||||
#ifdef CUDA_FOUND
|
||||
if(adj->getBackend()->getDevice().type == DeviceType::gpu)
|
||||
gpu::LSTMCellBackward(outputs, inputs, adj);
|
||||
else
|
||||
if(adj->getBackend()->getDevice().type == DeviceType::gpu)
|
||||
gpu::LSTMOutputBackward(outputs, inputs, adj);
|
||||
else
|
||||
#endif
|
||||
cpu::LSTMCellBackward(outputs, inputs, adj);
|
||||
}
|
||||
cpu::LSTMOutputBackward(outputs, inputs, adj);
|
||||
}
|
||||
|
||||
DISPATCH3(GRUFastForward, marian::Tensor, std::vector<marian::Tensor>, bool)
|
||||
|
||||
#ifdef CUDA_FOUND
|
||||
namespace gpu {
|
||||
void LSTMOutputBackward(std::vector<marian::Tensor> outputs,
|
||||
std::vector<marian::Tensor> inputs,
|
||||
marian::Tensor adj);
|
||||
}
|
||||
namespace gpu {
|
||||
void GRUFastBackward(std::vector<marian::Tensor> outputs,
|
||||
std::vector<marian::Tensor> inputs,
|
||||
marian::Tensor adj,
|
||||
bool final);
|
||||
}
|
||||
#endif
|
||||
|
||||
namespace cpu {
|
||||
void LSTMOutputBackward(std::vector<marian::Tensor> outputs,
|
||||
std::vector<marian::Tensor> inputs,
|
||||
marian::Tensor adj);
|
||||
}
|
||||
namespace cpu {
|
||||
void GRUFastBackward(std::vector<marian::Tensor> outputs,
|
||||
std::vector<marian::Tensor> inputs,
|
||||
marian::Tensor adj,
|
||||
bool final);
|
||||
}
|
||||
|
||||
static inline void LSTMOutputBackward(std::vector<marian::Tensor> outputs,
|
||||
std::vector<marian::Tensor> inputs,
|
||||
marian::Tensor adj) {
|
||||
static inline void GRUFastBackward(std::vector<marian::Tensor> outputs,
|
||||
std::vector<marian::Tensor> inputs,
|
||||
marian::Tensor adj,
|
||||
bool final = false) {
|
||||
#ifdef CUDA_FOUND
|
||||
if(adj->getBackend()->getDevice().type == DeviceType::gpu)
|
||||
gpu::LSTMOutputBackward(outputs, inputs, adj);
|
||||
else
|
||||
if(adj->getBackend()->getDevice().type == DeviceType::gpu)
|
||||
gpu::GRUFastBackward(outputs, inputs, adj, final);
|
||||
else
|
||||
#endif
|
||||
cpu::LSTMOutputBackward(outputs, inputs, adj);
|
||||
}
|
||||
|
||||
DISPATCH3(GRUFastForward, marian::Tensor, std::vector<marian::Tensor>, bool)
|
||||
|
||||
#ifdef CUDA_FOUND
|
||||
namespace gpu {
|
||||
void GRUFastBackward(std::vector<marian::Tensor> outputs,
|
||||
std::vector<marian::Tensor> inputs,
|
||||
marian::Tensor adj,
|
||||
bool final);
|
||||
}
|
||||
#endif
|
||||
|
||||
namespace cpu {
|
||||
void GRUFastBackward(std::vector<marian::Tensor> outputs,
|
||||
std::vector<marian::Tensor> inputs,
|
||||
marian::Tensor adj,
|
||||
bool final);
|
||||
}
|
||||
|
||||
static inline void GRUFastBackward(std::vector<marian::Tensor> outputs,
|
||||
std::vector<marian::Tensor> inputs,
|
||||
marian::Tensor adj,
|
||||
bool final = false) {
|
||||
#ifdef CUDA_FOUND
|
||||
if(adj->getBackend()->getDevice().type == DeviceType::gpu)
|
||||
gpu::GRUFastBackward(outputs, inputs, adj, final);
|
||||
else
|
||||
#endif
|
||||
cpu::GRUFastBackward(outputs, inputs, adj, final);
|
||||
}
|
||||
cpu::GRUFastBackward(outputs, inputs, adj, final);
|
||||
}
|
||||
|
||||
// clang-format off
|
||||
DISPATCH4(Att, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor)
|
||||
DISPATCH7(AttBack, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor)
|
||||
// clang-format on
|
||||
|
||||
#ifdef CUDA_FOUND
|
||||
namespace gpu {
|
||||
float L2Norm(marian::Tensor in);
|
||||
}
|
||||
namespace gpu {
|
||||
float L2Norm(marian::Tensor in);
|
||||
}
|
||||
#endif
|
||||
|
||||
namespace cpu {
|
||||
float L2Norm(marian::Tensor in);
|
||||
}
|
||||
namespace cpu {
|
||||
float L2Norm(marian::Tensor in);
|
||||
}
|
||||
|
||||
static inline float L2Norm(marian::Tensor in) {
|
||||
static inline float L2Norm(marian::Tensor in) {
|
||||
#ifdef CUDA_FOUND
|
||||
if(in->getBackend()->getDevice().type == DeviceType::gpu)
|
||||
return gpu::L2Norm(in);
|
||||
else
|
||||
if(in->getBackend()->getDevice().type == DeviceType::gpu)
|
||||
return gpu::L2Norm(in);
|
||||
else
|
||||
#endif
|
||||
return cpu::L2Norm(in);
|
||||
}
|
||||
return cpu::L2Norm(in);
|
||||
}
|
||||
|
||||
// clang-format off
|
||||
DISPATCH5(PoolingWithMaskingForward, marian::Tensor, marian::Tensor, marian::Tensor, int, bool)
|
||||
DISPATCH6(PoolingWithMaskingBackward, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, int, bool)
|
||||
|
||||
// clang-format on
|
||||
}
|
||||
|
@ -104,7 +104,8 @@ void AsyncGraphGroup::init(Ptr<data::Batch> batch) {
|
||||
totalSize -= __size__;
|
||||
|
||||
Tensor param;
|
||||
Ptr<TensorAllocator> allocator = New<TensorAllocator>(graph->getBackend());
|
||||
Ptr<TensorAllocator> allocator
|
||||
= New<TensorAllocator>(graph->getBackend());
|
||||
allocator->reserveExact(__size__ * sizeof(float));
|
||||
allocator->allocate(param, {1, __size__});
|
||||
paramsAlloc_.push_back(allocator);
|
||||
@ -122,7 +123,8 @@ void AsyncGraphGroup::init(Ptr<data::Batch> batch) {
|
||||
int __size__ = std::min(shardSize_, totalSize);
|
||||
totalSize -= __size__;
|
||||
Tensor grad_;
|
||||
Ptr<TensorAllocator> allocator_ = New<TensorAllocator>(graph->getBackend());
|
||||
Ptr<TensorAllocator> allocator_
|
||||
= New<TensorAllocator>(graph->getBackend());
|
||||
|
||||
allocator_->reserveExact(__size__ * sizeof(float));
|
||||
allocator_->allocate(grad_, {1, __size__});
|
||||
@ -139,7 +141,8 @@ void AsyncGraphGroup::init(Ptr<data::Batch> batch) {
|
||||
int __size__ = std::min(shardSize_, totalSize);
|
||||
totalSize -= __size__;
|
||||
Tensor paramAvg;
|
||||
Ptr<TensorAllocator> allocator = New<TensorAllocator>(graph->getBackend());
|
||||
Ptr<TensorAllocator> allocator
|
||||
= New<TensorAllocator>(graph->getBackend());
|
||||
|
||||
allocator->reserveExact(__size__ * sizeof(float));
|
||||
allocator->allocate(paramAvg, {1, __size__});
|
||||
@ -230,7 +233,8 @@ void AsyncGraphGroup::execute(Ptr<data::Batch> batch) {
|
||||
scheduler_->update(cost, batch);
|
||||
|
||||
if(scheduler_->saving() || scheduler_->validating()) {
|
||||
// Wait with validation or saving until all other threads are done with update.
|
||||
// Wait with validation or saving until all other threads are done with
|
||||
// update.
|
||||
// We want to reuse the graphs for validation, so they need to be in
|
||||
// a safe state.
|
||||
pool_->wait_for_others(lock);
|
||||
|
@ -70,7 +70,6 @@ public:
|
||||
movingAvg_{options_->get<float>("exponential-smoothing") > 0},
|
||||
mvDecay_{options_->get<float>("exponential-smoothing")},
|
||||
tau_{options_->get<size_t>("optimizer-delay")} {
|
||||
|
||||
pool_.reset(new ThreadPool(devices_.size(), devices_.size()));
|
||||
|
||||
for(auto device : devices_) {
|
||||
|
@ -150,7 +150,8 @@ void MultiNodeGraphGroup::initClientCommOverlapGpuTensors() {
|
||||
size_t modelSize = clientGraphs_[0]->params()->vals()->size();
|
||||
for(int client = 0; client < devices_.size(); client++) {
|
||||
// Communication overlap buffer (for grads + params)
|
||||
Tensor commOverlapBuffer = newTensor(modelSize, clientGraphs_[client]->getBackend());
|
||||
Tensor commOverlapBuffer
|
||||
= newTensor(modelSize, clientGraphs_[client]->getBackend());
|
||||
commOverlapBuffer->copyFrom(clientGraphs_[0]->params()->vals());
|
||||
clientCommOverlapBuffersGPU_.push_back(commOverlapBuffer);
|
||||
// Gradients local sum buffer
|
||||
@ -206,11 +207,13 @@ void MultiNodeGraphGroup::calculateShardSizes() {
|
||||
void MultiNodeGraphGroup::initShardGpuTensors() {
|
||||
size_t offset = 0;
|
||||
for(int shard = 0; shard < devices_.size(); shard++) {
|
||||
Tensor gpuParams = newTensor(shardSizes_[shard], clientGraphs_[shard]->getBackend());
|
||||
Tensor gpuParams
|
||||
= newTensor(shardSizes_[shard], clientGraphs_[shard]->getBackend());
|
||||
gpuParams->copyFrom(clientGraphs_[0]->params()->vals()->subtensor(
|
||||
offset, shardSizes_[shard]));
|
||||
shardParams_.push_back(gpuParams);
|
||||
shardGrads_.push_back(newTensor(shardSizes_[shard], clientGraphs_[shard]->getBackend()));
|
||||
shardGrads_.push_back(
|
||||
newTensor(shardSizes_[shard], clientGraphs_[shard]->getBackend()));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1,5 +1,5 @@
|
||||
#include "tensors/tensor_operators.h"
|
||||
#include "training/graph_group_singleton.h"
|
||||
#include "tensors/tensor_operators.h"
|
||||
#include "functional/functional.h"
|
||||
|
||||
namespace marian {
|
||||
@ -15,7 +15,8 @@ void SingletonGraph::updateMovingAverage(Tensor mvAvgParams,
|
||||
Tensor params,
|
||||
size_t batches) {
|
||||
using namespace functional;
|
||||
float decay = std::max(mvDecay_, 1.f - (float)(batches + 1) / (float)(batches + 10));
|
||||
float decay
|
||||
= std::max(mvDecay_, 1.f - (float)(batches + 1) / (float)(batches + 10));
|
||||
Element(_1 = ((1.f - decay) * _1) + (decay * _2), mvAvgParams, params);
|
||||
}
|
||||
|
||||
@ -59,8 +60,7 @@ void SingletonGraph::execute(Ptr<data::Batch> batch) {
|
||||
if(mvAvg_) {
|
||||
mvAvgGraph_->reuseWorkspace(graph_);
|
||||
scheduler_->validate({mvAvgGraph_});
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
scheduler_->validate({graph_});
|
||||
}
|
||||
}
|
||||
|
@ -29,7 +29,6 @@ public:
|
||||
: GraphGroup(options),
|
||||
mvAvg_{options_->get<float>("exponential-smoothing") > 0},
|
||||
mvDecay_{options_->get<float>("exponential-smoothing")} {
|
||||
|
||||
auto deviceId = options_->getDevices()[0];
|
||||
graph_ = New<ExpressionGraph>();
|
||||
graph_->setDevice(deviceId);
|
||||
|
@ -1,6 +1,6 @@
|
||||
#include "training/graph_group_sync.h"
|
||||
#include "tensors/tensor_operators.h"
|
||||
#include "functional/functional.h"
|
||||
#include "tensors/tensor_operators.h"
|
||||
|
||||
namespace marian {
|
||||
|
||||
@ -17,7 +17,8 @@ void SyncGraphGroup::updateMovingAverage(Tensor paramsAvg,
|
||||
Tensor params,
|
||||
size_t batches) {
|
||||
using namespace functional;
|
||||
float decay = std::max(mvDecay_, 1.f - (float)(batches + 1) / (float)(batches + 10));
|
||||
float decay
|
||||
= std::max(mvDecay_, 1.f - (float)(batches + 1) / (float)(batches + 10));
|
||||
Element(_1 = ((1.f - decay) * _1) + (decay * _2), paramsAvg, params);
|
||||
}
|
||||
|
||||
@ -135,10 +136,10 @@ void SyncGraphGroup::execute(Ptr<data::Batch> batch) {
|
||||
int size = params_[idx]->size();
|
||||
int i = 0;
|
||||
|
||||
float div = devices_.size(); // no. of GPUs
|
||||
float div = devices_.size(); // no. of GPUs
|
||||
|
||||
// do not average gradients if cost type is sum.
|
||||
if (options_->get<std::string>("cost-type") == "ce-sum") {
|
||||
if(options_->get<std::string>("cost-type") == "ce-sum") {
|
||||
div = 1;
|
||||
}
|
||||
|
||||
@ -176,7 +177,7 @@ void SyncGraphGroup::execute(Ptr<data::Batch> batch) {
|
||||
float cost = 0;
|
||||
for(auto c : costs)
|
||||
cost += c;
|
||||
if (options_->get<std::string>("cost-type") != "ce-sum") {
|
||||
if(options_->get<std::string>("cost-type") != "ce-sum") {
|
||||
cost = cost / costs.size();
|
||||
}
|
||||
|
||||
|
@ -43,7 +43,6 @@ public:
|
||||
devices_{options_->getDevices()},
|
||||
movingAvg_{options_->get<float>("exponential-smoothing") > 0},
|
||||
mvDecay_{options_->get<float>("exponential-smoothing")} {
|
||||
|
||||
for(auto device : devices_) {
|
||||
auto graph = New<ExpressionGraph>();
|
||||
graph->setDevice(device);
|
||||
|
@ -100,7 +100,8 @@ public:
|
||||
return (state_->batches % options_->get<size_t>("save-freq") == 0);
|
||||
}
|
||||
|
||||
void validate(const std::vector<Ptr<ExpressionGraph>>& graphs, bool final = false) {
|
||||
void validate(const std::vector<Ptr<ExpressionGraph>>& graphs,
|
||||
bool final = false) {
|
||||
if(state_->validated
|
||||
|| (state_->batches % options_->get<size_t>("valid-freq") != 0
|
||||
&& !final))
|
||||
|
@ -7,8 +7,8 @@
|
||||
#include "tensors/tensor.h"
|
||||
#include "tensors/tensor_operators.h"
|
||||
#include "training/sparse_tensor.h"
|
||||
#include "tensors/gpu/cuda_helpers.h"
|
||||
|
||||
#include "tensors/gpu/cuda_helpers.h"
|
||||
|
||||
namespace marian {
|
||||
|
||||
@ -48,10 +48,10 @@ __global__ void gFindSubtensor(int* indices,
|
||||
}
|
||||
|
||||
SparseTensorBase::SparseTensorBase(int capacity, Ptr<Backend> backend)
|
||||
: backend_(backend), capacity_(capacity) {
|
||||
: backend_(backend), capacity_(capacity) {
|
||||
ABORT_IF(backend_->getDevice().type == DeviceType::cpu,
|
||||
"Gradient dropping is currently not implemented for CPU usage");
|
||||
|
||||
"Gradient dropping is currently not implemented for CPU usage");
|
||||
|
||||
cudaSetDevice(backend_->getDevice().no);
|
||||
CUDA_CHECK(cudaMalloc(&data_, sizeof(float) * capacity));
|
||||
CUDA_CHECK(cudaMalloc(&indices_, sizeof(int) * capacity));
|
||||
@ -64,7 +64,7 @@ SparseTensorBase::SparseTensorBase(float* data,
|
||||
int* indices,
|
||||
int size,
|
||||
Ptr<Backend> backend)
|
||||
: backend_(backend) {
|
||||
: backend_(backend) {
|
||||
data_ = data;
|
||||
indices_ = indices;
|
||||
size_ = size;
|
||||
@ -98,10 +98,10 @@ void SparseTensorBase::copyFrom(float* data,
|
||||
size_ = size;
|
||||
if(size == 0)
|
||||
return;
|
||||
|
||||
|
||||
ABORT_IF(backend_->getDevice().type == DeviceType::cpu,
|
||||
"Gradient dropping is currently not implemented for CPU usage");
|
||||
|
||||
"Gradient dropping is currently not implemented for CPU usage");
|
||||
|
||||
cudaSetDevice(backend_->getDevice().no);
|
||||
|
||||
cudaMemcpy(data_, data, size * sizeof(float), cudaMemcpyDefault);
|
||||
@ -128,7 +128,7 @@ void SparseTensorBase::setSize(int size) {
|
||||
void SparseTensorBase::toDense(Tensor t, int offset) {
|
||||
ABORT_IF(backend_->getDevice().type == DeviceType::cpu,
|
||||
"Gradient dropping is currently not implemented for CPU usage");
|
||||
|
||||
|
||||
cudaSetDevice(backend_->getDevice().no);
|
||||
int threads = 512;
|
||||
int blocks = 1 + size_ / threads;
|
||||
@ -152,8 +152,8 @@ std::shared_ptr<SparseTensorBase> SparseTensorBase::subtensor(int pos,
|
||||
int size,
|
||||
int idx) {
|
||||
ABORT_IF(backend_->getDevice().type == DeviceType::cpu,
|
||||
"Gradient dropping is currently not implemented for CPU usage");
|
||||
|
||||
"Gradient dropping is currently not implemented for CPU usage");
|
||||
|
||||
cudaSetDevice(backend_->getDevice().no);
|
||||
cudaStreamSynchronize(0);
|
||||
int* start = gstart_ + idx;
|
||||
|
@ -45,7 +45,6 @@ public:

if((options_->has("valid-sets") || options_->has("valid-script-path"))
&& options_->get<size_t>("valid-freq") > 0) {

for(auto validator : Validators(dataset->getVocabs(), options_))
scheduler->addValidator(validator);
}

@@ -4,9 +4,9 @@
#include <cstdlib>
#include <limits>

#include "3rd_party/threadpool.h"
#include "common/config.h"
#include "common/utils.h"
#include "3rd_party/threadpool.h"
#include "data/batch_generator.h"
#include "data/corpus.h"
#include "graph/expression_graph.h"

@@ -25,8 +25,7 @@ namespace marian {
class ValidatorBase : public TrainingObserver {
public:
  ValidatorBase(bool lowerIsBetter)
      : lowerIsBetter_(lowerIsBetter),
        lastBest_{initScore()} {}
      : lowerIsBetter_(lowerIsBetter), lastBest_{initScore()} {}

  virtual float validate(const std::vector<Ptr<ExpressionGraph>>& graphs) = 0;
  virtual std::string type() = 0;

@@ -98,7 +97,8 @@ protected:
                        Ptr<data::BatchGenerator<DataSet>>)
      = 0;

  void updateStalled(const std::vector<Ptr<ExpressionGraph>>& graphs, float val) {
  void updateStalled(const std::vector<Ptr<ExpressionGraph>>& graphs,
                     float val) {
    if((lowerIsBetter_ && lastBest_ > val)
       || (!lowerIsBetter_ && lastBest_ < val)) {
      stalled_ = 0;

@@ -226,7 +226,6 @@ public:
  TranslationValidator(std::vector<Ptr<Vocab>> vocabs, Ptr<Config> options)
      : Validator(vocabs, options, false),
        quiet_(options_->get<bool>("quiet-translation")) {

    Ptr<Options> opts = New<Options>();
    opts->merge(options);
    opts->set("inference", true);

@@ -314,7 +313,8 @@ public:
      scorer = scorers[id % graphs.size()];
    }

    auto search = New<BeamSearch>(options_, std::vector<Ptr<Scorer>>{scorer});
    auto search
        = New<BeamSearch>(options_, std::vector<Ptr<Scorer>>{scorer});
    auto histories = search->search(graph, batch);

    for(auto history : histories) {
@@ -34,21 +34,20 @@ public:
              std::vector<Ptr<ScorerState>>& states,
              size_t beamSize,
              bool first) {

    Beams newBeams(beams.size());
    for(int i = 0; i < keys.size(); ++i) {
      int embIdx = keys[i] % vocabSize;
      int beamIdx = i / beamSize;
      int embIdx = keys[i] % vocabSize;
      int beamIdx = i / beamSize;

      if(newBeams[beamIdx].size() < beams[beamIdx].size()) {
        auto& beam = beams[beamIdx];
        auto& newBeam = newBeams[beamIdx];

        int hypIdx = keys[i] / vocabSize;
        float cost = costs[i];
        float cost = costs[i];

        int hypIdxTrans = (hypIdx / beamSize) +
                          (hypIdx % beamSize) * beams.size();
        int hypIdxTrans
            = (hypIdx / beamSize) + (hypIdx % beamSize) * beams.size();
        if(first)
          hypIdxTrans = hypIdx;

@@ -78,7 +77,7 @@ public:

  Beams pruneBeam(const Beams& beams) {
    Beams newBeams;
    for(auto beam: beams) {
    for(auto beam : beams) {
      Beam newBeam;
      for(auto hyp : beam) {
        if(hyp->GetWord() > 0) {

@@ -90,9 +89,7 @@ public:
    return newBeams;
  }

  Histories search(Ptr<ExpressionGraph> graph,
                   Ptr<data::CorpusBatch> batch) {

  Histories search(Ptr<ExpressionGraph> graph, Ptr<data::CorpusBatch> batch) {
    int dimBatch = batch->size();
    Histories histories;
    for(int i = 0; i < dimBatch; ++i) {

@@ -140,8 +137,7 @@ public:
      Expr prevCosts;
      if(first) {
        // no cost
        prevCosts = graph->constant({1, 1, 1, 1},
                                    inits::from_value(0));
        prevCosts = graph->constant({1, 1, 1, 1}, inits::from_value(0));
      } else {
        std::vector<float> beamCosts;

@@ -155,8 +151,7 @@ public:
            hypIndices.push_back(hyp->GetPrevStateIndex());
            embIndices.push_back(hyp->GetWord());
            beamCosts.push_back(hyp->GetCost());
          }
          else {
          } else {
            hypIndices.push_back(0);
            embIndices.push_back(0);
            beamCosts.push_back(-9999);

@@ -164,9 +159,8 @@ public:
          }
        }

        prevCosts
            = graph->constant({(int)localBeamSize, 1, dimBatch, 1},
                              inits::from_vector(beamCosts));
        prevCosts = graph->constant({(int)localBeamSize, 1, dimBatch, 1},
                                    inits::from_vector(beamCosts));
      }

      //**********************************************************************

@@ -174,10 +168,12 @@ public:
      auto totalCosts = prevCosts;

      for(int i = 0; i < scorers_.size(); ++i) {
        states[i] = scorers_[i]->step(graph, states[i], hypIndices, embIndices, dimBatch, localBeamSize);
        states[i] = scorers_[i]->step(
            graph, states[i], hypIndices, embIndices, dimBatch, localBeamSize);

        if(scorers_[i]->getWeight() != 1.f)
          totalCosts = totalCosts + scorers_[i]->getWeight() * states[i]->getProbs();
          totalCosts
              = totalCosts + scorers_[i]->getWeight() * states[i]->getProbs();
        else
          totalCosts = totalCosts + states[i]->getProbs();
      }

@@ -207,12 +203,14 @@ public:
      nth->getNBestList(beamSizes, totalCosts->val(), outCosts, outKeys, first);

      int dimTrgVoc = totalCosts->shape()[-1];
      beams = toHyps(outKeys, outCosts, dimTrgVoc, beams, states, localBeamSize, first);
      beams = toHyps(
          outKeys, outCosts, dimTrgVoc, beams, states, localBeamSize, first);

      auto prunedBeams = pruneBeam(beams);
      for(int i = 0; i < dimBatch; ++i) {
        if(!beams[i].empty()) {
          final = final || histories[i]->size() >= 3 * batch->front()->batchWidth();
          final = final
                  || histories[i]->size() >= 3 * batch->front()->batchWidth();
          histories[i]->Add(beams[i], prunedBeams[i].empty() || final);
        }
      }
@@ -18,7 +18,7 @@ void SetColumn(Tensor in_, size_t col, float value) {
  int nColumns = in_->shape()[-1];

  float* in = in_->data();
  for (int rowNumber = 0; rowNumber < nRows; ++rowNumber) {
  for(int rowNumber = 0; rowNumber < nRows; ++rowNumber) {
    int index = col + rowNumber * nColumns;
    in[index] = value;
  }

@@ -31,7 +31,6 @@ void suppressUnk(Expr probs) {
void suppressWord(Expr probs, Word id) {
  SetColumn(probs->val(), id, std::numeric_limits<float>::lowest());
}

}

void suppressUnk(Expr probs) {

@@ -55,5 +54,4 @@ void suppressWord(Expr probs, Word id) {
}
#endif
}

}

@@ -44,6 +44,5 @@ void suppressUnk(Expr probs) {
void suppressWord(Expr probs, Word id) {
  SetColumn(probs->val(), id, std::numeric_limits<float>::lowest());
}

}
}
@@ -14,7 +14,6 @@ namespace cpu {
void suppressUnk(Expr probs);

void suppressWord(Expr probs, Word id);

}

namespace gpu {

@@ -22,11 +21,9 @@ namespace gpu {
void suppressUnk(Expr probs);

void suppressWord(Expr probs, Word id);

}

void suppressUnk(Expr probs);

void suppressWord(Expr probs, Word id);

}
@@ -27,7 +27,8 @@ public:
      if(beam[j]->GetWord() == 0 || last) {
        float cost = beam[j]->GetCost() / LengthPenalty(history_.size());
        topHyps_.push({history_.size(), j, cost});
        //std::cerr << "Add " << history_.size() << " " << j << " " << cost << std::endl;
        // std::cerr << "Add " << history_.size() << " " << j << " " << cost
        // << std::endl;
      }
    }
    history_.push_back(beam);

@@ -44,14 +45,14 @@ public:

    size_t start = bestHypCoord.i;
    size_t j = bestHypCoord.j;
    //float c = bestHypCoord.cost;
    //std::cerr << "h: " << start << " " << j << " " << c << std::endl;
    // float c = bestHypCoord.cost;
    // std::cerr << "h: " << start << " " << j << " " << c << std::endl;

    Words targetWords;
    Ptr<Hypothesis> bestHyp = history_[start][j];
    while(bestHyp->GetPrevHyp() != nullptr) {
      targetWords.push_back(bestHyp->GetWord());
      //std::cerr << bestHyp->GetWord() << " " << bestHyp << std::endl;
      // std::cerr << bestHyp->GetWord() << " " << bestHyp << std::endl;
      bestHyp = bestHyp->GetPrevHyp();
    }
Some files were not shown because too many files have changed in this diff.