From 483426798930bc17309fecfb401b28ec8917529c Mon Sep 17 00:00:00 2001
From: Marcin Junczys-Dowmunt <junczys@amu.edu.pl>
Date: Thu, 15 Feb 2018 15:23:45 -0800
Subject: [PATCH 01/11] clean up allocators and devices

---
 CMakeLists.txt                  | 12 +++---
 src/CMakeLists.txt              |  4 +-
 src/backend/cpu/dropout.cpp     |  4 +-
 src/graph/expression_graph.h    |  2 +-
 src/kernels/cuda_helpers.h      |  6 +++
 src/kernels/tensor_operators.cu | 12 +++---
 src/kernels/tensor_operators.h  |  5 +--
 src/python/CMakeLists.txt       |  2 +-
 src/tensors/allocator.h         | 28 +++++++-------
 src/tensors/device.cpp          | 28 ++++++++++++++
 src/tensors/device.cu           | 40 +++++++++++++++++++
 src/tensors/device.h            | 68 +++++++++++++++++++++++++++++++++
 src/tensors/device_gpu.cu       | 37 ------------------
 src/tensors/device_gpu.h        | 35 -----------------
 src/tensors/memory_piece.cu     | 11 ------
 src/tensors/memory_piece.h      |  7 ----
 src/tensors/tensor_allocator.h  |  7 ++--
 src/tests/allocator_test.cpp    | 46 +---------------------
 src/tests/dropout_test.cu       |  6 +--
 19 files changed, 181 insertions(+), 179 deletions(-)
 create mode 100644 src/tensors/device.cpp
 create mode 100644 src/tensors/device.cu
 create mode 100644 src/tensors/device.h
 delete mode 100644 src/tensors/device_gpu.cu
 delete mode 100644 src/tensors/device_gpu.h
 delete mode 100644 src/tensors/memory_piece.cu
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6628d8f4..23598378 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -58,12 +58,12 @@ else(Tcmalloc_FOUND)
   message(WARNING "Cannot find TCMalloc library. Continuing.")
 endif(Tcmalloc_FOUND)
 
-find_package(MPI)
-if(MPI_FOUND)
-  include_directories(${MPI_INCLUDE_PATH})
-  set(EXT_LIBS ${EXT_LIBS} ${MPI_LIBRARIES})
-  add_definitions(-DMPI_FOUND=1)
-endif(MPI_FOUND)
+#find_package(MPI)
+#if(MPI_FOUND)
+#  include_directories(${MPI_INCLUDE_PATH})
+#  set(EXT_LIBS ${EXT_LIBS} ${MPI_LIBRARIES})
+#  add_definitions(-DMPI_FOUND=1)
+#endif(MPI_FOUND)
 
 find_package(ZLIB)
 if(ZLIB_FOUND)
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 8550e51d..c1bfdaf6 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -14,8 +14,8 @@ cuda_add_library(marian
   graph/node_operators.cu
   graph/node_initializers.cu
   tensors/tensor.cu
-  tensors/device_gpu.cu
-  tensors/memory_piece.cu
+  tensors/device.cu
+  tensors/device.cpp
   kernels/tensor_operators.cu
   kernels/cudnn_wrappers.cu
   backend/gpu/dropout.cu
diff --git a/src/backend/cpu/dropout.cpp b/src/backend/cpu/dropout.cpp
index 034dfeec..247dd460 100644
--- a/src/backend/cpu/dropout.cpp
+++ b/src/backend/cpu/dropout.cpp
@@ -1,5 +1,4 @@
-#include <stdio.h>
-#include <stdlib.h>
+#include <algorithm>
 
 #include "backend/dispatch.h"
 
@@ -8,6 +7,7 @@ namespace marian {
     
     void Dropout(Ptr<Backend> backend, Tensor tensor, float p) {
       ABORT("Not implemented");
+      std::fill(tensor->data(), tensor->data() + tensor->size(), p);
     }
     
   }
diff --git a/src/graph/expression_graph.h b/src/graph/expression_graph.h
index 9d7f714e..7d50e9ba 100644
--- a/src/graph/expression_graph.h
+++ b/src/graph/expression_graph.h
@@ -306,7 +306,7 @@ public:
       tensors_->free(t);
   }
 
-  Ptr<Allocator<DeviceGPU>> allocator() { return tensors_->allocator(); }
+  Ptr<Allocator> allocator() { return tensors_->allocator(); }
 
   void clear() {
     // clear everything apart from parameters
diff --git a/src/kernels/cuda_helpers.h b/src/kernels/cuda_helpers.h
index bc4c21d6..92faaf4c 100644
--- a/src/kernels/cuda_helpers.h
+++ b/src/kernels/cuda_helpers.h
@@ -17,6 +17,12 @@ inline void gpuAssert(cudaError_t code,
   }
 }
 
+template <typename T, typename D>
+void CudaCopy(const T* start, const T* end, D* dest) {
+  CUDA_CHECK(cudaMemcpy((void*)dest, (void*)start, (end - start) * sizeof(T),
+             cudaMemcpyDefault));
+}
+
 #define CUSPARSE_CHECK(x)                               \
   {                                                     \
     cusparseStatus_t _c = x;                            \
diff --git a/src/kernels/tensor_operators.cu b/src/kernels/tensor_operators.cu
index ad7c5d14..69b8afc4 100644
--- a/src/kernels/tensor_operators.cu
+++ b/src/kernels/tensor_operators.cu
@@ -879,7 +879,7 @@ __global__ void gInsert(float* out,
   }
 }
 
-void Select(Ptr<Allocator<DeviceGPU>> allocator,
+void Select(Ptr<Allocator> allocator,
             Tensor out,
             const Tensor in,
             int axis,
@@ -892,8 +892,8 @@ void Select(Ptr<Allocator<DeviceGPU>> allocator,
   int blocks = std::min(MAX_BLOCKS, length / threads + (length % threads != 0));
 
   auto mp_indices = allocator->alloc<size_t>(indices.size());
-  mp_indices->insert(indices.data(), indices.size());
-
+  CudaCopy(indices.data(), indices.data() + indices.size(), mp_indices->data());
+  
   int axisGPU = axis + gpu::Shape::size() - out->shape().size();
   gSelect<<<blocks, threads>>>(out->data(),
                                out->shape(),
@@ -905,7 +905,7 @@ void Select(Ptr<Allocator<DeviceGPU>> allocator,
   allocator->free(mp_indices);
 }
 
-void Insert(Ptr<Allocator<DeviceGPU>> allocator,
+void Insert(Ptr<Allocator> allocator,
             Tensor out,
             const Tensor in,
             int axis,
@@ -918,8 +918,8 @@ void Insert(Ptr<Allocator<DeviceGPU>> allocator,
   int blocks = std::min(MAX_BLOCKS, length / threads + (length % threads != 0));
 
   auto mp_indices = allocator->alloc<size_t>(indices.size());
-  mp_indices->insert(indices.data(), indices.size());
-
+  CudaCopy(indices.data(), indices.data() + indices.size(), mp_indices->data());
+  
   int axisGPU = axis + gpu::Shape::size() - out->shape().size();
   gInsert<<<blocks, threads>>>(out->data(),
                                out->shape(),
diff --git a/src/kernels/tensor_operators.h b/src/kernels/tensor_operators.h
index 91717c1b..77e58670 100644
--- a/src/kernels/tensor_operators.h
+++ b/src/kernels/tensor_operators.h
@@ -7,7 +7,6 @@
 #include "tensors/tensor.h"
 
 #include "tensors/allocator.h"
-#include "tensors/device_gpu.h"
 
 #include "gpu/shape.h"
 #include "gpu/tmp.h"
@@ -71,13 +70,13 @@ void Element(Functor functor, Tensor out, Tensors ...tensors) {
 
 void TransposeND(Tensor out, Tensor in, const std::vector<int>& vAxis);
 
-void Select(Ptr<Allocator<DeviceGPU>> allocator,
+void Select(Ptr<Allocator> allocator,
             Tensor out,
             Tensor in,
             int axis,
             const std::vector<size_t>&);
 
-void Insert(Ptr<Allocator<DeviceGPU>> allocator,
+void Insert(Ptr<Allocator> allocator,
             Tensor out,
             Tensor in,
             int axis,
diff --git a/src/python/CMakeLists.txt b/src/python/CMakeLists.txt
index baa7f97d..d547660a 100644
--- a/src/python/CMakeLists.txt
+++ b/src/python/CMakeLists.txt
@@ -8,7 +8,7 @@ cuda_add_library(pymarian SHARED
   ../graph/node.cu
   ../graph/node_operators.cu
   ../tensors/tensor.cu
-  ../tensors/device_gpu.cu
+  ../tensors/device.cpp
   ../kernels/tensor_operators.cu
   ../backend/gpu/dropout.cu
   ../backend/cpu/dropout.cpp
diff --git a/src/tensors/allocator.h b/src/tensors/allocator.h
index 8c9a5fee..6c9b51f6 100644
--- a/src/tensors/allocator.h
+++ b/src/tensors/allocator.h
@@ -10,6 +10,7 @@
 
 #include "common/definitions.h"
 #include "tensors/memory_piece.h"
+#include "tensors/device.h"
 
 namespace marian {
 
@@ -65,10 +66,9 @@ public:
   Gap rest(size_t offset) const { return Gap(data_ + offset, size_ - offset); }
 };
 
-template <class Device>
 class Allocator {
 private:
-  Device device_;
+  Ptr<Device> device_;
   size_t available_{0};
   size_t step_{128 * 1024 * 1024};
   size_t alignment_{256};
@@ -83,23 +83,23 @@ private:
 
   void grow(size_t add) {
     add = align(add);
-    uint8_t* oldData = device_.data();
-    size_t oldSize = device_.size();
+    uint8_t* oldData = device_->data();
+    size_t oldSize = device_->size();
 
-    device_.reserve(oldSize + add);
+    device_->reserve(oldSize + add);
 
     std::set<Gap> oldGaps;
     gaps_.swap(oldGaps);
 
     for(auto gap : oldGaps)
       gaps_.insert(
-          Gap(device_.data() + std::distance(oldData, gap.data()), gap.size()));
-    insertGap(Gap(device_.data() + oldSize, add));
+          Gap(device_->data() + std::distance(oldData, gap.data()), gap.size()));
+    insertGap(Gap(device_->data() + oldSize, add));
 
     std::unordered_map<uint8_t*, Ptr<MemoryPiece>> oldAllocated;
     allocated_.swap(oldAllocated);
     for(auto it : oldAllocated) {
-      uint8_t* newPtr = device_.data() + std::distance(oldData, it.first);
+      uint8_t* newPtr = device_->data() + std::distance(oldData, it.first);
       allocated_[newPtr] = oldAllocated[it.first];
       allocated_[newPtr]->setPtr(newPtr);
     }
@@ -142,7 +142,7 @@ private:
 
 public:
   Allocator(DeviceId deviceId, size_t bytes, size_t step, size_t alignment = 256)
-      : device_(deviceId, alignment),
+      : device_(DispatchDevice(deviceId, alignment)),
         step_(step),
         available_(0),
         alignment_(alignment) {
@@ -153,7 +153,7 @@ public:
 
   void reserve(size_t bytes) {
     bytes = align(bytes);
-    device_.reserve(bytes);
+    device_->reserve(bytes);
     clear();
   }
 
@@ -211,17 +211,17 @@ public:
     available_ = 0;
     gaps_.clear();
     allocated_.clear();
-    insertGap({device_.data(), device_.size()}, false);
+    insertGap({device_->data(), device_->size()}, false);
   }
 
   Ptr<MemoryPiece> memory() {
-    return New<MemoryPiece>(device_.data(), device_.size());
+    return New<MemoryPiece>(device_->data(), device_->size());
   }
 
-  size_t size() { return device_.size(); }
+  size_t size() { return device_->size(); }
 
   size_t available() { return available_; }
 
-  DeviceId getDevice() { return device_.getDevice(); }
+  DeviceId getDevice() { return device_->getDevice(); }
 };
 }
diff --git a/src/tensors/device.cpp b/src/tensors/device.cpp
new file mode 100644
index 00000000..5e2a3568
--- /dev/null
+++ b/src/tensors/device.cpp
@@ -0,0 +1,28 @@
+#include <iostream>
+#include "tensors/device.h"
+
+namespace marian {
+namespace cpu {
+  
+  Device::~Device() {
+    delete[] data_;
+  }
+  
+  void Device::reserve(size_t size) {
+    size = align(size);
+    ABORT_IF(size < size_, "New size must be larger than old size");
+
+    if(data_) {
+      uint8_t *temp = new uint8_t[size_];
+      std::copy(data_, data_ + size_, temp);
+      delete[] data_;
+      data_ = temp;
+    } else {
+      data_ = new uint8_t[size];
+    }
+
+    size_ = size;
+  }
+  
+}
+}
diff --git a/src/tensors/device.cu b/src/tensors/device.cu
new file mode 100644
index 00000000..c773d566
--- /dev/null
+++ b/src/tensors/device.cu
@@ -0,0 +1,40 @@
+#include <cuda.h>
+#include <iostream>
+
+#include "tensors/device.h"
+#include "kernels/cuda_helpers.h"
+
+namespace marian {
+namespace gpu {
+  
+  Device::~Device() {
+    cudaSetDevice(deviceId_.no);
+    if(data_) {
+      CUDA_CHECK(cudaFree(data_));
+    }
+    cudaDeviceSynchronize();
+  }
+  
+  void Device::reserve(size_t size) {
+    size = align(size);
+    cudaSetDevice(deviceId_.no);
+  
+    ABORT_IF(size < size_, "New size must be larger than old size");
+  
+    if(data_) {
+      // Allocate memory by going through host memory
+      uint8_t *temp = new uint8_t[size_];
+      CUDA_CHECK(cudaMemcpy(temp, data_, size_, cudaMemcpyDeviceToHost));
+      CUDA_CHECK(cudaFree(data_));
+      CUDA_CHECK(cudaMalloc(&data_, size));
+      CUDA_CHECK(cudaMemcpy(data_, temp, size_, cudaMemcpyHostToDevice));
+      delete[] temp;
+    } else {
+      CUDA_CHECK(cudaMalloc(&data_, size));
+    }
+  
+    size_ = size;
+  }
+  
+}
+}
diff --git a/src/tensors/device.h b/src/tensors/device.h
new file mode 100644
index 00000000..33a9c104
--- /dev/null
+++ b/src/tensors/device.h
@@ -0,0 +1,68 @@
+#pragma once
+
+#include <cmath>
+#include <cstdint>
+
+#include "common/definitions.h"
+
+namespace marian {
+
+class Device {
+protected:
+  DeviceId deviceId_;
+  
+  uint8_t* data_;
+  size_t size_;
+  size_t alignment_;
+
+  size_t align(size_t size) {
+    return ceil(size / (float)alignment_) * alignment_;
+  }
+
+public:
+  Device(DeviceId deviceId, size_t alignment = 256)
+      : deviceId_(deviceId), data_(0), size_(0), alignment_(alignment) {}
+
+  virtual ~Device() {};
+
+  virtual void reserve(size_t size) = 0;
+
+  virtual uint8_t* data() { return data_; }
+
+  virtual size_t size() { return size_; }
+
+  virtual DeviceId getDevice() { return deviceId_; }
+};
+
+namespace gpu {
+  class Device : public marian::Device {
+    public:
+      Device(DeviceId deviceId, size_t alignment = 256)
+      : marian::Device(deviceId, alignment) {}
+      
+      ~Device();
+      
+      void reserve(size_t size);
+  };
+}
+
+namespace cpu {
+  class Device : public marian::Device {
+    public:
+      Device(DeviceId deviceId, size_t alignment = 256)
+      : marian::Device(deviceId, alignment) {}
+      
+      ~Device();
+      
+      void reserve(size_t size);
+  };
+}
+
+static inline Ptr<Device> DispatchDevice(DeviceId deviceId, size_t alignment = 256) {
+  if(deviceId.type == DeviceType::gpu)
+    return New<gpu::Device>(deviceId, alignment);
+  else
+    return New<cpu::Device>(deviceId, alignment);
+}
+
+}
\ No newline at end of file
diff --git a/src/tensors/device_gpu.cu b/src/tensors/device_gpu.cu
deleted file mode 100644
index f56c8747..00000000
--- a/src/tensors/device_gpu.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-#include <cuda.h>
-#include <iostream>
-
-#include "tensors/device_gpu.h"
-#include "kernels/cuda_helpers.h"
-
-namespace marian {
-
-DeviceGPU::~DeviceGPU() {
-  cudaSetDevice(deviceId_.no);
-  if(data_) {
-    CUDA_CHECK(cudaFree(data_));
-  }
-  cudaDeviceSynchronize();
-}
-
-void DeviceGPU::reserve(size_t size) {
-  size = align(size);
-  cudaSetDevice(deviceId_.no);
-
-  ABORT_IF(size < size_, "New size must be larger than old size");
-
-  if(data_) {
-    // Allocate memory by going through host memory
-    uint8_t *temp = new uint8_t[size_];
-    CUDA_CHECK(cudaMemcpy(temp, data_, size_, cudaMemcpyDeviceToHost));
-    CUDA_CHECK(cudaFree(data_));
-    CUDA_CHECK(cudaMalloc(&data_, size));
-    CUDA_CHECK(cudaMemcpy(data_, temp, size_, cudaMemcpyHostToDevice));
-    delete[] temp;
-  } else {
-    CUDA_CHECK(cudaMalloc(&data_, size));
-  }
-
-  size_ = size;
-}
-}
diff --git a/src/tensors/device_gpu.h b/src/tensors/device_gpu.h
deleted file mode 100644
index 279a2b2b..00000000
--- a/src/tensors/device_gpu.h
+++ /dev/null
@@ -1,35 +0,0 @@
-#pragma once
-
-#include <cmath>
-#include <cstdint>
-
-#include "common/definitions.h"
-
-namespace marian {
-
-class DeviceGPU {
-private:
-  uint8_t* data_;
-  size_t size_;
-  DeviceId deviceId_;
-  size_t alignment_;
-
-  size_t align(size_t size) {
-    return ceil(size / (float)alignment_) * alignment_;
-  }
-
-public:
-  DeviceGPU(DeviceId deviceId, size_t alignment = 256)
-      : data_(0), size_(0), deviceId_(deviceId), alignment_(alignment) {}
-
-  ~DeviceGPU();
-
-  void reserve(size_t size);
-
-  uint8_t* data() { return data_; }
-
-  size_t size() { return size_; }
-
-  DeviceId getDevice() { return deviceId_; }
-};
-}
\ No newline at end of file
diff --git a/src/tensors/memory_piece.cu b/src/tensors/memory_piece.cu
deleted file mode 100644
index d19767ae..00000000
--- a/src/tensors/memory_piece.cu
+++ /dev/null
@@ -1,11 +0,0 @@
-#include <cuda.h>
-
-#include "kernels/cuda_helpers.h"
-#include "tensors/memory_piece.h"
-
-namespace marian {
-
-void MemoryPiece::insert(uint8_t* ptr, size_t num) {
-  CUDA_CHECK(cudaMemcpy(data_, ptr, num * sizeof(uint8_t), cudaMemcpyDefault));
-}
-}
diff --git a/src/tensors/memory_piece.h b/src/tensors/memory_piece.h
index 4b464e79..8380d2cd 100644
--- a/src/tensors/memory_piece.h
+++ b/src/tensors/memory_piece.h
@@ -34,13 +34,6 @@ public:
 
   void setPtr(uint8_t* data) { data_ = data; }
 
-  template <typename T>
-  void insert(T* ptr, size_t num) {
-    insert((uint8_t*)ptr, num * sizeof(T));
-  }
-
-  void insert(uint8_t* ptr, size_t num);
-
   friend std::ostream& operator<<(std::ostream& out, const MemoryPiece mp) {
     out << "MemoryPiece - ptr: " << std::hex << (size_t)mp.data() << std::dec
         << " size: " << mp.size();
diff --git a/src/tensors/tensor_allocator.h b/src/tensors/tensor_allocator.h
index 3498ff18..e1c54b22 100644
--- a/src/tensors/tensor_allocator.h
+++ b/src/tensors/tensor_allocator.h
@@ -5,7 +5,6 @@
 
 #include "common/definitions.h"
 #include "tensors/allocator.h"
-#include "tensors/device_gpu.h"
 #include "tensors/tensor.h"
 
 namespace marian {
@@ -17,11 +16,11 @@ private:
   const size_t GROW = CHUNK * MBYTE;
   const size_t ALIGN = 256;
 
-  Ptr<Allocator<DeviceGPU>> allocator_;
+  Ptr<Allocator> allocator_;
 
 public:
   TensorAllocator(DeviceId deviceId)
-      : allocator_(New<Allocator<DeviceGPU>>(deviceId, 0, GROW, ALIGN)) {}
+      : allocator_(New<Allocator>(deviceId, 0, GROW, ALIGN)) {}
 
   ~TensorAllocator() { clear(); }
 
@@ -73,6 +72,6 @@ public:
 
   size_t size() { return allocator_->size() / sizeof(float); }
 
-  Ptr<Allocator<DeviceGPU>> allocator() { return allocator_; }
+  Ptr<Allocator> allocator() { return allocator_; }
 };
 }
diff --git a/src/tests/allocator_test.cpp b/src/tests/allocator_test.cpp
index 2a67fe37..1b5cb348 100644
--- a/src/tests/allocator_test.cpp
+++ b/src/tests/allocator_test.cpp
@@ -5,55 +5,11 @@
 
 #include "3rd_party/exception.h"
 #include "tensors/allocator.h"
-#include "tensors/device_gpu.h"
-
-class DeviceCPU {
-private:
-  uint8_t* data_;
-  size_t size_;
-  size_t alignment_;
-
-public:
-  DeviceCPU(size_t device, size_t alignment = 256)
-   : data_(0), size_(0), alignment_(alignment) {}
-
-  ~DeviceCPU() { delete[] data_; }
-
-  size_t align(size_t size) {
-    return ceil(size / (float)alignment_) * alignment_;
-  }
-
-  void reserve(size_t size) {
-    size = align(size);
-    ABORT_IF(size < size_, "New size must be larger than old size");
-
-    if(data_) {
-      // Allocate memory by going through host memory
-      uint8_t *temp = new uint8_t[size_];
-      std::copy(data_, data_ + size_, temp);
-      std::fill(data_, data_ + size_, 0);
-      delete[] data_;
-      data_ = new uint8_t[size];
-      std::copy(temp, temp + size_, data_);
-      delete[] temp;
-    } else {
-      data_ = new uint8_t[size];
-    }
-
-    size_ = size;
-  }
-
-  uint8_t* data() { return data_; }
-
-  size_t size() { return size_; }
-
-  size_t getDevice() { return 0; }
-};
 
 int main(int argc, char** argv) {
   using namespace marian;
 
-  auto a = New<Allocator<DeviceGPU>>({0, DeviceType::gpu}, 0, 30000, 256);
+  auto a = New<Allocator>({0, DeviceType::gpu}, 0, 30000, 256);
   std::cerr << "Size: " << a->size() << std::endl;
 
   auto mem1 = a->alloc<int>(100000);
diff --git a/src/tests/dropout_test.cu b/src/tests/dropout_test.cu
index 517056c2..37738deb 100644
--- a/src/tests/dropout_test.cu
+++ b/src/tests/dropout_test.cu
@@ -1,13 +1,9 @@
-#include <cuda.h>
-#include <curand.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <boost/chrono.hpp>
 #include <boost/timer/timer.hpp>
 #include <vector>
 
-#include "common/config.h"
-#include "graph/node_initializers.h"
 #include "marian.h"
 
 using namespace marian;
@@ -17,7 +13,7 @@ int main(int argc, char** argv) {
   auto c = New<Config>(argc, argv);
 
   auto g = New<ExpressionGraph>();
-  g->setDevice({0, DeviceType::gpu});
+  g->setDevice({0, DeviceType::cpu});
   g->reserveWorkspaceMB(512);
 
   for(int i = 0; i < 10; ++i) {

From b4fdf6134b47220955a41a4b60806c53829eb1c7 Mon Sep 17 00:00:00 2001
From: Marcin Junczys-Dowmunt <junczys@amu.edu.pl>
Date: Thu, 15 Feb 2018 15:30:16 -0800
Subject: [PATCH 02/11] remove tensor_cpu.h

---
 src/tensors/tensor.cu    |  5 +--
 src/tensors/tensor_cpu.h | 82 ----------------------------------------
 2 files changed, 1 insertion(+), 86 deletions(-)
 delete mode 100644 src/tensors/tensor_cpu.h

diff --git a/src/tensors/tensor.cu b/src/tensors/tensor.cu
index eec4be6b..bc26fcec 100644
--- a/src/tensors/tensor.cu
+++ b/src/tensors/tensor.cu
@@ -1,11 +1,8 @@
 
-#include <cuda.h>
-#include <thrust/device_ptr.h>
-#include <thrust/device_vector.h>
+#include "tensors/tensor.h"
 
 #include "kernels/cuda_helpers.h"
 #include "kernels/tensor_operators.h"
-#include "tensors/tensor.h"
 
 namespace marian {
 
diff --git a/src/tensors/tensor_cpu.h b/src/tensors/tensor_cpu.h
deleted file mode 100644
index 18725b3d..00000000
--- a/src/tensors/tensor_cpu.h
+++ /dev/null
@@ -1,82 +0,0 @@
-#pragma once
-
-// This file is part of the Marian toolkit.
-
-//
-// Permission is hereby granted, free of charge, to any person obtaining a copy
-// of this software and associated documentation files (the "Software"), to deal
-// in the Software without restriction, including without limitation the rights
-// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-// copies of the Software, and to permit persons to whom the Software is
-// furnished to do so, subject to the following conditions:
-//
-// The above copyright notice and this permission notice shall be included in
-// all copies or substantial portions of the Software.
-//
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-// SOFTWARE.
-
-#include <cstring>
-
-#include "tensors/tensor.h"
-
-namespace marian {
-
-class TensorCPU : public TensorBase {
-public:
-  TensorCPU(float* data, Shape shape) : TensorBase(data, shape) {}
-
-  float get(size_t i) { return data_[i]; }
-
-  void set(size_t i, float value) { data_[i] = value; }
-
-  void get(std::vector<float>& v) {
-    v.resize(size());
-    std::copy(data_, data_ + size(), v.begin());
-  }
-
-  void set(float value) { std::fill(data_, data_ + size(), value); }
-
-  void set(const std::vector<float>& v) {
-    std::copy(v.begin(), v.end(), data_);
-  }
-};
-
-class DeviceCPU {
-private:
-  float* data_;
-  size_t size_
-
-      public : DeviceCPU()
-      : data_(0), size_(0) {}
-
-  ~DeviceCPU() {
-    if(data_)
-      delete[] data_;
-  }
-
-  typedef TensorCPU tensor_type;
-
-  void reserve(size_t size) {
-    ABORT_IF(size < size_, "New size must be larger than old size");
-    float* temp = new float[size];
-
-    if(data_) {
-      std::memcpy(temp, data_, size_ * sizeof(float));
-      delete[] data_;
-    }
-
-    data_ = temp;
-    size_ = size;
-  }
-
-  float* data() { return data_; }
-
-  size_t capacity() { return size_; }
-};
-}

From ae600b3ef7cefb82c569444b1477873e4c2229f3 Mon Sep 17 00:00:00 2001
From: Marcin Junczys-Dowmunt <junczys@amu.edu.pl>
Date: Thu, 15 Feb 2018 17:42:43 -0800
Subject: [PATCH 03/11] more clean up in backend

---
 src/CMakeLists.txt                | 18 +++++----
 src/backend/cpu/dropout.cpp       | 14 -------
 src/backend/dispatch.h            | 39 -------------------
 src/backend/gpu/dropout.cu        | 50 ------------------------
 src/graph/backend.h               | 20 ----------
 src/graph/backend_gpu.h           | 65 -------------------------------
 src/graph/expression_graph.cpp    |  7 +---
 src/graph/expression_graph.h      |  2 +-
 src/graph/node.cu                 | 51 ------------------------
 src/graph/node.h                  |  5 ++-
 src/graph/node_operators_binary.h | 58 +++++++++++++--------------
 src/graph/node_operators_unary.h  |  5 ++-
 src/python/CMakeLists.txt         |  4 +-
 src/tests/dropout_test.cu         |  2 +-
 14 files changed, 51 insertions(+), 289 deletions(-)
 delete mode 100644 src/backend/cpu/dropout.cpp
 delete mode 100644 src/backend/dispatch.h
 delete mode 100644 src/backend/gpu/dropout.cu
 delete mode 100644 src/graph/backend.h
 delete mode 100644 src/graph/backend_gpu.h
 delete mode 100644 src/graph/node.cu

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index c1bfdaf6..977d88f7 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -8,19 +8,21 @@ cuda_add_library(marian
   3rd_party/cnpy/cnpy.cpp
   3rd_party/exception.cpp
   3rd_party/svd/svd.cpp
-  graph/expression_graph.cpp
-  graph/expression_operators.cu
-  graph/node.cu
-  graph/node_operators.cu
-  graph/node_initializers.cu
   tensors/tensor.cu
   tensors/device.cu
   tensors/device.cpp
+  tensors/backend.cpp
+  tensors/gpu/dropout.cu
+  tensors/cpu/dropout.cpp
   kernels/tensor_operators.cu
   kernels/cudnn_wrappers.cu
-  backend/gpu/dropout.cu
-  backend/cpu/dropout.cpp
+  graph/expression_graph.cpp
+  graph/expression_operators.cu
+  graph/node.cpp
+  graph/node_operators.cu
+  graph/node_initializers.cu
   layers/convolution.cu
+
   rnn/cells.cu
   optimizers/clippers.cu
   optimizers/optimizers.cu
@@ -91,7 +93,7 @@ endforeach(exec)
 #set_target_properties(align2steps PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}")
 
 if(PYTHONLIBS_FOUND)
-  add_subdirectory(python)
+#  add_subdirectory(python)
 endif(PYTHONLIBS_FOUND)
 
 if(COMPILE_TESTS)
diff --git a/src/backend/cpu/dropout.cpp b/src/backend/cpu/dropout.cpp
deleted file mode 100644
index 247dd460..00000000
--- a/src/backend/cpu/dropout.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-#include <algorithm>
-
-#include "backend/dispatch.h"
-
-namespace marian {
-  namespace cpu {
-    
-    void Dropout(Ptr<Backend> backend, Tensor tensor, float p) {
-      ABORT("Not implemented");
-      std::fill(tensor->data(), tensor->data() + tensor->size(), p);
-    }
-    
-  }
-}
diff --git a/src/backend/dispatch.h b/src/backend/dispatch.h
deleted file mode 100644
index 4d9490e1..00000000
--- a/src/backend/dispatch.h
+++ /dev/null
@@ -1,39 +0,0 @@
-#pragma once
-
-#include "common/definitions.h"
-#include "graph/backend.h"
-#include "tensors/tensor.h"
-
-#define DISPATCH1(Function, Arg1) \
-  namespace gpu { \
-    void Function(Ptr<Backend>, Arg1); \
-  } \
-  namespace cpu { \
-    void Function(Ptr<Backend>, Arg1); \
-  } \
-  void Function(Ptr<Backend> backend, Arg1 arg1) { \
-    if(backend->getDevice().type == DeviceType::gpu) \
-      gpu::Function(backend, arg1); \
-    else \
-      cpu::Function(backend, arg1); \
-  }
-
-#define DISPATCH2(Function, Arg1, Arg2) \
-  namespace gpu { \
-    void Function(Ptr<Backend>, Arg1, Arg2); \
-  } \
-  namespace cpu { \
-    void Function(Ptr<Backend>, Arg1, Arg2); \
-  } \
-  static inline void Function(Ptr<Backend> backend, Arg1 arg1, Arg2 arg2) { \
-    if(backend->getDevice().type == DeviceType::gpu) \
-      gpu::Function(backend, arg1, arg2); \
-    else \
-      cpu::Function(backend, arg1, arg2); \
-  }
-  
-namespace marian {
-
-  DISPATCH2(Dropout, Tensor, float)
-    
-}
\ No newline at end of file
diff --git a/src/backend/gpu/dropout.cu b/src/backend/gpu/dropout.cu
deleted file mode 100644
index f2c29810..00000000
--- a/src/backend/gpu/dropout.cu
+++ /dev/null
@@ -1,50 +0,0 @@
-#include <cuda.h>
-#include <curand.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-#include "backend/dispatch.h"
-#include "graph/backend_gpu.h"
-
-#define CUDA_CALL(x)                                  \
-  do {                                                \
-    if((x) != cudaSuccess) {                          \
-      printf("Error at %s:%d\n", __FILE__, __LINE__); \
-      exit(1);                                        \
-    }                                                 \
-  } while(0)
-
-#define CURAND_CALL(x)                                \
-  do {                                                \
-    if((x) != CURAND_STATUS_SUCCESS) {                \
-      printf("Error at %s:%d\n", __FILE__, __LINE__); \
-      exit(1);                                        \
-    }                                                 \
-  } while(0)
-
-
-namespace marian {
-  namespace gpu {
-    
-    __global__ void gScale(float* data, int n, float p) {
-      int index = threadIdx.x + blockIdx.x * blockDim.x;
-    
-      while(index < n) {
-        data[index] = (data[index] < p) / p;
-        index += gridDim.x * blockDim.x;
-      }
-    }
-    
-    void Dropout(Ptr<Backend> backend, Tensor tensor, float p) {
-      curandGenerator_t gen = std::static_pointer_cast<BackendGPU>(backend)->getCurandGenerator();
-      int n = tensor->size();
-      CURAND_CALL(curandGenerateUniform(gen, tensor->data(), n));
-    
-      int numThreads = std::min(n, 512);
-      int numBlocks = n / numThreads + (n % numThreads != 0);
-    
-      gScale<<<numBlocks, numThreads>>>(tensor->data(), n, 1.f - p);
-    }
-    
-  }
-}
diff --git a/src/graph/backend.h b/src/graph/backend.h
deleted file mode 100644
index 323f3f95..00000000
--- a/src/graph/backend.h
+++ /dev/null
@@ -1,20 +0,0 @@
-#pragma once
-
-#include "common/definitions.h"
-
-namespace marian {
-
-class Backend {
-protected:
-  DeviceId deviceId_;
-  size_t seed_;
-  
-public:
-  Backend(DeviceId deviceId, size_t seed)
-    : deviceId_(deviceId), seed_(seed) {}
-  
-  virtual DeviceId getDevice() { return deviceId_; };
-  virtual void setDevice() = 0;
-};
-
-}
diff --git a/src/graph/backend_gpu.h b/src/graph/backend_gpu.h
deleted file mode 100644
index faa5cc77..00000000
--- a/src/graph/backend_gpu.h
+++ /dev/null
@@ -1,65 +0,0 @@
-#pragma once
-
-#include <cublas_v2.h>
-#include <cuda.h>
-#include <curand.h>
-
-#include "common/config.h"
-#include "graph/backend.h"
-
-#define CURAND_CALL(x)                                \
-  do {                                                \
-    if((x) != CURAND_STATUS_SUCCESS) {                \
-      printf("Error at %s:%d\n", __FILE__, __LINE__); \
-      exit(1);                                        \
-    }                                                 \
-  } while(0)
-
-namespace marian {
-
-class BackendGPU : public Backend {  
-public:
-  BackendGPU(DeviceId deviceId, size_t seed) : Backend(deviceId, seed) {
-    setDevice();
-    setHandles();
-  }
-
-  void setDevice() {
-    cudaSetDevice(deviceId_.no);
-  }
-
-  cublasHandle_t getCublasHandle() { return cublasHandle_; }
-
-  curandGenerator_t getCurandGenerator() { return curandGenerator_; }
-
-private:
-  cublasHandle_t cublasHandle_;
-  curandGenerator_t curandGenerator_;
-
-  
-  void setHandles() {
-    cublasHandle_ = create_handle();
-    curandGenerator_ = createCurandGenerator();
-  }
-
-  
-  curandGenerator_t createCurandGenerator() {
-    cudaSetDevice(deviceId_.no);
-    curandGenerator_t generator;
-    CURAND_CALL(curandCreateGenerator(&generator, CURAND_RNG_PSEUDO_DEFAULT));
-    CURAND_CALL(curandSetPseudoRandomGeneratorSeed(generator, seed_));
-
-    // cudaStream_t stream = 0;
-    // CURAND_CALL(curandSetStream(generator, stream));
-    // CURAND_CALL(curandDestroyGenerator(generator));
-    return generator;
-  }
-
-  cublasHandle_t create_handle() {
-    cudaSetDevice(deviceId_.no);
-    cublasHandle_t cublasHandle;
-    cublasCreate(&cublasHandle);
-    return cublasHandle;
-  }
-};
-}
diff --git a/src/graph/expression_graph.cpp b/src/graph/expression_graph.cpp
index 9014abb5..183b5787 100644
--- a/src/graph/expression_graph.cpp
+++ b/src/graph/expression_graph.cpp
@@ -1,8 +1,7 @@
 #include <sstream>
-#include "graph/backend_gpu.h"
 #include "graph/expression_graph.h"
 
-#include "backend/dispatch.h"
+#include "tensors/dispatch.h"
 
 namespace marian {
 
@@ -11,11 +10,9 @@ ExpressionGraph::ExpressionGraph(bool inference)
 
 void ExpressionGraph::setDevice(DeviceId deviceId) {
   if(!backend_) {
-    backend_ = New<BackendGPU>(deviceId, Config::seed);
-  
+    backend_ = BackendByDevice(deviceId, Config::seed);
     params_ = New<Parameters>();
     params_->init(backend_->getDevice());
-  
     tensors_ = New<TensorAllocator>(backend_->getDevice());
   }
 }
diff --git a/src/graph/expression_graph.h b/src/graph/expression_graph.h
index 7d50e9ba..defdb475 100644
--- a/src/graph/expression_graph.h
+++ b/src/graph/expression_graph.h
@@ -8,8 +8,8 @@
 #include "common/definitions.h"
 
 #include "tensors/tensor_allocator.h"
+#include "tensors/backend.h"
 
-#include "graph/backend.h"
 #include "graph/parameters.h"
 #include "graph/chainable.h"
 #include "graph/node_operators.h"
diff --git a/src/graph/node.cu b/src/graph/node.cu
deleted file mode 100644
index a289f60c..00000000
--- a/src/graph/node.cu
+++ /dev/null
@@ -1,51 +0,0 @@
-#include "graph/backend_gpu.h"
-#include "graph/expression_graph.h"
-#include "graph/node.h"
-
-namespace marian {
-
-size_t Node::allocate() {
-  size_t elements = 0;
-  if(!val_) {
-    graph()->tensor(val_, shape_);
-    elements = val_->shape().elements();
-  }
-  return elements;
-}
-
-void Node::free() {
-  if(graph()) {
-    if(val_)
-      graph()->free(val_);
-    if(adj_)
-      graph()->free(adj_);
-  }
-}
-
-void Node::init_dependent() {
-  if(!adj_) {
-    graph()->tensor(adj_, shape_);
-    adj_->set(1);
-  }
-}
-
-void Node::set_zero_adjoint() {
-  if(!adj_) {
-    graph()->tensor(adj_, shape_);
-    adj_->set(0);
-  }
-}
-
-float Node::scalar() {
-  return val_->scalar();
-}
-
-Ptr<Backend> Node::getBackend() {
-  return graph()->getBackend();
-}
-
-void NaryNodeOp::remove_children_from_top_nodes() {
-  for(auto child : children_)
-    graph()->remove_top_node(child);
-}
-}
diff --git a/src/graph/node.h b/src/graph/node.h
index 126d7d49..aa450000 100644
--- a/src/graph/node.h
+++ b/src/graph/node.h
@@ -5,10 +5,11 @@
 #include <thread>
 
 #include "common/keywords.h"
-#include "graph/backend.h"
-#include "graph/chainable.h"
+#include "tensors/backend.h"
 #include "tensors/tensor.h"
 
+#include "graph/chainable.h"
+
 namespace marian {
 
 class Node : public Chainable<Tensor>,
diff --git a/src/graph/node_operators_binary.h b/src/graph/node_operators_binary.h
index 50ead8e6..7d6836a3 100644
--- a/src/graph/node_operators_binary.h
+++ b/src/graph/node_operators_binary.h
@@ -2,10 +2,10 @@
 
 #include <thread>
 
-#include "graph/backend_gpu.h"
+#include "tensors/gpu/backend.h"
 #include "graph/node.h"
-#include "kernels/tensor_operators.h"
 #include "functional/functional.h"
+#include "kernels/tensor_operators.h"
 #include "kernels/cudnn_wrappers.h"
 
 namespace marian {
@@ -54,7 +54,7 @@ public:
   NodeOps forwardOps() {
     // C = alpha * dot(op(A), op(B))
     return {NodeOp(Prod(
-        std::static_pointer_cast<BackendGPU>(getBackend())->getCublasHandle(),
+        std::static_pointer_cast<gpu::Backend>(getBackend())->getCublasHandle(),
         val_,
         child(0)->val(),
         child(1)->val(),
@@ -72,7 +72,7 @@ public:
     // to sum gradients from different graph parts
 
     if(!transA_ && transB_)
-      return {NodeOp(Prod(std::static_pointer_cast<BackendGPU>(getBackend())
+      return {NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend())
                               ->getCublasHandle(),
                           child(0)->grad(),
                           adj_,
@@ -81,7 +81,7 @@ public:
                           false,
                           1.0,
                           scalar_)),
-              NodeOp(Prod(std::static_pointer_cast<BackendGPU>(getBackend())
+              NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend())
                               ->getCublasHandle(),
                           child(1)->grad(),
                           adj_,
@@ -92,7 +92,7 @@ public:
                           scalar_))};
 
     if(transA_ && !transB_)
-      return {NodeOp(Prod(std::static_pointer_cast<BackendGPU>(getBackend())
+      return {NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend())
                               ->getCublasHandle(),
                           child(0)->grad(),
                           child(1)->val(),
@@ -101,7 +101,7 @@ public:
                           true,
                           1.0,
                           scalar_)),
-              NodeOp(Prod(std::static_pointer_cast<BackendGPU>(getBackend())
+              NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend())
                               ->getCublasHandle(),
                           child(1)->grad(),
                           child(0)->val(),
@@ -112,7 +112,7 @@ public:
                           scalar_))};
 
     if(transA_ && transB_)
-      return {NodeOp(Prod(std::static_pointer_cast<BackendGPU>(getBackend())
+      return {NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend())
                               ->getCublasHandle(),
                           child(0)->grad(),
                           child(1)->val(),
@@ -121,7 +121,7 @@ public:
                           true,
                           1.0,
                           scalar_)),
-              NodeOp(Prod(std::static_pointer_cast<BackendGPU>(getBackend())
+              NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend())
                               ->getCublasHandle(),
                           child(1)->grad(),
                           adj_,
@@ -131,7 +131,7 @@ public:
                           1.0,
                           scalar_))};
 
-    return {NodeOp(Prod(std::static_pointer_cast<BackendGPU>(getBackend())
+    return {NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend())
                             ->getCublasHandle(),
                         child(0)->grad(),
                         adj_,
@@ -140,7 +140,7 @@ public:
                         true,
                         1.0,
                         scalar_)),
-            NodeOp(Prod(std::static_pointer_cast<BackendGPU>(getBackend())
+            NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend())
                             ->getCublasHandle(),
                         child(1)->grad(),
                         child(0)->val(),
@@ -198,7 +198,7 @@ public:
     using namespace functional;
     return {
       NodeOp(Prod(
-        std::static_pointer_cast<BackendGPU>(getBackend())->getCublasHandle(),
+        std::static_pointer_cast<gpu::Backend>(getBackend())->getCublasHandle(),
         val_,
         child(0)->val(),
         child(1)->val(),
@@ -219,7 +219,7 @@ public:
     using namespace functional;
 
     if(!transA_ && transB_)
-      return {NodeOp(Prod(std::static_pointer_cast<BackendGPU>(getBackend())
+      return {NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend())
                               ->getCublasHandle(),
                           child(0)->grad(),
                           adj_,
@@ -228,7 +228,7 @@ public:
                           false,
                           1.0,
                           scalar_)),
-              NodeOp(Prod(std::static_pointer_cast<BackendGPU>(getBackend())
+              NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend())
                               ->getCublasHandle(),
                           child(1)->grad(),
                           adj_,
@@ -240,7 +240,7 @@ public:
               NodeOp(Add(_1, child(2)->grad(), adj_))};
 
     if(transA_ && !transB_)
-      return {NodeOp(Prod(std::static_pointer_cast<BackendGPU>(getBackend())
+      return {NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend())
                               ->getCublasHandle(),
                           child(0)->grad(),
                           child(1)->val(),
@@ -249,7 +249,7 @@ public:
                           true,
                           1.0,
                           scalar_)),
-              NodeOp(Prod(std::static_pointer_cast<BackendGPU>(getBackend())
+              NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend())
                               ->getCublasHandle(),
                           child(1)->grad(),
                           child(0)->val(),
@@ -261,7 +261,7 @@ public:
               NodeOp(Add(_1, child(2)->grad(), adj_))};
 
     if(transA_ && transB_)
-      return {NodeOp(Prod(std::static_pointer_cast<BackendGPU>(getBackend())
+      return {NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend())
                               ->getCublasHandle(),
                           child(0)->grad(),
                           child(1)->val(),
@@ -270,7 +270,7 @@ public:
                           true,
                           1.0,
                           scalar_)),
-              NodeOp(Prod(std::static_pointer_cast<BackendGPU>(getBackend())
+              NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend())
                               ->getCublasHandle(),
                           child(1)->grad(),
                           adj_,
@@ -281,7 +281,7 @@ public:
                           scalar_)),
               NodeOp(Add(_1, child(2)->grad(), adj_))};
 
-    return {NodeOp(Prod(std::static_pointer_cast<BackendGPU>(getBackend())
+    return {NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend())
                             ->getCublasHandle(),
                         child(0)->grad(),
                         adj_,
@@ -290,7 +290,7 @@ public:
                         true,
                         1.0,
                         scalar_)),
-            NodeOp(Prod(std::static_pointer_cast<BackendGPU>(getBackend())
+            NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend())
                             ->getCublasHandle(),
                         child(1)->grad(),
                         child(0)->val(),
@@ -350,7 +350,7 @@ public:
   NodeOps forwardOps() {
     // C = alpha * dot(op(A), op(B))
     return {NodeOp(ProdBatched(
-        std::static_pointer_cast<BackendGPU>(getBackend())->getCublasHandle(),
+        std::static_pointer_cast<gpu::Backend>(getBackend())->getCublasHandle(),
         val_,
         child(0)->val(),
         child(1)->val(),
@@ -369,7 +369,7 @@ public:
 
     if(!transA_ && transB_)
       return {
-          NodeOp(ProdBatched(std::static_pointer_cast<BackendGPU>(getBackend())
+          NodeOp(ProdBatched(std::static_pointer_cast<gpu::Backend>(getBackend())
                                  ->getCublasHandle(),
                              child(0)->grad(),
                              adj_,
@@ -378,7 +378,7 @@ public:
                              false,
                              1.0,
                              scalar_)),
-          NodeOp(ProdBatched(std::static_pointer_cast<BackendGPU>(getBackend())
+          NodeOp(ProdBatched(std::static_pointer_cast<gpu::Backend>(getBackend())
                                  ->getCublasHandle(),
                              child(1)->grad(),
                              adj_,
@@ -390,7 +390,7 @@ public:
 
     if(transA_ && !transB_)
       return {
-          NodeOp(ProdBatched(std::static_pointer_cast<BackendGPU>(getBackend())
+          NodeOp(ProdBatched(std::static_pointer_cast<gpu::Backend>(getBackend())
                                  ->getCublasHandle(),
                              child(0)->grad(),
                              child(1)->val(),
@@ -399,7 +399,7 @@ public:
                              true,
                              1.0,
                              scalar_)),
-          NodeOp(ProdBatched(std::static_pointer_cast<BackendGPU>(getBackend())
+          NodeOp(ProdBatched(std::static_pointer_cast<gpu::Backend>(getBackend())
                                  ->getCublasHandle(),
                              child(1)->grad(),
                              child(0)->val(),
@@ -411,7 +411,7 @@ public:
 
     if(transA_ && transB_)
       return {
-          NodeOp(ProdBatched(std::static_pointer_cast<BackendGPU>(getBackend())
+          NodeOp(ProdBatched(std::static_pointer_cast<gpu::Backend>(getBackend())
                                  ->getCublasHandle(),
                              child(0)->grad(),
                              child(1)->val(),
@@ -420,7 +420,7 @@ public:
                              true,
                              1.0,
                              scalar_)),
-          NodeOp(ProdBatched(std::static_pointer_cast<BackendGPU>(getBackend())
+          NodeOp(ProdBatched(std::static_pointer_cast<gpu::Backend>(getBackend())
                                  ->getCublasHandle(),
                              child(1)->grad(),
                              adj_,
@@ -431,7 +431,7 @@ public:
                              scalar_))};
 
     return {
-        NodeOp(ProdBatched(std::static_pointer_cast<BackendGPU>(getBackend())
+        NodeOp(ProdBatched(std::static_pointer_cast<gpu::Backend>(getBackend())
                                ->getCublasHandle(),
                            child(0)->grad(),
                            adj_,
@@ -440,7 +440,7 @@ public:
                            true,
                            1.0,
                            scalar_)),
-        NodeOp(ProdBatched(std::static_pointer_cast<BackendGPU>(getBackend())
+        NodeOp(ProdBatched(std::static_pointer_cast<gpu::Backend>(getBackend())
                                ->getCublasHandle(),
                            child(1)->grad(),
                            child(0)->val(),
diff --git a/src/graph/node_operators_unary.h b/src/graph/node_operators_unary.h
index 8390a0c2..a3f27fd2 100644
--- a/src/graph/node_operators_unary.h
+++ b/src/graph/node_operators_unary.h
@@ -1,10 +1,11 @@
 #pragma once
 
-#include "graph/backend_gpu.h"
+#include "tensors/tensor.h"
+#include "tensors/gpu/backend.h"
+
 #include "graph/node.h"
 #include "kernels/sparse.h"
 #include "kernels/tensor_operators.h"
-#include "tensors/tensor.h"
 #include "functional/functional.h"
 #include "kernels/cudnn_wrappers.h"
 
diff --git a/src/python/CMakeLists.txt b/src/python/CMakeLists.txt
index d547660a..9d54c01a 100644
--- a/src/python/CMakeLists.txt
+++ b/src/python/CMakeLists.txt
@@ -10,8 +10,8 @@ cuda_add_library(pymarian SHARED
   ../tensors/tensor.cu
   ../tensors/device.cpp
   ../kernels/tensor_operators.cu
-  ../backend/gpu/dropout.cu
-  ../backend/cpu/dropout.cpp
+  ../tensors/gpu/dropout.cu
+  ../tensors/cpu/dropout.cpp
   ../kernels/sparse.cu
   #../layers/param_initializers.cu
   ../rnn/attention.cu
diff --git a/src/tests/dropout_test.cu b/src/tests/dropout_test.cu
index 37738deb..f4a42e92 100644
--- a/src/tests/dropout_test.cu
+++ b/src/tests/dropout_test.cu
@@ -13,7 +13,7 @@ int main(int argc, char** argv) {
   auto c = New<Config>(argc, argv);
 
   auto g = New<ExpressionGraph>();
-  g->setDevice({0, DeviceType::cpu});
+  g->setDevice({0, DeviceType::gpu});
   g->reserveWorkspaceMB(512);
 
   for(int i = 0; i < 10; ++i) {

From dd296e77f76143033fc1589c7dce6d12196bbfdd Mon Sep 17 00:00:00 2001
From: Marcin Junczys-Dowmunt <junczys@amu.edu.pl>
Date: Thu, 15 Feb 2018 17:57:33 -0800
Subject: [PATCH 04/11] whitespace

---
 src/graph/node.cpp          | 51 ++++++++++++++++++++++++++++
 src/tensors/backend.cpp     | 15 ++++++++
 src/tensors/backend.h       | 22 ++++++++++++
 src/tensors/cpu/backend.h   | 24 +++++++++++++
 src/tensors/cpu/dropout.cpp | 14 ++++++++
 src/tensors/device.cu       | 12 +++----
 src/tensors/dispatch.h      | 39 +++++++++++++++++++++
 src/tensors/gpu/backend.h   | 68 +++++++++++++++++++++++++++++++++++++
 src/tensors/gpu/dropout.cu  | 51 ++++++++++++++++++++++++++++
 9 files changed, 290 insertions(+), 6 deletions(-)
 create mode 100644 src/graph/node.cpp
 create mode 100644 src/tensors/backend.cpp
 create mode 100644 src/tensors/backend.h
 create mode 100644 src/tensors/cpu/backend.h
 create mode 100644 src/tensors/cpu/dropout.cpp
 create mode 100644 src/tensors/dispatch.h
 create mode 100644 src/tensors/gpu/backend.h
 create mode 100644 src/tensors/gpu/dropout.cu

diff --git a/src/graph/node.cpp b/src/graph/node.cpp
new file mode 100644
index 00000000..1c93683c
--- /dev/null
+++ b/src/graph/node.cpp
@@ -0,0 +1,51 @@
+#include "tensors/backend.h"
+#include "graph/expression_graph.h"
+#include "graph/node.h"
+
+namespace marian {
+
+size_t Node::allocate() {
+  size_t elements = 0;
+  if(!val_) {
+    graph()->tensor(val_, shape_);
+    elements = val_->shape().elements();
+  }
+  return elements;
+}
+
+void Node::free() {
+  if(graph()) {
+    if(val_)
+      graph()->free(val_);
+    if(adj_)
+      graph()->free(adj_);
+  }
+}
+
+void Node::init_dependent() {
+  if(!adj_) {
+    graph()->tensor(adj_, shape_);
+    adj_->set(1);
+  }
+}
+
+void Node::set_zero_adjoint() {
+  if(!adj_) {
+    graph()->tensor(adj_, shape_);
+    adj_->set(0);
+  }
+}
+
+float Node::scalar() {
+  return val_->scalar();
+}
+
+Ptr<Backend> Node::getBackend() {
+  return graph()->getBackend();
+}
+
+void NaryNodeOp::remove_children_from_top_nodes() {
+  for(auto child : children_)
+    graph()->remove_top_node(child);
+}
+}
diff --git a/src/tensors/backend.cpp b/src/tensors/backend.cpp
new file mode 100644
index 00000000..a1d66e9a
--- /dev/null
+++ b/src/tensors/backend.cpp
@@ -0,0 +1,15 @@
+#include "tensors/backend.h"
+
+#include "tensors/gpu/backend.h"
+#include "tensors/cpu/backend.h"
+
+namespace marian {
+
+Ptr<Backend> BackendByDevice(DeviceId deviceId, size_t seed) {
+  if(deviceId.type == DeviceType::gpu)
+    return New<gpu::Backend>(deviceId, seed);
+  else
+    return New<cpu::Backend>(deviceId, seed);
+}
+
+}
diff --git a/src/tensors/backend.h b/src/tensors/backend.h
new file mode 100644
index 00000000..11fc60d7
--- /dev/null
+++ b/src/tensors/backend.h
@@ -0,0 +1,22 @@
+#pragma once
+
+#include "common/definitions.h"
+
+namespace marian {
+
+class Backend {
+protected:
+  DeviceId deviceId_;
+  size_t seed_;
+  
+public:
+  Backend(DeviceId deviceId, size_t seed)
+    : deviceId_(deviceId), seed_(seed) {}
+  
+  virtual DeviceId getDevice() { return deviceId_; };
+  virtual void setDevice() = 0;
+};
+
+Ptr<Backend> BackendByDevice(DeviceId deviceId, size_t seed);
+
+}
diff --git a/src/tensors/cpu/backend.h b/src/tensors/cpu/backend.h
new file mode 100644
index 00000000..7bf5770e
--- /dev/null
+++ b/src/tensors/cpu/backend.h
@@ -0,0 +1,24 @@
+#pragma once
+
+#include "common/config.h"
+#include "tensors/backend.h"
+
+namespace marian {
+namespace cpu {
+  
+class Backend : public marian::Backend {  
+public:
+  Backend(DeviceId deviceId, size_t seed) : marian::Backend(deviceId, seed) {
+  }
+
+  void setDevice() {
+  }
+
+private:
+  void setHandles() {
+
+  }  
+};
+
+}
+}
diff --git a/src/tensors/cpu/dropout.cpp b/src/tensors/cpu/dropout.cpp
new file mode 100644
index 00000000..4286042b
--- /dev/null
+++ b/src/tensors/cpu/dropout.cpp
@@ -0,0 +1,14 @@
+#include <algorithm>
+
+#include "tensors/dispatch.h"
+
+namespace marian {
+  namespace cpu {
+
+    void Dropout(Ptr<marian::Backend> backend, Tensor tensor, float p) {
+      ABORT("Not implemented");
+      std::fill(tensor->data(), tensor->data() + tensor->size(), p);
+    }
+
+  }
+}
diff --git a/src/tensors/device.cu b/src/tensors/device.cu
index c773d566..2264fda4 100644
--- a/src/tensors/device.cu
+++ b/src/tensors/device.cu
@@ -6,7 +6,7 @@
 
 namespace marian {
 namespace gpu {
-  
+
   Device::~Device() {
     cudaSetDevice(deviceId_.no);
     if(data_) {
@@ -14,13 +14,13 @@ namespace gpu {
     }
     cudaDeviceSynchronize();
   }
-  
+
   void Device::reserve(size_t size) {
     size = align(size);
     cudaSetDevice(deviceId_.no);
-  
+
     ABORT_IF(size < size_, "New size must be larger than old size");
-  
+
     if(data_) {
       // Allocate memory by going through host memory
       uint8_t *temp = new uint8_t[size_];
@@ -32,9 +32,9 @@ namespace gpu {
     } else {
       CUDA_CHECK(cudaMalloc(&data_, size));
     }
-  
+
     size_ = size;
   }
-  
+
 }
 }
diff --git a/src/tensors/dispatch.h b/src/tensors/dispatch.h
new file mode 100644
index 00000000..14b74f5a
--- /dev/null
+++ b/src/tensors/dispatch.h
@@ -0,0 +1,39 @@
+#pragma once
+
+#include "common/definitions.h"
+#include "tensors/backend.h"
+#include "tensors/tensor.h"
+
+#define DISPATCH1(Function, Arg1) \
+  namespace gpu { \
+    void Function(Ptr<marian::Backend>, Arg1); \
+  } \
+  namespace cpu { \
+    void Function(Ptr<marian::Backend>, Arg1); \
+  } \
+  void Function(Ptr<marian::Backend> backend, Arg1 arg1) { \
+    if(backend->getDevice().type == DeviceType::gpu) \
+      gpu::Function(backend, arg1); \
+    else \
+      cpu::Function(backend, arg1); \
+  }
+
+#define DISPATCH2(Function, Arg1, Arg2) \
+  namespace gpu { \
+    void Function(Ptr<marian::Backend>, Arg1, Arg2); \
+  } \
+  namespace cpu { \
+    void Function(Ptr<marian::Backend>, Arg1, Arg2); \
+  } \
+  static inline void Function(Ptr<marian::Backend> backend, Arg1 arg1, Arg2 arg2) { \
+    if(backend->getDevice().type == DeviceType::gpu) \
+      gpu::Function(backend, arg1, arg2); \
+    else \
+      cpu::Function(backend, arg1, arg2); \
+  }
+
+namespace marian {
+
+  DISPATCH2(Dropout, Tensor, float)
+
+}
diff --git a/src/tensors/gpu/backend.h b/src/tensors/gpu/backend.h
new file mode 100644
index 00000000..cafdfb5b
--- /dev/null
+++ b/src/tensors/gpu/backend.h
@@ -0,0 +1,68 @@
+#pragma once
+
+#include <cublas_v2.h>
+#include <cuda.h>
+#include <curand.h>
+
+#include "common/config.h"
+#include "tensors/backend.h"
+
+#define CURAND_CALL(x)                                \
+  do {                                                \
+    if((x) != CURAND_STATUS_SUCCESS) {                \
+      printf("Error at %s:%d\n", __FILE__, __LINE__); \
+      exit(1);                                        \
+    }                                                 \
+  } while(0)
+
+namespace marian {
+namespace gpu {
+  
+class Backend : public marian::Backend {  
+public:
+  Backend(DeviceId deviceId, size_t seed) : marian::Backend(deviceId, seed) {
+    setDevice();
+    setHandles();
+  }
+
+  void setDevice() {
+    cudaSetDevice(deviceId_.no);
+  }
+
+  cublasHandle_t getCublasHandle() { return cublasHandle_; }
+
+  curandGenerator_t getCurandGenerator() { return curandGenerator_; }
+
+private:
+  cublasHandle_t cublasHandle_;
+  curandGenerator_t curandGenerator_;
+
+  
+  void setHandles() {
+    cublasHandle_ = create_handle();
+    curandGenerator_ = createCurandGenerator();
+  }
+
+  
+  curandGenerator_t createCurandGenerator() {
+    cudaSetDevice(deviceId_.no);
+    curandGenerator_t generator;
+    CURAND_CALL(curandCreateGenerator(&generator, CURAND_RNG_PSEUDO_DEFAULT));
+    CURAND_CALL(curandSetPseudoRandomGeneratorSeed(generator, seed_));
+
+    // cudaStream_t stream = 0;
+    // CURAND_CALL(curandSetStream(generator, stream));
+    // CURAND_CALL(curandDestroyGenerator(generator));
+    return generator;
+  }
+
+  cublasHandle_t create_handle() {
+    cudaSetDevice(deviceId_.no);
+    cublasHandle_t cublasHandle;
+    cublasCreate(&cublasHandle);
+    return cublasHandle;
+  }
+};
+
+}
+}
diff --git a/src/tensors/gpu/dropout.cu b/src/tensors/gpu/dropout.cu
new file mode 100644
index 00000000..6dc49c51
--- /dev/null
+++ b/src/tensors/gpu/dropout.cu
@@ -0,0 +1,51 @@
+#include <cuda.h>
+#include <curand.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "tensors/dispatch.h"
+#include "tensors/gpu/backend.h"
+
+#define CUDA_CALL(x)                                  \
+  do {                                                \
+    if((x) != cudaSuccess) {                          \
+      printf("Error at %s:%d\n", __FILE__, __LINE__); \
+      exit(1);                                        \
+    }                                                 \
+  } while(0)
+
+#define CURAND_CALL(x)                                \
+  do {                                                \
+    if((x) != CURAND_STATUS_SUCCESS) {                \
+      printf("Error at %s:%d\n", __FILE__, __LINE__); \
+      exit(1);                                        \
+    }                                                 \
+  } while(0)
+
+
+namespace marian {
+  namespace gpu {
+
+    __global__ void gScale(float* data, int n, float p) {
+      int index = threadIdx.x + blockIdx.x * blockDim.x;
+
+      while(index < n) {
+        data[index] = (data[index] < p) / p;
+        index += gridDim.x * blockDim.x;
+      }
+    }
+
+    void Dropout(Ptr<marian::Backend> backend, Tensor tensor, float p) {
+      curandGenerator_t gen = std::static_pointer_cast<gpu::Backend>(backend)->getCurandGenerator();
+      int n = tensor->size();
+      CURAND_CALL(curandGenerateUniform(gen, tensor->data(), n));
+
+      int numThreads = std::min(n, 512);
+      int numBlocks = n / numThreads + (n % numThreads != 0);
+
+      gScale<<<numBlocks, numThreads>>>(tensor->data(), n, 1.f - p);
+    }
+
+
+  }
+}

From a0e74ad35ebbbecaa0d30007edd0e8d84a4681dd Mon Sep 17 00:00:00 2001
From: Alham Fikri Aji <afaji321@gmail.com>
Date: Fri, 16 Feb 2018 14:15:30 +0000
Subject: [PATCH 05/11] remove debug

---
 src/training/graph_group_async_drop.cu | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/training/graph_group_async_drop.cu b/src/training/graph_group_async_drop.cu
index 84985009..42dfa725 100644
--- a/src/training/graph_group_async_drop.cu
+++ b/src/training/graph_group_async_drop.cu
@@ -79,7 +79,6 @@ void AsyncGraphGroupDrop::pushGradients(Tensor newGrads,
                                         size_t batch_words,
                                         int device_id) {
   if(pushStep_[device_id]++ <= dropping_warmup) {
-    std::cout<<"WARMUP"<<std::endl;
     AsyncGraphGroup::pushGradients(newGrads, batch_words, device_id);
     return;
   }

From 327cfc1cc3fbe3ab92927c800aa30aef1c41d517 Mon Sep 17 00:00:00 2001
From: Marcin Junczys-Dowmunt <junczys@amu.edu.pl>
Date: Fri, 16 Feb 2018 11:59:12 -0800
Subject: [PATCH 06/11] pass through backend

---
 src/graph/expression_graph.cpp         |  6 ++---
 src/graph/node_operators_unary.h       | 24 ++++---------------
 src/graph/parameters.h                 |  6 ++---
 src/kernels/sparse.cu                  |  6 ++---
 src/kernels/sparse.h                   | 33 +++++++++++++-------------
 src/kernels/tensor_operators.cu        |  6 ++---
 src/optimizers/optimizers.cu           |  4 ++--
 src/tensors/cpu/dropout.cpp            |  2 +-
 src/tensors/dispatch.h                 | 25 ++++++++++---------
 src/tensors/gpu/dropout.cu             |  5 ++--
 src/tensors/tensor.cu                  | 16 ++++++-------
 src/tensors/tensor.h                   | 12 ++++++----
 src/tensors/tensor_allocator.h         | 10 ++++----
 src/training/graph_group_async.cu      | 14 +++++------
 src/training/graph_group_async_drop.cu | 21 ++++++++--------
 src/training/graph_group_async_drop.h  |  2 +-
 src/training/graph_group_multinode.cu  | 15 +++++-------
 src/training/graph_group_multinode.h   |  2 +-
 src/training/graph_group_sync.cu       |  4 ++--
 src/training/sparse_tensor.cu          | 24 +++++++++----------
 src/training/sparse_tensor.h           |  8 +++----
 21 files changed, 116 insertions(+), 129 deletions(-)

diff --git a/src/graph/expression_graph.cpp b/src/graph/expression_graph.cpp
index 183b5787..934e2b73 100644
--- a/src/graph/expression_graph.cpp
+++ b/src/graph/expression_graph.cpp
@@ -12,15 +12,15 @@ void ExpressionGraph::setDevice(DeviceId deviceId) {
   if(!backend_) {
     backend_ = BackendByDevice(deviceId, Config::seed);
     params_ = New<Parameters>();
-    params_->init(backend_->getDevice());
-    tensors_ = New<TensorAllocator>(backend_->getDevice());
+    params_->init(backend_);
+    tensors_ = New<TensorAllocator>(backend_);
   }
 }
 
 Expr ExpressionGraph::dropout(float prob, Shape shape) {
   return Expression<ConstantNode>(shared_from_this(),
                                   keywords::init = [prob, this](Tensor t) {
-                                    Dropout(backend_, t, prob);
+                                    Dropout(t, prob);
                                   },
                                   keywords::shape = shape);
 }
diff --git a/src/graph/node_operators_unary.h b/src/graph/node_operators_unary.h
index a3f27fd2..0170fc73 100644
--- a/src/graph/node_operators_unary.h
+++ b/src/graph/node_operators_unary.h
@@ -211,21 +211,7 @@ struct TanhNodeOp : public NaryNodeOp {
   const std::string type() { return "tanh"; }
 };
 
-/**
- * Represents a <a
- * href="https://en.wikipedia.org/wiki/Rectifier_(neural_networks)">rectified
- * linear</a> node in an expression graph.
- *
- * This node implements the activation function \f$ f(x) = \max(0, x) \f$ and
- * its derivative:
- * \f[
- *   f^\prime(x) =
- *   \begin{cases}
- *     0 & \text{if } x \leq 0 \\
- *     1 & \text{if } x > 0
- *   \end{cases}
- * \f]
- */
+
 struct ReLUNodeOp : public UnaryNodeOp {
   template <typename... Args>
   ReLUNodeOp(Args... args) : UnaryNodeOp(args...) {}
@@ -877,14 +863,14 @@ public:
   Tensor& val() {
     auto childVal = reshapee_->val();
     val_.reset(
-        new TensorBase(childVal->memory(), shape(), childVal->getDevice()));
+        new TensorBase(childVal->memory(), shape(), childVal->getBackend()));
     return val_;
   };
 
   Tensor& grad() {
     auto childGrad = reshapee_->grad();
     adj_.reset(
-        new TensorBase(childGrad->memory(), shape(), childGrad->getDevice()));
+        new TensorBase(childGrad->memory(), shape(), childGrad->getBackend()));
     return adj_;
   };
 
@@ -953,7 +939,7 @@ public:
     size_t offset = step_ * shape().elements() * sizeof(float);
     auto mem = New<MemoryPiece>(childVal->memory()->data() + offset,
                                 childVal->memory()->size());
-    val_.reset(new TensorBase(mem, shape(), childVal->getDevice()));
+    val_.reset(new TensorBase(mem, shape(), childVal->getBackend()));
     return val_;
   };
 
@@ -962,7 +948,7 @@ public:
     size_t offset = step_ * shape().elements() * sizeof(float);
     auto mem = New<MemoryPiece>(childGrad->memory()->data() + offset,
                                 childGrad->memory()->size());
-    adj_.reset(new TensorBase(mem, shape(), childGrad->getDevice()));
+    adj_.reset(new TensorBase(mem, shape(), childGrad->getBackend()));
     return adj_;
   };
 
diff --git a/src/graph/parameters.h b/src/graph/parameters.h
index ed8b7690..3f282e4a 100644
--- a/src/graph/parameters.h
+++ b/src/graph/parameters.h
@@ -20,9 +20,9 @@ private:
   Ptr<TensorAllocator> grads_;
 
 public:
-  void init(DeviceId deviceId) {
-    vals_ = New<TensorAllocator>(deviceId);
-    grads_ = New<TensorAllocator>(deviceId);
+  void init(Ptr<Backend> backend) {
+    vals_ = New<TensorAllocator>(backend);
+    grads_ = New<TensorAllocator>(backend);
   }
 
   auto begin() -> decltype(params_.begin()) { return params_.begin(); }
diff --git a/src/kernels/sparse.cu b/src/kernels/sparse.cu
index 1d104474..b5080c0c 100644
--- a/src/kernels/sparse.cu
+++ b/src/kernels/sparse.cu
@@ -12,7 +12,7 @@ void multiply(Ptr<CSR> C,
               const Ptr<CSR> B,
               bool transA,
               bool transB) {
-  cudaSetDevice(C->getDevice());
+  cudaSetDevice(backend_->getDevice().no);
   int nnzTotal;
   C->allocRowIndices(A->rows());
   CUSPARSE_CHECK(cusparseXcsrgemmNnz(
@@ -91,7 +91,7 @@ void multiply(Ptr<CSR> C,
 //}
 
 void LfaForward(Tensor out, Tensor logits, Tensor att, Ptr<CSR> sparseLf) {
-  cudaSetDevice(out->getDevice());
+  cudaSetDevice(backend_->getDevice().no);
 
   int batch = att->shape()[0];
   int srcWords = att->shape()[2];
@@ -150,7 +150,7 @@ __global__ void gCollapseAtt(float* out,
 }
 
 void CollapseAtt(Tensor out, Tensor in) {
-  cudaSetDevice(out->getDevice());
+  cudaSetDevice(backend_->getDevice().no);
   int nonzeros = out->shape().elements();
   int batch = out->shape()[0];
   int srcWords = out->shape()[2];
diff --git a/src/kernels/sparse.h b/src/kernels/sparse.h
index d70555f1..cffb398e 100644
--- a/src/kernels/sparse.h
+++ b/src/kernels/sparse.h
@@ -14,7 +14,7 @@ private:
   int nnz_{0};
   int rows_{0};
   int cols_{0};
-  DeviceId deviceId_;
+  Ptr<Backend> backend_;
 
   cusparseHandle_t handle_{0};
   cusparseMatDescr_t descr_{0};
@@ -24,9 +24,9 @@ private:
   float* values_{0};
 
 public:
-  CSR(int rows, int cols, DeviceId deviceId)
-      : rows_(rows), cols_(cols), deviceId_(deviceId) {
-    cudaSetDevice(deviceId_.no);
+  CSR(int rows, int cols, Ptr<Backend> backend)
+      : rows_(rows), cols_(cols), backend_(backend) {
+    cudaSetDevice(backend_->getDevice().no);
     CUSPARSE_CHECK(cusparseCreate(&handle_));
     CUSPARSE_CHECK(cusparseCreateMatDescr(&descr_));
     CUSPARSE_CHECK(cusparseSetMatType(descr_, CUSPARSE_MATRIX_TYPE_GENERAL));
@@ -38,9 +38,9 @@ public:
       const std::vector<float>& values,
       const std::vector<int>& rowIndices,
       const std::vector<int>& colIndices,
-      DeviceId deviceId)
-      : nnz_(values.size()), rows_(rows), cols_(cols), deviceId_(deviceId) {
-    cudaSetDevice(deviceId_.no);
+      Ptr<Backend> backend)
+      : nnz_(values.size()), rows_(rows), cols_(cols), backend_(backend) {
+    cudaSetDevice(backend_->getDevice().no);
     CUSPARSE_CHECK(cusparseCreate(&handle_));
     CUSPARSE_CHECK(cusparseCreateMatDescr(&descr_));
     CUSPARSE_CHECK(cusparseSetMatType(descr_, CUSPARSE_MATRIX_TYPE_GENERAL));
@@ -73,8 +73,8 @@ public:
     CUDA_CHECK(cudaFree(cooRowIndices));
   }
 
-  CSR(Tensor dense) : deviceId_(dense->getDevice()) {
-    cudaSetDevice(deviceId_.no);
+  CSR(Tensor dense) : backend_(dense->getBackend()) {
+    cudaSetDevice(backend_->getDevice().no);
     rows_ = dense->shape()[0] * dense->shape()[2] * dense->shape()[3];
     cols_ = dense->shape()[1];
 
@@ -114,7 +114,7 @@ public:
   }
 
   ~CSR() {
-    cudaSetDevice(deviceId_.no);
+    cudaSetDevice(backend_->getDevice().no);
     if(values_)
       CUDA_CHECK(cudaFree(values_));
     if(rowIndices_)
@@ -129,7 +129,7 @@ public:
   }
 
   void toTensor(Tensor dense) {
-    cudaSetDevice(deviceId_.no);
+    cudaSetDevice(backend_->getDevice().no);
     ABORT_IF(dense->size() != rows_ * cols_, "Matrix sizes do not match");
 
     cusparseScsc2dense(handle_,
@@ -154,10 +154,10 @@ public:
   int* rowIndices() { return rowIndices_; }
   int* colIndices() { return colIndices_; }
 
-  DeviceId getDevice() { return deviceId_; }
+  DeviceId getDevice() { return backend_->getDevice(); }
 
   void allocValues(int nnz = 0) {
-    cudaSetDevice(deviceId_.no);
+    cudaSetDevice(backend_->getDevice().no);
     if(nnz > 0)
       nnz_ = nnz;
     if(values_)
@@ -166,7 +166,7 @@ public:
   }
 
   void allocRowIndices(int rows = 0) {
-    cudaSetDevice(deviceId_.no);
+    cudaSetDevice(backend_->getDevice().no);
     if(rows > 0)
       rows_ = rows;
     if(rowIndices_)
@@ -175,7 +175,7 @@ public:
   }
 
   void allocColIndices(int nnz = 0) {
-    cudaSetDevice(deviceId_.no);
+    cudaSetDevice(backend_->getDevice().no);
     if(nnz > 0)
       nnz_ = nnz;
     if(colIndices_)
@@ -184,11 +184,12 @@ public:
   }
 
   std::string debug() {
+    cudaSetDevice(backend_->getDevice().no);
     uint8_t* buffer;
     CUDA_CHECK(cudaMalloc(&buffer, sizeof(float) * rows() * cols()));
 
     auto mem = New<MemoryPiece>(buffer, sizeof(float) * rows() * cols());
-    Tensor tensor(new TensorBase(mem, {rows(), cols()}, deviceId_));
+    Tensor tensor(new TensorBase(mem, {rows(), cols()}, backend_));
     toTensor(tensor);
     std::string temp = tensor->debug();
 
diff --git a/src/kernels/tensor_operators.cu b/src/kernels/tensor_operators.cu
index 69b8afc4..87019d9e 100644
--- a/src/kernels/tensor_operators.cu
+++ b/src/kernels/tensor_operators.cu
@@ -893,7 +893,7 @@ void Select(Ptr<Allocator> allocator,
 
   auto mp_indices = allocator->alloc<size_t>(indices.size());
   CudaCopy(indices.data(), indices.data() + indices.size(), mp_indices->data());
-  
+
   int axisGPU = axis + gpu::Shape::size() - out->shape().size();
   gSelect<<<blocks, threads>>>(out->data(),
                                out->shape(),
@@ -919,7 +919,7 @@ void Insert(Ptr<Allocator> allocator,
 
   auto mp_indices = allocator->alloc<size_t>(indices.size());
   CudaCopy(indices.data(), indices.data() + indices.size(), mp_indices->data());
-  
+
   int axisGPU = axis + gpu::Shape::size() - out->shape().size();
   gInsert<<<blocks, threads>>>(out->data(),
                                out->shape(),
@@ -1295,7 +1295,7 @@ float L2Norm(Tensor in) {
   uint8_t* data;
   cudaMalloc(&data, blocks * sizeof(float));
   Tensor out(new TensorBase(
-      New<MemoryPiece>(data, blocks * sizeof(float)), {1, blocks}, in->getDevice()));
+      New<MemoryPiece>(data, blocks * sizeof(float)), {1, blocks}, in->getBackend()));
 
   ReduceAll(_1 * _1, out, in);
   float dataCpu = sqrtf(out->get(0));
diff --git a/src/optimizers/optimizers.cu b/src/optimizers/optimizers.cu
index 2874e1d5..e82800c9 100644
--- a/src/optimizers/optimizers.cu
+++ b/src/optimizers/optimizers.cu
@@ -13,7 +13,7 @@ void Sgd::updateImpl(Tensor params, Tensor grads) {
 
 void Adagrad::updateImpl(Tensor params, Tensor grads) {
   if(!alloc_)
-    alloc_ = New<TensorAllocator>(params->getDevice());
+    alloc_ = New<TensorAllocator>(params->getBackend());
 
   if(!gt_) {
     int elements = params->size();
@@ -42,7 +42,7 @@ void Adagrad::resetStats() {
 
 void Adam::updateImpl(Tensor params, Tensor grads) {
   if(!alloc_)
-    alloc_ = New<TensorAllocator>(params->getDevice());
+    alloc_ = New<TensorAllocator>(params->getBackend());
 
   if(!mt_) {
     int elements = params->size();
diff --git a/src/tensors/cpu/dropout.cpp b/src/tensors/cpu/dropout.cpp
index 4286042b..cc6cea41 100644
--- a/src/tensors/cpu/dropout.cpp
+++ b/src/tensors/cpu/dropout.cpp
@@ -5,7 +5,7 @@
 namespace marian {
   namespace cpu {
 
-    void Dropout(Ptr<marian::Backend> backend, Tensor tensor, float p) {
+    void Dropout(Tensor tensor, float p) {
       ABORT("Not implemented");
       std::fill(tensor->data(), tensor->data() + tensor->size(), p);
     }
diff --git a/src/tensors/dispatch.h b/src/tensors/dispatch.h
index 14b74f5a..e63a6af1 100644
--- a/src/tensors/dispatch.h
+++ b/src/tensors/dispatch.h
@@ -1,35 +1,34 @@
 #pragma once
 
 #include "common/definitions.h"
-#include "tensors/backend.h"
 #include "tensors/tensor.h"
 
 #define DISPATCH1(Function, Arg1) \
   namespace gpu { \
-    void Function(Ptr<marian::Backend>, Arg1); \
+    void Function(Arg1); \
   } \
   namespace cpu { \
-    void Function(Ptr<marian::Backend>, Arg1); \
+    void Function(Arg1); \
   } \
-  void Function(Ptr<marian::Backend> backend, Arg1 arg1) { \
-    if(backend->getDevice().type == DeviceType::gpu) \
-      gpu::Function(backend, arg1); \
+  void Function(Arg1 arg1) { \
+    if(arg1->getBackend()->getDevice().type == DeviceType::gpu) \
+      gpu::Function(arg1); \
     else \
-      cpu::Function(backend, arg1); \
+      cpu::Function(arg1); \
   }
 
 #define DISPATCH2(Function, Arg1, Arg2) \
   namespace gpu { \
-    void Function(Ptr<marian::Backend>, Arg1, Arg2); \
+    void Function(Arg1, Arg2); \
   } \
   namespace cpu { \
-    void Function(Ptr<marian::Backend>, Arg1, Arg2); \
+    void Function(Arg1, Arg2); \
   } \
-  static inline void Function(Ptr<marian::Backend> backend, Arg1 arg1, Arg2 arg2) { \
-    if(backend->getDevice().type == DeviceType::gpu) \
-      gpu::Function(backend, arg1, arg2); \
+  static inline void Function(Arg1 arg1, Arg2 arg2) { \
+    if(arg1->getBackend()->getDevice().type == DeviceType::gpu) \
+      gpu::Function(arg1, arg2); \
     else \
-      cpu::Function(backend, arg1, arg2); \
+      cpu::Function(arg1, arg2); \
   }
 
 namespace marian {
diff --git a/src/tensors/gpu/dropout.cu b/src/tensors/gpu/dropout.cu
index 6dc49c51..4a4223a8 100644
--- a/src/tensors/gpu/dropout.cu
+++ b/src/tensors/gpu/dropout.cu
@@ -35,8 +35,9 @@ namespace marian {
       }
     }
 
-    void Dropout(Ptr<marian::Backend> backend, Tensor tensor, float p) {
-      curandGenerator_t gen = std::static_pointer_cast<gpu::Backend>(backend)->getCurandGenerator();
+    void Dropout(Tensor tensor, float p) {
+      auto gpuBackend = std::static_pointer_cast<gpu::Backend>(tensor->getBackend());
+      curandGenerator_t gen = gpuBackend->getCurandGenerator();
       int n = tensor->size();
       CURAND_CALL(curandGenerateUniform(gen, tensor->data(), n));
 
diff --git a/src/tensors/tensor.cu b/src/tensors/tensor.cu
index bc26fcec..96d979bf 100644
--- a/src/tensors/tensor.cu
+++ b/src/tensors/tensor.cu
@@ -16,7 +16,7 @@ __global__ void gFill(float *d_in, int size, float val) {
 }
 
 float TensorBase::get(size_t i) {
-  cudaSetDevice(deviceId_.no);
+  CUDA_CHECK(cudaSetDevice(backend_->getDevice().no));
   float temp;
   CUDA_CHECK(
       cudaMemcpy(&temp, data() + i, sizeof(float), cudaMemcpyDeviceToHost));
@@ -25,14 +25,14 @@ float TensorBase::get(size_t i) {
 }
 
 void TensorBase::set(size_t i, float value) {
-  cudaSetDevice(deviceId_.no);
+  CUDA_CHECK(cudaSetDevice(backend_->getDevice().no));
   CUDA_CHECK(
       cudaMemcpy(data() + i, &value, sizeof(float), cudaMemcpyHostToDevice));
   cudaStreamSynchronize(0);
 }
 
 void TensorBase::get(std::vector<float> &v) {
-  CUDA_CHECK(cudaSetDevice(deviceId_.no));
+  CUDA_CHECK(cudaSetDevice(backend_->getDevice().no));
   v.resize(size());
   CUDA_CHECK(cudaMemcpy(
       v.data(), data(), size() * sizeof(float), cudaMemcpyDeviceToHost));
@@ -40,7 +40,7 @@ void TensorBase::get(std::vector<float> &v) {
 }
 
 void TensorBase::set(float value) {
-  cudaSetDevice(deviceId_.no);
+  CUDA_CHECK(cudaSetDevice(backend_->getDevice().no));
   int threads = std::min(512, (int)size());
   int blocks = (size() / threads) + (size() % threads != 0);
   gFill<<<blocks, threads>>>(data(), size(), value);
@@ -48,7 +48,7 @@ void TensorBase::set(float value) {
 }
 
 void TensorBase::set(const std::vector<float> &v) {
-  CUDA_CHECK(cudaSetDevice(deviceId_.no));
+  CUDA_CHECK(cudaSetDevice(backend_->getDevice().no));
   CUDA_CHECK(cudaMemcpy(
       data(), v.data(), v.size() * sizeof(float), cudaMemcpyHostToDevice));
   cudaStreamSynchronize(0);
@@ -56,13 +56,13 @@ void TensorBase::set(const std::vector<float> &v) {
 
 void TensorBase::setSparse(const std::vector<size_t> &k,
                            const std::vector<float> &v) {
-  cudaSetDevice(deviceId_.no);
+  CUDA_CHECK(cudaSetDevice(backend_->getDevice().no));
   SetSparse(data(), k, v);
   cudaStreamSynchronize(0);
 }
 
 void TensorBase::copyFrom(Tensor in) {
-  cudaSetDevice(deviceId_.no);
+  CUDA_CHECK(cudaSetDevice(backend_->getDevice().no));
   CUDA_CHECK(cudaMemcpy(data(),
                         (float *)in->data(),
                         in->size() * sizeof(float),
@@ -74,7 +74,7 @@ std::string TensorBase::debug() {
   std::stringstream strm;
   assert(shape_.size());
   strm << shape_;
-  strm << " device=" << deviceId_;
+  strm << " device=" << backend_->getDevice();
   strm << " ptr=" << (size_t)memory_->data();
   strm << " bytes=" << memory_->size();
   strm << std::endl;
diff --git a/src/tensors/tensor.h b/src/tensors/tensor.h
index 15d3a427..309cacd5 100644
--- a/src/tensors/tensor.h
+++ b/src/tensors/tensor.h
@@ -9,6 +9,7 @@
 #include "common/definitions.h"
 #include "common/shape.h"
 #include "tensors/memory_piece.h"
+#include "tensors/backend.h"
 
 namespace marian {
 
@@ -16,11 +17,11 @@ class TensorBase : public std::enable_shared_from_this<TensorBase> {
 private:
   Ptr<MemoryPiece> memory_;
   Shape shape_;
-  DeviceId deviceId_;
+  Ptr<Backend> backend_;
 
 public:
-  TensorBase(Ptr<MemoryPiece> memory, Shape shape, DeviceId deviceId)
-      : memory_(memory), shape_(shape), deviceId_(deviceId) {}
+  TensorBase(Ptr<MemoryPiece> memory, Shape shape, Ptr<Backend> backend)
+      : memory_(memory), shape_(shape), backend_(backend) {}
 
   ~TensorBase() {}
 
@@ -39,12 +40,13 @@ public:
     return get(0);
   }
 
-  DeviceId getDevice() { return deviceId_; }
+  Ptr<Backend> getBackend() { return backend_; }
+  DeviceId getDevice() { return backend_->getDevice(); }
 
   Tensor subtensor(int offset, int size) {
     auto mem = New<MemoryPiece>(memory_->data() + sizeof(float) * offset,
                                 sizeof(float) * size);
-    return Tensor(new TensorBase(mem, {1, size}, deviceId_));
+    return New<TensorBase>(mem, Shape{1, size}, backend_);
   }
 
   float get(size_t i);
diff --git a/src/tensors/tensor_allocator.h b/src/tensors/tensor_allocator.h
index e1c54b22..18aae134 100644
--- a/src/tensors/tensor_allocator.h
+++ b/src/tensors/tensor_allocator.h
@@ -16,11 +16,13 @@ private:
   const size_t GROW = CHUNK * MBYTE;
   const size_t ALIGN = 256;
 
+  Ptr<Backend> backend_;
   Ptr<Allocator> allocator_;
 
 public:
-  TensorAllocator(DeviceId deviceId)
-      : allocator_(New<Allocator>(deviceId, 0, GROW, ALIGN)) {}
+  TensorAllocator(Ptr<Backend> backend)
+      : backend_(backend),
+        allocator_(New<Allocator>(backend_->getDevice(), 0, GROW, ALIGN)) {}
 
   ~TensorAllocator() { clear(); }
 
@@ -58,7 +60,7 @@ public:
     if(!t || t->shape() != shape) {
       int size = shape.elements();
       auto mem = allocator_->alloc<float>(size);
-      t = Tensor(new TensorBase(mem, shape, allocator_->getDevice()));
+      t = Tensor(new TensorBase(mem, shape, backend_));
     }
   }
 
@@ -67,7 +69,7 @@ public:
   Tensor asTensor() {
     auto mem = allocator_->memory();
     int size = mem->size() / sizeof(float);
-    return Tensor(new TensorBase(mem, {1, size}, allocator_->getDevice()));
+    return Tensor(new TensorBase(mem, {1, size}, backend_));
   }
 
   size_t size() { return allocator_->size() / sizeof(float); }
diff --git a/src/training/graph_group_async.cu b/src/training/graph_group_async.cu
index 90d575bd..18f3908a 100644
--- a/src/training/graph_group_async.cu
+++ b/src/training/graph_group_async.cu
@@ -93,12 +93,12 @@ void AsyncGraphGroup::init(Ptr<data::Batch> batch) {
 
     int pos = 0;
     // parameter sharding
-    for(auto device : devices_) {
+    for(auto graph : graphs_) {
       int __size__ = min(shardSize_, totalSize);
       totalSize -= __size__;
 
       Tensor param;
-      Ptr<TensorAllocator> allocator = New<TensorAllocator>(DeviceId{device, DeviceType::gpu});
+      Ptr<TensorAllocator> allocator = New<TensorAllocator>(graph->getBackend());
       allocator->reserveExact(__size__ * sizeof(float));
       allocator->allocate(param, {1, __size__});
       paramsAlloc_.push_back(allocator);
@@ -112,11 +112,11 @@ void AsyncGraphGroup::init(Ptr<data::Batch> batch) {
   if(grads_.size() == 0) {
     int totalSize = graphs_[0]->params()->vals()->size();
 
-    for(auto device : devices_) {
+    for(auto graph : graphs_) {
       int __size__ = min(shardSize_, totalSize);
       totalSize -= __size__;
       Tensor grad_;
-      Ptr<TensorAllocator> allocator_ = New<TensorAllocator>(DeviceId{device, DeviceType::gpu});
+      Ptr<TensorAllocator> allocator_ = New<TensorAllocator>(graph->getBackend());
 
       allocator_->reserveExact(__size__ * sizeof(float));
       allocator_->allocate(grad_, {1, __size__});
@@ -129,11 +129,11 @@ void AsyncGraphGroup::init(Ptr<data::Batch> batch) {
       int totalSize = graphs_[0]->params()->vals()->size();
 
       int i = 0;
-      for(auto device : devices_) {
+      for(auto graph : graphs_) {
         int __size__ = min(shardSize_, totalSize);
         totalSize -= __size__;
         Tensor paramAvg;
-        Ptr<TensorAllocator> allocator = New<TensorAllocator>(DeviceId{device, DeviceType::gpu});
+        Ptr<TensorAllocator> allocator = New<TensorAllocator>(graph->getBackend());
 
         allocator->reserveExact(__size__ * sizeof(float));
         allocator->allocate(paramAvg, {1, __size__});
@@ -187,7 +187,7 @@ void AsyncGraphGroup::execute(Ptr<data::Batch> batch) {
     Tensor gradients;
     if(tau_ > 1) {
       if(t == 0) {
-        accAlloc = New<TensorAllocator>(graph->getDevice());
+        accAlloc = New<TensorAllocator>(graph->getBackend());
         accAlloc->reserveExact(graph->params()->grads()->memory()->size());
         accAlloc->allocate(accGradients, graph->params()->grads()->shape());
         accGradients->set(0);
diff --git a/src/training/graph_group_async_drop.cu b/src/training/graph_group_async_drop.cu
index 84985009..33684bf9 100644
--- a/src/training/graph_group_async_drop.cu
+++ b/src/training/graph_group_async_drop.cu
@@ -8,9 +8,9 @@
 
 namespace marian {
 
-Tensor AsyncGraphGroupDrop::newTensor(int size, DeviceId deviceId) {
+Tensor AsyncGraphGroupDrop::newTensor(int size, Ptr<Backend> backend) {
   Tensor t;
-  Ptr<TensorAllocator> allocator_ = New<TensorAllocator>(deviceId);
+  Ptr<TensorAllocator> allocator_ = New<TensorAllocator>(backend);
   allocator_->reserveExact(size * sizeof(float));
   allocator_->allocate(t, {1, size});
   allocators.push_back(allocator_);
@@ -86,7 +86,7 @@ void AsyncGraphGroupDrop::pushGradients(Tensor newGrads,
 
   // get the sparse gradient
   pushDropper_[device_id]->dropGraph(
-      newGrads, pushSparseGradient_[device_id], 
+      newGrads, pushSparseGradient_[device_id],
       droping_rate, dropping_momentum);
 
   SparseTensor newSparseGrads = pushSparseGradient_[device_id];
@@ -146,13 +146,12 @@ void AsyncGraphGroupDrop::init(Ptr<data::Batch> batch) {
       fetchStep_.push_back(0);
       pushStep_.push_back(0);
 
-      size_t device = devices_[i];
       // temporary tensor to compute parameter delta before fetching
-      paramsDelta_.push_back(newTensor(shardSize, {device, DeviceType::gpu}));
+      paramsDelta_.push_back(newTensor(shardSize, graphs_[i]->getBackend()));
 
       // tensors to store local params history
       for(int h_id = 0; h_id < devices_.size(); h_id++) {
-        Tensor tmp = newTensor(params_[i]->size(), {device, DeviceType::gpu});
+        Tensor tmp = newTensor(params_[i]->size(), graphs_[i]->getBackend());
         tmp->copyFrom(params_[i]);
         paramsLocal_[h_id].push_back(tmp);
       }
@@ -162,23 +161,23 @@ void AsyncGraphGroupDrop::init(Ptr<data::Batch> batch) {
 
       // N-dropper for fetch
       std::vector<GradientDrop> tmpDropper;
-      for(int i = 0; i < devices_.size(); i++)
+      for(auto device : devices_)
         tmpDropper.push_back(GradientDrop(new GradientDropBase()));
       fetchDropper.push_back(tmpDropper);
 
       // sparsetensor to store sparsified gradients per-device
       pushSparseGradient_.push_back(
-          SparseTensor(new SparseTensorBase(sparseCap, {device, DeviceType::gpu})));
+          SparseTensor(new SparseTensorBase(sparseCap, graphs_[i]->getBackend())));
 
       pushShardedSparseGradient_.push_back(
-          SparseTensor(new SparseTensorBase(sparseCap, {device, DeviceType::gpu})));
+          SparseTensor(new SparseTensorBase(sparseCap, graphs_[i]->getBackend())));
       fetchSparseGradient_.push_back(SparseTensor(
-          new SparseTensorBase(sparseCap / devices_.size(), {device, DeviceType::gpu})));
+          new SparseTensorBase(sparseCap / devices_.size(), graphs_[i]->getBackend())));
 
       std::vector<SparseTensor> tmp;
       for(int i = 0; i < devices_.size(); i++)
         tmp.push_back(SparseTensor(
-            new SparseTensorBase(sparseCap / devices_.size(), {device, DeviceType::gpu})));
+            new SparseTensorBase(sparseCap / devices_.size(), graphs_[i]->getBackend())));
       fetchShardedSparseGradient_.push_back(tmp);
     }
 
diff --git a/src/training/graph_group_async_drop.h b/src/training/graph_group_async_drop.h
index 33e74a40..f32d9444 100644
--- a/src/training/graph_group_async_drop.h
+++ b/src/training/graph_group_async_drop.h
@@ -31,7 +31,7 @@ class AsyncGraphGroupDrop : public AsyncGraphGroup {
 
   std::vector<Ptr<TensorAllocator>> allocators;
 
-  Tensor newTensor(int size, DeviceId deviceId);
+  Tensor newTensor(int size, Ptr<Backend> backend);
 
 protected:
   void init(Ptr<data::Batch> batch);
diff --git a/src/training/graph_group_multinode.cu b/src/training/graph_group_multinode.cu
index 78cd842f..34aa2b5d 100644
--- a/src/training/graph_group_multinode.cu
+++ b/src/training/graph_group_multinode.cu
@@ -19,9 +19,9 @@ void MultiNodeGraphGroup::setScheduler(Ptr<Scheduler> scheduler) {
 /**
  * Allocate new tensor on given GPU and store allocator.
  */
-Tensor MultiNodeGraphGroup::newTensor(int size, DeviceId deviceId) {
+Tensor MultiNodeGraphGroup::newTensor(int size, Ptr<Backend> backend) {
   Tensor t;
-  Ptr<TensorAllocator> allocator = New<TensorAllocator>(deviceId);
+  Ptr<TensorAllocator> allocator = New<TensorAllocator>(backend);
   allocator->reserveExact(size * sizeof(float));
   allocator->allocate(t, {1, size});
   allocators_.push_back(allocator);
@@ -148,14 +148,12 @@ void MultiNodeGraphGroup::initClientCommOverlapVars() {
 void MultiNodeGraphGroup::initClientCommOverlapGpuTensors() {
   size_t modelSize = clientGraphs_[0]->params()->vals()->size();
   for(int client = 0; client < devices_.size(); client++) {
-    DeviceId deviceId{devices_[client], DeviceType::gpu};
-    
     // Communication overlap buffer (for grads + params)
-    Tensor commOverlapBuffer = newTensor(modelSize, deviceId);
+    Tensor commOverlapBuffer = newTensor(modelSize, clientGraphs_[client]->getBackend());
     commOverlapBuffer->copyFrom(clientGraphs_[0]->params()->vals());
     clientCommOverlapBuffersGPU_.push_back(commOverlapBuffer);
     // Gradients local sum buffer
-    Tensor sumGrads = newTensor(modelSize, deviceId);
+    Tensor sumGrads = newTensor(modelSize, clientGraphs_[client]->getBackend());
     sumGrads->set(0);
     clientSummedGradsGPU.push_back(sumGrads);
     // Local optimizer to apply summed gradients
@@ -207,12 +205,11 @@ void MultiNodeGraphGroup::calculateShardSizes() {
 void MultiNodeGraphGroup::initShardGpuTensors() {
   size_t offset = 0;
   for(int shard = 0; shard < devices_.size(); shard++) {
-    DeviceId deviceId{devices_[shard], DeviceType::gpu};
-    Tensor gpuParams = newTensor(shardSizes_[shard], deviceId);
+    Tensor gpuParams = newTensor(shardSizes_[shard], clientGraphs_[shard]->getBackend());
     gpuParams->copyFrom(clientGraphs_[0]->params()->vals()->subtensor(
         offset, shardSizes_[shard]));
     shardParams_.push_back(gpuParams);
-    shardGrads_.push_back(newTensor(shardSizes_[shard], deviceId));
+    shardGrads_.push_back(newTensor(shardSizes_[shard], clientGraphs_[shard]->getBackend()));
   }
 }
 
diff --git a/src/training/graph_group_multinode.h b/src/training/graph_group_multinode.h
index 90243f7c..c6dc495c 100644
--- a/src/training/graph_group_multinode.h
+++ b/src/training/graph_group_multinode.h
@@ -217,7 +217,7 @@ protected:
   /**
    * Allocate new tensor on given GPU and store allocator.
    */
-  Tensor newTensor(int size, DeviceId deviceId);
+  Tensor newTensor(int size, Ptr<Backend> backend);
 
   /**
    * Setup training environment and launch server thread and (if enabled) client
diff --git a/src/training/graph_group_sync.cu b/src/training/graph_group_sync.cu
index e713b99d..171c0652 100644
--- a/src/training/graph_group_sync.cu
+++ b/src/training/graph_group_sync.cu
@@ -60,7 +60,7 @@ void SyncGraphGroup::execute(Ptr<data::Batch> batch) {
       for(auto graph : graphs_) {
         int __size__ = min(shardSize_, totalSize);
 
-        auto paramsAlloc = New<TensorAllocator>(graph->getDevice());
+        auto paramsAlloc = New<TensorAllocator>(graph->getBackend());
         paramsAllocs_.push_back(paramsAlloc);
 
         paramsAlloc->reserveExact(3 * __size__ * sizeof(float));
@@ -87,7 +87,7 @@ void SyncGraphGroup::execute(Ptr<data::Batch> batch) {
         int __size__ = min(shardSize_, totalSize);
         totalSize -= __size__;
         Tensor paramAvg;
-        auto allocator = New<TensorAllocator>(graph->getDevice());
+        auto allocator = New<TensorAllocator>(graph->getBackend());
 
         allocator->reserveExact(__size__ * sizeof(float));
         allocator->allocate(paramAvg, {1, __size__});
diff --git a/src/training/sparse_tensor.cu b/src/training/sparse_tensor.cu
index 5d32b16f..aafafa97 100644
--- a/src/training/sparse_tensor.cu
+++ b/src/training/sparse_tensor.cu
@@ -45,9 +45,9 @@ __global__ void gFindSubtensor(int* indices,
     resultEnd[0] = idx;
 }
 
-SparseTensorBase::SparseTensorBase(int capacity, DeviceId deviceId)
-: deviceId_(deviceId), capacity_(capacity) {
-  cudaSetDevice(deviceId_.no);
+SparseTensorBase::SparseTensorBase(int capacity, Ptr<Backend> backend)
+: backend_(backend), capacity_(capacity) {
+  cudaSetDevice(backend_->getDevice().no);
   CUDA_CHECK(cudaMalloc(&data_, sizeof(float) * capacity));
   CUDA_CHECK(cudaMalloc(&indices_, sizeof(int) * capacity));
 
@@ -58,8 +58,8 @@ SparseTensorBase::SparseTensorBase(int capacity, DeviceId deviceId)
 SparseTensorBase::SparseTensorBase(float* data,
                                    int* indices,
                                    int size,
-                                   DeviceId deviceId)
-: deviceId_(deviceId) {
+                                   Ptr<Backend> backend)
+: backend_(backend) {
   data_ = data;
   indices_ = indices;
   size_ = size;
@@ -93,7 +93,7 @@ void SparseTensorBase::copyFrom(float* data,
   size_ = size;
   if(size == 0)
     return;
-  cudaSetDevice(deviceId_.no);
+  cudaSetDevice(backend_->getDevice().no);
 
   cudaMemcpy(data_, data, size * sizeof(float), cudaMemcpyDefault);
   if(!data_only)
@@ -107,8 +107,8 @@ void SparseTensorBase::copyFrom(std::shared_ptr<SparseTensorBase> t,
   copyFrom(t->data(), t->indices(), t->size(), data_only);
 }
 
-DeviceId SparseTensorBase::getDevice() {
-  return deviceId_;
+Ptr<Backend> SparseTensorBase::getBackend() {
+  return backend_;
 }
 
 void SparseTensorBase::setSize(int size) {
@@ -117,7 +117,7 @@ void SparseTensorBase::setSize(int size) {
 
 // return the dense representation of this tensor
 void SparseTensorBase::toDense(Tensor t, int offset) {
-  cudaSetDevice(deviceId_.no);
+  cudaSetDevice(backend_->getDevice().no);
   int threads = 512;
   int blocks = 1 + size_ / threads;
   t->set(0);
@@ -127,7 +127,7 @@ void SparseTensorBase::toDense(Tensor t, int offset) {
 }
 
 void SparseTensorBase::scatterAdd(Tensor t, int offset) {
-  cudaSetDevice(deviceId_.no);
+  cudaSetDevice(backend_->getDevice().no);
   cudaStreamSynchronize(0);
   int threads = 512;
   int blocks = 1 + size_ / threads;
@@ -139,7 +139,7 @@ void SparseTensorBase::scatterAdd(Tensor t, int offset) {
 std::shared_ptr<SparseTensorBase> SparseTensorBase::subtensor(int pos,
                                                               int size,
                                                               int idx) {
-  cudaSetDevice(deviceId_.no);
+  cudaSetDevice(backend_->getDevice().no);
   cudaStreamSynchronize(0);
   int* start = gstart_ + idx;
   int* end = gend_ + idx;
@@ -165,6 +165,6 @@ std::shared_ptr<SparseTensorBase> SparseTensorBase::subtensor(int pos,
   int subtensorSize = std::max(0, endOffset - startOffset + 1);
   cudaStreamSynchronize(0);
   return std::shared_ptr<SparseTensorBase>(new SparseTensorBase(
-      data_ + startOffset, indices_ + startOffset, subtensorSize, deviceId_));
+      data_ + startOffset, indices_ + startOffset, subtensorSize, backend_));
 }
 }
diff --git a/src/training/sparse_tensor.h b/src/training/sparse_tensor.h
index 03fb53a9..9194748f 100644
--- a/src/training/sparse_tensor.h
+++ b/src/training/sparse_tensor.h
@@ -10,15 +10,15 @@ class SparseTensorBase : public std::enable_shared_from_this<SparseTensorBase> {
   int* indices_;
   int size_;
   int capacity_;
-  DeviceId deviceId_;
+  Ptr<Backend> backend_;
 
   int* d_is_unsorted;
   int* gstart_;
   int* gend_;
 
 public:
-  SparseTensorBase(int capacity, DeviceId deviceId);
-  SparseTensorBase(float* data, int* indices, int size, DeviceId deviceId);
+  SparseTensorBase(int capacity, Ptr<Backend> backend);
+  SparseTensorBase(float* data, int* indices, int size, Ptr<Backend> backend);
 
   ~SparseTensorBase() {}
 
@@ -43,7 +43,7 @@ public:
   void scatterAdd(Tensor t, int offset = 0);
   std::shared_ptr<SparseTensorBase> subtensor(int pos, int size, int idx);
 
-  DeviceId getDevice();
+  Ptr<Backend> getBackend();
 
   void toDense(Tensor t, int offset);
 };

From 6931f6c3c8ad5440e151a29787e89bab69ef9745 Mon Sep 17 00:00:00 2001
From: Marcin Junczys-Dowmunt <junczys@amu.edu.pl>
Date: Fri, 16 Feb 2018 14:26:10 -0800
Subject: [PATCH 07/11] make tensor device independent

---
 src/CMakeLists.txt              |   3 +-
 src/kernels/cuda_helpers.h      |   4 +-
 src/kernels/tensor_operators.cu |   4 +-
 src/tensors/gpu/algorithm.cu    |  31 +++++++
 src/tensors/gpu/algorithm.h     |  10 +++
 src/tensors/tensor.cu           | 140 ------------------------------
 src/tensors/tensor.h            | 145 +++++++++++++++++++++++++++++---
 7 files changed, 180 insertions(+), 157 deletions(-)
 create mode 100644 src/tensors/gpu/algorithm.cu
 create mode 100644 src/tensors/gpu/algorithm.h

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 977d88f7..92c8a776 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -8,10 +8,11 @@ cuda_add_library(marian
   3rd_party/cnpy/cnpy.cpp
   3rd_party/exception.cpp
   3rd_party/svd/svd.cpp
-  tensors/tensor.cu
+#  tensors/tensor.cu
   tensors/device.cu
   tensors/device.cpp
   tensors/backend.cpp
+  tensors/gpu/algorithm.cu
   tensors/gpu/dropout.cu
   tensors/cpu/dropout.cpp
   kernels/tensor_operators.cu
diff --git a/src/kernels/cuda_helpers.h b/src/kernels/cuda_helpers.h
index 92faaf4c..46d84afe 100644
--- a/src/kernels/cuda_helpers.h
+++ b/src/kernels/cuda_helpers.h
@@ -17,8 +17,8 @@ inline void gpuAssert(cudaError_t code,
   }
 }
 
-template <typename T, typename D>
-void CudaCopy(const T* start, const T* end, D* dest) {
+template <typename T>
+void CudaCopy(const T* start, const T* end, T* dest) {
   CUDA_CHECK(cudaMemcpy((void*)dest, (void*)start, (end - start) * sizeof(T),
              cudaMemcpyDefault));
 }
diff --git a/src/kernels/tensor_operators.cu b/src/kernels/tensor_operators.cu
index 87019d9e..8dcda559 100644
--- a/src/kernels/tensor_operators.cu
+++ b/src/kernels/tensor_operators.cu
@@ -892,7 +892,7 @@ void Select(Ptr<Allocator> allocator,
   int blocks = std::min(MAX_BLOCKS, length / threads + (length % threads != 0));
 
   auto mp_indices = allocator->alloc<size_t>(indices.size());
-  CudaCopy(indices.data(), indices.data() + indices.size(), mp_indices->data());
+  CudaCopy(indices.data(), indices.data() + indices.size(), mp_indices->data<size_t>());
 
   int axisGPU = axis + gpu::Shape::size() - out->shape().size();
   gSelect<<<blocks, threads>>>(out->data(),
@@ -918,7 +918,7 @@ void Insert(Ptr<Allocator> allocator,
   int blocks = std::min(MAX_BLOCKS, length / threads + (length % threads != 0));
 
   auto mp_indices = allocator->alloc<size_t>(indices.size());
-  CudaCopy(indices.data(), indices.data() + indices.size(), mp_indices->data());
+  CudaCopy(indices.data(), indices.data() + indices.size(), mp_indices->data<size_t>());
 
   int axisGPU = axis + gpu::Shape::size() - out->shape().size();
   gInsert<<<blocks, threads>>>(out->data(),
diff --git a/src/tensors/gpu/algorithm.cu b/src/tensors/gpu/algorithm.cu
new file mode 100644
index 00000000..b98a8b1c
--- /dev/null
+++ b/src/tensors/gpu/algorithm.cu
@@ -0,0 +1,31 @@
+#include "tensors/gpu/algorithm.h"
+
+#include "kernels/cuda_helpers.h"
+
+namespace marian {
+  namespace gpu {
+    void copy(Ptr<Backend> backend, const float* begin, const float* end, float* dest) {
+      CUDA_CHECK(cudaSetDevice(backend->getDevice().no));
+      CudaCopy(begin, end, dest);
+      CUDA_CHECK(cudaStreamSynchronize(0));
+    }
+
+    __global__ void gFill(float *d_in, int size, float val) {
+      for(int bid = 0; bid < size; bid += blockDim.x * gridDim.x) {
+        int index = bid + threadIdx.x + blockDim.x * blockIdx.x;
+        if(index < size) {
+          d_in[index] = val;
+        }
+      }
+    }
+
+    void fill(Ptr<Backend> backend, float* begin, float* end, float value) {
+      CUDA_CHECK(cudaSetDevice(backend->getDevice().no));
+      int size = end - begin;
+      int threads = std::min(512, size);
+      int blocks = (size / threads) + (size % threads != 0);
+      gFill<<<blocks, threads>>>(begin, size, value);
+      CUDA_CHECK(cudaStreamSynchronize(0));
+    }
+  }
+}
diff --git a/src/tensors/gpu/algorithm.h b/src/tensors/gpu/algorithm.h
new file mode 100644
index 00000000..7fd9a93c
--- /dev/null
+++ b/src/tensors/gpu/algorithm.h
@@ -0,0 +1,10 @@
+#pragma once
+
+#include "tensors/backend.h"
+
+namespace marian {
+  namespace gpu {
+    void copy(Ptr<Backend> backend, const float* begin, const float* end, float* dest);
+    void fill(Ptr<Backend> backend, float* begin, float* end, float value);
+  }
+}
diff --git a/src/tensors/tensor.cu b/src/tensors/tensor.cu
index 96d979bf..be097178 100644
--- a/src/tensors/tensor.cu
+++ b/src/tensors/tensor.cu
@@ -6,54 +6,6 @@
 
 namespace marian {
 
-__global__ void gFill(float *d_in, int size, float val) {
-  for(int bid = 0; bid < size; bid += blockDim.x * gridDim.x) {
-    int index = bid + threadIdx.x + blockDim.x * blockIdx.x;
-    if(index < size) {
-      d_in[index] = val;
-    }
-  }
-}
-
-float TensorBase::get(size_t i) {
-  CUDA_CHECK(cudaSetDevice(backend_->getDevice().no));
-  float temp;
-  CUDA_CHECK(
-      cudaMemcpy(&temp, data() + i, sizeof(float), cudaMemcpyDeviceToHost));
-  cudaStreamSynchronize(0);
-  return temp;
-}
-
-void TensorBase::set(size_t i, float value) {
-  CUDA_CHECK(cudaSetDevice(backend_->getDevice().no));
-  CUDA_CHECK(
-      cudaMemcpy(data() + i, &value, sizeof(float), cudaMemcpyHostToDevice));
-  cudaStreamSynchronize(0);
-}
-
-void TensorBase::get(std::vector<float> &v) {
-  CUDA_CHECK(cudaSetDevice(backend_->getDevice().no));
-  v.resize(size());
-  CUDA_CHECK(cudaMemcpy(
-      v.data(), data(), size() * sizeof(float), cudaMemcpyDeviceToHost));
-  cudaStreamSynchronize(0);
-}
-
-void TensorBase::set(float value) {
-  CUDA_CHECK(cudaSetDevice(backend_->getDevice().no));
-  int threads = std::min(512, (int)size());
-  int blocks = (size() / threads) + (size() % threads != 0);
-  gFill<<<blocks, threads>>>(data(), size(), value);
-  cudaStreamSynchronize(0);
-}
-
-void TensorBase::set(const std::vector<float> &v) {
-  CUDA_CHECK(cudaSetDevice(backend_->getDevice().no));
-  CUDA_CHECK(cudaMemcpy(
-      data(), v.data(), v.size() * sizeof(float), cudaMemcpyHostToDevice));
-  cudaStreamSynchronize(0);
-}
-
 void TensorBase::setSparse(const std::vector<size_t> &k,
                            const std::vector<float> &v) {
   CUDA_CHECK(cudaSetDevice(backend_->getDevice().no));
@@ -61,96 +13,4 @@ void TensorBase::setSparse(const std::vector<size_t> &k,
   cudaStreamSynchronize(0);
 }
 
-void TensorBase::copyFrom(Tensor in) {
-  CUDA_CHECK(cudaSetDevice(backend_->getDevice().no));
-  CUDA_CHECK(cudaMemcpy(data(),
-                        (float *)in->data(),
-                        in->size() * sizeof(float),
-                        cudaMemcpyDefault));
-  cudaStreamSynchronize(0);
-}
-
-std::string TensorBase::debug() {
-  std::stringstream strm;
-  assert(shape_.size());
-  strm << shape_;
-  strm << " device=" << backend_->getDevice();
-  strm << " ptr=" << (size_t)memory_->data();
-  strm << " bytes=" << memory_->size();
-  strm << std::endl;
-
-  // values
-  size_t totSize = shape_.elements();
-  std::vector<float> values(totSize);
-  get(values);
-
-  size_t dispCols = 5;
-  strm << std::fixed << std::setprecision(8) << std::setfill(' ');
-
-  for(int i = 0; i < values.size(); ++i) {
-    std::vector<int> dims;
-    shape().dims(i, dims);
-
-    bool disp = true;
-    for(int j = 0; j < dims.size(); ++j)
-      disp = disp && (dims[j] < dispCols || dims[j] >= shape()[j] - dispCols);
-
-    if(disp) {
-
-      if(dims.back() == 0) {
-        bool par = true;
-        std::vector<std::string> p;
-        for(int j = dims.size() - 1; j >= 0; --j) {
-          if(dims[j] != 0)
-            par = false;
-
-          p.push_back(par ? "[" : " ");
-        }
-        for(auto it = p.rbegin(); it != p.rend(); ++it)
-          strm << *it;
-        strm << " ";
-      }
-
-      strm << std::setw(12)
-           << values[i]
-           << " ";
-
-      if(dims.back() + 1 == shape().back()) {
-        for(int j = dims.size() - 1; j >= 0; --j) {
-          if(dims[j] + 1 != shape()[j])
-            break;
-          strm << "]";
-        }
-        strm << std::endl;
-      }
-
-      bool prev = true;
-      for(int j = dims.size() - 1; j >= 0; --j) {
-        if(j < dims.size() - 1)
-          prev = prev && dims[j + 1] + 1 == shape()[j + 1];
-        if(prev && dims[j] + 1 == dispCols && shape()[j] > 2 * dispCols) {
-          if(j < dims.size() - 1)
-            for(int k = 0; k <= j; ++k)
-              strm << " ";
-          strm << "... ";
-          if(j < dims.size() - 1)
-            strm << std::endl;
-          break;
-        }
-      }
-    }
-  }
-  strm << std::endl;
-  return strm.str();
-}
-
-Tensor operator<<(Tensor t, const std::vector<float> &v) {
-  t->set(v);
-  return t;
-}
-
-Tensor operator>>(Tensor t, std::vector<float> &v) {
-  t->get(v);
-  return t;
-}
 }
diff --git a/src/tensors/tensor.h b/src/tensors/tensor.h
index 309cacd5..e6dbc9fd 100644
--- a/src/tensors/tensor.h
+++ b/src/tensors/tensor.h
@@ -5,12 +5,14 @@
 #include <memory>
 #include <sstream>
 
-#include "3rd_party/exception.h"
 #include "common/definitions.h"
 #include "common/shape.h"
 #include "tensors/memory_piece.h"
 #include "tensors/backend.h"
 
+#include <algorithm>
+#include "tensors/gpu/algorithm.h"
+
 namespace marian {
 
 class TensorBase : public std::enable_shared_from_this<TensorBase> {
@@ -49,26 +51,145 @@ public:
     return New<TensorBase>(mem, Shape{1, size}, backend_);
   }
 
-  float get(size_t i);
+  float get(size_t i) {
+    float temp;
+    if(backend_->getDevice().type == DeviceType::gpu)
+      gpu::copy(backend_, data() + i, data() + i + 1, &temp);
+    else
+      std::copy(data() + i, data() + i + 1, &temp);
+    return temp;
+  }
 
-  void set(size_t i, float value);
+  void set(size_t i, float value) {
+    if(backend_->getDevice().type == DeviceType::gpu)
+      gpu::copy(backend_, &value, &value + 1, data() + i);
+    else
+      std::copy(&value, &value + 1, data() + i);
+  }
 
-  void get(std::vector<float>& v);
+  void get(std::vector<float> &v) {
+    v.resize(size());
+    if(backend_->getDevice().type == DeviceType::gpu)
+      gpu::copy(backend_, data(), data() + size(), v.data());
+    else
+      std::copy(data(), data() + size(), v.data());
+  }
 
-  void set(float value);
+  void set(const std::vector<float> &v) {
+    if(backend_->getDevice().type == DeviceType::gpu)
+      gpu::copy(backend_, v.data(), v.data() + v.size(), data());
+    else
+      std::copy(v.data(), v.data() + v.size(), data());
+  }
 
-  void set(const std::vector<float>& v);
+  void set(float value) {
+    if(backend_->getDevice().type == DeviceType::gpu)
+      gpu::fill(backend_, data(), data() + size(), value);
+    else
+      std::fill(data(), data() + size(), value);
+  }
 
-  void setSparse(const std::vector<size_t>& k, const std::vector<float>& v);
+  void setSparse(const std::vector<size_t> &k,
+                             const std::vector<float> &v) {
+    //CUDA_CHECK(cudaSetDevice(backend_->getDevice().no));
+    //SetSparse(data(), k, v);
+    //cudaStreamSynchronize(0);
+  }
 
-  void copyFrom(Tensor);
+  void copyFrom(Tensor in) {
+    if(in->getBackend()->getDevice().type == DeviceType::gpu ||
+       backend_->getDevice().type == DeviceType::gpu)
+      gpu::copy(backend_, in->data(), in->data() + in->size(), data());
+    else
+      std::copy(in->data(), in->data() + in->size(), data());
+  }
+
+  std::string debug() {
+    std::stringstream strm;
+    assert(shape_.size());
+    strm << shape_;
+    strm << " device=" << backend_->getDevice();
+    strm << " ptr=" << (size_t)memory_->data();
+    strm << " bytes=" << memory_->size();
+    strm << std::endl;
+
+    // values
+    size_t totSize = shape_.elements();
+    std::vector<float> values(totSize);
+    get(values);
+
+    size_t dispCols = 5;
+    strm << std::fixed << std::setprecision(8) << std::setfill(' ');
+
+    for(int i = 0; i < values.size(); ++i) {
+      std::vector<int> dims;
+      shape().dims(i, dims);
+
+      bool disp = true;
+      for(int j = 0; j < dims.size(); ++j)
+        disp = disp && (dims[j] < dispCols || dims[j] >= shape()[j] - dispCols);
+
+      if(disp) {
+
+        if(dims.back() == 0) {
+          bool par = true;
+          std::vector<std::string> p;
+          for(int j = dims.size() - 1; j >= 0; --j) {
+            if(dims[j] != 0)
+              par = false;
+
+            p.push_back(par ? "[" : " ");
+          }
+          for(auto it = p.rbegin(); it != p.rend(); ++it)
+            strm << *it;
+          strm << " ";
+        }
+
+        strm << std::setw(12)
+             << values[i]
+             << " ";
+
+        if(dims.back() + 1 == shape().back()) {
+          for(int j = dims.size() - 1; j >= 0; --j) {
+            if(dims[j] + 1 != shape()[j])
+              break;
+            strm << "]";
+          }
+          strm << std::endl;
+        }
+
+        bool prev = true;
+        for(int j = dims.size() - 1; j >= 0; --j) {
+          if(j < dims.size() - 1)
+            prev = prev && dims[j + 1] + 1 == shape()[j + 1];
+          if(prev && dims[j] + 1 == dispCols && shape()[j] > 2 * dispCols) {
+            if(j < dims.size() - 1)
+              for(int k = 0; k <= j; ++k)
+                strm << " ";
+            strm << "... ";
+            if(j < dims.size() - 1)
+              strm << std::endl;
+            break;
+          }
+        }
+      }
+    }
+    strm << std::endl;
+    return strm.str();
+  }
 
-  std::string debug();
 };
 
 typedef std::shared_ptr<TensorBase> Tensor;
 
-Tensor operator<<(Tensor t, const std::vector<float>& v);
-
-Tensor operator>>(Tensor t, std::vector<float>& v);
+static Tensor operator<<(Tensor t, const std::vector<float> &v) {
+  t->set(v);
+  return t;
+}
+
+static Tensor operator>>(Tensor t, std::vector<float> &v) {
+  t->get(v);
+  return t;
+}
+
 }

From 0c959c1a7f0c0a95f2fb43b3ddc577fb43f22ebf Mon Sep 17 00:00:00 2001
From: Marcin Junczys-Dowmunt <junczys@amu.edu.pl>
Date: Fri, 16 Feb 2018 14:38:54 -0800
Subject: [PATCH 08/11] clean up tensors

---
 src/tensors/gpu/algorithm.cu | 11 +++++++++++
 src/tensors/gpu/algorithm.h  |  2 ++
 src/tensors/tensor.cu        | 16 ----------------
 src/tensors/tensor.h         | 11 +++++++----
 4 files changed, 20 insertions(+), 20 deletions(-)
 delete mode 100644 src/tensors/tensor.cu

diff --git a/src/tensors/gpu/algorithm.cu b/src/tensors/gpu/algorithm.cu
index b98a8b1c..b26e00e9 100644
--- a/src/tensors/gpu/algorithm.cu
+++ b/src/tensors/gpu/algorithm.cu
@@ -1,6 +1,7 @@
 #include "tensors/gpu/algorithm.h"
 
 #include "kernels/cuda_helpers.h"
+#include "kernels/tensor_operators.h"
 
 namespace marian {
   namespace gpu {
@@ -27,5 +28,15 @@ namespace marian {
       gFill<<<blocks, threads>>>(begin, size, value);
       CUDA_CHECK(cudaStreamSynchronize(0));
     }
+
+    void setSparse(Ptr<Backend> backend,
+                   const std::vector<size_t>& keys,
+                   const std::vector<float>& values,
+                   float* data) {
+      CUDA_CHECK(cudaSetDevice(backend->getDevice().no));
+      SetSparse(data, keys, values);
+      CUDA_CHECK(cudaStreamSynchronize(0));
+    }
+
   }
 }
diff --git a/src/tensors/gpu/algorithm.h b/src/tensors/gpu/algorithm.h
index 7fd9a93c..1bfd9acb 100644
--- a/src/tensors/gpu/algorithm.h
+++ b/src/tensors/gpu/algorithm.h
@@ -6,5 +6,7 @@ namespace marian {
   namespace gpu {
     void copy(Ptr<Backend> backend, const float* begin, const float* end, float* dest);
     void fill(Ptr<Backend> backend, float* begin, float* end, float value);
+
+    void setSparse(Ptr<Backend> backend, const std::vector<size_t>&, const std::vector<float>&, float*);
   }
 }
diff --git a/src/tensors/tensor.cu b/src/tensors/tensor.cu
deleted file mode 100644
index be097178..00000000
--- a/src/tensors/tensor.cu
+++ /dev/null
@@ -1,16 +0,0 @@
-
-#include "tensors/tensor.h"
-
-#include "kernels/cuda_helpers.h"
-#include "kernels/tensor_operators.h"
-
-namespace marian {
-
-void TensorBase::setSparse(const std::vector<size_t> &k,
-                           const std::vector<float> &v) {
-  CUDA_CHECK(cudaSetDevice(backend_->getDevice().no));
-  SetSparse(data(), k, v);
-  cudaStreamSynchronize(0);
-}
-
-}
diff --git a/src/tensors/tensor.h b/src/tensors/tensor.h
index e6dbc9fd..a693f1f5 100644
--- a/src/tensors/tensor.h
+++ b/src/tensors/tensor.h
@@ -90,10 +90,13 @@ public:
   }
 
   void setSparse(const std::vector<size_t> &k,
-                             const std::vector<float> &v) {
-    //CUDA_CHECK(cudaSetDevice(backend_->getDevice().no));
-    //SetSparse(data(), k, v);
-    //cudaStreamSynchronize(0);
+                 const std::vector<float> &v) {
+    if(backend_->getDevice().type == DeviceType::gpu) {
+      gpu::setSparse(backend_, k, v, data());
+    } else {
+      for(int i = 0; i < k.size(); ++i)
+        data()[k[i]] = v[i];
+    }
   }
 
   void copyFrom(Tensor in) {

From 5e1b95e0bb0a6b86b75515c1069bf55596b6fe6c Mon Sep 17 00:00:00 2001
From: Marcin Junczys-Dowmunt <junczys@amu.edu.pl>
Date: Fri, 16 Feb 2018 16:39:11 -0800
Subject: [PATCH 09/11] finshed backend separation

---
 src/graph/parameters.h                        |  4 +--
 src/tensors/allocator.h                       |  3 ++-
 src/tensors/cpu/backend.h                     | 25 +++++++++++--------
 src/tensors/cpu/dropout.cpp                   |  8 ++++--
 src/tensors/device.cpp                        | 11 ++++----
 src/tensors/device.cu                         |  2 +-
 src/tensors/device.h                          | 16 ++++++------
 src/tensors/gpu/backend.h                     |  8 +++---
 src/tensors/tensor_allocator.h                | 16 ++++++++----
 src/tests/CMakeLists.txt                      |  2 +-
 .../{dropout_test.cu => dropout_test.cpp}     |  5 +++-
 11 files changed, 60 insertions(+), 40 deletions(-)
 rename src/tests/{dropout_test.cu => dropout_test.cpp} (79%)

diff --git a/src/graph/parameters.h b/src/graph/parameters.h
index 3f282e4a..99e3b2af 100644
--- a/src/graph/parameters.h
+++ b/src/graph/parameters.h
@@ -57,7 +57,7 @@ public:
   }
 
   void allocateForward() {
-    if(vals_->size() == 0) {
+    if(!params_.empty() && vals_->size() == 0) {
       vals_->reserveExact(totalCapacity(vals_));
       for(auto p : params_)
         if(!p->val())
@@ -66,7 +66,7 @@ public:
   }
 
   void allocateBackward() {
-    if(grads_->size() == 0) {
+    if(!params_.empty() && grads_->size() == 0) {
       grads_->reserveExact(totalCapacity(grads_));
       for(auto p : params_)
         if(!p->grad())
diff --git a/src/tensors/allocator.h b/src/tensors/allocator.h
index 6c9b51f6..6781afa7 100644
--- a/src/tensors/allocator.h
+++ b/src/tensors/allocator.h
@@ -153,7 +153,8 @@ public:
 
   void reserve(size_t bytes) {
     bytes = align(bytes);
-    device_->reserve(bytes);
+    if(bytes > 0)
+      device_->reserve(bytes);
     clear();
   }
 
diff --git a/src/tensors/cpu/backend.h b/src/tensors/cpu/backend.h
index 7bf5770e..83d0ae12 100644
--- a/src/tensors/cpu/backend.h
+++ b/src/tensors/cpu/backend.h
@@ -1,23 +1,28 @@
 #pragma once
 
+#include <functional>
+#include <random>
+
 #include "common/config.h"
 #include "tensors/backend.h"
 
 namespace marian {
 namespace cpu {
-  
-class Backend : public marian::Backend {  
-public:
-  Backend(DeviceId deviceId, size_t seed) : marian::Backend(deviceId, seed) {
-  }
-
-  void setDevice() {
-  }
 
+class Backend : public marian::Backend {
 private:
-  void setHandles() {
+  std::default_random_engine gen_;
 
-  }  
+public:
+  Backend(DeviceId deviceId, size_t seed)
+  : marian::Backend(deviceId, seed),
+    gen_(seed_) {}
+
+  void setDevice() { }
+
+  std::default_random_engine& getRandomGenerator() {
+    return gen_;
+  }
 };
 
 }
diff --git a/src/tensors/cpu/dropout.cpp b/src/tensors/cpu/dropout.cpp
index cc6cea41..a6a0c263 100644
--- a/src/tensors/cpu/dropout.cpp
+++ b/src/tensors/cpu/dropout.cpp
@@ -1,13 +1,17 @@
 #include <algorithm>
+#include <random>
 
 #include "tensors/dispatch.h"
+#include "tensors/cpu/backend.h"
 
 namespace marian {
   namespace cpu {
 
     void Dropout(Tensor tensor, float p) {
-      ABORT("Not implemented");
-      std::fill(tensor->data(), tensor->data() + tensor->size(), p);
+      auto cpuBackend = std::static_pointer_cast<cpu::Backend>(tensor->getBackend());
+      auto &gen = cpuBackend->getRandomGenerator();
+      std::bernoulli_distribution dist(1.f - p);
+      std::generate(tensor->data(), tensor->data() + tensor->size(), [&]() { return dist(gen) / (1.f - p); });
     }
 
   }
diff --git a/src/tensors/device.cpp b/src/tensors/device.cpp
index 5e2a3568..e4bf9d17 100644
--- a/src/tensors/device.cpp
+++ b/src/tensors/device.cpp
@@ -3,14 +3,16 @@
 
 namespace marian {
 namespace cpu {
-  
+
   Device::~Device() {
     delete[] data_;
+    data_ = nullptr;
+    size_ = 0;
   }
-  
+
   void Device::reserve(size_t size) {
     size = align(size);
-    ABORT_IF(size < size_, "New size must be larger than old size");
+    ABORT_IF(size < size_ || size == 0, "New size must be larger than old size and larger than 0");
 
     if(data_) {
       uint8_t *temp = new uint8_t[size_];
@@ -20,9 +22,8 @@ namespace cpu {
     } else {
       data_ = new uint8_t[size];
     }
-
     size_ = size;
   }
-  
+
 }
 }
diff --git a/src/tensors/device.cu b/src/tensors/device.cu
index 2264fda4..bef5491b 100644
--- a/src/tensors/device.cu
+++ b/src/tensors/device.cu
@@ -19,7 +19,7 @@ namespace gpu {
     size = align(size);
     cudaSetDevice(deviceId_.no);
 
-    ABORT_IF(size < size_, "New size must be larger than old size");
+    ABORT_IF(size < size_ || size == 0, "New size must be larger than old size and larger than 0");
 
     if(data_) {
       // Allocate memory by going through host memory
diff --git a/src/tensors/device.h b/src/tensors/device.h
index 33a9c104..22e41bd5 100644
--- a/src/tensors/device.h
+++ b/src/tensors/device.h
@@ -10,9 +10,9 @@ namespace marian {
 class Device {
 protected:
   DeviceId deviceId_;
-  
-  uint8_t* data_;
-  size_t size_;
+
+  uint8_t* data_{0};
+  size_t size_{0};
   size_t alignment_;
 
   size_t align(size_t size) {
@@ -39,9 +39,9 @@ namespace gpu {
     public:
       Device(DeviceId deviceId, size_t alignment = 256)
       : marian::Device(deviceId, alignment) {}
-      
+
       ~Device();
-      
+
       void reserve(size_t size);
   };
 }
@@ -51,9 +51,9 @@ namespace cpu {
     public:
       Device(DeviceId deviceId, size_t alignment = 256)
       : marian::Device(deviceId, alignment) {}
-      
+
       ~Device();
-      
+
       void reserve(size_t size);
   };
 }
@@ -65,4 +65,4 @@ static inline Ptr<Device> DispatchDevice(DeviceId deviceId, size_t alignment = 2
     return New<cpu::Device>(deviceId, alignment);
 }
 
-}
\ No newline at end of file
+}
diff --git a/src/tensors/gpu/backend.h b/src/tensors/gpu/backend.h
index cafdfb5b..357206fc 100644
--- a/src/tensors/gpu/backend.h
+++ b/src/tensors/gpu/backend.h
@@ -17,8 +17,8 @@
 
 namespace marian {
 namespace gpu {
-  
-class Backend : public marian::Backend {  
+
+class Backend : public marian::Backend {
 public:
   Backend(DeviceId deviceId, size_t seed) : marian::Backend(deviceId, seed) {
     setDevice();
@@ -37,13 +37,13 @@ private:
   cublasHandle_t cublasHandle_;
   curandGenerator_t curandGenerator_;
 
-  
+
   void setHandles() {
     cublasHandle_ = create_handle();
     curandGenerator_ = createCurandGenerator();
   }
 
-  
+
   curandGenerator_t createCurandGenerator() {
     cudaSetDevice(deviceId_.no);
     curandGenerator_t generator;
diff --git a/src/tensors/tensor_allocator.h b/src/tensors/tensor_allocator.h
index 18aae134..e36100a2 100644
--- a/src/tensors/tensor_allocator.h
+++ b/src/tensors/tensor_allocator.h
@@ -42,11 +42,17 @@ public:
 
   void reserveExact(size_t bytes = 0) {
     size_t mbytes = bytes / MBYTE;
-    LOG(info,
-        "[memory] Reserving {} MB, device {}",
-        mbytes,
-        allocator_->getDevice());
-
+    if(mbytes == 0) {
+      LOG(info,
+          "[memory] Reserving {} B, device {}",
+          bytes,
+          allocator_->getDevice());
+    } else {
+      LOG(info,
+          "[memory] Reserving {} MB, device {}",
+          mbytes,
+          allocator_->getDevice());
+    }
     allocator_->reserve(bytes);
   }
 
diff --git a/src/tests/CMakeLists.txt b/src/tests/CMakeLists.txt
index aebaaaaa..16882b52 100644
--- a/src/tests/CMakeLists.txt
+++ b/src/tests/CMakeLists.txt
@@ -17,9 +17,9 @@ endforeach(test)
 
 # Testing apps
 add_executable(logger_test logger_test.cpp)
+add_executable(dropout_test dropout_test.cpp)
 #cuda_add_executable(bn_test bn_test.cu)
 cuda_add_executable(pooling_test pooling_test.cu)
-cuda_add_executable(dropout_test dropout_test.cu)
 #cuda_add_executable(marian_test marian_test.cu)
 cuda_add_executable(tensor_test tensor_test.cu)
 
diff --git a/src/tests/dropout_test.cu b/src/tests/dropout_test.cpp
similarity index 79%
rename from src/tests/dropout_test.cu
rename to src/tests/dropout_test.cpp
index f4a42e92..5023606b 100644
--- a/src/tests/dropout_test.cu
+++ b/src/tests/dropout_test.cpp
@@ -12,8 +12,11 @@ using namespace keywords;
 int main(int argc, char** argv) {
   auto c = New<Config>(argc, argv);
 
+  auto type = c->get<bool>("cpu") ? DeviceType::cpu : DeviceType::gpu;
+  DeviceId deviceId{0, type};
+
   auto g = New<ExpressionGraph>();
-  g->setDevice({0, DeviceType::gpu});
+  g->setDevice(deviceId);
   g->reserveWorkspaceMB(512);
 
   for(int i = 0; i < 10; ++i) {

From 79c385b894ed41c6ef699fb310ce1f59250bfc91 Mon Sep 17 00:00:00 2001
From: Marcin Junczys-Dowmunt <junczys@amu.edu.pl>
Date: Fri, 16 Feb 2018 16:43:43 -0800
Subject: [PATCH 10/11] reformatting

---
 src/tensors/cpu/dropout.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/tensors/cpu/dropout.cpp b/src/tensors/cpu/dropout.cpp
index a6a0c263..bb6ee799 100644
--- a/src/tensors/cpu/dropout.cpp
+++ b/src/tensors/cpu/dropout.cpp
@@ -11,7 +11,8 @@ namespace marian {
       auto cpuBackend = std::static_pointer_cast<cpu::Backend>(tensor->getBackend());
       auto &gen = cpuBackend->getRandomGenerator();
       std::bernoulli_distribution dist(1.f - p);
-      std::generate(tensor->data(), tensor->data() + tensor->size(), [&]() { return dist(gen) / (1.f - p); });
+      std::generate(tensor->data(), tensor->data() + tensor->size(),
+                    [&]() { return dist(gen) / (1.f - p); });
     }
 
   }

From 6e421f7a741dca8d7181f87acd398da60bb77f7d Mon Sep 17 00:00:00 2001
From: Marcin Junczys-Dowmunt <junczys@amu.edu.pl>
Date: Fri, 16 Feb 2018 21:53:09 -0800
Subject: [PATCH 11/11] add cpu tests for graph

---
 src/tests/graph_tests.cpp | 50 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)

diff --git a/src/tests/graph_tests.cpp b/src/tests/graph_tests.cpp
index 297fd73f..dbb4dd0c 100644
--- a/src/tests/graph_tests.cpp
+++ b/src/tests/graph_tests.cpp
@@ -53,3 +53,53 @@ TEST_CASE("Expression graph can be initialized with constant values",
     REQUIRE(values == v);
   }
 }
+
+TEST_CASE("Graph device is set (cpu)", "[graph]") {
+  auto graph = New<ExpressionGraph>();
+
+  graph->setDevice({0, DeviceType::cpu});
+  
+  DeviceId testId{0, DeviceType::cpu};
+  REQUIRE(graph->getDevice() == testId);
+}
+
+TEST_CASE("Expression graph can be initialized with constant values (cpu)",
+          "[graph]") {
+  auto graph = New<ExpressionGraph>();
+  graph->setDevice({0, DeviceType::cpu});
+  graph->reserveWorkspaceMB(4);
+
+  std::vector<float> values;
+
+  SECTION("initializing with zero (cpu)") {
+    graph->clear();
+    values.clear();
+    auto zeros = graph->param("0s", {2, 5}, keywords::init = inits::zeros);
+    graph->forward();
+
+    zeros->val()->get(values);
+    REQUIRE(values == std::vector<float>(10, 0.0f));
+  }
+
+  SECTION("initializing with ones (cpu)") {
+    graph->clear();
+    values.clear();
+    auto ones = graph->param("1s", {2, 5}, keywords::init = inits::ones);
+    graph->forward();
+
+    ones->val()->get(values);
+    REQUIRE(values == std::vector<float>(10, 1.0f));
+  }
+
+  SECTION("initializing from vector (cpu)") {
+    graph->clear();
+    values.clear();
+    std::vector<float> v({1, 2, 3, 4, 5, 6});
+    auto vals = graph->param("vs", {2, 3}, keywords::init = inits::from_vector(v));
+    graph->forward();
+
+    REQUIRE(values.empty());
+    vals->val()->get(values);
+    REQUIRE(values == v);
+  }
+}