Mirror of https://github.com/marian-nmt/marian.git
Merge branch 'master' into save-adam-params
This commit is contained in: commit 0f16ed97f3
@@ -58,12 +58,12 @@ else(Tcmalloc_FOUND)
   message(WARNING "Cannot find TCMalloc library. Continuing.")
 endif(Tcmalloc_FOUND)
 
-find_package(MPI)
-if(MPI_FOUND)
-  include_directories(${MPI_INCLUDE_PATH})
-  set(EXT_LIBS ${EXT_LIBS} ${MPI_LIBRARIES})
-  add_definitions(-DMPI_FOUND=1)
-endif(MPI_FOUND)
+#find_package(MPI)
+#if(MPI_FOUND)
+#  include_directories(${MPI_INCLUDE_PATH})
+#  set(EXT_LIBS ${EXT_LIBS} ${MPI_LIBRARIES})
+#  add_definitions(-DMPI_FOUND=1)
+#endif(MPI_FOUND)
 
 find_package(ZLIB)
 if(ZLIB_FOUND)
@@ -8,19 +8,22 @@ cuda_add_library(marian
   3rd_party/cnpy/cnpy.cpp
   3rd_party/exception.cpp
   3rd_party/svd/svd.cpp
-  graph/expression_graph.cpp
-  graph/expression_operators.cu
-  graph/node.cu
-  graph/node_operators.cu
-  graph/node_initializers.cu
-  tensors/tensor.cu
-  tensors/device_gpu.cu
-  tensors/memory_piece.cu
+  # tensors/tensor.cu
+  tensors/device.cu
+  tensors/device.cpp
+  tensors/backend.cpp
+  tensors/gpu/algorithm.cu
+  tensors/gpu/dropout.cu
+  tensors/cpu/dropout.cpp
   kernels/tensor_operators.cu
   kernels/cudnn_wrappers.cu
-  backend/gpu/dropout.cu
-  backend/cpu/dropout.cpp
+  graph/expression_graph.cpp
+  graph/expression_operators.cu
+  graph/node.cpp
+  graph/node_operators.cu
+  graph/node_initializers.cu
   layers/convolution.cu
 
   rnn/cells.cu
   optimizers/clippers.cu
   optimizers/optimizers.cu
@@ -91,7 +94,7 @@ endforeach(exec)
 #set_target_properties(align2steps PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}")
 
 if(PYTHONLIBS_FOUND)
-  add_subdirectory(python)
+  # add_subdirectory(python)
 endif(PYTHONLIBS_FOUND)
 
 if(COMPILE_TESTS)
@@ -1,14 +0,0 @@
-#include <stdio.h>
-#include <stdlib.h>
-
-#include "backend/dispatch.h"
-
-namespace marian {
-namespace cpu {
-
-void Dropout(Ptr<Backend> backend, Tensor tensor, float p) {
-  ABORT("Not implemented");
-}
-
-}
-}
@@ -1,39 +0,0 @@
-#pragma once
-
-#include "common/definitions.h"
-#include "graph/backend.h"
-#include "tensors/tensor.h"
-
-#define DISPATCH1(Function, Arg1)                      \
-  namespace gpu {                                      \
-    void Function(Ptr<Backend>, Arg1);                 \
-  }                                                    \
-  namespace cpu {                                      \
-    void Function(Ptr<Backend>, Arg1);                 \
-  }                                                    \
-  void Function(Ptr<Backend> backend, Arg1 arg1) {     \
-    if(backend->getDevice().type == DeviceType::gpu)   \
-      gpu::Function(backend, arg1);                    \
-    else                                               \
-      cpu::Function(backend, arg1);                    \
-  }
-
-#define DISPATCH2(Function, Arg1, Arg2)                                      \
-  namespace gpu {                                                            \
-    void Function(Ptr<Backend>, Arg1, Arg2);                                 \
-  }                                                                          \
-  namespace cpu {                                                            \
-    void Function(Ptr<Backend>, Arg1, Arg2);                                 \
-  }                                                                          \
-  static inline void Function(Ptr<Backend> backend, Arg1 arg1, Arg2 arg2) {  \
-    if(backend->getDevice().type == DeviceType::gpu)                         \
-      gpu::Function(backend, arg1, arg2);                                    \
-    else                                                                     \
-      cpu::Function(backend, arg1, arg2);                                    \
-  }
-
-namespace marian {
-
-DISPATCH2(Dropout, Tensor, float)
-
-}
@@ -1,8 +1,7 @@
 #include <sstream>
-#include "graph/backend_gpu.h"
 #include "graph/expression_graph.h"
 
-#include "backend/dispatch.h"
+#include "tensors/dispatch.h"
 
 namespace marian {
 
@@ -11,19 +10,17 @@ ExpressionGraph::ExpressionGraph(bool inference)
 
 void ExpressionGraph::setDevice(DeviceId deviceId) {
   if(!backend_) {
-    backend_ = New<BackendGPU>(deviceId, Config::seed);
-
+    backend_ = BackendByDevice(deviceId, Config::seed);
     params_ = New<Parameters>();
-    params_->init(backend_->getDevice());
-
-    tensors_ = New<TensorAllocator>(backend_->getDevice());
+    params_->init(backend_);
+    tensors_ = New<TensorAllocator>(backend_);
   }
 }
 
 Expr ExpressionGraph::dropout(float prob, Shape shape) {
   return Expression<ConstantNode>(shared_from_this(),
                                   keywords::init = [prob, this](Tensor t) {
-                                    Dropout(backend_, t, prob);
+                                    Dropout(t, prob);
                                   },
                                   keywords::shape = shape);
 }
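Graph setup is now symmetric across device types: setDevice() asks BackendByDevice() (added in src/tensors/backend.cpp further down) for a gpu:: or cpu:: backend, and both the parameters and the tensor allocator are initialized from that backend rather than from a bare DeviceId. A minimal driver sketch, hypothetical and not part of the commit, assuming the DeviceId/DeviceType types from common/definitions.h:

```cpp
#include "graph/expression_graph.h"

// Hypothetical driver: the same call now works for either device type,
// because setDevice() dispatches on deviceId.type internally.
void buildGraphs() {
  auto gpuGraph = New<ExpressionGraph>();
  gpuGraph->setDevice({0, DeviceType::gpu});  // backend_ becomes gpu::Backend

  auto cpuGraph = New<ExpressionGraph>();
  cpuGraph->setDevice({0, DeviceType::cpu});  // backend_ becomes cpu::Backend
}
```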
@@ -8,8 +8,8 @@
 #include "common/definitions.h"
 
 #include "tensors/tensor_allocator.h"
+#include "tensors/backend.h"
 
-#include "graph/backend.h"
 #include "graph/parameters.h"
 #include "graph/chainable.h"
 #include "graph/node_operators.h"
@@ -306,7 +306,7 @@ public:
     tensors_->free(t);
   }
 
-  Ptr<Allocator<DeviceGPU>> allocator() { return tensors_->allocator(); }
+  Ptr<Allocator> allocator() { return tensors_->allocator(); }
 
   void clear() {
     // clear everything apart from parameters
@@ -1,4 +1,4 @@
-#include "graph/backend_gpu.h"
+#include "tensors/backend.h"
 #include "graph/expression_graph.h"
 #include "graph/node.h"
 
@@ -5,10 +5,11 @@
 #include <thread>
 
 #include "common/keywords.h"
-#include "graph/backend.h"
-#include "graph/chainable.h"
+#include "tensors/backend.h"
 #include "tensors/tensor.h"
 
+#include "graph/chainable.h"
+
 namespace marian {
 
 class Node : public Chainable<Tensor>,
@@ -2,10 +2,10 @@
 
 #include <thread>
 
-#include "graph/backend_gpu.h"
+#include "tensors/gpu/backend.h"
 #include "graph/node.h"
-#include "kernels/tensor_operators.h"
 #include "functional/functional.h"
+#include "kernels/tensor_operators.h"
 #include "kernels/cudnn_wrappers.h"
 
 namespace marian {
@@ -54,7 +54,7 @@ public:
   NodeOps forwardOps() {
     // C = alpha * dot(op(A), op(B))
     return {NodeOp(Prod(
-        std::static_pointer_cast<BackendGPU>(getBackend())->getCublasHandle(),
+        std::static_pointer_cast<gpu::Backend>(getBackend())->getCublasHandle(),
         val_,
         child(0)->val(),
         child(1)->val(),
@@ -72,7 +72,7 @@ public:
     // to sum gradients from different graph parts
 
     if(!transA_ && transB_)
-      return {NodeOp(Prod(std::static_pointer_cast<BackendGPU>(getBackend())
+      return {NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend())
                               ->getCublasHandle(),
                           child(0)->grad(),
                           adj_,
@@ -81,7 +81,7 @@ public:
                           false,
                           1.0,
                           scalar_)),
-              NodeOp(Prod(std::static_pointer_cast<BackendGPU>(getBackend())
+              NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend())
                               ->getCublasHandle(),
                           child(1)->grad(),
                           adj_,
@@ -92,7 +92,7 @@ public:
                           scalar_))};
 
     if(transA_ && !transB_)
-      return {NodeOp(Prod(std::static_pointer_cast<BackendGPU>(getBackend())
+      return {NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend())
                               ->getCublasHandle(),
                           child(0)->grad(),
                           child(1)->val(),
@@ -101,7 +101,7 @@ public:
                           true,
                           1.0,
                           scalar_)),
-              NodeOp(Prod(std::static_pointer_cast<BackendGPU>(getBackend())
+              NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend())
                               ->getCublasHandle(),
                           child(1)->grad(),
                           child(0)->val(),
@@ -112,7 +112,7 @@ public:
                           scalar_))};
 
     if(transA_ && transB_)
-      return {NodeOp(Prod(std::static_pointer_cast<BackendGPU>(getBackend())
+      return {NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend())
                               ->getCublasHandle(),
                           child(0)->grad(),
                           child(1)->val(),
@@ -121,7 +121,7 @@ public:
                           true,
                           1.0,
                           scalar_)),
-              NodeOp(Prod(std::static_pointer_cast<BackendGPU>(getBackend())
+              NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend())
                               ->getCublasHandle(),
                           child(1)->grad(),
                           adj_,
@@ -131,7 +131,7 @@ public:
                           1.0,
                           scalar_))};
 
-    return {NodeOp(Prod(std::static_pointer_cast<BackendGPU>(getBackend())
+    return {NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend())
                             ->getCublasHandle(),
                         child(0)->grad(),
                         adj_,
@@ -140,7 +140,7 @@ public:
                         true,
                         1.0,
                         scalar_)),
-            NodeOp(Prod(std::static_pointer_cast<BackendGPU>(getBackend())
+            NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend())
                            ->getCublasHandle(),
                         child(1)->grad(),
                         child(0)->val(),
@@ -198,7 +198,7 @@ public:
     using namespace functional;
     return {
         NodeOp(Prod(
-            std::static_pointer_cast<BackendGPU>(getBackend())->getCublasHandle(),
+            std::static_pointer_cast<gpu::Backend>(getBackend())->getCublasHandle(),
             val_,
             child(0)->val(),
             child(1)->val(),
@@ -219,7 +219,7 @@ public:
     using namespace functional;
 
     if(!transA_ && transB_)
-      return {NodeOp(Prod(std::static_pointer_cast<BackendGPU>(getBackend())
+      return {NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend())
                               ->getCublasHandle(),
                           child(0)->grad(),
                           adj_,
@@ -228,7 +228,7 @@ public:
                           false,
                           1.0,
                           scalar_)),
-              NodeOp(Prod(std::static_pointer_cast<BackendGPU>(getBackend())
+              NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend())
                               ->getCublasHandle(),
                           child(1)->grad(),
                           adj_,
@@ -240,7 +240,7 @@ public:
               NodeOp(Add(_1, child(2)->grad(), adj_))};
 
     if(transA_ && !transB_)
-      return {NodeOp(Prod(std::static_pointer_cast<BackendGPU>(getBackend())
+      return {NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend())
                               ->getCublasHandle(),
                           child(0)->grad(),
                           child(1)->val(),
@@ -249,7 +249,7 @@ public:
                           true,
                           1.0,
                           scalar_)),
-              NodeOp(Prod(std::static_pointer_cast<BackendGPU>(getBackend())
+              NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend())
                               ->getCublasHandle(),
                           child(1)->grad(),
                           child(0)->val(),
@@ -261,7 +261,7 @@ public:
               NodeOp(Add(_1, child(2)->grad(), adj_))};
 
     if(transA_ && transB_)
-      return {NodeOp(Prod(std::static_pointer_cast<BackendGPU>(getBackend())
+      return {NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend())
                               ->getCublasHandle(),
                           child(0)->grad(),
                           child(1)->val(),
@@ -270,7 +270,7 @@ public:
                           true,
                           1.0,
                           scalar_)),
-              NodeOp(Prod(std::static_pointer_cast<BackendGPU>(getBackend())
+              NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend())
                               ->getCublasHandle(),
                           child(1)->grad(),
                           adj_,
@@ -281,7 +281,7 @@ public:
                           scalar_)),
               NodeOp(Add(_1, child(2)->grad(), adj_))};
 
-    return {NodeOp(Prod(std::static_pointer_cast<BackendGPU>(getBackend())
+    return {NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend())
                             ->getCublasHandle(),
                         child(0)->grad(),
                         adj_,
@@ -290,7 +290,7 @@ public:
                         true,
                         1.0,
                         scalar_)),
-            NodeOp(Prod(std::static_pointer_cast<BackendGPU>(getBackend())
+            NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend())
                            ->getCublasHandle(),
                         child(1)->grad(),
                         child(0)->val(),
@@ -350,7 +350,7 @@ public:
   NodeOps forwardOps() {
     // C = alpha * dot(op(A), op(B))
     return {NodeOp(ProdBatched(
-        std::static_pointer_cast<BackendGPU>(getBackend())->getCublasHandle(),
+        std::static_pointer_cast<gpu::Backend>(getBackend())->getCublasHandle(),
         val_,
         child(0)->val(),
         child(1)->val(),
@@ -369,7 +369,7 @@ public:
 
     if(!transA_ && transB_)
       return {
-          NodeOp(ProdBatched(std::static_pointer_cast<BackendGPU>(getBackend())
+          NodeOp(ProdBatched(std::static_pointer_cast<gpu::Backend>(getBackend())
                                  ->getCublasHandle(),
                              child(0)->grad(),
                              adj_,
@@ -378,7 +378,7 @@ public:
                              false,
                              1.0,
                              scalar_)),
-          NodeOp(ProdBatched(std::static_pointer_cast<BackendGPU>(getBackend())
+          NodeOp(ProdBatched(std::static_pointer_cast<gpu::Backend>(getBackend())
                                  ->getCublasHandle(),
                              child(1)->grad(),
                              adj_,
@@ -390,7 +390,7 @@ public:
 
     if(transA_ && !transB_)
       return {
-          NodeOp(ProdBatched(std::static_pointer_cast<BackendGPU>(getBackend())
+          NodeOp(ProdBatched(std::static_pointer_cast<gpu::Backend>(getBackend())
                                  ->getCublasHandle(),
                              child(0)->grad(),
                              child(1)->val(),
@@ -399,7 +399,7 @@ public:
                              true,
                              1.0,
                              scalar_)),
-          NodeOp(ProdBatched(std::static_pointer_cast<BackendGPU>(getBackend())
+          NodeOp(ProdBatched(std::static_pointer_cast<gpu::Backend>(getBackend())
                                  ->getCublasHandle(),
                              child(1)->grad(),
                              child(0)->val(),
@@ -411,7 +411,7 @@ public:
 
     if(transA_ && transB_)
       return {
-          NodeOp(ProdBatched(std::static_pointer_cast<BackendGPU>(getBackend())
+          NodeOp(ProdBatched(std::static_pointer_cast<gpu::Backend>(getBackend())
                                  ->getCublasHandle(),
                              child(0)->grad(),
                              child(1)->val(),
@@ -420,7 +420,7 @@ public:
                              true,
                              1.0,
                              scalar_)),
-          NodeOp(ProdBatched(std::static_pointer_cast<BackendGPU>(getBackend())
+          NodeOp(ProdBatched(std::static_pointer_cast<gpu::Backend>(getBackend())
                                  ->getCublasHandle(),
                              child(1)->grad(),
                              adj_,
@@ -431,7 +431,7 @@ public:
                              scalar_))};
 
     return {
-        NodeOp(ProdBatched(std::static_pointer_cast<BackendGPU>(getBackend())
+        NodeOp(ProdBatched(std::static_pointer_cast<gpu::Backend>(getBackend())
                                ->getCublasHandle(),
                            child(0)->grad(),
                            adj_,
@@ -440,7 +440,7 @@ public:
                            true,
                            1.0,
                            scalar_)),
-        NodeOp(ProdBatched(std::static_pointer_cast<BackendGPU>(getBackend())
+        NodeOp(ProdBatched(std::static_pointer_cast<gpu::Backend>(getBackend())
                                ->getCublasHandle(),
                            child(1)->grad(),
                            child(0)->val(),
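Every hunk above makes the same one-line change: before pulling out the cuBLAS handle, a node casts its generic Ptr<marian::Backend> down to the concrete GPU type, and only that type's name changed (BackendGPU became gpu::Backend). A small helper could factor out the repetition; this is a hypothetical sketch, not something the commit adds:

```cpp
#include "tensors/gpu/backend.h"

namespace marian {

// Hypothetical convenience wrapper for the recurring downcast.
// Assumes getBackend() returns Ptr<marian::Backend>, as in the hunks above.
static inline cublasHandle_t cublasHandle(Ptr<Backend> backend) {
  return std::static_pointer_cast<gpu::Backend>(backend)->getCublasHandle();
}

}  // namespace marian
```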
@@ -1,10 +1,11 @@
 #pragma once
 
-#include "graph/backend_gpu.h"
 #include "tensors/tensor.h"
+#include "tensors/gpu/backend.h"
 
 #include "graph/node.h"
 #include "kernels/sparse.h"
 #include "kernels/tensor_operators.h"
+#include "tensors/tensor.h"
 #include "functional/functional.h"
 #include "kernels/cudnn_wrappers.h"
@@ -210,21 +211,7 @@ struct TanhNodeOp : public NaryNodeOp {
   const std::string type() { return "tanh"; }
 };
 
-/**
- * Represents a <a
- * href="https://en.wikipedia.org/wiki/Rectifier_(neural_networks)">rectified
- * linear</a> node in an expression graph.
- *
- * This node implements the activation function \f$ f(x) = \max(0, x) \f$ and
- * its derivative:
- * \f[
- *   f^\prime(x) =
- *     \begin{cases}
- *       0 & \text{if } x \leq 0 \\
- *       1 & \text{if } x > 0
- *     \end{cases}
- * \f]
- */
-
 struct ReLUNodeOp : public UnaryNodeOp {
   template <typename... Args>
   ReLUNodeOp(Args... args) : UnaryNodeOp(args...) {}
@@ -876,14 +863,14 @@ public:
   Tensor& val() {
     auto childVal = reshapee_->val();
     val_.reset(
-        new TensorBase(childVal->memory(), shape(), childVal->getDevice()));
+        new TensorBase(childVal->memory(), shape(), childVal->getBackend()));
     return val_;
   };
 
   Tensor& grad() {
     auto childGrad = reshapee_->grad();
     adj_.reset(
-        new TensorBase(childGrad->memory(), shape(), childGrad->getDevice()));
+        new TensorBase(childGrad->memory(), shape(), childGrad->getBackend()));
     return adj_;
   };
 
@@ -952,7 +939,7 @@ public:
     size_t offset = step_ * shape().elements() * sizeof(float);
     auto mem = New<MemoryPiece>(childVal->memory()->data() + offset,
                                 childVal->memory()->size());
-    val_.reset(new TensorBase(mem, shape(), childVal->getDevice()));
+    val_.reset(new TensorBase(mem, shape(), childVal->getBackend()));
     return val_;
   };
 
@@ -961,7 +948,7 @@ public:
     size_t offset = step_ * shape().elements() * sizeof(float);
     auto mem = New<MemoryPiece>(childGrad->memory()->data() + offset,
                                 childGrad->memory()->size());
-    adj_.reset(new TensorBase(mem, shape(), childGrad->getDevice()));
+    adj_.reset(new TensorBase(mem, shape(), childGrad->getBackend()));
     return adj_;
   };
 
@@ -20,9 +20,9 @@ private:
   Ptr<TensorAllocator> grads_;
 
 public:
-  void init(DeviceId deviceId) {
-    vals_ = New<TensorAllocator>(deviceId);
-    grads_ = New<TensorAllocator>(deviceId);
+  void init(Ptr<Backend> backend) {
+    vals_ = New<TensorAllocator>(backend);
+    grads_ = New<TensorAllocator>(backend);
   }
 
   auto begin() -> decltype(params_.begin()) { return params_.begin(); }
@@ -57,7 +57,7 @@ public:
   }
 
   void allocateForward() {
-    if(vals_->size() == 0) {
+    if(!params_.empty() && vals_->size() == 0) {
       vals_->reserveExact(totalCapacity(vals_));
       for(auto p : params_)
         if(!p->val())
@@ -66,7 +66,7 @@ public:
   }
 
   void allocateBackward() {
-    if(grads_->size() == 0) {
+    if(!params_.empty() && grads_->size() == 0) {
       grads_->reserveExact(totalCapacity(grads_));
       for(auto p : params_)
         if(!p->grad())
@@ -17,6 +17,12 @@ inline void gpuAssert(cudaError_t code,
   }
 }
 
+template <typename T>
+void CudaCopy(const T* start, const T* end, T* dest) {
+  CUDA_CHECK(cudaMemcpy((void*)dest, (void*)start, (end - start) * sizeof(T),
+                        cudaMemcpyDefault));
+}
+
 #define CUSPARSE_CHECK(x)                  \
   {                                        \
     cusparseStatus_t _c = x;               \
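CudaCopy gives the rest of the diff an iterator-style cudaMemcpy: because it passes cudaMemcpyDefault, the runtime infers the direction from the pointers (unified virtual addressing), so one call covers host-to-device, device-to-host and device-to-device. A sketch of how the Select/Insert hunks further down use it (hypothetical call site and values):

```cpp
#include <vector>
#include "kernels/cuda_helpers.h"

void copyIndices(size_t* deviceBuffer) {
  std::vector<size_t> indices = {0, 2, 5};  // host-side indices (example data)
  // Direction (host -> device) is inferred via cudaMemcpyDefault.
  CudaCopy(indices.data(), indices.data() + indices.size(), deviceBuffer);
}
```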
@@ -12,7 +12,7 @@ void multiply(Ptr<CSR> C,
               const Ptr<CSR> B,
               bool transA,
               bool transB) {
-  cudaSetDevice(C->getDevice());
+  cudaSetDevice(backend_->getDevice().no);
   int nnzTotal;
   C->allocRowIndices(A->rows());
   CUSPARSE_CHECK(cusparseXcsrgemmNnz(
@@ -91,7 +91,7 @@ void multiply(Ptr<CSR> C,
 //}
 
 void LfaForward(Tensor out, Tensor logits, Tensor att, Ptr<CSR> sparseLf) {
-  cudaSetDevice(out->getDevice());
+  cudaSetDevice(backend_->getDevice().no);
 
   int batch = att->shape()[0];
   int srcWords = att->shape()[2];
@@ -150,7 +150,7 @@ __global__ void gCollapseAtt(float* out,
 }
 
 void CollapseAtt(Tensor out, Tensor in) {
-  cudaSetDevice(out->getDevice());
+  cudaSetDevice(backend_->getDevice().no);
   int nonzeros = out->shape().elements();
   int batch = out->shape()[0];
   int srcWords = out->shape()[2];
@@ -14,7 +14,7 @@ private:
   int nnz_{0};
   int rows_{0};
   int cols_{0};
-  DeviceId deviceId_;
+  Ptr<Backend> backend_;
 
   cusparseHandle_t handle_{0};
   cusparseMatDescr_t descr_{0};
@@ -24,9 +24,9 @@ private:
   float* values_{0};
 
 public:
-  CSR(int rows, int cols, DeviceId deviceId)
-      : rows_(rows), cols_(cols), deviceId_(deviceId) {
-    cudaSetDevice(deviceId_.no);
+  CSR(int rows, int cols, Ptr<Backend> backend)
+      : rows_(rows), cols_(cols), backend_(backend) {
+    cudaSetDevice(backend_->getDevice().no);
     CUSPARSE_CHECK(cusparseCreate(&handle_));
     CUSPARSE_CHECK(cusparseCreateMatDescr(&descr_));
     CUSPARSE_CHECK(cusparseSetMatType(descr_, CUSPARSE_MATRIX_TYPE_GENERAL));
@@ -38,9 +38,9 @@ public:
       const std::vector<float>& values,
       const std::vector<int>& rowIndices,
       const std::vector<int>& colIndices,
-      DeviceId deviceId)
-      : nnz_(values.size()), rows_(rows), cols_(cols), deviceId_(deviceId) {
-    cudaSetDevice(deviceId_.no);
+      Ptr<Backend> backend)
+      : nnz_(values.size()), rows_(rows), cols_(cols), backend_(backend) {
+    cudaSetDevice(backend_->getDevice().no);
     CUSPARSE_CHECK(cusparseCreate(&handle_));
     CUSPARSE_CHECK(cusparseCreateMatDescr(&descr_));
     CUSPARSE_CHECK(cusparseSetMatType(descr_, CUSPARSE_MATRIX_TYPE_GENERAL));
@@ -73,8 +73,8 @@ public:
     CUDA_CHECK(cudaFree(cooRowIndices));
   }
 
-  CSR(Tensor dense) : deviceId_(dense->getDevice()) {
-    cudaSetDevice(deviceId_.no);
+  CSR(Tensor dense) : backend_(dense->getBackend()) {
+    cudaSetDevice(backend_->getDevice().no);
     rows_ = dense->shape()[0] * dense->shape()[2] * dense->shape()[3];
     cols_ = dense->shape()[1];
 
@@ -114,7 +114,7 @@ public:
   }
 
   ~CSR() {
-    cudaSetDevice(deviceId_.no);
+    cudaSetDevice(backend_->getDevice().no);
     if(values_)
       CUDA_CHECK(cudaFree(values_));
     if(rowIndices_)
@@ -129,7 +129,7 @@ public:
   }
 
   void toTensor(Tensor dense) {
-    cudaSetDevice(deviceId_.no);
+    cudaSetDevice(backend_->getDevice().no);
     ABORT_IF(dense->size() != rows_ * cols_, "Matrix sizes do not match");
 
     cusparseScsc2dense(handle_,
@@ -154,10 +154,10 @@ public:
   int* rowIndices() { return rowIndices_; }
   int* colIndices() { return colIndices_; }
 
-  DeviceId getDevice() { return deviceId_; }
+  DeviceId getDevice() { return backend_->getDevice(); }
 
   void allocValues(int nnz = 0) {
-    cudaSetDevice(deviceId_.no);
+    cudaSetDevice(backend_->getDevice().no);
     if(nnz > 0)
       nnz_ = nnz;
     if(values_)
@@ -166,7 +166,7 @@ public:
   }
 
   void allocRowIndices(int rows = 0) {
-    cudaSetDevice(deviceId_.no);
+    cudaSetDevice(backend_->getDevice().no);
     if(rows > 0)
       rows_ = rows;
     if(rowIndices_)
@@ -175,7 +175,7 @@ public:
   }
 
   void allocColIndices(int nnz = 0) {
-    cudaSetDevice(deviceId_.no);
+    cudaSetDevice(backend_->getDevice().no);
    if(nnz > 0)
      nnz_ = nnz;
    if(colIndices_)
@@ -184,11 +184,12 @@ public:
   }
 
   std::string debug() {
+    cudaSetDevice(backend_->getDevice().no);
     uint8_t* buffer;
     CUDA_CHECK(cudaMalloc(&buffer, sizeof(float) * rows() * cols()));
 
     auto mem = New<MemoryPiece>(buffer, sizeof(float) * rows() * cols());
-    Tensor tensor(new TensorBase(mem, {rows(), cols()}, deviceId_));
+    Tensor tensor(new TensorBase(mem, {rows(), cols()}, backend_));
     toTensor(tensor);
     std::string temp = tensor->debug();
 
@@ -879,7 +879,7 @@ __global__ void gInsert(float* out,
   }
 }
 
-void Select(Ptr<Allocator<DeviceGPU>> allocator,
+void Select(Ptr<Allocator> allocator,
             Tensor out,
             const Tensor in,
             int axis,
@@ -892,7 +892,7 @@ void Select(Ptr<Allocator<DeviceGPU>> allocator,
   int blocks = std::min(MAX_BLOCKS, length / threads + (length % threads != 0));
 
   auto mp_indices = allocator->alloc<size_t>(indices.size());
-  mp_indices->insert(indices.data(), indices.size());
+  CudaCopy(indices.data(), indices.data() + indices.size(), mp_indices->data<size_t>());
 
   int axisGPU = axis + gpu::Shape::size() - out->shape().size();
   gSelect<<<blocks, threads>>>(out->data(),
@@ -905,7 +905,7 @@ void Select(Ptr<Allocator<DeviceGPU>> allocator,
   allocator->free(mp_indices);
 }
 
-void Insert(Ptr<Allocator<DeviceGPU>> allocator,
+void Insert(Ptr<Allocator> allocator,
             Tensor out,
             const Tensor in,
             int axis,
@@ -918,7 +918,7 @@ void Insert(Ptr<Allocator<DeviceGPU>> allocator,
   int blocks = std::min(MAX_BLOCKS, length / threads + (length % threads != 0));
 
   auto mp_indices = allocator->alloc<size_t>(indices.size());
-  mp_indices->insert(indices.data(), indices.size());
+  CudaCopy(indices.data(), indices.data() + indices.size(), mp_indices->data<size_t>());
 
   int axisGPU = axis + gpu::Shape::size() - out->shape().size();
   gInsert<<<blocks, threads>>>(out->data(),
@@ -1295,7 +1295,7 @@ float L2Norm(Tensor in) {
   uint8_t* data;
   cudaMalloc(&data, blocks * sizeof(float));
   Tensor out(new TensorBase(
-      New<MemoryPiece>(data, blocks * sizeof(float)), {1, blocks}, in->getDevice()));
+      New<MemoryPiece>(data, blocks * sizeof(float)), {1, blocks}, in->getBackend()));
 
   ReduceAll(_1 * _1, out, in);
   float dataCpu = sqrtf(out->get(0));
@@ -7,7 +7,6 @@
 #include "tensors/tensor.h"
 
 #include "tensors/allocator.h"
-#include "tensors/device_gpu.h"
 
 #include "gpu/shape.h"
 #include "gpu/tmp.h"
@@ -71,13 +70,13 @@ void Element(Functor functor, Tensor out, Tensors ...tensors) {
 
 void TransposeND(Tensor out, Tensor in, const std::vector<int>& vAxis);
 
-void Select(Ptr<Allocator<DeviceGPU>> allocator,
+void Select(Ptr<Allocator> allocator,
             Tensor out,
             Tensor in,
             int axis,
             const std::vector<size_t>&);
 
-void Insert(Ptr<Allocator<DeviceGPU>> allocator,
+void Insert(Ptr<Allocator> allocator,
             Tensor out,
             Tensor in,
             int axis,
@@ -13,7 +13,7 @@ void Sgd::updateImpl(Tensor params, Tensor grads) {
 
 void Adagrad::updateImpl(Tensor params, Tensor grads) {
   if(!alloc_)
-    alloc_ = New<TensorAllocator>(params->getDevice());
+    alloc_ = New<TensorAllocator>(params->getBackend());
 
   if(!gt_) {
     int elements = params->size();
@@ -42,7 +42,7 @@ void Adagrad::resetStats() {
 
 void Adam::updateImpl(Tensor params, Tensor grads) {
   if(!alloc_)
-    alloc_ = New<TensorAllocator>(params->getDevice());
+    alloc_ = New<TensorAllocator>(params->getBackend());
 
   if(!mt_) {
     int elements = params->size();
@@ -8,10 +8,10 @@ cuda_add_library(pymarian SHARED
   ../graph/node.cu
   ../graph/node_operators.cu
   ../tensors/tensor.cu
-  ../tensors/device_gpu.cu
+  ../tensors/device.cpp
   ../kernels/tensor_operators.cu
-  ../backend/gpu/dropout.cu
-  ../backend/cpu/dropout.cpp
+  ../tensors/gpu/dropout.cu
+  ../tensors/cpu/dropout.cpp
   ../kernels/sparse.cu
   #../layers/param_initializers.cu
   ../rnn/attention.cu
@@ -10,6 +10,7 @@
 
 #include "common/definitions.h"
 #include "tensors/memory_piece.h"
+#include "tensors/device.h"
 
 namespace marian {
 
@@ -65,10 +66,9 @@ public:
   Gap rest(size_t offset) const { return Gap(data_ + offset, size_ - offset); }
 };
 
-template <class Device>
 class Allocator {
 private:
-  Device device_;
+  Ptr<Device> device_;
   size_t available_{0};
   size_t step_{128 * 1024 * 1024};
   size_t alignment_{256};
@@ -83,23 +83,23 @@ private:
 
   void grow(size_t add) {
     add = align(add);
-    uint8_t* oldData = device_.data();
-    size_t oldSize = device_.size();
+    uint8_t* oldData = device_->data();
+    size_t oldSize = device_->size();
 
-    device_.reserve(oldSize + add);
+    device_->reserve(oldSize + add);
 
     std::set<Gap> oldGaps;
     gaps_.swap(oldGaps);
 
     for(auto gap : oldGaps)
       gaps_.insert(
-          Gap(device_.data() + std::distance(oldData, gap.data()), gap.size()));
-    insertGap(Gap(device_.data() + oldSize, add));
+          Gap(device_->data() + std::distance(oldData, gap.data()), gap.size()));
+    insertGap(Gap(device_->data() + oldSize, add));
 
     std::unordered_map<uint8_t*, Ptr<MemoryPiece>> oldAllocated;
     allocated_.swap(oldAllocated);
     for(auto it : oldAllocated) {
-      uint8_t* newPtr = device_.data() + std::distance(oldData, it.first);
+      uint8_t* newPtr = device_->data() + std::distance(oldData, it.first);
       allocated_[newPtr] = oldAllocated[it.first];
       allocated_[newPtr]->setPtr(newPtr);
     }
@@ -142,7 +142,7 @@ private:
 
 public:
   Allocator(DeviceId deviceId, size_t bytes, size_t step, size_t alignment = 256)
-      : device_(deviceId, alignment),
+      : device_(DispatchDevice(deviceId, alignment)),
         step_(step),
         available_(0),
         alignment_(alignment) {
@@ -153,7 +153,8 @@ public:
 
   void reserve(size_t bytes) {
     bytes = align(bytes);
-    device_.reserve(bytes);
+    if(bytes > 0)
+      device_->reserve(bytes);
     clear();
   }
 
@@ -211,17 +212,17 @@ public:
     available_ = 0;
     gaps_.clear();
     allocated_.clear();
-    insertGap({device_.data(), device_.size()}, false);
+    insertGap({device_->data(), device_->size()}, false);
   }
 
   Ptr<MemoryPiece> memory() {
-    return New<MemoryPiece>(device_.data(), device_.size());
+    return New<MemoryPiece>(device_->data(), device_->size());
   }
 
-  size_t size() { return device_.size(); }
+  size_t size() { return device_->size(); }
 
   size_t available() { return available_; }
 
-  DeviceId getDevice() { return device_.getDevice(); }
+  DeviceId getDevice() { return device_->getDevice(); }
 };
 }
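Allocator is no longer a template over the device type: it holds a Ptr<Device> built by DispatchDevice (see the new tensors/device.h below), and grow() rebases every outstanding gap and MemoryPiece when the underlying arena moves. A minimal usage sketch, hypothetical driver code using only calls visible in this diff:

```cpp
#include "tensors/allocator.h"

void allocatorDemo() {
  // One Allocator type now serves both device kinds; DispatchDevice picks
  // gpu::Device or cpu::Device inside the constructor.
  Allocator alloc({0, DeviceType::cpu}, /*bytes=*/0, /*step=*/1024 * 1024);

  auto piece = alloc.alloc<float>(100);  // 400 bytes, rounded up to alignment
  float* data = piece->data<float>();    // raw view into the arena
  (void)data;
  alloc.free(piece);                     // returns the gap to the free list
}
```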
src/tensors/backend.cpp (new file, 15 lines)
@@ -0,0 +1,15 @@
+#include "tensors/backend.h"
+
+#include "tensors/gpu/backend.h"
+#include "tensors/cpu/backend.h"
+
+namespace marian {
+
+Ptr<Backend> BackendByDevice(DeviceId deviceId, size_t seed) {
+  if(deviceId.type == DeviceType::gpu)
+    return New<gpu::Backend>(deviceId, seed);
+  else
+    return New<cpu::Backend>(deviceId, seed);
+}
+
+}
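BackendByDevice is the single factory call the graph now makes: it keys on DeviceId::type and returns the matching concrete backend behind the shared marian::Backend interface. A usage sketch (hypothetical call site; Config::seed and the DeviceId literal are assumptions borrowed from the expression_graph.cpp hunk above):

```cpp
#include "common/config.h"
#include "tensors/backend.h"

namespace marian {

void backendDemo() {
  // Same factory for both device kinds; only the DeviceId differs.
  Ptr<Backend> gpuBackend = BackendByDevice({0, DeviceType::gpu}, Config::seed);
  Ptr<Backend> cpuBackend = BackendByDevice({0, DeviceType::cpu}, Config::seed);
  gpuBackend->setDevice();  // cudaSetDevice(...) on GPU; a no-op on CPU
}

}  // namespace marian
```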
@@ -17,4 +17,6 @@ public:
   virtual void setDevice() = 0;
 };
 
+Ptr<Backend> BackendByDevice(DeviceId deviceId, size_t seed);
+
 }
src/tensors/cpu/backend.h (new file, 29 lines)
@@ -0,0 +1,29 @@
+#pragma once
+
+#include <functional>
+#include <random>
+
+#include "common/config.h"
+#include "tensors/backend.h"
+
+namespace marian {
+namespace cpu {
+
+class Backend : public marian::Backend {
+private:
+  std::default_random_engine gen_;
+
+public:
+  Backend(DeviceId deviceId, size_t seed)
+      : marian::Backend(deviceId, seed),
+        gen_(seed_) {}
+
+  void setDevice() { }
+
+  std::default_random_engine& getRandomGenerator() {
+    return gen_;
+  }
+};
+
+}
+}
src/tensors/cpu/dropout.cpp (new file, 19 lines)
@@ -0,0 +1,19 @@
+#include <algorithm>
+#include <random>
+
+#include "tensors/dispatch.h"
+#include "tensors/cpu/backend.h"
+
+namespace marian {
+namespace cpu {
+
+void Dropout(Tensor tensor, float p) {
+  auto cpuBackend = std::static_pointer_cast<cpu::Backend>(tensor->getBackend());
+  auto &gen = cpuBackend->getRandomGenerator();
+  std::bernoulli_distribution dist(1.f - p);
+  std::generate(tensor->data(), tensor->data() + tensor->size(),
+                [&]() { return dist(gen) / (1.f - p); });
+}
+
+}
+}
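The CPU path writes an inverted-dropout mask directly into the tensor: each element becomes 0 with probability p and 1/(1-p) otherwise, so the expected value of every mask entry stays 1 and no rescaling is needed at inference time. A quick self-contained check of that property (illustrative program, outside the codebase):

```cpp
#include <iostream>
#include <random>

int main() {
  float p = 0.3f;                        // drop probability
  std::default_random_engine gen(1234);
  std::bernoulli_distribution dist(1.f - p);

  double sum = 0;
  const int n = 1000000;
  for(int i = 0; i < n; ++i)
    sum += dist(gen) / (1.f - p);        // same expression as cpu::Dropout
  std::cout << sum / n << "\n";          // ~1.0: the mask preserves the mean
  return 0;
}
```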
src/tensors/device.cpp (new file, 29 lines)
@@ -0,0 +1,29 @@
+#include <iostream>
+#include "tensors/device.h"
+
+namespace marian {
+namespace cpu {
+
+Device::~Device() {
+  delete[] data_;
+  data_ = nullptr;
+  size_ = 0;
+}
+
+void Device::reserve(size_t size) {
+  size = align(size);
+  ABORT_IF(size < size_ || size == 0, "New size must be larger than old size and larger than 0");
+
+  if(data_) {
+    uint8_t *temp = new uint8_t[size_];
+    std::copy(data_, data_ + size_, temp);
+    delete[] data_;
+    data_ = temp;
+  } else {
+    data_ = new uint8_t[size];
+  }
+  size_ = size;
+}
+
+}
+}
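Note one oddity in the CPU reserve() as transcribed above: the replacement buffer is allocated with the old size_, yet size_ is then set to the larger size, so later writes past the old length would overrun the allocation. The GPU version below stages through a host buffer of the old size but cudaMallocs the new size, which is presumably what the CPU grow step intends as well. A corrected sketch under that assumption:

```cpp
// Sketch of marian::cpu::Device::reserve, assuming the old-size allocation
// in the transcription above is unintended.
void Device::reserve(size_t size) {
  size = align(size);
  ABORT_IF(size < size_ || size == 0,
           "New size must be larger than old size and larger than 0");

  if(data_) {
    uint8_t* temp = new uint8_t[size];        // allocate the *new* size
    std::copy(data_, data_ + size_, temp);    // copy only the old contents
    delete[] data_;
    data_ = temp;
  } else {
    data_ = new uint8_t[size];
  }
  size_ = size;
}
```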
src/tensors/device.cu (new file, 40 lines)
@@ -0,0 +1,40 @@
+#include <cuda.h>
+#include <iostream>
+
+#include "tensors/device.h"
+#include "kernels/cuda_helpers.h"
+
+namespace marian {
+namespace gpu {
+
+Device::~Device() {
+  cudaSetDevice(deviceId_.no);
+  if(data_) {
+    CUDA_CHECK(cudaFree(data_));
+  }
+  cudaDeviceSynchronize();
+}
+
+void Device::reserve(size_t size) {
+  size = align(size);
+  cudaSetDevice(deviceId_.no);
+
+  ABORT_IF(size < size_ || size == 0, "New size must be larger than old size and larger than 0");
+
+  if(data_) {
+    // Allocate memory by going through host memory
+    uint8_t *temp = new uint8_t[size_];
+    CUDA_CHECK(cudaMemcpy(temp, data_, size_, cudaMemcpyDeviceToHost));
+    CUDA_CHECK(cudaFree(data_));
+    CUDA_CHECK(cudaMalloc(&data_, size));
+    CUDA_CHECK(cudaMemcpy(data_, temp, size_, cudaMemcpyHostToDevice));
+    delete[] temp;
+  } else {
+    CUDA_CHECK(cudaMalloc(&data_, size));
+  }
+
+  size_ = size;
+}
+
+}
+}
src/tensors/device.h (new file, 68 lines)
@@ -0,0 +1,68 @@
+#pragma once
+
+#include <cmath>
+#include <cstdint>
+
+#include "common/definitions.h"
+
+namespace marian {
+
+class Device {
+protected:
+  DeviceId deviceId_;
+
+  uint8_t* data_{0};
+  size_t size_{0};
+  size_t alignment_;
+
+  size_t align(size_t size) {
+    return ceil(size / (float)alignment_) * alignment_;
+  }
+
+public:
+  Device(DeviceId deviceId, size_t alignment = 256)
+      : deviceId_(deviceId), data_(0), size_(0), alignment_(alignment) {}
+
+  virtual ~Device() {};
+
+  virtual void reserve(size_t size) = 0;
+
+  virtual uint8_t* data() { return data_; }
+
+  virtual size_t size() { return size_; }
+
+  virtual DeviceId getDevice() { return deviceId_; }
+};
+
+namespace gpu {
+class Device : public marian::Device {
+public:
+  Device(DeviceId deviceId, size_t alignment = 256)
+      : marian::Device(deviceId, alignment) {}
+
+  ~Device();
+
+  void reserve(size_t size);
+};
+}
+
+namespace cpu {
+class Device : public marian::Device {
+public:
+  Device(DeviceId deviceId, size_t alignment = 256)
+      : marian::Device(deviceId, alignment) {}
+
+  ~Device();
+
+  void reserve(size_t size);
+};
+}
+
+static inline Ptr<Device> DispatchDevice(DeviceId deviceId, size_t alignment = 256) {
+  if(deviceId.type == DeviceType::gpu)
+    return New<gpu::Device>(deviceId, alignment);
+  else
+    return New<cpu::Device>(deviceId, alignment);
+}
+
+}
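DispatchDevice mirrors BackendByDevice one level down: a DeviceId is turned into a concrete gpu::Device or cpu::Device behind the shared marian::Device interface, so callers like Allocator only ever see reserve()/data()/size(). A minimal sketch (hypothetical call site):

```cpp
#include "tensors/device.h"

namespace marian {

void deviceDemo() {
  // Polymorphic device memory: the caller never names the concrete type.
  Ptr<Device> dev = DispatchDevice({0, DeviceType::cpu});
  dev->reserve(1 << 20);        // 1 MiB, rounded up to the 256-byte alignment
  uint8_t* base = dev->data();  // raw arena; Allocator carves it into pieces
  (void)base;
}

}  // namespace marian
```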
@@ -1,37 +0,0 @@
-#include <cuda.h>
-#include <iostream>
-
-#include "tensors/device_gpu.h"
-#include "kernels/cuda_helpers.h"
-
-namespace marian {
-
-DeviceGPU::~DeviceGPU() {
-  cudaSetDevice(deviceId_.no);
-  if(data_) {
-    CUDA_CHECK(cudaFree(data_));
-  }
-  cudaDeviceSynchronize();
-}
-
-void DeviceGPU::reserve(size_t size) {
-  size = align(size);
-  cudaSetDevice(deviceId_.no);
-
-  ABORT_IF(size < size_, "New size must be larger than old size");
-
-  if(data_) {
-    // Allocate memory by going through host memory
-    uint8_t *temp = new uint8_t[size_];
-    CUDA_CHECK(cudaMemcpy(temp, data_, size_, cudaMemcpyDeviceToHost));
-    CUDA_CHECK(cudaFree(data_));
-    CUDA_CHECK(cudaMalloc(&data_, size));
-    CUDA_CHECK(cudaMemcpy(data_, temp, size_, cudaMemcpyHostToDevice));
-    delete[] temp;
-  } else {
-    CUDA_CHECK(cudaMalloc(&data_, size));
-  }
-
-  size_ = size;
-}
-}
@@ -1,35 +0,0 @@
-#pragma once
-
-#include <cmath>
-#include <cstdint>
-
-#include "common/definitions.h"
-
-namespace marian {
-
-class DeviceGPU {
-private:
-  uint8_t* data_;
-  size_t size_;
-  DeviceId deviceId_;
-  size_t alignment_;
-
-  size_t align(size_t size) {
-    return ceil(size / (float)alignment_) * alignment_;
-  }
-
-public:
-  DeviceGPU(DeviceId deviceId, size_t alignment = 256)
-      : data_(0), size_(0), deviceId_(deviceId), alignment_(alignment) {}
-
-  ~DeviceGPU();
-
-  void reserve(size_t size);
-
-  uint8_t* data() { return data_; }
-
-  size_t size() { return size_; }
-
-  DeviceId getDevice() { return deviceId_; }
-};
-}
src/tensors/dispatch.h (new file, 38 lines)
@@ -0,0 +1,38 @@
+#pragma once
+
+#include "common/definitions.h"
+#include "tensors/tensor.h"
+
+#define DISPATCH1(Function, Arg1)                               \
+  namespace gpu {                                               \
+    void Function(Arg1);                                        \
+  }                                                             \
+  namespace cpu {                                               \
+    void Function(Arg1);                                        \
+  }                                                             \
+  void Function(Arg1 arg1) {                                    \
+    if(arg1->getBackend()->getDevice().type == DeviceType::gpu) \
+      gpu::Function(arg1);                                      \
+    else                                                        \
+      cpu::Function(arg1);                                      \
+  }
+
+#define DISPATCH2(Function, Arg1, Arg2)                         \
+  namespace gpu {                                               \
+    void Function(Arg1, Arg2);                                  \
+  }                                                             \
+  namespace cpu {                                               \
+    void Function(Arg1, Arg2);                                  \
+  }                                                             \
+  static inline void Function(Arg1 arg1, Arg2 arg2) {           \
+    if(arg1->getBackend()->getDevice().type == DeviceType::gpu) \
+      gpu::Function(arg1, arg2);                                \
+    else                                                        \
+      cpu::Function(arg1, arg2);                                \
+  }
+
+namespace marian {
+
+DISPATCH2(Dropout, Tensor, float)
+
+}
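The new DISPATCH macros differ from the deleted backend/dispatch.h in one key way: the backend no longer travels as an explicit first argument, because the first Tensor argument already carries it (arg1->getBackend()). Expanding DISPATCH2(Dropout, Tensor, float) by hand shows what the preprocessor generates (a sketch of the expansion, not extra source):

```cpp
// What DISPATCH2(Dropout, Tensor, float) expands to, roughly:
namespace gpu {
  void Dropout(Tensor, float);   // defined in tensors/gpu/dropout.cu
}
namespace cpu {
  void Dropout(Tensor, float);   // defined in tensors/cpu/dropout.cpp
}
static inline void Dropout(Tensor arg1, float arg2) {
  if(arg1->getBackend()->getDevice().type == DeviceType::gpu)
    gpu::Dropout(arg1, arg2);
  else
    cpu::Dropout(arg1, arg2);
}
```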
src/tensors/gpu/algorithm.cu (new file, 42 lines)
@@ -0,0 +1,42 @@
+#include "tensors/gpu/algorithm.h"
+
+#include "kernels/cuda_helpers.h"
+#include "kernels/tensor_operators.h"
+
+namespace marian {
+namespace gpu {
+
+void copy(Ptr<Backend> backend, const float* begin, const float* end, float* dest) {
+  CUDA_CHECK(cudaSetDevice(backend->getDevice().no));
+  CudaCopy(begin, end, dest);
+  CUDA_CHECK(cudaStreamSynchronize(0));
+}
+
+__global__ void gFill(float *d_in, int size, float val) {
+  for(int bid = 0; bid < size; bid += blockDim.x * gridDim.x) {
+    int index = bid + threadIdx.x + blockDim.x * blockIdx.x;
+    if(index < size) {
+      d_in[index] = val;
+    }
+  }
+}
+
+void fill(Ptr<Backend> backend, float* begin, float* end, float value) {
+  CUDA_CHECK(cudaSetDevice(backend->getDevice().no));
+  int size = end - begin;
+  int threads = std::min(512, size);
+  int blocks = (size / threads) + (size % threads != 0);
+  gFill<<<blocks, threads>>>(begin, size, value);
+  CUDA_CHECK(cudaStreamSynchronize(0));
+}
+
+void setSparse(Ptr<Backend> backend,
+               const std::vector<size_t>& keys,
+               const std::vector<float>& values,
+               float* data) {
+  CUDA_CHECK(cudaSetDevice(backend->getDevice().no));
+  SetSparse(data, keys, values);
+  CUDA_CHECK(cudaStreamSynchronize(0));
+}
+
+}
+}
src/tensors/gpu/algorithm.h (new file, 12 lines)
@@ -0,0 +1,12 @@
+#pragma once
+
+#include "tensors/backend.h"
+
+namespace marian {
+namespace gpu {
+void copy(Ptr<Backend> backend, const float* begin, const float* end, float* dest);
+void fill(Ptr<Backend> backend, float* begin, float* end, float value);
+
+void setSparse(Ptr<Backend> backend, const std::vector<size_t>&, const std::vector<float>&, float*);
+}
+}
@@ -5,7 +5,7 @@
 #include <curand.h>
 
 #include "common/config.h"
-#include "graph/backend.h"
+#include "tensors/backend.h"
 
 #define CURAND_CALL(x)                                \
   do {                                                \
@@ -16,10 +16,11 @@
   } while(0)
 
 namespace marian {
+namespace gpu {
 
-class BackendGPU : public Backend {
+class Backend : public marian::Backend {
 public:
-  BackendGPU(DeviceId deviceId, size_t seed) : Backend(deviceId, seed) {
+  Backend(DeviceId deviceId, size_t seed) : marian::Backend(deviceId, seed) {
     setDevice();
     setHandles();
   }
@@ -36,13 +37,13 @@ private:
   cublasHandle_t cublasHandle_;
   curandGenerator_t curandGenerator_;
 
   void setHandles() {
     cublasHandle_ = create_handle();
     curandGenerator_ = createCurandGenerator();
   }
 
   curandGenerator_t createCurandGenerator() {
     cudaSetDevice(deviceId_.no);
     curandGenerator_t generator;
@@ -62,4 +63,6 @@ private:
     return cublasHandle;
   }
 };
+
+}
 }
@@ -1,50 +1,52 @@
-#include <cuda.h>
-#include <curand.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-#include "backend/dispatch.h"
-#include "graph/backend_gpu.h"
-
-#define CUDA_CALL(x)                                  \
-  do {                                                \
-    if((x) != cudaSuccess) {                          \
-      printf("Error at %s:%d\n", __FILE__, __LINE__); \
-      exit(1);                                        \
-    }                                                 \
-  } while(0)
-
-#define CURAND_CALL(x)                                \
-  do {                                                \
-    if((x) != CURAND_STATUS_SUCCESS) {                \
-      printf("Error at %s:%d\n", __FILE__, __LINE__); \
-      exit(1);                                        \
-    }                                                 \
-  } while(0)
-
-namespace marian {
-namespace gpu {
-
-__global__ void gScale(float* data, int n, float p) {
-  int index = threadIdx.x + blockIdx.x * blockDim.x;
-
-  while(index < n) {
-    data[index] = (data[index] < p) / p;
-    index += gridDim.x * blockDim.x;
-  }
-}
-
-void Dropout(Ptr<Backend> backend, Tensor tensor, float p) {
-  curandGenerator_t gen = std::static_pointer_cast<BackendGPU>(backend)->getCurandGenerator();
-  int n = tensor->size();
-  CURAND_CALL(curandGenerateUniform(gen, tensor->data(), n));
-
-  int numThreads = std::min(n, 512);
-  int numBlocks = n / numThreads + (n % numThreads != 0);
-
-  gScale<<<numBlocks, numThreads>>>(tensor->data(), n, 1.f - p);
-}
-
-}
-}
+#include <cuda.h>
+#include <curand.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "tensors/dispatch.h"
+#include "tensors/gpu/backend.h"
+
+#define CUDA_CALL(x)                                  \
+  do {                                                \
+    if((x) != cudaSuccess) {                          \
+      printf("Error at %s:%d\n", __FILE__, __LINE__); \
+      exit(1);                                        \
+    }                                                 \
+  } while(0)
+
+#define CURAND_CALL(x)                                \
+  do {                                                \
+    if((x) != CURAND_STATUS_SUCCESS) {                \
+      printf("Error at %s:%d\n", __FILE__, __LINE__); \
+      exit(1);                                        \
+    }                                                 \
+  } while(0)
+
+namespace marian {
+namespace gpu {
+
+__global__ void gScale(float* data, int n, float p) {
+  int index = threadIdx.x + blockIdx.x * blockDim.x;
+
+  while(index < n) {
+    data[index] = (data[index] < p) / p;
+    index += gridDim.x * blockDim.x;
+  }
+}
+
+void Dropout(Tensor tensor, float p) {
+  auto gpuBackend = std::static_pointer_cast<gpu::Backend>(tensor->getBackend());
+  curandGenerator_t gen = gpuBackend->getCurandGenerator();
+  int n = tensor->size();
+  CURAND_CALL(curandGenerateUniform(gen, tensor->data(), n));
+
+  int numThreads = std::min(n, 512);
+  int numBlocks = n / numThreads + (n % numThreads != 0);
+
+  gScale<<<numBlocks, numThreads>>>(tensor->data(), n, 1.f - p);
+}
+
+}
+}
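On the GPU the mask is built in place: curandGenerateUniform fills the tensor with u ~ U(0,1), then gScale is launched with its p parameter set to the keep probability 1-p, so each element becomes (u < 1-p) / (1-p). That is the same inverted-dropout scaling as the CPU path, since an element survives with probability 1-p and is scaled by 1/(1-p):

E[mask element] = (1 - p) * 1/(1 - p) + p * 0 = 1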
@@ -1,11 +0,0 @@
-#include <cuda.h>
-
-#include "kernels/cuda_helpers.h"
-#include "tensors/memory_piece.h"
-
-namespace marian {
-
-void MemoryPiece::insert(uint8_t* ptr, size_t num) {
-  CUDA_CHECK(cudaMemcpy(data_, ptr, num * sizeof(uint8_t), cudaMemcpyDefault));
-}
-}
@@ -34,13 +34,6 @@ public:
 
   void setPtr(uint8_t* data) { data_ = data; }
 
-  template <typename T>
-  void insert(T* ptr, size_t num) {
-    insert((uint8_t*)ptr, num * sizeof(T));
-  }
-
-  void insert(uint8_t* ptr, size_t num);
-
   friend std::ostream& operator<<(std::ostream& out, const MemoryPiece mp) {
     out << "MemoryPiece - ptr: " << std::hex << (size_t)mp.data() << std::dec
         << " size: " << mp.size();
@@ -1,159 +0,0 @@
-
-#include <cuda.h>
-#include <thrust/device_ptr.h>
-#include <thrust/device_vector.h>
-
-#include "kernels/cuda_helpers.h"
-#include "kernels/tensor_operators.h"
-#include "tensors/tensor.h"
-
-namespace marian {
-
-__global__ void gFill(float *d_in, int size, float val) {
-  for(int bid = 0; bid < size; bid += blockDim.x * gridDim.x) {
-    int index = bid + threadIdx.x + blockDim.x * blockIdx.x;
-    if(index < size) {
-      d_in[index] = val;
-    }
-  }
-}
-
-float TensorBase::get(size_t i) {
-  cudaSetDevice(deviceId_.no);
-  float temp;
-  CUDA_CHECK(
-      cudaMemcpy(&temp, data() + i, sizeof(float), cudaMemcpyDeviceToHost));
-  cudaStreamSynchronize(0);
-  return temp;
-}
-
-void TensorBase::set(size_t i, float value) {
-  cudaSetDevice(deviceId_.no);
-  CUDA_CHECK(
-      cudaMemcpy(data() + i, &value, sizeof(float), cudaMemcpyHostToDevice));
-  cudaStreamSynchronize(0);
-}
-
-void TensorBase::get(std::vector<float> &v) {
-  CUDA_CHECK(cudaSetDevice(deviceId_.no));
-  v.resize(size());
-  CUDA_CHECK(cudaMemcpy(
-      v.data(), data(), size() * sizeof(float), cudaMemcpyDeviceToHost));
-  cudaStreamSynchronize(0);
-}
-
-void TensorBase::set(float value) {
-  cudaSetDevice(deviceId_.no);
-  int threads = std::min(512, (int)size());
-  int blocks = (size() / threads) + (size() % threads != 0);
-  gFill<<<blocks, threads>>>(data(), size(), value);
-  cudaStreamSynchronize(0);
-}
-
-void TensorBase::set(const std::vector<float> &v) {
-  CUDA_CHECK(cudaSetDevice(deviceId_.no));
-  CUDA_CHECK(cudaMemcpy(
-      data(), v.data(), v.size() * sizeof(float), cudaMemcpyHostToDevice));
-  cudaStreamSynchronize(0);
-}
-
-void TensorBase::setSparse(const std::vector<size_t> &k,
-                           const std::vector<float> &v) {
-  cudaSetDevice(deviceId_.no);
-  SetSparse(data(), k, v);
-  cudaStreamSynchronize(0);
-}
-
-void TensorBase::copyFrom(Tensor in) {
-  cudaSetDevice(deviceId_.no);
-  CUDA_CHECK(cudaMemcpy(data(),
-                        (float *)in->data(),
-                        in->size() * sizeof(float),
-                        cudaMemcpyDefault));
-  cudaStreamSynchronize(0);
-}
-
-std::string TensorBase::debug() {
-  std::stringstream strm;
-  assert(shape_.size());
-  strm << shape_;
-  strm << " device=" << deviceId_;
-  strm << " ptr=" << (size_t)memory_->data();
-  strm << " bytes=" << memory_->size();
-  strm << std::endl;
-
-  // values
-  size_t totSize = shape_.elements();
-  std::vector<float> values(totSize);
-  get(values);
-
-  size_t dispCols = 5;
-  strm << std::fixed << std::setprecision(8) << std::setfill(' ');
-
-  for(int i = 0; i < values.size(); ++i) {
-    std::vector<int> dims;
-    shape().dims(i, dims);
-
-    bool disp = true;
-    for(int j = 0; j < dims.size(); ++j)
-      disp = disp && (dims[j] < dispCols || dims[j] >= shape()[j] - dispCols);
-
-    if(disp) {
-      if(dims.back() == 0) {
-        bool par = true;
-        std::vector<std::string> p;
-        for(int j = dims.size() - 1; j >= 0; --j) {
-          if(dims[j] != 0)
-            par = false;
-          p.push_back(par ? "[" : " ");
-        }
-        for(auto it = p.rbegin(); it != p.rend(); ++it)
-          strm << *it;
-        strm << " ";
-      }
-
-      strm << std::setw(12)
-           << values[i]
-           << " ";
-
-      if(dims.back() + 1 == shape().back()) {
-        for(int j = dims.size() - 1; j >= 0; --j) {
-          if(dims[j] + 1 != shape()[j])
-            break;
-          strm << "]";
-        }
-        strm << std::endl;
-      }
-
-      bool prev = true;
-      for(int j = dims.size() - 1; j >= 0; --j) {
-        if(j < dims.size() - 1)
-          prev = prev && dims[j + 1] + 1 == shape()[j + 1];
-        if(prev && dims[j] + 1 == dispCols && shape()[j] > 2 * dispCols) {
-          if(j < dims.size() - 1)
-            for(int k = 0; k <= j; ++k)
-              strm << " ";
-          strm << "... ";
-          if(j < dims.size() - 1)
-            strm << std::endl;
-          break;
-        }
-      }
-    }
-  }
-  strm << std::endl;
-  return strm.str();
-}
-
-Tensor operator<<(Tensor t, const std::vector<float> &v) {
-  t->set(v);
-  return t;
-}
-
-Tensor operator>>(Tensor t, std::vector<float> &v) {
-  t->get(v);
-  return t;
-}
-}
@@ -5,10 +5,13 @@
 #include <memory>
 #include <sstream>
 
 #include "3rd_party/exception.h"
 #include "common/definitions.h"
 #include "common/shape.h"
 #include "tensors/memory_piece.h"
+#include "tensors/backend.h"
 
+#include <algorithm>
+#include "tensors/gpu/algorithm.h"
 
 namespace marian {
 
@@ -16,11 +19,11 @@ class TensorBase : public std::enable_shared_from_this<TensorBase> {
 private:
   Ptr<MemoryPiece> memory_;
   Shape shape_;
-  DeviceId deviceId_;
+  Ptr<Backend> backend_;
 
 public:
-  TensorBase(Ptr<MemoryPiece> memory, Shape shape, DeviceId deviceId)
-      : memory_(memory), shape_(shape), deviceId_(deviceId) {}
+  TensorBase(Ptr<MemoryPiece> memory, Shape shape, Ptr<Backend> backend)
+      : memory_(memory), shape_(shape), backend_(backend) {}
 
   ~TensorBase() {}
 
@@ -39,34 +42,157 @@ public:
     return get(0);
   }
 
-  DeviceId getDevice() { return deviceId_; }
+  Ptr<Backend> getBackend() { return backend_; }
+  DeviceId getDevice() { return backend_->getDevice(); }
 
   Tensor subtensor(int offset, int size) {
     auto mem = New<MemoryPiece>(memory_->data() + sizeof(float) * offset,
                                 sizeof(float) * size);
-    return Tensor(new TensorBase(mem, {1, size}, deviceId_));
+    return New<TensorBase>(mem, Shape{1, size}, backend_);
   }
 
-  float get(size_t i);
+  float get(size_t i) {
+    float temp;
+    if(backend_->getDevice().type == DeviceType::gpu)
+      gpu::copy(backend_, data() + i, data() + i + 1, &temp);
+    else
+      std::copy(data() + i, data() + i + 1, &temp);
+    return temp;
+  }
 
-  void set(size_t i, float value);
+  void set(size_t i, float value) {
+    if(backend_->getDevice().type == DeviceType::gpu)
+      gpu::copy(backend_, &value, &value + 1, data() + i);
+    else
+      std::copy(&value, &value + 1, data() + i);
+  }
 
-  void get(std::vector<float>& v);
+  void get(std::vector<float> &v) {
+    v.resize(size());
+    if(backend_->getDevice().type == DeviceType::gpu)
+      gpu::copy(backend_, data(), data() + size(), v.data());
+    else
+      std::copy(data(), data() + size(), v.data());
+  }
 
-  void set(float value);
+  void set(const std::vector<float> &v) {
+    if(backend_->getDevice().type == DeviceType::gpu)
+      gpu::copy(backend_, v.data(), v.data() + v.size(), data());
+    else
+      std::copy(v.data(), v.data() + v.size(), data());
+  }
 
-  void set(const std::vector<float>& v);
+  void set(float value) {
+    if(backend_->getDevice().type == DeviceType::gpu)
+      gpu::fill(backend_, data(), data() + size(), value);
+    else
+      std::fill(data(), data() + size(), value);
+  }
 
-  void setSparse(const std::vector<size_t>& k, const std::vector<float>& v);
+  void setSparse(const std::vector<size_t> &k,
+                 const std::vector<float> &v) {
+    if(backend_->getDevice().type == DeviceType::gpu) {
+      gpu::setSparse(backend_, k, v, data());
+    } else {
+      for(int i = 0; i < k.size(); ++i)
+        data()[k[i]] = v[i];
+    }
+  }
 
-  void copyFrom(Tensor);
+  void copyFrom(Tensor in) {
+    if(in->getBackend()->getDevice().type == DeviceType::gpu ||
+       backend_->getDevice().type == DeviceType::gpu)
+      gpu::copy(backend_, in->data(), in->data() + in->size(), data());
+    else
+      std::copy(in->data(), in->data() + in->size(), data());
+  }
 
-  std::string debug();
+  std::string debug() {
+    std::stringstream strm;
+    assert(shape_.size());
+    strm << shape_;
+    strm << " device=" << backend_->getDevice();
+    strm << " ptr=" << (size_t)memory_->data();
+    strm << " bytes=" << memory_->size();
+    strm << std::endl;
+
+    // values
+    size_t totSize = shape_.elements();
+    std::vector<float> values(totSize);
+    get(values);
+
+    size_t dispCols = 5;
+    strm << std::fixed << std::setprecision(8) << std::setfill(' ');
+
+    for(int i = 0; i < values.size(); ++i) {
+      std::vector<int> dims;
+      shape().dims(i, dims);
+
+      bool disp = true;
+      for(int j = 0; j < dims.size(); ++j)
+        disp = disp && (dims[j] < dispCols || dims[j] >= shape()[j] - dispCols);
+
+      if(disp) {
+        if(dims.back() == 0) {
+          bool par = true;
+          std::vector<std::string> p;
+          for(int j = dims.size() - 1; j >= 0; --j) {
+            if(dims[j] != 0)
+              par = false;
+            p.push_back(par ? "[" : " ");
+          }
+          for(auto it = p.rbegin(); it != p.rend(); ++it)
+            strm << *it;
+          strm << " ";
+        }
+
+        strm << std::setw(12)
+             << values[i]
+             << " ";
+
+        if(dims.back() + 1 == shape().back()) {
+          for(int j = dims.size() - 1; j >= 0; --j) {
+            if(dims[j] + 1 != shape()[j])
+              break;
+            strm << "]";
+          }
+          strm << std::endl;
+        }
+
+        bool prev = true;
+        for(int j = dims.size() - 1; j >= 0; --j) {
+          if(j < dims.size() - 1)
+            prev = prev && dims[j + 1] + 1 == shape()[j + 1];
+          if(prev && dims[j] + 1 == dispCols && shape()[j] > 2 * dispCols) {
+            if(j < dims.size() - 1)
+              for(int k = 0; k <= j; ++k)
+                strm << " ";
+            strm << "... ";
+            if(j < dims.size() - 1)
+              strm << std::endl;
+            break;
+          }
+        }
+      }
+    }
+    strm << std::endl;
+    return strm.str();
+  }
 };
 
 typedef std::shared_ptr<TensorBase> Tensor;
 
-Tensor operator<<(Tensor t, const std::vector<float>& v);
-
-Tensor operator>>(Tensor t, std::vector<float>& v);
+static Tensor operator<<(Tensor t, const std::vector<float> &v) {
+  t->set(v);
+  return t;
+}
+
+static Tensor operator>>(Tensor t, std::vector<float> &v) {
+  t->get(v);
+  return t;
+}
 
 }
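TensorBase now lives entirely in the header and branches on the backend's device type instead of being compiled as a .cu file: GPU transfers go through the thin gpu::copy/gpu::fill/gpu::setSparse wrappers from tensors/gpu/algorithm.h, and the CPU side is plain <algorithm>. A round-trip sketch (hypothetical call site; assumes a tensor obtained from a TensorAllocator as in this diff):

```cpp
#include <vector>
#include "tensors/tensor.h"

namespace marian {

void roundTrip(Tensor t) {
  // The same member functions work on either device; the dispatch on
  // backend_->getDevice().type is hidden inside TensorBase.
  t->set(1.f);             // gpu::fill(...) or std::fill(...)
  t->set(0, 3.14f);        // single-element write at index 0
  std::vector<float> host;
  t >> host;               // operator>> calls t->get(host)
  // host[0] == 3.14f; the remaining elements == 1.f
}

}  // namespace marian
```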
@ -5,7 +5,6 @@
|
||||
|
||||
#include "common/definitions.h"
|
||||
#include "tensors/allocator.h"
|
||||
#include "tensors/device_gpu.h"
|
||||
#include "tensors/tensor.h"
|
||||
|
||||
namespace marian {
|
||||
@ -17,11 +16,13 @@ private:
|
||||
const size_t GROW = CHUNK * MBYTE;
|
||||
const size_t ALIGN = 256;
|
||||
|
||||
Ptr<Allocator<DeviceGPU>> allocator_;
|
||||
Ptr<Backend> backend_;
|
||||
Ptr<Allocator> allocator_;
|
||||
|
||||
public:
|
||||
TensorAllocator(DeviceId deviceId)
|
||||
: allocator_(New<Allocator<DeviceGPU>>(deviceId, 0, GROW, ALIGN)) {}
|
||||
TensorAllocator(Ptr<Backend> backend)
|
||||
: backend_(backend),
|
||||
allocator_(New<Allocator>(backend_->getDevice(), 0, GROW, ALIGN)) {}
|
||||
|
||||
~TensorAllocator() { clear(); }
|
||||
|
||||
@ -41,11 +42,17 @@ public:
|
||||
|
||||
void reserveExact(size_t bytes = 0) {
|
||||
size_t mbytes = bytes / MBYTE;
|
||||
LOG(info,
|
||||
"[memory] Reserving {} MB, device {}",
|
||||
mbytes,
|
||||
allocator_->getDevice());
|
||||
|
||||
if(mbytes == 0) {
|
||||
LOG(info,
|
||||
"[memory] Reserving {} B, device {}",
|
||||
bytes,
|
||||
allocator_->getDevice());
|
||||
} else {
|
||||
LOG(info,
|
||||
"[memory] Reserving {} MB, device {}",
|
||||
mbytes,
|
||||
allocator_->getDevice());
|
||||
}
|
||||
allocator_->reserve(bytes);
|
||||
}
|
||||
|
||||
@ -59,7 +66,7 @@ public:
|
||||
if(!t || t->shape() != shape) {
|
||||
int size = shape.elements();
|
||||
auto mem = allocator_->alloc<float>(size);
|
||||
t = Tensor(new TensorBase(mem, shape, allocator_->getDevice()));
|
||||
t = Tensor(new TensorBase(mem, shape, backend_));
|
||||
}
|
||||
}
|
||||
|
||||
@ -68,11 +75,11 @@ public:
|
||||
Tensor asTensor() {
|
||||
auto mem = allocator_->memory();
|
||||
int size = mem->size() / sizeof(float);
|
||||
return Tensor(new TensorBase(mem, {1, size}, allocator_->getDevice()));
|
||||
return Tensor(new TensorBase(mem, {1, size}, backend_));
|
||||
}
|
||||
|
||||
size_t size() { return allocator_->size() / sizeof(float); }
|
||||
|
||||
Ptr<Allocator<DeviceGPU>> allocator() { return allocator_; }
|
||||
Ptr<Allocator> allocator() { return allocator_; }
|
||||
};
|
||||
}
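
The allocator now takes its placement from a Backend instead of a raw DeviceId, and reserveExact() logs in bytes when a request rounds down to 0 MB. A minimal usage sketch of the refactored interface; backend stands for whatever a graph's getBackend() returns:

// Sketch: allocate a small {2, 3} float tensor through the new interface.
auto ta = New<TensorAllocator>(backend);  // backend: Ptr<Backend>
ta->reserveExact(6 * sizeof(float));      // logs "[memory] Reserving 24 B, device ..."
Tensor t;
ta->allocate(t, {2, 3});                  // hands out a view into the reserved block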

@ -1,82 +0,0 @@
#pragma once

// This file is part of the Marian toolkit.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.

#include <cstring>

#include "tensors/tensor.h"

namespace marian {

class TensorCPU : public TensorBase {
public:
  TensorCPU(float* data, Shape shape) : TensorBase(data, shape) {}

  float get(size_t i) { return data_[i]; }

  void set(size_t i, float value) { data_[i] = value; }

  void get(std::vector<float>& v) {
    v.resize(size());
    std::copy(data_, data_ + size(), v.begin());
  }

  void set(float value) { std::fill(data_, data_ + size(), value); }

  void set(const std::vector<float>& v) {
    std::copy(v.begin(), v.end(), data_);
  }
};

class DeviceCPU {
private:
  float* data_;
  size_t size_;

public:
  DeviceCPU() : data_(0), size_(0) {}

  ~DeviceCPU() {
    if(data_)
      delete[] data_;
  }

  typedef TensorCPU tensor_type;

  void reserve(size_t size) {
    ABORT_IF(size < size_, "New size must be larger than old size");
    float* temp = new float[size];

    if(data_) {
      std::memcpy(temp, data_, size_ * sizeof(float));
      delete[] data_;
    }

    data_ = temp;
    size_ = size;
  }

  float* data() { return data_; }

  size_t capacity() { return size_; }
};
}

@ -17,9 +17,9 @@ endforeach(test)

# Testing apps
add_executable(logger_test logger_test.cpp)
add_executable(dropout_test dropout_test.cpp)
#cuda_add_executable(bn_test bn_test.cu)
cuda_add_executable(pooling_test pooling_test.cu)
cuda_add_executable(dropout_test dropout_test.cu)
#cuda_add_executable(marian_test marian_test.cu)
cuda_add_executable(tensor_test tensor_test.cu)

@ -5,55 +5,11 @@

#include "3rd_party/exception.h"
#include "tensors/allocator.h"
#include "tensors/device_gpu.h"

class DeviceCPU {
private:
  uint8_t* data_;
  size_t size_;
  size_t alignment_;

public:
  DeviceCPU(size_t device, size_t alignment = 256)
      : data_(0), size_(0), alignment_(alignment) {}

  ~DeviceCPU() { delete[] data_; }

  size_t align(size_t size) {
    return ceil(size / (float)alignment_) * alignment_;
  }

  void reserve(size_t size) {
    size = align(size);
    ABORT_IF(size < size_, "New size must be larger than old size");

    if(data_) {
      // Allocate memory by going through host memory
      uint8_t* temp = new uint8_t[size_];
      std::copy(data_, data_ + size_, temp);
      std::fill(data_, data_ + size_, 0);
      delete[] data_;
      data_ = new uint8_t[size];
      std::copy(temp, temp + size_, data_);
      delete[] temp;
    } else {
      data_ = new uint8_t[size];
    }

    size_ = size;
  }

  uint8_t* data() { return data_; }

  size_t size() { return size_; }

  size_t getDevice() { return 0; }
};

int main(int argc, char** argv) {
  using namespace marian;

  auto a = New<Allocator<DeviceGPU>>({0, DeviceType::gpu}, 0, 30000, 256);
  auto a = New<Allocator>({0, DeviceType::gpu}, 0, 30000, 256);
  std::cerr << "Size: " << a->size() << std::endl;

  auto mem1 = a->alloc<int>(100000);
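
One detail in the deleted helper class is worth flagging: align() rounds up by going through float, ceil(size / (float)alignment_) * alignment_, which can misround once sizes approach 2^24, where float can no longer represent every integer. A pure-integer alternative, offered only as a sketch and not as what the repository does:

// Integer ceiling division avoids the float round-trip.
inline size_t align(size_t size, size_t alignment) {
  return ((size + alignment - 1) / alignment) * alignment;
}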

@ -1,13 +1,9 @@
#include <cuda.h>
#include <curand.h>
#include <stdio.h>
#include <stdlib.h>
#include <boost/chrono.hpp>
#include <boost/timer/timer.hpp>
#include <vector>

#include "common/config.h"
#include "graph/node_initializers.h"
#include "marian.h"

using namespace marian;
@ -16,8 +12,11 @@ using namespace keywords;
int main(int argc, char** argv) {
  auto c = New<Config>(argc, argv);

  auto type = c->get<bool>("cpu") ? DeviceType::cpu : DeviceType::gpu;
  DeviceId deviceId{0, type};

  auto g = New<ExpressionGraph>();
  g->setDevice({0, DeviceType::gpu});
  g->setDevice(deviceId);
  g->reserveWorkspaceMB(512);

  for(int i = 0; i < 10; ++i) {
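
With the device now read from the configuration, one binary can exercise both dropout backends. Assuming the cpu option is registered as a boolean flag elsewhere in Config (only the lookup is visible in this diff), running the test with --cpu routes the graph through the CPU dropout path, while the default stays on GPU device 0.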

@ -53,3 +53,53 @@ TEST_CASE("Expression graph can be initialized with constant values",
    REQUIRE(values == v);
  }
}

TEST_CASE("Graph device is set (cpu)", "[graph]") {
  auto graph = New<ExpressionGraph>();

  graph->setDevice({0, DeviceType::cpu});

  DeviceId testId{0, DeviceType::cpu};
  REQUIRE(graph->getDevice() == testId);
}

TEST_CASE("Expression graph can be initialized with constant values (cpu)",
          "[graph]") {
  auto graph = New<ExpressionGraph>();
  graph->setDevice({0, DeviceType::cpu});
  graph->reserveWorkspaceMB(4);

  std::vector<float> values;

  SECTION("initializing with zero (cpu)") {
    graph->clear();
    values.clear();
    auto zeros = graph->param("0s", {2, 5}, keywords::init = inits::zeros);
    graph->forward();

    zeros->val()->get(values);
    REQUIRE(values == std::vector<float>(10, 0.0f));
  }

  SECTION("initializing with ones (cpu)") {
    graph->clear();
    values.clear();
    auto ones = graph->param("1s", {2, 5}, keywords::init = inits::ones);
    graph->forward();

    ones->val()->get(values);
    REQUIRE(values == std::vector<float>(10, 1.0f));
  }

  SECTION("initializing from vector (cpu)") {
    graph->clear();
    values.clear();
    std::vector<float> v({1, 2, 3, 4, 5, 6});
    auto vals = graph->param("vs", {2, 3}, keywords::init = inits::from_vector(v));
    graph->forward();

    REQUIRE(values.empty());
    vals->val()->get(values);
    REQUIRE(values == v);
  }
}
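
The CPU test cases mirror the GPU ones section for section, so extending coverage to another initializer is mechanical. A sketch for a constant-fill case, assuming marian exposes inits::from_value (it does not appear in this diff):

SECTION("initializing with a constant (cpu)") {
  graph->clear();
  values.clear();
  auto twos = graph->param("2s", {2, 5}, keywords::init = inits::from_value(2.0f));
  graph->forward();

  twos->val()->get(values);
  REQUIRE(values == std::vector<float>(10, 2.0f));
}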

@ -93,12 +93,12 @@ void AsyncGraphGroup::init(Ptr<data::Batch> batch) {

  int pos = 0;
  // parameter sharding
  for(auto device : devices_) {
  for(auto graph : graphs_) {
    int __size__ = min(shardSize_, totalSize);
    totalSize -= __size__;

    Tensor param;
    Ptr<TensorAllocator> allocator = New<TensorAllocator>(DeviceId{device, DeviceType::gpu});
    Ptr<TensorAllocator> allocator = New<TensorAllocator>(graph->getBackend());
    allocator->reserveExact(__size__ * sizeof(float));
    allocator->allocate(param, {1, __size__});
    paramsAlloc_.push_back(allocator);
@ -112,11 +112,11 @@ void AsyncGraphGroup::init(Ptr<data::Batch> batch) {
  if(grads_.size() == 0) {
    int totalSize = graphs_[0]->params()->vals()->size();

    for(auto device : devices_) {
    for(auto graph : graphs_) {
      int __size__ = min(shardSize_, totalSize);
      totalSize -= __size__;
      Tensor grad_;
      Ptr<TensorAllocator> allocator_ = New<TensorAllocator>(DeviceId{device, DeviceType::gpu});
      Ptr<TensorAllocator> allocator_ = New<TensorAllocator>(graph->getBackend());

      allocator_->reserveExact(__size__ * sizeof(float));
      allocator_->allocate(grad_, {1, __size__});
@ -129,11 +129,11 @@ void AsyncGraphGroup::init(Ptr<data::Batch> batch) {
  int totalSize = graphs_[0]->params()->vals()->size();

  int i = 0;
  for(auto device : devices_) {
  for(auto graph : graphs_) {
    int __size__ = min(shardSize_, totalSize);
    totalSize -= __size__;
    Tensor paramAvg;
    Ptr<TensorAllocator> allocator = New<TensorAllocator>(DeviceId{device, DeviceType::gpu});
    Ptr<TensorAllocator> allocator = New<TensorAllocator>(graph->getBackend());

    allocator->reserveExact(__size__ * sizeof(float));
    allocator->allocate(paramAvg, {1, __size__});
@ -187,7 +187,7 @@ void AsyncGraphGroup::execute(Ptr<data::Batch> batch) {
  Tensor gradients;
  if(tau_ > 1) {
    if(t == 0) {
      accAlloc = New<TensorAllocator>(graph->getDevice());
      accAlloc = New<TensorAllocator>(graph->getBackend());
      accAlloc->reserveExact(graph->params()->grads()->memory()->size());
      accAlloc->allocate(accGradients, graph->params()->grads()->shape());
      accGradients->set(0);
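
All three sharding loops above share one piece of arithmetic: each graph takes min(shardSize_, totalSize) elements and totalSize shrinks until it hits zero, so the final shard absorbs the remainder. A standalone sketch of that computation, assuming shardSize_ is the ceiling of totalSize over the number of graphs, which the loop structure implies:

// Sketch: split totalSize parameters into per-graph shard sizes.
#include <algorithm>
#include <vector>

std::vector<int> shardSizes(int totalSize, int nGraphs) {
  int shardSize = (totalSize + nGraphs - 1) / nGraphs;  // ceiling division
  std::vector<int> sizes;
  for(int g = 0; g < nGraphs; ++g) {
    int size = std::min(shardSize, totalSize);
    totalSize -= size;
    sizes.push_back(size);  // e.g. 10 params over 4 graphs -> 3, 3, 3, 1
  }
  return sizes;
}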

@ -8,9 +8,9 @@

namespace marian {

Tensor AsyncGraphGroupDrop::newTensor(int size, DeviceId deviceId) {
Tensor AsyncGraphGroupDrop::newTensor(int size, Ptr<Backend> backend) {
  Tensor t;
  Ptr<TensorAllocator> allocator_ = New<TensorAllocator>(deviceId);
  Ptr<TensorAllocator> allocator_ = New<TensorAllocator>(backend);
  allocator_->reserveExact(size * sizeof(float));
  allocator_->allocate(t, {1, size});
  allocators.push_back(allocator_);
@ -79,14 +79,13 @@ void AsyncGraphGroupDrop::pushGradients(Tensor newGrads,
                                        size_t batch_words,
                                        int device_id) {
  if(pushStep_[device_id]++ <= dropping_warmup) {
    std::cout << "WARMUP" << std::endl;
    AsyncGraphGroup::pushGradients(newGrads, batch_words, device_id);
    return;
  }

  // get the sparse gradient
  pushDropper_[device_id]->dropGraph(
      newGrads, pushSparseGradient_[device_id],
      droping_rate, dropping_momentum);

  SparseTensor newSparseGrads = pushSparseGradient_[device_id];
@ -146,13 +145,12 @@ void AsyncGraphGroupDrop::init(Ptr<data::Batch> batch) {
    fetchStep_.push_back(0);
    pushStep_.push_back(0);

    size_t device = devices_[i];
    // temporary tensor to compute parameter delta before fetching
    paramsDelta_.push_back(newTensor(shardSize, {device, DeviceType::gpu}));
    paramsDelta_.push_back(newTensor(shardSize, graphs_[i]->getBackend()));

    // tensors to store local params history
    for(int h_id = 0; h_id < devices_.size(); h_id++) {
      Tensor tmp = newTensor(params_[i]->size(), {device, DeviceType::gpu});
      Tensor tmp = newTensor(params_[i]->size(), graphs_[i]->getBackend());
      tmp->copyFrom(params_[i]);
      paramsLocal_[h_id].push_back(tmp);
    }
@ -162,23 +160,23 @@ void AsyncGraphGroupDrop::init(Ptr<data::Batch> batch) {

    // N-dropper for fetch
    std::vector<GradientDrop> tmpDropper;
    for(int i = 0; i < devices_.size(); i++)
    for(auto device : devices_)
      tmpDropper.push_back(GradientDrop(new GradientDropBase()));
    fetchDropper.push_back(tmpDropper);

    // sparse tensor to store sparsified gradients per-device
    pushSparseGradient_.push_back(
        SparseTensor(new SparseTensorBase(sparseCap, {device, DeviceType::gpu})));
        SparseTensor(new SparseTensorBase(sparseCap, graphs_[i]->getBackend())));

    pushShardedSparseGradient_.push_back(
        SparseTensor(new SparseTensorBase(sparseCap, {device, DeviceType::gpu})));
        SparseTensor(new SparseTensorBase(sparseCap, graphs_[i]->getBackend())));
    fetchSparseGradient_.push_back(SparseTensor(
        new SparseTensorBase(sparseCap / devices_.size(), {device, DeviceType::gpu})));
        new SparseTensorBase(sparseCap / devices_.size(), graphs_[i]->getBackend())));

    std::vector<SparseTensor> tmp;
    for(int i = 0; i < devices_.size(); i++)
      tmp.push_back(SparseTensor(
          new SparseTensorBase(sparseCap / devices_.size(), {device, DeviceType::gpu})));
          new SparseTensorBase(sparseCap / devices_.size(), graphs_[i]->getBackend())));
    fetchShardedSparseGradient_.push_back(tmp);
  }

@ -31,7 +31,7 @@ class AsyncGraphGroupDrop : public AsyncGraphGroup {

  std::vector<Ptr<TensorAllocator>> allocators;

  Tensor newTensor(int size, DeviceId deviceId);
  Tensor newTensor(int size, Ptr<Backend> backend);

protected:
  void init(Ptr<data::Batch> batch);

@ -19,9 +19,9 @@ void MultiNodeGraphGroup::setScheduler(Ptr<Scheduler> scheduler) {
/**
 * Allocate new tensor on given GPU and store allocator.
 */
Tensor MultiNodeGraphGroup::newTensor(int size, DeviceId deviceId) {
Tensor MultiNodeGraphGroup::newTensor(int size, Ptr<Backend> backend) {
  Tensor t;
  Ptr<TensorAllocator> allocator = New<TensorAllocator>(deviceId);
  Ptr<TensorAllocator> allocator = New<TensorAllocator>(backend);
  allocator->reserveExact(size * sizeof(float));
  allocator->allocate(t, {1, size});
  allocators_.push_back(allocator);
@ -148,14 +148,12 @@ void MultiNodeGraphGroup::initClientCommOverlapVars() {
void MultiNodeGraphGroup::initClientCommOverlapGpuTensors() {
  size_t modelSize = clientGraphs_[0]->params()->vals()->size();
  for(int client = 0; client < devices_.size(); client++) {
    DeviceId deviceId{devices_[client], DeviceType::gpu};

    // Communication overlap buffer (for grads + params)
    Tensor commOverlapBuffer = newTensor(modelSize, deviceId);
    Tensor commOverlapBuffer = newTensor(modelSize, clientGraphs_[client]->getBackend());
    commOverlapBuffer->copyFrom(clientGraphs_[0]->params()->vals());
    clientCommOverlapBuffersGPU_.push_back(commOverlapBuffer);
    // Gradients local sum buffer
    Tensor sumGrads = newTensor(modelSize, deviceId);
    Tensor sumGrads = newTensor(modelSize, clientGraphs_[client]->getBackend());
    sumGrads->set(0);
    clientSummedGradsGPU.push_back(sumGrads);
    // Local optimizer to apply summed gradients
@ -207,12 +205,11 @@ void MultiNodeGraphGroup::calculateShardSizes() {
void MultiNodeGraphGroup::initShardGpuTensors() {
  size_t offset = 0;
  for(int shard = 0; shard < devices_.size(); shard++) {
    DeviceId deviceId{devices_[shard], DeviceType::gpu};
    Tensor gpuParams = newTensor(shardSizes_[shard], deviceId);
    Tensor gpuParams = newTensor(shardSizes_[shard], clientGraphs_[shard]->getBackend());
    gpuParams->copyFrom(clientGraphs_[0]->params()->vals()->subtensor(
        offset, shardSizes_[shard]));
    shardParams_.push_back(gpuParams);
    shardGrads_.push_back(newTensor(shardSizes_[shard], deviceId));
    shardGrads_.push_back(newTensor(shardSizes_[shard], clientGraphs_[shard]->getBackend()));
  }
}
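
After this change, newTensor() is the single choke point for placement: every helper buffer inherits its device from a graph's backend rather than from a hand-assembled DeviceId. A short usage sketch in the style of the code above:

// Sketch: a scratch buffer on the same device as client graph 0,
// seeded from that graph's parameters.
Tensor scratch = newTensor(modelSize, clientGraphs_[0]->getBackend());
scratch->copyFrom(clientGraphs_[0]->params()->vals());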

@ -217,7 +217,7 @@ protected:
  /**
   * Allocate new tensor on given GPU and store allocator.
   */
  Tensor newTensor(int size, DeviceId deviceId);
  Tensor newTensor(int size, Ptr<Backend> backend);

  /**
   * Setup training environment and launch server thread and (if enabled) client

@ -60,7 +60,7 @@ void SyncGraphGroup::execute(Ptr<data::Batch> batch) {
  for(auto graph : graphs_) {
    int __size__ = min(shardSize_, totalSize);

    auto paramsAlloc = New<TensorAllocator>(graph->getDevice());
    auto paramsAlloc = New<TensorAllocator>(graph->getBackend());
    paramsAllocs_.push_back(paramsAlloc);

    paramsAlloc->reserveExact(3 * __size__ * sizeof(float));
@ -87,7 +87,7 @@ void SyncGraphGroup::execute(Ptr<data::Batch> batch) {
    int __size__ = min(shardSize_, totalSize);
    totalSize -= __size__;
    Tensor paramAvg;
    auto allocator = New<TensorAllocator>(graph->getDevice());
    auto allocator = New<TensorAllocator>(graph->getBackend());

    allocator->reserveExact(__size__ * sizeof(float));
    allocator->allocate(paramAvg, {1, __size__});

@ -45,9 +45,9 @@ __global__ void gFindSubtensor(int* indices,
  resultEnd[0] = idx;
}

SparseTensorBase::SparseTensorBase(int capacity, DeviceId deviceId)
    : deviceId_(deviceId), capacity_(capacity) {
  cudaSetDevice(deviceId_.no);
SparseTensorBase::SparseTensorBase(int capacity, Ptr<Backend> backend)
    : backend_(backend), capacity_(capacity) {
  cudaSetDevice(backend_->getDevice().no);
  CUDA_CHECK(cudaMalloc(&data_, sizeof(float) * capacity));
  CUDA_CHECK(cudaMalloc(&indices_, sizeof(int) * capacity));

@ -58,8 +58,8 @@ SparseTensorBase::SparseTensorBase(int capacity, DeviceId deviceId)
SparseTensorBase::SparseTensorBase(float* data,
                                   int* indices,
                                   int size,
                                   DeviceId deviceId)
    : deviceId_(deviceId) {
                                   Ptr<Backend> backend)
    : backend_(backend) {
  data_ = data;
  indices_ = indices;
  size_ = size;
@ -93,7 +93,7 @@ void SparseTensorBase::copyFrom(float* data,
  size_ = size;
  if(size == 0)
    return;
  cudaSetDevice(deviceId_.no);
  cudaSetDevice(backend_->getDevice().no);

  cudaMemcpy(data_, data, size * sizeof(float), cudaMemcpyDefault);
  if(!data_only)
@ -107,8 +107,8 @@ void SparseTensorBase::copyFrom(std::shared_ptr<SparseTensorBase> t,
  copyFrom(t->data(), t->indices(), t->size(), data_only);
}

DeviceId SparseTensorBase::getDevice() {
  return deviceId_;
Ptr<Backend> SparseTensorBase::getBackend() {
  return backend_;
}

void SparseTensorBase::setSize(int size) {
@ -117,7 +117,7 @@ void SparseTensorBase::setSize(int size) {

// return the dense representation of this tensor
void SparseTensorBase::toDense(Tensor t, int offset) {
  cudaSetDevice(deviceId_.no);
  cudaSetDevice(backend_->getDevice().no);
  int threads = 512;
  int blocks = 1 + size_ / threads;
  t->set(0);
@ -127,7 +127,7 @@ void SparseTensorBase::toDense(Tensor t, int offset) {
}

void SparseTensorBase::scatterAdd(Tensor t, int offset) {
  cudaSetDevice(deviceId_.no);
  cudaSetDevice(backend_->getDevice().no);
  cudaStreamSynchronize(0);
  int threads = 512;
  int blocks = 1 + size_ / threads;
@ -139,7 +139,7 @@ void SparseTensorBase::scatterAdd(Tensor t, int offset) {
std::shared_ptr<SparseTensorBase> SparseTensorBase::subtensor(int pos,
                                                              int size,
                                                              int idx) {
  cudaSetDevice(deviceId_.no);
  cudaSetDevice(backend_->getDevice().no);
  cudaStreamSynchronize(0);
  int* start = gstart_ + idx;
  int* end = gend_ + idx;
@ -165,6 +165,6 @@ std::shared_ptr<SparseTensorBase> SparseTensorBase::subtensor(int pos,
  int subtensorSize = std::max(0, endOffset - startOffset + 1);
  cudaStreamSynchronize(0);
  return std::shared_ptr<SparseTensorBase>(new SparseTensorBase(
      data_ + startOffset, indices_ + startOffset, subtensorSize, deviceId_));
      data_ + startOffset, indices_ + startOffset, subtensorSize, backend_));
}
}
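
With the backend threaded through, SparseTensorBase keeps its raw CUDA calls but no longer stores its own DeviceId. A hedged sketch of the round trip the graph groups above perform; sparseCap, dropper, and the rate and momentum values are taken to match their init code:

// Sketch: sparsify gradients, then apply or densify them on one device.
SparseTensor sparse(new SparseTensorBase(sparseCap, graph->getBackend()));
dropper->dropGraph(grads, sparse, droping_rate, dropping_momentum);
sparse->scatterAdd(params);  // add the surviving entries into a dense tensor
sparse->toDense(dense, 0);   // or expand to a dense copy (zeros elsewhere)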

@ -10,15 +10,15 @@ class SparseTensorBase : public std::enable_shared_from_this<SparseTensorBase> {
  int* indices_;
  int size_;
  int capacity_;
  DeviceId deviceId_;
  Ptr<Backend> backend_;

  int* d_is_unsorted;
  int* gstart_;
  int* gend_;

public:
  SparseTensorBase(int capacity, DeviceId deviceId);
  SparseTensorBase(float* data, int* indices, int size, DeviceId deviceId);
  SparseTensorBase(int capacity, Ptr<Backend> backend);
  SparseTensorBase(float* data, int* indices, int size, Ptr<Backend> backend);

  ~SparseTensorBase() {}

@ -43,7 +43,7 @@ public:
  void scatterAdd(Tensor t, int offset = 0);
  std::shared_ptr<SparseTensorBase> subtensor(int pos, int size, int idx);

  DeviceId getDevice();
  Ptr<Backend> getBackend();

  void toDense(Tensor t, int offset);
};