merged with memory branch

2024-09-19 18:59:18 +03:00 · 2016-11-13 00:16:38 +01:00 · 2016-11-13 00:16:38 +01:00 · bb1e69b7ec
commit bb1e69b7ec
parent 40a35cf35f 822eeb738f
32 changed files with 1618 additions and 684 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -2,8 +2,8 @@ cmake_minimum_required(VERSION 3.5.1)
 set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)

 project(marian CXX)
-SET(CMAKE_CXX_FLAGS " -std=c++11 -g -O3 -funroll-loops -Wno-unused-result -Wno-deprecated")
-LIST(APPEND CUDA_NVCC_FLAGS --default-stream per-thread; -std=c++11; -g; -O3; -arch=sm_35; -lineinfo; --use_fast_math; -Xcompiler '-fPIC')
+SET(CMAKE_CXX_FLAGS " -std=c++11 -g -O3 -Ofast -Wno-unused-result -Wno-deprecated -fPIC")
+LIST(APPEND CUDA_NVCC_FLAGS --default-stream per-thread; -std=c++11; -g; -O3; -arch=sm_61; --use_fast_math; -Xcompiler '-fPIC')
 add_definitions(-DCUDA_API_PER_THREAD_DEFAULT_STREAM)
 SET(CUDA_PROPAGATE_HOST_FLAGS OFF)

--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@ -3,16 +3,17 @@ include_directories(.)
 include_directories(3rd_party)

 cuda_add_library(marian_lib
-  3rd_party/cnpy/cnpy.cpp
+#  3rd_party/cnpy/cnpy.cpp
  3rd_party/exception.cpp
  expression_graph.cu
  expression_operators.cu
  node.cu
+  node_operators.cu
  node_operators_unary.cu
  node_operators_binary.cu
-  tensor.cu
+  tensors/tensor.cu
  tensor_operators.cu
-  vocab.cpp
+#  vocab.cpp
 )

 target_link_libraries(marian_lib)
@ -23,13 +24,13 @@ cuda_add_executable(
 )

 cuda_add_executable(
-  mnist_benchmark
-  mnist_benchmark.cu
+  tensor_test
+  tensor_test.cu
 )

 cuda_add_executable(
-  xor
-  xor.cu
+  mnist_benchmark
+  mnist_benchmark.cu
 )

 #cuda_add_executable(
@ -37,18 +38,25 @@ cuda_add_executable(
 #  validate_encoder_decoder.cu
 #)

-cuda_add_executable(
-  test_nodes
-  test_nodes.cu
-)
+#cuda_add_executable(
+#  test_nodes
+#  test_nodes.cu
+#)

-#target_link_libraries(softmax_benchmark marian_lib)
-#target_link_libraries(mnist_benchmark marian_lib)
+target_link_libraries(softmax_benchmark marian_lib)
+target_link_libraries(tensor_test marian_lib)
+target_link_libraries(mnist_benchmark marian_lib)
 #target_link_libraries(validate_encoder_decoder marian_lib)
 #target_link_libraries(test_nodes marian_lib)

-foreach(exec mnist_benchmark softmax_benchmark xor test_nodes )
-  target_link_libraries(${exec} marian_lib ${EXT_LIBS} cuda cudnn curand)
+#foreach(exec mnist_benchmark softmax_benchmark test_nodes tensor_test)
+#  target_link_libraries(${exec} ${EXT_LIBS} cuda cudnn curand)
+#  cuda_add_cublas_to_target(${exec})
+#  set_target_properties(${exec} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}")
+#endforeach(exec)
+
+foreach(exec mnist_benchmark tensor_test softmax_benchmark)
+  target_link_libraries(${exec} ${EXT_LIBS} cuda cudnn curand)
  cuda_add_cublas_to_target(${exec})
  set_target_properties(${exec} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}")
 endforeach(exec)
--- a/src/batch_generator.h
+++ b/src/batch_generator.h
@ -131,8 +131,9 @@ class BatchGenerator {
      return currentBatch_;
    }

-    void prepare() {
+    void prepare(bool shuffle=true) {
      //boost::timer::cpu_timer total;
+      if(shuffle)
        data_->shuffle();
      //std::cerr << "shuffle: " << total.format(5, "%ws") << std::endl;
      current_ = data_->begin();
--- a/src/chainable.h
+++ b/src/chainable.h
@ -100,15 +100,8 @@ struct Chainable {
    virtual ExpressionGraphPtr graph() = 0;
    virtual const Shape& shape() = 0;

-    virtual DataType val() = 0;
-    virtual DataType grad() = 0;
-
-    virtual void setVal(DataType t) {
-      UTIL_THROW2("Tensors can only be assigned to input and parameter nodes");
-    };
-    virtual void setGrad(DataType t) {
-      UTIL_THROW2("Gradients can only be assigned to parameter nodes");
-    };
+    virtual DataType& val() = 0;
+    virtual DataType& grad() = 0;
 };

 /** @brief Defines a convenience type to represent a shared pointer to a Chainable<Tensor> object. */
--- a/src/dataset.h
+++ b/src/dataset.h
@ -65,7 +65,7 @@ class Input {
    /** @brief Constructs a new Input object with the specified Shape */
    Input(const Shape& shape)
    : shape_(shape),
-      data_(new Data(shape_.totalSize(), 0.0f)) {}
+      data_(new Data(shape_.elements(), 0.0f)) {}

    /** @brief Gets an iterator pointing to the beginning of this object's ::Data */
    Data::iterator begin() {
--- a/src/definitions.h
+++ b/src/definitions.h
@ -25,6 +25,9 @@
 #include <string>
 #include <functional>
 #include <memory>
+#include <cuda.h>
+
+#include "shape.h"

 namespace marian {
  /** @brief Creates shared_ptr of any type, passes all arguments to any available constructor */
@ -33,7 +36,6 @@ namespace marian {
    return std::shared_ptr<T>(new T(std::forward<Args>(args)...));
  }

-  const size_t SHAPE_SIZE = 2;

  typedef float Float;

@ -44,113 +46,14 @@ namespace marian {
   * In that case, this placeholder would be used to specify that the batch size value will be defined at some later point.
   */
  const int whatevs{-1};
-
-  /**
-   * @brief Represents the size of each dimension in a tensor.
-   *
-   * Note: this class currently is hard-coded to 2 dimensions.
-   *       This is likely to change.
-   */
-  class Shape {
-    private:
-      int shape_[SHAPE_SIZE];
-
-    public:
-
-      /**
-       * @brief Constructs a default shape.
-       *
-       * This default shape has two dimensions.
-       * The size of each dimension is 1.
-       */
-      Shape() : shape_{1, 1} { }
-
-      /**
-       * @brief Constructs a shape.
-       *
-       * @param i A list of integers representing the size of each dimension.
-       */
-      Shape(std::initializer_list<int> il) {
-       std::copy(il.begin(), il.end(), begin());
 }

-      /**
-       * @brief Gets a reference to the int representing the size of the <code>i</code>th dimension represented by this object.
-       *
-       * @return a reference to the int representing the size of the <code>i</code>th dimension represented by this object
-       */
-      int& operator[](int i) {
-        return shape_[i];
-      }
-
-      /**
-       * @brief Gets the size of the <code>i</code>th dimension represented by this object.
-       *
-       * @return the size of the <code>i</code>th dimension represented by this object
-       */
-      const int& operator[](int i) const {
-        return shape_[i];
-      }
-
-	  /**
-	   * @brief Gets the number of dimensions represented by this object
-	   *
-	   * @return the number of dimensions represented by this object
-	   */
-      size_t size() const {
-        return SHAPE_SIZE;
-      }
-
-      /**
-       * @brief Gets the total number of elements in a tensor of this shape.
-       *
-       * For example, if this shape represents a 5x100 tensor, this method would return 500.
-       *
-       * @return the total number of elements in a tensor of this shape
-       */
-      size_t totalSize() const {
-        size_t s = 1;
-        for(int i = 0; i < size(); ++i)
-          s *= shape_[i];
-        return s;
-      }
-
-      /** @brief Gets a pointer to an int that specifies the size of the first dimension represented by this object */
-      int* begin() { return shape_; }
-
-      /** @brief Gets a pointer to an int that specifies the size of the last dimension represented by this object */
-      int* end() { return shape_ + SHAPE_SIZE; }
-
-      /** @brief Gets a const pointer to an int that specifies the size of the first dimension represented by this object */
-      const int* begin() const { return shape_; }
-
-      /** @brief Gets a const pointer to an int that specifies the size of the last dimension represented by this object */
-      const int* end() const { return shape_+ SHAPE_SIZE; }
-
-      /**
-       * @brief Tests this object for equality against another <code>Shape</code> object.
-       *
-       * @return <code>true</code> if the size of each dimension in this object
-       *         is equal to the size of the corresponding dimension in the other object,
-       *         <code>false</code> otherwise
-       */
-      bool operator==(const Shape& other) const {
-        return std::equal(begin(), end(), other.begin());
-      }
-
-      /**
-       * @brief Tests this object for inequality against another <code>Shape</code> object.
-       */
-      bool operator!=(const Shape& other) const {
-        return !(*this == other);
-      }
-  };
-}

 #include "keywords.h"

 namespace marian {
-  class Tensor;
+  class TensorBase;
+  typedef std::shared_ptr<TensorBase> Tensor;

  class OptimizerBase;
  typedef std::shared_ptr<OptimizerBase> OptimizerBasePtr;
@ -173,7 +76,7 @@ namespace marian {
    KEY(value, float)
    KEY(lazy_shape, std::function<Shape()>)
    KEY(lazy_value, std::function<float()>)
-    KEY(init, std::function<void(Tensor)>)
+    KEY(init, std::function<void(Tensor&)>)

    KEY(optimizer, OptimizerBasePtr)
    KEY(batch_size, int)
--- a/src/expression_graph.h
+++ b/src/expression_graph.h
@ -28,16 +28,74 @@
 #include "definitions.h"
 #include "chainable.h"
 #include "node_operators.h"
-#include "tensor.h"
 #include "batch_generator.h"
+#include "tensors/tensor_allocator.h"
+#include "tensors/tensor_gpu.h"

 namespace marian {

-// Forward declaration of ExpressionGraph class; this enables it to be used in the following typedef of ExpressionGraphPtr
-class ExpressionGraph;
+class Parameters {
+  private:
+    /** @brief List of all parameter nodes of this expression graph. */
+    std::vector<Expr> params_;
+    TensorAllocator vals_;
+    TensorAllocator grads_;

-/** @brief A pointer to an expression graph. */
-typedef std::shared_ptr<ExpressionGraph> ExpressionGraphPtr;
+  public:
+    Parameters()
+      : vals_(newTensorAllocator<DeviceGPU>()),
+        grads_(newTensorAllocator<DeviceGPU>())
+    {}
+
+    auto begin() -> decltype(params_.begin()) {
+      return params_.begin();
+    }
+
+    auto end() -> decltype(params_.begin()) {
+      return params_.end();
+    }
+
+    size_t size() {
+      return params_.size();
+    }
+
+    size_t totalSize() {
+      size_t sum = 0;
+      for(auto p : params_)
+        sum += p->shape().elements();
+      return sum;
+    }
+
+    void add(Expr p) {
+      params_.push_back(p);
+    }
+
+    void allocateForward() {
+      if(vals_->capacity() == 0) {
+        vals_->reserveExact(totalSize());
+        for(auto p: params_)
+          if(!p->val())
+            vals_->allocate(p->val(), p->shape());
+      }
+    }
+
+    void allocateBackward() {
+      if(grads_->capacity() == 0) {
+        grads_->reserveExact(totalSize());
+        for(auto p: params_)
+          if(!p->grad())
+            grads_->allocate(p->grad(), p->shape());
+      }
+    }
+
+    Tensor vals() {
+      return vals_->asTensor();
+    }
+
+    Tensor grads() {
+      return grads_->asTensor();
+    }
+};

 template <class T, typename ...Args>
 Expr Expression(Args&& ... args);
@ -50,7 +108,7 @@ class ExpressionGraph : public std::enable_shared_from_this<ExpressionGraph> {
    /** @brief Constructs a new expression graph
     * Constructor is private to force use of New<ExpressionGraph>()
    */
-    ExpressionGraph() {}
+    ExpressionGraph() : tensors_(newTensorAllocator<DeviceGPU>()) {}

    // delete copy and move constructors
    ExpressionGraph(const ExpressionGraph&) = delete;
@ -69,8 +127,8 @@ class ExpressionGraph : public std::enable_shared_from_this<ExpressionGraph> {

      for(int i = 0; i < gInputs.size(); ++i) {
        if(!gInputs[i]->val())
-          gInputs[i]->setVal(Tensor(bInputs[i].shape()));
-        gInputs[i]->val().set(bInputs[i].begin(), bInputs[i].end());
+          tensor(gInputs[i]->val(), bInputs[i].shape());
+        gInputs[i]->val()->set(bInputs[i].data());
      }
    }

@ -80,7 +138,7 @@ class ExpressionGraph : public std::enable_shared_from_this<ExpressionGraph> {
     * Backpropogation is implemented by performing first the forward pass
     *    and then the backward pass of algorithmic differentiation (AD) on the nodes of the graph.
     *
-     * @param batchSize       XXX Marcin, could you provide a description of this param?
+     * @param batch A batch of training data
     */
    void backprop(data::BatchPtr batch) {
      forward(batch);
@ -103,6 +161,8 @@ class ExpressionGraph : public std::enable_shared_from_this<ExpressionGraph> {
     * @param batchSize       XXX Marcin, could you provide a description of this param?
     */
    void forward(data::BatchPtr batch) {
+      params_.allocateForward();
+
      for(auto&& v : tape_)
        if(!v->skipped_training())
          v->allocate(batch->dim());
@ -144,6 +204,8 @@ class ExpressionGraph : public std::enable_shared_from_this<ExpressionGraph> {
      UTIL_THROW_IF2(topNodes_.size() > 1,
        "There are more than one top most node for backward step");

+      params_.allocateBackward();
+
      for(auto&& v : tape_)
        if(!v->skipped_training())
          v->set_zero_adjoint();
@ -241,7 +303,7 @@ class ExpressionGraph : public std::enable_shared_from_this<ExpressionGraph> {
    template <typename ...Args>
    inline Expr param(Args ...args) {
      auto e = Expression<ParamNode>(shared_from_this(), args...);
-      params_.emplace_back(e);
+      params_.add(e);
      return e;
    }

@ -318,7 +380,7 @@ class ExpressionGraph : public std::enable_shared_from_this<ExpressionGraph> {
     *
     * @return the list of all parameter nodes of this expression graph
     */
-    std::vector<Expr>& params() {
+    Parameters& params() {
      return params_;
    }

@ -346,14 +408,6 @@ class ExpressionGraph : public std::enable_shared_from_this<ExpressionGraph> {
      named_.emplace(name, e);
    }

-    /**
-     * @brief Returns a pointer to the list of items contained in this graph.
-     *
-     * The items in the list will be in the order they were created.
-     *
-     * @return a pointer to the list of items contained in this graph
-     */
-
    void add(Expr node) {
      tape_.push_back(node);
      if(!node->skipped_training())
@ -364,24 +418,34 @@ class ExpressionGraph : public std::enable_shared_from_this<ExpressionGraph> {
      topNodes_.erase(node);
    }

+
+
+    template <class ...Args>
+    void tensor(Tensor& t, Args&&... args) {
+      tensors_->allocate(t, args...);
+    }
+
  private:

-    /** @brief Pointer to the list of nodes */
+    /** @brief The full list of nodes */
    Tape tape_;

    /** @brief Maps from name to expression node. */
    std::map<std::string, Expr> named_;

-    /** @brief List of all parameter nodes of this expression graph. */
-    std::vector<Expr> params_;
-
    /** @brief List of all input nodes of this expression graph. */
    std::vector<Expr> inputs_;

    /** @brief Contains all nodes with regard to which we want to calculate derivatives */
    std::unordered_set<Expr> topNodes_;
+
+    Parameters params_;
+    TensorAllocator tensors_;
 };

+/** @brief A pointer to an expression graph. */
+typedef std::shared_ptr<ExpressionGraph> ExpressionGraphPtr;
+
 template <class T, typename ...Args>
 Expr Expression(Args&& ... args) {
  auto e = Expr(new T(std::forward<Args>(args)...));
@ -389,5 +453,4 @@ Expr Expression(Args&& ... args) {
  return e;
 }

-
 }
--- a/src/expression_operators.h
+++ b/src/expression_operators.h
@ -92,39 +92,9 @@ Expr reluplus(Expr a, Expr b);

 /*********************************************************/

-// inefficient
 template <typename ...Args>
-inline Expr sum(Expr a, Args ...args) {
-  using namespace keywords;
-  Keywords params(args...);
-  int ax = params.Get(axis, whatevs);
-
-  if(ax == 0) {
-    auto lshape = [a]() -> Shape {
-      int rows = a->val().shape()[0];
-      return {1, rows};
-    };
-    Expr one = a->graph()->ones(shape={1, a->shape()[0]},
-                    lazy_shape=lshape);
-    return dot(one, a);
-  }
-  else if(ax == 1) {
-    auto lshape = [a]() -> Shape {
-      int cols = a->val().shape()[1];
-      //std::cerr << "Shape will be " << cols << " by 1." << std::endl;
-      return {cols, 1};
-    };
-    Expr one = a->graph()->ones(shape={a->shape()[1], 1},
-                        lazy_shape=lshape);
-    return dot(a, one);
-  }
-  else if(ax == 2) {
-    UTIL_THROW2("Not implemented");
-  }
-  else if(ax == 3) {
-    UTIL_THROW2("Not implemented");
-  }
-  return sum(sum(a, axis=0), axis=1);
+Expr sum(Expr a, Args ...args) {
+  return Expression<SumNodeOp>(a, args...);
 }

 Expr softmax(Expr a);
@ -133,34 +103,9 @@ Expr logsoftmax(Expr a);

 Expr argmax(Expr a);

-// inefficient
 template <typename ...Args>
-inline Expr mean(Expr a, Args ...args) {
-  using namespace keywords;
-  Keywords params(args...);
-  size_t ax = params.Get(axis, whatevs);
-
-  switch (ax) {
-    case 0:
-      return sum(a, axis=0) / a->graph()->constant(shape={1, 1},
-                                       lazy_value=[a]() -> Float {
-                                         return a->val().shape()[0];
-                                       });
-    case 1:
-      return sum(a, axis=1) / a->graph()->constant(shape={1, 1},
-                                       lazy_value=[a]() -> Float {
-                                         return a->val().shape()[1];
-                                       });
-    case 2:
-      UTIL_THROW2("Not implemented");
-    case 3:
-      UTIL_THROW2("Not implemented");
-    default:
-      return sum(a) / a->graph()->constant(shape={1, 1},
-                               lazy_value=[a]() -> Float {
-                                 return a->val().size();
-                               });
-  }
+Expr mean(Expr a, Args ...args) {
+  return Expression<MeanNodeOp>(a, args...);
 }

 Expr cross_entropy(Expr a, Expr b);
--- a/src/mnist_benchmark.cu
+++ b/src/mnist_benchmark.cu
@ -10,6 +10,10 @@
 #include "trainer.h"
 #include "models/feedforward.h"

+#include "tensors/tensor.h"
+#include "tensors/tensor_gpu.h"
+#include "tensors/tensor_allocator.h"
+
 using namespace marian;
 using namespace keywords;
 using namespace data;
--- a/src/node.cu
+++ b/src/node.cu
@ -9,67 +9,96 @@ void Node::skip_training() {
  graph_->remove_top_node(shared_from_this());
 }

+void Node::allocate(size_t batchSize) {
+  auto it1 = shape_.begin();
+  auto it2 = givenShape_.begin();
+  while(it1 != shape_.end()) {
+    if(*it2 == whatevs)
+      *it1 = batchSize;
+    it1++; it2++;
+  }
+
+  graph_->tensor(val_, shape_);
+
+  if(Has(keywords::value))
+    val_->set(Get(keywords::value, 0));
+}
+
+void Node::init_dependent() {
+  if(!adj_)
+    graph_->tensor(adj_, shape_);
+  adj_->set(1);
+}
+
+void Node::set_zero_adjoint() {
+  if(!adj_) {
+    graph_->tensor(adj_, shape_);
+  }
+  adj_->set(0);
+}
+
+
 // GPU
 void Node::calc_numeric_grad(Float delta, Tensor input, Tensor grad) {
  using namespace std;

-  size_t inputSize = GetTotalSize(input.shape());
-  size_t valSize = GetTotalSize(val_.shape());
-
-  UTIL_THROW_IF2(inputSize != GetTotalSize(grad.shape()),
-			  "inputSize != gradSize:" << inputSize << "!=" << GetTotalSize(grad.shape()));
-  UTIL_THROW_IF2(valSize != GetTotalSize(adj_.shape()),
-			  "valSize != adjSize :" << valSize << "!=" << GetTotalSize(adj_.shape()));
-
-  cerr	<< "inputSize=grad=" << Debug(input.shape())<< "=" << inputSize << " "
-		<< "valSize=adj_=" << Debug(val_.shape()) << "=" << valSize
-		<< endl;
-
-  //cerr << "input=" << input.Debug() << endl;
-  //cerr << "adj_=" << adj_.Debug() << endl;
-
-  std::vector<float> prevCalcGrad;
-  prevCalcGrad << grad;
-  //cerr << "origGrad=" << grad.Debug() << endl;
-  //output("diffGrad", diffGrad);
-
-  //output("prevCalcGrad", prevCalcGrad.begin(), prevCalcGrad.end());
-
-  Tensor newValTensor(input.shape());
-
-  // LOOP thru each element in input & add delta
-  for (size_t inputInd = 0; inputInd < inputSize; ++inputInd) {
-	  input.incr(inputInd, delta);
-	  //output("input", input.begin(), input.end());
-
-	  forward();
-
-	  val_.sum(newValTensor, inputInd);
-	  //cudaDeviceSynchronize();
-
-	  input.incr(inputInd, -delta);
-  }
-
-  std::vector<float> newVal;
-  newVal << newValTensor;
-  //cudaDeviceSynchronize();
-
-  // orig value
-  forward();
-
-  float sumValOrig = val_.sum();
-  //float sumValOrig = thrust::reduce(val_.begin(), val_.end(), (float) 0.0f, thrust::plus<float>());
-  //cudaDeviceSynchronize();
-
-  //output("newVal", newVal.begin(), newVal.end());
-
-  // calc gradient
-  Tensor prevGradTensor(input.shape());
-  thrust::copy(grad.begin(), grad.end(), prevGradTensor.begin());
-
-  Tensor gradTensor(input.shape());
-  Element(_1 = (_2 - sumValOrig) / delta, gradTensor, newValTensor);
-  Element(_1 = _2 * _3 + _4, grad, adj_, gradTensor, prevGradTensor);
+//  size_t inputSize = GetTotalSize(input->shape());
+//  size_t valSize = GetTotalSize(val_->shape());
+//
+//  UTIL_THROW_IF2(inputSize != GetTotalSize(grad->shape()),
+//			  "inputSize != gradSize:" << inputSize << "!=" << grad->shape()->elements());
+//  UTIL_THROW_IF2(valSize != GetTotalSize(adj_->shape()),
+//			  "valSize != adjSize :" << valSize << "!=" << adj_->shape()->elements());
+//
+//  cerr	<< "inputSize=grad=" << Debug(input->shape())<< "=" << inputSize << " "
+//		<< "valSize=adj_=" << Debug(val_->shape()) << "=" << valSize
+//		<< endl;
+//
+//  //cerr << "input=" << input.Debug() << endl;
+//  //cerr << "adj_=" << adj_.Debug() << endl;
+//
+//  std::vector<float> prevCalcGrad;
+//  prevCalcGrad << grad;
+//  //cerr << "origGrad=" << grad.Debug() << endl;
+//  //output("diffGrad", diffGrad);
+//
+//  //output("prevCalcGrad", prevCalcGrad.begin(), prevCalcGrad.end());
+//
+//  Tensor newValTensor(input.shape());
+//
+//  // LOOP thru each element in input & add delta
+//  for (size_t inputInd = 0; inputInd < inputSize; ++inputInd) {
+//	  input.incr(inputInd, delta);
+//	  //output("input", input.begin(), input.end());
+//
+//	  forward();
+//
+//	  val_.sum(newValTensor, inputInd);
+//	  //cudaDeviceSynchronize();
+//
+//	  input.incr(inputInd, -delta);
+//  }
+//
+//  std::vector<float> newVal;
+//  newVal << newValTensor;
+//  //cudaDeviceSynchronize();
+//
+//  // orig value
+//  forward();
+//
+//  float sumValOrig = val_.sum();
+//  //float sumValOrig = thrust::reduce(val_.begin(), val_.end(), (float) 0.0f, thrust::plus<float>());
+//  //cudaDeviceSynchronize();
+//
+//  //output("newVal", newVal.begin(), newVal.end());
+//
+//  // calc gradient
+//  Tensor prevGradTensor(input.shape());
+//  thrust::copy(grad.begin(), grad.end(), prevGradTensor.begin());
+//
+//  Tensor gradTensor(input.shape());
+//  Element(_1 = (_2 - sumValOrig) / delta, gradTensor, newValTensor);
+//  Element(_1 = _2 * _3 + _4, grad, adj_, gradTensor, prevGradTensor);
 }

 /*
--- a/src/node.h
+++ b/src/node.h
@ -22,9 +22,10 @@
 // SOFTWARE.

 #include <memory>
+#include <iostream>

 #include "keywords.h"
-#include "tensor.h"
+#include "tensors/tensor.h"
 #include "chainable.h"

 namespace marian {
@ -60,52 +61,17 @@ class Node : public Chainable<Tensor>,

    virtual bool skipped_training() { return skipTraining_; }

-    virtual void allocate(size_t batchSize) {
-      auto it1 = shape_.begin();
-      auto it2 = givenShape_.begin();
-      while(it1 != shape_.end()) {
-        if(*it2 == whatevs)
-          *it1 = batchSize;
-        it1++; it2++;
-      }
+    virtual void allocate(size_t batchSize);

-      if(Has(keywords::lazy_shape)) {
-        auto defaultShape = [this]() -> Shape { return shape_; };
-        shape_ = Get(keywords::lazy_shape, defaultShape)();
-      }
-      if(Has(keywords::lazy_value))
-        val_.allocate(shape_, Get(
-          keywords::lazy_value, []()->Float{return 0.f;})());
-      else if(Has(keywords::value))
-        val_.allocate(shape_, Get(keywords::value, 0));
-      else
-        val_.allocate(shape_);
-    }
+    virtual void init_dependent();

-    virtual void init_dependent() {
-      if(adj_) {
-        adj_.set(1);
-      }
-      else {
-        adj_.allocate(shape_, 1);
-      }
-    }
+    virtual void set_zero_adjoint();

-    virtual void set_zero_adjoint() {
-      if(adj_) {
-        adj_.set(0);
-      }
-      else {
-        adj_.allocate(shape_, 0);
-      }
-    }
-
-    virtual Tensor val()  {
+    virtual Tensor& val()  {
      return val_;
    };

-    virtual Tensor grad() {
-      //UTIL_THROW_IF2(!adj_, "Tensor has not been allocated");
+    virtual Tensor& grad() {
      return adj_;
    };

--- a/src/node_operators.cu
+++ b/src/node_operators.cu
@ -0,0 +1,37 @@
+// This file is part of the Marian toolkit.
+// Marian is copyright (c) 2016 Marcin Junczys-Dowmunt.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#include "node_operators.h"
+#include "expression_graph.h"
+
+namespace marian {
+
+  void ParamNode::allocate(size_t batchSize) {
+    // @TODO params
+    graph()->tensor(val_, shape_);
+    if(!initialized_) {
+      init_(val_);
+      initialized_ = true;
+    }
+  }
+
+
+}
--- a/src/node_operators.h
+++ b/src/node_operators.h
@ -39,12 +39,6 @@ struct InputNode : public Node {

  ~InputNode() {}

-  virtual void setVal(Tensor t)  {
-    val_ = t;
-    shape_ = t.shape();
-    //@todo, shape checking
-  }
-
  void forward() {}
  void backward() {}

@ -82,7 +76,7 @@ struct ParamNode : public Node {
  template <typename ...Args>
  ParamNode(Args ...args)
  : Node(args...),
-    init_(Get(keywords::init, [](Tensor){ })),
+    init_(Get(keywords::init, [](Tensor&){ })),
    initialized_(false)
  {
    UTIL_THROW_IF2(!Has(keywords::shape) &&
@ -90,30 +84,12 @@ struct ParamNode : public Node {
                   "Param items require shape information");
  }

-  virtual void setVal(Tensor t)  {
-    val_ = t;
-    shape_ = t.shape();
-    //@todo, shape checking
-  };
-
  ~ParamNode() {}

-  virtual void setGrad(Tensor t)  {
-    adj_ = t;
-    shape_ = t.shape();
-    //@todo, shape checking
-  };
-
  void forward() {}
  void backward() {}

-  virtual void allocate(size_t batchSize) {
-    val_.allocate(shape_);
-    if(!initialized_) {
-      init_(val_);
-      initialized_ = true;
-    }
-  }
+  virtual void allocate(size_t batchSize);

  virtual std::string graphviz() {
    std::stringstream ss;
@ -125,7 +101,7 @@ struct ParamNode : public Node {


  private:
-    std::function<void(Tensor)> init_;
+    std::function<void(Tensor&)> init_;
    bool initialized_;
 };

--- a/src/node_operators_binary.cu
+++ b/src/node_operators_binary.cu
@ -6,4 +6,21 @@ namespace marian {
    graph_->remove_top_node(a_);
    graph_->remove_top_node(b_);
  }
+
+  // We're caching the logsoftmax probabilities here because we'll need them for
+  // the backward computation.
+  void CrossEntropyNodeOp::forward() {
+    // C = sum(-B * logsoftmax(A))
+    if(!probs_)
+      graph_->tensor(probs_, a_->val()->shape());
+
+    CudnnLogSoftmax(probs_, a_->val());
+
+    if(!result_)
+      graph_->tensor(result_, a_->val()->shape());
+    Element(_1 = -_2 * _3, result_, b_->val(), probs_);
+    Sum(val_, result_, 1);
+  }
+
+
 }
--- a/src/node_operators_binary.h
+++ b/src/node_operators_binary.h
@ -1,6 +1,7 @@
 #pragma once

 #include "node.h"
+#include "thrust_functions.h"
 #include "tensor_operators.h"

 namespace marian {
@ -29,53 +30,53 @@ struct BinaryNodeOp : public Node {
  void remove_children_from_top_nodes();

  void backward_debug(Float delta) {
-	  using namespace std;
-
-	  cerr << "BinaryNodeOp::" << typeid(*this).name() << "::backward_debug()" << endl;
-
-	  std::vector<float> preCalcGradA, diffGradA, numericalGradA;
-	  preCalcGradA << a_->grad();
-	  //output("preCalcGradA", preCalcGradA);
-
-	  std::vector<float> preCalcGradB, diffGradB, numericalGradB;
-	  preCalcGradB << b_->grad();
-	  //output("preCalcGradB", preCalcGradB);
-
-	  // use df/dx to calc grad
-	  backward();
-	  cerr << "orig a_->grad()=" << a_->grad().Debug() << endl;
-	  cerr << "orig b_->grad()=" << b_->grad().Debug() << endl;
-
-	  diffGradA << a_->grad();
-	  diffGradB << b_->grad();
-
+    //using namespace std;
+    //
+    //cerr << "BinaryNodeOp::" << typeid(*this).name() << "::backward_debug()" << endl;
+    //
+    //std::vector<float> preCalcGradA, diffGradA, numericalGradA;
+    //preCalcGradA << a_->grad();
+    ////output("preCalcGradA", preCalcGradA);
+    //
+    //std::vector<float> preCalcGradB, diffGradB, numericalGradB;
+    //preCalcGradB << b_->grad();
+    ////output("preCalcGradB", preCalcGradB);
+    //
+    //// use df/dx to calc grad
+    //backward();
    //cerr << "orig a_->grad()=" << a_->grad().Debug() << endl;
    //cerr << "orig b_->grad()=" << b_->grad().Debug() << endl;
-
-	  cerr << "TENSOR A:" << endl;
-	  a_->grad().set(preCalcGradA);
-	  b_->grad().set(preCalcGradB);
-
-	  calc_numeric_grad(delta, a_->val(), a_->grad());
-	  cerr << "numerical a_->grad()=" << a_->grad().Debug() << endl;
-
-	  numericalGradA << a_->grad();
-	  outputL2Norm("TENSOR A", diffGradA, numericalGradA);
-
-
-	  cerr << "TENSOR B:" << endl;
-	  a_->grad().set(preCalcGradA);
-	  b_->grad().set(preCalcGradB);
-
-	  calc_numeric_grad(delta, b_->val(), b_->grad());
-	  cerr << "numerical b_->grad()=" << b_->grad().Debug() << endl;
-
-	  numericalGradB << b_->grad();
-	  outputL2Norm("TENSOR B", diffGradB, numericalGradB);
-
-	  // reset to diff grad
-	  a_->grad().set(diffGradA);
-	  b_->grad().set(diffGradB);
+    //
+    //diffGradA << a_->grad();
+    //diffGradB << b_->grad();
+    //
+    ////cerr << "orig a_->grad()=" << a_->grad().Debug() << endl;
+    ////cerr << "orig b_->grad()=" << b_->grad().Debug() << endl;
+    //
+    //cerr << "TENSOR A:" << endl;
+    //a_->grad().set(preCalcGradA);
+    //b_->grad().set(preCalcGradB);
+    //
+    //calc_numeric_grad(delta, a_->val(), a_->grad());
+    //cerr << "numerical a_->grad()=" << a_->grad().Debug() << endl;
+    //
+    //numericalGradA << a_->grad();
+    //outputL2Norm("TENSOR A", diffGradA, numericalGradA);
+    //
+    //
+    //cerr << "TENSOR B:" << endl;
+    //a_->grad().set(preCalcGradA);
+    //b_->grad().set(preCalcGradB);
+    //
+    //calc_numeric_grad(delta, b_->val(), b_->grad());
+    //cerr << "numerical b_->grad()=" << b_->grad().Debug() << endl;
+    //
+    //numericalGradB << b_->grad();
+    //outputL2Norm("TENSOR B", diffGradB, numericalGradB);
+    //
+    //// reset to diff grad
+    //a_->grad().set(diffGradA);
+    //b_->grad().set(diffGradB);
  }


@ -231,7 +232,7 @@ struct MultNodeOp : public BinaryNodeOp {

  virtual std::string graphviz() {
    std::stringstream ss;
-    ss << "\"" << this << "\" [shape=\"box\", label=" << label("x")
+    ss << "\"" << this << "\" [shape=\"box\", label=" << label("×")
      << ", style=\"filled\", fillcolor=\"yellow\"]" << std::endl;
    ss << "\"" << a_ << "\" -> \"" << this << "\"" << std::endl;
    ss << "\"" << b_ << "\" -> \"" << this << "\"" << std::endl << std::endl;
@ -285,22 +286,7 @@ struct CrossEntropyNodeOp : public BinaryNodeOp {
    return shape1;
  }

-  // We're caching the logsoftmax probabilities here because we'll need them for
-  // the backward computation.
-  void forward() {
-    // C = -dot(B, logsoftmax(A)).
-    if (probs_) {
-      probs_.set(0.0);
-    } else {
-      probs_.allocate(a_->val().shape(), 0.0);
-    }
-
-	CudnnLogSoftmax(probs_, a_->val());
-	if(!result_)
-	  result_.allocate(a_->val().shape());
-    Element(_1 = -_2 * _3, result_, b_->val(), probs_);
-    SumRowwise(result_, val_);
-  }
+  void forward();

  // @TODO: In most cases it's wasteful to compute the derivative with respect
  // to the second input which is typically an input node in the computation
@ -318,15 +304,15 @@ struct CrossEntropyNodeOp : public BinaryNodeOp {
      a_->grad(), adj_, probs_, b_->val());

    // Compute second input derivative.
-    Element(_1 -= _2 * _3, b_->grad(),
-			adj_, probs_);
+    Element(_1 -= _2 * _3,
+      b_->grad(), adj_, probs_);
  }

  virtual std::string graphviz() {
    std::stringstream ss;
    ss << "\"" << this << "\" [shape=\"box\", label=" << label("x-ent")
      << ", style=\"filled\", fillcolor=\"orange\"]" << std::endl;
-    ss << "\"" << a_ << "\" -> \"" << this << "\"" << std::endl << std::endl;
+    ss << "\"" << a_ << "\" -> \"" << this << "\"" << std::endl;
    ss << "\"" << b_ << "\" -> \"" << this << "\"" << std::endl << std::endl;
    return ss.str();
  };
--- a/src/node_operators_unary.h
+++ b/src/node_operators_unary.h
@ -1,7 +1,9 @@
 #pragma once

 #include "node.h"
+#include "tensors/tensor.h"
 #include "tensor_operators.h"
+#include "thrust_functions.h"

 namespace marian {

@ -26,29 +28,29 @@ struct UnaryNodeOp : public Node {

    void backward_debug(Float delta) {
        using namespace std;
-
-      cerr << "UnaryNodeOp::" << typeid(*this).name() << "::backward_numeric()" << endl;
-
-	  std::vector<float> preCalcGradA, diffGradA, numericalGradA;
-	  preCalcGradA << a_->grad();
-	  //output("preCalcGradA", preCalcGradA);
-
-	  // use df/dx to calc grad
-	  backward();
-	  cerr << "orig a_->grad()=" << a_->grad().Debug() << endl;
-	  diffGradA << a_->grad();
-
-	  a_->grad().set(preCalcGradA);
-
-	  calc_numeric_grad(delta, a_->val(), a_->grad());
-	  cerr << "numerical a_->grad()=" << a_->grad().Debug() << endl;
-
-	  numericalGradA << a_->grad();
-
-	  outputL2Norm("", diffGradA, numericalGradA);
-
-	  // reset to diff grad
-	  a_->grad().set(diffGradA);
+        //
+        //cerr << "UnaryNodeOp::" << typeid(*this).name() << "::backward_numeric()" << endl;
+        //
+        //std::vector<float> preCalcGradA, diffGradA, numericalGradA;
+        //a_->grad() >> preCalcGradA ;
+        ////output("preCalcGradA", preCalcGradA);
+        //
+        //// use df/dx to calc grad
+        //backward();
+        //cerr << "orig a_->grad()=" << a_->grad().Debug() << endl;
+        //a_->grad() >> diffGradA;
+        //
+        //a_->grad()->set(preCalcGradA);
+        //
+        //calc_numeric_grad(delta, a_->val(), a_->grad());
+        ////cerr << "numerical a_->grad()=" << a_->grad()->Debug() << endl;
+        //
+        //a_->grad() >> numericalGradA;
+        //
+        //outputL2Norm("", diffGradA, numericalGradA);
+        //
+        //// reset to diff grad
+        //a_->grad()->set(diffGradA);
    }

 };
@ -68,10 +70,6 @@ struct LogitNodeOp : public UnaryNodeOp {
            a_->grad(), adj_, val_);
  }

-  void check() {
-
-  }
-
  virtual std::string graphviz() {
    std::stringstream ss;
    ss << "\"" << this << "\" [shape=\"box\", label=" << label("logit")
@ -210,7 +208,7 @@ struct SoftmaxNodeOp : public UnaryNodeOp {
    : UnaryNodeOp(args...) { }

  void forward() {
-    CudnnSoftmax(val_, a_->val());
+    Softmax(val_, a_->val());
  }

  void backward() {
@ -292,12 +290,94 @@ struct ArgmaxNodeOp : public UnaryNodeOp {

 };

+struct SumNodeOp : public UnaryNodeOp {
+  template <typename ...Args>
+  SumNodeOp(Expr a, Args ...args)
+    : UnaryNodeOp(a, keywords::shape=newShape(a, args...), args...) { }
+
+  void forward() {
+    Sum(val_, a_->val(), Get(keywords::axis, -1));
+  }
+
+  void backward() {
+    SumBackward(a_->grad(), adj_, Get(keywords::axis, -1));
+  }
+
+  template <class ...Args>
+  Shape newShape(Expr a, Args ...args) {
+    int ax = keywords::Get(keywords::axis, -1, args...);
+    Shape shape = a->shape();
+    if(ax == 0) {
+      shape[0] = 1;
+    }
+    else if(ax == 1) {
+      shape[1] = 1;
+    }
+    else {
+      shape[0] = 1;
+      shape[1] = 1;
+    }
+    return shape;
+  }
+
+  virtual std::string graphviz() {
+    std::stringstream ss;
+    ss << "\"" << this << "\" [shape=\"box\", label="
+      << label("sum") << ", style=\"filled\", fillcolor=\"orange\"]" << std::endl;
+    ss << "\"" << a_ << "\" -> \"" << this << "\"" << std::endl << std::endl;
+    return ss.str();
+  };
+
+};
+
+struct MeanNodeOp : public UnaryNodeOp {
+  template <typename ...Args>
+  MeanNodeOp(Expr a, Args ...args)
+    : UnaryNodeOp(a, keywords::shape=newShape(a, args...), args...) { }
+
+  void forward() {
+    Sum(val_, a_->val(), Get(keywords::axis, -1), true);
+  }
+
+  void backward() {
+    SumBackward(a_->grad(), adj_, Get(keywords::axis, -1), true);
+  }
+
+  template <class ...Args>
+  Shape newShape(Expr a, Args ...args) {
+    int ax = keywords::Get(keywords::axis, -1, args...);
+    Shape shape = a->shape();
+    if(ax == 0) {
+      shape[0] = 1;
+    }
+    else if(ax == 1) {
+      shape[1] = 1;
+    }
+    else {
+      shape[0] = 1;
+      shape[1] = 1;
+    }
+    return shape;
+  }
+
+  virtual std::string graphviz() {
+    std::stringstream ss;
+    ss << "\"" << this << "\" [shape=\"box\", label="
+      << label("mean") << ", style=\"filled\", fillcolor=\"orange\"]" << std::endl;
+    ss << "\"" << a_ << "\" -> \"" << this << "\"" << std::endl << std::endl;
+    return ss.str();
+  };
+
+};
+
+
 struct LogNodeOp : public UnaryNodeOp {
  template <typename ...Args>
  LogNodeOp(Args ...args)
  : UnaryNodeOp(args...) {}

  void forward() {
+    std::cerr << val_.get() << " <-> " << a_->val().get() << std::endl;
    Element(_1 = Log(_2), val_, a_->val());
  }

--- a/src/optimizers.h
+++ b/src/optimizers.h
@ -38,29 +38,35 @@ class Sgd : public OptimizerBase {
 class Adagrad : public OptimizerBase {
  public:
    Adagrad(float eta=0.01, float eps=1e-8)
-    : eta_(eta), eps_(eps) {}
+    : eta_(eta), eps_(eps),
+      alloc_(newTensorAllocator<DeviceGPU>())
+    {}

    void update(ExpressionGraphPtr graph, data::BatchPtr batch) {
      graph->backprop(batch);

-      if(gt_.size() < graph->params().size())
-        for(auto& param : graph->params())
-          gt_.emplace_back(Tensor(param->grad().shape(), 0));
-
-      auto gtIt = gt_.begin();
-      for(auto& param : graph->params()) {
-        Element(_1 += (_2 * _2),
-                *gtIt, param->grad());
-        Element(_1 -= (eta_ / (Sqrt(_2) + eps_)) * _3,
-                param->val(), *gtIt, param->grad());
-        gtIt++;
+      if(!gt_) {
+        int totalSize = graph->params().totalSize();
+        alloc_->reserveExact(totalSize);
+        alloc_->allocate(gt_, {1, totalSize});
+        gt_->set(0);
      }
+
+      Tensor pVals = graph->params().vals();
+      Tensor pGrads = graph->params().grads();
+
+      ElementVec(_1 += (_2 * _2),
+                 gt_, pGrads);
+
+      ElementVec(_1 -= (eta_ / (Sqrt(_2) + eps_)) * _3,
+                 pVals, gt_, pGrads);
    }

  private:
    float eta_;
    float eps_;
-    std::vector<Tensor> gt_;
+    TensorAllocator alloc_;
+    Tensor gt_;
 };


@ -69,34 +75,39 @@ class Adagrad : public OptimizerBase {
 class Adam : public OptimizerBase {
  public:
    Adam(float eta=0.001, float beta1=0.9, float beta2=0.999, float eps=1e-8)
-    : eta_(eta), beta1_(beta1), beta2_(beta2), eps_(eps), t_(0) {}
+    : eta_(eta), beta1_(beta1), beta2_(beta2), eps_(eps), t_(0),
+      mtAlloc_(newTensorAllocator<DeviceGPU>()),
+      vtAlloc_(newTensorAllocator<DeviceGPU>())
+    {}

    void update(ExpressionGraphPtr graph, data::BatchPtr batch) {
      graph->backprop(batch);

-      if(mt_.size() < graph->params().size()) {
-        for(auto& param : graph->params()) {
-          mt_.emplace_back(Tensor(param->grad().shape(), 0));
-          vt_.emplace_back(Tensor(param->grad().shape(), 0));
-        }
+      if(!mt_) {
+        int totalSize = graph->params().totalSize();
+        mtAlloc_->reserveExact(totalSize);
+        mtAlloc_->allocate(mt_, {1, totalSize});
+        mt_->set(0);
+
+        vtAlloc_->reserveExact(totalSize);
+        vtAlloc_->allocate(vt_, {1, totalSize});
+        vt_->set(0);
      }

      t_++;
      float denom1 = 1 - pow(beta1_, t_);
      float denom2 = 1 - pow(beta2_, t_);

-      auto mtIt = mt_.begin();
-      auto vtIt = vt_.begin();
+      Tensor pVals = graph->params().vals();
+      Tensor pGrads = graph->params().grads();

-      for(auto& param : graph->params()) {
-        Element(_1 = (beta1_ * _1) + ((1 - beta1_) * _2),
-                *mtIt, param->grad());
-        Element(_1 = (beta2_ * _1) + ((1 - beta2_) * (_2 * _2)),
-                *vtIt, param->grad());
-        Element(_1 -= eta_ * (_2 / denom1) / (Sqrt(_3 / denom2) + eps_),
-                param->val(), *mtIt, *vtIt);
-        mtIt++; vtIt++;
-      }
+      ElementVec(_1 = (beta1_ * _1) + ((1 - beta1_) * _2),
+                 mt_, pGrads);
+      ElementVec(_1 = (beta2_ * _1) + ((1 - beta2_) * (_2 * _2)),
+                 vt_, pGrads);
+
+      ElementVec(_1 -= eta_ * (_2 / denom1) / (Sqrt(_3 / denom2) + eps_),
+                 pVals, mt_, vt_);
    }

  private:
@ -105,8 +116,11 @@ class Adam : public OptimizerBase {
    float beta2_;
    float eps_;
    size_t t_;
-    std::vector<Tensor> mt_;
-    std::vector<Tensor> vt_;
+
+    TensorAllocator mtAlloc_;
+    Tensor mt_;
+    TensorAllocator vtAlloc_;
+    Tensor vt_;
 };

 template <class Algorithm, typename ...Args>
--- a/src/param_initializers.h
+++ b/src/param_initializers.h
@ -27,7 +27,7 @@
 #include <functional>
 #include <stdint.h>

-#include "tensor.h"
+#include "tensors/tensor.h"

 namespace marian {

@ -47,58 +47,58 @@ float xor128() {
 // Use a constant seed for deterministic behaviour.
 std::default_random_engine engine(42);

-void zeros(Tensor t) {
-  t.set(0.f);
+void zeros(Tensor& t) {
+  t->set(0.f);
 }

-void ones(Tensor t) {
-  t.set(1.0f);
+void ones(Tensor& t) {
+  t->set(1.0f);
 }

 template <class Distribution>
-void distribution(Tensor t, float a, float b) {
+void distribution(Tensor& t, float a, float b) {
  //std::random_device device;
  //std::default_random_engine engine(device());
  Distribution dist(a, b);
  auto gen = std::bind(dist, engine);

-  std::vector<float> vals(t.size());
+  std::vector<float> vals(t->size());
  std::generate(begin(vals), end(vals), gen);

  t << vals;
 }

-std::function<void(Tensor)> normal(float mean = 0.0, float std = 0.05) {
-  return [mean, std](Tensor t) {
+std::function<void(Tensor&)> normal(float mean = 0.0, float std = 0.05) {
+  return [mean, std](Tensor& t) {
    distribution<std::normal_distribution<float>>(t, mean, std);
  };
 }

-std::function<void(Tensor)> uniform(float a = -0.05, float b = 0.05) {
-  return [a, b](Tensor t) {
+std::function<void(Tensor&)> uniform(float a = -0.05, float b = 0.05) {
+  return [a, b](Tensor& t) {
    distribution<std::uniform_real_distribution<float>>(t, a, b);
  };
 }

-void glorot_uniform(Tensor t) {
-  float b = sqrtf( 6.0f / (t.shape()[0] + t.shape()[1]) );
+void glorot_uniform(Tensor& t) {
+  float b = sqrtf( 6.0f / (t->shape()[0] + t->shape()[1]) );
  distribution<std::uniform_real_distribution<float>>(t, -b, b);
 }

-void xorshift(Tensor t) {
-  std::vector<float> vals(t.size());
+void xorshift(Tensor& t) {
+  std::vector<float> vals(t->size());
  for(auto&& v : vals)
    v = xor128();
  t << vals;
 }

-void glorot_normal(Tensor t) {
-  float b = sqrtf( 2.0f / (t.shape()[0] + t.shape()[1]) );
+void glorot_normal(Tensor& t) {
+  float b = sqrtf( 2.0f / (t->shape()[0] + t->shape()[1]) );
  distribution<std::uniform_real_distribution<float>>(t, -b, b);
 }

-std::function<void(Tensor)> from_vector(const std::vector<float>& v) {
-  return [v](Tensor t) {
+std::function<void(Tensor&)> from_vector(const std::vector<float>& v) {
+  return [v](Tensor& t) {
    t << v;
  };
 }
--- a/src/shape.h
+++ b/src/shape.h
@ -0,0 +1,136 @@
+#pragma once
+
+// This file is part of the Marian toolkit.
+// Marian is copyright (c) 2016 Marcin Junczys-Dowmunt.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#include <cstdint>
+
+#include "exception.h"
+
+namespace marian {
+
+  /**
+   * @brief Represents the size of each dimension in a tensor.
+   *
+   * Note: this class currently is hard-coded to four dimensions.
+   */
+
+  const size_t SHAPE_SIZE = 2;
+
+
+  struct Shape {
+      int shape_[SHAPE_SIZE];
+
+      /**
+       * @brief Constructs a default shape.
+       *
+       * This default shape has four dimensions.
+       * The size of each dimension is 1.
+       */
+      Shape() : shape_{1, 1} { }
+
+      /**
+       * @brief Constructs a shape.
+       *
+       * @param i A list of integers representing the size of each dimension.
+       */
+      Shape(std::initializer_list<int> il) {
+       std::copy(il.begin(), il.end(), begin());
+      }
+
+      Shape(const Shape& shape) {
+       std::copy(shape.begin(), shape.end(), begin());
+      }
+
+      /**
+       * @brief Gets a reference to the int representing the size of the <code>i</code>th dimension represented by this object.
+       *
+       * @return a reference to the int representing the size of the <code>i</code>th dimension represented by this object
+       */
+      __host__ __device__
+      int& operator[](int i) {
+        return shape_[i];
+      }
+
+      /**
+       * @brief Gets the size of the <code>i</code>th dimension represented by this object.
+       *
+       * @return the size of the <code>i</code>th dimension represented by this object
+       */
+      __host__ __device__
+      const int& operator[](int i) const {
+        return shape_[i];
+      }
+
+	  /**
+	   * @brief Gets the number of dimensions represented by this object
+	   *
+	   * @return the number of dimensions represented by this object
+	   */
+      size_t size() const {
+        return SHAPE_SIZE;
+      }
+
+      /**
+       * @brief Gets the total number of elements in a tensor of this shape.
+       *
+       * For example, if this shape represents a 5x100 tensor, this method would return 500.
+       *
+       * @return the total number of elements in a tensor of this shape
+       */
+      size_t elements() const {
+        size_t s = 1;
+        for(int i = 0; i < size(); ++i)
+          s *= shape_[i];
+        return s;
+      }
+
+      /** @brief Gets a pointer to an int that specifies the size of the first dimension represented by this object */
+      int* begin() { return shape_; }
+
+      /** @brief Gets a pointer to an int that specifies the size of the last dimension represented by this object */
+      int* end() { return shape_ + SHAPE_SIZE; }
+
+      /** @brief Gets a const pointer to an int that specifies the size of the first dimension represented by this object */
+      const int* begin() const { return shape_; }
+
+      /** @brief Gets a const pointer to an int that specifies the size of the last dimension represented by this object */
+      const int* end() const { return shape_+ SHAPE_SIZE; }
+
+      /**
+       * @brief Tests this object for equality against another <code>Shape</code> object.
+       *
+       * @return <code>true</code> if the size of each dimension in this object
+       *         is equal to the size of the corresponding dimension in the other object,
+       *         <code>false</code> otherwise
+       */
+      bool operator==(const Shape& other) const {
+        return std::equal(begin(), end(), other.begin());
+      }
+
+      /**
+       * @brief Tests this object for inequality against another <code>Shape</code> object.
+       */
+      bool operator!=(const Shape& other) const {
+        return !(*this == other);
+      }
+  };
+}
--- a/src/softmax_benchmark.cu
+++ b/src/softmax_benchmark.cu
@ -2,10 +2,13 @@
 #include <stdio.h>
 #include <time.h>
 #include <cudnn.h>
+#include <unistd.h>

 #include <boost/timer/timer.hpp>

-#include "tensor.h"
+#include "tensors/tensor_allocator.h"
+#include "tensors/tensor_gpu.h"
+
 #include "tensor_operators.h"
 #include "param_initializers.h"

@ -15,8 +18,12 @@ template <class F>
 void testForward(F f, size_t l,
                 const Shape& shape,
                 const std::string& desc) {
-    Tensor in(shape);
-    Tensor out(shape);
+
+    auto ta = newTensorAllocator<DeviceGPU>();
+
+    Tensor in, out;
+    ta->allocate(in, shape);
+    ta->allocate(out, shape);

    uniform(-5, 5)(in);

@ -34,9 +41,14 @@ template <class F>
 void testBackward(F f, size_t l,
                  const Shape& shape,
                  const std::string& desc) {
-    Tensor in(shape);
-    Tensor adj(shape, 1);
-    Tensor grad(shape);
+
+    auto ta = newTensorAllocator<DeviceGPU>();
+
+    Tensor in, adj, grad;
+    ta->allocate(in, shape);
+    ta->allocate(adj, shape);
+    adj->set(1);
+    ta->allocate(grad, shape);

    uniform(-5, 5)(in);

--- a/src/tensor_operators.cu
+++ b/src/tensor_operators.cu
@ -39,63 +39,73 @@ static cudnnHandle_t create_handle_dnn() {
 cublasHandle_t cublasHandle = create_handle();
 cudnnHandle_t cudnnHandle = create_handle_dnn();

-void CudnnSoftmax(Tensor out, Tensor in) {
+void CudnnSoftmax(Tensor& out, Tensor& in) {
    float alpha = 1, beta = 0;
+    auto inGpu = static_cast<TensorGPU*>(in.get());
+    auto outGpu = static_cast<TensorGPU*>(out.get());
    cudnnSoftmaxForward(cudnnHandle,
                        CUDNN_SOFTMAX_ACCURATE,
                        CUDNN_SOFTMAX_MODE_CHANNEL,
                        &alpha,
-                        in.cudnn(),
-                        in.data(),
+                        inGpu->cudnn(),
+                        inGpu->data(),
                        &beta,
-                        out.cudnn(),
-                        out.data());
+                        outGpu->cudnn(),
+                        outGpu->data());
    cudaDeviceSynchronize();
 }

-void CudnnLogSoftmax(Tensor out, Tensor in) {
+void CudnnLogSoftmax(Tensor& out, Tensor& in) {
    float alpha = 1, beta = 0;
+    auto inGpu = static_cast<TensorGPU*>(in.get());
+    auto outGpu = static_cast<TensorGPU*>(out.get());
    cudnnSoftmaxForward(cudnnHandle,
                        CUDNN_SOFTMAX_LOG,
                        CUDNN_SOFTMAX_MODE_CHANNEL,
                        &alpha,
-                        in.cudnn(),
-                        in.data(),
+                        inGpu->cudnn(),
+                        inGpu->data(),
                        &beta,
-                        out.cudnn(),
-                        out.data());
+                        outGpu->cudnn(),
+                        outGpu->data());
    cudaDeviceSynchronize();
 }

-void CudnnSoftmaxGrad(Tensor grad, Tensor adj, Tensor val) {
+void CudnnSoftmaxGrad(Tensor& grad, Tensor& adj, Tensor& val) {
    float alpha = 1, beta = 0;
+    auto valGpu = static_cast<TensorGPU*>(val.get());
+    auto adjGpu = static_cast<TensorGPU*>(adj.get());
+    auto gradGpu = static_cast<TensorGPU*>(grad.get());
    cudnnSoftmaxBackward(cudnnHandle,
                        CUDNN_SOFTMAX_ACCURATE,
                        CUDNN_SOFTMAX_MODE_CHANNEL,
                        &alpha,
-                        val.cudnn(),
-                        val.data(),
-                        adj.cudnn(),
-                        adj.data(),
+                        valGpu->cudnn(),
+                        valGpu->data(),
+                        adjGpu->cudnn(),
+                        adjGpu->data(),
                        &beta,
-                        grad.cudnn(),
-                        grad.data());
+                        gradGpu->cudnn(),
+                        gradGpu->data());
    cudaDeviceSynchronize();
 }

-void CudnnLogSoftmaxGrad(Tensor grad, Tensor adj, Tensor val) {
+void CudnnLogSoftmaxGrad(Tensor& grad, Tensor& adj, Tensor& val) {
    float alpha = 1, beta = 0;
+    auto valGpu = static_cast<TensorGPU*>(val.get());
+    auto adjGpu = static_cast<TensorGPU*>(adj.get());
+    auto gradGpu = static_cast<TensorGPU*>(grad.get());
    cudnnSoftmaxBackward(cudnnHandle,
                        CUDNN_SOFTMAX_LOG,
                        CUDNN_SOFTMAX_MODE_CHANNEL,
                        &alpha,
-                        val.cudnn(),
-                        val.data(),
-                        adj.cudnn(),
-                        adj.data(),
+                        valGpu->cudnn(),
+                        valGpu->data(),
+                        adjGpu->cudnn(),
+                        adjGpu->data(),
                        &beta,
-                        grad.cudnn(),
-                        grad.data());
+                        gradGpu->cudnn(),
+                        gradGpu->data());
    cudaDeviceSynchronize();
 }

@ -137,18 +147,18 @@ __global__ void gSubtractMax(float* out, const float* in,
  }
 }

-void SubtractMax(Tensor out, Tensor in) {
+void SubtractMax(Tensor& out, Tensor& in) {
  // Out is a m-by-k matrix, passed as input.
  // The max element of each row of Out is computed and subtracted from Out.
  // Out is both input and output.
-  size_t m = out.shape()[0];
-  size_t k = out.shape()[1];
+  size_t m = out->shape()[0];
+  size_t k = out->shape()[1];

  int blocks = std::min(MAX_BLOCKS, (int) m);
  int threads = std::min(MAX_THREADS, (int) k);
  int shared = sizeof(float) * threads * 2;
-  gSubtractMax<<<blocks, threads, shared>>>(out.data(),
-                                            in.data(), m, k);
+  gSubtractMax<<<blocks, threads, shared>>>(out->data(),
+                                            in->data(), m, k);
  cudaStreamSynchronize(0);
 }

@ -186,18 +196,18 @@ __global__ void gSoftMax(float* softMaxP, size_t rows, size_t cols) {
  }
 }

-void Softmax(Tensor out, Tensor in) {
-  size_t m = out.shape()[0];
-  size_t k = out.shape()[1];
+void Softmax(Tensor& out, Tensor& in) {
+  size_t m = out->shape()[0];
+  size_t k = out->shape()[1];

  int blocks = std::min(MAX_BLOCKS, (int) m);
  int threads = std::min(MAX_THREADS, (int) k);
  int shared = sizeof(float) * threads * 2;
  // Subtract the max rowwise for numerical stability (safe softmax).
-  gSubtractMax<<<blocks, threads, shared>>>(out.data(),
-                                            in.data(), m, k);
+  gSubtractMax<<<blocks, threads, shared>>>(out->data(),
+                                            in->data(), m, k);
  cudaStreamSynchronize(0);
-  gSoftMax<<<blocks, threads, shared>>>(out.data(), m, k);
+  gSoftMax<<<blocks, threads, shared>>>(out->data(), m, k);
  cudaStreamSynchronize(0);
 }

@ -240,18 +250,18 @@ __global__ void gSoftmaxGrad(float* grad, const float* adj, const float* val,
  }
 }

-void SoftmaxGrad(Tensor grad, Tensor adj, Tensor val) {
+void SoftmaxGrad(Tensor& grad, Tensor& adj, Tensor& val) {
  // grad and val are both m-by-k matrices, passed as input.
  // A weighted average of each row of grad (according to the weights
  // specified in val) is computed and subtracted from Out.
  // adj is multiplied for each element to get backward step in autodiff
-  int m = grad.shape()[0];
-  int k = grad.shape()[1];
+  int m = grad->shape()[0];
+  int k = grad->shape()[1];

  int blocks = std::min(MAX_BLOCKS, m);
  int threads = std::min(MAX_THREADS, k);
  int shared = sizeof(float) * threads * 2;
-  gSoftmaxGrad<<<blocks, threads, shared>>>(grad.data(), adj.data(), val.data(),
+  gSoftmaxGrad<<<blocks, threads, shared>>>(grad->data(), adj->data(), val->data(),
                                            m, k);
  cudaStreamSynchronize(0);
 }
@ -293,19 +303,19 @@ __global__ void gLogSoftmaxGrad(float* grad, const float* adj, const float* val,
  }
 }

-void LogSoftmaxGrad(Tensor grad, Tensor adj, Tensor val) {
+void LogSoftmaxGrad(Tensor& grad, Tensor& adj, Tensor& val) {
  // grad and val are both m-by-k matrices, passed as input.
  // A weighted average of each row of grad (according to the weights
  // specified in val) is computed and subtracted from Out.
  // adj is multiplied for each element to get backward step in autodiff
-  int m = grad.shape()[0];
-  int k = grad.shape()[1];
+  int m = grad->shape()[0];
+  int k = grad->shape()[1];

  int blocks = std::min(MAX_BLOCKS, m);
  int threads = std::min(MAX_THREADS, k);
  int shared = sizeof(float) * threads * 2;
-  gLogSoftmaxGrad<<<blocks, threads, shared>>>(grad.data(),
-                                               adj.data(), val.data(),
+  gLogSoftmaxGrad<<<blocks, threads, shared>>>(grad->data(),
+                                               adj->data(), val->data(),
                                               m, k);
  cudaStreamSynchronize(0);
 }
@ -327,80 +337,154 @@ __global__ void gArgmax(float *out, const float *data, size_t rows, size_t cols)
    out[row] = maxInd;
 }

-void Argmax(Tensor* Out, const Tensor* In) {
-  size_t m = In->shape()[0];
-  size_t k = In->shape()[1];
-
-  int blocks = m; //std::min(MAX_BLOCKS, (int) m);
-  int threads = k; //std::min(MAX_THREADS, (int) k);
-  //int shared = sizeof(float) * threads * 2;
-  gArgmax<<<blocks, threads>>>(Out->data(), In->data(), m, k);
-  cudaStreamSynchronize(0);
-}
+//void Argmax(Tensor* Out, const Tensor* In) {
+//  size_t m = In->shape()[0];
+//  size_t k = In->shape()[1];
+//
+//  int blocks = m; //std::min(MAX_BLOCKS, (int) m);
+//  int threads = k; //std::min(MAX_THREADS, (int) k);
+//  //int shared = sizeof(float) * threads * 2;
+//  gArgmax<<<blocks, threads>>>(Out->data(), In->data(), m, k);
+//  cudaStreamSynchronize(0);
+//}

 ///////////////////////////////////////////////////////

-Tensor Prod(cublasHandle_t handle, Tensor C, const Tensor A, const Tensor B,
+void Prod(cublasHandle_t handle, Tensor& C, const Tensor& A, const Tensor& B,
             bool transA, bool transB, Float beta) {
  Float alpha = 1.0;

-  size_t m = A.shape()[0];
-  size_t k = A.shape()[1];
+  size_t m = A->shape()[0];
+  size_t k = A->shape()[1];
  if(transA)
    std::swap(m, k);

-  size_t l = B.shape()[0];
-  size_t n = B.shape()[1];
+  size_t l = B->shape()[0];
+  size_t n = B->shape()[1];
  if(transB)
    std::swap(l, n);

-  size_t lda = A.shape()[1];
-  size_t ldb = B.shape()[1];
-  size_t ldc = B.shape()[1];
+  size_t lda = A->shape()[1];
+  size_t ldb = B->shape()[1];
+  size_t ldc = B->shape()[1];

  if(transB)
-    ldc = B.shape()[0];
+    ldc = B->shape()[0];

  cublasOperation_t opA = transA ? CUBLAS_OP_T : CUBLAS_OP_N;
  cublasOperation_t opB = transB ? CUBLAS_OP_T : CUBLAS_OP_N;

  cublasSgemm(handle, opB, opA,
-              n, m, k, &alpha, B.data(), ldb, A.data(), lda, &beta, C.data(), ldc);
-  return C;
+              n, m, k, &alpha, B->data(), ldb, A->data(), lda, &beta, C->data(), ldc);
 }

-Tensor Prod(Tensor C, const Tensor A, const Tensor B,
+void Prod(Tensor& C, const Tensor& A, const Tensor& B,
            bool transA, bool transB, Float beta) {

-  Tensor temp = Prod(cublasHandle, C, A, B, transA, transB, beta);
-  return temp;
+  Prod(cublasHandle, C, A, B, transA, transB, beta);
 }

-Tensor SumRowwise(cublasHandle_t handle, const Tensor A, Tensor result) {
-  size_t rows = A.shape()[0];
-  size_t cols = A.shape()[1];
-  thrust::device_vector<float> d_ones(cols, 1.f);
-  Float alpha = 1.f;
-  Float beta  = 0.f;
-  cublasSgemv(handle, CUBLAS_OP_T, cols, rows, &alpha,
-              A.data(), cols,
-              thrust::raw_pointer_cast(d_ones.data()), 1, &beta,
-              result.data(), 1);
-  return result;
+void Sum(Tensor& out, const Tensor& in, int axis, bool mean) {
+  int rows = in->shape()[0];
+  int cols = in->shape()[1];
+
+  if(axis == 0) {
+    float scale = 1.f;
+    if(mean)
+      scale = 1.f / rows;
+
+    thrust::device_vector<float> d_ones(rows, scale);
+    Tensor ones(new TensorGPU(thrust::raw_pointer_cast(d_ones.data()),
+                              {1, rows}));
+    Prod(out, ones, in, false, false);
+  }
+  else if(axis == 1) {
+    float scale = 1.f;
+    if(mean)
+      scale = 1.f / cols;
+
+    thrust::device_vector<float> d_ones(cols, scale);
+    Tensor ones(new TensorGPU(thrust::raw_pointer_cast(d_ones.data()),
+                              {cols, 1}));
+    Prod(out, in, ones, false, false);
+  }
+  else {
+    float scale1 = 1.f;
+    float scale2 = 1.f;
+    if(mean) {
+      scale1 = 1.f / rows;
+      scale2 = 1.f / cols;
+    }
+    thrust::device_vector<float> d_ones1(rows, scale1);
+    Tensor ones1(new TensorGPU(thrust::raw_pointer_cast(d_ones1.data()),
+                               {1, rows}));
+    thrust::device_vector<float> d_ones2(cols, scale2);
+    Tensor ones2(new TensorGPU(thrust::raw_pointer_cast(d_ones2.data()),
+                               {cols, 1}));
+    thrust::device_vector<float> d_temp(cols, 0.f);
+    Tensor temp(new TensorGPU(thrust::raw_pointer_cast(d_temp.data()),
+                               {1, cols}));
+
+    Prod(temp, ones1, in, false, false);
+    Prod(out, temp, ones2, false, false);
+  }
 }

-Tensor SumRowwise(const Tensor A, Tensor result) {
-  Tensor temp = SumRowwise(cublasHandle, A, result);
-  return temp;
+void SumBackward(Tensor& out, const Tensor& in, int axis, bool mean) {
+  int rows = out->shape()[0];
+  int cols = out->shape()[1];
+
+  if(axis == 0) {
+    float scale = 1.f;
+    if(mean)
+      scale = 1.f / rows;
+
+    thrust::device_vector<float> d_ones(rows, scale);
+    Tensor ones(new TensorGPU(thrust::raw_pointer_cast(d_ones.data()),
+                              {rows, 1}));
+    Prod(out, ones, in, false, false);
+  }
+  else if(axis == 1) {
+    float scale = 1.f;
+    if(mean)
+      scale = 1.f / cols;
+
+    thrust::device_vector<float> d_ones(cols, scale);
+    Tensor ones(new TensorGPU(thrust::raw_pointer_cast(d_ones.data()),
+                              {1, cols}));
+    Prod(out, in, ones, false, false);
+  }
+  else {
+    float scale1 = 1.f;
+    float scale2 = 1.f;
+    if(mean) {
+      scale1 = 1.f / rows;
+      scale2 = 1.f / cols;
+    }
+    thrust::device_vector<float> d_ones1(rows, scale1);
+    Tensor ones1(new TensorGPU(thrust::raw_pointer_cast(d_ones1.data()),
+                               {rows, 1}));
+    thrust::device_vector<float> d_ones2(cols, scale2);
+    Tensor ones2(new TensorGPU(thrust::raw_pointer_cast(d_ones2.data()),
+                               {1, cols}));
+    thrust::device_vector<float> d_temp(rows, 0.f);
+    Tensor temp(new TensorGPU(thrust::raw_pointer_cast(d_temp.data()),
+                               {rows, 1}));
+
+    Prod(temp, ones1, in, false, false);
+    Prod(out, temp, ones2, false, false);
+  }
 }

-void CudnnDropoutPrepare(Tensor in, float p,
+
+void CudnnDropoutPrepare(Tensor& in, float p,
                         cudnnDropoutDescriptor_t* dropDesc,
                         void** space, size_t* spaceSize,
                         void** states, size_t seed) {
  size_t statesSize;
  cudnnDropoutGetStatesSize(cudnnHandle, &statesSize);
-  cudnnDropoutGetReserveSpaceSize(in.cudnn(), spaceSize);
+  auto inGpu = static_cast<TensorGPU*>(in.get());
+  cudnnDropoutGetReserveSpaceSize(inGpu->cudnn(), spaceSize);

  cudaMalloc((void**)states, statesSize);
  cudaMalloc((void**)space, *spaceSize);
@ -423,26 +507,30 @@ void CudnnDropoutDestroy(cudnnDropoutDescriptor_t dropDesc,

 void CudnnDropoutForward(cudnnDropoutDescriptor_t dropoutDesc,
                  void* space, size_t spaceSize,
-                  Tensor out, Tensor in) {
+                  Tensor& out, Tensor& in) {
+  auto inGpu = static_cast<TensorGPU*>(in.get());
+  auto outGpu = static_cast<TensorGPU*>(out.get());
  cudnnDropoutForward(cudnnHandle,
                      dropoutDesc,
-                      in.cudnn(),
-                      in.data(),
-                      out.cudnn(),
-                      out.data(),
+                      inGpu->cudnn(),
+                      inGpu->data(),
+                      outGpu->cudnn(),
+                      outGpu->data(),
                      space,
                      spaceSize);
 }

 void CudnnDropoutBackward(cudnnDropoutDescriptor_t dropoutDesc,
                          void* space, size_t spaceSize,
-                          Tensor out, Tensor in) {
+                          Tensor& out, Tensor& in) {
+  auto inGpu = static_cast<TensorGPU*>(in.get());
+  auto outGpu = static_cast<TensorGPU*>(out.get());
  cudnnDropoutBackward(cudnnHandle,
                      dropoutDesc,
-                      in.cudnn(),
-                      in.data(),
-                      out.cudnn(),
-                      out.data(),
+                      inGpu->cudnn(),
+                      inGpu->data(),
+                      outGpu->cudnn(),
+                      outGpu->data(),
                      space,
                      spaceSize);
 }
--- a/src/tensor_operators.h
+++ b/src/tensor_operators.h
@ -21,7 +21,10 @@
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 // SOFTWARE.

-#include "tensor.h"
+#include <cublas_v2.h>
+#include <thrust/functional.h>
+
+#include "tensors/tensor_gpu.h"

 namespace marian {

@ -29,11 +32,79 @@ using namespace thrust::placeholders;
 #define MAX_THREADS 512
 #define MAX_BLOCKS 65535

+template <class Functor>
+__global__ void gElementVec(Functor functor,
+                            float* out, const float* in,
+                            int length) {
+  for(int bid = 0; bid < length; bid += blockDim.x * gridDim.x) {
+    int noColumn = bid + blockDim.x * blockIdx.x + threadIdx.x;
+    if (noColumn < length) {
+      out[noColumn] = functor(out[noColumn], in[noColumn]);
+    }
+  }
+}
+
+template <class Functor, class T1, class T2>
+void ElementVec(Functor functor,
+                T1 out, T2 in) {
+
+  int rows = out->shape()[0];
+  int cols = out->shape()[1];
+
+  int length = rows * cols;
+
+  float* d_out = out->data();
+  float* d_in  = in->data();
+
+  int threads = std::min(MAX_THREADS, length);
+  int blocks  = std::min(MAX_BLOCKS, length / threads  + (length % threads != 0));
+
+  gElementVec<<<blocks, threads>>>(functor, d_out, d_in, length);
+  cudaStreamSynchronize(0);
+}
+
+template <class Functor>
+__global__ void gElementVec(Functor functor,
+                            float* out,
+                            const float* in1,
+                            const float* in2,
+                            int length) {
+  for(int bid = 0; bid < length; bid += blockDim.x * gridDim.x) {
+    int noColumn = bid + blockDim.x * blockIdx.x + threadIdx.x;
+    if (noColumn < length) {
+      out[noColumn] = functor(out[noColumn],
+                              in1[noColumn],
+                              in2[noColumn]);
+    }
+  }
+}
+
+template <class Functor, class T1, class T2, class T3>
+void ElementVec(Functor functor,
+                T1 out, T2 in1, T3 in2) {
+
+  int rows = out->shape()[0];
+  int cols = out->shape()[1];
+
+  int length = rows * cols;
+
+  float* d_out = out->data();
+  float* d_in1  = in1->data();
+  float* d_in2  = in2->data();
+
+  int threads = std::min(MAX_THREADS, (int)length);
+  int blocks  = std::min(MAX_BLOCKS, length / threads  + (length % threads != 0));
+
+  gElementVec<<<blocks, threads>>>(functor, d_out, d_in1, d_in2, length);
+  cudaStreamSynchronize(0);
+}
+
 template <class Functor, class T>
 __global__ void gElement(Functor functor,
                         T out) {
-  int rows = out.rows();
-  int cols = out.cols();
+  int rows = out.shape()[0];
+  int cols = out.shape()[1];
+
  for(int bid = 0; bid < rows; bid += gridDim.x) {
    int i = bid + blockIdx.x;
    if(i < rows) {
@ -47,14 +118,17 @@ __global__ void gElement(Functor functor,
 }

 template <class Functor, class T>
-void Element(Functor functor, T out) {
+void Element(Functor functor, T& out) {

-  int m = out.shape()[0];
-  int n = out.shape()[1];
+  int m = out->shape()[0];
+  int n = out->shape()[1];

  int blocks  = std::min(MAX_BLOCKS, m);
  int threads = std::min(MAX_THREADS, n);
-  gElement<<<blocks, threads>>>(functor, out.gpu());
+
+  auto outGpu = static_cast<TensorGPU*>(out.get());
+
+  gElement<<<blocks, threads>>>(functor, outGpu->access());
  cudaStreamSynchronize(0);
 }

@ -62,38 +136,50 @@ void Element(Functor functor, T out) {
 template <class Functor, class T1, class T2>
 __global__ void gElement(Functor functor,
                         T1 out, T2 in) {
-  int rows = out.rows();
-  int cols = out.cols();
+  int rows = out.shape()[0];
+  int cols = out.shape()[1];
+
  for(int bid = 0; bid < rows; bid += gridDim.x) {
    int i = bid + blockIdx.x;
    if(i < rows) {
      for(int tid = 0; tid < cols; tid += blockDim.x) {
        int j = tid + threadIdx.x;
-        if(j < cols)
+        if(j < cols) {
          out(i, j) = functor(out(i, j), in(i, j));
        }
      }
    }
  }
+}

 template <class Functor, class T1, class T2>
 void Element(Functor functor,
-             T1 out, T2 in) {
+             T1& out, T2& in) {

-  int m = out.shape()[0];
-  int n = out.shape()[1];
+  if(out->shape() == in->shape()) {
+    ElementVec(functor, out, in);
+  }
+  else {
+    int m = out->shape()[0];
+    int n = out->shape()[1];

    int blocks  = std::min(MAX_BLOCKS, m);
    int threads = std::min(MAX_THREADS, n);
-  gElement<<<blocks, threads>>>(functor, out.gpu(), in.gpu());
+
+    auto inGpu = static_cast<TensorGPU*>(in.get());
+    auto outGpu = static_cast<TensorGPU*>(out.get());
+
+    gElement<<<blocks, threads>>>(functor,
+                                  outGpu->access(), inGpu->access());
    cudaStreamSynchronize(0);
  }
+}

 template <class Functor, class T1, class T2, class T3>
 __global__ void gElement(Functor functor,
                         T1 out, T2 in1, T3 in2) {
-  int rows = out.rows();
-  int cols = out.cols();
+  int rows = out.shape()[0];
+  int cols = out.shape()[1];
  for(int bid = 0; bid < rows; bid += gridDim.x) {
    int i = bid + blockIdx.x;
    if(i < rows) {
@ -108,23 +194,35 @@ __global__ void gElement(Functor functor,

 template <class Functor, class T1, class T2, class T3>
 void Element(Functor functor,
-             T1 out, T2 in1, T3 in2) {
+             T1& out, T2& in1, T3& in2) {

-  int m = out.shape()[0];
-  int n = out.shape()[1];
+  if(out->shape() == in1->shape() && in1->shape() == in2->shape()) {
+    ElementVec(functor, out, in1, in2);
+  }
+  else {
+    auto in1Gpu = static_cast<TensorGPU*>(in1.get());
+    auto in2Gpu = static_cast<TensorGPU*>(in2.get());
+    auto outGpu = static_cast<TensorGPU*>(out.get());
+
+    int m = out->shape()[0];
+    int n = out->shape()[1];

    int blocks  = std::min(MAX_BLOCKS, m);
    int threads = std::min(MAX_THREADS, n);
-  gElement<<<blocks, threads>>>(functor, out.gpu(),
-                                in1.gpu(), in2.gpu());
+    gElement<<<blocks, threads>>>(functor,
+                                  outGpu->access(),
+                                  in1Gpu->access(),
+                                  in2Gpu->access());
    cudaStreamSynchronize(0);
  }
+}

 template <class Functor, class T1, class T2, class T3, class T4>
 __global__ void gElement(Functor functor,
                         T1 out, T2 in1, T3 in2, T4 in3) {
-  int rows = out.rows();
-  int cols = out.cols();
+  int rows = out.shape()[0];
+  int cols = out.shape()[1];
+
  for(int bid = 0; bid < rows; bid += gridDim.x) {
    int i = bid + blockIdx.x;
    if(i < rows) {
@ -139,48 +237,53 @@ __global__ void gElement(Functor functor,

 template <class Functor, class T1, class T2, class T3, class T4>
 void Element(Functor functor,
-             T1 out, T2 in1, T3 in2, T4 in3) {
+             T1& out, T2& in1, T3& in2, T4& in3) {

-  int m = out.shape()[0];
-  int n = out.shape()[1];
+  auto in1Gpu = static_cast<TensorGPU*>(in1.get());
+  auto in2Gpu = static_cast<TensorGPU*>(in2.get());
+  auto in3Gpu = static_cast<TensorGPU*>(in3.get());
+  auto outGpu = static_cast<TensorGPU*>(out.get());
+
+  int m = outGpu->shape()[0];
+  int n = outGpu->shape()[1];

  int blocks  = std::min(MAX_BLOCKS, m);
  int threads = std::min(MAX_THREADS, n);
-  gElement<<<blocks, threads>>>(functor, out.gpu(),
-                                in1.gpu(), in2.gpu(), in3.gpu());
+  gElement<<<blocks, threads>>>(functor,
+                                outGpu->access(),
+                                in1Gpu->access(),
+                                in2Gpu->access(),
+                                in3Gpu->access());
  cudaStreamSynchronize(0);
 }

-void ClipNorm(Tensor out, float threshold);
+void ClipNorm(Tensor& out, float threshold);

-void SubtractMax(Tensor out, Tensor in);
+void SubtractMax(Tensor& out, Tensor& in);

-void Softmax(Tensor out, Tensor in);
+void Softmax(Tensor& out, Tensor& in);

-void SoftmaxGrad(Tensor grad, Tensor adj, Tensor val);
-void LogSoftmaxGrad(Tensor grad, Tensor adj, Tensor val);
+void SoftmaxGrad(Tensor& grad, Tensor& adj, Tensor& val);
+void LogSoftmaxGrad(Tensor& grad, Tensor& adj, Tensor& val);

-void CudnnSoftmax(Tensor out, Tensor in);
-void CudnnSoftmaxGrad(Tensor grad, Tensor adj, Tensor val);
+void CudnnSoftmax(Tensor& out, Tensor& in);
+void CudnnSoftmaxGrad(Tensor& grad, Tensor& adj, Tensor& val);

-void CudnnLogSoftmax(Tensor out, Tensor in);
-void CudnnLogSoftmaxGrad(Tensor grad, Tensor adj, Tensor val);
+void CudnnLogSoftmax(Tensor& out, Tensor& in);
+void CudnnLogSoftmaxGrad(Tensor& grad, Tensor& adj, Tensor& val);

-void Argmax(Tensor* Out, const Tensor* In);
+void Argmax(Tensor& Out, const Tensor& In);

-Tensor Prod(cublasHandle_t handle, Tensor C, const Tensor A, const Tensor B,
+void Prod(cublasHandle_t handle, Tensor& C, const Tensor& A, const Tensor& B,
             bool transA, bool transB, Float beta);

-Tensor Prod(Tensor C, const Tensor A, const Tensor B,
+void Prod(Tensor& C, const Tensor& A, const Tensor& B,
             bool transA, bool transB, Float beta = 0);

-Tensor SumRowwise(cublasHandle_t handle, const Tensor A, Tensor result);
+void Sum(Tensor& out, const Tensor& in, int axis=-1, bool mean=false);
+void SumBackward(Tensor& out, const Tensor& in, int axis=-1, bool mean=false);

-Tensor SumRowwise(const Tensor A, Tensor result);
-
-void ScaleRowwise(Tensor Out, const Tensor ScalingFactors);
-
-void CudnnDropoutPrepare(Tensor in, float p,
+void CudnnDropoutPrepare(Tensor& in, float p,
                         cudnnDropoutDescriptor_t* dropDesc,
                         void** space, size_t* spaceSize,
                         void** states, size_t seed);
@ -190,11 +293,11 @@ void CudnnDropoutDestroy(cudnnDropoutDescriptor_t dropDesc,

 void CudnnDropoutForward(cudnnDropoutDescriptor_t dropoutDesc,
                  void* space, size_t spaceSize,
-                  Tensor out, Tensor in);
+                  Tensor& out, Tensor& in);

 void CudnnDropoutBackward(cudnnDropoutDescriptor_t dropoutDesc,
                          void* space, size_t spaceSize,
-                          Tensor out, Tensor in);
+                          Tensor& out, Tensor& in);


 }
--- a/src/tensor_test.cu
+++ b/src/tensor_test.cu
@ -0,0 +1,28 @@
+#include <iostream>
+
+#include "tensors/tensor_allocator.h"
+#include "tensors/tensor_gpu.h"
+#include "tensor_operators.h"
+
+using namespace marian;
+
+int main() {
+  TensorAllocator params = newTensorAllocator<DeviceGPU>();
+  //params->allocate(100000000);
+
+  std::vector<Tensor> tensors;
+  for (int i = 0; i < 200; ++i) {
+    std::cerr << i << std::endl;
+    tensors.emplace_back();
+    params->allocate(tensors.back(), {784,2048});
+    std::cerr << tensors.back()->size() << std::endl;
+    std::cerr << params->capacity() << " " << params->size() << std::endl;
+  }
+
+  for(int i = 0; i < 200; i++) {
+    tensors[i]->set(0, 3.14 * i);
+    std::cerr << tensors[i]->get(0) << std::endl;
+  }
+
+  return 0;
+}
--- a/src/tensors/bac/tensor.cu
+++ b/src/tensors/bac/tensor.cu
--- a/src/tensors/bac/tensor.h
+++ b/src/tensors/bac/tensor.h
@ -236,8 +236,7 @@ class TensorImpl {
     *
     * @return Vector in string form.
     */
-    std::string Debug() const
-    {
+    std::string Debug() const {
        std::stringstream strm;
        assert(shape_.size());
        strm << "shape=" << marian::Debug(shape_) << std::endl;
--- a/src/tensors/tensor.cu
+++ b/src/tensors/tensor.cu
@ -0,0 +1,36 @@
+// This file is part of the Marian toolkit.
+// Marian is copyright (c) 2016 Marcin Junczys-Dowmunt.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#include "tensors/tensor.h"
+
+namespace marian {
+
+Tensor& operator<<(Tensor& t, const std::vector<float>& v) {
+  t->set(v);
+  return t;
+}
+
+Tensor& operator>>(Tensor& t, std::vector<float>& v) {
+  t->get(v);
+  return t;
+}
+
+}
--- a/src/tensors/tensor.h
+++ b/src/tensors/tensor.h
@ -0,0 +1,76 @@
+#pragma once
+
+// This file is part of the Marian toolkit.
+// Marian is copyright (c) 2016 Marcin Junczys-Dowmunt.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#include <memory>
+#include <iostream>
+
+#include "definitions.h"
+
+namespace marian {
+
+class TensorBase {
+  public:
+    TensorBase(float* data, Shape shape)
+    : data_(data), shape_(shape)
+    { }
+
+    virtual ~TensorBase() {}
+
+    virtual void reset(float* data) {
+      data_ = data;
+    }
+
+    virtual float* data() {
+      return data_;
+    }
+
+    virtual Shape& shape() {
+      return shape_;
+    }
+
+    virtual size_t size() {
+      return shape_.elements();
+    }
+
+    virtual float get(size_t) = 0;
+    virtual void set(size_t, float) = 0;
+
+    virtual void set(float) = 0;
+
+    virtual void get(std::vector<float> &v) = 0;
+    virtual void set(const std::vector<float> &v) = 0;
+
+    virtual std::string debug() = 0;
+
+  protected:
+    float* data_;
+    Shape shape_;
+};
+
+typedef std::shared_ptr<TensorBase> Tensor;
+
+Tensor& operator<<(Tensor& t, const std::vector<float>& v);
+
+Tensor& operator>>(Tensor& t, std::vector<float>& v);
+
+}
--- a/src/tensors/tensor_allocator.h
+++ b/src/tensors/tensor_allocator.h
@ -0,0 +1,137 @@
+#pragma once
+
+// This file is part of the Marian toolkit.
+// Marian is copyright (c) 2016 Marcin Junczys-Dowmunt.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#include <set>
+
+#include "definitions.h"
+#include "tensors/tensor.h"
+
+namespace marian {
+
+class TensorAllocatorBase {
+  public:
+    virtual ~TensorAllocatorBase() {};
+    virtual void reserve(size_t) = 0;
+    virtual void reserveExact(size_t) = 0;
+    virtual void clear() = 0;
+    virtual void allocate(Tensor&, Shape) = 0;
+    virtual size_t capacity() = 0;
+    virtual size_t size() = 0;
+    virtual Tensor asTensor() = 0;
+};
+
+template <class Device>
+class TensorAllocatorDerived : public TensorAllocatorBase {
+  private:
+    const size_t CHUNK  = 128;
+    const size_t MBYTE  = 1024 * 1024;
+    const size_t FLOATS = CHUNK * MBYTE / sizeof(float);
+
+    Device device_;
+    std::vector<Tensor> allocated_;
+
+    void reset(Tensor t, float* start) {
+      t->reset(start);
+    }
+
+    void resetAllocated() {
+      float* start = device_.data();
+      for(auto t : allocated_) {
+        reset(t, start);
+        start += t->size();
+      }
+    }
+
+    void checkSpace(Shape shape) {
+      float* start = device_.data();
+      if(!allocated_.empty()) {
+        start = allocated_.back()->data() + allocated_.back()->size();
+      }
+
+      size_t available = device_.data() + device_.capacity() - start;
+      if(shape.elements() > available) {
+        reserve(device_.capacity() - available + shape.elements());
+      }
+    }
+
+  public:
+    void reserve(size_t elements = 0) {
+      float mult = elements / FLOATS + 1;
+      std::cerr << "Reserving " << mult * CHUNK << " MB" << std::endl;
+      device_.reserve(mult * FLOATS);
+      resetAllocated();
+    }
+
+    void reserveExact(size_t elements = 0) {
+      size_t mbytes = (elements * sizeof(float)) / MBYTE;
+      std::cerr << "Reserving space for " << elements
+        << " floats (" << mbytes << " MB)" << std::endl;
+      device_.reserve(elements);
+      resetAllocated();
+    }
+
+    void clear() {
+      allocated_.clear();
+    }
+
+    void allocate(Tensor &t, Shape shape) {
+      if(!t || t->shape() != shape) {
+        checkSpace(shape);
+
+        float* start = device_.data();
+        if(!allocated_.empty()) {
+          start = allocated_.back()->data() + allocated_.back()->size();
+        }
+
+        t.reset(new typename Device::tensor_type(start, shape));
+        allocated_.push_back(t);
+      }
+    }
+
+    Tensor asTensor() {
+      float* start = device_.data();
+      return Tensor(new typename Device::tensor_type(start, {1, (int)size()}));
+    }
+
+    size_t capacity() {
+      return device_.capacity();
+    }
+
+    size_t size() {
+      float* start = device_.data();
+      float* end = start;
+      if(!allocated_.empty())
+        end = allocated_.back()->data() + allocated_.back()->size();
+
+      return end - start;
+    }
+};
+
+typedef std::shared_ptr<TensorAllocatorBase> TensorAllocator;
+
+template <class Device>
+TensorAllocator newTensorAllocator() {
+  return TensorAllocator(new TensorAllocatorDerived<Device>());
+}
+
+}
--- a/src/tensors/tensor_cpu.h
+++ b/src/tensors/tensor_cpu.h
@ -0,0 +1,95 @@
+#pragma once
+
+// This file is part of the Marian toolkit.
+// Marian is copyright (c) 2016 Marcin Junczys-Dowmunt.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#include <cstring>
+
+#include "tensors/tensor.h"
+
+namespace marian {
+
+class TensorCPU : public TensorBase {
+  public:
+    TensorCPU(float* data, Shape shape)
+    : TensorBase(data, shape) {}
+
+    float get(size_t i) {
+      return data_[i];
+    }
+
+    void set(size_t i, float value) {
+      data_[i] = value;
+    }
+
+    void get(std::vector<float> &v) {
+      v.resize(size());
+      std::copy(data_, data_ + size(), v.begin());
+    }
+
+    void set(float value) {
+      std::fill(data_, data_ + size(), value);
+    }
+
+    void set(const std::vector<float> &v) {
+      std::copy(v.begin(), v.end(), data_);
+    }
+};
+
+class DeviceCPU {
+  private:
+    float* data_;
+    size_t size_
+
+  public:
+    DeviceCPU()
+    : data_(0), size_(0) {}
+
+    ~DeviceCPU() {
+      if(data_)
+        delete[] data_;
+    }
+
+    typedef TensorCPU tensor_type;
+
+    void reserve(size_t size) {
+      UTIL_THROW_IF2(size < size_, "New size must be larger than old size");
+      float* temp = new float[size];
+
+      if(data_) {
+        std::memcpy(temp, data_, size_* sizeof(float));
+        delete[] data_;
+      }
+
+      data_ = temp;
+      size_ = size;
+    }
+
+    float* data() {
+      return data_;
+    }
+
+    size_t capacity() {
+      return size_;
+    }
+};
+
+}
--- a/src/tensors/tensor_gpu.h
+++ b/src/tensors/tensor_gpu.h
@ -0,0 +1,202 @@
+#pragma once
+
+// This file is part of the Marian toolkit.
+// Marian is copyright (c) 2016 Marcin Junczys-Dowmunt.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#include <sstream>
+#include <cuda.h>
+#include <cudnn.h>
+#include <thrust/device_ptr.h>
+#include <thrust/device_vector.h>
+
+#include "exception.h"
+#include "definitions.h"
+#include "tensors/tensor.h"
+
+namespace marian {
+
+#define CUDA_CHECK(ans) { gpuAssert((ans), __FILE__, __LINE__); }
+
+inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
+{
+   if (code != cudaSuccess)
+   {
+      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
+      if (abort) exit(code);
+   }
+}
+
+struct Access {
+    float* data_;
+    Shape shape_;
+
+    Access(float* data, const Shape& shape)
+    : data_(data), shape_(shape) {}
+
+    __device__
+    inline float& operator()(size_t i, size_t j) {
+      int rows = shape_[0];
+      int cols = shape_[1];
+      if(rows != 1 && cols != 1)
+        return data_[i * cols + j];
+      if(rows != 1 && cols == 1)
+        return data_[i];
+      if(rows == 1 && cols != 1)
+        return data_[j];
+      return data_[0];
+    }
+
+    __device__ __host__
+    float* data() {
+      return data_;
+    }
+
+    __device__ __host__
+    Shape& shape() {
+      return shape_;
+    }
+
+    //Access* toDevice() {
+    //  Access* ptr;
+    //  cudaMalloc(&ptr, sizeof(Access));
+    //  cudaMemcpy(ptr, this, sizeof(Access), cudaMemcpyHostToDevice);
+    //  return ptr;
+    //}
+};
+
+class TensorGPU : public TensorBase {
+  private:
+    // cuDNN stuff
+    cudnnTensorDescriptor_t cudnnDesc_;
+
+  public:
+    TensorGPU(float* data, Shape shape)
+    : TensorBase(data, shape) {
+      cudnnCreateTensorDescriptor(&cudnnDesc_);
+      cudnnSetTensor4dDescriptorEx(cudnnDesc_, CUDNN_DATA_FLOAT,
+                                   shape_[0], shape_[1], 1, 1,
+                                   shape_[1], 1, 1, 1);
+    }
+
+    ~TensorGPU() {
+      cudnnDestroyTensorDescriptor(cudnnDesc_);
+    }
+
+
+    float get(size_t i) {
+      float temp;
+      CUDA_CHECK(cudaMemcpy(&temp, data_ + i, sizeof(float),
+                 cudaMemcpyDeviceToHost));
+      return temp;
+    }
+
+    void set(size_t i, float value) {
+      CUDA_CHECK(cudaMemcpy(data_ + i, &value, sizeof(float),
+                 cudaMemcpyHostToDevice));
+    }
+
+    void get(std::vector<float> &v) {
+      v.resize(size());
+      CUDA_CHECK(cudaMemcpy(v.data(), data_, size() * sizeof(float),
+                 cudaMemcpyDeviceToHost));
+    }
+
+    void set(float value) {
+      thrust::fill(thrust::device_ptr<float>(data_),
+                   thrust::device_ptr<float>(data_ + size()), value);
+    }
+
+    void set(const std::vector<float> &v) {
+      CUDA_CHECK(cudaMemcpy(data_, v.data(), v.size() * sizeof(float),
+                 cudaMemcpyHostToDevice));
+    }
+
+    cudnnTensorDescriptor_t& cudnn() {
+      return cudnnDesc_;
+    }
+
+    Access access() {
+      return Access(data_, shape_);
+    }
+
+    std::string debug() {
+      std::stringstream strm;
+      assert(shape_.size());
+      strm << "shape=" << shape_[0] << "x" << shape_[1] << std::endl;
+
+      // values
+      size_t totSize = shape_.elements();
+      std::vector<Float> values(totSize);
+      get(values);
+
+      size_t ind = 0;
+      for (size_t i = 0; i < shape()[0]; ++i) {
+          for (size_t j = 0; j < shape()[1]; ++j) {
+              strm << values[ind] << " ";
+              ++ind;
+          }
+          strm << std::endl;
+      }
+      return strm.str();
+    }
+};
+
+class DeviceGPU {
+  private:
+    float* data_;
+    size_t size_;
+
+  public:
+    DeviceGPU()
+    : data_(0), size_(0) {}
+
+    ~DeviceGPU() {
+      if(data_)
+        CUDA_CHECK(cudaFree(data_));
+    }
+
+    typedef TensorGPU tensor_type;
+
+    void reserve(size_t size) {
+      UTIL_THROW_IF2(size < size_, "New size must be larger than old size");
+      float *temp;
+      CUDA_CHECK(cudaMalloc(&temp, size * sizeof(float)));
+
+      if(data_) {
+        CUDA_CHECK(cudaMemcpy(temp, data_, size_* sizeof(float),
+                   cudaMemcpyDeviceToDevice));
+        CUDA_CHECK(cudaFree(data_));
+      }
+
+      data_ = temp;
+      size_ = size;
+    }
+
+    float* data() {
+      return data_;
+    }
+
+    size_t capacity() {
+      return size_;
+    }
+};
+
+}
--- a/src/trainer.h
+++ b/src/trainer.h
@ -57,7 +57,7 @@ class Trainer : public RunBase,
          while(bg) {
            BatchPtr batch = bg.next();
            opt->update(graph_, batch);
-            cost += (*graph_)["cost"]->val()[0] * batch->dim();
+            cost += (*graph_)["cost"]->val()->get(0) * batch->dim();
            totalExamples += batch->dim();
            update++;
          }
@ -115,7 +115,7 @@ class Validator : public RunBase,
        BatchGenerator bg(dataset_, batchSize);

        size_t update = 0;
-        bg.prepare();
+        bg.prepare(false);

        float total = 0;
        float cor = 0;
@ -123,7 +123,7 @@ class Validator : public RunBase,
            BatchPtr batch = bg.next();
            graph_->inference(batch);
            std::vector<float> scores;
-            scores << (*graph_)["scores"]->val();
+            (*graph_)["scores"]->val()->get(scores);

            cor += correct(scores, batch->inputs()[1].data());
            total += batch->dim();