working dropout node

Marcin Junczys-Dowmunt 2016-09-21 00:39:27 +02:00
parent 7412e68dcd
commit a057ff1776
18 changed files with 247 additions and 209 deletions


@ -3,7 +3,7 @@ set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
project(marian CXX)
SET(CMAKE_CXX_FLAGS " -std=c++11 -g -O3 -funroll-loops -Wno-unused-result -Wno-deprecated")
LIST(APPEND CUDA_NVCC_FLAGS --default-stream per-thread; -std=c++11; -g; -O3; -arch=sm_35; -lineinfo; --use_fast_math; --expt-extended-lambda; -Xcompiler '-fPIC')
LIST(APPEND CUDA_NVCC_FLAGS --default-stream per-thread; -std=c++11; -g; -O3; -arch=sm_35; -lineinfo; --use_fast_math; --expt-extended-lambda; --expt-relaxed-constexpr; -Xcompiler '-fPIC')
add_definitions(-DCUDA_API_PER_THREAD_DEFAULT_STREAM)
SET(CUDA_PROPAGATE_HOST_FLAGS OFF)


@ -50,15 +50,15 @@ from keras.optimizers import Adam, SGD
def baseline_model(pixels_count, classes_count):
model = Sequential()
# model.add(Dropout(0.2, input_shape=(pixels_count,)))
model.add(Dense(2048, input_dim=pixels_count, init='uniform', activation='relu'))
model.add(Dropout(0.2, input_shape=(pixels_count,)))
model.add(Dense(2048, input_dim=pixels_count, init='uniform', activation='tanh'))
# model.add(Dense(2048, init='uniform', activation='relu'))
# model.add(Dropout(0.5))
model.add(Dense(2048, init='uniform', activation='relu'))
model.add(Dense(2048, init='uniform', activation='relu'))
model.add(Dense(2048, init='uniform', activation='relu'))
model.add(Dense(2048, init='uniform', activation='relu'))
# model.add(Dropout(0.5))
model.add(Dropout(0.5))
# model.add(Dense(2048, init='uniform', activation='relu'))
# model.add(Dense(2048, init='uniform', activation='relu'))
# model.add(Dense(2048, init='uniform', activation='relu'))
model.add(Dense(2048, init='uniform', activation='tanh'))
model.add(Dropout(0.5))
model.add(Dense(classes_count, init='uniform', activation='softmax'))
opt = Adam(lr=0.0002);
@ -102,7 +102,7 @@ if __name__ == "__main__":
# Fit the model
start = time.time();
model.fit(X_train, y_train, nb_epoch=10, batch_size=200, verbose=2, shuffle=True)
model.fit(X_train, y_train, nb_epoch=20, batch_size=200, verbose=2, shuffle=True)
print "Time elapsed", time.time() - start, "s"
# Final evaluation of the model


@ -8,6 +8,7 @@ cuda_add_library(marian_lib
tensor.cu
tensor_operators.cu
expression_operators.cu
dropout.cu
vocab.cpp
)


@ -32,6 +32,7 @@ template <class DataType>
struct Chainable {
Chainable() { }
virtual ~Chainable() { }
virtual void inference() { forward(); }
virtual void forward() { }
virtual void backward() { }
virtual void backward_numeric(Float delta) { }


@ -25,14 +25,45 @@
#include <string>
#include <functional>
#define SHAPE_SIZE 2
namespace marian {
typedef float Float;
typedef std::vector<int> Shape;
const int whatevs{-1};
// POD for shape
class Shape {
private:
int shape_[SHAPE_SIZE];
public:
Shape() : shape_{1, 1} { }
Shape(std::initializer_list<int> il) {
std::copy(il.begin(), il.end(), begin());
}
int& operator[](int i) {
return shape_[i];
}
const int& operator[](int i) const {
return shape_[i];
}
size_t size() const {
return SHAPE_SIZE;
}
int* begin() { return shape_; }
int* end() { return shape_ + SHAPE_SIZE; }
const int* begin() const { return shape_; }
const int* end() const { return shape_+ SHAPE_SIZE; }
};
}
#include "keywords.h"
// #include "tensor.h"
namespace marian {
class Tensor;

src/dropout.cu (new file, 15 lines)

@ -0,0 +1,15 @@
#include <curand.h>
#include <curand_kernel.h>
#include "dropout.h"
namespace marian {
__global__ void gInitCurandStates(curandState* states, unsigned int seed) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
curand_init(seed, tid, 0, &states[tid]);
}
unsigned Bernoulli::seed = time(0);
}

src/dropout.h (new file, 57 lines)

@ -0,0 +1,57 @@
#pragma once
#include <curand.h>
#include <curand_kernel.h>
#include "tensor_operators.h"
namespace marian {
__global__ void gInitCurandStates(curandState* states, unsigned int seed);
class Bernoulli {
private:
float p_;
curandState* states_;
static unsigned seed;
Shape shape_;
public:
Bernoulli(float p, const Shape& shape)
: p_(p), shape_(shape) {}
void InitStates(curandState* states) {
states_ = states;
int blocks = std::min(MAX_BLOCKS, shape_[0]);
int threads = std::min(MAX_THREADS, shape_[1]);
int n = blocks * threads;
cudaMalloc((void**) &states_, n * sizeof(curandState));
gInitCurandStates<<<blocks, threads>>>(states_, seed++);
cudaStreamSynchronize(0);
}
void FreeStates(curandState* states) {
cudaFree(states);
}
__device__ float operator()(int i, int j) const {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
float dist = curand_uniform(&states_[tid]);
float zeroOne = dist > p_;
return zeroOne / (1 - p_);
}
__device__ int rows() const {
return shape_[0];
}
__device__ int cols() const {
return shape_[1];
}
Bernoulli& gpu() {
return *this;
}
};
}

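The Bernoulli functor above implements inverted dropout: an element survives with probability 1 - p and the survivors are scaled by 1/(1 - p), so the expected activation is unchanged and no rescaling is needed at test time. A minimal host-side sketch of the same rule (illustrative only, not part of this commit; the function name is made up):

#include <cstddef>
#include <random>
#include <vector>

// Keep each value with probability (1 - p) and scale it by 1/(1 - p).
// E[mask] = (1 - p) * 1/(1 - p) = 1, so E[out[i]] == in[i].
std::vector<float> hostDropout(const std::vector<float>& in, float p, unsigned seed) {
  std::mt19937 gen(seed);
  std::uniform_real_distribution<float> uniform(0.f, 1.f);
  std::vector<float> out(in.size());
  for (std::size_t i = 0; i < in.size(); ++i) {
    float mask = (uniform(gen) > p) ? 1.f / (1.f - p) : 0.f;
    out[i] = mask * in[i];
  }
  return out;
}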

@ -68,6 +68,15 @@ class ExpressionGraph {
/** @brief Constructs a new expression graph */
ExpressionGraph() : stack_(new ChainableStack) {}
void inference(int batchSize) {
for(auto&& v : *stack_) {
v->allocate(batchSize);
}
for(auto&& v : *stack_)
v->inference();
}
/**
* @brief Performs backpropagation on this expression graph.

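inference(int batchSize) allocates the nodes for the given batch size and then calls each node's inference() hook instead of forward(), so stochastic nodes such as DropoutNodeOp can switch to deterministic behaviour. A minimal usage sketch (illustrative, following the MNIST validation code further down; xt, yt, BATCH_SIZE and the "x"/"y"/"scores" names are taken from that file):

// Training-style pass: dropout nodes sample a fresh mask and scale survivors.
g["x"] = xt;
g["y"] = yt;
g.forward(BATCH_SIZE);

// Evaluation pass: DropoutNodeOp::inference() copies activations through unchanged.
g.inference(BATCH_SIZE);
std::vector<float> results;
results << g["scores"].val();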

@ -42,10 +42,6 @@ Expr relu(Expr a) {
return Expr(a.graph(), new ReLUNodeOp(a));
}
Expr dropout(Expr a) {
return Expr(a.graph(), new DropoutNodeOp(a));
}
Expr log(Expr a) {
return Expr(a.graph(), new LogNodeOp(a));
};


@ -33,7 +33,10 @@ Expr tanh(Expr a);
Expr relu(Expr a);
Expr dropout(Expr a);
template <typename ...Args>
Expr dropout(Expr a, Args ...args) {
return Expr(a.graph(), new DropoutNodeOp(a, args...));
}
Expr log(Expr a);


@ -6,7 +6,6 @@
#include "marian.h"
#include "mnist.h"
#include "npz_converter.h"
#include "optimizers.h"
using namespace marian;
@ -30,11 +29,11 @@ ExpressionGraph build_graph(const std::vector<int>& dims) {
int out = dims[i+1];
if(i == 0) {
layers.emplace_back(x);
layers.emplace_back(dropout(x, value=0.2));
}
else {
layers.emplace_back(reluplus(dot(layers.back(), weights.back()), biases.back()));
//layers.emplace_back(relu(dot(layers.back(), weights.back()) + biases.back()));
//layers.emplace_back(reluplus(dot(layers.back(), weights.back()), biases.back()));
layers.emplace_back(dropout(relu(dot(layers.back(), weights.back()) + biases.back()), value=0.5));
}
weights.emplace_back(
@ -115,8 +114,8 @@ int main(int argc, char** argv) {
std::vector<float> testLabels = datasets::mnist::ReadLabels("../examples/mnist/t10k-labels-idx1-ubyte", testRows, LABEL_SIZE);
std::cerr << "Done." << std::endl;
ExpressionGraph g = build_graph({IMAGE_SIZE, 2048, 2048, 2048, 2048, 2048, LABEL_SIZE});
std::cout << g.graphviz() << std::endl;
ExpressionGraph g = build_graph({IMAGE_SIZE, 2048, 2048, LABEL_SIZE});
//std::cout << g.graphviz() << std::endl;
Tensor xt({BATCH_SIZE, IMAGE_SIZE});
Tensor yt({BATCH_SIZE, LABEL_SIZE});
@ -167,7 +166,7 @@ int main(int argc, char** argv) {
g["x"] = xt;
g["y"] = yt;
g.forward(BATCH_SIZE);
g.inference(BATCH_SIZE);
std::vector<float> bResults;
bResults << g["scores"].val();


@ -1,5 +1,6 @@
#include "node.h"
#include "tensor_operators.h"
#include "dropout.h"
namespace marian {
@ -107,22 +108,40 @@ struct ReLUNodeOp : public UnaryNodeOp {
};
// @TODO: slow and probably buggy
// Scaling dropout
struct DropoutNodeOp : public UnaryNodeOp {
template <typename ...Args>
DropoutNodeOp(Args ...args)
: UnaryNodeOp(args...),
p_(0.5), seed_(time(0)) { }
p_(Get<float>(keywords::value, 0.5)) {}
void forward() {
//Element(_1 = Bernoulli(p_, (size_t)this) * _2,
// val_, a_->val())
Dropout(val_, a_->val(), p_, seed_++);
~DropoutNodeOp() {
if(bernoulli)
bernoulli->FreeStates(states_);
}
void backward() {
Element(_1 += _2 * (_3 != 0.0f), // transform non-zero to 1
a_->grad(), adj_, val_);
void inference() {
Element(_1 = _2, val_, a_->val());
}
void forward() {
if(!bernoulli) {
bernoulli.reset(new Bernoulli(p_, val_.shape()));
bernoulli->InitStates(states_);
}
if(!mask_)
mask_.allocate(val_.shape());
auto f = [] __device__ (float& mask, float drop) {
return mask = drop;
};
Element(f, mask_, *bernoulli);
Element(_1 = _2 * _3, val_, mask_, a_->val());
}
void backward() {
Element(_1 += _2 * _3, a_->grad(), adj_, mask_);
}
virtual std::string graphviz() {
@ -135,7 +154,9 @@ struct DropoutNodeOp : public UnaryNodeOp {
private:
float p_;
int seed_;
curandState* states_;
std::shared_ptr<Bernoulli> bernoulli;
Tensor mask_;
};

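Compared with the removed version, the node now applies inverted-dropout scaling (hence the "Scaling dropout" comment) and keeps the sampled mask in mask_, so backward() multiplies the incoming adjoint by exactly the same scaled mask instead of reconstructing a 0/1 pattern from val_. A host-side sketch of that backward step (illustrative only, not part of this commit):

#include <cstddef>
#include <vector>

// Backward pass of mask-based dropout: reuse the mask sampled in forward().
// mask[i] is either 0 or 1/(1 - p), so the gradient carries the same scaling as val_.
void dropoutBackward(std::vector<float>& aGrad, const std::vector<float>& adj,
                     const std::vector<float>& mask) {
  for (std::size_t i = 0; i < adj.size(); ++i)
    aGrad[i] += adj[i] * mask[i];
}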

@ -165,9 +165,7 @@ class NpzConverter {
data.resize(np.size());
std::copy(np.data(), np.data() + np.size(), data.begin());
shape.clear();
shape.push_back(np.size1());
shape.push_back(np.size2());
shape = { (int)np.size1(), (int)np.size2() };
}
else {


@ -157,6 +157,7 @@ class TensorImpl {
*
* @return Shape of Tensor
*/
__host__ __device__
const Shape& shape() const {
return shape_;
}
@ -269,6 +270,7 @@ class Tensor {
Tensor() {}
/**
* @brief Constructor that allocates memory.
*
* @param shape Shape of Tensor.
@ -331,7 +333,7 @@ class Tensor {
const value_type* data() const {
return pimpl_->data();
}
/**
* @brief Get begin iterator of GPU Tensor's vector.
*
@ -373,6 +375,7 @@ class Tensor {
*
* @return Tensor's shape.
*/
__host__ __device__
const Shape& shape() const {
return pimpl_->shape();
}
@ -436,7 +439,8 @@ class Tensor {
void set(const std::vector<float>::const_iterator &begin, const std::vector<float>::const_iterator &end);
void incr(Float incr) {
pimpl_->incr(incr);
}
/**
@ -457,6 +461,39 @@ class Tensor {
vout.resize(size());
pimpl_->get(vout.begin());
}
class TensorView {
private:
float* data_;
int rows_;
int cols_;
public:
TensorView(Tensor t)
: data_(t.data()), rows_(t.shape()[0]), cols_(t.shape()[1]) {}
__device__ float& operator()(int i, int j) {
if(rows_ != 1 && cols_ != 1)
return data_[i * cols_ + j];
if(rows_ != 1 && cols_ == 1)
return data_[i];
if(rows_ == 1 && cols_ != 1)
return data_[j];
return data_[0];
}
__device__ int rows() {
return rows_;
}
__device__ int cols() {
return cols_;
}
};
TensorView gpu() {
return TensorView(*this);
}
};
/**

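The TensorView added above is a lightweight, device-copyable view of a Tensor whose operator() broadcasts: row vectors, column vectors and scalars are indexed as if they had the full shape, so the same element-wise kernels can mix them with matrices. A host-side mirror of the indexing rule (illustrative only, not part of this commit):

// How a {rows, cols} view maps (i, j) onto the flat buffer, as TensorView does.
int broadcastIndex(int i, int j, int rows, int cols) {
  if (rows != 1 && cols != 1) return i * cols + j;  // full matrix
  if (rows != 1 && cols == 1) return i;             // column vector: broadcast over j
  if (rows == 1 && cols != 1) return j;             // row vector: broadcast over i
  return 0;                                         // scalar: broadcast everywhere
}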

@ -19,12 +19,8 @@
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include <curand_kernel.h>
#include "tensor_operators.h"
using namespace std;
namespace marian {
// @TODO: handle this better, maybe per thread?
@ -35,48 +31,6 @@ static cublasHandle_t create_handle() {
}
cublasHandle_t cublasHandle = create_handle();
__global__ void gDropout(float* out, const float* in,
int seed, const float p, int rows, int cols) {
int shift = blockIdx.x * cols + threadIdx.x;
curandState state;
curand_init(seed, shift, 0, &state);
for(int bid = 0; bid < rows; bid += gridDim.x) {
int j = bid + blockIdx.x;
if(j < rows) {
Float* rowOut = out + j * cols;
const Float* rowIn = in + j * cols;
for(int tid = 0; tid < cols; tid += blockDim.x) {
int i = tid + threadIdx.x;
if(i < cols) {
//int offset = i;
float dropout = (curand_uniform(&state) >= p);
rowOut[i] = dropout * rowIn[i];
}
}
}
}
}
// Slow!!!
void Dropout(Tensor out, Tensor in, float p, int seed) {
int m = in.shape()[0];
int n = in.shape()[1];
curandGenerator_t prng;
curandCreateGenerator(&prng, CURAND_RNG_PSEUDO_XORWOW);
curandSetPseudoRandomGeneratorSeed(prng, (unsigned long long) seed);
curandGenerateUniform(prng, out.data(), m * n);
Element(_1 = (_1 > p), out);
Element(_1 = _1 * _2, out, in);
//int blocks = std::min(MAX_BLOCKS, m);
//int threads = std::min(MAX_THREADS, k);
//gDropout<<<blocks, threads>>>(out.data(), in.data(), seed, p, m, k);
//cudaStreamSynchronize(0);
}
__global__ void gSoftmaxGrad(float* grad, const float* adj, const float* val,
const int rows, const int cols) {
for(int bid = 0; bid < rows; bid += gridDim.x) {


@ -29,54 +29,9 @@ using namespace thrust::placeholders;
#define MAX_THREADS 512
#define MAX_BLOCKS 65535
class TensorView {
private:
float* data_;
int rows_;
int cols_;
public:
TensorView(Tensor t)
: data_(t.data()), rows_(t.shape()[0]), cols_(t.shape()[1]) {}
__device__ float& operator()(int i, int j) {
if(rows_ != 1 && cols_ != 1)
return data_[i * cols_ + j];
if(rows_ != 1 && cols_ == 1)
return data_[i];
if(rows_ == 1 && cols_ != 1)
return data_[j];
return data_[0];
}
__device__ int rows() {
return rows_;
}
__device__ int cols() {
return cols_;
}
};
//template <class Functor>
//__global__ void gElement(Functor functor) {
// int rows = out.rows();
// int cols = out.cols();
// for(int bid = 0; bid < rows; bid += gridDim.x) {
// int i = bid + blockIdx.x;
// if(i < rows) {
// for(int tid = 0; tid < cols; tid += blockDim.x) {
// int j = tid + threadIdx.x;
// if(j < cols)
// functor(i, j);
// }
// }
// }
//}
template <class Functor>
template <class Functor, class T>
__global__ void gElement(Functor functor,
TensorView out) {
T out) {
int rows = out.rows();
int cols = out.cols();
for(int bid = 0; bid < rows; bid += gridDim.x) {
@ -91,23 +46,22 @@ __global__ void gElement(Functor functor,
}
}
template <class Functor>
void Element(Functor functor,
Tensor out) {
template <class Functor, class T>
void Element(Functor functor, T out) {
int m = out.shape()[0];
int n = out.shape()[1];
int blocks = std::min(MAX_BLOCKS, m);
int threads = std::min(MAX_THREADS, n);
gElement<<<blocks, threads>>>(functor, TensorView(out));
gElement<<<blocks, threads>>>(functor, out.gpu());
cudaStreamSynchronize(0);
}
template <class Functor>
template <class Functor, class T1, class T2>
__global__ void gElement(Functor functor,
TensorView out, TensorView in) {
T1 out, T2 in) {
int rows = out.rows();
int cols = out.cols();
for(int bid = 0; bid < rows; bid += gridDim.x) {
@ -122,22 +76,22 @@ __global__ void gElement(Functor functor,
}
}
template <class Functor>
template <class Functor, class T1, class T2>
void Element(Functor functor,
Tensor out, Tensor in) {
T1 out, T2 in) {
int m = out.shape()[0];
int n = out.shape()[1];
int blocks = std::min(MAX_BLOCKS, m);
int threads = std::min(MAX_THREADS, n);
gElement<<<blocks, threads>>>(functor, TensorView(out), TensorView(in));
gElement<<<blocks, threads>>>(functor, out.gpu(), in.gpu());
cudaStreamSynchronize(0);
}
template <class Functor>
template <class Functor, class T1, class T2, class T3>
__global__ void gElement(Functor functor,
TensorView out, TensorView in1, TensorView in2) {
T1 out, T2 in1, T3 in2) {
int rows = out.rows();
int cols = out.cols();
for(int bid = 0; bid < rows; bid += gridDim.x) {
@ -152,23 +106,23 @@ __global__ void gElement(Functor functor,
}
}
template <class Functor>
template <class Functor, class T1, class T2, class T3>
void Element(Functor functor,
Tensor out, Tensor in1, Tensor in2) {
T1 out, T2 in1, T3 in2) {
int m = out.shape()[0];
int n = out.shape()[1];
int blocks = std::min(MAX_BLOCKS, m);
int threads = std::min(MAX_THREADS, n);
gElement<<<blocks, threads>>>(functor, TensorView(out),
TensorView(in1), TensorView(in2));
gElement<<<blocks, threads>>>(functor, out.gpu(),
in1.gpu(), in2.gpu());
cudaStreamSynchronize(0);
}
template <class Functor>
template <class Functor, class T1, class T2, class T3, class T4>
__global__ void gElement(Functor functor,
TensorView out, TensorView in1, TensorView in2, TensorView in3) {
T1 out, T2 in1, T3 in2, T4 in3) {
int rows = out.rows();
int cols = out.cols();
for(int bid = 0; bid < rows; bid += gridDim.x) {
@ -183,22 +137,20 @@ __global__ void gElement(Functor functor,
}
}
template <class Functor>
void Element(Functor functor, Tensor out,
Tensor in1, Tensor in2, Tensor in3) {
template <class Functor, class T1, class T2, class T3, class T4>
void Element(Functor functor,
T1 out, T2 in1, T3 in2, T4 in3) {
int m = out.shape()[0];
int n = out.shape()[1];
int blocks = std::min(MAX_BLOCKS, m);
int threads = std::min(MAX_THREADS, n);
gElement<<<blocks, threads>>>(functor, TensorView(out),
TensorView(in1), TensorView(in2), TensorView(in3));
gElement<<<blocks, threads>>>(functor, out.gpu(),
in1.gpu(), in2.gpu(), in3.gpu());
cudaStreamSynchronize(0);
}
void Dropout(Tensor Out, Tensor in, float p, int seed);
void SubtractMax(Tensor* Out);
void Softmax(Tensor* Out);

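The Element() overloads and their kernels are now templated on the argument types and call gpu() on each argument instead of wrapping everything in TensorView. Any object exposing gpu(), rows(), cols() and a __device__ operator()(int, int) can take part, which is what lets DropoutNodeOp feed the Bernoulli sampler straight into Element. A hedged sketch of such a call (assumes the marian headers above and a device-lambda-capable nvcc; note that in this commit InitStates() allocates the curand states into its own member, so the pointer argument is effectively a placeholder):

Tensor in({1000, 1000}, 3);        // input activations
Tensor out({1000, 1000});          // masked result
Bernoulli drop(0.2, in.shape());   // per-element dropout mask source

curandState* states = nullptr;
drop.InitStates(states);           // seeds one curand state per thread

auto applyMask = [] __device__ (float& r, float mask, float x) {
  return r = mask * x;             // mask is 0 or 1/(1 - p)
};
Element(applyMask, out, drop, in); // out(i, j) = drop(i, j) * in(i, j)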

@ -26,62 +26,27 @@
#include "mnist.h"
#include "vocab.h"
#include "tensor_operators.h"
#include "curand.h"
using namespace marian;
using namespace keywords;
template <class Functor>
__global__ void tgElement(Functor functor, TensorView t, int rows, int cols) {
for(int bid = 0; bid < rows; bid += gridDim.x) {
int i = bid + blockIdx.x;
if(i < rows) {
for(int tid = 0; tid < cols; tid += blockDim.x) {
int j = tid + threadIdx.x;
if(j < cols)
t(i, j) = functor(i, j);
}
}
}
}
template <class Functor>
void tElement(Functor functor, Tensor t) {
int m = t.shape()[0];
int n = t.shape()[1];
int blocks = std::min(MAX_BLOCKS, m);
int threads = std::min(MAX_THREADS, n);
tgElement<<<blocks, threads>>>(functor, TensorView(t), m, n);
cudaStreamSynchronize(0);
}
int main(int argc, char** argv) {
ExpressionGraph g;
//Tensor a({1000, 1000}, 3);
//Tensor b({1, 1}, 2);
//
//TensorView ta(a);
//TensorView tb(b);
//
//boost::timer::cpu_timer timer;
//
//
//auto f = _1 + _2;
//auto pp1 = [=] __device__ (int i, int j) mutable -> float {
// return f(ta(i, j), tb(i, j));
//};
//
//auto pp2 = [=] __device__ (int i, int j) mutable -> float {
// return f(pp1(i, j), tb(i, j));
//};
//
//for(int i = 0; i < 1000; ++i)
// tElement(pp2, a);
Tensor a({1000, 1000}, 3);
Tensor b({1000, 1000});
Bernoulli dropout(0.2, b.shape());
auto f = [] __device__ (float& r,
float a,
float b) {
return r = a * b;
};
// std::cerr << timer.format(5, "%ws") << std::endl;
boost::timer::cpu_timer timer;
for(int i = 0; i < 1000; ++i)
Element(f, b, a, a);
std::cerr << timer.format(5, "%ws") << std::endl;
return 0;
}


@ -138,7 +138,6 @@ namespace thrust
ReLUback(const actor<Eval> &_1) {
return compose(unary_operator<unary_reluback>(), _1);
}
}
}
}