restored speed

Marcin Junczys-Dowmunt 2016-11-11 01:12:07 +01:00
parent 349bca4416
commit d7d6062dbd
5 changed files with 34 additions and 33 deletions

View File

@@ -2,7 +2,7 @@ cmake_minimum_required(VERSION 3.5.1)
set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
project(marian CXX)
-SET(CMAKE_CXX_FLAGS " -std=c++11 -g -O3 -funroll-loops -Wno-unused-result -Wno-deprecated")
+SET(CMAKE_CXX_FLAGS " -std=c++11 -g -O3 -m64 -march=native -funroll-loops -ffinite-math-only -Wno-unused-result -Wno-deprecated -fPIC")
LIST(APPEND CUDA_NVCC_FLAGS --default-stream per-thread; -std=c++11; -g; -O3; -arch=sm_35; -lineinfo; --use_fast_math; -Xcompiler '-fPIC')
add_definitions(-DCUDA_API_PER_THREAD_DEFAULT_STREAM)
SET(CUDA_PROPAGATE_HOST_FLAGS OFF)
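
A note on the flags above: --default-stream per-thread (together with
-DCUDA_API_PER_THREAD_DEFAULT_STREAM) gives each host thread its own default
stream, so kernels launched from different threads can overlap instead of
serializing on the single legacy default stream. A minimal standalone sketch
of that behavior (the kernel and names here are illustrative, not from this
repository):

#include <cuda_runtime.h>
#include <thread>

__global__ void busy(float* x, int n) {
  // arbitrary work so the kernels run long enough to overlap
  for(int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
      i += gridDim.x * blockDim.x)
    x[i] = x[i] * 1.0001f + 0.5f;
}

int main() {
  const int n = 1 << 20;
  float *a, *b;
  cudaMalloc(&a, n * sizeof(float));
  cudaMalloc(&b, n * sizeof(float));
  auto launch = [n](float* p) {
    busy<<<128, 256>>>(p, n);                  // this thread's default stream
    cudaStreamSynchronize(cudaStreamPerThread);
  };
  std::thread t1(launch, a), t2(launch, b);    // may overlap under per-thread
  t1.join(); t2.join();
  cudaFree(a); cudaFree(b);
}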

View File

@@ -43,13 +43,14 @@ class Adagrad : public OptimizerBase {
void update(ExpressionGraphPtr graph, data::BatchPtr batch) {
graph->backprop(batch);
-if(gt_.size() < graph->params().size())
+if(gt_.size() < graph->params().size()) {
for(auto& param : graph->params()) {
gt_.emplace_back();
graph->tensor(gt_.back(), param->grad()->shape());
gt_.back()->set(0);
}
+}
auto gtIt = gt_.begin();
for(auto& param : graph->params()) {
Element(_1 += (_2 * _2),
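
The change here just wraps the lazy initialization of the per-parameter
accumulators gt_ in braces so all three statements run together; the
Element(_1 += (_2 * _2), ...) call just above accumulates squared gradients.
For reference, a plain-CPU sketch of the Adagrad rule this implements (eta
and eps are illustrative names, not taken from this diff):

#include <cmath>
#include <vector>

void adagradUpdate(std::vector<float>& param,
                   const std::vector<float>& grad,
                   std::vector<float>& gt,   // running sum of squared gradients
                   float eta = 0.01f, float eps = 1e-8f) {
  for(size_t i = 0; i < param.size(); ++i) {
    gt[i] += grad[i] * grad[i];              // the _1 += _2 * _2 element-wise op
    param[i] -= eta * grad[i] / (std::sqrt(gt[i]) + eps);
  }
}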

View File

@@ -21,8 +21,9 @@ void testForward(F f, size_t l,
auto ta = newTensorAllocator<DeviceGPU>();
-Tensor in = ta->tensor(shape);
-Tensor out = ta->tensor(shape);
+Tensor in, out;
+ta->allocate(in, shape);
+ta->allocate(out, shape);
uniform(-5, 5)(in);
@@ -43,11 +44,11 @@ void testBackward(F f, size_t l,
auto ta = newTensorAllocator<DeviceGPU>();
-Tensor in = ta->tensor(shape);
-Tensor adj = ta->tensor(shape);
+Tensor in, adj, grad;
+ta->allocate(in, shape);
+ta->allocate(adj, shape);
adj->set(1);
-Tensor grad = ta->tensor(shape);
+ta->allocate(grad, shape);
uniform(-5, 5)(in);
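
Both tests switch from the old ta->tensor(shape) factory, which returned a
fresh Tensor, to declaring empty Tensor handles and binding them with
ta->allocate(tensor, shape). A short usage sketch of the new pattern (the
shape value is a made-up example):

auto ta = newTensorAllocator<DeviceGPU>();

Tensor in, out;               // empty handles first...
ta->allocate(in, {4, 256});   // ...then bound to device memory
ta->allocate(out, {4, 256});
in->set(0);                   // usable once allocated, as in the tests above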

View File

@@ -35,15 +35,16 @@ using namespace thrust::placeholders;
template <class Functor, class T>
__global__ void gElement(Functor functor,
T out) {
-int rows = out->shape()[0];
-int cols = out->shape()[1];
+int rows = out.shape()[0];
+int cols = out.shape()[1];
for(int bid = 0; bid < rows; bid += gridDim.x) {
int i = bid + blockIdx.x;
if(i < rows) {
for(int tid = 0; tid < cols; tid += blockDim.x) {
int j = tid + threadIdx.x;
if(j < cols)
-(*out)(i, j) = functor((*out)(i, j));
+out(i, j) = functor(out(i, j));
}
}
}
@@ -68,15 +69,16 @@ void Element(Functor functor, T& out) {
template <class Functor, class T1, class T2>
__global__ void gElement(Functor functor,
T1 out, T2 in) {
-int rows = out->shape()[0];
-int cols = out->shape()[1];
+int rows = out.shape()[0];
+int cols = out.shape()[1];
for(int bid = 0; bid < rows; bid += gridDim.x) {
int i = bid + blockIdx.x;
if(i < rows) {
for(int tid = 0; tid < cols; tid += blockDim.x) {
int j = tid + threadIdx.x;
if(j < cols) {
-(*out)(i, j) = functor((*out)(i, j), (*in)(i, j));
+out(i, j) = functor(out(i, j), in(i, j));
}
}
}
@@ -103,15 +105,15 @@ void Element(Functor functor,
template <class Functor, class T1, class T2, class T3>
__global__ void gElement(Functor functor,
T1 out, T2 in1, T3 in2) {
-int rows = out->shape()[0];
-int cols = out->shape()[1];
+int rows = out.shape()[0];
+int cols = out.shape()[1];
for(int bid = 0; bid < rows; bid += gridDim.x) {
int i = bid + blockIdx.x;
if(i < rows) {
for(int tid = 0; tid < cols; tid += blockDim.x) {
int j = tid + threadIdx.x;
if(j < cols)
-(*out)(i, j) = functor((*out)(i, j), (*in1)(i, j), (*in2)(i, j));
+out(i, j) = functor(out(i, j), in1(i, j), in2(i, j));
}
}
}
@@ -140,8 +142,8 @@ void Element(Functor functor,
template <class Functor, class T1, class T2, class T3, class T4>
__global__ void gElement(Functor functor,
T1 out, T2 in1, T3 in2, T4 in3) {
-int rows = out->shape()[0];
-int cols = out->shape()[1];
+int rows = out.shape()[0];
+int cols = out.shape()[1];
for(int bid = 0; bid < rows; bid += gridDim.x) {
int i = bid + blockIdx.x;
@@ -149,7 +151,7 @@ __global__ void gElement(Functor functor,
for(int tid = 0; tid < cols; tid += blockDim.x) {
int j = tid + threadIdx.x;
if(j < cols)
-(*out)(i, j) = functor((*out)(i, j), (*in1)(i, j), (*in2)(i, j), (*in3)(i, j));
+out(i, j) = functor(out(i, j), in1(i, j), in2(i, j), in3(i, j));
}
}
}
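
The pattern across all four kernels is the same: the tensor accessor is now
passed to the kernel by value (T out) and indexed as out(i, j), instead of
arriving as a device pointer that every element access had to dereference as
(*out)(i, j). Kernel launch arguments are copied to the device anyway, so a
small accessor struct can ride along for free. A minimal sketch of the idea
(simplified; the real Access in this codebase carries a Shape):

struct View {
  float* data;
  int rows, cols;
  __host__ __device__ float& operator()(int i, int j) {
    return data[i * cols + j];
  }
};

template <class Functor>
__global__ void apply(Functor f, View out) {  // copied by value into kernel args
  int i = blockIdx.x;
  int j = threadIdx.x;
  if(i < out.rows && j < out.cols)
    out(i, j) = f(out(i, j));                 // no extra pointer dereference
}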

View File

@@ -63,24 +63,22 @@ struct Access {
return shape_;
}
-Access* toDevice() {
-Access* ptr;
-cudaMalloc(&ptr, sizeof(Access));
-cudaMemcpy(ptr, this, sizeof(Access), cudaMemcpyHostToDevice);
-return ptr;
-}
+//Access* toDevice() {
+// Access* ptr;
+// cudaMalloc(&ptr, sizeof(Access));
+// cudaMemcpy(ptr, this, sizeof(Access), cudaMemcpyHostToDevice);
+// return ptr;
+//}
};
class TensorGPU : public TensorBase {
private:
// cuDNN stuff
cudnnTensorDescriptor_t cudnnDesc_;
-Access* access_;
public:
TensorGPU(float* data, Shape shape)
-: TensorBase(data, shape),
-access_(Access(data, shape).toDevice()) {
+: TensorBase(data, shape) {
cudnnCreateTensorDescriptor(&cudnnDesc_);
cudnnSetTensor4dDescriptorEx(cudnnDesc_, CUDNN_DATA_FLOAT,
shape_[0], shape_[1], 1, 1,
@@ -89,7 +87,6 @@ class TensorGPU : public TensorBase {
~TensorGPU() {
cudnnDestroyTensorDescriptor(cudnnDesc_);
-cudaFree(access_);
}
@@ -121,8 +118,8 @@ class TensorGPU : public TensorBase {
return cudnnDesc_;
}
-Access* access() {
-return access_;
+Access access() {
+return Access(data_, shape_);
}
std::string debug() {
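
This last file is where the speed was actually restored: TensorGPU previously
mirrored an Access object into device memory on every construction (a
cudaMalloc plus a cudaMemcpy, paired with a cudaFree in the destructor), and
access() handed out that device pointer. Now access() just builds a cheap
Access by value from data_ and shape_, and the kernels above receive it
through their launch arguments. A rough standalone sketch of the per-object
cost that was removed (the timing harness is illustrative only):

#include <cuda_runtime.h>
#include <chrono>
#include <cstdio>

int main() {
  const int N = 10000;
  auto t0 = std::chrono::steady_clock::now();
  for(int i = 0; i < N; ++i) {
    int host = i;
    int* dev;
    cudaMalloc(&dev, sizeof(int));   // what every TensorGPU ctor used to pay
    cudaMemcpy(dev, &host, sizeof(int), cudaMemcpyHostToDevice);
    cudaFree(dev);                   // and what every dtor paid
  }
  auto t1 = std::chrono::steady_clock::now();
  printf("%.1f us per malloc/copy/free round trip\n",
         std::chrono::duration<double, std::micro>(t1 - t0).count() / N);
}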