restored speed

Marcin Junczys-Dowmunt 2016-11-11 01:12:07 +01:00
parent 349bca4416
commit d7d6062dbd
5 changed files with 34 additions and 33 deletions

View File

@@ -2,7 +2,7 @@ cmake_minimum_required(VERSION 3.5.1)
set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
project(marian CXX)
-SET(CMAKE_CXX_FLAGS " -std=c++11 -g -O3 -funroll-loops -Wno-unused-result -Wno-deprecated")
+SET(CMAKE_CXX_FLAGS " -std=c++11 -g -O3 -m64 -march=native -funroll-loops -ffinite-math-only -Wno-unused-result -Wno-deprecated -fPIC")
LIST(APPEND CUDA_NVCC_FLAGS --default-stream per-thread; -std=c++11; -g; -O3; -arch=sm_35; -lineinfo; --use_fast_math; -Xcompiler '-fPIC')
add_definitions(-DCUDA_API_PER_THREAD_DEFAULT_STREAM)
SET(CUDA_PROPAGATE_HOST_FLAGS OFF)
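
A note on the flags above: --default-stream per-thread (together with
-DCUDA_API_PER_THREAD_DEFAULT_STREAM) gives each host thread its own default
stream, so kernels launched from different threads can overlap instead of
serializing on the single legacy default stream. A minimal standalone sketch
of that behavior (the kernel and names here are illustrative, not from this
repository):

#include <cuda_runtime.h>
#include <thread>

__global__ void busy(float* x, int n) {
  // arbitrary work so the kernels run long enough to overlap
  for(int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
      i += gridDim.x * blockDim.x)
    x[i] = x[i] * 1.0001f + 0.5f;
}

int main() {
  const int n = 1 << 20;
  float *a, *b;
  cudaMalloc(&a, n * sizeof(float));
  cudaMalloc(&b, n * sizeof(float));
  auto launch = [n](float* p) {
    busy<<<128, 256>>>(p, n);                  // this thread's default stream
    cudaStreamSynchronize(cudaStreamPerThread);
  };
  std::thread t1(launch, a), t2(launch, b);    // may overlap under per-thread
  t1.join(); t2.join();
  cudaFree(a); cudaFree(b);
}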

View File

@@ -43,13 +43,14 @@ class Adagrad : public OptimizerBase {
void update(ExpressionGraphPtr graph, data::BatchPtr batch) {
graph->backprop(batch);
-if(gt_.size() < graph->params().size())
+if(gt_.size() < graph->params().size()) {
for(auto& param : graph->params()) {
gt_.emplace_back();
graph->tensor(gt_.back(), param->grad()->shape());
gt_.back()->set(0);
}
+}
auto gtIt = gt_.begin();
for(auto& param : graph->params()) {
Element(_1 += (_2 * _2),
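
The change here just wraps the lazy initialization of the per-parameter
accumulators gt_ in braces so all three statements run together; the
Element(_1 += (_2 * _2), ...) call just above accumulates squared gradients.
For reference, a plain-CPU sketch of the Adagrad rule this implements (eta
and eps are illustrative names, not taken from this diff):

#include <cmath>
#include <vector>

void adagradUpdate(std::vector<float>& param,
                   const std::vector<float>& grad,
                   std::vector<float>& gt,   // running sum of squared gradients
                   float eta = 0.01f, float eps = 1e-8f) {
  for(size_t i = 0; i < param.size(); ++i) {
    gt[i] += grad[i] * grad[i];              // the _1 += _2 * _2 element-wise op
    param[i] -= eta * grad[i] / (std::sqrt(gt[i]) + eps);
  }
}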

View File

@@ -21,8 +21,9 @@ void testForward(F f, size_t l,
auto ta = newTensorAllocator<DeviceGPU>();
-Tensor in = ta->tensor(shape);
-Tensor out = ta->tensor(shape);
+Tensor in, out;
+ta->allocate(in, shape);
+ta->allocate(out, shape);
uniform(-5, 5)(in);
@@ -43,11 +44,11 @@ void testBackward(F f, size_t l,
auto ta = newTensorAllocator<DeviceGPU>();
-Tensor in = ta->tensor(shape);
-Tensor adj = ta->tensor(shape);
+Tensor in, adj, grad;
+ta->allocate(in, shape);
+ta->allocate(adj, shape);
adj->set(1);
-Tensor grad = ta->tensor(shape);
+ta->allocate(grad, shape);
uniform(-5, 5)(in);
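
Both tests switch from the old ta->tensor(shape) factory, which returned a
fresh Tensor, to declaring empty Tensor handles and binding them with
ta->allocate(tensor, shape). A short usage sketch of the new pattern (the
shape value is a made-up example):

auto ta = newTensorAllocator<DeviceGPU>();

Tensor in, out;               // empty handles first...
ta->allocate(in, {4, 256});   // ...then bound to device memory
ta->allocate(out, {4, 256});
in->set(0);                   // usable once allocated, as in the tests above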

View File

@@ -35,15 +35,16 @@ using namespace thrust::placeholders;
template <class Functor, class T>
__global__ void gElement(Functor functor,
T out) {
-int rows = out->shape()[0];
-int cols = out->shape()[1];
+int rows = out.shape()[0];
+int cols = out.shape()[1];
for(int bid = 0; bid < rows; bid += gridDim.x) {
int i = bid + blockIdx.x;
if(i < rows) {
for(int tid = 0; tid < cols; tid += blockDim.x) {
int j = tid + threadIdx.x;
if(j < cols)
-(*out)(i, j) = functor((*out)(i, j));
+out(i, j) = functor(out(i, j));
}
}
}
@@ -68,15 +69,16 @@ void Element(Functor functor, T& out) {
template <class Functor, class T1, class T2>
__global__ void gElement(Functor functor,
T1 out, T2 in) {
-int rows = out->shape()[0];
-int cols = out->shape()[1];
+int rows = out.shape()[0];
+int cols = out.shape()[1];
for(int bid = 0; bid < rows; bid += gridDim.x) {
int i = bid + blockIdx.x;
if(i < rows) {
for(int tid = 0; tid < cols; tid += blockDim.x) {
int j = tid + threadIdx.x;
if(j < cols) {
-(*out)(i, j) = functor((*out)(i, j), (*in)(i, j));
+out(i, j) = functor(out(i, j), in(i, j));
}
}
}
@@ -103,15 +105,15 @@ void Element(Functor functor,
template <class Functor, class T1, class T2, class T3>
__global__ void gElement(Functor functor,
T1 out, T2 in1, T3 in2) {
-int rows = out->shape()[0];
-int cols = out->shape()[1];
+int rows = out.shape()[0];
+int cols = out.shape()[1];
for(int bid = 0; bid < rows; bid += gridDim.x) {
int i = bid + blockIdx.x;
if(i < rows) {
for(int tid = 0; tid < cols; tid += blockDim.x) {
int j = tid + threadIdx.x;
if(j < cols)
-(*out)(i, j) = functor((*out)(i, j), (*in1)(i, j), (*in2)(i, j));
+out(i, j) = functor(out(i, j), in1(i, j), in2(i, j));
}
}
}
@@ -140,8 +142,8 @@ void Element(Functor functor,
template <class Functor, class T1, class T2, class T3, class T4>
__global__ void gElement(Functor functor,
T1 out, T2 in1, T3 in2, T4 in3) {
-int rows = out->shape()[0];
-int cols = out->shape()[1];
+int rows = out.shape()[0];
+int cols = out.shape()[1];
for(int bid = 0; bid < rows; bid += gridDim.x) {
int i = bid + blockIdx.x;
@@ -149,7 +151,7 @@ __global__ void gElement(Functor functor,
for(int tid = 0; tid < cols; tid += blockDim.x) {
int j = tid + threadIdx.x;
if(j < cols)
-(*out)(i, j) = functor((*out)(i, j), (*in1)(i, j), (*in2)(i, j), (*in3)(i, j));
+out(i, j) = functor(out(i, j), in1(i, j), in2(i, j), in3(i, j));
}
}
}
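
The pattern across all four kernels is the same: the tensor accessor is now
passed to the kernel by value (T out) and indexed as out(i, j), instead of
arriving as a device pointer that every element access had to dereference as
(*out)(i, j). Kernel launch arguments are copied to the device anyway, so a
small accessor struct can ride along for free. A minimal sketch of the idea
(simplified; the real Access in this codebase carries a Shape):

struct View {
  float* data;
  int rows, cols;
  __host__ __device__ float& operator()(int i, int j) {
    return data[i * cols + j];
  }
};

template <class Functor>
__global__ void apply(Functor f, View out) {  // copied by value into kernel args
  int i = blockIdx.x;
  int j = threadIdx.x;
  if(i < out.rows && j < out.cols)
    out(i, j) = f(out(i, j));                 // no extra pointer dereference
}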

View File

@@ -63,24 +63,22 @@ struct Access {
return shape_;
}
-Access* toDevice() {
-Access* ptr;
-cudaMalloc(&ptr, sizeof(Access));
-cudaMemcpy(ptr, this, sizeof(Access), cudaMemcpyHostToDevice);
-return ptr;
-}
+//Access* toDevice() {
+// Access* ptr;
+// cudaMalloc(&ptr, sizeof(Access));
+// cudaMemcpy(ptr, this, sizeof(Access), cudaMemcpyHostToDevice);
+// return ptr;
+//}
};
class TensorGPU : public TensorBase {
private:
// cuDNN stuff
cudnnTensorDescriptor_t cudnnDesc_;
-Access* access_;
public:
TensorGPU(float* data, Shape shape)
-: TensorBase(data, shape),
-access_(Access(data, shape).toDevice()) {
+: TensorBase(data, shape) {
cudnnCreateTensorDescriptor(&cudnnDesc_);
cudnnSetTensor4dDescriptorEx(cudnnDesc_, CUDNN_DATA_FLOAT,
shape_[0], shape_[1], 1, 1,
@@ -89,7 +87,6 @@ class TensorGPU : public TensorBase {
~TensorGPU() {
cudnnDestroyTensorDescriptor(cudnnDesc_);
-cudaFree(access_);
}
@@ -121,8 +118,8 @@ class TensorGPU : public TensorBase {
return cudnnDesc_;
}
-Access* access() {
-return access_;
+Access access() {
+return Access(data_, shape_);
}
std::string debug() {
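
This last file is where the speed was actually restored: TensorGPU previously
mirrored an Access object into device memory on every construction (a
cudaMalloc plus a cudaMemcpy, paired with a cudaFree in the destructor), and
access() handed out that device pointer. Now access() just builds a cheap
Access by value from data_ and shape_, and the kernels above receive it
through their launch arguments. A rough standalone sketch of the per-object
cost that was removed (the timing harness is illustrative only):

#include <cuda_runtime.h>
#include <chrono>
#include <cstdio>

int main() {
  const int N = 10000;
  auto t0 = std::chrono::steady_clock::now();
  for(int i = 0; i < N; ++i) {
    int host = i;
    int* dev;
    cudaMalloc(&dev, sizeof(int));   // what every TensorGPU ctor used to pay
    cudaMemcpy(dev, &host, sizeof(int), cudaMemcpyHostToDevice);
    cudaFree(dev);                   // and what every dtor paid
  }
  auto t1 = std::chrono::steady_clock::now();
  printf("%.1f us per malloc/copy/free round trip\n",
         std::chrono::duration<double, std::micro>(t1 - t0).count() / N);
}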