mirror of
https://github.com/marian-nmt/marian.git
synced 2024-09-19 02:37:14 +03:00
restored speed
This commit is contained in:
parent
349bca4416
commit
d7d6062dbd
@ -2,7 +2,7 @@ cmake_minimum_required(VERSION 3.5.1)
|
||||
set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
|
||||
|
||||
project(marian CXX)
|
||||
SET(CMAKE_CXX_FLAGS " -std=c++11 -g -O3 -funroll-loops -Wno-unused-result -Wno-deprecated")
|
||||
SET(CMAKE_CXX_FLAGS " -std=c++11 -g -O3 -m64 -march=native -funroll-loops -ffinite-math-only -Wno-unused-result -Wno-deprecated -fPIC")
|
||||
LIST(APPEND CUDA_NVCC_FLAGS --default-stream per-thread; -std=c++11; -g; -O3; -arch=sm_35; -lineinfo; --use_fast_math; -Xcompiler '-fPIC')
|
||||
add_definitions(-DCUDA_API_PER_THREAD_DEFAULT_STREAM)
|
||||
SET(CUDA_PROPAGATE_HOST_FLAGS OFF)
|
||||
|
@ -43,13 +43,14 @@ class Adagrad : public OptimizerBase {
|
||||
void update(ExpressionGraphPtr graph, data::BatchPtr batch) {
|
||||
graph->backprop(batch);
|
||||
|
||||
if(gt_.size() < graph->params().size())
|
||||
if(gt_.size() < graph->params().size()) {
|
||||
for(auto& param : graph->params()) {
|
||||
gt_.emplace_back();
|
||||
graph->tensor(gt_.back(), param->grad()->shape());
|
||||
gt_.back()->set(0);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
auto gtIt = gt_.begin();
|
||||
for(auto& param : graph->params()) {
|
||||
Element(_1 += (_2 * _2),
|
||||
|
@ -21,8 +21,9 @@ void testForward(F f, size_t l,
|
||||
|
||||
auto ta = newTensorAllocator<DeviceGPU>();
|
||||
|
||||
Tensor in = ta->tensor(shape);
|
||||
Tensor out = ta->tensor(shape);
|
||||
Tensor in, out;
|
||||
ta->allocate(in, shape);
|
||||
ta->allocate(out, shape);
|
||||
|
||||
uniform(-5, 5)(in);
|
||||
|
||||
@ -43,11 +44,11 @@ void testBackward(F f, size_t l,
|
||||
|
||||
auto ta = newTensorAllocator<DeviceGPU>();
|
||||
|
||||
Tensor in = ta->tensor(shape);
|
||||
Tensor adj = ta->tensor(shape);
|
||||
Tensor in, adj, grad;
|
||||
ta->allocate(in, shape);
|
||||
ta->allocate(adj, shape);
|
||||
adj->set(1);
|
||||
|
||||
Tensor grad = ta->tensor(shape);
|
||||
ta->allocate(grad, shape);
|
||||
|
||||
uniform(-5, 5)(in);
|
||||
|
||||
|
@ -35,15 +35,16 @@ using namespace thrust::placeholders;
|
||||
template <class Functor, class T>
|
||||
__global__ void gElement(Functor functor,
|
||||
T out) {
|
||||
int rows = out->shape()[0];
|
||||
int cols = out->shape()[1];
|
||||
int rows = out.shape()[0];
|
||||
int cols = out.shape()[1];
|
||||
|
||||
for(int bid = 0; bid < rows; bid += gridDim.x) {
|
||||
int i = bid + blockIdx.x;
|
||||
if(i < rows) {
|
||||
for(int tid = 0; tid < cols; tid += blockDim.x) {
|
||||
int j = tid + threadIdx.x;
|
||||
if(j < cols)
|
||||
(*out)(i, j) = functor((*out)(i, j));
|
||||
out(i, j) = functor(out(i, j));
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -68,15 +69,16 @@ void Element(Functor functor, T& out) {
|
||||
template <class Functor, class T1, class T2>
|
||||
__global__ void gElement(Functor functor,
|
||||
T1 out, T2 in) {
|
||||
int rows = out->shape()[0];
|
||||
int cols = out->shape()[1];
|
||||
int rows = out.shape()[0];
|
||||
int cols = out.shape()[1];
|
||||
|
||||
for(int bid = 0; bid < rows; bid += gridDim.x) {
|
||||
int i = bid + blockIdx.x;
|
||||
if(i < rows) {
|
||||
for(int tid = 0; tid < cols; tid += blockDim.x) {
|
||||
int j = tid + threadIdx.x;
|
||||
if(j < cols) {
|
||||
(*out)(i, j) = functor((*out)(i, j), (*in)(i, j));
|
||||
out(i, j) = functor(out(i, j), in(i, j));
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -103,15 +105,15 @@ void Element(Functor functor,
|
||||
template <class Functor, class T1, class T2, class T3>
|
||||
__global__ void gElement(Functor functor,
|
||||
T1 out, T2 in1, T3 in2) {
|
||||
int rows = out->shape()[0];
|
||||
int cols = out->shape()[1];
|
||||
int rows = out.shape()[0];
|
||||
int cols = out.shape()[1];
|
||||
for(int bid = 0; bid < rows; bid += gridDim.x) {
|
||||
int i = bid + blockIdx.x;
|
||||
if(i < rows) {
|
||||
for(int tid = 0; tid < cols; tid += blockDim.x) {
|
||||
int j = tid + threadIdx.x;
|
||||
if(j < cols)
|
||||
(*out)(i, j) = functor((*out)(i, j), (*in1)(i, j), (*in2)(i, j));
|
||||
out(i, j) = functor(out(i, j), in1(i, j), in2(i, j));
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -140,8 +142,8 @@ void Element(Functor functor,
|
||||
template <class Functor, class T1, class T2, class T3, class T4>
|
||||
__global__ void gElement(Functor functor,
|
||||
T1 out, T2 in1, T3 in2, T4 in3) {
|
||||
int rows = out->shape()[0];
|
||||
int cols = out->shape()[1];
|
||||
int rows = out.shape()[0];
|
||||
int cols = out.shape()[1];
|
||||
|
||||
for(int bid = 0; bid < rows; bid += gridDim.x) {
|
||||
int i = bid + blockIdx.x;
|
||||
@ -149,7 +151,7 @@ __global__ void gElement(Functor functor,
|
||||
for(int tid = 0; tid < cols; tid += blockDim.x) {
|
||||
int j = tid + threadIdx.x;
|
||||
if(j < cols)
|
||||
(*out)(i, j) = functor((*out)(i, j), (*in1)(i, j), (*in2)(i, j), (*in3)(i, j));
|
||||
out(i, j) = functor(out(i, j), in1(i, j), in2(i, j), in3(i, j));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -63,24 +63,22 @@ struct Access {
|
||||
return shape_;
|
||||
}
|
||||
|
||||
Access* toDevice() {
|
||||
Access* ptr;
|
||||
cudaMalloc(&ptr, sizeof(Access));
|
||||
cudaMemcpy(ptr, this, sizeof(Access), cudaMemcpyHostToDevice);
|
||||
return ptr;
|
||||
}
|
||||
//Access* toDevice() {
|
||||
// Access* ptr;
|
||||
// cudaMalloc(&ptr, sizeof(Access));
|
||||
// cudaMemcpy(ptr, this, sizeof(Access), cudaMemcpyHostToDevice);
|
||||
// return ptr;
|
||||
//}
|
||||
};
|
||||
|
||||
class TensorGPU : public TensorBase {
|
||||
private:
|
||||
// cuDNN stuff
|
||||
cudnnTensorDescriptor_t cudnnDesc_;
|
||||
Access* access_;
|
||||
|
||||
public:
|
||||
TensorGPU(float* data, Shape shape)
|
||||
: TensorBase(data, shape),
|
||||
access_(Access(data, shape).toDevice()) {
|
||||
: TensorBase(data, shape) {
|
||||
cudnnCreateTensorDescriptor(&cudnnDesc_);
|
||||
cudnnSetTensor4dDescriptorEx(cudnnDesc_, CUDNN_DATA_FLOAT,
|
||||
shape_[0], shape_[1], 1, 1,
|
||||
@ -89,7 +87,6 @@ class TensorGPU : public TensorBase {
|
||||
|
||||
~TensorGPU() {
|
||||
cudnnDestroyTensorDescriptor(cudnnDesc_);
|
||||
cudaFree(access_);
|
||||
}
|
||||
|
||||
|
||||
@ -121,8 +118,8 @@ class TensorGPU : public TensorBase {
|
||||
return cudnnDesc_;
|
||||
}
|
||||
|
||||
Access* access() {
|
||||
return access_;
|
||||
Access access() {
|
||||
return Access(data_, shape_);
|
||||
}
|
||||
|
||||
std::string debug() {
|
||||
|
Loading…
Reference in New Issue
Block a user