a number of comments and test

This commit is contained in:
Marcin Junczys-Dowmunt 2016-09-20 13:27:08 +02:00
parent 8797b5ffd3
commit 4c8b6bb171
4 changed files with 56 additions and 10 deletions

CMakeLists.txt View File

@@ -3,7 +3,7 @@ set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
project(marian CXX)
SET(CMAKE_CXX_FLAGS " -std=c++11 -g -O3 -funroll-loops -Wno-unused-result -Wno-deprecated")
-LIST(APPEND CUDA_NVCC_FLAGS --default-stream per-thread; -std=c++11; -g; -O3; -arch=sm_35; -lineinfo; --use_fast_math; -Xcompiler '-fPIC')
+LIST(APPEND CUDA_NVCC_FLAGS --default-stream per-thread; -std=c++11; -g; -O3; -arch=sm_35; -lineinfo; --use_fast_math; --expt-extended-lambda; -Xcompiler '-fPIC')
add_definitions(-DCUDA_API_PER_THREAD_DEFAULT_STREAM)
SET(CUDA_PROPAGATE_HOST_FLAGS OFF)
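
The flag added above, --expt-extended-lambda, lets nvcc accept __device__ lambdas written in host code, which the new test code later in this commit depends on. A minimal sketch of what the flag enables (illustrative only; the kernel and names below are hypothetical, not part of this commit):

template <class F>
__global__ void apply(F f, float* out, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if(i < n)
    out[i] = f(i);   // f may be an extended __device__ lambda
}

// Host side: without --expt-extended-lambda this is a compile error.
// auto square = [=] __device__ (int i) -> float { return float(i * i); };
// apply<<<blocks, threads>>>(square, d_out, n);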

View File

@@ -33,8 +33,8 @@ ExpressionGraph build_graph(const std::vector<int>& dims) {
      layers.emplace_back(x);
    }
    else {
-     layers.emplace_back(reluplus(dot(layers.back(), weights.back()), biases.back()));
-     //layers.emplace_back(relu(dot(layers.back(), weights.back()) + biases.back()));
+     //layers.emplace_back(reluplus(dot(layers.back(), weights.back()), biases.back()));
+     layers.emplace_back(relu(dot(layers.back(), weights.back()) + biases.back()));
    }
    weights.emplace_back(
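
The hunk above flips which formulation is active: the fused reluplus(dot(x, W), b) node is commented out in favor of relu(dot(x, W) + b). Both compute ReLU(x·W + b); the difference is only whether the bias add and the ReLU run as one fused node or as two separate graph nodes. A scalar sketch of the equivalence (helper names hypothetical):

// For any x and b: relu(x + b) == reluplus(x, b).
float relu(float x)              { return x > 0.0f ? x : 0.0f; }
float reluplus(float x, float b) { return relu(x + b); }  // fused add + ReLU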

View File

@@ -454,11 +454,14 @@ struct ReLUPlusNodeOp : public BinaryNodeOp {
    : BinaryNodeOp(a, b, keywords::shape=a->shape(), args...) { }

  void forward() {
+   // v = f(g(a, b))
    Element(_1 = ReLU(_2 + _3),
            val_, a_->val(), b_->val());
  }

  void backward() {
+   // df/da = adj * f'(g(a, b)) * dg/da
+   // df/db = adj * f'(g(a, b)) * dg/db
    Element(_1 += _2 * ReLUback(_3 + _4),
            a_->grad(), adj_, a_->val(), b_->val());
    Element(_1 += _2 * ReLUback(_3 + _4),
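
For f = ReLU and g(a, b) = a + b, dg/da = dg/db = 1, so both gradients accumulate the incoming adjoint masked by the sign of a + b. A scalar sketch of the rule (assuming ReLUback is the step-function derivative of ReLU, consistent with the forward pass above):

// Assumed definition: ReLUback(x) = 1 if x > 0, else 0.
float ReLUback(float x) { return x > 0.0f ? 1.0f : 0.0f; }
// da += adj * ReLUback(a + b);  // dg/da = 1
// db += adj * ReLUback(a + b);  // dg/db = 1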

View File

@@ -30,15 +30,58 @@
using namespace marian;
using namespace keywords;
template <class Functor>
__global__ void tgElement(Functor functor, TensorView t, int rows, int cols) {
  for(int bid = 0; bid < rows; bid += gridDim.x) {
    int i = bid + blockIdx.x;
    if(i < rows) {
      for(int tid = 0; tid < cols; tid += blockDim.x) {
        int j = tid + threadIdx.x;
        if(j < cols)
          t(i, j) = functor(i, j);
      }
    }
  }
}

template <class Functor>
void tElement(Functor functor, Tensor t) {
  int m = t.shape()[0];
  int n = t.shape()[1];

  int blocks = std::min(MAX_BLOCKS, m);
  int threads = std::min(MAX_THREADS, n);
  tgElement<<<blocks, threads>>>(functor, TensorView(t), m, n);
  cudaStreamSynchronize(0);
}
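
tElement launches the grid-stride kernel above: blocks stride over rows and threads stride over columns, so tensors larger than MAX_BLOCKS x MAX_THREADS are still covered. A minimal usage sketch (hypothetical values, mirroring the commented-out experiment in main below):

// Fill each element of a tensor from its row/column indices using an
// extended __device__ lambda (enabled by --expt-extended-lambda above).
// Tensor t({4, 4}, 0);
// auto fill = [=] __device__ (int i, int j) mutable -> float {
//   return i * 10.0f + j;
// };
// tElement(fill, t);  // afterwards t(i, j) == 10*i + j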
int main(int argc, char** argv) {
  ExpressionGraph g;

  Tensor a({1000, 1000}, 3);
  Tensor b({1, 1}, 2);

  boost::timer::cpu_timer timer;
  for(int i = 0; i < 1000; ++i)
    Element(_1 += _1 * _2, a, b);
  std::cerr << timer.format(5, "%ws") << std::endl;

  //Tensor a({1000, 1000}, 3);
  //Tensor b({1, 1}, 2);
  //
  //TensorView ta(a);
  //TensorView tb(b);
  //
  //boost::timer::cpu_timer timer;
  //
  //
  //auto f = _1 + _2;
  //auto pp1 = [=] __device__ (int i, int j) mutable -> float {
  //  return f(ta(i, j), tb(i, j));
  //};
  //
  //auto pp2 = [=] __device__ (int i, int j) mutable -> float {
  //  return f(pp1(i, j), tb(i, j));
  //};
  //
  //for(int i = 0; i < 1000; ++i)
  //  tElement(pp2, a);
  //std::cerr << timer.format(5, "%ws") << std::endl;

  return 0;
}
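
The commented-out block times an analogous elementwise update through hand-composed __device__ lambdas instead of the Element expression: pp1 applies f = _1 + _2 to ta and tb, and pp2 applies f again on top of pp1, i.e. (a + b) + b per element. This composition is exactly what the --expt-extended-lambda flag added in CMakeLists.txt permits. A standalone sketch of the pattern (names hypothetical):

// Each layer captures the previous lambda by value; nvcc accepts these
// __device__ lambdas in host code only with --expt-extended-lambda.
auto add   = [=] __device__ (float x, float y) { return x + y; };
auto twice = [=] __device__ (float x, float y) { return add(add(x, y), y); };  // (x + y) + y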