Mirror of https://github.com/marian-nmt/marian.git

Commit 0a5d197889
Merge branch 'master' of https://github.com/emjotde/Marian
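Summary of the merged changes, as visible in the hunks below: the hand-rolled curand-based dropout (the Bernoulli functor in dropout.h/dropout.cu and the dropout_benchmark target) is removed in favour of cuDNN's dropout API, wrapped by new CudnnDropoutPrepare/Destroy/Forward/Backward helpers in the tensor operators; Node::StoreTensorInVec is replaced by an operator<< copy in the debug gradient checks; and the NVCC extended-lambda flags, a test's layer sizes, and its RNG seeding are adjusted accordingly.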
@@ -3,7 +3,7 @@ set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)

 project(marian CXX)

 SET(CMAKE_CXX_FLAGS " -std=c++11 -g -O3 -funroll-loops -Wno-unused-result -Wno-deprecated")
-LIST(APPEND CUDA_NVCC_FLAGS --default-stream per-thread; -std=c++11; -g; -O3; -arch=sm_35; -lineinfo; --use_fast_math; --expt-extended-lambda; --expt-relaxed-constexpr; -Xcompiler '-fPIC')
+LIST(APPEND CUDA_NVCC_FLAGS --default-stream per-thread; -std=c++11; -g; -O3; -arch=sm_35; -lineinfo; --use_fast_math; -Xcompiler '-fPIC')
 add_definitions(-DCUDA_API_PER_THREAD_DEFAULT_STREAM)
 SET(CUDA_PROPAGATE_HOST_FLAGS OFF)
@@ -3,7 +3,6 @@ include_directories(.)

 cuda_add_library(marian_lib
   cnpy/cnpy.cpp
-  dropout.cu
   exception.cpp
   expression_graph.cu
   expression_operators.cu
@@ -15,11 +14,6 @@ cuda_add_library(marian_lib

 target_link_libraries(marian_lib)

-cuda_add_executable(
-  dropout_benchmark
-  dropout_benchmark.cu
-)
-
 cuda_add_executable(
   softmax_benchmark
   softmax_benchmark.cu
@@ -45,14 +39,13 @@ cuda_add_executable(
   test_nodes.cu
 )

-target_link_libraries(dropout_benchmark marian_lib)
 target_link_libraries(softmax_benchmark marian_lib)
 target_link_libraries(mnist_benchmark marian_lib)
 target_link_libraries(validate_mnist_batch marian_lib)
 target_link_libraries(validate_encoder_decoder marian_lib)
 target_link_libraries(test_nodes marian_lib)

-foreach(exec dropout_benchmark mnist_benchmark softmax_benchmark validate_mnist_batch validate_encoder_decoder test_nodes )
+foreach(exec mnist_benchmark softmax_benchmark validate_mnist_batch validate_encoder_decoder test_nodes )
   target_link_libraries(${exec} ${EXT_LIBS} cuda cudnn curand)
   cuda_add_cublas_to_target(${exec})
   set_target_properties(${exec} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}")
@@ -1,15 +0,0 @@
-#include <curand.h>
-#include <curand_kernel.h>
-
-#include "dropout.h"
-
-namespace marian {
-
-__global__ void gInitCurandStates(curandState* states, unsigned int seed) {
-  int tid = threadIdx.x + blockIdx.x * blockDim.x;
-  curand_init(seed, tid, 0, &states[tid]);
-}
-
-unsigned Bernoulli::seed = time(0);
-
-}
@@ -1,57 +0,0 @@
-#pragma once
-
-#include <curand.h>
-#include <curand_kernel.h>
-
-#include "tensor_operators.h"
-
-namespace marian {
-
-__global__ void gInitCurandStates(curandState* states, unsigned int seed);
-
-class Bernoulli {
-  private:
-    float p_;
-    curandState* states_;
-    static unsigned seed;
-    Shape shape_;
-
-  public:
-    Bernoulli(float p, const Shape& shape)
-    : p_(p), shape_(shape) {}
-
-    void InitStates(curandState* states) {
-      states_ = states;
-      int blocks = std::min(MAX_BLOCKS, shape_[0]);
-      int threads = std::min(MAX_THREADS, shape_[1]);
-      int n = blocks * threads;
-      cudaMalloc((void**) &states_, n * sizeof(curandState));
-      gInitCurandStates<<<blocks, threads>>>(states_, seed++);
-      cudaStreamSynchronize(0);
-    }
-
-    void FreeStates(curandState* states) {
-      cudaFree(states);
-    }
-
-    __device__ float operator()(int i, int j) const {
-      int tid = threadIdx.x + blockIdx.x * blockDim.x;
-      float dist = curand_uniform(&states_[tid]);
-      float zeroOne = dist > p_;
-      return zeroOne / (1 - p_);
-    }
-
-    __device__ int rows() const {
-      return shape_[0];
-    }
-
-    __device__ int cols() const {
-      return shape_[1];
-    }
-
-    Bernoulli& gpu() {
-      return *this;
-    }
-};
-
-}
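The two deleted files implement inverted dropout by hand: dropout.cu seeds one curandState per CUDA thread, and the Bernoulli functor keeps a unit with probability 1 - p while scaling survivors by 1/(1 - p), so activations keep their expectation at training time and inference needs no rescaling. A minimal host-side sketch of the same mask computation (plain C++; keep_mask is a hypothetical name, for illustration only):

#include <cstddef>
#include <random>
#include <vector>

// Same math as the removed Bernoulli::operator(): draw u ~ U(0,1),
// keep the unit when u > p, and scale survivors by 1/(1-p) so that
// E[mask[i] * x[i]] == x[i].
std::vector<float> keep_mask(std::size_t n, float p, unsigned seed) {
  std::mt19937 gen(seed);
  std::uniform_real_distribution<float> dist(0.f, 1.f);
  std::vector<float> mask(n);
  for (auto& m : mask)
    m = (dist(gen) > p) ? 1.f / (1.f - p) : 0.f;
  return mask;
}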
@@ -1,52 +0,0 @@
-// This file is part of the Marian toolkit.
-// Marian is copyright (c) 2016 Marcin Junczys-Dowmunt.
-//
-// Permission is hereby granted, free of charge, to any person obtaining a copy
-// of this software and associated documentation files (the "Software"), to deal
-// in the Software without restriction, including without limitation the rights
-// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-// copies of the Software, and to permit persons to whom the Software is
-// furnished to do so, subject to the following conditions:
-//
-// The above copyright notice and this permission notice shall be included in
-// all copies or substantial portions of the Software.
-//
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-// SOFTWARE.
-
-#include <fstream>
-#include <boost/timer/timer.hpp>
-
-#include "marian.h"
-#include "mnist.h"
-#include "vocab.h"
-#include "tensor_operators.h"
-#include "curand.h"
-
-using namespace marian;
-using namespace keywords;
-
-int main(int argc, char** argv) {
-
-  Tensor a({1000, 1000}, 3);
-  Tensor b({1000, 1000});
-  Bernoulli dropout(0.2, b.shape());
-
-  auto f = [] __device__ (float& r,
-                          float a,
-                          float b) {
-    return r = a * b;
-  };
-
-  boost::timer::cpu_timer timer;
-  for(int i = 0; i < 1000; ++i)
-    Element(f, b, a, a);
-
-  std::cerr << timer.format(5, "%ws") << std::endl;
-  return 0;
-}
@@ -45,6 +45,7 @@ ExpressionGraph build_graph(const std::vector<int>& dims) {
     "scores");

+  auto cost = mean(cross_entropy(scores, y), axis=0);
   //auto cost = mean(-sum(y * logsoftmax(scores), axis=1), axis=0);
   auto costreg = named(
     cost, "cost"
   );
@@ -114,14 +114,6 @@ float Node::L2Norm(const std::vector<float> &vec) const
   return sqrt(ret);
 }

-std::vector<float> Node::StoreTensorInVec(Tensor tensor)
-{
-  size_t totSize = GetTotalSize(tensor.shape());
-  std::vector<float> vec(totSize);
-  thrust::copy(tensor.begin(), tensor.end(), vec.begin());
-  return vec;
-}
-
 void Node::broadcast(const std::vector<float> &largeVec, std::vector<float> &smallVec)
 {
   size_t largeSize = largeVec.size();
@@ -122,7 +122,6 @@ class Node : public Chainable<Tensor>,
     std::cerr << std::endl;
   }

-  std::vector<float> StoreTensorInVec(Tensor tensor);
   void calc_numeric_grad(
     Float delta,
     Tensor input,
@@ -17,10 +17,12 @@ struct BinaryNodeOp : public Node {

   cerr << "BinaryNodeOp::" << typeid(*this).name() << "::backward_debug()" << endl;

-  std::vector<float> preCalcGradA = StoreTensorInVec(a_->grad());
+  std::vector<float> preCalcGradA;
+  preCalcGradA << a_->grad();
   //output("preCalcGradA", preCalcGradA);

-  std::vector<float> preCalcGradB = StoreTensorInVec(b_->grad());
+  std::vector<float> preCalcGradB;
+  preCalcGradB << b_->grad();
   //output("preCalcGradB", preCalcGradB);

   // use df/dx to calc grad
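The hunk above (and the matching UnaryNodeOp hunk below) replaces StoreTensorInVec with streaming a Tensor into a std::vector via operator<<. That operator's definition is not part of this diff; a sketch consistent with the removed Node::StoreTensorInVec body (a device-to-host thrust copy) could look like the following, a hypothetical reconstruction rather than the tree's actual code:

#include <thrust/copy.h>
#include <vector>

// Assumes the tree's Tensor, Shape and GetTotalSize declarations.
std::vector<float>& operator<<(std::vector<float>& vec, Tensor tensor) {
  vec.resize(GetTotalSize(tensor.shape()));
  thrust::copy(tensor.begin(), tensor.end(), vec.begin());
  return vec;
}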
@@ -1,6 +1,5 @@
 #include "node.h"
 #include "tensor_operators.h"
-#include "dropout.h"

 namespace marian {

@@ -17,7 +16,8 @@ struct UnaryNodeOp : public Node {

   cerr << "UnaryNodeOp::" << typeid(*this).name() << "::backward_numeric()" << endl;

-  std::vector<float> preCalcGradA = StoreTensorInVec(a_->grad());
+  std::vector<float> preCalcGradA;
+  preCalcGradA << a_->grad();
   //output("preCalcGradA", preCalcGradA);

   // use df/dx to calc grad
@@ -108,40 +108,37 @@ struct ReLUNodeOp : public UnaryNodeOp {

 };

 // Scaling dropout
 struct DropoutNodeOp : public UnaryNodeOp {
   template <typename ...Args>
   DropoutNodeOp(Args ...args)
   : UnaryNodeOp(args...),
-    p_(Get<float>(keywords::value, 0.5)) {}
+    allocated_(false), p_(Get<float>(keywords::value, 0.5)) {}

   ~DropoutNodeOp() {
-    if(bernoulli)
-      bernoulli->FreeStates(states_);
-  }
+    if(allocated_)
+      CudnnDropoutDestroy(dropDesc_, space_, states_);
+  }

   void inference() {
     Element(_1 = _2, val_, a_->val());
   }

-  void forward() {
-    if(!bernoulli) {
-      bernoulli.reset(new Bernoulli(p_, val_.shape()));
-      bernoulli->InitStates(states_);
-    }
-
-    if(!mask_)
-      mask_.allocate(val_.shape());
-
-    auto f = [] __device__ (float& mask, float drop) {
-      return mask = drop;
-    };
-    Element(f, mask_, *bernoulli);
-    Element(_1 = _2 * _3, val_, mask_, a_->val());
+  void forward() {
+    if(!allocated_) {
+      CudnnDropoutPrepare(a_->val(), p_,
+                          &dropDesc_,
+                          &space_, &spaceSize_,
+                          &states_, (size_t)this); // seeding with pointer address
+      allocated_ = true;
+    }
+
+    CudnnDropoutForward(dropDesc_, space_, spaceSize_,
+                        val_, a_->val());
   }

-  void backward() {
-    Element(_1 += _2 * _3, a_->grad(), adj_, mask_);
+  void backward() {
+    CudnnDropoutBackward(dropDesc_, space_, spaceSize_,
+                         a_->grad(), adj_);
   }

   virtual std::string graphviz() {
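Note on the rewrite above: the removed forward() materialized an explicit mask_ tensor and backward() re-multiplied adj_ by it, while the cuDNN version keeps the mask implicitly in the opaque reserve buffer space_, which is why the identical space_/spaceSize_ pair must reach both CudnnDropoutForward and CudnnDropoutBackward. Seeding with (size_t)this gives every dropout node its own random stream at no bookkeeping cost.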
@@ -153,13 +150,14 @@ struct DropoutNodeOp : public UnaryNodeOp {
   };

  private:
+  bool allocated_;
   float p_;
-  curandState* states_;
-  std::shared_ptr<Bernoulli> bernoulli;
-  Tensor mask_;
+  void* states_;
+  void* space_;
+  size_t spaceSize_;
+  cudnnDropoutDescriptor_t dropDesc_;
 };


 struct SoftmaxNodeOp : public UnaryNodeOp {
   template <typename ...Args>
   SoftmaxNodeOp(Args ...args)
@@ -321,4 +319,3 @@ struct NegNodeOp : public UnaryNodeOp {


 }
-
@@ -210,7 +210,7 @@ __global__ void gSoftmaxGrad(float* grad, const float* adj, const float* val,
   if(j < rows) {
     extern __shared__ float _share[];
     float* _sum = _share + blockDim.x;
-
+
     float* gradRow = grad + j * cols;
     const float* adjRow = adj + j * cols;
     const float* valRow = val + j * cols;
@@ -263,7 +263,7 @@ __global__ void gLogSoftmaxGrad(float* grad, const float* adj, const float* val,
   if(j < rows) {
     extern __shared__ float _share[];
     float* _sum = _share + blockDim.x;
-
+
     float* gradRow = grad + j * cols;
     const float* adjRow = adj + j * cols;
     const float* valRow = val + j * cols;
@@ -271,7 +271,7 @@ __global__ void gLogSoftmaxGrad(float* grad, const float* adj, const float* val,
     for(int tid = 0; tid < cols; tid += blockDim.x) {
       int id = tid + threadIdx.x;
       if(id < cols) {
-        _sum[threadIdx.x] += adjRow[id];
+        _sum[threadIdx.x] += adjRow[id];
       }
     }
     __syncthreads();
@@ -348,22 +348,22 @@ Tensor Prod(cublasHandle_t handle, Tensor C, const Tensor A, const Tensor B,
   size_t k = A.shape()[1];
   if(transA)
     std::swap(m, k);
-
+
   size_t l = B.shape()[0];
   size_t n = B.shape()[1];
   if(transB)
     std::swap(l, n);
-
+
   size_t lda = A.shape()[1];
   size_t ldb = B.shape()[1];
   size_t ldc = B.shape()[1];
-
+
   if(transB)
     ldc = B.shape()[0];
-
+
   cublasOperation_t opA = transA ? CUBLAS_OP_T : CUBLAS_OP_N;
   cublasOperation_t opB = transB ? CUBLAS_OP_T : CUBLAS_OP_N;
-
+
   cublasSgemm(handle, opB, opA,
               n, m, k, &alpha, B.data(), ldb, A.data(), lda, &beta, C.data(), ldc);
   return C;
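Prod feeds row-major tensors to column-major cuBLAS via the identity (A*B)^T = B^T * A^T: a row-major buffer read as column-major is the transpose, so passing B before A with the dimensions swapped leaves C in row-major layout. A self-contained sketch of the non-transposed case (assumes a live cublasHandle_t and device pointers; error checking omitted):

#include <cublas_v2.h>

// Row-major C[m x n] = A[m x k] * B[k x n] on column-major cuBLAS:
// compute C^T = B^T * A^T by passing B first with swapped dimensions.
void gemm_row_major(cublasHandle_t handle,
                    const float* A, const float* B, float* C,
                    int m, int n, int k) {
  const float alpha = 1.f, beta = 0.f;
  cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
              n, m, k,
              &alpha,
              B, n,  // leading dimension of row-major B is its column count
              A, k,
              &beta,
              C, n);
}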
@@ -394,5 +394,57 @@ Tensor SumRowwise(const Tensor A, Tensor result) {
   return temp;
 }

+void CudnnDropoutPrepare(Tensor in, float p,
+                         cudnnDropoutDescriptor_t* dropDesc,
+                         void** space, size_t* spaceSize,
+                         void** states, size_t seed) {
+  size_t statesSize;
+  cudnnDropoutGetStatesSize(cudnnHandle, &statesSize);
+  cudnnDropoutGetReserveSpaceSize(in.cudnn(), spaceSize);
+
+  cudaMalloc((void**)states, statesSize);
+  cudaMalloc((void**)space, *spaceSize);
+
+  cudnnCreateDropoutDescriptor(dropDesc);
+  cudnnSetDropoutDescriptor(*dropDesc,
+                            cudnnHandle,
+                            p,
+                            (void*)*states,
+                            statesSize,
+                            seed);
+}
+
+void CudnnDropoutDestroy(cudnnDropoutDescriptor_t dropDesc,
+                         void* space, void* states) {
+  cudnnDestroyDropoutDescriptor(dropDesc);
+  cudaFree(space);
+  cudaFree(states);
+}
+
+void CudnnDropoutForward(cudnnDropoutDescriptor_t dropoutDesc,
+                         void* space, size_t spaceSize,
+                         Tensor out, Tensor in) {
+  cudnnDropoutForward(cudnnHandle,
+                      dropoutDesc,
+                      in.cudnn(),
+                      in.data(),
+                      out.cudnn(),
+                      out.data(),
+                      space,
+                      spaceSize);
+}
+
+void CudnnDropoutBackward(cudnnDropoutDescriptor_t dropoutDesc,
+                          void* space, size_t spaceSize,
+                          Tensor out, Tensor in) {
+  cudnnDropoutBackward(cudnnHandle,
+                       dropoutDesc,
+                       in.cudnn(),
+                       in.data(),
+                       out.cudnn(),
+                       out.data(),
+                       space,
+                       spaceSize);
+}
+
 }
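For reference, the lifecycle these helpers wrap is plain cuDNN: query the state and reserve-space sizes, allocate both, configure the descriptor once, then hand the same reserve buffer to the forward and backward passes. A minimal standalone sketch (assumes a valid cudnnHandle_t and a tensor descriptor xDesc describing all four buffers; error checking omitted):

#include <cuda_runtime.h>
#include <cudnn.h>

void dropout_roundtrip(cudnnHandle_t h, cudnnTensorDescriptor_t xDesc,
                       const float* x, float* y, float* dx, const float* dy,
                       float p, unsigned long long seed) {
  size_t statesSize, spaceSize;
  cudnnDropoutGetStatesSize(h, &statesSize);
  cudnnDropoutGetReserveSpaceSize(xDesc, &spaceSize);

  void *states, *space;
  cudaMalloc(&states, statesSize);
  cudaMalloc(&space, spaceSize);

  cudnnDropoutDescriptor_t desc;
  cudnnCreateDropoutDescriptor(&desc);
  cudnnSetDropoutDescriptor(desc, h, p, states, statesSize, seed);

  // The reserve space written by the forward pass encodes the dropout
  // mask and must reach the backward pass unchanged.
  cudnnDropoutForward(h, desc, xDesc, x, xDesc, y, space, spaceSize);
  cudnnDropoutBackward(h, desc, xDesc, dy, xDesc, dx, space, spaceSize);

  cudnnDestroyDropoutDescriptor(desc);
  cudaFree(space);
  cudaFree(states);
}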
@@ -51,7 +51,7 @@ void Element(Functor functor, T out) {

   int m = out.shape()[0];
   int n = out.shape()[1];
-
+
   int blocks = std::min(MAX_BLOCKS, m);
   int threads = std::min(MAX_THREADS, n);
   gElement<<<blocks, threads>>>(functor, out.gpu());
@@ -82,7 +82,7 @@ void Element(Functor functor,

   int m = out.shape()[0];
   int n = out.shape()[1];
-
+
   int blocks = std::min(MAX_BLOCKS, m);
   int threads = std::min(MAX_THREADS, n);
   gElement<<<blocks, threads>>>(functor, out.gpu(), in.gpu());
@@ -112,7 +112,7 @@ void Element(Functor functor,

   int m = out.shape()[0];
   int n = out.shape()[1];
-
+
   int blocks = std::min(MAX_BLOCKS, m);
   int threads = std::min(MAX_THREADS, n);
   gElement<<<blocks, threads>>>(functor, out.gpu(),
@@ -143,7 +143,7 @@ void Element(Functor functor,

   int m = out.shape()[0];
   int n = out.shape()[1];
-
+
   int blocks = std::min(MAX_BLOCKS, m);
   int threads = std::min(MAX_THREADS, n);
   gElement<<<blocks, threads>>>(functor, out.gpu(),
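All four Element overloads use the same launch shape: one block per row and one thread per column, each capped by MAX_BLOCKS/MAX_THREADS, with the kernel striding whenever the tensor outgrows the grid (the same pattern gSoftmaxGrad uses above). A reduced sketch of that kernel pattern (hypothetical names, single-tensor case; the functor must be device-callable):

template <class Functor>
__global__ void gElementSketch(Functor functor, float* out,
                               int rows, int cols) {
  // Stride over rows by gridDim.x and over columns by blockDim.x so a
  // capped launch still covers the whole matrix.
  for(int bid = 0; bid < rows; bid += gridDim.x) {
    int j = bid + blockIdx.x;
    if(j < rows) {
      float* rowPtr = out + j * cols;
      for(int tid = 0; tid < cols; tid += blockDim.x) {
        int i = tid + threadIdx.x;
        if(i < cols)
          rowPtr[i] = functor(rowPtr[i]);
      }
    }
  }
}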
@@ -180,4 +180,21 @@ Tensor SumRowwise(const Tensor A, Tensor result);

 void ScaleRowwise(Tensor Out, const Tensor ScalingFactors);

+void CudnnDropoutPrepare(Tensor in, float p,
+                         cudnnDropoutDescriptor_t* dropDesc,
+                         void** space, size_t* spaceSize,
+                         void** states, size_t seed);
+
+void CudnnDropoutDestroy(cudnnDropoutDescriptor_t dropDesc,
+                         void* space, void* states);
+
+void CudnnDropoutForward(cudnnDropoutDescriptor_t dropoutDesc,
+                         void* space, size_t spaceSize,
+                         Tensor out, Tensor in);
+
+void CudnnDropoutBackward(cudnnDropoutDescriptor_t dropoutDesc,
+                          void* space, size_t spaceSize,
+                          Tensor out, Tensor in);
+
+
 }
@@ -20,8 +20,8 @@ int main(int argc, char** argv)
   using namespace marian;
   using namespace keywords;

-  int input_size = 10;
-  int output_size = 10;
+  int input_size = 30;
+  int output_size = 30;
   int batch_size = 25;

   // define graph
@@ -57,8 +57,8 @@ int main(int argc, char** argv)
   std::cout << g.graphviz() << std::endl;

   // create data
-  //srand(0);
-  srand(time(NULL));
+  srand(0);
+  //srand(time(NULL));
   std::vector<float> values(batch_size * input_size);
   generate(begin(values), end(values), Rand);