mirror of
https://github.com/marian-nmt/marian.git
synced 2024-09-17 09:47:34 +03:00
working dropout node
This commit is contained in:
parent
7412e68dcd
commit
a057ff1776
@ -3,7 +3,7 @@ set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
|
||||
|
||||
project(marian CXX)
|
||||
SET(CMAKE_CXX_FLAGS " -std=c++11 -g -O3 -funroll-loops -Wno-unused-result -Wno-deprecated")
|
||||
LIST(APPEND CUDA_NVCC_FLAGS --default-stream per-thread; -std=c++11; -g; -O3; -arch=sm_35; -lineinfo; --use_fast_math; --expt-extended-lambda; -Xcompiler '-fPIC')
|
||||
LIST(APPEND CUDA_NVCC_FLAGS --default-stream per-thread; -std=c++11; -g; -O3; -arch=sm_35; -lineinfo; --use_fast_math; --expt-extended-lambda; --expt-relaxed-constexpr; -Xcompiler '-fPIC')
|
||||
add_definitions(-DCUDA_API_PER_THREAD_DEFAULT_STREAM)
|
||||
SET(CUDA_PROPAGATE_HOST_FLAGS OFF)
|
||||
|
||||
|
@ -50,15 +50,15 @@ from keras.optimizers import Adam, SGD
|
||||
|
||||
def baseline_model(pixels_count, classes_count):
|
||||
model = Sequential()
|
||||
# model.add(Dropout(0.2, input_shape=(pixels_count,)))
|
||||
model.add(Dense(2048, input_dim=pixels_count, init='uniform', activation='relu'))
|
||||
model.add(Dropout(0.2, input_shape=(pixels_count,)))
|
||||
model.add(Dense(2048, input_dim=pixels_count, init='uniform', activation='tanh'))
|
||||
# model.add(Dense(2048, init='uniform', activation='relu'))
|
||||
# model.add(Dropout(0.5))
|
||||
model.add(Dense(2048, init='uniform', activation='relu'))
|
||||
model.add(Dense(2048, init='uniform', activation='relu'))
|
||||
model.add(Dense(2048, init='uniform', activation='relu'))
|
||||
model.add(Dense(2048, init='uniform', activation='relu'))
|
||||
# model.add(Dropout(0.5))
|
||||
model.add(Dropout(0.5))
|
||||
# model.add(Dense(2048, init='uniform', activation='relu'))
|
||||
# model.add(Dense(2048, init='uniform', activation='relu'))
|
||||
# model.add(Dense(2048, init='uniform', activation='relu'))
|
||||
model.add(Dense(2048, init='uniform', activation='tanh'))
|
||||
model.add(Dropout(0.5))
|
||||
model.add(Dense(classes_count, init='uniform', activation='softmax'))
|
||||
|
||||
opt = Adam(lr=0.0002);
|
||||
@ -102,7 +102,7 @@ if __name__ == "__main__":
|
||||
# Fit the model
|
||||
|
||||
start = time.time();
|
||||
model.fit(X_train, y_train, nb_epoch=10, batch_size=200, verbose=2, shuffle=True)
|
||||
model.fit(X_train, y_train, nb_epoch=20, batch_size=200, verbose=2, shuffle=True)
|
||||
|
||||
print "Time elapsed", time.time() - start, "s"
|
||||
# Final evaluation of the model
|
||||
|
@ -8,6 +8,7 @@ cuda_add_library(marian_lib
|
||||
tensor.cu
|
||||
tensor_operators.cu
|
||||
expression_operators.cu
|
||||
dropout.cu
|
||||
vocab.cpp
|
||||
)
|
||||
|
||||
|
@ -32,6 +32,7 @@ template <class DataType>
|
||||
struct Chainable {
|
||||
Chainable() { }
|
||||
virtual ~Chainable() { }
|
||||
virtual void inference() { forward(); }
|
||||
virtual void forward() { }
|
||||
virtual void backward() { }
|
||||
virtual void backward_numeric(Float delta) { }
|
||||
|
@ -25,14 +25,45 @@
|
||||
#include <string>
|
||||
#include <functional>
|
||||
|
||||
#define SHAPE_SIZE 2
|
||||
|
||||
namespace marian {
|
||||
typedef float Float;
|
||||
typedef std::vector<int> Shape;
|
||||
const int whatevs{-1};
|
||||
|
||||
// POD for shape
|
||||
class Shape {
|
||||
private:
|
||||
int shape_[SHAPE_SIZE];
|
||||
|
||||
public:
|
||||
Shape() : shape_{1, 1} { }
|
||||
|
||||
Shape(std::initializer_list<int> il) {
|
||||
std::copy(il.begin(), il.end(), begin());
|
||||
}
|
||||
|
||||
int& operator[](int i) {
|
||||
return shape_[i];
|
||||
}
|
||||
|
||||
const int& operator[](int i) const {
|
||||
return shape_[i];
|
||||
}
|
||||
|
||||
size_t size() const {
|
||||
return SHAPE_SIZE;
|
||||
}
|
||||
|
||||
int* begin() { return shape_; }
|
||||
int* end() { return shape_ + SHAPE_SIZE; }
|
||||
|
||||
const int* begin() const { return shape_; }
|
||||
const int* end() const { return shape_+ SHAPE_SIZE; }
|
||||
};
|
||||
}
|
||||
|
||||
#include "keywords.h"
|
||||
// #include "tensor.h"
|
||||
|
||||
namespace marian {
|
||||
class Tensor;
|
||||
|
15
src/dropout.cu
Normal file
15
src/dropout.cu
Normal file
@ -0,0 +1,15 @@
|
||||
#include <curand.h>
|
||||
#include <curand_kernel.h>
|
||||
|
||||
#include "dropout.h"
|
||||
|
||||
namespace marian {
|
||||
|
||||
// Initialises one cuRAND state per launched thread. Each thread computes its
// flat index from block and thread ids and passes it to curand_init as the
// second argument, with 0 as the third. There is no bounds check, so `states`
// must hold at least gridDim.x * blockDim.x entries.
__global__ void gInitCurandStates(curandState* states, unsigned int seed) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  curandState* mine = states + idx;
  curand_init(seed, idx, 0, mine);
}
|
||||
|
||||
// Definition of the static seed shared by all Bernoulli instances, taken from
// time(0) during static initialisation.
unsigned Bernoulli::seed = time(0);
|
||||
|
||||
}
|
57
src/dropout.h
Normal file
57
src/dropout.h
Normal file
@ -0,0 +1,57 @@
|
||||
#pragma once
|
||||
|
||||
#include <curand.h>
|
||||
#include <curand_kernel.h>
|
||||
|
||||
#include "tensor_operators.h"
|
||||
|
||||
namespace marian {
|
||||
|
||||
__global__ void gInitCurandStates(curandState* states, unsigned int seed);
|
||||
|
||||
/**
 * Device-side generator of scaled dropout mask values.
 *
 * On the device, operator() draws one uniform sample per call and returns
 * 0 when the sample is <= p, or 1 / (1 - p) otherwise. The per-thread cuRAND
 * states live in a device buffer that this object allocates in InitStates()
 * and releases in FreeStates().
 *
 * The class has no destructor on purpose: objects are copied by value into
 * kernels, and every copy shares the same states_ pointer. Call FreeStates()
 * exactly once on the host-side original when the generator is retired.
 */
class Bernoulli {
  private:
    float p_;               // drop probability
    curandState* states_;   // device buffer of per-thread RNG states (or nullptr)
    static unsigned seed;   // shared seed, post-incremented by every InitStates()
    Shape shape_;

  public:
    Bernoulli(float p, const Shape& shape)
    : p_(p), states_(nullptr), shape_(shape) {}

    /**
     * Allocates and seeds the per-thread RNG states.
     *
     * The pointer argument is kept only for interface compatibility and is
     * ignored: it is passed by value, so it could never carry the allocation
     * back to the caller. The buffer is owned by this object instead.
     * Calling InitStates() again releases the previous buffer first.
     */
    void InitStates(curandState* /*states*/) {
      int blocks = std::min(MAX_BLOCKS, shape_[0]);
      int threads = std::min(MAX_THREADS, shape_[1]);
      int n = blocks * threads;

      // Avoid leaking an earlier allocation on re-initialisation.
      if(states_) {
        cudaFree(states_);
        states_ = nullptr;
      }

      // Nothing to allocate or launch for an empty shape.
      if(n <= 0)
        return;

      // On allocation failure leave states_ null instead of launching the
      // init kernel on an invalid pointer.
      if(cudaMalloc((void**) &states_, n * sizeof(curandState)) != cudaSuccess) {
        states_ = nullptr;
        return;
      }

      gInitCurandStates<<<blocks, threads>>>(states_, seed++);
      cudaStreamSynchronize(0);
    }

    /**
     * Releases the buffer allocated by InitStates().
     *
     * The pointer argument is ignored for the same reason as in InitStates():
     * callers never receive the real allocation, so whatever they pass here
     * is not the buffer that has to be freed.
     */
    void FreeStates(curandState* /*states*/) {
      cudaFree(states_);
      states_ = nullptr;
    }

    /**
     * Returns the mask value for the calling thread.
     *
     * The (i, j) coordinates are unused; the RNG state is selected by the
     * thread's flat launch index.
     * NOTE(review): this assumes the consuming kernel is launched with at most
     * the blocks * threads configuration computed in InitStates() - confirm
     * for every call site.
     */
    __device__ float operator()(int i, int j) const {
      int tid = threadIdx.x + blockIdx.x * blockDim.x;
      float dist = curand_uniform(&states_[tid]);
      float zeroOne = dist > p_;
      return zeroOne / (1 - p_);
    }

    __device__ int rows() const {
      return shape_[0];
    }

    __device__ int cols() const {
      return shape_[1];
    }

    /** Returns this object itself; callers copy it by value into a kernel. */
    Bernoulli& gpu() {
      return *this;
    }
};
|
||||
|
||||
}
|
@ -68,6 +68,15 @@ class ExpressionGraph {
|
||||
|
||||
/** @brief Constructs a new expression graph */
|
||||
ExpressionGraph() : stack_(new ChainableStack) {}
|
||||
|
||||
void inference(int batchSize) {
|
||||
for(auto&& v : *stack_) {
|
||||
v->allocate(batchSize);
|
||||
}
|
||||
for(auto&& v : *stack_)
|
||||
v->inference();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @brief Performs backpropagation on this expression graph.
|
||||
|
@ -42,10 +42,6 @@ Expr relu(Expr a) {
|
||||
return Expr(a.graph(), new ReLUNodeOp(a));
|
||||
}
|
||||
|
||||
Expr dropout(Expr a) {
|
||||
return Expr(a.graph(), new DropoutNodeOp(a));
|
||||
}
|
||||
|
||||
Expr log(Expr a) {
|
||||
return Expr(a.graph(), new LogNodeOp(a));
|
||||
};
|
||||
|
@ -33,7 +33,10 @@ Expr tanh(Expr a);
|
||||
|
||||
Expr relu(Expr a);
|
||||
|
||||
Expr dropout(Expr a);
|
||||
template <typename ...Args>
|
||||
Expr dropout(Expr a, Args ...args) {
|
||||
return Expr(a.graph(), new DropoutNodeOp(a, args...));
|
||||
}
|
||||
|
||||
Expr log(Expr a);
|
||||
|
||||
|
@ -6,7 +6,6 @@
|
||||
|
||||
#include "marian.h"
|
||||
#include "mnist.h"
|
||||
#include "npz_converter.h"
|
||||
#include "optimizers.h"
|
||||
|
||||
using namespace marian;
|
||||
@ -30,11 +29,11 @@ ExpressionGraph build_graph(const std::vector<int>& dims) {
|
||||
int out = dims[i+1];
|
||||
|
||||
if(i == 0) {
|
||||
layers.emplace_back(x);
|
||||
layers.emplace_back(dropout(x, value=0.2));
|
||||
}
|
||||
else {
|
||||
layers.emplace_back(reluplus(dot(layers.back(), weights.back()), biases.back()));
|
||||
//layers.emplace_back(relu(dot(layers.back(), weights.back()) + biases.back()));
|
||||
//layers.emplace_back(reluplus(dot(layers.back(), weights.back()), biases.back()));
|
||||
layers.emplace_back(dropout(relu(dot(layers.back(), weights.back()) + biases.back()), value=0.5));
|
||||
}
|
||||
|
||||
weights.emplace_back(
|
||||
@ -115,8 +114,8 @@ int main(int argc, char** argv) {
|
||||
std::vector<float> testLabels = datasets::mnist::ReadLabels("../examples/mnist/t10k-labels-idx1-ubyte", testRows, LABEL_SIZE);
|
||||
std::cerr << "Done." << std::endl;
|
||||
|
||||
ExpressionGraph g = build_graph({IMAGE_SIZE, 2048, 2048, 2048, 2048, 2048, LABEL_SIZE});
|
||||
std::cout << g.graphviz() << std::endl;
|
||||
ExpressionGraph g = build_graph({IMAGE_SIZE, 2048, 2048, LABEL_SIZE});
|
||||
//std::cout << g.graphviz() << std::endl;
|
||||
|
||||
Tensor xt({BATCH_SIZE, IMAGE_SIZE});
|
||||
Tensor yt({BATCH_SIZE, LABEL_SIZE});
|
||||
@ -167,7 +166,7 @@ int main(int argc, char** argv) {
|
||||
g["x"] = xt;
|
||||
g["y"] = yt;
|
||||
|
||||
g.forward(BATCH_SIZE);
|
||||
g.inference(BATCH_SIZE);
|
||||
|
||||
std::vector<float> bResults;
|
||||
bResults << g["scores"].val();
|
||||
|
@ -1,5 +1,6 @@
|
||||
#include "node.h"
|
||||
#include "tensor_operators.h"
|
||||
#include "dropout.h"
|
||||
|
||||
namespace marian {
|
||||
|
||||
@ -107,22 +108,40 @@ struct ReLUNodeOp : public UnaryNodeOp {
|
||||
|
||||
};
|
||||
|
||||
// @TODO: slow and probably buggy
|
||||
// Scaling dropout
|
||||
struct DropoutNodeOp : public UnaryNodeOp {
|
||||
template <typename ...Args>
|
||||
DropoutNodeOp(Args ...args)
|
||||
: UnaryNodeOp(args...),
|
||||
p_(0.5), seed_(time(0)) { }
|
||||
p_(Get<float>(keywords::value, 0.5)) {}
|
||||
|
||||
void forward() {
|
||||
//Element(_1 = Bernoulli(p_, (size_t)this) * _2,
|
||||
// val_, a_->val())
|
||||
Dropout(val_, a_->val(), p_, seed_++);
|
||||
~DropoutNodeOp() {
|
||||
if(bernoulli)
|
||||
bernoulli->FreeStates(states_);
|
||||
}
|
||||
|
||||
void backward() {
|
||||
Element(_1 += _2 * (_3 != 0.0f), // transform non-zero to 1
|
||||
a_->grad(), adj_, val_);
|
||||
|
||||
void inference() {
|
||||
Element(_1 = _2, val_, a_->val());
|
||||
}
|
||||
|
||||
void forward() {
|
||||
if(!bernoulli) {
|
||||
bernoulli.reset(new Bernoulli(p_, val_.shape()));
|
||||
bernoulli->InitStates(states_);
|
||||
}
|
||||
|
||||
if(!mask_)
|
||||
mask_.allocate(val_.shape());
|
||||
|
||||
auto f = [] __device__ (float& mask, float drop) {
|
||||
return mask = drop;
|
||||
};
|
||||
Element(f, mask_, *bernoulli);
|
||||
Element(_1 = _2 * _3, val_, mask_, a_->val());
|
||||
}
|
||||
|
||||
void backward() {
|
||||
Element(_1 += _2 * _3, a_->grad(), adj_, mask_);
|
||||
}
|
||||
|
||||
virtual std::string graphviz() {
|
||||
@ -135,7 +154,9 @@ struct DropoutNodeOp : public UnaryNodeOp {
|
||||
|
||||
private:
|
||||
float p_;
|
||||
int seed_;
|
||||
curandState* states_;
|
||||
std::shared_ptr<Bernoulli> bernoulli;
|
||||
Tensor mask_;
|
||||
};
|
||||
|
||||
|
||||
|
@ -165,9 +165,7 @@ class NpzConverter {
|
||||
data.resize(np.size());
|
||||
std::copy(np.data(), np.data() + np.size(), data.begin());
|
||||
|
||||
shape.clear();
|
||||
shape.push_back(np.size1());
|
||||
shape.push_back(np.size2());
|
||||
shape = { (int)np.size1(), (int)np.size2() };
|
||||
|
||||
}
|
||||
else {
|
||||
|
41
src/tensor.h
41
src/tensor.h
@ -157,6 +157,7 @@ class TensorImpl {
|
||||
*
|
||||
* @return Shape of Tensor
|
||||
*/
|
||||
__host__ __device__
|
||||
const Shape& shape() const {
|
||||
return shape_;
|
||||
}
|
||||
@ -269,6 +270,7 @@ class Tensor {
|
||||
Tensor() {}
|
||||
|
||||
/**
|
||||
|
||||
* @brief Constructor that allocates memory.
|
||||
*
|
||||
* @param shape Shape of Tensor.
|
||||
@ -331,7 +333,7 @@ class Tensor {
|
||||
const value_type* data() const {
|
||||
return pimpl_->data();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @brief Get begin iterator of GPU Tensor's vector.
|
||||
*
|
||||
@ -373,6 +375,7 @@ class Tensor {
|
||||
*
|
||||
* @return Tensor's shape.
|
||||
*/
|
||||
__host__ __device__
|
||||
const Shape& shape() const {
|
||||
return pimpl_->shape();
|
||||
}
|
||||
@ -436,7 +439,8 @@ class Tensor {
|
||||
void set(const std::vector<float>::const_iterator &begin, const std::vector<float>::const_iterator &end);
|
||||
|
||||
void incr(Float incr) {
|
||||
pimpl_->incr(incr);
|
||||
pimpl_->incr(incr)
|
||||
;
|
||||
}
|
||||
|
||||
/**
|
||||
@ -457,6 +461,39 @@ class Tensor {
|
||||
vout.resize(size());
|
||||
pimpl_->get(vout.begin());
|
||||
}
|
||||
|
||||
/**
 * Small by-value view of a Tensor: its data pointer plus the two shape
 * entries, captured on the host at construction.
 */
class TensorView {
  private:
    float* data_;
    int rows_;
    int cols_;

  public:
    TensorView(Tensor t)
    : data_(t.data()), rows_(t.shape()[0]), cols_(t.shape()[1]) {}

    /**
     * Element access. A dimension of size 1 ignores its index, so a
     * single-row or single-column view returns the same row or column for
     * every index along that dimension, and a 1x1 view always returns
     * element 0.
     */
    __device__ float& operator()(int i, int j) {
      const int r = (rows_ == 1) ? 0 : i;
      const int c = (cols_ == 1) ? 0 : j;
      return data_[r * cols_ + c];
    }

    __device__ int rows() {
      return rows_;
    }

    __device__ int cols() {
      return cols_;
    }
};
|
||||
|
||||
/** Returns a TensorView constructed from this tensor. */
TensorView gpu() {
  TensorView view(*this);
  return view;
}
|
||||
};
|
||||
|
||||
/**
|
||||
|
@ -19,12 +19,8 @@
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
#include <curand_kernel.h>
|
||||
|
||||
#include "tensor_operators.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
namespace marian {
|
||||
|
||||
// @TODO: handle this better, maybe per thread?
|
||||
@ -35,48 +31,6 @@ static cublasHandle_t create_handle() {
|
||||
}
|
||||
cublasHandle_t cublasHandle = create_handle();
|
||||
|
||||
__global__ void gDropout(float* out, const float* in,
|
||||
int seed, const float p, int rows, int cols) {
|
||||
|
||||
int shift = blockIdx.x * cols + threadIdx.x;
|
||||
curandState state;
|
||||
curand_init(seed, shift, 0, &state);
|
||||
for(int bid = 0; bid < rows; bid += gridDim.x) {
|
||||
int j = bid + blockIdx.x;
|
||||
if(j < rows) {
|
||||
Float* rowOut = out + j * cols;
|
||||
const Float* rowIn = in + j * cols;
|
||||
|
||||
for(int tid = 0; tid < cols; tid += blockDim.x) {
|
||||
int i = tid + threadIdx.x;
|
||||
if(i < cols) {
|
||||
//int offset = i;
|
||||
float dropout = (curand_uniform(&state) >= p);
|
||||
rowOut[i] = dropout * rowIn[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Slow!!!
|
||||
void Dropout(Tensor out, Tensor in, float p, int seed) {
|
||||
int m = in.shape()[0];
|
||||
int n = in.shape()[1];
|
||||
|
||||
curandGenerator_t prng;
|
||||
curandCreateGenerator(&prng, CURAND_RNG_PSEUDO_XORWOW);
|
||||
curandSetPseudoRandomGeneratorSeed(prng, (unsigned long long) seed);
|
||||
curandGenerateUniform(prng, out.data(), m * n);
|
||||
Element(_1 = (_1 > p), out);
|
||||
Element(_1 = _1 * _2, out, in);
|
||||
//int blocks = std::min(MAX_BLOCKS, m);
|
||||
//int threads = std::min(MAX_THREADS, k);
|
||||
//gDropout<<<blocks, threads>>>(out.data(), in.data(), seed, p, m, k);
|
||||
//cudaStreamSynchronize(0);
|
||||
}
|
||||
|
||||
|
||||
__global__ void gSoftmaxGrad(float* grad, const float* adj, const float* val,
|
||||
const int rows, const int cols) {
|
||||
for(int bid = 0; bid < rows; bid += gridDim.x) {
|
||||
|
@ -29,54 +29,9 @@ using namespace thrust::placeholders;
|
||||
#define MAX_THREADS 512
|
||||
#define MAX_BLOCKS 65535
|
||||
|
||||
class TensorView {
|
||||
private:
|
||||
float* data_;
|
||||
int rows_;
|
||||
int cols_;
|
||||
|
||||
public:
|
||||
TensorView(Tensor t)
|
||||
: data_(t.data()), rows_(t.shape()[0]), cols_(t.shape()[1]) {}
|
||||
|
||||
__device__ float& operator()(int i, int j) {
|
||||
if(rows_ != 1 && cols_ != 1)
|
||||
return data_[i * cols_ + j];
|
||||
if(rows_ != 1 && cols_ == 1)
|
||||
return data_[i];
|
||||
if(rows_ == 1 && cols_ != 1)
|
||||
return data_[j];
|
||||
return data_[0];
|
||||
}
|
||||
|
||||
__device__ int rows() {
|
||||
return rows_;
|
||||
}
|
||||
|
||||
__device__ int cols() {
|
||||
return cols_;
|
||||
}
|
||||
};
|
||||
|
||||
//template <class Functor>
|
||||
//__global__ void gElement(Functor functor) {
|
||||
// int rows = out.rows();
|
||||
// int cols = out.cols();
|
||||
// for(int bid = 0; bid < rows; bid += gridDim.x) {
|
||||
// int i = bid + blockIdx.x;
|
||||
// if(i < rows) {
|
||||
// for(int tid = 0; tid < cols; tid += blockDim.x) {
|
||||
// int j = tid + threadIdx.x;
|
||||
// if(j < cols)
|
||||
// functor(i, j);
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
//}
|
||||
|
||||
template <class Functor>
|
||||
template <class Functor, class T>
|
||||
__global__ void gElement(Functor functor,
|
||||
TensorView out) {
|
||||
T out) {
|
||||
int rows = out.rows();
|
||||
int cols = out.cols();
|
||||
for(int bid = 0; bid < rows; bid += gridDim.x) {
|
||||
@ -91,23 +46,22 @@ __global__ void gElement(Functor functor,
|
||||
}
|
||||
}
|
||||
|
||||
template <class Functor>
|
||||
void Element(Functor functor,
|
||||
Tensor out) {
|
||||
template <class Functor, class T>
|
||||
void Element(Functor functor, T out) {
|
||||
|
||||
int m = out.shape()[0];
|
||||
int n = out.shape()[1];
|
||||
|
||||
int blocks = std::min(MAX_BLOCKS, m);
|
||||
int threads = std::min(MAX_THREADS, n);
|
||||
gElement<<<blocks, threads>>>(functor, TensorView(out));
|
||||
gElement<<<blocks, threads>>>(functor, out.gpu());
|
||||
cudaStreamSynchronize(0);
|
||||
}
|
||||
|
||||
|
||||
template <class Functor>
|
||||
template <class Functor, class T1, class T2>
|
||||
__global__ void gElement(Functor functor,
|
||||
TensorView out, TensorView in) {
|
||||
T1 out, T2 in) {
|
||||
int rows = out.rows();
|
||||
int cols = out.cols();
|
||||
for(int bid = 0; bid < rows; bid += gridDim.x) {
|
||||
@ -122,22 +76,22 @@ __global__ void gElement(Functor functor,
|
||||
}
|
||||
}
|
||||
|
||||
template <class Functor>
|
||||
template <class Functor, class T1, class T2>
|
||||
void Element(Functor functor,
|
||||
Tensor out, Tensor in) {
|
||||
T1 out, T2 in) {
|
||||
|
||||
int m = out.shape()[0];
|
||||
int n = out.shape()[1];
|
||||
|
||||
int blocks = std::min(MAX_BLOCKS, m);
|
||||
int threads = std::min(MAX_THREADS, n);
|
||||
gElement<<<blocks, threads>>>(functor, TensorView(out), TensorView(in));
|
||||
gElement<<<blocks, threads>>>(functor, out.gpu(), in.gpu());
|
||||
cudaStreamSynchronize(0);
|
||||
}
|
||||
|
||||
template <class Functor>
|
||||
template <class Functor, class T1, class T2, class T3>
|
||||
__global__ void gElement(Functor functor,
|
||||
TensorView out, TensorView in1, TensorView in2) {
|
||||
T1 out, T2 in1, T3 in2) {
|
||||
int rows = out.rows();
|
||||
int cols = out.cols();
|
||||
for(int bid = 0; bid < rows; bid += gridDim.x) {
|
||||
@ -152,23 +106,23 @@ __global__ void gElement(Functor functor,
|
||||
}
|
||||
}
|
||||
|
||||
template <class Functor>
|
||||
template <class Functor, class T1, class T2, class T3>
|
||||
void Element(Functor functor,
|
||||
Tensor out, Tensor in1, Tensor in2) {
|
||||
T1 out, T2 in1, T3 in2) {
|
||||
|
||||
int m = out.shape()[0];
|
||||
int n = out.shape()[1];
|
||||
|
||||
int blocks = std::min(MAX_BLOCKS, m);
|
||||
int threads = std::min(MAX_THREADS, n);
|
||||
gElement<<<blocks, threads>>>(functor, TensorView(out),
|
||||
TensorView(in1), TensorView(in2));
|
||||
gElement<<<blocks, threads>>>(functor, out.gpu(),
|
||||
in1.gpu(), in2.gpu());
|
||||
cudaStreamSynchronize(0);
|
||||
}
|
||||
|
||||
template <class Functor>
|
||||
template <class Functor, class T1, class T2, class T3, class T4>
|
||||
__global__ void gElement(Functor functor,
|
||||
TensorView out, TensorView in1, TensorView in2, TensorView in3) {
|
||||
T1 out, T2 in1, T3 in2, T4 in3) {
|
||||
int rows = out.rows();
|
||||
int cols = out.cols();
|
||||
for(int bid = 0; bid < rows; bid += gridDim.x) {
|
||||
@ -183,22 +137,20 @@ __global__ void gElement(Functor functor,
|
||||
}
|
||||
}
|
||||
|
||||
template <class Functor>
|
||||
void Element(Functor functor, Tensor out,
|
||||
Tensor in1, Tensor in2, Tensor in3) {
|
||||
template <class Functor, class T1, class T2, class T3, class T4>
|
||||
void Element(Functor functor,
|
||||
T1 out, T2 in1, T3 in2, T4 in3) {
|
||||
|
||||
int m = out.shape()[0];
|
||||
int n = out.shape()[1];
|
||||
|
||||
int blocks = std::min(MAX_BLOCKS, m);
|
||||
int threads = std::min(MAX_THREADS, n);
|
||||
gElement<<<blocks, threads>>>(functor, TensorView(out),
|
||||
TensorView(in1), TensorView(in2), TensorView(in3));
|
||||
gElement<<<blocks, threads>>>(functor, out.gpu(),
|
||||
in1.gpu(), in2.gpu(), in3.gpu());
|
||||
cudaStreamSynchronize(0);
|
||||
}
|
||||
|
||||
void Dropout(Tensor Out, Tensor in, float p, int seed);
|
||||
|
||||
void SubtractMax(Tensor* Out);
|
||||
|
||||
void Softmax(Tensor* Out);
|
||||
|
67
src/test.cu
67
src/test.cu
@ -26,62 +26,27 @@
|
||||
#include "mnist.h"
|
||||
#include "vocab.h"
|
||||
#include "tensor_operators.h"
|
||||
#include "curand.h"
|
||||
|
||||
using namespace marian;
|
||||
using namespace keywords;
|
||||
|
||||
template <class Functor>
|
||||
__global__ void tgElement(Functor functor, TensorView t, int rows, int cols) {
|
||||
for(int bid = 0; bid < rows; bid += gridDim.x) {
|
||||
int i = bid + blockIdx.x;
|
||||
if(i < rows) {
|
||||
for(int tid = 0; tid < cols; tid += blockDim.x) {
|
||||
int j = tid + threadIdx.x;
|
||||
if(j < cols)
|
||||
t(i, j) = functor(i, j);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <class Functor>
|
||||
void tElement(Functor functor, Tensor t) {
|
||||
|
||||
|
||||
int m = t.shape()[0];
|
||||
int n = t.shape()[1];
|
||||
|
||||
int blocks = std::min(MAX_BLOCKS, m);
|
||||
int threads = std::min(MAX_THREADS, n);
|
||||
tgElement<<<blocks, threads>>>(functor, TensorView(t), m, n);
|
||||
cudaStreamSynchronize(0);
|
||||
}
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
ExpressionGraph g;
|
||||
|
||||
//Tensor a({1000, 1000}, 3);
|
||||
//Tensor b({1, 1}, 2);
|
||||
//
|
||||
//TensorView ta(a);
|
||||
//TensorView tb(b);
|
||||
//
|
||||
//boost::timer::cpu_timer timer;
|
||||
//
|
||||
//
|
||||
//auto f = _1 + _2;
|
||||
//auto pp1 = [=] __device__ (int i, int j) mutable -> float {
|
||||
// return f(ta(i, j), tb(i, j));
|
||||
//};
|
||||
//
|
||||
//auto pp2 = [=] __device__ (int i, int j) mutable -> float {
|
||||
// return f(pp1(i, j), tb(i, j));
|
||||
//};
|
||||
//
|
||||
//for(int i = 0; i < 1000; ++i)
|
||||
// tElement(pp2, a);
|
||||
|
||||
|
||||
Tensor a({1000, 1000}, 3);
|
||||
Tensor b({1000, 1000});
|
||||
Bernoulli dropout(0.2, b.shape());
|
||||
|
||||
auto f = [] __device__ (float& r,
|
||||
float a,
|
||||
float b) {
|
||||
return r = a * b;
|
||||
};
|
||||
|
||||
// std::cerr << timer.format(5, "%ws") << std::endl;
|
||||
boost::timer::cpu_timer timer;
|
||||
for(int i = 0; i < 1000; ++i)
|
||||
Element(f, b, a, a);
|
||||
|
||||
std::cerr << timer.format(5, "%ws") << std::endl;
|
||||
return 0;
|
||||
}
|
||||
|
@ -138,7 +138,6 @@ namespace thrust
|
||||
ReLUback(const actor<Eval> &_1) {
|
||||
return compose(unary_operator<unary_reluback>(), _1);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user