Autoformat .cu files

Roman Grundkiewicz 2017-06-05 13:10:28 +02:00
parent d2c9e9fc74
commit bcb3851fcc
22 changed files with 1027 additions and 891 deletions

View File

@@ -1,3 +1,3 @@
#!/bin/bash
find ./src -path ./src/3rd_party -prune -o -iname *.h -o -iname *.cpp | xargs clang-format-3.8 -i
find ./src -path ./src/3rd_party -prune -o -iname *.h -o -iname *.cpp -o -iname *.cu | xargs clang-format-3.8 -i
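
A note on the script above: the glob patterns are unquoted, so they reach find intact only while no matching files sit in the shell's working directory, and the expression relies on find's implicit -print, which also emits the pruned ./src/3rd_party path itself. A more defensive variant of the same one-liner (the quoting, grouping, and explicit -print0 are an assumption about the intended behaviour, not part of this commit):

#!/bin/bash
# Quote the patterns so the shell cannot expand them, group the -iname
# alternatives explicitly, and print only files that survive the prune;
# -print0 with xargs -0 keeps paths containing spaces intact.
find ./src -path ./src/3rd_party -prune -o \
    \( -iname '*.h' -o -iname '*.cpp' -o -iname '*.cu' \) -print0 \
  | xargs -0 clang-format-3.8 -i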

View File

@@ -6,7 +6,7 @@
int main(int argc, char** argv) {
using namespace marian;
auto options = New<Config>(argc, argv);;
auto options = New<Config>(argc, argv);
auto devices = options->get<std::vector<size_t>>("devices");
if(devices.size() > 1)

View File

@@ -8,34 +8,30 @@
#include "marian.h"
#include "examples/mnist/training.h"
#include "examples/mnist/model.h"
#include "examples/mnist/training.h"
#include "training/graph_group.h"
const std::vector<std::string> TRAIN_SET = {
"../src/examples/mnist/train-images-idx3-ubyte",
"../src/examples/mnist/train-labels-idx1-ubyte"
};
const std::vector<std::string> VALID_SET = {
"../src/examples/mnist/t10k-images-idx3-ubyte",
"../src/examples/mnist/t10k-labels-idx1-ubyte"
};
const std::vector<std::string> TRAIN_SET
= {"../src/examples/mnist/train-images-idx3-ubyte",
"../src/examples/mnist/train-labels-idx1-ubyte"};
const std::vector<std::string> VALID_SET
= {"../src/examples/mnist/t10k-images-idx3-ubyte",
"../src/examples/mnist/t10k-labels-idx1-ubyte"};
using namespace marian;
int main(int argc, char** argv) {
auto options = New<Config>(argc, argv, false);
if (!options->has("train-sets"))
if(!options->has("train-sets"))
options->set("train-sets", TRAIN_SET);
if (!options->has("valid-sets"))
if(!options->has("valid-sets"))
options->set("valid-sets", VALID_SET);
auto devices = options->get<std::vector<size_t>>("devices");
if (devices.size() > 1)
if(devices.size() > 1)
New<TrainMNIST<AsyncGraphGroup<models::MNISTModel>>>(options)->run();
else
New<TrainMNIST<Singleton<models::MNISTModel>>>(options)->run();

View File

@@ -1,32 +1,35 @@
#include <sstream>
#include "expression_graph.h"
#include "backend_gpu.h"
#include "expression_graph.h"
#include "kernels/dropout.h"
namespace marian {
ExpressionGraph::ExpressionGraph(bool inference)
: inferenceOnly_(inference),
backend_(New<BackendGPU>()) {}
ExpressionGraph::ExpressionGraph(bool inference)
: inferenceOnly_(inference), backend_(New<BackendGPU>()) {}
void ExpressionGraph::setDevice(size_t device) {
device_ = device;
void ExpressionGraph::setDevice(size_t device) {
device_ = device;
params_ = New<Parameters>();
params_->init(device_);
params_ = New<Parameters>();
params_->init(device_);
tensors_ = New<TensorAllocator>(device);
std::static_pointer_cast<BackendGPU>(backend_)->setHandles(device, Config::seed);
}
Expr ExpressionGraph::dropout(float prob, Shape shape) {
auto dropoutInit = [prob, this](Tensor t) {
Dropout(t, prob, std::static_pointer_cast<BackendGPU>(backend_)->getCurandGenerator());
};
tensors_ = New<TensorAllocator>(device);
return Expression<ConstantNode>(shared_from_this(),
keywords::init=dropoutInit,
keywords::shape=shape);
}
std::static_pointer_cast<BackendGPU>(backend_)->setHandles(device,
Config::seed);
}
Expr ExpressionGraph::dropout(float prob, Shape shape) {
auto dropoutInit = [prob, this](Tensor t) {
Dropout(
t,
prob,
std::static_pointer_cast<BackendGPU>(backend_)->getCurandGenerator());
};
return Expression<ConstantNode>(shared_from_this(),
keywords::init = dropoutInit,
keywords::shape = shape);
}
}
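
For context, a minimal sketch of how the dropout constant built above is typically consumed; the surrounding graph code is an illustration under assumed names, not part of this commit. dropout() returns a ConstantNode whose tensor is lazily filled with a Bernoulli mask already rescaled by the keep probability (see gScale in kernels/dropout.cu further down), so applying it is a plain elementwise product:

auto graph = New<ExpressionGraph>();
graph->setDevice(0);
graph->reserveWorkspaceMB(128);
auto x = graph->param("x", {64, 512}, keywords::init = inits::glorot_uniform);
// Mask of (uniform < 0.9) / 0.9 values: inverted dropout with prob = 0.1.
auto mask = graph->dropout(0.1f, x->shape());
auto xDrop = x * mask;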

View File

@@ -1,9 +1,9 @@
#include "graph/expression_operators.h"
#include "kernels/sparse.h"
#include "graph/node_operators_unary.h"
#include "graph/node_operators_binary.h"
#include "graph/node_operators.h"
#include "graph/node_operators_binary.h"
#include "graph/node_operators_unary.h"
namespace marian {
@@ -85,15 +85,15 @@ Expr operator-(float a, Expr b) {
}
Expr operator*(float a, Expr b) {
return Expression<ScalarMultNodeOp>(b, a);
return Expression<ScalarMultNodeOp>(b, a);
}
Expr operator*(Expr a, float b) {
return Expression<ScalarMultNodeOp>(a, b);
return Expression<ScalarMultNodeOp>(a, b);
}
Expr operator/(Expr a, float b) {
return Expression<ScalarMultNodeOp>(a, 1.f / b);
return Expression<ScalarMultNodeOp>(a, 1.f / b);
}
/*********************************************************/
@@ -111,7 +111,6 @@ Expr flatten(Expr a) {
return Expression<ReshapeNodeOp>(a, shape);
}
Expr sum(Expr a, keywords::axis_k ax) {
return Expression<SumNodeOp>(a, ax);
}
@@ -130,7 +129,6 @@ Expr weighted_average(Expr in, Expr weights, keywords::axis_k ax) {
return p / s;
}
Expr dot(Expr a, Expr b) {
return Expression<DotNodeOp>(a, b);
}
@@ -187,7 +185,7 @@ Expr layer_norm(Expr x, Expr gamma, Expr beta) {
return Expression<LayerNormalizationOp>(nodes);
}
//Expr batch_norm(Expr x, Expr gamma, Expr beta) {
// Expr batch_norm(Expr x, Expr gamma, Expr beta) {
// auto mju = mean(x, keywords::axis=0);
// auto xmmju = x - mju;
// auto std = sqrt(mean(square(xmmju), keywords::axis=0), 1e-9);
@@ -205,5 +203,4 @@ Expr shift(Expr a, Shape shift) {
Expr lexical_bias(Expr logits, Expr att, float eps, Ptr<sparse::CSR> lf) {
return Expression<LexicalProbNodeOp>(logits, att, eps, lf);
}
}

View File

@@ -1,6 +1,6 @@
#include "graph/backend_gpu.h"
#include "graph/expression_graph.h"
#include "graph/node.h"
#include "graph/backend_gpu.h"
namespace marian {
@@ -48,5 +48,4 @@ void NaryNodeOp::remove_children_from_top_nodes() {
for(auto child : children_)
graph()->remove_top_node(child);
}
}

View File

@@ -1,41 +1,40 @@
#include "node_operators.h"
#include "expression_graph.h"
#include "node_operators.h"
namespace marian {
size_t ConstantNode::allocate() {
// @TODO params
size_t elements = 0;
if(!val_) {
graph()->tensor(val_, shape_);
elements = val_->shape().elements();
}
return elements;
size_t ConstantNode::allocate() {
// @TODO params
size_t elements = 0;
if(!val_) {
graph()->tensor(val_, shape_);
elements = val_->shape().elements();
}
void ConstantNode::init() {
if(!initialized_) {
init_(val_);
initialized_ = true;
}
}
size_t ParamNode::allocate() {
// @TODO params
size_t elements = 0;
if(!val_) {
graph()->tensor(val_, shape_);
elements = val_->shape().elements();
}
return elements;
}
void ParamNode::init() {
if(!initialized_) {
//std::cerr << "Initializing parameter " << name() << std::endl;
init_(val_);
initialized_ = true;
}
}
return elements;
}
void ConstantNode::init() {
if(!initialized_) {
init_(val_);
initialized_ = true;
}
}
size_t ParamNode::allocate() {
// @TODO params
size_t elements = 0;
if(!val_) {
graph()->tensor(val_, shape_);
elements = val_->shape().elements();
}
return elements;
}
void ParamNode::init() {
if(!initialized_) {
// std::cerr << "Initializing parameter " << name() << std::endl;
init_(val_);
initialized_ = true;
}
}
}

View File

@@ -3,30 +3,34 @@
#include "kernels/dropout.h"
#define CUDA_CALL(x) \
do { \
if((x) != cudaSuccess) { \
printf("Error at %s:%d\n", __FILE__, __LINE__); \
exit(1); \
} \
} while(0)
#define CUDA_CALL(x) do { if((x)!=cudaSuccess) { \
printf("Error at %s:%d\n",__FILE__,__LINE__);\
exit(1);}} while(0)
#define CURAND_CALL(x) do { if((x)!=CURAND_STATUS_SUCCESS) { \
printf("Error at %s:%d\n",__FILE__,__LINE__);\
exit(1);}} while(0)
#define CURAND_CALL(x) \
do { \
if((x) != CURAND_STATUS_SUCCESS) { \
printf("Error at %s:%d\n", __FILE__, __LINE__); \
exit(1); \
} \
} while(0)
namespace marian {
__global__
void gScale(float* data, int n, float p) {
__global__ void gScale(float* data, int n, float p) {
int index = threadIdx.x + blockIdx.x * blockDim.x;
while (index < n) {
while(index < n) {
data[index] = (data[index] < p) / p;
index += gridDim.x * blockDim.x;
}
}
void Dropout(Tensor tensor, float p,
curandGenerator_t gen) {
void Dropout(Tensor tensor, float p, curandGenerator_t gen) {
int n = tensor->size();
CURAND_CALL(curandGenerateUniform(gen, tensor->data(), n));
@@ -35,5 +39,4 @@ void Dropout(Tensor tensor, float p,
gScale<<<numBlocks, numThreads>>>(tensor->data(), n, 1.f - p);
}
}
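
For reference, a minimal sketch of driving Dropout() directly, using the error-checking macros above; the generator setup is the standard cuRAND host API, and t is assumed to be an already-allocated marian Tensor:

curandGenerator_t gen;
CURAND_CALL(curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT));
CURAND_CALL(curandSetPseudoRandomGeneratorSeed(gen, 1234ULL));
// Fills t with (uniform < 1-p) / (1-p): drops with probability p and
// rescales the survivors, exactly what gScale above implements.
Dropout(t, 0.3f, gen);
CURAND_CALL(curandDestroyGenerator(gen));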

View File

@@ -4,31 +4,56 @@
#include "tensors/tensor.h"
namespace marian {
namespace sparse {
void multiply(Ptr<CSR> C, const Ptr<CSR> A, const Ptr<CSR> B,
bool transA, bool transB) {
void multiply(
Ptr<CSR> C, const Ptr<CSR> A, const Ptr<CSR> B, bool transA, bool transB) {
cudaSetDevice(C->getDevice());
int nnzTotal;
C->allocRowIndices(A->rows());
CUSPARSE_CHECK(cusparseXcsrgemmNnz(A->handle(),
transA ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE,
transB ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE,
A->rows(), B->cols(), A->cols(),
A->description(), A->nnz(), A->rowIndices(), A->colIndices(),
B->description(), B->nnz(), B->rowIndices(), B->colIndices(),
C->description(), C->rowIndices(), &nnzTotal));
CUSPARSE_CHECK(cusparseXcsrgemmNnz(
A->handle(),
transA ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE,
transB ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE,
A->rows(),
B->cols(),
A->cols(),
A->description(),
A->nnz(),
A->rowIndices(),
A->colIndices(),
B->description(),
B->nnz(),
B->rowIndices(),
B->colIndices(),
C->description(),
C->rowIndices(),
&nnzTotal));
C->allocValues(nnzTotal);
C->allocColIndices(nnzTotal);
CUSPARSE_CHECK(cusparseScsrgemm(A->handle(),
transA ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE,
transB ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE,
A->rows(), B->cols(), A->cols(),
A->description(), A->nnz(), A->values(), A->rowIndices(), A->colIndices(),
B->description(), B->nnz(), B->values(), B->rowIndices(), B->colIndices(),
C->description(), C->values(), C->rowIndices(), C->colIndices()));
C->allocColIndices(nnzTotal);
CUSPARSE_CHECK(cusparseScsrgemm(
A->handle(),
transA ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE,
transB ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE,
A->rows(),
B->cols(),
A->cols(),
A->description(),
A->nnz(),
A->values(),
A->rowIndices(),
A->colIndices(),
B->description(),
B->nnz(),
B->values(),
B->rowIndices(),
B->colIndices(),
C->description(),
C->values(),
C->rowIndices(),
C->colIndices()));
}
//__global__ void gExpandAtt(float* out,
@@ -36,7 +61,7 @@ void multiply(Ptr<CSR> C, const Ptr<CSR> A, const Ptr<CSR> B,
// int batch,
// int srcWords,
// int nonzeros) {
//
//
// for(int bid = 0; bid < nonzeros; bid += blockDim.x * gridDim.x) {
// int index = bid + blockDim.x * blockIdx.x + threadIdx.x;
// if (index < nonzeros) {
@@ -48,26 +73,27 @@ void multiply(Ptr<CSR> C, const Ptr<CSR> A, const Ptr<CSR> B,
//}
//
//
//void ExpandAtt(Tensor out, Tensor in) {
// void ExpandAtt(Tensor out, Tensor in) {
// cudaSetDevice(in->getDevice());
// int nonzeros = in->shape().elements();
// int batch = in->shape()[0];
// int srcWords = in->shape()[2];
// int srcWords = in->shape()[2];
//
// int threads = std::min(MAX_THREADS, nonzeros);
// int blocks = std::min(MAX_BLOCKS, nonzeros / threads + (nonzeros % threads != 0));
// int blocks = std::min(MAX_BLOCKS, nonzeros / threads + (nonzeros % threads
// != 0));
//
// gCollapseAtt<<<blocks, threads>>>(out->data(), in->data(), batch, srcWords, nonzeros);
// gCollapseAtt<<<blocks, threads>>>(out->data(), in->data(), batch, srcWords,
// nonzeros);
//}
void LfaForward(Tensor out, Tensor logits, Tensor att, Ptr<CSR> sparseLf) {
cudaSetDevice(out->getDevice());
int batch = att->shape()[0];
int batch = att->shape()[0];
int srcWords = att->shape()[2];
int trgWords = att->shape()[3];
std::vector<float> values;
att->get(values);
int nonzeros = values.size();
@@ -89,25 +115,26 @@ void LfaForward(Tensor out, Tensor logits, Tensor att, Ptr<CSR> sparseLf) {
colInd[i] = std::get<1>(coo[i]);
values[i] = std::get<2>(coo[i]);
}
auto sparseAtt = New<CSR>(batch * trgWords, batch * srcWords,
values, rowInd, colInd, out->getDevice());
auto sparseLfa = New<CSR>(sparseAtt->rows(), sparseLf->cols(), out->getDevice());
auto sparseAtt = New<CSR>(batch * trgWords,
batch * srcWords,
values,
rowInd,
colInd,
out->getDevice());
auto sparseLfa
= New<CSR>(sparseAtt->rows(), sparseLf->cols(), out->getDevice());
multiply(sparseLfa, sparseAtt, sparseLf);
sparseLfa->toTensor(out);
}
__global__ void gCollapseAtt(float* out,
const float* in,
int batch,
int srcWords,
int nonzeros) {
__global__ void gCollapseAtt(
float* out, const float* in, int batch, int srcWords, int nonzeros) {
for(int bid = 0; bid < nonzeros; bid += blockDim.x * gridDim.x) {
int index = bid + blockDim.x * blockIdx.x + threadIdx.x;
if (index < nonzeros) {
if(index < nonzeros) {
int r = (index % batch) + (index / (srcWords * batch)) * batch;
int c = index % (srcWords * batch);
float val = in[r * srcWords * batch + c];
@@ -116,47 +143,57 @@ __global__ void gCollapseAtt(float* out,
}
}
void CollapseAtt(Tensor out, Tensor in) {
cudaSetDevice(out->getDevice());
int nonzeros = out->shape().elements();
int batch = out->shape()[0];
int srcWords = out->shape()[2];
int srcWords = out->shape()[2];
int threads = std::min(MAX_THREADS, nonzeros);
int blocks = std::min(MAX_BLOCKS, nonzeros / threads + (nonzeros % threads != 0));
int blocks
= std::min(MAX_BLOCKS, nonzeros / threads + (nonzeros % threads != 0));
gCollapseAtt<<<blocks, threads>>>(out->data(), in->data(), batch, srcWords, nonzeros);
gCollapseAtt<<<blocks, threads>>>(
out->data(), in->data(), batch, srcWords, nonzeros);
}
void LfaBackward(Tensor gradAtt, Tensor adj, Ptr<CSR> sparseLf) {
cudaSetDevice(adj->getDevice());
int batch = gradAtt->shape()[0];
int batch = gradAtt->shape()[0];
int srcWords = gradAtt->shape()[2];
int trgWords = gradAtt->shape()[3];
int nonzeros = gradAtt->shape().elements();
int dimTrgVoc = adj->shape()[1];
float* expandAttGradBuffer;
CUDA_CHECK(cudaMalloc(&expandAttGradBuffer, sizeof(float) * batch * srcWords * batch * trgWords));
CUDA_CHECK(cudaMalloc(&expandAttGradBuffer,
sizeof(float) * batch * srcWords * batch * trgWords));
float alpha = 1, beta = 0;
CUSPARSE_CHECK(cusparseScsrmm2(sparseLf->handle(),
CUSPARSE_OPERATION_NON_TRANSPOSE,
CUSPARSE_OPERATION_NON_TRANSPOSE,
sparseLf->rows(), batch * trgWords, sparseLf->cols(), sparseLf->nnz(), &alpha,
sparseLf->description(), sparseLf->values(), sparseLf->rowIndices(), sparseLf->colIndices(),
adj->data(), dimTrgVoc, &beta, expandAttGradBuffer, batch * srcWords));
Tensor expandAttGrad(new TensorBase(expandAttGradBuffer,
{batch * trgWords, batch * srcWords}, 0));
CUSPARSE_OPERATION_NON_TRANSPOSE,
CUSPARSE_OPERATION_NON_TRANSPOSE,
sparseLf->rows(),
batch * trgWords,
sparseLf->cols(),
sparseLf->nnz(),
&alpha,
sparseLf->description(),
sparseLf->values(),
sparseLf->rowIndices(),
sparseLf->colIndices(),
adj->data(),
dimTrgVoc,
&beta,
expandAttGradBuffer,
batch * srcWords));
Tensor expandAttGrad(new TensorBase(
expandAttGradBuffer, {batch * trgWords, batch * srcWords}, 0));
CollapseAtt(gradAtt, expandAttGrad);
CUDA_CHECK(cudaFree(expandAttGradBuffer));
}
}
}
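
The multiply() above follows the two-phase protocol of the legacy cuSPARSE csrgemm API: one call sizes the output matrix, the caller allocates, and a second call fills it. Stripped of the marian wrappers, the skeleton looks like this (handle, descriptors, dimensions, and the device-side CSR arrays of A and B are assumed to be set up already):

int nnzC;
// Phase 1: compute C's row-pointer array and its total number of nonzeros.
CUSPARSE_CHECK(cusparseXcsrgemmNnz(handle, opA, opB, m, n, k,
                                   descrA, nnzA, csrRowPtrA, csrColIndA,
                                   descrB, nnzB, csrRowPtrB, csrColIndB,
                                   descrC, csrRowPtrC, &nnzC));
// Phase 2: allocate column indices and values for nnzC entries, then fill.
CUDA_CHECK(cudaMalloc(&csrColIndC, nnzC * sizeof(int)));
CUDA_CHECK(cudaMalloc(&csrValC, nnzC * sizeof(float)));
CUSPARSE_CHECK(cusparseScsrgemm(handle, opA, opB, m, n, k,
                                descrA, nnzA, csrValA, csrRowPtrA, csrColIndA,
                                descrB, nnzB, csrValB, csrRowPtrB, csrColIndB,
                                descrC, csrValC, csrRowPtrC, csrColIndC));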

File diff suppressed because it is too large

View File

@@ -6,56 +6,48 @@
namespace marian {
struct AttentionNodeOp : public NaryNodeOp {
AttentionNodeOp(const std::vector<Expr>& nodes)
: NaryNodeOp(nodes,
keywords::shape=newShape(nodes)) {}
: NaryNodeOp(nodes, keywords::shape = newShape(nodes)) {}
Shape newShape(const std::vector<Expr>& nodes) {
Shape shape = nodes[1]->shape();
Shape vaShape = nodes[0]->shape();
Shape vaShape = nodes[0]->shape();
Shape ctxShape = nodes[1]->shape();
Shape stateShape = nodes[2]->shape();
for(int i = 0; i < stateShape.size(); ++i) {
UTIL_THROW_IF2(ctxShape[i] != stateShape[i] && ctxShape[i] != 1 && stateShape[i] != 1,
UTIL_THROW_IF2(ctxShape[i] != stateShape[i] && ctxShape[i] != 1
&& stateShape[i] != 1,
"Shapes cannot be broadcasted");
shape.set(i, std::max(ctxShape[i], stateShape[i]));
}
UTIL_THROW_IF2(vaShape[0] != shape[1] || vaShape[1] != 1,
"Wrong size");
UTIL_THROW_IF2(vaShape[0] != shape[1] || vaShape[1] != 1, "Wrong size");
shape.set(1, 1);
return shape;
}
NodeOps forwardOps() {
return {
NodeOp(Att(val_,
child(0)->val(),
child(1)->val(),
child(2)->val(),
children_.size() == 4 ? child(3)->val() : nullptr))
};
return {NodeOp(Att(val_,
child(0)->val(),
child(1)->val(),
child(2)->val(),
children_.size() == 4 ? child(3)->val() : nullptr))};
}
NodeOps backwardOps() {
return {
NodeOp(
AttBack(
child(0)->grad(),
child(1)->grad(),
child(2)->grad(),
children_.size() == 4 ? child(3)->grad() : nullptr,
child(0)->val(),
child(1)->val(),
child(2)->val(),
children_.size() == 4 ? child(3)->val() : nullptr,
adj_
);
)
NodeOp(AttBack(child(0)->grad(),
child(1)->grad(),
child(2)->grad(),
children_.size() == 4 ? child(3)->grad() : nullptr,
child(0)->val(),
child(1)->val(),
child(2)->val(),
children_.size() == 4 ? child(3)->val() : nullptr,
adj_);)
};
}
@@ -65,13 +57,9 @@ struct AttentionNodeOp : public NaryNodeOp {
op();
}
const std::string type() {
return "Att-ops";
}
const std::string type() { return "Att-ops"; }
const std::string color() {
return "yellow";
}
const std::string color() { return "yellow"; }
};
Expr attOps(Expr va, Expr context, Expr state, Expr coverage) {
@@ -81,9 +69,8 @@ Expr attOps(Expr va, Expr context, Expr state, Expr coverage) {
int dimBatch = context->shape()[0];
int dimWords = context->shape()[2];
int dimBeam = state->shape()[3];
int dimBeam = state->shape()[3];
return reshape(Expression<AttentionNodeOp>(nodes),
{dimWords, dimBatch, 1, dimBeam});
}
}
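
For orientation, AttentionNodeOp fuses the elementwise core of Bahdanau-style attention. With the context and state already projected outside the node (nodes[1] and nodes[2]), the energy it computes per target step t and source position j is, in the usual notation,

e_{tj} = v_a^\top \tanh(\bar{c}_j + \bar{s}_t [+ \bar{cov}_j]),

which is why the output shape above collapses dimension 1 to a singleton: the inner product with va reduces over the hidden dimension.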

View File

@@ -8,20 +8,16 @@ namespace marian {
struct GRUFastNodeOp : public NaryNodeOp {
bool final_;
template <typename ...Args>
GRUFastNodeOp(const std::vector<Expr>& nodes, bool final, Args ...args)
: NaryNodeOp(nodes,
args...),
final_(final) {}
template <typename... Args>
GRUFastNodeOp(const std::vector<Expr>& nodes, bool final, Args... args)
: NaryNodeOp(nodes, args...), final_(final) {}
NodeOps forwardOps() {
std::vector<Tensor> inputs;
for(int i = 0; i < children_.size(); ++i)
inputs.push_back(child(i)->val());
return {
NodeOp(GRUFastForward(val_, inputs, final_))
};
return {NodeOp(GRUFastForward(val_, inputs, final_))};
}
NodeOps backwardOps() {
@@ -35,9 +31,7 @@ struct GRUFastNodeOp : public NaryNodeOp {
outputs.push_back(nullptr);
}
return {
NodeOp(GRUFastBackward(outputs, inputs, adj_, final_))
};
return {NodeOp(GRUFastBackward(outputs, inputs, adj_, final_))};
}
// do not check if node is trainable
@@ -46,17 +40,12 @@ struct GRUFastNodeOp : public NaryNodeOp {
op();
}
const std::string type() {
return "GRU-ops";
}
const std::string type() { return "GRU-ops"; }
const std::string color() {
return "yellow";
}
const std::string color() { return "yellow"; }
};
Expr gruOps(const std::vector<Expr>& nodes, bool final) {
return Expression<GRUFastNodeOp>(nodes, final);
}
}
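
For orientation, GRUFastForward/GRUFastBackward fuse the elementwise part of the standard GRU cell; the input and state projections happen outside this node. In the usual notation (the direction in which z_t interpolates varies between implementations):

r_t = \sigma(W_r x_t + U_r h_{t-1})
z_t = \sigma(W_z x_t + U_z h_{t-1})
\tilde{h}_t = \tanh(W x_t + r_t \odot (U h_{t-1}))
h_t = z_t \odot h_{t-1} + (1 - z_t) \odot \tilde{h}_t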

View File

@@ -5,17 +5,16 @@ namespace marian {
thread_local Ptr<sparse::CSR> LexProbs::lexProbs_;
thread_local Ptr<sparse::CSR> LexProbs::lf_;
Expr LexicalBias::operator()(Expr logits) {
auto& alignmentsVec = attention_->getAlignments();
Expr aln;
if(single_)
aln = alignmentsVec.back();
else
aln = concatenate(alignmentsVec, keywords::axis=3);
aln = concatenate(alignmentsVec, keywords::axis = 3);
return lexical_bias(logits, aln, eps_, sentLexProbs_);
}
}

View File

@@ -1,16 +1,16 @@
#include "clippers.h"
#include "kernels/thrust_functions.h"
#include "kernels/tensor_operators.h"
#include "kernels/thrust_functions.h"
namespace marian {
void Elementwise::clip(Tensor t) {
Element(_1 = Clip(_1, c_), t);
}
void Elementwise::clip(Tensor t) {
Element(_1 = Clip(_1, c_), t);
}
void Norm::clip(Tensor t) {
float l2Norm = L2Norm(t);
if(l2Norm >= c_)
Element(_1 = (c_ / l2Norm) * _1, t);
}
void Norm::clip(Tensor t) {
float l2Norm = L2Norm(t);
if(l2Norm >= c_)
Element(_1 = (c_ / l2Norm) * _1, t);
}
}

View File

@@ -1,83 +1,77 @@
#include "optimizers.h"
#include "kernels/thrust_functions.h"
#include "kernels/tensor_operators.h"
#include "kernels/thrust_functions.h"
namespace marian {
void Sgd::updateImpl(Tensor params, Tensor grads) {
Element(_1 -= eta_ * _2, params, grads);
void Sgd::updateImpl(Tensor params, Tensor grads) {
Element(_1 -= eta_ * _2, params, grads);
}
void Adagrad::updateImpl(Tensor params, Tensor grads) {
if(!alloc_)
alloc_ = New<TensorAllocator>(params->getDevice());
if(!gt_) {
int totalSize = params->size();
alloc_->reserveExact(totalSize);
alloc_->allocate(gt_, {1, totalSize});
gt_->set(0);
}
void Adagrad::updateImpl(Tensor params, Tensor grads) {
if(!alloc_)
alloc_ = New<TensorAllocator>(params->getDevice());
if(!gt_) {
int totalSize = params->size();
alloc_->reserveExact(totalSize);
alloc_->allocate(gt_, {1, totalSize});
gt_->set(0);
}
Element(_1 += (_2 * _2), gt_, grads);
Element(_1 += (_2 * _2),
gt_, grads);
Element(_1 -= (eta_ / (Sqrt(_2) + eps_)) * _3, params, gt_, grads);
}
Element(_1 -= (eta_ / (Sqrt(_2) + eps_)) * _3,
params, gt_, grads);
void Adam::updateImpl(Tensor params, Tensor grads) {
if(!mtAlloc_)
mtAlloc_ = New<TensorAllocator>(params->getDevice());
if(!vtAlloc_)
vtAlloc_ = New<TensorAllocator>(params->getDevice());
if(!mt_) {
int totalSize = params->size();
mtAlloc_->reserveExact(totalSize);
mtAlloc_->allocate(mt_, {1, totalSize});
mt_->set(0);
vtAlloc_->reserveExact(totalSize);
vtAlloc_->allocate(vt_, {1, totalSize});
vt_->set(0);
}
void Adam::updateImpl(Tensor params, Tensor grads) {
if(!mtAlloc_)
mtAlloc_ = New<TensorAllocator>(params->getDevice());
if(!vtAlloc_)
vtAlloc_ = New<TensorAllocator>(params->getDevice());
if(!mt_) {
int totalSize = params->size();
mtAlloc_->reserveExact(totalSize);
mtAlloc_->allocate(mt_, {1, totalSize});
mt_->set(0);
t_++;
float denom1 = 1 - std::pow(beta1_, t_);
float denom2 = 1 - std::pow(beta2_, t_);
vtAlloc_->reserveExact(totalSize);
vtAlloc_->allocate(vt_, {1, totalSize});
vt_->set(0);
}
Element(_1 = (beta1_ * _1) + ((1 - beta1_) * _2), mt_, grads);
Element(_1 = (beta2_ * _1) + ((1 - beta2_) * (_2 * _2)), vt_, grads);
t_++;
float denom1 = 1 - std::pow(beta1_, t_);
float denom2 = 1 - std::pow(beta2_, t_);
Element(_1 -= eta_ * (_2 / denom1) / (Sqrt(_3 / denom2) + eps_),
params,
mt_,
vt_);
}
Element(_1 = (beta1_ * _1) + ((1 - beta1_) * _2),
mt_, grads);
Element(_1 = (beta2_ * _1) + ((1 - beta2_) * (_2 * _2)),
vt_, grads);
Ptr<OptimizerBase> Optimizer(Ptr<Config> options) {
Ptr<ClipperBase> clipper = nullptr;
float clipNorm = options->get<double>("clip-norm");
if(clipNorm > 0)
clipper = Clipper<Norm>(clipNorm);
Element(_1 -= eta_ * (_2 / denom1) / (Sqrt(_3 / denom2) + eps_),
params, mt_, vt_);
float lrate = options->get<double>("learn-rate");
std::string opt = options->get<std::string>("optimizer");
if(opt == "sgd") {
return Optimizer<Sgd>(lrate, keywords::clip = clipper);
} else if(opt == "adagrad") {
return Optimizer<Adagrad>(lrate, keywords::clip = clipper);
} else if(opt == "adam") {
return Optimizer<Adam>(lrate, keywords::clip = clipper);
} else {
UTIL_THROW2("Unknown optimizer: " << opt);
}
Ptr<OptimizerBase> Optimizer(Ptr<Config> options) {
Ptr<ClipperBase> clipper = nullptr;
float clipNorm = options->get<double>("clip-norm");
if(clipNorm > 0)
clipper = Clipper<Norm>(clipNorm);
float lrate = options->get<double>("learn-rate");
std::string opt = options->get<std::string>("optimizer");
if(opt == "sgd") {
return Optimizer<Sgd>(lrate, keywords::clip=clipper);
}
else if(opt == "adagrad") {
return Optimizer<Adagrad>(lrate, keywords::clip=clipper);
}
else if(opt == "adam") {
return Optimizer<Adam>(lrate, keywords::clip=clipper);
}
else {
UTIL_THROW2("Unknown optimizer: " << opt);
}
}
}
}
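
The Adam branch above is the usual bias-corrected update; written out, the three Element() calls compute

m_t = \beta_1 m_{t-1} + (1 - \beta_1) g_t
v_t = \beta_2 v_{t-1} + (1 - \beta_2) g_t^2
\theta_t = \theta_{t-1} - \eta \, \frac{m_t / (1 - \beta_1^t)}{\sqrt{v_t / (1 - \beta_2^t)} + \epsilon}

with denom1 and denom2 in the code holding the 1 - \beta^t correction terms.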

View File

@@ -3,42 +3,42 @@
#include <thrust/device_ptr.h>
#include <thrust/device_vector.h>
#include "tensors/tensor.h"
#include "kernels/tensor_operators.h"
#include "kernels/cuda_helpers.h"
#include "kernels/tensor_operators.h"
#include "tensors/tensor.h"
namespace marian {
__global__ void gFill(float* d_in, int size, float val) {
__global__ void gFill(float *d_in, int size, float val) {
for(int bid = 0; bid < size; bid += blockDim.x * gridDim.x) {
int index = bid + threadIdx.x + blockDim.x * blockIdx.x;
if (index < size) {
if(index < size) {
d_in[index] = val;
}
}
}
float TensorBase::get(size_t i) {
cudaSetDevice(device_);
float temp;
CUDA_CHECK(cudaMemcpy(&temp, data_ + i, sizeof(float),
cudaMemcpyDeviceToHost));
cudaStreamSynchronize(0);
return temp;
}
cudaSetDevice(device_);
float temp;
CUDA_CHECK(
cudaMemcpy(&temp, data_ + i, sizeof(float), cudaMemcpyDeviceToHost));
cudaStreamSynchronize(0);
return temp;
}
void TensorBase::set(size_t i, float value) {
cudaSetDevice(device_);
CUDA_CHECK(cudaMemcpy(data_ + i, &value, sizeof(float),
cudaMemcpyHostToDevice));
CUDA_CHECK(
cudaMemcpy(data_ + i, &value, sizeof(float), cudaMemcpyHostToDevice));
cudaStreamSynchronize(0);
}
void TensorBase::get(std::vector<float> &v) {
CUDA_CHECK(cudaSetDevice(device_));
v.resize(size());
CUDA_CHECK(cudaMemcpy(v.data(), data_, size() * sizeof(float),
cudaMemcpyDeviceToHost));
CUDA_CHECK(cudaMemcpy(
v.data(), data_, size() * sizeof(float), cudaMemcpyDeviceToHost));
cudaStreamSynchronize(0);
}
@@ -52,8 +52,8 @@ void TensorBase::set(float value) {
void TensorBase::set(const std::vector<float> &v) {
CUDA_CHECK(cudaSetDevice(device_));
CUDA_CHECK(cudaMemcpy(data_, v.data(), v.size() * sizeof(float),
cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(
data_, v.data(), v.size() * sizeof(float), cudaMemcpyHostToDevice));
cudaStreamSynchronize(0);
}
@@ -64,12 +64,11 @@ void TensorBase::setSparse(const std::vector<size_t> &k,
cudaStreamSynchronize(0);
}
void TensorBase::copyFrom(Tensor in) {
cudaSetDevice(device_);
CUDA_CHECK(cudaMemcpy(data_ , in->data() , in->size() * sizeof(float),
cudaMemcpyDefault));
cudaStreamSynchronize(0);
cudaSetDevice(device_);
CUDA_CHECK(cudaMemcpy(
data_, in->data(), in->size() * sizeof(float), cudaMemcpyDefault));
cudaStreamSynchronize(0);
}
std::string TensorBase::debug() {
@@ -78,9 +77,9 @@ std::string TensorBase::debug() {
assert(shape_.size());
strm << "shape=" << shape_[0];
for(int i = 1; i < shape_.size(); ++i)
strm << "x" << shape_[i];
strm << " size=" << shape_.elements()
<< " (" << shape_.elements() * sizeof(float) << "B)";
strm << "x" << shape_[i];
strm << " size=" << shape_.elements() << " ("
<< shape_.elements() * sizeof(float) << "B)";
strm << " device=" << device_ << std::endl;
// values
@@ -88,78 +87,76 @@ std::string TensorBase::debug() {
std::vector<float> values(totSize);
get(values);
size_t dispCols = 5;
strm << std::fixed << std::setprecision(8) << std::setfill(' ');
for(size_t l = 0; l < shape()[3]; ++l) {
for(size_t k = 0; k < shape()[2]; ++k) {
strm << "[ ";
if(shape()[0] > 10) {
for (size_t i = 0; i < shape()[0] && i < dispCols; ++i) {
if(i > 0)
strm << std::endl << " ";
for (size_t j = 0; j < shape()[1] && j < dispCols; ++j) {
strm << std::setw(12)
<< values[ i * shape().stride(0)
+ j * shape().stride(1)
+ k * shape().stride(2)
+ l * shape().stride(3) ] << " ";
}
if(shape()[1] > dispCols)
strm << "... ";
for (size_t j = shape()[1] - dispCols; j < shape()[1]; ++j) {
strm << std::setw(12)
<< values[ i * shape().stride(0)
+ j * shape().stride(1)
+ k * shape().stride(2)
+ l * shape().stride(3) ] << " ";
}
strm << "[ ";
if(shape()[0] > 10) {
for(size_t i = 0; i < shape()[0] && i < dispCols; ++i) {
if(i > 0)
strm << std::endl << " ";
for(size_t j = 0; j < shape()[1] && j < dispCols; ++j) {
strm << std::setw(12)
<< values[i * shape().stride(0) + j * shape().stride(1)
+ k * shape().stride(2)
+ l * shape().stride(3)]
<< " ";
}
strm << std::endl << " ...";
for (size_t i = shape()[0] - dispCols; i < shape()[0]; ++i) {
if(i > 0)
strm << std::endl << " ";
for (size_t j = 0; j < shape()[1] && j < dispCols; ++j) {
strm << std::setw(12)
<< values[ i * shape().stride(0)
+ j * shape().stride(1)
+ k * shape().stride(2)
+ l * shape().stride(3) ] << " ";
}
if(shape()[1] > dispCols)
strm << "... ";
for (size_t j = shape()[1] - dispCols; j < shape()[1]; ++j) {
strm << std::setw(12)
<< values[ i * shape().stride(0)
+ j * shape().stride(1)
+ k * shape().stride(2)
+ l * shape().stride(3) ] << " ";
}
if(shape()[1] > dispCols)
strm << "... ";
for(size_t j = shape()[1] - dispCols; j < shape()[1]; ++j) {
strm << std::setw(12)
<< values[i * shape().stride(0) + j * shape().stride(1)
+ k * shape().stride(2)
+ l * shape().stride(3)]
<< " ";
}
}
else {
for (size_t i = 0; i < shape()[0] && i < 10; ++i) {
if(i > 0)
strm << std::endl << " ";
for (size_t j = 0; j < shape()[1] && j < dispCols; ++j) {
strm << std::setw(12)
<< values[ i * shape().stride(0)
+ j * shape().stride(1)
+ k * shape().stride(2)
+ l * shape().stride(3) ] << " ";
}
if(shape()[1] > dispCols)
strm << "... ";
for (size_t j = shape()[1] - dispCols; j < shape()[1]; ++j) {
strm << std::setw(12)
<< values[ i * shape().stride(0)
+ j * shape().stride(1)
+ k * shape().stride(2)
+ l * shape().stride(3) ] << " ";
}
}
strm << std::endl << " ...";
for(size_t i = shape()[0] - dispCols; i < shape()[0]; ++i) {
if(i > 0)
strm << std::endl << " ";
for(size_t j = 0; j < shape()[1] && j < dispCols; ++j) {
strm << std::setw(12)
<< values[i * shape().stride(0) + j * shape().stride(1)
+ k * shape().stride(2)
+ l * shape().stride(3)]
<< " ";
}
}
strm << "]" << std::endl;
if(shape()[1] > dispCols)
strm << "... ";
for(size_t j = shape()[1] - dispCols; j < shape()[1]; ++j) {
strm << std::setw(12)
<< values[i * shape().stride(0) + j * shape().stride(1)
+ k * shape().stride(2)
+ l * shape().stride(3)]
<< " ";
}
}
} else {
for(size_t i = 0; i < shape()[0] && i < 10; ++i) {
if(i > 0)
strm << std::endl << " ";
for(size_t j = 0; j < shape()[1] && j < dispCols; ++j) {
strm << std::setw(12)
<< values[i * shape().stride(0) + j * shape().stride(1)
+ k * shape().stride(2)
+ l * shape().stride(3)]
<< " ";
}
if(shape()[1] > dispCols)
strm << "... ";
for(size_t j = shape()[1] - dispCols; j < shape()[1]; ++j) {
strm << std::setw(12)
<< values[i * shape().stride(0) + j * shape().stride(1)
+ k * shape().stride(2)
+ l * shape().stride(3)]
<< " ";
}
}
}
strm << "]" << std::endl;
}
}
return strm.str();
@@ -174,36 +171,34 @@ DeviceGPU::~DeviceGPU() {
}
void DeviceGPU::reserve(size_t size) {
cudaSetDevice(device_);
cudaSetDevice(device_);
UTIL_THROW_IF2(size < size_, "New size must be larger than old size");
UTIL_THROW_IF2(size < size_, "New size must be larger than old size");
if(data_) {
// Allocate memory by going through host memory
float *temp = new float[size_];
CUDA_CHECK(cudaMemcpy(temp, data_, size_* sizeof(float),
cudaMemcpyDeviceToHost));
CUDA_CHECK(cudaFree(data_));
CUDA_CHECK(cudaMalloc(&data_, size * sizeof(float)));
CUDA_CHECK(cudaMemcpy(data_, temp, size_* sizeof(float),
cudaMemcpyHostToDevice));
delete[] temp;
}
else {
CUDA_CHECK(cudaMalloc(&data_, size * sizeof(float)));
}
if(data_) {
// Allocate memory by going through host memory
float *temp = new float[size_];
CUDA_CHECK(
cudaMemcpy(temp, data_, size_ * sizeof(float), cudaMemcpyDeviceToHost));
CUDA_CHECK(cudaFree(data_));
CUDA_CHECK(cudaMalloc(&data_, size * sizeof(float)));
CUDA_CHECK(
cudaMemcpy(data_, temp, size_ * sizeof(float), cudaMemcpyHostToDevice));
delete[] temp;
} else {
CUDA_CHECK(cudaMalloc(&data_, size * sizeof(float)));
}
size_ = size;
size_ = size;
}
Tensor operator<<(Tensor t, const std::vector<float>& v) {
Tensor operator<<(Tensor t, const std::vector<float> &v) {
t->set(v);
return t;
}
Tensor operator>>(Tensor t, std::vector<float>& v) {
Tensor operator>>(Tensor t, std::vector<float> &v) {
t->get(v);
return t;
}
}

View File

@@ -1,13 +1,13 @@
#include <iostream>
#include <cuda.h>
#include <algorithm>
#include <cmath>
#include <functional>
#include <iostream>
#include <iostream>
#include <iterator>
#include <random>
#include <string>
#include <vector>
#include <cmath>
#include <random>
#include <algorithm>
#include <iterator>
#include <iostream>
#include <functional>
#include "layers/generic.h"
#include "marian.h"
@@ -40,24 +40,23 @@ int main(int argc, char** argv) {
graph->setDevice(0);
graph->reserveWorkspaceMB(128);
auto x = graph->param("x", {batchSize, 3072}, init=inits::from_vector(temp));
auto gamma = graph->param("gamma", {1, 3072}, init=inits::from_value(2.0));
auto beta = graph->param("beta", {1, 3072}, init=inits::zeros);
auto x
= graph->param("x", {batchSize, 3072}, init = inits::from_vector(temp));
auto gamma
= graph->param("gamma", {1, 3072}, init = inits::from_value(2.0));
auto beta = graph->param("beta", {1, 3072}, init = inits::zeros);
auto y = layer_norm(x, gamma, beta);
auto yLogitsL1 = Dense("ff_logit_l1", 512,
activation=act::tanh,
normalize=true)
(y, y, y);
auto yLogitsL1 = Dense(
"ff_logit_l1", 512, activation = act::tanh, normalize = true)(y, y, y);
auto yLogitsL2 = Dense("ff_logit_l2", 50000)
(yLogitsL1);
auto yLogitsL2 = Dense("ff_logit_l2", 50000)(yLogitsL1);
auto idx = graph->constant({(int)indeces.size(), 1},
init=inits::from_vector(indeces));
init = inits::from_vector(indeces));
auto ce = cross_entropy(yLogitsL2, idx);
auto cost = mean(sum(ce, keywords::axis=2), keywords::axis=0);
auto cost = mean(sum(ce, keywords::axis = 2), keywords::axis = 0);
debug(x, "x");
debug(gamma, "gamma");
@@ -72,7 +71,8 @@ int main(int argc, char** argv) {
graph->setDevice(0);
graph->reserveWorkspaceMB(128);
auto x = graph->param("x", {batchSize, 3072}, init=inits::from_vector(temp));
auto x = graph->param("x", {batchSize, 3072},
init=inits::from_vector(temp));
auto gamma = graph->param("gamma", {1, 3072}, init=inits::from_value(2.0));
auto beta = graph->param("beta", {1, 3072}, init=inits::zeros);
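
The layer_norm() under test computes the standard per-example normalization; as a restatement for reference, with mean \mu and variance \sigma^2 taken over the 3072 features of each row,

LN(x) = \gamma \odot \frac{x - \mu}{\sqrt{\sigma^2 + \epsilon}} + \beta,

where gamma and beta are the {1, 3072} parameters built above and broadcast over the batch.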

View File

@@ -1,14 +1,14 @@
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <curand.h>
#include <vector>
#include <boost/timer/timer.hpp>
#include <stdio.h>
#include <stdlib.h>
#include <boost/chrono.hpp>
#include <boost/timer/timer.hpp>
#include <vector>
#include "training/config.h"
#include "marian.h"
#include "layers/param_initializers.h"
#include "marian.h"
#include "training/config.h"
using namespace marian;
using namespace keywords;

View File

@@ -1,17 +1,17 @@
#include <algorithm>
#include <boost/chrono.hpp>
#include <boost/timer/timer.hpp>
#include <chrono>
#include <cstdio>
#include <iomanip>
#include <string>
#include <cstdio>
#include <boost/timer/timer.hpp>
#include <boost/chrono.hpp>
#include "marian.h"
#include "training/config.h"
#include "optimizers/optimizers.h"
#include "optimizers/clippers.h"
#include "data/batch_generator.h"
#include "data/corpus.h"
#include "marian.h"
#include "optimizers/clippers.h"
#include "optimizers/optimizers.h"
#include "training/config.h"
#include "models/amun.h"
#include "models/s2s.h"
@@ -34,8 +34,9 @@ int main(int argc, char** argv) {
if(options->has("lexical-table"))
lexProbs = New<LexProbs>(options,
corpus->getVocabs().front(),
corpus->getVocabs().back(), device);
corpus->getVocabs().back(),
device);
auto type = options->get<std::string>("type");
Ptr<EncoderDecoderBase> encdec;
if(type == "s2s")
@@ -43,7 +44,7 @@
else if(type == "multi-s2s")
encdec = New<MultiS2S>(options);
else
encdec = New<Amun>(options, keywords::lex_probs=lexProbs);
encdec = New<Amun>(options, keywords::lex_probs = lexProbs);
auto model = options->get<std::string>("model");
if(boost::filesystem::exists(model))
@@ -52,7 +53,7 @@
graph->reserveWorkspaceMB(options->get<size_t>("workspace"));
boost::timer::cpu_timer timer;
//size_t batches = 1;
// size_t batches = 1;
for(int i = 0; i < 1; ++i) {
bg.prepare(false);
while(bg) {
@@ -60,12 +61,12 @@
batch->debug();
auto costNode = encdec->build(graph, batch);
//for(auto p : graph->params())
// for(auto p : graph->params())
// debug(p, p->name());
debug(costNode, "cost");
//graph->graphviz("debug.dot");
// graph->graphviz("debug.dot");
graph->forward();
graph->backward();
@@ -73,7 +74,7 @@
}
}
//encdec->save(graph, "test.npz", true);
// encdec->save(graph, "test.npz", true);
std::cout << std::endl;
std::cout << timer.format(5, "%ws") << std::endl;

View File

@@ -1,32 +1,32 @@
#include <boost/timer/timer.hpp>
#include <iostream>
#include <map>
#include <boost/timer/timer.hpp>
#include "training/config.h"
#include "tensors/tensor_allocator.h"
#include "training/config.h"
#include "kernels/tensor_operators.h"
#include "common/logging.h"
#include "kernels/tensor_operators.h"
int main(int argc, char** argv) {
using namespace marian;
marian::Config config(argc, argv);
TensorAllocator ta(0);
int batchSize = 64;
int hidden = 2048;
int words = 13;
Tensor out1, out2, out3;
Tensor a1, a2, a3;
Tensor b1, b2, b3;
ta.allocate(out1, {batchSize, hidden, 1});
ta.allocate(out2, {batchSize, hidden, 1});
ta.allocate(out3, {batchSize, hidden, words});
ta.allocate(a1, {batchSize, hidden, 1});
ta.allocate(a2, {batchSize, hidden, 1});
ta.allocate(a3, {batchSize, 1, words});
@@ -34,73 +34,72 @@ int main(int argc, char** argv) {
ta.allocate(b1, {1, hidden, 1});
ta.allocate(b2, {batchSize, 1, 1});
ta.allocate(b3, {batchSize, hidden, 1});
out1->set(0);
out2->set(0);
out3->set(0);
a1->set(1);
a2->set(1);
a3->set(1);
b1->set(2);
b2->set(2);
b3->set(2);
for(int i = 0; i < 100; i++) {
Add(_1 * _2, out1, a1, b1);
Add(_1 * _2, out2, a2, b2);
Add(_1 * _2, out3, a3, b3);
}
//auto srcVocab = New<Vocab>();
//auto trgVocab = New<Vocab>();
// auto srcVocab = New<Vocab>();
// auto trgVocab = New<Vocab>();
//
//srcVocab->load("model/vocab.ro.yml");
//trgVocab->load("model/vocab.en.yml");
// srcVocab->load("model/vocab.ro.yml");
// trgVocab->load("model/vocab.en.yml");
//
//int srcDim = 50;
//int trgDim = 50;
// int srcDim = 50;
// int trgDim = 50;
//
//auto probs = New<LexProbs>("data/lex.s2t",
// auto probs = New<LexProbs>("data/lex.s2t",
// srcVocab, trgVocab,
// srcDim, trgDim, 0);
//
//TensorAllocator ta(0);
// TensorAllocator ta(0);
//
//int batchSize = 1;
//int srcWords = 6;
//int trgWords = 2;
// int batchSize = 1;
// int srcWords = 6;
// int trgWords = 2;
//
//
//std::vector<Ptr<data::SubBatch>> batches;
//batches.push_back(New<data::SubBatch>(batchSize, srcWords));
//batches.back()->indeces() = { 3, 4, 0, 1, 2, 0 };
// std::vector<Ptr<data::SubBatch>> batches;
// batches.push_back(New<data::SubBatch>(batchSize, srcWords));
// batches.back()->indeces() = { 3, 4, 0, 1, 2, 0 };
//
//auto batch = New<data::CorpusBatch>(batches);
// auto batch = New<data::CorpusBatch>(batches);
//
//Tensor att, logits;
//Tensor lf, lfa;
// Tensor att, logits;
// Tensor lf, lfa;
//
//ta.allocate(att, {batchSize, 1, srcWords, trgWords});
//ta.allocate(logits, {batchSize, trgDim, trgWords});
//ta.allocate(lf, {batchSize, trgDim, srcWords, 1});
//ta.allocate(lfa, {batchSize, trgDim, trgWords});
// ta.allocate(att, {batchSize, 1, srcWords, trgWords});
// ta.allocate(logits, {batchSize, trgDim, trgWords});
// ta.allocate(lf, {batchSize, trgDim, srcWords, 1});
// ta.allocate(lfa, {batchSize, trgDim, trgWords});
//
//logits->set(0);
// logits->set(0);
//
//auto slf = probs->Lf(batch);
//slf->toTensor(lf);
//std::cerr << lf->debug() << std::endl;
// auto slf = probs->Lf(batch);
// slf->toTensor(lf);
// std::cerr << lf->debug() << std::endl;
//
//std::vector<float> av = { 0.9, 0.05, 0.02, 0.01, 0.01, 0.01,
// std::vector<float> av = { 0.9, 0.05, 0.02, 0.01, 0.01, 0.01,
// 0.9, 0.05, 0.02, 0.01, 0.01, 0.01 };
//att->set(av);
//std::cerr << att->debug() << std::endl;
// att->set(av);
// std::cerr << att->debug() << std::endl;
//
//sparse::LfaForward(lfa, logits, att, slf);
//std::cerr << lfa->debug() << std::endl;
// sparse::LfaForward(lfa, logits, att, slf);
// std::cerr << lfa->debug() << std::endl;
return 0;
}

View File

@@ -1,18 +1,21 @@
#include <limits>
#include <cuda.h>
#include <limits>
#include "translator/helpers.h"
#include "tensors/tensor.h"
#include "data/types.h"
#include "tensors/tensor.h"
#include "translator/helpers.h"
namespace marian {
__global__ void gSetColumn(float* d_in, size_t n_columns, size_t n_rows,
size_t noColumn, float value) {
size_t rowNumber = threadIdx.x + blockDim.x * blockIdx.x;
__global__ void gSetColumn(float* d_in,
size_t n_columns,
size_t n_rows,
size_t noColumn,
float value) {
size_t rowNumber = threadIdx.x + blockDim.x * blockIdx.x;
size_t index = noColumn + rowNumber * n_columns;
if (index < n_columns * n_rows) {
if(index < n_columns * n_rows) {
d_in[index] = value;
}
}
@@ -20,8 +23,8 @@ __global__ void gSetColumn(float* d_in, size_t n_columns, size_t n_rows,
void SetColumn(Tensor in, size_t col, float value) {
int nRows = in->shape()[0] * in->shape()[2] * in->shape()[3];
int nColumns = in->shape()[1];
int nBlocks = nRows / 512 + ((nRows % 512 == 0) ? 0 : 1);
int nBlocks = nRows / 512 + ((nRows % 512 == 0) ? 0 : 1);
int nThreads = std::min(512, nRows);
gSetColumn<<<nBlocks, nThreads>>>(in->data(), nColumns, nRows, col, value);
@@ -34,6 +37,4 @@ void suppressUnk(Expr probs) {
void suppressWord(Expr probs, Word id) {
SetColumn(probs->val(), id, std::numeric_limits<float>::lowest());
}
}

View File

@@ -4,29 +4,35 @@
namespace marian {
void HandleError(cudaError_t err, const char *file, int line ) {
if (err != cudaSuccess) {
UTIL_THROW2("ERROR: " << cudaGetErrorString(err) << " in " << file << " at line " << line);
void HandleError(cudaError_t err, const char* file, int line) {
if(err != cudaSuccess) {
UTIL_THROW2("ERROR: " << cudaGetErrorString(err) << " in " << file
<< " at line "
<< line);
}
}
#define UNROLL_MAXARG_LOOP( n, max ) \
if (tid < (n) && tid + (n) < ( max ) ) { \
if (sdata[tid + ( n ) ] > sdata[tid]) { \
sdata[tid] = sdata[tid + ( n ) ]; \
indices[tid] = indices[tid + ( n ) ]; \
} \
#define UNROLL_MAXARG_LOOP(n, max) \
if(tid < (n) && tid + (n) < (max)) { \
if(sdata[tid + (n)] > sdata[tid]) { \
sdata[tid] = sdata[tid + (n)]; \
indices[tid] = indices[tid + (n)]; \
} \
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
#define HANDLE_ERROR(err) (HandleError(err, __FILE__, __LINE__))
__global__ void gMaxElement(float* d_out, int* d_ind, float* d_in, int numBatches, int* batchFirstElementIdxs) {
__global__ void gMaxElement(float* d_out,
int* d_ind,
float* d_in,
int numBatches,
int* batchFirstElementIdxs) {
extern __shared__ float sdata[];
__shared__ int indices[512];
int tid = threadIdx.x;
for (int batchIdx = 0; batchIdx < numBatches; ++batchIdx) {
for(int batchIdx = 0; batchIdx < numBatches; ++batchIdx) {
int begin = batchFirstElementIdxs[batchIdx];
int end = batchFirstElementIdxs[batchIdx + 1];
@@ -34,15 +40,15 @@ __global__ void gMaxElement(float* d_out, int* d_ind, float* d_in, int numBatche
sdata[tid] = -3.40282e+38f;
if (i < end) {
if(i < end) {
sdata[tid] = d_in[i];
indices[tid] = i;
}
if (i + blockDim.x < end) {
if(i + blockDim.x < end) {
float a = d_in[i];
float b = d_in[i + blockDim.x];
if (a > b) {
if(a > b) {
sdata[tid] = a;
indices[tid] = i;
} else {
@@ -51,18 +57,18 @@ __global__ void gMaxElement(float* d_out, int* d_ind, float* d_in, int numBatche
}
}
while (i + 2 * gridDim.x * blockDim.x < end) {
while(i + 2 * gridDim.x * blockDim.x < end) {
i += 2 * gridDim.x * blockDim.x;
float a = d_in[i];
if (a > sdata[tid]) {
if(a > sdata[tid]) {
sdata[tid] = a;
indices[tid] = i;
}
if (i + blockDim.x < end) {
if(i + blockDim.x < end) {
float b = d_in[i + blockDim.x];
if (b > sdata[tid]) {
if(b > sdata[tid]) {
sdata[tid] = b;
indices[tid] = i + blockDim.x;
}
@@ -71,9 +77,9 @@ __global__ void gMaxElement(float* d_out, int* d_ind, float* d_in, int numBatche
__syncthreads();
for (int s = (blockDim.x >> 1); s > 32; s >>= 1) {
if (tid < s && tid + s < end) {
if (sdata[tid + s] > sdata[tid]) {
for(int s = (blockDim.x >> 1); s > 32; s >>= 1) {
if(tid < s && tid + s < end) {
if(sdata[tid + s] > sdata[tid]) {
sdata[tid] = sdata[tid + s];
indices[tid] = indices[tid + s];
}
@@ -88,7 +94,7 @@ __global__ void gMaxElement(float* d_out, int* d_ind, float* d_in, int numBatche
UNROLL_MAXARG_LOOP(2, end);
UNROLL_MAXARG_LOOP(1, end);
if (tid == 0) {
if(tid == 0) {
d_out[blockIdx.x + batchIdx * gridDim.x] = sdata[0];
d_ind[blockIdx.x + batchIdx * gridDim.x] = indices[0];
}
@@ -96,7 +102,14 @@ __global__ void gMaxElement(float* d_out, int* d_ind, float* d_in, int numBatche
}
}
__global__ void gMaxElementUpdate(float* binCosts, int* binIdxs, float* probs, int *batchFirstElements, float* outCosts, int* outIdxs, int *cummulatedBeamSizes, int NUM_BLOCKS) {
__global__ void gMaxElementUpdate(float* binCosts,
int* binIdxs,
float* probs,
int* batchFirstElements,
float* outCosts,
int* outIdxs,
int* cummulatedBeamSizes,
int NUM_BLOCKS) {
extern __shared__ float sdata[];
__shared__ int indices[512];
__shared__ float bestBinCost;
@@ -106,24 +119,26 @@ __global__ void gMaxElementUpdate(float* binCosts, int* binIdxs, float* probs, i
const int batchIdx = blockIdx.x;
const int N = batchFirstElements[batchIdx + 1] - batchFirstElements[batchIdx];
int num_bins = int(N / (2 * 512)) + int(N % (2 * 512) != 0);
if (num_bins > 500) {
if(num_bins > 500) {
num_bins = 500;
}
for (int pos = cummulatedBeamSizes[batchIdx]; pos < cummulatedBeamSizes[batchIdx + 1]; ++pos) {
for(int pos = cummulatedBeamSizes[batchIdx];
pos < cummulatedBeamSizes[batchIdx + 1];
++pos) {
int i = tid;
sdata[tid] = -3.40282e+38f;
if (i < num_bins) {
if(i < num_bins) {
sdata[tid] = binCosts[batchIdx * NUM_BLOCKS + i];
indices[tid] = i;
}
if (i + blockDim.x < num_bins) {
if(i + blockDim.x < num_bins) {
float a = binCosts[batchIdx * NUM_BLOCKS + i];
float b = binCosts[batchIdx * NUM_BLOCKS + i + blockDim.x];
if (a > b) {
if(a > b) {
sdata[tid] = a;
indices[tid] = i;
} else {
@@ -132,18 +147,18 @@ __global__ void gMaxElementUpdate(float* binCosts, int* binIdxs, float* probs, i
}
}
while (i + 2 * blockDim.x < num_bins) {
while(i + 2 * blockDim.x < num_bins) {
i += 2 * blockDim.x;
float a = binCosts[batchIdx * NUM_BLOCKS + i];
if (a > sdata[tid]) {
if(a > sdata[tid]) {
sdata[tid] = a;
indices[tid] = i;
}
if (i + blockDim.x < num_bins) {
if(i + blockDim.x < num_bins) {
float b = binCosts[batchIdx * NUM_BLOCKS + i + blockDim.x];
if (b > sdata[tid]) {
if(b > sdata[tid]) {
sdata[tid] = b;
indices[tid] = i + blockDim.x;
}
@@ -152,9 +167,9 @@ __global__ void gMaxElementUpdate(float* binCosts, int* binIdxs, float* probs, i
__syncthreads();
for (int s = (blockDim.x >> 1); s > 32; s >>= 1) {
if (tid < s && tid + s < num_bins) {
if (sdata[tid + s] > sdata[tid]) {
for(int s = (blockDim.x >> 1); s > 32; s >>= 1) {
if(tid < s && tid + s < num_bins) {
if(sdata[tid + s] > sdata[tid]) {
sdata[tid] = sdata[tid + s];
indices[tid] = indices[tid + s];
}
@@ -169,7 +184,7 @@ __global__ void gMaxElementUpdate(float* binCosts, int* binIdxs, float* probs, i
UNROLL_MAXARG_LOOP(2, num_bins);
UNROLL_MAXARG_LOOP(1, num_bins);
if (tid == 0) {
if(tid == 0) {
bestBinCost = sdata[0];
bestBinCostIdx = batchIdx * NUM_BLOCKS + indices[0];
@@ -181,20 +196,21 @@ __global__ void gMaxElementUpdate(float* binCosts, int* binIdxs, float* probs, i
__syncthreads();
i = batchFirstElements[batchIdx] + (bestBinCostIdx - batchIdx * NUM_BLOCKS) * (blockDim.x * 2) + tid;
i = batchFirstElements[batchIdx]
+ (bestBinCostIdx - batchIdx * NUM_BLOCKS) * (blockDim.x * 2) + tid;
const int dist = num_bins * 2 * blockDim.x;
sdata[tid] = -3.40282e+38f;
if (i < batchFirstElements[batchIdx + 1]) {
if(i < batchFirstElements[batchIdx + 1]) {
sdata[tid] = probs[i];
indices[tid] = i;
}
if (i + blockDim.x < batchFirstElements[batchIdx + 1]) {
if(i + blockDim.x < batchFirstElements[batchIdx + 1]) {
float a = probs[i];
float b = probs[i+blockDim.x];
if (a > b) {
float b = probs[i + blockDim.x];
if(a > b) {
sdata[tid] = a;
indices[tid] = i;
} else {
@@ -203,18 +219,18 @@ __global__ void gMaxElementUpdate(float* binCosts, int* binIdxs, float* probs, i
}
}
while (i + dist < batchFirstElements[batchIdx + 1]) {
while(i + dist < batchFirstElements[batchIdx + 1]) {
i += dist;
float a = probs[i];
if (a > sdata[tid]) {
if(a > sdata[tid]) {
sdata[tid] = a;
indices[tid] = i;
}
if (i + blockDim.x < batchFirstElements[batchIdx + 1]) {
if(i + blockDim.x < batchFirstElements[batchIdx + 1]) {
float b = probs[i + blockDim.x];
if (b > sdata[tid]) {
if(b > sdata[tid]) {
sdata[tid] = b;
indices[tid] = i + blockDim.x;
}
@@ -223,9 +239,9 @@ __global__ void gMaxElementUpdate(float* binCosts, int* binIdxs, float* probs, i
__syncthreads();
for (int s = (blockDim.x >> 1); s > 32; s >>= 1) {
if (tid < s && tid + s < batchFirstElements[batchIdx + 1]) {
if (sdata[tid + s] > sdata[tid]) {
for(int s = (blockDim.x >> 1); s > 32; s >>= 1) {
if(tid < s && tid + s < batchFirstElements[batchIdx + 1]) {
if(sdata[tid + s] > sdata[tid]) {
sdata[tid] = sdata[tid + s];
indices[tid] = indices[tid + s];
}
@@ -240,7 +256,7 @@ __global__ void gMaxElementUpdate(float* binCosts, int* binIdxs, float* probs, i
UNROLL_MAXARG_LOOP(2, batchFirstElements[batchIdx + 1]);
UNROLL_MAXARG_LOOP(1, batchFirstElements[batchIdx + 1]);
if (tid == 0) {
if(tid == 0) {
binCosts[bestBinCostIdx] = sdata[0];
binIdxs[bestBinCostIdx] = indices[0];
}
@@ -248,40 +264,49 @@ __global__ void gMaxElementUpdate(float* binCosts, int* binIdxs, float* probs, i
}
}
__global__ void gGetValueByKey(float* d_in, float* d_out, int* indeces, int n)
{
int tid = threadIdx.x + blockDim.x * blockIdx.x;
if (tid < n) {
__global__ void gGetValueByKey(float* d_in, float* d_out, int* indeces, int n) {
int tid = threadIdx.x + blockDim.x * blockIdx.x;
if(tid < n) {
int index = indeces[tid];
d_out[tid] = d_in[index];
}
}
NthElement::NthElement(size_t maxBeamSize, size_t maxBatchSize/*, cudaStream_t stream*/)
NthElement::NthElement(size_t maxBeamSize,
size_t maxBatchSize /*, cudaStream_t stream*/)
: /*stream_(stream) ,*/
NUM_BLOCKS(std::min(500, int(maxBeamSize * 85000 / (2 * BLOCK_SIZE)) + int(maxBeamSize * 85000 % (2 * BLOCK_SIZE) != 0)))
{
//std::cerr << "NthElement::NthElement" << std::endl;
NUM_BLOCKS(
std::min(500,
int(maxBeamSize * 85000 / (2 * BLOCK_SIZE))
+ int(maxBeamSize * 85000 % (2 * BLOCK_SIZE) != 0))) {
// std::cerr << "NthElement::NthElement" << std::endl;
HANDLE_ERROR( cudaMalloc((void**)&d_ind, maxBatchSize * NUM_BLOCKS * sizeof(int)) );
HANDLE_ERROR(
cudaMalloc((void**)&d_ind, maxBatchSize * NUM_BLOCKS * sizeof(int)));
HANDLE_ERROR( cudaMalloc((void**)&d_out, maxBatchSize * NUM_BLOCKS * sizeof(float)) );
HANDLE_ERROR(
cudaMalloc((void**)&d_out, maxBatchSize * NUM_BLOCKS * sizeof(float)));
HANDLE_ERROR( cudaMalloc((void**)&d_res_idx, maxBatchSize * maxBeamSize * sizeof(int)) );
HANDLE_ERROR( cudaMalloc((void**)&d_res, maxBatchSize * maxBeamSize * sizeof(float)) );
HANDLE_ERROR(
cudaMalloc((void**)&d_res_idx, maxBatchSize * maxBeamSize * sizeof(int)));
HANDLE_ERROR(
cudaMalloc((void**)&d_res, maxBatchSize * maxBeamSize * sizeof(float)));
HANDLE_ERROR( cudaHostAlloc((void**) &h_res, maxBeamSize * maxBatchSize* sizeof(float),
cudaHostAllocDefault) );
HANDLE_ERROR( cudaHostAlloc((void**) &h_res_idx, maxBeamSize * maxBatchSize * sizeof(int),
cudaHostAllocDefault) );
HANDLE_ERROR(cudaHostAlloc((void**)&h_res,
maxBeamSize * maxBatchSize * sizeof(float),
cudaHostAllocDefault));
HANDLE_ERROR(cudaHostAlloc((void**)&h_res_idx,
maxBeamSize * maxBatchSize * sizeof(int),
cudaHostAllocDefault));
HANDLE_ERROR( cudaMalloc((void**)&d_breakdown, maxBeamSize * sizeof(float)) );
HANDLE_ERROR( cudaMalloc((void**)&d_batchPosition, (maxBatchSize + 1) * sizeof(int)) );
HANDLE_ERROR( cudaMalloc((void**)&d_cumBeamSizes, (maxBatchSize + 1) * sizeof(int)) );
HANDLE_ERROR(cudaMalloc((void**)&d_breakdown, maxBeamSize * sizeof(float)));
HANDLE_ERROR(
cudaMalloc((void**)&d_batchPosition, (maxBatchSize + 1) * sizeof(int)));
HANDLE_ERROR(
cudaMalloc((void**)&d_cumBeamSizes, (maxBatchSize + 1) * sizeof(int)));
}
NthElement::~NthElement()
{
NthElement::~NthElement() {
HANDLE_ERROR(cudaFree(d_ind));
HANDLE_ERROR(cudaFree(d_out));
HANDLE_ERROR(cudaFree(d_res_idx));
@@ -293,52 +318,76 @@ NthElement::~NthElement()
HANDLE_ERROR(cudaFree(d_cumBeamSizes));
}
void NthElement::getNBestList(float* probs, const std::vector<int>& batchFirstElementIdxs,
const std::vector<int>& cummulatedBeamSizes)
{
HANDLE_ERROR( cudaMemcpyAsync(d_batchPosition, batchFirstElementIdxs.data(), batchFirstElementIdxs.size() * sizeof(int),
cudaMemcpyHostToDevice, /* stream_ */ 0) );
HANDLE_ERROR( cudaMemcpyAsync(d_cumBeamSizes, cummulatedBeamSizes.data(), cummulatedBeamSizes.size() * sizeof(int),
cudaMemcpyHostToDevice, /* stream_ */ 0) );
void NthElement::getNBestList(float* probs,
const std::vector<int>& batchFirstElementIdxs,
const std::vector<int>& cummulatedBeamSizes) {
HANDLE_ERROR(cudaMemcpyAsync(d_batchPosition,
batchFirstElementIdxs.data(),
batchFirstElementIdxs.size() * sizeof(int),
cudaMemcpyHostToDevice,
/* stream_ */ 0));
HANDLE_ERROR(cudaMemcpyAsync(d_cumBeamSizes,
cummulatedBeamSizes.data(),
cummulatedBeamSizes.size() * sizeof(int),
cudaMemcpyHostToDevice,
/* stream_ */ 0));
const int numBatches = batchFirstElementIdxs.size() - 1;
gMaxElement<<<NUM_BLOCKS, BLOCK_SIZE, BLOCK_SIZE * sizeof(float), /* stream_ */ 0>>>
(d_out, d_ind, probs, numBatches, d_batchPosition);
gMaxElement<<<NUM_BLOCKS,
BLOCK_SIZE,
BLOCK_SIZE * sizeof(float),
/* stream_ */ 0>>>(
d_out, d_ind, probs, numBatches, d_batchPosition);
gMaxElementUpdate<<<numBatches, BLOCK_SIZE, BLOCK_SIZE * sizeof(float), /* stream_ */ 0>>>
(d_out, d_ind, probs, d_batchPosition, d_res, d_res_idx, d_cumBeamSizes, NUM_BLOCKS);
gMaxElementUpdate<<<numBatches,
BLOCK_SIZE,
BLOCK_SIZE * sizeof(float),
/* stream_ */ 0>>>(d_out,
d_ind,
probs,
d_batchPosition,
d_res,
d_res_idx,
d_cumBeamSizes,
NUM_BLOCKS);
}
void NthElement::getNBestList(const std::vector<size_t>& beamSizes, Tensor Probs,
std::vector<float>& outCosts, std::vector<unsigned>& outKeys,
const bool isFirst) {
void NthElement::getNBestList(const std::vector<size_t>& beamSizes,
Tensor Probs,
std::vector<float>& outCosts,
std::vector<unsigned>& outKeys,
const bool isFirst) {
std::vector<int> cummulatedBeamSizes(beamSizes.size() + 1, 0);
std::vector<int> batchFirstElementIdxs(beamSizes.size() + 1, 0);
const size_t vocabSize = Probs->shape()[1];
for (size_t i = 0; i < beamSizes.size(); ++i) {
for(size_t i = 0; i < beamSizes.size(); ++i) {
cummulatedBeamSizes[i + 1] = cummulatedBeamSizes[i] + beamSizes[i];
batchFirstElementIdxs[i + 1] += ((isFirst) ? (i + 1) : cummulatedBeamSizes[i + 1]) * vocabSize;
batchFirstElementIdxs[i + 1]
+= ((isFirst) ? (i + 1) : cummulatedBeamSizes[i + 1]) * vocabSize;
}
getNBestList(Probs->data(), batchFirstElementIdxs, cummulatedBeamSizes);
GetPairs(cummulatedBeamSizes.back(), outKeys, outCosts);
}
void NthElement::GetPairs(size_t number,
std::vector<unsigned>& outKeys,
std::vector<float>& outValues) {
HANDLE_ERROR( cudaMemcpyAsync(h_res, d_res, number * sizeof(float),
cudaMemcpyDeviceToHost, /* stream_ */ 0) );
HANDLE_ERROR( cudaMemcpyAsync(h_res_idx, d_res_idx, number * sizeof(int),
cudaMemcpyDeviceToHost, /* stream_ */ 0) );
std::vector<unsigned>& outKeys,
std::vector<float>& outValues) {
HANDLE_ERROR(cudaMemcpyAsync(h_res,
d_res,
number * sizeof(float),
cudaMemcpyDeviceToHost,
/* stream_ */ 0));
HANDLE_ERROR(cudaMemcpyAsync(h_res_idx,
d_res_idx,
number * sizeof(int),
cudaMemcpyDeviceToHost,
/* stream_ */ 0));
cudaStreamSynchronize(/* stream_ */ 0);
for (size_t i = 0; i < number; ++i) {
for(size_t i = 0; i < number; ++i) {
outKeys.push_back(h_res_idx[i]);
outValues.push_back(h_res[i]);
}
@@ -347,13 +396,14 @@ void NthElement::GetPairs(size_t number,
}
void NthElement::getValueByKey(std::vector<float>& out, float* d_in) {
gGetValueByKey<<<1, lastN, 0, /* stream_ */ 0>>>
(d_in, d_breakdown, h_res_idx, lastN);
gGetValueByKey<<<1, lastN, 0, /* stream_ */ 0>>>(
d_in, d_breakdown, h_res_idx, lastN);
HANDLE_ERROR( cudaMemcpyAsync(out.data(), d_breakdown, lastN * sizeof(float),
cudaMemcpyDeviceToHost, /* stream_ */ 0) );
HANDLE_ERROR( cudaStreamSynchronize(/* stream_ */ 0));
HANDLE_ERROR(cudaMemcpyAsync(out.data(),
d_breakdown,
lastN * sizeof(float),
cudaMemcpyDeviceToHost,
/* stream_ */ 0));
HANDLE_ERROR(cudaStreamSynchronize(/* stream_ */ 0));
}
}
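
A sketch of how NthElement is typically driven from beam search; the sizes, the probs tensor, and the variable names are assumptions for illustration:

// One instance is reused across decoding steps; device and pinned host
// buffers are sized once for the maxima passed to the constructor.
NthElement nth(/*maxBeamSize=*/12, /*maxBatchSize=*/64);

std::vector<size_t> beamSizes(batchSize, beamSize); // hypotheses kept per sentence
std::vector<float> outCosts;    // top-k scores, flattened over the batch
std::vector<unsigned> outKeys;  // flat indices; word id = key % vocabSize
// probs holds the expanded scores with vocabSize columns per hypothesis.
nth.getNBestList(beamSizes, probs, outCosts, outKeys, /*isFirst=*/true);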