mirror of
https://github.com/marian-nmt/marian.git
synced 2024-09-17 09:47:34 +03:00
Autoformat .cu files
This commit is contained in:
parent
d2c9e9fc74
commit
bcb3851fcc
@ -1,3 +1,3 @@
|
||||
#!/bin/bash
|
||||
|
||||
find ./src -path ./src/3rd_party -prune -o -iname *.h -o -iname *.cpp | xargs clang-format-3.8 -i
|
||||
find ./src -path ./src/3rd_party -prune -o -iname *.h -o -iname *.cpp -o -iname *.cu | xargs clang-format-3.8 -i
|
||||
|
@ -6,7 +6,7 @@
|
||||
int main(int argc, char** argv) {
|
||||
using namespace marian;
|
||||
|
||||
auto options = New<Config>(argc, argv);;
|
||||
auto options = New<Config>(argc, argv);
|
||||
auto devices = options->get<std::vector<size_t>>("devices");
|
||||
|
||||
if(devices.size() > 1)
|
||||
|
@ -8,34 +8,30 @@
|
||||
|
||||
#include "marian.h"
|
||||
|
||||
#include "examples/mnist/training.h"
|
||||
#include "examples/mnist/model.h"
|
||||
#include "examples/mnist/training.h"
|
||||
#include "training/graph_group.h"
|
||||
|
||||
|
||||
const std::vector<std::string> TRAIN_SET = {
|
||||
"../src/examples/mnist/train-images-idx3-ubyte",
|
||||
"../src/examples/mnist/train-labels-idx1-ubyte"
|
||||
};
|
||||
const std::vector<std::string> VALID_SET = {
|
||||
"../src/examples/mnist/t10k-images-idx3-ubyte",
|
||||
"../src/examples/mnist/t10k-labels-idx1-ubyte"
|
||||
};
|
||||
const std::vector<std::string> TRAIN_SET
|
||||
= {"../src/examples/mnist/train-images-idx3-ubyte",
|
||||
"../src/examples/mnist/train-labels-idx1-ubyte"};
|
||||
const std::vector<std::string> VALID_SET
|
||||
= {"../src/examples/mnist/t10k-images-idx3-ubyte",
|
||||
"../src/examples/mnist/t10k-labels-idx1-ubyte"};
|
||||
|
||||
using namespace marian;
|
||||
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
auto options = New<Config>(argc, argv, false);
|
||||
|
||||
if (!options->has("train-sets"))
|
||||
if(!options->has("train-sets"))
|
||||
options->set("train-sets", TRAIN_SET);
|
||||
if (!options->has("valid-sets"))
|
||||
if(!options->has("valid-sets"))
|
||||
options->set("valid-sets", VALID_SET);
|
||||
|
||||
auto devices = options->get<std::vector<size_t>>("devices");
|
||||
|
||||
if (devices.size() > 1)
|
||||
if(devices.size() > 1)
|
||||
New<TrainMNIST<AsyncGraphGroup<models::MNISTModel>>>(options)->run();
|
||||
else
|
||||
New<TrainMNIST<Singleton<models::MNISTModel>>>(options)->run();
|
||||
|
@ -1,32 +1,35 @@
|
||||
#include <sstream>
|
||||
#include "expression_graph.h"
|
||||
#include "backend_gpu.h"
|
||||
#include "expression_graph.h"
|
||||
#include "kernels/dropout.h"
|
||||
|
||||
namespace marian {
|
||||
|
||||
ExpressionGraph::ExpressionGraph(bool inference)
|
||||
: inferenceOnly_(inference),
|
||||
backend_(New<BackendGPU>()) {}
|
||||
ExpressionGraph::ExpressionGraph(bool inference)
|
||||
: inferenceOnly_(inference), backend_(New<BackendGPU>()) {}
|
||||
|
||||
void ExpressionGraph::setDevice(size_t device) {
|
||||
device_ = device;
|
||||
void ExpressionGraph::setDevice(size_t device) {
|
||||
device_ = device;
|
||||
|
||||
params_ = New<Parameters>();
|
||||
params_->init(device_);
|
||||
params_ = New<Parameters>();
|
||||
params_->init(device_);
|
||||
|
||||
tensors_ = New<TensorAllocator>(device);
|
||||
|
||||
std::static_pointer_cast<BackendGPU>(backend_)->setHandles(device, Config::seed);
|
||||
}
|
||||
|
||||
Expr ExpressionGraph::dropout(float prob, Shape shape) {
|
||||
auto dropoutInit = [prob, this](Tensor t) {
|
||||
Dropout(t, prob, std::static_pointer_cast<BackendGPU>(backend_)->getCurandGenerator());
|
||||
};
|
||||
tensors_ = New<TensorAllocator>(device);
|
||||
|
||||
return Expression<ConstantNode>(shared_from_this(),
|
||||
keywords::init=dropoutInit,
|
||||
keywords::shape=shape);
|
||||
}
|
||||
std::static_pointer_cast<BackendGPU>(backend_)->setHandles(device,
|
||||
Config::seed);
|
||||
}
|
||||
|
||||
Expr ExpressionGraph::dropout(float prob, Shape shape) {
|
||||
auto dropoutInit = [prob, this](Tensor t) {
|
||||
Dropout(
|
||||
t,
|
||||
prob,
|
||||
std::static_pointer_cast<BackendGPU>(backend_)->getCurandGenerator());
|
||||
};
|
||||
|
||||
return Expression<ConstantNode>(shared_from_this(),
|
||||
keywords::init = dropoutInit,
|
||||
keywords::shape = shape);
|
||||
}
|
||||
}
|
||||
|
@ -1,9 +1,9 @@
|
||||
#include "graph/expression_operators.h"
|
||||
#include "kernels/sparse.h"
|
||||
|
||||
#include "graph/node_operators_unary.h"
|
||||
#include "graph/node_operators_binary.h"
|
||||
#include "graph/node_operators.h"
|
||||
#include "graph/node_operators_binary.h"
|
||||
#include "graph/node_operators_unary.h"
|
||||
|
||||
namespace marian {
|
||||
|
||||
@ -85,15 +85,15 @@ Expr operator-(float a, Expr b) {
|
||||
}
|
||||
|
||||
Expr operator*(float a, Expr b) {
|
||||
return Expression<ScalarMultNodeOp>(b, a);
|
||||
return Expression<ScalarMultNodeOp>(b, a);
|
||||
}
|
||||
|
||||
Expr operator*(Expr a, float b) {
|
||||
return Expression<ScalarMultNodeOp>(a, b);
|
||||
return Expression<ScalarMultNodeOp>(a, b);
|
||||
}
|
||||
|
||||
Expr operator/(Expr a, float b) {
|
||||
return Expression<ScalarMultNodeOp>(a, 1.f / b);
|
||||
return Expression<ScalarMultNodeOp>(a, 1.f / b);
|
||||
}
|
||||
|
||||
/*********************************************************/
|
||||
@ -111,7 +111,6 @@ Expr flatten(Expr a) {
|
||||
return Expression<ReshapeNodeOp>(a, shape);
|
||||
}
|
||||
|
||||
|
||||
Expr sum(Expr a, keywords::axis_k ax) {
|
||||
return Expression<SumNodeOp>(a, ax);
|
||||
}
|
||||
@ -130,7 +129,6 @@ Expr weighted_average(Expr in, Expr weights, keywords::axis_k ax) {
|
||||
return p / s;
|
||||
}
|
||||
|
||||
|
||||
Expr dot(Expr a, Expr b) {
|
||||
return Expression<DotNodeOp>(a, b);
|
||||
}
|
||||
@ -187,7 +185,7 @@ Expr layer_norm(Expr x, Expr gamma, Expr beta) {
|
||||
return Expression<LayerNormalizationOp>(nodes);
|
||||
}
|
||||
|
||||
//Expr batch_norm(Expr x, Expr gamma, Expr beta) {
|
||||
// Expr batch_norm(Expr x, Expr gamma, Expr beta) {
|
||||
// auto mju = mean(x, keywords::axis=0);
|
||||
// auto xmmju = x - mju;
|
||||
// auto std = sqrt(mean(square(xmmju), keywords::axis=0), 1e-9);
|
||||
@ -205,5 +203,4 @@ Expr shift(Expr a, Shape shift) {
|
||||
Expr lexical_bias(Expr logits, Expr att, float eps, Ptr<sparse::CSR> lf) {
|
||||
return Expression<LexicalProbNodeOp>(logits, att, eps, lf);
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -1,6 +1,6 @@
|
||||
#include "graph/backend_gpu.h"
|
||||
#include "graph/expression_graph.h"
|
||||
#include "graph/node.h"
|
||||
#include "graph/backend_gpu.h"
|
||||
|
||||
namespace marian {
|
||||
|
||||
@ -48,5 +48,4 @@ void NaryNodeOp::remove_children_from_top_nodes() {
|
||||
for(auto child : children_)
|
||||
graph()->remove_top_node(child);
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -1,41 +1,40 @@
|
||||
#include "node_operators.h"
|
||||
#include "expression_graph.h"
|
||||
#include "node_operators.h"
|
||||
|
||||
namespace marian {
|
||||
|
||||
size_t ConstantNode::allocate() {
|
||||
// @TODO params
|
||||
size_t elements = 0;
|
||||
if(!val_) {
|
||||
graph()->tensor(val_, shape_);
|
||||
elements = val_->shape().elements();
|
||||
}
|
||||
return elements;
|
||||
size_t ConstantNode::allocate() {
|
||||
// @TODO params
|
||||
size_t elements = 0;
|
||||
if(!val_) {
|
||||
graph()->tensor(val_, shape_);
|
||||
elements = val_->shape().elements();
|
||||
}
|
||||
|
||||
void ConstantNode::init() {
|
||||
if(!initialized_) {
|
||||
init_(val_);
|
||||
initialized_ = true;
|
||||
}
|
||||
}
|
||||
|
||||
size_t ParamNode::allocate() {
|
||||
// @TODO params
|
||||
size_t elements = 0;
|
||||
if(!val_) {
|
||||
graph()->tensor(val_, shape_);
|
||||
elements = val_->shape().elements();
|
||||
}
|
||||
return elements;
|
||||
}
|
||||
|
||||
void ParamNode::init() {
|
||||
if(!initialized_) {
|
||||
//std::cerr << "Initializing parameter " << name() << std::endl;
|
||||
init_(val_);
|
||||
initialized_ = true;
|
||||
}
|
||||
}
|
||||
|
||||
return elements;
|
||||
}
|
||||
|
||||
void ConstantNode::init() {
|
||||
if(!initialized_) {
|
||||
init_(val_);
|
||||
initialized_ = true;
|
||||
}
|
||||
}
|
||||
|
||||
size_t ParamNode::allocate() {
|
||||
// @TODO params
|
||||
size_t elements = 0;
|
||||
if(!val_) {
|
||||
graph()->tensor(val_, shape_);
|
||||
elements = val_->shape().elements();
|
||||
}
|
||||
return elements;
|
||||
}
|
||||
|
||||
void ParamNode::init() {
|
||||
if(!initialized_) {
|
||||
// std::cerr << "Initializing parameter " << name() << std::endl;
|
||||
init_(val_);
|
||||
initialized_ = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -3,30 +3,34 @@
|
||||
|
||||
#include "kernels/dropout.h"
|
||||
|
||||
#define CUDA_CALL(x) \
|
||||
do { \
|
||||
if((x) != cudaSuccess) { \
|
||||
printf("Error at %s:%d\n", __FILE__, __LINE__); \
|
||||
exit(1); \
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
#define CUDA_CALL(x) do { if((x)!=cudaSuccess) { \
|
||||
printf("Error at %s:%d\n",__FILE__,__LINE__);\
|
||||
exit(1);}} while(0)
|
||||
|
||||
#define CURAND_CALL(x) do { if((x)!=CURAND_STATUS_SUCCESS) { \
|
||||
printf("Error at %s:%d\n",__FILE__,__LINE__);\
|
||||
exit(1);}} while(0)
|
||||
#define CURAND_CALL(x) \
|
||||
do { \
|
||||
if((x) != CURAND_STATUS_SUCCESS) { \
|
||||
printf("Error at %s:%d\n", __FILE__, __LINE__); \
|
||||
exit(1); \
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
namespace marian {
|
||||
|
||||
__global__
|
||||
void gScale(float* data, int n, float p) {
|
||||
__global__ void gScale(float* data, int n, float p) {
|
||||
int index = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
|
||||
while (index < n) {
|
||||
while(index < n) {
|
||||
data[index] = (data[index] < p) / p;
|
||||
index += gridDim.x * blockDim.x;
|
||||
}
|
||||
}
|
||||
|
||||
void Dropout(Tensor tensor, float p,
|
||||
curandGenerator_t gen) {
|
||||
|
||||
void Dropout(Tensor tensor, float p, curandGenerator_t gen) {
|
||||
int n = tensor->size();
|
||||
CURAND_CALL(curandGenerateUniform(gen, tensor->data(), n));
|
||||
|
||||
@ -35,5 +39,4 @@ void Dropout(Tensor tensor, float p,
|
||||
|
||||
gScale<<<numBlocks, numThreads>>>(tensor->data(), n, 1.f - p);
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -4,31 +4,56 @@
|
||||
#include "tensors/tensor.h"
|
||||
|
||||
namespace marian {
|
||||
|
||||
|
||||
namespace sparse {
|
||||
|
||||
void multiply(Ptr<CSR> C, const Ptr<CSR> A, const Ptr<CSR> B,
|
||||
bool transA, bool transB) {
|
||||
|
||||
void multiply(
|
||||
Ptr<CSR> C, const Ptr<CSR> A, const Ptr<CSR> B, bool transA, bool transB) {
|
||||
cudaSetDevice(C->getDevice());
|
||||
int nnzTotal;
|
||||
C->allocRowIndices(A->rows());
|
||||
CUSPARSE_CHECK(cusparseXcsrgemmNnz(A->handle(),
|
||||
transA ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE,
|
||||
transB ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE,
|
||||
A->rows(), B->cols(), A->cols(),
|
||||
A->description(), A->nnz(), A->rowIndices(), A->colIndices(),
|
||||
B->description(), B->nnz(), B->rowIndices(), B->colIndices(),
|
||||
C->description(), C->rowIndices(), &nnzTotal));
|
||||
|
||||
CUSPARSE_CHECK(cusparseXcsrgemmNnz(
|
||||
A->handle(),
|
||||
transA ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE,
|
||||
transB ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE,
|
||||
A->rows(),
|
||||
B->cols(),
|
||||
A->cols(),
|
||||
A->description(),
|
||||
A->nnz(),
|
||||
A->rowIndices(),
|
||||
A->colIndices(),
|
||||
B->description(),
|
||||
B->nnz(),
|
||||
B->rowIndices(),
|
||||
B->colIndices(),
|
||||
C->description(),
|
||||
C->rowIndices(),
|
||||
&nnzTotal));
|
||||
|
||||
C->allocValues(nnzTotal);
|
||||
C->allocColIndices(nnzTotal);
|
||||
CUSPARSE_CHECK(cusparseScsrgemm(A->handle(),
|
||||
transA ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE,
|
||||
transB ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE,
|
||||
A->rows(), B->cols(), A->cols(),
|
||||
A->description(), A->nnz(), A->values(), A->rowIndices(), A->colIndices(),
|
||||
B->description(), B->nnz(), B->values(), B->rowIndices(), B->colIndices(),
|
||||
C->description(), C->values(), C->rowIndices(), C->colIndices()));
|
||||
C->allocColIndices(nnzTotal);
|
||||
CUSPARSE_CHECK(cusparseScsrgemm(
|
||||
A->handle(),
|
||||
transA ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE,
|
||||
transB ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE,
|
||||
A->rows(),
|
||||
B->cols(),
|
||||
A->cols(),
|
||||
A->description(),
|
||||
A->nnz(),
|
||||
A->values(),
|
||||
A->rowIndices(),
|
||||
A->colIndices(),
|
||||
B->description(),
|
||||
B->nnz(),
|
||||
B->values(),
|
||||
B->rowIndices(),
|
||||
B->colIndices(),
|
||||
C->description(),
|
||||
C->values(),
|
||||
C->rowIndices(),
|
||||
C->colIndices()));
|
||||
}
|
||||
|
||||
//__global__ void gExpandAtt(float* out,
|
||||
@ -36,7 +61,7 @@ void multiply(Ptr<CSR> C, const Ptr<CSR> A, const Ptr<CSR> B,
|
||||
// int batch,
|
||||
// int srcWords,
|
||||
// int nonzeros) {
|
||||
//
|
||||
//
|
||||
// for(int bid = 0; bid < nonzeros; bid += blockDim.x * gridDim.x) {
|
||||
// int index = bid + blockDim.x * blockIdx.x + threadIdx.x;
|
||||
// if (index < nonzeros) {
|
||||
@ -48,26 +73,27 @@ void multiply(Ptr<CSR> C, const Ptr<CSR> A, const Ptr<CSR> B,
|
||||
//}
|
||||
//
|
||||
//
|
||||
//void ExpandAtt(Tensor out, Tensor in) {
|
||||
// void ExpandAtt(Tensor out, Tensor in) {
|
||||
// cudaSetDevice(in->getDevice());
|
||||
// int nonzeros = in->shape().elements();
|
||||
// int batch = in->shape()[0];
|
||||
// int srcWords = in->shape()[2];
|
||||
// int srcWords = in->shape()[2];
|
||||
//
|
||||
// int threads = std::min(MAX_THREADS, nonzeros);
|
||||
// int blocks = std::min(MAX_BLOCKS, nonzeros / threads + (nonzeros % threads != 0));
|
||||
// int blocks = std::min(MAX_BLOCKS, nonzeros / threads + (nonzeros % threads
|
||||
// != 0));
|
||||
//
|
||||
// gCollapseAtt<<<blocks, threads>>>(out->data(), in->data(), batch, srcWords, nonzeros);
|
||||
// gCollapseAtt<<<blocks, threads>>>(out->data(), in->data(), batch, srcWords,
|
||||
// nonzeros);
|
||||
//}
|
||||
|
||||
|
||||
void LfaForward(Tensor out, Tensor logits, Tensor att, Ptr<CSR> sparseLf) {
|
||||
cudaSetDevice(out->getDevice());
|
||||
|
||||
int batch = att->shape()[0];
|
||||
|
||||
int batch = att->shape()[0];
|
||||
int srcWords = att->shape()[2];
|
||||
int trgWords = att->shape()[3];
|
||||
|
||||
|
||||
std::vector<float> values;
|
||||
att->get(values);
|
||||
int nonzeros = values.size();
|
||||
@ -89,25 +115,26 @@ void LfaForward(Tensor out, Tensor logits, Tensor att, Ptr<CSR> sparseLf) {
|
||||
colInd[i] = std::get<1>(coo[i]);
|
||||
values[i] = std::get<2>(coo[i]);
|
||||
}
|
||||
|
||||
auto sparseAtt = New<CSR>(batch * trgWords, batch * srcWords,
|
||||
values, rowInd, colInd, out->getDevice());
|
||||
|
||||
auto sparseLfa = New<CSR>(sparseAtt->rows(), sparseLf->cols(), out->getDevice());
|
||||
|
||||
auto sparseAtt = New<CSR>(batch * trgWords,
|
||||
batch * srcWords,
|
||||
values,
|
||||
rowInd,
|
||||
colInd,
|
||||
out->getDevice());
|
||||
|
||||
auto sparseLfa
|
||||
= New<CSR>(sparseAtt->rows(), sparseLf->cols(), out->getDevice());
|
||||
multiply(sparseLfa, sparseAtt, sparseLf);
|
||||
|
||||
|
||||
sparseLfa->toTensor(out);
|
||||
}
|
||||
|
||||
__global__ void gCollapseAtt(float* out,
|
||||
const float* in,
|
||||
int batch,
|
||||
int srcWords,
|
||||
int nonzeros) {
|
||||
|
||||
__global__ void gCollapseAtt(
|
||||
float* out, const float* in, int batch, int srcWords, int nonzeros) {
|
||||
for(int bid = 0; bid < nonzeros; bid += blockDim.x * gridDim.x) {
|
||||
int index = bid + blockDim.x * blockIdx.x + threadIdx.x;
|
||||
if (index < nonzeros) {
|
||||
if(index < nonzeros) {
|
||||
int r = (index % batch) + (index / (srcWords * batch)) * batch;
|
||||
int c = index % (srcWords * batch);
|
||||
float val = in[r * srcWords * batch + c];
|
||||
@ -116,47 +143,57 @@ __global__ void gCollapseAtt(float* out,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void CollapseAtt(Tensor out, Tensor in) {
|
||||
cudaSetDevice(out->getDevice());
|
||||
int nonzeros = out->shape().elements();
|
||||
int batch = out->shape()[0];
|
||||
int srcWords = out->shape()[2];
|
||||
int srcWords = out->shape()[2];
|
||||
|
||||
int threads = std::min(MAX_THREADS, nonzeros);
|
||||
int blocks = std::min(MAX_BLOCKS, nonzeros / threads + (nonzeros % threads != 0));
|
||||
int blocks
|
||||
= std::min(MAX_BLOCKS, nonzeros / threads + (nonzeros % threads != 0));
|
||||
|
||||
gCollapseAtt<<<blocks, threads>>>(out->data(), in->data(), batch, srcWords, nonzeros);
|
||||
gCollapseAtt<<<blocks, threads>>>(
|
||||
out->data(), in->data(), batch, srcWords, nonzeros);
|
||||
}
|
||||
|
||||
|
||||
void LfaBackward(Tensor gradAtt, Tensor adj, Ptr<CSR> sparseLf) {
|
||||
cudaSetDevice(adj->getDevice());
|
||||
|
||||
int batch = gradAtt->shape()[0];
|
||||
|
||||
int batch = gradAtt->shape()[0];
|
||||
int srcWords = gradAtt->shape()[2];
|
||||
int trgWords = gradAtt->shape()[3];
|
||||
int nonzeros = gradAtt->shape().elements();
|
||||
|
||||
|
||||
int dimTrgVoc = adj->shape()[1];
|
||||
|
||||
|
||||
float* expandAttGradBuffer;
|
||||
CUDA_CHECK(cudaMalloc(&expandAttGradBuffer, sizeof(float) * batch * srcWords * batch * trgWords));
|
||||
|
||||
CUDA_CHECK(cudaMalloc(&expandAttGradBuffer,
|
||||
sizeof(float) * batch * srcWords * batch * trgWords));
|
||||
|
||||
float alpha = 1, beta = 0;
|
||||
CUSPARSE_CHECK(cusparseScsrmm2(sparseLf->handle(),
|
||||
CUSPARSE_OPERATION_NON_TRANSPOSE,
|
||||
CUSPARSE_OPERATION_NON_TRANSPOSE,
|
||||
sparseLf->rows(), batch * trgWords, sparseLf->cols(), sparseLf->nnz(), &alpha,
|
||||
sparseLf->description(), sparseLf->values(), sparseLf->rowIndices(), sparseLf->colIndices(),
|
||||
adj->data(), dimTrgVoc, &beta, expandAttGradBuffer, batch * srcWords));
|
||||
|
||||
Tensor expandAttGrad(new TensorBase(expandAttGradBuffer,
|
||||
{batch * trgWords, batch * srcWords}, 0));
|
||||
CUSPARSE_OPERATION_NON_TRANSPOSE,
|
||||
CUSPARSE_OPERATION_NON_TRANSPOSE,
|
||||
sparseLf->rows(),
|
||||
batch * trgWords,
|
||||
sparseLf->cols(),
|
||||
sparseLf->nnz(),
|
||||
&alpha,
|
||||
sparseLf->description(),
|
||||
sparseLf->values(),
|
||||
sparseLf->rowIndices(),
|
||||
sparseLf->colIndices(),
|
||||
adj->data(),
|
||||
dimTrgVoc,
|
||||
&beta,
|
||||
expandAttGradBuffer,
|
||||
batch * srcWords));
|
||||
|
||||
Tensor expandAttGrad(new TensorBase(
|
||||
expandAttGradBuffer, {batch * trgWords, batch * srcWords}, 0));
|
||||
CollapseAtt(gradAtt, expandAttGrad);
|
||||
CUDA_CHECK(cudaFree(expandAttGradBuffer));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -6,56 +6,48 @@
|
||||
namespace marian {
|
||||
|
||||
struct AttentionNodeOp : public NaryNodeOp {
|
||||
|
||||
AttentionNodeOp(const std::vector<Expr>& nodes)
|
||||
: NaryNodeOp(nodes,
|
||||
keywords::shape=newShape(nodes)) {}
|
||||
: NaryNodeOp(nodes, keywords::shape = newShape(nodes)) {}
|
||||
|
||||
Shape newShape(const std::vector<Expr>& nodes) {
|
||||
Shape shape = nodes[1]->shape();
|
||||
|
||||
Shape vaShape = nodes[0]->shape();
|
||||
Shape vaShape = nodes[0]->shape();
|
||||
Shape ctxShape = nodes[1]->shape();
|
||||
Shape stateShape = nodes[2]->shape();
|
||||
|
||||
for(int i = 0; i < stateShape.size(); ++i) {
|
||||
UTIL_THROW_IF2(ctxShape[i] != stateShape[i] && ctxShape[i] != 1 && stateShape[i] != 1,
|
||||
UTIL_THROW_IF2(ctxShape[i] != stateShape[i] && ctxShape[i] != 1
|
||||
&& stateShape[i] != 1,
|
||||
"Shapes cannot be broadcasted");
|
||||
shape.set(i, std::max(ctxShape[i], stateShape[i]));
|
||||
}
|
||||
|
||||
UTIL_THROW_IF2(vaShape[0] != shape[1] || vaShape[1] != 1,
|
||||
"Wrong size");
|
||||
UTIL_THROW_IF2(vaShape[0] != shape[1] || vaShape[1] != 1, "Wrong size");
|
||||
|
||||
shape.set(1, 1);
|
||||
return shape;
|
||||
}
|
||||
|
||||
NodeOps forwardOps() {
|
||||
return {
|
||||
NodeOp(Att(val_,
|
||||
child(0)->val(),
|
||||
child(1)->val(),
|
||||
child(2)->val(),
|
||||
children_.size() == 4 ? child(3)->val() : nullptr))
|
||||
};
|
||||
return {NodeOp(Att(val_,
|
||||
child(0)->val(),
|
||||
child(1)->val(),
|
||||
child(2)->val(),
|
||||
children_.size() == 4 ? child(3)->val() : nullptr))};
|
||||
}
|
||||
|
||||
NodeOps backwardOps() {
|
||||
return {
|
||||
NodeOp(
|
||||
AttBack(
|
||||
child(0)->grad(),
|
||||
child(1)->grad(),
|
||||
child(2)->grad(),
|
||||
children_.size() == 4 ? child(3)->grad() : nullptr,
|
||||
child(0)->val(),
|
||||
child(1)->val(),
|
||||
child(2)->val(),
|
||||
children_.size() == 4 ? child(3)->val() : nullptr,
|
||||
adj_
|
||||
);
|
||||
)
|
||||
NodeOp(AttBack(child(0)->grad(),
|
||||
child(1)->grad(),
|
||||
child(2)->grad(),
|
||||
children_.size() == 4 ? child(3)->grad() : nullptr,
|
||||
child(0)->val(),
|
||||
child(1)->val(),
|
||||
child(2)->val(),
|
||||
children_.size() == 4 ? child(3)->val() : nullptr,
|
||||
adj_);)
|
||||
};
|
||||
}
|
||||
|
||||
@ -65,13 +57,9 @@ struct AttentionNodeOp : public NaryNodeOp {
|
||||
op();
|
||||
}
|
||||
|
||||
const std::string type() {
|
||||
return "Att-ops";
|
||||
}
|
||||
const std::string type() { return "Att-ops"; }
|
||||
|
||||
const std::string color() {
|
||||
return "yellow";
|
||||
}
|
||||
const std::string color() { return "yellow"; }
|
||||
};
|
||||
|
||||
Expr attOps(Expr va, Expr context, Expr state, Expr coverage) {
|
||||
@ -81,9 +69,8 @@ Expr attOps(Expr va, Expr context, Expr state, Expr coverage) {
|
||||
|
||||
int dimBatch = context->shape()[0];
|
||||
int dimWords = context->shape()[2];
|
||||
int dimBeam = state->shape()[3];
|
||||
int dimBeam = state->shape()[3];
|
||||
return reshape(Expression<AttentionNodeOp>(nodes),
|
||||
{dimWords, dimBatch, 1, dimBeam});
|
||||
}
|
||||
|
||||
}
|
@ -8,20 +8,16 @@ namespace marian {
|
||||
struct GRUFastNodeOp : public NaryNodeOp {
|
||||
bool final_;
|
||||
|
||||
template <typename ...Args>
|
||||
GRUFastNodeOp(const std::vector<Expr>& nodes, bool final, Args ...args)
|
||||
: NaryNodeOp(nodes,
|
||||
args...),
|
||||
final_(final) {}
|
||||
template <typename... Args>
|
||||
GRUFastNodeOp(const std::vector<Expr>& nodes, bool final, Args... args)
|
||||
: NaryNodeOp(nodes, args...), final_(final) {}
|
||||
|
||||
NodeOps forwardOps() {
|
||||
std::vector<Tensor> inputs;
|
||||
for(int i = 0; i < children_.size(); ++i)
|
||||
inputs.push_back(child(i)->val());
|
||||
|
||||
return {
|
||||
NodeOp(GRUFastForward(val_, inputs, final_))
|
||||
};
|
||||
return {NodeOp(GRUFastForward(val_, inputs, final_))};
|
||||
}
|
||||
|
||||
NodeOps backwardOps() {
|
||||
@ -35,9 +31,7 @@ struct GRUFastNodeOp : public NaryNodeOp {
|
||||
outputs.push_back(nullptr);
|
||||
}
|
||||
|
||||
return {
|
||||
NodeOp(GRUFastBackward(outputs, inputs, adj_, final_))
|
||||
};
|
||||
return {NodeOp(GRUFastBackward(outputs, inputs, adj_, final_))};
|
||||
}
|
||||
|
||||
// do not check if node is trainable
|
||||
@ -46,17 +40,12 @@ struct GRUFastNodeOp : public NaryNodeOp {
|
||||
op();
|
||||
}
|
||||
|
||||
const std::string type() {
|
||||
return "GRU-ops";
|
||||
}
|
||||
const std::string type() { return "GRU-ops"; }
|
||||
|
||||
const std::string color() {
|
||||
return "yellow";
|
||||
}
|
||||
const std::string color() { return "yellow"; }
|
||||
};
|
||||
|
||||
Expr gruOps(const std::vector<Expr>& nodes, bool final) {
|
||||
return Expression<GRUFastNodeOp>(nodes, final);
|
||||
}
|
||||
|
||||
}
|
@ -5,17 +5,16 @@ namespace marian {
|
||||
|
||||
thread_local Ptr<sparse::CSR> LexProbs::lexProbs_;
|
||||
thread_local Ptr<sparse::CSR> LexProbs::lf_;
|
||||
|
||||
|
||||
Expr LexicalBias::operator()(Expr logits) {
|
||||
auto& alignmentsVec = attention_->getAlignments();
|
||||
|
||||
|
||||
Expr aln;
|
||||
if(single_)
|
||||
aln = alignmentsVec.back();
|
||||
else
|
||||
aln = concatenate(alignmentsVec, keywords::axis=3);
|
||||
|
||||
aln = concatenate(alignmentsVec, keywords::axis = 3);
|
||||
|
||||
return lexical_bias(logits, aln, eps_, sentLexProbs_);
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -1,16 +1,16 @@
|
||||
#include "clippers.h"
|
||||
|
||||
#include "kernels/thrust_functions.h"
|
||||
#include "kernels/tensor_operators.h"
|
||||
#include "kernels/thrust_functions.h"
|
||||
|
||||
namespace marian {
|
||||
void Elementwise::clip(Tensor t) {
|
||||
Element(_1 = Clip(_1, c_), t);
|
||||
}
|
||||
void Elementwise::clip(Tensor t) {
|
||||
Element(_1 = Clip(_1, c_), t);
|
||||
}
|
||||
|
||||
void Norm::clip(Tensor t) {
|
||||
float l2Norm = L2Norm(t);
|
||||
if(l2Norm >= c_)
|
||||
Element(_1 = (c_ / l2Norm) * _1, t);
|
||||
}
|
||||
void Norm::clip(Tensor t) {
|
||||
float l2Norm = L2Norm(t);
|
||||
if(l2Norm >= c_)
|
||||
Element(_1 = (c_ / l2Norm) * _1, t);
|
||||
}
|
||||
}
|
@ -1,83 +1,77 @@
|
||||
#include "optimizers.h"
|
||||
|
||||
#include "kernels/thrust_functions.h"
|
||||
#include "kernels/tensor_operators.h"
|
||||
#include "kernels/thrust_functions.h"
|
||||
|
||||
namespace marian {
|
||||
void Sgd::updateImpl(Tensor params, Tensor grads) {
|
||||
Element(_1 -= eta_ * _2, params, grads);
|
||||
void Sgd::updateImpl(Tensor params, Tensor grads) {
|
||||
Element(_1 -= eta_ * _2, params, grads);
|
||||
}
|
||||
|
||||
void Adagrad::updateImpl(Tensor params, Tensor grads) {
|
||||
if(!alloc_)
|
||||
alloc_ = New<TensorAllocator>(params->getDevice());
|
||||
|
||||
if(!gt_) {
|
||||
int totalSize = params->size();
|
||||
alloc_->reserveExact(totalSize);
|
||||
alloc_->allocate(gt_, {1, totalSize});
|
||||
gt_->set(0);
|
||||
}
|
||||
|
||||
void Adagrad::updateImpl(Tensor params, Tensor grads) {
|
||||
if(!alloc_)
|
||||
alloc_ = New<TensorAllocator>(params->getDevice());
|
||||
|
||||
if(!gt_) {
|
||||
int totalSize = params->size();
|
||||
alloc_->reserveExact(totalSize);
|
||||
alloc_->allocate(gt_, {1, totalSize});
|
||||
gt_->set(0);
|
||||
}
|
||||
Element(_1 += (_2 * _2), gt_, grads);
|
||||
|
||||
Element(_1 += (_2 * _2),
|
||||
gt_, grads);
|
||||
Element(_1 -= (eta_ / (Sqrt(_2) + eps_)) * _3, params, gt_, grads);
|
||||
}
|
||||
|
||||
Element(_1 -= (eta_ / (Sqrt(_2) + eps_)) * _3,
|
||||
params, gt_, grads);
|
||||
void Adam::updateImpl(Tensor params, Tensor grads) {
|
||||
if(!mtAlloc_)
|
||||
mtAlloc_ = New<TensorAllocator>(params->getDevice());
|
||||
if(!vtAlloc_)
|
||||
vtAlloc_ = New<TensorAllocator>(params->getDevice());
|
||||
|
||||
if(!mt_) {
|
||||
int totalSize = params->size();
|
||||
mtAlloc_->reserveExact(totalSize);
|
||||
mtAlloc_->allocate(mt_, {1, totalSize});
|
||||
mt_->set(0);
|
||||
|
||||
vtAlloc_->reserveExact(totalSize);
|
||||
vtAlloc_->allocate(vt_, {1, totalSize});
|
||||
vt_->set(0);
|
||||
}
|
||||
|
||||
void Adam::updateImpl(Tensor params, Tensor grads) {
|
||||
if(!mtAlloc_)
|
||||
mtAlloc_ = New<TensorAllocator>(params->getDevice());
|
||||
if(!vtAlloc_)
|
||||
vtAlloc_ = New<TensorAllocator>(params->getDevice());
|
||||
|
||||
if(!mt_) {
|
||||
int totalSize = params->size();
|
||||
mtAlloc_->reserveExact(totalSize);
|
||||
mtAlloc_->allocate(mt_, {1, totalSize});
|
||||
mt_->set(0);
|
||||
t_++;
|
||||
float denom1 = 1 - std::pow(beta1_, t_);
|
||||
float denom2 = 1 - std::pow(beta2_, t_);
|
||||
|
||||
vtAlloc_->reserveExact(totalSize);
|
||||
vtAlloc_->allocate(vt_, {1, totalSize});
|
||||
vt_->set(0);
|
||||
}
|
||||
Element(_1 = (beta1_ * _1) + ((1 - beta1_) * _2), mt_, grads);
|
||||
Element(_1 = (beta2_ * _1) + ((1 - beta2_) * (_2 * _2)), vt_, grads);
|
||||
|
||||
t_++;
|
||||
float denom1 = 1 - std::pow(beta1_, t_);
|
||||
float denom2 = 1 - std::pow(beta2_, t_);
|
||||
Element(_1 -= eta_ * (_2 / denom1) / (Sqrt(_3 / denom2) + eps_),
|
||||
params,
|
||||
mt_,
|
||||
vt_);
|
||||
}
|
||||
|
||||
Element(_1 = (beta1_ * _1) + ((1 - beta1_) * _2),
|
||||
mt_, grads);
|
||||
Element(_1 = (beta2_ * _1) + ((1 - beta2_) * (_2 * _2)),
|
||||
vt_, grads);
|
||||
Ptr<OptimizerBase> Optimizer(Ptr<Config> options) {
|
||||
Ptr<ClipperBase> clipper = nullptr;
|
||||
float clipNorm = options->get<double>("clip-norm");
|
||||
if(clipNorm > 0)
|
||||
clipper = Clipper<Norm>(clipNorm);
|
||||
|
||||
Element(_1 -= eta_ * (_2 / denom1) / (Sqrt(_3 / denom2) + eps_),
|
||||
params, mt_, vt_);
|
||||
float lrate = options->get<double>("learn-rate");
|
||||
|
||||
std::string opt = options->get<std::string>("optimizer");
|
||||
|
||||
if(opt == "sgd") {
|
||||
return Optimizer<Sgd>(lrate, keywords::clip = clipper);
|
||||
} else if(opt == "adagrad") {
|
||||
return Optimizer<Adagrad>(lrate, keywords::clip = clipper);
|
||||
} else if(opt == "adam") {
|
||||
return Optimizer<Adam>(lrate, keywords::clip = clipper);
|
||||
} else {
|
||||
UTIL_THROW2("Unknown optimizer: " << opt);
|
||||
}
|
||||
|
||||
Ptr<OptimizerBase> Optimizer(Ptr<Config> options) {
|
||||
Ptr<ClipperBase> clipper = nullptr;
|
||||
float clipNorm = options->get<double>("clip-norm");
|
||||
if(clipNorm > 0)
|
||||
clipper = Clipper<Norm>(clipNorm);
|
||||
|
||||
float lrate = options->get<double>("learn-rate");
|
||||
|
||||
std::string opt = options->get<std::string>("optimizer");
|
||||
|
||||
if(opt == "sgd") {
|
||||
return Optimizer<Sgd>(lrate, keywords::clip=clipper);
|
||||
}
|
||||
else if(opt == "adagrad") {
|
||||
return Optimizer<Adagrad>(lrate, keywords::clip=clipper);
|
||||
}
|
||||
else if(opt == "adam") {
|
||||
return Optimizer<Adam>(lrate, keywords::clip=clipper);
|
||||
}
|
||||
else {
|
||||
UTIL_THROW2("Unknown optimizer: " << opt);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
@ -3,42 +3,42 @@
|
||||
#include <thrust/device_ptr.h>
|
||||
#include <thrust/device_vector.h>
|
||||
|
||||
#include "tensors/tensor.h"
|
||||
#include "kernels/tensor_operators.h"
|
||||
#include "kernels/cuda_helpers.h"
|
||||
#include "kernels/tensor_operators.h"
|
||||
#include "tensors/tensor.h"
|
||||
|
||||
namespace marian {
|
||||
|
||||
__global__ void gFill(float* d_in, int size, float val) {
|
||||
__global__ void gFill(float *d_in, int size, float val) {
|
||||
for(int bid = 0; bid < size; bid += blockDim.x * gridDim.x) {
|
||||
int index = bid + threadIdx.x + blockDim.x * blockIdx.x;
|
||||
if (index < size) {
|
||||
if(index < size) {
|
||||
d_in[index] = val;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
float TensorBase::get(size_t i) {
|
||||
cudaSetDevice(device_);
|
||||
float temp;
|
||||
CUDA_CHECK(cudaMemcpy(&temp, data_ + i, sizeof(float),
|
||||
cudaMemcpyDeviceToHost));
|
||||
cudaStreamSynchronize(0);
|
||||
return temp;
|
||||
}
|
||||
cudaSetDevice(device_);
|
||||
float temp;
|
||||
CUDA_CHECK(
|
||||
cudaMemcpy(&temp, data_ + i, sizeof(float), cudaMemcpyDeviceToHost));
|
||||
cudaStreamSynchronize(0);
|
||||
return temp;
|
||||
}
|
||||
|
||||
void TensorBase::set(size_t i, float value) {
|
||||
cudaSetDevice(device_);
|
||||
CUDA_CHECK(cudaMemcpy(data_ + i, &value, sizeof(float),
|
||||
cudaMemcpyHostToDevice));
|
||||
CUDA_CHECK(
|
||||
cudaMemcpy(data_ + i, &value, sizeof(float), cudaMemcpyHostToDevice));
|
||||
cudaStreamSynchronize(0);
|
||||
}
|
||||
|
||||
void TensorBase::get(std::vector<float> &v) {
|
||||
CUDA_CHECK(cudaSetDevice(device_));
|
||||
v.resize(size());
|
||||
CUDA_CHECK(cudaMemcpy(v.data(), data_, size() * sizeof(float),
|
||||
cudaMemcpyDeviceToHost));
|
||||
CUDA_CHECK(cudaMemcpy(
|
||||
v.data(), data_, size() * sizeof(float), cudaMemcpyDeviceToHost));
|
||||
cudaStreamSynchronize(0);
|
||||
}
|
||||
|
||||
@ -52,8 +52,8 @@ void TensorBase::set(float value) {
|
||||
|
||||
void TensorBase::set(const std::vector<float> &v) {
|
||||
CUDA_CHECK(cudaSetDevice(device_));
|
||||
CUDA_CHECK(cudaMemcpy(data_, v.data(), v.size() * sizeof(float),
|
||||
cudaMemcpyHostToDevice));
|
||||
CUDA_CHECK(cudaMemcpy(
|
||||
data_, v.data(), v.size() * sizeof(float), cudaMemcpyHostToDevice));
|
||||
cudaStreamSynchronize(0);
|
||||
}
|
||||
|
||||
@ -64,12 +64,11 @@ void TensorBase::setSparse(const std::vector<size_t> &k,
|
||||
cudaStreamSynchronize(0);
|
||||
}
|
||||
|
||||
|
||||
void TensorBase::copyFrom(Tensor in) {
|
||||
cudaSetDevice(device_);
|
||||
CUDA_CHECK(cudaMemcpy(data_ , in->data() , in->size() * sizeof(float),
|
||||
cudaMemcpyDefault));
|
||||
cudaStreamSynchronize(0);
|
||||
cudaSetDevice(device_);
|
||||
CUDA_CHECK(cudaMemcpy(
|
||||
data_, in->data(), in->size() * sizeof(float), cudaMemcpyDefault));
|
||||
cudaStreamSynchronize(0);
|
||||
}
|
||||
|
||||
std::string TensorBase::debug() {
|
||||
@ -78,9 +77,9 @@ std::string TensorBase::debug() {
|
||||
assert(shape_.size());
|
||||
strm << "shape=" << shape_[0];
|
||||
for(int i = 1; i < shape_.size(); ++i)
|
||||
strm << "x" << shape_[i];
|
||||
strm << " size=" << shape_.elements()
|
||||
<< " (" << shape_.elements() * sizeof(float) << "B)";
|
||||
strm << "x" << shape_[i];
|
||||
strm << " size=" << shape_.elements() << " ("
|
||||
<< shape_.elements() * sizeof(float) << "B)";
|
||||
strm << " device=" << device_ << std::endl;
|
||||
|
||||
// values
|
||||
@ -88,78 +87,76 @@ std::string TensorBase::debug() {
|
||||
std::vector<float> values(totSize);
|
||||
get(values);
|
||||
|
||||
|
||||
size_t dispCols = 5;
|
||||
strm << std::fixed << std::setprecision(8) << std::setfill(' ');
|
||||
for(size_t l = 0; l < shape()[3]; ++l) {
|
||||
for(size_t k = 0; k < shape()[2]; ++k) {
|
||||
strm << "[ ";
|
||||
if(shape()[0] > 10) {
|
||||
for (size_t i = 0; i < shape()[0] && i < dispCols; ++i) {
|
||||
if(i > 0)
|
||||
strm << std::endl << " ";
|
||||
for (size_t j = 0; j < shape()[1] && j < dispCols; ++j) {
|
||||
strm << std::setw(12)
|
||||
<< values[ i * shape().stride(0)
|
||||
+ j * shape().stride(1)
|
||||
+ k * shape().stride(2)
|
||||
+ l * shape().stride(3) ] << " ";
|
||||
}
|
||||
if(shape()[1] > dispCols)
|
||||
strm << "... ";
|
||||
for (size_t j = shape()[1] - dispCols; j < shape()[1]; ++j) {
|
||||
strm << std::setw(12)
|
||||
<< values[ i * shape().stride(0)
|
||||
+ j * shape().stride(1)
|
||||
+ k * shape().stride(2)
|
||||
+ l * shape().stride(3) ] << " ";
|
||||
}
|
||||
strm << "[ ";
|
||||
if(shape()[0] > 10) {
|
||||
for(size_t i = 0; i < shape()[0] && i < dispCols; ++i) {
|
||||
if(i > 0)
|
||||
strm << std::endl << " ";
|
||||
for(size_t j = 0; j < shape()[1] && j < dispCols; ++j) {
|
||||
strm << std::setw(12)
|
||||
<< values[i * shape().stride(0) + j * shape().stride(1)
|
||||
+ k * shape().stride(2)
|
||||
+ l * shape().stride(3)]
|
||||
<< " ";
|
||||
}
|
||||
strm << std::endl << " ...";
|
||||
for (size_t i = shape()[0] - dispCols; i < shape()[0]; ++i) {
|
||||
if(i > 0)
|
||||
strm << std::endl << " ";
|
||||
for (size_t j = 0; j < shape()[1] && j < dispCols; ++j) {
|
||||
strm << std::setw(12)
|
||||
<< values[ i * shape().stride(0)
|
||||
+ j * shape().stride(1)
|
||||
+ k * shape().stride(2)
|
||||
+ l * shape().stride(3) ] << " ";
|
||||
}
|
||||
if(shape()[1] > dispCols)
|
||||
strm << "... ";
|
||||
for (size_t j = shape()[1] - dispCols; j < shape()[1]; ++j) {
|
||||
strm << std::setw(12)
|
||||
<< values[ i * shape().stride(0)
|
||||
+ j * shape().stride(1)
|
||||
+ k * shape().stride(2)
|
||||
+ l * shape().stride(3) ] << " ";
|
||||
}
|
||||
if(shape()[1] > dispCols)
|
||||
strm << "... ";
|
||||
for(size_t j = shape()[1] - dispCols; j < shape()[1]; ++j) {
|
||||
strm << std::setw(12)
|
||||
<< values[i * shape().stride(0) + j * shape().stride(1)
|
||||
+ k * shape().stride(2)
|
||||
+ l * shape().stride(3)]
|
||||
<< " ";
|
||||
}
|
||||
}
|
||||
else {
|
||||
for (size_t i = 0; i < shape()[0] && i < 10; ++i) {
|
||||
if(i > 0)
|
||||
strm << std::endl << " ";
|
||||
for (size_t j = 0; j < shape()[1] && j < dispCols; ++j) {
|
||||
strm << std::setw(12)
|
||||
<< values[ i * shape().stride(0)
|
||||
+ j * shape().stride(1)
|
||||
+ k * shape().stride(2)
|
||||
+ l * shape().stride(3) ] << " ";
|
||||
}
|
||||
if(shape()[1] > dispCols)
|
||||
strm << "... ";
|
||||
for (size_t j = shape()[1] - dispCols; j < shape()[1]; ++j) {
|
||||
strm << std::setw(12)
|
||||
<< values[ i * shape().stride(0)
|
||||
+ j * shape().stride(1)
|
||||
+ k * shape().stride(2)
|
||||
+ l * shape().stride(3) ] << " ";
|
||||
}
|
||||
}
|
||||
strm << std::endl << " ...";
|
||||
for(size_t i = shape()[0] - dispCols; i < shape()[0]; ++i) {
|
||||
if(i > 0)
|
||||
strm << std::endl << " ";
|
||||
for(size_t j = 0; j < shape()[1] && j < dispCols; ++j) {
|
||||
strm << std::setw(12)
|
||||
<< values[i * shape().stride(0) + j * shape().stride(1)
|
||||
+ k * shape().stride(2)
|
||||
+ l * shape().stride(3)]
|
||||
<< " ";
|
||||
}
|
||||
}
|
||||
strm << "]" << std::endl;
|
||||
if(shape()[1] > dispCols)
|
||||
strm << "... ";
|
||||
for(size_t j = shape()[1] - dispCols; j < shape()[1]; ++j) {
|
||||
strm << std::setw(12)
|
||||
<< values[i * shape().stride(0) + j * shape().stride(1)
|
||||
+ k * shape().stride(2)
|
||||
+ l * shape().stride(3)]
|
||||
<< " ";
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for(size_t i = 0; i < shape()[0] && i < 10; ++i) {
|
||||
if(i > 0)
|
||||
strm << std::endl << " ";
|
||||
for(size_t j = 0; j < shape()[1] && j < dispCols; ++j) {
|
||||
strm << std::setw(12)
|
||||
<< values[i * shape().stride(0) + j * shape().stride(1)
|
||||
+ k * shape().stride(2)
|
||||
+ l * shape().stride(3)]
|
||||
<< " ";
|
||||
}
|
||||
if(shape()[1] > dispCols)
|
||||
strm << "... ";
|
||||
for(size_t j = shape()[1] - dispCols; j < shape()[1]; ++j) {
|
||||
strm << std::setw(12)
|
||||
<< values[i * shape().stride(0) + j * shape().stride(1)
|
||||
+ k * shape().stride(2)
|
||||
+ l * shape().stride(3)]
|
||||
<< " ";
|
||||
}
|
||||
}
|
||||
}
|
||||
strm << "]" << std::endl;
|
||||
}
|
||||
}
|
||||
return strm.str();
|
||||
@ -174,36 +171,34 @@ DeviceGPU::~DeviceGPU() {
|
||||
}
|
||||
|
||||
void DeviceGPU::reserve(size_t size) {
|
||||
cudaSetDevice(device_);
|
||||
cudaSetDevice(device_);
|
||||
|
||||
UTIL_THROW_IF2(size < size_, "New size must be larger than old size");
|
||||
UTIL_THROW_IF2(size < size_, "New size must be larger than old size");
|
||||
|
||||
if(data_) {
|
||||
// Allocate memory by going through host memory
|
||||
float *temp = new float[size_];
|
||||
CUDA_CHECK(cudaMemcpy(temp, data_, size_* sizeof(float),
|
||||
cudaMemcpyDeviceToHost));
|
||||
CUDA_CHECK(cudaFree(data_));
|
||||
CUDA_CHECK(cudaMalloc(&data_, size * sizeof(float)));
|
||||
CUDA_CHECK(cudaMemcpy(data_, temp, size_* sizeof(float),
|
||||
cudaMemcpyHostToDevice));
|
||||
delete[] temp;
|
||||
}
|
||||
else {
|
||||
CUDA_CHECK(cudaMalloc(&data_, size * sizeof(float)));
|
||||
}
|
||||
if(data_) {
|
||||
// Allocate memory by going through host memory
|
||||
float *temp = new float[size_];
|
||||
CUDA_CHECK(
|
||||
cudaMemcpy(temp, data_, size_ * sizeof(float), cudaMemcpyDeviceToHost));
|
||||
CUDA_CHECK(cudaFree(data_));
|
||||
CUDA_CHECK(cudaMalloc(&data_, size * sizeof(float)));
|
||||
CUDA_CHECK(
|
||||
cudaMemcpy(data_, temp, size_ * sizeof(float), cudaMemcpyHostToDevice));
|
||||
delete[] temp;
|
||||
} else {
|
||||
CUDA_CHECK(cudaMalloc(&data_, size * sizeof(float)));
|
||||
}
|
||||
|
||||
size_ = size;
|
||||
size_ = size;
|
||||
}
|
||||
|
||||
Tensor operator<<(Tensor t, const std::vector<float>& v) {
|
||||
Tensor operator<<(Tensor t, const std::vector<float> &v) {
|
||||
t->set(v);
|
||||
return t;
|
||||
}
|
||||
|
||||
Tensor operator>>(Tensor t, std::vector<float>& v) {
|
||||
Tensor operator>>(Tensor t, std::vector<float> &v) {
|
||||
t->get(v);
|
||||
return t;
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -1,13 +1,13 @@
|
||||
#include <iostream>
|
||||
#include <cuda.h>
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <functional>
|
||||
#include <iostream>
|
||||
#include <iostream>
|
||||
#include <iterator>
|
||||
#include <random>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <cmath>
|
||||
#include <random>
|
||||
#include <algorithm>
|
||||
#include <iterator>
|
||||
#include <iostream>
|
||||
#include <functional>
|
||||
|
||||
#include "layers/generic.h"
|
||||
#include "marian.h"
|
||||
@ -40,24 +40,23 @@ int main(int argc, char** argv) {
|
||||
graph->setDevice(0);
|
||||
graph->reserveWorkspaceMB(128);
|
||||
|
||||
auto x = graph->param("x", {batchSize, 3072}, init=inits::from_vector(temp));
|
||||
auto gamma = graph->param("gamma", {1, 3072}, init=inits::from_value(2.0));
|
||||
auto beta = graph->param("beta", {1, 3072}, init=inits::zeros);
|
||||
auto x
|
||||
= graph->param("x", {batchSize, 3072}, init = inits::from_vector(temp));
|
||||
auto gamma
|
||||
= graph->param("gamma", {1, 3072}, init = inits::from_value(2.0));
|
||||
auto beta = graph->param("beta", {1, 3072}, init = inits::zeros);
|
||||
|
||||
auto y = layer_norm(x, gamma, beta);
|
||||
|
||||
auto yLogitsL1 = Dense("ff_logit_l1", 512,
|
||||
activation=act::tanh,
|
||||
normalize=true)
|
||||
(y, y, y);
|
||||
auto yLogitsL1 = Dense(
|
||||
"ff_logit_l1", 512, activation = act::tanh, normalize = true)(y, y, y);
|
||||
|
||||
auto yLogitsL2 = Dense("ff_logit_l2", 50000)
|
||||
(yLogitsL1);
|
||||
auto yLogitsL2 = Dense("ff_logit_l2", 50000)(yLogitsL1);
|
||||
|
||||
auto idx = graph->constant({(int)indeces.size(), 1},
|
||||
init=inits::from_vector(indeces));
|
||||
init = inits::from_vector(indeces));
|
||||
auto ce = cross_entropy(yLogitsL2, idx);
|
||||
auto cost = mean(sum(ce, keywords::axis=2), keywords::axis=0);
|
||||
auto cost = mean(sum(ce, keywords::axis = 2), keywords::axis = 0);
|
||||
|
||||
debug(x, "x");
|
||||
debug(gamma, "gamma");
|
||||
@ -72,7 +71,8 @@ int main(int argc, char** argv) {
|
||||
graph->setDevice(0);
|
||||
graph->reserveWorkspaceMB(128);
|
||||
|
||||
auto x = graph->param("x", {batchSize, 3072}, init=inits::from_vector(temp));
|
||||
auto x = graph->param("x", {batchSize, 3072},
|
||||
init=inits::from_vector(temp));
|
||||
auto gamma = graph->param("gamma", {1, 3072}, init=inits::from_value(2.0));
|
||||
auto beta = graph->param("beta", {1, 3072}, init=inits::zeros);
|
||||
|
||||
|
@ -1,14 +1,14 @@
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <cuda.h>
|
||||
#include <curand.h>
|
||||
#include <vector>
|
||||
#include <boost/timer/timer.hpp>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <boost/chrono.hpp>
|
||||
#include <boost/timer/timer.hpp>
|
||||
#include <vector>
|
||||
|
||||
#include "training/config.h"
|
||||
#include "marian.h"
|
||||
#include "layers/param_initializers.h"
|
||||
#include "marian.h"
|
||||
#include "training/config.h"
|
||||
|
||||
using namespace marian;
|
||||
using namespace keywords;
|
||||
|
@ -1,17 +1,17 @@
|
||||
#include <algorithm>
|
||||
#include <boost/chrono.hpp>
|
||||
#include <boost/timer/timer.hpp>
|
||||
#include <chrono>
|
||||
#include <cstdio>
|
||||
#include <iomanip>
|
||||
#include <string>
|
||||
#include <cstdio>
|
||||
#include <boost/timer/timer.hpp>
|
||||
#include <boost/chrono.hpp>
|
||||
|
||||
#include "marian.h"
|
||||
#include "training/config.h"
|
||||
#include "optimizers/optimizers.h"
|
||||
#include "optimizers/clippers.h"
|
||||
#include "data/batch_generator.h"
|
||||
#include "data/corpus.h"
|
||||
#include "marian.h"
|
||||
#include "optimizers/clippers.h"
|
||||
#include "optimizers/optimizers.h"
|
||||
#include "training/config.h"
|
||||
|
||||
#include "models/amun.h"
|
||||
#include "models/s2s.h"
|
||||
@ -34,8 +34,9 @@ int main(int argc, char** argv) {
|
||||
if(options->has("lexical-table"))
|
||||
lexProbs = New<LexProbs>(options,
|
||||
corpus->getVocabs().front(),
|
||||
corpus->getVocabs().back(), device);
|
||||
|
||||
corpus->getVocabs().back(),
|
||||
device);
|
||||
|
||||
auto type = options->get<std::string>("type");
|
||||
Ptr<EncoderDecoderBase> encdec;
|
||||
if(type == "s2s")
|
||||
@ -43,7 +44,7 @@ int main(int argc, char** argv) {
|
||||
else if(type == "multi-s2s")
|
||||
encdec = New<MultiS2S>(options);
|
||||
else
|
||||
encdec = New<Amun>(options, keywords::lex_probs=lexProbs);
|
||||
encdec = New<Amun>(options, keywords::lex_probs = lexProbs);
|
||||
|
||||
auto model = options->get<std::string>("model");
|
||||
if(boost::filesystem::exists(model))
|
||||
@ -52,7 +53,7 @@ int main(int argc, char** argv) {
|
||||
graph->reserveWorkspaceMB(options->get<size_t>("workspace"));
|
||||
|
||||
boost::timer::cpu_timer timer;
|
||||
//size_t batches = 1;
|
||||
// size_t batches = 1;
|
||||
for(int i = 0; i < 1; ++i) {
|
||||
bg.prepare(false);
|
||||
while(bg) {
|
||||
@ -60,12 +61,12 @@ int main(int argc, char** argv) {
|
||||
batch->debug();
|
||||
|
||||
auto costNode = encdec->build(graph, batch);
|
||||
//for(auto p : graph->params())
|
||||
// for(auto p : graph->params())
|
||||
// debug(p, p->name());
|
||||
|
||||
|
||||
debug(costNode, "cost");
|
||||
|
||||
//graph->graphviz("debug.dot");
|
||||
// graph->graphviz("debug.dot");
|
||||
|
||||
graph->forward();
|
||||
graph->backward();
|
||||
@ -73,7 +74,7 @@ int main(int argc, char** argv) {
|
||||
}
|
||||
}
|
||||
|
||||
//encdec->save(graph, "test.npz", true);
|
||||
// encdec->save(graph, "test.npz", true);
|
||||
|
||||
std::cout << std::endl;
|
||||
std::cout << timer.format(5, "%ws") << std::endl;
|
||||
|
@ -1,32 +1,32 @@
|
||||
#include <boost/timer/timer.hpp>
|
||||
#include <iostream>
|
||||
#include <map>
|
||||
#include <boost/timer/timer.hpp>
|
||||
|
||||
#include "training/config.h"
|
||||
#include "tensors/tensor_allocator.h"
|
||||
#include "training/config.h"
|
||||
|
||||
#include "kernels/tensor_operators.h"
|
||||
#include "common/logging.h"
|
||||
#include "kernels/tensor_operators.h"
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
using namespace marian;
|
||||
|
||||
|
||||
marian::Config config(argc, argv);
|
||||
|
||||
|
||||
TensorAllocator ta(0);
|
||||
|
||||
|
||||
int batchSize = 64;
|
||||
int hidden = 2048;
|
||||
int words = 13;
|
||||
|
||||
|
||||
Tensor out1, out2, out3;
|
||||
Tensor a_1, a_2, a_3;
|
||||
Tensor b_1, b_2, b_3;
|
||||
|
||||
|
||||
ta.allocate(out1, {batchSize, hidden, 1});
|
||||
ta.allocate(out2, {batchSize, hidden, 1});
|
||||
ta.allocate(out3, {batchSize, hidden, words});
|
||||
|
||||
|
||||
ta.allocate(a1, {batchSize, hidden, 1});
|
||||
ta.allocate(a2, {batchSize, hidden, 1});
|
||||
ta.allocate(a3, {batchSize, 1, words});
|
||||
@ -34,73 +34,72 @@ int main(int argc, char** argv) {
|
||||
ta.allocate(b1, {1, hidden, 1});
|
||||
ta.allocate(b2, {batchSize, 1, 1});
|
||||
ta.allocate(b3, {batchSize, hidden, 1});
|
||||
|
||||
|
||||
out1->set(0);
|
||||
out2->set(0);
|
||||
out3->set(0);
|
||||
|
||||
|
||||
a1->set(1);
|
||||
a2->set(1);
|
||||
a3->set(1);
|
||||
|
||||
|
||||
b1->set(2);
|
||||
b2->set(2);
|
||||
b3->set(2);
|
||||
|
||||
|
||||
for(int i = 0; i < 100; i++) {
|
||||
Add(_1 * _2, out1, a1, b1);
|
||||
Add(_1 * _2, out2, a2, b2);
|
||||
Add(_1 * _2, out3, a3, b3);
|
||||
}
|
||||
|
||||
//auto srcVocab = New<Vocab>();
|
||||
//auto trgVocab = New<Vocab>();
|
||||
|
||||
// auto srcVocab = New<Vocab>();
|
||||
// auto trgVocab = New<Vocab>();
|
||||
//
|
||||
//srcVocab->load("model/vocab.ro.yml");
|
||||
//trgVocab->load("model/vocab.en.yml");
|
||||
// srcVocab->load("model/vocab.ro.yml");
|
||||
// trgVocab->load("model/vocab.en.yml");
|
||||
//
|
||||
//int srcDim = 50;
|
||||
//int trgDim = 50;
|
||||
// int srcDim = 50;
|
||||
// int trgDim = 50;
|
||||
//
|
||||
//auto probs = New<LexProbs>("data/lex.s2t",
|
||||
// auto probs = New<LexProbs>("data/lex.s2t",
|
||||
// srcVocab, trgVocab,
|
||||
// srcDim, trgDim, 0);
|
||||
//
|
||||
//TensorAllocator ta(0);
|
||||
// TensorAllocator ta(0);
|
||||
//
|
||||
//int batchSize = 1;
|
||||
//int srcWords = 6;
|
||||
//int trgWords = 2;
|
||||
// int batchSize = 1;
|
||||
// int srcWords = 6;
|
||||
// int trgWords = 2;
|
||||
//
|
||||
//
|
||||
//std::vector<Ptr<data::SubBatch>> batches;
|
||||
//batches.push_back(New<data::SubBatch>(batchSize, srcWords));
|
||||
//batches.back()->indeces() = { 3, 4, 0, 1, 2, 0 };
|
||||
// std::vector<Ptr<data::SubBatch>> batches;
|
||||
// batches.push_back(New<data::SubBatch>(batchSize, srcWords));
|
||||
// batches.back()->indeces() = { 3, 4, 0, 1, 2, 0 };
|
||||
//
|
||||
//auto batch = New<data::CorpusBatch>(batches);
|
||||
// auto batch = New<data::CorpusBatch>(batches);
|
||||
//
|
||||
//Tensor att, logits;
|
||||
//Tensor lf, lfa;
|
||||
// Tensor att, logits;
|
||||
// Tensor lf, lfa;
|
||||
//
|
||||
//ta.allocate(att, {batchSize, 1, srcWords, trgWords});
|
||||
//ta.allocate(logits, {batchSize, trgDim, trgWords});
|
||||
//ta.allocate(lf, {batchSize, trgDim, srcWords, 1});
|
||||
//ta.allocate(lfa, {batchSize, trgDim, trgWords});
|
||||
// ta.allocate(att, {batchSize, 1, srcWords, trgWords});
|
||||
// ta.allocate(logits, {batchSize, trgDim, trgWords});
|
||||
// ta.allocate(lf, {batchSize, trgDim, srcWords, 1});
|
||||
// ta.allocate(lfa, {batchSize, trgDim, trgWords});
|
||||
//
|
||||
//logits->set(0);
|
||||
// logits->set(0);
|
||||
//
|
||||
//auto slf = probs->Lf(batch);
|
||||
//slf->toTensor(lf);
|
||||
//std::cerr << lf->debug() << std::endl;
|
||||
// auto slf = probs->Lf(batch);
|
||||
// slf->toTensor(lf);
|
||||
// std::cerr << lf->debug() << std::endl;
|
||||
//
|
||||
//std::vector<float> av = { 0.9, 0.05, 0.02, 0.01, 0.01, 0.01,
|
||||
// std::vector<float> av = { 0.9, 0.05, 0.02, 0.01, 0.01, 0.01,
|
||||
// 0.9, 0.05, 0.02, 0.01, 0.01, 0.01 };
|
||||
//att->set(av);
|
||||
//std::cerr << att->debug() << std::endl;
|
||||
// att->set(av);
|
||||
// std::cerr << att->debug() << std::endl;
|
||||
//
|
||||
//sparse::LfaForward(lfa, logits, att, slf);
|
||||
//std::cerr << lfa->debug() << std::endl;
|
||||
|
||||
|
||||
// sparse::LfaForward(lfa, logits, att, slf);
|
||||
// std::cerr << lfa->debug() << std::endl;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -1,18 +1,21 @@
|
||||
#include <limits>
|
||||
#include <cuda.h>
|
||||
#include <limits>
|
||||
|
||||
#include "translator/helpers.h"
|
||||
#include "tensors/tensor.h"
|
||||
#include "data/types.h"
|
||||
#include "tensors/tensor.h"
|
||||
#include "translator/helpers.h"
|
||||
|
||||
namespace marian {
|
||||
|
||||
__global__ void gSetColumn(float* d_in, size_t n_columns, size_t n_rows,
|
||||
size_t noColumn, float value) {
|
||||
size_t rowNumber = threadIdx.x + blockDim.x * blockIdx.x;
|
||||
__global__ void gSetColumn(float* d_in,
|
||||
size_t n_columns,
|
||||
size_t n_rows,
|
||||
size_t noColumn,
|
||||
float value) {
|
||||
size_t rowNumber = threadIdx.x + blockDim.x * blockIdx.x;
|
||||
size_t index = noColumn + rowNumber * n_columns;
|
||||
|
||||
if (index < n_columns * n_rows) {
|
||||
if(index < n_columns * n_rows) {
|
||||
d_in[index] = value;
|
||||
}
|
||||
}
|
||||
@ -20,8 +23,8 @@ __global__ void gSetColumn(float* d_in, size_t n_columns, size_t n_rows,
|
||||
void SetColumn(Tensor in, size_t col, float value) {
|
||||
int nRows = in->shape()[0] * in->shape()[2] * in->shape()[3];
|
||||
int nColumns = in->shape()[1];
|
||||
|
||||
int nBlocks = nRows / 512 + ((nRows % 512 == 0) ? 0 : 1);
|
||||
|
||||
int nBlocks = nRows / 512 + ((nRows % 512 == 0) ? 0 : 1);
|
||||
int nThreads = std::min(512, nRows);
|
||||
|
||||
gSetColumn<<<nBlocks, nThreads>>>(in->data(), nColumns, nRows, col, value);
|
||||
@ -34,6 +37,4 @@ void suppressUnk(Expr probs) {
|
||||
void suppressWord(Expr probs, Word id) {
|
||||
SetColumn(probs->val(), id, std::numeric_limits<float>::lowest());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
@ -4,29 +4,35 @@
|
||||
|
||||
namespace marian {
|
||||
|
||||
void HandleError(cudaError_t err, const char *file, int line ) {
|
||||
if (err != cudaSuccess) {
|
||||
UTIL_THROW2("ERROR: " << cudaGetErrorString(err) << " in " << file << " at line " << line);
|
||||
void HandleError(cudaError_t err, const char* file, int line) {
|
||||
if(err != cudaSuccess) {
|
||||
UTIL_THROW2("ERROR: " << cudaGetErrorString(err) << " in " << file
|
||||
<< " at line "
|
||||
<< line);
|
||||
}
|
||||
}
|
||||
|
||||
#define UNROLL_MAXARG_LOOP( n, max ) \
|
||||
if (tid < (n) && tid + (n) < ( max ) ) { \
|
||||
if (sdata[tid + ( n ) ] > sdata[tid]) { \
|
||||
sdata[tid] = sdata[tid + ( n ) ]; \
|
||||
indices[tid] = indices[tid + ( n ) ]; \
|
||||
} \
|
||||
#define UNROLL_MAXARG_LOOP(n, max) \
|
||||
if(tid < (n) && tid + (n) < (max)) { \
|
||||
if(sdata[tid + (n)] > sdata[tid]) { \
|
||||
sdata[tid] = sdata[tid + (n)]; \
|
||||
indices[tid] = indices[tid + (n)]; \
|
||||
} \
|
||||
}
|
||||
|
||||
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
|
||||
#define HANDLE_ERROR(err) (HandleError(err, __FILE__, __LINE__))
|
||||
|
||||
__global__ void gMaxElement(float* d_out, int* d_ind, float* d_in, int numBatches, int* batchFirstElementIdxs) {
|
||||
__global__ void gMaxElement(float* d_out,
|
||||
int* d_ind,
|
||||
float* d_in,
|
||||
int numBatches,
|
||||
int* batchFirstElementIdxs) {
|
||||
extern __shared__ float sdata[];
|
||||
__shared__ int indices[512];
|
||||
|
||||
int tid = threadIdx.x;
|
||||
|
||||
for (int batchIdx = 0; batchIdx < numBatches; ++batchIdx) {
|
||||
for(int batchIdx = 0; batchIdx < numBatches; ++batchIdx) {
|
||||
int begin = batchFirstElementIdxs[batchIdx];
|
||||
int end = batchFirstElementIdxs[batchIdx + 1];
|
||||
|
||||
@ -34,15 +40,15 @@ __global__ void gMaxElement(float* d_out, int* d_ind, float* d_in, int numBatche
|
||||
|
||||
sdata[tid] = -3.40282e+38f;
|
||||
|
||||
if (i < end) {
|
||||
if(i < end) {
|
||||
sdata[tid] = d_in[i];
|
||||
indices[tid] = i;
|
||||
}
|
||||
|
||||
if (i + blockDim.x < end) {
|
||||
if(i + blockDim.x < end) {
|
||||
float a = d_in[i];
|
||||
float b = d_in[i + blockDim.x];
|
||||
if (a > b) {
|
||||
if(a > b) {
|
||||
sdata[tid] = a;
|
||||
indices[tid] = i;
|
||||
} else {
|
||||
@ -51,18 +57,18 @@ __global__ void gMaxElement(float* d_out, int* d_ind, float* d_in, int numBatche
|
||||
}
|
||||
}
|
||||
|
||||
while (i + 2 * gridDim.x * blockDim.x < end) {
|
||||
while(i + 2 * gridDim.x * blockDim.x < end) {
|
||||
i += 2 * gridDim.x * blockDim.x;
|
||||
|
||||
float a = d_in[i];
|
||||
if (a > sdata[tid]) {
|
||||
if(a > sdata[tid]) {
|
||||
sdata[tid] = a;
|
||||
indices[tid] = i;
|
||||
}
|
||||
|
||||
if (i + blockDim.x < end) {
|
||||
if(i + blockDim.x < end) {
|
||||
float b = d_in[i + blockDim.x];
|
||||
if (b > sdata[tid]) {
|
||||
if(b > sdata[tid]) {
|
||||
sdata[tid] = b;
|
||||
indices[tid] = i + blockDim.x;
|
||||
}
|
||||
@ -71,9 +77,9 @@ __global__ void gMaxElement(float* d_out, int* d_ind, float* d_in, int numBatche
|
||||
|
||||
__syncthreads();
|
||||
|
||||
for (int s = (blockDim.x >> 1); s > 32; s >>= 1) {
|
||||
if (tid < s && tid + s < end) {
|
||||
if (sdata[tid + s] > sdata[tid]) {
|
||||
for(int s = (blockDim.x >> 1); s > 32; s >>= 1) {
|
||||
if(tid < s && tid + s < end) {
|
||||
if(sdata[tid + s] > sdata[tid]) {
|
||||
sdata[tid] = sdata[tid + s];
|
||||
indices[tid] = indices[tid + s];
|
||||
}
|
||||
@ -88,7 +94,7 @@ __global__ void gMaxElement(float* d_out, int* d_ind, float* d_in, int numBatche
|
||||
UNROLL_MAXARG_LOOP(2, end);
|
||||
UNROLL_MAXARG_LOOP(1, end);
|
||||
|
||||
if (tid == 0) {
|
||||
if(tid == 0) {
|
||||
d_out[blockIdx.x + batchIdx * gridDim.x] = sdata[0];
|
||||
d_ind[blockIdx.x + batchIdx * gridDim.x] = indices[0];
|
||||
}
|
||||
@ -96,7 +102,14 @@ __global__ void gMaxElement(float* d_out, int* d_ind, float* d_in, int numBatche
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void gMaxElementUpdate(float* binCosts, int* binIdxs, float* probs, int *batchFirstElements, float* outCosts, int* outIdxs, int *cummulatedBeamSizes, int NUM_BLOCKS) {
|
||||
__global__ void gMaxElementUpdate(float* binCosts,
|
||||
int* binIdxs,
|
||||
float* probs,
|
||||
int* batchFirstElements,
|
||||
float* outCosts,
|
||||
int* outIdxs,
|
||||
int* cummulatedBeamSizes,
|
||||
int NUM_BLOCKS) {
|
||||
extern __shared__ float sdata[];
|
||||
__shared__ int indices[512];
|
||||
__shared__ float bestBinCost;
|
||||
@ -106,24 +119,26 @@ __global__ void gMaxElementUpdate(float* binCosts, int* binIdxs, float* probs, i
|
||||
const int batchIdx = blockIdx.x;
|
||||
const int N = batchFirstElements[batchIdx + 1] - batchFirstElements[batchIdx];
|
||||
int num_bins = int(N / (2 * 512)) + int(N % (2 * 512) != 0);
|
||||
if (num_bins > 500) {
|
||||
if(num_bins > 500) {
|
||||
num_bins = 500;
|
||||
}
|
||||
|
||||
for (int pos = cummulatedBeamSizes[batchIdx]; pos < cummulatedBeamSizes[batchIdx + 1]; ++pos) {
|
||||
for(int pos = cummulatedBeamSizes[batchIdx];
|
||||
pos < cummulatedBeamSizes[batchIdx + 1];
|
||||
++pos) {
|
||||
int i = tid;
|
||||
|
||||
sdata[tid] = -3.40282e+38f;
|
||||
|
||||
if (i < num_bins) {
|
||||
if(i < num_bins) {
|
||||
sdata[tid] = binCosts[batchIdx * NUM_BLOCKS + i];
|
||||
indices[tid] = i;
|
||||
}
|
||||
|
||||
if (i + blockDim.x < num_bins) {
|
||||
if(i + blockDim.x < num_bins) {
|
||||
float a = binCosts[batchIdx * NUM_BLOCKS + i];
|
||||
float b = binCosts[batchIdx * NUM_BLOCKS + i + blockDim.x];
|
||||
if (a > b) {
|
||||
if(a > b) {
|
||||
sdata[tid] = a;
|
||||
indices[tid] = i;
|
||||
} else {
|
||||
@ -132,18 +147,18 @@ __global__ void gMaxElementUpdate(float* binCosts, int* binIdxs, float* probs, i
|
||||
}
|
||||
}
|
||||
|
||||
while (i + 2 * blockDim.x < num_bins) {
|
||||
while(i + 2 * blockDim.x < num_bins) {
|
||||
i += 2 * blockDim.x;
|
||||
|
||||
float a = binCosts[batchIdx * NUM_BLOCKS + i];
|
||||
if (a > sdata[tid]) {
|
||||
if(a > sdata[tid]) {
|
||||
sdata[tid] = a;
|
||||
indices[tid] = i;
|
||||
}
|
||||
|
||||
if (i + blockDim.x < num_bins) {
|
||||
if(i + blockDim.x < num_bins) {
|
||||
float b = binCosts[batchIdx * NUM_BLOCKS + i + blockDim.x];
|
||||
if (b > sdata[tid]) {
|
||||
if(b > sdata[tid]) {
|
||||
sdata[tid] = b;
|
||||
indices[tid] = i + blockDim.x;
|
||||
}
|
||||
@ -152,9 +167,9 @@ __global__ void gMaxElementUpdate(float* binCosts, int* binIdxs, float* probs, i
|
||||
|
||||
__syncthreads();
|
||||
|
||||
for (int s = (blockDim.x >> 1); s > 32; s >>= 1) {
|
||||
if (tid < s && tid + s < num_bins) {
|
||||
if (sdata[tid + s] > sdata[tid]) {
|
||||
for(int s = (blockDim.x >> 1); s > 32; s >>= 1) {
|
||||
if(tid < s && tid + s < num_bins) {
|
||||
if(sdata[tid + s] > sdata[tid]) {
|
||||
sdata[tid] = sdata[tid + s];
|
||||
indices[tid] = indices[tid + s];
|
||||
}
|
||||
@ -169,7 +184,7 @@ __global__ void gMaxElementUpdate(float* binCosts, int* binIdxs, float* probs, i
|
||||
UNROLL_MAXARG_LOOP(2, num_bins);
|
||||
UNROLL_MAXARG_LOOP(1, num_bins);
|
||||
|
||||
if (tid == 0) {
|
||||
if(tid == 0) {
|
||||
bestBinCost = sdata[0];
|
||||
bestBinCostIdx = batchIdx * NUM_BLOCKS + indices[0];
|
||||
|
||||
@ -181,20 +196,21 @@ __global__ void gMaxElementUpdate(float* binCosts, int* binIdxs, float* probs, i
|
||||
|
||||
__syncthreads();
|
||||
|
||||
i = batchFirstElements[batchIdx] + (bestBinCostIdx - batchIdx * NUM_BLOCKS) * (blockDim.x * 2) + tid;
|
||||
i = batchFirstElements[batchIdx]
|
||||
+ (bestBinCostIdx - batchIdx * NUM_BLOCKS) * (blockDim.x * 2) + tid;
|
||||
const int dist = num_bins * 2 * blockDim.x;
|
||||
|
||||
sdata[tid] = -3.40282e+38f;
|
||||
|
||||
if (i < batchFirstElements[batchIdx + 1]) {
|
||||
if(i < batchFirstElements[batchIdx + 1]) {
|
||||
sdata[tid] = probs[i];
|
||||
indices[tid] = i;
|
||||
}
|
||||
|
||||
if (i + blockDim.x < batchFirstElements[batchIdx + 1]) {
|
||||
if(i + blockDim.x < batchFirstElements[batchIdx + 1]) {
|
||||
float a = probs[i];
|
||||
float b = probs[i+blockDim.x];
|
||||
if (a > b) {
|
||||
float b = probs[i + blockDim.x];
|
||||
if(a > b) {
|
||||
sdata[tid] = a;
|
||||
indices[tid] = i;
|
||||
} else {
|
||||
@ -203,18 +219,18 @@ __global__ void gMaxElementUpdate(float* binCosts, int* binIdxs, float* probs, i
|
||||
}
|
||||
}
|
||||
|
||||
while (i + dist < batchFirstElements[batchIdx + 1]) {
|
||||
while(i + dist < batchFirstElements[batchIdx + 1]) {
|
||||
i += dist;
|
||||
|
||||
float a = probs[i];
|
||||
if (a > sdata[tid]) {
|
||||
if(a > sdata[tid]) {
|
||||
sdata[tid] = a;
|
||||
indices[tid] = i;
|
||||
}
|
||||
|
||||
if (i + blockDim.x < batchFirstElements[batchIdx + 1]) {
|
||||
if(i + blockDim.x < batchFirstElements[batchIdx + 1]) {
|
||||
float b = probs[i + blockDim.x];
|
||||
if (b > sdata[tid]) {
|
||||
if(b > sdata[tid]) {
|
||||
sdata[tid] = b;
|
||||
indices[tid] = i + blockDim.x;
|
||||
}
|
||||
@ -223,9 +239,9 @@ __global__ void gMaxElementUpdate(float* binCosts, int* binIdxs, float* probs, i
|
||||
|
||||
__syncthreads();
|
||||
|
||||
for (int s = (blockDim.x >> 1); s > 32; s >>= 1) {
|
||||
if (tid < s && tid + s < batchFirstElements[batchIdx + 1]) {
|
||||
if (sdata[tid + s] > sdata[tid]) {
|
||||
for(int s = (blockDim.x >> 1); s > 32; s >>= 1) {
|
||||
if(tid < s && tid + s < batchFirstElements[batchIdx + 1]) {
|
||||
if(sdata[tid + s] > sdata[tid]) {
|
||||
sdata[tid] = sdata[tid + s];
|
||||
indices[tid] = indices[tid + s];
|
||||
}
|
||||
@ -240,7 +256,7 @@ __global__ void gMaxElementUpdate(float* binCosts, int* binIdxs, float* probs, i
|
||||
UNROLL_MAXARG_LOOP(2, batchFirstElements[batchIdx + 1]);
|
||||
UNROLL_MAXARG_LOOP(1, batchFirstElements[batchIdx + 1]);
|
||||
|
||||
if (tid == 0) {
|
||||
if(tid == 0) {
|
||||
binCosts[bestBinCostIdx] = sdata[0];
|
||||
binIdxs[bestBinCostIdx] = indices[0];
|
||||
}
|
||||
@ -248,40 +264,49 @@ __global__ void gMaxElementUpdate(float* binCosts, int* binIdxs, float* probs, i
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void gGetValueByKey(float* d_in, float* d_out, int* indeces, int n)
|
||||
{
|
||||
int tid = threadIdx.x + blockDim.x * blockIdx.x;
|
||||
if (tid < n) {
|
||||
__global__ void gGetValueByKey(float* d_in, float* d_out, int* indeces, int n) {
|
||||
int tid = threadIdx.x + blockDim.x * blockIdx.x;
|
||||
if(tid < n) {
|
||||
int index = indeces[tid];
|
||||
d_out[tid] = d_in[index];
|
||||
}
|
||||
}
|
||||
|
||||
NthElement::NthElement(size_t maxBeamSize, size_t maxBatchSize/*, cudaStream_t stream*/)
|
||||
NthElement::NthElement(size_t maxBeamSize,
|
||||
size_t maxBatchSize /*, cudaStream_t stream*/)
|
||||
: /*stream_(stream) ,*/
|
||||
NUM_BLOCKS(std::min(500, int(maxBeamSize * 85000 / (2 * BLOCK_SIZE)) + int(maxBeamSize * 85000 % (2 * BLOCK_SIZE) != 0)))
|
||||
{
|
||||
//std::cerr << "NthElement::NthElement" << std::endl;
|
||||
NUM_BLOCKS(
|
||||
std::min(500,
|
||||
int(maxBeamSize * 85000 / (2 * BLOCK_SIZE))
|
||||
+ int(maxBeamSize * 85000 % (2 * BLOCK_SIZE) != 0))) {
|
||||
// std::cerr << "NthElement::NthElement" << std::endl;
|
||||
|
||||
HANDLE_ERROR( cudaMalloc((void**)&d_ind, maxBatchSize * NUM_BLOCKS * sizeof(int)) );
|
||||
HANDLE_ERROR(
|
||||
cudaMalloc((void**)&d_ind, maxBatchSize * NUM_BLOCKS * sizeof(int)));
|
||||
|
||||
HANDLE_ERROR( cudaMalloc((void**)&d_out, maxBatchSize * NUM_BLOCKS * sizeof(float)) );
|
||||
HANDLE_ERROR(
|
||||
cudaMalloc((void**)&d_out, maxBatchSize * NUM_BLOCKS * sizeof(float)));
|
||||
|
||||
HANDLE_ERROR( cudaMalloc((void**)&d_res_idx, maxBatchSize * maxBeamSize * sizeof(int)) );
|
||||
HANDLE_ERROR( cudaMalloc((void**)&d_res, maxBatchSize * maxBeamSize * sizeof(float)) );
|
||||
HANDLE_ERROR(
|
||||
cudaMalloc((void**)&d_res_idx, maxBatchSize * maxBeamSize * sizeof(int)));
|
||||
HANDLE_ERROR(
|
||||
cudaMalloc((void**)&d_res, maxBatchSize * maxBeamSize * sizeof(float)));
|
||||
|
||||
HANDLE_ERROR( cudaHostAlloc((void**) &h_res, maxBeamSize * maxBatchSize* sizeof(float),
|
||||
cudaHostAllocDefault) );
|
||||
HANDLE_ERROR( cudaHostAlloc((void**) &h_res_idx, maxBeamSize * maxBatchSize * sizeof(int),
|
||||
cudaHostAllocDefault) );
|
||||
HANDLE_ERROR(cudaHostAlloc((void**)&h_res,
|
||||
maxBeamSize * maxBatchSize * sizeof(float),
|
||||
cudaHostAllocDefault));
|
||||
HANDLE_ERROR(cudaHostAlloc((void**)&h_res_idx,
|
||||
maxBeamSize * maxBatchSize * sizeof(int),
|
||||
cudaHostAllocDefault));
|
||||
|
||||
HANDLE_ERROR( cudaMalloc((void**)&d_breakdown, maxBeamSize * sizeof(float)) );
|
||||
HANDLE_ERROR( cudaMalloc((void**)&d_batchPosition, (maxBatchSize + 1) * sizeof(int)) );
|
||||
HANDLE_ERROR( cudaMalloc((void**)&d_cumBeamSizes, (maxBatchSize + 1) * sizeof(int)) );
|
||||
HANDLE_ERROR(cudaMalloc((void**)&d_breakdown, maxBeamSize * sizeof(float)));
|
||||
HANDLE_ERROR(
|
||||
cudaMalloc((void**)&d_batchPosition, (maxBatchSize + 1) * sizeof(int)));
|
||||
HANDLE_ERROR(
|
||||
cudaMalloc((void**)&d_cumBeamSizes, (maxBatchSize + 1) * sizeof(int)));
|
||||
}
|
||||
|
||||
NthElement::~NthElement()
|
||||
{
|
||||
NthElement::~NthElement() {
|
||||
HANDLE_ERROR(cudaFree(d_ind));
|
||||
HANDLE_ERROR(cudaFree(d_out));
|
||||
HANDLE_ERROR(cudaFree(d_res_idx));
|
||||
@ -293,52 +318,76 @@ NthElement::~NthElement()
|
||||
HANDLE_ERROR(cudaFree(d_cumBeamSizes));
|
||||
}
|
||||
|
||||
void NthElement::getNBestList(float* probs, const std::vector<int>& batchFirstElementIdxs,
|
||||
const std::vector<int>& cummulatedBeamSizes)
|
||||
{
|
||||
HANDLE_ERROR( cudaMemcpyAsync(d_batchPosition, batchFirstElementIdxs.data(), batchFirstElementIdxs.size() * sizeof(int),
|
||||
cudaMemcpyHostToDevice, /* stream_ */ 0) );
|
||||
HANDLE_ERROR( cudaMemcpyAsync(d_cumBeamSizes, cummulatedBeamSizes.data(), cummulatedBeamSizes.size() * sizeof(int),
|
||||
cudaMemcpyHostToDevice, /* stream_ */ 0) );
|
||||
void NthElement::getNBestList(float* probs,
|
||||
const std::vector<int>& batchFirstElementIdxs,
|
||||
const std::vector<int>& cummulatedBeamSizes) {
|
||||
HANDLE_ERROR(cudaMemcpyAsync(d_batchPosition,
|
||||
batchFirstElementIdxs.data(),
|
||||
batchFirstElementIdxs.size() * sizeof(int),
|
||||
cudaMemcpyHostToDevice,
|
||||
/* stream_ */ 0));
|
||||
HANDLE_ERROR(cudaMemcpyAsync(d_cumBeamSizes,
|
||||
cummulatedBeamSizes.data(),
|
||||
cummulatedBeamSizes.size() * sizeof(int),
|
||||
cudaMemcpyHostToDevice,
|
||||
/* stream_ */ 0));
|
||||
|
||||
const int numBatches = batchFirstElementIdxs.size() - 1;
|
||||
|
||||
gMaxElement<<<NUM_BLOCKS, BLOCK_SIZE, BLOCK_SIZE * sizeof(float), /* stream_ */ 0>>>
|
||||
(d_out, d_ind, probs, numBatches, d_batchPosition);
|
||||
gMaxElement<<<NUM_BLOCKS,
|
||||
BLOCK_SIZE,
|
||||
BLOCK_SIZE * sizeof(float),
|
||||
/* stream_ */ 0>>>(
|
||||
d_out, d_ind, probs, numBatches, d_batchPosition);
|
||||
|
||||
gMaxElementUpdate<<<numBatches, BLOCK_SIZE, BLOCK_SIZE * sizeof(float), /* stream_ */ 0>>>
|
||||
(d_out, d_ind, probs, d_batchPosition, d_res, d_res_idx, d_cumBeamSizes, NUM_BLOCKS);
|
||||
gMaxElementUpdate<<<numBatches,
|
||||
BLOCK_SIZE,
|
||||
BLOCK_SIZE * sizeof(float),
|
||||
/* stream_ */ 0>>>(d_out,
|
||||
d_ind,
|
||||
probs,
|
||||
d_batchPosition,
|
||||
d_res,
|
||||
d_res_idx,
|
||||
d_cumBeamSizes,
|
||||
NUM_BLOCKS);
|
||||
}
|
||||
|
||||
void NthElement::getNBestList(const std::vector<size_t>& beamSizes, Tensor Probs,
|
||||
std::vector<float>& outCosts, std::vector<unsigned>& outKeys,
|
||||
const bool isFirst) {
|
||||
void NthElement::getNBestList(const std::vector<size_t>& beamSizes,
|
||||
Tensor Probs,
|
||||
std::vector<float>& outCosts,
|
||||
std::vector<unsigned>& outKeys,
|
||||
const bool isFirst) {
|
||||
std::vector<int> cummulatedBeamSizes(beamSizes.size() + 1, 0);
|
||||
std::vector<int> batchFirstElementIdxs(beamSizes.size() + 1, 0);
|
||||
|
||||
const size_t vocabSize = Probs->shape()[1];
|
||||
for (size_t i = 0; i < beamSizes.size(); ++i) {
|
||||
|
||||
for(size_t i = 0; i < beamSizes.size(); ++i) {
|
||||
cummulatedBeamSizes[i + 1] = cummulatedBeamSizes[i] + beamSizes[i];
|
||||
batchFirstElementIdxs[i + 1] += ((isFirst) ? (i + 1) : cummulatedBeamSizes[i + 1]) * vocabSize;
|
||||
batchFirstElementIdxs[i + 1]
|
||||
+= ((isFirst) ? (i + 1) : cummulatedBeamSizes[i + 1]) * vocabSize;
|
||||
}
|
||||
|
||||
getNBestList(Probs->data(), batchFirstElementIdxs, cummulatedBeamSizes);
|
||||
GetPairs(cummulatedBeamSizes.back(), outKeys, outCosts);
|
||||
|
||||
}
|
||||
|
||||
void NthElement::GetPairs(size_t number,
|
||||
std::vector<unsigned>& outKeys,
|
||||
std::vector<float>& outValues) {
|
||||
|
||||
HANDLE_ERROR( cudaMemcpyAsync(h_res, d_res, number * sizeof(float),
|
||||
cudaMemcpyDeviceToHost, /* stream_ */ 0) );
|
||||
HANDLE_ERROR( cudaMemcpyAsync(h_res_idx, d_res_idx, number * sizeof(int),
|
||||
cudaMemcpyDeviceToHost, /* stream_ */ 0) );
|
||||
std::vector<unsigned>& outKeys,
|
||||
std::vector<float>& outValues) {
|
||||
HANDLE_ERROR(cudaMemcpyAsync(h_res,
|
||||
d_res,
|
||||
number * sizeof(float),
|
||||
cudaMemcpyDeviceToHost,
|
||||
/* stream_ */ 0));
|
||||
HANDLE_ERROR(cudaMemcpyAsync(h_res_idx,
|
||||
d_res_idx,
|
||||
number * sizeof(int),
|
||||
cudaMemcpyDeviceToHost,
|
||||
/* stream_ */ 0));
|
||||
cudaStreamSynchronize(/* stream_ */ 0);
|
||||
|
||||
for (size_t i = 0; i < number; ++i) {
|
||||
for(size_t i = 0; i < number; ++i) {
|
||||
outKeys.push_back(h_res_idx[i]);
|
||||
outValues.push_back(h_res[i]);
|
||||
}
|
||||
@ -347,13 +396,14 @@ void NthElement::GetPairs(size_t number,
|
||||
}
|
||||
|
||||
void NthElement::getValueByKey(std::vector<float>& out, float* d_in) {
|
||||
gGetValueByKey<<<1, lastN, 0, /* stream_ */ 0>>>
|
||||
(d_in, d_breakdown, h_res_idx, lastN);
|
||||
gGetValueByKey<<<1, lastN, 0, /* stream_ */ 0>>>(
|
||||
d_in, d_breakdown, h_res_idx, lastN);
|
||||
|
||||
HANDLE_ERROR( cudaMemcpyAsync(out.data(), d_breakdown, lastN * sizeof(float),
|
||||
cudaMemcpyDeviceToHost, /* stream_ */ 0) );
|
||||
HANDLE_ERROR( cudaStreamSynchronize(/* stream_ */ 0));
|
||||
HANDLE_ERROR(cudaMemcpyAsync(out.data(),
|
||||
d_breakdown,
|
||||
lastN * sizeof(float),
|
||||
cudaMemcpyDeviceToHost,
|
||||
/* stream_ */ 0));
|
||||
HANDLE_ERROR(cudaStreamSynchronize(/* stream_ */ 0));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user