Merge branch 'master' into save-adam-params

Roman Grundkiewicz 2018-02-18 21:40:31 +00:00
commit 0f16ed97f3
51 changed files with 741 additions and 698 deletions

View File

@ -58,12 +58,12 @@ else(Tcmalloc_FOUND)
message(WARNING "Cannot find TCMalloc library. Continuing.")
endif(Tcmalloc_FOUND)
find_package(MPI)
if(MPI_FOUND)
include_directories(${MPI_INCLUDE_PATH})
set(EXT_LIBS ${EXT_LIBS} ${MPI_LIBRARIES})
add_definitions(-DMPI_FOUND=1)
endif(MPI_FOUND)
#find_package(MPI)
#if(MPI_FOUND)
# include_directories(${MPI_INCLUDE_PATH})
# set(EXT_LIBS ${EXT_LIBS} ${MPI_LIBRARIES})
# add_definitions(-DMPI_FOUND=1)
#endif(MPI_FOUND)
find_package(ZLIB)
if(ZLIB_FOUND)

View File

@ -8,19 +8,22 @@ cuda_add_library(marian
3rd_party/cnpy/cnpy.cpp
3rd_party/exception.cpp
3rd_party/svd/svd.cpp
graph/expression_graph.cpp
graph/expression_operators.cu
graph/node.cu
graph/node_operators.cu
graph/node_initializers.cu
tensors/tensor.cu
tensors/device_gpu.cu
tensors/memory_piece.cu
# tensors/tensor.cu
tensors/device.cu
tensors/device.cpp
tensors/backend.cpp
tensors/gpu/algorithm.cu
tensors/gpu/dropout.cu
tensors/cpu/dropout.cpp
kernels/tensor_operators.cu
kernels/cudnn_wrappers.cu
backend/gpu/dropout.cu
backend/cpu/dropout.cpp
graph/expression_graph.cpp
graph/expression_operators.cu
graph/node.cpp
graph/node_operators.cu
graph/node_initializers.cu
layers/convolution.cu
rnn/cells.cu
optimizers/clippers.cu
optimizers/optimizers.cu
@ -91,7 +94,7 @@ endforeach(exec)
#set_target_properties(align2steps PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}")
if(PYTHONLIBS_FOUND)
add_subdirectory(python)
# add_subdirectory(python)
endif(PYTHONLIBS_FOUND)
if(COMPILE_TESTS)

View File

@ -1,14 +0,0 @@
#include <stdio.h>
#include <stdlib.h>
#include "backend/dispatch.h"
namespace marian {
namespace cpu {
void Dropout(Ptr<Backend> backend, Tensor tensor, float p) {
ABORT("Not implemented");
}
}
}

View File

@ -1,39 +0,0 @@
#pragma once
#include "common/definitions.h"
#include "graph/backend.h"
#include "tensors/tensor.h"
#define DISPATCH1(Function, Arg1) \
namespace gpu { \
void Function(Ptr<Backend>, Arg1); \
} \
namespace cpu { \
void Function(Ptr<Backend>, Arg1); \
} \
void Function(Ptr<Backend> backend, Arg1 arg1) { \
if(backend->getDevice().type == DeviceType::gpu) \
gpu::Function(backend, arg1); \
else \
cpu::Function(backend, arg1); \
}
#define DISPATCH2(Function, Arg1, Arg2) \
namespace gpu { \
void Function(Ptr<Backend>, Arg1, Arg2); \
} \
namespace cpu { \
void Function(Ptr<Backend>, Arg1, Arg2); \
} \
static inline void Function(Ptr<Backend> backend, Arg1 arg1, Arg2 arg2) { \
if(backend->getDevice().type == DeviceType::gpu) \
gpu::Function(backend, arg1, arg2); \
else \
cpu::Function(backend, arg1, arg2); \
}
namespace marian {
DISPATCH2(Dropout, Tensor, float)
}

View File

@ -1,8 +1,7 @@
#include <sstream>
#include "graph/backend_gpu.h"
#include "graph/expression_graph.h"
#include "backend/dispatch.h"
#include "tensors/dispatch.h"
namespace marian {
@ -11,19 +10,17 @@ ExpressionGraph::ExpressionGraph(bool inference)
void ExpressionGraph::setDevice(DeviceId deviceId) {
if(!backend_) {
backend_ = New<BackendGPU>(deviceId, Config::seed);
backend_ = BackendByDevice(deviceId, Config::seed);
params_ = New<Parameters>();
params_->init(backend_->getDevice());
tensors_ = New<TensorAllocator>(backend_->getDevice());
params_->init(backend_);
tensors_ = New<TensorAllocator>(backend_);
}
}
Expr ExpressionGraph::dropout(float prob, Shape shape) {
return Expression<ConstantNode>(shared_from_this(),
keywords::init = [prob, this](Tensor t) {
Dropout(backend_, t, prob);
Dropout(t, prob);
},
keywords::shape = shape);
}

View File

@ -8,8 +8,8 @@
#include "common/definitions.h"
#include "tensors/tensor_allocator.h"
#include "tensors/backend.h"
#include "graph/backend.h"
#include "graph/parameters.h"
#include "graph/chainable.h"
#include "graph/node_operators.h"
@ -306,7 +306,7 @@ public:
tensors_->free(t);
}
Ptr<Allocator<DeviceGPU>> allocator() { return tensors_->allocator(); }
Ptr<Allocator> allocator() { return tensors_->allocator(); }
void clear() {
// clear everything apart from parameters

View File

@ -1,4 +1,4 @@
#include "graph/backend_gpu.h"
#include "tensors/backend.h"
#include "graph/expression_graph.h"
#include "graph/node.h"

View File

@ -5,10 +5,11 @@
#include <thread>
#include "common/keywords.h"
#include "graph/backend.h"
#include "graph/chainable.h"
#include "tensors/backend.h"
#include "tensors/tensor.h"
#include "graph/chainable.h"
namespace marian {
class Node : public Chainable<Tensor>,

View File

@ -2,10 +2,10 @@
#include <thread>
#include "graph/backend_gpu.h"
#include "tensors/gpu/backend.h"
#include "graph/node.h"
#include "kernels/tensor_operators.h"
#include "functional/functional.h"
#include "kernels/tensor_operators.h"
#include "kernels/cudnn_wrappers.h"
namespace marian {
@ -54,7 +54,7 @@ public:
NodeOps forwardOps() {
// C = alpha * dot(op(A), op(B))
return {NodeOp(Prod(
std::static_pointer_cast<BackendGPU>(getBackend())->getCublasHandle(),
std::static_pointer_cast<gpu::Backend>(getBackend())->getCublasHandle(),
val_,
child(0)->val(),
child(1)->val(),
@ -72,7 +72,7 @@ public:
// to sum gradients from different graph parts
if(!transA_ && transB_)
return {NodeOp(Prod(std::static_pointer_cast<BackendGPU>(getBackend())
return {NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend())
->getCublasHandle(),
child(0)->grad(),
adj_,
@ -81,7 +81,7 @@ public:
false,
1.0,
scalar_)),
NodeOp(Prod(std::static_pointer_cast<BackendGPU>(getBackend())
NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend())
->getCublasHandle(),
child(1)->grad(),
adj_,
@ -92,7 +92,7 @@ public:
scalar_))};
if(transA_ && !transB_)
return {NodeOp(Prod(std::static_pointer_cast<BackendGPU>(getBackend())
return {NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend())
->getCublasHandle(),
child(0)->grad(),
child(1)->val(),
@ -101,7 +101,7 @@ public:
true,
1.0,
scalar_)),
NodeOp(Prod(std::static_pointer_cast<BackendGPU>(getBackend())
NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend())
->getCublasHandle(),
child(1)->grad(),
child(0)->val(),
@ -112,7 +112,7 @@ public:
scalar_))};
if(transA_ && transB_)
return {NodeOp(Prod(std::static_pointer_cast<BackendGPU>(getBackend())
return {NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend())
->getCublasHandle(),
child(0)->grad(),
child(1)->val(),
@ -121,7 +121,7 @@ public:
true,
1.0,
scalar_)),
NodeOp(Prod(std::static_pointer_cast<BackendGPU>(getBackend())
NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend())
->getCublasHandle(),
child(1)->grad(),
adj_,
@ -131,7 +131,7 @@ public:
1.0,
scalar_))};
return {NodeOp(Prod(std::static_pointer_cast<BackendGPU>(getBackend())
return {NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend())
->getCublasHandle(),
child(0)->grad(),
adj_,
@ -140,7 +140,7 @@ public:
true,
1.0,
scalar_)),
NodeOp(Prod(std::static_pointer_cast<BackendGPU>(getBackend())
NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend())
->getCublasHandle(),
child(1)->grad(),
child(0)->val(),
@ -198,7 +198,7 @@ public:
using namespace functional;
return {
NodeOp(Prod(
std::static_pointer_cast<BackendGPU>(getBackend())->getCublasHandle(),
std::static_pointer_cast<gpu::Backend>(getBackend())->getCublasHandle(),
val_,
child(0)->val(),
child(1)->val(),
@ -219,7 +219,7 @@ public:
using namespace functional;
if(!transA_ && transB_)
return {NodeOp(Prod(std::static_pointer_cast<BackendGPU>(getBackend())
return {NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend())
->getCublasHandle(),
child(0)->grad(),
adj_,
@ -228,7 +228,7 @@ public:
false,
1.0,
scalar_)),
NodeOp(Prod(std::static_pointer_cast<BackendGPU>(getBackend())
NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend())
->getCublasHandle(),
child(1)->grad(),
adj_,
@ -240,7 +240,7 @@ public:
NodeOp(Add(_1, child(2)->grad(), adj_))};
if(transA_ && !transB_)
return {NodeOp(Prod(std::static_pointer_cast<BackendGPU>(getBackend())
return {NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend())
->getCublasHandle(),
child(0)->grad(),
child(1)->val(),
@ -249,7 +249,7 @@ public:
true,
1.0,
scalar_)),
NodeOp(Prod(std::static_pointer_cast<BackendGPU>(getBackend())
NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend())
->getCublasHandle(),
child(1)->grad(),
child(0)->val(),
@ -261,7 +261,7 @@ public:
NodeOp(Add(_1, child(2)->grad(), adj_))};
if(transA_ && transB_)
return {NodeOp(Prod(std::static_pointer_cast<BackendGPU>(getBackend())
return {NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend())
->getCublasHandle(),
child(0)->grad(),
child(1)->val(),
@ -270,7 +270,7 @@ public:
true,
1.0,
scalar_)),
NodeOp(Prod(std::static_pointer_cast<BackendGPU>(getBackend())
NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend())
->getCublasHandle(),
child(1)->grad(),
adj_,
@ -281,7 +281,7 @@ public:
scalar_)),
NodeOp(Add(_1, child(2)->grad(), adj_))};
return {NodeOp(Prod(std::static_pointer_cast<BackendGPU>(getBackend())
return {NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend())
->getCublasHandle(),
child(0)->grad(),
adj_,
@ -290,7 +290,7 @@ public:
true,
1.0,
scalar_)),
NodeOp(Prod(std::static_pointer_cast<BackendGPU>(getBackend())
NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend())
->getCublasHandle(),
child(1)->grad(),
child(0)->val(),
@ -350,7 +350,7 @@ public:
NodeOps forwardOps() {
// C = alpha * dot(op(A), op(B))
return {NodeOp(ProdBatched(
std::static_pointer_cast<BackendGPU>(getBackend())->getCublasHandle(),
std::static_pointer_cast<gpu::Backend>(getBackend())->getCublasHandle(),
val_,
child(0)->val(),
child(1)->val(),
@ -369,7 +369,7 @@ public:
if(!transA_ && transB_)
return {
NodeOp(ProdBatched(std::static_pointer_cast<BackendGPU>(getBackend())
NodeOp(ProdBatched(std::static_pointer_cast<gpu::Backend>(getBackend())
->getCublasHandle(),
child(0)->grad(),
adj_,
@ -378,7 +378,7 @@ public:
false,
1.0,
scalar_)),
NodeOp(ProdBatched(std::static_pointer_cast<BackendGPU>(getBackend())
NodeOp(ProdBatched(std::static_pointer_cast<gpu::Backend>(getBackend())
->getCublasHandle(),
child(1)->grad(),
adj_,
@ -390,7 +390,7 @@ public:
if(transA_ && !transB_)
return {
NodeOp(ProdBatched(std::static_pointer_cast<BackendGPU>(getBackend())
NodeOp(ProdBatched(std::static_pointer_cast<gpu::Backend>(getBackend())
->getCublasHandle(),
child(0)->grad(),
child(1)->val(),
@ -399,7 +399,7 @@ public:
true,
1.0,
scalar_)),
NodeOp(ProdBatched(std::static_pointer_cast<BackendGPU>(getBackend())
NodeOp(ProdBatched(std::static_pointer_cast<gpu::Backend>(getBackend())
->getCublasHandle(),
child(1)->grad(),
child(0)->val(),
@ -411,7 +411,7 @@ public:
if(transA_ && transB_)
return {
NodeOp(ProdBatched(std::static_pointer_cast<BackendGPU>(getBackend())
NodeOp(ProdBatched(std::static_pointer_cast<gpu::Backend>(getBackend())
->getCublasHandle(),
child(0)->grad(),
child(1)->val(),
@ -420,7 +420,7 @@ public:
true,
1.0,
scalar_)),
NodeOp(ProdBatched(std::static_pointer_cast<BackendGPU>(getBackend())
NodeOp(ProdBatched(std::static_pointer_cast<gpu::Backend>(getBackend())
->getCublasHandle(),
child(1)->grad(),
adj_,
@ -431,7 +431,7 @@ public:
scalar_))};
return {
NodeOp(ProdBatched(std::static_pointer_cast<BackendGPU>(getBackend())
NodeOp(ProdBatched(std::static_pointer_cast<gpu::Backend>(getBackend())
->getCublasHandle(),
child(0)->grad(),
adj_,
@ -440,7 +440,7 @@ public:
true,
1.0,
scalar_)),
NodeOp(ProdBatched(std::static_pointer_cast<BackendGPU>(getBackend())
NodeOp(ProdBatched(std::static_pointer_cast<gpu::Backend>(getBackend())
->getCublasHandle(),
child(1)->grad(),
child(0)->val(),

View File

@ -1,10 +1,11 @@
#pragma once
#include "graph/backend_gpu.h"
#include "tensors/tensor.h"
#include "tensors/gpu/backend.h"
#include "graph/node.h"
#include "kernels/sparse.h"
#include "kernels/tensor_operators.h"
#include "tensors/tensor.h"
#include "functional/functional.h"
#include "kernels/cudnn_wrappers.h"
@ -210,21 +211,7 @@ struct TanhNodeOp : public NaryNodeOp {
const std::string type() { return "tanh"; }
};
/**
* Represents a <a
* href="https://en.wikipedia.org/wiki/Rectifier_(neural_networks)">rectified
* linear</a> node in an expression graph.
*
* This node implements the activation function \f$ f(x) = \max(0, x) \f$ and
* its derivative:
* \f[
* f^\prime(x) =
* \begin{cases}
* 0 & \text{if } x \leq 0 \\
* 1 & \text{if } x > 0
* \end{cases}
* \f]
*/
struct ReLUNodeOp : public UnaryNodeOp {
template <typename... Args>
ReLUNodeOp(Args... args) : UnaryNodeOp(args...) {}
@ -876,14 +863,14 @@ public:
Tensor& val() {
auto childVal = reshapee_->val();
val_.reset(
new TensorBase(childVal->memory(), shape(), childVal->getDevice()));
new TensorBase(childVal->memory(), shape(), childVal->getBackend()));
return val_;
};
Tensor& grad() {
auto childGrad = reshapee_->grad();
adj_.reset(
new TensorBase(childGrad->memory(), shape(), childGrad->getDevice()));
new TensorBase(childGrad->memory(), shape(), childGrad->getBackend()));
return adj_;
};
@ -952,7 +939,7 @@ public:
size_t offset = step_ * shape().elements() * sizeof(float);
auto mem = New<MemoryPiece>(childVal->memory()->data() + offset,
childVal->memory()->size());
val_.reset(new TensorBase(mem, shape(), childVal->getDevice()));
val_.reset(new TensorBase(mem, shape(), childVal->getBackend()));
return val_;
};
@ -961,7 +948,7 @@ public:
size_t offset = step_ * shape().elements() * sizeof(float);
auto mem = New<MemoryPiece>(childGrad->memory()->data() + offset,
childGrad->memory()->size());
adj_.reset(new TensorBase(mem, shape(), childGrad->getDevice()));
adj_.reset(new TensorBase(mem, shape(), childGrad->getBackend()));
return adj_;
};

View File

@ -20,9 +20,9 @@ private:
Ptr<TensorAllocator> grads_;
public:
void init(DeviceId deviceId) {
vals_ = New<TensorAllocator>(deviceId);
grads_ = New<TensorAllocator>(deviceId);
void init(Ptr<Backend> backend) {
vals_ = New<TensorAllocator>(backend);
grads_ = New<TensorAllocator>(backend);
}
auto begin() -> decltype(params_.begin()) { return params_.begin(); }
@ -57,7 +57,7 @@ public:
}
void allocateForward() {
if(vals_->size() == 0) {
if(!params_.empty() && vals_->size() == 0) {
vals_->reserveExact(totalCapacity(vals_));
for(auto p : params_)
if(!p->val())
@ -66,7 +66,7 @@ public:
}
void allocateBackward() {
if(grads_->size() == 0) {
if(!params_.empty() && grads_->size() == 0) {
grads_->reserveExact(totalCapacity(grads_));
for(auto p : params_)
if(!p->grad())

View File

@ -17,6 +17,12 @@ inline void gpuAssert(cudaError_t code,
}
}
template <typename T>
void CudaCopy(const T* start, const T* end, T* dest) {
CUDA_CHECK(cudaMemcpy((void*)dest, (void*)start, (end - start) * sizeof(T),
cudaMemcpyDefault));
}
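Not part of the commit, just an illustration: CudaCopy follows std::copy's (begin, end, dest) convention and uses cudaMemcpyDefault, so the same call covers host-to-device, device-to-host and device-to-device transfers; the Select/Insert changes further down use it to upload index vectors. devIndices below is a hypothetical device pointer.
// Sketch only: upload an index vector with the new helper.
std::vector<size_t> indices = {0, 2, 5, 7};
CudaCopy(indices.data(), indices.data() + indices.size(), devIndices);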
#define CUSPARSE_CHECK(x) \
{ \
cusparseStatus_t _c = x; \

View File

@ -12,7 +12,7 @@ void multiply(Ptr<CSR> C,
const Ptr<CSR> B,
bool transA,
bool transB) {
cudaSetDevice(C->getDevice());
cudaSetDevice(backend_->getDevice().no);
int nnzTotal;
C->allocRowIndices(A->rows());
CUSPARSE_CHECK(cusparseXcsrgemmNnz(
@ -91,7 +91,7 @@ void multiply(Ptr<CSR> C,
//}
void LfaForward(Tensor out, Tensor logits, Tensor att, Ptr<CSR> sparseLf) {
cudaSetDevice(out->getDevice());
cudaSetDevice(backend_->getDevice().no);
int batch = att->shape()[0];
int srcWords = att->shape()[2];
@ -150,7 +150,7 @@ __global__ void gCollapseAtt(float* out,
}
void CollapseAtt(Tensor out, Tensor in) {
cudaSetDevice(out->getDevice());
cudaSetDevice(backend_->getDevice().no);
int nonzeros = out->shape().elements();
int batch = out->shape()[0];
int srcWords = out->shape()[2];

View File

@ -14,7 +14,7 @@ private:
int nnz_{0};
int rows_{0};
int cols_{0};
DeviceId deviceId_;
Ptr<Backend> backend_;
cusparseHandle_t handle_{0};
cusparseMatDescr_t descr_{0};
@ -24,9 +24,9 @@ private:
float* values_{0};
public:
CSR(int rows, int cols, DeviceId deviceId)
: rows_(rows), cols_(cols), deviceId_(deviceId) {
cudaSetDevice(deviceId_.no);
CSR(int rows, int cols, Ptr<Backend> backend)
: rows_(rows), cols_(cols), backend_(backend) {
cudaSetDevice(backend_->getDevice().no);
CUSPARSE_CHECK(cusparseCreate(&handle_));
CUSPARSE_CHECK(cusparseCreateMatDescr(&descr_));
CUSPARSE_CHECK(cusparseSetMatType(descr_, CUSPARSE_MATRIX_TYPE_GENERAL));
@ -38,9 +38,9 @@ public:
const std::vector<float>& values,
const std::vector<int>& rowIndices,
const std::vector<int>& colIndices,
DeviceId deviceId)
: nnz_(values.size()), rows_(rows), cols_(cols), deviceId_(deviceId) {
cudaSetDevice(deviceId_.no);
Ptr<Backend> backend)
: nnz_(values.size()), rows_(rows), cols_(cols), backend_(backend) {
cudaSetDevice(backend_->getDevice().no);
CUSPARSE_CHECK(cusparseCreate(&handle_));
CUSPARSE_CHECK(cusparseCreateMatDescr(&descr_));
CUSPARSE_CHECK(cusparseSetMatType(descr_, CUSPARSE_MATRIX_TYPE_GENERAL));
@ -73,8 +73,8 @@ public:
CUDA_CHECK(cudaFree(cooRowIndices));
}
CSR(Tensor dense) : deviceId_(dense->getDevice()) {
cudaSetDevice(deviceId_.no);
CSR(Tensor dense) : backend_(dense->getBackend()) {
cudaSetDevice(backend_->getDevice().no);
rows_ = dense->shape()[0] * dense->shape()[2] * dense->shape()[3];
cols_ = dense->shape()[1];
@ -114,7 +114,7 @@ public:
}
~CSR() {
cudaSetDevice(deviceId_.no);
cudaSetDevice(backend_->getDevice().no);
if(values_)
CUDA_CHECK(cudaFree(values_));
if(rowIndices_)
@ -129,7 +129,7 @@ public:
}
void toTensor(Tensor dense) {
cudaSetDevice(deviceId_.no);
cudaSetDevice(backend_->getDevice().no);
ABORT_IF(dense->size() != rows_ * cols_, "Matrix sizes do not match");
cusparseScsc2dense(handle_,
@ -154,10 +154,10 @@ public:
int* rowIndices() { return rowIndices_; }
int* colIndices() { return colIndices_; }
DeviceId getDevice() { return deviceId_; }
DeviceId getDevice() { return backend_->getDevice(); }
void allocValues(int nnz = 0) {
cudaSetDevice(deviceId_.no);
cudaSetDevice(backend_->getDevice().no);
if(nnz > 0)
nnz_ = nnz;
if(values_)
@ -166,7 +166,7 @@ public:
}
void allocRowIndices(int rows = 0) {
cudaSetDevice(deviceId_.no);
cudaSetDevice(backend_->getDevice().no);
if(rows > 0)
rows_ = rows;
if(rowIndices_)
@ -175,7 +175,7 @@ public:
}
void allocColIndices(int nnz = 0) {
cudaSetDevice(deviceId_.no);
cudaSetDevice(backend_->getDevice().no);
if(nnz > 0)
nnz_ = nnz;
if(colIndices_)
@ -184,11 +184,12 @@ public:
}
std::string debug() {
cudaSetDevice(backend_->getDevice().no);
uint8_t* buffer;
CUDA_CHECK(cudaMalloc(&buffer, sizeof(float) * rows() * cols()));
auto mem = New<MemoryPiece>(buffer, sizeof(float) * rows() * cols());
Tensor tensor(new TensorBase(mem, {rows(), cols()}, deviceId_));
Tensor tensor(new TensorBase(mem, {rows(), cols()}, backend_));
toTensor(tensor);
std::string temp = tensor->debug();

View File

@ -879,7 +879,7 @@ __global__ void gInsert(float* out,
}
}
void Select(Ptr<Allocator<DeviceGPU>> allocator,
void Select(Ptr<Allocator> allocator,
Tensor out,
const Tensor in,
int axis,
@ -892,7 +892,7 @@ void Select(Ptr<Allocator<DeviceGPU>> allocator,
int blocks = std::min(MAX_BLOCKS, length / threads + (length % threads != 0));
auto mp_indices = allocator->alloc<size_t>(indices.size());
mp_indices->insert(indices.data(), indices.size());
CudaCopy(indices.data(), indices.data() + indices.size(), mp_indices->data<size_t>());
int axisGPU = axis + gpu::Shape::size() - out->shape().size();
gSelect<<<blocks, threads>>>(out->data(),
@ -905,7 +905,7 @@ void Select(Ptr<Allocator<DeviceGPU>> allocator,
allocator->free(mp_indices);
}
void Insert(Ptr<Allocator<DeviceGPU>> allocator,
void Insert(Ptr<Allocator> allocator,
Tensor out,
const Tensor in,
int axis,
@ -918,7 +918,7 @@ void Insert(Ptr<Allocator<DeviceGPU>> allocator,
int blocks = std::min(MAX_BLOCKS, length / threads + (length % threads != 0));
auto mp_indices = allocator->alloc<size_t>(indices.size());
mp_indices->insert(indices.data(), indices.size());
CudaCopy(indices.data(), indices.data() + indices.size(), mp_indices->data<size_t>());
int axisGPU = axis + gpu::Shape::size() - out->shape().size();
gInsert<<<blocks, threads>>>(out->data(),
@ -1295,7 +1295,7 @@ float L2Norm(Tensor in) {
uint8_t* data;
cudaMalloc(&data, blocks * sizeof(float));
Tensor out(new TensorBase(
New<MemoryPiece>(data, blocks * sizeof(float)), {1, blocks}, in->getDevice()));
New<MemoryPiece>(data, blocks * sizeof(float)), {1, blocks}, in->getBackend()));
ReduceAll(_1 * _1, out, in);
float dataCpu = sqrtf(out->get(0));

View File

@ -7,7 +7,6 @@
#include "tensors/tensor.h"
#include "tensors/allocator.h"
#include "tensors/device_gpu.h"
#include "gpu/shape.h"
#include "gpu/tmp.h"
@ -71,13 +70,13 @@ void Element(Functor functor, Tensor out, Tensors ...tensors) {
void TransposeND(Tensor out, Tensor in, const std::vector<int>& vAxis);
void Select(Ptr<Allocator<DeviceGPU>> allocator,
void Select(Ptr<Allocator> allocator,
Tensor out,
Tensor in,
int axis,
const std::vector<size_t>&);
void Insert(Ptr<Allocator<DeviceGPU>> allocator,
void Insert(Ptr<Allocator> allocator,
Tensor out,
Tensor in,
int axis,

View File

@ -13,7 +13,7 @@ void Sgd::updateImpl(Tensor params, Tensor grads) {
void Adagrad::updateImpl(Tensor params, Tensor grads) {
if(!alloc_)
alloc_ = New<TensorAllocator>(params->getDevice());
alloc_ = New<TensorAllocator>(params->getBackend());
if(!gt_) {
int elements = params->size();
@ -42,7 +42,7 @@ void Adagrad::resetStats() {
void Adam::updateImpl(Tensor params, Tensor grads) {
if(!alloc_)
alloc_ = New<TensorAllocator>(params->getDevice());
alloc_ = New<TensorAllocator>(params->getBackend());
if(!mt_) {
int elements = params->size();

View File

@ -8,10 +8,10 @@ cuda_add_library(pymarian SHARED
../graph/node.cu
../graph/node_operators.cu
../tensors/tensor.cu
../tensors/device_gpu.cu
../tensors/device.cpp
../kernels/tensor_operators.cu
../backend/gpu/dropout.cu
../backend/cpu/dropout.cpp
../tensors/gpu/dropout.cu
../tensors/cpu/dropout.cpp
../kernels/sparse.cu
#../layers/param_initializers.cu
../rnn/attention.cu

View File

@ -10,6 +10,7 @@
#include "common/definitions.h"
#include "tensors/memory_piece.h"
#include "tensors/device.h"
namespace marian {
@ -65,10 +66,9 @@ public:
Gap rest(size_t offset) const { return Gap(data_ + offset, size_ - offset); }
};
template <class Device>
class Allocator {
private:
Device device_;
Ptr<Device> device_;
size_t available_{0};
size_t step_{128 * 1024 * 1024};
size_t alignment_{256};
@ -83,23 +83,23 @@ private:
void grow(size_t add) {
add = align(add);
uint8_t* oldData = device_.data();
size_t oldSize = device_.size();
uint8_t* oldData = device_->data();
size_t oldSize = device_->size();
device_.reserve(oldSize + add);
device_->reserve(oldSize + add);
std::set<Gap> oldGaps;
gaps_.swap(oldGaps);
for(auto gap : oldGaps)
gaps_.insert(
Gap(device_.data() + std::distance(oldData, gap.data()), gap.size()));
insertGap(Gap(device_.data() + oldSize, add));
Gap(device_->data() + std::distance(oldData, gap.data()), gap.size()));
insertGap(Gap(device_->data() + oldSize, add));
std::unordered_map<uint8_t*, Ptr<MemoryPiece>> oldAllocated;
allocated_.swap(oldAllocated);
for(auto it : oldAllocated) {
uint8_t* newPtr = device_.data() + std::distance(oldData, it.first);
uint8_t* newPtr = device_->data() + std::distance(oldData, it.first);
allocated_[newPtr] = oldAllocated[it.first];
allocated_[newPtr]->setPtr(newPtr);
}
@ -142,7 +142,7 @@ private:
public:
Allocator(DeviceId deviceId, size_t bytes, size_t step, size_t alignment = 256)
: device_(deviceId, alignment),
: device_(DispatchDevice(deviceId, alignment)),
step_(step),
available_(0),
alignment_(alignment) {
@ -153,7 +153,8 @@ public:
void reserve(size_t bytes) {
bytes = align(bytes);
device_.reserve(bytes);
if(bytes > 0)
device_->reserve(bytes);
clear();
}
@ -211,17 +212,17 @@ public:
available_ = 0;
gaps_.clear();
allocated_.clear();
insertGap({device_.data(), device_.size()}, false);
insertGap({device_->data(), device_->size()}, false);
}
Ptr<MemoryPiece> memory() {
return New<MemoryPiece>(device_.data(), device_.size());
return New<MemoryPiece>(device_->data(), device_->size());
}
size_t size() { return device_.size(); }
size_t size() { return device_->size(); }
size_t available() { return available_; }
DeviceId getDevice() { return device_.getDevice(); }
DeviceId getDevice() { return device_->getDevice(); }
};
}

src/tensors/backend.cpp (new file, 15 lines)
View File

@ -0,0 +1,15 @@
#include "tensors/backend.h"
#include "tensors/gpu/backend.h"
#include "tensors/cpu/backend.h"
namespace marian {
Ptr<Backend> BackendByDevice(DeviceId deviceId, size_t seed) {
if(deviceId.type == DeviceType::gpu)
return New<gpu::Backend>(deviceId, seed);
else
return New<cpu::Backend>(deviceId, seed);
}
}
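Usage sketch (not from the commit): this factory replaces the direct New<BackendGPU>(...) construction seen in ExpressionGraph::setDevice above, and the returned backend is what TensorAllocator and Parameters now expect.
// Sketch: pick a backend from a DeviceId and seed an allocator with it.
auto backend = BackendByDevice({0, DeviceType::cpu}, Config::seed);
auto tensors = New<TensorAllocator>(backend);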

View File

@ -17,4 +17,6 @@ public:
virtual void setDevice() = 0;
};
Ptr<Backend> BackendByDevice(DeviceId deviceId, size_t seed);
}

src/tensors/cpu/backend.h (new file, 29 lines)
View File

@ -0,0 +1,29 @@
#pragma once
#include <functional>
#include <random>
#include "common/config.h"
#include "tensors/backend.h"
namespace marian {
namespace cpu {
class Backend : public marian::Backend {
private:
std::default_random_engine gen_;
public:
Backend(DeviceId deviceId, size_t seed)
: marian::Backend(deviceId, seed),
gen_(seed_) {}
void setDevice() { }
std::default_random_engine& getRandomGenerator() {
return gen_;
}
};
}
}

View File

@ -0,0 +1,19 @@
#include <algorithm>
#include <random>
#include "tensors/dispatch.h"
#include "tensors/cpu/backend.h"
namespace marian {
namespace cpu {
void Dropout(Tensor tensor, float p) {
auto cpuBackend = std::static_pointer_cast<cpu::Backend>(tensor->getBackend());
auto &gen = cpuBackend->getRandomGenerator();
std::bernoulli_distribution dist(1.f - p);
std::generate(tensor->data(), tensor->data() + tensor->size(),
[&]() { return dist(gen) / (1.f - p); });
}
}
}
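A quick numeric check of the inverted-dropout scaling above (illustrative only, where gen stands for the generator returned by the backend's getRandomGenerator()): with p = 0.25 each element is kept with probability 0.75 and scaled by 1/0.75 ≈ 1.333, so the mask's expected value stays 1 and no rescaling is needed at inference time.
// Sketch: one mask element as produced by cpu::Dropout above.
std::bernoulli_distribution dist(1.f - 0.25f);  // keep probability 0.75
float mask = dist(gen) / (1.f - 0.25f);         // either 0.0f or ~1.3333f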

src/tensors/device.cpp (new file, 29 lines)
View File

@ -0,0 +1,29 @@
#include <iostream>
#include "tensors/device.h"
namespace marian {
namespace cpu {
Device::~Device() {
delete[] data_;
data_ = nullptr;
size_ = 0;
}
void Device::reserve(size_t size) {
size = align(size);
ABORT_IF(size < size_ || size == 0, "New size must be larger than old size and larger than 0");
if(data_) {
uint8_t *temp = new uint8_t[size_];
std::copy(data_, data_ + size_, temp);
delete[] data_;
data_ = temp;
} else {
data_ = new uint8_t[size];
}
size_ = size;
}
}
}

src/tensors/device.cu (new file, 40 lines)
View File

@ -0,0 +1,40 @@
#include <cuda.h>
#include <iostream>
#include "tensors/device.h"
#include "kernels/cuda_helpers.h"
namespace marian {
namespace gpu {
Device::~Device() {
cudaSetDevice(deviceId_.no);
if(data_) {
CUDA_CHECK(cudaFree(data_));
}
cudaDeviceSynchronize();
}
void Device::reserve(size_t size) {
size = align(size);
cudaSetDevice(deviceId_.no);
ABORT_IF(size < size_ || size == 0, "New size must be larger than old size and larger than 0");
if(data_) {
// Allocate memory by going through host memory
uint8_t *temp = new uint8_t[size_];
CUDA_CHECK(cudaMemcpy(temp, data_, size_, cudaMemcpyDeviceToHost));
CUDA_CHECK(cudaFree(data_));
CUDA_CHECK(cudaMalloc(&data_, size));
CUDA_CHECK(cudaMemcpy(data_, temp, size_, cudaMemcpyHostToDevice));
delete[] temp;
} else {
CUDA_CHECK(cudaMalloc(&data_, size));
}
size_ = size;
}
}
}

src/tensors/device.h (new file, 68 lines)
View File

@ -0,0 +1,68 @@
#pragma once
#include <cmath>
#include <cstdint>
#include "common/definitions.h"
namespace marian {
class Device {
protected:
DeviceId deviceId_;
uint8_t* data_{0};
size_t size_{0};
size_t alignment_;
size_t align(size_t size) {
return ceil(size / (float)alignment_) * alignment_;
}
public:
Device(DeviceId deviceId, size_t alignment = 256)
: deviceId_(deviceId), data_(0), size_(0), alignment_(alignment) {}
virtual ~Device() {};
virtual void reserve(size_t size) = 0;
virtual uint8_t* data() { return data_; }
virtual size_t size() { return size_; }
virtual DeviceId getDevice() { return deviceId_; }
};
namespace gpu {
class Device : public marian::Device {
public:
Device(DeviceId deviceId, size_t alignment = 256)
: marian::Device(deviceId, alignment) {}
~Device();
void reserve(size_t size);
};
}
namespace cpu {
class Device : public marian::Device {
public:
Device(DeviceId deviceId, size_t alignment = 256)
: marian::Device(deviceId, alignment) {}
~Device();
void reserve(size_t size);
};
}
static inline Ptr<Device> DispatchDevice(DeviceId deviceId, size_t alignment = 256) {
if(deviceId.type == DeviceType::gpu)
return New<gpu::Device>(deviceId, alignment);
else
return New<cpu::Device>(deviceId, alignment);
}
}
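Illustrative sketch (not part of the file; dev and alloc are just local names): DispatchDevice hides the gpu::/cpu:: choice behind the shared marian::Device interface, which is what lets the reworked Allocator earlier in this diff hold a Ptr<Device> instead of a template parameter.
// Sketch: obtain a device implementation and grow its buffer.
auto dev = DispatchDevice({0, DeviceType::cpu});  // returns a cpu::Device
dev->reserve(1000);                               // rounded up to the 256-byte alignment
// The allocator now takes a DeviceId directly, as in the updated allocator_test.cpp below:
auto alloc = New<Allocator>(DeviceId{0, DeviceType::cpu}, 0, 30000, 256);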

View File

@ -1,37 +0,0 @@
#include <cuda.h>
#include <iostream>
#include "tensors/device_gpu.h"
#include "kernels/cuda_helpers.h"
namespace marian {
DeviceGPU::~DeviceGPU() {
cudaSetDevice(deviceId_.no);
if(data_) {
CUDA_CHECK(cudaFree(data_));
}
cudaDeviceSynchronize();
}
void DeviceGPU::reserve(size_t size) {
size = align(size);
cudaSetDevice(deviceId_.no);
ABORT_IF(size < size_, "New size must be larger than old size");
if(data_) {
// Allocate memory by going through host memory
uint8_t *temp = new uint8_t[size_];
CUDA_CHECK(cudaMemcpy(temp, data_, size_, cudaMemcpyDeviceToHost));
CUDA_CHECK(cudaFree(data_));
CUDA_CHECK(cudaMalloc(&data_, size));
CUDA_CHECK(cudaMemcpy(data_, temp, size_, cudaMemcpyHostToDevice));
delete[] temp;
} else {
CUDA_CHECK(cudaMalloc(&data_, size));
}
size_ = size;
}
}

View File

@ -1,35 +0,0 @@
#pragma once
#include <cmath>
#include <cstdint>
#include "common/definitions.h"
namespace marian {
class DeviceGPU {
private:
uint8_t* data_;
size_t size_;
DeviceId deviceId_;
size_t alignment_;
size_t align(size_t size) {
return ceil(size / (float)alignment_) * alignment_;
}
public:
DeviceGPU(DeviceId deviceId, size_t alignment = 256)
: data_(0), size_(0), deviceId_(deviceId), alignment_(alignment) {}
~DeviceGPU();
void reserve(size_t size);
uint8_t* data() { return data_; }
size_t size() { return size_; }
DeviceId getDevice() { return deviceId_; }
};
}

src/tensors/dispatch.h (new file, 38 lines)
View File

@ -0,0 +1,38 @@
#pragma once
#include "common/definitions.h"
#include "tensors/tensor.h"
#define DISPATCH1(Function, Arg1) \
namespace gpu { \
void Function(Arg1); \
} \
namespace cpu { \
void Function(Arg1); \
} \
void Function(Arg1 arg1) { \
if(arg1->getBackend()->getDevice().type == DeviceType::gpu) \
gpu::Function(arg1); \
else \
cpu::Function(arg1); \
}
#define DISPATCH2(Function, Arg1, Arg2) \
namespace gpu { \
void Function(Arg1, Arg2); \
} \
namespace cpu { \
void Function(Arg1, Arg2); \
} \
static inline void Function(Arg1 arg1, Arg2 arg2) { \
if(arg1->getBackend()->getDevice().type == DeviceType::gpu) \
gpu::Function(arg1, arg2); \
else \
cpu::Function(arg1, arg2); \
}
namespace marian {
DISPATCH2(Dropout, Tensor, float)
}
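For reference, a hand expansion of DISPATCH2(Dropout, Tensor, float) as defined above (a sketch, not part of the file): the macro forward-declares a gpu:: and a cpu:: implementation and generates the wrapper that graph code calls as plain Dropout(t, p), as in ExpressionGraph::dropout above.
// Roughly what DISPATCH2(Dropout, Tensor, float) expands to inside namespace marian:
namespace gpu { void Dropout(Tensor, float); }
namespace cpu { void Dropout(Tensor, float); }
static inline void Dropout(Tensor arg1, float arg2) {
  if(arg1->getBackend()->getDevice().type == DeviceType::gpu)
    gpu::Dropout(arg1, arg2);
  else
    cpu::Dropout(arg1, arg2);
}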

View File

@ -0,0 +1,42 @@
#include "tensors/gpu/algorithm.h"
#include "kernels/cuda_helpers.h"
#include "kernels/tensor_operators.h"
namespace marian {
namespace gpu {
void copy(Ptr<Backend> backend, const float* begin, const float* end, float* dest) {
CUDA_CHECK(cudaSetDevice(backend->getDevice().no));
CudaCopy(begin, end, dest);
CUDA_CHECK(cudaStreamSynchronize(0));
}
__global__ void gFill(float *d_in, int size, float val) {
for(int bid = 0; bid < size; bid += blockDim.x * gridDim.x) {
int index = bid + threadIdx.x + blockDim.x * blockIdx.x;
if(index < size) {
d_in[index] = val;
}
}
}
void fill(Ptr<Backend> backend, float* begin, float* end, float value) {
CUDA_CHECK(cudaSetDevice(backend->getDevice().no));
int size = end - begin;
int threads = std::min(512, size);
int blocks = (size / threads) + (size % threads != 0);
gFill<<<blocks, threads>>>(begin, size, value);
CUDA_CHECK(cudaStreamSynchronize(0));
}
void setSparse(Ptr<Backend> backend,
const std::vector<size_t>& keys,
const std::vector<float>& values,
float* data) {
CUDA_CHECK(cudaSetDevice(backend->getDevice().no));
SetSparse(data, keys, values);
CUDA_CHECK(cudaStreamSynchronize(0));
}
}
}

View File

@ -0,0 +1,12 @@
#pragma once
#include "tensors/backend.h"
namespace marian {
namespace gpu {
void copy(Ptr<Backend> backend, const float* begin, const float* end, float* dest);
void fill(Ptr<Backend> backend, float* begin, float* end, float value);
void setSparse(Ptr<Backend> backend, const std::vector<size_t>&, const std::vector<float>&, float*);
}
}

View File

@ -5,7 +5,7 @@
#include <curand.h>
#include "common/config.h"
#include "graph/backend.h"
#include "tensors/backend.h"
#define CURAND_CALL(x) \
do { \
@ -16,10 +16,11 @@
} while(0)
namespace marian {
namespace gpu {
class BackendGPU : public Backend {
class Backend : public marian::Backend {
public:
BackendGPU(DeviceId deviceId, size_t seed) : Backend(deviceId, seed) {
Backend(DeviceId deviceId, size_t seed) : marian::Backend(deviceId, seed) {
setDevice();
setHandles();
}
@ -36,13 +37,13 @@ private:
cublasHandle_t cublasHandle_;
curandGenerator_t curandGenerator_;
void setHandles() {
cublasHandle_ = create_handle();
curandGenerator_ = createCurandGenerator();
}
curandGenerator_t createCurandGenerator() {
cudaSetDevice(deviceId_.no);
curandGenerator_t generator;
@ -62,4 +63,6 @@ private:
return cublasHandle;
}
};
}
}

View File

@ -1,50 +1,52 @@
#include <cuda.h>
#include <curand.h>
#include <stdio.h>
#include <stdlib.h>
#include "backend/dispatch.h"
#include "graph/backend_gpu.h"
#define CUDA_CALL(x) \
do { \
if((x) != cudaSuccess) { \
printf("Error at %s:%d\n", __FILE__, __LINE__); \
exit(1); \
} \
} while(0)
#define CURAND_CALL(x) \
do { \
if((x) != CURAND_STATUS_SUCCESS) { \
printf("Error at %s:%d\n", __FILE__, __LINE__); \
exit(1); \
} \
} while(0)
namespace marian {
namespace gpu {
__global__ void gScale(float* data, int n, float p) {
int index = threadIdx.x + blockIdx.x * blockDim.x;
while(index < n) {
data[index] = (data[index] < p) / p;
index += gridDim.x * blockDim.x;
}
}
void Dropout(Ptr<Backend> backend, Tensor tensor, float p) {
curandGenerator_t gen = std::static_pointer_cast<BackendGPU>(backend)->getCurandGenerator();
int n = tensor->size();
CURAND_CALL(curandGenerateUniform(gen, tensor->data(), n));
int numThreads = std::min(n, 512);
int numBlocks = n / numThreads + (n % numThreads != 0);
gScale<<<numBlocks, numThreads>>>(tensor->data(), n, 1.f - p);
}
}
}
#include <cuda.h>
#include <curand.h>
#include <stdio.h>
#include <stdlib.h>
#include "tensors/dispatch.h"
#include "tensors/gpu/backend.h"
#define CUDA_CALL(x) \
do { \
if((x) != cudaSuccess) { \
printf("Error at %s:%d\n", __FILE__, __LINE__); \
exit(1); \
} \
} while(0)
#define CURAND_CALL(x) \
do { \
if((x) != CURAND_STATUS_SUCCESS) { \
printf("Error at %s:%d\n", __FILE__, __LINE__); \
exit(1); \
} \
} while(0)
namespace marian {
namespace gpu {
__global__ void gScale(float* data, int n, float p) {
int index = threadIdx.x + blockIdx.x * blockDim.x;
while(index < n) {
data[index] = (data[index] < p) / p;
index += gridDim.x * blockDim.x;
}
}
void Dropout(Tensor tensor, float p) {
auto gpuBackend = std::static_pointer_cast<gpu::Backend>(tensor->getBackend());
curandGenerator_t gen = gpuBackend->getCurandGenerator();
int n = tensor->size();
CURAND_CALL(curandGenerateUniform(gen, tensor->data(), n));
int numThreads = std::min(n, 512);
int numBlocks = n / numThreads + (n % numThreads != 0);
gScale<<<numBlocks, numThreads>>>(tensor->data(), n, 1.f - p);
}
}
}

View File

@ -1,11 +0,0 @@
#include <cuda.h>
#include "kernels/cuda_helpers.h"
#include "tensors/memory_piece.h"
namespace marian {
void MemoryPiece::insert(uint8_t* ptr, size_t num) {
CUDA_CHECK(cudaMemcpy(data_, ptr, num * sizeof(uint8_t), cudaMemcpyDefault));
}
}

View File

@ -34,13 +34,6 @@ public:
void setPtr(uint8_t* data) { data_ = data; }
template <typename T>
void insert(T* ptr, size_t num) {
insert((uint8_t*)ptr, num * sizeof(T));
}
void insert(uint8_t* ptr, size_t num);
friend std::ostream& operator<<(std::ostream& out, const MemoryPiece mp) {
out << "MemoryPiece - ptr: " << std::hex << (size_t)mp.data() << std::dec
<< " size: " << mp.size();

View File

@ -1,159 +0,0 @@
#include <cuda.h>
#include <thrust/device_ptr.h>
#include <thrust/device_vector.h>
#include "kernels/cuda_helpers.h"
#include "kernels/tensor_operators.h"
#include "tensors/tensor.h"
namespace marian {
__global__ void gFill(float *d_in, int size, float val) {
for(int bid = 0; bid < size; bid += blockDim.x * gridDim.x) {
int index = bid + threadIdx.x + blockDim.x * blockIdx.x;
if(index < size) {
d_in[index] = val;
}
}
}
float TensorBase::get(size_t i) {
cudaSetDevice(deviceId_.no);
float temp;
CUDA_CHECK(
cudaMemcpy(&temp, data() + i, sizeof(float), cudaMemcpyDeviceToHost));
cudaStreamSynchronize(0);
return temp;
}
void TensorBase::set(size_t i, float value) {
cudaSetDevice(deviceId_.no);
CUDA_CHECK(
cudaMemcpy(data() + i, &value, sizeof(float), cudaMemcpyHostToDevice));
cudaStreamSynchronize(0);
}
void TensorBase::get(std::vector<float> &v) {
CUDA_CHECK(cudaSetDevice(deviceId_.no));
v.resize(size());
CUDA_CHECK(cudaMemcpy(
v.data(), data(), size() * sizeof(float), cudaMemcpyDeviceToHost));
cudaStreamSynchronize(0);
}
void TensorBase::set(float value) {
cudaSetDevice(deviceId_.no);
int threads = std::min(512, (int)size());
int blocks = (size() / threads) + (size() % threads != 0);
gFill<<<blocks, threads>>>(data(), size(), value);
cudaStreamSynchronize(0);
}
void TensorBase::set(const std::vector<float> &v) {
CUDA_CHECK(cudaSetDevice(deviceId_.no));
CUDA_CHECK(cudaMemcpy(
data(), v.data(), v.size() * sizeof(float), cudaMemcpyHostToDevice));
cudaStreamSynchronize(0);
}
void TensorBase::setSparse(const std::vector<size_t> &k,
const std::vector<float> &v) {
cudaSetDevice(deviceId_.no);
SetSparse(data(), k, v);
cudaStreamSynchronize(0);
}
void TensorBase::copyFrom(Tensor in) {
cudaSetDevice(deviceId_.no);
CUDA_CHECK(cudaMemcpy(data(),
(float *)in->data(),
in->size() * sizeof(float),
cudaMemcpyDefault));
cudaStreamSynchronize(0);
}
std::string TensorBase::debug() {
std::stringstream strm;
assert(shape_.size());
strm << shape_;
strm << " device=" << deviceId_;
strm << " ptr=" << (size_t)memory_->data();
strm << " bytes=" << memory_->size();
strm << std::endl;
// values
size_t totSize = shape_.elements();
std::vector<float> values(totSize);
get(values);
size_t dispCols = 5;
strm << std::fixed << std::setprecision(8) << std::setfill(' ');
for(int i = 0; i < values.size(); ++i) {
std::vector<int> dims;
shape().dims(i, dims);
bool disp = true;
for(int j = 0; j < dims.size(); ++j)
disp = disp && (dims[j] < dispCols || dims[j] >= shape()[j] - dispCols);
if(disp) {
if(dims.back() == 0) {
bool par = true;
std::vector<std::string> p;
for(int j = dims.size() - 1; j >= 0; --j) {
if(dims[j] != 0)
par = false;
p.push_back(par ? "[" : " ");
}
for(auto it = p.rbegin(); it != p.rend(); ++it)
strm << *it;
strm << " ";
}
strm << std::setw(12)
<< values[i]
<< " ";
if(dims.back() + 1 == shape().back()) {
for(int j = dims.size() - 1; j >= 0; --j) {
if(dims[j] + 1 != shape()[j])
break;
strm << "]";
}
strm << std::endl;
}
bool prev = true;
for(int j = dims.size() - 1; j >= 0; --j) {
if(j < dims.size() - 1)
prev = prev && dims[j + 1] + 1 == shape()[j + 1];
if(prev && dims[j] + 1 == dispCols && shape()[j] > 2 * dispCols) {
if(j < dims.size() - 1)
for(int k = 0; k <= j; ++k)
strm << " ";
strm << "... ";
if(j < dims.size() - 1)
strm << std::endl;
break;
}
}
}
}
strm << std::endl;
return strm.str();
}
Tensor operator<<(Tensor t, const std::vector<float> &v) {
t->set(v);
return t;
}
Tensor operator>>(Tensor t, std::vector<float> &v) {
t->get(v);
return t;
}
}

View File

@ -5,10 +5,13 @@
#include <memory>
#include <sstream>
#include "3rd_party/exception.h"
#include "common/definitions.h"
#include "common/shape.h"
#include "tensors/memory_piece.h"
#include "tensors/backend.h"
#include <algorithm>
#include "tensors/gpu/algorithm.h"
namespace marian {
@ -16,11 +19,11 @@ class TensorBase : public std::enable_shared_from_this<TensorBase> {
private:
Ptr<MemoryPiece> memory_;
Shape shape_;
DeviceId deviceId_;
Ptr<Backend> backend_;
public:
TensorBase(Ptr<MemoryPiece> memory, Shape shape, DeviceId deviceId)
: memory_(memory), shape_(shape), deviceId_(deviceId) {}
TensorBase(Ptr<MemoryPiece> memory, Shape shape, Ptr<Backend> backend)
: memory_(memory), shape_(shape), backend_(backend) {}
~TensorBase() {}
@ -39,34 +42,157 @@ public:
return get(0);
}
DeviceId getDevice() { return deviceId_; }
Ptr<Backend> getBackend() { return backend_; }
DeviceId getDevice() { return backend_->getDevice(); }
Tensor subtensor(int offset, int size) {
auto mem = New<MemoryPiece>(memory_->data() + sizeof(float) * offset,
sizeof(float) * size);
return Tensor(new TensorBase(mem, {1, size}, deviceId_));
return New<TensorBase>(mem, Shape{1, size}, backend_);
}
float get(size_t i);
float get(size_t i) {
float temp;
if(backend_->getDevice().type == DeviceType::gpu)
gpu::copy(backend_, data() + i, data() + i + 1, &temp);
else
std::copy(data() + i, data() + i + 1, &temp);
return temp;
}
void set(size_t i, float value);
void set(size_t i, float value) {
if(backend_->getDevice().type == DeviceType::gpu)
gpu::copy(backend_, &value, &value + 1, data() + i);
else
std::copy(&value, &value + 1, data() + i);
}
void get(std::vector<float>& v);
void get(std::vector<float> &v) {
v.resize(size());
if(backend_->getDevice().type == DeviceType::gpu)
gpu::copy(backend_, data(), data() + size(), v.data());
else
std::copy(data(), data() + size(), v.data());
}
void set(float value);
void set(const std::vector<float> &v) {
if(backend_->getDevice().type == DeviceType::gpu)
gpu::copy(backend_, v.data(), v.data() + v.size(), data());
else
std::copy(v.data(), v.data() + v.size(), data());
}
void set(const std::vector<float>& v);
void set(float value) {
if(backend_->getDevice().type == DeviceType::gpu)
gpu::fill(backend_, data(), data() + size(), value);
else
std::fill(data(), data() + size(), value);
}
void setSparse(const std::vector<size_t>& k, const std::vector<float>& v);
void setSparse(const std::vector<size_t> &k,
const std::vector<float> &v) {
if(backend_->getDevice().type == DeviceType::gpu) {
gpu::setSparse(backend_, k, v, data());
} else {
for(int i = 0; i < k.size(); ++i)
data()[k[i]] = v[i];
}
}
void copyFrom(Tensor);
void copyFrom(Tensor in) {
if(in->getBackend()->getDevice().type == DeviceType::gpu ||
backend_->getDevice().type == DeviceType::gpu)
gpu::copy(backend_, in->data(), in->data() + in->size(), data());
else
std::copy(in->data(), in->data() + in->size(), data());
}
std::string debug() {
std::stringstream strm;
assert(shape_.size());
strm << shape_;
strm << " device=" << backend_->getDevice();
strm << " ptr=" << (size_t)memory_->data();
strm << " bytes=" << memory_->size();
strm << std::endl;
// values
size_t totSize = shape_.elements();
std::vector<float> values(totSize);
get(values);
size_t dispCols = 5;
strm << std::fixed << std::setprecision(8) << std::setfill(' ');
for(int i = 0; i < values.size(); ++i) {
std::vector<int> dims;
shape().dims(i, dims);
bool disp = true;
for(int j = 0; j < dims.size(); ++j)
disp = disp && (dims[j] < dispCols || dims[j] >= shape()[j] - dispCols);
if(disp) {
if(dims.back() == 0) {
bool par = true;
std::vector<std::string> p;
for(int j = dims.size() - 1; j >= 0; --j) {
if(dims[j] != 0)
par = false;
p.push_back(par ? "[" : " ");
}
for(auto it = p.rbegin(); it != p.rend(); ++it)
strm << *it;
strm << " ";
}
strm << std::setw(12)
<< values[i]
<< " ";
if(dims.back() + 1 == shape().back()) {
for(int j = dims.size() - 1; j >= 0; --j) {
if(dims[j] + 1 != shape()[j])
break;
strm << "]";
}
strm << std::endl;
}
bool prev = true;
for(int j = dims.size() - 1; j >= 0; --j) {
if(j < dims.size() - 1)
prev = prev && dims[j + 1] + 1 == shape()[j + 1];
if(prev && dims[j] + 1 == dispCols && shape()[j] > 2 * dispCols) {
if(j < dims.size() - 1)
for(int k = 0; k <= j; ++k)
strm << " ";
strm << "... ";
if(j < dims.size() - 1)
strm << std::endl;
break;
}
}
}
}
strm << std::endl;
return strm.str();
}
std::string debug();
};
typedef std::shared_ptr<TensorBase> Tensor;
Tensor operator<<(Tensor t, const std::vector<float>& v);
Tensor operator>>(Tensor t, std::vector<float>& v);
static Tensor operator<<(Tensor t, const std::vector<float> &v) {
t->set(v);
return t;
}
static Tensor operator>>(Tensor t, std::vector<float> &v) {
t->get(v);
return t;
}
}
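Usage is unchanged on either device type; a small sketch (not from the commit), assuming t is a Tensor obtained from a TensorAllocator:
// Sketch: the inlined set/get above route through gpu::copy/gpu::fill on GPU backends
// and std::copy/std::fill on CPU backends, so callers stay device-agnostic.
std::vector<float> v = {1.f, 2.f, 3.f, 4.f};
t << v;                   // TensorBase::set(const std::vector<float>&)
float first = t->get(0);  // 1.f
t >> v;                   // TensorBase::get(std::vector<float>&)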

View File

@ -5,7 +5,6 @@
#include "common/definitions.h"
#include "tensors/allocator.h"
#include "tensors/device_gpu.h"
#include "tensors/tensor.h"
namespace marian {
@ -17,11 +16,13 @@ private:
const size_t GROW = CHUNK * MBYTE;
const size_t ALIGN = 256;
Ptr<Allocator<DeviceGPU>> allocator_;
Ptr<Backend> backend_;
Ptr<Allocator> allocator_;
public:
TensorAllocator(DeviceId deviceId)
: allocator_(New<Allocator<DeviceGPU>>(deviceId, 0, GROW, ALIGN)) {}
TensorAllocator(Ptr<Backend> backend)
: backend_(backend),
allocator_(New<Allocator>(backend_->getDevice(), 0, GROW, ALIGN)) {}
~TensorAllocator() { clear(); }
@ -41,11 +42,17 @@ public:
void reserveExact(size_t bytes = 0) {
size_t mbytes = bytes / MBYTE;
LOG(info,
"[memory] Reserving {} MB, device {}",
mbytes,
allocator_->getDevice());
if(mbytes == 0) {
LOG(info,
"[memory] Reserving {} B, device {}",
bytes,
allocator_->getDevice());
} else {
LOG(info,
"[memory] Reserving {} MB, device {}",
mbytes,
allocator_->getDevice());
}
allocator_->reserve(bytes);
}
@ -59,7 +66,7 @@ public:
if(!t || t->shape() != shape) {
int size = shape.elements();
auto mem = allocator_->alloc<float>(size);
t = Tensor(new TensorBase(mem, shape, allocator_->getDevice()));
t = Tensor(new TensorBase(mem, shape, backend_));
}
}
@ -68,11 +75,11 @@ public:
Tensor asTensor() {
auto mem = allocator_->memory();
int size = mem->size() / sizeof(float);
return Tensor(new TensorBase(mem, {1, size}, allocator_->getDevice()));
return Tensor(new TensorBase(mem, {1, size}, backend_));
}
size_t size() { return allocator_->size() / sizeof(float); }
Ptr<Allocator<DeviceGPU>> allocator() { return allocator_; }
Ptr<Allocator> allocator() { return allocator_; }
};
}

View File

@ -1,82 +0,0 @@
#pragma once
// This file is part of the Marian toolkit.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include <cstring>
#include "tensors/tensor.h"
namespace marian {
class TensorCPU : public TensorBase {
public:
TensorCPU(float* data, Shape shape) : TensorBase(data, shape) {}
float get(size_t i) { return data_[i]; }
void set(size_t i, float value) { data_[i] = value; }
void get(std::vector<float>& v) {
v.resize(size());
std::copy(data_, data_ + size(), v.begin());
}
void set(float value) { std::fill(data_, data_ + size(), value); }
void set(const std::vector<float>& v) {
std::copy(v.begin(), v.end(), data_);
}
};
class DeviceCPU {
private:
float* data_;
size_t size_
public : DeviceCPU()
: data_(0), size_(0) {}
~DeviceCPU() {
if(data_)
delete[] data_;
}
typedef TensorCPU tensor_type;
void reserve(size_t size) {
ABORT_IF(size < size_, "New size must be larger than old size");
float* temp = new float[size];
if(data_) {
std::memcpy(temp, data_, size_ * sizeof(float));
delete[] data_;
}
data_ = temp;
size_ = size;
}
float* data() { return data_; }
size_t capacity() { return size_; }
};
}

View File

@ -17,9 +17,9 @@ endforeach(test)
# Testing apps
add_executable(logger_test logger_test.cpp)
add_executable(dropout_test dropout_test.cpp)
#cuda_add_executable(bn_test bn_test.cu)
cuda_add_executable(pooling_test pooling_test.cu)
cuda_add_executable(dropout_test dropout_test.cu)
#cuda_add_executable(marian_test marian_test.cu)
cuda_add_executable(tensor_test tensor_test.cu)

View File

@ -5,55 +5,11 @@
#include "3rd_party/exception.h"
#include "tensors/allocator.h"
#include "tensors/device_gpu.h"
class DeviceCPU {
private:
uint8_t* data_;
size_t size_;
size_t alignment_;
public:
DeviceCPU(size_t device, size_t alignment = 256)
: data_(0), size_(0), alignment_(alignment) {}
~DeviceCPU() { delete[] data_; }
size_t align(size_t size) {
return ceil(size / (float)alignment_) * alignment_;
}
void reserve(size_t size) {
size = align(size);
ABORT_IF(size < size_, "New size must be larger than old size");
if(data_) {
// Allocate memory by going through host memory
uint8_t *temp = new uint8_t[size_];
std::copy(data_, data_ + size_, temp);
std::fill(data_, data_ + size_, 0);
delete[] data_;
data_ = new uint8_t[size];
std::copy(temp, temp + size_, data_);
delete[] temp;
} else {
data_ = new uint8_t[size];
}
size_ = size;
}
uint8_t* data() { return data_; }
size_t size() { return size_; }
size_t getDevice() { return 0; }
};
int main(int argc, char** argv) {
using namespace marian;
auto a = New<Allocator<DeviceGPU>>({0, DeviceType::gpu}, 0, 30000, 256);
auto a = New<Allocator>({0, DeviceType::gpu}, 0, 30000, 256);
std::cerr << "Size: " << a->size() << std::endl;
auto mem1 = a->alloc<int>(100000);

View File

@ -1,13 +1,9 @@
#include <cuda.h>
#include <curand.h>
#include <stdio.h>
#include <stdlib.h>
#include <boost/chrono.hpp>
#include <boost/timer/timer.hpp>
#include <vector>
#include "common/config.h"
#include "graph/node_initializers.h"
#include "marian.h"
using namespace marian;
@ -16,8 +12,11 @@ using namespace keywords;
int main(int argc, char** argv) {
auto c = New<Config>(argc, argv);
auto type = c->get<bool>("cpu") ? DeviceType::cpu : DeviceType::gpu;
DeviceId deviceId{0, type};
auto g = New<ExpressionGraph>();
g->setDevice({0, DeviceType::gpu});
g->setDevice(deviceId);
g->reserveWorkspaceMB(512);
for(int i = 0; i < 10; ++i) {

View File

@ -53,3 +53,53 @@ TEST_CASE("Expression graph can be initialized with constant values",
REQUIRE(values == v);
}
}
TEST_CASE("Graph device is set (cpu)", "[graph]") {
auto graph = New<ExpressionGraph>();
graph->setDevice({0, DeviceType::cpu});
DeviceId testId{0, DeviceType::cpu};
REQUIRE(graph->getDevice() == testId);
}
TEST_CASE("Expression graph can be initialized with constant values (cpu)",
"[graph]") {
auto graph = New<ExpressionGraph>();
graph->setDevice({0, DeviceType::cpu});
graph->reserveWorkspaceMB(4);
std::vector<float> values;
SECTION("initializing with zero (cpu)") {
graph->clear();
values.clear();
auto zeros = graph->param("0s", {2, 5}, keywords::init = inits::zeros);
graph->forward();
zeros->val()->get(values);
REQUIRE(values == std::vector<float>(10, 0.0f));
}
SECTION("initializing with ones (cpu)") {
graph->clear();
values.clear();
auto ones = graph->param("1s", {2, 5}, keywords::init = inits::ones);
graph->forward();
ones->val()->get(values);
REQUIRE(values == std::vector<float>(10, 1.0f));
}
SECTION("initializing from vector (cpu)") {
graph->clear();
values.clear();
std::vector<float> v({1, 2, 3, 4, 5, 6});
auto vals = graph->param("vs", {2, 3}, keywords::init = inits::from_vector(v));
graph->forward();
REQUIRE(values.empty());
vals->val()->get(values);
REQUIRE(values == v);
}
}

View File

@ -93,12 +93,12 @@ void AsyncGraphGroup::init(Ptr<data::Batch> batch) {
int pos = 0;
// parameter sharding
for(auto device : devices_) {
for(auto graph : graphs_) {
int __size__ = min(shardSize_, totalSize);
totalSize -= __size__;
Tensor param;
Ptr<TensorAllocator> allocator = New<TensorAllocator>(DeviceId{device, DeviceType::gpu});
Ptr<TensorAllocator> allocator = New<TensorAllocator>(graph->getBackend());
allocator->reserveExact(__size__ * sizeof(float));
allocator->allocate(param, {1, __size__});
paramsAlloc_.push_back(allocator);
@ -112,11 +112,11 @@ void AsyncGraphGroup::init(Ptr<data::Batch> batch) {
if(grads_.size() == 0) {
int totalSize = graphs_[0]->params()->vals()->size();
for(auto device : devices_) {
for(auto graph : graphs_) {
int __size__ = min(shardSize_, totalSize);
totalSize -= __size__;
Tensor grad_;
Ptr<TensorAllocator> allocator_ = New<TensorAllocator>(DeviceId{device, DeviceType::gpu});
Ptr<TensorAllocator> allocator_ = New<TensorAllocator>(graph->getBackend());
allocator_->reserveExact(__size__ * sizeof(float));
allocator_->allocate(grad_, {1, __size__});
@ -129,11 +129,11 @@ void AsyncGraphGroup::init(Ptr<data::Batch> batch) {
int totalSize = graphs_[0]->params()->vals()->size();
int i = 0;
for(auto device : devices_) {
for(auto graph : graphs_) {
int __size__ = min(shardSize_, totalSize);
totalSize -= __size__;
Tensor paramAvg;
Ptr<TensorAllocator> allocator = New<TensorAllocator>(DeviceId{device, DeviceType::gpu});
Ptr<TensorAllocator> allocator = New<TensorAllocator>(graph->getBackend());
allocator->reserveExact(__size__ * sizeof(float));
allocator->allocate(paramAvg, {1, __size__});
@ -187,7 +187,7 @@ void AsyncGraphGroup::execute(Ptr<data::Batch> batch) {
Tensor gradients;
if(tau_ > 1) {
if(t == 0) {
accAlloc = New<TensorAllocator>(graph->getDevice());
accAlloc = New<TensorAllocator>(graph->getBackend());
accAlloc->reserveExact(graph->params()->grads()->memory()->size());
accAlloc->allocate(accGradients, graph->params()->grads()->shape());
accGradients->set(0);

View File

@ -8,9 +8,9 @@
namespace marian {
Tensor AsyncGraphGroupDrop::newTensor(int size, DeviceId deviceId) {
Tensor AsyncGraphGroupDrop::newTensor(int size, Ptr<Backend> backend) {
Tensor t;
Ptr<TensorAllocator> allocator_ = New<TensorAllocator>(deviceId);
Ptr<TensorAllocator> allocator_ = New<TensorAllocator>(backend);
allocator_->reserveExact(size * sizeof(float));
allocator_->allocate(t, {1, size});
allocators.push_back(allocator_);
@ -79,14 +79,13 @@ void AsyncGraphGroupDrop::pushGradients(Tensor newGrads,
size_t batch_words,
int device_id) {
if(pushStep_[device_id]++ <= dropping_warmup) {
std::cout<<"WARMUP"<<std::endl;
AsyncGraphGroup::pushGradients(newGrads, batch_words, device_id);
return;
}
// get the sparse gradient
pushDropper_[device_id]->dropGraph(
newGrads, pushSparseGradient_[device_id],
newGrads, pushSparseGradient_[device_id],
droping_rate, dropping_momentum);
SparseTensor newSparseGrads = pushSparseGradient_[device_id];
@ -146,13 +145,12 @@ void AsyncGraphGroupDrop::init(Ptr<data::Batch> batch) {
fetchStep_.push_back(0);
pushStep_.push_back(0);
size_t device = devices_[i];
// temporary tensor to compute parameter delta before fetching
paramsDelta_.push_back(newTensor(shardSize, {device, DeviceType::gpu}));
paramsDelta_.push_back(newTensor(shardSize, graphs_[i]->getBackend()));
// tensors to store local params history
for(int h_id = 0; h_id < devices_.size(); h_id++) {
Tensor tmp = newTensor(params_[i]->size(), {device, DeviceType::gpu});
Tensor tmp = newTensor(params_[i]->size(), graphs_[i]->getBackend());
tmp->copyFrom(params_[i]);
paramsLocal_[h_id].push_back(tmp);
}
@ -162,23 +160,23 @@ void AsyncGraphGroupDrop::init(Ptr<data::Batch> batch) {
// N-dropper for fetch
std::vector<GradientDrop> tmpDropper;
for(int i = 0; i < devices_.size(); i++)
for(auto device : devices_)
tmpDropper.push_back(GradientDrop(new GradientDropBase()));
fetchDropper.push_back(tmpDropper);
// sparsetensor to store sparsified gradients per-device
pushSparseGradient_.push_back(
SparseTensor(new SparseTensorBase(sparseCap, {device, DeviceType::gpu})));
SparseTensor(new SparseTensorBase(sparseCap, graphs_[i]->getBackend())));
pushShardedSparseGradient_.push_back(
SparseTensor(new SparseTensorBase(sparseCap, {device, DeviceType::gpu})));
SparseTensor(new SparseTensorBase(sparseCap, graphs_[i]->getBackend())));
fetchSparseGradient_.push_back(SparseTensor(
new SparseTensorBase(sparseCap / devices_.size(), {device, DeviceType::gpu})));
new SparseTensorBase(sparseCap / devices_.size(), graphs_[i]->getBackend())));
std::vector<SparseTensor> tmp;
for(int i = 0; i < devices_.size(); i++)
tmp.push_back(SparseTensor(
new SparseTensorBase(sparseCap / devices_.size(), {device, DeviceType::gpu})));
new SparseTensorBase(sparseCap / devices_.size(), graphs_[i]->getBackend())));
fetchShardedSparseGradient_.push_back(tmp);
}

View File

@ -31,7 +31,7 @@ class AsyncGraphGroupDrop : public AsyncGraphGroup {
std::vector<Ptr<TensorAllocator>> allocators;
Tensor newTensor(int size, DeviceId deviceId);
Tensor newTensor(int size, Ptr<Backend> backend);
protected:
void init(Ptr<data::Batch> batch);

View File

@ -19,9 +19,9 @@ void MultiNodeGraphGroup::setScheduler(Ptr<Scheduler> scheduler) {
/**
* Allocate new tensor on given GPU and store allocator.
*/
Tensor MultiNodeGraphGroup::newTensor(int size, DeviceId deviceId) {
Tensor MultiNodeGraphGroup::newTensor(int size, Ptr<Backend> backend) {
Tensor t;
Ptr<TensorAllocator> allocator = New<TensorAllocator>(deviceId);
Ptr<TensorAllocator> allocator = New<TensorAllocator>(backend);
allocator->reserveExact(size * sizeof(float));
allocator->allocate(t, {1, size});
allocators_.push_back(allocator);
@ -148,14 +148,12 @@ void MultiNodeGraphGroup::initClientCommOverlapVars() {
void MultiNodeGraphGroup::initClientCommOverlapGpuTensors() {
size_t modelSize = clientGraphs_[0]->params()->vals()->size();
for(int client = 0; client < devices_.size(); client++) {
DeviceId deviceId{devices_[client], DeviceType::gpu};
// Communication overlap buffer (for grads + params)
Tensor commOverlapBuffer = newTensor(modelSize, deviceId);
Tensor commOverlapBuffer = newTensor(modelSize, clientGraphs_[client]->getBackend());
commOverlapBuffer->copyFrom(clientGraphs_[0]->params()->vals());
clientCommOverlapBuffersGPU_.push_back(commOverlapBuffer);
// Gradients local sum buffer
Tensor sumGrads = newTensor(modelSize, deviceId);
Tensor sumGrads = newTensor(modelSize, clientGraphs_[client]->getBackend());
sumGrads->set(0);
clientSummedGradsGPU.push_back(sumGrads);
// Local optimizer to apply summed gradients
@ -207,12 +205,11 @@ void MultiNodeGraphGroup::calculateShardSizes() {
void MultiNodeGraphGroup::initShardGpuTensors() {
size_t offset = 0;
for(int shard = 0; shard < devices_.size(); shard++) {
DeviceId deviceId{devices_[shard], DeviceType::gpu};
Tensor gpuParams = newTensor(shardSizes_[shard], deviceId);
Tensor gpuParams = newTensor(shardSizes_[shard], clientGraphs_[shard]->getBackend());
gpuParams->copyFrom(clientGraphs_[0]->params()->vals()->subtensor(
offset, shardSizes_[shard]));
shardParams_.push_back(gpuParams);
shardGrads_.push_back(newTensor(shardSizes_[shard], deviceId));
shardGrads_.push_back(newTensor(shardSizes_[shard], clientGraphs_[shard]->getBackend()));
}
}

View File

@ -217,7 +217,7 @@ protected:
/**
* Allocate new tensor on given GPU and store allocator.
*/
Tensor newTensor(int size, DeviceId deviceId);
Tensor newTensor(int size, Ptr<Backend> backend);
/**
* Setup training environment and launch server thread and (if enabled) client

View File

@ -60,7 +60,7 @@ void SyncGraphGroup::execute(Ptr<data::Batch> batch) {
for(auto graph : graphs_) {
int __size__ = min(shardSize_, totalSize);
auto paramsAlloc = New<TensorAllocator>(graph->getDevice());
auto paramsAlloc = New<TensorAllocator>(graph->getBackend());
paramsAllocs_.push_back(paramsAlloc);
paramsAlloc->reserveExact(3 * __size__ * sizeof(float));
@ -87,7 +87,7 @@ void SyncGraphGroup::execute(Ptr<data::Batch> batch) {
int __size__ = min(shardSize_, totalSize);
totalSize -= __size__;
Tensor paramAvg;
auto allocator = New<TensorAllocator>(graph->getDevice());
auto allocator = New<TensorAllocator>(graph->getBackend());
allocator->reserveExact(__size__ * sizeof(float));
allocator->allocate(paramAvg, {1, __size__});

View File

@ -45,9 +45,9 @@ __global__ void gFindSubtensor(int* indices,
resultEnd[0] = idx;
}
SparseTensorBase::SparseTensorBase(int capacity, DeviceId deviceId)
: deviceId_(deviceId), capacity_(capacity) {
cudaSetDevice(deviceId_.no);
SparseTensorBase::SparseTensorBase(int capacity, Ptr<Backend> backend)
: backend_(backend), capacity_(capacity) {
cudaSetDevice(backend_->getDevice().no);
CUDA_CHECK(cudaMalloc(&data_, sizeof(float) * capacity));
CUDA_CHECK(cudaMalloc(&indices_, sizeof(int) * capacity));
@ -58,8 +58,8 @@ SparseTensorBase::SparseTensorBase(int capacity, DeviceId deviceId)
SparseTensorBase::SparseTensorBase(float* data,
int* indices,
int size,
DeviceId deviceId)
: deviceId_(deviceId) {
Ptr<Backend> backend)
: backend_(backend) {
data_ = data;
indices_ = indices;
size_ = size;
@ -93,7 +93,7 @@ void SparseTensorBase::copyFrom(float* data,
size_ = size;
if(size == 0)
return;
cudaSetDevice(deviceId_.no);
cudaSetDevice(backend_->getDevice().no);
cudaMemcpy(data_, data, size * sizeof(float), cudaMemcpyDefault);
if(!data_only)
@ -107,8 +107,8 @@ void SparseTensorBase::copyFrom(std::shared_ptr<SparseTensorBase> t,
copyFrom(t->data(), t->indices(), t->size(), data_only);
}
DeviceId SparseTensorBase::getDevice() {
return deviceId_;
Ptr<Backend> SparseTensorBase::getBackend() {
return backend_;
}
void SparseTensorBase::setSize(int size) {
@ -117,7 +117,7 @@ void SparseTensorBase::setSize(int size) {
// return the dense representation of this tensor
void SparseTensorBase::toDense(Tensor t, int offset) {
cudaSetDevice(deviceId_.no);
cudaSetDevice(backend_->getDevice().no);
int threads = 512;
int blocks = 1 + size_ / threads;
t->set(0);
@ -127,7 +127,7 @@ void SparseTensorBase::toDense(Tensor t, int offset) {
}
void SparseTensorBase::scatterAdd(Tensor t, int offset) {
cudaSetDevice(deviceId_.no);
cudaSetDevice(backend_->getDevice().no);
cudaStreamSynchronize(0);
int threads = 512;
int blocks = 1 + size_ / threads;
@ -139,7 +139,7 @@ void SparseTensorBase::scatterAdd(Tensor t, int offset) {
std::shared_ptr<SparseTensorBase> SparseTensorBase::subtensor(int pos,
int size,
int idx) {
cudaSetDevice(deviceId_.no);
cudaSetDevice(backend_->getDevice().no);
cudaStreamSynchronize(0);
int* start = gstart_ + idx;
int* end = gend_ + idx;
@ -165,6 +165,6 @@ std::shared_ptr<SparseTensorBase> SparseTensorBase::subtensor(int pos,
int subtensorSize = std::max(0, endOffset - startOffset + 1);
cudaStreamSynchronize(0);
return std::shared_ptr<SparseTensorBase>(new SparseTensorBase(
data_ + startOffset, indices_ + startOffset, subtensorSize, deviceId_));
data_ + startOffset, indices_ + startOffset, subtensorSize, backend_));
}
}

View File

@ -10,15 +10,15 @@ class SparseTensorBase : public std::enable_shared_from_this<SparseTensorBase> {
int* indices_;
int size_;
int capacity_;
DeviceId deviceId_;
Ptr<Backend> backend_;
int* d_is_unsorted;
int* gstart_;
int* gend_;
public:
SparseTensorBase(int capacity, DeviceId deviceId);
SparseTensorBase(float* data, int* indices, int size, DeviceId deviceId);
SparseTensorBase(int capacity, Ptr<Backend> backend);
SparseTensorBase(float* data, int* indices, int size, Ptr<Backend> backend);
~SparseTensorBase() {}
@ -43,7 +43,7 @@ public:
void scatterAdd(Tensor t, int offset = 0);
std::shared_ptr<SparseTensorBase> subtensor(int pos, int size, int idx);
DeviceId getDevice();
Ptr<Backend> getBackend();
void toDense(Tensor t, int offset);
};