Mirror of https://github.com/marian-nmt/marian.git
Merge branch 'master' into save-adam-params
This commit is contained in: commit 0f16ed97f3
@@ -58,12 +58,12 @@ else(Tcmalloc_FOUND)
   message(WARNING "Cannot find TCMalloc library. Continuing.")
 endif(Tcmalloc_FOUND)
 
-find_package(MPI)
-if(MPI_FOUND)
-  include_directories(${MPI_INCLUDE_PATH})
-  set(EXT_LIBS ${EXT_LIBS} ${MPI_LIBRARIES})
-  add_definitions(-DMPI_FOUND=1)
-endif(MPI_FOUND)
+#find_package(MPI)
+#if(MPI_FOUND)
+#  include_directories(${MPI_INCLUDE_PATH})
+#  set(EXT_LIBS ${EXT_LIBS} ${MPI_LIBRARIES})
+#  add_definitions(-DMPI_FOUND=1)
+#endif(MPI_FOUND)
 
 find_package(ZLIB)
 if(ZLIB_FOUND)
@@ -8,19 +8,22 @@ cuda_add_library(marian
   3rd_party/cnpy/cnpy.cpp
   3rd_party/exception.cpp
   3rd_party/svd/svd.cpp
-  graph/expression_graph.cpp
-  graph/expression_operators.cu
-  graph/node.cu
-  graph/node_operators.cu
-  graph/node_initializers.cu
-  tensors/tensor.cu
-  tensors/device_gpu.cu
-  tensors/memory_piece.cu
+  # tensors/tensor.cu
+  tensors/device.cu
+  tensors/device.cpp
+  tensors/backend.cpp
+  tensors/gpu/algorithm.cu
+  tensors/gpu/dropout.cu
+  tensors/cpu/dropout.cpp
   kernels/tensor_operators.cu
   kernels/cudnn_wrappers.cu
-  backend/gpu/dropout.cu
-  backend/cpu/dropout.cpp
+  graph/expression_graph.cpp
+  graph/expression_operators.cu
+  graph/node.cpp
+  graph/node_operators.cu
+  graph/node_initializers.cu
   layers/convolution.cu
 
   rnn/cells.cu
   optimizers/clippers.cu
   optimizers/optimizers.cu
@@ -91,7 +94,7 @@ endforeach(exec)
 #set_target_properties(align2steps PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}")
 
 if(PYTHONLIBS_FOUND)
-  add_subdirectory(python)
+  # add_subdirectory(python)
 endif(PYTHONLIBS_FOUND)
 
 if(COMPILE_TESTS)
@@ -1,14 +0,0 @@
-#include <stdio.h>
-#include <stdlib.h>
-
-#include "backend/dispatch.h"
-
-namespace marian {
-namespace cpu {
-
-void Dropout(Ptr<Backend> backend, Tensor tensor, float p) {
-  ABORT("Not implemented");
-}
-
-}
-}
@@ -1,39 +0,0 @@
-#pragma once
-
-#include "common/definitions.h"
-#include "graph/backend.h"
-#include "tensors/tensor.h"
-
-#define DISPATCH1(Function, Arg1)                      \
-  namespace gpu {                                      \
-    void Function(Ptr<Backend>, Arg1);                 \
-  }                                                    \
-  namespace cpu {                                      \
-    void Function(Ptr<Backend>, Arg1);                 \
-  }                                                    \
-  void Function(Ptr<Backend> backend, Arg1 arg1) {     \
-    if(backend->getDevice().type == DeviceType::gpu)   \
-      gpu::Function(backend, arg1);                    \
-    else                                               \
-      cpu::Function(backend, arg1);                    \
-  }
-
-#define DISPATCH2(Function, Arg1, Arg2)                                      \
-  namespace gpu {                                                            \
-    void Function(Ptr<Backend>, Arg1, Arg2);                                 \
-  }                                                                          \
-  namespace cpu {                                                            \
-    void Function(Ptr<Backend>, Arg1, Arg2);                                 \
-  }                                                                          \
-  static inline void Function(Ptr<Backend> backend, Arg1 arg1, Arg2 arg2) {  \
-    if(backend->getDevice().type == DeviceType::gpu)                         \
-      gpu::Function(backend, arg1, arg2);                                    \
-    else                                                                     \
-      cpu::Function(backend, arg1, arg2);                                    \
-  }
-
-namespace marian {
-
-DISPATCH2(Dropout, Tensor, float)
-
-}
@@ -1,8 +1,7 @@
 #include <sstream>
-#include "graph/backend_gpu.h"
 #include "graph/expression_graph.h"
 
-#include "backend/dispatch.h"
+#include "tensors/dispatch.h"
 
 namespace marian {
 
@@ -11,19 +10,17 @@ ExpressionGraph::ExpressionGraph(bool inference)
 
 void ExpressionGraph::setDevice(DeviceId deviceId) {
   if(!backend_) {
-    backend_ = New<BackendGPU>(deviceId, Config::seed);
-
+    backend_ = BackendByDevice(deviceId, Config::seed);
     params_ = New<Parameters>();
-    params_->init(backend_->getDevice());
-
-    tensors_ = New<TensorAllocator>(backend_->getDevice());
+    params_->init(backend_);
+    tensors_ = New<TensorAllocator>(backend_);
   }
 }
 
 Expr ExpressionGraph::dropout(float prob, Shape shape) {
   return Expression<ConstantNode>(shared_from_this(),
                                   keywords::init = [prob, this](Tensor t) {
-                                    Dropout(backend_, t, prob);
+                                    Dropout(t, prob);
                                   },
                                   keywords::shape = shape);
 }
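Graph setup is now symmetric across device types: setDevice() asks BackendByDevice() (added in src/tensors/backend.cpp further down) for a gpu:: or cpu:: backend, and both the parameters and the tensor allocator are initialized from that backend rather than from a bare DeviceId. A minimal driver sketch, hypothetical and not part of the commit, assuming the DeviceId/DeviceType types from common/definitions.h:

```cpp
#include "graph/expression_graph.h"

// Hypothetical driver: the same call now works for either device type,
// because setDevice() dispatches on deviceId.type internally.
void buildGraphs() {
  auto gpuGraph = New<ExpressionGraph>();
  gpuGraph->setDevice({0, DeviceType::gpu});  // backend_ becomes gpu::Backend

  auto cpuGraph = New<ExpressionGraph>();
  cpuGraph->setDevice({0, DeviceType::cpu});  // backend_ becomes cpu::Backend
}
```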
@@ -8,8 +8,8 @@
 #include "common/definitions.h"
 
 #include "tensors/tensor_allocator.h"
+#include "tensors/backend.h"
 
-#include "graph/backend.h"
 #include "graph/parameters.h"
 #include "graph/chainable.h"
 #include "graph/node_operators.h"
@@ -306,7 +306,7 @@ public:
     tensors_->free(t);
   }
 
-  Ptr<Allocator<DeviceGPU>> allocator() { return tensors_->allocator(); }
+  Ptr<Allocator> allocator() { return tensors_->allocator(); }
 
   void clear() {
     // clear everything apart from parameters
@@ -1,4 +1,4 @@
-#include "graph/backend_gpu.h"
+#include "tensors/backend.h"
 #include "graph/expression_graph.h"
 #include "graph/node.h"
 
@@ -5,10 +5,11 @@
 #include <thread>
 
 #include "common/keywords.h"
-#include "graph/backend.h"
-#include "graph/chainable.h"
+#include "tensors/backend.h"
 #include "tensors/tensor.h"
 
+#include "graph/chainable.h"
+
 namespace marian {
 
 class Node : public Chainable<Tensor>,
@@ -2,10 +2,10 @@
 
 #include <thread>
 
-#include "graph/backend_gpu.h"
+#include "tensors/gpu/backend.h"
 #include "graph/node.h"
-#include "kernels/tensor_operators.h"
 #include "functional/functional.h"
+#include "kernels/tensor_operators.h"
 #include "kernels/cudnn_wrappers.h"
 
 namespace marian {
@@ -54,7 +54,7 @@ public:
   NodeOps forwardOps() {
     // C = alpha * dot(op(A), op(B))
     return {NodeOp(Prod(
-        std::static_pointer_cast<BackendGPU>(getBackend())->getCublasHandle(),
+        std::static_pointer_cast<gpu::Backend>(getBackend())->getCublasHandle(),
         val_,
         child(0)->val(),
         child(1)->val(),
@@ -72,7 +72,7 @@ public:
     // to sum gradients from different graph parts
 
     if(!transA_ && transB_)
-      return {NodeOp(Prod(std::static_pointer_cast<BackendGPU>(getBackend())
+      return {NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend())
                               ->getCublasHandle(),
                           child(0)->grad(),
                           adj_,
@@ -81,7 +81,7 @@ public:
                           false,
                           1.0,
                           scalar_)),
-              NodeOp(Prod(std::static_pointer_cast<BackendGPU>(getBackend())
+              NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend())
                               ->getCublasHandle(),
                           child(1)->grad(),
                           adj_,
@@ -92,7 +92,7 @@ public:
                           scalar_))};
 
     if(transA_ && !transB_)
-      return {NodeOp(Prod(std::static_pointer_cast<BackendGPU>(getBackend())
+      return {NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend())
                               ->getCublasHandle(),
                           child(0)->grad(),
                           child(1)->val(),
@@ -101,7 +101,7 @@ public:
                           true,
                           1.0,
                           scalar_)),
-              NodeOp(Prod(std::static_pointer_cast<BackendGPU>(getBackend())
+              NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend())
                               ->getCublasHandle(),
                           child(1)->grad(),
                           child(0)->val(),
@@ -112,7 +112,7 @@ public:
                           scalar_))};
 
     if(transA_ && transB_)
-      return {NodeOp(Prod(std::static_pointer_cast<BackendGPU>(getBackend())
+      return {NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend())
                               ->getCublasHandle(),
                           child(0)->grad(),
                           child(1)->val(),
@@ -121,7 +121,7 @@ public:
                           true,
                           1.0,
                           scalar_)),
-              NodeOp(Prod(std::static_pointer_cast<BackendGPU>(getBackend())
+              NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend())
                               ->getCublasHandle(),
                           child(1)->grad(),
                           adj_,
@@ -131,7 +131,7 @@ public:
                           1.0,
                           scalar_))};
 
-    return {NodeOp(Prod(std::static_pointer_cast<BackendGPU>(getBackend())
+    return {NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend())
                             ->getCublasHandle(),
                         child(0)->grad(),
                         adj_,
@@ -140,7 +140,7 @@ public:
                         true,
                         1.0,
                         scalar_)),
-            NodeOp(Prod(std::static_pointer_cast<BackendGPU>(getBackend())
+            NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend())
                            ->getCublasHandle(),
                         child(1)->grad(),
                         child(0)->val(),
@@ -198,7 +198,7 @@ public:
     using namespace functional;
     return {
         NodeOp(Prod(
-            std::static_pointer_cast<BackendGPU>(getBackend())->getCublasHandle(),
+            std::static_pointer_cast<gpu::Backend>(getBackend())->getCublasHandle(),
             val_,
             child(0)->val(),
             child(1)->val(),
@@ -219,7 +219,7 @@ public:
     using namespace functional;
 
     if(!transA_ && transB_)
-      return {NodeOp(Prod(std::static_pointer_cast<BackendGPU>(getBackend())
+      return {NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend())
                               ->getCublasHandle(),
                           child(0)->grad(),
                           adj_,
@@ -228,7 +228,7 @@ public:
                           false,
                           1.0,
                           scalar_)),
-              NodeOp(Prod(std::static_pointer_cast<BackendGPU>(getBackend())
+              NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend())
                               ->getCublasHandle(),
                           child(1)->grad(),
                           adj_,
@@ -240,7 +240,7 @@ public:
               NodeOp(Add(_1, child(2)->grad(), adj_))};
 
     if(transA_ && !transB_)
-      return {NodeOp(Prod(std::static_pointer_cast<BackendGPU>(getBackend())
+      return {NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend())
                               ->getCublasHandle(),
                           child(0)->grad(),
                           child(1)->val(),
@@ -249,7 +249,7 @@ public:
                           true,
                           1.0,
                           scalar_)),
-              NodeOp(Prod(std::static_pointer_cast<BackendGPU>(getBackend())
+              NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend())
                               ->getCublasHandle(),
                           child(1)->grad(),
                           child(0)->val(),
@@ -261,7 +261,7 @@ public:
               NodeOp(Add(_1, child(2)->grad(), adj_))};
 
     if(transA_ && transB_)
-      return {NodeOp(Prod(std::static_pointer_cast<BackendGPU>(getBackend())
+      return {NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend())
                               ->getCublasHandle(),
                           child(0)->grad(),
                           child(1)->val(),
@@ -270,7 +270,7 @@ public:
                           true,
                           1.0,
                           scalar_)),
-              NodeOp(Prod(std::static_pointer_cast<BackendGPU>(getBackend())
+              NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend())
                               ->getCublasHandle(),
                           child(1)->grad(),
                           adj_,
@@ -281,7 +281,7 @@ public:
                           scalar_)),
               NodeOp(Add(_1, child(2)->grad(), adj_))};
 
-    return {NodeOp(Prod(std::static_pointer_cast<BackendGPU>(getBackend())
+    return {NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend())
                             ->getCublasHandle(),
                         child(0)->grad(),
                         adj_,
@@ -290,7 +290,7 @@ public:
                         true,
                         1.0,
                         scalar_)),
-            NodeOp(Prod(std::static_pointer_cast<BackendGPU>(getBackend())
+            NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend())
                            ->getCublasHandle(),
                         child(1)->grad(),
                         child(0)->val(),
@@ -350,7 +350,7 @@ public:
   NodeOps forwardOps() {
     // C = alpha * dot(op(A), op(B))
     return {NodeOp(ProdBatched(
-        std::static_pointer_cast<BackendGPU>(getBackend())->getCublasHandle(),
+        std::static_pointer_cast<gpu::Backend>(getBackend())->getCublasHandle(),
         val_,
         child(0)->val(),
         child(1)->val(),
@@ -369,7 +369,7 @@ public:
 
     if(!transA_ && transB_)
       return {
-          NodeOp(ProdBatched(std::static_pointer_cast<BackendGPU>(getBackend())
+          NodeOp(ProdBatched(std::static_pointer_cast<gpu::Backend>(getBackend())
                                  ->getCublasHandle(),
                              child(0)->grad(),
                              adj_,
@@ -378,7 +378,7 @@ public:
                              false,
                              1.0,
                              scalar_)),
-          NodeOp(ProdBatched(std::static_pointer_cast<BackendGPU>(getBackend())
+          NodeOp(ProdBatched(std::static_pointer_cast<gpu::Backend>(getBackend())
                                  ->getCublasHandle(),
                              child(1)->grad(),
                              adj_,
@@ -390,7 +390,7 @@ public:
 
     if(transA_ && !transB_)
       return {
-          NodeOp(ProdBatched(std::static_pointer_cast<BackendGPU>(getBackend())
+          NodeOp(ProdBatched(std::static_pointer_cast<gpu::Backend>(getBackend())
                                  ->getCublasHandle(),
                              child(0)->grad(),
                              child(1)->val(),
@@ -399,7 +399,7 @@ public:
                              true,
                              1.0,
                              scalar_)),
-          NodeOp(ProdBatched(std::static_pointer_cast<BackendGPU>(getBackend())
+          NodeOp(ProdBatched(std::static_pointer_cast<gpu::Backend>(getBackend())
                                  ->getCublasHandle(),
                              child(1)->grad(),
                              child(0)->val(),
@@ -411,7 +411,7 @@ public:
 
     if(transA_ && transB_)
       return {
-          NodeOp(ProdBatched(std::static_pointer_cast<BackendGPU>(getBackend())
+          NodeOp(ProdBatched(std::static_pointer_cast<gpu::Backend>(getBackend())
                                  ->getCublasHandle(),
                              child(0)->grad(),
                              child(1)->val(),
@@ -420,7 +420,7 @@ public:
                              true,
                              1.0,
                              scalar_)),
-          NodeOp(ProdBatched(std::static_pointer_cast<BackendGPU>(getBackend())
+          NodeOp(ProdBatched(std::static_pointer_cast<gpu::Backend>(getBackend())
                                  ->getCublasHandle(),
                              child(1)->grad(),
                              adj_,
@@ -431,7 +431,7 @@ public:
                              scalar_))};
 
     return {
-        NodeOp(ProdBatched(std::static_pointer_cast<BackendGPU>(getBackend())
+        NodeOp(ProdBatched(std::static_pointer_cast<gpu::Backend>(getBackend())
                                ->getCublasHandle(),
                            child(0)->grad(),
                            adj_,
@@ -440,7 +440,7 @@ public:
                            true,
                            1.0,
                            scalar_)),
-        NodeOp(ProdBatched(std::static_pointer_cast<BackendGPU>(getBackend())
+        NodeOp(ProdBatched(std::static_pointer_cast<gpu::Backend>(getBackend())
                                ->getCublasHandle(),
                            child(1)->grad(),
                            child(0)->val(),
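Every hunk above makes the same one-line change: before pulling out the cuBLAS handle, a node casts its generic Ptr<marian::Backend> down to the concrete GPU type, and only that type's name changed (BackendGPU became gpu::Backend). A small helper could factor out the repetition; this is a hypothetical sketch, not something the commit adds:

```cpp
#include "tensors/gpu/backend.h"

namespace marian {

// Hypothetical convenience wrapper for the recurring downcast.
// Assumes getBackend() returns Ptr<marian::Backend>, as in the hunks above.
static inline cublasHandle_t cublasHandle(Ptr<Backend> backend) {
  return std::static_pointer_cast<gpu::Backend>(backend)->getCublasHandle();
}

}  // namespace marian
```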
@@ -1,10 +1,11 @@
 #pragma once
 
-#include "graph/backend_gpu.h"
 #include "tensors/tensor.h"
+#include "tensors/gpu/backend.h"
 
 #include "graph/node.h"
 #include "kernels/sparse.h"
 #include "kernels/tensor_operators.h"
+#include "tensors/tensor.h"
 #include "functional/functional.h"
 #include "kernels/cudnn_wrappers.h"
@@ -210,21 +211,7 @@ struct TanhNodeOp : public NaryNodeOp {
   const std::string type() { return "tanh"; }
 };
 
-/**
- * Represents a <a
- * href="https://en.wikipedia.org/wiki/Rectifier_(neural_networks)">rectified
- * linear</a> node in an expression graph.
- *
- * This node implements the activation function \f$ f(x) = \max(0, x) \f$ and
- * its derivative:
- * \f[
- *   f^\prime(x) =
- *     \begin{cases}
- *       0 & \text{if } x \leq 0 \\
- *       1 & \text{if } x > 0
- *     \end{cases}
- * \f]
- */
-
 struct ReLUNodeOp : public UnaryNodeOp {
   template <typename... Args>
   ReLUNodeOp(Args... args) : UnaryNodeOp(args...) {}
@@ -876,14 +863,14 @@ public:
   Tensor& val() {
     auto childVal = reshapee_->val();
     val_.reset(
-        new TensorBase(childVal->memory(), shape(), childVal->getDevice()));
+        new TensorBase(childVal->memory(), shape(), childVal->getBackend()));
     return val_;
   };
 
   Tensor& grad() {
     auto childGrad = reshapee_->grad();
     adj_.reset(
-        new TensorBase(childGrad->memory(), shape(), childGrad->getDevice()));
+        new TensorBase(childGrad->memory(), shape(), childGrad->getBackend()));
     return adj_;
   };
 
@@ -952,7 +939,7 @@ public:
     size_t offset = step_ * shape().elements() * sizeof(float);
     auto mem = New<MemoryPiece>(childVal->memory()->data() + offset,
                                 childVal->memory()->size());
-    val_.reset(new TensorBase(mem, shape(), childVal->getDevice()));
+    val_.reset(new TensorBase(mem, shape(), childVal->getBackend()));
     return val_;
   };
 
@@ -961,7 +948,7 @@ public:
     size_t offset = step_ * shape().elements() * sizeof(float);
     auto mem = New<MemoryPiece>(childGrad->memory()->data() + offset,
                                 childGrad->memory()->size());
-    adj_.reset(new TensorBase(mem, shape(), childGrad->getDevice()));
+    adj_.reset(new TensorBase(mem, shape(), childGrad->getBackend()));
     return adj_;
   };
 
@@ -20,9 +20,9 @@ private:
   Ptr<TensorAllocator> grads_;
 
 public:
-  void init(DeviceId deviceId) {
-    vals_ = New<TensorAllocator>(deviceId);
-    grads_ = New<TensorAllocator>(deviceId);
+  void init(Ptr<Backend> backend) {
+    vals_ = New<TensorAllocator>(backend);
+    grads_ = New<TensorAllocator>(backend);
   }
 
   auto begin() -> decltype(params_.begin()) { return params_.begin(); }
@@ -57,7 +57,7 @@ public:
   }
 
   void allocateForward() {
-    if(vals_->size() == 0) {
+    if(!params_.empty() && vals_->size() == 0) {
       vals_->reserveExact(totalCapacity(vals_));
       for(auto p : params_)
         if(!p->val())
@@ -66,7 +66,7 @@ public:
   }
 
   void allocateBackward() {
-    if(grads_->size() == 0) {
+    if(!params_.empty() && grads_->size() == 0) {
       grads_->reserveExact(totalCapacity(grads_));
       for(auto p : params_)
         if(!p->grad())
@@ -17,6 +17,12 @@ inline void gpuAssert(cudaError_t code,
   }
 }
 
+template <typename T>
+void CudaCopy(const T* start, const T* end, T* dest) {
+  CUDA_CHECK(cudaMemcpy((void*)dest, (void*)start, (end - start) * sizeof(T),
+                        cudaMemcpyDefault));
+}
+
 #define CUSPARSE_CHECK(x)                  \
   {                                        \
     cusparseStatus_t _c = x;               \
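CudaCopy gives the rest of the diff an iterator-style cudaMemcpy: because it passes cudaMemcpyDefault, the runtime infers the direction from the pointers (unified virtual addressing), so one call covers host-to-device, device-to-host and device-to-device. A sketch of how the Select/Insert hunks further down use it (hypothetical call site and values):

```cpp
#include <vector>
#include "kernels/cuda_helpers.h"

void copyIndices(size_t* deviceBuffer) {
  std::vector<size_t> indices = {0, 2, 5};  // host-side indices (example data)
  // Direction (host -> device) is inferred via cudaMemcpyDefault.
  CudaCopy(indices.data(), indices.data() + indices.size(), deviceBuffer);
}
```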
@@ -12,7 +12,7 @@ void multiply(Ptr<CSR> C,
               const Ptr<CSR> B,
               bool transA,
               bool transB) {
-  cudaSetDevice(C->getDevice());
+  cudaSetDevice(backend_->getDevice().no);
   int nnzTotal;
   C->allocRowIndices(A->rows());
   CUSPARSE_CHECK(cusparseXcsrgemmNnz(
@@ -91,7 +91,7 @@ void multiply(Ptr<CSR> C,
 //}
 
 void LfaForward(Tensor out, Tensor logits, Tensor att, Ptr<CSR> sparseLf) {
-  cudaSetDevice(out->getDevice());
+  cudaSetDevice(backend_->getDevice().no);
 
   int batch = att->shape()[0];
   int srcWords = att->shape()[2];
@@ -150,7 +150,7 @@ __global__ void gCollapseAtt(float* out,
 }
 
 void CollapseAtt(Tensor out, Tensor in) {
-  cudaSetDevice(out->getDevice());
+  cudaSetDevice(backend_->getDevice().no);
   int nonzeros = out->shape().elements();
   int batch = out->shape()[0];
   int srcWords = out->shape()[2];
@@ -14,7 +14,7 @@ private:
   int nnz_{0};
   int rows_{0};
   int cols_{0};
-  DeviceId deviceId_;
+  Ptr<Backend> backend_;
 
   cusparseHandle_t handle_{0};
   cusparseMatDescr_t descr_{0};
@@ -24,9 +24,9 @@ private:
   float* values_{0};
 
 public:
-  CSR(int rows, int cols, DeviceId deviceId)
-      : rows_(rows), cols_(cols), deviceId_(deviceId) {
-    cudaSetDevice(deviceId_.no);
+  CSR(int rows, int cols, Ptr<Backend> backend)
+      : rows_(rows), cols_(cols), backend_(backend) {
+    cudaSetDevice(backend_->getDevice().no);
     CUSPARSE_CHECK(cusparseCreate(&handle_));
     CUSPARSE_CHECK(cusparseCreateMatDescr(&descr_));
     CUSPARSE_CHECK(cusparseSetMatType(descr_, CUSPARSE_MATRIX_TYPE_GENERAL));
@@ -38,9 +38,9 @@ public:
       const std::vector<float>& values,
       const std::vector<int>& rowIndices,
       const std::vector<int>& colIndices,
-      DeviceId deviceId)
-      : nnz_(values.size()), rows_(rows), cols_(cols), deviceId_(deviceId) {
-    cudaSetDevice(deviceId_.no);
+      Ptr<Backend> backend)
+      : nnz_(values.size()), rows_(rows), cols_(cols), backend_(backend) {
+    cudaSetDevice(backend_->getDevice().no);
     CUSPARSE_CHECK(cusparseCreate(&handle_));
     CUSPARSE_CHECK(cusparseCreateMatDescr(&descr_));
     CUSPARSE_CHECK(cusparseSetMatType(descr_, CUSPARSE_MATRIX_TYPE_GENERAL));
@@ -73,8 +73,8 @@ public:
     CUDA_CHECK(cudaFree(cooRowIndices));
   }
 
-  CSR(Tensor dense) : deviceId_(dense->getDevice()) {
-    cudaSetDevice(deviceId_.no);
+  CSR(Tensor dense) : backend_(dense->getBackend()) {
+    cudaSetDevice(backend_->getDevice().no);
     rows_ = dense->shape()[0] * dense->shape()[2] * dense->shape()[3];
     cols_ = dense->shape()[1];
 
@@ -114,7 +114,7 @@ public:
   }
 
   ~CSR() {
-    cudaSetDevice(deviceId_.no);
+    cudaSetDevice(backend_->getDevice().no);
     if(values_)
       CUDA_CHECK(cudaFree(values_));
     if(rowIndices_)
@@ -129,7 +129,7 @@ public:
   }
 
   void toTensor(Tensor dense) {
-    cudaSetDevice(deviceId_.no);
+    cudaSetDevice(backend_->getDevice().no);
     ABORT_IF(dense->size() != rows_ * cols_, "Matrix sizes do not match");
 
     cusparseScsc2dense(handle_,
@@ -154,10 +154,10 @@ public:
   int* rowIndices() { return rowIndices_; }
   int* colIndices() { return colIndices_; }
 
-  DeviceId getDevice() { return deviceId_; }
+  DeviceId getDevice() { return backend_->getDevice(); }
 
   void allocValues(int nnz = 0) {
-    cudaSetDevice(deviceId_.no);
+    cudaSetDevice(backend_->getDevice().no);
     if(nnz > 0)
       nnz_ = nnz;
     if(values_)
@@ -166,7 +166,7 @@ public:
   }
 
   void allocRowIndices(int rows = 0) {
-    cudaSetDevice(deviceId_.no);
+    cudaSetDevice(backend_->getDevice().no);
     if(rows > 0)
       rows_ = rows;
     if(rowIndices_)
@@ -175,7 +175,7 @@ public:
   }
 
   void allocColIndices(int nnz = 0) {
-    cudaSetDevice(deviceId_.no);
+    cudaSetDevice(backend_->getDevice().no);
    if(nnz > 0)
      nnz_ = nnz;
    if(colIndices_)
@@ -184,11 +184,12 @@ public:
   }
 
   std::string debug() {
+    cudaSetDevice(backend_->getDevice().no);
     uint8_t* buffer;
     CUDA_CHECK(cudaMalloc(&buffer, sizeof(float) * rows() * cols()));
 
     auto mem = New<MemoryPiece>(buffer, sizeof(float) * rows() * cols());
-    Tensor tensor(new TensorBase(mem, {rows(), cols()}, deviceId_));
+    Tensor tensor(new TensorBase(mem, {rows(), cols()}, backend_));
     toTensor(tensor);
     std::string temp = tensor->debug();
 
@@ -879,7 +879,7 @@ __global__ void gInsert(float* out,
   }
 }
 
-void Select(Ptr<Allocator<DeviceGPU>> allocator,
+void Select(Ptr<Allocator> allocator,
             Tensor out,
             const Tensor in,
             int axis,
@@ -892,7 +892,7 @@ void Select(Ptr<Allocator<DeviceGPU>> allocator,
   int blocks = std::min(MAX_BLOCKS, length / threads + (length % threads != 0));
 
   auto mp_indices = allocator->alloc<size_t>(indices.size());
-  mp_indices->insert(indices.data(), indices.size());
+  CudaCopy(indices.data(), indices.data() + indices.size(), mp_indices->data<size_t>());
 
   int axisGPU = axis + gpu::Shape::size() - out->shape().size();
   gSelect<<<blocks, threads>>>(out->data(),
@@ -905,7 +905,7 @@ void Select(Ptr<Allocator<DeviceGPU>> allocator,
   allocator->free(mp_indices);
 }
 
-void Insert(Ptr<Allocator<DeviceGPU>> allocator,
+void Insert(Ptr<Allocator> allocator,
             Tensor out,
             const Tensor in,
             int axis,
@@ -918,7 +918,7 @@ void Insert(Ptr<Allocator<DeviceGPU>> allocator,
   int blocks = std::min(MAX_BLOCKS, length / threads + (length % threads != 0));
 
   auto mp_indices = allocator->alloc<size_t>(indices.size());
-  mp_indices->insert(indices.data(), indices.size());
+  CudaCopy(indices.data(), indices.data() + indices.size(), mp_indices->data<size_t>());
 
   int axisGPU = axis + gpu::Shape::size() - out->shape().size();
   gInsert<<<blocks, threads>>>(out->data(),
@@ -1295,7 +1295,7 @@ float L2Norm(Tensor in) {
   uint8_t* data;
   cudaMalloc(&data, blocks * sizeof(float));
   Tensor out(new TensorBase(
-      New<MemoryPiece>(data, blocks * sizeof(float)), {1, blocks}, in->getDevice()));
+      New<MemoryPiece>(data, blocks * sizeof(float)), {1, blocks}, in->getBackend()));
 
   ReduceAll(_1 * _1, out, in);
   float dataCpu = sqrtf(out->get(0));
@@ -7,7 +7,6 @@
 #include "tensors/tensor.h"
 
 #include "tensors/allocator.h"
-#include "tensors/device_gpu.h"
 
 #include "gpu/shape.h"
 #include "gpu/tmp.h"
@@ -71,13 +70,13 @@ void Element(Functor functor, Tensor out, Tensors ...tensors) {
 
 void TransposeND(Tensor out, Tensor in, const std::vector<int>& vAxis);
 
-void Select(Ptr<Allocator<DeviceGPU>> allocator,
+void Select(Ptr<Allocator> allocator,
             Tensor out,
             Tensor in,
             int axis,
             const std::vector<size_t>&);
 
-void Insert(Ptr<Allocator<DeviceGPU>> allocator,
+void Insert(Ptr<Allocator> allocator,
             Tensor out,
             Tensor in,
             int axis,
@@ -13,7 +13,7 @@ void Sgd::updateImpl(Tensor params, Tensor grads) {
 
 void Adagrad::updateImpl(Tensor params, Tensor grads) {
   if(!alloc_)
-    alloc_ = New<TensorAllocator>(params->getDevice());
+    alloc_ = New<TensorAllocator>(params->getBackend());
 
   if(!gt_) {
     int elements = params->size();
@@ -42,7 +42,7 @@ void Adagrad::resetStats() {
 
 void Adam::updateImpl(Tensor params, Tensor grads) {
   if(!alloc_)
-    alloc_ = New<TensorAllocator>(params->getDevice());
+    alloc_ = New<TensorAllocator>(params->getBackend());
 
   if(!mt_) {
     int elements = params->size();
@@ -8,10 +8,10 @@ cuda_add_library(pymarian SHARED
   ../graph/node.cu
   ../graph/node_operators.cu
   ../tensors/tensor.cu
-  ../tensors/device_gpu.cu
+  ../tensors/device.cpp
   ../kernels/tensor_operators.cu
-  ../backend/gpu/dropout.cu
-  ../backend/cpu/dropout.cpp
+  ../tensors/gpu/dropout.cu
+  ../tensors/cpu/dropout.cpp
   ../kernels/sparse.cu
   #../layers/param_initializers.cu
   ../rnn/attention.cu
@@ -10,6 +10,7 @@
 
 #include "common/definitions.h"
 #include "tensors/memory_piece.h"
+#include "tensors/device.h"
 
 namespace marian {
 
@@ -65,10 +66,9 @@ public:
   Gap rest(size_t offset) const { return Gap(data_ + offset, size_ - offset); }
 };
 
-template <class Device>
 class Allocator {
 private:
-  Device device_;
+  Ptr<Device> device_;
   size_t available_{0};
   size_t step_{128 * 1024 * 1024};
   size_t alignment_{256};
@@ -83,23 +83,23 @@ private:
 
   void grow(size_t add) {
     add = align(add);
-    uint8_t* oldData = device_.data();
-    size_t oldSize = device_.size();
+    uint8_t* oldData = device_->data();
+    size_t oldSize = device_->size();
 
-    device_.reserve(oldSize + add);
+    device_->reserve(oldSize + add);
 
     std::set<Gap> oldGaps;
     gaps_.swap(oldGaps);
 
     for(auto gap : oldGaps)
       gaps_.insert(
-          Gap(device_.data() + std::distance(oldData, gap.data()), gap.size()));
-    insertGap(Gap(device_.data() + oldSize, add));
+          Gap(device_->data() + std::distance(oldData, gap.data()), gap.size()));
+    insertGap(Gap(device_->data() + oldSize, add));
 
     std::unordered_map<uint8_t*, Ptr<MemoryPiece>> oldAllocated;
     allocated_.swap(oldAllocated);
     for(auto it : oldAllocated) {
-      uint8_t* newPtr = device_.data() + std::distance(oldData, it.first);
+      uint8_t* newPtr = device_->data() + std::distance(oldData, it.first);
       allocated_[newPtr] = oldAllocated[it.first];
       allocated_[newPtr]->setPtr(newPtr);
     }
@@ -142,7 +142,7 @@ private:
 
 public:
   Allocator(DeviceId deviceId, size_t bytes, size_t step, size_t alignment = 256)
-      : device_(deviceId, alignment),
+      : device_(DispatchDevice(deviceId, alignment)),
         step_(step),
         available_(0),
         alignment_(alignment) {
@@ -153,7 +153,8 @@ public:
 
   void reserve(size_t bytes) {
     bytes = align(bytes);
-    device_.reserve(bytes);
+    if(bytes > 0)
+      device_->reserve(bytes);
     clear();
   }
 
@@ -211,17 +212,17 @@ public:
     available_ = 0;
     gaps_.clear();
     allocated_.clear();
-    insertGap({device_.data(), device_.size()}, false);
+    insertGap({device_->data(), device_->size()}, false);
   }
 
   Ptr<MemoryPiece> memory() {
-    return New<MemoryPiece>(device_.data(), device_.size());
+    return New<MemoryPiece>(device_->data(), device_->size());
   }
 
-  size_t size() { return device_.size(); }
+  size_t size() { return device_->size(); }
 
   size_t available() { return available_; }
 
-  DeviceId getDevice() { return device_.getDevice(); }
+  DeviceId getDevice() { return device_->getDevice(); }
 };
 }
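Allocator is no longer a template over the device type: it holds a Ptr<Device> built by DispatchDevice (see the new tensors/device.h below), and grow() rebases every outstanding gap and MemoryPiece when the underlying arena moves. A minimal usage sketch, hypothetical driver code using only calls visible in this diff:

```cpp
#include "tensors/allocator.h"

void allocatorDemo() {
  // One Allocator type now serves both device kinds; DispatchDevice picks
  // gpu::Device or cpu::Device inside the constructor.
  Allocator alloc({0, DeviceType::cpu}, /*bytes=*/0, /*step=*/1024 * 1024);

  auto piece = alloc.alloc<float>(100);  // 400 bytes, rounded up to alignment
  float* data = piece->data<float>();    // raw view into the arena
  (void)data;
  alloc.free(piece);                     // returns the gap to the free list
}
```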
src/tensors/backend.cpp (new file, 15 lines)
@@ -0,0 +1,15 @@
+#include "tensors/backend.h"
+
+#include "tensors/gpu/backend.h"
+#include "tensors/cpu/backend.h"
+
+namespace marian {
+
+Ptr<Backend> BackendByDevice(DeviceId deviceId, size_t seed) {
+  if(deviceId.type == DeviceType::gpu)
+    return New<gpu::Backend>(deviceId, seed);
+  else
+    return New<cpu::Backend>(deviceId, seed);
+}
+
+}
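BackendByDevice is the single factory call the graph now makes: it keys on DeviceId::type and returns the matching concrete backend behind the shared marian::Backend interface. A usage sketch (hypothetical call site; Config::seed and the DeviceId literal are assumptions borrowed from the expression_graph.cpp hunk above):

```cpp
#include "common/config.h"
#include "tensors/backend.h"

namespace marian {

void backendDemo() {
  // Same factory for both device kinds; only the DeviceId differs.
  Ptr<Backend> gpuBackend = BackendByDevice({0, DeviceType::gpu}, Config::seed);
  Ptr<Backend> cpuBackend = BackendByDevice({0, DeviceType::cpu}, Config::seed);
  gpuBackend->setDevice();  // cudaSetDevice(...) on GPU; a no-op on CPU
}

}  // namespace marian
```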
@@ -17,4 +17,6 @@ public:
   virtual void setDevice() = 0;
 };
 
+Ptr<Backend> BackendByDevice(DeviceId deviceId, size_t seed);
+
 }
src/tensors/cpu/backend.h (new file, 29 lines)
@@ -0,0 +1,29 @@
+#pragma once
+
+#include <functional>
+#include <random>
+
+#include "common/config.h"
+#include "tensors/backend.h"
+
+namespace marian {
+namespace cpu {
+
+class Backend : public marian::Backend {
+private:
+  std::default_random_engine gen_;
+
+public:
+  Backend(DeviceId deviceId, size_t seed)
+      : marian::Backend(deviceId, seed),
+        gen_(seed_) {}
+
+  void setDevice() { }
+
+  std::default_random_engine& getRandomGenerator() {
+    return gen_;
+  }
+};
+
+}
+}
src/tensors/cpu/dropout.cpp (new file, 19 lines)
@@ -0,0 +1,19 @@
+#include <algorithm>
+#include <random>
+
+#include "tensors/dispatch.h"
+#include "tensors/cpu/backend.h"
+
+namespace marian {
+namespace cpu {
+
+void Dropout(Tensor tensor, float p) {
+  auto cpuBackend = std::static_pointer_cast<cpu::Backend>(tensor->getBackend());
+  auto &gen = cpuBackend->getRandomGenerator();
+  std::bernoulli_distribution dist(1.f - p);
+  std::generate(tensor->data(), tensor->data() + tensor->size(),
+                [&]() { return dist(gen) / (1.f - p); });
+}
+
+}
+}
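The CPU path writes an inverted-dropout mask directly into the tensor: each element becomes 0 with probability p and 1/(1-p) otherwise, so the expected value of every mask entry stays 1 and no rescaling is needed at inference time. A quick self-contained check of that property (illustrative program, outside the codebase):

```cpp
#include <iostream>
#include <random>

int main() {
  float p = 0.3f;                        // drop probability
  std::default_random_engine gen(1234);
  std::bernoulli_distribution dist(1.f - p);

  double sum = 0;
  const int n = 1000000;
  for(int i = 0; i < n; ++i)
    sum += dist(gen) / (1.f - p);        // same expression as cpu::Dropout
  std::cout << sum / n << "\n";          // ~1.0: the mask preserves the mean
  return 0;
}
```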
src/tensors/device.cpp (new file, 29 lines)
@@ -0,0 +1,29 @@
+#include <iostream>
+#include "tensors/device.h"
+
+namespace marian {
+namespace cpu {
+
+Device::~Device() {
+  delete[] data_;
+  data_ = nullptr;
+  size_ = 0;
+}
+
+void Device::reserve(size_t size) {
+  size = align(size);
+  ABORT_IF(size < size_ || size == 0, "New size must be larger than old size and larger than 0");
+
+  if(data_) {
+    uint8_t *temp = new uint8_t[size_];
+    std::copy(data_, data_ + size_, temp);
+    delete[] data_;
+    data_ = temp;
+  } else {
+    data_ = new uint8_t[size];
+  }
+  size_ = size;
+}
+
+}
+}
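Note one oddity in the CPU reserve() as transcribed above: the replacement buffer is allocated with the old size_, yet size_ is then set to the larger size, so later writes past the old length would overrun the allocation. The GPU version below stages through a host buffer of the old size but cudaMallocs the new size, which is presumably what the CPU grow step intends as well. A corrected sketch under that assumption:

```cpp
// Sketch of marian::cpu::Device::reserve, assuming the old-size allocation
// in the transcription above is unintended.
void Device::reserve(size_t size) {
  size = align(size);
  ABORT_IF(size < size_ || size == 0,
           "New size must be larger than old size and larger than 0");

  if(data_) {
    uint8_t* temp = new uint8_t[size];        // allocate the *new* size
    std::copy(data_, data_ + size_, temp);    // copy only the old contents
    delete[] data_;
    data_ = temp;
  } else {
    data_ = new uint8_t[size];
  }
  size_ = size;
}
```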
src/tensors/device.cu (new file, 40 lines)
@@ -0,0 +1,40 @@
+#include <cuda.h>
+#include <iostream>
+
+#include "tensors/device.h"
+#include "kernels/cuda_helpers.h"
+
+namespace marian {
+namespace gpu {
+
+Device::~Device() {
+  cudaSetDevice(deviceId_.no);
+  if(data_) {
+    CUDA_CHECK(cudaFree(data_));
+  }
+  cudaDeviceSynchronize();
+}
+
+void Device::reserve(size_t size) {
+  size = align(size);
+  cudaSetDevice(deviceId_.no);
+
+  ABORT_IF(size < size_ || size == 0, "New size must be larger than old size and larger than 0");
+
+  if(data_) {
+    // Allocate memory by going through host memory
+    uint8_t *temp = new uint8_t[size_];
+    CUDA_CHECK(cudaMemcpy(temp, data_, size_, cudaMemcpyDeviceToHost));
+    CUDA_CHECK(cudaFree(data_));
+    CUDA_CHECK(cudaMalloc(&data_, size));
+    CUDA_CHECK(cudaMemcpy(data_, temp, size_, cudaMemcpyHostToDevice));
+    delete[] temp;
+  } else {
+    CUDA_CHECK(cudaMalloc(&data_, size));
+  }
+
+  size_ = size;
+}
+
+}
+}
src/tensors/device.h (new file, 68 lines)
@@ -0,0 +1,68 @@
+#pragma once
+
+#include <cmath>
+#include <cstdint>
+
+#include "common/definitions.h"
+
+namespace marian {
+
+class Device {
+protected:
+  DeviceId deviceId_;
+
+  uint8_t* data_{0};
+  size_t size_{0};
+  size_t alignment_;
+
+  size_t align(size_t size) {
+    return ceil(size / (float)alignment_) * alignment_;
+  }
+
+public:
+  Device(DeviceId deviceId, size_t alignment = 256)
+      : deviceId_(deviceId), data_(0), size_(0), alignment_(alignment) {}
+
+  virtual ~Device() {};
+
+  virtual void reserve(size_t size) = 0;
+
+  virtual uint8_t* data() { return data_; }
+
+  virtual size_t size() { return size_; }
+
+  virtual DeviceId getDevice() { return deviceId_; }
+};
+
+namespace gpu {
+class Device : public marian::Device {
+public:
+  Device(DeviceId deviceId, size_t alignment = 256)
+      : marian::Device(deviceId, alignment) {}
+
+  ~Device();
+
+  void reserve(size_t size);
+};
+}
+
+namespace cpu {
+class Device : public marian::Device {
+public:
+  Device(DeviceId deviceId, size_t alignment = 256)
+      : marian::Device(deviceId, alignment) {}
+
+  ~Device();
+
+  void reserve(size_t size);
+};
+}
+
+static inline Ptr<Device> DispatchDevice(DeviceId deviceId, size_t alignment = 256) {
+  if(deviceId.type == DeviceType::gpu)
+    return New<gpu::Device>(deviceId, alignment);
+  else
+    return New<cpu::Device>(deviceId, alignment);
+}
+
+}
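DispatchDevice mirrors BackendByDevice one level down: a DeviceId is turned into a concrete gpu::Device or cpu::Device behind the shared marian::Device interface, so callers like Allocator only ever see reserve()/data()/size(). A minimal sketch (hypothetical call site):

```cpp
#include "tensors/device.h"

namespace marian {

void deviceDemo() {
  // Polymorphic device memory: the caller never names the concrete type.
  Ptr<Device> dev = DispatchDevice({0, DeviceType::cpu});
  dev->reserve(1 << 20);        // 1 MiB, rounded up to the 256-byte alignment
  uint8_t* base = dev->data();  // raw arena; Allocator carves it into pieces
  (void)base;
}

}  // namespace marian
```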
@@ -1,37 +0,0 @@
-#include <cuda.h>
-#include <iostream>
-
-#include "tensors/device_gpu.h"
-#include "kernels/cuda_helpers.h"
-
-namespace marian {
-
-DeviceGPU::~DeviceGPU() {
-  cudaSetDevice(deviceId_.no);
-  if(data_) {
-    CUDA_CHECK(cudaFree(data_));
-  }
-  cudaDeviceSynchronize();
-}
-
-void DeviceGPU::reserve(size_t size) {
-  size = align(size);
-  cudaSetDevice(deviceId_.no);
-
-  ABORT_IF(size < size_, "New size must be larger than old size");
-
-  if(data_) {
-    // Allocate memory by going through host memory
-    uint8_t *temp = new uint8_t[size_];
-    CUDA_CHECK(cudaMemcpy(temp, data_, size_, cudaMemcpyDeviceToHost));
-    CUDA_CHECK(cudaFree(data_));
-    CUDA_CHECK(cudaMalloc(&data_, size));
-    CUDA_CHECK(cudaMemcpy(data_, temp, size_, cudaMemcpyHostToDevice));
-    delete[] temp;
-  } else {
-    CUDA_CHECK(cudaMalloc(&data_, size));
-  }
-
-  size_ = size;
-}
-}
@@ -1,35 +0,0 @@
-#pragma once
-
-#include <cmath>
-#include <cstdint>
-
-#include "common/definitions.h"
-
-namespace marian {
-
-class DeviceGPU {
-private:
-  uint8_t* data_;
-  size_t size_;
-  DeviceId deviceId_;
-  size_t alignment_;
-
-  size_t align(size_t size) {
-    return ceil(size / (float)alignment_) * alignment_;
-  }
-
-public:
-  DeviceGPU(DeviceId deviceId, size_t alignment = 256)
-      : data_(0), size_(0), deviceId_(deviceId), alignment_(alignment) {}
-
-  ~DeviceGPU();
-
-  void reserve(size_t size);
-
-  uint8_t* data() { return data_; }
-
-  size_t size() { return size_; }
-
-  DeviceId getDevice() { return deviceId_; }
-};
-}
src/tensors/dispatch.h (new file, 38 lines)
@@ -0,0 +1,38 @@
+#pragma once
+
+#include "common/definitions.h"
+#include "tensors/tensor.h"
+
+#define DISPATCH1(Function, Arg1)                               \
+  namespace gpu {                                               \
+    void Function(Arg1);                                        \
+  }                                                             \
+  namespace cpu {                                               \
+    void Function(Arg1);                                        \
+  }                                                             \
+  void Function(Arg1 arg1) {                                    \
+    if(arg1->getBackend()->getDevice().type == DeviceType::gpu) \
+      gpu::Function(arg1);                                      \
+    else                                                        \
+      cpu::Function(arg1);                                      \
+  }
+
+#define DISPATCH2(Function, Arg1, Arg2)                         \
+  namespace gpu {                                               \
+    void Function(Arg1, Arg2);                                  \
+  }                                                             \
+  namespace cpu {                                               \
+    void Function(Arg1, Arg2);                                  \
+  }                                                             \
+  static inline void Function(Arg1 arg1, Arg2 arg2) {           \
+    if(arg1->getBackend()->getDevice().type == DeviceType::gpu) \
+      gpu::Function(arg1, arg2);                                \
+    else                                                        \
+      cpu::Function(arg1, arg2);                                \
+  }
+
+namespace marian {
+
+DISPATCH2(Dropout, Tensor, float)
+
+}
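The new DISPATCH macros differ from the deleted backend/dispatch.h in one key way: the backend no longer travels as an explicit first argument, because the first Tensor argument already carries it (arg1->getBackend()). Expanding DISPATCH2(Dropout, Tensor, float) by hand shows what the preprocessor generates (a sketch of the expansion, not extra source):

```cpp
// What DISPATCH2(Dropout, Tensor, float) expands to, roughly:
namespace gpu {
  void Dropout(Tensor, float);   // defined in tensors/gpu/dropout.cu
}
namespace cpu {
  void Dropout(Tensor, float);   // defined in tensors/cpu/dropout.cpp
}
static inline void Dropout(Tensor arg1, float arg2) {
  if(arg1->getBackend()->getDevice().type == DeviceType::gpu)
    gpu::Dropout(arg1, arg2);
  else
    cpu::Dropout(arg1, arg2);
}
```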
src/tensors/gpu/algorithm.cu (new file, 42 lines)
@@ -0,0 +1,42 @@
+#include "tensors/gpu/algorithm.h"
+
+#include "kernels/cuda_helpers.h"
+#include "kernels/tensor_operators.h"
+
+namespace marian {
+namespace gpu {
+
+void copy(Ptr<Backend> backend, const float* begin, const float* end, float* dest) {
+  CUDA_CHECK(cudaSetDevice(backend->getDevice().no));
+  CudaCopy(begin, end, dest);
+  CUDA_CHECK(cudaStreamSynchronize(0));
+}
+
+__global__ void gFill(float *d_in, int size, float val) {
+  for(int bid = 0; bid < size; bid += blockDim.x * gridDim.x) {
+    int index = bid + threadIdx.x + blockDim.x * blockIdx.x;
+    if(index < size) {
+      d_in[index] = val;
+    }
+  }
+}
+
+void fill(Ptr<Backend> backend, float* begin, float* end, float value) {
+  CUDA_CHECK(cudaSetDevice(backend->getDevice().no));
+  int size = end - begin;
+  int threads = std::min(512, size);
+  int blocks = (size / threads) + (size % threads != 0);
+  gFill<<<blocks, threads>>>(begin, size, value);
+  CUDA_CHECK(cudaStreamSynchronize(0));
+}
+
+void setSparse(Ptr<Backend> backend,
+               const std::vector<size_t>& keys,
+               const std::vector<float>& values,
+               float* data) {
+  CUDA_CHECK(cudaSetDevice(backend->getDevice().no));
+  SetSparse(data, keys, values);
+  CUDA_CHECK(cudaStreamSynchronize(0));
+}
+
+}
+}
src/tensors/gpu/algorithm.h (new file, 12 lines)
@@ -0,0 +1,12 @@
+#pragma once
+
+#include "tensors/backend.h"
+
+namespace marian {
+namespace gpu {
+void copy(Ptr<Backend> backend, const float* begin, const float* end, float* dest);
+void fill(Ptr<Backend> backend, float* begin, float* end, float value);
+
+void setSparse(Ptr<Backend> backend, const std::vector<size_t>&, const std::vector<float>&, float*);
+}
+}
@@ -5,7 +5,7 @@
 #include <curand.h>
 
 #include "common/config.h"
-#include "graph/backend.h"
+#include "tensors/backend.h"
 
 #define CURAND_CALL(x)                                \
   do {                                                \
@@ -16,10 +16,11 @@
   } while(0)
 
 namespace marian {
+namespace gpu {
 
-class BackendGPU : public Backend {
+class Backend : public marian::Backend {
 public:
-  BackendGPU(DeviceId deviceId, size_t seed) : Backend(deviceId, seed) {
+  Backend(DeviceId deviceId, size_t seed) : marian::Backend(deviceId, seed) {
     setDevice();
     setHandles();
   }
@@ -36,13 +37,13 @@ private:
   cublasHandle_t cublasHandle_;
   curandGenerator_t curandGenerator_;
 
   void setHandles() {
     cublasHandle_ = create_handle();
     curandGenerator_ = createCurandGenerator();
   }
 
   curandGenerator_t createCurandGenerator() {
     cudaSetDevice(deviceId_.no);
     curandGenerator_t generator;
@@ -62,4 +63,6 @@ private:
     return cublasHandle;
   }
 };
+
+}
 }
@@ -1,50 +1,52 @@
-#include <cuda.h>
-#include <curand.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-#include "backend/dispatch.h"
-#include "graph/backend_gpu.h"
-
-#define CUDA_CALL(x)                                  \
-  do {                                                \
-    if((x) != cudaSuccess) {                          \
-      printf("Error at %s:%d\n", __FILE__, __LINE__); \
-      exit(1);                                        \
-    }                                                 \
-  } while(0)
-
-#define CURAND_CALL(x)                                \
-  do {                                                \
-    if((x) != CURAND_STATUS_SUCCESS) {                \
-      printf("Error at %s:%d\n", __FILE__, __LINE__); \
-      exit(1);                                        \
-    }                                                 \
-  } while(0)
-
-namespace marian {
-namespace gpu {
-
-__global__ void gScale(float* data, int n, float p) {
-  int index = threadIdx.x + blockIdx.x * blockDim.x;
-
-  while(index < n) {
-    data[index] = (data[index] < p) / p;
-    index += gridDim.x * blockDim.x;
-  }
-}
-
-void Dropout(Ptr<Backend> backend, Tensor tensor, float p) {
-  curandGenerator_t gen = std::static_pointer_cast<BackendGPU>(backend)->getCurandGenerator();
-  int n = tensor->size();
-  CURAND_CALL(curandGenerateUniform(gen, tensor->data(), n));
-
-  int numThreads = std::min(n, 512);
-  int numBlocks = n / numThreads + (n % numThreads != 0);
-
-  gScale<<<numBlocks, numThreads>>>(tensor->data(), n, 1.f - p);
-}
-
-}
-}
+#include <cuda.h>
+#include <curand.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "tensors/dispatch.h"
+#include "tensors/gpu/backend.h"
+
+#define CUDA_CALL(x)                                  \
+  do {                                                \
+    if((x) != cudaSuccess) {                          \
+      printf("Error at %s:%d\n", __FILE__, __LINE__); \
+      exit(1);                                        \
+    }                                                 \
+  } while(0)
+
+#define CURAND_CALL(x)                                \
+  do {                                                \
+    if((x) != CURAND_STATUS_SUCCESS) {                \
+      printf("Error at %s:%d\n", __FILE__, __LINE__); \
+      exit(1);                                        \
+    }                                                 \
+  } while(0)
+
+namespace marian {
+namespace gpu {
+
+__global__ void gScale(float* data, int n, float p) {
+  int index = threadIdx.x + blockIdx.x * blockDim.x;
+
+  while(index < n) {
+    data[index] = (data[index] < p) / p;
+    index += gridDim.x * blockDim.x;
+  }
+}
+
+void Dropout(Tensor tensor, float p) {
+  auto gpuBackend = std::static_pointer_cast<gpu::Backend>(tensor->getBackend());
+  curandGenerator_t gen = gpuBackend->getCurandGenerator();
+  int n = tensor->size();
+  CURAND_CALL(curandGenerateUniform(gen, tensor->data(), n));
+
+  int numThreads = std::min(n, 512);
+  int numBlocks = n / numThreads + (n % numThreads != 0);
+
+  gScale<<<numBlocks, numThreads>>>(tensor->data(), n, 1.f - p);
+}
+
+}
+}
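On the GPU the mask is built in place: curandGenerateUniform fills the tensor with u ~ U(0,1), then gScale is launched with its p parameter set to the keep probability 1-p, so each element becomes (u < 1-p) / (1-p). That is the same inverted-dropout scaling as the CPU path, since an element survives with probability 1-p and is scaled by 1/(1-p):

E[mask element] = (1 - p) * 1/(1 - p) + p * 0 = 1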
@@ -1,11 +0,0 @@
-#include <cuda.h>
-
-#include "kernels/cuda_helpers.h"
-#include "tensors/memory_piece.h"
-
-namespace marian {
-
-void MemoryPiece::insert(uint8_t* ptr, size_t num) {
-  CUDA_CHECK(cudaMemcpy(data_, ptr, num * sizeof(uint8_t), cudaMemcpyDefault));
-}
-}
@@ -34,13 +34,6 @@ public:
 
   void setPtr(uint8_t* data) { data_ = data; }
 
-  template <typename T>
-  void insert(T* ptr, size_t num) {
-    insert((uint8_t*)ptr, num * sizeof(T));
-  }
-
-  void insert(uint8_t* ptr, size_t num);
-
   friend std::ostream& operator<<(std::ostream& out, const MemoryPiece mp) {
     out << "MemoryPiece - ptr: " << std::hex << (size_t)mp.data() << std::dec
         << " size: " << mp.size();
@@ -1,159 +0,0 @@
-
-#include <cuda.h>
-#include <thrust/device_ptr.h>
-#include <thrust/device_vector.h>
-
-#include "kernels/cuda_helpers.h"
-#include "kernels/tensor_operators.h"
-#include "tensors/tensor.h"
-
-namespace marian {
-
-__global__ void gFill(float *d_in, int size, float val) {
-  for(int bid = 0; bid < size; bid += blockDim.x * gridDim.x) {
-    int index = bid + threadIdx.x + blockDim.x * blockIdx.x;
-    if(index < size) {
-      d_in[index] = val;
-    }
-  }
-}
-
-float TensorBase::get(size_t i) {
-  cudaSetDevice(deviceId_.no);
-  float temp;
-  CUDA_CHECK(
-      cudaMemcpy(&temp, data() + i, sizeof(float), cudaMemcpyDeviceToHost));
-  cudaStreamSynchronize(0);
-  return temp;
-}
-
-void TensorBase::set(size_t i, float value) {
-  cudaSetDevice(deviceId_.no);
-  CUDA_CHECK(
-      cudaMemcpy(data() + i, &value, sizeof(float), cudaMemcpyHostToDevice));
-  cudaStreamSynchronize(0);
-}
-
-void TensorBase::get(std::vector<float> &v) {
-  CUDA_CHECK(cudaSetDevice(deviceId_.no));
-  v.resize(size());
-  CUDA_CHECK(cudaMemcpy(
-      v.data(), data(), size() * sizeof(float), cudaMemcpyDeviceToHost));
-  cudaStreamSynchronize(0);
-}
-
-void TensorBase::set(float value) {
-  cudaSetDevice(deviceId_.no);
-  int threads = std::min(512, (int)size());
-  int blocks = (size() / threads) + (size() % threads != 0);
-  gFill<<<blocks, threads>>>(data(), size(), value);
-  cudaStreamSynchronize(0);
-}
-
-void TensorBase::set(const std::vector<float> &v) {
-  CUDA_CHECK(cudaSetDevice(deviceId_.no));
-  CUDA_CHECK(cudaMemcpy(
-      data(), v.data(), v.size() * sizeof(float), cudaMemcpyHostToDevice));
-  cudaStreamSynchronize(0);
-}
-
-void TensorBase::setSparse(const std::vector<size_t> &k,
-                           const std::vector<float> &v) {
-  cudaSetDevice(deviceId_.no);
-  SetSparse(data(), k, v);
-  cudaStreamSynchronize(0);
-}
-
-void TensorBase::copyFrom(Tensor in) {
-  cudaSetDevice(deviceId_.no);
-  CUDA_CHECK(cudaMemcpy(data(),
-                        (float *)in->data(),
-                        in->size() * sizeof(float),
-                        cudaMemcpyDefault));
-  cudaStreamSynchronize(0);
-}
-
-std::string TensorBase::debug() {
-  std::stringstream strm;
-  assert(shape_.size());
-  strm << shape_;
-  strm << " device=" << deviceId_;
-  strm << " ptr=" << (size_t)memory_->data();
-  strm << " bytes=" << memory_->size();
-  strm << std::endl;
-
-  // values
-  size_t totSize = shape_.elements();
-  std::vector<float> values(totSize);
-  get(values);
-
-  size_t dispCols = 5;
-  strm << std::fixed << std::setprecision(8) << std::setfill(' ');
-
-  for(int i = 0; i < values.size(); ++i) {
-    std::vector<int> dims;
-    shape().dims(i, dims);
-
-    bool disp = true;
-    for(int j = 0; j < dims.size(); ++j)
-      disp = disp && (dims[j] < dispCols || dims[j] >= shape()[j] - dispCols);
-
-    if(disp) {
-      if(dims.back() == 0) {
-        bool par = true;
-        std::vector<std::string> p;
-        for(int j = dims.size() - 1; j >= 0; --j) {
-          if(dims[j] != 0)
-            par = false;
-          p.push_back(par ? "[" : " ");
-        }
-        for(auto it = p.rbegin(); it != p.rend(); ++it)
-          strm << *it;
-        strm << " ";
-      }
-
-      strm << std::setw(12)
-           << values[i]
-           << " ";
-
-      if(dims.back() + 1 == shape().back()) {
-        for(int j = dims.size() - 1; j >= 0; --j) {
-          if(dims[j] + 1 != shape()[j])
-            break;
-          strm << "]";
-        }
-        strm << std::endl;
-      }
-
-      bool prev = true;
-      for(int j = dims.size() - 1; j >= 0; --j) {
-        if(j < dims.size() - 1)
-          prev = prev && dims[j + 1] + 1 == shape()[j + 1];
-        if(prev && dims[j] + 1 == dispCols && shape()[j] > 2 * dispCols) {
-          if(j < dims.size() - 1)
-            for(int k = 0; k <= j; ++k)
-              strm << " ";
-          strm << "... ";
-          if(j < dims.size() - 1)
-            strm << std::endl;
-          break;
-        }
-      }
-    }
-  }
-  strm << std::endl;
-  return strm.str();
-}
-
-Tensor operator<<(Tensor t, const std::vector<float> &v) {
-  t->set(v);
-  return t;
-}
-
-Tensor operator>>(Tensor t, std::vector<float> &v) {
-  t->get(v);
-  return t;
-}
-}
@@ -5,10 +5,13 @@
 #include <memory>
 #include <sstream>
 
 #include "3rd_party/exception.h"
 #include "common/definitions.h"
 #include "common/shape.h"
 #include "tensors/memory_piece.h"
+#include "tensors/backend.h"
 
+#include <algorithm>
+#include "tensors/gpu/algorithm.h"
 
 namespace marian {
 
@@ -16,11 +19,11 @@ class TensorBase : public std::enable_shared_from_this<TensorBase> {
 private:
   Ptr<MemoryPiece> memory_;
   Shape shape_;
-  DeviceId deviceId_;
+  Ptr<Backend> backend_;
 
 public:
-  TensorBase(Ptr<MemoryPiece> memory, Shape shape, DeviceId deviceId)
-      : memory_(memory), shape_(shape), deviceId_(deviceId) {}
+  TensorBase(Ptr<MemoryPiece> memory, Shape shape, Ptr<Backend> backend)
+      : memory_(memory), shape_(shape), backend_(backend) {}
 
   ~TensorBase() {}
 
@@ -39,34 +42,157 @@ public:
     return get(0);
   }
 
-  DeviceId getDevice() { return deviceId_; }
+  Ptr<Backend> getBackend() { return backend_; }
+  DeviceId getDevice() { return backend_->getDevice(); }
 
   Tensor subtensor(int offset, int size) {
     auto mem = New<MemoryPiece>(memory_->data() + sizeof(float) * offset,
                                 sizeof(float) * size);
-    return Tensor(new TensorBase(mem, {1, size}, deviceId_));
+    return New<TensorBase>(mem, Shape{1, size}, backend_);
   }
 
-  float get(size_t i);
+  float get(size_t i) {
+    float temp;
+    if(backend_->getDevice().type == DeviceType::gpu)
+      gpu::copy(backend_, data() + i, data() + i + 1, &temp);
+    else
+      std::copy(data() + i, data() + i + 1, &temp);
+    return temp;
+  }
 
-  void set(size_t i, float value);
+  void set(size_t i, float value) {
+    if(backend_->getDevice().type == DeviceType::gpu)
+      gpu::copy(backend_, &value, &value + 1, data() + i);
+    else
+      std::copy(&value, &value + 1, data() + i);
+  }
 
-  void get(std::vector<float>& v);
+  void get(std::vector<float> &v) {
+    v.resize(size());
+    if(backend_->getDevice().type == DeviceType::gpu)
+      gpu::copy(backend_, data(), data() + size(), v.data());
+    else
+      std::copy(data(), data() + size(), v.data());
+  }
 
-  void set(float value);
+  void set(const std::vector<float> &v) {
+    if(backend_->getDevice().type == DeviceType::gpu)
+      gpu::copy(backend_, v.data(), v.data() + v.size(), data());
+    else
+      std::copy(v.data(), v.data() + v.size(), data());
+  }
 
-  void set(const std::vector<float>& v);
+  void set(float value) {
+    if(backend_->getDevice().type == DeviceType::gpu)
+      gpu::fill(backend_, data(), data() + size(), value);
+    else
+      std::fill(data(), data() + size(), value);
+  }
 
-  void setSparse(const std::vector<size_t>& k, const std::vector<float>& v);
+  void setSparse(const std::vector<size_t> &k,
+                 const std::vector<float> &v) {
+    if(backend_->getDevice().type == DeviceType::gpu) {
+      gpu::setSparse(backend_, k, v, data());
+    } else {
+      for(int i = 0; i < k.size(); ++i)
+        data()[k[i]] = v[i];
+    }
+  }
 
-  void copyFrom(Tensor);
+  void copyFrom(Tensor in) {
+    if(in->getBackend()->getDevice().type == DeviceType::gpu ||
+       backend_->getDevice().type == DeviceType::gpu)
+      gpu::copy(backend_, in->data(), in->data() + in->size(), data());
+    else
+      std::copy(in->data(), in->data() + in->size(), data());
+  }
 
-  std::string debug();
+  std::string debug() {
+    std::stringstream strm;
+    assert(shape_.size());
+    strm << shape_;
+    strm << " device=" << backend_->getDevice();
+    strm << " ptr=" << (size_t)memory_->data();
+    strm << " bytes=" << memory_->size();
+    strm << std::endl;
+
+    // values
+    size_t totSize = shape_.elements();
+    std::vector<float> values(totSize);
+    get(values);
+
+    size_t dispCols = 5;
+    strm << std::fixed << std::setprecision(8) << std::setfill(' ');
+
+    for(int i = 0; i < values.size(); ++i) {
+      std::vector<int> dims;
+      shape().dims(i, dims);
+
+      bool disp = true;
+      for(int j = 0; j < dims.size(); ++j)
+        disp = disp && (dims[j] < dispCols || dims[j] >= shape()[j] - dispCols);
+
+      if(disp) {
+        if(dims.back() == 0) {
+          bool par = true;
+          std::vector<std::string> p;
+          for(int j = dims.size() - 1; j >= 0; --j) {
+            if(dims[j] != 0)
+              par = false;
+            p.push_back(par ? "[" : " ");
+          }
+          for(auto it = p.rbegin(); it != p.rend(); ++it)
+            strm << *it;
+          strm << " ";
+        }
+
+        strm << std::setw(12)
+             << values[i]
+             << " ";
+
+        if(dims.back() + 1 == shape().back()) {
+          for(int j = dims.size() - 1; j >= 0; --j) {
+            if(dims[j] + 1 != shape()[j])
+              break;
+            strm << "]";
+          }
+          strm << std::endl;
+        }
+
+        bool prev = true;
+        for(int j = dims.size() - 1; j >= 0; --j) {
+          if(j < dims.size() - 1)
+            prev = prev && dims[j + 1] + 1 == shape()[j + 1];
+          if(prev && dims[j] + 1 == dispCols && shape()[j] > 2 * dispCols) {
+            if(j < dims.size() - 1)
+              for(int k = 0; k <= j; ++k)
+                strm << " ";
+            strm << "... ";
+            if(j < dims.size() - 1)
+              strm << std::endl;
+            break;
+          }
+        }
+      }
+    }
+    strm << std::endl;
+    return strm.str();
+  }
 };
 
 typedef std::shared_ptr<TensorBase> Tensor;
 
-Tensor operator<<(Tensor t, const std::vector<float>& v);
-
-Tensor operator>>(Tensor t, std::vector<float>& v);
+static Tensor operator<<(Tensor t, const std::vector<float> &v) {
+  t->set(v);
+  return t;
+}
+
+static Tensor operator>>(Tensor t, std::vector<float> &v) {
+  t->get(v);
+  return t;
+}
 
 }
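TensorBase now lives entirely in the header and branches on the backend's device type instead of being compiled as a .cu file: GPU transfers go through the thin gpu::copy/gpu::fill/gpu::setSparse wrappers from tensors/gpu/algorithm.h, and the CPU side is plain <algorithm>. A round-trip sketch (hypothetical call site; assumes a tensor obtained from a TensorAllocator as in this diff):

```cpp
#include <vector>
#include "tensors/tensor.h"

namespace marian {

void roundTrip(Tensor t) {
  // The same member functions work on either device; the dispatch on
  // backend_->getDevice().type is hidden inside TensorBase.
  t->set(1.f);             // gpu::fill(...) or std::fill(...)
  t->set(0, 3.14f);        // single-element write at index 0
  std::vector<float> host;
  t >> host;               // operator>> calls t->get(host)
  // host[0] == 3.14f; the remaining elements == 1.f
}

}  // namespace marian
```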
@ -5,7 +5,6 @@
|
||||
|
||||
#include "common/definitions.h"
|
||||
#include "tensors/allocator.h"
|
||||
#include "tensors/device_gpu.h"
|
||||
#include "tensors/tensor.h"
|
||||
|
||||
namespace marian {
|
||||
@ -17,11 +16,13 @@ private:
|
||||
const size_t GROW = CHUNK * MBYTE;
|
||||
const size_t ALIGN = 256;
|
||||
|
||||
Ptr<Allocator<DeviceGPU>> allocator_;
|
||||
Ptr<Backend> backend_;
|
||||
Ptr<Allocator> allocator_;
|
||||
|
||||
public:
|
||||
TensorAllocator(DeviceId deviceId)
|
||||
: allocator_(New<Allocator<DeviceGPU>>(deviceId, 0, GROW, ALIGN)) {}
|
||||
TensorAllocator(Ptr<Backend> backend)
|
||||
: backend_(backend),
|
||||
allocator_(New<Allocator>(backend_->getDevice(), 0, GROW, ALIGN)) {}
|
||||
|
||||
~TensorAllocator() { clear(); }
|
||||
|
||||
@ -41,11 +42,17 @@ public:
|
||||
|
||||
void reserveExact(size_t bytes = 0) {
|
||||
size_t mbytes = bytes / MBYTE;
|
||||
LOG(info,
|
||||
"[memory] Reserving {} MB, device {}",
|
||||
mbytes,
|
||||
allocator_->getDevice());
|
||||
|
||||
if(mbytes == 0) {
|
||||
LOG(info,
|
||||
"[memory] Reserving {} B, device {}",
|
||||
bytes,
|
||||
allocator_->getDevice());
|
||||
} else {
|
||||
LOG(info,
|
||||
"[memory] Reserving {} MB, device {}",
|
||||
mbytes,
|
||||
allocator_->getDevice());
|
||||
}
|
||||
allocator_->reserve(bytes);
|
||||
}
|
||||
|
||||
@ -59,7 +66,7 @@ public:
|
||||
if(!t || t->shape() != shape) {
|
||||
int size = shape.elements();
|
||||
auto mem = allocator_->alloc<float>(size);
|
||||
t = Tensor(new TensorBase(mem, shape, allocator_->getDevice()));
|
||||
t = Tensor(new TensorBase(mem, shape, backend_));
|
||||
}
|
||||
}
|
||||
|
||||
@ -68,11 +75,11 @@ public:
|
||||
Tensor asTensor() {
|
||||
auto mem = allocator_->memory();
|
||||
int size = mem->size() / sizeof(float);
|
||||
return Tensor(new TensorBase(mem, {1, size}, allocator_->getDevice()));
|
||||
return Tensor(new TensorBase(mem, {1, size}, backend_));
|
||||
}
|
||||
|
||||
size_t size() { return allocator_->size() / sizeof(float); }
|
||||
|
||||
Ptr<Allocator<DeviceGPU>> allocator() { return allocator_; }
|
||||
Ptr<Allocator> allocator() { return allocator_; }
|
||||
};
|
||||
}
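
The allocator now takes its placement from a Backend instead of a raw DeviceId, and reserveExact() logs in bytes when a request rounds down to 0 MB. A minimal usage sketch of the refactored interface; backend stands for whatever a graph's getBackend() returns:

// Sketch: allocate a small {2, 3} float tensor through the new interface.
auto ta = New<TensorAllocator>(backend);  // backend: Ptr<Backend>
ta->reserveExact(6 * sizeof(float));      // logs "[memory] Reserving 24 B, device ..."
Tensor t;
ta->allocate(t, {2, 3});                  // hands out a view into the reserved block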

@ -1,82 +0,0 @@
#pragma once

// This file is part of the Marian toolkit.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.

#include <cstring>

#include "tensors/tensor.h"

namespace marian {

class TensorCPU : public TensorBase {
public:
  TensorCPU(float* data, Shape shape) : TensorBase(data, shape) {}

  float get(size_t i) { return data_[i]; }

  void set(size_t i, float value) { data_[i] = value; }

  void get(std::vector<float>& v) {
    v.resize(size());
    std::copy(data_, data_ + size(), v.begin());
  }

  void set(float value) { std::fill(data_, data_ + size(), value); }

  void set(const std::vector<float>& v) {
    std::copy(v.begin(), v.end(), data_);
  }
};

class DeviceCPU {
private:
  float* data_;
  size_t size_;

public:
  DeviceCPU() : data_(0), size_(0) {}

  ~DeviceCPU() {
    if(data_)
      delete[] data_;
  }

  typedef TensorCPU tensor_type;

  void reserve(size_t size) {
    ABORT_IF(size < size_, "New size must be larger than old size");
    float* temp = new float[size];

    if(data_) {
      std::memcpy(temp, data_, size_ * sizeof(float));
      delete[] data_;
    }

    data_ = temp;
    size_ = size;
  }

  float* data() { return data_; }

  size_t capacity() { return size_; }
};
}

@ -17,9 +17,9 @@ endforeach(test)

# Testing apps
add_executable(logger_test logger_test.cpp)
add_executable(dropout_test dropout_test.cpp)
#cuda_add_executable(bn_test bn_test.cu)
cuda_add_executable(pooling_test pooling_test.cu)
cuda_add_executable(dropout_test dropout_test.cu)
#cuda_add_executable(marian_test marian_test.cu)
cuda_add_executable(tensor_test tensor_test.cu)

@ -5,55 +5,11 @@

#include "3rd_party/exception.h"
#include "tensors/allocator.h"
#include "tensors/device_gpu.h"

class DeviceCPU {
private:
  uint8_t* data_;
  size_t size_;
  size_t alignment_;

public:
  DeviceCPU(size_t device, size_t alignment = 256)
      : data_(0), size_(0), alignment_(alignment) {}

  ~DeviceCPU() { delete[] data_; }

  size_t align(size_t size) {
    return ceil(size / (float)alignment_) * alignment_;
  }

  void reserve(size_t size) {
    size = align(size);
    ABORT_IF(size < size_, "New size must be larger than old size");

    if(data_) {
      // Allocate memory by going through host memory
      uint8_t* temp = new uint8_t[size_];
      std::copy(data_, data_ + size_, temp);
      std::fill(data_, data_ + size_, 0);
      delete[] data_;
      data_ = new uint8_t[size];
      std::copy(temp, temp + size_, data_);
      delete[] temp;
    } else {
      data_ = new uint8_t[size];
    }

    size_ = size;
  }

  uint8_t* data() { return data_; }

  size_t size() { return size_; }

  size_t getDevice() { return 0; }
};

int main(int argc, char** argv) {
  using namespace marian;

  auto a = New<Allocator<DeviceGPU>>({0, DeviceType::gpu}, 0, 30000, 256);
  auto a = New<Allocator>({0, DeviceType::gpu}, 0, 30000, 256);
  std::cerr << "Size: " << a->size() << std::endl;

  auto mem1 = a->alloc<int>(100000);
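
One detail in the deleted helper class is worth flagging: align() rounds up by going through float, ceil(size / (float)alignment_) * alignment_, which can misround once sizes approach 2^24, where float can no longer represent every integer. A pure-integer alternative, offered only as a sketch and not as what the repository does:

// Integer ceiling division avoids the float round-trip.
inline size_t align(size_t size, size_t alignment) {
  return ((size + alignment - 1) / alignment) * alignment;
}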

@ -1,13 +1,9 @@
#include <cuda.h>
#include <curand.h>
#include <stdio.h>
#include <stdlib.h>
#include <boost/chrono.hpp>
#include <boost/timer/timer.hpp>
#include <vector>

#include "common/config.h"
#include "graph/node_initializers.h"
#include "marian.h"

using namespace marian;
@ -16,8 +12,11 @@ using namespace keywords;
int main(int argc, char** argv) {
  auto c = New<Config>(argc, argv);

  auto type = c->get<bool>("cpu") ? DeviceType::cpu : DeviceType::gpu;
  DeviceId deviceId{0, type};

  auto g = New<ExpressionGraph>();
  g->setDevice({0, DeviceType::gpu});
  g->setDevice(deviceId);
  g->reserveWorkspaceMB(512);

  for(int i = 0; i < 10; ++i) {
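
With the device now read from the configuration, one binary can exercise both dropout backends. Assuming the cpu option is registered as a boolean flag elsewhere in Config (only the lookup is visible in this diff), running the test with --cpu routes the graph through the CPU dropout path, while the default stays on GPU device 0.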

@ -53,3 +53,53 @@ TEST_CASE("Expression graph can be initialized with constant values",
    REQUIRE(values == v);
  }
}

TEST_CASE("Graph device is set (cpu)", "[graph]") {
  auto graph = New<ExpressionGraph>();

  graph->setDevice({0, DeviceType::cpu});

  DeviceId testId{0, DeviceType::cpu};
  REQUIRE(graph->getDevice() == testId);
}

TEST_CASE("Expression graph can be initialized with constant values (cpu)",
          "[graph]") {
  auto graph = New<ExpressionGraph>();
  graph->setDevice({0, DeviceType::cpu});
  graph->reserveWorkspaceMB(4);

  std::vector<float> values;

  SECTION("initializing with zero (cpu)") {
    graph->clear();
    values.clear();
    auto zeros = graph->param("0s", {2, 5}, keywords::init = inits::zeros);
    graph->forward();

    zeros->val()->get(values);
    REQUIRE(values == std::vector<float>(10, 0.0f));
  }

  SECTION("initializing with ones (cpu)") {
    graph->clear();
    values.clear();
    auto ones = graph->param("1s", {2, 5}, keywords::init = inits::ones);
    graph->forward();

    ones->val()->get(values);
    REQUIRE(values == std::vector<float>(10, 1.0f));
  }

  SECTION("initializing from vector (cpu)") {
    graph->clear();
    values.clear();
    std::vector<float> v({1, 2, 3, 4, 5, 6});
    auto vals = graph->param("vs", {2, 3}, keywords::init = inits::from_vector(v));
    graph->forward();

    REQUIRE(values.empty());
    vals->val()->get(values);
    REQUIRE(values == v);
  }
}
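
The CPU test cases mirror the GPU ones section for section, so extending coverage to another initializer is mechanical. A sketch for a constant-fill case, assuming marian exposes inits::from_value (it does not appear in this diff):

SECTION("initializing with a constant (cpu)") {
  graph->clear();
  values.clear();
  auto twos = graph->param("2s", {2, 5}, keywords::init = inits::from_value(2.0f));
  graph->forward();

  twos->val()->get(values);
  REQUIRE(values == std::vector<float>(10, 2.0f));
}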

@ -93,12 +93,12 @@ void AsyncGraphGroup::init(Ptr<data::Batch> batch) {

  int pos = 0;
  // parameter sharding
  for(auto device : devices_) {
  for(auto graph : graphs_) {
    int __size__ = min(shardSize_, totalSize);
    totalSize -= __size__;

    Tensor param;
    Ptr<TensorAllocator> allocator = New<TensorAllocator>(DeviceId{device, DeviceType::gpu});
    Ptr<TensorAllocator> allocator = New<TensorAllocator>(graph->getBackend());
    allocator->reserveExact(__size__ * sizeof(float));
    allocator->allocate(param, {1, __size__});
    paramsAlloc_.push_back(allocator);
@ -112,11 +112,11 @@ void AsyncGraphGroup::init(Ptr<data::Batch> batch) {
  if(grads_.size() == 0) {
    int totalSize = graphs_[0]->params()->vals()->size();

    for(auto device : devices_) {
    for(auto graph : graphs_) {
      int __size__ = min(shardSize_, totalSize);
      totalSize -= __size__;
      Tensor grad_;
      Ptr<TensorAllocator> allocator_ = New<TensorAllocator>(DeviceId{device, DeviceType::gpu});
      Ptr<TensorAllocator> allocator_ = New<TensorAllocator>(graph->getBackend());

      allocator_->reserveExact(__size__ * sizeof(float));
      allocator_->allocate(grad_, {1, __size__});
@ -129,11 +129,11 @@ void AsyncGraphGroup::init(Ptr<data::Batch> batch) {
  int totalSize = graphs_[0]->params()->vals()->size();

  int i = 0;
  for(auto device : devices_) {
  for(auto graph : graphs_) {
    int __size__ = min(shardSize_, totalSize);
    totalSize -= __size__;
    Tensor paramAvg;
    Ptr<TensorAllocator> allocator = New<TensorAllocator>(DeviceId{device, DeviceType::gpu});
    Ptr<TensorAllocator> allocator = New<TensorAllocator>(graph->getBackend());

    allocator->reserveExact(__size__ * sizeof(float));
    allocator->allocate(paramAvg, {1, __size__});
@ -187,7 +187,7 @@ void AsyncGraphGroup::execute(Ptr<data::Batch> batch) {
  Tensor gradients;
  if(tau_ > 1) {
    if(t == 0) {
      accAlloc = New<TensorAllocator>(graph->getDevice());
      accAlloc = New<TensorAllocator>(graph->getBackend());
      accAlloc->reserveExact(graph->params()->grads()->memory()->size());
      accAlloc->allocate(accGradients, graph->params()->grads()->shape());
      accGradients->set(0);
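
All three sharding loops above share one piece of arithmetic: each graph takes min(shardSize_, totalSize) elements and totalSize shrinks until it hits zero, so the final shard absorbs the remainder. A standalone sketch of that computation, assuming shardSize_ is the ceiling of totalSize over the number of graphs, which the loop structure implies:

// Sketch: split totalSize parameters into per-graph shard sizes.
#include <algorithm>
#include <vector>

std::vector<int> shardSizes(int totalSize, int nGraphs) {
  int shardSize = (totalSize + nGraphs - 1) / nGraphs;  // ceiling division
  std::vector<int> sizes;
  for(int g = 0; g < nGraphs; ++g) {
    int size = std::min(shardSize, totalSize);
    totalSize -= size;
    sizes.push_back(size);  // e.g. 10 params over 4 graphs -> 3, 3, 3, 1
  }
  return sizes;
}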

@ -8,9 +8,9 @@

namespace marian {

Tensor AsyncGraphGroupDrop::newTensor(int size, DeviceId deviceId) {
Tensor AsyncGraphGroupDrop::newTensor(int size, Ptr<Backend> backend) {
  Tensor t;
  Ptr<TensorAllocator> allocator_ = New<TensorAllocator>(deviceId);
  Ptr<TensorAllocator> allocator_ = New<TensorAllocator>(backend);
  allocator_->reserveExact(size * sizeof(float));
  allocator_->allocate(t, {1, size});
  allocators.push_back(allocator_);
@ -79,14 +79,13 @@ void AsyncGraphGroupDrop::pushGradients(Tensor newGrads,
                                        size_t batch_words,
                                        int device_id) {
  if(pushStep_[device_id]++ <= dropping_warmup) {
    std::cout << "WARMUP" << std::endl;
    AsyncGraphGroup::pushGradients(newGrads, batch_words, device_id);
    return;
  }

  // get the sparse gradient
  pushDropper_[device_id]->dropGraph(
      newGrads, pushSparseGradient_[device_id],
      droping_rate, dropping_momentum);

  SparseTensor newSparseGrads = pushSparseGradient_[device_id];
@ -146,13 +145,12 @@ void AsyncGraphGroupDrop::init(Ptr<data::Batch> batch) {
    fetchStep_.push_back(0);
    pushStep_.push_back(0);

    size_t device = devices_[i];
    // temporary tensor to compute parameter delta before fetching
    paramsDelta_.push_back(newTensor(shardSize, {device, DeviceType::gpu}));
    paramsDelta_.push_back(newTensor(shardSize, graphs_[i]->getBackend()));

    // tensors to store local params history
    for(int h_id = 0; h_id < devices_.size(); h_id++) {
      Tensor tmp = newTensor(params_[i]->size(), {device, DeviceType::gpu});
      Tensor tmp = newTensor(params_[i]->size(), graphs_[i]->getBackend());
      tmp->copyFrom(params_[i]);
      paramsLocal_[h_id].push_back(tmp);
    }
@ -162,23 +160,23 @@ void AsyncGraphGroupDrop::init(Ptr<data::Batch> batch) {

    // N-dropper for fetch
    std::vector<GradientDrop> tmpDropper;
    for(int i = 0; i < devices_.size(); i++)
    for(auto device : devices_)
      tmpDropper.push_back(GradientDrop(new GradientDropBase()));
    fetchDropper.push_back(tmpDropper);

    // sparse tensor to store sparsified gradients per-device
    pushSparseGradient_.push_back(
        SparseTensor(new SparseTensorBase(sparseCap, {device, DeviceType::gpu})));
        SparseTensor(new SparseTensorBase(sparseCap, graphs_[i]->getBackend())));

    pushShardedSparseGradient_.push_back(
        SparseTensor(new SparseTensorBase(sparseCap, {device, DeviceType::gpu})));
        SparseTensor(new SparseTensorBase(sparseCap, graphs_[i]->getBackend())));
    fetchSparseGradient_.push_back(SparseTensor(
        new SparseTensorBase(sparseCap / devices_.size(), {device, DeviceType::gpu})));
        new SparseTensorBase(sparseCap / devices_.size(), graphs_[i]->getBackend())));

    std::vector<SparseTensor> tmp;
    for(int i = 0; i < devices_.size(); i++)
      tmp.push_back(SparseTensor(
          new SparseTensorBase(sparseCap / devices_.size(), {device, DeviceType::gpu})));
          new SparseTensorBase(sparseCap / devices_.size(), graphs_[i]->getBackend())));
    fetchShardedSparseGradient_.push_back(tmp);
  }

@ -31,7 +31,7 @@ class AsyncGraphGroupDrop : public AsyncGraphGroup {

  std::vector<Ptr<TensorAllocator>> allocators;

  Tensor newTensor(int size, DeviceId deviceId);
  Tensor newTensor(int size, Ptr<Backend> backend);

protected:
  void init(Ptr<data::Batch> batch);

@ -19,9 +19,9 @@ void MultiNodeGraphGroup::setScheduler(Ptr<Scheduler> scheduler) {
/**
 * Allocate new tensor on given GPU and store allocator.
 */
Tensor MultiNodeGraphGroup::newTensor(int size, DeviceId deviceId) {
Tensor MultiNodeGraphGroup::newTensor(int size, Ptr<Backend> backend) {
  Tensor t;
  Ptr<TensorAllocator> allocator = New<TensorAllocator>(deviceId);
  Ptr<TensorAllocator> allocator = New<TensorAllocator>(backend);
  allocator->reserveExact(size * sizeof(float));
  allocator->allocate(t, {1, size});
  allocators_.push_back(allocator);
@ -148,14 +148,12 @@ void MultiNodeGraphGroup::initClientCommOverlapVars() {
void MultiNodeGraphGroup::initClientCommOverlapGpuTensors() {
  size_t modelSize = clientGraphs_[0]->params()->vals()->size();
  for(int client = 0; client < devices_.size(); client++) {
    DeviceId deviceId{devices_[client], DeviceType::gpu};

    // Communication overlap buffer (for grads + params)
    Tensor commOverlapBuffer = newTensor(modelSize, deviceId);
    Tensor commOverlapBuffer = newTensor(modelSize, clientGraphs_[client]->getBackend());
    commOverlapBuffer->copyFrom(clientGraphs_[0]->params()->vals());
    clientCommOverlapBuffersGPU_.push_back(commOverlapBuffer);
    // Gradients local sum buffer
    Tensor sumGrads = newTensor(modelSize, deviceId);
    Tensor sumGrads = newTensor(modelSize, clientGraphs_[client]->getBackend());
    sumGrads->set(0);
    clientSummedGradsGPU.push_back(sumGrads);
    // Local optimizer to apply summed gradients
@ -207,12 +205,11 @@ void MultiNodeGraphGroup::calculateShardSizes() {
void MultiNodeGraphGroup::initShardGpuTensors() {
  size_t offset = 0;
  for(int shard = 0; shard < devices_.size(); shard++) {
    DeviceId deviceId{devices_[shard], DeviceType::gpu};
    Tensor gpuParams = newTensor(shardSizes_[shard], deviceId);
    Tensor gpuParams = newTensor(shardSizes_[shard], clientGraphs_[shard]->getBackend());
    gpuParams->copyFrom(clientGraphs_[0]->params()->vals()->subtensor(
        offset, shardSizes_[shard]));
    shardParams_.push_back(gpuParams);
    shardGrads_.push_back(newTensor(shardSizes_[shard], deviceId));
    shardGrads_.push_back(newTensor(shardSizes_[shard], clientGraphs_[shard]->getBackend()));
  }
}
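
After this change, newTensor() is the single choke point for placement: every helper buffer inherits its device from a graph's backend rather than from a hand-assembled DeviceId. A short usage sketch in the style of the code above:

// Sketch: a scratch buffer on the same device as client graph 0,
// seeded from that graph's parameters.
Tensor scratch = newTensor(modelSize, clientGraphs_[0]->getBackend());
scratch->copyFrom(clientGraphs_[0]->params()->vals());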

@ -217,7 +217,7 @@ protected:
  /**
   * Allocate new tensor on given GPU and store allocator.
   */
  Tensor newTensor(int size, DeviceId deviceId);
  Tensor newTensor(int size, Ptr<Backend> backend);

  /**
   * Setup training environment and launch server thread and (if enabled) client

@ -60,7 +60,7 @@ void SyncGraphGroup::execute(Ptr<data::Batch> batch) {
  for(auto graph : graphs_) {
    int __size__ = min(shardSize_, totalSize);

    auto paramsAlloc = New<TensorAllocator>(graph->getDevice());
    auto paramsAlloc = New<TensorAllocator>(graph->getBackend());
    paramsAllocs_.push_back(paramsAlloc);

    paramsAlloc->reserveExact(3 * __size__ * sizeof(float));
@ -87,7 +87,7 @@ void SyncGraphGroup::execute(Ptr<data::Batch> batch) {
    int __size__ = min(shardSize_, totalSize);
    totalSize -= __size__;
    Tensor paramAvg;
    auto allocator = New<TensorAllocator>(graph->getDevice());
    auto allocator = New<TensorAllocator>(graph->getBackend());

    allocator->reserveExact(__size__ * sizeof(float));
    allocator->allocate(paramAvg, {1, __size__});

@ -45,9 +45,9 @@ __global__ void gFindSubtensor(int* indices,
  resultEnd[0] = idx;
}

SparseTensorBase::SparseTensorBase(int capacity, DeviceId deviceId)
    : deviceId_(deviceId), capacity_(capacity) {
  cudaSetDevice(deviceId_.no);
SparseTensorBase::SparseTensorBase(int capacity, Ptr<Backend> backend)
    : backend_(backend), capacity_(capacity) {
  cudaSetDevice(backend_->getDevice().no);
  CUDA_CHECK(cudaMalloc(&data_, sizeof(float) * capacity));
  CUDA_CHECK(cudaMalloc(&indices_, sizeof(int) * capacity));

@ -58,8 +58,8 @@ SparseTensorBase::SparseTensorBase(int capacity, DeviceId deviceId)
SparseTensorBase::SparseTensorBase(float* data,
                                   int* indices,
                                   int size,
                                   DeviceId deviceId)
    : deviceId_(deviceId) {
                                   Ptr<Backend> backend)
    : backend_(backend) {
  data_ = data;
  indices_ = indices;
  size_ = size;
@ -93,7 +93,7 @@ void SparseTensorBase::copyFrom(float* data,
  size_ = size;
  if(size == 0)
    return;
  cudaSetDevice(deviceId_.no);
  cudaSetDevice(backend_->getDevice().no);

  cudaMemcpy(data_, data, size * sizeof(float), cudaMemcpyDefault);
  if(!data_only)
@ -107,8 +107,8 @@ void SparseTensorBase::copyFrom(std::shared_ptr<SparseTensorBase> t,
  copyFrom(t->data(), t->indices(), t->size(), data_only);
}

DeviceId SparseTensorBase::getDevice() {
  return deviceId_;
Ptr<Backend> SparseTensorBase::getBackend() {
  return backend_;
}

void SparseTensorBase::setSize(int size) {
@ -117,7 +117,7 @@ void SparseTensorBase::setSize(int size) {

// return the dense representation of this tensor
void SparseTensorBase::toDense(Tensor t, int offset) {
  cudaSetDevice(deviceId_.no);
  cudaSetDevice(backend_->getDevice().no);
  int threads = 512;
  int blocks = 1 + size_ / threads;
  t->set(0);
@ -127,7 +127,7 @@ void SparseTensorBase::toDense(Tensor t, int offset) {
}

void SparseTensorBase::scatterAdd(Tensor t, int offset) {
  cudaSetDevice(deviceId_.no);
  cudaSetDevice(backend_->getDevice().no);
  cudaStreamSynchronize(0);
  int threads = 512;
  int blocks = 1 + size_ / threads;
@ -139,7 +139,7 @@ void SparseTensorBase::scatterAdd(Tensor t, int offset) {
std::shared_ptr<SparseTensorBase> SparseTensorBase::subtensor(int pos,
                                                              int size,
                                                              int idx) {
  cudaSetDevice(deviceId_.no);
  cudaSetDevice(backend_->getDevice().no);
  cudaStreamSynchronize(0);
  int* start = gstart_ + idx;
  int* end = gend_ + idx;
@ -165,6 +165,6 @@ std::shared_ptr<SparseTensorBase> SparseTensorBase::subtensor(int pos,
  int subtensorSize = std::max(0, endOffset - startOffset + 1);
  cudaStreamSynchronize(0);
  return std::shared_ptr<SparseTensorBase>(new SparseTensorBase(
      data_ + startOffset, indices_ + startOffset, subtensorSize, deviceId_));
      data_ + startOffset, indices_ + startOffset, subtensorSize, backend_));
}
}
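
With the backend threaded through, SparseTensorBase keeps its raw CUDA calls but no longer stores its own DeviceId. A hedged sketch of the round trip the graph groups above perform; sparseCap, dropper, and the rate and momentum values are taken to match their init code:

// Sketch: sparsify gradients, then apply or densify them on one device.
SparseTensor sparse(new SparseTensorBase(sparseCap, graph->getBackend()));
dropper->dropGraph(grads, sparse, droping_rate, dropping_momentum);
sparse->scatterAdd(params);  // add the surviving entries into a dense tensor
sparse->toDense(dense, 0);   // or expand to a dense copy (zeros elsewhere)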

@ -10,15 +10,15 @@ class SparseTensorBase : public std::enable_shared_from_this<SparseTensorBase> {
  int* indices_;
  int size_;
  int capacity_;
  DeviceId deviceId_;
  Ptr<Backend> backend_;

  int* d_is_unsorted;
  int* gstart_;
  int* gend_;

public:
  SparseTensorBase(int capacity, DeviceId deviceId);
  SparseTensorBase(float* data, int* indices, int size, DeviceId deviceId);
  SparseTensorBase(int capacity, Ptr<Backend> backend);
  SparseTensorBase(float* data, int* indices, int size, Ptr<Backend> backend);

  ~SparseTensorBase() {}

@ -43,7 +43,7 @@ public:
  void scatterAdd(Tensor t, int offset = 0);
  std::shared_ptr<SparseTensorBase> subtensor(int pos, int size, int idx);

  DeviceId getDevice();
  Ptr<Backend> getBackend();

  void toDense(Tensor t, int offset);
};