Merge pull request #397 from marian-nmt/fixSwap

Fixes parameter swapping for the single-GPU case with --sync-sgd
Marcin Junczys-Dowmunt 2018-12-04 08:55:15 -08:00 committed by GitHub
commit 5cac51d1ef
4 changed files with 138 additions and 49 deletions

View File

@@ -49,7 +49,7 @@ void fill(Ptr<Backend> backend, T* begin, T* end, T value) {
if (size == 0)
return;
CUDA_CHECK(cudaSetDevice(backend->getDeviceId().no));
- int threadsPerBlock = std::min(512, size);
+ int threadsPerBlock = std::min(MAX_THREADS, size);
int blocks = (size / threadsPerBlock) + (size % threadsPerBlock != 0); // @TODO: (size+threadsPerBlock-1)/threadsPerBlock or CeilDiv(a,b)
gFill<<<blocks, threadsPerBlock>>>(begin, size, value);
CUDA_CHECK(cudaStreamSynchronize(0));
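The @TODO above asks for a cleaner block-count computation. A minimal sketch of the ceiling-division helper it suggests; the name CeilDiv is taken from the comment itself and is not defined anywhere in this diff:

    // Hypothetical helper following the @TODO (not part of this PR):
    inline int CeilDiv(int a, int b) {
      return (a + b - 1) / b;  // rounds the integer quotient up
    }
    // The launch configuration above would then become:
    //   int blocks = CeilDiv(size, threadsPerBlock);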
@@ -76,5 +76,46 @@ void setSparse(Ptr<Backend> backend,
// gpu::SetSparse(data, keys, values);
CUDA_CHECK(cudaStreamSynchronize(0));
}
+ template <typename T>
+ __global__ void gSwap(T* d_v1, T* d_v2, int size) {
+ auto threadsPerBlock = blockDim.x;
+ int index = threadIdx.x + threadsPerBlock * blockIdx.x;
+ if(index < size) {
+ T temp = d_v1[index];
+ d_v1[index] = d_v2[index];
+ d_v2[index] = temp;
+ }
+ }
+ template <typename T>
+ void swap_ranges(Ptr<Backend> backend, T* begin, T* end, T* dest) {
+ int size = end - begin;
+ if (size == 0)
+ return;
+ CUDA_CHECK(cudaSetDevice(backend->getDeviceId().no));
+ int threadsPerBlock = std::min(MAX_THREADS, size);
+ int blocks = (size / threadsPerBlock) + (size % threadsPerBlock != 0); // @TODO: (size+threadsPerBlock-1)/threadsPerBlock or CeilDiv(a,b)
+ gSwap<<<blocks, threadsPerBlock>>>(begin, dest, size);
+ CUDA_CHECK(cudaStreamSynchronize(0));
+ }
+ // clang-format off
+ template void swap_ranges<int8_t>(Ptr<Backend>, int8_t*, int8_t*, int8_t*);
+ template void swap_ranges<int16_t>(Ptr<Backend>, int16_t*, int16_t*, int16_t*);
+ template void swap_ranges<int32_t>(Ptr<Backend>, int32_t*, int32_t*, int32_t*);
+ template void swap_ranges<int64_t>(Ptr<Backend>, int64_t*, int64_t*, int64_t*);
+ template void swap_ranges<uint8_t>(Ptr<Backend>, uint8_t*, uint8_t*, uint8_t*);
+ template void swap_ranges<uint16_t>(Ptr<Backend>, uint16_t*, uint16_t*, uint16_t*);
+ template void swap_ranges<uint32_t>(Ptr<Backend>, uint32_t*, uint32_t*, uint32_t*);
+ template void swap_ranges<uint64_t>(Ptr<Backend>, uint64_t*, uint64_t*, uint64_t*);
+ template void swap_ranges<char>(Ptr<Backend>, char*, char*, char*);
+ template void swap_ranges<float>(Ptr<Backend>, float*, float*, float*);
+ template void swap_ranges<double>(Ptr<Backend>, double*, double*, double*);
+ // clang-format on
} // namespace gpu
} // namespace marian
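The explicit instantiations above let the gSwap kernel stay in this .cu file while C++ callers see only the swap_ranges declaration from the header. A hedged usage sketch; the function swapBuffersExample and its arguments are illustrative, and both pointers are assumed to point to allocations on the backend's device:

    #include "tensors/gpu/algorithm.h"

    // Illustrative call site (not part of this PR): element-wise swap of two
    // equally sized device buffers through the new wrapper.
    void swapBuffersExample(Ptr<marian::Backend> backend,
                            float* d_a, float* d_b, int n) {
      // Launches gSwap on the backend's GPU; d_a and d_b must live there.
      marian::gpu::swap_ranges(backend, d_a, d_a + n, d_b);
    }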

View File

@@ -10,6 +10,9 @@ void copy(Ptr<marian::Backend> backend, const T* begin, const T* end, T* dest);
template <typename T>
void fill(Ptr<marian::Backend> backend, T* begin, T* end, T value);
+ template <typename T>
+ void swap_ranges(Ptr<marian::Backend> backend, T* begin, T* end, T* dest);
void setSparse(Ptr<marian::Backend> backend,
const std::vector<size_t>&,
const std::vector<float>&,

View File

@@ -56,15 +56,6 @@ public:
virtual size_t size() { return shape_.elements(); }
- virtual float scalar() {
- ABORT_IF(!matchType<float>(type_),
- "Requested type ({}) and underlying type ({}) do not match",
- request<float>(),
- type_);
- ABORT_IF(size() != 1, "Tensor is not a scalar");
- return get(0);
- }
template <typename T>
T scalar() {
ABORT_IF(!matchType<T>(type_),
@@ -76,6 +67,10 @@ public:
return get<T>(0);
}
+ virtual float scalar() {
+ return scalar<float>();
+ }
Ptr<Backend> getBackend() { return backend_; }
DeviceId getDeviceId() { return backend_->getDeviceId(); }
@@ -85,24 +80,6 @@ public:
return New<TensorBase>(mem, Shape{1, (int)size}, backend_);
}
- float get(size_t i) {
- ABORT_IF(!matchType<float>(type_),
- "Requested type ({}) and underlying type ({}) do not match",
- request<float>(),
- type_);
- float temp = 0; // (initialize to keep compiler happy)
- if(backend_->getDeviceId().type == DeviceType::cpu) {
- std::copy(data() + i, data() + i + 1, &temp);
- }
- #ifdef CUDA_FOUND
- else {
- gpu::copy(backend_, data() + i, data() + i + 1, &temp);
- }
- #endif
- return temp;
- }
template <typename T>
T get(size_t i) {
ABORT_IF(!matchType<T>(type_),
@@ -122,6 +99,10 @@ public:
return temp;
}
+ float get(size_t i) {
+ return get<float>(i);
+ }
template <typename T>
void set(size_t i, T value) {
ABORT_IF(!matchType<T>(type_),
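The hunks above replace the float-only scalar() and get() bodies with thin wrappers over their new templated counterparts, so the type check and the CPU/GPU copy logic exist once instead of twice. A standalone sketch of that delegation pattern; the Cell struct is illustrative, not marian code:

    #include <cstdio>

    struct Cell {
      double value = 42.0;

      template <typename T>
      T get() {                              // one implementation for every T
        return static_cast<T>(value);
      }

      double get() { return get<double>(); } // convenience overload delegates
    };

    int main() {
      Cell c;
      std::printf("%f %d\n", c.get(), c.get<int>());
      return 0;
    }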
@@ -228,24 +209,95 @@ public:
#endif
}
+ template <typename T>
void copyFrom(Tensor in) {
- // @TODO: solve this later
- ABORT_IF(!matchType<float>(type_),
+ ABORT_IF(in->shape() != shape_, "Can only copy tensors with equal shapes ({} != {})", in->shape(), shape_);
+ ABORT_IF(in->type() != type_, "Can only copy tensors with equal types ({} != {})", in->type(), type_);
+ ABORT_IF(!matchType<T>(type_),
"Requested type ({}) and underlying type ({}) do not match",
- request<float>(),
+ request<T>(),
type_);
if(in->getBackend()->getDeviceId().type == DeviceType::cpu
&& backend_->getDeviceId().type == DeviceType::cpu) {
- std::copy(in->data(), in->data() + in->size(), data());
+ std::copy(in->data<T>(), in->data<T>() + in->size(), data<T>());
}
#ifdef CUDA_FOUND
else {
- gpu::copy(backend_, in->data(), in->data() + in->size(), data());
+ gpu::copy(backend_, in->data<T>(), in->data<T>() + in->size(), data<T>());
}
#endif
}
+ void copyFrom(Tensor in) {
+ switch(type_) {
+ case Type::int8: copyFrom<int8_t>(in); break;
+ case Type::int16: copyFrom<int16_t>(in); break;
+ case Type::int32: copyFrom<int32_t>(in); break;
+ case Type::int64: copyFrom<int64_t>(in); break;
+ case Type::uint8: copyFrom<uint8_t>(in); break;
+ case Type::uint16: copyFrom<uint16_t>(in); break;
+ case Type::uint32: copyFrom<uint32_t>(in); break;
+ case Type::uint64: copyFrom<uint64_t>(in); break;
+ case Type::float32: copyFrom<float>(in); break;
+ case Type::float64: copyFrom<double>(in); break;
+ default: ABORT("Unknown type {}", type_);
+ }
+ }
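The non-template copyFrom() above turns the runtime type_ tag into a compile-time template argument through an exhaustive switch, bridging the dynamically typed tensor to statically typed copy routines. A minimal standalone sketch of the idiom; the enum and function names are illustrative:

    #include <cstdint>
    #include <cstdio>

    enum class Type { int32, float32 };

    template <typename T>
    void process() {
      std::printf("element size: %zu\n", sizeof(T));
    }

    // Runtime tag -> compile-time type, mirroring the switch in copyFrom().
    void process(Type t) {
      switch(t) {
        case Type::int32: process<int32_t>(); break;
        case Type::float32: process<float>(); break;
      }
    }

    int main() {
      process(Type::float32);
      return 0;
    }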
+ // Swaps the contents of the current tensor with the argument tensor
+ template <typename T>
+ void swap(Tensor swapee) {
+ ABORT_IF(swapee->shape() != shape_, "Can only swap tensors with equal shapes ({} != {})", swapee->shape(), shape_);
+ ABORT_IF(swapee->type() != type_, "Can only swap tensors with equal types ({} != {})", swapee->type(), type_);
+ ABORT_IF(!matchType<T>(type_),
+ "Requested type ({}) and underlying type ({}) do not match",
+ request<T>(),
+ type_);
+ // we live on CPUs; just use stdlib
+ if(swapee->getBackend()->getDeviceId().type == DeviceType::cpu
+ && backend_->getDeviceId().type == DeviceType::cpu) {
+ std::swap_ranges(swapee->data<T>(), swapee->data<T>() + swapee->size(), data<T>());
+ }
+ #ifdef CUDA_FOUND
+ else {
+ if(backend_->getDeviceId() == swapee->getBackend()->getDeviceId()) {
+ // we live on the same GPU; do an element-wise swap
+ gpu::swap_ranges(backend_, swapee->data<T>(), swapee->data<T>() + swapee->size(), data<T>());
+ } else {
+ // we live on two different GPUs or devices; go through CPU RAM
+ std::vector<T> temp;
+ get(temp);
+ copyFrom(swapee);
+ swapee->set(temp);
+ }
+ }
+ #endif
+ }
+ void swap(Tensor swapee) {
+ switch(type_) {
+ case Type::int8: swap<int8_t>(swapee); break;
+ case Type::int16: swap<int16_t>(swapee); break;
+ case Type::int32: swap<int32_t>(swapee); break;
+ case Type::int64: swap<int64_t>(swapee); break;
+ case Type::uint8: swap<uint8_t>(swapee); break;
+ case Type::uint16: swap<uint16_t>(swapee); break;
+ case Type::uint32: swap<uint32_t>(swapee); break;
+ case Type::uint64: swap<uint64_t>(swapee); break;
+ case Type::float32: swap<float>(swapee); break;
+ case Type::float64: swap<double>(swapee); break;
+ default: ABORT("Unknown type {}", type_);
+ }
+ }
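The templated swap() above distinguishes three cases: both tensors on CPU (plain std::swap_ranges), both on the same GPU (the new gpu::swap_ranges kernel), and two different devices, where it round-trips through host RAM via get()/copyFrom()/set(). A self-contained sketch of that fallback path; std::vector stands in for device memory and swapViaHost is an illustrative name:

    #include <cassert>
    #include <vector>

    // Back up one side on the host, overwrite it with the other side,
    // then restore the backup into the other side.
    template <typename T>
    void swapViaHost(std::vector<T>& self, std::vector<T>& swapee) {
      std::vector<T> temp = self; // get(temp): host backup of this tensor
      self = swapee;              // copyFrom(swapee): device-to-device copy
      swapee = temp;              // swapee->set(temp): upload the backup
    }

    int main() {
      std::vector<int> a{1, 2, 3}, b{4, 5, 6};
      swapViaHost(a, b);
      assert(a[0] == 4 && b[0] == 1);
      return 0;
    }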
template <typename T>
std::string debug() {
ABORT_IF(!matchType<T>(type_),

View File

@@ -7,9 +7,9 @@
#include "optimizers/optimizers.h"
#if MPI_FOUND
#ifdef __GNUC__
- #pragma GCC diagnostic push
- #pragma GCC diagnostic ignored "-Wsuggest-override"
- #endif
+ #pragma GCC diagnostic push
+ #pragma GCC diagnostic ignored "-Wsuggest-override"
+ #endif
#include "mpi.h"
#ifdef __GNUC__
#pragma GCC diagnostic pop
@@ -203,25 +203,18 @@ public:
void swapParams(const std::vector<Tensor>& paramShards) const override {
// Update all graphs with parameter shard
ABORT_IF(graphs_.size() < 2, "Swap requires at least two graphs");
auto gather = [this, paramShards](size_t idx, size_t begin, size_t end) {
- ABORT_IF(end-begin != paramShards[idx]->size(), "inconsistent shard size (swapParams, [{}], {} vs {})??", idx, end-begin, paramShards[idx]->size());
+ ABORT_IF(end - begin != paramShards[idx]->size(), "inconsistent shard size (swapParams, [{}], {} vs {})??", idx, end-begin, paramShards[idx]->size());
// Copy parameter shard to each graph, apart from last graph
for(int i = 0; i < (int)graphs_.size() - 1; ++i) {
- auto subParam
- = graphs_[i]->params()->vals()->subtensor(begin, paramShards[idx]->size());
+ auto subParam = graphs_[i]->params()->vals()->subtensor(begin, paramShards[idx]->size());
subParam->copyFrom(paramShards[idx]);
}
- // Back-up shard from last graph
- auto subParamLast =
- graphs_.back()->params()->vals()->subtensor(begin, paramShards[idx]->size());
- paramShards[idx]->copyFrom(subParamLast);
- auto subParamFirst
- = graphs_[0]->params()->vals()->subtensor(begin, paramShards[idx]->size());
- subParamLast->copyFrom(subParamFirst);
+ // Swap shard with the corresponding shard from the last graph
+ auto subParamLast = graphs_.back()->params()->vals()->subtensor(begin, paramShards[idx]->size());
+ paramShards[idx]->swap(subParamLast);
};
// Execute for each shard
foreach(gather);
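With the tensor-level swap() in place, the gather lambda no longer needs the old back-up/copy sequence: it copies the shard into every graph except the last, then exchanges the shard with the last graph's subtensor in a single step. A sketch of the net effect per shard; plain vectors stand in for the graphs' parameter subtensors and gatherSketch is an illustrative name:

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    void gatherSketch(std::vector<std::vector<float>>& graphParams,
                      std::vector<float>& shard) {
      // Copy the shard into every graph except the last ...
      for(std::size_t i = 0; i + 1 < graphParams.size(); ++i)
        std::copy(shard.begin(), shard.end(), graphParams[i].begin());
      // ... then swap with the last graph: afterwards every graph holds the
      // shard's values and the shard holds the last graph's previous values.
      std::swap_ranges(shard.begin(), shard.end(), graphParams.back().begin());
    }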