diff --git a/src/tensors/gpu/algorithm.cu b/src/tensors/gpu/algorithm.cu
index 9aad629d..bdf66bac 100755
--- a/src/tensors/gpu/algorithm.cu
+++ b/src/tensors/gpu/algorithm.cu
@@ -49,7 +49,7 @@ void fill(Ptr<Backend> backend, T* begin, T* end, T value) {
   if (size == 0)
     return;
   CUDA_CHECK(cudaSetDevice(backend->getDeviceId().no));
-  int threadsPerBlock = std::min(512, size);
+  int threadsPerBlock = std::min(MAX_THREADS, size);
   int blocks = (size / threadsPerBlock) + (size % threadsPerBlock != 0); // @TODO: (size+threadsPerBlock-1)/threadsPerBlock or CeilDiv(a,b)
   gFill<<<blocks, threadsPerBlock>>>(begin, size, value);
   CUDA_CHECK(cudaStreamSynchronize(0));
@@ -76,5 +76,46 @@ void setSparse(Ptr<Backend> backend,
   // gpu::SetSparse(data, keys, values);
   CUDA_CHECK(cudaStreamSynchronize(0));
 }
+
+template <typename T>
+__global__ void gSwap(T* d_v1, T* d_v2, int size) {
+  auto threadsPerBlock = blockDim.x;
+  int index = threadIdx.x + threadsPerBlock * blockIdx.x;
+  if(index < size) {
+    T temp = d_v1[index];
+    d_v1[index] = d_v2[index];
+    d_v2[index] = temp;
+  }
+}
+
+template <typename T>
+void swap_ranges(Ptr<Backend> backend, T* begin, T* end, T* dest) {
+  int size = end - begin;
+  if (size == 0)
+    return;
+
+  CUDA_CHECK(cudaSetDevice(backend->getDeviceId().no));
+  int threadsPerBlock = std::min(MAX_THREADS, size);
+  int blocks = (size / threadsPerBlock) + (size % threadsPerBlock != 0); // @TODO: (size+threadsPerBlock-1)/threadsPerBlock or CeilDiv(a,b)
+  gSwap<<<blocks, threadsPerBlock>>>(begin, dest, size);
+  CUDA_CHECK(cudaStreamSynchronize(0));
+}
+
+// clang-format off
+template void swap_ranges<int8_t>(Ptr<Backend>, int8_t*, int8_t*, int8_t*);
+template void swap_ranges<int16_t>(Ptr<Backend>, int16_t*, int16_t*, int16_t*);
+template void swap_ranges<int32_t>(Ptr<Backend>, int32_t*, int32_t*, int32_t*);
+template void swap_ranges<int64_t>(Ptr<Backend>, int64_t*, int64_t*, int64_t*);
+
+template void swap_ranges<uint8_t>(Ptr<Backend>, uint8_t*, uint8_t*, uint8_t*);
+template void swap_ranges<uint16_t>(Ptr<Backend>, uint16_t*, uint16_t*, uint16_t*);
+template void swap_ranges<uint32_t>(Ptr<Backend>, uint32_t*, uint32_t*, uint32_t*);
+template void swap_ranges<uint64_t>(Ptr<Backend>, uint64_t*, uint64_t*, uint64_t*);
+
+template void swap_ranges<char>(Ptr<Backend>, char*, char*, char*);
+template void swap_ranges<float>(Ptr<Backend>, float*, float*, float*);
+template void swap_ranges<double>(Ptr<Backend>, double*, double*, double*);
+// clang-format on
+
 } // namespace gpu
 } // namespace marian
diff --git a/src/tensors/gpu/algorithm.h b/src/tensors/gpu/algorithm.h
index 9b4480e9..84f8b41e 100644
--- a/src/tensors/gpu/algorithm.h
+++ b/src/tensors/gpu/algorithm.h
@@ -10,6 +10,9 @@ void copy(Ptr<Backend> backend, const T* begin, const T* end, T* dest);
 template <typename T>
 void fill(Ptr<Backend> backend, T* begin, T* end, T value);
 
+template <typename T>
+void swap_ranges(Ptr<Backend> backend, T* begin, T* end, T* dest);
+
 void setSparse(Ptr<Backend> backend,
                const std::vector<size_t>&,
                const std::vector<float>&,
diff --git a/src/tensors/tensor.h b/src/tensors/tensor.h
index c40b94e0..9721670b 100755
--- a/src/tensors/tensor.h
+++ b/src/tensors/tensor.h
@@ -56,15 +56,6 @@ public:
 
   virtual size_t size() { return shape_.elements(); }
 
-  virtual float scalar() {
-    ABORT_IF(!matchType<float>(type_),
-             "Requested type ({}) and underlying type ({}) do not match",
-             request<float>(),
-             type_);
-    ABORT_IF(size() != 1, "Tensor is not a scalar");
-    return get(0);
-  }
-
   template <typename T>
   T scalar() {
     ABORT_IF(!matchType<T>(type_),
@@ -76,6 +67,10 @@ public:
     return get<T>(0);
   }
 
+  virtual float scalar() {
+    return scalar<float>();
+  }
+
   Ptr<Backend> getBackend() { return backend_; }
   DeviceId getDeviceId() { return backend_->getDeviceId(); }
 
@@ -85,24 +80,6 @@ public:
     return New<TensorBase>(mem, Shape{1, (int)size}, backend_);
   }
 
-  float get(size_t i) {
-    ABORT_IF(!matchType<float>(type_),
-             "Requested type ({}) and underlying type ({}) do not match",
not match", - request(), - type_); - - float temp = 0; // (initialize to keep compiler happy) - if(backend_->getDeviceId().type == DeviceType::cpu) { - std::copy(data() + i, data() + i + 1, &temp); - } -#ifdef CUDA_FOUND - else { - gpu::copy(backend_, data() + i, data() + i + 1, &temp); - } -#endif - return temp; - } - template T get(size_t i) { ABORT_IF(!matchType(type_), @@ -122,6 +99,10 @@ public: return temp; } + float get(size_t i) { + return get(i); + } + template void set(size_t i, T value) { ABORT_IF(!matchType(type_), @@ -228,24 +209,95 @@ public: #endif } + template void copyFrom(Tensor in) { - // @TODO: solve this later - ABORT_IF(!matchType(type_), + ABORT_IF(in->shape() != shape_, "Can only copy tensors with equal shapes ({} != {})", in->shape(), shape_); + ABORT_IF(in->type() != type_, "Can only copy tensors with equal types ({} != {})", in->type(), type_); + ABORT_IF(!matchType(type_), "Requested type ({}) and underlying type ({}) do not match", - request(), + request(), type_); if(in->getBackend()->getDeviceId().type == DeviceType::cpu && backend_->getDeviceId().type == DeviceType::cpu) { - std::copy(in->data(), in->data() + in->size(), data()); + std::copy(in->data(), in->data() + in->size(), data()); } #ifdef CUDA_FOUND else { - gpu::copy(backend_, in->data(), in->data() + in->size(), data()); + gpu::copy(backend_, in->data(), in->data() + in->size(), data()); } #endif } + void copyFrom(Tensor in) { + switch(type_) { + case Type::int8: copyFrom(in); break; + case Type::int16: copyFrom(in); break; + case Type::int32: copyFrom(in); break; + case Type::int64: copyFrom(in); break; + + case Type::uint8: copyFrom(in); break; + case Type::uint16: copyFrom(in); break; + case Type::uint32: copyFrom(in); break; + case Type::uint64: copyFrom(in); break; + + case Type::float32: copyFrom(in); break; + case Type::float64: copyFrom(in); break; + + default: ABORT("Unknown type {}", type_); + } + } + + // Swaps the contents of the current tensor with the argument tensor + template + void swap(Tensor swapee) { + ABORT_IF(swapee->shape() != shape_, "Can only swap tensors with equal shapes ({} != {})", swapee->shape(), shape_); + ABORT_IF(swapee->type() != type_, "Can only swap tensors with equal types ({} != {})", swapee->type(), type_); + ABORT_IF(!matchType(type_), + "Requested type ({}) and underlying type ({}) do not match", + request(), + type_); + + // we live on CPUs; just use stdlib + if(swapee->getBackend()->getDeviceId().type == DeviceType::cpu + && backend_->getDeviceId().type == DeviceType::cpu) { + std::swap_ranges(swapee->data(), swapee->data() + swapee->size(), data()); + } +#ifdef CUDA_FOUND + else { + if(backend_->getDeviceId() == swapee->getBackend()->getDeviceId()) { + // we live on the same GPU; do an element-wise swap + gpu::swap_ranges(backend_, swapee->data(), swapee->data() + swapee->size(), data()); + } else { + // we live on two different GPUs or devices; go through CPU RAM + std::vector temp; + get(temp); + copyFrom(swapee); + swapee->set(temp); + } + } +#endif + } + + void swap(Tensor swapee) { + switch(type_) { + case Type::int8: swap(swapee); break; + case Type::int16: swap(swapee); break; + case Type::int32: swap(swapee); break; + case Type::int64: swap(swapee); break; + + case Type::uint8: swap(swapee); break; + case Type::uint16: swap(swapee); break; + case Type::uint32: swap(swapee); break; + case Type::uint64: swap(swapee); break; + + case Type::float32: swap(swapee); break; + case Type::float64: swap(swapee); break; + + default: ABORT("Unknown type {}", 
+    }
+  }
+
   template <typename T>
   std::string debug() {
     ABORT_IF(!matchType<T>(type_),
diff --git a/src/training/communicator.h b/src/training/communicator.h
index 2d42e109..eef136f2 100755
--- a/src/training/communicator.h
+++ b/src/training/communicator.h
@@ -7,9 +7,9 @@
 #include "optimizers/optimizers.h"
 #if MPI_FOUND
 #ifdef __GNUC__
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wsuggest-override"
-#endif
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wsuggest-override"
+#endif
 #include "mpi.h"
 #ifdef __GNUC__
 #pragma GCC diagnostic pop
@@ -203,25 +203,18 @@ public:
   void swapParams(const std::vector<Tensor>& paramShards) const override {
     // Update all graphs with parameter shard
-    ABORT_IF(graphs_.size() < 2, "Swap requires at least two graphs");
-
+
     auto gather = [this, paramShards](size_t idx, size_t begin, size_t end) {
-      ABORT_IF(end-begin != paramShards[idx]->size(), "inconsistent shard size (swapParams, [{}], {} vs {})??", idx, end-begin, paramShards[idx]->size());
+      ABORT_IF(end - begin != paramShards[idx]->size(), "inconsistent shard size (swapParams, [{}], {} vs {})??", idx, end-begin, paramShards[idx]->size());
       // Copy parameter shard to each graph, apart from last graph
       for(int i = 0; i < (int)graphs_.size() - 1; ++i) {
-        auto subParam
-            = graphs_[i]->params()->vals()->subtensor(begin, paramShards[idx]->size());
+        auto subParam = graphs_[i]->params()->vals()->subtensor(begin, paramShards[idx]->size());
         subParam->copyFrom(paramShards[idx]);
       }
-      // Back-up shard from last graph
-      auto subParamLast =
-          graphs_.back()->params()->vals()->subtensor(begin, paramShards[idx]->size());
-      paramShards[idx]->copyFrom(subParamLast);
-
-      auto subParamFirst
-          = graphs_[0]->params()->vals()->subtensor(begin, paramShards[idx]->size());
-      subParamLast->copyFrom(subParamFirst);
+      // Swap shard with corresponding share from last graph
+      auto subParamLast = graphs_.back()->params()->vals()->subtensor(begin, paramShards[idx]->size());
+      paramShards[idx]->swap(subParamLast);
     };
     // Execute for each shard
     foreach(gather);
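
Note (not part of the patch): a minimal, self-contained C++ sketch of the two pieces of behaviour the new swap path relies on. The buffer names below are made up for illustration and are not Marian API. The CPU branch of Tensor::swap delegates to std::swap_ranges, which exchanges two equal-sized ranges element by element; the GPU branch launches the gSwap kernel with the same block-count arithmetic as gFill, where (size / threadsPerBlock) + (size % threadsPerBlock != 0) is the same ceiling division as the (size + threadsPerBlock - 1) / threadsPerBlock form suggested in the @TODO comments.

// Illustrative sketch only, not part of the patch. Names are hypothetical.
#include <algorithm>
#include <cassert>
#include <vector>

int main() {
  // What the CPU branch of Tensor::swap does: element-wise exchange of two
  // equal-sized ranges via std::swap_ranges.
  std::vector<float> buf_a = {1.f, 2.f, 3.f, 4.f};
  std::vector<float> buf_b = {9.f, 8.f, 7.f, 6.f};
  std::swap_ranges(buf_a.begin(), buf_a.end(), buf_b.begin());
  assert(buf_a[0] == 9.f && buf_b[0] == 1.f);

  // The block-count expression used when launching gFill/gSwap is a ceiling
  // division, equivalent to the form proposed in the @TODO comment.
  for (int size = 1; size <= 2048; ++size) {
    int threadsPerBlock = std::min(512, size);
    int blocksA = (size / threadsPerBlock) + (size % threadsPerBlock != 0);
    int blocksB = (size + threadsPerBlock - 1) / threadsPerBlock;
    assert(blocksA == blocksB);
  }
  return 0;
}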