From 483426798930bc17309fecfb401b28ec8917529c Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Thu, 15 Feb 2018 15:23:45 -0800 Subject: [PATCH 01/11] clean up allocators and devices --- CMakeLists.txt | 12 +++--- src/CMakeLists.txt | 4 +- src/backend/cpu/dropout.cpp | 4 +- src/graph/expression_graph.h | 2 +- src/kernels/cuda_helpers.h | 6 +++ src/kernels/tensor_operators.cu | 12 +++--- src/kernels/tensor_operators.h | 5 +-- src/python/CMakeLists.txt | 2 +- src/tensors/allocator.h | 28 +++++++------- src/tensors/device.cpp | 28 ++++++++++++++ src/tensors/device.cu | 40 +++++++++++++++++++ src/tensors/device.h | 68 +++++++++++++++++++++++++++++++++ src/tensors/device_gpu.cu | 37 ------------------ src/tensors/device_gpu.h | 35 ----------------- src/tensors/memory_piece.cu | 11 ------ src/tensors/memory_piece.h | 7 ---- src/tensors/tensor_allocator.h | 7 ++-- src/tests/allocator_test.cpp | 46 +--------------------- src/tests/dropout_test.cu | 6 +-- 19 files changed, 181 insertions(+), 179 deletions(-) create mode 100644 src/tensors/device.cpp create mode 100644 src/tensors/device.cu create mode 100644 src/tensors/device.h delete mode 100644 src/tensors/device_gpu.cu delete mode 100644 src/tensors/device_gpu.h delete mode 100644 src/tensors/memory_piece.cu diff --git a/CMakeLists.txt b/CMakeLists.txt index 6628d8f4..23598378 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -58,12 +58,12 @@ else(Tcmalloc_FOUND) message(WARNING "Cannot find TCMalloc library. Continuing.") endif(Tcmalloc_FOUND) -find_package(MPI) -if(MPI_FOUND) - include_directories(${MPI_INCLUDE_PATH}) - set(EXT_LIBS ${EXT_LIBS} ${MPI_LIBRARIES}) - add_definitions(-DMPI_FOUND=1) -endif(MPI_FOUND) +#find_package(MPI) +#if(MPI_FOUND) +# include_directories(${MPI_INCLUDE_PATH}) +# set(EXT_LIBS ${EXT_LIBS} ${MPI_LIBRARIES}) +# add_definitions(-DMPI_FOUND=1) +#endif(MPI_FOUND) find_package(ZLIB) if(ZLIB_FOUND) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 8550e51d..c1bfdaf6 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -14,8 +14,8 @@ cuda_add_library(marian graph/node_operators.cu graph/node_initializers.cu tensors/tensor.cu - tensors/device_gpu.cu - tensors/memory_piece.cu + tensors/device.cu + tensors/device.cpp kernels/tensor_operators.cu kernels/cudnn_wrappers.cu backend/gpu/dropout.cu diff --git a/src/backend/cpu/dropout.cpp b/src/backend/cpu/dropout.cpp index 034dfeec..247dd460 100644 --- a/src/backend/cpu/dropout.cpp +++ b/src/backend/cpu/dropout.cpp @@ -1,5 +1,4 @@ -#include -#include +#include #include "backend/dispatch.h" @@ -8,6 +7,7 @@ namespace marian { void Dropout(Ptr backend, Tensor tensor, float p) { ABORT("Not implemented"); + std::fill(tensor->data(), tensor->data() + tensor->size(), p); } } diff --git a/src/graph/expression_graph.h b/src/graph/expression_graph.h index 9d7f714e..7d50e9ba 100644 --- a/src/graph/expression_graph.h +++ b/src/graph/expression_graph.h @@ -306,7 +306,7 @@ public: tensors_->free(t); } - Ptr> allocator() { return tensors_->allocator(); } + Ptr allocator() { return tensors_->allocator(); } void clear() { // clear everything apart from parameters diff --git a/src/kernels/cuda_helpers.h b/src/kernels/cuda_helpers.h index bc4c21d6..92faaf4c 100644 --- a/src/kernels/cuda_helpers.h +++ b/src/kernels/cuda_helpers.h @@ -17,6 +17,12 @@ inline void gpuAssert(cudaError_t code, } } +template +void CudaCopy(const T* start, const T* end, D* dest) { + CUDA_CHECK(cudaMemcpy((void*)dest, (void*)start, (end - start) * sizeof(T), + cudaMemcpyDefault)); +} + #define CUSPARSE_CHECK(x) \ { \ cusparseStatus_t _c = x; \ diff --git a/src/kernels/tensor_operators.cu b/src/kernels/tensor_operators.cu index ad7c5d14..69b8afc4 100644 --- a/src/kernels/tensor_operators.cu +++ b/src/kernels/tensor_operators.cu @@ -879,7 +879,7 @@ __global__ void gInsert(float* out, } } -void Select(Ptr> allocator, +void Select(Ptr allocator, Tensor out, const Tensor in, int axis, @@ -892,8 +892,8 @@ void Select(Ptr> allocator, int blocks = std::min(MAX_BLOCKS, length / threads + (length % threads != 0)); auto mp_indices = allocator->alloc(indices.size()); - mp_indices->insert(indices.data(), indices.size()); - + CudaCopy(indices.data(), indices.data() + indices.size(), mp_indices->data()); + int axisGPU = axis + gpu::Shape::size() - out->shape().size(); gSelect<<>>(out->data(), out->shape(), @@ -905,7 +905,7 @@ void Select(Ptr> allocator, allocator->free(mp_indices); } -void Insert(Ptr> allocator, +void Insert(Ptr allocator, Tensor out, const Tensor in, int axis, @@ -918,8 +918,8 @@ void Insert(Ptr> allocator, int blocks = std::min(MAX_BLOCKS, length / threads + (length % threads != 0)); auto mp_indices = allocator->alloc(indices.size()); - mp_indices->insert(indices.data(), indices.size()); - + CudaCopy(indices.data(), indices.data() + indices.size(), mp_indices->data()); + int axisGPU = axis + gpu::Shape::size() - out->shape().size(); gInsert<<>>(out->data(), out->shape(), diff --git a/src/kernels/tensor_operators.h b/src/kernels/tensor_operators.h index 91717c1b..77e58670 100644 --- a/src/kernels/tensor_operators.h +++ b/src/kernels/tensor_operators.h @@ -7,7 +7,6 @@ #include "tensors/tensor.h" #include "tensors/allocator.h" -#include "tensors/device_gpu.h" #include "gpu/shape.h" #include "gpu/tmp.h" @@ -71,13 +70,13 @@ void Element(Functor functor, Tensor out, Tensors ...tensors) { void TransposeND(Tensor out, Tensor in, const std::vector& vAxis); -void Select(Ptr> allocator, +void Select(Ptr allocator, Tensor out, Tensor in, int axis, const std::vector&); -void Insert(Ptr> allocator, +void Insert(Ptr allocator, Tensor out, Tensor in, int axis, diff --git a/src/python/CMakeLists.txt b/src/python/CMakeLists.txt index baa7f97d..d547660a 100644 --- a/src/python/CMakeLists.txt +++ b/src/python/CMakeLists.txt @@ -8,7 +8,7 @@ cuda_add_library(pymarian SHARED ../graph/node.cu ../graph/node_operators.cu ../tensors/tensor.cu - ../tensors/device_gpu.cu + ../tensors/device.cpp ../kernels/tensor_operators.cu ../backend/gpu/dropout.cu ../backend/cpu/dropout.cpp diff --git a/src/tensors/allocator.h b/src/tensors/allocator.h index 8c9a5fee..6c9b51f6 100644 --- a/src/tensors/allocator.h +++ b/src/tensors/allocator.h @@ -10,6 +10,7 @@ #include "common/definitions.h" #include "tensors/memory_piece.h" +#include "tensors/device.h" namespace marian { @@ -65,10 +66,9 @@ public: Gap rest(size_t offset) const { return Gap(data_ + offset, size_ - offset); } }; -template class Allocator { private: - Device device_; + Ptr device_; size_t available_{0}; size_t step_{128 * 1024 * 1024}; size_t alignment_{256}; @@ -83,23 +83,23 @@ private: void grow(size_t add) { add = align(add); - uint8_t* oldData = device_.data(); - size_t oldSize = device_.size(); + uint8_t* oldData = device_->data(); + size_t oldSize = device_->size(); - device_.reserve(oldSize + add); + device_->reserve(oldSize + add); std::set oldGaps; gaps_.swap(oldGaps); for(auto gap : oldGaps) gaps_.insert( - Gap(device_.data() + std::distance(oldData, gap.data()), gap.size())); - insertGap(Gap(device_.data() + oldSize, add)); + Gap(device_->data() + std::distance(oldData, gap.data()), gap.size())); + insertGap(Gap(device_->data() + oldSize, add)); std::unordered_map> oldAllocated; allocated_.swap(oldAllocated); for(auto it : oldAllocated) { - uint8_t* newPtr = device_.data() + std::distance(oldData, it.first); + uint8_t* newPtr = device_->data() + std::distance(oldData, it.first); allocated_[newPtr] = oldAllocated[it.first]; allocated_[newPtr]->setPtr(newPtr); } @@ -142,7 +142,7 @@ private: public: Allocator(DeviceId deviceId, size_t bytes, size_t step, size_t alignment = 256) - : device_(deviceId, alignment), + : device_(DispatchDevice(deviceId, alignment)), step_(step), available_(0), alignment_(alignment) { @@ -153,7 +153,7 @@ public: void reserve(size_t bytes) { bytes = align(bytes); - device_.reserve(bytes); + device_->reserve(bytes); clear(); } @@ -211,17 +211,17 @@ public: available_ = 0; gaps_.clear(); allocated_.clear(); - insertGap({device_.data(), device_.size()}, false); + insertGap({device_->data(), device_->size()}, false); } Ptr memory() { - return New(device_.data(), device_.size()); + return New(device_->data(), device_->size()); } - size_t size() { return device_.size(); } + size_t size() { return device_->size(); } size_t available() { return available_; } - DeviceId getDevice() { return device_.getDevice(); } + DeviceId getDevice() { return device_->getDevice(); } }; } diff --git a/src/tensors/device.cpp b/src/tensors/device.cpp new file mode 100644 index 00000000..5e2a3568 --- /dev/null +++ b/src/tensors/device.cpp @@ -0,0 +1,28 @@ +#include +#include "tensors/device.h" + +namespace marian { +namespace cpu { + + Device::~Device() { + delete[] data_; + } + + void Device::reserve(size_t size) { + size = align(size); + ABORT_IF(size < size_, "New size must be larger than old size"); + + if(data_) { + uint8_t *temp = new uint8_t[size_]; + std::copy(data_, data_ + size_, temp); + delete[] data_; + data_ = temp; + } else { + data_ = new uint8_t[size]; + } + + size_ = size; + } + +} +} diff --git a/src/tensors/device.cu b/src/tensors/device.cu new file mode 100644 index 00000000..c773d566 --- /dev/null +++ b/src/tensors/device.cu @@ -0,0 +1,40 @@ +#include +#include + +#include "tensors/device.h" +#include "kernels/cuda_helpers.h" + +namespace marian { +namespace gpu { + + Device::~Device() { + cudaSetDevice(deviceId_.no); + if(data_) { + CUDA_CHECK(cudaFree(data_)); + } + cudaDeviceSynchronize(); + } + + void Device::reserve(size_t size) { + size = align(size); + cudaSetDevice(deviceId_.no); + + ABORT_IF(size < size_, "New size must be larger than old size"); + + if(data_) { + // Allocate memory by going through host memory + uint8_t *temp = new uint8_t[size_]; + CUDA_CHECK(cudaMemcpy(temp, data_, size_, cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaFree(data_)); + CUDA_CHECK(cudaMalloc(&data_, size)); + CUDA_CHECK(cudaMemcpy(data_, temp, size_, cudaMemcpyHostToDevice)); + delete[] temp; + } else { + CUDA_CHECK(cudaMalloc(&data_, size)); + } + + size_ = size; + } + +} +} diff --git a/src/tensors/device.h b/src/tensors/device.h new file mode 100644 index 00000000..33a9c104 --- /dev/null +++ b/src/tensors/device.h @@ -0,0 +1,68 @@ +#pragma once + +#include +#include + +#include "common/definitions.h" + +namespace marian { + +class Device { +protected: + DeviceId deviceId_; + + uint8_t* data_; + size_t size_; + size_t alignment_; + + size_t align(size_t size) { + return ceil(size / (float)alignment_) * alignment_; + } + +public: + Device(DeviceId deviceId, size_t alignment = 256) + : deviceId_(deviceId), data_(0), size_(0), alignment_(alignment) {} + + virtual ~Device() {}; + + virtual void reserve(size_t size) = 0; + + virtual uint8_t* data() { return data_; } + + virtual size_t size() { return size_; } + + virtual DeviceId getDevice() { return deviceId_; } +}; + +namespace gpu { + class Device : public marian::Device { + public: + Device(DeviceId deviceId, size_t alignment = 256) + : marian::Device(deviceId, alignment) {} + + ~Device(); + + void reserve(size_t size); + }; +} + +namespace cpu { + class Device : public marian::Device { + public: + Device(DeviceId deviceId, size_t alignment = 256) + : marian::Device(deviceId, alignment) {} + + ~Device(); + + void reserve(size_t size); + }; +} + +static inline Ptr DispatchDevice(DeviceId deviceId, size_t alignment = 256) { + if(deviceId.type == DeviceType::gpu) + return New(deviceId, alignment); + else + return New(deviceId, alignment); +} + +} \ No newline at end of file diff --git a/src/tensors/device_gpu.cu b/src/tensors/device_gpu.cu deleted file mode 100644 index f56c8747..00000000 --- a/src/tensors/device_gpu.cu +++ /dev/null @@ -1,37 +0,0 @@ -#include -#include - -#include "tensors/device_gpu.h" -#include "kernels/cuda_helpers.h" - -namespace marian { - -DeviceGPU::~DeviceGPU() { - cudaSetDevice(deviceId_.no); - if(data_) { - CUDA_CHECK(cudaFree(data_)); - } - cudaDeviceSynchronize(); -} - -void DeviceGPU::reserve(size_t size) { - size = align(size); - cudaSetDevice(deviceId_.no); - - ABORT_IF(size < size_, "New size must be larger than old size"); - - if(data_) { - // Allocate memory by going through host memory - uint8_t *temp = new uint8_t[size_]; - CUDA_CHECK(cudaMemcpy(temp, data_, size_, cudaMemcpyDeviceToHost)); - CUDA_CHECK(cudaFree(data_)); - CUDA_CHECK(cudaMalloc(&data_, size)); - CUDA_CHECK(cudaMemcpy(data_, temp, size_, cudaMemcpyHostToDevice)); - delete[] temp; - } else { - CUDA_CHECK(cudaMalloc(&data_, size)); - } - - size_ = size; -} -} diff --git a/src/tensors/device_gpu.h b/src/tensors/device_gpu.h deleted file mode 100644 index 279a2b2b..00000000 --- a/src/tensors/device_gpu.h +++ /dev/null @@ -1,35 +0,0 @@ -#pragma once - -#include -#include - -#include "common/definitions.h" - -namespace marian { - -class DeviceGPU { -private: - uint8_t* data_; - size_t size_; - DeviceId deviceId_; - size_t alignment_; - - size_t align(size_t size) { - return ceil(size / (float)alignment_) * alignment_; - } - -public: - DeviceGPU(DeviceId deviceId, size_t alignment = 256) - : data_(0), size_(0), deviceId_(deviceId), alignment_(alignment) {} - - ~DeviceGPU(); - - void reserve(size_t size); - - uint8_t* data() { return data_; } - - size_t size() { return size_; } - - DeviceId getDevice() { return deviceId_; } -}; -} \ No newline at end of file diff --git a/src/tensors/memory_piece.cu b/src/tensors/memory_piece.cu deleted file mode 100644 index d19767ae..00000000 --- a/src/tensors/memory_piece.cu +++ /dev/null @@ -1,11 +0,0 @@ -#include - -#include "kernels/cuda_helpers.h" -#include "tensors/memory_piece.h" - -namespace marian { - -void MemoryPiece::insert(uint8_t* ptr, size_t num) { - CUDA_CHECK(cudaMemcpy(data_, ptr, num * sizeof(uint8_t), cudaMemcpyDefault)); -} -} diff --git a/src/tensors/memory_piece.h b/src/tensors/memory_piece.h index 4b464e79..8380d2cd 100644 --- a/src/tensors/memory_piece.h +++ b/src/tensors/memory_piece.h @@ -34,13 +34,6 @@ public: void setPtr(uint8_t* data) { data_ = data; } - template - void insert(T* ptr, size_t num) { - insert((uint8_t*)ptr, num * sizeof(T)); - } - - void insert(uint8_t* ptr, size_t num); - friend std::ostream& operator<<(std::ostream& out, const MemoryPiece mp) { out << "MemoryPiece - ptr: " << std::hex << (size_t)mp.data() << std::dec << " size: " << mp.size(); diff --git a/src/tensors/tensor_allocator.h b/src/tensors/tensor_allocator.h index 3498ff18..e1c54b22 100644 --- a/src/tensors/tensor_allocator.h +++ b/src/tensors/tensor_allocator.h @@ -5,7 +5,6 @@ #include "common/definitions.h" #include "tensors/allocator.h" -#include "tensors/device_gpu.h" #include "tensors/tensor.h" namespace marian { @@ -17,11 +16,11 @@ private: const size_t GROW = CHUNK * MBYTE; const size_t ALIGN = 256; - Ptr> allocator_; + Ptr allocator_; public: TensorAllocator(DeviceId deviceId) - : allocator_(New>(deviceId, 0, GROW, ALIGN)) {} + : allocator_(New(deviceId, 0, GROW, ALIGN)) {} ~TensorAllocator() { clear(); } @@ -73,6 +72,6 @@ public: size_t size() { return allocator_->size() / sizeof(float); } - Ptr> allocator() { return allocator_; } + Ptr allocator() { return allocator_; } }; } diff --git a/src/tests/allocator_test.cpp b/src/tests/allocator_test.cpp index 2a67fe37..1b5cb348 100644 --- a/src/tests/allocator_test.cpp +++ b/src/tests/allocator_test.cpp @@ -5,55 +5,11 @@ #include "3rd_party/exception.h" #include "tensors/allocator.h" -#include "tensors/device_gpu.h" - -class DeviceCPU { -private: - uint8_t* data_; - size_t size_; - size_t alignment_; - -public: - DeviceCPU(size_t device, size_t alignment = 256) - : data_(0), size_(0), alignment_(alignment) {} - - ~DeviceCPU() { delete[] data_; } - - size_t align(size_t size) { - return ceil(size / (float)alignment_) * alignment_; - } - - void reserve(size_t size) { - size = align(size); - ABORT_IF(size < size_, "New size must be larger than old size"); - - if(data_) { - // Allocate memory by going through host memory - uint8_t *temp = new uint8_t[size_]; - std::copy(data_, data_ + size_, temp); - std::fill(data_, data_ + size_, 0); - delete[] data_; - data_ = new uint8_t[size]; - std::copy(temp, temp + size_, data_); - delete[] temp; - } else { - data_ = new uint8_t[size]; - } - - size_ = size; - } - - uint8_t* data() { return data_; } - - size_t size() { return size_; } - - size_t getDevice() { return 0; } -}; int main(int argc, char** argv) { using namespace marian; - auto a = New>({0, DeviceType::gpu}, 0, 30000, 256); + auto a = New({0, DeviceType::gpu}, 0, 30000, 256); std::cerr << "Size: " << a->size() << std::endl; auto mem1 = a->alloc(100000); diff --git a/src/tests/dropout_test.cu b/src/tests/dropout_test.cu index 517056c2..37738deb 100644 --- a/src/tests/dropout_test.cu +++ b/src/tests/dropout_test.cu @@ -1,13 +1,9 @@ -#include -#include #include #include #include #include #include -#include "common/config.h" -#include "graph/node_initializers.h" #include "marian.h" using namespace marian; @@ -17,7 +13,7 @@ int main(int argc, char** argv) { auto c = New(argc, argv); auto g = New(); - g->setDevice({0, DeviceType::gpu}); + g->setDevice({0, DeviceType::cpu}); g->reserveWorkspaceMB(512); for(int i = 0; i < 10; ++i) { From b4fdf6134b47220955a41a4b60806c53829eb1c7 Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Thu, 15 Feb 2018 15:30:16 -0800 Subject: [PATCH 02/11] remove tensor_cpu.h --- src/tensors/tensor.cu | 5 +-- src/tensors/tensor_cpu.h | 82 ---------------------------------------- 2 files changed, 1 insertion(+), 86 deletions(-) delete mode 100644 src/tensors/tensor_cpu.h diff --git a/src/tensors/tensor.cu b/src/tensors/tensor.cu index eec4be6b..bc26fcec 100644 --- a/src/tensors/tensor.cu +++ b/src/tensors/tensor.cu @@ -1,11 +1,8 @@ -#include -#include -#include +#include "tensors/tensor.h" #include "kernels/cuda_helpers.h" #include "kernels/tensor_operators.h" -#include "tensors/tensor.h" namespace marian { diff --git a/src/tensors/tensor_cpu.h b/src/tensors/tensor_cpu.h deleted file mode 100644 index 18725b3d..00000000 --- a/src/tensors/tensor_cpu.h +++ /dev/null @@ -1,82 +0,0 @@ -#pragma once - -// This file is part of the Marian toolkit. - -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in -// all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. - -#include - -#include "tensors/tensor.h" - -namespace marian { - -class TensorCPU : public TensorBase { -public: - TensorCPU(float* data, Shape shape) : TensorBase(data, shape) {} - - float get(size_t i) { return data_[i]; } - - void set(size_t i, float value) { data_[i] = value; } - - void get(std::vector& v) { - v.resize(size()); - std::copy(data_, data_ + size(), v.begin()); - } - - void set(float value) { std::fill(data_, data_ + size(), value); } - - void set(const std::vector& v) { - std::copy(v.begin(), v.end(), data_); - } -}; - -class DeviceCPU { -private: - float* data_; - size_t size_ - - public : DeviceCPU() - : data_(0), size_(0) {} - - ~DeviceCPU() { - if(data_) - delete[] data_; - } - - typedef TensorCPU tensor_type; - - void reserve(size_t size) { - ABORT_IF(size < size_, "New size must be larger than old size"); - float* temp = new float[size]; - - if(data_) { - std::memcpy(temp, data_, size_ * sizeof(float)); - delete[] data_; - } - - data_ = temp; - size_ = size; - } - - float* data() { return data_; } - - size_t capacity() { return size_; } -}; -} From ae600b3ef7cefb82c569444b1477873e4c2229f3 Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Thu, 15 Feb 2018 17:42:43 -0800 Subject: [PATCH 03/11] more clean up in backend --- src/CMakeLists.txt | 18 +++++---- src/backend/cpu/dropout.cpp | 14 ------- src/backend/dispatch.h | 39 ------------------- src/backend/gpu/dropout.cu | 50 ------------------------ src/graph/backend.h | 20 ---------- src/graph/backend_gpu.h | 65 ------------------------------- src/graph/expression_graph.cpp | 7 +--- src/graph/expression_graph.h | 2 +- src/graph/node.cu | 51 ------------------------ src/graph/node.h | 5 ++- src/graph/node_operators_binary.h | 58 +++++++++++++-------------- src/graph/node_operators_unary.h | 5 ++- src/python/CMakeLists.txt | 4 +- src/tests/dropout_test.cu | 2 +- 14 files changed, 51 insertions(+), 289 deletions(-) delete mode 100644 src/backend/cpu/dropout.cpp delete mode 100644 src/backend/dispatch.h delete mode 100644 src/backend/gpu/dropout.cu delete mode 100644 src/graph/backend.h delete mode 100644 src/graph/backend_gpu.h delete mode 100644 src/graph/node.cu diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index c1bfdaf6..977d88f7 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -8,19 +8,21 @@ cuda_add_library(marian 3rd_party/cnpy/cnpy.cpp 3rd_party/exception.cpp 3rd_party/svd/svd.cpp - graph/expression_graph.cpp - graph/expression_operators.cu - graph/node.cu - graph/node_operators.cu - graph/node_initializers.cu tensors/tensor.cu tensors/device.cu tensors/device.cpp + tensors/backend.cpp + tensors/gpu/dropout.cu + tensors/cpu/dropout.cpp kernels/tensor_operators.cu kernels/cudnn_wrappers.cu - backend/gpu/dropout.cu - backend/cpu/dropout.cpp + graph/expression_graph.cpp + graph/expression_operators.cu + graph/node.cpp + graph/node_operators.cu + graph/node_initializers.cu layers/convolution.cu + rnn/cells.cu optimizers/clippers.cu optimizers/optimizers.cu @@ -91,7 +93,7 @@ endforeach(exec) #set_target_properties(align2steps PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}") if(PYTHONLIBS_FOUND) - add_subdirectory(python) +# add_subdirectory(python) endif(PYTHONLIBS_FOUND) if(COMPILE_TESTS) diff --git a/src/backend/cpu/dropout.cpp b/src/backend/cpu/dropout.cpp deleted file mode 100644 index 247dd460..00000000 --- a/src/backend/cpu/dropout.cpp +++ /dev/null @@ -1,14 +0,0 @@ -#include - -#include "backend/dispatch.h" - -namespace marian { - namespace cpu { - - void Dropout(Ptr backend, Tensor tensor, float p) { - ABORT("Not implemented"); - std::fill(tensor->data(), tensor->data() + tensor->size(), p); - } - - } -} diff --git a/src/backend/dispatch.h b/src/backend/dispatch.h deleted file mode 100644 index 4d9490e1..00000000 --- a/src/backend/dispatch.h +++ /dev/null @@ -1,39 +0,0 @@ -#pragma once - -#include "common/definitions.h" -#include "graph/backend.h" -#include "tensors/tensor.h" - -#define DISPATCH1(Function, Arg1) \ - namespace gpu { \ - void Function(Ptr, Arg1); \ - } \ - namespace cpu { \ - void Function(Ptr, Arg1); \ - } \ - void Function(Ptr backend, Arg1 arg1) { \ - if(backend->getDevice().type == DeviceType::gpu) \ - gpu::Function(backend, arg1); \ - else \ - cpu::Function(backend, arg1); \ - } - -#define DISPATCH2(Function, Arg1, Arg2) \ - namespace gpu { \ - void Function(Ptr, Arg1, Arg2); \ - } \ - namespace cpu { \ - void Function(Ptr, Arg1, Arg2); \ - } \ - static inline void Function(Ptr backend, Arg1 arg1, Arg2 arg2) { \ - if(backend->getDevice().type == DeviceType::gpu) \ - gpu::Function(backend, arg1, arg2); \ - else \ - cpu::Function(backend, arg1, arg2); \ - } - -namespace marian { - - DISPATCH2(Dropout, Tensor, float) - -} \ No newline at end of file diff --git a/src/backend/gpu/dropout.cu b/src/backend/gpu/dropout.cu deleted file mode 100644 index f2c29810..00000000 --- a/src/backend/gpu/dropout.cu +++ /dev/null @@ -1,50 +0,0 @@ -#include -#include -#include -#include - -#include "backend/dispatch.h" -#include "graph/backend_gpu.h" - -#define CUDA_CALL(x) \ - do { \ - if((x) != cudaSuccess) { \ - printf("Error at %s:%d\n", __FILE__, __LINE__); \ - exit(1); \ - } \ - } while(0) - -#define CURAND_CALL(x) \ - do { \ - if((x) != CURAND_STATUS_SUCCESS) { \ - printf("Error at %s:%d\n", __FILE__, __LINE__); \ - exit(1); \ - } \ - } while(0) - - -namespace marian { - namespace gpu { - - __global__ void gScale(float* data, int n, float p) { - int index = threadIdx.x + blockIdx.x * blockDim.x; - - while(index < n) { - data[index] = (data[index] < p) / p; - index += gridDim.x * blockDim.x; - } - } - - void Dropout(Ptr backend, Tensor tensor, float p) { - curandGenerator_t gen = std::static_pointer_cast(backend)->getCurandGenerator(); - int n = tensor->size(); - CURAND_CALL(curandGenerateUniform(gen, tensor->data(), n)); - - int numThreads = std::min(n, 512); - int numBlocks = n / numThreads + (n % numThreads != 0); - - gScale<<>>(tensor->data(), n, 1.f - p); - } - - } -} diff --git a/src/graph/backend.h b/src/graph/backend.h deleted file mode 100644 index 323f3f95..00000000 --- a/src/graph/backend.h +++ /dev/null @@ -1,20 +0,0 @@ -#pragma once - -#include "common/definitions.h" - -namespace marian { - -class Backend { -protected: - DeviceId deviceId_; - size_t seed_; - -public: - Backend(DeviceId deviceId, size_t seed) - : deviceId_(deviceId), seed_(seed) {} - - virtual DeviceId getDevice() { return deviceId_; }; - virtual void setDevice() = 0; -}; - -} diff --git a/src/graph/backend_gpu.h b/src/graph/backend_gpu.h deleted file mode 100644 index faa5cc77..00000000 --- a/src/graph/backend_gpu.h +++ /dev/null @@ -1,65 +0,0 @@ -#pragma once - -#include -#include -#include - -#include "common/config.h" -#include "graph/backend.h" - -#define CURAND_CALL(x) \ - do { \ - if((x) != CURAND_STATUS_SUCCESS) { \ - printf("Error at %s:%d\n", __FILE__, __LINE__); \ - exit(1); \ - } \ - } while(0) - -namespace marian { - -class BackendGPU : public Backend { -public: - BackendGPU(DeviceId deviceId, size_t seed) : Backend(deviceId, seed) { - setDevice(); - setHandles(); - } - - void setDevice() { - cudaSetDevice(deviceId_.no); - } - - cublasHandle_t getCublasHandle() { return cublasHandle_; } - - curandGenerator_t getCurandGenerator() { return curandGenerator_; } - -private: - cublasHandle_t cublasHandle_; - curandGenerator_t curandGenerator_; - - - void setHandles() { - cublasHandle_ = create_handle(); - curandGenerator_ = createCurandGenerator(); - } - - - curandGenerator_t createCurandGenerator() { - cudaSetDevice(deviceId_.no); - curandGenerator_t generator; - CURAND_CALL(curandCreateGenerator(&generator, CURAND_RNG_PSEUDO_DEFAULT)); - CURAND_CALL(curandSetPseudoRandomGeneratorSeed(generator, seed_)); - - // cudaStream_t stream = 0; - // CURAND_CALL(curandSetStream(generator, stream)); - // CURAND_CALL(curandDestroyGenerator(generator)); - return generator; - } - - cublasHandle_t create_handle() { - cudaSetDevice(deviceId_.no); - cublasHandle_t cublasHandle; - cublasCreate(&cublasHandle); - return cublasHandle; - } -}; -} diff --git a/src/graph/expression_graph.cpp b/src/graph/expression_graph.cpp index 9014abb5..183b5787 100644 --- a/src/graph/expression_graph.cpp +++ b/src/graph/expression_graph.cpp @@ -1,8 +1,7 @@ #include -#include "graph/backend_gpu.h" #include "graph/expression_graph.h" -#include "backend/dispatch.h" +#include "tensors/dispatch.h" namespace marian { @@ -11,11 +10,9 @@ ExpressionGraph::ExpressionGraph(bool inference) void ExpressionGraph::setDevice(DeviceId deviceId) { if(!backend_) { - backend_ = New(deviceId, Config::seed); - + backend_ = BackendByDevice(deviceId, Config::seed); params_ = New(); params_->init(backend_->getDevice()); - tensors_ = New(backend_->getDevice()); } } diff --git a/src/graph/expression_graph.h b/src/graph/expression_graph.h index 7d50e9ba..defdb475 100644 --- a/src/graph/expression_graph.h +++ b/src/graph/expression_graph.h @@ -8,8 +8,8 @@ #include "common/definitions.h" #include "tensors/tensor_allocator.h" +#include "tensors/backend.h" -#include "graph/backend.h" #include "graph/parameters.h" #include "graph/chainable.h" #include "graph/node_operators.h" diff --git a/src/graph/node.cu b/src/graph/node.cu deleted file mode 100644 index a289f60c..00000000 --- a/src/graph/node.cu +++ /dev/null @@ -1,51 +0,0 @@ -#include "graph/backend_gpu.h" -#include "graph/expression_graph.h" -#include "graph/node.h" - -namespace marian { - -size_t Node::allocate() { - size_t elements = 0; - if(!val_) { - graph()->tensor(val_, shape_); - elements = val_->shape().elements(); - } - return elements; -} - -void Node::free() { - if(graph()) { - if(val_) - graph()->free(val_); - if(adj_) - graph()->free(adj_); - } -} - -void Node::init_dependent() { - if(!adj_) { - graph()->tensor(adj_, shape_); - adj_->set(1); - } -} - -void Node::set_zero_adjoint() { - if(!adj_) { - graph()->tensor(adj_, shape_); - adj_->set(0); - } -} - -float Node::scalar() { - return val_->scalar(); -} - -Ptr Node::getBackend() { - return graph()->getBackend(); -} - -void NaryNodeOp::remove_children_from_top_nodes() { - for(auto child : children_) - graph()->remove_top_node(child); -} -} diff --git a/src/graph/node.h b/src/graph/node.h index 126d7d49..aa450000 100644 --- a/src/graph/node.h +++ b/src/graph/node.h @@ -5,10 +5,11 @@ #include #include "common/keywords.h" -#include "graph/backend.h" -#include "graph/chainable.h" +#include "tensors/backend.h" #include "tensors/tensor.h" +#include "graph/chainable.h" + namespace marian { class Node : public Chainable, diff --git a/src/graph/node_operators_binary.h b/src/graph/node_operators_binary.h index 50ead8e6..7d6836a3 100644 --- a/src/graph/node_operators_binary.h +++ b/src/graph/node_operators_binary.h @@ -2,10 +2,10 @@ #include -#include "graph/backend_gpu.h" +#include "tensors/gpu/backend.h" #include "graph/node.h" -#include "kernels/tensor_operators.h" #include "functional/functional.h" +#include "kernels/tensor_operators.h" #include "kernels/cudnn_wrappers.h" namespace marian { @@ -54,7 +54,7 @@ public: NodeOps forwardOps() { // C = alpha * dot(op(A), op(B)) return {NodeOp(Prod( - std::static_pointer_cast(getBackend())->getCublasHandle(), + std::static_pointer_cast(getBackend())->getCublasHandle(), val_, child(0)->val(), child(1)->val(), @@ -72,7 +72,7 @@ public: // to sum gradients from different graph parts if(!transA_ && transB_) - return {NodeOp(Prod(std::static_pointer_cast(getBackend()) + return {NodeOp(Prod(std::static_pointer_cast(getBackend()) ->getCublasHandle(), child(0)->grad(), adj_, @@ -81,7 +81,7 @@ public: false, 1.0, scalar_)), - NodeOp(Prod(std::static_pointer_cast(getBackend()) + NodeOp(Prod(std::static_pointer_cast(getBackend()) ->getCublasHandle(), child(1)->grad(), adj_, @@ -92,7 +92,7 @@ public: scalar_))}; if(transA_ && !transB_) - return {NodeOp(Prod(std::static_pointer_cast(getBackend()) + return {NodeOp(Prod(std::static_pointer_cast(getBackend()) ->getCublasHandle(), child(0)->grad(), child(1)->val(), @@ -101,7 +101,7 @@ public: true, 1.0, scalar_)), - NodeOp(Prod(std::static_pointer_cast(getBackend()) + NodeOp(Prod(std::static_pointer_cast(getBackend()) ->getCublasHandle(), child(1)->grad(), child(0)->val(), @@ -112,7 +112,7 @@ public: scalar_))}; if(transA_ && transB_) - return {NodeOp(Prod(std::static_pointer_cast(getBackend()) + return {NodeOp(Prod(std::static_pointer_cast(getBackend()) ->getCublasHandle(), child(0)->grad(), child(1)->val(), @@ -121,7 +121,7 @@ public: true, 1.0, scalar_)), - NodeOp(Prod(std::static_pointer_cast(getBackend()) + NodeOp(Prod(std::static_pointer_cast(getBackend()) ->getCublasHandle(), child(1)->grad(), adj_, @@ -131,7 +131,7 @@ public: 1.0, scalar_))}; - return {NodeOp(Prod(std::static_pointer_cast(getBackend()) + return {NodeOp(Prod(std::static_pointer_cast(getBackend()) ->getCublasHandle(), child(0)->grad(), adj_, @@ -140,7 +140,7 @@ public: true, 1.0, scalar_)), - NodeOp(Prod(std::static_pointer_cast(getBackend()) + NodeOp(Prod(std::static_pointer_cast(getBackend()) ->getCublasHandle(), child(1)->grad(), child(0)->val(), @@ -198,7 +198,7 @@ public: using namespace functional; return { NodeOp(Prod( - std::static_pointer_cast(getBackend())->getCublasHandle(), + std::static_pointer_cast(getBackend())->getCublasHandle(), val_, child(0)->val(), child(1)->val(), @@ -219,7 +219,7 @@ public: using namespace functional; if(!transA_ && transB_) - return {NodeOp(Prod(std::static_pointer_cast(getBackend()) + return {NodeOp(Prod(std::static_pointer_cast(getBackend()) ->getCublasHandle(), child(0)->grad(), adj_, @@ -228,7 +228,7 @@ public: false, 1.0, scalar_)), - NodeOp(Prod(std::static_pointer_cast(getBackend()) + NodeOp(Prod(std::static_pointer_cast(getBackend()) ->getCublasHandle(), child(1)->grad(), adj_, @@ -240,7 +240,7 @@ public: NodeOp(Add(_1, child(2)->grad(), adj_))}; if(transA_ && !transB_) - return {NodeOp(Prod(std::static_pointer_cast(getBackend()) + return {NodeOp(Prod(std::static_pointer_cast(getBackend()) ->getCublasHandle(), child(0)->grad(), child(1)->val(), @@ -249,7 +249,7 @@ public: true, 1.0, scalar_)), - NodeOp(Prod(std::static_pointer_cast(getBackend()) + NodeOp(Prod(std::static_pointer_cast(getBackend()) ->getCublasHandle(), child(1)->grad(), child(0)->val(), @@ -261,7 +261,7 @@ public: NodeOp(Add(_1, child(2)->grad(), adj_))}; if(transA_ && transB_) - return {NodeOp(Prod(std::static_pointer_cast(getBackend()) + return {NodeOp(Prod(std::static_pointer_cast(getBackend()) ->getCublasHandle(), child(0)->grad(), child(1)->val(), @@ -270,7 +270,7 @@ public: true, 1.0, scalar_)), - NodeOp(Prod(std::static_pointer_cast(getBackend()) + NodeOp(Prod(std::static_pointer_cast(getBackend()) ->getCublasHandle(), child(1)->grad(), adj_, @@ -281,7 +281,7 @@ public: scalar_)), NodeOp(Add(_1, child(2)->grad(), adj_))}; - return {NodeOp(Prod(std::static_pointer_cast(getBackend()) + return {NodeOp(Prod(std::static_pointer_cast(getBackend()) ->getCublasHandle(), child(0)->grad(), adj_, @@ -290,7 +290,7 @@ public: true, 1.0, scalar_)), - NodeOp(Prod(std::static_pointer_cast(getBackend()) + NodeOp(Prod(std::static_pointer_cast(getBackend()) ->getCublasHandle(), child(1)->grad(), child(0)->val(), @@ -350,7 +350,7 @@ public: NodeOps forwardOps() { // C = alpha * dot(op(A), op(B)) return {NodeOp(ProdBatched( - std::static_pointer_cast(getBackend())->getCublasHandle(), + std::static_pointer_cast(getBackend())->getCublasHandle(), val_, child(0)->val(), child(1)->val(), @@ -369,7 +369,7 @@ public: if(!transA_ && transB_) return { - NodeOp(ProdBatched(std::static_pointer_cast(getBackend()) + NodeOp(ProdBatched(std::static_pointer_cast(getBackend()) ->getCublasHandle(), child(0)->grad(), adj_, @@ -378,7 +378,7 @@ public: false, 1.0, scalar_)), - NodeOp(ProdBatched(std::static_pointer_cast(getBackend()) + NodeOp(ProdBatched(std::static_pointer_cast(getBackend()) ->getCublasHandle(), child(1)->grad(), adj_, @@ -390,7 +390,7 @@ public: if(transA_ && !transB_) return { - NodeOp(ProdBatched(std::static_pointer_cast(getBackend()) + NodeOp(ProdBatched(std::static_pointer_cast(getBackend()) ->getCublasHandle(), child(0)->grad(), child(1)->val(), @@ -399,7 +399,7 @@ public: true, 1.0, scalar_)), - NodeOp(ProdBatched(std::static_pointer_cast(getBackend()) + NodeOp(ProdBatched(std::static_pointer_cast(getBackend()) ->getCublasHandle(), child(1)->grad(), child(0)->val(), @@ -411,7 +411,7 @@ public: if(transA_ && transB_) return { - NodeOp(ProdBatched(std::static_pointer_cast(getBackend()) + NodeOp(ProdBatched(std::static_pointer_cast(getBackend()) ->getCublasHandle(), child(0)->grad(), child(1)->val(), @@ -420,7 +420,7 @@ public: true, 1.0, scalar_)), - NodeOp(ProdBatched(std::static_pointer_cast(getBackend()) + NodeOp(ProdBatched(std::static_pointer_cast(getBackend()) ->getCublasHandle(), child(1)->grad(), adj_, @@ -431,7 +431,7 @@ public: scalar_))}; return { - NodeOp(ProdBatched(std::static_pointer_cast(getBackend()) + NodeOp(ProdBatched(std::static_pointer_cast(getBackend()) ->getCublasHandle(), child(0)->grad(), adj_, @@ -440,7 +440,7 @@ public: true, 1.0, scalar_)), - NodeOp(ProdBatched(std::static_pointer_cast(getBackend()) + NodeOp(ProdBatched(std::static_pointer_cast(getBackend()) ->getCublasHandle(), child(1)->grad(), child(0)->val(), diff --git a/src/graph/node_operators_unary.h b/src/graph/node_operators_unary.h index 8390a0c2..a3f27fd2 100644 --- a/src/graph/node_operators_unary.h +++ b/src/graph/node_operators_unary.h @@ -1,10 +1,11 @@ #pragma once -#include "graph/backend_gpu.h" +#include "tensors/tensor.h" +#include "tensors/gpu/backend.h" + #include "graph/node.h" #include "kernels/sparse.h" #include "kernels/tensor_operators.h" -#include "tensors/tensor.h" #include "functional/functional.h" #include "kernels/cudnn_wrappers.h" diff --git a/src/python/CMakeLists.txt b/src/python/CMakeLists.txt index d547660a..9d54c01a 100644 --- a/src/python/CMakeLists.txt +++ b/src/python/CMakeLists.txt @@ -10,8 +10,8 @@ cuda_add_library(pymarian SHARED ../tensors/tensor.cu ../tensors/device.cpp ../kernels/tensor_operators.cu - ../backend/gpu/dropout.cu - ../backend/cpu/dropout.cpp + ../tensors/gpu/dropout.cu + ../tensors/cpu/dropout.cpp ../kernels/sparse.cu #../layers/param_initializers.cu ../rnn/attention.cu diff --git a/src/tests/dropout_test.cu b/src/tests/dropout_test.cu index 37738deb..f4a42e92 100644 --- a/src/tests/dropout_test.cu +++ b/src/tests/dropout_test.cu @@ -13,7 +13,7 @@ int main(int argc, char** argv) { auto c = New(argc, argv); auto g = New(); - g->setDevice({0, DeviceType::cpu}); + g->setDevice({0, DeviceType::gpu}); g->reserveWorkspaceMB(512); for(int i = 0; i < 10; ++i) { From dd296e77f76143033fc1589c7dce6d12196bbfdd Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Thu, 15 Feb 2018 17:57:33 -0800 Subject: [PATCH 04/11] whitespace --- src/graph/node.cpp | 51 ++++++++++++++++++++++++++++ src/tensors/backend.cpp | 15 ++++++++ src/tensors/backend.h | 22 ++++++++++++ src/tensors/cpu/backend.h | 24 +++++++++++++ src/tensors/cpu/dropout.cpp | 14 ++++++++ src/tensors/device.cu | 12 +++---- src/tensors/dispatch.h | 39 +++++++++++++++++++++ src/tensors/gpu/backend.h | 68 +++++++++++++++++++++++++++++++++++++ src/tensors/gpu/dropout.cu | 51 ++++++++++++++++++++++++++++ 9 files changed, 290 insertions(+), 6 deletions(-) create mode 100644 src/graph/node.cpp create mode 100644 src/tensors/backend.cpp create mode 100644 src/tensors/backend.h create mode 100644 src/tensors/cpu/backend.h create mode 100644 src/tensors/cpu/dropout.cpp create mode 100644 src/tensors/dispatch.h create mode 100644 src/tensors/gpu/backend.h create mode 100644 src/tensors/gpu/dropout.cu diff --git a/src/graph/node.cpp b/src/graph/node.cpp new file mode 100644 index 00000000..1c93683c --- /dev/null +++ b/src/graph/node.cpp @@ -0,0 +1,51 @@ +#include "tensors/backend.h" +#include "graph/expression_graph.h" +#include "graph/node.h" + +namespace marian { + +size_t Node::allocate() { + size_t elements = 0; + if(!val_) { + graph()->tensor(val_, shape_); + elements = val_->shape().elements(); + } + return elements; +} + +void Node::free() { + if(graph()) { + if(val_) + graph()->free(val_); + if(adj_) + graph()->free(adj_); + } +} + +void Node::init_dependent() { + if(!adj_) { + graph()->tensor(adj_, shape_); + adj_->set(1); + } +} + +void Node::set_zero_adjoint() { + if(!adj_) { + graph()->tensor(adj_, shape_); + adj_->set(0); + } +} + +float Node::scalar() { + return val_->scalar(); +} + +Ptr Node::getBackend() { + return graph()->getBackend(); +} + +void NaryNodeOp::remove_children_from_top_nodes() { + for(auto child : children_) + graph()->remove_top_node(child); +} +} diff --git a/src/tensors/backend.cpp b/src/tensors/backend.cpp new file mode 100644 index 00000000..a1d66e9a --- /dev/null +++ b/src/tensors/backend.cpp @@ -0,0 +1,15 @@ +#include "tensors/backend.h" + +#include "tensors/gpu/backend.h" +#include "tensors/cpu/backend.h" + +namespace marian { + +Ptr BackendByDevice(DeviceId deviceId, size_t seed) { + if(deviceId.type == DeviceType::gpu) + return New(deviceId, seed); + else + return New(deviceId, seed); +} + +} diff --git a/src/tensors/backend.h b/src/tensors/backend.h new file mode 100644 index 00000000..11fc60d7 --- /dev/null +++ b/src/tensors/backend.h @@ -0,0 +1,22 @@ +#pragma once + +#include "common/definitions.h" + +namespace marian { + +class Backend { +protected: + DeviceId deviceId_; + size_t seed_; + +public: + Backend(DeviceId deviceId, size_t seed) + : deviceId_(deviceId), seed_(seed) {} + + virtual DeviceId getDevice() { return deviceId_; }; + virtual void setDevice() = 0; +}; + +Ptr BackendByDevice(DeviceId deviceId, size_t seed); + +} diff --git a/src/tensors/cpu/backend.h b/src/tensors/cpu/backend.h new file mode 100644 index 00000000..7bf5770e --- /dev/null +++ b/src/tensors/cpu/backend.h @@ -0,0 +1,24 @@ +#pragma once + +#include "common/config.h" +#include "tensors/backend.h" + +namespace marian { +namespace cpu { + +class Backend : public marian::Backend { +public: + Backend(DeviceId deviceId, size_t seed) : marian::Backend(deviceId, seed) { + } + + void setDevice() { + } + +private: + void setHandles() { + + } +}; + +} +} diff --git a/src/tensors/cpu/dropout.cpp b/src/tensors/cpu/dropout.cpp new file mode 100644 index 00000000..4286042b --- /dev/null +++ b/src/tensors/cpu/dropout.cpp @@ -0,0 +1,14 @@ +#include + +#include "tensors/dispatch.h" + +namespace marian { + namespace cpu { + + void Dropout(Ptr backend, Tensor tensor, float p) { + ABORT("Not implemented"); + std::fill(tensor->data(), tensor->data() + tensor->size(), p); + } + + } +} diff --git a/src/tensors/device.cu b/src/tensors/device.cu index c773d566..2264fda4 100644 --- a/src/tensors/device.cu +++ b/src/tensors/device.cu @@ -6,7 +6,7 @@ namespace marian { namespace gpu { - + Device::~Device() { cudaSetDevice(deviceId_.no); if(data_) { @@ -14,13 +14,13 @@ namespace gpu { } cudaDeviceSynchronize(); } - + void Device::reserve(size_t size) { size = align(size); cudaSetDevice(deviceId_.no); - + ABORT_IF(size < size_, "New size must be larger than old size"); - + if(data_) { // Allocate memory by going through host memory uint8_t *temp = new uint8_t[size_]; @@ -32,9 +32,9 @@ namespace gpu { } else { CUDA_CHECK(cudaMalloc(&data_, size)); } - + size_ = size; } - + } } diff --git a/src/tensors/dispatch.h b/src/tensors/dispatch.h new file mode 100644 index 00000000..14b74f5a --- /dev/null +++ b/src/tensors/dispatch.h @@ -0,0 +1,39 @@ +#pragma once + +#include "common/definitions.h" +#include "tensors/backend.h" +#include "tensors/tensor.h" + +#define DISPATCH1(Function, Arg1) \ + namespace gpu { \ + void Function(Ptr, Arg1); \ + } \ + namespace cpu { \ + void Function(Ptr, Arg1); \ + } \ + void Function(Ptr backend, Arg1 arg1) { \ + if(backend->getDevice().type == DeviceType::gpu) \ + gpu::Function(backend, arg1); \ + else \ + cpu::Function(backend, arg1); \ + } + +#define DISPATCH2(Function, Arg1, Arg2) \ + namespace gpu { \ + void Function(Ptr, Arg1, Arg2); \ + } \ + namespace cpu { \ + void Function(Ptr, Arg1, Arg2); \ + } \ + static inline void Function(Ptr backend, Arg1 arg1, Arg2 arg2) { \ + if(backend->getDevice().type == DeviceType::gpu) \ + gpu::Function(backend, arg1, arg2); \ + else \ + cpu::Function(backend, arg1, arg2); \ + } + +namespace marian { + + DISPATCH2(Dropout, Tensor, float) + +} diff --git a/src/tensors/gpu/backend.h b/src/tensors/gpu/backend.h new file mode 100644 index 00000000..cafdfb5b --- /dev/null +++ b/src/tensors/gpu/backend.h @@ -0,0 +1,68 @@ +#pragma once + +#include +#include +#include + +#include "common/config.h" +#include "tensors/backend.h" + +#define CURAND_CALL(x) \ + do { \ + if((x) != CURAND_STATUS_SUCCESS) { \ + printf("Error at %s:%d\n", __FILE__, __LINE__); \ + exit(1); \ + } \ + } while(0) + +namespace marian { +namespace gpu { + +class Backend : public marian::Backend { +public: + Backend(DeviceId deviceId, size_t seed) : marian::Backend(deviceId, seed) { + setDevice(); + setHandles(); + } + + void setDevice() { + cudaSetDevice(deviceId_.no); + } + + cublasHandle_t getCublasHandle() { return cublasHandle_; } + + curandGenerator_t getCurandGenerator() { return curandGenerator_; } + +private: + cublasHandle_t cublasHandle_; + curandGenerator_t curandGenerator_; + + + void setHandles() { + cublasHandle_ = create_handle(); + curandGenerator_ = createCurandGenerator(); + } + + + curandGenerator_t createCurandGenerator() { + cudaSetDevice(deviceId_.no); + curandGenerator_t generator; + CURAND_CALL(curandCreateGenerator(&generator, CURAND_RNG_PSEUDO_DEFAULT)); + CURAND_CALL(curandSetPseudoRandomGeneratorSeed(generator, seed_)); + + // cudaStream_t stream = 0; + // CURAND_CALL(curandSetStream(generator, stream)); + // CURAND_CALL(curandDestroyGenerator(generator)); + return generator; + } + + cublasHandle_t create_handle() { + cudaSetDevice(deviceId_.no); + cublasHandle_t cublasHandle; + cublasCreate(&cublasHandle); + return cublasHandle; + } +}; + +} +} diff --git a/src/tensors/gpu/dropout.cu b/src/tensors/gpu/dropout.cu new file mode 100644 index 00000000..6dc49c51 --- /dev/null +++ b/src/tensors/gpu/dropout.cu @@ -0,0 +1,51 @@ +#include +#include +#include +#include + +#include "tensors/dispatch.h" +#include "tensors/gpu/backend.h" + +#define CUDA_CALL(x) \ + do { \ + if((x) != cudaSuccess) { \ + printf("Error at %s:%d\n", __FILE__, __LINE__); \ + exit(1); \ + } \ + } while(0) + +#define CURAND_CALL(x) \ + do { \ + if((x) != CURAND_STATUS_SUCCESS) { \ + printf("Error at %s:%d\n", __FILE__, __LINE__); \ + exit(1); \ + } \ + } while(0) + + +namespace marian { + namespace gpu { + + __global__ void gScale(float* data, int n, float p) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + + while(index < n) { + data[index] = (data[index] < p) / p; + index += gridDim.x * blockDim.x; + } + } + + void Dropout(Ptr backend, Tensor tensor, float p) { + curandGenerator_t gen = std::static_pointer_cast(backend)->getCurandGenerator(); + int n = tensor->size(); + CURAND_CALL(curandGenerateUniform(gen, tensor->data(), n)); + + int numThreads = std::min(n, 512); + int numBlocks = n / numThreads + (n % numThreads != 0); + + gScale<<>>(tensor->data(), n, 1.f - p); + } + + + } +} From a0e74ad35ebbbecaa0d30007edd0e8d84a4681dd Mon Sep 17 00:00:00 2001 From: Alham Fikri Aji Date: Fri, 16 Feb 2018 14:15:30 +0000 Subject: [PATCH 05/11] remove debug --- src/training/graph_group_async_drop.cu | 1 - 1 file changed, 1 deletion(-) diff --git a/src/training/graph_group_async_drop.cu b/src/training/graph_group_async_drop.cu index 84985009..42dfa725 100644 --- a/src/training/graph_group_async_drop.cu +++ b/src/training/graph_group_async_drop.cu @@ -79,7 +79,6 @@ void AsyncGraphGroupDrop::pushGradients(Tensor newGrads, size_t batch_words, int device_id) { if(pushStep_[device_id]++ <= dropping_warmup) { - std::cout<<"WARMUP"< Date: Fri, 16 Feb 2018 11:59:12 -0800 Subject: [PATCH 06/11] pass through backend --- src/graph/expression_graph.cpp | 6 ++--- src/graph/node_operators_unary.h | 24 ++++--------------- src/graph/parameters.h | 6 ++--- src/kernels/sparse.cu | 6 ++--- src/kernels/sparse.h | 33 +++++++++++++------------- src/kernels/tensor_operators.cu | 6 ++--- src/optimizers/optimizers.cu | 4 ++-- src/tensors/cpu/dropout.cpp | 2 +- src/tensors/dispatch.h | 25 ++++++++++--------- src/tensors/gpu/dropout.cu | 5 ++-- src/tensors/tensor.cu | 16 ++++++------- src/tensors/tensor.h | 12 ++++++---- src/tensors/tensor_allocator.h | 10 ++++---- src/training/graph_group_async.cu | 14 +++++------ src/training/graph_group_async_drop.cu | 21 ++++++++-------- src/training/graph_group_async_drop.h | 2 +- src/training/graph_group_multinode.cu | 15 +++++------- src/training/graph_group_multinode.h | 2 +- src/training/graph_group_sync.cu | 4 ++-- src/training/sparse_tensor.cu | 24 +++++++++---------- src/training/sparse_tensor.h | 8 +++---- 21 files changed, 116 insertions(+), 129 deletions(-) diff --git a/src/graph/expression_graph.cpp b/src/graph/expression_graph.cpp index 183b5787..934e2b73 100644 --- a/src/graph/expression_graph.cpp +++ b/src/graph/expression_graph.cpp @@ -12,15 +12,15 @@ void ExpressionGraph::setDevice(DeviceId deviceId) { if(!backend_) { backend_ = BackendByDevice(deviceId, Config::seed); params_ = New(); - params_->init(backend_->getDevice()); - tensors_ = New(backend_->getDevice()); + params_->init(backend_); + tensors_ = New(backend_); } } Expr ExpressionGraph::dropout(float prob, Shape shape) { return Expression(shared_from_this(), keywords::init = [prob, this](Tensor t) { - Dropout(backend_, t, prob); + Dropout(t, prob); }, keywords::shape = shape); } diff --git a/src/graph/node_operators_unary.h b/src/graph/node_operators_unary.h index a3f27fd2..0170fc73 100644 --- a/src/graph/node_operators_unary.h +++ b/src/graph/node_operators_unary.h @@ -211,21 +211,7 @@ struct TanhNodeOp : public NaryNodeOp { const std::string type() { return "tanh"; } }; -/** - * Represents a rectified - * linear node in an expression graph. - * - * This node implements the activation function \f$ f(x) = \max(0, x) \f$ and - * its derivative: - * \f[ - * f^\prime(x) = - * \begin{cases} - * 0 & \text{if } x \leq 0 \\ - * 1 & \text{if } x > 0 - * \end{cases} - * \f] - */ + struct ReLUNodeOp : public UnaryNodeOp { template ReLUNodeOp(Args... args) : UnaryNodeOp(args...) {} @@ -877,14 +863,14 @@ public: Tensor& val() { auto childVal = reshapee_->val(); val_.reset( - new TensorBase(childVal->memory(), shape(), childVal->getDevice())); + new TensorBase(childVal->memory(), shape(), childVal->getBackend())); return val_; }; Tensor& grad() { auto childGrad = reshapee_->grad(); adj_.reset( - new TensorBase(childGrad->memory(), shape(), childGrad->getDevice())); + new TensorBase(childGrad->memory(), shape(), childGrad->getBackend())); return adj_; }; @@ -953,7 +939,7 @@ public: size_t offset = step_ * shape().elements() * sizeof(float); auto mem = New(childVal->memory()->data() + offset, childVal->memory()->size()); - val_.reset(new TensorBase(mem, shape(), childVal->getDevice())); + val_.reset(new TensorBase(mem, shape(), childVal->getBackend())); return val_; }; @@ -962,7 +948,7 @@ public: size_t offset = step_ * shape().elements() * sizeof(float); auto mem = New(childGrad->memory()->data() + offset, childGrad->memory()->size()); - adj_.reset(new TensorBase(mem, shape(), childGrad->getDevice())); + adj_.reset(new TensorBase(mem, shape(), childGrad->getBackend())); return adj_; }; diff --git a/src/graph/parameters.h b/src/graph/parameters.h index ed8b7690..3f282e4a 100644 --- a/src/graph/parameters.h +++ b/src/graph/parameters.h @@ -20,9 +20,9 @@ private: Ptr grads_; public: - void init(DeviceId deviceId) { - vals_ = New(deviceId); - grads_ = New(deviceId); + void init(Ptr backend) { + vals_ = New(backend); + grads_ = New(backend); } auto begin() -> decltype(params_.begin()) { return params_.begin(); } diff --git a/src/kernels/sparse.cu b/src/kernels/sparse.cu index 1d104474..b5080c0c 100644 --- a/src/kernels/sparse.cu +++ b/src/kernels/sparse.cu @@ -12,7 +12,7 @@ void multiply(Ptr C, const Ptr B, bool transA, bool transB) { - cudaSetDevice(C->getDevice()); + cudaSetDevice(backend_->getDevice().no); int nnzTotal; C->allocRowIndices(A->rows()); CUSPARSE_CHECK(cusparseXcsrgemmNnz( @@ -91,7 +91,7 @@ void multiply(Ptr C, //} void LfaForward(Tensor out, Tensor logits, Tensor att, Ptr sparseLf) { - cudaSetDevice(out->getDevice()); + cudaSetDevice(backend_->getDevice().no); int batch = att->shape()[0]; int srcWords = att->shape()[2]; @@ -150,7 +150,7 @@ __global__ void gCollapseAtt(float* out, } void CollapseAtt(Tensor out, Tensor in) { - cudaSetDevice(out->getDevice()); + cudaSetDevice(backend_->getDevice().no); int nonzeros = out->shape().elements(); int batch = out->shape()[0]; int srcWords = out->shape()[2]; diff --git a/src/kernels/sparse.h b/src/kernels/sparse.h index d70555f1..cffb398e 100644 --- a/src/kernels/sparse.h +++ b/src/kernels/sparse.h @@ -14,7 +14,7 @@ private: int nnz_{0}; int rows_{0}; int cols_{0}; - DeviceId deviceId_; + Ptr backend_; cusparseHandle_t handle_{0}; cusparseMatDescr_t descr_{0}; @@ -24,9 +24,9 @@ private: float* values_{0}; public: - CSR(int rows, int cols, DeviceId deviceId) - : rows_(rows), cols_(cols), deviceId_(deviceId) { - cudaSetDevice(deviceId_.no); + CSR(int rows, int cols, Ptr backend) + : rows_(rows), cols_(cols), backend_(backend) { + cudaSetDevice(backend_->getDevice().no); CUSPARSE_CHECK(cusparseCreate(&handle_)); CUSPARSE_CHECK(cusparseCreateMatDescr(&descr_)); CUSPARSE_CHECK(cusparseSetMatType(descr_, CUSPARSE_MATRIX_TYPE_GENERAL)); @@ -38,9 +38,9 @@ public: const std::vector& values, const std::vector& rowIndices, const std::vector& colIndices, - DeviceId deviceId) - : nnz_(values.size()), rows_(rows), cols_(cols), deviceId_(deviceId) { - cudaSetDevice(deviceId_.no); + Ptr backend) + : nnz_(values.size()), rows_(rows), cols_(cols), backend_(backend) { + cudaSetDevice(backend_->getDevice().no); CUSPARSE_CHECK(cusparseCreate(&handle_)); CUSPARSE_CHECK(cusparseCreateMatDescr(&descr_)); CUSPARSE_CHECK(cusparseSetMatType(descr_, CUSPARSE_MATRIX_TYPE_GENERAL)); @@ -73,8 +73,8 @@ public: CUDA_CHECK(cudaFree(cooRowIndices)); } - CSR(Tensor dense) : deviceId_(dense->getDevice()) { - cudaSetDevice(deviceId_.no); + CSR(Tensor dense) : backend_(dense->getBackend()) { + cudaSetDevice(backend_->getDevice().no); rows_ = dense->shape()[0] * dense->shape()[2] * dense->shape()[3]; cols_ = dense->shape()[1]; @@ -114,7 +114,7 @@ public: } ~CSR() { - cudaSetDevice(deviceId_.no); + cudaSetDevice(backend_->getDevice().no); if(values_) CUDA_CHECK(cudaFree(values_)); if(rowIndices_) @@ -129,7 +129,7 @@ public: } void toTensor(Tensor dense) { - cudaSetDevice(deviceId_.no); + cudaSetDevice(backend_->getDevice().no); ABORT_IF(dense->size() != rows_ * cols_, "Matrix sizes do not match"); cusparseScsc2dense(handle_, @@ -154,10 +154,10 @@ public: int* rowIndices() { return rowIndices_; } int* colIndices() { return colIndices_; } - DeviceId getDevice() { return deviceId_; } + DeviceId getDevice() { return backend_->getDevice(); } void allocValues(int nnz = 0) { - cudaSetDevice(deviceId_.no); + cudaSetDevice(backend_->getDevice().no); if(nnz > 0) nnz_ = nnz; if(values_) @@ -166,7 +166,7 @@ public: } void allocRowIndices(int rows = 0) { - cudaSetDevice(deviceId_.no); + cudaSetDevice(backend_->getDevice().no); if(rows > 0) rows_ = rows; if(rowIndices_) @@ -175,7 +175,7 @@ public: } void allocColIndices(int nnz = 0) { - cudaSetDevice(deviceId_.no); + cudaSetDevice(backend_->getDevice().no); if(nnz > 0) nnz_ = nnz; if(colIndices_) @@ -184,11 +184,12 @@ public: } std::string debug() { + cudaSetDevice(backend_->getDevice().no); uint8_t* buffer; CUDA_CHECK(cudaMalloc(&buffer, sizeof(float) * rows() * cols())); auto mem = New(buffer, sizeof(float) * rows() * cols()); - Tensor tensor(new TensorBase(mem, {rows(), cols()}, deviceId_)); + Tensor tensor(new TensorBase(mem, {rows(), cols()}, backend_)); toTensor(tensor); std::string temp = tensor->debug(); diff --git a/src/kernels/tensor_operators.cu b/src/kernels/tensor_operators.cu index 69b8afc4..87019d9e 100644 --- a/src/kernels/tensor_operators.cu +++ b/src/kernels/tensor_operators.cu @@ -893,7 +893,7 @@ void Select(Ptr allocator, auto mp_indices = allocator->alloc(indices.size()); CudaCopy(indices.data(), indices.data() + indices.size(), mp_indices->data()); - + int axisGPU = axis + gpu::Shape::size() - out->shape().size(); gSelect<<>>(out->data(), out->shape(), @@ -919,7 +919,7 @@ void Insert(Ptr allocator, auto mp_indices = allocator->alloc(indices.size()); CudaCopy(indices.data(), indices.data() + indices.size(), mp_indices->data()); - + int axisGPU = axis + gpu::Shape::size() - out->shape().size(); gInsert<<>>(out->data(), out->shape(), @@ -1295,7 +1295,7 @@ float L2Norm(Tensor in) { uint8_t* data; cudaMalloc(&data, blocks * sizeof(float)); Tensor out(new TensorBase( - New(data, blocks * sizeof(float)), {1, blocks}, in->getDevice())); + New(data, blocks * sizeof(float)), {1, blocks}, in->getBackend())); ReduceAll(_1 * _1, out, in); float dataCpu = sqrtf(out->get(0)); diff --git a/src/optimizers/optimizers.cu b/src/optimizers/optimizers.cu index 2874e1d5..e82800c9 100644 --- a/src/optimizers/optimizers.cu +++ b/src/optimizers/optimizers.cu @@ -13,7 +13,7 @@ void Sgd::updateImpl(Tensor params, Tensor grads) { void Adagrad::updateImpl(Tensor params, Tensor grads) { if(!alloc_) - alloc_ = New(params->getDevice()); + alloc_ = New(params->getBackend()); if(!gt_) { int elements = params->size(); @@ -42,7 +42,7 @@ void Adagrad::resetStats() { void Adam::updateImpl(Tensor params, Tensor grads) { if(!alloc_) - alloc_ = New(params->getDevice()); + alloc_ = New(params->getBackend()); if(!mt_) { int elements = params->size(); diff --git a/src/tensors/cpu/dropout.cpp b/src/tensors/cpu/dropout.cpp index 4286042b..cc6cea41 100644 --- a/src/tensors/cpu/dropout.cpp +++ b/src/tensors/cpu/dropout.cpp @@ -5,7 +5,7 @@ namespace marian { namespace cpu { - void Dropout(Ptr backend, Tensor tensor, float p) { + void Dropout(Tensor tensor, float p) { ABORT("Not implemented"); std::fill(tensor->data(), tensor->data() + tensor->size(), p); } diff --git a/src/tensors/dispatch.h b/src/tensors/dispatch.h index 14b74f5a..e63a6af1 100644 --- a/src/tensors/dispatch.h +++ b/src/tensors/dispatch.h @@ -1,35 +1,34 @@ #pragma once #include "common/definitions.h" -#include "tensors/backend.h" #include "tensors/tensor.h" #define DISPATCH1(Function, Arg1) \ namespace gpu { \ - void Function(Ptr, Arg1); \ + void Function(Arg1); \ } \ namespace cpu { \ - void Function(Ptr, Arg1); \ + void Function(Arg1); \ } \ - void Function(Ptr backend, Arg1 arg1) { \ - if(backend->getDevice().type == DeviceType::gpu) \ - gpu::Function(backend, arg1); \ + void Function(Arg1 arg1) { \ + if(arg1->getBackend()->getDevice().type == DeviceType::gpu) \ + gpu::Function(arg1); \ else \ - cpu::Function(backend, arg1); \ + cpu::Function(arg1); \ } #define DISPATCH2(Function, Arg1, Arg2) \ namespace gpu { \ - void Function(Ptr, Arg1, Arg2); \ + void Function(Arg1, Arg2); \ } \ namespace cpu { \ - void Function(Ptr, Arg1, Arg2); \ + void Function(Arg1, Arg2); \ } \ - static inline void Function(Ptr backend, Arg1 arg1, Arg2 arg2) { \ - if(backend->getDevice().type == DeviceType::gpu) \ - gpu::Function(backend, arg1, arg2); \ + static inline void Function(Arg1 arg1, Arg2 arg2) { \ + if(arg1->getBackend()->getDevice().type == DeviceType::gpu) \ + gpu::Function(arg1, arg2); \ else \ - cpu::Function(backend, arg1, arg2); \ + cpu::Function(arg1, arg2); \ } namespace marian { diff --git a/src/tensors/gpu/dropout.cu b/src/tensors/gpu/dropout.cu index 6dc49c51..4a4223a8 100644 --- a/src/tensors/gpu/dropout.cu +++ b/src/tensors/gpu/dropout.cu @@ -35,8 +35,9 @@ namespace marian { } } - void Dropout(Ptr backend, Tensor tensor, float p) { - curandGenerator_t gen = std::static_pointer_cast(backend)->getCurandGenerator(); + void Dropout(Tensor tensor, float p) { + auto gpuBackend = std::static_pointer_cast(tensor->getBackend()); + curandGenerator_t gen = gpuBackend->getCurandGenerator(); int n = tensor->size(); CURAND_CALL(curandGenerateUniform(gen, tensor->data(), n)); diff --git a/src/tensors/tensor.cu b/src/tensors/tensor.cu index bc26fcec..96d979bf 100644 --- a/src/tensors/tensor.cu +++ b/src/tensors/tensor.cu @@ -16,7 +16,7 @@ __global__ void gFill(float *d_in, int size, float val) { } float TensorBase::get(size_t i) { - cudaSetDevice(deviceId_.no); + CUDA_CHECK(cudaSetDevice(backend_->getDevice().no)); float temp; CUDA_CHECK( cudaMemcpy(&temp, data() + i, sizeof(float), cudaMemcpyDeviceToHost)); @@ -25,14 +25,14 @@ float TensorBase::get(size_t i) { } void TensorBase::set(size_t i, float value) { - cudaSetDevice(deviceId_.no); + CUDA_CHECK(cudaSetDevice(backend_->getDevice().no)); CUDA_CHECK( cudaMemcpy(data() + i, &value, sizeof(float), cudaMemcpyHostToDevice)); cudaStreamSynchronize(0); } void TensorBase::get(std::vector &v) { - CUDA_CHECK(cudaSetDevice(deviceId_.no)); + CUDA_CHECK(cudaSetDevice(backend_->getDevice().no)); v.resize(size()); CUDA_CHECK(cudaMemcpy( v.data(), data(), size() * sizeof(float), cudaMemcpyDeviceToHost)); @@ -40,7 +40,7 @@ void TensorBase::get(std::vector &v) { } void TensorBase::set(float value) { - cudaSetDevice(deviceId_.no); + CUDA_CHECK(cudaSetDevice(backend_->getDevice().no)); int threads = std::min(512, (int)size()); int blocks = (size() / threads) + (size() % threads != 0); gFill<<>>(data(), size(), value); @@ -48,7 +48,7 @@ void TensorBase::set(float value) { } void TensorBase::set(const std::vector &v) { - CUDA_CHECK(cudaSetDevice(deviceId_.no)); + CUDA_CHECK(cudaSetDevice(backend_->getDevice().no)); CUDA_CHECK(cudaMemcpy( data(), v.data(), v.size() * sizeof(float), cudaMemcpyHostToDevice)); cudaStreamSynchronize(0); @@ -56,13 +56,13 @@ void TensorBase::set(const std::vector &v) { void TensorBase::setSparse(const std::vector &k, const std::vector &v) { - cudaSetDevice(deviceId_.no); + CUDA_CHECK(cudaSetDevice(backend_->getDevice().no)); SetSparse(data(), k, v); cudaStreamSynchronize(0); } void TensorBase::copyFrom(Tensor in) { - cudaSetDevice(deviceId_.no); + CUDA_CHECK(cudaSetDevice(backend_->getDevice().no)); CUDA_CHECK(cudaMemcpy(data(), (float *)in->data(), in->size() * sizeof(float), @@ -74,7 +74,7 @@ std::string TensorBase::debug() { std::stringstream strm; assert(shape_.size()); strm << shape_; - strm << " device=" << deviceId_; + strm << " device=" << backend_->getDevice(); strm << " ptr=" << (size_t)memory_->data(); strm << " bytes=" << memory_->size(); strm << std::endl; diff --git a/src/tensors/tensor.h b/src/tensors/tensor.h index 15d3a427..309cacd5 100644 --- a/src/tensors/tensor.h +++ b/src/tensors/tensor.h @@ -9,6 +9,7 @@ #include "common/definitions.h" #include "common/shape.h" #include "tensors/memory_piece.h" +#include "tensors/backend.h" namespace marian { @@ -16,11 +17,11 @@ class TensorBase : public std::enable_shared_from_this { private: Ptr memory_; Shape shape_; - DeviceId deviceId_; + Ptr backend_; public: - TensorBase(Ptr memory, Shape shape, DeviceId deviceId) - : memory_(memory), shape_(shape), deviceId_(deviceId) {} + TensorBase(Ptr memory, Shape shape, Ptr backend) + : memory_(memory), shape_(shape), backend_(backend) {} ~TensorBase() {} @@ -39,12 +40,13 @@ public: return get(0); } - DeviceId getDevice() { return deviceId_; } + Ptr getBackend() { return backend_; } + DeviceId getDevice() { return backend_->getDevice(); } Tensor subtensor(int offset, int size) { auto mem = New(memory_->data() + sizeof(float) * offset, sizeof(float) * size); - return Tensor(new TensorBase(mem, {1, size}, deviceId_)); + return New(mem, Shape{1, size}, backend_); } float get(size_t i); diff --git a/src/tensors/tensor_allocator.h b/src/tensors/tensor_allocator.h index e1c54b22..18aae134 100644 --- a/src/tensors/tensor_allocator.h +++ b/src/tensors/tensor_allocator.h @@ -16,11 +16,13 @@ private: const size_t GROW = CHUNK * MBYTE; const size_t ALIGN = 256; + Ptr backend_; Ptr allocator_; public: - TensorAllocator(DeviceId deviceId) - : allocator_(New(deviceId, 0, GROW, ALIGN)) {} + TensorAllocator(Ptr backend) + : backend_(backend), + allocator_(New(backend_->getDevice(), 0, GROW, ALIGN)) {} ~TensorAllocator() { clear(); } @@ -58,7 +60,7 @@ public: if(!t || t->shape() != shape) { int size = shape.elements(); auto mem = allocator_->alloc(size); - t = Tensor(new TensorBase(mem, shape, allocator_->getDevice())); + t = Tensor(new TensorBase(mem, shape, backend_)); } } @@ -67,7 +69,7 @@ public: Tensor asTensor() { auto mem = allocator_->memory(); int size = mem->size() / sizeof(float); - return Tensor(new TensorBase(mem, {1, size}, allocator_->getDevice())); + return Tensor(new TensorBase(mem, {1, size}, backend_)); } size_t size() { return allocator_->size() / sizeof(float); } diff --git a/src/training/graph_group_async.cu b/src/training/graph_group_async.cu index 90d575bd..18f3908a 100644 --- a/src/training/graph_group_async.cu +++ b/src/training/graph_group_async.cu @@ -93,12 +93,12 @@ void AsyncGraphGroup::init(Ptr batch) { int pos = 0; // parameter sharding - for(auto device : devices_) { + for(auto graph : graphs_) { int __size__ = min(shardSize_, totalSize); totalSize -= __size__; Tensor param; - Ptr allocator = New(DeviceId{device, DeviceType::gpu}); + Ptr allocator = New(graph->getBackend()); allocator->reserveExact(__size__ * sizeof(float)); allocator->allocate(param, {1, __size__}); paramsAlloc_.push_back(allocator); @@ -112,11 +112,11 @@ void AsyncGraphGroup::init(Ptr batch) { if(grads_.size() == 0) { int totalSize = graphs_[0]->params()->vals()->size(); - for(auto device : devices_) { + for(auto graph : graphs_) { int __size__ = min(shardSize_, totalSize); totalSize -= __size__; Tensor grad_; - Ptr allocator_ = New(DeviceId{device, DeviceType::gpu}); + Ptr allocator_ = New(graph->getBackend()); allocator_->reserveExact(__size__ * sizeof(float)); allocator_->allocate(grad_, {1, __size__}); @@ -129,11 +129,11 @@ void AsyncGraphGroup::init(Ptr batch) { int totalSize = graphs_[0]->params()->vals()->size(); int i = 0; - for(auto device : devices_) { + for(auto graph : graphs_) { int __size__ = min(shardSize_, totalSize); totalSize -= __size__; Tensor paramAvg; - Ptr allocator = New(DeviceId{device, DeviceType::gpu}); + Ptr allocator = New(graph->getBackend()); allocator->reserveExact(__size__ * sizeof(float)); allocator->allocate(paramAvg, {1, __size__}); @@ -187,7 +187,7 @@ void AsyncGraphGroup::execute(Ptr batch) { Tensor gradients; if(tau_ > 1) { if(t == 0) { - accAlloc = New(graph->getDevice()); + accAlloc = New(graph->getBackend()); accAlloc->reserveExact(graph->params()->grads()->memory()->size()); accAlloc->allocate(accGradients, graph->params()->grads()->shape()); accGradients->set(0); diff --git a/src/training/graph_group_async_drop.cu b/src/training/graph_group_async_drop.cu index 84985009..33684bf9 100644 --- a/src/training/graph_group_async_drop.cu +++ b/src/training/graph_group_async_drop.cu @@ -8,9 +8,9 @@ namespace marian { -Tensor AsyncGraphGroupDrop::newTensor(int size, DeviceId deviceId) { +Tensor AsyncGraphGroupDrop::newTensor(int size, Ptr backend) { Tensor t; - Ptr allocator_ = New(deviceId); + Ptr allocator_ = New(backend); allocator_->reserveExact(size * sizeof(float)); allocator_->allocate(t, {1, size}); allocators.push_back(allocator_); @@ -86,7 +86,7 @@ void AsyncGraphGroupDrop::pushGradients(Tensor newGrads, // get the sparse gradient pushDropper_[device_id]->dropGraph( - newGrads, pushSparseGradient_[device_id], + newGrads, pushSparseGradient_[device_id], droping_rate, dropping_momentum); SparseTensor newSparseGrads = pushSparseGradient_[device_id]; @@ -146,13 +146,12 @@ void AsyncGraphGroupDrop::init(Ptr batch) { fetchStep_.push_back(0); pushStep_.push_back(0); - size_t device = devices_[i]; // temporary tensor to compute parameter delta before fetching - paramsDelta_.push_back(newTensor(shardSize, {device, DeviceType::gpu})); + paramsDelta_.push_back(newTensor(shardSize, graphs_[i]->getBackend())); // tensors to store local params history for(int h_id = 0; h_id < devices_.size(); h_id++) { - Tensor tmp = newTensor(params_[i]->size(), {device, DeviceType::gpu}); + Tensor tmp = newTensor(params_[i]->size(), graphs_[i]->getBackend()); tmp->copyFrom(params_[i]); paramsLocal_[h_id].push_back(tmp); } @@ -162,23 +161,23 @@ void AsyncGraphGroupDrop::init(Ptr batch) { // N-dropper for fetch std::vector tmpDropper; - for(int i = 0; i < devices_.size(); i++) + for(auto device : devices_) tmpDropper.push_back(GradientDrop(new GradientDropBase())); fetchDropper.push_back(tmpDropper); // sparsetensor to store sparsified gradients per-device pushSparseGradient_.push_back( - SparseTensor(new SparseTensorBase(sparseCap, {device, DeviceType::gpu}))); + SparseTensor(new SparseTensorBase(sparseCap, graphs_[i]->getBackend()))); pushShardedSparseGradient_.push_back( - SparseTensor(new SparseTensorBase(sparseCap, {device, DeviceType::gpu}))); + SparseTensor(new SparseTensorBase(sparseCap, graphs_[i]->getBackend()))); fetchSparseGradient_.push_back(SparseTensor( - new SparseTensorBase(sparseCap / devices_.size(), {device, DeviceType::gpu}))); + new SparseTensorBase(sparseCap / devices_.size(), graphs_[i]->getBackend()))); std::vector tmp; for(int i = 0; i < devices_.size(); i++) tmp.push_back(SparseTensor( - new SparseTensorBase(sparseCap / devices_.size(), {device, DeviceType::gpu}))); + new SparseTensorBase(sparseCap / devices_.size(), graphs_[i]->getBackend()))); fetchShardedSparseGradient_.push_back(tmp); } diff --git a/src/training/graph_group_async_drop.h b/src/training/graph_group_async_drop.h index 33e74a40..f32d9444 100644 --- a/src/training/graph_group_async_drop.h +++ b/src/training/graph_group_async_drop.h @@ -31,7 +31,7 @@ class AsyncGraphGroupDrop : public AsyncGraphGroup { std::vector> allocators; - Tensor newTensor(int size, DeviceId deviceId); + Tensor newTensor(int size, Ptr backend); protected: void init(Ptr batch); diff --git a/src/training/graph_group_multinode.cu b/src/training/graph_group_multinode.cu index 78cd842f..34aa2b5d 100644 --- a/src/training/graph_group_multinode.cu +++ b/src/training/graph_group_multinode.cu @@ -19,9 +19,9 @@ void MultiNodeGraphGroup::setScheduler(Ptr scheduler) { /** * Allocate new tensor on given GPU and store allocator. */ -Tensor MultiNodeGraphGroup::newTensor(int size, DeviceId deviceId) { +Tensor MultiNodeGraphGroup::newTensor(int size, Ptr backend) { Tensor t; - Ptr allocator = New(deviceId); + Ptr allocator = New(backend); allocator->reserveExact(size * sizeof(float)); allocator->allocate(t, {1, size}); allocators_.push_back(allocator); @@ -148,14 +148,12 @@ void MultiNodeGraphGroup::initClientCommOverlapVars() { void MultiNodeGraphGroup::initClientCommOverlapGpuTensors() { size_t modelSize = clientGraphs_[0]->params()->vals()->size(); for(int client = 0; client < devices_.size(); client++) { - DeviceId deviceId{devices_[client], DeviceType::gpu}; - // Communication overlap buffer (for grads + params) - Tensor commOverlapBuffer = newTensor(modelSize, deviceId); + Tensor commOverlapBuffer = newTensor(modelSize, clientGraphs_[client]->getBackend()); commOverlapBuffer->copyFrom(clientGraphs_[0]->params()->vals()); clientCommOverlapBuffersGPU_.push_back(commOverlapBuffer); // Gradients local sum buffer - Tensor sumGrads = newTensor(modelSize, deviceId); + Tensor sumGrads = newTensor(modelSize, clientGraphs_[client]->getBackend()); sumGrads->set(0); clientSummedGradsGPU.push_back(sumGrads); // Local optimizer to apply summed gradients @@ -207,12 +205,11 @@ void MultiNodeGraphGroup::calculateShardSizes() { void MultiNodeGraphGroup::initShardGpuTensors() { size_t offset = 0; for(int shard = 0; shard < devices_.size(); shard++) { - DeviceId deviceId{devices_[shard], DeviceType::gpu}; - Tensor gpuParams = newTensor(shardSizes_[shard], deviceId); + Tensor gpuParams = newTensor(shardSizes_[shard], clientGraphs_[shard]->getBackend()); gpuParams->copyFrom(clientGraphs_[0]->params()->vals()->subtensor( offset, shardSizes_[shard])); shardParams_.push_back(gpuParams); - shardGrads_.push_back(newTensor(shardSizes_[shard], deviceId)); + shardGrads_.push_back(newTensor(shardSizes_[shard], clientGraphs_[shard]->getBackend())); } } diff --git a/src/training/graph_group_multinode.h b/src/training/graph_group_multinode.h index 90243f7c..c6dc495c 100644 --- a/src/training/graph_group_multinode.h +++ b/src/training/graph_group_multinode.h @@ -217,7 +217,7 @@ protected: /** * Allocate new tensor on given GPU and store allocator. */ - Tensor newTensor(int size, DeviceId deviceId); + Tensor newTensor(int size, Ptr backend); /** * Setup training environment and launch server thread and (if enabled) client diff --git a/src/training/graph_group_sync.cu b/src/training/graph_group_sync.cu index e713b99d..171c0652 100644 --- a/src/training/graph_group_sync.cu +++ b/src/training/graph_group_sync.cu @@ -60,7 +60,7 @@ void SyncGraphGroup::execute(Ptr batch) { for(auto graph : graphs_) { int __size__ = min(shardSize_, totalSize); - auto paramsAlloc = New(graph->getDevice()); + auto paramsAlloc = New(graph->getBackend()); paramsAllocs_.push_back(paramsAlloc); paramsAlloc->reserveExact(3 * __size__ * sizeof(float)); @@ -87,7 +87,7 @@ void SyncGraphGroup::execute(Ptr batch) { int __size__ = min(shardSize_, totalSize); totalSize -= __size__; Tensor paramAvg; - auto allocator = New(graph->getDevice()); + auto allocator = New(graph->getBackend()); allocator->reserveExact(__size__ * sizeof(float)); allocator->allocate(paramAvg, {1, __size__}); diff --git a/src/training/sparse_tensor.cu b/src/training/sparse_tensor.cu index 5d32b16f..aafafa97 100644 --- a/src/training/sparse_tensor.cu +++ b/src/training/sparse_tensor.cu @@ -45,9 +45,9 @@ __global__ void gFindSubtensor(int* indices, resultEnd[0] = idx; } -SparseTensorBase::SparseTensorBase(int capacity, DeviceId deviceId) -: deviceId_(deviceId), capacity_(capacity) { - cudaSetDevice(deviceId_.no); +SparseTensorBase::SparseTensorBase(int capacity, Ptr backend) +: backend_(backend), capacity_(capacity) { + cudaSetDevice(backend_->getDevice().no); CUDA_CHECK(cudaMalloc(&data_, sizeof(float) * capacity)); CUDA_CHECK(cudaMalloc(&indices_, sizeof(int) * capacity)); @@ -58,8 +58,8 @@ SparseTensorBase::SparseTensorBase(int capacity, DeviceId deviceId) SparseTensorBase::SparseTensorBase(float* data, int* indices, int size, - DeviceId deviceId) -: deviceId_(deviceId) { + Ptr backend) +: backend_(backend) { data_ = data; indices_ = indices; size_ = size; @@ -93,7 +93,7 @@ void SparseTensorBase::copyFrom(float* data, size_ = size; if(size == 0) return; - cudaSetDevice(deviceId_.no); + cudaSetDevice(backend_->getDevice().no); cudaMemcpy(data_, data, size * sizeof(float), cudaMemcpyDefault); if(!data_only) @@ -107,8 +107,8 @@ void SparseTensorBase::copyFrom(std::shared_ptr t, copyFrom(t->data(), t->indices(), t->size(), data_only); } -DeviceId SparseTensorBase::getDevice() { - return deviceId_; +Ptr SparseTensorBase::getBackend() { + return backend_; } void SparseTensorBase::setSize(int size) { @@ -117,7 +117,7 @@ void SparseTensorBase::setSize(int size) { // return the dense representation of this tensor void SparseTensorBase::toDense(Tensor t, int offset) { - cudaSetDevice(deviceId_.no); + cudaSetDevice(backend_->getDevice().no); int threads = 512; int blocks = 1 + size_ / threads; t->set(0); @@ -127,7 +127,7 @@ void SparseTensorBase::toDense(Tensor t, int offset) { } void SparseTensorBase::scatterAdd(Tensor t, int offset) { - cudaSetDevice(deviceId_.no); + cudaSetDevice(backend_->getDevice().no); cudaStreamSynchronize(0); int threads = 512; int blocks = 1 + size_ / threads; @@ -139,7 +139,7 @@ void SparseTensorBase::scatterAdd(Tensor t, int offset) { std::shared_ptr SparseTensorBase::subtensor(int pos, int size, int idx) { - cudaSetDevice(deviceId_.no); + cudaSetDevice(backend_->getDevice().no); cudaStreamSynchronize(0); int* start = gstart_ + idx; int* end = gend_ + idx; @@ -165,6 +165,6 @@ std::shared_ptr SparseTensorBase::subtensor(int pos, int subtensorSize = std::max(0, endOffset - startOffset + 1); cudaStreamSynchronize(0); return std::shared_ptr(new SparseTensorBase( - data_ + startOffset, indices_ + startOffset, subtensorSize, deviceId_)); + data_ + startOffset, indices_ + startOffset, subtensorSize, backend_)); } } diff --git a/src/training/sparse_tensor.h b/src/training/sparse_tensor.h index 03fb53a9..9194748f 100644 --- a/src/training/sparse_tensor.h +++ b/src/training/sparse_tensor.h @@ -10,15 +10,15 @@ class SparseTensorBase : public std::enable_shared_from_this { int* indices_; int size_; int capacity_; - DeviceId deviceId_; + Ptr backend_; int* d_is_unsorted; int* gstart_; int* gend_; public: - SparseTensorBase(int capacity, DeviceId deviceId); - SparseTensorBase(float* data, int* indices, int size, DeviceId deviceId); + SparseTensorBase(int capacity, Ptr backend); + SparseTensorBase(float* data, int* indices, int size, Ptr backend); ~SparseTensorBase() {} @@ -43,7 +43,7 @@ public: void scatterAdd(Tensor t, int offset = 0); std::shared_ptr subtensor(int pos, int size, int idx); - DeviceId getDevice(); + Ptr getBackend(); void toDense(Tensor t, int offset); }; From 6931f6c3c8ad5440e151a29787e89bab69ef9745 Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Fri, 16 Feb 2018 14:26:10 -0800 Subject: [PATCH 07/11] make tensor device independent --- src/CMakeLists.txt | 3 +- src/kernels/cuda_helpers.h | 4 +- src/kernels/tensor_operators.cu | 4 +- src/tensors/gpu/algorithm.cu | 31 +++++++ src/tensors/gpu/algorithm.h | 10 +++ src/tensors/tensor.cu | 140 ------------------------------ src/tensors/tensor.h | 145 +++++++++++++++++++++++++++++--- 7 files changed, 180 insertions(+), 157 deletions(-) create mode 100644 src/tensors/gpu/algorithm.cu create mode 100644 src/tensors/gpu/algorithm.h diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 977d88f7..92c8a776 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -8,10 +8,11 @@ cuda_add_library(marian 3rd_party/cnpy/cnpy.cpp 3rd_party/exception.cpp 3rd_party/svd/svd.cpp - tensors/tensor.cu +# tensors/tensor.cu tensors/device.cu tensors/device.cpp tensors/backend.cpp + tensors/gpu/algorithm.cu tensors/gpu/dropout.cu tensors/cpu/dropout.cpp kernels/tensor_operators.cu diff --git a/src/kernels/cuda_helpers.h b/src/kernels/cuda_helpers.h index 92faaf4c..46d84afe 100644 --- a/src/kernels/cuda_helpers.h +++ b/src/kernels/cuda_helpers.h @@ -17,8 +17,8 @@ inline void gpuAssert(cudaError_t code, } } -template -void CudaCopy(const T* start, const T* end, D* dest) { +template +void CudaCopy(const T* start, const T* end, T* dest) { CUDA_CHECK(cudaMemcpy((void*)dest, (void*)start, (end - start) * sizeof(T), cudaMemcpyDefault)); } diff --git a/src/kernels/tensor_operators.cu b/src/kernels/tensor_operators.cu index 87019d9e..8dcda559 100644 --- a/src/kernels/tensor_operators.cu +++ b/src/kernels/tensor_operators.cu @@ -892,7 +892,7 @@ void Select(Ptr allocator, int blocks = std::min(MAX_BLOCKS, length / threads + (length % threads != 0)); auto mp_indices = allocator->alloc(indices.size()); - CudaCopy(indices.data(), indices.data() + indices.size(), mp_indices->data()); + CudaCopy(indices.data(), indices.data() + indices.size(), mp_indices->data()); int axisGPU = axis + gpu::Shape::size() - out->shape().size(); gSelect<<>>(out->data(), @@ -918,7 +918,7 @@ void Insert(Ptr allocator, int blocks = std::min(MAX_BLOCKS, length / threads + (length % threads != 0)); auto mp_indices = allocator->alloc(indices.size()); - CudaCopy(indices.data(), indices.data() + indices.size(), mp_indices->data()); + CudaCopy(indices.data(), indices.data() + indices.size(), mp_indices->data()); int axisGPU = axis + gpu::Shape::size() - out->shape().size(); gInsert<<>>(out->data(), diff --git a/src/tensors/gpu/algorithm.cu b/src/tensors/gpu/algorithm.cu new file mode 100644 index 00000000..b98a8b1c --- /dev/null +++ b/src/tensors/gpu/algorithm.cu @@ -0,0 +1,31 @@ +#include "tensors/gpu/algorithm.h" + +#include "kernels/cuda_helpers.h" + +namespace marian { + namespace gpu { + void copy(Ptr backend, const float* begin, const float* end, float* dest) { + CUDA_CHECK(cudaSetDevice(backend->getDevice().no)); + CudaCopy(begin, end, dest); + CUDA_CHECK(cudaStreamSynchronize(0)); + } + + __global__ void gFill(float *d_in, int size, float val) { + for(int bid = 0; bid < size; bid += blockDim.x * gridDim.x) { + int index = bid + threadIdx.x + blockDim.x * blockIdx.x; + if(index < size) { + d_in[index] = val; + } + } + } + + void fill(Ptr backend, float* begin, float* end, float value) { + CUDA_CHECK(cudaSetDevice(backend->getDevice().no)); + int size = end - begin; + int threads = std::min(512, size); + int blocks = (size / threads) + (size % threads != 0); + gFill<<>>(begin, size, value); + CUDA_CHECK(cudaStreamSynchronize(0)); + } + } +} diff --git a/src/tensors/gpu/algorithm.h b/src/tensors/gpu/algorithm.h new file mode 100644 index 00000000..7fd9a93c --- /dev/null +++ b/src/tensors/gpu/algorithm.h @@ -0,0 +1,10 @@ +#pragma once + +#include "tensors/backend.h" + +namespace marian { + namespace gpu { + void copy(Ptr backend, const float* begin, const float* end, float* dest); + void fill(Ptr backend, float* begin, float* end, float value); + } +} diff --git a/src/tensors/tensor.cu b/src/tensors/tensor.cu index 96d979bf..be097178 100644 --- a/src/tensors/tensor.cu +++ b/src/tensors/tensor.cu @@ -6,54 +6,6 @@ namespace marian { -__global__ void gFill(float *d_in, int size, float val) { - for(int bid = 0; bid < size; bid += blockDim.x * gridDim.x) { - int index = bid + threadIdx.x + blockDim.x * blockIdx.x; - if(index < size) { - d_in[index] = val; - } - } -} - -float TensorBase::get(size_t i) { - CUDA_CHECK(cudaSetDevice(backend_->getDevice().no)); - float temp; - CUDA_CHECK( - cudaMemcpy(&temp, data() + i, sizeof(float), cudaMemcpyDeviceToHost)); - cudaStreamSynchronize(0); - return temp; -} - -void TensorBase::set(size_t i, float value) { - CUDA_CHECK(cudaSetDevice(backend_->getDevice().no)); - CUDA_CHECK( - cudaMemcpy(data() + i, &value, sizeof(float), cudaMemcpyHostToDevice)); - cudaStreamSynchronize(0); -} - -void TensorBase::get(std::vector &v) { - CUDA_CHECK(cudaSetDevice(backend_->getDevice().no)); - v.resize(size()); - CUDA_CHECK(cudaMemcpy( - v.data(), data(), size() * sizeof(float), cudaMemcpyDeviceToHost)); - cudaStreamSynchronize(0); -} - -void TensorBase::set(float value) { - CUDA_CHECK(cudaSetDevice(backend_->getDevice().no)); - int threads = std::min(512, (int)size()); - int blocks = (size() / threads) + (size() % threads != 0); - gFill<<>>(data(), size(), value); - cudaStreamSynchronize(0); -} - -void TensorBase::set(const std::vector &v) { - CUDA_CHECK(cudaSetDevice(backend_->getDevice().no)); - CUDA_CHECK(cudaMemcpy( - data(), v.data(), v.size() * sizeof(float), cudaMemcpyHostToDevice)); - cudaStreamSynchronize(0); -} - void TensorBase::setSparse(const std::vector &k, const std::vector &v) { CUDA_CHECK(cudaSetDevice(backend_->getDevice().no)); @@ -61,96 +13,4 @@ void TensorBase::setSparse(const std::vector &k, cudaStreamSynchronize(0); } -void TensorBase::copyFrom(Tensor in) { - CUDA_CHECK(cudaSetDevice(backend_->getDevice().no)); - CUDA_CHECK(cudaMemcpy(data(), - (float *)in->data(), - in->size() * sizeof(float), - cudaMemcpyDefault)); - cudaStreamSynchronize(0); -} - -std::string TensorBase::debug() { - std::stringstream strm; - assert(shape_.size()); - strm << shape_; - strm << " device=" << backend_->getDevice(); - strm << " ptr=" << (size_t)memory_->data(); - strm << " bytes=" << memory_->size(); - strm << std::endl; - - // values - size_t totSize = shape_.elements(); - std::vector values(totSize); - get(values); - - size_t dispCols = 5; - strm << std::fixed << std::setprecision(8) << std::setfill(' '); - - for(int i = 0; i < values.size(); ++i) { - std::vector dims; - shape().dims(i, dims); - - bool disp = true; - for(int j = 0; j < dims.size(); ++j) - disp = disp && (dims[j] < dispCols || dims[j] >= shape()[j] - dispCols); - - if(disp) { - - if(dims.back() == 0) { - bool par = true; - std::vector p; - for(int j = dims.size() - 1; j >= 0; --j) { - if(dims[j] != 0) - par = false; - - p.push_back(par ? "[" : " "); - } - for(auto it = p.rbegin(); it != p.rend(); ++it) - strm << *it; - strm << " "; - } - - strm << std::setw(12) - << values[i] - << " "; - - if(dims.back() + 1 == shape().back()) { - for(int j = dims.size() - 1; j >= 0; --j) { - if(dims[j] + 1 != shape()[j]) - break; - strm << "]"; - } - strm << std::endl; - } - - bool prev = true; - for(int j = dims.size() - 1; j >= 0; --j) { - if(j < dims.size() - 1) - prev = prev && dims[j + 1] + 1 == shape()[j + 1]; - if(prev && dims[j] + 1 == dispCols && shape()[j] > 2 * dispCols) { - if(j < dims.size() - 1) - for(int k = 0; k <= j; ++k) - strm << " "; - strm << "... "; - if(j < dims.size() - 1) - strm << std::endl; - break; - } - } - } - } - strm << std::endl; - return strm.str(); -} - -Tensor operator<<(Tensor t, const std::vector &v) { - t->set(v); - return t; -} - -Tensor operator>>(Tensor t, std::vector &v) { - t->get(v); - return t; -} } diff --git a/src/tensors/tensor.h b/src/tensors/tensor.h index 309cacd5..e6dbc9fd 100644 --- a/src/tensors/tensor.h +++ b/src/tensors/tensor.h @@ -5,12 +5,14 @@ #include #include -#include "3rd_party/exception.h" #include "common/definitions.h" #include "common/shape.h" #include "tensors/memory_piece.h" #include "tensors/backend.h" +#include +#include "tensors/gpu/algorithm.h" + namespace marian { class TensorBase : public std::enable_shared_from_this { @@ -49,26 +51,145 @@ public: return New(mem, Shape{1, size}, backend_); } - float get(size_t i); + float get(size_t i) { + float temp; + if(backend_->getDevice().type == DeviceType::gpu) + gpu::copy(backend_, data() + i, data() + i + 1, &temp); + else + std::copy(data() + i, data() + i + 1, &temp); + return temp; + } - void set(size_t i, float value); + void set(size_t i, float value) { + if(backend_->getDevice().type == DeviceType::gpu) + gpu::copy(backend_, &value, &value + 1, data() + i); + else + std::copy(&value, &value + 1, data() + i); + } - void get(std::vector& v); + void get(std::vector &v) { + v.resize(size()); + if(backend_->getDevice().type == DeviceType::gpu) + gpu::copy(backend_, data(), data() + size(), v.data()); + else + std::copy(data(), data() + size(), v.data()); + } - void set(float value); + void set(const std::vector &v) { + if(backend_->getDevice().type == DeviceType::gpu) + gpu::copy(backend_, v.data(), v.data() + v.size(), data()); + else + std::copy(v.data(), v.data() + v.size(), data()); + } - void set(const std::vector& v); + void set(float value) { + if(backend_->getDevice().type == DeviceType::gpu) + gpu::fill(backend_, data(), data() + size(), value); + else + std::fill(data(), data() + size(), value); + } - void setSparse(const std::vector& k, const std::vector& v); + void setSparse(const std::vector &k, + const std::vector &v) { + //CUDA_CHECK(cudaSetDevice(backend_->getDevice().no)); + //SetSparse(data(), k, v); + //cudaStreamSynchronize(0); + } - void copyFrom(Tensor); + void copyFrom(Tensor in) { + if(in->getBackend()->getDevice().type == DeviceType::gpu || + backend_->getDevice().type == DeviceType::gpu) + gpu::copy(backend_, in->data(), in->data() + in->size(), data()); + else + std::copy(in->data(), in->data() + in->size(), data()); + } + + std::string debug() { + std::stringstream strm; + assert(shape_.size()); + strm << shape_; + strm << " device=" << backend_->getDevice(); + strm << " ptr=" << (size_t)memory_->data(); + strm << " bytes=" << memory_->size(); + strm << std::endl; + + // values + size_t totSize = shape_.elements(); + std::vector values(totSize); + get(values); + + size_t dispCols = 5; + strm << std::fixed << std::setprecision(8) << std::setfill(' '); + + for(int i = 0; i < values.size(); ++i) { + std::vector dims; + shape().dims(i, dims); + + bool disp = true; + for(int j = 0; j < dims.size(); ++j) + disp = disp && (dims[j] < dispCols || dims[j] >= shape()[j] - dispCols); + + if(disp) { + + if(dims.back() == 0) { + bool par = true; + std::vector p; + for(int j = dims.size() - 1; j >= 0; --j) { + if(dims[j] != 0) + par = false; + + p.push_back(par ? "[" : " "); + } + for(auto it = p.rbegin(); it != p.rend(); ++it) + strm << *it; + strm << " "; + } + + strm << std::setw(12) + << values[i] + << " "; + + if(dims.back() + 1 == shape().back()) { + for(int j = dims.size() - 1; j >= 0; --j) { + if(dims[j] + 1 != shape()[j]) + break; + strm << "]"; + } + strm << std::endl; + } + + bool prev = true; + for(int j = dims.size() - 1; j >= 0; --j) { + if(j < dims.size() - 1) + prev = prev && dims[j + 1] + 1 == shape()[j + 1]; + if(prev && dims[j] + 1 == dispCols && shape()[j] > 2 * dispCols) { + if(j < dims.size() - 1) + for(int k = 0; k <= j; ++k) + strm << " "; + strm << "... "; + if(j < dims.size() - 1) + strm << std::endl; + break; + } + } + } + } + strm << std::endl; + return strm.str(); + } - std::string debug(); }; typedef std::shared_ptr Tensor; -Tensor operator<<(Tensor t, const std::vector& v); - -Tensor operator>>(Tensor t, std::vector& v); +static Tensor operator<<(Tensor t, const std::vector &v) { + t->set(v); + return t; +} + +static Tensor operator>>(Tensor t, std::vector &v) { + t->get(v); + return t; +} + } From 0c959c1a7f0c0a95f2fb43b3ddc577fb43f22ebf Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Fri, 16 Feb 2018 14:38:54 -0800 Subject: [PATCH 08/11] clean up tensors --- src/tensors/gpu/algorithm.cu | 11 +++++++++++ src/tensors/gpu/algorithm.h | 2 ++ src/tensors/tensor.cu | 16 ---------------- src/tensors/tensor.h | 11 +++++++---- 4 files changed, 20 insertions(+), 20 deletions(-) delete mode 100644 src/tensors/tensor.cu diff --git a/src/tensors/gpu/algorithm.cu b/src/tensors/gpu/algorithm.cu index b98a8b1c..b26e00e9 100644 --- a/src/tensors/gpu/algorithm.cu +++ b/src/tensors/gpu/algorithm.cu @@ -1,6 +1,7 @@ #include "tensors/gpu/algorithm.h" #include "kernels/cuda_helpers.h" +#include "kernels/tensor_operators.h" namespace marian { namespace gpu { @@ -27,5 +28,15 @@ namespace marian { gFill<<>>(begin, size, value); CUDA_CHECK(cudaStreamSynchronize(0)); } + + void setSparse(Ptr backend, + const std::vector& keys, + const std::vector& values, + float* data) { + CUDA_CHECK(cudaSetDevice(backend->getDevice().no)); + SetSparse(data, keys, values); + CUDA_CHECK(cudaStreamSynchronize(0)); + } + } } diff --git a/src/tensors/gpu/algorithm.h b/src/tensors/gpu/algorithm.h index 7fd9a93c..1bfd9acb 100644 --- a/src/tensors/gpu/algorithm.h +++ b/src/tensors/gpu/algorithm.h @@ -6,5 +6,7 @@ namespace marian { namespace gpu { void copy(Ptr backend, const float* begin, const float* end, float* dest); void fill(Ptr backend, float* begin, float* end, float value); + + void setSparse(Ptr backend, const std::vector&, const std::vector&, float*); } } diff --git a/src/tensors/tensor.cu b/src/tensors/tensor.cu deleted file mode 100644 index be097178..00000000 --- a/src/tensors/tensor.cu +++ /dev/null @@ -1,16 +0,0 @@ - -#include "tensors/tensor.h" - -#include "kernels/cuda_helpers.h" -#include "kernels/tensor_operators.h" - -namespace marian { - -void TensorBase::setSparse(const std::vector &k, - const std::vector &v) { - CUDA_CHECK(cudaSetDevice(backend_->getDevice().no)); - SetSparse(data(), k, v); - cudaStreamSynchronize(0); -} - -} diff --git a/src/tensors/tensor.h b/src/tensors/tensor.h index e6dbc9fd..a693f1f5 100644 --- a/src/tensors/tensor.h +++ b/src/tensors/tensor.h @@ -90,10 +90,13 @@ public: } void setSparse(const std::vector &k, - const std::vector &v) { - //CUDA_CHECK(cudaSetDevice(backend_->getDevice().no)); - //SetSparse(data(), k, v); - //cudaStreamSynchronize(0); + const std::vector &v) { + if(backend_->getDevice().type == DeviceType::gpu) { + gpu::setSparse(backend_, k, v, data()); + } else { + for(int i = 0; i < k.size(); ++i) + data()[k[i]] = v[i]; + } } void copyFrom(Tensor in) { From 5e1b95e0bb0a6b86b75515c1069bf55596b6fe6c Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Fri, 16 Feb 2018 16:39:11 -0800 Subject: [PATCH 09/11] finshed backend separation --- src/graph/parameters.h | 4 +-- src/tensors/allocator.h | 3 ++- src/tensors/cpu/backend.h | 25 +++++++++++-------- src/tensors/cpu/dropout.cpp | 8 ++++-- src/tensors/device.cpp | 11 ++++---- src/tensors/device.cu | 2 +- src/tensors/device.h | 16 ++++++------ src/tensors/gpu/backend.h | 8 +++--- src/tensors/tensor_allocator.h | 16 ++++++++---- src/tests/CMakeLists.txt | 2 +- .../{dropout_test.cu => dropout_test.cpp} | 5 +++- 11 files changed, 60 insertions(+), 40 deletions(-) rename src/tests/{dropout_test.cu => dropout_test.cpp} (79%) diff --git a/src/graph/parameters.h b/src/graph/parameters.h index 3f282e4a..99e3b2af 100644 --- a/src/graph/parameters.h +++ b/src/graph/parameters.h @@ -57,7 +57,7 @@ public: } void allocateForward() { - if(vals_->size() == 0) { + if(!params_.empty() && vals_->size() == 0) { vals_->reserveExact(totalCapacity(vals_)); for(auto p : params_) if(!p->val()) @@ -66,7 +66,7 @@ public: } void allocateBackward() { - if(grads_->size() == 0) { + if(!params_.empty() && grads_->size() == 0) { grads_->reserveExact(totalCapacity(grads_)); for(auto p : params_) if(!p->grad()) diff --git a/src/tensors/allocator.h b/src/tensors/allocator.h index 6c9b51f6..6781afa7 100644 --- a/src/tensors/allocator.h +++ b/src/tensors/allocator.h @@ -153,7 +153,8 @@ public: void reserve(size_t bytes) { bytes = align(bytes); - device_->reserve(bytes); + if(bytes > 0) + device_->reserve(bytes); clear(); } diff --git a/src/tensors/cpu/backend.h b/src/tensors/cpu/backend.h index 7bf5770e..83d0ae12 100644 --- a/src/tensors/cpu/backend.h +++ b/src/tensors/cpu/backend.h @@ -1,23 +1,28 @@ #pragma once +#include +#include + #include "common/config.h" #include "tensors/backend.h" namespace marian { namespace cpu { - -class Backend : public marian::Backend { -public: - Backend(DeviceId deviceId, size_t seed) : marian::Backend(deviceId, seed) { - } - - void setDevice() { - } +class Backend : public marian::Backend { private: - void setHandles() { + std::default_random_engine gen_; - } +public: + Backend(DeviceId deviceId, size_t seed) + : marian::Backend(deviceId, seed), + gen_(seed_) {} + + void setDevice() { } + + std::default_random_engine& getRandomGenerator() { + return gen_; + } }; } diff --git a/src/tensors/cpu/dropout.cpp b/src/tensors/cpu/dropout.cpp index cc6cea41..a6a0c263 100644 --- a/src/tensors/cpu/dropout.cpp +++ b/src/tensors/cpu/dropout.cpp @@ -1,13 +1,17 @@ #include +#include #include "tensors/dispatch.h" +#include "tensors/cpu/backend.h" namespace marian { namespace cpu { void Dropout(Tensor tensor, float p) { - ABORT("Not implemented"); - std::fill(tensor->data(), tensor->data() + tensor->size(), p); + auto cpuBackend = std::static_pointer_cast(tensor->getBackend()); + auto &gen = cpuBackend->getRandomGenerator(); + std::bernoulli_distribution dist(1.f - p); + std::generate(tensor->data(), tensor->data() + tensor->size(), [&]() { return dist(gen) / (1.f - p); }); } } diff --git a/src/tensors/device.cpp b/src/tensors/device.cpp index 5e2a3568..e4bf9d17 100644 --- a/src/tensors/device.cpp +++ b/src/tensors/device.cpp @@ -3,14 +3,16 @@ namespace marian { namespace cpu { - + Device::~Device() { delete[] data_; + data_ = nullptr; + size_ = 0; } - + void Device::reserve(size_t size) { size = align(size); - ABORT_IF(size < size_, "New size must be larger than old size"); + ABORT_IF(size < size_ || size == 0, "New size must be larger than old size and larger than 0"); if(data_) { uint8_t *temp = new uint8_t[size_]; @@ -20,9 +22,8 @@ namespace cpu { } else { data_ = new uint8_t[size]; } - size_ = size; } - + } } diff --git a/src/tensors/device.cu b/src/tensors/device.cu index 2264fda4..bef5491b 100644 --- a/src/tensors/device.cu +++ b/src/tensors/device.cu @@ -19,7 +19,7 @@ namespace gpu { size = align(size); cudaSetDevice(deviceId_.no); - ABORT_IF(size < size_, "New size must be larger than old size"); + ABORT_IF(size < size_ || size == 0, "New size must be larger than old size and larger than 0"); if(data_) { // Allocate memory by going through host memory diff --git a/src/tensors/device.h b/src/tensors/device.h index 33a9c104..22e41bd5 100644 --- a/src/tensors/device.h +++ b/src/tensors/device.h @@ -10,9 +10,9 @@ namespace marian { class Device { protected: DeviceId deviceId_; - - uint8_t* data_; - size_t size_; + + uint8_t* data_{0}; + size_t size_{0}; size_t alignment_; size_t align(size_t size) { @@ -39,9 +39,9 @@ namespace gpu { public: Device(DeviceId deviceId, size_t alignment = 256) : marian::Device(deviceId, alignment) {} - + ~Device(); - + void reserve(size_t size); }; } @@ -51,9 +51,9 @@ namespace cpu { public: Device(DeviceId deviceId, size_t alignment = 256) : marian::Device(deviceId, alignment) {} - + ~Device(); - + void reserve(size_t size); }; } @@ -65,4 +65,4 @@ static inline Ptr DispatchDevice(DeviceId deviceId, size_t alignment = 2 return New(deviceId, alignment); } -} \ No newline at end of file +} diff --git a/src/tensors/gpu/backend.h b/src/tensors/gpu/backend.h index cafdfb5b..357206fc 100644 --- a/src/tensors/gpu/backend.h +++ b/src/tensors/gpu/backend.h @@ -17,8 +17,8 @@ namespace marian { namespace gpu { - -class Backend : public marian::Backend { + +class Backend : public marian::Backend { public: Backend(DeviceId deviceId, size_t seed) : marian::Backend(deviceId, seed) { setDevice(); @@ -37,13 +37,13 @@ private: cublasHandle_t cublasHandle_; curandGenerator_t curandGenerator_; - + void setHandles() { cublasHandle_ = create_handle(); curandGenerator_ = createCurandGenerator(); } - + curandGenerator_t createCurandGenerator() { cudaSetDevice(deviceId_.no); curandGenerator_t generator; diff --git a/src/tensors/tensor_allocator.h b/src/tensors/tensor_allocator.h index 18aae134..e36100a2 100644 --- a/src/tensors/tensor_allocator.h +++ b/src/tensors/tensor_allocator.h @@ -42,11 +42,17 @@ public: void reserveExact(size_t bytes = 0) { size_t mbytes = bytes / MBYTE; - LOG(info, - "[memory] Reserving {} MB, device {}", - mbytes, - allocator_->getDevice()); - + if(mbytes == 0) { + LOG(info, + "[memory] Reserving {} B, device {}", + bytes, + allocator_->getDevice()); + } else { + LOG(info, + "[memory] Reserving {} MB, device {}", + mbytes, + allocator_->getDevice()); + } allocator_->reserve(bytes); } diff --git a/src/tests/CMakeLists.txt b/src/tests/CMakeLists.txt index aebaaaaa..16882b52 100644 --- a/src/tests/CMakeLists.txt +++ b/src/tests/CMakeLists.txt @@ -17,9 +17,9 @@ endforeach(test) # Testing apps add_executable(logger_test logger_test.cpp) +add_executable(dropout_test dropout_test.cpp) #cuda_add_executable(bn_test bn_test.cu) cuda_add_executable(pooling_test pooling_test.cu) -cuda_add_executable(dropout_test dropout_test.cu) #cuda_add_executable(marian_test marian_test.cu) cuda_add_executable(tensor_test tensor_test.cu) diff --git a/src/tests/dropout_test.cu b/src/tests/dropout_test.cpp similarity index 79% rename from src/tests/dropout_test.cu rename to src/tests/dropout_test.cpp index f4a42e92..5023606b 100644 --- a/src/tests/dropout_test.cu +++ b/src/tests/dropout_test.cpp @@ -12,8 +12,11 @@ using namespace keywords; int main(int argc, char** argv) { auto c = New(argc, argv); + auto type = c->get("cpu") ? DeviceType::cpu : DeviceType::gpu; + DeviceId deviceId{0, type}; + auto g = New(); - g->setDevice({0, DeviceType::gpu}); + g->setDevice(deviceId); g->reserveWorkspaceMB(512); for(int i = 0; i < 10; ++i) { From 79c385b894ed41c6ef699fb310ce1f59250bfc91 Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Fri, 16 Feb 2018 16:43:43 -0800 Subject: [PATCH 10/11] reformatting --- src/tensors/cpu/dropout.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/tensors/cpu/dropout.cpp b/src/tensors/cpu/dropout.cpp index a6a0c263..bb6ee799 100644 --- a/src/tensors/cpu/dropout.cpp +++ b/src/tensors/cpu/dropout.cpp @@ -11,7 +11,8 @@ namespace marian { auto cpuBackend = std::static_pointer_cast(tensor->getBackend()); auto &gen = cpuBackend->getRandomGenerator(); std::bernoulli_distribution dist(1.f - p); - std::generate(tensor->data(), tensor->data() + tensor->size(), [&]() { return dist(gen) / (1.f - p); }); + std::generate(tensor->data(), tensor->data() + tensor->size(), + [&]() { return dist(gen) / (1.f - p); }); } } From 6e421f7a741dca8d7181f87acd398da60bb77f7d Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Fri, 16 Feb 2018 21:53:09 -0800 Subject: [PATCH 11/11] add cpu tests for graph --- src/tests/graph_tests.cpp | 50 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/src/tests/graph_tests.cpp b/src/tests/graph_tests.cpp index 297fd73f..dbb4dd0c 100644 --- a/src/tests/graph_tests.cpp +++ b/src/tests/graph_tests.cpp @@ -53,3 +53,53 @@ TEST_CASE("Expression graph can be initialized with constant values", REQUIRE(values == v); } } + +TEST_CASE("Graph device is set (cpu)", "[graph]") { + auto graph = New(); + + graph->setDevice({0, DeviceType::cpu}); + + DeviceId testId{0, DeviceType::cpu}; + REQUIRE(graph->getDevice() == testId); +} + +TEST_CASE("Expression graph can be initialized with constant values (cpu)", + "[graph]") { + auto graph = New(); + graph->setDevice({0, DeviceType::cpu}); + graph->reserveWorkspaceMB(4); + + std::vector values; + + SECTION("initializing with zero (cpu)") { + graph->clear(); + values.clear(); + auto zeros = graph->param("0s", {2, 5}, keywords::init = inits::zeros); + graph->forward(); + + zeros->val()->get(values); + REQUIRE(values == std::vector(10, 0.0f)); + } + + SECTION("initializing with ones (cpu)") { + graph->clear(); + values.clear(); + auto ones = graph->param("1s", {2, 5}, keywords::init = inits::ones); + graph->forward(); + + ones->val()->get(values); + REQUIRE(values == std::vector(10, 1.0f)); + } + + SECTION("initializing from vector (cpu)") { + graph->clear(); + values.clear(); + std::vector v({1, 2, 3, 4, 5, 6}); + auto vals = graph->param("vs", {2, 3}, keywords::init = inits::from_vector(v)); + graph->forward(); + + REQUIRE(values.empty()); + vals->val()->get(values); + REQUIRE(values == v); + } +}