From 164d26cc36204316324a2fc76f7538a044dd192a Mon Sep 17 00:00:00 2001
From: Martin Junczys-Dowmunt
Date: Mon, 6 Jan 2020 19:14:00 +0000
Subject: [PATCH] Merged PR 10999: Splitting up add_all.h into *.h, *.cu and *.inc

Splitting up the header file into a header and a *.cu file comes with the
price of having to include specializations for combinations of types, as for
element.inc and add.inc. No code changes otherwise.

Add CMake options to disable specific compute capabilities. When run with
`make -j16`, this compiles in about 6 minutes instead of 7 minutes. Selecting
only SM70 during compilation (e.g. configuring with -DCOMPILE_CUDA_SM35=off
-DCOMPILE_CUDA_SM50=off -DCOMPILE_CUDA_SM60=off) brings the time down to
3 minutes.
---
 CHANGELOG.md                 |   1 +
 CMakeLists.txt               |  23 +++++--
 VERSION                      |   2 +-
 src/3rd_party/CMakeLists.txt |  19 +++++-
 src/CMakeLists.txt           |   3 +-
 src/tensors/gpu/add.inc      |  24 ++++----
 src/tensors/gpu/add_all.cu   | 116 +++++++++++++++++++++++++++++++++++
 src/tensors/gpu/add_all.h    |  89 ++++++++-------------------
 src/tensors/gpu/add_all.inc  |  71 +++++++++++++++++++++
 9 files changed, 261 insertions(+), 87 deletions(-)
 create mode 100644 src/tensors/gpu/add_all.cu
 create mode 100644 src/tensors/gpu/add_all.inc

diff --git a/CHANGELOG.md b/CHANGELOG.md
index a3f9dcbb..fea7c040 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 ## [Unreleased]
 
 ### Added
+- Add CMake options to disable compilation for specific GPU SM types
 - An option to print word-level translation scores
 - An option to turn off automatic detokenization from SentencePiece
 - Separate quantization types for 8-bit FBGEMM for AVX2 and AVX512

diff --git a/CMakeLists.txt b/CMakeLists.txt
index ab5460e6..c442931f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -13,6 +13,10 @@ set(BUILD_ARCH native CACHE STRING "Compile for this CPU architecture.")
 # Custom CMake options
 option(COMPILE_CPU "Compile CPU version" ON)
 option(COMPILE_CUDA "Compile GPU version" ON)
+option(COMPILE_CUDA_SM35 "Compile GPU version with SM35 support" ON)
+option(COMPILE_CUDA_SM50 "Compile GPU version with SM50 support" ON)
+option(COMPILE_CUDA_SM60 "Compile GPU version with SM60 support" ON)
+option(COMPILE_CUDA_SM70 "Compile GPU version with SM70 support" ON)
 option(COMPILE_EXAMPLES "Compile examples" OFF)
 option(COMPILE_SERVER "Compile marian-server" OFF)
 option(COMPILE_TESTS "Compile tests" OFF)
@@ -181,8 +185,6 @@ set(EXT_LIBS ${EXT_LIBS} ${CMAKE_DL_LIBS})
 
 if(COMPILE_CUDA)
 
-LIST(APPEND COMPUTE -arch=sm_35; -gencode=arch=compute_35,code=sm_35; -gencode=arch=compute_50,code=sm_50; -gencode=arch=compute_52,code=sm_52; -gencode=arch=compute_60,code=sm_60; -gencode=arch=compute_61,code=sm_61;)
-
 if(USE_STATIC_LIBS)
   # link statically to stdlib libraries
   set(CMAKE_EXE_LINKER_FLAGS "-static-libgcc -static-libstdc++")
@@ -202,10 +204,19 @@ if(CUDA_FOUND)
   if((CUDA_VERSION VERSION_EQUAL "10.0" OR CUDA_VERSION VERSION_GREATER "10.0") AND (CMAKE_VERSION VERSION_LESS "3.12.2"))
     message(WARNING "On some Unix systems CUDA 10.0+ requires CMake 3.12.2+; you use CMake ${CMAKE_VERSION}")
   endif()
-
-  if(CUDA_VERSION VERSION_GREATER "8.0")
-    LIST(APPEND COMPUTE -gencode=arch=compute_70,code=sm_70; -gencode=arch=compute_70,code=compute_70)
-  endif()
+
+  if(COMPILE_CUDA_SM35)
+    LIST(APPEND COMPUTE -arch=sm_35; -gencode=arch=compute_35,code=sm_35;) # Tesla K40 and above
+  endif(COMPILE_CUDA_SM35)
+  if(COMPILE_CUDA_SM50)
+    LIST(APPEND COMPUTE -gencode=arch=compute_50,code=sm_50; -gencode=arch=compute_52,code=sm_52;) # Maxwell GPUs
+  endif(COMPILE_CUDA_SM50)
+  if(COMPILE_CUDA_SM60)
+ 
LIST(APPEND COMPUTE -gencode=arch=compute_60,code=sm_60; -gencode=arch=compute_61,code=sm_61;) # Pascal GPUs + endif(COMPILE_CUDA_SM60) + if(COMPILE_CUDA_SM70) + LIST(APPEND COMPUTE -gencode=arch=compute_70,code=sm_70; -gencode=arch=compute_70,code=compute_70) # Volta GPUs + endif(COMPILE_CUDA_SM70) if(USE_STATIC_LIBS) find_library(CUDA_culibos_LIBRARY NAMES culibos PATHS ${CUDA_TOOLKIT_ROOT_DIR}/lib64) diff --git a/VERSION b/VERSION index 771725ae..b08f51ec 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -v1.8.37 +v1.8.38 diff --git a/src/3rd_party/CMakeLists.txt b/src/3rd_party/CMakeLists.txt index cc9da3fb..b40df585 100644 --- a/src/3rd_party/CMakeLists.txt +++ b/src/3rd_party/CMakeLists.txt @@ -71,8 +71,23 @@ set(INSTALLS "") # this will contain a list of 3rd part dependencies that we ins if(CUDA_FOUND) if(USE_NCCL) - # disables compilation for sm_30 to avoid ptxas warning... that's general Kepler support. But K80s are supported for instance by sm_35 - set(GENCODE "-gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61") + # disables compilation for sm_30 to avoid ptxas warning... that is general Kepler support. But K80s are supported for instance by sm_35 + + set(GENCODE "") + if(COMPILE_CUDA_SM35) + set(GENCODE "${GENCODE} -gencode=arch=compute_35,code=sm_35") + endif(COMPILE_CUDA_SM35) + if(COMPILE_CUDA_SM50) + set(GENCODE "${GENCODE} -gencode=arch=compute_50,code=sm_50") + endif(COMPILE_CUDA_SM50) + if(COMPILE_CUDA_SM60) + set(GENCODE "${GENCODE} -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61") + endif(COMPILE_CUDA_SM60) + if(COMPILE_CUDA_SM70) + set(GENCODE "${GENCODE} -gencode=arch=compute_70,code=sm_70") + endif(COMPILE_CUDA_SM70) + + message(${GENCODE}) # install nccl in ${CMAKE_BINARY_DIR}/local similar to /usr/local linux installation ExternalProject_Add(nccl_install diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index b549ce2d..c1a51390 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -138,7 +138,8 @@ cuda_add_library(marian_cuda tensors/gpu/algorithm.cu tensors/gpu/prod.cpp tensors/gpu/element.cu - tensors/gpu/add.cu + tensors/gpu/add.cu + tensors/gpu/add_all.cu tensors/gpu/tensor_operators.cu tensors/gpu/cudnn_wrappers.cu translator/nth_element.cu diff --git a/src/tensors/gpu/add.inc b/src/tensors/gpu/add.inc index 1e11d011..98723b9d 100755 --- a/src/tensors/gpu/add.inc +++ b/src/tensors/gpu/add.inc @@ -15,21 +15,21 @@ template void Add, BinaryFunctor, BinaryFunctor, Capture>>, marian::Tensor, marian::Tensor>(BinaryFunctor, BinaryFunctor, Capture>>, float, marian::Tensor, marian::Tensor, marian::Tensor); template void Add, UnaryFunctor>>, marian::Tensor, marian::Tensor>(BinaryFunctor, UnaryFunctor>>, float, marian::Tensor, marian::Tensor, marian::Tensor); template void Add, Assignee<2>>, BinaryFunctor>>, marian::Tensor, marian::Tensor>(BinaryFunctor, Assignee<2>>, BinaryFunctor>>, float, marian::Tensor, marian::Tensor, marian::Tensor); -template void Add, Capture>, Assignee<2>>, marian::Tensor, marian::Tensor >(BinaryFunctor, Capture>, Assignee<2> >, float, marian::Tensor, marian::Tensor, marian::Tensor); +template void Add, Capture>, Assignee<2>>, marian::Tensor, marian::Tensor >(BinaryFunctor, Capture>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor); template void Add, Assignee<3>>, Assignee<1>>, marian::Tensor, marian::Tensor, marian::Tensor>(BinaryFunctor, Assignee<3>>, Assignee<1>>, float, marian::Tensor, 
marian::Tensor, marian::Tensor, marian::Tensor); template void Add, Assignee<3>>, Assignee<1>>, marian::Tensor, marian::Tensor, marian::Tensor>(BinaryFunctor, Assignee<3>>, Assignee<1>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor); template void Add, UnaryFunctor, Assignee<2>>>>, marian::Tensor, marian::Tensor, marian::Tensor>(BinaryFunctor, UnaryFunctor, Assignee<2>>>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor); template void Add, Assignee<3>>, Assignee<1>>, marian::Tensor, marian::Tensor, marian::Tensor>(BinaryFunctor, Assignee<3>>, Assignee<1>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor); template void Add, Assignee<3>>, Assignee<1>>, marian::Tensor, marian::Tensor, marian::Tensor>(BinaryFunctor, Assignee<3>>, Assignee<1>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor); template void Add, UnaryFunctor, Assignee<3>>>>, marian::Tensor, marian::Tensor, marian::Tensor>(BinaryFunctor, UnaryFunctor, Assignee<3>>>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor); -template void Add, Assignee<2> >, marian::Tensor, marian::Tensor >(BinaryFunctor, Assignee<2> >, float, marian::Tensor, marian::Tensor, marian::Tensor); -template void marian::gpu::Add, marian::functional::Assignee<1> >, marian::Tensor >(marian::functional::BinaryFunctor, marian::functional::Assignee<1> >, float, marian::Tensor, marian::Tensor); -template void marian::gpu::Aggregate, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, marian::Tensor >(marian::functional::Assignee<1>, float, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, float, marian::Tensor, marian::Tensor); -template void marian::gpu::Aggregate, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, marian::Tensor >(marian::functional::Assignee<1>, float, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, float, marian::Tensor, marian::Tensor); -template void marian::gpu::Aggregate, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, marian::Tensor >(marian::functional::Assignee<1>, float, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, float, marian::Tensor, marian::Tensor); -template void marian::gpu::Aggregate, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, marian::Tensor >(marian::functional::Assignee<1>, float, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, float, marian::Tensor, marian::Tensor); -template void marian::gpu::Add, marian::functional::Assignee<2> >, marian::functional::Assignee<3> >, marian::Tensor, marian::Tensor, marian::Tensor >(marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, marian::functional::Assignee<3> >, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor); -template void marian::gpu::Add, marian::functional::UnaryFunctor, marian::functional::Assignee<3> > > >, marian::Tensor, marian::Tensor, marian::Tensor >(marian::functional::BinaryFunctor, marian::functional::UnaryFunctor, marian::functional::Assignee<3> > > >, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor); -template void marian::gpu::Add, marian::functional::Assignee<2> >, marian::functional::Assignee<3> >, marian::Tensor, marian::Tensor, marian::Tensor >(marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, marian::functional::Assignee<3> >, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor); 
-template void marian::gpu::Add, marian::functional::Capture>, marian::functional::Assignee<2> >, marian::Tensor, marian::Tensor >(marian::functional::BinaryFunctor, marian::functional::Capture>, marian::functional::Assignee<2> >, float, marian::Tensor, marian::Tensor, marian::Tensor); -template void marian::gpu::Add, marian::functional::BinaryFunctor >, marian::functional::BinaryFunctor > >, marian::functional::BinaryFunctor > > > > >, marian::Tensor, marian::Tensor, marian::Tensor >(marian::functional::BinaryFunctor, marian::functional::BinaryFunctor >, marian::functional::BinaryFunctor > >, marian::functional::BinaryFunctor > > > > >, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor); \ No newline at end of file +template void Add, Assignee<2>>, marian::Tensor, marian::Tensor >(BinaryFunctor, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor); +template void Add, Assignee<1>>, marian::Tensor >(BinaryFunctor, Assignee<1>>, float, marian::Tensor, marian::Tensor); +template void Aggregate, BinaryFunctor, Assignee<2>>, marian::Tensor >(Assignee<1>, float, BinaryFunctor, Assignee<2>>, float, marian::Tensor, marian::Tensor); +template void Aggregate, BinaryFunctor, Assignee<2>>, marian::Tensor >(Assignee<1>, float, BinaryFunctor, Assignee<2>>, float, marian::Tensor, marian::Tensor); +template void Aggregate, BinaryFunctor, Assignee<2>>, marian::Tensor >(Assignee<1>, float, BinaryFunctor, Assignee<2>>, float, marian::Tensor, marian::Tensor); +template void Aggregate, BinaryFunctor, Assignee<2>>, marian::Tensor >(Assignee<1>, float, BinaryFunctor, Assignee<2>>, float, marian::Tensor, marian::Tensor); +template void Add, Assignee<2>>, Assignee<3>>, marian::Tensor, marian::Tensor, marian::Tensor >(BinaryFunctor, Assignee<2>>, Assignee<3>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor); +template void Add, UnaryFunctor, Assignee<3>>>>, marian::Tensor, marian::Tensor, marian::Tensor >(BinaryFunctor, UnaryFunctor, Assignee<3>>>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor); +template void Add, Assignee<2>>, Assignee<3>>, marian::Tensor, marian::Tensor, marian::Tensor >(BinaryFunctor, Assignee<2>>, Assignee<3>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor); +template void Add, Capture>, Assignee<2>>, marian::Tensor, marian::Tensor >(BinaryFunctor, Capture>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor); +template void Add, BinaryFunctor>, BinaryFunctor>>, BinaryFunctor>>>>>, marian::Tensor, marian::Tensor, marian::Tensor >(BinaryFunctor, BinaryFunctor>, BinaryFunctor>>, BinaryFunctor>>>>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor); \ No newline at end of file diff --git a/src/tensors/gpu/add_all.cu b/src/tensors/gpu/add_all.cu new file mode 100644 index 00000000..ad3ac252 --- /dev/null +++ b/src/tensors/gpu/add_all.cu @@ -0,0 +1,116 @@ +#include "tensors/gpu/add_all.h" +#include "tensors/gpu/cuda_helpers.h" +#include "functional/functional.h" +#include "tensors/tensor_operators.h" +#include "3rd_party/reduce_all.h" // only works with CUDA >9.0, we are dropping CUDA 8.0 support, also changed in CMakeLists.txt + +namespace marian { + +#if COMPILE_FP16 +// local overload to determine tensor type +template <> inline Type typeId() { return Type::float16; } +#endif + +// Version with variadic template arguments, called by version with explicit arguments below +template +void AggregateAllVar(Ptr allocator, + Functor functor, + AccType 
aggInit,
+                 AggFunctor aggFunctor,
+                 AccType scale,
+                 Tensor out,
+                 const Tensors... tensors) {
+  cudaSetDevice(out->getDeviceId().no);
+
+  static_assert(CUDA_VERSION >= 9000, "Marian requires CUDA_VERSION >= 9000 (9.0)");
+
+  constexpr size_t K = sizeof...(Tensors); // obtain arity K of tensors...
+  functional::Array, K> gIns = {tensors...}; // convert to array of K objects of type functional::Tensor
+  functional::Shape full = marian::Shape::broadcast({tensors...}); // compute maximal broadcasted shape
+
+  int size = full.elements();
+  int threads = (size < MAX_THREADS * 2) ? nextPow2((size + 1) / 2) : MAX_THREADS; // suggested in NVidia example for the all_reduce kernel
+  int blocks = std::min(MAX_BLOCKS, (size + (threads * 2 - 1)) / (threads * 2)); // suggested in NVidia example for the all_reduce kernel
+
+  // The all_reduce kernel by NVidia needs to perform multiple passes if the number of blocks needed to perform the reduction is larger than 1.
+  // Here we allocate the memory for the intermediate reductions for each block.
+  Tensor blockMem;
+  if(blocks > 1 || out->type() != typeId()) { // if the out tensor does not have elementType AccType we need to allocate and convert later
+    MemoryPiece::PtrType temporaryMemory;
+    if(allocator) {
+      temporaryMemory = allocator->alloc(blocks);
+    } else { // @TODO: get rid of this branch
+      uint8_t* temporaryMemoryPtr = 0;
+      CUDA_CHECK(cudaMalloc(&temporaryMemoryPtr, sizeof(AccType) * blocks));
+      temporaryMemory = MemoryPiece::New(temporaryMemoryPtr, sizeof(AccType) * blocks); // @TODO: consider implementing MemoryPiece::cudaMalloc(size) for managed memory
+    }
+    blockMem = TensorBase::New(temporaryMemory,
+                               Shape({blocks}),
+                               typeId(),
+                               out->getBackend());
+    blockMem->set(aggInit); // set temporary memory to aggInit
+  }
+  else { // we are reducing into a single element now and the type matches, just use out as memory
+    blockMem = out; // do not set final output memory as we might be summing gradients... needs to be handled outside this function
+  }
+
+  functional::Tensor gBlockMem = blockMem;
+  reduceSinglePass(functor, aggInit, aggFunctor, scale, full, /*out=*/gBlockMem, /*in=*/gIns, threads, blocks); // First pass reduction into intermediate memory
+
+  // If we actually needed more than one block to perform the first pass reduction, recursively run a second pass reduction over block memory until block memory has size 1.
+  if(blocks > 1) {
+    using namespace functional;
+    auto identity = _1; // transformation was done in first pass, hence only identity
+    AggregateAll(allocator, identity, aggInit, aggFunctor, scale, out, /*in=*/blockMem); // Reducing AccType into AccType now (meta-reduction)
+  } else if(out->type() != typeId()) { // it's only a single block, but we need to convert to a different type, as mentioned above
+    CopyCast(out, blockMem);
+  }
+
+  if(blockMem != out) {
+    // Free temporary memory whether allocated in allocator or via cudaMalloc
+    if(allocator)
+      allocator->free(blockMem->memory());
+    else if(blockMem->memory()->data())
+      CUDA_CHECK(cudaFree(blockMem->memory()->data()));
+  }
+}
+
+template
+void AggregateAll(Ptr allocator,
+                  Functor functor,
+                  AccType aggInit,
+                  AggFunctor aggFunctor,
+                  AccType scale,
+                  Tensor out,
+                  const Tensor in1) {
+  AggregateAllVar(allocator, functor, aggInit, aggFunctor, scale, out, in1);
+}
+
+template
+void AggregateAll(Ptr allocator,
+                  Functor functor,
+                  AccType aggInit,
+                  AggFunctor aggFunctor,
+                  AccType scale,
+                  Tensor out,
+                  const Tensor in1,
+                  const Tensor in2) {
+  AggregateAllVar(allocator, functor, aggInit, aggFunctor, scale, out, in1, in2);
+}
+
+template
+void AggregateAll(Ptr allocator,
+                  Functor functor,
+                  AccType aggInit,
+                  AggFunctor aggFunctor,
+                  AccType scale,
+                  Tensor out,
+                  const Tensor in1,
+                  const Tensor in2,
+                  const Tensor in3) {
+  AggregateAllVar(allocator, functor, aggInit, aggFunctor, scale, out, in1, in2, in3);
+}
+
+#include "tensors/gpu/add_all.inc"
+
+}
\ No newline at end of file
diff --git a/src/tensors/gpu/add_all.h b/src/tensors/gpu/add_all.h
index 5ef079a1..2e37fd49 100644
--- a/src/tensors/gpu/add_all.h
+++ b/src/tensors/gpu/add_all.h
@@ -3,87 +3,46 @@
 // This header file provides wrappers around NVidia's reduce_all kernel with our custom aggregation functionality
 // This kernel reduces a tensor into a single value. We have modified it to allow for different types of aggregations
 // like summing or max etc.
-
+
 #include "tensors/gpu/cuda_helpers.h"
 #include "tensors/tensor.h"
 #include "tensors/allocator.h"
-#include "functional/functional.h"
 #include "functional/tensor.h"
 #include "tensors/tensor_operators.h"
-#include "3rd_party/reduce_all.h" // only works with CUDA >9.0, we are dropping CUDA 8.0 support, also changed in CMakeLists.txt
-
-#include
-
 namespace marian {
 
-#if COMPILE_FP16
-// local overload to determine tensor type
-template <> inline Type typeId() { return Type::float16; }
-#endif
-
-template
+// These function declarations are repeated because template specialization with variadic template arguments does not seem to work.
+// Here I am just creating versions for 1, 2, and 3 arguments. To be extended if required.
+template
 void AggregateAll(Ptr allocator,
                   Functor functor,
                   AccType aggInit,
                   AggFunctor aggFunctor,
                   AccType scale,
                   Tensor out,
-                  const Tensors... tensors) {
-  cudaSetDevice(out->getDeviceId().no);
+                  const Tensor in1);
 
-  static_assert(CUDA_VERSION >= 9000, "Marian requires CUDA_VERSION >= 9000 (9.0)");
+template
+void AggregateAll(Ptr allocator,
+                  Functor functor,
+                  AccType aggInit,
+                  AggFunctor aggFunctor,
+                  AccType scale,
+                  Tensor out,
+                  const Tensor in1,
+                  const Tensor in2);
 
-  constexpr size_t K = sizeof...(Tensors); // obtain arity K of tensors...
-  functional::Array, K> gIns = {tensors...}; // convert to array of K objects of type functional::Tensor
-  functional::Shape full = marian::Shape::broadcast({tensors...}); // compute maximal broadcasted shape
-
-  int size = full.elements();
-  int threads = (size < MAX_THREADS * 2) ? nextPow2((size + 1) / 2) : MAX_THREADS; // suggested in NVidia example for the all_reduce kernel
-  int blocks = std::min(MAX_BLOCKS, (size + (threads * 2 - 1)) / (threads * 2)); // suggested in NVidia example for the all_reduce kernel
-
-  // The all_reduce kernel by nivida needs to perform multiple passes if the number of blocks needed to perform the reduction is larger than 1.
-  // Here we allocate the memory for the intermediate reductions for each block.
-  Tensor blockMem;
-  if(blocks > 1 || out->type() != typeId()) { // if the out tensor does not have elementType AccType we need to allocate and convert later
-    MemoryPiece::PtrType temporaryMemory;
-    if(allocator) {
-      temporaryMemory = allocator->alloc(blocks);
-    } else { // @TODO: get rid of this branch
-      uint8_t* temporaryMemoryPtr = 0;
-      CUDA_CHECK(cudaMalloc(&temporaryMemoryPtr, sizeof(AccType) * blocks));
-      temporaryMemory = MemoryPiece::New(temporaryMemoryPtr, sizeof(AccType) * blocks); // @TODO: consider implementing MemoryPiece::cudaMalloc(size) for managed memory
-    }
-    blockMem = TensorBase::New(temporaryMemory,
-                               Shape({blocks}),
-                               typeId(),
-                               out->getBackend());
-    blockMem->set(aggInit); // set temporary memory to aggInit
-  }
-  else { // we are reducing into a single element now and the type matches, just use out as memory
-    blockMem = out; // do not set final output memory as we might be summing gradients... needs to be handled outside this function
-  }
-
-  functional::Tensor gBlockMem = blockMem;
-  reduceSinglePass(functor, aggInit, aggFunctor, scale, full, /*out=*/gBlockMem, /*in=*/gIns, threads, blocks); // First pass reduction into intermediate memory
-
-  // If we actually needed more than one block to perform the first pass reduction, recursively run a second pass reduction over block memory until block memory has size 1.
-  if(blocks > 1) {
-    using namespace functional;
-    auto identity = _1; // transformation was done in first pass, hence only identity
-    AggregateAll(allocator, identity, aggInit, aggFunctor, scale, out, /*in=*/blockMem); // Reducing AccType in AccType now (meta-reduction)
-  } else if(out->type() != typeId()) { // it's only a single block, but we need to convert to different type, as mentioned above
-    CopyCast(out, blockMem);
-  }
-
-  if(blockMem != out) {
-    // Free temporary memory whether allocated in allocator or via cudaMalloc
-    if(allocator)
-      allocator->free(blockMem->memory());
-    else if(blockMem->memory()->data())
-      CUDA_CHECK(cudaFree(blockMem->memory()->data()));
-  }
-}
+template
+void AggregateAll(Ptr allocator,
+                  Functor functor,
+                  AccType aggInit,
+                  AggFunctor aggFunctor,
+                  AccType scale,
+                  Tensor out,
+                  const Tensor in1,
+                  const Tensor in2,
+                  const Tensor in3);
 
 // Aggregates all values into a single tensor and returns the value of that tensor as a float
 // This does a GPU to CPU memory copy via TensorBase::scalar().
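Note (editorial sketch, not part of the patch): a call site reaches the overloads declared above roughly as follows, assuming Marian's functional placeholders _1/_2; the helper name exampleSumSquares is hypothetical. Each such call fixes one combination of element type, accumulator type, transform functor and aggregation functor, and with the kernel body now living in add_all.cu, every combination used somewhere in the code must also appear as an explicit instantiation in add_all.inc below, mirroring the element.inc/add.inc convention.

// Hedged usage sketch: sum of squares of `in`, reduced into the
// single-element float tensor `out`.
#include "tensors/gpu/add_all.h"
#include "functional/functional.h"

namespace marian {
void exampleSumSquares(Ptr<Allocator> allocator, Tensor out, const Tensor in) {
  using namespace functional;
  // transform each element with _1 * _1, fold with _1 + _2, start at 0, scale by 1
  AggregateAll<float, /*AccType=*/float>(allocator, _1 * _1, 0.f, _1 + _2, 1.f, out, in);
}
} // namespace marian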
diff --git a/src/tensors/gpu/add_all.inc b/src/tensors/gpu/add_all.inc new file mode 100644 index 00000000..73b0bda9 --- /dev/null +++ b/src/tensors/gpu/add_all.inc @@ -0,0 +1,71 @@ +// see element.inc for instructions on how to maintain this +using namespace functional; + +template void AggregateAll>, Assignee<2>>, BinaryFunctor, Assignee<2>>>(std::shared_ptr, BinaryFunctor>, Assignee<2>>, float, BinaryFunctor, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor); +template void AggregateAll>>, Assignee<2>>, BinaryFunctor, Assignee<2>>>(std::shared_ptr, BinaryFunctor>>, Assignee<2>>, float, BinaryFunctor, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor); +template void AggregateAll, BinaryFunctor, Assignee<2>>>>, BinaryFunctor, Assignee<2>>>(std::shared_ptr, BinaryFunctor, BinaryFunctor, Assignee<2>>>>, float, BinaryFunctor, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor); +template void AggregateAll, Assignee<2>>, BinaryFunctor, Assignee<2>>>(std::shared_ptr, BinaryFunctor, Assignee<2>>, float, BinaryFunctor, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor); +template void AggregateAll>, BinaryFunctor, Assignee<2>>>(std::shared_ptr, BinaryFunctor>, float, BinaryFunctor, Assignee<2>>, float, marian::Tensor, marian::Tensor); +template void AggregateAll, Capture>, Assignee<2>>, BinaryFunctor, Assignee<2>>>(std::shared_ptr, BinaryFunctor, Capture>, Assignee<2>>, float, BinaryFunctor, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor); +template void AggregateAll>, Assignee<2>>, BinaryFunctor, Assignee<3>>>, BinaryFunctor, Assignee<2>>>(std::shared_ptr, BinaryFunctor>, Assignee<2>>, BinaryFunctor, Assignee<3>>>, float, BinaryFunctor, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor); +template void AggregateAll>, BinaryFunctor, Assignee<2>>>(std::shared_ptr, UnaryFunctor>, float, BinaryFunctor, Assignee<2>>, float, marian::Tensor, marian::Tensor); +template void AggregateAll, BinaryFunctor, BinaryFunctor>, BinaryFunctor>>>>, BinaryFunctor, Assignee<2>>>(std::shared_ptr, BinaryFunctor, BinaryFunctor, BinaryFunctor>, BinaryFunctor>>>>, float, BinaryFunctor, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor); +template void AggregateAll, UnaryFunctor>>, BinaryFunctor, Assignee<2>>>(std::shared_ptr, BinaryFunctor, UnaryFunctor>>, float, BinaryFunctor, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor); +template void AggregateAll, BinaryFunctor>>, BinaryFunctor, Assignee<2>>>(std::shared_ptr, BinaryFunctor, BinaryFunctor>>, float, BinaryFunctor, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor); +template void AggregateAll, BinaryFunctor, Capture>>, BinaryFunctor, Assignee<2>>>(std::shared_ptr, BinaryFunctor, BinaryFunctor, Capture>>, float, BinaryFunctor, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor); +template void AggregateAll, UnaryFunctor>>, BinaryFunctor, Assignee<2>>>(std::shared_ptr, BinaryFunctor, UnaryFunctor>>, float, BinaryFunctor, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor); +template void AggregateAll, Assignee<2>>, BinaryFunctor>>, BinaryFunctor, Assignee<2>>>(std::shared_ptr, BinaryFunctor, Assignee<2>>, BinaryFunctor>>, float, BinaryFunctor, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor); +template void AggregateAll, Capture>, Assignee<2>>, BinaryFunctor, Assignee<2>>>(std::shared_ptr, BinaryFunctor, Capture>, 
Assignee<2>>, float, BinaryFunctor, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor); +template void AggregateAll, Assignee<3>>, Assignee<1>>, BinaryFunctor, Assignee<2>>>(std::shared_ptr, BinaryFunctor, Assignee<3>>, Assignee<1>>, float, BinaryFunctor, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor); +template void AggregateAll, Assignee<3>>, Assignee<1>>, BinaryFunctor, Assignee<2>>>(std::shared_ptr, BinaryFunctor, Assignee<3>>, Assignee<1>>, float, BinaryFunctor, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor); +template void AggregateAll, UnaryFunctor, Assignee<2>>>>, BinaryFunctor, Assignee<2>>>(std::shared_ptr, BinaryFunctor, UnaryFunctor, Assignee<2>>>>, float, BinaryFunctor, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor); +template void AggregateAll, Assignee<3>>, Assignee<1>>, BinaryFunctor, Assignee<2>>>(std::shared_ptr, BinaryFunctor, Assignee<3>>, Assignee<1>>, float, BinaryFunctor, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor); +template void AggregateAll, Assignee<3>>, Assignee<1>>, BinaryFunctor, Assignee<2>>>(std::shared_ptr, BinaryFunctor, Assignee<3>>, Assignee<1>>, float, BinaryFunctor, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor); +template void AggregateAll, UnaryFunctor, Assignee<3>>>>, BinaryFunctor, Assignee<2>>>(std::shared_ptr, BinaryFunctor, UnaryFunctor, Assignee<3>>>>, float, BinaryFunctor, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor); +template void AggregateAll, Assignee<2>>, BinaryFunctor, Assignee<2>>>(std::shared_ptr, BinaryFunctor, Assignee<2>>, float, BinaryFunctor, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor); +template void AggregateAll, BinaryFunctor, Assignee<2>>>(std::shared_ptr, Assignee<1>, float, BinaryFunctor, Assignee<2>>, float, marian::Tensor, marian::Tensor); +template void AggregateAll, BinaryFunctor, Assignee<2>>>(std::shared_ptr, Assignee<1>, float, BinaryFunctor, Assignee<2>>, float, marian::Tensor, marian::Tensor); +template void AggregateAll, BinaryFunctor, Assignee<2>>>(std::shared_ptr, Assignee<1>, float, BinaryFunctor, Assignee<2>>, float, marian::Tensor, marian::Tensor); +template void AggregateAll, BinaryFunctor, Assignee<2>>>(std::shared_ptr, Assignee<1>, float, BinaryFunctor, Assignee<2>>, float, marian::Tensor, marian::Tensor); +template void AggregateAll, Assignee<2>>, Assignee<3>>, BinaryFunctor, Assignee<2>>>(std::shared_ptr, BinaryFunctor, Assignee<2>>, Assignee<3>>, float, BinaryFunctor, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor); +template void AggregateAll, UnaryFunctor, Assignee<3>>>>, BinaryFunctor, Assignee<2>>>(std::shared_ptr, BinaryFunctor, UnaryFunctor, Assignee<3>>>>, float, BinaryFunctor, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor); +template void AggregateAll, Assignee<2>>, Assignee<3>>, BinaryFunctor, Assignee<2>>>(std::shared_ptr, BinaryFunctor, Assignee<2>>, Assignee<3>>, float, BinaryFunctor, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor); +template void AggregateAll, Capture>, Assignee<2>>, BinaryFunctor, Assignee<2>>>(std::shared_ptr, BinaryFunctor, Capture>, Assignee<2>>, float, BinaryFunctor, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor); +template void AggregateAll, BinaryFunctor>, 
BinaryFunctor>>, BinaryFunctor>>>>>, BinaryFunctor, Assignee<2>>>(std::shared_ptr, BinaryFunctor, BinaryFunctor>, BinaryFunctor>>, BinaryFunctor>>>>>, float, BinaryFunctor, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor); +template void AggregateAll, Assignee<1>>, BinaryFunctor, Assignee<2>>>(std::shared_ptr, BinaryFunctor, Assignee<1>>, float, BinaryFunctor, Assignee<2>>, float, marian::Tensor, marian::Tensor); + +#if COMPILE_FP16 +template void AggregateAll<__half, float, BinaryFunctor>, Assignee<2>>, BinaryFunctor, Assignee<2>>>(std::shared_ptr, BinaryFunctor>, Assignee<2>>, float, BinaryFunctor, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor); +template void AggregateAll<__half, float, BinaryFunctor>>, Assignee<2>>, BinaryFunctor, Assignee<2>>>(std::shared_ptr, BinaryFunctor>>, Assignee<2>>, float, BinaryFunctor, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor); +template void AggregateAll<__half, float, BinaryFunctor, BinaryFunctor, Assignee<2>>>>, BinaryFunctor, Assignee<2>>>(std::shared_ptr, BinaryFunctor, BinaryFunctor, Assignee<2>>>>, float, BinaryFunctor, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor); +template void AggregateAll<__half, float, BinaryFunctor, Assignee<2>>, BinaryFunctor, Assignee<2>>>(std::shared_ptr, BinaryFunctor, Assignee<2>>, float, BinaryFunctor, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor); +template void AggregateAll<__half, float, BinaryFunctor>, BinaryFunctor, Assignee<2>>>(std::shared_ptr, BinaryFunctor>, float, BinaryFunctor, Assignee<2>>, float, marian::Tensor, marian::Tensor); +template void AggregateAll<__half, float, BinaryFunctor, Capture>, Assignee<2>>, BinaryFunctor, Assignee<2>>>(std::shared_ptr, BinaryFunctor, Capture>, Assignee<2>>, float, BinaryFunctor, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor); +template void AggregateAll<__half, float, BinaryFunctor>, Assignee<2>>, BinaryFunctor, Assignee<3>>>, BinaryFunctor, Assignee<2>>>(std::shared_ptr, BinaryFunctor>, Assignee<2>>, BinaryFunctor, Assignee<3>>>, float, BinaryFunctor, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor); +template void AggregateAll<__half, float, UnaryFunctor>, BinaryFunctor, Assignee<2>>>(std::shared_ptr, UnaryFunctor>, float, BinaryFunctor, Assignee<2>>, float, marian::Tensor, marian::Tensor); +template void AggregateAll<__half, float, BinaryFunctor, BinaryFunctor, BinaryFunctor>, BinaryFunctor>>>>, BinaryFunctor, Assignee<2>>>(std::shared_ptr, BinaryFunctor, BinaryFunctor, BinaryFunctor>, BinaryFunctor>>>>, float, BinaryFunctor, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor); +template void AggregateAll<__half, float, BinaryFunctor, UnaryFunctor>>, BinaryFunctor, Assignee<2>>>(std::shared_ptr, BinaryFunctor, UnaryFunctor>>, float, BinaryFunctor, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor); +template void AggregateAll<__half, float, BinaryFunctor, BinaryFunctor>>, BinaryFunctor, Assignee<2>>>(std::shared_ptr, BinaryFunctor, BinaryFunctor>>, float, BinaryFunctor, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor); +template void AggregateAll<__half, float, BinaryFunctor, BinaryFunctor, Capture>>, BinaryFunctor, Assignee<2>>>(std::shared_ptr, BinaryFunctor, BinaryFunctor, Capture>>, float, BinaryFunctor, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor); +template void AggregateAll<__half, 
float, BinaryFunctor, UnaryFunctor>>, BinaryFunctor, Assignee<2>>>(std::shared_ptr, BinaryFunctor, UnaryFunctor>>, float, BinaryFunctor, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor); +template void AggregateAll<__half, float, BinaryFunctor, Assignee<2>>, BinaryFunctor>>, BinaryFunctor, Assignee<2>>>(std::shared_ptr, BinaryFunctor, Assignee<2>>, BinaryFunctor>>, float, BinaryFunctor, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor); +template void AggregateAll<__half, float, BinaryFunctor, Capture>, Assignee<2>>, BinaryFunctor, Assignee<2>>>(std::shared_ptr, BinaryFunctor, Capture>, Assignee<2>>, float, BinaryFunctor, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor); +template void AggregateAll<__half, float, BinaryFunctor, Assignee<3>>, Assignee<1>>, BinaryFunctor, Assignee<2>>>(std::shared_ptr, BinaryFunctor, Assignee<3>>, Assignee<1>>, float, BinaryFunctor, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor); +template void AggregateAll<__half, float, BinaryFunctor, Assignee<3>>, Assignee<1>>, BinaryFunctor, Assignee<2>>>(std::shared_ptr, BinaryFunctor, Assignee<3>>, Assignee<1>>, float, BinaryFunctor, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor); +template void AggregateAll<__half, float, BinaryFunctor, UnaryFunctor, Assignee<2>>>>, BinaryFunctor, Assignee<2>>>(std::shared_ptr, BinaryFunctor, UnaryFunctor, Assignee<2>>>>, float, BinaryFunctor, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor); +template void AggregateAll<__half, float, BinaryFunctor, Assignee<3>>, Assignee<1>>, BinaryFunctor, Assignee<2>>>(std::shared_ptr, BinaryFunctor, Assignee<3>>, Assignee<1>>, float, BinaryFunctor, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor); +template void AggregateAll<__half, float, BinaryFunctor, Assignee<3>>, Assignee<1>>, BinaryFunctor, Assignee<2>>>(std::shared_ptr, BinaryFunctor, Assignee<3>>, Assignee<1>>, float, BinaryFunctor, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor); +template void AggregateAll<__half, float, BinaryFunctor, UnaryFunctor, Assignee<3>>>>, BinaryFunctor, Assignee<2>>>(std::shared_ptr, BinaryFunctor, UnaryFunctor, Assignee<3>>>>, float, BinaryFunctor, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor); +template void AggregateAll<__half, float, BinaryFunctor, Assignee<2>>, BinaryFunctor, Assignee<2>>>(std::shared_ptr, BinaryFunctor, Assignee<2>>, float, BinaryFunctor, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor); +template void AggregateAll<__half, float, Assignee<1>, BinaryFunctor, Assignee<2>>>(std::shared_ptr, Assignee<1>, float, BinaryFunctor, Assignee<2>>, float, marian::Tensor, marian::Tensor); +template void AggregateAll<__half, float, Assignee<1>, BinaryFunctor, Assignee<2>>>(std::shared_ptr, Assignee<1>, float, BinaryFunctor, Assignee<2>>, float, marian::Tensor, marian::Tensor); +template void AggregateAll<__half, float, Assignee<1>, BinaryFunctor, Assignee<2>>>(std::shared_ptr, Assignee<1>, float, BinaryFunctor, Assignee<2>>, float, marian::Tensor, marian::Tensor); +template void AggregateAll<__half, float, Assignee<1>, BinaryFunctor, Assignee<2>>>(std::shared_ptr, Assignee<1>, float, BinaryFunctor, Assignee<2>>, float, marian::Tensor, marian::Tensor); +template void AggregateAll<__half, float, BinaryFunctor, Assignee<2>>, Assignee<3>>, BinaryFunctor, 
Assignee<2>>>(std::shared_ptr, BinaryFunctor, Assignee<2>>, Assignee<3>>, float, BinaryFunctor, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor); +template void AggregateAll<__half, float, BinaryFunctor, UnaryFunctor, Assignee<3>>>>, BinaryFunctor, Assignee<2>>>(std::shared_ptr, BinaryFunctor, UnaryFunctor, Assignee<3>>>>, float, BinaryFunctor, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor); +template void AggregateAll<__half, float, BinaryFunctor, Assignee<2>>, Assignee<3>>, BinaryFunctor, Assignee<2>>>(std::shared_ptr, BinaryFunctor, Assignee<2>>, Assignee<3>>, float, BinaryFunctor, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor); +template void AggregateAll<__half, float, BinaryFunctor, Capture>, Assignee<2>>, BinaryFunctor, Assignee<2>>>(std::shared_ptr, BinaryFunctor, Capture>, Assignee<2>>, float, BinaryFunctor, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor); +template void AggregateAll<__half, float, BinaryFunctor, BinaryFunctor>, BinaryFunctor>>, BinaryFunctor>>>>>, BinaryFunctor, Assignee<2>>>(std::shared_ptr, BinaryFunctor, BinaryFunctor>, BinaryFunctor>>, BinaryFunctor>>>>>, float, BinaryFunctor, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor); +template void AggregateAll<__half, float, Assignee<1>, BinaryFunctor, Assignee<2>>>(std::shared_ptr, Assignee<1>, float, BinaryFunctor, Assignee<2>>, float, marian::Tensor, marian::Tensor); +template void AggregateAll<__half, float, BinaryFunctor, Assignee<1>>, BinaryFunctor, Assignee<2>>>(std::shared_ptr, BinaryFunctor, Assignee<1>>, float, BinaryFunctor, Assignee<2>>, float, marian::Tensor, marian::Tensor); +#endif \ No newline at end of file
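Maintenance sketch (editorial, hypothetical entry): as with element.inc, when a new functor combination makes the linker report an undefined AggregateAll reference, the missing signature is taken verbatim from the demangled error message and appended to the float list above and, for the __half variant, inside the COMPILE_FP16 block. A new sum-of-absolute-values reduction, with illustrative functors only, would add a pair of lines shaped like this (shown adjacent here for brevity):

template void AggregateAll<float, float, UnaryFunctor<elem::Abs, Assignee<1>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<marian::Allocator>, UnaryFunctor<elem::Abs, Assignee<1>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor);
#if COMPILE_FP16
template void AggregateAll<__half, float, UnaryFunctor<elem::Abs, Assignee<1>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<marian::Allocator>, UnaryFunctor<elem::Abs, Assignee<1>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor);
#endif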