more coda separation

This commit is contained in:
Marcin Junczys-Dowmunt 2018-03-02 18:06:26 -08:00
parent 42293372ca
commit 8ec6eef9d5
14 changed files with 214 additions and 75 deletions

View File

@ -10,6 +10,7 @@ option(COMPILE_EXAMPLES "Compile examples" OFF)
option(COMPILE_TESTS "Compile tests" OFF)
option(COMPILE_SERVER "Compile marian-server" ON)
option(COMPILE_CPU "Compile CPU version" ON)
option(COMPILE_CUDA "Compile GPU version" ON)
option(USE_CUDNN "Use CUDNN library" OFF)
option(USE_MPI "Use MPI library" OFF)
@ -20,14 +21,17 @@ include(GetVersionFromFile)
message(STATUS "Project name: ${PROJECT_NAME}")
message(STATUS "Project version: ${PROJECT_VERSION_STRING_FULL}")
# Set compilation flags
set(CMAKE_CXX_FLAGS_RELEASE " -std=c++11 -O3 -Ofast -m64 -march=native -Wl,--no-as-needed -funroll-loops -ffinite-math-only -fPIC -Wno-unused-result -Wno-deprecated -Wno-deprecated-gpu-targets")
set(CMAKE_CXX_FLAGS_DEBUG " -std=c++11 -g -O0 -fPIC -Wno-unused-result -Wno-deprecated -Wno-deprecated-gpu-targets")
set(CMAKE_CXX_FLAGS_RELEASE " -std=c++11 -O3 -Ofast -m64 -pthread -march=native -Wl,--no-as-needed -funroll-loops -ffinite-math-only -fPIC -Wno-unused-result -Wno-deprecated -Wno-deprecated-gpu-targets")
set(CMAKE_CXX_FLAGS_DEBUG " -std=c++11 -g -O0 -pthread -fPIC -Wno-unused-result -Wno-deprecated -Wno-deprecated-gpu-targets")
set(CMAKE_CXX_FLAGS_PROFILE "${CMAKE_CXX_FLAGS_RELEASE} -g -pg")
set(CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS_RELEASE})
# Find packages
set(EXT_LIBS ${EXT_LIBS} ${CMAKE_DL_LIBS})
if(COMPILE_CUDA)
find_package(CUDA "8.0" REQUIRED)
if(CUDA_FOUND)
set(EXT_LIBS ${EXT_LIBS} ${CUDA_curand_LIBRARY} ${CUDA_cusparse_LIBRARY})
@ -41,8 +45,15 @@ if(CUDA_FOUND)
LIST(APPEND CUDA_NVCC_FLAGS -DCUDNN; )
endif(CUDNN_FOUND)
endif(USE_CUDNN)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DCUDA_FOUND")
else(CUDA_FOUND)
message(WARNING "Cannot find CUDA compiling CPU version only")
endif(CUDA_FOUND)
else(COMPILE_CUDA)
message(WARNING "COMPILE_CUDA=off : Building only CPU version")
endif(COMPILE_CUDA)
if (CMAKE_BUILD_TYPE STREQUAL "Debug")
list(APPEND CUDA_NVCC_FLAGS -std=c++11; --default-stream per-thread; -O0; -g; -Xcompiler '-fPIC'; -arch=sm_30; -gencode=arch=compute_30,code=sm_30; -gencode=arch=compute_50,code=sm_50; -gencode=arch=compute_52,code=sm_52; -gencode=arch=compute_60,code=sm_60; -gencode=arch=compute_61,code=sm_61; -gencode=arch=compute_61,code=compute_61 ;)
else(CMAKE_BUILD_TYPE STREQUAL "Debug")

View File

@ -4,7 +4,7 @@ include_directories(.)
include_directories(3rd_party)
include_directories(3rd_party/SQLiteCpp/include)
cuda_add_library(marian
add_library(marian STATIC
common/utils.cpp
common/logging.cpp
common/config.cpp
@ -26,14 +26,6 @@ cuda_add_library(marian
tensors/cpu/prod.cpp
tensors/cpu/tensor_operators.cpp
tensors/gpu/device.cu
tensors/gpu/algorithm.cu
tensors/gpu/dropout.cu
tensors/gpu/prod.cu
tensors/gpu/element.cu
tensors/gpu/add.cu
tensors/gpu/tensor_operators.cu
tensors/gpu/cudnn_wrappers.cu
graph/expression_graph.cpp
graph/expression_operators.cpp
@ -53,26 +45,37 @@ cuda_add_library(marian
translator/history.cpp
translator/output_collector.cpp
translator/nth_element.cu
translator/nth_element.cpp
translator/helpers.cu
translator/helpers.cpp
translator/scorers.cpp
training/dropper.cu
training/graph_group_async.cpp
training/graph_group_async_drop.cpp
training/graph_group_sync.cpp
training/graph_group_singleton.cpp
training/graph_group_multinode.cpp
training/sparse_tensor.cu
training/validator.cpp
rescorer/score_collector.cpp
$<TARGET_OBJECTS:libyaml-cpp>
$<TARGET_OBJECTS:SQLiteCpp>
STATIC
)
$<TARGET_OBJECTS:SQLiteCpp>)
if(CUDA_FOUND)
cuda_add_library(marian_cuda
tensors/gpu/device.cu
tensors/gpu/algorithm.cu
tensors/gpu/dropout.cu
tensors/gpu/prod.cu
tensors/gpu/element.cu
tensors/gpu/add.cu
tensors/gpu/tensor_operators.cu
tensors/gpu/cudnn_wrappers.cu
translator/nth_element.cu
translator/helpers.cu
training/dropper.cu
training/sparse_tensor.cu
STATIC)
endif(CUDA_FOUND)
set_target_properties(marian PROPERTIES LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}")
set_target_properties(marian PROPERTIES ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}")
@ -99,7 +102,10 @@ endif(COMPILE_SERVER)
foreach(exec ${EXECUTABLES})
target_link_libraries(${exec} marian ${EXT_LIBS})
cuda_add_cublas_to_target(${exec})
if(CUDA_FOUND)
target_link_libraries(${exec} marian marian_cuda ${EXT_LIBS} ${CMAKE_THREAD_LIBS_INIT})
cuda_add_cublas_to_target(${exec})
endif(CUDA_FOUND)
set_target_properties(${exec} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}")
endforeach(exec)

View File

@ -3,6 +3,9 @@ add_executable(mnist_example mnist/mnist_ffnn.cpp)
foreach(exec iris_example mnist_example)
target_link_libraries(${exec} marian ${EXT_LIBS})
cuda_add_cublas_to_target(${exec})
if(CUDA_FOUND)
target_link_libraries(${exec} marian marian_cuda ${EXT_LIBS})
cuda_add_cublas_to_target(${exec})
endif(CUDA_FOUND)
set_target_properties(${exec} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}")
endforeach(exec)

View File

@ -1,5 +1,4 @@
#include "graph/expression_operators.h"
//#include "kernels/sparse.h"
#include "layers/constructors.h"
#include "graph/node_operators.h"
@ -333,6 +332,8 @@ Expr shift(Expr a, Shape shift) {
// return Expression<LexicalProbNodeOp>(logits, att, eps, lf);
//}
#ifdef CUDA_FOUND
Expr avg_pooling(
Expr x,
int height,
@ -410,4 +411,6 @@ Expr pooling_with_masking(Expr x, Expr mask, int width, bool isEven) {
return Expression<PoolingWithMaskingOp>(x, mask, width, isEven);
}
#endif
}

View File

@ -1,15 +1,13 @@
#pragma once
#include "tensors/tensor.h"
#include "tensors/gpu/backend.h"
#include "tensors/backend.h"
#include "graph/node.h"
//#include "kernels/sparse.h"
#include "tensors/tensor_operators.h"
#include "functional/functional.h"
#include "tensors/gpu/cudnn_wrappers.h"
//#include "tensors/gpu/cudnn_wrappers.h"
namespace marian {

View File

@ -1,14 +1,19 @@
#include "tensors/backend.h"
#ifdef CUDA_FOUND
#include "tensors/gpu/backend.h"
#endif
#include "tensors/cpu/backend.h"
namespace marian {
Ptr<Backend> BackendByDevice(DeviceId deviceId, size_t seed) {
#ifdef CUDA_FOUND
if(deviceId.type == DeviceType::gpu)
return New<gpu::Backend>(deviceId, seed);
else
#endif
return New<cpu::Backend>(deviceId, seed);
}

View File

@ -3,8 +3,8 @@
* SPDX-License-Identifier: MIT
*/
#include "tensors/gpu/prod.h"
#include "tensors/gpu/backend.h"
#include "tensors/tensor.h"
#include "tensors/cpu/backend.h"
#if MKL_FOUND
#include <mkl.h>

View File

@ -59,10 +59,17 @@ namespace cpu {
}
static inline Ptr<Device> DispatchDevice(DeviceId deviceId, size_t alignment = 256) {
#ifdef CUDA_FOUND
if(deviceId.type == DeviceType::gpu)
return New<gpu::Device>(deviceId, alignment);
else
return New<cpu::Device>(deviceId, alignment);
#else
if(deviceId.type == DeviceType::gpu)
ABORT("CUDA support not compiled into marian");
else
return New<cpu::Device>(deviceId, alignment);
#endif
}
}

View File

@ -1,5 +1,6 @@
#pragma once
#ifdef CUDA_FOUND
#define DISPATCH1(Function, Arg1) \
namespace gpu { \
@ -127,3 +128,78 @@
cpu::Function(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9); \
}
#else
#define DISPATCH1(Function, Arg1) \
namespace cpu { \
void Function(Arg1); \
} \
void Function(Arg1 arg1) { \
cpu::Function(arg1); \
}
#define DISPATCH2(Function, Arg1, Arg2) \
namespace cpu { \
void Function(Arg1, Arg2); \
} \
static inline void Function(Arg1 arg1, Arg2 arg2) { \
cpu::Function(arg1, arg2); \
}
#define DISPATCH3(Function, Arg1, Arg2, Arg3) \
namespace cpu { \
void Function(Arg1, Arg2, Arg3); \
} \
static inline void Function(Arg1 arg1, Arg2 arg2, Arg3 arg3) { \
cpu::Function(arg1, arg2, arg3); \
}
#define DISPATCH4(Function, Arg1, Arg2, Arg3, Arg4) \
namespace cpu { \
void Function(Arg1, Arg2, Arg3, Arg4); \
} \
static inline void Function(Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4) { \
cpu::Function(arg1, arg2, arg3, arg4); \
}
#define DISPATCH5(Function, Arg1, Arg2, Arg3, Arg4, Arg5) \
namespace cpu { \
void Function(Arg1, Arg2, Arg3, Arg4, Arg5); \
} \
static inline void Function(Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5) { \
cpu::Function(arg1, arg2, arg3, arg4, arg5); \
}
#define DISPATCH6(Function, Arg1, Arg2, Arg3, Arg4, Arg5, Arg6) \
namespace cpu { \
void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6); \
} \
static inline void Function(Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5, Arg6 arg6) { \
cpu::Function(arg1, arg2, arg3, arg4, arg5, arg6); \
}
#define DISPATCH7(Function, Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7) \
namespace cpu { \
void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7); \
} \
static inline void Function(Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5, Arg6 arg6, Arg7 arg7) { \
cpu::Function(arg1, arg2, arg3, arg4, arg5, arg6, arg7); \
}
#define DISPATCH8(Function, Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8) \
namespace cpu { \
void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8); \
} \
static inline void Function(Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5, Arg6 arg6, Arg7 arg7, Arg8 arg8) { \
cpu::Function(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8); \
}
#define DISPATCH9(Function, Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8, Arg9) \
namespace cpu { \
void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8, Arg9); \
} \
static inline void Function(Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5, Arg6 arg6, Arg7 arg7, Arg8 arg8, Arg9 arg9) { \
cpu::Function(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9); \
}
#endif

View File

@ -11,7 +11,10 @@
#include "tensors/backend.h"
#include <algorithm>
#ifdef CUDA_FOUND
#include "tensors/gpu/algorithm.h"
#endif
namespace marian {
@ -53,32 +56,40 @@ public:
float get(size_t i) {
float temp;
#ifdef CUDA_FOUND
if(backend_->getDevice().type == DeviceType::gpu)
gpu::copy(backend_, data() + i, data() + i + 1, &temp);
else
#endif
std::copy(data() + i, data() + i + 1, &temp);
return temp;
}
void set(size_t i, float value) {
#ifdef CUDA_FOUND
if(backend_->getDevice().type == DeviceType::gpu)
gpu::copy(backend_, &value, &value + 1, data() + i);
else
#endif
std::copy(&value, &value + 1, data() + i);
}
void get(std::vector<float> &v) {
v.resize(size());
#ifdef CUDA_FOUND
if(backend_->getDevice().type == DeviceType::gpu)
gpu::copy(backend_, data(), data() + size(), v.data());
else
#endif
std::copy(data(), data() + size(), v.data());
}
void set(const float* begin, const float* end) {
#ifdef CUDA_FOUND
if(backend_->getDevice().type == DeviceType::gpu)
gpu::copy(backend_, begin, end, data());
else
#endif
std::copy(begin, end, data());
}
@ -87,27 +98,32 @@ public:
}
void set(float value) {
#ifdef CUDA_FOUND
if(backend_->getDevice().type == DeviceType::gpu)
gpu::fill(backend_, data(), data() + size(), value);
else
#endif
std::fill(data(), data() + size(), value);
}
void setSparse(const std::vector<size_t> &k,
const std::vector<float> &v) {
if(backend_->getDevice().type == DeviceType::gpu) {
#ifdef CUDA_FOUND
if(backend_->getDevice().type == DeviceType::gpu)
gpu::setSparse(backend_, k, v, data());
} else {
else
#endif
for(int i = 0; i < k.size(); ++i)
data()[k[i]] = v[i];
}
}
void copyFrom(Tensor in) {
#ifdef CUDA_FOUND
if(in->getBackend()->getDevice().type == DeviceType::gpu ||
backend_->getDevice().type == DeviceType::gpu)
gpu::copy(backend_, in->data(), in->data() + in->size(), data());
else
#endif
std::copy(in->data(), in->data() + in->size(), data());
}

View File

@ -10,9 +10,11 @@
#include "functional/tmp.h"
#include "functional/tensor.h"
#ifdef CUDA_FOUND
#include "tensors/gpu/element.h"
#include "tensors/gpu/add.h"
#include "tensors/gpu/prod.h"
#endif
#include "tensors/cpu/element.h"
#include "tensors/cpu/add.h"
@ -21,9 +23,11 @@ namespace marian {
template <class Functor, class ...Tensors>
void Element(Functor functor, marian::Tensor out, Tensors ...tensors) {
#ifdef CUDA_FOUND
if(out->getBackend()->getDevice().type == DeviceType::gpu)
gpu::Element(functor, out, tensors...);
else
#endif
cpu::Element(functor, out, tensors...);
}
@ -32,9 +36,11 @@ namespace marian {
float scale,
marian::Tensor out,
Tensors... tensors) {
#ifdef CUDA_FOUND
if(out->getBackend()->getDevice().type == DeviceType::gpu)
gpu::Add(functor, scale, out, tensors...);
else
#endif
cpu::Add(functor, scale, out, tensors...);
}
@ -81,21 +87,23 @@ namespace marian {
DISPATCH3(Concatenate, marian::Tensor, const std::vector<marian::Tensor>&, int)
#ifdef CUDA_FOUND
namespace gpu {
void Deconcatenate(std::vector<marian::Tensor>& outputs, const marian::Tensor in, int ax);
}
#endif
namespace cpu {
void Deconcatenate(std::vector<marian::Tensor>& outputs, const marian::Tensor in, int ax);
}
static inline void Deconcatenate(std::vector<marian::Tensor>& outputs, const marian::Tensor in, int ax) {
if(in->getBackend()->getDevice().type == DeviceType::gpu) {
#ifdef CUDA_FOUND
if(in->getBackend()->getDevice().type == DeviceType::gpu)
gpu::Deconcatenate(outputs, in, ax);
}
else {
else
#endif
cpu::Deconcatenate(outputs, in, ax);
}
}
DISPATCH5(LayerNormalization, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, float)
@ -116,11 +124,13 @@ namespace marian {
DISPATCH2(LSTMCellForward, marian::Tensor, std::vector<marian::Tensor>)
DISPATCH2(LSTMOutputForward, marian::Tensor, std::vector<marian::Tensor>);
#ifdef CUDA_FOUND
namespace gpu {
void LSTMCellBackward(std::vector<marian::Tensor> outputs,
std::vector<marian::Tensor> inputs,
marian::Tensor adj);
}
#endif
namespace cpu {
void LSTMCellBackward(std::vector<marian::Tensor> outputs,
@ -131,19 +141,21 @@ namespace marian {
static inline void LSTMCellBackward(std::vector<marian::Tensor> outputs,
std::vector<marian::Tensor> inputs,
marian::Tensor adj) {
if(adj->getBackend()->getDevice().type == DeviceType::gpu) {
#ifdef CUDA_FOUND
if(adj->getBackend()->getDevice().type == DeviceType::gpu)
gpu::LSTMCellBackward(outputs, inputs, adj);
}
else {
else
#endif
cpu::LSTMCellBackward(outputs, inputs, adj);
}
}
#ifdef CUDA_FOUND
namespace gpu {
void LSTMOutputBackward(std::vector<marian::Tensor> outputs,
std::vector<marian::Tensor> inputs,
marian::Tensor adj);
}
#endif
namespace cpu {
void LSTMOutputBackward(std::vector<marian::Tensor> outputs,
@ -154,22 +166,24 @@ namespace marian {
static inline void LSTMOutputBackward(std::vector<marian::Tensor> outputs,
std::vector<marian::Tensor> inputs,
marian::Tensor adj) {
if(adj->getBackend()->getDevice().type == DeviceType::gpu) {
#ifdef CUDA_FOUND
if(adj->getBackend()->getDevice().type == DeviceType::gpu)
gpu::LSTMOutputBackward(outputs, inputs, adj);
}
else {
else
#endif
cpu::LSTMOutputBackward(outputs, inputs, adj);
}
}
DISPATCH3(GRUFastForward, marian::Tensor, std::vector<marian::Tensor>, bool)
#ifdef CUDA_FOUND
namespace gpu {
void GRUFastBackward(std::vector<marian::Tensor> outputs,
std::vector<marian::Tensor> inputs,
marian::Tensor adj,
bool final);
}
#endif
namespace cpu {
void GRUFastBackward(std::vector<marian::Tensor> outputs,
@ -182,35 +196,37 @@ namespace marian {
std::vector<marian::Tensor> inputs,
marian::Tensor adj,
bool final = false) {
if(adj->getBackend()->getDevice().type == DeviceType::gpu) {
#ifdef CUDA_FOUND
if(adj->getBackend()->getDevice().type == DeviceType::gpu)
gpu::GRUFastBackward(outputs, inputs, adj, final);
}
else {
else
#endif
cpu::GRUFastBackward(outputs, inputs, adj, final);
}
}
DISPATCH4(Att, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor)
DISPATCH7(AttBack, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor)
#ifdef CUDA_FOUND
namespace gpu {
float L2Norm(marian::Tensor in);
}
#endif
namespace cpu {
float L2Norm(marian::Tensor in);
}
static inline float L2Norm(marian::Tensor in) {
if(in->getBackend()->getDevice().type == DeviceType::gpu) {
#ifdef CUDA_FOUND
if(in->getBackend()->getDevice().type == DeviceType::gpu)
return gpu::L2Norm(in);
}
else {
else
#endif
return cpu::L2Norm(in);
}
}
DISPATCH5(PoolingWithMaskingForward, marian::Tensor, marian::Tensor, marian::Tensor, int, bool)
DISPATCH6(PoolingWithMaskingBackward, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, int, bool)
}

View File

@ -9,7 +9,11 @@ set(UNIT_TESTS
foreach(test ${UNIT_TESTS})
add_executable("run_${test}" run_tests.cpp "${test}.cpp")
target_link_libraries("run_${test}" marian ${EXT_LIBS} Catch)
cuda_add_cublas_to_target("run_${test}")
if(CUDA_FOUND)
target_link_libraries("run_${test}" marian marian_cuda ${EXT_LIBS} Catch)
cuda_add_cublas_to_target("run_${test}")
endif(CUDA_FOUND)
add_test(NAME ${test} COMMAND "run_${test}")
endforeach(test)
@ -18,35 +22,23 @@ endforeach(test)
# Testing apps
add_executable(logger_test logger_test.cpp)
add_executable(dropout_test dropout_test.cpp)
#cuda_add_executable(bn_test bn_test.cu)
if(CUDA_FOUND)
cuda_add_executable(pooling_test pooling_test.cu)
#cuda_add_executable(marian_test marian_test.cu)
cuda_add_executable(tensor_test tensor_test.cu)
endif(CUDA_FOUND)
add_executable(sqlite_test sqlite_test.cpp)
foreach(exec
logger_test
dropout_test
pooling_test
#marian_test
#bn_test
tensor_test
sqlite_test
)
)
target_link_libraries(${exec} marian ${EXT_LIBS})
cuda_add_cublas_to_target(${exec})
if(CUDA_FOUND)
target_link_libraries(${exec} marian marian_cuda ${EXT_LIBS} Catch)
cuda_add_cublas_to_target(${exec})
endif(CUDA_FOUND)
set_target_properties(${exec} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}")
endforeach(exec)
# if(CUDNN_FOUND)
# cuda_add_executable(conv_test conv_test.cu)
# foreach(exec
# conv_test
# )
# target_link_libraries(${exec} marian ${EXT_LIBS})
# cuda_add_cublas_to_target(${exec})
# set_target_properties(${exec} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}")
# endforeach(exec)
# endif(CUDNN_FOUND)

View File

@ -105,9 +105,11 @@ public:
// @TODO: unify this
Ptr<NthElement> nth;
#ifdef CUDA_FOUND
if(graph->getDevice().type == DeviceType::gpu)
nth = New<NthElementGPU>(localBeamSize, dimBatch, graph->getDevice());
else
#endif
nth = New<NthElementCPU>(localBeamSize, dimBatch);
Beams beams(dimBatch);

View File

@ -38,18 +38,22 @@ void suppressUnk(Expr probs) {
if(probs->val()->getBackend()->getDevice().type == DeviceType::cpu) {
cpu::suppressUnk(probs);
}
#ifdef CUDA_FOUND
else {
gpu::suppressUnk(probs);
}
#endif
}
void suppressWord(Expr probs, Word id) {
if(probs->val()->getBackend()->getDevice().type == DeviceType::cpu) {
cpu::suppressWord(probs, id);
}
#ifdef CUDA_FOUND
else {
gpu::suppressWord(probs, id);
}
#endif
}
}