more coda separation

2024-09-17 09:47:34 +03:00 · 2018-03-02 18:06:26 -08:00 · 2018-03-02 18:06:26 -08:00 · 8ec6eef9d5
commit 8ec6eef9d5
parent 42293372ca
14 changed files with 214 additions and 75 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -10,6 +10,7 @@ option(COMPILE_EXAMPLES "Compile examples" OFF)
 option(COMPILE_TESTS "Compile tests" OFF)
 option(COMPILE_SERVER "Compile marian-server" ON)
 option(COMPILE_CPU "Compile CPU version" ON)
+option(COMPILE_CUDA "Compile GPU version" ON)
 option(USE_CUDNN "Use CUDNN library" OFF)
 option(USE_MPI "Use MPI library" OFF)

@ -20,14 +21,17 @@ include(GetVersionFromFile)
 message(STATUS "Project name: ${PROJECT_NAME}")
 message(STATUS "Project version: ${PROJECT_VERSION_STRING_FULL}")

-
 # Set compilation flags
-set(CMAKE_CXX_FLAGS_RELEASE " -std=c++11 -O3 -Ofast -m64  -march=native -Wl,--no-as-needed -funroll-loops -ffinite-math-only -fPIC -Wno-unused-result -Wno-deprecated -Wno-deprecated-gpu-targets")
-set(CMAKE_CXX_FLAGS_DEBUG " -std=c++11 -g -O0 -fPIC -Wno-unused-result -Wno-deprecated -Wno-deprecated-gpu-targets")
+set(CMAKE_CXX_FLAGS_RELEASE " -std=c++11 -O3 -Ofast -m64 -pthread -march=native -Wl,--no-as-needed -funroll-loops -ffinite-math-only -fPIC -Wno-unused-result -Wno-deprecated -Wno-deprecated-gpu-targets")
+set(CMAKE_CXX_FLAGS_DEBUG " -std=c++11 -g -O0 -pthread -fPIC -Wno-unused-result -Wno-deprecated -Wno-deprecated-gpu-targets")
 set(CMAKE_CXX_FLAGS_PROFILE "${CMAKE_CXX_FLAGS_RELEASE} -g -pg")
 set(CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS_RELEASE})

 # Find packages
+
+set(EXT_LIBS ${EXT_LIBS} ${CMAKE_DL_LIBS})
+
+if(COMPILE_CUDA)
 find_package(CUDA "8.0" REQUIRED)
 if(CUDA_FOUND)
  set(EXT_LIBS ${EXT_LIBS} ${CUDA_curand_LIBRARY} ${CUDA_cusparse_LIBRARY})
@ -41,8 +45,15 @@ if(CUDA_FOUND)
      LIST(APPEND CUDA_NVCC_FLAGS -DCUDNN; )
    endif(CUDNN_FOUND)
  endif(USE_CUDNN)
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DCUDA_FOUND")
+else(CUDA_FOUND)
+  message(WARNING "Cannot find CUDA compiling CPU version only")
 endif(CUDA_FOUND)

+else(COMPILE_CUDA)
+  message(WARNING "COMPILE_CUDA=off : Building only CPU version")
+endif(COMPILE_CUDA)
+
 if (CMAKE_BUILD_TYPE STREQUAL "Debug")
  list(APPEND CUDA_NVCC_FLAGS -std=c++11; --default-stream per-thread; -O0; -g; -Xcompiler '-fPIC'; -arch=sm_30; -gencode=arch=compute_30,code=sm_30; -gencode=arch=compute_50,code=sm_50; -gencode=arch=compute_52,code=sm_52; -gencode=arch=compute_60,code=sm_60; -gencode=arch=compute_61,code=sm_61; -gencode=arch=compute_61,code=compute_61 ;)
 else(CMAKE_BUILD_TYPE STREQUAL "Debug")
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@ -4,7 +4,7 @@ include_directories(.)
 include_directories(3rd_party)
 include_directories(3rd_party/SQLiteCpp/include)

-cuda_add_library(marian
+add_library(marian STATIC
  common/utils.cpp
  common/logging.cpp
  common/config.cpp
@ -26,14 +26,6 @@ cuda_add_library(marian
  tensors/cpu/prod.cpp
  tensors/cpu/tensor_operators.cpp

-  tensors/gpu/device.cu
-  tensors/gpu/algorithm.cu
-  tensors/gpu/dropout.cu
-  tensors/gpu/prod.cu
-  tensors/gpu/element.cu
-  tensors/gpu/add.cu
-  tensors/gpu/tensor_operators.cu
-  tensors/gpu/cudnn_wrappers.cu

  graph/expression_graph.cpp
  graph/expression_operators.cpp
@ -53,26 +45,37 @@ cuda_add_library(marian

  translator/history.cpp
  translator/output_collector.cpp
-  translator/nth_element.cu
  translator/nth_element.cpp
-  translator/helpers.cu
  translator/helpers.cpp
  translator/scorers.cpp

-  training/dropper.cu
  training/graph_group_async.cpp
  training/graph_group_async_drop.cpp
  training/graph_group_sync.cpp
  training/graph_group_singleton.cpp
  training/graph_group_multinode.cpp
-  training/sparse_tensor.cu
  training/validator.cpp

  rescorer/score_collector.cpp
  $<TARGET_OBJECTS:libyaml-cpp>
-  $<TARGET_OBJECTS:SQLiteCpp>
-  STATIC
-)
+  $<TARGET_OBJECTS:SQLiteCpp>)
+
+if(CUDA_FOUND)
+cuda_add_library(marian_cuda
+  tensors/gpu/device.cu
+  tensors/gpu/algorithm.cu
+  tensors/gpu/dropout.cu
+  tensors/gpu/prod.cu
+  tensors/gpu/element.cu
+  tensors/gpu/add.cu
+  tensors/gpu/tensor_operators.cu
+  tensors/gpu/cudnn_wrappers.cu
+  translator/nth_element.cu
+  translator/helpers.cu
+  training/dropper.cu
+  training/sparse_tensor.cu
+  STATIC)
+endif(CUDA_FOUND)

 set_target_properties(marian PROPERTIES LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}")
 set_target_properties(marian PROPERTIES ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}")
@ -99,7 +102,10 @@ endif(COMPILE_SERVER)

 foreach(exec ${EXECUTABLES})
  target_link_libraries(${exec} marian ${EXT_LIBS})
-  cuda_add_cublas_to_target(${exec})
+  if(CUDA_FOUND)
+    target_link_libraries(${exec} marian marian_cuda ${EXT_LIBS} ${CMAKE_THREAD_LIBS_INIT})
+    cuda_add_cublas_to_target(${exec})
+  endif(CUDA_FOUND)
  set_target_properties(${exec} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}")
 endforeach(exec)

--- a/src/examples/CMakeLists.txt
+++ b/src/examples/CMakeLists.txt
@ -3,6 +3,9 @@ add_executable(mnist_example mnist/mnist_ffnn.cpp)

 foreach(exec iris_example mnist_example)
  target_link_libraries(${exec} marian ${EXT_LIBS})
-  cuda_add_cublas_to_target(${exec})
+  if(CUDA_FOUND)
+    target_link_libraries(${exec} marian marian_cuda ${EXT_LIBS})
+    cuda_add_cublas_to_target(${exec})
+  endif(CUDA_FOUND)
  set_target_properties(${exec} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}")
 endforeach(exec)
--- a/src/graph/expression_operators.cpp
+++ b/src/graph/expression_operators.cpp
@ -1,5 +1,4 @@
 #include "graph/expression_operators.h"
-//#include "kernels/sparse.h"
 #include "layers/constructors.h"

 #include "graph/node_operators.h"
@ -333,6 +332,8 @@ Expr shift(Expr a, Shape shift) {
 //  return Expression<LexicalProbNodeOp>(logits, att, eps, lf);
 //}

+#ifdef CUDA_FOUND
+
 Expr avg_pooling(
    Expr x,
    int height,
@ -410,4 +411,6 @@ Expr pooling_with_masking(Expr x, Expr mask, int width, bool isEven) {
  return Expression<PoolingWithMaskingOp>(x, mask, width, isEven);
 }

+#endif
+
 }
--- a/src/graph/node_operators_unary.h
+++ b/src/graph/node_operators_unary.h
@ -1,15 +1,13 @@
 #pragma once

 #include "tensors/tensor.h"
-#include "tensors/gpu/backend.h"
+#include "tensors/backend.h"

 #include "graph/node.h"
-//#include "kernels/sparse.h"
 #include "tensors/tensor_operators.h"
 #include "functional/functional.h"

-#include "tensors/gpu/cudnn_wrappers.h"
-
+//#include "tensors/gpu/cudnn_wrappers.h"

 namespace marian {

--- a/src/tensors/backend.cpp
+++ b/src/tensors/backend.cpp
@ -1,14 +1,19 @@
 #include "tensors/backend.h"

+#ifdef CUDA_FOUND
 #include "tensors/gpu/backend.h"
+#endif
+
 #include "tensors/cpu/backend.h"

 namespace marian {

 Ptr<Backend> BackendByDevice(DeviceId deviceId, size_t seed) {
+#ifdef CUDA_FOUND
  if(deviceId.type == DeviceType::gpu)
    return New<gpu::Backend>(deviceId, seed);
  else
+#endif
    return New<cpu::Backend>(deviceId, seed);
 }

--- a/src/tensors/cpu/prod.cpp
+++ b/src/tensors/cpu/prod.cpp
@ -3,8 +3,8 @@
 *   SPDX-License-Identifier: MIT                                                                                                                                                            
 */  

-#include "tensors/gpu/prod.h"
-#include "tensors/gpu/backend.h"
+#include "tensors/tensor.h"
+#include "tensors/cpu/backend.h"

 #if MKL_FOUND
 #include <mkl.h>
--- a/src/tensors/device.h
+++ b/src/tensors/device.h
@ -59,10 +59,17 @@ namespace cpu {
 }

 static inline Ptr<Device> DispatchDevice(DeviceId deviceId, size_t alignment = 256) {
+#ifdef CUDA_FOUND
  if(deviceId.type == DeviceType::gpu)
    return New<gpu::Device>(deviceId, alignment);
  else
    return New<cpu::Device>(deviceId, alignment);
+#else
+  if(deviceId.type == DeviceType::gpu)
+    ABORT("CUDA support not compiled into marian");
+  else
+    return New<cpu::Device>(deviceId, alignment);
+#endif
 }

 }
--- a/src/tensors/dispatch.h
+++ b/src/tensors/dispatch.h
@ -1,5 +1,6 @@
 #pragma once

+#ifdef CUDA_FOUND

 #define DISPATCH1(Function, Arg1) \
  namespace gpu { \
@ -127,3 +128,78 @@
      cpu::Function(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9); \
  }

+#else
+
+#define DISPATCH1(Function, Arg1) \
+  namespace cpu { \
+    void Function(Arg1); \
+  } \
+  void Function(Arg1 arg1) { \
+      cpu::Function(arg1); \
+  }
+
+#define DISPATCH2(Function, Arg1, Arg2) \
+  namespace cpu { \
+    void Function(Arg1, Arg2); \
+  } \
+  static inline void Function(Arg1 arg1, Arg2 arg2) { \
+      cpu::Function(arg1, arg2); \
+  }
+
+#define DISPATCH3(Function, Arg1, Arg2, Arg3) \
+  namespace cpu { \
+    void Function(Arg1, Arg2, Arg3); \
+  } \
+  static inline void Function(Arg1 arg1, Arg2 arg2, Arg3 arg3) { \
+      cpu::Function(arg1, arg2, arg3); \
+  }
+
+#define DISPATCH4(Function, Arg1, Arg2, Arg3, Arg4) \
+  namespace cpu { \
+    void Function(Arg1, Arg2, Arg3, Arg4); \
+  } \
+  static inline void Function(Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4) { \
+      cpu::Function(arg1, arg2, arg3, arg4); \
+  }
+
+#define DISPATCH5(Function, Arg1, Arg2, Arg3, Arg4, Arg5) \
+  namespace cpu { \
+    void Function(Arg1, Arg2, Arg3, Arg4, Arg5); \
+  } \
+  static inline void Function(Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5) { \
+      cpu::Function(arg1, arg2, arg3, arg4, arg5); \
+  }
+
+#define DISPATCH6(Function, Arg1, Arg2, Arg3, Arg4, Arg5, Arg6) \
+  namespace cpu { \
+    void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6); \
+  } \
+  static inline void Function(Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5, Arg6 arg6) { \
+      cpu::Function(arg1, arg2, arg3, arg4, arg5, arg6); \
+  }
+
+#define DISPATCH7(Function, Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7) \
+  namespace cpu { \
+    void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7); \
+  } \
+  static inline void Function(Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5, Arg6 arg6, Arg7 arg7) { \
+      cpu::Function(arg1, arg2, arg3, arg4, arg5, arg6, arg7); \
+  }
+
+#define DISPATCH8(Function, Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8) \
+  namespace cpu { \
+    void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8); \
+  } \
+  static inline void Function(Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5, Arg6 arg6, Arg7 arg7, Arg8 arg8) { \
+      cpu::Function(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8); \
+  }
+
+#define DISPATCH9(Function, Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8, Arg9) \
+  namespace cpu { \
+    void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8, Arg9); \
+  } \
+  static inline void Function(Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5, Arg6 arg6, Arg7 arg7, Arg8 arg8, Arg9 arg9) { \
+      cpu::Function(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9); \
+  }
+
+#endif
--- a/src/tensors/tensor.h
+++ b/src/tensors/tensor.h
@ -11,7 +11,10 @@
 #include "tensors/backend.h"

 #include <algorithm>
+
+#ifdef CUDA_FOUND
 #include "tensors/gpu/algorithm.h"
+#endif

 namespace marian {

@ -53,32 +56,40 @@ public:

  float get(size_t i) {
    float temp;
+#ifdef CUDA_FOUND
    if(backend_->getDevice().type == DeviceType::gpu)
      gpu::copy(backend_, data() + i, data() + i + 1, &temp);
    else
+#endif
      std::copy(data() + i, data() + i + 1, &temp);
    return temp;
  }

  void set(size_t i, float value) {
+#ifdef CUDA_FOUND
    if(backend_->getDevice().type == DeviceType::gpu)
      gpu::copy(backend_, &value, &value + 1, data() + i);
    else
+#endif
      std::copy(&value, &value + 1, data() + i);
  }

  void get(std::vector<float> &v) {
    v.resize(size());
+#ifdef CUDA_FOUND
    if(backend_->getDevice().type == DeviceType::gpu)
      gpu::copy(backend_, data(), data() + size(), v.data());
    else
+#endif
      std::copy(data(), data() + size(), v.data());
  }

  void set(const float* begin, const float* end) {
+#ifdef CUDA_FOUND
    if(backend_->getDevice().type == DeviceType::gpu)
      gpu::copy(backend_, begin, end, data());
    else
+#endif
      std::copy(begin, end, data());
  }

@ -87,27 +98,32 @@ public:
  }

  void set(float value) {
+#ifdef CUDA_FOUND
    if(backend_->getDevice().type == DeviceType::gpu)
      gpu::fill(backend_, data(), data() + size(), value);
    else
+#endif
      std::fill(data(), data() + size(), value);
  }

  void setSparse(const std::vector<size_t> &k,
                 const std::vector<float> &v) {
-    if(backend_->getDevice().type == DeviceType::gpu) {
+#ifdef CUDA_FOUND
+    if(backend_->getDevice().type == DeviceType::gpu)
      gpu::setSparse(backend_, k, v, data());
-    } else {
+    else
+#endif
      for(int i = 0; i < k.size(); ++i)
        data()[k[i]] = v[i];
-    }
  }

  void copyFrom(Tensor in) {
+#ifdef CUDA_FOUND
    if(in->getBackend()->getDevice().type == DeviceType::gpu ||
       backend_->getDevice().type == DeviceType::gpu)
      gpu::copy(backend_, in->data(), in->data() + in->size(), data());
    else
+#endif
      std::copy(in->data(), in->data() + in->size(), data());
  }

--- a/src/tensors/tensor_operators.h
+++ b/src/tensors/tensor_operators.h
@ -10,9 +10,11 @@
 #include "functional/tmp.h"
 #include "functional/tensor.h"

+#ifdef CUDA_FOUND
 #include "tensors/gpu/element.h"
 #include "tensors/gpu/add.h"
 #include "tensors/gpu/prod.h"
+#endif

 #include "tensors/cpu/element.h"
 #include "tensors/cpu/add.h"
@ -21,9 +23,11 @@ namespace marian {

  template <class Functor, class ...Tensors>
  void Element(Functor functor, marian::Tensor out, Tensors ...tensors) {
+#ifdef CUDA_FOUND
    if(out->getBackend()->getDevice().type == DeviceType::gpu)
      gpu::Element(functor, out, tensors...);
    else
+#endif
      cpu::Element(functor, out, tensors...);
  }

@ -32,9 +36,11 @@ namespace marian {
           float scale,
           marian::Tensor out,
           Tensors... tensors) {
+#ifdef CUDA_FOUND
    if(out->getBackend()->getDevice().type == DeviceType::gpu)
      gpu::Add(functor, scale, out, tensors...);
    else
+#endif
      cpu::Add(functor, scale, out, tensors...);
  }

@ -81,21 +87,23 @@ namespace marian {

  DISPATCH3(Concatenate, marian::Tensor, const std::vector<marian::Tensor>&, int)

+#ifdef CUDA_FOUND
  namespace gpu {
    void Deconcatenate(std::vector<marian::Tensor>& outputs, const marian::Tensor in, int ax);
  }
+#endif

  namespace cpu {
    void Deconcatenate(std::vector<marian::Tensor>& outputs, const marian::Tensor in, int ax);
  }

  static inline void Deconcatenate(std::vector<marian::Tensor>& outputs, const marian::Tensor in, int ax) {
-    if(in->getBackend()->getDevice().type == DeviceType::gpu) {
+#ifdef CUDA_FOUND
+    if(in->getBackend()->getDevice().type == DeviceType::gpu)
      gpu::Deconcatenate(outputs, in, ax);
-    }
-    else {
+    else
+#endif
      cpu::Deconcatenate(outputs, in, ax);
-    }
  }

  DISPATCH5(LayerNormalization, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, float)
@ -116,11 +124,13 @@ namespace marian {
  DISPATCH2(LSTMCellForward, marian::Tensor, std::vector<marian::Tensor>)
  DISPATCH2(LSTMOutputForward, marian::Tensor, std::vector<marian::Tensor>);

+#ifdef CUDA_FOUND
  namespace gpu {
    void LSTMCellBackward(std::vector<marian::Tensor> outputs,
                          std::vector<marian::Tensor> inputs,
                          marian::Tensor adj);
  }
+#endif

  namespace cpu {
    void LSTMCellBackward(std::vector<marian::Tensor> outputs,
@ -131,19 +141,21 @@ namespace marian {
  static inline void LSTMCellBackward(std::vector<marian::Tensor> outputs,
                                      std::vector<marian::Tensor> inputs,
                                      marian::Tensor adj) {
-    if(adj->getBackend()->getDevice().type == DeviceType::gpu) {
+#ifdef CUDA_FOUND
+    if(adj->getBackend()->getDevice().type == DeviceType::gpu)
      gpu::LSTMCellBackward(outputs, inputs, adj);
-    }
-    else {
+    else
+#endif
      cpu::LSTMCellBackward(outputs, inputs, adj);
-    }
  }

+#ifdef CUDA_FOUND
  namespace gpu {
    void LSTMOutputBackward(std::vector<marian::Tensor> outputs,
                            std::vector<marian::Tensor> inputs,
                            marian::Tensor adj);
  }
+#endif

  namespace cpu {
    void LSTMOutputBackward(std::vector<marian::Tensor> outputs,
@ -154,22 +166,24 @@ namespace marian {
  static inline void LSTMOutputBackward(std::vector<marian::Tensor> outputs,
                                        std::vector<marian::Tensor> inputs,
                                        marian::Tensor adj) {
-    if(adj->getBackend()->getDevice().type == DeviceType::gpu) {
+#ifdef CUDA_FOUND
+    if(adj->getBackend()->getDevice().type == DeviceType::gpu)
      gpu::LSTMOutputBackward(outputs, inputs, adj);
-    }
-    else {
+    else
+#endif
      cpu::LSTMOutputBackward(outputs, inputs, adj);
-    }
  }

  DISPATCH3(GRUFastForward, marian::Tensor, std::vector<marian::Tensor>, bool)

+#ifdef CUDA_FOUND
  namespace gpu {
    void GRUFastBackward(std::vector<marian::Tensor> outputs,
                         std::vector<marian::Tensor> inputs,
                         marian::Tensor adj,
                         bool final);
  }
+#endif

  namespace cpu {
    void GRUFastBackward(std::vector<marian::Tensor> outputs,
@ -182,35 +196,37 @@ namespace marian {
                                     std::vector<marian::Tensor> inputs,
                                     marian::Tensor adj,
                                     bool final = false) {
-    if(adj->getBackend()->getDevice().type == DeviceType::gpu) {
+#ifdef CUDA_FOUND
+    if(adj->getBackend()->getDevice().type == DeviceType::gpu)
      gpu::GRUFastBackward(outputs, inputs, adj, final);
-    }
-    else {
+    else
+#endif
      cpu::GRUFastBackward(outputs, inputs, adj, final);
-    }
  }

  DISPATCH4(Att, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor)
  DISPATCH7(AttBack, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor)

+#ifdef CUDA_FOUND
  namespace gpu {
    float L2Norm(marian::Tensor in);
  }
+#endif

  namespace cpu {
    float L2Norm(marian::Tensor in);
  }

  static inline float L2Norm(marian::Tensor in) {
-    if(in->getBackend()->getDevice().type == DeviceType::gpu) {
+#ifdef CUDA_FOUND
+    if(in->getBackend()->getDevice().type == DeviceType::gpu)
      return gpu::L2Norm(in);
-    }
-    else {
+    else
+#endif
      return cpu::L2Norm(in);
-    }
  }
-  
+
  DISPATCH5(PoolingWithMaskingForward, marian::Tensor, marian::Tensor, marian::Tensor, int, bool)
  DISPATCH6(PoolingWithMaskingBackward, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, int, bool)
-  
+
 }
--- a/src/tests/CMakeLists.txt
+++ b/src/tests/CMakeLists.txt
@ -9,7 +9,11 @@ set(UNIT_TESTS
 foreach(test ${UNIT_TESTS})
    add_executable("run_${test}" run_tests.cpp "${test}.cpp")
    target_link_libraries("run_${test}" marian ${EXT_LIBS} Catch)
-    cuda_add_cublas_to_target("run_${test}")
+
+    if(CUDA_FOUND)
+      target_link_libraries("run_${test}" marian marian_cuda ${EXT_LIBS} Catch)
+      cuda_add_cublas_to_target("run_${test}")
+    endif(CUDA_FOUND)

    add_test(NAME ${test} COMMAND "run_${test}")
 endforeach(test)
@ -18,35 +22,23 @@ endforeach(test)
 # Testing apps
 add_executable(logger_test logger_test.cpp)
 add_executable(dropout_test dropout_test.cpp)
-#cuda_add_executable(bn_test bn_test.cu)
+
+if(CUDA_FOUND)
 cuda_add_executable(pooling_test pooling_test.cu)
-#cuda_add_executable(marian_test marian_test.cu)
-cuda_add_executable(tensor_test tensor_test.cu)
+endif(CUDA_FOUND)

 add_executable(sqlite_test sqlite_test.cpp)
-
 foreach(exec
        logger_test
        dropout_test
-        pooling_test
-        #marian_test
-        #bn_test
-        tensor_test
        sqlite_test
-)
+    )
    target_link_libraries(${exec} marian ${EXT_LIBS})
-    cuda_add_cublas_to_target(${exec})
+    if(CUDA_FOUND)
+      target_link_libraries(${exec} marian marian_cuda ${EXT_LIBS} Catch)
+      cuda_add_cublas_to_target(${exec})
+    endif(CUDA_FOUND)
+
    set_target_properties(${exec} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}")
 endforeach(exec)

-# if(CUDNN_FOUND)
-    # cuda_add_executable(conv_test conv_test.cu)
-
-    # foreach(exec
-            # conv_test
-    # )
-        # target_link_libraries(${exec} marian ${EXT_LIBS})
-        # cuda_add_cublas_to_target(${exec})
-        # set_target_properties(${exec} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}")
-    # endforeach(exec)
-# endif(CUDNN_FOUND)
--- a/src/translator/beam_search.h
+++ b/src/translator/beam_search.h
@ -105,9 +105,11 @@ public:

    // @TODO: unify this
    Ptr<NthElement> nth;
+#ifdef CUDA_FOUND
    if(graph->getDevice().type == DeviceType::gpu)
      nth = New<NthElementGPU>(localBeamSize, dimBatch, graph->getDevice());
    else
+#endif
      nth = New<NthElementCPU>(localBeamSize, dimBatch);

    Beams beams(dimBatch);
--- a/src/translator/helpers.cpp
+++ b/src/translator/helpers.cpp
@ -38,18 +38,22 @@ void suppressUnk(Expr probs) {
  if(probs->val()->getBackend()->getDevice().type == DeviceType::cpu) {
    cpu::suppressUnk(probs);
  }
+#ifdef CUDA_FOUND
  else {
    gpu::suppressUnk(probs);
  }
+#endif
 }

 void suppressWord(Expr probs, Word id) {
  if(probs->val()->getBackend()->getDevice().type == DeviceType::cpu) {
    cpu::suppressWord(probs, id);
  }
+#ifdef CUDA_FOUND
  else {
    gpu::suppressWord(probs, id);
  }
+#endif
 }

 }