fix compilation for various gcc and cuda combinations

Marcin Junczys-Dowmunt 2019-10-28 13:18:07 -07:00
parent 2dfb302091
commit 0a89e8f168
7 changed files with 24 additions and 15 deletions

View File

@@ -159,6 +159,8 @@ set(EXT_LIBS ${EXT_LIBS} ${CMAKE_DL_LIBS})
if(COMPILE_CUDA)
+LIST(APPEND COMPUTE -arch=sm_35; -gencode=arch=compute_35,code=sm_35; -gencode=arch=compute_50,code=sm_50; -gencode=arch=compute_52,code=sm_52; -gencode=arch=compute_60,code=sm_60; -gencode=arch=compute_61,code=sm_61;)
if(USE_STATIC_LIBS)
# link statically to stdlib libraries
set(CMAKE_EXE_LINKER_FLAGS "-static-libgcc -static-libstdc++")
@@ -179,6 +181,10 @@ if(CUDA_FOUND)
message(WARNING "On some Unix systems CUDA 10.0+ requires CMake 3.12.2+; you use CMake ${CMAKE_VERSION}")
endif()
+if(CUDA_VERSION VERSION_GREATER "8.0")
+LIST(APPEND COMPUTE -gencode=arch=compute_70,code=sm_70; -gencode=arch=compute_70,code=compute_70)
+endif()
if(USE_STATIC_LIBS)
find_library(CUDA_culibos_LIBRARY NAMES culibos PATHS ${CUDA_TOOLKIT_ROOT_DIR}/lib64)
set(EXT_LIBS ${EXT_LIBS} ${CUDA_curand_LIBRARY} ${CUDA_cusparse_LIBRARY} ${CUDA_culibos_LIBRARY} ${CUDA_CUBLAS_LIBRARIES})
@@ -240,9 +246,9 @@ endif(COMPILE_CUDA)
# TODO: make compatible with older CUDA versions
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
-list(APPEND CUDA_NVCC_FLAGS --default-stream per-thread; -O0; -g; --use_fast_math; -arch=sm_35; -gencode=arch=compute_35,code=sm_35; -gencode=arch=compute_50,code=sm_50; -gencode=arch=compute_52,code=sm_52; -gencode=arch=compute_60,code=sm_60; -gencode=arch=compute_61,code=sm_61; -gencode=arch=compute_70,code=sm_70; -gencode=arch=compute_70,code=compute_70 ;)
+list(APPEND CUDA_NVCC_FLAGS --default-stream per-thread; -O0; -g; --use_fast_math; ${COMPUTE})
else(CMAKE_BUILD_TYPE STREQUAL "Debug")
-list(APPEND CUDA_NVCC_FLAGS --default-stream per-thread; -O3; -g; --use_fast_math; -arch=sm_35; -gencode=arch=compute_35,code=sm_35; -gencode=arch=compute_50,code=sm_50; -gencode=arch=compute_52,code=sm_52; -gencode=arch=compute_60,code=sm_60; -gencode=arch=compute_61,code=sm_61; -gencode=arch=compute_70,code=sm_70; -gencode=arch=compute_70,code=compute_70 ;)
+list(APPEND CUDA_NVCC_FLAGS --default-stream per-thread; -O3; -g; --use_fast_math; ${COMPUTE})
endif(CMAKE_BUILD_TYPE STREQUAL "Debug")
if(NOT MSVC)
# @TODO: add warnings here too
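
For context on the consolidation above: each -gencode entry in COMPUTE makes nvcc compile the device code once per target architecture, and __CUDA_ARCH__ tells each pass which compute capability it is building for; sm_70 (Volta) targets only exist from CUDA 9 onward, hence the CUDA_VERSION guard. A minimal sketch of how those targets surface in device code (the kernel name is invented for illustration):

#include <cstdio>

__global__ void archProbe() {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
  printf("compiled by the compute_70 pass (Volta or newer)\n");
#else
  printf("compiled by a pre-Volta pass\n");
#endif
}

int main() {
  archProbe<<<1, 1>>>(); // the driver picks the best matching embedded binary
  cudaDeviceSynchronize();
  return 0;
}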

View File

@@ -15,17 +15,18 @@
#include <functional>
#include <type_traits>
-#ifndef __CUDA_ARCH__
+#ifndef __CUDACC__
#include <immintrin.h>
#endif
#ifdef __CUDACC__ // nvcc is compiling this code
-#if (__CUDA_ARCH__ >= 600 || !defined(__CUDA_ARCH__))
+#include <cuda.h> // required to see CUDA_VERSION
+#if (CUDA_VERSION > 8000 && (__CUDA_ARCH__ >= 600 || !defined(__CUDA_ARCH__)))
#define COMPILE_FP16 1 // we are in GPU code and we know what to do with FP16 code
#else
#define COMPILE_FP16 0 // we are in GPU code, but compute capability is too low to use FP16
#endif
-#else // other compiler, likely host code. Sould be fine with seeing the correct includes with host code
+#else // other compiler, likely host code. Should be fine with seeing the correct includes with host code
#define COMPILE_FP16 1
#endif
@@ -125,7 +126,7 @@ do { \
namespace marian {
-#ifndef __CUDA_ARCH__
+#ifndef __CUDACC__
// @TODO: check what intrinsics are actually available.
struct float32x4 {
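
The tightened guard means COMPILE_FP16 is only 1 when both the toolkit is newer than CUDA 8.0 (CUDA_VERSION > 8000) and the current device pass targets compute capability 6.0 or higher. A hedged sketch of the kind of code the macro is meant to protect (kernel and names are illustrative, not from the repo; COMPILE_FP16 is taken from the header above):

#include <cuda_fp16.h>

#if COMPILE_FP16
// Native half-precision arithmetic; __hmul only compiles on device passes
// with sufficient compute capability, hence the guard.
__global__ void scaleHalf(int n, __half a, __half* x) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    x[i] = __hmul(a, x[i]);
}
#endif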

View File

@@ -1,6 +1,6 @@
#pragma once
-#ifdef __CUDA_ARCH__
+#ifdef __CUDACC__
#include <cuda.h>
#define HOST __host__
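
The hunk cuts off after HOST, but the usual shape of such a header (a sketch; DEVICE and HOST_DEVICE are assumptions, not visible in this diff) shows why __CUDACC__ is the right test: __host__ and __device__ only exist under nvcc, and __CUDA_ARCH__ is unset during nvcc's host pass, so the old #ifdef __CUDA_ARCH__ stripped the qualifiers from half of the compilation:

#ifdef __CUDACC__ // defined for every nvcc pass, host and device alike
#include <cuda.h>
#define HOST __host__
#define DEVICE __device__
#define HOST_DEVICE __host__ __device__
#else // plain gcc/g++: the qualifiers must expand to nothing
#define HOST
#define DEVICE
#define HOST_DEVICE
#endif

// Usable from both CPU and GPU code:
HOST_DEVICE inline float clip(float x, float lo, float hi) {
  return x < lo ? lo : (x > hi ? hi : x);
}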

View File

@@ -191,7 +191,9 @@ struct Ops<double> {
// stay invisible to NVCC as it seems to have problems with intrinsics;
// will still be compiled into the binary by cpu-side gcc/g++
-#ifndef __CUDA_ARCH__
+// __CUDACC__ is defined when compiling with NVCC regardless of device type
+// __CUDA_ARCH__ is defined when compiling device (GPU) code
+#ifndef __CUDACC__
#include "3rd_party/sse_mathfun.h"
@@ -437,9 +439,9 @@ struct Ops<float32x8> {
} // end namespace functional
} // end namespace marian
-#endif // of "#ifndef __CUDA_ARCH__"
+#endif // of "#ifndef __CUDACC__"
-#ifdef __CUDA_ARCH__
+#ifdef __CUDACC__
#if COMPILE_FP16
// only compile with fp16 support for compute_70, i.e. VOLTA 100 and above.
#include <cuda_fp16.h>
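
The distinction driving most of this commit, in runnable form: nvcc preprocesses a .cu file once for the host and once per -gencode target; __CUDACC__ is defined in all of those passes, __CUDA_ARCH__ only in the device ones. A small probe (function names invented for illustration):

#include <cstdio>

__host__ __device__ void whereAmI() {
#ifdef __CUDA_ARCH__
  printf("device pass, __CUDA_ARCH__ = %d\n", __CUDA_ARCH__);
#else
  printf("host pass\n");
#endif
}

__global__ void probe() { whereAmI(); }

int main() {
  whereAmI();        // prints "host pass"
  probe<<<1, 1>>>(); // prints the device line
  cudaDeviceSynchronize();
  return 0;
}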

View File

@@ -19,7 +19,7 @@ inline marian::Shape adapt(const marian::Shape& shape) {
// modify last shape dimension to automatically map to a larger stride. We are moving now by 4 floats
// at once and need to stop earlier. This is a shallow typecast to basically an array of 4 floats.
-#ifndef __CUDA_ARCH__
+#ifndef __CUDACC__
template <>
inline marian::Shape adapt<float32x4>(const marian::Shape& shape) {
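
The specialization reflects a packing trick: a float tensor whose innermost dimension is divisible by 4 can be viewed as an array of float32x4 SIMD lanes with that dimension divided by 4. A standalone sketch of the arithmetic (MiniShape is a simplified stand-in, not marian::Shape):

#include <cassert>

struct MiniShape { int rows, cols; }; // stand-in for marian::Shape

// Viewing float data as 4-wide SIMD lanes shrinks the innermost dimension.
inline MiniShape adaptToFloat32x4(MiniShape s) {
  assert(s.cols % 4 == 0); // caller must guarantee divisibility
  s.cols /= 4;             // we now step by 4 floats at once
  return s;
}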

View File

@@ -81,7 +81,7 @@ void element(const Functor& functor, marian::Tensor out, Tensors... tensors) {
// AVX2 specific intrinsics. Similar for 4 and AVX. TODO: Add AVX512 support.
template <class Functor, class... Tensors>
void elementFloat(const Functor& functor, marian::Tensor out, Tensors... tensors) {
-#ifndef __CUDA_ARCH__
+#ifndef __CUDACC__
std::vector<marian::Tensor> ts({tensors...});
bool div8 = true;
bool div4 = true;
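
The flags initialized above drive the SIMD dispatch: when every tensor's innermost dimension is divisible by 8 the AVX float32x8 path can run, divisibility by 4 allows the SSE float32x4 path, and anything else falls back to scalar floats. A self-contained sketch of that selection (types and names are illustrative):

#include <vector>

enum class SimdPath { Scalar, Sse4, Avx8 };

SimdPath pickPath(const std::vector<int>& innerDims) {
  bool div8 = true, div4 = true;
  for (int d : innerDims) {
    if (d % 8 != 0) div8 = false;
    if (d % 4 != 0) div4 = false;
  }
  return div8 ? SimdPath::Avx8
       : div4 ? SimdPath::Sse4
              : SimdPath::Scalar;
}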

View File

@@ -14,7 +14,7 @@ void tests(DeviceType device, Type floatType = Type::float32) {
auto graph = New<ExpressionGraph>();
graph->setParameterType(floatType);
graph->setDevice({0, device});
-graph->reserveWorkspaceMB(16);
+graph->reserveWorkspaceMB(32);
std::vector<T> values, values2;
@@ -71,8 +71,8 @@ void tests(DeviceType device, Type floatType = Type::float32) {
CHECK(compare(rplus, [](float a, float b) {return a + b;}, true));
CHECK(compare(rminus, [](float a, float b) {return a - b;}, true));
CHECK(compare(rmult, [](float a, float b) {return a * b;}, true));
-CHECK(compare(rdiv, [](float a, float b) {return a / b;}, /*exactMatch=*/false));
-CHECK(compare(rlae, [](float a, float b) {return logf(expf(a) + expf(b));}, /*exactMatch=*/false));
+CHECK(compare(rdiv, [](float a, float b) {return a / b;}, false));
+CHECK(compare(rlae, [](float a, float b) {return logf(expf(a) + expf(b));}, false));
CHECK(compare(rmax, [](float a, float b) {return std::max(a, b);}, true));
CHECK(compare(rmin, [](float a, float b) {return std::min(a, b);}, true));
CHECK(compare(rlt, [](float a, float b) {return a < b;}, true));
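
On the exactMatch flags: division and log-sum-exp do not round identically across CPU, GPU, and reduced-precision paths, so those two checks compare within a tolerance rather than bitwise. A hypothetical helper in the spirit of the test's compare (the real signature is not shown in this diff):

#include <cmath>

// exactMatch = true demands bitwise equality; otherwise allow rounding slack.
inline bool matches(float a, float b, bool exactMatch, float eps = 1e-5f) {
  return exactMatch ? (a == b) : (std::fabs(a - b) < eps);
}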