fix compilation for various gcc and cuda combinations

Marcin Junczys-Dowmunt 2019-10-28 13:18:07 -07:00
parent 2dfb302091
commit 0a89e8f168
7 changed files with 24 additions and 15 deletions

View File

@@ -159,6 +159,8 @@ set(EXT_LIBS ${EXT_LIBS} ${CMAKE_DL_LIBS})
if(COMPILE_CUDA)
+LIST(APPEND COMPUTE -arch=sm_35; -gencode=arch=compute_35,code=sm_35; -gencode=arch=compute_50,code=sm_50; -gencode=arch=compute_52,code=sm_52; -gencode=arch=compute_60,code=sm_60; -gencode=arch=compute_61,code=sm_61;)
if(USE_STATIC_LIBS)
# link statically to stdlib libraries
set(CMAKE_EXE_LINKER_FLAGS "-static-libgcc -static-libstdc++")
@@ -179,6 +181,10 @@ if(CUDA_FOUND)
message(WARNING "On some Unix systems CUDA 10.0+ requires CMake 3.12.2+; you use CMake ${CMAKE_VERSION}")
endif()
+if(CUDA_VERSION VERSION_GREATER "8.0")
+LIST(APPEND COMPUTE -gencode=arch=compute_70,code=sm_70; -gencode=arch=compute_70,code=compute_70)
+endif()
if(USE_STATIC_LIBS)
find_library(CUDA_culibos_LIBRARY NAMES culibos PATHS ${CUDA_TOOLKIT_ROOT_DIR}/lib64)
set(EXT_LIBS ${EXT_LIBS} ${CUDA_curand_LIBRARY} ${CUDA_cusparse_LIBRARY} ${CUDA_culibos_LIBRARY} ${CUDA_CUBLAS_LIBRARIES})
@@ -240,9 +246,9 @@ endif(COMPILE_CUDA)
# TODO: make compatible with older CUDA versions
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
-list(APPEND CUDA_NVCC_FLAGS --default-stream per-thread; -O0; -g; --use_fast_math; -arch=sm_35; -gencode=arch=compute_35,code=sm_35; -gencode=arch=compute_50,code=sm_50; -gencode=arch=compute_52,code=sm_52; -gencode=arch=compute_60,code=sm_60; -gencode=arch=compute_61,code=sm_61; -gencode=arch=compute_70,code=sm_70; -gencode=arch=compute_70,code=compute_70 ;)
+list(APPEND CUDA_NVCC_FLAGS --default-stream per-thread; -O0; -g; --use_fast_math; ${COMPUTE})
else(CMAKE_BUILD_TYPE STREQUAL "Debug")
-list(APPEND CUDA_NVCC_FLAGS --default-stream per-thread; -O3; -g; --use_fast_math; -arch=sm_35; -gencode=arch=compute_35,code=sm_35; -gencode=arch=compute_50,code=sm_50; -gencode=arch=compute_52,code=sm_52; -gencode=arch=compute_60,code=sm_60; -gencode=arch=compute_61,code=sm_61; -gencode=arch=compute_70,code=sm_70; -gencode=arch=compute_70,code=compute_70 ;)
+list(APPEND CUDA_NVCC_FLAGS --default-stream per-thread; -O3; -g; --use_fast_math; ${COMPUTE})
endif(CMAKE_BUILD_TYPE STREQUAL "Debug")
if(NOT MSVC)
# @TODO: add warnings here too
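
For context on the consolidation above: each -gencode entry in COMPUTE makes nvcc compile the device code once per target architecture, and __CUDA_ARCH__ tells each pass which compute capability it is building for; sm_70 (Volta) targets only exist from CUDA 9 onward, hence the CUDA_VERSION guard. A minimal sketch of how those targets surface in device code (the kernel name is invented for illustration):

#include <cstdio>

__global__ void archProbe() {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
  printf("compiled by the compute_70 pass (Volta or newer)\n");
#else
  printf("compiled by a pre-Volta pass\n");
#endif
}

int main() {
  archProbe<<<1, 1>>>(); // the driver picks the best matching embedded binary
  cudaDeviceSynchronize();
  return 0;
}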

View File

@@ -15,17 +15,18 @@
#include <functional>
#include <type_traits>
-#ifndef __CUDA_ARCH__
+#ifndef __CUDACC__
#include <immintrin.h>
#endif
#ifdef __CUDACC__ // nvcc is compiling this code
-#if (__CUDA_ARCH__ >= 600 || !defined(__CUDA_ARCH__))
+#include <cuda.h> // required to see CUDA_VERSION
+#if (CUDA_VERSION > 8000 && (__CUDA_ARCH__ >= 600 || !defined(__CUDA_ARCH__)))
#define COMPILE_FP16 1 // we are in GPU code and we know what to do with FP16 code
#else
#define COMPILE_FP16 0 // we are in GPU code, but compute capability is too low to use FP16
#endif
-#else // other compiler, likely host code. Sould be fine with seeing the correct includes with host code
+#else // other compiler, likely host code. Should be fine with seeing the correct includes with host code
#define COMPILE_FP16 1
#endif
@@ -125,7 +126,7 @@ do { \
namespace marian {
-#ifndef __CUDA_ARCH__
+#ifndef __CUDACC__
// @TODO: check what intrinsics are actually available.
struct float32x4 {
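
The tightened guard means COMPILE_FP16 is only 1 when both the toolkit is newer than CUDA 8.0 (CUDA_VERSION > 8000) and the current device pass targets compute capability 6.0 or higher. A hedged sketch of the kind of code the macro is meant to protect (kernel and names are illustrative, not from the repo; COMPILE_FP16 is taken from the header above):

#include <cuda_fp16.h>

#if COMPILE_FP16
// Native half-precision arithmetic; __hmul only compiles on device passes
// with sufficient compute capability, hence the guard.
__global__ void scaleHalf(int n, __half a, __half* x) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    x[i] = __hmul(a, x[i]);
}
#endif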

View File

@@ -1,6 +1,6 @@
#pragma once
-#ifdef __CUDA_ARCH__
+#ifdef __CUDACC__
#include <cuda.h>
#define HOST __host__
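
The hunk cuts off after HOST, but the usual shape of such a header (a sketch; DEVICE and HOST_DEVICE are assumptions, not visible in this diff) shows why __CUDACC__ is the right test: __host__ and __device__ only exist under nvcc, and __CUDA_ARCH__ is unset during nvcc's host pass, so the old #ifdef __CUDA_ARCH__ stripped the qualifiers from half of the compilation:

#ifdef __CUDACC__ // defined for every nvcc pass, host and device alike
#include <cuda.h>
#define HOST __host__
#define DEVICE __device__
#define HOST_DEVICE __host__ __device__
#else // plain gcc/g++: the qualifiers must expand to nothing
#define HOST
#define DEVICE
#define HOST_DEVICE
#endif

// Usable from both CPU and GPU code:
HOST_DEVICE inline float clip(float x, float lo, float hi) {
  return x < lo ? lo : (x > hi ? hi : x);
}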

View File

@@ -191,7 +191,9 @@ struct Ops<double> {
// stay invisible to NVCC as it seems to have problems with intrinsics;
// will still be compiled into the binary by cpu-side gcc/g++
-#ifndef __CUDA_ARCH__
+// __CUDACC__ is defined when compiling with NVCC regardless of device type
+// __CUDA_ARCH__ is defined when compiling device (GPU) code
+#ifndef __CUDACC__
#include "3rd_party/sse_mathfun.h"
@@ -437,9 +439,9 @@ struct Ops<float32x8> {
} // end namespace functional
} // end namespace marian
-#endif // of "#ifndef __CUDA_ARCH__"
+#endif // of "#ifndef __CUDACC__"
-#ifdef __CUDA_ARCH__
+#ifdef __CUDACC__
#if COMPILE_FP16
// only compile with fp16 support for compute_70, i.e. VOLTA 100 and above.
#include <cuda_fp16.h>
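
The distinction driving most of this commit, in runnable form: nvcc preprocesses a .cu file once for the host and once per -gencode target; __CUDACC__ is defined in all of those passes, __CUDA_ARCH__ only in the device ones. A small probe (function names invented for illustration):

#include <cstdio>

__host__ __device__ void whereAmI() {
#ifdef __CUDA_ARCH__
  printf("device pass, __CUDA_ARCH__ = %d\n", __CUDA_ARCH__);
#else
  printf("host pass\n");
#endif
}

__global__ void probe() { whereAmI(); }

int main() {
  whereAmI();        // prints "host pass"
  probe<<<1, 1>>>(); // prints the device line
  cudaDeviceSynchronize();
  return 0;
}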

View File

@@ -19,7 +19,7 @@ inline marian::Shape adapt(const marian::Shape& shape) {
// modify last shape dimension to automatically map to a larger stride. We are moving now by 4 floats
// at once and need to stop earlier. This is a shallow typecast to basically an array of 4 floats.
-#ifndef __CUDA_ARCH__
+#ifndef __CUDACC__
template <>
inline marian::Shape adapt<float32x4>(const marian::Shape& shape) {
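
The specialization reflects a packing trick: a float tensor whose innermost dimension is divisible by 4 can be viewed as an array of float32x4 SIMD lanes with that dimension divided by 4. A standalone sketch of the arithmetic (MiniShape is a simplified stand-in, not marian::Shape):

#include <cassert>

struct MiniShape { int rows, cols; }; // stand-in for marian::Shape

// Viewing float data as 4-wide SIMD lanes shrinks the innermost dimension.
inline MiniShape adaptToFloat32x4(MiniShape s) {
  assert(s.cols % 4 == 0); // caller must guarantee divisibility
  s.cols /= 4;             // we now step by 4 floats at once
  return s;
}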

View File

@@ -81,7 +81,7 @@ void element(const Functor& functor, marian::Tensor out, Tensors... tensors) {
// AVX2 specific intrinsics. Similar for 4 and AVX. TODO: Add AVX512 support.
template <class Functor, class... Tensors>
void elementFloat(const Functor& functor, marian::Tensor out, Tensors... tensors) {
-#ifndef __CUDA_ARCH__
+#ifndef __CUDACC__
std::vector<marian::Tensor> ts({tensors...});
bool div8 = true;
bool div4 = true;
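
The flags initialized above drive the SIMD dispatch: when every tensor's innermost dimension is divisible by 8 the AVX float32x8 path can run, divisibility by 4 allows the SSE float32x4 path, and anything else falls back to scalar floats. A self-contained sketch of that selection (types and names are illustrative):

#include <vector>

enum class SimdPath { Scalar, Sse4, Avx8 };

SimdPath pickPath(const std::vector<int>& innerDims) {
  bool div8 = true, div4 = true;
  for (int d : innerDims) {
    if (d % 8 != 0) div8 = false;
    if (d % 4 != 0) div4 = false;
  }
  return div8 ? SimdPath::Avx8
       : div4 ? SimdPath::Sse4
              : SimdPath::Scalar;
}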

View File

@@ -14,7 +14,7 @@ void tests(DeviceType device, Type floatType = Type::float32) {
auto graph = New<ExpressionGraph>();
graph->setParameterType(floatType);
graph->setDevice({0, device});
-graph->reserveWorkspaceMB(16);
+graph->reserveWorkspaceMB(32);
std::vector<T> values, values2;
@@ -71,8 +71,8 @@ void tests(DeviceType device, Type floatType = Type::float32) {
CHECK(compare(rplus, [](float a, float b) {return a + b;}, true));
CHECK(compare(rminus, [](float a, float b) {return a - b;}, true));
CHECK(compare(rmult, [](float a, float b) {return a * b;}, true));
-CHECK(compare(rdiv, [](float a, float b) {return a / b;}, /*exactMatch=*/false));
-CHECK(compare(rlae, [](float a, float b) {return logf(expf(a) + expf(b));}, /*exactMatch=*/false));
+CHECK(compare(rdiv, [](float a, float b) {return a / b;}, false));
+CHECK(compare(rlae, [](float a, float b) {return logf(expf(a) + expf(b));}, false));
CHECK(compare(rmax, [](float a, float b) {return std::max(a, b);}, true));
CHECK(compare(rmin, [](float a, float b) {return std::min(a, b);}, true));
CHECK(compare(rlt, [](float a, float b) {return a < b;}, true));
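
On the exactMatch flags: division and log-sum-exp do not round identically across CPU, GPU, and reduced-precision paths, so those two checks compare within a tolerance rather than bitwise. A hypothetical helper in the spirit of the test's compare (the real signature is not shown in this diff):

#include <cmath>

// exactMatch = true demands bitwise equality; otherwise allow rounding slack.
inline bool matches(float a, float b, bool exactMatch, float eps = 1e-5f) {
  return exactMatch ? (a == b) : (std::fabs(a - b) < eps);
}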