Mirror of https://github.com/marian-nmt/marian.git (synced 2024-11-03 20:13:47 +03:00)
fix compilation for various gcc and cuda combinations
This commit is contained in:
parent 2dfb302091
commit 0a89e8f168
@@ -159,6 +159,8 @@ set(EXT_LIBS ${EXT_LIBS} ${CMAKE_DL_LIBS})
 
 if(COMPILE_CUDA)
 
+LIST(APPEND COMPUTE -arch=sm_35; -gencode=arch=compute_35,code=sm_35; -gencode=arch=compute_50,code=sm_50; -gencode=arch=compute_52,code=sm_52; -gencode=arch=compute_60,code=sm_60; -gencode=arch=compute_61,code=sm_61;)
+
 if(USE_STATIC_LIBS)
   # link statically to stdlib libraries
   set(CMAKE_EXE_LINKER_FLAGS "-static-libgcc -static-libstdc++")
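Each -gencode=arch=compute_XX,code=sm_XX pair collected into COMPUTE above triggers one device compilation pass in which __CUDA_ARCH__ equals XX0. A minimal probe sketch, not part of the commit, showing how device code observes those targets:

    // probe.cu: illustrative only; compile with the COMPUTE flags above
    #include <cstdio>

    __global__ void archProbe() {
    #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600
      printf("device pass for sm_60 or newer\n");      // compute_60/61 targets
    #else
      printf("device pass for a pre-Pascal target\n"); // sm_35/50/52 targets
    #endif
    }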
@@ -179,6 +181,10 @@ if(CUDA_FOUND)
   message(WARNING "On some Unix systems CUDA 10.0+ requires CMake 3.12.2+; you use CMake ${CMAKE_VERSION}")
 endif()
 
+if(CUDA_VERSION VERSION_GREATER "8.0")
+  LIST(APPEND COMPUTE -gencode=arch=compute_70,code=sm_70; -gencode=arch=compute_70,code=compute_70)
+endif()
+
 if(USE_STATIC_LIBS)
   find_library(CUDA_culibos_LIBRARY NAMES culibos PATHS ${CUDA_TOOLKIT_ROOT_DIR}/lib64)
   set(EXT_LIBS ${EXT_LIBS} ${CUDA_curand_LIBRARY} ${CUDA_cusparse_LIBRARY} ${CUDA_culibos_LIBRARY} ${CUDA_CUBLAS_LIBRARIES})
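nvcc only accepts the compute_70 (Volta) targets from CUDA 9.0 onward, so the VERSION_GREATER "8.0" guard keeps CUDA 8 builds compiling. The same gate expressed at the source level, as a hedged sketch; CUDA_VERSION comes from <cuda.h> and encodes major*1000 + minor*10, so 9.0 is 9000:

    #include <cuda.h>

    #if CUDA_VERSION >= 9000
    // safe to reference Volta-era (sm_70) features; CUDA 8 never compiles this
    #endif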
@@ -240,9 +246,9 @@ endif(COMPILE_CUDA)
 
 # TODO: make compatible with older CUDA versions
 if(CMAKE_BUILD_TYPE STREQUAL "Debug")
-  list(APPEND CUDA_NVCC_FLAGS --default-stream per-thread; -O0; -g; --use_fast_math; -arch=sm_35; -gencode=arch=compute_35,code=sm_35; -gencode=arch=compute_50,code=sm_50; -gencode=arch=compute_52,code=sm_52; -gencode=arch=compute_60,code=sm_60; -gencode=arch=compute_61,code=sm_61; -gencode=arch=compute_70,code=sm_70; -gencode=arch=compute_70,code=compute_70 ;)
+  list(APPEND CUDA_NVCC_FLAGS --default-stream per-thread; -O0; -g; --use_fast_math; ${COMPUTE})
 else(CMAKE_BUILD_TYPE STREQUAL "Debug")
-  list(APPEND CUDA_NVCC_FLAGS --default-stream per-thread; -O3; -g; --use_fast_math; -arch=sm_35; -gencode=arch=compute_35,code=sm_35; -gencode=arch=compute_50,code=sm_50; -gencode=arch=compute_52,code=sm_52; -gencode=arch=compute_60,code=sm_60; -gencode=arch=compute_61,code=sm_61; -gencode=arch=compute_70,code=sm_70; -gencode=arch=compute_70,code=compute_70 ;)
+  list(APPEND CUDA_NVCC_FLAGS --default-stream per-thread; -O3; -g; --use_fast_math; ${COMPUTE})
 endif(CMAKE_BUILD_TYPE STREQUAL "Debug")
 if(NOT MSVC)
   # @TODO: add warnings here too
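Besides the optimization level, both branches pass --default-stream per-thread, which gives every host thread its own default CUDA stream. A small sketch of the behavior this enables, an illustration rather than repo code:

    // two_threads.cu: under --default-stream per-thread the two launches land
    // in separate per-thread default streams and may overlap; with the legacy
    // single default stream they would serialize.
    #include <cuda_runtime.h>
    #include <thread>

    __global__ void busyKernel() {
      for (volatile int i = 0; i < 1000000; ++i) {} // burn some device time
    }

    int main() {
      auto launch = [] { busyKernel<<<1, 1>>>(); cudaDeviceSynchronize(); };
      std::thread t1(launch), t2(launch);
      t1.join(); t2.join();
      return 0;
    }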
@@ -15,17 +15,18 @@
 #include <functional>
 #include <type_traits>
 
-#ifndef __CUDA_ARCH__
+#ifndef __CUDACC__
 #include <immintrin.h>
 #endif
 
 #ifdef __CUDACC__ // nvcc is compiling this code
-#if (__CUDA_ARCH__ >= 600 || !defined(__CUDA_ARCH__))
+#include <cuda.h> // required to see CUDA_VERSION
+#if (CUDA_VERSION > 8000 && (__CUDA_ARCH__ >= 600 || !defined(__CUDA_ARCH__)))
 #define COMPILE_FP16 1 // we are in GPU code and we know what to do with FP16 code
 #else
 #define COMPILE_FP16 0 // we are in GPU code, but compute capability is too low to use FP16
 #endif
-#else // other compiler, likely host code. Sould be fine with seeing the correct includes with host code
+#else // other compiler, likely host code. Should be fine with seeing the correct includes with host code
 #define COMPILE_FP16 1
 #endif
 
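One subtlety in the new guard: in #if expressions an undefined macro evaluates to 0, so during nvcc's host-side pass (where __CUDA_ARCH__ is undefined) the comparison __CUDA_ARCH__ >= 600 alone would be false. The explicit !defined(__CUDA_ARCH__) keeps COMPILE_FP16 == 1 for the host pass as well as for sm_60+ device passes. A sketch mirroring the guard:

    #include <cuda.h> // defines CUDA_VERSION

    #if (CUDA_VERSION > 8000 && (__CUDA_ARCH__ >= 600 || !defined(__CUDA_ARCH__)))
    // nvcc host pass and sm_60+ device passes agree on COMPILE_FP16 == 1 here
    #endif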
@@ -125,7 +126,7 @@ do { \
 
 namespace marian {
 
-#ifndef __CUDA_ARCH__
+#ifndef __CUDACC__
 
 // @TODO: check what intrinsics are actually available.
 struct float32x4 {
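The struct behind this guard wraps a 128-bit SSE register, which is why it must stay invisible to nvcc. A minimal sketch of the pattern; the name and members are assumptions, since the repo's full definition is elided from this hunk:

    #ifndef __CUDACC__
    #include <immintrin.h>

    struct Float32x4Sketch {              // hypothetical stand-in for float32x4
      __m128 v;                           // four packed single-precision floats
      Float32x4Sketch(__m128 x) : v(x) {}
      operator __m128() const { return v; }
    };
    #endif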
@@ -1,6 +1,6 @@
 #pragma once
 
-#ifdef __CUDA_ARCH__
+#ifdef __CUDACC__
 
 #include <cuda.h>
 #define HOST __host__
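A plain host compiler defines neither __host__ nor ships <cuda.h>, so the usual companion branch, assumed here from common practice since the #else lies outside this hunk, defines the qualifier away:

    #ifdef __CUDACC__
    #include <cuda.h>
    #define HOST __host__
    #else
    #define HOST                          // plain gcc/g++: expands to nothing
    #endif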
@@ -191,7 +191,9 @@ struct Ops<double> {
 
 // stay invisible to NVCC as it seems to have problems with intrinsics;
 // will still be compiled into the binary by cpu-side gcc/g++
-#ifndef __CUDA_ARCH__
+// __CUDACC__ is defined when compiling with NVCC regardless of device type
+// __CUDA_ARCH__ is defined when compiling device (GPU) code
+#ifndef __CUDACC__
 
 #include "3rd_party/sse_mathfun.h"
 
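This pair of comments is the heart of the commit. With nvcc, __CUDACC__ is defined in both the host and device halves of a .cu compilation, while __CUDA_ARCH__ exists only in device passes; a header guarded by #ifndef __CUDA_ARCH__ is therefore still parsed by nvcc's host pass, which mishandles the SSE intrinsics, whereas #ifndef __CUDACC__ hides it from nvcc entirely. An illustrative three-way breakdown:

    #if defined(__CUDACC__) && !defined(__CUDA_ARCH__)
    // nvcc, host-side pass: the old guard still exposed intrinsics here
    #elif defined(__CUDACC__)
    // nvcc, device-side pass for some __CUDA_ARCH__ value
    #else
    // pure host compiler (gcc/g++ etc.): safe for <immintrin.h> and friends
    #endif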
@@ -437,9 +439,9 @@ struct Ops<float32x8> {
 } // end namespace functional
 } // end namespace marian
 
-#endif // of "#ifndef __CUDA_ARCH__"
+#endif // of "#ifndef __CUDACC__"
 
-#ifdef __CUDA_ARCH__
+#ifdef __CUDACC__
 #if COMPILE_FP16
 // only compile with fp16 support for compute_70, i.e. VOLTA 100 and above.
 #include <cuda_fp16.h>
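With COMPILE_FP16 set, the half-precision types and intrinsics from <cuda_fp16.h> become usable; a hedged usage sketch, the function being illustrative rather than from the commit:

    #if COMPILE_FP16
    #include <cuda_fp16.h>

    __device__ __half addHalf(__half a, __half b) {
      return __hadd(a, b);              // hardware fp16 add on the guarded archs
    }
    #endif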
@@ -19,7 +19,7 @@ inline marian::Shape adapt(const marian::Shape& shape) {
 // modify last shape dimension to automatically map to a larger stride. We are moving now by 4 floats
 // at once and need to stop earlier. This is a shallow typecast to basically an array of 4 floats.
 
-#ifndef __CUDA_ARCH__
+#ifndef __CUDACC__
 
 template <>
 inline marian::Shape adapt<float32x4>(const marian::Shape& shape) {
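The specialized body is elided from the hunk; its net effect, sketched below under the assumption stated in the comment that iteration moves four floats at a time, is to shrink the last dimension by the packing factor:

    #include <cassert>

    inline int adaptLastDimSketch(int lastDim) { // hypothetical helper
      assert(lastDim % 4 == 0);                  // one float32x4 spans four floats
      return lastDim / 4;                        // stop four times earlier
    }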
@@ -81,7 +81,7 @@ void element(const Functor& functor, marian::Tensor out, Tensors... tensors) {
 // AVX2 specific intrinsics. Similar for 4 and AVX. TODO: Add AVX512 support.
 template <class Functor, class... Tensors>
 void elementFloat(const Functor& functor, marian::Tensor out, Tensors... tensors) {
-#ifndef __CUDA_ARCH__
+#ifndef __CUDACC__
   std::vector<marian::Tensor> ts({tensors...});
   bool div8 = true;
   bool div4 = true;
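The div8/div4 flags that follow suggest a dispatch on whether every tensor's row width is a multiple of the AVX2 or SSE register width; a sketch of that logic, an assumption since the rest of the body is elided:

    #include <vector>

    inline int simdWidthSketch(const std::vector<int>& lastDims) {
      bool div8 = true, div4 = true;
      for (int d : lastDims) {
        div8 = div8 && d % 8 == 0;    // AVX2: 8 floats per 256-bit register
        div4 = div4 && d % 4 == 0;    // SSE: 4 floats per 128-bit register
      }
      return div8 ? 8 : div4 ? 4 : 1; // otherwise fall back to scalar code
    }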
@@ -14,7 +14,7 @@ void tests(DeviceType device, Type floatType = Type::float32) {
   auto graph = New<ExpressionGraph>();
   graph->setParameterType(floatType);
   graph->setDevice({0, device});
-  graph->reserveWorkspaceMB(16);
+  graph->reserveWorkspaceMB(32);
 
   std::vector<T> values, values2;
@@ -71,8 +71,8 @@ void tests(DeviceType device, Type floatType = Type::float32) {
   CHECK(compare(rplus, [](float a, float b) {return a + b;}, true));
   CHECK(compare(rminus, [](float a, float b) {return a - b;}, true));
   CHECK(compare(rmult, [](float a, float b) {return a * b;}, true));
-  CHECK(compare(rdiv, [](float a, float b) {return a / b;}, /*exactMatch=*/false));
-  CHECK(compare(rlae, [](float a, float b) {return logf(expf(a) + expf(b));}, /*exactMatch=*/false));
+  CHECK(compare(rdiv, [](float a, float b) {return a / b;}, false));
+  CHECK(compare(rlae, [](float a, float b) {return logf(expf(a) + expf(b));}, false));
   CHECK(compare(rmax, [](float a, float b) {return std::max(a, b);}, true));
   CHECK(compare(rmin, [](float a, float b) {return std::min(a, b);}, true));
   CHECK(compare(rlt, [](float a, float b) {return a < b;}, true));
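Division and log-sum-exp are not bitwise reproducible across code paths (and --use_fast_math above widens the gap), which is why those two checks pass false for exact matching while the others demand equality. A sketch of what a tolerance-based compare might do; the repo's actual compare helper is not shown in this hunk:

    #include <algorithm>
    #include <cmath>

    inline bool approxEqual(float a, float b, float eps = 1e-5f) {
      float scale = std::max({1.0f, std::fabs(a), std::fabs(b)});
      return std::fabs(a - b) <= eps * scale;
    }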