handle warnings in sse2 and avx code

2024-10-05 19:17:10 +03:00 · 2018-12-12 13:33:04 -08:00 · 2018-12-12 13:33:04 -08:00 · 0ae07c5323
commit 0ae07c5323
parent d602dda8e2
6 changed files with 53 additions and 83 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -32,6 +32,28 @@ message(STATUS "Project version: ${PROJECT_VERSION_STRING_FULL}")

 execute_process(COMMAND git submodule update --init --recursive --no-fetch
                WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
+ 
+# Set compilation flags
+if(MSVC)
+# These are used in src/CMakeLists.txt on a per-target basis
+  list(APPEND ALL_WARNINGS /WX; /W4;)
+
+  # Disabled bogus warnings for CPU intrinsics:
+  # C4310: cast truncates constant value
+  # C4324: 'marian::cpu::int16::`anonymous-namespace'::ScatterPut': structure was padded due to alignment specifier
+  set(DISABLE_GLOBALLY "/wd\"4310\" /wd\"4324\"")
+  
+  set(INTRINSICS "/arch:AVX512")
+
+  set(CMAKE_CXX_FLAGS           "/EHsc /DWIN32 /D_WINDOWS /DUNICODE /D_UNICODE /D_CRT_NONSTDC_NO_WARNINGS /D_CRT_SECURE_NO_WARNINGS ${DISABLE_GLOBALLY}")
+  set(CMAKE_CXX_FLAGS_RELEASE   "${CMAKE_CXX_FLAGS} /MT /O2 ${INTRINSICS} /Zi /MP /GL /DNDEBUG")
+  set(CMAKE_CXX_FLAGS_DEBUG     "${CMAKE_CXX_FLAGS} /MTd /Od /Ob0 ${INTRINSICS} /RTC1 /Zi /D_DEBUG")
+  set(CMAKE_EXE_LINKER_FLAGS    "${CMAKE_EXE_LINKER_FLAGS} /DEBUG /LTCG:incremental /INCREMENTAL:NO /NODEFAULTLIB:MSVCRT")
+  set(CMAKE_STATIC_LINKER_FLAGS "${CMAKE_STATIC_LINKER_FLAGS} /LTCG:incremental")
+
+  find_library(SHLWAPI Shlwapi.lib) 
+  set(EXT_LIBS ${EXT_LIBS} SHLWAPI)
+else()

 # Detect supported CPU intrinsics for the current platform. This will
 # only be used with BUILD_ARCH=native. For overridden BUILD_ARCH we
@ -64,24 +86,11 @@ else()
  set(INTRINSICS "-msse4.1")
 endif()

-# Set compilation flags
-if(MSVC)
-  set(CMAKE_CXX_FLAGS           "/EHsc /DWIN32 /D_WINDOWS /DUNICODE /D_UNICODE /D_CRT_NONSTDC_NO_WARNINGS /D_CRT_SECURE_NO_WARNINGS /W4")
-  set(CMAKE_CXX_FLAGS_RELEASE   "/MT /O2 /W4 /Zi /MP /GL /DNDEBUG")
-  set(CMAKE_CXX_FLAGS_DEBUG     "/MTd /Od /Ob0 /RTC1 /Zi /D_DEBUG")
+set(DISABLE_GLOBALLY "-Wno-unused-result")

-  set(CMAKE_EXE_LINKER_FLAGS    "${CMAKE_EXE_LINKER_FLAGS} /DEBUG /LTCG:incremental /INCREMENTAL:NO /NODEFAULTLIB:MSVCRT")
-
-  set(CMAKE_STATIC_LINKER_FLAGS "${CMAKE_STATIC_LINKER_FLAGS} /LTCG:incremental")
-
-  find_library(SHLWAPI Shlwapi.lib) 
-  set(EXT_LIBS ${EXT_LIBS} SHLWAPI)
-else()
-  set(DISABLE_GLOBALLY "-Wno-unused-result")
-
-  # These are used in src/CMakeLists.txt on a per-target basis
-  list(APPEND ALL_WARNINGS -Wall; -Werror; -Wno-unused-result; -Wno-deprecated; -Wno-pragmas; -Wno-unused-parameter; -Wextra; -Wno-unused-function;
-                           -Wno-unused-value; -Wno-unknown-pragmas; -Wno-sign-compare; -Wno-missing-field-initializers;)
+# These are used in src/CMakeLists.txt on a per-target basis
+list(APPEND ALL_WARNINGS -Wall; -Werror; -Wno-unused-result; -Wno-deprecated; -Wno-pragmas; -Wno-unused-parameter; -Wextra; -Wno-unused-function;
+                         -Wno-unused-value; -Wno-unknown-pragmas; -Wno-sign-compare; -Wno-missing-field-initializers;)

  # This warning does not exist prior to gcc 5.0
  if(CMAKE_COMPILER_IS_GNUCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 5.0)
@ -90,7 +99,7 @@ else()

  set(CMAKE_CXX_FLAGS                 "-std=c++11 -O3 -Ofast -m64 -pthread -march=${BUILD_ARCH} ${INTRINSICS} -Wl,--no-as-needed -funroll-loops -ffinite-math-only -fPIC ${DISABLE_GLOBALLY}")
  set(CMAKE_CXX_FLAGS_RELEASE         "${CMAKE_CXX_FLAGS} -g -rdynamic")
-  set(CMAKE_CXX_FLAGS_DEBUG           "-std=c++11 -g -rdynamic -O0 -pthread -Wl,--no-as-needed -fPIC -Wno-unused-result -Wno-deprecated -Werror -Wno-pragmas")
+  set(CMAKE_CXX_FLAGS_DEBUG           "-std=c++11 -g -rdynamic -O0 -pthread -Wl,--no-as-needed -fPIC -Wno-unused-result -Wno-deprecated -Wno-pragmas")
  set(CMAKE_CXX_FLAGS_SLIM            "${CMAKE_CXX_FLAGS} -DNDEBUG")
  set(CMAKE_CXX_FLAGS_RELWITHDEBINFO  "${CMAKE_CXX_FLAGS} -g -rdynamic")
  set(CMAKE_CXX_FLAGS_PROFILE         "${CMAKE_CXX_FLAGS_RELEASE} -pg -g -rdynamic")
--- a/src/tensors/cpu/sharp/avx_gemm.cpp
+++ b/src/tensors/cpu/sharp/avx_gemm.cpp
@ -99,7 +99,8 @@ union IntAccess {
 *       _mm512_sra_epi32(sum, shift16));
 */
 inline void Convert32Sum(__m512i &sum) {
-  sum = _mm512_madd_epi16(sum, _mm512_set1_epi16(1));
+  short one = 1;
+  sum = _mm512_madd_epi16(sum, _mm512_set1_epi16(one));
 }

 // Two sum version.
@ -114,7 +115,7 @@ inline ReducedPair Reduce16to32(__m512i sum1, __m512i sum2) {
                                    _mm512_unpacklo_epi32(sum1, sum2));
  // 1 2 1 2 1 2 1 2
  __m256i halves = _mm256_add_epi32(_mm512_castsi512_si256(pack12),
-                                    _mm512_extracti64x4_epi64(pack12, 1));
+                                    _mm512_extracti64x4_epi64(pack12, (short)1));
  // 1 2 1 2
  IntAccess a;
  a.as_n = _mm_add_epi32(_mm256_castsi256_si128(halves),
@ -144,7 +145,7 @@ inline __m128i Reduce32(__m512i sum1,
                                      _mm512_unpacklo_epi64(pack12, pack34));
  // Cut the register into halves and sum those.  1 2 3 4 1 2 3 4
  __m256i halves = _mm256_add_epi32(_mm512_castsi512_si256(pack1234),
-                                    _mm512_extracti64x4_epi64(pack1234, 1));
+                                    _mm512_extracti64x4_epi64(pack1234, (short)1));
  // Again: cut the register into halves and sum those. 1 2 3 4
  return _mm_add_epi32(_mm256_castsi256_si128(halves),
                       _mm256_extracti128_si256(halves, 1));
@ -175,14 +176,14 @@ inline int32_t Reduce32(__m256i halves) {
 inline int32_t Reduce32(__m512i sum1) {
  // Fold register over itself.
  return Reduce32(_mm256_add_epi32(_mm512_castsi512_si256(sum1),
-                                   _mm512_extracti64x4_epi64(sum1, 1)));
+                                   _mm512_extracti64x4_epi64(sum1, (short)1)));
 }

 inline int32_t Reduce16to32(__m512i sum1) {
  Convert32Sum(sum1);
  // Fold register over itself.
  return Reduce32(_mm256_add_epi32(_mm512_castsi512_si256(sum1),
-                                   _mm512_extracti64x4_epi64(sum1, 1)));
+                                   _mm512_extracti64x4_epi64(sum1, (short)1)));
 }

 class ScatterPut {
@ -204,7 +205,7 @@ public:
    float_sums = _mm_mul_ps(float_sums, unquant_mult_sse_);
 #ifdef __AVX512VL__
    // The scatter instruction requires avx512vl
-    _mm_i32scatter_ps(base, num_b_rows_scatter_, float_sums, 1);
+    _mm_i32scatter_ps(base, num_b_rows_scatter_, float_sums, (short)1);
 #else
    FloatAccess a;
    // Get floats for each of the sums to write.
@ -398,6 +399,7 @@ inline void Accum(const __m512i zeros,
  // Choosing to approximate and do adds.
  // Perhaps every so often we could accumulate by Convert32Sum
  sum = _mm512_adds_epi16(sum, multiplied);
+  b; // make compiler happy
 }

 }  // namespace
--- a/src/tensors/cpu/sharp/int_gemm.cpp
+++ b/src/tensors/cpu/sharp/int_gemm.cpp
@ -73,7 +73,7 @@ void Quantize8(marian::Tensor out,
               const marian::Tensor in,
               float clipValue) {
 #ifdef __AVX512F__
-  float quant_mult = 127.0 / clipValue;
+  float quant_mult = 127.0f / clipValue;
  AVX_Quantize8(
      in->data(), out->data<int8_t>(), quant_mult, in->shape().elements());
 #else
@ -165,8 +165,8 @@ void ProdInt8(marian::Tensor C,
 #ifdef __AVX512F__
  // This would be easy...
  ABORT_IF(scale != 1, "Scale other than 1 not supported");
-  float quant_mult = 127.0 / clipValue;
-  float unquant_mult = 1.0 / (quant_mult * quant_mult);
+  float quant_mult = 127.0f / clipValue;
+  float unquant_mult = 1.0f / (quant_mult * quant_mult);

  float* fC = C->data();
  int num_A_rows = A->shape().elements() / A->shape()[-1];
--- a/src/tensors/tensor.h
+++ b/src/tensors/tensor.h
@ -87,7 +87,7 @@ public:
             request<T>(),
             type_);

-    T temp;
+    T temp = 0;
    if(backend_->getDeviceId().type == DeviceType::cpu) {
      std::copy(data<T>() + i, data<T>() + i + 1, &temp);
    }
--- a/vs/BuildRelease.bat
+++ b/vs/BuildRelease.bat
@ -17,10 +17,6 @@ if "%BUILD_ROOT%"=="" set BUILD_ROOT=%ROOT%build
 call CreateVSProjects.bat %BUILD_ROOT%
 if errorlevel 1 exit /b 1

-set _CL_=/utf-8
-
-REM -DCMAKE_INSTALL_PREFIX=%LIBRARY_PATH%
-
-cmake --build %BUILD_ROOT% --config Release 
+cmake --build %BUILD_ROOT% --config Release

 exit /b 0
--- a/vs/CheckDeps.bat
+++ b/vs/CheckDeps.bat
@ -108,30 +108,12 @@ set CMAKE_OPT=
 ::
 echo.
 echo ... CUDA
-REM if "%CUDA_PATH%"=="" (
-REM     echo The CUDA_PATH environment variable is not defined: please make sure CUDA 8.0+ is installed.
-REM     exit /b 1
-REM )
-REM if not exist "%CUDA_PATH%" (
-REM     echo CUDA_PATH is set to a non existing path:
-REM     echo %CUDA_PATH%
-REM     echo Please make sure CUDA 8.0+ is properly installed.
-REM     exit /b 1
-REM )
-REM if not exist "%CUDA_PATH%\include\cuda.h" (
-REM     echo CUDA header files were not found in this folder:
-REM     echo    "%CUDA_PATH%"
-REM     echo Please make sure CUDA 8.0+ is properly installed.
-REM     exit /b 1
-REM )
-REM if not exist "%CUDA_PATH%\lib\x64\cuda.lib" (
-REM     echo CUDA library files were not found in this folder:
-REM     echo    "%CUDA_PATH%"
-REM     echo Please make sure CUDA 8.0+ is properly installed.
-REM     exit /b 1
-REM )
-
-echo Found Cuda SDK in %CUDA_PATH%
+:: CUDA is optional. Keep ") else (" on one line: cmd.exe does not accept "else" on a line of its own.
+if "%CUDA_PATH%"=="" (
+    echo The CUDA_PATH environment variable is not defined: this will compile only the CPU version.
+) else (
+    echo Found Cuda SDK in %CUDA_PATH%
+)

 :: -------------------------
 :: The MKL setup does not set any environment variable to the installation path.
@ -215,40 +197,21 @@ if "%OPENSSL_ROOT_DIR%"=="" (
    set OPENSSL_ROOT_DIR=%VCPKG_INSTALL%
 )

-REM if not exist "%OPENSSL_ROOT_DIR%" (
-REM     echo OPENSSL_ROOT_DIR is set to a non existing path:
-REM     echo "%OPENSSL_ROOT_DIR%"
-REM     echo Please set OPENSSL_ROOT_DIR to the installation path of the OpenSLL library.
-REM     exit /b 1
-REM )
-REM if not exist "%OPENSSL_ROOT_DIR%\include\openssl\opensslv.h" (
-REM     echo OpenSSL header files were not found in this folder:
-REM     echo    "%OPENSSL_ROOT_DIR%"
-REM     echo Please make sure OpenSSL is correctly installed.
-REM     exit /b 1
-REM )
-REM if not exist "%OPENSSL_ROOT_DIR%\lib\ssleay32.lib" (
-REM     echo OpenSSL library file were not found in this folder:
-REM     echo    "%OPENSSL_ROOT_DIR%"
-REM     echo Please make sure OpenSSL is correctly installed.
-REM     exit /b 1
-)
-
-echo Found OpenSSL library in "%OPENSSL_ROOT_DIR%"
-
-set _CL_=/utf-8
-set LIBRARY_PATH=%CURRENT_PATH%\deps\proto
-
+if not exist "%VCPKG_INSTALL%/bin/protoc.exe" (
 mkdir build
 cd build
 git clone https://github.com/protocolbuffers/protobuf
 cd protobuf
-git checkout v.3.6.1
+git checkout v3.6.1
 cd cmake
-cmake . -Dprotobuf_BUILD_TESTS=OFF -DCMAKE_INSTALL_PREFIX=%LIBRARY_PATH%
+cmake . -A x64 -Dprotobuf_BUILD_TESTS=OFF -DCMAKE_INSTALL_PREFIX=%VCPKG_INSTALL%
 cmake --build . --config Release --target install
 cd ..\..\..

+)
+
+set CMAKE_PREFIX_PATH=%VCPKG_INSTALL%
+
 echo.
 echo.
 echo --------------------------------------------------