handle warnings in sse2 and avx code

This commit is contained in:
Marcin Junczys-Dowmunt 2018-12-12 13:33:04 -08:00
parent d602dda8e2
commit 0ae07c5323
6 changed files with 53 additions and 83 deletions

View File

@ -32,6 +32,28 @@ message(STATUS "Project version: ${PROJECT_VERSION_STRING_FULL}")
execute_process(COMMAND git submodule update --init --recursive --no-fetch
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
# Set compilation flags
if(MSVC)
# These are used in src/CMakeLists.txt on a per-target basis
list(APPEND ALL_WARNINGS /WX; /W4;)
# Disabled bogus warnings for CPU intrinsics:
# C4310: cast truncates constant value
# C4324: 'marian::cpu::int16::`anonymous-namespace'::ScatterPut': structure was padded due to alignment specifier
set(DISABLE_GLOBALLY "/wd\"4310\" /wd\"4324\"")
set(INTRINSICS "/arch:AVX512")
set(CMAKE_CXX_FLAGS "/EHsc /DWIN32 /D_WINDOWS /DUNICODE /D_UNICODE /D_CRT_NONSTDC_NO_WARNINGS /D_CRT_SECURE_NO_WARNINGS ${DISABLE_GLOBALLY}")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS} /MT /O2 ${INTRINSICS} /Zi /MP /GL /DNDEBUG")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS} /MTd /Od /Ob0 ${INTRINSICS} /RTC1 /Zi /D_DEBUG")
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} /DEBUG /LTCG:incremental /INCREMENTAL:NO /NODEFAULTLIB:MSVCRT")
set(CMAKE_STATIC_LINKER_FLAGS "${CMAKE_STATIC_LINKER_FLAGS} /LTCG:incremental")
find_library(SHLWAPI Shlwapi.lib)
set(EXT_LIBS ${EXT_LIBS} SHLWAPI)
else()
# Detect support for CPU intrinsics on the current platform. This will
# only be used with BUILD_ARCH=native. For overridden BUILD_ARCH we
@ -64,24 +86,11 @@ else()
set(INTRINSICS "-msse4.1")
endif()
# Set compilation flags
if(MSVC)
set(CMAKE_CXX_FLAGS "/EHsc /DWIN32 /D_WINDOWS /DUNICODE /D_UNICODE /D_CRT_NONSTDC_NO_WARNINGS /D_CRT_SECURE_NO_WARNINGS /W4")
set(CMAKE_CXX_FLAGS_RELEASE "/MT /O2 /W4 /Zi /MP /GL /DNDEBUG")
set(CMAKE_CXX_FLAGS_DEBUG "/MTd /Od /Ob0 /RTC1 /Zi /D_DEBUG")
set(DISABLE_GLOBALLY "-Wno-unused-result")
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} /DEBUG /LTCG:incremental /INCREMENTAL:NO /NODEFAULTLIB:MSVCRT")
set(CMAKE_STATIC_LINKER_FLAGS "${CMAKE_STATIC_LINKER_FLAGS} /LTCG:incremental")
find_library(SHLWAPI Shlwapi.lib)
set(EXT_LIBS ${EXT_LIBS} SHLWAPI)
else()
set(DISABLE_GLOBALLY "-Wno-unused-result")
# These are used in src/CMakeLists.txt on a per-target basis
list(APPEND ALL_WARNINGS -Wall; -Werror; -Wno-unused-result; -Wno-deprecated; -Wno-pragmas; -Wno-unused-parameter; -Wextra; -Wno-unused-function;
-Wno-unused-value; -Wno-unknown-pragmas; -Wno-sign-compare; -Wno-missing-field-initializers;)
# These are used in src/CMakeLists.txt on a per-target basis
list(APPEND ALL_WARNINGS -Wall; -Werror; -Wno-unused-result; -Wno-deprecated; -Wno-pragmas; -Wno-unused-parameter; -Wextra; -Wno-unused-function;
-Wno-unused-value; -Wno-unknown-pragmas; -Wno-sign-compare; -Wno-missing-field-initializers;)
# This warning does not exist prior to gcc 5.0
if(CMAKE_COMPILER_IS_GNUCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 5.0)
@ -90,7 +99,7 @@ else()
set(CMAKE_CXX_FLAGS "-std=c++11 -O3 -Ofast -m64 -pthread -march=${BUILD_ARCH} ${INTRINSICS} -Wl,--no-as-needed -funroll-loops -ffinite-math-only -fPIC ${DISABLE_GLOBALLY}")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS} -g -rdynamic")
set(CMAKE_CXX_FLAGS_DEBUG "-std=c++11 -g -rdynamic -O0 -pthread -Wl,--no-as-needed -fPIC -Wno-unused-result -Wno-deprecated -Werror -Wno-pragmas")
set(CMAKE_CXX_FLAGS_DEBUG "-std=c++11 -g -rdynamic -O0 -pthread -Wl,--no-as-needed -fPIC -Wno-unused-result -Wno-deprecated -Wno-pragmas")
set(CMAKE_CXX_FLAGS_SLIM "${CMAKE_CXX_FLAGS} -DNDEBUG")
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS} -g -rdynamic")
set(CMAKE_CXX_FLAGS_PROFILE "${CMAKE_CXX_FLAGS_RELEASE} -pg -g -rdynamic")

View File

@ -99,7 +99,8 @@ union IntAccess {
* _mm512_sra_epi32(sum, shift16));
*/
inline void Convert32Sum(__m512i &sum) {
sum = _mm512_madd_epi16(sum, _mm512_set1_epi16(1));
short one = 1;
sum = _mm512_madd_epi16(sum, _mm512_set1_epi16(one));
}
// Two sum version.
@ -114,7 +115,7 @@ inline ReducedPair Reduce16to32(__m512i sum1, __m512i sum2) {
_mm512_unpacklo_epi32(sum1, sum2));
// 1 2 1 2 1 2 1 2
__m256i halves = _mm256_add_epi32(_mm512_castsi512_si256(pack12),
_mm512_extracti64x4_epi64(pack12, 1));
_mm512_extracti64x4_epi64(pack12, (short)1));
// 1 2 1 2
IntAccess a;
a.as_n = _mm_add_epi32(_mm256_castsi256_si128(halves),
@ -144,7 +145,7 @@ inline __m128i Reduce32(__m512i sum1,
_mm512_unpacklo_epi64(pack12, pack34));
// Cut the register into halves and sum those. 1 2 3 4 1 2 3 4
__m256i halves = _mm256_add_epi32(_mm512_castsi512_si256(pack1234),
_mm512_extracti64x4_epi64(pack1234, 1));
_mm512_extracti64x4_epi64(pack1234, (short)1));
// Again: cut the register into halves and sum those. 1 2 3 4
return _mm_add_epi32(_mm256_castsi256_si128(halves),
_mm256_extracti128_si256(halves, 1));
@ -175,14 +176,14 @@ inline int32_t Reduce32(__m256i halves) {
inline int32_t Reduce32(__m512i sum1) {
// Fold register over itself.
return Reduce32(_mm256_add_epi32(_mm512_castsi512_si256(sum1),
_mm512_extracti64x4_epi64(sum1, 1)));
_mm512_extracti64x4_epi64(sum1, (short)1)));
}
inline int32_t Reduce16to32(__m512i sum1) {
Convert32Sum(sum1);
// Fold register over itself.
return Reduce32(_mm256_add_epi32(_mm512_castsi512_si256(sum1),
_mm512_extracti64x4_epi64(sum1, 1)));
_mm512_extracti64x4_epi64(sum1, (short)1)));
}
class ScatterPut {
@ -204,7 +205,7 @@ public:
float_sums = _mm_mul_ps(float_sums, unquant_mult_sse_);
#ifdef __AVX512VL__
// The scatter instruction requires avx512vl
_mm_i32scatter_ps(base, num_b_rows_scatter_, float_sums, 1);
_mm_i32scatter_ps(base, num_b_rows_scatter_, float_sums, (short)1);
#else
FloatAccess a;
// Get floats for each of the sums to write.
@ -398,6 +399,7 @@ inline void Accum(const __m512i zeros,
// Choosing to approximate and do adds.
// Perhaps every so often we could accumulate by Convert32Sum
sum = _mm512_adds_epi16(sum, multiplied);
b; // make compiler happy
}
} // namespace

View File

@ -73,7 +73,7 @@ void Quantize8(marian::Tensor out,
const marian::Tensor in,
float clipValue) {
#ifdef __AVX512F__
float quant_mult = 127.0 / clipValue;
float quant_mult = 127.0f / clipValue;
AVX_Quantize8(
in->data(), out->data<int8_t>(), quant_mult, in->shape().elements());
#else
@ -165,8 +165,8 @@ void ProdInt8(marian::Tensor C,
#ifdef __AVX512F__
// This would be easy...
ABORT_IF(scale != 1, "Scale other than 1 not supported");
float quant_mult = 127.0 / clipValue;
float unquant_mult = 1.0 / (quant_mult * quant_mult);
float quant_mult = 127.0f / clipValue;
float unquant_mult = 1.0f / (quant_mult * quant_mult);
float* fC = C->data();
int num_A_rows = A->shape().elements() / A->shape()[-1];

View File

@ -87,7 +87,7 @@ public:
request<T>(),
type_);
T temp;
T temp = 0;
if(backend_->getDeviceId().type == DeviceType::cpu) {
std::copy(data<T>() + i, data<T>() + i + 1, &temp);
}

View File

@ -17,10 +17,6 @@ if "%BUILD_ROOT%"=="" set BUILD_ROOT=%ROOT%build
call CreateVSProjects.bat %BUILD_ROOT%
if errorlevel 1 exit /b 1
set _CL_=/utf-8
REM -DCMAKE_INSTALL_PREFIX=%LIBRARY_PATH%
cmake --build %BUILD_ROOT% --config Release
cmake --build %BUILD_ROOT% --config Release
exit /b 0

View File

@ -108,30 +108,12 @@ set CMAKE_OPT=
::
echo.
echo ... CUDA
REM if "%CUDA_PATH%"=="" (
REM echo The CUDA_PATH environment variable is not defined: please make sure CUDA 8.0+ is installed.
REM exit /b 1
REM )
REM if not exist "%CUDA_PATH%" (
REM echo CUDA_PATH is set to a non existing path:
REM echo %CUDA_PATH%
REM echo Please make sure CUDA 8.0+ is properly installed.
REM exit /b 1
REM )
REM if not exist "%CUDA_PATH%\include\cuda.h" (
REM echo CUDA header files were not found in this folder:
REM echo "%CUDA_PATH%"
REM echo Please make sure CUDA 8.0+ is properly installed.
REM exit /b 1
REM )
REM if not exist "%CUDA_PATH%\lib\x64\cuda.lib" (
REM echo CUDA library files were not found in this folder:
REM echo "%CUDA_PATH%"
REM echo Please make sure CUDA 8.0+ is properly installed.
REM exit /b 1
REM )
echo Found Cuda SDK in %CUDA_PATH%
:: Report whether a CUDA SDK location is configured via CUDA_PATH.
:: An undefined CUDA_PATH is not treated as an error here: only a notice is
:: printed saying that the CPU version alone will be compiled.
:: NOTE: cmd.exe requires ELSE to be on the same line as the closing
:: parenthesis of the IF block. On a line of its own, ELSE is parsed as a
:: separate (unknown) command and the second branch runs unconditionally.
if "%CUDA_PATH%"=="" (
    echo The CUDA_PATH environment variable is not defined: this will compile only the CPU version.
) else (
    echo Found Cuda SDK in %CUDA_PATH%
)
:: -------------------------
:: The MKL setup does not set any environment variable to the installation path.
@ -215,40 +197,21 @@ if "%OPENSSL_ROOT_DIR%"=="" (
set OPENSSL_ROOT_DIR=%VCPKG_INSTALL%
)
REM if not exist "%OPENSSL_ROOT_DIR%" (
REM echo OPENSSL_ROOT_DIR is set to a non existing path:
REM echo "%OPENSSL_ROOT_DIR%"
REM echo Please set OPENSSL_ROOT_DIR to the installation path of the OpenSLL library.
REM exit /b 1
REM )
REM if not exist "%OPENSSL_ROOT_DIR%\include\openssl\opensslv.h" (
REM echo OpenSSL header files were not found in this folder:
REM echo "%OPENSSL_ROOT_DIR%"
REM echo Please make sure OpenSSL is correctly installed.
REM exit /b 1
REM )
REM if not exist "%OPENSSL_ROOT_DIR%\lib\ssleay32.lib" (
REM echo OpenSSL library file were not found in this folder:
REM echo "%OPENSSL_ROOT_DIR%"
REM echo Please make sure OpenSSL is correctly installed.
REM exit /b 1
)
echo Found OpenSSL library in "%OPENSSL_ROOT_DIR%"
set _CL_=/utf-8
set LIBRARY_PATH=%CURRENT_PATH%\deps\proto
if not exist "%VCPKG_INSTALL%/bin/protoc.exe" (
mkdir build
cd build
git clone https://github.com/protocolbuffers/protobuf
cd protobuf
git checkout v.3.6.1
git checkout v3.6.1
cd cmake
cmake . -Dprotobuf_BUILD_TESTS=OFF -DCMAKE_INSTALL_PREFIX=%LIBRARY_PATH%
cmake . -A x64 -Dprotobuf_BUILD_TESTS=OFF -DCMAKE_INSTALL_PREFIX=%VCPKG_INSTALL%
cmake --build . --config Release --target install
cd ..\..\..
)
set CMAKE_PREFIX_PATH=%VCPKG_INSTALL%
echo.
echo.
echo --------------------------------------------------