Merge branch 'master' into ug-const-diligence

Ulrich Germann 2020-01-29 16:23:35 +00:00
commit cfdde151a1
94 changed files with 4098 additions and 1832 deletions


@ -5,9 +5,19 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
## [Unreleased]
### Added
- Add CMAKE options to disable compilation for specific GPU SM types
- An option to print word-level translation scores
- An option to turn off automatic detokenization from SentencePiece
- Separate quantization types for 8-bit FBGEMM for AVX2 and AVX512
- Sequence-level unlikelihood training
- Allow templated file names for valid-translation-output files
- Support for lexical shortlists in marian-server
- Support for 8-bit matrix multiplication with FBGEMM
- CMakeLists.txt now looks for SSE 4.2
- Purging of finished hypotheses during beam-search. A lot faster for large batches.
- Faster option look-up, up to 20-30% faster translation
- Added --cite and --authors flags
@ -24,6 +34,14 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
- Gradient-checkpointing
### Fixed
- Fix empty source batch entries with batch purging
- Clear RNN cache in transformer model, add correct hash functions to nodes
- Gather-operation for all index sizes
- Fix word weighting with max length cropping
- Fixed compilation on CPUs without support for AVX
- FastOpt now reads "n" and "y" values as strings, not as boolean values
- Fixed multiple reduction kernels on GPU
- Fixed guided-alignment training with cross-entropy
- Replace IntrusivePtr with std::unique_ptr in FastOpt; fixes random segfaults
due to thread-unsafe reference counting.
- Make sure that items are 256-byte aligned during saving
@ -38,6 +56,10 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
- Compilation with CUDA 10.1
### Changed
- Revert LayerNorm eps to old position, i.e. sigma' = sqrt(sigma^2 + eps)
- Downgrade NCCL to 2.3.7 as 2.4.2 is buggy (hangs with larger models)
- Return error signal on SIGTERM
- Dropped support for CUDA 8.0, CUDA 9.0 is now minimal requirement
- Removed autotuner for now, will be switched back on later
- Boost dependency is now optional and only required for marian_server
- Dropped support for g++-4.9


@ -13,6 +13,10 @@ set(BUILD_ARCH native CACHE STRING "Compile for this CPU architecture.")
# Custom CMake options
option(COMPILE_CPU "Compile CPU version" ON)
option(COMPILE_CUDA "Compile GPU version" ON)
option(COMPILE_CUDA_SM35 "Compile GPU version with SM35 support" ON)
option(COMPILE_CUDA_SM50 "Compile GPU version with SM50 support" ON)
option(COMPILE_CUDA_SM60 "Compile GPU version with SM60 support" ON)
option(COMPILE_CUDA_SM70 "Compile GPU version with SM70 support" ON)
option(COMPILE_EXAMPLES "Compile examples" OFF)
option(COMPILE_SERVER "Compile marian-server" OFF)
option(COMPILE_TESTS "Compile tests" OFF)
@ -181,8 +185,6 @@ set(EXT_LIBS ${EXT_LIBS} ${CMAKE_DL_LIBS})
if(COMPILE_CUDA)
LIST(APPEND COMPUTE -arch=sm_35; -gencode=arch=compute_35,code=sm_35; -gencode=arch=compute_50,code=sm_50; -gencode=arch=compute_52,code=sm_52; -gencode=arch=compute_60,code=sm_60; -gencode=arch=compute_61,code=sm_61;)
if(USE_STATIC_LIBS)
# link statically to stdlib libraries
set(CMAKE_EXE_LINKER_FLAGS "-static-libgcc -static-libstdc++")
@ -196,16 +198,25 @@ if(USE_STATIC_LIBS)
endif()
endif()
find_package(CUDA "8.0") # TODO: only enable FP16-related options for compute_70 and higher.
find_package(CUDA "9.0") # TODO: only enable FP16-related options for compute_70 and higher.
if(CUDA_FOUND)
# CUDA >= 10.0 requires CMake >= 3.12.2
if((CUDA_VERSION VERSION_EQUAL "10.0" OR CUDA_VERSION VERSION_GREATER "10.0") AND (CMAKE_VERSION VERSION_LESS "3.12.2"))
message(WARNING "On some Unix systems CUDA 10.0+ requires CMake 3.12.2+; you use CMake ${CMAKE_VERSION}")
endif()
if(CUDA_VERSION VERSION_GREATER "8.0")
LIST(APPEND COMPUTE -gencode=arch=compute_70,code=sm_70; -gencode=arch=compute_70,code=compute_70)
endif()
if(COMPILE_CUDA_SM35)
LIST(APPEND COMPUTE -arch=sm_35; -gencode=arch=compute_35,code=sm_35;) # Tesla K40 and above
endif(COMPILE_CUDA_SM35)
if(COMPILE_CUDA_SM50)
LIST(APPEND COMPUTE -gencode=arch=compute_50,code=sm_50; -gencode=arch=compute_52,code=sm_52;) # Maxwell GPUs
endif(COMPILE_CUDA_SM50)
if(COMPILE_CUDA_SM60)
LIST(APPEND COMPUTE -gencode=arch=compute_60,code=sm_60; -gencode=arch=compute_61,code=sm_61;) # Pascal GPUs
endif(COMPILE_CUDA_SM60)
if(COMPILE_CUDA_SM70)
LIST(APPEND COMPUTE -gencode=arch=compute_70,code=sm_70; -gencode=arch=compute_70,code=compute_70) # Volta GPUs
endif(COMPILE_CUDA_SM70)
if(USE_STATIC_LIBS)
find_library(CUDA_culibos_LIBRARY NAMES culibos PATHS ${CUDA_TOOLKIT_ROOT_DIR}/lib64)
@ -321,9 +332,12 @@ if(COMPILE_CPU)
set(BLA_VENDOR "OpenBLAS")
find_package(BLAS)
if(BLAS_FOUND)
include_directories(${BLAS_INCLUDE_DIR})
set(EXT_LIBS ${EXT_LIBS} ${BLAS_LIBRARIES})
add_definitions(-DBLAS_FOUND=1)
include(FindCBLAS)
if(CBLAS_FOUND)
include_directories(${BLAS_INCLUDE_DIR} ${CBLAS_INCLUDE_DIR})
set(EXT_LIBS ${EXT_LIBS} ${BLAS_LIBRARIES} ${CBLAS_LIBRARIES})
add_definitions(-DBLAS_FOUND=1)
endif(CBLAS_FOUND)
endif(BLAS_FOUND)
endif(MKL_FOUND)
endif(COMPILE_CPU)


@ -1,7 +1,6 @@
Marian
======
[![Build Status CUDA 8](https://img.shields.io/jenkins/s/http/vali.inf.ed.ac.uk/jenkins/view/marian/job/marian-dev-cuda-8.0.svg?label=CUDA%208)](http://vali.inf.ed.ac.uk/jenkins/job/marian-dev-cuda-8.0/)
[![Build Status CUDA 9](https://img.shields.io/jenkins/s/http/vali.inf.ed.ac.uk/jenkins/view/marian/job/marian-dev-cuda-9.2.svg?label=CUDA%209)](http://vali.inf.ed.ac.uk/jenkins/job/marian-dev-cuda-9.2/)
[![Build Status CUDA 10](https://img.shields.io/jenkins/s/http/vali.inf.ed.ac.uk/jenkins/view/marian/job/marian-dev-cuda-10.1.svg?label=CUDA%2010)](http://vali.inf.ed.ac.uk/jenkins/job/marian-dev-cuda-10.1/)
[![Build Status CPU](https://img.shields.io/jenkins/s/http/vali.inf.ed.ac.uk/jenkins/view/marian/job/marian-dev-cpu.svg?label=CPU)](http://vali.inf.ed.ac.uk/jenkins/job/marian-dev-cpu/)


@ -1 +1 @@
v1.8.21
v1.8.40

cmake/FindCBLAS.cmake Normal file

@ -0,0 +1,186 @@
# - Find CBLAS library
#
# This module finds an installed library that implements the BLAS
# linear-algebra interface (see http://www.netlib.org/blas/) with a
# CBLAS interface.
#
# This module sets the following variables:
# CBLAS_FOUND - set to true if a library implementing the CBLAS interface
# is found
# CBLAS_LINKER_FLAGS - uncached list of required linker flags (excluding -l
# and -L).
# CBLAS_LIBRARIES - uncached list of libraries (using full path name) to
# link against to use CBLAS
# CBLAS_INCLUDE_DIR - path to includes
# CBLAS_INCLUDE_FILE - the file to be included to use CBLAS
#
## Based on https://github.com/Eyescale/CMake/blob/master/FindCBLAS.cmake
INCLUDE(CheckFunctionExists)
INCLUDE(CheckIncludeFile)
MACRO(CHECK_ALL_LIBRARIES LIBRARIES INCLUDE _prefix _name _flags _list _include _search_include)
# This macro checks for the existence of the combination of fortran libraries
# given by _list. If the combination is found, this macro checks (using the
# Check_Fortran_Function_Exists macro) whether it can link against that library
# combination using the name of a routine given by _name using the linker
# flags given by _flags. If the combination of libraries is found and passes
# the link test, LIBRARIES is set to the list of complete library paths that
# have been found. Otherwise, LIBRARIES is set to FALSE.
# N.B. _prefix is the prefix applied to the names of all cached variables that
# are generated internally and marked advanced by this macro.
SET(__list)
FOREACH(_elem ${_list})
IF(__list)
SET(__list "${__list} - ${_elem}")
ELSE(__list)
SET(__list "${_elem}")
ENDIF(__list)
ENDFOREACH(_elem)
MESSAGE(STATUS "Checking for [${__list}]")
SET(_libraries_work TRUE)
SET(${LIBRARIES})
SET(_combined_name)
SET(_paths)
FOREACH(_library ${_list})
SET(_combined_name ${_combined_name}_${_library})
# did we find all the libraries in the _list until now?
# (we stop at the first unfound one)
IF(_libraries_work)
IF(APPLE)
FIND_LIBRARY(${_prefix}_${_library}_LIBRARY
NAMES ${_library}
PATHS /usr/local/lib /usr/lib /usr/local/lib64 /usr/lib64 ENV
DYLD_LIBRARY_PATH
)
ELSE(APPLE)
FIND_LIBRARY(${_prefix}_${_library}_LIBRARY
NAMES ${_library}
PATHS /usr/local/lib /usr/lib /usr/local/lib64 /usr/lib64 ENV
LD_LIBRARY_PATH
)
ENDIF(APPLE)
MARK_AS_ADVANCED(${_prefix}_${_library}_LIBRARY)
IF(${_prefix}_${_library}_LIBRARY)
GET_FILENAME_COMPONENT(_path ${${_prefix}_${_library}_LIBRARY} PATH)
LIST(APPEND _paths ${_path}/../include ${_path}/../../include)
ENDIF(${_prefix}_${_library}_LIBRARY)
SET(${LIBRARIES} ${${LIBRARIES}} ${${_prefix}_${_library}_LIBRARY})
SET(_libraries_work ${${_prefix}_${_library}_LIBRARY})
ENDIF(_libraries_work)
ENDFOREACH(_library ${_list})
# Test include
SET(_bug_search_include ${_search_include}) #CMAKE BUG!!! SHOULD NOT BE THAT
IF(_bug_search_include)
FIND_PATH(${_prefix}${_combined_name}_INCLUDE ${_include} ${_paths})
MARK_AS_ADVANCED(${_prefix}${_combined_name}_INCLUDE)
IF(${_prefix}${_combined_name}_INCLUDE)
MESSAGE(STATUS "Checking for [${__list}] -- includes found")
SET(${_prefix}_INCLUDE_DIR ${${_prefix}${_combined_name}_INCLUDE})
SET(${_prefix}_INCLUDE_FILE ${_include})
SET(${INCLUDE} ${${_prefix}_INCLUDE_DIR})
ELSE(${_prefix}${_combined_name}_INCLUDE)
MESSAGE(STATUS "Checking for [${__list}] -- includes not found")
SET(_libraries_work FALSE)
ENDIF(${_prefix}${_combined_name}_INCLUDE)
ELSE(_bug_search_include)
SET(${_prefix}_INCLUDE_DIR)
SET(${_prefix}_INCLUDE_FILE ${_include})
ENDIF(_bug_search_include)
IF(_libraries_work)
# Test this combination of libraries.
SET(CMAKE_REQUIRED_LIBRARIES ${_flags} ${${LIBRARIES}})
CHECK_FUNCTION_EXISTS(${_name} ${_prefix}${_combined_name}_WORKS)
SET(CMAKE_REQUIRED_LIBRARIES)
MARK_AS_ADVANCED(${_prefix}${_combined_name}_WORKS)
SET(_libraries_work ${${_prefix}${_combined_name}_WORKS})
IF(_libraries_work)
MESSAGE(STATUS "Checking for [${__list}] -- libraries found")
ENDIF(_libraries_work)
ENDIF(_libraries_work)
IF(NOT _libraries_work)
SET(${LIBRARIES} FALSE)
ENDIF(NOT _libraries_work)
ENDMACRO(CHECK_ALL_LIBRARIES)
SET(CBLAS_LINKER_FLAGS)
SET(CBLAS_LIBRARIES)
SET(CBLAS_INCLUDE_DIR)
# CBLAS in openBLAS
IF(NOT CBLAS_LIBRARIES)
CHECK_ALL_LIBRARIES(
CBLAS_LIBRARIES
CBLAS_INCLUDE_DIR
cblas
cblas_sgemm
""
"openblas"
"cblas.h"
TRUE
)
ENDIF(NOT CBLAS_LIBRARIES)
#MESSAGE(STATUS ${openblas_INCLUDE_DIR})
# CBLAS in CBLAS
IF(NOT CBLAS_LIBRARIES)
CHECK_ALL_LIBRARIES(
CBLAS_LIBRARIES
CBLAS_INCLUDE_DIR
cblas
cblas_sgemm
""
"cblas"
"cblas.h"
TRUE
)
ENDIF(NOT CBLAS_LIBRARIES)
#MESSAGE(STATUS ${cblas_INCLUDE_DIR})
# CBLAS in lapacke
IF(NOT CBLAS_LIBRARIES)
CHECK_ALL_LIBRARIES(
CBLAS_LIBRARIES
CBLAS_INCLUDE_DIR
cblas
cblas_sgemm
""
"lapacke"
"cblas.h"
TRUE
)
ENDIF(NOT CBLAS_LIBRARIES)
#MESSAGE(STATUS ${lapacke_INCLUDE_DIR})
IF(CBLAS_LIBRARIES)
SET(CBLAS_FOUND TRUE)
ELSE(CBLAS_LIBRARIES)
SET(CBLAS_FOUND FALSE)
ENDIF(CBLAS_LIBRARIES)
IF(NOT CBLAS_FOUND AND CBLAS_FIND_REQUIRED)
MESSAGE(FATAL_ERROR "CBLAS library not found. Please specify library location")
ENDIF(NOT CBLAS_FOUND AND CBLAS_FIND_REQUIRED)
IF(NOT CBLAS_FIND_QUIETLY)
IF(CBLAS_FOUND)
MESSAGE(STATUS "CBLAS library found: " ${CBLAS_LIBRARIES})
MESSAGE(STATUS "cblas.h include directory: " ${CBLAS_INCLUDE_DIR})
ELSE(CBLAS_FOUND)
MESSAGE(STATUS "CBLAS library not found. Please specify library location")
ENDIF(CBLAS_FOUND)
ENDIF(NOT CBLAS_FIND_QUIETLY)
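In effect, this module verifies that cblas_sgemm from cblas.h can be found and linked. A minimal, self-contained C++ smoke test of that same symbol (editor's sketch, not part of this commit; assumes an installed CBLAS such as OpenBLAS):

// Smoke test for the symbol FindCBLAS.cmake probes: cblas_sgemm from cblas.h.
// Build with e.g.: g++ cblas_check.cpp -lopenblas (or -lcblas)
#include <cblas.h>
#include <cstdio>

int main() {
  float A[4] = {1, 2, 3, 4};  // 2x2, row-major
  float B[4] = {5, 6, 7, 8};  // 2x2, row-major
  float C[4] = {0, 0, 0, 0};
  // C = 1.0 * A * B + 0.0 * C
  cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
              2, 2, 2, 1.0f, A, 2, B, 2, 0.0f, C, 2);
  std::printf("%g %g\n%g %g\n", C[0], C[1], C[2], C[3]);  // expect 19 22 / 43 50
  return 0;
}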

@ -1 +1 @@
Subproject commit 336740065d9c23e53e912a1befff18981d9d27ab
Subproject commit c19b7814d71febf1053bd93af6ac314b46204092

@ -1 +1 @@
Subproject commit 2a5833e41110c19f0bbe9f3cf2aa92caad96cf42
Subproject commit 6a08849b23f6c14eefbe12f4eb73dc638b962587


@ -13,6 +13,11 @@ if(USE_FBGEMM)
set(CMAKE_SUPPRESS_DEVELOPER_WARNINGS 1 CACHE INTERNAL "No dev warnings")
endif()
if(NOT MSVC)
# only locally disabled for the 3rd_party folder
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-value -Wno-unused-parameter -Wno-unused-variable -Wno-unused-function")
endif()
set(FBGEMM_BUILD_TESTS OFF CACHE BOOL "Disable fbgemm tests")
set(FBGEMM_BUILD_BENCHMARKS OFF CACHE BOOL "Disable fbgemm benchmark")
add_subdirectory(./fbgemm)
@ -66,8 +71,21 @@ set(INSTALLS "") # this will contain a list of 3rd part dependencies that we ins
if(CUDA_FOUND)
if(USE_NCCL)
# disables compilation for sm_30 to avoid ptxas warning... that's general Kepler support. But K80s are supported for instance by sm_35
set(GENCODE "-gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61")
# disables compilation for sm_30 to avoid ptxas warning... that is general Kepler support. But K80s are supported for instance by sm_35
set(GENCODE "")
if(COMPILE_CUDA_SM35)
set(GENCODE "${GENCODE} -gencode=arch=compute_35,code=sm_35")
endif(COMPILE_CUDA_SM35)
if(COMPILE_CUDA_SM50)
set(GENCODE "${GENCODE} -gencode=arch=compute_50,code=sm_50")
endif(COMPILE_CUDA_SM50)
if(COMPILE_CUDA_SM60)
set(GENCODE "${GENCODE} -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61")
endif(COMPILE_CUDA_SM60)
if(COMPILE_CUDA_SM70)
set(GENCODE "${GENCODE} -gencode=arch=compute_70,code=sm_70")
endif(COMPILE_CUDA_SM70)
# install nccl in ${CMAKE_BINARY_DIR}/local similar to /usr/local linux installation
ExternalProject_Add(nccl_install

src/3rd_party/avx_mathfun.h vendored Executable file → Normal file

@ -1 +1 @@
Subproject commit f0b354327aaf2330c65340725b1981040c8bec9e
Subproject commit 84e66a976046180187724aff60a236c5378fde7c

src/3rd_party/nccl vendored

@ -1 +1 @@
Subproject commit 8e3a3f7c5b520babff49cec54a866fa3eda3a3b6
Subproject commit b56650c7f59b8cd40d18809784a6d6be38ef8acb


@ -1,347 +1,248 @@
// This software contains source code provided by NVIDIA Corporation.
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
MJD: Relevant text from the NVIDIA EULA:
2.1 Sample Source Code Modification, Ownership and Distribution
Subject to the terms of the SLA and this Supplement, NVIDIA hereby grants you a non-
exclusive, non-transferable license, without the right to sublicense, during the applicable
license term unless earlier terminated pursuant to the SLA, to have Authorized Users
modify and create derivative works of CUDA Licensed Software that constitutes sample
source code, when provided to you by NVIDIA in source code form. You hold all rights,
title and interest in and to your modifications and derivative works of the sample source
code software that you create as permitted hereunder (collectively, Derivatives), subject
to NVIDIA's underlying Intellectual Property Rights in and to the CUDA Licensed
Software; provided, however that you grant NVIDIA and its Affiliates an irrevocable,
perpetual, nonexclusive, worldwide, royalty-free paid-up license to make, have made,
use, have used, reproduce, license, distribute, sublicense, transfer and otherwise
commercialize Derivatives including (without limitation) with the CUDA Licensed
Software or other NVIDIA products, technologies or materials. You may distribute the
CUDA Supplement to Software License Agreement End User License Agreements (EULA)
DR-06739-001_v01_v9.0 | 14 sample source code as delivered by NVIDIA and/or your Derivatives,
provided that all NVIDIA copyright notices and trademarks are maintained and used properly
and the sample source code includes the following notice: This software contains source code
provided by NVIDIA Corporation.
*/
#pragma once
#include "tensors/tensor.h"
#include <cuda_runtime.h>
#include "functional/tmp.h"
#include <cooperative_groups.h>
namespace marian {
template <unsigned int blockSize, typename AccType>
__device__ void
reduceBlock(volatile AccType *sdata, AccType mySum, const unsigned int tid)
{
sdata[tid] = mySum;
__syncthreads();
namespace cg = cooperative_groups;
// do reduction in shared mem
if (blockSize >= 512)
{
if (tid < 256)
{
sdata[tid] = mySum = mySum + sdata[tid + 256];
}
// Utility class used to avoid linker errors with extern
// unsized shared memory arrays with templated type
template <class T>
struct SharedMemory {
__device__ inline operator T *() {
extern __shared__ int __smem[];
return (T *)__smem;
}
__syncthreads();
__device__ inline operator const T *() const {
extern __shared__ int __smem[];
return (T *)__smem;
}
};
// specialize for double to avoid unaligned memory
// access compile errors
template <>
struct SharedMemory<double> {
__device__ inline operator double *() {
extern __shared__ double __smem_d[];
return (double *)__smem_d;
}
__device__ inline operator const double *() const {
extern __shared__ double __smem_d[];
return (double *)__smem_d;
}
};
/*
This version adds multiple elements per thread sequentially. This reduces
the overall cost of the algorithm while keeping the work complexity O(n) and
the step complexity O(log n). (Brent's Theorem optimization)
Note, this kernel needs a minimum of 64*sizeof(T) bytes of shared memory.
In other words if blockSize <= 32, allocate 64*sizeof(T) bytes.
If blockSize > 32, allocate blockSize*sizeof(T) bytes.
*/
template <typename T, typename AccType, unsigned int blockSize, bool nIsPow2Greater1, size_t K, class Functor, class AggFunctor>
__global__ void reduceSinglePass(Functor functor, AccType aggInit, AggFunctor aggFunctor, AccType scale,
const functional::Shape full,
functional::Tensor<AccType> out,
functional::Array<functional::Tensor<T>, K> ins) {
int n = full.elements();
// Handle to thread block group
cg::thread_block cta = cg::this_thread_block();
AccType *sdata = SharedMemory<AccType>();
// perform first level of reduction,
// reading from global memory, writing to shared memory
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x * blockSize * 2 + threadIdx.x;
unsigned int gridSize = blockSize * 2 * gridDim.x;
AccType mySum = aggInit;
// we reduce multiple elements per thread. The number is determined by the
// number of active thread blocks (via gridDim). More blocks will result
// in a larger gridSize and therefore fewer elements per thread
while (i < n) {
mySum = aggFunctor(mySum, functional::applyWithCast<AccType>(functor, ins, i));
// ensure we don't read out of bounds -- this is optimized away for powerOf2
// sized arrays
if (nIsPow2Greater1 || i + blockSize < n)
mySum = aggFunctor(mySum, functional::applyWithCast<AccType>(functor, ins, i + blockSize));
i += gridSize;
}
// each thread puts its local sum into shared memory
sdata[tid] = mySum;
cg::sync(cta);
// do reduction in shared mem
if ((blockSize >= 512) && (tid < 256)) {
sdata[tid] = mySum = aggFunctor(mySum, sdata[tid + 256]);
}
cg::sync(cta);
if ((blockSize >= 256) && (tid < 128)) {
sdata[tid] = mySum = aggFunctor(mySum, sdata[tid + 128]);
}
cg::sync(cta);
if ((blockSize >= 128) && (tid < 64)) {
sdata[tid] = mySum = aggFunctor(mySum, sdata[tid + 64]);
}
cg::sync(cta);
cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta);
if (cta.thread_rank() < 32) {
// Fetch final intermediate sum from 2nd warp
if (blockSize >= 64)
mySum = aggFunctor(mySum, sdata[tid + 32]);
// reduce final warp using shuffle
for (int offset = tile32.size() / 2; offset > 0; offset /= 2) {
mySum = aggFunctor(mySum, tile32.shfl_down(mySum, offset));
}
}
if (blockSize >= 256)
{
if (tid < 128)
{
sdata[tid] = mySum = mySum + sdata[tid + 128];
}
__syncthreads();
}
if (blockSize >= 128)
{
if (tid < 64)
{
sdata[tid] = mySum = mySum + sdata[tid + 64];
}
__syncthreads();
}
if (tid < 32)
{
if (blockSize >= 64)
{
sdata[tid] = mySum = mySum + sdata[tid + 32];
}
if (blockSize >= 32)
{
sdata[tid] = mySum = mySum + sdata[tid + 16];
}
if (blockSize >= 16)
{
sdata[tid] = mySum = mySum + sdata[tid + 8];
}
if (blockSize >= 8)
{
sdata[tid] = mySum = mySum + sdata[tid + 4];
}
if (blockSize >= 4)
{
sdata[tid] = mySum = mySum + sdata[tid + 2];
}
if (blockSize >= 2)
{
sdata[tid] = mySum = mySum + sdata[tid + 1];
}
}
// write result for this block to global mem
if (cta.thread_rank() == 0)
out[blockIdx.x] = aggFunctor(out[blockIdx.x], mySum * scale); // aggFunctor?
}
template <unsigned int blockSize, bool nIsPow2, typename T, typename AccType, class Functor>
__device__ void
reduceBlocks(Functor f, T *g_idata, AccType *g_odata, unsigned int n)
{
extern __shared__ AccType sdata[];
static inline bool isPow2Greater1(unsigned int x) { // is power of two but also larger than 1, otherwise an out-of-bounds read occurs
return x > 1 && ((x & (x - 1)) == 0);
}
// perform first level of reduction,
// reading from global memory, writing to shared memory
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*(blockSize*2) + threadIdx.x;
unsigned int gridSize = blockSize*2*gridDim.x;
AccType mySum = 0;
static inline unsigned int nextPow2(unsigned int x) {
--x;
x |= x >> 1;
x |= x >> 2;
x |= x >> 4;
x |= x >> 8;
x |= x >> 16;
return ++x;
}
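// [editor's illustrative note, not in the original file] nextPow2 rounds up to
// the next power of two, e.g. nextPow2(300) == 512 and nextPow2(64) == 64;
// together with isPow2Greater1 this lets the launch wrapper below dispatch a
// matching template instantiation of the kernel.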
// we reduce multiple elements per thread. The number is determined by the
// number of active thread blocks (via gridDim). More blocks will result
// in a larger gridSize and therefore fewer elements per thread
while (i < n)
{
mySum += f((AccType)g_idata[i]);
////////////////////////////////////////////////////////////////////////////////
// Wrapper function for kernel launch
////////////////////////////////////////////////////////////////////////////////
template <typename T, typename AccType, size_t K, class Functor, class AggFunctor>
void reduceSinglePass(Functor functor, AccType aggInit, AggFunctor aggFunctor, AccType scale,
const functional::Shape full,
functional::Tensor<AccType> out,
functional::Array<functional::Tensor<T>, K> ins,
int threads, int blocks) {
int size = full.elements();
// when there is only one warp per block, we need to allocate two warps
// worth of shared memory so that we don't index shared memory out of bounds
int smemSize = (threads <= 32) ? 2 * threads * sizeof(AccType) : threads * sizeof(AccType);
dim3 dimBlock(threads, 1, 1);
dim3 dimGrid(blocks, 1, 1);
// ensure we don't read out of bounds -- this is optimized away for powerOf2 sized arrays
if (nIsPow2 || i + blockSize < n)
mySum += f((AccType)g_idata[i+blockSize]);
i += gridSize;
if (isPow2Greater1(size)) {
switch (threads) {
case 512:
reduceSinglePass<T, AccType, 512, true><<<dimGrid, dimBlock, smemSize>>>(functor, aggInit, aggFunctor, scale, full, out, ins);
break;
case 256:
reduceSinglePass<T, AccType, 256, true><<<dimGrid, dimBlock, smemSize>>>(functor, aggInit, aggFunctor, scale, full, out, ins);
break;
case 128:
reduceSinglePass<T, AccType, 128, true><<<dimGrid, dimBlock, smemSize>>>(functor, aggInit, aggFunctor, scale, full, out, ins);
break;
case 64:
reduceSinglePass<T, AccType, 64, true><<<dimGrid, dimBlock, smemSize>>>(functor, aggInit, aggFunctor, scale, full, out, ins);
break;
case 32:
reduceSinglePass<T, AccType, 32, true><<<dimGrid, dimBlock, smemSize>>>(functor, aggInit, aggFunctor, scale, full, out, ins);
break;
case 16:
reduceSinglePass<T, AccType, 16, true><<<dimGrid, dimBlock, smemSize>>>(functor, aggInit, aggFunctor, scale, full, out, ins);
break;
case 8:
reduceSinglePass<T, AccType, 8, true><<<dimGrid, dimBlock, smemSize>>>(functor, aggInit, aggFunctor, scale, full, out, ins);
break;
case 4:
reduceSinglePass<T, AccType, 4, true><<<dimGrid, dimBlock, smemSize>>>(functor, aggInit, aggFunctor, scale, full, out, ins);
break;
case 2:
reduceSinglePass<T, AccType, 2, true><<<dimGrid, dimBlock, smemSize>>>(functor, aggInit, aggFunctor, scale, full, out, ins);
break;
case 1:
reduceSinglePass<T, AccType, 1, true><<<dimGrid, dimBlock, smemSize>>>(functor, aggInit, aggFunctor, scale, full, out, ins);
break;
}
// do reduction in shared mem
reduceBlock<blockSize>(sdata, mySum, tid);
// write result for this block to global mem
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
// Global variable used by reduceSinglePass to count how many blocks have finished
__device__ unsigned int retirementCount = 0;
cudaError_t setRetirementCount(int retCnt)
{
return cudaMemcpyToSymbol(retirementCount, &retCnt, sizeof(unsigned int), 0, cudaMemcpyHostToDevice);
}
// This reduction kernel reduces an arbitrary size array in a single kernel invocation
// It does so by keeping track of how many blocks have finished. After each thread
// block completes the reduction of its own block of data, it "takes a ticket" by
// atomically incrementing a global counter. If the ticket value is equal to the number
// of thread blocks, then the block holding the ticket knows that it is the last block
// to finish. This last block is responsible for summing the results of all the other
// blocks.
//
// In order for this to work, we must be sure that before a block takes a ticket, all
// of its memory transactions have completed. This is what __threadfence() does -- it
// blocks until the results of all outstanding memory transactions within the
// calling thread are visible to all other threads.
//
// For more details on the reduction algorithm (notably the multi-pass approach), see
// the "reduction" sample in the CUDA SDK.
template <unsigned int blockSize, bool nIsPow2, typename T, typename AccType, class Functor>
__global__ void reduceSinglePass(Functor f, T *g_idata, AccType *g_odata, unsigned int n)
{
//
// PHASE 1: Process all inputs assigned to this block
//
reduceBlocks<blockSize, nIsPow2, T, AccType>(f, g_idata, g_odata, n);
//
// PHASE 2: Last block finished will process all partial sums
//
if (gridDim.x > 1)
{
const unsigned int tid = threadIdx.x;
__shared__ bool amLast;
extern AccType __shared__ smem[];
// wait until all outstanding memory instructions in this thread are finished
__threadfence();
// Thread 0 takes a ticket
if (tid==0)
{
unsigned int ticket = atomicInc(&retirementCount, gridDim.x);
// If the ticket ID is equal to the number of blocks, we are the last block!
amLast = (ticket == gridDim.x-1);
}
__syncthreads();
// The last block sums the results of all other blocks
if (amLast)
{
int i = tid;
AccType mySum = 0;
while (i < gridDim.x)
{
mySum += g_odata[i];
i += blockSize;
}
reduceBlock<blockSize>(smem, mySum, tid);
if (tid==0)
{
g_odata[0] = smem[0];
// reset retirement count so that next run succeeds
retirementCount = 0;
}
}
} else {
switch (threads) {
case 512:
reduceSinglePass<T, AccType, 512, false><<<dimGrid, dimBlock, smemSize>>>(functor, aggInit, aggFunctor, scale, full, out, ins);
break;
case 256:
reduceSinglePass<T, AccType, 256, false><<<dimGrid, dimBlock, smemSize>>>(functor, aggInit, aggFunctor, scale, full, out, ins);
break;
case 128:
reduceSinglePass<T, AccType, 128, false><<<dimGrid, dimBlock, smemSize>>>(functor, aggInit, aggFunctor, scale, full, out, ins);
break;
case 64:
reduceSinglePass<T, AccType, 64, false><<<dimGrid, dimBlock, smemSize>>>(functor, aggInit, aggFunctor, scale, full, out, ins);
break;
case 32:
reduceSinglePass<T, AccType, 32, false><<<dimGrid, dimBlock, smemSize>>>(functor, aggInit, aggFunctor, scale, full, out, ins);
break;
case 16:
reduceSinglePass<T, AccType, 16, false><<<dimGrid, dimBlock, smemSize>>>(functor, aggInit, aggFunctor, scale, full, out, ins);
break;
case 8:
reduceSinglePass<T, AccType, 8, false><<<dimGrid, dimBlock, smemSize>>>(functor, aggInit, aggFunctor, scale, full, out, ins);
break;
case 4:
reduceSinglePass<T, AccType, 4, false><<<dimGrid, dimBlock, smemSize>>>(functor, aggInit, aggFunctor, scale, full, out, ins);
break;
case 2:
reduceSinglePass<T, AccType, 2, false><<<dimGrid, dimBlock, smemSize>>>(functor, aggInit, aggFunctor, scale, full, out, ins);
break;
case 1:
reduceSinglePass<T, AccType, 1, false><<<dimGrid, dimBlock, smemSize>>>(functor, aggInit, aggFunctor, scale, full, out, ins);
break;
}
}
}
bool isPow2(unsigned int x)
{
return ((x&(x-1))==0);
}
template <typename T, typename AccType, class Functor>
void ReduceAll(Functor f, Tensor blockMem, Tensor in)
{
cudaSetDevice(in->getDeviceId().no);
int size = in->shape().elements();
int threads = std::min(MAX_THREADS, size);
int blocks = std::min(MAX_BLOCKS, size / threads + (size % threads != 0));
dim3 dimBlock(threads, 1, 1);
dim3 dimGrid(blocks, 1, 1);
int smemSize = threads * sizeof(AccType);
T* d_idata = in->data<T>();
AccType* d_odata = blockMem->data<AccType>();
// choose which of the optimized versions of reduction to launch
if (isPow2(size))
{
switch (threads)
{
case 512:
reduceSinglePass<512, true, T, AccType><<< dimGrid, dimBlock, smemSize >>>(f, d_idata, d_odata, size);
break;
case 256:
reduceSinglePass<256, true, T, AccType><<< dimGrid, dimBlock, smemSize >>>(f, d_idata, d_odata, size);
break;
case 128:
reduceSinglePass<128, true, T, AccType><<< dimGrid, dimBlock, smemSize >>>(f, d_idata, d_odata, size);
break;
case 64:
reduceSinglePass< 64, true, T, AccType><<< dimGrid, dimBlock, smemSize >>>(f, d_idata, d_odata, size);
break;
case 32:
reduceSinglePass< 32, true, T, AccType><<< dimGrid, dimBlock, smemSize >>>(f, d_idata, d_odata, size);
break;
case 16:
reduceSinglePass< 16, true, T, AccType><<< dimGrid, dimBlock, smemSize >>>(f, d_idata, d_odata, size);
break;
case 8:
reduceSinglePass< 8, true, T, AccType><<< dimGrid, dimBlock, smemSize >>>(f, d_idata, d_odata, size);
break;
case 4:
reduceSinglePass< 4, true, T, AccType><<< dimGrid, dimBlock, smemSize >>>(f, d_idata, d_odata, size);
break;
case 2:
reduceSinglePass< 2, true, T, AccType><<< dimGrid, dimBlock, smemSize >>>(f, d_idata, d_odata, size);
break;
case 1:
reduceSinglePass< 1, true, T, AccType><<< dimGrid, dimBlock, smemSize >>>(f, d_idata, d_odata, size);
break;
}
}
else
{
switch (threads)
{
case 512:
reduceSinglePass<512, false, T, AccType><<< dimGrid, dimBlock, smemSize >>>(f, d_idata, d_odata, size);
break;
case 256:
reduceSinglePass<256, false, T, AccType><<< dimGrid, dimBlock, smemSize >>>(f, d_idata, d_odata, size);
break;
case 128:
reduceSinglePass<128, false, T, AccType><<< dimGrid, dimBlock, smemSize >>>(f, d_idata, d_odata, size);
break;
case 64:
reduceSinglePass< 64, false, T, AccType><<< dimGrid, dimBlock, smemSize >>>(f, d_idata, d_odata, size);
break;
case 32:
reduceSinglePass< 32, false, T, AccType><<< dimGrid, dimBlock, smemSize >>>(f, d_idata, d_odata, size);
break;
case 16:
reduceSinglePass< 16, false, T, AccType><<< dimGrid, dimBlock, smemSize >>>(f, d_idata, d_odata, size);
break;
case 8:
reduceSinglePass< 8, false, T, AccType><<< dimGrid, dimBlock, smemSize >>>(f, d_idata, d_odata, size);
break;
case 4:
reduceSinglePass< 4, false, T, AccType><<< dimGrid, dimBlock, smemSize >>>(f, d_idata, d_odata, size);
break;
case 2:
reduceSinglePass< 2, false, T, AccType><<< dimGrid, dimBlock, smemSize >>>(f, d_idata, d_odata, size);
break;
case 1:
reduceSinglePass< 1, false, T, AccType><<< dimGrid, dimBlock, smemSize >>>(f, d_idata, d_odata, size);
break;
}
}
}
}
}
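A hypothetical helper (editor's sketch, not part of this commit) showing how a caller might derive the threads/blocks arguments the wrapper above expects, mirroring the heuristics of the removed ReduceAll code and NVIDIA's reduction sample; MAX_THREADS/MAX_BLOCKS-style caps are assumed:

// Pick a power-of-two thread count (each thread first reduces two elements)
// and cap the number of blocks; marian::nextPow2 is defined above.
static void chooseLaunchConfig(int size, int maxThreads, int maxBlocks,
                               int& threads, int& blocks) {
  threads = (size < maxThreads * 2) ? (int)marian::nextPow2((size + 1) / 2) : maxThreads;
  blocks = (size + threads * 2 - 1) / (threads * 2);
  if(blocks > maxBlocks)
    blocks = maxBlocks;
}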

src/3rd_party/sse_mathfun.h vendored Executable file → Normal file


@ -51,7 +51,7 @@ add_library(marian STATIC
tensors/cpu/sharp/int_gemm.cpp
tensors/cpu/sharp/avx_gemm.cpp
tensors/cpu/sharp/sse_gemm.cpp
tensors/cpu/sharp/packed_gemm.cpp
tensors/cpu/fbgemm/packed_gemm.cpp
graph/expression_graph.cpp
graph/expression_operators.cpp
@ -138,7 +138,8 @@ cuda_add_library(marian_cuda
tensors/gpu/algorithm.cu
tensors/gpu/prod.cpp
tensors/gpu/element.cu
tensors/gpu/add.cu
tensors/gpu/add.cu
tensors/gpu/add_all.cu
tensors/gpu/tensor_operators.cu
tensors/gpu/cudnn_wrappers.cu
translator/nth_element.cu


@ -4,7 +4,7 @@
#include <sstream>
#include "graph/expression_graph_packable.h"
#include "tensors/cpu/fbgemm/expression_graph_packable.h"
int main(int argc, char** argv) {
using namespace marian;
@ -19,16 +19,29 @@ int main(int argc, char** argv) {
"Convert a model in the .npz format and normal memory layout to a mmap-able binary model which could be in normal memory layout or packed memory layout",
"Allowed options",
"Examples:\n"
" ./marian-conv -f model.npz -t model.bin --gemm-type fp16packed");
" ./marian-conv -f model.npz -t model.bin --gemm-type packed16");
cli->add<std::string>("--from,-f", "Input model", "model.npz");
cli->add<std::string>("--to,-t", "Output model", "model.bin");
cli->add<std::string>("--gemm-type,-g", "GEMM Type to be used with this weights", "mklfp32");
cli->add<std::string>("--gemm-type,-g", "GEMM Type to be used: float32, packed16, packed8avx2, packed8avx512", "float32");
cli->parse(argc, argv);
options->merge(config);
}
auto modelFrom = options->get<std::string>("from");
auto modelTo = options->get<std::string>("to");
auto saveGemmType = options->get<std::string>("gemm-type");
auto saveGemmTypeStr = options->get<std::string>("gemm-type", "float32");
Type saveGemmType;
if(saveGemmTypeStr == "float32") {
saveGemmType = Type::float32;
} else if(saveGemmTypeStr == "packed16") { // packed16 only supports AVX2. AVX512 might be added later
saveGemmType = Type::packed16;
} else if(saveGemmTypeStr == "packed8avx2") { // packed8 for AVX2
saveGemmType = Type::packed8avx2;
} else if(saveGemmTypeStr == "packed8avx512") { // packed8 for AVX512
saveGemmType = Type::packed8avx512;
} else {
ABORT("Unknown gemm-type: {}", saveGemmTypeStr);
}
LOG(info, "Outputting {}", modelTo);


@ -1,3 +1,4 @@
#include <signal.h>
#include "marian.h"
#include "training/graph_group_async.h"
@ -68,5 +69,13 @@ int main(int argc, char** argv) {
}
}
return 0;
// If we exit due to SIGTERM, exit with 128 + the signal number, as suggested
// for bash in http://tldp.org/LDP/abs/html/exitcodes.html. This allows parent
// scripts to determine if training terminated naturally or via SIGTERM.
// With this approach we can accommodate additional signals in the future.
// An alternative would be to return 124, which is what the timeout command
// returns for timeout -s SIGTERM <seconds> ..., because exiting after SIGTERM
// is not technically a fatal error (which is what the 128+x convention usually
// stands for).
return getSigtermFlag() ? (128 + SIGTERM) : 0;
}
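For reference, a minimal sketch of the flag mechanism that getSigtermFlag() implies (illustrative only; the names here are hypothetical, and Marian's actual handler lives elsewhere in the training code):

#include <csignal>

static volatile std::sig_atomic_t sigtermFlag = 0;   // async-signal-safe flag type
static void handleSigterm(int) { sigtermFlag = 1; }  // the handler only sets the flag

// at start-up:          std::signal(SIGTERM, handleSigterm);
// in the training loop: if(sigtermFlag) { /* save checkpoint and stop */ }
// in main(), as above:  return sigtermFlag ? (128 + SIGTERM) : 0;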


@ -328,6 +328,8 @@ void ConfigParser::addOptionsTraining(cli::CLIWrapper& cli) {
"Optimization criterion: ce-mean, ce-mean-words, ce-sum, perplexity", "ce-mean");
cli.add<std::string>("--multi-loss-type",
"How to accumulate multi-objective losses: sum, scaled, mean", "sum");
cli.add<bool>("--unlikelihood-loss",
"Use word-level weights as indicators for sequence-level unlikelihood training");
cli.add<bool>("--overwrite",
"Do not create model checkpoints, only overwrite main model file with last checkpoint. "
"Reduces disk usage");
@ -502,7 +504,7 @@ void ConfigParser::addOptionsTraining(cli::CLIWrapper& cli) {
true);
// add ULR settings
addSuboptionsULR(cli);
addSuboptionsULR(cli);
cli.add<std::vector<std::string>>("--task",
"Use predefined set of options. Possible values: transformer, transformer-big");
@ -543,6 +545,8 @@ void ConfigParser::addOptionsValidation(cli::CLIWrapper& cli) {
"Allow unknown words to appear in output");
cli.add<bool>("--n-best",
"Generate n-best list");
cli.add<bool>("--word-scores",
"Print word-level scores");
// efficiency options
cli.add<int>("--valid-mini-batch",
@ -562,8 +566,10 @@ void ConfigParser::addOptionsValidation(cli::CLIWrapper& cli) {
"Additional args passed to --valid-script-path. These are inserted"
" between the script path and the output translation-file path");
cli.add<std::string>("--valid-translation-output",
"Path to store the translation");
"(Template for) path to store the translation. "
"E.g., validation-output-after-{U}-updates-{T}-tokens.txt. Template "
"parameters: {E} for epoch; {B} for No. of batches within epoch; "
"{U} for total No. of updates; {T} for total No. of tokens seen.");
cli.add<bool>("--keep-best",
"Keep best model for each validation metric");
cli.add<std::string>("--valid-log",
@ -603,6 +609,12 @@ void ConfigParser::addOptionsTranslation(cli::CLIWrapper& cli) {
cli.add<std::string>("--alignment",
"Return word alignment. Possible values: 0.0-1.0, hard, soft")
->implicit_val("1");
cli.add<bool>("--word-scores",
"Print word-level scores");
#ifdef USE_SENTENCEPIECE
cli.add<bool>("--no-spm-decode",
"Keep the output segmented into SentencePiece subwords");
#endif
addSuboptionsDevices(cli);
addSuboptionsInputLength(cli);
@ -612,7 +624,7 @@ void ConfigParser::addOptionsTranslation(cli::CLIWrapper& cli) {
"Optimize speed aggressively sacrificing memory or precision");
cli.add<bool>("--skip-cost",
"Ignore model cost during translation, not recommended for beam-size > 1");
cli.add<bool>("--fp16",
cli.add<bool>("--fp16",
"Shortcut for mixed precision inference with float16, corresponds to: --precision float16");
cli.add<std::vector<std::string>>("--precision",
"Mixed precision for inference, set parameter type in expression graph",
@ -626,8 +638,10 @@ void ConfigParser::addOptionsTranslation(cli::CLIWrapper& cli) {
"Noise output layer with gumbel noise",
false);
#if 0 // @TODO: Ask Hany if there are any decoding-time options
// add ULR settings
addSuboptionsULR(cli);
#endif
cli.switchGroup(previous_group);
// clang-format on
@ -737,29 +751,31 @@ void ConfigParser::addSuboptionsBatching(cli::CLIWrapper& cli) {
"Sorting strategy for maxi-batch: none, src, trg (not available for decoder)",
defaultMaxiBatchSort);
cli.add<bool>("--shuffle-in-ram",
"Keep shuffled corpus in RAM, do not write to temp file");
// @TODO: Consider making the next two options options of the vocab instead, to make it more local in scope.
cli.add<size_t>("--all-caps-every",
"When forming minibatches, preprocess every Nth line on the fly to all-caps. Assumes UTF-8");
cli.add<size_t>("--english-title-case-every",
"When forming minibatches, preprocess every Nth line on the fly to title-case. Assumes English (ASCII only)");
if(mode_ == cli::mode::training) {
cli.add<bool>("--shuffle-in-ram",
"Keep shuffled corpus in RAM, do not write to temp file");
// @TODO: Consider making the next two options options of the vocab instead, to make it more local in scope.
cli.add<size_t>("--all-caps-every",
"When forming minibatches, preprocess every Nth line on the fly to all-caps. Assumes UTF-8");
cli.add<size_t>("--english-title-case-every",
"When forming minibatches, preprocess every Nth line on the fly to title-case. Assumes English (ASCII only)");
cli.add<int>("--mini-batch-words-ref",
"If given, the following hyper parameters are adjusted as-if we had this mini-batch size: "
"--learn-rate, --optimizer-params, --exponential-smoothing, --mini-batch-warmup");
cli.add<std::string/*SchedulerPeriod*/>("--mini-batch-warmup",
"Linear ramp-up of MB size, up to this #updates (append 't' for up to this #target labels). "
"Auto-adjusted to --mini-batch-words-ref if given",
{"0"});
cli.add<bool>("--mini-batch-track-lr",
"Dynamically track mini-batch size inverse to actual learning rate (not considering lr-warmup)");
cli.add<size_t>("--mini-batch-overstuff",
"[experimental] Stuff this much more data into a minibatch, but scale down the LR and progress counter",
1);
cli.add<size_t>("--mini-batch-understuff",
"[experimental] Break each batch into this many updates",
1);
cli.add<int>("--mini-batch-words-ref",
"If given, the following hyper parameters are adjusted as-if we had this mini-batch size: "
"--learn-rate, --optimizer-params, --exponential-smoothing, --mini-batch-warmup");
cli.add<std::string/*SchedulerPeriod*/>("--mini-batch-warmup",
"Linear ramp-up of MB size, up to this #updates (append 't' for up to this #target labels). "
"Auto-adjusted to --mini-batch-words-ref if given",
{"0"});
cli.add<bool>("--mini-batch-track-lr",
"Dynamically track mini-batch size inverse to actual learning rate (not considering lr-warmup)");
cli.add<size_t>("--mini-batch-overstuff",
"[experimental] Stuff this much more data into a minibatch, but scale down the LR and progress counter",
1);
cli.add<size_t>("--mini-batch-understuff",
"[experimental] Break each batch into this many updates",
1);
}
// clang-format on
}
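The --valid-translation-output template above substitutes {E}, {B}, {U} and {T} at validation time. A self-contained illustration of such a substitution (editor's sketch with hypothetical names, not Marian's actual implementation):

#include <string>

// Replace every occurrence of `key` in `s` with `val`.
static void replaceAll(std::string& s, const std::string& key, const std::string& val) {
  for(size_t p = s.find(key); p != std::string::npos; p = s.find(key, p + val.size()))
    s.replace(p, key.size(), val);
}

std::string expandTemplate(std::string path, size_t E, size_t B, size_t U, size_t T) {
  replaceAll(path, "{E}", std::to_string(E));  // epoch
  replaceAll(path, "{B}", std::to_string(B));  // batches within epoch
  replaceAll(path, "{U}", std::to_string(U));  // total updates
  replaceAll(path, "{T}", std::to_string(T));  // total tokens seen
  return path;
}
// expandTemplate("valid-after-{U}-updates-{T}-tokens.txt", 2, 500, 10000, 2000000)
//   -> "valid-after-10000-updates-2000000-tokens.txt"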


@ -23,7 +23,7 @@ struct Convert {
// specialization for translating from string, @TODO check if this is required at all, mostly for compilation now.
template <typename To>
struct Convert<To, std::string> {
static inline To apply(const std::string& from) {
static inline To apply(const std::string& /* from */) {
ABORT("Not implemented");
}
};
@ -84,7 +84,10 @@ std::vector<T> As<std::vector<T>>::apply(const FastOpt& node) {
// specializations for simple vector types
template struct As<std::vector<bool>>;
template struct As<std::vector<int>>;
template struct As<std::vector<unsigned long>>;
// Windows and Unix-based OSes have different type definitions for 'unsigned long'.
// So we need an explicit instantiation for uint64_t; otherwise, there's a linking error on Windows.
// https://software.intel.com/en-us/articles/size-of-long-integer-type-on-different-architecture-and-os/
template struct As<std::vector<uint64_t>>;
template struct As<std::vector<float>>;
template struct As<std::vector<double>>;
template struct As<std::vector<std::string>>;


@ -154,6 +154,13 @@ private:
void makeScalar(const YAML::Node& v) {
elements_ = 0;
try {
// Cast the node to text first; that works for any scalar node. Then test that it does not consist of a single character
// that according to YAML could be a boolean value. Unfortunately, we do not have any type information at this point.
// This means we are disabling support for boolean values in YAML that are expressed with these characters.
auto asText = v.as<std::string>();
if(asText.size() == 1 && asText.find_first_of("nyNYtfTF") == 0) // @TODO: should we disallow other strings too?
throw YAML::BadConversion(YAML::Mark()); // gets picked up by the next catch block
value_ = v.as<bool>();
type_ = NodeType::Bool;
} catch(const YAML::BadConversion& /*e*/) {

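A small demonstration of the ambiguity the check above guards against (editor's sketch, not part of this commit): under yaml-cpp's YAML 1.1 boolean rules, a bare scalar like "y" converts to a boolean.

#include <yaml-cpp/yaml.h>
#include <iostream>

int main() {
  YAML::Node n = YAML::Load("y");
  std::cout << n.as<bool>() << "\n";         // prints 1: "y" silently parses as true
  std::cout << n.as<std::string>() << "\n";  // prints y: the reading FastOpt now prefers
  return 0;
}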
src/common/file_stream.cpp Normal file → Executable file


@ -1,5 +1,5 @@
#include "common/types.h"
#include "tensors/cpu/sharp/packed_gemm.h"
#include "tensors/cpu/fbgemm/packed_gemm.h"
namespace marian {
@ -8,13 +8,31 @@ namespace marian {
// But for opaque types like packed tensors, for instance, it cannot easily be inferred by
// multiplying. All cases are handled here and can later be passed to allocators etc.
size_t requiredBytes(const Shape& shape, Type type) {
if(isPacked(type)) {
uint64_t packsize;
cpu::variant::PackInfoFp32(shape, false, packsize);
return (size_t)packsize;
#if USE_FBGEMM
if (isPacked(type)) {
if (sizeOf(type) == 1) {
// type == Type::packed8avx2 || type == Type::packed8avx512
// AVX2 and AVX512 CPUs have different cache and vector lanes,
// so the optimal memory layouts for them are different.
int nrow, ncol;
uint64_t packsize;
cpu::variant::fbgemmPacked8PackInfo(shape, type, false, /*out=*/nrow, /*out=*/ncol, /*out=*/packsize);
return (size_t)packsize;
} else if (type == Type::packed16) {
uint64_t packsize;
cpu::variant::fbgemmPacked16PackInfo(shape, false, /*out=*/packsize);
return (size_t)packsize;
} else {
ABORT("Not a supported data type: {}", type);
return 0;
}
} else {
return shape.elements() * sizeOf(type);
}
#else
return shape.elements() * sizeOf(type);
#endif // USE_FBGEMM
}
}


@ -31,7 +31,7 @@
#include <cuda.h> // required to see CUDA_VERSION
#if (CUDA_VERSION > 9000)
#define COMPILE_FP16 1
#else
#else
#define COMPILE_FP16 0
#endif
#else
@ -135,13 +135,19 @@ do { \
namespace marian {
// small struct to enable templating based on types used for packing
struct packed8 {
struct packed16 {
uint16_t x;
};
// small struct to enable templating based on types used for packing. This is a memory holder.
// There's no difference between packed8avx2 and packed8avx512, but they are defined separately so they can be distinguished.
struct packed8avx2 {
uint8_t x;
};
// small struct to enable templating based on types used for packing
struct packed16 {
uint16_t x;
// small struct to enable templating based on types used for packing. This is a memory holder.
struct packed8avx512 {
uint8_t x;
};
#ifndef __CUDACC__ // vectorized types not available from .cu files
@ -174,6 +180,7 @@ public:
};
// @TODO: consider how code can be shared via templating
#ifdef __AVX__
struct float32x8 {
private:
__m256 f_;
@ -199,22 +206,35 @@ public:
return out;
}
};
#else
//Dummy version to get things to compile on older CPUs
struct float32x8 {
};
#endif
#endif
// Internal to types.h, don't use. Use test functions below.
enum class TypeClass : size_t {
signed_type = 0x100,
unsigned_type = 0x200,
float_type = 0x400,
packed_type = 0x800, // special packed (CPU cache friendly) type class, used in FBGEMM, not meant to be used anywhere else
size_mask = 0x0FF
signed_type = 0x0100,
unsigned_type = 0x0200,
float_type = 0x0400,
packed_type = 0x0800, // special packed (CPU cache friendly) type class, used in FBGEMM, not meant to be used anywhere else
avx2_type = 0x1000, // processor-specific layout for avx2, currently used for FBGEMM only
avx512_type = 0x2000, // processor-specific layout for avx512, currently used for FBGEMM only
size_mask = 0x00FF,
class_mask = 0xFF00
};
constexpr inline size_t operator+(TypeClass typeClass, size_t val) {
return (size_t)typeClass + val;
}
constexpr inline size_t operator+(size_t val, TypeClass typeClass) {
return val + (size_t)typeClass;
}
// @TODO: rename to ElementType when things become stable, so it's easier to review
enum class Type : size_t {
int8 = TypeClass::signed_type + 1u,
@ -231,14 +251,20 @@ enum class Type : size_t {
float32 = TypeClass::float_type + 4u,
float64 = TypeClass::float_type + 8u,
packed8 = TypeClass::packed_type + 1u, // special type for FBGEMM, not meant to be used anywhere else, not meant to be accessed individually. Internal actual type (uint8) is meaningless.
packed16 = TypeClass::packed_type + 2u // special type for FBGEMM, not meant to be used anywhere else, not meant to be accessed individually. Internal actual type (uint16) is meaningless.
packed16 = TypeClass::packed_type + 2u, // special type for FBGEMM, not meant to be used anywhere else, not meant to be accessed individually. Internal actual type (uint16) is meaningless.
packed8avx2 = TypeClass::packed_type + 1u + TypeClass::avx2_type, // special type for FBGEMM with AVX2, not meant to be used anywhere else, not meant to be accessed individually. Internal actual type (uint8) is meaningless.
packed8avx512 = TypeClass::packed_type + 1u + TypeClass::avx512_type, // special type for FBGEMM with AVX512, not meant to be used anywhere else, not meant to be accessed individually. Internal actual type (uint8) is meaningless.
};
static inline size_t operator&(TypeClass typeClass, Type type) {
return (size_t)typeClass & (size_t)type;
}
static inline bool isSameTypeClass(Type type1, Type type2) {
return (TypeClass::class_mask & type1) == (TypeClass::class_mask & type2);
}
static inline size_t sizeOf(Type type) {
return TypeClass::size_mask & type;
}
@ -263,6 +289,14 @@ static inline bool isPacked(Type type) {
return (TypeClass::packed_type & type) != 0;
}
static inline bool isAvx2(Type type) {
return (TypeClass::avx2_type & type) != 0;
}
static inline bool isAvx512(Type type) {
return (TypeClass::avx512_type & type) != 0;
}
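// [editor's illustrative note, not in the original file] e.g. Type::packed8avx512
// = 0x0800 + 1u + 0x2000 = 0x2801: sizeOf() yields 1 byte, isPacked() and
// isAvx512() hold, and isSameTypeClass(Type::packed8avx2, Type::packed8avx512)
// is false because class_mask (0xFF00) also covers the avx bits.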
size_t requiredBytes(const Shape& shape, Type type); // towards Frank's vision of joint Shape/Type
template <typename T>
@ -284,8 +318,9 @@ template <> inline bool matchType<float16>(Type type) { return type == Type::fl
template <> inline bool matchType<float>(Type type) { return type == Type::float32; }
template <> inline bool matchType<double>(Type type) { return type == Type::float64; }
template <> inline bool matchType<packed8>(Type type) { return type == Type::packed8; }
template <> inline bool matchType<packed16>(Type type) { return type == Type::packed16; }
template <> inline bool matchType<packed16>(Type type) { return type == Type::packed16; }
template <> inline bool matchType<packed8avx2>(Type type) { return type == Type::packed8avx2; }
template <> inline bool matchType<packed8avx512>(Type type) { return type == Type::packed8avx512; }
// clang-format on
static inline std::ostream& operator<<(std::ostream& out, Type type) {
@ -304,8 +339,9 @@ static inline std::ostream& operator<<(std::ostream& out, Type type) {
case Type::float32 : out << "float32"; break;
case Type::float64 : out << "float64"; break;
case Type::packed8 : out << "packed8"; break;
case Type::packed16: out << "packed16"; break;
case Type::packed16 : out << "packed16"; break;
case Type::packed8avx2 : out << "packed8avx2"; break;
case Type::packed8avx512 : out << "packed8avx512"; break;
}
return out;
}
@ -328,8 +364,9 @@ template <> inline std::string request<float16>() { return "float16"; }
template <> inline std::string request<float>() { return "float32"; }
template <> inline std::string request<double>() { return "float64"; }
template <> inline std::string request<packed8>() { return "packed8"; }
template <> inline std::string request<packed16>() { return "packed16"; }
template <> inline std::string request<packed8avx2>() { return "packed8avx2"; }
template <> inline std::string request<packed8avx512>() { return "packed8avx512"; }
// clang-format on
static Type inline typeFromString(const std::string& str) {
@ -357,6 +394,13 @@ static Type inline typeFromString(const std::string& str) {
return Type::float32;
if(str == "float64")
return Type::float64;
if(str == "packed16")
return Type::packed16;
if(str == "packed8avx2")
return Type::packed8avx2;
if(str == "packed8avx512")
return Type::packed8avx512;
ABORT("Unknown type {}", str);
}
@ -378,6 +422,10 @@ template <> inline Type typeId<float16>() { return Type::float16; }
template <> inline Type typeId<float>() { return Type::float32; }
template <> inline Type typeId<double>() { return Type::float64; }
template <> inline Type typeId<packed16>() { return Type::packed16; }
template <> inline Type typeId<packed8avx2>() { return Type::packed8avx2; }
template <> inline Type typeId<packed8avx512>() { return Type::packed8avx512; }
// Abort if given C++ does not correspond to runtime type
template <typename T>
void matchOrAbort(Type type) {


@ -8,9 +8,7 @@ namespace data {
WordAlignment::WordAlignment() {}
WordAlignment::WordAlignment(
const std::vector<Point>& align)
: data_(align) {}
WordAlignment::WordAlignment(const std::vector<Point>& align) : data_(align) {}
WordAlignment::WordAlignment(const std::string& line) {
std::vector<std::string> atok = utils::splitAny(line, " -");

src/data/corpus.cpp Normal file → Executable file

@ -13,17 +13,17 @@ namespace data {
Corpus::Corpus(Ptr<Options> options, bool translate /*= false*/)
: CorpusBase(options, translate),
shuffleInRAM_(options_->get<bool>("shuffle-in-ram")),
allCapsEvery_(options_->get<size_t>("all-caps-every")),
titleCaseEvery_(options_->get<size_t>("english-title-case-every")) {}
shuffleInRAM_(options_->get<bool>("shuffle-in-ram", false)),
allCapsEvery_(options_->get<size_t>("all-caps-every", 0)),
titleCaseEvery_(options_->get<size_t>("english-title-case-every", 0)) {}
Corpus::Corpus(std::vector<std::string> paths,
std::vector<Ptr<Vocab>> vocabs,
Ptr<Options> options)
: CorpusBase(paths, vocabs, options),
shuffleInRAM_(options_->get<bool>("shuffle-in-ram")),
allCapsEvery_(options_->get<size_t>("all-caps-every")),
titleCaseEvery_(options_->get<size_t>("english-title-case-every")) {}
shuffleInRAM_(options_->get<bool>("shuffle-in-ram", false)),
allCapsEvery_(options_->get<size_t>("all-caps-every", 0)),
titleCaseEvery_(options_->get<size_t>("english-title-case-every", 0)) {}
void Corpus::preprocessLine(std::string& line, size_t streamId) {
if (allCapsEvery_ != 0 && pos_ % allCapsEvery_ == 0 && !inference_) {
@ -235,11 +235,12 @@ CorpusBase::batch_ptr Corpus::toBatch(const std::vector<Sample>& batchVector) {
}
std::vector<size_t> words(maxDims.size(), 0);
for(size_t i = 0; i < batchSize; ++i) {
for(size_t j = 0; j < maxDims.size(); ++j) {
for(size_t k = 0; k < batchVector[i][j].size(); ++k) {
subBatches[j]->data()[k * batchSize + i] = batchVector[i][j][k];
subBatches[j]->mask()[k * batchSize + i] = 1.f;
for(size_t b = 0; b < batchSize; ++b) { // loop over batch entries
for(size_t j = 0; j < maxDims.size(); ++j) { // loop over streams
auto subBatch = subBatches[j];
for(size_t s = 0; s < batchVector[b][j].size(); ++s) { // loop over word positions
subBatch->data()[subBatch->locate(/*batchIdx=*/b, /*wordPos=*/s)/*s * batchSize + b*/] = batchVector[b][j][s];
subBatch->mask()[subBatch->locate(/*batchIdx=*/b, /*wordPos=*/s)/*s * batchSize + b*/] = 1.f;
words[j]++;
}
}


@ -1,6 +1,7 @@
#include <random>
#include "data/corpus.h"
#include "data/factored_vocab.h"
namespace marian {
namespace data {
@ -84,19 +85,19 @@ CorpusBase::CorpusBase(Ptr<Options> options, bool translate)
"Vocabularies will be built separately for each file.");
std::vector<int> vocabDims(paths_.size(), 0);
std::vector<std::string> vocabPaths(paths_.size());
std::vector<std::string> vocabPaths1(paths_.size());
// Create vocabs if not provided
for(size_t i = 0; i < paths_.size(); ++i) {
Ptr<Vocab> vocab = New<Vocab>(options_, i);
std::vector<std::string> trainPaths = { paths_[i] };
vocabDims[i] = vocab->loadOrCreate("", trainPaths, maxVocabs[i]);
vocabPaths[i] = paths_[i] + ".yml";
vocabDims[i] = (int) vocab->loadOrCreate("", trainPaths, maxVocabs[i]);
vocabPaths1[i] = paths_[i] + ".yml";
vocabs_.emplace_back(vocab);
}
// TODO: this is not nice as it modifies the option object and needs to expose the changes
// outside the corpus as models need to know about the vocabulary size; extract the vocab
// creation functionality from the class.
options_->set("dim-vocabs", vocabDims, "vocabs", vocabPaths);
options_->set("dim-vocabs", vocabDims, "vocabs", vocabPaths1);
} else {
// Load all vocabs
size_t numVocs = vocabPaths.size();
@ -128,7 +129,7 @@ CorpusBase::CorpusBase(Ptr<Options> options, bool translate)
// it will not be created again, but just correctly loaded.
auto pathsAndSize = groupVocab[vocabPaths[i]];
std::vector<std::string> groupedPaths(pathsAndSize.paths.begin(), pathsAndSize.paths.end());
vocabDims[i] = vocab->loadOrCreate(vocabPaths[i], groupedPaths, pathsAndSize.size);
vocabDims[i] = (int) vocab->loadOrCreate(vocabPaths[i], groupedPaths, pathsAndSize.size);
vocabs_.emplace_back(vocab);
}
// TODO: this is not nice as it modifies the option object and needs to expose the changes
@ -150,7 +151,7 @@ CorpusBase::CorpusBase(Ptr<Options> options, bool translate)
vocabDims.resize(numVocs, 0);
for(size_t i = 0; i + 1 < numVocs; ++i) {
Ptr<Vocab> vocab = New<Vocab>(options_, i);
vocabDims[i] = vocab->load(vocabPaths[i], maxVocabs[i]);
vocabDims[i] = (int) vocab->load(vocabPaths[i], maxVocabs[i]);
vocabs_.emplace_back(vocab);
}
// TODO: As above, this is not nice as it modifies the option object and needs to expose the changes
@ -240,10 +241,10 @@ void CorpusBase::addWeightsToSentenceTuple(const std::string& line, SentenceTupl
if(!elements.empty()) {
std::vector<float> weights;
for(auto& e : elements) {
if(maxLengthCrop_ && weights.size() > maxLength_)
for(auto& e : elements) { // Iterate weights as strings
if(maxLengthCrop_ && weights.size() >= maxLength_) // Cut if the input is going to be cut
break;
weights.emplace_back(std::stof(e));
weights.emplace_back(std::stof(e)); // Add a weight converted into float
}
if(rightLeft_)
@ -330,5 +331,54 @@ void CorpusBase::initEOS(bool training = true) {
}
}
// experimental: hide inline-fix source tokens from cross attention
std::vector<float> SubBatch::crossMaskWithInlineFixSourceSuppressed() const
{
const auto& srcVocab = *vocab();
auto factoredVocab = vocab()->tryAs<FactoredVocab>();
size_t inlineFixGroupIndex = 0, inlineFixSrc = 0;
auto hasInlineFixFactors = factoredVocab && factoredVocab->tryGetFactor(FactoredVocab_INLINE_FIX_WHAT_serialized, /*out*/ inlineFixGroupIndex, /*out*/ inlineFixSrc);
auto fixSrcId = srcVocab[FactoredVocab_FIX_SRC_ID_TAG];
auto fixTgtId = srcVocab[FactoredVocab_FIX_TGT_ID_TAG];
auto fixEndId = srcVocab[FactoredVocab_FIX_END_ID_TAG];
auto unkId = srcVocab.getUnkId();
auto hasInlineFixTags = fixSrcId != unkId && fixTgtId != unkId && fixEndId != unkId;
auto m = mask(); // default return value, which we will modify in-place below in case we need to
if (hasInlineFixFactors || hasInlineFixTags) {
LOG_ONCE(info, "[data] Suppressing cross-attention into inline-fix source tokens");
// example: force French translation of name "frank" to always be "franck"
// - hasInlineFixFactors: "frank|is franck|it", "frank|is" cannot be cross-attended to
// - hasInlineFixTags: "<IOPEN> frank <IDELIM> franck <ICLOSE>", "frank" and all tags cannot be cross-attended to
auto dimBatch = batchSize(); // number of sentences in the batch
auto dimWidth = batchWidth(); // number of words in the longest sentence in the batch
const auto& d = data();
size_t numWords = 0;
for (size_t b = 0; b < dimBatch; b++) { // loop over batch entries
bool inside = false;
for (size_t s = 0; s < dimWidth; s++) { // loop over source positions
auto i = locate(/*batchIdx=*/b, /*wordPos=*/s);
if (!m[i])
break;
numWords++;
// keep track of entering/exiting the inline-fix source tags
auto w = d[i];
if (w == fixSrcId)
inside = true;
else if (w == fixTgtId)
inside = false;
bool wHasSrcIdFactor = hasInlineFixFactors && factoredVocab->getFactor(w, inlineFixGroupIndex) == inlineFixSrc;
if (inside || w == fixSrcId || w == fixTgtId || w == fixEndId || wHasSrcIdFactor)
m[i] = 0.0f; // decoder must not look at embedded source, nor the markup tokens
}
}
ABORT_IF(batchWords() != 0/*n/a*/ && numWords != batchWords(), "batchWords() inconsistency??");
}
return m;
}
} // namespace data
} // namespace marian

src/data/corpus_base.h Normal file → Executable file

@ -143,12 +143,19 @@ public:
* words (width) and \f$s\f$ is the number of sentences (size).
*/
Words& data() { return indices_; }
const Words& data() const { return indices_; }
/**
* @brief compute flat index into data() and mask() vectors for given batch index and word index in sentence
*/
size_t locate(size_t batchIdx, size_t wordPos) const { return locate(batchIdx, wordPos, size_); }
static size_t locate(size_t batchIdx, size_t wordPos, size_t batchSize) { return wordPos * batchSize + batchIdx; }
/**
* @brief Flat masking vector; 0 is used for masked words.
*
* @see data()
*/
std::vector<float>& mask() { return mask_; }
const std::vector<float>& mask() const { return mask_; }
/**
* @brief Accessors to the vocab_ field.
@ -158,15 +165,15 @@ public:
/**
* @brief The number of sentences in the batch.
*/
size_t batchSize() { return size_; }
size_t batchSize() const { return size_; }
/**
* @brief The number of words in the longest sentence in the batch.
*/
size_t batchWidth() { return width_; };
size_t batchWidth() const { return width_; };
/**
* @brief The total number of words in the batch (not counting masked-out words).
*/
size_t batchWords() { return words_; }
size_t batchWords() const { return words_; }
/**
* @brief Splits the stream into sub-batches of equal size (except for last).
@ -179,7 +186,7 @@ public:
*
* @see marian::data::Batch::split(size_t n)
*/
std::vector<Ptr<SubBatch>> split(size_t n, size_t sizeLimit /*or SIZE_MAX*/) {
std::vector<Ptr<SubBatch>> split(size_t n, size_t sizeLimit /*or SIZE_MAX*/) const {
ABORT_IF(size_ == 0, "Encountered sub-batch size of 0");
auto size = std::min(size_, sizeLimit); // if limit is given then pretend the batch only has that many sentences
@ -191,26 +198,24 @@ public:
// determine actual width (=max length) of this sub-batch, which may be smaller than the overall max length
size_t subWidth = 0;
for(size_t j = 0; j < width_; ++j) {
for(size_t i = 0; i < subSize; ++i) {
if(mask_[j * size_ + (pos + i)] != 0)
if (subWidth < j + 1)
subWidth = j + 1;
for(size_t s = 0; s < width_; ++s) {
for(size_t b = 0; b < subSize; ++b) {
if(mask_[locate(/*batchIdx=*/pos + b, /*wordPos=*/s)] != 0) // s * size_ + (pos + b)
if (subWidth < s + 1)
subWidth = s + 1;
}
}
//if (subWidth < width_)
// LOG(info, "[data] sub-batch {} of {} wide batch has effective width of {}", pos / targetSize, width_, subWidth);
// create sub-batch
auto sb = New<SubBatch>(subSize, subWidth, vocab_);
size_t words = 0;
for(size_t j = 0; j < subWidth; ++j) {
for(size_t i = 0; i < subSize; ++i) {
sb->data()[j * subSize + i] = indices_[j * size_ + (pos + i)];
sb->mask()[j * subSize + i] = mask_[j * size_ + (pos + i)];
for(size_t s = 0; s < subWidth; ++s) {
for(size_t b = 0; b < subSize; ++b) {
sb->data()[locate(/*batchIdx=*/b, /*wordPos=*/s, /*batchSize=*/subSize)/*s * subSize + b*/] = indices_[locate(/*batchIdx=*/pos + b, /*wordPos=*/s)]; // s * size_ + (pos + b)
sb->mask()[locate(/*batchIdx=*/b, /*wordPos=*/s, /*batchSize=*/subSize)/*s * subSize + b*/] = mask_[locate(/*batchIdx=*/pos + b, /*wordPos=*/s)]; // s * size_ + (pos + b)
if(mask_[j * size_ + (pos + i)] != 0)
if(mask_[locate(/*batchIdx=*/pos + b, /*wordPos=*/s)/*s * size_ + (pos + b)*/] != 0)
words++;
}
}
@ -222,6 +227,9 @@ public:
}
void setWords(size_t words) { words_ = words; }
// experimental: hide inline-fix source tokens from cross attention
std::vector<float> crossMaskWithInlineFixSourceSuppressed() const;
};
/**
@ -231,7 +239,7 @@ public:
class CorpusBatch : public Batch {
protected:
std::vector<Ptr<SubBatch>> subBatches_;
std::vector<float> guidedAlignment_;
std::vector<float> guidedAlignment_; // [max source len, batch size, max target len] flattened
std::vector<float> dataWeights_;
public:
@ -302,7 +310,8 @@ public:
/**
* @brief Creates a batch filled with fake data. Used to determine the size of
* the batch object.
* the batch object. With guided-alignments and multiple encoders, those
* multiple source streams are expected to have the same lengths.
*
* @param lengths List of subbatch sizes.
* @param batchSize Number of sentences in the batch.
@ -335,6 +344,7 @@ public:
return batch;
if(options->get("guided-alignment", std::string("none")) != "none") {
// @TODO: if > 1 encoder, verify that all encoders have the same sentence lengths
std::vector<float> alignment(batchSize * lengths.front() * lengths.back(),
0.f);
batch->setGuidedAlignment(std::move(alignment));
@ -369,7 +379,7 @@ public:
// split each stream separately
for(auto batchStream : subBatches_) {
size_t i = 0; // index into split batch
for(auto splitSubBatch : batchStream->split(n, sizeLimit)) {
for(auto splitSubBatch : batchStream->split(n, sizeLimit)) { // splits a batch into pieces, can also change width
if(subs.size() <= i)
subs.resize(i + 1);
subs[i++].push_back(splitSubBatch); // this forms tuples across streams
@ -408,7 +418,7 @@ public:
size_t bi = i + pos;
for(size_t sid = 0; sid < srcWords; ++sid) {
for(size_t tid = 0; tid < trgWords; ++tid) {
size_t bidx = sid * oldSize * oldTrgWords + bi * oldTrgWords + tid;
size_t bidx = sid * oldSize * oldTrgWords + bi * oldTrgWords + tid; // [sid, bi, tid]
size_t idx = sid * dimBatch * trgWords + i * trgWords + tid;
aligns[idx] = guidedAlignment_[bidx];
}
@ -424,20 +434,19 @@ public:
if(!dataWeights_.empty()) {
size_t oldSize = size();
size_t width = 1;
// There are more weights than sentences, i.e. these are word weights.
if(dataWeights_.size() != oldSize)
width = subBatches_.back()->batchWidth();
for(auto split : splits) {
auto cb = std::static_pointer_cast<CorpusBatch>(split);
size_t width = 1; // One weight per sentence in case of sentence-level weights
if(dataWeights_.size() != oldSize) // if number of weights does not correspond to number of sentences we have word-level weights
width = cb->back()->batchWidth(); // splitting also affects width, hence we need to accommodate this here
std::vector<float> ws(width * split->size(), 1.0f);
// this needs to be split along the batch dimension
// which is here the innermost dimension.
// Should work for sentence-based weights, too.
for(size_t j = 0; j < width; ++j) {
for(size_t i = 0; i < split->size(); ++i) {
ws[j * split->size() + i] = dataWeights_[j * oldSize + i + pos];
for(size_t s = 0; s < width; ++s) {
for(size_t b = 0; b < split->size(); ++b) {
ws[s * split->size() + b] = dataWeights_[s * oldSize + b + pos]; // @TODO: use locate() as well
}
}
split->setDataWeights(ws);
@ -448,9 +457,13 @@ public:
return splits;
}
std::vector<float>& getGuidedAlignment() { return guidedAlignment_; }
const std::vector<float>& getGuidedAlignment() const { return guidedAlignment_; } // [dimSrcWords, dimBatch, dimTrgWords] flattened
void setGuidedAlignment(std::vector<float>&& aln) override {
guidedAlignment_ = std::move(aln);
}
size_t locateInGuidedAlignments(size_t b, size_t s, size_t t) {
return ((s * size()) + b) * widthTrg() + t;
}
std::vector<float>& getDataWeights() { return dataWeights_; }
@ -472,15 +485,14 @@ public:
std::cerr << std::endl;
}
size_t b = 0;
size_t subBatchIndex = 0;
for(auto sb : subBatches_) {
std::cerr << "batch " << b++ << ": " << std::endl;
std::cerr << "stream " << subBatchIndex++ << ": " << std::endl;
const auto& vocab = sb->vocab();
for(size_t i = 0; i < sb->batchWidth(); i++) {
for(size_t s = 0; s < sb->batchWidth(); s++) {
std::cerr << "\t w: ";
for(size_t j = 0; j < sb->batchSize(); j++) {
size_t idx = i * sb->batchSize() + j;
Word w = sb->data()[idx];
for(size_t b = 0; b < sb->batchSize(); b++) {
Word w = sb->data()[sb->locate(/*batchIdx=*/b, /*wordPos=*/s)]; // s * sb->batchSize() + b;
if (vocab && !printIndices)
std::cerr << (*vocab)[w] << " ";
else


@ -400,7 +400,7 @@ std::string FactoredVocab::word2string(Word word) const {
res.append("?");
}
else
res.append(factorVocab_[(WordIndex)(index + groupRanges_[g].first)]);
res.append(getFactorName(g, index));
}
return res;
}
@ -431,6 +431,21 @@ Word FactoredVocab::string2word(const std::string& w) const {
return word;
}
// does a specific factor exist in the vocabulary
// Factor name must be given without separator. This function cannot be used for lemmas.
bool FactoredVocab::tryGetFactor(const std::string& factorName, size_t& groupIndex, size_t& factorIndex) const {
WordIndex u;
if (factorVocab_.tryFind(factorSeparator_ + factorName, u))
{
groupIndex = factorGroups_[u];
ABORT_IF(u < groupRanges_[groupIndex].first || u >= groupRanges_[groupIndex].second, "Invalid factorGroups_ entry??");
factorIndex = u - groupRanges_[groupIndex].first;
return true;
}
else
return false;
}
// extract the factor index of a given factor type from the 'Word' representation
size_t FactoredVocab::getFactor(Word word, size_t groupIndex) const {
size_t index = word.toWordIndex();
@ -565,12 +580,18 @@ void FactoredVocab::constructNormalizationInfoForVocab() {
// decode a 'Word' array into the external string representation of that token sequence, as written to output files
/*virtual*/ std::string FactoredVocab::decode(const Words& sentence, bool ignoreEOS /*= true*/) const /*override final*/ {
std::vector<std::string> decoded;
decoded.reserve(sentence.size());
for(auto w : sentence) {
std::vector<std::string> decoded; decoded.reserve(sentence.size());
for(auto w : sentence)
if((w != getEosId() || !ignoreEOS))
decoded.push_back((*this)[w]);
}
return utils::join(decoded, " ");
}
// diagnostics version of decode() that will not fail on partial words, will print EOS, and is a little slower
std::string FactoredVocab::decodeForDiagnostics(const Words& sentence) const {
std::vector<std::string> decoded; decoded.reserve(sentence.size());
for (auto w : sentence)
decoded.push_back(word2string(w));
return utils::join(decoded, " ");
}
@ -740,7 +761,7 @@ Ptr<IVocab> createFactoredVocab(const std::string& vocabPath) {
static std::map<std::string, Ptr<IVocab>> s_cache;
auto iter = s_cache.find(vocabPath);
if (iter != s_cache.end()) {
LOG(info, "[vocab] Reusing existing vocabulary object in memory (vocab size {})", iter->second->size());
LOG_ONCE(info, "[vocab] Reusing existing vocabulary object in memory (vocab size {})", iter->second->size());
return iter->second;
}
auto vocab = New<FactoredVocab>();

src/data/factored_vocab.h Normal file → Executable file

@ -66,6 +66,9 @@ public:
bool canExpandFactoredWord(Word word, size_t groupIndex) const { return lemmaHasFactorGroup(getFactor(word, 0), groupIndex); }
size_t getFactor(Word word, size_t groupIndex) const;
bool lemmaHasFactorGroup(size_t factor0Index, size_t g) const { return lemmaHasFactorGroup_[factor0Index][g]; }
const std::string& getFactorGroupPrefix(size_t groupIndex) const { return groupPrefixes_[groupIndex]; } // for diagnostics only
const std::string& getFactorName(size_t groupIndex, size_t factorIndex) const { return factorVocab_[(WordIndex)(factorIndex + groupRanges_[groupIndex].first)]; }
std::string decodeForDiagnostics(const Words& sentence) const;
static constexpr size_t FACTOR_NOT_APPLICABLE = (SIZE_MAX - 1);
static constexpr size_t FACTOR_NOT_SPECIFIED = (SIZE_MAX - 2);
@ -74,6 +77,17 @@ public:
static Ptr<FactoredVocab> tryCreateAndLoad(const std::string& path); // load from "vocab" option if it specifies a factored vocab
std::string word2string(Word word) const;
Word string2word(const std::string& w) const;
bool tryGetFactor(const std::string& factorGroupName, size_t& groupIndex, size_t& factorIndex) const; // note: factorGroupName given without separator
// some hard-coded constants from FactoredSegmenter
// The naming mimics the names in FactoredSegmenter.cs, and therefore intentionally does not follow Marian conventions.
// @TODO: We have more hard-coded constants throughout the code. Move them all here.
// @TODO: figure out how to do this with static const*/constexpr
#define FactoredVocab_INLINE_FIX_WHAT_serialized "is"
#define FactoredVocab_FIX_SRC_ID_TAG "<IOPEN>"
#define FactoredVocab_FIX_TGT_ID_TAG "<IDELIM>"
#define FactoredVocab_FIX_END_ID_TAG "<ICLOSE>"
private:
void constructGroupInfoFromFactorVocab();
void constructFactorIndexConversion();

View File

@ -36,17 +36,18 @@ private:
std::mt19937 generator_;
std::uniform_int_distribution<int> randInt_; // from 0 to INT_MAX
// Keeps sentences segmented into subword units
bool keepEncoded_{false};
// Sample from one file, based on first algorithm from:
// https://en.wikipedia.org/wiki/Reservoir_sampling
void reservoirSampling(std::vector<std::string>& sample, size_t& seenLines,
const std::string& trainPath, size_t maxLines, size_t maxBytes) {
ABORT_IF(maxLines == 0, "Sample needs to be larger than 0");
std::unique_ptr<std::istream> trainStrm(
trainPath == "stdin" ? new std::istream(std::cin.rdbuf())
: new io::InputFileStream(trainPath)
);
std::unique_ptr<std::istream> trainStrm(trainPath == "stdin"
? new std::istream(std::cin.rdbuf())
: new io::InputFileStream(trainPath));
std::string line;
while(getline(*trainStrm, line)) {
@ -109,8 +110,10 @@ private:
public:
SentencePieceVocab(Ptr<Options> options, size_t batchIndex)
: options_(options), batchIndex_(batchIndex), generator_((uint32_t)Config::seed) {
: options_(options),
batchIndex_(batchIndex),
generator_((uint32_t)Config::seed),
keepEncoded_(options->get<bool>("no-spm-decode", false)) {
if(options_->has("sentencepiece-alphas")) {
auto alphas = options_->get<std::vector<float>>("sentencepiece-alphas");
if(alphas.size() <= batchIndex)
@ -221,11 +224,18 @@ public:
std::string decode(const Words& sentence, bool /*ignoreEOS*/) const override {
std::string line;
// convert vector of Word to vector of int
std::vector<int> spmSentence; spmSentence.reserve(sentence.size());
for (auto&& word : sentence)
spmSentence.push_back(word.toWordIndex());
spm_->Decode(spmSentence, &line);
if(keepEncoded_) { // i.e. keep the sentence segmented into subword units
for(const Word& id : sentence)
line += (*this)[id] + " ";
line.pop_back(); // trim the trailing whitespace
} else {
// convert vector of Word to vector of int
std::vector<int> spmSentence;
spmSentence.reserve(sentence.size());
for(auto&& word : sentence)
spmSentence.push_back(word.toWordIndex());
spm_->Decode(spmSentence, &line);
}
return line;
}
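// Illustration with hypothetical pieces {"▁Hel", "lo", "▁world"}:
//   keepEncoded_ == true  -> "▁Hel lo ▁world"  (subword units kept, space-joined)
//   keepEncoded_ == false -> "Hello world"     (spm_->Decode merges the pieces)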

src/data/vocab.cpp Normal file → Executable file

src/data/vocab_base.h Normal file → Executable file


@ -319,7 +319,7 @@ struct Ops<float32x4> {
} // end namespace functional
} // end namespace marian
#ifdef __AVX__
#include "3rd_party/avx_mathfun.h"
namespace marian {
@ -438,7 +438,7 @@ struct Ops<float32x8> {
} // end namespace functional
} // end namespace marian
#endif
#endif // of "#ifndef __CUDACC__"
#ifdef __CUDACC__
@ -600,4 +600,4 @@ BINARY(sPReLU, PReLU, Ops<ElementType>::prelu(x, y));
BINARY(sPReLUBack, PReLUback, Ops<ElementType>::preluBack(x, y));
} // end namespace functional
} // end namespace marian


@ -145,20 +145,20 @@ struct ConstantShape {
HOST_DEVICE_INLINE int elements() const { return (int)elements_; }
// The following functions iterate over shape dimensions and use resursive
// The following functions iterate over shape dimensions and use recursive
// templates. They unroll over a compile-time defined number of dimensions.
// Struct for recurrent template calls over shape dimensions,
// version for K > 0
template <const int K, const int D> struct I {
HOST_DEVICE_INLINE static int index(const Array<int, D>& dims,
const Array<int, D>& stride) {
const Array<int, D>& stride) {
return dims[K] * stride[K] + I<K-1, D>::index(dims, stride);
}
HOST_DEVICE_INLINE static int index(int si,
const Array<int, D>& shape,
const Array<int, D>& stride) {
const Array<int, D>& shape,
const Array<int, D>& stride) {
return (si % shape[K]) * stride[K] + I<K-1, D>::index(si / shape[K], shape, stride);
}
@ -175,19 +175,19 @@ struct ConstantShape {
// specialization for K == 0
template <const int D> struct I<0, D> {
HOST_DEVICE_INLINE static int index(const Array<int, D>& dims,
const Array<int, D>& stride) {
const Array<int, D>& stride) {
return dims[0] * stride[0];
}
HOST_DEVICE_INLINE static int index(int si,
const Array<int, D>& shape,
const Array<int, D>& stride) {
const Array<int, D>& shape,
const Array<int, D>& stride) {
return (si % shape[0]) * stride[0];
}
HOST_DEVICE_INLINE static void dims(int si,
Array<int, D>& dims,
const Array<int, D>& shape) {
Array<int, D>& dims,
const Array<int, D>& shape) {
dims[0] = si % shape[0];
}
};


@ -7,8 +7,8 @@
namespace marian {
namespace functional {
// By default for single valued types like float do nothing. Usually the number of elements in a tensor
// is correctly mirrored in the shape object. Only special multi-element types like float32x4 (4 floats),
// float32x8 (8 floats) and half2 (2 half) require special handling done by specializations below.
// Similar for multi-element integer types to be added later.
template <typename T>
@ -31,7 +31,7 @@ inline marian::Shape adapt<float32x4>(const marian::Shape& shape) {
x4Shape.set(-1, shape[-1] / 4);
return x4Shape;
}
#ifdef __AVX__
template <>
inline marian::Shape adapt<float32x8>(const marian::Shape& shape) {
ABORT_IF(shape[-1] % 8 != 0,
@ -42,7 +42,7 @@ inline marian::Shape adapt<float32x8>(const marian::Shape& shape) {
x8Shape.set(-1, shape[-1] / 8);
return x8Shape;
}
#endif
template <typename T, const int D>
@ -211,4 +211,4 @@ template <typename T>
using Tensor = View<T, CONST_SHAPE_DIMS>;
} // namespace functional
} // namespace marian


@ -9,136 +9,155 @@
namespace marian {
namespace functional {
template <size_t K, class Functor>
// This struct and its specializations are never used directly, only through apply and applyWithCast below.
template <size_t K, class Functor, typename AccType> // K-ary application of Functor, elements are cast to AccType before application of Functor
struct FApply {};
template <class Functor>
struct FApply<1, Functor> {
template <class Functor, typename AccType>
struct FApply<1, Functor, AccType> {
template <typename ElementType>
HOST_DEVICE_INLINE static ElementType apply(
HOST_DEVICE_INLINE static AccType apply(
Functor functor,
functional::Array<functional::Tensor<ElementType>, 1>& in,
const functional::Array<int, 1>& indices) {
return functor(in[0].data()[indices[0]]);
return functor((AccType)in[0].data()[indices[0]]); // indices is an array of offsets into multiple tensors, index[i] corresponds in[i] based on up to arity K
}
template <typename ElementType>
HOST_DEVICE_INLINE static ElementType apply(
HOST_DEVICE_INLINE static AccType apply(
Functor functor,
functional::Array<functional::Tensor<ElementType>, 1>& in,
int index) {
return functor(in[0].data()[index]);
return functor((AccType)in[0].data()[index]);
}
};
template <class Functor>
struct FApply<2, Functor> {
template <class Functor, typename AccType>
struct FApply<2, Functor, AccType> {
template <typename ElementType>
HOST_DEVICE_INLINE static ElementType apply(
HOST_DEVICE_INLINE static AccType apply(
Functor functor,
functional::Array<functional::Tensor<ElementType>, 2>& in,
const functional::Array<int, 2>& indices) {
return functor(in[0].data()[indices[0]],
in[1].data()[indices[1]]);
return functor((AccType)in[0].data()[indices[0]],
(AccType)in[1].data()[indices[1]]);
}
template <typename ElementType>
HOST_DEVICE_INLINE static ElementType apply(
HOST_DEVICE_INLINE static AccType apply(
Functor functor,
functional::Array<functional::Tensor<ElementType>, 2>& in,
int index) {
return functor(in[0].data()[index],
in[1].data()[index]);
return functor((AccType)in[0].data()[index],
(AccType)in[1].data()[index]);
}
};
template <class Functor>
struct FApply<3, Functor> {
template <class Functor, typename AccType>
struct FApply<3, Functor, AccType> {
template <typename ElementType>
HOST_DEVICE_INLINE static ElementType apply(
HOST_DEVICE_INLINE static AccType apply(
Functor functor,
functional::Array<functional::Tensor<ElementType>, 3>& in,
const functional::Array<int, 3>& indices) {
return functor(in[0].data()[indices[0]],
in[1].data()[indices[1]],
in[2].data()[indices[2]]);
return functor((AccType)in[0].data()[indices[0]],
(AccType)in[1].data()[indices[1]],
(AccType)in[2].data()[indices[2]]);
}
template <typename ElementType>
HOST_DEVICE_INLINE static ElementType apply(
HOST_DEVICE_INLINE static AccType apply(
Functor functor,
functional::Array<functional::Tensor<ElementType>, 3>& in,
int index) {
return functor(in[0].data()[index],
in[1].data()[index],
in[2].data()[index]);
return functor((AccType)in[0].data()[index],
(AccType)in[1].data()[index],
(AccType)in[2].data()[index]);
}
};
template <class Functor>
struct FApply<4, Functor> {
template <class Functor, typename AccType>
struct FApply<4, Functor, AccType> {
template <typename ElementType>
HOST_DEVICE_INLINE static ElementType apply(
HOST_DEVICE_INLINE static AccType apply(
Functor functor,
functional::Array<functional::Tensor<ElementType>, 4>& in,
const functional::Array<int, 4>& indices) {
return functor(in[0].data()[indices[0]],
in[1].data()[indices[1]],
in[2].data()[indices[2]],
in[3].data()[indices[3]]);
return functor((AccType)in[0].data()[indices[0]],
(AccType)in[1].data()[indices[1]],
(AccType)in[2].data()[indices[2]],
(AccType)in[3].data()[indices[3]]);
}
template <typename ElementType>
HOST_DEVICE_INLINE static ElementType apply(
HOST_DEVICE_INLINE static AccType apply(
Functor functor,
functional::Array<functional::Tensor<ElementType>, 4>& in,
int index) {
return functor(in[0].data()[index],
in[1].data()[index],
in[2].data()[index],
in[3].data()[index]);
return functor((AccType)in[0].data()[index],
(AccType)in[1].data()[index],
(AccType)in[2].data()[index],
(AccType)in[3].data()[index]);
}
};
template <class Functor>
struct FApply<5, Functor> {
template <class Functor, typename AccType>
struct FApply<5, Functor, AccType> {
template <typename ElementType>
HOST_DEVICE_INLINE static ElementType apply(
HOST_DEVICE_INLINE static AccType apply(
Functor functor,
functional::Array<functional::Tensor<ElementType>, 5>& in,
const functional::Array<int, 5>& indices) {
return functor(in[0].data()[indices[0]],
in[1].data()[indices[1]],
in[2].data()[indices[2]],
in[3].data()[indices[3]],
in[4].data()[indices[4]]);
return functor((AccType)in[0].data()[indices[0]],
(AccType)in[1].data()[indices[1]],
(AccType)in[2].data()[indices[2]],
(AccType)in[3].data()[indices[3]],
(AccType)in[4].data()[indices[4]]);
}
template <typename ElementType>
HOST_DEVICE_INLINE static ElementType apply(
HOST_DEVICE_INLINE static AccType apply(
Functor functor,
functional::Array<functional::Tensor<ElementType>, 5>& in,
int index) {
return functor(in[0].data()[index],
in[1].data()[index],
in[2].data()[index],
in[3].data()[index],
in[4].data()[index]);
return functor((AccType)in[0].data()[index],
(AccType)in[1].data()[index],
(AccType)in[2].data()[index],
(AccType)in[3].data()[index],
(AccType)in[4].data()[index]);
}
};
template <size_t K, class Functor, typename ElementType>
/******************************************************************************/
// Applying functor to sets of K tensors
template <typename ElementType, size_t K, class Functor>
HOST_DEVICE_INLINE ElementType apply(Functor functor,
functional::Array<functional::Tensor<ElementType>, K>& in,
const functional::Array<int, K>& indices) {
return FApply<K, Functor>::apply(functor, in, indices);
return FApply<K, Functor, ElementType>::apply(functor, in, indices); // functor is applied to same type as input ElementType, no casting required
}
template <size_t K, class Functor, typename ElementType>
template <typename ElementType, size_t K, class Functor>
HOST_DEVICE_INLINE ElementType apply(Functor functor,
functional::Array<functional::Tensor<ElementType>, K>& in,
int index) {
return FApply<K, Functor>::apply(functor, in, index);
return FApply<K, Functor, ElementType>::apply(functor, in, index); // functor is applied to same type as input ElementType, no casting required
}
template <typename AccType, typename ElementType, size_t K, class Functor>
HOST_DEVICE_INLINE AccType applyWithCast(Functor functor,
functional::Array<functional::Tensor<ElementType>, K>& in,
const functional::Array<int, K>& indices) {
return FApply<K, Functor, AccType>::apply(functor, in, indices); // ElementType and AccType are potentially different, cast to AccType before applying functor.
// This is useful when accumulating e.g. 16-bit into 32-bit and we want to cast to 32-bit before
// the functor is applied. L2-Norm is a good use-case since the square can be large.
}
template <typename AccType, typename ElementType, size_t K, class Functor>
HOST_DEVICE_INLINE AccType applyWithCast(Functor functor,
functional::Array<functional::Tensor<ElementType>, K>& in,
int index) {
return FApply<K, Functor, AccType>::apply(functor, in, index); // ElementType and AccType are potentially different, cast to AccType before applying functor
}
/******************************************************************************/
@ -180,7 +199,7 @@ struct Loop<1, N, K> {
for(size_t j = 0; j < K; ++j) {
acc[j] = pAcc[j] + (dim[N - 1] + i) * in[j].shape().bstride(N - 1);
}
agg = aggFunctor(agg, (AccType)apply<K>(functor, in, acc));
agg = aggFunctor(agg, applyWithCast<AccType>(functor, in, acc));
}
return agg;
}


@ -354,7 +354,7 @@ public:
const Ptr<inits::NodeInitializer>& init,
const Type elementType,
bool fixed = false) {
// since this param is called with out a specified type, we assume defaultElementType but allow to check for a different type
// this param is called with a specified type
return param(pname, shape, init, elementType, fixed, /*typeSpecified=*/true);
}
@ -362,7 +362,7 @@ public:
const Shape& shape,
const Ptr<inits::NodeInitializer>& init,
bool fixed = false) {
// since this param is called with out a specified type, we assume defaultElementType but allow to check for a different type
// since this param is called without a specified type, we assume defaultElementType but allow to check for a different type
return param(pname, shape, init, defaultElementType_, fixed, /*typeSpecified=*/false);
}
@ -497,7 +497,12 @@ public:
// skip over special parameters starting with "special:"
if(pName.substr(0, 8) == "special:")
continue;
param(pName, item.shape, inits::fromItem(item), item.type, /*fixed=*/false);
// if during loading the loaded type is of the same type class as the default element type, allow conversion;
// otherwise keep the loaded type. This is used when e.g. loading a float32 model as a float16 model as both
// have type class TypeClass::float_type.
auto loadElementType = isSameTypeClass(item.type, defaultElementType_) ? defaultElementType_ : item.type;
param(pName, item.shape, inits::fromItem(item), loadElementType, /*fixed=*/false);
}
if(markReloaded)
setReloaded(true);
@ -531,7 +536,7 @@ public:
auto defaultParams = std::dynamic_pointer_cast<MappedParameters>(it->second);
if(!defaultParams) {
// but it's not mapped, so delete it and replace it with a mapped version
auto defaultParams = New<MappedParameters>(defaultElementType_);
defaultParams = New<MappedParameters>(defaultElementType_);
defaultParams->init(backend_);
paramsByElementType_[defaultElementType_] = defaultParams;
}
@ -540,8 +545,8 @@ public:
// pre-populate parameters by type
for(auto& item : items) {
auto it = paramsByElementType_.find(item.type);
if(it == paramsByElementType_.end()) {
auto it1 = paramsByElementType_.find(item.type);
if(it1 == paramsByElementType_.end()) {
auto params = New<MappedParameters>(item.type);
params->init(backend_);
paramsByElementType_.insert({item.type, params});


@ -7,7 +7,7 @@
#include "graph/auto_tuner.h"
#include "tensors/cpu/int16.h"
#include "tensors/cpu/expanded_gemm.h"
#include "tensors/cpu/fbgemm/expanded_gemm.h"
#if USE_FBGEMM
#include "fbgemm/Utils.h"
@ -284,11 +284,6 @@ Expr stopGradient(Expr a) {
return res;
}
Expr constant_like(Expr a, const Ptr<inits::NodeInitializer>& init) {
auto graph = a->graph();
return graph->constant(a->shape(), init, a->value_type());
}
// gather() -- gather arbitrary elements along an axis; batched or non-batched
Expr gather(Expr a, int axis, Expr indices) {
return Expression<GatherNodeOp>(a, axis, indices);
@ -317,6 +312,7 @@ Expr index_select(Expr a, int axis, Expr indices) {
indices = reshape(indices, shape); // move index to axis
return gather(a, axis, indices);
}
Expr index_select(Expr a, int axis, const std::vector<IndexType>& indices) {
auto indexExpr = a->graph()->indices(indices);
return index_select(a, axis, indexExpr);
@ -355,35 +351,51 @@ Expr slice(Expr a, int axis, Slice slice) { // numpy __getslice__ semantics, but
}
Expr sum(Expr a, int ax) {
if(a->shape()[ax] == 1) // nothing to reduce, sum of itself is a
return a;
return Expression<ReduceNodeOp>(a, ax, ReduceNodeOpCode::sum);
}
Expr mean(Expr a, int ax) {
if(a->shape()[ax] == 1) // nothing to reduce, mean of itself is a
return a;
return Expression<ReduceNodeOp>(a, ax, ReduceNodeOpCode::mean);
}
Expr std(Expr a, int ax) {
return Expression<ReduceNodeOp>(a - mean(a,ax), ax, ReduceNodeOpCode::rms);
if(a->shape()[ax] == 1) // nothing to reduce, std(a) = 0
return a - a;
return Expression<ReduceNodeOp>(a - mean(a, ax), ax, ReduceNodeOpCode::rms);
}
Expr var(Expr a, int ax) {
if(a->shape()[ax] == 1) // nothing to reduce, var(a) = 0
return a - a;
return Expression<ReduceNodeOp>(a - mean(a, ax), ax, ReduceNodeOpCode::meanSqr);
}
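// Note on the `a - a` idiom above: it yields a zero tensor of the correct shape
// that stays connected to the graph (a fresh zero constant would not), so shape
// inference and backprop wiring remain intact when the reduction axis is trivial.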
Expr max(Expr a, int ax) {
if(a->shape()[ax] == 1) // nothing to reduce, max of itself is a
return a;
return Expression<ReduceNodeOp>(a, ax, ReduceNodeOpCode::max);
}
Expr min(Expr a, int ax) {
if(a->shape()[ax] == 1) // nothing to reduce, min of itself is a
return a;
return Expression<ReduceNodeOp>(a, ax, ReduceNodeOpCode::min);
}
Expr prod(Expr a, int ax) {
if(a->shape()[ax] == 1) // nothing to reduce, prod of itself is a
return a;
return Expression<ReduceNodeOp>(a, ax, ReduceNodeOpCode::prod);
}
// log(sum(exp(a)))
Expr logsumexp(Expr a, int ax) {
if(a->shape()[ax] == 1) // nothing to reduce, log(sum(exp(a))) = log(exp(a)) = a
return a;
return Expression<ReduceNodeOp>(a, ax, ReduceNodeOpCode::logSumExp);
}
@ -400,17 +412,50 @@ Expr weighted_average(Expr in, Expr weights, int ax) {
Expr dot(Expr a, Expr b, bool transA, bool transB, float scale) {
auto device = a->graph()->getDeviceId().type;
float clipValue = a->graph()->getBackend()->getClip();
// added support for packed GEMM API (fp16, int8)
Type aElementType = a->value_type();
Type bElementType = b->value_type();
// Currently only true when command line options
// --optimize --cpu-thread=N with N > 0 are set.
if(device == DeviceType::cpu && a->graph()->getBackend()->isOptimized()) {
// dotInt16 computes A * B.T, hence the transpose for B to get A * B
// if transA = false and transB = false.
if(device == DeviceType::cpu) {
if(isFloat(aElementType) && isFloat(bElementType)) {
if(a->graph()->getBackend()->isOptimized()) {
// dotInt16 computes A * B.T, hence the transpose for B to get A * B
// if transA = false and transB = false.
return cpu::int16::dot(
cpu::int16::quantize(transA ? transpose(a) : a, clipValue),
cpu::int16::quantize(transB ? b : transpose(b), clipValue),
scale);
return cpu::int16::dot(
cpu::int16::quantize(transA ? transpose(a) : a, clipValue),
cpu::int16::quantize(transB ? b : transpose(b), clipValue),
scale);
} else {
return Expression<DotNodeOp>(
clip(a, clipValue), clip(b, clipValue), transA, transB, scale);
}
} else if(isFloat(aElementType) && isPacked(bElementType)) {
#if USE_FBGEMM
// 07/10/2019 - Use packed GEMM only if the cpu architecture supports AVX2,
// as checked by one of fbgemm's sub-modules, cpuinfo (https://github.com/pytorch/cpuinfo).
// It looks at the cpu register
// (https://github.com/pytorch/cpuinfo/blob/master/src/x86/isa.c#L391),
// and this cpu lookup is executed only once and the state is kept in FBGEMM.
if(fbgemm::fbgemmHasAvx2Support()) {
// This variant of dot product can handle matrix multiplications with packed8 and packed16 weight matrix (B).
return cpu::variant::dot(clip(a, clipValue),
b,
b->shape(),
transA,
transB,
scale);
} else {
ABORT("AVX2 is not available. At least, AVX2 is needed to use fbgemm-based packed GEMM");
}
#else
ABORT("Packed GEMM is not available in this build");
#endif // USE_FBGEMM
} else {
ABORT("Combination of types A: {} B: {} not supported", aElementType, bElementType);
}
} else {
return Expression<DotNodeOp>(
clip(a, clipValue), clip(b, clipValue), transA, transB, scale);
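// Dispatch summary for the CPU branches above (non-CPU devices fall through to
// the plain DotNodeOp in the else branch):
//   float A, float B, backend optimized   -> int16 GEMM (quantize both operands)
//   float A, float B, not optimized       -> plain DotNodeOp
//   float A, packed B, AVX2 available     -> fbgemm packed GEMM (packed8/packed16)
//   any other type combination            -> abort with a type error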
@ -469,6 +514,7 @@ Expr affine(Expr a, Expr b, Expr bias, bool transA, bool transB, float scale) {
// (https://github.com/pytorch/cpuinfo/blob/master/src/x86/isa.c#L391),
// and this cpu lookup is executed only once and the state is kept in FBGEMM.
if(fbgemm::fbgemmHasAvx2Support()) {
// This variant of affine product can handle matrix multiplications with packed8 and packed16 weight matrix (B).
return cpu::variant::affine(clip(a, clipValue),
b,
b->shape(),
@ -477,7 +523,7 @@ Expr affine(Expr a, Expr b, Expr bias, bool transA, bool transB, float scale) {
transB,
scale);
} else {
ABORT("No on-the-fly packing at the moment");
ABORT("AVX2 is not available. At least, AVX2 is needed to use fbgemm-based packed GEMM");
}
#else
ABORT("Packed GEMM is not available in this build");
@ -562,8 +608,20 @@ Expr cast(Expr a, Type type) {
}
}
Expr cross_entropy(Expr a, Expr indices) {
return Expression<CrossEntropyNodeOp>(a, indices);
Expr cross_entropy(Expr logits, Expr indices) {
return Expression<CrossEntropyNodeOp>(logits, indices);
}
// Unlikelihood loss based on https://arxiv.org/abs/1908.04319
Expr unlikelihood(Expr logits, Expr indices) {
int dimBatch = logits->shape()[-2];
int dimTime = logits->shape()[-3];
// @TODO: fix this outside of this function in decoder.h etc.
auto indicesWithLayout = reshape(indices, {1, dimTime, dimBatch, 1});
// This is currently implemented with multiple ops, might be worth doing a special operation like for cross_entropy
return -log(gather(1.f - softmax(logits), /*axis=*/-1, indicesWithLayout));
}
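// In formula form, the expression above computes, per target position t,
//   UL(t) = -log(1 - p(y_t | y_<t, x))
// where p is the softmax over the logits and y_t the token picked by 'indices',
// pushing probability mass away from the gathered (negative) tokens.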
Expr plus(const std::vector<Expr>& nodes) {


@ -141,7 +141,17 @@ Expr atleast_4d(Expr a);
Expr atleast_nd(Expr a, size_t dims);
// create a constant of shape a->shape() and initialize with init
Expr constant_like(Expr a, const Ptr<inits::NodeInitializer>& init);
// @TODO: add a && version, to avoid a ref count. NodeInitializers are typically temps.
// @TODO: and/or make this a template on init
static inline Expr constant_like(Expr a, const Ptr<inits::NodeInitializer>& init) {
return a->graph()->constant(a->shape(), init, a->value_type());
}
// short-cut to init from std::vector, since we do this so often
template<typename ElementType>
Expr constant_like(Expr a, const std::vector<ElementType>& v) { return constant_like(a, inits::fromVector(v)); }
template<typename ElementType>
Expr constant_like(Expr a, std::vector<ElementType>&& v) { return constant_like(a, inits::fromVector(std::move(v))); }
Expr flatten(Expr a);
Expr flatten_2d(Expr a);
@ -200,6 +210,8 @@ Expr logsoftmax(Expr a);
Expr cross_entropy(Expr a, Expr b);
Expr unlikelihood(Expr a, Expr b);
Expr scalar_product(Expr a, Expr b, int ax = 0);
Expr weighted_average(Expr in, Expr weights, int ax = 0);


@ -145,10 +145,20 @@ Ptr<NodeInitializer> fromVector(const std::vector<T>& v) {
return fromLambda([v](Tensor t) { t->set(v.data(), v.data() + v.size()); }, typeId<T>());
}
template <typename T>
Ptr<NodeInitializer> fromVector(std::vector<T>&& v) {
return fromLambda([v = std::move(v)](Tensor t) { t->set(v.data(), v.data() + v.size()); }, typeId<T>());
}
template Ptr<NodeInitializer> fromVector<float16>(const std::vector<float16>& v);
template Ptr<NodeInitializer> fromVector<float>(const std::vector<float>& v);
template Ptr<NodeInitializer> fromVector<IndexType>(const std::vector<IndexType>& v);
// @TODO: can we remove the const& ones above? They always make a copy anyways, and often from a temp
template Ptr<NodeInitializer> fromVector<float16> (std::vector<float16> && v);
template Ptr<NodeInitializer> fromVector<float> (std::vector<float> && v);
template Ptr<NodeInitializer> fromVector<IndexType>(std::vector<IndexType>&& v);
Ptr<NodeInitializer> fromSparseVector(std::pair<std::vector<size_t>, std::vector<float>>& v) {
return fromLambda([v](Tensor t) { t->set(1e-6); t->setSparse(v.first, v.second); });
}

src/graph/node_initializers.h Normal file → Executable file

@ -143,6 +143,8 @@ Ptr<NodeInitializer> gumbel(float eps = 1e-5f);
// @TODO: add documentation
template <typename T>
Ptr<NodeInitializer> fromVector(const std::vector<T>& v);
template <typename T>
Ptr<NodeInitializer> fromVector(std::vector<T>&& v);
// @TODO: add documentation
Ptr<NodeInitializer> fromSparseVector(std::pair<std::vector<size_t>, std::vector<float>>& v);

src/graph/node_operators_binary.h Normal file → Executable file

@ -63,7 +63,6 @@ public:
// df/dB += alpha * dot(op(A).T, D)
// beta set to 1.0 in gemm, C = alpha * dot(op(A), op(B)) + beta * C
// to sum gradients from different graph parts
if(!transA_ && transB_)
return {NodeOp(Prod(child(0)->grad(),
adj_,
@ -130,6 +129,29 @@ public:
const std::string type() override { return "dot"; }
virtual size_t hash() override {
size_t seed = NaryNodeOp::hash();
util::hash_combine(seed, transA_);
util::hash_combine(seed, transB_);
util::hash_combine(seed, scalar_);
return seed;
}
virtual bool equal(Expr node) override {
if(!NaryNodeOp::equal(node))
return false;
auto cnode = std::dynamic_pointer_cast<DotNodeOp>(node);
if(!cnode)
return false;
if(transA_ != cnode->transA_)
return false;
if(transB_ != cnode->transB_)
return false;
if(scalar_ != cnode->scalar_)
return false;
return true;
}
const std::string color() override { return "orange"; }
};
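// A minimal sketch of the boost-style combiner that util::hash_combine is
// assumed to implement here (the exact marian helper may differ). Mixing
// transA_, transB_ and scalar_ into the seed keeps nodes that differ only in
// their GEMM configuration from hashing equal and being merged by the graph's
// common-subexpression elimination.
#include <cstddef>
#include <functional>
template <class T>
void hashCombineSketch(std::size_t& seed, const T& v) {
  seed ^= std::hash<T>()(v) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
}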
@ -274,6 +296,30 @@ public:
}
const std::string type() override { return "affine"; }
virtual size_t hash() override {
size_t seed = NaryNodeOp::hash();
util::hash_combine(seed, transA_);
util::hash_combine(seed, transB_);
util::hash_combine(seed, scalar_);
return seed;
}
virtual bool equal(Expr node) override {
if(!NaryNodeOp::equal(node))
return false;
auto cnode = std::dynamic_pointer_cast<AffineNodeOp>(node);
if(!cnode)
return false;
if(transA_ != cnode->transA_)
return false;
if(transB_ != cnode->transB_)
return false;
if(scalar_ != cnode->scalar_)
return false;
return true;
}
};
class DotBatchedNodeOp : public NaryNodeOp {
@ -402,6 +448,29 @@ public:
const std::string type() override { return "bdot"; }
virtual size_t hash() override {
size_t seed = NaryNodeOp::hash();
util::hash_combine(seed, transA_);
util::hash_combine(seed, transB_);
util::hash_combine(seed, scalar_);
return seed;
}
virtual bool equal(Expr node) override {
if(!NaryNodeOp::equal(node))
return false;
auto cnode = std::dynamic_pointer_cast<DotBatchedNodeOp>(node);
if(!cnode)
return false;
if(transA_ != cnode->transA_)
return false;
if(transB_ != cnode->transB_)
return false;
if(scalar_ != cnode->scalar_)
return false;
return true;
}
const std::string color() override { return "orange"; }
};
@ -443,18 +512,42 @@ public:
}
NodeOps backwardOps() override {
return {nullptr, // can't backprop into the sparse matrix (the gradient is dense)
nullptr,
nullptr,
NodeOp(CSRProd(child(3)->grad(), // child(3) = D
graph()->allocator(),
child(0)->val(), child(1)->val(), child(2)->val(), // children(0..2) = A
adj_,
/*transS=*/!transS_, /*swapOperands=*/swapOperands_, /*beta=*/1))};
return { nullptr, // can't backprop into the sparse matrix (the gradient is dense)
nullptr,
nullptr,
NodeOp(CSRProd(child(3)->grad(), // child(3) = D
graph()->allocator(),
child(0)->val(), child(1)->val(), child(2)->val(), // children(0..2) = A
adj_,
/*transS=*/!transS_, /*swapOperands=*/swapOperands_, /*beta=*/1))};
}
const std::string type() override { return "csr_dot"; }
virtual size_t hash() override {
size_t seed = NaryNodeOp::hash();
for(auto s : shape())
util::hash_combine(seed, s);
util::hash_combine(seed, transS_);
util::hash_combine(seed, swapOperands_);
return seed;
}
virtual bool equal(Expr node) override {
if(!NaryNodeOp::equal(node))
return false;
auto cnode = std::dynamic_pointer_cast<CSRDotNodeOp>(node);
if(!cnode)
return false;
if(transS_ != cnode->transS_)
return false;
if(shape() != cnode->shape())
return false;
if(swapOperands_ != cnode->swapOperands_)
return false;
return true;
}
const std::string color() override { return "orange"; }
};
@ -569,8 +662,6 @@ struct RowsNodeOp : public NaryNodeOp {
// out[i][j][k] = input[i][index[i][j][k]][k] # if dim == 1
// out[i][j][k] = input[i][j][index[i][j][k]] # if dim == 2
// 'a' and 'indices' must have the same rank.
// @TODO: The current implementation does not support batched indices (third scenario above).
// I.e. all axes of 'indices' except 'axis' must have dimension 1.
struct GatherNodeOp : public NaryNodeOp {
GatherNodeOp(Expr a, int axis, Expr indices)
: NaryNodeOp({a, indices}, newShape(a, axis, indices), a->value_type()),
@ -599,10 +690,6 @@ struct GatherNodeOp : public NaryNodeOp {
if (i != axis) {
ABORT_IF(indices->shape()[i] != shape[i] && indices->shape()[i] != 1,
"Dimensions must match or broadcast for input ({}) and indices ({})", std::string(shape), std::string(indices->shape()));
#if 1 // presently, this implementation does not support batched indices
ABORT_IF(indices->shape()[i] != 1,
"Presently, gather() does not implement batched indices");
#endif
}
}
return shape;
@ -865,7 +952,9 @@ struct MinimumNodeOp : public ElementBinaryNodeOp {
struct CmpNodeOp : public ElementBinaryNodeOp {
CmpNodeOp(Expr a, Expr b, int cmp_, bool not_) : ElementBinaryNodeOp(a, b), cmp_(cmp_), not_(not_) {
setTrainable(false); // has no gradient
//setTrainable(false); // has no gradient
// Note: ^^ Disabled because it is currently causing Marian to choke, for unknown reasons.
// Not setting this will not change the result since the vector of gradient functions is empty.
}
NodeOps forwardOps() override {
@ -887,6 +976,29 @@ struct CmpNodeOp : public ElementBinaryNodeOp {
ABORT("Should not get here??");
}
virtual size_t hash() override {
if(!hash_) {
size_t seed = NaryNodeOp::hash();
util::hash_combine(seed, cmp_);
util::hash_combine(seed, not_);
hash_ = seed;
}
return hash_;
}
virtual bool equal(Expr node) override {
if(!NaryNodeOp::equal(node))
return false;
auto cnode = std::dynamic_pointer_cast<CmpNodeOp>(node);
if(!cnode)
return false;
if(cmp_ != cnode->cmp_)
return false;
if(not_ != cnode->not_)
return false;
return true;
}
private:
int cmp_; // -1: less; 0: equal; 1: greater
bool not_; // invert result if true
@ -1019,6 +1131,23 @@ public:
const std::string type() override { return "layer_normalization"; }
virtual size_t hash() override {
size_t seed = NaryNodeOp::hash();
util::hash_combine(seed, eps_);
return seed;
}
virtual bool equal(Expr node) override {
if(!NaryNodeOp::equal(node))
return false;
auto cnode = std::dynamic_pointer_cast<LayerNormalizationOp>(node);
if(!cnode)
return false;
if(eps_ != cnode->eps_)
return false;
return true;
}
private:
float eps_;
};


@ -993,6 +993,8 @@ struct ShiftNodeOp : public UnaryNodeOp {
if(!cnode)
return false;
if(shift_ != cnode->shift_)
return false;
if(padValue_ != cnode->padValue_)
return false;
return true;
}


@ -130,8 +130,8 @@ private:
Ptr<Backend> backend_;
public:
MappedParameters(Type acceptedElementType_) : Parameters(acceptedElementType_) {
LOG(debug, "Created mapped parameter object of type {}", acceptedElementType_);
MappedParameters(Type acceptedElementType) : Parameters(acceptedElementType) {
LOG(debug, "Created mapped parameter object of type {}", acceptedElementType);
}
virtual void init(Ptr<Backend> backend) override { backend_ = backend; }


@ -4,7 +4,8 @@
#include "layers/constructors.h"
#include "layers/loss.h"
#include "data/factored_vocab.h"
#include "rnn/types.h" // for State::select()
#include "rnn/types.h" // for State::select()
#include "models/states.h" // for EncoderState
//using std::size_t; // not sure why this is needed
@ -23,7 +24,11 @@ namespace marian {
ABORT_IF(empty(), "Attempted to read out logits on empty Logits object");
auto firstLogits = logits_.front()->loss();
ABORT_IF(labels.size() * firstLogits->shape()[-1] != firstLogits->shape().elements(), "Labels not matching logits shape??");
ABORT_IF(labels.size() * firstLogits->shape()[-1] != firstLogits->shape().elements(),
"Labels not matching logits shape ({} != {}, {})??",
labels.size() * firstLogits->shape()[-1],
firstLogits->shape().elements(),
firstLogits->shape());
// base case (no factors)
if (!factoredVocab_) {
@ -219,7 +224,7 @@ namespace marian {
factoredVocab_ = FactoredVocab::tryCreateAndLoad(options_->get<std::string>("vocab", ""));
if (factoredVocab_) {
numOutputClasses = (int)factoredVocab_->factorVocabSize();
LOG(info, "[embedding] Factored outputs enabled");
LOG_ONCE(info, "[embedding] Factored outputs enabled");
}
if(tiedParam_) {
@ -237,10 +242,10 @@ namespace marian {
/*const*/ int lemmaDimEmb = options_->get<int>("lemma-dim-emb", 0);
ABORT_IF(lemmaDimEmb && !factoredVocab_, "--lemma-dim-emb requires a factored vocabulary");
if (lemmaDimEmb > 0) {
if (lemmaDimEmb > 0) { // > 0 means to embed the (expected) word with a different embedding matrix
#define HARDMAX_HACK
#ifdef HARDMAX_HACK
lemmaDimEmb = lemmaDimEmb & 0xfffffffe;
lemmaDimEmb = lemmaDimEmb & 0xfffffffe; // hack to select hard-max: use an odd number
#endif
auto range = factoredVocab_->getGroupRange(0);
auto lemmaVocabDim = (int)(range.second - range.first);
@ -263,8 +268,9 @@ namespace marian {
// project each factor separately
auto numGroups = factoredVocab_->getNumGroups();
std::vector<Ptr<RationalLoss>> allLogits(numGroups, nullptr); // (note: null entries for absent factors)
Expr input1 = input;
Expr Plemma = nullptr;
Expr input1 = input; // [B... x D]
Expr Plemma = nullptr; // used for lemmaDimEmb=-1
Expr inputLemma = nullptr; // used for lemmaDimEmb=-2, -3
for (size_t g = 0; g < numGroups; g++) {
auto range = factoredVocab_->getGroupRange(g);
if (g > 0 && range.first == range.second) // empty entry
@ -280,6 +286,52 @@ namespace marian {
factorWt = slice(Wt_, isLegacyUntransposedW ? -1 : 0, Slice((int)range.first, (int)range.second));
factorB = slice(b_, -1, Slice((int)range.first, (int)range.second));
}
/*const*/ int lemmaDimEmb = options_->get<int>("lemma-dim-emb", 0);
if ((lemmaDimEmb == -2 || lemmaDimEmb == -3) && g > 0) { // -2/-3 means a gated transformer-like structure (-3 = hard-max)
LOG_ONCE(info, "[embedding] using lemma conditioning with gate");
// this mimics one transformer layer
// - attention over two inputs:
// - e = current lemma. We use the original embedding vector; specifically, expectation over all lemmas.
// - input = hidden state FF(h_enc+h_dec)
// - dot-prod attention to allow both sides to influence (unlike our recurrent self-attention)
// - multi-head to allow for multiple conditions to be modeled
// - add & norm, for gradient flow and scaling
// - FF layer --this is expensive; it is per-factor
// multi-head attention
int inputDim = input->shape()[-1];
int heads = 8;
auto name = options_->get<std::string>("prefix") + "_factor" + std::to_string(g);
auto Wq = graph_->param(name + "_Wq", { inputDim, inputDim }, inits::glorotUniform());
auto Wk = graph_->param(name + "_Wk", { inputDim, inputDim }, inits::glorotUniform());
auto Wv = graph_->param(name + "_Wv", { inputDim, inputDim }, inits::glorotUniform());
auto toMultiHead = [&](Expr x, int heads) {
const auto& shape = x->shape();
int inputDim = shape[-1];
int otherDim = shape.elements() / inputDim;
ABORT_IF(inputDim / heads * heads != inputDim, "inputDim ({}) must be multiple of number of heads ({})", inputDim, heads);
return reshape(x, { otherDim, heads, 1, inputDim / heads });
};
input1 = inputLemma;
auto qm = toMultiHead(dot(input1, Wq), heads); // [B... x H x D/H] projected query
auto kdm = toMultiHead(dot(input1 - input, Wk), heads); // [B... x H x D/H] the two data vectors projected as keys. Use diff and sigmoid, instead of softmax.
auto vem = toMultiHead(dot(input1, Wv), heads); // [B... x H x D/H] one of the two data vectors projected as values
auto vim = toMultiHead(dot( input, Wv), heads); // [B... x H x D/H] the other
auto zm = bdot(qm, kdm, false, true); // [B... x H x 1]
auto sm = sigmoid(zm); // [B... x H x 1]
auto rm = sm * (vem - vim) + vim; // [B... x H x D/H]
auto r = reshape(rm, input->shape()); // [B... x D]
// add & norm
input1 = r + input1;
input1 = layerNorm(input1, name + "_att");
// FF layer
auto ffnDropProb = 0.1f; // @TODO: get as a parameter
auto ffnDim = inputDim * 2; // @TODO: get as a parameter
auto f = denseInline(input1, name + "_ffn", /*suffix=*/"1", ffnDim, (ActivationFunction*)relu, ffnDropProb);
f = denseInline(f, name + "_ffn", /*suffix=*/"2", inputDim);
// add & norm
input1 = f + input1;
input1 = layerNorm(input1, name + "_ffn");
}
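// In formula form, per attention head the block above computes
//   z = q . k,   s = sigmoid(z),   r = s * (v_e - v_i) + v_i
// i.e. a sigmoid gate interpolating between the value projection of the lemma
// embedding (v_e) and that of the hidden state (v_i), followed by the usual
// add & norm and a feed-forward sublayer.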
// @TODO: b_ should be a vector, not a matrix; but shotlists use cols() in, which requires a matrix
auto factorLogits = affine(input1, factorWt, factorB, false, /*transB=*/isLegacyUntransposedW ? false : true, /*scale=*/1.0f); // [B... x U] factor logits
// optionally add lemma-dependent bias
@ -294,15 +346,28 @@ namespace marian {
allLogits[g] = New<RationalLoss>(factorLogits, nullptr);
// optionally add a soft embedding of lemma back to create some lemma dependency
// @TODO: if this works, move it into lazyConstruct
/*const*/ int lemmaDimEmb = options_->get<int>("lemma-dim-emb", 0);
if (lemmaDimEmb < 0 && g == 0) {
ABORT_IF(shortlist_ && lemmaDimEmb != 0, "Lemma-dependent bias with short list is not yet implemented");
if (lemmaDimEmb == -2 && g == 0) { // -2 means a gated transformer-like structure
LOG_ONCE(info, "[embedding] using lemma conditioning with gate, soft-max version");
// get expected lemma embedding vector
auto factorLogSoftmax = logsoftmax(factorLogits); // [B... x U] note: with shortlist, this is not the full lemma set
auto factorSoftmax = exp(factorLogSoftmax);
inputLemma = dot(factorSoftmax, factorWt, false, /*transB=*/isLegacyUntransposedW ? true : false); // [B... x D]
}
else if (lemmaDimEmb == -3 && g == 0) { // same as -2 except with hard max
LOG_ONCE(info, "[embedding] using lemma conditioning with gate, hard-max version");
// get max-lemma embedding vector
auto maxVal = max(factorLogits, -1); // [B... x U] note: with shortlist, this is not the full lemma set
auto factorHardmax = eq(factorLogits, maxVal);
inputLemma = dot(factorHardmax, factorWt, false, /*transB=*/isLegacyUntransposedW ? true : false); // [B... x D]
}
else if (lemmaDimEmb == -1 && g == 0) { // -1 means learn a lemma-dependent bias
ABORT_IF(shortlist_, "Lemma-dependent bias with short list is not yet implemented");
LOG_ONCE(info, "[embedding] using lemma-dependent bias");
auto factorLogSoftmax = logsoftmax(factorLogits); // (we do that again later, CSE will kick in)
auto z = /*stopGradient*/(factorLogSoftmax);
Plemma = exp(z); // [B... x U]
}
if (lemmaDimEmb > 0 && g == 0) {
else if (lemmaDimEmb > 0 && g == 0) { // > 0 means learn a re-embedding matrix
LOG_ONCE(info, "[embedding] enabled re-embedding of lemma, at dim {}", lemmaDimEmb);
// compute softmax. We compute logsoftmax() separately because this way, computation will be reused later via CSE
auto factorLogSoftmax = logsoftmax(factorLogits);
@ -349,7 +414,7 @@ namespace marian {
factoredVocab_ = FactoredVocab::tryCreateAndLoad(options_->get<std::string>("vocab", ""));
if (factoredVocab_) {
dimVoc = (int)factoredVocab_->factorVocabSize();
LOG(info, "[embedding] Factored embeddings enabled");
LOG_ONCE(info, "[embedding] Factored embeddings enabled");
}
// Embedding layer initialization should depend only on embedding size, hence fanIn=false
@ -389,7 +454,7 @@ namespace marian {
auto graph = E_->graph();
int dimBatch = (int)subBatch->batchSize();
int dimEmb = E_->shape()[-1];
int dimWords = (int)subBatch->batchWidth();
int dimWidth = (int)subBatch->batchWidth();
// factored embeddings:
// - regular:
@ -419,9 +484,16 @@ namespace marian {
// - but forward pass weighs them down, so that all factors are in a similar numeric range
// - if it is required to be in a different range, the embeddings can still learn that, but more slowly
auto batchEmbeddings = apply(subBatch->data(), {dimWords, dimBatch, dimEmb});
auto batchMask = graph->constant({dimWords, dimBatch, 1},
auto batchEmbeddings = apply(subBatch->data(), {dimWidth, dimBatch, dimEmb});
#if 0
auto batchMask = graph->constant({dimWidth, dimBatch, 1},
inits::fromVector(subBatch->mask()));
#else
// experimental: hide inline-fix source tokens from cross attention
auto batchMask = graph->constant({dimWidth, dimBatch, 1},
inits::fromVector(subBatch->crossMaskWithInlineFixSourceSuppressed()));
#endif
return std::make_tuple(batchEmbeddings, batchMask);
}


@ -412,4 +412,32 @@ public:
ABORT("not implemented"); // @TODO: implement me
}
};
// --- a few layers with built-in parameters created on the fly, without a proper object
// @TODO: change to a proper layer object
// like affine() but with built-in parameters, activation, and dropout
static inline
Expr denseInline(Expr x, std::string prefix, std::string suffix, int outDim, const std::function<Expr(Expr)>& actFn = nullptr, float dropProb = 0.0f)
{
auto graph = x->graph();
auto W = graph->param(prefix + "_W" + suffix, { x->shape()[-1], outDim }, inits::glorotUniform());
auto b = graph->param(prefix + "_b" + suffix, { 1, outDim }, inits::zeros());
x = affine(x, W, b);
if (actFn)
x = actFn(x);
x = dropout(x, dropProb);
return x;
}
static inline
Expr layerNorm(Expr x, std::string prefix, std::string suffix = std::string()) {
int dimModel = x->shape()[-1];
auto scale = x->graph()->param(prefix + "_ln_scale" + suffix, { 1, dimModel }, inits::ones());
auto bias = x->graph()->param(prefix + "_ln_bias" + suffix, { 1, dimModel }, inits::zeros());
return marian::layerNorm(x, scale, bias, 1e-6f);
}
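// Assuming marian::layerNorm follows the standard definition, the helper above
// computes, over the last dimension of x:
//   LN(x) = scale * (x - mean(x)) / sqrt(var(x) + eps) + bias,   with eps = 1e-6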
} // namespace marian

src/layers/guided_alignment.h Normal file → Executable file

@ -1,43 +1,75 @@
#pragma once
#include "layers/loss.h"
#include "common/logging.h"
namespace marian {
static inline RationalLoss guidedAlignmentCost(Ptr<ExpressionGraph> graph,
Ptr<data::CorpusBatch> batch,
Ptr<Options> options,
Expr attention) {
Expr attention) { // [beam depth=1, max src length, batch size, tgt length]
// @TODO: there should be positional masking here ... on the other hand, positions that are not
// in a sentence should always agree (both being 0). Lack of masking affects label count only which is
// probably negligible?
// @TODO: change "cost" to "loss"
std::string guidedLossType = options->get<std::string>("guided-alignment-cost");
float guidedScalar = options->get<float>("guided-alignment-weight");
std::string guidedLossType = options->get<std::string>("guided-alignment-cost"); // @TODO: change "cost" to "loss"
float guidedLossWeight = options->get<float>("guided-alignment-weight");
const auto& shape = attention->shape(); // [beam depth=1, max src length, batch size, tgt length]
float epsilon = 1e-6f;
Expr alignment = constant_like(attention, inits::fromVector(batch->getGuidedAlignment()));
Expr alignmentLoss; // sum up loss over all attention/alignment positions
if(guidedLossType == "mse") {
alignmentLoss = sum(flatten(square(attention - alignment))) / 2.f;
} else if(guidedLossType == "mult") {
alignmentLoss = -log(sum(flatten(attention * alignment)) + epsilon);
} else if(guidedLossType == "ce") {
size_t numLabels;
if(guidedLossType == "ce") {
// normalizedAlignment is multi-hot, but CE requires normalized probabilities, so we need to normalize to P(s|t)
auto dimBatch = shape[-2];
auto dimTrgWords = shape[-1];
auto dimSrcWords = shape[-3];
ABORT_IF(shape[-4] != 1, "Guided alignments with beam??");
auto normalizedAlignment = batch->getGuidedAlignment(); // [dimSrcWords, dimBatch, dimTrgWords] flattened, matches shape of 'attention'
auto srcBatch = batch->front();
const auto& srcMask = srcBatch->mask();
ABORT_IF(shape.elements() != normalizedAlignment.size(), "Attention-matrix and alignment shapes differ??");
ABORT_IF(dimBatch != batch->size() || dimTrgWords != batch->widthTrg() || dimSrcWords != batch->width(), "Attention-matrix and batch shapes differ??");
auto locate = [=](size_t s, size_t b, size_t t) { return ((s * dimBatch) + b) * dimTrgWords + t; };
for (size_t b = 0; b < dimBatch; b++) {
for (size_t t = 0; t < dimTrgWords; t++) {
for (size_t s = 0; s < dimSrcWords; s++)
ABORT_IF(locate(s, b, t) != batch->locateInGuidedAlignments(b, s, t), "locate() and locateInGuidedAlignments() differ??");
// renormalize the alignment such that it sums up to 1
float sum = 0;
for (size_t s = 0; s < dimSrcWords; s++)
sum += srcMask[srcBatch->locate(b, s)] * normalizedAlignment[locate(s, b, t)]; // these values are 0 or 1
if (sum != 0 && sum != 1)
for (size_t s = 0; s < dimSrcWords; s++)
normalizedAlignment[locate(s, b, t)] /= sum;
}
}
auto alignment = constant_like(attention, std::move(normalizedAlignment));
alignmentLoss = -sum(flatten(alignment * log(attention + epsilon)));
numLabels = batch->back()->batchWords();
ABORT_IF(numLabels > shape.elements() / shape[-3], "Num labels of guided alignment cost is off??");
} else {
ABORT("Unknown alignment cost type: {}", guidedLossType);
auto alignment = constant_like(attention, batch->getGuidedAlignment());
if(guidedLossType == "mse")
alignmentLoss = sum(flatten(square(attention - alignment))) / 2.f;
else if(guidedLossType == "mult") // @TODO: I don't know what this criterion is for. Can we remove it?
alignmentLoss = -log(sum(flatten(attention * alignment)) + epsilon);
else
ABORT("Unknown alignment cost type: {}", guidedLossType);
// every position is a label as they should all agree
// @TODO: there should be positional masking here ... on the other hand, positions that are not
// in a sentence should always agree (both being 0). Lack of masking affects label count only which is
// probably negligible?
numLabels = shape.elements();
}
alignmentLoss = guidedScalar * alignmentLoss; // weigh by scalar
// every position is a label as they should all agree, see caveat at the top.
size_t numLabels = alignment->shape().elements();
// Create label node, also weigh by scalar so labels and cost are in the same domain.
// Fractional label counts are OK
return RationalLoss(alignmentLoss, guidedScalar * numLabels);
// Fractional label counts are OK. But only if combined as "sum".
// @TODO: It is ugly to check the multi-loss type here, but doing this right requires
// a substantial rewrite of the multi-loss architecture, which is planned anyways.
std::string multiLossType = options->get<std::string>("multi-loss-type", "sum");
if (multiLossType == "sum") // sum of sums
return RationalLoss(guidedLossWeight * alignmentLoss, guidedLossWeight * numLabels);
else
return RationalLoss(guidedLossWeight * alignmentLoss, (float)numLabels);
}
} // namespace marian
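In formulas, as an editorial summary of the "ce" branch above: the multi-hot alignment $a$ is renormalized per batch entry $b$ and target position $t$,

$\hat a_{s,b,t} = a_{s,b,t} \,/\, \sum_{s'} m_{s',b}\, a_{s',b,t}$  (applied only when the masked sum is neither 0 nor 1),

and the loss is

$L_{\text{align}} = -\sum_{s,b,t} \hat a_{s,b,t} \log(\text{att}_{s,b,t} + \epsilon)$,

returned as RationalLoss$(w L_{\text{align}},\, w N)$ for multi-loss-type "sum" and RationalLoss$(w L_{\text{align}},\, N)$ otherwise, where $w$ is the guided-alignment weight and $N$ the target-label count.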


@ -7,9 +7,15 @@ Ptr<LabelwiseLoss> newLoss(Ptr<Options> options, bool inference) {
float smoothing = inference ? 0.f : options->get<float>("label-smoothing");
float factorWeight = options->get<float>("factor-weight", 1.0f);
std::string costType = options->get<std::string>("cost-type", "ce-mean");
bool unlikelihood = options->get<bool>("unlikelihood-loss", false);
if(costType == "ce-rescore") { // returns per-batch-item scores (while ce-mean reduces over batch)
return New<RescorerLoss>();
} else if(unlikelihood) {
ABORT_IF(!options->hasAndNotEmpty("data-weighting")
&& options->get<std::string>("data-weighting-type") != "word",
"Unlikelihood loss training requires error annotation in form of per-target-label scores");
return New<SequenceUnlikelihoodLoss>(smoothing, factorWeight); // this is a mix of CE loss and unlikelihood loss, depending on the values given for data-weighting
} else { // same as ce-mean --@TODO: better check all allowed values, and fail for invalid ones. E.g. what about ce-sum?
return New<CrossEntropyLoss>(smoothing, factorWeight);
}
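A minimal configuration sketch for reaching the unlikelihood branch (editorial; the option keys are the ones queried above, the weights file name is illustrative):

// Hypothetical sketch: options that route newLoss() to SequenceUnlikelihoodLoss.
auto options = New<Options>();
options->set("cost-type", "ce-mean");
options->set("label-smoothing", 0.1f);
options->set("factor-weight", 1.0f);
options->set("unlikelihood-loss", true);
options->set("data-weighting", "errors.txt");  // per-target-label 0/1 error annotations
options->set("data-weighting-type", "word");   // must be word-level
auto loss = newLoss(options, /*inference=*/false);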


@ -206,7 +206,7 @@ private:
virtual Expr accumulateLoss(const RationalLoss& current) override {
if(loss_) {
const auto& first = partialLosses_.front();
return loss_ + first.count() * (current.loss() / current.count()); // scale up/down to match scale of first loss
return loss_ + current.loss() * first.count() / current.count(); // scale up/down to match scale of first loss
} else {
return current.loss(); // first reference loss; later losses are rescaled to match its scale
}
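// Editorial worked example of the rescaling above: with a first partial loss of
// (loss=10, count=100) and a current loss of (loss=6, count=2), accumulation yields
// 10 + 6 * 100 / 2 = 310, i.e. the current loss is brought to the label scale of
// the first loss. Multiplying before dividing computes the same ratio as the old
// expression.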
@ -344,8 +344,8 @@ protected:
// for bert training or classification the time dimension is lost.
// Here we safeguard against 2-d classifier output by adding a dimension of 1 on the left, a non-op.
Expr ce = cast(cross_entropy(logits, indices), Type::float32);
if (inFactor) {
LOG_ONCE("scaling factor losses with weight {}", factorWeight_);
if (inFactor && factorWeight_ != 1.0f) {
LOG_ONCE(info, "scaling factor losses with weight {}", factorWeight_);
ce = ce * factorWeight_;
}
if (labelSmoothing_ > 0) {
@ -366,13 +366,68 @@ protected:
if(mask)
ce = ce * cast(mask, Type::float32);
if(labelWeights)
if(labelWeights) {
// We currently do not know how to use target factors and word-level label weights together
bool wordlevel = labelWeights->shape()[-3] > 1; // Time-dimension is not trivially 1, hence we have word-level weights.
ABORT_IF(wordlevel && logits.getNumFactorGroups() > 1, "CE loss with word-level label weights is not implemented for factors");
ce = ce * cast(labelWeights, Type::float32);
}
return ce;
}
};
/**
* @brief Unlikelihood loss across last axis, summed up over batch and time dimensions. This is an
* implementation of sequence-level unlikelihood loss from https://arxiv.org/abs/1908.04319.
* We rely on word-level label weights where 1 marks a correct label and 0 marks an error. If there are
* no zeros for a sentence, it is trained with normal CE loss; if there is at least one 0, it flips
* over to SUL for that sentence to penalize the selected words.
*
* SUL is implemented as:
* -log(gather(1 - softmax(logits), -1, indices))
*
* Factors are currently not supported.
*/
class SequenceUnlikelihoodLoss : public CrossEntropyLoss {
public:
SequenceUnlikelihoodLoss(float labelSmoothing, float factorWeight)
: CrossEntropyLoss(labelSmoothing, factorWeight) {} // cross-entropy already reduces over axis -1
SequenceUnlikelihoodLoss(const std::vector<int>& axes, float labelSmoothing, float factorWeight)
: CrossEntropyLoss(axes, labelSmoothing, factorWeight) {}
protected:
virtual Expr compute(Logits logits, const Words& labels,
Expr mask = nullptr, Expr labelWeights = nullptr) override {
auto ce = CrossEntropyLoss::compute(logits, labels, mask, /*labelWeights=*/nullptr); // don't pass label-weights to CE
if(!labelWeights)
return ce; // for validation; @TODO: maybe abort instead, or LOG_ONCE(warn, ...)?
// We currently do not know how to use target factors and word-level label weights together
ABORT_IF(logits.getNumFactorGroups() > 1, "Unlikelihood loss is not implemented for factors");
ABORT_IF(!mask, "mask is required"); // @TODO: check this, it seems weights for padding are by default 1, which would make this obsolete.
// use label weights, where 1 is GOOD and 0 is BAD. After the inversion here, 1 marks an error; mask again to eliminate padding (might be obsolete)
auto errorMask = (1.f - cast(labelWeights, Type::float32)) * cast(mask, Type::float32);
auto ceUl = logits.applyLossFunction(labels, [&](Expr logits, Expr indices) {
return cast(unlikelihood(logits, indices), Type::float32);
});
// compute whether to use CE or UL. If there are no errors, train with CE; otherwise train _only on_ the errors with UL. This is the "mixed" training
// schedule from https://arxiv.org/abs/1908.04319. By providing labels with or without error scores we can easily switch between CE and UL.
auto onlyCe = eq(sum(errorMask, /*axis=*/-3), 0.f); // [1, 1, dimBatch, 1] - equals 1 if no errors are present
ceUl = errorMask * ceUl; // don't use for correct labels or padding
auto cost = onlyCe * ce + (1.f - onlyCe) * ceUl; // the CE and unlikelihood parts are never simultaneously used as cost per batch entry
return cost;
}
};
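// Editorial note: per target position t the loss above therefore selects between
//   CE_t = -log p(y_t | y_<t, x)        (sentences without any error annotation)
//   UL_t = -log(1 - p(y_t | y_<t, x))   (only at positions annotated as errors)
// pushing probability mass away from annotated tokens, which matches the
// "SUL is implemented as" formula in the class comment.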
/**
* @brief Cross entropy in rescorer used for computing sentence-level log probabilities
*/


@ -15,8 +15,16 @@ Expr DataWeighting::getWeights(Ptr<ExpressionGraph> graph,
bool sentenceWeighting = weightingType_ == "sentence";
int dimBatch = (int)batch->size();
int dimWords = sentenceWeighting ? 1 : (int)batch->back()->batchWidth();
// This would abort anyway in fromVector(...), but has clearer error message
// here for this particular case
ABORT_IF(batch->getDataWeights().size() != dimWords * dimBatch,
"Number of sentence/word-level weights ({}) does not match tensor size ({})",
batch->getDataWeights().size(), dimWords * dimBatch);
auto weights = graph->constant({1, dimWords, dimBatch, 1},
inits::fromVector(batch->getDataWeights()));
return weights;
return weights; // [1, dimWords, dimBatch, 1] in case of word-level weights or
// [1, 1, dimBatch, 1] in case of sentence-level weights
}
} // namespace marian
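A worked check of the size guard above, with illustrative numbers: for dimBatch = 3 sentences under word-level weighting with dimWords = 7 target positions, getDataWeights() must hold 3 * 7 = 21 values and the constant has shape [1, 7, 3, 1]; under sentence-level weighting dimWords is 1, so 3 values suffice and the shape [1, 1, 3, 1] broadcasts over the time dimension.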


@ -10,7 +10,7 @@
#include "translator/scorers.h"
#include "data/alignment.h"
#include "data/vocab_base.h"
#include "graph/expression_graph_packable.h"
#include "tensors/cpu/fbgemm/expression_graph_packable.h"
#if USE_FBGEMM
#include "fbgemm/Utils.h"
@ -258,14 +258,14 @@ bool convertModel(std::string inputFile, std::string outputFile, int32_t targetP
graph->load(inputFile);
graph->forward();
std::string saveGemmType = "fp32default";
auto saveGemmType = Type::float32;
if (targetPrec == 16)
saveGemmType = "fp16packed";
saveGemmType = Type::packed16;
else if (targetPrec == 8)
saveGemmType = "int8packed";
saveGemmType = Type::packed8avx2; // We currently use avx2 by default.
// added a flag indicating whether the weights need to be packed or not
graph->packAndSave(outputFile, configStr.str(), saveGemmType); // @TODO: this should just be type-based
graph->packAndSave(outputFile, configStr.str(), saveGemmType);
std::cout << "Conversion Finished." << std::endl;


@ -47,13 +47,12 @@ public:
ABORT_IF(shortlist_, "How did a shortlist make it into training?");
const Words& data = subBatch->data();
Expr yData = graph_->indices(toWordIndexVector(data));
auto yShifted = shift(y, {1, 0, 0});
state->setTargetHistoryEmbeddings(yShifted);
state->setTargetMask(yMask);
const Words& data = subBatch->data();
state->setTargetWords(data);
}


@ -196,7 +196,7 @@ Ptr<DecoderState> EncoderDecoder::step(Ptr<ExpressionGraph> graph,
state = hypIndices.empty() ? state : state->select(hypIndices, batchIndices, beamSize);
// Fill state with embeddings based on last prediction
decoders_[0]->embeddingsFromPrediction(graph, state, words, batchIndices.size(), beamSize);
decoders_[0]->embeddingsFromPrediction(graph, state, words, (int) batchIndices.size(), beamSize);
auto nextState = decoders_[0]->step(graph, state);
return nextState;

src/models/states.h Normal file → Executable file

@ -9,7 +9,7 @@ namespace marian {
class EncoderState {
private:
Expr context_;
Expr mask_;
Expr mask_; // [beam depth=1, max length, batch size, vector dim=1] source mask
Ptr<data::CorpusBatch> batch_;
public:
@ -18,9 +18,9 @@ public:
EncoderState() {}
virtual Expr getContext() { return context_; }
virtual Expr getAttended() { return context_; }
virtual Expr getMask() { return mask_; }
virtual Expr getContext() const { return context_; }
virtual Expr getAttended() const { return context_; }
virtual Expr getMask() const { return mask_; } // source batch mask; may have additional positions suppressed
virtual const Words& getSourceWords() {
return batch_->front()->data();


@ -142,29 +142,6 @@ public:
return reshape(output, {dimBeam, dimBatch, dimSteps, dimModel});
}
// like affine() but with built-in parameters, activation, and dropout
static inline
Expr dense(Expr x, std::string prefix, std::string suffix, int outDim, const std::function<Expr(Expr)>& actFn = nullptr, float dropProb = 0.0f)
{
auto graph = x->graph();
auto W = graph->param(prefix + "_W" + suffix, { x->shape()[-1], outDim }, inits::glorotUniform());
auto b = graph->param(prefix + "_b" + suffix, { 1, outDim }, inits::zeros());
x = affine(x, W, b);
if (actFn)
x = actFn(x);
x = dropout(x, dropProb);
return x;
}
Expr layerNorm(Expr x, std::string prefix, std::string suffix = std::string()) const {
int dimModel = x->shape()[-1];
auto scale = graph_->param(prefix + "_ln_scale" + suffix, { 1, dimModel }, inits::ones());
auto bias = graph_->param(prefix + "_ln_bias" + suffix, { 1, dimModel }, inits::zeros());
return marian::layerNorm(x, scale, bias, 1e-6f);
}
Expr preProcess(std::string prefix, std::string ops, Expr input, float dropProb = 0.0f) const {
auto output = input;
for(auto op : ops) {
@ -192,7 +169,7 @@ public:
// highway connection
else if(op == 'h') {
int dimModel = input->shape()[-1];
auto t = dense(prevInput, prefix, /*suffix=*/"h", dimModel);
auto t = denseInline(prevInput, prefix, /*suffix=*/"h", dimModel);
output = highway(output, prevInput, t);
}
// layer normalization
@ -402,8 +379,8 @@ public:
// the stack of FF layers
for(int i = 1; i < depthFfn; ++i)
output = dense(output, prefix, /*suffix=*/std::to_string(i), dimFfn, actFn, ffnDropProb);
output = dense(output, prefix, /*suffix=*/std::to_string(depthFfn), dimModel);
output = denseInline(output, prefix, /*suffix=*/std::to_string(i), dimFfn, actFn, ffnDropProb);
output = denseInline(output, prefix, /*suffix=*/std::to_string(depthFfn), dimModel);
auto opsPost = opt<std::string>("transformer-postprocess");
output
@ -430,14 +407,14 @@ public:
// the stack of AAN layers
for(int i = 1; i < depthAan; ++i)
y = dense(y, prefix, /*suffix=*/std::to_string(i), dimAan, actFn, aanDropProb);
y = denseInline(y, prefix, /*suffix=*/std::to_string(i), dimAan, actFn, aanDropProb);
if(y->shape()[-1] != dimModel) // bring it back to the desired dimension if needed
y = dense(y, prefix, std::to_string(depthAan), dimModel);
y = denseInline(y, prefix, std::to_string(depthAan), dimModel);
bool noGate = opt<bool>("transformer-aan-nogate");
if(!noGate) {
auto gi = dense(x, prefix, /*suffix=*/"i", dimModel, (ActivationFunction*)sigmoid);
auto gf = dense(y, prefix, /*suffix=*/"f", dimModel, (ActivationFunction*)sigmoid);
auto gi = denseInline(x, prefix, /*suffix=*/"i", dimModel, (ActivationFunction*)sigmoid);
auto gf = denseInline(y, prefix, /*suffix=*/"f", dimModel, (ActivationFunction*)sigmoid);
y = gi * x + gf * y;
}
@ -482,7 +459,7 @@ public:
int /*startPos*/) const {
float dropoutRnn = inference_ ? 0.f : opt<float>("dropout-rnn");
if(!perLayerRnn[prefix]) // lazily created and cache RNNs in the docoder to avoid costly recreation @TODO: turn this into class members
if(!perLayerRnn[prefix]) // lazily create and cache RNNs in the decoder to avoid costly recreation @TODO: turn this into class members
perLayerRnn[prefix] = rnn::rnn(
"type", opt<std::string>("dec-cell"),
"prefix", prefix,
@ -533,29 +510,31 @@ public:
batchEmbeddings = addSpecialEmbeddings(batchEmbeddings, /*start=*/0, batch);
// reorganize batch and timestep
batchEmbeddings = atleast_nd(batchEmbeddings, 4);
batchMask = atleast_nd(batchMask, 4);
auto layer = transposeTimeBatch(batchEmbeddings); // [-4: beam depth=1, -3: batch size, -2: max length, -1: vector dim]
auto layerMask
= reshape(transposeTimeBatch(batchMask), {1, dimBatch, 1, dimSrcWords}); // [-4: beam depth=1, -3: batch size, -2: vector dim=1, -1: max length]
batchEmbeddings = atleast_nd(batchEmbeddings, 4); // [beam depth=1, max length, batch size, vector dim]
batchMask = atleast_nd(batchMask, 4); // [beam depth=1, max length, batch size, vector dim=1]
auto layer = transposeTimeBatch(batchEmbeddings); // [beam depth=1, batch size, max length, vector dim]
auto layerMask = transposeTimeBatch(batchMask); // [beam depth=1, batch size, max length, vector dim=1]
auto opsEmb = opt<std::string>("transformer-postprocess-emb");
float dropProb = inference_ ? 0 : opt<float>("transformer-dropout");
layer = preProcess(prefix_ + "_emb", opsEmb, layer, dropProb);
layerMask = transposedLogMask(layerMask); // [-4: batch size, -3: 1, -2: vector dim=1, -1: max length]
// LayerAttention expects mask in a different layout
layerMask = reshape(layerMask, {1, dimBatch, 1, dimSrcWords}); // [1, batch size, 1, max length]
layerMask = transposedLogMask(layerMask); // [batch size, num heads broadcast=1, max length broadcast=1, max length]
// apply encoder layers
// This is the Transformer Encoder stack.
auto encDepth = opt<int>("enc-depth");
for(int i = 1; i <= encDepth; ++i) {
layer = LayerAttention(prefix_ + "_l" + std::to_string(i) + "_self",
layer, // query
layer, // keys
layer, // values
layerMask);
layerMask); // [batch size, num heads broadcast=1, max length broadcast=1, max length]
layer = LayerFFN(prefix_ + "_l" + std::to_string(i) + "_ffn", layer);
checkpoint(layer); // sets a manually specified checkpoint if gradient checkpointing is enabled, does nothing otherwise.
}
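// Editorial note on the mask layout: the reshape to [1, batch size, 1, max length]
// lets the mask broadcast over the num-heads and query-position axes of the attention
// scores, and transposedLogMask() turns the multiplicative 0/1 mask into an additive
// one (0 for valid keys, a large negative value for padded keys), so that softmax
// assigns padded positions near-zero weight.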
// restore organization of batch and time steps. This is currently required
@ -698,12 +677,14 @@ public:
std::vector<Expr> encoderContexts;
std::vector<Expr> encoderMasks;
for(auto encoderState : state->getEncoderStates()) {
auto encoderContext = encoderState->getContext();
auto encoderMask = encoderState->getMask();
auto encoderContext = encoderState->getContext(); // encoder output
auto encoderMask = encoderState->getMask(); // note: may differ from Encoder self-attention mask in that additional positions are banned for cross-attention
encoderMask = atleast_nd(encoderMask, 4);
encoderContext = transposeTimeBatch(encoderContext); // [beam depth=1, batch size, max length, vector dim]
encoderMask = transposeTimeBatch(encoderMask); // [beam depth=1, max length, batch size, vector dim=1]
encoderContext = transposeTimeBatch(encoderContext); // [-4: beam depth=1, -3: batch size, -2: max length, -1: vector dim]
int dimSrcWords = encoderContext->shape()[-2];
// This would happen if something goes wrong during batch pruning.
@ -712,15 +693,17 @@ public:
encoderContext->shape()[-3],
dimBatch);
encoderMask = atleast_nd(encoderMask, 4);
encoderMask = reshape(transposeTimeBatch(encoderMask),
{1, dimBatch, 1, dimSrcWords});
encoderMask = transposedLogMask(encoderMask);
// LayerAttention expects mask in a different layout
encoderMask = reshape(encoderMask, { 1, dimBatch, 1, dimSrcWords }); // [1, batch size, 1, max length]
encoderMask = transposedLogMask(encoderMask); // [batch size, num heads broadcast=1, max length broadcast=1, max length]
if(dimBeam > 1)
encoderMask = repeat(encoderMask, dimBeam, /*axis=*/ -4);
encoderContexts.push_back(encoderContext);
encoderMasks.push_back(encoderMask);
checkpoint(encoderContext);
checkpoint(encoderMask);
}
rnn::States prevDecoderStates = state->getStates();
@ -756,6 +739,8 @@ public:
ABORT("Unknown auto-regressive layer type in transformer decoder {}",
layerType);
checkpoint(query);
// source-target attention
// Iterate over multiple encoders and simply stack the attention blocks
if(encoderContexts.size() > 0) {
@ -792,10 +777,14 @@ public:
}
}
checkpoint(query);
// remember decoder state
decoderStates.push_back(decoderState);
query = LayerFFN(prefix_ + "_l" + layerNo + "_ffn", query); // [-4: beam depth=1, -3: batch size, -2: max length, -1: vector dim]
checkpoint(query);
}
auto decoderContext = transposeTimeBatch(query); // [-4: beam depth=1, -3: max length, -2: batch size, -1: vector dim]
@ -831,6 +820,10 @@ public:
output_->clear();
cache_.clear();
alignments_.clear();
perLayerRnn_.clear(); // this needs to be cleared between batches.
// @TODO: figure out how to detect stale nodes i.e. nodes that are referenced,
// but where underlying memory has been deallocated by dropping all tensors
// from a TensorAllocator object. This can happen during ExpressionGraph::clear()
}
};

src/optimizers/optimizers.cpp Normal file → Executable file

@ -69,11 +69,10 @@ public:
for(auto device : devices) {
auto graph = New<ExpressionGraph>(true);
graph->setDevice(device);
auto precison = options_->get<std::vector<std::string>>("precision", {"float32"});
graph->setDefaultElementType(typeFromString(precison[0])); // only use first type, used for parameter type in graph
graph->setDevice(device);
graph->getBackend()->setClip(options_->get<float>("clip-gemm"));
if (device.type == DeviceType::cpu) {
graph->getBackend()->setOptimized(options_->get<bool>("optimize"));


@ -43,6 +43,23 @@ struct GRUFastNodeOp : public NaryNodeOp {
const std::string type() override { return "GRU-ops"; }
const std::string color() override { return "yellow"; }
virtual size_t hash() override {
size_t seed = NaryNodeOp::hash();
util::hash_combine(seed, final_);
return seed;
}
virtual bool equal(Expr node) override {
if(!NaryNodeOp::equal(node))
return false;
auto cnode = std::dynamic_pointer_cast<GRUFastNodeOp>(node);
if(!cnode)
return false;
if(final_ != cnode->final_)
return false;
return true;
}
};
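// Editorial note: hash() and equal() feed the expression graph's node cache, so two
// GRU-ops that differ only in final_ must not be merged. Any NodeOp carrying extra
// state should follow this pattern: hash_combine every such member in hash() and
// compare the same members in equal().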
Expr gruOps(const std::vector<Expr>& nodes, bool final) {


@ -5,15 +5,6 @@
namespace marian {
// GEMM type enum
typedef enum {
Auto = 0, // auto tuning between available GEMMs
MklFp32 = 1, // MKL based GEMM, fp32
IntrinInt16 = 2, // Intrinsic implementation of Int 16 GEMM
FbFp16Packed = 10, // FBGEMM based fp16 GEMM with packing
FbInt8Packed = 11 // FBGEMM based int8 GEMM with packing
} GemmType;
class Backend {
protected:
DeviceId deviceId_;


@ -12,7 +12,6 @@ namespace cpu {
class Backend : public marian::Backend {
protected:
bool optimized_{false};
GemmType gemmType_{GemmType::Auto};
public:
Backend(DeviceId deviceId, size_t seed) : marian::Backend(deviceId, seed) {}


@ -99,8 +99,10 @@ void elementFloat(const Functor& functor, marian::Tensor out, Tensors... tensors
if(div8) {
// std::cerr << "8: " << functor.to_string() << std::endl;
#ifdef __AVX__
element<float32x8>(functor, out, tensors...);
return;
#endif
}
if(div4) {


@ -1,205 +0,0 @@
#pragma once
#include "graph/node.h"
#include "tensors/cpu/sharp/packed_gemm.h"
#if USE_FBGEMM
#ifdef __GNUC__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-variable"
#endif
#include "3rd_party/fbgemm/include/fbgemm/FbgemmFP16.h"
#ifdef __GNUC__
#pragma GCC diagnostic pop
#endif
using namespace fbgemm;
#endif // USE_FBGEMM
namespace marian {
namespace cpu {
namespace variant {
// Enumeration for the Matrix used in pack functions
// A matrix - 0, B matrix - 1
enum class PackMatrix : uint8_t {
A = 0x00,
B = 0x01
};
// Pack a matrix into cache utilization efficient way (block format)
// PackMatrix packMat_: the type of packed matrix - A or B matrix
// bool transpose_: transpose
// int nrow_: the number of rows
// int ncol_: the number of columns
// int kernel_ncol_blocks_: the number of column blocks
// int brow_: the number of rows in a block
// int bcol_: the number of columns in a block
// int last_brow_: the number of rows in the last block
// int nbrow_: row index in a block
// int nbcol_: column index in a block
// uint64_t packsize_: the size of the packed matrix
// (the number of fp16 elements + padding (1024) + extra temporary memory (256))
struct PackNodeOp : public UnaryNodeOp {
PackMatrix packMat_;
bool transpose_;
int nrow_;
int ncol_;
int kernel_ncol_blocks_;
int brow_;
int bcol_;
int last_brow_;
int nbrow_;
int nbcol_;
uint64_t packsize_;
PackNodeOp(Expr a, PackMatrix packMat, bool transpose, float clipValue)
: UnaryNodeOp(a, newShape(a, transpose), Type::uint8),
packMat_(packMat),
transpose_(transpose) {
if(packMat != PackMatrix::B)
ABORT("Only prepacking of B (weight matrix) is supported");
if(clipValue != 0)
ABORT("Clipping is not supported");
if(!memoize_)
ABORT("Only constant weight node can be packed");
}
NodeOps forwardOps() override {
return {NodeOp(PackFp32(val_,
child(0)->val()->data(),
transpose_,
nrow_,
ncol_,
kernel_ncol_blocks_,
brow_,
bcol_,
last_brow_,
nbrow_,
nbcol_,
packsize_))
};
}
NodeOps backwardOps() override {
ABORT("PackNodeOp only available for inference");
return {NodeOp(0)};
}
const std::string type() override { return "packMat"; }
Shape newShape(Expr a, bool transpose) {
#if USE_FBGEMM
auto shapeMat = a->shape();
// Should be 2D - weight matrix
ABORT_IF(shapeMat.size() != 2,
"Weight Matrix should be 2D");
PackInfoFp32(shapeMat,
transpose,
nrow_,
ncol_,
kernel_ncol_blocks_,
brow_,
bcol_,
last_brow_,
nbrow_,
nbcol_,
packsize_);
Shape outShape({(int)packsize_});
return outShape;
#else // USE_FBGEMM
ABORT("Packed GEMM requires a build with USE_FBGEMM enabled");
return Shape();
#endif // USE_FBGEMM
}
};
// Affine transform (matrix multiplication) using packed B matrix
// float scalar_: scalar multiplier
// size_t m_: the number of rows in A and C
// size_t n_: the number of columns in B and C
// size_t k_: the number of columns in A and the number of rows in C
// bool transA_: transpose A
// bool transB_: transpose B
class AffineNodeOp : public NaryNodeOp {
private:
float scalar_;
size_t m_;
size_t n_;
size_t k_;
bool transA_;
bool transB_;
public:
AffineNodeOp(const std::vector<Expr>& nodes, Shape bShape, bool transA, bool transB, float scalar)
: NaryNodeOp(nodes, newShape(nodes[0], bShape, transA, transB), Type::float32),
scalar_(scalar) {
transA_ = transA;
transB_ = transB;
m_ = nodes[0]->shape().elements() / nodes[0]->shape()[-1];
k_ = nodes[0]->shape().back();
if(transA)
std::swap(m_, k_);
size_t l = bShape.elements() / bShape[-1];
n_ = bShape[-1];
if(transB)
std::swap(l, n_);
}
Shape newShape(Expr a, Shape bShape, bool transA, bool transB) {
auto shapeA = a->shape();
if(transA) {
shapeA.set(shapeA.size() - 2, a->shape()[shapeA.size() - 1]);
shapeA.set(shapeA.size() - 1, a->shape()[shapeA.size() - 2]);
}
auto shapeB = bShape;
if(transB) {
shapeB.set(shapeB.size() - 2, bShape[shapeB.size() - 1]);
shapeB.set(shapeB.size() - 1, bShape[shapeB.size() - 2]);
}
Shape outShape = shapeA;
outShape.set(outShape.size() - 1, shapeB[shapeB.size() - 1]);
ABORT_IF(shapeA[shapeA.size() - 1] != shapeB[shapeB.size() - 2],
"Matrix product requires inner dimensions to match");
return outShape;
}
NodeOps forwardOps() override {
return {
NodeOp(GemmPackFp32(val_,
child(0)->val(),
child(1)->val(),
child(2)->val(),
m_,
n_,
transA_))
};
}
NodeOps backwardOps() override {
ABORT("Only used for inference");
return {NodeOp(0)};
}
const std::string type() override { return "fp16packed"; }
};
static inline Expr affine(Expr a, Expr b, Shape bShape, Expr c, bool transA, bool transB, float scalar) {
std::vector<Expr> nodes = {a, b, c};
return Expression<cpu::variant::AffineNodeOp>(nodes, bShape, transA, transB, scalar);
}
static inline Expr pack(Expr a, PackMatrix packMat, bool transpose, float clipValue) {
return Expression<cpu::variant::PackNodeOp>(a, packMat, transpose, clipValue);
}
} // namespace variant
} // namespace cpu
} // namespace marian


@ -0,0 +1,408 @@
#pragma once
#include "graph/node.h"
#include "packed_gemm.h"
#include "tensors/cpu/sharp/int_gemm.h"
#if USE_FBGEMM
#ifdef __GNUC__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-variable"
#endif
#include "3rd_party/fbgemm/include/fbgemm/FbgemmFP16.h"
#ifdef __GNUC__
#pragma GCC diagnostic pop
#endif
using namespace fbgemm;
#endif // USE_FBGEMM
namespace marian {
namespace cpu {
namespace variant {
// Enumeration for the Matrix used in pack functions
// A matrix - 0, B matrix - 1
enum class PackMatrix : uint8_t {
A = 0x00,
B = 0x01
};
// Pack a matrix (fp16) into cache utilization efficient way (block format) together with quantization into fp16
// PackMatrix packMat_: the type of packed matrix - A or B matrix
// bool transpose_: transpose
// int nrow_: the number of rows
// int ncol_: the number of columns
// int kernel_ncol_blocks_: the number of column blocks
// int brow_: the number of rows in a block
// int bcol_: the number of columns in a block
// int last_brow_: the number of rows in the last block
// int nbrow_: row index in a block
// int nbcol_: column index in a block
// uint64_t packsize_: the size of the packed matrix
// (the number of fp16 elements + padding (1024) + extra temporary memory (256))
struct FbgemmPacked16PackNodeOp : public UnaryNodeOp {
PackMatrix packMat_;
bool transpose_;
int nrow_;
int ncol_;
int kernel_ncol_blocks_;
int brow_;
int bcol_;
int last_brow_;
int nbrow_;
int nbcol_;
uint64_t packsize_;
FbgemmPacked16PackNodeOp(Expr a, PackMatrix packMat, bool transpose, float clipValue)
: UnaryNodeOp(a, newShape(a, transpose), Type::uint8),
packMat_(packMat),
transpose_(transpose) {
if(packMat != PackMatrix::B)
ABORT("Only prepacking of B (weight matrix) is supported");
if(clipValue != 0)
ABORT("Clipping is not supported");
if(!memoize_)
ABORT("Only constant weight node can be packed");
}
NodeOps forwardOps() override {
#if USE_FBGEMM
return {NodeOp(fbgemmPacked16Pack(val_,
child(0)->val()->data(),
transpose_,
nrow_,
ncol_,
kernel_ncol_blocks_,
brow_,
bcol_,
last_brow_,
nbrow_,
nbcol_,
packsize_))
};
#else // USE_FBGEMM
ABORT("FbgemmPacked16PackNodeOp can only be used with FBGEMM enabled.");
return { NodeOp(0) };
#endif // USE_FBGEMM
}
NodeOps backwardOps() override {
ABORT("FbgemmPacked16PackNodeOp only available for inference");
return {NodeOp(0)};
}
const std::string type() override { return "packMatFp16"; }
Shape newShape(Expr a, bool transpose) {
#if USE_FBGEMM
auto shapeMat = a->shape();
// Should be 2D - weight matrix
ABORT_IF(shapeMat.size() != 2,
"Weight Matrix should be 2D");
fbgemmPacked16PackInfo(shapeMat,
transpose,
nrow_,
ncol_,
kernel_ncol_blocks_,
brow_,
bcol_,
last_brow_,
nbrow_,
nbcol_,
packsize_);
Shape outShape({(int)packsize_});
return outShape;
#else // USE_FBGEMM
ABORT("Packed GEMM requires a build with USE_FBGEMM enabled");
return Shape();
#endif // USE_FBGEMM
}
};
// Pack a matrix (int8) into cache utilization efficient way (block format) together with quantization into int8
// PackMatrix packMat_: the type of packed matrix - A or B matrix
// marian::Type packType_: the type the input matrix is packed - packed8avx2 or packed8avx512
// bool transpose_: transpose
// int nrow_: the number of rows
// int ncol_: the number of columns
// uint64_t packsize_: the size of the packed matrix
// (the size of int8 packed B from fbgemm:PackAWithQuantRowOffset + quantization scale, offset and zero point)
struct FbgemmPacked8PackNodeOp : public UnaryNodeOp {
PackMatrix packMat_;
marian::Type packType_;
bool transpose_;
int nrow_;
int ncol_;
uint64_t packsize_;
FbgemmPacked8PackNodeOp(Expr a,
PackMatrix packMat,
marian::Type packType,
bool transpose,
float clipValue)
: UnaryNodeOp(a, newShape(a, transpose), Type::uint8),
packMat_(packMat),
packType_(packType),
transpose_(transpose) {
if(packMat != PackMatrix::B)
ABORT("Only prepacking of B (weight matrix) is supported");
if(clipValue != 0)
ABORT("Clipping is not supported");
if(!memoize_)
ABORT("Only constant weight node can be packed");
}
NodeOps forwardOps() override {
#if USE_FBGEMM
return {NodeOp(fbgemmPacked8Pack(val_,
child(0)->val()->data(),
packType_,
transpose_,
nrow_,
ncol_,
packsize_))
};
#else // USE_FBGEMM
ABORT("FbgemmPacked8PackNodeOp can only be used with FBGEMM enabled.");
return { NodeOp(0) };
#endif // USE_FBGEMM
}
NodeOps backwardOps() override {
ABORT("FbgemmPacked8PackNodeOp only available for inference");
return {NodeOp(0)};
}
const std::string type() override { return "packMatInt8"; }
Shape newShape(Expr a, bool transpose) {
#if USE_FBGEMM
fbgemmPacked8PackInfo(a->shape(), packType_, transpose, nrow_, ncol_, packsize_);
Shape outShape({(int)packsize_});
return outShape;
#else // USE_FBGEMM
ABORT("Packed GEMM requires a build with USE_FBGEMM enabled");
return Shape();
#endif // USE_FBGEMM
}
};
// Affine transform (matrix multiplication) using packed B matrix
// float scalar_: scalar multiplier
// size_t m_: the number of rows in A and C
// size_t n_: the number of columns in B and C
// size_t k_: the number of columns in A and the number of rows in C
// bool transA_: transpose A
// bool transB_: transpose B
class FbgemmPacked16AffineNodeOp : public NaryNodeOp {
private:
float scalar_;
size_t m_;
size_t n_;
size_t k_;
bool transA_;
bool transB_;
public:
FbgemmPacked16AffineNodeOp(const std::vector<Expr>& nodes, Shape bShape, bool transA, bool transB, float scalar)
: NaryNodeOp(nodes, newShape(nodes[0], bShape, transA, transB), Type::float32),
scalar_(scalar) {
transA_ = transA;
transB_ = transB;
m_ = nodes[0]->shape().elements() / nodes[0]->shape()[-1];
k_ = nodes[0]->shape().back();
if(transA)
std::swap(m_, k_);
size_t l = bShape.elements() / bShape[-1];
n_ = bShape[-1];
if(transB)
std::swap(l, n_);
}
Shape newShape(Expr a, Shape bShape, bool transA, bool transB) {
auto shapeA = a->shape();
if(transA) {
shapeA.set(shapeA.size() - 2, a->shape()[shapeA.size() - 1]);
shapeA.set(shapeA.size() - 1, a->shape()[shapeA.size() - 2]);
}
auto shapeB = bShape;
if(transB) {
shapeB.set(shapeB.size() - 2, bShape[shapeB.size() - 1]);
shapeB.set(shapeB.size() - 1, bShape[shapeB.size() - 2]);
}
Shape outShape = shapeA;
outShape.set(outShape.size() - 1, shapeB[shapeB.size() - 1]);
ABORT_IF(shapeA[shapeA.size() - 1] != shapeB[shapeB.size() - 2],
"Matrix product requires inner dimensions to match");
return outShape;
}
NodeOps forwardOps() override {
#if USE_FBGEMM
return {
NodeOp(fbgemmPacked16Gemm(val_,
child(0)->val(),
child(1)->val(),
children().size() > 2 ? child(2)->val() : nullptr, // pass only if it has a bias
m_,
n_,
transA_))
};
#else // USE_FBGEMM
ABORT("FbgemmPacked16AffineNodeOp can only be used with FBGEMM enabled.");
return { NodeOp(0) };
#endif // USE_FBGEMM
}
NodeOps backwardOps() override {
ABORT("Only used for inference");
return {NodeOp(0)};
}
const std::string type() override { return "gemmPacked16"; }
};
// Affine transform (matrix multiplication) using packed B matrix
// Especially, this gemm performs quantized gemms in 8-bit integers.
// float scalar_: scalar multiplier
// size_t m_: the number of rows in A and C
// size_t n_: the number of columns in B and C
// size_t k_: the number of columns in A and the number of rows in C
// bool transA_: transpose A
// bool transB_: transpose B
class FbgemmPacked8AffineNodeOp : public NaryNodeOp {
private:
float scalar_;
size_t m_;
size_t n_;
size_t k_;
bool transA_;
bool transB_;
public:
FbgemmPacked8AffineNodeOp(const std::vector<Expr>& nodes, Shape bShape, bool transA, bool transB, float scalar)
: NaryNodeOp(nodes, newShape(nodes[0], bShape, transA, transB), Type::float32),
scalar_(scalar) {
transA_ = transA;
transB_ = transB;
m_ = nodes[0]->shape().elements() / nodes[0]->shape()[-1];
k_ = nodes[0]->shape().back();
if(transA)
std::swap(m_, k_);
size_t l = bShape.elements() / bShape[-1];
n_ = bShape[-1];
if(transB)
std::swap(l, n_);
}
Shape newShape(Expr a, Shape bShape, bool transA, bool transB) {
auto shapeA = a->shape();
if(transA) {
shapeA.set(shapeA.size() - 2, a->shape()[shapeA.size() - 1]);
shapeA.set(shapeA.size() - 1, a->shape()[shapeA.size() - 2]);
}
auto shapeB = bShape;
if(transB) {
shapeB.set(shapeB.size() - 2, bShape[shapeB.size() - 1]);
shapeB.set(shapeB.size() - 1, bShape[shapeB.size() - 2]);
}
Shape outShape = shapeA;
outShape.set(outShape.size() - 1, shapeB[shapeB.size() - 1]);
ABORT_IF(shapeA[shapeA.size() - 1] != shapeB[shapeB.size() - 2],
"Matrix product requires inner dimensions to match");
return outShape;
}
NodeOps forwardOps() override {
NodeOps nodeOps;
#if USE_FBGEMM
// Do addBias only if it has a bias term
if (children().size() > 2) {
nodeOps = { NodeOp(fbgemmPacked8Gemm(val_,
child(0)->val(),
child(1)->val(),
m_,
n_,
k_,
transA_,
transB_);
marian::cpu::int16::AddBias(val_, child(2)->val())) };
} else {
nodeOps = { NodeOp(fbgemmPacked8Gemm(val_,
child(0)->val(),
child(1)->val(),
m_,
n_,
k_,
transA_,
transB_)) };
}
#else // USE_FBGEMM
ABORT("FbgemmPacked8AffineNodeOp can only be used with FBGEMM enabled.");
#endif // USE_FBGEMM
return nodeOps;
}
NodeOps backwardOps() override {
ABORT("Only used for inference");
return {NodeOp(0)};
}
const std::string type() override { return "gemmPacked8"; }
};
static inline Expr affine(Expr a, Expr b, Shape bShape, Expr c, bool transA, bool transB, float scalar) {
std::vector<Expr> nodes = {a, b, c};
Type elementType = b->value_type();
if (elementType == Type::packed16)
return Expression<cpu::variant::FbgemmPacked16AffineNodeOp>(nodes, bShape, transA, transB, scalar);
else if (isPacked(elementType) && sizeOf(elementType) == 1)
return Expression<cpu::variant::FbgemmPacked8AffineNodeOp>(nodes, bShape, transA, transB, scalar);
else {
ABORT("Only int8 and fp16 are available. {}", elementType);
return nullptr;
}
}
static inline Expr pack(Type elementType, Expr a, PackMatrix packMat, bool transpose, float clipValue) {
if (elementType == Type::packed16)
return Expression<cpu::variant::FbgemmPacked16PackNodeOp>(a, packMat, transpose, clipValue);
else if (isPacked(elementType) && sizeOf(elementType) == 1)
return Expression<cpu::variant::FbgemmPacked8PackNodeOp>(a, packMat, elementType, transpose, clipValue);
else {
ABORT("Only int8 and fp16 are available. {}", elementType);
return nullptr;
}
}
static inline Expr dot(Expr a, Expr b, Shape bShape, bool transA, bool transB, float scalar) {
std::vector<Expr> nodes = {a, b};
Type elementType = b->value_type();
if (elementType == Type::packed16)
return Expression<cpu::variant::FbgemmPacked16AffineNodeOp>(nodes, bShape, transA, transB, scalar);
else if (isPacked(elementType) && sizeOf(elementType) == 1)
return Expression<cpu::variant::FbgemmPacked8AffineNodeOp>(nodes, bShape, transA, transB, scalar);
else {
ABORT("Only int8 and fp16 are available. {}", elementType);
return nullptr;
}
}
} // namespace variant
} // namespace cpu
} // namespace marian


@ -1,7 +1,7 @@
#pragma once
#include "graph/expression_graph.h"
#include "tensors/cpu/sharp/packed_gemm.h"
#include "packed_gemm.h"
namespace marian {
@ -20,7 +20,7 @@ public:
// Convert model weights into packed format and save to IO items.
// @TODO: review this
void packAndSave(const std::string& name, const std::string& meta, std::string& saveGemmType, Type saveElementType = Type::float32) {
void packAndSave(const std::string& name, const std::string& meta, Type gemmElementType = Type::float32, Type saveElementType = Type::float32) {
std::vector<io::Item> ioItems;
// sorted by name in std::map
@ -35,15 +35,62 @@ public:
Tensor val = p.second->val();
// save as packed format
// @TODO Hardcoded to find packable weights - all the weights used for affine op
if (saveGemmType == "fp16packed" && pName.find("_W") == pName.length() - 3) {
// @TODO Hardcoded to find packable weights - all the weights used for affine op (fp16), all the weights used for affine op and dot op (int8)
if ((gemmElementType == Type::packed8avx2 || gemmElementType == Type::packed8avx512)
&& (pName.find("_W") == pName.length() - 3 || pName.find("_W") == pName.length() - 2)) {
#if USE_FBGEMM
using namespace marian::cpu::variant;
// packing information - size
int nrow;
int ncol;
uint64_t packsize;
fbgemmPacked8PackInfo(val->shape(),
gemmElementType,
pName.find("Wemb") != std::string::npos,
nrow,
ncol,
packsize);
auto allocator = New<TensorAllocator>(getBackend());
// buffer tensor to save packed matrix
Tensor packedTensor;
allocator->allocate(packedTensor, { 1, (int32_t)packsize }, Type::uint8);
//Pack B matrix into int8
fbgemmPacked8Pack(packedTensor,
val->data(),
gemmElementType,
pName.find("Wemb") != std::string::npos,
nrow,
ncol,
packsize);
io::Item item;
item.name = pName;
item.shape = val->shape();
item.type = gemmElementType;
// Use the actual memory as this will be aligned and padded.
// When memory mapping this is required. Shape keeps track of
// tensor size. Saving to *.npz will cut to size.
auto mem = packedTensor->memory();
item.bytes.resize(mem->size());
copy(backend_, mem->data<char>(), mem->data<char>() + mem->size(), item.bytes.data());
ioItems.emplace_back(std::move(item));
#else
ABORT("Packed type {} only supported when compiled with -DUSE_FBGEMM=on", gemmElementType);
#endif
} else if (gemmElementType == Type::packed16 && pName.find("_W") == pName.length() - 3) {
#if USE_FBGEMM
using namespace marian::cpu::variant;
// packing information
int nrow, ncol, kernel_ncol_blocks, brow, bcol, last_brow, nbrow, nbcol;
uint64_t packsize;
PackInfoFp32(val->shape(),
fbgemmPacked16PackInfo(val->shape(),
false,
nrow,
ncol,
@ -60,8 +107,8 @@ public:
Tensor packedTensor;
allocator->allocate(packedTensor, { 1, (int32_t)packsize }, Type::uint8);
// PackFp32
PackFp32(packedTensor,
// fbgemmPacked16Pack
fbgemmPacked16Pack(packedTensor,
val->data(),
false,
nrow,
@ -76,7 +123,7 @@ public:
io::Item item;
item.name = pName;
item.shape = val->shape();
item.type = Type::packed16;
item.type = gemmElementType;
// Use the actual memory as this will be aligned and padded.
// When memory mapping this is required. Shape keeps track of
@ -86,6 +133,9 @@ public:
copy(backend_, mem->data<char>(), mem->data<char>() + mem->size(), item.bytes.data());
ioItems.emplace_back(std::move(item));
#else
ABORT("Packed type {} only supported when compiled with -DUSE_FBGEMM=on", gemmElementType);
#endif
} else {
io::Item item;
val->get(item, pName);

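A usage sketch of the reworked, type-based packing interface (editorial; this mirrors the converter change earlier in this commit, and the device setup and file names are illustrative):

auto graph = New<ExpressionGraphPackable>();
graph->setDevice(CPU0);    // packing is a CPU-side operation
graph->load("model.npz");
graph->forward();          // materialize parameter tensors before packing
// pack eligible *_W weights into int8 AVX2 blocks and save them as packed8avx2 items
graph->packAndSave("model.packed8avx2.bin", /*meta=*/"", Type::packed8avx2);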

@ -0,0 +1,550 @@
#include "packed_gemm.h"
#include "tensors/tensor_allocator.h"
#include "tensors/tensor_operators.h"
#include <emmintrin.h>
#include <immintrin.h>
#include <tmmintrin.h>
#include <xmmintrin.h>
#include <cassert>
#include <cstddef>
#include <unordered_map>
//#include <chrono>
#if USE_FBGEMM
#ifdef _MSC_VER
#pragma warning(disable: 4505) // 'fbgemmAlignedAlloc' in fbgemm.h: unreferenced local function has been removed (missing 'static inline')
#pragma warning(disable: 4251) // 'fbgemm::CompressedSparseColumn::colptr_': class 'std::vector<int,std::allocator<_Ty>>' needs to have dll-interface to be used by clients of class 'fbgemm::CompressedSparseColumn'
#pragma warning(disable: 4661) // 'fbgemm::PackMatrix<fbgemm::PackBMatrix<int8_t,int32_t>,int8_t,int32_t>::PackMatrix(int32_t,int32_t,inpType *,int,const fbgemm::BlockingFactors *)': no suitable definition provided for explicit template instantiation request
#pragma warning(disable: 4244) // fbgemm\quantutils.h(51): warning C4244: 'return': conversion from 'const _Ty' to 'T2', possible loss of data
#pragma warning(disable: 4717) // 'fbgemm::PackMatrix<fbgemm::PackAWithQuantRowOffset<unsigned char,int>,unsigned char,int>::isThisLastKBlock': recursive on all control paths, function will cause runtime stack overflow
// the following does not work; need to manually disable them in Linker options
//#pragma comment(linker, "/ignore:4049") // locally defined symbol ...asmjit... imported
//#pragma comment(linker, "/ignore:4217") // locally defined symbol ...asmjit... imported
#endif
#ifdef __GNUC__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-variable"
#endif
#include "3rd_party/fbgemm/include/fbgemm/FbgemmFP16.h"
#include "3rd_party/fbgemm/include/fbgemm/QuantUtils.h"
#include "3rd_party/fbgemm/include/fbgemm/Fbgemm.h"
#ifdef __GNUC__
#pragma GCC diagnostic pop
#endif
#ifdef _OPENMP
#include <omp.h>
#endif
#if MKL_FOUND
#include <mkl.h>
#include <mkl_types.h>
#endif
using namespace fbgemm;
#endif // USE_FBGEMM
namespace marian {
namespace cpu {
namespace variant { // Variants of GEMM implementations
#if USE_FBGEMM
// initialize with a dummy
// When this class is instantiated, the actual packing operation happens. If we created
// this instance on every GEMM call, we would pack every time, which is very slow.
// In Caffe2, the operator is stateful and hold an instance of this.
// But, we don't have any logic for this in marian. We can only cache a tensor (which means a memory chunk).
// So, for now, we keep the packed memory on our own 1D tensor, then when we call GEMM,
// we just reuse this instance again and again by replacing the class members (including memory pointer). Eventually,
// I will add a new constructor to the class in FBGEMM which accepts
// pre-allocated and pre-packed memory as a parameter. After it's done,
// this temporary buffer will be removed.
// When constructing this dummy buffer, ones are used for all the parameters to allocate the minimum amount of memory.
//
// In a multi marian instance setting (as a dynamic library),
// different marian instances should not share this variable.
static thread_local PackedGemmMatrixFP16 packedPlaceholder(1, 1, 1, 1, 1, 1, 1, 1);
// Copied code from fbgemm. It's padding required by some kernels in FBGEMM.
// Verbatim - 'required by sw pipelined kernels'
// https://github.com/marian-nmt/FBGEMM/blob/master/include/fbgemm/FbgemmFP16.h#L109
const int PACK16_PADDING = 1024;
// This is a memory space to store auxiliary variables for FBGEMM (e.g. block row, block column, kernel_ncol_blocks, etc.)
const int PACK16_SPECIALMEM = 256;
// This is copied from FBGEMM code
// A better way?
// will be removed when the FBGEMM API is changed
// blocked row-major format address arithmetic
/**
* Returns the memory address in the packed (block formatted) matrix array of a specific element
* indexed by the original non-packed array.
*
* @param r_ row index in the original matrix
* @param c_ column index in the original matrix
* @param brow_ row wide block index
* @param bcol_ column wide block index
* @param nbrow_ number of blocks in row
* @param nbcol_ number of blocks in column
* @param last_brow_ row number of the last block
*/
inline uint64_t addr(const int r_,
const int c_,
const int brow_,
const int bcol_,
const int nbrow_,
const int nbcol_,
const int last_brow_) {
uint64_t r = (uint64_t)r_;
uint64_t c = (uint64_t)c_;
uint64_t block_row_id = r / brow_;
uint64_t brow_offset = (block_row_id * nbcol_) * (brow_ * bcol_);
uint64_t block_col_id = c / bcol_;
uint64_t bcol_offset
= block_col_id * ((block_row_id != nbrow_ - 1) ? (brow_ * bcol_) : (last_brow_ * bcol_));
uint64_t block_offset = brow_offset + bcol_offset;
uint64_t inblock_offset = r % brow_ * bcol_ + c % bcol_;
uint64_t index = block_offset + inblock_offset;
return index;
}
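// Editorial worked example: a 4x4 matrix packed with brow_ = 2, bcol_ = 2 gives
// nbrow_ = 2, nbcol_ = 2, last_brow_ = 2. For element (r_, c_) = (3, 1):
//   block_row_id = 1  -> brow_offset = (1 * 2) * (2 * 2) = 8
//   block_col_id = 0  -> bcol_offset = 0
//   inblock_offset = (3 % 2) * 2 + (1 % 2) = 3
// so addr(3, 1, 2, 2, 2, 2, 2) = 11: element (3,1) sits at position (1,1) of block (1,0).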
// Memory blocking factors (parameters) for packing into AVX2 int8
static const fbgemm::BlockingFactors Packed8Avx2BlockingFactors = {
PackingTraits<int8_t, int32_t, inst_set_t::avx2>::MR,
PackingTraits<int8_t, int32_t, inst_set_t::avx2>::NR,
PackingTraits<int8_t, int32_t, inst_set_t::avx2>::NR_MIN,
PackingTraits<int8_t, int32_t, inst_set_t::avx2>::ROW_INTERLEAVE,
PackingTraits<int8_t, int32_t, inst_set_t::avx2>::MCB,
PackingTraits<int8_t, int32_t, inst_set_t::avx2>::KCB,
PackingTraits<int8_t, int32_t, inst_set_t::avx2>::NCB
};
// Memory blocking factors (parameters) for packing into AVX512 int8
static const fbgemm::BlockingFactors Packed8Avx512BlockingFactors = {
PackingTraits<int8_t, int32_t, inst_set_t::avx512>::MR,
PackingTraits<int8_t, int32_t, inst_set_t::avx512>::NR,
PackingTraits<int8_t, int32_t, inst_set_t::avx512>::NR_MIN,
PackingTraits<int8_t, int32_t, inst_set_t::avx512>::ROW_INTERLEAVE,
PackingTraits<int8_t, int32_t, inst_set_t::avx512>::MCB,
PackingTraits<int8_t, int32_t, inst_set_t::avx512>::KCB,
PackingTraits<int8_t, int32_t, inst_set_t::avx512>::NCB
};
// This function returns the correct blocking factors structure for given packing type.
inline const fbgemm::BlockingFactors* getBlockingFactors(marian::Type packType) {
if(packType == Type::packed8avx2) {
return &Packed8Avx2BlockingFactors;
} else if(packType == Type::packed8avx512) {
return &Packed8Avx512BlockingFactors;
} else {
ABORT("Only avx2 and avx512 instruction sets are supported for int8. {}", packType);
}
}
void fbgemmPacked16PackInfo(const marian::Shape& shape,
const bool transpose,
uint64_t& packsize) {
int nrow, ncol, kernel_ncol_blocks, brow = 512, bcol, last_brow, nbrow, nbcol;
fbgemmPacked16PackInfo(shape, transpose, nrow, ncol, kernel_ncol_blocks, brow, bcol, last_brow, nbrow, nbcol, packsize);
}
void fbgemmPacked16PackInfo(const marian::Shape& shape,
const bool transpose,
int& nrow,
int& ncol,
int& kernel_ncol_blocks,
int& brow,
int& bcol,
int& last_brow,
int& nbrow,
int& nbcol,
uint64_t& packsize) {
nrow = transpose ? shape[1] : shape[0];
ncol = transpose ? shape[0] : shape[1];
kernel_ncol_blocks = 2;
brow = 512;
bcol = 8 * kernel_ncol_blocks;
last_brow = nrow % brow == 0 ? brow : nrow % brow;
nbrow = nrow % brow == 0 ? nrow / brow : (nrow + brow) / brow;
nbcol = ncol % bcol == 0 ? ncol / bcol : (ncol + bcol) / bcol;
ABORT_IF(ncol % bcol != 0, "ncol (number of columns) should be multiple of 16. {}", ncol);
packsize = ((nbrow * brow) * (nbcol * bcol)) * sizeof(fbgemm::float16) + PACK16_PADDING
+ PACK16_SPECIALMEM;
}
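// Editorial worked example with illustrative sizes: a 512x512 weight matrix (no
// transpose) gives brow = 512, bcol = 8 * 2 = 16, nbrow = 1, nbcol = 32 and
// last_brow = 512, so
//   packsize = (1 * 512) * (32 * 16) * sizeof(float16) + 1024 + 256
//            = 512 * 512 * 2 + 1280 = 525568 bytes.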
void fbgemmPacked8PackInfo(const marian::Shape& shape,
const marian::Type packType,
const bool transpose,
int& nrow,
int& ncol,
uint64_t& packsize) {
// Should be 2D - weight matrix
ABORT_IF(shape.size() != 2,
"Weight Matrix should be 2D");
nrow = transpose ? shape[1] : shape[0];
ncol = transpose ? shape[0] : shape[1];
const fbgemm::BlockingFactors* params = getBlockingFactors(packType);
packsize = fbgemm::PackMatrix<fbgemm::PackBMatrix<int8_t>, int8_t>::packedBufferSize(
transpose ? shape[1] : shape[0],
transpose ? shape[0] : shape[1], params);
// add extra space for storing some other variables specific to B matrix
// quantization scales: 1 per column, float
// quantization offsets: 1 per column, int32
// column offsets: 1 per column, int32
packsize += ncol * (sizeof(float) + sizeof(int32_t) + sizeof(int32_t));
}
// This function computes the offset values for each column, which are used to compensate for the remainders of quantized values
// More detailed math is available in the FBGEMM blog - https://engineering.fb.com/ml-applications/fbgemm/
inline void col_offsets_with_zero_pt_s8acc32(
bool transpose,
int K,
int N,
const int8_t* Bint8,
const int32_t* B_zero_point,
int32_t* col_offsets,
int ncols_per_quant_group) {
for (int n = 0; n < N; ++n) {
int32_t sum = 0;
for (int k = 0; k < K; ++k) {
sum += transpose ? Bint8[k + n * K] : Bint8[k * N + n];
}
col_offsets[n] = sum - B_zero_point[n / ncols_per_quant_group] * K;
}
}
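// Editorial note on the compensation: with activation quantization (scale s_a, zero
// point z_a) and per-column weight quantization (s_b[j], z_b[j]),
//   sum_k (A_q[i][k] - z_a) * (B_q[k][j] - z_b[j])
//     = sum_k A_q[i][k] * B_q[k][j] - z_a * col_offsets[j] - z_b[j] * row_offsets[i],
// with col_offsets[j] = sum_k B_q[k][j] - z_b[j] * K as computed above; the
// requantization step in fbgemmPacked8Gemm consumes exactly these offsets.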
void fbgemmPacked16Pack(marian::Tensor out,
const float* inData, // Packing is only available for 2D weight matrix in Marian. Otherwise, it's aborted in expanded_gemm.h.
const bool transpose,
const int nrow,
const int ncol,
const int kernel_ncol_blocks,
const int brow,
const int bcol,
const int last_brow,
const int nbrow,
const int nbcol,
const uint64_t packsize) {
// initialize memory
uint8_t* outmemorg = out->data<uint8_t>();
for(auto i = 0; i < packsize; i++) {
outmemorg[i] = 0;
}
// save the other auxiliary variables
uint64_t* auxmemsize = (uint64_t*)outmemorg;
auxmemsize[0] = packsize;
// save FBGEMM related parameters into the header of the allocated memory by marian
int32_t header[8];
header[0] = nrow;
header[1] = ncol;
header[2] = kernel_ncol_blocks;
header[3] = brow;
header[4] = bcol;
header[5] = last_brow;
header[6] = nbrow;
header[7] = nbcol;
memcpy(auxmemsize + 1, header, sizeof(header));
// cast to float16
fbgemm::float16* outmem = (fbgemm::float16*)(outmemorg + 256);
fbgemm::float16* dummy = new fbgemm::float16;
// pack the matrix
for(int i = 0; i < nrow; i++) {
for(int j = 0; j < ncol; j++) {
outmem[addr(i, j, brow, bcol, nbrow, nbcol, last_brow)]
= tconv(!transpose ? inData[i * ncol + j] : inData[i + nrow * j], *dummy);
}
}
delete dummy;
}
void fbgemmPacked8Pack(marian::Tensor out,
const float* inData,
const marian::Type packType,
const bool transpose,
const int nrow,
const int ncol,
const uint64_t packsize) {
int k = nrow;
int n = ncol;
int len = k * n;
// 1. collect stats for each column
float* bqScale = new float[n];
int32_t* bqZeropoint = new int32_t[n];
const float* data = inData;
float val = 0;
if (transpose) {
for (int jj = 0; jj < n; jj++) {
float min = std::numeric_limits<float>::max(), max = std::numeric_limits<float>::min();
double mean = 0, sqrsum = 0;
for (int ii = 0; ii < k; ii++) {
val = data[jj * k + ii];
mean += val;
sqrsum += val * val;
}
mean /= k;
sqrsum /= k;
sqrsum -= mean * mean;
sqrsum = sqrt(sqrsum);
min = (float)(mean - 7.0f*sqrsum);
max = (float)(mean + 7.0f*sqrsum);
bqScale[jj] = (max - min) / 255;
bqZeropoint[jj] = (int32_t)(127 - max / bqScale[jj]);
}
} else {
for (int jj = 0; jj < n; jj++) {
float min = std::numeric_limits<float>::max(), max = std::numeric_limits<float>::min();
double mean = 0, sqrsum = 0;
for (int ii = 0; ii < k; ii++) {
val = data[jj + ii * n];
mean += val;
sqrsum += val * val;
}
mean /= k;
sqrsum /= k;
sqrsum -= mean * mean;
sqrsum = sqrt(sqrsum);
min = (float)(mean - 7.0f*sqrsum);
max = (float)(mean + 7.0f*sqrsum);
bqScale[jj] = (max - min) / 255;
bqZeropoint[jj] = (int32_t)(127 - max / bqScale[jj]);
}
}
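// Editorial worked example of the range heuristic above: a column with mean 0 and
// standard deviation 0.1 gets min = -0.7 and max = 0.7 (mean -/+ 7 sigma), hence
// bqScale = 1.4 / 255 ~ 0.0055 and bqZeropoint = (int32_t)(127 - 0.7 / 0.0055) ~ 0,
// centering the clipped range on the int8 domain.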
// 2. quantize
int8_t* quantized = 0;
#ifdef _MSC_VER
quantized = (int8_t*)_aligned_malloc(len, 256);
#else
int result = posix_memalign((void**)&quantized, 256, len); result;
assert(result == 0);
#endif
for (int jj = 0; jj < n; jj++) {
TensorQuantizationParams bQuantParam;
bQuantParam.scale = bqScale[jj];
bQuantParam.zero_point = bqZeropoint[jj];
bQuantParam.precision = 8;
if (transpose)
fbgemm::Quantize<int8_t>(data + jj * k, quantized + jj * k, k, bQuantParam);
else {
for (int ii = 0; ii < k; ii++) {
quantized[ii*n + jj] = fbgemm::Quantize<int8_t>(data[ii*n + jj], bQuantParam);
}
}
}
// 3. compute column offsets
int32_t* col_offsets = new int32_t[n];
col_offsets_with_zero_pt_s8acc32(transpose, k, n, quantized, bqZeropoint, col_offsets, 1);
int8_t* packedbuf = out->data<int8_t>();
for(auto i = 0; i < packsize; i++) {
packedbuf[i] = 0;
}
// 4. packing
const fbgemm::BlockingFactors* params = getBlockingFactors(packType);
PackBMatrix<int8_t> packedBN(
transpose ? matrix_op_t::Transpose : matrix_op_t::NoTranspose,
nrow, ncol, quantized, transpose ? nrow : ncol, packedbuf, 1, params);
// copy quantization scale
memcpy(packedbuf + (packsize - n * (sizeof(float) + sizeof(int32_t) + sizeof(int32_t))), bqScale, n * sizeof(float));
// copy quantization offset
memcpy(packedbuf + (packsize - n * (sizeof(int32_t) + sizeof(int32_t))), bqZeropoint, n * sizeof(int32_t));
// copy column offsets to the memory
memcpy(packedbuf + (packsize - n * sizeof(int32_t)), col_offsets, n * sizeof(int32_t));
#ifdef _MSC_VER
_aligned_free(quantized);
#else
free(quantized);
#endif
delete[] col_offsets;
delete[] bqScale;
delete[] bqZeropoint;
}
// GEMM operation on the packed B matrix
// C: output matrix
// A: A matrix
// B: B matrix (packed)
// m: the number of rows in A and C
// n: the number of columns in B and C
// transA: transpose of A matrix
// B is already packed. So, we don't need transB
void fbgemmPacked16Gemm(marian::Tensor C,
const marian::Tensor A,
const marian::Tensor B,
const marian::Tensor bias,
const size_t m,
const size_t n,
const int transA) {
// row major
// keep the original mem
fbgemm::float16* pmat = packedPlaceholder.pmat_;
// retrieve aux fields from the memory
uint64_t* packedmemSize = (uint64_t*)B->data();
packedPlaceholder.size_ = packedmemSize[0];
int32_t header[8];
memcpy(header, packedmemSize + 1, sizeof(header));
packedPlaceholder.nrow_ = header[0];
packedPlaceholder.ncol_ = header[1];
packedPlaceholder.kernel_ncol_blocks_ = header[2];
packedPlaceholder.brow_ = header[3];
packedPlaceholder.bcol_ = header[4];
packedPlaceholder.last_brow_ = header[5];
packedPlaceholder.nbrow_ = header[6];
packedPlaceholder.nbcol_ = header[7];
// packed matrix
packedPlaceholder.pmat_ = (fbgemm::float16*)(B->data<uint8_t>() + 256);
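// Broadcast the bias row into every row of C; the GEMM below is then called
// with beta = 1.0f so that A*B is accumulated on top of the bias.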
if(bias != nullptr) {
#if MKL_FOUND
for(int i = 0; i < m; ++i) {
mkl_somatcopy('R', 'N', 1, n, 1, bias->data(), n, C->data() + n * i, n);
}
#else
for(int i = 0; i < m; ++i) {
std::copy(bias->data(), bias->data() + n, C->data() + n * i);
}
#endif
}
#ifdef _OPENMP
#pragma omp parallel
#endif
{
#ifdef _OPENMP
int num_threads = omp_get_num_threads();
int tid = omp_get_thread_num();
#else
int num_threads = 1;
int tid = 0;
#endif
fbgemm::cblas_gemm_compute(transA ? matrix_op_t::Transpose : matrix_op_t::NoTranspose,
(int)m,
A->data(),
packedPlaceholder,
bias != nullptr ? 1.0f : 0.0f,
C->data(),
tid,
num_threads);
}
// return back the original mem
packedPlaceholder.pmat_ = pmat;
}
// GEMM operation on the packed B matrix in 8 bit integers
// C: output matrix
// A: A matrix
// B: B matrix (packed)
// m: the number of rows in A and C
// n: the number of columns in B and C
// k: the number of columns in A and the number of rows in B
// transA: whether A matrix is transposed or not
// transB: whether B matrix is transposed or not
void fbgemmPacked8Gemm(marian::Tensor C,
const marian::Tensor A,
const marian::Tensor B,
const size_t m,
const size_t n,
const size_t k,
const int transA,
const int transB) {
// pack type
marian::Type packType = B->type();
const fbgemm::BlockingFactors* params = getBlockingFactors(packType);
if((packType == Type::packed8avx2 && fbgemmHasAvx512Support())
|| (packType == Type::packed8avx512 && !fbgemmHasAvx512Support())) {
ABORT("FBGEMM doesn't allow to use {} packing order on {} CPUs",
packType == Type::packed8avx2 ? "AVX2" : "AVX512",
fbgemmHasAvx512Support() ? "AVX512" : "AVX2");
}
// compute range to quantize A (activations) - (min/max quantization)
float min_est = std::numeric_limits<float>::max(), max_est = std::numeric_limits<float>::lowest();
int elem = A->shape().elements();
float* data = A->data();
// AVX based find min/max
FindMinMax(data, &min_est, &max_est, elem);
float ascale = (max_est - min_est) / 255;
int32_t azeropoint = (int32_t)(255 - max_est / ascale);
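// e.g. [min_est, max_est] = [-1, 4] gives ascale = 5/255 = 1/51 and
// azeropoint = 255 - 4*51 = 51, so q = x/ascale + azeropoint maps
// -1 -> 0 and 4 -> 255, covering the full uint8 range.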
std::vector<int32_t> row_offset_buf(PackAWithQuantRowOffset<uint8_t>::rowOffsetBufferSize());
PackAWithQuantRowOffset<uint8_t> packAN(
transA ? matrix_op_t::Transpose : matrix_op_t::NoTranspose,
(int32_t)(transA ? k : m),
(int32_t)(transA ? m : k),
A->data(),
(int32_t)(transA ? m : k),
nullptr, /*buffer for packed matrix*/
ascale,
azeropoint,
1, /*groups*/
row_offset_buf.data(),
params);
// packed matrix size of B
int bPackSize = PackMatrix<PackBMatrix<int8_t>, int8_t>::packedBufferSize((int32_t)k, (int32_t)n);
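// The tail of the packed buffer (filled during packing above) stores the per-column
// quantization parameters, i.e. packsize = bPackSize + n * (sizeof(float) + 2 * sizeof(int32_t)).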
// retrieve B matrix
int8_t* bdata = B->data<int8_t>();
float* bqScale = new float[n];
memcpy(bqScale, bdata + bPackSize, n * sizeof(float));
int32_t* bqZeropoint = new int32_t[n];
memcpy(bqZeropoint, bdata + bPackSize + n * sizeof(float), n * sizeof(int32_t));
int32_t* col_offsets = new int32_t[n];
memcpy(col_offsets, bdata + bPackSize + n * (sizeof(float) + sizeof(int32_t)), n * sizeof(int32_t));
DoNothing<float, float> doNothingObj{};
ReQuantizeForFloat<false, QuantizationGranularity::OUT_CHANNEL> outputProcObj(
doNothingObj,
ascale,
bqScale,
azeropoint,
bqZeropoint,
packAN.getRowOffsetBuffer(),
col_offsets,
nullptr,
(std::uint32_t) n);
PackBMatrix<int8_t> repackedBN(
transB ? matrix_op_t::Transpose : matrix_op_t::NoTranspose, (int32_t) k, (int32_t) n, bdata, (int32_t) (transB ? k : n), 1, params);
// gemm computation
fbgemmPacked(packAN, repackedBN, C->data(), (int32_t*)C->data(), (int32_t) n, outputProcObj, 0, 1, params);
delete[] col_offsets;
delete[] bqZeropoint;
delete[] bqScale;
}
#endif // USE_FBGEMM
} // namespace variant
} // namespace cpu
} // namespace marian
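
For reference, the memcpy arithmetic in the int8 packing routine above implies a fixed tail layout of the packed buffer, which the GEMM reads back in the same order. A minimal sketch of that layout (the struct and helper are illustrative only, not part of marian):

#include <cstddef>
#include <cstdint>

// Tail layout implied by the memcpy offsets above:
// [ packed B | n x float scale | n x int32 zero point | n x int32 column offsets ]
struct PackedInt8TailOffsets { // hypothetical helper for illustration
  size_t scale;      // byte offset of the per-column scales
  size_t zeroPoint;  // byte offset of the per-column zero points
  size_t colOffsets; // byte offset of the per-column offsets
};

inline PackedInt8TailOffsets tailOffsets(size_t packsize, size_t n) {
  return { packsize - n * (sizeof(float) + sizeof(int32_t) + sizeof(int32_t)),
           packsize - n * (sizeof(int32_t) + sizeof(int32_t)),
           packsize - n * sizeof(int32_t) };
}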

View File

@ -0,0 +1,141 @@
#pragma once
#include "tensors/tensor.h"
namespace marian {
namespace cpu {
namespace variant { // Variants of GEMM implementations
// Returns the byte size of the packed matrix in fp16, calculated by fbgemm's internal logic to account for padding and layout differences.
// Packing with fp16 only targets AVX2 instruction sets for now.
// See '3rd_party/fbgemm/include/fbgemm/FbgemmFP16.h'.
// shape: shape of the tensor to be packed
// transpose: the matrix is transposed
// packsize (out): the size of the packed matrix in byte
void fbgemmPacked16PackInfo(const marian::Shape& shape,
const bool transpose,
/*out*/uint64_t& packsize);
// Returns the byte size of the packed matrix in fp16, calculated by fbgemm's internal logic to account for padding and layout differences.
// This overload additionally returns the packing parameters (block sizes and counts).
// Packing with fp16 only targets AVX2 instruction sets for now.
// See '3rd_party/fbgemm/include/fbgemm/FbgemmFP16.h'.
// shape: shape of the tensor to be packed
// transpose: the matrix is transposed
// nrow (out): the number of rows
// ncol (out): the number of columns
// kernel_ncol_blocks (out): the number of column blocks
// brow (out): the number of rows in a block
// bcol (out): the number of columns in a block
// last_brow (out): the number of rows in the last block
// nbrow (out): the number of row blocks
// nbcol (out): the number of column blocks
// packsize (out): the size of the packed matrix in byte
void fbgemmPacked16PackInfo(const marian::Shape& shape,
const bool transpose,
/*out*/int& nrow,
/*out*/int& ncol,
/*out*/int& kernel_ncol_blocks,
/*out*/int& brow,
/*out*/int& bcol,
/*out*/int& last_brow,
/*out*/int& nbrow,
/*out*/int& nbcol,
/*out*/uint64_t& packsize); // @TODO: change to size_t where appropriate
// Returns the byte size of the packed matrix in int8, calculated by fbgemm's internal logic to account for padding and layout differences.
// See '3rd_party/fbgemm/src/PackBMatrix.cc'.
// shape: shape of the tensor to be packed
// packType: Type to be packed - packed8avx2 or packed8avx512
// transpose: the matrix is transposed
// nrow (out): the number of rows
// ncol (out): the number of columns
// packsize (out): the size of the packed matrix in byte
void fbgemmPacked8PackInfo(const marian::Shape& shape,
const marian::Type packType,
const bool transpose,
/*out*/int& nrow,
/*out*/int& ncol,
/*out*/uint64_t& packsize);
// Pack a matrix into a cache-efficient blocked format, converting the fp32 input to fp16
// out: output tensor - packed format
// inData: input tensor data - pointer of float data
// transpose: the matrix is transposed
// nrow: the number of rows
// ncol: the number of columns
// kernel_ncol_blocks: the number of column blocks
// brow: the number of rows in a block
// bcol: the number of columns in a block
// last_brow: the number of rows in the last block
// nbrow: the number of row blocks
// nbcol: the number of column blocks
// packsize: the size of the packed matrix
// (the number of fp16 elements + padding (1024) + extra temporary memory (256))
void fbgemmPacked16Pack(marian::Tensor out,
const float* inData,
const bool transpose,
const int nrow,
const int ncol,
const int kernel_ncol_blocks,
const int brow,
const int bcol,
const int last_brow,
const int nbrow,
const int nbcol,
const uint64_t packsize); // @TODO: change to size_t where appropriate
// Pack a matrix into a cache-efficient blocked format, quantizing the fp32 input to int8
// out: output tensor - packed format and quantized into int8
// inData: input tensor data - pointer of float data
// packType: Type to be packed - packed8avx2 or packed8avx512
// transpose: the matrix is transposed
// nrow: the number of rows
// ncol: the number of columns
// packsize: the size of the packed matrix
// (the size of int8 packed B from fbgemm:PackAWithQuantRowOffset + quantization scale, offset and zero point)
void fbgemmPacked8Pack(marian::Tensor out,
const float* inData,
const marian::Type packType,
const bool transpose,
const int nrow,
const int ncol,
const uint64_t packsize); // @TODO: change to size_t where appropriate
// GEMM operation on the packed B matrix
// C: output matrix
// A: A matrix
// B: B matrix (packed)
// m: the number of rows in A and C
// n: the number of columns in B and C
// transA: transpose of A matrix
// B is already packed. So, we don't need transB
void fbgemmPacked16Gemm(marian::Tensor C,
const marian::Tensor A,
const marian::Tensor B,
const marian::Tensor bias,
const size_t m,
const size_t n,
const int transA = 0);
// GEMM operation on the packed B matrix in 8 bit integers
// C: output matrix
// A: A matrix
// B: B matrix (packed)
// m: the number of rows in A and C
// n: the number of columns in B and C
// k: the number of columns in A and rows in B
// transA: transpose of A matrix
// transB: transpose of B matrix
void fbgemmPacked8Gemm(marian::Tensor C,
const marian::Tensor A,
const marian::Tensor B,
const size_t m,
const size_t n,
const size_t k,
const int transA = 0,
const int transB = 0);
} // namespace variant
} // namespace cpu
} // namespace marian
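
For orientation, a hedged sketch of the intended call sequence for the int8 path, using only the declarations above (tensor creation and allocation elided; variable names are illustrative):

// Hypothetical usage of the int8 packing API declared above.
int nrow = 0, ncol = 0;
uint64_t packsize = 0;
// 1. query the packed buffer size for the weight matrix B
fbgemmPacked8PackInfo(B->shape(), marian::Type::packed8avx2,
                      /*transpose=*/false, nrow, ncol, packsize);
// 2. allocate a 1-D int8 tensor 'packedB' of packsize bytes (elided),
//    then quantize and pack B into it
fbgemmPacked8Pack(packedB, B->data(), marian::Type::packed8avx2,
                  /*transpose=*/false, nrow, ncol, packsize);
// 3. C[m x n] = A[m x k] * B[k x n]; requantization is handled internally
fbgemmPacked8Gemm(C, A, packedB, m, n, k);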

View File

@ -1,313 +0,0 @@
#include "packed_gemm.h"
#include "tensors/tensor_allocator.h"
#include "tensors/tensor_operators.h"
#include <emmintrin.h>
#include <immintrin.h>
#include <tmmintrin.h>
#include <xmmintrin.h>
#include <cassert>
#include <cstddef>
#include <unordered_map>
//#include <chrono>
#if USE_FBGEMM
#ifdef _MSC_VER
#pragma warning(disable: 4505) // 'fbgemmAlignedAlloc' in fbgemm.h: unreferenced local function has been removed (missing 'static inline')
#pragma warning(disable: 4251) // 'fbgemm::CompressedSparseColumn::colptr_': class 'std::vector<int,std::allocator<_Ty>>' needs to have dll-interface to be used by clients of class 'fbgemm::CompressedSparseColumn'
// the following does not work; need to manually disable them in Linker options
//#pragma comment(linker, "/ignore:4049") // locally defined symbol ...asmjit... imported
//#pragma comment(linker, "/ignore:4217") // locally defined symbol ...asmjit... imported
#endif
#ifdef __GNUC__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-variable"
#endif
#include "3rd_party/fbgemm/include/fbgemm/FbgemmFP16.h"
#include "3rd_party/fbgemm/include/fbgemm/QuantUtils.h"
#include "3rd_party/fbgemm/include/fbgemm/Fbgemm.h"
#ifdef __GNUC__
#pragma GCC diagnostic pop
#endif
#ifdef _OPENMP
#include <omp.h>
#endif
#if MKL_FOUND
#include <mkl.h>
#include <mkl_types.h>
#endif
using namespace fbgemm;
#endif // USE_FBGEMM
namespace marian {
namespace cpu {
namespace variant { // Variants of GEMM implementations
#if USE_FBGEMM
// initialize with a dummy
// When this class is instantiated, the actual packing operation happens.
// If we created this instance every time we call GEMM, we would repack every
// time, which is very slow.
// In Caffe2, the operator is stateful and holds an instance of this class.
// But we don't have any logic for this in marian; we can only cache a tensor
// (which means a memory chunk). So, for now, we keep the packed memory in our
// own 1D tensor, and when we call GEMM we just reuse this instance again and
// again by replacing its class members (including the memory pointer).
// Eventually, a new constructor will be added to the class in FBGEMM which
// accepts pre-allocated and pre-packed memory as a parameter; after that is
// done, this temporary buffer will be removed.
// When constructing this dummy buffer, ones are used for all the parameters
// to allocate the minimum amount of memory.
//
// In a multi marian instance setting (as a dynamic library),
// different marian instances should not share this variable.
static thread_local PackedGemmMatrixFP16 packedPlaceholder(1, 1, 1, 1, 1, 1, 1, 1);
// Copied from fbgemm: padding required by some kernels in FBGEMM
// Verbatim - 'required by sw pipelined kernels'
// https://github.com/marian-nmt/FBGEMM/blob/master/include/fbgemm/FbgemmFP16.h#L109
const int PACK16_PADDING = 1024;
// This is a memory space to store auxiliary variables for FBGEMM (e.g. block row, block column, kernel_ncol_blocks, etc.)
const int PACK16_SPECIALMEM = 256;
// This is copied from FBGEMM code (is there a better way?);
// it will be removed when the FBGEMM API changes.
// blocked row-major format address arithmetic
/**
* Returns the memory address in the packed (block formatted) matrix array of a specific element
* indexed by the original non-packed array.
*
* @param r_ row index in the original matrix
* @param c_ column index in the original matrix
* @param brow_ number of rows in a block
* @param bcol_ number of columns in a block
* @param nbrow_ number of blocks along the rows
* @param nbcol_ number of blocks along the columns
* @param last_brow_ number of rows in the last block
*/
inline uint64_t addr(const int r_,
const int c_,
const int brow_,
const int bcol_,
const int nbrow_,
const int nbcol_,
const int last_brow_) {
uint64_t r = (uint64_t)r_;
uint64_t c = (uint64_t)c_;
uint64_t block_row_id = r / brow_;
uint64_t brow_offset = (block_row_id * nbcol_) * (brow_ * bcol_);
uint64_t block_col_id = c / bcol_;
uint64_t bcol_offset
= block_col_id * ((block_row_id != nbrow_ - 1) ? (brow_ * bcol_) : (last_brow_ * bcol_));
uint64_t block_offset = brow_offset + bcol_offset;
uint64_t inblock_offset = r % brow_ * bcol_ + c % bcol_;
uint64_t index = block_offset + inblock_offset;
return index;
}
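// Worked example: brow_=2, bcol_=2, nbrow_=2, nbcol_=2, last_brow_=2 describes
// a 4x4 matrix split into 2x2 blocks. For (r_=2, c_=1): brow_offset = 1*2*4 = 8,
// bcol_offset = 0, inblock_offset = 0*2 + 1 = 1, so the element lands at
// index 9 - blocks are laid out row by row, row-major within each block.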
void PackInfoFp32(const marian::Shape& shape,
const bool transpose,
uint64_t& packsize) {
int nrow, ncol, kernel_ncol_blocks, brow = 512, bcol, last_brow, nbrow, nbcol;
PackInfoFp32(shape, transpose, nrow, ncol, kernel_ncol_blocks, brow, bcol, last_brow, nbrow, nbcol, packsize);
}
void PackInfoFp32(const marian::Shape& shape,
const bool transpose,
int& nrow,
int& ncol,
int& kernel_ncol_blocks,
int& brow,
int& bcol,
int& last_brow,
int& nbrow,
int& nbcol,
uint64_t& packsize) {
nrow = transpose ? shape[1] : shape[0];
ncol = transpose ? shape[0] : shape[1];
kernel_ncol_blocks = 2;
brow = 512;
bcol = 8 * kernel_ncol_blocks;
last_brow = nrow % brow == 0 ? brow : nrow % brow;
nbrow = nrow % brow == 0 ? nrow / brow : (nrow + brow) / brow;
nbcol = ncol % bcol == 0 ? ncol / bcol : (ncol + bcol) / bcol;
ABORT_IF(ncol % bcol != 0, "ncol (number of columns) should be a multiple of 16, but is {}", ncol);
packsize = ((nbrow * brow) * (nbcol * bcol)) * sizeof(fbgemm::float16) + PACK16_PADDING
+ PACK16_SPECIALMEM;
}
void PackFp32(marian::Tensor out,
const float* inData, // Packing is only available for 2D weight matrix in Marian. Otherwise, it's aborted in expanded_gemm.h.
const bool transpose,
const int nrow,
const int ncol,
const int kernel_ncol_blocks,
const int brow,
const int bcol,
const int last_brow,
const int nbrow,
const int nbcol,
const uint64_t packsize) {
// initialize memory
uint8_t* outmemorg = out->data<uint8_t>();
for(auto i = 0; i < packsize; i++) {
outmemorg[i] = 0;
}
// save the other auxiliary variables
uint64_t* auxmemsize = (uint64_t*)outmemorg;
auxmemsize[0] = packsize;
// save FBGEMM related parameters into the header of the allocated memory by marian
int32_t header[8];
header[0] = nrow;
header[1] = ncol;
header[2] = kernel_ncol_blocks;
header[3] = brow;
header[4] = bcol;
header[5] = last_brow;
header[6] = nbrow;
header[7] = nbcol;
memcpy(auxmemsize + 1, header, sizeof(header));
// cast to float16
fbgemm::float16* outmem = (fbgemm::float16*)(outmemorg + 256);
fbgemm::float16* dummy = new fbgemm::float16;
// pack the matrix
for(int i = 0; i < nrow; i++) {
for(int j = 0; j < ncol; j++) {
outmem[addr(i, j, brow, bcol, nbrow, nbcol, last_brow)]
= tconv(!transpose ? inData[i * ncol + j] : inData[i + nrow * j], *dummy);
}
}
delete dummy;
}
// GEMM operation on the packed B matrix
// C: output matrix
// A: A matrix
// B: B matrix (packed)
// m: the number of rows in A and C
// n: the number of columns in B and C
// transA: transpose of A matrix
// B is already packed. So, we don't need transB
void GemmPackFp32(marian::Tensor C,
const marian::Tensor A,
const marian::Tensor B,
const marian::Tensor bias,
const size_t m,
const size_t n,
const int transA) {
// row major
// keep the original mem
fbgemm::float16* pmat = packedPlaceholder.pmat_;
// retrieve aux fields from the memory
uint64_t* packedmemSize = (uint64_t*)B->data();
packedPlaceholder.size_ = packedmemSize[0];
int32_t header[8];
memcpy(header, packedmemSize + 1, sizeof(header));
packedPlaceholder.nrow_ = header[0];
packedPlaceholder.ncol_ = header[1];
packedPlaceholder.kernel_ncol_blocks_ = header[2];
packedPlaceholder.brow_ = header[3];
packedPlaceholder.bcol_ = header[4];
packedPlaceholder.last_brow_ = header[5];
packedPlaceholder.nbrow_ = header[6];
packedPlaceholder.nbcol_ = header[7];
// packed matrix
packedPlaceholder.pmat_ = (fbgemm::float16*)(B->data<uint8_t>() + 256);
if(bias != nullptr) {
#if MKL_FOUND
for(int i = 0; i < m; ++i) {
mkl_somatcopy('R', 'N', 1, n, 1, bias->data(), n, C->data() + n * i, n);
}
#else
for(int i = 0; i < m; ++i) {
std::copy(bias->data(), bias->data() + n, C->data() + n * i);
}
#endif
}
#ifdef _OPENMP
#pragma omp parallel
#endif
{
#ifdef _OPENMP
int num_threads = omp_get_num_threads();
int tid = omp_get_thread_num();
#else
int num_threads = 1;
int tid = 0;
#endif
fbgemm::cblas_gemm_compute(transA ? matrix_op_t::Transpose : matrix_op_t::NoTranspose,
(int)m,
A->data(),
packedPlaceholder,
bias != nullptr ? 1.0f : 0.0f,
C->data(),
tid,
num_threads);
}
// return back the original mem
packedPlaceholder.pmat_ = pmat;
}
#else // USE_FBGEMM
void PackInfoFp32(const marian::Shape& shape,
const bool transpose,
uint64_t& packsize) {
// does nothing. supports only FBGEMM based packed gemm at this moment.
ABORT("FBGEMM is needed to use packed GEMM.");
}
void PackInfoFp32(const marian::Shape& shape,
const bool transpose,
int& nrow,
int& ncol,
int& kernel_ncol_blocks,
int& brow,
int& bcol,
int& last_brow,
int& nbrow,
int& nbcol,
uint64_t& packsize) {
// does nothing. supports only FBGEMM based packed gemm at this moment.
ABORT("FBGEMM is needed to use packed GEMM.");
}
void PackFp32(marian::Tensor out,
const float* inData,
const bool transpose,
const int nrow,
const int ncol,
const int kernel_ncol_blocks,
const int brow,
const int bcol,
const int last_brow,
const int nbrow,
const int nbcol,
const uint64_t packsize) {
// does nothing. supports only FBGEMM based packed gemm at this moment.
ABORT("FBGEMM is needed to use packed GEMM.");
}
void GemmPackFp32(marian::Tensor C,
const marian::Tensor A,
const marian::Tensor B,
const marian::Tensor bias,
const size_t m,
const size_t n,
const int transA) {
// does nothing. supports only FBGEMM based packed gemm at this moment.
ABORT("FBGEMM is needed to use packed GEMM.");
}
#endif // USE_FBGEMM
} // namespace variant
} // namespace cpu
} // namespace marian

View File

@ -1,70 +0,0 @@
#pragma once
#include "tensors/tensor.h"
namespace marian {
namespace cpu {
namespace variant { // Variants of GEMM implementations
void PackInfoFp32(const marian::Shape& shape,
const bool transpose,
/*out*/uint64_t& packsize);
void PackInfoFp32(const marian::Shape& shape,
const bool transpose,
int& nrow,
int& ncol,
int& kernel_ncol_blocks,
int& brow,
int& bcol,
int& last_brow,
int& nbrow,
int& nbcol,
/*out*/uint64_t& packsize); // @TODO: change to size_t where appropriate
// Pack a matrix into a cache-efficient blocked format
// out: output tensor - packed format
// inData: input tensor data - pointer of float data
// transpose: the matrix is transposed
// nrow: the number of rows
// ncol: the number of columns
// kernel_ncol_blocks: the number of column blocks
// brow: the number of rows in a block
// bcol: the number of columns in a block
// last_brow: the number of rows in the last block
// nbrow: the number of row blocks
// nbcol: the number of column blocks
// packsize: the size of the packed matrix
// (the number of fp16 elements + padding (1024) + extra temporary memory (256))
void PackFp32(marian::Tensor out,
const float* inData,
const bool transpose,
const int nrow,
const int ncol,
const int kernel_ncol_blocks,
const int brow,
const int bcol,
const int last_brow,
const int nbrow,
const int nbcol,
const uint64_t packsize); // @TODO: change to size_t where appropriate
// GEMM operation on the packed B matrix
// C: output matrix
// A: A matrix
// B: B matrix (packed)
// m: the number of rows in A and C
// n: the number of columns in B and C
// transA: transpose of A matrix
// B is already packed. So, we don't need transB
void GemmPackFp32(marian::Tensor C,
const marian::Tensor A,
const marian::Tensor B,
const marian::Tensor bias,
const size_t m,
const size_t n,
const int transA = 0);
} // namespace variant
} // namespace cpu
} // namespace marian

View File

@ -425,9 +425,13 @@ void Softmax(Tensor out, Tensor in) {
matchOrAbort<float>(out->type());
matchOrAbort<float>(in->type());
#ifdef __AVX__
if(out->shape()[-1] % 8 == 0) {
Softmax<float32x8>(out, in);
} else if(out->shape()[-1] % 4 == 0) {
return;
}
#endif
if(out->shape()[-1] % 4 == 0) {
Softmax<float32x4>(out, in);
} else {
Softmax<float>(out, in);
@ -477,9 +481,13 @@ void LogSoftmax(Tensor out, Tensor in) {
matchOrAbort<float>(out->type());
matchOrAbort<float>(in->type());
#ifdef __AVX__
if(out->shape()[-1] % 8 == 0) {
LogSoftmax<float32x8>(out, in);
} else if(out->shape()[-1] % 4 == 0) {
return;
}
#endif
if(out->shape()[-1] % 4 == 0) {
LogSoftmax<float32x4>(out, in);
} else {
LogSoftmax<float>(out, in);
@ -678,20 +686,22 @@ void Select(Tensor out,
// @TODO: make this efficient
functional::Shape outShape = out->shape();
functional::Shape inShape = in->shape();
functional::Shape inShape = in->shape();
functional::Shape idxShape = indices->shape();
int length = outShape.elements();
functional::Array<int, functional::Shape::size()> dims;
int axisCPU = (int)(axis + functional::Shape::size() - out->shape().size());
if(axisCPU == 2) // specialization for axis==2, assuming N=4
if(axisCPU == 2 && outShape == idxShape) // specialization for axis==2 when there is no broadcasting, @TODO to be removed once we have a faster implementation below
return SelectAxis2(out, in, indices);
for(int index = 0; index < length; ++index) {
outShape.dims(index, dims);
dims[axisCPU] = (int)indices->data<IndexType>()[dims[axisCPU]];
int inIndex = inShape.index(dims);
out->data()[index] = in->data()[inIndex];
outShape.dims(index, dims); // compute dimension-based indices from global index;
int idxIndex = idxShape.bindex(dims); // return global index for indices based on dimension-specific indices from out, take broadcasting into account;
dims[axisCPU] = (int)indices->data<IndexType>()[idxIndex]; // substitute index of out-tensor with corresponding axis-local position from in-tensor;
int inIndex = inShape.index(dims); // compute global index from dimension-specific indices, no broadcasting as out and in match in all dimensions apart from axis
out->data()[index] = in->data()[inIndex]; // assign corresponding values.
}
}
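In effect, Select is a broadcasting gather along one axis. A minimal self-contained 2-D sketch of the same indexing scheme (hypothetical helper; the axis is fixed to the columns, and an index array with a single column broadcasts across all output columns, as idxShape.bindex does above):

#include <vector>

// out[r][c] = in[r][ indices[r][c'] ], where c' = 0 if the index array has a
// single column (broadcast), else c' = c.
void select2d(std::vector<float>& out, const std::vector<float>& in,
              const std::vector<int>& indices,
              int rows, int outCols, int inCols, int idxCols) {
  for(int r = 0; r < rows; ++r)
    for(int c = 0; c < outCols; ++c) {
      int cIdx = (idxCols == 1) ? 0 : c;       // bindex-style broadcasting
      int idx = indices[r * idxCols + cIdx];   // axis-local position in 'in'
      out[r * outCols + c] = in[r * inCols + idx];
    }
}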
@ -704,7 +714,8 @@ void Insert(Tensor out,
// @TODO: make this efficient
functional::Shape outShape = out->shape();
functional::Shape inShape = in->shape();
functional::Shape inShape = in->shape();
functional::Shape idxShape = indices->shape();
int length = inShape.elements();
functional::Array<int, functional::Shape::size()> dims;
@ -712,7 +723,8 @@ void Insert(Tensor out,
for(int index = 0; index < length; ++index) {
inShape.dims(index, dims);
dims[axisCPU] = (int)indices->data<IndexType>()[dims[axisCPU]];
int idxIndex = idxShape.bindex(dims); // broadcast index into indices tensor
dims[axisCPU] = (int)indices->data<IndexType>()[idxIndex];
int outIndex = outShape.index(dims);
out->data()[outIndex] += in->data()[index];
}
@ -879,7 +891,7 @@ void CrossEntropyPick(Tensor out, Tensor in, Tensor labelIndices) {
// Groundtruth label index
IndexType i = labelIndices->data<IndexType>()[j];
// This appears to be safe, i.e. i >= 0 && i < cols is guaranteed
out->data()[j] = std::log(sum) - sp[i] + max;
out->data()[j] = std::log(sum) - sp[i] + max; // -log(p_i) = -logsoftmax(x_i) = log(sum_j exp(x_j - max)) - (x_i - max)
}
}
@ -912,7 +924,8 @@ void CrossEntropyPickBackward(Tensor out,
// cross-entropy
for(int i = 0; i < cols; ++i) {
float sub = (float)(i == (int)labelIndices->data<IndexType>()[j]); // delta, true if label index and column index match
so[i] += adj->data()[j] * (std::exp(sp[i] - max) / sum - sub);
auto softmax = std::exp(sp[i] - max) / sum;
so[i] += adj->data()[j] * (softmax - sub);
}
}
}
@ -1037,7 +1050,7 @@ void LayerNormalization(Tensor out_,
sqSum += ex * ex;
}
float sigma = std::sqrt(eps + sqSum / cols);
float sigma = std::sqrt(sqSum / cols + eps);
#pragma omp simd
for(int i = 0; i < cols; ++i) {
@ -1099,7 +1112,7 @@ void LayerNormalizationGrad(Tensor gradX_,
sum_sqr += ex * ex;
}
float sigma = std::sqrt(eps + sum_sqr / cols);
float sigma = std::sqrt(sum_sqr / cols + eps);
#pragma omp simd
for(size_t i = 0; i < cols; ++i) {
float grad_x = 0.f;
@ -1141,7 +1154,7 @@ void LayerNormalizationGrad(Tensor gradX_,
sum_sqr += ex * ex;
}
float sigma = std::sqrt(eps + sum_sqr / cols);
float sigma = std::sqrt(sum_sqr / cols + eps);
#pragma omp simd
for(size_t i = 0; i < cols; ++i) {
float grad_x = 0.f;

View File

@ -1,4 +1,5 @@
#include "tensors/gpu/add.h"
#include "tensors/gpu/add_all.h"
#include "tensors/gpu/cuda_helpers.h"
@ -12,11 +13,13 @@ namespace marian {
namespace gpu {
template <size_t K, class Functor, class AggFunctor, typename T, typename AccType>
__global__ void gAggregateGeneric(Functor functor, AccType aggInit, AggFunctor aggFunctor,
const functional::Shape full,
functional::Tensor<T> out,
functional::Array<functional::Tensor<T>, K> ins,
AccType scale = 1.0) {
__global__ void gAggregateGeneric(Functor functor, // functor applied to single corresponding elements in tensors (via broadcasting),
AccType aggInit, // aggInit is starting value of accumulation (e.g. 0 for sum),
AggFunctor aggFunctor, // aggFunctor is used to accumulate values (e.g. sum),
const functional::Shape full, // maximal combined shape of all tensors via broadcasting
functional::Tensor<T> out, // output tensor
functional::Array<functional::Tensor<T>, K> ins, // input tensors
AccType scale = 1.0) { // scale accumulation result by scale. e.g. used for computing mean from sum over N elements with scale 1/N
int outLength = out.shape().elements();
bool same = outLength == full.elements();
for(int i = 0; i < K; ++i)
@ -32,10 +35,10 @@ __global__ void gAggregateGeneric(Functor functor, AccType aggInit, AggFunctor a
int index = bid + blockDim.x * blockIdx.x + threadIdx.x;
if(index < outLength) {
if(same) {
out[index] = aggFunctor(out[index], functional::apply(functor, ins, index) * (T)scale);
out[index] = (T)aggFunctor((AccType)out[index], functional::applyWithCast<AccType>(functor, ins, index) * scale); // apply functor with arguments cast to AccType
} else {
out.shape().dims(index, dims);
out[index] = aggFunctor(out[index], (T)(functional::loops(functor, aggInit, aggFunctor, ins, len, dims) * scale));
out[index] = (T)aggFunctor((AccType)out[index], functional::loops(functor, aggInit, aggFunctor, ins, len, dims) * scale); // apply functor with arguments cast to AccType
}
}
}
@ -62,7 +65,7 @@ __global__ void gAggregateEqual(Functor functor, AggFunctor aggFunctor,
indices[i] = ins[i].shape().bindex(dims);
}
out[index] = aggFunctor(out[index], functional::apply(functor, ins, indices) * (T)scale);
out[index] = (T)aggFunctor((AccType)out[index], functional::applyWithCast<AccType>(functor, ins, indices) * scale);
}
}
}
@ -76,7 +79,7 @@ __global__ void gAggregateReduce(Functor functor, AccType aggInit, AggFunctor ag
int rows = full.elements() / full.back();
int cols = full.back();
bool same = true;
bool same = true; // do all inputs have the same number of elements?
for(int i = 0; i < K; ++i)
same = same && ins[i].shape().elements() == full.elements();
@ -93,7 +96,7 @@ __global__ void gAggregateReduce(Functor functor, AccType aggInit, AggFunctor ag
for(int tid = 0; tid < cols; tid += blockDim.x) {
int id = tid + threadIdx.x;
if(id < cols)
_sum[threadIdx.x] = aggFunctor(_sum[threadIdx.x], (AccType)functional::apply(functor, ins, j * cols + id));
_sum[threadIdx.x] = aggFunctor(_sum[threadIdx.x], functional::applyWithCast<AccType>(functor, ins, j * cols + id)); // casts to AccType before applying functor which then performs operation in AccType
}
} else {
functional::Array<int, functional::Shape::size()> dims;
@ -106,7 +109,7 @@ __global__ void gAggregateReduce(Functor functor, AccType aggInit, AggFunctor ag
functional::Array<int, K> indices;
for(int i = 0; i < K; ++i)
indices[i] = ins[i].shape().bindex(dims);
_sum[threadIdx.x] = aggFunctor(_sum[threadIdx.x], (AccType)functional::apply(functor, ins, indices));
_sum[threadIdx.x] = aggFunctor(_sum[threadIdx.x], functional::applyWithCast<AccType>(functor, ins, indices));// casts to AccType before applying functor which then performs operation in AccType
}
}
}
@ -121,7 +124,8 @@ __global__ void gAggregateReduce(Functor functor, AccType aggInit, AggFunctor ag
len = (len + 1) >> 1;
}
__syncthreads();
out[j] = aggFunctor(out[j], (T)(_sum[0] * scale));
if(threadIdx.x == 0) // only thread 0 of each block writes the result
out[j] = aggFunctor(out[j], (T)(_sum[0] * scale));
}
__syncthreads();
}
@ -140,16 +144,16 @@ void AggregateTyped(Functor functor, AccType aggInit, AggFunctor aggFunctor, Acc
functional::Tensor<T> gOut = out;
functional::Array<functional::Tensor<T>, K> gIns = {tensors...};
if(full.back() != 1 && out->shape().back() == 1) {
size_t m = full.elements() / length;
size_t k = full.back();
if(out->shape().elements() == 1) { // reduce everything into a single element
AggregateAll<T, AccType>(nullptr, functor, aggInit, aggFunctor, scale, out, tensors...); // @TODO: pass allocator in here, currently uses cudaMalloc
} else if(full.back() != 1 && out->shape().back() == 1 && full.elements() / full.back() == length) { // the number of elements in out must match the number of non-reduced elements in the full shape
size_t m = full.elements() / full.back(); // how many rows are we iterating over?
size_t k = full.back(); // how many columns are being reduced to 1 in each row?
int blocks = std::min(MAX_BLOCKS, (int)m);
int blocks = std::min(MAX_BLOCKS, (int)m);
int threads = std::min(MAX_THREADS, (int)k);
int shared = sizeof(AccType) * threads;
int shared = sizeof(AccType) * threads;
gAggregateReduce<K, Functor, AggFunctor, T, AccType><<<blocks, threads, shared>>>(functor, aggInit, aggFunctor, full, gOut, gIns, scale);
} else if(out->shape() == full) {
int threads = std::min(MAX_THREADS, length);
int blocks = std::min(MAX_BLOCKS, length / threads + (length % threads != 0));

View File

@ -15,21 +15,21 @@ template void Add<BinaryFunctor<elem::Mult, Assignee<1>, BinaryFunctor<elem::Div
template void Add<BinaryFunctor<elem::Mult, Assignee<1>, BinaryFunctor<elem::sPReLUBack, Assignee<2>, Capture>>, marian::Tensor, marian::Tensor>(BinaryFunctor<elem::Mult, Assignee<1>, BinaryFunctor<elem::sPReLUBack, Assignee<2>, Capture>>, float, marian::Tensor, marian::Tensor, marian::Tensor);
template void Add<BinaryFunctor<elem::Mult, Assignee<1>, UnaryFunctor<elem::sReLUBack, Assignee<2>>>, marian::Tensor, marian::Tensor>(BinaryFunctor<elem::Mult, Assignee<1>, UnaryFunctor<elem::sReLUBack, Assignee<2>>>, float, marian::Tensor, marian::Tensor, marian::Tensor);
template void Add<BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Assignee<1>, Assignee<2>>, BinaryFunctor<elem::Minus, Capture, Assignee<2>>>, marian::Tensor, marian::Tensor>(BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Assignee<1>, Assignee<2>>, BinaryFunctor<elem::Minus, Capture, Assignee<2>>>, float, marian::Tensor, marian::Tensor, marian::Tensor);
template void Add<BinaryFunctor<elem::Mult, BinaryFunctor<elem::Bump, Assignee<1>, Capture>, Assignee<2>>, marian::Tensor, marian::Tensor >(BinaryFunctor<elem::Mult, BinaryFunctor<elem::Bump, Assignee<1>, Capture>, Assignee<2> >, float, marian::Tensor, marian::Tensor, marian::Tensor);
template void Add<BinaryFunctor<elem::Mult, BinaryFunctor<elem::Bump, Assignee<1>, Capture>, Assignee<2>>, marian::Tensor, marian::Tensor >(BinaryFunctor<elem::Mult, BinaryFunctor<elem::Bump, Assignee<1>, Capture>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor);
template void Add<BinaryFunctor<elem::Mult, BinaryFunctor<elem::Lt, Assignee<2>, Assignee<3>>, Assignee<1>>, marian::Tensor, marian::Tensor, marian::Tensor>(BinaryFunctor<elem::Mult, BinaryFunctor<elem::Lt, Assignee<2>, Assignee<3>>, Assignee<1>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
template void Add<BinaryFunctor<elem::Mult, BinaryFunctor<elem::Geq, Assignee<2>, Assignee<3>>, Assignee<1>>, marian::Tensor, marian::Tensor, marian::Tensor>(BinaryFunctor<elem::Mult, BinaryFunctor<elem::Geq, Assignee<2>, Assignee<3>>, Assignee<1>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
template void Add<BinaryFunctor<elem::Mult, Assignee<1>, UnaryFunctor<elem::Sigmoid, BinaryFunctor<elem::Minus, Assignee<3>, Assignee<2>>>>, marian::Tensor, marian::Tensor, marian::Tensor>(BinaryFunctor<elem::Mult, Assignee<1>, UnaryFunctor<elem::Sigmoid, BinaryFunctor<elem::Minus, Assignee<3>, Assignee<2>>>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
template void Add<BinaryFunctor<elem::Mult, BinaryFunctor<elem::Gt, Assignee<2>, Assignee<3>>, Assignee<1>>, marian::Tensor, marian::Tensor, marian::Tensor>(BinaryFunctor<elem::Mult, BinaryFunctor<elem::Gt, Assignee<2>, Assignee<3>>, Assignee<1>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
template void Add<BinaryFunctor<elem::Mult, BinaryFunctor<elem::Leq, Assignee<2>, Assignee<3>>, Assignee<1>>, marian::Tensor, marian::Tensor, marian::Tensor>(BinaryFunctor<elem::Mult, BinaryFunctor<elem::Leq, Assignee<2>, Assignee<3>>, Assignee<1>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
template void Add<BinaryFunctor<elem::Mult, Assignee<1>, UnaryFunctor<elem::Sigmoid, BinaryFunctor<elem::Minus, Assignee<2>, Assignee<3>>>>, marian::Tensor, marian::Tensor, marian::Tensor>(BinaryFunctor<elem::Mult, Assignee<1>, UnaryFunctor<elem::Sigmoid, BinaryFunctor<elem::Minus, Assignee<2>, Assignee<3>>>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
template void Add<BinaryFunctor<elem::Div, Assignee<1>, Assignee<2> >, marian::Tensor, marian::Tensor >(BinaryFunctor<elem::Div, Assignee<1>, Assignee<2> >, float, marian::Tensor, marian::Tensor, marian::Tensor);
template void marian::gpu::Add<marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::Assignee<1>, marian::functional::Assignee<1> >, marian::Tensor >(marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::Assignee<1>, marian::functional::Assignee<1> >, float, marian::Tensor, marian::Tensor);
template void marian::gpu::Aggregate<marian::functional::Assignee<1>, marian::functional::BinaryFunctor<marian::functional::elem::Min, marian::functional::Assignee<1>, marian::functional::Assignee<2> >, marian::Tensor >(marian::functional::Assignee<1>, float, marian::functional::BinaryFunctor<marian::functional::elem::Min, marian::functional::Assignee<1>, marian::functional::Assignee<2> >, float, marian::Tensor, marian::Tensor);
template void marian::gpu::Aggregate<marian::functional::Assignee<1>, marian::functional::BinaryFunctor<marian::functional::elem::Max, marian::functional::Assignee<1>, marian::functional::Assignee<2> >, marian::Tensor >(marian::functional::Assignee<1>, float, marian::functional::BinaryFunctor<marian::functional::elem::Max, marian::functional::Assignee<1>, marian::functional::Assignee<2> >, float, marian::Tensor, marian::Tensor);
template void marian::gpu::Aggregate<marian::functional::Assignee<1>, marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::Assignee<1>, marian::functional::Assignee<2> >, marian::Tensor >(marian::functional::Assignee<1>, float, marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::Assignee<1>, marian::functional::Assignee<2> >, float, marian::Tensor, marian::Tensor);
template void marian::gpu::Aggregate<marian::functional::Assignee<1>, marian::functional::BinaryFunctor<marian::functional::elem::LogAddExp, marian::functional::Assignee<1>, marian::functional::Assignee<2> >, marian::Tensor >(marian::functional::Assignee<1>, float, marian::functional::BinaryFunctor<marian::functional::elem::LogAddExp, marian::functional::Assignee<1>, marian::functional::Assignee<2> >, float, marian::Tensor, marian::Tensor);
template void marian::gpu::Add<marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::BinaryFunctor<marian::functional::elem::Eq, marian::functional::Assignee<1>, marian::functional::Assignee<2> >, marian::functional::Assignee<3> >, marian::Tensor, marian::Tensor, marian::Tensor >(marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::BinaryFunctor<marian::functional::elem::Eq, marian::functional::Assignee<1>, marian::functional::Assignee<2> >, marian::functional::Assignee<3> >, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
template void marian::gpu::Add<marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::Assignee<1>, marian::functional::UnaryFunctor<marian::functional::elem::Exp, marian::functional::BinaryFunctor<marian::functional::elem::Minus, marian::functional::Assignee<2>, marian::functional::Assignee<3> > > >, marian::Tensor, marian::Tensor, marian::Tensor >(marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::Assignee<1>, marian::functional::UnaryFunctor<marian::functional::elem::Exp, marian::functional::BinaryFunctor<marian::functional::elem::Minus, marian::functional::Assignee<2>, marian::functional::Assignee<3> > > >, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
template void marian::gpu::Add<marian::functional::BinaryFunctor<marian::functional::elem::Div, marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::Assignee<1>, marian::functional::Assignee<2> >, marian::functional::Assignee<3> >, marian::Tensor, marian::Tensor, marian::Tensor >(marian::functional::BinaryFunctor<marian::functional::elem::Div, marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::Assignee<1>, marian::functional::Assignee<2> >, marian::functional::Assignee<3> >, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
template void marian::gpu::Add<marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::Assignee<1>, marian::functional::Capture>, marian::functional::Assignee<2> >, marian::Tensor, marian::Tensor >(marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::Assignee<1>, marian::functional::Capture>, marian::functional::Assignee<2> >, float, marian::Tensor, marian::Tensor, marian::Tensor);
template void marian::gpu::Add<marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::Assignee<1>, marian::functional::BinaryFunctor<marian::functional::elem::Plus, marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::Capture, marian::functional::Assignee<3> >, marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::UnaryFunctor<marian::functional::elem::Sigmoid, marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::Capture, marian::functional::Assignee<2> > >, marian::functional::BinaryFunctor<marian::functional::elem::Minus, marian::functional::Capture, marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::Capture, marian::functional::Assignee<3> > > > > >, marian::Tensor, marian::Tensor, marian::Tensor >(marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::Assignee<1>, marian::functional::BinaryFunctor<marian::functional::elem::Plus, marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::Capture, marian::functional::Assignee<3> >, marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::UnaryFunctor<marian::functional::elem::Sigmoid, marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::Capture, marian::functional::Assignee<2> > >, marian::functional::BinaryFunctor<marian::functional::elem::Minus, marian::functional::Capture, marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::Capture, marian::functional::Assignee<3> > > > > >, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
template void Add<BinaryFunctor<elem::Div, Assignee<1>, Assignee<2>>, marian::Tensor, marian::Tensor >(BinaryFunctor<elem::Div, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor);
template void Add<BinaryFunctor<elem::Mult, Assignee<1>, Assignee<1>>, marian::Tensor >(BinaryFunctor<elem::Mult, Assignee<1>, Assignee<1>>, float, marian::Tensor, marian::Tensor);
template void Aggregate<Assignee<1>, BinaryFunctor<elem::Min, Assignee<1>, Assignee<2>>, marian::Tensor >(Assignee<1>, float, BinaryFunctor<elem::Min, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor);
template void Aggregate<Assignee<1>, BinaryFunctor<elem::Max, Assignee<1>, Assignee<2>>, marian::Tensor >(Assignee<1>, float, BinaryFunctor<elem::Max, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor);
template void Aggregate<Assignee<1>, BinaryFunctor<elem::Mult, Assignee<1>, Assignee<2>>, marian::Tensor >(Assignee<1>, float, BinaryFunctor<elem::Mult, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor);
template void Aggregate<Assignee<1>, BinaryFunctor<elem::LogAddExp, Assignee<1>, Assignee<2>>, marian::Tensor >(Assignee<1>, float, BinaryFunctor<elem::LogAddExp, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor);
template void Add<BinaryFunctor<elem::Mult, BinaryFunctor<elem::Eq, Assignee<1>, Assignee<2>>, Assignee<3>>, marian::Tensor, marian::Tensor, marian::Tensor >(BinaryFunctor<elem::Mult, BinaryFunctor<elem::Eq, Assignee<1>, Assignee<2>>, Assignee<3>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
template void Add<BinaryFunctor<elem::Mult, Assignee<1>, UnaryFunctor<elem::Exp, BinaryFunctor<elem::Minus, Assignee<2>, Assignee<3>>>>, marian::Tensor, marian::Tensor, marian::Tensor >(BinaryFunctor<elem::Mult, Assignee<1>, UnaryFunctor<elem::Exp, BinaryFunctor<elem::Minus, Assignee<2>, Assignee<3>>>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
template void Add<BinaryFunctor<elem::Div, BinaryFunctor<elem::Mult, Assignee<1>, Assignee<2>>, Assignee<3>>, marian::Tensor, marian::Tensor, marian::Tensor >(BinaryFunctor<elem::Div, BinaryFunctor<elem::Mult, Assignee<1>, Assignee<2>>, Assignee<3>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
template void Add<BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Assignee<1>, Capture>, Assignee<2>>, marian::Tensor, marian::Tensor >(BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Assignee<1>, Capture>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor);
template void Add<BinaryFunctor<elem::Mult, Assignee<1>, BinaryFunctor<elem::Plus, BinaryFunctor<elem::Mult, Capture, Assignee<3>>, BinaryFunctor<elem::Mult, UnaryFunctor<elem::Sigmoid, BinaryFunctor<elem::Mult, Capture, Assignee<2>>>, BinaryFunctor<elem::Minus, Capture, BinaryFunctor<elem::Mult, Capture, Assignee<3>>>>>>, marian::Tensor, marian::Tensor, marian::Tensor >(BinaryFunctor<elem::Mult, Assignee<1>, BinaryFunctor<elem::Plus, BinaryFunctor<elem::Mult, Capture, Assignee<3>>, BinaryFunctor<elem::Mult, UnaryFunctor<elem::Sigmoid, BinaryFunctor<elem::Mult, Capture, Assignee<2>>>, BinaryFunctor<elem::Minus, Capture, BinaryFunctor<elem::Mult, Capture, Assignee<3>>>>>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);

116
src/tensors/gpu/add_all.cu Normal file
View File

@ -0,0 +1,116 @@
#include "tensors/gpu/add_all.h"
#include "tensors/gpu/cuda_helpers.h"
#include "functional/functional.h"
#include "tensors/tensor_operators.h"
#include "3rd_party/reduce_all.h" // only works with CUDA >9.0, we are dropping CUDA 8.0 support, also changed in CMakeLists.txt
namespace marian {
#if COMPILE_FP16
// local overload to determine tensor type
template <> inline Type typeId<half>() { return Type::float16; }
#endif
// Version with variadic template arguments, called by version with explicit arguments below
template <typename T, typename AccType, class Functor, class AggFunctor, class... Tensors>
void AggregateAllVar(Ptr<Allocator> allocator,
Functor functor,
AccType aggInit,
AggFunctor aggFunctor,
AccType scale,
Tensor out,
const Tensors... tensors) {
cudaSetDevice(out->getDeviceId().no);
static_assert(CUDA_VERSION >= 9000, "Marian requires CUDA_VERSION >= 9000 (9.0)");
constexpr size_t K = sizeof...(Tensors); // obtain arity K of tensors...
functional::Array<functional::Tensor<T>, K> gIns = {tensors...}; // convert to array of K objects of type functional::Tensor<T>
functional::Shape full = marian::Shape::broadcast({tensors...}); // compute maximal broadcasted shape
int size = full.elements();
int threads = (size < MAX_THREADS * 2) ? nextPow2((size + 1) / 2) : MAX_THREADS; // suggested in NVidia example for the all_reduce kernel
int blocks = std::min(MAX_BLOCKS, (size + (threads * 2 - 1)) / (threads * 2)); // suggested in NVidia example for the all_reduce kernel
// The all_reduce kernel by NVidia needs to perform multiple passes if the number of blocks needed to perform the reduction is larger than 1.
// Here we allocate the memory for the intermediate reductions for each block.
Tensor blockMem;
if(blocks > 1 || out->type() != typeId<AccType>()) { // if the out tensor does not have elementType AccType we need to allocate and convert later
MemoryPiece::PtrType temporaryMemory;
if(allocator) {
temporaryMemory = allocator->alloc<AccType>(blocks);
} else { // @TODO: get rid of this branch
uint8_t* temporaryMemoryPtr = 0;
CUDA_CHECK(cudaMalloc(&temporaryMemoryPtr, sizeof(AccType) * blocks));
temporaryMemory = MemoryPiece::New(temporaryMemoryPtr, sizeof(AccType) * blocks); // @TODO: consider implementing MemoryPiece::cudaMalloc<T>(size) for managed memory
}
blockMem = TensorBase::New(temporaryMemory,
Shape({blocks}),
typeId<AccType>(),
out->getBackend());
blockMem->set(aggInit); // set temporary memory to aggInit
}
else { // we are reducing into a single element now and the type matches, just use out as memory
blockMem = out; // do not set final output memory as we might be summing gradients... needs to be handled outside this function
}
functional::Tensor<AccType> gBlockMem = blockMem;
reduceSinglePass<T, AccType>(functor, aggInit, aggFunctor, scale, full, /*out=*/gBlockMem, /*in=*/gIns, threads, blocks); // First pass reduction into intermediate memory
// If we actually needed more than one block to perform the first pass reduction, recursively run a second pass reduction over block memory until block memory has size 1.
if(blocks > 1) {
using namespace functional;
auto identity = _1; // transformation was done in first pass, hence only identity
AggregateAll<AccType, AccType>(allocator, identity, aggInit, aggFunctor, scale, out, /*in=*/blockMem); // Reducing AccType in AccType now (meta-reduction)
} else if(out->type() != typeId<AccType>()) { // it's only a single block, but we need to convert to different type, as mentioned above
CopyCast(out, blockMem);
}
if(blockMem != out) {
// Free temporary memory whether allocated in allocator or via cudaMalloc
if(allocator)
allocator->free(blockMem->memory());
else if(blockMem->memory()->data())
CUDA_CHECK(cudaFree(blockMem->memory()->data()));
}
}
template <typename T, typename AccType, class Functor, class AggFunctor>
void AggregateAll(Ptr<Allocator> allocator,
Functor functor,
AccType aggInit,
AggFunctor aggFunctor,
AccType scale,
Tensor out,
const Tensor in1) {
AggregateAllVar<T, AccType>(allocator, functor, aggInit, aggFunctor, scale, out, in1);
}
template <typename T, typename AccType, class Functor, class AggFunctor>
void AggregateAll(Ptr<Allocator> allocator,
Functor functor,
AccType aggInit,
AggFunctor aggFunctor,
AccType scale,
Tensor out,
const Tensor in1,
const Tensor in2) {
AggregateAllVar<T, AccType>(allocator, functor, aggInit, aggFunctor, scale, out, in1, in2);
}
template <typename T, typename AccType, class Functor, class AggFunctor>
void AggregateAll(Ptr<Allocator> allocator,
Functor functor,
AccType aggInit,
AggFunctor aggFunctor,
AccType scale,
Tensor out,
const Tensor in1,
const Tensor in2,
const Tensor in3) {
AggregateAllVar<T, AccType>(allocator, functor, aggInit, aggFunctor, scale, out, in1, in2, in3);
}
#include "tensors/gpu/add_all.inc"
}
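
The block-then-recurse scheme above can be illustrated with a tiny host-side analogue; a sketch under the assumption of summation as the aggregator (CPU-only, names hypothetical):

#include <vector>

// Pass 1 reduces each block of the input into one partial result (blockMem);
// subsequent passes reduce the partial results until a single value remains,
// mirroring the recursive AggregateAll call above.
float reduceTwoPass(std::vector<float> v, size_t blockSize) {
  while(v.size() > 1) {
    std::vector<float> blockMem((v.size() + blockSize - 1) / blockSize, 0.f);
    for(size_t i = 0; i < v.size(); ++i)
      blockMem[i / blockSize] += v[i];  // per-block partial sums
    v.swap(blockMem);                   // recurse on the partial results
  }
  return v.empty() ? 0.f : v[0];
}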

87
src/tensors/gpu/add_all.h Normal file
View File

@ -0,0 +1,87 @@
#pragma once
// This header file provides wrappers around NVidia's reduce_all kernel with our custom aggregation functionality
// This kernel reduces a tensor into a single value. We have modified it to allow for different types of aggregations
// like summing or max etc.
#include "tensors/gpu/cuda_helpers.h"
#include "tensors/tensor.h"
#include "tensors/allocator.h"
#include "functional/tensor.h"
#include "tensors/tensor_operators.h"
namespace marian {
// These function declarations are repeated because template specialization with variadic template arguments does not seem to work.
// Here we just create versions for 1, 2, and 3 arguments, to be extended if required.
template <typename T, typename AccType, class Functor, class AggFunctor>
void AggregateAll(Ptr<Allocator> allocator,
Functor functor,
AccType aggInit,
AggFunctor aggFunctor,
AccType scale,
Tensor out,
const Tensor in1);
template <typename T, typename AccType, class Functor, class AggFunctor>
void AggregateAll(Ptr<Allocator> allocator,
Functor functor,
AccType aggInit,
AggFunctor aggFunctor,
AccType scale,
Tensor out,
const Tensor in1,
const Tensor in2);
template <typename T, typename AccType, class Functor, class AggFunctor>
void AggregateAll(Ptr<Allocator> allocator,
Functor functor,
AccType aggInit,
AggFunctor aggFunctor,
AccType scale,
Tensor out,
const Tensor in1,
const Tensor in2,
const Tensor in3);
// Aggregates all values into a single tensor and returns the value of that tensor as a float
// This does a GPU to CPU memory copy via TensorBase::scalar().
// Used currently only for L2Norm computation
template <typename T, typename AccType, class Functor, class AggFunctor, class... Tensors>
AccType AggregateAllAndReturn(Ptr<Allocator> allocator,
Functor functor,
AccType aggInit,
AggFunctor aggFunctor,
AccType scale,
const Tensors... tensors) {
MemoryPiece::PtrType temporaryMemory;
if(allocator) {
temporaryMemory = allocator->alloc<AccType>(1);
} else { // @TODO: get rid of this branch
uint8_t* temporaryMemoryPtr = 0;
CUDA_CHECK(cudaMalloc(&temporaryMemoryPtr, sizeof(AccType)));
temporaryMemory = MemoryPiece::New(temporaryMemoryPtr, sizeof(AccType));
}
std::tuple<Tensors...> in(tensors...);
// Create a temporary tensor of size 1 to reduce into
auto out = TensorBase::New(temporaryMemory,
Shape({1}),
typeId<AccType>(),
std::get<0>(in)->getBackend());
out->set(aggInit); // init to aggInit
AggregateAll<T, AccType>(allocator, functor, aggInit, aggFunctor, scale, out, tensors...);
AccType outScalar = out->template scalar<AccType>(); // convert to float also if other underlying type
if(allocator)
allocator->free(out->memory());
else if(out->memory()->data()) // @TODO: get rid of this branch
CUDA_CHECK(cudaFree(out->memory()->data()));
return outScalar;
}
}
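
As a plain-CPU analogue of the L2Norm use mentioned above (functor = x*x, aggInit = 0, aggFunctor = +, followed by a square root on the returned scalar), a minimal sketch:

#include <cmath>
#include <numeric>
#include <vector>

// CPU analogue of AggregateAllAndReturn for the L2 norm: reduce the squares
// with +, starting from 0, then take the square root of the scalar result.
float l2Norm(const std::vector<float>& x) {
  float sumSq = std::accumulate(x.begin(), x.end(), 0.f,
                                [](float acc, float v) { return acc + v * v; });
  return std::sqrt(sumSq);
}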

View File

@ -0,0 +1,71 @@
// see element.inc for instructions on how to maintain this
using namespace functional;
template void AggregateAll<float, float, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Capture, Assignee<1>>, Assignee<2>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Capture, Assignee<1>>, Assignee<2>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<float, float, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Capture, BinaryFunctor<elem::Div, Capture, Assignee<1>>>, Assignee<2>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Capture, BinaryFunctor<elem::Div, Capture, Assignee<1>>>, Assignee<2>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<float, float, BinaryFunctor<elem::Mult, Assignee<1>, BinaryFunctor<elem::Minus, Capture, BinaryFunctor<elem::Mult, Assignee<2>, Assignee<2>>>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, Assignee<1>, BinaryFunctor<elem::Minus, Capture, BinaryFunctor<elem::Mult, Assignee<2>, Assignee<2>>>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<float, float, BinaryFunctor<elem::Mult, Assignee<1>, Assignee<2>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, Assignee<1>, Assignee<2>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<float, float, BinaryFunctor<elem::Mult, Capture, Assignee<1>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, Capture, Assignee<1>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor);
template void AggregateAll<float, float, BinaryFunctor<elem::Div, BinaryFunctor<elem::Mult, Assignee<1>, Capture>, Assignee<2>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Div, BinaryFunctor<elem::Mult, Assignee<1>, Capture>, Assignee<2>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<float, float, BinaryFunctor<elem::Div, BinaryFunctor<elem::Mult, UnaryFunctor<elem::Neg, Assignee<1>>, Assignee<2>>, BinaryFunctor<elem::Mult, Assignee<3>, Assignee<3>>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Div, BinaryFunctor<elem::Mult, UnaryFunctor<elem::Neg, Assignee<1>>, Assignee<2>>, BinaryFunctor<elem::Mult, Assignee<3>, Assignee<3>>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<float, float, UnaryFunctor<elem::Neg, Assignee<1>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, UnaryFunctor<elem::Neg, Assignee<1>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor);
template void AggregateAll<float, float, BinaryFunctor<elem::Mult, Assignee<1>, BinaryFunctor<elem::Plus, Assignee<3>, BinaryFunctor<elem::Mult, UnaryFunctor<elem::Sigmoid, Assignee<2>>, BinaryFunctor<elem::Minus, Capture, Assignee<3>>>>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, Assignee<1>, BinaryFunctor<elem::Plus, Assignee<3>, BinaryFunctor<elem::Mult, UnaryFunctor<elem::Sigmoid, Assignee<2>>, BinaryFunctor<elem::Minus, Capture, Assignee<3>>>>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<float, float, BinaryFunctor<elem::Mult, Assignee<1>, UnaryFunctor<elem::Exp, Assignee<2>>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, Assignee<1>, UnaryFunctor<elem::Exp, Assignee<2>>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<float, float, BinaryFunctor<elem::Mult, Assignee<1>, BinaryFunctor<elem::Div, Capture, Assignee<2>>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, Assignee<1>, BinaryFunctor<elem::Div, Capture, Assignee<2>>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<float, float, BinaryFunctor<elem::Mult, Assignee<1>, BinaryFunctor<elem::sPReLUBack, Assignee<2>, Capture>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, Assignee<1>, BinaryFunctor<elem::sPReLUBack, Assignee<2>, Capture>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<float, float, BinaryFunctor<elem::Mult, Assignee<1>, UnaryFunctor<elem::sReLUBack, Assignee<2>>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, Assignee<1>, UnaryFunctor<elem::sReLUBack, Assignee<2>>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<float, float, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Assignee<1>, Assignee<2>>, BinaryFunctor<elem::Minus, Capture, Assignee<2>>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Assignee<1>, Assignee<2>>, BinaryFunctor<elem::Minus, Capture, Assignee<2>>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<float, float, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Bump, Assignee<1>, Capture>, Assignee<2>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Bump, Assignee<1>, Capture>, Assignee<2>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<float, float, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Lt, Assignee<2>, Assignee<3>>, Assignee<1>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Lt, Assignee<2>, Assignee<3>>, Assignee<1>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<float, float, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Geq, Assignee<2>, Assignee<3>>, Assignee<1>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Geq, Assignee<2>, Assignee<3>>, Assignee<1>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<float, float, BinaryFunctor<elem::Mult, Assignee<1>, UnaryFunctor<elem::Sigmoid, BinaryFunctor<elem::Minus, Assignee<3>, Assignee<2>>>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, Assignee<1>, UnaryFunctor<elem::Sigmoid, BinaryFunctor<elem::Minus, Assignee<3>, Assignee<2>>>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<float, float, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Gt, Assignee<2>, Assignee<3>>, Assignee<1>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Gt, Assignee<2>, Assignee<3>>, Assignee<1>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<float, float, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Leq, Assignee<2>, Assignee<3>>, Assignee<1>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Leq, Assignee<2>, Assignee<3>>, Assignee<1>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<float, float, BinaryFunctor<elem::Mult, Assignee<1>, UnaryFunctor<elem::Sigmoid, BinaryFunctor<elem::Minus, Assignee<2>, Assignee<3>>>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, Assignee<1>, UnaryFunctor<elem::Sigmoid, BinaryFunctor<elem::Minus, Assignee<2>, Assignee<3>>>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<float, float, BinaryFunctor<elem::Div, Assignee<1>, Assignee<2>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Div, Assignee<1>, Assignee<2>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<float, float, Assignee<1>, BinaryFunctor<elem::Min, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, Assignee<1>, float, BinaryFunctor<elem::Min, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor);
template void AggregateAll<float, float, Assignee<1>, BinaryFunctor<elem::Max, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, Assignee<1>, float, BinaryFunctor<elem::Max, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor);
template void AggregateAll<float, float, Assignee<1>, BinaryFunctor<elem::Mult, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, Assignee<1>, float, BinaryFunctor<elem::Mult, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor);
template void AggregateAll<float, float, Assignee<1>, BinaryFunctor<elem::LogAddExp, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, Assignee<1>, float, BinaryFunctor<elem::LogAddExp, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor);
template void AggregateAll<float, float, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Eq, Assignee<1>, Assignee<2>>, Assignee<3>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Eq, Assignee<1>, Assignee<2>>, Assignee<3>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<float, float, BinaryFunctor<elem::Mult, Assignee<1>, UnaryFunctor<elem::Exp, BinaryFunctor<elem::Minus, Assignee<2>, Assignee<3>>>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, Assignee<1>, UnaryFunctor<elem::Exp, BinaryFunctor<elem::Minus, Assignee<2>, Assignee<3>>>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<float, float, BinaryFunctor<elem::Div, BinaryFunctor<elem::Mult, Assignee<1>, Assignee<2>>, Assignee<3>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Div, BinaryFunctor<elem::Mult, Assignee<1>, Assignee<2>>, Assignee<3>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<float, float, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Assignee<1>, Capture>, Assignee<2>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Assignee<1>, Capture>, Assignee<2>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<float, float, BinaryFunctor<elem::Mult, Assignee<1>, BinaryFunctor<elem::Plus, BinaryFunctor<elem::Mult, Capture, Assignee<3>>, BinaryFunctor<elem::Mult, UnaryFunctor<elem::Sigmoid, BinaryFunctor<elem::Mult, Capture, Assignee<2>>>, BinaryFunctor<elem::Minus, Capture, BinaryFunctor<elem::Mult, Capture, Assignee<3>>>>>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, Assignee<1>, BinaryFunctor<elem::Plus, BinaryFunctor<elem::Mult, Capture, Assignee<3>>, BinaryFunctor<elem::Mult, UnaryFunctor<elem::Sigmoid, BinaryFunctor<elem::Mult, Capture, Assignee<2>>>, BinaryFunctor<elem::Minus, Capture, BinaryFunctor<elem::Mult, Capture, Assignee<3>>>>>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<float, float, BinaryFunctor<elem::Mult, Assignee<1>, Assignee<1>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, Assignee<1>, Assignee<1>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor);
#if COMPILE_FP16
template void AggregateAll<__half, float, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Capture, Assignee<1>>, Assignee<2>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Capture, Assignee<1>>, Assignee<2>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<__half, float, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Capture, BinaryFunctor<elem::Div, Capture, Assignee<1>>>, Assignee<2>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Capture, BinaryFunctor<elem::Div, Capture, Assignee<1>>>, Assignee<2>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<__half, float, BinaryFunctor<elem::Mult, Assignee<1>, BinaryFunctor<elem::Minus, Capture, BinaryFunctor<elem::Mult, Assignee<2>, Assignee<2>>>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, Assignee<1>, BinaryFunctor<elem::Minus, Capture, BinaryFunctor<elem::Mult, Assignee<2>, Assignee<2>>>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<__half, float, BinaryFunctor<elem::Mult, Assignee<1>, Assignee<2>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, Assignee<1>, Assignee<2>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<__half, float, BinaryFunctor<elem::Mult, Capture, Assignee<1>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, Capture, Assignee<1>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor);
template void AggregateAll<__half, float, BinaryFunctor<elem::Div, BinaryFunctor<elem::Mult, Assignee<1>, Capture>, Assignee<2>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Div, BinaryFunctor<elem::Mult, Assignee<1>, Capture>, Assignee<2>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<__half, float, BinaryFunctor<elem::Div, BinaryFunctor<elem::Mult, UnaryFunctor<elem::Neg, Assignee<1>>, Assignee<2>>, BinaryFunctor<elem::Mult, Assignee<3>, Assignee<3>>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Div, BinaryFunctor<elem::Mult, UnaryFunctor<elem::Neg, Assignee<1>>, Assignee<2>>, BinaryFunctor<elem::Mult, Assignee<3>, Assignee<3>>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<__half, float, UnaryFunctor<elem::Neg, Assignee<1>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, UnaryFunctor<elem::Neg, Assignee<1>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor);
template void AggregateAll<__half, float, BinaryFunctor<elem::Mult, Assignee<1>, BinaryFunctor<elem::Plus, Assignee<3>, BinaryFunctor<elem::Mult, UnaryFunctor<elem::Sigmoid, Assignee<2>>, BinaryFunctor<elem::Minus, Capture, Assignee<3>>>>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, Assignee<1>, BinaryFunctor<elem::Plus, Assignee<3>, BinaryFunctor<elem::Mult, UnaryFunctor<elem::Sigmoid, Assignee<2>>, BinaryFunctor<elem::Minus, Capture, Assignee<3>>>>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<__half, float, BinaryFunctor<elem::Mult, Assignee<1>, UnaryFunctor<elem::Exp, Assignee<2>>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, Assignee<1>, UnaryFunctor<elem::Exp, Assignee<2>>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<__half, float, BinaryFunctor<elem::Mult, Assignee<1>, BinaryFunctor<elem::Div, Capture, Assignee<2>>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, Assignee<1>, BinaryFunctor<elem::Div, Capture, Assignee<2>>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<__half, float, BinaryFunctor<elem::Mult, Assignee<1>, BinaryFunctor<elem::sPReLUBack, Assignee<2>, Capture>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, Assignee<1>, BinaryFunctor<elem::sPReLUBack, Assignee<2>, Capture>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<__half, float, BinaryFunctor<elem::Mult, Assignee<1>, UnaryFunctor<elem::sReLUBack, Assignee<2>>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, Assignee<1>, UnaryFunctor<elem::sReLUBack, Assignee<2>>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<__half, float, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Assignee<1>, Assignee<2>>, BinaryFunctor<elem::Minus, Capture, Assignee<2>>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Assignee<1>, Assignee<2>>, BinaryFunctor<elem::Minus, Capture, Assignee<2>>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<__half, float, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Bump, Assignee<1>, Capture>, Assignee<2>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Bump, Assignee<1>, Capture>, Assignee<2>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<__half, float, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Lt, Assignee<2>, Assignee<3>>, Assignee<1>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Lt, Assignee<2>, Assignee<3>>, Assignee<1>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<__half, float, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Geq, Assignee<2>, Assignee<3>>, Assignee<1>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Geq, Assignee<2>, Assignee<3>>, Assignee<1>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<__half, float, BinaryFunctor<elem::Mult, Assignee<1>, UnaryFunctor<elem::Sigmoid, BinaryFunctor<elem::Minus, Assignee<3>, Assignee<2>>>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, Assignee<1>, UnaryFunctor<elem::Sigmoid, BinaryFunctor<elem::Minus, Assignee<3>, Assignee<2>>>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<__half, float, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Gt, Assignee<2>, Assignee<3>>, Assignee<1>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Gt, Assignee<2>, Assignee<3>>, Assignee<1>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<__half, float, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Leq, Assignee<2>, Assignee<3>>, Assignee<1>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Leq, Assignee<2>, Assignee<3>>, Assignee<1>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<__half, float, BinaryFunctor<elem::Mult, Assignee<1>, UnaryFunctor<elem::Sigmoid, BinaryFunctor<elem::Minus, Assignee<2>, Assignee<3>>>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, Assignee<1>, UnaryFunctor<elem::Sigmoid, BinaryFunctor<elem::Minus, Assignee<2>, Assignee<3>>>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<__half, float, BinaryFunctor<elem::Div, Assignee<1>, Assignee<2>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Div, Assignee<1>, Assignee<2>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<__half, float, Assignee<1>, BinaryFunctor<elem::Min, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, Assignee<1>, float, BinaryFunctor<elem::Min, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor);
template void AggregateAll<__half, float, Assignee<1>, BinaryFunctor<elem::Max, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, Assignee<1>, float, BinaryFunctor<elem::Max, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor);
template void AggregateAll<__half, float, Assignee<1>, BinaryFunctor<elem::Mult, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, Assignee<1>, float, BinaryFunctor<elem::Mult, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor);
template void AggregateAll<__half, float, Assignee<1>, BinaryFunctor<elem::LogAddExp, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, Assignee<1>, float, BinaryFunctor<elem::LogAddExp, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor);
template void AggregateAll<__half, float, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Eq, Assignee<1>, Assignee<2>>, Assignee<3>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Eq, Assignee<1>, Assignee<2>>, Assignee<3>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<__half, float, BinaryFunctor<elem::Mult, Assignee<1>, UnaryFunctor<elem::Exp, BinaryFunctor<elem::Minus, Assignee<2>, Assignee<3>>>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, Assignee<1>, UnaryFunctor<elem::Exp, BinaryFunctor<elem::Minus, Assignee<2>, Assignee<3>>>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<__half, float, BinaryFunctor<elem::Div, BinaryFunctor<elem::Mult, Assignee<1>, Assignee<2>>, Assignee<3>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Div, BinaryFunctor<elem::Mult, Assignee<1>, Assignee<2>>, Assignee<3>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<__half, float, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Assignee<1>, Capture>, Assignee<2>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Assignee<1>, Capture>, Assignee<2>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<__half, float, BinaryFunctor<elem::Mult, Assignee<1>, BinaryFunctor<elem::Plus, BinaryFunctor<elem::Mult, Capture, Assignee<3>>, BinaryFunctor<elem::Mult, UnaryFunctor<elem::Sigmoid, BinaryFunctor<elem::Mult, Capture, Assignee<2>>>, BinaryFunctor<elem::Minus, Capture, BinaryFunctor<elem::Mult, Capture, Assignee<3>>>>>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, Assignee<1>, BinaryFunctor<elem::Plus, BinaryFunctor<elem::Mult, Capture, Assignee<3>>, BinaryFunctor<elem::Mult, UnaryFunctor<elem::Sigmoid, BinaryFunctor<elem::Mult, Capture, Assignee<2>>>, BinaryFunctor<elem::Minus, Capture, BinaryFunctor<elem::Mult, Capture, Assignee<3>>>>>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<__half, float, Assignee<1>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, Assignee<1>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor);
template void AggregateAll<__half, float, BinaryFunctor<elem::Mult, Assignee<1>, Assignee<1>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, Assignee<1>, Assignee<1>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor);
#endif
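Note: the explicit instantiations above pre-compile AggregateAll for every functor tree the element-wise framework emits, e.g. BinaryFunctor<elem::Mult, Assignee<1>, Assignee<2>> for _1 * _2 aggregated with _1 + _2. A minimal stand-alone sketch of the pattern; the types below are simplified stand-ins, not the marian definitions:

#include <cstdio>
#include <vector>

template <int N> struct Assignee {        // stand-in for functional::Assignee (_1, _2, ...)
  template <class... Args>
  float operator()(Args... args) const {
    const float a[] = {static_cast<float>(args)...};
    return a[N - 1];                      // select the N-th argument
  }
};

template <class Op, class L, class R> struct BinaryFunctor {  // stand-in functor tree node
  L l; R r;
  template <class... Args>
  float operator()(Args... args) const { return Op::apply(l(args...), r(args...)); }
};

struct Mult { static float apply(float x, float y) { return x * y; } };
struct Plus { static float apply(float x, float y) { return x + y; } };

// AggregateAll analogue: apply 'functor' element-wise, fold with 'aggFunctor'.
template <class F, class A>
float aggregateAll(F functor, float aggInit, A aggFunctor,
                   const std::vector<float>& in1, const std::vector<float>& in2) {
  float acc = aggInit;
  for (size_t i = 0; i < in1.size(); ++i)
    acc = aggFunctor(acc, functor(in1[i], in2[i]));
  return acc;
}

int main() {
  BinaryFunctor<Mult, Assignee<1>, Assignee<2>> mul; // _1 * _2
  BinaryFunctor<Plus, Assignee<1>, Assignee<2>> add; // _1 + _2, the aggregation functor
  std::vector<float> a = {1, 2, 3}, b = {4, 5, 6};
  printf("dot(a, b) = %g\n", aggregateAll(mul, 0.f, add, a, b)); // 32
  return 0;
}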

168
src/tensors/gpu/tensor_operators.cu Normal file → Executable file
View File

@ -1,5 +1,3 @@
//#include <thrust/transform_reduce.h>
#include "common/types.h"
#include "tensors/tensor_operators.h"
@ -9,7 +7,7 @@
#include "tensors/gpu/backend.h"
#include "tensors/gpu/cuda_helpers.h"
#include "3rd_party/reduce_all.h"
#include "tensors/gpu/add_all.h"
namespace marian {
@ -588,6 +586,8 @@ __global__ void gSoftmax(T* out,
// determine max (used below to improve numeric stability)
T* _max = _share;
// @TODO: what's going on here with fp16?
_max[threadIdx.x] = -CUDA_FLT_MAX; // mask
// find max over column indices that have the same relative column index (=threadIdx.x) across all blocks of columns
for(int tid = 0; tid < cols; tid += blockDim.x) {
@ -980,7 +980,7 @@ __global__ void gPasteRows(T* out,
const IndexType* targetRowIdx,
size_t rows) {
for(int bid = 0; bid < rows; bid += gridDim.x) {
int j = bid + blockIdx.x;
int j = bid + blockIdx.x; // index into 'indices' vector
if(j < rows) {
size_t dstId = targetRowIdx[j];
size_t srcId = j;
@ -988,11 +988,15 @@ __global__ void gPasteRows(T* out,
T* rowOut = out + dstId * cols;
const T* rowIn = in + srcId * cols;
// aggregate the entire row
for(int tid = 0; tid < cols; tid += blockDim.x) {
int i = tid + threadIdx.x;
int i = tid + threadIdx.x; // column index --@TODO: column index should be called 'j'
if(i < cols) {
// @TODO: Do we need to get rid of this atomic add? It seems slow for fp16
atomics::atomicAdd(rowOut + i, rowIn[i]);
// Note: atomicAdd() not needed if number of blocks is 1. Avoid it because it is slow for fp16.
if (gridDim.x == 1)
rowOut[i] += rowIn[i];
else
atomics::atomicAdd(rowOut + i, rowIn[i]);
}
}
}
@ -1011,7 +1015,15 @@ void PasteRows(Tensor out,
size_t rowsToCopy = indices->size();
int threads = std::min(MAX_THREADS, (int)cols);
#if 1 // @TODO: make this configurable with a 'deterministic' flag
// If we only use one block, then each core operates on a different column,
// hence the summation becomes deterministic.
// However, we then only use e.g. 512 cores out of possibly 3000+, so this will be
// about 6x slower in this example.
int blocks = 1;
#else
int blocks = std::min(MAX_BLOCKS, (int)rowsToCopy);
#endif
if(out->type() == Type::float32) {
gPasteRows<<<blocks, threads>>>(
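Note: the blocks = 1 branch above trades speed for determinism because floating-point addition is not associative; with many blocks racing through atomicAdd, the accumulation order, and hence the rounded result, can vary between runs. A minimal CPU illustration of the underlying effect:

#include <cstdio>
#include <vector>

int main() {
  std::vector<float> v(10000, 0.1f);
  v.push_back(1e8f);
  float frontToBack = 0.f, backToFront = 0.f;
  for (size_t i = 0; i < v.size(); ++i) frontToBack += v[i];
  for (size_t i = v.size(); i-- > 0; ) backToFront += v[i];
  // The large value absorbs the small ones when added first; with many CUDA
  // blocks racing through atomicAdd, the order (and result) varies per run.
  printf("%.1f vs %.1f\n", frontToBack, backToFront);
  return 0;
}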
@ -1132,7 +1144,8 @@ __global__ void gSelect(T* out,
const T* in,
const functional::Shape inShape,
int axis,
IndexType* d_indices) {
const IndexType* d_indices,
const functional::Shape idxShape) {
int length = outShape.elements();
functional::Array<int, functional::Shape::size()> dims;
@ -1140,7 +1153,8 @@ __global__ void gSelect(T* out,
int index = bid + blockDim.x * blockIdx.x + threadIdx.x;
if(index < length) {
outShape.dims(index, dims);
dims[axis] = d_indices[dims[axis]];
int idxIndex = idxShape.bindex(dims); // broadcast index into indices tensor
dims[axis] = (int)d_indices[idxIndex];
int inIndex = inShape.index(dims);
out[index] = in[inIndex];
}
@ -1153,7 +1167,8 @@ __global__ void gInsert(T* out,
const T* in,
const functional::Shape inShape,
int axis,
IndexType* d_indices) {
const IndexType* d_indices,
const functional::Shape idxShape) {
int length = inShape.elements();
functional::Array<int, functional::Shape::size()> dims;
@ -1161,7 +1176,8 @@ __global__ void gInsert(T* out,
int index = bid + blockDim.x * blockIdx.x + threadIdx.x;
if(index < length) {
inShape.dims(index, dims);
dims[axis] = d_indices[dims[axis]];
int idxIndex = idxShape.bindex(dims); // broadcast index into indices tensor
dims[axis] = (int)d_indices[idxIndex];
int outIndex = outShape.index(dims);
out[outIndex] += in[index]; // this is probably wrong, atomicAdd?
}
@ -1189,7 +1205,8 @@ void Select(Tensor out,
in->data<float>(),
in->shape(),
axisGPU,
indices->data<IndexType>());
indices->data<IndexType>(),
indices->shape());
#if COMPILE_FP16
} else if (out->type() == Type::float16) {
gSelect<<<blocks, threads>>>(out->data<half>(),
@ -1197,7 +1214,8 @@ void Select(Tensor out,
in->data<half>(),
in->shape(),
axisGPU,
indices->data<IndexType>());
indices->data<IndexType>(),
indices->shape());
#endif
} else {
ABORT("Select not implemented for type {}", out->type());
@ -1224,7 +1242,8 @@ void Insert(Tensor out,
in->data<float>(),
in->shape(),
axisGPU,
indices->data<IndexType>());
indices->data<IndexType>(),
indices->shape());
#if COMPILE_FP16
} else if (out->type() == Type::float16) {
gInsert<<<blocks, threads>>>(out->data<half>(),
@ -1232,7 +1251,8 @@ void Insert(Tensor out,
in->data<half>(),
in->shape(),
axisGPU,
indices->data<IndexType>());
indices->data<IndexType>(),
indices->shape());
#endif
} else {
ABORT("Insert not implemented for type {}", out->type());
@ -1522,11 +1542,11 @@ __global__ void gCrossEntropyPick(T* out,
__syncthreads();
// cross-entropy
auto sum = _sum[0];
for(int tid = 0; tid < cols; tid += blockDim.x) {
int id = tid + threadIdx.x;
if(id == (int)pick[j]) {
out[j] = (T)functional::Ops<AccType>::log(_sum[0]) - sp[id] + max;
}
if(id == (int)pick[j])
out[j] = (T)functional::Ops<AccType>::log(sum) - sp[id] + max;
}
}
__syncthreads();
@ -1628,7 +1648,8 @@ __global__ void gCrossEntropyPickBackward(T* out,
int id = tid + threadIdx.x;
if(id < cols) {
AccType sub = (AccType)(id == (int)pick[j]);
so[id] += (AccType)adj[j] * (functional::Ops<AccType>::exp(sp[id] - max) / _sum[0] - sub);
auto softmax = functional::Ops<AccType>::exp(sp[id] - max) / _sum[0];
so[id] += (AccType)adj[j] * (softmax - sub);
}
}
}
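Note: for reference, gCrossEntropyPick computes per row the stable log-sum-exp form out[j] = log(sum_i exp(x_i - max)) - (x_pick - max), and the backward kernel above applies dx_i = adj * (softmax_i - [i == pick]). A small CPU reference sketch:

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
  std::vector<float> x = {1.f, 2.f, 3.f}; // one row of logits
  size_t pick = 2;                        // picked (target) class
  float max = *std::max_element(x.begin(), x.end());
  float sum = 0.f;
  for (float v : x) sum += std::exp(v - max);
  float ce = std::log(sum) - (x[pick] - max); // cross-entropy of the picked class
  printf("CE = %f\n", ce);                    // ~0.4076
  for (size_t i = 0; i < x.size(); ++i) {
    float softmax = std::exp(x[i] - max) / sum;
    float grad = softmax - (i == pick ? 1.f : 0.f); // adj assumed 1
    printf("dx[%zu] = %f\n", i, grad);
  }
  return 0;
}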
@ -1661,53 +1682,27 @@ void CrossEntropyPickBackward(Tensor out, Tensor adj, Tensor a, Tensor indices)
}
}
float L2Norm(Tensor in, Ptr<Allocator> allocator) {
// computes the L2Norm of a tensor and returns the value as float on the CPU,
// this is mostly used for diagnostic purposes and gradient clipping
float L2Norm(Tensor in, Ptr<Allocator> allocator) { // @TODO: reverse order of arguments
cudaSetDevice(in->getDeviceId().no);
int size = in->shape().elements();
int threads = std::min(MAX_THREADS, size);
int blocks = std::min(MAX_BLOCKS, size / threads + (size % threads != 0));
int blocks = std::min(MAX_BLOCKS, size / threads + (size % threads != 0));
if(allocator) {
auto memoryPiece = allocator->alloc<float>(blocks);
auto blockMem = TensorBase::New(memoryPiece, Shape({1, blocks}), Type::float32, in->getBackend());
using namespace functional;
if(in->type() == Type::float32) {
ReduceAll<float, float>(_1 * _1, blockMem, in);
using namespace functional;
float l2Norm;
if(in->type() == Type::float32) {
l2Norm = std::sqrt(AggregateAllAndReturn</*ElementType=*/float, /*AccType=*/float>(allocator, /*functor=*/_1 * _1, /*aggInit=*/0.f, /*aggFunctor=*/_1 + _2, /*scale=*/1.f, in));
#if COMPILE_FP16
} else if(in->type() == Type::float16) {
ReduceAll<half, float>(_1 * _1, blockMem, in);
} else if(in->type() == Type::float16) {
l2Norm = std::sqrt(AggregateAllAndReturn</*ElementType=*/half, /*AccType=*/float>(allocator, /*functor=*/_1 * _1, /*aggInit=*/0.f, /*aggFunctor=*/_1 + _2, /*scale=*/1.f, in));
#endif
} else {
ABORT("L2Norm not implemented for type {}", in->type());
}
float dataCpu = sqrtf(blockMem->get<float>(0));
allocator->free(memoryPiece);
return dataCpu;
} else { // @TODO: this branch is to be removed with next PR, old version
uint8_t* data;
cudaMalloc(&data, blocks * sizeof(float));
Tensor out(TensorBase::New(MemoryPiece::New(data, blocks * sizeof(float)),
Shape({1, blocks}),
Type::float32,
in->getBackend()));
using namespace functional;
if(in->type() == Type::float32) {
ReduceAll<float, float>(_1 * _1, out, in);
#if COMPILE_FP16
} else if(in->type() == Type::float16) {
ReduceAll<half, float>(_1 * _1, out, in);
#endif
} else {
ABORT("L2Norm not implemented for type {}", in->type());
}
float dataCpu = sqrtf(out->get<float>(0));
out.reset();
cudaFree(data);
return dataCpu;
} else {
ABORT("L2Norm not implemented for type {}", in->type());
}
return l2Norm;
}
template <typename T, typename AccType = float>
@ -1761,22 +1756,22 @@ __global__ void gAtt(T* out,
void Att(Tensor out, Tensor va, Tensor context, Tensor state) {
cudaSetDevice(out->getDeviceId().no);
size_t m = out->shape().elements() / out->shape().back();
size_t k = context->shape()[-1];
size_t b = context->shape()[-2];
size_t t = context->shape()[-3];
size_t totalRows = out->shape().elements() / out->shape().back(); // number of rows
size_t modelDim = context->shape()[-1]; // number of cols
size_t batchDim = context->shape()[-2];
size_t contextWordsDim = context->shape()[-3];
int blocks = std::min(MAX_BLOCKS, (int)m);
int threads = std::min(MAX_THREADS, (int)k);
int blocks = std::min(MAX_BLOCKS, (int)totalRows);
int threads = std::min(MAX_THREADS, (int)modelDim);
int shared = sizeof(float) * threads;
if(out->type() == Type::float32) {
gAtt<float, float><<<blocks, threads, shared>>>(
out->data<float>(), va->data<float>(), context->data<float>(), state->data<float>(), m, k, b, t);
out->data<float>(), va->data<float>(), context->data<float>(), state->data<float>(), totalRows, modelDim, batchDim, contextWordsDim);
#if COMPILE_FP16
} else if (out->type() == Type::float16) {
gAtt<half, float><<<blocks, threads, shared>>>(
out->data<half>(), va->data<half>(), context->data<half>(), state->data<half>(), m, k, b, t);
out->data<half>(), va->data<half>(), context->data<half>(), state->data<half>(), totalRows, modelDim, batchDim, contextWordsDim);
#endif
} else {
ABORT("gAtt not implemented for type {}", out->type());
@ -1930,7 +1925,7 @@ __global__ void gLNormalization(T* out,
len = (len + 1) >> 1;
}
__syncthreads();
AccType sigma = functional::Ops<AccType>::sqrt(_sqSum[0] / N); // all AccType
AccType sigma = functional::Ops<AccType>::sqrt(_sqSum[0] / N + eps); // all AccType
__syncthreads();
for(int tid = 0; tid < cols; tid += blockDim.x) {
@ -1939,7 +1934,7 @@ __global__ void gLNormalization(T* out,
AccType gammav = (AccType)gamma[id];
AccType xv = (AccType)xRow[id];
AccType betav = beta ? (AccType)beta[id] : (AccType)0.f;
AccType lv = (xv - mean) / (sigma + eps);
AccType lv = (xv - mean) / sigma;
AccType y = gammav * lv + betav;
yRow[id] = (T)y;
}
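Note: the two changes above move eps inside the square root, reverting to the old position. In formulas (a sketch, with N the row width):

% old: \hat{x}_i = \gamma_i \frac{x_i - \mu}{\sigma + \epsilon} + \beta_i,
%      \sigma  = \sqrt{\tfrac{1}{N}\sum_j (x_j - \mu)^2}
% new: \hat{x}_i = \gamma_i \frac{x_i - \mu}{\sigma'} + \beta_i,
%      \sigma' = \sqrt{\tfrac{1}{N}\sum_j (x_j - \mu)^2 + \epsilon}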
@ -2005,10 +2000,10 @@ __global__ void gLayerNormalizationGrad(T* gradX,
for(int bid = 0; bid < rows; bid += gridDim.x) {
int j = bid + blockIdx.x;
if(j < rows) {
AccType* sum_adj = shared;
AccType* sum_adj_x = shared + blockDim.x;
AccType* sum_x = shared + 2 * blockDim.x;
AccType* sum_sqr = shared + 3 * blockDim.x;
AccType* sum_adj = shared; // sum of incoming gradient
AccType* sum_adj_l = shared + blockDim.x; // sum of incoming gradient times layer-norm output l (recovered from y)
AccType* sum_x = shared + 2 * blockDim.x; // sum of input value x
AccType* sum_sqr = shared + 3 * blockDim.x; // sum of (x - mean)^2
const T* xRow = x + j * cols;
const T* yRow = y + j * cols;
@ -2016,7 +2011,7 @@ __global__ void gLayerNormalizationGrad(T* gradX,
sum_x[threadIdx.x] = (AccType)0.0f;
sum_adj[threadIdx.x] = (AccType)0.0f;
sum_adj_x[threadIdx.x] = (AccType)0.0f;
sum_adj_l[threadIdx.x] = (AccType)0.0f;
sum_sqr[threadIdx.x] = (AccType)0.0f;
for(int tid = 0; tid < cols; tid += blockDim.x) {
@ -2027,10 +2022,10 @@ __global__ void gLayerNormalizationGrad(T* gradX,
AccType betav = beta ? (AccType)beta[id] : (AccType)0.f;
AccType gammav = (AccType)gamma[id];
AccType adjv = adjRow[id];
AccType lv = (yv - betav) / (gammav + eps); // go back to LN(x) from scaled and shifted version for accumulation
AccType lv = (yv - betav) / gammav; // go back to LN(x) from scaled and shifted version for accumulation
sum_x[threadIdx.x] += xv;
sum_adj_x[threadIdx.x] += adjv * lv;
sum_adj_l[threadIdx.x] += adjv * lv;
sum_adj[threadIdx.x] += adjv;
}
}
@ -2042,7 +2037,7 @@ __global__ void gLayerNormalizationGrad(T* gradX,
if(threadIdx.x < (len >> 1)) {
sum_x[threadIdx.x] += sum_x[threadIdx.x + skip]; // Accumulates in AccType
sum_adj[threadIdx.x] += sum_adj[threadIdx.x + skip]; // Accumulates in AccType
sum_adj_x[threadIdx.x] += sum_adj_x[threadIdx.x + skip]; // Accumulates in AccType
sum_adj_l[threadIdx.x] += sum_adj_l[threadIdx.x + skip]; // Accumulates in AccType
}
len = (len + 1) >> 1;
}
@ -2069,33 +2064,32 @@ __global__ void gLayerNormalizationGrad(T* gradX,
len = (len + 1) >> 1;
}
__syncthreads();
AccType sigma = functional::Ops<AccType>::sqrt(sum_sqr[0] / N);
AccType sigma = functional::Ops<AccType>::sqrt(sum_sqr[0] / N + eps);
__syncthreads();
// Jacobian of layer norm
// J = [ \frac{1}{N\sigma} (N\delta_{ij} - l_i l_j - 1) ]_{ij}
// J * a = dC/dx_i = ( N v_i - l_i \sum_j l_j a_j - \sum_j a_j ) / (N \sigma)
// J * a = dC/dx_i = ( N a_i - l_i \sum_j l_j a_j - \sum_j a_j ) / (N \sigma)
for(int tid = 0; tid < cols; tid += blockDim.x) {
int id = tid + threadIdx.x;
if(id < cols) {
AccType xv = xRow[id];
//AccType yv = yRow[id];
//AccType betav = beta ? (AccType)beta[id] : (AccType)0.f;
AccType gammav = (AccType)gamma[id];
AccType adjv = adjRow[id];
AccType lv = (xv - mean) / (sigma + eps);
AccType lv = (xv - mean) / sigma;
AccType gradLv = N * adjv - lv * sum_adj_x[0] - sum_adj[0];
gradLv /= N * (sigma + eps); // eps has to be inside parentheses for correct gradient
AccType gradLv = N * adjv - lv * sum_adj_l[0] - sum_adj[0];
gradLv /= N * sigma;
AccType gradXv = gammav * gradLv;
// Keep LN gradient between [-10, 10]
// AccType sign = functional::Ops<AccType>::sgn(gradXv);
// AccType cutoff = (AccType)10.f;
// gradXv = functional::Ops<AccType>::abs(gradXv) > cutoff ? sign * cutoff : gradXv;
// Keep LN gradient between [-1000, 1000] for TensorOps, this is currently used for making values fit into fp16. @TODO: to be fixed and removed.
AccType sign = functional::Ops<AccType>::sgn(gradXv);
AccType cutoff = (AccType)1000.f; // @TODO: expose this somehow as an option?
// or better: make obsolete.
gradXv = functional::Ops<AccType>::abs(gradXv) > cutoff ? sign * cutoff : gradXv;
T* gradXRow = gradX + j * cols;
gradXRow[id] += (T)(gradXv);
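Note: the gradient formula in the comment above can be checked numerically. A stand-alone CPU sketch (gamma/beta omitted, eps inside the sqrt as in this commit) comparing the analytic expression against central finite differences:

#include <cmath>
#include <cstdio>
#include <vector>

static const float EPS = 1e-9f;

// layer norm of one row, no gamma/beta, eps inside the sqrt
std::vector<float> layerNorm(const std::vector<float>& x) {
  int N = (int)x.size();
  float mean = 0.f, var = 0.f;
  for (float v : x) mean += v / N;
  for (float v : x) var += (v - mean) * (v - mean) / N;
  float sigma = std::sqrt(var + EPS);
  std::vector<float> l(N);
  for (int i = 0; i < N; ++i) l[i] = (x[i] - mean) / sigma;
  return l;
}

int main() {
  std::vector<float> x = {0.5f, -1.0f, 2.0f, 0.25f};
  std::vector<float> a = {1.0f, -0.5f, 0.25f, 2.0f}; // incoming gradient (adj)
  int N = (int)x.size();
  std::vector<float> l = layerNorm(x);
  float mean = 0.f, var = 0.f;
  for (float v : x) mean += v / N;
  for (float v : x) var += (v - mean) * (v - mean) / N;
  float sigma = std::sqrt(var + EPS);
  float sum_adj = 0.f, sum_adj_l = 0.f;
  for (int i = 0; i < N; ++i) { sum_adj += a[i]; sum_adj_l += a[i] * l[i]; }
  for (int i = 0; i < N; ++i) {
    float analytic = (N * a[i] - l[i] * sum_adj_l - sum_adj) / (N * sigma);
    float h = 1e-3f; // central finite difference of C = sum_j a_j * LN(x)_j
    std::vector<float> xp = x, xm = x;
    xp[i] += h; xm[i] -= h;
    std::vector<float> lp = layerNorm(xp), lm = layerNorm(xm);
    float cp = 0.f, cm = 0.f;
    for (int j = 0; j < N; ++j) { cp += a[j] * lp[j]; cm += a[j] * lm[j]; }
    printf("dx[%d]: analytic %.5f vs numeric %.5f\n", i, analytic, (cp - cm) / (2 * h));
  }
  return 0;
}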

View File

@ -28,18 +28,20 @@ std::string TensorBase::debug(int precision, int dispCols) {
else
strm << std::fixed << std::setprecision(0) << std::setfill(' ');
// double maxv = std::numeric_limits<double>::lowest();
// double minv = std::numeric_limits<double>::max();
// double l2Norm = 0.0;
double maxv = std::numeric_limits<double>::lowest();
double minv = std::numeric_limits<double>::max();
double l2Sum = 0.0;
for(int i = 0; i < values.size(); ++i) {
if((double)values[i] > maxv) maxv = (double)values[i];
if((double)values[i] < minv) minv = (double)values[i];
l2Sum += (double)values[i] * (double)values[i];
}
strm << "min: " << minv << " max: " << maxv << " l2-norm: " << sqrt(l2Sum) << std::endl;
for(int i = 0; i < values.size(); ++i) {
std::vector<int> dims;
shape().dims(i, dims);
// if((double)values[i] > maxv) maxv = values[i];
// if((double)values[i] < minv) minv = values[i];
// l2Norm += (double)values[i] * (double)values[i];
bool disp = true;
for(int j = 0; j < dims.size(); ++j)
disp = disp && (dims[j] < dispCols || dims[j] >= shape()[j] - dispCols);
@ -95,8 +97,6 @@ std::string TensorBase::debug(int precision, int dispCols) {
}
}
strm << std::endl;
//strm << "min: " << minv << " max: " << maxv << " l2-norm: " << sqrt(l2Norm);
return strm.str();
}

View File

@ -54,12 +54,12 @@ void Add(Functor functor, float scale, marian::Tensor out, Tensors... tensors) {
gpu::Add(functor, scale, out, tensors...);
else
#endif
cpu::Aggregate(functor, 0.0f, functional::_1 + functional::_2, scale, out, tensors...);
cpu::Aggregate(functor, /*aggInit=*/0.0f, functional::_1 + functional::_2, scale, out, tensors...);
}
template <class Functor, class... Tensors>
void Add(Functor functor, marian::Tensor out, Tensors... tensors) {
Add(functor, 1, out, tensors...);
Add(functor, /*scale=*/1.f, out, tensors...);
}
template <class Functor, class AggFunctor, class... Tensors>

View File

@ -40,6 +40,9 @@ TEST_CASE("Options can be accessed", "[fastopt]") {
"subnode: {"
" baz: [ 111.5, False ],"
" qux: 222,"
" preprocess1: n,"
" preprocess2: d,"
" preprocess3: y,"
" }"
"}");
@ -57,6 +60,9 @@ TEST_CASE("Options can be accessed", "[fastopt]") {
CHECK( o["subnode"]["baz"][0].as<float>() == 111.5f );
CHECK( o["subnode"]["baz"][1].as<bool>() == false );
CHECK( o["subnode"]["baz"][0].as<int>() == 111 );
CHECK( o["subnode"]["preprocess1"].as<std::string>() == "n" ); // don't allow "n" to be cast to boolean false while converting from YAML
CHECK( o["subnode"]["preprocess2"].as<std::string>() == "d" );
CHECK( o["subnode"]["preprocess3"].as<std::string>() == "y" ); // don't allow "y" to be cast to boolean true while converting from YAML
}
node["foo"] = "baz";

View File

@ -670,16 +670,16 @@ void tests(DeviceType device, Type floatType = Type::float32) {
values.clear();
std::vector<T> vA({ 1, -2, 3,
-4, 5, -6,
7, -8, 9,
-10, 11, -12});
-4, 5, -6,
7, -8, 9,
-10, 11, -12});
std::vector<T> vC({ 1, -2, // C = np.array([1, -2, 3, -4, 5, -6, 7, -8, 9, -10, 11, -12]).reshape((2, 3, 2))
3, -4,
5, -6,
3, -4,
5, -6,
7, -8,
9, -10,
11, -12 });
7, -8,
9, -10,
11, -12 });
std::vector<T> vB1({1, -2, 3});
std::vector<T> vB2({1, -4, 7, -10});
std::vector<T> vB3({-2, 5, -8, 11});
@ -687,7 +687,7 @@ void tests(DeviceType device, Type floatType = Type::float32) {
std::vector<T> vD1(vB4);
std::vector<T> vD2({5, -6, 11, -12});
std::vector<T> vD3({1, -2, 5, -6, 7, -8, 11, -12}); // C[:,(0,2),:]
//std::vector<float> vD4({5, -6, 3, -4, 7, -8, 11, -12}); // [C[0,(2,1),:],C[1,(0,2),:]]
std::vector<T> vD4({5, -6, 3, -4, 7, -8, 11, -12}); // [C[0,(2,1),:],C[1,(0,2),:]]
std::vector<T> vS1({7, -8, 9});
std::vector<T> vS2({-4, 5, -6, 7, -8, 9});
std::vector<T> vS3({7, -8, 9, -10, 11, -12});
@ -714,11 +714,11 @@ void tests(DeviceType device, Type floatType = Type::float32) {
CHECK(D1->type() == "sliceView");
CHECK(D2->type() == "gather");
// enable this once gather() supports batched indices:
//auto D4 = gather(C, 1, graph->constant({2, 2, 1}, // [C[0,(2,1),:],C[1,(0,2),:]]
// inits::fromVector(std::vector<IndexType>{
// 2, 1,
// 0, 2 }),
// Type::uint32));
auto D4 = gather(C, 1, graph->constant({2, 2, 1}, // [C[0,(2,1),:],C[1,(0,2),:]]
inits::fromVector(std::vector<IndexType>{
2, 1,
0, 2 }),
Type::uint32));
auto S1 = slice(A, 0, 2);
auto S2 = narrow(A, 0, 1, 2);
@ -736,7 +736,7 @@ void tests(DeviceType device, Type floatType = Type::float32) {
CHECK(D1->shape() == Shape({1, 3, 2})); D1->val()->get(values); CHECK( values == vD1 );
CHECK(D2->shape() == Shape({2, 1, 2})); D2->val()->get(values); CHECK( values == vD2 );
CHECK(D3->shape() == Shape({2, 2, 2})); D3->val()->get(values); CHECK( values == vD3 );
//CHECK(D4->shape() == Shape({2, 2, 2})); D4->val()->get(values); CHECK( values == vD4 );
CHECK(D4->shape() == Shape({2, 2, 2})); D4->val()->get(values); CHECK( values == vD4 );
CHECK(S1->shape() == Shape({1,3})); S1->val()->get(values); CHECK(values == vS1);
CHECK(S2->shape() == Shape({2,3})); S2->val()->get(values); CHECK(values == vS2);
@ -789,3 +789,59 @@ TEST_CASE("Expression graph supports basic math operations (cpu)", "[operator]")
tests<float>(DeviceType::cpu);
}
#endif
#ifdef BLAS_FOUND
#ifdef CUDA_FOUND
TEST_CASE("Compare aggregate operator", "[graph]") {
auto floatApprox = [](float x, float y) -> bool { return x == Approx(y).epsilon(0.01); };
Config::seed = 1234;
std::vector<float> initc;
std::vector<float> inita;
{
auto graph = New<ExpressionGraph>();
graph->setDevice({0, DeviceType::cpu});
graph->reserveWorkspaceMB(40);
auto chl = graph->param("1x10x512x2048", {1, 10, 512, 2048}, inits::normal());
auto adj = graph->param("1x1x512x2048", {1, 1, 512, 2048}, inits::normal());
graph->forward();
chl->val()->get(initc);
adj->val()->get(inita);
}
SECTION("initializing with zero (cpu)") {
std::vector<float> values1;
std::vector<float> values2;
auto graph1 = New<ExpressionGraph>();
graph1->setDevice({0, DeviceType::cpu});
graph1->reserveWorkspaceMB(40);
auto graph2 = New<ExpressionGraph>();
graph2->setDevice({0, DeviceType::gpu});
graph2->reserveWorkspaceMB(40);
auto chl1 = graph1->param("1x10x512x2048", {1, 10, 512, 2048}, inits::fromVector(initc));
auto adj1 = graph1->param("1x1x512x2048", {1, 1, 512, 2048}, inits::fromVector(inita));
auto prod1 = scalar_product(chl1, adj1, -1);
graph1->forward();
auto chl2 = graph2->param("1x10x512x2048", {1, 10, 512, 2048}, inits::fromVector(initc));
auto adj2 = graph2->param("1x1x512x2048", {1, 1, 512, 2048}, inits::fromVector(inita));
auto prod2 = scalar_product(chl2, adj2, -1);
graph2->forward();
prod1->val()->get(values1);
prod2->val()->get(values2);
CHECK( std::equal(values1.begin(), values1.end(), values2.begin(), floatApprox) );
}
}
#endif
#endif
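Note: the test above compares CPU and GPU results element-wise under a relative tolerance (the floatApprox lambda). A stand-alone analogue of that comparison, with made-up values:

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

bool floatApprox(float x, float y, float relEps = 0.01f) {
  return std::fabs(x - y) <= relEps * std::fabs(y); // roughly Approx(y).epsilon(0.01)
}

int main() {
  std::vector<float> cpu = {1.000f, -2.500f, 3.1415f}; // made-up values
  std::vector<float> gpu = {1.001f, -2.499f, 3.1413f};
  bool ok = std::equal(cpu.begin(), cpu.end(), gpu.begin(),
                       [](float a, float b) { return floatApprox(a, b); });
  printf("match: %d\n", (int)ok); // 1
  return 0;
}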

View File

@ -109,6 +109,8 @@ public:
auto cost = model->build(graph, batch);
fits = graph->fits();
LOG(debug, "[batching] length: {} - size: {} - fits: {}", lengths[0], current, fits);
if(fits) {
stats->add(batch, multiplier);
start = current + 1;

View File

@ -18,6 +18,7 @@ AsyncGraphGroup::AsyncGraphGroup(Ptr<Options> config, Ptr<IMPIWrapper> mpi)
for(auto device : devices_) {
auto graph = New<ExpressionGraph>();
graph->setDevice(device);
graph->setCheckpointing(options_->get<bool>("gradient-checkpointing"));
graph->getBackend()->setClip(options_->get<float>("clip-gemm"));
graph->reserveWorkspaceMB(options_->get<size_t>("workspace"));
graphs_.push_back(graph);

View File

@ -34,6 +34,7 @@ public:
// Initialize graph
graph_ = New<ExpressionGraph>();
graph_->setDevice(deviceId);
graph_->setCheckpointing(options_->get<bool>("gradient-checkpointing"));
graph_->getBackend()->setClip(options_->get<float>("clip-gemm"));
graph_->reserveWorkspaceMB(options_->get<size_t>("workspace"));
opt_ = Optimizer(options_);

View File

@ -10,6 +10,7 @@ SyncGraphGroup::SyncGraphGroup(Ptr<Options> config, Ptr<IMPIWrapper> mpi)
for(auto device : devices_) {
auto graph = New<ExpressionGraph>();
graph->setDevice(device);
graph->setCheckpointing(options_->get<bool>("gradient-checkpointing"));
graph->reserveWorkspaceMB(options_->get<size_t>("workspace"));
graph->getBackend()->setClip(options_->get<float>("clip-gemm"));
@ -57,19 +58,6 @@ void SyncGraphGroup::initialize(const Ptr<data::Batch>& exampleBatch) {
if (i > 0)
graphs_[i]->params()->vals()->copyFrom(graphs_[0]->params()->vals());
});
//ThreadPool pool(graphs_.size() - 1, graphs_.size() - 1);
//for(size_t i = 1; i < graphs_.size(); ++i) {
// auto init = [&](size_t i) {
// // initialize i-th graph and weights
// builders_[i]->build(graphs_[i], exampleBatch);
// graphs_[i]->forward();
// // overwrite weights of i-th graph with weights from 0-th graph
// graphs_[i]->params()->vals()->copyFrom(graphs_[0]->params()->vals());
// };
// pool.enqueue(init, i);
//}
//// ThreadPool destructor waits until completion of all tasks.
//// @TODO: can we use comm_->foreach()?
}
void SyncGraphGroup::initializeAvg() {
@ -401,15 +389,20 @@ void SyncGraphGroup::update(std::vector<Ptr<data::Batch>> subBatches, size_t num
paramsAvg_[idx], curParam, scheduler_->numberOfBatches(), updateTrgWords);
};
comm_->scatterReduceAndResetGrads(); // reduce gradients across all devices (globally) into shards
comm_->foreach(update); // per-shard model-update
comm_->allGatherParams(); // distribute param value shards back
// cost across all local devices (scheduler will aggregate cross-process)
StaticLoss localLoss;
for(auto& l : localDeviceLosses) // localDeviceLosses is already summed up over delay steps
localLoss += l;
// model update
if (std::isfinite(localLoss.loss) || mpi_->numMPIProcesses() > 1) { // guard against NaN (except with MPI, as this simple way could hang it)
comm_->scatterReduceAndResetGrads(); // reduce gradients across all devices and MPI nodes into shards
comm_->foreach(update); // per-shard model-update
comm_->allGatherParams(); // distribute param value shards back
}
else
LOG(info, "[training] skipping {}-th update due to loss being {}", scheduler_->numberOfBatches(), localLoss.loss);
if(scheduler_) {
// track and log localLoss
scheduler_->update(localLoss, numReadBatches, effectiveBatchSize, effectiveBatchTrgWords, mpi_);

6
src/training/scheduler.h Normal file → Executable file
View File

@ -9,7 +9,7 @@
namespace marian {
bool getSigtermFlag();
void installSignalHandlers();
void installSignalHandlers();
class Scheduler : public TrainingObserver {
private:
@ -229,7 +229,7 @@ public:
continue;
size_t stalledPrev = validator->stalled();
float value = validator->validate(graphs);
float value = validator->validate(graphs, state_);
if(validator->stalled() > 0) {
LOG_VALID(info,
"Ep. {} : Up. {} : {} : {} : stalled {} times (last best: {})",
@ -358,7 +358,7 @@ public:
&& heartBeatTimer_.elapsed<std::chrono::minutes>() >= 10) {
printf("PROGRESS: %.2f%%\nEVALERR: %.7f%%\n",
(double)state_->epochs,
state_->costSum / state_->costCount / (mpi ? mpi->numMPIProcesses() : 1));
state_->costSum / (state_->costCount ? state_->costCount : 1) / (mpi ? mpi->numMPIProcesses() : 1));
fflush(stdout);
std::cout << "MBSIZE: " << batchLabels << " after " << state_->batches << " updates = " << state_->labelsTotal << " labels" << std::endl << std::flush;
heartBeatTimer_.start();

View File

@ -254,7 +254,7 @@ public:
seedCorpus = config["seed-corpus"].as<std::string>();
}
void save(const std::string& name) {
void save(const std::string& name) const {
std::ofstream fout(name);
YAML::Node config;
@ -291,6 +291,16 @@ public:
fout << config;
}
std::string fillTemplate(const std::string& templ) const {
// The formatting below uses fmtlib, which ships with spdlog
// and is pulled in via the logger.
return fmt::format(templ.c_str(),
fmt::arg("E", epochs),
fmt::arg("U", batches),
fmt::arg("B", batchesEpoch),
fmt::arg("T", labelsTotal));
}
private:
std::vector<Ptr<TrainingObserver>> observers_;
};
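Note: with fillTemplate above, a templated --valid-translation-output such as valid.{E}.{U}.out would expand using the current training state. A hypothetical stand-alone sketch; the include path for the spdlog-bundled fmt is an assumption, and the numbers are made up:

#include <spdlog/fmt/fmt.h>  // fmtlib as bundled with spdlog (assumed include path)
#include <iostream>
#include <string>

int main() {
  // Template with named fields, matching the fmt::arg names above.
  std::string templ = "valid.{E}.{U}.out";
  std::string fileName = fmt::format(templ.c_str(),
                                     fmt::arg("E", 3),        // epochs completed
                                     fmt::arg("U", 15000),    // updates/batches
                                     fmt::arg("B", 500),      // batches in current epoch
                                     fmt::arg("T", 1200000)); // target labels seen
  std::cout << fileName << std::endl;  // prints: valid.3.15000.out
  return 0;
}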

View File

@ -303,7 +303,8 @@ ScriptValidator::ScriptValidator(std::vector<Ptr<Vocab>> vocabs, Ptr<Options> op
"valid-script metric but no script given");
}
float ScriptValidator::validate(const std::vector<Ptr<ExpressionGraph>>& graphs) {
float ScriptValidator::validate(const std::vector<Ptr<ExpressionGraph>>& graphs,
Ptr<const TrainingState> /*ignored*/) {
using namespace data;
auto model = options_->get<std::string>("model");
std::string suffix = model.substr(model.size() - 4);
@ -331,7 +332,8 @@ TranslationValidator::TranslationValidator(std::vector<Ptr<Vocab>> vocabs, Ptr<O
createBatchGenerator(/*isTranslating=*/true);
}
float TranslationValidator::validate(const std::vector<Ptr<ExpressionGraph>>& graphs) {
float TranslationValidator::validate(const std::vector<Ptr<ExpressionGraph>>& graphs,
Ptr<const TrainingState> state) {
using namespace data;
// Generate batches
@ -353,6 +355,8 @@ float TranslationValidator::validate(const std::vector<Ptr<ExpressionGraph>>& gr
if(options_->hasAndNotEmpty("valid-translation-output")) {
fileName = options_->get<std::string>("valid-translation-output");
// fileName can be a template with fields for training state parameters:
fileName = state->fillTemplate(fileName);
} else {
tempFile.reset(new io::TemporaryFile(options_->get<std::string>("tempdir"), false));
fileName = tempFile->getFileName();
@ -455,7 +459,8 @@ BleuValidator::BleuValidator(std::vector<Ptr<Vocab>> vocabs, Ptr<Options> option
createBatchGenerator(/*isTranslating=*/true);
}
float BleuValidator::validate(const std::vector<Ptr<ExpressionGraph>>& graphs) {
float BleuValidator::validate(const std::vector<Ptr<ExpressionGraph>>& graphs,
Ptr<const TrainingState> state) {
using namespace data;
// Generate batches
@ -495,6 +500,8 @@ float BleuValidator::validate(const std::vector<Ptr<ExpressionGraph>>& graphs) {
Ptr<OutputCollector> collector;
if(options_->hasAndNotEmpty("valid-translation-output")) {
auto fileName = options_->get<std::string>("valid-translation-output");
// fileName can be a template with fields for training state parameters:
fileName = state->fillTemplate(fileName);
collector = New<OutputCollector>(fileName); // for debugging
} else {
collector = New<OutputCollector>(/* null */); // don't print, but log

View File

@ -37,7 +37,8 @@ protected:
public:
ValidatorBase(bool lowerIsBetter) : lowerIsBetter_(lowerIsBetter), lastBest_{initScore()} {}
virtual float validate(const std::vector<Ptr<ExpressionGraph>>& graphs) = 0;
virtual float validate(const std::vector<Ptr<ExpressionGraph>>& graphs,
Ptr<const TrainingState> state) = 0;
virtual std::string type() = 0;
float lastBest() { return lastBest_; }
@ -83,7 +84,8 @@ protected:
}
public:
virtual float validate(const std::vector<Ptr<ExpressionGraph>>& graphs) override {
virtual float validate(const std::vector<Ptr<ExpressionGraph>>& graphs,
Ptr<const TrainingState> /*ignored*/) override {
for(auto graph : graphs)
graph->setInference(true);
@ -176,7 +178,8 @@ class ScriptValidator : public Validator<data::Corpus, models::IModel> {
public:
ScriptValidator(std::vector<Ptr<Vocab>> vocabs, Ptr<Options> options);
virtual float validate(const std::vector<Ptr<ExpressionGraph>>& graphs) override;
virtual float validate(const std::vector<Ptr<ExpressionGraph>>& graphs,
Ptr<const TrainingState> /*ignored*/) override;
std::string type() override { return "valid-script"; }
@ -191,7 +194,8 @@ class TranslationValidator : public Validator<data::Corpus, models::IModel> {
public:
TranslationValidator(std::vector<Ptr<Vocab>> vocabs, Ptr<Options> options);
virtual float validate(const std::vector<Ptr<ExpressionGraph>>& graphs) override;
virtual float validate(const std::vector<Ptr<ExpressionGraph>>& graphs,
Ptr<const TrainingState> state) override;
std::string type() override { return "translation"; }
@ -209,7 +213,8 @@ class BleuValidator : public Validator<data::Corpus, models::IModel> {
public:
BleuValidator(std::vector<Ptr<Vocab>> vocabs, Ptr<Options> options, bool detok = false);
virtual float validate(const std::vector<Ptr<ExpressionGraph>>& graphs) override;
virtual float validate(const std::vector<Ptr<ExpressionGraph>>& graphs,
Ptr<const TrainingState> state) override;
// @TODO: why do we return this string, but not pass it to the constructor?
std::string type() override { return detok_ ? "bleu-detok" : "bleu"; }

View File

@ -39,6 +39,7 @@ public:
const std::vector<Ptr<ScorerState /*const*/>>& states,
Ptr<data::CorpusBatch /*const*/> batch, // for alignments only
Ptr<FactoredVocab/*const*/> factoredVocab, size_t factorGroup,
const std::vector<bool>& dropBatchEntries, // [origDimBatch] - empty source batch entries are marked with true, should be cleared after first use.
const std::vector<IndexType>& batchIdxMap) const { // [origBatchIdx -> currentBatchIdx]
std::vector<float> align; // collects alignment information from the last executed time step
if(options_->hasAndNotEmpty("alignment") && factorGroup == 0)
@ -49,9 +50,10 @@ public:
// create a reverse batchMap to obtain original batchIdx in the starting batch size
// and calculate the current batch size based on non-empty beams
std::vector<IndexType> reverseBatchIdxMap(batchIdxMap.size());
std::vector<IndexType> reverseBatchIdxMap; // empty if not purging batch entries
size_t currentDimBatch = beams.size();
if(PURGE_BATCH) {
reverseBatchIdxMap.resize(batchIdxMap.size()); // adjust size if doing batch purging.
currentDimBatch = 0;
for(int i = 0; i < batchIdxMap.size(); ++i) {
reverseBatchIdxMap[batchIdxMap[i]] = i; // reverse batch index mapping, multiple occurrences get overwritten with the last one,
@ -66,16 +68,22 @@ public:
// They can be between 0 and (vocabSize * nBestBeamSize * batchSize)-1.
// (beamHypIdx refers to the GPU tensors, *not* the beams[] array; they are not the same in case of purging)
const auto key = nBestKeys[i];
const float pathScore = nBestPathScores[i]; // expanded path score for (batchIdx, beamHypIdx, word)
// decompose key into individual indices (batchIdx, beamHypIdx, wordIdx)
const auto wordIdx = (WordIndex)(key % vocabSize);
const auto beamHypIdx = (key / vocabSize) % nBestBeamSize;
const auto currentBatchIdx = (key / vocabSize) / nBestBeamSize;
const auto beamHypIdx = (key / vocabSize) % nBestBeamSize;
const auto currentBatchIdx = (key / vocabSize) / nBestBeamSize;
const auto origBatchIdx = reverseBatchIdxMap.empty() ? currentBatchIdx : reverseBatchIdxMap[currentBatchIdx]; // map currentBatchIdx back into original position within starting maximal batch size, required to find correct beam
auto origBatchIdx = currentBatchIdx;
if(PURGE_BATCH)
origBatchIdx = reverseBatchIdxMap[currentBatchIdx]; // map currentBatchIdx back into original position within starting maximal batch size, required to find correct beam
bool dropHyp = !dropBatchEntries.empty() && dropBatchEntries[origBatchIdx];
// if we force-drop the hypothesis, assign EOS, otherwise the expected word id.
const auto wordIdx = dropHyp ? trgVocab_->getEosId().toWordIndex() : (WordIndex)(key % vocabSize);
// @TODO: We currently assign a log probability of 0 to all beam entries of the dropped batch entry, instead it might be a good idea to use
// the per Hyp pathScore without the current expansion (a bit hard to obtain).
// For the case where we drop empty inputs, 0 is fine. For other use cases like a forced stop, the penultimate pathScore might be better.
// For the empty hyp this would naturally result in 0, too.
const float pathScore = dropHyp ? 0.f : nBestPathScores[i]; // 0 (Prob = 1, maximum score) if dropped or expanded path score for (batchIdx, beamHypIdx, word)
const auto& beam = beams[origBatchIdx];
auto& newBeam = newBeams[origBatchIdx]; // extended hypotheses are going to be placed in this new beam
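Note: the decomposition above inverts key = (batchIdx * nBestBeamSize + beamHypIdx) * vocabSize + wordIdx. A minimal sketch with made-up sizes:

#include <cstdio>

int main() {
  const unsigned vocabSize = 32000, nBestBeamSize = 6;
  unsigned batchIdx = 3, beamHypIdx = 4, wordIdx = 1234;
  unsigned key = (batchIdx * nBestBeamSize + beamHypIdx) * vocabSize + wordIdx;
  printf("wordIdx    = %u\n", key % vocabSize);                   // 1234
  printf("beamHypIdx = %u\n", (key / vocabSize) % nBestBeamSize); // 4
  printf("batchIdx   = %u\n", (key / vocabSize) / nBestBeamSize); // 3
  return 0;
}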
@ -85,7 +93,7 @@ public:
if (pathScore <= INVALID_PATH_SCORE) // (dummy slot or word that cannot be expanded by current factor)
continue;
ABORT_IF(beamHypIdx >= beam.size(), "Out of bounds beamHypIdx??");
ABORT_IF(beamHypIdx >= beam.size(), "Out of bounds beamHypIdx??"); // effectively this is equivalent to ABORT_IF(beams[origBatchIdx].empty(), ...)
// map wordIdx to word
auto prevBeamHypIdx = beamHypIdx; // back pointer
@ -99,12 +107,17 @@ public:
// starting with the lemma, then adding factors one by one.
if (factorGroup == 0) {
word = factoredVocab->lemma2Word(shortlist ? shortlist->reverseMap(wordIdx) : wordIdx); // @BUGBUG: reverseMap is only correct if factoredVocab_->getGroupRange(0).first == 0
//std::vector<size_t> factorIndices; factoredVocab->word2factors(word, factorIndices);
//LOG(info, "new lemma {},{}={} -> {}->{}", word.toWordIndex(), factorIndices[0], factoredVocab->word2string(word), prevHyp->getPathScore(), pathScore);
std::vector<size_t> factorIndices; factoredVocab->word2factors(word, factorIndices);
//LOG(info, "{} + {} ({}) -> {} -> {}",
// factoredVocab->decode(prevHyp->tracebackWords()),
// factoredVocab->word2string(word), factorIndices[0], prevHyp->getPathScore(), pathScore);
}
else {
//LOG(info, "expand word {}={} with factor[{}] {} -> {}->{}", beam[beamHypIdx]->getWord().toWordIndex(),
// factoredVocab->word2string(beam[beamHypIdx]->getWord()), factorGroup, wordIdx, prevHyp->getPathScore(), pathScore);
//LOG(info, "{} |{} ({}) = {} ({}) -> {} -> {}",
// factoredVocab->decodeForDiagnostics(beam[beamHypIdx]->tracebackWords()),
// factoredVocab->getFactorGroupPrefix(factorGroup), factorGroup,
// factoredVocab->getFactorName(factorGroup, wordIdx), wordIdx,
// prevHyp->getPathScore(), pathScore);
word = beam[beamHypIdx]->getWord();
ABORT_IF(!factoredVocab->canExpandFactoredWord(word, factorGroup),
"A word without this factor snuck through to here??");
@ -235,7 +248,7 @@ public:
if(PURGE_BATCH)
if(newBeam.empty() && !beam.empty()) { // previous beam had hyps, but all were finished in this step, newBeam will now stay empty
for(size_t i = beamIdx + 1; i < beams.size(); ++i) // for all entries above this beam
batchIdxMap[i] = batchIdxMap[i] - 1; // make them look at one batch index below, as the current entry will be removed from the batch.
}
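// Illustrative sketch (assumed values): with origDimBatch = 4 and batchIdxMap = [0, 1, 2, 3],
// a beam finishing at beamIdx = 1 turns the map into [0, 1, 1, 2] via the loop above:
// beams 2 and 3 now look one batch position lower, since entry 1 is removed from the
// shrinking batch tensors while beams[] keeps its original size.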
@ -282,31 +295,22 @@ public:
states.push_back(scorer->startState(graph, batch));
}
// create one beam per batch entry with sentence-start hypothesis
Beams beams(origDimBatch, Beam(beamSize_, Hypothesis::New())); // array [origDimBatch] of array [maxBeamSize] of Hypothesis, keeps full size through search.
// batch purging is determined from an empty sub-beam.
std::vector<IndexType> batchIdxMap(origDimBatch); // Record at which batch entry a beam is looking.
// By default that corresponds to position in array,
// but shifts in the course of removing batch entries when they are finished.
const std::vector<bool> emptyBatchEntries; // used for recording if there are empty input batch entries
for(int origBatchIdx = 0; origBatchIdx < origDimBatch; ++origBatchIdx) {
batchIdxMap[origBatchIdx] = origBatchIdx; // map to same position on initialization
auto& beam = beams[origBatchIdx];
histories[origBatchIdx]->add(beam, trgEosId); // add beams with start-hypotheses to traceback grid
// Mark batch entries that consist only of source <EOS>, i.e. empty input lines. They will be forced to EOS and purged from the batch.
const auto& srcEosId = batch->front()->vocab()->getEosId();
const_cast<std::vector<bool>&>(emptyBatchEntries).push_back(batch->front()->data()[origBatchIdx] == srcEosId); // const_cast during construction
}
// determine index of UNK in the log prob vectors if we want to suppress it in the decoding process
@ -406,7 +410,7 @@ public:
}
}
if(factorGroup == 0)
currentDimBatch = (IndexType) batchIndices.size(); // keep batch size constant for all factor groups in a time step
prevPathScores = graph->constant({(int)maxBeamSize, 1, (int)currentDimBatch, 1}, inits::fromVector(prevScores));
}
if (!anyCanExpand) // all words cannot expand this factor: skip
@ -491,6 +495,7 @@ public:
states, // used for keeping track of per-ensemble-member path score
batch, // only used for propagating alignment info
factoredVocab, factorGroup,
emptyBatchEntries, // [origDimBatch] - empty source batch entries are marked with true
batchIdxMap); // used to create a reverse batch index map to recover original batch indices for this step
} // END FOR factorGroup = 0 .. numFactorGroups-1

View File

@ -42,33 +42,44 @@ public:
float getPathScore() const { return pathScore_; }
const std::vector<float>& getScoreBreakdown() { return scoreBreakdown_; }
void setScoreBreakdown(const std::vector<float>& scoreBreakdown) { scoreBreakdown_ = scoreBreakdown; }
const std::vector<float>& getAlignment() { return alignment_; }
void setAlignment(const std::vector<float>& align) { alignment_ = align; };
// trace back paths referenced from this hypothesis
Words tracebackWords() {
Words targetWords;
for(auto hyp = this; hyp->getPrevHyp(); hyp = hyp->getPrevHyp().get()) {
targetWords.push_back(hyp->getWord());
}
std::reverse(targetWords.begin(), targetWords.end());
return targetWords;
}
// calculate word-level scores for each target word by de-aggregating the path score
std::vector<float> tracebackWordScores() {
std::vector<float> scores;
// traverse hypotheses backward
for(auto hyp = this; hyp->getPrevHyp(); hyp = hyp->getPrevHyp().get()) {
// a path score is a cumulative score including scores from all preceding hypotheses (words),
// so calculate a word-level score by subtracting the previous path score from the current path score
auto prevPathScore = hyp->getPrevHyp() ? hyp->getPrevHyp().get()->pathScore_ : 0.f;
scores.push_back(hyp->pathScore_ - prevPathScore);
}
std::reverse(scores.begin(), scores.end());
return scores;
}
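// Worked example (illustrative numbers): for a hypothesis chain with cumulative path
// scores -0.5 -> -1.2 -> -1.8, tracebackWordScores() returns {-0.5, -0.7, -0.6};
// by construction the word-level scores sum back to the final path score of -1.8.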
// get soft alignments [t][s] -> P(s|t) for each target word starting from the hyp one
typedef data::SoftAlignment SoftAlignment;
SoftAlignment tracebackAlignment() {
SoftAlignment align;
for(auto hyp = this; hyp->getPrevHyp(); hyp = hyp->getPrevHyp().get()) {
align.push_back(hyp->getAlignment());
}
std::reverse(align.begin(), align.end());
return align; // [t][s] -> P(s|t)
}
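// Illustrative shape (assumed values): for a 3-token target over a 2-token source,
// tracebackAlignment() returns a 3 x 2 grid such as
//   align[0] = {0.9, 0.1}, align[1] = {0.2, 0.8}, align[2] = {0.5, 0.5},
// where each row holds the attention distribution P(s|t) over source positions.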
private:

View File

@ -1,5 +1,7 @@
#include "output_printer.h"
#include <sstream>
namespace marian {
std::string OutputPrinter::getAlignment(const Hypothesis::PtrType& hyp) {
@ -19,11 +21,18 @@ std::string OutputPrinter::getAlignment(const Hypothesis::PtrType& hyp) {
} else if(alignment_ == "hard") {
return data::ConvertSoftAlignToHardAlign(align, 1.f).toString();
} else if(alignmentThreshold_ > 0.f) {
return data::ConvertSoftAlignToHardAlign(align, alignmentThreshold_).toString();
} else {
ABORT("Unrecognized word alignment type");
}
}
std::string OutputPrinter::getWordScores(const Hypothesis::PtrType& hyp) {
std::ostringstream scores;
scores.precision(5);
for(const auto& score : hyp->tracebackWordScores())
scores << " " << std::fixed << score;
return scores.str();
}
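// Illustrative output (assumed scores): for word-level scores {-0.5, -0.7, -0.6} the
// function above yields " -0.50000 -0.70000 -0.60000" -- one fixed-precision value
// per target word, each preceded by a single space.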
} // namespace marian

View File

@ -20,12 +20,14 @@ public:
? options->get<size_t>("beam-size")
: 0),
alignment_(options->get<std::string>("alignment", "")),
alignmentThreshold_(getAlignmentThreshold(alignment_)),
wordScores_(options->get<bool>("word-scores")) {}
template <class OStream>
void print(Ptr<const History> history, OStream& best1, OStream& bestn) {
const auto& nbl = history->nBest(nbest_);
// prepare n-best list output
for(size_t i = 0; i < nbl.size(); ++i) {
const auto& result = nbl[i];
const auto& hypo = std::get<1>(result);
@ -40,6 +42,9 @@ public:
if(!alignment_.empty())
bestn << " ||| " << getAlignment(hypo);
if(wordScores_)
bestn << " ||| WordScores=" << getWordScores(hypo);
bestn << " |||";
if(hypo->getScoreBreakdown().empty()) {
bestn << " F0=" << hypo->getPathScore();
@ -72,17 +77,26 @@ public:
best1 << " ||| " << getAlignment(hypo);
}
if(wordScores_) {
const auto& hypo = std::get<1>(result);
best1 << " ||| WordScores=" << getWordScores(hypo);
}
best1 << std::flush;
}
private:
Ptr<Vocab const> vocab_;
bool reverse_{false}; // If it is a right-to-left model that needs reversed word order
size_t nbest_{0}; // Size of the n-best list to print
std::string alignment_; // A non-empty string indicates the type of word alignment
float alignmentThreshold_{0.f}; // Threshold for converting attention into hard word alignment
bool wordScores_{false}; // Whether to print word-level scores or not
// Get word alignment pairs or soft alignment
std::string getAlignment(const Hypothesis::PtrType& hyp);
// Get word-level scores
std::string getWordScores(const Hypothesis::PtrType& hyp);
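// Illustrative n-best line with word scores enabled (values assumed, not from this diff):
//   0 ||| ein Beispiel ||| WordScores= -0.50000 -0.70000 ||| F0=-1.2
// i.e. the WordScores field is inserted before the feature-score fields written in print() above.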
float getAlignmentThreshold(const std::string& str) {
try {

View File

@ -175,6 +175,7 @@ private:
std::vector<Ptr<Vocab>> srcVocabs_;
Ptr<Vocab> trgVocab_;
Ptr<const data::ShortlistGenerator> shortlistGenerator_;
size_t numDevices_;
@ -199,6 +200,11 @@ public:
trgVocab_ = New<Vocab>(options_, vocabPaths.size() - 1);
trgVocab_->load(vocabPaths.back());
// load lexical shortlist
if(options_->hasAndNotEmpty("shortlist"))
shortlistGenerator_ = New<data::LexicalShortlistGenerator>(
options_, srcVocabs_.front(), trgVocab_, 0, 1, vocabPaths.front() == vocabPaths.back());
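// A minimal usage sketch (CLI values assumed, not taken from this diff): starting the
// server with something like
//   ./marian-server -m model.npz -v vocab.spm vocab.spm --shortlist lex.s2t 100 100 -p 8080
// takes this branch; srcIdx = 0 and trgIdx = 1 pick the vocabularies to map between, and
// the final argument marks a shared vocabulary when source and target vocab paths match.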
// get device IDs
auto devices = Config::getDevices(options_);
numDevices_ = devices.size();
@ -218,8 +224,11 @@ public:
graphs_.push_back(graph);
auto scorers = createScorers(options_);
for(auto scorer : scorers) {
scorer->init(graph);
if(shortlistGenerator_)
scorer->setShortlistGenerator(shortlistGenerator_);
}
scorers_.push_back(scorers);
}
}

View File

@ -1,7 +1,7 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 15
VisualStudioVersion = 15.0.28307.902
MinimumVisualStudioVersion = 10.0.40219.1
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "Marian", "Marian.vcxproj", "{E2F320FE-0C01-4C80-810C-3A92205A29DC}"
EndProject
@ -20,6 +20,6 @@ Global
HideSolutionNode = FALSE
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {3B922907-3384-4D39-9CEB-816BF7BB390D}
EndGlobalSection
EndGlobal

View File

@ -43,14 +43,14 @@
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<LinkIncremental>true</LinkIncremental>
<IntDir>$(SolutionDir)$(Platform)\$(Configuration)\Marian\</IntDir>
<IncludePath>$(CudaToolkitIncludeDir);..\src\3rd_party\fbgemm\third_party\googletest\googletest;..\src\3rd_party\fbgemm\third_party\googletest\googletest\include;..\src\3rd_party\fbgemm\third_party\cpuinfo\deps\clog\include;..\src\3rd_party\fbgemm\third_party\cpuinfo\src;..\src\3rd_party\fbgemm\third_party\cpuinfo\include;..\src\3rd_party\fbgemm;..\src\3rd_party\fbgemm\third_party\asmjit\src;%MKL_PATH%\include;..\src\3rd_party\fbgemm\include;..\src;..\src\3rd_party;%BOOST_INCLUDE_PATH%;%ZLIB_PATH%\include;$(VC_IncludePath);$(WindowsSDK_IncludePath)</IncludePath>
<LibraryPath>$(CudaToolkitLibDir);%BOOST_LIB_PATH%;%ZLIB_PATH%\lib;%MKL_PATH%\lib\intel64;$(VC_LibraryPath_x64);$(WindowsSDK_LibraryPath_x64);$(NETFXKitsDir)Lib\um\x64</LibraryPath>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<LinkIncremental>false</LinkIncremental>
<ExecutablePath>$(ExecutablePath)</ExecutablePath>
<IntDir>$(SolutionDir)$(Platform)\$(Configuration)\Marian\</IntDir>
<IncludePath>$(CudaToolkitIncludeDir);..\src\3rd_party\fbgemm\third_party\googletest\googletest;..\src\3rd_party\fbgemm\third_party\googletest\googletest\include;..\src\3rd_party\fbgemm\third_party\cpuinfo\deps\clog\include;..\src\3rd_party\fbgemm\third_party\cpuinfo\src;..\src\3rd_party\fbgemm\third_party\cpuinfo\include;..\src\3rd_party\fbgemm;..\src\3rd_party\fbgemm\third_party\asmjit\src;%MKL_PATH%\include;..\src\3rd_party\fbgemm\include;..\src;..\src\3rd_party;%BOOST_INCLUDE_PATH%;%ZLIB_PATH%\include;$(VC_IncludePath);$(WindowsSDK_IncludePath)</IncludePath>
<LibraryPath>$(CudaToolkitLibDir);%BOOST_LIB_PATH%;%ZLIB_PATH%\lib;%MKL_PATH%\lib\intel64;$(VC_LibraryPath_x64);$(WindowsSDK_LibraryPath_x64);$(NETFXKitsDir)Lib\um\x64</LibraryPath>
</PropertyGroup>
<ItemDefinitionGroup>
@ -70,7 +70,7 @@
</PrecompiledHeader>
<WarningLevel>Level4</WarningLevel>
<Optimization>Disabled</Optimization>
<PreprocessorDefinitions>USE_MKL;ASMJIT_EXPORTS;BOOST_CONFIG_SUPPRESS_OUTDATED_MESSAGE; FBGEMM_EXPORTS; USE_FBGEMM=1; USE_SSE2=1; CUDA_FOUND=1; MKL_FOUND=1; MPI_FOUND=1; BLAS_FOUND=1; MKL_ILP64; WIN32;_DEBUG;_CONSOLE;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<SDLCheck>false</SDLCheck>
<TreatWarningAsError>true</TreatWarningAsError>
<AdditionalOptions>/bigobj /arch:AVX %(AdditionalOptions)</AdditionalOptions>
@ -107,7 +107,7 @@
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>USE_MKL;ASMJIT_EXPORTS;BOOST_CONFIG_SUPPRESS_OUTDATED_MESSAGE; FBGEMM_EXPORTS; USE_FBGEMM=1; USE_SSE2=1; CUDA_FOUND=1; MKL_FOUND=1; MPI_FOUND=1; BLAS_FOUND=1; MKL_ILP64; WIN32;NDEBUG;_CONSOLE;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<SDLCheck>false</SDLCheck>
<FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
<AdditionalOptions>/d2Zi+ /bigobj /arch:AVX %(AdditionalOptions)</AdditionalOptions>
@ -141,6 +141,102 @@
</ItemDefinitionGroup>
<ItemGroup>
<ClCompile Include="..\src\3rd_party\ExceptionWithCallStack.cpp" />
<ClCompile Include="..\src\3rd_party\fbgemm\bench\BenchUtils.cc">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\bench\ConvUnifiedBenchmark.cc">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\bench\Depthwise3DBenchmark.cc">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\bench\DepthwiseBenchmark.cc">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\bench\FP16Benchmark.cc">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\bench\GEMMsBenchmark.cc">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\bench\GEMMsTunableBenchmark.cc">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\bench\GroupwiseConvRequantizeBenchmark.cc">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\bench\I8SpmdmBenchmark.cc">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\bench\Im2ColFusedRequantizeBenchmark.cc">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\bench\PackedFloatInOutBenchmark.cc">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\bench\PackedRequantizeAcc16Benchmark.cc">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\bench\PackedRequantizeAcc32Benchmark.cc">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\bench\RequantizeBenchmark.cc">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\bench\RowOffsetBenchmark.cc">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\src\codegen_fp16fp32.cc">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\src\ExecuteKernel.cc">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</TreatWarningAsError>
@ -177,6 +273,12 @@
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\src\FbgemmI8Depthwise3DAvx2.cc">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\src\FbgemmI8DepthwiseAvx2.cc">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</TreatWarningAsError>
@ -201,6 +303,12 @@
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\src\GenerateKernelU8S8S32ACC16Avx512VNNI.cc">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\src\GenerateKernelU8S8S32ACC32.cc">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</TreatWarningAsError>
@ -213,6 +321,12 @@
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\src\GenerateKernelU8S8S32ACC32Avx512VNNI.cc">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\src\GroupwiseConvAcc32Avx2.cc">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</TreatWarningAsError>
@ -255,6 +369,12 @@
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\src\PackDepthwiseConvMatrixAvx2.cc">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\src\PackMatrix.cc">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</TreatWarningAsError>
@ -309,153 +429,253 @@
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\arch.cpp">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<ClCompile Include="..\src\3rd_party\fbgemm\test\FP16Test.cc">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\assembler.cpp">
<ClCompile Include="..\src\3rd_party\fbgemm\test\GConvTest.cc">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\test\I8DepthwiseTest.cc">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\test\I8SpmdmTest.cc">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\test\Im2ColFusedRequantizeTest.cc">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\test\PackedRequantizeAcc16Test.cc">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\test\PackedRequantizeTest.cc">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\test\QuantizationHelpers.cc">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\test\QuantUtilsTest.cc">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\test\RequantizeOnlyTest.cc">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\test\TestUtils.cc">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\test\UniConvTest.cc">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\arch.cpp">
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\assembler.cpp">
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\builder.cpp">
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\callconv.cpp">
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\codeholder.cpp">
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\compiler.cpp">
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\constpool.cpp">
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\cpuinfo.cpp">
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\emitter.cpp">
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\func.cpp">
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\globals.cpp">
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\inst.cpp">
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\jitallocator.cpp">
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\jitruntime.cpp">
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\logging.cpp">
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\operand.cpp">
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\osutils.cpp">
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\ralocal.cpp">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\codebuilder.cpp">
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\rapass.cpp">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\codecompiler.cpp">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\rastack.cpp">
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\codeemitter.cpp">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\string.cpp">
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\codeholder.cpp">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\support.cpp">
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\constpool.cpp">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\target.cpp">
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\cpuinfo.cpp">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\type.cpp">
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\func.cpp">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\virtmem.cpp">
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\globals.cpp">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\zone.cpp">
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\inst.cpp">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\zonehash.cpp">
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\logging.cpp">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\zonelist.cpp">
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\operand.cpp">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\zonestack.cpp">
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\osutils.cpp">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\zonetree.cpp">
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\regalloc.cpp">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\runtime.cpp">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\string.cpp">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\utils.cpp">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\vmem.cpp">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\zone.cpp">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\zonevector.cpp">
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86assembler.cpp">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86builder.cpp">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86callconv.cpp">
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86compiler.cpp">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86inst.cpp">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86features.cpp">
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86instimpl.cpp">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86instapi.cpp">
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86instdb.cpp">
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
@ -472,20 +692,10 @@
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86operand.cpp">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86operand_regs.cpp">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86regalloc.cpp">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86rapass.cpp">
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
@ -579,6 +789,11 @@
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\googletest\googletest\src\gtest-all.cc" />
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\googletest\googletest\src\gtest_main.cc">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</ClCompile>
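<!-- gtest-all.cc builds as part of the project, while gtest_main.cc is marked
     ExcludedFromBuild in both configurations; Marian provides its own entry
     point, so googletest's default main() is presumably not needed here. -->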
<ClCompile Include="..\src\3rd_party\pathie-cpp\src\entry_iterator.cpp" />
<ClCompile Include="..\src\3rd_party\pathie-cpp\src\errors.cpp" />
<ClCompile Include="..\src\3rd_party\pathie-cpp\src\path.cpp" />
@ -586,6 +801,10 @@
<ClCompile Include="..\src\3rd_party\pathie-cpp\src\pathie_ifstream.cpp" />
<ClCompile Include="..\src\3rd_party\pathie-cpp\src\pathie_ofstream.cpp" />
<ClCompile Include="..\src\3rd_party\pathie-cpp\src\temp.cpp" />
<ClCompile Include="..\src\3rd_party\phf\phf.cc">
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
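<!-- phf.cc (perfect hash functions) compiles with warnings silenced like the
     other third-party sources; it appears to back the FastOpt implementation
     (fastopt.cpp / fastopt.h) referenced elsewhere in this project file. -->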
<ClCompile Include="..\src\3rd_party\sentencepiece\src\bpe_model.cc">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
@ -791,6 +1010,8 @@
<ClInclude Include="..\src\3rd_party\any_type.h" />
<ClInclude Include="..\src\3rd_party\avx_mathfun.h" />
<ClInclude Include="..\src\3rd_party\ExceptionWithCallStack.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\bench\AlignedVec.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\bench\BenchUtils.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\include\fbgemm\ConvUtils.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\include\fbgemm\Fbgemm.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\include\fbgemm\FbgemmBuild.h" />
@ -804,56 +1025,77 @@
<ClInclude Include="..\src\3rd_party\fbgemm\include\fbgemm\Types.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\include\fbgemm\Utils.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\include\fbgemm\UtilsAvx2.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\src\CodeCache.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\src\ExecuteKernel.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\src\ExecuteKernelGeneric.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\src\ExecuteKernelU8S8.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\src\FbgemmFP16UKernelsAvx2.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\src\FbgemmI8DepthwiseAvx2-inl.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\src\GenerateKernel.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\src\GroupwiseConv.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\src\OptimizedKernelsAvx2.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\src\RefImplementations.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\src\TransposeUtils.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\arm.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\test\QuantizationHelpers.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\test\TestUtils.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\asmjit.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\asmjit_apibegin.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\asmjit_apiend.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\asmjit_build.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\arch.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\assembler.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\codebuilder.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\codecompiler.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\codeemitter.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\codeholder.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\constpool.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\cpuinfo.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\func.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\globals.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\inst.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\logging.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\misc_p.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\operand.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\osutils.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\regalloc_p.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\runtime.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\simdtypes.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\string.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\utils.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\vmem.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\zone.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\arch.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\assembler.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\build.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\builder.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\callconv.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\codebufferwriter_p.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\codeholder.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\compiler.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\constpool.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\cpuinfo.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\datatypes.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\emitter.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\features.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\func.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\globals.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\inst.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\jitallocator.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\jitruntime.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\logging.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\misc_p.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\operand.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\osutils.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\raassignment_p.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\rabuilders_p.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\radefs_p.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\ralocal_p.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\rapass_p.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\rastack_p.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\string.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\support.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\target.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\type.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\virtmem.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\zone.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\zonehash.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\zonelist.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\zonestack.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\zonestring.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\zonetree.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\zonevector.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86assembler.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86builder.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86callconv_p.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86compiler.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86emitter.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86features.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86globals.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86inst.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86instimpl_p.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86instapi_p.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86instdb.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86instdb_p.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86internal_p.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86logging_p.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86misc.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86opcode_p.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86operand.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86regalloc_p.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86rapass_p.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\cpuinfo\deps\clog\include\clog.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\cpuinfo\include\cpuinfo-mock.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\cpuinfo\include\cpuinfo.h" />
@ -864,6 +1106,7 @@
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\cpuinfo\src\x86\api.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\cpuinfo\src\x86\cpuid.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\cpuinfo\src\x86\windows\api.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\googletest\googletest\src\gtest-internal-inl.h" />
<ClInclude Include="..\src\3rd_party\half_float\umHalf.h" />
<ClInclude Include="..\src\3rd_party\nccl\src\collectives\collectives.h">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
@ -962,6 +1205,7 @@
<ClInclude Include="..\src\3rd_party\pathie-cpp\include\pathie_ifstream.hpp" />
<ClInclude Include="..\src\3rd_party\pathie-cpp\include\pathie_ofstream.hpp" />
<ClInclude Include="..\src\3rd_party\pathie-cpp\include\temp.hpp" />
<ClInclude Include="..\src\3rd_party\phf\phf.h" />
<ClInclude Include="..\src\3rd_party\sentencepiece\src\bpe_model.h">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
@ -1094,6 +1338,7 @@
<ClCompile Include="..\src\common\cli_helper.cpp" />
<ClCompile Include="..\src\common\cli_wrapper.cpp" />
<ClCompile Include="..\src\common\config_validator.cpp" />
<ClCompile Include="..\src\common\fastopt.cpp" />
<ClCompile Include="..\src\common\filesystem.cpp" />
<ClCompile Include="..\src\common\file_stream.cpp" />
<ClCompile Include="..\src\common\io.cpp" />
@ -1136,13 +1381,10 @@
<ClCompile Include="..\src\rescorer\score_collector.cpp" />
<ClCompile Include="..\src\tensors\backend.cpp" />
<ClCompile Include="..\src\tensors\cpu\device.cpp" />
<ClCompile Include="..\src\tensors\cpu\fbgemm\packed_gemm.cpp" />
<ClCompile Include="..\src\tensors\cpu\prod.cpp" />
<ClCompile Include="..\src\tensors\cpu\sharp\avx_gemm.cpp" />
<ClCompile Include="..\src\tensors\cpu\sharp\int_gemm.cpp" />
<ClCompile Include="..\src\tensors\cpu\sharp\packed_gemm.cpp">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
</ClCompile>
<ClCompile Include="..\src\tensors\cpu\sharp\sse_gemm.cpp" />
<ClCompile Include="..\src\tensors\cpu\tensor_operators.cpp" />
<ClCompile Include="..\src\graph\expression_graph.cpp" />
@ -1251,6 +1493,7 @@
<ClInclude Include="..\src\common\cli_helper.h" />
<ClInclude Include="..\src\common\cli_wrapper.h" />
<ClInclude Include="..\src\common\config_validator.h" />
<ClInclude Include="..\src\common\fastopt.h" />
<ClInclude Include="..\src\common\filesystem.h" />
<ClInclude Include="..\src\common\hash.h" />
<ClInclude Include="..\src\common\io.h" />
@ -1267,7 +1510,6 @@
<ClInclude Include="..\src\examples\mnist\validator.h" />
<ClInclude Include="..\src\functional\approx.h" />
<ClInclude Include="..\src\functional\operators.h" />
<ClInclude Include="..\src\graph\expression_graph_packable.h" />
<ClInclude Include="..\src\layers\loss.h" />
<ClInclude Include="..\src\layers\weight.h" />
<ClInclude Include="..\src\marian.h" />
@ -1487,9 +1729,10 @@
<ClInclude Include="..\src\rnn\types.h" />
<ClInclude Include="..\src\tensors\allocator.h" />
<ClInclude Include="..\src\tensors\backend.h" />
<ClInclude Include="..\src\tensors\cpu\expanded_gemm.h" />
<ClInclude Include="..\src\tensors\cpu\fbgemm\expanded_gemm.h" />
<ClInclude Include="..\src\tensors\cpu\fbgemm\expression_graph_packable.h" />
<ClInclude Include="..\src\tensors\cpu\fbgemm\packed_gemm.h" />
<ClInclude Include="..\src\tensors\cpu\sharp\int_gemm.h" />
<ClInclude Include="..\src\tensors\cpu\sharp\packed_gemm.h" />
<ClInclude Include="..\src\tensors\device.h" />
<ClInclude Include="..\src\tensors\dispatch.h" />
<ClInclude Include="..\src\tensors\gpu\add.h" />

View File

@ -490,9 +490,6 @@
<ClCompile Include="..\src\tensors\gpu\prod.cpp">
<Filter>tensors\gpu</Filter>
</ClCompile>
<ClCompile Include="..\src\tensors\cpu\sharp\packed_gemm.cpp">
<Filter>tensors\cpu\sharp</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\src\ExecuteKernel.cc">
<Filter>3rd_party\fbgemm\src</Filter>
</ClCompile>
@ -616,19 +613,127 @@
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\cpuinfo\src\x86\cache\init.c">
<Filter>3rd_party\fbgemm\third_party\cpuinfo\src\x86\cache</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\cpuinfo\deps\clog\src\clog.c">
<Filter>3rd_party\fbgemm\third_party\cpuinfo\deps\clog\src</Filter>
</ClCompile>
<ClCompile Include="..\src\common\aliases.cpp">
<Filter>common</Filter>
</ClCompile>
<ClCompile Include="..\src\common\filesystem.cpp">
<Filter>common</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\arch.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\assembler.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\builder.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\callconv.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\codeholder.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\compiler.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\constpool.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\cpuinfo.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\emitter.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\func.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\globals.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\inst.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\jitallocator.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\jitruntime.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\logging.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\operand.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\osutils.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\ralocal.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\rapass.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\rastack.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\string.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\support.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\target.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\type.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\virtmem.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\zone.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\zonehash.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\zonelist.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\zonestack.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\zonetree.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\zonevector.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86assembler.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86builder.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86callconv.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86compiler.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86inst.cpp">
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86features.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86instimpl.cpp">
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86instapi.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86instdb.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86internal.cpp">
@ -640,74 +745,110 @@
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86operand.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86operand_regs.cpp">
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86rapass.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86regalloc.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86</Filter>
<ClCompile Include="..\src\3rd_party\fbgemm\src\codegen_fp16fp32.cc">
<Filter>3rd_party\fbgemm\src</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\arch.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
<ClCompile Include="..\src\3rd_party\fbgemm\src\FbgemmI8Depthwise3DAvx2.cc">
<Filter>3rd_party\fbgemm\src</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\assembler.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
<ClCompile Include="..\src\3rd_party\fbgemm\src\GenerateKernelU8S8S32ACC16Avx512VNNI.cc">
<Filter>3rd_party\fbgemm\src</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\codebuilder.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
<ClCompile Include="..\src\3rd_party\fbgemm\src\GenerateKernelU8S8S32ACC32Avx512VNNI.cc">
<Filter>3rd_party\fbgemm\src</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\codecompiler.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
<ClCompile Include="..\src\3rd_party\fbgemm\src\PackDepthwiseConvMatrixAvx2.cc">
<Filter>3rd_party\fbgemm\src</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\codeemitter.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
<ClCompile Include="..\src\3rd_party\fbgemm\test\FP16Test.cc">
<Filter>3rd_party\fbgemm\test</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\codeholder.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
<ClCompile Include="..\src\3rd_party\fbgemm\test\GConvTest.cc">
<Filter>3rd_party\fbgemm\test</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\constpool.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
<ClCompile Include="..\src\3rd_party\fbgemm\test\I8DepthwiseTest.cc">
<Filter>3rd_party\fbgemm\test</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\cpuinfo.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
<ClCompile Include="..\src\3rd_party\fbgemm\test\I8SpmdmTest.cc">
<Filter>3rd_party\fbgemm\test</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\func.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
<ClCompile Include="..\src\3rd_party\fbgemm\test\Im2ColFusedRequantizeTest.cc">
<Filter>3rd_party\fbgemm\test</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\globals.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
<ClCompile Include="..\src\3rd_party\fbgemm\test\PackedRequantizeAcc16Test.cc">
<Filter>3rd_party\fbgemm\test</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\inst.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
<ClCompile Include="..\src\3rd_party\fbgemm\test\PackedRequantizeTest.cc">
<Filter>3rd_party\fbgemm\test</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\logging.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
<ClCompile Include="..\src\3rd_party\fbgemm\test\QuantizationHelpers.cc">
<Filter>3rd_party\fbgemm\test</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\operand.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
<ClCompile Include="..\src\3rd_party\fbgemm\test\QuantUtilsTest.cc">
<Filter>3rd_party\fbgemm\test</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\osutils.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
<ClCompile Include="..\src\3rd_party\fbgemm\test\RequantizeOnlyTest.cc">
<Filter>3rd_party\fbgemm\test</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\regalloc.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
<ClCompile Include="..\src\3rd_party\fbgemm\test\TestUtils.cc">
<Filter>3rd_party\fbgemm\test</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\runtime.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
<ClCompile Include="..\src\3rd_party\fbgemm\test\UniConvTest.cc">
<Filter>3rd_party\fbgemm\test</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\string.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
<ClCompile Include="..\src\3rd_party\fbgemm\bench\BenchUtils.cc">
<Filter>3rd_party\fbgemm\bench</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\utils.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
<ClCompile Include="..\src\3rd_party\fbgemm\bench\ConvUnifiedBenchmark.cc">
<Filter>3rd_party\fbgemm\bench</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\vmem.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
<ClCompile Include="..\src\3rd_party\fbgemm\bench\Depthwise3DBenchmark.cc">
<Filter>3rd_party\fbgemm\bench</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\zone.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
<ClCompile Include="..\src\3rd_party\fbgemm\bench\DepthwiseBenchmark.cc">
<Filter>3rd_party\fbgemm\bench</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\cpuinfo\deps\clog\src\clog.c">
<Filter>3rd_party\fbgemm\third_party\cpuinfo\deps\clog\src</Filter>
<ClCompile Include="..\src\3rd_party\fbgemm\bench\FP16Benchmark.cc">
<Filter>3rd_party\fbgemm\bench</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\bench\GEMMsBenchmark.cc">
<Filter>3rd_party\fbgemm\bench</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\bench\GEMMsTunableBenchmark.cc">
<Filter>3rd_party\fbgemm\bench</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\bench\GroupwiseConvRequantizeBenchmark.cc">
<Filter>3rd_party\fbgemm\bench</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\bench\I8SpmdmBenchmark.cc">
<Filter>3rd_party\fbgemm\bench</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\bench\Im2ColFusedRequantizeBenchmark.cc">
<Filter>3rd_party\fbgemm\bench</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\bench\PackedFloatInOutBenchmark.cc">
<Filter>3rd_party\fbgemm\bench</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\bench\PackedRequantizeAcc16Benchmark.cc">
<Filter>3rd_party\fbgemm\bench</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\bench\PackedRequantizeAcc32Benchmark.cc">
<Filter>3rd_party\fbgemm\bench</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\bench\RequantizeBenchmark.cc">
<Filter>3rd_party\fbgemm\bench</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\bench\RowOffsetBenchmark.cc">
<Filter>3rd_party\fbgemm\bench</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\googletest\googletest\src\gtest_main.cc">
<Filter>3rd_party\fbgemm\third_party\googletest\googletest\src</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\googletest\googletest\src\gtest-all.cc">
<Filter>3rd_party\fbgemm\third_party\googletest\googletest\src</Filter>
</ClCompile>
<ClCompile Include="..\src\common\aliases.cpp">
<Filter>common</Filter>
@ -733,6 +874,15 @@
<ClCompile Include="..\src\common\types.cpp">
<Filter>common</Filter>
</ClCompile>
<ClCompile Include="..\src\common\fastopt.cpp">
<Filter>common</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\phf\phf.cc">
<Filter>3rd_party\phf</Filter>
</ClCompile>
<ClCompile Include="..\src\tensors\cpu\fbgemm\packed_gemm.cpp">
<Filter>tensors\cpu\fbgemm</Filter>
</ClCompile>
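<!-- The filter mappings above place fastopt.cpp under "common", phf.cc under
     "3rd_party\phf", and packed_gemm.cpp under "tensors\cpu\fbgemm" in the
     Visual Studio solution explorer. -->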
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\src\marian.h" />
@ -1798,12 +1948,6 @@
<ClInclude Include="..\src\tensors\gpu\add.inc">
<Filter>tensors\gpu</Filter>
</ClInclude>
<ClInclude Include="..\src\tensors\cpu\expanded_gemm.h">
<Filter>tensors\cpu</Filter>
</ClInclude>
<ClInclude Include="..\src\tensors\cpu\sharp\packed_gemm.h">
<Filter>tensors\cpu\sharp</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\include\fbgemm\ConvUtils.h">
<Filter>3rd_party\fbgemm\include\fbgemm</Filter>
</ClInclude>
@ -1897,46 +2041,163 @@
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\cpuinfo\src\cpuinfo\utils.h">
<Filter>3rd_party\fbgemm\third_party\cpuinfo\src\cpuinfo</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\arm.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\asmjit.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\asmjit_apibegin.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\asmjit_apiend.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\asmjit_build.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\cpuinfo\deps\clog\include\clog.h">
<Filter>3rd_party\fbgemm\third_party\cpuinfo\deps\clog\include</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\arch.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\assembler.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\build.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\builder.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\callconv.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\codebufferwriter_p.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\codeholder.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\compiler.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\constpool.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\cpuinfo.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\datatypes.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\emitter.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\features.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\func.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\globals.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\inst.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\jitallocator.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\jitruntime.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\logging.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\misc_p.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\operand.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\osutils.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\raassignment_p.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\rabuilders_p.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\radefs_p.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\ralocal_p.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\rapass_p.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\rastack_p.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\string.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\support.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\target.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\type.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\virtmem.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\zone.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\zonehash.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\zonelist.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\zonestack.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\zonestring.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\zonetree.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\zonevector.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86assembler.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86builder.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86callconv_p.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86compiler.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86emitter.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86features.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86globals.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86inst.h">
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86instapi_p.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86instimpl_p.h">
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86instdb.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86instdb_p.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86internal_p.h">
@ -1945,83 +2206,35 @@
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86logging_p.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86misc.h">
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86opcode_p.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86operand.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86regalloc_p.h">
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86rapass_p.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\arch.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
<ClInclude Include="..\src\3rd_party\fbgemm\src\CodeCache.h">
<Filter>3rd_party\fbgemm\src</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\assembler.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
<ClInclude Include="..\src\3rd_party\fbgemm\src\FbgemmI8DepthwiseAvx2-inl.h">
<Filter>3rd_party\fbgemm\src</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\codebuilder.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
<ClInclude Include="..\src\3rd_party\fbgemm\test\QuantizationHelpers.h">
<Filter>3rd_party\fbgemm\test</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\codecompiler.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
<ClInclude Include="..\src\3rd_party\fbgemm\test\TestUtils.h">
<Filter>3rd_party\fbgemm\test</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\codeemitter.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
<ClInclude Include="..\src\3rd_party\fbgemm\bench\AlignedVec.h">
<Filter>3rd_party\fbgemm\bench</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\codeholder.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
<ClInclude Include="..\src\3rd_party\fbgemm\bench\BenchUtils.h">
<Filter>3rd_party\fbgemm\bench</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\constpool.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\cpuinfo.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\func.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\globals.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\inst.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\logging.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\misc_p.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\operand.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\osutils.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\regalloc_p.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\runtime.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\simdtypes.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\string.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\utils.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\vmem.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\zone.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\cpuinfo\deps\clog\include\clog.h">
<Filter>3rd_party\fbgemm\third_party\cpuinfo\deps\clog\include</Filter>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\googletest\googletest\src\gtest-internal-inl.h">
<Filter>3rd_party\fbgemm\third_party\googletest\googletest\src</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\half_float\umHalf.h">
<Filter>3rd_party\half_float</Filter>
@ -2047,8 +2260,23 @@
<ClInclude Include="..\src\3rd_party\zstr\zstr.hpp">
<Filter>3rd_party</Filter>
</ClInclude>
<ClInclude Include="..\src\graph\expression_graph_packable.h">
<Filter>graph</Filter>
<ClInclude Include="..\src\common\fastopt.h">
<Filter>common</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\phf\phf.h">
<Filter>3rd_party\phf</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit</Filter>
</ClInclude>
<ClInclude Include="..\src\tensors\cpu\fbgemm\expanded_gemm.h">
<Filter>tensors\cpu\fbgemm</Filter>
</ClInclude>
<ClInclude Include="..\src\tensors\cpu\fbgemm\expression_graph_packable.h">
<Filter>tensors\cpu\fbgemm</Filter>
</ClInclude>
<ClInclude Include="..\src\tensors\cpu\fbgemm\packed_gemm.h">
<Filter>tensors\cpu\fbgemm</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
@ -2265,9 +2493,6 @@
<Filter Include="3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86">
<UniqueIdentifier>{5818c959-7963-4d8e-9e87-b61f340476c2}</UniqueIdentifier>
</Filter>
<Filter Include="3rd_party\fbgemm\third_party\asmjit\src\asmjit\base">
<UniqueIdentifier>{15414ec0-8761-4068-afef-822b7bed88df}</UniqueIdentifier>
</Filter>
<Filter Include="3rd_party\fbgemm\third_party\cpuinfo\deps">
<UniqueIdentifier>{d4505c8d-5e6e-4baf-8525-dc59ae8b6415}</UniqueIdentifier>
</Filter>
@ -2280,9 +2505,33 @@
<Filter Include="3rd_party\fbgemm\third_party\cpuinfo\deps\clog\src">
<UniqueIdentifier>{8fd74b1e-d3c1-4158-ad46-4a447222934e}</UniqueIdentifier>
</Filter>
<Filter Include="3rd_party\fbgemm\third_party\asmjit\src\asmjit\core">
<UniqueIdentifier>{b3b34c5f-5b98-436a-b34c-11e2dccb7ea2}</UniqueIdentifier>
</Filter>
<Filter Include="3rd_party\fbgemm\test">
<UniqueIdentifier>{40576dca-07d5-4904-8119-ffbc982451a3}</UniqueIdentifier>
</Filter>
<Filter Include="3rd_party\fbgemm\bench">
<UniqueIdentifier>{9f11c8f1-78f7-47c6-9eac-34cd2c6cd909}</UniqueIdentifier>
</Filter>
<Filter Include="3rd_party\fbgemm\third_party\googletest">
<UniqueIdentifier>{75f9df88-0eb1-4d9a-858e-4e0b8fc3aa8a}</UniqueIdentifier>
</Filter>
<Filter Include="3rd_party\fbgemm\third_party\googletest\googletest">
<UniqueIdentifier>{9f77e916-1d2f-4c15-9eba-46bcbddd2658}</UniqueIdentifier>
</Filter>
<Filter Include="3rd_party\fbgemm\third_party\googletest\googletest\src">
<UniqueIdentifier>{050ba410-c56a-4607-8401-935f58f598b5}</UniqueIdentifier>
</Filter>
<Filter Include="3rd_party\half_float">
<UniqueIdentifier>{defd3aec-3c56-4d70-a4bb-90ba9003d98d}</UniqueIdentifier>
</Filter>
<Filter Include="3rd_party\phf">
<UniqueIdentifier>{352ac0e9-daed-437a-bc36-fb85ecd037eb}</UniqueIdentifier>
</Filter>
<Filter Include="tensors\cpu\fbgemm">
<UniqueIdentifier>{bf361868-f451-45b8-9695-570d67924972}</UniqueIdentifier>
</Filter>
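<!-- Each <Filter> element declares a solution-explorer folder; the
     UniqueIdentifier GUIDs are arbitrary but must be unique within the
     project. -->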
</ItemGroup>
<ItemGroup>
<None Include="..\src\3rd_party\nccl\src\bootstrap.cu">