Merged PR 11831: Change the weight matrix quantization to use 7-bit min/max quantization to avoid overflow

1. Change the weight matrix quantization to use 7-bit min/max quantization
-> This resolves the overflow issues, because weights and activations are now quantized over their min/max range (a minimal sketch of the scheme follows this list).
2. Clip values during fp16 quantization to avoid overflow
3. Fix Windows build errors (cmake options, vcproj file)
4. int8-packed models: pack encoder weights in fp16 (only the decoder is quantized to int8)
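As a minimal sketch of the 7-bit min/max scheme (not Marian's actual FBGEMM call path; quantizeColumn7bit is a hypothetical helper), each weight column is mapped onto [-64, 63] so that the VPMADDUBSW accumulation cannot overflow:

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

// Quantize one (non-empty) column of floats to int8 over a 7-bit [-64, 63] range.
std::vector<int8_t> quantizeColumn7bit(const std::vector<float>& col,
                                       float& scale, int32_t& zeroPoint) {
  auto mm = std::minmax_element(col.begin(), col.end());
  float min = *mm.first, max = *mm.second;
  scale = (max - min) / 127.f;             // 127 steps across [-64, 63]
  if (scale == 0.f) scale = 1.f;           // guard against a constant column
  zeroPoint = (int32_t)(63 - max / scale); // map max near 63, min near -64
  std::vector<int8_t> out(col.size());
  for (size_t i = 0; i < col.size(); ++i) {
    int32_t q = (int32_t)std::lround(col[i] / scale) + zeroPoint;
    out[i] = (int8_t)std::max(-64, std::min(63, q)); // clamp to the 7-bit range
  }
  return out;
}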
Young Jin Kim 2020-03-25 02:52:17 +00:00
parent a5a5c62d4a
commit d2b4f3803e
7 changed files with 159 additions and 116 deletions

View File

@@ -841,11 +841,15 @@ Ptr<Options> ConfigParser::parseOptions(int argc, char** argv, bool doValidate){
auto buildInfo = get<std::string>("build-info");
if(!buildInfo.empty() && buildInfo != "false") {
#ifndef _MSC_VER // cmake build options are not available on MSVC based build.
if(buildInfo == "all")
std::cerr << cmakeBuildOptionsAdvanced() << std::endl;
else
std::cerr << cmakeBuildOptions() << std::endl;
exit(0);
#else // _MSC_VER
ABORT("build-info is not available on MSVC based build.");
#endif // _MSC_VER
}
// get paths to extra config files

View File

@@ -35,10 +35,13 @@ public:
Tensor val = p.second->val();
// save as packed format
// @TODO Hardcoded to find packable weights - all the weights used for affine op (fp16), all the weights used for affine op and dot op (int8)
// @TODO Hardcoded to find packable weights
// int8 - quantize decoder only for better quality, all the weights used for affine op and dot op (int8)
// fp16 - all the weights used for affine op (fp16)
if ((gemmElementType == Type::packed8avx2 || gemmElementType == Type::packed8avx512)
&& (pName.find("_W") == pName.length() - 3 || pName.find("_W") == pName.length() - 2)) {
#if USE_FBGEMM
&& (pName.find("_W") == pName.length() - 3 || pName.find("_W") == pName.length() - 2)
&& pName.find("encoder") == std::string::npos) {
#if USE_FBGEMM
using namespace marian::cpu::variant;
// packing information - size
int nrow;
@@ -82,7 +85,10 @@ public:
#else
ABORT("Packed type {} only supported when compiled with -DUSE_FBGEMM=on", gemmElementType);
#endif
} else if (gemmElementType == Type::packed16 && pName.find("_W") == pName.length() - 3) {
// fp16 quantization option + encoders for int8 quantized models
} else if ((gemmElementType == Type::packed16 && pName.find("_W") == pName.length() - 3)
|| ((gemmElementType == Type::packed8avx2 || gemmElementType == Type::packed8avx512)
&& (pName.find("_W") == pName.length() - 3 || pName.find("_W") == pName.length() - 2))) {
#if USE_FBGEMM
using namespace marian::cpu::variant;
@@ -123,7 +129,7 @@ public:
io::Item item;
item.name = pName;
item.shape = val->shape();
item.type = gemmElementType;
item.type = Type::packed16;
// Use the actual memory as this will be aligned and padded.
// When memory mapping this is required. Shape keeps track of

View File

@@ -76,22 +76,31 @@ const int PACK16_PADDING = 1024;
// This is a memory space to store auxiliary variables for FBGEMM (e.g. block row, block column, kernel_ncol_blocks and etc.)
const int PACK16_SPECIALMEM = 256;
// This is the maximum value of the FP16 type. There is a template type implementation, but it doesn't work on Windows.
// To keep results consistent, just use the constant value instead of an #ifdef _MSC_VER.
// Template type implementation: float FP16_MAX = NumericLimits<float>(Type::float16).max;
const float FP16_MAX = 65504.f;
// This function clips a value into a [min, max] range
inline float clip(float value, float min, float max) {
return std::max(min, std::min(value, max));
}
// This is copied from the FBGEMM code
// A better way?
// It will be removed when the FBGEMM API is changed
// blocked row-major format address arithmetic
/**
* Returns the memory address in the packed (block formatted) matrix array of a specific element
* indexed by the original non-packed array.
*
* @param r_ row index in the original matrix
* @param c_ column index in the original matrix
* @param brow_ row wide block index
* @param bcol_ column wide block index
* @param nbrow_ number of blocks in row
* @param nbcol_ number of blocks in column
* @param last_brow_ row number of the last block
*/
//
// Returns the memory address in the packed (block formatted) matrix array of a specific element
// indexed by the original non-packed array.
//
// @param r_ row index in the original matrix
// @param c_ column index in the original matrix
// @param brow_ row wide block index
// @param bcol_ column wide block index
// @param nbrow_ number of blocks in row
// @param nbcol_ number of blocks in column
// @param last_brow_ row number of the last block
inline uint64_t addr(const int r_,
const int c_,
const int brow_,
@@ -114,6 +123,15 @@ inline uint64_t addr(const int r_,
return index;
}
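To make the blocked layout concrete, here is a simplified, self-contained illustration (it assumes all blocks are full size; the real addr() above also handles a short last row block via last_brow_):

#include <cstdint>

inline uint64_t blockedAddr(uint64_t r, uint64_t c,
                            uint64_t brow, uint64_t bcol, uint64_t nbcol) {
  uint64_t blockIndex   = (r / brow) * nbcol + (c / bcol); // which block
  uint64_t inBlockIndex = (r % brow) * bcol + (c % bcol);  // offset inside it
  return blockIndex * (brow * bcol) + inBlockIndex;
}
// e.g. with 2x2 blocks over a 4x4 matrix (brow = bcol = 2, nbcol = 2),
// element (2, 3) lands in block 3 at in-block offset 1, i.e. address 13.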
// Returns a value in a 2D array given the row and column index (i, j) and a transposed flag.
// The number of rows and columns needs to be passed.
// The transposed flag indicates if the underlying data needs to be accessed in a transposed layout or not.
inline float getVal2dArr(const float* data, size_t i, size_t j, size_t rows, size_t cols, bool transposed) {
ABORT_IF(i >= rows, "Row index {} exceeds the number of rows {}.", i, rows);
ABORT_IF(j >= cols, "Column index {} exceeds the number of columns {}.", j, cols);
return transposed ? data[j * rows + i] : data[i * cols + j];
}
// Memory blocking factors (parameters) for packing into AVX2 int8
static const fbgemm::BlockingFactors Packed8Avx2BlockingFactors = {
PackingTraits<int8_t, int32_t, inst_set_t::avx2>::MR,
@@ -147,6 +165,12 @@ inline const fbgemm::BlockingFactors* getBlockingFactors(marian::Type packType)
}
}
// Returns the byte size of the packed matrix in fp16. It's calculated by fbgemm's internal logic due to the paddings and different layouts.
// Packing with fp16 only targets AVX2 instruction sets for now.
// See '3rd_party/fbgemm/include/fbgemm/FbgemmFP16.h'.
// shape: shape of the tensor to be packed
// transpose: the matrix is transposed
// packsize (out): the size of the packed matrix in bytes
void fbgemmPacked16PackInfo(const marian::Shape& shape,
const bool transpose,
uint64_t& packsize) {
@@ -154,6 +178,21 @@ void fbgemmPacked16PackInfo(const marian::Shape& shape,
fbgemmPacked16PackInfo(shape, transpose, nrow, ncol, kernel_ncol_blocks, brow, bcol, last_brow, nbrow, nbcol, packsize);
}
// Returns the byte size of the packed matrix in fp16. It's calculated by fbgemm's internal logic due to the paddings and different layouts.
// This overload also returns additional internal packing parameters via out arguments.
// Packing with fp16 only targets AVX2 instruction sets for now.
// See '3rd_party/fbgemm/include/fbgemm/FbgemmFP16.h'.
// shape: shape of the tensor to be packed
// transpose: the matrix is transposed
// nrow (out): the number of rows
// ncol (out): the number of columns
// kernel_ncol_blocks (out): the number of column blocks
// brow (out): the number of rows in a block
// bcol (out): the number of columns in a block
// last_brow (out): the number of rows in the last block
// nbrow (out): row index in a block
// nbcol (out): column index in a block
// packsize (out): the size of the packed matrix in bytes
void fbgemmPacked16PackInfo(const marian::Shape& shape,
const bool transpose,
int& nrow,
@@ -178,6 +217,14 @@ void fbgemmPacked16PackInfo(const marian::Shape& shape,
+ PACK16_SPECIALMEM;
}
// Returns the byte size of the packed matrix in int8. It's calculated by fbgemm's internal logic due to the paddings and different layouts.
// See '3rd_party/fbgemm/src/PackBMatrix.cc'.
// shape: shape of the tensor to be packed
// packType: Type to be packed - packed8avx2 or packed8avx512
// transpose: the matrix is transposed
// nrow (out): the number of rows
// ncol (out): the number of columns
// packsize (out): the size of the packed matrix in bytes
void fbgemmPacked8PackInfo(const marian::Shape& shape,
const marian::Type packType,
const bool transpose,
@@ -221,6 +268,20 @@ inline void col_offsets_with_zero_pt_s8acc32(
}
}
// Pack a matrix into a cache-utilization-efficient block format, converting the elements to fp16
// out: output tensor - packed format
// inData: input tensor data - pointer of float data
// transpose: the matrix is transposed
// nrow: the number of rows
// ncol: the number of columns
// kernel_ncol_blocks: the number of column blocks
// brow: the number of rows in a block
// bcol: the number of columns in a block
// last_brow: the number of rows in the last block
// nbrow: row index in a block
// nbcol: column index in a block
// packsize: the size of the packed matrix
// (the number of fp16 elements + padding (1024) + extra temporary memory (256))
void fbgemmPacked16Pack(marian::Tensor out,
const float* inData, // Packing is only available for 2D weight matrix in Marian. Otherwise, it's aborted in expanded_gemm.h.
const bool transpose,
@@ -258,20 +319,37 @@ void fbgemmPacked16Pack(marian::Tensor out,
// pack the matrix
for(int i = 0; i < nrow; i++) {
for(int j = 0; j < ncol; j++) {
outmem[addr(i, j, brow, bcol, nbrow, nbcol, last_brow)]
= tconv(!transpose ? inData[i * ncol + j] : inData[i + nrow * j], *dummy);
float src = clip(transpose ? inData[i + nrow * j] : inData[i * ncol + j], -FP16_MAX, FP16_MAX);
outmem[addr(i, j, brow, bcol, nbrow, nbcol, last_brow)] = tconv(src, *dummy);
}
}
delete dummy;
}
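To see why the new clip matters: the largest finite fp16 value is 65504, so converting any float outside [-65504, 65504] would yield +/-inf in the packed matrix. A minimal self-contained check (re-declaring clip() and FP16_MAX from above):

#include <algorithm>
#include <cassert>

const float FP16_MAX = 65504.f; // largest finite fp16 value
inline float clip(float value, float min, float max) {
  return std::max(min, std::min(value, max));
}

int main() {
  assert(clip(70000.f, -FP16_MAX, FP16_MAX) == 65504.f); // saturates instead of becoming inf
  assert(clip(-1.5f, -FP16_MAX, FP16_MAX) == -1.5f);     // in-range values pass through
}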
// Pack a matrix into a cache-utilization-efficient block format, quantizing the elements into int8
// out: output tensor - packed format and quantized into int8
// inData: input tensor data - pointer of float data
// packType: Type to be packed - packed8avx2 or packed8avx512
// transpose: the matrix is transposed
// nrow: the number of rows
// ncol: the number of columns
// packsize: the size of the packed matrix
// (the size of int8 packed B from fbgemm::PackAWithQuantRowOffset + quantization scale, offset and zero point)
// quantRangeStdDevs: the quantization range for the original float data, in multiples of the standard deviation;
// the default value is 0.0f, which means min/max quantization
// only half of the normal int8 range, [-64, 63], is used to avoid overflow
// during the accumulation in the VPMADDUBSW instruction
// https://intel.github.io/mkl-dnn/dev_guide_int8_computations.html
// (e.g. 3.f means the original tensor is quantized
// from [mean - 3.f * standard deviation, mean + 3.f * standard deviation] to [-64, 63])
void fbgemmPacked8Pack(marian::Tensor out,
const float* inData,
const marian::Type packType,
const bool transpose,
const int nrow,
const int ncol,
const uint64_t packsize) {
const uint64_t packsize,
const float quantRangeStdDevs) {
int k = nrow;
int n = ncol;
int len = k * n;
@@ -282,46 +360,43 @@ void fbgemmPacked8Pack(marian::Tensor out,
const float* data = inData;
float val = 0;
// Use half of the quantization range to prevent overflow of VPMADDUBSW
constexpr static int quantizedRange = 127;
constexpr static int quantizedMax = 63;
if (transpose) {
for (int jj = 0; jj < n; jj++) {
float min = std::numeric_limits<float>::max(), max = std::numeric_limits<float>::min();
double mean = 0, sqrsum = 0;
for (int ii = 0; ii < k; ii++) {
val = data[jj * k + ii];
mean += val;
sqrsum += val * val;
}
mean /= k;
sqrsum /= k;
sqrsum -= mean * mean;
sqrsum = sqrt(sqrsum);
min = (float)(mean - 7.0f*sqrsum);
max = (float)(mean + 7.0f*sqrsum);
bqScale[jj] = (max - min) / 255;
bqZeropoint[jj] = (int32_t)(127 - max / bqScale[jj]);
}
} else {
for (int jj = 0; jj < n; jj++) {
float min = std::numeric_limits<float>::max(), max = std::numeric_limits<float>::min();
double mean = 0, sqrsum = 0;
for (int ii = 0; ii < k; ii++) {
val = data[jj + ii * n];
mean += val;
sqrsum += val * val;
}
mean /= k;
sqrsum /= k;
sqrsum -= mean * mean;
sqrsum = sqrt(sqrsum);
min = (float)(mean - 7.0f*sqrsum);
max = (float)(mean + 7.0f*sqrsum);
bqScale[jj] = (max - min) / 255;
bqZeropoint[jj] = (int32_t)(127 - max / bqScale[jj]);
}
}
// This routine computes the quantization range for each column - either the min/max range or the quantRangeStdDevs-sigma range.
for (size_t jj = 0; jj < n; jj++) { // for each column, collect stats (min/max or mean/std.dev.)
float min = std::numeric_limits<float>::max(), max = std::numeric_limits<float>::lowest(); // lowest(), not min(): min() is the smallest positive float
double mean = 0, sqrsum = 0;
for (size_t ii = 0; ii < k; ii++) { // in a column, go through all the rows and collect stats
val = getVal2dArr(data, ii, jj, k, n, transpose);
// If quantRangeStdDevs is 0.f, the min/max values of the column are used as the quantization range
if(quantRangeStdDevs == 0.f) {
if(min > val)
min = val;
if(max < val)
max = val;
} else {
// Quantize by std.dev. range
mean += val;
sqrsum += val * val;
}
}
// If a quantization range (in multiples of std. dev.) is given with a non-zero value,
// it calculates the range for this column (a different quantization scale/offset is used for each column)
if(quantRangeStdDevs != 0.f) {
mean /= k;
sqrsum /= k;
sqrsum -= mean * mean;
sqrsum = sqrt(sqrsum);
min = (float)(mean - quantRangeStdDevs * sqrsum);
max = (float)(mean + quantRangeStdDevs * sqrsum);
}
// based on the quantization range, this computes the scale and offset for the quantization
bqScale[jj] = (max - min) / quantizedRange;
bqZeropoint[jj] = (int32_t)(quantizedMax - max / bqScale[jj]);
}
// 2. quantize
int8_t* quantized = 0;
@@ -335,7 +410,7 @@ void fbgemmPacked8Pack(marian::Tensor out,
TensorQuantizationParams bQuantParam;
bQuantParam.scale = bqScale[jj];
bQuantParam.zero_point = bqZeropoint[jj];
bQuantParam.precision = 8;
bQuantParam.precision = 7; // Use half of the quantization range to prevent overflow of VPMADDUBSW
if (transpose)
fbgemm::Quantize<int8_t>(data + jj * k, quantized + jj * k, k, bQuantParam);
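For the non-zero quantRangeStdDevs branch, a hedged, simplified sketch of the range computation (stdDevRange is a hypothetical helper, not the FBGEMM path): the range becomes mean +/- quantRangeStdDevs * std. dev., which ignores rare outliers at the cost of clipping them.

#include <cmath>
#include <cstddef>

void stdDevRange(const float* col, size_t k, float quantRangeStdDevs,
                 float& min, float& max) {
  double mean = 0, sqrsum = 0;
  for (size_t i = 0; i < k; ++i) {
    mean += col[i];
    sqrsum += (double)col[i] * col[i];
  }
  mean /= k;
  double stddev = std::sqrt(sqrsum / k - mean * mean);
  min = (float)(mean - quantRangeStdDevs * stddev);
  max = (float)(mean + quantRangeStdDevs * stddev);
}
// e.g. quantRangeStdDevs = 3.f quantizes [mean - 3 sigma, mean + 3 sigma] to
// [-64, 63]; the removed code hard-coded 7 sigma, while the new default (0.f)
// selects plain min/max.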

View File

@@ -94,13 +94,21 @@ void fbgemmPacked16Pack(marian::Tensor out,
// ncol: the number of columns
// packsize: the size of the packed matrix
// (the size of int8 packed B from fbgemm::PackAWithQuantRowOffset + quantization scale, offset and zero point)
// quantRangeStdDevs: the quantization range for the original float data, in multiples of the standard deviation;
// the default value is 0.0f, which means min/max quantization
// only half of the normal int8 range, [-64, 63], is used to avoid overflow
// during the accumulation in the VPMADDUBSW instruction
// https://intel.github.io/mkl-dnn/dev_guide_int8_computations.html
// (e.g. 3.f means the original tensor is quantized
// from [mean - 3.f * standard deviation, mean + 3.f * standard deviation] to [-64, 63])
void fbgemmPacked8Pack(marian::Tensor out,
const float* inData,
const marian::Type packType,
const bool transpose,
const int nrow,
const int ncol,
const uint64_t packsize); // @TODO: change to size_t where appropriate
const uint64_t packsize,
const float quantRangeStdDevs = 0.f); // @TODO: change to size_t where appropriate
// GEMM operation on the packed B matrix
// C: output matrix

View File

@@ -56,7 +56,7 @@ public:
for(size_t i = 0; i < N; ++i) {
int idx = idxs[i];
// since idxs is re-used for each batch, add batch offset to each idx to get absolute position
h_res_idx[pos] = idx + batchIdx * batchOffset;
h_res_idx[pos] = (int) (idx + batchIdx * batchOffset);
h_res[pos] = scoresData[idx];
++pos;
}

View File

@@ -1445,7 +1445,7 @@
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</ClCompile>
<ClCompile Include="..\src\training\communicator.cpp" />
<ClCompile Include="..\src\training\graph_group_multinode_sync.cpp" />
<ClCompile Include="..\src\training\graph_group.cpp" />
<ClCompile Include="..\src\training\scheduler.cpp" />
<ClCompile Include="..\src\translator\history.cpp" />
<ClCompile Include="..\src\translator\output_collector.cpp" />
@@ -1454,10 +1454,8 @@
<ClCompile Include="..\src\translator\output_printer.cpp" />
<ClCompile Include="..\src\translator\scorers.cpp" />
<ClCompile Include="..\src\training\graph_group_async.cpp" />
<ClCompile Include="..\src\training\graph_group_async_drop.cpp" />
<ClCompile Include="..\src\training\graph_group_sync.cpp" />
<ClCompile Include="..\src\training\graph_group_singleton.cpp" />
<ClCompile Include="..\src\training\graph_group_multinode.cpp" />
<ClCompile Include="..\src\training\validator.cpp" />
<ClCompile Include="..\src\3rd_party\yaml-cpp\convert.cpp" />
<ClCompile Include="..\src\3rd_party\yaml-cpp\directives.cpp" />
@@ -1653,7 +1651,6 @@
<ClInclude Include="..\src\common\config_parser.h" />
<ClInclude Include="..\src\common\definitions.h" />
<ClInclude Include="..\src\common\file_stream.h" />
<ClInclude Include="..\src\common\keywords.h" />
<ClInclude Include="..\src\common\logging.h" />
<ClInclude Include="..\src\common\options.h" />
<ClInclude Include="..\src\common\regex.h" />
@@ -1755,18 +1752,12 @@
<ClInclude Include="..\src\training\communicator.h" />
<ClInclude Include="..\src\training\graph_group.h" />
<ClInclude Include="..\src\training\graph_group_async.h" />
<ClInclude Include="..\src\training\graph_group_async_drop.h" />
<ClInclude Include="..\src\training\graph_group_multinode.h" />
<ClInclude Include="..\src\training\graph_group_multinode_sync.h" />
<ClInclude Include="..\src\training\graph_group_singleton.h" />
<ClInclude Include="..\src\training\graph_group_sync.h" />
<ClInclude Include="..\src\training\scheduler.h" />
<ClInclude Include="..\src\training\training.h" />
<ClInclude Include="..\src\training\training_state.h" />
<ClInclude Include="..\src\training\validator.h" />
<ClInclude Include="..\src\training\gradient_dropping\dropper.h" />
<ClInclude Include="..\src\training\gradient_dropping\sparse_tensor.h" />
<ClInclude Include="..\src\training\gradient_dropping\gpu\sparse_algorithm.h" />
<ClInclude Include="..\src\translator\beam_search.h" />
<ClInclude Include="..\src\translator\helpers.h" />
<ClInclude Include="..\src\translator\history.h" />
@@ -1906,8 +1897,6 @@
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</ExcludedFromBuild>
</CudaCompile>
<CudaCompile Include="..\src\training\gradient_dropping\gpu\dropper.cu" />
<CudaCompile Include="..\src\training\gradient_dropping\gpu\sparse_algorithm.cu" />
<CudaCompile Include="..\src\translator\helpers.cu" />
<CudaCompile Include="..\src\translator\nth_element.cu" />
</ItemGroup>

View File

@@ -94,18 +94,12 @@
<ClCompile Include="..\src\training\graph_group_async.cpp">
<Filter>training</Filter>
</ClCompile>
<ClCompile Include="..\src\training\graph_group_async_drop.cpp">
<Filter>training</Filter>
</ClCompile>
<ClCompile Include="..\src\training\graph_group_sync.cpp">
<Filter>training</Filter>
</ClCompile>
<ClCompile Include="..\src\training\graph_group_singleton.cpp">
<Filter>training</Filter>
</ClCompile>
<ClCompile Include="..\src\training\graph_group_multinode.cpp">
<Filter>training</Filter>
</ClCompile>
<ClCompile Include="..\src\training\validator.cpp">
<Filter>training</Filter>
</ClCompile>
@@ -226,9 +220,6 @@
<ClCompile Include="..\src\3rd_party\yaml-cpp\binary_renamed.cpp">
<Filter>3rd_party\yaml-cpp</Filter>
</ClCompile>
<ClCompile Include="..\src\training\graph_group_multinode_sync.cpp">
<Filter>training</Filter>
</ClCompile>
<ClCompile Include="..\src\command\marian_main.cpp">
<Filter>command</Filter>
</ClCompile>
@@ -883,6 +874,9 @@
<ClCompile Include="..\src\tensors\cpu\fbgemm\packed_gemm.cpp">
<Filter>tensors\cpu\fbgemm</Filter>
</ClCompile>
<ClCompile Include="..\src\training\graph_group.cpp">
<Filter>training</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\src\marian.h" />
@@ -1288,9 +1282,6 @@
<ClInclude Include="..\src\common\file_stream.h">
<Filter>common</Filter>
</ClInclude>
<ClInclude Include="..\src\common\keywords.h">
<Filter>common</Filter>
</ClInclude>
<ClInclude Include="..\src\common\logging.h">
<Filter>common</Filter>
</ClInclude>
@@ -1531,12 +1522,6 @@
<ClInclude Include="..\src\training\graph_group_async.h">
<Filter>training</Filter>
</ClInclude>
<ClInclude Include="..\src\training\graph_group_async_drop.h">
<Filter>training</Filter>
</ClInclude>
<ClInclude Include="..\src\training\graph_group_multinode.h">
<Filter>training</Filter>
</ClInclude>
<ClInclude Include="..\src\training\graph_group_singleton.h">
<Filter>training</Filter>
</ClInclude>
@@ -1555,15 +1540,6 @@
<ClInclude Include="..\src\training\validator.h">
<Filter>training</Filter>
</ClInclude>
<ClInclude Include="..\src\training\gradient_dropping\dropper.h">
<Filter>training\gradient_dropping</Filter>
</ClInclude>
<ClInclude Include="..\src\training\gradient_dropping\sparse_tensor.h">
<Filter>training\gradient_dropping</Filter>
</ClInclude>
<ClInclude Include="..\src\training\gradient_dropping\gpu\sparse_algorithm.h">
<Filter>training\gradient_dropping\gpu</Filter>
</ClInclude>
<ClInclude Include="..\src\translator\beam_search.h">
<Filter>translator</Filter>
</ClInclude>
@@ -1642,9 +1618,6 @@
<ClInclude Include="..\src\translator\output_printer.h">
<Filter>translator</Filter>
</ClInclude>
<ClInclude Include="..\src\training\graph_group_multinode_sync.h">
<Filter>training</Filter>
</ClInclude>
<ClInclude Include="..\src\command\marian_vocab.cpp">
<Filter>command</Filter>
</ClInclude>
@@ -2373,12 +2346,6 @@
<Filter Include="training">
<UniqueIdentifier>{880c8f51-3306-4d80-a682-7242341b0098}</UniqueIdentifier>
</Filter>
<Filter Include="training\gradient_dropping">
<UniqueIdentifier>{880c8f51-3306-4d80-a682-7242341b0101}</UniqueIdentifier>
</Filter>
<Filter Include="training\gradient_dropping\gpu">
<UniqueIdentifier>{880c8f51-3306-4d80-a682-7242341b0104}</UniqueIdentifier>
</Filter>
<Filter Include="translator">
<UniqueIdentifier>{880c8f51-3306-4d80-a682-7242341b0107}</UniqueIdentifier>
</Filter>
@@ -2703,11 +2670,5 @@
<CudaCompile Include="..\src\tensors\gpu\sparse.cu">
<Filter>tensors\gpu</Filter>
</CudaCompile>
<CudaCompile Include="..\src\training\gradient_dropping\gpu\dropper.cu">
<Filter>training\gradient_dropping\gpu</Filter>
</CudaCompile>
<CudaCompile Include="..\src\training\gradient_dropping\gpu\sparse_algorithm.cu">
<Filter>training\gradient_dropping\gpu</Filter>
</CudaCompile>
</ItemGroup>
</Project>