Merged PR 11831: Change the weight matrix quantization to use 7-bit min/max quantization to avoid overflow

1. Change the weight matrix quantization to use 7-bit min/max quantization
-> This resolves the overflow issues, because weights and activations are now quantized over their min/max range (a minimal sketch of the scheme follows this list).
2. Clip values during fp16 quantization to avoid overflow
3. Fix Windows build errors (cmake options, vcproj file)
4. int8-packed models: pack encoder weights in fp16 (only the decoder is quantized to int8)
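As a minimal sketch of the 7-bit min/max scheme (not Marian's actual FBGEMM call path; quantizeColumn7bit is a hypothetical helper), each weight column is mapped onto [-64, 63] so that the VPMADDUBSW accumulation cannot overflow:

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

// Quantize one (non-empty) column of floats to int8 over a 7-bit [-64, 63] range.
std::vector<int8_t> quantizeColumn7bit(const std::vector<float>& col,
                                       float& scale, int32_t& zeroPoint) {
  auto mm = std::minmax_element(col.begin(), col.end());
  float min = *mm.first, max = *mm.second;
  scale = (max - min) / 127.f;             // 127 steps across [-64, 63]
  if (scale == 0.f) scale = 1.f;           // guard against a constant column
  zeroPoint = (int32_t)(63 - max / scale); // map max near 63, min near -64
  std::vector<int8_t> out(col.size());
  for (size_t i = 0; i < col.size(); ++i) {
    int32_t q = (int32_t)std::lround(col[i] / scale) + zeroPoint;
    out[i] = (int8_t)std::max(-64, std::min(63, q)); // clamp to the 7-bit range
  }
  return out;
}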
Young Jin Kim 2020-03-25 02:52:17 +00:00
parent a5a5c62d4a
commit d2b4f3803e
7 changed files with 159 additions and 116 deletions

View File

@@ -841,11 +841,15 @@ Ptr<Options> ConfigParser::parseOptions(int argc, char** argv, bool doValidate){
auto buildInfo = get<std::string>("build-info");
if(!buildInfo.empty() && buildInfo != "false") {
#ifndef _MSC_VER // cmake build options are not available on MSVC based build.
if(buildInfo == "all")
std::cerr << cmakeBuildOptionsAdvanced() << std::endl;
else
std::cerr << cmakeBuildOptions() << std::endl;
exit(0);
#else // _MSC_VER
ABORT("build-info is not available on MSVC based build.");
#endif // _MSC_VER
}
// get paths to extra config files

View File

@@ -35,10 +35,13 @@ public:
Tensor val = p.second->val();
// save as packed format
// @TODO Hardcoded to find packable weights - all the weights used for affine op (fp16), all the weights used for affine op and dot op (int8)
// @TODO Hardcoded to find packable weights
// int8 - quantize decoder only for better quality, all the weights used for affine op and dot op (int8)
// fp16 - all the weights used for affine op (fp16)
if ((gemmElementType == Type::packed8avx2 || gemmElementType == Type::packed8avx512)
&& (pName.find("_W") == pName.length() - 3 || pName.find("_W") == pName.length() - 2)) {
#if USE_FBGEMM
&& (pName.find("_W") == pName.length() - 3 || pName.find("_W") == pName.length() - 2)
&& pName.find("encoder") == std::string::npos) {
#if USE_FBGEMM
using namespace marian::cpu::variant;
// packing information - size
int nrow;
@@ -82,7 +85,10 @@ public:
#else
ABORT("Packed type {} only supported when compiled with -DUSE_FBGEMM=on", gemmElementType);
#endif
} else if (gemmElementType == Type::packed16 && pName.find("_W") == pName.length() - 3) {
// fp16 quantization option + encoders for int8 quantized models
} else if ((gemmElementType == Type::packed16 && pName.find("_W") == pName.length() - 3)
|| ((gemmElementType == Type::packed8avx2 || gemmElementType == Type::packed8avx512)
&& (pName.find("_W") == pName.length() - 3 || pName.find("_W") == pName.length() - 2))) {
#if USE_FBGEMM
using namespace marian::cpu::variant;
@@ -123,7 +129,7 @@ public:
io::Item item;
item.name = pName;
item.shape = val->shape();
item.type = gemmElementType;
item.type = Type::packed16;
// Use the actual memory as this will be aligned and padded.
// When memory mapping this is required. Shape keeps track of

View File

@@ -76,22 +76,31 @@ const int PACK16_PADDING = 1024;
// This is a memory space to store auxiliary variables for FBGEMM (e.g. block row, block column, kernel_ncol_blocks and etc.)
const int PACK16_SPECIALMEM = 256;
// This is the maximum value of the FP16 type. There is a template type implementation, but it doesn't work on Windows.
// To keep results consistent, just use the constant value instead of an #ifdef _MSC_VER.
// Template type implementation: float FP16_MAX = NumericLimits<float>(Type::float16).max;
const float FP16_MAX = 65504.f;
// This function clips a value into a [min, max] range
inline float clip(float value, float min, float max) {
return std::max(min, std::min(value, max));
}
// This is copied from the FBGEMM code
// A better way?
// It will be removed when the FBGEMM API is changed
// blocked row-major format address arithmetic
/**
* Returns the memory address in the packed (block formatted) matrix array of a specific element
* indexed by the original non-packed array.
*
* @param r_ row index in the original matrix
* @param c_ column index in the original matrix
* @param brow_ row wide block index
* @param bcol_ column wide block index
* @param nbrow_ number of blocks in row
* @param nbcol_ number of blocks in column
* @param last_brow_ row number of the last block
*/
//
// Returns the memory address in the packed (block formatted) matrix array of a specific element
// indexed by the original non-packed array.
//
// @param r_ row index in the original matrix
// @param c_ column index in the original matrix
// @param brow_ row wide block index
// @param bcol_ column wide block index
// @param nbrow_ number of blocks in row
// @param nbcol_ number of blocks in column
// @param last_brow_ row number of the last block
inline uint64_t addr(const int r_,
const int c_,
const int brow_,
@@ -114,6 +123,15 @@ inline uint64_t addr(const int r_,
return index;
}
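To make the blocked layout concrete, here is a simplified, self-contained illustration (it assumes all blocks are full size; the real addr() above also handles a short last row block via last_brow_):

#include <cstdint>

inline uint64_t blockedAddr(uint64_t r, uint64_t c,
                            uint64_t brow, uint64_t bcol, uint64_t nbcol) {
  uint64_t blockIndex   = (r / brow) * nbcol + (c / bcol); // which block
  uint64_t inBlockIndex = (r % brow) * bcol + (c % bcol);  // offset inside it
  return blockIndex * (brow * bcol) + inBlockIndex;
}
// e.g. with 2x2 blocks over a 4x4 matrix (brow = bcol = 2, nbcol = 2),
// element (2, 3) lands in block 3 at in-block offset 1, i.e. address 13.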
// Returns a value in a 2D array given the row and column index (i, j) and a transposed flag.
// The number of rows and columns needs to be passed.
// The transposed flag indicates if the underlying data needs to be accessed in a transposed layout or not.
inline float getVal2dArr(const float* data, size_t i, size_t j, size_t rows, size_t cols, bool transposed) {
ABORT_IF(i >= rows, "Row index {} exceeds the number of rows {}.", i, rows);
ABORT_IF(j >= cols, "Column index {} exceeds the number of columns {}.", j, cols);
return transposed ? data[j * rows + i] : data[i * cols + j];
}
// Memory blocking factors (parameters) for packing into AVX2 int8
static const fbgemm::BlockingFactors Packed8Avx2BlockingFactors = {
PackingTraits<int8_t, int32_t, inst_set_t::avx2>::MR,
@@ -147,6 +165,12 @@ inline const fbgemm::BlockingFactors* getBlockingFactors(marian::Type packType)
}
}
// Returns the byte size of the packed matrix in fp16. It's calculated by fbgemm's internal logic due to the paddings and different layouts.
// Packing with fp16 only targets AVX2 instruction sets for now.
// See '3rd_party/fbgemm/include/fbgemm/FbgemmFP16.h'.
// shape: shape of the tensor to be packed
// transpose: the matrix is transposed
// packsize (out): the size of the packed matrix in bytes
void fbgemmPacked16PackInfo(const marian::Shape& shape,
const bool transpose,
uint64_t& packsize) {
@@ -154,6 +178,21 @@ void fbgemmPacked16PackInfo(const marian::Shape& shape,
fbgemmPacked16PackInfo(shape, transpose, nrow, ncol, kernel_ncol_blocks, brow, bcol, last_brow, nbrow, nbcol, packsize);
}
// Returns the byte size of the packed matrix in fp16. It's calculated by fbgemm's internal logic due to the paddings and different layouts.
// This overload also returns additional internal packing parameters via out arguments.
// Packing with fp16 only targets AVX2 instruction sets for now.
// See '3rd_party/fbgemm/include/fbgemm/FbgemmFP16.h'.
// shape: shape of the tensor to be packed
// transpose: the matrix is transposed
// nrow (out): the number of rows
// ncol (out): the number of columns
// kernel_ncol_blocks (out): the number of column blocks
// brow (out): the number of rows in a block
// bcol (out): the number of columns in a block
// last_brow (out): the number of rows in the last block
// nbrow (out): row index in a block
// nbcol (out): column index in a block
// packsize (out): the size of the packed matrix in bytes
void fbgemmPacked16PackInfo(const marian::Shape& shape,
const bool transpose,
int& nrow,
@@ -178,6 +217,14 @@ void fbgemmPacked16PackInfo(const marian::Shape& shape,
+ PACK16_SPECIALMEM;
}
// Returns the byte size of the packed matrix in int8. It's calculated by fbgemm's internal logic due to the paddings and different layouts.
// See '3rd_party/fbgemm/src/PackBMatrix.cc'.
// shape: shape of the tensor to be packed
// packType: Type to be packed - packed8avx2 or packed8avx512
// transpose: the matrix is transposed
// nrow (out): the number of rows
// ncol (out): the number of columns
// packsize (out): the size of the packed matrix in bytes
void fbgemmPacked8PackInfo(const marian::Shape& shape,
const marian::Type packType,
const bool transpose,
@@ -221,6 +268,20 @@ inline void col_offsets_with_zero_pt_s8acc32(
}
}
// Pack a matrix into a cache-utilization-efficient block format, converting the elements to fp16
// out: output tensor - packed format
// inData: input tensor data - pointer of float data
// transpose: the matrix is transposed
// nrow: the number of rows
// ncol: the number of columns
// kernel_ncol_blocks: the number of column blocks
// brow: the number of rows in a block
// bcol: the number of columns in a block
// last_brow: the number of rows in the last block
// nbrow: row index in a block
// nbcol: column index in a block
// packsize: the size of the packed matrix
// (the number of fp16 elements + padding (1024) + extra temporary memory (256))
void fbgemmPacked16Pack(marian::Tensor out,
const float* inData, // Packing is only available for 2D weight matrix in Marian. Otherwise, it's aborted in expanded_gemm.h.
const bool transpose,
@@ -258,20 +319,37 @@ void fbgemmPacked16Pack(marian::Tensor out,
// pack the matrix
for(int i = 0; i < nrow; i++) {
for(int j = 0; j < ncol; j++) {
outmem[addr(i, j, brow, bcol, nbrow, nbcol, last_brow)]
= tconv(!transpose ? inData[i * ncol + j] : inData[i + nrow * j], *dummy);
float src = clip(transpose ? inData[i + nrow * j] : inData[i * ncol + j], -FP16_MAX, FP16_MAX);
outmem[addr(i, j, brow, bcol, nbrow, nbcol, last_brow)] = tconv(src, *dummy);
}
}
delete dummy;
}
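To see why the new clip matters: the largest finite fp16 value is 65504, so converting any float outside [-65504, 65504] would yield +/-inf in the packed matrix. A minimal self-contained check (re-declaring clip() and FP16_MAX from above):

#include <algorithm>
#include <cassert>

const float FP16_MAX = 65504.f; // largest finite fp16 value
inline float clip(float value, float min, float max) {
  return std::max(min, std::min(value, max));
}

int main() {
  assert(clip(70000.f, -FP16_MAX, FP16_MAX) == 65504.f); // saturates instead of becoming inf
  assert(clip(-1.5f, -FP16_MAX, FP16_MAX) == -1.5f);     // in-range values pass through
}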
// Pack a matrix into a cache-utilization-efficient block format, quantizing the elements into int8
// out: output tensor - packed format and quantized into int8
// inData: input tensor data - pointer of float data
// packType: Type to be packed - packed8avx2 or packed8avx512
// transpose: the matrix is transposed
// nrow: the number of rows
// ncol: the number of columns
// packsize: the size of the packed matrix
// (the size of int8 packed B from fbgemm::PackAWithQuantRowOffset + quantization scale, offset and zero point)
// quantRangeStdDevs: the quantization range for the original float data, in multiples of the standard deviation;
// the default value is 0.0f, which means min/max quantization
// only half of the normal int8 range, [-64, 63], is used to avoid overflow
// during the accumulation in the VPMADDUBSW instruction
// https://intel.github.io/mkl-dnn/dev_guide_int8_computations.html
// (e.g. 3.f means the original tensor is quantized
// from [mean - 3.f * standard deviation, mean + 3.f * standard deviation] to [-64, 63])
void fbgemmPacked8Pack(marian::Tensor out,
const float* inData,
const marian::Type packType,
const bool transpose,
const int nrow,
const int ncol,
const uint64_t packsize) {
const uint64_t packsize,
const float quantRangeStdDevs) {
int k = nrow;
int n = ncol;
int len = k * n;
@@ -282,46 +360,43 @@ void fbgemmPacked8Pack(marian::Tensor out,
const float* data = inData;
float val = 0;
// Use half of the quantization range to prevent overflow of VPMADDUBSW
constexpr static int quantizedRange = 127;
constexpr static int quantizedMax = 63;
if (transpose) {
for (int jj = 0; jj < n; jj++) {
float min = std::numeric_limits<float>::max(), max = std::numeric_limits<float>::min();
double mean = 0, sqrsum = 0;
for (int ii = 0; ii < k; ii++) {
val = data[jj * k + ii];
mean += val;
sqrsum += val * val;
}
mean /= k;
sqrsum /= k;
sqrsum -= mean * mean;
sqrsum = sqrt(sqrsum);
min = (float)(mean - 7.0f*sqrsum);
max = (float)(mean + 7.0f*sqrsum);
bqScale[jj] = (max - min) / 255;
bqZeropoint[jj] = (int32_t)(127 - max / bqScale[jj]);
}
} else {
for (int jj = 0; jj < n; jj++) {
float min = std::numeric_limits<float>::max(), max = std::numeric_limits<float>::min();
double mean = 0, sqrsum = 0;
for (int ii = 0; ii < k; ii++) {
val = data[jj + ii * n];
mean += val;
sqrsum += val * val;
}
mean /= k;
sqrsum /= k;
sqrsum -= mean * mean;
sqrsum = sqrt(sqrsum);
min = (float)(mean - 7.0f*sqrsum);
max = (float)(mean + 7.0f*sqrsum);
bqScale[jj] = (max - min) / 255;
bqZeropoint[jj] = (int32_t)(127 - max / bqScale[jj]);
}
}
// This routine computes the quantization range for each column - either the min/max range or the quantRangeStdDevs-sigma range.
for (size_t jj = 0; jj < n; jj++) { // for each column, collect stats (min/max or mean/std.dev.)
float min = std::numeric_limits<float>::max(), max = std::numeric_limits<float>::lowest(); // lowest(), not min(): min() is the smallest positive float
double mean = 0, sqrsum = 0;
for (size_t ii = 0; ii < k; ii++) { // in a column, go through all the rows and collect stats
val = getVal2dArr(data, ii, jj, k, n, transpose);
// If quantRangeStdDevs is 0.f, the min/max values of the column are used as the quantization range
if(quantRangeStdDevs == 0.f) {
if(min > val)
min = val;
if(max < val)
max = val;
} else {
// Quantize by std.dev. range
mean += val;
sqrsum += val * val;
}
}
// If a quantization range (in multiples of std. dev.) is given with a non-zero value,
// it calculates the range for this column (a different quantization scale/offset is used for each column)
if(quantRangeStdDevs != 0.f) {
mean /= k;
sqrsum /= k;
sqrsum -= mean * mean;
sqrsum = sqrt(sqrsum);
min = (float)(mean - quantRangeStdDevs * sqrsum);
max = (float)(mean + quantRangeStdDevs * sqrsum);
}
// based on the quantization range, this computes the scale and offset for the quantization
bqScale[jj] = (max - min) / quantizedRange;
bqZeropoint[jj] = (int32_t)(quantizedMax - max / bqScale[jj]);
}
// 2. quantize
int8_t* quantized = 0;
@@ -335,7 +410,7 @@ void fbgemmPacked8Pack(marian::Tensor out,
TensorQuantizationParams bQuantParam;
bQuantParam.scale = bqScale[jj];
bQuantParam.zero_point = bqZeropoint[jj];
bQuantParam.precision = 8;
bQuantParam.precision = 7; // Use half of the quantization range to prevent overflow of VPMADDUBSW
if (transpose)
fbgemm::Quantize<int8_t>(data + jj * k, quantized + jj * k, k, bQuantParam);
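For the non-zero quantRangeStdDevs branch, a hedged, simplified sketch of the range computation (stdDevRange is a hypothetical helper, not the FBGEMM path): the range becomes mean +/- quantRangeStdDevs * std. dev., which ignores rare outliers at the cost of clipping them.

#include <cmath>
#include <cstddef>

void stdDevRange(const float* col, size_t k, float quantRangeStdDevs,
                 float& min, float& max) {
  double mean = 0, sqrsum = 0;
  for (size_t i = 0; i < k; ++i) {
    mean += col[i];
    sqrsum += (double)col[i] * col[i];
  }
  mean /= k;
  double stddev = std::sqrt(sqrsum / k - mean * mean);
  min = (float)(mean - quantRangeStdDevs * stddev);
  max = (float)(mean + quantRangeStdDevs * stddev);
}
// e.g. quantRangeStdDevs = 3.f quantizes [mean - 3 sigma, mean + 3 sigma] to
// [-64, 63]; the removed code hard-coded 7 sigma, while the new default (0.f)
// selects plain min/max.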

View File

@@ -94,13 +94,21 @@ void fbgemmPacked16Pack(marian::Tensor out,
// ncol: the number of columns
// packsize: the size of the packed matrix
// (the size of int8 packed B from fbgemm::PackAWithQuantRowOffset + quantization scale, offset and zero point)
// quantRangeStdDevs: the quantization range for the original float data, in multiples of the standard deviation;
// the default value is 0.0f, which means min/max quantization
// only half of the normal int8 range, [-64, 63], is used to avoid overflow
// during the accumulation in the VPMADDUBSW instruction
// https://intel.github.io/mkl-dnn/dev_guide_int8_computations.html
// (e.g. 3.f means the original tensor is quantized
// from [mean - 3.f * standard deviation, mean + 3.f * standard deviation] to [-64, 63])
void fbgemmPacked8Pack(marian::Tensor out,
const float* inData,
const marian::Type packType,
const bool transpose,
const int nrow,
const int ncol,
const uint64_t packsize); // @TODO: change to size_t where appropriate
const uint64_t packsize,
const float quantRangeStdDevs = 0.f); // @TODO: change to size_t where appropriate
// GEMM operation on the packed B matrix
// C: output matrix

View File

@@ -56,7 +56,7 @@ public:
for(size_t i = 0; i < N; ++i) {
int idx = idxs[i];
// since idxs is re-used for each batch, add batch offset to each idx to get absolute position
h_res_idx[pos] = idx + batchIdx * batchOffset;
h_res_idx[pos] = (int) (idx + batchIdx * batchOffset);
h_res[pos] = scoresData[idx];
++pos;
}

View File

@@ -1445,7 +1445,7 @@
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</ClCompile>
<ClCompile Include="..\src\training\communicator.cpp" />
<ClCompile Include="..\src\training\graph_group_multinode_sync.cpp" />
<ClCompile Include="..\src\training\graph_group.cpp" />
<ClCompile Include="..\src\training\scheduler.cpp" />
<ClCompile Include="..\src\translator\history.cpp" />
<ClCompile Include="..\src\translator\output_collector.cpp" />
@@ -1454,10 +1454,8 @@
<ClCompile Include="..\src\translator\output_printer.cpp" />
<ClCompile Include="..\src\translator\scorers.cpp" />
<ClCompile Include="..\src\training\graph_group_async.cpp" />
<ClCompile Include="..\src\training\graph_group_async_drop.cpp" />
<ClCompile Include="..\src\training\graph_group_sync.cpp" />
<ClCompile Include="..\src\training\graph_group_singleton.cpp" />
<ClCompile Include="..\src\training\graph_group_multinode.cpp" />
<ClCompile Include="..\src\training\validator.cpp" />
<ClCompile Include="..\src\3rd_party\yaml-cpp\convert.cpp" />
<ClCompile Include="..\src\3rd_party\yaml-cpp\directives.cpp" />
@@ -1653,7 +1651,6 @@
<ClInclude Include="..\src\common\config_parser.h" />
<ClInclude Include="..\src\common\definitions.h" />
<ClInclude Include="..\src\common\file_stream.h" />
<ClInclude Include="..\src\common\keywords.h" />
<ClInclude Include="..\src\common\logging.h" />
<ClInclude Include="..\src\common\options.h" />
<ClInclude Include="..\src\common\regex.h" />
@@ -1755,18 +1752,12 @@
<ClInclude Include="..\src\training\communicator.h" />
<ClInclude Include="..\src\training\graph_group.h" />
<ClInclude Include="..\src\training\graph_group_async.h" />
<ClInclude Include="..\src\training\graph_group_async_drop.h" />
<ClInclude Include="..\src\training\graph_group_multinode.h" />
<ClInclude Include="..\src\training\graph_group_multinode_sync.h" />
<ClInclude Include="..\src\training\graph_group_singleton.h" />
<ClInclude Include="..\src\training\graph_group_sync.h" />
<ClInclude Include="..\src\training\scheduler.h" />
<ClInclude Include="..\src\training\training.h" />
<ClInclude Include="..\src\training\training_state.h" />
<ClInclude Include="..\src\training\validator.h" />
<ClInclude Include="..\src\training\gradient_dropping\dropper.h" />
<ClInclude Include="..\src\training\gradient_dropping\sparse_tensor.h" />
<ClInclude Include="..\src\training\gradient_dropping\gpu\sparse_algorithm.h" />
<ClInclude Include="..\src\translator\beam_search.h" />
<ClInclude Include="..\src\translator\helpers.h" />
<ClInclude Include="..\src\translator\history.h" />
@@ -1906,8 +1897,6 @@
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</ExcludedFromBuild>
</CudaCompile>
<CudaCompile Include="..\src\training\gradient_dropping\gpu\dropper.cu" />
<CudaCompile Include="..\src\training\gradient_dropping\gpu\sparse_algorithm.cu" />
<CudaCompile Include="..\src\translator\helpers.cu" />
<CudaCompile Include="..\src\translator\nth_element.cu" />
</ItemGroup>

View File

@@ -94,18 +94,12 @@
<ClCompile Include="..\src\training\graph_group_async.cpp">
<Filter>training</Filter>
</ClCompile>
<ClCompile Include="..\src\training\graph_group_async_drop.cpp">
<Filter>training</Filter>
</ClCompile>
<ClCompile Include="..\src\training\graph_group_sync.cpp">
<Filter>training</Filter>
</ClCompile>
<ClCompile Include="..\src\training\graph_group_singleton.cpp">
<Filter>training</Filter>
</ClCompile>
<ClCompile Include="..\src\training\graph_group_multinode.cpp">
<Filter>training</Filter>
</ClCompile>
<ClCompile Include="..\src\training\validator.cpp">
<Filter>training</Filter>
</ClCompile>
@@ -226,9 +220,6 @@
<ClCompile Include="..\src\3rd_party\yaml-cpp\binary_renamed.cpp">
<Filter>3rd_party\yaml-cpp</Filter>
</ClCompile>
<ClCompile Include="..\src\training\graph_group_multinode_sync.cpp">
<Filter>training</Filter>
</ClCompile>
<ClCompile Include="..\src\command\marian_main.cpp">
<Filter>command</Filter>
</ClCompile>
@@ -883,6 +874,9 @@
<ClCompile Include="..\src\tensors\cpu\fbgemm\packed_gemm.cpp">
<Filter>tensors\cpu\fbgemm</Filter>
</ClCompile>
<ClCompile Include="..\src\training\graph_group.cpp">
<Filter>training</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\src\marian.h" />
@@ -1288,9 +1282,6 @@
<ClInclude Include="..\src\common\file_stream.h">
<Filter>common</Filter>
</ClInclude>
<ClInclude Include="..\src\common\keywords.h">
<Filter>common</Filter>
</ClInclude>
<ClInclude Include="..\src\common\logging.h">
<Filter>common</Filter>
</ClInclude>
@@ -1531,12 +1522,6 @@
<ClInclude Include="..\src\training\graph_group_async.h">
<Filter>training</Filter>
</ClInclude>
<ClInclude Include="..\src\training\graph_group_async_drop.h">
<Filter>training</Filter>
</ClInclude>
<ClInclude Include="..\src\training\graph_group_multinode.h">
<Filter>training</Filter>
</ClInclude>
<ClInclude Include="..\src\training\graph_group_singleton.h">
<Filter>training</Filter>
</ClInclude>
@@ -1555,15 +1540,6 @@
<ClInclude Include="..\src\training\validator.h">
<Filter>training</Filter>
</ClInclude>
<ClInclude Include="..\src\training\gradient_dropping\dropper.h">
<Filter>training\gradient_dropping</Filter>
</ClInclude>
<ClInclude Include="..\src\training\gradient_dropping\sparse_tensor.h">
<Filter>training\gradient_dropping</Filter>
</ClInclude>
<ClInclude Include="..\src\training\gradient_dropping\gpu\sparse_algorithm.h">
<Filter>training\gradient_dropping\gpu</Filter>
</ClInclude>
<ClInclude Include="..\src\translator\beam_search.h">
<Filter>translator</Filter>
</ClInclude>
@@ -1642,9 +1618,6 @@
<ClInclude Include="..\src\translator\output_printer.h">
<Filter>translator</Filter>
</ClInclude>
<ClInclude Include="..\src\training\graph_group_multinode_sync.h">
<Filter>training</Filter>
</ClInclude>
<ClInclude Include="..\src\command\marian_vocab.cpp">
<Filter>command</Filter>
</ClInclude>
@@ -2373,12 +2346,6 @@
<Filter Include="training">
<UniqueIdentifier>{880c8f51-3306-4d80-a682-7242341b0098}</UniqueIdentifier>
</Filter>
<Filter Include="training\gradient_dropping">
<UniqueIdentifier>{880c8f51-3306-4d80-a682-7242341b0101}</UniqueIdentifier>
</Filter>
<Filter Include="training\gradient_dropping\gpu">
<UniqueIdentifier>{880c8f51-3306-4d80-a682-7242341b0104}</UniqueIdentifier>
</Filter>
<Filter Include="translator">
<UniqueIdentifier>{880c8f51-3306-4d80-a682-7242341b0107}</UniqueIdentifier>
</Filter>
@@ -2703,11 +2670,5 @@
<CudaCompile Include="..\src\tensors\gpu\sparse.cu">
<Filter>tensors\gpu</Filter>
</CudaCompile>
<CudaCompile Include="..\src\training\gradient_dropping\gpu\dropper.cu">
<Filter>training\gradient_dropping\gpu</Filter>
</CudaCompile>
<CudaCompile Include="..\src\training\gradient_dropping\gpu\sparse_algorithm.cu">
<Filter>training\gradient_dropping\gpu</Filter>
</CudaCompile>
</ItemGroup>
</Project>