From d2b4f3803ebbc839429d79a7af60f6f2ebf87a1b Mon Sep 17 00:00:00 2001
From: Young Jin Kim
Date: Wed, 25 Mar 2020 02:52:17 +0000
Subject: [PATCH] Merged PR 11831: Change the weight matrix quantization to use
 7-bit min/max quantization to avoid overflow

1. Change the weight matrix quantization to use 7-bit min/max quantization
   -> This resolves all the overflow issues, because weights and activations are quantized by their min/max range.
2. Clip fp16 quantization to avoid overflow
3. Fix Windows build errors (cmake options, vcproj file)
4. int8 pack model (encoder -> fp16)
---
 src/common/config_parser.cpp                  |   4 +
 .../cpu/fbgemm/expression_graph_packable.h    |  16 +-
 src/tensors/cpu/fbgemm/packed_gemm.cpp        | 185 ++++++++++++------
 src/tensors/cpu/fbgemm/packed_gemm.h          |  10 +-
 src/translator/nth_element.cpp                |   2 +-
 vs/Marian.vcxproj                             |  13 +-
 vs/Marian.vcxproj.filters                     |  45 +----
 7 files changed, 159 insertions(+), 116 deletions(-)

diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp
index 9c711eaa..d0155be9 100755
--- a/src/common/config_parser.cpp
+++ b/src/common/config_parser.cpp
@@ -841,11 +841,15 @@ Ptr<Options> ConfigParser::parseOptions(int argc, char** argv, bool doValidate){
   auto buildInfo = get<std::string>("build-info");
   if(!buildInfo.empty() && buildInfo != "false") {
+#ifndef _MSC_VER // cmake build options are not available on MSVC based build.
     if(buildInfo == "all")
      std::cerr << cmakeBuildOptionsAdvanced() << std::endl;
    else
      std::cerr << cmakeBuildOptions() << std::endl;
    exit(0);
+#else // _MSC_VER
+    ABORT("build-info is not available on MSVC based build.");
+#endif // _MSC_VER
   }

   // get paths to extra config files
diff --git a/src/tensors/cpu/fbgemm/expression_graph_packable.h b/src/tensors/cpu/fbgemm/expression_graph_packable.h
index 743b7c8c..4c282895 100644
--- a/src/tensors/cpu/fbgemm/expression_graph_packable.h
+++ b/src/tensors/cpu/fbgemm/expression_graph_packable.h
@@ -35,10 +35,13 @@ public:
       Tensor val = p.second->val();

       // save as packed format
-      // @TODO Hardcoded to find packable weights - all the weights used for affine op (fp16), all the weights used for affine op and dot op (int8)
+      // @TODO Hardcoded to find packable weights
+      // int8 - quantize the decoder only for better quality; all the weights used for the affine and dot ops (int8)
+      // fp16 - all the weights used for the affine op (fp16)
       if ((gemmElementType == Type::packed8avx2 || gemmElementType == Type::packed8avx512)
-          && (pName.find("_W") == pName.length() - 3 || pName.find("_W") == pName.length() - 2)) {
-  #if USE_FBGEMM
+          && (pName.find("_W") == pName.length() - 3 || pName.find("_W") == pName.length() - 2)
+          && pName.find("encoder") == std::string::npos) {
+#if USE_FBGEMM
         using namespace marian::cpu::variant;
         // packing information - size
         int nrow;
@@ -82,7 +85,10 @@ public:
 #else
         ABORT("Packed type {} only supported when compiled with -DUSE_FBGEMM=on", gemmElementType);
 #endif
-      } else if (gemmElementType == Type::packed16 && pName.find("_W") == pName.length() - 3) {
+      // fp16 quantization option + encoders of int8-quantized models
+      } else if ((gemmElementType == Type::packed16 && pName.find("_W") == pName.length() - 3)
+          || ((gemmElementType == Type::packed8avx2 || gemmElementType == Type::packed8avx512)
+              && (pName.find("_W") == pName.length() - 3 || pName.find("_W") == pName.length() - 2))) {
 #if USE_FBGEMM
         using namespace marian::cpu::variant;
@@ -123,7 +129,7 @@ public:
         io::Item item;
         item.name = pName;
         item.shape = val->shape();
-        item.type = gemmElementType;
+        item.type = Type::packed16;

         // Use the actual memory as this will be aligned and padded.
         // When memory mapping this is required. Shape keeps track of
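For readability, the weight-selection rule above can be restated as a standalone predicate. This is an illustrative sketch only; the helper name and the example parameter strings below are hypothetical and are not part of this patch.

    #include <cstdio>
    #include <string>

    // Hypothetical restatement of the selection rule in expression_graph_packable.h:
    // a parameter is int8-packed only if it looks like a GEMM weight ("_W" two or
    // three characters from the end of its name) and does not belong to the encoder.
    static bool isInt8PackableWeight(const std::string& pName) {
      bool looksLikeWeight = pName.find("_W") == pName.length() - 3
                          || pName.find("_W") == pName.length() - 2;
      bool inEncoder = pName.find("encoder") != std::string::npos;
      return looksLikeWeight && !inEncoder;
    }

    int main() {
      // Hypothetical parameter names, for illustration only.
      std::printf("%d\n", isInt8PackableWeight("decoder_l1_ffn_W1")); // 1: packed to int8
      std::printf("%d\n", isInt8PackableWeight("encoder_l1_ffn_W1")); // 0: falls through to fp16 packing
      return 0;
    }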
diff --git a/src/tensors/cpu/fbgemm/packed_gemm.cpp b/src/tensors/cpu/fbgemm/packed_gemm.cpp
index a98d5e4a..064c3c2b 100644
--- a/src/tensors/cpu/fbgemm/packed_gemm.cpp
+++ b/src/tensors/cpu/fbgemm/packed_gemm.cpp
@@ -76,22 +76,31 @@ const int PACK16_PADDING = 1024;
 // This is a memory space to store auxiliary variables for FBGEMM (e.g. block row, block column, kernel_ncol_blocks, etc.)
 const int PACK16_SPECIALMEM = 256;

+// This is the maximum value of the FP16 type. There is a template type implementation, but it doesn't work on Windows.
+// To keep the results consistent, just use the constant value instead of #ifdef _MSC_VER.
+// Template type implementation: float FP16_MAX = NumericLimits<float>(Type::float16).max;
+const float FP16_MAX = 65504.f;
+
+// This function clips a value into a [min, max] range
+inline float clip(float value, float min, float max) {
+  return std::max(min, std::min(value, max));
+}
+
 // This is copied from FBGEMM code
 // A better way?
 // will be removed, when FBGEMM api is changed
 // blocked row-major format address arithmetic
-/**
- * Returns the memory address in the packed (block formatted) matrix array of a specific element
- * indexed by the original non-packed array.
- *
- * @param r_ row index in the original matrix
- * @param c_ column index in the original matrix
- * @param brow_ row wide block index
- * @param bcol_ column wide block index
- * @param nbrow_ number of blocks in row
- * @param nbcol_ number of blocks in column
- * @param last_brow_ row number of the last block
- */
+//
+// Returns the memory address in the packed (block formatted) matrix array of a specific element
+// indexed by the original non-packed array.
+//
+// @param r_ row index in the original matrix
+// @param c_ column index in the original matrix
+// @param brow_ row wide block index
+// @param bcol_ column wide block index
+// @param nbrow_ number of blocks in row
+// @param nbcol_ number of blocks in column
+// @param last_brow_ row number of the last block
 inline uint64_t addr(const int r_,
                      const int c_,
                      const int brow_,
@@ -114,6 +123,15 @@ inline uint64_t addr(const int r_,
   return index;
 }

+// Returns a value in a 2D array at row, column index (i, j), honoring the transposed flag.
+// The number of rows and columns needs to be passed.
+// The transposed flag indicates whether the underlying data needs to be accessed in a transposed layout or not.
+inline float getVal2dArr(const float* data, size_t i, size_t j, size_t rows, size_t cols, bool transposed) {
+  ABORT_IF(i >= rows, "Row index {} exceeds the number of rows {}.", i, rows);
+  ABORT_IF(j >= cols, "Column index {} exceeds the number of columns {}.", j, cols);
+  return transposed ? data[j * rows + i] : data[i * cols + j];
+}
+
 // Memory blocking factors (parameters) for packing into AVX2 int8
 static const fbgemm::BlockingFactors Packed8Avx2BlockingFactors = {
     PackingTraits<int8_t, int32_t, inst_set_t::avx2>::MR,
@@ -147,6 +165,12 @@ inline const fbgemm::BlockingFactors* getBlockingFactors(marian::Type packType)
   }
 }

+// Returns the byte size of the packed matrix in fp16. It is calculated by fbgemm's internal logic due to the paddings and different layouts.
+// Packing with fp16 only targets AVX2 instruction sets for now.
+// See '3rd_party/fbgemm/include/fbgemm/FbgemmFP16.h'.
+// shape: shape of the tensor to be packed
+// transpose: the matrix is transposed
+// packsize (out): the size of the packed matrix in bytes
 void fbgemmPacked16PackInfo(const marian::Shape& shape,
                             const bool transpose,
                             uint64_t& packsize) {
@@ -154,6 +178,21 @@ void fbgemmPacked16PackInfo(const marian::Shape& shape,
   fbgemmPacked16PackInfo(shape, transpose, nrow, ncol, kernel_ncol_blocks, brow, bcol, last_brow, nbrow, nbcol, packsize);
 }

+// Returns the byte size of the packed matrix in fp16. It is calculated by fbgemm's internal logic due to the paddings and different layouts.
+// This overload also returns some extra variables.
+// Packing with fp16 only targets AVX2 instruction sets for now.
+// See '3rd_party/fbgemm/include/fbgemm/FbgemmFP16.h'.
+// shape: shape of the tensor to be packed
+// transpose: the matrix is transposed
+// nrow (out): the number of rows
+// ncol (out): the number of columns
+// kernel_ncol_blocks (out): the number of column blocks
+// brow (out): the number of rows in a block
+// bcol (out): the number of columns in a block
+// last_brow (out): the number of rows in the last block
+// nbrow (out): row index in a block
+// nbcol (out): column index in a block
+// packsize (out): the size of the packed matrix in bytes
 void fbgemmPacked16PackInfo(const marian::Shape& shape,
                             const bool transpose,
                             int& nrow,
@@ -178,6 +217,14 @@ void fbgemmPacked16PackInfo(const marian::Shape& shape,
              + PACK16_SPECIALMEM;
 }

+// Returns the byte size of the packed matrix in int8. It is calculated by fbgemm's internal logic due to the paddings and different layouts.
+// See '3rd_party/fbgemm/src/PackBMatrix.cc'.
+// shape: shape of the tensor to be packed
+// packType: type to be packed - packed8avx2 or packed8avx512
+// transpose: the matrix is transposed
+// nrow (out): the number of rows
+// ncol (out): the number of columns
+// packsize (out): the size of the packed matrix in bytes
 void fbgemmPacked8PackInfo(const marian::Shape& shape,
                            const marian::Type packType,
                            const bool transpose,
@@ -221,6 +268,20 @@ inline void col_offsets_with_zero_pt_s8acc32(
   }
 }

+// Pack a matrix (fp16) into a cache-efficient block format in fp16
+// out: output tensor - packed format
+// inData: input tensor data - pointer to float data
+// transpose: the matrix is transposed
+// nrow: the number of rows
+// ncol: the number of columns
+// kernel_ncol_blocks: the number of column blocks
+// brow: the number of rows in a block
+// bcol: the number of columns in a block
+// last_brow: the number of rows in the last block
+// nbrow: row index in a block
+// nbcol: column index in a block
+// packsize: the size of the packed matrix
+//           (the number of fp16 elements + padding (1024) + extra temporary memory (256))
 void fbgemmPacked16Pack(marian::Tensor out,
                         const float* inData, // Packing is only available for 2D weight matrices in Marian. Otherwise, it's aborted in expanded_gemm.h.
                         const bool transpose,
@@ -258,20 +319,37 @@ void fbgemmPacked16Pack(marian::Tensor out,
   // pack the matrix
   for(int i = 0; i < nrow; i++) {
     for(int j = 0; j < ncol; j++) {
-      outmem[addr(i, j, brow, bcol, nbrow, nbcol, last_brow)]
-        = tconv(!transpose ? inData[i * ncol + j] : inData[i + nrow * j], *dummy);
+      float src = clip(transpose ? inData[i + nrow * j] : inData[i * ncol + j], -FP16_MAX, FP16_MAX);
+      outmem[addr(i, j, brow, bcol, nbrow, nbcol, last_brow)] = tconv(src, *dummy);
     }
   }
   delete dummy;
 }
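To illustrate why the packing loop above clips before converting, here is a minimal standalone sketch (illustrative only, not code from this patch): any float whose magnitude exceeds FP16_MAX = 65504 would overflow to +/-infinity in IEEE fp16, so values are saturated to the representable range first.

    #include <algorithm>
    #include <cstdio>

    // Same clipping idea as in fbgemmPacked16Pack above; values outside
    // [-65504, 65504] would become +/-inf when later converted to fp16,
    // so they are saturated beforehand.
    int main() {
      const float FP16_MAX = 65504.f;
      float weights[] = {123.f, 7.0e4f, -1.0e5f};
      for (float w : weights) {
        float clipped = std::max(-FP16_MAX, std::min(w, FP16_MAX));
        std::printf("%g -> %g\n", w, clipped);
      }
      return 0;
    }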

+// Pack a matrix (int8) into a cache-efficient block format, together with quantization into int8
+// out: output tensor - packed format and quantized into int8
+// inData: input tensor data - pointer to float data
+// packType: type to be packed - packed8avx2 or packed8avx512
+// transpose: the matrix is transposed
+// nrow: the number of rows
+// ncol: the number of columns
+// packsize: the size of the packed matrix
+//           (the size of the int8 packed B from fbgemm::PackAWithQuantRowOffset + quantization scale, offset and zero point)
+// quantRangeStdDevs: the quantization range for the original float data, in multiples of the standard deviation;
+//                    the default value is 0.0f, which means min/max quantization.
+//                    Only half of the normal int8 range, [-64, 63], is used to avoid overflow
+//                    during the accumulation in the VPMADDUBSW instruction
+//                    (https://intel.github.io/mkl-dnn/dev_guide_int8_computations.html).
+//                    E.g. 3.f means the original tensor is quantized
+//                    from [mean - 3.f * standard deviation, mean + 3.f * standard deviation] to [-64, 63].
 void fbgemmPacked8Pack(marian::Tensor out,
                        const float* inData,
                        const marian::Type packType,
                        const bool transpose,
                        const int nrow,
                        const int ncol,
-                       const uint64_t packsize) {
+                       const uint64_t packsize,
+                       const float quantRangeStdDevs) {
   int k = nrow;
   int n = ncol;
   int len = k * n;
@@ -282,46 +360,43 @@ void fbgemmPacked8Pack(marian::Tensor out,
   const float* data = inData;
   float val = 0;
+
+  // Use half of the quantization range to prevent overflow of VPMADDUBSW
+  constexpr static int quantizedRange = 127;
+  constexpr static int quantizedMax = 63;

-  if (transpose) {
-    for (int jj = 0; jj < n; jj++) {
-      float min = std::numeric_limits<float>::max(), max = std::numeric_limits<float>::min();
-      double mean = 0, sqrsum = 0;
-      for (int ii = 0; ii < k; ii++) {
-        val = data[jj * k + ii];
-        mean += val;
-        sqrsum += val * val;
-      }
-      mean /= k;
-      sqrsum /= k;
-      sqrsum -= mean * mean;
-      sqrsum = sqrt(sqrsum);
-
-      min = (float)(mean - 7.0f*sqrsum);
-      max = (float)(mean + 7.0f*sqrsum);
-      bqScale[jj] = (max - min) / 255;
-      bqZeropoint[jj] = (int32_t)(127 - max / bqScale[jj]);
-    }
-  } else {
-    for (int jj = 0; jj < n; jj++) {
-      float min = std::numeric_limits<float>::max(), max = std::numeric_limits<float>::min();
-      double mean = 0, sqrsum = 0;
-      for (int ii = 0; ii < k; ii++) {
-        val = data[jj + ii * n];
-        mean += val;
-        sqrsum += val * val;
-      }
-      mean /= k;
-      sqrsum /= k;
-      sqrsum -= mean * mean;
-      sqrsum = sqrt(sqrsum);
-
-      min = (float)(mean - 7.0f*sqrsum);
-      max = (float)(mean + 7.0f*sqrsum);
-      bqScale[jj] = (max - min) / 255;
-      bqZeropoint[jj] = (int32_t)(127 - max / bqScale[jj]);
-    }
-  }
+  // This routine computes the quantization range for each column - either the min/max range or a quantRangeStdDevs-sigma range.
+  for (size_t jj = 0; jj < n; jj++) { // for each column, collect stats (min/max or mean/std. dev.)
+    float min = std::numeric_limits<float>::max(), max = std::numeric_limits<float>::min();
+    double mean = 0, sqrsum = 0;
+    for (size_t ii = 0; ii < k; ii++) { // in a column, go through all the rows and collect stats
+      val = getVal2dArr(data, ii, jj, k, n, transpose);
+      // If quantRangeStdDevs is 0.f, the min/max values of the column are used as the quantization range
+      if(quantRangeStdDevs == 0.f) {
+        if(min > val)
+          min = val;
+        if(max < val)
+          max = val;
+      } else {
+        // Quantize by a std. dev. range
+        mean += val;
+        sqrsum += val * val;
+      }
+    }
+    // If a quantization range (in multiples of std. dev.) is given with a non-zero value,
+    // it calculates the range for this column (a different quantization scale/offset is used for each column)
+    if(quantRangeStdDevs != 0.f) {
+      mean /= k;
+      sqrsum /= k;
+      sqrsum -= mean * mean;
+      sqrsum = sqrt(sqrsum);
+      min = (float)(mean - quantRangeStdDevs * sqrsum);
+      max = (float)(mean + quantRangeStdDevs * sqrsum);
+    }
+    // Based on the quantization range, this computes the scale and offset for the quantization
+    bqScale[jj] = (max - min) / quantizedRange;
+    bqZeropoint[jj] = (int32_t)(quantizedMax - max / bqScale[jj]);
+  }

   // 2. quantize
   int8_t* quantized = 0;
@@ -335,7 +410,7 @@ void fbgemmPacked8Pack(marian::Tensor out,
     TensorQuantizationParams bQuantParam;
     bQuantParam.scale = bqScale[jj];
     bQuantParam.zero_point = bqZeropoint[jj];
-    bQuantParam.precision = 8;
+    bQuantParam.precision = 7; // Use half of the quantization range to prevent overflow of VPMADDUBSW
     if (transpose)
       fbgemm::Quantize<int8_t>(data + jj * k, quantized + jj * k, k, bQuantParam);
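As a concrete illustration of the 7-bit min/max scheme above, here is a minimal self-contained sketch; quantizeColumn7bit is a hypothetical helper, not this patch's API and not fbgemm's. It mirrors the per-column scale/zero-point computation with quantizedRange = 127 and quantizedMax = 63, then rounds and clamps so the quantized values stay inside [-64, 63] and cannot overflow the VPMADDUBSW accumulation.

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // Hypothetical helper: quantize one column with 7-bit min/max quantization.
    void quantizeColumn7bit(const std::vector<float>& col,
                            float& scale, int32_t& zeroPoint,
                            std::vector<int8_t>& out) {
      const int quantizedRange = 127; // 7-bit range
      const int quantizedMax   = 63;  // so results land roughly in [-64, 63]
      float min = *std::min_element(col.begin(), col.end());
      float max = *std::max_element(col.begin(), col.end());
      scale     = (max - min) / quantizedRange;
      zeroPoint = (int32_t)(quantizedMax - max / scale);
      out.resize(col.size());
      for (size_t i = 0; i < col.size(); ++i) {
        long q = std::lround(col[i] / scale) + zeroPoint;
        out[i] = (int8_t)std::max(-64L, std::min(63L, q)); // clamp to the 7-bit range
      }
    }

    int main() {
      std::vector<float> col = {-1.5f, -0.25f, 0.f, 0.75f, 2.f};
      float scale; int32_t zp; std::vector<int8_t> q;
      quantizeColumn7bit(col, scale, zp, q);
      std::printf("scale=%f zero_point=%d quantized:", scale, (int)zp);
      for (int8_t v : q) std::printf(" %d", (int)v);
      std::printf("\n"); // all quantized values stay within [-64, 63]
      return 0;
    }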
diff --git a/src/tensors/cpu/fbgemm/packed_gemm.h b/src/tensors/cpu/fbgemm/packed_gemm.h
index d0a63ea9..694860d4 100644
--- a/src/tensors/cpu/fbgemm/packed_gemm.h
+++ b/src/tensors/cpu/fbgemm/packed_gemm.h
@@ -94,13 +94,21 @@ void fbgemmPacked16Pack(marian::Tensor out,
 // ncol: the number of columns
 // packsize: the size of the packed matrix
 //           (the size of the int8 packed B from fbgemm::PackAWithQuantRowOffset + quantization scale, offset and zero point)
+// quantRangeStdDevs: the quantization range for the original float data, in multiples of the standard deviation;
+//                    the default value is 0.0f, which means min/max quantization.
+//                    Only half of the normal int8 range, [-64, 63], is used to avoid overflow
+//                    during the accumulation in the VPMADDUBSW instruction
+//                    (https://intel.github.io/mkl-dnn/dev_guide_int8_computations.html).
+//                    E.g. 3.f means the original tensor is quantized
+//                    from [mean - 3.f * standard deviation, mean + 3.f * standard deviation] to [-64, 63].
 void fbgemmPacked8Pack(marian::Tensor out,
                        const float* inData,
                        const marian::Type packType,
                        const bool transpose,
                        const int nrow,
                        const int ncol,
-                       const uint64_t packsize); // @TODO: change to size_t where appropriate
+                       const uint64_t packsize,
+                       const float quantRangeStdDevs = 0.f); // @TODO: change to size_t where appropriate

 // GEMM operation on the packed B matrix
 // C: output matrix
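The two range modes described in the comment above can be summarized in a small standalone sketch (quantRange is a hypothetical helper, not this patch's API): with quantRangeStdDevs == 0.f the column's exact min/max is used, otherwise the range is mean +/- quantRangeStdDevs standard deviations.

    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <utility>
    #include <vector>

    // Hypothetical helper mirroring the range selection in fbgemmPacked8Pack:
    // 0.f -> exact min/max of the column, N -> [mean - N*sigma, mean + N*sigma].
    std::pair<float, float> quantRange(const std::vector<float>& col, float quantRangeStdDevs) {
      if (quantRangeStdDevs == 0.f) {
        float min = col[0], max = col[0];
        for (float v : col) { min = std::min(min, v); max = std::max(max, v); }
        return {min, max};
      }
      double mean = 0, sqrsum = 0;
      for (float v : col) { mean += v; sqrsum += double(v) * v; }
      mean /= col.size();
      sqrsum = std::sqrt(sqrsum / col.size() - mean * mean); // standard deviation
      return {(float)(mean - quantRangeStdDevs * sqrsum), (float)(mean + quantRangeStdDevs * sqrsum)};
    }

    int main() {
      std::vector<float> col = {-1.5f, -0.25f, 0.f, 0.75f, 2.f};
      auto mm = quantRange(col, 0.f); // min/max mode (the new default)
      auto sd = quantRange(col, 3.f); // sigma mode (the removed code used 7 sigma with 8-bit precision)
      std::printf("min/max: [%g, %g]\n3-sigma: [%g, %g]\n", mm.first, mm.second, sd.first, sd.second);
      return 0;
    }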
diff --git a/src/translator/nth_element.cpp b/src/translator/nth_element.cpp
index 8b2f8947..237d9b9d 100644
--- a/src/translator/nth_element.cpp
+++ b/src/translator/nth_element.cpp
@@ -56,7 +56,7 @@ public:
     for(size_t i = 0; i < N; ++i) {
       int idx = idxs[i];
       // since idxs is re-used for each batch, add batch offset to each idx to get absolute position
-      h_res_idx[pos] = idx + batchIdx * batchOffset;
+      h_res_idx[pos] = (int) (idx + batchIdx * batchOffset);
       h_res[pos] = scoresData[idx];
       ++pos;
     }
diff --git a/vs/Marian.vcxproj b/vs/Marian.vcxproj
index 241aa307..0cb4a5de 100755
--- a/vs/Marian.vcxproj
+++ b/vs/Marian.vcxproj
@@ -1445,7 +1445,7 @@
       true
-
+
@@ -1454,10 +1454,8 @@
-
-
@@ -1653,7 +1651,6 @@
-
@@ -1755,18 +1752,12 @@
-
-
-
-
-
-
@@ -1906,8 +1897,6 @@
       false
       false
-
-
diff --git a/vs/Marian.vcxproj.filters b/vs/Marian.vcxproj.filters
index a4cbc827..bb6080ae 100755
--- a/vs/Marian.vcxproj.filters
+++ b/vs/Marian.vcxproj.filters
@@ -94,18 +94,12 @@
       training
-
-      training
-
       training
       training
-
-      training
-
       training
@@ -226,9 +220,6 @@
       3rd_party\yaml-cpp
-
-      training
-
       command
@@ -883,6 +874,9 @@
       tensors\cpu\fbgemm
+
+      training
+
@@ -1288,9 +1282,6 @@
       common
-
-      common
-
       common
@@ -1531,12 +1522,6 @@
       training
-
-      training
-
-
-      training
-
       training
@@ -1555,15 +1540,6 @@
       training
-
-      training\gradient_dropping
-
-
-      training\gradient_dropping
-
-
-      training\gradient_dropping\gpu
-
       translator
@@ -1642,9 +1618,6 @@
       translator
-
-      training
-
       command
@@ -2373,12 +2346,6 @@
       {880c8f51-3306-4d80-a682-7242341b0098}
-
-      {880c8f51-3306-4d80-a682-7242341b0101}
-
-
-      {880c8f51-3306-4d80-a682-7242341b0104}
-
       {880c8f51-3306-4d80-a682-7242341b0107}
@@ -2703,11 +2670,5 @@
       tensors\gpu
-
-      training\gradient_dropping\gpu
-
-
-      training\gradient_dropping\gpu
-
\ No newline at end of file