Merged PR 10266: FBGEMM based Int8 model

FBGEMM based Int8 model - working with the master. 1. Added int8 implementation into packed_gemm.h/cpp with FBGEMM. 2. Updated the FBGEMM library to make it work on Windows. 3. Split 'ispacked' into packed8 and packed16. 4. Changed all names for PackFp32 to PackFp16, which is more accurate.
2024-10-05 19:17:10 +03:00 · 2019-12-03 19:14:18 +00:00 · 2019-12-03 19:14:18 +00:00 · 9c9a240354
commit 9c9a240354
parent 0197b89b43
18 changed files with 1899 additions and 904 deletions
--- a/src/3rd_party/fbgemm
+++ b/src/3rd_party/fbgemm
@ -1 +1 @@
-Subproject commit f0b354327aaf2330c65340725b1981040c8bec9e
+Subproject commit 501f92e531378154daad708cd9aeb649637b4696
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@ -51,7 +51,7 @@ add_library(marian STATIC
  tensors/cpu/sharp/int_gemm.cpp
  tensors/cpu/sharp/avx_gemm.cpp
  tensors/cpu/sharp/sse_gemm.cpp
-  tensors/cpu/sharp/packed_gemm.cpp
+  tensors/cpu/fbgemm/packed_gemm.cpp

  graph/expression_graph.cpp
  graph/expression_operators.cpp
--- a/src/command/marian_conv.cpp
+++ b/src/command/marian_conv.cpp
@ -4,7 +4,7 @@

 #include <sstream>

-#include "graph/expression_graph_packable.h"
+#include "tensors/cpu/fbgemm/expression_graph_packable.h"

 int main(int argc, char** argv) {
  using namespace marian;
@ -22,7 +22,7 @@ int main(int argc, char** argv) {
        "  ./marian-conv -f model.npz -t model.bin --gemm-type fp16packed");
    cli->add<std::string>("--from,-f", "Input model", "model.npz");
    cli->add<std::string>("--to,-t", "Output model", "model.bin");
-    cli->add<std::string>("--gemm-type,-g", "GEMM Type to be used with this weights", "mklfp32");
+    cli->add<std::string>("--gemm-type,-g", "GEMM Type to be used with this weights - float16, float32, int8packed, fp16packed", "float32");
    cli->parse(argc, argv);
    options->merge(config);
  }
--- a/src/common/types.cpp
+++ b/src/common/types.cpp
@ -1,5 +1,5 @@
 #include "common/types.h"
-#include "tensors/cpu/sharp/packed_gemm.h"
+#include "tensors/cpu/fbgemm/packed_gemm.h"

 namespace marian {

@ -8,11 +8,21 @@ namespace marian {
 // But for instance, for intransparent types like packed tensors, it cannot easily be inferred by
 // multiplying. All cases are handed here and can later be passed to allocators etc. 
 size_t requiredBytes(const Shape& shape, Type type) {
-  if(isPacked(type)) {
+#if USE_FBGEMM
+  if (type == Type::packed8)
+  {
+    int nrow, ncol;
    uint64_t packsize;
-    cpu::variant::PackInfoFp32(shape, false, packsize);
+    cpu::variant::fbgemmPacked8PackInfo(shape, false, /*out=*/nrow, /*out=*/ncol, /*out=*/packsize);
    return (size_t)packsize;
-  } else {
+  } else if (type == Type::packed16)
+  {
+    uint64_t packsize;
+    cpu::variant::fbgemmPacked16PackInfo(shape, false, /*out=*/packsize);
+    return (size_t)packsize;
+  } else
+#endif  // USE_FBGEMM
+  {
    return shape.elements() * sizeOf(type);
  }
 }
--- a/src/graph/expression_operators.cpp
+++ b/src/graph/expression_operators.cpp
@ -7,7 +7,7 @@

 #include "graph/auto_tuner.h"
 #include "tensors/cpu/int16.h"
-#include "tensors/cpu/expanded_gemm.h"
+#include "tensors/cpu/fbgemm/expanded_gemm.h"

 #if USE_FBGEMM
 #include "fbgemm/Utils.h"
@ -416,17 +416,50 @@ Expr weighted_average(Expr in, Expr weights, int ax) {
 Expr dot(Expr a, Expr b, bool transA, bool transB, float scale) {
  auto device = a->graph()->getDeviceId().type;
  float clipValue = a->graph()->getBackend()->getClip();
+  // added support for packed GEMM API (fp16, int8)
+  Type aElementType = a->value_type();
+  Type bElementType = b->value_type();

  // Currently only true when command line options
  // --optimize --cpu-thread=N with N > 0 are set.
-  if(device == DeviceType::cpu && a->graph()->getBackend()->isOptimized()) {
-    // dotInt16 computes A * B.T, hence the transpose for B to get A * B
-    // if transA = false and transB = false.
+  if(device == DeviceType::cpu) {
+    if(isFloat(aElementType) && isFloat(bElementType)) {
+      if(a->graph()->getBackend()->isOptimized()) {
+        // dotInt16 computes A * B.T, hence the transpose for B to get A * B
+        // if transA = false and transB = false.

-    return cpu::int16::dot(
-        cpu::int16::quantize(transA ? transpose(a) : a, clipValue),
-        cpu::int16::quantize(transB ? b : transpose(b), clipValue),
-        scale);
+        return cpu::int16::dot(
+          cpu::int16::quantize(transA ? transpose(a) : a, clipValue),
+          cpu::int16::quantize(transB ? b : transpose(b), clipValue),
+          scale);
+      } else {
+        return Expression<DotNodeOp>(
+          clip(a, clipValue), clip(b, clipValue), transA, transB, scale);
+      }
+    } else if(isFloat(aElementType) && isPacked(bElementType)) {
+#if USE_FBGEMM
+      // 07/10/2019 - Use packed GEMM only if the CPU architecture supports AVX2.
+      // The support check comes from one of fbgemm's submodules, cpuinfo (https://github.com/pytorch/cpuinfo).
+      // It looks at the CPU register
+      // (https://github.com/pytorch/cpuinfo/blob/master/src/x86/isa.c#L391),
+      // and this CPU lookup is executed only once and the state is kept in FBGEMM.
+      if(fbgemm::fbgemmHasAvx2Support()) {
+        // This variant of dot product can handle matrix multiplications with packed8 and packed16 weight matrix (B).
+        return cpu::variant::dot(clip(a, clipValue),
+                                 b,
+                                 b->shape(),
+                                 transA,
+                                 transB,
+                                 scale);
+      } else {
+        ABORT("AVX2 is not available. At least, AVX2 is needed to use fbgemm-based packed GEMM");
+      }
+#else
+      ABORT("Packed GEMM is not available in this build");
+#endif  // USE_FBGEMM
+    } else {
+      ABORT("Combination of types A: {} B: {} not supported", aElementType, bElementType);
+    }
  } else {
    return Expression<DotNodeOp>(
        clip(a, clipValue), clip(b, clipValue), transA, transB, scale);
@ -485,6 +518,7 @@ Expr affine(Expr a, Expr b, Expr bias, bool transA, bool transB, float scale) {
      // (https://github.com/pytorch/cpuinfo/blob/master/src/x86/isa.c#L391),
      // and this cpu lookup is executed only once and the state is kept in FBGEMM.
      if(fbgemm::fbgemmHasAvx2Support()) {
+        // This variant of affine product can handle matrix multiplications with packed8 and packed16 weight matrix (B).
        return cpu::variant::affine(clip(a, clipValue),
                                    b,
                                    b->shape(),
@ -493,7 +527,7 @@ Expr affine(Expr a, Expr b, Expr bias, bool transA, bool transB, float scale) {
                                    transB,
                                    scale);
      } else {
-        ABORT("No on-the-fly packing at the moment");
+        ABORT("AVX2 is not available. At least, AVX2 is needed to use fbgemm-based packed GEMM");
      }
 #else
      ABORT("Packed GEMM is not available in this build");
--- a/src/microsoft/quicksand.cpp
+++ b/src/microsoft/quicksand.cpp
@ -10,7 +10,7 @@
 #include "translator/scorers.h"
 #include "data/alignment.h"
 #include "data/vocab_base.h"
-#include "graph/expression_graph_packable.h"
+#include "tensors/cpu/fbgemm/expression_graph_packable.h"

 #if USE_FBGEMM
 #include "fbgemm/Utils.h"
--- a/src/tensors/backend.h
+++ b/src/tensors/backend.h
@ -5,15 +5,6 @@

 namespace marian {

-// GEMM type enum
-typedef enum { 
-  Auto = 0,            // auto tuning between available GEMMs
-  MklFp32 = 1,         // MKL based GEMM, fp32
-  IntrinInt16 = 2,     // Intrinsic implementation of Int 16 GEMM
-  FbFp16Packed = 10,   // FBGEMM based fp16 GEMM with packing
-  FbInt8Packed = 11    // FBGEMM based int8 GEMM with packing
-} GemmType;
-
 class Backend {
 protected:
  DeviceId deviceId_;
--- a/src/tensors/cpu/backend.h
+++ b/src/tensors/cpu/backend.h
@ -12,7 +12,6 @@ namespace cpu {
 class Backend : public marian::Backend {
 protected:
  bool optimized_{false};
-  GemmType gemmType_{GemmType::Auto};

 public:
  Backend(DeviceId deviceId, size_t seed) : marian::Backend(deviceId, seed) {}
--- a/src/tensors/cpu/expanded_gemm.h
+++ b/src/tensors/cpu/expanded_gemm.h
@ -1,205 +0,0 @@
-#pragma once
-
-#include "graph/node.h"
-#include "tensors/cpu/sharp/packed_gemm.h"
-
-#if USE_FBGEMM
-#ifdef __GNUC__
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wunused-variable"
-#endif
-
-#include "3rd_party/fbgemm/include/fbgemm/FbgemmFP16.h"
-
-#ifdef __GNUC__
-#pragma GCC diagnostic pop
-#endif
-
-using namespace fbgemm;
-#endif  // USE_FBGEMM
-
-namespace marian {
-namespace cpu {
-namespace variant {
-
-// Enumeration for the Matrix used in pack functions
-// A matrix - 0, B matrix - 1
-enum class PackMatrix : uint8_t {
-  A = 0x00,
-  B = 0x01
-};
-
-// Pack a matrix into cache utilization efficient way (block format)
-// PackMatrix packMat_: the type of packed matrix - A or B matrix
-// bool transpose_: transpose
-// int nrow_: the number of rows
-// int ncol_: the number of columns
-// int kernel_ncol_blocks_: the number of column blocks
-// int brow_: the number of rows in a block
-// int bcol_: the number of columns in a block
-// int last_brow_: the number of rows in the last block
-// int nbrow_: row index in a block
-// int nbcol_: column index in a block
-// uint64_t packsize_: the size of the packed matrix
-//                    (the number of fp16 elements + padding (1024) + extra temporary memory (256))
-struct PackNodeOp : public UnaryNodeOp {
-  PackMatrix packMat_;
-  bool transpose_;
-  int nrow_;
-  int ncol_;
-  int kernel_ncol_blocks_;
-  int brow_;
-  int bcol_;
-  int last_brow_;
-  int nbrow_;
-  int nbcol_;
-  uint64_t packsize_;
-
-  PackNodeOp(Expr a, PackMatrix packMat, bool transpose, float clipValue)
-      : UnaryNodeOp(a, newShape(a, transpose), Type::uint8),
-        packMat_(packMat),
-        transpose_(transpose) {
-    if(packMat != PackMatrix::B)
-      ABORT("Only prepacking of B (weight matrix) is supported");
-    if(clipValue != 0)
-      ABORT("Clipping is not supported");
-    if(!memoize_)
-      ABORT("Only constant weight node can be packed");
-  }
-
-  NodeOps forwardOps() override {
-    return {NodeOp(PackFp32(val_,
-                            child(0)->val()->data(),
-                            transpose_,
-                            nrow_,
-                            ncol_,
-                            kernel_ncol_blocks_,
-                            brow_,
-                            bcol_,
-                            last_brow_,
-                            nbrow_,
-                            nbcol_,
-                            packsize_))
-    };
-  }
-
-  NodeOps backwardOps() override {
-    ABORT("PackNodeOp only available for inference");
-    return {NodeOp(0)};
-  }
-
-  const std::string type() override { return "packMat"; }
-
-  Shape newShape(Expr a, bool transpose) {
-#if USE_FBGEMM
-    auto shapeMat = a->shape();
-    // Should be 2D - weight matrix
-    ABORT_IF(shapeMat.size() != 2,
-             "Weight Matrix should be 2D");
-    PackInfoFp32(shapeMat,
-                 transpose,
-                 nrow_,
-                 ncol_,
-                 kernel_ncol_blocks_,
-                 brow_,
-                 bcol_,
-                 last_brow_,
-                 nbrow_,
-                 nbcol_,
-                 packsize_);
-
-    Shape outShape({(int)packsize_});
-
-    return outShape;
-#else // USE_FBGEMM
-    ABORT("Packed GEMM requires a build with USE_FBGEMM enabled");
-    return Shape();
-#endif  // USE_FBGEMM
-  }
-};
-
-// Affine transform (matrix multiplication) using packed B matrix
-// float scalar_: scalar multiplier
-// size_t m_: the number of rows in A and C
-// size_t n_: the number of columns in B and C
-// size_t k_: the number of columns in A and the number of rows in C
-// bool transA_: transpose A
-// bool transB_: transpose B
-class AffineNodeOp : public NaryNodeOp {
-private:
-  float scalar_;
-  size_t m_;
-  size_t n_;
-  size_t k_;
-  bool transA_;
-  bool transB_;
-
-public:
-  AffineNodeOp(const std::vector<Expr>& nodes, Shape bShape, bool transA, bool transB, float scalar)
-      : NaryNodeOp(nodes, newShape(nodes[0], bShape, transA, transB), Type::float32),
-        scalar_(scalar) {
-    transA_ = transA;
-    transB_ = transB;
-    m_ = nodes[0]->shape().elements() / nodes[0]->shape()[-1];
-    k_ = nodes[0]->shape().back();
-    if(transA)
-      std::swap(m_, k_);
-
-    size_t l = bShape.elements() / bShape[-1];
-    n_ = bShape[-1];
-    if(transB)
-      std::swap(l, n_);
-  }
-
-  Shape newShape(Expr a, Shape bShape, bool transA, bool transB) {
-    auto shapeA = a->shape();
-    if(transA) {
-      shapeA.set(shapeA.size() - 2, a->shape()[shapeA.size() - 1]);
-      shapeA.set(shapeA.size() - 1, a->shape()[shapeA.size() - 2]);
-    }
-
-    auto shapeB = bShape;
-    if(transB) {
-      shapeB.set(shapeB.size() - 2, bShape[shapeB.size() - 1]);
-      shapeB.set(shapeB.size() - 1, bShape[shapeB.size() - 2]);
-    }
-
-    Shape outShape = shapeA;
-    outShape.set(outShape.size() - 1, shapeB[shapeB.size() - 1]);
-    ABORT_IF(shapeA[shapeA.size() - 1] != shapeB[shapeB.size() - 2],
-             "Matrix product requires inner dimensions to match");
-    return outShape;
-  }
-
-  NodeOps forwardOps() override {
-    return {
-      NodeOp(GemmPackFp32(val_,
-                          child(0)->val(),
-                          child(1)->val(),
-                          child(2)->val(),
-                          m_,
-                          n_,
-                          transA_))
-    };
-  }
-
-  NodeOps backwardOps() override {
-    ABORT("Only used for inference");
-    return {NodeOp(0)};
-  }
-
-  const std::string type() override { return "fp16packed"; }
-};
-
-static inline Expr affine(Expr a, Expr b, Shape bShape, Expr c, bool transA, bool transB, float scalar) {
-  std::vector<Expr> nodes = {a, b, c};
-  return Expression<cpu::variant::AffineNodeOp>(nodes, bShape, transA, transB, scalar);
-}
-
-static inline Expr pack(Expr a, PackMatrix packMat, bool transpose, float clipValue) {
-  return Expression<cpu::variant::PackNodeOp>(a, packMat, transpose, clipValue);
-}
-
-}  // namespace variant
-}  // namespace cpu
-}  // namespace marian
--- a/src/tensors/cpu/fbgemm/expanded_gemm.h
+++ b/src/tensors/cpu/fbgemm/expanded_gemm.h
@ -0,0 +1,400 @@
+#pragma once
+
+#include "graph/node.h"
+#include "packed_gemm.h"
+#include "tensors/cpu/sharp/int_gemm.h"
+
+#if USE_FBGEMM
+#ifdef __GNUC__
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-variable"
+#endif
+
+#include "3rd_party/fbgemm/include/fbgemm/FbgemmFP16.h"
+
+#ifdef __GNUC__
+#pragma GCC diagnostic pop
+#endif
+
+using namespace fbgemm;
+#endif  // USE_FBGEMM
+
+namespace marian {
+namespace cpu {
+namespace variant {
+
+// Enumeration for the Matrix used in pack functions
+// A matrix - 0, B matrix - 1
+enum class PackMatrix : uint8_t {
+  A = 0x00,
+  B = 0x01
+};
+
+// Pack a matrix (fp16) in a cache-efficient way (block format), together with quantization into fp16
+// PackMatrix packMat_: the type of packed matrix - A or B matrix
+// bool transpose_: transpose
+// int nrow_: the number of rows
+// int ncol_: the number of columns
+// int kernel_ncol_blocks_: the number of column blocks
+// int brow_: the number of rows in a block
+// int bcol_: the number of columns in a block
+// int last_brow_: the number of rows in the last block
+// int nbrow_: row index in a block
+// int nbcol_: column index in a block
+// uint64_t packsize_: the size of the packed matrix
+//                    (the number of fp16 elements + padding (1024) + extra temporary memory (256))
+struct FbgemmPacked16PackNodeOp : public UnaryNodeOp {
+  PackMatrix packMat_;
+  bool transpose_;
+  int nrow_;
+  int ncol_;
+  int kernel_ncol_blocks_;
+  int brow_;
+  int bcol_;
+  int last_brow_;
+  int nbrow_;
+  int nbcol_;
+  uint64_t packsize_;
+
+  FbgemmPacked16PackNodeOp(Expr a, PackMatrix packMat, bool transpose, float clipValue)
+      : UnaryNodeOp(a, newShape(a, transpose), Type::uint8),
+        packMat_(packMat),
+        transpose_(transpose) {
+    if(packMat != PackMatrix::B)
+      ABORT("Only prepacking of B (weight matrix) is supported");
+    if(clipValue != 0)
+      ABORT("Clipping is not supported");
+    if(!memoize_)
+      ABORT("Only constant weight node can be packed");
+  }
+
+  NodeOps forwardOps() override {
+#if USE_FBGEMM
+    return {NodeOp(fbgemmPacked16Pack(val_,
+                                      child(0)->val()->data(),
+                                      transpose_,
+                                      nrow_,
+                                      ncol_,
+                                      kernel_ncol_blocks_,
+                                      brow_,
+                                      bcol_,
+                                      last_brow_,
+                                      nbrow_,
+                                      nbcol_,
+                                      packsize_))
+    };
+#else // USE_FBGEMM
+    ABORT("FbgemmPacked16PackNodeOp can only be used with FBGEMM enabled.");
+    return { NodeOp(0) };
+#endif  // USE_FBGEMM
+  }
+
+  NodeOps backwardOps() override {
+    ABORT("FbgemmPacked16PackNodeOp only available for inference");
+    return {NodeOp(0)};
+  }
+
+  const std::string type() override { return "packMatFp16"; }
+
+  Shape newShape(Expr a, bool transpose) {
+#if USE_FBGEMM
+    auto shapeMat = a->shape();
+    // Should be 2D - weight matrix
+    ABORT_IF(shapeMat.size() != 2,
+             "Weight Matrix should be 2D");
+    fbgemmPacked16PackInfo(shapeMat,
+                           transpose,
+                           nrow_,
+                           ncol_,
+                           kernel_ncol_blocks_,
+                           brow_,
+                           bcol_,
+                           last_brow_,
+                           nbrow_,
+                           nbcol_,
+                           packsize_);
+
+    Shape outShape({(int)packsize_});
+
+    return outShape;
+#else // USE_FBGEMM
+    ABORT("Packed GEMM requires a build with USE_FBGEMM enabled");
+    return Shape();
+#endif  // USE_FBGEMM
+  }
+};
+
+// Pack a matrix (int8) in a cache-efficient way (block format), together with quantization into int8
+// PackMatrix packMat_: the type of packed matrix - A or B matrix
+// bool transpose_: transpose
+// int nrow_: the number of rows
+// int ncol_: the number of columns
+// uint64_t packsize_: the size of the packed matrix
+//                    (the size of int8 packed B from fbgemm:PackAWithQuantRowOffset + quantization scale, offset and zero point)
+struct FbgemmPacked8PackNodeOp : public UnaryNodeOp {
+  PackMatrix packMat_;
+  bool transpose_;
+  int nrow_;
+  int ncol_;
+  uint64_t packsize_;
+
+  FbgemmPacked8PackNodeOp(Expr a, PackMatrix packMat, bool transpose, float clipValue)
+      : UnaryNodeOp(a, newShape(a, transpose), Type::uint8),
+        packMat_(packMat),
+        transpose_(transpose) {
+    if(packMat != PackMatrix::B)
+      ABORT("Only prepacking of B (weight matrix) is supported");
+    if(clipValue != 0)
+      ABORT("Clipping is not supported");
+    if(!memoize_)
+      ABORT("Only constant weight node can be packed");
+  }
+
+  NodeOps forwardOps() override {
+#if USE_FBGEMM
+    return {NodeOp(fbgemmPacked8Pack(val_,
+                                     child(0)->val()->data(),
+                                     transpose_,
+                                     nrow_,
+                                     ncol_,
+                                     packsize_))
+    };
+#else // USE_FBGEMM
+    ABORT("FbgemmPacked8PackNodeOp can only be used with FBGEMM enabled.");
+    return { NodeOp(0) };
+#endif  // USE_FBGEMM
+  }
+
+  NodeOps backwardOps() override {
+    ABORT("FbgemmPacked8PackNodeOp only available for inference");
+    return {NodeOp(0)};
+  }
+
+  const std::string type() override { return "packMatInt8"; }
+
+  Shape newShape(Expr a, bool transpose) {
+#if USE_FBGEMM
+    fbgemmPacked8PackInfo(a->shape(), transpose, nrow_, ncol_, packsize_);
+    Shape outShape({(int)packsize_});
+
+    return outShape;
+#else // USE_FBGEMM
+    ABORT("Packed GEMM requires a build with USE_FBGEMM enabled");
+    return Shape();
+#endif  // USE_FBGEMM
+  }
+};
+
+// Affine transform (matrix multiplication) using packed B matrix
+// float scalar_: scalar multiplier
+// size_t m_: the number of rows in A and C
+// size_t n_: the number of columns in B and C
+// size_t k_: the number of columns in A and the number of rows in B
+// bool transA_: transpose A
+// bool transB_: transpose B
+class FbgemmPacked16AffineNodeOp : public NaryNodeOp {
+private:
+  float scalar_;
+  size_t m_;
+  size_t n_;
+  size_t k_;
+  bool transA_;
+  bool transB_;
+
+public:
+  FbgemmPacked16AffineNodeOp(const std::vector<Expr>& nodes, Shape bShape, bool transA, bool transB, float scalar)
+      : NaryNodeOp(nodes, newShape(nodes[0], bShape, transA, transB), Type::float32),
+        scalar_(scalar) {
+    transA_ = transA;
+    transB_ = transB;
+    m_ = nodes[0]->shape().elements() / nodes[0]->shape()[-1];
+    k_ = nodes[0]->shape().back();
+    if(transA)
+      std::swap(m_, k_);
+
+    size_t l = bShape.elements() / bShape[-1];
+    n_ = bShape[-1];
+    if(transB)
+      std::swap(l, n_);
+  }
+
+  Shape newShape(Expr a, Shape bShape, bool transA, bool transB) {
+    auto shapeA = a->shape();
+    if(transA) {
+      shapeA.set(shapeA.size() - 2, a->shape()[shapeA.size() - 1]);
+      shapeA.set(shapeA.size() - 1, a->shape()[shapeA.size() - 2]);
+    }
+
+    auto shapeB = bShape;
+    if(transB) {
+      shapeB.set(shapeB.size() - 2, bShape[shapeB.size() - 1]);
+      shapeB.set(shapeB.size() - 1, bShape[shapeB.size() - 2]);
+    }
+
+    Shape outShape = shapeA;
+    outShape.set(outShape.size() - 1, shapeB[shapeB.size() - 1]);
+    ABORT_IF(shapeA[shapeA.size() - 1] != shapeB[shapeB.size() - 2],
+             "Matrix product requires inner dimensions to match");
+    return outShape;
+  }
+
+  NodeOps forwardOps() override {
+#if USE_FBGEMM
+    return {
+      NodeOp(fbgemmPacked16Gemm(val_,
+                                child(0)->val(),
+                                child(1)->val(),
+                                children().size() > 2 ? child(2)->val() : nullptr, // pass only if it has a bias
+                                m_,
+                                n_,
+                                transA_))
+    };
+#else // USE_FBGEMM
+    ABORT("FbgemmPacked16AffineNodeOp can only be used with FBGEMM enabled.");
+    return { NodeOp(0) };
+#endif  // USE_FBGEMM
+  }
+
+  NodeOps backwardOps() override {
+    ABORT("Only used for inference");
+    return {NodeOp(0)};
+  }
+
+  const std::string type() override { return "fp16packed"; }
+};
+
+// Affine transform (matrix multiplication) using packed B matrix
+// Especially, this gemm performs quantized gemms in 8-bit integers.
+// float scalar_: scalar multiplier
+// size_t m_: the number of rows in A and C
+// size_t n_: the number of columns in B and C
+// size_t k_: the number of columns in A and the number of rows in B
+// bool transA_: transpose A
+// bool transB_: transpose B
+class FbgemmPacked8AffineNodeOp : public NaryNodeOp {
+private:
+  float scalar_;
+  size_t m_;
+  size_t n_;
+  size_t k_;
+  bool transA_;
+  bool transB_;
+
+public:
+  FbgemmPacked8AffineNodeOp(const std::vector<Expr>& nodes, Shape bShape, bool transA, bool transB, float scalar)
+      : NaryNodeOp(nodes, newShape(nodes[0], bShape, transA, transB), Type::float32),
+        scalar_(scalar) {
+    transA_ = transA;
+    transB_ = transB;
+    m_ = nodes[0]->shape().elements() / nodes[0]->shape()[-1];
+    k_ = nodes[0]->shape().back();
+    if(transA)
+      std::swap(m_, k_);
+
+    size_t l = bShape.elements() / bShape[-1];
+    n_ = bShape[-1];
+    if(transB)
+     std::swap(l, n_);
+  }
+
+  Shape newShape(Expr a, Shape bShape, bool transA, bool transB) {
+    auto shapeA = a->shape();
+    if(transA) {
+      shapeA.set(shapeA.size() - 2, a->shape()[shapeA.size() - 1]);
+      shapeA.set(shapeA.size() - 1, a->shape()[shapeA.size() - 2]);
+    }
+
+    auto shapeB = bShape;
+    if(transB) {
+     shapeB.set(shapeB.size() - 2, bShape[shapeB.size() - 1]);
+     shapeB.set(shapeB.size() - 1, bShape[shapeB.size() - 2]);
+    }
+
+    Shape outShape = shapeA;
+    outShape.set(outShape.size() - 1, shapeB[shapeB.size() - 1]);
+    ABORT_IF(shapeA[shapeA.size() - 1] != shapeB[shapeB.size() - 2],
+            "Matrix product requires inner dimensions to match");
+    return outShape;
+  }
+
+  NodeOps forwardOps() override {
+    NodeOps nodeOps;
+#if USE_FBGEMM
+    // Do addBias only if it has a bias term
+    if (children().size() > 2) {
+      nodeOps = { NodeOp(fbgemmPacked8Gemm(val_,
+                                           child(0)->val(),
+                                           child(1)->val(),
+                                           m_,
+                                           n_,
+                                           k_,
+                                           transA_,
+                                           transB_);
+                       marian::cpu::int16::AddBias(val_, child(2)->val())) };
+    } else {
+      nodeOps = { NodeOp(fbgemmPacked8Gemm(val_,
+                                           child(0)->val(),
+                                           child(1)->val(),
+                                           m_,
+                                           n_,
+                                           k_,
+                                           transA_,
+                                           transB_)) };
+    }
+#else // USE_FBGEMM
+    ABORT("FbgemmPacked8AffineNodeOp can only be used with FBGEMM enabled.");
+#endif  // USE_FBGEMM
+
+    return nodeOps;
+  }
+
+  NodeOps backwardOps() override {
+    ABORT("Only used for inference");
+    return {NodeOp(0)};
+  }
+
+  const std::string type() override { return "int8packed"; }
+};
+
+static inline Expr affine(Expr a, Expr b, Shape bShape, Expr c, bool transA, bool transB, float scalar) {
+  std::vector<Expr> nodes = {a, b, c};
+  Type elementType = b->value_type();
+
+  if (elementType == Type::packed16)
+    return Expression<cpu::variant::FbgemmPacked16AffineNodeOp>(nodes, bShape, transA, transB, scalar);
+  else if (elementType == Type::packed8)
+    return Expression<cpu::variant::FbgemmPacked8AffineNodeOp>(nodes, bShape, transA, transB, scalar);
+  else {
+    ABORT("Only int8 and fp16 are available. {}", elementType);
+    return nullptr;
+  }
+}
+
+static inline Expr pack(Type elementType, Expr a, PackMatrix packMat, bool transpose, float clipValue) {
+  if (elementType == Type::packed16)
+    return Expression<cpu::variant::FbgemmPacked16PackNodeOp>(a, packMat, transpose, clipValue);
+  else if (elementType == Type::packed8)
+    return Expression<cpu::variant::FbgemmPacked8PackNodeOp>(a, packMat, transpose, clipValue);
+  else {
+    ABORT("Only int8 and fp16 are available. {}", elementType);
+    return nullptr;
+  }
+}
+
+static inline Expr dot(Expr a, Expr b, Shape bShape, bool transA, bool transB, float scalar) {
+  std::vector<Expr> nodes = {a, b};
+  Type elementType = b->value_type();
+
+  if (elementType == Type::packed16)
+    return Expression<cpu::variant::FbgemmPacked16AffineNodeOp>(nodes, bShape, transA, transB, scalar);
+  else if (elementType == Type::packed8)
+    return Expression<cpu::variant::FbgemmPacked8AffineNodeOp>(nodes, bShape, transA, transB, scalar);
+  else {
+    ABORT("Only int8 and fp16 are available. {}", elementType);
+    return nullptr;
+  }
+}
+
+}  // namespace variant
+}  // namespace cpu
+}  // namespace marian
--- a/src/tensors/cpu/fbgemm/expression_graph_packable.h
+++ b/src/tensors/cpu/fbgemm/expression_graph_packable.h
@ -1,7 +1,7 @@
 #pragma once

 #include "graph/expression_graph.h"
-#include "tensors/cpu/sharp/packed_gemm.h"
+#include "packed_gemm.h"

 namespace marian {

@ -34,16 +34,60 @@ public:

      Tensor val = p.second->val();

+#if USE_FBGEMM
      // save as packed format
-      // @TODO Hardcoded to find packable weights - all the weights used for affine op
-      if (saveGemmType == "fp16packed" && pName.find("_W") == pName.length() - 3) {
+      // @TODO Hardcoded to find packable weights - all the weights used for affine op (fp16), all the weights used for affine op and dot op (int8)
+      if (saveGemmType == "int8packed" && (pName.find("_W") == pName.length() - 3 || pName.find("_W") == pName.length() - 2))
+      {
+        using namespace marian::cpu::variant;
+
+        // packing information - size
+        int nrow;
+        int ncol;
+        uint64_t packsize;
+
+        fbgemmPacked8PackInfo(val->shape(),
+          pName.find("Wemb") != std::string::npos,
+          nrow,
+          ncol,
+          packsize);
+
+        auto allocator = New<TensorAllocator>(getBackend());
+
+        // buffer tensor to save packed matrix
+        Tensor packedTensor;
+        allocator->allocate(packedTensor, { 1, (int32_t)packsize }, Type::uint8);
+
+        //Pack B matrix into int8
+        fbgemmPacked8Pack(packedTensor,
+          val->data(),
+          pName.find("Wemb") != std::string::npos,
+          nrow,
+          ncol,
+          packsize);
+        io::Item item;
+        item.name = pName;
+        item.shape = val->shape();
+        item.type = Type::packed8;
+
+        // Use the actual memory as this will be aligned and padded.
+        // When memory mapping this is required. Shape keeps track of
+        // tensor size. Saving to *.npz will cut to size.
+        auto mem = packedTensor->memory();
+        item.bytes.resize(mem->size());
+        copy(backend_, mem->data<char>(), mem->data<char>() + mem->size(), item.bytes.data());
+
+        ioItems.emplace_back(std::move(item));
+
+      } else if (saveGemmType == "fp16packed" && pName.find("_W") == pName.length() - 3)
+      {
        using namespace marian::cpu::variant;

        // packing information
        int nrow, ncol, kernel_ncol_blocks, brow, bcol, last_brow, nbrow, nbcol;
        uint64_t packsize;

-        PackInfoFp32(val->shape(),
+        fbgemmPacked16PackInfo(val->shape(),
          false,
          nrow,
          ncol,
@ -60,8 +104,8 @@ public:
        Tensor packedTensor;
        allocator->allocate(packedTensor, { 1, (int32_t)packsize }, Type::uint8);

-        // PackFp32
-        PackFp32(packedTensor,
+        // fbgemmPacked16Pack
+        fbgemmPacked16Pack(packedTensor,
          val->data(),
          false,
          nrow,
@ -86,7 +130,9 @@ public:
        copy(backend_, mem->data<char>(), mem->data<char>() + mem->size(), item.bytes.data());

        ioItems.emplace_back(std::move(item));
-      } else {
+      } else
+#endif  // USE_FBGEMM
+      {
        io::Item item;
        val->get(item, pName);
        item.convert(saveElementType);
--- a/src/tensors/cpu/fbgemm/packed_gemm.cpp
+++ b/src/tensors/cpu/fbgemm/packed_gemm.cpp
@ -0,0 +1,496 @@
+#include "packed_gemm.h"
+#include "tensors/tensor_allocator.h"
+#include "tensors/tensor_operators.h"
+
+#include <emmintrin.h>
+#include <immintrin.h>
+#include <tmmintrin.h>
+#include <xmmintrin.h>
+#include <cassert>
+#include <cstddef>
+#include <unordered_map>
+//#include <chrono>
+
+#if USE_FBGEMM
+#ifdef _MSC_VER
+#pragma warning(disable: 4505) // 'fbgemmAlignedAlloc' in fbgemm.h: unreferenced local function has been removed (missing 'static inline')
+#pragma warning(disable: 4251) // 'fbgemm::CompressedSparseColumn::colptr_': class 'std::vector<int,std::allocator<_Ty>>' needs to have dll-interface to be used by clients of class 'fbgemm::CompressedSparseColumn'
+#pragma warning(disable: 4661) // 'fbgemm::PackMatrix<fbgemm::PackBMatrix<int8_t,int32_t>,int8_t,int32_t>::PackMatrix(int32_t,int32_t,inpType *,int,const fbgemm::BlockingFactors *)': no suitable definition provided for explicit template instantiation request
+#pragma warning(disable: 4244) // fbgemm\quantutils.h(51): warning C4244: 'return': conversion from 'const _Ty' to 'T2', possible loss of data
+// the following does not work; need to manually disable them in Linker options
+//#pragma comment(linker, "/ignore:4049") // locally defined symbol ...asmjit... imported
+//#pragma comment(linker, "/ignore:4217") // locally defined symbol ...asmjit... imported
+#endif
+
+#ifdef __GNUC__
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-variable"
+#endif
+#include "3rd_party/fbgemm/include/fbgemm/FbgemmFP16.h"
+#include "3rd_party/fbgemm/include/fbgemm/QuantUtils.h"
+#include "3rd_party/fbgemm/include/fbgemm/Fbgemm.h"
+#ifdef __GNUC__
+#pragma GCC diagnostic pop
+#endif
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+#if MKL_FOUND
+#include <mkl.h>
+#include <mkl_types.h>
+#endif
+
+using namespace fbgemm;
+#endif // USE_FBGEMM
+
+namespace marian {
+namespace cpu {
+namespace variant { // Variants of GEMM implementations
+
+#if USE_FBGEMM
+// Per-thread placeholder instance of fbgemm's packed fp16 matrix class, initialized with dummy values.
+// When this class is instantiated, the actual packing operation happens.
+// If we created an instance on every GEMM call, we would pack every time, which is very slow.
+// In Caffe2, the operator is stateful and holds an instance of this class.
+// Marian has no logic for this; it can only cache a tensor (which means a memory chunk).
+// So, for now, we keep the packed memory in our own 1D tensor and, when we call GEMM,
+// we reuse this instance again and again by replacing its members (including the memory pointer).
+// Eventually, a new constructor is to be added to the class in FBGEMM which accepts
+// pre-allocated and pre-packed memory as a parameter. After that is done,
+// this temporary buffer will be removed.
+// When constructing this dummy buffer, ones are used for all the parameters to allocate the minimum amount of memory.
+//
+// In a multi marian instance setting (as a dynamic library),
+// different marian instances should not share this variable.
+// (The variable is declared thread_local, so every thread works on its own instance.)
+static thread_local PackedGemmMatrixFP16 packedPlaceholder(1, 1, 1, 1, 1, 1, 1, 1);
+
+// Padding (in bytes) added to the size of the packed fp16 buffer. Value copied from FBGEMM, where some kernels require it.
+// FBGEMM's own comment, verbatim - 'required by sw pipelined kernels'
+// https://github.com/marian-nmt/FBGEMM/blob/master/include/fbgemm/FbgemmFP16.h#L109
+const int PACK16_PADDING = 1024;  
+
+// Size (in bytes) of the memory space reserved for auxiliary variables of FBGEMM (e.g. block row, block column, kernel_ncol_blocks, etc.)
+const int PACK16_SPECIALMEM = 256;
+
+// Address arithmetic for the blocked row-major format.
+// Copied from the FBGEMM code (is there a better way?);
+// will be removed when the FBGEMM api is changed.
+/**
+ * Maps an element of the original (non-packed) matrix to its position in the
+ * packed (block formatted) matrix array.
+ *
+ * @param r_ row index in the original matrix
+ * @param c_ column index in the original matrix
+ * @param brow_ number of rows in a block
+ * @param bcol_ number of columns in a block
+ * @param nbrow_ number of block rows
+ * @param nbcol_ number of block columns
+ * @param last_brow_ number of rows in the blocks of the last block row
+ * @return element index into the packed array
+ */
+inline uint64_t addr(const int r_,
+                     const int c_,
+                     const int brow_,
+                     const int bcol_,
+                     const int nbrow_,
+                     const int nbcol_,
+                     const int last_brow_) {
+  const uint64_t row = (uint64_t)r_;
+  const uint64_t col = (uint64_t)c_;
+
+  // block that contains the element
+  const uint64_t blockRow = row / brow_;
+  const uint64_t blockCol = col / bcol_;
+
+  // blocks of the last block row only hold last_brow_ rows
+  uint64_t elemsPerBlock;
+  if(blockRow != nbrow_ - 1)
+    elemsPerBlock = brow_ * bcol_;
+  else
+    elemsPerBlock = last_brow_ * bcol_;
+
+  // skip the block rows above, then the blocks to the left within this block row
+  const uint64_t blockStart = (blockRow * nbcol_) * (brow_ * bcol_) + blockCol * elemsPerBlock;
+
+  // row-major position inside the block
+  const uint64_t withinBlock = row % brow_ * bcol_ + col % bcol_;
+
+  return blockStart + withinBlock;
+}
+
+// Size-only overload: forwards to the full overload and keeps just the packed
+// size (in bytes); the block layout parameters are computed and discarded.
+void fbgemmPacked16PackInfo(const marian::Shape& shape,
+                            const bool transpose,
+                            uint64_t& packsize) {
+  // throw-away receivers for the layout outputs of the full overload
+  int nrow, ncol;
+  int kernel_ncol_blocks;
+  int brow = 512, bcol, last_brow;
+  int nbrow, nbcol;
+  fbgemmPacked16PackInfo(
+      shape, transpose, nrow, ncol, kernel_ncol_blocks, brow, bcol, last_brow, nbrow, nbcol, packsize);
+}
+
+// Computes the block layout and the buffer size (in bytes) of the fp16 packed format.
+// shape: shape of the tensor to be packed (the first two dimensions are read)
+// transpose: whether the matrix is transposed (swaps the roles of the two dimensions)
+// nrow (out): the number of rows
+// ncol (out): the number of columns
+// kernel_ncol_blocks (out): the number of column blocks per kernel (always 2)
+// brow (out): the number of rows in a block (always 512)
+// bcol (out): the number of columns in a block (8 * kernel_ncol_blocks)
+// last_brow (out): the number of rows in the last block row
+// nbrow (out): the number of block rows
+// nbcol (out): the number of block columns
+// packsize (out): padded fp16 payload + PACK16_PADDING + PACK16_SPECIALMEM, in bytes
+// Aborts when the number of columns is not a multiple of the block width.
+void fbgemmPacked16PackInfo(const marian::Shape& shape,
+                            const bool transpose,
+                            int& nrow,
+                            int& ncol,
+                            int& kernel_ncol_blocks,
+                            int& brow,
+                            int& bcol,
+                            int& last_brow,
+                            int& nbrow,
+                            int& nbcol,
+                            uint64_t& packsize) {
+  nrow = transpose ? shape[1] : shape[0];
+  ncol = transpose ? shape[0] : shape[1];
+  kernel_ncol_blocks = 2;
+  brow = 512;
+  bcol = 8 * kernel_ncol_blocks;
+  last_brow = nrow % brow == 0 ? brow : nrow % brow;
+  nbrow = nrow % brow == 0 ? nrow / brow : (nrow + brow) / brow;
+  nbcol = ncol % bcol == 0 ? ncol / bcol : (ncol + bcol) / bcol;
+  ABORT_IF(ncol % bcol != 0, "ncol (number of columns) should be multiple of 16. {}", ncol);
+  // Do the size arithmetic in 64 bits: the padded element count of a very large
+  // matrix does not fit into a 32-bit int.
+  const uint64_t paddedRows = (uint64_t)nbrow * (uint64_t)brow;
+  const uint64_t paddedCols = (uint64_t)nbcol * (uint64_t)bcol;
+  packsize = paddedRows * paddedCols * sizeof(fbgemm::float16) + PACK16_PADDING
+             + PACK16_SPECIALMEM;
+}
+
+// Computes the dimensions of the weight matrix and the buffer size (in bytes) needed
+// to store it in fbgemm's int8 packed format together with its per-column quantization data.
+// shape: shape of the tensor to be packed, must be 2D (aborts otherwise)
+// transpose: whether the matrix is transposed (swaps the roles of the two dimensions)
+// nrow (out): the number of rows
+// ncol (out): the number of columns
+// packsize (out): the size of the packed buffer in bytes
+void fbgemmPacked8PackInfo(const marian::Shape& shape,
+                           const bool transpose,
+                           int& nrow,
+                           int& ncol,
+                           uint64_t& packsize) {
+    // Only a 2D weight matrix can be packed
+    ABORT_IF(shape.size() != 2,
+            "Weight Matrix should be 2D");
+    if(transpose) {
+      nrow = shape[1];
+      ncol = shape[0];
+    } else {
+      nrow = shape[0];
+      ncol = shape[1];
+    }
+    // size of the packed int8 matrix itself, as reported by fbgemm
+    packsize = fbgemm::PackMatrix<fbgemm::PackBMatrix<int8_t>, int8_t>::packedBufferSize(nrow, ncol);
+    // extra space for three per-column arrays specific to the B matrix:
+    //   quantization scales (float), quantization zero points (int32) and column offsets (int32)
+    const size_t bytesPerColumn = sizeof(float) + sizeof(int32_t) + sizeof(int32_t);
+    packsize += ncol * bytesPerColumn;
+}
+
+// Computes, for every column n of the quantized B matrix, the zero point corrected column sum
+//   col_offsets[n] = sum over k of B(k, n) - B_zero_point[n / ncols_per_quant_group] * K
+// These offsets are used for compensating the remainders of the quantized values.
+// More detailed math is available in FBGEMM's blog - https://engineering.fb.com/ml-applications/fbgemm/
+// transpose: if true, element (k, n) is read from Bint8[k + n * K], otherwise from Bint8[k * N + n]
+// K: the number of rows of B
+// N: the number of columns of B
+// Bint8: quantized B matrix
+// B_zero_point: zero points, one per group of ncols_per_quant_group columns
+// col_offsets (out): N computed offsets
+// ncols_per_quant_group: the number of consecutive columns sharing one zero point
+inline void col_offsets_with_zero_pt_s8acc32(
+    bool transpose,
+    int K,
+    int N,
+    const int8_t* Bint8,
+    const int32_t* B_zero_point,
+    int32_t* col_offsets,
+    int ncols_per_quant_group) {
+  // index distance between vertically / horizontally adjacent elements
+  const int rowStride = transpose ? 1 : N;
+  const int colStride = transpose ? K : 1;
+
+  for (int col = 0; col < N; ++col) {
+    int32_t colSum = 0;
+    for (int row = 0; row < K; ++row) {
+      colSum += Bint8[row * rowStride + col * colStride];
+    }
+    const int32_t zeroPoint = B_zero_point[col / ncols_per_quant_group];
+    col_offsets[col] = colSum - zeroPoint * K;
+  }
+}
+
+// Packs a float matrix into the blocked fp16 layout inside the memory of a marian tensor.
+// Layout of the output buffer:
+//   bytes [0, 8): packsize (uint64)
+//   next 32 bytes: nrow, ncol, kernel_ncol_blocks, brow, bcol, last_brow, nbrow, nbcol (int32 each)
+//   from byte PACK16_SPECIALMEM: fp16 elements, positioned by addr()
+// out: output tensor - must provide at least packsize bytes
+// inData: input matrix data - pointer of float data
+// transpose: if true, element (i, j) is read from inData[i + nrow * j], otherwise from inData[i * ncol + j]
+// nrow, ncol, kernel_ncol_blocks, brow, bcol, last_brow, nbrow, nbcol: layout parameters
+//   as computed by fbgemmPacked16PackInfo
+// packsize: the size of the packed buffer in bytes
+void fbgemmPacked16Pack(marian::Tensor out,
+                        const float* inData, // Packing is only available for 2D weight matrix in Marian. Otherwise, it's aborted in expanded_gemm.h.
+                        const bool transpose,
+                        const int nrow,
+                        const int ncol,
+                        const int kernel_ncol_blocks,
+                        const int brow,
+                        const int bcol,
+                        const int last_brow,
+                        const int nbrow,
+                        const int nbcol,
+                        const uint64_t packsize) {
+  // initialize memory, so that paddings and unused header bytes are zero
+  uint8_t* outmemorg = out->data<uint8_t>();
+  memset(outmemorg, 0, packsize);
+  // save the total size first ...
+  uint64_t* auxmemsize = (uint64_t*)outmemorg;
+  auxmemsize[0] = packsize;
+  // ... followed by the FBGEMM related parameters, in the header of the memory allocated by marian
+  const int32_t header[8] = {nrow, ncol, kernel_ncol_blocks, brow, bcol, last_brow, nbrow, nbcol};
+  memcpy(auxmemsize + 1, header, sizeof(header));
+  // the fp16 payload starts right behind the area reserved for the auxiliary variables
+  fbgemm::float16* outmem = (fbgemm::float16*)(outmemorg + PACK16_SPECIALMEM);
+  // second argument of tconv(); presumably only needed to select the fp16 conversion - TODO confirm
+  fbgemm::float16 typeTag{};
+  // pack the matrix
+  for(int i = 0; i < nrow; i++) {
+    for(int j = 0; j < ncol; j++) {
+      const size_t src = !transpose ? (size_t)i * ncol + j : (size_t)i + (size_t)nrow * j;
+      outmem[addr(i, j, brow, bcol, nbrow, nbcol, last_brow)] = tconv(inData[src], typeTag);
+    }
+  }
+}
+
+// Quantizes a float weight matrix into int8 (one scale and one zero point per column)
+// and packs it into fbgemm's int8 B matrix format inside the memory of a marian tensor.
+// Layout of the output buffer (packsize bytes in total):
+//   [packed int8 matrix][n scales (float)][n zero points (int32)][n column offsets (int32)]
+// where the three arrays occupy the last n * 12 bytes of the buffer.
+// out: output tensor - must provide at least packsize bytes
+// inData: input matrix data - pointer of float data
+// transpose: if true, element (ii, jj) is read from inData[jj * nrow + ii], otherwise from inData[ii * ncol + jj]
+// nrow: the number of rows
+// ncol: the number of columns
+// packsize: the size of the packed buffer in bytes, as computed by fbgemmPacked8PackInfo
+void fbgemmPacked8Pack(marian::Tensor out,
+                       const float* inData,
+                       const bool transpose,
+                       const int nrow,
+                       const int ncol,
+                       const uint64_t packsize) {
+  const int k = nrow;
+  const int n = ncol;
+  const size_t len = (size_t)k * (size_t)n;
+
+  // element (ii, jj) is stored at inData[ii * rowStride + jj * colStride]
+  const size_t rowStride = transpose ? 1 : (size_t)n;
+  const size_t colStride = transpose ? (size_t)k : 1;
+
+  // 1. collect stats for each column
+  std::vector<float> bqScale(n);
+  std::vector<int32_t> bqZeropoint(n);
+
+  for (int jj = 0; jj < n; jj++) {
+    double mean = 0, sqrsum = 0;
+    for (int ii = 0; ii < k; ii++) {
+      const float val = inData[ii * rowStride + jj * colStride];
+      mean += val;
+      sqrsum += val * val;
+    }
+    mean /= k;
+    sqrsum /= k;
+    sqrsum -= mean * mean;
+    sqrsum = sqrt(sqrsum);  // standard deviation of the column
+
+    // the quantization range covers mean +/- 7 standard deviations
+    const float minval = (float)(mean - 7.0f*sqrsum);
+    const float maxval = (float)(mean + 7.0f*sqrsum);
+    // NOTE(review): a column of identical values yields a zero scale and a division by zero here - confirm this cannot happen
+    bqScale[jj] = (maxval - minval) / 255;
+    bqZeropoint[jj] = (int32_t)(127 - maxval / bqScale[jj]);
+  }
+
+  // 2. quantize (into a temporary buffer aligned to 256 bytes)
+  int8_t* quantized = nullptr;
+#ifdef _MSC_VER
+  quantized = (int8_t*)_aligned_malloc(len, 256);
+#else
+  if (posix_memalign((void**)&quantized, 256, len) != 0)
+    quantized = nullptr;
+#endif
+  // fail loudly also in release builds instead of writing through a null pointer
+  ABORT_IF(quantized == nullptr && len > 0, "Failed to allocate {} bytes for the quantized matrix", len);
+
+  for (int jj = 0; jj < n; jj++) {
+    TensorQuantizationParams bQuantParam;
+    bQuantParam.scale = bqScale[jj];
+    bQuantParam.zero_point = bqZeropoint[jj];
+    bQuantParam.precision = 8;
+
+    if (transpose) {
+      // a column is contiguous in memory: quantize it with a single call
+      fbgemm::Quantize<int8_t>(inData + (size_t)jj * k, quantized + (size_t)jj * k, k, bQuantParam);
+    } else {
+      for (int ii = 0; ii < k; ii++) {
+        const size_t idx = (size_t)ii * n + jj;
+        quantized[idx] = fbgemm::Quantize<int8_t>(inData[idx], bQuantParam);
+      }
+    }
+  }
+
+  // 3. compute column offsets
+  std::vector<int32_t> col_offsets(n);
+  col_offsets_with_zero_pt_s8acc32(transpose, k, n, quantized, bqZeropoint.data(), col_offsets.data(), 1);
+
+  // initialize the output memory
+  int8_t* packedbuf = out->data<int8_t>();
+  memset(packedbuf, 0, packsize);
+
+  // 4. packing
+  PackBMatrix<int8_t> packedBN(
+      transpose ? matrix_op_t::Transpose : matrix_op_t::NoTranspose,
+      nrow, ncol, quantized, transpose ? nrow : ncol, packedbuf, 1);
+
+  // 5. append the quantization data at the very end of the buffer
+  const size_t scaleBytes = n * sizeof(float);
+  const size_t int32Bytes = n * sizeof(int32_t);
+  int8_t* trailer = packedbuf + (packsize - scaleBytes - 2 * int32Bytes);
+  // copy quantization scale
+  memcpy(trailer, bqScale.data(), scaleBytes);
+  // copy quantization offset
+  memcpy(trailer + scaleBytes, bqZeropoint.data(), int32Bytes);
+  // copy column offsets to the memory
+  memcpy(trailer + scaleBytes + int32Bytes, col_offsets.data(), int32Bytes);
+
+#ifdef _MSC_VER
+  _aligned_free(quantized);
+#else
+  free(quantized);
+#endif
+}
+
+// GEMM operation on the fp16 packed B matrix; the per-thread placeholder is pointed at B's memory for the call
+// C: output matrix (every one of its m rows is pre-filled with bias when bias is given)
+// A: A matrix
+// B: B matrix (packed): total size and 8 int32 layout fields at the start, fp16 data from byte 256 on
+// bias: bias vector of n floats, or nullptr for no bias
+// m: the number of rows in A and C
+// n: the number of columns in B and C
+// transA: transpose of A matrix. B is already packed. So, we don't need transB
+void fbgemmPacked16Gemm(marian::Tensor C,
+                        const marian::Tensor A,
+                        const marian::Tensor B,
+                        const marian::Tensor bias,
+                        const size_t m,
+                        const size_t n,
+                        const int transA) {
+  // row major. NOTE(review): packedPlaceholder is thread_local but is read inside the OpenMP region below - confirm worker threads see the patched members
+  // remember the placeholder's own memory pointer, it is restored at the end
+  fbgemm::float16* pmat = packedPlaceholder.pmat_;
+  // retrieve the aux fields (total size, then 8 layout values) from the start of B's memory
+  uint64_t* packedmemSize = (uint64_t*)B->data();
+  packedPlaceholder.size_ = packedmemSize[0];
+  int32_t header[8];
+  memcpy(header, packedmemSize + 1, sizeof(header));
+  packedPlaceholder.nrow_ = header[0];
+  packedPlaceholder.ncol_ = header[1];
+  packedPlaceholder.kernel_ncol_blocks_ = header[2];
+  packedPlaceholder.brow_ = header[3];
+  packedPlaceholder.bcol_ = header[4];
+  packedPlaceholder.last_brow_ = header[5];
+  packedPlaceholder.nbrow_ = header[6];
+  packedPlaceholder.nbcol_ = header[7];
+
+  // packed matrix: the fp16 data starts 256 bytes into B's memory
+  packedPlaceholder.pmat_ = (fbgemm::float16*)(B->data<uint8_t>() + 256);
+
+  if(bias != nullptr) {
+#if MKL_FOUND
+    for(int i = 0; i < m; ++i) {
+      mkl_somatcopy('R', 'N', 1, n, 1, bias->data(), n, C->data() + n * i, n);
+    }
+#else
+    for(int i = 0; i < m; ++i) {
+      std::copy(bias->data(), bias->data() + n, C->data() + n * i);
+    }
+#endif
+  }
+
+#ifdef _OPENMP
+#pragma omp parallel
+#endif
+  {
+#ifdef _OPENMP
+    int num_threads = omp_get_num_threads();
+    int tid = omp_get_thread_num();
+#else
+    int num_threads = 1;
+    int tid = 0;
+#endif
+    fbgemm::cblas_gemm_compute(transA ? matrix_op_t::Transpose : matrix_op_t::NoTranspose,
+                      (int)m,
+                      A->data(),
+                      packedPlaceholder,
+                      bias != nullptr ? 1.0f : 0.0f, // presumably the factor applied to the existing content of C (the bias) - TODO confirm
+                      C->data(),
+                      tid,
+                      num_threads);
+  }
+
+  // give the placeholder its original memory pointer back
+  packedPlaceholder.pmat_ = pmat;
+}
+
+// GEMM operation on the packed B matrix in 8 bit integers
+// A is quantized to uint8 on the fly, using the min/max range over all of its elements.
+// The per-column quantization data of B (scales, zero points, column offsets) is read
+// from the memory right behind the packed matrix, where fbgemmPacked8Pack stored it.
+// C: output matrix
+// A: A matrix
+// B: B matrix (packed)
+// m: the number of rows in A and C
+// n: the number of columns in B and C
+// k: the number of columns in A and the number of rows in B
+// transA: whether A matrix is transposed or not
+// transB: whether B matrix is transposed or not
+void fbgemmPacked8Gemm(marian::Tensor C,
+                       const marian::Tensor A,
+                       const marian::Tensor B,
+                       const size_t m,
+                       const size_t n,
+                       const size_t k,
+                       const int transA,
+                       const int transB) {
+  // compute range to quantize A (activations) - (min/max quantization)
+  // lowest() is the most negative float; min() would be the smallest positive one
+  float min_est = std::numeric_limits<float>::max(), max_est = std::numeric_limits<float>::lowest();
+
+  int elem = A->shape().elements();
+  float* data = A->data();
+  // AVX based find min/max
+  FindMinMax(data, &min_est, &max_est, elem);
+
+  // NOTE(review): if all elements of A are equal, ascale is 0 and the division below is undefined - confirm this cannot happen
+  float ascale = (max_est - min_est) / 255;
+  int32_t azeropoint = (int32_t)(255 - max_est / ascale);
+
+  std::vector<int32_t> row_offset_buf(PackAWithQuantRowOffset<uint8_t>::rowOffsetBufferSize());
+  PackAWithQuantRowOffset<uint8_t> packAN(
+      transA ? matrix_op_t::Transpose : matrix_op_t::NoTranspose,
+      (int32_t) (transA ? k : m),
+      (int32_t) (transA ? m : k),
+      A->data(),
+      (int32_t) (transA ? m : k),
+      nullptr, /*buffer for packed matrix*/
+      ascale,
+      azeropoint,
+      1, /*groups*/
+      row_offset_buf.data());
+
+  // packed matrix size of B
+  int bPackSize = PackMatrix<PackBMatrix<int8_t>, int8_t>::packedBufferSize((int32_t)k, (int32_t)n);
+
+  // retrieve B matrix and the quantization data stored behind it:
+  // n scales (float), n zero points (int32), n column offsets (int32)
+  int8_t* bdata = B->data<int8_t>();
+  const int8_t* trailer = bdata + bPackSize;
+
+  std::vector<float> bqScale(n);
+  memcpy(bqScale.data(), trailer, n * sizeof(float));
+
+  std::vector<int32_t> bqZeropoint(n);
+  memcpy(bqZeropoint.data(), trailer + n * sizeof(float), n * sizeof(int32_t));
+
+  std::vector<int32_t> col_offsets(n);
+  memcpy(col_offsets.data(), trailer + n * (sizeof(float) + sizeof(int32_t)), n * sizeof(int32_t));
+
+  DoNothing<float, float> doNothingObj{};
+  ReQuantizeForFloat<false, QuantizationGranularity::OUT_CHANNEL> outputProcObj(
+      doNothingObj,
+      ascale,
+      bqScale.data(),
+      azeropoint,
+      bqZeropoint.data(),
+      packAN.getRowOffsetBuffer(),
+      col_offsets.data(),
+      nullptr,
+      (std::uint32_t) n);
+
+  // wrap the already packed memory of B; the leading dimension and the number of groups (1)
+  // are two separate arguments (they used to be fused into one comma expression that evaluated to 1)
+  PackBMatrix<int8_t> repackedBN(
+    transB ? matrix_op_t::Transpose : matrix_op_t::NoTranspose, (int32_t) k, (int32_t) n, bdata, (int32_t) (transB ? k : n), 1);
+
+  // gemm computation
+  fbgemmPacked(packAN, repackedBN, C->data(), (int32_t*)C->data(), (int32_t) n, outputProcObj, 0, 1);
+}
+
+#endif // USE_FBGEMM
+
+}  // namespace variant
+}  // namespace cpu
+}  // namespace marian
--- a/src/tensors/cpu/fbgemm/packed_gemm.h
+++ b/src/tensors/cpu/fbgemm/packed_gemm.h
@ -0,0 +1,137 @@
+#pragma once
+
+#include "tensors/tensor.h"
+
+namespace marian {
+namespace cpu {
+namespace variant { // Variants of GEMM implementations
+
+// Computes the byte size of a matrix packed in fp16. The size follows fbgemm's internal logic, i.e. it accounts for the paddings and the blocked layout.
+// Packing with fp16 only targets AVX2 instruction sets for now.
+// See '3rd_party/fbgemm/include/fbgemm/FbgemmFP16.h'.
+// shape: shape of the tensor to be packed
+// transpose: whether the matrix is transposed
+// packsize (out): the size of the packed matrix in bytes
+void fbgemmPacked16PackInfo(const marian::Shape& shape,
+                            const bool transpose,
+                            /*out*/uint64_t& packsize);
+
+// Computes the byte size of a matrix packed in fp16. The size follows fbgemm's internal logic, i.e. it accounts for the paddings and the blocked layout.
+// In addition, this overload outputs the parameters that describe the blocked layout.
+// Packing with fp16 only targets AVX2 instruction sets for now.
+// See '3rd_party/fbgemm/include/fbgemm/FbgemmFP16.h'.
+// shape: shape of the tensor to be packed
+// transpose: whether the matrix is transposed
+// nrow (out): the number of rows
+// ncol (out): the number of columns
+// kernel_ncol_blocks (out): the number of column blocks
+// brow (out): the number of rows in a block
+// bcol (out): the number of columns in a block
+// last_brow (out): the number of rows in the last block
+// nbrow (out): the number of block rows
+// nbcol (out): the number of block columns
+// packsize (out): the size of the packed matrix in bytes
+void fbgemmPacked16PackInfo(const marian::Shape& shape,
+                          const bool transpose,
+                          /*out*/int& nrow,
+                          /*out*/int& ncol,
+                          /*out*/int& kernel_ncol_blocks,
+                          /*out*/int& brow,
+                          /*out*/int& bcol,
+                          /*out*/int& last_brow,
+                          /*out*/int& nbrow,
+                          /*out*/int& nbcol,
+                          /*out*/uint64_t& packsize); // @TODO: change to size_t where appropriate
+
+// Computes the byte size of a matrix packed in int8. The size follows fbgemm's internal logic, i.e. it accounts for the paddings and the packed layout.
+// See '3rd_party/fbgemm/src/PackBMatrix.cc'.
+// shape: shape of the tensor to be packed (must be 2D)
+// transpose: whether the matrix is transposed
+// nrow (out): the number of rows
+// ncol (out): the number of columns
+// packsize (out): the size of the packed matrix in bytes, including the per-column quantization data
+void fbgemmPacked8PackInfo(const marian::Shape& shape,
+                           const bool transpose,
+                           /*out*/int& nrow,
+                           /*out*/int& ncol,
+                           /*out*/uint64_t& packsize);
+
+// Pack a float matrix into a cache-efficient block format, converting the elements to fp16
+// out: output tensor - packed format
+// inData: input tensor data - pointer of float data
+// transpose: whether the matrix is transposed
+// nrow: the number of rows
+// ncol: the number of columns
+// kernel_ncol_blocks: the number of column blocks
+// brow: the number of rows in a block
+// bcol: the number of columns in a block
+// last_brow: the number of rows in the last block
+// nbrow: the number of block rows
+// nbcol: the number of block columns
+// packsize: the size of the packed matrix in bytes
+//          (the bytes of the padded fp16 elements + padding (1024) + extra auxiliary memory (256))
+void fbgemmPacked16Pack(marian::Tensor out,
+                        const float* inData,
+                        const bool transpose,
+                        const int nrow,
+                        const int ncol,
+                        const int kernel_ncol_blocks,
+                        const int brow,
+                        const int bcol,
+                        const int last_brow,
+                        const int nbrow,
+                        const int nbcol,
+                        const uint64_t packsize); // @TODO: change to size_t where appropriate
+
+// Quantize a float matrix into int8 and pack it into a cache-efficient block format
+// out: output tensor - packed format and quantized into int8
+// inData: input tensor data - pointer of float data
+// transpose: whether the matrix is transposed
+// nrow: the number of rows
+// ncol: the number of columns
+// packsize: the size of the packed matrix in bytes
+//          (the size of the int8 packed B from fbgemm::PackBMatrix + per-column quantization scales, zero points and column offsets)
+void fbgemmPacked8Pack(marian::Tensor out,
+                       const float* inData,
+                       const bool transpose,
+                       const int nrow,
+                       const int ncol,
+                       const uint64_t packsize); // @TODO: change to size_t where appropriate
+
+// GEMM operation on the packed (fp16) B matrix
+// C: output matrix
+// A: A matrix
+// B: B matrix (packed)
+// bias: bias vector (n elements) copied into every row of C before the multiplication, or nullptr for no bias
+// m: the number of rows in A and C
+// n: the number of columns in B and C
+// transA: transpose of A matrix. B is already packed. So, we don't need transB
+void fbgemmPacked16Gemm(marian::Tensor C,
+                        const marian::Tensor A,
+                        const marian::Tensor B,
+                        const marian::Tensor bias,
+                        const size_t m,
+                        const size_t n,
+                        const int transA = 0);
+
+// GEMM operation on the packed B matrix in 8 bit integers
+// C: output matrix
+// A: A matrix (float; quantized inside the function)
+// B: B matrix (packed and quantized into int8)
+// m: the number of rows in A and C
+// n: the number of columns in B and C
+// k: the number of columns in A and rows in B
+// transA: whether A matrix is transposed or not
+// transB: whether B matrix is transposed or not
+void fbgemmPacked8Gemm(marian::Tensor C,
+                       const marian::Tensor A,
+                       const marian::Tensor B,
+                       const size_t m,
+                       const size_t n,
+                       const size_t k,
+                       const int transA = 0,
+                       const int transB = 0);
+
+}  // namespace variant
+}  // namespace cpu
+}  // namespace marian
--- a/src/tensors/cpu/sharp/packed_gemm.cpp
+++ b/src/tensors/cpu/sharp/packed_gemm.cpp
@ -1,313 +0,0 @@
-#include "packed_gemm.h"
-#include "tensors/tensor_allocator.h"
-#include "tensors/tensor_operators.h"
-
-#include <emmintrin.h>
-#include <immintrin.h>
-#include <tmmintrin.h>
-#include <xmmintrin.h>
-#include <cassert>
-#include <cstddef>
-#include <unordered_map>
-//#include <chrono>
-
-#if USE_FBGEMM
-#ifdef _MSC_VER
-#pragma warning(disable: 4505) // 'fbgemmAlignedAlloc' in fbgemm.h: unreferenced local function has been removed (missing 'static inline')
-#pragma warning(disable: 4251) // 'fbgemm::CompressedSparseColumn::colptr_': class 'std::vector<int,std::allocator<_Ty>>' needs to have dll-interface to be used by clients of class 'fbgemm::CompressedSparseColumn'
-// the following does not work; need to manually disable them in Linker options
-//#pragma comment(linker, "/ignore:4049") // locally defined symbol ...asmjit... imported
-//#pragma comment(linker, "/ignore:4217") // locally defined symbol ...asmjit... imported
-#endif
-
-#ifdef __GNUC__
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wunused-variable"
-#endif
-#include "3rd_party/fbgemm/include/fbgemm/FbgemmFP16.h"
-#include "3rd_party/fbgemm/include/fbgemm/QuantUtils.h"
-#include "3rd_party/fbgemm/include/fbgemm/Fbgemm.h"
-#ifdef __GNUC__
-#pragma GCC diagnostic pop
-#endif
-
-#ifdef _OPENMP
-#include <omp.h>
-#endif
-
-#if MKL_FOUND
-#include <mkl.h>
-#include <mkl_types.h>
-#endif
-
-using namespace fbgemm;
-#endif // USE_FBGEMM
-
-namespace marian {
-namespace cpu {
-namespace variant { // Variants of GEMM implementations
-
-#if USE_FBGEMM
-// initialize with a dummy
-// When this class is instantiated,
-// the actual packing operation is happening. If we create this instance every time we call GEMM,
-// we are doing packing every time and very slow.
-// In Caffe2, the operator is stateful and hold an instance of this.
-// But, we don't have any logic for this in marian. We can only cache a tensor (which means a memory chunk).
-// So, for now, we keep the packed memory on our own 1D tensor, then when we call GEMM,
-// we just reuse this instance again and again by replacing the class members (including memory pointer). Eventually,
-// I will add a new constructor to the class in FBGEMM which accepts
-// pre - allocated and pre - packed memory as a parameter.After it's done,
-// this temporary buffer will be removed.
-// When constructing this dummy buffer, ones are used for all the parameters to allocate minimum amount of memory.
-//
-// In a multi marian instance setting (as a dynamic library),
-// different marian instances should not share this variable.
-static thread_local PackedGemmMatrixFP16 packedPlaceholder(1, 1, 1, 1, 1, 1, 1, 1);
-
-// Copied code from fbgemm. It's padding required from some kernel in FBGEMM
-// Verbatim - 'required by sw pipelined kernels'
-// https://github.com/marian-nmt/FBGEMM/blob/master/include/fbgemm/FbgemmFP16.h#L109
-const int PACK16_PADDING = 1024;  
-
-// This is a memory space to store auxiliary variables for FBGEMM (e.g. block row, block column, kernel_ncol_blocks and etc.)
-const int PACK16_SPECIALMEM = 256;
-
-// This is copied from FBGEMM code
-// A better way?
-// will be removed, when FBGEMM api is changed
-// blocked row-major format address arithmetic
-/**
- * Returns the memory address in the packed (block formatted) matrix array of a specific element 
- * indexed by the original non-packed array.
- *
- * @param r_ row index in the original matrix
- * @param c_ column index in the original matrix
- * @param brow_ row wide block index
- * @param bcol_ column wide block index
- * @param nbrow_ number of blocks in row
- * @param nbcol_ number of blocks in column
- * @param last_brow_ row number of the last block
- */
-inline uint64_t addr(const int r_,
-                     const int c_,
-                     const int brow_,
-                     const int bcol_,
-                     const int nbrow_,
-                     const int nbcol_,
-                     const int last_brow_) {
-  uint64_t r = (uint64_t)r_;
-  uint64_t c = (uint64_t)c_;
-
-  uint64_t block_row_id = r / brow_;
-  uint64_t brow_offset = (block_row_id * nbcol_) * (brow_ * bcol_);
-  uint64_t block_col_id = c / bcol_;
-  uint64_t bcol_offset
-      = block_col_id * ((block_row_id != nbrow_ - 1) ? (brow_ * bcol_) : (last_brow_ * bcol_));
-  uint64_t block_offset = brow_offset + bcol_offset;
-  uint64_t inblock_offset = r % brow_ * bcol_ + c % bcol_;
-
-  uint64_t index = block_offset + inblock_offset;
-  return index;
-}
-
-void PackInfoFp32(const marian::Shape& shape,
-                  const bool transpose,
-                  uint64_t& packsize) {
-  int nrow, ncol, kernel_ncol_blocks, brow = 512, bcol, last_brow, nbrow, nbcol;
-  PackInfoFp32(shape, transpose, nrow, ncol, kernel_ncol_blocks, brow, bcol, last_brow, nbrow, nbcol, packsize);
-}
-
-void PackInfoFp32(const marian::Shape& shape,
-                  const bool transpose,
-                  int& nrow,
-                  int& ncol,
-                  int& kernel_ncol_blocks,
-                  int& brow,
-                  int& bcol,
-                  int& last_brow,
-                  int& nbrow,
-                  int& nbcol,
-                  uint64_t& packsize) {
-  nrow = transpose ? shape[1] : shape[0];
-  ncol = transpose ? shape[0] : shape[1];
-  kernel_ncol_blocks = 2;
-  brow = 512;
-  bcol = 8 * kernel_ncol_blocks;
-  last_brow = nrow % brow == 0 ? brow : nrow % brow;
-  nbrow = nrow % brow == 0 ? nrow / brow : (nrow + brow) / brow;
-  nbcol = ncol % bcol == 0 ? ncol / bcol : (ncol + bcol) / bcol;
-  ABORT_IF(ncol % bcol != 0, "ncol (number of columns) should be multiple of 16. {}", ncol);
-  packsize = ((nbrow * brow) * (nbcol * bcol)) * sizeof(fbgemm::float16) + PACK16_PADDING
-             + PACK16_SPECIALMEM;
-}
-
-void PackFp32(marian::Tensor out,
-              const float* inData, // Packing is only available for 2D weight matrix in Marian. Otherwise, it's aborted in expanded_gemm.h.
-              const bool transpose,
-              const int nrow,
-              const int ncol,
-              const int kernel_ncol_blocks,
-              const int brow,
-              const int bcol,
-              const int last_brow,
-              const int nbrow,
-              const int nbcol,
-              const uint64_t packsize) {
-  // initialize memory
-  uint8_t* outmemorg = out->data<uint8_t>();
-  for(auto i = 0; i < packsize; i++) {
-    outmemorg[i] = 0;
-  }
-  // save the other auxiliary variables
-  uint64_t* auxmemsize = (uint64_t*)outmemorg;
-  auxmemsize[0] = packsize;
-  // save FBGEMM related parameters into the header of the allocated memory by marian
-  int32_t header[8];
-  header[0] = nrow;
-  header[1] = ncol;
-  header[2] = kernel_ncol_blocks;
-  header[3] = brow;
-  header[4] = bcol;
-  header[5] = last_brow;
-  header[6] = nbrow;
-  header[7] = nbcol;
-  memcpy(auxmemsize + 1, header, sizeof(header));
-  // cast to float16
-  fbgemm::float16* outmem = (fbgemm::float16*)(outmemorg + 256);
-  fbgemm::float16* dummy = new fbgemm::float16;
-  // pack the matrix
-  for(int i = 0; i < nrow; i++) {
-    for(int j = 0; j < ncol; j++) {
-      outmem[addr(i, j, brow, bcol, nbrow, nbcol, last_brow)]
-          = tconv(!transpose ? inData[i * ncol + j] : inData[i + nrow * j], *dummy);
-    }
-  }
-  delete dummy;
-}
-
-// GEMM operation on the packed B matrix
-// C: output matrix
-// A: A matrix
-// B: B matrix (packed)
-// m: the number of rows in A and C
-// n: the number of columns in B and C
-// transA: transpose of A matrix
-// B is already packed. So, we don't need transB
-void GemmPackFp32(marian::Tensor C,
-                  const marian::Tensor A,
-                  const marian::Tensor B,
-                  const marian::Tensor bias,
-                  const size_t m,
-                  const size_t n,
-                  const int transA) {
-  // row major
-  // keep the original mem
-  fbgemm::float16* pmat = packedPlaceholder.pmat_;
-  // retreive aux fields from the memory
-  uint64_t* packedmemSize = (uint64_t*)B->data();
-  packedPlaceholder.size_ = packedmemSize[0];
-  int32_t header[8];
-  memcpy(header, packedmemSize + 1, sizeof(header));
-  packedPlaceholder.nrow_ = header[0];
-  packedPlaceholder.ncol_ = header[1];
-  packedPlaceholder.kernel_ncol_blocks_ = header[2];
-  packedPlaceholder.brow_ = header[3];
-  packedPlaceholder.bcol_ = header[4];
-  packedPlaceholder.last_brow_ = header[5];
-  packedPlaceholder.nbrow_ = header[6];
-  packedPlaceholder.nbcol_ = header[7];
-
-  // packed matrix
-  packedPlaceholder.pmat_ = (fbgemm::float16*)(B->data<uint8_t>() + 256);
-
-  if(bias != nullptr) {
-#if MKL_FOUND
-    for(int i = 0; i < m; ++i) {
-      mkl_somatcopy('R', 'N', 1, n, 1, bias->data(), n, C->data() + n * i, n);
-    }
-#else
-    for(int i = 0; i < m; ++i) {
-      std::copy(bias->data(), bias->data() + n, C->data() + n * i);
-    }
-#endif
-  }
-
-#ifdef _OPENMP
-#pragma omp parallel
-#endif
-  {
-#ifdef _OPENMP
-    int num_threads = omp_get_num_threads();
-    int tid = omp_get_thread_num();
-#else
-    int num_threads = 1;
-    int tid = 0;
-#endif
-    fbgemm::cblas_gemm_compute(transA ? matrix_op_t::Transpose : matrix_op_t::NoTranspose,
-                      (int)m,
-                      A->data(),
-                      packedPlaceholder,
-                      bias != nullptr ? 1.0f : 0.0f,
-                      C->data(),
-                      tid,
-                      num_threads);
-  }
-
-  // return back the original mem
-  packedPlaceholder.pmat_ = pmat;
-}
-#else // USE_FBGEMM
-
-void PackInfoFp32(const marian::Shape& shape,
-                  const bool transpose,
-                  uint64_t& packsize) {
-  // does nothing. supports only FBGEMM based packed gemm at this moment.
-  ABORT("FBGEMM is needed to use packed GEMM.");
-}
-
-void PackInfoFp32(const marian::Shape& shape,
-                  const bool transpose,
-                  int& nrow,
-                  int& ncol,
-                  int& kernel_ncol_blocks,
-                  int& brow,
-                  int& bcol,
-                  int& last_brow,
-                  int& nbrow,
-                  int& nbcol,
-                  uint64_t& packsize) {
-  // does nothing. supports only FBGEMM based packed gemm at this moment.
-  ABORT("FBGEMM is needed to use packed GEMM.");
-}
-
-void PackFp32(marian::Tensor out,
-              const float* inData,
-              const bool transpose,
-              const int nrow,
-              const int ncol,
-              const int kernel_ncol_blocks,
-              const int brow,
-              const int bcol,
-              const int last_brow,
-              const int nbrow,
-              const int nbcol,
-              const uint64_t packsize) {
-                // does nothing. supports only FBGEMM based packed gemm at this moment.
-                ABORT("FBGEMM is needed to use packed GEMM.");
-}
-void GemmPackFp32(marian::Tensor C,
-                  const marian::Tensor A,
-                  const marian::Tensor B,
-                  const marian::Tensor bias,
-                  const size_t m,
-                  const size_t n,
-                  const int transA) {
-                // does nothing. supports only FBGEMM based packed gemm at this moment.
-                ABORT("FBGEMM is needed to use packed GEMM.");
-}
-#endif // USE_FBGEMM
-
-}  // namespace variant
-}  // namespace cpu
-}  // namespace marian
--- a/src/tensors/cpu/sharp/packed_gemm.h
+++ b/src/tensors/cpu/sharp/packed_gemm.h
@ -1,70 +0,0 @@
-#pragma once
-
-#include "tensors/tensor.h"
-
-namespace marian {
-namespace cpu {
-namespace variant { // Variants of GEMM implementations
-
-void PackInfoFp32(const marian::Shape& shape,
-                  const bool transpose,
-                  /*out*/uint64_t& packsize);
-
-void PackInfoFp32(const marian::Shape& shape,
-                  const bool transpose,
-                  int& nrow,
-                  int& ncol,
-                  int& kernel_ncol_blocks,
-                  int& brow,
-                  int& bcol,
-                  int& last_brow,
-                  int& nbrow,
-                  int& nbcol,
-                  /*out*/uint64_t& packsize); // @TODO: change to size_t where appropriate
-
-// Pack a matrix into cache utilization efficient way (block format)
-// out: output tensor - packed format
-// inData: input tensor data - pointer of float data
-// transpose: the matrix is transposed
-// nrow: the number of rows
-// ncol: the number of columns
-// kernel_ncol_blocks: the number of column blocks
-// brow: the number of rows in a block
-// bcol: the number of columns in a block
-// last_brow: the number of rows in the last block
-// nbrow: row index in a block
-// nbcol: column index in a block
-// packsize: the size of the packed matrix
-//          (the number of fp16 elements + padding (1024) + extra temporary memory (256))
-void PackFp32(marian::Tensor out,
-              const float* inData,
-              const bool transpose,
-              const int nrow,
-              const int ncol,
-              const int kernel_ncol_blocks,
-              const int brow,
-              const int bcol,
-              const int last_brow,
-              const int nbrow,
-              const int nbcol,
-              const uint64_t packsize); // @TODO: change to size_t where appropriate
-
-// GEMM operation on the packed B matrix
-// C: output matrix
-// A: A matrix
-// B: B matrix (packed)
-// m: the number of rows in A and C
-// n: the number of columns in B and C
-// transA: transpose of A matrix
-// B is already packed. So, we don't need transB
-void GemmPackFp32(marian::Tensor C,
-                  const marian::Tensor A,
-                  const marian::Tensor B,
-                  const marian::Tensor bias,
-                  const size_t m,
-                  const size_t n,
-                  const int transA = 0);
-
-}  // namespace variant
-}  // namespace cpu
-}  // namespace marian
--- a/vs/Marian.sln
+++ b/vs/Marian.sln
@ -1,7 +1,7 @@

 Microsoft Visual Studio Solution File, Format Version 12.00
 # Visual Studio 15
-VisualStudioVersion = 15.0.27703.2047
+VisualStudioVersion = 15.0.28307.902
 MinimumVisualStudioVersion = 10.0.40219.1
 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "Marian", "Marian.vcxproj", "{E2F320FE-0C01-4C80-810C-3A92205A29DC}"
 EndProject
@ -20,6 +20,6 @@ Global
 		HideSolutionNode = FALSE
 	EndGlobalSection
 	GlobalSection(ExtensibilityGlobals) = postSolution
-		SolutionGuid = {8CA1BE8F-87A9-4094-B549-E8C790F79D8C}
+		SolutionGuid = {3B922907-3384-4D39-9CEB-816BF7BB390D}
 	EndGlobalSection
 EndGlobal
--- a/vs/Marian.vcxproj
+++ b/vs/Marian.vcxproj
@ -43,14 +43,14 @@
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
    <LinkIncremental>true</LinkIncremental>
    <IntDir>$(SolutionDir)$(Platform)\$(Configuration)\Marian\</IntDir>
-    <IncludePath>$(CudaToolkitIncludeDir);..\src\3rd_party\fbgemm\third_party\cpuinfo\deps\clog\include;..\src\3rd_party\fbgemm\third_party\cpuinfo\src;..\src\3rd_party\fbgemm\third_party\cpuinfo\include;..\src\3rd_party\fbgemm\third_party\asmjit\src;%MKL_PATH%\include;..\src\3rd_party\fbgemm\include;..\src;..\src\3rd_party;%BOOST_INCLUDE_PATH%;%ZLIB_PATH%\include;$(VC_IncludePath);$(WindowsSDK_IncludePath)</IncludePath>
+    <IncludePath>$(CudaToolkitIncludeDir);..\src\3rd_party\fbgemm\third_party\googletest\googletest;..\src\3rd_party\fbgemm\third_party\googletest\googletest\include;..\src\3rd_party\fbgemm\third_party\cpuinfo\deps\clog\include;..\src\3rd_party\fbgemm\third_party\cpuinfo\src;..\src\3rd_party\fbgemm\third_party\cpuinfo\include;..\src\3rd_party\fbgemm;..\src\3rd_party\fbgemm\third_party\asmjit\src;%MKL_PATH%\include;..\src\3rd_party\fbgemm\include;..\src;..\src\3rd_party;%BOOST_INCLUDE_PATH%;%ZLIB_PATH%\include;$(VC_IncludePath);$(WindowsSDK_IncludePath)</IncludePath>
    <LibraryPath>$(CudaToolkitLibDir);%BOOST_LIB_PATH%;%ZLIB_PATH%\lib;%MKL_PATH%\lib\intel64;$(VC_LibraryPath_x64);$(WindowsSDK_LibraryPath_x64);$(NETFXKitsDir)Lib\um\x64</LibraryPath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
    <LinkIncremental>false</LinkIncremental>
    <ExecutablePath>$(ExecutablePath)</ExecutablePath>
    <IntDir>$(SolutionDir)$(Platform)\$(Configuration)\Marian\</IntDir>
-    <IncludePath>$(CudaToolkitIncludeDir);..\src\3rd_party\fbgemm\third_party\cpuinfo\deps\clog\include;..\src\3rd_party\fbgemm\third_party\cpuinfo\src;..\src\3rd_party\fbgemm\third_party\cpuinfo\include;..\src\3rd_party\fbgemm\third_party\asmjit\src;%MKL_PATH%\include;..\src\3rd_party\fbgemm\include;..\src;..\src\3rd_party;%BOOST_INCLUDE_PATH%;%ZLIB_PATH%\include;$(VC_IncludePath);$(WindowsSDK_IncludePath)</IncludePath>
+    <IncludePath>$(CudaToolkitIncludeDir);..\src\3rd_party\fbgemm\third_party\googletest\googletest;..\src\3rd_party\fbgemm\third_party\googletest\googletest\include;..\src\3rd_party\fbgemm\third_party\cpuinfo\deps\clog\include;..\src\3rd_party\fbgemm\third_party\cpuinfo\src;..\src\3rd_party\fbgemm\third_party\cpuinfo\include;..\src\3rd_party\fbgemm;..\src\3rd_party\fbgemm\third_party\asmjit\src;%MKL_PATH%\include;..\src\3rd_party\fbgemm\include;..\src;..\src\3rd_party;%BOOST_INCLUDE_PATH%;%ZLIB_PATH%\include;$(VC_IncludePath);$(WindowsSDK_IncludePath)</IncludePath>
    <LibraryPath>$(CudaToolkitLibDir);%BOOST_LIB_PATH%;%ZLIB_PATH%\lib;%MKL_PATH%\lib\intel64;$(VC_LibraryPath_x64);$(WindowsSDK_LibraryPath_x64);$(NETFXKitsDir)Lib\um\x64</LibraryPath>
  </PropertyGroup>
  <ItemDefinitionGroup>
@ -70,7 +70,7 @@
      </PrecompiledHeader>
      <WarningLevel>Level4</WarningLevel>
      <Optimization>Disabled</Optimization>
-      <PreprocessorDefinitions>BOOST_CONFIG_SUPPRESS_OUTDATED_MESSAGE; FBGEMM_EXPORTS; USE_FBGEMM=1; USE_SSE2=1; CUDA_FOUND=1; MKL_FOUND=1; MPI_FOUND=1; BLAS_FOUND=1; MKL_ILP64; WIN32;_DEBUG;_CONSOLE;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <PreprocessorDefinitions>USE_MKL;ASMJIT_EXPORTS;BOOST_CONFIG_SUPPRESS_OUTDATED_MESSAGE; FBGEMM_EXPORTS; USE_FBGEMM=1; USE_SSE2=1; CUDA_FOUND=1; MKL_FOUND=1; MPI_FOUND=1; BLAS_FOUND=1; MKL_ILP64; WIN32;_DEBUG;_CONSOLE;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
      <SDLCheck>false</SDLCheck>
      <TreatWarningAsError>true</TreatWarningAsError>
      <AdditionalOptions>/bigobj /arch:AVX %(AdditionalOptions)</AdditionalOptions>
@ -107,7 +107,7 @@
      <Optimization>MaxSpeed</Optimization>
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
-      <PreprocessorDefinitions>BOOST_CONFIG_SUPPRESS_OUTDATED_MESSAGE; FBGEMM_EXPORTS; USE_FBGEMM=1; USE_SSE2=1; CUDA_FOUND=1; MKL_FOUND=1; MPI_FOUND=1; BLAS_FOUND=1; MKL_ILP64; WIN32;NDEBUG;_CONSOLE;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <PreprocessorDefinitions>USE_MKL;ASMJIT_EXPORTS;BOOST_CONFIG_SUPPRESS_OUTDATED_MESSAGE; FBGEMM_EXPORTS; USE_FBGEMM=1; USE_SSE2=1; CUDA_FOUND=1; MKL_FOUND=1; MPI_FOUND=1; BLAS_FOUND=1; MKL_ILP64; WIN32;NDEBUG;_CONSOLE;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
      <SDLCheck>false</SDLCheck>
      <FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
      <AdditionalOptions>/d2Zi+ /bigobj /arch:AVX %(AdditionalOptions)</AdditionalOptions>
@ -141,6 +141,102 @@
  </ItemDefinitionGroup>
  <ItemGroup>
    <ClCompile Include="..\src\3rd_party\ExceptionWithCallStack.cpp" />
+    <ClCompile Include="..\src\3rd_party\fbgemm\bench\BenchUtils.cc">
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\bench\ConvUnifiedBenchmark.cc">
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\bench\Depthwise3DBenchmark.cc">
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\bench\DepthwiseBenchmark.cc">
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\bench\FP16Benchmark.cc">
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\bench\GEMMsBenchmark.cc">
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\bench\GEMMsTunableBenchmark.cc">
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\bench\GroupwiseConvRequantizeBenchmark.cc">
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\bench\I8SpmdmBenchmark.cc">
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\bench\Im2ColFusedRequantizeBenchmark.cc">
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\bench\PackedFloatInOutBenchmark.cc">
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\bench\PackedRequantizeAcc16Benchmark.cc">
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\bench\PackedRequantizeAcc32Benchmark.cc">
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\bench\RequantizeBenchmark.cc">
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\bench\RowOffsetBenchmark.cc">
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\src\codegen_fp16fp32.cc">
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
+    </ClCompile>
    <ClCompile Include="..\src\3rd_party\fbgemm\src\ExecuteKernel.cc">
      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</TreatWarningAsError>
      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</TreatWarningAsError>
@ -177,6 +273,12 @@
      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\src\FbgemmI8Depthwise3DAvx2.cc">
+      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
+      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
+    </ClCompile>
    <ClCompile Include="..\src\3rd_party\fbgemm\src\FbgemmI8DepthwiseAvx2.cc">
      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</TreatWarningAsError>
      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</TreatWarningAsError>
@ -201,6 +303,12 @@
      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\src\GenerateKernelU8S8S32ACC16Avx512VNNI.cc">
+      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
+      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
+    </ClCompile>
    <ClCompile Include="..\src\3rd_party\fbgemm\src\GenerateKernelU8S8S32ACC32.cc">
      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</TreatWarningAsError>
      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</TreatWarningAsError>
@ -213,6 +321,12 @@
      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\src\GenerateKernelU8S8S32ACC32Avx512VNNI.cc">
+      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
+      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
+    </ClCompile>
    <ClCompile Include="..\src\3rd_party\fbgemm\src\GroupwiseConvAcc32Avx2.cc">
      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</TreatWarningAsError>
      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</TreatWarningAsError>
@ -255,6 +369,12 @@
      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\src\PackDepthwiseConvMatrixAvx2.cc">
+      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
+      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
+    </ClCompile>
    <ClCompile Include="..\src\3rd_party\fbgemm\src\PackMatrix.cc">
      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</TreatWarningAsError>
      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</TreatWarningAsError>
@ -309,153 +429,253 @@
      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
    </ClCompile>
-    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\arch.cpp">
-      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
+    <ClCompile Include="..\src\3rd_party\fbgemm\test\FP16Test.cc">
      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
-      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
+      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
    </ClCompile>
-    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\assembler.cpp">
+    <ClCompile Include="..\src\3rd_party\fbgemm\test\GConvTest.cc">
+      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
+      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\test\I8DepthwiseTest.cc">
+      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
+      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\test\I8SpmdmTest.cc">
+      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
+      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\test\Im2ColFusedRequantizeTest.cc">
+      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
+      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\test\PackedRequantizeAcc16Test.cc">
+      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
+      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\test\PackedRequantizeTest.cc">
+      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
+      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\test\QuantizationHelpers.cc">
+      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
+      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\test\QuantUtilsTest.cc">
+      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
+      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\test\RequantizeOnlyTest.cc">
+      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
+      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\test\TestUtils.cc">
+      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
+      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\test\UniConvTest.cc">
+      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
+      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\arch.cpp">
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\assembler.cpp">
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\builder.cpp">
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\callconv.cpp">
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\codeholder.cpp">
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\compiler.cpp">
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\constpool.cpp">
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\cpuinfo.cpp">
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\emitter.cpp">
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\func.cpp">
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\globals.cpp">
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\inst.cpp">
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\jitallocator.cpp">
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\jitruntime.cpp">
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\logging.cpp">
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\operand.cpp">
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\osutils.cpp">
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\ralocal.cpp">
      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
    </ClCompile>
-    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\codebuilder.cpp">
+    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\rapass.cpp">
      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
    </ClCompile>
-    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\codecompiler.cpp">
-      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
-      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
+    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\rastack.cpp">
      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
    </ClCompile>
-    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\codeemitter.cpp">
-      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
-      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
+    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\string.cpp">
      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
    </ClCompile>
-    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\codeholder.cpp">
-      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
-      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
+    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\support.cpp">
      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
    </ClCompile>
-    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\constpool.cpp">
-      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
-      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
+    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\target.cpp">
      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
    </ClCompile>
-    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\cpuinfo.cpp">
-      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
-      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
+    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\type.cpp">
      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
    </ClCompile>
-    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\func.cpp">
-      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
-      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
+    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\virtmem.cpp">
      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
    </ClCompile>
-    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\globals.cpp">
-      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
-      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
+    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\zone.cpp">
      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
    </ClCompile>
-    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\inst.cpp">
-      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
-      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
+    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\zonehash.cpp">
      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
    </ClCompile>
-    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\logging.cpp">
-      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
-      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
+    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\zonelist.cpp">
      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
    </ClCompile>
-    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\operand.cpp">
-      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
-      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
+    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\zonestack.cpp">
      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
    </ClCompile>
-    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\osutils.cpp">
-      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
-      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
+    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\zonetree.cpp">
      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
    </ClCompile>
-    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\regalloc.cpp">
-      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
-      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
-      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
-      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
-    </ClCompile>
-    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\runtime.cpp">
-      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
-      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
-      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
-      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
-    </ClCompile>
-    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\string.cpp">
-      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
-      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
-      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
-      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
-    </ClCompile>
-    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\utils.cpp">
-      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
-      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
-      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
-      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
-    </ClCompile>
-    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\vmem.cpp">
-      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
-      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
-      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
-      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
-    </ClCompile>
-    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\zone.cpp">
-      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
-      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
+    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\zonevector.cpp">
      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
    </ClCompile>
    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86assembler.cpp">
-      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
-      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
    </ClCompile>
    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86builder.cpp">
-      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
-      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86callconv.cpp">
      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
    </ClCompile>
    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86compiler.cpp">
-      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
-      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
    </ClCompile>
-    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86inst.cpp">
-      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
-      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
+    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86features.cpp">
      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
    </ClCompile>
-    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86instimpl.cpp">
-      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
-      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
+    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86instapi.cpp">
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
+      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86instdb.cpp">
      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
    </ClCompile>
@ -472,20 +692,10 @@
      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
    </ClCompile>
    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86operand.cpp">
-      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
-      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
    </ClCompile>
-    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86operand_regs.cpp">
-      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
-      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
-      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
-      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
-    </ClCompile>
-    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86regalloc.cpp">
-      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
-      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
+    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86rapass.cpp">
      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
    </ClCompile>
@ -579,6 +789,11 @@
      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
      <WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\googletest\googletest\src\gtest-all.cc" />
+    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\googletest\googletest\src\gtest_main.cc">
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+    </ClCompile>
    <ClCompile Include="..\src\3rd_party\pathie-cpp\src\entry_iterator.cpp" />
    <ClCompile Include="..\src\3rd_party\pathie-cpp\src\errors.cpp" />
    <ClCompile Include="..\src\3rd_party\pathie-cpp\src\path.cpp" />
@ -795,6 +1010,8 @@
    <ClInclude Include="..\src\3rd_party\any_type.h" />
    <ClInclude Include="..\src\3rd_party\avx_mathfun.h" />
    <ClInclude Include="..\src\3rd_party\ExceptionWithCallStack.h" />
+    <ClInclude Include="..\src\3rd_party\fbgemm\bench\AlignedVec.h" />
+    <ClInclude Include="..\src\3rd_party\fbgemm\bench\BenchUtils.h" />
    <ClInclude Include="..\src\3rd_party\fbgemm\include\fbgemm\ConvUtils.h" />
    <ClInclude Include="..\src\3rd_party\fbgemm\include\fbgemm\Fbgemm.h" />
    <ClInclude Include="..\src\3rd_party\fbgemm\include\fbgemm\FbgemmBuild.h" />
@ -808,56 +1025,77 @@
    <ClInclude Include="..\src\3rd_party\fbgemm\include\fbgemm\Types.h" />
    <ClInclude Include="..\src\3rd_party\fbgemm\include\fbgemm\Utils.h" />
    <ClInclude Include="..\src\3rd_party\fbgemm\include\fbgemm\UtilsAvx2.h" />
+    <ClInclude Include="..\src\3rd_party\fbgemm\src\CodeCache.h" />
    <ClInclude Include="..\src\3rd_party\fbgemm\src\ExecuteKernel.h" />
    <ClInclude Include="..\src\3rd_party\fbgemm\src\ExecuteKernelGeneric.h" />
    <ClInclude Include="..\src\3rd_party\fbgemm\src\ExecuteKernelU8S8.h" />
    <ClInclude Include="..\src\3rd_party\fbgemm\src\FbgemmFP16UKernelsAvx2.h" />
+    <ClInclude Include="..\src\3rd_party\fbgemm\src\FbgemmI8DepthwiseAvx2-inl.h" />
    <ClInclude Include="..\src\3rd_party\fbgemm\src\GenerateKernel.h" />
    <ClInclude Include="..\src\3rd_party\fbgemm\src\GroupwiseConv.h" />
    <ClInclude Include="..\src\3rd_party\fbgemm\src\OptimizedKernelsAvx2.h" />
    <ClInclude Include="..\src\3rd_party\fbgemm\src\RefImplementations.h" />
    <ClInclude Include="..\src\3rd_party\fbgemm\src\TransposeUtils.h" />
-    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\arm.h" />
+    <ClInclude Include="..\src\3rd_party\fbgemm\test\QuantizationHelpers.h" />
+    <ClInclude Include="..\src\3rd_party\fbgemm\test\TestUtils.h" />
    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\asmjit.h" />
-    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\asmjit_apibegin.h" />
-    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\asmjit_apiend.h" />
-    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\asmjit_build.h" />
-    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base.h" />
-    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\arch.h" />
-    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\assembler.h" />
-    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\codebuilder.h" />
-    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\codecompiler.h" />
-    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\codeemitter.h" />
-    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\codeholder.h" />
-    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\constpool.h" />
-    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\cpuinfo.h" />
-    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\func.h" />
-    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\globals.h" />
-    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\inst.h" />
-    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\logging.h" />
-    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\misc_p.h" />
-    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\operand.h" />
-    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\osutils.h" />
-    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\regalloc_p.h" />
-    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\runtime.h" />
-    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\simdtypes.h" />
-    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\string.h" />
-    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\utils.h" />
-    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\vmem.h" />
-    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\zone.h" />
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core.h" />
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\arch.h" />
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\assembler.h" />
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\build.h" />
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\builder.h" />
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\callconv.h" />
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\codebufferwriter_p.h" />
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\codeholder.h" />
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\compiler.h" />
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\constpool.h" />
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\cpuinfo.h" />
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\datatypes.h" />
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\emitter.h" />
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\features.h" />
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\func.h" />
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\globals.h" />
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\inst.h" />
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\jitallocator.h" />
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\jitruntime.h" />
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\logging.h" />
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\misc_p.h" />
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\operand.h" />
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\osutils.h" />
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\raassignment_p.h" />
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\rabuilders_p.h" />
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\radefs_p.h" />
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\ralocal_p.h" />
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\rapass_p.h" />
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\rastack_p.h" />
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\string.h" />
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\support.h" />
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\target.h" />
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\type.h" />
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\virtmem.h" />
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\zone.h" />
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\zonehash.h" />
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\zonelist.h" />
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\zonestack.h" />
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\zonestring.h" />
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\zonetree.h" />
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\zonevector.h" />
    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86.h" />
    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86assembler.h" />
    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86builder.h" />
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86callconv_p.h" />
    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86compiler.h" />
    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86emitter.h" />
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86features.h" />
    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86globals.h" />
-    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86inst.h" />
-    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86instimpl_p.h" />
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86instapi_p.h" />
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86instdb.h" />
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86instdb_p.h" />
    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86internal_p.h" />
    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86logging_p.h" />
-    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86misc.h" />
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86opcode_p.h" />
    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86operand.h" />
-    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86regalloc_p.h" />
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86rapass_p.h" />
    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\cpuinfo\deps\clog\include\clog.h" />
    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\cpuinfo\include\cpuinfo-mock.h" />
    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\cpuinfo\include\cpuinfo.h" />
@ -868,6 +1106,7 @@
    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\cpuinfo\src\x86\api.h" />
    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\cpuinfo\src\x86\cpuid.h" />
    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\cpuinfo\src\x86\windows\api.h" />
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\googletest\googletest\src\gtest-internal-inl.h" />
    <ClInclude Include="..\src\3rd_party\half_float\umHalf.h" />
    <ClInclude Include="..\src\3rd_party\nccl\src\collectives\collectives.h">
      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
@ -1142,13 +1381,10 @@
    <ClCompile Include="..\src\rescorer\score_collector.cpp" />
    <ClCompile Include="..\src\tensors\backend.cpp" />
    <ClCompile Include="..\src\tensors\cpu\device.cpp" />
+    <ClCompile Include="..\src\tensors\cpu\fbgemm\packed_gemm.cpp" />
    <ClCompile Include="..\src\tensors\cpu\prod.cpp" />
    <ClCompile Include="..\src\tensors\cpu\sharp\avx_gemm.cpp" />
    <ClCompile Include="..\src\tensors\cpu\sharp\int_gemm.cpp" />
-    <ClCompile Include="..\src\tensors\cpu\sharp\packed_gemm.cpp">
-      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
-      <TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
-    </ClCompile>
    <ClCompile Include="..\src\tensors\cpu\sharp\sse_gemm.cpp" />
    <ClCompile Include="..\src\tensors\cpu\tensor_operators.cpp" />
    <ClCompile Include="..\src\graph\expression_graph.cpp" />
@ -1274,7 +1510,6 @@
    <ClInclude Include="..\src\examples\mnist\validator.h" />
    <ClInclude Include="..\src\functional\approx.h" />
    <ClInclude Include="..\src\functional\operators.h" />
-    <ClInclude Include="..\src\graph\expression_graph_packable.h" />
    <ClInclude Include="..\src\layers\loss.h" />
    <ClInclude Include="..\src\layers\weight.h" />
    <ClInclude Include="..\src\marian.h" />
@ -1494,9 +1729,10 @@
    <ClInclude Include="..\src\rnn\types.h" />
    <ClInclude Include="..\src\tensors\allocator.h" />
    <ClInclude Include="..\src\tensors\backend.h" />
-    <ClInclude Include="..\src\tensors\cpu\expanded_gemm.h" />
+    <ClInclude Include="..\src\tensors\cpu\fbgemm\expanded_gemm.h" />
+    <ClInclude Include="..\src\tensors\cpu\fbgemm\expression_graph_packable.h" />
+    <ClInclude Include="..\src\tensors\cpu\fbgemm\packed_gemm.h" />
    <ClInclude Include="..\src\tensors\cpu\sharp\int_gemm.h" />
-    <ClInclude Include="..\src\tensors\cpu\sharp\packed_gemm.h" />
    <ClInclude Include="..\src\tensors\device.h" />
    <ClInclude Include="..\src\tensors\dispatch.h" />
    <ClInclude Include="..\src\tensors\gpu\add.h" />
--- a/vs/Marian.vcxproj.filters
+++ b/vs/Marian.vcxproj.filters
@ -490,9 +490,6 @@
    <ClCompile Include="..\src\tensors\gpu\prod.cpp">
      <Filter>tensors\gpu</Filter>
    </ClCompile>
-    <ClCompile Include="..\src\tensors\cpu\sharp\packed_gemm.cpp">
-      <Filter>tensors\cpu\sharp</Filter>
-    </ClCompile>
    <ClCompile Include="..\src\3rd_party\fbgemm\src\ExecuteKernel.cc">
      <Filter>3rd_party\fbgemm\src</Filter>
    </ClCompile>
@ -616,19 +613,127 @@
    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\cpuinfo\src\x86\cache\init.c">
      <Filter>3rd_party\fbgemm\third_party\cpuinfo\src\x86\cacehe</Filter>
    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\cpuinfo\deps\clog\src\clog.c">
+      <Filter>3rd_party\fbgemm\third_party\cpuinfo\deps\clog\src</Filter>
+    </ClCompile>
+    <ClCompile Include="..\src\common\aliases.cpp">
+      <Filter>common</Filter>
+    </ClCompile>
+    <ClCompile Include="..\src\common\filesystem.cpp">
+      <Filter>common</Filter>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\arch.cpp">
+      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\assembler.cpp">
+      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\builder.cpp">
+      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\callconv.cpp">
+      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\codeholder.cpp">
+      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\compiler.cpp">
+      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\constpool.cpp">
+      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\cpuinfo.cpp">
+      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\emitter.cpp">
+      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\func.cpp">
+      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\globals.cpp">
+      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\inst.cpp">
+      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\jitallocator.cpp">
+      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\jitruntime.cpp">
+      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\logging.cpp">
+      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\operand.cpp">
+      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\osutils.cpp">
+      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\ralocal.cpp">
+      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\rapass.cpp">
+      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\rastack.cpp">
+      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\string.cpp">
+      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\support.cpp">
+      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\target.cpp">
+      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\type.cpp">
+      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\virtmem.cpp">
+      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\zone.cpp">
+      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\zonehash.cpp">
+      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\zonelist.cpp">
+      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\zonestack.cpp">
+      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\zonetree.cpp">
+      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\zonevector.cpp">
+      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
+    </ClCompile>
    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86assembler.cpp">
      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86</Filter>
    </ClCompile>
    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86builder.cpp">
      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86</Filter>
    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86callconv.cpp">
+      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86</Filter>
+    </ClCompile>
    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86compiler.cpp">
      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86</Filter>
    </ClCompile>
-    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86inst.cpp">
+    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86features.cpp">
      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86</Filter>
    </ClCompile>
-    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86instimpl.cpp">
+    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86instapi.cpp">
+      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86</Filter>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86instdb.cpp">
      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86</Filter>
    </ClCompile>
    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86internal.cpp">
@ -640,74 +745,110 @@
    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86operand.cpp">
      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86</Filter>
    </ClCompile>
-    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86operand_regs.cpp">
+    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86rapass.cpp">
      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86</Filter>
    </ClCompile>
-    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86regalloc.cpp">
-      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86</Filter>
+    <ClCompile Include="..\src\3rd_party\fbgemm\src\codegen_fp16fp32.cc">
+      <Filter>3rd_party\fbgemm\src</Filter>
    </ClCompile>
-    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\arch.cpp">
-      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
+    <ClCompile Include="..\src\3rd_party\fbgemm\src\FbgemmI8Depthwise3DAvx2.cc">
+      <Filter>3rd_party\fbgemm\src</Filter>
    </ClCompile>
-    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\assembler.cpp">
-      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
+    <ClCompile Include="..\src\3rd_party\fbgemm\src\GenerateKernelU8S8S32ACC16Avx512VNNI.cc">
+      <Filter>3rd_party\fbgemm\src</Filter>
    </ClCompile>
-    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\codebuilder.cpp">
-      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
+    <ClCompile Include="..\src\3rd_party\fbgemm\src\GenerateKernelU8S8S32ACC32Avx512VNNI.cc">
+      <Filter>3rd_party\fbgemm\src</Filter>
    </ClCompile>
-    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\codecompiler.cpp">
-      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
+    <ClCompile Include="..\src\3rd_party\fbgemm\src\PackDepthwiseConvMatrixAvx2.cc">
+      <Filter>3rd_party\fbgemm\src</Filter>
    </ClCompile>
-    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\codeemitter.cpp">
-      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
+    <ClCompile Include="..\src\3rd_party\fbgemm\test\FP16Test.cc">
+      <Filter>3rd_party\fbgemm\test</Filter>
    </ClCompile>
-    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\codeholder.cpp">
-      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
+    <ClCompile Include="..\src\3rd_party\fbgemm\test\GConvTest.cc">
+      <Filter>3rd_party\fbgemm\test</Filter>
    </ClCompile>
-    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\constpool.cpp">
-      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
+    <ClCompile Include="..\src\3rd_party\fbgemm\test\I8DepthwiseTest.cc">
+      <Filter>3rd_party\fbgemm\test</Filter>
    </ClCompile>
-    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\cpuinfo.cpp">
-      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
+    <ClCompile Include="..\src\3rd_party\fbgemm\test\I8SpmdmTest.cc">
+      <Filter>3rd_party\fbgemm\test</Filter>
    </ClCompile>
-    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\func.cpp">
-      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
+    <ClCompile Include="..\src\3rd_party\fbgemm\test\Im2ColFusedRequantizeTest.cc">
+      <Filter>3rd_party\fbgemm\test</Filter>
    </ClCompile>
-    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\globals.cpp">
-      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
+    <ClCompile Include="..\src\3rd_party\fbgemm\test\PackedRequantizeAcc16Test.cc">
+      <Filter>3rd_party\fbgemm\test</Filter>
    </ClCompile>
-    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\inst.cpp">
-      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
+    <ClCompile Include="..\src\3rd_party\fbgemm\test\PackedRequantizeTest.cc">
+      <Filter>3rd_party\fbgemm\test</Filter>
    </ClCompile>
-    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\logging.cpp">
-      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
+    <ClCompile Include="..\src\3rd_party\fbgemm\test\QuantizationHelpers.cc">
+      <Filter>3rd_party\fbgemm\test</Filter>
    </ClCompile>
-    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\operand.cpp">
-      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
+    <ClCompile Include="..\src\3rd_party\fbgemm\test\QuantUtilsTest.cc">
+      <Filter>3rd_party\fbgemm\test</Filter>
    </ClCompile>
-    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\osutils.cpp">
-      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
+    <ClCompile Include="..\src\3rd_party\fbgemm\test\RequantizeOnlyTest.cc">
+      <Filter>3rd_party\fbgemm\test</Filter>
    </ClCompile>
-    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\regalloc.cpp">
-      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
+    <ClCompile Include="..\src\3rd_party\fbgemm\test\TestUtils.cc">
+      <Filter>3rd_party\fbgemm\test</Filter>
    </ClCompile>
-    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\runtime.cpp">
-      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
+    <ClCompile Include="..\src\3rd_party\fbgemm\test\UniConvTest.cc">
+      <Filter>3rd_party\fbgemm\test</Filter>
    </ClCompile>
-    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\string.cpp">
-      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
+    <ClCompile Include="..\src\3rd_party\fbgemm\bench\BenchUtils.cc">
+      <Filter>3rd_party\fbgemm\bench</Filter>
    </ClCompile>
-    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\utils.cpp">
-      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
+    <ClCompile Include="..\src\3rd_party\fbgemm\bench\ConvUnifiedBenchmark.cc">
+      <Filter>3rd_party\fbgemm\bench</Filter>
    </ClCompile>
-    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\vmem.cpp">
-      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
+    <ClCompile Include="..\src\3rd_party\fbgemm\bench\Depthwise3DBenchmark.cc">
+      <Filter>3rd_party\fbgemm\bench</Filter>
    </ClCompile>
-    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\zone.cpp">
-      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
+    <ClCompile Include="..\src\3rd_party\fbgemm\bench\DepthwiseBenchmark.cc">
+      <Filter>3rd_party\fbgemm\bench</Filter>
    </ClCompile>
-    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\cpuinfo\deps\clog\src\clog.c">
-      <Filter>3rd_party\fbgemm\third_party\cpuinfo\deps\clog\src</Filter>
+    <ClCompile Include="..\src\3rd_party\fbgemm\bench\FP16Benchmark.cc">
+      <Filter>3rd_party\fbgemm\bench</Filter>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\bench\GEMMsBenchmark.cc">
+      <Filter>3rd_party\fbgemm\bench</Filter>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\bench\GEMMsTunableBenchmark.cc">
+      <Filter>3rd_party\fbgemm\bench</Filter>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\bench\GroupwiseConvRequantizeBenchmark.cc">
+      <Filter>3rd_party\fbgemm\bench</Filter>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\bench\I8SpmdmBenchmark.cc">
+      <Filter>3rd_party\fbgemm\bench</Filter>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\bench\Im2ColFusedRequantizeBenchmark.cc">
+      <Filter>3rd_party\fbgemm\bench</Filter>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\bench\PackedFloatInOutBenchmark.cc">
+      <Filter>3rd_party\fbgemm\bench</Filter>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\bench\PackedRequantizeAcc16Benchmark.cc">
+      <Filter>3rd_party\fbgemm\bench</Filter>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\bench\PackedRequantizeAcc32Benchmark.cc">
+      <Filter>3rd_party\fbgemm\bench</Filter>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\bench\RequantizeBenchmark.cc">
+      <Filter>3rd_party\fbgemm\bench</Filter>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\bench\RowOffsetBenchmark.cc">
+      <Filter>3rd_party\fbgemm\bench</Filter>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\googletest\googletest\src\gtest_main.cc">
+      <Filter>3rd_party\fbgemm\third_party\googletest\googletest\src</Filter>
+    </ClCompile>
+    <ClCompile Include="..\src\3rd_party\fbgemm\third_party\googletest\googletest\src\gtest-all.cc">
+      <Filter>3rd_party\fbgemm\third_party\googletest\googletest\src</Filter>
    </ClCompile>
    <ClCompile Include="..\src\common\aliases.cpp">
      <Filter>common</Filter>
@ -739,6 +880,9 @@
    <ClCompile Include="..\src\3rd_party\phf\phf.cc">
      <Filter>3rd_party\phf</Filter>
    </ClCompile>
+    <ClCompile Include="..\src\tensors\cpu\fbgemm\packed_gemm.cpp">
+      <Filter>tensors\cpu\fbgemm</Filter>
+    </ClCompile>
  </ItemGroup>
  <ItemGroup>
    <ClInclude Include="..\src\marian.h" />
@ -1804,12 +1948,6 @@
    <ClInclude Include="..\src\tensors\gpu\add.inc">
      <Filter>tensors\gpu</Filter>
    </ClInclude>
-    <ClInclude Include="..\src\tensors\cpu\expanded_gemm.h">
-      <Filter>tensors\cpu</Filter>
-    </ClInclude>
-    <ClInclude Include="..\src\tensors\cpu\sharp\packed_gemm.h">
-      <Filter>tensors\cpu\sharp</Filter>
-    </ClInclude>
    <ClInclude Include="..\src\3rd_party\fbgemm\include\fbgemm\ConvUtils.h">
      <Filter>3rd_party\fbgemm\include\fbgemm</Filter>
    </ClInclude>
@ -1903,46 +2041,163 @@
    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\cpuinfo\src\cpuinfo\utils.h">
      <Filter>3rd_party\fbgemm\third_party\cpuinfo\src\cpuinfo</Filter>
    </ClInclude>
-    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\arm.h">
-      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit</Filter>
-    </ClInclude>
    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\asmjit.h">
      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit</Filter>
    </ClInclude>
-    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\asmjit_apibegin.h">
-      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit</Filter>
-    </ClInclude>
-    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\asmjit_apiend.h">
-      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit</Filter>
-    </ClInclude>
-    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\asmjit_build.h">
-      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit</Filter>
-    </ClInclude>
-    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base.h">
-      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit</Filter>
-    </ClInclude>
    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86.h">
      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit</Filter>
    </ClInclude>
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\cpuinfo\deps\clog\include\clog.h">
+      <Filter>3rd_party\fbgemm\third_party\cpuinfo\deps\clog\include</Filter>
+    </ClInclude>
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\arch.h">
+      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
+    </ClInclude>
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\assembler.h">
+      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
+    </ClInclude>
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\build.h">
+      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
+    </ClInclude>
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\builder.h">
+      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
+    </ClInclude>
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\callconv.h">
+      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
+    </ClInclude>
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\codebufferwriter_p.h">
+      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
+    </ClInclude>
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\codeholder.h">
+      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
+    </ClInclude>
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\compiler.h">
+      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
+    </ClInclude>
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\constpool.h">
+      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
+    </ClInclude>
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\cpuinfo.h">
+      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
+    </ClInclude>
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\datatypes.h">
+      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
+    </ClInclude>
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\emitter.h">
+      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
+    </ClInclude>
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\features.h">
+      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
+    </ClInclude>
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\func.h">
+      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
+    </ClInclude>
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\globals.h">
+      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
+    </ClInclude>
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\inst.h">
+      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
+    </ClInclude>
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\jitallocator.h">
+      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
+    </ClInclude>
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\jitruntime.h">
+      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
+    </ClInclude>
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\logging.h">
+      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
+    </ClInclude>
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\misc_p.h">
+      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
+    </ClInclude>
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\operand.h">
+      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
+    </ClInclude>
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\osutils.h">
+      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
+    </ClInclude>
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\raassignment_p.h">
+      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
+    </ClInclude>
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\rabuilders_p.h">
+      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
+    </ClInclude>
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\radefs_p.h">
+      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
+    </ClInclude>
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\ralocal_p.h">
+      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
+    </ClInclude>
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\rapass_p.h">
+      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
+    </ClInclude>
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\rastack_p.h">
+      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
+    </ClInclude>
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\string.h">
+      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
+    </ClInclude>
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\support.h">
+      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
+    </ClInclude>
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\target.h">
+      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
+    </ClInclude>
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\type.h">
+      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
+    </ClInclude>
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\virtmem.h">
+      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
+    </ClInclude>
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\zone.h">
+      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
+    </ClInclude>
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\zonehash.h">
+      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
+    </ClInclude>
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\zonelist.h">
+      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
+    </ClInclude>
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\zonestack.h">
+      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
+    </ClInclude>
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\zonestring.h">
+      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
+    </ClInclude>
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\zonetree.h">
+      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
+    </ClInclude>
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\zonevector.h">
+      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
+    </ClInclude>
    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86assembler.h">
      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86</Filter>
    </ClInclude>
    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86builder.h">
      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86</Filter>
    </ClInclude>
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86callconv_p.h">
+      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86</Filter>
+    </ClInclude>
    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86compiler.h">
      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86</Filter>
    </ClInclude>
    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86emitter.h">
      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86</Filter>
    </ClInclude>
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86features.h">
+      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86</Filter>
+    </ClInclude>
    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86globals.h">
      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86</Filter>
    </ClInclude>
-    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86inst.h">
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86instapi_p.h">
      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86</Filter>
    </ClInclude>
-    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86instimpl_p.h">
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86instdb.h">
+      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86</Filter>
+    </ClInclude>
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86instdb_p.h">
      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86</Filter>
    </ClInclude>
    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86internal_p.h">
@ -1951,83 +2206,35 @@
    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86logging_p.h">
      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86</Filter>
    </ClInclude>
-    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86misc.h">
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86opcode_p.h">
      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86</Filter>
    </ClInclude>
    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86operand.h">
      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86</Filter>
    </ClInclude>
-    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86regalloc_p.h">
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86rapass_p.h">
      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86</Filter>
    </ClInclude>
-    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\arch.h">
-      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
+    <ClInclude Include="..\src\3rd_party\fbgemm\src\CodeCache.h">
+      <Filter>3rd_party\fbgemm\src</Filter>
    </ClInclude>
-    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\assembler.h">
-      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
+    <ClInclude Include="..\src\3rd_party\fbgemm\src\FbgemmI8DepthwiseAvx2-inl.h">
+      <Filter>3rd_party\fbgemm\src</Filter>
    </ClInclude>
-    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\codebuilder.h">
-      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
+    <ClInclude Include="..\src\3rd_party\fbgemm\test\QuantizationHelpers.h">
+      <Filter>3rd_party\fbgemm\test</Filter>
    </ClInclude>
-    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\codecompiler.h">
-      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
+    <ClInclude Include="..\src\3rd_party\fbgemm\test\TestUtils.h">
+      <Filter>3rd_party\fbgemm\test</Filter>
    </ClInclude>
-    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\codeemitter.h">
-      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
+    <ClInclude Include="..\src\3rd_party\fbgemm\bench\AlignedVec.h">
+      <Filter>3rd_party\fbgemm\bench</Filter>
    </ClInclude>
-    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\codeholder.h">
-      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
+    <ClInclude Include="..\src\3rd_party\fbgemm\bench\BenchUtils.h">
+      <Filter>3rd_party\fbgemm\bench</Filter>
    </ClInclude>
-    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\constpool.h">
-      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
-    </ClInclude>
-    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\cpuinfo.h">
-      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
-    </ClInclude>
-    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\func.h">
-      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
-    </ClInclude>
-    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\globals.h">
-      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
-    </ClInclude>
-    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\inst.h">
-      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
-    </ClInclude>
-    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\logging.h">
-      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
-    </ClInclude>
-    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\misc_p.h">
-      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
-    </ClInclude>
-    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\operand.h">
-      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
-    </ClInclude>
-    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\osutils.h">
-      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
-    </ClInclude>
-    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\regalloc_p.h">
-      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
-    </ClInclude>
-    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\runtime.h">
-      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
-    </ClInclude>
-    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\simdtypes.h">
-      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
-    </ClInclude>
-    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\string.h">
-      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
-    </ClInclude>
-    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\utils.h">
-      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
-    </ClInclude>
-    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\vmem.h">
-      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
-    </ClInclude>
-    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\zone.h">
-      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
-    </ClInclude>
-    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\cpuinfo\deps\clog\include\clog.h">
-      <Filter>3rd_party\fbgemm\third_party\cpuinfo\deps\clog\include</Filter>
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\googletest\googletest\src\gtest-internal-inl.h">
+      <Filter>3rd_party\fbgemm\third_party\googletest\googletest\src</Filter>
    </ClInclude>
    <ClInclude Include="..\src\3rd_party\half_float\umHalf.h">
      <Filter>3rd_party\half_float</Filter>
@ -2053,15 +2260,24 @@
    <ClInclude Include="..\src\3rd_party\zstr\zstr.hpp">
      <Filter>3rd_party</Filter>
    </ClInclude>
-    <ClInclude Include="..\src\graph\expression_graph_packable.h">
-      <Filter>graph</Filter>
-    </ClInclude>
    <ClInclude Include="..\src\common\fastopt.h">
      <Filter>common</Filter>
    </ClInclude>
    <ClInclude Include="..\src\3rd_party\phf\phf.h">
      <Filter>3rd_party\phf</Filter>
    </ClInclude>
+    <ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core.h">
+      <Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit</Filter>
+    </ClInclude>
+    <ClInclude Include="..\src\tensors\cpu\fbgemm\expanded_gemm.h">
+      <Filter>tensors\cpu\fbgemm</Filter>
+    </ClInclude>
+    <ClInclude Include="..\src\tensors\cpu\fbgemm\expression_graph_packable.h">
+      <Filter>tensors\cpu\fbgemm</Filter>
+    </ClInclude>
+    <ClInclude Include="..\src\tensors\cpu\fbgemm\packed_gemm.h">
+      <Filter>tensors\cpu\fbgemm</Filter>
+    </ClInclude>
  </ItemGroup>
  <ItemGroup>
    <Filter Include="3rd_party">
@ -2277,9 +2493,6 @@
    <Filter Include="3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86">
      <UniqueIdentifier>{5818c959-7963-4d8e-9e87-b61f340476c2}</UniqueIdentifier>
    </Filter>
-    <Filter Include="3rd_party\fbgemm\third_party\asmjit\src\asmjit\base">
-      <UniqueIdentifier>{15414ec0-8761-4068-afef-822b7bed88df}</UniqueIdentifier>
-    </Filter>
    <Filter Include="3rd_party\fbgemm\third_party\cpuinfo\deps">
      <UniqueIdentifier>{d4505c8d-5e6e-4baf-8525-dc59ae8b6415}</UniqueIdentifier>
    </Filter>
@ -2292,12 +2505,33 @@
    <Filter Include="3rd_party\fbgemm\third_party\cpuinfo\deps\clog\src">
      <UniqueIdentifier>{8fd74b1e-d3c1-4158-ad46-4a447222934e}</UniqueIdentifier>
    </Filter>
+    <Filter Include="3rd_party\fbgemm\third_party\asmjit\src\asmjit\core">
+      <UniqueIdentifier>{b3b34c5f-5b98-436a-b34c-11e2dccb7ea2}</UniqueIdentifier>
+    </Filter>
+    <Filter Include="3rd_party\fbgemm\test">
+      <UniqueIdentifier>{40576dca-07d5-4904-8119-ffbc982451a3}</UniqueIdentifier>
+    </Filter>
+    <Filter Include="3rd_party\fbgemm\bench">
+      <UniqueIdentifier>{9f11c8f1-78f7-47c6-9eac-34cd2c6cd909}</UniqueIdentifier>
+    </Filter>
+    <Filter Include="3rd_party\fbgemm\third_party\googletest">
+      <UniqueIdentifier>{75f9df88-0eb1-4d9a-858e-4e0b8fc3aa8a}</UniqueIdentifier>
+    </Filter>
+    <Filter Include="3rd_party\fbgemm\third_party\googletest\googletest">
+      <UniqueIdentifier>{9f77e916-1d2f-4c15-9eba-46bcbddd2658}</UniqueIdentifier>
+    </Filter>
+    <Filter Include="3rd_party\fbgemm\third_party\googletest\googletest\src">
+      <UniqueIdentifier>{050ba410-c56a-4607-8401-935f58f598b5}</UniqueIdentifier>
+    </Filter>
    <Filter Include="3rd_party\half_float">
      <UniqueIdentifier>{defd3aec-3c56-4d70-a4bb-90ba9003d98d}</UniqueIdentifier>
    </Filter>
    <Filter Include="3rd_party\phf">
      <UniqueIdentifier>{352ac0e9-daed-437a-bc36-fb85ecd037eb}</UniqueIdentifier>
    </Filter>
+    <Filter Include="tensors\cpu\fbgemm">
+      <UniqueIdentifier>{bf361868-f451-45b8-9695-570d67924972}</UniqueIdentifier>
+    </Filter>
  </ItemGroup>
  <ItemGroup>
    <None Include="..\src\3rd_party\nccl\src\bootstrap.cu">