Merged PR 10797: Differentiate packed8 type by layout

For the FBGEMM-based int8 implementation, the packed matrix (model) format can differ depending on which AVX instruction set is available. This PR splits the packed8 format into two separate data formats (packed8avx2, packed8avx512). This also enables any packed model to be generated on any machine.
* Added packed8avx2 and packed8avx512 types, removed the packed8 type
* Added blocking factors, chosen by pack type, to the fbgemm interface for the pack and gemm functions.
Martin Junczys-Dowmunt 2019-12-23 20:04:13 +00:00
parent f882f27c09
commit 0dc1ef11d3
8 changed files with 173 additions and 58 deletions
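
The practical effect is to decouple packing from execution: a model can be packed for either layout on any machine, while at run time the layout must match the host CPU (see the guard added to fbgemmPacked8Gemm below). A minimal sketch of how a caller might pick the executable layout, using the FBGEMM capability check that appears later in this PR (the helper name is hypothetical):

// Pick the int8 packed layout that the current CPU can execute.
// Packing itself can target either layout on any machine.
inline marian::Type pickPacked8Type() {
  return fbgemmHasAvx512Support() ? marian::Type::packed8avx512
                                  : marian::Type::packed8avx2;
}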


@@ -33,10 +33,12 @@ int main(int argc, char** argv) {
Type saveGemmType;
if(saveGemmTypeStr == "float32") {
saveGemmType = Type::float32;
} else if(saveGemmTypeStr == "packed16") {
} else if(saveGemmTypeStr == "packed16") { // packed16 only supports AVX2. AVX512 might be added later
saveGemmType = Type::packed16;
} else if(saveGemmTypeStr == "packed8") {
saveGemmType = Type::packed8;
} else if(saveGemmTypeStr == "packed8avx2") { // packed8 for AVX2
saveGemmType = Type::packed8avx2;
} else if(saveGemmTypeStr == "packed8avx512") { // packed8 for AVX512
saveGemmType = Type::packed8avx512;
} else {
ABORT("Unknown gemm-type: {}", saveGemmTypeStr);
}
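
The explicit chain is kept rather than a call to typeFromString() (which gains the same two names later in this PR), presumably so that only GEMM-capable types are accepted and the error message stays specific. A hedged sketch of what reusing the shared parser would look like (the re-check via ABORT_IF is illustrative):

// Looser alternative: typeFromString() accepts every element type,
// so the GEMM-capable subset must be re-checked afterwards.
marian::Type saveGemmType = marian::typeFromString(saveGemmTypeStr);
ABORT_IF(!marian::isPacked(saveGemmType) && saveGemmType != marian::Type::float32,
         "Unknown gemm-type: {}", saveGemmTypeStr);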


@@ -9,22 +9,30 @@ namespace marian {
// multiplying. All cases are handled here and can later be passed to allocators etc.
size_t requiredBytes(const Shape& shape, Type type) {
#if USE_FBGEMM
if (type == Type::packed8)
{
int nrow, ncol;
uint64_t packsize;
cpu::variant::fbgemmPacked8PackInfo(shape, false, /*out=*/nrow, /*out=*/ncol, /*out=*/packsize);
return (size_t)packsize;
} else if (type == Type::packed16)
{
uint64_t packsize;
cpu::variant::fbgemmPacked16PackInfo(shape, false, /*out=*/packsize);
return (size_t)packsize;
} else
#endif // USE_FBGEMM
{
if (isPacked(type)) {
if (sizeOf(type) == 1) {
// type == Type::packed8avx2 || type == Type::packed8avx512
// AVX2 and AVX512 CPUs have different cache characteristics and vector widths,
// so the optimal memory layouts for them differ.
int nrow, ncol;
uint64_t packsize;
cpu::variant::fbgemmPacked8PackInfo(shape, type, false, /*out=*/nrow, /*out=*/ncol, /*out=*/packsize);
return (size_t)packsize;
} else if (type == Type::packed16) {
uint64_t packsize;
cpu::variant::fbgemmPacked16PackInfo(shape, false, /*out=*/packsize);
return (size_t)packsize;
} else {
ABORT("Not a supported data type: {}", type);
return 0;
}
} else {
return shape.elements() * sizeOf(type);
}
#else
return shape.elements() * sizeOf(type);
#endif // USE_FBGEMM
}
}
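
Because the packed formats carry FBGEMM-specific padding plus quantization metadata, requiredBytes() is the only correct way to size an allocation for them; every other type is still elements times element size. A hedged usage example (the shape is illustrative):

marian::Shape shape({512, 512});                                         // illustrative dims
size_t plain  = marian::requiredBytes(shape, marian::Type::float32);     // 512*512*4 bytes
size_t packed = marian::requiredBytes(shape, marian::Type::packed8avx2); // padded buffer size
                                                                         // + per-column metadata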


@@ -135,13 +135,19 @@ do { \
namespace marian {
// small struct to enable templating based on types used for packing
struct packed8 {
struct packed16 {
uint16_t x;
};
// small struct to enable templating based on types used for packing. This is a memory holder.
// There is no structural difference between packed8avx2 and packed8avx512, but they are defined separately so the two layouts can be distinguished.
struct packed8avx2 {
uint8_t x;
};
// small struct to enable templating based on types used for packing
struct packed16 {
uint16_t x;
// small struct to enable templating based on types used for packing. This is a memory holder.
struct packed8avx512 {
uint8_t x;
};
#ifndef __CUDACC__ // vectorized types not available from .cu files
@@ -209,18 +215,25 @@ struct float32x8 {
// Internal to types.h, don't use. Use test functions below.
enum class TypeClass : size_t {
signed_type = 0x100,
unsigned_type = 0x200,
float_type = 0x400,
packed_type = 0x800, // special packed (CPU cache friendly) type class, used in FBGEMM, not meant to be used anywhere else
signed_type = 0x0100,
unsigned_type = 0x0200,
float_type = 0x0400,
size_mask = 0x0FF
packed_type = 0x0800, // special packed (CPU cache friendly) type class, used in FBGEMM, not meant to be used anywhere else
avx2_type = 0x1000, // processor-specific layout for avx2, currently used for FBGEMM only
avx512_type = 0x2000, // processor-specific layout for avx512, currently used for FBGEMM only
size_mask = 0x00FF
};
constexpr inline size_t operator+(TypeClass typeClass, size_t val) {
return (size_t)typeClass + val;
}
constexpr inline size_t operator+(size_t val, TypeClass typeClass) {
return val + (size_t)typeClass;
}
// @TODO: rename to ElementType when things become stable, so it's easier to review
enum class Type : size_t {
int8 = TypeClass::signed_type + 1u,
@@ -237,8 +250,10 @@ enum class Type : size_t {
float32 = TypeClass::float_type + 4u,
float64 = TypeClass::float_type + 8u,
packed8 = TypeClass::packed_type + 1u, // special type for FBGEMM, not meant to be used anywhere else, not meant to be accessed individually. Internal actual type (uint8) is meaningless.
packed16 = TypeClass::packed_type + 2u // special type for FBGEMM, not meant to be used anywhere else, not meant to be accessed individually. Internal actual type (uint16) is meaningless.
packed16 = TypeClass::packed_type + 2u, // special type for FBGEMM, not meant to be used anywhere else, not meant to be accessed individually. Internal actual type (uint16) is meaningless.
packed8avx2 = TypeClass::packed_type + 1u + TypeClass::avx2_type, // special type for FBGEMM with AVX2, not meant to be used anywhere else, not meant to be accessed individually. Internal actual type (uint8) is meaningless.
packed8avx512 = TypeClass::packed_type + 1u + TypeClass::avx512_type, // special type for FBGEMM with AVX512, not meant to be used anywhere else, not meant to be accessed individually. Internal actual type (uint8) is meaningless.
};
static inline size_t operator&(TypeClass typeClass, Type type) {
@@ -269,6 +284,14 @@ static inline bool isPacked(Type type) {
return (TypeClass::packed_type & type) != 0;
}
static inline bool isAvx2(Type type) {
return (TypeClass::avx2_type & type) != 0;
}
static inline bool isAvx512(Type type) {
return (TypeClass::avx512_type & type) != 0;
}
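
Each predicate is a single mask test because the class bits occupy disjoint ranges. Working out the arithmetic for the new enumerators from the definitions above (a sketch; assumes types.h is in scope):

// packed8avx2   = 0x0800 (packed) + 0x1 (size) + 0x1000 (avx2)   = 0x1801
// packed8avx512 = 0x0800 (packed) + 0x1 (size) + 0x2000 (avx512) = 0x2801
static_assert((size_t)marian::Type::packed8avx2   == 0x1801, "class bits + nominal size");
static_assert((size_t)marian::Type::packed8avx512 == 0x2801, "class bits + nominal size");
// The low byte (size_mask) still reads as a 1-byte element, which is why the
// dispatch sites below can select the int8 path with isPacked(t) && sizeOf(t) == 1.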
size_t requiredBytes(const Shape& shape, Type type); // towards Frank's vision of joint Shape/Type
template <typename T>
@@ -290,8 +313,9 @@ template <> inline bool matchType<float16>(Type type) { return type == Type::fl
template <> inline bool matchType<float>(Type type) { return type == Type::float32; }
template <> inline bool matchType<double>(Type type) { return type == Type::float64; }
template <> inline bool matchType<packed8>(Type type) { return type == Type::packed8; }
template <> inline bool matchType<packed16>(Type type) { return type == Type::packed16; }
template <> inline bool matchType<packed16>(Type type) { return type == Type::packed16; }
template <> inline bool matchType<packed8avx2>(Type type) { return type == Type::packed8avx2; }
template <> inline bool matchType<packed8avx512>(Type type) { return type == Type::packed8avx512; }
// clang-format on
static inline std::ostream& operator<<(std::ostream& out, Type type) {
@@ -310,8 +334,9 @@ static inline std::ostream& operator<<(std::ostream& out, Type type) {
case Type::float32 : out << "float32"; break;
case Type::float64 : out << "float64"; break;
case Type::packed8 : out << "packed8"; break;
case Type::packed16: out << "packed16"; break;
case Type::packed16 : out << "packed16"; break;
case Type::packed8avx2 : out << "packed8avx2"; break;
case Type::packed8avx512 : out << "packed8avx512"; break;
}
return out;
}
@@ -334,8 +359,9 @@ template <> inline std::string request<float16>() { return "float16"; }
template <> inline std::string request<float>() { return "float32"; }
template <> inline std::string request<double>() { return "float64"; }
template <> inline std::string request<packed8>() { return "packed8"; }
template <> inline std::string request<packed16>() { return "packed16"; }
template <> inline std::string request<packed8avx2>() { return "packed8avx2"; }
template <> inline std::string request<packed8avx512>() { return "packed8avx512"; }
// clang-format on
static Type inline typeFromString(const std::string& str) {
@@ -363,6 +389,13 @@ static Type inline typeFromString(const std::string& str) {
return Type::float32;
if(str == "float64")
return Type::float64;
if(str == "packed16")
return Type::packed16;
if(str == "packed8avx2")
return Type::packed8avx2;
if(str == "packed8avx512")
return Type::packed8avx512;
ABORT("Unknown type {}", str);
}
@@ -384,6 +417,10 @@ template <> inline Type typeId<float16>() { return Type::float16; }
template <> inline Type typeId<float>() { return Type::float32; }
template <> inline Type typeId<double>() { return Type::float64; }
template <> inline Type typeId<packed16>() { return Type::packed16; }
template <> inline Type typeId<packed8avx2>() { return Type::packed8avx2; }
template <> inline Type typeId<packed8avx512>() { return Type::packed8avx512; }
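
Together with operator<< and typeFromString(), these specializations keep the string and template views of the new types consistent; a hedged round-trip check (the function name is hypothetical):

#include <cassert>
inline void checkPacked8Names() {
  assert(marian::typeFromString("packed8avx2")   == marian::typeId<marian::packed8avx2>());
  assert(marian::typeFromString("packed8avx512") == marian::typeId<marian::packed8avx512>());
}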
// Abort if given C++ does not correspond to runtime type
template <typename T>
void matchOrAbort(Type type) {


@@ -262,7 +262,7 @@ bool convertModel(std::string inputFile, std::string outputFile, int32_t targetP
if (targetPrec == 16)
saveGemmType = Type::packed16;
else if (targetPrec == 8)
saveGemmType = Type::packed8;
saveGemmType = Type::packed8avx2; // We currently use avx2 by default.
// added a flag indicating whether the weights need to be packed or not
graph->packAndSave(outputFile, configStr.str(), saveGemmType);


@@ -126,6 +126,7 @@ struct FbgemmPacked16PackNodeOp : public UnaryNodeOp {
// Pack a matrix into a cache-efficient block format, quantizing it into int8 on the way
// PackMatrix packMat_: the type of packed matrix - A or B matrix
// marian::Type packType_: the packed layout of the input matrix - packed8avx2 or packed8avx512
// bool transpose_: whether the matrix is transposed
// int nrow_: the number of rows
// int ncol_: the number of columns
@@ -133,14 +134,20 @@ struct FbgemmPacked16PackNodeOp : public UnaryNodeOp {
// (the size of int8 packed B from fbgemm::PackAWithQuantRowOffset + quantization scale, offset and zero point)
struct FbgemmPacked8PackNodeOp : public UnaryNodeOp {
PackMatrix packMat_;
marian::Type packType_;
bool transpose_;
int nrow_;
int ncol_;
uint64_t packsize_;
FbgemmPacked8PackNodeOp(Expr a, PackMatrix packMat, bool transpose, float clipValue)
FbgemmPacked8PackNodeOp(Expr a,
PackMatrix packMat,
marian::Type packType,
bool transpose,
float clipValue)
: UnaryNodeOp(a, newShape(a, transpose), Type::uint8),
packMat_(packMat),
packType_(packType),
transpose_(transpose) {
if(packMat != PackMatrix::B)
ABORT("Only prepacking of B (weight matrix) is supported");
@@ -154,6 +161,7 @@ struct FbgemmPacked8PackNodeOp : public UnaryNodeOp {
#if USE_FBGEMM
return {NodeOp(fbgemmPacked8Pack(val_,
child(0)->val()->data(),
packType_,
transpose_,
nrow_,
ncol_,
@@ -174,7 +182,7 @@ struct FbgemmPacked8PackNodeOp : public UnaryNodeOp {
Shape newShape(Expr a, bool transpose) {
#if USE_FBGEMM
fbgemmPacked8PackInfo(a->shape(), transpose, nrow_, ncol_, packsize_);
fbgemmPacked8PackInfo(a->shape(), packType_, transpose, nrow_, ncol_, packsize_);
Shape outShape({(int)packsize_});
return outShape;
@@ -362,7 +370,7 @@ static inline Expr affine(Expr a, Expr b, Shape bShape, Expr c, bool transA, boo
if (elementType == Type::packed16)
return Expression<cpu::variant::FbgemmPacked16AffineNodeOp>(nodes, bShape, transA, transB, scalar);
else if (elementType == Type::packed8)
else if (isPacked(elementType) && sizeOf(elementType) == 1)
return Expression<cpu::variant::FbgemmPacked8AffineNodeOp>(nodes, bShape, transA, transB, scalar);
else {
ABORT("Only int8 and fp16 are available. {}", elementType);
@@ -373,8 +381,8 @@ static inline Expr affine(Expr a, Expr b, Shape bShape, Expr c, bool transA, boo
static inline Expr pack(Type elementType, Expr a, PackMatrix packMat, bool transpose, float clipValue) {
if (elementType == Type::packed16)
return Expression<cpu::variant::FbgemmPacked16PackNodeOp>(a, packMat, transpose, clipValue);
else if (elementType == Type::packed8)
return Expression<cpu::variant::FbgemmPacked8PackNodeOp>(a, packMat, transpose, clipValue);
else if (isPacked(elementType) && sizeOf(elementType) == 1)
return Expression<cpu::variant::FbgemmPacked8PackNodeOp>(a, packMat, elementType, transpose, clipValue);
else {
ABORT("Only int8 and fp16 are available. {}", elementType);
return nullptr;
@@ -387,7 +395,7 @@ static inline Expr dot(Expr a, Expr b, Shape bShape, bool transA, bool transB, f
if (elementType == Type::packed16)
return Expression<cpu::variant::FbgemmPacked16AffineNodeOp>(nodes, bShape, transA, transB, scalar);
else if (elementType == Type::packed8)
else if (isPacked(elementType) && sizeOf(elementType) == 1)
return Expression<cpu::variant::FbgemmPacked8AffineNodeOp>(nodes, bShape, transA, transB, scalar);
else {
ABORT("Only int8 and fp16 are available. {}", elementType);


@@ -36,7 +36,8 @@ public:
// save as packed format
// @TODO Hardcoded to find packable weights - all the weights used for affine op (fp16), all the weights used for affine op and dot op (int8)
if (gemmElementType == Type::packed8 && (pName.find("_W") == pName.length() - 3 || pName.find("_W") == pName.length() - 2)) {
if ((gemmElementType == Type::packed8avx2 || gemmElementType == Type::packed8avx512)
&& (pName.find("_W") == pName.length() - 3 || pName.find("_W") == pName.length() - 2)) {
#if USE_FBGEMM
using namespace marian::cpu::variant;
// packing information - size
@@ -45,10 +46,11 @@ public:
uint64_t packsize;
fbgemmPacked8PackInfo(val->shape(),
pName.find("Wemb") != std::string::npos,
nrow,
ncol,
packsize);
gemmElementType,
pName.find("Wemb") != std::string::npos,
nrow,
ncol,
packsize);
auto allocator = New<TensorAllocator>(getBackend());
@@ -58,11 +60,12 @@ public:
//Pack B matrix into int8
fbgemmPacked8Pack(packedTensor,
val->data(),
pName.find("Wemb") != std::string::npos,
nrow,
ncol,
packsize);
val->data(),
gemmElementType,
pName.find("Wemb") != std::string::npos,
nrow,
ncol,
packsize);
io::Item item;
item.name = pName;
item.shape = val->shape();
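
The suffix test gating this path encodes Marian's weight-naming convention; restated as a hedged helper (the name is hypothetical):

#include <string>
// True when "_W" is the last two characters of the parameter name, or is
// followed by exactly one character (e.g. "..._W" or "..._Wq"), per the
// condition used above.
inline bool isPackableWeight(const std::string& pName) {
  return pName.find("_W") == pName.length() - 2
      || pName.find("_W") == pName.length() - 3;
}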


@@ -113,6 +113,39 @@ inline uint64_t addr(const int r_,
return index;
}
// Memory blocking factors (parameters) for packing into AVX2 int8
static const fbgemm::BlockingFactors Packed8Avx2BlockingFactors = {
PackingTraits<int8_t, int32_t, inst_set_t::avx2>::MR,
PackingTraits<int8_t, int32_t, inst_set_t::avx2>::NR,
PackingTraits<int8_t, int32_t, inst_set_t::avx2>::NR_MIN,
PackingTraits<int8_t, int32_t, inst_set_t::avx2>::ROW_INTERLEAVE,
PackingTraits<int8_t, int32_t, inst_set_t::avx2>::MCB,
PackingTraits<int8_t, int32_t, inst_set_t::avx2>::KCB,
PackingTraits<int8_t, int32_t, inst_set_t::avx2>::NCB
};
// Memory blocking factors (parameters) for packing into AVX512 int8
static const fbgemm::BlockingFactors Packed8Avx512BlockingFactors = {
PackingTraits<int8_t, int32_t, inst_set_t::avx512>::MR,
PackingTraits<int8_t, int32_t, inst_set_t::avx512>::NR,
PackingTraits<int8_t, int32_t, inst_set_t::avx512>::NR_MIN,
PackingTraits<int8_t, int32_t, inst_set_t::avx512>::ROW_INTERLEAVE,
PackingTraits<int8_t, int32_t, inst_set_t::avx512>::MCB,
PackingTraits<int8_t, int32_t, inst_set_t::avx512>::KCB,
PackingTraits<int8_t, int32_t, inst_set_t::avx512>::NCB
};
// This function returns the correct blocking factors structure for a given packing type.
inline const fbgemm::BlockingFactors* getBlockingFactors(marian::Type packType) {
if(packType == Type::packed8avx2) {
return &Packed8Avx2BlockingFactors;
} else if(packType == Type::packed8avx512) {
return &Packed8Avx512BlockingFactors;
} else {
ABORT("Only avx2 and avx512 instruction sets are supported for int8. {}", packType);
}
}
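
The two tables make the layout split concrete: the AVX512 PackingTraits use wider register tiles and larger cache blocks than AVX2, so a buffer packed for one instruction set is laid out incorrectly for the other. A hedged peek at the selected parameters (the fields are the ones listed above; concrete values depend on the FBGEMM version):

const fbgemm::BlockingFactors* p = getBlockingFactors(marian::Type::packed8avx512);
// MR/NR: register-tile shape; MCB/KCB/NCB: cache-block sizes;
// ROW_INTERLEAVE: rows interleaved within a packed block.
printf("NR=%d MCB=%d KCB=%d NCB=%d\n", p->NR, p->MCB, p->KCB, p->NCB);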
void fbgemmPacked16PackInfo(const marian::Shape& shape,
const bool transpose,
uint64_t& packsize) {
@@ -145,6 +178,7 @@ void fbgemmPacked16PackInfo(const marian::Shape& shape,
}
void fbgemmPacked8PackInfo(const marian::Shape& shape,
const marian::Type packType,
const bool transpose,
int& nrow,
int& ncol,
@@ -154,9 +188,12 @@ void fbgemmPacked8PackInfo(const marian::Shape& shape,
"Weight Matrix should be 2D");
nrow = transpose ? shape[1] : shape[0];
ncol = transpose ? shape[0] : shape[1];
const fbgemm::BlockingFactors* params = getBlockingFactors(packType);
packsize = fbgemm::PackMatrix<fbgemm::PackBMatrix<int8_t>, int8_t>::packedBufferSize(
transpose ? shape[1] : shape[0],
transpose ? shape[0] : shape[1]);
transpose ? shape[0] : shape[1], params);
// add extra space for storing some other variables specific to the B matrix
// quantization scales: one float per column
// quantization offsets: one int32 per column
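
Per the memcpy offsets later in this file, the extra space amounts to three values per column of B appended after FBGEMM's buffer; a sketch of the arithmetic the comment describes (mirroring, not quoting, the actual code):

// total = FBGEMM's padded, layout-dependent buffer (from packedBufferSize)
//       + per-column metadata: scale (float), zero point (int32), offset (int32)
packsize += ncol * (sizeof(float) + sizeof(int32_t) + sizeof(int32_t));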
@@ -229,6 +266,7 @@ void fbgemmPacked16Pack(marian::Tensor out,
void fbgemmPacked8Pack(marian::Tensor out,
const float* inData,
const marian::Type packType,
const bool transpose,
const int nrow,
const int ncol,
@@ -318,9 +356,11 @@ void fbgemmPacked8Pack(marian::Tensor out,
}
// 4. packing
const fbgemm::BlockingFactors* params = getBlockingFactors(packType);
PackBMatrix<int8_t> packedBN(
transpose ? matrix_op_t::Transpose : matrix_op_t::NoTranspose,
nrow, ncol, quantized, transpose ? nrow : ncol, packedbuf, 1);
nrow, ncol, quantized, transpose ? nrow : ncol, packedbuf, 1, params);
// copy quantization scale
memcpy(packedbuf + (packsize - n * (sizeof(float) + sizeof(int32_t) + sizeof(int32_t))), bqScale, n * sizeof(float));
@@ -428,6 +468,18 @@ void fbgemmPacked8Gemm(marian::Tensor C,
const size_t k,
const int transA,
const int transB) {
// pack type
marian::Type packType = B->type();
const fbgemm::BlockingFactors* params = getBlockingFactors(packType);
if((packType == Type::packed8avx2 && fbgemmHasAvx512Support())
|| (packType == Type::packed8avx512 && !fbgemmHasAvx512Support())) {
ABORT("FBGEMM doesn't allow to use {} packing order on {} CPUs",
packType == Type::packed8avx2 ? "AVX2" : "AVX512",
fbgemmHasAvx512Support() ? "AVX512" : "AVX2");
}
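
Note that the check rejects mismatches in both directions: an AVX512 CPU may not consume an AVX2-packed matrix either, since the execution kernels assume the blocking that matches the host instruction set. The invariant as a hedged helper (name hypothetical):

// True iff the packed layout matches what this host's FBGEMM kernels expect.
inline bool hostMatchesPackType(marian::Type packType) {
  return marian::isAvx512(packType) == fbgemmHasAvx512Support();
}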
// compute range to quantize A (activations) - (min/max quantization)
float min_est = std::numeric_limits<float>::max(), max_est = std::numeric_limits<float>::min();
@@ -442,15 +494,16 @@ void fbgemmPacked8Gemm(marian::Tensor C,
std::vector<int32_t> row_offset_buf(PackAWithQuantRowOffset<uint8_t>::rowOffsetBufferSize());
PackAWithQuantRowOffset<uint8_t> packAN(
transA ? matrix_op_t::Transpose : matrix_op_t::NoTranspose,
(int32_t) (transA ? k : m),
(int32_t) (transA ? m : k),
(int32_t)(transA ? k : m),
(int32_t)(transA ? m : k),
A->data(),
(int32_t) (transA ? m : k),
(int32_t)(transA ? m : k),
nullptr, /*buffer for packed matrix*/
ascale,
azeropoint,
1, /*groups*/
row_offset_buf.data());
row_offset_buf.data(),
params);
// packed matrix size of B
int bPackSize = PackMatrix<PackBMatrix<int8_t>, int8_t>::packedBufferSize((int32_t)k, (int32_t)n);
@@ -479,10 +532,10 @@ void fbgemmPacked8Gemm(marian::Tensor C,
(std::uint32_t) n);
PackBMatrix<int8_t> repackedBN(
transB ? matrix_op_t::Transpose : matrix_op_t::NoTranspose, (int32_t) k, (int32_t) n, bdata, (int32_t) (transB ? k : n, 1));
transB ? matrix_op_t::Transpose : matrix_op_t::NoTranspose, (int32_t) k, (int32_t) n, bdata, (int32_t) (transB ? k : n), 1, params);
// gemm computation
fbgemmPacked(packAN, repackedBN, C->data(), (int32_t*)C->data(), (int32_t) n, outputProcObj, 0, 1);
fbgemmPacked(packAN, repackedBN, C->data(), (int32_t*)C->data(), (int32_t) n, outputProcObj, 0, 1, params);
delete[] col_offsets;
delete[] bqZeropoint;


@@ -46,11 +46,13 @@ void fbgemmPacked16PackInfo(const marian::Shape& shape,
// Returns the byte size of a packed int8 matrix. It is calculated by fbgemm's internal logic, since padding and layout differ per instruction set.
// See '3rd_party/fbgemm/src/PackBMatrix.cc'.
// shape: shape of the tensor to be packed
// packType: the packed layout to produce - packed8avx2 or packed8avx512
// transpose: whether the matrix is transposed
// nrow (out): the number of rows
// ncol (out): the number of columns
// packsize (out): the size of the packed matrix in bytes
void fbgemmPacked8PackInfo(const marian::Shape& shape,
const marian::Type packType,
const bool transpose,
/*out*/int& nrow,
/*out*/int& ncol,
@@ -86,6 +88,7 @@ void fbgemmPacked16Pack(marian::Tensor out,
// Pack a matrix into a cache-efficient block format, quantizing it into int8 on the way
// out: output tensor - packed format and quantized into int8
// inData: input tensor data - pointer to float data
// packType: the packed layout to produce - packed8avx2 or packed8avx512
// transpose: whether the matrix is transposed
// nrow: the number of rows
// ncol: the number of columns
@@ -93,6 +96,7 @@ void fbgemmPacked16Pack(marian::Tensor out,
// (the size of int8 packed B from fbgemm::PackAWithQuantRowOffset + quantization scale, offset and zero point)
void fbgemmPacked8Pack(marian::Tensor out,
const float* inData,
const marian::Type packType,
const bool transpose,
const int nrow,
const int ncol,