Merge branch 'pmaster'
Commit: 1c0b899444
@@ -696,6 +696,15 @@ void ConfigParser::addOptionsTranslation(cli::CLIWrapper& cli) {
      "Use approximate knn search in output layer (currently only in transformer)")
      ->implicit_val("100 1024");

  // parameters for on-line quantization
  cli.add<bool>("--optimize",
      "Optimize the graph on-the-fly", false);
  cli.add<std::string>("--gemm-type,-g",
      "GEMM type to be used for on-line quantization/packing: float32, packed16, packed8", "float32");
  cli.add<float>("--quantize-range",
      "Range for on-line quantization of the weight matrix, expressed as a multiple of the column-wise standard deviation; 0.0 means min/max quantization",
      0.f);

#if 0 // @TODO: Ask Hany if there are any decoding-time options
  // add ULR settings
  addSuboptionsULR(cli);
@@ -747,6 +756,15 @@ void ConfigParser::addOptionsScoring(cli::CLIWrapper& cli) {
      "Mixed precision for inference, set parameter type in expression graph",
      {"float32"});

  // parameters for on-line quantization
  cli.add<bool>("--optimize",
      "Optimize the graph on-the-fly", false);
  cli.add<std::string>("--gemm-type,-g",
      "GEMM type to be used for on-line quantization/packing: float32, packed16, packed8", "float32");
  cli.add<float>("--quantize-range",
      "Range for on-line quantization of the weight matrix, expressed as a multiple of the column-wise standard deviation; 0.0 means min/max quantization",
      0.f);

  cli.switchGroup(previous_group);
  // clang-format on
}
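The three options added above are only plumbing: they are read once per CPU device when the expression graph is set up and forwarded to the backend (the translator and scorer hunks at the end of this commit do exactly this). A minimal sketch of that wiring, assuming the usual `options_` and `graph` handles:

    if(device.type == DeviceType::cpu) {
      graph->getBackend()->setOptimized(options_->get<bool>("optimize"));
      graph->getBackend()->setGemmType(options_->get<std::string>("gemm-type"));
      graph->getBackend()->setQuantizeRange(options_->get<float>("quantize-range"));
    }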
@@ -483,7 +483,45 @@ Expr dot(Expr a, Expr b, bool transA, bool transB, float scale) {
  // --optimize --cpu-thread=N with N > 0 are set.
  if(device == DeviceType::cpu) {
    if(isFloat(aElementType) && isFloat(bElementType)) {
      return Expression<DotNodeOp>(a, b, transA, transB, scale);
      if(b->memoize() && (a->graph()->getBackend()->getGemmType() == GemmType::FbFp16Packed ||
                          a->graph()->getBackend()->getGemmType() == GemmType::FbInt8Packed)) {
#if USE_FBGEMM
        if(a->graph()->getBackend()->getGemmType() == GemmType::FbFp16Packed) {
          auto packedB = cpu::variant::pack(
              marian::Type::packed16, b, cpu::variant::PackMatrix::B, transB);
          return cpu::variant::dot(marian::Type::packed16,
                                   a, packedB, b->shape(), transA, transB, scale);
        } else {
          float quantizeRange = b->graph()->getBackend()->getQuantizeRange();
          if(fbgemm::fbgemmHasAvx512Support()) {
            auto packedB = cpu::variant::pack(marian::Type::packed8avx512,
                                              b,
                                              cpu::variant::PackMatrix::B,
                                              transB,
                                              quantizeRange);
            return cpu::variant::dot(marian::Type::packed8avx512,
                                     a, packedB, b->shape(), transA, transB, scale);
          } else if(fbgemm::fbgemmHasAvx2Support()) {
            auto packedB = cpu::variant::pack(marian::Type::packed8avx2,
                                              b,
                                              cpu::variant::PackMatrix::B,
                                              transB,
                                              quantizeRange);
            return cpu::variant::dot(marian::Type::packed8avx2,
                                     a, packedB, b->shape(), transA, transB, scale);
          } else {
            ABORT(
                "AVX2 is not available. At least, AVX2 is needed to use fbgemm-based packed "
                "GEMM");
          }
        }
#else
        ABORT("Packed GEMM is not available in this build");
#endif // USE_FBGEMM
      } else {
        return Expression<DotNodeOp>(
            a, b, transA, transB, scale);
      }
    } else if(isFloat(aElementType) && isIntgemm(bElementType)) {
      return cpu::integer::affineOrDot(a, b, nullptr, transA, transB, scale);
    } else if(isFloat(aElementType) && isPacked(bElementType)) {
@@ -495,7 +533,8 @@ Expr dot(Expr a, Expr b, bool transA, bool transB, float scale) {
      // and this cpu lookup is executed only once and the state is kept in FBGEMM.
      if(fbgemm::fbgemmHasAvx2Support()) {
        // This variant of dot product can handle matrix multiplications with packed8 and packed16 weight matrix (B).
        return cpu::variant::dot(a,
        return cpu::variant::dot(b->value_type(),
                                 a,
                                 b,
                                 b->shape(),
                                 transA,
@@ -541,7 +580,48 @@ Expr affine(Expr a, Expr b, Expr bias, bool transA, bool transB, float scale) {

  if(device == DeviceType::cpu) {
    if(isFloat(aElementType) && isFloat(bElementType)) {
      return affineDefault(a, b, bias, transA, transB, scale);
      if(a->graph()->getBackend()->isOptimized()) {
        if(b->memoize() && (a->graph()->getBackend()->getGemmType() == GemmType::FbFp16Packed ||
                            a->graph()->getBackend()->getGemmType() == GemmType::FbInt8Packed)) {
#if USE_FBGEMM
          if(a->graph()->getBackend()->getGemmType() == GemmType::FbFp16Packed) {
            auto packedB = cpu::variant::pack(
                marian::Type::packed16, b, cpu::variant::PackMatrix::B, transB);
            return cpu::variant::affine(marian::Type::packed16,
                                        a, packedB, b->shape(), bias, transA, transB, scale);
          } else {
            float quantizeRange = b->graph()->getBackend()->getQuantizeRange();
            if(fbgemm::fbgemmHasAvx512Support()) {
              auto packedB = cpu::variant::pack(marian::Type::packed8avx512,
                                                b,
                                                cpu::variant::PackMatrix::B,
                                                transB,
                                                quantizeRange);
              return cpu::variant::affine(marian::Type::packed8avx512,
                                          a, packedB, b->shape(), bias, transA, transB, scale);
            } else if(fbgemm::fbgemmHasAvx2Support()) {
              auto packedB = cpu::variant::pack(marian::Type::packed8avx2,
                                                b,
                                                cpu::variant::PackMatrix::B,
                                                transB,
                                                quantizeRange);
              return cpu::variant::affine(marian::Type::packed8avx2,
                                          a, packedB, b->shape(), bias, transA, transB, scale);
            } else {
              ABORT(
                  "AVX2 is not available. At least, AVX2 is needed to use fbgemm-based packed "
                  "GEMM");
            }
          }
#else
          ABORT("Packed GEMM is not available in this build");
#endif // USE_FBGEMM
        } else {
          return affineDefault(a, b, bias, transA, transB, scale);
        }
      } else {
        return affineDefault(a, b, bias, transA, transB, scale);
      }
    } else if(isFloat(aElementType) && isIntgemm(bElementType)) {
      return cpu::integer::affineOrDot(a, b, bias, transA, transB, scale);
    } else if(isFloat(aElementType) && isPacked(bElementType)) {
@@ -553,7 +633,8 @@ Expr affine(Expr a, Expr b, Expr bias, bool transA, bool transB, float scale) {
      // and this cpu lookup is executed only once and the state is kept in FBGEMM.
      if(fbgemm::fbgemmHasAvx2Support()) {
        // This variant of affine product can handle matrix multiplications with packed8 and packed16 weight matrix (B).
        return cpu::variant::affine(a,
        return cpu::variant::affine(b->value_type(),
                                    a,
                                    b,
                                    b->shape(),
                                    bias,
@@ -177,6 +177,8 @@ static inline std::function<Expr(Expr)> activationByName(const std::string& actName
    return (ActivationFunction*)swish;
  else if (actName == "gelu")
    return (ActivationFunction*)gelu;
  else if (actName == "sigmoid")
    return (ActivationFunction*)sigmoid;
  else if (actName == "") // return identity function if activation name is empty
    return [](Expr x) { return x; };
  ABORT("Invalid activation name '{}'", actName);
@@ -5,6 +5,14 @@

namespace marian {

// GEMM type enum
typedef enum {
  Auto = 0,          // auto tuning between available GEMMs
  Float32 = 1,       // MKL based GEMM, fp32
  FbFp16Packed = 10, // FBGEMM based fp16 GEMM with packing
  FbInt8Packed = 11  // FBGEMM based int8 GEMM with packing
} GemmType;

class Backend {
protected:
  DeviceId deviceId_;
@@ -21,6 +29,19 @@ public:
  // for GPU only, calls cudaSetDevice, does nothing on CPU. Maybe change name.
  virtual void setDevice() = 0;
  virtual void synchronize() = 0;

  // for CPU, sets to use optimized code for inference.
  // for GPU, this is invalid; isOptimized() always returns false.
  virtual void setOptimized(bool optimize) = 0;
  virtual bool isOptimized() = 0;
  // for CPU, selects different GEMM types for inference.
  // for GPU, there is no GEMM type, so it does nothing.
  virtual void setGemmType(std::string gemmType) = 0;
  virtual GemmType getGemmType() = 0;
  // for CPU, sets the quantization range of weight matrices for inference.
  // for GPU, there is no quantization, so it does nothing.
  virtual void setQuantizeRange(float range) = 0;
  virtual float getQuantizeRange() = 0;
};

Ptr<Backend> BackendByDeviceId(DeviceId deviceId, size_t seed);
@@ -10,10 +10,34 @@ namespace marian {
namespace cpu {

class Backend : public marian::Backend {
protected:
  bool optimized_{false};
  GemmType gemmType_{GemmType::Float32};
  float quantizeRange_{0.f};

public:
  Backend(DeviceId deviceId, size_t seed) : marian::Backend(deviceId, seed) {}
  void setDevice() override {}
  void synchronize() override {}

  // for CPU & inference only, sets to use optimized code for inference. Does nothing for GPU.
  void setOptimized(bool optimize) override { optimized_ = optimize; }
  bool isOptimized() override { return optimized_; }
  // for CPU only, selects different GEMM types for inference. Does nothing for GPU.
  void setGemmType(std::string gemmType) override {
    if (gemmType == "auto") gemmType_ = GemmType::Auto;
    else if (gemmType == "float32") gemmType_ = GemmType::Float32;
#if USE_FBGEMM
    else if (gemmType == "packed16") gemmType_ = GemmType::FbFp16Packed;
    else if (gemmType.find("packed8") == 0) gemmType_ = GemmType::FbInt8Packed;
#endif // USE_FBGEMM
    else ABORT("Unknown GEMM type - '{}'", gemmType);
  }
  GemmType getGemmType() override { return gemmType_; }
  // for CPU, sets the quantization range of weight matrices for inference.
  // for GPU, there is no quantization, so it does nothing.
  void setQuantizeRange(float range) override { quantizeRange_ = range; }
  float getQuantizeRange() override { return quantizeRange_; }
};

} // namespace cpu
@@ -138,15 +138,18 @@ struct FbgemmPacked8PackNodeOp : public UnaryNodeOp {
  int nrow_;
  int ncol_;
  uint64_t packsize_;
  float quantizeRange_;

  FbgemmPacked8PackNodeOp(Expr a,
                          PackMatrix packMat,
                          marian::Type packType,
                          bool transpose)
      : UnaryNodeOp(a, newShape(a, transpose), Type::uint8),
                          bool transpose,
                          float quantizeRange)
      : UnaryNodeOp(a, newShape(a, packType, transpose), Type::uint8),
        packMat_(packMat),
        packType_(packType),
        transpose_(transpose) {
        transpose_(transpose),
        quantizeRange_(quantizeRange) {
    if(packMat != PackMatrix::B)
      ABORT("Only prepacking of B (weight matrix) is supported");
    if(!memoize_)
@@ -161,7 +164,8 @@ struct FbgemmPacked8PackNodeOp : public UnaryNodeOp {
                               transpose_,
                               nrow_,
                               ncol_,
                               packsize_))
                               packsize_,
                               quantizeRange_))
  };
#else // USE_FBGEMM
  ABORT("FbgemmPacked8PackNodeOp can only be used with FBGEMM enabled.");
@@ -177,13 +181,19 @@ struct FbgemmPacked8PackNodeOp : public UnaryNodeOp {
  const std::string type() override { return "packMatInt8"; }

#if USE_FBGEMM
  Shape newShape(Expr a, bool transpose) {
    fbgemmPacked8PackInfo(a->shape(), packType_, transpose, nrow_, ncol_, packsize_);
  Shape newShape(Expr a, marian::Type packType, bool transpose) {
    fbgemmPacked8PackInfo(
        a->shape(),
        packType,
        transpose,
        nrow_,
        ncol_,
        packsize_);
    Shape outShape({(int)packsize_});
    return outShape;
  }
#else
  Shape newShape(Expr /*a*/, bool /*transpose*/) {
  Shape newShape(Expr /*a*/, marian::Type /*packType*/, bool /*transpose*/) {
    ABORT("Packed GEMM requires a build with USE_FBGEMM enabled");
    return Shape();
  }
@@ -282,10 +292,17 @@ private:
  size_t k_;
  bool transA_;
  bool transB_;
  Type elementType_;

public:
  FbgemmPacked8AffineNodeOp(const std::vector<Expr>& nodes, Shape bShape, bool transA, bool transB, float /*scalar*/)
      : NaryNodeOp(nodes, newShape(nodes[0], bShape, transA, transB), Type::float32)/*, scalar_(scalar) */ {
  FbgemmPacked8AffineNodeOp(Type elementType,
                            const std::vector<Expr>& nodes,
                            Shape bShape,
                            bool transA,
                            bool transB,
                            float /*scalar*/)
      : NaryNodeOp(nodes, newShape(nodes[0], bShape, transA, transB), Type::float32),
        elementType_(elementType) {
    transA_ = transA;
    transB_ = transB;
    m_ = nodes[0]->shape().elements() / nodes[0]->shape()[-1];
@@ -324,7 +341,8 @@ public:
#if USE_FBGEMM
    // Do addBias only if it has a bias term
    if (children().size() > 2) {
      nodeOps = { NodeOp(fbgemmPacked8Gemm(val_,
      nodeOps = { NodeOp(fbgemmPacked8Gemm(elementType_,
                                           val_,
                                           child(0)->val(),
                                           child(1)->val(),
                                           m_,
@@ -334,7 +352,8 @@ public:
                                           transB_);
                         marian::cpu::integer::AddBias(val_, child(2)->val())) };
    } else {
      nodeOps = { NodeOp(fbgemmPacked8Gemm(val_,
      nodeOps = { NodeOp(fbgemmPacked8Gemm(elementType_,
                                           val_,
                                           child(0)->val(),
                                           child(1)->val(),
                                           m_,
@@ -358,39 +377,46 @@ public:
  const std::string type() override { return "gemmPacked8"; }
};

static inline Expr affine(Expr a, Expr b, Shape bShape, Expr c, bool transA, bool transB, float scalar) {
static inline Expr affine(Type elementType,
                          Expr a,
                          Expr b,
                          Shape bShape,
                          Expr c,
                          bool transA,
                          bool transB,
                          float scalar) {
  std::vector<Expr> nodes = {a, b, c};
  Type elementType = b->value_type();

  if (elementType == Type::packed16)
    return Expression<FbgemmPacked16AffineNodeOp>(nodes, bShape, transA, transB, scalar);
  else if (isPacked(elementType) && sizeOf(elementType) == 1)
    return Expression<FbgemmPacked8AffineNodeOp>(nodes, bShape, transA, transB, scalar);
    return Expression<cpu::variant::FbgemmPacked8AffineNodeOp>(
        elementType, nodes, bShape, transA, transB, scalar);
  else {
    ABORT("Only int8 and fp16 are available. {}", elementType);
    return nullptr;
  }
}

static inline Expr pack(Type elementType, Expr a, PackMatrix packMat, bool transpose) {
static inline Expr pack(Type elementType, Expr a, PackMatrix packMat, bool transpose, float quantizeRange = 0.f) {
  if (elementType == Type::packed16)
    return Expression<FbgemmPacked16PackNodeOp>(a, packMat, transpose);
  else if (isPacked(elementType) && sizeOf(elementType) == 1)
    return Expression<FbgemmPacked8PackNodeOp>(a, packMat, elementType, transpose);
    return Expression<cpu::variant::FbgemmPacked8PackNodeOp>(a, packMat, elementType, transpose, quantizeRange);
  else {
    ABORT("Only int8 and fp16 are available. {}", elementType);
    return nullptr;
  }
}

static inline Expr dot(Expr a, Expr b, Shape bShape, bool transA, bool transB, float scalar) {
static inline Expr dot(Type elementType, Expr a, Expr b, Shape bShape, bool transA, bool transB, float scalar) {
  std::vector<Expr> nodes = {a, b};
  Type elementType = b->value_type();

  if (elementType == Type::packed16)
    return Expression<FbgemmPacked16AffineNodeOp>(nodes, bShape, transA, transB, scalar);
  else if (isPacked(elementType) && sizeOf(elementType) == 1)
    return Expression<FbgemmPacked8AffineNodeOp>(nodes, bShape, transA, transB, scalar);
    return Expression<cpu::variant::FbgemmPacked8AffineNodeOp>(
        elementType, nodes, bShape, transA, transB, scalar);
  else {
    ABORT("Only int8 and fp16 are available. {}", elementType);
    return nullptr;
@@ -360,10 +360,10 @@ void fbgemmPacked8Pack(marian::Tensor out,

  const float* data = inData;
  float val = 0;

  // Use half of the quantization range to prevent overflow of VPMADDUBSW
  constexpr static int quantizedRange = 127;
  constexpr static int quantizedMax = 63;

  // This routine computes the quantization range for each column - either the min/max range or a quantRangeStdDevs-sigma range.
  for (size_t jj = 0; jj < n; jj++) { // for each column, collect stats (min/max or mean/std.dev.)
@@ -371,32 +371,32 @@ void fbgemmPacked8Pack(marian::Tensor out,
    double mean = 0, sqrSum = 0;
    for (size_t ii = 0; ii < k; ii++) { // in a column, go through all the rows and collect stats
      val = getVal2dArr(data, ii, jj, k, n, transpose);
      // If quantRangeStdDevs is 0.f, the min/max values of the column are used as the quantization range
      if(quantRangeStdDevs == 0.f) {
        if(min > val)
          min = val;
        if(max < val)
          max = val;
      } else {
        // Quantize by std.dev. range
        mean += val;
        sqrSum += val * val;
      }
    }
    // If a quantization range (in multiples of std. dev.) is given with a non-zero value,
    // it calculates the range for this column (a different quantization scale/offset is used for each column)
    if(quantRangeStdDevs != 0.f) {
      mean /= k;
      sqrSum /= k;
      sqrSum -= mean * mean;
      sqrSum = sqrt(sqrSum);
      min = (float)(mean - quantRangeStdDevs * sqrSum);
      max = (float)(mean + quantRangeStdDevs * sqrSum);
    }
    // based on the quantization range, this computes the scale and offset for the quantization
    quantScaleB[jj] = (max - min) / quantizedRange;
    quantZeropointB[jj] = (int32_t)(quantizedMax - max / quantScaleB[jj]);
  }

  // 2. quantize
  int8_t* quantized = 0;
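As a worked example of the scale/zero-point formulas above (an illustration only, not part of the commit), take one column whose values span [-0.5, 1.0] in min/max mode (--quantize-range 0.0); the sketch below applies the same arithmetic and then mimics the usual affine quantization rule that fbgemm::Quantize uses with precision 7:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    int main() {
      // Constants from the packing routine above: only the 7-bit band [-64, 63] is used.
      constexpr int quantizedRange = 127;
      constexpr int quantizedMax   = 63;

      // Hypothetical column statistics (min/max mode).
      float min = -0.5f, max = 1.0f;

      // Same formulas as quantScaleB[jj] / quantZeropointB[jj] above.
      float   scale     = (max - min) / quantizedRange;          // ~0.0118
      int32_t zeroPoint = (int32_t)(quantizedMax - max / scale); // -21

      // Affine quantization rule: round, shift by the zero point, clamp to [-64, 63].
      auto quantize = [&](float x) {
        int q = (int)std::lround(x / scale) + zeroPoint;
        return std::max(-64, std::min(63, q));
      };

      printf("scale=%.4f zero_point=%d\n", scale, zeroPoint);      // 0.0118, -21
      printf("q(min)=%d q(0)=%d q(max)=%d\n",
             quantize(min), quantize(0.f), quantize(max));         // -63, -21, 63
      return 0;
    }

So the column ends up occupying almost the whole 7-bit band, with real 0.0 mapped to the zero point.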
@@ -410,7 +410,7 @@ void fbgemmPacked8Pack(marian::Tensor out,
    TensorQuantizationParams bQuantParam;
    bQuantParam.scale = quantScaleB[jj];
    bQuantParam.zero_point = quantZeropointB[jj];
    bQuantParam.precision = 7; // Use half of the quantization range to prevent overflow of VPMADDUBSW

    if (transpose)
      fbgemm::Quantize<int8_t>(data + jj * k, quantized + jj * k, k, bQuantParam);
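Why the halved, 7-bit weight range keeps coming up here: VPMADDUBSW multiplies unsigned 8-bit activations by signed 8-bit weights and adds adjacent pairs into a signed 16-bit lane, so full-range operands can reach 2 * 255 * 127 = 64770 and saturate, while capping the weights at 63 keeps the pair sum at 2 * 255 * 63 = 32130, inside int16_t. A compile-time check of that arithmetic (illustration only, not part of the commit):

    #include <cstdint>

    // Pair sums of u8*s8 products must fit in the signed 16-bit VPMADDUBSW lane.
    static_assert(2 * 255 * 63 <= INT16_MAX,  "7-bit weights: pair sum fits in int16");
    static_assert(2 * 255 * 127 > INT16_MAX,  "full 8-bit weights: pair sum can overflow int16");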
@@ -536,7 +536,8 @@ void fbgemmPacked16Gemm(marian::Tensor C,
// k: the number of columns in A and the number of rows in B
// transA: whether A matrix is transposed or not
// transB: whether B matrix is transposed or not
void fbgemmPacked8Gemm(marian::Tensor C,
void fbgemmPacked8Gemm(Type packType,
                       marian::Tensor C,
                       const marian::Tensor A,
                       const marian::Tensor B,
                       const size_t m,
@@ -544,9 +545,6 @@ void fbgemmPacked8Gemm(marian::Tensor C,
                       const size_t k,
                       const int transA,
                       const int transB) {
  // pack type
  marian::Type packType = B->type();

  const fbgemm::BlockingFactors* params = getBlockingFactors(packType);

  // Check if the packed format matches with the available AVX instruction set in the machine
@@ -135,7 +135,8 @@ void fbgemmPacked16Gemm(marian::Tensor C,
// k: the number of columns in A and rows in B
// transA: transpose of A matrix
// transB: transpose of B matrix
void fbgemmPacked8Gemm(marian::Tensor C,
void fbgemmPacked8Gemm(Type packType,
                       marian::Tensor C,
                       const marian::Tensor A,
                       const marian::Tensor B,
                       const size_t m,
@@ -64,6 +64,36 @@ public:
    return cusparseHandle_;
  }

  // for CPU, sets to use optimized code for inference.
  // for GPU, this is invalid; isOptimized() always returns false.
  void setOptimized(bool optimize) override {
    LOG_ONCE(info, "setOptimized() not supported for GPU_{}", optimize);
  }
  bool isOptimized() override {
    LOG_ONCE(info, "isOptimized() not supported for GPU");
    return false;
  };

  // for CPU, selects different GEMM types for inference.
  // for GPU, there is no GEMM type, so it does nothing.
  void setGemmType(std::string gemmType) override {
    LOG_ONCE(info, "setGemmType() not supported for GPU_{}", gemmType);
  }
  GemmType getGemmType() override {
    LOG_ONCE(info, "getGemmType() not supported for GPU");
    return GemmType::Float32;
  }

  // for CPU, sets the quantization range of weight matrices for inference.
  // for GPU, there is no quantization, so it does nothing.
  void setQuantizeRange(float range) override {
    LOG_ONCE(info, "setQuantizeRange() not supported for GPU_{}", range);
  }
  float getQuantizeRange() override {
    LOG_ONCE(info, "getQuantizeRange() not supported for GPU");
    return 0.f;
  }

  CudaCompute getCudaComputeCapability() { return compute_; }

private:
@@ -89,6 +89,11 @@ public:
    auto prec = options_->get<std::vector<std::string>>("precision", {"float32"});
    graph->setDefaultElementType(typeFromString(prec[0]));
    graph->setDevice(device);
    if (device.type == DeviceType::cpu) {
      graph->getBackend()->setOptimized(options_->get<bool>("optimize"));
      graph->getBackend()->setGemmType(options_->get<std::string>("gemm-type"));
      graph->getBackend()->setQuantizeRange(options_->get<float>("quantize-range"));
    }
    graph->reserveWorkspaceMB(options_->get<size_t>("workspace"));
    graphs_[id] = graph;
@@ -282,6 +287,11 @@ public:
    auto precision = options_->get<std::vector<std::string>>("precision", {"float32"});
    graph->setDefaultElementType(typeFromString(precision[0])); // only use first type, used for parameter type in graph
    graph->setDevice(device);
    if (device.type == DeviceType::cpu) {
      graph->getBackend()->setOptimized(options_->get<bool>("optimize"));
      graph->getBackend()->setGemmType(options_->get<std::string>("gemm-type"));
      graph->getBackend()->setQuantizeRange(options_->get<float>("quantize-range"));
    }
    graph->reserveWorkspaceMB(options_->get<size_t>("workspace"));
    graphs_.push_back(graph);
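Taken together: on an FBGEMM-enabled build (USE_FBGEMM), a CPU run that sets --optimize and --gemm-type packed16 or packed8 is routed through the packed kernels above, with --quantize-range controlling the per-column int8 quantization range (0.0 selects min/max); without at least AVX2 support the packed8 path aborts, as the checks in dot() and affine() show.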