Integrate intgemm into marian (#595)

Adds intgemm as a module for Marian. Intgemm is @kpu's 8/16-bit GEMM library, supporting architectures from SSE2 to AVX512VNNI.
Removes the outdated integer code related to the --optimize option.

Co-authored-by: Kenneth Heafield <github@kheafield.com>
Co-authored-by: Kenneth Heafield <kpu@users.noreply.github.com>
Co-authored-by: Ulrich Germann <ugermann@inf.ed.ac.uk>
Co-authored-by: Marcin Junczys-Dowmunt <marcinjd@microsoft.com>
Co-authored-by: Roman Grundkiewicz <rgrundkiewicz@gmail.com>
Nikolay Bogoychev 2021-01-25 00:02:30 +00:00 committed by GitHub
parent 737f43014a
commit 600f5cbdec
39 changed files with 984 additions and 1671 deletions

.gitmodules (vendored): 3 added lines
View File

@ -14,6 +14,9 @@
path = src/3rd_party/fbgemm
url = https://github.com/marian-nmt/FBGEMM
branch = master
[submodule "src/3rd_party/intgemm"]
path = src/3rd_party/intgemm
url = https://github.com/marian-nmt/intgemm/
[submodule "src/3rd_party/simple-websocket-server"]
path = src/3rd_party/simple-websocket-server
url = https://github.com/marian-nmt/Simple-WebSocket-Server

View File

@ -9,6 +9,9 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
## [Unreleased]
### Added
- Added `intgemm8(ssse3|avx|avx512)?`, `intgemm16(sse2|avx|avx512)?` types to marian-conv, which use the intgemm backend. The types intgemm8 and intgemm16 are hardware-agnostic; the others are hardware-specific (see the sketch after this changelog section).
- Shortlist size is now always a multiple of eight.
- Added an architecture-agnostic binary format for intgemm 8/16-bit integer models.
- Add --train-embedder-rank for fine-tuning any encoder(-decoder) model for multi-lingual similarity via softmax-margin loss
- Add --logical-epoch that allows to redefine the displayed epoch counter as a multiple of n data epochs, updates or labels. Also allows to define width of fractional part with second argument.
- Add --metrics chrf for computing ChrF according to https://www.aclweb.org/anthology/W15-3049/ and SacreBLEU reference implementation
@ -56,6 +59,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
- Fix the runtime failures for FASTOPT on 32-bit builds (wasm just happens to be 32-bit) because it uses hashing with an inconsistent mix of uint64_t and size_t.
### Changed
- Removed `--clip-gemm`, which was obsolete and never used anyway.
- Removed the `--optimize` switch; the compute type is now determined from the binary model.
- Updated SentencePiece repository to version 8336bbd0c1cfba02a879afe625bf1ddaf7cd93c5 from https://github.com/google/sentencepiece.
- Enabled compilation of SentencePiece by default since no dependency on protobuf anymore.
- Changed default value of --sentencepiece-max-lines from 10000000 to 2000000 since apparently the new version doesn't sample automatically anymore (Not quite clear how that affects quality of the vocabulary).
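As a brief illustration of how the hardware-agnostic intgemm types listed above get resolved, here is a sketch built from the typeFromString and cpu::integer::getIntgemmType additions in this commit (pickGemmType is an illustrative helper, not part of the codebase):

#include <string>
#include "common/types.h"                // marian::Type, typeFromString
#include "tensors/cpu/integer_common.h"  // marian::cpu::integer::getIntgemmType

// Resolve a requested gemm-type string to the best type supported by the current CPU.
marian::Type pickGemmType(const std::string& requested) {
  marian::Type t = marian::typeFromString(requested);    // e.g. "intgemm8" -> Type::intgemm8
  if(t == marian::Type::intgemm8 || t == marian::Type::intgemm16)
    return marian::cpu::integer::getIntgemmType(t);      // e.g. Type::intgemm8avx512vnni on a VNNI machine
  return t;                                              // hardware-specific types are kept as requested
}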

View File

@ -93,8 +93,8 @@ if(MSVC)
# Or maybe use these?
set(INTRINSICS "/arch:AVX2")
# set(INTRINSICS "/arch:AVX512")
set(CMAKE_CXX_FLAGS "/EHsc /DWIN32 /D_WINDOWS /DUNICODE /D_UNICODE /D_CRT_NONSTDC_NO_WARNINGS /D_CRT_SECURE_NO_WARNINGS ${DISABLE_GLOBALLY}")
# /bigobj is necessary for expression_operators.cpp. See https://stackoverflow.com/questions/15110580/penalty-of-the-msvs-compiler-flag-bigobj
set(CMAKE_CXX_FLAGS "/EHsc /DWIN32 /D_WINDOWS /DUNICODE /D_UNICODE /D_CRT_NONSTDC_NO_WARNINGS /D_CRT_SECURE_NO_WARNINGS /bigobj ${DISABLE_GLOBALLY}")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS} /MT /O2 ${INTRINSICS} /Zi /MP /GL /DNDEBUG")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS} /MTd /Od /Ob0 ${INTRINSICS} /RTC1 /Zi /D_DEBUG")
@ -438,6 +438,7 @@ endif(USE_MPI)
###############################################################################
# Find BLAS library
if(COMPILE_CPU)
set(EXT_LIBS ${EXT_LIBS} intgemm) # Enable intgemm when compiling CPU
if(USE_APPLE_ACCELERATE)
if(NOT APPLE)
message(FATAL_ERROR "FATAL ERROR: Apple Accelerate only works on macOS.")

@ -1 +1 @@
Subproject commit 16914ae94c80f338c678f0461c4e45965149f6aa
Subproject commit 97b2f95abab6134c1632b286e373e513ecc52020

View File

@ -8,6 +8,11 @@ add_subdirectory(./zlib)
add_subdirectory(./faiss)
include_directories(./faiss)
if(COMPILE_CPU)
set(INTGEMM_DONT_BUILD_TESTS ON CACHE BOOL "Disable intgemm tests")
add_subdirectory(./intgemm)
endif(COMPILE_CPU)
if(USE_FBGEMM)
# @TODO: find out if this is somehow harmful. This is suppressing CMake warnings for CMAKE_SUPPRESS_DEVELOPER_WARNINGS
# meant to silence CMakeFiles of 3rd_party tools.

src/3rd_party/intgemm (vendored submodule): 1 added line

@ -0,0 +1 @@
Subproject commit 874ceebbf53a85691b326495100b6361a2166cec

View File

@ -5,6 +5,8 @@ include_directories(3rd_party)
include_directories(3rd_party/SQLiteCpp/include)
include_directories(3rd_party/sentencepiece)
include_directories(3rd_party/fbgemm/include)
include_directories(3rd_party/intgemm)
include_directories(${CMAKE_BINARY_DIR}/src/3rd_party/intgemm) # running cmake on the intgemm submodule triggers config file generation in this directory.
include_directories(${CMAKE_BINARY_DIR}/local/include)
set(MARIAN_SOURCES
@ -41,6 +43,7 @@ set(MARIAN_SOURCES
3rd_party/cnpy/cnpy.cpp
3rd_party/ExceptionWithCallStack.cpp
3rd_party/onnx/protobuf/onnx-ml.pb-wrapper.cpp
3rd_party/phf/phf.cc
@ -52,10 +55,7 @@ set(MARIAN_SOURCES
tensors/cpu/prod.cpp
tensors/cpu/topk.cpp
tensors/cpu/tensor_operators.cpp
tensors/cpu/sharp/int_gemm.cpp
tensors/cpu/sharp/avx_gemm.cpp
tensors/cpu/sharp/sse_gemm.cpp
tensors/cpu/integer_common.cpp
tensors/cpu/fbgemm/packed_gemm.cpp
graph/expression_graph.cpp

View File

@ -1,12 +1,10 @@
#include "marian.h"
#include "common/cli_wrapper.h"
#include "tensors/cpu/expression_graph_packable.h"
#include "onnx/expression_graph_onnx_exporter.h"
#include <sstream>
#include "tensors/cpu/fbgemm/expression_graph_packable.h"
#include "onnx/expression_graph_onnx_exporter.h"
int main(int argc, char** argv) {
using namespace marian;
@ -24,7 +22,9 @@ int main(int argc, char** argv) {
cli->add<std::string>("--from,-f", "Input model", "model.npz");
cli->add<std::string>("--to,-t", "Output model", "model.bin");
cli->add<std::string>("--export-as", "Kind of conversion: marian-bin or onnx-{encode,decoder-step,decoder-init,decoder-stop}", "marian-bin");
cli->add<std::string>("--gemm-type,-g", "GEMM Type to be used: float32, packed16, packed8avx2, packed8avx512", "float32");
cli->add<std::string>("--gemm-type,-g", "GEMM Type to be used: float32, packed16, packed8avx2, packed8avx512, "
"intgemm8, intgemm8ssse3, intgemm8avx2, intgemm8avx512, intgemm16, intgemm16sse2, intgemm16avx2, intgemm16avx512",
"float32");
cli->add<std::vector<std::string>>("--vocabs,-V", "Vocabulary file, required for ONNX export");
cli->parse(argc, argv);
options->merge(config);
@ -35,19 +35,8 @@ int main(int argc, char** argv) {
auto exportAs = options->get<std::string>("export-as");
auto vocabPaths = options->get<std::vector<std::string>>("vocabs");// , std::vector<std::string>());
auto saveGemmTypeStr = options->get<std::string>("gemm-type", "float32");
Type saveGemmType;
if(saveGemmTypeStr == "float32") {
saveGemmType = Type::float32;
} else if(saveGemmTypeStr == "packed16") { // packed16 only supports AVX2. AVX512 might be added later
saveGemmType = Type::packed16;
} else if(saveGemmTypeStr == "packed8avx2") { // packed8 for AVX2
saveGemmType = Type::packed8avx2;
} else if(saveGemmTypeStr == "packed8avx512") { // packed8 for AVX512
saveGemmType = Type::packed8avx512;
} else {
ABORT("Unknown gemm-type: {}", saveGemmTypeStr);
}
// We accept any type here and will later croak during packAndSave if the type cannot be used for conversion
Type saveGemmType = typeFromString(options->get<std::string>("gemm-type", "float32"));
LOG(info, "Outputting {}, precision: {}", modelTo, saveGemmType);
@ -58,12 +47,11 @@ int main(int argc, char** argv) {
auto load = [&](Ptr<ExpressionGraph> graph) {
graph->setDevice(CPU0);
graph->getBackend()->setOptimized(false);
graph->load(modelFrom);
graph->forward(); // run the initializers
};
if (exportAs == "marian-bin") {
auto graph = New<ExpressionGraphPackable>();
load(graph);

View File

@ -3,6 +3,7 @@
#include "common/file_stream.h"
#include "common/io_item.h"
#include "common/types.h"
#include "tensors/cpu/integer_common.h"
#include <string>
@ -57,13 +58,31 @@ void loadItems(const void* current, std::vector<io::Item>& items, bool mapped) {
get<char>(current, offset);
for(int i = 0; i < numHeaders; ++i) {
// For intgemm, AVX512 and AVX512VNNI have the same arrangement, but the VNNI algorithm is faster.
// Change the type to the fastest one supported.
if (items[i].type == Type::intgemm8avx512) {
items[i].type = cpu::integer::getIntgemmType(Type::intgemm8);
}
if(items[i].mapped) { // memory-mapped, hence only set pointer
// @TODO: verify this actually works for the hardware-specific ones like intgemm8avx2
ABORT_IF(items[i].type == Type::intgemm8 || items[i].type == Type::intgemm16, "mmap format not supported for hardware non-specific intgemm matrices");
items[i].ptr = get<char>(current, headers[i].dataLength);
} else { // reading into item data
size_t len = headers[i].dataLength;
items[i].bytes.resize(len);
const char* ptr = get<char>(current, len);
std::copy(ptr, ptr + len, items[i].bytes.begin());
// Intgemm8/16 matrices in the binary model are only quantized; they still need to be reordered.
// The reordering depends on the architecture (SSE/AVX2/AVX512), so we read in the quantized matrices
// and then reorder them before adding them as parameters to the graph.
if (matchType<intgemm8>(items[i].type)) {
items[i].type = cpu::integer::getIntgemmType(Type::intgemm8);
cpu::integer::prepareAndTransposeB<Type::intgemm8>(items[i], ptr);
} else if (matchType<intgemm16>(items[i].type)) {
items[i].type = cpu::integer::getIntgemmType(Type::intgemm16);
cpu::integer::prepareAndTransposeB<Type::intgemm16>(items[i], ptr);
} else {
std::copy(ptr, ptr + len, items[i].bytes.begin());
}
}
}
}

View File

@ -134,8 +134,6 @@ void ConfigParser::addOptionsGeneral(cli::CLIWrapper& cli) {
"Suppress logging for translation");
cli.add<size_t>("--seed",
"Seed for all random number generators. 0 means initialize randomly");
cli.add<float>("--clip-gemm",
"If not 0 clip GEMM input values to +/- arg");
cli.add<bool>("--interpolate-env-vars",
"allow the use of environment variables in paths, of the form ${VAR_NAME}");
cli.add<bool>("--relative-paths",
@ -671,15 +669,13 @@ void ConfigParser::addOptionsTranslation(cli::CLIWrapper& cli) {
addSuboptionsDevices(cli);
addSuboptionsBatching(cli);
cli.add<bool>("--optimize",
"Optimize speed aggressively sacrificing memory or precision");
cli.add<bool>("--skip-cost",
"Ignore model cost during translation, not recommended for beam-size > 1");
cli.add<bool>("--fp16",
"Shortcut for mixed precision inference with float16, corresponds to: --precision float16");
cli.add<std::vector<std::string>>("--precision",
"Mixed precision for inference, set parameter type in expression graph",
{"float32"});
cli.add<bool>("--skip-cost",
"Ignore model cost during translation, not recommended for beam-size > 1");
cli.add<std::vector<std::string>>("--shortlist",
"Use softmax shortlist: path first best prune");
@ -737,8 +733,6 @@ void ConfigParser::addOptionsScoring(cli::CLIWrapper& cli) {
addSuboptionsDevices(cli);
addSuboptionsBatching(cli);
cli.add<bool>("--optimize",
"Optimize speed aggressively sacrificing memory or precision");
cli.add<bool>("--fp16",
"Shortcut for mixed precision inference with float16, corresponds to: --precision float16");
cli.add<std::vector<std::string>>("--precision",
@ -776,12 +770,10 @@ void ConfigParser::addOptionsEmbedding(cli::CLIWrapper& cli) {
addSuboptionsDevices(cli);
addSuboptionsBatching(cli);
cli.add<bool>("--optimize",
"Optimize speed aggressively sacrificing memory or precision");
cli.add<bool>("--fp16",
"Shortcut for mixed precision inference with float16, corresponds to: --precision float16");
cli.add<std::vector<std::string>>("--precision",
"Mixed precision for inference, set parameter type in expression graph",
"Mixed precision for inference, set parameter type in expression graph. Supported values: float32, float16",
{"float32"});
cli.switchGroup(previous_group);
@ -934,7 +926,6 @@ void ConfigParser::addSuboptionsQuantization(cli::CLIWrapper& cli) {
// clang-format on
}
cli::mode ConfigParser::getMode() const { return mode_; }
Ptr<Options> ConfigParser::parseOptions(int argc, char** argv, bool doValidate) {

View File

@ -26,13 +26,16 @@ size_t requiredBytes(const Shape& shape, Type type) {
ABORT("Not a supported data type: {}", type);
return 0;
}
}
#endif // USE_FBGEMM
if (isIntgemm(type)) {
/* Intgemm tensors have an extra float at the back that stores the quantization multiplier */
return shape.elements() * sizeOf(type) + sizeOf(Type::float32);
} else {
return shape.elements() * sizeOf(type);
}
#else
return shape.elements() * sizeOf(type);
#endif // USE_FBGEMM
}
}
} // namespace marian
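A quick worked check of the rule above (illustrative numbers; an intgemm element is 1 or 2 bytes and the trailing quantization multiplier is a single float):

// requiredBytes(Shape({256, 512}), Type::intgemm8)  == 256*512*1 + 4 == 131076 bytes
// requiredBytes(Shape({256, 512}), Type::intgemm16) == 256*512*2 + 4 == 262148 bytes
// requiredBytes(Shape({256, 512}), Type::float32)   == 256*512*4     == 524288 bytes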

View File

@ -135,20 +135,26 @@ do { \
namespace marian {
// small struct to enable templating based on types used for packing
struct packed16 {
uint16_t x;
};
struct packed16 { uint16_t x; };
// small struct to enable templating based on types used for packing. This is a memory holder.
// There is no difference between packed8avx2 and packed8avx512, but they are defined separately so they can be distinguished.
struct packed8avx2 {
uint8_t x;
};
struct packed8avx2 { uint8_t x; };
struct packed8avx512 { uint8_t x; };
// similar to packed16, but used for 16-bit intgemm model packing.
struct intgemm16 { int16_t x; };
struct intgemm16sse2 { int16_t x; };
struct intgemm16avx2 { int16_t x; };
struct intgemm16avx512 { int16_t x; };
// similar to packed8* but for intgemm 8bit model packing.
struct intgemm8 { int8_t x; };
struct intgemm8ssse3 { int8_t x; };
struct intgemm8avx2 { int8_t x; };
struct intgemm8avx512 { int8_t x; };
struct intgemm8avx512vnni { int8_t x; };
// small struct to enable templating based on types used for packing. This is a memory holder.
struct packed8avx512 {
uint8_t x;
};
#ifndef __CUDACC__ // vectorized types not available from .cu files
@ -214,17 +220,22 @@ struct float32x8 {
#endif
// Internal to types.h, don't use. Use test functions below.
enum class TypeClass : size_t {
signed_type = 0x0100,
unsigned_type = 0x0200,
float_type = 0x0400,
enum class TypeClass : size_t { // the underlying size_t has 8 bytes, so we have 16 hex-digit fields here, currently using 5. Extend to the left for back-compat.
// built-in type classes
signed_type = 0x00100,
unsigned_type = 0x00200,
float_type = 0x00400,
packed_type = 0x0800, // special packed (CPU cache friendly) type class, used in FBGEMM, not meant to be used anywhere else
avx2_type = 0x1000, // processor-specific layout for avx2, currently used for FBGEMM only
avx512_type = 0x2000, // processor-specific layout for avx512, currently used for FBGEMM only
avx2_type = 0x01000, // processor-specific layout for avx2, currently used for FBGEMM only (keep 0x1000 for back-compat)
avx512_type = 0x02000, // processor-specific layout for avx512, currently used for FBGEMM only (keep 0x2000 for back-compat)
sse2_type = 0x04000, // processor-specific layout for sse2, currently used for Intgemm only
ssse3_type = 0x08000, // processor-specific layout for ssse3, currently used for Intgemm only
size_mask = 0x00FF,
class_mask = 0xFF00
packed_type = 0x00800, // special packed (CPU cache friendly) type class, used in FBGEMM. Annoyingly we need to keep 0x800 for back-compat, would be nicer to align with intgemm
intgemm_type = 0x10000, // intgemm quantized architecture agnostic models
size_mask = 0x000FF, // maximum allowed size is 256 bytes right now; if more are required, extend the size field
class_mask = 0xFFF00, // three fields for different type classes, if more classes are added we need to increase the number of fields here
};
constexpr inline size_t operator+(TypeClass typeClass, size_t val) {
@ -251,10 +262,21 @@ enum class Type : size_t {
float32 = TypeClass::float_type + 4u,
float64 = TypeClass::float_type + 8u,
packed16 = TypeClass::packed_type + 2u, // special type for FBGEMM, not meant to be used anywhere else, not meant to be accessed individually. Internal actual type (uint16) is meaningless.
packed8avx2 = TypeClass::packed_type + 1u + TypeClass::avx2_type, // special type for FBGEMM with AVX2, not meant to be used anywhere else, not meant to be accessed individually. Internal actual type (uint8) is meaningless.
packed8avx512 = TypeClass::packed_type + 1u + TypeClass::avx512_type, // special type for FBGEMM with AVX512, not meant to be used anywhere else, not meant to be accessed individually. Internal actual type (uint8) is meaningless.
packed16 = TypeClass::packed_type + 2u, // special type for FBGEMM, not meant to be used anywhere else, not meant to be accessed individually. Internal actual type (uint16) is meaningless.
packed8avx2 = TypeClass::packed_type + 1u + TypeClass::avx2_type, // special type for FBGEMM with AVX2, not meant to be used anywhere else, not meant to be accessed individually. Internal actual type (uint8) is meaningless.
packed8avx512 = TypeClass::packed_type + 1u + TypeClass::avx512_type, // special type for FBGEMM with AVX512, not meant to be used anywhere else, not meant to be accessed individually. Internal actual type (uint8) is meaningless.
intgemm8 = TypeClass::intgemm_type + 1u, // Int8 quantized (not packed) matrices for intgemm
intgemm16 = TypeClass::intgemm_type + 2u, // Int16 quantized (not packed) matrices for intgemm
intgemm8ssse3 = TypeClass::intgemm_type + 1u + TypeClass::ssse3_type, // Int8 quantized and packed (ssse3) matrices for intgemm
intgemm8avx2 = TypeClass::intgemm_type + 1u + TypeClass::avx2_type, // Int8 quantized and packed (avx2) matrices for intgemm
intgemm8avx512 = TypeClass::intgemm_type + 1u + TypeClass::avx512_type, // Int8 quantized and packed (avx512) matrices for intgemm
intgemm8avx512vnni = TypeClass::intgemm_type + 1u + TypeClass::avx512_type + 4096u, // Int8 quantized and packed (avx512) matrices for intgemm. VNNI algorithm
intgemm16sse2 = TypeClass::intgemm_type + 2u + TypeClass::sse2_type, // Int16 quantized and packed (sse2) matrices for intgemm
intgemm16avx2 = TypeClass::intgemm_type + 2u + TypeClass::avx2_type, // Int16 quantized and packed (avx2) matrices for intgemm
intgemm16avx512 = TypeClass::intgemm_type + 2u + TypeClass::avx512_type, // Int16 quantized and packed (avx512) matrices for intgemm
};
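// Example: with the field layout above,
//   intgemm8avx512 = intgemm_type (0x10000) + avx512_type (0x02000) + 1u = 0x12001
//   intgemm16avx2  = intgemm_type (0x10000) + avx2_type   (0x01000) + 2u = 0x11002
// The low byte (size_mask) still gives the element size in bytes, while the upper fields
// (class_mask) identify the type class and the instruction set the matrix was packed for.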
static inline size_t operator&(TypeClass typeClass, Type type) {
@ -289,6 +311,14 @@ static inline bool isPacked(Type type) {
return (TypeClass::packed_type & type) != 0;
}
static inline bool isSse2(Type type) {
return (TypeClass::sse2_type & type) != 0;
}
static inline bool isSsse3(Type type) {
return (TypeClass::ssse3_type & type) != 0;
}
static inline bool isAvx2(Type type) {
return (TypeClass::avx2_type & type) != 0;
}
@ -297,6 +327,10 @@ static inline bool isAvx512(Type type) {
return (TypeClass::avx512_type & type) != 0;
}
static inline bool isIntgemm(Type type) {
return (TypeClass::intgemm_type & type) != 0;
}
size_t requiredBytes(const Shape& shape, Type type); // towards Frank's vision of joint Shape/Type
template <typename T>
@ -314,13 +348,24 @@ template <> inline bool matchType<uint16_t>(Type type) { return type == Type::ui
template <> inline bool matchType<uint32_t>(Type type) { return type == Type::uint32; }
template <> inline bool matchType<uint64_t>(Type type) { return type == Type::uint64; }
template <> inline bool matchType<float16>(Type type) { return type == Type::float16; }
template <> inline bool matchType<float>(Type type) { return type == Type::float32; }
template <> inline bool matchType<double>(Type type) { return type == Type::float64; }
template <> inline bool matchType<float16>(Type type) { return type == Type::float16; }
template <> inline bool matchType<float>(Type type) { return type == Type::float32; }
template <> inline bool matchType<double>(Type type) { return type == Type::float64; }
template <> inline bool matchType<packed16>(Type type) { return type == Type::packed16; }
template <> inline bool matchType<packed8avx2>(Type type) { return type == Type::packed8avx2; }
template <> inline bool matchType<packed8avx512>(Type type) { return type == Type::packed8avx512; }
template <> inline bool matchType<packed16>(Type type) { return type == Type::packed16; }
template <> inline bool matchType<packed8avx2>(Type type) { return type == Type::packed8avx2; }
template <> inline bool matchType<packed8avx512>(Type type) { return type == Type::packed8avx512; }
template <> inline bool matchType<intgemm8>(Type type) { return type == Type::intgemm8; }
template <> inline bool matchType<intgemm8ssse3>(Type type) { return type == Type::intgemm8ssse3; }
template <> inline bool matchType<intgemm8avx2>(Type type) { return type == Type::intgemm8avx2; }
template <> inline bool matchType<intgemm8avx512>(Type type) { return type == Type::intgemm8avx512; }
template <> inline bool matchType<intgemm8avx512vnni>(Type type) { return type == Type::intgemm8avx512vnni; }
template <> inline bool matchType<intgemm16>(Type type) { return type == Type::intgemm16; }
template <> inline bool matchType<intgemm16sse2>(Type type) { return type == Type::intgemm16sse2; }
template <> inline bool matchType<intgemm16avx2>(Type type) { return type == Type::intgemm16avx2; }
template <> inline bool matchType<intgemm16avx512>(Type type) { return type == Type::intgemm16avx512; }
// clang-format on
static inline std::ostream& operator<<(std::ostream& out, Type type) {
@ -342,6 +387,16 @@ static inline std::ostream& operator<<(std::ostream& out, Type type) {
case Type::packed16 : out << "packed16"; break;
case Type::packed8avx2 : out << "packed8avx2"; break;
case Type::packed8avx512 : out << "packed8avx512"; break;
case Type::intgemm8 : out << "intgemm8"; break;
case Type::intgemm8ssse3 : out << "intgemm8ssse3"; break;
case Type::intgemm8avx2 : out << "intgemm8avx2"; break;
case Type::intgemm8avx512 : out << "intgemm8avx512"; break;
case Type::intgemm8avx512vnni : out << "intgemm8avx512vnni"; break;
case Type::intgemm16 : out << "intgemm16"; break;
case Type::intgemm16sse2 : out << "intgemm16sse2"; break;
case Type::intgemm16avx2 : out << "intgemm16avx2"; break;
case Type::intgemm16avx512 : out << "intgemm16avx512"; break;
}
return out;
}
@ -350,12 +405,12 @@ template <typename T>
inline std::string request();
// clang-format off
template <> inline std::string request<int8_t>() { return "int8"; }
template <> inline std::string request<int8_t>() { return "int8"; }
template <> inline std::string request<int16_t>() { return "int16"; }
template <> inline std::string request<int32_t>() { return "int32"; }
template <> inline std::string request<int64_t>() { return "int64"; }
template <> inline std::string request<uint8_t>() { return "uint8"; }
template <> inline std::string request<uint8_t>() { return "uint8"; }
template <> inline std::string request<uint16_t>() { return "uint16"; }
template <> inline std::string request<uint32_t>() { return "uint32"; }
template <> inline std::string request<uint64_t>() { return "uint64"; }
@ -364,9 +419,19 @@ template <> inline std::string request<float16>() { return "float16"; }
template <> inline std::string request<float>() { return "float32"; }
template <> inline std::string request<double>() { return "float64"; }
template <> inline std::string request<packed16>() { return "packed16"; }
template <> inline std::string request<packed8avx2>() { return "packed8avx2"; }
template <> inline std::string request<packed8avx512>() { return "packed8avx512"; }
template <> inline std::string request<packed16>() { return "packed16"; }
template <> inline std::string request<packed8avx2>() { return "packed8avx2"; }
template <> inline std::string request<packed8avx512>() { return "packed8avx512"; }
template <> inline std::string request<intgemm8>() { return "intgemm8"; }
template <> inline std::string request<intgemm8ssse3>() { return "intgemm8ssse3"; }
template <> inline std::string request<intgemm8avx2>() { return "intgemm8avx2"; }
template <> inline std::string request<intgemm8avx512>() { return "intgemm8avx512"; }
template <> inline std::string request<intgemm8avx512vnni>() { return "intgemm8avx512vnni"; }
template <> inline std::string request<intgemm16>() { return "intgemm16"; }
template <> inline std::string request<intgemm16sse2>() { return "intgemm16sse2"; }
template <> inline std::string request<intgemm16avx2>() { return "intgemm16avx2"; }
template <> inline std::string request<intgemm16avx512>() { return "intgemm16avx512"; }
// clang-format on
static Type inline typeFromString(const std::string& str) {
@ -402,18 +467,38 @@ static Type inline typeFromString(const std::string& str) {
if(str == "packed8avx512")
return Type::packed8avx512;
if(str == "intgemm8")
return Type::intgemm8;
if(str == "intgemm8ssse3")
return Type::intgemm8ssse3;
if(str == "intgemm8avx2")
return Type::intgemm8avx2;
if(str == "intgemm8avx512")
return Type::intgemm8avx512;
if(str == "intgemm8avx512vnni")
return Type::intgemm8avx512vnni;
if(str == "intgemm16")
return Type::intgemm16;
if(str == "intgemm16sse2")
return Type::intgemm16sse2;
if(str == "intgemm16avx2")
return Type::intgemm16avx2;
if(str == "intgemm16avx512")
return Type::intgemm16avx512;
ABORT("Unknown type {}", str);
}
template <typename T>
inline Type typeId();
template <> inline Type typeId<int8_t>() { return Type::int8; }
template <> inline Type typeId<int8_t>() { return Type::int8; }
template <> inline Type typeId<int16_t>() { return Type::int16; }
template <> inline Type typeId<int32_t>() { return Type::int32; }
template <> inline Type typeId<int64_t>() { return Type::int64; }
template <> inline Type typeId<uint8_t>() { return Type::uint8; }
template <> inline Type typeId<uint8_t>() { return Type::uint8; }
template <> inline Type typeId<uint16_t>() { return Type::uint16; }
template <> inline Type typeId<uint32_t>() { return Type::uint32; }
template <> inline Type typeId<uint64_t>() { return Type::uint64; }
@ -422,10 +507,21 @@ template <> inline Type typeId<float16>() { return Type::float16; }
template <> inline Type typeId<float>() { return Type::float32; }
template <> inline Type typeId<double>() { return Type::float64; }
template <> inline Type typeId<packed16>() { return Type::packed16; }
template <> inline Type typeId<packed8avx2>() { return Type::packed8avx2; }
template <> inline Type typeId<packed16>() { return Type::packed16; }
template <> inline Type typeId<packed8avx2>() { return Type::packed8avx2; }
template <> inline Type typeId<packed8avx512>() { return Type::packed8avx512; }
template <> inline Type typeId<intgemm8>() { return Type::intgemm8; }
template <> inline Type typeId<intgemm8ssse3>() { return Type::intgemm8ssse3; }
template <> inline Type typeId<intgemm8avx2>() { return Type::intgemm8avx2; }
template <> inline Type typeId<intgemm8avx512>() { return Type::intgemm8avx512; }
template <> inline Type typeId<intgemm8avx512vnni>() { return Type::intgemm8avx512vnni; }
template <> inline Type typeId<intgemm16>() { return Type::intgemm16; }
template <> inline Type typeId<intgemm16sse2>() { return Type::intgemm16sse2; }
template <> inline Type typeId<intgemm16avx2>() { return Type::intgemm16avx2; }
template <> inline Type typeId<intgemm16avx512>() { return Type::intgemm16avx512; }
// Abort if given C++ does not correspond to runtime type
template <typename T>
void matchOrAbort(Type type) {

View File

@ -260,6 +260,14 @@ public:
for(auto& it : data_[i])
indexSet.insert(it.first);
}
// Ensure that the vocabulary items generated from the shortlist are a multiple of eight in number.
// This is necessary until intgemm supports matrices whose dimensions are not multiples of eight.
// TODO better solution here? This could potentially be slow.
WordIndex i = static_cast<WordIndex>(firstNum_);
while (indexSet.size() % 8 != 0) {
indexSet.insert(i);
i++;
}
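// Example: if 1237 indices were selected, the smallest not-yet-selected indices >= firstNum_
// are inserted one at a time until the set holds 1240 entries (the next multiple of eight).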
// turn into vector and sort (selected indices)
std::vector<WordIndex> indices(indexSet.begin(), indexSet.end());

View File

@ -84,11 +84,6 @@ public:
auto precison = options_->get<std::vector<std::string>>("precision", {"float32"});
graph->setDefaultElementType(typeFromString(precison[0])); // only use first type, used for parameter type in graph
graph->setDevice(device);
graph->getBackend()->setClip(options_->get<float>("clip-gemm"));
if (device.type == DeviceType::cpu) {
graph->getBackend()->setOptimized(options_->get<bool>("optimize"));
}
graph->reserveWorkspaceMB(options_->get<size_t>("workspace"));
graphs_.push_back(graph);
}

View File

@ -7,7 +7,7 @@
#include "graph/node_operators_tuple.h"
#include "graph/auto_tuner.h"
#include "tensors/cpu/int16.h"
#include "tensors/cpu/intgemm_interface.h"
#include "tensors/cpu/fbgemm/expanded_gemm.h"
#if USE_FBGEMM
@ -466,7 +466,6 @@ Expr weighted_average(Expr in, Expr weights, int ax) {
Expr dot(Expr a, Expr b, bool transA, bool transB, float scale) {
auto device = a->graph()->getDeviceId().type;
float clipValue = a->graph()->getBackend()->getClip();
// added support for packed GEMM API (fp16, int8)
Type aElementType = a->value_type();
Type bElementType = b->value_type();
@ -475,18 +474,9 @@ Expr dot(Expr a, Expr b, bool transA, bool transB, float scale) {
// --optimize --cpu-thread=N with N > 0 are set.
if(device == DeviceType::cpu) {
if(isFloat(aElementType) && isFloat(bElementType)) {
if(a->graph()->getBackend()->isOptimized()) {
// dotInt16 computes A * B.T, hence the transpose for B to get A * B
// if transA = false and transB = false.
return cpu::int16::dot(
cpu::int16::quantize(transA ? transpose(a) : a, clipValue),
cpu::int16::quantize(transB ? b : transpose(b), clipValue),
scale);
} else {
return Expression<DotNodeOp>(
clip(a, clipValue), clip(b, clipValue), transA, transB, scale);
}
return Expression<DotNodeOp>(a, b, transA, transB, scale);
} else if(isFloat(aElementType) && isIntgemm(bElementType)) {
return cpu::integer::affineOrDot(a, b, nullptr, transA, transB, scale);
} else if(isFloat(aElementType) && isPacked(bElementType)) {
#if USE_FBGEMM
// 07/10/2019 - Use packed GEMM only if the cpu architecture supports AVX2
@ -496,7 +486,7 @@ Expr dot(Expr a, Expr b, bool transA, bool transB, float scale) {
// and this cpu lookup is executed only once and the state is kept in FBGEMM.
if(fbgemm::fbgemmHasAvx2Support()) {
// This variant of dot product can handle matrix multiplications with packed8 and packed16 weight matrix (B).
return cpu::variant::dot(clip(a, clipValue),
return cpu::variant::dot(a,
b,
b->shape(),
transA,
@ -512,8 +502,7 @@ Expr dot(Expr a, Expr b, bool transA, bool transB, float scale) {
ABORT("Combination of types A: {} B: {} not supported", aElementType, bElementType);
}
} else {
return Expression<DotNodeOp>(
clip(a, clipValue), clip(b, clipValue), transA, transB, scale);
return Expression<DotNodeOp>(a, b, transA, transB, scale);
}
}
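// In short (a sketch of the dispatch above): on CPU, a float A with an intgemm-typed B goes through
// cpu::integer::affineOrDot; a float A with an FBGEMM-packed B goes through cpu::variant::dot when AVX2
// is available; everything else falls back to the generic DotNodeOp, and clipping is gone entirely.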
@ -524,16 +513,9 @@ Expr bdot(Expr a, Expr b, bool transA, bool transB, float scale) {
static Expr affineDefault(Expr a, Expr b, Expr bias, bool transA, bool transB, float scale) {
// general version, MKL, CBlas or CUDA
// if clipValue > 0, the inputs will be clipped to range [-clipValue,
// clipValue] This is meant to keep values at the same range as used during
// training when optimizing for 8-bit integer products. Likely to be removed
// in the future when we explore better ways to handle this.
float clipValue = a->graph()->getBackend()->getClip();
int rows = a->shape().elements() / a->shape()[-1];
Expr ones = a->graph()->ones({ rows, 1 });
std::vector<Expr> nodes
= { clip(a, clipValue), clip(b, clipValue), bias, ones };
std::vector<Expr> nodes = { a, b, bias, ones };
return Expression<AffineNodeOp>(nodes, transA, transB, scale);
}
@ -545,22 +527,14 @@ static Expr affineDefault(Expr a, Expr b, Expr bias, bool transA, bool transB, f
Expr affine(Expr a, Expr b, Expr bias, bool transA, bool transB, float scale) {
auto device = a->graph()->getDeviceId().type;
float clipValue = a->graph()->getBackend()->getClip();
Type aElementType = a->value_type();
Type bElementType = b->value_type();
if(device == DeviceType::cpu) {
if(isFloat(aElementType) && isFloat(bElementType)) {
if(a->graph()->getBackend()->isOptimized()) {
// cpu int16 version
return cpu::int16::affine(
cpu::int16::quantize(transA ? transpose(a) : a, clipValue),
cpu::int16::quantize(transB ? b : transpose(b), clipValue),
bias,
scale);
} else {
return affineDefault(a, b, bias, transA, transB, scale);
}
return affineDefault(a, b, bias, transA, transB, scale);
} else if(isFloat(aElementType) && isIntgemm(bElementType)) {
return cpu::integer::affineOrDot(a, b, bias, transA, transB, scale);
} else if(isFloat(aElementType) && isPacked(bElementType)) {
#if USE_FBGEMM
// 07/10/2019 - Use packed GEMM only if the cpu architecture supports AVX2
@ -570,7 +544,7 @@ Expr affine(Expr a, Expr b, Expr bias, bool transA, bool transB, float scale) {
// and this cpu lookup is executed only once and the state is kept in FBGEMM.
if(fbgemm::fbgemmHasAvx2Support()) {
// This variant of affine product can handle matrix multiplications with packed8 and packed16 weight matrix (B).
return cpu::variant::affine(clip(a, clipValue),
return cpu::variant::affine(a,
b,
b->shape(),
bias,

View File

@ -10,7 +10,7 @@
#include "translator/scorers.h"
#include "data/alignment.h"
#include "data/vocab_base.h"
#include "tensors/cpu/fbgemm/expression_graph_packable.h"
#include "tensors/cpu/expression_graph_packable.h"
#if USE_FBGEMM
#include "fbgemm/Utils.h"
@ -256,7 +256,6 @@ bool convertModel(std::string inputFile, std::string outputFile, int32_t targetP
auto graph = New<ExpressionGraphPackable>();
graph->setDevice(CPU0);
graph->getBackend()->setOptimized(false);
graph->load(inputFile);
graph->forward();

View File

@ -73,11 +73,6 @@ public:
auto precison = options_->get<std::vector<std::string>>("precision", {"float32"});
graph->setDefaultElementType(typeFromString(precison[0])); // only use first type, used for parameter type in graph
graph->setDevice(device);
graph->getBackend()->setClip(options_->get<float>("clip-gemm"));
if (device.type == DeviceType::cpu) {
graph->getBackend()->setOptimized(options_->get<bool>("optimize"));
}
graph->reserveWorkspaceMB(options_->get<size_t>("workspace"));
graphs_.push_back(graph);
}

View File

@ -10,10 +10,7 @@ protected:
DeviceId deviceId_;
size_t seed_;
Ptr<RandomGenerator> randomGenerator_;
// global clipping value for matrix-multiplies, should soon be removed.
float clipValue_{0.f};
public:
Backend(DeviceId deviceId, size_t seed)
: deviceId_(deviceId), seed_(seed), randomGenerator_(createRandomGenerator(seed, deviceId)) {}
@ -24,14 +21,6 @@ public:
// for GPU only, calls cudaSetDevice, does nothing on CPU. Maybe change name.
virtual void setDevice() = 0;
virtual void synchronize() = 0;
virtual void setClip(float clipValue) { clipValue_ = clipValue; }
float getClip() { return clipValue_; }
// for CPU, sets to use optimized code for inference.
// for GPU, this is invalid. for gpu, isOptimized() function always returns false.
virtual void setOptimized(bool optimize) = 0;
virtual bool isOptimized() = 0;
};
Ptr<Backend> BackendByDeviceId(DeviceId deviceId, size_t seed);

src/tensors/cpu/aligned.h (new file): 43 added lines
View File

@ -0,0 +1,43 @@
#pragma once
#include "common/definitions.h"
#include <stdlib.h>
#ifdef _WIN32
#include <malloc.h>
#endif
namespace marian {
namespace cpu {
namespace {
// allocate function for tensor reserve() below.
// Alignment is needed because we use AVX512 and AVX2 vectors. We should fail if we can't allocate aligned memory.
#ifdef _WIN32
void *genericMalloc(size_t alignment, size_t size) {
void *ret = _aligned_malloc(size, alignment);
ABORT_IF(!ret, "Failed to allocate memory on CPU");
return ret;
}
void genericFree(void *ptr) {
_aligned_free(ptr);
}
#else
// Linux and OS X. There is no fallback to malloc because we need it to be aligned.
void *genericMalloc(size_t alignment, size_t size) {
// On macOS, aligned_alloc is only available with C++17.
// Furthermore, it requires that the requested size is an exact multiple of the alignment, otherwise it fails.
// posix_memalign is available on both macOS (since 2016) and Linux, and in both gcc and clang.
void *result;
// Error could be detected by return value or just remaining nullptr.
ABORT_IF(posix_memalign(&result, alignment, size), "Failed to allocate memory on CPU");
return result;
}
void genericFree(void *ptr) {
free(ptr);
}
#endif
}
} // namespace cpu
} // namespace marian
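A minimal usage sketch of the helpers above (illustrative only; the example function is not part of the codebase):

#include "tensors/cpu/aligned.h"

void alignedBufferExample() {
  // 64-byte alignment covers both AVX2 (32-byte) and AVX-512 (64-byte) vector loads.
  void* buffer = marian::cpu::genericMalloc(/*alignment=*/64, /*size=*/4096);
  // ... fill and use the buffer ...
  marian::cpu::genericFree(buffer);
}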

View File

@ -10,17 +10,11 @@ namespace marian {
namespace cpu {
class Backend : public marian::Backend {
protected:
bool optimized_{false};
public:
Backend(DeviceId deviceId, size_t seed) : marian::Backend(deviceId, seed) {}
void setDevice() override {}
void synchronize() override {}
// for CPU & inference only, sets to use optimized code for inference. Does nothing for GPU.
void setOptimized(bool optimize) override { optimized_ = optimize; }
bool isOptimized() override { return optimized_; }
};
} // namespace cpu
} // namespace marian

View File

@ -1,44 +1,8 @@
#include "tensors/device.h"
#include "tensors/cpu/aligned.h"
#include <iostream>
#ifdef _WIN32
#include <malloc.h>
#endif
#include <stdlib.h>
namespace marian {
namespace cpu {
namespace {
// allocate function for tensor reserve() below.
// Alignment is needed because we use AVX512 and AVX2 vectors. We should fail if we can't allocate aligned memory.
#ifdef _WIN32
void *genericMalloc(size_t alignment, size_t size) {
void *ret = _aligned_malloc(size, alignment);
ABORT_IF(!ret, "Failed to allocate memory on CPU");
return ret;
}
void genericFree(void *ptr) {
_aligned_free(ptr);
}
#else
// Linux and OS X. There is no fallback to malloc because we need it to be aligned.
void *genericMalloc(size_t alignment, size_t size) {
// On macos, aligned_alloc is available only on c++17
// Furthermore, it requires that the memory requested is an exact multiple of the alignment, otherwise it fails.
// posix_memalign is available both Mac (Since 2016) and Linux and in both gcc and clang
void *result;
// Error could be detected by return value or just remaining nullptr.
ABORT_IF(posix_memalign(&result, alignment, size), "Failed to allocate memory on CPU");
return result;
}
void genericFree(void *ptr) {
free(ptr);
}
#endif
} // namespace
Device::~Device() {
genericFree(data_);

View File

@ -0,0 +1,266 @@
#pragma once
#include "graph/expression_graph.h"
#include "fbgemm/packed_gemm.h"
#include "tensors/cpu/integer_common.h"
namespace marian {
namespace cpu {
void Transpose10(marian::Tensor out, const marian::Tensor in);
}
}
namespace marian {
// When FBGEMM-based packed GEMM is used, some weight matrices need to be packed offline.
// The decision about which weights can be packed should really be made by walking through the graph.
// That requires more changes, so for now we do it just by the name ("_W") of the weights.
// This also means the low-level packed_gemm.h APIs interact with the high-level graph class,
// so we put this immature code into a subclass of ExpressionGraph.
// We will improve this in the near future.
class ExpressionGraphPackable : public ExpressionGraph {
public:
ExpressionGraphPackable()
: ExpressionGraph( /* inference = */ true) {} // Packable expression graph only supports inference
virtual ~ExpressionGraphPackable() {}
// Convert model weights into packed format and save to IO items.
// @TODO: review this
void packAndSave(const std::string& name, const std::string& meta, Type gemmElementType = Type::float32, Type saveElementType = Type::float32) {
std::vector<io::Item> ioItems;
// sorted by name in std::map
for (auto p : params()->getMap()) {
std::string pName = p.first;
if (!namespace_.empty()) {
if (pName.substr(0, namespace_.size() + 2) == namespace_ + "::")
pName = pName.substr(namespace_.size() + 2);
}
Tensor val = p.second->val();
// save as packed format
// @TODO Hardcoded to find packable weights
// int8 - all the weights used for affine op and dot op
// fp16 - all the weights used for affine op
if ((gemmElementType == Type::packed8avx2 || gemmElementType == Type::packed8avx512)
&& (pName.find("_W") == pName.length() - 3 || pName.find("_W") == pName.length() - 2)) {
#if USE_FBGEMM
using namespace marian::cpu::variant;
// packing information - size
int nrow;
int ncol;
uint64_t packsize;
fbgemmPacked8PackInfo(val->shape(),
gemmElementType,
pName.find("Wemb") != std::string::npos,
nrow,
ncol,
packsize);
auto allocator = New<TensorAllocator>(getBackend());
// buffer tensor to save packed matrix
Tensor packedTensor;
allocator->allocate(packedTensor, { 1, (int32_t)packsize }, Type::uint8);
//Pack B matrix into int8
fbgemmPacked8Pack(packedTensor,
val->data(),
gemmElementType,
pName.find("Wemb") != std::string::npos,
nrow,
ncol,
packsize);
io::Item item;
item.name = pName;
item.shape = val->shape();
item.type = gemmElementType;
// Use the actual memory as this will be aligned and padded.
// When memory mapping this is required. Shape keeps track of
// tensor size. Saving to *.npz will cut to size.
auto mem = packedTensor->memory();
item.bytes.resize(mem->size());
copy(backend_, mem->data<char>(), mem->data<char>() + mem->size(), item.bytes.data());
ioItems.emplace_back(std::move(item));
#else
ABORT("Packed type {} only supported when compiled with -DUSE_FBGEMM=on", gemmElementType);
#endif
// fp16 quantization option
} else if (gemmElementType == Type::packed16 && pName.find("_W") == pName.length() - 3) {
#if USE_FBGEMM
using namespace marian::cpu::variant;
// packing information
int nrow, ncol, kernel_ncol_blocks, brow, bcol, last_brow, nbrow, nbcol;
uint64_t packsize;
fbgemmPacked16PackInfo(val->shape(),
false,
nrow,
ncol,
kernel_ncol_blocks,
brow,
bcol,
last_brow,
nbrow,
nbcol,
packsize);
auto allocator = New<TensorAllocator>(getBackend());
Tensor packedTensor;
allocator->allocate(packedTensor, { 1, (int32_t)packsize }, Type::uint8);
// fbgemmPacked16Pack
fbgemmPacked16Pack(packedTensor,
val->data(),
false,
nrow,
ncol,
kernel_ncol_blocks,
brow,
bcol,
last_brow,
nbrow,
nbcol,
packsize);
io::Item item;
item.name = pName;
item.shape = val->shape();
item.type = Type::packed16;
// Use the actual memory as this will be aligned and padded.
// When memory mapping this is required. Shape keeps track of
// tensor size. Saving to *.npz will cut to size.
auto mem = packedTensor->memory();
item.bytes.resize(mem->size());
copy(backend_, mem->data<char>(), mem->data<char>() + mem->size(), item.bytes.data());
ioItems.emplace_back(std::move(item));
#else
ABORT("Packed type {} only supported when compiled with -DUSE_FBGEMM=on", gemmElementType);
#endif
} else if (isIntgemm(gemmElementType) &&
(pName.find("_W") == pName.length() - 3 || pName.find("_W") == pName.length() - 2 /* || pName.find("Wemb") != std::string::npos*/)) {
#if COMPILE_CPU
using cpu::integer::cols;
using cpu::integer::rows;
auto allocator = New<TensorAllocator>(getBackend());
Tensor paramMat; //This allocates extra 4 bytes at the end because of gemmElementType
allocator->allocate(paramMat, val->shape(), gemmElementType);
// Compute the quantization multiplier, quantize the matrix and store quantMult at the end.
// We need to transpose first, because our architecture-independent format requires a transposed matrix.
Tensor tmp;
allocator->allocate(tmp, val->shape(), val->type());
cpu::Transpose10(tmp, val);
if(sizeOf(gemmElementType) == 1) { // is 8-bit Intgemm type
float quantMult = cpu::integer::computeQuantMult<Type::intgemm8>(val);
// Hardware-specific conversions, which make memory-mapping possible and avoid conversion at runtime
cpu::integer::passOrAbort(gemmElementType); // Check if the hardware supports the GEMM type
if(isSsse3(gemmElementType)) {
intgemm::ssse3::Kernels8::PrepareBTransposed(tmp->data(), /*input*/
paramMat->data<int8_t>(), /*output*/
quantMult, /*Quant Mult*/
rows(val),
cols(val));
} else if(isAvx2(gemmElementType)) {
intgemm::avx2::Kernels8::PrepareBTransposed(tmp->data(), /*input*/
paramMat->data<int8_t>(), /*output*/
quantMult, /*Quant Mult*/
rows(val),
cols(val));
} else if(isAvx512(gemmElementType)) {
intgemm::avx512bw::Kernels8::PrepareBTransposed(tmp->data(), /*input*/
paramMat->data<int8_t>(), /*output*/
quantMult, /*Quant Mult*/
rows(val),
cols(val));
} else {
ABORT_IF(gemmElementType != Type::intgemm8, "Type {} is not supported", gemmElementType); // shouldn't really happen, but let's make sure
intgemm::Int8::PrepareA(tmp->data(), /*input*/
paramMat->data<int8_t>(), /*output*/
quantMult, /*Quant Mult*/
rows(val),
cols(val));
}
//Put the quantMult at the back of the tensor
cpu::integer::getQuantMult<Type::intgemm8>(paramMat) = quantMult;
} else if(sizeOf(gemmElementType) == 2) { // is 16-bit Intgemm type
float quantMult = cpu::integer::computeQuantMult<Type::intgemm16>(val);
// Hardware-specific conversions, which make memory-mapping possible and avoid conversion at runtime
cpu::integer::passOrAbort(gemmElementType); // Check if the hardware supports the GEMM type
if(isSse2(gemmElementType)) {
intgemm::sse2::Kernels16::PrepareBTransposed(tmp->data(), /*input*/
paramMat->data<int16_t>(), /*output*/
quantMult, /*Quant Mult*/
rows(val),
cols(val));
} else if(isAvx2(gemmElementType)) {
intgemm::avx2::Kernels16::PrepareBTransposed(tmp->data(), /*input*/
paramMat->data<int16_t>(), /*output*/
quantMult, /*Quant Mult*/
rows(val),
cols(val));
} else if(isAvx512(gemmElementType)) {
intgemm::avx512bw::Kernels16::PrepareBTransposed(tmp->data(), /*input*/
paramMat->data<int16_t>(), /*output*/
quantMult, /*Quant Mult*/
rows(val),
cols(val));
} else {
ABORT_IF(gemmElementType != Type::intgemm16, "Type {} is not supported", gemmElementType); // shouldn't really happen, but let's make sure
intgemm::Int16::PrepareA(tmp->data(), /*input*/
paramMat->data<int16_t>(), /*output*/
quantMult, /*Quant Mult*/
rows(val),
cols(val));
}
//Put the quantMult at the back of the tensor
cpu::integer::getQuantMult<Type::intgemm16>(paramMat) = quantMult;
} else {
ABORT("Incorrect Intgemm type size: {}", sizeOf(gemmElementType));
}
//Save... Same as the fbgemm case
io::Item item;
item.name = pName;
item.shape = val->shape();
item.type = gemmElementType;
auto mem = paramMat->memory();
item.bytes.resize(mem->size());
copy(backend_, mem->data<char>(), mem->data<char>() + mem->size(), item.bytes.data());
ioItems.emplace_back(std::move(item));
#else
ABORT("Packed type {} only supported when compiled with -DCOMPILE_CPU=on", gemmElementType);
#endif
} else {
ABORT_IF(saveElementType != Type::float32, "We currently do not know how to save matrices as {}", saveElementType);
io::Item item;
val->get(item, pName);
item.convert(saveElementType);
ioItems.emplace_back(std::move(item));
}
}
if (!meta.empty())
io::addMetaToItems(meta, "special:model.yml", ioItems);
io::saveItems(name, ioItems);
}
};
} // namespace marian
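For context, a sketch of how this class is driven, modelled on the marian-conv changes earlier in this commit (the wrapper function and its arguments are illustrative, not part of the codebase):

#include "marian.h"
#include "tensors/cpu/expression_graph_packable.h"

void convertToIntgemm(const std::string& modelFrom, const std::string& modelTo) {
  using namespace marian;
  auto graph = New<ExpressionGraphPackable>();
  graph->setDevice(CPU0);
  graph->load(modelFrom);
  graph->forward();  // run the initializers
  // Quantize and reorder all eligible "*_W" weights for the 8-bit intgemm backend, then save a *.bin model.
  graph->packAndSave(modelTo, /*meta=*/"", Type::intgemm8, Type::float32);
}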

View File

@ -2,7 +2,7 @@
#include "graph/node.h"
#include "packed_gemm.h"
#include "tensors/cpu/sharp/int_gemm.h"
#include "tensors/cpu/integer_common.h"
#if USE_FBGEMM
#ifdef __GNUC__
@ -57,14 +57,12 @@ struct FbgemmPacked16PackNodeOp : public UnaryNodeOp {
int nbcol_;
uint64_t packsize_;
FbgemmPacked16PackNodeOp(Expr a, PackMatrix packMat, bool transpose, float clipValue)
FbgemmPacked16PackNodeOp(Expr a, PackMatrix packMat, bool transpose)
: UnaryNodeOp(a, newShape(a, transpose), Type::uint8),
packMat_(packMat),
transpose_(transpose) {
if(packMat != PackMatrix::B)
ABORT("Only prepacking of B (weight matrix) is supported");
if(clipValue != 0)
ABORT("Clipping is not supported");
if(!memoize_)
ABORT("Only constant weight node can be packed");
}
@ -144,16 +142,13 @@ struct FbgemmPacked8PackNodeOp : public UnaryNodeOp {
FbgemmPacked8PackNodeOp(Expr a,
PackMatrix packMat,
marian::Type packType,
bool transpose,
float clipValue)
bool transpose)
: UnaryNodeOp(a, newShape(a, transpose), Type::uint8),
packMat_(packMat),
packType_(packType),
transpose_(transpose) {
if(packMat != PackMatrix::B)
ABORT("Only prepacking of B (weight matrix) is supported");
if(clipValue != 0)
ABORT("Clipping is not supported");
if(!memoize_)
ABORT("Only constant weight node can be packed");
}
@ -337,7 +332,7 @@ public:
k_,
transA_,
transB_);
marian::cpu::int16::AddBias(val_, child(2)->val())) };
marian::cpu::integer::AddBias(val_, child(2)->val())) };
} else {
nodeOps = { NodeOp(fbgemmPacked8Gemm(val_,
child(0)->val(),
@ -377,11 +372,11 @@ static inline Expr affine(Expr a, Expr b, Shape bShape, Expr c, bool transA, boo
}
}
static inline Expr pack(Type elementType, Expr a, PackMatrix packMat, bool transpose, float clipValue) {
static inline Expr pack(Type elementType, Expr a, PackMatrix packMat, bool transpose) {
if (elementType == Type::packed16)
return Expression<FbgemmPacked16PackNodeOp>(a, packMat, transpose, clipValue);
return Expression<FbgemmPacked16PackNodeOp>(a, packMat, transpose);
else if (isPacked(elementType) && sizeOf(elementType) == 1)
return Expression<FbgemmPacked8PackNodeOp>(a, packMat, elementType, transpose, clipValue);
return Expression<FbgemmPacked8PackNodeOp>(a, packMat, elementType, transpose);
else {
ABORT("Only int8 and fp16 are available. {}", elementType);
return nullptr;

View File

@ -1,156 +0,0 @@
#pragma once
#include "graph/expression_graph.h"
#include "packed_gemm.h"
namespace marian {
// When FBGEMM based packed GEMM is used, some weight matrices need to be packed offline.
// The decision which weights can be packed or not should be done walking through the graph.
// This requires some more changes, but we temporarily do this just by name ("_W") of the weights.
// And, this introduces a low level packed_gemm.h apis interact with high level graph class.
// So, we make a subclass of ExpressionGraph and put those immature codes in this class.
// We will improve this in the near future.
class ExpressionGraphPackable : public ExpressionGraph {
public:
ExpressionGraphPackable()
: ExpressionGraph( /* inference = */ true) {} // Packable expression graph only supports inference
virtual ~ExpressionGraphPackable() {}
// Convert model weights into packed format and save to IO items.
// @TODO: review this
void packAndSave(const std::string& name, const std::string& meta, Type gemmElementType = Type::float32, Type saveElementType = Type::float32) {
std::vector<io::Item> ioItems;
// sorted by name in std::map
for (auto p : params()->getMap()) {
std::string pName = p.first;
if (!namespace_.empty()) {
if (pName.substr(0, namespace_.size() + 2) == namespace_ + "::")
pName = pName.substr(namespace_.size() + 2);
}
Tensor val = p.second->val();
// save as packed format
// @TODO Hardcoded to find packable weights
// int8 - all the weights used for affine op and dot op
// fp16 - all the weights used for affine op
if ((gemmElementType == Type::packed8avx2 || gemmElementType == Type::packed8avx512)
&& (pName.find("_W") == pName.length() - 3 || pName.find("_W") == pName.length() - 2)) {
#if USE_FBGEMM
using namespace marian::cpu::variant;
// packing information - size
int nrow;
int ncol;
uint64_t packsize;
fbgemmPacked8PackInfo(val->shape(),
gemmElementType,
pName.find("Wemb") != std::string::npos,
nrow,
ncol,
packsize);
auto allocator = New<TensorAllocator>(getBackend());
// buffer tensor to save packed matrix
Tensor packedTensor;
allocator->allocate(packedTensor, { 1, (int32_t)packsize }, Type::uint8);
//Pack B matrix into int8
fbgemmPacked8Pack(packedTensor,
val->data(),
gemmElementType,
pName.find("Wemb") != std::string::npos,
nrow,
ncol,
packsize);
io::Item item;
item.name = pName;
item.shape = val->shape();
item.type = gemmElementType;
// Use the actual memory as this will be aligned and padded.
// When memory mapping this is required. Shape keeps track of
// tensor size. Saving to *.npz will cut to size.
auto mem = packedTensor->memory();
item.bytes.resize(mem->size());
copy(backend_, mem->data<char>(), mem->data<char>() + mem->size(), item.bytes.data());
ioItems.emplace_back(std::move(item));
#else
ABORT("Packed type {} only supported when compiled with -DUSE_FBGEMM=on", gemmElementType);
#endif
// fp16 quantization option
} else if (gemmElementType == Type::packed16 && pName.find("_W") == pName.length() - 3) {
#if USE_FBGEMM
using namespace marian::cpu::variant;
// packing information
int nrow, ncol, kernel_ncol_blocks, brow, bcol, last_brow, nbrow, nbcol;
uint64_t packsize;
fbgemmPacked16PackInfo(val->shape(),
false,
nrow,
ncol,
kernel_ncol_blocks,
brow,
bcol,
last_brow,
nbrow,
nbcol,
packsize);
auto allocator = New<TensorAllocator>(getBackend());
Tensor packedTensor;
allocator->allocate(packedTensor, { 1, (int32_t)packsize }, Type::uint8);
// fbgemmPacked16Pack
fbgemmPacked16Pack(packedTensor,
val->data(),
false,
nrow,
ncol,
kernel_ncol_blocks,
brow,
bcol,
last_brow,
nbrow,
nbcol,
packsize);
io::Item item;
item.name = pName;
item.shape = val->shape();
item.type = Type::packed16;
// Use the actual memory as this will be aligned and padded.
// When memory mapping this is required. Shape keeps track of
// tensor size. Saving to *.npz will cut to size.
auto mem = packedTensor->memory();
item.bytes.resize(mem->size());
copy(backend_, mem->data<char>(), mem->data<char>() + mem->size(), item.bytes.data());
ioItems.emplace_back(std::move(item));
#else
ABORT("Packed type {} only supported when compiled with -DUSE_FBGEMM=on", gemmElementType);
#endif
} else {
io::Item item;
val->get(item, pName);
item.convert(saveElementType);
ioItems.emplace_back(std::move(item));
}
}
if (!meta.empty())
io::addMetaToItems(meta, "special:model.yml", ioItems);
io::saveItems(name, ioItems);
}
};
} // namespace marian

View File

@ -1,113 +0,0 @@
#pragma once
#include "graph/node.h"
#include "tensors/cpu/sharp/int_gemm.h"
namespace marian {
namespace cpu {
namespace int16 {
struct QuantizeNodeOp : public UnaryNodeOp {
float clipValue_;
QuantizeNodeOp(Expr a, float clipValue)
: UnaryNodeOp(a, Type::int16), clipValue_{clipValue} {}
NodeOps forwardOps() override {
return {NodeOp(Quantize16(val_, child(0)->val(), clipValue_))};
}
NodeOps backwardOps() override {
ABORT("Only used for inference");
}
const std::string type() override { return "quantizeInt16"; }
};
class DotNodeOp : public NaryNodeOp {
private:
float scalar_;
public:
DotNodeOp(Expr a, Expr b, float scalar)
: NaryNodeOp({a, b}, newShape(a, b), Type::float32), scalar_(scalar) {}
Shape newShape(Expr a, Expr b) {
auto shapeA = a->shape();
auto shapeB = b->shape();
// Computing A * B^T
shapeB.set(-2, b->shape()[-1]);
shapeB.set(-1, b->shape()[-2]);
Shape outShape = shapeA;
outShape.set(-1, shapeB[-1]);
ABORT_IF(shapeA[-1] != shapeB[-2],
"matrix product requires dimensions to match");
return outShape;
}
NodeOps forwardOps() override {
return {NodeOp(ProdInt16(val_, child(0)->val(), child(1)->val(), scalar_))};
}
NodeOps backwardOps() override {
ABORT("Only used for inference");
}
const std::string type() override { return "dotInt16"; }
};
class AffineNodeOp : public NaryNodeOp {
private:
float scalar_;
public:
AffineNodeOp(const std::vector<Expr>& nodes, float scalar)
: NaryNodeOp(nodes, newShape(nodes[0], nodes[1]), Type::float32), scalar_(scalar) {}
Shape newShape(Expr a, Expr b) {
auto shapeA = a->shape();
auto shapeB = b->shape();
// Computing A * B^T
shapeB.set(-2, b->shape()[-1]);
shapeB.set(-1, b->shape()[-2]);
Shape outShape = shapeA;
outShape.set(-1, shapeB[-1]);
ABORT_IF(shapeA[-1] != shapeB[-2],
"matrix product requires dimensions to match");
return outShape;
}
NodeOps forwardOps() override {
return {
NodeOp(ProdInt16(val_, child(0)->val(), child(1)->val(), scalar_);
AddBias(val_, child(2)->val()))
};
}
NodeOps backwardOps() override {
ABORT("Only used for inference");
}
const std::string type() override { return "affineInt16"; }
};
static inline Expr dot(Expr a, Expr b, float scalar) {
return Expression<cpu::int16::DotNodeOp>(a, b, scalar);
}
static inline Expr affine(Expr a, Expr b, Expr c, float scalar) {
std::vector<Expr> nodes = {a, b, c};
return Expression<cpu::int16::AffineNodeOp>(nodes, scalar);
}
static inline Expr quantize(Expr a, float clipValue) {
return Expression<cpu::int16::QuantizeNodeOp>(a, clipValue);
}
} // namespace int16
} // namespace cpu
} // namespace marian

View File

@ -0,0 +1,45 @@
#include "integer_common.h"
namespace marian {
namespace cpu {
namespace integer {
// This operates on floats after processing so doesn't care about int8_t vs int16_t.
void AddBias(marian::Tensor C, const marian::Tensor Bias) {
float* y = C->data();
const float* x = C->data();
const float* bias = Bias->data();
const int m = C->shape().elements() / C->shape()[-1];
const int n = C->shape()[-1];
for(int j = 0; j < m; ++j) {
int i = 0;
#ifdef __AVX512F__
int n16 = n & ~15;
for(; i < n16; i += 16) {
__m512 ai = _mm512_loadu_ps(x + j * n + i);
__m512 bi = _mm512_loadu_ps(bias + i);
__m512 yi = _mm512_add_ps(ai, bi);
_mm512_storeu_ps(y + j * n + i, yi);
}
#else
int n4 = (n / 4) * 4;
for(; i < n4; i += 4) {
__m128 ai = _mm_loadu_ps(x + j * n + i);
__m128 bi = _mm_loadu_ps(bias + i);
__m128 yi = _mm_add_ps(ai, bi);
_mm_storeu_ps(y + j * n + i, yi);
}
#endif
for(; i < n; i++) {
y[j * n + i] = x[j * n + i] + bias[i];
}
}
}
//template void prepareAndTranspose<intgemm8>;//(io::Item& item, const char * input);
//template void prepareAndTranspose<intgemm16>(io::Item&, const char *);
} //integer
} //cpu
} //marian
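The vectorized loop above is equivalent to the following scalar reference, included here only as a reading aid (not part of the source):

// Reference semantics of AddBias: broadcast-add Bias (length n) to every row of C (m x n), in place.
static void AddBiasReference(float* C, const float* bias, int m, int n) {
  for(int j = 0; j < m; ++j)
    for(int i = 0; i < n; ++i)
      C[j * n + i] += bias[i];
}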

View File

@ -0,0 +1,223 @@
#pragma once
#include "tensors/tensor_allocator.h"
#include "tensors/tensor_operators.h"
#include "tensors/cpu/aligned.h"
#include "common/io_item.h"
#if COMPILE_CPU
#include "3rd_party/intgemm/intgemm/intgemm.h"
#else
namespace intgemm {
struct Int8;
struct Int16;
namespace ssse3 {
struct Kernels8;
}
namespace sse2 {
struct Kernels16;
}
namespace avx2 {
struct Kernels8;
struct Kernels16;
}
namespace avx512bw {
struct Kernels8;
struct Kernels16;
}
namespace avx512vnni {
struct Kernels8;
}
}
#endif
#include <emmintrin.h>
#include <immintrin.h>
#include <tmmintrin.h>
#include <xmmintrin.h>
#include <cassert>
#include <cstddef>
namespace marian {
namespace cpu {
namespace integer {
//Convenience functions to get rows and columns of a tensor, shadowed by the namespace.
inline int cols(Tensor& tensor) { return tensor->shape()[-1]; }
inline int rows(Tensor& tensor) { return tensor->shape().elements() / cols(tensor); }
inline int cols(Shape& shape) { return shape[-1]; }
inline int rows(Shape& shape) { return shape.elements() / cols(shape); }
template <Type type> struct intgemm_;
template <> struct intgemm_<Type::intgemm8> {
using width = intgemm::Int8;
using type = int8_t;
};
template <> struct intgemm_<Type::intgemm8ssse3> {
using width = intgemm::ssse3::Kernels8;
using type = int8_t;
};
template <> struct intgemm_<Type::intgemm8avx2> {
using width = intgemm::avx2::Kernels8;
using type = int8_t;
};
template <> struct intgemm_<Type::intgemm8avx512> {
using width = intgemm::avx512bw::Kernels8;
using type = int8_t;
};
template <> struct intgemm_<Type::intgemm8avx512vnni> {
using width = intgemm::avx512vnni::Kernels8;
using type = int8_t;
};
template <> struct intgemm_<Type::intgemm16> {
using width = intgemm::Int16;
using type = int16_t;
};
template <> struct intgemm_<Type::intgemm16sse2> {
using width = intgemm::sse2::Kernels16;
using type = int16_t;
};
template <> struct intgemm_<Type::intgemm16avx2> {
using width = intgemm::avx2::Kernels16;
using type = int16_t;
};
template <> struct intgemm_<Type::intgemm16avx512> {
using width = intgemm::avx512bw::Kernels16;
using type = int16_t;
};
template <Type vtype>
static inline float& getQuantMult(marian::Tensor val) {
#if COMPILE_CPU
ABORT_IF(!isIntgemm(val->type()), "getQuantMult does not work for type {}", val->type());
typedef typename intgemm_<vtype>::type Integer;
return *(reinterpret_cast<float*>(val->data<Integer>() + val->shape().elements()));
#else
val;
ABORT("Using intgemm binary models is only supported when compiling marian with -DCOMPILE_CPU=ON.");
#endif
}
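// Layout assumed by getQuantMult above (a sketch of the binary format, matching prepareAndTransposeB below):
//   [ rows*cols quantized Integer values ][ 1 float: quantization multiplier ]
// i.e. the multiplier is stored immediately after the tensor payload, which is why we index
// data<Integer>() + shape().elements().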
static inline Type getIntgemmType(Type vtype) {
#if COMPILE_CPU
if (vtype == Type::intgemm8) {
if (intgemm::kCPU == intgemm::CPUType::AVX512VNNI) {
return Type::intgemm8avx512vnni;
} else if (intgemm::kCPU == intgemm::CPUType::AVX512BW) {
return Type::intgemm8avx512;
} else if (intgemm::kCPU == intgemm::CPUType::AVX2) {
return Type::intgemm8avx2;
} else if (intgemm::kCPU == intgemm::CPUType::SSSE3) {
return Type::intgemm8ssse3;
} else {
ABORT("Your CPU doesn't support SSSE3, necessary for 8bit intgemm to work.");
}
} else if (vtype == Type::intgemm16) {
if (intgemm::kCPU > intgemm::CPUType::AVX2) {
return Type::intgemm16avx512;
} else if (intgemm::kCPU == intgemm::CPUType::AVX2) {
return Type::intgemm16avx2;
} else if (intgemm::kCPU >= intgemm::CPUType::SSE2) {
return Type::intgemm16sse2;
} else {
ABORT("Your CPU doesn't support SSE2, necessary for 16bit intgemm to work.");
}
} else {
ABORT("Unrecognised type {}.", vtype);
}
#else
ABORT("Using intgemm binary models is only supported when compiling marian with -DCOMPILE_CPU=ON.");
return vtype;
#endif
}
static inline bool passOrAbort(Type vtype) {
#if COMPILE_CPU
if (vtype == Type::intgemm8 || vtype == Type::intgemm16) {
return true;
} else if (vtype == Type::intgemm16sse2) {
ABORT_IF(intgemm::kCPU < intgemm::CPUType::SSE2, "Your CPU doesn't support the architecture necessary to decode model of type {}. Try older architecture instead.", vtype);
} else if (vtype == Type::intgemm8ssse3) {
ABORT_IF(intgemm::kCPU < intgemm::CPUType::SSSE3, "Your CPU doesn't support the architecture necessary to decode model of type {}. Try older architecture instead.", vtype);
} else if (vtype == Type::intgemm8avx2 || vtype == Type::intgemm16avx2) {
ABORT_IF(intgemm::kCPU < intgemm::CPUType::AVX2, "Your CPU doesn't support the architecture necessary to decode model of type {}. Try older architecture instead.", vtype);
} else if (vtype == Type::intgemm8avx512 || vtype == Type::intgemm16avx512) {
ABORT_IF(intgemm::kCPU < intgemm::CPUType::AVX512BW, "Your CPU doesn't support the architecture necessary to decode model of type {}. Try older architecture instead.", vtype);
} else if (vtype == Type::intgemm8avx512vnni) {
ABORT_IF(intgemm::kCPU < intgemm::CPUType::AVX512VNNI, "Your CPU doesn't support the architecture necessary to decode model of type {}. Try older architecture instead.", vtype);
}
return true;
#else
vtype;
ABORT("Using intgemm binary models is only supported when compiling marian with -DCOMPILE_CPU=ON.");
return false;
#endif
}
template <Type vtype>
static inline float computeQuantMult(marian::Tensor val) {
#if COMPILE_CPU
if(sizeOf(vtype) == 1)
return 127.0f / intgemm::MaxAbsolute(val->data(), val->data() + val->shape().elements());
else if(sizeOf(vtype) == 2)
return 1024.0f;
else
ABORT("Unhandled type size {}", sizeOf(vtype));
#else
val;
ABORT("Using intgemm binary models is only supported when compiling marian with -DCOMPILE_CPU=ON.");
#endif
}
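// Worked example (illustrative only): for an 8-bit tensor whose largest absolute value is 0.25,
// computeQuantMult returns 127.0f / 0.25f = 508, so a weight of 0.1f is stored as round(0.1 * 508) = 51
// and recovered at multiply time as 51 / 508 ~ 0.1004. For 16-bit tensors a fixed multiplier of 1024
// (10 fractional bits) is used instead of a data-dependent one.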
// This operates on floats after processing so doesn't care about int8_t vs int16_t.
void AddBias(marian::Tensor C, const marian::Tensor Bias);
// For loading architecture-agnostic models. We only do PrepareBQuantizedTransposed, because we already transposed
// B in our binary format. Then we copy the quantization multiplier information stored at the end.
template<Type vtype>
void prepareAndTransposeB(io::Item& item, const char * input) {
#if COMPILE_CPU
typedef typename intgemm_<vtype>::type Integer;
Integer * output_tensor = reinterpret_cast<Integer *>(&(*item.bytes.begin()));
// Sometimes we will end up with misaligned input (and output) so we can't use them directly.
// If this is the case, we need to temporarily allocate aligned memory, copy the results, and then free it.
if (reinterpret_cast<uintptr_t>(input) % 64 == 0 && reinterpret_cast<uintptr_t>(output_tensor) % 64 == 0) {
intgemm_<vtype>::width::PrepareBQuantizedTransposed(reinterpret_cast<const Integer *>(input),
output_tensor,
rows(item.shape), //Since we only transposed, but didn't update the shape when constructing the binary,
cols(item.shape)); //rows here returns the columns of the transposed input matrix, and cols -> the rows
} else {
Integer * aligned_input = reinterpret_cast<Integer *>(genericMalloc(512, rows(item.shape)*cols(item.shape)*sizeof(Integer)));
std::copy(input, input + rows(item.shape)*cols(item.shape), aligned_input);
Integer * aligned_output = reinterpret_cast<Integer *>(genericMalloc(512, rows(item.shape)*cols(item.shape)*sizeof(Integer)));
intgemm_<vtype>::width::PrepareBQuantizedTransposed(reinterpret_cast<const Integer *>(aligned_input),
reinterpret_cast<Integer *>(aligned_output),
rows(item.shape), //Since we only transposed, but didn't update the shape when constructing the binary,
cols(item.shape)); //rows here returns the columns of the transposed input matrix, and cols -> the rows
// Copy to output tensor
std::copy(aligned_output, aligned_output + rows(item.shape)*cols(item.shape), output_tensor);
genericFree(aligned_input);
genericFree(aligned_output);
}
//Copy the quantMult
float quantMult = *(reinterpret_cast<const float *>(reinterpret_cast<const Integer *>(input) + item.shape.elements()));
*(reinterpret_cast<float *>(&(*(output_tensor + item.shape.elements())))) = quantMult;
#else
item, input;
ABORT("Using intgemm binary models is only supported when compiling marian with -DCOMPILE_CPU=ON.");
#endif
}
} //integer
} //cpu
} //marian
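A rough sketch of the intended call pattern when loading a binary model (the surrounding check and the mappedBytes pointer are assumptions for illustration; item.bytes is assumed to already hold room for the prepared tensor plus the trailing quantization multiplier):

if(item.type == Type::intgemm8) {                           // hardware-agnostic 8-bit payload
  item.type = getIntgemmType(Type::intgemm8);               // e.g. intgemm8avx2 on an AVX2 machine
  prepareAndTransposeB<Type::intgemm8>(item, mappedBytes);  // mappedBytes: raw payload read from the file
}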

View File

@ -0,0 +1,132 @@
#pragma once
#include "graph/node.h"
#include "graph/node_operators_unary.h"
#include "integer_common.h"
namespace marian {
namespace cpu {
namespace integer {
#if COMPILE_CPU
/*
* Prepare an activation matrix into intgemm8/16 format. For now the activation matrix is just quantized.
* Expr input: The input tensor
*/
template<Type vtype>
static inline Expr prepareA(Expr a) {
auto nodeOp = [](Expr out, const std::vector<Expr>& children) {
Expr in = children[0];
auto quantMult = computeQuantMult<vtype>(in->val());
typedef typename intgemm_<vtype>::type Integer;
intgemm_<vtype>::width::PrepareA(in->val()->data(), /*input*/
out->val()->data<Integer>(), /*output*/
quantMult, /*Quant Mult*/
rows(in->val()),
cols(in->val()));
getQuantMult<vtype>(out->val()) = quantMult;
};
return lambda({a}, a->shape(), vtype, nodeOp);
}
#endif
/*
* This computes A*B (+ bias if available) in intgemm.
* Expr a: The activation matrix in intgemm format
 * Expr b: The parameter matrix in intgemm format
 * Expr bias: The bias
 * bool transA - transpose input A if true
* bool transB - unused here (@TODO remove?)
* float scale - scale the output by `scale`
 * The template argument controls whether we're doing 16-bit or 8-bit integers.
 * It can be Type::intgemm8 or Type::intgemm16, or any of their hardware-specific variants.
*/
template<Type vtype>
static inline Expr affineOrDotTyped(Expr a, Expr bQuant, Expr bias, bool transA, bool /*transB*/, float scale) {
#if COMPILE_CPU
ABORT_IF(!isFloat(a->value_type()), "Intgemm expects type of A to be float32 not {}", a->value_type());
ABORT_IF(!isIntgemm(bQuant->value_type()), "Intgemm expects type of B to be a variant of intgemm not {}", bQuant->value_type());
auto aQuant = prepareA<vtype>(transA ? transpose(a) : a); // A should not be quantized yet as seen above, hence quantize here
// determine the output shape m x n for A: m x k and B: k x n
// since we transpose A beforehand we don't need to take care of transposed shapes here
Shape outShape = aQuant->shape();
outShape.set(-1, bQuant->shape()[-1]);
// wrap the multiply functions to be executed in the forward step of a Lambda node
auto dotOrAffineNodeOp = [=](Expr out, const std::vector<Expr>& children) {
Expr aQuant = children[0];
Expr bQuant = children[1];
Expr bias = children.size() > 2 ? children[2] : nullptr;
// when we arrive here, A and B are already quantized, so just get the multipliers
float aQuantMult = getQuantMult<vtype>(aQuant->val());
float bQuantMult = getQuantMult<vtype>(bQuant->val());
float unquant_mult = 1.0f / (aQuantMult * bQuantMult);
unquant_mult = unquant_mult * scale;
typedef typename intgemm_<vtype>::type Integer;
if(bias) { // dispatch a multiply with integrated bias addition i.e affine(...)
intgemm_<vtype>::width::Multiply(/*A=*/aQuant->val()->data<Integer>(),
/*B=*/bQuant->val()->data<Integer>(),
rows(aQuant->val()),
cols(aQuant->val()),
cols(bQuant->val()),
intgemm::callbacks::UnquantizeAndAddBiasAndWrite(unquant_mult, /*bias=*/bias->val()->data(), /*output=*/out->val()->data()));
} else { // dispatch a multiply without bias addition i.e dot(...)
intgemm_<vtype>::width::Multiply(/*A=*/aQuant->val()->data<Integer>(),
/*B=*/bQuant->val()->data<Integer>(),
rows(aQuant->val()),
cols(aQuant->val()),
cols(bQuant->val()),
intgemm::callbacks::UnquantizeAndWrite(unquant_mult, /*output=*/out->val()->data()));
}
};
std::vector<Expr> children = {aQuant, bQuant};
if(bias)
children.push_back(bias);
return lambda(children, outShape, Type::float32, dotOrAffineNodeOp); // inference-only Lambda node
#else
a, bQuant, bias, transA, scale;
ABORT("You need to enable CPU compilation to use this feature. Use cmake .. -DCOMPILE_CPU=ON");
#endif
}
// Dispatch correct hardware-agnostic or hardware-specific matrix multiplies
static inline Expr affineOrDot(Expr a, Expr bQuant, Expr bias, bool transA, bool transB, float scale) {
Type bQuantElementType = bQuant->value_type();
static const bool pass = cpu::integer::passOrAbort(bQuantElementType);
pass; // Referencing silences unused-variable warnings; the static above ensures passOrAbort only ever runs once during initialization.
switch(bQuantElementType) {
//case Type::intgemm8 : // The generic case selects CPU automatically, but we set all the types manually anyways.
// return cpu::integer::affineOrDotTyped<Type::intgemm8>(a, bQuant, bias, transA, transB, scale);
case Type::intgemm8ssse3 :
return cpu::integer::affineOrDotTyped<Type::intgemm8ssse3>(a, bQuant, bias, transA, transB, scale);
case Type::intgemm8avx2 :
return cpu::integer::affineOrDotTyped<Type::intgemm8avx2>(a, bQuant, bias, transA, transB, scale);
case Type::intgemm8avx512 :
return cpu::integer::affineOrDotTyped<Type::intgemm8avx512>(a, bQuant, bias, transA, transB, scale);
case Type::intgemm8avx512vnni :
return cpu::integer::affineOrDotTyped<Type::intgemm8avx512vnni>(a, bQuant, bias, transA, transB, scale);
//case Type::intgemm16 : // The generic case selects CPU automatically, but we set all the types manually anyways.
// return cpu::integer::affineOrDotTyped<Type::intgemm16>(a, bQuant, bias, transA, transB, scale);
case Type::intgemm16sse2 :
return cpu::integer::affineOrDotTyped<Type::intgemm16sse2>(a, bQuant, bias, transA, transB, scale);
case Type::intgemm16avx2 :
return cpu::integer::affineOrDotTyped<Type::intgemm16avx2>(a, bQuant, bias, transA, transB, scale);
case Type::intgemm16avx512 :
return cpu::integer::affineOrDotTyped<Type::intgemm16avx512>(a, bQuant, bias, transA, transB, scale);
default:
ABORT("Unsupported type {} for Intgemm type??", bQuantElementType);
}
}
} // namespace integer
} // namespace cpu
} // namespace marian
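To make the dispatch above concrete, a hedged usage sketch (expression names are illustrative; in marian this is reached from the CPU affine()/dot() operators once B carries an intgemm type):

// x: float32 activations; Wq: a parameter whose value_type() is e.g. Type::intgemm8avx2; bias may be nullptr.
Expr y = cpu::integer::affineOrDot(x, Wq, bias, /*transA=*/false, /*transB=*/false, /*scale=*/1.0f);
// Internally the product is unquantized with scale / (aQuantMult * bQuantMult); e.g. aQuantMult = 508
// and bQuantMult = 127 give an unquantization multiplier of roughly 1.55e-5.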

View File

@ -7,8 +7,17 @@
#include "tensors/tensor.h"
#include "tensors/tensor_allocator.h"
#if MKL_FOUND
#include <mkl.h>
#else
#if BLAS_FOUND
#include <cblas.h>
#endif
#endif
#include "integer_common.h"
#include "prod_blas.h"
#include "sharp/int_gemm.h"
namespace marian {
@ -187,7 +196,7 @@ void ProdWithBias(marian::Tensor C,
float beta,
float scalar) {
cpu::Prod(C, A, B, transA, transB, beta, scalar);
cpu::int16::AddBias(C, bias);
cpu::integer::AddBias(C, bias);
}
void CSRProd(marian::Tensor C,

View File

@ -1,615 +0,0 @@
#include <emmintrin.h>
#include <immintrin.h>
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <tmmintrin.h>
#include <xmmintrin.h>
#include <cassert>
#include <cstddef>
#ifdef __AVX512F__
namespace marian {
namespace cpu {
namespace int16 {
namespace {
// Load from memory, multiply, and convert to int32_t.
inline __m512i QuantizerGrab(const float *input, const __m512 quant_mult_reg) {
// Load 16 floats
__m512 val = _mm512_load_ps(input);
// Multiply each by the quantization factor.
val = _mm512_mul_ps(val, quant_mult_reg);
// Cast to 32-bit int
return _mm512_cvtps_epi32(val);
}
} // namespace
// Convert
void AVX_Quantize16(const float *input,
int16_t *output,
float quant_mult,
std::size_t size) {
assert(size % 16 == 0);
assert(reinterpret_cast<uintptr_t>(input) % 64 == 0);
// Fill with the quantization multiplier.
const __m512 quant_mult_reg = _mm512_set1_ps(quant_mult);
const float *end = input + size;
for(; input != end; input += 16, output += 16) {
// There doesn't seem to be an unmasked version.
_mm512_mask_cvtsepi32_storeu_epi16(
output, 0xffff, QuantizerGrab(input, quant_mult_reg));
}
}
void AVX_Quantize8(const float *input,
int8_t *output,
float quant_mult,
std::size_t size) {
assert(size % 16 == 0);
assert(reinterpret_cast<uintptr_t>(input) % 64 == 0);
const __m512i neg127 = _mm512_set1_epi32(-127);
const __m512 quant_mult_reg = _mm512_set1_ps(quant_mult);
const float *end = input + size;
for(; input < end; input += 16, output += 16) {
__m512i asint = QuantizerGrab(input, quant_mult_reg);
/* Ban -128. We can't negate it.
 * The largest possible product is -128 * -128 = 2^14. If two of those are
* summed that's 2^15 which is too large for int16_t. By banning -128 we
* can accumulate two in int16_t w/o saturation before going to int32_t.
* But this is ok because apparently the instruction will saturate.
*/
asint = _mm512_max_epi32(asint, neg127);
// There doesn't seem to be an unmasked version.
_mm512_mask_cvtsepi32_storeu_epi8(output, 0xffff, asint);
}
}
namespace {
union FloatAccess {
float as_f[4];
__m128 as_n;
};
union IntAccess {
int32_t as_i[4];
__m128i as_n;
};
/* Convert 16-bit to 32-bit and add, not caring what parts are added.
* Implementations:
* 1.
* https://github.com/tesseract-ocr/tesseract/blob/master/src/arch/intsimdmatrixavx2.cpp#L67
* under Apache license: This does a multiply by 1 and horizontal add:
* _mm512_madd_epi16(sum, _mm512_set1_epi16(1))
* Current fastest.
*
* 2. Signed extension and fold halves:
* sum = _mm512_add_epi32(
* _mm512_cvtepi16_epi32(_mm512_castsi512_si256(sum)),
* _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(sum, 1)));
*
* 3. Sign extend by abuse of bitshift, then add.
* __m128i shift16 = _mm_set_epi32(0,0,0,16);
* sum = _mm512_add_epi32(
* _mm512_sra_epi32(_mm512_sll_epi32(sum, shift16), shift16),
* _mm512_sra_epi32(sum, shift16));
*/
inline void Convert32Sum(__m512i &sum) {
short one = 1;
sum = _mm512_madd_epi16(sum, _mm512_set1_epi16(one));
}
// Two sum version.
struct ReducedPair {
int32_t result[2];
};
inline ReducedPair Reduce16to32(__m512i sum1, __m512i sum2) {
Convert32Sum(sum1);
Convert32Sum(sum2);
// 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2
__m512i pack12 = _mm512_add_epi32(_mm512_unpackhi_epi32(sum1, sum2),
_mm512_unpacklo_epi32(sum1, sum2));
// 1 2 1 2 1 2 1 2
__m256i halves = _mm256_add_epi32(_mm512_castsi512_si256(pack12),
_mm512_extracti64x4_epi64(pack12, (short)1));
// 1 2 1 2
IntAccess a;
a.as_n = _mm_add_epi32(_mm256_castsi256_si128(halves),
_mm256_extracti128_si256(halves, 1));
ReducedPair ret;
ret.result[0] = a.as_i[0] + a.as_i[2];
ret.result[1] = a.as_i[1] + a.as_i[3];
return ret;
}
// Assuming sum1, sum2, sum3, and sum4 are arrays 32-bit signed integers,
// reduce within each.
// Returns [sum(sum1), sum(sum2), sum(sum3), sum(sum4)]
// TODO: consider doing in 64-bit, allowing 4 more bits of quantization?
inline __m128i Reduce32(__m512i sum1,
__m512i sum2,
__m512i sum3,
__m512i sum4) {
// 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2
__m512i pack12 = _mm512_add_epi32(_mm512_unpackhi_epi32(sum1, sum2),
_mm512_unpacklo_epi32(sum1, sum2));
// 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4
__m512i pack34 = _mm512_add_epi32(_mm512_unpackhi_epi32(sum3, sum4),
_mm512_unpacklo_epi32(sum3, sum4));
// 1 2 3 4 1 2 3 4 1 2 3 4 1 2 3 4
__m512i pack1234 = _mm512_add_epi32(_mm512_unpackhi_epi64(pack12, pack34),
_mm512_unpacklo_epi64(pack12, pack34));
// Cut the register into halves and sum those. 1 2 3 4 1 2 3 4
__m256i halves = _mm256_add_epi32(_mm512_castsi512_si256(pack1234),
_mm512_extracti64x4_epi64(pack1234, (short)1));
// Again: cut the register into halves and sum those. 1 2 3 4
return _mm_add_epi32(_mm256_castsi256_si128(halves),
_mm256_extracti128_si256(halves, 1));
}
// Four sum version
inline __m128i Reduce16to32(__m512i sum1,
__m512i sum2,
__m512i sum3,
__m512i sum4) {
Convert32Sum(sum1);
Convert32Sum(sum2);
Convert32Sum(sum3);
Convert32Sum(sum4);
return Reduce32(sum1, sum2, sum3, sum4);
}
// Somewhat inefficient reduce for single __m256i containing int32_t
inline int32_t Reduce32(__m256i halves) {
IntAccess a;
a.as_n = _mm_add_epi32(_mm256_castsi256_si128(halves),
_mm256_extracti128_si256(halves, 1));
// TODO is there a more efficient way?
return a.as_i[0] + a.as_i[1] + a.as_i[2] + a.as_i[3];
}
// Somewhat inefficient reduce for single __m512i containing int32_t
inline int32_t Reduce32(__m512i sum1) {
// Fold register over itself.
return Reduce32(_mm256_add_epi32(_mm512_castsi512_si256(sum1),
_mm512_extracti64x4_epi64(sum1, (short)1)));
}
inline int32_t Reduce16to32(__m512i sum1) {
Convert32Sum(sum1);
// Fold register over itself.
return Reduce32(_mm256_add_epi32(_mm512_castsi512_si256(sum1),
_mm512_extracti64x4_epi64(sum1, (short)1)));
}
class ScatterPut {
public:
explicit ScatterPut(float unquant_mult, int num_B_rows)
: unquant_mult_(unquant_mult),
unquant_mult_sse_(_mm_set1_ps(unquant_mult)),
#ifdef __AVX512VL__
num_b_rows_scatter_(_mm_set_epi32(num_B_rows * 3 * sizeof(float),
num_B_rows * 2 * sizeof(float),
num_B_rows * 1 * sizeof(float),
num_B_rows * 0 * sizeof(float))),
#endif
num_B_rows_(num_B_rows) {
}
inline void Write(float *base, __m128i reduced) {
__m128 float_sums = _mm_cvtepi32_ps(reduced);
float_sums = _mm_mul_ps(float_sums, unquant_mult_sse_);
#ifdef __AVX512VL__
// The scatter instruction requires avx512vl
_mm_i32scatter_ps(base, num_b_rows_scatter_, float_sums, (short)1);
#else
FloatAccess a;
// Get floats for each of the sums to write.
a.as_n = float_sums;
// Also note that the memory accesses on C are not consecutive, but this is a
// tradeoff that we have to make. We can't have consecutive accesses of A,
// B, *and* C. But we access A and B a lot more so it makes sense to do it
// this way. Scatter to outputs:
base[0] = a.as_f[0];
base[num_B_rows_] = a.as_f[1];
base[2 * num_B_rows_] = a.as_f[2];
base[3 * num_B_rows_] = a.as_f[3];
#endif
}
inline void Write(float *base, ReducedPair reduced) {
base[0] = unquant_mult_ * static_cast<float>(reduced.result[0]);
base[num_B_rows_] = unquant_mult_ * static_cast<float>(reduced.result[1]);
}
inline void Write(float *base, int32_t reduced) {
base[0] = unquant_mult_ * static_cast<float>(reduced);
}
private:
const float unquant_mult_;
const __m128 unquant_mult_sse_;
#ifdef __AVX512VL__
const __m128i num_b_rows_scatter_;
#endif
const int num_B_rows_;
};
} // namespace
// This is an AVX512F implementation of int16_t multiply based on Jacob
// Devlin's SSE code. The original SSE code was:
// Copyright (c) 2017 Microsoft Corporation
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
// We are multiplying A * B^T, as opposed to A * B. This is important because it
// means we can do consecutive memory access on A * B^T which allows us to take
// the most advantage of L1 cache.
//
// B is typically a weight matrix, so it can be pre-processed offline, and
// therefore this transpose does not cost anything. A is typically an activation
// minibatch matrix. A and B must be 64-byte aligned. C should be the usual
// 4-byte alignment.
void AVX_MatrixMult16(const __m512i *A,
const __m512i *B,
float *C,
float unquant_mult,
int num_A_rows,
int num_B_rows,
int width) {
assert(width % 32 == 0);
assert(reinterpret_cast<uintptr_t>(A) % 64 == 0);
assert(reinterpret_cast<uintptr_t>(B) % 64 == 0);
ScatterPut put(unquant_mult, num_B_rows);
const int sse_width = width / 32;
// We do loop unrolling over A. This is *significantly* faster
// since B can live in the registers. We are assuming that
// A is a multiple of 4, but we can add extra code to handle values of 1,
// 2, 3.
//
// We could also do loop unrolling over B, which adds some additional speedup.
// We don't do that for the sake of clarity.
//
// There are other memory access patterns we could do, e.g., put B on the
// outer loop. The justification is that A is typically small enough that it
// can live in L1 cache. B is usually a larger weight matrix, so it might not
// be able to. However, we are using each element of B four times while it's
// still in a register, so caching is not as important.
// Round down to a multiple of 4.
int num_unroll_rows = num_A_rows & ~3;
for(int i = 0; i < num_unroll_rows; i += 4) {
const __m512i *A1_row = A + (i + 0) * sse_width;
const __m512i *A2_row = A + (i + 1) * sse_width;
const __m512i *A3_row = A + (i + 2) * sse_width;
const __m512i *A4_row = A + (i + 3) * sse_width;
for(int j = 0; j < num_B_rows; j++) {
const __m512i *B_row = B + j * sse_width;
__m512i sum1 = _mm512_setzero_si512();
__m512i sum2 = _mm512_setzero_si512();
__m512i sum3 = _mm512_setzero_si512();
__m512i sum4 = _mm512_setzero_si512();
// This is just a simple dot product, unrolled four ways.
for(int k = 0; k < sse_width; k++) {
__m512i b = *(B_row + k);
__m512i a1 = *(A1_row + k);
__m512i a2 = *(A2_row + k);
__m512i a3 = *(A3_row + k);
__m512i a4 = *(A4_row + k);
// madd_epi16 does multiply add on 8 16-bit integers and accumulates
// into four 32-bit integers. E.g., a1 = [f1, f2, f3, f4, f5, f6, f7,
// f8] (16-bit ints) b1 = [h1, h2, h3, h4, h5, h6, h7, h8] (16-bit ints)
// result = [f1*h1 + f2*h2, f3*h3 + f4*h4, f5*h5 + f6*h6, f7*h7 + f8*h8]
// (32-bit ints) Then add_epi32 just effectively does a += on these
// 32-bit integers.
sum1 = _mm512_add_epi32(sum1, _mm512_madd_epi16(b, a1));
sum2 = _mm512_add_epi32(sum2, _mm512_madd_epi16(b, a2));
sum3 = _mm512_add_epi32(sum3, _mm512_madd_epi16(b, a3));
sum4 = _mm512_add_epi32(sum4, _mm512_madd_epi16(b, a4));
}
put.Write(C + i * num_B_rows + j, Reduce32(sum1, sum2, sum3, sum4));
}
}
// Handle the non-multiples of 4 rows.
// TODO: efficient version for 3 rows, 2 rows, etc.
for(int i = num_unroll_rows; i < num_A_rows; ++i) {
const __m512i *A1_row = A + i * sse_width;
for(int j = 0; j < num_B_rows; j++) {
__m512i sum1 = _mm512_setzero_si512();
for(int k = 0; k < sse_width; k++) {
const __m512i *B_row = B + j * sse_width;
__m512i b = *(B_row + k);
__m512i a1 = *(A1_row + k);
sum1 = _mm512_add_epi32(sum1, _mm512_madd_epi16(b, a1));
}
// TODO is there a more efficient way?
*(C + (i)*num_B_rows + j)
= unquant_mult * static_cast<float>(Reduce32(sum1));
}
}
}
namespace {
/* Three ways considered to apply sign bits:
* 1. Use 256-bit sign instruction:
* __m256i a_first = _mm256_sign_epi8(_mm512_castsi512_si256(a),
* _mm512_castsi512_si256(b));
* __m256i a_second = _mm256_sign_epi8(_mm512_extracti64x4_epi64(a, 1),
* b_second); a = _mm512_inserti64x4(_mm512_castsi256_si512(a_first), a_second,
* 1); a = Concat(a_first, a_second);
*
* 2. Extract a mask and xor + 1
* __mmask64 neg_mask _mm512_test_epi8_mask(b, _mm512_set1_epi8(-128));
 * Use set1 to build to_xor
* a = _mm512_xor_si512(a, to_xor)
* And add one:
* const __m512i ones8 = _mm512_set1_epi8(1);
* a = _mm512_mask_add_epi8(a, neg_mask, a, ones8);
*
* 3. Extract a mask and subtract from 0
* In the outer loop on b:
* __mmask64 neg_mask _mm512_test_epi8_mask(b, _mm512_set1_epi8(-128))
* For each a:
* a = _mm512_mask_sub_epi8(a, neg_mask, _mm512_setzero_si512(), a);
*
* Finally, subtraction won the benchmark
*/
inline void Accum(const __m512i zeros,
__m512i a,
const __m512i b,
const __m512i b_positive,
const __mmask64 neg_mask,
__m512i &sum) {
// Apply sign bits.
a = _mm512_mask_sub_epi8(a, neg_mask, zeros, a);
// The magic 8-bit multiply then horizontal sum into 16-bit.
__m512i multiplied = _mm512_maddubs_epi16(b_positive, a);
// Now we have 16-bit results that are the sum of two multiplies.
// Choosing to approximate and do adds.
// Perhaps every so often we could accumulate by Convert32Sum
sum = _mm512_adds_epi16(sum, multiplied);
b; // make compiler happy
}
} // namespace
void AVX_MatrixMult8(const __m512i *A,
const __m512i *B,
float *C,
float unquant_mult,
int num_A_rows,
int num_B_rows,
int width) {
assert(width % 32 == 0);
assert(reinterpret_cast<uintptr_t>(A) % 64 == 0);
assert(reinterpret_cast<uintptr_t>(B) % 64 == 0);
ScatterPut put(unquant_mult, num_B_rows);
const __m512i zeros = _mm512_setzero_si512();
const int sse_width = width / 64;
int i = 0;
int mult8rows = num_A_rows & (~7);
for(; i < mult8rows; i += 8) {
const __m512i *A1_row = A + (i + 0) * sse_width;
const __m512i *A2_row = A + (i + 1) * sse_width;
const __m512i *A3_row = A + (i + 2) * sse_width;
const __m512i *A4_row = A + (i + 3) * sse_width;
const __m512i *A5_row = A + (i + 4) * sse_width;
const __m512i *A6_row = A + (i + 5) * sse_width;
const __m512i *A7_row = A + (i + 6) * sse_width;
const __m512i *A8_row = A + (i + 7) * sse_width;
for(int j = 0; j < num_B_rows; j++) {
const __m512i *B_row = B + j * sse_width;
__m512i sum1 = _mm512_setzero_si512();
__m512i sum2 = _mm512_setzero_si512();
__m512i sum3 = _mm512_setzero_si512();
__m512i sum4 = _mm512_setzero_si512();
__m512i sum5 = _mm512_setzero_si512();
__m512i sum6 = _mm512_setzero_si512();
__m512i sum7 = _mm512_setzero_si512();
__m512i sum8 = _mm512_setzero_si512();
for(int k = 0; k < sse_width; k++) {
__m512i b = *(B_row + k);
__m512i b_positive = _mm512_abs_epi8(b);
/* Didn't seem to make a difference defining sign bits here vs at top
*/
__mmask64 neg_mask = _mm512_test_epi8_mask(b, _mm512_set1_epi8(-128));
Accum(zeros, *(A1_row + k), b, b_positive, neg_mask, sum1);
Accum(zeros, *(A2_row + k), b, b_positive, neg_mask, sum2);
Accum(zeros, *(A3_row + k), b, b_positive, neg_mask, sum3);
Accum(zeros, *(A4_row + k), b, b_positive, neg_mask, sum4);
Accum(zeros, *(A5_row + k), b, b_positive, neg_mask, sum5);
Accum(zeros, *(A6_row + k), b, b_positive, neg_mask, sum6);
Accum(zeros, *(A7_row + k), b, b_positive, neg_mask, sum7);
Accum(zeros, *(A8_row + k), b, b_positive, neg_mask, sum8);
}
put.Write(C + i * num_B_rows + j, Reduce16to32(sum1, sum2, sum3, sum4));
put.Write(C + (i + 4) * num_B_rows + j,
Reduce16to32(sum5, sum6, sum7, sum8));
}
}
const __m512i *A1_row = A + (i + 0) * sse_width;
const __m512i *A2_row = A + (i + 1) * sse_width;
const __m512i *A3_row = A + (i + 2) * sse_width;
const __m512i *A4_row = A + (i + 3) * sse_width;
const __m512i *A5_row = A + (i + 4) * sse_width;
const __m512i *A6_row = A + (i + 5) * sse_width;
const __m512i *A7_row = A + (i + 6) * sse_width;
switch(num_A_rows & 7) {
case 7:
for(int j = 0; j < num_B_rows; j++) {
const __m512i *B_row = B + j * sse_width;
__m512i sum1 = _mm512_setzero_si512();
__m512i sum2 = _mm512_setzero_si512();
__m512i sum3 = _mm512_setzero_si512();
__m512i sum4 = _mm512_setzero_si512();
__m512i sum5 = _mm512_setzero_si512();
__m512i sum6 = _mm512_setzero_si512();
__m512i sum7 = _mm512_setzero_si512();
for(int k = 0; k < sse_width; k++) {
__m512i b = *(B_row + k);
__m512i b_positive = _mm512_abs_epi8(b);
__mmask64 neg_mask = _mm512_test_epi8_mask(b, _mm512_set1_epi8(-128));
Accum(zeros, *(A1_row + k), b, b_positive, neg_mask, sum1);
Accum(zeros, *(A2_row + k), b, b_positive, neg_mask, sum2);
Accum(zeros, *(A3_row + k), b, b_positive, neg_mask, sum3);
Accum(zeros, *(A4_row + k), b, b_positive, neg_mask, sum4);
Accum(zeros, *(A5_row + k), b, b_positive, neg_mask, sum5);
Accum(zeros, *(A6_row + k), b, b_positive, neg_mask, sum6);
Accum(zeros, *(A7_row + k), b, b_positive, neg_mask, sum7);
}
put.Write(C + i * num_B_rows + j, Reduce16to32(sum1, sum2, sum3, sum4));
put.Write(C + (i + 4) * num_B_rows + j, Reduce16to32(sum5, sum6));
put.Write(C + (i + 6) * num_B_rows + j, Reduce16to32(sum7));
}
/* fall through */
case 6:
for(int j = 0; j < num_B_rows; j++) {
const __m512i *B_row = B + j * sse_width;
__m512i sum1 = _mm512_setzero_si512();
__m512i sum2 = _mm512_setzero_si512();
__m512i sum3 = _mm512_setzero_si512();
__m512i sum4 = _mm512_setzero_si512();
__m512i sum5 = _mm512_setzero_si512();
__m512i sum6 = _mm512_setzero_si512();
for(int k = 0; k < sse_width; k++) {
__m512i b = *(B_row + k);
__m512i b_positive = _mm512_abs_epi8(b);
__mmask64 neg_mask = _mm512_test_epi8_mask(b, _mm512_set1_epi8(-128));
Accum(zeros, *(A1_row + k), b, b_positive, neg_mask, sum1);
Accum(zeros, *(A2_row + k), b, b_positive, neg_mask, sum2);
Accum(zeros, *(A3_row + k), b, b_positive, neg_mask, sum3);
Accum(zeros, *(A4_row + k), b, b_positive, neg_mask, sum4);
Accum(zeros, *(A5_row + k), b, b_positive, neg_mask, sum5);
Accum(zeros, *(A6_row + k), b, b_positive, neg_mask, sum6);
}
put.Write(C + i * num_B_rows + j, Reduce16to32(sum1, sum2, sum3, sum4));
put.Write(C + (i + 4) * num_B_rows + j, Reduce16to32(sum5, sum6));
}
/* fall through */
case 5:
for(int j = 0; j < num_B_rows; j++) {
const __m512i *B_row = B + j * sse_width;
__m512i sum1 = _mm512_setzero_si512();
__m512i sum2 = _mm512_setzero_si512();
__m512i sum3 = _mm512_setzero_si512();
__m512i sum4 = _mm512_setzero_si512();
__m512i sum5 = _mm512_setzero_si512();
for(int k = 0; k < sse_width; k++) {
__m512i b = *(B_row + k);
__m512i b_positive = _mm512_abs_epi8(b);
__mmask64 neg_mask = _mm512_test_epi8_mask(b, _mm512_set1_epi8(-128));
Accum(zeros, *(A1_row + k), b, b_positive, neg_mask, sum1);
Accum(zeros, *(A2_row + k), b, b_positive, neg_mask, sum2);
Accum(zeros, *(A3_row + k), b, b_positive, neg_mask, sum3);
Accum(zeros, *(A4_row + k), b, b_positive, neg_mask, sum4);
Accum(zeros, *(A5_row + k), b, b_positive, neg_mask, sum5);
}
put.Write(C + i * num_B_rows + j, Reduce16to32(sum1, sum2, sum3, sum4));
put.Write(C + (i + 4) * num_B_rows + j, Reduce16to32(sum5));
}
/* fall through */
case 4:
for(int j = 0; j < num_B_rows; j++) {
const __m512i *B_row = B + j * sse_width;
__m512i sum1 = _mm512_setzero_si512();
__m512i sum2 = _mm512_setzero_si512();
__m512i sum3 = _mm512_setzero_si512();
__m512i sum4 = _mm512_setzero_si512();
for(int k = 0; k < sse_width; k++) {
__m512i b = *(B_row + k);
__m512i b_positive = _mm512_abs_epi8(b);
__mmask64 neg_mask = _mm512_test_epi8_mask(b, _mm512_set1_epi8(-128));
Accum(zeros, *(A1_row + k), b, b_positive, neg_mask, sum1);
Accum(zeros, *(A2_row + k), b, b_positive, neg_mask, sum2);
Accum(zeros, *(A3_row + k), b, b_positive, neg_mask, sum3);
Accum(zeros, *(A4_row + k), b, b_positive, neg_mask, sum4);
}
put.Write(C + i * num_B_rows + j, Reduce16to32(sum1, sum2, sum3, sum4));
}
/* fall through */
case 3:
for(int j = 0; j < num_B_rows; j++) {
const __m512i *B_row = B + j * sse_width;
__m512i sum1 = _mm512_setzero_si512();
__m512i sum2 = _mm512_setzero_si512();
__m512i sum3 = _mm512_setzero_si512();
for(int k = 0; k < sse_width; k++) {
__m512i b = *(B_row + k);
__m512i b_positive = _mm512_abs_epi8(b);
__mmask64 neg_mask = _mm512_test_epi8_mask(b, _mm512_set1_epi8(-128));
Accum(zeros, *(A1_row + k), b, b_positive, neg_mask, sum1);
Accum(zeros, *(A2_row + k), b, b_positive, neg_mask, sum2);
Accum(zeros, *(A3_row + k), b, b_positive, neg_mask, sum3);
}
put.Write(C + i * num_B_rows + j, Reduce16to32(sum1, sum2));
put.Write(C + (i + 2) * num_B_rows + j, Reduce16to32(sum3));
}
/* fall through */
case 2:
for(int j = 0; j < num_B_rows; j++) {
const __m512i *B_row = B + j * sse_width;
__m512i sum1 = _mm512_setzero_si512();
__m512i sum2 = _mm512_setzero_si512();
for(int k = 0; k < sse_width; k++) {
__m512i b = *(B_row + k);
__m512i b_positive = _mm512_abs_epi8(b);
__mmask64 neg_mask = _mm512_test_epi8_mask(b, _mm512_set1_epi8(-128));
Accum(zeros, *(A1_row + k), b, b_positive, neg_mask, sum1);
Accum(zeros, *(A2_row + k), b, b_positive, neg_mask, sum2);
}
put.Write(C + i * num_B_rows + j, Reduce16to32(sum1, sum2));
}
/* fall through */
case 1:
for(int j = 0; j < num_B_rows; j++) {
const __m512i *B_row = B + j * sse_width;
__m512i sum1 = _mm512_setzero_si512();
for(int k = 0; k < sse_width; k++) {
__m512i b = *(B_row + k);
__m512i b_positive = _mm512_abs_epi8(b);
__mmask64 neg_mask = _mm512_test_epi8_mask(b, _mm512_set1_epi8(-128));
Accum(zeros, *(A1_row + k), b, b_positive, neg_mask, sum1);
}
put.Write(C + i * num_B_rows + j, Reduce16to32(sum1));
}
}
}
} // namespace int16
} // namespace cpu
} // namespace marian
#endif

View File

@ -1,187 +0,0 @@
#include "int_gemm.h"
#include "tensors/tensor_allocator.h"
#include "tensors/tensor_operators.h"
#include <emmintrin.h>
#include <immintrin.h>
#include <tmmintrin.h>
#include <xmmintrin.h>
#include <cassert>
#include <cstddef>
namespace marian {
namespace cpu {
namespace int16 {
#ifdef __AVX512F__
void AVX_Quantize16(const float* input,
int16_t* output,
float quant_mult,
std::size_t size);
void AVX_Quantize8(const float* input,
int8_t* output,
float quant_mult,
std::size_t size);
void AVX_MatrixMult16(const __m512i* A,
const __m512i* B,
float* C,
float unquant_mult,
int num_A_rows,
int num_B_rows,
int width);
void AVX_MatrixMult8(const __m512i* A,
const __m512i* B,
float* C,
float unquant_mult,
int num_A_rows,
int num_B_rows,
int width);
#endif
void SSE_Quantize16(const float* input,
__m128i* output,
float quant_mult,
int num_rows,
int width);
void SSE_MatrixMult16(const __m128i* A,
const __m128i* B,
float* C,
float unquant_mult,
int num_A_rows,
int num_B_rows,
int width);
void Quantize16(marian::Tensor out,
const marian::Tensor in,
float /*clipValue*/) {
float quant_mult = (float)pow(2.0, BITS);
#ifdef __AVX512F__
AVX_Quantize16(
in->data(), out->data<int16_t>(), quant_mult, in->shape().elements());
#else
int num_rows = in->shape().elements() / in->shape()[-1];
int width = in->shape()[-1];
SSE_Quantize16(in->data(), out->data<__m128i>(), quant_mult, num_rows, width);
#endif
}
void Quantize8(marian::Tensor out,
const marian::Tensor in,
float clipValue) {
#ifdef __AVX512F__
float quant_mult = 127.0f / clipValue;
AVX_Quantize8(
in->data(), out->data<int8_t>(), quant_mult, in->shape().elements());
#else
out; in; clipValue;
ABORT("8-bit is currently only AVX512");
#endif
}
// This operates on floats after processing so doesn't care about int8_t vs
// int16_t.
void AddBias(marian::Tensor C, const marian::Tensor Bias) {
float* y = C->data();
const float* x = C->data();
const float* bias = Bias->data();
const int m = C->shape().elements() / C->shape()[-1];
const int n = C->shape()[-1];
for(int j = 0; j < m; ++j) {
int i = 0;
#ifdef __AVX512F__
int n16 = n & ~15;
for(; i < n16; i += 16) {
__m512 ai = _mm512_loadu_ps(x + j * n + i);
__m512 bi = _mm512_loadu_ps(bias + i);
__m512 yi = _mm512_add_ps(ai, bi);
_mm512_storeu_ps(y + j * n + i, yi);
}
#else
int n4 = (n / 4) * 4;
for(; i < n4; i += 4) {
__m128 ai = _mm_loadu_ps(x + j * n + i);
__m128 bi = _mm_loadu_ps(bias + i);
__m128 yi = _mm_add_ps(ai, bi);
_mm_storeu_ps(y + j * n + i, yi);
}
#endif
for(; i < n; i++) {
y[j * n + i] = x[j * n + i] + bias[i];
}
}
}
void ProdInt16(marian::Tensor C,
const marian::Tensor A,
const marian::Tensor B,
float scale) {
ABORT_IF(scale != 1, "Scale other than 1 not supported");
// @TODO: make this a parameter
float quant_mult = (float)pow(2.0, BITS);
// If we quantize with multiplier quant_mult and then multiply the values together, the result
// is scaled by quant_mult * quant_mult. So we must multiply by 1.0 / (quant_mult * quant_mult) to get back
// the original value.
float unquant_mult = 1.0f / (quant_mult * quant_mult);
float* fC = C->data();
int num_A_rows = A->shape().elements() / A->shape()[-1];
int num_B_rows = B->shape().elements() / B->shape()[-1];
int width = B->shape()[-1];
#ifdef __AVX512F__
AVX_MatrixMult16(A->data<__m512i>(),
B->data<__m512i>(),
fC,
unquant_mult,
num_A_rows,
num_B_rows,
width);
#else
SSE_MatrixMult16(A->data<__m128i>(),
B->data<__m128i>(),
fC,
unquant_mult,
num_A_rows,
num_B_rows,
width);
#endif
}
void ProdInt8(marian::Tensor C,
const marian::Tensor A,
const marian::Tensor B,
float scale,
float clipValue) {
#ifdef __AVX512F__
// This would be easy...
ABORT_IF(scale != 1, "Scale other than 1 not supported");
float quant_mult = 127.0f / clipValue;
float unquant_mult = 1.0f / (quant_mult * quant_mult);
float* fC = C->data();
int num_A_rows = A->shape().elements() / A->shape()[-1];
int num_B_rows = B->shape().elements() / B->shape()[-1];
int width = B->shape()[-1];
AVX_MatrixMult8(A->data<__m512i>(),
B->data<__m512i>(),
fC,
unquant_mult,
num_A_rows,
num_B_rows,
width);
#else
C; A; B; scale; clipValue;
ABORT("8-bit is currently only AVX512");
#endif
}
} // namespace int16
} // namespace cpu
} // namespace marian

View File

@ -1,36 +0,0 @@
#pragma once
#include "tensors/tensor.h"
namespace marian {
namespace cpu {
namespace int16 {
const int BITS = 10;
void Quantize16(marian::Tensor out,
const marian::Tensor in,
float /*clipValue*/);
void Quantize8(marian::Tensor out,
const marian::Tensor in,
float clipValue);
// This operates on floats after processing so doesn't care about int8_t vs
// int16_t.
void AddBias(marian::Tensor C, const marian::Tensor Bias);
void ProdInt16(marian::Tensor C,
const marian::Tensor A,
const marian::Tensor B,
float scale);
void ProdInt8(marian::Tensor C,
const marian::Tensor A,
const marian::Tensor B,
float scale,
float clipValue);
} // namespace int16
} // namespace cpu
} // namespace marian

View File

@ -1,341 +0,0 @@
// Copyright (c) 2017 Microsoft Corporation
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include <emmintrin.h>
#include <immintrin.h>
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <tmmintrin.h>
#include <xmmintrin.h>
#include <cassert>
namespace marian {
namespace cpu {
namespace int16 {
// This is a reference implementation of 16-bit matrix multiplication described
// in "Sharp Models on Dull Hardware: Fast and Accurate Neural Machine
// Translation Decoding on the CPU". This model is not as fast as the one in the
// paper, because it uses SSE2 instead of AVX2. AVX2 instructions are only
// available on more modern CPUs (Haswell or later). The only difference between
// SSE2 and AVX2 is that SSE operates on 128-bit vectors and AVX2 operates on
// 256-bit vectors. So AVX2 can fit 16 16-bit integers instead of 8 16-bit
// integers. The algorithm is the same, you just replace these instructions with
// their 256-bit counterpart, i.e., _mm256_add_epi32, _mm256_madd_epi16,
// _mm256_hadd_epi32, ... Additional improvements can also be made from
// unrolling the for loop over num_B_rows in SSE_MatrixMult, which is not done
// here for clarity.
// ***************************************
// ************** IMPORTANT **************
// ***************************************
// The biggest "gotcha" when using this type of multiplication is dealing with
// overflow related to quantization. It is NOT enough to simply ensure that A
// and B fit into 16 bit integers. If A and B are quantized with $n$ bits, the
// result of multiplying them together will be quantized to $2n$ bits. So if
// they are near the boundary of the 16-bit mark, then the result will be near
// 32-bits and overflow. However, if we use, say, n = 10 bits, then the product
// is 20 bits. This gives us 12 bits left over for the accumulation. So as long
// as the width of the common dimension is less than 2^12 = 4096, it is
// *impossible* to overflow. If we used, say, n = 12 bits, then we have
// 32-(12*2) = 8 bits left over. So we *could* overflow if width > 2^8.
//
// So, the tradeoff is between quantization precision and possibility of
// overflow. A good general value is 10 bits, since this gives high precision
// (precision is 1/2^10 ~= 0.001, which is more than what's needed for almost
// all neural nets), and cannot overflow unless the matrix width is > 4096.
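// Worked example of the accounting above (added for clarity, following the same reasoning): with n = 10
// each quantized value uses at most 10 bits, so a single product uses at most 20 bits, leaving
// 32 - 20 = 12 bits for accumulation, i.e. up to 2^12 = 4096 terms of the dot product before the 32-bit
// accumulator could overflow; with n = 12 only 32 - 24 = 8 bits remain, so a width above 2^8 = 256 could overflow.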
// This quantizes floating point values into fixed-point 16-bit integers.
// Effectively, we are performing an SSE version of float x = ...; int16_t y =
// (int16_t)(quant_mult*x);
//
// Except that the casting is saturated. However, you should always ensure that
// the input fits into a fixed range anyways. I.e., you should ensure that
// quant_mult*x fits into the range [-2^15, 2^15]. This should always be
// possible because the value you're quantizing will either be NN weights or NN
// activations, both of which can be clipped to a fixed range during training.
void SSE_Quantize16(const float* input,
__m128i* output,
float quant_mult,
int num_rows,
int width) {
assert(width % 8 == 0);
int num_input_chunks = width / 8;
// Fill an SSE float with 4 copies of the quant mult
__m128 sse_quant_mult
= _mm_set_ps(quant_mult, quant_mult, quant_mult, quant_mult);
for(int i = 0; i < num_rows; i++) {
const float* input_row = input + i * width;
__m128i* output_row = output + i * num_input_chunks;
for(int j = 0; j < num_input_chunks; j++) {
const float* x = input_row + j * 8;
// Process 8 floats at once, since each __m128i can contain 8 16-bit
// integers.
// Load floats into SSE registers.
__m128 f_0 = _mm_loadu_ps(x);
__m128 f_1 = _mm_loadu_ps(x + 4);
// Multiply by quantization factor (e.g., if quant_mult = 1000.0, 0.34291
// --> 342.91)
__m128 m_0 = _mm_mul_ps(f_0, sse_quant_mult);
__m128 m_1 = _mm_mul_ps(f_1, sse_quant_mult);
// Cast float to 32-bit int (e.g., 342.91 --> 343)
__m128i i_0 = _mm_cvtps_epi32(m_0);
__m128i i_1 = _mm_cvtps_epi32(m_1);
// Cast 32-bit int to 16-bit int. You must ensure that these fit into the
// 16-bit range by clipping values during training.
*(output_row + j) = _mm_packs_epi32(i_0, i_1);
}
}
}
// We are multiplying A * B^T, as opposed to A * B. This is important because it
// means we can do consecutive memory access on A * B^T which allows us to take
// the most advantage of L1 cache.
//
// B is typically a weight matrix, so it can be pre-processed offline, and
// therefore this transpose does not cost anything. A is typically an activation
// minibatch matrix.
void SSE_MatrixMult16(const __m128i* qA,
const __m128i* qB,
float* fC,
float unquant_mult,
int num_A_rows,
int num_B_rows,
int width) {
assert(width % 8 == 0);
int sse_width = width / 8;
// We do loop unrolling over A. This is *significantly* faster
// since B can live in the registers. We are assuming that
// A is a multiple of 4, but we can add extra code to handle values of 1,
// 2, 3.
//
// We could also do loop unrolling over B, which adds some additional speedup.
// We don't do that for the sake of clarity.
//
// There are other memory access patterns we could do, e.g., put B on the
// outer loop. The justification is that A is typically small enough that it
// can live in L1 cache. B is usually a larger weight matrix, so it might not
// be able to. However, we are using each element of B four times while it's
// still in a register, so caching is not as important.
int mult4 = (num_A_rows / 4) * 4;
int rest = num_A_rows % 4;
int i = 0;
for(; i < mult4; i += 4) {
const __m128i* A1_row = qA + (i + 0) * sse_width;
const __m128i* A2_row = qA + (i + 1) * sse_width;
const __m128i* A3_row = qA + (i + 2) * sse_width;
const __m128i* A4_row = qA + (i + 3) * sse_width;
for(int j = 0; j < num_B_rows; j++) {
const __m128i* B_row = qB + j * sse_width;
__m128i sum1 = _mm_setzero_si128();
__m128i sum2 = _mm_setzero_si128();
__m128i sum3 = _mm_setzero_si128();
__m128i sum4 = _mm_setzero_si128();
// This is just a simple dot product, unrolled four ways.
for(int k = 0; k < sse_width; k++) {
__m128i b = *(B_row + k);
__m128i a1 = *(A1_row + k);
__m128i a2 = *(A2_row + k);
__m128i a3 = *(A3_row + k);
__m128i a4 = *(A4_row + k);
// _mm_madd_epi16 does multiply add on 8 16-bit integers and accumulates
// into four 32-bit integers. E.g., a1 = [f1, f2, f3, f4, f5, f6, f7,
// f8] (16-bit ints) b1 = [h1, h2, h3, h4, h5, h6, h7, h8] (16-bit ints)
// result = [f1*h1 + f2*h2, f3*h3 + f4*h4, f5*h5 + f6*h6, f7*h7 + f8*h8]
// (32-bit ints) Then _mm_add_epi32 just effectively does a += on these
// 32-bit integers.
sum1 = _mm_add_epi32(sum1, _mm_madd_epi16(b, a1));
sum2 = _mm_add_epi32(sum2, _mm_madd_epi16(b, a2));
sum3 = _mm_add_epi32(sum3, _mm_madd_epi16(b, a3));
sum4 = _mm_add_epi32(sum4, _mm_madd_epi16(b, a4));
}
// We now have each sum spread across 4 32-bit ints in SSE register, e.g.,
// sum1 = [r1, r2, r3, r4]. We need to compute r1 + r2 + r3 + r4.
//
// This uses 'horizontal add' to do that efficiently. The first add gets
// us [r1 + r2, r2 + r3, r1 + r2, r2 + r3] Then the second gets us. [r1 +
// r2 + r2 + r3, r2 + r3 + r1 + r2, r1 + r2 + r2 + r3, r2 + r3 + r1 + r2]
// E.g., each 32-bit in contains the full sum.
sum1 = _mm_hadd_epi32(sum1, sum1);
sum1 = _mm_hadd_epi32(sum1, sum1);
sum2 = _mm_hadd_epi32(sum2, sum2);
sum2 = _mm_hadd_epi32(sum2, sum2);
sum3 = _mm_hadd_epi32(sum3, sum3);
sum3 = _mm_hadd_epi32(sum3, sum3);
sum4 = _mm_hadd_epi32(sum4, sum4);
sum4 = _mm_hadd_epi32(sum4, sum4);
float* C1 = fC + (i + 0) * num_B_rows + j;
float* C2 = fC + (i + 1) * num_B_rows + j;
float* C3 = fC + (i + 2) * num_B_rows + j;
float* C4 = fC + (i + 3) * num_B_rows + j;
// Now that we have the full sum in each 32-bit register, we convert them
// to floats with _mm_cvtepi32_ps and take the first one with
// _mm_store_ss. We don't use an SSE instruction to unquantize, although
// we could. It doesn't really matter since most of the computation is in
// the above loop over the width.
//
// Also note that the memory accesses on C are not consecutive, but this is
// a tradeoff that we have to make. We can't have consecutive accesses of
// qA, qB, *and* C. But we access qA and qB a lot more so it makes sense
// to do it this way.
_mm_store_ss(C1, _mm_cvtepi32_ps(sum1));
*(C1) *= unquant_mult;
_mm_store_ss(C2, _mm_cvtepi32_ps(sum2));
*(C2) *= unquant_mult;
_mm_store_ss(C3, _mm_cvtepi32_ps(sum3));
*(C3) *= unquant_mult;
_mm_store_ss(C4, _mm_cvtepi32_ps(sum4));
*(C4) *= unquant_mult;
}
}
if(rest == 1) {
const __m128i* A1_row = qA + (i + 0) * sse_width;
for(int j = 0; j < num_B_rows; j++) {
const __m128i* B_row = qB + j * sse_width;
__m128i sum1 = _mm_setzero_si128();
// This is just a simple dot product, unrolled four ways.
for(int k = 0; k < sse_width; k++) {
__m128i b = *(B_row + k);
__m128i a1 = *(A1_row + k);
sum1 = _mm_add_epi32(sum1, _mm_madd_epi16(b, a1));
}
sum1 = _mm_hadd_epi32(sum1, sum1);
sum1 = _mm_hadd_epi32(sum1, sum1);
float* C1 = fC + (i + 0) * num_B_rows + j;
_mm_store_ss(C1, _mm_cvtepi32_ps(sum1));
*(C1) *= unquant_mult;
}
} else if(rest == 2) {
const __m128i* A1_row = qA + (i + 0) * sse_width;
const __m128i* A2_row = qA + (i + 1) * sse_width;
for(int j = 0; j < num_B_rows; j++) {
const __m128i* B_row = qB + j * sse_width;
__m128i sum1 = _mm_setzero_si128();
__m128i sum2 = _mm_setzero_si128();
for(int k = 0; k < sse_width; k++) {
__m128i b = *(B_row + k);
__m128i a1 = *(A1_row + k);
__m128i a2 = *(A2_row + k);
sum1 = _mm_add_epi32(sum1, _mm_madd_epi16(b, a1));
sum2 = _mm_add_epi32(sum2, _mm_madd_epi16(b, a2));
}
sum1 = _mm_hadd_epi32(sum1, sum1);
sum1 = _mm_hadd_epi32(sum1, sum1);
sum2 = _mm_hadd_epi32(sum2, sum2);
sum2 = _mm_hadd_epi32(sum2, sum2);
float* C1 = fC + (i + 0) * num_B_rows + j;
float* C2 = fC + (i + 1) * num_B_rows + j;
_mm_store_ss(C1, _mm_cvtepi32_ps(sum1));
*(C1) *= unquant_mult;
_mm_store_ss(C2, _mm_cvtepi32_ps(sum2));
*(C2) *= unquant_mult;
}
} else if(rest == 3) {
const __m128i* A1_row = qA + (i + 0) * sse_width;
const __m128i* A2_row = qA + (i + 1) * sse_width;
const __m128i* A3_row = qA + (i + 2) * sse_width;
for(int j = 0; j < num_B_rows; j++) {
const __m128i* B_row = qB + j * sse_width;
__m128i sum1 = _mm_setzero_si128();
__m128i sum2 = _mm_setzero_si128();
__m128i sum3 = _mm_setzero_si128();
for(int k = 0; k < sse_width; k++) {
__m128i b = *(B_row + k);
__m128i a1 = *(A1_row + k);
__m128i a2 = *(A2_row + k);
__m128i a3 = *(A3_row + k);
sum1 = _mm_add_epi32(sum1, _mm_madd_epi16(b, a1));
sum2 = _mm_add_epi32(sum2, _mm_madd_epi16(b, a2));
sum3 = _mm_add_epi32(sum3, _mm_madd_epi16(b, a3));
}
sum1 = _mm_hadd_epi32(sum1, sum1);
sum1 = _mm_hadd_epi32(sum1, sum1);
sum2 = _mm_hadd_epi32(sum2, sum2);
sum2 = _mm_hadd_epi32(sum2, sum2);
sum3 = _mm_hadd_epi32(sum3, sum3);
sum3 = _mm_hadd_epi32(sum3, sum3);
float* C1 = fC + (i + 0) * num_B_rows + j;
float* C2 = fC + (i + 1) * num_B_rows + j;
float* C3 = fC + (i + 2) * num_B_rows + j;
_mm_store_ss(C1, _mm_cvtepi32_ps(sum1));
*(C1) *= unquant_mult;
_mm_store_ss(C2, _mm_cvtepi32_ps(sum2));
*(C2) *= unquant_mult;
_mm_store_ss(C3, _mm_cvtepi32_ps(sum3));
*(C3) *= unquant_mult;
}
}
}
} // namespace int16
} // namespace cpu
} // namespace marian

View File

@ -66,16 +66,6 @@ public:
CudaCompute getCudaComputeCapability() { return compute_; }
// for CPU, sets to use optimized code for inference.
// for GPU, this is invalid. for gpu, isOptimized() function always returns false.
void setOptimized(bool optimize) override {
LOG_ONCE(info, "setOptimized() not supported for GPU_{}", optimize);
}
bool isOptimized() override {
return false;
}
private:
cublasHandle_t cublasHandle_{0}; // make sure it's 0, so it can be initalized lazily
cusparseHandle_t cusparseHandle_{0}; // as above

View File

@ -7,7 +7,9 @@ int main(int /*argc*/, char** /*argv*/) {
{
auto g = New<ExpressionGraph>(true);
g->setDevice({0, DeviceType::cpu});
g->getBackend()->setOptimized(false);
#if 0 // this file is not a real test, just used for manual stuff. Disable here by hand for now.
g->getBackend()->setInt16(false);
#endif
g->reserveWorkspaceMB(2512);
timer::AutoTimer timer;
@ -40,7 +42,44 @@ int main(int /*argc*/, char** /*argv*/) {
{
auto g = New<ExpressionGraph>(true);
g->setDevice({0, DeviceType::cpu});
g->getBackend()->setOptimized(true);
#if 0
g->getBackend()->setInt16(true);
#endif
g->reserveWorkspaceMB(2512);
timer::AutoTimer timer;
for(int i = 0; i < 100; ++i) {
g->clear();
auto x = g->constant({1, 4, 8, 256}, inits::glorotUniform());
auto W1 = g->param("W1", {256, 2048}, inits::glorotUniform());
auto b1 = g->param("b1", {1, 2048}, inits::glorotUniform());
auto out = affine(x, W1, b1);
for(int i = 2; i < 20; ++i) {
auto Wi = g->param("W" + std::to_string(i), {2048, 2048}, inits::glorotUniform());
auto bi = g->param("b" + std::to_string(i), {1, 2048}, inits::glorotUniform());
out = relu(affine(out, Wi, bi));
}
auto Wn = g->param("Wn", {2048, 256}, inits::glorotUniform());
auto bn = g->param("bn", {1, 256}, inits::glorotUniform());
auto y = affine(out, Wn, bn);
g->forward();
}
}
{
auto g = New<ExpressionGraph>(true);
g->setDevice({0, DeviceType::cpu});
#if 0
g->getBackend()->setInt8(true);
#endif
g->reserveWorkspaceMB(2512);
timer::AutoTimer timer;

View File

@ -19,7 +19,6 @@ AsyncGraphGroup::AsyncGraphGroup(Ptr<Options> config, Ptr<IMPIWrapper> mpi)
auto graph = New<ExpressionGraph>();
graph->setDevice(device);
graph->setCheckpointing(options_->get<bool>("gradient-checkpointing"));
graph->getBackend()->setClip(options_->get<float>("clip-gemm"));
graph->reserveWorkspaceMB(options_->get<size_t>("workspace"));
graphs_.push_back(graph);
shardOpt_.push_back(Optimizer(options_));

View File

@ -35,7 +35,6 @@ public:
graph_ = New<ExpressionGraph>();
graph_->setDevice(deviceId);
graph_->setCheckpointing(options_->get<bool>("gradient-checkpointing"));
graph_->getBackend()->setClip(options_->get<float>("clip-gemm"));
graph_->reserveWorkspaceMB(options_->get<size_t>("workspace"));
opt_ = Optimizer(options_);
builder_ = models::createCriterionFunctionFromOptions(options_, models::usage::training);

View File

@ -12,7 +12,6 @@ SyncGraphGroup::SyncGraphGroup(Ptr<Options> config, Ptr<IMPIWrapper> mpi)
graph->setDevice(device);
graph->setCheckpointing(options_->get<bool>("gradient-checkpointing"));
graph->reserveWorkspaceMB(options_->get<size_t>("workspace"));
graph->getBackend()->setClip(options_->get<float>("clip-gemm"));
graphs_.push_back(graph);
shardOpt_.push_back(Optimizer(options_));

View File

@ -87,10 +87,6 @@ public:
auto prec = options_->get<std::vector<std::string>>("precision", {"float32"});
graph->setDefaultElementType(typeFromString(prec[0]));
graph->setDevice(device);
graph->getBackend()->setClip(options_->get<float>("clip-gemm"));
if (device.type == DeviceType::cpu) {
graph->getBackend()->setOptimized(options_->get<bool>("optimize"));
}
graph->reserveWorkspaceMB(options_->get<size_t>("workspace"));
graphs_[id] = graph;
@ -229,10 +225,6 @@ public:
auto precison = options_->get<std::vector<std::string>>("precision", {"float32"});
graph->setDefaultElementType(typeFromString(precison[0])); // only use first type, used for parameter type in graph
graph->setDevice(device);
graph->getBackend()->setClip(options_->get<float>("clip-gemm"));
if (device.type == DeviceType::cpu) {
graph->getBackend()->setOptimized(options_->get<bool>("optimize"));
}
graph->reserveWorkspaceMB(options_->get<size_t>("workspace"));
graphs_.push_back(graph);