mirror of
https://github.com/marian-nmt/marian.git
synced 2024-10-03 18:17:12 +03:00
Integrate intgemm into marian (#595)
Adds intgemm as a module for Marian. Intgemm is @kpu 's 8/16 bit gemm library with support for architectures from SSE2 to AVX512VNNI Removes outdated integer code, related to the --optimize option Co-authored-by: Kenneth Heafield <github@kheafield.com> Co-authored-by: Kenneth Heafield <kpu@users.noreply.github.com> Co-authored-by: Ulrich Germann <ugermann@inf.ed.ac.uk> Co-authored-by: Marcin Junczys-Dowmunt <marcinjd@microsoft.com> Co-authored-by: Roman Grundkiewicz <rgrundkiewicz@gmail.com>
This commit is contained in:
parent
737f43014a
commit
600f5cbdec
3
.gitmodules
vendored
3
.gitmodules
vendored
@ -14,6 +14,9 @@
|
||||
path = src/3rd_party/fbgemm
|
||||
url = https://github.com/marian-nmt/FBGEMM
|
||||
branch = master
|
||||
[submodule "src/3rd_party/intgemm"]
|
||||
path = src/3rd_party/intgemm
|
||||
url = https://github.com/marian-nmt/intgemm/
|
||||
[submodule "src/3rd_party/simple-websocket-server"]
|
||||
path = src/3rd_party/simple-websocket-server
|
||||
url = https://github.com/marian-nmt/Simple-WebSocket-Server
|
||||
|
@ -9,6 +9,9 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
|
||||
## [Unreleased]
|
||||
|
||||
### Added
|
||||
- Added `intgemm8(ssse3|avx|avx512)?`, `intgemm16(sse2|avx|avx512)?` types to marian-conv with uses intgemm backend. Types intgemm8 and intgemm16 are hardware-agnostic, the other ones hardware-specific.
|
||||
- Shortlist is now always multiple-of-eight.
|
||||
- Added intgemm 8/16bit integer binary architecture agnostic format.
|
||||
- Add --train-embedder-rank for fine-tuning any encoder(-decoder) model for multi-lingual similarity via softmax-margin loss
|
||||
- Add --logical-epoch that allows to redefine the displayed epoch counter as a multiple of n data epochs, updates or labels. Also allows to define width of fractional part with second argument.
|
||||
- Add --metrics chrf for computing ChrF according to https://www.aclweb.org/anthology/W15-3049/ and SacreBLEU reference implementation
|
||||
@ -56,6 +59,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
|
||||
- Fix the runtime failures for FASTOPT on 32-bit builds (wasm just happens to be 32-bit) because it uses hashing with an inconsistent mix of uint64_t and size_t.
|
||||
|
||||
### Changed
|
||||
- Remove `--clip-gemm` which is obsolete and was never used anyway
|
||||
- Removed `--optimize` switch, instead we now determine compute type based on binary model.
|
||||
- Updated SentencePiece repository to version 8336bbd0c1cfba02a879afe625bf1ddaf7cd93c5 from https://github.com/google/sentencepiece.
|
||||
- Enabled compilation of SentencePiece by default since no dependency on protobuf anymore.
|
||||
- Changed default value of --sentencepiece-max-lines from 10000000 to 2000000 since apparently the new version doesn't sample automatically anymore (Not quite clear how that affects quality of the vocabulary).
|
||||
|
@ -93,8 +93,8 @@ if(MSVC)
|
||||
# Or maybe use these?
|
||||
set(INTRINSICS "/arch:AVX2")
|
||||
# set(INTRINSICS "/arch:AVX512")
|
||||
|
||||
set(CMAKE_CXX_FLAGS "/EHsc /DWIN32 /D_WINDOWS /DUNICODE /D_UNICODE /D_CRT_NONSTDC_NO_WARNINGS /D_CRT_SECURE_NO_WARNINGS ${DISABLE_GLOBALLY}")
|
||||
# /bigobj is necessary for expression_operators.cpp. See https://stackoverflow.com/questions/15110580/penalty-of-the-msvs-compiler-flag-bigobj
|
||||
set(CMAKE_CXX_FLAGS "/EHsc /DWIN32 /D_WINDOWS /DUNICODE /D_UNICODE /D_CRT_NONSTDC_NO_WARNINGS /D_CRT_SECURE_NO_WARNINGS /bigobj ${DISABLE_GLOBALLY}")
|
||||
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS} /MT /O2 ${INTRINSICS} /Zi /MP /GL /DNDEBUG")
|
||||
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS} /MTd /Od /Ob0 ${INTRINSICS} /RTC1 /Zi /D_DEBUG")
|
||||
|
||||
@ -438,6 +438,7 @@ endif(USE_MPI)
|
||||
###############################################################################
|
||||
# Find BLAS library
|
||||
if(COMPILE_CPU)
|
||||
set(EXT_LIBS ${EXT_LIBS} intgemm) # Enable intgemm when compiling CPU
|
||||
if(USE_APPLE_ACCELERATE)
|
||||
if(NOT APPLE)
|
||||
message(FATAL_ERROR "FATAL ERROR: Apple Accelerate only works on macOS.")
|
||||
|
@ -1 +1 @@
|
||||
Subproject commit 16914ae94c80f338c678f0461c4e45965149f6aa
|
||||
Subproject commit 97b2f95abab6134c1632b286e373e513ecc52020
|
5
src/3rd_party/CMakeLists.txt
vendored
5
src/3rd_party/CMakeLists.txt
vendored
@ -8,6 +8,11 @@ add_subdirectory(./zlib)
|
||||
add_subdirectory(./faiss)
|
||||
include_directories(./faiss)
|
||||
|
||||
if(COMPILE_CPU)
|
||||
set(INTGEMM_DONT_BUILD_TESTS ON CACHE BOOL "Disable intgemm tests")
|
||||
add_subdirectory(./intgemm)
|
||||
endif(COMPILE_CPU)
|
||||
|
||||
if(USE_FBGEMM)
|
||||
# @TODO: find out if this is somehow harmful. This is supppressing CMake warnings for CMAKE_SUPPRESS_DEVELOPER_WARNINGS
|
||||
# meant to silence CMakeFiles of 3rd_party tools.
|
||||
|
1
src/3rd_party/intgemm
vendored
Submodule
1
src/3rd_party/intgemm
vendored
Submodule
@ -0,0 +1 @@
|
||||
Subproject commit 874ceebbf53a85691b326495100b6361a2166cec
|
@ -5,6 +5,8 @@ include_directories(3rd_party)
|
||||
include_directories(3rd_party/SQLiteCpp/include)
|
||||
include_directories(3rd_party/sentencepiece)
|
||||
include_directories(3rd_party/fbgemm/include)
|
||||
include_directories(3rd_party/intgemm)
|
||||
include_directories(${CMAKE_BINARY_DIR}/src/3rd_party/intgemm) # running cmake on the intgemm submodule triggers config file generation in this directory.
|
||||
include_directories(${CMAKE_BINARY_DIR}/local/include)
|
||||
|
||||
set(MARIAN_SOURCES
|
||||
@ -41,6 +43,7 @@ set(MARIAN_SOURCES
|
||||
|
||||
3rd_party/cnpy/cnpy.cpp
|
||||
3rd_party/ExceptionWithCallStack.cpp
|
||||
|
||||
3rd_party/onnx/protobuf/onnx-ml.pb-wrapper.cpp
|
||||
|
||||
3rd_party/phf/phf.cc
|
||||
@ -52,10 +55,7 @@ set(MARIAN_SOURCES
|
||||
tensors/cpu/prod.cpp
|
||||
tensors/cpu/topk.cpp
|
||||
tensors/cpu/tensor_operators.cpp
|
||||
|
||||
tensors/cpu/sharp/int_gemm.cpp
|
||||
tensors/cpu/sharp/avx_gemm.cpp
|
||||
tensors/cpu/sharp/sse_gemm.cpp
|
||||
tensors/cpu/integer_common.cpp
|
||||
tensors/cpu/fbgemm/packed_gemm.cpp
|
||||
|
||||
graph/expression_graph.cpp
|
||||
|
@ -1,12 +1,10 @@
|
||||
#include "marian.h"
|
||||
|
||||
#include "common/cli_wrapper.h"
|
||||
#include "tensors/cpu/expression_graph_packable.h"
|
||||
#include "onnx/expression_graph_onnx_exporter.h"
|
||||
|
||||
#include <sstream>
|
||||
|
||||
#include "tensors/cpu/fbgemm/expression_graph_packable.h"
|
||||
#include "onnx/expression_graph_onnx_exporter.h"
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
using namespace marian;
|
||||
|
||||
@ -24,7 +22,9 @@ int main(int argc, char** argv) {
|
||||
cli->add<std::string>("--from,-f", "Input model", "model.npz");
|
||||
cli->add<std::string>("--to,-t", "Output model", "model.bin");
|
||||
cli->add<std::string>("--export-as", "Kind of conversion: marian-bin or onnx-{encode,decoder-step,decoder-init,decoder-stop}", "marian-bin");
|
||||
cli->add<std::string>("--gemm-type,-g", "GEMM Type to be used: float32, packed16, packed8avx2, packed8avx512", "float32");
|
||||
cli->add<std::string>("--gemm-type,-g", "GEMM Type to be used: float32, packed16, packed8avx2, packed8avx512, "
|
||||
"intgemm8, intgemm8ssse3, intgemm8avx2, intgemm8avx512, intgemm16, intgemm16sse2, intgemm16avx2, intgemm16avx512",
|
||||
"float32");
|
||||
cli->add<std::vector<std::string>>("--vocabs,-V", "Vocabulary file, required for ONNX export");
|
||||
cli->parse(argc, argv);
|
||||
options->merge(config);
|
||||
@ -35,19 +35,8 @@ int main(int argc, char** argv) {
|
||||
auto exportAs = options->get<std::string>("export-as");
|
||||
auto vocabPaths = options->get<std::vector<std::string>>("vocabs");// , std::vector<std::string>());
|
||||
|
||||
auto saveGemmTypeStr = options->get<std::string>("gemm-type", "float32");
|
||||
Type saveGemmType;
|
||||
if(saveGemmTypeStr == "float32") {
|
||||
saveGemmType = Type::float32;
|
||||
} else if(saveGemmTypeStr == "packed16") { // packed16 only supports AVX2. AVX512 might be added later
|
||||
saveGemmType = Type::packed16;
|
||||
} else if(saveGemmTypeStr == "packed8avx2") { // packed8 for AVX2
|
||||
saveGemmType = Type::packed8avx2;
|
||||
} else if(saveGemmTypeStr == "packed8avx512") { // packed8 for AVX512
|
||||
saveGemmType = Type::packed8avx512;
|
||||
} else {
|
||||
ABORT("Unknown gemm-type: {}", saveGemmTypeStr);
|
||||
}
|
||||
// We accept any type here and will later croak during packAndSave if the type cannot be used for conversion
|
||||
Type saveGemmType = typeFromString(options->get<std::string>("gemm-type", "float32"));
|
||||
|
||||
LOG(info, "Outputting {}, precision: {}", modelTo, saveGemmType);
|
||||
|
||||
@ -58,12 +47,11 @@ int main(int argc, char** argv) {
|
||||
|
||||
auto load = [&](Ptr<ExpressionGraph> graph) {
|
||||
graph->setDevice(CPU0);
|
||||
graph->getBackend()->setOptimized(false);
|
||||
|
||||
graph->load(modelFrom);
|
||||
graph->forward(); // run the initializers
|
||||
};
|
||||
|
||||
|
||||
if (exportAs == "marian-bin") {
|
||||
auto graph = New<ExpressionGraphPackable>();
|
||||
load(graph);
|
||||
|
@ -3,6 +3,7 @@
|
||||
#include "common/file_stream.h"
|
||||
#include "common/io_item.h"
|
||||
#include "common/types.h"
|
||||
#include "tensors/cpu/integer_common.h"
|
||||
|
||||
#include <string>
|
||||
|
||||
@ -57,13 +58,31 @@ void loadItems(const void* current, std::vector<io::Item>& items, bool mapped) {
|
||||
get<char>(current, offset);
|
||||
|
||||
for(int i = 0; i < numHeaders; ++i) {
|
||||
// For intgemm AVX512 and AVX512VNNI have the same arangement, but the VNNI algorithm is faster.
|
||||
// Change the type to the fastest one supported.
|
||||
if (items[i].type == Type::intgemm8avx512) {
|
||||
items[i].type = cpu::integer::getIntgemmType(Type::intgemm8);
|
||||
}
|
||||
if(items[i].mapped) { // memory-mapped, hence only set pointer
|
||||
// @TOOD: verify this actually works for the hardware-specific ones like intgemm8avx2
|
||||
ABORT_IF(items[i].type == Type::intgemm8 || items[i].type == Type::intgemm16, "mmap format not supported for hardware non-specific intgemm matrices");
|
||||
items[i].ptr = get<char>(current, headers[i].dataLength);
|
||||
} else { // reading into item data
|
||||
size_t len = headers[i].dataLength;
|
||||
items[i].bytes.resize(len);
|
||||
const char* ptr = get<char>(current, len);
|
||||
std::copy(ptr, ptr + len, items[i].bytes.begin());
|
||||
// Intgemm8/16 matrices in binary model are just quantized, however they also need to be reordered
|
||||
// Reordering depends on the architecture (SSE/AVX2/AVX512) so we read in the quantized matrices and
|
||||
// then reorder them before adding them as a parameter in the graph.
|
||||
if (matchType<intgemm8>(items[i].type)) {
|
||||
items[i].type = cpu::integer::getIntgemmType(Type::intgemm8);
|
||||
cpu::integer::prepareAndTransposeB<Type::intgemm8>(items[i], ptr);
|
||||
} else if (matchType<intgemm16>(items[i].type)) {
|
||||
items[i].type = cpu::integer::getIntgemmType(Type::intgemm16);
|
||||
cpu::integer::prepareAndTransposeB<Type::intgemm16>(items[i], ptr);
|
||||
} else {
|
||||
std::copy(ptr, ptr + len, items[i].bytes.begin());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -134,8 +134,6 @@ void ConfigParser::addOptionsGeneral(cli::CLIWrapper& cli) {
|
||||
"Suppress logging for translation");
|
||||
cli.add<size_t>("--seed",
|
||||
"Seed for all random number generators. 0 means initialize randomly");
|
||||
cli.add<float>("--clip-gemm",
|
||||
"If not 0 clip GEMM input values to +/- arg");
|
||||
cli.add<bool>("--interpolate-env-vars",
|
||||
"allow the use of environment variables in paths, of the form ${VAR_NAME}");
|
||||
cli.add<bool>("--relative-paths",
|
||||
@ -671,15 +669,13 @@ void ConfigParser::addOptionsTranslation(cli::CLIWrapper& cli) {
|
||||
addSuboptionsDevices(cli);
|
||||
addSuboptionsBatching(cli);
|
||||
|
||||
cli.add<bool>("--optimize",
|
||||
"Optimize speed aggressively sacrificing memory or precision");
|
||||
cli.add<bool>("--skip-cost",
|
||||
"Ignore model cost during translation, not recommended for beam-size > 1");
|
||||
cli.add<bool>("--fp16",
|
||||
"Shortcut for mixed precision inference with float16, corresponds to: --precision float16");
|
||||
cli.add<std::vector<std::string>>("--precision",
|
||||
"Mixed precision for inference, set parameter type in expression graph",
|
||||
{"float32"});
|
||||
cli.add<bool>("--skip-cost",
|
||||
"Ignore model cost during translation, not recommended for beam-size > 1");
|
||||
|
||||
cli.add<std::vector<std::string>>("--shortlist",
|
||||
"Use softmax shortlist: path first best prune");
|
||||
@ -737,8 +733,6 @@ void ConfigParser::addOptionsScoring(cli::CLIWrapper& cli) {
|
||||
addSuboptionsDevices(cli);
|
||||
addSuboptionsBatching(cli);
|
||||
|
||||
cli.add<bool>("--optimize",
|
||||
"Optimize speed aggressively sacrificing memory or precision");
|
||||
cli.add<bool>("--fp16",
|
||||
"Shortcut for mixed precision inference with float16, corresponds to: --precision float16");
|
||||
cli.add<std::vector<std::string>>("--precision",
|
||||
@ -776,12 +770,10 @@ void ConfigParser::addOptionsEmbedding(cli::CLIWrapper& cli) {
|
||||
addSuboptionsDevices(cli);
|
||||
addSuboptionsBatching(cli);
|
||||
|
||||
cli.add<bool>("--optimize",
|
||||
"Optimize speed aggressively sacrificing memory or precision");
|
||||
cli.add<bool>("--fp16",
|
||||
"Shortcut for mixed precision inference with float16, corresponds to: --precision float16");
|
||||
cli.add<std::vector<std::string>>("--precision",
|
||||
"Mixed precision for inference, set parameter type in expression graph",
|
||||
"Mixed precision for inference, set parameter type in expression graph. Supported values: float32, float16",
|
||||
{"float32"});
|
||||
|
||||
cli.switchGroup(previous_group);
|
||||
@ -934,7 +926,6 @@ void ConfigParser::addSuboptionsQuantization(cli::CLIWrapper& cli) {
|
||||
// clang-format on
|
||||
}
|
||||
|
||||
|
||||
cli::mode ConfigParser::getMode() const { return mode_; }
|
||||
|
||||
Ptr<Options> ConfigParser::parseOptions(int argc, char** argv, bool doValidate) {
|
||||
|
@ -26,13 +26,16 @@ size_t requiredBytes(const Shape& shape, Type type) {
|
||||
ABORT("Not a supported data type: {}", type);
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
#endif // USE_FBGEMM
|
||||
|
||||
if (isIntgemm(type)) {
|
||||
/* Intgemm tensors have an extra float at the back that stores the quantization multiplier */
|
||||
return shape.elements() * sizeOf(type) + sizeOf(Type::float32);
|
||||
} else {
|
||||
return shape.elements() * sizeOf(type);
|
||||
}
|
||||
#else
|
||||
return shape.elements() * sizeOf(type);
|
||||
#endif // USE_FBGEMM
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
} // namespace marian
|
@ -135,20 +135,26 @@ do { \
|
||||
namespace marian {
|
||||
|
||||
// small struct to enable templating based on types use for packing
|
||||
struct packed16 {
|
||||
uint16_t x;
|
||||
};
|
||||
struct packed16 { uint16_t x; };
|
||||
|
||||
// small struct to enable templating based on types use for packing. This is a memory holder.
|
||||
// There's no difference between packed8avx2 and packed8avx512. But, they are separately defined to be distinguished.
|
||||
struct packed8avx2 {
|
||||
uint8_t x;
|
||||
};
|
||||
struct packed8avx2 { uint8_t x; };
|
||||
struct packed8avx512 { uint8_t x; };
|
||||
|
||||
// similar to the packed16, but to use with 16bit intgemm model packing.
|
||||
struct intgemm16 { int16_t x; };
|
||||
struct intgemm16sse2 { int16_t x; };
|
||||
struct intgemm16avx2 { int16_t x; };
|
||||
struct intgemm16avx512 { int16_t x; };
|
||||
|
||||
// similar to packed8* but for intgemm 8bit model packing.
|
||||
struct intgemm8 { int8_t x; };
|
||||
struct intgemm8ssse3 { int8_t x; };
|
||||
struct intgemm8avx2 { int8_t x; };
|
||||
struct intgemm8avx512 { int8_t x; };
|
||||
struct intgemm8avx512vnni { int8_t x; };
|
||||
|
||||
// small struct to enable templating based on types use for packing. This is a memory holder.
|
||||
struct packed8avx512 {
|
||||
uint8_t x;
|
||||
};
|
||||
|
||||
#ifndef __CUDACC__ // vectorized types not available from .cu files
|
||||
|
||||
@ -214,17 +220,22 @@ struct float32x8 {
|
||||
#endif
|
||||
|
||||
// Internal to types.h, don't use. Use test functions below.
|
||||
enum class TypeClass : size_t {
|
||||
signed_type = 0x0100,
|
||||
unsigned_type = 0x0200,
|
||||
float_type = 0x0400,
|
||||
enum class TypeClass : size_t { // size_type has 8 bytes, so we can have 16 fields here, currently using 5. Extend to the left for back-compat.
|
||||
// built-in type classes
|
||||
signed_type = 0x00100,
|
||||
unsigned_type = 0x00200,
|
||||
float_type = 0x00400,
|
||||
|
||||
packed_type = 0x0800, // special packed (CPU cache friendly) type class, used in FBGEMM, not meant to be used anywhere else
|
||||
avx2_type = 0x1000, // processor-specific layout for avx2, currently used for FBGEMM only
|
||||
avx512_type = 0x2000, // processor-specific layout for avx512, currently used for FBGEMM only
|
||||
avx2_type = 0x01000, // processor-specific layout for avx2, currently used for FBGEMM only (keep 0x1000 for back-compat)
|
||||
avx512_type = 0x02000, // processor-specific layout for avx512, currently used for FBGEMM only (keep 0x2000 for back-compat)
|
||||
sse2_type = 0x04000, // processor-specific layout for sse2, currently used for Intgemm only
|
||||
ssse3_type = 0x08000, // processor-specific layout for ssse3, currently used for Intgemm only
|
||||
|
||||
size_mask = 0x00FF,
|
||||
class_mask = 0xFF00
|
||||
packed_type = 0x00800, // special packed (CPU cache friendly) type class, used in FBGEMM. Annoyingly we need to keep 0x800 for back-compat, would be nicer to align with intgemm
|
||||
intgemm_type = 0x10000, // intgemm quantized architecture agnostic models
|
||||
|
||||
size_mask = 0x000FF, // maximum allowed size is 256 bytes right now; if more are required, extend the size field
|
||||
class_mask = 0xFFF00, // three fields for different type classes, if more classes are added we need to increase the number of fields here
|
||||
};
|
||||
|
||||
constexpr inline size_t operator+(TypeClass typeClass, size_t val) {
|
||||
@ -251,10 +262,21 @@ enum class Type : size_t {
|
||||
float32 = TypeClass::float_type + 4u,
|
||||
float64 = TypeClass::float_type + 8u,
|
||||
|
||||
packed16 = TypeClass::packed_type + 2u, // special type for FBGEMM, not meant to be used anywhere else, not meant to be accessed invidually. Internal actual type (uint16) is meaningless.
|
||||
packed8avx2 = TypeClass::packed_type + 1u + TypeClass::avx2_type, // special type for FBGEMM with AVX2, not meant to be used anywhere else, not meant to be accessed invidually. Internal actual type (uint8) is meaningless.
|
||||
packed8avx512 = TypeClass::packed_type + 1u + TypeClass::avx512_type, // special type for FBGEMM with AVX512, not meant to be used anywhere else, not meant to be accessed invidually. Internal actual type (uint8) is meaningless.
|
||||
packed16 = TypeClass::packed_type + 2u, // special type for FBGEMM, not meant to be used anywhere else, not meant to be accessed invidually. Internal actual type (uint16) is meaningless.
|
||||
packed8avx2 = TypeClass::packed_type + 1u + TypeClass::avx2_type, // special type for FBGEMM with AVX2, not meant to be used anywhere else, not meant to be accessed invidually. Internal actual type (uint8) is meaningless.
|
||||
packed8avx512 = TypeClass::packed_type + 1u + TypeClass::avx512_type, // special type for FBGEMM with AVX512, not meant to be used anywhere else, not meant to be accessed invidually. Internal actual type (uint8) is meaningless.
|
||||
|
||||
intgemm8 = TypeClass::intgemm_type + 1u, // Int8 quantized (not packed) matrices for intgemm
|
||||
intgemm16 = TypeClass::intgemm_type + 2u, // Int16 quantized (not packed) matrices for intgemm
|
||||
|
||||
intgemm8ssse3 = TypeClass::intgemm_type + 1u + TypeClass::ssse3_type, // Int8 quantized and packed (ssse3) matrices for intgemm
|
||||
intgemm8avx2 = TypeClass::intgemm_type + 1u + TypeClass::avx2_type, // Int8 quantized and packed (avx2) matrices for intgemm
|
||||
intgemm8avx512 = TypeClass::intgemm_type + 1u + TypeClass::avx512_type, // Int8 quantized and packed (avx512) matrices for intgemm
|
||||
intgemm8avx512vnni = TypeClass::intgemm_type + 1u + TypeClass::avx512_type + 4096u, // Int8 quantized and packed (avx512) matrices for intgemm. VNNI algorithm
|
||||
|
||||
intgemm16sse2 = TypeClass::intgemm_type + 2u + TypeClass::sse2_type, // Int16 quantized and packed (sse2) matrices for intgemm
|
||||
intgemm16avx2 = TypeClass::intgemm_type + 2u + TypeClass::avx2_type, // Int16 quantized and packed (avx2) matrices for intgemm
|
||||
intgemm16avx512 = TypeClass::intgemm_type + 2u + TypeClass::avx512_type, // Int16 quantized and packed (avx512) matrices for intgemm
|
||||
};
|
||||
|
||||
static inline size_t operator&(TypeClass typeClass, Type type) {
|
||||
@ -289,6 +311,14 @@ static inline bool isPacked(Type type) {
|
||||
return (TypeClass::packed_type & type) != 0;
|
||||
}
|
||||
|
||||
static inline bool isSse2(Type type) {
|
||||
return (TypeClass::sse2_type & type) != 0;
|
||||
}
|
||||
|
||||
static inline bool isSsse3(Type type) {
|
||||
return (TypeClass::ssse3_type & type) != 0;
|
||||
}
|
||||
|
||||
static inline bool isAvx2(Type type) {
|
||||
return (TypeClass::avx2_type & type) != 0;
|
||||
}
|
||||
@ -297,6 +327,10 @@ static inline bool isAvx512(Type type) {
|
||||
return (TypeClass::avx512_type & type) != 0;
|
||||
}
|
||||
|
||||
static inline bool isIntgemm(Type type) {
|
||||
return (TypeClass::intgemm_type & type) != 0;
|
||||
}
|
||||
|
||||
size_t requiredBytes(const Shape& shape, Type type); // towards Frank's vision of joint Shape/Type
|
||||
|
||||
template <typename T>
|
||||
@ -314,13 +348,24 @@ template <> inline bool matchType<uint16_t>(Type type) { return type == Type::ui
|
||||
template <> inline bool matchType<uint32_t>(Type type) { return type == Type::uint32; }
|
||||
template <> inline bool matchType<uint64_t>(Type type) { return type == Type::uint64; }
|
||||
|
||||
template <> inline bool matchType<float16>(Type type) { return type == Type::float16; }
|
||||
template <> inline bool matchType<float>(Type type) { return type == Type::float32; }
|
||||
template <> inline bool matchType<double>(Type type) { return type == Type::float64; }
|
||||
template <> inline bool matchType<float16>(Type type) { return type == Type::float16; }
|
||||
template <> inline bool matchType<float>(Type type) { return type == Type::float32; }
|
||||
template <> inline bool matchType<double>(Type type) { return type == Type::float64; }
|
||||
|
||||
template <> inline bool matchType<packed16>(Type type) { return type == Type::packed16; }
|
||||
template <> inline bool matchType<packed8avx2>(Type type) { return type == Type::packed8avx2; }
|
||||
template <> inline bool matchType<packed8avx512>(Type type) { return type == Type::packed8avx512; }
|
||||
template <> inline bool matchType<packed16>(Type type) { return type == Type::packed16; }
|
||||
template <> inline bool matchType<packed8avx2>(Type type) { return type == Type::packed8avx2; }
|
||||
template <> inline bool matchType<packed8avx512>(Type type) { return type == Type::packed8avx512; }
|
||||
|
||||
template <> inline bool matchType<intgemm8>(Type type) { return type == Type::intgemm8; }
|
||||
template <> inline bool matchType<intgemm8ssse3>(Type type) { return type == Type::intgemm8ssse3; }
|
||||
template <> inline bool matchType<intgemm8avx2>(Type type) { return type == Type::intgemm8avx2; }
|
||||
template <> inline bool matchType<intgemm8avx512>(Type type) { return type == Type::intgemm8avx512; }
|
||||
template <> inline bool matchType<intgemm8avx512vnni>(Type type) { return type == Type::intgemm8avx512vnni; }
|
||||
|
||||
template <> inline bool matchType<intgemm16>(Type type) { return type == Type::intgemm16; }
|
||||
template <> inline bool matchType<intgemm16sse2>(Type type) { return type == Type::intgemm16sse2; }
|
||||
template <> inline bool matchType<intgemm16avx2>(Type type) { return type == Type::intgemm16avx2; }
|
||||
template <> inline bool matchType<intgemm16avx512>(Type type) { return type == Type::intgemm16avx512; }
|
||||
// clang-format on
|
||||
|
||||
static inline std::ostream& operator<<(std::ostream& out, Type type) {
|
||||
@ -342,6 +387,16 @@ static inline std::ostream& operator<<(std::ostream& out, Type type) {
|
||||
case Type::packed16 : out << "packed16"; break;
|
||||
case Type::packed8avx2 : out << "packed8avx2"; break;
|
||||
case Type::packed8avx512 : out << "packed8avx512"; break;
|
||||
|
||||
case Type::intgemm8 : out << "intgemm8"; break;
|
||||
case Type::intgemm8ssse3 : out << "intgemm8ssse3"; break;
|
||||
case Type::intgemm8avx2 : out << "intgemm8avx2"; break;
|
||||
case Type::intgemm8avx512 : out << "intgemm8avx512"; break;
|
||||
case Type::intgemm8avx512vnni : out << "intgemm8avx512vnni"; break;
|
||||
case Type::intgemm16 : out << "intgemm16"; break;
|
||||
case Type::intgemm16sse2 : out << "intgemm16sse2"; break;
|
||||
case Type::intgemm16avx2 : out << "intgemm16avx2"; break;
|
||||
case Type::intgemm16avx512 : out << "intgemm16avx512"; break;
|
||||
}
|
||||
return out;
|
||||
}
|
||||
@ -350,12 +405,12 @@ template <typename T>
|
||||
inline std::string request();
|
||||
|
||||
// clang-format off
|
||||
template <> inline std::string request<int8_t>() { return "int8"; }
|
||||
template <> inline std::string request<int8_t>() { return "int8"; }
|
||||
template <> inline std::string request<int16_t>() { return "int16"; }
|
||||
template <> inline std::string request<int32_t>() { return "int32"; }
|
||||
template <> inline std::string request<int64_t>() { return "int64"; }
|
||||
|
||||
template <> inline std::string request<uint8_t>() { return "uint8"; }
|
||||
template <> inline std::string request<uint8_t>() { return "uint8"; }
|
||||
template <> inline std::string request<uint16_t>() { return "uint16"; }
|
||||
template <> inline std::string request<uint32_t>() { return "uint32"; }
|
||||
template <> inline std::string request<uint64_t>() { return "uint64"; }
|
||||
@ -364,9 +419,19 @@ template <> inline std::string request<float16>() { return "float16"; }
|
||||
template <> inline std::string request<float>() { return "float32"; }
|
||||
template <> inline std::string request<double>() { return "float64"; }
|
||||
|
||||
template <> inline std::string request<packed16>() { return "packed16"; }
|
||||
template <> inline std::string request<packed8avx2>() { return "packed8avx2"; }
|
||||
template <> inline std::string request<packed8avx512>() { return "packed8avx512"; }
|
||||
template <> inline std::string request<packed16>() { return "packed16"; }
|
||||
template <> inline std::string request<packed8avx2>() { return "packed8avx2"; }
|
||||
template <> inline std::string request<packed8avx512>() { return "packed8avx512"; }
|
||||
|
||||
template <> inline std::string request<intgemm8>() { return "intgemm8"; }
|
||||
template <> inline std::string request<intgemm8ssse3>() { return "intgemm8ssse3"; }
|
||||
template <> inline std::string request<intgemm8avx2>() { return "intgemm8avx2"; }
|
||||
template <> inline std::string request<intgemm8avx512>() { return "intgemm8avx512"; }
|
||||
template <> inline std::string request<intgemm8avx512vnni>() { return "intgemm8avx512vnni"; }
|
||||
template <> inline std::string request<intgemm16>() { return "intgemm16"; }
|
||||
template <> inline std::string request<intgemm16sse2>() { return "intgemm16sse2"; }
|
||||
template <> inline std::string request<intgemm16avx2>() { return "intgemm16avx2"; }
|
||||
template <> inline std::string request<intgemm16avx512>() { return "intgemm16avx512"; }
|
||||
// clang-format on
|
||||
|
||||
static Type inline typeFromString(const std::string& str) {
|
||||
@ -402,18 +467,38 @@ static Type inline typeFromString(const std::string& str) {
|
||||
if(str == "packed8avx512")
|
||||
return Type::packed8avx512;
|
||||
|
||||
if(str == "intgemm8")
|
||||
return Type::intgemm8;
|
||||
if(str == "intgemm8ssse3")
|
||||
return Type::intgemm8ssse3;
|
||||
if(str == "intgemm8avx2")
|
||||
return Type::intgemm8avx2;
|
||||
if(str == "intgemm8avx512")
|
||||
return Type::intgemm8avx512;
|
||||
if(str == "intgemm8avx512vnni")
|
||||
return Type::intgemm8avx512vnni;
|
||||
|
||||
if(str == "intgemm16")
|
||||
return Type::intgemm16;
|
||||
if(str == "intgemm16sse2")
|
||||
return Type::intgemm16sse2;
|
||||
if(str == "intgemm16avx2")
|
||||
return Type::intgemm16avx2;
|
||||
if(str == "intgemm16avx512")
|
||||
return Type::intgemm16avx512;
|
||||
|
||||
ABORT("Unknown type {}", str);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline Type typeId();
|
||||
|
||||
template <> inline Type typeId<int8_t>() { return Type::int8; }
|
||||
template <> inline Type typeId<int8_t>() { return Type::int8; }
|
||||
template <> inline Type typeId<int16_t>() { return Type::int16; }
|
||||
template <> inline Type typeId<int32_t>() { return Type::int32; }
|
||||
template <> inline Type typeId<int64_t>() { return Type::int64; }
|
||||
|
||||
template <> inline Type typeId<uint8_t>() { return Type::uint8; }
|
||||
template <> inline Type typeId<uint8_t>() { return Type::uint8; }
|
||||
template <> inline Type typeId<uint16_t>() { return Type::uint16; }
|
||||
template <> inline Type typeId<uint32_t>() { return Type::uint32; }
|
||||
template <> inline Type typeId<uint64_t>() { return Type::uint64; }
|
||||
@ -422,10 +507,21 @@ template <> inline Type typeId<float16>() { return Type::float16; }
|
||||
template <> inline Type typeId<float>() { return Type::float32; }
|
||||
template <> inline Type typeId<double>() { return Type::float64; }
|
||||
|
||||
template <> inline Type typeId<packed16>() { return Type::packed16; }
|
||||
template <> inline Type typeId<packed8avx2>() { return Type::packed8avx2; }
|
||||
template <> inline Type typeId<packed16>() { return Type::packed16; }
|
||||
template <> inline Type typeId<packed8avx2>() { return Type::packed8avx2; }
|
||||
template <> inline Type typeId<packed8avx512>() { return Type::packed8avx512; }
|
||||
|
||||
template <> inline Type typeId<intgemm8>() { return Type::intgemm8; }
|
||||
template <> inline Type typeId<intgemm8ssse3>() { return Type::intgemm8ssse3; }
|
||||
template <> inline Type typeId<intgemm8avx2>() { return Type::intgemm8avx2; }
|
||||
template <> inline Type typeId<intgemm8avx512>() { return Type::intgemm8avx512; }
|
||||
template <> inline Type typeId<intgemm8avx512vnni>() { return Type::intgemm8avx512vnni; }
|
||||
template <> inline Type typeId<intgemm16>() { return Type::intgemm16; }
|
||||
template <> inline Type typeId<intgemm16sse2>() { return Type::intgemm16sse2; }
|
||||
template <> inline Type typeId<intgemm16avx2>() { return Type::intgemm16avx2; }
|
||||
template <> inline Type typeId<intgemm16avx512>() { return Type::intgemm16avx512; }
|
||||
|
||||
|
||||
// Abort if given C++ does not correspond to runtime type
|
||||
template <typename T>
|
||||
void matchOrAbort(Type type) {
|
||||
|
@ -260,6 +260,14 @@ public:
|
||||
for(auto& it : data_[i])
|
||||
indexSet.insert(it.first);
|
||||
}
|
||||
// Ensure that the generated vocabulary items from a shortlist are a multiple-of-eight
|
||||
// This is necessary until intgemm supports non-multiple-of-eight matrices.
|
||||
// TODO better solution here? This could potentially be slow.
|
||||
WordIndex i = static_cast<WordIndex>(firstNum_);
|
||||
while (indexSet.size() % 8 != 0) {
|
||||
indexSet.insert(i);
|
||||
i++;
|
||||
}
|
||||
|
||||
// turn into vector and sort (selected indices)
|
||||
std::vector<WordIndex> indices(indexSet.begin(), indexSet.end());
|
||||
|
@ -84,11 +84,6 @@ public:
|
||||
auto precison = options_->get<std::vector<std::string>>("precision", {"float32"});
|
||||
graph->setDefaultElementType(typeFromString(precison[0])); // only use first type, used for parameter type in graph
|
||||
graph->setDevice(device);
|
||||
graph->getBackend()->setClip(options_->get<float>("clip-gemm"));
|
||||
if (device.type == DeviceType::cpu) {
|
||||
graph->getBackend()->setOptimized(options_->get<bool>("optimize"));
|
||||
}
|
||||
|
||||
graph->reserveWorkspaceMB(options_->get<size_t>("workspace"));
|
||||
graphs_.push_back(graph);
|
||||
}
|
||||
|
@ -7,7 +7,7 @@
|
||||
#include "graph/node_operators_tuple.h"
|
||||
|
||||
#include "graph/auto_tuner.h"
|
||||
#include "tensors/cpu/int16.h"
|
||||
#include "tensors/cpu/intgemm_interface.h"
|
||||
#include "tensors/cpu/fbgemm/expanded_gemm.h"
|
||||
|
||||
#if USE_FBGEMM
|
||||
@ -466,7 +466,6 @@ Expr weighted_average(Expr in, Expr weights, int ax) {
|
||||
|
||||
Expr dot(Expr a, Expr b, bool transA, bool transB, float scale) {
|
||||
auto device = a->graph()->getDeviceId().type;
|
||||
float clipValue = a->graph()->getBackend()->getClip();
|
||||
// added support for packed GEMM API (fp16, int8)
|
||||
Type aElementType = a->value_type();
|
||||
Type bElementType = b->value_type();
|
||||
@ -475,18 +474,9 @@ Expr dot(Expr a, Expr b, bool transA, bool transB, float scale) {
|
||||
// --optimize --cpu-thread=N with N > 0 are set.
|
||||
if(device == DeviceType::cpu) {
|
||||
if(isFloat(aElementType) && isFloat(bElementType)) {
|
||||
if(a->graph()->getBackend()->isOptimized()) {
|
||||
// dotInt16 computes A * B.T, hence the transpose for B to get A * B
|
||||
// if transA = false and transB = false.
|
||||
|
||||
return cpu::int16::dot(
|
||||
cpu::int16::quantize(transA ? transpose(a) : a, clipValue),
|
||||
cpu::int16::quantize(transB ? b : transpose(b), clipValue),
|
||||
scale);
|
||||
} else {
|
||||
return Expression<DotNodeOp>(
|
||||
clip(a, clipValue), clip(b, clipValue), transA, transB, scale);
|
||||
}
|
||||
return Expression<DotNodeOp>(a, b, transA, transB, scale);
|
||||
} else if(isFloat(aElementType) && isIntgemm(bElementType)) {
|
||||
return cpu::integer::affineOrDot(a, b, nullptr, transA, transB, scale);
|
||||
} else if(isFloat(aElementType) && isPacked(bElementType)) {
|
||||
#if USE_FBGEMM
|
||||
// 07/10/2019 - Use packed GEMM only if the cpu architecture supports AVX2
|
||||
@ -496,7 +486,7 @@ Expr dot(Expr a, Expr b, bool transA, bool transB, float scale) {
|
||||
// and this cpu lookup is executed only once and the state is kept in FBGEMM.
|
||||
if(fbgemm::fbgemmHasAvx2Support()) {
|
||||
// This variant of dot product can handle matrix multiplications with packed8 and packed16 weight matrix (B).
|
||||
return cpu::variant::dot(clip(a, clipValue),
|
||||
return cpu::variant::dot(a,
|
||||
b,
|
||||
b->shape(),
|
||||
transA,
|
||||
@ -512,8 +502,7 @@ Expr dot(Expr a, Expr b, bool transA, bool transB, float scale) {
|
||||
ABORT("Combination of types A: {} B: {} not supported", aElementType, bElementType);
|
||||
}
|
||||
} else {
|
||||
return Expression<DotNodeOp>(
|
||||
clip(a, clipValue), clip(b, clipValue), transA, transB, scale);
|
||||
return Expression<DotNodeOp>(a, b, transA, transB, scale);
|
||||
}
|
||||
}
|
||||
|
||||
@ -524,16 +513,9 @@ Expr bdot(Expr a, Expr b, bool transA, bool transB, float scale) {
|
||||
static Expr affineDefault(Expr a, Expr b, Expr bias, bool transA, bool transB, float scale) {
|
||||
// general version, MKL, CBlas or CUDA
|
||||
|
||||
// if clipValue > 0, the inputs will be clipped to range [-clipValue,
|
||||
// clipValue] This is meant to keep values at the same range as used during
|
||||
// training when optimizing for 8-bit integer products. Likely to be removed
|
||||
// in the future when we explore better ways to handle this.
|
||||
float clipValue = a->graph()->getBackend()->getClip();
|
||||
|
||||
int rows = a->shape().elements() / a->shape()[-1];
|
||||
Expr ones = a->graph()->ones({ rows, 1 });
|
||||
std::vector<Expr> nodes
|
||||
= { clip(a, clipValue), clip(b, clipValue), bias, ones };
|
||||
std::vector<Expr> nodes = { a, b, bias, ones };
|
||||
return Expression<AffineNodeOp>(nodes, transA, transB, scale);
|
||||
}
|
||||
|
||||
@ -545,22 +527,14 @@ static Expr affineDefault(Expr a, Expr b, Expr bias, bool transA, bool transB, f
|
||||
Expr affine(Expr a, Expr b, Expr bias, bool transA, bool transB, float scale) {
|
||||
auto device = a->graph()->getDeviceId().type;
|
||||
|
||||
float clipValue = a->graph()->getBackend()->getClip();
|
||||
Type aElementType = a->value_type();
|
||||
Type bElementType = b->value_type();
|
||||
|
||||
if(device == DeviceType::cpu) {
|
||||
if(isFloat(aElementType) && isFloat(bElementType)) {
|
||||
if(a->graph()->getBackend()->isOptimized()) {
|
||||
// cpu int16 version
|
||||
return cpu::int16::affine(
|
||||
cpu::int16::quantize(transA ? transpose(a) : a, clipValue),
|
||||
cpu::int16::quantize(transB ? b : transpose(b), clipValue),
|
||||
bias,
|
||||
scale);
|
||||
} else {
|
||||
return affineDefault(a, b, bias, transA, transB, scale);
|
||||
}
|
||||
return affineDefault(a, b, bias, transA, transB, scale);
|
||||
} else if(isFloat(aElementType) && isIntgemm(bElementType)) {
|
||||
return cpu::integer::affineOrDot(a, b, bias, transA, transB, scale);
|
||||
} else if(isFloat(aElementType) && isPacked(bElementType)) {
|
||||
#if USE_FBGEMM
|
||||
// 07/10/2019 - Use packed GEMM only if the cpu architecture supports AVX2
|
||||
@ -570,7 +544,7 @@ Expr affine(Expr a, Expr b, Expr bias, bool transA, bool transB, float scale) {
|
||||
// and this cpu lookup is executed only once and the state is kept in FBGEMM.
|
||||
if(fbgemm::fbgemmHasAvx2Support()) {
|
||||
// This variant of affine product can handle matrix multiplications with packed8 and packed16 weight matrix (B).
|
||||
return cpu::variant::affine(clip(a, clipValue),
|
||||
return cpu::variant::affine(a,
|
||||
b,
|
||||
b->shape(),
|
||||
bias,
|
||||
|
@ -10,7 +10,7 @@
|
||||
#include "translator/scorers.h"
|
||||
#include "data/alignment.h"
|
||||
#include "data/vocab_base.h"
|
||||
#include "tensors/cpu/fbgemm/expression_graph_packable.h"
|
||||
#include "tensors/cpu/expression_graph_packable.h"
|
||||
|
||||
#if USE_FBGEMM
|
||||
#include "fbgemm/Utils.h"
|
||||
@ -256,7 +256,6 @@ bool convertModel(std::string inputFile, std::string outputFile, int32_t targetP
|
||||
|
||||
auto graph = New<ExpressionGraphPackable>();
|
||||
graph->setDevice(CPU0);
|
||||
graph->getBackend()->setOptimized(false);
|
||||
|
||||
graph->load(inputFile);
|
||||
graph->forward();
|
||||
|
@ -73,11 +73,6 @@ public:
|
||||
auto precison = options_->get<std::vector<std::string>>("precision", {"float32"});
|
||||
graph->setDefaultElementType(typeFromString(precison[0])); // only use first type, used for parameter type in graph
|
||||
graph->setDevice(device);
|
||||
graph->getBackend()->setClip(options_->get<float>("clip-gemm"));
|
||||
if (device.type == DeviceType::cpu) {
|
||||
graph->getBackend()->setOptimized(options_->get<bool>("optimize"));
|
||||
}
|
||||
|
||||
graph->reserveWorkspaceMB(options_->get<size_t>("workspace"));
|
||||
graphs_.push_back(graph);
|
||||
}
|
||||
|
@ -10,10 +10,7 @@ protected:
|
||||
DeviceId deviceId_;
|
||||
size_t seed_;
|
||||
Ptr<RandomGenerator> randomGenerator_;
|
||||
|
||||
// global clipping value for matrix-multiplies, should soon be removed.
|
||||
float clipValue_{0.f};
|
||||
|
||||
|
||||
public:
|
||||
Backend(DeviceId deviceId, size_t seed)
|
||||
: deviceId_(deviceId), seed_(seed), randomGenerator_(createRandomGenerator(seed, deviceId)) {}
|
||||
@ -24,14 +21,6 @@ public:
|
||||
// for GPU only, calls cudaSetDevice, does nothing on CPU. Maybe change name.
|
||||
virtual void setDevice() = 0;
|
||||
virtual void synchronize() = 0;
|
||||
|
||||
virtual void setClip(float clipValue) { clipValue_ = clipValue; }
|
||||
float getClip() { return clipValue_; }
|
||||
|
||||
// for CPU, sets to use optimized code for inference.
|
||||
// for GPU, this is invalid. for gpu, isOptimized() function always returns false.
|
||||
virtual void setOptimized(bool optimize) = 0;
|
||||
virtual bool isOptimized() = 0;
|
||||
};
|
||||
|
||||
Ptr<Backend> BackendByDeviceId(DeviceId deviceId, size_t seed);
|
||||
|
43
src/tensors/cpu/aligned.h
Normal file
43
src/tensors/cpu/aligned.h
Normal file
@ -0,0 +1,43 @@
|
||||
#pragma once
|
||||
|
||||
#include "common/definitions.h"
|
||||
#include <stdlib.h>
|
||||
#ifdef _WIN32
|
||||
#include <malloc.h>
|
||||
#endif
|
||||
|
||||
namespace marian {
|
||||
namespace cpu {
|
||||
namespace {
|
||||
|
||||
// allocate function for tensor reserve() below.
|
||||
// Alignment is needed because we use AVX512 and AVX2 vectors. We should fail if we can't allocate aligned memory.
|
||||
|
||||
#ifdef _WIN32
|
||||
void *genericMalloc(size_t alignment, size_t size) {
|
||||
void *ret = _aligned_malloc(size, alignment);
|
||||
ABORT_IF(!ret, "Failed to allocate memory on CPU");
|
||||
return ret;
|
||||
}
|
||||
void genericFree(void *ptr) {
|
||||
_aligned_free(ptr);
|
||||
}
|
||||
#else
|
||||
// Linux and OS X. There is no fallback to malloc because we need it to be aligned.
|
||||
void *genericMalloc(size_t alignment, size_t size) {
|
||||
// On macos, aligned_alloc is available only on c++17
|
||||
// Furthermore, it requires that the memory requested is an exact multiple of the alignment, otherwise it fails.
|
||||
// posix_memalign is available both Mac (Since 2016) and Linux and in both gcc and clang
|
||||
void *result;
|
||||
// Error could be detected by return value or just remaining nullptr.
|
||||
ABORT_IF(posix_memalign(&result, alignment, size), "Failed to allocate memory on CPU");
|
||||
return result;
|
||||
}
|
||||
void genericFree(void *ptr) {
|
||||
free(ptr);
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
} // namespace cpu
|
||||
} // namespace marian
|
@ -10,17 +10,11 @@ namespace marian {
|
||||
namespace cpu {
|
||||
|
||||
class Backend : public marian::Backend {
|
||||
protected:
|
||||
bool optimized_{false};
|
||||
|
||||
public:
|
||||
Backend(DeviceId deviceId, size_t seed) : marian::Backend(deviceId, seed) {}
|
||||
void setDevice() override {}
|
||||
void synchronize() override {}
|
||||
|
||||
// for CPU & inference only, sets to use optimized code for inference. Does nothing for GPU.
|
||||
void setOptimized(bool optimize) override { optimized_ = optimize; }
|
||||
bool isOptimized() override { return optimized_; }
|
||||
};
|
||||
|
||||
} // namespace cpu
|
||||
} // namespace marian
|
||||
|
@ -1,44 +1,8 @@
|
||||
#include "tensors/device.h"
|
||||
#include "tensors/cpu/aligned.h"
|
||||
#include <iostream>
|
||||
|
||||
#ifdef _WIN32
|
||||
#include <malloc.h>
|
||||
#endif
|
||||
#include <stdlib.h>
|
||||
|
||||
namespace marian {
|
||||
namespace cpu {
|
||||
namespace {
|
||||
|
||||
// allocate function for tensor reserve() below.
|
||||
// Alignment is needed because we use AVX512 and AVX2 vectors. We should fail if we can't allocate aligned memory.
|
||||
|
||||
#ifdef _WIN32
|
||||
void *genericMalloc(size_t alignment, size_t size) {
|
||||
void *ret = _aligned_malloc(size, alignment);
|
||||
ABORT_IF(!ret, "Failed to allocate memory on CPU");
|
||||
return ret;
|
||||
}
|
||||
void genericFree(void *ptr) {
|
||||
_aligned_free(ptr);
|
||||
}
|
||||
#else
|
||||
// Linux and OS X. There is no fallback to malloc because we need it to be aligned.
|
||||
void *genericMalloc(size_t alignment, size_t size) {
|
||||
// On macos, aligned_alloc is available only on c++17
|
||||
// Furthermore, it requires that the memory requested is an exact multiple of the alignment, otherwise it fails.
|
||||
// posix_memalign is available both Mac (Since 2016) and Linux and in both gcc and clang
|
||||
void *result;
|
||||
// Error could be detected by return value or just remaining nullptr.
|
||||
ABORT_IF(posix_memalign(&result, alignment, size), "Failed to allocate memory on CPU");
|
||||
return result;
|
||||
}
|
||||
void genericFree(void *ptr) {
|
||||
free(ptr);
|
||||
}
|
||||
#endif
|
||||
|
||||
} // namespace
|
||||
|
||||
Device::~Device() {
|
||||
genericFree(data_);
|
||||
|
266
src/tensors/cpu/expression_graph_packable.h
Normal file
266
src/tensors/cpu/expression_graph_packable.h
Normal file
@ -0,0 +1,266 @@
|
||||
#pragma once
|
||||
|
||||
#include "graph/expression_graph.h"
|
||||
#include "fbgemm/packed_gemm.h"
|
||||
#include "tensors/cpu/integer_common.h"
|
||||
|
||||
namespace marian {
|
||||
namespace cpu {
|
||||
void Transpose10(marian::Tensor out, const marian::Tensor in);
|
||||
}
|
||||
}
|
||||
|
||||
namespace marian {
|
||||
|
||||
|
||||
// When FBGEMM based packed GEMM is used, some weight matrices need to be packed offline.
|
||||
// The decision which weights can be packed or not should be done walking through the graph.
|
||||
// This requires some more changes, but we temporarily do this just by name ("_W") of the weights.
|
||||
// And, this introduces a low level packed_gemm.h apis interact with high level graph class.
|
||||
// So, we make a subclass of ExpressionGraph and put those immature codes in this class.
|
||||
// We will improve this in the near future.
|
||||
class ExpressionGraphPackable : public ExpressionGraph {
|
||||
public:
|
||||
ExpressionGraphPackable()
|
||||
: ExpressionGraph( /* inference = */ true) {} // Packable expression graph only supports inference
|
||||
|
||||
virtual ~ExpressionGraphPackable() {}
|
||||
|
||||
// Convert model weights into packed format and save to IO items.
|
||||
// @TODO: review this
|
||||
void packAndSave(const std::string& name, const std::string& meta, Type gemmElementType = Type::float32, Type saveElementType = Type::float32) {
|
||||
std::vector<io::Item> ioItems;
|
||||
|
||||
// sorted by name in std::map
|
||||
for (auto p : params()->getMap()) {
|
||||
std::string pName = p.first;
|
||||
|
||||
if (!namespace_.empty()) {
|
||||
if (pName.substr(0, namespace_.size() + 2) == namespace_ + "::")
|
||||
pName = pName.substr(namespace_.size() + 2);
|
||||
}
|
||||
|
||||
Tensor val = p.second->val();
|
||||
|
||||
// save as packed format
|
||||
// @TODO Hardcoded to find packable weights
|
||||
// int8 - all the weights used for affine op and dot op
|
||||
// fp16 - all the weights used for affine op
|
||||
if ((gemmElementType == Type::packed8avx2 || gemmElementType == Type::packed8avx512)
|
||||
&& (pName.find("_W") == pName.length() - 3 || pName.find("_W") == pName.length() - 2)) {
|
||||
#if USE_FBGEMM
|
||||
using namespace marian::cpu::variant;
|
||||
// packing information - size
|
||||
int nrow;
|
||||
int ncol;
|
||||
uint64_t packsize;
|
||||
|
||||
fbgemmPacked8PackInfo(val->shape(),
|
||||
gemmElementType,
|
||||
pName.find("Wemb") != std::string::npos,
|
||||
nrow,
|
||||
ncol,
|
||||
packsize);
|
||||
|
||||
auto allocator = New<TensorAllocator>(getBackend());
|
||||
|
||||
// buffer tensor to save packed matrix
|
||||
Tensor packedTensor;
|
||||
allocator->allocate(packedTensor, { 1, (int32_t)packsize }, Type::uint8);
|
||||
|
||||
//Pack B matrix into int8
|
||||
fbgemmPacked8Pack(packedTensor,
|
||||
val->data(),
|
||||
gemmElementType,
|
||||
pName.find("Wemb") != std::string::npos,
|
||||
nrow,
|
||||
ncol,
|
||||
packsize);
|
||||
io::Item item;
|
||||
item.name = pName;
|
||||
item.shape = val->shape();
|
||||
item.type = gemmElementType;
|
||||
|
||||
// Use the actual memory as this will be aligned and padded.
|
||||
// When memory mapping this is required. Shape keeps track of
|
||||
// tensor size. Saving to *.npz will cut to size.
|
||||
auto mem = packedTensor->memory();
|
||||
item.bytes.resize(mem->size());
|
||||
copy(backend_, mem->data<char>(), mem->data<char>() + mem->size(), item.bytes.data());
|
||||
|
||||
ioItems.emplace_back(std::move(item));
|
||||
#else
|
||||
ABORT("Packed type {} only supported when compiled with -DUSE_FBGEMM=on", gemmElementType);
|
||||
#endif
|
||||
// fp16 quantization option
|
||||
} else if (gemmElementType == Type::packed16 && pName.find("_W") == pName.length() - 3) {
|
||||
#if USE_FBGEMM
|
||||
using namespace marian::cpu::variant;
|
||||
|
||||
// packing information
|
||||
int nrow, ncol, kernel_ncol_blocks, brow, bcol, last_brow, nbrow, nbcol;
|
||||
uint64_t packsize;
|
||||
|
||||
fbgemmPacked16PackInfo(val->shape(),
|
||||
false,
|
||||
nrow,
|
||||
ncol,
|
||||
kernel_ncol_blocks,
|
||||
brow,
|
||||
bcol,
|
||||
last_brow,
|
||||
nbrow,
|
||||
nbcol,
|
||||
packsize);
|
||||
|
||||
auto allocator = New<TensorAllocator>(getBackend());
|
||||
|
||||
Tensor packedTensor;
|
||||
allocator->allocate(packedTensor, { 1, (int32_t)packsize }, Type::uint8);
|
||||
|
||||
// fbgemmPacked16Pack
|
||||
fbgemmPacked16Pack(packedTensor,
|
||||
val->data(),
|
||||
false,
|
||||
nrow,
|
||||
ncol,
|
||||
kernel_ncol_blocks,
|
||||
brow,
|
||||
bcol,
|
||||
last_brow,
|
||||
nbrow,
|
||||
nbcol,
|
||||
packsize);
|
||||
io::Item item;
|
||||
item.name = pName;
|
||||
item.shape = val->shape();
|
||||
item.type = Type::packed16;
|
||||
|
||||
// Use the actual memory as this will be aligned and padded.
|
||||
// When memory mapping this is required. Shape keeps track of
|
||||
// tensor size. Saving to *.npz will cut to size.
|
||||
auto mem = packedTensor->memory();
|
||||
item.bytes.resize(mem->size());
|
||||
copy(backend_, mem->data<char>(), mem->data<char>() + mem->size(), item.bytes.data());
|
||||
|
||||
ioItems.emplace_back(std::move(item));
|
||||
#else
|
||||
ABORT("Packed type {} only supported when compiled with -DUSE_FBGEMM=on", gemmElementType);
|
||||
#endif
|
||||
} else if (isIntgemm(gemmElementType) &&
|
||||
(pName.find("_W") == pName.length() - 3 || pName.find("_W") == pName.length() - 2 /* || pName.find("Wemb") != std::string::npos*/)) {
|
||||
#if COMPILE_CPU
|
||||
using cpu::integer::cols;
|
||||
using cpu::integer::rows;
|
||||
auto allocator = New<TensorAllocator>(getBackend());
|
||||
|
||||
Tensor paramMat; //This allocates extra 4 bytes at the end because of gemmElementType
|
||||
allocator->allocate(paramMat, val->shape(), gemmElementType);
|
||||
|
||||
// Compute QuantMultiplier, compress matrix and store quantMult at the end.
|
||||
// We need to tranpose first, because of our architecture independet format requiring a transposed matrix
|
||||
Tensor tmp;
|
||||
allocator->allocate(tmp, val->shape(), val->type());
|
||||
cpu::Transpose10(tmp, val);
|
||||
|
||||
if(sizeOf(gemmElementType) == 1) { // is 8-bit Intgemm type
|
||||
float quantMult = cpu::integer::computeQuantMult<Type::intgemm8>(val);
|
||||
|
||||
// Hardware-specific conversions which allow to implement memory-mapping and avoid conversion at runtime
|
||||
cpu::integer::passOrAbort(gemmElementType); // Check if the hardware supports the GEMM type
|
||||
if(isSsse3(gemmElementType)) {
|
||||
intgemm::ssse3::Kernels8::PrepareBTransposed(tmp->data(), /*input*/
|
||||
paramMat->data<int8_t>(), /*output*/
|
||||
quantMult, /*Quant Mult*/
|
||||
rows(val),
|
||||
cols(val));
|
||||
} else if(isAvx2(gemmElementType)) {
|
||||
intgemm::avx2::Kernels8::PrepareBTransposed(tmp->data(), /*input*/
|
||||
paramMat->data<int8_t>(), /*output*/
|
||||
quantMult, /*Quant Mult*/
|
||||
rows(val),
|
||||
cols(val));
|
||||
} else if(isAvx512(gemmElementType)) {
|
||||
intgemm::avx512bw::Kernels8::PrepareBTransposed(tmp->data(), /*input*/
|
||||
paramMat->data<int8_t>(), /*output*/
|
||||
quantMult, /*Quant Mult*/
|
||||
rows(val),
|
||||
cols(val));
|
||||
} else {
|
||||
ABORT_IF(gemmElementType != Type::intgemm8, "Type {} is not supported", gemmElementType); // shouldn't really happen, but let's make sure
|
||||
intgemm::Int8::PrepareA(tmp->data(), /*input*/
|
||||
paramMat->data<int8_t>(), /*output*/
|
||||
quantMult, /*Quant Mult*/
|
||||
rows(val),
|
||||
cols(val));
|
||||
}
|
||||
//Put the quantMult at the back of the tensor
|
||||
cpu::integer::getQuantMult<Type::intgemm8>(paramMat) = quantMult;
|
||||
|
||||
} else if(sizeOf(gemmElementType) == 2) { // is 16-bit Intgemm type
|
||||
float quantMult = cpu::integer::computeQuantMult<Type::intgemm16>(val);
|
||||
|
||||
// Hardware-specific conversions which allow to implement memory-mapping and avoid conversion at runtime
|
||||
cpu::integer::passOrAbort(gemmElementType); // Check if the hardware supports the GEMM type
|
||||
if(isSse2(gemmElementType)) {
|
||||
intgemm::sse2::Kernels16::PrepareBTransposed(tmp->data(), /*input*/
|
||||
paramMat->data<int16_t>(), /*output*/
|
||||
quantMult, /*Quant Mult*/
|
||||
rows(val),
|
||||
cols(val));
|
||||
} else if(isAvx2(gemmElementType)) {
|
||||
intgemm::avx2::Kernels16::PrepareBTransposed(tmp->data(), /*input*/
|
||||
paramMat->data<int16_t>(), /*output*/
|
||||
quantMult, /*Quant Mult*/
|
||||
rows(val),
|
||||
cols(val));
|
||||
} else if(isAvx512(gemmElementType)) {
|
||||
intgemm::avx512bw::Kernels16::PrepareBTransposed(tmp->data(), /*input*/
|
||||
paramMat->data<int16_t>(), /*output*/
|
||||
quantMult, /*Quant Mult*/
|
||||
rows(val),
|
||||
cols(val));
|
||||
} else {
|
||||
ABORT_IF(gemmElementType != Type::intgemm16, "Type {} is not supported", gemmElementType); // shouldn't really happen, but let's make sure
|
||||
intgemm::Int16::PrepareA(tmp->data(), /*input*/
|
||||
paramMat->data<int16_t>(), /*output*/
|
||||
quantMult, /*Quant Mult*/
|
||||
rows(val),
|
||||
cols(val));
|
||||
}
|
||||
//Put the quantMult at the back of the tensor
|
||||
cpu::integer::getQuantMult<Type::intgemm16>(paramMat) = quantMult;
|
||||
|
||||
} else {
|
||||
ABORT("Incorrect Intgemm type size: {}", sizeOf(gemmElementType));
|
||||
}
|
||||
|
||||
//Save... Same as the fbgemm case
|
||||
io::Item item;
|
||||
item.name = pName;
|
||||
item.shape = val->shape();
|
||||
item.type = gemmElementType;
|
||||
|
||||
auto mem = paramMat->memory();
|
||||
item.bytes.resize(mem->size());
|
||||
copy(backend_, mem->data<char>(), mem->data<char>() + mem->size(), item.bytes.data());
|
||||
ioItems.emplace_back(std::move(item));
|
||||
#else
|
||||
ABORT("Packed type {} only supported when compiled with -DCOMPILE_CPU=on", gemmElementType);
|
||||
#endif
|
||||
} else {
|
||||
ABORT_IF(saveElementType != Type::float32, "We currently do not know how to save matrices as {}", saveElementType);
|
||||
io::Item item;
|
||||
val->get(item, pName);
|
||||
item.convert(saveElementType);
|
||||
ioItems.emplace_back(std::move(item));
|
||||
}
|
||||
}
|
||||
|
||||
if (!meta.empty())
|
||||
io::addMetaToItems(meta, "special:model.yml", ioItems);
|
||||
io::saveItems(name, ioItems);
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace marian
|
@ -2,7 +2,7 @@
|
||||
|
||||
#include "graph/node.h"
|
||||
#include "packed_gemm.h"
|
||||
#include "tensors/cpu/sharp/int_gemm.h"
|
||||
#include "tensors/cpu/integer_common.h"
|
||||
|
||||
#if USE_FBGEMM
|
||||
#ifdef __GNUC__
|
||||
@ -57,14 +57,12 @@ struct FbgemmPacked16PackNodeOp : public UnaryNodeOp {
|
||||
int nbcol_;
|
||||
uint64_t packsize_;
|
||||
|
||||
FbgemmPacked16PackNodeOp(Expr a, PackMatrix packMat, bool transpose, float clipValue)
|
||||
FbgemmPacked16PackNodeOp(Expr a, PackMatrix packMat, bool transpose)
|
||||
: UnaryNodeOp(a, newShape(a, transpose), Type::uint8),
|
||||
packMat_(packMat),
|
||||
transpose_(transpose) {
|
||||
if(packMat != PackMatrix::B)
|
||||
ABORT("Only prepacking of B (weight matrix) is supported");
|
||||
if(clipValue != 0)
|
||||
ABORT("Clipping is not supported");
|
||||
if(!memoize_)
|
||||
ABORT("Only constant weight node can be packed");
|
||||
}
|
||||
@ -144,16 +142,13 @@ struct FbgemmPacked8PackNodeOp : public UnaryNodeOp {
|
||||
FbgemmPacked8PackNodeOp(Expr a,
|
||||
PackMatrix packMat,
|
||||
marian::Type packType,
|
||||
bool transpose,
|
||||
float clipValue)
|
||||
bool transpose)
|
||||
: UnaryNodeOp(a, newShape(a, transpose), Type::uint8),
|
||||
packMat_(packMat),
|
||||
packType_(packType),
|
||||
transpose_(transpose) {
|
||||
if(packMat != PackMatrix::B)
|
||||
ABORT("Only prepacking of B (weight matrix) is supported");
|
||||
if(clipValue != 0)
|
||||
ABORT("Clipping is not supported");
|
||||
if(!memoize_)
|
||||
ABORT("Only constant weight node can be packed");
|
||||
}
|
||||
@ -337,7 +332,7 @@ public:
|
||||
k_,
|
||||
transA_,
|
||||
transB_);
|
||||
marian::cpu::int16::AddBias(val_, child(2)->val())) };
|
||||
marian::cpu::integer::AddBias(val_, child(2)->val())) };
|
||||
} else {
|
||||
nodeOps = { NodeOp(fbgemmPacked8Gemm(val_,
|
||||
child(0)->val(),
|
||||
@ -377,11 +372,11 @@ static inline Expr affine(Expr a, Expr b, Shape bShape, Expr c, bool transA, boo
|
||||
}
|
||||
}
|
||||
|
||||
static inline Expr pack(Type elementType, Expr a, PackMatrix packMat, bool transpose, float clipValue) {
|
||||
static inline Expr pack(Type elementType, Expr a, PackMatrix packMat, bool transpose) {
|
||||
if (elementType == Type::packed16)
|
||||
return Expression<FbgemmPacked16PackNodeOp>(a, packMat, transpose, clipValue);
|
||||
return Expression<FbgemmPacked16PackNodeOp>(a, packMat, transpose);
|
||||
else if (isPacked(elementType) && sizeOf(elementType) == 1)
|
||||
return Expression<FbgemmPacked8PackNodeOp>(a, packMat, elementType, transpose, clipValue);
|
||||
return Expression<FbgemmPacked8PackNodeOp>(a, packMat, elementType, transpose);
|
||||
else {
|
||||
ABORT("Only int8 and fp16 are available. {}", elementType);
|
||||
return nullptr;
|
||||
|
@ -1,156 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include "graph/expression_graph.h"
|
||||
#include "packed_gemm.h"
|
||||
|
||||
namespace marian {
|
||||
|
||||
// When FBGEMM based packed GEMM is used, some weight matrices need to be packed offline.
|
||||
// The decision which weights can be packed or not should be done walking through the graph.
|
||||
// This requires some more changes, but we temporarily do this just by name ("_W") of the weights.
|
||||
// And, this introduces a low level packed_gemm.h apis interact with high level graph class.
|
||||
// So, we make a subclass of ExpressionGraph and put those immature codes in this class.
|
||||
// We will improve this in the near future.
|
||||
class ExpressionGraphPackable : public ExpressionGraph {
|
||||
public:
|
||||
ExpressionGraphPackable()
|
||||
: ExpressionGraph( /* inference = */ true) {} // Packable expression graph only supports inference
|
||||
|
||||
virtual ~ExpressionGraphPackable() {}
|
||||
|
||||
// Convert model weights into packed format and save to IO items.
|
||||
// @TODO: review this
|
||||
void packAndSave(const std::string& name, const std::string& meta, Type gemmElementType = Type::float32, Type saveElementType = Type::float32) {
|
||||
std::vector<io::Item> ioItems;
|
||||
|
||||
// sorted by name in std::map
|
||||
for (auto p : params()->getMap()) {
|
||||
std::string pName = p.first;
|
||||
|
||||
if (!namespace_.empty()) {
|
||||
if (pName.substr(0, namespace_.size() + 2) == namespace_ + "::")
|
||||
pName = pName.substr(namespace_.size() + 2);
|
||||
}
|
||||
|
||||
Tensor val = p.second->val();
|
||||
|
||||
// save as packed format
|
||||
// @TODO Hardcoded to find packable weights
|
||||
// int8 - all the weights used for affine op and dot op
|
||||
// fp16 - all the weights used for affine op
|
||||
if ((gemmElementType == Type::packed8avx2 || gemmElementType == Type::packed8avx512)
|
||||
&& (pName.find("_W") == pName.length() - 3 || pName.find("_W") == pName.length() - 2)) {
|
||||
#if USE_FBGEMM
|
||||
using namespace marian::cpu::variant;
|
||||
// packing information - size
|
||||
int nrow;
|
||||
int ncol;
|
||||
uint64_t packsize;
|
||||
|
||||
fbgemmPacked8PackInfo(val->shape(),
|
||||
gemmElementType,
|
||||
pName.find("Wemb") != std::string::npos,
|
||||
nrow,
|
||||
ncol,
|
||||
packsize);
|
||||
|
||||
auto allocator = New<TensorAllocator>(getBackend());
|
||||
|
||||
// buffer tensor to save packed matrix
|
||||
Tensor packedTensor;
|
||||
allocator->allocate(packedTensor, { 1, (int32_t)packsize }, Type::uint8);
|
||||
|
||||
//Pack B matrix into int8
|
||||
fbgemmPacked8Pack(packedTensor,
|
||||
val->data(),
|
||||
gemmElementType,
|
||||
pName.find("Wemb") != std::string::npos,
|
||||
nrow,
|
||||
ncol,
|
||||
packsize);
|
||||
io::Item item;
|
||||
item.name = pName;
|
||||
item.shape = val->shape();
|
||||
item.type = gemmElementType;
|
||||
|
||||
// Use the actual memory as this will be aligned and padded.
|
||||
// When memory mapping this is required. Shape keeps track of
|
||||
// tensor size. Saving to *.npz will cut to size.
|
||||
auto mem = packedTensor->memory();
|
||||
item.bytes.resize(mem->size());
|
||||
copy(backend_, mem->data<char>(), mem->data<char>() + mem->size(), item.bytes.data());
|
||||
|
||||
ioItems.emplace_back(std::move(item));
|
||||
#else
|
||||
ABORT("Packed type {} only supported when compiled with -DUSE_FBGEMM=on", gemmElementType);
|
||||
#endif
|
||||
// fp16 quantization option
|
||||
} else if (gemmElementType == Type::packed16 && pName.find("_W") == pName.length() - 3) {
|
||||
#if USE_FBGEMM
|
||||
using namespace marian::cpu::variant;
|
||||
|
||||
// packing information
|
||||
int nrow, ncol, kernel_ncol_blocks, brow, bcol, last_brow, nbrow, nbcol;
|
||||
uint64_t packsize;
|
||||
|
||||
fbgemmPacked16PackInfo(val->shape(),
|
||||
false,
|
||||
nrow,
|
||||
ncol,
|
||||
kernel_ncol_blocks,
|
||||
brow,
|
||||
bcol,
|
||||
last_brow,
|
||||
nbrow,
|
||||
nbcol,
|
||||
packsize);
|
||||
|
||||
auto allocator = New<TensorAllocator>(getBackend());
|
||||
|
||||
Tensor packedTensor;
|
||||
allocator->allocate(packedTensor, { 1, (int32_t)packsize }, Type::uint8);
|
||||
|
||||
// fbgemmPacked16Pack
|
||||
fbgemmPacked16Pack(packedTensor,
|
||||
val->data(),
|
||||
false,
|
||||
nrow,
|
||||
ncol,
|
||||
kernel_ncol_blocks,
|
||||
brow,
|
||||
bcol,
|
||||
last_brow,
|
||||
nbrow,
|
||||
nbcol,
|
||||
packsize);
|
||||
io::Item item;
|
||||
item.name = pName;
|
||||
item.shape = val->shape();
|
||||
item.type = Type::packed16;
|
||||
|
||||
// Use the actual memory as this will be aligned and padded.
|
||||
// When memory mapping this is required. Shape keeps track of
|
||||
// tensor size. Saving to *.npz will cut to size.
|
||||
auto mem = packedTensor->memory();
|
||||
item.bytes.resize(mem->size());
|
||||
copy(backend_, mem->data<char>(), mem->data<char>() + mem->size(), item.bytes.data());
|
||||
|
||||
ioItems.emplace_back(std::move(item));
|
||||
#else
|
||||
ABORT("Packed type {} only supported when compiled with -DUSE_FBGEMM=on", gemmElementType);
|
||||
#endif
|
||||
} else {
|
||||
io::Item item;
|
||||
val->get(item, pName);
|
||||
item.convert(saveElementType);
|
||||
ioItems.emplace_back(std::move(item));
|
||||
}
|
||||
}
|
||||
|
||||
if (!meta.empty())
|
||||
io::addMetaToItems(meta, "special:model.yml", ioItems);
|
||||
io::saveItems(name, ioItems);
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace marian
|
@ -1,113 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include "graph/node.h"
|
||||
#include "tensors/cpu/sharp/int_gemm.h"
|
||||
|
||||
namespace marian {
|
||||
namespace cpu {
|
||||
namespace int16 {
|
||||
|
||||
struct QuantizeNodeOp : public UnaryNodeOp {
|
||||
float clipValue_;
|
||||
|
||||
QuantizeNodeOp(Expr a, float clipValue)
|
||||
: UnaryNodeOp(a, Type::int16), clipValue_{clipValue} {}
|
||||
|
||||
NodeOps forwardOps() override {
|
||||
return {NodeOp(Quantize16(val_, child(0)->val(), clipValue_))};
|
||||
}
|
||||
|
||||
NodeOps backwardOps() override {
|
||||
ABORT("Only used for inference");
|
||||
}
|
||||
|
||||
const std::string type() override { return "quantizeInt16"; }
|
||||
};
|
||||
|
||||
class DotNodeOp : public NaryNodeOp {
|
||||
private:
|
||||
float scalar_;
|
||||
|
||||
public:
|
||||
DotNodeOp(Expr a, Expr b, float scalar)
|
||||
: NaryNodeOp({a, b}, newShape(a, b), Type::float32), scalar_(scalar) {}
|
||||
|
||||
Shape newShape(Expr a, Expr b) {
|
||||
auto shapeA = a->shape();
|
||||
auto shapeB = b->shape();
|
||||
|
||||
// Computing A * B^T
|
||||
shapeB.set(-2, b->shape()[-1]);
|
||||
shapeB.set(-1, b->shape()[-2]);
|
||||
|
||||
Shape outShape = shapeA;
|
||||
outShape.set(-1, shapeB[-1]);
|
||||
ABORT_IF(shapeA[-1] != shapeB[-2],
|
||||
"matrix product requires dimensions to match");
|
||||
return outShape;
|
||||
}
|
||||
|
||||
NodeOps forwardOps() override {
|
||||
return {NodeOp(ProdInt16(val_, child(0)->val(), child(1)->val(), scalar_))};
|
||||
}
|
||||
|
||||
NodeOps backwardOps() override {
|
||||
ABORT("Only used for inference");
|
||||
}
|
||||
|
||||
const std::string type() override { return "dotInt16"; }
|
||||
};
|
||||
|
||||
class AffineNodeOp : public NaryNodeOp {
|
||||
private:
|
||||
float scalar_;
|
||||
|
||||
public:
|
||||
AffineNodeOp(const std::vector<Expr>& nodes, float scalar)
|
||||
: NaryNodeOp(nodes, newShape(nodes[0], nodes[1]), Type::float32), scalar_(scalar) {}
|
||||
|
||||
Shape newShape(Expr a, Expr b) {
|
||||
auto shapeA = a->shape();
|
||||
auto shapeB = b->shape();
|
||||
|
||||
// Computing A * B^T
|
||||
shapeB.set(-2, b->shape()[-1]);
|
||||
shapeB.set(-1, b->shape()[-2]);
|
||||
|
||||
Shape outShape = shapeA;
|
||||
outShape.set(-1, shapeB[-1]);
|
||||
ABORT_IF(shapeA[-1] != shapeB[-2],
|
||||
"matrix product requires dimensions to match");
|
||||
return outShape;
|
||||
}
|
||||
|
||||
NodeOps forwardOps() override {
|
||||
return {
|
||||
NodeOp(ProdInt16(val_, child(0)->val(), child(1)->val(), scalar_);
|
||||
AddBias(val_, child(2)->val()))
|
||||
};
|
||||
}
|
||||
|
||||
NodeOps backwardOps() override {
|
||||
ABORT("Only used for inference");
|
||||
}
|
||||
|
||||
const std::string type() override { return "affineInt16"; }
|
||||
};
|
||||
|
||||
static inline Expr dot(Expr a, Expr b, float scalar) {
|
||||
return Expression<cpu::int16::DotNodeOp>(a, b, scalar);
|
||||
}
|
||||
|
||||
static inline Expr affine(Expr a, Expr b, Expr c, float scalar) {
|
||||
std::vector<Expr> nodes = {a, b, c};
|
||||
return Expression<cpu::int16::AffineNodeOp>(nodes, scalar);
|
||||
}
|
||||
|
||||
static inline Expr quantize(Expr a, float clipValue) {
|
||||
return Expression<cpu::int16::QuantizeNodeOp>(a, clipValue);
|
||||
}
|
||||
|
||||
} // namespace int16
|
||||
} // namespace cpu
|
||||
} // namespace marian
|
45
src/tensors/cpu/integer_common.cpp
Normal file
45
src/tensors/cpu/integer_common.cpp
Normal file
@ -0,0 +1,45 @@
|
||||
#include "integer_common.h"
|
||||
|
||||
namespace marian {
|
||||
namespace cpu {
|
||||
namespace integer {
|
||||
// This operates on floats after processing so doesn't care about int8_t vs int16_t.
|
||||
void AddBias(marian::Tensor C, const marian::Tensor Bias) {
|
||||
float* y = C->data();
|
||||
const float* x = C->data();
|
||||
const float* bias = Bias->data();
|
||||
|
||||
const int m = C->shape().elements() / C->shape()[-1];
|
||||
const int n = C->shape()[-1];
|
||||
|
||||
for(int j = 0; j < m; ++j) {
|
||||
int i = 0;
|
||||
#ifdef __AVX512F__
|
||||
int n16 = n & ~15;
|
||||
for(; i < n16; i += 16) {
|
||||
__m512 ai = _mm512_loadu_ps(x + j * n + i);
|
||||
__m512 bi = _mm512_loadu_ps(bias + i);
|
||||
__m512 yi = _mm512_add_ps(ai, bi);
|
||||
_mm512_storeu_ps(y + j * n + i, yi);
|
||||
}
|
||||
#else
|
||||
int n4 = (n / 4) * 4;
|
||||
for(; i < n4; i += 4) {
|
||||
__m128 ai = _mm_loadu_ps(x + j * n + i);
|
||||
__m128 bi = _mm_loadu_ps(bias + i);
|
||||
__m128 yi = _mm_add_ps(ai, bi);
|
||||
_mm_storeu_ps(y + j * n + i, yi);
|
||||
}
|
||||
#endif
|
||||
for(; i < n; i++) {
|
||||
y[j * n + i] = x[j * n + i] + bias[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//template void prepareAndTranspose<intgemm8>;//(io::Item& item, const char * input);
|
||||
//template void prepareAndTranspose<intgemm16>(io::Item&, const char *);
|
||||
|
||||
} //integer
|
||||
} //cpu
|
||||
} //marian
|
223
src/tensors/cpu/integer_common.h
Normal file
223
src/tensors/cpu/integer_common.h
Normal file
@ -0,0 +1,223 @@
|
||||
#pragma once
|
||||
|
||||
#include "tensors/tensor_allocator.h"
|
||||
#include "tensors/tensor_operators.h"
|
||||
#include "tensors/cpu/aligned.h"
|
||||
#include "common/io_item.h"
|
||||
|
||||
#if COMPILE_CPU
|
||||
#include "3rd_party/intgemm/intgemm/intgemm.h"
|
||||
#else
|
||||
namespace intgemm {
|
||||
struct Int8;
|
||||
struct Int16;
|
||||
namespace ssse3 {
|
||||
struct Kernels8;
|
||||
}
|
||||
namespace sse2 {
|
||||
struct Kernels16;
|
||||
}
|
||||
namespace avx2 {
|
||||
struct Kernels8;
|
||||
struct Kernels16;
|
||||
}
|
||||
namespace avx512bw {
|
||||
struct Kernels8;
|
||||
struct Kernels16;
|
||||
}
|
||||
namespace avx512vnni {
|
||||
struct Kernels8;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
#include <emmintrin.h>
|
||||
#include <immintrin.h>
|
||||
#include <tmmintrin.h>
|
||||
#include <xmmintrin.h>
|
||||
#include <cassert>
|
||||
#include <cstddef>
|
||||
|
||||
namespace marian {
|
||||
namespace cpu {
|
||||
namespace integer {
|
||||
|
||||
//Convenient function to get rows and columns of a tensor, shadowed by namespace.
|
||||
inline int cols(Tensor& tensor) { return tensor->shape()[-1]; }
|
||||
inline int rows(Tensor& tensor) { return tensor->shape().elements() / cols(tensor); }
|
||||
|
||||
inline int cols(Shape& shape) { return shape[-1]; }
|
||||
inline int rows(Shape& shape) { return shape.elements() / cols(shape); }
|
||||
|
||||
template <Type type> struct intgemm_;
|
||||
|
||||
template <> struct intgemm_<Type::intgemm8> {
|
||||
using width = intgemm::Int8;
|
||||
using type = int8_t;
|
||||
};
|
||||
|
||||
template <> struct intgemm_<Type::intgemm8ssse3> {
|
||||
using width = intgemm::ssse3::Kernels8;
|
||||
using type = int8_t;
|
||||
};
|
||||
|
||||
template <> struct intgemm_<Type::intgemm8avx2> {
|
||||
using width = intgemm::avx2::Kernels8;
|
||||
using type = int8_t;
|
||||
};
|
||||
|
||||
template <> struct intgemm_<Type::intgemm8avx512> {
|
||||
using width = intgemm::avx512bw::Kernels8;
|
||||
using type = int8_t;
|
||||
};
|
||||
|
||||
template <> struct intgemm_<Type::intgemm8avx512vnni> {
|
||||
using width = intgemm::avx512vnni::Kernels8;
|
||||
using type = int8_t;
|
||||
};
|
||||
|
||||
template <> struct intgemm_<Type::intgemm16> {
|
||||
using width = intgemm::Int16;
|
||||
using type = int16_t;
|
||||
};
|
||||
|
||||
template <> struct intgemm_<Type::intgemm16sse2> {
|
||||
using width = intgemm::sse2::Kernels16;
|
||||
using type = int16_t;
|
||||
};
|
||||
|
||||
template <> struct intgemm_<Type::intgemm16avx2> {
|
||||
using width = intgemm::avx2::Kernels16;
|
||||
using type = int16_t;
|
||||
};
|
||||
|
||||
template <> struct intgemm_<Type::intgemm16avx512> {
|
||||
using width = intgemm::avx512bw::Kernels16;
|
||||
using type = int16_t;
|
||||
};
|
||||
|
||||
template <Type vtype>
|
||||
static inline float& getQuantMult(marian::Tensor val) {
|
||||
#if COMPILE_CPU
|
||||
ABORT_IF(!isIntgemm(val->type()), "getQuantMult does not work for type {}", val->type());
|
||||
typedef typename intgemm_<vtype>::type Integer;
|
||||
return *(reinterpret_cast<float*>(val->data<Integer>() + val->shape().elements()));
|
||||
#else
|
||||
val;
|
||||
ABORT("Using intgemm binary models is only supported when compiling marian with -DCOMPILE_CPU=ON.");
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline Type getIntgemmType(Type vtype) {
|
||||
#if COMPILE_CPU
|
||||
if (vtype == Type::intgemm8) {
|
||||
if (intgemm::kCPU == intgemm::CPUType::AVX512VNNI) {
|
||||
return Type::intgemm8avx512vnni;
|
||||
} else if (intgemm::kCPU == intgemm::CPUType::AVX512BW) {
|
||||
return Type::intgemm8avx512;
|
||||
} else if (intgemm::kCPU == intgemm::CPUType::AVX2) {
|
||||
return Type::intgemm8avx2;
|
||||
} else if (intgemm::kCPU == intgemm::CPUType::SSSE3) {
|
||||
return Type::intgemm8ssse3;
|
||||
} else {
|
||||
ABORT("Your CPU doesn't support SSSE3, necessary for 8bit intgemm to work.");
|
||||
}
|
||||
} else if (vtype == Type::intgemm16) {
|
||||
if (intgemm::kCPU > intgemm::CPUType::AVX2) {
|
||||
return Type::intgemm16avx512;
|
||||
} else if (intgemm::kCPU == intgemm::CPUType::AVX2) {
|
||||
return Type::intgemm16avx2;
|
||||
} else if (intgemm::kCPU >= intgemm::CPUType::SSE2) {
|
||||
return Type::intgemm16sse2;
|
||||
} else {
|
||||
ABORT("Your CPU doesn't support SSE2, necessary for 16bit intgemm to work.");
|
||||
}
|
||||
} else {
|
||||
ABORT("Unrecognised type {}.", vtype);
|
||||
}
|
||||
#else
|
||||
ABORT("Using intgemm binary models is only supported when compiling marian with -DCOMPILE_CPU=ON.");
|
||||
return vtype;
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline bool passOrAbort(Type vtype) {
|
||||
#if COMPILE_CPU
|
||||
if (vtype == Type::intgemm8 || vtype == Type::intgemm16) {
|
||||
return true;
|
||||
} else if (vtype == Type::intgemm16sse2) {
|
||||
ABORT_IF(intgemm::kCPU < intgemm::CPUType::SSE2, "Your CPU doesn't support the architecture necessary to decode model of type {}. Try older architecture instead.", vtype);
|
||||
} else if (vtype == Type::intgemm8ssse3) {
|
||||
ABORT_IF(intgemm::kCPU < intgemm::CPUType::SSSE3, "Your CPU doesn't support the architecture necessary to decode model of type {}. Try older architecture instead.", vtype);
|
||||
} else if (vtype == Type::intgemm8avx2 || vtype == Type::intgemm16avx2) {
|
||||
ABORT_IF(intgemm::kCPU < intgemm::CPUType::AVX2, "Your CPU doesn't support the architecture necessary to decode model of type {}. Try older architecture instead.", vtype);
|
||||
} else if (vtype == Type::intgemm8avx512 || vtype == Type::intgemm16avx512) {
|
||||
ABORT_IF(intgemm::kCPU < intgemm::CPUType::AVX512BW, "Your CPU doesn't support the architecture necessary to decode model of type {}. Try older architecture instead.", vtype);
|
||||
} else if (vtype == Type::intgemm8avx512vnni) {
|
||||
ABORT_IF(intgemm::kCPU < intgemm::CPUType::AVX512VNNI, "Your CPU doesn't support the architecture necessary to decode model of type {}. Try older architecture instead.", vtype);
|
||||
}
|
||||
return true;
|
||||
#else
|
||||
vtype;
|
||||
ABORT("Using intgemm binary models is only supported when compiling marian with -DCOMPILE_CPU=ON.");
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
|
||||
template <Type vtype>
|
||||
static inline float computeQuantMult(marian::Tensor val) {
|
||||
#if COMPILE_CPU
|
||||
if(sizeOf(vtype) == 1)
|
||||
return 127.0f / intgemm::MaxAbsolute(val->data(), val->data() + val->shape().elements());
|
||||
else if(sizeOf(vtype) == 2)
|
||||
return 1024.0f;
|
||||
else
|
||||
ABORT("Unhandled type size {}", sizeOf(vtype));
|
||||
#else
|
||||
val;
|
||||
ABORT("Using intgemm binary models is only supported when compiling marian with -DCOMPILE_CPU=ON.");
|
||||
#endif
|
||||
}
|
||||
|
||||
// This operates on floats after processing so doesn't care about int8_t vs int16_t.
|
||||
void AddBias(marian::Tensor C, const marian::Tensor Bias);
|
||||
|
||||
// For loading architecture agnostic models. We do PrepareAndTranpose, because we already transposed
|
||||
// in our binary format. Then we copy the quantizationMultiplier information at the end
|
||||
template<Type vtype>
|
||||
void prepareAndTransposeB(io::Item& item, const char * input) {
|
||||
#if COMPILE_CPU
|
||||
typedef typename intgemm_<vtype>::type Integer;
|
||||
Integer * output_tensor = reinterpret_cast<Integer *>(&(*item.bytes.begin()));
|
||||
// Sometimes we will end up with misaligned intput (and output) so we can't use them directly.
|
||||
// If this is the case, we will need to temporary allocate aligned memory, copy the results, and then free it
|
||||
if (reinterpret_cast<uintptr_t>(input) % 64 == 0 && reinterpret_cast<uintptr_t>(output_tensor) % 64 == 0) {
|
||||
intgemm_<vtype>::width::PrepareBQuantizedTransposed(reinterpret_cast<const Integer *>(input),
|
||||
output_tensor,
|
||||
rows(item.shape), //Since we only transposed, but didn't update the shape when constructing the binary,
|
||||
cols(item.shape)); //rows here returns the columns of the transposed input matrix, and cols -> the rows
|
||||
} else {
|
||||
Integer * aligned_input = reinterpret_cast<Integer *>(genericMalloc(512, rows(item.shape)*cols(item.shape)*sizeof(Integer)));
|
||||
std::copy(input, input + rows(item.shape)*cols(item.shape), aligned_input);
|
||||
Integer * aligned_output = reinterpret_cast<Integer *>(genericMalloc(512, rows(item.shape)*cols(item.shape)*sizeof(Integer)));
|
||||
intgemm_<vtype>::width::PrepareBQuantizedTransposed(reinterpret_cast<const Integer *>(aligned_input),
|
||||
reinterpret_cast<Integer *>(aligned_output),
|
||||
rows(item.shape), //Since we only transposed, but didn't update the shape when constructing the binary,
|
||||
cols(item.shape)); //rows here returns the columns of the transposed input matrix, and cols -> the rows
|
||||
// Copy to output tensor
|
||||
std::copy(aligned_output, aligned_output + rows(item.shape)*cols(item.shape), output_tensor);
|
||||
genericFree(aligned_input);
|
||||
genericFree(aligned_output);
|
||||
}
|
||||
//Copy the quantMult
|
||||
float quantMult = *(reinterpret_cast<const float *>(reinterpret_cast<const Integer *>(input) + item.shape.elements()));
|
||||
*(reinterpret_cast<float *>(&(*(output_tensor + item.shape.elements())))) = quantMult;
|
||||
#else
|
||||
item, input;
|
||||
ABORT("Using intgemm binary models is only supported when compiling marian with -DCOMPILE_CPU=ON.");
|
||||
#endif
|
||||
}
|
||||
|
||||
} //integer
|
||||
} //cpu
|
||||
} //marian
|
132
src/tensors/cpu/intgemm_interface.h
Normal file
132
src/tensors/cpu/intgemm_interface.h
Normal file
@ -0,0 +1,132 @@
|
||||
#pragma once
|
||||
|
||||
#include "graph/node.h"
|
||||
#include "graph/node_operators_unary.h"
|
||||
#include "integer_common.h"
|
||||
|
||||
namespace marian {
|
||||
|
||||
namespace cpu {
|
||||
namespace integer {
|
||||
|
||||
#if COMPILE_CPU
|
||||
/*
|
||||
* Prepare an activation matrix into intgemm8/16 format. For now the activation matrix is just quantized.
|
||||
* Expr input: The input tensor
|
||||
*/
|
||||
template<Type vtype>
|
||||
static inline Expr prepareA(Expr a) {
|
||||
auto nodeOp = [](Expr out, const std::vector<Expr>& children) {
|
||||
Expr in = children[0];
|
||||
auto quantMult = computeQuantMult<vtype>(in->val());
|
||||
typedef typename intgemm_<vtype>::type Integer;
|
||||
intgemm_<vtype>::width::PrepareA(in->val()->data(), /*input*/
|
||||
out->val()->data<Integer>(), /*output*/
|
||||
quantMult, /*Quant Mult*/
|
||||
rows(in->val()),
|
||||
cols(in->val()));
|
||||
getQuantMult<vtype>(out->val()) = quantMult;
|
||||
};
|
||||
|
||||
return lambda({a}, a->shape(), vtype, nodeOp);
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
* This computes A*B (+ bias if available) in intgemm.
|
||||
* Expr a: The activation matrix in intgemm format
|
||||
* Expr b: The parameter matrix in intgemm fromat
|
||||
* Expr bias: The bias
|
||||
* bool transA - tranpose input A if true
|
||||
* bool transB - unused here (@TODO remove?)
|
||||
* float scale - scale the output by `scale`
|
||||
* the template argument controls whether we're doing 16bit integers or 8bit integers.
|
||||
* It can be Type::intgemm8 or Type::intgemm16 and all hardware-specific variants
|
||||
*/
|
||||
template<Type vtype>
|
||||
static inline Expr affineOrDotTyped(Expr a, Expr bQuant, Expr bias, bool transA, bool /*transB*/, float scale) {
|
||||
#if COMPILE_CPU
|
||||
ABORT_IF(!isFloat(a->value_type()), "Intgemm expects type of A to be float32 not {}", a->value_type());
|
||||
ABORT_IF(!isIntgemm(bQuant->value_type()), "Intgemm expects type of B to be a variant of intgemm not {}", bQuant->value_type());
|
||||
|
||||
auto aQuant = prepareA<vtype>(transA ? transpose(a) : a); // A should not be quantized yet as seen above, hence quantize here
|
||||
|
||||
// determine the output shape m x n for A: m x k and B: k x n
|
||||
// since we transpose A beforehand we don't need to take care of transposed shapes here
|
||||
Shape outShape = aQuant->shape();
|
||||
outShape.set(-1, bQuant->shape()[-1]);
|
||||
|
||||
// wrap the multiply finctions to be executed in the forward step of a Lambda node
|
||||
auto dotOrAffineNodeOp = [=](Expr out, const std::vector<Expr>& children) {
|
||||
Expr aQuant = children[0];
|
||||
Expr bQuant = children[1];
|
||||
Expr bias = children.size() > 2 ? children[2] : nullptr;
|
||||
|
||||
// when we arrive here, A and B are already quantized, so just get the multipliers
|
||||
float aQuantMult = getQuantMult<vtype>(aQuant->val());
|
||||
float bQuantMult = getQuantMult<vtype>(bQuant->val());
|
||||
|
||||
float unquant_mult = 1.0f / (aQuantMult * bQuantMult);
|
||||
unquant_mult = unquant_mult * scale;
|
||||
|
||||
typedef typename intgemm_<vtype>::type Integer;
|
||||
if(bias) { // dispatch a multiply with integrated bias addition i.e affine(...)
|
||||
intgemm_<vtype>::width::Multiply(/*A=*/aQuant->val()->data<Integer>(),
|
||||
/*B=*/bQuant->val()->data<Integer>(),
|
||||
rows(aQuant->val()),
|
||||
cols(aQuant->val()),
|
||||
cols(bQuant->val()),
|
||||
intgemm::callbacks::UnquantizeAndAddBiasAndWrite(unquant_mult, /*bias=*/bias->val()->data(), /*output=*/out->val()->data()));
|
||||
} else { // dispatch a multiply without bias addition i.e dot(...)
|
||||
intgemm_<vtype>::width::Multiply(/*A=*/aQuant->val()->data<Integer>(),
|
||||
/*B=*/bQuant->val()->data<Integer>(),
|
||||
rows(aQuant->val()),
|
||||
cols(aQuant->val()),
|
||||
cols(bQuant->val()),
|
||||
intgemm::callbacks::UnquantizeAndWrite(unquant_mult, /*output=*/out->val()->data()));
|
||||
}
|
||||
};
|
||||
|
||||
std::vector<Expr> children = {aQuant, bQuant};
|
||||
if(bias)
|
||||
children.push_back(bias);
|
||||
|
||||
return lambda(children, outShape, Type::float32, dotOrAffineNodeOp); // inference-only Lambda node
|
||||
#else
|
||||
a, bQuant, bias, transA, scale;
|
||||
ABORT("You need to enable CPU compilation to use this feature. Use cmake .. -DCOMPILE_CPU=ON");
|
||||
#endif
|
||||
}
|
||||
|
||||
// Dispatch correct hardware-agnostic or hardware-specific matrix multiplies
|
||||
static inline Expr affineOrDot(Expr a, Expr bQuant, Expr bias, bool transA, bool transB, float scale) {
|
||||
Type bQuantElementType = bQuant->value_type();
|
||||
static const bool pass = cpu::integer::passOrAbort(bQuantElementType);
|
||||
pass; // We declare this variable as static so that passOrAbort is only ever run once during the initialization.
|
||||
switch(bQuantElementType) {
|
||||
//case Type::intgemm8 : // The generic case selects CPU automatically, but we set all the types manually anyways.
|
||||
// return cpu::integer::affineOrDotTyped<Type::intgemm8>(a, bQuant, bias, transA, transB, scale);
|
||||
case Type::intgemm8ssse3 :
|
||||
return cpu::integer::affineOrDotTyped<Type::intgemm8ssse3>(a, bQuant, bias, transA, transB, scale);
|
||||
case Type::intgemm8avx2 :
|
||||
return cpu::integer::affineOrDotTyped<Type::intgemm8avx2>(a, bQuant, bias, transA, transB, scale);
|
||||
case Type::intgemm8avx512 :
|
||||
return cpu::integer::affineOrDotTyped<Type::intgemm8avx512>(a, bQuant, bias, transA, transB, scale);
|
||||
case Type::intgemm8avx512vnni :
|
||||
return cpu::integer::affineOrDotTyped<Type::intgemm8avx512vnni>(a, bQuant, bias, transA, transB, scale);
|
||||
//case Type::intgemm16 : // The generic case selects CPU automatically, but we set all the types manually anyways.
|
||||
// return cpu::integer::affineOrDotTyped<Type::intgemm16>(a, bQuant, bias, transA, transB, scale);
|
||||
case Type::intgemm16sse2 :
|
||||
return cpu::integer::affineOrDotTyped<Type::intgemm16sse2>(a, bQuant, bias, transA, transB, scale);
|
||||
case Type::intgemm16avx2 :
|
||||
return cpu::integer::affineOrDotTyped<Type::intgemm16avx2>(a, bQuant, bias, transA, transB, scale);
|
||||
case Type::intgemm16avx512 :
|
||||
return cpu::integer::affineOrDotTyped<Type::intgemm16avx512>(a, bQuant, bias, transA, transB, scale);
|
||||
default:
|
||||
ABORT("Unsupported type {} for Intgemm type??", bQuantElementType);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace integer
|
||||
} // namespace cpu
|
||||
} // namespace marian
|
@ -7,8 +7,17 @@
|
||||
#include "tensors/tensor.h"
|
||||
#include "tensors/tensor_allocator.h"
|
||||
|
||||
#if MKL_FOUND
|
||||
#include <mkl.h>
|
||||
#else
|
||||
#if BLAS_FOUND
|
||||
#include <cblas.h>
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#include "integer_common.h"
|
||||
#include "prod_blas.h"
|
||||
#include "sharp/int_gemm.h"
|
||||
|
||||
|
||||
namespace marian {
|
||||
|
||||
@ -187,7 +196,7 @@ void ProdWithBias(marian::Tensor C,
|
||||
float beta,
|
||||
float scalar) {
|
||||
cpu::Prod(C, A, B, transA, transB, beta, scalar);
|
||||
cpu::int16::AddBias(C, bias);
|
||||
cpu::integer::AddBias(C, bias);
|
||||
}
|
||||
|
||||
void CSRProd(marian::Tensor C,
|
||||
|
@ -1,615 +0,0 @@
|
||||
#include <emmintrin.h>
|
||||
#include <immintrin.h>
|
||||
#include <math.h>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <tmmintrin.h>
|
||||
#include <xmmintrin.h>
|
||||
#include <cassert>
|
||||
#include <cstddef>
|
||||
|
||||
#ifdef __AVX512F__
|
||||
|
||||
namespace marian {
|
||||
namespace cpu {
|
||||
namespace int16 {
|
||||
|
||||
namespace {
|
||||
// Load from memory, multiply, and convert to int32_t.
|
||||
inline __m512i QuantizerGrab(const float *input, const __m512 quant_mult_reg) {
|
||||
// Load 16 floats
|
||||
__m512 val = _mm512_load_ps(input);
|
||||
// Multiply each by the quantization factor.
|
||||
val = _mm512_mul_ps(val, quant_mult_reg);
|
||||
// Cast to 32-bit int
|
||||
return _mm512_cvtps_epi32(val);
|
||||
}
|
||||
} // namespace
|
||||
|
||||
// Convert
|
||||
void AVX_Quantize16(const float *input,
|
||||
int16_t *output,
|
||||
float quant_mult,
|
||||
std::size_t size) {
|
||||
assert(size % 16 == 0);
|
||||
assert(reinterpret_cast<uintptr_t>(input) % 64 == 0);
|
||||
// Fill with the quantization multiplier.
|
||||
const __m512 quant_mult_reg = _mm512_set1_ps(quant_mult);
|
||||
const float *end = input + size;
|
||||
for(; input != end; input += 16, output += 16) {
|
||||
// There doesn't seem to be an unmasked version.
|
||||
_mm512_mask_cvtsepi32_storeu_epi16(
|
||||
output, 0xffff, QuantizerGrab(input, quant_mult_reg));
|
||||
}
|
||||
}
|
||||
|
||||
void AVX_Quantize8(const float *input,
|
||||
int8_t *output,
|
||||
float quant_mult,
|
||||
std::size_t size) {
|
||||
assert(size % 16 == 0);
|
||||
assert(reinterpret_cast<uintptr_t>(input) % 64 == 0);
|
||||
const __m512i neg127 = _mm512_set1_epi32(-127);
|
||||
const __m512 quant_mult_reg = _mm512_set1_ps(quant_mult);
|
||||
const float *end = input + size;
|
||||
for(; input < end; input += 16, output += 16) {
|
||||
__m512i asint = QuantizerGrab(input, quant_mult_reg);
|
||||
/* Ban -128. We can't negate it.
|
||||
* The largest possbile product is -128 * -128 = 2^14. If two of those are
|
||||
* summed that's 2^15 which is too large for int16_t. By banning -128 we
|
||||
* can accumulate two in int16_t w/o saturation before going to int32_t.
|
||||
* But this is ok because apparently the instruction will saturate.
|
||||
*/
|
||||
asint = _mm512_max_epi32(asint, neg127);
|
||||
// There doesn't seem to be an unmasked version.
|
||||
_mm512_mask_cvtsepi32_storeu_epi8(output, 0xffff, asint);
|
||||
}
|
||||
}
|
||||
|
||||
namespace {
|
||||
|
||||
union FloatAccess {
|
||||
float as_f[4];
|
||||
__m128 as_n;
|
||||
};
|
||||
union IntAccess {
|
||||
int32_t as_i[4];
|
||||
__m128i as_n;
|
||||
};
|
||||
|
||||
/* Convert 16-bit to 32-bit and add, not caring what parts are added.
|
||||
* Implementations:
|
||||
* 1.
|
||||
* https://github.com/tesseract-ocr/tesseract/blob/master/src/arch/intsimdmatrixavx2.cpp#L67
|
||||
* under Apache license: This does a multiply by 1 and horizontal add:
|
||||
* _mm512_madd_epi16(sum, _mm512_set1_epi16(1))
|
||||
* Current fastest.
|
||||
*
|
||||
* 2. Signed extension and fold halves:
|
||||
* sum = _mm512_add_epi32(
|
||||
* _mm512_cvtepi16_epi32(_mm512_castsi512_si256(sum)),
|
||||
* _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(sum, 1)));
|
||||
*
|
||||
* 3. Sign extend by abuse of bitshift, then add.
|
||||
* __m128i shift16 = _mm_set_epi32(0,0,0,16);
|
||||
* sum = _mm512_add_epi32(
|
||||
* _mm512_sra_epi32(_mm512_sll_epi32(sum, shift16), shift16),
|
||||
* _mm512_sra_epi32(sum, shift16));
|
||||
*/
|
||||
inline void Convert32Sum(__m512i &sum) {
|
||||
short one = 1;
|
||||
sum = _mm512_madd_epi16(sum, _mm512_set1_epi16(one));
|
||||
}
|
||||
|
||||
// Two sum version.
|
||||
struct ReducedPair {
|
||||
int32_t result[2];
|
||||
};
|
||||
inline ReducedPair Reduce16to32(__m512i sum1, __m512i sum2) {
|
||||
Convert32Sum(sum1);
|
||||
Convert32Sum(sum2);
|
||||
// 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2
|
||||
__m512i pack12 = _mm512_add_epi32(_mm512_unpackhi_epi32(sum1, sum2),
|
||||
_mm512_unpacklo_epi32(sum1, sum2));
|
||||
// 1 2 1 2 1 2 1 2
|
||||
__m256i halves = _mm256_add_epi32(_mm512_castsi512_si256(pack12),
|
||||
_mm512_extracti64x4_epi64(pack12, (short)1));
|
||||
// 1 2 1 2
|
||||
IntAccess a;
|
||||
a.as_n = _mm_add_epi32(_mm256_castsi256_si128(halves),
|
||||
_mm256_extracti128_si256(halves, 1));
|
||||
ReducedPair ret;
|
||||
ret.result[0] = a.as_i[0] + a.as_i[2];
|
||||
ret.result[1] = a.as_i[1] + a.as_i[3];
|
||||
return ret;
|
||||
}
|
||||
|
||||
// Assuming sum1, sum2, sum3, and sum4 are arrays 32-bit signed integers,
|
||||
// reduce within each.
|
||||
// Returns [sum(sum1), sum(sum2), sum(sum3), sum(sum4)]
|
||||
// TODO: consider doing in 64-bit, allowing 4 more bits of quantization?
|
||||
inline __m128i Reduce32(__m512i sum1,
|
||||
__m512i sum2,
|
||||
__m512i sum3,
|
||||
__m512i sum4) {
|
||||
// 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2
|
||||
__m512i pack12 = _mm512_add_epi32(_mm512_unpackhi_epi32(sum1, sum2),
|
||||
_mm512_unpacklo_epi32(sum1, sum2));
|
||||
// 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4
|
||||
__m512i pack34 = _mm512_add_epi32(_mm512_unpackhi_epi32(sum3, sum4),
|
||||
_mm512_unpacklo_epi32(sum3, sum4));
|
||||
// 1 2 3 4 1 2 3 4 1 2 3 4 1 2 3 4
|
||||
__m512i pack1234 = _mm512_add_epi32(_mm512_unpackhi_epi64(pack12, pack34),
|
||||
_mm512_unpacklo_epi64(pack12, pack34));
|
||||
// Cut the register into halves and sum those. 1 2 3 4 1 2 3 4
|
||||
__m256i halves = _mm256_add_epi32(_mm512_castsi512_si256(pack1234),
|
||||
_mm512_extracti64x4_epi64(pack1234, (short)1));
|
||||
// Again: cut the register into halves and sum those. 1 2 3 4
|
||||
return _mm_add_epi32(_mm256_castsi256_si128(halves),
|
||||
_mm256_extracti128_si256(halves, 1));
|
||||
}
|
||||
|
||||
// Four sum version
|
||||
inline __m128i Reduce16to32(__m512i sum1,
|
||||
__m512i sum2,
|
||||
__m512i sum3,
|
||||
__m512i sum4) {
|
||||
Convert32Sum(sum1);
|
||||
Convert32Sum(sum2);
|
||||
Convert32Sum(sum3);
|
||||
Convert32Sum(sum4);
|
||||
return Reduce32(sum1, sum2, sum3, sum4);
|
||||
}
|
||||
|
||||
// Somewhat inefficient reduce for single __m256i containing int32_t
|
||||
inline int32_t Reduce32(__m256i halves) {
|
||||
IntAccess a;
|
||||
a.as_n = _mm_add_epi32(_mm256_castsi256_si128(halves),
|
||||
_mm256_extracti128_si256(halves, 1));
|
||||
// TODO is there a more efficient way?
|
||||
return a.as_i[0] + a.as_i[1] + a.as_i[2] + a.as_i[3];
|
||||
}
|
||||
|
||||
// Somewhat inefficient reduce for single __m512i containing int32_t
|
||||
inline int32_t Reduce32(__m512i sum1) {
|
||||
// Fold register over itself.
|
||||
return Reduce32(_mm256_add_epi32(_mm512_castsi512_si256(sum1),
|
||||
_mm512_extracti64x4_epi64(sum1, (short)1)));
|
||||
}
|
||||
|
||||
inline int32_t Reduce16to32(__m512i sum1) {
|
||||
Convert32Sum(sum1);
|
||||
// Fold register over itself.
|
||||
return Reduce32(_mm256_add_epi32(_mm512_castsi512_si256(sum1),
|
||||
_mm512_extracti64x4_epi64(sum1, (short)1)));
|
||||
}
|
||||
|
||||
class ScatterPut {
|
||||
public:
|
||||
explicit ScatterPut(float unquant_mult, int num_B_rows)
|
||||
: unquant_mult_(unquant_mult),
|
||||
unquant_mult_sse_(_mm_set1_ps(unquant_mult)),
|
||||
#ifdef __AVX512VL__
|
||||
num_b_rows_scatter_(_mm_set_epi32(num_B_rows * 3 * sizeof(float),
|
||||
num_B_rows * 2 * sizeof(float),
|
||||
num_B_rows * 1 * sizeof(float),
|
||||
num_B_rows * 0 * sizeof(float))),
|
||||
#endif
|
||||
num_B_rows_(num_B_rows) {
|
||||
}
|
||||
|
||||
inline void Write(float *base, __m128i reduced) {
|
||||
__m128 float_sums = _mm_cvtepi32_ps(reduced);
|
||||
float_sums = _mm_mul_ps(float_sums, unquant_mult_sse_);
|
||||
#ifdef __AVX512VL__
|
||||
// The scatter instruction requires avx512vl
|
||||
_mm_i32scatter_ps(base, num_b_rows_scatter_, float_sums, (short)1);
|
||||
#else
|
||||
FloatAccess a;
|
||||
// Get floats for each of the sums to write.
|
||||
a.as_n = float_sums;
|
||||
// Also note that the memory acceses on C are not consecutive, but this is a
|
||||
// tradeoff that we have to make. We can't have consecutive accesses of A,
|
||||
// B, *and* C. But we access A and B a lot more so it makes sense to do it
|
||||
// this way. Scatter to outputs:
|
||||
base[0] = a.as_f[0];
|
||||
base[num_B_rows_] = a.as_f[1];
|
||||
base[2 * num_B_rows_] = a.as_f[2];
|
||||
base[3 * num_B_rows_] = a.as_f[3];
|
||||
#endif
|
||||
}
|
||||
|
||||
inline void Write(float *base, ReducedPair reduced) {
|
||||
base[0] = unquant_mult_ * static_cast<float>(reduced.result[0]);
|
||||
base[num_B_rows_] = unquant_mult_ * static_cast<float>(reduced.result[1]);
|
||||
}
|
||||
|
||||
inline void Write(float *base, int32_t reduced) {
|
||||
base[0] = unquant_mult_ * static_cast<float>(reduced);
|
||||
}
|
||||
|
||||
private:
|
||||
const float unquant_mult_;
|
||||
const __m128 unquant_mult_sse_;
|
||||
#ifdef __AVX512VL__
|
||||
const __m128i num_b_rows_scatter_;
|
||||
#endif
|
||||
const int num_B_rows_;
|
||||
};
|
||||
|
||||
} // namespace
|
||||
|
||||
// This is an AVX512F implementation of int16_t multiply based on Jacob
|
||||
// Devlin's SSE code. The original SSE code was:
|
||||
|
||||
// Copyright (c) 2017 Microsoft Corporation
|
||||
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
|
||||
// The above copyright notice and this permission notice shall be included in
|
||||
// all copies or substantial portions of the Software.
|
||||
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
// We are multiplying A * B^T, as opposed to A * B. This is important because it
|
||||
// means we can do consecutive memory access on A * B^T which allows to to take
|
||||
// the most advantage of L1 cache.
|
||||
//
|
||||
// B is typically a weight matrix, so it can be pre-processed offline, and
|
||||
// therefore this transpose does not cost anything. A is typically an activation
|
||||
// minibatch matrix. A and B must be 64-byte aligned. C should be the usual
|
||||
// 4-byte alignment.
|
||||
void AVX_MatrixMult16(const __m512i *A,
|
||||
const __m512i *B,
|
||||
float *C,
|
||||
float unquant_mult,
|
||||
int num_A_rows,
|
||||
int num_B_rows,
|
||||
int width) {
|
||||
assert(width % 32 == 0);
|
||||
assert(reinterpret_cast<uintptr_t>(A) % 64 == 0);
|
||||
assert(reinterpret_cast<uintptr_t>(B) % 64 == 0);
|
||||
|
||||
ScatterPut put(unquant_mult, num_B_rows);
|
||||
|
||||
const int sse_width = width / 32;
|
||||
|
||||
// We do loop unrolling over A. This is *significantly* faster
|
||||
// since B can live in the registers. We are assuming that
|
||||
// A is a multiple of 4, but we can add extra code to handle values of 1,
|
||||
// 2, 3.
|
||||
//
|
||||
// We could also do loop unrolling over B, which adds some additional speedup.
|
||||
// We don't do that for the sake of clarity.
|
||||
//
|
||||
// There are other memory access patterns we could do, e.g., put B on the
|
||||
// outer loop. The justification is that A is typically small enough that it
|
||||
// can live in L1 cache. B is usually a larger weight matrix, so it might not
|
||||
// be able to. However, we are using each element of B four times while it's
|
||||
// still in a register, so caching is not as important.
|
||||
|
||||
// Round down to a multiple of 4.
|
||||
int num_unroll_rows = num_A_rows & ~3;
|
||||
for(int i = 0; i < num_unroll_rows; i += 4) {
|
||||
const __m512i *A1_row = A + (i + 0) * sse_width;
|
||||
const __m512i *A2_row = A + (i + 1) * sse_width;
|
||||
const __m512i *A3_row = A + (i + 2) * sse_width;
|
||||
const __m512i *A4_row = A + (i + 3) * sse_width;
|
||||
|
||||
for(int j = 0; j < num_B_rows; j++) {
|
||||
const __m512i *B_row = B + j * sse_width;
|
||||
|
||||
__m512i sum1 = _mm512_setzero_si512();
|
||||
__m512i sum2 = _mm512_setzero_si512();
|
||||
__m512i sum3 = _mm512_setzero_si512();
|
||||
__m512i sum4 = _mm512_setzero_si512();
|
||||
|
||||
// This is just a simple dot product, unrolled four ways.
|
||||
for(int k = 0; k < sse_width; k++) {
|
||||
__m512i b = *(B_row + k);
|
||||
|
||||
__m512i a1 = *(A1_row + k);
|
||||
__m512i a2 = *(A2_row + k);
|
||||
__m512i a3 = *(A3_row + k);
|
||||
__m512i a4 = *(A4_row + k);
|
||||
|
||||
// madd_epi16 does multiply add on 8 16-bit integers and accumulates
|
||||
// into a four 32-bit register. E.g., a1 = [f1, f2, f3, f4, f5, f6, f7,
|
||||
// h8] (16-bit ints) b1 = [h1, h2, h3, h4, h5, h6, h7, h8] (16-bit ints)
|
||||
// result = [f1*h1 + f2*h2, f3*h3 + f4*h4, f5*h5 + f6*h6, f7*h7 + f8*h8]
|
||||
// (32-bit ints) Then add_epi32 just effectively does a += on these
|
||||
// 32-bit integers.
|
||||
sum1 = _mm512_add_epi32(sum1, _mm512_madd_epi16(b, a1));
|
||||
sum2 = _mm512_add_epi32(sum2, _mm512_madd_epi16(b, a2));
|
||||
sum3 = _mm512_add_epi32(sum3, _mm512_madd_epi16(b, a3));
|
||||
sum4 = _mm512_add_epi32(sum4, _mm512_madd_epi16(b, a4));
|
||||
}
|
||||
put.Write(C + i * num_B_rows + j, Reduce32(sum1, sum2, sum3, sum4));
|
||||
}
|
||||
}
|
||||
// Handle the non-multiples of 4 rows.
|
||||
// TODO: efficient version for 3 rows, 2 rows, etc.
|
||||
for(int i = num_unroll_rows; i < num_A_rows; ++i) {
|
||||
const __m512i *A1_row = A + i * sse_width;
|
||||
for(int j = 0; j < num_B_rows; j++) {
|
||||
__m512i sum1 = _mm512_setzero_si512();
|
||||
for(int k = 0; k < sse_width; k++) {
|
||||
const __m512i *B_row = B + j * sse_width;
|
||||
__m512i b = *(B_row + k);
|
||||
__m512i a1 = *(A1_row + k);
|
||||
sum1 = _mm512_add_epi32(sum1, _mm512_madd_epi16(b, a1));
|
||||
}
|
||||
// TODO is there a more efficient way?
|
||||
*(C + (i)*num_B_rows + j)
|
||||
= unquant_mult * static_cast<float>(Reduce32(sum1));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
namespace {
|
||||
|
||||
/* Three ways considered to apply sign bits:
|
||||
* 1. Use 256-bit sign instruction:
|
||||
* __m256i a_first = _mm256_sign_epi8(_mm512_castsi512_si256(a),
|
||||
* _mm512_castsi512_si256(b));
|
||||
* __m256i a_second = _mm256_sign_epi8(_mm512_extracti64x4_epi64(a, 1),
|
||||
* b_second); a = _mm512_inserti64x4(_mm512_castsi256_si512(a_first), a_second,
|
||||
* 1); a = Concat(a_first, a_second);
|
||||
*
|
||||
* 2. Extract a mask and xor + 1
|
||||
* __mmask64 neg_mask _mm512_test_epi8_mask(b, _mm512_set1_epi8(-128));
|
||||
* Use set1 to to build to_xor
|
||||
* a = _mm512_xor_si512(a, to_xor)
|
||||
* And add one:
|
||||
* const __m512i ones8 = _mm512_set1_epi8(1);
|
||||
* a = _mm512_mask_add_epi8(a, neg_mask, a, ones8);
|
||||
*
|
||||
* 3. Extract a mask and subtract from 0
|
||||
* In the outer loop on b:
|
||||
* __mmask64 neg_mask _mm512_test_epi8_mask(b, _mm512_set1_epi8(-128))
|
||||
* For each a:
|
||||
* a = _mm512_mask_sub_epi8(a, neg_mask, _mm512_setzero_si512(), a);
|
||||
*
|
||||
* Finally, subtraction won the benchmark
|
||||
*/
|
||||
inline void Accum(const __m512i zeros,
|
||||
__m512i a,
|
||||
const __m512i b,
|
||||
const __m512i b_positive,
|
||||
const __mmask64 neg_mask,
|
||||
__m512i &sum) {
|
||||
// Apply sign bits.
|
||||
a = _mm512_mask_sub_epi8(a, neg_mask, zeros, a);
|
||||
// The magic 8-bit multiply then horizontal sum into 16-bit.
|
||||
__m512i multiplied = _mm512_maddubs_epi16(b_positive, a);
|
||||
// Now we have 16-bit results that are the sum of two multiplies.
|
||||
// Choosing to approximate and do adds.
|
||||
// Perhaps every so often we could accumulate by Convert32Sum
|
||||
sum = _mm512_adds_epi16(sum, multiplied);
|
||||
b; // make compiler happy
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
void AVX_MatrixMult8(const __m512i *A,
|
||||
const __m512i *B,
|
||||
float *C,
|
||||
float unquant_mult,
|
||||
int num_A_rows,
|
||||
int num_B_rows,
|
||||
int width) {
|
||||
assert(width % 32 == 0);
|
||||
assert(reinterpret_cast<uintptr_t>(A) % 64 == 0);
|
||||
assert(reinterpret_cast<uintptr_t>(B) % 64 == 0);
|
||||
ScatterPut put(unquant_mult, num_B_rows);
|
||||
const __m512i zeros = _mm512_setzero_si512();
|
||||
|
||||
const int sse_width = width / 64;
|
||||
int i = 0;
|
||||
int mult8rows = num_A_rows & (~7);
|
||||
|
||||
for(; i < mult8rows; i += 8) {
|
||||
const __m512i *A1_row = A + (i + 0) * sse_width;
|
||||
const __m512i *A2_row = A + (i + 1) * sse_width;
|
||||
const __m512i *A3_row = A + (i + 2) * sse_width;
|
||||
const __m512i *A4_row = A + (i + 3) * sse_width;
|
||||
const __m512i *A5_row = A + (i + 4) * sse_width;
|
||||
const __m512i *A6_row = A + (i + 5) * sse_width;
|
||||
const __m512i *A7_row = A + (i + 6) * sse_width;
|
||||
const __m512i *A8_row = A + (i + 7) * sse_width;
|
||||
for(int j = 0; j < num_B_rows; j++) {
|
||||
const __m512i *B_row = B + j * sse_width;
|
||||
__m512i sum1 = _mm512_setzero_si512();
|
||||
__m512i sum2 = _mm512_setzero_si512();
|
||||
__m512i sum3 = _mm512_setzero_si512();
|
||||
__m512i sum4 = _mm512_setzero_si512();
|
||||
__m512i sum5 = _mm512_setzero_si512();
|
||||
__m512i sum6 = _mm512_setzero_si512();
|
||||
__m512i sum7 = _mm512_setzero_si512();
|
||||
__m512i sum8 = _mm512_setzero_si512();
|
||||
for(int k = 0; k < sse_width; k++) {
|
||||
__m512i b = *(B_row + k);
|
||||
__m512i b_positive = _mm512_abs_epi8(b);
|
||||
/* Didn't seem to make a difference definining sign bits here vs at top
|
||||
*/
|
||||
__mmask64 neg_mask = _mm512_test_epi8_mask(b, _mm512_set1_epi8(-128));
|
||||
Accum(zeros, *(A1_row + k), b, b_positive, neg_mask, sum1);
|
||||
Accum(zeros, *(A2_row + k), b, b_positive, neg_mask, sum2);
|
||||
Accum(zeros, *(A3_row + k), b, b_positive, neg_mask, sum3);
|
||||
Accum(zeros, *(A4_row + k), b, b_positive, neg_mask, sum4);
|
||||
Accum(zeros, *(A5_row + k), b, b_positive, neg_mask, sum5);
|
||||
Accum(zeros, *(A6_row + k), b, b_positive, neg_mask, sum6);
|
||||
Accum(zeros, *(A7_row + k), b, b_positive, neg_mask, sum7);
|
||||
Accum(zeros, *(A8_row + k), b, b_positive, neg_mask, sum8);
|
||||
}
|
||||
put.Write(C + i * num_B_rows + j, Reduce16to32(sum1, sum2, sum3, sum4));
|
||||
put.Write(C + (i + 4) * num_B_rows + j,
|
||||
Reduce16to32(sum5, sum6, sum7, sum8));
|
||||
}
|
||||
}
|
||||
|
||||
const __m512i *A1_row = A + (i + 0) * sse_width;
|
||||
const __m512i *A2_row = A + (i + 1) * sse_width;
|
||||
const __m512i *A3_row = A + (i + 2) * sse_width;
|
||||
const __m512i *A4_row = A + (i + 3) * sse_width;
|
||||
const __m512i *A5_row = A + (i + 4) * sse_width;
|
||||
const __m512i *A6_row = A + (i + 5) * sse_width;
|
||||
const __m512i *A7_row = A + (i + 6) * sse_width;
|
||||
switch(num_A_rows & 7) {
|
||||
case 7:
|
||||
for(int j = 0; j < num_B_rows; j++) {
|
||||
const __m512i *B_row = B + j * sse_width;
|
||||
__m512i sum1 = _mm512_setzero_si512();
|
||||
__m512i sum2 = _mm512_setzero_si512();
|
||||
__m512i sum3 = _mm512_setzero_si512();
|
||||
__m512i sum4 = _mm512_setzero_si512();
|
||||
__m512i sum5 = _mm512_setzero_si512();
|
||||
__m512i sum6 = _mm512_setzero_si512();
|
||||
__m512i sum7 = _mm512_setzero_si512();
|
||||
for(int k = 0; k < sse_width; k++) {
|
||||
__m512i b = *(B_row + k);
|
||||
__m512i b_positive = _mm512_abs_epi8(b);
|
||||
__mmask64 neg_mask = _mm512_test_epi8_mask(b, _mm512_set1_epi8(-128));
|
||||
Accum(zeros, *(A1_row + k), b, b_positive, neg_mask, sum1);
|
||||
Accum(zeros, *(A2_row + k), b, b_positive, neg_mask, sum2);
|
||||
Accum(zeros, *(A3_row + k), b, b_positive, neg_mask, sum3);
|
||||
Accum(zeros, *(A4_row + k), b, b_positive, neg_mask, sum4);
|
||||
Accum(zeros, *(A5_row + k), b, b_positive, neg_mask, sum5);
|
||||
Accum(zeros, *(A6_row + k), b, b_positive, neg_mask, sum6);
|
||||
Accum(zeros, *(A7_row + k), b, b_positive, neg_mask, sum7);
|
||||
}
|
||||
put.Write(C + i * num_B_rows + j, Reduce16to32(sum1, sum2, sum3, sum4));
|
||||
put.Write(C + (i + 4) * num_B_rows + j, Reduce16to32(sum5, sum6));
|
||||
put.Write(C + (i + 6) * num_B_rows + j, Reduce16to32(sum7));
|
||||
}
|
||||
/* fall through */
|
||||
case 6:
|
||||
for(int j = 0; j < num_B_rows; j++) {
|
||||
const __m512i *B_row = B + j * sse_width;
|
||||
__m512i sum1 = _mm512_setzero_si512();
|
||||
__m512i sum2 = _mm512_setzero_si512();
|
||||
__m512i sum3 = _mm512_setzero_si512();
|
||||
__m512i sum4 = _mm512_setzero_si512();
|
||||
__m512i sum5 = _mm512_setzero_si512();
|
||||
__m512i sum6 = _mm512_setzero_si512();
|
||||
for(int k = 0; k < sse_width; k++) {
|
||||
__m512i b = *(B_row + k);
|
||||
__m512i b_positive = _mm512_abs_epi8(b);
|
||||
__mmask64 neg_mask = _mm512_test_epi8_mask(b, _mm512_set1_epi8(-128));
|
||||
Accum(zeros, *(A1_row + k), b, b_positive, neg_mask, sum1);
|
||||
Accum(zeros, *(A2_row + k), b, b_positive, neg_mask, sum2);
|
||||
Accum(zeros, *(A3_row + k), b, b_positive, neg_mask, sum3);
|
||||
Accum(zeros, *(A4_row + k), b, b_positive, neg_mask, sum4);
|
||||
Accum(zeros, *(A5_row + k), b, b_positive, neg_mask, sum5);
|
||||
Accum(zeros, *(A6_row + k), b, b_positive, neg_mask, sum6);
|
||||
}
|
||||
put.Write(C + i * num_B_rows + j, Reduce16to32(sum1, sum2, sum3, sum4));
|
||||
put.Write(C + (i + 4) * num_B_rows + j, Reduce16to32(sum5, sum6));
|
||||
}
|
||||
/* fall through */
|
||||
case 5:
|
||||
for(int j = 0; j < num_B_rows; j++) {
|
||||
const __m512i *B_row = B + j * sse_width;
|
||||
__m512i sum1 = _mm512_setzero_si512();
|
||||
__m512i sum2 = _mm512_setzero_si512();
|
||||
__m512i sum3 = _mm512_setzero_si512();
|
||||
__m512i sum4 = _mm512_setzero_si512();
|
||||
__m512i sum5 = _mm512_setzero_si512();
|
||||
for(int k = 0; k < sse_width; k++) {
|
||||
__m512i b = *(B_row + k);
|
||||
__m512i b_positive = _mm512_abs_epi8(b);
|
||||
__mmask64 neg_mask = _mm512_test_epi8_mask(b, _mm512_set1_epi8(-128));
|
||||
Accum(zeros, *(A1_row + k), b, b_positive, neg_mask, sum1);
|
||||
Accum(zeros, *(A2_row + k), b, b_positive, neg_mask, sum2);
|
||||
Accum(zeros, *(A3_row + k), b, b_positive, neg_mask, sum3);
|
||||
Accum(zeros, *(A4_row + k), b, b_positive, neg_mask, sum4);
|
||||
Accum(zeros, *(A5_row + k), b, b_positive, neg_mask, sum5);
|
||||
}
|
||||
put.Write(C + i * num_B_rows + j, Reduce16to32(sum1, sum2, sum3, sum4));
|
||||
put.Write(C + (i + 4) * num_B_rows + j, Reduce16to32(sum5));
|
||||
}
|
||||
/* fall through */
|
||||
case 4:
|
||||
for(int j = 0; j < num_B_rows; j++) {
|
||||
const __m512i *B_row = B + j * sse_width;
|
||||
__m512i sum1 = _mm512_setzero_si512();
|
||||
__m512i sum2 = _mm512_setzero_si512();
|
||||
__m512i sum3 = _mm512_setzero_si512();
|
||||
__m512i sum4 = _mm512_setzero_si512();
|
||||
for(int k = 0; k < sse_width; k++) {
|
||||
__m512i b = *(B_row + k);
|
||||
__m512i b_positive = _mm512_abs_epi8(b);
|
||||
__mmask64 neg_mask = _mm512_test_epi8_mask(b, _mm512_set1_epi8(-128));
|
||||
Accum(zeros, *(A1_row + k), b, b_positive, neg_mask, sum1);
|
||||
Accum(zeros, *(A2_row + k), b, b_positive, neg_mask, sum2);
|
||||
Accum(zeros, *(A3_row + k), b, b_positive, neg_mask, sum3);
|
||||
Accum(zeros, *(A4_row + k), b, b_positive, neg_mask, sum4);
|
||||
}
|
||||
put.Write(C + i * num_B_rows + j, Reduce16to32(sum1, sum2, sum3, sum4));
|
||||
}
|
||||
/* fall through */
|
||||
case 3:
|
||||
for(int j = 0; j < num_B_rows; j++) {
|
||||
const __m512i *B_row = B + j * sse_width;
|
||||
__m512i sum1 = _mm512_setzero_si512();
|
||||
__m512i sum2 = _mm512_setzero_si512();
|
||||
__m512i sum3 = _mm512_setzero_si512();
|
||||
for(int k = 0; k < sse_width; k++) {
|
||||
__m512i b = *(B_row + k);
|
||||
__m512i b_positive = _mm512_abs_epi8(b);
|
||||
__mmask64 neg_mask = _mm512_test_epi8_mask(b, _mm512_set1_epi8(-128));
|
||||
Accum(zeros, *(A1_row + k), b, b_positive, neg_mask, sum1);
|
||||
Accum(zeros, *(A2_row + k), b, b_positive, neg_mask, sum2);
|
||||
Accum(zeros, *(A3_row + k), b, b_positive, neg_mask, sum3);
|
||||
}
|
||||
put.Write(C + i * num_B_rows + j, Reduce16to32(sum1, sum2));
|
||||
put.Write(C + (i + 2) * num_B_rows + j, Reduce16to32(sum3));
|
||||
}
|
||||
/* fall through */
|
||||
case 2:
|
||||
for(int j = 0; j < num_B_rows; j++) {
|
||||
const __m512i *B_row = B + j * sse_width;
|
||||
__m512i sum1 = _mm512_setzero_si512();
|
||||
__m512i sum2 = _mm512_setzero_si512();
|
||||
for(int k = 0; k < sse_width; k++) {
|
||||
__m512i b = *(B_row + k);
|
||||
__m512i b_positive = _mm512_abs_epi8(b);
|
||||
__mmask64 neg_mask = _mm512_test_epi8_mask(b, _mm512_set1_epi8(-128));
|
||||
Accum(zeros, *(A1_row + k), b, b_positive, neg_mask, sum1);
|
||||
Accum(zeros, *(A2_row + k), b, b_positive, neg_mask, sum2);
|
||||
}
|
||||
put.Write(C + i * num_B_rows + j, Reduce16to32(sum1, sum2));
|
||||
}
|
||||
/* fall through */
|
||||
case 1:
|
||||
for(int j = 0; j < num_B_rows; j++) {
|
||||
const __m512i *B_row = B + j * sse_width;
|
||||
__m512i sum1 = _mm512_setzero_si512();
|
||||
for(int k = 0; k < sse_width; k++) {
|
||||
__m512i b = *(B_row + k);
|
||||
__m512i b_positive = _mm512_abs_epi8(b);
|
||||
__mmask64 neg_mask = _mm512_test_epi8_mask(b, _mm512_set1_epi8(-128));
|
||||
Accum(zeros, *(A1_row + k), b, b_positive, neg_mask, sum1);
|
||||
}
|
||||
put.Write(C + i * num_B_rows + j, Reduce16to32(sum1));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace int16
|
||||
} // namespace cpu
|
||||
} // namespace marian
|
||||
#endif
|
@ -1,187 +0,0 @@
|
||||
#include "int_gemm.h"
|
||||
#include "tensors/tensor_allocator.h"
|
||||
#include "tensors/tensor_operators.h"
|
||||
|
||||
#include <emmintrin.h>
|
||||
#include <immintrin.h>
|
||||
#include <tmmintrin.h>
|
||||
#include <xmmintrin.h>
|
||||
#include <cassert>
|
||||
#include <cstddef>
|
||||
|
||||
namespace marian {
|
||||
namespace cpu {
|
||||
namespace int16 {
|
||||
|
||||
#ifdef __AVX512F__
|
||||
void AVX_Quantize16(const float* input,
|
||||
int16_t* output,
|
||||
float quant_mult,
|
||||
std::size_t size);
|
||||
|
||||
void AVX_Quantize8(const float* input,
|
||||
int8_t* output,
|
||||
float quant_mult,
|
||||
std::size_t size);
|
||||
|
||||
void AVX_MatrixMult16(const __m512i* A,
|
||||
const __m512i* B,
|
||||
float* C,
|
||||
float unquant_mult,
|
||||
int num_A_rows,
|
||||
int num_B_rows,
|
||||
int width);
|
||||
|
||||
void AVX_MatrixMult8(const __m512i* A,
|
||||
const __m512i* B,
|
||||
float* C,
|
||||
float unquant_mult,
|
||||
int num_A_rows,
|
||||
int num_B_rows,
|
||||
int width);
|
||||
#endif
|
||||
|
||||
void SSE_Quantize16(const float* input,
|
||||
__m128i* output,
|
||||
float quant_mult,
|
||||
int num_rows,
|
||||
int width);
|
||||
|
||||
void SSE_MatrixMult16(const __m128i* A,
|
||||
const __m128i* B,
|
||||
float* C,
|
||||
float unquant_mult,
|
||||
int num_A_rows,
|
||||
int num_B_rows,
|
||||
int width);
|
||||
|
||||
void Quantize16(marian::Tensor out,
|
||||
const marian::Tensor in,
|
||||
float /*clipValue*/) {
|
||||
float quant_mult = (float)pow(2.0, BITS);
|
||||
#ifdef __AVX512F__
|
||||
AVX_Quantize16(
|
||||
in->data(), out->data<int16_t>(), quant_mult, in->shape().elements());
|
||||
#else
|
||||
int num_rows = in->shape().elements() / in->shape()[-1];
|
||||
int width = in->shape()[-1];
|
||||
SSE_Quantize16(in->data(), out->data<__m128i>(), quant_mult, num_rows, width);
|
||||
#endif
|
||||
}
|
||||
|
||||
void Quantize8(marian::Tensor out,
|
||||
const marian::Tensor in,
|
||||
float clipValue) {
|
||||
#ifdef __AVX512F__
|
||||
float quant_mult = 127.0f / clipValue;
|
||||
AVX_Quantize8(
|
||||
in->data(), out->data<int8_t>(), quant_mult, in->shape().elements());
|
||||
#else
|
||||
out; in; clipValue;
|
||||
ABORT("8-bit is currently only AVX512");
|
||||
#endif
|
||||
}
|
||||
|
||||
// This operates on floats after processing so doesn't care about int8_t vs
|
||||
// int16_t.
|
||||
void AddBias(marian::Tensor C, const marian::Tensor Bias) {
|
||||
float* y = C->data();
|
||||
const float* x = C->data();
|
||||
const float* bias = Bias->data();
|
||||
|
||||
const int m = C->shape().elements() / C->shape()[-1];
|
||||
const int n = C->shape()[-1];
|
||||
|
||||
for(int j = 0; j < m; ++j) {
|
||||
int i = 0;
|
||||
#ifdef __AVX512F__
|
||||
int n16 = n & ~15;
|
||||
for(; i < n16; i += 16) {
|
||||
__m512 ai = _mm512_loadu_ps(x + j * n + i);
|
||||
__m512 bi = _mm512_loadu_ps(bias + i);
|
||||
__m512 yi = _mm512_add_ps(ai, bi);
|
||||
_mm512_storeu_ps(y + j * n + i, yi);
|
||||
}
|
||||
#else
|
||||
int n4 = (n / 4) * 4;
|
||||
for(; i < n4; i += 4) {
|
||||
__m128 ai = _mm_loadu_ps(x + j * n + i);
|
||||
__m128 bi = _mm_loadu_ps(bias + i);
|
||||
__m128 yi = _mm_add_ps(ai, bi);
|
||||
_mm_storeu_ps(y + j * n + i, yi);
|
||||
}
|
||||
#endif
|
||||
for(; i < n; i++) {
|
||||
y[j * n + i] = x[j * n + i] + bias[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void ProdInt16(marian::Tensor C,
|
||||
const marian::Tensor A,
|
||||
const marian::Tensor B,
|
||||
float scale) {
|
||||
ABORT_IF(scale != 1, "Scale other than 1 not supported");
|
||||
|
||||
// @TODO: make this a parameter
|
||||
float quant_mult = (float)pow(2.0, BITS);
|
||||
|
||||
// If we quantize to n bits and then multiple the values together, the result
|
||||
// will be quantized to n^2 bits. So we must divide by 1.0/(n^2) to get back
|
||||
// the original value.
|
||||
float unquant_mult = 1.0f / (quant_mult * quant_mult);
|
||||
|
||||
float* fC = C->data();
|
||||
int num_A_rows = A->shape().elements() / A->shape()[-1];
|
||||
int num_B_rows = B->shape().elements() / B->shape()[-1];
|
||||
int width = B->shape()[-1];
|
||||
#ifdef __AVX512F__
|
||||
AVX_MatrixMult16(A->data<__m512i>(),
|
||||
B->data<__m512i>(),
|
||||
fC,
|
||||
unquant_mult,
|
||||
num_A_rows,
|
||||
num_B_rows,
|
||||
width);
|
||||
#else
|
||||
SSE_MatrixMult16(A->data<__m128i>(),
|
||||
B->data<__m128i>(),
|
||||
fC,
|
||||
unquant_mult,
|
||||
num_A_rows,
|
||||
num_B_rows,
|
||||
width);
|
||||
#endif
|
||||
}
|
||||
|
||||
void ProdInt8(marian::Tensor C,
|
||||
const marian::Tensor A,
|
||||
const marian::Tensor B,
|
||||
float scale,
|
||||
float clipValue) {
|
||||
#ifdef __AVX512F__
|
||||
// This would be easy...
|
||||
ABORT_IF(scale != 1, "Scale other than 1 not supported");
|
||||
float quant_mult = 127.0f / clipValue;
|
||||
float unquant_mult = 1.0f / (quant_mult * quant_mult);
|
||||
|
||||
float* fC = C->data();
|
||||
int num_A_rows = A->shape().elements() / A->shape()[-1];
|
||||
int num_B_rows = B->shape().elements() / B->shape()[-1];
|
||||
int width = B->shape()[-1];
|
||||
AVX_MatrixMult8(A->data<__m512i>(),
|
||||
B->data<__m512i>(),
|
||||
fC,
|
||||
unquant_mult,
|
||||
num_A_rows,
|
||||
num_B_rows,
|
||||
width);
|
||||
#else
|
||||
C; A; B; scale; clipValue;
|
||||
ABORT("8-bit is currently only AVX512");
|
||||
#endif
|
||||
}
|
||||
|
||||
} // namespace int16
|
||||
} // namespace cpu
|
||||
} // namespace marian
|
@ -1,36 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include "tensors/tensor.h"
|
||||
|
||||
namespace marian {
|
||||
namespace cpu {
|
||||
namespace int16 {
|
||||
|
||||
const int BITS = 10;
|
||||
|
||||
void Quantize16(marian::Tensor out,
|
||||
const marian::Tensor in,
|
||||
float /*clipValue*/);
|
||||
|
||||
void Quantize8(marian::Tensor out,
|
||||
const marian::Tensor in,
|
||||
float clipValue);
|
||||
|
||||
// This operates on floats after processing so doesn't care about int8_t vs
|
||||
// int16_t.
|
||||
void AddBias(marian::Tensor C, const marian::Tensor Bias);
|
||||
|
||||
void ProdInt16(marian::Tensor C,
|
||||
const marian::Tensor A,
|
||||
const marian::Tensor B,
|
||||
float scale);
|
||||
|
||||
void ProdInt8(marian::Tensor C,
|
||||
const marian::Tensor A,
|
||||
const marian::Tensor B,
|
||||
float scale,
|
||||
float clipValue);
|
||||
|
||||
} // namespace int16
|
||||
} // namespace cpu
|
||||
} // namespace marian
|
@ -1,341 +0,0 @@
|
||||
// Copyright (c) 2017 Microsoft Corporation
|
||||
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
|
||||
// The above copyright notice and this permission notice shall be included in
|
||||
// all copies or substantial portions of the Software.
|
||||
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
#include <emmintrin.h>
|
||||
#include <immintrin.h>
|
||||
#include <math.h>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <tmmintrin.h>
|
||||
#include <xmmintrin.h>
|
||||
#include <cassert>
|
||||
|
||||
namespace marian {
|
||||
namespace cpu {
|
||||
namespace int16 {
|
||||
|
||||
// This is a reference implementation of 16-bit matrix multiplication described
|
||||
// in "Sharp Models on Dull Hardware: Fast and Accurate Neural Machine
|
||||
// Translation Decoding on the CPU". This model is not as fast as the one in the
|
||||
// paper, becuase it uses SSE2 instead of AVX2. AVX2 instructions are only
|
||||
// available on more modern CPUs (Haswell or later). The only difference between
|
||||
// SSE2 and AVX2 is that SSE operates on 128-bit vectors and AVX2 operates on
|
||||
// 256-bit vecetors. So AVX2 can fit 16 16-bit integers intead of 8 8-bit
|
||||
// integers. The algorithm is the same, you just replace these instructions with
|
||||
// their 256-bit counterpart, i.e., _mm256_add_epi32, _mm256_madd_epi16,
|
||||
// _mm256_hadd_epi32, ... Additional improvements can also be made from
|
||||
// unrolling the for loop over num_B_rows in SSE_MatrixMult, which is not done
|
||||
// here for clarity.
|
||||
|
||||
// ***************************************
|
||||
// ************** IMPORTANT **************
|
||||
// ***************************************
|
||||
// The biggest "gotcha" when using this type of multiplication is dealing with
|
||||
// overflow related to quantization. It is NOT enough to simply ensure that A
|
||||
// and B fit into 16 bit integers. If A and B are quantized with $n$ bits, the
|
||||
// result of multiplying them together will be quantized to $n^2$ bits. So if
|
||||
// they are near the boundary of the 16-bit mark, then the result will be near
|
||||
// 32-bits and overflow. However, if we use, say, n = 10 bits, then the product
|
||||
// is 20 bits. This gives us 12 bits left over for the accumulation. So as long
|
||||
// as the width of the common dimension is less than 2^12 = 4096, it is
|
||||
// *impossible* to overflow. If we used, say, n = 12 bits, then we have
|
||||
// 32-(12*2) = 8 bits left over. So we *could* overflow if width > 2^8.
|
||||
//
|
||||
// So, the tradeoff is between quantization precision and possibility of
|
||||
// overflow. A good general value is 10 bits, since this gives high precision
|
||||
// (precision is 1/2^10 ~= 0.001, which is more than what's needed for almost
|
||||
// all neural nets), and cannot overflow unless the matrix width is > 4096.
|
||||
|
||||
// This quantizes floating point values into fixed-point 16-bit integers.
|
||||
// Effectively, we are performing an SSE version of float x = ...; int16_t y =
|
||||
// (int16_t)(quant_mult*x);
|
||||
//
|
||||
// Except that the casting is saturated. However, you should always ensure that
|
||||
// the input fits into a fixed range anyways. I.e., you should ensure that
|
||||
// quant_mult*x fits into the range [-2^15, 2^15]. This should always be
|
||||
// possible because the value you're quantizing will either be NN weights or NN
|
||||
// activations, both of which can be clipped to a fixed range during training.
|
||||
|
||||
void SSE_Quantize16(const float* input,
|
||||
__m128i* output,
|
||||
float quant_mult,
|
||||
int num_rows,
|
||||
int width) {
|
||||
assert(width % 8 == 0);
|
||||
|
||||
int num_input_chunks = width / 8;
|
||||
|
||||
// Fill an SSE float with 4 copies of the quant mult
|
||||
__m128 sse_quant_mult
|
||||
= _mm_set_ps(quant_mult, quant_mult, quant_mult, quant_mult);
|
||||
|
||||
for(int i = 0; i < num_rows; i++) {
|
||||
const float* input_row = input + i * width;
|
||||
__m128i* output_row = output + i * num_input_chunks;
|
||||
for(int j = 0; j < num_input_chunks; j++) {
|
||||
const float* x = input_row + j * 8;
|
||||
// Process 8 floats at once, since each __m128i can contain 8 16-bit
|
||||
// integers.
|
||||
|
||||
// Load floats floats into SSE registers.
|
||||
__m128 f_0 = _mm_loadu_ps(x);
|
||||
__m128 f_1 = _mm_loadu_ps(x + 4);
|
||||
|
||||
// Multiply by quantization factor (e.g., if quant_mult = 1000.0, 0.34291
|
||||
// --> 342.21)
|
||||
__m128 m_0 = _mm_mul_ps(f_0, sse_quant_mult);
|
||||
__m128 m_1 = _mm_mul_ps(f_1, sse_quant_mult);
|
||||
|
||||
// Cast float to 32-bit int (e.g., 342.21 --> 342)
|
||||
__m128i i_0 = _mm_cvtps_epi32(m_0);
|
||||
__m128i i_1 = _mm_cvtps_epi32(m_1);
|
||||
|
||||
// Cast 32-bit int to 16-bit int. You must ensure that these fit into the
|
||||
// 16-bit range by clipping values during training.
|
||||
*(output_row + j) = _mm_packs_epi32(i_0, i_1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// We are multiplying A * B^T, as opposed to A * B. This is important because it
|
||||
// means we can do consecutive memory access on A * B^T which allows to to take
|
||||
// the most advantage of L1 cache.
|
||||
//
|
||||
// B is typically a weight matrix, so it can be pre-processed offline, and
|
||||
// therefore this transpose does not cost anything. A is typically an activation
|
||||
// minibatch matrix.
|
||||
void SSE_MatrixMult16(const __m128i* qA,
|
||||
const __m128i* qB,
|
||||
float* fC,
|
||||
float unquant_mult,
|
||||
int num_A_rows,
|
||||
int num_B_rows,
|
||||
int width) {
|
||||
assert(width % 8 == 0);
|
||||
|
||||
int sse_width = width / 8;
|
||||
|
||||
// We do loop unrolling over A. This is *significantly* faster
|
||||
// since B can live in the registers. We are assuming that
|
||||
// A is a multiple of 4, but we can add extra code to handle values of 1,
|
||||
// 2, 3.
|
||||
//
|
||||
// We could also do loop unrolling over B, which adds some additional speedup.
|
||||
// We don't do that for the sake of clarity.
|
||||
//
|
||||
// There are other memory access patterns we could do, e.g., put B on the
|
||||
// outer loop. The justification is that A is typically small enough that it
|
||||
// can live in L1 cache. B is usually a larger weight matrix, so it might not
|
||||
// be able to. However, we are using each element of B four times while it's
|
||||
// still in a register, so caching is not as important.
|
||||
|
||||
int mult4 = (num_A_rows / 4) * 4;
|
||||
int rest = num_A_rows % 4;
|
||||
|
||||
int i = 0;
|
||||
for(; i < mult4; i += 4) {
|
||||
const __m128i* A1_row = qA + (i + 0) * sse_width;
|
||||
const __m128i* A2_row = qA + (i + 1) * sse_width;
|
||||
const __m128i* A3_row = qA + (i + 2) * sse_width;
|
||||
const __m128i* A4_row = qA + (i + 3) * sse_width;
|
||||
|
||||
for(int j = 0; j < num_B_rows; j++) {
|
||||
const __m128i* B_row = qB + j * sse_width;
|
||||
|
||||
__m128i sum1 = _mm_setzero_si128();
|
||||
__m128i sum2 = _mm_setzero_si128();
|
||||
__m128i sum3 = _mm_setzero_si128();
|
||||
__m128i sum4 = _mm_setzero_si128();
|
||||
|
||||
// This is just a simple dot product, unrolled four ways.
|
||||
for(int k = 0; k < sse_width; k++) {
|
||||
__m128i b = *(B_row + k);
|
||||
|
||||
__m128i a1 = *(A1_row + k);
|
||||
__m128i a2 = *(A2_row + k);
|
||||
__m128i a3 = *(A3_row + k);
|
||||
__m128i a4 = *(A4_row + k);
|
||||
|
||||
// _mm_madd_epi16 does multiply add on 8 16-bit integers and accumulates
|
||||
// into a four 32-bit register. E.g., a1 = [f1, f2, f3, f4, f5, f6, f7,
|
||||
// h8] (16-bit ints) b1 = [h1, h2, h3, h4, h5, h6, h7, h8] (16-bit ints)
|
||||
// result = [f1*h1 + f2*h2, f3*h3 + f4*h4, f5*h5 + f6*h6, f7*h7 + f8*h8]
|
||||
// (32-bit ints) Then _mm_add_epi32 just effectively does a += on these
|
||||
// 32-bit integers.
|
||||
sum1 = _mm_add_epi32(sum1, _mm_madd_epi16(b, a1));
|
||||
sum2 = _mm_add_epi32(sum2, _mm_madd_epi16(b, a2));
|
||||
sum3 = _mm_add_epi32(sum3, _mm_madd_epi16(b, a3));
|
||||
sum4 = _mm_add_epi32(sum4, _mm_madd_epi16(b, a4));
|
||||
}
|
||||
|
||||
// We now have each sum spread across 4 32-bit ints in SSE register, e.g.,
|
||||
// sum1 = [r1, r2, r3, r4]. We need to compute r1 + r2 + r3 + r4.
|
||||
//
|
||||
// This uses 'horizontal add' to do that efficiently. The first add gets
|
||||
// us [r1 + r2, r2 + r3, r1 + r2, r2 + r3] Then the second gets us. [r1 +
|
||||
// r2 + r2 + r3, r2 + r3 + r1 + r2, r1 + r2 + r2 + r3, r2 + r3 + r1 + r2]
|
||||
// E.g., each 32-bit in contains the full sum.
|
||||
sum1 = _mm_hadd_epi32(sum1, sum1);
|
||||
sum1 = _mm_hadd_epi32(sum1, sum1);
|
||||
sum2 = _mm_hadd_epi32(sum2, sum2);
|
||||
sum2 = _mm_hadd_epi32(sum2, sum2);
|
||||
sum3 = _mm_hadd_epi32(sum3, sum3);
|
||||
sum3 = _mm_hadd_epi32(sum3, sum3);
|
||||
sum4 = _mm_hadd_epi32(sum4, sum4);
|
||||
sum4 = _mm_hadd_epi32(sum4, sum4);
|
||||
|
||||
float* C1 = fC + (i + 0) * num_B_rows + j;
|
||||
float* C2 = fC + (i + 1) * num_B_rows + j;
|
||||
float* C3 = fC + (i + 2) * num_B_rows + j;
|
||||
float* C4 = fC + (i + 3) * num_B_rows + j;
|
||||
|
||||
// Now that we have the full sum in each 32-bit register, we convert them
|
||||
// to an integer with _mm_cvtepi32_ps and take the first one with
|
||||
// _mm_store_ss. We don't use an SSE instruction to unquantize, although
|
||||
// we could. It doesn't really matter since most of the computation is in
|
||||
// the above loop over the width.
|
||||
//
|
||||
// Also note that the memory acceses on C are not consecutive, but this is
|
||||
// a tradeoff that we have to make. We can't have consecutive accesses of
|
||||
// qA, qB, *and* C. But we access qA and qB a lot more so it makes sense
|
||||
// to do it this way.
|
||||
_mm_store_ss(C1, _mm_cvtepi32_ps(sum1));
|
||||
*(C1) *= unquant_mult;
|
||||
|
||||
_mm_store_ss(C2, _mm_cvtepi32_ps(sum2));
|
||||
*(C2) *= unquant_mult;
|
||||
|
||||
_mm_store_ss(C3, _mm_cvtepi32_ps(sum3));
|
||||
*(C3) *= unquant_mult;
|
||||
|
||||
_mm_store_ss(C4, _mm_cvtepi32_ps(sum4));
|
||||
*(C4) *= unquant_mult;
|
||||
}
|
||||
}
|
||||
if(rest == 1) {
|
||||
const __m128i* A1_row = qA + (i + 0) * sse_width;
|
||||
|
||||
for(int j = 0; j < num_B_rows; j++) {
|
||||
const __m128i* B_row = qB + j * sse_width;
|
||||
|
||||
__m128i sum1 = _mm_setzero_si128();
|
||||
|
||||
// This is just a simple dot product, unrolled four ways.
|
||||
for(int k = 0; k < sse_width; k++) {
|
||||
__m128i b = *(B_row + k);
|
||||
|
||||
__m128i a1 = *(A1_row + k);
|
||||
sum1 = _mm_add_epi32(sum1, _mm_madd_epi16(b, a1));
|
||||
}
|
||||
|
||||
sum1 = _mm_hadd_epi32(sum1, sum1);
|
||||
sum1 = _mm_hadd_epi32(sum1, sum1);
|
||||
|
||||
float* C1 = fC + (i + 0) * num_B_rows + j;
|
||||
|
||||
_mm_store_ss(C1, _mm_cvtepi32_ps(sum1));
|
||||
*(C1) *= unquant_mult;
|
||||
}
|
||||
} else if(rest == 2) {
|
||||
const __m128i* A1_row = qA + (i + 0) * sse_width;
|
||||
const __m128i* A2_row = qA + (i + 1) * sse_width;
|
||||
|
||||
for(int j = 0; j < num_B_rows; j++) {
|
||||
const __m128i* B_row = qB + j * sse_width;
|
||||
|
||||
__m128i sum1 = _mm_setzero_si128();
|
||||
__m128i sum2 = _mm_setzero_si128();
|
||||
|
||||
for(int k = 0; k < sse_width; k++) {
|
||||
__m128i b = *(B_row + k);
|
||||
|
||||
__m128i a1 = *(A1_row + k);
|
||||
__m128i a2 = *(A2_row + k);
|
||||
|
||||
sum1 = _mm_add_epi32(sum1, _mm_madd_epi16(b, a1));
|
||||
sum2 = _mm_add_epi32(sum2, _mm_madd_epi16(b, a2));
|
||||
}
|
||||
|
||||
sum1 = _mm_hadd_epi32(sum1, sum1);
|
||||
sum1 = _mm_hadd_epi32(sum1, sum1);
|
||||
sum2 = _mm_hadd_epi32(sum2, sum2);
|
||||
sum2 = _mm_hadd_epi32(sum2, sum2);
|
||||
|
||||
float* C1 = fC + (i + 0) * num_B_rows + j;
|
||||
float* C2 = fC + (i + 1) * num_B_rows + j;
|
||||
|
||||
_mm_store_ss(C1, _mm_cvtepi32_ps(sum1));
|
||||
*(C1) *= unquant_mult;
|
||||
|
||||
_mm_store_ss(C2, _mm_cvtepi32_ps(sum2));
|
||||
*(C2) *= unquant_mult;
|
||||
}
|
||||
} else if(rest == 3) {
|
||||
const __m128i* A1_row = qA + (i + 0) * sse_width;
|
||||
const __m128i* A2_row = qA + (i + 1) * sse_width;
|
||||
const __m128i* A3_row = qA + (i + 2) * sse_width;
|
||||
|
||||
for(int j = 0; j < num_B_rows; j++) {
|
||||
const __m128i* B_row = qB + j * sse_width;
|
||||
|
||||
__m128i sum1 = _mm_setzero_si128();
|
||||
__m128i sum2 = _mm_setzero_si128();
|
||||
__m128i sum3 = _mm_setzero_si128();
|
||||
|
||||
for(int k = 0; k < sse_width; k++) {
|
||||
__m128i b = *(B_row + k);
|
||||
|
||||
__m128i a1 = *(A1_row + k);
|
||||
__m128i a2 = *(A2_row + k);
|
||||
__m128i a3 = *(A3_row + k);
|
||||
|
||||
sum1 = _mm_add_epi32(sum1, _mm_madd_epi16(b, a1));
|
||||
sum2 = _mm_add_epi32(sum2, _mm_madd_epi16(b, a2));
|
||||
sum3 = _mm_add_epi32(sum3, _mm_madd_epi16(b, a3));
|
||||
}
|
||||
|
||||
sum1 = _mm_hadd_epi32(sum1, sum1);
|
||||
sum1 = _mm_hadd_epi32(sum1, sum1);
|
||||
sum2 = _mm_hadd_epi32(sum2, sum2);
|
||||
sum2 = _mm_hadd_epi32(sum2, sum2);
|
||||
sum3 = _mm_hadd_epi32(sum3, sum3);
|
||||
sum3 = _mm_hadd_epi32(sum3, sum3);
|
||||
|
||||
float* C1 = fC + (i + 0) * num_B_rows + j;
|
||||
float* C2 = fC + (i + 1) * num_B_rows + j;
|
||||
float* C3 = fC + (i + 2) * num_B_rows + j;
|
||||
|
||||
_mm_store_ss(C1, _mm_cvtepi32_ps(sum1));
|
||||
*(C1) *= unquant_mult;
|
||||
|
||||
_mm_store_ss(C2, _mm_cvtepi32_ps(sum2));
|
||||
*(C2) *= unquant_mult;
|
||||
|
||||
_mm_store_ss(C3, _mm_cvtepi32_ps(sum3));
|
||||
*(C3) *= unquant_mult;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace int16
|
||||
} // namespace cpu
|
||||
} // namespace marian
|
@ -66,16 +66,6 @@ public:
|
||||
|
||||
CudaCompute getCudaComputeCapability() { return compute_; }
|
||||
|
||||
// for CPU, sets to use optimized code for inference.
|
||||
// for GPU, this is invalid. for gpu, isOptimized() function always returns false.
|
||||
void setOptimized(bool optimize) override {
|
||||
LOG_ONCE(info, "setOptimized() not supported for GPU_{}", optimize);
|
||||
}
|
||||
|
||||
bool isOptimized() override {
|
||||
return false;
|
||||
}
|
||||
|
||||
private:
|
||||
cublasHandle_t cublasHandle_{0}; // make sure it's 0, so it can be initalized lazily
|
||||
cusparseHandle_t cusparseHandle_{0}; // as above
|
||||
|
@ -7,7 +7,9 @@ int main(int /*argc*/, char** /*argv*/) {
|
||||
{
|
||||
auto g = New<ExpressionGraph>(true);
|
||||
g->setDevice({0, DeviceType::cpu});
|
||||
g->getBackend()->setOptimized(false);
|
||||
#if 0 // this file is not a real test, just used for manual stuff. Disable here by hand for now.
|
||||
g->getBackend()->setInt16(false);
|
||||
#endif
|
||||
g->reserveWorkspaceMB(2512);
|
||||
|
||||
timer::AutoTimer timer;
|
||||
@ -40,7 +42,44 @@ int main(int /*argc*/, char** /*argv*/) {
|
||||
{
|
||||
auto g = New<ExpressionGraph>(true);
|
||||
g->setDevice({0, DeviceType::cpu});
|
||||
g->getBackend()->setOptimized(true);
|
||||
#if 0
|
||||
g->getBackend()->setInt16(true);
|
||||
#endif
|
||||
g->reserveWorkspaceMB(2512);
|
||||
|
||||
timer::AutoTimer timer;
|
||||
for(int i = 0; i < 100; ++i) {
|
||||
g->clear();
|
||||
|
||||
auto x = g->constant({1, 4, 8, 256}, inits::glorotUniform());
|
||||
|
||||
auto W1 = g->param("W1", {256, 2048}, inits::glorotUniform());
|
||||
auto b1 = g->param("b1", {1, 2048}, inits::glorotUniform());
|
||||
|
||||
auto out = affine(x, W1, b1);
|
||||
|
||||
for(int i = 2; i < 20; ++i) {
|
||||
auto Wi = g->param("W" + std::to_string(i), {2048, 2048}, inits::glorotUniform());
|
||||
auto bi = g->param("b" + std::to_string(i), {1, 2048}, inits::glorotUniform());
|
||||
|
||||
out = relu(affine(out, Wi, bi));
|
||||
}
|
||||
|
||||
auto Wn = g->param("Wn", {2048, 256}, inits::glorotUniform());
|
||||
auto bn = g->param("bn", {1, 256}, inits::glorotUniform());
|
||||
|
||||
auto y = affine(out, Wn, bn);
|
||||
|
||||
g->forward();
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
auto g = New<ExpressionGraph>(true);
|
||||
g->setDevice({0, DeviceType::cpu});
|
||||
#if 0
|
||||
g->getBackend()->setInt8(true);
|
||||
#endif
|
||||
g->reserveWorkspaceMB(2512);
|
||||
|
||||
timer::AutoTimer timer;
|
||||
|
@ -19,7 +19,6 @@ AsyncGraphGroup::AsyncGraphGroup(Ptr<Options> config, Ptr<IMPIWrapper> mpi)
|
||||
auto graph = New<ExpressionGraph>();
|
||||
graph->setDevice(device);
|
||||
graph->setCheckpointing(options_->get<bool>("gradient-checkpointing"));
|
||||
graph->getBackend()->setClip(options_->get<float>("clip-gemm"));
|
||||
graph->reserveWorkspaceMB(options_->get<size_t>("workspace"));
|
||||
graphs_.push_back(graph);
|
||||
shardOpt_.push_back(Optimizer(options_));
|
||||
|
@ -35,7 +35,6 @@ public:
|
||||
graph_ = New<ExpressionGraph>();
|
||||
graph_->setDevice(deviceId);
|
||||
graph_->setCheckpointing(options_->get<bool>("gradient-checkpointing"));
|
||||
graph_->getBackend()->setClip(options_->get<float>("clip-gemm"));
|
||||
graph_->reserveWorkspaceMB(options_->get<size_t>("workspace"));
|
||||
opt_ = Optimizer(options_);
|
||||
builder_ = models::createCriterionFunctionFromOptions(options_, models::usage::training);
|
||||
|
@ -12,7 +12,6 @@ SyncGraphGroup::SyncGraphGroup(Ptr<Options> config, Ptr<IMPIWrapper> mpi)
|
||||
graph->setDevice(device);
|
||||
graph->setCheckpointing(options_->get<bool>("gradient-checkpointing"));
|
||||
graph->reserveWorkspaceMB(options_->get<size_t>("workspace"));
|
||||
graph->getBackend()->setClip(options_->get<float>("clip-gemm"));
|
||||
|
||||
graphs_.push_back(graph);
|
||||
shardOpt_.push_back(Optimizer(options_));
|
||||
|
@ -87,10 +87,6 @@ public:
|
||||
auto prec = options_->get<std::vector<std::string>>("precision", {"float32"});
|
||||
graph->setDefaultElementType(typeFromString(prec[0]));
|
||||
graph->setDevice(device);
|
||||
graph->getBackend()->setClip(options_->get<float>("clip-gemm"));
|
||||
if (device.type == DeviceType::cpu) {
|
||||
graph->getBackend()->setOptimized(options_->get<bool>("optimize"));
|
||||
}
|
||||
graph->reserveWorkspaceMB(options_->get<size_t>("workspace"));
|
||||
graphs_[id] = graph;
|
||||
|
||||
@ -229,10 +225,6 @@ public:
|
||||
auto precison = options_->get<std::vector<std::string>>("precision", {"float32"});
|
||||
graph->setDefaultElementType(typeFromString(precison[0])); // only use first type, used for parameter type in graph
|
||||
graph->setDevice(device);
|
||||
graph->getBackend()->setClip(options_->get<float>("clip-gemm"));
|
||||
if (device.type == DeviceType::cpu) {
|
||||
graph->getBackend()->setOptimized(options_->get<bool>("optimize"));
|
||||
}
|
||||
graph->reserveWorkspaceMB(options_->get<size_t>("workspace"));
|
||||
graphs_.push_back(graph);
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user