Merged PR 9284: Packed model support in production

This PR enables loading and serving of offline-packed models.
* Enables support for FBGEMM offline-packed models (a conversion sketch follows this list).
* Adds separate parameter sets for different parameter element types (currently inference only).
* Adds new types (packed*) for the packed memory format.
* Adds a CPU instruction set query to the quicksand API.
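For context, a condensed sketch of the offline conversion flow this PR enables (it mirrors the updated marian-conv main() in the diff below; the helper name `convertToPacked` and the abbreviated includes are ours):

```cpp
// Sketch only: pack an *.npz model offline and write a *.bin model whose
// affine weights are stored in FBGEMM's fp16 packed layout.
#include <sstream>
#include "graph/expression_graph_packable.h"  // plus the usual marian/YAML includes

using namespace marian;

void convertToPacked(const std::string& modelFrom, const std::string& modelTo) {
  YAML::Node config;
  std::stringstream configStr;
  io::getYamlFromModel(config, "special:model.yml", modelFrom);
  configStr << config;

  auto graph = New<ExpressionGraphPackable>();  // inference-only graph that can pack weights
  graph->setDevice(CPU0);
  graph->getBackend()->setOptimized(false);
  graph->load(modelFrom);
  graph->forward();

  std::string saveGemmType = "fp16packed";      // or "int8packed" / "fp32default"
  graph->packAndSave(modelTo, configStr.str(), saveGemmType, Type::float32);
}
```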

Related work items: #102572
Young Jin Kim 2019-11-01 17:24:49 +00:00 committed by Martin Junczys-Dowmunt
parent 8c3cb06944
commit 5ed441f843
41 changed files with 615 additions and 411 deletions


@ -22,6 +22,7 @@ add_library(marian STATIC
common/io.cpp
common/filesystem.cpp
common/file_stream.cpp
common/types.cpp
data/alignment.cpp
data/vocab.cpp


@ -4,6 +4,8 @@
#include <sstream>
#include "graph/expression_graph_packable.h"
int main(int argc, char** argv) {
using namespace marian;
@ -13,16 +15,18 @@ int main(int argc, char** argv) {
{
auto cli = New<cli::CLIWrapper>(
options,
"Convert a model in the .npz format to a mmap-able binary model",
"Convert a model in the .npz format and normal memory layout to a mmap-able binary model which could be in normal memory layout or packed memory layout",
"Allowed options",
"Examples:\n"
" ./marian-conv -f model.npz -t model.bin");
" ./marian-conv -f model.npz -t model.bin --gemm-type fp16packed");
cli->add<std::string>("--from,-f", "Input model", "model.npz");
cli->add<std::string>("--to,-t", "Output model", "model.bin");
cli->add<std::string>("--gemm-type,-g", "GEMM Type to be used with this weights", "mklfp32");
cli->parse(argc, argv);
}
auto modelFrom = options->get<std::string>("from");
auto modelTo = options->get<std::string>("to");
auto saveGemmType = options->get<std::string>("gemm-type");
LOG(info, "Outputting {}", modelTo);
@ -31,13 +35,14 @@ int main(int argc, char** argv) {
marian::io::getYamlFromModel(config, "special:model.yml", modelFrom);
configStr << config;
auto graph = New<ExpressionGraph>(true);
auto graph = New<ExpressionGraphPackable>();
graph->setDevice(CPU0);
graph->getBackend()->setOptimized(false);
graph->load(modelFrom);
graph->forward();
graph->save(modelTo, configStr.str());
// added a flag indicating whether the weights need to be packed or not
graph->packAndSave(modelTo, configStr.str(), /* --gemm-type */ saveGemmType, Type::float32);
// graph->saveBinary(vm["bin"].as<std::string>());


@ -18,6 +18,7 @@ struct Header {
size_t dataLength;
};
// cast current void pointer to T pointer and move forward by num elements
template <typename T>
const T* get(const void*& current, size_t num = 1) {
const T* ptr = (const T*)current;
@ -32,9 +33,10 @@ void loadItems(const void* current, std::vector<io::Item>& items, bool mapped) {
binaryFileVersion,
BINARY_FILE_VERSION);
size_t numHeaders = *get<size_t>(current);
const Header* headers = get<Header>(current, numHeaders);
size_t numHeaders = *get<size_t>(current); // number of item headers that follow
const Header* headers = get<Header>(current, numHeaders); // read that many headers
// prepopulate items with meta data from headers
items.resize(numHeaders);
for(int i = 0; i < numHeaders; ++i) {
items[i].type = (Type)headers[i].type;
@ -42,21 +44,22 @@ void loadItems(const void* current, std::vector<io::Item>& items, bool mapped) {
items[i].mapped = mapped;
}
// read in actual shape and data
for(int i = 0; i < numHeaders; ++i) {
size_t len = headers[i].shapeLength;
items[i].shape.resize(len);
const int* arr = get<int>(current, len);
std::copy(arr, arr + len, items[i].shape.begin());
items[i].shape.resize(len);
const int* arr = get<int>(current, len); // read shape
std::copy(arr, arr + len, items[i].shape.begin()); // copy to Item::shape
}
// move by offset bytes
// move by offset bytes, aligned to a 256-byte boundary
size_t offset = *get<size_t>(current);
get<char>(current, offset);
for(int i = 0; i < numHeaders; ++i) {
if(items[i].mapped) {
if(items[i].mapped) { // memory-mapped, hence only set pointer
items[i].ptr = get<char>(current, headers[i].dataLength);
} else {
} else { // reading into item data
size_t len = headers[i].dataLength;
items[i].bytes.resize(len);
const char* ptr = get<char>(current, len);
@ -69,6 +72,7 @@ void loadItems(const std::string& fileName, std::vector<io::Item>& items) {
// Read file into buffer
size_t fileSize = filesystem::fileSize(fileName);
std::vector<char> buf(fileSize);
// @TODO: check this again:
#if 1 // for some reason, the #else branch fails with "file not found" in the *read* operation (open succeeds)
FILE *f = fopen(fileName.c_str(), "rb");
ABORT_IF(f == nullptr, "Error {} ('{}') opening file '{}'", errno, strerror(errno), fileName);
@ -119,7 +123,8 @@ void saveItems(const std::string& fileName,
headers.push_back(Header{item.name.size() + 1,
(size_t)item.type,
item.shape.size(),
item.size()});
item.size()}); // item size without padding
// @TODO: should this be done with padding as asked below?
}
size_t headerSize = headers.size();
@ -146,9 +151,11 @@ void saveItems(const std::string& fileName,
}
// Write out all values
for(const auto& item : items) {
pos += out.write(item.data(), item.size());
}
for(const auto& item : items)
pos += out.write(item.data(), item.size()); // writes out data without padding, not aligned, @BUGBUG?
// @TODO: find out if padding should be enforced for memory-mapped storage like this:
// pos += out.write(item.data(), item.bytes.size()); // writes out data with padding
}
} // namespace binary
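To make the on-disk layout described above concrete, a hedged usage sketch of the reader (the header paths are our assumption; the printed fields come from io::Item as shown in this diff):

```cpp
// Sketch only: load a *.bin model and list its items. Offline-packed items
// appear with type packed8/packed16 but keep their original tensor shape.
#include <iostream>
#include <vector>
#include "common/binary.h"   // assumed header for io::binary::loadItems
#include "common/io_item.h"  // assumed header for io::Item

int main() {
  std::vector<marian::io::Item> items;
  marian::io::binary::loadItems("model.bin", items);
  for(const auto& item : items)
    std::cout << item.name << " : " << item.type << " : "
              << item.shape.elements() << " elements" << std::endl;
  return 0;
}
```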


@ -5,10 +5,10 @@
#include <string>
#include <vector>
// Increase this if binary format changes
#define BINARY_FILE_VERSION 1
namespace marian {
const static int BINARY_FILE_VERSION = 1;
namespace io {
namespace binary {


@ -607,9 +607,6 @@ void ConfigParser::addOptionsTranslation(cli::CLIWrapper& cli) {
"Optimize speed aggressively sacrificing memory or precision");
cli.add<bool>("--skip-cost",
"Ignore model cost during translation, not recommended for beam-size > 1");
cli.add<std::string>("--gemm-type",
"Select GEMM options: auto, mklfp32, intrinint16, fp16packed, int8packed",
"auto");
cli.add<bool>("--fp16",
"Shortcut for mixed precision inference with float16, corresponds to: --precision float16");

2
src/common/file_stream.h Executable file → Normal file

@ -12,7 +12,7 @@
#pragma GCC diagnostic ignored "-Wsuggest-override"
#endif
#ifdef _MSC_VER
#pragma warning(push)
#pragma warning(push) // 4101: 'identifier' : unreferenced local variable. One parameter variable in zstr.hpp is not used.
#pragma warning(disable : 4101)
#endif
#include "3rd_party/zstr/zstr.hpp"

0
src/common/filesystem.cpp Executable file → Normal file


@ -139,8 +139,7 @@ void saveItemsNpz(const std::string& fileName, const std::vector<Item>& items) {
else
ABORT("Other types not supported yet");
npzItems.emplace_back(
item.name, item.bytes, shape, type, sizeOf(item.type));
npzItems.emplace_back(item.name, item.bytes, shape, type, sizeOf(item.type));
}
cnpy::npz_save(fileName, npzItems);
}


@ -24,11 +24,8 @@ struct Item {
return bytes.data();
}
size_t size() const {
if(mapped)
return shape.elements() * sizeOf(type);
else
return bytes.size();
size_t size() const { // @TODO: review this again for 256-byte boundary alignment
return requiredBytes(shape, type);
}
// Extend this item with data and shape from the input item, creating a flattened concatenation.


@ -49,7 +49,7 @@ namespace marian {
}
try {
return !options_[key].as<std::string>().empty();
} catch(const YAML::BadConversion& e) {
} catch(const YAML::BadConversion& /* e */) {
ABORT("Option '{}' is neither a sequence nor a text");
}
return false;

20
src/common/types.cpp Normal file

@ -0,0 +1,20 @@
#include "common/types.h"
#include "tensors/cpu/sharp/packed_gemm.h"
namespace marian {
// This function calculates the number of bytes needed to contain a tensor of the given shape and type.
// For most types that is trivial (the number of elements times the size of a single element).
// But for opaque types such as packed tensors it cannot be inferred by simple multiplication.
// All cases are handled here and the result can later be passed to allocators etc.
size_t requiredBytes(const Shape& shape, Type type) {
if(isPacked(type)) {
uint64_t packsize;
cpu::variant::PackInfoFp32(shape, false, packsize);
return (size_t)packsize;
} else {
return shape.elements() * sizeOf(type);
}
}
}
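A small, hedged illustration of why this helper exists (the packed branch needs an FBGEMM build, since the two-argument PackInfoFp32 aborts otherwise):

```cpp
#include "common/types.h"

using namespace marian;

void requiredBytesExample() {
  Shape shape({512, 1024});
  // trivial case: number of elements times the size of a single element, 512 * 1024 * 4 bytes
  size_t plain = requiredBytes(shape, Type::float32);
  // packed case: the size comes from FBGEMM's blocked layout plus padding and scratch
  // space, so it cannot be derived from the shape alone
  size_t packed = requiredBytes(shape, Type::packed16);
  (void)plain; (void)packed;
}
```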

84
src/common/types.h Executable file → Normal file

@ -1,5 +1,6 @@
#pragma once
#include "common/logging.h" // for ABORT and ABORT_IF
#include "common/shape.h"
#if __GNUC__ >= 7
#pragma GCC diagnostic push
@ -133,6 +134,16 @@ do { \
namespace marian {
// small struct to enable templating based on types used for packing
struct packed8 {
uint8_t x;
};
// small struct to enable templating based on types used for packing
struct packed16 {
uint16_t x;
};
#ifndef __CUDACC__ // vectorized types not available from .cu files
// @TODO: check what intrinsics are actually available.
@ -190,31 +201,38 @@ public:
};
#endif
// Internal to types.h, don't use. Use test functions below.
enum class TypeClass : size_t {
signed_type = 0x100,
signed_type = 0x100,
unsigned_type = 0x200,
float_type = 0x400,
size_mask = 0x0FF
float_type = 0x400,
packed_type = 0x800, // special packed (CPU cache friendly) type class, used in FBGEMM, not meant to be used anywhere else
size_mask = 0x0FF
};
constexpr inline size_t operator+(TypeClass typeClass, size_t val) {
return (size_t)typeClass + val;
}
// @TODO: rename to ElementType when things become stable, so it's easier to review
enum class Type : size_t {
int8 = TypeClass::signed_type + 1u,
int16 = TypeClass::signed_type + 2u,
int32 = TypeClass::signed_type + 4u,
int64 = TypeClass::signed_type + 8u,
int8 = TypeClass::signed_type + 1u,
int16 = TypeClass::signed_type + 2u,
int32 = TypeClass::signed_type + 4u,
int64 = TypeClass::signed_type + 8u,
uint8 = TypeClass::unsigned_type + 1u,
uint16 = TypeClass::unsigned_type + 2u,
uint32 = TypeClass::unsigned_type + 4u,
uint64 = TypeClass::unsigned_type + 8u,
uint8 = TypeClass::unsigned_type + 1u,
uint16 = TypeClass::unsigned_type + 2u,
uint32 = TypeClass::unsigned_type + 4u,
uint64 = TypeClass::unsigned_type + 8u,
float16 = TypeClass::float_type + 2u,
float32 = TypeClass::float_type + 4u,
float64 = TypeClass::float_type + 8u
float16 = TypeClass::float_type + 2u,
float32 = TypeClass::float_type + 4u,
float64 = TypeClass::float_type + 8u,
packed8  = TypeClass::packed_type + 1u, // special type for FBGEMM, not meant to be used anywhere else, not meant to be accessed individually. The internal actual type (uint8) is meaningless.
packed16 = TypeClass::packed_type + 2u  // special type for FBGEMM, not meant to be used anywhere else, not meant to be accessed individually. The internal actual type (uint16) is meaningless.
};
static inline size_t operator&(TypeClass typeClass, Type type) {
@ -241,23 +259,33 @@ static inline bool isFloat(Type type) {
return (TypeClass::float_type & type) != 0;
}
static inline bool isPacked(Type type) {
return (TypeClass::packed_type & type) != 0;
}
size_t requiredBytes(const Shape& shape, Type type); // towards Frank's vision of joint Shape/Type
template <typename T>
inline bool matchType(Type type);
// clang-format off
template <> inline bool matchType<int8_t>(Type type) { return type == Type::int8; }
template <> inline bool matchType<int16_t>(Type type) { return type == Type::int16; }
template <> inline bool matchType<int32_t>(Type type) { return type == Type::int32; }
template <> inline bool matchType<int64_t>(Type type) { return type == Type::int64; }
template <> inline bool matchType<int8_t>(Type type) { return type == Type::int8; }
template <> inline bool matchType<int16_t>(Type type) { return type == Type::int16; }
template <> inline bool matchType<int32_t>(Type type) { return type == Type::int32; }
template <> inline bool matchType<int64_t>(Type type) { return type == Type::int64; }
template <> inline bool matchType<uint8_t>(Type type) { return type == Type::uint8; }
template <> inline bool matchType<uint16_t>(Type type) { return type == Type::uint16; }
template <> inline bool matchType<uint32_t>(Type type) { return type == Type::uint32; }
template <> inline bool matchType<uint64_t>(Type type) { return type == Type::uint64; }
// In case of packed type, it uses uint8 as underlying memory type
template <> inline bool matchType<uint8_t>(Type type) { return type == Type::uint8; }
template <> inline bool matchType<uint16_t>(Type type) { return type == Type::uint16; }
template <> inline bool matchType<uint32_t>(Type type) { return type == Type::uint32; }
template <> inline bool matchType<uint64_t>(Type type) { return type == Type::uint64; }
template <> inline bool matchType<float16>(Type type) { return type == Type::float16; }
template <> inline bool matchType<float>(Type type) { return type == Type::float32; }
template <> inline bool matchType<double>(Type type) { return type == Type::float64; }
template <> inline bool matchType<float16>(Type type) { return type == Type::float16; }
template <> inline bool matchType<float>(Type type) { return type == Type::float32; }
template <> inline bool matchType<double>(Type type) { return type == Type::float64; }
template <> inline bool matchType<packed8>(Type type) { return type == Type::packed8; }
template <> inline bool matchType<packed16>(Type type) { return type == Type::packed16; }
// clang-format on
static inline std::ostream& operator<<(std::ostream& out, Type type) {
@ -275,6 +303,9 @@ static inline std::ostream& operator<<(std::ostream& out, Type type) {
case Type::float16 : out << "float16"; break;
case Type::float32 : out << "float32"; break;
case Type::float64 : out << "float64"; break;
case Type::packed8 : out << "packed8"; break;
case Type::packed16: out << "packed16"; break;
}
return out;
}
@ -296,6 +327,9 @@ template <> inline std::string request<uint64_t>() { return "uint64"; }
template <> inline std::string request<float16>() { return "float16"; }
template <> inline std::string request<float>() { return "float32"; }
template <> inline std::string request<double>() { return "float64"; }
template <> inline std::string request<packed8>() { return "packed8"; }
template <> inline std::string request<packed16>() { return "packed16"; }
// clang-format on
static Type inline typeFromString(const std::string& str) {
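Separate from the diff, an illustrative sketch of the type encoding above (the sizeOf() behaviour for packed types is our assumption based on size_mask):

```cpp
#include "common/types.h"

using namespace marian;

void typeClassExample() {
  // a Type value is a TypeClass bit plus the element size in bytes
  static_assert((size_t)Type::packed16 == 0x800 + 2u, "packed16 = packed_type + 2");

  bool p = isPacked(Type::packed16);  // true  - the packed_type bit is set
  bool f = isFloat(Type::packed16);   // false - the float_type bit is not set
  // sizeOf() presumably masks out the low size bits; for packed types that is only the
  // size of the placeholder element (uint16), never the size of the packed buffer -
  // use requiredBytes(shape, type) for that.
  size_t s = sizeOf(Type::packed16);  // 2
  (void)p; (void)f; (void)s;
}
```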


@ -24,7 +24,6 @@ private:
// the autotuner runs each algorithm at least this 'collectStatMax' number of times and
// collects the statistics.
const size_t collectStatMax = 50;
UPtr<timer::CPUTimer> timer_;
// This structure holds a hash key an algorithm function (e.g. int16, packed gemm, mkl gemm)


@ -4,6 +4,7 @@
#include <memory>
#include <vector>
#include <list>
namespace marian {


@ -1,8 +1,8 @@
#include "graph/expression_graph.h"
#include <sstream>
#include "tensors/tensor_operators.h"
#include <sstream>
namespace marian {
ExpressionGraph::ExpressionGraph(bool inference)
@ -70,7 +70,6 @@ void createSubtape(Expr node) {
node->setSubtape(subtape);
}
void ExpressionGraph::forwardNext() {
// @TODO: check if allocation works properly
tensors_->clearShorttermMemory();


@ -130,11 +130,12 @@ class ExpressionGraph : public std::enable_shared_from_this<ExpressionGraph> {
bool inferenceOnly_{false};
bool optimized_{false}; // during inference, use optimizations that might lead to precision loss, e.g. 8-bit MatMul.
// during inference, use optimizations that might lead to precision loss, e.g. 8-bit MatMul.
// At this moment, this is used for the int16 quantized MatMul - 11/1/2019
bool optimized_{false};
bool checkpointing_{false}; // use gradient checkpointing if true
bool reloaded_{false};
std::string namespace_;
bool throwNaN_{false};
@ -152,6 +153,8 @@ protected:
ElementTypeParamsMap paramsByElementType_;
Ptr<Backend> backend_;
std::string namespace_;
public:
/** @brief Constructs a new expression graph
*
@ -260,32 +263,64 @@ public:
dot.close();
}
private:
// Find the named parameter and its typed parent parameter object (params) and return both.
// If the parameter is not found, return the parent parameter object that the parameter should be added to.
// Return [nullptr, nullptr] if no matching parent parameter object exists.
std::tuple<Expr, Ptr<Parameters>> findParams(const std::string& name,
Type elementType,
bool typeSpecified) const {
Expr p; Ptr<Parameters> params;
if(typeSpecified) { // type has been specified, so we are only allowed to look for a parameter with that type
auto it = paramsByElementType_.find(elementType);
if(it != paramsByElementType_.end()) {
params = it->second;
p = params->get(name);
}
} else { // type has not been specified, so we take any type as long as the name matches
for(auto kvParams : paramsByElementType_) {
p = kvParams.second->get(name);
if(p) { // p has been found, return with matching params object
params = kvParams.second;
break;
}
if(kvParams.first == elementType) // even if p has not been found, set the params object to be returned
params = kvParams.second;
}
}
return std::make_tuple(p, params);
}
Expr param(const std::string& pname,
const Shape& shape,
const Ptr<inits::NodeInitializer>& init,
const Type valueType,
bool fixed = false) {
const Type elementType,
bool fixed,
bool typeSpecified) {
std::string name = pname;
if(!namespace_.empty())
name = namespace_ + "::" + name;
// check first if parameter already exists
auto it = paramsByElementType_.find(valueType);
Ptr<Parameters> params = it != paramsByElementType_.end() ? it->second : nullptr;
Expr p; Ptr<Parameters> params; std::tie
(p, params) = findParams(name, elementType, typeSpecified);
if(!params) {
params = New<Parameters>(valueType);
params = New<Parameters>(elementType);
params->init(backend_);
paramsByElementType_.insert({valueType, params});
paramsByElementType_.insert({elementType, params});
} else {
Expr p = params->get(name);
if(p) {
// if yes add to tape and return
ABORT_IF(shape != p->shape(),
"Requested shape {} for existing parameter '{}' does not match "
"original shape {}",
shape,
name,
p->shape());
"Requested shape {} for existing parameter '{}' does not match "
"original shape {}",
shape,
name,
p->shape());
p->setTrainable(!fixed);
add(p);
@ -295,16 +330,16 @@ public:
// if graph was reloaded do not allow creation of new parameters
ABORT_IF(reloaded_,
"Graph was reloaded and parameter '{}' is newly created",
name);
"Graph was reloaded and parameter '{}' with type {} (specified: {}) is newly created",
name, elementType, typeSpecified);
// if not check if name is not taken by other node
auto other = get(name);
ABORT_IF(other, "Parameter with name '{}' already exists and has type {}", name, other->value_type());
// create parameter node (adds to tape)
Expr p = Expression<ParamNode>(shared_from_this(), shape, init, valueType, fixed);
LOG(debug, "Created parameter {} with shape {} and type {}", pname, shape, valueType);
p = Expression<ParamNode>(shared_from_this(), shape, init, elementType, fixed);
LOG(debug, "Created parameter {} with shape {} and type {}", name, shape, elementType);
// set name and id and add to list of parameters
p->set_name(name);
@ -313,17 +348,28 @@ public:
return p;
}
public:
Expr param(const std::string& pname,
const Shape& shape,
const Ptr<inits::NodeInitializer>& init,
const Type elementType,
bool fixed = false) {
// this overload is called with an explicitly specified element type, so the parameter lookup is restricted to that type
return param(pname, shape, init, elementType, fixed, /*typeSpecified=*/true);
}
Expr param(const std::string& pname,
const Shape& shape,
const Ptr<inits::NodeInitializer>& init,
bool fixed = false) {
return param(pname, shape, init, defaultElementType_, fixed);
// since this param overload is called without a specified type, we assume defaultElementType but allow the lookup to match a different type
return param(pname, shape, init, defaultElementType_, fixed, /*typeSpecified=*/false);
}
Expr constant(const Shape& shape,
const Ptr<inits::NodeInitializer>& init,
Type valueType) {
return Expression<ConstantNode>(shared_from_this(), shape, init, valueType);
Type elementType) {
return Expression<ConstantNode>(shared_from_this(), shape, init, elementType);
}
Expr constant(const Shape& shape,
@ -351,32 +397,38 @@ public:
Type::uint32);
}
Expr ones(const Shape& shape, Type valueType) {
return constant(shape, inits::ones(), valueType);
Expr ones(const Shape& shape, Type elementType) {
return constant(shape, inits::ones(), elementType);
}
Expr ones(const Shape& shape) {
return constant(shape, inits::ones(), defaultElementType_);
}
Expr zeros(const Shape& shape, Type valueType) {
return constant(shape, inits::zeros(), valueType);
Expr zeros(const Shape& shape, Type elementType) {
return constant(shape, inits::zeros(), elementType);
}
Expr zeros(const Shape& shape) {
return constant(shape, inits::zeros(), defaultElementType_);
}
// prob = dropProb, e.g. 0.1 means 90% of values are kept
Expr dropoutMask(float dropProb, const Shape& shape, Type valueType);
Expr dropoutMask(float dropProb, const Shape& shape, Type elementType);
Expr dropoutMask(float dropProb, const Shape& shape);
Expr get(std::string name) {
if(!namespace_.empty())
name = namespace_ + "::" + name;
Expr p; Ptr<Parameters> params; std::tie
(p, params) = findParams(name, defaultElementType_, /*typeSpecified=*/false);
return p;
}
for(auto kvParams : paramsByElementType_)
return kvParams.second->get(name);
return Expr();
Expr get(std::string name, Type specifiedElementType) {
if(!namespace_.empty())
name = namespace_ + "::" + name;
Expr p; Ptr<Parameters> params; std::tie
(p, params) = findParams(name, specifiedElementType, /*typeSpecified=*/true);
return p;
}
Ptr<Parameters>& params() {
@ -445,7 +497,7 @@ public:
// skip over special parameters starting with "special:"
if(pName.substr(0, 8) == "special:")
continue;
param(pName, item.shape, inits::fromItem(item));
param(pName, item.shape, inits::fromItem(item), item.type, /*fixed=*/false);
}
if(markReloaded)
setReloaded(true);
@ -488,15 +540,11 @@ public:
void save(std::vector<io::Item>& ioItems, Type saveElementType = Type::float32);
void save(const std::string& name, const std::string& meta = "", Type saveElementType = Type::float32) {
// LOG(info, "Saving model to {}", name);
std::vector<io::Item> ioItems;
save(ioItems, saveElementType);
if(!meta.empty())
io::addMetaToItems(meta, "special:model.yml", ioItems);
io::saveItems(name, ioItems);
// LOG(info, "Saved {} items.", ioItems.size());
}
};
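To illustrate the per-element-type parameter sets and the findParams() lookup above, a minimal hedged sketch (names, shapes, initializers and the node_initializers header path are made up for the example):

```cpp
#include "graph/expression_graph.h"
#include "graph/node_initializers.h"  // assumed location of the inits:: helpers

using namespace marian;

void parameterSetExample() {
  auto graph = New<ExpressionGraph>(/*inference=*/true);
  graph->setDevice(CPU0);

  // each element type gets its own Parameters object under the hood
  auto w32 = graph->param("enc_W32", {512, 512}, inits::zeros(), Type::float32);
  auto w16 = graph->param("enc_W16", {512, 512}, inits::zeros(), Type::float16);

  // an untyped lookup scans all parameter sets; a typed lookup only the matching one
  auto any   = graph->get("enc_W32");                 // found via the float32 set
  auto typed = graph->get("enc_W16", Type::float16);  // restricted to the float16 set
  auto miss  = graph->get("enc_W32", Type::float16);  // nullptr: wrong type specified
  (void)w32; (void)w16; (void)any; (void)typed; (void)miss;
}
```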


@ -0,0 +1,103 @@
#pragma once
#include "graph/expression_graph.h"
#include "tensors/cpu/sharp/packed_gemm.h"
namespace marian {
// When FBGEMM-based packed GEMM is used, some weight matrices need to be packed offline.
// The decision which weights can be packed should be made by walking the graph.
// That requires more changes, so for now we decide simply by the name ("_W") of the weights.
// This also makes the low-level packed_gemm.h API interact with the high-level graph class,
// so we put this immature code into a subclass of ExpressionGraph and will improve it in the near future.
class ExpressionGraphPackable : public ExpressionGraph {
public:
ExpressionGraphPackable()
: ExpressionGraph( /* inference = */ true) {} // Packable expression graph only supports inference
virtual ~ExpressionGraphPackable() {}
// Convert model weights into packed format and save to IO items.
// @TODO: review this
void packAndSave(const std::string& name, const std::string& meta, std::string& saveGemmType, Type saveElementType = Type::float32) {
std::vector<io::Item> ioItems;
// sorted by name in std::map
for (auto p : params()->getMap()) {
std::string pName = p.first;
if (!namespace_.empty()) {
if (pName.substr(0, namespace_.size() + 2) == namespace_ + "::")
pName = pName.substr(namespace_.size() + 2);
}
Tensor val = p.second->val();
// save in packed format
// @TODO: hardcoded way to find packable weights - all the weights used for the affine op
if (saveGemmType == "fp16packed" && pName.find("_W") == pName.length() - 3) {
using namespace marian::cpu::variant;
// packing information
int nrow, ncol, kernel_ncol_blocks, brow, bcol, last_brow, nbrow, nbcol;
uint64_t packsize;
PackInfoFp32(val->shape(),
false,
nrow,
ncol,
kernel_ncol_blocks,
brow,
bcol,
last_brow,
nbrow,
nbcol,
packsize);
auto allocator = New<TensorAllocator>(getBackend());
Tensor packedTensor;
allocator->allocate(packedTensor, { 1, (int32_t)packsize }, Type::uint8);
// PackFp32
PackFp32(packedTensor,
val->data(),
false,
nrow,
ncol,
kernel_ncol_blocks,
brow,
bcol,
last_brow,
nbrow,
nbcol,
packsize);
io::Item item;
item.name = pName;
item.shape = val->shape();
item.type = Type::packed16;
// Use the actual memory as this will be aligned and padded.
// When memory mapping this is required. Shape keeps track of
// tensor size. Saving to *.npz will cut to size.
auto mem = packedTensor->memory();
item.bytes.resize(mem->size());
copy(backend_, mem->data<char>(), mem->data<char>() + mem->size(), item.bytes.data());
ioItems.emplace_back(std::move(item));
} else {
io::Item item;
val->get(item, pName);
item.convert(saveElementType);
ioItems.emplace_back(std::move(item));
}
}
if (!meta.empty())
io::addMetaToItems(meta, "special:model.yml", ioItems);
io::saveItems(name, ioItems);
}
};
} // namespace marian
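For a concrete sense of what packAndSave() allocates, a hedged worked example of the packing geometry reported by PackInfoFp32 (FBGEMM builds only; the numbers follow the formula in packed_gemm.cpp further down):

```cpp
#include <cstdint>
#include "common/shape.h"
#include "tensors/cpu/sharp/packed_gemm.h"

void packSizeExample() {
  using namespace marian;
  int nrow, ncol, kernel_ncol_blocks, brow, bcol, last_brow, nbrow, nbcol;
  uint64_t packsize;
  cpu::variant::PackInfoFp32(Shape({256, 1024}), /*transpose=*/false,
                             nrow, ncol, kernel_ncol_blocks, brow, bcol,
                             last_brow, nbrow, nbcol, packsize);
  // nrow=256, ncol=1024, brow=512, bcol=16, last_brow=256, nbrow=1, nbcol=64
  // packsize = (1 * 512) * (64 * 16) * sizeof(float16) + 1024 + 256 = 1,049,856 bytes:
  // the fp16 buffer rounded up to whole blocks, plus FBGEMM's padding and scratch space.
}
```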


@ -403,8 +403,7 @@ Expr dot(Expr a, Expr b, bool transA, bool transB, float scale) {
// Currently only true when command line options
// --optimize --cpu-thread=N with N > 0 are set.
if(device == DeviceType::cpu && a->graph()->getBackend()->isOptimized()
&& a->graph()->getBackend()->getGemmType() == GemmType::IntrinInt16) {
if(device == DeviceType::cpu && a->graph()->getBackend()->isOptimized()) {
// dotInt16 computes A * B.T, hence the transpose for B to get A * B
// if transA = false and transB = false.
@ -422,198 +421,73 @@ Expr bdot(Expr a, Expr b, bool transA, bool transB, float scale) {
return Expression<DotBatchedNodeOp>(a, b, transA, transB, scale);
}
static Expr affineDefault(Expr a, Expr b, Expr bias, bool transA, bool transB, float scale) {
// general version, MKL, CBlas or CUDA
// if clipValue > 0, the inputs will be clipped to range [-clipValue,
// clipValue] This is meant to keep values at the same range as used during
// training when optimizing for 8-bit integer products. Likely to be removed
// in the future when we explore better ways to handle this.
float clipValue = a->graph()->getBackend()->getClip();
int rows = a->shape().elements() / a->shape()[-1];
Expr ones = a->graph()->ones({ rows, 1 });
std::vector<Expr> nodes
= { clip(a, clipValue), clip(b, clipValue), bias, ones };
return Expression<AffineNodeOp>(nodes, transA, transB, scale);
}
// This operation used to implement auto-tuning. We have removed it for now due to complexity, but plan to revisit it in the future.
// The last branch with auto-tuner is:
// youki/packed-model-pr-backup1031
// https://machinetranslation.visualstudio.com/Marian/_git/marian-dev?version=GByouki%2Fpacked-model-pr-backup1031
// SHA: 3456a7ed1d1608cfad74cd2c414e7e8fe141aa52
Expr affine(Expr a, Expr b, Expr bias, bool transA, bool transB, float scale) {
auto device = a->graph()->getDeviceId().type;
float clipValue = a->graph()->getBackend()->getClip();
Type aElementType = a->value_type();
Type bElementType = b->value_type();
if(device == DeviceType::cpu && a->graph()->getBackend()->isOptimized()) {
GemmType gemmType = a->graph()->getBackend()->getGemmType();
// When gemmType is set to 'auto', an autotuner decides the best algorithm available.
// A new autotuner is created, then different kinds of algorithms are added to the autotuner.
// For each GEMM size, there is a unique hash key.
// (e.g. m, n, k, transpose A, transpose B, bias size for GEMM)
if(gemmType == GemmType::Auto) {
thread_local Ptr<AutoTuner<Expr>> tuner = New<AutoTuner<Expr>>();
// start with new set of algorithms
tuner->clear();
// lower precicion for shapes, reduces data sparsity
auto sh = [](Shape sh) {
for(size_t i = 0; i < sh.size(); ++i)
sh.set(i, sh[i] / 4);
return sh;
};
// create context for current call as hash
std::size_t hash = sh(a->shape()).hash();
util::hash_combine(hash, sh(b->shape()).hash());
util::hash_combine(hash, sh(bias->shape()).hash());
util::hash_combine(hash, transA);
util::hash_combine(hash, transB);
#if USE_FBGEMM
// Use Packed GEMM only if the node b in the graph is memoized.
// More specifically, packed GEMM is used only if the B matrix (weight) is constant.
// In general, 'memoized' means that the node is a constant variable or
// a combination of contant nodes which is also a constant variable
// when it's computed once.
// Those memoized nodes are cached to avoid duplicated computations.
// 07/10/2019 - Use packed GEMM only if the cpu architecture supports AVX2
// one of the fbgemm's sub modules, cpuinfo (https://github.com/pytorch/cpuinfo).
// It looks at the cpu register
// (https://github.com/pytorch/cpuinfo/blob/master/src/x86/isa.c#L391),
// and this cpu lookup is executed only once and the state is kept in FBGEMM.
if(fbgemm::fbgemmHasAvx2Support() && b->memoize()) {
// add packed GEMM algorithm variant (Packed GEMM) to the autotuner
// Once an algorithm is added to the autotuner,
// autotuner runs all the added algorithms for a designated times.
// One algorithm is run per one this operation call
// and the stat for that algorithm is collected.
// When all the algorithms reach the maximum stat collection count,
// the autotuner decide the best algorithm, and keep using it afterward.
size_t hashPack = hash;
util::hash_combine(hashPack, 1);
auto recPack = [=](Expr e, bool stop = false) {
e->record(tuner, hashPack, stop);
return e;
};
auto algPack = [=]() {
auto packed = cpu::variant::pack(b, cpu::variant::PackMatrix::B, transB, clipValue);
return recPack(
cpu::variant::affine(
clip(a, clipValue),
packed,
b->shape(),
bias,
transA,
transB,
scale),
true);
};
tuner->insert({hashPack, algPack});
}
#endif // USE_FBGEMM
// add second algorithm variant (Int16) to the autotuner
size_t hashInt16 = hash;
util::hash_combine(hashInt16, 2);
auto recInt16 = [=](Expr e, bool stop = false) {
e->record(tuner, hashInt16, stop);
return e;
};
auto algInt16 = [=]() {
return recInt16(
cpu::int16::affine(
recInt16(
cpu::int16::quantize(
transA ? recInt16(transpose(a)) : a,
clipValue)),
cpu::int16::quantize(
transB ? b : transpose(b),
clipValue),
bias,
scale),
true);
};
tuner->insert({hashInt16, algInt16});
// add third algorithm variant (CBlas) to the autotuner
size_t hashCblas = hash;
util::hash_combine(hashCblas, 3);
auto recCblas = [=](Expr e, bool stop = false) {
e->record(tuner, hashCblas, stop);
return e;
};
auto algCblas = [=]() {
auto ac = clip(a, clipValue);
if(ac != a)
ac = recCblas(ac);
auto bc = clip(b, clipValue);
if(bc != b)
bc = recCblas(bc);
int rows = ac->shape().elements() / ac->shape()[-1];
Expr ones = ac->graph()->ones({rows, 1}, bias->value_type());
std::vector<Expr> nodes = {ac, bc, bias, ones};
return recCblas(Expression<AffineNodeOp>(nodes, transA, transB, scale),
true);
};
tuner->insert({hashCblas, algCblas});
// execute algorithm with autotuning
return tuner->run();
} else {
if(gemmType == GemmType::IntrinInt16) {
if(device == DeviceType::cpu) {
if(isFloat(aElementType) && isFloat(bElementType)) {
if(a->graph()->getBackend()->isOptimized()) {
// cpu int16 version
return cpu::int16::affine(
cpu::int16::quantize(transA ? transpose(a) : a, clipValue),
cpu::int16::quantize(transB ? b : transpose(b), clipValue),
bias,
scale);
} else if(gemmType == GemmType::FbFp16Packed) {
#if USE_FBGEMM
// 07/10/2019 - Use packed GEMM only if the cpu architecture supports AVX2
// one of the fbgemm's sub modules, cpuinfo (https://github.com/pytorch/cpuinfo).
// It looks at the cpu register
// (https://github.com/pytorch/cpuinfo/blob/master/src/x86/isa.c#L391),
// and this cpu lookup is executed only once and the state is kept in FBGEMM.
if(fbgemm::fbgemmHasAvx2Support() && b->memoize()) {
auto packed = cpu::variant::pack(b, cpu::variant::PackMatrix::B, transB, clipValue);
return cpu::variant::affine(
clip(a, clipValue),
packed,
b->shape(),
bias,
transA,
transB,
scale);
} else {
int rows = a->shape().elements() / a->shape()[-1];
Expr ones = a->graph()->ones({rows, 1}, bias->value_type());
std::vector<Expr> nodes = {clip(a, clipValue), clip(b, clipValue), bias, ones};
return Expression<AffineNodeOp>(nodes, transA, transB, scale);
}
#else
ABORT("Packed GEMM is not available in this build");
#endif // USE_FBGEMM
} else if(gemmType == GemmType::MklFp32) {
// general version, MKL, CBlas or CUDA
// if clipValue > 0, the inputs will be clipped to range [-clipValue,
// clipValue] This is meant to keep values at the same range as used during
// training when optimizing for 8-bit integer products. Likely to be removed
// in the future when we explore better ways to handle this.
int rows = a->shape().elements() / a->shape()[-1];
Expr ones = a->graph()->ones({rows, 1}, bias->value_type());
std::vector<Expr> nodes
= {clip(a, clipValue), clip(b, clipValue), bias, ones};
return Expression<AffineNodeOp>(nodes, transA, transB, scale);
cpu::int16::quantize(transA ? transpose(a) : a, clipValue),
cpu::int16::quantize(transB ? b : transpose(b), clipValue),
bias,
scale);
} else {
ABORT("GemmType..{} not available by affine()", gemmType);
return affineDefault(a, b, bias, transA, transB, scale);
}
} else if(isFloat(aElementType) && isPacked(bElementType)) {
#if USE_FBGEMM
// 07/10/2019 - Use packed GEMM only if the cpu architecture supports AVX2.
// The check is done by one of fbgemm's submodules, cpuinfo (https://github.com/pytorch/cpuinfo).
// It looks at the cpu registers
// (https://github.com/pytorch/cpuinfo/blob/master/src/x86/isa.c#L391),
// and this cpu lookup is executed only once; the state is kept in FBGEMM.
if(fbgemm::fbgemmHasAvx2Support()) {
return cpu::variant::affine(clip(a, clipValue),
b,
b->shape(),
bias,
transA,
transB,
scale);
} else {
ABORT("No on-the-fly packing at the moment");
}
#else
ABORT("Packed GEMM is not available in this build");
#endif // USE_FBGEMM
} else {
ABORT("Combination of types A: {} B: {} not supported", aElementType, bElementType);
}
} else {
// general version, MKL, CBlas or CUDA
// if clipValue > 0, the inputs will be clipped to range [-clipValue,
// clipValue] This is meant to keep values at the same range as used during
// training when optimizing for 8-bit integer products. Likely to be removed
// in the future when we explore better ways to handle this.
int rows = a->shape().elements() / a->shape()[-1];
Expr ones = a->graph()->ones({rows, 1});
std::vector<Expr> nodes
= {clip(a, clipValue), clip(b, clipValue), bias, ones};
return Expression<AffineNodeOp>(nodes, transA, transB, scale);
// Default GEMM
return affineDefault(a, b, bias, transA, transB, scale);
}
}


@ -85,7 +85,7 @@ public:
virtual void setId(size_t id) override { id_ = id; }
virtual size_t getId() override { return id_; }
virtual void increaseEdges(size_t edges = 1) { edges_ += edges; };
virtual void decreaseEdges(size_t edges = 1) { edges_ -= edges; };
virtual size_t edges() { return edges_; };


@ -39,7 +39,7 @@ class LambdaInitConvert : public NodeInitializer {
auto sharedAllocator = allocator_.lock();
ABORT_IF(!sharedAllocator, "Allocator in LambdaInitConvert has not been set or expired");
auto memory = sharedAllocator->alloc(tensor->size(), intermediateType_);
auto memory = sharedAllocator->alloc(requiredBytes(tensor->shape(), intermediateType_));
auto temp = TensorBase::New(memory,
tensor->shape(),
intermediateType_,
@ -183,11 +183,11 @@ Ptr<NodeInitializer> fromItem(const io::Item& item) {
"Tensor type ({}) and type for mapping ({}) do not match",
tensor->type(),
item.type);
ABORT_IF(tensor->size() != item.size() / sizeOf(item.type),
"Tensor size ({}) and mapped size ({}) do not match",
tensor->size(),
item.size() / sizeOf(item.type));
auto mp = MemoryPiece::New((uint8_t*)item.ptr, tensor->size() * sizeOf(item.type));
ABORT_IF(tensor->shape() != item.shape,
"Tensor shape ({}) and shape of mapped item ({}) do not match",
tensor->shape(),
item.shape);
auto mp = MemoryPiece::New((uint8_t*)item.ptr, item.size()); // @TODO: this is not properly aligned now
tensor->reset(mp);
});
} else {


@ -40,6 +40,10 @@ public:
LOG(debug, "Created parameter object of type {}", acceptedElementType_);
}
~Parameters() {
LOG(debug, "Destroyed parameter object of type {}", acceptedElementType_);
}
auto begin() -> decltype(params_.begin()) { return params_.begin(); }
auto end() -> decltype(params_.begin()) { return params_.end(); }
@ -58,6 +62,8 @@ public:
size_t size() { return params_.size(); }
void add(Expr p, const std::string& name) {
LOG(debug, "Adding parameter {} to parameter object of type {}", name, acceptedElementType_);
ABORT_IF(named_.count(name), "Parameter '{}' already exists", name);
ABORT_IF(p->value_type() != acceptedElementType_,
"Requested parameter type ({}) is different from chosen parameter type ({})",


@ -281,7 +281,7 @@ namespace marian {
factorB = slice(b_, -1, Slice((int)range.first, (int)range.second));
}
// @TODO: b_ should be a vector, not a matrix; but shotlists use cols() in, which requires a matrix
auto factorLogits = affine(input1, factorWt, factorB, false, /*transB=*/isLegacyUntransposedW ? false : true); // [B... x U] factor logits
auto factorLogits = affine(input1, factorWt, factorB, false, /*transB=*/isLegacyUntransposedW ? false : true, /*scale=*/1.0f); // [B... x U] factor logits
// optionally add lemma-dependent bias
if (Plemma) { // [B... x U0]
int lemmaVocabDim = Plemma->shape()[-1];


@ -10,6 +10,11 @@
#include "translator/scorers.h"
#include "data/alignment.h"
#include "data/vocab_base.h"
#include "graph/expression_graph_packable.h"
#if USE_FBGEMM
#include "fbgemm/Utils.h"
#endif
namespace marian {
@ -69,10 +74,6 @@ public:
device_ = New<cpu::WrappedDevice>(deviceId);
graph_->setDevice(deviceId, device_);
// Use packed GEMM for the production
graph_->getBackend()->setOptimized(true);
graph_->getBackend()->setGemmType("fp16packed");
#if MKL_FOUND
mkl_set_num_threads(options->get<int>("mkl-threads", 1));
#endif
@ -211,5 +212,64 @@ std::vector<Ptr<IVocabWrapper>> loadVocabs(const std::vector<std::string>& vocab
return res;
}
// query CPU AVX version
DecoderCpuAvxVersion getCpuAvxVersion() {
#if USE_FBGEMM
// Default value is AVX
DecoderCpuAvxVersion cpuAvxVer = DecoderCpuAvxVersion::AVX;
if (fbgemm::fbgemmHasAvx512Support())
cpuAvxVer = DecoderCpuAvxVersion::AVX512;
else if (fbgemm::fbgemmHasAvx2Support())
cpuAvxVer = DecoderCpuAvxVersion::AVX2;
return cpuAvxVer;
#else
// Default value is AVX
return DecoderCpuAvxVersion::AVX;
#endif
}
DecoderCpuAvxVersion parseCpuAvxVersion(std::string name) {
if (name == "avx") {
return DecoderCpuAvxVersion::AVX;
} else if (name == "avx2") {
return DecoderCpuAvxVersion::AVX2;
} else if (name == "avx512") {
return DecoderCpuAvxVersion::AVX512;
} else {
ABORT("Unknown CPU Instruction Set: {}", name);
return DecoderCpuAvxVersion::AVX;
}
}
// @TODO: clean up this code and unify with marian-conv. The targetPrec parameter is not clear enough, etc.
bool convertModel(std::string inputFile, std::string outputFile, int32_t targetPrec) {
std::cout << "Converting from: " << inputFile << ", to: " << outputFile << std::endl;
YAML::Node config;
std::stringstream configStr;
marian::io::getYamlFromModel(config, "special:model.yml", inputFile);
configStr << config;
auto graph = New<ExpressionGraphPackable>();
graph->setDevice(CPU0);
graph->getBackend()->setOptimized(false);
graph->load(inputFile);
graph->forward();
std::string saveGemmType = "fp32default";
if (targetPrec == 16)
saveGemmType = "fp16packed";
else if (targetPrec == 8)
saveGemmType = "int8packed";
// added a flag indicating whether the weights need to be packed or not
graph->packAndSave(outputFile, configStr.str(), saveGemmType); // @TODO: this should just be type-based
std::cout << "Conversion Finished." << std::endl;
return true;
}
} // namespace quicksand
} // namespace marian
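A hedged sketch of how a caller might use the two new quicksand entry points above (the header path is our assumption; error handling omitted):

```cpp
#include <cstdint>
#include "microsoft/quicksand.h"  // assumed location of the quicksand API header

using namespace marian::quicksand;

int main() {
  // choose the packed fp16 path only if the CPU can run FBGEMM's AVX2 kernels
  DecoderCpuAvxVersion ver = getCpuAvxVersion();
  int32_t targetPrec = (ver >= DecoderCpuAvxVersion::AVX2) ? 16 : 32;  // 32 -> "fp32default"
  bool ok = convertModel("model.npz", "model.bin", targetPrec);
  return ok ? 0 : 1;
}
```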


@ -25,6 +25,12 @@ typedef std::tuple<WordIndices, AlignmentSets, float> QSSentenceWithProb;
typedef std::vector<QSSentenceWithProb> QSNBest;
typedef std::vector<QSNBest> QSNBestBatch;
enum class DecoderCpuAvxVersion {
AVX,
AVX2,
AVX512
};
Ptr<Options> newOptions();
template <class T>
@ -64,5 +70,11 @@ Ptr<IBeamSearchDecoder> newDecoder(Ptr<Options> options,
// load src and tgt vocabs
std::vector<Ptr<IVocabWrapper>> loadVocabs(const std::vector<std::string>& vocabPaths);
// query CPU AVX version
DecoderCpuAvxVersion getCpuAvxVersion();
DecoderCpuAvxVersion parseCpuAvxVersion(std::string name);
bool convertModel(std::string inputFile, std::string outputFile, int32_t targetPrec);
} // namespace quicksand
} // namespace marian


@ -77,7 +77,6 @@ public:
graph->getBackend()->setClip(options_->get<float>("clip-gemm"));
if (device.type == DeviceType::cpu) {
graph->getBackend()->setOptimized(options_->get<bool>("optimize"));
graph->getBackend()->setGemmType(options_->get<std::string>("gemm-type"));
}
graph->reserveWorkspaceMB(options_->get<size_t>("workspace"));


@ -193,12 +193,6 @@ public:
return alignedSize(num * sizeof(T));
}
size_t capacity(size_t num, Type type) { return alignedSize(num * sizeOf(type)); }
MemoryPiece::PtrType alloc(size_t num, Type type) {
return alloc(num * sizeOf(type));
}
template <typename T>
MemoryPiece::PtrType alloc(size_t num) {
return alloc(capacity<T>(num));


@ -41,10 +41,6 @@ public:
// for GPU, this is invalid. for gpu, isOptimized() function always returns false.
virtual void setOptimized(bool optimize) = 0;
virtual bool isOptimized() = 0;
// for CPU, selects different GEMM types for the inference.
// for GPU, there's no gemm type. so, it does nothing.
virtual void setGemmType(std::string gemmType) = 0;
virtual GemmType getGemmType() = 0;
};
Ptr<Backend> BackendByDeviceId(DeviceId deviceId, size_t seed);


@ -22,18 +22,6 @@ public:
// for CPU & inference only, sets to use optimized code for inference. Does nothing for GPU.
void setOptimized(bool optimize) override { optimized_ = optimize; }
bool isOptimized() override { return optimized_; }
// for CPU only, selects different GEMM types for the inference. Does nothing for GPU.
void setGemmType(std::string gemmType) override {
if (gemmType == "auto") gemmType_ = GemmType::Auto;
else if (gemmType == "mklfp32") gemmType_ = GemmType::MklFp32;
else if (gemmType == "intrinint16") gemmType_ = GemmType::IntrinInt16;
#if USE_FBGEMM
else if (gemmType == "fp16packed") gemmType_ = GemmType::FbFp16Packed;
else if (gemmType == "int8packed") gemmType_ = GemmType::FbInt8Packed;
#endif // USE_FBGEMM
else ABORT("Unknown GEMM type - '{}'", gemmType);
}
GemmType getGemmType() override { return gemmType_; }
};
} // namespace cpu
} // namespace marian


@ -4,7 +4,6 @@
#include "tensors/cpu/sharp/packed_gemm.h"
#if USE_FBGEMM
#ifdef __GNUC__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-variable"
@ -70,7 +69,7 @@ struct PackNodeOp : public UnaryNodeOp {
NodeOps forwardOps() override {
return {NodeOp(PackFp32(val_,
child(0)->val(),
child(0)->val()->data(),
transpose_,
nrow_,
ncol_,
@ -97,17 +96,17 @@ struct PackNodeOp : public UnaryNodeOp {
// Should be 2D - weight matrix
ABORT_IF(shapeMat.size() != 2,
"Weight Matrix should be 2D");
nrow_ = transpose ? shapeMat[1] : shapeMat[0];
ncol_ = transpose ? shapeMat[0] : shapeMat[1];
kernel_ncol_blocks_ = 2;
brow_ = 512;
bcol_ = 8 * kernel_ncol_blocks_;
last_brow_ = nrow_ % brow_ == 0 ? brow_ : nrow_ % brow_;
nbrow_ = nrow_ % brow_ == 0 ? nrow_ / brow_ : (nrow_ + brow_) / brow_;
nbcol_ = ncol_ % bcol_ == 0 ? ncol_ / bcol_ : (ncol_ + bcol_) / bcol_;
const int padding = 1024; // required by sw pipelined kernels
const int specialMem = 256;
packsize_ = ((nbrow_ * brow_) * (nbcol_ * bcol_)) * sizeof(fbgemm::float16) + padding + specialMem;
PackInfoFp32(shapeMat,
transpose,
nrow_,
ncol_,
kernel_ncol_blocks_,
brow_,
bcol_,
last_brow_,
nbrow_,
nbcol_,
packsize_);
Shape outShape({(int)packsize_});

88
src/tensors/cpu/sharp/packed_gemm.cpp Executable file → Normal file

@ -20,16 +20,13 @@
//#pragma comment(linker, "/ignore:4217") // locally defined symbol ...asmjit... imported
#endif
#ifdef __GNUC__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-variable"
#endif
#include "3rd_party/fbgemm/include/fbgemm/FbgemmFP16.h"
#include "3rd_party/fbgemm/include/fbgemm/QuantUtils.h"
#include "3rd_party/fbgemm/include/fbgemm/Fbgemm.h"
#ifdef __GNUC__
#pragma GCC diagnostic pop
#endif
@ -68,6 +65,14 @@ namespace variant { // Variants of GEMM implementations
// different marian instances should not share this variable.
static thread_local PackedGemmMatrixFP16 packedPlaceholder(1, 1, 1, 1, 1, 1, 1, 1);
// Copied from fbgemm: padding required by some kernels in FBGEMM.
// Verbatim: 'required by sw pipelined kernels'
// https://github.com/marian-nmt/FBGEMM/blob/master/include/fbgemm/FbgemmFP16.h#L109
const int PACK16_PADDING = 1024;
// Memory space to store auxiliary variables for FBGEMM (e.g. block row, block column, kernel_ncol_blocks, etc.)
const int PACK16_SPECIALMEM = 256;
// This is copied from FBGEMM code
// A better way?
// will be removed when the FBGEMM API is changed
@ -106,8 +111,39 @@ inline uint64_t addr(const int r_,
return index;
}
void PackInfoFp32(const marian::Shape& shape,
const bool transpose,
uint64_t& packsize) {
int nrow, ncol, kernel_ncol_blocks, brow = 512, bcol, last_brow, nbrow, nbcol;
PackInfoFp32(shape, transpose, nrow, ncol, kernel_ncol_blocks, brow, bcol, last_brow, nbrow, nbcol, packsize);
}
void PackInfoFp32(const marian::Shape& shape,
const bool transpose,
int& nrow,
int& ncol,
int& kernel_ncol_blocks,
int& brow,
int& bcol,
int& last_brow,
int& nbrow,
int& nbcol,
uint64_t& packsize) {
nrow = transpose ? shape[1] : shape[0];
ncol = transpose ? shape[0] : shape[1];
kernel_ncol_blocks = 2;
brow = 512;
bcol = 8 * kernel_ncol_blocks;
last_brow = nrow % brow == 0 ? brow : nrow % brow;
nbrow = nrow % brow == 0 ? nrow / brow : (nrow + brow) / brow;
nbcol = ncol % bcol == 0 ? ncol / bcol : (ncol + bcol) / bcol;
ABORT_IF(ncol % bcol != 0, "ncol (number of columns) should be multiple of 16. {}", ncol);
packsize = ((nbrow * brow) * (nbcol * bcol)) * sizeof(fbgemm::float16) + PACK16_PADDING
+ PACK16_SPECIALMEM;
}
void PackFp32(marian::Tensor out,
const marian::Tensor in,
const float* inData, // Packing is only available for 2D weight matrix in Marian. Otherwise, it's aborted in expanded_gemm.h.
const bool transpose,
const int nrow,
const int ncol,
@ -141,11 +177,10 @@ void PackFp32(marian::Tensor out,
fbgemm::float16* outmem = (fbgemm::float16*)(outmemorg + 256);
fbgemm::float16* dummy = new fbgemm::float16;
// pack the matrix
float* inmem = in->data();
for(int i = 0; i < nrow; i++) {
for(int j = 0; j < ncol; j++) {
outmem[addr(i, j, brow, bcol, nbrow, nbcol, last_brow)]
= tconv(!transpose ? inmem[i * ncol + j] : inmem[i + nrow * j], *dummy);
= tconv(!transpose ? inData[i * ncol + j] : inData[i + nrow * j], *dummy);
}
}
delete dummy;
@ -186,15 +221,17 @@ void GemmPackFp32(marian::Tensor C,
// packed matrix
packedPlaceholder.pmat_ = (fbgemm::float16*)(B->data<uint8_t>() + 256);
if(bias != nullptr) {
#if MKL_FOUND
for(int i = 0; i < m; ++i) {
mkl_somatcopy('R', 'N', 1, n, 1, bias->data(), n, C->data() + n * i, n);
}
for(int i = 0; i < m; ++i) {
mkl_somatcopy('R', 'N', 1, n, 1, bias->data(), n, C->data() + n * i, n);
}
#else
for(int i = 0; i < m; ++i) {
std::copy(bias->data(), bias->data() + n, C->data() + n * i);
}
for(int i = 0; i < m; ++i) {
std::copy(bias->data(), bias->data() + n, C->data() + n * i);
}
#endif
}
#ifdef _OPENMP
#pragma omp parallel
@ -211,7 +248,7 @@ void GemmPackFp32(marian::Tensor C,
(int)m,
A->data(),
packedPlaceholder,
1,
bias != nullptr ? 1.0 : 0.0,
C->data(),
tid,
num_threads);
@ -221,8 +258,31 @@ void GemmPackFp32(marian::Tensor C,
packedPlaceholder.pmat_ = pmat;
}
#else // USE_FBGEMM
void PackInfoFp32(const marian::Shape& shape,
const bool transpose,
uint64_t& packsize) {
// does nothing. supports only FBGEMM based packed gemm at this moment.
ABORT("FBGEMM is needed to use packed GEMM.");
}
void PackInfoFp32(const marian::Shape& shape,
const bool transpose,
int& nrow,
int& ncol,
int& kernel_ncol_blocks,
int& brow,
int& bcol,
int& last_brow,
int& nbrow,
int& nbcol,
uint64_t& packsize) {
// does nothing. supports only FBGEMM based packed gemm at this moment.
ABORT("FBGEMM is needed to use packed GEMM.");
}
void PackFp32(marian::Tensor out,
const marian::Tensor in,
const float* inData,
const bool transpose,
const int nrow,
const int ncol,


@ -6,9 +6,25 @@ namespace marian {
namespace cpu {
namespace variant { // Variants of GEMM implementations
void PackInfoFp32(const marian::Shape& shape,
const bool transpose,
/*out*/uint64_t& packsize);
void PackInfoFp32(const marian::Shape& shape,
const bool transpose,
int& nrow,
int& ncol,
int& kernel_ncol_blocks,
int& brow,
int& bcol,
int& last_brow,
int& nbrow,
int& nbcol,
/*out*/uint64_t& packsize); // @TODO: change to size_t where appropriate
// Pack a matrix in a cache-utilization-friendly way (block format)
// out: output tensor - packed format
// in: input tensor - normal format
// inData: input tensor data - pointer to the float data
// transpose: the matrix is transposed
// nrow: the number of rows
// ncol: the number of columns
@ -21,7 +37,7 @@ namespace variant { // Variants of GEMM implementations
// packsize: the size of the packed matrix
// (the number of fp16 elements + padding (1024) + extra temporary memory (256))
void PackFp32(marian::Tensor out,
const marian::Tensor in,
const float* inData,
const bool transpose,
const int nrow,
const int ncol,
@ -31,7 +47,7 @@ void PackFp32(marian::Tensor out,
const int last_brow,
const int nbrow,
const int nbcol,
const uint64_t packsize);
const uint64_t packsize); // @TODO: change to size_t where appropriate
// GEMM operation on the packed B matrix
// C: output matrix


@ -59,16 +59,6 @@ public:
return false;
}
// for CPU, selects different GEMM types for the inference.
// for GPU, there's no gemm type. so, it does nothing.
void setGemmType(std::string gemmType) override {
LOG_ONCE(info, "setGemmType() not supported for GPU_{}", gemmType);
}
GemmType getGemmType() override {
LOG_ONCE(info, "getGemmType() not supported for GPU");
return GemmType::Auto;
}
private:
cublasHandle_t cublasHandle_;
cusparseHandle_t cusparseHandle_;


@ -214,12 +214,10 @@ void Concatenate1(Tensor out, const std::vector<Tensor>& inputs) {
int threads = std::min(MAX_THREADS, cols_in);
if(out->type() == Type::float32) {
gInsertCols<false><<<blocks, threads>>>(
out->data<float>(), in->data<float>(), rows, cols_in, cols_out, cols_in, offset, 0);
gInsertCols<false><<<blocks, threads>>>(out->data<float>(), in->data<float>(), rows, cols_in, cols_out, cols_in, offset, 0);
#if COMPILE_FP16
} else if(out->type() == Type::float16) {
gInsertCols<false><<<blocks, threads>>>(
out->data<half>(), in->data<half>(), rows, cols_in, cols_out, cols_in, offset, 0);
gInsertCols<false><<<blocks, threads>>>(out->data<half>(), in->data<half>(), rows, cols_in, cols_out, cols_in, offset, 0);
#endif
} else {
ABORT("Concatenate1 not implemented for type {}", out->type());
@ -2130,11 +2128,11 @@ void LayerNormalizationGrad(Ptr<Allocator> allocator,
int threads = std::min(MAX_THREADS, cols);
int blocks = std::min(MAX_BLOCKS, rows);
auto tempGradGammaMemory = allocator->alloc(adj->memory()->size(), adj->type());
auto tempGradGammaMemory = allocator->alloc(adj->memory()->size());
Tensor tempGradGamma = TensorBase::New(tempGradGammaMemory, adj->shape(), adj->type(), adj->getBackend());
tempGradGamma->set(0.f);
auto tempOnesMemory = allocator->alloc(rows * sizeOf(adj->type()), adj->type());
auto tempOnesMemory = allocator->alloc(rows * sizeOf(adj->type()));
Tensor tempOnes = TensorBase::New(tempOnesMemory, Shape({1, rows}), adj->type(), adj->getBackend());
tempOnes->set(1.f);


@ -1,5 +1,6 @@
#include "tensors/tensor.h"
#include "tensors/tensor_operators.h"
#include "common/io.h"
namespace marian {
@ -120,13 +121,22 @@ void TensorBase::get(io::Item& item, const std::string& name) {
item.shape = shape_;
item.type = type_;
size_t bytesWithoutPadding = shape_.elements() * sizeOf(type_);
item.bytes.resize(bytesWithoutPadding);
item.bytes.resize(memory_->size());
copy(backend_,
memory_->data<char>(),
memory_->data<char>() + bytesWithoutPadding,
memory_->data<char>() + memory_->size(),
item.bytes.data());
}
void TensorBase::set(const io::Item& item) {
ABORT_IF(item.type != type_, "Tensor type {} and item type {} do not match", type_, item.type);
ABORT_IF(item.shape != shape_, "Tensor shape {} and item shape {} do not match", shape_, item.shape);
ABORT_IF(item.bytes.size() > memory_->size(), "Item data size {} too large for memory {}", item.bytes.size(), memory_->size());
copy(backend_,
item.bytes.data(),
item.bytes.data() + item.bytes.size(),
memory_->data<char>());
}
} // namespace marian


@ -3,7 +3,6 @@
#include "common/definitions.h"
#include "common/shape.h"
#include "common/types.h"
#include "common/io.h"
#include "tensors/backend.h"
#include "tensors/memory_piece.h"
#ifdef CUDA_FOUND
@ -18,6 +17,10 @@
namespace marian {
namespace io {
struct Item;
}
class TensorBase {
MemoryPiece::PtrType memory_;
Shape shape_;
@ -150,6 +153,11 @@ public:
template <typename T>
void set(const T* begin, const T* end) {
ABORT_IF(end - begin != shape_.elements(),
"Vector size ({}) and underlying shape ({}, {}) do not match",
end - begin,
std::string(shape_),
memory_->size());
matchOrAbort<T>(type_);
if(backend_->getDeviceId().type == DeviceType::cpu) {
@ -167,36 +175,7 @@ public:
set(v.data(), v.data() + v.size());
}
// a binary copy with type checking
void set(const char* begin, const char* end, Type type) {
ABORT_IF(type_ != type,
"Tensor type ({}) and data type ({}) do not match",
type_,
type);
size_t dataSize = (end - begin) / sizeOf(type);
ABORT_IF(size() != dataSize,
"Tensor size ({}) and mapped size ({}) do not match",
size(),
dataSize);
if(backend_->getDeviceId().type == DeviceType::cpu) {
std::copy(begin, end, data<char>());
}
#ifdef CUDA_FOUND
else {
gpu::copy(backend_, begin, end, data<char>());
}
#endif
}
void set(const std::vector<char>& v, Type type) {
set(v.data(), v.data() + v.size(), type);
}
void set(const io::Item& item) {
set(item.bytes.data(), item.bytes.data() + item.bytes.size(), item.type);
}
void set(const io::Item& item);
// For single values enable conversion to other numeric formats if possible
template <typename T>


@ -70,14 +70,13 @@ public:
void clear() { allocator_->clear(); }
size_t capacity(Shape shape, Type type = Type::float32) {
return allocator_->capacity(shape.elements(), type);
return allocator_->capacity<char>(requiredBytes(shape, type));
}
void allocate(Tensor& t, Shape shape, Type type = Type::float32) {
void allocate(/*out*/ Tensor& t, Shape shape, Type type = Type::float32) {
if(!t || t->shape() != shape) {
int size = shape.elements();
auto mem = allocator_->alloc(size, type);
t = TensorBase::New(mem, shape, type, backend_);
auto mem = allocator_->alloc(requiredBytes(shape, type));
t = Tensor(TensorBase::New(mem, shape, type, backend_));
}
}


@ -41,7 +41,6 @@ int main(int argc, char** argv) {
auto g = New<ExpressionGraph>(true);
g->setDevice({0, DeviceType::cpu});
g->getBackend()->setOptimized(true);
g->getBackend()->setGemmType("auto");
g->reserveWorkspaceMB(2512);
timer::AutoTimer timer;


@ -169,7 +169,7 @@ public:
// to only accept one parameter, and remove this error check can be removed.
ABORT_IF(sendbuf != recvbuf, "FakeMPIWrapper::allReduce() only implemented for in-place operation"); // otherwise it's not a no-op, we must copy data
}
#pragma warning(push)
#pragma warning(pop)
virtual void finalize() override { }
};


@ -65,7 +65,6 @@ public:
graph->getBackend()->setClip(options_->get<float>("clip-gemm"));
if (device.type == DeviceType::cpu) {
graph->getBackend()->setOptimized(options_->get<bool>("optimize"));
graph->getBackend()->setGemmType(options_->get<std::string>("gemm-type"));
}
graph->reserveWorkspaceMB(options_->get<size_t>("workspace"));
graphs_[id] = graph;
@ -187,7 +186,6 @@ public:
graph->getBackend()->setClip(options_->get<float>("clip-gemm"));
if (device.type == DeviceType::cpu) {
graph->getBackend()->setOptimized(options_->get<bool>("optimize"));
graph->getBackend()->setGemmType(options_->get<std::string>("gemm-type"));
}
graph->reserveWorkspaceMB(options_->get<size_t>("workspace"));
graphs_.push_back(graph);


@ -16,6 +16,7 @@
<RootNamespace>Marian</RootNamespace>
<ProjectName>Marian</ProjectName>
<WindowsTargetPlatformVersion>10.0.17763.0</WindowsTargetPlatformVersion>
<CudaToolkitDir Condition="'$(CudaToolkitDir)' == ''">$(CUDA_PATH)</CudaToolkitDir>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
@ -33,7 +34,7 @@
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 10.1.props" />
<Import Project="$(CudaToolkitDir)\extras\visual_studio_integration\MSBuildExtensions\CUDA 10.1.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
@ -42,15 +43,15 @@
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<LinkIncremental>true</LinkIncremental>
<IntDir>$(SolutionDir)$(Platform)\$(Configuration)\Marian\</IntDir>
<IncludePath>..\src\3rd_party\fbgemm\third_party\cpuinfo\deps\clog\include;..\src\3rd_party\fbgemm\third_party\cpuinfo\src;..\src\3rd_party\fbgemm\third_party\cpuinfo\include;..\src\3rd_party\fbgemm\third_party\asmjit\src;%MKL_PATH%\include;..\src\3rd_party\fbgemm\include;%CUDA_PATH%\include;..\src;..\src\3rd_party;%BOOST_INCLUDE_PATH%;%ZLIB_PATH%\include;$(VC_IncludePath);$(WindowsSDK_IncludePath);</IncludePath>
<LibraryPath>%CUDA_PATH%\lib\x64;%BOOST_LIB_PATH%;%ZLIB_PATH%\lib;%MKL_PATH%\lib\intel64;$(VC_LibraryPath_x64);$(WindowsSDK_LibraryPath_x64);$(NETFXKitsDir)Lib\um\x64</LibraryPath>
<IncludePath>$(CudaToolkitIncludeDir);..\src\3rd_party\fbgemm\third_party\cpuinfo\deps\clog\include;..\src\3rd_party\fbgemm\third_party\cpuinfo\src;..\src\3rd_party\fbgemm\third_party\cpuinfo\include;..\src\3rd_party\fbgemm\third_party\asmjit\src;%MKL_PATH%\include;..\src\3rd_party\fbgemm\include;..\src;..\src\3rd_party;%BOOST_INCLUDE_PATH%;%ZLIB_PATH%\include;$(VC_IncludePath);$(WindowsSDK_IncludePath)</IncludePath>
<LibraryPath>$(CudaToolkitLibDir);%BOOST_LIB_PATH%;%ZLIB_PATH%\lib;%MKL_PATH%\lib\intel64;$(VC_LibraryPath_x64);$(WindowsSDK_LibraryPath_x64);$(NETFXKitsDir)Lib\um\x64</LibraryPath>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<LinkIncremental>false</LinkIncremental>
<ExecutablePath>$(ExecutablePath)</ExecutablePath>
<IntDir>$(SolutionDir)$(Platform)\$(Configuration)\Marian\</IntDir>
<IncludePath>..\src\3rd_party\fbgemm\third_party\cpuinfo\deps\clog\include;..\src\3rd_party\fbgemm\third_party\cpuinfo\src;..\src\3rd_party\fbgemm\third_party\cpuinfo\include;..\src\3rd_party\fbgemm\third_party\asmjit\src;%MKL_PATH%\include;..\src\3rd_party\fbgemm\include;%CUDA_PATH%\include;..\src;..\src\3rd_party;%BOOST_INCLUDE_PATH%;%ZLIB_PATH%\include;$(VC_IncludePath);$(WindowsSDK_IncludePath);</IncludePath>
<LibraryPath>%CUDA_PATH%\lib\x64;%BOOST_LIB_PATH%;%ZLIB_PATH%\lib;%MKL_PATH%\lib\intel64;$(VC_LibraryPath_x64);$(WindowsSDK_LibraryPath_x64);$(NETFXKitsDir)Lib\um\x64</LibraryPath>
<IncludePath>$(CudaToolkitIncludeDir);..\src\3rd_party\fbgemm\third_party\cpuinfo\deps\clog\include;..\src\3rd_party\fbgemm\third_party\cpuinfo\src;..\src\3rd_party\fbgemm\third_party\cpuinfo\include;..\src\3rd_party\fbgemm\third_party\asmjit\src;%MKL_PATH%\include;..\src\3rd_party\fbgemm\include;..\src;..\src\3rd_party;%BOOST_INCLUDE_PATH%;%ZLIB_PATH%\include;$(VC_IncludePath);$(WindowsSDK_IncludePath)</IncludePath>
<LibraryPath>$(CudaToolkitLibDir);%BOOST_LIB_PATH%;%ZLIB_PATH%\lib;%MKL_PATH%\lib\intel64;$(VC_LibraryPath_x64);$(WindowsSDK_LibraryPath_x64);$(NETFXKitsDir)Lib\um\x64</LibraryPath>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
@ -72,7 +73,7 @@
<PreprocessorDefinitions>BOOST_CONFIG_SUPPRESS_OUTDATED_MESSAGE; FBGEMM_EXPORTS; USE_FBGEMM=1; USE_SSE2=1; CUDA_FOUND=1; MKL_FOUND=1; MPI_FOUND=1; BLAS_FOUND=1; MKL_ILP64; WIN32;_DEBUG;_CONSOLE;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<SDLCheck>false</SDLCheck>
<TreatWarningAsError>true</TreatWarningAsError>
<AdditionalOptions>/bigobj %(AdditionalOptions) /arch:AVX</AdditionalOptions>
<AdditionalOptions>/bigobj /arch:AVX %(AdditionalOptions)</AdditionalOptions>
<RuntimeLibrary Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">MultiThreadedDebugDLL</RuntimeLibrary>
<DisableSpecificWarnings>4996; 4702</DisableSpecificWarnings>
<MultiProcessorCompilation>true</MultiProcessorCompilation>
@ -109,7 +110,7 @@
<PreprocessorDefinitions>BOOST_CONFIG_SUPPRESS_OUTDATED_MESSAGE; FBGEMM_EXPORTS; USE_FBGEMM=1; USE_SSE2=1; CUDA_FOUND=1; MKL_FOUND=1; MPI_FOUND=1; BLAS_FOUND=1; MKL_ILP64; WIN32;NDEBUG;_CONSOLE;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<SDLCheck>false</SDLCheck>
<FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
<AdditionalOptions>/d2Zi+ /bigobj %(AdditionalOptions) /arch:AVX</AdditionalOptions>
<AdditionalOptions>/d2Zi+ /bigobj /arch:AVX %(AdditionalOptions)</AdditionalOptions>
<TreatWarningAsError>true</TreatWarningAsError>
<RuntimeLibrary Condition="'$(Configuration)|$(Platform)'=='Release|x64'">MultiThreadedDLL</RuntimeLibrary>
<RuntimeLibrary Condition="'$(Configuration)|$(Platform)'=='Release_NoOpt|x64'">MultiThreaded</RuntimeLibrary>
@ -1006,6 +1007,8 @@
<ClCompile Include="..\src\common\filesystem.cpp" />
<ClCompile Include="..\src\common\file_stream.cpp" />
<ClCompile Include="..\src\common\io.cpp" />
<ClCompile Include="..\src\common\options.cpp" />
<ClCompile Include="..\src\common\types.cpp" />
<ClCompile Include="..\src\common\utils.cpp" />
<ClCompile Include="..\src\common\logging.cpp" />
<ClCompile Include="..\src\common\config.cpp" />
@ -1111,6 +1114,7 @@
</ClCompile>
<ClCompile Include="..\src\training\communicator.cpp" />
<ClCompile Include="..\src\training\graph_group_multinode_sync.cpp" />
<ClCompile Include="..\src\training\scheduler.cpp" />
<ClCompile Include="..\src\translator\history.cpp" />
<ClCompile Include="..\src\translator\output_collector.cpp" />
<ClCompile Include="..\src\translator\nth_element.cpp" />
@ -1173,6 +1177,7 @@
<ClInclude Include="..\src\examples\mnist\validator.h" />
<ClInclude Include="..\src\functional\approx.h" />
<ClInclude Include="..\src\functional\operators.h" />
<ClInclude Include="..\src\graph\expression_graph_packable.h" />
<ClInclude Include="..\src\layers\loss.h" />
<ClInclude Include="..\src\layers\weight.h" />
<ClInclude Include="..\src\marian.h" />
@ -1607,6 +1612,6 @@
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 10.1.targets" />
<Import Project="$(CudaToolkitDir)\extras\visual_studio_integration\MSBuildExtensions\CUDA 10.1.targets" />
</ImportGroup>
</Project>

View File

@ -724,6 +724,15 @@
<ClCompile Include="..\src\common\file_stream.cpp">
<Filter>common</Filter>
</ClCompile>
<ClCompile Include="..\src\training\scheduler.cpp">
<Filter>training</Filter>
</ClCompile>
<ClCompile Include="..\src\common\options.cpp">
<Filter>common</Filter>
</ClCompile>
<ClCompile Include="..\src\common\types.cpp">
<Filter>common</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\src\marian.h" />
@ -2038,6 +2047,9 @@
<ClInclude Include="..\src\3rd_party\zstr\zstr.hpp">
<Filter>3rd_party</Filter>
</ClInclude>
<ClInclude Include="..\src\graph\expression_graph_packable.h">
<Filter>graph</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<Filter Include="3rd_party">