Merged PR 9284: Packed model support in production

This PR enables loading and serving of offline-packed models.
* Enables support for FBGEMM offline-packed models (a conversion sketch follows this list).
* Adds separate parameter sets for different parameter element types (currently inference only).
* Adds new types (packed*) for the packed memory format.
* Adds a CPU instruction set query to the quicksand API.
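For context, a condensed sketch of the offline conversion flow this PR enables (it mirrors the updated marian-conv main() in the diff below; the helper name `convertToPacked` and the abbreviated includes are ours):

```cpp
// Sketch only: pack an *.npz model offline and write a *.bin model whose
// affine weights are stored in FBGEMM's fp16 packed layout.
#include <sstream>
#include "graph/expression_graph_packable.h"  // plus the usual marian/YAML includes

using namespace marian;

void convertToPacked(const std::string& modelFrom, const std::string& modelTo) {
  YAML::Node config;
  std::stringstream configStr;
  io::getYamlFromModel(config, "special:model.yml", modelFrom);
  configStr << config;

  auto graph = New<ExpressionGraphPackable>();  // inference-only graph that can pack weights
  graph->setDevice(CPU0);
  graph->getBackend()->setOptimized(false);
  graph->load(modelFrom);
  graph->forward();

  std::string saveGemmType = "fp16packed";      // or "int8packed" / "fp32default"
  graph->packAndSave(modelTo, configStr.str(), saveGemmType, Type::float32);
}
```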

Related work items: #102572
Young Jin Kim 2019-11-01 17:24:49 +00:00 committed by Martin Junczys-Dowmunt
parent 8c3cb06944
commit 5ed441f843
41 changed files with 615 additions and 411 deletions


@ -22,6 +22,7 @@ add_library(marian STATIC
common/io.cpp
common/filesystem.cpp
common/file_stream.cpp
common/types.cpp
data/alignment.cpp
data/vocab.cpp


@ -4,6 +4,8 @@
#include <sstream>
#include "graph/expression_graph_packable.h"
int main(int argc, char** argv) {
using namespace marian;
@ -13,16 +15,18 @@ int main(int argc, char** argv) {
{
auto cli = New<cli::CLIWrapper>(
options,
"Convert a model in the .npz format to a mmap-able binary model",
"Convert a model in the .npz format and normal memory layout to a mmap-able binary model which could be in normal memory layout or packed memory layout",
"Allowed options",
"Examples:\n"
" ./marian-conv -f model.npz -t model.bin");
" ./marian-conv -f model.npz -t model.bin --gemm-type fp16packed");
cli->add<std::string>("--from,-f", "Input model", "model.npz");
cli->add<std::string>("--to,-t", "Output model", "model.bin");
cli->add<std::string>("--gemm-type,-g", "GEMM Type to be used with this weights", "mklfp32");
cli->parse(argc, argv);
}
auto modelFrom = options->get<std::string>("from");
auto modelTo = options->get<std::string>("to");
auto saveGemmType = options->get<std::string>("gemm-type");
LOG(info, "Outputting {}", modelTo);
@ -31,13 +35,14 @@ int main(int argc, char** argv) {
marian::io::getYamlFromModel(config, "special:model.yml", modelFrom);
configStr << config;
auto graph = New<ExpressionGraph>(true);
auto graph = New<ExpressionGraphPackable>();
graph->setDevice(CPU0);
graph->getBackend()->setOptimized(false);
graph->load(modelFrom);
graph->forward();
graph->save(modelTo, configStr.str());
// added a flag indicating whether the weights need to be packed or not
graph->packAndSave(modelTo, configStr.str(), /* --gemm-type */ saveGemmType, Type::float32);
// graph->saveBinary(vm["bin"].as<std::string>());


@ -18,6 +18,7 @@ struct Header {
size_t dataLength;
};
// cast current void pointer to T pointer and move forward by num elements
template <typename T>
const T* get(const void*& current, size_t num = 1) {
const T* ptr = (const T*)current;
@ -32,9 +33,10 @@ void loadItems(const void* current, std::vector<io::Item>& items, bool mapped) {
binaryFileVersion,
BINARY_FILE_VERSION);
size_t numHeaders = *get<size_t>(current);
const Header* headers = get<Header>(current, numHeaders);
size_t numHeaders = *get<size_t>(current); // number of item headers that follow
const Header* headers = get<Header>(current, numHeaders); // read that many headers
// prepopulate items with meta data from headers
items.resize(numHeaders);
for(int i = 0; i < numHeaders; ++i) {
items[i].type = (Type)headers[i].type;
@ -42,21 +44,22 @@ void loadItems(const void* current, std::vector<io::Item>& items, bool mapped) {
items[i].mapped = mapped;
}
// read in actual shape and data
for(int i = 0; i < numHeaders; ++i) {
size_t len = headers[i].shapeLength;
items[i].shape.resize(len);
const int* arr = get<int>(current, len);
std::copy(arr, arr + len, items[i].shape.begin());
items[i].shape.resize(len);
const int* arr = get<int>(current, len); // read shape
std::copy(arr, arr + len, items[i].shape.begin()); // copy to Item::shape
}
// move by offset bytes
// move by offset bytes, aligned to a 256-byte boundary
size_t offset = *get<size_t>(current);
get<char>(current, offset);
for(int i = 0; i < numHeaders; ++i) {
if(items[i].mapped) {
if(items[i].mapped) { // memory-mapped, hence only set pointer
items[i].ptr = get<char>(current, headers[i].dataLength);
} else {
} else { // reading into item data
size_t len = headers[i].dataLength;
items[i].bytes.resize(len);
const char* ptr = get<char>(current, len);
@ -69,6 +72,7 @@ void loadItems(const std::string& fileName, std::vector<io::Item>& items) {
// Read file into buffer
size_t fileSize = filesystem::fileSize(fileName);
std::vector<char> buf(fileSize);
// @TODO: check this again:
#if 1 // for some reason, the #else branch fails with "file not found" in the *read* operation (open succeeds)
FILE *f = fopen(fileName.c_str(), "rb");
ABORT_IF(f == nullptr, "Error {} ('{}') opening file '{}'", errno, strerror(errno), fileName);
@ -119,7 +123,8 @@ void saveItems(const std::string& fileName,
headers.push_back(Header{item.name.size() + 1,
(size_t)item.type,
item.shape.size(),
item.size()});
item.size()}); // item size without padding
// @TODO: should this be done with padding as asked below?
}
size_t headerSize = headers.size();
@ -146,9 +151,11 @@ void saveItems(const std::string& fileName,
}
// Write out all values
for(const auto& item : items) {
pos += out.write(item.data(), item.size());
}
for(const auto& item : items)
pos += out.write(item.data(), item.size()); // writes out data without padding, not aligned, @BUGBUG?
// @TODO: find out if padding should be enforced for memory-mapped storage like this:
// pos += out.write(item.data(), item.bytes.size()); // writes out data with padding
}
} // namespace binary
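To make the on-disk layout described above concrete, a hedged usage sketch of the reader (the header paths are our assumption; the printed fields come from io::Item as shown in this diff):

```cpp
// Sketch only: load a *.bin model and list its items. Offline-packed items
// appear with type packed8/packed16 but keep their original tensor shape.
#include <iostream>
#include <vector>
#include "common/binary.h"   // assumed header for io::binary::loadItems
#include "common/io_item.h"  // assumed header for io::Item

int main() {
  std::vector<marian::io::Item> items;
  marian::io::binary::loadItems("model.bin", items);
  for(const auto& item : items)
    std::cout << item.name << " : " << item.type << " : "
              << item.shape.elements() << " elements" << std::endl;
  return 0;
}
```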


@ -5,10 +5,10 @@
#include <string>
#include <vector>
// Increase this if binary format changes
#define BINARY_FILE_VERSION 1
namespace marian {
const static int BINARY_FILE_VERSION = 1;
namespace io {
namespace binary {


@ -607,9 +607,6 @@ void ConfigParser::addOptionsTranslation(cli::CLIWrapper& cli) {
"Optimize speed aggressively sacrificing memory or precision");
cli.add<bool>("--skip-cost",
"Ignore model cost during translation, not recommended for beam-size > 1");
cli.add<std::string>("--gemm-type",
"Select GEMM options: auto, mklfp32, intrinint16, fp16packed, int8packed",
"auto");
cli.add<bool>("--fp16",
"Shortcut for mixed precision inference with float16, corresponds to: --precision float16");

2
src/common/file_stream.h Executable file → Normal file

@ -12,7 +12,7 @@
#pragma GCC diagnostic ignored "-Wsuggest-override"
#endif
#ifdef _MSC_VER
#pragma warning(push)
#pragma warning(push) // 4101: 'identifier' : unreferenced local variable. One parameter variable in zstr.hpp is not used.
#pragma warning(disable : 4101)
#endif
#include "3rd_party/zstr/zstr.hpp"

0
src/common/filesystem.cpp Executable file → Normal file


@ -139,8 +139,7 @@ void saveItemsNpz(const std::string& fileName, const std::vector<Item>& items) {
else
ABORT("Other types not supported yet");
npzItems.emplace_back(
item.name, item.bytes, shape, type, sizeOf(item.type));
npzItems.emplace_back(item.name, item.bytes, shape, type, sizeOf(item.type));
}
cnpy::npz_save(fileName, npzItems);
}


@ -24,11 +24,8 @@ struct Item {
return bytes.data();
}
size_t size() const {
if(mapped)
return shape.elements() * sizeOf(type);
else
return bytes.size();
size_t size() const { // @TODO: review this again for 256-byte boundary alignment
return requiredBytes(shape, type);
}
// Extend this item with data and shape from the input item, creating a flattened concatenation.


@ -49,7 +49,7 @@ namespace marian {
}
try {
return !options_[key].as<std::string>().empty();
} catch(const YAML::BadConversion& e) {
} catch(const YAML::BadConversion& /* e */) {
ABORT("Option '{}' is neither a sequence nor a text");
}
return false;

20
src/common/types.cpp Normal file

@ -0,0 +1,20 @@
#include "common/types.h"
#include "tensors/cpu/sharp/packed_gemm.h"
namespace marian {
// This function calculates the number of bytes needed to contain a tensor of the given shape and type.
// For most types that is trivial (the number of elements times the size of a single element).
// But for opaque types such as packed tensors it cannot be inferred by simple multiplication.
// All cases are handled here and the result can later be passed to allocators etc.
size_t requiredBytes(const Shape& shape, Type type) {
if(isPacked(type)) {
uint64_t packsize;
cpu::variant::PackInfoFp32(shape, false, packsize);
return (size_t)packsize;
} else {
return shape.elements() * sizeOf(type);
}
}
}
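A small, hedged illustration of why this helper exists (the packed branch needs an FBGEMM build, since the two-argument PackInfoFp32 aborts otherwise):

```cpp
#include "common/types.h"

using namespace marian;

void requiredBytesExample() {
  Shape shape({512, 1024});
  // trivial case: number of elements times the size of a single element, 512 * 1024 * 4 bytes
  size_t plain = requiredBytes(shape, Type::float32);
  // packed case: the size comes from FBGEMM's blocked layout plus padding and scratch
  // space, so it cannot be derived from the shape alone
  size_t packed = requiredBytes(shape, Type::packed16);
  (void)plain; (void)packed;
}
```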

84
src/common/types.h Executable file → Normal file

@ -1,5 +1,6 @@
#pragma once
#include "common/logging.h" // for ABORT and ABORT_IF
#include "common/shape.h"
#if __GNUC__ >= 7
#pragma GCC diagnostic push
@ -133,6 +134,16 @@ do { \
namespace marian {
// small struct to enable templating based on types used for packing
struct packed8 {
uint8_t x;
};
// small struct to enable templating based on types used for packing
struct packed16 {
uint16_t x;
};
#ifndef __CUDACC__ // vectorized types not available from .cu files
// @TODO: check what intrinsics are actually available.
@ -190,31 +201,38 @@ public:
};
#endif
// Internal to types.h, don't use. Use test functions below.
enum class TypeClass : size_t {
signed_type = 0x100,
signed_type = 0x100,
unsigned_type = 0x200,
float_type = 0x400,
size_mask = 0x0FF
float_type = 0x400,
packed_type = 0x800, // special packed (CPU cache friendly) type class, used in FBGEMM, not meant to be used anywhere else
size_mask = 0x0FF
};
constexpr inline size_t operator+(TypeClass typeClass, size_t val) {
return (size_t)typeClass + val;
}
// @TODO: rename to ElementType when things become stable, so it's easier to review
enum class Type : size_t {
int8 = TypeClass::signed_type + 1u,
int16 = TypeClass::signed_type + 2u,
int32 = TypeClass::signed_type + 4u,
int64 = TypeClass::signed_type + 8u,
int8 = TypeClass::signed_type + 1u,
int16 = TypeClass::signed_type + 2u,
int32 = TypeClass::signed_type + 4u,
int64 = TypeClass::signed_type + 8u,
uint8 = TypeClass::unsigned_type + 1u,
uint16 = TypeClass::unsigned_type + 2u,
uint32 = TypeClass::unsigned_type + 4u,
uint64 = TypeClass::unsigned_type + 8u,
uint8 = TypeClass::unsigned_type + 1u,
uint16 = TypeClass::unsigned_type + 2u,
uint32 = TypeClass::unsigned_type + 4u,
uint64 = TypeClass::unsigned_type + 8u,
float16 = TypeClass::float_type + 2u,
float32 = TypeClass::float_type + 4u,
float64 = TypeClass::float_type + 8u
float16 = TypeClass::float_type + 2u,
float32 = TypeClass::float_type + 4u,
float64 = TypeClass::float_type + 8u,
packed8  = TypeClass::packed_type + 1u, // special type for FBGEMM, not meant to be used anywhere else, not meant to be accessed individually. The internal actual type (uint8) is meaningless.
packed16 = TypeClass::packed_type + 2u  // special type for FBGEMM, not meant to be used anywhere else, not meant to be accessed individually. The internal actual type (uint16) is meaningless.
};
static inline size_t operator&(TypeClass typeClass, Type type) {
@ -241,23 +259,33 @@ static inline bool isFloat(Type type) {
return (TypeClass::float_type & type) != 0;
}
static inline bool isPacked(Type type) {
return (TypeClass::packed_type & type) != 0;
}
size_t requiredBytes(const Shape& shape, Type type); // towards Frank's vision of joint Shape/Type
template <typename T>
inline bool matchType(Type type);
// clang-format off
template <> inline bool matchType<int8_t>(Type type) { return type == Type::int8; }
template <> inline bool matchType<int16_t>(Type type) { return type == Type::int16; }
template <> inline bool matchType<int32_t>(Type type) { return type == Type::int32; }
template <> inline bool matchType<int64_t>(Type type) { return type == Type::int64; }
template <> inline bool matchType<int8_t>(Type type) { return type == Type::int8; }
template <> inline bool matchType<int16_t>(Type type) { return type == Type::int16; }
template <> inline bool matchType<int32_t>(Type type) { return type == Type::int32; }
template <> inline bool matchType<int64_t>(Type type) { return type == Type::int64; }
template <> inline bool matchType<uint8_t>(Type type) { return type == Type::uint8; }
template <> inline bool matchType<uint16_t>(Type type) { return type == Type::uint16; }
template <> inline bool matchType<uint32_t>(Type type) { return type == Type::uint32; }
template <> inline bool matchType<uint64_t>(Type type) { return type == Type::uint64; }
// In case of packed type, it uses uint8 as underlying memory type
template <> inline bool matchType<uint8_t>(Type type) { return type == Type::uint8; }
template <> inline bool matchType<uint16_t>(Type type) { return type == Type::uint16; }
template <> inline bool matchType<uint32_t>(Type type) { return type == Type::uint32; }
template <> inline bool matchType<uint64_t>(Type type) { return type == Type::uint64; }
template <> inline bool matchType<float16>(Type type) { return type == Type::float16; }
template <> inline bool matchType<float>(Type type) { return type == Type::float32; }
template <> inline bool matchType<double>(Type type) { return type == Type::float64; }
template <> inline bool matchType<float16>(Type type) { return type == Type::float16; }
template <> inline bool matchType<float>(Type type) { return type == Type::float32; }
template <> inline bool matchType<double>(Type type) { return type == Type::float64; }
template <> inline bool matchType<packed8>(Type type) { return type == Type::packed8; }
template <> inline bool matchType<packed16>(Type type) { return type == Type::packed16; }
// clang-format on
static inline std::ostream& operator<<(std::ostream& out, Type type) {
@ -275,6 +303,9 @@ static inline std::ostream& operator<<(std::ostream& out, Type type) {
case Type::float16 : out << "float16"; break;
case Type::float32 : out << "float32"; break;
case Type::float64 : out << "float64"; break;
case Type::packed8 : out << "packed8"; break;
case Type::packed16: out << "packed16"; break;
}
return out;
}
@ -296,6 +327,9 @@ template <> inline std::string request<uint64_t>() { return "uint64"; }
template <> inline std::string request<float16>() { return "float16"; }
template <> inline std::string request<float>() { return "float32"; }
template <> inline std::string request<double>() { return "float64"; }
template <> inline std::string request<packed8>() { return "packed8"; }
template <> inline std::string request<packed16>() { return "packed16"; }
// clang-format on
static Type inline typeFromString(const std::string& str) {
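Separate from the diff, an illustrative sketch of the type encoding above (the sizeOf() behaviour for packed types is our assumption based on size_mask):

```cpp
#include "common/types.h"

using namespace marian;

void typeClassExample() {
  // a Type value is a TypeClass bit plus the element size in bytes
  static_assert((size_t)Type::packed16 == 0x800 + 2u, "packed16 = packed_type + 2");

  bool p = isPacked(Type::packed16);  // true  - the packed_type bit is set
  bool f = isFloat(Type::packed16);   // false - the float_type bit is not set
  // sizeOf() presumably masks out the low size bits; for packed types that is only the
  // size of the placeholder element (uint16), never the size of the packed buffer -
  // use requiredBytes(shape, type) for that.
  size_t s = sizeOf(Type::packed16);  // 2
  (void)p; (void)f; (void)s;
}
```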


@ -24,7 +24,6 @@ private:
// the autotuner runs each algorithm at least this 'collectStatMax' number of times and
// collects the statistics.
const size_t collectStatMax = 50;
UPtr<timer::CPUTimer> timer_;
// This structure holds a hash key an algorithm function (e.g. int16, packed gemm, mkl gemm)


@ -4,6 +4,7 @@
#include <memory>
#include <vector>
#include <list>
namespace marian {


@ -1,8 +1,8 @@
#include "graph/expression_graph.h"
#include <sstream>
#include "tensors/tensor_operators.h"
#include <sstream>
namespace marian {
ExpressionGraph::ExpressionGraph(bool inference)
@ -70,7 +70,6 @@ void createSubtape(Expr node) {
node->setSubtape(subtape);
}
void ExpressionGraph::forwardNext() {
// @TODO: check if allocation works properly
tensors_->clearShorttermMemory();


@ -130,11 +130,12 @@ class ExpressionGraph : public std::enable_shared_from_this<ExpressionGraph> {
bool inferenceOnly_{false};
bool optimized_{false}; // during inference, use optimizations that might lead to precision loss, e.g. 8-bit MatMul.
// during inference, use optimizations that might lead to precision loss, e.g. 8-bit MatMul.
// At this moment, this is used for the int16 quantized MatMul - 11/1/2019
bool optimized_{false};
bool checkpointing_{false}; // use gradient checkpointing if true
bool reloaded_{false};
std::string namespace_;
bool throwNaN_{false};
@ -152,6 +153,8 @@ protected:
ElementTypeParamsMap paramsByElementType_;
Ptr<Backend> backend_;
std::string namespace_;
public:
/** @brief Constructs a new expression graph
*
@ -260,32 +263,64 @@ public:
dot.close();
}
private:
// Find the named parameter and its typed parent parameter object (params) and return both.
// If the parameter is not found, return the parent parameter object that the parameter should be added to.
// Return [nullptr, nullptr] if no matching parent parameter object exists.
std::tuple<Expr, Ptr<Parameters>> findParams(const std::string& name,
Type elementType,
bool typeSpecified) const {
Expr p; Ptr<Parameters> params;
if(typeSpecified) { // type has been specified, so we are only allowed to look for a parameter with that type
auto it = paramsByElementType_.find(elementType);
if(it != paramsByElementType_.end()) {
params = it->second;
p = params->get(name);
}
} else { // type has not been specified, so we take any type as long as the name matches
for(auto kvParams : paramsByElementType_) {
p = kvParams.second->get(name);
if(p) { // p has been found, return with matching params object
params = kvParams.second;
break;
}
if(kvParams.first == elementType) // even if p has not been found, set the params object to be returned
params = kvParams.second;
}
}
return std::make_tuple(p, params);
}
Expr param(const std::string& pname,
const Shape& shape,
const Ptr<inits::NodeInitializer>& init,
const Type valueType,
bool fixed = false) {
const Type elementType,
bool fixed,
bool typeSpecified) {
std::string name = pname;
if(!namespace_.empty())
name = namespace_ + "::" + name;
// check first if parameter already exists
auto it = paramsByElementType_.find(valueType);
Ptr<Parameters> params = it != paramsByElementType_.end() ? it->second : nullptr;
Expr p; Ptr<Parameters> params; std::tie
(p, params) = findParams(name, elementType, typeSpecified);
if(!params) {
params = New<Parameters>(valueType);
params = New<Parameters>(elementType);
params->init(backend_);
paramsByElementType_.insert({valueType, params});
paramsByElementType_.insert({elementType, params});
} else {
Expr p = params->get(name);
if(p) {
// if yes add to tape and return
ABORT_IF(shape != p->shape(),
"Requested shape {} for existing parameter '{}' does not match "
"original shape {}",
shape,
name,
p->shape());
"Requested shape {} for existing parameter '{}' does not match "
"original shape {}",
shape,
name,
p->shape());
p->setTrainable(!fixed);
add(p);
@ -295,16 +330,16 @@ public:
// if graph was reloaded do not allow creation of new parameters
ABORT_IF(reloaded_,
"Graph was reloaded and parameter '{}' is newly created",
name);
"Graph was reloaded and parameter '{}' with type {} (specified: {}) is newly created",
name, elementType, typeSpecified);
// if not check if name is not taken by other node
auto other = get(name);
ABORT_IF(other, "Parameter with name '{}' already exists and has type {}", name, other->value_type());
// create parameter node (adds to tape)
Expr p = Expression<ParamNode>(shared_from_this(), shape, init, valueType, fixed);
LOG(debug, "Created parameter {} with shape {} and type {}", pname, shape, valueType);
p = Expression<ParamNode>(shared_from_this(), shape, init, elementType, fixed);
LOG(debug, "Created parameter {} with shape {} and type {}", name, shape, elementType);
// set name and id and add to list of parameters
p->set_name(name);
@ -313,17 +348,28 @@ public:
return p;
}
public:
Expr param(const std::string& pname,
const Shape& shape,
const Ptr<inits::NodeInitializer>& init,
const Type elementType,
bool fixed = false) {
// this overload is called with an explicitly specified element type, so the parameter lookup is restricted to that type
return param(pname, shape, init, elementType, fixed, /*typeSpecified=*/true);
}
Expr param(const std::string& pname,
const Shape& shape,
const Ptr<inits::NodeInitializer>& init,
bool fixed = false) {
return param(pname, shape, init, defaultElementType_, fixed);
// since this param overload is called without a specified type, we assume defaultElementType but allow the lookup to match a different type
return param(pname, shape, init, defaultElementType_, fixed, /*typeSpecified=*/false);
}
Expr constant(const Shape& shape,
const Ptr<inits::NodeInitializer>& init,
Type valueType) {
return Expression<ConstantNode>(shared_from_this(), shape, init, valueType);
Type elementType) {
return Expression<ConstantNode>(shared_from_this(), shape, init, elementType);
}
Expr constant(const Shape& shape,
@ -351,32 +397,38 @@ public:
Type::uint32);
}
Expr ones(const Shape& shape, Type valueType) {
return constant(shape, inits::ones(), valueType);
Expr ones(const Shape& shape, Type elementType) {
return constant(shape, inits::ones(), elementType);
}
Expr ones(const Shape& shape) {
return constant(shape, inits::ones(), defaultElementType_);
}
Expr zeros(const Shape& shape, Type valueType) {
return constant(shape, inits::zeros(), valueType);
Expr zeros(const Shape& shape, Type elementType) {
return constant(shape, inits::zeros(), elementType);
}
Expr zeros(const Shape& shape) {
return constant(shape, inits::zeros(), defaultElementType_);
}
// prob = dropProb, e.g. 0.1 means 90% of values are kept
Expr dropoutMask(float dropProb, const Shape& shape, Type valueType);
Expr dropoutMask(float dropProb, const Shape& shape, Type elementType);
Expr dropoutMask(float dropProb, const Shape& shape);
Expr get(std::string name) {
if(!namespace_.empty())
name = namespace_ + "::" + name;
Expr p; Ptr<Parameters> params; std::tie
(p, params) = findParams(name, defaultElementType_, /*typeSpecified=*/false);
return p;
}
for(auto kvParams : paramsByElementType_)
return kvParams.second->get(name);
return Expr();
Expr get(std::string name, Type specifiedElementType) {
if(!namespace_.empty())
name = namespace_ + "::" + name;
Expr p; Ptr<Parameters> params; std::tie
(p, params) = findParams(name, specifiedElementType, /*typeSpecified=*/true);
return p;
}
Ptr<Parameters>& params() {
@ -445,7 +497,7 @@ public:
// skip over special parameters starting with "special:"
if(pName.substr(0, 8) == "special:")
continue;
param(pName, item.shape, inits::fromItem(item));
param(pName, item.shape, inits::fromItem(item), item.type, /*fixed=*/false);
}
if(markReloaded)
setReloaded(true);
@ -488,15 +540,11 @@ public:
void save(std::vector<io::Item>& ioItems, Type saveElementType = Type::float32);
void save(const std::string& name, const std::string& meta = "", Type saveElementType = Type::float32) {
// LOG(info, "Saving model to {}", name);
std::vector<io::Item> ioItems;
save(ioItems, saveElementType);
if(!meta.empty())
io::addMetaToItems(meta, "special:model.yml", ioItems);
io::saveItems(name, ioItems);
// LOG(info, "Saved {} items.", ioItems.size());
}
};
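To illustrate the per-element-type parameter sets and the findParams() lookup above, a minimal hedged sketch (names, shapes, initializers and the node_initializers header path are made up for the example):

```cpp
#include "graph/expression_graph.h"
#include "graph/node_initializers.h"  // assumed location of the inits:: helpers

using namespace marian;

void parameterSetExample() {
  auto graph = New<ExpressionGraph>(/*inference=*/true);
  graph->setDevice(CPU0);

  // each element type gets its own Parameters object under the hood
  auto w32 = graph->param("enc_W32", {512, 512}, inits::zeros(), Type::float32);
  auto w16 = graph->param("enc_W16", {512, 512}, inits::zeros(), Type::float16);

  // an untyped lookup scans all parameter sets; a typed lookup only the matching one
  auto any   = graph->get("enc_W32");                 // found via the float32 set
  auto typed = graph->get("enc_W16", Type::float16);  // restricted to the float16 set
  auto miss  = graph->get("enc_W32", Type::float16);  // nullptr: wrong type specified
  (void)w32; (void)w16; (void)any; (void)typed; (void)miss;
}
```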


@ -0,0 +1,103 @@
#pragma once
#include "graph/expression_graph.h"
#include "tensors/cpu/sharp/packed_gemm.h"
namespace marian {
// When FBGEMM-based packed GEMM is used, some weight matrices need to be packed offline.
// The decision which weights can be packed should be made by walking the graph.
// That requires more changes, so for now we decide simply by the name ("_W") of the weights.
// This also makes the low-level packed_gemm.h API interact with the high-level graph class,
// so we put this immature code into a subclass of ExpressionGraph and will improve it in the near future.
class ExpressionGraphPackable : public ExpressionGraph {
public:
ExpressionGraphPackable()
: ExpressionGraph( /* inference = */ true) {} // Packable expression graph only supports inference
virtual ~ExpressionGraphPackable() {}
// Convert model weights into packed format and save to IO items.
// @TODO: review this
void packAndSave(const std::string& name, const std::string& meta, std::string& saveGemmType, Type saveElementType = Type::float32) {
std::vector<io::Item> ioItems;
// sorted by name in std::map
for (auto p : params()->getMap()) {
std::string pName = p.first;
if (!namespace_.empty()) {
if (pName.substr(0, namespace_.size() + 2) == namespace_ + "::")
pName = pName.substr(namespace_.size() + 2);
}
Tensor val = p.second->val();
// save in packed format
// @TODO: hardcoded way to find packable weights - all the weights used for the affine op
if (saveGemmType == "fp16packed" && pName.find("_W") == pName.length() - 3) {
using namespace marian::cpu::variant;
// packing information
int nrow, ncol, kernel_ncol_blocks, brow, bcol, last_brow, nbrow, nbcol;
uint64_t packsize;
PackInfoFp32(val->shape(),
false,
nrow,
ncol,
kernel_ncol_blocks,
brow,
bcol,
last_brow,
nbrow,
nbcol,
packsize);
auto allocator = New<TensorAllocator>(getBackend());
Tensor packedTensor;
allocator->allocate(packedTensor, { 1, (int32_t)packsize }, Type::uint8);
// PackFp32
PackFp32(packedTensor,
val->data(),
false,
nrow,
ncol,
kernel_ncol_blocks,
brow,
bcol,
last_brow,
nbrow,
nbcol,
packsize);
io::Item item;
item.name = pName;
item.shape = val->shape();
item.type = Type::packed16;
// Use the actual memory as this will be aligned and padded.
// When memory mapping this is required. Shape keeps track of
// tensor size. Saving to *.npz will cut to size.
auto mem = packedTensor->memory();
item.bytes.resize(mem->size());
copy(backend_, mem->data<char>(), mem->data<char>() + mem->size(), item.bytes.data());
ioItems.emplace_back(std::move(item));
} else {
io::Item item;
val->get(item, pName);
item.convert(saveElementType);
ioItems.emplace_back(std::move(item));
}
}
if (!meta.empty())
io::addMetaToItems(meta, "special:model.yml", ioItems);
io::saveItems(name, ioItems);
}
};
} // namespace marian
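For a concrete sense of what packAndSave() allocates, a hedged worked example of the packing geometry reported by PackInfoFp32 (FBGEMM builds only; the numbers follow the formula in packed_gemm.cpp further down):

```cpp
#include <cstdint>
#include "common/shape.h"
#include "tensors/cpu/sharp/packed_gemm.h"

void packSizeExample() {
  using namespace marian;
  int nrow, ncol, kernel_ncol_blocks, brow, bcol, last_brow, nbrow, nbcol;
  uint64_t packsize;
  cpu::variant::PackInfoFp32(Shape({256, 1024}), /*transpose=*/false,
                             nrow, ncol, kernel_ncol_blocks, brow, bcol,
                             last_brow, nbrow, nbcol, packsize);
  // nrow=256, ncol=1024, brow=512, bcol=16, last_brow=256, nbrow=1, nbcol=64
  // packsize = (1 * 512) * (64 * 16) * sizeof(float16) + 1024 + 256 = 1,049,856 bytes:
  // the fp16 buffer rounded up to whole blocks, plus FBGEMM's padding and scratch space.
}
```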


@ -403,8 +403,7 @@ Expr dot(Expr a, Expr b, bool transA, bool transB, float scale) {
// Currently only true when command line options
// --optimize --cpu-thread=N with N > 0 are set.
if(device == DeviceType::cpu && a->graph()->getBackend()->isOptimized()
&& a->graph()->getBackend()->getGemmType() == GemmType::IntrinInt16) {
if(device == DeviceType::cpu && a->graph()->getBackend()->isOptimized()) {
// dotInt16 computes A * B.T, hence the transpose for B to get A * B
// if transA = false and transB = false.
@ -422,198 +421,73 @@ Expr bdot(Expr a, Expr b, bool transA, bool transB, float scale) {
return Expression<DotBatchedNodeOp>(a, b, transA, transB, scale);
}
static Expr affineDefault(Expr a, Expr b, Expr bias, bool transA, bool transB, float scale) {
// general version, MKL, CBlas or CUDA
// if clipValue > 0, the inputs will be clipped to range [-clipValue,
// clipValue] This is meant to keep values at the same range as used during
// training when optimizing for 8-bit integer products. Likely to be removed
// in the future when we explore better ways to handle this.
float clipValue = a->graph()->getBackend()->getClip();
int rows = a->shape().elements() / a->shape()[-1];
Expr ones = a->graph()->ones({ rows, 1 });
std::vector<Expr> nodes
= { clip(a, clipValue), clip(b, clipValue), bias, ones };
return Expression<AffineNodeOp>(nodes, transA, transB, scale);
}
// This operation used to implement auto-tuning. We have removed it for now due to complexity, but plan to revisit it in the future.
// The last branch with auto-tuner is:
// youki/packed-model-pr-backup1031
// https://machinetranslation.visualstudio.com/Marian/_git/marian-dev?version=GByouki%2Fpacked-model-pr-backup1031
// SHA: 3456a7ed1d1608cfad74cd2c414e7e8fe141aa52
Expr affine(Expr a, Expr b, Expr bias, bool transA, bool transB, float scale) {
auto device = a->graph()->getDeviceId().type;
float clipValue = a->graph()->getBackend()->getClip();
Type aElementType = a->value_type();
Type bElementType = b->value_type();
if(device == DeviceType::cpu && a->graph()->getBackend()->isOptimized()) {
GemmType gemmType = a->graph()->getBackend()->getGemmType();
// When gemmType is set to 'auto', an autotuner decides the best algorithm available.
// A new autotuner is created, then different kinds of algorithms are added to the autotuner.
// For each GEMM size, there is a unique hash key.
// (e.g. m, n, k, transpose A, transpose B, bias size for GEMM)
if(gemmType == GemmType::Auto) {
thread_local Ptr<AutoTuner<Expr>> tuner = New<AutoTuner<Expr>>();
// start with new set of algorithms
tuner->clear();
// lower precicion for shapes, reduces data sparsity
auto sh = [](Shape sh) {
for(size_t i = 0; i < sh.size(); ++i)
sh.set(i, sh[i] / 4);
return sh;
};
// create context for current call as hash
std::size_t hash = sh(a->shape()).hash();
util::hash_combine(hash, sh(b->shape()).hash());
util::hash_combine(hash, sh(bias->shape()).hash());
util::hash_combine(hash, transA);
util::hash_combine(hash, transB);
#if USE_FBGEMM
// Use Packed GEMM only if the node b in the graph is memoized.
// More specifically, packed GEMM is used only if the B matrix (weight) is constant.
// In general, 'memoized' means that the node is a constant variable or
// a combination of contant nodes which is also a constant variable
// when it's computed once.
// Those memoized nodes are cached to avoid duplicated computations.
// 07/10/2019 - Use packed GEMM only if the cpu architecture supports AVX2
// one of the fbgemm's sub modules, cpuinfo (https://github.com/pytorch/cpuinfo).
// It looks at the cpu register
// (https://github.com/pytorch/cpuinfo/blob/master/src/x86/isa.c#L391),
// and this cpu lookup is executed only once and the state is kept in FBGEMM.
if(fbgemm::fbgemmHasAvx2Support() && b->memoize()) {
// add packed GEMM algorithm variant (Packed GEMM) to the autotuner
// Once an algorithm is added to the autotuner,
// autotuner runs all the added algorithms for a designated times.
// One algorithm is run per one this operation call
// and the stat for that algorithm is collected.
// When all the algorithms reach the maximum stat collection count,
// the autotuner decide the best algorithm, and keep using it afterward.
size_t hashPack = hash;
util::hash_combine(hashPack, 1);
auto recPack = [=](Expr e, bool stop = false) {
e->record(tuner, hashPack, stop);
return e;
};
auto algPack = [=]() {
auto packed = cpu::variant::pack(b, cpu::variant::PackMatrix::B, transB, clipValue);
return recPack(
cpu::variant::affine(
clip(a, clipValue),
packed,
b->shape(),
bias,
transA,
transB,
scale),
true);
};
tuner->insert({hashPack, algPack});
}
#endif // USE_FBGEMM
// add second algorithm variant (Int16) to the autotuner
size_t hashInt16 = hash;
util::hash_combine(hashInt16, 2);
auto recInt16 = [=](Expr e, bool stop = false) {
e->record(tuner, hashInt16, stop);
return e;
};
auto algInt16 = [=]() {
return recInt16(
cpu::int16::affine(
recInt16(
cpu::int16::quantize(
transA ? recInt16(transpose(a)) : a,
clipValue)),
cpu::int16::quantize(
transB ? b : transpose(b),
clipValue),
bias,
scale),
true);
};
tuner->insert({hashInt16, algInt16});
// add third algorithm variant (CBlas) to the autotuner
size_t hashCblas = hash;
util::hash_combine(hashCblas, 3);
auto recCblas = [=](Expr e, bool stop = false) {
e->record(tuner, hashCblas, stop);
return e;
};
auto algCblas = [=]() {
auto ac = clip(a, clipValue);
if(ac != a)
ac = recCblas(ac);
auto bc = clip(b, clipValue);
if(bc != b)
bc = recCblas(bc);
int rows = ac->shape().elements() / ac->shape()[-1];
Expr ones = ac->graph()->ones({rows, 1}, bias->value_type());
std::vector<Expr> nodes = {ac, bc, bias, ones};
return recCblas(Expression<AffineNodeOp>(nodes, transA, transB, scale),
true);
};
tuner->insert({hashCblas, algCblas});
// execute algorithm with autotuning
return tuner->run();
} else {
if(gemmType == GemmType::IntrinInt16) {
if(device == DeviceType::cpu) {
if(isFloat(aElementType) && isFloat(bElementType)) {
if(a->graph()->getBackend()->isOptimized()) {
// cpu int16 version
return cpu::int16::affine(
cpu::int16::quantize(transA ? transpose(a) : a, clipValue),
cpu::int16::quantize(transB ? b : transpose(b), clipValue),
bias,
scale);
} else if(gemmType == GemmType::FbFp16Packed) {
#if USE_FBGEMM
// 07/10/2019 - Use packed GEMM only if the cpu architecture supports AVX2
// one of the fbgemm's sub modules, cpuinfo (https://github.com/pytorch/cpuinfo).
// It looks at the cpu register
// (https://github.com/pytorch/cpuinfo/blob/master/src/x86/isa.c#L391),
// and this cpu lookup is executed only once and the state is kept in FBGEMM.
if(fbgemm::fbgemmHasAvx2Support() && b->memoize()) {
auto packed = cpu::variant::pack(b, cpu::variant::PackMatrix::B, transB, clipValue);
return cpu::variant::affine(
clip(a, clipValue),
packed,
b->shape(),
bias,
transA,
transB,
scale);
} else {
int rows = a->shape().elements() / a->shape()[-1];
Expr ones = a->graph()->ones({rows, 1}, bias->value_type());
std::vector<Expr> nodes = {clip(a, clipValue), clip(b, clipValue), bias, ones};
return Expression<AffineNodeOp>(nodes, transA, transB, scale);
}
#else
ABORT("Packed GEMM is not available in this build");
#endif // USE_FBGEMM
} else if(gemmType == GemmType::MklFp32) {
// general version, MKL, CBlas or CUDA
// if clipValue > 0, the inputs will be clipped to range [-clipValue,
// clipValue] This is meant to keep values at the same range as used during
// training when optimizing for 8-bit integer products. Likely to be removed
// in the future when we explore better ways to handle this.
int rows = a->shape().elements() / a->shape()[-1];
Expr ones = a->graph()->ones({rows, 1}, bias->value_type());
std::vector<Expr> nodes
= {clip(a, clipValue), clip(b, clipValue), bias, ones};
return Expression<AffineNodeOp>(nodes, transA, transB, scale);
cpu::int16::quantize(transA ? transpose(a) : a, clipValue),
cpu::int16::quantize(transB ? b : transpose(b), clipValue),
bias,
scale);
} else {
ABORT("GemmType..{} not available by affine()", gemmType);
return affineDefault(a, b, bias, transA, transB, scale);
}
} else if(isFloat(aElementType) && isPacked(bElementType)) {
#if USE_FBGEMM
// 07/10/2019 - Use packed GEMM only if the cpu architecture supports AVX2.
// The check is done by one of fbgemm's submodules, cpuinfo (https://github.com/pytorch/cpuinfo).
// It looks at the cpu registers
// (https://github.com/pytorch/cpuinfo/blob/master/src/x86/isa.c#L391),
// and this cpu lookup is executed only once; the state is kept in FBGEMM.
if(fbgemm::fbgemmHasAvx2Support()) {
return cpu::variant::affine(clip(a, clipValue),
b,
b->shape(),
bias,
transA,
transB,
scale);
} else {
ABORT("No on-the-fly packing at the moment");
}
#else
ABORT("Packed GEMM is not available in this build");
#endif // USE_FBGEMM
} else {
ABORT("Combination of types A: {} B: {} not supported", aElementType, bElementType);
}
} else {
// general version, MKL, CBlas or CUDA
// if clipValue > 0, the inputs will be clipped to range [-clipValue,
// clipValue] This is meant to keep values at the same range as used during
// training when optimizing for 8-bit integer products. Likely to be removed
// in the future when we explore better ways to handle this.
int rows = a->shape().elements() / a->shape()[-1];
Expr ones = a->graph()->ones({rows, 1});
std::vector<Expr> nodes
= {clip(a, clipValue), clip(b, clipValue), bias, ones};
return Expression<AffineNodeOp>(nodes, transA, transB, scale);
// Default GEMM
return affineDefault(a, b, bias, transA, transB, scale);
}
}


@ -85,7 +85,7 @@ public:
virtual void setId(size_t id) override { id_ = id; }
virtual size_t getId() override { return id_; }
virtual void increaseEdges(size_t edges = 1) { edges_ += edges; };
virtual void decreaseEdges(size_t edges = 1) { edges_ -= edges; };
virtual size_t edges() { return edges_; };


@ -39,7 +39,7 @@ class LambdaInitConvert : public NodeInitializer {
auto sharedAllocator = allocator_.lock();
ABORT_IF(!sharedAllocator, "Allocator in LambdaInitConvert has not been set or expired");
auto memory = sharedAllocator->alloc(tensor->size(), intermediateType_);
auto memory = sharedAllocator->alloc(requiredBytes(tensor->shape(), intermediateType_));
auto temp = TensorBase::New(memory,
tensor->shape(),
intermediateType_,
@ -183,11 +183,11 @@ Ptr<NodeInitializer> fromItem(const io::Item& item) {
"Tensor type ({}) and type for mapping ({}) do not match",
tensor->type(),
item.type);
ABORT_IF(tensor->size() != item.size() / sizeOf(item.type),
"Tensor size ({}) and mapped size ({}) do not match",
tensor->size(),
item.size() / sizeOf(item.type));
auto mp = MemoryPiece::New((uint8_t*)item.ptr, tensor->size() * sizeOf(item.type));
ABORT_IF(tensor->shape() != item.shape,
"Tensor shape ({}) and shape of mapped item ({}) do not match",
tensor->shape(),
item.shape);
auto mp = MemoryPiece::New((uint8_t*)item.ptr, item.size()); // @TODO: this is not properly aligned now
tensor->reset(mp);
});
} else {


@ -40,6 +40,10 @@ public:
LOG(debug, "Created parameter object of type {}", acceptedElementType_);
}
~Parameters() {
LOG(debug, "Destroyed parameter object of type {}", acceptedElementType_);
}
auto begin() -> decltype(params_.begin()) { return params_.begin(); }
auto end() -> decltype(params_.begin()) { return params_.end(); }
@ -58,6 +62,8 @@ public:
size_t size() { return params_.size(); }
void add(Expr p, const std::string& name) {
LOG(debug, "Adding parameter {} to parameter object of type {}", name, acceptedElementType_);
ABORT_IF(named_.count(name), "Parameter '{}' already exists", name);
ABORT_IF(p->value_type() != acceptedElementType_,
"Requested parameter type ({}) is different from chosen parameter type ({})",


@ -281,7 +281,7 @@ namespace marian {
factorB = slice(b_, -1, Slice((int)range.first, (int)range.second));
}
// @TODO: b_ should be a vector, not a matrix; but shotlists use cols() in, which requires a matrix
auto factorLogits = affine(input1, factorWt, factorB, false, /*transB=*/isLegacyUntransposedW ? false : true); // [B... x U] factor logits
auto factorLogits = affine(input1, factorWt, factorB, false, /*transB=*/isLegacyUntransposedW ? false : true, /*scale=*/1.0f); // [B... x U] factor logits
// optionally add lemma-dependent bias
if (Plemma) { // [B... x U0]
int lemmaVocabDim = Plemma->shape()[-1];


@ -10,6 +10,11 @@
#include "translator/scorers.h"
#include "data/alignment.h"
#include "data/vocab_base.h"
#include "graph/expression_graph_packable.h"
#if USE_FBGEMM
#include "fbgemm/Utils.h"
#endif
namespace marian {
@ -69,10 +74,6 @@ public:
device_ = New<cpu::WrappedDevice>(deviceId);
graph_->setDevice(deviceId, device_);
// Use packed GEMM for the production
graph_->getBackend()->setOptimized(true);
graph_->getBackend()->setGemmType("fp16packed");
#if MKL_FOUND
mkl_set_num_threads(options->get<int>("mkl-threads", 1));
#endif
@ -211,5 +212,64 @@ std::vector<Ptr<IVocabWrapper>> loadVocabs(const std::vector<std::string>& vocab
return res;
}
// query CPU AVX version
DecoderCpuAvxVersion getCpuAvxVersion() {
#if USE_FBGEMM
// Default value is AVX
DecoderCpuAvxVersion cpuAvxVer = DecoderCpuAvxVersion::AVX;
if (fbgemm::fbgemmHasAvx512Support())
cpuAvxVer = DecoderCpuAvxVersion::AVX512;
else if (fbgemm::fbgemmHasAvx2Support())
cpuAvxVer = DecoderCpuAvxVersion::AVX2;
return cpuAvxVer;
#else
// Default value is AVX
return DecoderCpuAvxVersion::AVX;
#endif
}
DecoderCpuAvxVersion parseCpuAvxVersion(std::string name) {
if (name == "avx") {
return DecoderCpuAvxVersion::AVX;
} else if (name == "avx2") {
return DecoderCpuAvxVersion::AVX2;
} else if (name == "avx512") {
return DecoderCpuAvxVersion::AVX512;
} else {
ABORT("Unknown CPU Instruction Set: {}", name);
return DecoderCpuAvxVersion::AVX;
}
}
// @TODO: clean up this code and unify with marian-conv. The targetPrec parameter is not clear enough, etc.
bool convertModel(std::string inputFile, std::string outputFile, int32_t targetPrec) {
std::cout << "Converting from: " << inputFile << ", to: " << outputFile << std::endl;
YAML::Node config;
std::stringstream configStr;
marian::io::getYamlFromModel(config, "special:model.yml", inputFile);
configStr << config;
auto graph = New<ExpressionGraphPackable>();
graph->setDevice(CPU0);
graph->getBackend()->setOptimized(false);
graph->load(inputFile);
graph->forward();
std::string saveGemmType = "fp32default";
if (targetPrec == 16)
saveGemmType = "fp16packed";
else if (targetPrec == 8)
saveGemmType = "int8packed";
// added a flag indicating whether the weights need to be packed or not
graph->packAndSave(outputFile, configStr.str(), saveGemmType); // @TODO: this should just be type-based
std::cout << "Conversion Finished." << std::endl;
return true;
}
} // namespace quicksand
} // namespace marian
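A hedged sketch of how a caller might use the two new quicksand entry points above (the header path is our assumption; error handling omitted):

```cpp
#include <cstdint>
#include "microsoft/quicksand.h"  // assumed location of the quicksand API header

using namespace marian::quicksand;

int main() {
  // choose the packed fp16 path only if the CPU can run FBGEMM's AVX2 kernels
  DecoderCpuAvxVersion ver = getCpuAvxVersion();
  int32_t targetPrec = (ver >= DecoderCpuAvxVersion::AVX2) ? 16 : 32;  // 32 -> "fp32default"
  bool ok = convertModel("model.npz", "model.bin", targetPrec);
  return ok ? 0 : 1;
}
```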


@ -25,6 +25,12 @@ typedef std::tuple<WordIndices, AlignmentSets, float> QSSentenceWithProb;
typedef std::vector<QSSentenceWithProb> QSNBest;
typedef std::vector<QSNBest> QSNBestBatch;
enum class DecoderCpuAvxVersion {
AVX,
AVX2,
AVX512
};
Ptr<Options> newOptions();
template <class T>
@ -64,5 +70,11 @@ Ptr<IBeamSearchDecoder> newDecoder(Ptr<Options> options,
// load src and tgt vocabs
std::vector<Ptr<IVocabWrapper>> loadVocabs(const std::vector<std::string>& vocabPaths);
// query CPU AVX version
DecoderCpuAvxVersion getCpuAvxVersion();
DecoderCpuAvxVersion parseCpuAvxVersion(std::string name);
bool convertModel(std::string inputFile, std::string outputFile, int32_t targetPrec);
} // namespace quicksand
} // namespace marian


@ -77,7 +77,6 @@ public:
graph->getBackend()->setClip(options_->get<float>("clip-gemm"));
if (device.type == DeviceType::cpu) {
graph->getBackend()->setOptimized(options_->get<bool>("optimize"));
graph->getBackend()->setGemmType(options_->get<std::string>("gemm-type"));
}
graph->reserveWorkspaceMB(options_->get<size_t>("workspace"));


@ -193,12 +193,6 @@ public:
return alignedSize(num * sizeof(T));
}
size_t capacity(size_t num, Type type) { return alignedSize(num * sizeOf(type)); }
MemoryPiece::PtrType alloc(size_t num, Type type) {
return alloc(num * sizeOf(type));
}
template <typename T>
MemoryPiece::PtrType alloc(size_t num) {
return alloc(capacity<T>(num));


@ -41,10 +41,6 @@ public:
// for GPU, this is invalid. for gpu, isOptimized() function always returns false.
virtual void setOptimized(bool optimize) = 0;
virtual bool isOptimized() = 0;
// for CPU, selects different GEMM types for the inference.
// for GPU, there's no gemm type. so, it does nothing.
virtual void setGemmType(std::string gemmType) = 0;
virtual GemmType getGemmType() = 0;
};
Ptr<Backend> BackendByDeviceId(DeviceId deviceId, size_t seed);


@ -22,18 +22,6 @@ public:
// for CPU & inference only, sets to use optimized code for inference. Does nothing for GPU.
void setOptimized(bool optimize) override { optimized_ = optimize; }
bool isOptimized() override { return optimized_; }
// for CPU only, selects different GEMM types for the inference. Does nothing for GPU.
void setGemmType(std::string gemmType) override {
if (gemmType == "auto") gemmType_ = GemmType::Auto;
else if (gemmType == "mklfp32") gemmType_ = GemmType::MklFp32;
else if (gemmType == "intrinint16") gemmType_ = GemmType::IntrinInt16;
#if USE_FBGEMM
else if (gemmType == "fp16packed") gemmType_ = GemmType::FbFp16Packed;
else if (gemmType == "int8packed") gemmType_ = GemmType::FbInt8Packed;
#endif // USE_FBGEMM
else ABORT("Unknown GEMM type - '{}'", gemmType);
}
GemmType getGemmType() override { return gemmType_; }
};
} // namespace cpu
} // namespace marian


@ -4,7 +4,6 @@
#include "tensors/cpu/sharp/packed_gemm.h"
#if USE_FBGEMM
#ifdef __GNUC__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-variable"
@ -70,7 +69,7 @@ struct PackNodeOp : public UnaryNodeOp {
NodeOps forwardOps() override {
return {NodeOp(PackFp32(val_,
child(0)->val(),
child(0)->val()->data(),
transpose_,
nrow_,
ncol_,
@ -97,17 +96,17 @@ struct PackNodeOp : public UnaryNodeOp {
// Should be 2D - weight matrix
ABORT_IF(shapeMat.size() != 2,
"Weight Matrix should be 2D");
nrow_ = transpose ? shapeMat[1] : shapeMat[0];
ncol_ = transpose ? shapeMat[0] : shapeMat[1];
kernel_ncol_blocks_ = 2;
brow_ = 512;
bcol_ = 8 * kernel_ncol_blocks_;
last_brow_ = nrow_ % brow_ == 0 ? brow_ : nrow_ % brow_;
nbrow_ = nrow_ % brow_ == 0 ? nrow_ / brow_ : (nrow_ + brow_) / brow_;
nbcol_ = ncol_ % bcol_ == 0 ? ncol_ / bcol_ : (ncol_ + bcol_) / bcol_;
const int padding = 1024; // required by sw pipelined kernels
const int specialMem = 256;
packsize_ = ((nbrow_ * brow_) * (nbcol_ * bcol_)) * sizeof(fbgemm::float16) + padding + specialMem;
PackInfoFp32(shapeMat,
transpose,
nrow_,
ncol_,
kernel_ncol_blocks_,
brow_,
bcol_,
last_brow_,
nbrow_,
nbcol_,
packsize_);
Shape outShape({(int)packsize_});

88
src/tensors/cpu/sharp/packed_gemm.cpp Executable file → Normal file

@ -20,16 +20,13 @@
//#pragma comment(linker, "/ignore:4217") // locally defined symbol ...asmjit... imported
#endif
#ifdef __GNUC__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-variable"
#endif
#include "3rd_party/fbgemm/include/fbgemm/FbgemmFP16.h"
#include "3rd_party/fbgemm/include/fbgemm/QuantUtils.h"
#include "3rd_party/fbgemm/include/fbgemm/Fbgemm.h"
#ifdef __GNUC__
#pragma GCC diagnostic pop
#endif
@ -68,6 +65,14 @@ namespace variant { // Variants of GEMM implementations
// different marian instances should not share this variable.
static thread_local PackedGemmMatrixFP16 packedPlaceholder(1, 1, 1, 1, 1, 1, 1, 1);
// Copied from fbgemm: padding required by some kernels in FBGEMM.
// Verbatim: 'required by sw pipelined kernels'
// https://github.com/marian-nmt/FBGEMM/blob/master/include/fbgemm/FbgemmFP16.h#L109
const int PACK16_PADDING = 1024;
// Memory space to store auxiliary variables for FBGEMM (e.g. block row, block column, kernel_ncol_blocks, etc.)
const int PACK16_SPECIALMEM = 256;
// This is copied from FBGEMM code
// A better way?
// will be removed when the FBGEMM API is changed
@ -106,8 +111,39 @@ inline uint64_t addr(const int r_,
return index;
}
void PackInfoFp32(const marian::Shape& shape,
const bool transpose,
uint64_t& packsize) {
int nrow, ncol, kernel_ncol_blocks, brow = 512, bcol, last_brow, nbrow, nbcol;
PackInfoFp32(shape, transpose, nrow, ncol, kernel_ncol_blocks, brow, bcol, last_brow, nbrow, nbcol, packsize);
}
void PackInfoFp32(const marian::Shape& shape,
const bool transpose,
int& nrow,
int& ncol,
int& kernel_ncol_blocks,
int& brow,
int& bcol,
int& last_brow,
int& nbrow,
int& nbcol,
uint64_t& packsize) {
nrow = transpose ? shape[1] : shape[0];
ncol = transpose ? shape[0] : shape[1];
kernel_ncol_blocks = 2;
brow = 512;
bcol = 8 * kernel_ncol_blocks;
last_brow = nrow % brow == 0 ? brow : nrow % brow;
nbrow = nrow % brow == 0 ? nrow / brow : (nrow + brow) / brow;
nbcol = ncol % bcol == 0 ? ncol / bcol : (ncol + bcol) / bcol;
ABORT_IF(ncol % bcol != 0, "ncol (number of columns) should be multiple of 16. {}", ncol);
packsize = ((nbrow * brow) * (nbcol * bcol)) * sizeof(fbgemm::float16) + PACK16_PADDING
+ PACK16_SPECIALMEM;
}
void PackFp32(marian::Tensor out,
const marian::Tensor in,
const float* inData, // Packing is only available for 2D weight matrix in Marian. Otherwise, it's aborted in expanded_gemm.h.
const bool transpose,
const int nrow,
const int ncol,
@ -141,11 +177,10 @@ void PackFp32(marian::Tensor out,
fbgemm::float16* outmem = (fbgemm::float16*)(outmemorg + 256);
fbgemm::float16* dummy = new fbgemm::float16;
// pack the matrix
float* inmem = in->data();
for(int i = 0; i < nrow; i++) {
for(int j = 0; j < ncol; j++) {
outmem[addr(i, j, brow, bcol, nbrow, nbcol, last_brow)]
= tconv(!transpose ? inmem[i * ncol + j] : inmem[i + nrow * j], *dummy);
= tconv(!transpose ? inData[i * ncol + j] : inData[i + nrow * j], *dummy);
}
}
delete dummy;
@ -186,15 +221,17 @@ void GemmPackFp32(marian::Tensor C,
// packed matrix
packedPlaceholder.pmat_ = (fbgemm::float16*)(B->data<uint8_t>() + 256);
if(bias != nullptr) {
#if MKL_FOUND
for(int i = 0; i < m; ++i) {
mkl_somatcopy('R', 'N', 1, n, 1, bias->data(), n, C->data() + n * i, n);
}
for(int i = 0; i < m; ++i) {
mkl_somatcopy('R', 'N', 1, n, 1, bias->data(), n, C->data() + n * i, n);
}
#else
for(int i = 0; i < m; ++i) {
std::copy(bias->data(), bias->data() + n, C->data() + n * i);
}
for(int i = 0; i < m; ++i) {
std::copy(bias->data(), bias->data() + n, C->data() + n * i);
}
#endif
}
#ifdef _OPENMP
#pragma omp parallel
@ -211,7 +248,7 @@ void GemmPackFp32(marian::Tensor C,
(int)m,
A->data(),
packedPlaceholder,
1,
bias != nullptr ? 1.0 : 0.0,
C->data(),
tid,
num_threads);
@ -221,8 +258,31 @@ void GemmPackFp32(marian::Tensor C,
packedPlaceholder.pmat_ = pmat;
}
#else // USE_FBGEMM
void PackInfoFp32(const marian::Shape& shape,
const bool transpose,
uint64_t& packsize) {
// does nothing. supports only FBGEMM based packed gemm at this moment.
ABORT("FBGEMM is needed to use packed GEMM.");
}
void PackInfoFp32(const marian::Shape& shape,
const bool transpose,
int& nrow,
int& ncol,
int& kernel_ncol_blocks,
int& brow,
int& bcol,
int& last_brow,
int& nbrow,
int& nbcol,
uint64_t& packsize) {
// does nothing. supports only FBGEMM based packed gemm at this moment.
ABORT("FBGEMM is needed to use packed GEMM.");
}
void PackFp32(marian::Tensor out,
const marian::Tensor in,
const float* inData,
const bool transpose,
const int nrow,
const int ncol,


@ -6,9 +6,25 @@ namespace marian {
namespace cpu {
namespace variant { // Variants of GEMM implementations
void PackInfoFp32(const marian::Shape& shape,
const bool transpose,
/*out*/uint64_t& packsize);
void PackInfoFp32(const marian::Shape& shape,
const bool transpose,
int& nrow,
int& ncol,
int& kernel_ncol_blocks,
int& brow,
int& bcol,
int& last_brow,
int& nbrow,
int& nbcol,
/*out*/uint64_t& packsize); // @TODO: change to size_t where appropriate
// Pack a matrix in a cache-utilization-friendly way (block format)
// out: output tensor - packed format
// in: input tensor - normal format
// inData: input tensor data - pointer to the float data
// transpose: the matrix is transposed
// nrow: the number of rows
// ncol: the number of columns
@ -21,7 +37,7 @@ namespace variant { // Variants of GEMM implementations
// packsize: the size of the packed matrix
// (the number of fp16 elements + padding (1024) + extra temporary memory (256))
void PackFp32(marian::Tensor out,
const marian::Tensor in,
const float* inData,
const bool transpose,
const int nrow,
const int ncol,
@ -31,7 +47,7 @@ void PackFp32(marian::Tensor out,
const int last_brow,
const int nbrow,
const int nbcol,
const uint64_t packsize);
const uint64_t packsize); // @TODO: change to size_t where appropriate
// GEMM operation on the packed B matrix
// C: output matrix


@ -59,16 +59,6 @@ public:
return false;
}
// for CPU, selects different GEMM types for the inference.
// for GPU, there's no gemm type. so, it does nothing.
void setGemmType(std::string gemmType) override {
LOG_ONCE(info, "setGemmType() not supported for GPU_{}", gemmType);
}
GemmType getGemmType() override {
LOG_ONCE(info, "getGemmType() not supported for GPU");
return GemmType::Auto;
}
private:
cublasHandle_t cublasHandle_;
cusparseHandle_t cusparseHandle_;


@ -214,12 +214,10 @@ void Concatenate1(Tensor out, const std::vector<Tensor>& inputs) {
int threads = std::min(MAX_THREADS, cols_in);
if(out->type() == Type::float32) {
gInsertCols<false><<<blocks, threads>>>(
out->data<float>(), in->data<float>(), rows, cols_in, cols_out, cols_in, offset, 0);
gInsertCols<false><<<blocks, threads>>>(out->data<float>(), in->data<float>(), rows, cols_in, cols_out, cols_in, offset, 0);
#if COMPILE_FP16
} else if(out->type() == Type::float16) {
gInsertCols<false><<<blocks, threads>>>(
out->data<half>(), in->data<half>(), rows, cols_in, cols_out, cols_in, offset, 0);
gInsertCols<false><<<blocks, threads>>>(out->data<half>(), in->data<half>(), rows, cols_in, cols_out, cols_in, offset, 0);
#endif
} else {
ABORT("Concatenate1 not implemented for type {}", out->type());
@ -2130,11 +2128,11 @@ void LayerNormalizationGrad(Ptr<Allocator> allocator,
int threads = std::min(MAX_THREADS, cols);
int blocks = std::min(MAX_BLOCKS, rows);
auto tempGradGammaMemory = allocator->alloc(adj->memory()->size(), adj->type());
auto tempGradGammaMemory = allocator->alloc(adj->memory()->size());
Tensor tempGradGamma = TensorBase::New(tempGradGammaMemory, adj->shape(), adj->type(), adj->getBackend());
tempGradGamma->set(0.f);
auto tempOnesMemory = allocator->alloc(rows * sizeOf(adj->type()), adj->type());
auto tempOnesMemory = allocator->alloc(rows * sizeOf(adj->type()));
Tensor tempOnes = TensorBase::New(tempOnesMemory, Shape({1, rows}), adj->type(), adj->getBackend());
tempOnes->set(1.f);


@ -1,5 +1,6 @@
#include "tensors/tensor.h"
#include "tensors/tensor_operators.h"
#include "common/io.h"
namespace marian {
@ -120,13 +121,22 @@ void TensorBase::get(io::Item& item, const std::string& name) {
item.shape = shape_;
item.type = type_;
size_t bytesWithoutPadding = shape_.elements() * sizeOf(type_);
item.bytes.resize(bytesWithoutPadding);
item.bytes.resize(memory_->size());
copy(backend_,
memory_->data<char>(),
memory_->data<char>() + bytesWithoutPadding,
memory_->data<char>() + memory_->size(),
item.bytes.data());
}
void TensorBase::set(const io::Item& item) {
ABORT_IF(item.type != type_, "Tensor type {} and item type {} do not match", type_, item.type);
ABORT_IF(item.shape != shape_, "Tensor shape {} and item shape {} do not match", shape_, item.shape);
ABORT_IF(item.bytes.size() > memory_->size(), "Item data size {} too large for memory {}", item.bytes.size(), memory_->size());
copy(backend_,
item.bytes.data(),
item.bytes.data() + item.bytes.size(),
memory_->data<char>());
}
} // namespace marian


@ -3,7 +3,6 @@
#include "common/definitions.h"
#include "common/shape.h"
#include "common/types.h"
#include "common/io.h"
#include "tensors/backend.h"
#include "tensors/memory_piece.h"
#ifdef CUDA_FOUND
@ -18,6 +17,10 @@
namespace marian {
namespace io {
struct Item;
}
class TensorBase {
MemoryPiece::PtrType memory_;
Shape shape_;
@ -150,6 +153,11 @@ public:
template <typename T>
void set(const T* begin, const T* end) {
ABORT_IF(end - begin != shape_.elements(),
"Vector size ({}) and underlying shape ({}, {}) do not match",
end - begin,
std::string(shape_),
memory_->size());
matchOrAbort<T>(type_);
if(backend_->getDeviceId().type == DeviceType::cpu) {
@ -167,36 +175,7 @@ public:
set(v.data(), v.data() + v.size());
}
// a binary copy with type checking
void set(const char* begin, const char* end, Type type) {
ABORT_IF(type_ != type,
"Tensor type ({}) and data type ({}) do not match",
type_,
type);
size_t dataSize = (end - begin) / sizeOf(type);
ABORT_IF(size() != dataSize,
"Tensor size ({}) and mapped size ({}) do not match",
size(),
dataSize);
if(backend_->getDeviceId().type == DeviceType::cpu) {
std::copy(begin, end, data<char>());
}
#ifdef CUDA_FOUND
else {
gpu::copy(backend_, begin, end, data<char>());
}
#endif
}
void set(const std::vector<char>& v, Type type) {
set(v.data(), v.data() + v.size(), type);
}
void set(const io::Item& item) {
set(item.bytes.data(), item.bytes.data() + item.bytes.size(), item.type);
}
void set(const io::Item& item);
// For single values enable conversion to other numeric formats if possible
template <typename T>


@ -70,14 +70,13 @@ public:
void clear() { allocator_->clear(); }
size_t capacity(Shape shape, Type type = Type::float32) {
return allocator_->capacity(shape.elements(), type);
return allocator_->capacity<char>(requiredBytes(shape, type));
}
void allocate(Tensor& t, Shape shape, Type type = Type::float32) {
void allocate(/*out*/ Tensor& t, Shape shape, Type type = Type::float32) {
if(!t || t->shape() != shape) {
int size = shape.elements();
auto mem = allocator_->alloc(size, type);
t = TensorBase::New(mem, shape, type, backend_);
auto mem = allocator_->alloc(requiredBytes(shape, type));
t = Tensor(TensorBase::New(mem, shape, type, backend_));
}
}


@ -41,7 +41,6 @@ int main(int argc, char** argv) {
auto g = New<ExpressionGraph>(true);
g->setDevice({0, DeviceType::cpu});
g->getBackend()->setOptimized(true);
g->getBackend()->setGemmType("auto");
g->reserveWorkspaceMB(2512);
timer::AutoTimer timer;


@ -169,7 +169,7 @@ public:
// to only accept one parameter, and remove this error check can be removed.
ABORT_IF(sendbuf != recvbuf, "FakeMPIWrapper::allReduce() only implemented for in-place operation"); // otherwise it's not a no-op, we must copy data
}
#pragma warning(push)
#pragma warning(pop)
virtual void finalize() override { }
};


@ -65,7 +65,6 @@ public:
graph->getBackend()->setClip(options_->get<float>("clip-gemm"));
if (device.type == DeviceType::cpu) {
graph->getBackend()->setOptimized(options_->get<bool>("optimize"));
graph->getBackend()->setGemmType(options_->get<std::string>("gemm-type"));
}
graph->reserveWorkspaceMB(options_->get<size_t>("workspace"));
graphs_[id] = graph;
@ -187,7 +186,6 @@ public:
graph->getBackend()->setClip(options_->get<float>("clip-gemm"));
if (device.type == DeviceType::cpu) {
graph->getBackend()->setOptimized(options_->get<bool>("optimize"));
graph->getBackend()->setGemmType(options_->get<std::string>("gemm-type"));
}
graph->reserveWorkspaceMB(options_->get<size_t>("workspace"));
graphs_.push_back(graph);


@ -16,6 +16,7 @@
<RootNamespace>Marian</RootNamespace>
<ProjectName>Marian</ProjectName>
<WindowsTargetPlatformVersion>10.0.17763.0</WindowsTargetPlatformVersion>
<CudaToolkitDir Condition="'$(CudaToolkitDir)' == ''">$(CUDA_PATH)</CudaToolkitDir>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
@ -33,7 +34,7 @@
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 10.1.props" />
<Import Project="$(CudaToolkitDir)\extras\visual_studio_integration\MSBuildExtensions\CUDA 10.1.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
@ -42,15 +43,15 @@
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<LinkIncremental>true</LinkIncremental>
<IntDir>$(SolutionDir)$(Platform)\$(Configuration)\Marian\</IntDir>
<IncludePath>..\src\3rd_party\fbgemm\third_party\cpuinfo\deps\clog\include;..\src\3rd_party\fbgemm\third_party\cpuinfo\src;..\src\3rd_party\fbgemm\third_party\cpuinfo\include;..\src\3rd_party\fbgemm\third_party\asmjit\src;%MKL_PATH%\include;..\src\3rd_party\fbgemm\include;%CUDA_PATH%\include;..\src;..\src\3rd_party;%BOOST_INCLUDE_PATH%;%ZLIB_PATH%\include;$(VC_IncludePath);$(WindowsSDK_IncludePath);</IncludePath>
<LibraryPath>%CUDA_PATH%\lib\x64;%BOOST_LIB_PATH%;%ZLIB_PATH%\lib;%MKL_PATH%\lib\intel64;$(VC_LibraryPath_x64);$(WindowsSDK_LibraryPath_x64);$(NETFXKitsDir)Lib\um\x64</LibraryPath>
<IncludePath>$(CudaToolkitIncludeDir);..\src\3rd_party\fbgemm\third_party\cpuinfo\deps\clog\include;..\src\3rd_party\fbgemm\third_party\cpuinfo\src;..\src\3rd_party\fbgemm\third_party\cpuinfo\include;..\src\3rd_party\fbgemm\third_party\asmjit\src;%MKL_PATH%\include;..\src\3rd_party\fbgemm\include;..\src;..\src\3rd_party;%BOOST_INCLUDE_PATH%;%ZLIB_PATH%\include;$(VC_IncludePath);$(WindowsSDK_IncludePath)</IncludePath>
<LibraryPath>$(CudaToolkitLibDir);%BOOST_LIB_PATH%;%ZLIB_PATH%\lib;%MKL_PATH%\lib\intel64;$(VC_LibraryPath_x64);$(WindowsSDK_LibraryPath_x64);$(NETFXKitsDir)Lib\um\x64</LibraryPath>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<LinkIncremental>false</LinkIncremental>
<ExecutablePath>$(ExecutablePath)</ExecutablePath>
<IntDir>$(SolutionDir)$(Platform)\$(Configuration)\Marian\</IntDir>
<IncludePath>..\src\3rd_party\fbgemm\third_party\cpuinfo\deps\clog\include;..\src\3rd_party\fbgemm\third_party\cpuinfo\src;..\src\3rd_party\fbgemm\third_party\cpuinfo\include;..\src\3rd_party\fbgemm\third_party\asmjit\src;%MKL_PATH%\include;..\src\3rd_party\fbgemm\include;%CUDA_PATH%\include;..\src;..\src\3rd_party;%BOOST_INCLUDE_PATH%;%ZLIB_PATH%\include;$(VC_IncludePath);$(WindowsSDK_IncludePath);</IncludePath>
<LibraryPath>%CUDA_PATH%\lib\x64;%BOOST_LIB_PATH%;%ZLIB_PATH%\lib;%MKL_PATH%\lib\intel64;$(VC_LibraryPath_x64);$(WindowsSDK_LibraryPath_x64);$(NETFXKitsDir)Lib\um\x64</LibraryPath>
<IncludePath>$(CudaToolkitIncludeDir);..\src\3rd_party\fbgemm\third_party\cpuinfo\deps\clog\include;..\src\3rd_party\fbgemm\third_party\cpuinfo\src;..\src\3rd_party\fbgemm\third_party\cpuinfo\include;..\src\3rd_party\fbgemm\third_party\asmjit\src;%MKL_PATH%\include;..\src\3rd_party\fbgemm\include;..\src;..\src\3rd_party;%BOOST_INCLUDE_PATH%;%ZLIB_PATH%\include;$(VC_IncludePath);$(WindowsSDK_IncludePath)</IncludePath>
<LibraryPath>$(CudaToolkitLibDir);%BOOST_LIB_PATH%;%ZLIB_PATH%\lib;%MKL_PATH%\lib\intel64;$(VC_LibraryPath_x64);$(WindowsSDK_LibraryPath_x64);$(NETFXKitsDir)Lib\um\x64</LibraryPath>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
@ -72,7 +73,7 @@
<PreprocessorDefinitions>BOOST_CONFIG_SUPPRESS_OUTDATED_MESSAGE; FBGEMM_EXPORTS; USE_FBGEMM=1; USE_SSE2=1; CUDA_FOUND=1; MKL_FOUND=1; MPI_FOUND=1; BLAS_FOUND=1; MKL_ILP64; WIN32;_DEBUG;_CONSOLE;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<SDLCheck>false</SDLCheck>
<TreatWarningAsError>true</TreatWarningAsError>
<AdditionalOptions>/bigobj %(AdditionalOptions) /arch:AVX</AdditionalOptions>
<AdditionalOptions>/bigobj /arch:AVX %(AdditionalOptions)</AdditionalOptions>
<RuntimeLibrary Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">MultiThreadedDebugDLL</RuntimeLibrary>
<DisableSpecificWarnings>4996; 4702</DisableSpecificWarnings>
<MultiProcessorCompilation>true</MultiProcessorCompilation>
@ -109,7 +110,7 @@
<PreprocessorDefinitions>BOOST_CONFIG_SUPPRESS_OUTDATED_MESSAGE; FBGEMM_EXPORTS; USE_FBGEMM=1; USE_SSE2=1; CUDA_FOUND=1; MKL_FOUND=1; MPI_FOUND=1; BLAS_FOUND=1; MKL_ILP64; WIN32;NDEBUG;_CONSOLE;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<SDLCheck>false</SDLCheck>
<FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
<AdditionalOptions>/d2Zi+ /bigobj %(AdditionalOptions) /arch:AVX</AdditionalOptions>
<AdditionalOptions>/d2Zi+ /bigobj /arch:AVX %(AdditionalOptions)</AdditionalOptions>
<TreatWarningAsError>true</TreatWarningAsError>
<RuntimeLibrary Condition="'$(Configuration)|$(Platform)'=='Release|x64'">MultiThreadedDLL</RuntimeLibrary>
<RuntimeLibrary Condition="'$(Configuration)|$(Platform)'=='Release_NoOpt|x64'">MultiThreaded</RuntimeLibrary>
@ -1006,6 +1007,8 @@
<ClCompile Include="..\src\common\filesystem.cpp" />
<ClCompile Include="..\src\common\file_stream.cpp" />
<ClCompile Include="..\src\common\io.cpp" />
<ClCompile Include="..\src\common\options.cpp" />
<ClCompile Include="..\src\common\types.cpp" />
<ClCompile Include="..\src\common\utils.cpp" />
<ClCompile Include="..\src\common\logging.cpp" />
<ClCompile Include="..\src\common\config.cpp" />
@ -1111,6 +1114,7 @@
</ClCompile>
<ClCompile Include="..\src\training\communicator.cpp" />
<ClCompile Include="..\src\training\graph_group_multinode_sync.cpp" />
<ClCompile Include="..\src\training\scheduler.cpp" />
<ClCompile Include="..\src\translator\history.cpp" />
<ClCompile Include="..\src\translator\output_collector.cpp" />
<ClCompile Include="..\src\translator\nth_element.cpp" />
@ -1173,6 +1177,7 @@
<ClInclude Include="..\src\examples\mnist\validator.h" />
<ClInclude Include="..\src\functional\approx.h" />
<ClInclude Include="..\src\functional\operators.h" />
<ClInclude Include="..\src\graph\expression_graph_packable.h" />
<ClInclude Include="..\src\layers\loss.h" />
<ClInclude Include="..\src\layers\weight.h" />
<ClInclude Include="..\src\marian.h" />
@ -1607,6 +1612,6 @@
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 10.1.targets" />
<Import Project="$(CudaToolkitDir)\extras\visual_studio_integration\MSBuildExtensions\CUDA 10.1.targets" />
</ImportGroup>
</Project>

View File

@ -724,6 +724,15 @@
<ClCompile Include="..\src\common\file_stream.cpp">
<Filter>common</Filter>
</ClCompile>
<ClCompile Include="..\src\training\scheduler.cpp">
<Filter>training</Filter>
</ClCompile>
<ClCompile Include="..\src\common\options.cpp">
<Filter>common</Filter>
</ClCompile>
<ClCompile Include="..\src\common\types.cpp">
<Filter>common</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\src\marian.h" />
@ -2038,6 +2047,9 @@
<ClInclude Include="..\src\3rd_party\zstr\zstr.hpp">
<Filter>3rd_party</Filter>
</ClInclude>
<ClInclude Include="..\src\graph\expression_graph_packable.h">
<Filter>graph</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<Filter Include="3rd_party">