Merge branch 'master' into ug-const-diligence

Ulrich Germann 2020-01-29 16:23:35 +00:00
commit cfdde151a1
94 changed files with 4098 additions and 1832 deletions


@ -5,9 +5,19 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
## [Unreleased]
### Added
- Add CMAKE options to disable compilation for specific GPU SM types
- An option to print word-level translation scores
- An option to turn off automatic detokenization from SentencePiece
- Separate quantization types for 8-bit FBGEMM for AVX2 and AVX512
- Sequence-level unlikelihood training
- Allow templated file names for valid-translation-output files
- Support for lexical shortlists in marian-server
- Support for 8-bit matrix multiplication with FBGEMM
- CMakeLists.txt now looks for SSE 4.2
- Purging of finished hypotheses during beam-search. A lot faster for large batches.
- Faster option look-up, up to 20-30% faster translation
- Added --cite and --authors flags
@ -24,6 +34,14 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
- Gradient-checkpointing
### Fixed
- Fix empty source batch entries with batch purging
- Clear RNN cache in transformer model, add correct hash functions to nodes
- Gather-operation for all index sizes
- Fix word weighting with max length cropping
- Fixed compilation on CPUs without support for AVX
- FastOpt now reads "n" and "y" values as strings, not as boolean values
- Fixed multiple reduction kernels on GPU
- Fixed guided-alignment training with cross-entropy
- Replace IntrusivePtr with std::unique_ptr in FastOpt; fixes random segfaults
due to thread-unsafe reference counting.
- Make sure that items are 256-byte aligned during saving
@ -38,6 +56,10 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
- Compilation with CUDA 10.1
### Changed
- Revert LayerNorm eps to old position, i.e. sigma' = sqrt(sigma^2 + eps)
- Downgrade NCCL to 2.3.7 as 2.4.2 is buggy (hangs with larger models)
- Return error signal on SIGTERM
- Dropped support for CUDA 8.0, CUDA 9.0 is now minimal requirement
- Removed autotuner for now, will be switched back on later
- Boost dependency is now optional and only required for marian_server
- Dropped support for g++-4.9


@ -13,6 +13,10 @@ set(BUILD_ARCH native CACHE STRING "Compile for this CPU architecture.")
# Custom CMake options
option(COMPILE_CPU "Compile CPU version" ON)
option(COMPILE_CUDA "Compile GPU version" ON)
option(COMPILE_CUDA_SM35 "Compile GPU version with SM35 support" ON)
option(COMPILE_CUDA_SM50 "Compile GPU version with SM50 support" ON)
option(COMPILE_CUDA_SM60 "Compile GPU version with SM60 support" ON)
option(COMPILE_CUDA_SM70 "Compile GPU version with SM70 support" ON)
option(COMPILE_EXAMPLES "Compile examples" OFF)
option(COMPILE_SERVER "Compile marian-server" OFF)
option(COMPILE_TESTS "Compile tests" OFF)
@ -181,8 +185,6 @@ set(EXT_LIBS ${EXT_LIBS} ${CMAKE_DL_LIBS})
if(COMPILE_CUDA)
LIST(APPEND COMPUTE -arch=sm_35; -gencode=arch=compute_35,code=sm_35; -gencode=arch=compute_50,code=sm_50; -gencode=arch=compute_52,code=sm_52; -gencode=arch=compute_60,code=sm_60; -gencode=arch=compute_61,code=sm_61;)
if(USE_STATIC_LIBS)
# link statically to stdlib libraries
set(CMAKE_EXE_LINKER_FLAGS "-static-libgcc -static-libstdc++")
@ -196,16 +198,25 @@ if(USE_STATIC_LIBS)
endif()
endif()
find_package(CUDA "8.0") # TODO: only enable FP16-related options for compute_70 and higher.
find_package(CUDA "9.0") # TODO: only enable FP16-related options for compute_70 and higher.
if(CUDA_FOUND)
# CUDA >= 10.0 requires CMake >= 3.12.2
if((CUDA_VERSION VERSION_EQUAL "10.0" OR CUDA_VERSION VERSION_GREATER "10.0") AND (CMAKE_VERSION VERSION_LESS "3.12.2"))
message(WARNING "On some Unix systems CUDA 10.0+ requires CMake 3.12.2+; you use CMake ${CMAKE_VERSION}")
endif()
if(CUDA_VERSION VERSION_GREATER "8.0")
LIST(APPEND COMPUTE -gencode=arch=compute_70,code=sm_70; -gencode=arch=compute_70,code=compute_70)
endif()
if(COMPILE_CUDA_SM35)
LIST(APPEND COMPUTE -arch=sm_35; -gencode=arch=compute_35,code=sm_35;) # Tesla K40 and above
endif(COMPILE_CUDA_SM35)
if(COMPILE_CUDA_SM50)
LIST(APPEND COMPUTE -gencode=arch=compute_50,code=sm_50; -gencode=arch=compute_52,code=sm_52;) # Maxwell GPUs
endif(COMPILE_CUDA_SM50)
if(COMPILE_CUDA_SM60)
LIST(APPEND COMPUTE -gencode=arch=compute_60,code=sm_60; -gencode=arch=compute_61,code=sm_61;) # Pascal GPUs
endif(COMPILE_CUDA_SM60)
if(COMPILE_CUDA_SM70)
LIST(APPEND COMPUTE -gencode=arch=compute_70,code=sm_70; -gencode=arch=compute_70,code=compute_70) # Volta GPUs
endif(COMPILE_CUDA_SM70)
if(USE_STATIC_LIBS)
find_library(CUDA_culibos_LIBRARY NAMES culibos PATHS ${CUDA_TOOLKIT_ROOT_DIR}/lib64)
@ -321,9 +332,12 @@ if(COMPILE_CPU)
set(BLA_VENDOR "OpenBLAS")
find_package(BLAS)
if(BLAS_FOUND)
include_directories(${BLAS_INCLUDE_DIR})
set(EXT_LIBS ${EXT_LIBS} ${BLAS_LIBRARIES})
add_definitions(-DBLAS_FOUND=1)
include(FindCBLAS)
if(CBLAS_FOUND)
include_directories(${BLAS_INCLUDE_DIR} ${CBLAS_INCLUDE_DIR})
set(EXT_LIBS ${EXT_LIBS} ${BLAS_LIBRARIES} ${CBLAS_LIBRARIES})
add_definitions(-DBLAS_FOUND=1)
endif(CBLAS_FOUND)
endif(BLAS_FOUND)
endif(MKL_FOUND)
endif(COMPILE_CPU)


@ -1,7 +1,6 @@
Marian
======
[![Build Status CUDA 8](https://img.shields.io/jenkins/s/http/vali.inf.ed.ac.uk/jenkins/view/marian/job/marian-dev-cuda-8.0.svg?label=CUDA%208)](http://vali.inf.ed.ac.uk/jenkins/job/marian-dev-cuda-8.0/)
[![Build Status CUDA 9](https://img.shields.io/jenkins/s/http/vali.inf.ed.ac.uk/jenkins/view/marian/job/marian-dev-cuda-9.2.svg?label=CUDA%209)](http://vali.inf.ed.ac.uk/jenkins/job/marian-dev-cuda-9.2/)
[![Build Status CUDA 10](https://img.shields.io/jenkins/s/http/vali.inf.ed.ac.uk/jenkins/view/marian/job/marian-dev-cuda-10.1.svg?label=CUDA%2010)](http://vali.inf.ed.ac.uk/jenkins/job/marian-dev-cuda-10.1/)
[![Build Status CPU](https://img.shields.io/jenkins/s/http/vali.inf.ed.ac.uk/jenkins/view/marian/job/marian-dev-cpu.svg?label=CPU)](http://vali.inf.ed.ac.uk/jenkins/job/marian-dev-cpu/)


@ -1 +1 @@
v1.8.21
v1.8.40

cmake/FindCBLAS.cmake Normal file

@ -0,0 +1,186 @@
# - Find CBLAS library
#
# This module finds an installed library that implements the BLAS
# linear-algebra interface (see http://www.netlib.org/blas/) with a
# CBLAS interface.
#
# This module sets the following variables:
# CBLAS_FOUND - set to true if a library implementing the CBLAS interface
# is found
# CBLAS_LINKER_FLAGS - uncached list of required linker flags (excluding -l
# and -L).
# CBLAS_LIBRARIES - uncached list of libraries (using full path name) to
# link against to use CBLAS
# CBLAS_INCLUDE_DIR - path to includes
# CBLAS_INCLUDE_FILE - the file to be included to use CBLAS
#
## Based on https://github.com/Eyescale/CMake/blob/master/FindCBLAS.cmake
INCLUDE(CheckFunctionExists)
INCLUDE(CheckIncludeFile)
MACRO(CHECK_ALL_LIBRARIES LIBRARIES INCLUDE _prefix _name _flags _list _include _search_include)
# This macro checks for the existence of the combination of fortran libraries
# given by _list. If the combination is found, this macro checks (using the
# Check_Fortran_Function_Exists macro) whether it can link against that library
# combination using the name of a routine given by _name using the linker
# flags given by _flags. If the combination of libraries is found and passes
# the link test, LIBRARIES is set to the list of complete library paths that
# have been found. Otherwise, LIBRARIES is set to FALSE.
# N.B. _prefix is the prefix applied to the names of all cached variables that
# are generated internally and marked advanced by this macro.
SET(__list)
FOREACH(_elem ${_list})
IF(__list)
SET(__list "${__list} - ${_elem}")
ELSE(__list)
SET(__list "${_elem}")
ENDIF(__list)
ENDFOREACH(_elem)
MESSAGE(STATUS "Checking for [${__list}]")
SET(_libraries_work TRUE)
SET(${LIBRARIES})
SET(_combined_name)
SET(_paths)
FOREACH(_library ${_list})
SET(_combined_name ${_combined_name}_${_library})
# did we find all the libraries in the _list until now?
# (we stop at the first unfound one)
IF(_libraries_work)
IF(APPLE)
FIND_LIBRARY(${_prefix}_${_library}_LIBRARY
NAMES ${_library}
PATHS /usr/local/lib /usr/lib /usr/local/lib64 /usr/lib64 ENV
DYLD_LIBRARY_PATH
)
ELSE(APPLE)
FIND_LIBRARY(${_prefix}_${_library}_LIBRARY
NAMES ${_library}
PATHS /usr/local/lib /usr/lib /usr/local/lib64 /usr/lib64 ENV
LD_LIBRARY_PATH
)
ENDIF(APPLE)
MARK_AS_ADVANCED(${_prefix}_${_library}_LIBRARY)
IF(${_prefix}_${_library}_LIBRARY)
GET_FILENAME_COMPONENT(_path ${${_prefix}_${_library}_LIBRARY} PATH)
LIST(APPEND _paths ${_path}/../include ${_path}/../../include)
ENDIF(${_prefix}_${_library}_LIBRARY)
SET(${LIBRARIES} ${${LIBRARIES}} ${${_prefix}_${_library}_LIBRARY})
SET(_libraries_work ${${_prefix}_${_library}_LIBRARY})
ENDIF(_libraries_work)
ENDFOREACH(_library ${_list})
# Test include
SET(_bug_search_include ${_search_include}) #CMAKE BUG!!! SHOULD NOT BE THAT
IF(_bug_search_include)
FIND_PATH(${_prefix}${_combined_name}_INCLUDE ${_include} ${_paths})
MARK_AS_ADVANCED(${_prefix}${_combined_name}_INCLUDE)
IF(${_prefix}${_combined_name}_INCLUDE)
MESSAGE(STATUS "Checking for [${__list}] -- includes found")
SET(${_prefix}_INCLUDE_DIR ${${_prefix}${_combined_name}_INCLUDE})
SET(${_prefix}_INCLUDE_FILE ${_include})
SET(${INCLUDE} ${${_prefix}_INCLUDE_DIR})
ELSE(${_prefix}${_combined_name}_INCLUDE)
MESSAGE(STATUS "Checking for [${__list}] -- includes not found")
SET(_libraries_work FALSE)
ENDIF(${_prefix}${_combined_name}_INCLUDE)
ELSE(_bug_search_include)
SET(${_prefix}_INCLUDE_DIR)
SET(${_prefix}_INCLUDE_FILE ${_include})
ENDIF(_bug_search_include)
IF(_libraries_work)
# Test this combination of libraries.
SET(CMAKE_REQUIRED_LIBRARIES ${_flags} ${${LIBRARIES}})
CHECK_FUNCTION_EXISTS(${_name} ${_prefix}${_combined_name}_WORKS)
SET(CMAKE_REQUIRED_LIBRARIES)
MARK_AS_ADVANCED(${_prefix}${_combined_name}_WORKS)
SET(_libraries_work ${${_prefix}${_combined_name}_WORKS})
IF(_libraries_work)
MESSAGE(STATUS "Checking for [${__list}] -- libraries found")
ENDIF(_libraries_work)
ENDIF(_libraries_work)
IF(NOT _libraries_work)
SET(${LIBRARIES} FALSE)
ENDIF(NOT _libraries_work)
ENDMACRO(CHECK_ALL_LIBRARIES)
SET(CBLAS_LINKER_FLAGS)
SET(CBLAS_LIBRARIES)
SET(CBLAS_INCLUDE_DIR)
# CBLAS in openBLAS
IF(NOT CBLAS_LIBRARIES)
CHECK_ALL_LIBRARIES(
CBLAS_LIBRARIES
CBLAS_INCLUDE_DIR
cblas
cblas_sgemm
""
"openblas"
"cblas.h"
TRUE
)
ENDIF(NOT CBLAS_LIBRARIES)
#MESSAGE(STATUS ${openblas_INCLUDE_DIR})
# CBLAS in CBLAS
IF(NOT CBLAS_LIBRARIES)
CHECK_ALL_LIBRARIES(
CBLAS_LIBRARIES
CBLAS_INCLUDE_DIR
cblas
cblas_sgemm
""
"cblas"
"cblas.h"
TRUE
)
ENDIF(NOT CBLAS_LIBRARIES)
#MESSAGE(STATUS ${cblas_INCLUDE_DIR})
# CBLAS in lapacke
IF(NOT CBLAS_LIBRARIES)
CHECK_ALL_LIBRARIES(
CBLAS_LIBRARIES
CBLAS_INCLUDE_DIR
cblas
cblas_sgemm
""
"lapacke"
"cblas.h"
TRUE
)
ENDIF(NOT CBLAS_LIBRARIES)
#MESSAGE(STATUS ${lapacke_INCLUDE_DIR})
IF(CBLAS_LIBRARIES)
SET(CBLAS_FOUND TRUE)
ELSE(CBLAS_LIBRARIES)
SET(CBLAS_FOUND FALSE)
ENDIF(CBLAS_LIBRARIES)
IF(NOT CBLAS_FOUND AND CBLAS_FIND_REQUIRED)
MESSAGE(FATAL_ERROR "CBLAS library not found. Please specify library location")
ENDIF(NOT CBLAS_FOUND AND CBLAS_FIND_REQUIRED)
IF(NOT CBLAS_FIND_QUIETLY)
IF(CBLAS_FOUND)
MESSAGE(STATUS "CBLAS library found: " ${CBLAS_LIBRARIES})
MESSAGE(STATUS "cblas.h include directory: " ${CBLAS_INCLUDE_DIR})
ELSE(CBLAS_FOUND)
MESSAGE(STATUS "CBLAS library not found. Please specify library location")
ENDIF(CBLAS_FOUND)
ENDIF(NOT CBLAS_FIND_QUIETLY)
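In effect, this module verifies that cblas_sgemm from cblas.h can be found and linked. A minimal, self-contained C++ smoke test of that same symbol (editor's sketch, not part of this commit; assumes an installed CBLAS such as OpenBLAS):

// Smoke test for the symbol FindCBLAS.cmake probes: cblas_sgemm from cblas.h.
// Build with e.g.: g++ cblas_check.cpp -lopenblas (or -lcblas)
#include <cblas.h>
#include <cstdio>

int main() {
  float A[4] = {1, 2, 3, 4};  // 2x2, row-major
  float B[4] = {5, 6, 7, 8};  // 2x2, row-major
  float C[4] = {0, 0, 0, 0};
  // C = 1.0 * A * B + 0.0 * C
  cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
              2, 2, 2, 1.0f, A, 2, B, 2, 0.0f, C, 2);
  std::printf("%g %g\n%g %g\n", C[0], C[1], C[2], C[3]);  // expect 19 22 / 43 50
  return 0;
}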

@ -1 +1 @@
Subproject commit 336740065d9c23e53e912a1befff18981d9d27ab
Subproject commit c19b7814d71febf1053bd93af6ac314b46204092

@ -1 +1 @@
Subproject commit 2a5833e41110c19f0bbe9f3cf2aa92caad96cf42
Subproject commit 6a08849b23f6c14eefbe12f4eb73dc638b962587


@ -13,6 +13,11 @@ if(USE_FBGEMM)
set(CMAKE_SUPPRESS_DEVELOPER_WARNINGS 1 CACHE INTERNAL "No dev warnings")
endif()
if(NOT MSVC)
# only locally disabled for the 3rd_party folder
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-value -Wno-unused-parameter -Wno-unused-variable -Wno-unused-function")
endif()
set(FBGEMM_BUILD_TESTS OFF CACHE BOOL "Disable fbgemm tests")
set(FBGEMM_BUILD_BENCHMARKS OFF CACHE BOOL "Disable fbgemm benchmark")
add_subdirectory(./fbgemm)
@ -66,8 +71,21 @@ set(INSTALLS "") # this will contain a list of 3rd part dependencies that we ins
if(CUDA_FOUND)
if(USE_NCCL)
# disables compilation for sm_30 to avoid ptxas warning... that's general Kepler support. But K80s are supported for instance by sm_35
set(GENCODE "-gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61")
# disables compilation for sm_30 to avoid ptxas warning... that is general Kepler support. But K80s are supported for instance by sm_35
set(GENCODE "")
if(COMPILE_CUDA_SM35)
set(GENCODE "${GENCODE} -gencode=arch=compute_35,code=sm_35")
endif(COMPILE_CUDA_SM35)
if(COMPILE_CUDA_SM50)
set(GENCODE "${GENCODE} -gencode=arch=compute_50,code=sm_50")
endif(COMPILE_CUDA_SM50)
if(COMPILE_CUDA_SM60)
set(GENCODE "${GENCODE} -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61")
endif(COMPILE_CUDA_SM60)
if(COMPILE_CUDA_SM70)
set(GENCODE "${GENCODE} -gencode=arch=compute_70,code=sm_70")
endif(COMPILE_CUDA_SM70)
# install nccl in ${CMAKE_BINARY_DIR}/local similar to /usr/local linux installation
ExternalProject_Add(nccl_install

src/3rd_party/avx_mathfun.h vendored Executable file → Normal file

@ -1 +1 @@
Subproject commit f0b354327aaf2330c65340725b1981040c8bec9e
Subproject commit 84e66a976046180187724aff60a236c5378fde7c

src/3rd_party/nccl vendored

@ -1 +1 @@
Subproject commit 8e3a3f7c5b520babff49cec54a866fa3eda3a3b6
Subproject commit b56650c7f59b8cd40d18809784a6d6be38ef8acb


@ -1,347 +1,248 @@
// This software contains source code provided by NVIDIA Corporation.
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
MJD: Relevant text from the NVIDIA EULA:
2.1 Sample Source Code Modification, Ownership and Distribution
Subject to the terms of the SLA and this Supplement, NVIDIA hereby grants you a non-
exclusive, non-transferable license, without the right to sublicense, during the applicable
license term unless earlier terminated pursuant to the SLA, to have Authorized Users
modify and create derivative works of CUDA Licensed Software that constitutes sample
source code, when provided to you by NVIDIA in source code form. You hold all rights,
title and interest in and to your modifications and derivative works of the sample source
code software that you create as permitted hereunder (collectively, Derivatives), subject
to NVIDIA's underlying Intellectual Property Rights in and to the CUDA Licensed
Software; provided, however that you grant NVIDIA and its Affiliates an irrevocable,
perpetual, nonexclusive, worldwide, royalty-free paid-up license to make, have made,
use, have used, reproduce, license, distribute, sublicense, transfer and otherwise
commercialize Derivatives including (without limitation) with the CUDA Licensed
Software or other NVIDIA products, technologies or materials. You may distribute the
CUDA Supplement to Software License Agreement End User License Agreements (EULA)
DR-06739-001_v01_v9.0 | 14 sample source code as delivered by NVIDIA and/or your Derivatives,
provided that all NVIDIA copyright notices and trademarks are maintained and used properly
and the sample source code includes the following notice: This software contains source code
provided by NVIDIA Corporation.
*/
#pragma once
#include "tensors/tensor.h"
#include <cuda_runtime.h>
#include "functional/tmp.h"
#include <cooperative_groups.h>
namespace marian {
template <unsigned int blockSize, typename AccType>
__device__ void
reduceBlock(volatile AccType *sdata, AccType mySum, const unsigned int tid)
{
sdata[tid] = mySum;
__syncthreads();
namespace cg = cooperative_groups;
// do reduction in shared mem
if (blockSize >= 512)
{
if (tid < 256)
{
sdata[tid] = mySum = mySum + sdata[tid + 256];
}
// Utility class used to avoid linker errors with extern
// unsized shared memory arrays with templated type
template <class T>
struct SharedMemory {
__device__ inline operator T *() {
extern __shared__ int __smem[];
return (T *)__smem;
}
__syncthreads();
__device__ inline operator const T *() const {
extern __shared__ int __smem[];
return (T *)__smem;
}
};
// specialize for double to avoid unaligned memory
// access compile errors
template <>
struct SharedMemory<double> {
__device__ inline operator double *() {
extern __shared__ double __smem_d[];
return (double *)__smem_d;
}
__device__ inline operator const double *() const {
extern __shared__ double __smem_d[];
return (double *)__smem_d;
}
};
/*
This version adds multiple elements per thread sequentially. This reduces
the overall cost of the algorithm while keeping the work complexity O(n) and
the step complexity O(log n). (Brent's Theorem optimization)
Note, this kernel needs a minimum of 64*sizeof(T) bytes of shared memory.
In other words if blockSize <= 32, allocate 64*sizeof(T) bytes.
If blockSize > 32, allocate blockSize*sizeof(T) bytes.
*/
template <typename T, typename AccType, unsigned int blockSize, bool nIsPow2Greater1, size_t K, class Functor, class AggFunctor>
__global__ void reduceSinglePass(Functor functor, AccType aggInit, AggFunctor aggFunctor, AccType scale,
const functional::Shape full,
functional::Tensor<AccType> out,
functional::Array<functional::Tensor<T>, K> ins) {
int n = full.elements();
// Handle to thread block group
cg::thread_block cta = cg::this_thread_block();
AccType *sdata = SharedMemory<AccType>();
// perform first level of reduction,
// reading from global memory, writing to shared memory
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x * blockSize * 2 + threadIdx.x;
unsigned int gridSize = blockSize * 2 * gridDim.x;
AccType mySum = aggInit;
// we reduce multiple elements per thread. The number is determined by the
// number of active thread blocks (via gridDim). More blocks will result
// in a larger gridSize and therefore fewer elements per thread
while (i < n) {
mySum = aggFunctor(mySum, functional::applyWithCast<AccType>(functor, ins, i));
// ensure we don't read out of bounds -- this is optimized away for powerOf2
// sized arrays
if (nIsPow2Greater1 || i + blockSize < n)
mySum = aggFunctor(mySum, functional::applyWithCast<AccType>(functor, ins, i + blockSize));
i += gridSize;
}
// each thread puts its local sum into shared memory
sdata[tid] = mySum;
cg::sync(cta);
// do reduction in shared mem
if ((blockSize >= 512) && (tid < 256)) {
sdata[tid] = mySum = aggFunctor(mySum, sdata[tid + 256]);
}
cg::sync(cta);
if ((blockSize >= 256) && (tid < 128)) {
sdata[tid] = mySum = aggFunctor(mySum, sdata[tid + 128]);
}
cg::sync(cta);
if ((blockSize >= 128) && (tid < 64)) {
sdata[tid] = mySum = aggFunctor(mySum, sdata[tid + 64]);
}
cg::sync(cta);
cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta);
if (cta.thread_rank() < 32) {
// Fetch final intermediate sum from 2nd warp
if (blockSize >= 64)
mySum = aggFunctor(mySum, sdata[tid + 32]);
// reduce final warp using shuffle
for (int offset = tile32.size() / 2; offset > 0; offset /= 2) {
mySum = aggFunctor(mySum, tile32.shfl_down(mySum, offset));
}
}
if (blockSize >= 256)
{
if (tid < 128)
{
sdata[tid] = mySum = mySum + sdata[tid + 128];
}
__syncthreads();
}
if (blockSize >= 128)
{
if (tid < 64)
{
sdata[tid] = mySum = mySum + sdata[tid + 64];
}
__syncthreads();
}
if (tid < 32)
{
if (blockSize >= 64)
{
sdata[tid] = mySum = mySum + sdata[tid + 32];
}
if (blockSize >= 32)
{
sdata[tid] = mySum = mySum + sdata[tid + 16];
}
if (blockSize >= 16)
{
sdata[tid] = mySum = mySum + sdata[tid + 8];
}
if (blockSize >= 8)
{
sdata[tid] = mySum = mySum + sdata[tid + 4];
}
if (blockSize >= 4)
{
sdata[tid] = mySum = mySum + sdata[tid + 2];
}
if (blockSize >= 2)
{
sdata[tid] = mySum = mySum + sdata[tid + 1];
}
}
// write result for this block to global mem
if (cta.thread_rank() == 0)
out[blockIdx.x] = aggFunctor(out[blockIdx.x], mySum * scale); // aggFunctor?
}
template <unsigned int blockSize, bool nIsPow2, typename T, typename AccType, class Functor>
__device__ void
reduceBlocks(Functor f, T *g_idata, AccType *g_odata, unsigned int n)
{
extern __shared__ AccType sdata[];
static inline bool isPow2Greater1(unsigned int x) { // is power of two but also larger than 1, otherwise an out-of-bounds read occurs
return x > 1 && ((x & (x - 1)) == 0);
}
// perform first level of reduction,
// reading from global memory, writing to shared memory
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*(blockSize*2) + threadIdx.x;
unsigned int gridSize = blockSize*2*gridDim.x;
AccType mySum = 0;
static inline unsigned int nextPow2(unsigned int x) {
--x;
x |= x >> 1;
x |= x >> 2;
x |= x >> 4;
x |= x >> 8;
x |= x >> 16;
return ++x;
}
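// [editor's illustrative note, not in the original file] nextPow2 rounds up to
// the next power of two, e.g. nextPow2(300) == 512 and nextPow2(64) == 64;
// together with isPow2Greater1 this lets the launch wrapper below dispatch a
// matching template instantiation of the kernel.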
// we reduce multiple elements per thread. The number is determined by the
// number of active thread blocks (via gridDim). More blocks will result
// in a larger gridSize and therefore fewer elements per thread
while (i < n)
{
mySum += f((AccType)g_idata[i]);
////////////////////////////////////////////////////////////////////////////////
// Wrapper function for kernel launch
////////////////////////////////////////////////////////////////////////////////
template <typename T, typename AccType, size_t K, class Functor, class AggFunctor>
void reduceSinglePass(Functor functor, AccType aggInit, AggFunctor aggFunctor, AccType scale,
const functional::Shape full,
functional::Tensor<AccType> out,
functional::Array<functional::Tensor<T>, K> ins,
int threads, int blocks) {
int size = full.elements();
// when there is only one warp per block, we need to allocate two warps
// worth of shared memory so that we don't index shared memory out of bounds
int smemSize = (threads <= 32) ? 2 * threads * sizeof(AccType) : threads * sizeof(AccType);
dim3 dimBlock(threads, 1, 1);
dim3 dimGrid(blocks, 1, 1);
// ensure we don't read out of bounds -- this is optimized away for powerOf2 sized arrays
if (nIsPow2 || i + blockSize < n)
mySum += f((AccType)g_idata[i+blockSize]);
i += gridSize;
if (isPow2Greater1(size)) {
switch (threads) {
case 512:
reduceSinglePass<T, AccType, 512, true><<<dimGrid, dimBlock, smemSize>>>(functor, aggInit, aggFunctor, scale, full, out, ins);
break;
case 256:
reduceSinglePass<T, AccType, 256, true><<<dimGrid, dimBlock, smemSize>>>(functor, aggInit, aggFunctor, scale, full, out, ins);
break;
case 128:
reduceSinglePass<T, AccType, 128, true><<<dimGrid, dimBlock, smemSize>>>(functor, aggInit, aggFunctor, scale, full, out, ins);
break;
case 64:
reduceSinglePass<T, AccType, 64, true><<<dimGrid, dimBlock, smemSize>>>(functor, aggInit, aggFunctor, scale, full, out, ins);
break;
case 32:
reduceSinglePass<T, AccType, 32, true><<<dimGrid, dimBlock, smemSize>>>(functor, aggInit, aggFunctor, scale, full, out, ins);
break;
case 16:
reduceSinglePass<T, AccType, 16, true><<<dimGrid, dimBlock, smemSize>>>(functor, aggInit, aggFunctor, scale, full, out, ins);
break;
case 8:
reduceSinglePass<T, AccType, 8, true><<<dimGrid, dimBlock, smemSize>>>(functor, aggInit, aggFunctor, scale, full, out, ins);
break;
case 4:
reduceSinglePass<T, AccType, 4, true><<<dimGrid, dimBlock, smemSize>>>(functor, aggInit, aggFunctor, scale, full, out, ins);
break;
case 2:
reduceSinglePass<T, AccType, 2, true><<<dimGrid, dimBlock, smemSize>>>(functor, aggInit, aggFunctor, scale, full, out, ins);
break;
case 1:
reduceSinglePass<T, AccType, 1, true><<<dimGrid, dimBlock, smemSize>>>(functor, aggInit, aggFunctor, scale, full, out, ins);
break;
}
// do reduction in shared mem
reduceBlock<blockSize>(sdata, mySum, tid);
// write result for this block to global mem
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
// Global variable used by reduceSinglePass to count how many blocks have finished
__device__ unsigned int retirementCount = 0;
cudaError_t setRetirementCount(int retCnt)
{
return cudaMemcpyToSymbol(retirementCount, &retCnt, sizeof(unsigned int), 0, cudaMemcpyHostToDevice);
}
// This reduction kernel reduces an arbitrary size array in a single kernel invocation
// It does so by keeping track of how many blocks have finished. After each thread
// block completes the reduction of its own block of data, it "takes a ticket" by
// atomically incrementing a global counter. If the ticket value is equal to the number
// of thread blocks, then the block holding the ticket knows that it is the last block
// to finish. This last block is responsible for summing the results of all the other
// blocks.
//
// In order for this to work, we must be sure that before a block takes a ticket, all
// of its memory transactions have completed. This is what __threadfence() does -- it
// blocks until the results of all outstanding memory transactions within the
// calling thread are visible to all other threads.
//
// For more details on the reduction algorithm (notably the multi-pass approach), see
// the "reduction" sample in the CUDA SDK.
template <unsigned int blockSize, bool nIsPow2, typename T, typename AccType, class Functor>
__global__ void reduceSinglePass(Functor f, T *g_idata, AccType *g_odata, unsigned int n)
{
//
// PHASE 1: Process all inputs assigned to this block
//
reduceBlocks<blockSize, nIsPow2, T, AccType>(f, g_idata, g_odata, n);
//
// PHASE 2: Last block finished will process all partial sums
//
if (gridDim.x > 1)
{
const unsigned int tid = threadIdx.x;
__shared__ bool amLast;
extern AccType __shared__ smem[];
// wait until all outstanding memory instructions in this thread are finished
__threadfence();
// Thread 0 takes a ticket
if (tid==0)
{
unsigned int ticket = atomicInc(&retirementCount, gridDim.x);
// If the ticket ID is equal to the number of blocks, we are the last block!
amLast = (ticket == gridDim.x-1);
}
__syncthreads();
// The last block sums the results of all other blocks
if (amLast)
{
int i = tid;
AccType mySum = 0;
while (i < gridDim.x)
{
mySum += g_odata[i];
i += blockSize;
}
reduceBlock<blockSize>(smem, mySum, tid);
if (tid==0)
{
g_odata[0] = smem[0];
// reset retirement count so that next run succeeds
retirementCount = 0;
}
}
} else {
switch (threads) {
case 512:
reduceSinglePass<T, AccType, 512, false><<<dimGrid, dimBlock, smemSize>>>(functor, aggInit, aggFunctor, scale, full, out, ins);
break;
case 256:
reduceSinglePass<T, AccType, 256, false><<<dimGrid, dimBlock, smemSize>>>(functor, aggInit, aggFunctor, scale, full, out, ins);
break;
case 128:
reduceSinglePass<T, AccType, 128, false><<<dimGrid, dimBlock, smemSize>>>(functor, aggInit, aggFunctor, scale, full, out, ins);
break;
case 64:
reduceSinglePass<T, AccType, 64, false><<<dimGrid, dimBlock, smemSize>>>(functor, aggInit, aggFunctor, scale, full, out, ins);
break;
case 32:
reduceSinglePass<T, AccType, 32, false><<<dimGrid, dimBlock, smemSize>>>(functor, aggInit, aggFunctor, scale, full, out, ins);
break;
case 16:
reduceSinglePass<T, AccType, 16, false><<<dimGrid, dimBlock, smemSize>>>(functor, aggInit, aggFunctor, scale, full, out, ins);
break;
case 8:
reduceSinglePass<T, AccType, 8, false><<<dimGrid, dimBlock, smemSize>>>(functor, aggInit, aggFunctor, scale, full, out, ins);
break;
case 4:
reduceSinglePass<T, AccType, 4, false><<<dimGrid, dimBlock, smemSize>>>(functor, aggInit, aggFunctor, scale, full, out, ins);
break;
case 2:
reduceSinglePass<T, AccType, 2, false><<<dimGrid, dimBlock, smemSize>>>(functor, aggInit, aggFunctor, scale, full, out, ins);
break;
case 1:
reduceSinglePass<T, AccType, 1, false><<<dimGrid, dimBlock, smemSize>>>(functor, aggInit, aggFunctor, scale, full, out, ins);
break;
}
}
}
bool isPow2(unsigned int x)
{
return ((x&(x-1))==0);
}
template <typename T, typename AccType, class Functor>
void ReduceAll(Functor f, Tensor blockMem, Tensor in)
{
cudaSetDevice(in->getDeviceId().no);
int size = in->shape().elements();
int threads = std::min(MAX_THREADS, size);
int blocks = std::min(MAX_BLOCKS, size / threads + (size % threads != 0));
dim3 dimBlock(threads, 1, 1);
dim3 dimGrid(blocks, 1, 1);
int smemSize = threads * sizeof(AccType);
T* d_idata = in->data<T>();
AccType* d_odata = blockMem->data<AccType>();
// choose which of the optimized versions of reduction to launch
if (isPow2(size))
{
switch (threads)
{
case 512:
reduceSinglePass<512, true, T, AccType><<< dimGrid, dimBlock, smemSize >>>(f, d_idata, d_odata, size);
break;
case 256:
reduceSinglePass<256, true, T, AccType><<< dimGrid, dimBlock, smemSize >>>(f, d_idata, d_odata, size);
break;
case 128:
reduceSinglePass<128, true, T, AccType><<< dimGrid, dimBlock, smemSize >>>(f, d_idata, d_odata, size);
break;
case 64:
reduceSinglePass< 64, true, T, AccType><<< dimGrid, dimBlock, smemSize >>>(f, d_idata, d_odata, size);
break;
case 32:
reduceSinglePass< 32, true, T, AccType><<< dimGrid, dimBlock, smemSize >>>(f, d_idata, d_odata, size);
break;
case 16:
reduceSinglePass< 16, true, T, AccType><<< dimGrid, dimBlock, smemSize >>>(f, d_idata, d_odata, size);
break;
case 8:
reduceSinglePass< 8, true, T, AccType><<< dimGrid, dimBlock, smemSize >>>(f, d_idata, d_odata, size);
break;
case 4:
reduceSinglePass< 4, true, T, AccType><<< dimGrid, dimBlock, smemSize >>>(f, d_idata, d_odata, size);
break;
case 2:
reduceSinglePass< 2, true, T, AccType><<< dimGrid, dimBlock, smemSize >>>(f, d_idata, d_odata, size);
break;
case 1:
reduceSinglePass< 1, true, T, AccType><<< dimGrid, dimBlock, smemSize >>>(f, d_idata, d_odata, size);
break;
}
}
else
{
switch (threads)
{
case 512:
reduceSinglePass<512, false, T, AccType><<< dimGrid, dimBlock, smemSize >>>(f, d_idata, d_odata, size);
break;
case 256:
reduceSinglePass<256, false, T, AccType><<< dimGrid, dimBlock, smemSize >>>(f, d_idata, d_odata, size);
break;
case 128:
reduceSinglePass<128, false, T, AccType><<< dimGrid, dimBlock, smemSize >>>(f, d_idata, d_odata, size);
break;
case 64:
reduceSinglePass< 64, false, T, AccType><<< dimGrid, dimBlock, smemSize >>>(f, d_idata, d_odata, size);
break;
case 32:
reduceSinglePass< 32, false, T, AccType><<< dimGrid, dimBlock, smemSize >>>(f, d_idata, d_odata, size);
break;
case 16:
reduceSinglePass< 16, false, T, AccType><<< dimGrid, dimBlock, smemSize >>>(f, d_idata, d_odata, size);
break;
case 8:
reduceSinglePass< 8, false, T, AccType><<< dimGrid, dimBlock, smemSize >>>(f, d_idata, d_odata, size);
break;
case 4:
reduceSinglePass< 4, false, T, AccType><<< dimGrid, dimBlock, smemSize >>>(f, d_idata, d_odata, size);
break;
case 2:
reduceSinglePass< 2, false, T, AccType><<< dimGrid, dimBlock, smemSize >>>(f, d_idata, d_odata, size);
break;
case 1:
reduceSinglePass< 1, false, T, AccType><<< dimGrid, dimBlock, smemSize >>>(f, d_idata, d_odata, size);
break;
}
}
}
}
}
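A hypothetical helper (editor's sketch, not part of this commit) showing how a caller might derive the threads/blocks arguments the wrapper above expects, mirroring the heuristics of the removed ReduceAll code and NVIDIA's reduction sample; MAX_THREADS/MAX_BLOCKS-style caps are assumed:

// Pick a power-of-two thread count (each thread first reduces two elements)
// and cap the number of blocks; marian::nextPow2 is defined above.
static void chooseLaunchConfig(int size, int maxThreads, int maxBlocks,
                               int& threads, int& blocks) {
  threads = (size < maxThreads * 2) ? (int)marian::nextPow2((size + 1) / 2) : maxThreads;
  blocks = (size + threads * 2 - 1) / (threads * 2);
  if(blocks > maxBlocks)
    blocks = maxBlocks;
}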

src/3rd_party/sse_mathfun.h vendored Executable file → Normal file


@ -51,7 +51,7 @@ add_library(marian STATIC
tensors/cpu/sharp/int_gemm.cpp
tensors/cpu/sharp/avx_gemm.cpp
tensors/cpu/sharp/sse_gemm.cpp
tensors/cpu/sharp/packed_gemm.cpp
tensors/cpu/fbgemm/packed_gemm.cpp
graph/expression_graph.cpp
graph/expression_operators.cpp
@ -138,7 +138,8 @@ cuda_add_library(marian_cuda
tensors/gpu/algorithm.cu
tensors/gpu/prod.cpp
tensors/gpu/element.cu
tensors/gpu/add.cu
tensors/gpu/add.cu
tensors/gpu/add_all.cu
tensors/gpu/tensor_operators.cu
tensors/gpu/cudnn_wrappers.cu
translator/nth_element.cu


@ -4,7 +4,7 @@
#include <sstream>
#include "graph/expression_graph_packable.h"
#include "tensors/cpu/fbgemm/expression_graph_packable.h"
int main(int argc, char** argv) {
using namespace marian;
@ -19,16 +19,29 @@ int main(int argc, char** argv) {
"Convert a model in the .npz format and normal memory layout to a mmap-able binary model which could be in normal memory layout or packed memory layout",
"Allowed options",
"Examples:\n"
" ./marian-conv -f model.npz -t model.bin --gemm-type fp16packed");
" ./marian-conv -f model.npz -t model.bin --gemm-type packed16");
cli->add<std::string>("--from,-f", "Input model", "model.npz");
cli->add<std::string>("--to,-t", "Output model", "model.bin");
cli->add<std::string>("--gemm-type,-g", "GEMM Type to be used with this weights", "mklfp32");
cli->add<std::string>("--gemm-type,-g", "GEMM Type to be used: float32, packed16, packed8avx2, packed8avx512", "float32");
cli->parse(argc, argv);
options->merge(config);
}
auto modelFrom = options->get<std::string>("from");
auto modelTo = options->get<std::string>("to");
auto saveGemmType = options->get<std::string>("gemm-type");
auto saveGemmTypeStr = options->get<std::string>("gemm-type", "float32");
Type saveGemmType;
if(saveGemmTypeStr == "float32") {
saveGemmType = Type::float32;
} else if(saveGemmTypeStr == "packed16") { // packed16 only supports AVX2. AVX512 might be added later
saveGemmType = Type::packed16;
} else if(saveGemmTypeStr == "packed8avx2") { // packed8 for AVX2
saveGemmType = Type::packed8avx2;
} else if(saveGemmTypeStr == "packed8avx512") { // packed8 for AVX512
saveGemmType = Type::packed8avx512;
} else {
ABORT("Unknown gemm-type: {}", saveGemmTypeStr);
}
LOG(info, "Outputting {}", modelTo);


@ -1,3 +1,4 @@
#include <signal.h>
#include "marian.h"
#include "training/graph_group_async.h"
@ -68,5 +69,13 @@ int main(int argc, char** argv) {
}
}
return 0;
// If we exit due to SIGTERM, exit with 128 + the signal number, as suggested
// for bash in http://tldp.org/LDP/abs/html/exitcodes.html. This allows parent
// scripts to determine if training terminated naturally or via SIGTERM.
// With this approach we can accommodate additional signals in the future.
// An alternative would be to return 124, which is what the timeout command
// returns for timeout -s SIGTERM <seconds> ..., because exiting after SIGTERM
// is not technically a fatal error (which is what the 128+x convention usually
// stands for).
return getSigtermFlag() ? (128 + SIGTERM) : 0;
}
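For reference, a minimal sketch of the flag mechanism that getSigtermFlag() implies (illustrative only; the names here are hypothetical, and Marian's actual handler lives elsewhere in the training code):

#include <csignal>

static volatile std::sig_atomic_t sigtermFlag = 0;   // async-signal-safe flag type
static void handleSigterm(int) { sigtermFlag = 1; }  // the handler only sets the flag

// at start-up:          std::signal(SIGTERM, handleSigterm);
// in the training loop: if(sigtermFlag) { /* save checkpoint and stop */ }
// in main(), as above:  return sigtermFlag ? (128 + SIGTERM) : 0;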


@ -328,6 +328,8 @@ void ConfigParser::addOptionsTraining(cli::CLIWrapper& cli) {
"Optimization criterion: ce-mean, ce-mean-words, ce-sum, perplexity", "ce-mean");
cli.add<std::string>("--multi-loss-type",
"How to accumulate multi-objective losses: sum, scaled, mean", "sum");
cli.add<bool>("--unlikelihood-loss",
"Use word-level weights as indicators for sequence-level unlikelihood training");
cli.add<bool>("--overwrite",
"Do not create model checkpoints, only overwrite main model file with last checkpoint. "
"Reduces disk usage");
@ -502,7 +504,7 @@ void ConfigParser::addOptionsTraining(cli::CLIWrapper& cli) {
true);
// add ULR settings
addSuboptionsULR(cli);
addSuboptionsULR(cli);
cli.add<std::vector<std::string>>("--task",
"Use predefined set of options. Possible values: transformer, transformer-big");
@ -543,6 +545,8 @@ void ConfigParser::addOptionsValidation(cli::CLIWrapper& cli) {
"Allow unknown words to appear in output");
cli.add<bool>("--n-best",
"Generate n-best list");
cli.add<bool>("--word-scores",
"Print word-level scores");
// efficiency options
cli.add<int>("--valid-mini-batch",
@ -562,8 +566,10 @@ void ConfigParser::addOptionsValidation(cli::CLIWrapper& cli) {
"Additional args passed to --valid-script-path. These are inserted"
" between the script path and the output translation-file path");
cli.add<std::string>("--valid-translation-output",
"Path to store the translation");
"(Template for) path to store the translation. "
"E.g., validation-output-after-{U}-updates-{T}-tokens.txt. Template "
"parameters: {E} for epoch; {B} for No. of batches within epoch; "
"{U} for total No. of updates; {T} for total No. of tokens seen.");
cli.add<bool>("--keep-best",
"Keep best model for each validation metric");
cli.add<std::string>("--valid-log",
@ -603,6 +609,12 @@ void ConfigParser::addOptionsTranslation(cli::CLIWrapper& cli) {
cli.add<std::string>("--alignment",
"Return word alignment. Possible values: 0.0-1.0, hard, soft")
->implicit_val("1");
cli.add<bool>("--word-scores",
"Print word-level scores");
#ifdef USE_SENTENCEPIECE
cli.add<bool>("--no-spm-decode",
"Keep the output segmented into SentencePiece subwords");
#endif
addSuboptionsDevices(cli);
addSuboptionsInputLength(cli);
@ -612,7 +624,7 @@ void ConfigParser::addOptionsTranslation(cli::CLIWrapper& cli) {
"Optimize speed aggressively sacrificing memory or precision");
cli.add<bool>("--skip-cost",
"Ignore model cost during translation, not recommended for beam-size > 1");
cli.add<bool>("--fp16",
cli.add<bool>("--fp16",
"Shortcut for mixed precision inference with float16, corresponds to: --precision float16");
cli.add<std::vector<std::string>>("--precision",
"Mixed precision for inference, set parameter type in expression graph",
@ -626,8 +638,10 @@ void ConfigParser::addOptionsTranslation(cli::CLIWrapper& cli) {
"Noise output layer with gumbel noise",
false);
#if 0 // @TODO: Ask Hany if there are any decoding-time options
// add ULR settings
addSuboptionsULR(cli);
#endif
cli.switchGroup(previous_group);
// clang-format on
@ -737,29 +751,31 @@ void ConfigParser::addSuboptionsBatching(cli::CLIWrapper& cli) {
"Sorting strategy for maxi-batch: none, src, trg (not available for decoder)",
defaultMaxiBatchSort);
cli.add<bool>("--shuffle-in-ram",
"Keep shuffled corpus in RAM, do not write to temp file");
// @TODO: Consider making the next two options options of the vocab instead, to make it more local in scope.
cli.add<size_t>("--all-caps-every",
"When forming minibatches, preprocess every Nth line on the fly to all-caps. Assumes UTF-8");
cli.add<size_t>("--english-title-case-every",
"When forming minibatches, preprocess every Nth line on the fly to title-case. Assumes English (ASCII only)");
if(mode_ == cli::mode::training) {
cli.add<bool>("--shuffle-in-ram",
"Keep shuffled corpus in RAM, do not write to temp file");
// @TODO: Consider making the next two options options of the vocab instead, to make it more local in scope.
cli.add<size_t>("--all-caps-every",
"When forming minibatches, preprocess every Nth line on the fly to all-caps. Assumes UTF-8");
cli.add<size_t>("--english-title-case-every",
"When forming minibatches, preprocess every Nth line on the fly to title-case. Assumes English (ASCII only)");
cli.add<int>("--mini-batch-words-ref",
"If given, the following hyper parameters are adjusted as-if we had this mini-batch size: "
"--learn-rate, --optimizer-params, --exponential-smoothing, --mini-batch-warmup");
cli.add<std::string/*SchedulerPeriod*/>("--mini-batch-warmup",
"Linear ramp-up of MB size, up to this #updates (append 't' for up to this #target labels). "
"Auto-adjusted to --mini-batch-words-ref if given",
{"0"});
cli.add<bool>("--mini-batch-track-lr",
"Dynamically track mini-batch size inverse to actual learning rate (not considering lr-warmup)");
cli.add<size_t>("--mini-batch-overstuff",
"[experimental] Stuff this much more data into a minibatch, but scale down the LR and progress counter",
1);
cli.add<size_t>("--mini-batch-understuff",
"[experimental] Break each batch into this many updates",
1);
cli.add<int>("--mini-batch-words-ref",
"If given, the following hyper parameters are adjusted as-if we had this mini-batch size: "
"--learn-rate, --optimizer-params, --exponential-smoothing, --mini-batch-warmup");
cli.add<std::string/*SchedulerPeriod*/>("--mini-batch-warmup",
"Linear ramp-up of MB size, up to this #updates (append 't' for up to this #target labels). "
"Auto-adjusted to --mini-batch-words-ref if given",
{"0"});
cli.add<bool>("--mini-batch-track-lr",
"Dynamically track mini-batch size inverse to actual learning rate (not considering lr-warmup)");
cli.add<size_t>("--mini-batch-overstuff",
"[experimental] Stuff this much more data into a minibatch, but scale down the LR and progress counter",
1);
cli.add<size_t>("--mini-batch-understuff",
"[experimental] Break each batch into this many updates",
1);
}
// clang-format on
}
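The --valid-translation-output template above substitutes {E}, {B}, {U} and {T} at validation time. A self-contained illustration of such a substitution (editor's sketch with hypothetical names, not Marian's actual implementation):

#include <string>

// Replace every occurrence of `key` in `s` with `val`.
static void replaceAll(std::string& s, const std::string& key, const std::string& val) {
  for(size_t p = s.find(key); p != std::string::npos; p = s.find(key, p + val.size()))
    s.replace(p, key.size(), val);
}

std::string expandTemplate(std::string path, size_t E, size_t B, size_t U, size_t T) {
  replaceAll(path, "{E}", std::to_string(E));  // epoch
  replaceAll(path, "{B}", std::to_string(B));  // batches within epoch
  replaceAll(path, "{U}", std::to_string(U));  // total updates
  replaceAll(path, "{T}", std::to_string(T));  // total tokens seen
  return path;
}
// expandTemplate("valid-after-{U}-updates-{T}-tokens.txt", 2, 500, 10000, 2000000)
//   -> "valid-after-10000-updates-2000000-tokens.txt"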


@ -23,7 +23,7 @@ struct Convert {
// specialization for translating from string, @TODO check if this is required at all, mostly for compilation now.
template <typename To>
struct Convert<To, std::string> {
static inline To apply(const std::string& from) {
static inline To apply(const std::string& /* from */) {
ABORT("Not implemented");
}
};
@ -84,7 +84,10 @@ std::vector<T> As<std::vector<T>>::apply(const FastOpt& node) {
// specializations for simple vector types
template struct As<std::vector<bool>>;
template struct As<std::vector<int>>;
template struct As<std::vector<unsigned long>>;
// Windows and Unix-based OSes have different type definitions for 'unsigned long'.
// So we need an explicit instantiation for uint64_t; otherwise, there's a linking error on Windows.
// https://software.intel.com/en-us/articles/size-of-long-integer-type-on-different-architecture-and-os/
template struct As<std::vector<uint64_t>>;
template struct As<std::vector<float>>;
template struct As<std::vector<double>>;
template struct As<std::vector<std::string>>;


@ -154,6 +154,13 @@ private:
void makeScalar(const YAML::Node& v) {
elements_ = 0;
try {
// Cast the node to text first; that works for any scalar node. Then test that it does not consist of a single character
// that according to YAML could be a boolean value. Unfortunately, we do not have any type information at this point.
// This means we are disabling support for boolean values in YAML that are expressed with these characters.
auto asText = v.as<std::string>();
if(asText.size() == 1 && asText.find_first_of("nyNYtfTF") == 0) // @TODO: should we disallow other strings too?
throw YAML::BadConversion(YAML::Mark()); // gets picked up by the next catch block
value_ = v.as<bool>();
type_ = NodeType::Bool;
} catch(const YAML::BadConversion& /*e*/) {

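A small demonstration of the ambiguity the check above guards against (editor's sketch, not part of this commit): under yaml-cpp's YAML 1.1 boolean rules, a bare scalar like "y" converts to a boolean.

#include <yaml-cpp/yaml.h>
#include <iostream>

int main() {
  YAML::Node n = YAML::Load("y");
  std::cout << n.as<bool>() << "\n";         // prints 1: "y" silently parses as true
  std::cout << n.as<std::string>() << "\n";  // prints y: the reading FastOpt now prefers
  return 0;
}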
src/common/file_stream.cpp Normal file → Executable file


@ -1,5 +1,5 @@
#include "common/types.h"
#include "tensors/cpu/sharp/packed_gemm.h"
#include "tensors/cpu/fbgemm/packed_gemm.h"
namespace marian {
@ -8,13 +8,31 @@ namespace marian {
// But for opaque types like packed tensors, for instance, it cannot easily be inferred by
// multiplying. All cases are handled here and can later be passed to allocators etc.
size_t requiredBytes(const Shape& shape, Type type) {
if(isPacked(type)) {
uint64_t packsize;
cpu::variant::PackInfoFp32(shape, false, packsize);
return (size_t)packsize;
#if USE_FBGEMM
if (isPacked(type)) {
if (sizeOf(type) == 1) {
// type == Type::packed8avx2 || type == Type::packed8avx512
// AVX2 and AVX512 CPUs have different cache and vector lanes,
// so the optimal memory layouts for them are different.
int nrow, ncol;
uint64_t packsize;
cpu::variant::fbgemmPacked8PackInfo(shape, type, false, /*out=*/nrow, /*out=*/ncol, /*out=*/packsize);
return (size_t)packsize;
} else if (type == Type::packed16) {
uint64_t packsize;
cpu::variant::fbgemmPacked16PackInfo(shape, false, /*out=*/packsize);
return (size_t)packsize;
} else {
ABORT("Not a supported data type: {}", type);
return 0;
}
} else {
return shape.elements() * sizeOf(type);
}
#else
return shape.elements() * sizeOf(type);
#endif // USE_FBGEMM
}
}


@ -31,7 +31,7 @@
#include <cuda.h> // required to see CUDA_VERSION
#if (CUDA_VERSION > 9000)
#define COMPILE_FP16 1
#else
#else
#define COMPILE_FP16 0
#endif
#else
@ -135,13 +135,19 @@ do { \
namespace marian {
// small struct to enable templating based on types used for packing
struct packed8 {
struct packed16 {
uint16_t x;
};
// small struct to enable templating based on types used for packing. This is a memory holder.
// There's no difference between packed8avx2 and packed8avx512, but they are defined separately so they can be distinguished.
struct packed8avx2 {
uint8_t x;
};
// small struct to enable templating based on types used for packing
struct packed16 {
uint16_t x;
// small struct to enable templating based on types used for packing. This is a memory holder.
struct packed8avx512 {
uint8_t x;
};
#ifndef __CUDACC__ // vectorized types not available from .cu files
@ -174,6 +180,7 @@ public:
};
// @TODO: consider how code can be shared via templating
#ifdef __AVX__
struct float32x8 {
private:
__m256 f_;
@ -199,22 +206,35 @@ public:
return out;
}
};
#else
//Dummy version to get things to compile on older CPUs
struct float32x8 {
};
#endif
#endif
// Internal to types.h, don't use. Use test functions below.
enum class TypeClass : size_t {
signed_type = 0x100,
unsigned_type = 0x200,
float_type = 0x400,
packed_type = 0x800, // special packed (CPU cache friendly) type class, used in FBGEMM, not meant to be used anywhere else
size_mask = 0x0FF
signed_type = 0x0100,
unsigned_type = 0x0200,
float_type = 0x0400,
packed_type = 0x0800, // special packed (CPU cache friendly) type class, used in FBGEMM, not meant to be used anywhere else
avx2_type = 0x1000, // processor-specific layout for avx2, currently used for FBGEMM only
avx512_type = 0x2000, // processor-specific layout for avx512, currently used for FBGEMM only
size_mask = 0x00FF,
class_mask = 0xFF00
};
constexpr inline size_t operator+(TypeClass typeClass, size_t val) {
return (size_t)typeClass + val;
}
constexpr inline size_t operator+(size_t val, TypeClass typeClass) {
return val + (size_t)typeClass;
}
// @TODO: rename to ElementType when things become stable, so it's easier to review
enum class Type : size_t {
int8 = TypeClass::signed_type + 1u,
@ -231,14 +251,20 @@ enum class Type : size_t {
float32 = TypeClass::float_type + 4u,
float64 = TypeClass::float_type + 8u,
packed8 = TypeClass::packed_type + 1u, // special type for FBGEMM, not meant to be used anywhere else, not meant to be accessed individually. Internal actual type (uint8) is meaningless.
packed16 = TypeClass::packed_type + 2u // special type for FBGEMM, not meant to be used anywhere else, not meant to be accessed individually. Internal actual type (uint16) is meaningless.
packed16 = TypeClass::packed_type + 2u, // special type for FBGEMM, not meant to be used anywhere else, not meant to be accessed individually. Internal actual type (uint16) is meaningless.
packed8avx2 = TypeClass::packed_type + 1u + TypeClass::avx2_type, // special type for FBGEMM with AVX2, not meant to be used anywhere else, not meant to be accessed individually. Internal actual type (uint8) is meaningless.
packed8avx512 = TypeClass::packed_type + 1u + TypeClass::avx512_type, // special type for FBGEMM with AVX512, not meant to be used anywhere else, not meant to be accessed individually. Internal actual type (uint8) is meaningless.
};
static inline size_t operator&(TypeClass typeClass, Type type) {
return (size_t)typeClass & (size_t)type;
}
static inline bool isSameTypeClass(Type type1, Type type2) {
return (TypeClass::class_mask & type1) == (TypeClass::class_mask & type2);
}
static inline size_t sizeOf(Type type) {
return TypeClass::size_mask & type;
}
@ -263,6 +289,14 @@ static inline bool isPacked(Type type) {
return (TypeClass::packed_type & type) != 0;
}
static inline bool isAvx2(Type type) {
return (TypeClass::avx2_type & type) != 0;
}
static inline bool isAvx512(Type type) {
return (TypeClass::avx512_type & type) != 0;
}
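// [editor's illustrative note, not in the original file] e.g. Type::packed8avx512
// = 0x0800 + 1u + 0x2000 = 0x2801: sizeOf() yields 1 byte, isPacked() and
// isAvx512() hold, and isSameTypeClass(Type::packed8avx2, Type::packed8avx512)
// is false because class_mask (0xFF00) also covers the avx bits.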
size_t requiredBytes(const Shape& shape, Type type); // towards Frank's vision of joint Shape/Type
template <typename T>
@ -284,8 +318,9 @@ template <> inline bool matchType<float16>(Type type) { return type == Type::fl
template <> inline bool matchType<float>(Type type) { return type == Type::float32; }
template <> inline bool matchType<double>(Type type) { return type == Type::float64; }
template <> inline bool matchType<packed8>(Type type) { return type == Type::packed8; }
template <> inline bool matchType<packed16>(Type type) { return type == Type::packed16; }
template <> inline bool matchType<packed16>(Type type) { return type == Type::packed16; }
template <> inline bool matchType<packed8avx2>(Type type) { return type == Type::packed8avx2; }
template <> inline bool matchType<packed8avx512>(Type type) { return type == Type::packed8avx512; }
// clang-format on
static inline std::ostream& operator<<(std::ostream& out, Type type) {
@ -304,8 +339,9 @@ static inline std::ostream& operator<<(std::ostream& out, Type type) {
case Type::float32 : out << "float32"; break;
case Type::float64 : out << "float64"; break;
case Type::packed8 : out << "packed8"; break;
case Type::packed16: out << "packed16"; break;
case Type::packed16 : out << "packed16"; break;
case Type::packed8avx2 : out << "packed8avx2"; break;
case Type::packed8avx512 : out << "packed8avx512"; break;
}
return out;
}
@ -328,8 +364,9 @@ template <> inline std::string request<float16>() { return "float16"; }
template <> inline std::string request<float>() { return "float32"; }
template <> inline std::string request<double>() { return "float64"; }
template <> inline std::string request<packed8>() { return "packed8"; }
template <> inline std::string request<packed16>() { return "packed16"; }
template <> inline std::string request<packed8avx2>() { return "packed8avx2"; }
template <> inline std::string request<packed8avx512>() { return "packed8avx512"; }
// clang-format on
static Type inline typeFromString(const std::string& str) {
@ -357,6 +394,13 @@ static Type inline typeFromString(const std::string& str) {
return Type::float32;
if(str == "float64")
return Type::float64;
if(str == "packed16")
return Type::packed16;
if(str == "packed8avx2")
return Type::packed8avx2;
if(str == "packed8avx512")
return Type::packed8avx512;
ABORT("Unknown type {}", str);
}
@ -378,6 +422,10 @@ template <> inline Type typeId<float16>() { return Type::float16; }
template <> inline Type typeId<float>() { return Type::float32; }
template <> inline Type typeId<double>() { return Type::float64; }
template <> inline Type typeId<packed16>() { return Type::packed16; }
template <> inline Type typeId<packed8avx2>() { return Type::packed8avx2; }
template <> inline Type typeId<packed8avx512>() { return Type::packed8avx512; }
// Abort if given C++ does not correspond to runtime type
template <typename T>
void matchOrAbort(Type type) {


@ -8,9 +8,7 @@ namespace data {
WordAlignment::WordAlignment() {}
WordAlignment::WordAlignment(
const std::vector<Point>& align)
: data_(align) {}
WordAlignment::WordAlignment(const std::vector<Point>& align) : data_(align) {}
WordAlignment::WordAlignment(const std::string& line) {
std::vector<std::string> atok = utils::splitAny(line, " -");

src/data/corpus.cpp Normal file → Executable file

@ -13,17 +13,17 @@ namespace data {
Corpus::Corpus(Ptr<Options> options, bool translate /*= false*/)
: CorpusBase(options, translate),
shuffleInRAM_(options_->get<bool>("shuffle-in-ram")),
allCapsEvery_(options_->get<size_t>("all-caps-every")),
titleCaseEvery_(options_->get<size_t>("english-title-case-every")) {}
shuffleInRAM_(options_->get<bool>("shuffle-in-ram", false)),
allCapsEvery_(options_->get<size_t>("all-caps-every", 0)),
titleCaseEvery_(options_->get<size_t>("english-title-case-every", 0)) {}
Corpus::Corpus(std::vector<std::string> paths,
std::vector<Ptr<Vocab>> vocabs,
Ptr<Options> options)
: CorpusBase(paths, vocabs, options),
shuffleInRAM_(options_->get<bool>("shuffle-in-ram")),
allCapsEvery_(options_->get<size_t>("all-caps-every")),
titleCaseEvery_(options_->get<size_t>("english-title-case-every")) {}
shuffleInRAM_(options_->get<bool>("shuffle-in-ram", false)),
allCapsEvery_(options_->get<size_t>("all-caps-every", 0)),
titleCaseEvery_(options_->get<size_t>("english-title-case-every", 0)) {}
void Corpus::preprocessLine(std::string& line, size_t streamId) {
if (allCapsEvery_ != 0 && pos_ % allCapsEvery_ == 0 && !inference_) {
@ -235,11 +235,12 @@ CorpusBase::batch_ptr Corpus::toBatch(const std::vector<Sample>& batchVector) {
}
std::vector<size_t> words(maxDims.size(), 0);
for(size_t i = 0; i < batchSize; ++i) {
for(size_t j = 0; j < maxDims.size(); ++j) {
for(size_t k = 0; k < batchVector[i][j].size(); ++k) {
subBatches[j]->data()[k * batchSize + i] = batchVector[i][j][k];
subBatches[j]->mask()[k * batchSize + i] = 1.f;
for(size_t b = 0; b < batchSize; ++b) { // loop over batch entries
for(size_t j = 0; j < maxDims.size(); ++j) { // loop over streams
auto subBatch = subBatches[j];
for(size_t s = 0; s < batchVector[b][j].size(); ++s) { // loop over word positions
subBatch->data()[subBatch->locate(/*batchIdx=*/b, /*wordPos=*/s)/*s * batchSize + b*/] = batchVector[b][j][s];
subBatch->mask()[subBatch->locate(/*batchIdx=*/b, /*wordPos=*/s)/*s * batchSize + b*/] = 1.f;
words[j]++;
}
}


@ -1,6 +1,7 @@
#include <random>
#include "data/corpus.h"
#include "data/factored_vocab.h"
namespace marian {
namespace data {
@ -84,19 +85,19 @@ CorpusBase::CorpusBase(Ptr<Options> options, bool translate)
"Vocabularies will be built separately for each file.");
std::vector<int> vocabDims(paths_.size(), 0);
std::vector<std::string> vocabPaths(paths_.size());
std::vector<std::string> vocabPaths1(paths_.size());
// Create vocabs if not provided
for(size_t i = 0; i < paths_.size(); ++i) {
Ptr<Vocab> vocab = New<Vocab>(options_, i);
std::vector<std::string> trainPaths = { paths_[i] };
vocabDims[i] = vocab->loadOrCreate("", trainPaths, maxVocabs[i]);
vocabPaths[i] = paths_[i] + ".yml";
vocabDims[i] = (int) vocab->loadOrCreate("", trainPaths, maxVocabs[i]);
vocabPaths1[i] = paths_[i] + ".yml";
vocabs_.emplace_back(vocab);
}
// TODO: this is not nice as it modifies the option object and needs to expose the changes
// outside the corpus as models need to know about the vocabulary size; extract the vocab
// creation functionality from the class.
options_->set("dim-vocabs", vocabDims, "vocabs", vocabPaths);
options_->set("dim-vocabs", vocabDims, "vocabs", vocabPaths1);
} else {
// Load all vocabs
size_t numVocs = vocabPaths.size();
@ -128,7 +129,7 @@ CorpusBase::CorpusBase(Ptr<Options> options, bool translate)
// it will not be created again, but just correctly loaded.
auto pathsAndSize = groupVocab[vocabPaths[i]];
std::vector<std::string> groupedPaths(pathsAndSize.paths.begin(), pathsAndSize.paths.end());
vocabDims[i] = vocab->loadOrCreate(vocabPaths[i], groupedPaths, pathsAndSize.size);
vocabDims[i] = (int) vocab->loadOrCreate(vocabPaths[i], groupedPaths, pathsAndSize.size);
vocabs_.emplace_back(vocab);
}
// TODO: this is not nice as it modifies the option object and needs to expose the changes
@ -150,7 +151,7 @@ CorpusBase::CorpusBase(Ptr<Options> options, bool translate)
vocabDims.resize(numVocs, 0);
for(size_t i = 0; i + 1 < numVocs; ++i) {
Ptr<Vocab> vocab = New<Vocab>(options_, i);
vocabDims[i] = vocab->load(vocabPaths[i], maxVocabs[i]);
vocabDims[i] = (int) vocab->load(vocabPaths[i], maxVocabs[i]);
vocabs_.emplace_back(vocab);
}
// TODO: As above, this is not nice as it modifies the option object and needs to expose the changes
@ -240,10 +241,10 @@ void CorpusBase::addWeightsToSentenceTuple(const std::string& line, SentenceTupl
if(!elements.empty()) {
std::vector<float> weights;
for(auto& e : elements) {
if(maxLengthCrop_ && weights.size() > maxLength_)
for(auto& e : elements) { // Iterate weights as strings
if(maxLengthCrop_ && weights.size() >= maxLength_) // Cut if the input is going to be cut
break;
weights.emplace_back(std::stof(e));
weights.emplace_back(std::stof(e)); // Add a weight converted into float
}
if(rightLeft_)
@ -330,5 +331,54 @@ void CorpusBase::initEOS(bool training = true) {
}
}
// experimental: hide inline-fix source tokens from cross attention
std::vector<float> SubBatch::crossMaskWithInlineFixSourceSuppressed() const
{
const auto& srcVocab = *vocab();
auto factoredVocab = vocab()->tryAs<FactoredVocab>();
size_t inlineFixGroupIndex = 0, inlineFixSrc = 0;
auto hasInlineFixFactors = factoredVocab && factoredVocab->tryGetFactor(FactoredVocab_INLINE_FIX_WHAT_serialized, /*out*/ inlineFixGroupIndex, /*out*/ inlineFixSrc);
auto fixSrcId = srcVocab[FactoredVocab_FIX_SRC_ID_TAG];
auto fixTgtId = srcVocab[FactoredVocab_FIX_TGT_ID_TAG];
auto fixEndId = srcVocab[FactoredVocab_FIX_END_ID_TAG];
auto unkId = srcVocab.getUnkId();
auto hasInlineFixTags = fixSrcId != unkId && fixTgtId != unkId && fixEndId != unkId;
auto m = mask(); // default return value, which we will modify in-place below in case we need to
if (hasInlineFixFactors || hasInlineFixTags) {
LOG_ONCE(info, "[data] Suppressing cross-attention into inline-fix source tokens");
// example: force French translation of name "frank" to always be "franck"
// - hasInlineFixFactors: "frank|is franck|it", "frank|is" cannot be cross-attended to
// - hasInlineFixTags: "<IOPEN> frank <IDELIM> franck <ICLOSE>", "frank" and all tags cannot be cross-attended to
auto dimBatch = batchSize(); // number of sentences in the batch
auto dimWidth = batchWidth(); // number of words in the longest sentence in the batch
const auto& d = data();
size_t numWords = 0;
for (size_t b = 0; b < dimBatch; b++) { // loop over batch entries
bool inside = false;
for (size_t s = 0; s < dimWidth; s++) { // loop over source positions
auto i = locate(/*batchIdx=*/b, /*wordPos=*/s);
if (!m[i])
break;
numWords++;
// keep track of entering/exiting the inline-fix source tags
auto w = d[i];
if (w == fixSrcId)
inside = true;
else if (w == fixTgtId)
inside = false;
bool wHasSrcIdFactor = hasInlineFixFactors && factoredVocab->getFactor(w, inlineFixGroupIndex) == inlineFixSrc;
if (inside || w == fixSrcId || w == fixTgtId || w == fixEndId || wHasSrcIdFactor)
m[i] = 0.0f; // decoder must not look at embedded source, nor the markup tokens
}
}
ABORT_IF(batchWords() != 0/*n/a*/ && numWords != batchWords(), "batchWords() inconsistency??");
}
return m;
}
} // namespace data
} // namespace marian

src/data/corpus_base.h Normal file → Executable file

@ -143,12 +143,19 @@ public:
* words (width) and \f$s\f$ is the number of sentences (size).
*/
Words& data() { return indices_; }
const Words& data() const { return indices_; }
/**
* @brief compute flat index into data() and mask() vectors for given batch index and word index in sentence
*/
size_t locate(size_t batchIdx, size_t wordPos) const { return locate(batchIdx, wordPos, size_); }
static size_t locate(size_t batchIdx, size_t wordPos, size_t batchSize) { return wordPos * batchSize + batchIdx; }
/**
* @brief Flat masking vector; 0 is used for masked words.
*
* @see data()
*/
std::vector<float>& mask() { return mask_; }
const std::vector<float>& mask() const { return mask_; }
/**
* @brief Accessors to the vocab_ field.
@ -158,15 +165,15 @@ public:
/**
* @brief The number of sentences in the batch.
*/
size_t batchSize() { return size_; }
size_t batchSize() const { return size_; }
/**
* @brief The number of words in the longest sentence in the batch.
*/
size_t batchWidth() { return width_; };
size_t batchWidth() const { return width_; };
/**
* @brief The total number of words in the batch (not counting masked-out words).
*/
size_t batchWords() { return words_; }
size_t batchWords() const { return words_; }
/**
* @brief Splits the stream into sub-batches of equal size (except for last).
@ -179,7 +186,7 @@ public:
*
* @see marian::data::Batch::split(size_t n)
*/
std::vector<Ptr<SubBatch>> split(size_t n, size_t sizeLimit /*or SIZE_MAX*/) {
std::vector<Ptr<SubBatch>> split(size_t n, size_t sizeLimit /*or SIZE_MAX*/) const {
ABORT_IF(size_ == 0, "Encountered sub-batch size of 0");
auto size = std::min(size_, sizeLimit); // if limit is given then pretend the batch only has that many sentences
@ -191,26 +198,24 @@ public:
// determine actual width (=max length) of this sub-batch, which may be smaller than the overall max length
size_t subWidth = 0;
for(size_t j = 0; j < width_; ++j) {
for(size_t i = 0; i < subSize; ++i) {
if(mask_[j * size_ + (pos + i)] != 0)
if (subWidth < j + 1)
subWidth = j + 1;
for(size_t s = 0; s < width_; ++s) {
for(size_t b = 0; b < subSize; ++b) {
if(mask_[locate(/*batchIdx=*/pos + b, /*wordPos=*/s)] != 0) // s * size_ + (pos + b)
if (subWidth < s + 1)
subWidth = s + 1;
}
}
//if (subWidth < width_)
// LOG(info, "[data] sub-batch {} of {} wide batch has effective width of {}", pos / targetSize, width_, subWidth);
// create sub-batch
auto sb = New<SubBatch>(subSize, subWidth, vocab_);
size_t words = 0;
for(size_t j = 0; j < subWidth; ++j) {
for(size_t i = 0; i < subSize; ++i) {
sb->data()[j * subSize + i] = indices_[j * size_ + (pos + i)];
sb->mask()[j * subSize + i] = mask_[j * size_ + (pos + i)];
for(size_t s = 0; s < subWidth; ++s) {
for(size_t b = 0; b < subSize; ++b) {
sb->data()[locate(/*batchIdx=*/b, /*wordPos=*/s, /*batchSize=*/subSize)/*s * subSize + b*/] = indices_[locate(/*batchIdx=*/pos + b, /*wordPos=*/s)]; // s * size_ + (pos + b)
sb->mask()[locate(/*batchIdx=*/b, /*wordPos=*/s, /*batchSize=*/subSize)/*s * subSize + b*/] = mask_[locate(/*batchIdx=*/pos + b, /*wordPos=*/s)]; // s * size_ + (pos + b)
if(mask_[j * size_ + (pos + i)] != 0)
if(mask_[locate(/*batchIdx=*/pos + b, /*wordPos=*/s)/*s * size_ + (pos + b)*/] != 0)
words++;
}
}
@ -222,6 +227,9 @@ public:
}
void setWords(size_t words) { words_ = words; }
// experimental: hide inline-fix source tokens from cross attention
std::vector<float> crossMaskWithInlineFixSourceSuppressed() const;
};
/**
@ -231,7 +239,7 @@ public:
class CorpusBatch : public Batch {
protected:
std::vector<Ptr<SubBatch>> subBatches_;
std::vector<float> guidedAlignment_;
std::vector<float> guidedAlignment_; // [max source len, batch size, max target len] flattened
std::vector<float> dataWeights_;
public:
@ -302,7 +310,8 @@ public:
/**
* @brief Creates a batch filled with fake data. Used to determine the size of
* the batch object.
* the batch object. With guided-alignments and multiple encoders, those
* multiple source streams are expected to have the same lengths.
*
* @param lengths List of subbatch sizes.
* @param batchSize Number of sentences in the batch.
@ -335,6 +344,7 @@ public:
return batch;
if(options->get("guided-alignment", std::string("none")) != "none") {
// @TODO: if > 1 encoder, verify that all encoders have the same sentence lengths
std::vector<float> alignment(batchSize * lengths.front() * lengths.back(),
0.f);
batch->setGuidedAlignment(std::move(alignment));
@ -369,7 +379,7 @@ public:
// split each stream separately
for(auto batchStream : subBatches_) {
size_t i = 0; // index into split batch
for(auto splitSubBatch : batchStream->split(n, sizeLimit)) {
for(auto splitSubBatch : batchStream->split(n, sizeLimit)) { // splits a batch into pieces, can also change width
if(subs.size() <= i)
subs.resize(i + 1);
subs[i++].push_back(splitSubBatch); // this forms tuples across streams
@ -408,7 +418,7 @@ public:
size_t bi = i + pos;
for(size_t sid = 0; sid < srcWords; ++sid) {
for(size_t tid = 0; tid < trgWords; ++tid) {
size_t bidx = sid * oldSize * oldTrgWords + bi * oldTrgWords + tid;
size_t bidx = sid * oldSize * oldTrgWords + bi * oldTrgWords + tid; // [sid, bi, tid]
size_t idx = sid * dimBatch * trgWords + i * trgWords + tid;
aligns[idx] = guidedAlignment_[bidx];
}
@ -424,20 +434,19 @@ public:
if(!dataWeights_.empty()) {
size_t oldSize = size();
size_t width = 1;
// There are more weights than sentences, i.e. these are word weights.
if(dataWeights_.size() != oldSize)
width = subBatches_.back()->batchWidth();
for(auto split : splits) {
auto cb = std::static_pointer_cast<CorpusBatch>(split);
size_t width = 1; // One weight per sentence in case of sentence-level weights
if(dataWeights_.size() != oldSize) // if number of weights does not correspond to number of sentences we have word-level weights
width = cb->back()->batchWidth(); // splitting also affects width, hence we need to accommodate this here
std::vector<float> ws(width * split->size(), 1.0f);
// this needs to be split along the batch dimension
// which is here the innermost dimension.
// Should work for sentence-based weights, too.
for(size_t j = 0; j < width; ++j) {
for(size_t i = 0; i < split->size(); ++i) {
ws[j * split->size() + i] = dataWeights_[j * oldSize + i + pos];
for(size_t s = 0; s < width; ++s) {
for(size_t b = 0; b < split->size(); ++b) {
ws[s * split->size() + b] = dataWeights_[s * oldSize + b + pos]; // @TODO: use locate() as well
}
}
split->setDataWeights(ws);
@ -448,9 +457,13 @@ public:
return splits;
}
std::vector<float>& getGuidedAlignment() { return guidedAlignment_; }
const std::vector<float>& getGuidedAlignment() const { return guidedAlignment_; } // [dimSrcWords, dimBatch, dimTrgWords] flattened
void setGuidedAlignment(std::vector<float>&& aln) override {
guidedAlignment_ = std::move(aln);
}
size_t locateInGuidedAlignments(size_t b, size_t s, size_t t) {
return ((s * size()) + b) * widthTrg() + t;
}
std::vector<float>& getDataWeights() { return dataWeights_; }
@ -472,15 +485,14 @@ public:
std::cerr << std::endl;
}
size_t b = 0;
size_t subBatchIndex = 0;
for(auto sb : subBatches_) {
std::cerr << "batch " << b++ << ": " << std::endl;
std::cerr << "stream " << subBatchIndex++ << ": " << std::endl;
const auto& vocab = sb->vocab();
for(size_t i = 0; i < sb->batchWidth(); i++) {
for(size_t s = 0; s < sb->batchWidth(); s++) {
std::cerr << "\t w: ";
for(size_t j = 0; j < sb->batchSize(); j++) {
size_t idx = i * sb->batchSize() + j;
Word w = sb->data()[idx];
for(size_t b = 0; b < sb->batchSize(); b++) {
Word w = sb->data()[sb->locate(/*batchIdx=*/b, /*wordPos=*/s)]; // s * sb->batchSize() + b;
if (vocab && !printIndices)
std::cerr << (*vocab)[w] << " ";
else


@ -400,7 +400,7 @@ std::string FactoredVocab::word2string(Word word) const {
res.append("?");
}
else
res.append(factorVocab_[(WordIndex)(index + groupRanges_[g].first)]);
res.append(getFactorName(g, index));
}
return res;
}
@ -431,6 +431,21 @@ Word FactoredVocab::string2word(const std::string& w) const {
return word;
}
// does a specific factor exist in the vocabulary
// Factor name must be given without separator. This function cannot be used for lemmas.
bool FactoredVocab::tryGetFactor(const std::string& factorName, size_t& groupIndex, size_t& factorIndex) const {
WordIndex u;
if (factorVocab_.tryFind(factorSeparator_ + factorName, u))
{
groupIndex = factorGroups_[u];
ABORT_IF(u < groupRanges_[groupIndex].first || u >= groupRanges_[groupIndex].second, "Invalid factorGroups_ entry??");
factorIndex = u - groupRanges_[groupIndex].first;
return true;
}
else
return false;
}
// extract the factor index of a given factor type from the 'Word' representation
size_t FactoredVocab::getFactor(Word word, size_t groupIndex) const {
size_t index = word.toWordIndex();
@ -565,12 +580,18 @@ void FactoredVocab::constructNormalizationInfoForVocab() {
// decode a 'Word' array into the external string representation of that token sequence, as written to output files
/*virtual*/ std::string FactoredVocab::decode(const Words& sentence, bool ignoreEOS /*= true*/) const /*override final*/ {
std::vector<std::string> decoded;
decoded.reserve(sentence.size());
for(auto w : sentence) {
std::vector<std::string> decoded; decoded.reserve(sentence.size());
for(auto w : sentence)
if((w != getEosId() || !ignoreEOS))
decoded.push_back((*this)[w]);
}
return utils::join(decoded, " ");
}
// diagnostics version of decode() that will not fail on partial words, will print EOS, and is a little slower
std::string FactoredVocab::decodeForDiagnostics(const Words& sentence) const {
std::vector<std::string> decoded; decoded.reserve(sentence.size());
for (auto w : sentence)
decoded.push_back(word2string(w));
return utils::join(decoded, " ");
}
@ -740,7 +761,7 @@ Ptr<IVocab> createFactoredVocab(const std::string& vocabPath) {
static std::map<std::string, Ptr<IVocab>> s_cache;
auto iter = s_cache.find(vocabPath);
if (iter != s_cache.end()) {
LOG(info, "[vocab] Reusing existing vocabulary object in memory (vocab size {})", iter->second->size());
LOG_ONCE(info, "[vocab] Reusing existing vocabulary object in memory (vocab size {})", iter->second->size());
return iter->second;
}
auto vocab = New<FactoredVocab>();

src/data/factored_vocab.h Normal file → Executable file

@ -66,6 +66,9 @@ public:
bool canExpandFactoredWord(Word word, size_t groupIndex) const { return lemmaHasFactorGroup(getFactor(word, 0), groupIndex); }
size_t getFactor(Word word, size_t groupIndex) const;
bool lemmaHasFactorGroup(size_t factor0Index, size_t g) const { return lemmaHasFactorGroup_[factor0Index][g]; }
const std::string& getFactorGroupPrefix(size_t groupIndex) const { return groupPrefixes_[groupIndex]; } // for diagnostics only
const std::string& getFactorName(size_t groupIndex, size_t factorIndex) const { return factorVocab_[(WordIndex)(factorIndex + groupRanges_[groupIndex].first)]; }
std::string decodeForDiagnostics(const Words& sentence) const;
static constexpr size_t FACTOR_NOT_APPLICABLE = (SIZE_MAX - 1);
static constexpr size_t FACTOR_NOT_SPECIFIED = (SIZE_MAX - 2);
@ -74,6 +77,17 @@ public:
static Ptr<FactoredVocab> tryCreateAndLoad(const std::string& path); // load from "vocab" option if it specifies a factored vocab
std::string word2string(Word word) const;
Word string2word(const std::string& w) const;
bool tryGetFactor(const std::string& factorGroupName, size_t& groupIndex, size_t& factorIndex) const; // note: factorGroupName given without separator
// some hard-coded constants from FactoredSegmenter
// The naming mimics the names in FactoredSegmenter.cs, and therefore intentionally does not follow Marian conventions.
// @TODO: We have more hard-coded constants throughout the code. Move them all here.
// @TODO: figure out how to do this with static const*/constexpr
#define FactoredVocab_INLINE_FIX_WHAT_serialized "is"
#define FactoredVocab_FIX_SRC_ID_TAG "<IOPEN>"
#define FactoredVocab_FIX_TGT_ID_TAG "<IDELIM>"
#define FactoredVocab_FIX_END_ID_TAG "<ICLOSE>"
private:
void constructGroupInfoFromFactorVocab();
void constructFactorIndexConversion();

View File

@ -36,17 +36,18 @@ private:
std::mt19937 generator_;
std::uniform_int_distribution<int> randInt_; // from 0 to INT_MAX
// Keeps sentences segmented into subword units
bool keepEncoded_{false};
// Sample from one file, based on first algorithm from:
// https://en.wikipedia.org/wiki/Reservoir_sampling
void reservoirSampling(std::vector<std::string>& sample, size_t& seenLines,
const std::string& trainPath, size_t maxLines, size_t maxBytes) {
ABORT_IF(maxLines == 0, "Sample needs to be larger than 0");
std::unique_ptr<std::istream> trainStrm(
trainPath == "stdin" ? new std::istream(std::cin.rdbuf())
: new io::InputFileStream(trainPath)
);
std::unique_ptr<std::istream> trainStrm(trainPath == "stdin"
? new std::istream(std::cin.rdbuf())
: new io::InputFileStream(trainPath));
std::string line;
while(getline(*trainStrm, line)) {
@ -109,8 +110,10 @@ private:
public:
SentencePieceVocab(Ptr<Options> options, size_t batchIndex)
: options_(options), batchIndex_(batchIndex), generator_((uint32_t)Config::seed) {
: options_(options),
batchIndex_(batchIndex),
generator_((uint32_t)Config::seed),
keepEncoded_(options->get<bool>("no-spm-decode", false)) {
if(options_->has("sentencepiece-alphas")) {
auto alphas = options_->get<std::vector<float>>("sentencepiece-alphas");
if(alphas.size() <= batchIndex)
@ -221,11 +224,18 @@ public:
std::string decode(const Words& sentence, bool /*ignoreEOS*/) const override {
std::string line;
// convert vector of Word to vector of int
std::vector<int> spmSentence; spmSentence.reserve(sentence.size());
for (auto&& word : sentence)
spmSentence.push_back(word.toWordIndex());
spm_->Decode(spmSentence, &line);
if(keepEncoded_) { // i.e. keep the sentence segmented into subword units
for(const Word& id : sentence)
line += (*this)[id] + " ";
line.pop_back(); // trim the trailing whitespace
} else {
// convert vector of Word to vector of int
std::vector<int> spmSentence;
spmSentence.reserve(sentence.size());
for(auto&& word : sentence)
spmSentence.push_back(word.toWordIndex());
spm_->Decode(spmSentence, &line);
}
return line;
}
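// Illustration with hypothetical pieces {"▁Hel", "lo", "▁world"}:
//   keepEncoded_ == true  -> "▁Hel lo ▁world"  (subword units kept, space-joined)
//   keepEncoded_ == false -> "Hello world"     (spm_->Decode merges the pieces)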

src/data/vocab.cpp Normal file → Executable file

src/data/vocab_base.h Normal file → Executable file


@ -319,7 +319,7 @@ struct Ops<float32x4> {
} // end namespace functional
} // end namespace marian
#ifdef __AVX__
#include "3rd_party/avx_mathfun.h"
namespace marian {
@ -438,7 +438,7 @@ struct Ops<float32x8> {
} // end namespace functional
} // end namespace marian
#endif
#endif // of "#ifndef __CUDACC__"
#ifdef __CUDACC__
@ -600,4 +600,4 @@ BINARY(sPReLU, PReLU, Ops<ElementType>::prelu(x, y));
BINARY(sPReLUBack, PReLUback, Ops<ElementType>::preluBack(x, y));
} // end namespace functional
} // end namespace marian


@ -145,20 +145,20 @@ struct ConstantShape {
HOST_DEVICE_INLINE int elements() const { return (int)elements_; }
// The following functions iterate over shape dimensions and use resursive
// The following functions iterate over shape dimensions and use recursive
// templates. They unroll over a compile-time defined number of dimensions.
// Struct for recurrent template calls over shape dimensions,
// version for K > 0
template <const int K, const int D> struct I {
HOST_DEVICE_INLINE static int index(const Array<int, D>& dims,
const Array<int, D>& stride) {
const Array<int, D>& stride) {
return dims[K] * stride[K] + I<K-1, D>::index(dims, stride);
}
HOST_DEVICE_INLINE static int index(int si,
const Array<int, D>& shape,
const Array<int, D>& stride) {
const Array<int, D>& shape,
const Array<int, D>& stride) {
return (si % shape[K]) * stride[K] + I<K-1, D>::index(si / shape[K], shape, stride);
}
@ -175,19 +175,19 @@ struct ConstantShape {
// specialization for K == 0
template <const int D> struct I<0, D> {
HOST_DEVICE_INLINE static int index(const Array<int, D>& dims,
const Array<int, D>& stride) {
const Array<int, D>& stride) {
return dims[0] * stride[0];
}
HOST_DEVICE_INLINE static int index(int si,
const Array<int, D>& shape,
const Array<int, D>& stride) {
const Array<int, D>& shape,
const Array<int, D>& stride) {
return (si % shape[0]) * stride[0];
}
HOST_DEVICE_INLINE static void dims(int si,
Array<int, D>& dims,
const Array<int, D>& shape) {
Array<int, D>& dims,
const Array<int, D>& shape) {
dims[0] = si % shape[0];
}
};


@ -7,8 +7,8 @@
namespace marian {
namespace functional {
// By default for single valued types like float do nothing. Usually the number of elements in a tensor
// is correctly mirrored in the shape object. Only special multi-element types like float32x4 (4 floats),
// float32x8 (8 floats) and half2 (2 half) require special handling done by specializations below.
// Similar for multi-element integer types to be added later.
template <typename T>
@ -31,7 +31,7 @@ inline marian::Shape adapt<float32x4>(const marian::Shape& shape) {
x4Shape.set(-1, shape[-1] / 4);
return x4Shape;
}
#ifdef __AVX__
template <>
inline marian::Shape adapt<float32x8>(const marian::Shape& shape) {
ABORT_IF(shape[-1] % 8 != 0,
@ -42,7 +42,7 @@ inline marian::Shape adapt<float32x8>(const marian::Shape& shape) {
x8Shape.set(-1, shape[-1] / 8);
return x8Shape;
}
#endif
template <typename T, const int D>
@ -211,4 +211,4 @@ template <typename T>
using Tensor = View<T, CONST_SHAPE_DIMS>;
} // namespace functional
} // namespace marian


@ -9,136 +9,155 @@
namespace marian {
namespace functional {
template <size_t K, class Functor>
// This struct and its specializations are never used directly, only through apply and applyWithCast below.
template <size_t K, class Functor, typename AccType> // K-ary application of Functor, elements are cast to AccType before application of Functor
struct FApply {};
template <class Functor>
struct FApply<1, Functor> {
template <class Functor, typename AccType>
struct FApply<1, Functor, AccType> {
template <typename ElementType>
HOST_DEVICE_INLINE static ElementType apply(
HOST_DEVICE_INLINE static AccType apply(
Functor functor,
functional::Array<functional::Tensor<ElementType>, 1>& in,
const functional::Array<int, 1>& indices) {
return functor(in[0].data()[indices[0]]);
return functor((AccType)in[0].data()[indices[0]]); // indices is an array of offsets into multiple tensors, index[i] corresponds in[i] based on up to arity K
}
template <typename ElementType>
HOST_DEVICE_INLINE static ElementType apply(
HOST_DEVICE_INLINE static AccType apply(
Functor functor,
functional::Array<functional::Tensor<ElementType>, 1>& in,
int index) {
return functor(in[0].data()[index]);
return functor((AccType)in[0].data()[index]);
}
};
template <class Functor>
struct FApply<2, Functor> {
template <class Functor, typename AccType>
struct FApply<2, Functor, AccType> {
template <typename ElementType>
HOST_DEVICE_INLINE static ElementType apply(
HOST_DEVICE_INLINE static AccType apply(
Functor functor,
functional::Array<functional::Tensor<ElementType>, 2>& in,
const functional::Array<int, 2>& indices) {
return functor(in[0].data()[indices[0]],
in[1].data()[indices[1]]);
return functor((AccType)in[0].data()[indices[0]],
(AccType)in[1].data()[indices[1]]);
}
template <typename ElementType>
HOST_DEVICE_INLINE static ElementType apply(
HOST_DEVICE_INLINE static AccType apply(
Functor functor,
functional::Array<functional::Tensor<ElementType>, 2>& in,
int index) {
return functor(in[0].data()[index],
in[1].data()[index]);
return functor((AccType)in[0].data()[index],
(AccType)in[1].data()[index]);
}
};
template <class Functor>
struct FApply<3, Functor> {
template <class Functor, typename AccType>
struct FApply<3, Functor, AccType> {
template <typename ElementType>
HOST_DEVICE_INLINE static ElementType apply(
HOST_DEVICE_INLINE static AccType apply(
Functor functor,
functional::Array<functional::Tensor<ElementType>, 3>& in,
const functional::Array<int, 3>& indices) {
return functor(in[0].data()[indices[0]],
in[1].data()[indices[1]],
in[2].data()[indices[2]]);
return functor((AccType)in[0].data()[indices[0]],
(AccType)in[1].data()[indices[1]],
(AccType)in[2].data()[indices[2]]);
}
template <typename ElementType>
HOST_DEVICE_INLINE static ElementType apply(
HOST_DEVICE_INLINE static AccType apply(
Functor functor,
functional::Array<functional::Tensor<ElementType>, 3>& in,
int index) {
return functor(in[0].data()[index],
in[1].data()[index],
in[2].data()[index]);
return functor((AccType)in[0].data()[index],
(AccType)in[1].data()[index],
(AccType)in[2].data()[index]);
}
};
template <class Functor>
struct FApply<4, Functor> {
template <class Functor, typename AccType>
struct FApply<4, Functor, AccType> {
template <typename ElementType>
HOST_DEVICE_INLINE static ElementType apply(
HOST_DEVICE_INLINE static AccType apply(
Functor functor,
functional::Array<functional::Tensor<ElementType>, 4>& in,
const functional::Array<int, 4>& indices) {
return functor(in[0].data()[indices[0]],
in[1].data()[indices[1]],
in[2].data()[indices[2]],
in[3].data()[indices[3]]);
return functor((AccType)in[0].data()[indices[0]],
(AccType)in[1].data()[indices[1]],
(AccType)in[2].data()[indices[2]],
(AccType)in[3].data()[indices[3]]);
}
template <typename ElementType>
HOST_DEVICE_INLINE static ElementType apply(
HOST_DEVICE_INLINE static AccType apply(
Functor functor,
functional::Array<functional::Tensor<ElementType>, 4>& in,
int index) {
return functor(in[0].data()[index],
in[1].data()[index],
in[2].data()[index],
in[3].data()[index]);
return functor((AccType)in[0].data()[index],
(AccType)in[1].data()[index],
(AccType)in[2].data()[index],
(AccType)in[3].data()[index]);
}
};
template <class Functor>
struct FApply<5, Functor> {
template <class Functor, typename AccType>
struct FApply<5, Functor, AccType> {
template <typename ElementType>
HOST_DEVICE_INLINE static ElementType apply(
HOST_DEVICE_INLINE static AccType apply(
Functor functor,
functional::Array<functional::Tensor<ElementType>, 5>& in,
const functional::Array<int, 5>& indices) {
return functor(in[0].data()[indices[0]],
in[1].data()[indices[1]],
in[2].data()[indices[2]],
in[3].data()[indices[3]],
in[4].data()[indices[4]]);
return functor((AccType)in[0].data()[indices[0]],
(AccType)in[1].data()[indices[1]],
(AccType)in[2].data()[indices[2]],
(AccType)in[3].data()[indices[3]],
(AccType)in[4].data()[indices[4]]);
}
template <typename ElementType>
HOST_DEVICE_INLINE static ElementType apply(
HOST_DEVICE_INLINE static AccType apply(
Functor functor,
functional::Array<functional::Tensor<ElementType>, 5>& in,
int index) {
return functor(in[0].data()[index],
in[1].data()[index],
in[2].data()[index],
in[3].data()[index],
in[4].data()[index]);
return functor((AccType)in[0].data()[index],
(AccType)in[1].data()[index],
(AccType)in[2].data()[index],
(AccType)in[3].data()[index],
(AccType)in[4].data()[index]);
}
};
template <size_t K, class Functor, typename ElementType>
/******************************************************************************/
// Applying functor to sets of K tensors
template <typename ElementType, size_t K, class Functor>
HOST_DEVICE_INLINE ElementType apply(Functor functor,
functional::Array<functional::Tensor<ElementType>, K>& in,
const functional::Array<int, K>& indices) {
return FApply<K, Functor>::apply(functor, in, indices);
return FApply<K, Functor, ElementType>::apply(functor, in, indices); // functor is applied to same type as input ElementType, no casting required
}
template <size_t K, class Functor, typename ElementType>
template <typename ElementType, size_t K, class Functor>
HOST_DEVICE_INLINE ElementType apply(Functor functor,
functional::Array<functional::Tensor<ElementType>, K>& in,
int index) {
return FApply<K, Functor>::apply(functor, in, index);
return FApply<K, Functor, ElementType>::apply(functor, in, index); // functor is applied to same type as input ElementType, no casting required
}
template <typename AccType, typename ElementType, size_t K, class Functor>
HOST_DEVICE_INLINE AccType applyWithCast(Functor functor,
functional::Array<functional::Tensor<ElementType>, K>& in,
const functional::Array<int, K>& indices) {
return FApply<K, Functor, AccType>::apply(functor, in, indices); // ElementType and AccType are potentially different, cast to AccType before applying functor.
// This is useful when accumulating e.g. 16-bit into 32-bit and we want to cast to 32-bit before
// the functor is applied. L2-Norm is a good use-case since the square can be large.
}
template <typename AccType, typename ElementType, size_t K, class Functor>
HOST_DEVICE_INLINE AccType applyWithCast(Functor functor,
functional::Array<functional::Tensor<ElementType>, K>& in,
int index) {
return FApply<K, Functor, AccType>::apply(functor, in, index); // ElementType and AccType are potentially different, cast to AccType before applying functor
}
/******************************************************************************/
@ -180,7 +199,7 @@ struct Loop<1, N, K> {
for(size_t j = 0; j < K; ++j) {
acc[j] = pAcc[j] + (dim[N - 1] + i) * in[j].shape().bstride(N - 1);
}
agg = aggFunctor(agg, (AccType)apply<K>(functor, in, acc));
agg = aggFunctor(agg, applyWithCast<AccType>(functor, in, acc));
}
return agg;
}


@ -354,7 +354,7 @@ public:
const Ptr<inits::NodeInitializer>& init,
const Type elementType,
bool fixed = false) {
// since this param is called with out a specified type, we assume defaultElementType but allow to check for a different type
// this param is called with a specified type
return param(pname, shape, init, elementType, fixed, /*typeSpecified=*/true);
}
@ -362,7 +362,7 @@ public:
const Shape& shape,
const Ptr<inits::NodeInitializer>& init,
bool fixed = false) {
// since this param is called with out a specified type, we assume defaultElementType but allow to check for a different type
// since this param is called without a specified type, we assume defaultElementType but allow to check for a different type
return param(pname, shape, init, defaultElementType_, fixed, /*typeSpecified=*/false);
}
@ -497,7 +497,12 @@ public:
// skip over special parameters starting with "special:"
if(pName.substr(0, 8) == "special:")
continue;
param(pName, item.shape, inits::fromItem(item), item.type, /*fixed=*/false);
// if during loading the loaded type is of the same type class as the default element type, allow conversion;
// otherwise keep the loaded type. This is used when e.g. loading a float32 model as a float16 model as both
// have type class TypeClass::float_type.
auto loadElementType = isSameTypeClass(item.type, defaultElementType_) ? defaultElementType_ : item.type;
param(pName, item.shape, inits::fromItem(item), loadElementType, /*fixed=*/false);
}
if(markReloaded)
setReloaded(true);
@ -531,7 +536,7 @@ public:
auto defaultParams = std::dynamic_pointer_cast<MappedParameters>(it->second);
if(!defaultParams) {
// but it's not mapped, so delete it and replace it with a mapped version
auto defaultParams = New<MappedParameters>(defaultElementType_);
defaultParams = New<MappedParameters>(defaultElementType_);
defaultParams->init(backend_);
paramsByElementType_[defaultElementType_] = defaultParams;
}
@ -540,8 +545,8 @@ public:
// pre-populate parameters by type
for(auto& item : items) {
auto it = paramsByElementType_.find(item.type);
if(it == paramsByElementType_.end()) {
auto it1 = paramsByElementType_.find(item.type);
if(it1 == paramsByElementType_.end()) {
auto params = New<MappedParameters>(item.type);
params->init(backend_);
paramsByElementType_.insert({item.type, params});


@ -7,7 +7,7 @@
#include "graph/auto_tuner.h"
#include "tensors/cpu/int16.h"
#include "tensors/cpu/expanded_gemm.h"
#include "tensors/cpu/fbgemm/expanded_gemm.h"
#if USE_FBGEMM
#include "fbgemm/Utils.h"
@ -284,11 +284,6 @@ Expr stopGradient(Expr a) {
return res;
}
Expr constant_like(Expr a, const Ptr<inits::NodeInitializer>& init) {
auto graph = a->graph();
return graph->constant(a->shape(), init, a->value_type());
}
// gather() -- gather arbitrary elements along an axis; batched or non-batched
Expr gather(Expr a, int axis, Expr indices) {
return Expression<GatherNodeOp>(a, axis, indices);
@ -317,6 +312,7 @@ Expr index_select(Expr a, int axis, Expr indices) {
indices = reshape(indices, shape); // move index to axis
return gather(a, axis, indices);
}
Expr index_select(Expr a, int axis, const std::vector<IndexType>& indices) {
auto indexExpr = a->graph()->indices(indices);
return index_select(a, axis, indexExpr);
@ -355,35 +351,51 @@ Expr slice(Expr a, int axis, Slice slice) { // numpy __getslice__ semantics, but
}
Expr sum(Expr a, int ax) {
if(a->shape()[ax] == 1) // nothing to reduce, sum of itself is a
return a;
return Expression<ReduceNodeOp>(a, ax, ReduceNodeOpCode::sum);
}
Expr mean(Expr a, int ax) {
if(a->shape()[ax] == 1) // nothing to reduce, mean of itself is a
return a;
return Expression<ReduceNodeOp>(a, ax, ReduceNodeOpCode::mean);
}
Expr std(Expr a, int ax) {
return Expression<ReduceNodeOp>(a - mean(a,ax), ax, ReduceNodeOpCode::rms);
if(a->shape()[ax] == 1) // nothing to reduce, std(a) = 0
return a - a;
return Expression<ReduceNodeOp>(a - mean(a, ax), ax, ReduceNodeOpCode::rms);
}
Expr var(Expr a, int ax) {
if(a->shape()[ax] == 1) // nothing to reduce, var(a) = 0
return a - a;
return Expression<ReduceNodeOp>(a - mean(a, ax), ax, ReduceNodeOpCode::meanSqr);
}
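// Note on the `a - a` idiom above: it yields a zero tensor of the correct shape
// that stays connected to the graph (a fresh zero constant would not), so shape
// inference and backprop wiring remain intact when the reduction axis is trivial.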
Expr max(Expr a, int ax) {
if(a->shape()[ax] == 1) // nothing to reduce, max of itself is a
return a;
return Expression<ReduceNodeOp>(a, ax, ReduceNodeOpCode::max);
}
Expr min(Expr a, int ax) {
if(a->shape()[ax] == 1) // nothing to reduce, min of itself is a
return a;
return Expression<ReduceNodeOp>(a, ax, ReduceNodeOpCode::min);
}
Expr prod(Expr a, int ax) {
if(a->shape()[ax] == 1) // nothing to reduce, prod of itself is a
return a;
return Expression<ReduceNodeOp>(a, ax, ReduceNodeOpCode::prod);
}
// log(sum(exp(a)))
Expr logsumexp(Expr a, int ax) {
if(a->shape()[ax] == 1) // nothing to reduce, log(sum(exp(a))) = log(exp(a)) = a
return a;
return Expression<ReduceNodeOp>(a, ax, ReduceNodeOpCode::logSumExp);
}
@ -400,17 +412,50 @@ Expr weighted_average(Expr in, Expr weights, int ax) {
Expr dot(Expr a, Expr b, bool transA, bool transB, float scale) {
auto device = a->graph()->getDeviceId().type;
float clipValue = a->graph()->getBackend()->getClip();
// added support for packed GEMM API (fp16, int8)
Type aElementType = a->value_type();
Type bElementType = b->value_type();
// Currently only true when command line options
// --optimize --cpu-thread=N with N > 0 are set.
if(device == DeviceType::cpu && a->graph()->getBackend()->isOptimized()) {
// dotInt16 computes A * B.T, hence the transpose for B to get A * B
// if transA = false and transB = false.
if(device == DeviceType::cpu) {
if(isFloat(aElementType) && isFloat(bElementType)) {
if(a->graph()->getBackend()->isOptimized()) {
// dotInt16 computes A * B.T, hence the transpose for B to get A * B
// if transA = false and transB = false.
return cpu::int16::dot(
cpu::int16::quantize(transA ? transpose(a) : a, clipValue),
cpu::int16::quantize(transB ? b : transpose(b), clipValue),
scale);
return cpu::int16::dot(
cpu::int16::quantize(transA ? transpose(a) : a, clipValue),
cpu::int16::quantize(transB ? b : transpose(b), clipValue),
scale);
} else {
return Expression<DotNodeOp>(
clip(a, clipValue), clip(b, clipValue), transA, transB, scale);
}
} else if(isFloat(aElementType) && isPacked(bElementType)) {
#if USE_FBGEMM
// 07/10/2019 - Use packed GEMM only if the cpu architecture supports AVX2,
// as checked by one of fbgemm's sub-modules, cpuinfo (https://github.com/pytorch/cpuinfo).
// It looks at the cpu register
// (https://github.com/pytorch/cpuinfo/blob/master/src/x86/isa.c#L391),
// and this cpu lookup is executed only once and the state is kept in FBGEMM.
if(fbgemm::fbgemmHasAvx2Support()) {
// This variant of dot product can handle matrix multiplications with packed8 and packed16 weight matrix (B).
return cpu::variant::dot(clip(a, clipValue),
b,
b->shape(),
transA,
transB,
scale);
} else {
ABORT("AVX2 is not available. At least, AVX2 is needed to use fbgemm-based packed GEMM");
}
#else
ABORT("Packed GEMM is not available in this build");
#endif // USE_FBGEMM
} else {
ABORT("Combination of types A: {} B: {} not supported", aElementType, bElementType);
}
} else {
return Expression<DotNodeOp>(
clip(a, clipValue), clip(b, clipValue), transA, transB, scale);
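// Dispatch summary for the CPU branches above (non-CPU devices fall through to
// the plain DotNodeOp in the else branch):
//   float A, float B, backend optimized   -> int16 GEMM (quantize both operands)
//   float A, float B, not optimized       -> plain DotNodeOp
//   float A, packed B, AVX2 available     -> fbgemm packed GEMM (packed8/packed16)
//   any other type combination            -> abort with a type error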
@ -469,6 +514,7 @@ Expr affine(Expr a, Expr b, Expr bias, bool transA, bool transB, float scale) {
// (https://github.com/pytorch/cpuinfo/blob/master/src/x86/isa.c#L391),
// and this cpu lookup is executed only once and the state is kept in FBGEMM.
if(fbgemm::fbgemmHasAvx2Support()) {
// This variant of affine product can handle matrix multiplications with packed8 and packed16 weight matrix (B).
return cpu::variant::affine(clip(a, clipValue),
b,
b->shape(),
@ -477,7 +523,7 @@ Expr affine(Expr a, Expr b, Expr bias, bool transA, bool transB, float scale) {
transB,
scale);
} else {
ABORT("No on-the-fly packing at the moment");
ABORT("AVX2 is not available. At least, AVX2 is needed to use fbgemm-based packed GEMM");
}
#else
ABORT("Packed GEMM is not available in this build");
@ -562,8 +608,20 @@ Expr cast(Expr a, Type type) {
}
}
Expr cross_entropy(Expr a, Expr indices) {
return Expression<CrossEntropyNodeOp>(a, indices);
Expr cross_entropy(Expr logits, Expr indices) {
return Expression<CrossEntropyNodeOp>(logits, indices);
}
// Unlikelihood loss based on https://arxiv.org/abs/1908.04319
Expr unlikelihood(Expr logits, Expr indices) {
int dimBatch = logits->shape()[-2];
int dimTime = logits->shape()[-3];
// @TODO: fix this outside of this function in decoder.h etc.
auto indicesWithLayout = reshape(indices, {1, dimTime, dimBatch, 1});
// This is currently implemented with multiple ops, might be worth doing a special operation like for cross_entropy
return -log(gather(1.f - softmax(logits), /*axis=*/-1, indicesWithLayout));
}
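// In formula form, the expression above computes, per target position t,
//   UL(t) = -log(1 - p(y_t | y_<t, x))
// where p is the softmax over the logits and y_t the token picked by 'indices',
// pushing probability mass away from the gathered (negative) tokens.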
Expr plus(const std::vector<Expr>& nodes) {


@ -141,7 +141,17 @@ Expr atleast_4d(Expr a);
Expr atleast_nd(Expr a, size_t dims);
// create a constant of shape a->shape() and initialize with init
Expr constant_like(Expr a, const Ptr<inits::NodeInitializer>& init);
// @TODO: add a && version, to avoid a ref count. NodeInitializers are typically temps.
// @TODO: and/or make this a template on init
static inline Expr constant_like(Expr a, const Ptr<inits::NodeInitializer>& init) {
return a->graph()->constant(a->shape(), init, a->value_type());
}
// short-cut to init from std::vector, since we do this so often
template<typename ElementType>
Expr constant_like(Expr a, const std::vector<ElementType>& v) { return constant_like(a, inits::fromVector(v)); }
template<typename ElementType>
Expr constant_like(Expr a, std::vector<ElementType>&& v) { return constant_like(a, inits::fromVector(std::move(v))); }
Expr flatten(Expr a);
Expr flatten_2d(Expr a);
@ -200,6 +210,8 @@ Expr logsoftmax(Expr a);
Expr cross_entropy(Expr a, Expr b);
Expr unlikelihood(Expr a, Expr b);
Expr scalar_product(Expr a, Expr b, int ax = 0);
Expr weighted_average(Expr in, Expr weights, int ax = 0);


@ -145,10 +145,20 @@ Ptr<NodeInitializer> fromVector(const std::vector<T>& v) {
return fromLambda([v](Tensor t) { t->set(v.data(), v.data() + v.size()); }, typeId<T>());
}
template <typename T>
Ptr<NodeInitializer> fromVector(std::vector<T>&& v) {
return fromLambda([v = std::move(v)](Tensor t) { t->set(v.data(), v.data() + v.size()); }, typeId<T>());
}
template Ptr<NodeInitializer> fromVector<float16>(const std::vector<float16>& v);
template Ptr<NodeInitializer> fromVector<float>(const std::vector<float>& v);
template Ptr<NodeInitializer> fromVector<IndexType>(const std::vector<IndexType>& v);
// @TODO: can we remove the const& ones above? They always make a copy anyways, and often from a temp
template Ptr<NodeInitializer> fromVector<float16> (std::vector<float16> && v);
template Ptr<NodeInitializer> fromVector<float> (std::vector<float> && v);
template Ptr<NodeInitializer> fromVector<IndexType>(std::vector<IndexType>&& v);
Ptr<NodeInitializer> fromSparseVector(std::pair<std::vector<size_t>, std::vector<float>>& v) {
return fromLambda([v](Tensor t) { t->set(1e-6); t->setSparse(v.first, v.second); });
}

src/graph/node_initializers.h Normal file → Executable file

@ -143,6 +143,8 @@ Ptr<NodeInitializer> gumbel(float eps = 1e-5f);
// @TODO: add documentation
template <typename T>
Ptr<NodeInitializer> fromVector(const std::vector<T>& v);
template <typename T>
Ptr<NodeInitializer> fromVector(std::vector<T>&& v);
// @TODO: add documentation
Ptr<NodeInitializer> fromSparseVector(std::pair<std::vector<size_t>, std::vector<float>>& v);

src/graph/node_operators_binary.h Normal file → Executable file

@ -63,7 +63,6 @@ public:
// df/dB += alpha * dot(op(A).T, D)
// beta set to 1.0 in gemm, C = alpha * dot(op(A), op(B)) + beta * C
// to sum gradients from different graph parts
if(!transA_ && transB_)
return {NodeOp(Prod(child(0)->grad(),
adj_,
@ -130,6 +129,29 @@ public:
const std::string type() override { return "dot"; }
virtual size_t hash() override {
size_t seed = NaryNodeOp::hash();
util::hash_combine(seed, transA_);
util::hash_combine(seed, transB_);
util::hash_combine(seed, scalar_);
return seed;
}
virtual bool equal(Expr node) override {
if(!NaryNodeOp::equal(node))
return false;
auto cnode = std::dynamic_pointer_cast<DotNodeOp>(node);
if(!cnode)
return false;
if(transA_ != cnode->transA_)
return false;
if(transB_ != cnode->transB_)
return false;
if(scalar_ != cnode->scalar_)
return false;
return true;
}
const std::string color() override { return "orange"; }
};
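// A minimal sketch of the boost-style combiner that util::hash_combine is
// assumed to implement here (the exact marian helper may differ). Mixing
// transA_, transB_ and scalar_ into the seed keeps nodes that differ only in
// their GEMM configuration from hashing equal and being merged by the graph's
// common-subexpression elimination.
#include <cstddef>
#include <functional>
template <class T>
void hashCombineSketch(std::size_t& seed, const T& v) {
  seed ^= std::hash<T>()(v) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
}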
@ -274,6 +296,30 @@ public:
}
const std::string type() override { return "affine"; }
virtual size_t hash() override {
size_t seed = NaryNodeOp::hash();
util::hash_combine(seed, transA_);
util::hash_combine(seed, transB_);
util::hash_combine(seed, scalar_);
return seed;
}
virtual bool equal(Expr node) override {
if(!NaryNodeOp::equal(node))
return false;
auto cnode = std::dynamic_pointer_cast<AffineNodeOp>(node);
if(!cnode)
return false;
if(transA_ != cnode->transA_)
return false;
if(transB_ != cnode->transB_)
return false;
if(scalar_ != cnode->scalar_)
return false;
return true;
}
};
class DotBatchedNodeOp : public NaryNodeOp {
@ -402,6 +448,29 @@ public:
const std::string type() override { return "bdot"; }
virtual size_t hash() override {
size_t seed = NaryNodeOp::hash();
util::hash_combine(seed, transA_);
util::hash_combine(seed, transB_);
util::hash_combine(seed, scalar_);
return seed;
}
virtual bool equal(Expr node) override {
if(!NaryNodeOp::equal(node))
return false;
auto cnode = std::dynamic_pointer_cast<DotBatchedNodeOp>(node);
if(!cnode)
return false;
if(transA_ != cnode->transA_)
return false;
if(transB_ != cnode->transB_)
return false;
if(scalar_ != cnode->scalar_)
return false;
return true;
}
const std::string color() override { return "orange"; }
};
@ -443,18 +512,42 @@ public:
}
NodeOps backwardOps() override {
return {nullptr, // can't backprop into the sparse matrix (the gradient is dense)
nullptr,
nullptr,
NodeOp(CSRProd(child(3)->grad(), // child(3) = D
graph()->allocator(),
child(0)->val(), child(1)->val(), child(2)->val(), // children(0..2) = A
adj_,
/*transS=*/!transS_, /*swapOperands=*/swapOperands_, /*beta=*/1))};
return { nullptr, // can't backprop into the sparse matrix (the gradient is dense)
nullptr,
nullptr,
NodeOp(CSRProd(child(3)->grad(), // child(3) = D
graph()->allocator(),
child(0)->val(), child(1)->val(), child(2)->val(), // children(0..2) = A
adj_,
/*transS=*/!transS_, /*swapOperands=*/swapOperands_, /*beta=*/1))};
}
const std::string type() override { return "csr_dot"; }
virtual size_t hash() override {
size_t seed = NaryNodeOp::hash();
for(auto s : shape())
util::hash_combine(seed, s);
util::hash_combine(seed, transS_);
util::hash_combine(seed, swapOperands_);
return seed;
}
virtual bool equal(Expr node) override {
if(!NaryNodeOp::equal(node))
return false;
auto cnode = std::dynamic_pointer_cast<CSRDotNodeOp>(node);
if(!cnode)
return false;
if(transS_ != cnode->transS_)
return false;
if(shape() != cnode->shape())
return false;
if(swapOperands_ != cnode->swapOperands_)
return false;
return true;
}
const std::string color() override { return "orange"; }
};
@ -569,8 +662,6 @@ struct RowsNodeOp : public NaryNodeOp {
// out[i][j][k] = input[i][index[i][j][k]][k] # if dim == 1
// out[i][j][k] = input[i][j][index[i][j][k]] # if dim == 2
// 'a' and 'indices' must have the same rank.
// @TODO: The current implementation does not support batched indices (third scenario above).
// I.e. all axes of 'indices' except 'axis' must have dimension 1.
struct GatherNodeOp : public NaryNodeOp {
GatherNodeOp(Expr a, int axis, Expr indices)
: NaryNodeOp({a, indices}, newShape(a, axis, indices), a->value_type()),
@ -599,10 +690,6 @@ struct GatherNodeOp : public NaryNodeOp {
if (i != axis) {
ABORT_IF(indices->shape()[i] != shape[i] && indices->shape()[i] != 1,
"Dimensions must match or broadcast for input ({}) and indices ({})", std::string(shape), std::string(indices->shape()));
#if 1 // presently, this implementation does not support batched indices
ABORT_IF(indices->shape()[i] != 1,
"Presently, gather() does not implement batched indices");
#endif
}
}
return shape;
@ -865,7 +952,9 @@ struct MinimumNodeOp : public ElementBinaryNodeOp {
struct CmpNodeOp : public ElementBinaryNodeOp {
CmpNodeOp(Expr a, Expr b, int cmp_, bool not_) : ElementBinaryNodeOp(a, b), cmp_(cmp_), not_(not_) {
setTrainable(false); // has no gradient
//setTrainable(false); // has no gradient
// Note: ^^ Disabled because it is currently causing Marian to choke, for unknown reasons.
// Not setting this will not change the result since the vector of gradient functions is empty.
}
NodeOps forwardOps() override {
@ -887,6 +976,29 @@ struct CmpNodeOp : public ElementBinaryNodeOp {
ABORT("Should not get here??");
}
virtual size_t hash() override {
if(!hash_) {
size_t seed = NaryNodeOp::hash();
util::hash_combine(seed, cmp_);
util::hash_combine(seed, not_);
hash_ = seed;
}
return hash_;
}
virtual bool equal(Expr node) override {
if(!NaryNodeOp::equal(node))
return false;
auto cnode = std::dynamic_pointer_cast<CmpNodeOp>(node);
if(!cnode)
return false;
if(cmp_ != cnode->cmp_)
return false;
if(not_ != cnode->not_)
return false;
return true;
}
private:
int cmp_; // -1: less; 0: equal; 1: greater
bool not_; // invert result if true
@ -1019,6 +1131,23 @@ public:
const std::string type() override { return "layer_normalization"; }
virtual size_t hash() override {
size_t seed = NaryNodeOp::hash();
util::hash_combine(seed, eps_);
return seed;
}
virtual bool equal(Expr node) override {
if(!NaryNodeOp::equal(node))
return false;
auto cnode = std::dynamic_pointer_cast<LayerNormalizationOp>(node);
if(!cnode)
return false;
if(eps_ != cnode->eps_)
return false;
return true;
}
private:
float eps_;
};


@ -993,6 +993,8 @@ struct ShiftNodeOp : public UnaryNodeOp {
if(!cnode)
return false;
if(shift_ != cnode->shift_)
return false;
if(padValue_ != cnode->padValue_)
return false;
return true;
}


@ -130,8 +130,8 @@ private:
Ptr<Backend> backend_;
public:
MappedParameters(Type acceptedElementType_) : Parameters(acceptedElementType_) {
LOG(debug, "Created mapped parameter object of type {}", acceptedElementType_);
MappedParameters(Type acceptedElementType) : Parameters(acceptedElementType) {
LOG(debug, "Created mapped parameter object of type {}", acceptedElementType);
}
virtual void init(Ptr<Backend> backend) override { backend_ = backend; }


@ -4,7 +4,8 @@
#include "layers/constructors.h"
#include "layers/loss.h"
#include "data/factored_vocab.h"
#include "rnn/types.h" // for State::select()
#include "rnn/types.h" // for State::select()
#include "models/states.h" // for EncoderState
//using std::size_t; // not sure why this is needed
@ -23,7 +24,11 @@ namespace marian {
ABORT_IF(empty(), "Attempted to read out logits on empty Logits object");
auto firstLogits = logits_.front()->loss();
ABORT_IF(labels.size() * firstLogits->shape()[-1] != firstLogits->shape().elements(), "Labels not matching logits shape??");
ABORT_IF(labels.size() * firstLogits->shape()[-1] != firstLogits->shape().elements(),
"Labels not matching logits shape ({} != {}, {})??",
labels.size() * firstLogits->shape()[-1],
firstLogits->shape().elements(),
firstLogits->shape());
// base case (no factors)
if (!factoredVocab_) {
@ -219,7 +224,7 @@ namespace marian {
factoredVocab_ = FactoredVocab::tryCreateAndLoad(options_->get<std::string>("vocab", ""));
if (factoredVocab_) {
numOutputClasses = (int)factoredVocab_->factorVocabSize();
LOG(info, "[embedding] Factored outputs enabled");
LOG_ONCE(info, "[embedding] Factored outputs enabled");
}
if(tiedParam_) {
@ -237,10 +242,10 @@ namespace marian {
/*const*/ int lemmaDimEmb = options_->get<int>("lemma-dim-emb", 0);
ABORT_IF(lemmaDimEmb && !factoredVocab_, "--lemma-dim-emb requires a factored vocabulary");
if (lemmaDimEmb > 0) {
if (lemmaDimEmb > 0) { // > 0 means to embed the (expected) word with a different embedding matrix
#define HARDMAX_HACK
#ifdef HARDMAX_HACK
lemmaDimEmb = lemmaDimEmb & 0xfffffffe;
lemmaDimEmb = lemmaDimEmb & 0xfffffffe; // hack to select hard-max: use an odd number
#endif
auto range = factoredVocab_->getGroupRange(0);
auto lemmaVocabDim = (int)(range.second - range.first);
@ -263,8 +268,9 @@ namespace marian {
// project each factor separately
auto numGroups = factoredVocab_->getNumGroups();
std::vector<Ptr<RationalLoss>> allLogits(numGroups, nullptr); // (note: null entries for absent factors)
Expr input1 = input;
Expr Plemma = nullptr;
Expr input1 = input; // [B... x D]
Expr Plemma = nullptr; // used for lemmaDimEmb=-1
Expr inputLemma = nullptr; // used for lemmaDimEmb=-2, -3
for (size_t g = 0; g < numGroups; g++) {
auto range = factoredVocab_->getGroupRange(g);
if (g > 0 && range.first == range.second) // empty entry
@ -280,6 +286,52 @@ namespace marian {
factorWt = slice(Wt_, isLegacyUntransposedW ? -1 : 0, Slice((int)range.first, (int)range.second));
factorB = slice(b_, -1, Slice((int)range.first, (int)range.second));
}
/*const*/ int lemmaDimEmb = options_->get<int>("lemma-dim-emb", 0);
if ((lemmaDimEmb == -2 || lemmaDimEmb == -3) && g > 0) { // -2/-3 means a gated transformer-like structure (-3 = hard-max)
LOG_ONCE(info, "[embedding] using lemma conditioning with gate");
// this mimics one transformer layer
// - attention over two inputs:
// - e = current lemma. We use the original embedding vector; specifically, expectation over all lemmas.
// - input = hidden state FF(h_enc+h_dec)
// - dot-prod attention to allow both sides to influence (unlike our recurrent self-attention)
// - multi-head to allow for multiple conditions to be modeled
// - add & norm, for gradient flow and scaling
// - FF layer --this is expensive; it is per-factor
// multi-head attention
int inputDim = input->shape()[-1];
int heads = 8;
auto name = options_->get<std::string>("prefix") + "_factor" + std::to_string(g);
auto Wq = graph_->param(name + "_Wq", { inputDim, inputDim }, inits::glorotUniform());
auto Wk = graph_->param(name + "_Wk", { inputDim, inputDim }, inits::glorotUniform());
auto Wv = graph_->param(name + "_Wv", { inputDim, inputDim }, inits::glorotUniform());
auto toMultiHead = [&](Expr x, int heads) {
const auto& shape = x->shape();
int inputDim = shape[-1];
int otherDim = shape.elements() / inputDim;
ABORT_IF(inputDim / heads * heads != inputDim, "inputDim ({}) must be multiple of number of heads ({})", inputDim, heads);
return reshape(x, { otherDim, heads, 1, inputDim / heads });
};
input1 = inputLemma;
auto qm = toMultiHead(dot(input1, Wq), heads); // [B... x H x D/H] projected query
auto kdm = toMultiHead(dot(input1 - input, Wk), heads); // [B... x H x D/H] the two data vectors projected as keys. Use diff and sigmoid, instead of softmax.
auto vem = toMultiHead(dot(input1, Wv), heads); // [B... x H x D/H] one of the two data vectors projected as values
auto vim = toMultiHead(dot( input, Wv), heads); // [B... x H x D/H] the other
auto zm = bdot(qm, kdm, false, true); // [B... x H x 1]
auto sm = sigmoid(zm); // [B... x H x 1]
auto rm = sm * (vem - vim) + vim; // [B... x H x D/H]
auto r = reshape(rm, input->shape()); // [B... x D]
// add & norm
input1 = r + input1;
input1 = layerNorm(input1, name + "_att");
// FF layer
auto ffnDropProb = 0.1f; // @TODO: get as a parameter
auto ffnDim = inputDim * 2; // @TODO: get as a parameter
auto f = denseInline(input1, name + "_ffn", /*suffix=*/"1", ffnDim, (ActivationFunction*)relu, ffnDropProb);
f = denseInline(f, name + "_ffn", /*suffix=*/"2", inputDim);
// add & norm
input1 = f + input1;
input1 = layerNorm(input1, name + "_ffn");
}
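// In formula form, per attention head the block above computes
//   z = q . k,   s = sigmoid(z),   r = s * (v_e - v_i) + v_i
// i.e. a sigmoid gate interpolating between the value projection of the lemma
// embedding (v_e) and that of the hidden state (v_i), followed by the usual
// add & norm and a feed-forward sublayer.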
// @TODO: b_ should be a vector, not a matrix; but shotlists use cols() in, which requires a matrix
auto factorLogits = affine(input1, factorWt, factorB, false, /*transB=*/isLegacyUntransposedW ? false : true, /*scale=*/1.0f); // [B... x U] factor logits
// optionally add lemma-dependent bias
@ -294,15 +346,28 @@ namespace marian {
allLogits[g] = New<RationalLoss>(factorLogits, nullptr);
// optionally add a soft embedding of lemma back to create some lemma dependency
// @TODO: if this works, move it into lazyConstruct
/*const*/ int lemmaDimEmb = options_->get<int>("lemma-dim-emb", 0);
if (lemmaDimEmb < 0 && g == 0) {
ABORT_IF(shortlist_ && lemmaDimEmb != 0, "Lemma-dependent bias with short list is not yet implemented");
if (lemmaDimEmb == -2 && g == 0) { // -2 means a gated transformer-like structure
LOG_ONCE(info, "[embedding] using lemma conditioning with gate, soft-max version");
// get expected lemma embedding vector
auto factorLogSoftmax = logsoftmax(factorLogits); // [B... x U] note: with shortlist, this is not the full lemma set
auto factorSoftmax = exp(factorLogSoftmax);
inputLemma = dot(factorSoftmax, factorWt, false, /*transB=*/isLegacyUntransposedW ? true : false); // [B... x D]
}
else if (lemmaDimEmb == -3 && g == 0) { // same as -2 except with hard max
LOG_ONCE(info, "[embedding] using lemma conditioning with gate, hard-max version");
// get max-lemma embedding vector
auto maxVal = max(factorLogits, -1); // [B... x U] note: with shortlist, this is not the full lemma set
auto factorHardmax = eq(factorLogits, maxVal);
inputLemma = dot(factorHardmax, factorWt, false, /*transB=*/isLegacyUntransposedW ? true : false); // [B... x D]
}
else if (lemmaDimEmb == -1 && g == 0) { // -1 means learn a lemma-dependent bias
ABORT_IF(shortlist_, "Lemma-dependent bias with short list is not yet implemented");
LOG_ONCE(info, "[embedding] using lemma-dependent bias");
auto factorLogSoftmax = logsoftmax(factorLogits); // (we do that again later, CSE will kick in)
auto z = /*stopGradient*/(factorLogSoftmax);
Plemma = exp(z); // [B... x U]
}
if (lemmaDimEmb > 0 && g == 0) {
else if (lemmaDimEmb > 0 && g == 0) { // > 0 means learn a re-embedding matrix
LOG_ONCE(info, "[embedding] enabled re-embedding of lemma, at dim {}", lemmaDimEmb);
// compute softmax. We compute logsoftmax() separately because this way, computation will be reused later via CSE
auto factorLogSoftmax = logsoftmax(factorLogits);
@ -349,7 +414,7 @@ namespace marian {
factoredVocab_ = FactoredVocab::tryCreateAndLoad(options_->get<std::string>("vocab", ""));
if (factoredVocab_) {
dimVoc = (int)factoredVocab_->factorVocabSize();
LOG(info, "[embedding] Factored embeddings enabled");
LOG_ONCE(info, "[embedding] Factored embeddings enabled");
}
// Embedding layer initialization should depend only on embedding size, hence fanIn=false
@ -389,7 +454,7 @@ namespace marian {
auto graph = E_->graph();
int dimBatch = (int)subBatch->batchSize();
int dimEmb = E_->shape()[-1];
int dimWords = (int)subBatch->batchWidth();
int dimWidth = (int)subBatch->batchWidth();
// factored embeddings:
// - regular:
@ -419,9 +484,16 @@ namespace marian {
// - but forward pass weighs them down, so that all factors are in a similar numeric range
// - if it is required to be in a different range, the embeddings can still learn that, but more slowly
auto batchEmbeddings = apply(subBatch->data(), {dimWords, dimBatch, dimEmb});
auto batchMask = graph->constant({dimWords, dimBatch, 1},
auto batchEmbeddings = apply(subBatch->data(), {dimWidth, dimBatch, dimEmb});
#if 0
auto batchMask = graph->constant({dimWidth, dimBatch, 1},
inits::fromVector(subBatch->mask()));
#else
// experimental: hide inline-fix source tokens from cross attention
auto batchMask = graph->constant({dimWidth, dimBatch, 1},
inits::fromVector(subBatch->crossMaskWithInlineFixSourceSuppressed()));
#endif
return std::make_tuple(batchEmbeddings, batchMask);
}


@ -412,4 +412,32 @@ public:
ABORT("not implemented"); // @TODO: implement me
}
};
// --- a few layers with built-in parameters created on the fly, without a proper object
// @TODO: change to a proper layer object
// like affine() but with built-in parameters, activation, and dropout
static inline
Expr denseInline(Expr x, std::string prefix, std::string suffix, int outDim, const std::function<Expr(Expr)>& actFn = nullptr, float dropProb = 0.0f)
{
auto graph = x->graph();
auto W = graph->param(prefix + "_W" + suffix, { x->shape()[-1], outDim }, inits::glorotUniform());
auto b = graph->param(prefix + "_b" + suffix, { 1, outDim }, inits::zeros());
x = affine(x, W, b);
if (actFn)
x = actFn(x);
x = dropout(x, dropProb);
return x;
}
static inline
Expr layerNorm(Expr x, std::string prefix, std::string suffix = std::string()) {
int dimModel = x->shape()[-1];
auto scale = x->graph()->param(prefix + "_ln_scale" + suffix, { 1, dimModel }, inits::ones());
auto bias = x->graph()->param(prefix + "_ln_bias" + suffix, { 1, dimModel }, inits::zeros());
return marian::layerNorm(x, scale, bias, 1e-6f);
}
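// Assuming marian::layerNorm follows the standard definition, the helper above
// computes, over the last dimension of x:
//   LN(x) = scale * (x - mean(x)) / sqrt(var(x) + eps) + bias,   with eps = 1e-6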
} // namespace marian

src/layers/guided_alignment.h Normal file → Executable file

@ -1,43 +1,75 @@
#pragma once
#include "layers/loss.h"
#include "common/logging.h"
namespace marian {
static inline RationalLoss guidedAlignmentCost(Ptr<ExpressionGraph> graph,
Ptr<data::CorpusBatch> batch,
Ptr<Options> options,
Expr attention) {
Expr attention) { // [beam depth=1, max src length, batch size, tgt length]
// @TODO: there should be positional masking here ... on the other hand, positions that are not
// in a sentence should always agree (both being 0). Lack of masking affects label count only which is
// probably negligible?
// @TODO: change "cost" to "loss"
std::string guidedLossType = options->get<std::string>("guided-alignment-cost");
float guidedScalar = options->get<float>("guided-alignment-weight");
std::string guidedLossType = options->get<std::string>("guided-alignment-cost"); // @TODO: change "cost" to "loss"
float guidedLossWeight = options->get<float>("guided-alignment-weight");
const auto& shape = attention->shape(); // [beam depth=1, max src length, batch size, tgt length]
float epsilon = 1e-6f;
Expr alignment = constant_like(attention, inits::fromVector(batch->getGuidedAlignment()));
Expr alignmentLoss; // sum up loss over all attention/alignment positions
if(guidedLossType == "mse") {
alignmentLoss = sum(flatten(square(attention - alignment))) / 2.f;
} else if(guidedLossType == "mult") {
alignmentLoss = -log(sum(flatten(attention * alignment)) + epsilon);
} else if(guidedLossType == "ce") {
size_t numLabels;
if(guidedLossType == "ce") {
// normalizedAlignment is multi-hot, but CE requires normalized probabilities, so we need to normalize to P(s|t)
auto dimBatch = shape[-2];
auto dimTrgWords = shape[-1];
auto dimSrcWords = shape[-3];
ABORT_IF(shape[-4] != 1, "Guided alignments with beam??");
auto normalizedAlignment = batch->getGuidedAlignment(); // [dimSrcWords, dimBatch, dimTrgWords] flattened, matches shape of 'attention'
auto srcBatch = batch->front();
const auto& srcMask = srcBatch->mask();
ABORT_IF(shape.elements() != normalizedAlignment.size(), "Attention-matrix and alignment shapes differ??");
ABORT_IF(dimBatch != batch->size() || dimTrgWords != batch->widthTrg() || dimSrcWords != batch->width(), "Attention-matrix and batch shapes differ??");
auto locate = [=](size_t s, size_t b, size_t t) { return ((s * dimBatch) + b) * dimTrgWords + t; };
for (size_t b = 0; b < dimBatch; b++) {
for (size_t t = 0; t < dimTrgWords; t++) {
for (size_t s = 0; s < dimSrcWords; s++)
ABORT_IF(locate(s, b, t) != batch->locateInGuidedAlignments(b, s, t), "locate() and locateInGuidedAlignments() differ??");
// renormalize the alignment such that it sums up to 1
float sum = 0;
for (size_t s = 0; s < dimSrcWords; s++)
sum += srcMask[srcBatch->locate(b, s)] * normalizedAlignment[locate(s, b, t)]; // these values are 0 or 1
if (sum != 0 && sum != 1)
for (size_t s = 0; s < dimSrcWords; s++)
normalizedAlignment[locate(s, b, t)] /= sum;
}
}
auto alignment = constant_like(attention, std::move(normalizedAlignment));
alignmentLoss = -sum(flatten(alignment * log(attention + epsilon)));
numLabels = batch->back()->batchWords();
ABORT_IF(numLabels > shape.elements() / shape[-3], "Num labels of guided alignment cost is off??");
} else {
ABORT("Unknown alignment cost type: {}", guidedLossType);
auto alignment = constant_like(attention, batch->getGuidedAlignment());
if(guidedLossType == "mse")
alignmentLoss = sum(flatten(square(attention - alignment))) / 2.f;
else if(guidedLossType == "mult") // @TODO: I don't know what this criterion is for. Can we remove it?
alignmentLoss = -log(sum(flatten(attention * alignment)) + epsilon);
else
ABORT("Unknown alignment cost type: {}", guidedLossType);
// every position is a label as they should all agree
// @TODO: there should be positional masking here ... on the other hand, positions that are not
// in a sentence should always agree (both being 0). Lack of masking affects label count only which is
// probably negligible?
numLabels = shape.elements();
}
alignmentLoss = guidedScalar * alignmentLoss; // weigh by scalar
// every position is a label as they should all agree, see caveat at the top.
size_t numLabels = alignment->shape().elements();
// Create label node, also weigh by scalar so labels and cost are in the same domain.
// Fractional label counts are OK
return RationalLoss(alignmentLoss, guidedScalar * numLabels);
// Fractional label counts are OK. But only if combined as "sum".
// @TODO: It is ugly to check the multi-loss type here, but doing this right requires
// a substantial rewrite of the multi-loss architecture, which is planned anyways.
std::string multiLossType = options->get<std::string>("multi-loss-type", "sum");
if (multiLossType == "sum") // sum of sums
return RationalLoss(guidedLossWeight * alignmentLoss, guidedLossWeight * numLabels);
else
return RationalLoss(guidedLossWeight * alignmentLoss, (float)numLabels);
}
} // namespace marian
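In formulas, as an editorial summary of the "ce" branch above: the multi-hot alignment $a$ is renormalized per batch entry $b$ and target position $t$,

$\hat a_{s,b,t} = a_{s,b,t} \,/\, \sum_{s'} m_{s',b}\, a_{s',b,t}$  (applied only when the masked sum is neither 0 nor 1),

and the loss is

$L_{\text{align}} = -\sum_{s,b,t} \hat a_{s,b,t} \log(\text{att}_{s,b,t} + \epsilon)$,

returned as RationalLoss$(w L_{\text{align}},\, w N)$ for multi-loss-type "sum" and RationalLoss$(w L_{\text{align}},\, N)$ otherwise, where $w$ is the guided-alignment weight and $N$ the target-label count.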


@ -7,9 +7,15 @@ Ptr<LabelwiseLoss> newLoss(Ptr<Options> options, bool inference) {
float smoothing = inference ? 0.f : options->get<float>("label-smoothing");
float factorWeight = options->get<float>("factor-weight", 1.0f);
std::string costType = options->get<std::string>("cost-type", "ce-mean");
bool unlikelihood = options->get<bool>("unlikelihood-loss", false);
if(costType == "ce-rescore") { // returns per-batch-item scores (while ce-mean reduces over batch)
return New<RescorerLoss>();
} else if(unlikelihood) {
ABORT_IF(!options->hasAndNotEmpty("data-weighting")
&& options->get<std::string>("data-weighting-type") != "word",
"Unlikelihood loss training requires error annotation in form of per-target-label scores");
return New<SequenceUnlikelihoodLoss>(smoothing, factorWeight); // this is a mix of CE loss and unlikelihood loss, depending on the values given for data-weighting
} else { // same as ce-mean --@TODO: better check all allowed values, and fail for invalid ones. E.g. what about ce-sum?
return New<CrossEntropyLoss>(smoothing, factorWeight);
}
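A minimal configuration sketch for reaching the unlikelihood branch (editorial; the option keys are the ones queried above, the weights file name is illustrative):

// Hypothetical sketch: options that route newLoss() to SequenceUnlikelihoodLoss.
auto options = New<Options>();
options->set("cost-type", "ce-mean");
options->set("label-smoothing", 0.1f);
options->set("factor-weight", 1.0f);
options->set("unlikelihood-loss", true);
options->set("data-weighting", "errors.txt");  // per-target-label 0/1 error annotations
options->set("data-weighting-type", "word");   // must be word-level
auto loss = newLoss(options, /*inference=*/false);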


@ -206,7 +206,7 @@ private:
virtual Expr accumulateLoss(const RationalLoss& current) override {
if(loss_) {
const auto& first = partialLosses_.front();
return loss_ + first.count() * (current.loss() / current.count()); // scale up/down to match scale of first loss
return loss_ + current.loss() * first.count() / current.count(); // scale up/down to match scale of first loss
} else {
return current.loss(); // first reference loss; later losses are rescaled to match its scale
}
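// Editorial worked example of the rescaling above: with a first partial loss of
// (loss=10, count=100) and a current loss of (loss=6, count=2), accumulation yields
// 10 + 6 * 100 / 2 = 310, i.e. the current loss is brought to the label scale of
// the first loss. Multiplying before dividing computes the same ratio as the old
// expression.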
@ -344,8 +344,8 @@ protected:
// for bert training or classification the time dimension is lost.
// Here we safeguard against 2-d classifier output by adding a dimension of 1 on the left, a non-op.
Expr ce = cast(cross_entropy(logits, indices), Type::float32);
if (inFactor) {
LOG_ONCE("scaling factor losses with weight {}", factorWeight_);
if (inFactor && factorWeight_ != 1.0f) {
LOG_ONCE(info, "scaling factor losses with weight {}", factorWeight_);
ce = ce * factorWeight_;
}
if (labelSmoothing_ > 0) {
@ -366,13 +366,68 @@ protected:
if(mask)
ce = ce * cast(mask, Type::float32);
if(labelWeights)
if(labelWeights) {
// We currently do not know how to use target factors and word-level label weights together
bool wordlevel = labelWeights->shape()[-3] > 1; // Time-dimension is not trivially 1, hence we have word-level weights.
ABORT_IF(wordlevel && logits.getNumFactorGroups() > 1, "CE loss with word-level label weights is not implemented for factors");
ce = ce * cast(labelWeights, Type::float32);
}
return ce;
}
};
/**
* @brief Unlikelihood loss across last axis, summed up over batch and time dimensions. This is an
* implementation of sequence-level unlikelihood loss from https://arxiv.org/abs/1908.04319.
* We rely on word-level label weights where 1 marks a correct label and 0 marks an error. If there are
* no zeros for a sentence, it is trained with normal CE loss; if there is at least one 0, it flips
* over to SUL for that sentence to penalize the selected words.
*
* SUL is implemented as:
* -log(gather(1 - softmax(logits), -1, indices))
*
* Factors are currently not supported.
*/
class SequenceUnlikelihoodLoss : public CrossEntropyLoss {
public:
SequenceUnlikelihoodLoss(float labelSmoothing, float factorWeight)
: CrossEntropyLoss(labelSmoothing, factorWeight) {} // cross-entropy already reduces over axis -1
SequenceUnlikelihoodLoss(const std::vector<int>& axes, float labelSmoothing, float factorWeight)
: CrossEntropyLoss(axes, labelSmoothing, factorWeight) {}
protected:
virtual Expr compute(Logits logits, const Words& labels,
Expr mask = nullptr, Expr labelWeights = nullptr) override {
auto ce = CrossEntropyLoss::compute(logits, labels, mask, /*labelWeights=*/nullptr); // don't pass label-weights to CE
if(!labelWeights)
return ce; // for validation; @TODO: maybe abort instead, or LOG_ONCE(warn, ...)?
// We currently do not know how to use target factors and word-level label weights together
ABORT_IF(logits.getNumFactorGroups() > 1, "Unlikelihood loss is not implemented for factors");
ABORT_IF(!mask, "mask is required"); // @TODO: check this, it seems weights for padding are by default 1, which would make this obsolete.
// use label weights, where 1 is GOOD and 0 is BAD. After the inversion here, 1 marks an error; mask again to eliminate padding (might be obsolete)
auto errorMask = (1.f - cast(labelWeights, Type::float32)) * cast(mask, Type::float32);
auto ceUl = logits.applyLossFunction(labels, [&](Expr logits, Expr indices) {
return cast(unlikelihood(logits, indices), Type::float32);
});
// compute whether to use CE or UL. If there are no errors, train with CE; otherwise train _only on_ the errors with UL. This is the "mixed" training
// schedule from https://arxiv.org/abs/1908.04319. By providing labels with or without error scores we can easily switch between CE and UL.
auto onlyCe = eq(sum(errorMask, /*axis=*/-3), 0.f); // [1, 1, dimBatch, 1] - equals 1 if no errors are present
ceUl = errorMask * ceUl; // don't use for correct labels or padding
auto cost = onlyCe * ce + (1.f - onlyCe) * ceUl; // the CE and unlikelihood parts are never simultaneously used as cost per batch entry
return cost;
}
};
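// Editorial note: per target position t the loss above therefore selects between
//   CE_t = -log p(y_t | y_<t, x)        (sentences without any error annotation)
//   UL_t = -log(1 - p(y_t | y_<t, x))   (only at positions annotated as errors)
// pushing probability mass away from annotated tokens, which matches the
// "SUL is implemented as" formula in the class comment.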
/**
* @brief Cross entropy in rescorer used for computing sentence-level log probabilities
*/


@ -15,8 +15,16 @@ Expr DataWeighting::getWeights(Ptr<ExpressionGraph> graph,
bool sentenceWeighting = weightingType_ == "sentence";
int dimBatch = (int)batch->size();
int dimWords = sentenceWeighting ? 1 : (int)batch->back()->batchWidth();
// This would abort anyway in fromVector(...), but has clearer error message
// here for this particular case
ABORT_IF(batch->getDataWeights().size() != dimWords * dimBatch,
"Number of sentence/word-level weights ({}) does not match tensor size ({})",
batch->getDataWeights().size(), dimWords * dimBatch);
auto weights = graph->constant({1, dimWords, dimBatch, 1},
inits::fromVector(batch->getDataWeights()));
return weights;
return weights; // [1, dimWords, dimBatch, 1] in case of word-level weights or
// [1, 1, dimBatch, 1] in case of sentence-level weights
}
} // namespace marian
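A worked check of the size guard above, with illustrative numbers: for dimBatch = 3 sentences under word-level weighting with dimWords = 7 target positions, getDataWeights() must hold 3 * 7 = 21 values and the constant has shape [1, 7, 3, 1]; under sentence-level weighting dimWords is 1, so 3 values suffice and the shape [1, 1, 3, 1] broadcasts over the time dimension.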


@ -10,7 +10,7 @@
#include "translator/scorers.h"
#include "data/alignment.h"
#include "data/vocab_base.h"
#include "graph/expression_graph_packable.h"
#include "tensors/cpu/fbgemm/expression_graph_packable.h"
#if USE_FBGEMM
#include "fbgemm/Utils.h"
@ -258,14 +258,14 @@ bool convertModel(std::string inputFile, std::string outputFile, int32_t targetP
graph->load(inputFile);
graph->forward();
std::string saveGemmType = "fp32default";
auto saveGemmType = Type::float32;
if (targetPrec == 16)
saveGemmType = "fp16packed";
saveGemmType = Type::packed16;
else if (targetPrec == 8)
saveGemmType = "int8packed";
saveGemmType = Type::packed8avx2; // We currently use avx2 by default.
// added a flag indicating whether the weights need to be packed or not
graph->packAndSave(outputFile, configStr.str(), saveGemmType); // @TODO: this should just be type-based
graph->packAndSave(outputFile, configStr.str(), saveGemmType);
std::cout << "Conversion Finished." << std::endl;


@ -47,13 +47,12 @@ public:
ABORT_IF(shortlist_, "How did a shortlist make it into training?");
const Words& data = subBatch->data();
Expr yData = graph_->indices(toWordIndexVector(data));
auto yShifted = shift(y, {1, 0, 0});
state->setTargetHistoryEmbeddings(yShifted);
state->setTargetMask(yMask);
const Words& data = subBatch->data();
state->setTargetWords(data);
}


@ -196,7 +196,7 @@ Ptr<DecoderState> EncoderDecoder::step(Ptr<ExpressionGraph> graph,
state = hypIndices.empty() ? state : state->select(hypIndices, batchIndices, beamSize);
// Fill state with embeddings based on last prediction
decoders_[0]->embeddingsFromPrediction(graph, state, words, batchIndices.size(), beamSize);
decoders_[0]->embeddingsFromPrediction(graph, state, words, (int) batchIndices.size(), beamSize);
auto nextState = decoders_[0]->step(graph, state);
return nextState;

src/models/states.h Normal file → Executable file

@ -9,7 +9,7 @@ namespace marian {
class EncoderState {
private:
Expr context_;
Expr mask_;
Expr mask_; // [beam depth=1, max length, batch size, vector dim=1] source mask
Ptr<data::CorpusBatch> batch_;
public:
@ -18,9 +18,9 @@ public:
EncoderState() {}
virtual Expr getContext() { return context_; }
virtual Expr getAttended() { return context_; }
virtual Expr getMask() { return mask_; }
virtual Expr getContext() const { return context_; }
virtual Expr getAttended() const { return context_; }
virtual Expr getMask() const { return mask_; } // source batch mask; may have additional positions suppressed
virtual const Words& getSourceWords() {
return batch_->front()->data();


@ -142,29 +142,6 @@ public:
return reshape(output, {dimBeam, dimBatch, dimSteps, dimModel});
}
// like affine() but with built-in parameters, activation, and dropout
static inline
Expr dense(Expr x, std::string prefix, std::string suffix, int outDim, const std::function<Expr(Expr)>& actFn = nullptr, float dropProb = 0.0f)
{
auto graph = x->graph();
auto W = graph->param(prefix + "_W" + suffix, { x->shape()[-1], outDim }, inits::glorotUniform());
auto b = graph->param(prefix + "_b" + suffix, { 1, outDim }, inits::zeros());
x = affine(x, W, b);
if (actFn)
x = actFn(x);
x = dropout(x, dropProb);
return x;
}
Expr layerNorm(Expr x, std::string prefix, std::string suffix = std::string()) const {
int dimModel = x->shape()[-1];
auto scale = graph_->param(prefix + "_ln_scale" + suffix, { 1, dimModel }, inits::ones());
auto bias = graph_->param(prefix + "_ln_bias" + suffix, { 1, dimModel }, inits::zeros());
return marian::layerNorm(x, scale, bias, 1e-6f);
}
Expr preProcess(std::string prefix, std::string ops, Expr input, float dropProb = 0.0f) const {
auto output = input;
for(auto op : ops) {
@ -192,7 +169,7 @@ public:
// highway connection
else if(op == 'h') {
int dimModel = input->shape()[-1];
auto t = dense(prevInput, prefix, /*suffix=*/"h", dimModel);
auto t = denseInline(prevInput, prefix, /*suffix=*/"h", dimModel);
output = highway(output, prevInput, t);
}
// layer normalization
@ -402,8 +379,8 @@ public:
// the stack of FF layers
for(int i = 1; i < depthFfn; ++i)
output = dense(output, prefix, /*suffix=*/std::to_string(i), dimFfn, actFn, ffnDropProb);
output = dense(output, prefix, /*suffix=*/std::to_string(depthFfn), dimModel);
output = denseInline(output, prefix, /*suffix=*/std::to_string(i), dimFfn, actFn, ffnDropProb);
output = denseInline(output, prefix, /*suffix=*/std::to_string(depthFfn), dimModel);
auto opsPost = opt<std::string>("transformer-postprocess");
output
@ -430,14 +407,14 @@ public:
// the stack of AAN layers
for(int i = 1; i < depthAan; ++i)
y = dense(y, prefix, /*suffix=*/std::to_string(i), dimAan, actFn, aanDropProb);
y = denseInline(y, prefix, /*suffix=*/std::to_string(i), dimAan, actFn, aanDropProb);
if(y->shape()[-1] != dimModel) // bring it back to the desired dimension if needed
y = dense(y, prefix, std::to_string(depthAan), dimModel);
y = denseInline(y, prefix, std::to_string(depthAan), dimModel);
bool noGate = opt<bool>("transformer-aan-nogate");
if(!noGate) {
auto gi = dense(x, prefix, /*suffix=*/"i", dimModel, (ActivationFunction*)sigmoid);
auto gf = dense(y, prefix, /*suffix=*/"f", dimModel, (ActivationFunction*)sigmoid);
auto gi = denseInline(x, prefix, /*suffix=*/"i", dimModel, (ActivationFunction*)sigmoid);
auto gf = denseInline(y, prefix, /*suffix=*/"f", dimModel, (ActivationFunction*)sigmoid);
y = gi * x + gf * y;
}
@ -482,7 +459,7 @@ public:
int /*startPos*/) const {
float dropoutRnn = inference_ ? 0.f : opt<float>("dropout-rnn");
if(!perLayerRnn[prefix]) // lazily created and cache RNNs in the docoder to avoid costly recreation @TODO: turn this into class members
if(!perLayerRnn[prefix]) // lazily create and cache RNNs in the decoder to avoid costly recreation @TODO: turn this into class members
perLayerRnn[prefix] = rnn::rnn(
"type", opt<std::string>("dec-cell"),
"prefix", prefix,
@ -533,29 +510,31 @@ public:
batchEmbeddings = addSpecialEmbeddings(batchEmbeddings, /*start=*/0, batch);
// reorganize batch and timestep
batchEmbeddings = atleast_nd(batchEmbeddings, 4);
batchMask = atleast_nd(batchMask, 4);
auto layer = transposeTimeBatch(batchEmbeddings); // [-4: beam depth=1, -3: batch size, -2: max length, -1: vector dim]
auto layerMask
= reshape(transposeTimeBatch(batchMask), {1, dimBatch, 1, dimSrcWords}); // [-4: beam depth=1, -3: batch size, -2: vector dim=1, -1: max length]
batchEmbeddings = atleast_nd(batchEmbeddings, 4); // [beam depth=1, max length, batch size, vector dim]
batchMask = atleast_nd(batchMask, 4); // [beam depth=1, max length, batch size, vector dim=1]
auto layer = transposeTimeBatch(batchEmbeddings); // [beam depth=1, batch size, max length, vector dim]
auto layerMask = transposeTimeBatch(batchMask); // [beam depth=1, batch size, max length, vector dim=1]
auto opsEmb = opt<std::string>("transformer-postprocess-emb");
float dropProb = inference_ ? 0 : opt<float>("transformer-dropout");
layer = preProcess(prefix_ + "_emb", opsEmb, layer, dropProb);
layerMask = transposedLogMask(layerMask); // [-4: batch size, -3: 1, -2: vector dim=1, -1: max length]
// LayerAttention expects mask in a different layout
layerMask = reshape(layerMask, {1, dimBatch, 1, dimSrcWords}); // [1, batch size, 1, max length]
layerMask = transposedLogMask(layerMask); // [batch size, num heads broadcast=1, max length broadcast=1, max length]
// apply encoder layers
// This is the Transformer Encoder stack.
auto encDepth = opt<int>("enc-depth");
for(int i = 1; i <= encDepth; ++i) {
layer = LayerAttention(prefix_ + "_l" + std::to_string(i) + "_self",
layer, // query
layer, // keys
layer, // values
layerMask);
layerMask); // [batch size, num heads broadcast=1, max length broadcast=1, max length]
layer = LayerFFN(prefix_ + "_l" + std::to_string(i) + "_ffn", layer);
checkpoint(layer); // sets a manually specified checkpoint if gradient checkpointing is enabled, does nothing otherwise.
}
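// Editorial note on the mask layout: the reshape to [1, batch size, 1, max length]
// lets the mask broadcast over the num-heads and query-position axes of the attention
// scores, and transposedLogMask() turns the multiplicative 0/1 mask into an additive
// one (0 for valid keys, a large negative value for padded keys), so that softmax
// assigns padded positions near-zero weight.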
// restore organization of batch and time steps. This is currently required
@ -698,12 +677,14 @@ public:
std::vector<Expr> encoderContexts;
std::vector<Expr> encoderMasks;
for(auto encoderState : state->getEncoderStates()) {
auto encoderContext = encoderState->getContext();
auto encoderMask = encoderState->getMask();
auto encoderContext = encoderState->getContext(); // encoder output
auto encoderMask = encoderState->getMask(); // note: may differ from Encoder self-attention mask in that additional positions are banned for cross-attention
encoderMask = atleast_nd(encoderMask, 4);
encoderContext = transposeTimeBatch(encoderContext); // [beam depth=1, batch size, max length, vector dim]
encoderMask = transposeTimeBatch(encoderMask); // [beam depth=1, max length, batch size, vector dim=1]
encoderContext = transposeTimeBatch(encoderContext); // [-4: beam depth=1, -3: batch size, -2: max length, -1: vector dim]
int dimSrcWords = encoderContext->shape()[-2];
// This would happen if something goes wrong during batch pruning.
@ -712,15 +693,17 @@ public:
encoderContext->shape()[-3],
dimBatch);
encoderMask = atleast_nd(encoderMask, 4);
encoderMask = reshape(transposeTimeBatch(encoderMask),
{1, dimBatch, 1, dimSrcWords});
encoderMask = transposedLogMask(encoderMask);
// LayerAttention expects mask in a different layout
encoderMask = reshape(encoderMask, { 1, dimBatch, 1, dimSrcWords }); // [1, batch size, 1, max length]
encoderMask = transposedLogMask(encoderMask); // [batch size, num heads broadcast=1, max length broadcast=1, max length]
if(dimBeam > 1)
encoderMask = repeat(encoderMask, dimBeam, /*axis=*/ -4);
encoderContexts.push_back(encoderContext);
encoderMasks.push_back(encoderMask);
checkpoint(encoderContext);
checkpoint(encoderMask);
}
rnn::States prevDecoderStates = state->getStates();
@ -756,6 +739,8 @@ public:
ABORT("Unknown auto-regressive layer type in transformer decoder {}",
layerType);
checkpoint(query);
// source-target attention
// Iterate over multiple encoders and simply stack the attention blocks
if(encoderContexts.size() > 0) {
@ -792,10 +777,14 @@ public:
}
}
checkpoint(query);
// remember decoder state
decoderStates.push_back(decoderState);
query = LayerFFN(prefix_ + "_l" + layerNo + "_ffn", query); // [-4: beam depth=1, -3: batch size, -2: max length, -1: vector dim]
checkpoint(query);
}
auto decoderContext = transposeTimeBatch(query); // [-4: beam depth=1, -3: max length, -2: batch size, -1: vector dim]
@ -831,6 +820,10 @@ public:
output_->clear();
cache_.clear();
alignments_.clear();
perLayerRnn_.clear(); // this needs to be cleared between batches.
// @TODO: figure out how to detect stale nodes i.e. nodes that are referenced,
// but where underlying memory has been deallocated by dropping all tensors
// from a TensorAllocator object. This can happen during ExpressionGraph::clear()
}
};

src/optimizers/optimizers.cpp Normal file → Executable file

@ -69,11 +69,10 @@ public:
for(auto device : devices) {
auto graph = New<ExpressionGraph>(true);
graph->setDevice(device);
auto precison = options_->get<std::vector<std::string>>("precision", {"float32"});
graph->setDefaultElementType(typeFromString(precison[0])); // only use first type, used for parameter type in graph
graph->setDevice(device);
graph->getBackend()->setClip(options_->get<float>("clip-gemm"));
if (device.type == DeviceType::cpu) {
graph->getBackend()->setOptimized(options_->get<bool>("optimize"));


@ -43,6 +43,23 @@ struct GRUFastNodeOp : public NaryNodeOp {
const std::string type() override { return "GRU-ops"; }
const std::string color() override { return "yellow"; }
virtual size_t hash() override {
size_t seed = NaryNodeOp::hash();
util::hash_combine(seed, final_);
return seed;
}
virtual bool equal(Expr node) override {
if(!NaryNodeOp::equal(node))
return false;
auto cnode = std::dynamic_pointer_cast<GRUFastNodeOp>(node);
if(!cnode)
return false;
if(final_ != cnode->final_)
return false;
return true;
}
};
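// Editorial note: hash() and equal() feed the expression graph's node cache, so two
// GRU-ops that differ only in final_ must not be merged. Any NodeOp carrying extra
// state should follow this pattern: hash_combine every such member in hash() and
// compare the same members in equal().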
Expr gruOps(const std::vector<Expr>& nodes, bool final) {


@ -5,15 +5,6 @@
namespace marian {
// GEMM type enum
typedef enum {
Auto = 0, // auto tuning between available GEMMs
MklFp32 = 1, // MKL based GEMM, fp32
IntrinInt16 = 2, // Intrinsic implementation of Int 16 GEMM
FbFp16Packed = 10, // FBGEMM based fp16 GEMM with packing
FbInt8Packed = 11 // FBGEMM based int8 GEMM with packing
} GemmType;
class Backend {
protected:
DeviceId deviceId_;


@ -12,7 +12,6 @@ namespace cpu {
class Backend : public marian::Backend {
protected:
bool optimized_{false};
GemmType gemmType_{GemmType::Auto};
public:
Backend(DeviceId deviceId, size_t seed) : marian::Backend(deviceId, seed) {}


@ -99,8 +99,10 @@ void elementFloat(const Functor& functor, marian::Tensor out, Tensors... tensors
if(div8) {
// std::cerr << "8: " << functor.to_string() << std::endl;
#ifdef __AVX__
element<float32x8>(functor, out, tensors...);
return;
#endif
}
if(div4) {


@ -1,205 +0,0 @@
#pragma once
#include "graph/node.h"
#include "tensors/cpu/sharp/packed_gemm.h"
#if USE_FBGEMM
#ifdef __GNUC__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-variable"
#endif
#include "3rd_party/fbgemm/include/fbgemm/FbgemmFP16.h"
#ifdef __GNUC__
#pragma GCC diagnostic pop
#endif
using namespace fbgemm;
#endif // USE_FBGEMM
namespace marian {
namespace cpu {
namespace variant {
// Enumeration for the Matrix used in pack functions
// A matrix - 0, B matrix - 1
enum class PackMatrix : uint8_t {
A = 0x00,
B = 0x01
};
// Pack a matrix into cache utilization efficient way (block format)
// PackMatrix packMat_: the type of packed matrix - A or B matrix
// bool transpose_: transpose
// int nrow_: the number of rows
// int ncol_: the number of columns
// int kernel_ncol_blocks_: the number of column blocks
// int brow_: the number of rows in a block
// int bcol_: the number of columns in a block
// int last_brow_: the number of rows in the last block
// int nbrow_: row index in a block
// int nbcol_: column index in a block
// uint64_t packsize_: the size of the packed matrix
// (the number of fp16 elements + padding (1024) + extra temporary memory (256))
struct PackNodeOp : public UnaryNodeOp {
PackMatrix packMat_;
bool transpose_;
int nrow_;
int ncol_;
int kernel_ncol_blocks_;
int brow_;
int bcol_;
int last_brow_;
int nbrow_;
int nbcol_;
uint64_t packsize_;
PackNodeOp(Expr a, PackMatrix packMat, bool transpose, float clipValue)
: UnaryNodeOp(a, newShape(a, transpose), Type::uint8),
packMat_(packMat),
transpose_(transpose) {
if(packMat != PackMatrix::B)
ABORT("Only prepacking of B (weight matrix) is supported");
if(clipValue != 0)
ABORT("Clipping is not supported");
if(!memoize_)
ABORT("Only constant weight node can be packed");
}
NodeOps forwardOps() override {
return {NodeOp(PackFp32(val_,
child(0)->val()->data(),
transpose_,
nrow_,
ncol_,
kernel_ncol_blocks_,
brow_,
bcol_,
last_brow_,
nbrow_,
nbcol_,
packsize_))
};
}
NodeOps backwardOps() override {
ABORT("PackNodeOp only available for inference");
return {NodeOp(0)};
}
const std::string type() override { return "packMat"; }
Shape newShape(Expr a, bool transpose) {
#if USE_FBGEMM
auto shapeMat = a->shape();
// Should be 2D - weight matrix
ABORT_IF(shapeMat.size() != 2,
"Weight Matrix should be 2D");
PackInfoFp32(shapeMat,
transpose,
nrow_,
ncol_,
kernel_ncol_blocks_,
brow_,
bcol_,
last_brow_,
nbrow_,
nbcol_,
packsize_);
Shape outShape({(int)packsize_});
return outShape;
#else // USE_FBGEMM
ABORT("Packed GEMM requires a build with USE_FBGEMM enabled");
return Shape();
#endif // USE_FBGEMM
}
};
// Affine transform (matrix multiplication) using packed B matrix
// float scalar_: scalar multiplier
// size_t m_: the number of rows in A and C
// size_t n_: the number of columns in B and C
// size_t k_: the number of columns in A and the number of rows in C
// bool transA_: transpose A
// bool transB_: transpose B
class AffineNodeOp : public NaryNodeOp {
private:
float scalar_;
size_t m_;
size_t n_;
size_t k_;
bool transA_;
bool transB_;
public:
AffineNodeOp(const std::vector<Expr>& nodes, Shape bShape, bool transA, bool transB, float scalar)
: NaryNodeOp(nodes, newShape(nodes[0], bShape, transA, transB), Type::float32),
scalar_(scalar) {
transA_ = transA;
transB_ = transB;
m_ = nodes[0]->shape().elements() / nodes[0]->shape()[-1];
k_ = nodes[0]->shape().back();
if(transA)
std::swap(m_, k_);
size_t l = bShape.elements() / bShape[-1];
n_ = bShape[-1];
if(transB)
std::swap(l, n_);
}
Shape newShape(Expr a, Shape bShape, bool transA, bool transB) {
auto shapeA = a->shape();
if(transA) {
shapeA.set(shapeA.size() - 2, a->shape()[shapeA.size() - 1]);
shapeA.set(shapeA.size() - 1, a->shape()[shapeA.size() - 2]);
}
auto shapeB = bShape;
if(transB) {
shapeB.set(shapeB.size() - 2, bShape[shapeB.size() - 1]);
shapeB.set(shapeB.size() - 1, bShape[shapeB.size() - 2]);
}
Shape outShape = shapeA;
outShape.set(outShape.size() - 1, shapeB[shapeB.size() - 1]);
ABORT_IF(shapeA[shapeA.size() - 1] != shapeB[shapeB.size() - 2],
"Matrix product requires inner dimensions to match");
return outShape;
}
NodeOps forwardOps() override {
return {
NodeOp(GemmPackFp32(val_,
child(0)->val(),
child(1)->val(),
child(2)->val(),
m_,
n_,
transA_))
};
}
NodeOps backwardOps() override {
ABORT("Only used for inference");
return {NodeOp(0)};
}
const std::string type() override { return "fp16packed"; }
};
static inline Expr affine(Expr a, Expr b, Shape bShape, Expr c, bool transA, bool transB, float scalar) {
std::vector<Expr> nodes = {a, b, c};
return Expression<cpu::variant::AffineNodeOp>(nodes, bShape, transA, transB, scalar);
}
static inline Expr pack(Expr a, PackMatrix packMat, bool transpose, float clipValue) {
return Expression<cpu::variant::PackNodeOp>(a, packMat, transpose, clipValue);
}
} // namespace variant
} // namespace cpu
} // namespace marian


@ -0,0 +1,408 @@
#pragma once
#include "graph/node.h"
#include "packed_gemm.h"
#include "tensors/cpu/sharp/int_gemm.h"
#if USE_FBGEMM
#ifdef __GNUC__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-variable"
#endif
#include "3rd_party/fbgemm/include/fbgemm/FbgemmFP16.h"
#ifdef __GNUC__
#pragma GCC diagnostic pop
#endif
using namespace fbgemm;
#endif // USE_FBGEMM
namespace marian {
namespace cpu {
namespace variant {
// Enumeration for the Matrix used in pack functions
// A matrix - 0, B matrix - 1
enum class PackMatrix : uint8_t {
A = 0x00,
B = 0x01
};
// Pack a matrix (fp16) into cache utilization efficient way (block format) together with quantization into fp16
// PackMatrix packMat_: the type of packed matrix - A or B matrix
// bool transpose_: transpose
// int nrow_: the number of rows
// int ncol_: the number of columns
// int kernel_ncol_blocks_: the number of column blocks
// int brow_: the number of rows in a block
// int bcol_: the number of columns in a block
// int last_brow_: the number of rows in the last block
// int nbrow_: row index in a block
// int nbcol_: column index in a block
// uint64_t packsize_: the size of the packed matrix
// (the number of fp16 elements + padding (1024) + extra temporary memory (256))
struct FbgemmPacked16PackNodeOp : public UnaryNodeOp {
PackMatrix packMat_;
bool transpose_;
int nrow_;
int ncol_;
int kernel_ncol_blocks_;
int brow_;
int bcol_;
int last_brow_;
int nbrow_;
int nbcol_;
uint64_t packsize_;
FbgemmPacked16PackNodeOp(Expr a, PackMatrix packMat, bool transpose, float clipValue)
: UnaryNodeOp(a, newShape(a, transpose), Type::uint8),
packMat_(packMat),
transpose_(transpose) {
if(packMat != PackMatrix::B)
ABORT("Only prepacking of B (weight matrix) is supported");
if(clipValue != 0)
ABORT("Clipping is not supported");
if(!memoize_)
ABORT("Only constant weight node can be packed");
}
NodeOps forwardOps() override {
#if USE_FBGEMM
return {NodeOp(fbgemmPacked16Pack(val_,
child(0)->val()->data(),
transpose_,
nrow_,
ncol_,
kernel_ncol_blocks_,
brow_,
bcol_,
last_brow_,
nbrow_,
nbcol_,
packsize_))
};
#else // USE_FBGEMM
ABORT("FbgemmPacked16PackNodeOp can only be used with FBGEMM enabled.");
return { NodeOp(0) };
#endif // USE_FBGEMM
}
NodeOps backwardOps() override {
ABORT("FbgemmPacked16PackNodeOp only available for inference");
return {NodeOp(0)};
}
const std::string type() override { return "packMatFp16"; }
Shape newShape(Expr a, bool transpose) {
#if USE_FBGEMM
auto shapeMat = a->shape();
// Should be 2D - weight matrix
ABORT_IF(shapeMat.size() != 2,
"Weight Matrix should be 2D");
fbgemmPacked16PackInfo(shapeMat,
transpose,
nrow_,
ncol_,
kernel_ncol_blocks_,
brow_,
bcol_,
last_brow_,
nbrow_,
nbcol_,
packsize_);
Shape outShape({(int)packsize_});
return outShape;
#else // USE_FBGEMM
ABORT("Packed GEMM requires a build with USE_FBGEMM enabled");
return Shape();
#endif // USE_FBGEMM
}
};
// Pack a matrix (int8) into cache utilization efficient way (block format) together with quantization into int8
// PackMatrix packMat_: the type of packed matrix - A or B matrix
// marian::Type packType_: the type the input matrix is packed - packed8avx2 or packed8avx512
// bool transpose_: transpose
// int nrow_: the number of rows
// int ncol_: the number of columns
// uint64_t packsize_: the size of the packed matrix
// (the size of int8 packed B from fbgemm:PackAWithQuantRowOffset + quantization scale, offset and zero point)
struct FbgemmPacked8PackNodeOp : public UnaryNodeOp {
PackMatrix packMat_;
marian::Type packType_;
bool transpose_;
int nrow_;
int ncol_;
uint64_t packsize_;
FbgemmPacked8PackNodeOp(Expr a,
PackMatrix packMat,
marian::Type packType,
bool transpose,
float clipValue)
: UnaryNodeOp(a, newShape(a, transpose), Type::uint8),
packMat_(packMat),
packType_(packType),
transpose_(transpose) {
if(packMat != PackMatrix::B)
ABORT("Only prepacking of B (weight matrix) is supported");
if(clipValue != 0)
ABORT("Clipping is not supported");
if(!memoize_)
ABORT("Only constant weight node can be packed");
}
NodeOps forwardOps() override {
#if USE_FBGEMM
return {NodeOp(fbgemmPacked8Pack(val_,
child(0)->val()->data(),
packType_,
transpose_,
nrow_,
ncol_,
packsize_))
};
#else // USE_FBGEMM
ABORT("FbgemmPacked8PackNodeOp can only be used with FBGEMM enabled.");
return { NodeOp(0) };
#endif // USE_FBGEMM
}
NodeOps backwardOps() override {
ABORT("FbgemmPacked8PackNodeOp only available for inference");
return {NodeOp(0)};
}
const std::string type() override { return "packMatInt8"; }
Shape newShape(Expr a, bool transpose) {
#if USE_FBGEMM
fbgemmPacked8PackInfo(a->shape(), packType_, transpose, nrow_, ncol_, packsize_);
Shape outShape({(int)packsize_});
return outShape;
#else // USE_FBGEMM
ABORT("Packed GEMM requires a build with USE_FBGEMM enabled");
return Shape();
#endif // USE_FBGEMM
}
};
// Affine transform (matrix multiplication) using packed B matrix
// float scalar_: scalar multiplier
// size_t m_: the number of rows in A and C
// size_t n_: the number of columns in B and C
// size_t k_: the number of columns in A and the number of rows in C
// bool transA_: transpose A
// bool transB_: transpose B
class FbgemmPacked16AffineNodeOp : public NaryNodeOp {
private:
float scalar_;
size_t m_;
size_t n_;
size_t k_;
bool transA_;
bool transB_;
public:
FbgemmPacked16AffineNodeOp(const std::vector<Expr>& nodes, Shape bShape, bool transA, bool transB, float scalar)
: NaryNodeOp(nodes, newShape(nodes[0], bShape, transA, transB), Type::float32),
scalar_(scalar) {
transA_ = transA;
transB_ = transB;
m_ = nodes[0]->shape().elements() / nodes[0]->shape()[-1];
k_ = nodes[0]->shape().back();
if(transA)
std::swap(m_, k_);
size_t l = bShape.elements() / bShape[-1];
n_ = bShape[-1];
if(transB)
std::swap(l, n_);
}
Shape newShape(Expr a, Shape bShape, bool transA, bool transB) {
auto shapeA = a->shape();
if(transA) {
shapeA.set(shapeA.size() - 2, a->shape()[shapeA.size() - 1]);
shapeA.set(shapeA.size() - 1, a->shape()[shapeA.size() - 2]);
}
auto shapeB = bShape;
if(transB) {
shapeB.set(shapeB.size() - 2, bShape[shapeB.size() - 1]);
shapeB.set(shapeB.size() - 1, bShape[shapeB.size() - 2]);
}
Shape outShape = shapeA;
outShape.set(outShape.size() - 1, shapeB[shapeB.size() - 1]);
ABORT_IF(shapeA[shapeA.size() - 1] != shapeB[shapeB.size() - 2],
"Matrix product requires inner dimensions to match");
return outShape;
}
NodeOps forwardOps() override {
#if USE_FBGEMM
return {
NodeOp(fbgemmPacked16Gemm(val_,
child(0)->val(),
child(1)->val(),
children().size() > 2 ? child(2)->val() : nullptr, // pass only if it has a bias
m_,
n_,
transA_))
};
#else // USE_FBGEMM
ABORT("FbgemmPacked16AffineNodeOp can only be used with FBGEMM enabled.");
return { NodeOp(0) };
#endif // USE_FBGEMM
}
NodeOps backwardOps() override {
ABORT("Only used for inference");
return {NodeOp(0)};
}
const std::string type() override { return "gemmPacked16"; }
};
// Affine transform (matrix multiplication) using packed B matrix
// Especially, this gemm performs quantized gemms in 8-bit integers.
// float scalar_: scalar multiplier
// size_t m_: the number of rows in A and C
// size_t n_: the number of columns in B and C
// size_t k_: the number of columns in A and the number of rows in C
// bool transA_: transpose A
// bool transB_: transpose B
class FbgemmPacked8AffineNodeOp : public NaryNodeOp {
private:
float scalar_;
size_t m_;
size_t n_;
size_t k_;
bool transA_;
bool transB_;
public:
FbgemmPacked8AffineNodeOp(const std::vector<Expr>& nodes, Shape bShape, bool transA, bool transB, float scalar)
: NaryNodeOp(nodes, newShape(nodes[0], bShape, transA, transB), Type::float32),
scalar_(scalar) {
transA_ = transA;
transB_ = transB;
m_ = nodes[0]->shape().elements() / nodes[0]->shape()[-1];
k_ = nodes[0]->shape().back();
if(transA)
std::swap(m_, k_);
size_t l = bShape.elements() / bShape[-1];
n_ = bShape[-1];
if(transB)
std::swap(l, n_);
}
Shape newShape(Expr a, Shape bShape, bool transA, bool transB) {
auto shapeA = a->shape();
if(transA) {
shapeA.set(shapeA.size() - 2, a->shape()[shapeA.size() - 1]);
shapeA.set(shapeA.size() - 1, a->shape()[shapeA.size() - 2]);
}
auto shapeB = bShape;
if(transB) {
shapeB.set(shapeB.size() - 2, bShape[shapeB.size() - 1]);
shapeB.set(shapeB.size() - 1, bShape[shapeB.size() - 2]);
}
Shape outShape = shapeA;
outShape.set(outShape.size() - 1, shapeB[shapeB.size() - 1]);
ABORT_IF(shapeA[shapeA.size() - 1] != shapeB[shapeB.size() - 2],
"Matrix product requires inner dimensions to match");
return outShape;
}
NodeOps forwardOps() override {
NodeOps nodeOps;
#if USE_FBGEMM
// Do addBias only if it has a bias term
if (children().size() > 2) {
nodeOps = { NodeOp(fbgemmPacked8Gemm(val_,
child(0)->val(),
child(1)->val(),
m_,
n_,
k_,
transA_,
transB_);
marian::cpu::int16::AddBias(val_, child(2)->val())) };
} else {
nodeOps = { NodeOp(fbgemmPacked8Gemm(val_,
child(0)->val(),
child(1)->val(),
m_,
n_,
k_,
transA_,
transB_)) };
}
#else // USE_FBGEMM
ABORT("FbgemmPacked8AffineNodeOp can only be used with FBGEMM enabled.");
#endif // USE_FBGEMM
return nodeOps;
}
NodeOps backwardOps() override {
ABORT("Only used for inference");
return {NodeOp(0)};
}
const std::string type() override { return "gemmPacked8"; }
};
static inline Expr affine(Expr a, Expr b, Shape bShape, Expr c, bool transA, bool transB, float scalar) {
std::vector<Expr> nodes = {a, b, c};
Type elementType = b->value_type();
if (elementType == Type::packed16)
return Expression<cpu::variant::FbgemmPacked16AffineNodeOp>(nodes, bShape, transA, transB, scalar);
else if (isPacked(elementType) && sizeOf(elementType) == 1)
return Expression<cpu::variant::FbgemmPacked8AffineNodeOp>(nodes, bShape, transA, transB, scalar);
else {
ABORT("Only int8 and fp16 are available. {}", elementType);
return nullptr;
}
}
static inline Expr pack(Type elementType, Expr a, PackMatrix packMat, bool transpose, float clipValue) {
if (elementType == Type::packed16)
return Expression<cpu::variant::FbgemmPacked16PackNodeOp>(a, packMat, transpose, clipValue);
else if (isPacked(elementType) && sizeOf(elementType) == 1)
return Expression<cpu::variant::FbgemmPacked8PackNodeOp>(a, packMat, elementType, transpose, clipValue);
else {
ABORT("Only int8 and fp16 are available. {}", elementType);
return nullptr;
}
}
static inline Expr dot(Expr a, Expr b, Shape bShape, bool transA, bool transB, float scalar) {
std::vector<Expr> nodes = {a, b};
Type elementType = b->value_type();
if (elementType == Type::packed16)
return Expression<cpu::variant::FbgemmPacked16AffineNodeOp>(nodes, bShape, transA, transB, scalar);
else if (isPacked(elementType) && sizeOf(elementType) == 1)
return Expression<cpu::variant::FbgemmPacked8AffineNodeOp>(nodes, bShape, transA, transB, scalar);
else {
ABORT("Only int8 and fp16 are available. {}", elementType);
return nullptr;
}
}
} // namespace variant
} // namespace cpu
} // namespace marian


@ -1,7 +1,7 @@
#pragma once
#include "graph/expression_graph.h"
#include "tensors/cpu/sharp/packed_gemm.h"
#include "packed_gemm.h"
namespace marian {
@ -20,7 +20,7 @@ public:
// Convert model weights into packed format and save to IO items.
// @TODO: review this
void packAndSave(const std::string& name, const std::string& meta, std::string& saveGemmType, Type saveElementType = Type::float32) {
void packAndSave(const std::string& name, const std::string& meta, Type gemmElementType = Type::float32, Type saveElementType = Type::float32) {
std::vector<io::Item> ioItems;
// sorted by name in std::map
@ -35,15 +35,62 @@ public:
Tensor val = p.second->val();
// save as packed format
// @TODO Hardcoded to find packable weights - all the weights used for affine op
if (saveGemmType == "fp16packed" && pName.find("_W") == pName.length() - 3) {
// @TODO Hardcoded to find packable weights - all the weights used for affine op (fp16), all the weights used for affine op and dot op (int8)
if ((gemmElementType == Type::packed8avx2 || gemmElementType == Type::packed8avx512)
&& (pName.find("_W") == pName.length() - 3 || pName.find("_W") == pName.length() - 2)) {
#if USE_FBGEMM
using namespace marian::cpu::variant;
// packing information - size
int nrow;
int ncol;
uint64_t packsize;
fbgemmPacked8PackInfo(val->shape(),
gemmElementType,
pName.find("Wemb") != std::string::npos,
nrow,
ncol,
packsize);
auto allocator = New<TensorAllocator>(getBackend());
// buffer tensor to save packed matrix
Tensor packedTensor;
allocator->allocate(packedTensor, { 1, (int32_t)packsize }, Type::uint8);
//Pack B matrix into int8
fbgemmPacked8Pack(packedTensor,
val->data(),
gemmElementType,
pName.find("Wemb") != std::string::npos,
nrow,
ncol,
packsize);
io::Item item;
item.name = pName;
item.shape = val->shape();
item.type = gemmElementType;
// Use the actual memory as this will be aligned and padded.
// When memory mapping this is required. Shape keeps track of
// tensor size. Saving to *.npz will cut to size.
auto mem = packedTensor->memory();
item.bytes.resize(mem->size());
copy(backend_, mem->data<char>(), mem->data<char>() + mem->size(), item.bytes.data());
ioItems.emplace_back(std::move(item));
#else
ABORT("Packed type {} only supported when compiled with -DUSE_FBGEMM=on", gemmElementType);
#endif
} else if (gemmElementType == Type::packed16 && pName.find("_W") == pName.length() - 3) {
#if USE_FBGEMM
using namespace marian::cpu::variant;
// packing information
int nrow, ncol, kernel_ncol_blocks, brow, bcol, last_brow, nbrow, nbcol;
uint64_t packsize;
PackInfoFp32(val->shape(),
fbgemmPacked16PackInfo(val->shape(),
false,
nrow,
ncol,
@ -60,8 +107,8 @@ public:
Tensor packedTensor;
allocator->allocate(packedTensor, { 1, (int32_t)packsize }, Type::uint8);
// PackFp32
PackFp32(packedTensor,
// fbgemmPacked16Pack
fbgemmPacked16Pack(packedTensor,
val->data(),
false,
nrow,
@ -76,7 +123,7 @@ public:
io::Item item;
item.name = pName;
item.shape = val->shape();
item.type = Type::packed16;
item.type = gemmElementType;
// Use the actual memory as this will be aligned and padded.
// When memory mapping this is required. Shape keeps track of
@ -86,6 +133,9 @@ public:
copy(backend_, mem->data<char>(), mem->data<char>() + mem->size(), item.bytes.data());
ioItems.emplace_back(std::move(item));
#else
ABORT("Packed type {} only supported when compiled with -DUSE_FBGEMM=on", gemmElementType);
#endif
} else {
io::Item item;
val->get(item, pName);

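A usage sketch of the reworked, type-based packing interface (editorial; this mirrors the converter change earlier in this commit, and the device setup and file names are illustrative):

auto graph = New<ExpressionGraphPackable>();
graph->setDevice(CPU0);    // packing is a CPU-side operation
graph->load("model.npz");
graph->forward();          // materialize parameter tensors before packing
// pack eligible *_W weights into int8 AVX2 blocks and save them as packed8avx2 items
graph->packAndSave("model.packed8avx2.bin", /*meta=*/"", Type::packed8avx2);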

@ -0,0 +1,550 @@
#include "packed_gemm.h"
#include "tensors/tensor_allocator.h"
#include "tensors/tensor_operators.h"
#include <emmintrin.h>
#include <immintrin.h>
#include <tmmintrin.h>
#include <xmmintrin.h>
#include <cassert>
#include <cstddef>
#include <unordered_map>
//#include <chrono>
#if USE_FBGEMM
#ifdef _MSC_VER
#pragma warning(disable: 4505) // 'fbgemmAlignedAlloc' in fbgemm.h: unreferenced local function has been removed (missing 'static inline')
#pragma warning(disable: 4251) // 'fbgemm::CompressedSparseColumn::colptr_': class 'std::vector<int,std::allocator<_Ty>>' needs to have dll-interface to be used by clients of class 'fbgemm::CompressedSparseColumn'
#pragma warning(disable: 4661) // 'fbgemm::PackMatrix<fbgemm::PackBMatrix<int8_t,int32_t>,int8_t,int32_t>::PackMatrix(int32_t,int32_t,inpType *,int,const fbgemm::BlockingFactors *)': no suitable definition provided for explicit template instantiation request
#pragma warning(disable: 4244) // fbgemm\quantutils.h(51): warning C4244: 'return': conversion from 'const _Ty' to 'T2', possible loss of data
#pragma warning(disable: 4717) // 'fbgemm::PackMatrix<fbgemm::PackAWithQuantRowOffset<unsigned char,int>,unsigned char,int>::isThisLastKBlock': recursive on all control paths, function will cause runtime stack overflow
// the following does not work; need to manually disable them in Linker options
//#pragma comment(linker, "/ignore:4049") // locally defined symbol ...asmjit... imported
//#pragma comment(linker, "/ignore:4217") // locally defined symbol ...asmjit... imported
#endif
#ifdef __GNUC__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-variable"
#endif
#include "3rd_party/fbgemm/include/fbgemm/FbgemmFP16.h"
#include "3rd_party/fbgemm/include/fbgemm/QuantUtils.h"
#include "3rd_party/fbgemm/include/fbgemm/Fbgemm.h"
#ifdef __GNUC__
#pragma GCC diagnostic pop
#endif
#ifdef _OPENMP
#include <omp.h>
#endif
#if MKL_FOUND
#include <mkl.h>
#include <mkl_types.h>
#endif
using namespace fbgemm;
#endif // USE_FBGEMM
namespace marian {
namespace cpu {
namespace variant { // Variants of GEMM implementations
#if USE_FBGEMM
// initialize with a dummy
// When this class is instantiated, the actual packing operation happens. If we created
// this instance on every GEMM call, we would pack every time, which is very slow.
// In Caffe2, the operator is stateful and hold an instance of this.
// But, we don't have any logic for this in marian. We can only cache a tensor (which means a memory chunk).
// So, for now, we keep the packed memory on our own 1D tensor, then when we call GEMM,
// we just reuse this instance again and again by replacing the class members (including memory pointer). Eventually,
// I will add a new constructor to the class in FBGEMM which accepts
// pre-allocated and pre-packed memory as a parameter. After it's done,
// this temporary buffer will be removed.
// When constructing this dummy buffer, ones are used for all the parameters to allocate the minimum amount of memory.
//
// In a multi marian instance setting (as a dynamic library),
// different marian instances should not share this variable.
static thread_local PackedGemmMatrixFP16 packedPlaceholder(1, 1, 1, 1, 1, 1, 1, 1);
// Copied code from fbgemm. It's padding required by some kernels in FBGEMM.
// Verbatim - 'required by sw pipelined kernels'
// https://github.com/marian-nmt/FBGEMM/blob/master/include/fbgemm/FbgemmFP16.h#L109
const int PACK16_PADDING = 1024;
// This is a memory space to store auxiliary variables for FBGEMM (e.g. block row, block column, kernel_ncol_blocks, etc.)
const int PACK16_SPECIALMEM = 256;
// This is copied from FBGEMM code
// A better way?
// will be removed when the FBGEMM API is changed
// blocked row-major format address arithmetic
/**
* Returns the memory address in the packed (block formatted) matrix array of a specific element
* indexed by the original non-packed array.
*
* @param r_ row index in the original matrix
* @param c_ column index in the original matrix
* @param brow_ row wide block index
* @param bcol_ column wide block index
* @param nbrow_ number of blocks in row
* @param nbcol_ number of blocks in column
* @param last_brow_ row number of the last block
*/
inline uint64_t addr(const int r_,
const int c_,
const int brow_,
const int bcol_,
const int nbrow_,
const int nbcol_,
const int last_brow_) {
uint64_t r = (uint64_t)r_;
uint64_t c = (uint64_t)c_;
uint64_t block_row_id = r / brow_;
uint64_t brow_offset = (block_row_id * nbcol_) * (brow_ * bcol_);
uint64_t block_col_id = c / bcol_;
uint64_t bcol_offset
= block_col_id * ((block_row_id != nbrow_ - 1) ? (brow_ * bcol_) : (last_brow_ * bcol_));
uint64_t block_offset = brow_offset + bcol_offset;
uint64_t inblock_offset = r % brow_ * bcol_ + c % bcol_;
uint64_t index = block_offset + inblock_offset;
return index;
}
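// Editorial worked example: a 4x4 matrix packed with brow_ = 2, bcol_ = 2 gives
// nbrow_ = 2, nbcol_ = 2, last_brow_ = 2. For element (r_, c_) = (3, 1):
//   block_row_id = 1  -> brow_offset = (1 * 2) * (2 * 2) = 8
//   block_col_id = 0  -> bcol_offset = 0
//   inblock_offset = (3 % 2) * 2 + (1 % 2) = 3
// so addr(3, 1, 2, 2, 2, 2, 2) = 11: element (3,1) sits at position (1,1) of block (1,0).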
// Memory blocking factors (parameters) for packing into AVX2 int8
static const fbgemm::BlockingFactors Packed8Avx2BlockingFactors = {
PackingTraits<int8_t, int32_t, inst_set_t::avx2>::MR,
PackingTraits<int8_t, int32_t, inst_set_t::avx2>::NR,
PackingTraits<int8_t, int32_t, inst_set_t::avx2>::NR_MIN,
PackingTraits<int8_t, int32_t, inst_set_t::avx2>::ROW_INTERLEAVE,
PackingTraits<int8_t, int32_t, inst_set_t::avx2>::MCB,
PackingTraits<int8_t, int32_t, inst_set_t::avx2>::KCB,
PackingTraits<int8_t, int32_t, inst_set_t::avx2>::NCB
};
// Memory blocking factors (parameters) for packing into AVX512 int8
static const fbgemm::BlockingFactors Packed8Avx512BlockingFactors = {
PackingTraits<int8_t, int32_t, inst_set_t::avx512>::MR,
PackingTraits<int8_t, int32_t, inst_set_t::avx512>::NR,
PackingTraits<int8_t, int32_t, inst_set_t::avx512>::NR_MIN,
PackingTraits<int8_t, int32_t, inst_set_t::avx512>::ROW_INTERLEAVE,
PackingTraits<int8_t, int32_t, inst_set_t::avx512>::MCB,
PackingTraits<int8_t, int32_t, inst_set_t::avx512>::KCB,
PackingTraits<int8_t, int32_t, inst_set_t::avx512>::NCB
};
// This function returns the correct blocking factors structure for given packing type.
inline const fbgemm::BlockingFactors* getBlockingFactors(marian::Type packType) {
if(packType == Type::packed8avx2) {
return &Packed8Avx2BlockingFactors;
} else if(packType == Type::packed8avx512) {
return &Packed8Avx512BlockingFactors;
} else {
ABORT("Only avx2 and avx512 instruction sets are supported for int8. {}", packType);
}
}
void fbgemmPacked16PackInfo(const marian::Shape& shape,
const bool transpose,
uint64_t& packsize) {
int nrow, ncol, kernel_ncol_blocks, brow = 512, bcol, last_brow, nbrow, nbcol;
fbgemmPacked16PackInfo(shape, transpose, nrow, ncol, kernel_ncol_blocks, brow, bcol, last_brow, nbrow, nbcol, packsize);
}
void fbgemmPacked16PackInfo(const marian::Shape& shape,
const bool transpose,
int& nrow,
int& ncol,
int& kernel_ncol_blocks,
int& brow,
int& bcol,
int& last_brow,
int& nbrow,
int& nbcol,
uint64_t& packsize) {
nrow = transpose ? shape[1] : shape[0];
ncol = transpose ? shape[0] : shape[1];
kernel_ncol_blocks = 2;
brow = 512;
bcol = 8 * kernel_ncol_blocks;
last_brow = nrow % brow == 0 ? brow : nrow % brow;
nbrow = nrow % brow == 0 ? nrow / brow : (nrow + brow) / brow;
nbcol = ncol % bcol == 0 ? ncol / bcol : (ncol + bcol) / bcol;
ABORT_IF(ncol % bcol != 0, "ncol (number of columns) should be multiple of 16. {}", ncol);
packsize = ((nbrow * brow) * (nbcol * bcol)) * sizeof(fbgemm::float16) + PACK16_PADDING
+ PACK16_SPECIALMEM;
}
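// Editorial worked example with illustrative sizes: a 512x512 weight matrix (no
// transpose) gives brow = 512, bcol = 8 * 2 = 16, nbrow = 1, nbcol = 32 and
// last_brow = 512, so
//   packsize = (1 * 512) * (32 * 16) * sizeof(float16) + 1024 + 256
//            = 512 * 512 * 2 + 1280 = 525568 bytes.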
void fbgemmPacked8PackInfo(const marian::Shape& shape,
const marian::Type packType,
const bool transpose,
int& nrow,
int& ncol,
uint64_t& packsize) {
// Should be 2D - weight matrix
ABORT_IF(shape.size() != 2,
"Weight Matrix should be 2D");
nrow = transpose ? shape[1] : shape[0];
ncol = transpose ? shape[0] : shape[1];
const fbgemm::BlockingFactors* params = getBlockingFactors(packType);
packsize = fbgemm::PackMatrix<fbgemm::PackBMatrix<int8_t>, int8_t>::packedBufferSize(
transpose ? shape[1] : shape[0],
transpose ? shape[0] : shape[1], params);
// add extra space for storing some other variables specific to B matrix
// quantization scales: 1 per column, float
// quantization offsets: 1 per column, int32
// column offsets: 1 per column, int32
packsize += ncol * (sizeof(float) + sizeof(int32_t) + sizeof(int32_t));
}
// This function computes the offset values for each column, which are used to compensate for the remainders of quantized values
// More detailed math is available in the FBGEMM blog - https://engineering.fb.com/ml-applications/fbgemm/
inline void col_offsets_with_zero_pt_s8acc32(
bool transpose,
int K,
int N,
const int8_t* Bint8,
const int32_t* B_zero_point,
int32_t* col_offsets,
int ncols_per_quant_group) {
for (int n = 0; n < N; ++n) {
int32_t sum = 0;
for (int k = 0; k < K; ++k) {
sum += transpose ? Bint8[k + n * K] : Bint8[k * N + n];
}
col_offsets[n] = sum - B_zero_point[n / ncols_per_quant_group] * K;
}
}
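// Editorial note on the compensation: with activation quantization (scale s_a, zero
// point z_a) and per-column weight quantization (s_b[j], z_b[j]),
//   sum_k (A_q[i][k] - z_a) * (B_q[k][j] - z_b[j])
//     = sum_k A_q[i][k] * B_q[k][j] - z_a * col_offsets[j] - z_b[j] * row_offsets[i],
// with col_offsets[j] = sum_k B_q[k][j] - z_b[j] * K as computed above; the
// requantization step in fbgemmPacked8Gemm consumes exactly these offsets.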
void fbgemmPacked16Pack(marian::Tensor out,
const float* inData, // Packing is only available for 2D weight matrix in Marian. Otherwise, it's aborted in expanded_gemm.h.
const bool transpose,
const int nrow,
const int ncol,
const int kernel_ncol_blocks,
const int brow,
const int bcol,
const int last_brow,
const int nbrow,
const int nbcol,
const uint64_t packsize) {
// initialize memory
uint8_t* outmemorg = out->data<uint8_t>();
for(auto i = 0; i < packsize; i++) {
outmemorg[i] = 0;
}
// save the other auxiliary variables
uint64_t* auxmemsize = (uint64_t*)outmemorg;
auxmemsize[0] = packsize;
// save FBGEMM related parameters into the header of the allocated memory by marian
int32_t header[8];
header[0] = nrow;
header[1] = ncol;
header[2] = kernel_ncol_blocks;
header[3] = brow;
header[4] = bcol;
header[5] = last_brow;
header[6] = nbrow;
header[7] = nbcol;
memcpy(auxmemsize + 1, header, sizeof(header));
// cast to float16
fbgemm::float16* outmem = (fbgemm::float16*)(outmemorg + 256);
fbgemm::float16* dummy = new fbgemm::float16;
// pack the matrix
for(int i = 0; i < nrow; i++) {
for(int j = 0; j < ncol; j++) {
outmem[addr(i, j, brow, bcol, nbrow, nbcol, last_brow)]
= tconv(!transpose ? inData[i * ncol + j] : inData[i + nrow * j], *dummy);
}
}
delete dummy;
}
void fbgemmPacked8Pack(marian::Tensor out,
const float* inData,
const marian::Type packType,
const bool transpose,
const int nrow,
const int ncol,
const uint64_t packsize) {
int k = nrow;
int n = ncol;
int len = k * n;
// 1. collect stats for each column
float* bqScale = new float[n];
int32_t* bqZeropoint = new int32_t[n];
const float* data = inData;
float val = 0;
if (transpose) {
for (int jj = 0; jj < n; jj++) {
float min = std::numeric_limits<float>::max(), max = std::numeric_limits<float>::min();
double mean = 0, sqrsum = 0;
for (int ii = 0; ii < k; ii++) {
val = data[jj * k + ii];
mean += val;
sqrsum += val * val;
}
mean /= k;
sqrsum /= k;
sqrsum -= mean * mean;
sqrsum = sqrt(sqrsum);
min = (float)(mean - 7.0f*sqrsum);
max = (float)(mean + 7.0f*sqrsum);
bqScale[jj] = (max - min) / 255;
bqZeropoint[jj] = (int32_t)(127 - max / bqScale[jj]);
}
} else {
for (int jj = 0; jj < n; jj++) {
float min = std::numeric_limits<float>::max(), max = std::numeric_limits<float>::min();
double mean = 0, sqrsum = 0;
for (int ii = 0; ii < k; ii++) {
val = data[jj + ii * n];
mean += val;
sqrsum += val * val;
}
mean /= k;
sqrsum /= k;
sqrsum -= mean * mean;
sqrsum = sqrt(sqrsum);
min = (float)(mean - 7.0f*sqrsum);
max = (float)(mean + 7.0f*sqrsum);
bqScale[jj] = (max - min) / 255;
bqZeropoint[jj] = (int32_t)(127 - max / bqScale[jj]);
}
}
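// Editorial worked example of the range heuristic above: a column with mean 0 and
// standard deviation 0.1 gets min = -0.7 and max = 0.7 (mean -/+ 7 sigma), hence
// bqScale = 1.4 / 255 ~ 0.0055 and bqZeropoint = (int32_t)(127 - 0.7 / 0.0055) ~ 0,
// centering the clipped range on the int8 domain.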
// 2. quantize
int8_t* quantized = 0;
#ifdef _MSC_VER
quantized = (int8_t*)_aligned_malloc(len, 256);
#else
int result = posix_memalign((void**)&quantized, 256, len); result;
assert(result == 0);
#endif
for (int jj = 0; jj < n; jj++) {
TensorQuantizationParams bQuantParam;
bQuantParam.scale = bqScale[jj];
bQuantParam.zero_point = bqZeropoint[jj];
bQuantParam.precision = 8;
if (transpose)
fbgemm::Quantize<int8_t>(data + jj * k, quantized + jj * k, k, bQuantParam);
else {
for (int ii = 0; ii < k; ii++) {
quantized[ii*n + jj] = fbgemm::Quantize<int8_t>(data[ii*n + jj], bQuantParam);
}
}
}
// 3. compute column offsets
int32_t* col_offsets = new int32_t[n];
col_offsets_with_zero_pt_s8acc32(transpose, k, n, quantized, bqZeropoint, col_offsets, 1);
int8_t* packedbuf = out->data<int8_t>();
for(auto i = 0; i < packsize; i++) {
packedbuf[i] = 0;
}
// 4. packing
const fbgemm::BlockingFactors* params = getBlockingFactors(packType);
PackBMatrix<int8_t> packedBN(
transpose ? matrix_op_t::Transpose : matrix_op_t::NoTranspose,
nrow, ncol, quantized, transpose ? nrow : ncol, packedbuf, 1, params);
// copy quantization scale
memcpy(packedbuf + (packsize - n * (sizeof(float) + sizeof(int32_t) + sizeof(int32_t))), bqScale, n * sizeof(float));
// copy quantization offset
memcpy(packedbuf + (packsize - n * (sizeof(int32_t) + sizeof(int32_t))), bqZeropoint, n * sizeof(int32_t));
// copy column offsets to the memory
memcpy(packedbuf + (packsize - n * sizeof(int32_t)), col_offsets, n * sizeof(int32_t));
#ifdef _MSC_VER
_aligned_free(quantized);
#else
free(quantized);
#endif
delete[] col_offsets;
delete[] bqScale;
delete[] bqZeropoint;
}
// GEMM operation on the packed B matrix
// C: output matrix
// A: A matrix
// B: B matrix (packed)
// m: the number of rows in A and C
// n: the number of columns in B and C
// transA: transpose of A matrix
// B is already packed. So, we don't need transB
void fbgemmPacked16Gemm(marian::Tensor C,
const marian::Tensor A,
const marian::Tensor B,
const marian::Tensor bias,
const size_t m,
const size_t n,
const int transA) {
// row major
// keep the original mem
fbgemm::float16* pmat = packedPlaceholder.pmat_;
// retrieve aux fields from the memory
uint64_t* packedmemSize = (uint64_t*)B->data();
packedPlaceholder.size_ = packedmemSize[0];
int32_t header[8];
memcpy(header, packedmemSize + 1, sizeof(header));
packedPlaceholder.nrow_ = header[0];
packedPlaceholder.ncol_ = header[1];
packedPlaceholder.kernel_ncol_blocks_ = header[2];
packedPlaceholder.brow_ = header[3];
packedPlaceholder.bcol_ = header[4];
packedPlaceholder.last_brow_ = header[5];
packedPlaceholder.nbrow_ = header[6];
packedPlaceholder.nbcol_ = header[7];
// packed matrix
packedPlaceholder.pmat_ = (fbgemm::float16*)(B->data<uint8_t>() + 256);
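// Broadcast the bias row into every row of C; the GEMM below is then called
// with beta = 1.0f so that A*B is accumulated on top of the bias.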
if(bias != nullptr) {
#if MKL_FOUND
for(int i = 0; i < m; ++i) {
mkl_somatcopy('R', 'N', 1, n, 1, bias->data(), n, C->data() + n * i, n);
}
#else
for(int i = 0; i < m; ++i) {
std::copy(bias->data(), bias->data() + n, C->data() + n * i);
}
#endif
}
#ifdef _OPENMP
#pragma omp parallel
#endif
{
#ifdef _OPENMP
int num_threads = omp_get_num_threads();
int tid = omp_get_thread_num();
#else
int num_threads = 1;
int tid = 0;
#endif
fbgemm::cblas_gemm_compute(transA ? matrix_op_t::Transpose : matrix_op_t::NoTranspose,
(int)m,
A->data(),
packedPlaceholder,
bias != nullptr ? 1.0f : 0.0f,
C->data(),
tid,
num_threads);
}
// return back the original mem
packedPlaceholder.pmat_ = pmat;
}
// GEMM operation on the packed B matrix in 8 bit integers
// C: output matrix
// A: A matrix
// B: B matrix (packed)
// m: the number of rows in A and C
// n: the number of columns in B and C
// k: the number of columns in A and the number of rows in B
// transA: whether A matrix is transposed or not
// transB: whether B matrix is transposed or not
void fbgemmPacked8Gemm(marian::Tensor C,
const marian::Tensor A,
const marian::Tensor B,
const size_t m,
const size_t n,
const size_t k,
const int transA,
const int transB) {
// pack type
marian::Type packType = B->type();
const fbgemm::BlockingFactors* params = getBlockingFactors(packType);
if((packType == Type::packed8avx2 && fbgemmHasAvx512Support())
|| (packType == Type::packed8avx512 && !fbgemmHasAvx512Support())) {
ABORT("FBGEMM doesn't allow to use {} packing order on {} CPUs",
packType == Type::packed8avx2 ? "AVX2" : "AVX512",
fbgemmHasAvx512Support() ? "AVX512" : "AVX2");
}
// compute range to quantize A (activations) - (min/max quantization)
float min_est = std::numeric_limits<float>::max(), max_est = std::numeric_limits<float>::lowest();
int elem = A->shape().elements();
float* data = A->data();
// AVX based find min/max
FindMinMax(data, &min_est, &max_est, elem);
float ascale = (max_est - min_est) / 255;
int32_t azeropoint = (int32_t)(255 - max_est / ascale);
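// e.g. [min_est, max_est] = [-1, 4] gives ascale = 5/255 = 1/51 and
// azeropoint = 255 - 4*51 = 51, so q = x/ascale + azeropoint maps
// -1 -> 0 and 4 -> 255, covering the full uint8 range.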
std::vector<int32_t> row_offset_buf(PackAWithQuantRowOffset<uint8_t>::rowOffsetBufferSize());
PackAWithQuantRowOffset<uint8_t> packAN(
transA ? matrix_op_t::Transpose : matrix_op_t::NoTranspose,
(int32_t)(transA ? k : m),
(int32_t)(transA ? m : k),
A->data(),
(int32_t)(transA ? m : k),
nullptr, /*buffer for packed matrix*/
ascale,
azeropoint,
1, /*groups*/
row_offset_buf.data(),
params);
// packed matrix size of B
int bPackSize = PackMatrix<PackBMatrix<int8_t>, int8_t>::packedBufferSize((int32_t)k, (int32_t)n);
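// The tail of the packed buffer (filled during packing above) stores the per-column
// quantization parameters, i.e. packsize = bPackSize + n * (sizeof(float) + 2 * sizeof(int32_t)).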
// retrieve B matrix
int8_t* bdata = B->data<int8_t>();
float* bqScale = new float[n];
memcpy(bqScale, bdata + bPackSize, n * sizeof(float));
int32_t* bqZeropoint = new int32_t[n];
memcpy(bqZeropoint, bdata + bPackSize + n * sizeof(float), n * sizeof(int32_t));
int32_t* col_offsets = new int32_t[n];
memcpy(col_offsets, bdata + bPackSize + n * (sizeof(float) + sizeof(int32_t)), n * sizeof(int32_t));
DoNothing<float, float> doNothingObj{};
ReQuantizeForFloat<false, QuantizationGranularity::OUT_CHANNEL> outputProcObj(
doNothingObj,
ascale,
bqScale,
azeropoint,
bqZeropoint,
packAN.getRowOffsetBuffer(),
col_offsets,
nullptr,
(std::uint32_t) n);
PackBMatrix<int8_t> repackedBN(
transB ? matrix_op_t::Transpose : matrix_op_t::NoTranspose, (int32_t) k, (int32_t) n, bdata, (int32_t) (transB ? k : n), 1, params);
// gemm computation
fbgemmPacked(packAN, repackedBN, C->data(), (int32_t*)C->data(), (int32_t) n, outputProcObj, 0, 1, params);
delete[] col_offsets;
delete[] bqZeropoint;
delete[] bqScale;
}
#endif // USE_FBGEMM
} // namespace variant
} // namespace cpu
} // namespace marian
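
For reference, the memcpy arithmetic in the int8 packing routine above implies a fixed tail layout of the packed buffer, which the GEMM reads back in the same order. A minimal sketch of that layout (the struct and helper are illustrative only, not part of marian):

#include <cstddef>
#include <cstdint>

// Tail layout implied by the memcpy offsets above:
// [ packed B | n x float scale | n x int32 zero point | n x int32 column offsets ]
struct PackedInt8TailOffsets { // hypothetical helper for illustration
  size_t scale;      // byte offset of the per-column scales
  size_t zeroPoint;  // byte offset of the per-column zero points
  size_t colOffsets; // byte offset of the per-column offsets
};

inline PackedInt8TailOffsets tailOffsets(size_t packsize, size_t n) {
  return { packsize - n * (sizeof(float) + sizeof(int32_t) + sizeof(int32_t)),
           packsize - n * (sizeof(int32_t) + sizeof(int32_t)),
           packsize - n * sizeof(int32_t) };
}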

View File

@ -0,0 +1,141 @@
#pragma once
#include "tensors/tensor.h"
namespace marian {
namespace cpu {
namespace variant { // Variants of GEMM implementations
// Returns the byte size of the packed matrix in fp16, calculated by fbgemm's internal logic to account for padding and layout differences.
// Packing with fp16 only targets AVX2 instruction sets for now.
// See '3rd_party/fbgemm/include/fbgemm/FbgemmFP16.h'.
// shape: shape of the tensor to be packed
// transpose: the matrix is transposed
// packsize (out): the size of the packed matrix in byte
void fbgemmPacked16PackInfo(const marian::Shape& shape,
const bool transpose,
/*out*/uint64_t& packsize);
// Returns the byte size of the packed matrix in fp16, calculated by fbgemm's internal logic to account for padding and layout differences.
// This overload additionally returns the packing parameters (block sizes and counts).
// Packing with fp16 only targets AVX2 instruction sets for now.
// See '3rd_party/fbgemm/include/fbgemm/FbgemmFP16.h'.
// shape: shape of the tensor to be packed
// transpose: the matrix is transposed
// nrow (out): the number of rows
// ncol (out): the number of columns
// kernel_ncol_blocks (out): the number of column blocks
// brow (out): the number of rows in a block
// bcol (out): the number of columns in a block
// last_brow (out): the number of rows in the last block
// nbrow (out): the number of row blocks
// nbcol (out): the number of column blocks
// packsize (out): the size of the packed matrix in byte
void fbgemmPacked16PackInfo(const marian::Shape& shape,
const bool transpose,
/*out*/int& nrow,
/*out*/int& ncol,
/*out*/int& kernel_ncol_blocks,
/*out*/int& brow,
/*out*/int& bcol,
/*out*/int& last_brow,
/*out*/int& nbrow,
/*out*/int& nbcol,
/*out*/uint64_t& packsize); // @TODO: change to size_t where appropriate
// Returns the byte size of the packed matrix in int8, calculated by fbgemm's internal logic to account for padding and layout differences.
// See '3rd_party/fbgemm/src/PackBMatrix.cc'.
// shape: shape of the tensor to be packed
// packType: Type to be packed - packed8avx2 or packed8avx512
// transpose: the matrix is transposed
// nrow (out): the number of rows
// ncol (out): the number of columns
// packsize (out): the size of the packed matrix in byte
void fbgemmPacked8PackInfo(const marian::Shape& shape,
const marian::Type packType,
const bool transpose,
/*out*/int& nrow,
/*out*/int& ncol,
/*out*/uint64_t& packsize);
// Pack a matrix into a cache-efficient blocked format, converting the fp32 input to fp16
// out: output tensor - packed format
// inData: input tensor data - pointer of float data
// transpose: the matrix is transposed
// nrow: the number of rows
// ncol: the number of columns
// kernel_ncol_blocks: the number of column blocks
// brow: the number of rows in a block
// bcol: the number of columns in a block
// last_brow: the number of rows in the last block
// nbrow: the number of row blocks
// nbcol: the number of column blocks
// packsize: the size of the packed matrix
// (the number of fp16 elements + padding (1024) + extra temporary memory (256))
void fbgemmPacked16Pack(marian::Tensor out,
const float* inData,
const bool transpose,
const int nrow,
const int ncol,
const int kernel_ncol_blocks,
const int brow,
const int bcol,
const int last_brow,
const int nbrow,
const int nbcol,
const uint64_t packsize); // @TODO: change to size_t where appropriate
// Pack a matrix into a cache-efficient blocked format, quantizing the fp32 input to int8
// out: output tensor - packed format and quantized into int8
// inData: input tensor data - pointer of float data
// packType: Type to be packed - packed8avx2 or packed8avx512
// transpose: the matrix is transposed
// nrow: the number of rows
// ncol: the number of columns
// packsize: the size of the packed matrix
// (the size of int8 packed B from fbgemm:PackAWithQuantRowOffset + quantization scale, offset and zero point)
void fbgemmPacked8Pack(marian::Tensor out,
const float* inData,
const marian::Type packType,
const bool transpose,
const int nrow,
const int ncol,
const uint64_t packsize); // @TODO: change to size_t where appropriate
// GEMM operation on the packed B matrix
// C: output matrix
// A: A matrix
// B: B matrix (packed)
// m: the number of rows in A and C
// n: the number of columns in B and C
// transA: transpose of A matrix
// B is already packed. So, we don't need transB
void fbgemmPacked16Gemm(marian::Tensor C,
const marian::Tensor A,
const marian::Tensor B,
const marian::Tensor bias,
const size_t m,
const size_t n,
const int transA = 0);
// GEMM operation on the packed B matrix in 8 bit integers
// C: output matrix
// A: A matrix
// B: B matrix (packed)
// m: the number of rows in A and C
// n: the number of columns in B and C
// k: the number of columns in A and rows in B
// transA: transpose of A matrix
// transB: transpose of B matrix
void fbgemmPacked8Gemm(marian::Tensor C,
const marian::Tensor A,
const marian::Tensor B,
const size_t m,
const size_t n,
const size_t k,
const int transA = 0,
const int transB = 0);
} // namespace variant
} // namespace cpu
} // namespace marian
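
For orientation, a hedged sketch of the intended call sequence for the int8 path, using only the declarations above (tensor creation and allocation elided; variable names are illustrative):

// Hypothetical usage of the int8 packing API declared above.
int nrow = 0, ncol = 0;
uint64_t packsize = 0;
// 1. query the packed buffer size for the weight matrix B
fbgemmPacked8PackInfo(B->shape(), marian::Type::packed8avx2,
                      /*transpose=*/false, nrow, ncol, packsize);
// 2. allocate a 1-D int8 tensor 'packedB' of packsize bytes (elided),
//    then quantize and pack B into it
fbgemmPacked8Pack(packedB, B->data(), marian::Type::packed8avx2,
                  /*transpose=*/false, nrow, ncol, packsize);
// 3. C[m x n] = A[m x k] * B[k x n]; requantization is handled internally
fbgemmPacked8Gemm(C, A, packedB, m, n, k);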

View File

@ -1,313 +0,0 @@
#include "packed_gemm.h"
#include "tensors/tensor_allocator.h"
#include "tensors/tensor_operators.h"
#include <emmintrin.h>
#include <immintrin.h>
#include <tmmintrin.h>
#include <xmmintrin.h>
#include <cassert>
#include <cstddef>
#include <unordered_map>
//#include <chrono>
#if USE_FBGEMM
#ifdef _MSC_VER
#pragma warning(disable: 4505) // 'fbgemmAlignedAlloc' in fbgemm.h: unreferenced local function has been removed (missing 'static inline')
#pragma warning(disable: 4251) // 'fbgemm::CompressedSparseColumn::colptr_': class 'std::vector<int,std::allocator<_Ty>>' needs to have dll-interface to be used by clients of class 'fbgemm::CompressedSparseColumn'
// the following does not work; need to manually disable them in Linker options
//#pragma comment(linker, "/ignore:4049") // locally defined symbol ...asmjit... imported
//#pragma comment(linker, "/ignore:4217") // locally defined symbol ...asmjit... imported
#endif
#ifdef __GNUC__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-variable"
#endif
#include "3rd_party/fbgemm/include/fbgemm/FbgemmFP16.h"
#include "3rd_party/fbgemm/include/fbgemm/QuantUtils.h"
#include "3rd_party/fbgemm/include/fbgemm/Fbgemm.h"
#ifdef __GNUC__
#pragma GCC diagnostic pop
#endif
#ifdef _OPENMP
#include <omp.h>
#endif
#if MKL_FOUND
#include <mkl.h>
#include <mkl_types.h>
#endif
using namespace fbgemm;
#endif // USE_FBGEMM
namespace marian {
namespace cpu {
namespace variant { // Variants of GEMM implementations
#if USE_FBGEMM
// initialize with a dummy
// When this class is instantiated, the actual packing operation happens.
// If we created this instance every time we call GEMM, we would repack every
// time, which is very slow.
// In Caffe2, the operator is stateful and holds an instance of this class.
// But we don't have any logic for this in marian; we can only cache a tensor
// (which means a memory chunk). So, for now, we keep the packed memory in our
// own 1D tensor, and when we call GEMM we just reuse this instance again and
// again by replacing its class members (including the memory pointer).
// Eventually, a new constructor will be added to the class in FBGEMM which
// accepts pre-allocated and pre-packed memory as a parameter; after that is
// done, this temporary buffer will be removed.
// When constructing this dummy buffer, ones are used for all the parameters
// to allocate the minimum amount of memory.
//
// In a multi marian instance setting (as a dynamic library),
// different marian instances should not share this variable.
static thread_local PackedGemmMatrixFP16 packedPlaceholder(1, 1, 1, 1, 1, 1, 1, 1);
// Copied from fbgemm: padding required by some kernels in FBGEMM
// Verbatim - 'required by sw pipelined kernels'
// https://github.com/marian-nmt/FBGEMM/blob/master/include/fbgemm/FbgemmFP16.h#L109
const int PACK16_PADDING = 1024;
// This is a memory space to store auxiliary variables for FBGEMM (e.g. block row, block column, kernel_ncol_blocks, etc.)
const int PACK16_SPECIALMEM = 256;
// This is copied from FBGEMM code (is there a better way?);
// it will be removed when the FBGEMM API changes.
// blocked row-major format address arithmetic
/**
* Returns the memory address in the packed (block formatted) matrix array of a specific element
* indexed by the original non-packed array.
*
* @param r_ row index in the original matrix
* @param c_ column index in the original matrix
* @param brow_ number of rows in a block
* @param bcol_ number of columns in a block
* @param nbrow_ number of blocks along the rows
* @param nbcol_ number of blocks along the columns
* @param last_brow_ number of rows in the last block
*/
inline uint64_t addr(const int r_,
const int c_,
const int brow_,
const int bcol_,
const int nbrow_,
const int nbcol_,
const int last_brow_) {
uint64_t r = (uint64_t)r_;
uint64_t c = (uint64_t)c_;
uint64_t block_row_id = r / brow_;
uint64_t brow_offset = (block_row_id * nbcol_) * (brow_ * bcol_);
uint64_t block_col_id = c / bcol_;
uint64_t bcol_offset
= block_col_id * ((block_row_id != nbrow_ - 1) ? (brow_ * bcol_) : (last_brow_ * bcol_));
uint64_t block_offset = brow_offset + bcol_offset;
uint64_t inblock_offset = r % brow_ * bcol_ + c % bcol_;
uint64_t index = block_offset + inblock_offset;
return index;
}
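// Worked example: brow_=2, bcol_=2, nbrow_=2, nbcol_=2, last_brow_=2 describes
// a 4x4 matrix split into 2x2 blocks. For (r_=2, c_=1): brow_offset = 1*2*4 = 8,
// bcol_offset = 0, inblock_offset = 0*2 + 1 = 1, so the element lands at
// index 9 - blocks are laid out row by row, row-major within each block.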
void PackInfoFp32(const marian::Shape& shape,
const bool transpose,
uint64_t& packsize) {
int nrow, ncol, kernel_ncol_blocks, brow = 512, bcol, last_brow, nbrow, nbcol;
PackInfoFp32(shape, transpose, nrow, ncol, kernel_ncol_blocks, brow, bcol, last_brow, nbrow, nbcol, packsize);
}
void PackInfoFp32(const marian::Shape& shape,
const bool transpose,
int& nrow,
int& ncol,
int& kernel_ncol_blocks,
int& brow,
int& bcol,
int& last_brow,
int& nbrow,
int& nbcol,
uint64_t& packsize) {
nrow = transpose ? shape[1] : shape[0];
ncol = transpose ? shape[0] : shape[1];
kernel_ncol_blocks = 2;
brow = 512;
bcol = 8 * kernel_ncol_blocks;
last_brow = nrow % brow == 0 ? brow : nrow % brow;
nbrow = nrow % brow == 0 ? nrow / brow : (nrow + brow) / brow;
nbcol = ncol % bcol == 0 ? ncol / bcol : (ncol + bcol) / bcol;
ABORT_IF(ncol % bcol != 0, "ncol (number of columns) should be a multiple of 16, but is {}", ncol);
packsize = ((nbrow * brow) * (nbcol * bcol)) * sizeof(fbgemm::float16) + PACK16_PADDING
+ PACK16_SPECIALMEM;
}
void PackFp32(marian::Tensor out,
const float* inData, // Packing is only available for 2D weight matrix in Marian. Otherwise, it's aborted in expanded_gemm.h.
const bool transpose,
const int nrow,
const int ncol,
const int kernel_ncol_blocks,
const int brow,
const int bcol,
const int last_brow,
const int nbrow,
const int nbcol,
const uint64_t packsize) {
// initialize memory
uint8_t* outmemorg = out->data<uint8_t>();
for(auto i = 0; i < packsize; i++) {
outmemorg[i] = 0;
}
// save the other auxiliary variables
uint64_t* auxmemsize = (uint64_t*)outmemorg;
auxmemsize[0] = packsize;
// save FBGEMM related parameters into the header of the allocated memory by marian
int32_t header[8];
header[0] = nrow;
header[1] = ncol;
header[2] = kernel_ncol_blocks;
header[3] = brow;
header[4] = bcol;
header[5] = last_brow;
header[6] = nbrow;
header[7] = nbcol;
memcpy(auxmemsize + 1, header, sizeof(header));
// cast to float16
fbgemm::float16* outmem = (fbgemm::float16*)(outmemorg + 256);
fbgemm::float16* dummy = new fbgemm::float16;
// pack the matrix
for(int i = 0; i < nrow; i++) {
for(int j = 0; j < ncol; j++) {
outmem[addr(i, j, brow, bcol, nbrow, nbcol, last_brow)]
= tconv(!transpose ? inData[i * ncol + j] : inData[i + nrow * j], *dummy);
}
}
delete dummy;
}
// GEMM operation on the packed B matrix
// C: output matrix
// A: A matrix
// B: B matrix (packed)
// m: the number of rows in A and C
// n: the number of columns in B and C
// transA: transpose of A matrix
// B is already packed. So, we don't need transB
void GemmPackFp32(marian::Tensor C,
const marian::Tensor A,
const marian::Tensor B,
const marian::Tensor bias,
const size_t m,
const size_t n,
const int transA) {
// row major
// keep the original mem
fbgemm::float16* pmat = packedPlaceholder.pmat_;
// retrieve aux fields from the memory
uint64_t* packedmemSize = (uint64_t*)B->data();
packedPlaceholder.size_ = packedmemSize[0];
int32_t header[8];
memcpy(header, packedmemSize + 1, sizeof(header));
packedPlaceholder.nrow_ = header[0];
packedPlaceholder.ncol_ = header[1];
packedPlaceholder.kernel_ncol_blocks_ = header[2];
packedPlaceholder.brow_ = header[3];
packedPlaceholder.bcol_ = header[4];
packedPlaceholder.last_brow_ = header[5];
packedPlaceholder.nbrow_ = header[6];
packedPlaceholder.nbcol_ = header[7];
// packed matrix
packedPlaceholder.pmat_ = (fbgemm::float16*)(B->data<uint8_t>() + 256);
if(bias != nullptr) {
#if MKL_FOUND
for(int i = 0; i < m; ++i) {
mkl_somatcopy('R', 'N', 1, n, 1, bias->data(), n, C->data() + n * i, n);
}
#else
for(int i = 0; i < m; ++i) {
std::copy(bias->data(), bias->data() + n, C->data() + n * i);
}
#endif
}
#ifdef _OPENMP
#pragma omp parallel
#endif
{
#ifdef _OPENMP
int num_threads = omp_get_num_threads();
int tid = omp_get_thread_num();
#else
int num_threads = 1;
int tid = 0;
#endif
fbgemm::cblas_gemm_compute(transA ? matrix_op_t::Transpose : matrix_op_t::NoTranspose,
(int)m,
A->data(),
packedPlaceholder,
bias != nullptr ? 1.0f : 0.0f,
C->data(),
tid,
num_threads);
}
// return back the original mem
packedPlaceholder.pmat_ = pmat;
}
#else // USE_FBGEMM
void PackInfoFp32(const marian::Shape& shape,
const bool transpose,
uint64_t& packsize) {
// does nothing. supports only FBGEMM based packed gemm at this moment.
ABORT("FBGEMM is needed to use packed GEMM.");
}
void PackInfoFp32(const marian::Shape& shape,
const bool transpose,
int& nrow,
int& ncol,
int& kernel_ncol_blocks,
int& brow,
int& bcol,
int& last_brow,
int& nbrow,
int& nbcol,
uint64_t& packsize) {
// does nothing. supports only FBGEMM based packed gemm at this moment.
ABORT("FBGEMM is needed to use packed GEMM.");
}
void PackFp32(marian::Tensor out,
const float* inData,
const bool transpose,
const int nrow,
const int ncol,
const int kernel_ncol_blocks,
const int brow,
const int bcol,
const int last_brow,
const int nbrow,
const int nbcol,
const uint64_t packsize) {
// does nothing. supports only FBGEMM based packed gemm at this moment.
ABORT("FBGEMM is needed to use packed GEMM.");
}
void GemmPackFp32(marian::Tensor C,
const marian::Tensor A,
const marian::Tensor B,
const marian::Tensor bias,
const size_t m,
const size_t n,
const int transA) {
// does nothing. supports only FBGEMM based packed gemm at this moment.
ABORT("FBGEMM is needed to use packed GEMM.");
}
#endif // USE_FBGEMM
} // namespace variant
} // namespace cpu
} // namespace marian

View File

@ -1,70 +0,0 @@
#pragma once
#include "tensors/tensor.h"
namespace marian {
namespace cpu {
namespace variant { // Variants of GEMM implementations
void PackInfoFp32(const marian::Shape& shape,
const bool transpose,
/*out*/uint64_t& packsize);
void PackInfoFp32(const marian::Shape& shape,
const bool transpose,
int& nrow,
int& ncol,
int& kernel_ncol_blocks,
int& brow,
int& bcol,
int& last_brow,
int& nbrow,
int& nbcol,
/*out*/uint64_t& packsize); // @TODO: change to size_t where appropriate
// Pack a matrix into a cache-efficient blocked format
// out: output tensor - packed format
// inData: input tensor data - pointer of float data
// transpose: the matrix is transposed
// nrow: the number of rows
// ncol: the number of columns
// kernel_ncol_blocks: the number of column blocks
// brow: the number of rows in a block
// bcol: the number of columns in a block
// last_brow: the number of rows in the last block
// nbrow: the number of row blocks
// nbcol: the number of column blocks
// packsize: the size of the packed matrix
// (the number of fp16 elements + padding (1024) + extra temporary memory (256))
void PackFp32(marian::Tensor out,
const float* inData,
const bool transpose,
const int nrow,
const int ncol,
const int kernel_ncol_blocks,
const int brow,
const int bcol,
const int last_brow,
const int nbrow,
const int nbcol,
const uint64_t packsize); // @TODO: change to size_t where appropriate
// GEMM operation on the packed B matrix
// C: output matrix
// A: A matrix
// B: B matrix (packed)
// m: the number of rows in A and C
// n: the number of columns in B and C
// transA: transpose of A matrix
// B is already packed. So, we don't need transB
void GemmPackFp32(marian::Tensor C,
const marian::Tensor A,
const marian::Tensor B,
const marian::Tensor bias,
const size_t m,
const size_t n,
const int transA = 0);
} // namespace variant
} // namespace cpu
} // namespace marian

View File

@ -425,9 +425,13 @@ void Softmax(Tensor out, Tensor in) {
matchOrAbort<float>(out->type());
matchOrAbort<float>(in->type());
#ifdef __AVX__
if(out->shape()[-1] % 8 == 0) {
Softmax<float32x8>(out, in);
} else if(out->shape()[-1] % 4 == 0) {
return;
}
#endif
if(out->shape()[-1] % 4 == 0) {
Softmax<float32x4>(out, in);
} else {
Softmax<float>(out, in);
@ -477,9 +481,13 @@ void LogSoftmax(Tensor out, Tensor in) {
matchOrAbort<float>(out->type());
matchOrAbort<float>(in->type());
#ifdef __AVX__
if(out->shape()[-1] % 8 == 0) {
LogSoftmax<float32x8>(out, in);
} else if(out->shape()[-1] % 4 == 0) {
return;
}
#endif
if(out->shape()[-1] % 4 == 0) {
LogSoftmax<float32x4>(out, in);
} else {
LogSoftmax<float>(out, in);
@ -678,20 +686,22 @@ void Select(Tensor out,
// @TODO: make this efficient
functional::Shape outShape = out->shape();
functional::Shape inShape = in->shape();
functional::Shape inShape = in->shape();
functional::Shape idxShape = indices->shape();
int length = outShape.elements();
functional::Array<int, functional::Shape::size()> dims;
int axisCPU = (int)(axis + functional::Shape::size() - out->shape().size());
if(axisCPU == 2) // specialization for axis==2, assuming N=4
if(axisCPU == 2 && outShape == idxShape) // specialization for axis==2 when there is no broadcasting, @TODO to be removed once we have a faster implementation below
return SelectAxis2(out, in, indices);
for(int index = 0; index < length; ++index) {
outShape.dims(index, dims);
dims[axisCPU] = (int)indices->data<IndexType>()[dims[axisCPU]];
int inIndex = inShape.index(dims);
out->data()[index] = in->data()[inIndex];
outShape.dims(index, dims); // compute dimension-based indices from global index;
int idxIndex = idxShape.bindex(dims); // return global index for indices based on dimension-specific indices from out, take broadcasting into account;
dims[axisCPU] = (int)indices->data<IndexType>()[idxIndex]; // substitute index of out-tensor with corresponding axis-local position from in-tensor;
int inIndex = inShape.index(dims); // compute global index from dimension-specific indices, no broadcasting as out and in match in all dimensions apart from axis
out->data()[index] = in->data()[inIndex]; // assign corresponding values.
}
}
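In effect, Select is a broadcasting gather along one axis. A minimal self-contained 2-D sketch of the same indexing scheme (hypothetical helper; the axis is fixed to the columns, and an index array with a single column broadcasts across all output columns, as idxShape.bindex does above):

#include <vector>

// out[r][c] = in[r][ indices[r][c'] ], where c' = 0 if the index array has a
// single column (broadcast), else c' = c.
void select2d(std::vector<float>& out, const std::vector<float>& in,
              const std::vector<int>& indices,
              int rows, int outCols, int inCols, int idxCols) {
  for(int r = 0; r < rows; ++r)
    for(int c = 0; c < outCols; ++c) {
      int cIdx = (idxCols == 1) ? 0 : c;       // bindex-style broadcasting
      int idx = indices[r * idxCols + cIdx];   // axis-local position in 'in'
      out[r * outCols + c] = in[r * inCols + idx];
    }
}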
@ -704,7 +714,8 @@ void Insert(Tensor out,
// @TODO: make this efficient
functional::Shape outShape = out->shape();
functional::Shape inShape = in->shape();
functional::Shape inShape = in->shape();
functional::Shape idxShape = indices->shape();
int length = inShape.elements();
functional::Array<int, functional::Shape::size()> dims;
@ -712,7 +723,8 @@ void Insert(Tensor out,
for(int index = 0; index < length; ++index) {
inShape.dims(index, dims);
dims[axisCPU] = (int)indices->data<IndexType>()[dims[axisCPU]];
int idxIndex = idxShape.bindex(dims); // broadcast index into indices tensor
dims[axisCPU] = (int)indices->data<IndexType>()[idxIndex];
int outIndex = outShape.index(dims);
out->data()[outIndex] += in->data()[index];
}
@ -879,7 +891,7 @@ void CrossEntropyPick(Tensor out, Tensor in, Tensor labelIndices) {
// Groundtruth label index
IndexType i = labelIndices->data<IndexType>()[j];
// This appears to be safe, i.e. i >= 0 && i < cols is guaranteed
out->data()[j] = std::log(sum) - sp[i] + max;
out->data()[j] = std::log(sum) - sp[i] + max; // -log(p_i) = -logsoftmax(x_i) = log(sum_j exp(x_j - max)) - (x_i - max)
}
}
@ -912,7 +924,8 @@ void CrossEntropyPickBackward(Tensor out,
// cross-entropy
for(int i = 0; i < cols; ++i) {
float sub = (float)(i == (int)labelIndices->data<IndexType>()[j]); // delta, true if label index and column index match
so[i] += adj->data()[j] * (std::exp(sp[i] - max) / sum - sub);
auto softmax = std::exp(sp[i] - max) / sum;
so[i] += adj->data()[j] * (softmax - sub);
}
}
}
@ -1037,7 +1050,7 @@ void LayerNormalization(Tensor out_,
sqSum += ex * ex;
}
float sigma = std::sqrt(eps + sqSum / cols);
float sigma = std::sqrt(sqSum / cols + eps);
#pragma omp simd
for(int i = 0; i < cols; ++i) {
@ -1099,7 +1112,7 @@ void LayerNormalizationGrad(Tensor gradX_,
sum_sqr += ex * ex;
}
float sigma = std::sqrt(eps + sum_sqr / cols);
float sigma = std::sqrt(sum_sqr / cols + eps);
#pragma omp simd
for(size_t i = 0; i < cols; ++i) {
float grad_x = 0.f;
@ -1141,7 +1154,7 @@ void LayerNormalizationGrad(Tensor gradX_,
sum_sqr += ex * ex;
}
float sigma = std::sqrt(eps + sum_sqr / cols);
float sigma = std::sqrt(sum_sqr / cols + eps);
#pragma omp simd
for(size_t i = 0; i < cols; ++i) {
float grad_x = 0.f;

View File

@ -1,4 +1,5 @@
#include "tensors/gpu/add.h"
#include "tensors/gpu/add_all.h"
#include "tensors/gpu/cuda_helpers.h"
@ -12,11 +13,13 @@ namespace marian {
namespace gpu {
template <size_t K, class Functor, class AggFunctor, typename T, typename AccType>
__global__ void gAggregateGeneric(Functor functor, AccType aggInit, AggFunctor aggFunctor,
const functional::Shape full,
functional::Tensor<T> out,
functional::Array<functional::Tensor<T>, K> ins,
AccType scale = 1.0) {
__global__ void gAggregateGeneric(Functor functor, // functor applied to single corresponding elements in tensors (via broadcasting),
AccType aggInit, // aggInit is starting value of accumulation (e.g. 0 for sum),
AggFunctor aggFunctor, // aggFunctor is used to accumulate values (e.g. sum),
const functional::Shape full, // maximal combined shape of all tensors via broadcasting
functional::Tensor<T> out, // output tensor
functional::Array<functional::Tensor<T>, K> ins, // input tensors
AccType scale = 1.0) { // scale accumulation result by scale. e.g. used for computing mean from sum over N elements with scale 1/N
int outLength = out.shape().elements();
bool same = outLength == full.elements();
for(int i = 0; i < K; ++i)
@ -32,10 +35,10 @@ __global__ void gAggregateGeneric(Functor functor, AccType aggInit, AggFunctor a
int index = bid + blockDim.x * blockIdx.x + threadIdx.x;
if(index < outLength) {
if(same) {
out[index] = aggFunctor(out[index], functional::apply(functor, ins, index) * (T)scale);
out[index] = (T)aggFunctor((AccType)out[index], functional::applyWithCast<AccType>(functor, ins, index) * scale); // apply functor with arguments cast to AccType
} else {
out.shape().dims(index, dims);
out[index] = aggFunctor(out[index], (T)(functional::loops(functor, aggInit, aggFunctor, ins, len, dims) * scale));
out[index] = (T)aggFunctor((AccType)out[index], functional::loops(functor, aggInit, aggFunctor, ins, len, dims) * scale); // apply functor with arguments cast to AccType
}
}
}
@ -62,7 +65,7 @@ __global__ void gAggregateEqual(Functor functor, AggFunctor aggFunctor,
indices[i] = ins[i].shape().bindex(dims);
}
out[index] = aggFunctor(out[index], functional::apply(functor, ins, indices) * (T)scale);
out[index] = (T)aggFunctor((AccType)out[index], functional::applyWithCast<AccType>(functor, ins, indices) * scale);
}
}
}
@ -76,7 +79,7 @@ __global__ void gAggregateReduce(Functor functor, AccType aggInit, AggFunctor ag
int rows = full.elements() / full.back();
int cols = full.back();
bool same = true;
bool same = true; // do all inputs have the same number of elements?
for(int i = 0; i < K; ++i)
same = same && ins[i].shape().elements() == full.elements();
@ -93,7 +96,7 @@ __global__ void gAggregateReduce(Functor functor, AccType aggInit, AggFunctor ag
for(int tid = 0; tid < cols; tid += blockDim.x) {
int id = tid + threadIdx.x;
if(id < cols)
_sum[threadIdx.x] = aggFunctor(_sum[threadIdx.x], (AccType)functional::apply(functor, ins, j * cols + id));
_sum[threadIdx.x] = aggFunctor(_sum[threadIdx.x], functional::applyWithCast<AccType>(functor, ins, j * cols + id)); // casts to AccType before applying functor which then performs operation in AccType
}
} else {
functional::Array<int, functional::Shape::size()> dims;
@ -106,7 +109,7 @@ __global__ void gAggregateReduce(Functor functor, AccType aggInit, AggFunctor ag
functional::Array<int, K> indices;
for(int i = 0; i < K; ++i)
indices[i] = ins[i].shape().bindex(dims);
_sum[threadIdx.x] = aggFunctor(_sum[threadIdx.x], (AccType)functional::apply(functor, ins, indices));
_sum[threadIdx.x] = aggFunctor(_sum[threadIdx.x], functional::applyWithCast<AccType>(functor, ins, indices));// casts to AccType before applying functor which then performs operation in AccType
}
}
}
@ -121,7 +124,8 @@ __global__ void gAggregateReduce(Functor functor, AccType aggInit, AggFunctor ag
len = (len + 1) >> 1;
}
__syncthreads();
out[j] = aggFunctor(out[j], (T)(_sum[0] * scale));
if(threadIdx.x == 0) // only thread 0 of each block writes the result
out[j] = aggFunctor(out[j], (T)(_sum[0] * scale));
}
__syncthreads();
}
@ -140,16 +144,16 @@ void AggregateTyped(Functor functor, AccType aggInit, AggFunctor aggFunctor, Acc
functional::Tensor<T> gOut = out;
functional::Array<functional::Tensor<T>, K> gIns = {tensors...};
if(full.back() != 1 && out->shape().back() == 1) {
size_t m = full.elements() / length;
size_t k = full.back();
if(out->shape().elements() == 1) { // reduce everything into a single element
AggregateAll<T, AccType>(nullptr, functor, aggInit, aggFunctor, scale, out, tensors...); // @TODO: pass allocator in here, currently uses cudaMalloc
} else if(full.back() != 1 && out->shape().back() == 1 && full.elements() / full.back() == length) { // the number of elements in out must match the number of non-reduced elements in the full shape
size_t m = full.elements() / full.back(); // how many rows are we iterating over?
size_t k = full.back(); // how many columns are being reduced to 1 in each row?
int blocks = std::min(MAX_BLOCKS, (int)m);
int blocks = std::min(MAX_BLOCKS, (int)m);
int threads = std::min(MAX_THREADS, (int)k);
int shared = sizeof(AccType) * threads;
int shared = sizeof(AccType) * threads;
gAggregateReduce<K, Functor, AggFunctor, T, AccType><<<blocks, threads, shared>>>(functor, aggInit, aggFunctor, full, gOut, gIns, scale);
} else if(out->shape() == full) {
int threads = std::min(MAX_THREADS, length);
int blocks = std::min(MAX_BLOCKS, length / threads + (length % threads != 0));

View File

@ -15,21 +15,21 @@ template void Add<BinaryFunctor<elem::Mult, Assignee<1>, BinaryFunctor<elem::Div
template void Add<BinaryFunctor<elem::Mult, Assignee<1>, BinaryFunctor<elem::sPReLUBack, Assignee<2>, Capture>>, marian::Tensor, marian::Tensor>(BinaryFunctor<elem::Mult, Assignee<1>, BinaryFunctor<elem::sPReLUBack, Assignee<2>, Capture>>, float, marian::Tensor, marian::Tensor, marian::Tensor);
template void Add<BinaryFunctor<elem::Mult, Assignee<1>, UnaryFunctor<elem::sReLUBack, Assignee<2>>>, marian::Tensor, marian::Tensor>(BinaryFunctor<elem::Mult, Assignee<1>, UnaryFunctor<elem::sReLUBack, Assignee<2>>>, float, marian::Tensor, marian::Tensor, marian::Tensor);
template void Add<BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Assignee<1>, Assignee<2>>, BinaryFunctor<elem::Minus, Capture, Assignee<2>>>, marian::Tensor, marian::Tensor>(BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Assignee<1>, Assignee<2>>, BinaryFunctor<elem::Minus, Capture, Assignee<2>>>, float, marian::Tensor, marian::Tensor, marian::Tensor);
template void Add<BinaryFunctor<elem::Mult, BinaryFunctor<elem::Bump, Assignee<1>, Capture>, Assignee<2>>, marian::Tensor, marian::Tensor >(BinaryFunctor<elem::Mult, BinaryFunctor<elem::Bump, Assignee<1>, Capture>, Assignee<2> >, float, marian::Tensor, marian::Tensor, marian::Tensor);
template void Add<BinaryFunctor<elem::Mult, BinaryFunctor<elem::Bump, Assignee<1>, Capture>, Assignee<2>>, marian::Tensor, marian::Tensor >(BinaryFunctor<elem::Mult, BinaryFunctor<elem::Bump, Assignee<1>, Capture>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor);
template void Add<BinaryFunctor<elem::Mult, BinaryFunctor<elem::Lt, Assignee<2>, Assignee<3>>, Assignee<1>>, marian::Tensor, marian::Tensor, marian::Tensor>(BinaryFunctor<elem::Mult, BinaryFunctor<elem::Lt, Assignee<2>, Assignee<3>>, Assignee<1>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
template void Add<BinaryFunctor<elem::Mult, BinaryFunctor<elem::Geq, Assignee<2>, Assignee<3>>, Assignee<1>>, marian::Tensor, marian::Tensor, marian::Tensor>(BinaryFunctor<elem::Mult, BinaryFunctor<elem::Geq, Assignee<2>, Assignee<3>>, Assignee<1>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
template void Add<BinaryFunctor<elem::Mult, Assignee<1>, UnaryFunctor<elem::Sigmoid, BinaryFunctor<elem::Minus, Assignee<3>, Assignee<2>>>>, marian::Tensor, marian::Tensor, marian::Tensor>(BinaryFunctor<elem::Mult, Assignee<1>, UnaryFunctor<elem::Sigmoid, BinaryFunctor<elem::Minus, Assignee<3>, Assignee<2>>>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
template void Add<BinaryFunctor<elem::Mult, BinaryFunctor<elem::Gt, Assignee<2>, Assignee<3>>, Assignee<1>>, marian::Tensor, marian::Tensor, marian::Tensor>(BinaryFunctor<elem::Mult, BinaryFunctor<elem::Gt, Assignee<2>, Assignee<3>>, Assignee<1>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
template void Add<BinaryFunctor<elem::Mult, BinaryFunctor<elem::Leq, Assignee<2>, Assignee<3>>, Assignee<1>>, marian::Tensor, marian::Tensor, marian::Tensor>(BinaryFunctor<elem::Mult, BinaryFunctor<elem::Leq, Assignee<2>, Assignee<3>>, Assignee<1>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
template void Add<BinaryFunctor<elem::Mult, Assignee<1>, UnaryFunctor<elem::Sigmoid, BinaryFunctor<elem::Minus, Assignee<2>, Assignee<3>>>>, marian::Tensor, marian::Tensor, marian::Tensor>(BinaryFunctor<elem::Mult, Assignee<1>, UnaryFunctor<elem::Sigmoid, BinaryFunctor<elem::Minus, Assignee<2>, Assignee<3>>>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
template void Add<BinaryFunctor<elem::Div, Assignee<1>, Assignee<2> >, marian::Tensor, marian::Tensor >(BinaryFunctor<elem::Div, Assignee<1>, Assignee<2> >, float, marian::Tensor, marian::Tensor, marian::Tensor);
template void marian::gpu::Add<marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::Assignee<1>, marian::functional::Assignee<1> >, marian::Tensor >(marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::Assignee<1>, marian::functional::Assignee<1> >, float, marian::Tensor, marian::Tensor);
template void marian::gpu::Aggregate<marian::functional::Assignee<1>, marian::functional::BinaryFunctor<marian::functional::elem::Min, marian::functional::Assignee<1>, marian::functional::Assignee<2> >, marian::Tensor >(marian::functional::Assignee<1>, float, marian::functional::BinaryFunctor<marian::functional::elem::Min, marian::functional::Assignee<1>, marian::functional::Assignee<2> >, float, marian::Tensor, marian::Tensor);
template void marian::gpu::Aggregate<marian::functional::Assignee<1>, marian::functional::BinaryFunctor<marian::functional::elem::Max, marian::functional::Assignee<1>, marian::functional::Assignee<2> >, marian::Tensor >(marian::functional::Assignee<1>, float, marian::functional::BinaryFunctor<marian::functional::elem::Max, marian::functional::Assignee<1>, marian::functional::Assignee<2> >, float, marian::Tensor, marian::Tensor);
template void marian::gpu::Aggregate<marian::functional::Assignee<1>, marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::Assignee<1>, marian::functional::Assignee<2> >, marian::Tensor >(marian::functional::Assignee<1>, float, marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::Assignee<1>, marian::functional::Assignee<2> >, float, marian::Tensor, marian::Tensor);
template void marian::gpu::Aggregate<marian::functional::Assignee<1>, marian::functional::BinaryFunctor<marian::functional::elem::LogAddExp, marian::functional::Assignee<1>, marian::functional::Assignee<2> >, marian::Tensor >(marian::functional::Assignee<1>, float, marian::functional::BinaryFunctor<marian::functional::elem::LogAddExp, marian::functional::Assignee<1>, marian::functional::Assignee<2> >, float, marian::Tensor, marian::Tensor);
template void marian::gpu::Add<marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::BinaryFunctor<marian::functional::elem::Eq, marian::functional::Assignee<1>, marian::functional::Assignee<2> >, marian::functional::Assignee<3> >, marian::Tensor, marian::Tensor, marian::Tensor >(marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::BinaryFunctor<marian::functional::elem::Eq, marian::functional::Assignee<1>, marian::functional::Assignee<2> >, marian::functional::Assignee<3> >, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
template void marian::gpu::Add<marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::Assignee<1>, marian::functional::UnaryFunctor<marian::functional::elem::Exp, marian::functional::BinaryFunctor<marian::functional::elem::Minus, marian::functional::Assignee<2>, marian::functional::Assignee<3> > > >, marian::Tensor, marian::Tensor, marian::Tensor >(marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::Assignee<1>, marian::functional::UnaryFunctor<marian::functional::elem::Exp, marian::functional::BinaryFunctor<marian::functional::elem::Minus, marian::functional::Assignee<2>, marian::functional::Assignee<3> > > >, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
template void marian::gpu::Add<marian::functional::BinaryFunctor<marian::functional::elem::Div, marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::Assignee<1>, marian::functional::Assignee<2> >, marian::functional::Assignee<3> >, marian::Tensor, marian::Tensor, marian::Tensor >(marian::functional::BinaryFunctor<marian::functional::elem::Div, marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::Assignee<1>, marian::functional::Assignee<2> >, marian::functional::Assignee<3> >, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
template void marian::gpu::Add<marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::Assignee<1>, marian::functional::Capture>, marian::functional::Assignee<2> >, marian::Tensor, marian::Tensor >(marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::Assignee<1>, marian::functional::Capture>, marian::functional::Assignee<2> >, float, marian::Tensor, marian::Tensor, marian::Tensor);
template void marian::gpu::Add<marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::Assignee<1>, marian::functional::BinaryFunctor<marian::functional::elem::Plus, marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::Capture, marian::functional::Assignee<3> >, marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::UnaryFunctor<marian::functional::elem::Sigmoid, marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::Capture, marian::functional::Assignee<2> > >, marian::functional::BinaryFunctor<marian::functional::elem::Minus, marian::functional::Capture, marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::Capture, marian::functional::Assignee<3> > > > > >, marian::Tensor, marian::Tensor, marian::Tensor >(marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::Assignee<1>, marian::functional::BinaryFunctor<marian::functional::elem::Plus, marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::Capture, marian::functional::Assignee<3> >, marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::UnaryFunctor<marian::functional::elem::Sigmoid, marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::Capture, marian::functional::Assignee<2> > >, marian::functional::BinaryFunctor<marian::functional::elem::Minus, marian::functional::Capture, marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::Capture, marian::functional::Assignee<3> > > > > >, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
template void Add<BinaryFunctor<elem::Div, Assignee<1>, Assignee<2>>, marian::Tensor, marian::Tensor >(BinaryFunctor<elem::Div, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor);
template void Add<BinaryFunctor<elem::Mult, Assignee<1>, Assignee<1>>, marian::Tensor >(BinaryFunctor<elem::Mult, Assignee<1>, Assignee<1>>, float, marian::Tensor, marian::Tensor);
template void Aggregate<Assignee<1>, BinaryFunctor<elem::Min, Assignee<1>, Assignee<2>>, marian::Tensor >(Assignee<1>, float, BinaryFunctor<elem::Min, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor);
template void Aggregate<Assignee<1>, BinaryFunctor<elem::Max, Assignee<1>, Assignee<2>>, marian::Tensor >(Assignee<1>, float, BinaryFunctor<elem::Max, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor);
template void Aggregate<Assignee<1>, BinaryFunctor<elem::Mult, Assignee<1>, Assignee<2>>, marian::Tensor >(Assignee<1>, float, BinaryFunctor<elem::Mult, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor);
template void Aggregate<Assignee<1>, BinaryFunctor<elem::LogAddExp, Assignee<1>, Assignee<2>>, marian::Tensor >(Assignee<1>, float, BinaryFunctor<elem::LogAddExp, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor);
template void Add<BinaryFunctor<elem::Mult, BinaryFunctor<elem::Eq, Assignee<1>, Assignee<2>>, Assignee<3>>, marian::Tensor, marian::Tensor, marian::Tensor >(BinaryFunctor<elem::Mult, BinaryFunctor<elem::Eq, Assignee<1>, Assignee<2>>, Assignee<3>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
template void Add<BinaryFunctor<elem::Mult, Assignee<1>, UnaryFunctor<elem::Exp, BinaryFunctor<elem::Minus, Assignee<2>, Assignee<3>>>>, marian::Tensor, marian::Tensor, marian::Tensor >(BinaryFunctor<elem::Mult, Assignee<1>, UnaryFunctor<elem::Exp, BinaryFunctor<elem::Minus, Assignee<2>, Assignee<3>>>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
template void Add<BinaryFunctor<elem::Div, BinaryFunctor<elem::Mult, Assignee<1>, Assignee<2>>, Assignee<3>>, marian::Tensor, marian::Tensor, marian::Tensor >(BinaryFunctor<elem::Div, BinaryFunctor<elem::Mult, Assignee<1>, Assignee<2>>, Assignee<3>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
template void Add<BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Assignee<1>, Capture>, Assignee<2>>, marian::Tensor, marian::Tensor >(BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Assignee<1>, Capture>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor);
template void Add<BinaryFunctor<elem::Mult, Assignee<1>, BinaryFunctor<elem::Plus, BinaryFunctor<elem::Mult, Capture, Assignee<3>>, BinaryFunctor<elem::Mult, UnaryFunctor<elem::Sigmoid, BinaryFunctor<elem::Mult, Capture, Assignee<2>>>, BinaryFunctor<elem::Minus, Capture, BinaryFunctor<elem::Mult, Capture, Assignee<3>>>>>>, marian::Tensor, marian::Tensor, marian::Tensor >(BinaryFunctor<elem::Mult, Assignee<1>, BinaryFunctor<elem::Plus, BinaryFunctor<elem::Mult, Capture, Assignee<3>>, BinaryFunctor<elem::Mult, UnaryFunctor<elem::Sigmoid, BinaryFunctor<elem::Mult, Capture, Assignee<2>>>, BinaryFunctor<elem::Minus, Capture, BinaryFunctor<elem::Mult, Capture, Assignee<3>>>>>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);

116
src/tensors/gpu/add_all.cu Normal file
View File

@ -0,0 +1,116 @@
#include "tensors/gpu/add_all.h"
#include "tensors/gpu/cuda_helpers.h"
#include "functional/functional.h"
#include "tensors/tensor_operators.h"
#include "3rd_party/reduce_all.h" // only works with CUDA >9.0, we are dropping CUDA 8.0 support, also changed in CMakeLists.txt
namespace marian {
#if COMPILE_FP16
// local overload to determine tensor type
template <> inline Type typeId<half>() { return Type::float16; }
#endif
// Version with variadic template arguments, called by version with explicit arguments below
template <typename T, typename AccType, class Functor, class AggFunctor, class... Tensors>
void AggregateAllVar(Ptr<Allocator> allocator,
Functor functor,
AccType aggInit,
AggFunctor aggFunctor,
AccType scale,
Tensor out,
const Tensors... tensors) {
cudaSetDevice(out->getDeviceId().no);
static_assert(CUDA_VERSION >= 9000, "Marian requires CUDA_VERSION >= 9000 (9.0)");
constexpr size_t K = sizeof...(Tensors); // obtain arity K of tensors...
functional::Array<functional::Tensor<T>, K> gIns = {tensors...}; // convert to array of K objects of type functional::Tensor<T>
functional::Shape full = marian::Shape::broadcast({tensors...}); // compute maximal broadcasted shape
int size = full.elements();
int threads = (size < MAX_THREADS * 2) ? nextPow2((size + 1) / 2) : MAX_THREADS; // suggested in NVidia example for the all_reduce kernel
int blocks = std::min(MAX_BLOCKS, (size + (threads * 2 - 1)) / (threads * 2)); // suggested in NVidia example for the all_reduce kernel
// The all_reduce kernel by NVidia needs to perform multiple passes if the number of blocks needed to perform the reduction is larger than 1.
// Here we allocate the memory for the intermediate reductions for each block.
Tensor blockMem;
if(blocks > 1 || out->type() != typeId<AccType>()) { // if the out tensor does not have elementType AccType we need to allocate and convert later
MemoryPiece::PtrType temporaryMemory;
if(allocator) {
temporaryMemory = allocator->alloc<AccType>(blocks);
} else { // @TODO: get rid of this branch
uint8_t* temporaryMemoryPtr = 0;
CUDA_CHECK(cudaMalloc(&temporaryMemoryPtr, sizeof(AccType) * blocks));
temporaryMemory = MemoryPiece::New(temporaryMemoryPtr, sizeof(AccType) * blocks); // @TODO: consider implementing MemoryPiece::cudaMalloc<T>(size) for managed memory
}
blockMem = TensorBase::New(temporaryMemory,
Shape({blocks}),
typeId<AccType>(),
out->getBackend());
blockMem->set(aggInit); // set temporary memory to aggInit
}
else { // we are reducing into a single element now and the type matches, just use out as memory
blockMem = out; // do not set final output memory as we might be summing gradients... needs to be handled outside this function
}
functional::Tensor<AccType> gBlockMem = blockMem;
reduceSinglePass<T, AccType>(functor, aggInit, aggFunctor, scale, full, /*out=*/gBlockMem, /*in=*/gIns, threads, blocks); // First pass reduction into intermediate memory
// If we actually needed more than one block to perform the first pass reduction, recursively run a second pass reduction over block memory until block memory has size 1.
if(blocks > 1) {
using namespace functional;
auto identity = _1; // transformation was done in first pass, hence only identity
AggregateAll<AccType, AccType>(allocator, identity, aggInit, aggFunctor, scale, out, /*in=*/blockMem); // Reducing AccType in AccType now (meta-reduction)
} else if(out->type() != typeId<AccType>()) { // it's only a single block, but we need to convert to different type, as mentioned above
CopyCast(out, blockMem);
}
if(blockMem != out) {
// Free temporary memory whether allocated in allocator or via cudaMalloc
if(allocator)
allocator->free(blockMem->memory());
else if(blockMem->memory()->data())
CUDA_CHECK(cudaFree(blockMem->memory()->data()));
}
}
template <typename T, typename AccType, class Functor, class AggFunctor>
void AggregateAll(Ptr<Allocator> allocator,
Functor functor,
AccType aggInit,
AggFunctor aggFunctor,
AccType scale,
Tensor out,
const Tensor in1) {
AggregateAllVar<T, AccType>(allocator, functor, aggInit, aggFunctor, scale, out, in1);
}
template <typename T, typename AccType, class Functor, class AggFunctor>
void AggregateAll(Ptr<Allocator> allocator,
Functor functor,
AccType aggInit,
AggFunctor aggFunctor,
AccType scale,
Tensor out,
const Tensor in1,
const Tensor in2) {
AggregateAllVar<T, AccType>(allocator, functor, aggInit, aggFunctor, scale, out, in1, in2);
}
template <typename T, typename AccType, class Functor, class AggFunctor>
void AggregateAll(Ptr<Allocator> allocator,
Functor functor,
AccType aggInit,
AggFunctor aggFunctor,
AccType scale,
Tensor out,
const Tensor in1,
const Tensor in2,
const Tensor in3) {
AggregateAllVar<T, AccType>(allocator, functor, aggInit, aggFunctor, scale, out, in1, in2, in3);
}
#include "tensors/gpu/add_all.inc"
}
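
The block-then-recurse scheme above can be illustrated with a tiny host-side analogue; a sketch under the assumption of summation as the aggregator (CPU-only, names hypothetical):

#include <vector>

// Pass 1 reduces each block of the input into one partial result (blockMem);
// subsequent passes reduce the partial results until a single value remains,
// mirroring the recursive AggregateAll call above.
float reduceTwoPass(std::vector<float> v, size_t blockSize) {
  while(v.size() > 1) {
    std::vector<float> blockMem((v.size() + blockSize - 1) / blockSize, 0.f);
    for(size_t i = 0; i < v.size(); ++i)
      blockMem[i / blockSize] += v[i];  // per-block partial sums
    v.swap(blockMem);                   // recurse on the partial results
  }
  return v.empty() ? 0.f : v[0];
}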

87
src/tensors/gpu/add_all.h Normal file
View File

@ -0,0 +1,87 @@
#pragma once
// This header file provides wrappers around NVidia's reduce_all kernel with our custom aggregation functionality
// This kernel reduces a tensor into a single value. We have modified it to allow for different types of aggregations
// like summing or max etc.
#include "tensors/gpu/cuda_helpers.h"
#include "tensors/tensor.h"
#include "tensors/allocator.h"
#include "functional/tensor.h"
#include "tensors/tensor_operators.h"
namespace marian {
// These function declarations are repeated because template specialization with variadic template arguments does not seem to work.
// Here we just create versions for 1, 2, and 3 arguments, to be extended if required.
template <typename T, typename AccType, class Functor, class AggFunctor>
void AggregateAll(Ptr<Allocator> allocator,
Functor functor,
AccType aggInit,
AggFunctor aggFunctor,
AccType scale,
Tensor out,
const Tensor in1);
template <typename T, typename AccType, class Functor, class AggFunctor>
void AggregateAll(Ptr<Allocator> allocator,
Functor functor,
AccType aggInit,
AggFunctor aggFunctor,
AccType scale,
Tensor out,
const Tensor in1,
const Tensor in2);
template <typename T, typename AccType, class Functor, class AggFunctor>
void AggregateAll(Ptr<Allocator> allocator,
Functor functor,
AccType aggInit,
AggFunctor aggFunctor,
AccType scale,
Tensor out,
const Tensor in1,
const Tensor in2,
const Tensor in3);
// Aggregates all values into a single tensor and returns the value of that tensor as a float
// This does a GPU to CPU memory copy via TensorBase::scalar().
// Used currently only for L2Norm computation
template <typename T, typename AccType, class Functor, class AggFunctor, class... Tensors>
AccType AggregateAllAndReturn(Ptr<Allocator> allocator,
Functor functor,
AccType aggInit,
AggFunctor aggFunctor,
AccType scale,
const Tensors... tensors) {
MemoryPiece::PtrType temporaryMemory;
if(allocator) {
temporaryMemory = allocator->alloc<AccType>(1);
} else { // @TODO: get rid of this branch
uint8_t* temporaryMemoryPtr = 0;
CUDA_CHECK(cudaMalloc(&temporaryMemoryPtr, sizeof(AccType)));
temporaryMemory = MemoryPiece::New(temporaryMemoryPtr, sizeof(AccType));
}
std::tuple<Tensors...> in(tensors...);
// Create a temporary tensor of size 1 to reduce into
auto out = TensorBase::New(temporaryMemory,
Shape({1}),
typeId<AccType>(),
std::get<0>(in)->getBackend());
out->set(aggInit); // init to aggInit
AggregateAll<T, AccType>(allocator, functor, aggInit, aggFunctor, scale, out, tensors...);
AccType outScalar = out->template scalar<AccType>(); // convert to float also if other underlying type
if(allocator)
allocator->free(out->memory());
else if(out->memory()->data()) // @TODO: get rid of this branch
CUDA_CHECK(cudaFree(out->memory()->data()));
return outScalar;
}
}
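
As a plain-CPU analogue of the L2Norm use mentioned above (functor = x*x, aggInit = 0, aggFunctor = +, followed by a square root on the returned scalar), a minimal sketch:

#include <cmath>
#include <numeric>
#include <vector>

// CPU analogue of AggregateAllAndReturn for the L2 norm: reduce the squares
// with +, starting from 0, then take the square root of the scalar result.
float l2Norm(const std::vector<float>& x) {
  float sumSq = std::accumulate(x.begin(), x.end(), 0.f,
                                [](float acc, float v) { return acc + v * v; });
  return std::sqrt(sumSq);
}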

View File

@ -0,0 +1,71 @@
// see element.inc for instructions on how to maintain this
using namespace functional;
template void AggregateAll<float, float, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Capture, Assignee<1>>, Assignee<2>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Capture, Assignee<1>>, Assignee<2>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<float, float, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Capture, BinaryFunctor<elem::Div, Capture, Assignee<1>>>, Assignee<2>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Capture, BinaryFunctor<elem::Div, Capture, Assignee<1>>>, Assignee<2>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<float, float, BinaryFunctor<elem::Mult, Assignee<1>, BinaryFunctor<elem::Minus, Capture, BinaryFunctor<elem::Mult, Assignee<2>, Assignee<2>>>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, Assignee<1>, BinaryFunctor<elem::Minus, Capture, BinaryFunctor<elem::Mult, Assignee<2>, Assignee<2>>>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<float, float, BinaryFunctor<elem::Mult, Assignee<1>, Assignee<2>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, Assignee<1>, Assignee<2>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<float, float, BinaryFunctor<elem::Mult, Capture, Assignee<1>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, Capture, Assignee<1>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor);
template void AggregateAll<float, float, BinaryFunctor<elem::Div, BinaryFunctor<elem::Mult, Assignee<1>, Capture>, Assignee<2>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Div, BinaryFunctor<elem::Mult, Assignee<1>, Capture>, Assignee<2>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<float, float, BinaryFunctor<elem::Div, BinaryFunctor<elem::Mult, UnaryFunctor<elem::Neg, Assignee<1>>, Assignee<2>>, BinaryFunctor<elem::Mult, Assignee<3>, Assignee<3>>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Div, BinaryFunctor<elem::Mult, UnaryFunctor<elem::Neg, Assignee<1>>, Assignee<2>>, BinaryFunctor<elem::Mult, Assignee<3>, Assignee<3>>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<float, float, UnaryFunctor<elem::Neg, Assignee<1>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, UnaryFunctor<elem::Neg, Assignee<1>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor);
template void AggregateAll<float, float, BinaryFunctor<elem::Mult, Assignee<1>, BinaryFunctor<elem::Plus, Assignee<3>, BinaryFunctor<elem::Mult, UnaryFunctor<elem::Sigmoid, Assignee<2>>, BinaryFunctor<elem::Minus, Capture, Assignee<3>>>>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, Assignee<1>, BinaryFunctor<elem::Plus, Assignee<3>, BinaryFunctor<elem::Mult, UnaryFunctor<elem::Sigmoid, Assignee<2>>, BinaryFunctor<elem::Minus, Capture, Assignee<3>>>>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<float, float, BinaryFunctor<elem::Mult, Assignee<1>, UnaryFunctor<elem::Exp, Assignee<2>>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, Assignee<1>, UnaryFunctor<elem::Exp, Assignee<2>>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<float, float, BinaryFunctor<elem::Mult, Assignee<1>, BinaryFunctor<elem::Div, Capture, Assignee<2>>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, Assignee<1>, BinaryFunctor<elem::Div, Capture, Assignee<2>>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<float, float, BinaryFunctor<elem::Mult, Assignee<1>, BinaryFunctor<elem::sPReLUBack, Assignee<2>, Capture>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, Assignee<1>, BinaryFunctor<elem::sPReLUBack, Assignee<2>, Capture>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<float, float, BinaryFunctor<elem::Mult, Assignee<1>, UnaryFunctor<elem::sReLUBack, Assignee<2>>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, Assignee<1>, UnaryFunctor<elem::sReLUBack, Assignee<2>>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<float, float, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Assignee<1>, Assignee<2>>, BinaryFunctor<elem::Minus, Capture, Assignee<2>>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Assignee<1>, Assignee<2>>, BinaryFunctor<elem::Minus, Capture, Assignee<2>>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<float, float, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Bump, Assignee<1>, Capture>, Assignee<2>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Bump, Assignee<1>, Capture>, Assignee<2>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<float, float, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Lt, Assignee<2>, Assignee<3>>, Assignee<1>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Lt, Assignee<2>, Assignee<3>>, Assignee<1>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<float, float, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Geq, Assignee<2>, Assignee<3>>, Assignee<1>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Geq, Assignee<2>, Assignee<3>>, Assignee<1>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<float, float, BinaryFunctor<elem::Mult, Assignee<1>, UnaryFunctor<elem::Sigmoid, BinaryFunctor<elem::Minus, Assignee<3>, Assignee<2>>>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, Assignee<1>, UnaryFunctor<elem::Sigmoid, BinaryFunctor<elem::Minus, Assignee<3>, Assignee<2>>>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<float, float, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Gt, Assignee<2>, Assignee<3>>, Assignee<1>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Gt, Assignee<2>, Assignee<3>>, Assignee<1>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<float, float, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Leq, Assignee<2>, Assignee<3>>, Assignee<1>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Leq, Assignee<2>, Assignee<3>>, Assignee<1>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<float, float, BinaryFunctor<elem::Mult, Assignee<1>, UnaryFunctor<elem::Sigmoid, BinaryFunctor<elem::Minus, Assignee<2>, Assignee<3>>>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, Assignee<1>, UnaryFunctor<elem::Sigmoid, BinaryFunctor<elem::Minus, Assignee<2>, Assignee<3>>>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<float, float, BinaryFunctor<elem::Div, Assignee<1>, Assignee<2>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Div, Assignee<1>, Assignee<2>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<float, float, Assignee<1>, BinaryFunctor<elem::Min, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, Assignee<1>, float, BinaryFunctor<elem::Min, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor);
template void AggregateAll<float, float, Assignee<1>, BinaryFunctor<elem::Max, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, Assignee<1>, float, BinaryFunctor<elem::Max, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor);
template void AggregateAll<float, float, Assignee<1>, BinaryFunctor<elem::Mult, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, Assignee<1>, float, BinaryFunctor<elem::Mult, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor);
template void AggregateAll<float, float, Assignee<1>, BinaryFunctor<elem::LogAddExp, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, Assignee<1>, float, BinaryFunctor<elem::LogAddExp, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor);
template void AggregateAll<float, float, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Eq, Assignee<1>, Assignee<2>>, Assignee<3>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Eq, Assignee<1>, Assignee<2>>, Assignee<3>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<float, float, BinaryFunctor<elem::Mult, Assignee<1>, UnaryFunctor<elem::Exp, BinaryFunctor<elem::Minus, Assignee<2>, Assignee<3>>>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, Assignee<1>, UnaryFunctor<elem::Exp, BinaryFunctor<elem::Minus, Assignee<2>, Assignee<3>>>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<float, float, BinaryFunctor<elem::Div, BinaryFunctor<elem::Mult, Assignee<1>, Assignee<2>>, Assignee<3>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Div, BinaryFunctor<elem::Mult, Assignee<1>, Assignee<2>>, Assignee<3>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<float, float, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Assignee<1>, Capture>, Assignee<2>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Assignee<1>, Capture>, Assignee<2>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<float, float, BinaryFunctor<elem::Mult, Assignee<1>, BinaryFunctor<elem::Plus, BinaryFunctor<elem::Mult, Capture, Assignee<3>>, BinaryFunctor<elem::Mult, UnaryFunctor<elem::Sigmoid, BinaryFunctor<elem::Mult, Capture, Assignee<2>>>, BinaryFunctor<elem::Minus, Capture, BinaryFunctor<elem::Mult, Capture, Assignee<3>>>>>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, Assignee<1>, BinaryFunctor<elem::Plus, BinaryFunctor<elem::Mult, Capture, Assignee<3>>, BinaryFunctor<elem::Mult, UnaryFunctor<elem::Sigmoid, BinaryFunctor<elem::Mult, Capture, Assignee<2>>>, BinaryFunctor<elem::Minus, Capture, BinaryFunctor<elem::Mult, Capture, Assignee<3>>>>>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<float, float, BinaryFunctor<elem::Mult, Assignee<1>, Assignee<1>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, Assignee<1>, Assignee<1>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor);
#if COMPILE_FP16
template void AggregateAll<__half, float, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Capture, Assignee<1>>, Assignee<2>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Capture, Assignee<1>>, Assignee<2>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<__half, float, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Capture, BinaryFunctor<elem::Div, Capture, Assignee<1>>>, Assignee<2>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Capture, BinaryFunctor<elem::Div, Capture, Assignee<1>>>, Assignee<2>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<__half, float, BinaryFunctor<elem::Mult, Assignee<1>, BinaryFunctor<elem::Minus, Capture, BinaryFunctor<elem::Mult, Assignee<2>, Assignee<2>>>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, Assignee<1>, BinaryFunctor<elem::Minus, Capture, BinaryFunctor<elem::Mult, Assignee<2>, Assignee<2>>>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<__half, float, BinaryFunctor<elem::Mult, Assignee<1>, Assignee<2>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, Assignee<1>, Assignee<2>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<__half, float, BinaryFunctor<elem::Mult, Capture, Assignee<1>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, Capture, Assignee<1>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor);
template void AggregateAll<__half, float, BinaryFunctor<elem::Div, BinaryFunctor<elem::Mult, Assignee<1>, Capture>, Assignee<2>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Div, BinaryFunctor<elem::Mult, Assignee<1>, Capture>, Assignee<2>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<__half, float, BinaryFunctor<elem::Div, BinaryFunctor<elem::Mult, UnaryFunctor<elem::Neg, Assignee<1>>, Assignee<2>>, BinaryFunctor<elem::Mult, Assignee<3>, Assignee<3>>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Div, BinaryFunctor<elem::Mult, UnaryFunctor<elem::Neg, Assignee<1>>, Assignee<2>>, BinaryFunctor<elem::Mult, Assignee<3>, Assignee<3>>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<__half, float, UnaryFunctor<elem::Neg, Assignee<1>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, UnaryFunctor<elem::Neg, Assignee<1>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor);
template void AggregateAll<__half, float, BinaryFunctor<elem::Mult, Assignee<1>, BinaryFunctor<elem::Plus, Assignee<3>, BinaryFunctor<elem::Mult, UnaryFunctor<elem::Sigmoid, Assignee<2>>, BinaryFunctor<elem::Minus, Capture, Assignee<3>>>>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, Assignee<1>, BinaryFunctor<elem::Plus, Assignee<3>, BinaryFunctor<elem::Mult, UnaryFunctor<elem::Sigmoid, Assignee<2>>, BinaryFunctor<elem::Minus, Capture, Assignee<3>>>>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<__half, float, BinaryFunctor<elem::Mult, Assignee<1>, UnaryFunctor<elem::Exp, Assignee<2>>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, Assignee<1>, UnaryFunctor<elem::Exp, Assignee<2>>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<__half, float, BinaryFunctor<elem::Mult, Assignee<1>, BinaryFunctor<elem::Div, Capture, Assignee<2>>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, Assignee<1>, BinaryFunctor<elem::Div, Capture, Assignee<2>>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<__half, float, BinaryFunctor<elem::Mult, Assignee<1>, BinaryFunctor<elem::sPReLUBack, Assignee<2>, Capture>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, Assignee<1>, BinaryFunctor<elem::sPReLUBack, Assignee<2>, Capture>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<__half, float, BinaryFunctor<elem::Mult, Assignee<1>, UnaryFunctor<elem::sReLUBack, Assignee<2>>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, Assignee<1>, UnaryFunctor<elem::sReLUBack, Assignee<2>>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<__half, float, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Assignee<1>, Assignee<2>>, BinaryFunctor<elem::Minus, Capture, Assignee<2>>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Assignee<1>, Assignee<2>>, BinaryFunctor<elem::Minus, Capture, Assignee<2>>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<__half, float, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Bump, Assignee<1>, Capture>, Assignee<2>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Bump, Assignee<1>, Capture>, Assignee<2>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<__half, float, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Lt, Assignee<2>, Assignee<3>>, Assignee<1>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Lt, Assignee<2>, Assignee<3>>, Assignee<1>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<__half, float, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Geq, Assignee<2>, Assignee<3>>, Assignee<1>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Geq, Assignee<2>, Assignee<3>>, Assignee<1>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<__half, float, BinaryFunctor<elem::Mult, Assignee<1>, UnaryFunctor<elem::Sigmoid, BinaryFunctor<elem::Minus, Assignee<3>, Assignee<2>>>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, Assignee<1>, UnaryFunctor<elem::Sigmoid, BinaryFunctor<elem::Minus, Assignee<3>, Assignee<2>>>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<__half, float, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Gt, Assignee<2>, Assignee<3>>, Assignee<1>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Gt, Assignee<2>, Assignee<3>>, Assignee<1>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<__half, float, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Leq, Assignee<2>, Assignee<3>>, Assignee<1>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Leq, Assignee<2>, Assignee<3>>, Assignee<1>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<__half, float, BinaryFunctor<elem::Mult, Assignee<1>, UnaryFunctor<elem::Sigmoid, BinaryFunctor<elem::Minus, Assignee<2>, Assignee<3>>>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, Assignee<1>, UnaryFunctor<elem::Sigmoid, BinaryFunctor<elem::Minus, Assignee<2>, Assignee<3>>>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<__half, float, BinaryFunctor<elem::Div, Assignee<1>, Assignee<2>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Div, Assignee<1>, Assignee<2>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<__half, float, Assignee<1>, BinaryFunctor<elem::Min, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, Assignee<1>, float, BinaryFunctor<elem::Min, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor);
template void AggregateAll<__half, float, Assignee<1>, BinaryFunctor<elem::Max, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, Assignee<1>, float, BinaryFunctor<elem::Max, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor);
template void AggregateAll<__half, float, Assignee<1>, BinaryFunctor<elem::Mult, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, Assignee<1>, float, BinaryFunctor<elem::Mult, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor);
template void AggregateAll<__half, float, Assignee<1>, BinaryFunctor<elem::LogAddExp, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, Assignee<1>, float, BinaryFunctor<elem::LogAddExp, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor);
template void AggregateAll<__half, float, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Eq, Assignee<1>, Assignee<2>>, Assignee<3>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Eq, Assignee<1>, Assignee<2>>, Assignee<3>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<__half, float, BinaryFunctor<elem::Mult, Assignee<1>, UnaryFunctor<elem::Exp, BinaryFunctor<elem::Minus, Assignee<2>, Assignee<3>>>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, Assignee<1>, UnaryFunctor<elem::Exp, BinaryFunctor<elem::Minus, Assignee<2>, Assignee<3>>>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<__half, float, BinaryFunctor<elem::Div, BinaryFunctor<elem::Mult, Assignee<1>, Assignee<2>>, Assignee<3>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Div, BinaryFunctor<elem::Mult, Assignee<1>, Assignee<2>>, Assignee<3>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<__half, float, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Assignee<1>, Capture>, Assignee<2>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Assignee<1>, Capture>, Assignee<2>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<__half, float, BinaryFunctor<elem::Mult, Assignee<1>, BinaryFunctor<elem::Plus, BinaryFunctor<elem::Mult, Capture, Assignee<3>>, BinaryFunctor<elem::Mult, UnaryFunctor<elem::Sigmoid, BinaryFunctor<elem::Mult, Capture, Assignee<2>>>, BinaryFunctor<elem::Minus, Capture, BinaryFunctor<elem::Mult, Capture, Assignee<3>>>>>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, Assignee<1>, BinaryFunctor<elem::Plus, BinaryFunctor<elem::Mult, Capture, Assignee<3>>, BinaryFunctor<elem::Mult, UnaryFunctor<elem::Sigmoid, BinaryFunctor<elem::Mult, Capture, Assignee<2>>>, BinaryFunctor<elem::Minus, Capture, BinaryFunctor<elem::Mult, Capture, Assignee<3>>>>>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<__half, float, Assignee<1>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, Assignee<1>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor);
template void AggregateAll<__half, float, BinaryFunctor<elem::Mult, Assignee<1>, Assignee<1>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, Assignee<1>, Assignee<1>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor);
#endif
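Note: the explicit instantiations above pre-compile AggregateAll for every functor tree the element-wise framework emits, e.g. BinaryFunctor<elem::Mult, Assignee<1>, Assignee<2>> for _1 * _2 aggregated with _1 + _2. A minimal stand-alone sketch of the pattern; the types below are simplified stand-ins, not the marian definitions:

#include <cstdio>
#include <vector>

template <int N> struct Assignee {        // stand-in for functional::Assignee (_1, _2, ...)
  template <class... Args>
  float operator()(Args... args) const {
    const float a[] = {static_cast<float>(args)...};
    return a[N - 1];                      // select the N-th argument
  }
};

template <class Op, class L, class R> struct BinaryFunctor {  // stand-in functor tree node
  L l; R r;
  template <class... Args>
  float operator()(Args... args) const { return Op::apply(l(args...), r(args...)); }
};

struct Mult { static float apply(float x, float y) { return x * y; } };
struct Plus { static float apply(float x, float y) { return x + y; } };

// AggregateAll analogue: apply 'functor' element-wise, fold with 'aggFunctor'.
template <class F, class A>
float aggregateAll(F functor, float aggInit, A aggFunctor,
                   const std::vector<float>& in1, const std::vector<float>& in2) {
  float acc = aggInit;
  for (size_t i = 0; i < in1.size(); ++i)
    acc = aggFunctor(acc, functor(in1[i], in2[i]));
  return acc;
}

int main() {
  BinaryFunctor<Mult, Assignee<1>, Assignee<2>> mul; // _1 * _2
  BinaryFunctor<Plus, Assignee<1>, Assignee<2>> add; // _1 + _2, the aggregation functor
  std::vector<float> a = {1, 2, 3}, b = {4, 5, 6};
  printf("dot(a, b) = %g\n", aggregateAll(mul, 0.f, add, a, b)); // 32
  return 0;
}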

168
src/tensors/gpu/tensor_operators.cu Normal file → Executable file
View File

@ -1,5 +1,3 @@
//#include <thrust/transform_reduce.h>
#include "common/types.h"
#include "tensors/tensor_operators.h"
@ -9,7 +7,7 @@
#include "tensors/gpu/backend.h"
#include "tensors/gpu/cuda_helpers.h"
#include "3rd_party/reduce_all.h"
#include "tensors/gpu/add_all.h"
namespace marian {
@ -588,6 +586,8 @@ __global__ void gSoftmax(T* out,
// determine max (used below to improve numeric stability)
T* _max = _share;
// @TODO: what's going on here with fp16?
_max[threadIdx.x] = -CUDA_FLT_MAX; // mask
// find max over column indices that have the same relative column index (=threadIdx.x) across all blocks of columns
for(int tid = 0; tid < cols; tid += blockDim.x) {
@ -980,7 +980,7 @@ __global__ void gPasteRows(T* out,
const IndexType* targetRowIdx,
size_t rows) {
for(int bid = 0; bid < rows; bid += gridDim.x) {
int j = bid + blockIdx.x;
int j = bid + blockIdx.x; // index into 'indices' vector
if(j < rows) {
size_t dstId = targetRowIdx[j];
size_t srcId = j;
@ -988,11 +988,15 @@ __global__ void gPasteRows(T* out,
T* rowOut = out + dstId * cols;
const T* rowIn = in + srcId * cols;
// aggregate the entire row
for(int tid = 0; tid < cols; tid += blockDim.x) {
int i = tid + threadIdx.x;
int i = tid + threadIdx.x; // column index --@TODO: column index should be called 'j'
if(i < cols) {
// @TODO: Do we need to get rid of this atomic add? It seems slow for fp16
atomics::atomicAdd(rowOut + i, rowIn[i]);
// Note: atomicAdd() not needed if number of blocks is 1. Avoid it because it is slow for fp16.
if (gridDim.x == 1)
rowOut[i] += rowIn[i];
else
atomics::atomicAdd(rowOut + i, rowIn[i]);
}
}
}
@ -1011,7 +1015,15 @@ void PasteRows(Tensor out,
size_t rowsToCopy = indices->size();
int threads = std::min(MAX_THREADS, (int)cols);
#if 1 // @TODO: make this configurable with a 'deterministic' flag
// If we only use one block, then each core operates on a different column,
// hence the summation becomes deterministic.
// However, we then only use e.g. 512 cores out of possibly 3000+, so this will be
// about 6x slower in this example.
int blocks = 1;
#else
int blocks = std::min(MAX_BLOCKS, (int)rowsToCopy);
#endif
if(out->type() == Type::float32) {
gPasteRows<<<blocks, threads>>>(
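Note: the blocks = 1 branch above trades speed for determinism because floating-point addition is not associative; with many blocks racing through atomicAdd, the accumulation order, and hence the rounded result, can vary between runs. A minimal CPU illustration of the underlying effect:

#include <cstdio>
#include <vector>

int main() {
  std::vector<float> v(10000, 0.1f);
  v.push_back(1e8f);
  float frontToBack = 0.f, backToFront = 0.f;
  for (size_t i = 0; i < v.size(); ++i) frontToBack += v[i];
  for (size_t i = v.size(); i-- > 0; ) backToFront += v[i];
  // The large value absorbs the small ones when added first; with many CUDA
  // blocks racing through atomicAdd, the order (and result) varies per run.
  printf("%.1f vs %.1f\n", frontToBack, backToFront);
  return 0;
}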
@ -1132,7 +1144,8 @@ __global__ void gSelect(T* out,
const T* in,
const functional::Shape inShape,
int axis,
IndexType* d_indices) {
const IndexType* d_indices,
const functional::Shape idxShape) {
int length = outShape.elements();
functional::Array<int, functional::Shape::size()> dims;
@ -1140,7 +1153,8 @@ __global__ void gSelect(T* out,
int index = bid + blockDim.x * blockIdx.x + threadIdx.x;
if(index < length) {
outShape.dims(index, dims);
dims[axis] = d_indices[dims[axis]];
int idxIndex = idxShape.bindex(dims); // broadcast index into indices tensor
dims[axis] = (int)d_indices[idxIndex];
int inIndex = inShape.index(dims);
out[index] = in[inIndex];
}
@ -1153,7 +1167,8 @@ __global__ void gInsert(T* out,
const T* in,
const functional::Shape inShape,
int axis,
IndexType* d_indices) {
const IndexType* d_indices,
const functional::Shape idxShape) {
int length = inShape.elements();
functional::Array<int, functional::Shape::size()> dims;
@ -1161,7 +1176,8 @@ __global__ void gInsert(T* out,
int index = bid + blockDim.x * blockIdx.x + threadIdx.x;
if(index < length) {
inShape.dims(index, dims);
dims[axis] = d_indices[dims[axis]];
int idxIndex = idxShape.bindex(dims); // broadcast index into indices tensor
dims[axis] = (int)d_indices[idxIndex];
int outIndex = outShape.index(dims);
out[outIndex] += in[index]; // this is probably wrong, atomicAdd?
}
@ -1189,7 +1205,8 @@ void Select(Tensor out,
in->data<float>(),
in->shape(),
axisGPU,
indices->data<IndexType>());
indices->data<IndexType>(),
indices->shape());
#if COMPILE_FP16
} else if (out->type() == Type::float16) {
gSelect<<<blocks, threads>>>(out->data<half>(),
@ -1197,7 +1214,8 @@ void Select(Tensor out,
in->data<half>(),
in->shape(),
axisGPU,
indices->data<IndexType>());
indices->data<IndexType>(),
indices->shape());
#endif
} else {
ABORT("Select not implemented for type {}", out->type());
@ -1224,7 +1242,8 @@ void Insert(Tensor out,
in->data<float>(),
in->shape(),
axisGPU,
indices->data<IndexType>());
indices->data<IndexType>(),
indices->shape());
#if COMPILE_FP16
} else if (out->type() == Type::float16) {
gInsert<<<blocks, threads>>>(out->data<half>(),
@ -1232,7 +1251,8 @@ void Insert(Tensor out,
in->data<half>(),
in->shape(),
axisGPU,
indices->data<IndexType>());
indices->data<IndexType>(),
indices->shape());
#endif
} else {
ABORT("Insert not implemented for type {}", out->type());
@ -1522,11 +1542,11 @@ __global__ void gCrossEntropyPick(T* out,
__syncthreads();
// cross-entropy
auto sum = _sum[0];
for(int tid = 0; tid < cols; tid += blockDim.x) {
int id = tid + threadIdx.x;
if(id == (int)pick[j]) {
out[j] = (T)functional::Ops<AccType>::log(_sum[0]) - sp[id] + max;
}
if(id == (int)pick[j])
out[j] = (T)functional::Ops<AccType>::log(sum) - sp[id] + max;
}
}
__syncthreads();
@ -1628,7 +1648,8 @@ __global__ void gCrossEntropyPickBackward(T* out,
int id = tid + threadIdx.x;
if(id < cols) {
AccType sub = (AccType)(id == (int)pick[j]);
so[id] += (AccType)adj[j] * (functional::Ops<AccType>::exp(sp[id] - max) / _sum[0] - sub);
auto softmax = functional::Ops<AccType>::exp(sp[id] - max) / _sum[0];
so[id] += (AccType)adj[j] * (softmax - sub);
}
}
}
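Note: for reference, gCrossEntropyPick computes per row the stable log-sum-exp form out[j] = log(sum_i exp(x_i - max)) - (x_pick - max), and the backward kernel above applies dx_i = adj * (softmax_i - [i == pick]). A small CPU reference sketch:

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
  std::vector<float> x = {1.f, 2.f, 3.f}; // one row of logits
  size_t pick = 2;                        // picked (target) class
  float max = *std::max_element(x.begin(), x.end());
  float sum = 0.f;
  for (float v : x) sum += std::exp(v - max);
  float ce = std::log(sum) - (x[pick] - max); // cross-entropy of the picked class
  printf("CE = %f\n", ce);                    // ~0.4076
  for (size_t i = 0; i < x.size(); ++i) {
    float softmax = std::exp(x[i] - max) / sum;
    float grad = softmax - (i == pick ? 1.f : 0.f); // adj assumed 1
    printf("dx[%zu] = %f\n", i, grad);
  }
  return 0;
}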
@ -1661,53 +1682,27 @@ void CrossEntropyPickBackward(Tensor out, Tensor adj, Tensor a, Tensor indices)
}
}
float L2Norm(Tensor in, Ptr<Allocator> allocator) {
// computes the L2Norm of a tensor and returns the value as float on the CPU,
// this is mostly used for diagnostic purposes and gradient clipping
float L2Norm(Tensor in, Ptr<Allocator> allocator) { // @TODO: reverse order of arguments
cudaSetDevice(in->getDeviceId().no);
int size = in->shape().elements();
int threads = std::min(MAX_THREADS, size);
int blocks = std::min(MAX_BLOCKS, size / threads + (size % threads != 0));
int blocks = std::min(MAX_BLOCKS, size / threads + (size % threads != 0));
if(allocator) {
auto memoryPiece = allocator->alloc<float>(blocks);
auto blockMem = TensorBase::New(memoryPiece, Shape({1, blocks}), Type::float32, in->getBackend());
using namespace functional;
if(in->type() == Type::float32) {
ReduceAll<float, float>(_1 * _1, blockMem, in);
using namespace functional;
float l2Norm;
if(in->type() == Type::float32) {
l2Norm = std::sqrt(AggregateAllAndReturn</*ElementType=*/float, /*AccType=*/float>(allocator, /*functor=*/_1 * _1, /*aggInit=*/0.f, /*aggFunctor=*/_1 + _2, /*scale=*/1.f, in));
#if COMPILE_FP16
} else if(in->type() == Type::float16) {
ReduceAll<half, float>(_1 * _1, blockMem, in);
} else if(in->type() == Type::float16) {
l2Norm = std::sqrt(AggregateAllAndReturn</*ElementType=*/half, /*AccType=*/float>(allocator, /*functor=*/_1 * _1, /*aggInit=*/0.f, /*aggFunctor=*/_1 + _2, /*scale=*/1.f, in));
#endif
} else {
ABORT("L2Norm not implemented for type {}", in->type());
}
float dataCpu = sqrtf(blockMem->get<float>(0));
allocator->free(memoryPiece);
return dataCpu;
} else { // @TODO: this branch is to be removed with next PR, old version
uint8_t* data;
cudaMalloc(&data, blocks * sizeof(float));
Tensor out(TensorBase::New(MemoryPiece::New(data, blocks * sizeof(float)),
Shape({1, blocks}),
Type::float32,
in->getBackend()));
using namespace functional;
if(in->type() == Type::float32) {
ReduceAll<float, float>(_1 * _1, out, in);
#if COMPILE_FP16
} else if(in->type() == Type::float16) {
ReduceAll<half, float>(_1 * _1, out, in);
#endif
} else {
ABORT("L2Norm not implemented for type {}", in->type());
}
float dataCpu = sqrtf(out->get<float>(0));
out.reset();
cudaFree(data);
return dataCpu;
} else {
ABORT("L2Norm not implemented for type {}", in->type());
}
return l2Norm;
}
template <typename T, typename AccType = float>
@ -1761,22 +1756,22 @@ __global__ void gAtt(T* out,
void Att(Tensor out, Tensor va, Tensor context, Tensor state) {
cudaSetDevice(out->getDeviceId().no);
size_t m = out->shape().elements() / out->shape().back();
size_t k = context->shape()[-1];
size_t b = context->shape()[-2];
size_t t = context->shape()[-3];
size_t totalRows = out->shape().elements() / out->shape().back(); // number of rows
size_t modelDim = context->shape()[-1]; // number of cols
size_t batchDim = context->shape()[-2];
size_t contextWordsDim = context->shape()[-3];
int blocks = std::min(MAX_BLOCKS, (int)m);
int threads = std::min(MAX_THREADS, (int)k);
int blocks = std::min(MAX_BLOCKS, (int)totalRows);
int threads = std::min(MAX_THREADS, (int)modelDim);
int shared = sizeof(float) * threads;
if(out->type() == Type::float32) {
gAtt<float, float><<<blocks, threads, shared>>>(
out->data<float>(), va->data<float>(), context->data<float>(), state->data<float>(), m, k, b, t);
out->data<float>(), va->data<float>(), context->data<float>(), state->data<float>(), totalRows, modelDim, batchDim, contextWordsDim);
#if COMPILE_FP16
} else if (out->type() == Type::float16) {
gAtt<half, float><<<blocks, threads, shared>>>(
out->data<half>(), va->data<half>(), context->data<half>(), state->data<half>(), m, k, b, t);
out->data<half>(), va->data<half>(), context->data<half>(), state->data<half>(), totalRows, modelDim, batchDim, contextWordsDim);
#endif
} else {
ABORT("gAtt not implemented for type {}", out->type());
@ -1930,7 +1925,7 @@ __global__ void gLNormalization(T* out,
len = (len + 1) >> 1;
}
__syncthreads();
AccType sigma = functional::Ops<AccType>::sqrt(_sqSum[0] / N); // all AccType
AccType sigma = functional::Ops<AccType>::sqrt(_sqSum[0] / N + eps); // all AccType
__syncthreads();
for(int tid = 0; tid < cols; tid += blockDim.x) {
@ -1939,7 +1934,7 @@ __global__ void gLNormalization(T* out,
AccType gammav = (AccType)gamma[id];
AccType xv = (AccType)xRow[id];
AccType betav = beta ? (AccType)beta[id] : (AccType)0.f;
AccType lv = (xv - mean) / (sigma + eps);
AccType lv = (xv - mean) / sigma;
AccType y = gammav * lv + betav;
yRow[id] = (T)y;
}
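Note: the two changes above move eps inside the square root, reverting to the old position. In formulas (a sketch, with N the row width):

% old: \hat{x}_i = \gamma_i \frac{x_i - \mu}{\sigma + \epsilon} + \beta_i,
%      \sigma  = \sqrt{\tfrac{1}{N}\sum_j (x_j - \mu)^2}
% new: \hat{x}_i = \gamma_i \frac{x_i - \mu}{\sigma'} + \beta_i,
%      \sigma' = \sqrt{\tfrac{1}{N}\sum_j (x_j - \mu)^2 + \epsilon}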
@ -2005,10 +2000,10 @@ __global__ void gLayerNormalizationGrad(T* gradX,
for(int bid = 0; bid < rows; bid += gridDim.x) {
int j = bid + blockIdx.x;
if(j < rows) {
AccType* sum_adj = shared;
AccType* sum_adj_x = shared + blockDim.x;
AccType* sum_x = shared + 2 * blockDim.x;
AccType* sum_sqr = shared + 3 * blockDim.x;
AccType* sum_adj = shared; // sum of incoming gradient
AccType* sum_adj_l = shared + blockDim.x; // sum of incoming gradient times layer-norm output l (recovered from y)
AccType* sum_x = shared + 2 * blockDim.x; // sum of input value x
AccType* sum_sqr = shared + 3 * blockDim.x; // sum of (x - mean)^2
const T* xRow = x + j * cols;
const T* yRow = y + j * cols;
@ -2016,7 +2011,7 @@ __global__ void gLayerNormalizationGrad(T* gradX,
sum_x[threadIdx.x] = (AccType)0.0f;
sum_adj[threadIdx.x] = (AccType)0.0f;
sum_adj_x[threadIdx.x] = (AccType)0.0f;
sum_adj_l[threadIdx.x] = (AccType)0.0f;
sum_sqr[threadIdx.x] = (AccType)0.0f;
for(int tid = 0; tid < cols; tid += blockDim.x) {
@ -2027,10 +2022,10 @@ __global__ void gLayerNormalizationGrad(T* gradX,
AccType betav = beta ? (AccType)beta[id] : (AccType)0.f;
AccType gammav = (AccType)gamma[id];
AccType adjv = adjRow[id];
AccType lv = (yv - betav) / (gammav + eps); // go back to LN(x) from scaled and shifted version for accumulation
AccType lv = (yv - betav) / gammav; // go back to LN(x) from scaled and shifted version for accumulation
sum_x[threadIdx.x] += xv;
sum_adj_x[threadIdx.x] += adjv * lv;
sum_adj_l[threadIdx.x] += adjv * lv;
sum_adj[threadIdx.x] += adjv;
}
}
@ -2042,7 +2037,7 @@ __global__ void gLayerNormalizationGrad(T* gradX,
if(threadIdx.x < (len >> 1)) {
sum_x[threadIdx.x] += sum_x[threadIdx.x + skip]; // Accumulates in AccType
sum_adj[threadIdx.x] += sum_adj[threadIdx.x + skip]; // Accumulates in AccType
sum_adj_x[threadIdx.x] += sum_adj_x[threadIdx.x + skip]; // Accumulates in AccType
sum_adj_l[threadIdx.x] += sum_adj_l[threadIdx.x + skip]; // Accumulates in AccType
}
len = (len + 1) >> 1;
}
@ -2069,33 +2064,32 @@ __global__ void gLayerNormalizationGrad(T* gradX,
len = (len + 1) >> 1;
}
__syncthreads();
AccType sigma = functional::Ops<AccType>::sqrt(sum_sqr[0] / N);
AccType sigma = functional::Ops<AccType>::sqrt(sum_sqr[0] / N + eps);
__syncthreads();
// Jacobian of layer norm
// J = [ \frac{1}{N\sigma} (N\delta_{ij} - l_i l_j - 1) ]_{ij}
// J * a = dC/dx_i = ( N v_i - l_i \sum_j l_j a_j - \sum_j a_j ) / (N \sigma)
// J * a = dC/dx_i = ( N a_i - l_i \sum_j l_j a_j - \sum_j a_j ) / (N \sigma)
for(int tid = 0; tid < cols; tid += blockDim.x) {
int id = tid + threadIdx.x;
if(id < cols) {
AccType xv = xRow[id];
//AccType yv = yRow[id];
//AccType betav = beta ? (AccType)beta[id] : (AccType)0.f;
AccType gammav = (AccType)gamma[id];
AccType adjv = adjRow[id];
AccType lv = (xv - mean) / (sigma + eps);
AccType lv = (xv - mean) / sigma;
AccType gradLv = N * adjv - lv * sum_adj_x[0] - sum_adj[0];
gradLv /= N * (sigma + eps); // eps has to be inside parentheses for correct gradient
AccType gradLv = N * adjv - lv * sum_adj_l[0] - sum_adj[0];
gradLv /= N * sigma;
AccType gradXv = gammav * gradLv;
// Keep LN gradient between [-10, 10]
// AccType sign = functional::Ops<AccType>::sgn(gradXv);
// AccType cutoff = (AccType)10.f;
// gradXv = functional::Ops<AccType>::abs(gradXv) > cutoff ? sign * cutoff : gradXv;
// Keep LN gradient between [-1000, 1000] for TensorOps, this is currently used for making values fit into fp16. @TODO: to be fixed and removed.
AccType sign = functional::Ops<AccType>::sgn(gradXv);
AccType cutoff = (AccType)1000.f; // @TODO: expose this somehow as an option?
// or better: make obsolete.
gradXv = functional::Ops<AccType>::abs(gradXv) > cutoff ? sign * cutoff : gradXv;
T* gradXRow = gradX + j * cols;
gradXRow[id] += (T)(gradXv);
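Note: the gradient formula in the comment above can be checked numerically. A stand-alone CPU sketch (gamma/beta omitted, eps inside the sqrt as in this commit) comparing the analytic expression against central finite differences:

#include <cmath>
#include <cstdio>
#include <vector>

static const float EPS = 1e-9f;

// layer norm of one row, no gamma/beta, eps inside the sqrt
std::vector<float> layerNorm(const std::vector<float>& x) {
  int N = (int)x.size();
  float mean = 0.f, var = 0.f;
  for (float v : x) mean += v / N;
  for (float v : x) var += (v - mean) * (v - mean) / N;
  float sigma = std::sqrt(var + EPS);
  std::vector<float> l(N);
  for (int i = 0; i < N; ++i) l[i] = (x[i] - mean) / sigma;
  return l;
}

int main() {
  std::vector<float> x = {0.5f, -1.0f, 2.0f, 0.25f};
  std::vector<float> a = {1.0f, -0.5f, 0.25f, 2.0f}; // incoming gradient (adj)
  int N = (int)x.size();
  std::vector<float> l = layerNorm(x);
  float mean = 0.f, var = 0.f;
  for (float v : x) mean += v / N;
  for (float v : x) var += (v - mean) * (v - mean) / N;
  float sigma = std::sqrt(var + EPS);
  float sum_adj = 0.f, sum_adj_l = 0.f;
  for (int i = 0; i < N; ++i) { sum_adj += a[i]; sum_adj_l += a[i] * l[i]; }
  for (int i = 0; i < N; ++i) {
    float analytic = (N * a[i] - l[i] * sum_adj_l - sum_adj) / (N * sigma);
    float h = 1e-3f; // central finite difference of C = sum_j a_j * LN(x)_j
    std::vector<float> xp = x, xm = x;
    xp[i] += h; xm[i] -= h;
    std::vector<float> lp = layerNorm(xp), lm = layerNorm(xm);
    float cp = 0.f, cm = 0.f;
    for (int j = 0; j < N; ++j) { cp += a[j] * lp[j]; cm += a[j] * lm[j]; }
    printf("dx[%d]: analytic %.5f vs numeric %.5f\n", i, analytic, (cp - cm) / (2 * h));
  }
  return 0;
}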

View File

@ -28,18 +28,20 @@ std::string TensorBase::debug(int precision, int dispCols) {
else
strm << std::fixed << std::setprecision(0) << std::setfill(' ');
// double maxv = std::numeric_limits<double>::lowest();
// double minv = std::numeric_limits<double>::max();
// double l2Norm = 0.0;
double maxv = std::numeric_limits<double>::lowest();
double minv = std::numeric_limits<double>::max();
double l2Sum = 0.0;
for(int i = 0; i < values.size(); ++i) {
if((double)values[i] > maxv) maxv = (double)values[i];
if((double)values[i] < minv) minv = (double)values[i];
l2Sum += (double)values[i] * (double)values[i];
}
strm << "min: " << minv << " max: " << maxv << " l2-norm: " << sqrt(l2Sum) << std::endl;
for(int i = 0; i < values.size(); ++i) {
std::vector<int> dims;
shape().dims(i, dims);
// if((double)values[i] > maxv) maxv = values[i];
// if((double)values[i] < minv) minv = values[i];
// l2Norm += (double)values[i] * (double)values[i];
bool disp = true;
for(int j = 0; j < dims.size(); ++j)
disp = disp && (dims[j] < dispCols || dims[j] >= shape()[j] - dispCols);
@ -95,8 +97,6 @@ std::string TensorBase::debug(int precision, int dispCols) {
}
}
strm << std::endl;
//strm << "min: " << minv << " max: " << maxv << " l2-norm: " << sqrt(l2Norm);
return strm.str();
}

View File

@ -54,12 +54,12 @@ void Add(Functor functor, float scale, marian::Tensor out, Tensors... tensors) {
gpu::Add(functor, scale, out, tensors...);
else
#endif
cpu::Aggregate(functor, 0.0f, functional::_1 + functional::_2, scale, out, tensors...);
cpu::Aggregate(functor, /*aggInit=*/0.0f, functional::_1 + functional::_2, scale, out, tensors...);
}
template <class Functor, class... Tensors>
void Add(Functor functor, marian::Tensor out, Tensors... tensors) {
Add(functor, 1, out, tensors...);
Add(functor, /*scale=*/1.f, out, tensors...);
}
template <class Functor, class AggFunctor, class... Tensors>

View File

@ -40,6 +40,9 @@ TEST_CASE("Options can be accessed", "[fastopt]") {
"subnode: {"
" baz: [ 111.5, False ],"
" qux: 222,"
" preprocess1: n,"
" preprocess2: d,"
" preprocess3: y,"
" }"
"}");
@ -57,6 +60,9 @@ TEST_CASE("Options can be accessed", "[fastopt]") {
CHECK( o["subnode"]["baz"][0].as<float>() == 111.5f );
CHECK( o["subnode"]["baz"][1].as<bool>() == false );
CHECK( o["subnode"]["baz"][0].as<int>() == 111 );
CHECK( o["subnode"]["preprocess1"].as<std::string>() == "n" ); // don't allow "n" to be cast to boolean false while converting from YAML
CHECK( o["subnode"]["preprocess2"].as<std::string>() == "d" );
CHECK( o["subnode"]["preprocess3"].as<std::string>() == "y" ); // don't allow "y" to be cast to boolean true while converting from YAML
}
node["foo"] = "baz";

View File

@ -670,16 +670,16 @@ void tests(DeviceType device, Type floatType = Type::float32) {
values.clear();
std::vector<T> vA({ 1, -2, 3,
-4, 5, -6,
7, -8, 9,
-10, 11, -12});
-4, 5, -6,
7, -8, 9,
-10, 11, -12});
std::vector<T> vC({ 1, -2, // C = np.array([1, -2, 3, -4, 5, -6, 7, -8, 9, -10, 11, -12]).reshape((2, 3, 2))
3, -4,
5, -6,
3, -4,
5, -6,
7, -8,
9, -10,
11, -12 });
7, -8,
9, -10,
11, -12 });
std::vector<T> vB1({1, -2, 3});
std::vector<T> vB2({1, -4, 7, -10});
std::vector<T> vB3({-2, 5, -8, 11});
@ -687,7 +687,7 @@ void tests(DeviceType device, Type floatType = Type::float32) {
std::vector<T> vD1(vB4);
std::vector<T> vD2({5, -6, 11, -12});
std::vector<T> vD3({1, -2, 5, -6, 7, -8, 11, -12}); // C[:,(0,2),:]
//std::vector<float> vD4({5, -6, 3, -4, 7, -8, 11, -12}); // [C[0,(2,1),:],C[1,(0,2),:]]
std::vector<T> vD4({5, -6, 3, -4, 7, -8, 11, -12}); // [C[0,(2,1),:],C[1,(0,2),:]]
std::vector<T> vS1({7, -8, 9});
std::vector<T> vS2({-4, 5, -6, 7, -8, 9});
std::vector<T> vS3({7, -8, 9, -10, 11, -12});
@ -714,11 +714,11 @@ void tests(DeviceType device, Type floatType = Type::float32) {
CHECK(D1->type() == "sliceView");
CHECK(D2->type() == "gather");
// enable this once gather() supports batched indices:
//auto D4 = gather(C, 1, graph->constant({2, 2, 1}, // [C[0,(2,1),:],C[1,(0,2),:]]
// inits::fromVector(std::vector<IndexType>{
// 2, 1,
// 0, 2 }),
// Type::uint32));
auto D4 = gather(C, 1, graph->constant({2, 2, 1}, // [C[0,(2,1),:],C[1,(0,2),:]]
inits::fromVector(std::vector<IndexType>{
2, 1,
0, 2 }),
Type::uint32));
auto S1 = slice(A, 0, 2);
auto S2 = narrow(A, 0, 1, 2);
@ -736,7 +736,7 @@ void tests(DeviceType device, Type floatType = Type::float32) {
CHECK(D1->shape() == Shape({1, 3, 2})); D1->val()->get(values); CHECK( values == vD1 );
CHECK(D2->shape() == Shape({2, 1, 2})); D2->val()->get(values); CHECK( values == vD2 );
CHECK(D3->shape() == Shape({2, 2, 2})); D3->val()->get(values); CHECK( values == vD3 );
//CHECK(D4->shape() == Shape({2, 2, 2})); D4->val()->get(values); CHECK( values == vD4 );
CHECK(D4->shape() == Shape({2, 2, 2})); D4->val()->get(values); CHECK( values == vD4 );
CHECK(S1->shape() == Shape({1,3})); S1->val()->get(values); CHECK(values == vS1);
CHECK(S2->shape() == Shape({2,3})); S2->val()->get(values); CHECK(values == vS2);
@ -789,3 +789,59 @@ TEST_CASE("Expression graph supports basic math operations (cpu)", "[operator]")
tests<float>(DeviceType::cpu);
}
#endif
#ifdef BLAS_FOUND
#ifdef CUDA_FOUND
TEST_CASE("Compare aggregate operator", "[graph]") {
auto floatApprox = [](float x, float y) -> bool { return x == Approx(y).epsilon(0.01); };
Config::seed = 1234;
std::vector<float> initc;
std::vector<float> inita;
{
auto graph = New<ExpressionGraph>();
graph->setDevice({0, DeviceType::cpu});
graph->reserveWorkspaceMB(40);
auto chl = graph->param("1x10x512x2048", {1, 10, 512, 2048}, inits::normal());
auto adj = graph->param("1x1x512x2048", {1, 1, 512, 2048}, inits::normal());
graph->forward();
chl->val()->get(initc);
adj->val()->get(inita);
}
SECTION("initializing with zero (cpu)") {
std::vector<float> values1;
std::vector<float> values2;
auto graph1 = New<ExpressionGraph>();
graph1->setDevice({0, DeviceType::cpu});
graph1->reserveWorkspaceMB(40);
auto graph2 = New<ExpressionGraph>();
graph2->setDevice({0, DeviceType::gpu});
graph2->reserveWorkspaceMB(40);
auto chl1 = graph1->param("1x10x512x2048", {1, 10, 512, 2048}, inits::fromVector(initc));
auto adj1 = graph1->param("1x1x512x2048", {1, 1, 512, 2048}, inits::fromVector(inita));
auto prod1 = scalar_product(chl1, adj1, -1);
graph1->forward();
auto chl2 = graph2->param("1x10x512x2048", {1, 10, 512, 2048}, inits::fromVector(initc));
auto adj2 = graph2->param("1x1x512x2048", {1, 1, 512, 2048}, inits::fromVector(inita));
auto prod2 = scalar_product(chl2, adj2, -1);
graph2->forward();
prod1->val()->get(values1);
prod2->val()->get(values2);
CHECK( std::equal(values1.begin(), values1.end(), values2.begin(), floatApprox) );
}
}
#endif
#endif
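Note: the test above compares CPU and GPU results element-wise under a relative tolerance (the floatApprox lambda). A stand-alone analogue of that comparison, with made-up values:

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

bool floatApprox(float x, float y, float relEps = 0.01f) {
  return std::fabs(x - y) <= relEps * std::fabs(y); // roughly Approx(y).epsilon(0.01)
}

int main() {
  std::vector<float> cpu = {1.000f, -2.500f, 3.1415f}; // made-up values
  std::vector<float> gpu = {1.001f, -2.499f, 3.1413f};
  bool ok = std::equal(cpu.begin(), cpu.end(), gpu.begin(),
                       [](float a, float b) { return floatApprox(a, b); });
  printf("match: %d\n", (int)ok); // 1
  return 0;
}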

View File

@ -109,6 +109,8 @@ public:
auto cost = model->build(graph, batch);
fits = graph->fits();
LOG(debug, "[batching] length: {} - size: {} - fits: {}", lengths[0], current, fits);
if(fits) {
stats->add(batch, multiplier);
start = current + 1;

View File

@ -18,6 +18,7 @@ AsyncGraphGroup::AsyncGraphGroup(Ptr<Options> config, Ptr<IMPIWrapper> mpi)
for(auto device : devices_) {
auto graph = New<ExpressionGraph>();
graph->setDevice(device);
graph->setCheckpointing(options_->get<bool>("gradient-checkpointing"));
graph->getBackend()->setClip(options_->get<float>("clip-gemm"));
graph->reserveWorkspaceMB(options_->get<size_t>("workspace"));
graphs_.push_back(graph);

View File

@ -34,6 +34,7 @@ public:
// Initialize graph
graph_ = New<ExpressionGraph>();
graph_->setDevice(deviceId);
graph_->setCheckpointing(options_->get<bool>("gradient-checkpointing"));
graph_->getBackend()->setClip(options_->get<float>("clip-gemm"));
graph_->reserveWorkspaceMB(options_->get<size_t>("workspace"));
opt_ = Optimizer(options_);

View File

@ -10,6 +10,7 @@ SyncGraphGroup::SyncGraphGroup(Ptr<Options> config, Ptr<IMPIWrapper> mpi)
for(auto device : devices_) {
auto graph = New<ExpressionGraph>();
graph->setDevice(device);
graph->setCheckpointing(options_->get<bool>("gradient-checkpointing"));
graph->reserveWorkspaceMB(options_->get<size_t>("workspace"));
graph->getBackend()->setClip(options_->get<float>("clip-gemm"));
@ -57,19 +58,6 @@ void SyncGraphGroup::initialize(const Ptr<data::Batch>& exampleBatch) {
if (i > 0)
graphs_[i]->params()->vals()->copyFrom(graphs_[0]->params()->vals());
});
//ThreadPool pool(graphs_.size() - 1, graphs_.size() - 1);
//for(size_t i = 1; i < graphs_.size(); ++i) {
// auto init = [&](size_t i) {
// // initialize i-th graph and weights
// builders_[i]->build(graphs_[i], exampleBatch);
// graphs_[i]->forward();
// // overwrite weights of i-th graph with weights from 0-th graph
// graphs_[i]->params()->vals()->copyFrom(graphs_[0]->params()->vals());
// };
// pool.enqueue(init, i);
//}
//// ThreadPool destructor waits until completion of all tasks.
//// @TODO: can we use comm_->foreach()?
}
void SyncGraphGroup::initializeAvg() {
@ -401,15 +389,20 @@ void SyncGraphGroup::update(std::vector<Ptr<data::Batch>> subBatches, size_t num
paramsAvg_[idx], curParam, scheduler_->numberOfBatches(), updateTrgWords);
};
comm_->scatterReduceAndResetGrads(); // reduce gradients across all devices (globally) into shards
comm_->foreach(update); // per-shard model-update
comm_->allGatherParams(); // distribute param value shards back
// cost across all local devices (scheduler will aggregate cross-process)
StaticLoss localLoss;
for(auto& l : localDeviceLosses) // localDeviceLosses is already summed up over delay steps
localLoss += l;
// model update
if (std::isfinite(localLoss.loss) || mpi_->numMPIProcesses() > 1) { // guard against NaN (except with MPI, as this simple way could hang it)
comm_->scatterReduceAndResetGrads(); // reduce gradients across all devices and MPI nodes into shards
comm_->foreach(update); // per-shard model-update
comm_->allGatherParams(); // distribute param value shards back
}
else
LOG(info, "[training] skipping {}-th update due to loss being {}", scheduler_->numberOfBatches(), localLoss.loss);
if(scheduler_) {
// track and log localLoss
scheduler_->update(localLoss, numReadBatches, effectiveBatchSize, effectiveBatchTrgWords, mpi_);

6
src/training/scheduler.h Normal file → Executable file
View File

@ -9,7 +9,7 @@
namespace marian {
bool getSigtermFlag();
void installSignalHandlers();
void installSignalHandlers();
class Scheduler : public TrainingObserver {
private:
@ -229,7 +229,7 @@ public:
continue;
size_t stalledPrev = validator->stalled();
float value = validator->validate(graphs);
float value = validator->validate(graphs, state_);
if(validator->stalled() > 0) {
LOG_VALID(info,
"Ep. {} : Up. {} : {} : {} : stalled {} times (last best: {})",
@ -358,7 +358,7 @@ public:
&& heartBeatTimer_.elapsed<std::chrono::minutes>() >= 10) {
printf("PROGRESS: %.2f%%\nEVALERR: %.7f%%\n",
(double)state_->epochs,
state_->costSum / state_->costCount / (mpi ? mpi->numMPIProcesses() : 1));
state_->costSum / (state_->costCount ? state_->costCount : 1) / (mpi ? mpi->numMPIProcesses() : 1));
fflush(stdout);
std::cout << "MBSIZE: " << batchLabels << " after " << state_->batches << " updates = " << state_->labelsTotal << " labels" << std::endl << std::flush;
heartBeatTimer_.start();

View File

@ -254,7 +254,7 @@ public:
seedCorpus = config["seed-corpus"].as<std::string>();
}
void save(const std::string& name) {
void save(const std::string& name) const {
std::ofstream fout(name);
YAML::Node config;
@ -291,6 +291,16 @@ public:
fout << config;
}
std::string fillTemplate(const std::string& templ) const {
// The formatting below uses fmtlib, which ships with spdlog
// and is pulled in via the logger.
return fmt::format(templ.c_str(),
fmt::arg("E", epochs),
fmt::arg("U", batches),
fmt::arg("B", batchesEpoch),
fmt::arg("T", labelsTotal));
}
private:
std::vector<Ptr<TrainingObserver>> observers_;
};
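Note: with fillTemplate above, a templated --valid-translation-output such as valid.{E}.{U}.out would expand using the current training state. A hypothetical stand-alone sketch; the include path for the spdlog-bundled fmt is an assumption, and the numbers are made up:

#include <spdlog/fmt/fmt.h>  // fmtlib as bundled with spdlog (assumed include path)
#include <iostream>
#include <string>

int main() {
  // Template with named fields, matching the fmt::arg names above.
  std::string templ = "valid.{E}.{U}.out";
  std::string fileName = fmt::format(templ.c_str(),
                                     fmt::arg("E", 3),        // epochs completed
                                     fmt::arg("U", 15000),    // updates/batches
                                     fmt::arg("B", 500),      // batches in current epoch
                                     fmt::arg("T", 1200000)); // target labels seen
  std::cout << fileName << std::endl;  // prints: valid.3.15000.out
  return 0;
}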

View File

@ -303,7 +303,8 @@ ScriptValidator::ScriptValidator(std::vector<Ptr<Vocab>> vocabs, Ptr<Options> op
"valid-script metric but no script given");
}
float ScriptValidator::validate(const std::vector<Ptr<ExpressionGraph>>& graphs) {
float ScriptValidator::validate(const std::vector<Ptr<ExpressionGraph>>& graphs,
Ptr<const TrainingState> /*ignored*/) {
using namespace data;
auto model = options_->get<std::string>("model");
std::string suffix = model.substr(model.size() - 4);
@ -331,7 +332,8 @@ TranslationValidator::TranslationValidator(std::vector<Ptr<Vocab>> vocabs, Ptr<O
createBatchGenerator(/*isTranslating=*/true);
}
float TranslationValidator::validate(const std::vector<Ptr<ExpressionGraph>>& graphs) {
float TranslationValidator::validate(const std::vector<Ptr<ExpressionGraph>>& graphs,
Ptr<const TrainingState> state) {
using namespace data;
// Generate batches
@ -353,6 +355,8 @@ float TranslationValidator::validate(const std::vector<Ptr<ExpressionGraph>>& gr
if(options_->hasAndNotEmpty("valid-translation-output")) {
fileName = options_->get<std::string>("valid-translation-output");
// fileName can be a template with fields for training state parameters:
fileName = state->fillTemplate(fileName);
} else {
tempFile.reset(new io::TemporaryFile(options_->get<std::string>("tempdir"), false));
fileName = tempFile->getFileName();
@ -455,7 +459,8 @@ BleuValidator::BleuValidator(std::vector<Ptr<Vocab>> vocabs, Ptr<Options> option
createBatchGenerator(/*isTranslating=*/true);
}
float BleuValidator::validate(const std::vector<Ptr<ExpressionGraph>>& graphs) {
float BleuValidator::validate(const std::vector<Ptr<ExpressionGraph>>& graphs,
Ptr<const TrainingState> state) {
using namespace data;
// Generate batches
@ -495,6 +500,8 @@ float BleuValidator::validate(const std::vector<Ptr<ExpressionGraph>>& graphs) {
Ptr<OutputCollector> collector;
if(options_->hasAndNotEmpty("valid-translation-output")) {
auto fileName = options_->get<std::string>("valid-translation-output");
// fileName can be a template with fields for training state parameters:
fileName = state->fillTemplate(fileName);
collector = New<OutputCollector>(fileName); // for debugging
} else {
collector = New<OutputCollector>(/* null */); // don't print, but log

View File

@ -37,7 +37,8 @@ protected:
public:
ValidatorBase(bool lowerIsBetter) : lowerIsBetter_(lowerIsBetter), lastBest_{initScore()} {}
virtual float validate(const std::vector<Ptr<ExpressionGraph>>& graphs) = 0;
virtual float validate(const std::vector<Ptr<ExpressionGraph>>& graphs,
Ptr<const TrainingState> state) = 0;
virtual std::string type() = 0;
float lastBest() { return lastBest_; }
@ -83,7 +84,8 @@ protected:
}
public:
virtual float validate(const std::vector<Ptr<ExpressionGraph>>& graphs) override {
virtual float validate(const std::vector<Ptr<ExpressionGraph>>& graphs,
Ptr<const TrainingState> /*ignored*/) override {
for(auto graph : graphs)
graph->setInference(true);
@ -176,7 +178,8 @@ class ScriptValidator : public Validator<data::Corpus, models::IModel> {
public:
ScriptValidator(std::vector<Ptr<Vocab>> vocabs, Ptr<Options> options);
virtual float validate(const std::vector<Ptr<ExpressionGraph>>& graphs) override;
virtual float validate(const std::vector<Ptr<ExpressionGraph>>& graphs,
Ptr<const TrainingState> /*ignored*/) override;
std::string type() override { return "valid-script"; }
@ -191,7 +194,8 @@ class TranslationValidator : public Validator<data::Corpus, models::IModel> {
public:
TranslationValidator(std::vector<Ptr<Vocab>> vocabs, Ptr<Options> options);
virtual float validate(const std::vector<Ptr<ExpressionGraph>>& graphs) override;
virtual float validate(const std::vector<Ptr<ExpressionGraph>>& graphs,
Ptr<const TrainingState> state) override;
std::string type() override { return "translation"; }
@ -209,7 +213,8 @@ class BleuValidator : public Validator<data::Corpus, models::IModel> {
public:
BleuValidator(std::vector<Ptr<Vocab>> vocabs, Ptr<Options> options, bool detok = false);
virtual float validate(const std::vector<Ptr<ExpressionGraph>>& graphs) override;
virtual float validate(const std::vector<Ptr<ExpressionGraph>>& graphs,
Ptr<const TrainingState> state) override;
// @TODO: why do we return this string, but not pass it to the constructor?
std::string type() override { return detok_ ? "bleu-detok" : "bleu"; }

View File

@ -39,6 +39,7 @@ public:
const std::vector<Ptr<ScorerState /*const*/>>& states,
Ptr<data::CorpusBatch /*const*/> batch, // for alignments only
Ptr<FactoredVocab/*const*/> factoredVocab, size_t factorGroup,
const std::vector<bool>& dropBatchEntries, // [origDimBatch] - empty source batch entries are marked with true, should be cleared after first use.
const std::vector<IndexType>& batchIdxMap) const { // [origBatchIdx -> currentBatchIdx]
std::vector<float> align; // collects alignment information from the last executed time step
if(options_->hasAndNotEmpty("alignment") && factorGroup == 0)
@ -49,9 +50,10 @@ public:
// create a reverse batchMap to obtain original batchIdx in the starting batch size
// and calculate the current batch size based on non-empty beams
std::vector<IndexType> reverseBatchIdxMap(batchIdxMap.size());
std::vector<IndexType> reverseBatchIdxMap; // empty if not purging batch entries
size_t currentDimBatch = beams.size();
if(PURGE_BATCH) {
reverseBatchIdxMap.resize(batchIdxMap.size()); // adjust size if doing batch purging.
currentDimBatch = 0;
for(int i = 0; i < batchIdxMap.size(); ++i) {
reverseBatchIdxMap[batchIdxMap[i]] = i; // reverse batch index mapping, multiple occurrences get overwritten with the last one,
@ -66,16 +68,22 @@ public:
// They can be between 0 and (vocabSize * nBestBeamSize * batchSize)-1.
// (beamHypIdx refers to the GPU tensors, *not* the beams[] array; they are not the same in case of purging)
const auto key = nBestKeys[i];
const float pathScore = nBestPathScores[i]; // expanded path score for (batchIdx, beamHypIdx, word)
// decompose key into individual indices (batchIdx, beamHypIdx, wordIdx)
const auto wordIdx = (WordIndex)(key % vocabSize);
const auto beamHypIdx = (key / vocabSize) % nBestBeamSize;
const auto currentBatchIdx = (key / vocabSize) / nBestBeamSize;
const auto beamHypIdx = (key / vocabSize) % nBestBeamSize;
const auto currentBatchIdx = (key / vocabSize) / nBestBeamSize;
const auto origBatchIdx = reverseBatchIdxMap.empty() ? currentBatchIdx : reverseBatchIdxMap[currentBatchIdx]; // map currentBatchIdx back into original position within starting maximal batch size, required to find correct beam
auto origBatchIdx = currentBatchIdx;
if(PURGE_BATCH)
origBatchIdx = reverseBatchIdxMap[currentBatchIdx]; // map currentBatchIdx back into original position within starting maximal batch size, required to find correct beam
bool dropHyp = !dropBatchEntries.empty() && dropBatchEntries[origBatchIdx];
// if we force-drop the hypothesis, assign EOS, otherwise the expected word id.
const auto wordIdx = dropHyp ? trgVocab_->getEosId().toWordIndex() : (WordIndex)(key % vocabSize);
// @TODO: We currently assign a log probability of 0 to all beam entries of the dropped batch entry, instead it might be a good idea to use
// the per Hyp pathScore without the current expansion (a bit hard to obtain).
// For the case where we drop empty inputs, 0 is fine. For other use cases like a forced stop, the penultimate pathScore might be better.
// For the empty hyp this would naturally result in 0, too.
const float pathScore = dropHyp ? 0.f : nBestPathScores[i]; // 0 (Prob = 1, maximum score) if dropped or expanded path score for (batchIdx, beamHypIdx, word)
const auto& beam = beams[origBatchIdx];
auto& newBeam = newBeams[origBatchIdx]; // extended hypotheses are going to be placed in this new beam
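Note: the decomposition above inverts key = (batchIdx * nBestBeamSize + beamHypIdx) * vocabSize + wordIdx. A minimal sketch with made-up sizes:

#include <cstdio>

int main() {
  const unsigned vocabSize = 32000, nBestBeamSize = 6;
  unsigned batchIdx = 3, beamHypIdx = 4, wordIdx = 1234;
  unsigned key = (batchIdx * nBestBeamSize + beamHypIdx) * vocabSize + wordIdx;
  printf("wordIdx    = %u\n", key % vocabSize);                   // 1234
  printf("beamHypIdx = %u\n", (key / vocabSize) % nBestBeamSize); // 4
  printf("batchIdx   = %u\n", (key / vocabSize) / nBestBeamSize); // 3
  return 0;
}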
@ -85,7 +93,7 @@ public:
if (pathScore <= INVALID_PATH_SCORE) // (dummy slot or word that cannot be expanded by current factor)
continue;
ABORT_IF(beamHypIdx >= beam.size(), "Out of bounds beamHypIdx??");
ABORT_IF(beamHypIdx >= beam.size(), "Out of bounds beamHypIdx??"); // effectively this is equivalent to ABORT_IF(beams[origBatchIdx].empty(), ...)
// map wordIdx to word
auto prevBeamHypIdx = beamHypIdx; // back pointer
@ -99,12 +107,17 @@ public:
// starting with the lemma, then adding factors one by one.
if (factorGroup == 0) {
word = factoredVocab->lemma2Word(shortlist ? shortlist->reverseMap(wordIdx) : wordIdx); // @BUGBUG: reverseMap is only correct if factoredVocab_->getGroupRange(0).first == 0
//std::vector<size_t> factorIndices; factoredVocab->word2factors(word, factorIndices);
//LOG(info, "new lemma {},{}={} -> {}->{}", word.toWordIndex(), factorIndices[0], factoredVocab->word2string(word), prevHyp->getPathScore(), pathScore);
std::vector<size_t> factorIndices; factoredVocab->word2factors(word, factorIndices);
//LOG(info, "{} + {} ({}) -> {} -> {}",
// factoredVocab->decode(prevHyp->tracebackWords()),
// factoredVocab->word2string(word), factorIndices[0], prevHyp->getPathScore(), pathScore);
}
else {
//LOG(info, "expand word {}={} with factor[{}] {} -> {}->{}", beam[beamHypIdx]->getWord().toWordIndex(),
// factoredVocab->word2string(beam[beamHypIdx]->getWord()), factorGroup, wordIdx, prevHyp->getPathScore(), pathScore);
//LOG(info, "{} |{} ({}) = {} ({}) -> {} -> {}",
// factoredVocab->decodeForDiagnostics(beam[beamHypIdx]->tracebackWords()),
// factoredVocab->getFactorGroupPrefix(factorGroup), factorGroup,
// factoredVocab->getFactorName(factorGroup, wordIdx), wordIdx,
// prevHyp->getPathScore(), pathScore);
word = beam[beamHypIdx]->getWord();
ABORT_IF(!factoredVocab->canExpandFactoredWord(word, factorGroup),
"A word without this factor snuck through to here??");
@ -235,7 +248,7 @@ public:
if(PURGE_BATCH)
if(newBeam.empty() && !beam.empty()) { // previous beam had hyps, but all were finished in this step, newBeam will now stay empty
for(size_t i = beamIdx + 1; i < beams.size(); ++i) // for all entries above this beam
batchIdxMap[i] = batchIdxMap[i] - 1; // make them look at one batch index below, as the current entry will be removed from the batch.
}
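// Illustrative sketch (assumed values): with origDimBatch = 4 and batchIdxMap = [0, 1, 2, 3],
// a beam finishing at beamIdx = 1 turns the map into [0, 1, 1, 2] via the loop above:
// beams 2 and 3 now look one batch position lower, since entry 1 is removed from the
// shrinking batch tensors while beams[] keeps its original size.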
@ -282,31 +295,22 @@ public:
states.push_back(scorer->startState(graph, batch));
}
// create one beam per batch entry with sentence-start hypothesis
Beams beams(origDimBatch, Beam(beamSize_, Hypothesis::New())); // array [origDimBatch] of array [maxBeamSize] of Hypothesis, keeps full size through search.
// batch purging is determined from an empty sub-beam.
std::vector<IndexType> batchIdxMap(origDimBatch); // Record at which batch entry a beam is looking.
// By default that corresponds to position in array,
// but shifts in the course of removing batch entries when they are finished.
const std::vector<bool> emptyBatchEntries; // used for recording if there are empty input batch entries
for(int origBatchIdx = 0; origBatchIdx < origDimBatch; ++origBatchIdx) {
batchIdxMap[origBatchIdx] = origBatchIdx; // map to same position on initialization
auto& beam = beams[origBatchIdx];
histories[origBatchIdx]->add(beam, trgEosId); // add beams with start-hypotheses to traceback grid
// Mark batch entries that consist only of source <EOS>, i.e. empty input lines. They will be forced to EOS and purged from the batch.
const auto& srcEosId = batch->front()->vocab()->getEosId();
const_cast<std::vector<bool>&>(emptyBatchEntries).push_back(batch->front()->data()[origBatchIdx] == srcEosId); // const_cast during construction
}
// determine index of UNK in the log prob vectors if we want to suppress it in the decoding process
@ -406,7 +410,7 @@ public:
}
}
if(factorGroup == 0)
currentDimBatch = (IndexType) batchIndices.size(); // keep batch size constant for all factor groups in a time step
prevPathScores = graph->constant({(int)maxBeamSize, 1, (int)currentDimBatch, 1}, inits::fromVector(prevScores));
}
if (!anyCanExpand) // all words cannot expand this factor: skip
@ -491,6 +495,7 @@ public:
states, // used for keeping track of per-ensemble-member path score
batch, // only used for propagating alignment info
factoredVocab, factorGroup,
emptyBatchEntries, // [origDimBatch] - empty source batch entries are marked with true
batchIdxMap); // used to create a reverse batch index map to recover original batch indices for this step
} // END FOR factorGroup = 0 .. numFactorGroups-1

View File

@ -42,33 +42,44 @@ public:
float getPathScore() const { return pathScore_; }
const std::vector<float>& getScoreBreakdown() { return scoreBreakdown_; }
void setScoreBreakdown(const std::vector<float>& scoreBreakdown) { scoreBreakdown_ = scoreBreakdown; }
const std::vector<float>& getAlignment() { return alignment_; }
void setAlignment(const std::vector<float>& align) { alignment_ = align; };
// trace back paths referenced from this hypothesis
Words tracebackWords() {
Words targetWords;
for(auto hyp = this; hyp->getPrevHyp(); hyp = hyp->getPrevHyp().get()) {
targetWords.push_back(hyp->getWord());
}
std::reverse(targetWords.begin(), targetWords.end());
return targetWords;
}
// calculate word-level scores for each target word by de-aggregating the path score
std::vector<float> tracebackWordScores() {
std::vector<float> scores;
// traverse hypotheses backward
for(auto hyp = this; hyp->getPrevHyp(); hyp = hyp->getPrevHyp().get()) {
// a path score is a cumulative score including scores from all preceding hypotheses (words),
// so calculate a word-level score by subtracting the previous path score from the current path score
auto prevPathScore = hyp->getPrevHyp() ? hyp->getPrevHyp().get()->pathScore_ : 0.f;
scores.push_back(hyp->pathScore_ - prevPathScore);
}
std::reverse(scores.begin(), scores.end());
return scores;
}
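// Worked example (illustrative numbers): for a hypothesis chain with cumulative path
// scores -0.5 -> -1.2 -> -1.8, tracebackWordScores() returns {-0.5, -0.7, -0.6};
// by construction the word-level scores sum back to the final path score of -1.8.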
// get soft alignments [t][s] -> P(s|t) for each target word starting from the hyp one
typedef data::SoftAlignment SoftAlignment;
SoftAlignment tracebackAlignment() {
SoftAlignment align;
for(auto hyp = this; hyp->getPrevHyp(); hyp = hyp->getPrevHyp().get()) {
align.push_back(hyp->getAlignment());
}
std::reverse(align.begin(), align.end());
return align; // [t][s] -> P(s|t)
}
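// Illustrative shape (assumed values): for a 3-token target over a 2-token source,
// tracebackAlignment() returns a 3 x 2 grid such as
//   align[0] = {0.9, 0.1}, align[1] = {0.2, 0.8}, align[2] = {0.5, 0.5},
// where each row holds the attention distribution P(s|t) over source positions.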
private:

View File

@ -1,5 +1,7 @@
#include "output_printer.h"
#include <sstream>
namespace marian {
std::string OutputPrinter::getAlignment(const Hypothesis::PtrType& hyp) {
@ -19,11 +21,18 @@ std::string OutputPrinter::getAlignment(const Hypothesis::PtrType& hyp) {
} else if(alignment_ == "hard") {
return data::ConvertSoftAlignToHardAlign(align, 1.f).toString();
} else if(alignmentThreshold_ > 0.f) {
return data::ConvertSoftAlignToHardAlign(align, alignmentThreshold_).toString();
} else {
ABORT("Unrecognized word alignment type");
}
}
std::string OutputPrinter::getWordScores(const Hypothesis::PtrType& hyp) {
std::ostringstream scores;
scores.precision(5);
for(const auto& score : hyp->tracebackWordScores())
scores << " " << std::fixed << score;
return scores.str();
}
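// Illustrative output (assumed scores): for word-level scores {-0.5, -0.7, -0.6} the
// function above yields " -0.50000 -0.70000 -0.60000" -- one fixed-precision value
// per target word, each preceded by a single space.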
} // namespace marian

View File

@ -20,12 +20,14 @@ public:
? options->get<size_t>("beam-size")
: 0),
alignment_(options->get<std::string>("alignment", "")),
alignmentThreshold_(getAlignmentThreshold(alignment_)),
wordScores_(options->get<bool>("word-scores")) {}
template <class OStream>
void print(Ptr<const History> history, OStream& best1, OStream& bestn) {
const auto& nbl = history->nBest(nbest_);
// prepare n-best list output
for(size_t i = 0; i < nbl.size(); ++i) {
const auto& result = nbl[i];
const auto& hypo = std::get<1>(result);
@ -40,6 +42,9 @@ public:
if(!alignment_.empty())
bestn << " ||| " << getAlignment(hypo);
if(wordScores_)
bestn << " ||| WordScores=" << getWordScores(hypo);
bestn << " |||";
if(hypo->getScoreBreakdown().empty()) {
bestn << " F0=" << hypo->getPathScore();
@ -72,17 +77,26 @@ public:
best1 << " ||| " << getAlignment(hypo);
}
if(wordScores_) {
const auto& hypo = std::get<1>(result);
best1 << " ||| WordScores=" << getWordScores(hypo);
}
best1 << std::flush;
}
private:
Ptr<Vocab const> vocab_;
bool reverse_{false}; // If it is a right-to-left model that needs reversed word order
size_t nbest_{0}; // Size of the n-best list to print
std::string alignment_; // A non-empty string indicates the type of word alignment
float alignmentThreshold_{0.f}; // Threshold for converting attention into hard word alignment
bool wordScores_{false}; // Whether to print word-level scores or not
// Get word alignment pairs or soft alignment
std::string getAlignment(const Hypothesis::PtrType& hyp);
// Get word-level scores
std::string getWordScores(const Hypothesis::PtrType& hyp);
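// Illustrative n-best line with word scores enabled (values assumed, not from this diff):
//   0 ||| ein Beispiel ||| WordScores= -0.50000 -0.70000 ||| F0=-1.2
// i.e. the WordScores field is inserted before the feature-score fields written in print() above.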
float getAlignmentThreshold(const std::string& str) {
try {

View File

@ -175,6 +175,7 @@ private:
std::vector<Ptr<Vocab>> srcVocabs_;
Ptr<Vocab> trgVocab_;
Ptr<const data::ShortlistGenerator> shortlistGenerator_;
size_t numDevices_;
@ -199,6 +200,11 @@ public:
trgVocab_ = New<Vocab>(options_, vocabPaths.size() - 1);
trgVocab_->load(vocabPaths.back());
// load lexical shortlist
if(options_->hasAndNotEmpty("shortlist"))
shortlistGenerator_ = New<data::LexicalShortlistGenerator>(
options_, srcVocabs_.front(), trgVocab_, 0, 1, vocabPaths.front() == vocabPaths.back());
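// A minimal usage sketch (CLI values assumed, not taken from this diff): starting the
// server with something like
//   ./marian-server -m model.npz -v vocab.spm vocab.spm --shortlist lex.s2t 100 100 -p 8080
// takes this branch; srcIdx = 0 and trgIdx = 1 pick the vocabularies to map between, and
// the final argument marks a shared vocabulary when source and target vocab paths match.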
// get device IDs
auto devices = Config::getDevices(options_);
numDevices_ = devices.size();
@ -218,8 +224,11 @@ public:
graphs_.push_back(graph);
auto scorers = createScorers(options_);
for(auto scorer : scorers) {
scorer->init(graph);
if(shortlistGenerator_)
scorer->setShortlistGenerator(shortlistGenerator_);
}
scorers_.push_back(scorers);
}
}

View File

@ -1,7 +1,7 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 15
VisualStudioVersion = 15.0.28307.902
MinimumVisualStudioVersion = 10.0.40219.1
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "Marian", "Marian.vcxproj", "{E2F320FE-0C01-4C80-810C-3A92205A29DC}"
EndProject
@ -20,6 +20,6 @@ Global
HideSolutionNode = FALSE
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {3B922907-3384-4D39-9CEB-816BF7BB390D}
EndGlobalSection
EndGlobal

View File

@ -43,14 +43,14 @@
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<LinkIncremental>true</LinkIncremental>
<IntDir>$(SolutionDir)$(Platform)\$(Configuration)\Marian\</IntDir>
<IncludePath>$(CudaToolkitIncludeDir);..\src\3rd_party\fbgemm\third_party\googletest\googletest;..\src\3rd_party\fbgemm\third_party\googletest\googletest\include;..\src\3rd_party\fbgemm\third_party\cpuinfo\deps\clog\include;..\src\3rd_party\fbgemm\third_party\cpuinfo\src;..\src\3rd_party\fbgemm\third_party\cpuinfo\include;..\src\3rd_party\fbgemm;..\src\3rd_party\fbgemm\third_party\asmjit\src;%MKL_PATH%\include;..\src\3rd_party\fbgemm\include;..\src;..\src\3rd_party;%BOOST_INCLUDE_PATH%;%ZLIB_PATH%\include;$(VC_IncludePath);$(WindowsSDK_IncludePath)</IncludePath>
<LibraryPath>$(CudaToolkitLibDir);%BOOST_LIB_PATH%;%ZLIB_PATH%\lib;%MKL_PATH%\lib\intel64;$(VC_LibraryPath_x64);$(WindowsSDK_LibraryPath_x64);$(NETFXKitsDir)Lib\um\x64</LibraryPath>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<LinkIncremental>false</LinkIncremental>
<ExecutablePath>$(ExecutablePath)</ExecutablePath>
<IntDir>$(SolutionDir)$(Platform)\$(Configuration)\Marian\</IntDir>
<IncludePath>$(CudaToolkitIncludeDir);..\src\3rd_party\fbgemm\third_party\googletest\googletest;..\src\3rd_party\fbgemm\third_party\googletest\googletest\include;..\src\3rd_party\fbgemm\third_party\cpuinfo\deps\clog\include;..\src\3rd_party\fbgemm\third_party\cpuinfo\src;..\src\3rd_party\fbgemm\third_party\cpuinfo\include;..\src\3rd_party\fbgemm;..\src\3rd_party\fbgemm\third_party\asmjit\src;%MKL_PATH%\include;..\src\3rd_party\fbgemm\include;..\src;..\src\3rd_party;%BOOST_INCLUDE_PATH%;%ZLIB_PATH%\include;$(VC_IncludePath);$(WindowsSDK_IncludePath)</IncludePath>
<LibraryPath>$(CudaToolkitLibDir);%BOOST_LIB_PATH%;%ZLIB_PATH%\lib;%MKL_PATH%\lib\intel64;$(VC_LibraryPath_x64);$(WindowsSDK_LibraryPath_x64);$(NETFXKitsDir)Lib\um\x64</LibraryPath>
</PropertyGroup>
<ItemDefinitionGroup>
@ -70,7 +70,7 @@
</PrecompiledHeader>
<WarningLevel>Level4</WarningLevel>
<Optimization>Disabled</Optimization>
<PreprocessorDefinitions>USE_MKL;ASMJIT_EXPORTS;BOOST_CONFIG_SUPPRESS_OUTDATED_MESSAGE; FBGEMM_EXPORTS; USE_FBGEMM=1; USE_SSE2=1; CUDA_FOUND=1; MKL_FOUND=1; MPI_FOUND=1; BLAS_FOUND=1; MKL_ILP64; WIN32;_DEBUG;_CONSOLE;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<SDLCheck>false</SDLCheck>
<TreatWarningAsError>true</TreatWarningAsError>
<AdditionalOptions>/bigobj /arch:AVX %(AdditionalOptions)</AdditionalOptions>
@ -107,7 +107,7 @@
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>USE_MKL;ASMJIT_EXPORTS;BOOST_CONFIG_SUPPRESS_OUTDATED_MESSAGE; FBGEMM_EXPORTS; USE_FBGEMM=1; USE_SSE2=1; CUDA_FOUND=1; MKL_FOUND=1; MPI_FOUND=1; BLAS_FOUND=1; MKL_ILP64; WIN32;NDEBUG;_CONSOLE;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<SDLCheck>false</SDLCheck>
<FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
<AdditionalOptions>/d2Zi+ /bigobj /arch:AVX %(AdditionalOptions)</AdditionalOptions>
@ -141,6 +141,102 @@
</ItemDefinitionGroup>
<ItemGroup>
<ClCompile Include="..\src\3rd_party\ExceptionWithCallStack.cpp" />
<ClCompile Include="..\src\3rd_party\fbgemm\bench\BenchUtils.cc">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\bench\ConvUnifiedBenchmark.cc">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\bench\Depthwise3DBenchmark.cc">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\bench\DepthwiseBenchmark.cc">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\bench\FP16Benchmark.cc">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\bench\GEMMsBenchmark.cc">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\bench\GEMMsTunableBenchmark.cc">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\bench\GroupwiseConvRequantizeBenchmark.cc">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\bench\I8SpmdmBenchmark.cc">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\bench\Im2ColFusedRequantizeBenchmark.cc">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\bench\PackedFloatInOutBenchmark.cc">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\bench\PackedRequantizeAcc16Benchmark.cc">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\bench\PackedRequantizeAcc32Benchmark.cc">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\bench\RequantizeBenchmark.cc">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\bench\RowOffsetBenchmark.cc">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\src\codegen_fp16fp32.cc">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\src\ExecuteKernel.cc">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</TreatWarningAsError>
@ -177,6 +273,12 @@
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\src\FbgemmI8Depthwise3DAvx2.cc">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\src\FbgemmI8DepthwiseAvx2.cc">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</TreatWarningAsError>
@ -201,6 +303,12 @@
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\src\GenerateKernelU8S8S32ACC16Avx512VNNI.cc">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\src\GenerateKernelU8S8S32ACC32.cc">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</TreatWarningAsError>
@ -213,6 +321,12 @@
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\src\GenerateKernelU8S8S32ACC32Avx512VNNI.cc">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\src\GroupwiseConvAcc32Avx2.cc">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</TreatWarningAsError>
@ -255,6 +369,12 @@
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\src\PackDepthwiseConvMatrixAvx2.cc">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\src\PackMatrix.cc">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</TreatWarningAsError>
@ -309,153 +429,253 @@
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\arch.cpp">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<ClCompile Include="..\src\3rd_party\fbgemm\test\FP16Test.cc">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\assembler.cpp">
<ClCompile Include="..\src\3rd_party\fbgemm\test\GConvTest.cc">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\test\I8DepthwiseTest.cc">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\test\I8SpmdmTest.cc">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\test\Im2ColFusedRequantizeTest.cc">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\test\PackedRequantizeAcc16Test.cc">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\test\PackedRequantizeTest.cc">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\test\QuantizationHelpers.cc">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\test\QuantUtilsTest.cc">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\test\RequantizeOnlyTest.cc">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\test\TestUtils.cc">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\test\UniConvTest.cc">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\arch.cpp">
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\assembler.cpp">
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\builder.cpp">
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\callconv.cpp">
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\codeholder.cpp">
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\compiler.cpp">
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\constpool.cpp">
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\cpuinfo.cpp">
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\emitter.cpp">
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\func.cpp">
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\globals.cpp">
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\inst.cpp">
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\jitallocator.cpp">
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\jitruntime.cpp">
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\logging.cpp">
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\operand.cpp">
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\osutils.cpp">
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\ralocal.cpp">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\codebuilder.cpp">
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\rapass.cpp">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\codecompiler.cpp">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\rastack.cpp">
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\codeemitter.cpp">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\string.cpp">
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\codeholder.cpp">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\support.cpp">
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\constpool.cpp">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\target.cpp">
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\cpuinfo.cpp">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\type.cpp">
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\func.cpp">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\virtmem.cpp">
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\globals.cpp">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\zone.cpp">
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\inst.cpp">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\zonehash.cpp">
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\logging.cpp">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\zonelist.cpp">
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\operand.cpp">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\zonestack.cpp">
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\osutils.cpp">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\zonetree.cpp">
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\regalloc.cpp">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\runtime.cpp">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\string.cpp">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\utils.cpp">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\vmem.cpp">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\zone.cpp">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\zonevector.cpp">
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86assembler.cpp">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86builder.cpp">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86callconv.cpp">
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86compiler.cpp">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86inst.cpp">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86features.cpp">
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86instimpl.cpp">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86instapi.cpp">
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86instdb.cpp">
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
@ -472,20 +692,10 @@
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86operand.cpp">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86operand_regs.cpp">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86regalloc.cpp">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86rapass.cpp">
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
@ -579,6 +789,11 @@
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\googletest\googletest\src\gtest-all.cc" />
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\googletest\googletest\src\gtest_main.cc">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</ClCompile>
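<!-- gtest-all.cc builds as part of the project, while gtest_main.cc is marked
     ExcludedFromBuild in both configurations; Marian provides its own entry
     point, so googletest's default main() is presumably not needed here. -->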
<ClCompile Include="..\src\3rd_party\pathie-cpp\src\entry_iterator.cpp" />
<ClCompile Include="..\src\3rd_party\pathie-cpp\src\errors.cpp" />
<ClCompile Include="..\src\3rd_party\pathie-cpp\src\path.cpp" />
@ -586,6 +801,10 @@
<ClCompile Include="..\src\3rd_party\pathie-cpp\src\pathie_ifstream.cpp" />
<ClCompile Include="..\src\3rd_party\pathie-cpp\src\pathie_ofstream.cpp" />
<ClCompile Include="..\src\3rd_party\pathie-cpp\src\temp.cpp" />
<ClCompile Include="..\src\3rd_party\phf\phf.cc">
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">TurnOffAllWarnings</WarningLevel>
<WarningLevel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">TurnOffAllWarnings</WarningLevel>
</ClCompile>
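<!-- phf.cc (perfect hash functions) compiles with warnings silenced like the
     other third-party sources; it appears to back the FastOpt implementation
     (fastopt.cpp / fastopt.h) referenced elsewhere in this project file. -->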
<ClCompile Include="..\src\3rd_party\sentencepiece\src\bpe_model.cc">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
@ -791,6 +1010,8 @@
<ClInclude Include="..\src\3rd_party\any_type.h" />
<ClInclude Include="..\src\3rd_party\avx_mathfun.h" />
<ClInclude Include="..\src\3rd_party\ExceptionWithCallStack.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\bench\AlignedVec.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\bench\BenchUtils.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\include\fbgemm\ConvUtils.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\include\fbgemm\Fbgemm.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\include\fbgemm\FbgemmBuild.h" />
@ -804,56 +1025,77 @@
<ClInclude Include="..\src\3rd_party\fbgemm\include\fbgemm\Types.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\include\fbgemm\Utils.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\include\fbgemm\UtilsAvx2.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\src\CodeCache.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\src\ExecuteKernel.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\src\ExecuteKernelGeneric.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\src\ExecuteKernelU8S8.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\src\FbgemmFP16UKernelsAvx2.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\src\FbgemmI8DepthwiseAvx2-inl.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\src\GenerateKernel.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\src\GroupwiseConv.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\src\OptimizedKernelsAvx2.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\src\RefImplementations.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\src\TransposeUtils.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\arm.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\test\QuantizationHelpers.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\test\TestUtils.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\asmjit.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\asmjit_apibegin.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\asmjit_apiend.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\asmjit_build.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\arch.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\assembler.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\codebuilder.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\codecompiler.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\codeemitter.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\codeholder.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\constpool.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\cpuinfo.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\func.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\globals.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\inst.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\logging.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\misc_p.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\operand.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\osutils.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\regalloc_p.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\runtime.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\simdtypes.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\string.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\utils.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\vmem.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\zone.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\arch.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\assembler.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\build.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\builder.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\callconv.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\codebufferwriter_p.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\codeholder.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\compiler.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\constpool.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\cpuinfo.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\datatypes.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\emitter.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\features.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\func.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\globals.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\inst.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\jitallocator.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\jitruntime.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\logging.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\misc_p.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\operand.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\osutils.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\raassignment_p.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\rabuilders_p.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\radefs_p.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\ralocal_p.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\rapass_p.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\rastack_p.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\string.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\support.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\target.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\type.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\virtmem.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\zone.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\zonehash.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\zonelist.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\zonestack.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\zonestring.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\zonetree.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\zonevector.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86assembler.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86builder.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86callconv_p.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86compiler.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86emitter.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86features.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86globals.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86inst.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86instimpl_p.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86instapi_p.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86instdb.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86instdb_p.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86internal_p.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86logging_p.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86misc.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86opcode_p.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86operand.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86regalloc_p.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86rapass_p.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\cpuinfo\deps\clog\include\clog.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\cpuinfo\include\cpuinfo-mock.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\cpuinfo\include\cpuinfo.h" />
@ -864,6 +1106,7 @@
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\cpuinfo\src\x86\api.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\cpuinfo\src\x86\cpuid.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\cpuinfo\src\x86\windows\api.h" />
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\googletest\googletest\src\gtest-internal-inl.h" />
<ClInclude Include="..\src\3rd_party\half_float\umHalf.h" />
<ClInclude Include="..\src\3rd_party\nccl\src\collectives\collectives.h">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
@ -962,6 +1205,7 @@
<ClInclude Include="..\src\3rd_party\pathie-cpp\include\pathie_ifstream.hpp" />
<ClInclude Include="..\src\3rd_party\pathie-cpp\include\pathie_ofstream.hpp" />
<ClInclude Include="..\src\3rd_party\pathie-cpp\include\temp.hpp" />
<ClInclude Include="..\src\3rd_party\phf\phf.h" />
<ClInclude Include="..\src\3rd_party\sentencepiece\src\bpe_model.h">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
@ -1094,6 +1338,7 @@
<ClCompile Include="..\src\common\cli_helper.cpp" />
<ClCompile Include="..\src\common\cli_wrapper.cpp" />
<ClCompile Include="..\src\common\config_validator.cpp" />
<ClCompile Include="..\src\common\fastopt.cpp" />
<ClCompile Include="..\src\common\filesystem.cpp" />
<ClCompile Include="..\src\common\file_stream.cpp" />
<ClCompile Include="..\src\common\io.cpp" />
@ -1136,13 +1381,10 @@
<ClCompile Include="..\src\rescorer\score_collector.cpp" />
<ClCompile Include="..\src\tensors\backend.cpp" />
<ClCompile Include="..\src\tensors\cpu\device.cpp" />
<ClCompile Include="..\src\tensors\cpu\fbgemm\packed_gemm.cpp" />
<ClCompile Include="..\src\tensors\cpu\prod.cpp" />
<ClCompile Include="..\src\tensors\cpu\sharp\avx_gemm.cpp" />
<ClCompile Include="..\src\tensors\cpu\sharp\int_gemm.cpp" />
<ClCompile Include="..\src\tensors\cpu\sharp\packed_gemm.cpp">
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
</ClCompile>
<ClCompile Include="..\src\tensors\cpu\sharp\sse_gemm.cpp" />
<ClCompile Include="..\src\tensors\cpu\tensor_operators.cpp" />
<ClCompile Include="..\src\graph\expression_graph.cpp" />
@ -1251,6 +1493,7 @@
<ClInclude Include="..\src\common\cli_helper.h" />
<ClInclude Include="..\src\common\cli_wrapper.h" />
<ClInclude Include="..\src\common\config_validator.h" />
<ClInclude Include="..\src\common\fastopt.h" />
<ClInclude Include="..\src\common\filesystem.h" />
<ClInclude Include="..\src\common\hash.h" />
<ClInclude Include="..\src\common\io.h" />
@ -1267,7 +1510,6 @@
<ClInclude Include="..\src\examples\mnist\validator.h" />
<ClInclude Include="..\src\functional\approx.h" />
<ClInclude Include="..\src\functional\operators.h" />
<ClInclude Include="..\src\graph\expression_graph_packable.h" />
<ClInclude Include="..\src\layers\loss.h" />
<ClInclude Include="..\src\layers\weight.h" />
<ClInclude Include="..\src\marian.h" />
@ -1487,9 +1729,10 @@
<ClInclude Include="..\src\rnn\types.h" />
<ClInclude Include="..\src\tensors\allocator.h" />
<ClInclude Include="..\src\tensors\backend.h" />
<ClInclude Include="..\src\tensors\cpu\expanded_gemm.h" />
<ClInclude Include="..\src\tensors\cpu\fbgemm\expanded_gemm.h" />
<ClInclude Include="..\src\tensors\cpu\fbgemm\expression_graph_packable.h" />
<ClInclude Include="..\src\tensors\cpu\fbgemm\packed_gemm.h" />
<ClInclude Include="..\src\tensors\cpu\sharp\int_gemm.h" />
<ClInclude Include="..\src\tensors\cpu\sharp\packed_gemm.h" />
<ClInclude Include="..\src\tensors\device.h" />
<ClInclude Include="..\src\tensors\dispatch.h" />
<ClInclude Include="..\src\tensors\gpu\add.h" />

View File

@ -490,9 +490,6 @@
<ClCompile Include="..\src\tensors\gpu\prod.cpp">
<Filter>tensors\gpu</Filter>
</ClCompile>
<ClCompile Include="..\src\tensors\cpu\sharp\packed_gemm.cpp">
<Filter>tensors\cpu\sharp</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\src\ExecuteKernel.cc">
<Filter>3rd_party\fbgemm\src</Filter>
</ClCompile>
@ -616,19 +613,127 @@
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\cpuinfo\src\x86\cache\init.c">
<Filter>3rd_party\fbgemm\third_party\cpuinfo\src\x86\cache</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\cpuinfo\deps\clog\src\clog.c">
<Filter>3rd_party\fbgemm\third_party\cpuinfo\deps\clog\src</Filter>
</ClCompile>
<ClCompile Include="..\src\common\aliases.cpp">
<Filter>common</Filter>
</ClCompile>
<ClCompile Include="..\src\common\filesystem.cpp">
<Filter>common</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\arch.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\assembler.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\builder.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\callconv.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\codeholder.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\compiler.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\constpool.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\cpuinfo.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\emitter.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\func.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\globals.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\inst.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\jitallocator.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\jitruntime.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\logging.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\operand.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\osutils.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\ralocal.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\rapass.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\rastack.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\string.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\support.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\target.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\type.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\virtmem.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\zone.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\zonehash.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\zonelist.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\zonestack.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\zonetree.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\zonevector.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86assembler.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86builder.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86callconv.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86compiler.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86inst.cpp">
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86features.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86instimpl.cpp">
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86instapi.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86instdb.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86internal.cpp">
@ -640,74 +745,110 @@
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86operand.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86operand_regs.cpp">
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86rapass.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86regalloc.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86</Filter>
<ClCompile Include="..\src\3rd_party\fbgemm\src\codegen_fp16fp32.cc">
<Filter>3rd_party\fbgemm\src</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\arch.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
<ClCompile Include="..\src\3rd_party\fbgemm\src\FbgemmI8Depthwise3DAvx2.cc">
<Filter>3rd_party\fbgemm\src</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\assembler.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
<ClCompile Include="..\src\3rd_party\fbgemm\src\GenerateKernelU8S8S32ACC16Avx512VNNI.cc">
<Filter>3rd_party\fbgemm\src</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\codebuilder.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
<ClCompile Include="..\src\3rd_party\fbgemm\src\GenerateKernelU8S8S32ACC32Avx512VNNI.cc">
<Filter>3rd_party\fbgemm\src</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\codecompiler.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
<ClCompile Include="..\src\3rd_party\fbgemm\src\PackDepthwiseConvMatrixAvx2.cc">
<Filter>3rd_party\fbgemm\src</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\codeemitter.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
<ClCompile Include="..\src\3rd_party\fbgemm\test\FP16Test.cc">
<Filter>3rd_party\fbgemm\test</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\codeholder.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
<ClCompile Include="..\src\3rd_party\fbgemm\test\GConvTest.cc">
<Filter>3rd_party\fbgemm\test</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\constpool.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
<ClCompile Include="..\src\3rd_party\fbgemm\test\I8DepthwiseTest.cc">
<Filter>3rd_party\fbgemm\test</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\cpuinfo.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
<ClCompile Include="..\src\3rd_party\fbgemm\test\I8SpmdmTest.cc">
<Filter>3rd_party\fbgemm\test</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\func.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
<ClCompile Include="..\src\3rd_party\fbgemm\test\Im2ColFusedRequantizeTest.cc">
<Filter>3rd_party\fbgemm\test</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\globals.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
<ClCompile Include="..\src\3rd_party\fbgemm\test\PackedRequantizeAcc16Test.cc">
<Filter>3rd_party\fbgemm\test</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\inst.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
<ClCompile Include="..\src\3rd_party\fbgemm\test\PackedRequantizeTest.cc">
<Filter>3rd_party\fbgemm\test</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\logging.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
<ClCompile Include="..\src\3rd_party\fbgemm\test\QuantizationHelpers.cc">
<Filter>3rd_party\fbgemm\test</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\operand.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
<ClCompile Include="..\src\3rd_party\fbgemm\test\QuantUtilsTest.cc">
<Filter>3rd_party\fbgemm\test</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\osutils.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
<ClCompile Include="..\src\3rd_party\fbgemm\test\RequantizeOnlyTest.cc">
<Filter>3rd_party\fbgemm\test</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\regalloc.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
<ClCompile Include="..\src\3rd_party\fbgemm\test\TestUtils.cc">
<Filter>3rd_party\fbgemm\test</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\runtime.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
<ClCompile Include="..\src\3rd_party\fbgemm\test\UniConvTest.cc">
<Filter>3rd_party\fbgemm\test</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\string.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
<ClCompile Include="..\src\3rd_party\fbgemm\bench\BenchUtils.cc">
<Filter>3rd_party\fbgemm\bench</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\utils.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
<ClCompile Include="..\src\3rd_party\fbgemm\bench\ConvUnifiedBenchmark.cc">
<Filter>3rd_party\fbgemm\bench</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\vmem.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
<ClCompile Include="..\src\3rd_party\fbgemm\bench\Depthwise3DBenchmark.cc">
<Filter>3rd_party\fbgemm\bench</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\zone.cpp">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
<ClCompile Include="..\src\3rd_party\fbgemm\bench\DepthwiseBenchmark.cc">
<Filter>3rd_party\fbgemm\bench</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\cpuinfo\deps\clog\src\clog.c">
<Filter>3rd_party\fbgemm\third_party\cpuinfo\deps\clog\src</Filter>
<ClCompile Include="..\src\3rd_party\fbgemm\bench\FP16Benchmark.cc">
<Filter>3rd_party\fbgemm\bench</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\bench\GEMMsBenchmark.cc">
<Filter>3rd_party\fbgemm\bench</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\bench\GEMMsTunableBenchmark.cc">
<Filter>3rd_party\fbgemm\bench</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\bench\GroupwiseConvRequantizeBenchmark.cc">
<Filter>3rd_party\fbgemm\bench</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\bench\I8SpmdmBenchmark.cc">
<Filter>3rd_party\fbgemm\bench</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\bench\Im2ColFusedRequantizeBenchmark.cc">
<Filter>3rd_party\fbgemm\bench</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\bench\PackedFloatInOutBenchmark.cc">
<Filter>3rd_party\fbgemm\bench</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\bench\PackedRequantizeAcc16Benchmark.cc">
<Filter>3rd_party\fbgemm\bench</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\bench\PackedRequantizeAcc32Benchmark.cc">
<Filter>3rd_party\fbgemm\bench</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\bench\RequantizeBenchmark.cc">
<Filter>3rd_party\fbgemm\bench</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\bench\RowOffsetBenchmark.cc">
<Filter>3rd_party\fbgemm\bench</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\googletest\googletest\src\gtest_main.cc">
<Filter>3rd_party\fbgemm\third_party\googletest\googletest\src</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\fbgemm\third_party\googletest\googletest\src\gtest-all.cc">
<Filter>3rd_party\fbgemm\third_party\googletest\googletest\src</Filter>
</ClCompile>
<ClCompile Include="..\src\common\aliases.cpp">
<Filter>common</Filter>
@ -733,6 +874,15 @@
<ClCompile Include="..\src\common\types.cpp">
<Filter>common</Filter>
</ClCompile>
<ClCompile Include="..\src\common\fastopt.cpp">
<Filter>common</Filter>
</ClCompile>
<ClCompile Include="..\src\3rd_party\phf\phf.cc">
<Filter>3rd_party\phf</Filter>
</ClCompile>
<ClCompile Include="..\src\tensors\cpu\fbgemm\packed_gemm.cpp">
<Filter>tensors\cpu\fbgemm</Filter>
</ClCompile>
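<!-- The filter mappings above place fastopt.cpp under "common", phf.cc under
     "3rd_party\phf", and packed_gemm.cpp under "tensors\cpu\fbgemm" in the
     Visual Studio solution explorer. -->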
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\src\marian.h" />
@ -1798,12 +1948,6 @@
<ClInclude Include="..\src\tensors\gpu\add.inc">
<Filter>tensors\gpu</Filter>
</ClInclude>
<ClInclude Include="..\src\tensors\cpu\expanded_gemm.h">
<Filter>tensors\cpu</Filter>
</ClInclude>
<ClInclude Include="..\src\tensors\cpu\sharp\packed_gemm.h">
<Filter>tensors\cpu\sharp</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\include\fbgemm\ConvUtils.h">
<Filter>3rd_party\fbgemm\include\fbgemm</Filter>
</ClInclude>
@ -1897,46 +2041,163 @@
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\cpuinfo\src\cpuinfo\utils.h">
<Filter>3rd_party\fbgemm\third_party\cpuinfo\src\cpuinfo</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\arm.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\asmjit.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\asmjit_apibegin.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\asmjit_apiend.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\asmjit_build.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\cpuinfo\deps\clog\include\clog.h">
<Filter>3rd_party\fbgemm\third_party\cpuinfo\deps\clog\include</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\arch.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\assembler.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\build.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\builder.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\callconv.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\codebufferwriter_p.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\codeholder.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\compiler.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\constpool.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\cpuinfo.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\datatypes.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\emitter.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\features.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\func.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\globals.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\inst.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\jitallocator.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\jitruntime.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\logging.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\misc_p.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\operand.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\osutils.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\raassignment_p.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\rabuilders_p.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\radefs_p.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\ralocal_p.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\rapass_p.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\rastack_p.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\string.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\support.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\target.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\type.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\virtmem.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\zone.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\zonehash.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\zonelist.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\zonestack.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\zonestring.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\zonetree.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core\zonevector.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\core</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86assembler.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86builder.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86callconv_p.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86compiler.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86emitter.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86features.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86globals.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86inst.h">
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86instapi_p.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86instimpl_p.h">
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86instdb.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86instdb_p.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86internal_p.h">
@ -1945,83 +2206,35 @@
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86logging_p.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86misc.h">
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86opcode_p.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86operand.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86regalloc_p.h">
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86\x86rapass_p.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\arch.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
<ClInclude Include="..\src\3rd_party\fbgemm\src\CodeCache.h">
<Filter>3rd_party\fbgemm\src</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\assembler.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
<ClInclude Include="..\src\3rd_party\fbgemm\src\FbgemmI8DepthwiseAvx2-inl.h">
<Filter>3rd_party\fbgemm\src</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\codebuilder.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
<ClInclude Include="..\src\3rd_party\fbgemm\test\QuantizationHelpers.h">
<Filter>3rd_party\fbgemm\test</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\codecompiler.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
<ClInclude Include="..\src\3rd_party\fbgemm\test\TestUtils.h">
<Filter>3rd_party\fbgemm\test</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\codeemitter.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
<ClInclude Include="..\src\3rd_party\fbgemm\bench\AlignedVec.h">
<Filter>3rd_party\fbgemm\bench</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\codeholder.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
<ClInclude Include="..\src\3rd_party\fbgemm\bench\BenchUtils.h">
<Filter>3rd_party\fbgemm\bench</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\constpool.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\cpuinfo.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\func.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\globals.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\inst.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\logging.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\misc_p.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\operand.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\osutils.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\regalloc_p.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\runtime.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\simdtypes.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\string.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\utils.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\vmem.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\base\zone.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit\base</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\cpuinfo\deps\clog\include\clog.h">
<Filter>3rd_party\fbgemm\third_party\cpuinfo\deps\clog\include</Filter>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\googletest\googletest\src\gtest-internal-inl.h">
<Filter>3rd_party\fbgemm\third_party\googletest\googletest\src</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\half_float\umHalf.h">
<Filter>3rd_party\half_float</Filter>
@ -2047,8 +2260,23 @@
<ClInclude Include="..\src\3rd_party\zstr\zstr.hpp">
<Filter>3rd_party</Filter>
</ClInclude>
<ClInclude Include="..\src\graph\expression_graph_packable.h">
<Filter>graph</Filter>
<ClInclude Include="..\src\common\fastopt.h">
<Filter>common</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\phf\phf.h">
<Filter>3rd_party\phf</Filter>
</ClInclude>
<ClInclude Include="..\src\3rd_party\fbgemm\third_party\asmjit\src\asmjit\core.h">
<Filter>3rd_party\fbgemm\third_party\asmjit\src\asmjit</Filter>
</ClInclude>
<ClInclude Include="..\src\tensors\cpu\fbgemm\expanded_gemm.h">
<Filter>tensors\cpu\fbgemm</Filter>
</ClInclude>
<ClInclude Include="..\src\tensors\cpu\fbgemm\expression_graph_packable.h">
<Filter>tensors\cpu\fbgemm</Filter>
</ClInclude>
<ClInclude Include="..\src\tensors\cpu\fbgemm\packed_gemm.h">
<Filter>tensors\cpu\fbgemm</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
@ -2265,9 +2493,6 @@
<Filter Include="3rd_party\fbgemm\third_party\asmjit\src\asmjit\x86">
<UniqueIdentifier>{5818c959-7963-4d8e-9e87-b61f340476c2}</UniqueIdentifier>
</Filter>
<Filter Include="3rd_party\fbgemm\third_party\asmjit\src\asmjit\base">
<UniqueIdentifier>{15414ec0-8761-4068-afef-822b7bed88df}</UniqueIdentifier>
</Filter>
<Filter Include="3rd_party\fbgemm\third_party\cpuinfo\deps">
<UniqueIdentifier>{d4505c8d-5e6e-4baf-8525-dc59ae8b6415}</UniqueIdentifier>
</Filter>
@ -2280,9 +2505,33 @@
<Filter Include="3rd_party\fbgemm\third_party\cpuinfo\deps\clog\src">
<UniqueIdentifier>{8fd74b1e-d3c1-4158-ad46-4a447222934e}</UniqueIdentifier>
</Filter>
<Filter Include="3rd_party\fbgemm\third_party\asmjit\src\asmjit\core">
<UniqueIdentifier>{b3b34c5f-5b98-436a-b34c-11e2dccb7ea2}</UniqueIdentifier>
</Filter>
<Filter Include="3rd_party\fbgemm\test">
<UniqueIdentifier>{40576dca-07d5-4904-8119-ffbc982451a3}</UniqueIdentifier>
</Filter>
<Filter Include="3rd_party\fbgemm\bench">
<UniqueIdentifier>{9f11c8f1-78f7-47c6-9eac-34cd2c6cd909}</UniqueIdentifier>
</Filter>
<Filter Include="3rd_party\fbgemm\third_party\googletest">
<UniqueIdentifier>{75f9df88-0eb1-4d9a-858e-4e0b8fc3aa8a}</UniqueIdentifier>
</Filter>
<Filter Include="3rd_party\fbgemm\third_party\googletest\googletest">
<UniqueIdentifier>{9f77e916-1d2f-4c15-9eba-46bcbddd2658}</UniqueIdentifier>
</Filter>
<Filter Include="3rd_party\fbgemm\third_party\googletest\googletest\src">
<UniqueIdentifier>{050ba410-c56a-4607-8401-935f58f598b5}</UniqueIdentifier>
</Filter>
<Filter Include="3rd_party\half_float">
<UniqueIdentifier>{defd3aec-3c56-4d70-a4bb-90ba9003d98d}</UniqueIdentifier>
</Filter>
<Filter Include="3rd_party\phf">
<UniqueIdentifier>{352ac0e9-daed-437a-bc36-fb85ecd037eb}</UniqueIdentifier>
</Filter>
<Filter Include="tensors\cpu\fbgemm">
<UniqueIdentifier>{bf361868-f451-45b8-9695-570d67924972}</UniqueIdentifier>
</Filter>
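<!-- Each <Filter> element declares a solution-explorer folder; the
     UniqueIdentifier GUIDs are arbitrary but must be unique within the
     project. -->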
</ItemGroup>
<ItemGroup>
<None Include="..\src\3rd_party\nccl\src\bootstrap.cu">