Merged PR 11920: Compare external master against internal master

Compare external master against internal master. Just double checking.
This commit is contained in:
Martin Junczys-Dowmunt 2020-03-10 00:29:55 +00:00
parent 9f29403627
commit cf7f0321f8
79 changed files with 533 additions and 332 deletions

View File

@ -9,6 +9,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
## [Unreleased]
### Added
- Add support for compiling on Mac (and clang)
- An option for resetting stalled validation metrics
- Add CMAKE options to disable compilation for specific GPU SM types
- An option to print word-level translation scores
- An option to turn off automatic detokenization from SentencePiece
@ -59,6 +61,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
- Compilation with CUDA 10.1
### Changed
- Combine two for-loops in nth_element.cpp on CPU
- Revert LayerNorm eps to old position, i.e. sigma' = sqrt(sigma^2 + eps)
- Downgrade NCCL to 2.3.7 as 2.4.2 is buggy (hangs with larger models)
- Return error signal on SIGTERM

View File

@ -24,6 +24,7 @@ option(USE_CCACHE "Use ccache compiler cache (https://ccache.dev)" OFF)
option(USE_CUDNN "Use CUDNN library" OFF)
option(USE_DOXYGEN "Build documentation with Doxygen" ON)
option(USE_FBGEMM "Use FBGEMM" OFF)
option(USE_MKL "Compile with MKL support" ON)
option(USE_MPI "Use MPI library" OFF)
option(USE_NCCL "Use NCCL library" ON)
option(USE_SENTENCEPIECE "Download and compile SentencePiece" OFF)
@ -33,7 +34,7 @@ option(USE_STATIC_LIBS "Link statically against non-system libs" OFF)
if(USE_CCACHE)
find_program(CCACHE_PROGRAM ccache)
if(CCACHE_PROGRAM)
message(STATUS "Found and will be using ccache for faster repeat compilation (use cmake -DUSE_CCACHE=off to disable).")
message(STATUS "Will be using ccache for faster repeat compilation (use cmake -DUSE_CCACHE=off to disable).")
set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${CCACHE_PROGRAM}")
else(CCACHE_PROGRAM)
message(WARNING "Compilation with ccache requested but no ccache found.")
@ -141,20 +142,32 @@ else(MSVC)
add_definitions(-DUSE_FBGEMM=1)
endif(USE_FBGEMM)
set(DISABLE_GLOBALLY "-Wno-unused-result")
if (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 9.0)
# Clang-10.0.0 complains when CUDA is newer than 10.1
set(CLANG_IGNORE_UNKNOWN_CUDA "-Wno-unknown-cuda-version")
endif()
set(DISABLE_GLOBALLY "-Wno-unused-result -Wno-unknown-warning-option ${CLANG_IGNORE_UNKNOWN_CUDA}")
# These are used in src/CMakeLists.txt on a per-target basis
list(APPEND ALL_WARNINGS -Wall; -Werror; -Wno-unused-result; -Wno-deprecated; -Wno-pragmas; -Wno-unused-parameter; -Wextra; -Wno-unused-function;
-Wno-unused-value; -Wno-unknown-pragmas; -Wno-sign-compare; -Wno-missing-field-initializers;)
list(APPEND ALL_WARNINGS -Wall; -Werror; -Wextra; -Wno-unused-result; -Wno-deprecated;
-Wno-pragmas; -Wno-unused-parameter; -Wno-unused-function;
-Wno-unused-value; -Wno-unknown-pragmas; -Wno-sign-compare;
-Wno-missing-field-initializers;)
# This warning does not exist prior to gcc 5.0
if(CMAKE_COMPILER_IS_GNUCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 5.0)
list(APPEND ALL_WARNINGS -Wsuggest-override)
list(APPEND ALL_WARNINGS -Wsuggest-override -Wno-int-in-bool-context)
endif()
set(CMAKE_CXX_FLAGS "-std=c++11 -pthread -Wl,--no-as-needed -fPIC ${DISABLE_GLOBALLY} -march=${BUILD_ARCH} ${INTRINSICS}")
set(CMAKE_CXX_FLAGS_RELEASE "-Ofast -m64 -funroll-loops -ffinite-math-only -g -rdynamic")
set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g -rdynamic")
if(CMAKE_COMPILER_IS_GNUCC)
# these flags are not known to clang
set(CMAKE_GCC_FLAGS "-Wl,--no-as-needed")
set(CMAKE_RDYNAMIC_FLAG "-rdynamic")
endif(CMAKE_COMPILER_IS_GNUCC)
set(CMAKE_CXX_FLAGS "-std=c++11 -pthread ${CMAKE_GCC_FLAGS} -fPIC ${DISABLE_GLOBALLY} -march=${BUILD_ARCH} ${INTRINSICS}")
set(CMAKE_CXX_FLAGS_RELEASE "-Ofast -m64 -funroll-loops -ffinite-math-only -g ${CMAKE_RDYNAMIC_FLAG}")
set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g ${CMAKE_RDYNAMIC_FLAG}")
set(CMAKE_CXX_FLAGS_SLIM "-Ofast -m64 -funroll-loops -ffinite-math-only -DNDEBUG")
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELEASE}")
set(CMAKE_CXX_FLAGS_PROFILE "${CMAKE_CXX_FLAGS_RELEASE} -pg")
@ -162,9 +175,9 @@ else(MSVC)
set(CMAKE_CXX_FLAGS_PROFUSE "${CMAKE_CXX_FLAGS_RELEASE} -fprofile-use -fprofile-correction")
# these need to be set separately
set(CMAKE_C_FLAGS "-pthread -Wl,--no-as-needed -fPIC ${DISABLE_GLOBALLY} -march=${BUILD_ARCH} ${INTRINSICS}")
set(CMAKE_C_FLAGS_RELEASE "-O3 -m64 -funroll-loops -ffinite-math-only -g -rdynamic")
set(CMAKE_C_FLAGS_DEBUG "-O0 -g -rdynamic")
set(CMAKE_C_FLAGS "-pthread ${CMAKE_GCC_FLAGS} -fPIC ${DISABLE_GLOBALLY} -march=${BUILD_ARCH} ${INTRINSICS}")
set(CMAKE_C_FLAGS_RELEASE "-O3 -m64 -funroll-loops -ffinite-math-only -g ${CMAKE_RDYNAMIC_FLAG}")
set(CMAKE_C_FLAGS_DEBUG "-O0 -g ${CMAKE_RDYNAMIC_FLAG}")
set(CMAKE_C_FLAGS_SLIM "-O3 -m64 -funroll-loops -ffinite-math-only -DNDEBUG")
set(CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELEASE}")
set(CMAKE_C_FLAGS_PROFILE "${CMAKE_C_FLAGS_RELEASE} -pg")
@ -204,7 +217,7 @@ if(CUDA_FOUND)
if((CUDA_VERSION VERSION_EQUAL "10.0" OR CUDA_VERSION VERSION_GREATER "10.0") AND (CMAKE_VERSION VERSION_LESS "3.12.2"))
message(WARNING "On some Unix systems CUDA 10.0+ requires CMake 3.12.2+; you use CMake ${CMAKE_VERSION}")
endif()
if(COMPILE_CUDA_SM35)
LIST(APPEND COMPUTE -arch=sm_35; -gencode=arch=compute_35,code=sm_35;) # Tesla K40 and above
endif(COMPILE_CUDA_SM35)
@ -323,13 +336,15 @@ if(USE_MPI)
endif(USE_MPI)
if(COMPILE_CPU)
find_package(MKL)
if(USE_MKL)
find_package(MKL)
endif(USE_MKL)
if(MKL_FOUND)
include_directories(${MKL_INCLUDE_DIR})
set(EXT_LIBS ${EXT_LIBS} ${MKL_LIBRARIES})
add_definitions(-DBLAS_FOUND=1 -DMKL_FOUND=1)
else(MKL_FOUND)
set(BLA_VENDOR "OpenBLAS")
set(BLAS_VENDOR "OpenBLAS")
find_package(BLAS)
if(BLAS_FOUND)
include(FindCBLAS)

View File

@ -1 +1 @@
v1.8.43
v1.8.51

View File

@ -54,7 +54,7 @@ MACRO(CHECK_ALL_LIBRARIES LIBRARIES INCLUDE _prefix _name _flags _list _include
IF(APPLE)
FIND_LIBRARY(${_prefix}_${_library}_LIBRARY
NAMES ${_library}
PATHS /usr/local/lib /usr/lib /usr/local/lib64 /usr/lib64 ENV
PATHS /usr/local/lib /usr/lib /usr/local/lib64 /usr/lib64 /usr/local/opt/openblas/lib ENV
DYLD_LIBRARY_PATH
)
ELSE(APPLE)

View File

@ -9,18 +9,22 @@ import numpy as np
def main():
desc = """Export word embedding from model"""
desc = """Export word embeddings from model"""
parser = argparse.ArgumentParser(
formatter_class=argparse.RawDescriptionHelpFormatter, description=desc)
parser.add_argument("-m", "--model", help="Model file", required=True)
parser.add_argument(
"-o", "--output-prefix", help="Output files prefix", required=True)
parser.add_argument("-m", "--model", help="path to model.npz file", required=True)
parser.add_argument("-o", "--output-prefix", help="prefix for output files", required=True)
args = parser.parse_args()
print("Loading model")
model = np.load(args.model)
special = yaml.load(model["special:model.yml"][:-1].tobytes())
if special["tied-embeddings-all"] or special["tied-embeddings-src"]:
all_emb = model["Wemb"]
export_emb(args.output_prefix + ".all", all_emb)
exit()
if special["type"] == "amun":
enc_emb = model["Wemb"]
dec_emb = model["Wemb_dec"]
@ -28,16 +32,15 @@ def main():
enc_emb = model["encoder_Wemb"]
dec_emb = model["decoder_Wemb"]
with open(args.output_prefix + ".src", "w") as out:
out.write("{0} {1}\n".format(*enc_emb.shape))
for i in range(enc_emb.shape[0]):
vec = " ".join("{0:.8f}".format(v) for v in enc_emb[i])
out.write("{0} {1}\n".format(i, vec))
export_emb(args.output_prefix + ".src", enc_emb)
export_emb(args.output_prefix + ".trg", dec_emb)
with open(args.output_prefix + ".trg", "w") as out:
out.write("{0} {1}\n".format(*dec_emb.shape))
for i in range(dec_emb.shape[0]):
vec = " ".join("{0:.8f}".format(v) for v in dec_emb[i])
def export_emb(filename, emb):
with open(filename, "w") as out:
out.write("{0} {1}\n".format(*emb.shape))
for i in range(emb.shape[0]):
vec = " ".join("{0:.8f}".format(v) for v in emb[i])
out.write("{0} {1}\n".format(i, vec))

View File

@ -15,12 +15,22 @@ if(USE_FBGEMM)
if(NOT MSVC)
# only locally disabled for the 3rd_party folder
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-value -Wno-unused-parameter -Wno-unused-variable -Wno-unused-function")
# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-value -Wno-unused-parameter -Wno-unused-variable -Wno-unused-function")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused")
endif()
set(FBGEMM_BUILD_TESTS OFF CACHE BOOL "Disable fbgemm tests")
set(FBGEMM_BUILD_BENCHMARKS OFF CACHE BOOL "Disable fbgemm benchmark")
add_subdirectory(./fbgemm)
# asmjit (3rd-party submodule of fbgemm) sets -Wall -Wextra near the end of
# the compile options, invalidating any -Wno-... flags that we may have set
# earlier. Let's remove them.
get_property(ASMJIT_COMPILE_OPTIONS TARGET asmjit PROPERTY COMPILE_OPTIONS)
list(REMOVE_ITEM ASMJIT_COMPILE_OPTIONS -Wall -Wextra)
set_property(TARGET asmjit PROPERTY COMPILE_OPTIONS ${ASMJIT_COMPILE_OPTIONS})
message(" ASMJIT COMPILE FLAGS: ${ASMJIT_COMPILE_OPTIONS}")
endif(USE_FBGEMM)
if(USE_SENTENCEPIECE)
@ -39,7 +49,7 @@ if(USE_SENTENCEPIECE)
message(WARNING "You are compiling SentencePiece binaries with -DUSE_STATIC_LIBS=on. \
This will cause spm_train to segfault. No need to worry if you do not intend to use that binary. \
Marian support for SentencePiece will work fine.")
set(SPM_ENABLE_SHARED OFF CACHE BOOL "Builds shared libaries in addition to static libraries." FORCE)
set(SPM_TCMALLOC_STATIC ON CACHE BOOL "Link static library of TCMALLOC." FORCE)
else(USE_STATIC_LIBS)
@ -51,8 +61,19 @@ if(USE_SENTENCEPIECE)
include_directories(./sentencepiece)
set_target_properties(spm_encode spm_decode spm_train spm_normalize spm_export_vocab
PROPERTIES
RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}")
PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}")
if (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
foreach(t sentencepiece sentencepiece_train sentencepiece_train-static
spm_decode spm_encode spm_export_vocab spm_normalize spm_train)
set_property(TARGET ${t} APPEND_STRING PROPERTY COMPILE_FLAGS " -Wno-tautological-compare -Wno-unused")
if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 9.0)
set_property(TARGET ${t} APPEND_STRING PROPERTY COMPILE_FLAGS " -Wno-range-loop-construct")
endif()
# get_property(SENTENCEPIECE_COMPILE_FLAGS TARGET ${t} PROPERTY COMPILE_FLAGS)
# message("-- SENTENCPIECE: compile flags for target ${t}: ${SENTENCEPIECE_COMPILE_FLAGS}")
endforeach(t)
endif()
if(USE_STATIC_LIBS)
set(CMAKE_FIND_LIBRARY_SUFFIXES ${_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES})
@ -63,6 +84,22 @@ include_directories(./SQLiteCpp/include)
include_directories(./CLI)
include_directories(./pathie-cpp/include)
if (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
#set_target_properties(SQLiteCpp PROPERTIES COMPILE_FLAGS
set_property(TARGET SQLiteCpp APPEND_STRING PROPERTY COMPILE_FLAGS
" -Wno-parentheses-equality -Wno-unused-value")
if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 9.0)
set_property(TARGET SQLiteCpp APPEND_STRING PROPERTY COMPILE_FLAGS
" -Wno-implicit-int-float-conversion")
endif()
set_property(TARGET libyaml-cpp APPEND_STRING PROPERTY COMPILE_FLAGS
" -fPIC -Wno-unused-value")
set_property(TARGET pathie-cpp APPEND_STRING PROPERTY COMPILE_FLAGS
" -fPIC -Wno-unused-value")
endif()
include_directories(./zlib)
include(ExternalProject)

View File

@ -186,7 +186,7 @@ inline HalfFloat& HalfFloat::operator= (float other)
inline bool HalfFloat::operator== (HalfFloat other) const
{
// +0 and -0 are considered to be equal
if (!(bits << 1u) && !(other.bits << 1u))return true;
if ((bits << 1u) == 0 && (other.bits << 1u) == 0) return true;
return bits == other.bits && !this->IsNaN();
}
@ -194,7 +194,7 @@ inline bool HalfFloat::operator== (HalfFloat other) const
inline bool HalfFloat::operator!= (HalfFloat other) const
{
// +0 and -0 are considered to be equal
if (!(bits << 1u) && !(other.bits << 1u))return false;
if ((bits << 1u) == 0 && (other.bits << 1u) == 0) return false;
return bits != other.bits || this->IsNaN();
}

View File

@ -31,7 +31,7 @@
#include "../include/path.hpp"
#include "../include/errors.hpp"
#if defined(__unix__)
#if defined(__unix__) || defined(__APPLE__)
#include <sys/types.h>
#include <dirent.h>
#include <errno.h>

View File

@ -902,7 +902,7 @@ Path Path::pwd()
*/
Path Path::exe()
{
#if defined(__linux__)
#if defined(__linux__) || defined(__APPLE__)
char buf[PATH_MAX];
ssize_t size = ::readlink("/proc/self/exe", buf, PATH_MAX);

View File

@ -143,7 +143,7 @@ std::string Pathie::convert_encodings(const char* from_encoding, const char* to_
errno = 0;
errsav = 0;
#ifdef BSD
#if defined(BSD) && ! defined(__APPLE__) //Since MacOS evolved from BSD, it is captured here but the iconv on macos behaves differently
// What the heck. FreeBSD violates POSIX.1-2008: it declares iconv()
// differently than mandated by POSIX: http://pubs.opengroup.org/onlinepubs/9699919799/functions/iconv.html
// (it declares a `const' where it must not be).
@ -181,11 +181,10 @@ std::string Pathie::convert_encodings(const char* from_encoding, const char* to_
std::string Pathie::utf8_to_filename(const std::string& utf8)
{
bool fs_encoding_is_utf8 = false;
char* fsencoding = NULL;
#if defined(__APPLE__) || defined(PATHIE_ASSUME_UTF8_ON_UNIX)
fs_encoding_is_utf8 = true;
#else
char* fsencoding = NULL;
fsencoding = nl_langinfo(CODESET);
fs_encoding_is_utf8 = (strcmp(fsencoding, "UTF-8") == 0);
#endif
@ -206,11 +205,10 @@ std::string Pathie::utf8_to_filename(const std::string& utf8)
std::string Pathie::filename_to_utf8(const std::string& native_filename)
{
bool fs_encoding_is_utf8 = false;
char* fsencoding = NULL;
#if defined(__APPLE__) || defined(PATHIE_ASSUME_UTF8_ON_UNIX)
fs_encoding_is_utf8 = true;
#else
char* fsencoding = NULL;
fsencoding = nl_langinfo(CODESET);
fs_encoding_is_utf8 = (strcmp(fsencoding, "UTF-8") == 0);
#endif

View File

@ -27,7 +27,7 @@ static std::string strerror()
{
buff = "Unknown error";
}
#elif (_POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600) && ! _GNU_SOURCE
#elif (_POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600 || __APPLE__) && ! _GNU_SOURCE
// XSI-compliant strerror_r()
if (strerror_r(errno, &buff[0], buff.size()) != 0)
{

View File

@ -215,6 +215,10 @@ if(COMPILE_SERVER)
set(EXECUTABLES ${EXECUTABLES} marian_server)
endif(COMPILE_SERVER)
if(APPLE) # This is a dependency of pathie but I can't seem to link it into that CMakeLists because we're not compiling it as a library.
set(EXT_LIBS ${EXT_LIBS} iconv)
endif()
foreach(exec ${EXECUTABLES})
target_link_libraries(${exec} marian ${EXT_LIBS} ${EXT_LIBS} ${CMAKE_THREAD_LIBS_INIT})
if(CUDA_FOUND)

View File

@ -44,7 +44,7 @@ int main(int argc, char **argv) {
// Error Codes for error code meanings
// http://www.boost.org/doc/libs/1_55_0/doc/html/boost_asio/reference.html
translate.on_error = [](Ptr<WSServer::Connection> connection,
translate.on_error = [](Ptr<WSServer::Connection> /*connection*/,
const SimpleWeb::error_code &ec) {
LOG(error, "Connection error: ({}) {}", ec.value(), ec.message());
};

View File

@ -15,7 +15,6 @@ static inline std::string InterpolateEnvVars(std::string str) {
// presently has the form /hdfs/VC instead of /{gfs,hdfs}/CLUSTER/VC
// Catch stdin/stdout and do not process
std::cerr << str << std::endl;
if(str == "stdin" || str == "stdout") {
return str;
}

View File

@ -525,6 +525,8 @@ void ConfigParser::addOptionsValidation(cli::CLIWrapper& cli) {
"Metric to use during validation: cross-entropy, ce-mean-words, perplexity, valid-script, "
"translation, bleu, bleu-detok. Multiple metrics can be specified",
{"cross-entropy"});
cli.add<bool>("--valid-reset-stalled",
"Reset all stalled validation metrics when the training is restarted");
cli.add<size_t>("--early-stopping",
"Stop if the first validation metric does not improve for arg consecutive validation steps",
10);
@ -553,7 +555,8 @@ void ConfigParser::addOptionsValidation(cli::CLIWrapper& cli) {
"Size of mini-batch used during validation",
32);
cli.add<size_t>("--valid-max-length",
"Maximum length of a sentence in a validating sentence pair",
"Maximum length of a sentence in a validating sentence pair. "
"Sentences longer than valid-max-length are cropped to valid-max-length",
1000);
// options for validation script

View File

@ -10,6 +10,21 @@
#include <string>
#include <vector>
// The macro MAYBE_UNUSED is used to selectively disable
// unused-variable warnings. C++17 defines the attribute
// [[maybe_unused]], but I don't think we're at C++17 yet. We can add it when we reach C++17.
// The compilers gcc and clang (and maybe others) define
// __has_attribute and support __attribute__((unused)) in C++11.
#if defined __has_attribute
# if __has_attribute(unused)
# define MAYBE_UNUSED __attribute__((unused))
# else
# define MAYBE_UNUSED
# endif
#else
# define MAYBE_UNUSED
#endif
#define THREAD_GUARD(body) [&]() { body; }() // test if THREAD_GUARD is neccessary, remove if no problems occur.
#define NodeOp(op) [=]() { op; }

View File

@ -84,10 +84,16 @@ std::vector<T> As<std::vector<T>>::apply(const FastOpt& node) {
// specializations for simple vector types
template struct As<std::vector<bool>>;
template struct As<std::vector<int>>;
// Windows and Unix based OS have different type definitions for 'unsigned long'.
// So, we need an explicit definition for uint64_t. Otherwise, there's a linking error on windows.
// Windows, Linux based OS and Mac have different type definitions for 'unsigned long'.
// So, we need explicit definitions for uint64_t that cover the different platforms.
// Otherwise, there's a linking error on Windows, Linux, or Mac.
// https://software.intel.com/en-us/articles/size-of-long-integer-type-on-different-architecture-and-os/
template struct As<std::vector<uint64_t>>;
// https://stackoverflow.com/questions/32021860/c-should-you-size-t-with-a-regular-array
// MacOS: size_t = unsigned long (8 bytes), uint64_t = unsigned long long (8 bytes)
// Linux: size_t = unsigned long (8 bytes), uint64_t = unsigned long (8 bytes)
// Windows: size_t = unsigned long long (8 bytes), uint64_t = unsigned long long (8 bytes)
template struct As<std::vector<unsigned long long>>;
template struct As<std::vector<unsigned long>>;
template struct As<std::vector<float>>;
template struct As<std::vector<double>>;
template struct As<std::vector<std::string>>;
@ -103,4 +109,4 @@ std::pair<T1, T2> As<std::pair<T1, T2>>::apply(const FastOpt& node) {
template struct As<std::pair<int, int>>;
}
}
}

View File

@ -367,7 +367,8 @@ public:
}
const FastOpt& operator[](const char* const key) const {
return operator[](crc::crc(key));
// MacOS requires explicit cast to size_t before we can use it.
return operator[]((size_t)crc::crc(key));
}
const FastOpt& operator[](const std::string& key) const {
@ -375,4 +376,4 @@ public:
}
};
}
}

View File

@ -7,10 +7,21 @@
#include "common/filesystem.h"
#include "common/logging.h"
// Even when compiling with clang, __GNUC__ may be defined, so
// we need to add some extra checks to avoid compile errors with
// respect to -Wsuggest-override.
#ifdef __GNUC__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wsuggest-override"
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wunused-value"
# if defined(__has_warning)
# if __has_warning("-Wsuggest-override")
# pragma GCC diagnostic ignored "-Wsuggest-override"
# endif
# else
# pragma GCC diagnostic ignored "-Wsuggest-override"
# endif
#endif
#ifdef _MSC_VER
#pragma warning(push) // 4101: 'identifier' : unreferenced local variable. One parameter variable in zstr.hpp is not used.
#pragma warning(disable : 4101)
@ -82,7 +93,7 @@ protected:
void NormalizeTempPrefix(std::string& base) const;
void MakeTemp(const std::string& base);
};
} // namespace io

View File

@ -7,9 +7,19 @@
// @TODO: go back to canonical names for functions and objects
// as specified in C++17 so it becomes easy to move in the future
// Even when compiling with clang, __GNUC__ may be defined, so
// we need to add some extra checks to avoid compile errors with
// respect to -Wsuggest-override.
#ifdef __GNUC__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wsuggest-override"
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wunused-value"
# if defined(__has_warning)
# if __has_warning("-Wsuggest-override")
# pragma GCC diagnostic ignored "-Wsuggest-override"
# endif
# else
# pragma GCC diagnostic ignored "-Wsuggest-override"
# endif
#endif
#include "3rd_party/pathie-cpp/include/path.hpp" // @TODO: update to latest Pathie

View File

@ -147,16 +147,6 @@ inline bool operator!=(const IntrusivePtr<T>& a, const IntrusivePtr<U>& b) {
return a.get() != b.get();
}
template<class T>
inline bool operator==(const IntrusivePtr<T>& a, T* b) {
return a.get() == b;
}
template<class T>
inline bool operator!=(const IntrusivePtr<T>& a, T* b) {
return a.get() != b;
}
template<class T>
inline bool operator==(const IntrusivePtr<T>& a, std::nullptr_t) {
return a.get() == 0;
@ -167,14 +157,24 @@ inline bool operator!=(const IntrusivePtr<T>& a, std::nullptr_t) {
return a.get() != 0;
}
template<class T>
inline bool operator==(const IntrusivePtr<T>& a, T* b) {
return a.get() == b;
}
template<class T>
inline bool operator!=(const IntrusivePtr<T>& a, T* b) {
return a.get() != b;
}
template<class T>
inline bool operator==(T* a, const IntrusivePtr<T>& b) {
return b.get();
return a == b.get();
}
template<class T>
inline bool operator!=(T* a, const IntrusivePtr<T>& b) {
return b.get();
return a != b.get();
}
template<class T, class U>
@ -223,5 +223,3 @@ namespace std {
}
};
}

View File

@ -124,7 +124,7 @@ static void setErrorHandlers() {
std::set_terminate(unhandledException);
#ifdef __unix__
// catch segfaults
struct sigaction sa = { 0 };
struct sigaction sa = { {0} };
sigemptyset(&sa.sa_mask);
sa.sa_flags = SA_SIGINFO;
sa.sa_sigaction = [](int /*signal*/, siginfo_t*, void*) { ABORT("Segmentation fault"); };

View File

@ -254,7 +254,7 @@ enum class Type : size_t {
packed16 = TypeClass::packed_type + 2u, // special type for FBGEMM, not meant to be used anywhere else, not meant to be accessed invidually. Internal actual type (uint16) is meaningless.
packed8avx2 = TypeClass::packed_type + 1u + TypeClass::avx2_type, // special type for FBGEMM with AVX2, not meant to be used anywhere else, not meant to be accessed invidually. Internal actual type (uint8) is meaningless.
packed8avx512 = TypeClass::packed_type + 1u + TypeClass::avx512_type, // special type for FBGEMM with AVX512, not meant to be used anywhere else, not meant to be accessed invidually. Internal actual type (uint8) is meaningless.
};
static inline size_t operator&(TypeClass typeClass, Type type) {
@ -394,7 +394,7 @@ static Type inline typeFromString(const std::string& str) {
return Type::float32;
if(str == "float64")
return Type::float64;
if(str == "packed16")
return Type::packed16;
if(str == "packed8avx2")
@ -437,19 +437,35 @@ void matchOrAbort(Type type) {
namespace typeFitting { // own namespace instead of in class, otherwise we get error "explicit specialization in non-namespace scope"
// compares max for different types as constexpr, so can be used at compile-time to determine if RequestType type max fits into ReturnType max, see std::conditional below.
template <typename RequestType, typename ReturnType>
constexpr bool fitsIntoMax() { return std::numeric_limits<RequestType>::max() <= std::numeric_limits<ReturnType>::max(); } // for built-in types everything is constexpr
// Helper function for fitsIntoMax() below
// Returns the 'capacity' of a type: number of digits for integers,
// max_exponent for floats. We ignore the mantissa for floats.
template<typename X> constexpr int capacity() {
static_assert(std::is_arithmetic<X>::value || std::is_same<X,HalfFloat>::value,
"Wrong type for this template");
return (std::is_integral<X>::value
? std::numeric_limits<X>::digits
: std::numeric_limits<X>::max_exponent);
}
// Compare max for different types as constexpr, so can be used at compile-time to determine if RequestType type max fits into ReturnType max, see std::conditional below.
template <typename RequestType, typename ReturnType>
constexpr bool fitsIntoMax() {
// We can't just compare std::numeric_limits<>::max(), because Clang-10
// complains about rounding errors when implicitly converting int to float
return ((!std::is_integral<RequestType>::value // RequestType is a float
&& std::is_integral<ReturnType>::value) // ReturnType an integer
? capacity<RequestType>() < capacity<ReturnType>() // special case
: capacity<RequestType>() <= capacity<ReturnType>()); // normal case
} // for built-in types everything is constexpr
// add specializations here when needed
template <> constexpr bool fitsIntoMax<float16, float>() { return true; }; // for float16 conversion to float is not constexpr, hence specializations
template <> constexpr bool fitsIntoMax<float, float16>() { return false; }; // for float16 conversion to float is not constexpr, hence specializations
}
template <typename ReturnType>
class NumericLimits {
private:
template <typename MaxType> void setLimitsMax() {
max = (ReturnType)std::numeric_limits<MaxType>::max();
lowest = (ReturnType)std::numeric_limits<MaxType>::lowest();
@ -459,10 +475,14 @@ private:
void setLimits() {
// check if the maximum of type RequestType fits into ReturnType
constexpr bool fits = typeFitting::fitsIntoMax<RequestType, ReturnType>();
// sanity check:
static_assert(fits || typeFitting::fitsIntoMax<ReturnType, RequestType>(),
"RequestType doesn't fit into ReturnType, and ReturnType doesn't "
"fit into RequestType. fitsIntoMax is broken!");
// and then use the smaller of each types to determine max, min, lowest.
using MaxType = typename std::conditional<fits, RequestType, ReturnType>::type;
setLimitsMax<MaxType>();
// @TODO: should we rather abort if the RequestType does not fit into ReturnType instead of clipping to smaller type?
// @TODO: should we rather abort if the RequestType does not fit into ReturnType instead of clipping to smaller type?
// ABORT_IF(!fits, "Type {} is too small to contain max of type {}", typeId<ReturnType>(), typeId<RequestType>());
}

View File

@ -8,12 +8,22 @@
#include <sstream>
#include <string>
#include <set>
#ifdef __unix__
#if defined(__unix__) || defined(__APPLE__)
#include <unistd.h>
#endif
#include <codecvt>
#include <cwctype>
// macOS lacks HOST_NAME_MAX
#ifndef HOST_NAME_MAX
# if defined(_POSIX_HOST_NAME_MAX)
# define HOST_NAME_MAX _POSIX_HOST_NAME_MAX
# elif defined(MAXHOSTNAMELEN)
# define HOST_NAME_MAX MAXHOSTNAMELEN
# endif
#endif
namespace marian {
namespace utils {

View File

@ -26,7 +26,7 @@ public:
virtual void setGuidedAlignment(std::vector<float>&&) = 0;
virtual void setDataWeights(const std::vector<float>&) = 0;
virtual ~Batch() {};
protected:
std::vector<size_t> sentenceIds_;
};

View File

@ -66,10 +66,10 @@ protected:
Ptr<Options> options_;
bool restored_{false};
// replacing old shuffle_ with two variants that determine more fine-grained shuffling behavior.
// Both set to false is equivalent to old shuffle_ == false.
// replacing old shuffle_ with two variants that determine more fine-grained shuffling behavior.
// Both set to false is equivalent to old shuffle_ == false.
// Now we can not shuffle the data, but shuffle batches. Useful for linear reading of very large data sets with pre-reading.
// Parameters like maxi-batch determine how much data is pre-read and sorted by length or other criteria.
// Parameters like maxi-batch determine how much data is pre-read and sorted by length or other criteria.
bool shuffleData_{false}; // determine if full data should be shuffled before reading and batching.
bool shuffleBatches_{false}; // determine if batches should be shuffled after batching.
@ -103,7 +103,7 @@ private:
};
auto cmpNone = [](const Sample& a, const Sample& b) { return a.getId() < b.getId(); }; // sort in order of original ids = original data order unless shuffling
typedef std::function<bool(const Sample&, const Sample&)> cmp_type;
typedef std::priority_queue<Sample, Samples, cmp_type> sample_queue;
@ -229,7 +229,7 @@ private:
// this starts fillBatches() as a background operation
void fetchBatchesAsync() {
ABORT_IF(futureBufferedBatches_.valid(), "attempted to restart futureBufferedBatches_ while still running");
ABORT_IF(futureBufferedBatches_.valid(), "Attempted to restart futureBufferedBatches_ while still running");
futureBufferedBatches_ = threadPool_.enqueue([this]() {
return fetchBatches();
});
@ -239,7 +239,9 @@ private:
if(bufferedBatches_.empty()) {
// out of data: need to get next batch from background thread
// We only get here if the future has been scheduled to run; it must be valid.
ABORT_IF(!futureBufferedBatches_.valid(), "attempted to wait for futureBufferedBatches_ when none pending");
ABORT_IF(!futureBufferedBatches_.valid(), "Attempted to wait for futureBufferedBatches_ when none pending.\n"
"This error often occurs when Marian tries to restore the training data iterator, but the corpus has been changed or replaced.\n"
"If you have changed the training corpus, add --no-restore-corpus to the training command and run it again.");
bufferedBatches_ = std::move(futureBufferedBatches_.get());
// if bg thread returns an empty swath, we hit the end of the epoch
if (bufferedBatches_.empty()) {

View File

@ -525,6 +525,7 @@ public:
const std::vector<Ptr<Vocab>>& vocabs,
Ptr<Options> options);
virtual ~CorpusBase() {}
virtual std::vector<Ptr<Vocab>>& getVocabs() = 0;
protected:

View File

@ -45,6 +45,7 @@ protected:
public:
// @TODO: choose between 'virtual' and 'final'. Can we derive from this class?
virtual ~DefaultVocab() {};
virtual const std::string& canonicalExtension() const override { return suffixes_[0]; }
virtual const std::vector<std::string>& suffixes() const override { return suffixes_; }
@ -295,7 +296,7 @@ private:
class ClassVocab : public DefaultVocab {
private:
// Do nothing.
virtual void addRequiredVocabulary(const std::string& vocabPath, bool isJson) override { vocabPath; isJson; }
virtual void addRequiredVocabulary(const std::string& /*vocabPath*/, bool /*isJson*/) override {}
// Not adding special class labels, only seen classes.
virtual void create(const std::string& vocabPath,

View File

@ -36,6 +36,8 @@ public:
class ShortlistGenerator {
public:
virtual ~ShortlistGenerator() {}
virtual Ptr<Shortlist> generate(Ptr<data::CorpusBatch> batch) const = 0;
// Writes text version of (possibly) pruned short list to file
@ -129,7 +131,6 @@ private:
Ptr<const Vocab> trgVocab_;
size_t srcIdx_;
size_t trgIdx_;
bool shared_{false};
size_t firstNum_{100};
@ -183,13 +184,12 @@ public:
Ptr<const Vocab> srcVocab,
Ptr<const Vocab> trgVocab,
size_t srcIdx = 0,
size_t trgIdx = 1,
size_t /*trgIdx*/ = 1,
bool shared = false)
: options_(options),
srcVocab_(srcVocab),
trgVocab_(trgVocab),
srcIdx_(srcIdx),
trgIdx_(trgIdx),
shared_(shared) {
std::vector<std::string> vals = options_->get<std::vector<std::string>>("shortlist");
@ -235,7 +235,6 @@ public:
virtual Ptr<Shortlist> generate(Ptr<data::CorpusBatch> batch) const override {
auto srcBatch = (*batch)[srcIdx_];
// auto trgBatch = (*batch)[trgIdx_];
// add firstNum most frequent words
std::unordered_set<WordIndex> indexSet;

View File

@ -37,6 +37,7 @@ public:
typedef SentenceTuple Sample;
TextInput(std::vector<std::string> inputs, std::vector<Ptr<Vocab>> vocabs, Ptr<Options> options);
virtual ~TextInput() {}
Sample next() override;

View File

@ -57,6 +57,7 @@ public:
virtual Word randWord() const {
return Word::fromWordIndex(rand() % size());
}
virtual ~IVocab() {};
};
class Options;

View File

@ -62,6 +62,7 @@ private:
std::vector<Input> inputs_;
public:
std::vector<Input>& inputs() { return inputs_; }
const std::vector<Input>& inputs() const { return inputs_; }
@ -144,6 +145,8 @@ public:
loadData();
}
virtual ~MNISTData(){}
void loadData() override {
ABORT_IF(paths_.size() != 2, "Paths to MNIST data files are not specified");

View File

@ -47,6 +47,8 @@ class MNISTLogsoftmax : public ILogProb {
public:
MNISTLogsoftmax() {}
virtual ~MNISTLogsoftmax(){}
Logits apply(Ptr<IModel> model,
Ptr<ExpressionGraph> graph,
Ptr<data::Batch> batch,
@ -61,13 +63,15 @@ public:
typedef data::MNISTData dataset_type;
template <class... Args>
MnistFeedForwardNet(Ptr<Options> options, Args... args)
MnistFeedForwardNet(Ptr<Options> options, Args... /*args*/)
: options_(options), inference_(options->get<bool>("inference", false)) {}
virtual ~MnistFeedForwardNet(){}
virtual Logits build(Ptr<ExpressionGraph> graph,
Ptr<data::Batch> batch,
bool /*clean*/ = false) override {
return Logits(apply(graph, batch, inference_));
}

View File

@ -19,7 +19,9 @@ public:
builder_ = models::createModelFromOptions(options, models::usage::translation);
}
virtual void keepBest(const std::vector<Ptr<ExpressionGraph>>& graphs) override {
virtual ~MNISTAccuracyValidator(){}
virtual void keepBest(const std::vector<Ptr<ExpressionGraph>>& /*graphs*/) override {
LOG(warn, "Keeping best model for MNIST examples is not supported");
}

View File

@ -7,55 +7,58 @@ namespace marian {
namespace functional {
// General template, will be used for any type without specializations
// and will fail with an abort message.
// and will fail at runtime with an abort message. Note that the
// general template functions don't have named parameters on purpose,
// because clang will warn about unused parameters during compilation.
template <typename T>
struct Ops {
static HOST_DEVICE_INLINE T tanh(const T& x) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T sin(const T& x) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T cos(const T& x) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T tan(const T& x) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T log(const T& x) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T exp(const T& x) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T abs(const T& x) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T sqrt(const T& x) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T neg(const T& x) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T sgn(const T& x) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T tanh(const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T sin(const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T cos(const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T tan(const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T log(const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T exp(const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T abs(const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T sqrt(const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T neg(const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T sgn(const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T add(const T& x, const T& y) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T sub(const T& x, const T& y) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T mul(const T& x, const T& y) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T div(const T& x, const T& y) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T add(const T&, const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T sub(const T&, const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T mul(const T&, const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T div(const T&, const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T max(const T& x, const T& y) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T min(const T& x, const T& y) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T pow(const T& x, const T& y) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T max(const T&, const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T min(const T&, const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T pow(const T&, const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T negate(const T& x) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T eq(const T& x, const T& y) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T neq(const T& x, const T& y) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T gt(const T& x, const T& y) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T lt(const T& x, const T& y) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T geq(const T& x, const T& y) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T leq(const T& x, const T& y) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T _and(const T& x, const T& y) { ABORT("Unknown type"); } // 'and' is used by gcc
static HOST_DEVICE_INLINE T _or(const T& x, const T& y) { ABORT("Unknown type"); } // 'or' is used by gcc
static HOST_DEVICE_INLINE T negate(const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T eq(const T&, const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T neq(const T&, const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T gt(const T&, const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T lt(const T&, const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T geq(const T&, const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T leq(const T&, const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T _and(const T&, const T&) { ABORT("Unknown type"); } // 'and' is used by gcc
static HOST_DEVICE_INLINE T _or(const T&, const T&) { ABORT("Unknown type"); } // 'or' is used by gcc
// Neural Networks specific functions
static HOST_DEVICE_INLINE T sigmoid(const T& x) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T logaddexp(const T& x, const T& y) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T clip(const T& x, const T& y) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T sigmoid(const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T logaddexp(const T&, const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T clip(const T&, const T&) { ABORT("Unknown type"); }
// derivative of Clip, cut-off function
static HOST_DEVICE_INLINE T bump(const T& x, const T& y) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T relu(const T& x) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T reluBack(const T& x) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T prelu(const T& x, const T& y) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T preluBack(const T& x, const T& y) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T bump(const T&, const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T relu(const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T reluBack(const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T prelu(const T&, const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T preluBack(const T&, const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T if_then_else(const T& x, const T& y, const T& z) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T if_then_else(const T&, const T&, const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T sumReduce(const T& x) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T maxReduce(const T& x) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T minReduce(const T& x) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T sumReduce(const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T maxReduce(const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T minReduce(const T&) { ABORT("Unknown type"); }
};
// Specialization for float
@ -127,14 +130,14 @@ template <>
struct Ops<double> {
typedef double Single;
static HOST_DEVICE_INLINE double tanh(const double& x) { return tanh(x); }
static HOST_DEVICE_INLINE double sin(const double& x) { return sin(x); }
static HOST_DEVICE_INLINE double cos(const double& x) { return cos(x); }
static HOST_DEVICE_INLINE double tan(const double& x) { return tan(x); }
static HOST_DEVICE_INLINE double log(const double& x) { return log(x); }
static HOST_DEVICE_INLINE double exp(const double& x) { return exp(x); }
static HOST_DEVICE_INLINE double abs(const double& x) { return abs(x); }
static HOST_DEVICE_INLINE double sqrt(const double& x) { return sqrt(x); }
static HOST_DEVICE_INLINE double tanh(const double& x) { return std::tanh(x); }
static HOST_DEVICE_INLINE double sin(const double& x) { return std::sin(x); }
static HOST_DEVICE_INLINE double cos(const double& x) { return std::cos(x); }
static HOST_DEVICE_INLINE double tan(const double& x) { return std::tan(x); }
static HOST_DEVICE_INLINE double log(const double& x) { return std::log(x); }
static HOST_DEVICE_INLINE double exp(const double& x) { return std::exp(x); }
static HOST_DEVICE_INLINE double abs(const double& x) { return std::abs(x); }
static HOST_DEVICE_INLINE double sqrt(const double& x) { return std::sqrt(x); }
static HOST_DEVICE_INLINE double neg(const double& x) { return -x; }
static HOST_DEVICE_INLINE double sgn(const double& x) { return (0 < x) - (x < 0); }
@ -145,7 +148,7 @@ struct Ops<double> {
static HOST_DEVICE_INLINE double max(const double& x, const double& y) { return x < y ? y : x; }
static HOST_DEVICE_INLINE double min(const double& x, const double& y) { return x < y ? x : y; }
static HOST_DEVICE_INLINE double pow(const double& x, const double& y) { return pow(x, y); }
static HOST_DEVICE_INLINE double pow(const double& x, const double& y) { return std::pow(x, y); }
static HOST_DEVICE_INLINE double negate(const double& x) { return !(bool)x; }
@ -460,7 +463,7 @@ struct Ops<half> {
static DEVICE_INLINE half exp(const half& x) { return hexp(x); }
static DEVICE_INLINE half sqrt(const half& x) { return hsqrt(x); }
static DEVICE_INLINE half neg(const half& x) { return -x; }
static DEVICE_INLINE half abs(const half& x) { return fabs((float)x); }// @TODO half has this information somewhere in the struct, right?
static DEVICE_INLINE half sgn(const half& x) { half zero = 0.f; return (zero < x) - (x < zero); } // @TODO half has this information somewhere in the struct, right?

View File

@ -130,9 +130,6 @@ class ExpressionGraph : public std::enable_shared_from_this<ExpressionGraph> {
bool inferenceOnly_{false};
// during inference, use optimizations that might lead to precision loss, e.g. 8-bit MatMul.
// At this moment, this is used for int16 qunatized Matmul - 11/1/2019
bool optimized_{false};
bool checkpointing_{false}; // use gradient checkpointing if true
bool reloaded_{false};
@ -178,9 +175,6 @@ public:
void setInference(bool inference) { inferenceOnly_ = inference; }
bool isInference() { return inferenceOnly_; }
void setOptimized(bool optimized) { optimized_ = optimized; }
bool isOptimized() { return (optimized_ && inferenceOnly_); }
void setCheckpointing(bool checkpointing) { checkpointing_ = checkpointing; }
bool isCheckpointing() { return checkpointing_; }

View File

@ -40,7 +40,7 @@ protected:
std::string debugMessage_;
Ptr<std::list<Expr>> subtape_; // a subtape is used to keep track of nodes that need to be freed and recomputed with gradient-checkpointing.
bool isCheckpoint_{false}; // true if this node has been selected to be a checkpoint, currently only done manually.
bool isCheckpoint_{false}; // true if this node has been selected to be a checkpoint, currently only done manually.
Ptr<AutoTunerRecorder> recorder_;
size_t recorderHash_;
@ -138,7 +138,7 @@ public:
virtual std::string graphviz() override {
std::stringstream ss;
ss << "\"" << this << "\" ["
ss << "\"" << this << "\" ["
<< "shape=\"" << form() << "\", "
<< "label=" << label() << ", "
<< "style=\"filled\", "
@ -147,7 +147,7 @@ public:
for(auto&& child : children())
ss << "\"" << child << "\" -> \"" << this << "\";" << std::endl;
if(subtape_) {
for(auto&& dep : *subtape_)
ss << "\"" << dep << "\" -> \"" << this << "\" [style=dotted];" << std::endl;
@ -188,9 +188,9 @@ struct NaryNodeOp : public Node {
// Deduce type automatically, but then all types must be the same
// this is called automatically when no output type is specified.
// If the input types are mixed, the output type needs to be specified
// If the input types are mixed, the output type needs to be specified
// in the constructor.
Type commonType(const std::vector<Expr>& nodes) {
static Type commonType(const std::vector<Expr>& nodes) {
ABORT_IF(nodes.size() == 0, "NaryNodeOp has no children");
Type type = nodes[0]->value_type();
for(int i = 1; i < nodes.size(); ++i)

View File

@ -17,9 +17,9 @@ namespace inits {
/**
* Base class for specialized NodeInitializers.
*
* A NodeInitializer is a functor that is associated with parameters
* and constants, and is invoked on a tensor during node intialization.
* You need to override NodeIntializer::apply(Tensor) with your own
* A NodeInitializer is a functor that is associated with parameters
* and constants, and is invoked on a tensor during node intialization.
* You need to override NodeIntializer::apply(Tensor) with your own
* functionality or use a fromLambda intializer.
*
* See node_initializers.cpp for examples.
@ -31,6 +31,7 @@ protected:
public:
virtual void apply(Tensor t) = 0;
void setAllocator(Ptr<Allocator> allocator) { allocator_ = allocator; }
virtual ~NodeInitializer() {}
};
/**
@ -135,7 +136,7 @@ Ptr<NodeInitializer> dropout(float dropoutProbabilty);
/**
* Intialize with gumbel noise, i.e. -log(-log(u)) where u ~ Uniform(0 + eps, 1 - eps)
*
*
* @return A NodeInitializer
*/
Ptr<NodeInitializer> gumbel(float eps = 1e-5f);
@ -163,7 +164,7 @@ Ptr<NodeInitializer> fromWord2vec(const std::string& file,
/**
* Computes Google's Transformer-style sinusoidal position embeddings
* starting from position 'start' taking into account batch and time
* starting from position 'start' taking into account batch and time
* dimensions of the tensor.
*
* Expected tensor layout {-2: time, -1: model}

View File

@ -480,9 +480,12 @@ class CSRDotNodeOp : public NaryNodeOp {
bool transS_;
bool swapOperands_;
public:
CSRDotNodeOp(const Shape& S_shape, Expr S_values, Expr S_indices, Expr S_offsets, Expr D, bool transS, bool swapOperands)
: NaryNodeOp({ S_values, S_indices, S_offsets, D }, newShape(S_shape, S_values, S_indices, S_offsets, D, transS, swapOperands), commonType({S_values, D})),
transS_(transS), swapOperands_(swapOperands) {
CSRDotNodeOp(const Shape& S_shape, Expr S_values, Expr S_indices,
Expr S_offsets, Expr D, bool transS, bool swapOperands)
: NaryNodeOp({ S_values, S_indices, S_offsets, D },
newShape(S_shape, S_values, S_indices, S_offsets, D, transS, swapOperands),
NaryNodeOp::commonType({S_values, D})),
transS_(transS), swapOperands_(swapOperands) {
matchOrAbort<IndexType>(S_indices->value_type());
matchOrAbort<IndexType>(S_offsets->value_type());
}
@ -513,7 +516,7 @@ public:
NodeOps backwardOps() override {
return { nullptr, // can't backprop into the sparse matrix (the gradient is dense)
nullptr,
nullptr,
nullptr,
NodeOp(CSRProd(child(3)->grad(), // child(3) = D
graph()->allocator(),
@ -527,7 +530,7 @@ public:
virtual size_t hash() override {
size_t seed = NaryNodeOp::hash();
for(auto s : shape())
util::hash_combine(seed, s);
util::hash_combine(seed, s);
util::hash_combine(seed, transS_);
util::hash_combine(seed, swapOperands_);
return seed;
@ -1050,8 +1053,8 @@ struct ConcatenateNodeOp : public NaryNodeOp {
auto checkShape = shape;
for(auto child : nodes) {
checkShape.set(ax_, child->shape()[ax_]); // don't abort on different sizes on axis dim.
ABORT_IF(checkShape != child->shape(),
"Child shapes {} and {} cannot be concatenated along axis {}",
ABORT_IF(checkShape != child->shape(),
"Child shapes {} and {} cannot be concatenated along axis {}",
shape, child->shape(), ax);
sum += child->shape()[ax_];

View File

@ -10,10 +10,10 @@
namespace marian {
// @TODO: Currently an ExpressionGraph only supports one Parameters object and
// @TODO: Currently an ExpressionGraph only supports one Parameters object and
// the type of parameters has to be the inside on Parameters object. This limits
// parameter types to a single chosen type, e.g. only fp32 or only fp16. This should
// be extended to allow multiple sets of parameters.
// be extended to allow multiple sets of parameters.
// The reason here is to be able to efficiently compute updates of whole parameter
// sets of one type.
class Parameters {
@ -40,7 +40,7 @@ public:
LOG(debug, "Created parameter object of type {}", acceptedElementType_);
}
~Parameters() {
virtual ~Parameters() {
LOG(debug, "Destroyed parameter object of type {}", acceptedElementType_);
}
@ -88,7 +88,7 @@ public:
// sort parameters by name before allocation to make sure the memory layout after allocation is always the same
std::sort(params_.begin(), params_.end(), [](Expr n1, Expr n2){ return n1->name() < n2->name(); });
for(auto p : params_) {
if(!p->val()) {
vals_->allocate(p->val(), p->shape(), p->value_type());

View File

@ -39,6 +39,7 @@ public:
// Simplest layer interface: Unary function
struct IUnaryLayer {
virtual ~IUnaryLayer() {}
virtual Expr apply(Expr) = 0;
virtual Expr apply(const std::vector<Expr>& es) {
ABORT_IF(es.size() > 1, "Not implemented"); // simple stub
@ -59,6 +60,7 @@ struct IEmbeddingLayer {
// alternative from indices directly
virtual Expr applyIndices(const std::vector<WordIndex>& embIdx, const Shape& shape) const = 0;
virtual ~IEmbeddingLayer() {}
};
// base class for Encoder and Decoder classes, which have embeddings and a batch index (=stream index)

View File

@ -5,14 +5,14 @@
namespace marian {
static inline RationalLoss guidedAlignmentCost(Ptr<ExpressionGraph> graph,
static inline RationalLoss guidedAlignmentCost(Ptr<ExpressionGraph> /*graph*/,
Ptr<data::CorpusBatch> batch,
Ptr<Options> options,
Expr attention) { // [beam depth=1, max src length, batch size, tgt length]
std::string guidedLossType = options->get<std::string>("guided-alignment-cost"); // @TODO: change "cost" to "loss"
float guidedLossWeight = options->get<float>("guided-alignment-weight");
const auto& shape = attention->shape(); // [beam depth=1, max src length, batch size, tgt length]
float epsilon = 1e-6f;
Expr alignmentLoss; // sum up loss over all attention/alignment positions
@ -55,8 +55,8 @@ static inline RationalLoss guidedAlignmentCost(Ptr<ExpressionGraph> graph,
else
ABORT("Unknown alignment cost type: {}", guidedLossType);
// every position is a label as they should all agree
// @TODO: there should be positional masking here ... on the other hand, positions that are not
// in a sentence should always agree (both being 0). Lack of masking affects label count only which is
// @TODO: there should be positional masking here ... on the other hand, positions that are not
// in a sentence should always agree (both being 0). Lack of masking affects label count only which is
// probably negligible?
numLabels = shape.elements();
}

View File

@ -331,6 +331,7 @@ public:
: LabelwiseLoss(axes), // cross-entropy already reduces over axis -1
labelSmoothing_(labelSmoothing), factorWeight_(factorWeight) {}
virtual ~CrossEntropyLoss() {}
protected:
float labelSmoothing_; // interpolation factor for label smoothing, see below
float factorWeight_; // give extra weight to factors
@ -368,7 +369,7 @@ protected:
if(labelWeights) {
// We currently do not know how to use target factors and word-level label weights together
bool wordlevel = labelWeights->shape()[-3] > 1; // Time-dimension is not trivially 1, hence we have word-level weights.
bool wordlevel = labelWeights->shape()[-3] > 1; // Time-dimension is not trivially 1, hence we have word-level weights.
ABORT_IF(wordlevel && logits.getNumFactorGroups() > 1, "CE loss with word-level label weights is not implemented for factors");
ce = ce * cast(labelWeights, Type::float32);
}
@ -379,15 +380,15 @@ protected:
/**
* @brief Unlikelihood loss across last axis, summed up over batch and time dimensions. This is an
* implementation of sequence-level unlikelihood loss from https://arxiv.org/abs/1908.04319.
* @brief Unlikelihood loss across last axis, summed up over batch and time dimensions. This is an
* implementation of sequence-level unlikelihood loss from https://arxiv.org/abs/1908.04319.
* We rely on word-level label weights where 1 is correct and 0 is marking an error. If there are not
* zeros for a sentence it going to be trained with normal CE loss if there is at least one 0 it is going
* to flip over to use SUL for that sentence to penalize the selected word.
*
*
* SUL is implemented as:
* -log(gather(1 - softmax(logits), -1, indices))
*
*
* Factors are currently not supported.
*/
class SequenceUnlikelihoodLoss : public CrossEntropyLoss {
@ -411,17 +412,17 @@ protected:
ABORT_IF(!mask, "mask is required"); // @TODO: check this, it seems weights for padding are by default 1, which would make this obsolete.
// use label weights, where 1 is GOOD and 0 is BAD. After inversion here, now 1 marks, mask again to eliminate padding (might be obsolete)
auto errorMask = (1.f - cast(labelWeights, Type::float32)) * cast(mask, Type::float32);
auto ceUl = logits.applyLossFunction(labels, [&](Expr logits, Expr indices) {
return cast(unlikelihood(logits, indices), Type::float32);
});
// compute if want to use CE or UL. If there are no errors train with CE, otherwise train _only on_ the errors with UL. This is the "mixed" training
// schedule from https://arxiv.org/abs/1908.04319. Providing labels with or without error scores we can easily switch between CE and UL.
// schedule from https://arxiv.org/abs/1908.04319. Providing labels with or without error scores we can easily switch between CE and UL.
auto onlyCe = eq(sum(errorMask, /*axis=*/-3), 0.f); // [1, 1, dimBatch, 1] - equal 1 if no errors are present
ceUl = errorMask * ceUl; // don't use for correct label or padding
auto cost = onlyCe * ce + (1.f - onlyCe) * ceUl; // ce or unlikelihood part are never simultanously used as cost per batch entry
auto cost = onlyCe * ce + (1.f - onlyCe) * ceUl; // ce or unlikelihood part are never simultanously used as cost per batch entry
return cost;
}

View File

@ -17,6 +17,7 @@ public:
virtual void debugWeighting(std::vector<float> /*weightedMask*/,
std::vector<float> /*freqMask*/,
Ptr<data::CorpusBatch> /*batch*/){};
virtual ~WeightingBase() {}
};
class DataWeighting : public WeightingBase {

View File

@ -41,6 +41,7 @@ class VocabWrapper : public IVocabWrapper {
Ptr<Vocab> pImpl_;
public:
VocabWrapper(Ptr<Vocab> vocab) : pImpl_(vocab) {}
virtual ~VocabWrapper() {}
WordIndex encode(const std::string& word) const override { return (*pImpl_)[word].toWordIndex(); }
std::string decode(WordIndex id) const override { return (*pImpl_)[Word::fromWordIndex(id)]; }
size_t size() const override { return pImpl_->size(); }
@ -243,7 +244,7 @@ DecoderCpuAvxVersion parseCpuAvxVersion(std::string name) {
}
}
// @TODO: clean-up this code and unify with marian-conv. The targetPrec parameter is not clear enought etc.
// @TODO: clean-up this code and unify with marian-conv. The targetPrec parameter is not clear enought etc.
bool convertModel(std::string inputFile, std::string outputFile, int32_t targetPrec) {
std::cout << "Converting from: " << inputFile << ", to: " << outputFile << std::endl;

View File

@ -54,6 +54,8 @@ public:
const std::vector<const void*>& ptrs)
: options_(options), ptrs_(ptrs) {}
virtual ~IBeamSearchDecoder() {}
virtual QSNBestBatch decode(const QSBatch& qsBatch,
size_t maxLength,
const std::unordered_set<WordIndex>& shortlist)

View File

@ -25,6 +25,7 @@ public:
Ptr<ExpressionGraph> graph, // @TODO: why needed? Can it be gotten from model?
Ptr<data::Batch> batch,
bool clearGraph = true) = 0;
virtual ~ICost() {}
};
class EncoderDecoderCECost : public ICost {
@ -51,6 +52,8 @@ public:
weighter_ = WeightingFactory(options_);
}
virtual ~EncoderDecoderCECost() {}
Ptr<MultiRationalLoss> apply(Ptr<IModel> model,
Ptr<ExpressionGraph> graph,
Ptr<data::Batch> batch,
@ -136,6 +139,8 @@ public:
Trainer(Ptr<IModel> model, Ptr<ICost> cost)
: model_(model), cost_(cost) {}
virtual ~Trainer() {}
Ptr<IModel> getModel() { return model_; }
virtual void load(Ptr<ExpressionGraph> graph,
@ -179,6 +184,8 @@ public:
Scorer(Ptr<IModel> model, Ptr<ILogProb> cost)
: model_(model), logProb_(cost) {}
virtual ~Scorer(){}
Ptr<IModel> getModel() { return model_; }
virtual void load(Ptr<ExpressionGraph> graph,
@ -211,6 +218,7 @@ public:
class LogSoftmaxStep : public ILogProbStep {
public:
virtual ~LogSoftmaxStep() {}
virtual Ptr<DecoderState> apply(Ptr<DecoderState> state) override {
// decoder needs normalized probabilities (note: skipped if beam 1 and --skip-cost)
state->setLogProbs(state->getLogProbs().applyUnaryFunction(logsoftmax));
@ -224,6 +232,7 @@ public:
// with --output-sampling during translation with marian-decoder
class GumbelSoftmaxStep : public ILogProbStep {
public:
virtual ~GumbelSoftmaxStep() {}
virtual Ptr<DecoderState> apply(Ptr<DecoderState> state) override {
state->setLogProbs(state->getLogProbs().applyUnaryFunctions(
[](Expr logits){ // lemma gets gumbelled

View File

@ -11,6 +11,7 @@ namespace marian {
class IEncoderDecoder : public models::IModel {
public:
virtual ~IEncoderDecoder() {}
virtual void load(Ptr<ExpressionGraph> graph,
const std::string& name,
bool markedReloaded = true) override

View File

@ -41,6 +41,8 @@ public:
// @TODO: Is there a better name?
class ICriterionFunction {
public:
virtual ~ICriterionFunction() {}
virtual void load(Ptr<ExpressionGraph>,
const std::string&,
bool markReloaded = true)

View File

@ -5,10 +5,12 @@
namespace marian {
struct ModelTask {
virtual ~ModelTask() {}
virtual void run() = 0;
};
struct ModelServiceTask {
virtual ~ModelServiceTask() {}
virtual std::string run(const std::string&) = 0;
};
} // namespace marian

View File

@ -11,6 +11,7 @@ namespace marian {
class EncoderS2S : public EncoderBase {
using EncoderBase::EncoderBase;
public:
virtual ~EncoderS2S() {}
Expr applyEncoderRNN(Ptr<ExpressionGraph> graph,
Expr embeddings,
Expr mask,
@ -254,7 +255,7 @@ public:
auto embeddings = state->getTargetHistoryEmbeddings();
// The batch dimension of the inputs can change due to batch-pruning, in that case
// cached elements need to be rebuilt, in this case the mapped encoder context in the
// cached elements need to be rebuilt, in this case the mapped encoder context in the
// attention mechanism of the decoder RNN.
int currDimBatch = embeddings->shape()[-2];
if(!rnn_ || lastDimBatch_ != currDimBatch) // if currDimBatch is different, rebuild the cached RNN
@ -263,7 +264,7 @@ public:
// Also @TODO: maybe implement a Cached(build, updateIf) that runs a check and rebuild if required
// at dereferecing :
// rnn_ = Cached<decltype(constructDecoderRNN(graph, state))>(
// /*build=*/[]{ return constructDecoderRNN(graph, state); },
// /*build=*/[]{ return constructDecoderRNN(graph, state); },
// /*updateIf=*/[]{ return state->batchDimChanged() });
// rnn_->transduce(...);

View File

@ -17,6 +17,7 @@ public:
: context_(context), mask_(mask), batch_(batch) {}
EncoderState() {}
virtual ~EncoderState() {}
virtual Expr getContext() const { return context_; }
virtual Expr getAttended() const { return context_; }
@ -53,6 +54,7 @@ public:
const std::vector<Ptr<EncoderState>>& encStates,
Ptr<data::CorpusBatch> batch)
: states_(states), logProbs_(logProbs), encStates_(encStates), batch_(batch) {}
virtual ~DecoderState() {}
// @TODO: Do we need all these to be virtual?
virtual const std::vector<Ptr<EncoderState>>& getEncoderStates() const {
@ -68,10 +70,10 @@ public:
int beamSize) const {
std::vector<Ptr<EncoderState>> newEncStates;
for(auto& es : encStates_)
for(auto& es : encStates_)
// If the size of the batch dimension of the encoder state context changed, subselect the correct batch entries
newEncStates.push_back(es->getContext()->shape()[-2] == batchIndices.size() ? es : es->select(batchIndices));
// hypindices matches batchIndices in terms of batch dimension, so we only need hypIndices
auto selectedState = New<DecoderState>(
states_.select(hypIndices, beamSize, /*isBatchMajor=*/false), logProbs_, newEncStates, batch_);
@ -121,6 +123,7 @@ private:
Words targetWords_;
public:
virtual ~ClassifierState() {}
virtual Expr getLogProbs() const { return logProbs_; }
virtual void setLogProbs(Expr logProbs) { logProbs_ = logProbs; }

View File

@ -16,6 +16,7 @@ namespace marian {
class ClipperBase {
public:
virtual void clip(Tensor) = 0;
virtual ~ClipperBase() {}
};
typedef std::shared_ptr<ClipperBase> ClipperPtr;

View File

@ -29,6 +29,8 @@ public:
LOG(info, "[optimizers] Learning rate gets automatically adjusted as if minibatch size was {}", refMBWordsParam_);
}
virtual ~OptimizerBase() {}
static constexpr size_t mbSizeNotProvided = SIZE_MAX;
void update(Ptr<ExpressionGraph> graph, size_t mbSize = mbSizeNotProvided) {
@ -114,7 +116,7 @@ class Sgd : public OptimizerBase {
public:
Sgd(float eta, size_t refMBWordsParam = 0, Ptr<ClipperBase> clipper = nullptr)
: OptimizerBase(eta, refMBWordsParam, clipper) {}
virtual ~Sgd() {}
virtual void setParams(const std::vector<float>& /*params*/) override {}
private:
void updateImpl(Tensor params, Tensor grads, size_t actualMBSize, size_t refMBWords) override;

View File

@ -13,6 +13,7 @@ namespace marian {
class ScoreCollector {
public:
ScoreCollector(const Ptr<Options>& options);
virtual ~ScoreCollector() {}
virtual void Write(long id, const std::string& message);
virtual void Write(long id,

View File

@ -35,7 +35,7 @@ protected:
public:
BaseRNN(Ptr<ExpressionGraph> graph, Ptr<Options> options)
: graph_(graph), options_(options) {}
virtual ~BaseRNN() {}
virtual Expr transduce(Expr, Expr = nullptr) = 0;
virtual Expr transduce(Expr, State, Expr = nullptr) = 0;
virtual Expr transduce(Expr, States, Expr = nullptr) = 0;
@ -113,6 +113,7 @@ private:
public:
friend RNN;
virtual ~SingleLayerRNN() {}
// @TODO: benchmark whether this concatenation is a good idea
virtual Expr transduce(Expr input, Expr mask = nullptr) override {

View File

@ -17,7 +17,7 @@ protected:
public:
Backend(DeviceId deviceId, size_t seed)
: deviceId_(deviceId), seed_(seed), randomGenerator_(createRandomGenerator(seed, deviceId)) {}
virtual ~Backend() {};
virtual DeviceId getDeviceId() { return deviceId_; };
virtual Ptr<RandomGenerator> getRandomGenerator() { return randomGenerator_; }

View File

@ -8,29 +8,40 @@
namespace marian {
namespace cpu {
namespace {
// allocate function for tensor reserve() below.
// Needed for AVX512, while not available on all compilers. It seems clang
// does not have aligned_alloc for all cstlib versions. If AVX512 is not used
// a simple malloc is probably fine.
// Should generate a runtime error otherwise as we have a check in the AVX512
// functions which tests for alignment.
#ifdef _WIN32
#define MALLOC(size) _aligned_malloc(size, alignment_)
#elif __GNUC__
#define MALLOC(size) aligned_alloc(alignment_, size)
#else
#define MALLOC(size) malloc(size)
#endif
// Alignment is needed because we use AVX512 and AVX2 vectors. We should fail if we can't allocate aligned memory.
#ifdef _WIN32
#define FREE(ptr) _aligned_free(ptr)
void *genericMalloc(size_t alignment, size_t size) {
void *ret = _aligned_malloc(size, alignment);
ABORT_IF(!ret, "Failed to allocate memory on CPU");
return ret;
}
void genericFree(void *ptr) {
_aligned_free(ptr);
}
#else
#define FREE(ptr) free(ptr)
// Linux and OS X. There is no fallback to malloc because we need it to be aligned.
void *genericMalloc(size_t alignment, size_t size) {
// On macos, aligned_alloc is available only on c++17
// Furthermore, it requires that the memory requested is an exact multiple of the alignment, otherwise it fails.
// posix_memalign is available both Mac (Since 2016) and Linux and in both gcc and clang
void *result;
// Error could be detected by return value or just remaining nullptr.
ABORT_IF(posix_memalign(&result, alignment, size), "Failed to allocate memory on CPU");
return result;
}
void genericFree(void *ptr) {
free(ptr);
}
#endif
} // namespace
Device::~Device() {
FREE(data_);
genericFree(data_);
}
void Device::reserve(size_t size) {
@ -38,14 +49,12 @@ void Device::reserve(size_t size) {
ABORT_IF(size < size_ || size == 0,
"New size must be larger than old size and larger than 0");
uint8_t *temp = static_cast<uint8_t*>(genericMalloc(alignment_, size));
if(data_) {
uint8_t *temp = static_cast<uint8_t*>(MALLOC(size));
std::copy(data_, data_ + size_, temp);
FREE(data_);
data_ = temp;
} else {
data_ = static_cast<uint8_t*>(MALLOC(size));
genericFree(data_);
}
data_ = temp;
size_ = size;
}
} // namespace cpu

View File

@ -17,6 +17,7 @@
#endif
using namespace fbgemm;
// @TODO: don't use using namespace ...; in header files. Just don't. [UG]
#endif // USE_FBGEMM
namespace marian {
@ -96,7 +97,7 @@ struct FbgemmPacked16PackNodeOp : public UnaryNodeOp {
const std::string type() override { return "packMatFp16"; }
Shape newShape(Expr a, bool transpose) {
Shape newShape(Expr MAYBE_UNUSED a, bool MAYBE_UNUSED transpose) {
#if USE_FBGEMM
auto shapeMat = a->shape();
// Should be 2D - weight matrix
@ -115,9 +116,8 @@ struct FbgemmPacked16PackNodeOp : public UnaryNodeOp {
packsize_);
Shape outShape({(int)packsize_});
return outShape;
#else // USE_FBGEMM
#else
ABORT("Packed GEMM requires a build with USE_FBGEMM enabled");
return Shape();
#endif // USE_FBGEMM
@ -180,19 +180,21 @@ struct FbgemmPacked8PackNodeOp : public UnaryNodeOp {
const std::string type() override { return "packMatInt8"; }
Shape newShape(Expr a, bool transpose) {
#if USE_FBGEMM
Shape newShape(Expr a, bool transpose) {
fbgemmPacked8PackInfo(a->shape(), packType_, transpose, nrow_, ncol_, packsize_);
Shape outShape({(int)packsize_});
return outShape;
#else // USE_FBGEMM
}
#else
Shape newShape(Expr /*a*/, bool /*transpose*/) {
ABORT("Packed GEMM requires a build with USE_FBGEMM enabled");
return Shape();
#endif // USE_FBGEMM
}
#endif // USE_FBGEMM
};
// Affine transform (matrix multiplication) using packed B matrix
// float scalar_: scalar multiplier
// size_t m_: the number of rows in A and C
@ -202,7 +204,6 @@ struct FbgemmPacked8PackNodeOp : public UnaryNodeOp {
// bool transB_: transpose B
class FbgemmPacked16AffineNodeOp : public NaryNodeOp {
private:
float scalar_;
size_t m_;
size_t n_;
size_t k_;
@ -210,9 +211,8 @@ private:
bool transB_;
public:
FbgemmPacked16AffineNodeOp(const std::vector<Expr>& nodes, Shape bShape, bool transA, bool transB, float scalar)
: NaryNodeOp(nodes, newShape(nodes[0], bShape, transA, transB), Type::float32),
scalar_(scalar) {
FbgemmPacked16AffineNodeOp(const std::vector<Expr>& nodes, Shape bShape, bool transA, bool transB, float /*scalar*/)
: NaryNodeOp(nodes, newShape(nodes[0], bShape, transA, transB), Type::float32)/*, scalar_(scalar)*/ {
transA_ = transA;
transB_ = transB;
m_ = nodes[0]->shape().elements() / nodes[0]->shape()[-1];
@ -281,7 +281,6 @@ public:
// bool transB_: transpose B
class FbgemmPacked8AffineNodeOp : public NaryNodeOp {
private:
float scalar_;
size_t m_;
size_t n_;
size_t k_;
@ -289,9 +288,8 @@ private:
bool transB_;
public:
FbgemmPacked8AffineNodeOp(const std::vector<Expr>& nodes, Shape bShape, bool transA, bool transB, float scalar)
: NaryNodeOp(nodes, newShape(nodes[0], bShape, transA, transB), Type::float32),
scalar_(scalar) {
FbgemmPacked8AffineNodeOp(const std::vector<Expr>& nodes, Shape bShape, bool transA, bool transB, float /*scalar*/)
: NaryNodeOp(nodes, newShape(nodes[0], bShape, transA, transB), Type::float32)/*, scalar_(scalar) */ {
transA_ = transA;
transB_ = transB;
m_ = nodes[0]->shape().elements() / nodes[0]->shape()[-1];
@ -302,7 +300,7 @@ public:
size_t l = bShape.elements() / bShape[-1];
n_ = bShape[-1];
if(transB)
std::swap(l, n_);
std::swap(l, n_);
}
Shape newShape(Expr a, Shape bShape, bool transA, bool transB) {
@ -369,9 +367,9 @@ static inline Expr affine(Expr a, Expr b, Shape bShape, Expr c, bool transA, boo
Type elementType = b->value_type();
if (elementType == Type::packed16)
return Expression<cpu::variant::FbgemmPacked16AffineNodeOp>(nodes, bShape, transA, transB, scalar);
return Expression<FbgemmPacked16AffineNodeOp>(nodes, bShape, transA, transB, scalar);
else if (isPacked(elementType) && sizeOf(elementType) == 1)
return Expression<cpu::variant::FbgemmPacked8AffineNodeOp>(nodes, bShape, transA, transB, scalar);
return Expression<FbgemmPacked8AffineNodeOp>(nodes, bShape, transA, transB, scalar);
else {
ABORT("Only int8 and fp16 are available. {}", elementType);
return nullptr;
@ -380,9 +378,9 @@ static inline Expr affine(Expr a, Expr b, Shape bShape, Expr c, bool transA, boo
static inline Expr pack(Type elementType, Expr a, PackMatrix packMat, bool transpose, float clipValue) {
if (elementType == Type::packed16)
return Expression<cpu::variant::FbgemmPacked16PackNodeOp>(a, packMat, transpose, clipValue);
return Expression<FbgemmPacked16PackNodeOp>(a, packMat, transpose, clipValue);
else if (isPacked(elementType) && sizeOf(elementType) == 1)
return Expression<cpu::variant::FbgemmPacked8PackNodeOp>(a, packMat, elementType, transpose, clipValue);
return Expression<FbgemmPacked8PackNodeOp>(a, packMat, elementType, transpose, clipValue);
else {
ABORT("Only int8 and fp16 are available. {}", elementType);
return nullptr;
@ -394,9 +392,9 @@ static inline Expr dot(Expr a, Expr b, Shape bShape, bool transA, bool transB, f
Type elementType = b->value_type();
if (elementType == Type::packed16)
return Expression<cpu::variant::FbgemmPacked16AffineNodeOp>(nodes, bShape, transA, transB, scalar);
return Expression<FbgemmPacked16AffineNodeOp>(nodes, bShape, transA, transB, scalar);
else if (isPacked(elementType) && sizeOf(elementType) == 1)
return Expression<cpu::variant::FbgemmPacked8AffineNodeOp>(nodes, bShape, transA, transB, scalar);
return Expression<FbgemmPacked8AffineNodeOp>(nodes, bShape, transA, transB, scalar);
else {
ABORT("Only int8 and fp16 are available. {}", elementType);
return nullptr;

View File

@ -20,7 +20,7 @@ namespace marian {
namespace cpu {
void IsNaN(const Tensor in, Ptr<Allocator> allocator, bool& /*isNaN*/, bool& /*isInf*/) {
void IsNaN(const Tensor /*in*/, Ptr<Allocator> /*allocator*/, bool& /*isNaN*/, bool& /*isInf*/) {
ABORT("Not implemented");
}
@ -214,9 +214,11 @@ void Transpose0213(Tensor out, Tensor in) {
}
}
// This function is called only when MKL is available.
#if MKL_FOUND
// Given a 4D array, transpose (swap) the initial 3 dimensions while keeping the last dimension.
// e.g. 1234 --> 2134, 1234 --> 3214 (4 is always kept).
// This is an optimized version for swapping first 3 dimensions
// This is an optimized version for swapping first 3 dimensions
// assuming the last dimension is large enough to get benefits from vectorized copy.
//
// @param out output tensor
@ -225,14 +227,13 @@ void Transpose0213(Tensor out, Tensor in) {
template <bool add>
void TransposeFirst3In4(Tensor out, Tensor in, const std::vector<int>& vAxis) {
ABORT_IF(vAxis.size() != 4, "This function handles only 4D arrays.");
#if MKL_FOUND
int innermost = in->shape()[-1];
int l1 = in->shape()[vAxis[0]];
int l2 = in->shape()[vAxis[1]];
int l3 = in->shape()[vAxis[2]];
// find the mapping between the transposed output dimensional indices (oi, oj, ok)
// find the mapping between the transposed output dimensional indices (oi, oj, ok)
// and original input dimensional indices (i, j, k)
int oi, oj, ok;
#pragma omp parallel for
@ -275,11 +276,8 @@ void TransposeFirst3In4(Tensor out, Tensor in, const std::vector<int>& vAxis) {
}
}
}
#else
// it shouldn't come into here. This function is called only when MKL is available.
ABORT("Should not get here");
#endif // MKL_FOUND
}
#endif // MKL_FOUND
inline void transpose4x4_SSE(const float* A,
float* B,
@ -656,7 +654,7 @@ void SelectAxis2(Tensor out,
functional::Shape outShape = out->shape();
functional::Shape inShape = in->shape();
auto idxData = indices->data<IndexType>();
auto odata = out->data();
const auto idata = in->data();

View File

@ -15,11 +15,11 @@ protected:
public:
RandomGenerator(size_t seed) : seed_(seed) { }
virtual ~RandomGenerator() {}
virtual void uniform(Tensor, float a, float b) = 0;
virtual void normal(Tensor, float mean, float stddev) = 0;
};
Ptr<RandomGenerator> createRandomGenerator(size_t /*seed*/, DeviceId);
}
}

View File

@ -25,7 +25,7 @@
namespace marian {
template <typename InIt, typename OutIt>
void copy(Ptr<Backend> backend, const InIt beg, const InIt end, OutIt it) {
void copy(Ptr<Backend>& MAYBE_UNUSED backend, const InIt beg, const InIt end, OutIt it) {
#ifdef CUDA_FOUND
if(backend->getDeviceId().type == DeviceType::gpu)
gpu::copy(backend, beg, end, it);
@ -119,7 +119,7 @@ DISPATCH3(Concatenate, marian::Tensor, const std::vector<marian::Tensor>&, int)
// clang-format on
// Bernoulli(tensor, 0.5f, 2.f, -1.f) generates a tensor composed of 50% of 1 and 50% of -1.
// Bernoulli(tensor, 0.5f, 2.f, -1.f) generates a tensor composed of 50% of 1 and 50% of -1.
static inline void Bernoulli(Tensor resultTensor, float keepProb, float scale = 1.f, float shift = 0.f) {
// in-place uniform distribution
auto rnd = resultTensor->getBackend()->getRandomGenerator();
@ -190,7 +190,7 @@ void LayerNormalizationGrad(Tensor gradX,
}
static inline void LayerNormalizationGrad(
Ptr<Allocator> allocator,
Ptr<Allocator> MAYBE_UNUSED allocator,
Tensor gradX,
Tensor gradGamma,
Tensor gradBeta,

View File

@ -1,7 +1,7 @@
#include "marian.h"
#include "common/timer.h"
int main(int argc, char** argv) {
int main(int /*argc*/, char** /*argv*/) {
using namespace marian;
{

View File

@ -8,6 +8,8 @@
#include <fstream>
int main(int argc, char** argv) {
ABORT_IF(argc != 3, "FATAL ERROR: Incorrect number of command line arguments "
"(expected: 2) for command {}.",argv[0]);
SQLite::Database db("corpus.db", SQLite::OPEN_READWRITE|SQLite::OPEN_CREATE);
db.exec("PRAGMA temp_store_directory = '/data1/marcinjd';");

View File

@ -38,7 +38,7 @@ Ptr<ICommunicator> createCommunicator(
}
// the actual implementation is inside communicator.cu
return New<NCCLCommunicator>(graphs, mpi);
return New<NCCLCommunicator>(graphs, mpi);
#else // no CUDA or no NCCL
noNccl; // (unused)
return New<DefaultCommunicator>(graphs, mpi);
@ -141,7 +141,7 @@ public:
FakeMPIWrapper(bool) {
LOG(warn, "Compiled without MPI support. Falling back to FakeMPIWrapper");
}
virtual ~FakeMPIWrapper() {}
virtual size_t myMPIRank() const override { return 0; };
virtual size_t numMPIProcesses() const override { return 1; };

View File

@ -156,11 +156,8 @@ public:
void scatterReduceAndResetGrads() const override {
const_cast<DefaultCommunicator*>(this)->lazyInit();
int totalSize = (int)graphs_[0]->params()->vals()->size();
int shardSize = (int)ceil(totalSize / (float)graphs_.size());
// Gather gradients from different devices into current gradient shards
auto scatter = [this, shardSize](size_t idx, size_t begin, size_t end) {
auto scatter = [this](size_t idx, size_t begin, size_t end) {
auto curGrad = graphs_[idx]->params()->grads()->subtensor(begin, end-begin);
// collect and sum gradients
@ -176,7 +173,7 @@ public:
};
// reset gradients outside current shard
auto reset = [this, shardSize](size_t idx, size_t begin, size_t end) {
auto reset = [this](size_t idx, size_t begin, size_t end) {
auto grad = graphs_[idx]->params()->grads();
if (begin > 0)
grad->subtensor(0, begin)->set(0);
@ -189,11 +186,9 @@ public:
}
void allGatherParams() const override {
int totalSize = (int)graphs_[0]->params()->vals()->size();
int shardSize = (int)ceil(totalSize / (float)graphs_.size());
// Update all graphs with parameter shard
auto gather = [this, shardSize](size_t idx, size_t begin, size_t end) {
auto gather = [this](size_t idx, size_t begin, size_t end) {
auto getShard = [&](Ptr<ExpressionGraph> graph) {
return graph->params()->vals()->subtensor(begin, end-begin);
};

View File

@ -118,7 +118,7 @@ public:
}
// Convert a tensor into a sparse tensor format
void fromDense(Tensor t) {
void fromDense(Tensor MAYBE_UNUSED t) {
if(backend_->getDeviceId().type == DeviceType::cpu) {
ABORT("Gradient Dropping for CPU is not yet supported");
}

View File

@ -54,10 +54,10 @@ public:
* number of devices, which is passed in as the 'multiplier'.
*/
// @TODO: Can this be made const? It seems wrong to have a stateful method that still returns a result.
virtual Ptr<data::BatchStats> collectStats(Ptr<ExpressionGraph> graph,
Ptr<models::ICriterionFunction> model,
const std::vector<Ptr<Vocab>>& vocabs,
double multiplier = 1.) {
Ptr<data::BatchStats> collectStats(Ptr<ExpressionGraph> graph,
Ptr<models::ICriterionFunction> model,
const std::vector<Ptr<Vocab>>& vocabs,
double multiplier = 1.) {
auto stats = New<data::BatchStats>();
size_t numFiles = options_->get<std::vector<std::string>>("train-sets").size();
@ -92,8 +92,8 @@ public:
maxBatch *= 2;
}
// Do a binary search for maximum batch size that fits into given workspace memory
// for a tested sentence length.
// Do a binary search for maximum batch size that fits into given workspace memory
// for a tested sentence length.
for(size_t i = step; i <= maxLength; i += step) {
size_t start = 1;
size_t end = maxBatch;

View File

@ -64,7 +64,7 @@ public:
void save(Ptr<ExpressionGraph>, bool final = false);
// @TODO: give it a fake batch generator which own vocabs instead of passing vocabs
Ptr<data::BatchStats> collectStats(const std::vector<Ptr<Vocab>>& vocabs) {
virtual Ptr<data::BatchStats> collectStats(const std::vector<Ptr<Vocab>>& vocabs) {
return GraphGroup::collectStats(graphs_[0], builders_[0], vocabs);
}

View File

@ -63,7 +63,6 @@ private:
Tensor paramsAvg_;
std::vector<float> accGradientsSync_cpu;
std::vector<float> receiveBuffer_cpu;
bool synchronization_happened{false};
Ptr<OptimizerBase> syncOptimizer_;

View File

@ -26,7 +26,6 @@ class SyncGraphGroup : public GraphGroup, public ExponentialSmoothing {
// state for update()
bool first_{ true }; // gets interpreted and cleared by update()
std::vector<Ptr<data::Batch>> pendingBatches_; // in case of dynamic MB-size scaling, we temporarily buffer up batches across update() calls until enough
size_t typicalTrgWords_{}; // typical batch size in words (labels), 0 if unknown (e.g. specified in sentences)
double updateMultiplier_{1}; // multiplier not applied in collectStats() (no multiplier if not mini-batch-fit)
void initialize(const Ptr<data::Batch>& exampleBatch);

View File

@ -379,6 +379,14 @@ public:
state_->wordsDisp = 0;
}
if(options_->get<bool>("valid-reset-stalled")) {
state_->stalled = 0;
state_->maxStalled = 0;
for(const auto& validator : validators_) {
state_->validators[validator->type()]["stalled"] = 0;
}
}
state_->newLoad();
}

View File

@ -14,7 +14,7 @@ class TrainingState;
class TrainingObserver {
public:
virtual ~TrainingObserver() {}
virtual void init(TrainingState&) {}
virtual void actAfterEpoch(TrainingState&) {}
virtual void actAfterBatches(TrainingState&) {}

View File

@ -36,6 +36,7 @@ protected:
public:
ValidatorBase(bool lowerIsBetter) : lowerIsBetter_(lowerIsBetter), lastBest_{initScore()} {}
virtual ~ValidatorBase() {}
virtual float validate(const std::vector<Ptr<ExpressionGraph>>& graphs,
Ptr<const TrainingState> state) = 0;
@ -51,6 +52,7 @@ public:
template <class DataSet, class BuilderType> // @TODO: BuilderType doesn't really serve a purpose here? Review and remove.
class Validator : public ValidatorBase {
public:
virtual ~Validator() {}
Validator(std::vector<Ptr<Vocab>> vocabs, Ptr<Options> options, bool lowerIsBetter = true)
: ValidatorBase(lowerIsBetter),
vocabs_(vocabs),
@ -137,6 +139,7 @@ class CrossEntropyValidator : public Validator<data::Corpus, models::ICriterionF
public:
CrossEntropyValidator(std::vector<Ptr<Vocab>> vocabs, Ptr<Options> options);
virtual ~CrossEntropyValidator() {}
std::string type() override { return options_->get<std::string>("cost-type"); }
@ -148,6 +151,7 @@ protected:
class AccuracyValidator : public Validator<data::Corpus, models::IModel> {
public:
AccuracyValidator(std::vector<Ptr<Vocab>> vocabs, Ptr<Options> options);
virtual ~AccuracyValidator() {}
std::string type() override { return "accuracy"; }
@ -161,6 +165,7 @@ private:
public:
BertAccuracyValidator(std::vector<Ptr<Vocab>> vocabs, Ptr<Options> options, bool evalMaskedLM);
virtual ~BertAccuracyValidator() {}
std::string type() override {
if(evalMaskedLM_)
@ -177,6 +182,7 @@ protected:
class ScriptValidator : public Validator<data::Corpus, models::IModel> {
public:
ScriptValidator(std::vector<Ptr<Vocab>> vocabs, Ptr<Options> options);
virtual ~ScriptValidator() {}
virtual float validate(const std::vector<Ptr<ExpressionGraph>>& graphs,
Ptr<const TrainingState> /*ignored*/) override;
@ -193,6 +199,7 @@ protected:
class TranslationValidator : public Validator<data::Corpus, models::IModel> {
public:
TranslationValidator(std::vector<Ptr<Vocab>> vocabs, Ptr<Options> options);
virtual ~TranslationValidator() {}
virtual float validate(const std::vector<Ptr<ExpressionGraph>>& graphs,
Ptr<const TrainingState> state) override;
@ -212,6 +219,7 @@ protected:
class BleuValidator : public Validator<data::Corpus, models::IModel> {
public:
BleuValidator(std::vector<Ptr<Vocab>> vocabs, Ptr<Options> options, bool detok = false);
virtual ~BleuValidator() {}
virtual float validate(const std::vector<Ptr<ExpressionGraph>>& graphs,
Ptr<const TrainingState> state) override;

View File

@ -16,7 +16,7 @@ private:
Ptr<Options> options_;
std::vector<Ptr<Scorer>> scorers_;
size_t beamSize_;
Ptr<Vocab> trgVocab_;
Ptr<const Vocab> trgVocab_;
const float INVALID_PATH_SCORE = std::numeric_limits<float>::lowest(); // @TODO: observe this closely
const bool PURGE_BATCH = true; // @TODO: diagnostic, to-be-removed once confirmed there are no issues.
@ -24,7 +24,7 @@ private:
public:
BeamSearch(Ptr<Options> options,
const std::vector<Ptr<Scorer>>& scorers,
Ptr<Vocab> trgVocab)
const Ptr<const Vocab> trgVocab)
: options_(options),
scorers_(scorers),
beamSize_(options_->get<size_t>("beam-size")),
@ -42,8 +42,8 @@ public:
const std::vector<bool>& dropBatchEntries, // [origDimBatch] - empty source batch entries are marked with true, should be cleared after first use.
const std::vector<IndexType>& batchIdxMap) const { // [origBatchIdx -> currentBatchIdx]
std::vector<float> align; // collects alignment information from the last executed time step
if(options_->hasAndNotEmpty("alignment") && factorGroup == 0)
align = scorers_[0]->getAlignment(); // [beam depth * max src length * current batch size] -> P(s|t); use alignments from the first scorer, even if ensemble,
if(options_->hasAndNotEmpty("alignment") && factorGroup == 0)
align = scorers_[0]->getAlignment(); // [beam depth * max src length * current batch size] -> P(s|t); use alignments from the first scorer, even if ensemble,
const auto origDimBatch = beams.size(); // see function search for definition of origDimBatch and currentDimBatch etc.
Beams newBeams(origDimBatch); // return value of this function goes here. There are always origDimBatch beams.
@ -56,7 +56,7 @@ public:
reverseBatchIdxMap.resize(batchIdxMap.size()); // adjust size if doing batch purging.
currentDimBatch = 0;
for(int i = 0; i < batchIdxMap.size(); ++i) {
reverseBatchIdxMap[batchIdxMap[i]] = i; // reverse batch index mapping, multiple occurrences get overwritten with the last one,
reverseBatchIdxMap[batchIdxMap[i]] = i; // reverse batch index mapping, multiple occurrences get overwritten with the last one,
// which is expected due to down-shifting
if(!beams[i].empty())
currentDimBatch++;
@ -154,12 +154,12 @@ public:
auto lval = states[j]->getLogProbs().getFactoredLogitsTensor(factorGroup); // [maxBeamSize, 1, currentDimBatch, dimFactorVocab]
// The flatting happens based on actual (current) batch size and batch index computed with batch-pruning as we are looking into the pruned tensor
size_t flattenedLogitIndex = (beamHypIdx * currentDimBatch + currentBatchIdx) * vocabSize + wordIdx; // (beam idx, batch idx, word idx); note: beam and batch are transposed, compared to 'key'
// @TODO: use a function on shape() to index, or new method val->at({i1, i2, i3, i4}) with broadcasting
ABORT_IF(lval->shape() != Shape({(int)nBestBeamSize, 1, (int)currentDimBatch, (int)vocabSize}) &&
(beamHypIdx == 0 && lval->shape() != Shape({1, 1, (int)currentDimBatch, (int)vocabSize})),
"Unexpected shape of logits?? {} != {}", lval->shape(), Shape({(int)nBestBeamSize, 1, (int)currentDimBatch, (int)vocabSize}));
breakDown[j] += lval->get(flattenedLogitIndex);
}
hyp->setScoreBreakdown(breakDown);
@ -173,7 +173,7 @@ public:
newBeam.push_back(hyp);
}
// if factored vocab and this is not the first factor, we need to
// also propagate factored hypotheses that do not get expanded in this step because they don't have this factor
if (factorGroup > 0) {
@ -225,7 +225,7 @@ public:
// in a single beam, i.e.:
// * [word1-batch1, word1-batch2, ..., word2-batch1, ...]
//
size_t origDimBatch = batch->size(); // number of sentences in batch
size_t batchWidth = batch->width(); // max src length
@ -254,7 +254,7 @@ public:
for(auto beam : beams) {
Beam newBeam; // a beam of surviving hyps
for(auto hyp : beam)
if(hyp->getWord() != trgEosId) // if this hyp is not finished,
if(hyp->getWord() != trgEosId) // if this hyp is not finished,
newBeam.push_back(hyp); // move over to beam of surviving hyps
if(PURGE_BATCH)
@ -309,8 +309,8 @@ public:
// create one beam per batch entry with sentence-start hypothesis
Beams beams(origDimBatch, Beam(beamSize_, Hypothesis::New())); // array [origDimBatch] of array [maxBeamSize] of Hypothesis, keeps full size through search.
// batch purging is determined from an empty sub-beam.
std::vector<IndexType> batchIdxMap(origDimBatch); // Record at which batch entry a beam is looking.
// By default that corresponds to position in array,
std::vector<IndexType> batchIdxMap(origDimBatch); // Record at which batch entry a beam is looking.
// By default that corresponds to position in array,
// but shifts in the course of removing batch entries when they are finished.
const std::vector<bool> emptyBatchEntries; // used for recording if there are empty input batch entries
@ -370,7 +370,7 @@ public:
std::vector<IndexType> hypIndices; // [maxBeamSize, 1, currentDimBatch, 1] (flattened) tensor index ((beamHypIdx, batchIdx), flattened) of prev hyp that a hyp originated from
std::vector<Word> prevWords; // [maxBeamSize, 1, currentDimBatch, 1] (flattened) word that a hyp ended in, for advancing the decoder-model's history
Expr prevPathScores; // [maxBeamSize, 1, currentDimBatch, 1], path score that a hyp ended in (last axis will broadcast into vocab size when adding expandedPathScores)
bool anyCanExpand = false; // stays false if all hyps are invalid factor expansions
if(t == 0 && factorGroup == 0) { // no scores yet
prevPathScores = graph->constant({1, 1, 1, 1}, inits::fromValue(0));
@ -384,7 +384,7 @@ public:
for(int currentBatchIdx = 0; currentBatchIdx < beams.size(); ++currentBatchIdx) // loop over batch entries (active sentences)
if(!beams[currentBatchIdx].empty() || !PURGE_BATCH) // for each beam check
batchIndices.push_back(prevBatchIdxMap[currentBatchIdx]); // which batch entries were active in previous step
std::vector<float> prevScores;
for(size_t beamHypIdx = 0; beamHypIdx < maxBeamSize; ++beamHypIdx) { // loop over globally maximal beam-size (maxBeamSize)
for(int origBatchIdx = 0; origBatchIdx < origDimBatch; ++origBatchIdx) { // loop over all batch entries (active and inactive)
@ -401,11 +401,11 @@ public:
if(factorGroup == 0)
currentBatchIdx = prevBatchIdxMap[origBatchIdx]; // subselection may happen for factorGroup == 0
else
currentBatchIdx = batchIdxMap[origBatchIdx]; // no subselection happens for factorGroup > 0,
// but we treat it like a next step, since a step
currentBatchIdx = batchIdxMap[origBatchIdx]; // no subselection happens for factorGroup > 0,
// but we treat it like a next step, since a step
// happened for factorGroup == 0
}
auto hypIndex = (IndexType)(hyp->getPrevStateIndex() * currentDimBatch + currentBatchIdx); // (beamHypIdx, batchIdx), flattened, for index_select() operation
hypIndices.push_back(hypIndex); // (beamHypIdx, batchIdx), flattened as said above.
@ -420,7 +420,7 @@ public:
}
}
}
if(factorGroup == 0)
if(factorGroup == 0)
currentDimBatch = (IndexType) batchIndices.size(); // keep batch size constant for all factor groups in a time step
prevPathScores = graph->constant({(int)maxBeamSize, 1, (int)currentDimBatch, 1}, inits::fromVector(prevScores));
}
@ -505,7 +505,7 @@ public:
beams,
states, // used for keeping track of per-ensemble-member path score
batch, // only used for propagating alignment info
factoredVocab, factorGroup,
factoredVocab, factorGroup,
emptyBatchEntries, // [origDimBatch] - empty source batch entries are marked with true
batchIdxMap); // used to create a reverse batch index map to recover original batch indices for this step
} // END FOR factorGroup = 0 .. numFactorGroups-1

View File

@ -20,34 +20,6 @@ public:
NthElementCPU() {}
NthElementCPU(const NthElementCPU& copy) = delete;
private:
// for each batch, select the max N elements, where N is the beam size for this batch.
void selectNBest(const float* scores,
const std::vector<int>& batchFirstElementIdxs,
const std::vector<int>& cumulativeBeamSizes) {
int numProbs = batchFirstElementIdxs.back();
std::vector<int> idxs(numProbs);
std::iota(idxs.begin(), idxs.end(), 0);
size_t numBatches = batchFirstElementIdxs.size() - 1;
for(size_t batchIdx = 0; batchIdx < numBatches; ++batchIdx) {
int pos = cumulativeBeamSizes[batchIdx];
int beamSize = cumulativeBeamSizes[batchIdx + 1] - pos;
std::vector<int>::iterator begin = idxs.begin() + batchFirstElementIdxs[batchIdx];
std::vector<int>::iterator middle = begin + beamSize;
std::vector<int>::iterator end = idxs.begin() + batchFirstElementIdxs[batchIdx + 1];
std::partial_sort(
begin, middle, end, [&](int a, int b) { return scores[a] > scores[b]; });
while(begin != middle) {
int idx = *begin++;
h_res_idx[pos] = idx;
h_res[pos] = scores[idx];
++pos;
}
}
}
public:
void getNBestList(Tensor scores, // [dimBatch, 1, beamSize, dimVocab or dimShortlist]
@ -59,23 +31,39 @@ public:
const auto inputN = scores->shape()[-2];
const auto dimBatch = scores->shape()[-4];
ABORT_IF(inputN != (isFirst ? 1 : N), "Input tensor has wrong beam dim??"); // @TODO: Remove isFirst argument altogether
std::vector<int> cumulativeBeamSizes(dimBatch + 1, 0);
std::vector<int> batchFirstElementIdxs(dimBatch + 1, 0);
for(int batchIdx = 0; batchIdx < dimBatch; ++batchIdx) {
cumulativeBeamSizes[batchIdx + 1] = (batchIdx + 1) * (int)N;
batchFirstElementIdxs[batchIdx + 1] += (batchIdx + 1) * inputN * vocabSize;
ABORT_IF(cumulativeBeamSizes[batchIdx + 1] != cumulativeBeamSizes[batchIdx] + (int)N, "cumulativeBeamSizes wrong??");
ABORT_IF((isFirst ? batchIdx + 1 : cumulativeBeamSizes[batchIdx + 1]) != (batchIdx + 1) * inputN, "inputN wrong??");
}
ABORT_IF(cumulativeBeamSizes.back() != dimBatch * N, "cumulativeBeamSizes.back() wrong??");
const float* scoresData = scores->data();
size_t maxSize = N * dimBatch;
h_res.resize(maxSize);
h_res_idx.resize(maxSize);
size_t pos = 0; // iterates through h_res and h_res_idx
selectNBest(scores->data(), batchFirstElementIdxs, cumulativeBeamSizes);
size_t batchOffset = inputN * vocabSize;
std::vector<int> idxs(batchOffset); // re-used for each batch
std::iota(idxs.begin(), idxs.end(), 0);
for(size_t batchIdx = 0; batchIdx < dimBatch; ++batchIdx) {
std::partial_sort(
// sorts the top N (beam size) idxs by score to the front
idxs.begin(),
idxs.begin() + N,
idxs.end(),
[&](int a, int b) { return scoresData[a] > scoresData[b]; }
);
// copy top N idxs and scores to return vectors
for(size_t i = 0; i < N; ++i) {
int idx = idxs[i];
// since idxs is re-used for each batch, add batch offset to each idx to get absolute position
h_res_idx[pos] = idx + batchIdx * batchOffset;
h_res[pos] = scoresData[idx];
++pos;
}
// advance pointer to next batch's beginning
scoresData += batchOffset;
}
getPairs(/*cumulativeBeamSizes.back(),*/ outKeys, outPathScores);
}

View File

@ -11,6 +11,7 @@ namespace marian {
class PrintingStrategy {
public:
virtual ~PrintingStrategy() {}
virtual bool shouldBePrinted(long) = 0;
};

View File

@ -10,6 +10,8 @@ namespace marian {
class ScorerState {
public:
virtual ~ScorerState(){}
virtual Logits getLogProbs() const = 0;
virtual void blacklist(Expr /*totalCosts*/, Ptr<data::CorpusBatch> /*batch*/){};
@ -24,6 +26,8 @@ public:
Scorer(const std::string& name, float weight)
: name_(name), weight_(weight) {}
virtual ~Scorer(){}
std::string getName() { return name_; }
float getWeight() { return weight_; }
@ -53,6 +57,7 @@ protected:
public:
ScorerWrapperState(Ptr<DecoderState> state) : state_(state) {}
virtual ~ScorerWrapperState() {}
virtual Ptr<DecoderState> getState() { return state_; }
@ -88,6 +93,8 @@ public:
encdec_(std::static_pointer_cast<IEncoderDecoder>(encdec)),
ptr_{ptr} {}
virtual ~ScorerWrapper() {}
virtual void init(Ptr<ExpressionGraph> graph) override {
graph->switchParams(getName());
if(ptr_)

View File

@ -109,6 +109,17 @@ public:
threadPool.enqueue(task, device, id++);
}
if(options_->get<bool>("output-sampling", false)) {
if(options_->get<size_t>("beam-size") > 1)
LOG(warn,
"[warning] Output sampling and beam search (beam-size > 1) are contradictory methods "
"and using them together is not recommended. Set beam-size to 1");
if(options_->get<std::vector<std::string>>("models").size() > 1)
LOG(warn,
"[warning] Output sampling and model ensembling are contradictory methods and using "
"them together is not recommended. Use a single model");
}
}
void run() override {