Add support for compiling on Mac (and clang) (#598)

* Compile Marian on Mac with clang. Two linker errors left

* MacOS has a different definition for unsigned long

* Find OpenBLAS on mac

* Fix a typo in the BLAS detection

* Simplify and add comments

* Refactor CPU allocation code. Do not fall back to malloc

* Fix compilation warning on gcc

* Refactor memory allocation

* Make things compile with clang-8 with fewer warnings.

* Eliminate clang warnings when compiling examples and when compiling without MKL

* Added USE_MKL option to compile without MKL for debugging, even when MKL is installed

* Fixed issues with compiling examples with clang

* Fix compile errors with clang in src/tests.

* Fix missing whitespace in error message in src/tests/sqlite.cpp.

* Responding to Frank Seide's code review.

* Eliminate clang warnings when compiling with -DUSE_FBGEMM=on.

* Fix compilation on gcc 8

* Get Marian to compile with Clang-10.

* Fix Clang-8 warnings when compiling with marian-server

* Add more comments and explicit unsigned long long for windows

* Pull in fbgemm that supports mac

* Fix warning flags order in CMakeLists.txt

Co-authored-by: Kenneth Heafield <kpu@users.noreply.github.com>
Co-authored-by: Ulrich Germann <ulrich.germann@gmail.com>
Co-authored-by: Roman Grundkiewicz <romang@amu.edu.pl>
Roman Grundkiewicz 2020-03-05 21:08:23 +00:00 committed by GitHub
parent 67b055fe4a
commit 00d2e999e3
69 changed files with 428 additions and 236 deletions

View File

@ -24,6 +24,7 @@ option(USE_CCACHE "Use ccache compiler cache (https://ccache.dev)" OFF)
option(USE_CUDNN "Use CUDNN library" OFF)
option(USE_DOXYGEN "Build documentation with Doxygen" ON)
option(USE_FBGEMM "Use FBGEMM" OFF)
option(USE_MKL "Compile with MKL support" ON)
option(USE_MPI "Use MPI library" OFF)
option(USE_NCCL "Use NCCL library" ON)
option(USE_SENTENCEPIECE "Download and compile SentencePiece" OFF)
@ -33,7 +34,7 @@ option(USE_STATIC_LIBS "Link statically against non-system libs" OFF)
if(USE_CCACHE)
find_program(CCACHE_PROGRAM ccache)
if(CCACHE_PROGRAM)
message(STATUS "Found and will be using ccache for faster repeat compilation (use cmake -DUSE_CCACHE=off to disable).")
message(STATUS "Will be using ccache for faster repeat compilation (use cmake -DUSE_CCACHE=off to disable).")
set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${CCACHE_PROGRAM}")
else(CCACHE_PROGRAM)
message(WARNING "Compilation with ccache requested but no ccache found.")
@ -141,20 +142,32 @@ else(MSVC)
add_definitions(-DUSE_FBGEMM=1)
endif(USE_FBGEMM)
set(DISABLE_GLOBALLY "-Wno-unused-result")
if (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 9.0)
# Clang-10.0.0 complains when CUDA is newer than 10.1
set(CLANG_IGNORE_UNKNOWN_CUDA "-Wno-unknown-cuda-version")
endif()
set(DISABLE_GLOBALLY "-Wno-unused-result -Wno-unknown-warning-option ${CLANG_IGNORE_UNKNOWN_CUDA}")
# These are used in src/CMakeLists.txt on a per-target basis
list(APPEND ALL_WARNINGS -Wall; -Werror; -Wno-unused-result; -Wno-deprecated; -Wno-pragmas; -Wno-unused-parameter; -Wextra; -Wno-unused-function;
-Wno-unused-value; -Wno-unknown-pragmas; -Wno-sign-compare; -Wno-missing-field-initializers;)
list(APPEND ALL_WARNINGS -Wall; -Werror; -Wextra; -Wno-unused-result; -Wno-deprecated;
-Wno-pragmas; -Wno-unused-parameter; -Wno-unused-function;
-Wno-unused-value; -Wno-unknown-pragmas; -Wno-sign-compare;
-Wno-missing-field-initializers;)
# This warning does not exist prior to gcc 5.0
if(CMAKE_COMPILER_IS_GNUCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 5.0)
list(APPEND ALL_WARNINGS -Wsuggest-override)
list(APPEND ALL_WARNINGS -Wsuggest-override -Wno-int-in-bool-context)
endif()
set(CMAKE_CXX_FLAGS "-std=c++11 -pthread -Wl,--no-as-needed -fPIC ${DISABLE_GLOBALLY} -march=${BUILD_ARCH} ${INTRINSICS}")
set(CMAKE_CXX_FLAGS_RELEASE "-Ofast -m64 -funroll-loops -ffinite-math-only -g -rdynamic")
set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g -rdynamic")
if(CMAKE_COMPILER_IS_GNUCC)
# these flags are not known to clang
set(CMAKE_GCC_FLAGS "-Wl,--no-as-needed")
set(CMAKE_RDYNAMIC_FLAG "-rdynamic")
endif(CMAKE_COMPILER_IS_GNUCC)
set(CMAKE_CXX_FLAGS "-std=c++11 -pthread ${CMAKE_GCC_FLAGS} -fPIC ${DISABLE_GLOBALLY} -march=${BUILD_ARCH} ${INTRINSICS}")
set(CMAKE_CXX_FLAGS_RELEASE "-Ofast -m64 -funroll-loops -ffinite-math-only -g ${CMAKE_RDYNAMIC_FLAG}")
set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g ${CMAKE_RDYNAMIC_FLAG}")
set(CMAKE_CXX_FLAGS_SLIM "-Ofast -m64 -funroll-loops -ffinite-math-only -DNDEBUG")
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELEASE}")
set(CMAKE_CXX_FLAGS_PROFILE "${CMAKE_CXX_FLAGS_RELEASE} -pg")
@ -162,9 +175,9 @@ else(MSVC)
set(CMAKE_CXX_FLAGS_PROFUSE "${CMAKE_CXX_FLAGS_RELEASE} -fprofile-use -fprofile-correction")
# these need to be set separately
set(CMAKE_C_FLAGS "-pthread -Wl,--no-as-needed -fPIC ${DISABLE_GLOBALLY} -march=${BUILD_ARCH} ${INTRINSICS}")
set(CMAKE_C_FLAGS_RELEASE "-O3 -m64 -funroll-loops -ffinite-math-only -g -rdynamic")
set(CMAKE_C_FLAGS_DEBUG "-O0 -g -rdynamic")
set(CMAKE_C_FLAGS "-pthread ${CMAKE_GCC_FLAGS} -fPIC ${DISABLE_GLOBALLY} -march=${BUILD_ARCH} ${INTRINSICS}")
set(CMAKE_C_FLAGS_RELEASE "-O3 -m64 -funroll-loops -ffinite-math-only -g ${CMAKE_RDYNAMIC_FLAG}")
set(CMAKE_C_FLAGS_DEBUG "-O0 -g ${CMAKE_RDYNAMIC_FLAG}")
set(CMAKE_C_FLAGS_SLIM "-O3 -m64 -funroll-loops -ffinite-math-only -DNDEBUG")
set(CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELEASE}")
set(CMAKE_C_FLAGS_PROFILE "${CMAKE_C_FLAGS_RELEASE} -pg")
@ -204,7 +217,7 @@ if(CUDA_FOUND)
if((CUDA_VERSION VERSION_EQUAL "10.0" OR CUDA_VERSION VERSION_GREATER "10.0") AND (CMAKE_VERSION VERSION_LESS "3.12.2"))
message(WARNING "On some Unix systems CUDA 10.0+ requires CMake 3.12.2+; you use CMake ${CMAKE_VERSION}")
endif()
if(COMPILE_CUDA_SM35)
LIST(APPEND COMPUTE -arch=sm_35; -gencode=arch=compute_35,code=sm_35;) # Tesla K40 and above
endif(COMPILE_CUDA_SM35)
@ -323,13 +336,15 @@ if(USE_MPI)
endif(USE_MPI)
if(COMPILE_CPU)
find_package(MKL)
if(USE_MKL)
find_package(MKL)
endif(USE_MKL)
if(MKL_FOUND)
include_directories(${MKL_INCLUDE_DIR})
set(EXT_LIBS ${EXT_LIBS} ${MKL_LIBRARIES})
add_definitions(-DBLAS_FOUND=1 -DMKL_FOUND=1)
else(MKL_FOUND)
set(BLA_VENDOR "OpenBLAS")
set(BLAS_VENDOR "OpenBLAS")
find_package(BLAS)
if(BLAS_FOUND)
include(FindCBLAS)

View File

@ -54,7 +54,7 @@ MACRO(CHECK_ALL_LIBRARIES LIBRARIES INCLUDE _prefix _name _flags _list _include
IF(APPLE)
FIND_LIBRARY(${_prefix}_${_library}_LIBRARY
NAMES ${_library}
PATHS /usr/local/lib /usr/lib /usr/local/lib64 /usr/lib64 ENV
PATHS /usr/local/lib /usr/lib /usr/local/lib64 /usr/lib64 /usr/local/opt/openblas/lib ENV
DYLD_LIBRARY_PATH
)
ELSE(APPLE)

View File

@ -15,12 +15,22 @@ if(USE_FBGEMM)
if(NOT MSVC)
# only locally disabled for the 3rd_party folder
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-value -Wno-unused-parameter -Wno-unused-variable -Wno-unused-function")
# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-value -Wno-unused-parameter -Wno-unused-variable -Wno-unused-function")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused")
endif()
set(FBGEMM_BUILD_TESTS OFF CACHE BOOL "Disable fbgemm tests")
set(FBGEMM_BUILD_BENCHMARKS OFF CACHE BOOL "Disable fbgemm benchmark")
add_subdirectory(./fbgemm)
# asmjit (3rd-party submodule of fbgemm) sets -Wall -Wextra near the end of
# the compile options, invalidating any -Wno-... flags that we may have set
# earlier. Let's remove them.
get_property(ASMJIT_COMPILE_OPTIONS TARGET asmjit PROPERTY COMPILE_OPTIONS)
list(REMOVE_ITEM ASMJIT_COMPILE_OPTIONS -Wall -Wextra)
set_property(TARGET asmjit PROPERTY COMPILE_OPTIONS ${ASMJIT_COMPILE_OPTIONS})
message(" ASMJIT COMPILE FLAGS: ${ASMJIT_COMPILE_OPTIONS}")
endif(USE_FBGEMM)
if(USE_SENTENCEPIECE)
@ -39,7 +49,7 @@ if(USE_SENTENCEPIECE)
message(WARNING "You are compiling SentencePiece binaries with -DUSE_STATIC_LIBS=on. \
This will cause spm_train to segfault. No need to worry if you do not intend to use that binary. \
Marian support for SentencePiece will work fine.")
set(SPM_ENABLE_SHARED OFF CACHE BOOL "Builds shared libaries in addition to static libraries." FORCE)
set(SPM_TCMALLOC_STATIC ON CACHE BOOL "Link static library of TCMALLOC." FORCE)
else(USE_STATIC_LIBS)
@ -51,8 +61,19 @@ if(USE_SENTENCEPIECE)
include_directories(./sentencepiece)
set_target_properties(spm_encode spm_decode spm_train spm_normalize spm_export_vocab
PROPERTIES
RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}")
PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}")
if (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
foreach(t sentencepiece sentencepiece_train sentencepiece_train-static
spm_decode spm_encode spm_export_vocab spm_normalize spm_train)
set_property(TARGET ${t} APPEND_STRING PROPERTY COMPILE_FLAGS " -Wno-tautological-compare -Wno-unused")
if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 9.0)
set_property(TARGET ${t} APPEND_STRING PROPERTY COMPILE_FLAGS " -Wno-range-loop-construct")
endif()
# get_property(SENTENCEPIECE_COMPILE_FLAGS TARGET ${t} PROPERTY COMPILE_FLAGS)
# message("-- SENTENCPIECE: compile flags for target ${t}: ${SENTENCEPIECE_COMPILE_FLAGS}")
endforeach(t)
endif()
if(USE_STATIC_LIBS)
set(CMAKE_FIND_LIBRARY_SUFFIXES ${_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES})
@ -63,6 +84,22 @@ include_directories(./SQLiteCpp/include)
include_directories(./CLI)
include_directories(./pathie-cpp/include)
if (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
#set_target_properties(SQLiteCpp PROPERTIES COMPILE_FLAGS
set_property(TARGET SQLiteCpp APPEND_STRING PROPERTY COMPILE_FLAGS
" -Wno-parentheses-equality -Wno-unused-value")
if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 9.0)
set_property(TARGET SQLiteCpp APPEND_STRING PROPERTY COMPILE_FLAGS
" -Wno-implicit-int-float-conversion")
endif()
set_property(TARGET libyaml-cpp APPEND_STRING PROPERTY COMPILE_FLAGS
" -fPIC -Wno-unused-value")
set_property(TARGET pathie-cpp APPEND_STRING PROPERTY COMPILE_FLAGS
" -fPIC -Wno-unused-value")
endif()
include_directories(./zlib)
include(ExternalProject)

@ -1 +1 @@
Subproject commit 84e66a976046180187724aff60a236c5378fde7c
Subproject commit f78e60988329b9207d086c743cafce1ac1bea3ab

View File

@ -186,7 +186,7 @@ inline HalfFloat& HalfFloat::operator= (float other)
inline bool HalfFloat::operator== (HalfFloat other) const
{
// +0 and -0 are considered to be equal
if (!(bits << 1u) && !(other.bits << 1u))return true;
if ((bits << 1u) == 0 && (other.bits << 1u) == 0) return true;
return bits == other.bits && !this->IsNaN();
}
@ -194,7 +194,7 @@ inline bool HalfFloat::operator== (HalfFloat other) const
inline bool HalfFloat::operator!= (HalfFloat other) const
{
// +0 and -0 are considered to be equal
if (!(bits << 1u) && !(other.bits << 1u))return false;
if ((bits << 1u) == 0 && (other.bits << 1u) == 0) return false;
return bits != other.bits || this->IsNaN();
}

View File

@ -31,7 +31,7 @@
#include "../include/path.hpp"
#include "../include/errors.hpp"
#if defined(__unix__)
#if defined(__unix__) || defined(__APPLE__)
#include <sys/types.h>
#include <dirent.h>
#include <errno.h>

View File

@ -902,7 +902,7 @@ Path Path::pwd()
*/
Path Path::exe()
{
#if defined(__linux__)
#if defined(__linux__) || defined(__APPLE__)
char buf[PATH_MAX];
ssize_t size = ::readlink("/proc/self/exe", buf, PATH_MAX);

View File

@ -143,7 +143,7 @@ std::string Pathie::convert_encodings(const char* from_encoding, const char* to_
errno = 0;
errsav = 0;
#ifdef BSD
#if defined(BSD) && ! defined(__APPLE__) // MacOS evolved from BSD, so it is also captured by the BSD define, but iconv on MacOS behaves differently
// What the heck. FreeBSD violates POSIX.1-2008: it declares iconv()
// differently than mandated by POSIX: http://pubs.opengroup.org/onlinepubs/9699919799/functions/iconv.html
// (it declares a `const' where it must not be).
@ -181,11 +181,10 @@ std::string Pathie::convert_encodings(const char* from_encoding, const char* to_
std::string Pathie::utf8_to_filename(const std::string& utf8)
{
bool fs_encoding_is_utf8 = false;
char* fsencoding = NULL;
#if defined(__APPLE__) || defined(PATHIE_ASSUME_UTF8_ON_UNIX)
fs_encoding_is_utf8 = true;
#else
char* fsencoding = NULL;
fsencoding = nl_langinfo(CODESET);
fs_encoding_is_utf8 = (strcmp(fsencoding, "UTF-8") == 0);
#endif
@ -206,11 +205,10 @@ std::string Pathie::utf8_to_filename(const std::string& utf8)
std::string Pathie::filename_to_utf8(const std::string& native_filename)
{
bool fs_encoding_is_utf8 = false;
char* fsencoding = NULL;
#if defined(__APPLE__) || defined(PATHIE_ASSUME_UTF8_ON_UNIX)
fs_encoding_is_utf8 = true;
#else
char* fsencoding = NULL;
fsencoding = nl_langinfo(CODESET);
fs_encoding_is_utf8 = (strcmp(fsencoding, "UTF-8") == 0);
#endif

View File

@ -27,7 +27,7 @@ static std::string strerror()
{
buff = "Unknown error";
}
#elif (_POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600) && ! _GNU_SOURCE
#elif (_POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600 || __APPLE__) && ! _GNU_SOURCE
// XSI-compliant strerror_r()
if (strerror_r(errno, &buff[0], buff.size()) != 0)
{

View File

@ -215,6 +215,10 @@ if(COMPILE_SERVER)
set(EXECUTABLES ${EXECUTABLES} marian_server)
endif(COMPILE_SERVER)
if(APPLE) # This is a dependency of pathie but I can't seem to link it into that CMakeLists because we're not compiling it as a library.
set(EXT_LIBS ${EXT_LIBS} iconv)
endif()
foreach(exec ${EXECUTABLES})
target_link_libraries(${exec} marian ${EXT_LIBS} ${EXT_LIBS} ${CMAKE_THREAD_LIBS_INIT})
if(CUDA_FOUND)

View File

@ -44,7 +44,7 @@ int main(int argc, char **argv) {
// Error Codes for error code meanings
// http://www.boost.org/doc/libs/1_55_0/doc/html/boost_asio/reference.html
translate.on_error = [](Ptr<WSServer::Connection> connection,
translate.on_error = [](Ptr<WSServer::Connection> /*connection*/,
const SimpleWeb::error_code &ec) {
LOG(error, "Connection error: ({}) {}", ec.value(), ec.message());
};

View File

@ -10,6 +10,21 @@
#include <string>
#include <vector>
// The macro MAYBE_UNUSED is used to selectively disable
// unused-variable warnings. C++17 defines the attribute
// [[maybe_unused]], but we are not on C++17 yet; we can switch to it once we are.
// The compilers gcc and clang (and maybe others) define
// __has_attribute and support __attribute__((unused)) in C++11.
#if defined __has_attribute
# if __has_attribute(unused)
# define MAYBE_UNUSED __attribute__((unused))
# else
# define MAYBE_UNUSED
# endif
#endif
#define THREAD_GUARD(body) [&]() { body; }() // test if THREAD_GUARD is necessary, remove if no problems occur.
#define NodeOp(op) [=]() { op; }
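
A short illustrative sketch (not part of the diff) of how the MAYBE_UNUSED macro above is meant to be used: annotating a parameter that is only referenced in some build configurations keeps -Wunused-parameter quiet on gcc/clang. The `#else` fallback for compilers without `__has_attribute` and the names `addOrIgnore`/`ENABLE_DEBUG_OFFSET` are inventions of this sketch, not of the patch.

```cpp
#include <cstdio>

#if defined __has_attribute
#  if __has_attribute(unused)
#    define MAYBE_UNUSED __attribute__((unused))
#  else
#    define MAYBE_UNUSED
#  endif
#else
#  define MAYBE_UNUSED  // sketch-only fallback for compilers without __has_attribute
#endif

// A parameter that is only referenced in some build configurations can be
// annotated so -Wunused-parameter stays quiet in the configurations that
// ignore it (same usage pattern as in the fbgemm packing code later in this diff).
static int addOrIgnore(int x, int MAYBE_UNUSED debugOffset) {
#ifdef ENABLE_DEBUG_OFFSET
  return x + debugOffset;
#else
  return x;
#endif
}

int main() {
  std::printf("%d\n", addOrIgnore(41, 1));
  return 0;
}
```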

View File

@ -84,10 +84,16 @@ std::vector<T> As<std::vector<T>>::apply(const FastOpt& node) {
// specializations for simple vector types
template struct As<std::vector<bool>>;
template struct As<std::vector<int>>;
// Windows and Unix based OS have different type definitions for 'unsigned long'.
// So, we need an explicit definition for uint64_t. Otherwise, there's a linking error on windows.
// Windows, Linux-based OSes, and Mac have different type definitions for 'unsigned long'.
// So we need explicit instantiations for uint64_t that cover the different platforms.
// Otherwise, there's a linking error on Windows, Linux, or Mac.
// https://software.intel.com/en-us/articles/size-of-long-integer-type-on-different-architecture-and-os/
template struct As<std::vector<uint64_t>>;
// https://stackoverflow.com/questions/32021860/c-should-you-size-t-with-a-regular-array
// MacOS: size_t = unsigned long (8 bytes), uint64_t = unsigned long long (8 bytes)
// Linux: size_t = unsigned long (8 bytes), uint64_t = unsigned long (8 bytes)
// Windows: size_t = unsigned long long (8 bytes), uint64_t = unsigned long long (8 bytes)
template struct As<std::vector<unsigned long long>>;
template struct As<std::vector<unsigned long>>;
template struct As<std::vector<float>>;
template struct As<std::vector<double>>;
template struct As<std::vector<std::string>>;
@ -103,4 +109,4 @@ std::pair<T1, T2> As<std::pair<T1, T2>>::apply(const FastOpt& node) {
template struct As<std::pair<int, int>>;
}
}
}
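
A minimal standalone sketch (separate from the diff) of the type situation described in the comment above: on a 64-bit target, size_t, unsigned long, and unsigned long long are all 8 bytes wide, but which named type uint64_t maps to differs per OS, which is why both explicit instantiations are needed. The static_asserts only document the 64-bit assumption.

```cpp
#include <cstddef>
#include <cstdint>
#include <type_traits>

// All 8 bytes on a 64-bit target, yet distinct types for the purposes of
// template instantiation and overload resolution:
//   Linux:   size_t = unsigned long,      uint64_t = unsigned long
//   macOS:   size_t = unsigned long,      uint64_t = unsigned long long
//   Windows: size_t = unsigned long long, uint64_t = unsigned long long
static_assert(sizeof(std::size_t) == 8 && sizeof(std::uint64_t) == 8,
              "this sketch assumes a 64-bit platform");

// uint64_t is an alias for exactly one of the two wide unsigned types; which
// one it is depends on the platform, hence instantiating As<> for both.
static_assert(std::is_same<std::uint64_t, unsigned long>::value ||
              std::is_same<std::uint64_t, unsigned long long>::value,
              "uint64_t aliases one of the two, but which one is platform-specific");

int main() { return 0; }
```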

View File

@ -367,7 +367,8 @@ public:
}
const FastOpt& operator[](const char* const key) const {
return operator[](crc::crc(key));
// MacOS requires an explicit cast to size_t before we can use it.
return operator[]((size_t)crc::crc(key));
}
const FastOpt& operator[](const std::string& key) const {
@ -375,4 +376,4 @@ public:
}
};
}
}

View File

@ -7,10 +7,21 @@
#include "common/filesystem.h"
#include "common/logging.h"
// Even when compiling with clang, __GNUC__ may be defined, so
// we need to add some extra checks to avoid compile errors with
// respect to -Wsuggest-override.
#ifdef __GNUC__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wsuggest-override"
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wunused-value"
# if defined(__has_warning)
# if __has_warning("-Wsuggest-override")
# pragma GCC diagnostic ignored "-Wsuggest-override"
# endif
# else
# pragma GCC diagnostic ignored "-Wsuggest-override"
# endif
#endif
#ifdef _MSC_VER
#pragma warning(push) // 4101: 'identifier' : unreferenced local variable. One parameter variable in zstr.hpp is not used.
#pragma warning(disable : 4101)
@ -82,7 +93,7 @@ protected:
void NormalizeTempPrefix(std::string& base) const;
void MakeTemp(const std::string& base);
};
} // namespace io

View File

@ -7,9 +7,19 @@
// @TODO: go back to canonical names for functions and objects
// as specified in C++17 so it becomes easy to move in the future
// Even when compiling with clang, __GNUC__ may be defined, so
// we need to add some extra checks to avoid compile errors with
// respect to -Wsuggest-override.
#ifdef __GNUC__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wsuggest-override"
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wunused-value"
# if defined(__has_warning)
# if __has_warning("-Wsuggest-override")
# pragma GCC diagnostic ignored "-Wsuggest-override"
# endif
# else
# pragma GCC diagnostic ignored "-Wsuggest-override"
# endif
#endif
#include "3rd_party/pathie-cpp/include/path.hpp" // @TODO: update to latest Pathie

View File

@ -169,12 +169,12 @@ inline bool operator!=(const IntrusivePtr<T>& a, std::nullptr_t) {
template<class T>
inline bool operator==(T* a, const IntrusivePtr<T>& b) {
return b.get();
return a == b.get(); // used to say: return b.get(); That cannot be right. [UG]
}
template<class T>
inline bool operator!=(T* a, const IntrusivePtr<T>& b) {
return b.get();
return a != b.get(); // used to say: return b.get(); That cannot be right. [UG]
}
template<class T, class U>
@ -223,5 +223,3 @@ namespace std {
}
};
}
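
A small self-contained test sketch (not part of the diff) of the behaviour the corrected raw-pointer comparisons are meant to have; ToyPtr is an invented stand-in for IntrusivePtr with the reference counting stripped out.

```cpp
#include <cassert>

// Invented stand-in for IntrusivePtr<T>; only get() matters for this example.
template <class T>
struct ToyPtr {
  T* ptr;
  T* get() const { return ptr; }
};

// Fixed comparisons: compare the raw pointer against the wrapped pointer.
// The pre-patch versions returned 'b.get()' converted to bool, so for any
// non-null b both 'a == b' and 'a != b' evaluated to true, whatever a was.
template <class T> bool operator==(T* a, const ToyPtr<T>& b) { return a == b.get(); }
template <class T> bool operator!=(T* a, const ToyPtr<T>& b) { return a != b.get(); }

int main() {
  int x = 0, y = 0;
  ToyPtr<int> p{&x};
  assert(&x == p);     // same object: equal
  assert(!(&y == p));  // different object: not equal (the old operator== said "equal" here)
  assert(&y != p);
  return 0;
}
```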

View File

@ -124,7 +124,7 @@ static void setErrorHandlers() {
std::set_terminate(unhandledException);
#ifdef __unix__
// catch segfaults
struct sigaction sa = { 0 };
struct sigaction sa = { {0} };
sigemptyset(&sa.sa_mask);
sa.sa_flags = SA_SIGINFO;
sa.sa_sigaction = [](int /*signal*/, siginfo_t*, void*) { ABORT("Segmentation fault"); };

View File

@ -254,7 +254,7 @@ enum class Type : size_t {
packed16 = TypeClass::packed_type + 2u, // special type for FBGEMM, not meant to be used anywhere else, not meant to be accessed individually. Internal actual type (uint16) is meaningless.
packed8avx2 = TypeClass::packed_type + 1u + TypeClass::avx2_type, // special type for FBGEMM with AVX2, not meant to be used anywhere else, not meant to be accessed individually. Internal actual type (uint8) is meaningless.
packed8avx512 = TypeClass::packed_type + 1u + TypeClass::avx512_type, // special type for FBGEMM with AVX512, not meant to be used anywhere else, not meant to be accessed individually. Internal actual type (uint8) is meaningless.
};
static inline size_t operator&(TypeClass typeClass, Type type) {
@ -394,7 +394,7 @@ static Type inline typeFromString(const std::string& str) {
return Type::float32;
if(str == "float64")
return Type::float64;
if(str == "packed16")
return Type::packed16;
if(str == "packed8avx2")
@ -437,19 +437,35 @@ void matchOrAbort(Type type) {
namespace typeFitting { // own namespace instead of in class, otherwise we get error "explicit specialization in non-namespace scope"
// compares max for different types as constexpr, so can be used at compile-time to determine if RequestType type max fits into ReturnType max, see std::conditional below.
template <typename RequestType, typename ReturnType>
constexpr bool fitsIntoMax() { return std::numeric_limits<RequestType>::max() <= std::numeric_limits<ReturnType>::max(); } // for built-in types everything is constexpr
// Helper function for fitsIntoMax() below
// Returns the 'capacity' of a type: number of digits for integers,
// max_exponent for floats. We ignore the mantissa for floats.
template<typename X> constexpr int capacity() {
static_assert(std::is_arithmetic<X>::value || std::is_same<X,HalfFloat>::value,
"Wrong type for this template");
return (std::is_integral<X>::value
? std::numeric_limits<X>::digits
: std::numeric_limits<X>::max_exponent);
}
// Compare max for different types as constexpr, so can be used at compile-time to determine if RequestType type max fits into ReturnType max, see std::conditional below.
template <typename RequestType, typename ReturnType>
constexpr bool fitsIntoMax() {
// We can't just compare std::numeric_limits<>::max(), because Clang-10
// complains about rounding errors when implicitly converting int to float
return ((!std::is_integral<RequestType>::value // RequestType is a float
&& std::is_integral<ReturnType>::value) // ReturnType an integer
? capacity<RequestType>() < capacity<ReturnType>() // special case
: capacity<RequestType>() <= capacity<ReturnType>()); // normal case
} // for built-in types everything is constexpr
// add specializations here when needed
template <> constexpr bool fitsIntoMax<float16, float>() { return true; }; // for float16 conversion to float is not constexpr, hence specializations
template <> constexpr bool fitsIntoMax<float, float16>() { return false; }; // for float16 conversion to float is not constexpr, hence specializations
}
template <typename ReturnType>
class NumericLimits {
private:
template <typename MaxType> void setLimitsMax() {
max = (ReturnType)std::numeric_limits<MaxType>::max();
lowest = (ReturnType)std::numeric_limits<MaxType>::lowest();
@ -459,10 +475,14 @@ private:
void setLimits() {
// check if the maximum of type RequestType fits into ReturnType
constexpr bool fits = typeFitting::fitsIntoMax<RequestType, ReturnType>();
// sanity check:
static_assert(fits || typeFitting::fitsIntoMax<ReturnType, RequestType>(),
"RequestType doesn't fit into ReturnType, and ReturnType doesn't "
"fit into RequestType. fitsIntoMax is broken!");
// and then use the smaller of each types to determine max, min, lowest.
using MaxType = typename std::conditional<fits, RequestType, ReturnType>::type;
setLimitsMax<MaxType>();
// @TODO: should we rather abort if the RequestType does not fit into ReturnType instead of clipping to smaller type?
// @TODO: should we rather abort if the RequestType does not fit into ReturnType instead of clipping to smaller type?
// ABORT_IF(!fits, "Type {} is too small to contain max of type {}", typeId<ReturnType>(), typeId<RequestType>());
}
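
A compile-time sketch (separate from the diff, restricted to built-in types so it stands alone) of the capacity()/fitsIntoMax() idea above: integer types are compared by their number of value bits, floating types by max_exponent, and the float-into-integer direction uses strict '<', which sidesteps the implicit int-to-float rounding that Clang-10 warns about when max() values are compared directly.

```cpp
#include <cstdint>
#include <limits>
#include <type_traits>

// "Capacity" of a type: number of value bits for integers, max_exponent for
// floating-point types. The mantissa is ignored on purpose, as in the patch.
template <typename X>
constexpr int capacity() {
  return std::is_integral<X>::value ? std::numeric_limits<X>::digits
                                    : std::numeric_limits<X>::max_exponent;
}

// Does the max of RequestType fit into ReturnType? Comparing the two max()
// values directly makes Clang-10 warn about rounding, so compare capacities
// instead; use '<' when a float is requested but an integer is returned.
template <typename RequestType, typename ReturnType>
constexpr bool fitsIntoMax() {
  return (!std::is_integral<RequestType>::value && std::is_integral<ReturnType>::value)
             ? capacity<RequestType>() < capacity<ReturnType>()
             : capacity<RequestType>() <= capacity<ReturnType>();
}

static_assert(fitsIntoMax<int8_t, int16_t>(),  "int8 max fits into int16");
static_assert(!fitsIntoMax<int16_t, int8_t>(), "int16 max does not fit into int8");
static_assert(fitsIntoMax<int32_t, float>(),   "int32 max fits into float's range");
static_assert(!fitsIntoMax<float, int32_t>(),  "float max does not fit into int32");

int main() { return 0; }
```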

View File

@ -8,12 +8,22 @@
#include <sstream>
#include <string>
#include <set>
#ifdef __unix__
#if defined(__unix__) || defined(__APPLE__)
#include <unistd.h>
#endif
#include <codecvt>
#include <cwctype>
// MACOS lacks HOST_NAME_MAX
#ifndef HOST_NAME_MAX
# if defined(_POSIX_HOST_NAME_MAX)
# define HOST_NAME_MAX _POSIX_HOST_NAME_MAX
# elif defined(MAXHOSTNAMELEN)
# define HOST_NAME_MAX MAXHOSTNAMELEN
# endif
#endif
namespace marian {
namespace utils {
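
A small usage sketch (not part of the diff) of what the HOST_NAME_MAX fallback above enables: a fixed-size buffer for gethostname() that also compiles on macOS, where the glibc constant is missing but _POSIX_HOST_NAME_MAX (or MAXHOSTNAMELEN) is available.

```cpp
#include <cstdio>
#include <limits.h>   // HOST_NAME_MAX on glibc, _POSIX_HOST_NAME_MAX on POSIX systems
#include <unistd.h>   // gethostname()

// Same fallback chain as in src/common/utils.cpp.
#ifndef HOST_NAME_MAX
#  if defined(_POSIX_HOST_NAME_MAX)
#    define HOST_NAME_MAX _POSIX_HOST_NAME_MAX
#  elif defined(MAXHOSTNAMELEN)
#    define HOST_NAME_MAX MAXHOSTNAMELEN
#  endif
#endif

int main() {
  char hostname[HOST_NAME_MAX + 1] = {0};  // +1 for the terminating NUL
  if (gethostname(hostname, sizeof(hostname)) == 0)
    std::printf("host: %s\n", hostname);
  return 0;
}
```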

View File

@ -26,7 +26,7 @@ public:
virtual void setGuidedAlignment(std::vector<float>&&) = 0;
virtual void setDataWeights(const std::vector<float>&) = 0;
virtual ~Batch() {};
protected:
std::vector<size_t> sentenceIds_;
};

View File

@ -525,6 +525,7 @@ public:
const std::vector<Ptr<Vocab>>& vocabs,
Ptr<Options> options);
virtual ~CorpusBase() {}
virtual std::vector<Ptr<Vocab>>& getVocabs() = 0;
protected:

View File

@ -45,6 +45,7 @@ protected:
public:
// @TODO: choose between 'virtual' and 'final'. Can we derive from this class?
virtual ~DefaultVocab() {};
virtual const std::string& canonicalExtension() const override { return suffixes_[0]; }
virtual const std::vector<std::string>& suffixes() const override { return suffixes_; }
@ -295,7 +296,7 @@ private:
class ClassVocab : public DefaultVocab {
private:
// Do nothing.
virtual void addRequiredVocabulary(const std::string& vocabPath, bool isJson) override { vocabPath; isJson; }
virtual void addRequiredVocabulary(const std::string& /*vocabPath*/, bool /*isJson*/) override {}
// Not adding special class labels, only seen classes.
virtual void create(const std::string& vocabPath,

View File

@ -36,6 +36,8 @@ public:
class ShortlistGenerator {
public:
virtual ~ShortlistGenerator() {}
virtual Ptr<Shortlist> generate(Ptr<data::CorpusBatch> batch) const = 0;
// Writes text version of (possibly) pruned short list to file
@ -129,7 +131,6 @@ private:
Ptr<const Vocab> trgVocab_;
size_t srcIdx_;
size_t trgIdx_;
bool shared_{false};
size_t firstNum_{100};
@ -183,13 +184,12 @@ public:
Ptr<const Vocab> srcVocab,
Ptr<const Vocab> trgVocab,
size_t srcIdx = 0,
size_t trgIdx = 1,
size_t /*trgIdx*/ = 1,
bool shared = false)
: options_(options),
srcVocab_(srcVocab),
trgVocab_(trgVocab),
srcIdx_(srcIdx),
trgIdx_(trgIdx),
shared_(shared) {
std::vector<std::string> vals = options_->get<std::vector<std::string>>("shortlist");
@ -235,7 +235,6 @@ public:
virtual Ptr<Shortlist> generate(Ptr<data::CorpusBatch> batch) const override {
auto srcBatch = (*batch)[srcIdx_];
// auto trgBatch = (*batch)[trgIdx_];
// add firstNum most frequent words
std::unordered_set<WordIndex> indexSet;

View File

@ -37,6 +37,7 @@ public:
typedef SentenceTuple Sample;
TextInput(std::vector<std::string> inputs, std::vector<Ptr<Vocab>> vocabs, Ptr<Options> options);
virtual ~TextInput() {}
Sample next() override;

View File

@ -57,6 +57,7 @@ public:
virtual Word randWord() const {
return Word::fromWordIndex(rand() % size());
}
virtual ~IVocab() {};
};
class Options;

View File

@ -62,6 +62,7 @@ private:
std::vector<Input> inputs_;
public:
std::vector<Input>& inputs() { return inputs_; }
const std::vector<Input>& inputs() const { return inputs_; }
@ -144,6 +145,8 @@ public:
loadData();
}
virtual ~MNISTData(){}
void loadData() override {
ABORT_IF(paths_.size() != 2, "Paths to MNIST data files are not specified");

View File

@ -47,6 +47,8 @@ class MNISTLogsoftmax : public ILogProb {
public:
MNISTLogsoftmax() {}
virtual ~MNISTLogsoftmax(){}
Logits apply(Ptr<IModel> model,
Ptr<ExpressionGraph> graph,
Ptr<data::Batch> batch,
@ -61,13 +63,15 @@ public:
typedef data::MNISTData dataset_type;
template <class... Args>
MnistFeedForwardNet(Ptr<Options> options, Args... args)
MnistFeedForwardNet(Ptr<Options> options, Args... /*args*/)
: options_(options), inference_(options->get<bool>("inference", false)) {}
virtual ~MnistFeedForwardNet(){}
virtual Logits build(Ptr<ExpressionGraph> graph,
Ptr<data::Batch> batch,
bool /*clean*/ = false) override {
return Logits(apply(graph, batch, inference_));
}

View File

@ -19,7 +19,9 @@ public:
builder_ = models::createModelFromOptions(options, models::usage::translation);
}
virtual void keepBest(const std::vector<Ptr<ExpressionGraph>>& graphs) override {
virtual ~MNISTAccuracyValidator(){}
virtual void keepBest(const std::vector<Ptr<ExpressionGraph>>& /*graphs*/) override {
LOG(warn, "Keeping best model for MNIST examples is not supported");
}

View File

@ -7,55 +7,58 @@ namespace marian {
namespace functional {
// General template, will be used for any type without specializations
// and will fail with an abort message.
// and will fail at runtime with an abort message. Note that the
// general template functions don't have named parameters on purpose,
// because clang will warn about unused parameters during compilation.
template <typename T>
struct Ops {
static HOST_DEVICE_INLINE T tanh(const T& x) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T sin(const T& x) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T cos(const T& x) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T tan(const T& x) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T log(const T& x) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T exp(const T& x) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T abs(const T& x) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T sqrt(const T& x) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T neg(const T& x) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T sgn(const T& x) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T tanh(const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T sin(const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T cos(const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T tan(const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T log(const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T exp(const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T abs(const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T sqrt(const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T neg(const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T sgn(const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T add(const T& x, const T& y) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T sub(const T& x, const T& y) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T mul(const T& x, const T& y) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T div(const T& x, const T& y) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T add(const T&, const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T sub(const T&, const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T mul(const T&, const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T div(const T&, const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T max(const T& x, const T& y) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T min(const T& x, const T& y) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T pow(const T& x, const T& y) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T max(const T&, const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T min(const T&, const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T pow(const T&, const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T negate(const T& x) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T eq(const T& x, const T& y) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T neq(const T& x, const T& y) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T gt(const T& x, const T& y) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T lt(const T& x, const T& y) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T geq(const T& x, const T& y) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T leq(const T& x, const T& y) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T _and(const T& x, const T& y) { ABORT("Unknown type"); } // 'and' is used by gcc
static HOST_DEVICE_INLINE T _or(const T& x, const T& y) { ABORT("Unknown type"); } // 'or' is used by gcc
static HOST_DEVICE_INLINE T negate(const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T eq(const T&, const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T neq(const T&, const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T gt(const T&, const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T lt(const T&, const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T geq(const T&, const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T leq(const T&, const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T _and(const T&, const T&) { ABORT("Unknown type"); } // 'and' is used by gcc
static HOST_DEVICE_INLINE T _or(const T&, const T&) { ABORT("Unknown type"); } // 'or' is used by gcc
// Neural Networks specific functions
static HOST_DEVICE_INLINE T sigmoid(const T& x) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T logaddexp(const T& x, const T& y) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T clip(const T& x, const T& y) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T sigmoid(const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T logaddexp(const T&, const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T clip(const T&, const T&) { ABORT("Unknown type"); }
// derivative of Clip, cut-off function
static HOST_DEVICE_INLINE T bump(const T& x, const T& y) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T relu(const T& x) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T reluBack(const T& x) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T prelu(const T& x, const T& y) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T preluBack(const T& x, const T& y) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T bump(const T&, const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T relu(const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T reluBack(const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T prelu(const T&, const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T preluBack(const T&, const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T if_then_else(const T& x, const T& y, const T& z) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T if_then_else(const T&, const T&, const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T sumReduce(const T& x) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T maxReduce(const T& x) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T minReduce(const T& x) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T sumReduce(const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T maxReduce(const T&) { ABORT("Unknown type"); }
static HOST_DEVICE_INLINE T minReduce(const T&) { ABORT("Unknown type"); }
};
// Specialization for float
@ -127,14 +130,14 @@ template <>
struct Ops<double> {
typedef double Single;
static HOST_DEVICE_INLINE double tanh(const double& x) { return tanh(x); }
static HOST_DEVICE_INLINE double sin(const double& x) { return sin(x); }
static HOST_DEVICE_INLINE double cos(const double& x) { return cos(x); }
static HOST_DEVICE_INLINE double tan(const double& x) { return tan(x); }
static HOST_DEVICE_INLINE double log(const double& x) { return log(x); }
static HOST_DEVICE_INLINE double exp(const double& x) { return exp(x); }
static HOST_DEVICE_INLINE double abs(const double& x) { return abs(x); }
static HOST_DEVICE_INLINE double sqrt(const double& x) { return sqrt(x); }
static HOST_DEVICE_INLINE double tanh(const double& x) { return std::tanh(x); }
static HOST_DEVICE_INLINE double sin(const double& x) { return std::sin(x); }
static HOST_DEVICE_INLINE double cos(const double& x) { return std::cos(x); }
static HOST_DEVICE_INLINE double tan(const double& x) { return std::tan(x); }
static HOST_DEVICE_INLINE double log(const double& x) { return std::log(x); }
static HOST_DEVICE_INLINE double exp(const double& x) { return std::exp(x); }
static HOST_DEVICE_INLINE double abs(const double& x) { return std::abs(x); }
static HOST_DEVICE_INLINE double sqrt(const double& x) { return std::sqrt(x); }
static HOST_DEVICE_INLINE double neg(const double& x) { return -x; }
static HOST_DEVICE_INLINE double sgn(const double& x) { return (0 < x) - (x < 0); }
@ -145,7 +148,7 @@ struct Ops<double> {
static HOST_DEVICE_INLINE double max(const double& x, const double& y) { return x < y ? y : x; }
static HOST_DEVICE_INLINE double min(const double& x, const double& y) { return x < y ? x : y; }
static HOST_DEVICE_INLINE double pow(const double& x, const double& y) { return pow(x, y); }
static HOST_DEVICE_INLINE double pow(const double& x, const double& y) { return std::pow(x, y); }
static HOST_DEVICE_INLINE double negate(const double& x) { return !(bool)x; }
@ -460,7 +463,7 @@ struct Ops<half> {
static DEVICE_INLINE half exp(const half& x) { return hexp(x); }
static DEVICE_INLINE half sqrt(const half& x) { return hsqrt(x); }
static DEVICE_INLINE half neg(const half& x) { return -x; }
static DEVICE_INLINE half abs(const half& x) { return fabs((float)x); }// @TODO half has this information somewhere in the struct, right?
static DEVICE_INLINE half sgn(const half& x) { half zero = 0.f; return (zero < x) - (x < zero); } // @TODO half has this information somewhere in the struct, right?
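
A compact illustration (separate from the diff) of the bug the std:: qualification in Ops&lt;double&gt; fixes: inside the struct, an unqualified call to tanh finds the member currently being defined before it ever reaches the global math function, so the old code recursed into itself; qualifying with std:: picks the &lt;cmath&gt; overload.

```cpp
#include <cmath>
#include <cstdio>

struct Ops {
  // BUG (pre-patch pattern): unqualified 'tanh' names this very member, not
  // std::tanh, so the function calls itself until the stack overflows.
  // static double tanh(const double& x) { return tanh(x); }

  // FIX (post-patch pattern): qualify with std:: to get the <cmath> function.
  static double tanh(const double& x) { return std::tanh(x); }
};

int main() {
  std::printf("%f\n", Ops::tanh(1.0));  // ~0.761594
  return 0;
}
```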

View File

@ -40,7 +40,7 @@ protected:
std::string debugMessage_;
Ptr<std::list<Expr>> subtape_; // a subtape is used to keep track of nodes that need to be freed and recomputed with gradient-checkpointing.
bool isCheckpoint_{false}; // true if this node has been selected to be a checkpoint, currently only done manually.
bool isCheckpoint_{false}; // true if this node has been selected to be a checkpoint, currently only done manually.
Ptr<AutoTunerRecorder> recorder_;
size_t recorderHash_;
@ -138,7 +138,7 @@ public:
virtual std::string graphviz() override {
std::stringstream ss;
ss << "\"" << this << "\" ["
ss << "\"" << this << "\" ["
<< "shape=\"" << form() << "\", "
<< "label=" << label() << ", "
<< "style=\"filled\", "
@ -147,7 +147,7 @@ public:
for(auto&& child : children())
ss << "\"" << child << "\" -> \"" << this << "\";" << std::endl;
if(subtape_) {
for(auto&& dep : *subtape_)
ss << "\"" << dep << "\" -> \"" << this << "\" [style=dotted];" << std::endl;
@ -188,9 +188,9 @@ struct NaryNodeOp : public Node {
// Deduce type automatically, but then all types must be the same
// this is called automatically when no output type is specified.
// If the input types are mixed, the output type needs to be specified
// If the input types are mixed, the output type needs to be specified
// in the constructor.
Type commonType(const std::vector<Expr>& nodes) {
static Type commonType(const std::vector<Expr>& nodes) {
ABORT_IF(nodes.size() == 0, "NaryNodeOp has no children");
Type type = nodes[0]->value_type();
for(int i = 1; i < nodes.size(); ++i)

View File

@ -17,9 +17,9 @@ namespace inits {
/**
* Base class for specialized NodeInitializers.
*
* A NodeInitializer is a functor that is associated with parameters
* and constants, and is invoked on a tensor during node intialization.
* You need to override NodeIntializer::apply(Tensor) with your own
* A NodeInitializer is a functor that is associated with parameters
* and constants, and is invoked on a tensor during node intialization.
* You need to override NodeIntializer::apply(Tensor) with your own
* functionality or use a fromLambda intializer.
*
* See node_initializers.cpp for examples.
@ -31,6 +31,7 @@ protected:
public:
virtual void apply(Tensor t) = 0;
void setAllocator(Ptr<Allocator> allocator) { allocator_ = allocator; }
virtual ~NodeInitializer() {}
};
/**
@ -135,7 +136,7 @@ Ptr<NodeInitializer> dropout(float dropoutProbabilty);
/**
* Intialize with gumbel noise, i.e. -log(-log(u)) where u ~ Uniform(0 + eps, 1 - eps)
*
*
* @return A NodeInitializer
*/
Ptr<NodeInitializer> gumbel(float eps = 1e-5f);
@ -163,7 +164,7 @@ Ptr<NodeInitializer> fromWord2vec(const std::string& file,
/**
* Computes Google's Transformer-style sinusoidal position embeddings
* starting from position 'start' taking into account batch and time
* starting from position 'start' taking into account batch and time
* dimensions of the tensor.
*
* Expected tensor layout {-2: time, -1: model}

View File

@ -480,9 +480,12 @@ class CSRDotNodeOp : public NaryNodeOp {
bool transS_;
bool swapOperands_;
public:
CSRDotNodeOp(const Shape& S_shape, Expr S_values, Expr S_indices, Expr S_offsets, Expr D, bool transS, bool swapOperands)
: NaryNodeOp({ S_values, S_indices, S_offsets, D }, newShape(S_shape, S_values, S_indices, S_offsets, D, transS, swapOperands), commonType({S_values, D})),
transS_(transS), swapOperands_(swapOperands) {
CSRDotNodeOp(const Shape& S_shape, Expr S_values, Expr S_indices,
Expr S_offsets, Expr D, bool transS, bool swapOperands)
: NaryNodeOp({ S_values, S_indices, S_offsets, D },
newShape(S_shape, S_values, S_indices, S_offsets, D, transS, swapOperands),
NaryNodeOp::commonType({S_values, D})),
transS_(transS), swapOperands_(swapOperands) {
matchOrAbort<IndexType>(S_indices->value_type());
matchOrAbort<IndexType>(S_offsets->value_type());
}
@ -513,7 +516,7 @@ public:
NodeOps backwardOps() override {
return { nullptr, // can't backprop into the sparse matrix (the gradient is dense)
nullptr,
nullptr,
nullptr,
NodeOp(CSRProd(child(3)->grad(), // child(3) = D
graph()->allocator(),
@ -527,7 +530,7 @@ public:
virtual size_t hash() override {
size_t seed = NaryNodeOp::hash();
for(auto s : shape())
util::hash_combine(seed, s);
util::hash_combine(seed, s);
util::hash_combine(seed, transS_);
util::hash_combine(seed, swapOperands_);
return seed;
@ -1050,8 +1053,8 @@ struct ConcatenateNodeOp : public NaryNodeOp {
auto checkShape = shape;
for(auto child : nodes) {
checkShape.set(ax_, child->shape()[ax_]); // don't abort on different sizes on axis dim.
ABORT_IF(checkShape != child->shape(),
"Child shapes {} and {} cannot be concatenated along axis {}",
ABORT_IF(checkShape != child->shape(),
"Child shapes {} and {} cannot be concatenated along axis {}",
shape, child->shape(), ax);
sum += child->shape()[ax_];

View File

@ -10,10 +10,10 @@
namespace marian {
// @TODO: Currently an ExpressionGraph only supports one Parameters object and
// @TODO: Currently an ExpressionGraph only supports one Parameters object and
// the type of parameters has to be the inside on Parameters object. This limits
// parameter types to a single chosen type, e.g. only fp32 or only fp16. This should
// be extended to allow multiple sets of parameters.
// be extended to allow multiple sets of parameters.
// The reason here is to be able to efficiently compute updates of whole parameter
// sets of one type.
class Parameters {
@ -40,7 +40,7 @@ public:
LOG(debug, "Created parameter object of type {}", acceptedElementType_);
}
~Parameters() {
virtual ~Parameters() {
LOG(debug, "Destroyed parameter object of type {}", acceptedElementType_);
}
@ -88,7 +88,7 @@ public:
// sort parameters by name before allocation to make sure the memory layout after allocation is always the same
std::sort(params_.begin(), params_.end(), [](Expr n1, Expr n2){ return n1->name() < n2->name(); });
for(auto p : params_) {
if(!p->val()) {
vals_->allocate(p->val(), p->shape(), p->value_type());

View File

@ -39,6 +39,7 @@ public:
// Simplest layer interface: Unary function
struct IUnaryLayer {
virtual ~IUnaryLayer() {}
virtual Expr apply(Expr) = 0;
virtual Expr apply(const std::vector<Expr>& es) {
ABORT_IF(es.size() > 1, "Not implemented"); // simple stub
@ -59,6 +60,7 @@ struct IEmbeddingLayer {
// alternative from indices directly
virtual Expr applyIndices(const std::vector<WordIndex>& embIdx, const Shape& shape) const = 0;
virtual ~IEmbeddingLayer() {}
};
// base class for Encoder and Decoder classes, which have embeddings and a batch index (=stream index)

View File

@ -5,14 +5,14 @@
namespace marian {
static inline RationalLoss guidedAlignmentCost(Ptr<ExpressionGraph> graph,
static inline RationalLoss guidedAlignmentCost(Ptr<ExpressionGraph> /*graph*/,
Ptr<data::CorpusBatch> batch,
Ptr<Options> options,
Expr attention) { // [beam depth=1, max src length, batch size, tgt length]
std::string guidedLossType = options->get<std::string>("guided-alignment-cost"); // @TODO: change "cost" to "loss"
float guidedLossWeight = options->get<float>("guided-alignment-weight");
const auto& shape = attention->shape(); // [beam depth=1, max src length, batch size, tgt length]
float epsilon = 1e-6f;
Expr alignmentLoss; // sum up loss over all attention/alignment positions
@ -55,8 +55,8 @@ static inline RationalLoss guidedAlignmentCost(Ptr<ExpressionGraph> graph,
else
ABORT("Unknown alignment cost type: {}", guidedLossType);
// every position is a label as they should all agree
// @TODO: there should be positional masking here ... on the other hand, positions that are not
// in a sentence should always agree (both being 0). Lack of masking affects label count only which is
// @TODO: there should be positional masking here ... on the other hand, positions that are not
// in a sentence should always agree (both being 0). Lack of masking affects label count only which is
// probably negligible?
numLabels = shape.elements();
}

View File

@ -331,6 +331,7 @@ public:
: LabelwiseLoss(axes), // cross-entropy already reduces over axis -1
labelSmoothing_(labelSmoothing), factorWeight_(factorWeight) {}
virtual ~CrossEntropyLoss() {}
protected:
float labelSmoothing_; // interpolation factor for label smoothing, see below
float factorWeight_; // give extra weight to factors
@ -368,7 +369,7 @@ protected:
if(labelWeights) {
// We currently do not know how to use target factors and word-level label weights together
bool wordlevel = labelWeights->shape()[-3] > 1; // Time-dimension is not trivially 1, hence we have word-level weights.
bool wordlevel = labelWeights->shape()[-3] > 1; // Time-dimension is not trivially 1, hence we have word-level weights.
ABORT_IF(wordlevel && logits.getNumFactorGroups() > 1, "CE loss with word-level label weights is not implemented for factors");
ce = ce * cast(labelWeights, Type::float32);
}
@ -379,15 +380,15 @@ protected:
/**
* @brief Unlikelihood loss across last axis, summed up over batch and time dimensions. This is an
* implementation of sequence-level unlikelihood loss from https://arxiv.org/abs/1908.04319.
* @brief Unlikelihood loss across last axis, summed up over batch and time dimensions. This is an
* implementation of sequence-level unlikelihood loss from https://arxiv.org/abs/1908.04319.
* We rely on word-level label weights where 1 is correct and 0 is marking an error. If there are not
* zeros for a sentence it going to be trained with normal CE loss if there is at least one 0 it is going
* to flip over to use SUL for that sentence to penalize the selected word.
*
*
* SUL is implemented as:
* -log(gather(1 - softmax(logits), -1, indices))
*
*
* Factors are currently not supported.
*/
class SequenceUnlikelihoodLoss : public CrossEntropyLoss {
@ -411,17 +412,17 @@ protected:
ABORT_IF(!mask, "mask is required"); // @TODO: check this, it seems weights for padding are by default 1, which would make this obsolete.
// use label weights, where 1 is GOOD and 0 is BAD. After inversion here, now 1 marks, mask again to eliminate padding (might be obsolete)
auto errorMask = (1.f - cast(labelWeights, Type::float32)) * cast(mask, Type::float32);
auto ceUl = logits.applyLossFunction(labels, [&](Expr logits, Expr indices) {
return cast(unlikelihood(logits, indices), Type::float32);
});
// compute if want to use CE or UL. If there are no errors train with CE, otherwise train _only on_ the errors with UL. This is the "mixed" training
// schedule from https://arxiv.org/abs/1908.04319. Providing labels with or without error scores we can easily switch between CE and UL.
// schedule from https://arxiv.org/abs/1908.04319. Providing labels with or without error scores we can easily switch between CE and UL.
auto onlyCe = eq(sum(errorMask, /*axis=*/-3), 0.f); // [1, 1, dimBatch, 1] - equal 1 if no errors are present
ceUl = errorMask * ceUl; // don't use for correct label or padding
auto cost = onlyCe * ce + (1.f - onlyCe) * ceUl; // ce or unlikelihood part are never simultanously used as cost per batch entry
auto cost = onlyCe * ce + (1.f - onlyCe) * ceUl; // ce or unlikelihood part are never simultanously used as cost per batch entry
return cost;
}

View File

@ -17,6 +17,7 @@ public:
virtual void debugWeighting(std::vector<float> /*weightedMask*/,
std::vector<float> /*freqMask*/,
Ptr<data::CorpusBatch> /*batch*/){};
virtual ~WeightingBase() {}
};
class DataWeighting : public WeightingBase {

View File

@ -41,6 +41,7 @@ class VocabWrapper : public IVocabWrapper {
Ptr<Vocab> pImpl_;
public:
VocabWrapper(Ptr<Vocab> vocab) : pImpl_(vocab) {}
virtual ~VocabWrapper() {}
WordIndex encode(const std::string& word) const override { return (*pImpl_)[word].toWordIndex(); }
std::string decode(WordIndex id) const override { return (*pImpl_)[Word::fromWordIndex(id)]; }
size_t size() const override { return pImpl_->size(); }
@ -243,7 +244,7 @@ DecoderCpuAvxVersion parseCpuAvxVersion(std::string name) {
}
}
// @TODO: clean-up this code and unify with marian-conv. The targetPrec parameter is not clear enought etc.
// @TODO: clean-up this code and unify with marian-conv. The targetPrec parameter is not clear enought etc.
bool convertModel(std::string inputFile, std::string outputFile, int32_t targetPrec) {
std::cout << "Converting from: " << inputFile << ", to: " << outputFile << std::endl;

View File

@ -54,6 +54,8 @@ public:
const std::vector<const void*>& ptrs)
: options_(options), ptrs_(ptrs) {}
virtual ~IBeamSearchDecoder() {}
virtual QSNBestBatch decode(const QSBatch& qsBatch,
size_t maxLength,
const std::unordered_set<WordIndex>& shortlist)

View File

@ -25,6 +25,7 @@ public:
Ptr<ExpressionGraph> graph, // @TODO: why needed? Can it be gotten from model?
Ptr<data::Batch> batch,
bool clearGraph = true) = 0;
virtual ~ICost() {}
};
class EncoderDecoderCECost : public ICost {
@ -51,6 +52,8 @@ public:
weighter_ = WeightingFactory(options_);
}
virtual ~EncoderDecoderCECost() {}
Ptr<MultiRationalLoss> apply(Ptr<IModel> model,
Ptr<ExpressionGraph> graph,
Ptr<data::Batch> batch,
@ -136,6 +139,8 @@ public:
Trainer(Ptr<IModel> model, Ptr<ICost> cost)
: model_(model), cost_(cost) {}
virtual ~Trainer() {}
Ptr<IModel> getModel() { return model_; }
virtual void load(Ptr<ExpressionGraph> graph,
@ -179,6 +184,8 @@ public:
Scorer(Ptr<IModel> model, Ptr<ILogProb> cost)
: model_(model), logProb_(cost) {}
virtual ~Scorer(){}
Ptr<IModel> getModel() { return model_; }
virtual void load(Ptr<ExpressionGraph> graph,
@ -211,6 +218,7 @@ public:
class LogSoftmaxStep : public ILogProbStep {
public:
virtual ~LogSoftmaxStep() {}
virtual Ptr<DecoderState> apply(Ptr<DecoderState> state) override {
// decoder needs normalized probabilities (note: skipped if beam 1 and --skip-cost)
state->setLogProbs(state->getLogProbs().applyUnaryFunction(logsoftmax));
@ -224,6 +232,7 @@ public:
// with --output-sampling during translation with marian-decoder
class GumbelSoftmaxStep : public ILogProbStep {
public:
virtual ~GumbelSoftmaxStep() {}
virtual Ptr<DecoderState> apply(Ptr<DecoderState> state) override {
state->setLogProbs(state->getLogProbs().applyUnaryFunctions(
[](Expr logits){ // lemma gets gumbelled

View File

@ -11,6 +11,7 @@ namespace marian {
class IEncoderDecoder : public models::IModel {
public:
virtual ~IEncoderDecoder() {}
virtual void load(Ptr<ExpressionGraph> graph,
const std::string& name,
bool markedReloaded = true) override

View File

@ -41,6 +41,8 @@ public:
// @TODO: Is there a better name?
class ICriterionFunction {
public:
virtual ~ICriterionFunction() {}
virtual void load(Ptr<ExpressionGraph>,
const std::string&,
bool markReloaded = true)

View File

@ -5,10 +5,12 @@
namespace marian {
struct ModelTask {
virtual ~ModelTask() {}
virtual void run() = 0;
};
struct ModelServiceTask {
virtual ~ModelServiceTask() {}
virtual std::string run(const std::string&) = 0;
};
} // namespace marian

View File

@ -11,6 +11,7 @@ namespace marian {
class EncoderS2S : public EncoderBase {
using EncoderBase::EncoderBase;
public:
virtual ~EncoderS2S() {}
Expr applyEncoderRNN(Ptr<ExpressionGraph> graph,
Expr embeddings,
Expr mask,
@ -254,7 +255,7 @@ public:
auto embeddings = state->getTargetHistoryEmbeddings();
// The batch dimension of the inputs can change due to batch-pruning, in that case
// cached elements need to be rebuilt, in this case the mapped encoder context in the
// cached elements need to be rebuilt, in this case the mapped encoder context in the
// attention mechanism of the decoder RNN.
int currDimBatch = embeddings->shape()[-2];
if(!rnn_ || lastDimBatch_ != currDimBatch) // if currDimBatch is different, rebuild the cached RNN
@ -263,7 +264,7 @@ public:
// Also @TODO: maybe implement a Cached(build, updateIf) that runs a check and rebuild if required
// at dereferecing :
// rnn_ = Cached<decltype(constructDecoderRNN(graph, state))>(
// /*build=*/[]{ return constructDecoderRNN(graph, state); },
// /*build=*/[]{ return constructDecoderRNN(graph, state); },
// /*updateIf=*/[]{ return state->batchDimChanged() });
// rnn_->transduce(...);

View File

@ -17,6 +17,7 @@ public:
: context_(context), mask_(mask), batch_(batch) {}
EncoderState() {}
virtual ~EncoderState() {}
virtual Expr getContext() const { return context_; }
virtual Expr getAttended() const { return context_; }
@ -53,6 +54,7 @@ public:
const std::vector<Ptr<EncoderState>>& encStates,
Ptr<data::CorpusBatch> batch)
: states_(states), logProbs_(logProbs), encStates_(encStates), batch_(batch) {}
virtual ~DecoderState() {}
// @TODO: Do we need all these to be virtual?
virtual const std::vector<Ptr<EncoderState>>& getEncoderStates() const {
@ -68,10 +70,10 @@ public:
int beamSize) const {
std::vector<Ptr<EncoderState>> newEncStates;
for(auto& es : encStates_)
for(auto& es : encStates_)
// If the size of the batch dimension of the encoder state context changed, subselect the correct batch entries
newEncStates.push_back(es->getContext()->shape()[-2] == batchIndices.size() ? es : es->select(batchIndices));
// hypindices matches batchIndices in terms of batch dimension, so we only need hypIndices
auto selectedState = New<DecoderState>(
states_.select(hypIndices, beamSize, /*isBatchMajor=*/false), logProbs_, newEncStates, batch_);
@ -121,6 +123,7 @@ private:
Words targetWords_;
public:
virtual ~ClassifierState() {}
virtual Expr getLogProbs() const { return logProbs_; }
virtual void setLogProbs(Expr logProbs) { logProbs_ = logProbs; }

View File

@ -16,6 +16,7 @@ namespace marian {
class ClipperBase {
public:
virtual void clip(Tensor) = 0;
virtual ~ClipperBase() {}
};
typedef std::shared_ptr<ClipperBase> ClipperPtr;

View File

@ -29,6 +29,8 @@ public:
LOG(info, "[optimizers] Learning rate gets automatically adjusted as if minibatch size was {}", refMBWordsParam_);
}
virtual ~OptimizerBase() {}
static constexpr size_t mbSizeNotProvided = SIZE_MAX;
void update(Ptr<ExpressionGraph> graph, size_t mbSize = mbSizeNotProvided) {
@ -114,7 +116,7 @@ class Sgd : public OptimizerBase {
public:
Sgd(float eta, size_t refMBWordsParam = 0, Ptr<ClipperBase> clipper = nullptr)
: OptimizerBase(eta, refMBWordsParam, clipper) {}
virtual ~Sgd() {}
virtual void setParams(const std::vector<float>& /*params*/) override {}
private:
void updateImpl(Tensor params, Tensor grads, size_t actualMBSize, size_t refMBWords) override;

View File

@ -13,6 +13,7 @@ namespace marian {
class ScoreCollector {
public:
ScoreCollector(const Ptr<Options>& options);
virtual ~ScoreCollector() {}
virtual void Write(long id, const std::string& message);
virtual void Write(long id,

View File

@ -35,7 +35,7 @@ protected:
public:
BaseRNN(Ptr<ExpressionGraph> graph, Ptr<Options> options)
: graph_(graph), options_(options) {}
virtual ~BaseRNN() {}
virtual Expr transduce(Expr, Expr = nullptr) = 0;
virtual Expr transduce(Expr, State, Expr = nullptr) = 0;
virtual Expr transduce(Expr, States, Expr = nullptr) = 0;
@ -113,6 +113,7 @@ private:
public:
friend RNN;
virtual ~SingleLayerRNN() {}
// @TODO: benchmark whether this concatenation is a good idea
virtual Expr transduce(Expr input, Expr mask = nullptr) override {

View File

@ -17,7 +17,7 @@ protected:
public:
Backend(DeviceId deviceId, size_t seed)
: deviceId_(deviceId), seed_(seed), randomGenerator_(createRandomGenerator(seed, deviceId)) {}
virtual ~Backend() {};
virtual DeviceId getDeviceId() { return deviceId_; };
virtual Ptr<RandomGenerator> getRandomGenerator() { return randomGenerator_; }

View File

@ -8,29 +8,40 @@
namespace marian {
namespace cpu {
namespace {
// allocate function for tensor reserve() below.
// Needed for AVX512, while not available on all compilers. It seems clang
// does not have aligned_alloc for all cstlib versions. If AVX512 is not used
// a simple malloc is probably fine.
// Should generate a runtime error otherwise as we have a check in the AVX512
// functions which tests for alignment.
#ifdef _WIN32
#define MALLOC(size) _aligned_malloc(size, alignment_)
#elif __GNUC__
#define MALLOC(size) aligned_alloc(alignment_, size)
#else
#define MALLOC(size) malloc(size)
#endif
// Alignment is needed because we use AVX512 and AVX2 vectors. We should fail if we can't allocate aligned memory.
#ifdef _WIN32
#define FREE(ptr) _aligned_free(ptr)
void *genericMalloc(size_t alignment, size_t size) {
void *ret = _aligned_malloc(size, alignment);
ABORT_IF(!ret, "Failed to allocate memory on CPU");
return ret;
}
void genericFree(void *ptr) {
_aligned_free(ptr);
}
#else
#define FREE(ptr) free(ptr)
// Linux and OS X. There is no fallback to malloc because we need it to be aligned.
void *genericMalloc(size_t alignment, size_t size) {
// On macOS, aligned_alloc is available only with C++17.
// Furthermore, it requires that the requested size is an exact multiple of the alignment, otherwise it fails.
// posix_memalign is available on both Mac (since 2016) and Linux, and in both gcc and clang.
void *result;
// An error can be detected via the return value, or by result simply remaining nullptr.
ABORT_IF(posix_memalign(&result, alignment, size), "Failed to allocate memory on CPU");
return result;
}
void genericFree(void *ptr) {
free(ptr);
}
#endif
} // namespace
Device::~Device() {
FREE(data_);
genericFree(data_);
}
void Device::reserve(size_t size) {
@ -38,14 +49,12 @@ void Device::reserve(size_t size) {
ABORT_IF(size < size_ || size == 0,
"New size must be larger than old size and larger than 0");
uint8_t *temp = static_cast<uint8_t*>(genericMalloc(alignment_, size));
if(data_) {
uint8_t *temp = static_cast<uint8_t*>(MALLOC(size));
std::copy(data_, data_ + size_, temp);
FREE(data_);
data_ = temp;
} else {
data_ = static_cast<uint8_t*>(MALLOC(size));
genericFree(data_);
}
data_ = temp;
size_ = size;
}
} // namespace cpu
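For illustration only (this sketch is not part of the commit), the aligned-allocation pattern described in the comments above, using posix_memalign with a hard failure instead of a malloc fallback, can be reproduced in a few standalone lines; the 512-byte alignment is an arbitrary example value:

// Sketch: aligned allocation that aborts on failure, mirroring the genericMalloc/genericFree pattern.
// Assumes a POSIX system (Linux or macOS).
#include <stdio.h>    // fprintf
#include <stdlib.h>   // posix_memalign, free, abort

void* alignedAlloc(size_t alignment, size_t size) {
  void* result = nullptr;
  // posix_memalign reports failure through its non-zero return value.
  if (posix_memalign(&result, alignment, size) != 0) {
    fprintf(stderr, "Failed to allocate %zu aligned bytes\n", size);
    abort();
  }
  return result;
}

int main() {
  void* p = alignedAlloc(512, 1024);  // example alignment large enough for AVX-512 vectors
  free(p);                            // memory from posix_memalign is released with plain free()
  return 0;
}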

View File

@ -17,6 +17,7 @@
#endif
using namespace fbgemm;
// @TODO: don't use using namespace ...; in header files. Just don't. [UG]
#endif // USE_FBGEMM
namespace marian {
@ -96,7 +97,7 @@ struct FbgemmPacked16PackNodeOp : public UnaryNodeOp {
const std::string type() override { return "packMatFp16"; }
Shape newShape(Expr a, bool transpose) {
Shape newShape(Expr MAYBE_UNUSED a, bool MAYBE_UNUSED transpose) {
#if USE_FBGEMM
auto shapeMat = a->shape();
// Should be 2D - weight matrix
@ -115,15 +116,14 @@ struct FbgemmPacked16PackNodeOp : public UnaryNodeOp {
packsize_);
Shape outShape({(int)packsize_});
return outShape;
#else // USE_FBGEMM
#else
ABORT("Packed GEMM requires a build with USE_FBGEMM enabled");
return Shape();
#endif // USE_FBGEMM
}
};
;
// Pack a matrix (int8) in a cache-utilization-efficient way (block format), together with quantization into int8
// PackMatrix packMat_: the type of packed matrix - A or B matrix
// marian::Type packType_: the type the input matrix is packed into - packed8avx2 or packed8avx512
@ -132,6 +132,7 @@ struct FbgemmPacked16PackNodeOp : public UnaryNodeOp {
// int ncol_: the number of columns
// uint64_t packsize_: the size of the packed matrix
// (the size of int8 packed B from fbgemm::PackAWithQuantRowOffset + quantization scale, offset and zero point)
struct FbgemmPacked8PackNodeOp : public UnaryNodeOp {
PackMatrix packMat_;
marian::Type packType_;
@ -180,19 +181,21 @@ struct FbgemmPacked8PackNodeOp : public UnaryNodeOp {
const std::string type() override { return "packMatInt8"; }
Shape newShape(Expr a, bool transpose) {
#if USE_FBGEMM
Shape newShape(Expr a, bool transpose) {
fbgemmPacked8PackInfo(a->shape(), packType_, transpose, nrow_, ncol_, packsize_);
Shape outShape({(int)packsize_});
return outShape;
#else // USE_FBGEMM
}
#else
Shape newShape(Expr /*a*/, bool /*transpose*/) {
ABORT("Packed GEMM requires a build with USE_FBGEMM enabled");
return Shape();
#endif // USE_FBGEMM
}
#endif // USE_FBGEMM
};
// Affine transform (matrix multiplication) using packed B matrix
// float scalar_: scalar multiplier
// size_t m_: the number of rows in A and C
@ -202,7 +205,6 @@ struct FbgemmPacked8PackNodeOp : public UnaryNodeOp {
// bool transB_: transpose B
class FbgemmPacked16AffineNodeOp : public NaryNodeOp {
private:
float scalar_;
size_t m_;
size_t n_;
size_t k_;
@ -210,9 +212,8 @@ private:
bool transB_;
public:
FbgemmPacked16AffineNodeOp(const std::vector<Expr>& nodes, Shape bShape, bool transA, bool transB, float scalar)
: NaryNodeOp(nodes, newShape(nodes[0], bShape, transA, transB), Type::float32),
scalar_(scalar) {
FbgemmPacked16AffineNodeOp(const std::vector<Expr>& nodes, Shape bShape, bool transA, bool transB, float /*scalar*/)
: NaryNodeOp(nodes, newShape(nodes[0], bShape, transA, transB), Type::float32)/*, scalar_(scalar)*/ {
transA_ = transA;
transB_ = transB;
m_ = nodes[0]->shape().elements() / nodes[0]->shape()[-1];
@ -281,7 +282,6 @@ public:
// bool transB_: transpose B
class FbgemmPacked8AffineNodeOp : public NaryNodeOp {
private:
float scalar_;
size_t m_;
size_t n_;
size_t k_;
@ -289,9 +289,8 @@ private:
bool transB_;
public:
FbgemmPacked8AffineNodeOp(const std::vector<Expr>& nodes, Shape bShape, bool transA, bool transB, float scalar)
: NaryNodeOp(nodes, newShape(nodes[0], bShape, transA, transB), Type::float32),
scalar_(scalar) {
FbgemmPacked8AffineNodeOp(const std::vector<Expr>& nodes, Shape bShape, bool transA, bool transB, float /*scalar*/)
: NaryNodeOp(nodes, newShape(nodes[0], bShape, transA, transB), Type::float32)/*, scalar_(scalar) */ {
transA_ = transA;
transB_ = transB;
m_ = nodes[0]->shape().elements() / nodes[0]->shape()[-1];
@ -302,7 +301,7 @@ public:
size_t l = bShape.elements() / bShape[-1];
n_ = bShape[-1];
if(transB)
std::swap(l, n_);
std::swap(l, n_);
}
Shape newShape(Expr a, Shape bShape, bool transA, bool transB) {
@ -369,9 +368,9 @@ static inline Expr affine(Expr a, Expr b, Shape bShape, Expr c, bool transA, boo
Type elementType = b->value_type();
if (elementType == Type::packed16)
return Expression<cpu::variant::FbgemmPacked16AffineNodeOp>(nodes, bShape, transA, transB, scalar);
return Expression<FbgemmPacked16AffineNodeOp>(nodes, bShape, transA, transB, scalar);
else if (isPacked(elementType) && sizeOf(elementType) == 1)
return Expression<cpu::variant::FbgemmPacked8AffineNodeOp>(nodes, bShape, transA, transB, scalar);
return Expression<FbgemmPacked8AffineNodeOp>(nodes, bShape, transA, transB, scalar);
else {
ABORT("Only int8 and fp16 are available. {}", elementType);
return nullptr;
@ -380,9 +379,9 @@ static inline Expr affine(Expr a, Expr b, Shape bShape, Expr c, bool transA, boo
static inline Expr pack(Type elementType, Expr a, PackMatrix packMat, bool transpose, float clipValue) {
if (elementType == Type::packed16)
return Expression<cpu::variant::FbgemmPacked16PackNodeOp>(a, packMat, transpose, clipValue);
return Expression<FbgemmPacked16PackNodeOp>(a, packMat, transpose, clipValue);
else if (isPacked(elementType) && sizeOf(elementType) == 1)
return Expression<cpu::variant::FbgemmPacked8PackNodeOp>(a, packMat, elementType, transpose, clipValue);
return Expression<FbgemmPacked8PackNodeOp>(a, packMat, elementType, transpose, clipValue);
else {
ABORT("Only int8 and fp16 are available. {}", elementType);
return nullptr;
@ -394,9 +393,9 @@ static inline Expr dot(Expr a, Expr b, Shape bShape, bool transA, bool transB, f
Type elementType = b->value_type();
if (elementType == Type::packed16)
return Expression<cpu::variant::FbgemmPacked16AffineNodeOp>(nodes, bShape, transA, transB, scalar);
return Expression<FbgemmPacked16AffineNodeOp>(nodes, bShape, transA, transB, scalar);
else if (isPacked(elementType) && sizeOf(elementType) == 1)
return Expression<cpu::variant::FbgemmPacked8AffineNodeOp>(nodes, bShape, transA, transB, scalar);
return Expression<FbgemmPacked8AffineNodeOp>(nodes, bShape, transA, transB, scalar);
else {
ABORT("Only int8 and fp16 are available. {}", elementType);
return nullptr;
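The comments above note that the int8-packed B matrix carries a quantization scale, offset and zero point alongside the packed weights. As a rough, fbgemm-independent illustration of per-tensor affine quantization (generic names and formula, not Marian's or fbgemm's actual code):

// Sketch: per-tensor affine quantization to int8,
// q = round(x / scale) + zeroPoint, with scale chosen so that the
// observed [min, max] range maps onto [-128, 127].
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

struct Quantized {
  std::vector<std::int8_t> data;
  float scale;
  int zeroPoint;
};

Quantized quantize(const std::vector<float>& x) {
  float lo = *std::min_element(x.begin(), x.end());
  float hi = *std::max_element(x.begin(), x.end());
  float scale = (hi - lo) / 255.f;
  if (scale == 0.f) scale = 1.f;                        // avoid division by zero for constant input
  int zeroPoint = (int)std::round(-128.f - lo / scale); // maps lo onto -128
  Quantized q{{}, scale, zeroPoint};
  q.data.reserve(x.size());
  for (float v : x) {
    int iv = (int)std::round(v / scale) + zeroPoint;
    q.data.push_back((std::int8_t)std::max(-128, std::min(127, iv))); // clamp to int8 range
  }
  return q;
}

int main() {
  Quantized q = quantize({-1.f, -0.5f, 0.f, 0.5f, 1.f});
  std::printf("scale=%g zeroPoint=%d first=%d\n", q.scale, q.zeroPoint, (int)q.data[0]);
  return 0;
}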

View File

@ -20,7 +20,7 @@ namespace marian {
namespace cpu {
void IsNaN(const Tensor in, Ptr<Allocator> allocator, bool& /*isNaN*/, bool& /*isInf*/) {
void IsNaN(const Tensor /*in*/, Ptr<Allocator> /*allocator*/, bool& /*isNaN*/, bool& /*isInf*/) {
ABORT("Not implemented");
}
@ -214,9 +214,11 @@ void Transpose0213(Tensor out, Tensor in) {
}
}
// This function is called only when MKL is available.
#if MKL_FOUND
// Given a 4D array, transpose (swap) the initial 3 dimensions while keeping the last dimension.
// e.g. 1234 --> 2134, 1234 --> 3214 (4 is always kept).
// This is an optimized version for swapping first 3 dimensions
// This is an optimized version for swapping first 3 dimensions
// assuming the last dimension is large enough to get benefits from vectorized copy.
//
// @param out output tensor
@ -225,14 +227,13 @@ void Transpose0213(Tensor out, Tensor in) {
template <bool add>
void TransposeFirst3In4(Tensor out, Tensor in, const std::vector<int>& vAxis) {
ABORT_IF(vAxis.size() != 4, "This function handles only 4D arrays.");
#if MKL_FOUND
int innermost = in->shape()[-1];
int l1 = in->shape()[vAxis[0]];
int l2 = in->shape()[vAxis[1]];
int l3 = in->shape()[vAxis[2]];
// find the mapping between the transposed output dimensional indices (oi, oj, ok)
// find the mapping between the transposed output dimensional indices (oi, oj, ok)
// and original input dimensional indices (i, j, k)
int oi, oj, ok;
#pragma omp parallel for
@ -275,11 +276,8 @@ void TransposeFirst3In4(Tensor out, Tensor in, const std::vector<int>& vAxis) {
}
}
}
#else
// It should never get here. This function is called only when MKL is available.
ABORT("Should not get here");
#endif // MKL_FOUND
}
#endif // MKL_FOUND
inline void transpose4x4_SSE(const float* A,
float* B,
@ -656,7 +654,7 @@ void SelectAxis2(Tensor out,
functional::Shape outShape = out->shape();
functional::Shape inShape = in->shape();
auto idxData = indices->data<IndexType>();
auto odata = out->data();
const auto idata = in->data();
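As a plain, non-vectorized illustration of the output-to-input index mapping described for TransposeFirst3In4 above (this is not the MKL-backed implementation; the dimension handling and raw-pointer layout are assumptions for illustration only):

// Sketch: permute the first three axes of a row-major [d0 x d1 x d2 x d3] array
// according to perm (a permutation of {0,1,2}), keeping the innermost axis d3
// intact and copying it as one contiguous, easily vectorized block.
#include <cstring>

void transposeFirst3(const float* in, float* out, const int dims[4], const int perm[3]) {
  const int d3 = dims[3];
  const int odims[3] = {dims[perm[0]], dims[perm[1]], dims[perm[2]]};
  for (int oi = 0; oi < odims[0]; ++oi)
    for (int oj = 0; oj < odims[1]; ++oj)
      for (int ok = 0; ok < odims[2]; ++ok) {
        // Map output indices (oi, oj, ok) back to input indices along axes 0..2.
        int in_idx[3];
        in_idx[perm[0]] = oi;
        in_idx[perm[1]] = oj;
        in_idx[perm[2]] = ok;
        const float* src = in  + ((long)(in_idx[0] * dims[1] + in_idx[1]) * dims[2] + in_idx[2]) * d3;
        float*       dst = out + ((long)(oi * odims[1] + oj) * odims[2] + ok) * d3;
        std::memcpy(dst, src, d3 * sizeof(float));  // contiguous copy of the last dimension
      }
}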

View File

@ -15,11 +15,11 @@ protected:
public:
RandomGenerator(size_t seed) : seed_(seed) { }
virtual ~RandomGenerator() {}
virtual void uniform(Tensor, float a, float b) = 0;
virtual void normal(Tensor, float mean, float stddev) = 0;
};
Ptr<RandomGenerator> createRandomGenerator(size_t /*seed*/, DeviceId);
}
}

View File

@ -25,7 +25,7 @@
namespace marian {
template <typename InIt, typename OutIt>
void copy(Ptr<Backend> backend, const InIt beg, const InIt end, OutIt it) {
void copy(Ptr<Backend>& MAYBE_UNUSED backend, const InIt beg, const InIt end, OutIt it) {
#ifdef CUDA_FOUND
if(backend->getDeviceId().type == DeviceType::gpu)
gpu::copy(backend, beg, end, it);
@ -119,7 +119,7 @@ DISPATCH3(Concatenate, marian::Tensor, const std::vector<marian::Tensor>&, int)
// clang-format on
// Bernoulli(tensor, 0.5f, 2.f, -1.f) generates a tensor composed of 50% of 1 and 50% of -1.
// Bernoulli(tensor, 0.5f, 2.f, -1.f) generates a tensor composed of 50% of 1 and 50% of -1.
static inline void Bernoulli(Tensor resultTensor, float keepProb, float scale = 1.f, float shift = 0.f) {
// in-place uniform distribution
auto rnd = resultTensor->getBackend()->getRandomGenerator();
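To make the scale/shift convention in the Bernoulli comment above concrete, a scalar sketch of the same idea (plain C++ rather than the tensor API) could be:

// Sketch: draw keep/drop decisions with probability keepProb, then apply scale and shift.
// With keepProb=0.5, scale=2, shift=-1 the result is +1 or -1 with equal probability.
#include <cstddef>
#include <cstdio>
#include <random>
#include <vector>

std::vector<float> bernoulli(std::size_t n, float keepProb, float scale, float shift, unsigned seed = 42) {
  std::mt19937 rng(seed);
  std::uniform_real_distribution<float> uniform(0.f, 1.f);
  std::vector<float> out(n);
  for (auto& v : out)
    v = (uniform(rng) < keepProb ? 1.f : 0.f) * scale + shift;
  return out;
}

int main() {
  for (float v : bernoulli(8, 0.5f, 2.f, -1.f))
    std::printf("%g ", v);
  std::printf("\n");
  return 0;
}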
@ -190,7 +190,7 @@ void LayerNormalizationGrad(Tensor gradX,
}
static inline void LayerNormalizationGrad(
Ptr<Allocator> allocator,
Ptr<Allocator> MAYBE_UNUSED allocator,
Tensor gradX,
Tensor gradGamma,
Tensor gradBeta,

View File

@ -1,7 +1,7 @@
#include "marian.h"
#include "common/timer.h"
int main(int argc, char** argv) {
int main(int /*argc*/, char** /*argv*/) {
using namespace marian;
{

View File

@ -8,6 +8,8 @@
#include <fstream>
int main(int argc, char** argv) {
ABORT_IF(argc != 3, "FATAL ERROR: Incorrect number of command line arguments "
"(expected: 2) for command {}.",argv[0]);
SQLite::Database db("corpus.db", SQLite::OPEN_READWRITE|SQLite::OPEN_CREATE);
db.exec("PRAGMA temp_store_directory = '/data1/marcinjd';");

View File

@ -38,7 +38,7 @@ Ptr<ICommunicator> createCommunicator(
}
// the actual implementation is inside communicator.cu
return New<NCCLCommunicator>(graphs, mpi);
return New<NCCLCommunicator>(graphs, mpi);
#else // no CUDA or no NCCL
noNccl; // (unused)
return New<DefaultCommunicator>(graphs, mpi);
@ -141,7 +141,7 @@ public:
FakeMPIWrapper(bool) {
LOG(warn, "Compiled without MPI support. Falling back to FakeMPIWrapper");
}
virtual ~FakeMPIWrapper() {}
virtual size_t myMPIRank() const override { return 0; };
virtual size_t numMPIProcesses() const override { return 1; };

View File

@ -156,11 +156,8 @@ public:
void scatterReduceAndResetGrads() const override {
const_cast<DefaultCommunicator*>(this)->lazyInit();
int totalSize = (int)graphs_[0]->params()->vals()->size();
int shardSize = (int)ceil(totalSize / (float)graphs_.size());
// Gather gradients from different devices into current gradient shards
auto scatter = [this, shardSize](size_t idx, size_t begin, size_t end) {
auto scatter = [this](size_t idx, size_t begin, size_t end) {
auto curGrad = graphs_[idx]->params()->grads()->subtensor(begin, end-begin);
// collect and sum gradients
@ -176,7 +173,7 @@ public:
};
// reset gradients outside current shard
auto reset = [this, shardSize](size_t idx, size_t begin, size_t end) {
auto reset = [this](size_t idx, size_t begin, size_t end) {
auto grad = graphs_[idx]->params()->grads();
if (begin > 0)
grad->subtensor(0, begin)->set(0);
@ -189,11 +186,9 @@ public:
}
void allGatherParams() const override {
int totalSize = (int)graphs_[0]->params()->vals()->size();
int shardSize = (int)ceil(totalSize / (float)graphs_.size());
// Update all graphs with parameter shard
auto gather = [this, shardSize](size_t idx, size_t begin, size_t end) {
auto gather = [this](size_t idx, size_t begin, size_t end) {
auto getShard = [&](Ptr<ExpressionGraph> graph) {
return graph->params()->vals()->subtensor(begin, end-begin);
};
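The scatter/gather/reset lambdas above each operate on a [begin, end) shard of the flat parameter or gradient vector. As a rough illustration (not the actual Marian code), the shard boundaries for device idx out of numDevices devices could be derived as follows:

// Sketch: split totalSize elements into numDevices shards of (roughly) equal size;
// the last shard may be smaller when totalSize is not evenly divisible.
#include <algorithm>
#include <cmath>
#include <cstdio>

void shardRange(int totalSize, int numDevices, int idx, int& begin, int& end) {
  int shardSize = (int)std::ceil(totalSize / (float)numDevices);
  begin = std::min(idx * shardSize, totalSize);
  end   = std::min(begin + shardSize, totalSize);
}

int main() {
  int begin, end;
  for (int idx = 0; idx < 4; ++idx) {
    shardRange(10, 4, idx, begin, end);
    std::printf("device %d -> [%d, %d)\n", idx, begin, end);
  }
  return 0;
}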

View File

@ -118,7 +118,7 @@ public:
}
// Convert a tensor into a sparse tensor format
void fromDense(Tensor t) {
void fromDense(Tensor MAYBE_UNUSED t) {
if(backend_->getDeviceId().type == DeviceType::cpu) {
ABORT("Gradient Dropping for CPU is not yet supported");
}

View File

@ -54,10 +54,10 @@ public:
* number of devices, which is passed in as the 'multiplier'.
*/
// @TODO: Can this be made const? It seems wrong to have a stateful method that still returns a result.
virtual Ptr<data::BatchStats> collectStats(Ptr<ExpressionGraph> graph,
Ptr<models::ICriterionFunction> model,
const std::vector<Ptr<Vocab>>& vocabs,
double multiplier = 1.) {
Ptr<data::BatchStats> collectStats(Ptr<ExpressionGraph> graph,
Ptr<models::ICriterionFunction> model,
const std::vector<Ptr<Vocab>>& vocabs,
double multiplier = 1.) {
auto stats = New<data::BatchStats>();
size_t numFiles = options_->get<std::vector<std::string>>("train-sets").size();
@ -92,8 +92,8 @@ public:
maxBatch *= 2;
}
// Do a binary search for the maximum batch size that fits into the given workspace memory
// for a tested sentence length.
// Do a binary search for the maximum batch size that fits into the given workspace memory
// for a tested sentence length.
for(size_t i = step; i <= maxLength; i += step) {
size_t start = 1;
size_t end = maxBatch;
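The comment above describes a binary search for the largest batch size that still fits into the workspace. A simplified sketch of that search, with a hypothetical fits() predicate standing in for building the graph and measuring its memory use:

// Sketch: find the largest batch size in [1, maxBatch] for which fits() returns true,
// assuming fits() is monotone (if a size fits, all smaller sizes fit too).
#include <cstddef>
#include <cstdio>
#include <functional>

std::size_t maxFittingBatch(std::size_t maxBatch, const std::function<bool(std::size_t)>& fits) {
  std::size_t start = 1, end = maxBatch, best = 0;
  while (start <= end) {
    std::size_t mid = (start + end) / 2;
    if (fits(mid)) {     // mid fits: remember it and try larger sizes
      best = mid;
      start = mid + 1;
    } else {             // mid does not fit: try smaller sizes
      end = mid - 1;
    }
  }
  return best;
}

int main() {
  // Hypothetical stand-in for "build a graph with this batch size and check
  // whether it fits into the workspace": here anything up to 37 'fits'.
  auto fits = [](std::size_t batchSize) { return batchSize <= 37; };
  std::printf("largest fitting batch: %zu\n", maxFittingBatch(1000, fits));
  return 0;
}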

View File

@ -64,7 +64,7 @@ public:
void save(Ptr<ExpressionGraph>, bool final = false);
// @TODO: give it a fake batch generator which own vocabs instead of passing vocabs
Ptr<data::BatchStats> collectStats(const std::vector<Ptr<Vocab>>& vocabs) {
virtual Ptr<data::BatchStats> collectStats(const std::vector<Ptr<Vocab>>& vocabs) {
return GraphGroup::collectStats(graphs_[0], builders_[0], vocabs);
}

View File

@ -63,7 +63,6 @@ private:
Tensor paramsAvg_;
std::vector<float> accGradientsSync_cpu;
std::vector<float> receiveBuffer_cpu;
bool synchronization_happened{false};
Ptr<OptimizerBase> syncOptimizer_;

View File

@ -26,7 +26,6 @@ class SyncGraphGroup : public GraphGroup, public ExponentialSmoothing {
// state for update()
bool first_{ true }; // gets interpreted and cleared by update()
std::vector<Ptr<data::Batch>> pendingBatches_; // in case of dynamic MB-size scaling, we temporarily buffer up batches across update() calls until enough
size_t typicalTrgWords_{}; // typical batch size in words (labels), 0 if unknown (e.g. specified in sentences)
double updateMultiplier_{1}; // multiplier not applied in collectStats() (no multiplier if not mini-batch-fit)
void initialize(const Ptr<data::Batch>& exampleBatch);

View File

@ -13,6 +13,7 @@ class TrainingState;
class TrainingObserver {
public:
virtual ~TrainingObserver() {}
virtual void init(TrainingState&) {}
virtual void actAfterEpoch(TrainingState&) {}
virtual void actAfterBatches(TrainingState&) {}

View File

@ -36,6 +36,7 @@ protected:
public:
ValidatorBase(bool lowerIsBetter) : lowerIsBetter_(lowerIsBetter), lastBest_{initScore()} {}
virtual ~ValidatorBase() {}
virtual float validate(const std::vector<Ptr<ExpressionGraph>>& graphs,
Ptr<const TrainingState> state) = 0;
@ -51,6 +52,7 @@ public:
template <class DataSet, class BuilderType> // @TODO: BuilderType doesn't really serve a purpose here? Review and remove.
class Validator : public ValidatorBase {
public:
virtual ~Validator() {}
Validator(std::vector<Ptr<Vocab>> vocabs, Ptr<Options> options, bool lowerIsBetter = true)
: ValidatorBase(lowerIsBetter),
vocabs_(vocabs),
@ -137,6 +139,7 @@ class CrossEntropyValidator : public Validator<data::Corpus, models::ICriterionF
public:
CrossEntropyValidator(std::vector<Ptr<Vocab>> vocabs, Ptr<Options> options);
virtual ~CrossEntropyValidator() {}
std::string type() override { return options_->get<std::string>("cost-type"); }
@ -148,6 +151,7 @@ protected:
class AccuracyValidator : public Validator<data::Corpus, models::IModel> {
public:
AccuracyValidator(std::vector<Ptr<Vocab>> vocabs, Ptr<Options> options);
virtual ~AccuracyValidator() {}
std::string type() override { return "accuracy"; }
@ -161,6 +165,7 @@ private:
public:
BertAccuracyValidator(std::vector<Ptr<Vocab>> vocabs, Ptr<Options> options, bool evalMaskedLM);
virtual ~BertAccuracyValidator() {}
std::string type() override {
if(evalMaskedLM_)
@ -177,6 +182,7 @@ protected:
class ScriptValidator : public Validator<data::Corpus, models::IModel> {
public:
ScriptValidator(std::vector<Ptr<Vocab>> vocabs, Ptr<Options> options);
virtual ~ScriptValidator() {}
virtual float validate(const std::vector<Ptr<ExpressionGraph>>& graphs,
Ptr<const TrainingState> /*ignored*/) override;
@ -193,6 +199,7 @@ protected:
class TranslationValidator : public Validator<data::Corpus, models::IModel> {
public:
TranslationValidator(std::vector<Ptr<Vocab>> vocabs, Ptr<Options> options);
virtual ~TranslationValidator() {}
virtual float validate(const std::vector<Ptr<ExpressionGraph>>& graphs,
Ptr<const TrainingState> state) override;
@ -212,6 +219,7 @@ protected:
class BleuValidator : public Validator<data::Corpus, models::IModel> {
public:
BleuValidator(std::vector<Ptr<Vocab>> vocabs, Ptr<Options> options, bool detok = false);
virtual ~BleuValidator() {}
virtual float validate(const std::vector<Ptr<ExpressionGraph>>& graphs,
Ptr<const TrainingState> state) override;

View File

@ -11,6 +11,7 @@ namespace marian {
class PrintingStrategy {
public:
virtual ~PrintingStrategy() {}
virtual bool shouldBePrinted(long) = 0;
};

View File

@ -10,6 +10,8 @@ namespace marian {
class ScorerState {
public:
virtual ~ScorerState(){}
virtual Logits getLogProbs() const = 0;
virtual void blacklist(Expr /*totalCosts*/, Ptr<data::CorpusBatch> /*batch*/){};
@ -24,6 +26,8 @@ public:
Scorer(const std::string& name, float weight)
: name_(name), weight_(weight) {}
virtual ~Scorer(){}
std::string getName() { return name_; }
float getWeight() { return weight_; }
@ -53,6 +57,7 @@ protected:
public:
ScorerWrapperState(Ptr<DecoderState> state) : state_(state) {}
virtual ~ScorerWrapperState() {}
virtual Ptr<DecoderState> getState() { return state_; }
@ -88,6 +93,8 @@ public:
encdec_(std::static_pointer_cast<IEncoderDecoder>(encdec)),
ptr_{ptr} {}
virtual ~ScorerWrapper() {}
virtual void init(Ptr<ExpressionGraph> graph) override {
graph->switchParams(getName());
if(ptr_)