From 00d2e999e3c24c9e8c836e800b3720bbe4a326a1 Mon Sep 17 00:00:00 2001 From: Roman Grundkiewicz Date: Thu, 5 Mar 2020 21:08:23 +0000 Subject: [PATCH] Add support for compiling on Mac (and clang) (#598) * Compile marian on mac and clang. Two linker errors left * MacOS defines has a different definition for unsigned long * Find OpenBLAS on mac * Fix a typo in the BLAS detection * Simplify and add comments * Refactor cpu allocation code. Do not fallback to malloc * Fix compilation warning on gcc * Refactor memory allocation * Make things compile with clang-8 with fewer warnings. * Eliminate clang warnings when compiling examples and when compiling without MKL * added USE_MKL option to compile without MKL for debugging even when MKL is installed * fixed issues with compiling examples with clang * Fix compile errors with clang in src/tests. * Fix missing whitespace in error message in src/tests/sqlite.cpp. * Responding to Frank Seide's code review. * Eliminate clang warnings when compiling with -DUSE_FBGEMM=on. * Fix compilation on gcc 8 * Get Marian to compile with Clang-10. * Fix Clang-8 warnings when compiling with marian-server * Add more comments and explicit unsigned long long for windows * Pull in fbgemm that supports mac * Fix warning flags order in CMakeLists.txt Co-authored-by: Kenneth Heafield Co-authored-by: Ulrich Germann Co-authored-by: Roman Grundkiewicz --- CMakeLists.txt | 43 +++++--- cmake/FindCBLAS.cmake | 2 +- src/3rd_party/CMakeLists.txt | 45 +++++++- src/3rd_party/fbgemm | 2 +- src/3rd_party/half_float/umHalf.inl | 4 +- .../pathie-cpp/src/entry_iterator.cpp | 2 +- src/3rd_party/pathie-cpp/src/path.cpp | 2 +- src/3rd_party/pathie-cpp/src/pathie.cpp | 8 +- src/3rd_party/zstr/strict_fstream.hpp | 2 +- src/CMakeLists.txt | 4 + src/command/marian_server.cpp | 2 +- src/common/definitions.h | 15 +++ src/common/fastopt.cpp | 14 ++- src/common/fastopt.h | 5 +- src/common/file_stream.h | 17 ++- src/common/filesystem.h | 14 ++- src/common/intrusive_ptr.h | 6 +- src/common/logging.cpp | 2 +- src/common/types.h | 40 +++++-- src/common/utils.cpp | 12 ++- src/data/batch.h | 2 +- src/data/corpus_base.h | 1 + src/data/default_vocab.cpp | 3 +- src/data/shortlist.h | 7 +- src/data/text_input.h | 1 + src/data/vocab_base.h | 1 + src/examples/mnist/dataset.h | 3 + src/examples/mnist/model.h | 8 +- src/examples/mnist/validator.h | 4 +- src/functional/operators.h | 101 +++++++++--------- src/graph/node.h | 10 +- src/graph/node_initializers.h | 11 +- src/graph/node_operators_binary.h | 17 +-- src/graph/parameters.h | 8 +- src/layers/generic.h | 2 + src/layers/guided_alignment.h | 8 +- src/layers/loss.h | 19 ++-- src/layers/weight.h | 1 + src/microsoft/quicksand.cpp | 3 +- src/microsoft/quicksand.h | 2 + src/models/costs.h | 9 ++ src/models/encoder_decoder.h | 1 + src/models/model_base.h | 2 + src/models/model_task.h | 2 + src/models/s2s.h | 5 +- src/models/states.h | 7 +- src/optimizers/clippers.h | 1 + src/optimizers/optimizers.h | 4 +- src/rescorer/score_collector.h | 1 + src/rnn/rnn.h | 3 +- src/tensors/backend.h | 2 +- src/tensors/cpu/device.cpp | 49 +++++---- src/tensors/cpu/fbgemm/expanded_gemm.h | 45 ++++---- src/tensors/cpu/tensor_operators.cpp | 16 ++- src/tensors/rand.h | 4 +- src/tensors/tensor_operators.h | 6 +- src/tests/prod.cpp | 2 +- src/tests/sqlite.cpp | 2 + src/training/communicator.cpp | 4 +- src/training/communicator.h | 11 +- .../gradient_dropping/sparse_tensor.h | 2 +- src/training/graph_group.h | 12 +-- src/training/graph_group_async.h | 2 +- 
src/training/graph_group_multinode_sync.h | 1 - src/training/graph_group_sync.h | 1 - src/training/training_state.h | 1 + src/training/validator.h | 8 ++ src/translator/output_collector.h | 1 + src/translator/scorers.h | 7 ++ 69 files changed, 428 insertions(+), 236 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c442931f..c27761cf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -24,6 +24,7 @@ option(USE_CCACHE "Use ccache compiler cache (https://ccache.dev)" OFF) option(USE_CUDNN "Use CUDNN library" OFF) option(USE_DOXYGEN "Build documentation with Doxygen" ON) option(USE_FBGEMM "Use FBGEMM" OFF) +option(USE_MKL "Compile with MKL support" ON) option(USE_MPI "Use MPI library" OFF) option(USE_NCCL "Use NCCL library" ON) option(USE_SENTENCEPIECE "Download and compile SentencePiece" OFF) @@ -33,7 +34,7 @@ option(USE_STATIC_LIBS "Link statically against non-system libs" OFF) if(USE_CCACHE) find_program(CCACHE_PROGRAM ccache) if(CCACHE_PROGRAM) - message(STATUS "Found and will be using ccache for faster repeat compilation (use cmake -DUSE_CCACHE=off to disable).") + message(STATUS "Will be using ccache for faster repeat compilation (use cmake -DUSE_CCACHE=off to disable).") set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${CCACHE_PROGRAM}") else(CCACHE_PROGRAM) message(WARNING "Compilation with ccache requested but no ccache found.") @@ -141,20 +142,32 @@ else(MSVC) add_definitions(-DUSE_FBGEMM=1) endif(USE_FBGEMM) - set(DISABLE_GLOBALLY "-Wno-unused-result") + if (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 9.0) + # Clang-10.0.0 complains when CUDA is newer than 10.1 + set(CLANG_IGNORE_UNKNOWN_CUDA "-Wno-unknown-cuda-version") + endif() + set(DISABLE_GLOBALLY "-Wno-unused-result -Wno-unknown-warning-option ${CLANG_IGNORE_UNKNOWN_CUDA}") # These are used in src/CMakeLists.txt on a per-target basis - list(APPEND ALL_WARNINGS -Wall; -Werror; -Wno-unused-result; -Wno-deprecated; -Wno-pragmas; -Wno-unused-parameter; -Wextra; -Wno-unused-function; - -Wno-unused-value; -Wno-unknown-pragmas; -Wno-sign-compare; -Wno-missing-field-initializers;) + list(APPEND ALL_WARNINGS -Wall; -Werror; -Wextra; -Wno-unused-result; -Wno-deprecated; + -Wno-pragmas; -Wno-unused-parameter; -Wno-unused-function; + -Wno-unused-value; -Wno-unknown-pragmas; -Wno-sign-compare; + -Wno-missing-field-initializers;) # This warning does not exist prior to gcc 5.0 if(CMAKE_COMPILER_IS_GNUCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 5.0) - list(APPEND ALL_WARNINGS -Wsuggest-override) + list(APPEND ALL_WARNINGS -Wsuggest-override -Wno-int-in-bool-context) endif() - set(CMAKE_CXX_FLAGS "-std=c++11 -pthread -Wl,--no-as-needed -fPIC ${DISABLE_GLOBALLY} -march=${BUILD_ARCH} ${INTRINSICS}") - set(CMAKE_CXX_FLAGS_RELEASE "-Ofast -m64 -funroll-loops -ffinite-math-only -g -rdynamic") - set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g -rdynamic") + if(CMAKE_COMPILER_IS_GNUCC) + # these flags are not known to clang + set(CMAKE_GCC_FLAGS "-Wl,--no-as-needed") + set(CMAKE_RDYNAMIC_FLAG "-rdynamic") + endif(CMAKE_COMPILER_IS_GNUCC) + + set(CMAKE_CXX_FLAGS "-std=c++11 -pthread ${CMAKE_GCC_FLAGS} -fPIC ${DISABLE_GLOBALLY} -march=${BUILD_ARCH} ${INTRINSICS}") + set(CMAKE_CXX_FLAGS_RELEASE "-Ofast -m64 -funroll-loops -ffinite-math-only -g ${CMAKE_RDYNAMIC_FLAG}") + set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g ${CMAKE_RDYNAMIC_FLAG}") set(CMAKE_CXX_FLAGS_SLIM "-Ofast -m64 -funroll-loops -ffinite-math-only -DNDEBUG") set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELEASE}") set(CMAKE_CXX_FLAGS_PROFILE 
"${CMAKE_CXX_FLAGS_RELEASE} -pg") @@ -162,9 +175,9 @@ else(MSVC) set(CMAKE_CXX_FLAGS_PROFUSE "${CMAKE_CXX_FLAGS_RELEASE} -fprofile-use -fprofile-correction") # these need to be set separately - set(CMAKE_C_FLAGS "-pthread -Wl,--no-as-needed -fPIC ${DISABLE_GLOBALLY} -march=${BUILD_ARCH} ${INTRINSICS}") - set(CMAKE_C_FLAGS_RELEASE "-O3 -m64 -funroll-loops -ffinite-math-only -g -rdynamic") - set(CMAKE_C_FLAGS_DEBUG "-O0 -g -rdynamic") + set(CMAKE_C_FLAGS "-pthread ${CMAKE_GCC_FLAGS} -fPIC ${DISABLE_GLOBALLY} -march=${BUILD_ARCH} ${INTRINSICS}") + set(CMAKE_C_FLAGS_RELEASE "-O3 -m64 -funroll-loops -ffinite-math-only -g ${CMAKE_RDYNAMIC_FLAG}") + set(CMAKE_C_FLAGS_DEBUG "-O0 -g ${CMAKE_RDYNAMIC_FLAG}") set(CMAKE_C_FLAGS_SLIM "-O3 -m64 -funroll-loops -ffinite-math-only -DNDEBUG") set(CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELEASE}") set(CMAKE_C_FLAGS_PROFILE "${CMAKE_C_FLAGS_RELEASE} -pg") @@ -204,7 +217,7 @@ if(CUDA_FOUND) if((CUDA_VERSION VERSION_EQUAL "10.0" OR CUDA_VERSION VERSION_GREATER "10.0") AND (CMAKE_VERSION VERSION_LESS "3.12.2")) message(WARNING "On some Unix systems CUDA 10.0+ requires CMake 3.12.2+; you use CMake ${CMAKE_VERSION}") endif() - + if(COMPILE_CUDA_SM35) LIST(APPEND COMPUTE -arch=sm_35; -gencode=arch=compute_35,code=sm_35;) # Tesla K40 and above endif(COMPILE_CUDA_SM35) @@ -323,13 +336,15 @@ if(USE_MPI) endif(USE_MPI) if(COMPILE_CPU) - find_package(MKL) + if(USE_MKL) + find_package(MKL) + endif(USE_MKL) if(MKL_FOUND) include_directories(${MKL_INCLUDE_DIR}) set(EXT_LIBS ${EXT_LIBS} ${MKL_LIBRARIES}) add_definitions(-DBLAS_FOUND=1 -DMKL_FOUND=1) else(MKL_FOUND) - set(BLA_VENDOR "OpenBLAS") + set(BLAS_VENDOR "OpenBLAS") find_package(BLAS) if(BLAS_FOUND) include(FindCBLAS) diff --git a/cmake/FindCBLAS.cmake b/cmake/FindCBLAS.cmake index 631da99b..97b0d3f8 100644 --- a/cmake/FindCBLAS.cmake +++ b/cmake/FindCBLAS.cmake @@ -54,7 +54,7 @@ MACRO(CHECK_ALL_LIBRARIES LIBRARIES INCLUDE _prefix _name _flags _list _include IF(APPLE) FIND_LIBRARY(${_prefix}_${_library}_LIBRARY NAMES ${_library} - PATHS /usr/local/lib /usr/lib /usr/local/lib64 /usr/lib64 ENV + PATHS /usr/local/lib /usr/lib /usr/local/lib64 /usr/lib64 /usr/local/opt/openblas/lib ENV DYLD_LIBRARY_PATH ) ELSE(APPLE) diff --git a/src/3rd_party/CMakeLists.txt b/src/3rd_party/CMakeLists.txt index e100a619..9f3981af 100644 --- a/src/3rd_party/CMakeLists.txt +++ b/src/3rd_party/CMakeLists.txt @@ -15,12 +15,22 @@ if(USE_FBGEMM) if(NOT MSVC) # only locally disabled for the 3rd_party folder - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-value -Wno-unused-parameter -Wno-unused-variable -Wno-unused-function") + # set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-value -Wno-unused-parameter -Wno-unused-variable -Wno-unused-function") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused") endif() set(FBGEMM_BUILD_TESTS OFF CACHE BOOL "Disable fbgemm tests") set(FBGEMM_BUILD_BENCHMARKS OFF CACHE BOOL "Disable fbgemm benchmark") add_subdirectory(./fbgemm) + + # asmjit (3rd-party submodule of fbgemm) sets -Wall -Wextra near the end of + # the compile options, invalidating any -Wno-... flags that we may have set + # earlier. Let's remove them. 
+ get_property(ASMJIT_COMPILE_OPTIONS TARGET asmjit PROPERTY COMPILE_OPTIONS) + list(REMOVE_ITEM ASMJIT_COMPILE_OPTIONS -Wall -Wextra) + set_property(TARGET asmjit PROPERTY COMPILE_OPTIONS ${ASMJIT_COMPILE_OPTIONS}) + message(" ASMJIT COMPILE FLAGS: ${ASMJIT_COMPILE_OPTIONS}") + endif(USE_FBGEMM) if(USE_SENTENCEPIECE) @@ -39,7 +49,7 @@ if(USE_SENTENCEPIECE) message(WARNING "You are compiling SentencePiece binaries with -DUSE_STATIC_LIBS=on. \ This will cause spm_train to segfault. No need to worry if you do not intend to use that binary. \ Marian support for SentencePiece will work fine.") - + set(SPM_ENABLE_SHARED OFF CACHE BOOL "Builds shared libaries in addition to static libraries." FORCE) set(SPM_TCMALLOC_STATIC ON CACHE BOOL "Link static library of TCMALLOC." FORCE) else(USE_STATIC_LIBS) @@ -51,8 +61,19 @@ if(USE_SENTENCEPIECE) include_directories(./sentencepiece) set_target_properties(spm_encode spm_decode spm_train spm_normalize spm_export_vocab - PROPERTIES - RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}") + PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}") + + if (CMAKE_CXX_COMPILER_ID MATCHES "Clang") + foreach(t sentencepiece sentencepiece_train sentencepiece_train-static + spm_decode spm_encode spm_export_vocab spm_normalize spm_train) + set_property(TARGET ${t} APPEND_STRING PROPERTY COMPILE_FLAGS " -Wno-tautological-compare -Wno-unused") + if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 9.0) + set_property(TARGET ${t} APPEND_STRING PROPERTY COMPILE_FLAGS " -Wno-range-loop-construct") + endif() + # get_property(SENTENCEPIECE_COMPILE_FLAGS TARGET ${t} PROPERTY COMPILE_FLAGS) + # message("-- SENTENCPIECE: compile flags for target ${t}: ${SENTENCEPIECE_COMPILE_FLAGS}") + endforeach(t) + endif() if(USE_STATIC_LIBS) set(CMAKE_FIND_LIBRARY_SUFFIXES ${_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES}) @@ -63,6 +84,22 @@ include_directories(./SQLiteCpp/include) include_directories(./CLI) include_directories(./pathie-cpp/include) +if (CMAKE_CXX_COMPILER_ID MATCHES "Clang") + #set_target_properties(SQLiteCpp PROPERTIES COMPILE_FLAGS + set_property(TARGET SQLiteCpp APPEND_STRING PROPERTY COMPILE_FLAGS + " -Wno-parentheses-equality -Wno-unused-value") + if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 9.0) + set_property(TARGET SQLiteCpp APPEND_STRING PROPERTY COMPILE_FLAGS + " -Wno-implicit-int-float-conversion") + endif() + set_property(TARGET libyaml-cpp APPEND_STRING PROPERTY COMPILE_FLAGS + " -fPIC -Wno-unused-value") + set_property(TARGET pathie-cpp APPEND_STRING PROPERTY COMPILE_FLAGS + " -fPIC -Wno-unused-value") +endif() + + + include_directories(./zlib) include(ExternalProject) diff --git a/src/3rd_party/fbgemm b/src/3rd_party/fbgemm index 84e66a97..f78e6098 160000 --- a/src/3rd_party/fbgemm +++ b/src/3rd_party/fbgemm @@ -1 +1 @@ -Subproject commit 84e66a976046180187724aff60a236c5378fde7c +Subproject commit f78e60988329b9207d086c743cafce1ac1bea3ab diff --git a/src/3rd_party/half_float/umHalf.inl b/src/3rd_party/half_float/umHalf.inl index 4f065dad..3f5285a2 100644 --- a/src/3rd_party/half_float/umHalf.inl +++ b/src/3rd_party/half_float/umHalf.inl @@ -186,7 +186,7 @@ inline HalfFloat& HalfFloat::operator= (float other) inline bool HalfFloat::operator== (HalfFloat other) const { // +0 and -0 are considered to be equal - if (!(bits << 1u) && !(other.bits << 1u))return true; + if ((bits << 1u) == 0 && (other.bits << 1u) == 0) return true; return bits == other.bits && !this->IsNaN(); } @@ -194,7 +194,7 @@ inline bool HalfFloat::operator== (HalfFloat other) const inline bool 
HalfFloat::operator!= (HalfFloat other) const { // +0 and -0 are considered to be equal - if (!(bits << 1u) && !(other.bits << 1u))return false; + if ((bits << 1u) == 0 && (other.bits << 1u) == 0) return false; return bits != other.bits || this->IsNaN(); } diff --git a/src/3rd_party/pathie-cpp/src/entry_iterator.cpp b/src/3rd_party/pathie-cpp/src/entry_iterator.cpp index 24e5b110..baaf5394 100644 --- a/src/3rd_party/pathie-cpp/src/entry_iterator.cpp +++ b/src/3rd_party/pathie-cpp/src/entry_iterator.cpp @@ -31,7 +31,7 @@ #include "../include/path.hpp" #include "../include/errors.hpp" -#if defined(__unix__) +#if defined(__unix__) || defined(__APPLE__) #include #include #include diff --git a/src/3rd_party/pathie-cpp/src/path.cpp b/src/3rd_party/pathie-cpp/src/path.cpp index 07fba271..e732e09c 100644 --- a/src/3rd_party/pathie-cpp/src/path.cpp +++ b/src/3rd_party/pathie-cpp/src/path.cpp @@ -902,7 +902,7 @@ Path Path::pwd() */ Path Path::exe() { -#if defined(__linux__) +#if defined(__linux__) || defined(__APPLE__) char buf[PATH_MAX]; ssize_t size = ::readlink("/proc/self/exe", buf, PATH_MAX); diff --git a/src/3rd_party/pathie-cpp/src/pathie.cpp b/src/3rd_party/pathie-cpp/src/pathie.cpp index c0abc689..d8e567e1 100644 --- a/src/3rd_party/pathie-cpp/src/pathie.cpp +++ b/src/3rd_party/pathie-cpp/src/pathie.cpp @@ -143,7 +143,7 @@ std::string Pathie::convert_encodings(const char* from_encoding, const char* to_ errno = 0; errsav = 0; -#ifdef BSD +#if defined(BSD) && ! defined(__APPLE__) //Since MacOS evolved from BSD, it is captured here but the iconv on macos behaves differently // What the heck. FreeBSD violates POSIX.1-2008: it declares iconv() // differently than mandated by POSIX: http://pubs.opengroup.org/onlinepubs/9699919799/functions/iconv.html // (it declares a `const' where it must not be). @@ -181,11 +181,10 @@ std::string Pathie::convert_encodings(const char* from_encoding, const char* to_ std::string Pathie::utf8_to_filename(const std::string& utf8) { bool fs_encoding_is_utf8 = false; - + char* fsencoding = NULL; #if defined(__APPLE__) || defined(PATHIE_ASSUME_UTF8_ON_UNIX) fs_encoding_is_utf8 = true; #else - char* fsencoding = NULL; fsencoding = nl_langinfo(CODESET); fs_encoding_is_utf8 = (strcmp(fsencoding, "UTF-8") == 0); #endif @@ -206,11 +205,10 @@ std::string Pathie::utf8_to_filename(const std::string& utf8) std::string Pathie::filename_to_utf8(const std::string& native_filename) { bool fs_encoding_is_utf8 = false; - + char* fsencoding = NULL; #if defined(__APPLE__) || defined(PATHIE_ASSUME_UTF8_ON_UNIX) fs_encoding_is_utf8 = true; #else - char* fsencoding = NULL; fsencoding = nl_langinfo(CODESET); fs_encoding_is_utf8 = (strcmp(fsencoding, "UTF-8") == 0); #endif diff --git a/src/3rd_party/zstr/strict_fstream.hpp b/src/3rd_party/zstr/strict_fstream.hpp index fbc64164..7b117393 100644 --- a/src/3rd_party/zstr/strict_fstream.hpp +++ b/src/3rd_party/zstr/strict_fstream.hpp @@ -27,7 +27,7 @@ static std::string strerror() { buff = "Unknown error"; } -#elif (_POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600) && ! _GNU_SOURCE +#elif (_POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600 || __APPLE__) && ! 
_GNU_SOURCE // XSI-compliant strerror_r() if (strerror_r(errno, &buff[0], buff.size()) != 0) { diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index c1a51390..839a64f0 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -215,6 +215,10 @@ if(COMPILE_SERVER) set(EXECUTABLES ${EXECUTABLES} marian_server) endif(COMPILE_SERVER) +if(APPLE) # This is a dependency of pathie but I can't seem to link it into that CMakeLists because we're not compiling it as a library. + set(EXT_LIBS ${EXT_LIBS} iconv) +endif() + foreach(exec ${EXECUTABLES}) target_link_libraries(${exec} marian ${EXT_LIBS} ${EXT_LIBS} ${CMAKE_THREAD_LIBS_INIT}) if(CUDA_FOUND) diff --git a/src/command/marian_server.cpp b/src/command/marian_server.cpp index 5df944d7..e4074bd1 100644 --- a/src/command/marian_server.cpp +++ b/src/command/marian_server.cpp @@ -44,7 +44,7 @@ int main(int argc, char **argv) { // Error Codes for error code meanings // http://www.boost.org/doc/libs/1_55_0/doc/html/boost_asio/reference.html - translate.on_error = [](Ptr connection, + translate.on_error = [](Ptr /*connection*/, const SimpleWeb::error_code &ec) { LOG(error, "Connection error: ({}) {}", ec.value(), ec.message()); }; diff --git a/src/common/definitions.h b/src/common/definitions.h index 45f5d563..2df0f8f3 100755 --- a/src/common/definitions.h +++ b/src/common/definitions.h @@ -10,6 +10,21 @@ #include #include +// The macro MAYBE_UNUSED is used to selectively disable +// unused-variable warnings. C++17 defines the attribute +// [[maybe_unused]], but I don't think we're at C++17 yet. We can add it when we reach C++17. +// The compilers gcc and clang (and maybe others) define +// __has_attribute and support __attribute__(unused) in C++11, +#if defined __has_attribute +# if __has_attribute(unused) +# define MAYBE_UNUSED __attribute__((unused)) +# else +# define MAYBE_UNUSED +# endif +#endif + + + #define THREAD_GUARD(body) [&]() { body; }() // test if THREAD_GUARD is neccessary, remove if no problems occur. #define NodeOp(op) [=]() { op; } diff --git a/src/common/fastopt.cpp b/src/common/fastopt.cpp index ccdd7b54..9af8e844 100644 --- a/src/common/fastopt.cpp +++ b/src/common/fastopt.cpp @@ -84,10 +84,16 @@ std::vector As>::apply(const FastOpt& node) { // specializations for simple vector types template struct As>; template struct As>; -// Windows and Unix based OS have different type definitions for 'unsigned long'. -// So, we need an explicit definition for uint64_t. Otherwise, there's a linking error on windows. +// Windows, Linux based OS and Mac have different type definitions for 'unsigned long'. +// So, we need an explicit definitions for uint64_t, that cover different platforms. +// Otherwise, there's a linking error on windows or Linux or Mac. 
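The platform differences this comment refers to can be checked with a small standalone program; the sketch below is illustrative only and not part of the patch:

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>
    #include <type_traits>

    int main() {
      // Per the notes above: uint64_t is 'unsigned long' on Linux but
      // 'unsigned long long' on macOS and Windows, while size_t is
      // 'unsigned long' on Linux and macOS but 'unsigned long long' on
      // Windows. A template instantiated for only one of the two spellings
      // therefore links on some platforms and fails on the others.
      std::printf("uint64_t == unsigned long      : %d\n",
                  (int)std::is_same<std::uint64_t, unsigned long>::value);
      std::printf("uint64_t == unsigned long long : %d\n",
                  (int)std::is_same<std::uint64_t, unsigned long long>::value);
      std::printf("size_t   == unsigned long      : %d\n",
                  (int)std::is_same<std::size_t, unsigned long>::value);
      return 0;
    }

Instantiating the template for both 'unsigned long' and 'unsigned long long', as the hunk below does, covers size_t and uint64_t on all three platforms.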
// https://software.intel.com/en-us/articles/size-of-long-integer-type-on-different-architecture-and-os/ -template struct As>; +// https://stackoverflow.com/questions/32021860/c-should-you-size-t-with-a-regular-array +// MacOS: size_t = unsigned long (8 bytes), uint64_t = unsigned long long (8 bytes) +// Linux: size_t = unsigned long (8 bytes), uint64_t = unsigned long (8 bytes) +// Windows: size_t = unsigned long long (8 bytes), uint64_t = unsigned long long (8 bytes) +template struct As>; +template struct As>; template struct As>; template struct As>; template struct As>; @@ -103,4 +109,4 @@ std::pair As>::apply(const FastOpt& node) { template struct As>; } -} \ No newline at end of file +} diff --git a/src/common/fastopt.h b/src/common/fastopt.h index 9ddc3c2c..3f735660 100644 --- a/src/common/fastopt.h +++ b/src/common/fastopt.h @@ -367,7 +367,8 @@ public: } const FastOpt& operator[](const char* const key) const { - return operator[](crc::crc(key)); + // MacOS requires explicit cast to size_t before we can use it. + return operator[]((size_t)crc::crc(key)); } const FastOpt& operator[](const std::string& key) const { @@ -375,4 +376,4 @@ public: } }; -} \ No newline at end of file +} diff --git a/src/common/file_stream.h b/src/common/file_stream.h index c8151207..9bbf3359 100644 --- a/src/common/file_stream.h +++ b/src/common/file_stream.h @@ -7,10 +7,21 @@ #include "common/filesystem.h" #include "common/logging.h" +// Even when compiling with clang, __GNUC__ may be defined, so +// we need to add some extra checks to avoid compile errors with +// respect to -Wsuggest-override. #ifdef __GNUC__ -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wsuggest-override" +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wunused-value" +# if defined(__has_warning) +# if __has_warning("-Wsuggest-override") +# pragma GCC diagnostic ignored "-Wsuggest-override" +# endif +# else +# pragma GCC diagnostic ignored "-Wsuggest-override" +# endif #endif + #ifdef _MSC_VER #pragma warning(push) // 4101: 'identifier' : unreferenced local variable. One parameter variable in zstr.hpp is not used. #pragma warning(disable : 4101) @@ -82,7 +93,7 @@ protected: void NormalizeTempPrefix(std::string& base) const; void MakeTemp(const std::string& base); - + }; } // namespace io diff --git a/src/common/filesystem.h b/src/common/filesystem.h index d95125c5..d7cb3da6 100644 --- a/src/common/filesystem.h +++ b/src/common/filesystem.h @@ -7,9 +7,19 @@ // @TODO: go back to canonical names for functions and objects // as specified in C++17 so it becomes easy to move in the future +// Even when compiling with clang, __GNUC__ may be defined, so +// we need to add some extra checks to avoid compile errors with +// respect to -Wsuggest-override. 
#ifdef __GNUC__ -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wsuggest-override" +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wunused-value" +# if defined(__has_warning) +# if __has_warning("-Wsuggest-override") +# pragma GCC diagnostic ignored "-Wsuggest-override" +# endif +# else +# pragma GCC diagnostic ignored "-Wsuggest-override" +# endif #endif #include "3rd_party/pathie-cpp/include/path.hpp" // @TODO: update to latest Pathie diff --git a/src/common/intrusive_ptr.h b/src/common/intrusive_ptr.h index fa412a8b..d476dbc1 100644 --- a/src/common/intrusive_ptr.h +++ b/src/common/intrusive_ptr.h @@ -169,12 +169,12 @@ inline bool operator!=(const IntrusivePtr& a, std::nullptr_t) { template inline bool operator==(T* a, const IntrusivePtr& b) { - return b.get(); + return a == b.get(); // used to say: return b.get(); That cannot be right. [UG] } template inline bool operator!=(T* a, const IntrusivePtr& b) { - return b.get(); + return a != b.get(); // used to say: return b.get(); That cannot be right. [UG] } template @@ -223,5 +223,3 @@ namespace std { } }; } - - diff --git a/src/common/logging.cpp b/src/common/logging.cpp index 93bfcbd8..62d76fee 100644 --- a/src/common/logging.cpp +++ b/src/common/logging.cpp @@ -124,7 +124,7 @@ static void setErrorHandlers() { std::set_terminate(unhandledException); #ifdef __unix__ // catch segfaults - struct sigaction sa = { 0 }; + struct sigaction sa = { {0} }; sigemptyset(&sa.sa_mask); sa.sa_flags = SA_SIGINFO; sa.sa_sigaction = [](int /*signal*/, siginfo_t*, void*) { ABORT("Segmentation fault"); }; diff --git a/src/common/types.h b/src/common/types.h index 552ae225..4bc4f9ad 100644 --- a/src/common/types.h +++ b/src/common/types.h @@ -254,7 +254,7 @@ enum class Type : size_t { packed16 = TypeClass::packed_type + 2u, // special type for FBGEMM, not meant to be used anywhere else, not meant to be accessed invidually. Internal actual type (uint16) is meaningless. packed8avx2 = TypeClass::packed_type + 1u + TypeClass::avx2_type, // special type for FBGEMM with AVX2, not meant to be used anywhere else, not meant to be accessed invidually. Internal actual type (uint8) is meaningless. packed8avx512 = TypeClass::packed_type + 1u + TypeClass::avx512_type, // special type for FBGEMM with AVX512, not meant to be used anywhere else, not meant to be accessed invidually. Internal actual type (uint8) is meaningless. - + }; static inline size_t operator&(TypeClass typeClass, Type type) { @@ -394,7 +394,7 @@ static Type inline typeFromString(const std::string& str) { return Type::float32; if(str == "float64") return Type::float64; - + if(str == "packed16") return Type::packed16; if(str == "packed8avx2") @@ -437,19 +437,35 @@ void matchOrAbort(Type type) { namespace typeFitting { // own namespace instead of in class, otherwise we get error "explicit specialization in non-namespace scope" - // compares max for different types as constexpr, so can be used at compile-time to determine if RequestType type max fits into ReturnType max, see std::conditional below. - template - constexpr bool fitsIntoMax() { return std::numeric_limits::max() <= std::numeric_limits::max(); } // for built-in types everything is constexpr + // Helper function for fitsIntoMax() below + // Returns the 'capacity' of a type: number of digits for integers, + // max_exponent for floats. We ignore the mantissa for floats. 
+ template constexpr int capacity() { + static_assert(std::is_arithmetic::value || std::is_same::value, + "Wrong type for this template"); + return (std::is_integral::value + ? std::numeric_limits::digits + : std::numeric_limits::max_exponent); + } + + + // Compare max for different types as constexpr, so can be used at compile-time to determine if RequestType type max fits into ReturnType max, see std::conditional below. + template + constexpr bool fitsIntoMax() { + // We can't just compare std::numeric_limits<>::max(), because Clang-10 + // complains about rounding errors when implicitly converting int to float + return ((!std::is_integral::value // RequestType is a float + && std::is_integral::value) // ReturnType an integer + ? capacity() < capacity() // special case + : capacity() <= capacity()); // normal case + } // for built-in types everything is constexpr - // add specializations here when needed - template <> constexpr bool fitsIntoMax() { return true; }; // for float16 conversion to float is not constexpr, hence specializations - template <> constexpr bool fitsIntoMax() { return false; }; // for float16 conversion to float is not constexpr, hence specializations } template class NumericLimits { private: - + template void setLimitsMax() { max = (ReturnType)std::numeric_limits::max(); lowest = (ReturnType)std::numeric_limits::lowest(); @@ -459,10 +475,14 @@ private: void setLimits() { // check if the maximum of type RequestType fits into ReturnType constexpr bool fits = typeFitting::fitsIntoMax(); + // sanity check: + static_assert(fits || typeFitting::fitsIntoMax(), + "RequestType doesn't fit into ReturnType, and ReturnType doesn't " + "fit into RequestType. fitsIntoMax is broken!"); // and then use the smaller of each types to determine max, min, lowest. using MaxType = typename std::conditional::type; setLimitsMax(); - // @TODO: should we rather abort if the RequestType does not fit into ReturnType instead of clipping to smaller type? + // @TODO: should we rather abort if the RequestType does not fit into ReturnType instead of clipping to smaller type? 
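The capacity-based comparison introduced above can be exercised outside Marian with a few compile-time checks; a minimal standalone sketch of the same helpers (built-in types only, names mirroring the patch):

    #include <limits>
    #include <type_traits>

    // Integer types are compared by their number of value bits, floating-point
    // types by their maximum exponent; the mantissa is ignored, as in the patch.
    template <typename T> constexpr int capacity() {
      return std::is_integral<T>::value ? std::numeric_limits<T>::digits
                                        : std::numeric_limits<T>::max_exponent;
    }

    template <typename RequestType, typename ReturnType>
    constexpr bool fitsIntoMax() {
      return (!std::is_integral<RequestType>::value && std::is_integral<ReturnType>::value)
                 ? capacity<RequestType>() <  capacity<ReturnType>()   // float-into-integer special case
                 : capacity<RequestType>() <= capacity<ReturnType>();  // normal case
    }

    static_assert(fitsIntoMax<int, float>(),     "INT_MAX is representable below FLT_MAX");
    static_assert(!fitsIntoMax<float, int>(),    "FLT_MAX does not fit into an int");
    static_assert(fitsIntoMax<float, double>(),  "float max fits into double");
    static_assert(!fitsIntoMax<double, float>(), "double max does not fit into float");

The helpers avoid comparing the raw max() values directly because the implicit integer-to-float conversion in such a comparison is exactly what Clang-10 warns about, as noted in the comment above.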
// ABORT_IF(!fits, "Type {} is too small to contain max of type {}", typeId(), typeId()); } diff --git a/src/common/utils.cpp b/src/common/utils.cpp index 80aee431..3acb756d 100755 --- a/src/common/utils.cpp +++ b/src/common/utils.cpp @@ -8,12 +8,22 @@ #include #include #include -#ifdef __unix__ +#if defined(__unix__) || defined(__APPLE__) #include #endif #include #include +// MACOS lacks HOST_NAME_MAX +#ifndef HOST_NAME_MAX +# if defined(_POSIX_HOST_NAME_MAX) +# define HOST_NAME_MAX _POSIX_HOST_NAME_MAX +# elif defined(MAXHOSTNAMELEN) +# define HOST_NAME_MAX MAXHOSTNAMELEN +# endif +#endif + + namespace marian { namespace utils { diff --git a/src/data/batch.h b/src/data/batch.h index b1b10981..3c592b31 100755 --- a/src/data/batch.h +++ b/src/data/batch.h @@ -26,7 +26,7 @@ public: virtual void setGuidedAlignment(std::vector&&) = 0; virtual void setDataWeights(const std::vector&) = 0; - + virtual ~Batch() {}; protected: std::vector sentenceIds_; }; diff --git a/src/data/corpus_base.h b/src/data/corpus_base.h index f0d9b706..e85e378a 100755 --- a/src/data/corpus_base.h +++ b/src/data/corpus_base.h @@ -525,6 +525,7 @@ public: const std::vector>& vocabs, Ptr options); + virtual ~CorpusBase() {} virtual std::vector>& getVocabs() = 0; protected: diff --git a/src/data/default_vocab.cpp b/src/data/default_vocab.cpp index 53f90a5d..590e9931 100644 --- a/src/data/default_vocab.cpp +++ b/src/data/default_vocab.cpp @@ -45,6 +45,7 @@ protected: public: // @TODO: choose between 'virtual' and 'final'. Can we derive from this class? + virtual ~DefaultVocab() {}; virtual const std::string& canonicalExtension() const override { return suffixes_[0]; } virtual const std::vector& suffixes() const override { return suffixes_; } @@ -295,7 +296,7 @@ private: class ClassVocab : public DefaultVocab { private: // Do nothing. - virtual void addRequiredVocabulary(const std::string& vocabPath, bool isJson) override { vocabPath; isJson; } + virtual void addRequiredVocabulary(const std::string& /*vocabPath*/, bool /*isJson*/) override {} // Not adding special class labels, only seen classes. 
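Many of the interface headers in this patch only gain an empty virtual destructor (CorpusBase and DefaultVocab above; ShortlistGenerator, IVocab, ICost and others below). A reduced sketch, with hypothetical class names, of why that matters once objects are deleted through a base-class pointer:

    #include <memory>

    struct IVocabLike {                 // hypothetical stand-in for the patched interfaces
      virtual int size() const = 0;
      virtual ~IVocabLike() {}          // without this, the delete below is undefined behaviour
    };                                  // (compilers can flag it, e.g. -Wdelete-non-virtual-dtor)

    struct DefaultVocabLike : IVocabLike {
      int size() const override { return 0; }
    };

    int main() {
      std::unique_ptr<IVocabLike> v(new DefaultVocabLike()); // destroyed via the base pointer
      return v->size();
    }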
virtual void create(const std::string& vocabPath, diff --git a/src/data/shortlist.h b/src/data/shortlist.h index e5d08494..177272df 100644 --- a/src/data/shortlist.h +++ b/src/data/shortlist.h @@ -36,6 +36,8 @@ public: class ShortlistGenerator { public: + virtual ~ShortlistGenerator() {} + virtual Ptr generate(Ptr batch) const = 0; // Writes text version of (possibly) pruned short list to file @@ -129,7 +131,6 @@ private: Ptr trgVocab_; size_t srcIdx_; - size_t trgIdx_; bool shared_{false}; size_t firstNum_{100}; @@ -183,13 +184,12 @@ public: Ptr srcVocab, Ptr trgVocab, size_t srcIdx = 0, - size_t trgIdx = 1, + size_t /*trgIdx*/ = 1, bool shared = false) : options_(options), srcVocab_(srcVocab), trgVocab_(trgVocab), srcIdx_(srcIdx), - trgIdx_(trgIdx), shared_(shared) { std::vector vals = options_->get>("shortlist"); @@ -235,7 +235,6 @@ public: virtual Ptr generate(Ptr batch) const override { auto srcBatch = (*batch)[srcIdx_]; - // auto trgBatch = (*batch)[trgIdx_]; // add firstNum most frequent words std::unordered_set indexSet; diff --git a/src/data/text_input.h b/src/data/text_input.h index bdbcf64e..db99ef6a 100644 --- a/src/data/text_input.h +++ b/src/data/text_input.h @@ -37,6 +37,7 @@ public: typedef SentenceTuple Sample; TextInput(std::vector inputs, std::vector> vocabs, Ptr options); + virtual ~TextInput() {} Sample next() override; diff --git a/src/data/vocab_base.h b/src/data/vocab_base.h index 6b6869d8..8c214c97 100755 --- a/src/data/vocab_base.h +++ b/src/data/vocab_base.h @@ -57,6 +57,7 @@ public: virtual Word randWord() const { return Word::fromWordIndex(rand() % size()); } + virtual ~IVocab() {}; }; class Options; diff --git a/src/examples/mnist/dataset.h b/src/examples/mnist/dataset.h index 0e73840b..b0492b85 100644 --- a/src/examples/mnist/dataset.h +++ b/src/examples/mnist/dataset.h @@ -62,6 +62,7 @@ private: std::vector inputs_; public: + std::vector& inputs() { return inputs_; } const std::vector& inputs() const { return inputs_; } @@ -144,6 +145,8 @@ public: loadData(); } + virtual ~MNISTData(){} + void loadData() override { ABORT_IF(paths_.size() != 2, "Paths to MNIST data files are not specified"); diff --git a/src/examples/mnist/model.h b/src/examples/mnist/model.h index 24543ad1..f7af1681 100755 --- a/src/examples/mnist/model.h +++ b/src/examples/mnist/model.h @@ -47,6 +47,8 @@ class MNISTLogsoftmax : public ILogProb { public: MNISTLogsoftmax() {} + virtual ~MNISTLogsoftmax(){} + Logits apply(Ptr model, Ptr graph, Ptr batch, @@ -61,13 +63,15 @@ public: typedef data::MNISTData dataset_type; template - MnistFeedForwardNet(Ptr options, Args... args) + MnistFeedForwardNet(Ptr options, Args... 
/*args*/) : options_(options), inference_(options->get("inference", false)) {} + virtual ~MnistFeedForwardNet(){} + virtual Logits build(Ptr graph, Ptr batch, bool /*clean*/ = false) override { - + return Logits(apply(graph, batch, inference_)); } diff --git a/src/examples/mnist/validator.h b/src/examples/mnist/validator.h index 7adc73f0..232bb124 100644 --- a/src/examples/mnist/validator.h +++ b/src/examples/mnist/validator.h @@ -19,7 +19,9 @@ public: builder_ = models::createModelFromOptions(options, models::usage::translation); } - virtual void keepBest(const std::vector>& graphs) override { + virtual ~MNISTAccuracyValidator(){} + + virtual void keepBest(const std::vector>& /*graphs*/) override { LOG(warn, "Keeping best model for MNIST examples is not supported"); } diff --git a/src/functional/operators.h b/src/functional/operators.h index 41fc74b0..be6bb2e5 100755 --- a/src/functional/operators.h +++ b/src/functional/operators.h @@ -7,55 +7,58 @@ namespace marian { namespace functional { // General template, will be used for any type without specializations -// and will fail with an abort message. +// and will fail at runtime with an abort message. Note that the +// general template functions don't have named parameters on purpose, +// because clang will warn about unused parameters during compilation. + template struct Ops { - static HOST_DEVICE_INLINE T tanh(const T& x) { ABORT("Unknown type"); } - static HOST_DEVICE_INLINE T sin(const T& x) { ABORT("Unknown type"); } - static HOST_DEVICE_INLINE T cos(const T& x) { ABORT("Unknown type"); } - static HOST_DEVICE_INLINE T tan(const T& x) { ABORT("Unknown type"); } - static HOST_DEVICE_INLINE T log(const T& x) { ABORT("Unknown type"); } - static HOST_DEVICE_INLINE T exp(const T& x) { ABORT("Unknown type"); } - static HOST_DEVICE_INLINE T abs(const T& x) { ABORT("Unknown type"); } - static HOST_DEVICE_INLINE T sqrt(const T& x) { ABORT("Unknown type"); } - static HOST_DEVICE_INLINE T neg(const T& x) { ABORT("Unknown type"); } - static HOST_DEVICE_INLINE T sgn(const T& x) { ABORT("Unknown type"); } + static HOST_DEVICE_INLINE T tanh(const T&) { ABORT("Unknown type"); } + static HOST_DEVICE_INLINE T sin(const T&) { ABORT("Unknown type"); } + static HOST_DEVICE_INLINE T cos(const T&) { ABORT("Unknown type"); } + static HOST_DEVICE_INLINE T tan(const T&) { ABORT("Unknown type"); } + static HOST_DEVICE_INLINE T log(const T&) { ABORT("Unknown type"); } + static HOST_DEVICE_INLINE T exp(const T&) { ABORT("Unknown type"); } + static HOST_DEVICE_INLINE T abs(const T&) { ABORT("Unknown type"); } + static HOST_DEVICE_INLINE T sqrt(const T&) { ABORT("Unknown type"); } + static HOST_DEVICE_INLINE T neg(const T&) { ABORT("Unknown type"); } + static HOST_DEVICE_INLINE T sgn(const T&) { ABORT("Unknown type"); } - static HOST_DEVICE_INLINE T add(const T& x, const T& y) { ABORT("Unknown type"); } - static HOST_DEVICE_INLINE T sub(const T& x, const T& y) { ABORT("Unknown type"); } - static HOST_DEVICE_INLINE T mul(const T& x, const T& y) { ABORT("Unknown type"); } - static HOST_DEVICE_INLINE T div(const T& x, const T& y) { ABORT("Unknown type"); } + static HOST_DEVICE_INLINE T add(const T&, const T&) { ABORT("Unknown type"); } + static HOST_DEVICE_INLINE T sub(const T&, const T&) { ABORT("Unknown type"); } + static HOST_DEVICE_INLINE T mul(const T&, const T&) { ABORT("Unknown type"); } + static HOST_DEVICE_INLINE T div(const T&, const T&) { ABORT("Unknown type"); } - static HOST_DEVICE_INLINE T max(const T& x, const T& y) { ABORT("Unknown type"); } - static 
HOST_DEVICE_INLINE T min(const T& x, const T& y) { ABORT("Unknown type"); } - static HOST_DEVICE_INLINE T pow(const T& x, const T& y) { ABORT("Unknown type"); } + static HOST_DEVICE_INLINE T max(const T&, const T&) { ABORT("Unknown type"); } + static HOST_DEVICE_INLINE T min(const T&, const T&) { ABORT("Unknown type"); } + static HOST_DEVICE_INLINE T pow(const T&, const T&) { ABORT("Unknown type"); } - static HOST_DEVICE_INLINE T negate(const T& x) { ABORT("Unknown type"); } - static HOST_DEVICE_INLINE T eq(const T& x, const T& y) { ABORT("Unknown type"); } - static HOST_DEVICE_INLINE T neq(const T& x, const T& y) { ABORT("Unknown type"); } - static HOST_DEVICE_INLINE T gt(const T& x, const T& y) { ABORT("Unknown type"); } - static HOST_DEVICE_INLINE T lt(const T& x, const T& y) { ABORT("Unknown type"); } - static HOST_DEVICE_INLINE T geq(const T& x, const T& y) { ABORT("Unknown type"); } - static HOST_DEVICE_INLINE T leq(const T& x, const T& y) { ABORT("Unknown type"); } - static HOST_DEVICE_INLINE T _and(const T& x, const T& y) { ABORT("Unknown type"); } // 'and' is used by gcc - static HOST_DEVICE_INLINE T _or(const T& x, const T& y) { ABORT("Unknown type"); } // 'or' is used by gcc + static HOST_DEVICE_INLINE T negate(const T&) { ABORT("Unknown type"); } + static HOST_DEVICE_INLINE T eq(const T&, const T&) { ABORT("Unknown type"); } + static HOST_DEVICE_INLINE T neq(const T&, const T&) { ABORT("Unknown type"); } + static HOST_DEVICE_INLINE T gt(const T&, const T&) { ABORT("Unknown type"); } + static HOST_DEVICE_INLINE T lt(const T&, const T&) { ABORT("Unknown type"); } + static HOST_DEVICE_INLINE T geq(const T&, const T&) { ABORT("Unknown type"); } + static HOST_DEVICE_INLINE T leq(const T&, const T&) { ABORT("Unknown type"); } + static HOST_DEVICE_INLINE T _and(const T&, const T&) { ABORT("Unknown type"); } // 'and' is used by gcc + static HOST_DEVICE_INLINE T _or(const T&, const T&) { ABORT("Unknown type"); } // 'or' is used by gcc // Neural Networks specific functions - static HOST_DEVICE_INLINE T sigmoid(const T& x) { ABORT("Unknown type"); } - static HOST_DEVICE_INLINE T logaddexp(const T& x, const T& y) { ABORT("Unknown type"); } - static HOST_DEVICE_INLINE T clip(const T& x, const T& y) { ABORT("Unknown type"); } + static HOST_DEVICE_INLINE T sigmoid(const T&) { ABORT("Unknown type"); } + static HOST_DEVICE_INLINE T logaddexp(const T&, const T&) { ABORT("Unknown type"); } + static HOST_DEVICE_INLINE T clip(const T&, const T&) { ABORT("Unknown type"); } // derivative of Clip, cut-off function - static HOST_DEVICE_INLINE T bump(const T& x, const T& y) { ABORT("Unknown type"); } - static HOST_DEVICE_INLINE T relu(const T& x) { ABORT("Unknown type"); } - static HOST_DEVICE_INLINE T reluBack(const T& x) { ABORT("Unknown type"); } - static HOST_DEVICE_INLINE T prelu(const T& x, const T& y) { ABORT("Unknown type"); } - static HOST_DEVICE_INLINE T preluBack(const T& x, const T& y) { ABORT("Unknown type"); } + static HOST_DEVICE_INLINE T bump(const T&, const T&) { ABORT("Unknown type"); } + static HOST_DEVICE_INLINE T relu(const T&) { ABORT("Unknown type"); } + static HOST_DEVICE_INLINE T reluBack(const T&) { ABORT("Unknown type"); } + static HOST_DEVICE_INLINE T prelu(const T&, const T&) { ABORT("Unknown type"); } + static HOST_DEVICE_INLINE T preluBack(const T&, const T&) { ABORT("Unknown type"); } - static HOST_DEVICE_INLINE T if_then_else(const T& x, const T& y, const T& z) { ABORT("Unknown type"); } + static HOST_DEVICE_INLINE T if_then_else(const T&, const T&, const T&) { 
ABORT("Unknown type"); } - static HOST_DEVICE_INLINE T sumReduce(const T& x) { ABORT("Unknown type"); } - static HOST_DEVICE_INLINE T maxReduce(const T& x) { ABORT("Unknown type"); } - static HOST_DEVICE_INLINE T minReduce(const T& x) { ABORT("Unknown type"); } + static HOST_DEVICE_INLINE T sumReduce(const T&) { ABORT("Unknown type"); } + static HOST_DEVICE_INLINE T maxReduce(const T&) { ABORT("Unknown type"); } + static HOST_DEVICE_INLINE T minReduce(const T&) { ABORT("Unknown type"); } }; // Specialization for float @@ -127,14 +130,14 @@ template <> struct Ops { typedef double Single; - static HOST_DEVICE_INLINE double tanh(const double& x) { return tanh(x); } - static HOST_DEVICE_INLINE double sin(const double& x) { return sin(x); } - static HOST_DEVICE_INLINE double cos(const double& x) { return cos(x); } - static HOST_DEVICE_INLINE double tan(const double& x) { return tan(x); } - static HOST_DEVICE_INLINE double log(const double& x) { return log(x); } - static HOST_DEVICE_INLINE double exp(const double& x) { return exp(x); } - static HOST_DEVICE_INLINE double abs(const double& x) { return abs(x); } - static HOST_DEVICE_INLINE double sqrt(const double& x) { return sqrt(x); } + static HOST_DEVICE_INLINE double tanh(const double& x) { return std::tanh(x); } + static HOST_DEVICE_INLINE double sin(const double& x) { return std::sin(x); } + static HOST_DEVICE_INLINE double cos(const double& x) { return std::cos(x); } + static HOST_DEVICE_INLINE double tan(const double& x) { return std::tan(x); } + static HOST_DEVICE_INLINE double log(const double& x) { return std::log(x); } + static HOST_DEVICE_INLINE double exp(const double& x) { return std::exp(x); } + static HOST_DEVICE_INLINE double abs(const double& x) { return std::abs(x); } + static HOST_DEVICE_INLINE double sqrt(const double& x) { return std::sqrt(x); } static HOST_DEVICE_INLINE double neg(const double& x) { return -x; } static HOST_DEVICE_INLINE double sgn(const double& x) { return (0 < x) - (x < 0); } @@ -145,7 +148,7 @@ struct Ops { static HOST_DEVICE_INLINE double max(const double& x, const double& y) { return x < y ? y : x; } static HOST_DEVICE_INLINE double min(const double& x, const double& y) { return x < y ? x : y; } - static HOST_DEVICE_INLINE double pow(const double& x, const double& y) { return pow(x, y); } + static HOST_DEVICE_INLINE double pow(const double& x, const double& y) { return std::pow(x, y); } static HOST_DEVICE_INLINE double negate(const double& x) { return !(bool)x; } @@ -460,7 +463,7 @@ struct Ops { static DEVICE_INLINE half exp(const half& x) { return hexp(x); } static DEVICE_INLINE half sqrt(const half& x) { return hsqrt(x); } static DEVICE_INLINE half neg(const half& x) { return -x; } - + static DEVICE_INLINE half abs(const half& x) { return fabs((float)x); }// @TODO half has this information somewhere in the struct, right? static DEVICE_INLINE half sgn(const half& x) { half zero = 0.f; return (zero < x) - (x < zero); } // @TODO half has this information somewhere in the struct, right? diff --git a/src/graph/node.h b/src/graph/node.h index 4d591674..c017eeb2 100644 --- a/src/graph/node.h +++ b/src/graph/node.h @@ -40,7 +40,7 @@ protected: std::string debugMessage_; Ptr> subtape_; // a subtape is used to keep track of nodes that need to be freed and recomputed with gradient-checkpointing. - bool isCheckpoint_{false}; // true if this node has been selected to be a checkpoint, currently only done manually. 
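One of the fixes above, in the Ops<double> specialization, is worth a standalone illustration: inside the struct, an unqualified call such as tanh(x) finds the member function of the same name before the <cmath> function, so the function calls itself. The reduced sketch below (hypothetical names, not part of the patch) shows the pattern and the std:: qualification the patch applies:

    #include <cmath>
    #include <cstdio>

    struct MathOps {
      // static double log(const double& x) { return log(x); }    // finds MathOps::log: infinite recursion
      static double log(const double& x) { return std::log(x); }  // reaches the <cmath> overload
    };

    int main() {
      std::printf("%f\n", MathOps::log(2.0));  // 0.693147
      return 0;
    }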
+ bool isCheckpoint_{false}; // true if this node has been selected to be a checkpoint, currently only done manually. Ptr recorder_; size_t recorderHash_; @@ -138,7 +138,7 @@ public: virtual std::string graphviz() override { std::stringstream ss; - ss << "\"" << this << "\" [" + ss << "\"" << this << "\" [" << "shape=\"" << form() << "\", " << "label=" << label() << ", " << "style=\"filled\", " @@ -147,7 +147,7 @@ public: for(auto&& child : children()) ss << "\"" << child << "\" -> \"" << this << "\";" << std::endl; - + if(subtape_) { for(auto&& dep : *subtape_) ss << "\"" << dep << "\" -> \"" << this << "\" [style=dotted];" << std::endl; @@ -188,9 +188,9 @@ struct NaryNodeOp : public Node { // Deduce type automatically, but then all types must be the same // this is called automatically when no output type is specified. - // If the input types are mixed, the output type needs to be specified + // If the input types are mixed, the output type needs to be specified // in the constructor. - Type commonType(const std::vector& nodes) { + static Type commonType(const std::vector& nodes) { ABORT_IF(nodes.size() == 0, "NaryNodeOp has no children"); Type type = nodes[0]->value_type(); for(int i = 1; i < nodes.size(); ++i) diff --git a/src/graph/node_initializers.h b/src/graph/node_initializers.h index 624816b9..21dcc95d 100755 --- a/src/graph/node_initializers.h +++ b/src/graph/node_initializers.h @@ -17,9 +17,9 @@ namespace inits { /** * Base class for specialized NodeInitializers. * - * A NodeInitializer is a functor that is associated with parameters - * and constants, and is invoked on a tensor during node intialization. - * You need to override NodeIntializer::apply(Tensor) with your own + * A NodeInitializer is a functor that is associated with parameters + * and constants, and is invoked on a tensor during node intialization. + * You need to override NodeIntializer::apply(Tensor) with your own * functionality or use a fromLambda intializer. * * See node_initializers.cpp for examples. @@ -31,6 +31,7 @@ protected: public: virtual void apply(Tensor t) = 0; void setAllocator(Ptr allocator) { allocator_ = allocator; } + virtual ~NodeInitializer() {} }; /** @@ -135,7 +136,7 @@ Ptr dropout(float dropoutProbabilty); /** * Intialize with gumbel noise, i.e. -log(-log(u)) where u ~ Uniform(0 + eps, 1 - eps) - * + * * @return A NodeInitializer */ Ptr gumbel(float eps = 1e-5f); @@ -163,7 +164,7 @@ Ptr fromWord2vec(const std::string& file, /** * Computes Google's Transformer-style sinusoidal position embeddings - * starting from position 'start' taking into account batch and time + * starting from position 'start' taking into account batch and time * dimensions of the tensor. 
* * Expected tensor layout {-2: time, -1: model} diff --git a/src/graph/node_operators_binary.h b/src/graph/node_operators_binary.h index d12c2481..63158ffa 100755 --- a/src/graph/node_operators_binary.h +++ b/src/graph/node_operators_binary.h @@ -480,9 +480,12 @@ class CSRDotNodeOp : public NaryNodeOp { bool transS_; bool swapOperands_; public: - CSRDotNodeOp(const Shape& S_shape, Expr S_values, Expr S_indices, Expr S_offsets, Expr D, bool transS, bool swapOperands) - : NaryNodeOp({ S_values, S_indices, S_offsets, D }, newShape(S_shape, S_values, S_indices, S_offsets, D, transS, swapOperands), commonType({S_values, D})), - transS_(transS), swapOperands_(swapOperands) { + CSRDotNodeOp(const Shape& S_shape, Expr S_values, Expr S_indices, + Expr S_offsets, Expr D, bool transS, bool swapOperands) + : NaryNodeOp({ S_values, S_indices, S_offsets, D }, + newShape(S_shape, S_values, S_indices, S_offsets, D, transS, swapOperands), + NaryNodeOp::commonType({S_values, D})), + transS_(transS), swapOperands_(swapOperands) { matchOrAbort(S_indices->value_type()); matchOrAbort(S_offsets->value_type()); } @@ -513,7 +516,7 @@ public: NodeOps backwardOps() override { return { nullptr, // can't backprop into the sparse matrix (the gradient is dense) - nullptr, + nullptr, nullptr, NodeOp(CSRProd(child(3)->grad(), // child(3) = D graph()->allocator(), @@ -527,7 +530,7 @@ public: virtual size_t hash() override { size_t seed = NaryNodeOp::hash(); for(auto s : shape()) - util::hash_combine(seed, s); + util::hash_combine(seed, s); util::hash_combine(seed, transS_); util::hash_combine(seed, swapOperands_); return seed; @@ -1050,8 +1053,8 @@ struct ConcatenateNodeOp : public NaryNodeOp { auto checkShape = shape; for(auto child : nodes) { checkShape.set(ax_, child->shape()[ax_]); // don't abort on different sizes on axis dim. - ABORT_IF(checkShape != child->shape(), - "Child shapes {} and {} cannot be concatenated along axis {}", + ABORT_IF(checkShape != child->shape(), + "Child shapes {} and {} cannot be concatenated along axis {}", shape, child->shape(), ax); sum += child->shape()[ax_]; diff --git a/src/graph/parameters.h b/src/graph/parameters.h index 25eb5db4..8b4af9dd 100644 --- a/src/graph/parameters.h +++ b/src/graph/parameters.h @@ -10,10 +10,10 @@ namespace marian { -// @TODO: Currently an ExpressionGraph only supports one Parameters object and +// @TODO: Currently an ExpressionGraph only supports one Parameters object and // the type of parameters has to be the inside on Parameters object. This limits // parameter types to a single chosen type, e.g. only fp32 or only fp16. This should -// be extended to allow multiple sets of parameters. +// be extended to allow multiple sets of parameters. // The reason here is to be able to efficiently compute updates of whole parameter // sets of one type. 
class Parameters { @@ -40,7 +40,7 @@ public: LOG(debug, "Created parameter object of type {}", acceptedElementType_); } - ~Parameters() { + virtual ~Parameters() { LOG(debug, "Destroyed parameter object of type {}", acceptedElementType_); } @@ -88,7 +88,7 @@ public: // sort parameters by name before allocation to make sure the memory layout after allocation is always the same std::sort(params_.begin(), params_.end(), [](Expr n1, Expr n2){ return n1->name() < n2->name(); }); - + for(auto p : params_) { if(!p->val()) { vals_->allocate(p->val(), p->shape(), p->value_type()); diff --git a/src/layers/generic.h b/src/layers/generic.h index e899d8a0..a3b9bac4 100755 --- a/src/layers/generic.h +++ b/src/layers/generic.h @@ -39,6 +39,7 @@ public: // Simplest layer interface: Unary function struct IUnaryLayer { + virtual ~IUnaryLayer() {} virtual Expr apply(Expr) = 0; virtual Expr apply(const std::vector& es) { ABORT_IF(es.size() > 1, "Not implemented"); // simple stub @@ -59,6 +60,7 @@ struct IEmbeddingLayer { // alternative from indices directly virtual Expr applyIndices(const std::vector& embIdx, const Shape& shape) const = 0; + virtual ~IEmbeddingLayer() {} }; // base class for Encoder and Decoder classes, which have embeddings and a batch index (=stream index) diff --git a/src/layers/guided_alignment.h b/src/layers/guided_alignment.h index 48ad6599..f08d3f09 100755 --- a/src/layers/guided_alignment.h +++ b/src/layers/guided_alignment.h @@ -5,14 +5,14 @@ namespace marian { -static inline RationalLoss guidedAlignmentCost(Ptr graph, +static inline RationalLoss guidedAlignmentCost(Ptr /*graph*/, Ptr batch, Ptr options, Expr attention) { // [beam depth=1, max src length, batch size, tgt length] std::string guidedLossType = options->get("guided-alignment-cost"); // @TODO: change "cost" to "loss" float guidedLossWeight = options->get("guided-alignment-weight"); - + const auto& shape = attention->shape(); // [beam depth=1, max src length, batch size, tgt length] float epsilon = 1e-6f; Expr alignmentLoss; // sum up loss over all attention/alignment positions @@ -55,8 +55,8 @@ static inline RationalLoss guidedAlignmentCost(Ptr graph, else ABORT("Unknown alignment cost type: {}", guidedLossType); // every position is a label as they should all agree - // @TODO: there should be positional masking here ... on the other hand, positions that are not - // in a sentence should always agree (both being 0). Lack of masking affects label count only which is + // @TODO: there should be positional masking here ... on the other hand, positions that are not + // in a sentence should always agree (both being 0). Lack of masking affects label count only which is // probably negligible? numLabels = shape.elements(); } diff --git a/src/layers/loss.h b/src/layers/loss.h index d9deaeb8..43e89c1d 100755 --- a/src/layers/loss.h +++ b/src/layers/loss.h @@ -331,6 +331,7 @@ public: : LabelwiseLoss(axes), // cross-entropy already reduces over axis -1 labelSmoothing_(labelSmoothing), factorWeight_(factorWeight) {} + virtual ~CrossEntropyLoss() {} protected: float labelSmoothing_; // interpolation factor for label smoothing, see below float factorWeight_; // give extra weight to factors @@ -368,7 +369,7 @@ protected: if(labelWeights) { // We currently do not know how to use target factors and word-level label weights together - bool wordlevel = labelWeights->shape()[-3] > 1; // Time-dimension is not trivially 1, hence we have word-level weights. 
+ bool wordlevel = labelWeights->shape()[-3] > 1; // Time-dimension is not trivially 1, hence we have word-level weights. ABORT_IF(wordlevel && logits.getNumFactorGroups() > 1, "CE loss with word-level label weights is not implemented for factors"); ce = ce * cast(labelWeights, Type::float32); } @@ -379,15 +380,15 @@ protected: /** - * @brief Unlikelihood loss across last axis, summed up over batch and time dimensions. This is an - * implementation of sequence-level unlikelihood loss from https://arxiv.org/abs/1908.04319. + * @brief Unlikelihood loss across last axis, summed up over batch and time dimensions. This is an + * implementation of sequence-level unlikelihood loss from https://arxiv.org/abs/1908.04319. * We rely on word-level label weights where 1 is correct and 0 is marking an error. If there are not * zeros for a sentence it going to be trained with normal CE loss if there is at least one 0 it is going * to flip over to use SUL for that sentence to penalize the selected word. - * + * * SUL is implemented as: * -log(gather(1 - softmax(logits), -1, indices)) - * + * * Factors are currently not supported. */ class SequenceUnlikelihoodLoss : public CrossEntropyLoss { @@ -411,17 +412,17 @@ protected: ABORT_IF(!mask, "mask is required"); // @TODO: check this, it seems weights for padding are by default 1, which would make this obsolete. // use label weights, where 1 is GOOD and 0 is BAD. After inversion here, now 1 marks, mask again to eliminate padding (might be obsolete) auto errorMask = (1.f - cast(labelWeights, Type::float32)) * cast(mask, Type::float32); - + auto ceUl = logits.applyLossFunction(labels, [&](Expr logits, Expr indices) { return cast(unlikelihood(logits, indices), Type::float32); }); - + // compute if want to use CE or UL. If there are no errors train with CE, otherwise train _only on_ the errors with UL. This is the "mixed" training - // schedule from https://arxiv.org/abs/1908.04319. Providing labels with or without error scores we can easily switch between CE and UL. + // schedule from https://arxiv.org/abs/1908.04319. Providing labels with or without error scores we can easily switch between CE and UL. 
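A scalar restatement of the mixed schedule described above may help; the sketch below is illustrative only (made-up probabilities, outside Marian's expression graph): p[i] is the probability the model assigns to the label at position i, and isError[i] marks labels that the word-level weights flag as wrong.

    #include <cmath>
    #include <cstddef>
    #include <cstdio>
    #include <vector>

    // If no label in the sentence is marked as an error, train with plain
    // cross-entropy; otherwise train only on the marked errors with the
    // unlikelihood term -log(1 - p), as in https://arxiv.org/abs/1908.04319.
    double mixedLoss(const std::vector<double>& p, const std::vector<int>& isError) {
      bool anyError = false;
      for (int e : isError) anyError = anyError || (e != 0);
      double loss = 0.0;
      for (std::size_t i = 0; i < p.size(); ++i) {
        if (!anyError)
          loss += -std::log(p[i]);        // cross-entropy for clean sentences
        else if (isError[i])
          loss += -std::log(1.0 - p[i]);  // unlikelihood, only at error positions
      }
      return loss;
    }

    int main() {
      std::printf("clean sentence (CE):   %.3f\n", mixedLoss({0.9, 0.8}, {0, 0}));
      std::printf("one marked error (UL): %.3f\n", mixedLoss({0.9, 0.8}, {0, 1}));
      return 0;
    }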
auto onlyCe = eq(sum(errorMask, /*axis=*/-3), 0.f); // [1, 1, dimBatch, 1] - equal 1 if no errors are present ceUl = errorMask * ceUl; // don't use for correct label or padding - auto cost = onlyCe * ce + (1.f - onlyCe) * ceUl; // ce or unlikelihood part are never simultanously used as cost per batch entry + auto cost = onlyCe * ce + (1.f - onlyCe) * ceUl; // ce or unlikelihood part are never simultanously used as cost per batch entry return cost; } diff --git a/src/layers/weight.h b/src/layers/weight.h index 787c2376..74cf47d6 100644 --- a/src/layers/weight.h +++ b/src/layers/weight.h @@ -17,6 +17,7 @@ public: virtual void debugWeighting(std::vector /*weightedMask*/, std::vector /*freqMask*/, Ptr /*batch*/){}; + virtual ~WeightingBase() {} }; class DataWeighting : public WeightingBase { diff --git a/src/microsoft/quicksand.cpp b/src/microsoft/quicksand.cpp index 0f0ea253..2f7687aa 100755 --- a/src/microsoft/quicksand.cpp +++ b/src/microsoft/quicksand.cpp @@ -41,6 +41,7 @@ class VocabWrapper : public IVocabWrapper { Ptr pImpl_; public: VocabWrapper(Ptr vocab) : pImpl_(vocab) {} + virtual ~VocabWrapper() {} WordIndex encode(const std::string& word) const override { return (*pImpl_)[word].toWordIndex(); } std::string decode(WordIndex id) const override { return (*pImpl_)[Word::fromWordIndex(id)]; } size_t size() const override { return pImpl_->size(); } @@ -243,7 +244,7 @@ DecoderCpuAvxVersion parseCpuAvxVersion(std::string name) { } } -// @TODO: clean-up this code and unify with marian-conv. The targetPrec parameter is not clear enought etc. +// @TODO: clean-up this code and unify with marian-conv. The targetPrec parameter is not clear enought etc. bool convertModel(std::string inputFile, std::string outputFile, int32_t targetPrec) { std::cout << "Converting from: " << inputFile << ", to: " << outputFile << std::endl; diff --git a/src/microsoft/quicksand.h b/src/microsoft/quicksand.h index 1089be14..87de1948 100755 --- a/src/microsoft/quicksand.h +++ b/src/microsoft/quicksand.h @@ -54,6 +54,8 @@ public: const std::vector& ptrs) : options_(options), ptrs_(ptrs) {} + virtual ~IBeamSearchDecoder() {} + virtual QSNBestBatch decode(const QSBatch& qsBatch, size_t maxLength, const std::unordered_set& shortlist) diff --git a/src/models/costs.h b/src/models/costs.h index 03b247f0..2a66e549 100755 --- a/src/models/costs.h +++ b/src/models/costs.h @@ -25,6 +25,7 @@ public: Ptr graph, // @TODO: why needed? Can it be gotten from model? 
Ptr batch, bool clearGraph = true) = 0; + virtual ~ICost() {} }; class EncoderDecoderCECost : public ICost { @@ -51,6 +52,8 @@ public: weighter_ = WeightingFactory(options_); } + virtual ~EncoderDecoderCECost() {} + Ptr apply(Ptr model, Ptr graph, Ptr batch, @@ -136,6 +139,8 @@ public: Trainer(Ptr model, Ptr cost) : model_(model), cost_(cost) {} + virtual ~Trainer() {} + Ptr getModel() { return model_; } virtual void load(Ptr graph, @@ -179,6 +184,8 @@ public: Scorer(Ptr model, Ptr cost) : model_(model), logProb_(cost) {} + virtual ~Scorer(){} + Ptr getModel() { return model_; } virtual void load(Ptr graph, @@ -211,6 +218,7 @@ public: class LogSoftmaxStep : public ILogProbStep { public: + virtual ~LogSoftmaxStep() {} virtual Ptr apply(Ptr state) override { // decoder needs normalized probabilities (note: skipped if beam 1 and --skip-cost) state->setLogProbs(state->getLogProbs().applyUnaryFunction(logsoftmax)); @@ -224,6 +232,7 @@ public: // with --output-sampling during translation with marian-decoder class GumbelSoftmaxStep : public ILogProbStep { public: + virtual ~GumbelSoftmaxStep() {} virtual Ptr apply(Ptr state) override { state->setLogProbs(state->getLogProbs().applyUnaryFunctions( [](Expr logits){ // lemma gets gumbelled diff --git a/src/models/encoder_decoder.h b/src/models/encoder_decoder.h index 599dbfdc..92c1647f 100755 --- a/src/models/encoder_decoder.h +++ b/src/models/encoder_decoder.h @@ -11,6 +11,7 @@ namespace marian { class IEncoderDecoder : public models::IModel { public: + virtual ~IEncoderDecoder() {} virtual void load(Ptr graph, const std::string& name, bool markedReloaded = true) override diff --git a/src/models/model_base.h b/src/models/model_base.h index 016f3b8e..5f76b380 100644 --- a/src/models/model_base.h +++ b/src/models/model_base.h @@ -41,6 +41,8 @@ public: // @TODO: Is there a better name? class ICriterionFunction { public: + virtual ~ICriterionFunction() {} + virtual void load(Ptr, const std::string&, bool markReloaded = true) diff --git a/src/models/model_task.h b/src/models/model_task.h index 1fcf6eca..96dfadd0 100644 --- a/src/models/model_task.h +++ b/src/models/model_task.h @@ -5,10 +5,12 @@ namespace marian { struct ModelTask { + virtual ~ModelTask() {} virtual void run() = 0; }; struct ModelServiceTask { + virtual ~ModelServiceTask() {} virtual std::string run(const std::string&) = 0; }; } // namespace marian diff --git a/src/models/s2s.h b/src/models/s2s.h index 4c0fd019..8d12f96e 100755 --- a/src/models/s2s.h +++ b/src/models/s2s.h @@ -11,6 +11,7 @@ namespace marian { class EncoderS2S : public EncoderBase { using EncoderBase::EncoderBase; public: + virtual ~EncoderS2S() {} Expr applyEncoderRNN(Ptr graph, Expr embeddings, Expr mask, @@ -254,7 +255,7 @@ public: auto embeddings = state->getTargetHistoryEmbeddings(); // The batch dimension of the inputs can change due to batch-pruning, in that case - // cached elements need to be rebuilt, in this case the mapped encoder context in the + // cached elements need to be rebuilt, in this case the mapped encoder context in the // attention mechanism of the decoder RNN. 
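The comment above, and the @TODO a few lines below, describe rebuilding the cached decoder RNN whenever the batch dimension changes. One possible shape for the suggested Cached(build, updateIf) helper, purely hypothetical and not part of this patch, is:

#include <functional>
#include <iostream>

// Hypothetical helper for the @TODO below: builds the value on first access and
// rebuilds it whenever the supplied predicate reports that the cached copy is stale.
template <class T>
class Cached {
  std::function<T()> build_;
  std::function<bool()> updateIf_;
  bool valid_{false};
  T value_{};

public:
  Cached(std::function<T()> build, std::function<bool()> updateIf)
      : build_(std::move(build)), updateIf_(std::move(updateIf)) {}

  T& get() {                 // "runs a check and rebuilds if required at dereferencing"
    if(!valid_ || updateIf_()) {
      value_ = build_();
      valid_ = true;
    }
    return value_;
  }
};

int main() {
  int dimBatch = 8, lastDimBatch = -1;
  Cached<int> rnnStandIn(
      /*build=*/[&]() -> int { lastDimBatch = dimBatch; return dimBatch * 2; },
      /*updateIf=*/[&]() { return dimBatch != lastDimBatch; });
  std::cout << rnnStandIn.get() << "\n";  // 16: built on first use
  dimBatch = 4;                           // simulate batch pruning
  std::cout << rnnStandIn.get() << "\n";  // 8: rebuilt because the batch dimension changed
}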
int currDimBatch = embeddings->shape()[-2]; if(!rnn_ || lastDimBatch_ != currDimBatch) // if currDimBatch is different, rebuild the cached RNN @@ -263,7 +264,7 @@ public: // Also @TODO: maybe implement a Cached(build, updateIf) that runs a check and rebuild if required // at dereferecing : // rnn_ = Cached( - // /*build=*/[]{ return constructDecoderRNN(graph, state); }, + // /*build=*/[]{ return constructDecoderRNN(graph, state); }, // /*updateIf=*/[]{ return state->batchDimChanged() }); // rnn_->transduce(...); diff --git a/src/models/states.h b/src/models/states.h index 21cf42a8..c2f9ee05 100755 --- a/src/models/states.h +++ b/src/models/states.h @@ -17,6 +17,7 @@ public: : context_(context), mask_(mask), batch_(batch) {} EncoderState() {} + virtual ~EncoderState() {} virtual Expr getContext() const { return context_; } virtual Expr getAttended() const { return context_; } @@ -53,6 +54,7 @@ public: const std::vector>& encStates, Ptr batch) : states_(states), logProbs_(logProbs), encStates_(encStates), batch_(batch) {} + virtual ~DecoderState() {} // @TODO: Do we need all these to be virtual? virtual const std::vector>& getEncoderStates() const { @@ -68,10 +70,10 @@ public: int beamSize) const { std::vector> newEncStates; - for(auto& es : encStates_) + for(auto& es : encStates_) // If the size of the batch dimension of the encoder state context changed, subselect the correct batch entries newEncStates.push_back(es->getContext()->shape()[-2] == batchIndices.size() ? es : es->select(batchIndices)); - + // hypindices matches batchIndices in terms of batch dimension, so we only need hypIndices auto selectedState = New( states_.select(hypIndices, beamSize, /*isBatchMajor=*/false), logProbs_, newEncStates, batch_); @@ -121,6 +123,7 @@ private: Words targetWords_; public: + virtual ~ClassifierState() {} virtual Expr getLogProbs() const { return logProbs_; } virtual void setLogProbs(Expr logProbs) { logProbs_ = logProbs; } diff --git a/src/optimizers/clippers.h b/src/optimizers/clippers.h index f953c2ec..67a50004 100644 --- a/src/optimizers/clippers.h +++ b/src/optimizers/clippers.h @@ -16,6 +16,7 @@ namespace marian { class ClipperBase { public: virtual void clip(Tensor) = 0; + virtual ~ClipperBase() {} }; typedef std::shared_ptr ClipperPtr; diff --git a/src/optimizers/optimizers.h b/src/optimizers/optimizers.h index 2b70ba84..d10af919 100644 --- a/src/optimizers/optimizers.h +++ b/src/optimizers/optimizers.h @@ -29,6 +29,8 @@ public: LOG(info, "[optimizers] Learning rate gets automatically adjusted as if minibatch size was {}", refMBWordsParam_); } + virtual ~OptimizerBase() {} + static constexpr size_t mbSizeNotProvided = SIZE_MAX; void update(Ptr graph, size_t mbSize = mbSizeNotProvided) { @@ -114,7 +116,7 @@ class Sgd : public OptimizerBase { public: Sgd(float eta, size_t refMBWordsParam = 0, Ptr clipper = nullptr) : OptimizerBase(eta, refMBWordsParam, clipper) {} - + virtual ~Sgd() {} virtual void setParams(const std::vector& /*params*/) override {} private: void updateImpl(Tensor params, Tensor grads, size_t actualMBSize, size_t refMBWords) override; diff --git a/src/rescorer/score_collector.h b/src/rescorer/score_collector.h index ba488436..38ba7a08 100644 --- a/src/rescorer/score_collector.h +++ b/src/rescorer/score_collector.h @@ -13,6 +13,7 @@ namespace marian { class ScoreCollector { public: ScoreCollector(const Ptr& options); + virtual ~ScoreCollector() {} virtual void Write(long id, const std::string& message); virtual void Write(long id, diff --git a/src/rnn/rnn.h b/src/rnn/rnn.h 
index c9131d87..4efc569c 100644 --- a/src/rnn/rnn.h +++ b/src/rnn/rnn.h @@ -35,7 +35,7 @@ protected: public: BaseRNN(Ptr graph, Ptr options) : graph_(graph), options_(options) {} - + virtual ~BaseRNN() {} virtual Expr transduce(Expr, Expr = nullptr) = 0; virtual Expr transduce(Expr, State, Expr = nullptr) = 0; virtual Expr transduce(Expr, States, Expr = nullptr) = 0; @@ -113,6 +113,7 @@ private: public: friend RNN; + virtual ~SingleLayerRNN() {} // @TODO: benchmark whether this concatenation is a good idea virtual Expr transduce(Expr input, Expr mask = nullptr) override { diff --git a/src/tensors/backend.h b/src/tensors/backend.h index 01d00267..ce8a5e60 100644 --- a/src/tensors/backend.h +++ b/src/tensors/backend.h @@ -17,7 +17,7 @@ protected: public: Backend(DeviceId deviceId, size_t seed) : deviceId_(deviceId), seed_(seed), randomGenerator_(createRandomGenerator(seed, deviceId)) {} - + virtual ~Backend() {}; virtual DeviceId getDeviceId() { return deviceId_; }; virtual Ptr getRandomGenerator() { return randomGenerator_; } diff --git a/src/tensors/cpu/device.cpp b/src/tensors/cpu/device.cpp index 941902a8..40bb558b 100644 --- a/src/tensors/cpu/device.cpp +++ b/src/tensors/cpu/device.cpp @@ -8,29 +8,40 @@ namespace marian { namespace cpu { +namespace { // allocate function for tensor reserve() below. -// Needed for AVX512, while not available on all compilers. It seems clang -// does not have aligned_alloc for all cstlib versions. If AVX512 is not used -// a simple malloc is probably fine. -// Should generate a runtime error otherwise as we have a check in the AVX512 -// functions which tests for alignment. -#ifdef _WIN32 -#define MALLOC(size) _aligned_malloc(size, alignment_) -#elif __GNUC__ -#define MALLOC(size) aligned_alloc(alignment_, size) -#else -#define MALLOC(size) malloc(size) -#endif +// Alignment is needed because we use AVX512 and AVX2 vectors. We should fail if we can't allocate aligned memory. #ifdef _WIN32 -#define FREE(ptr) _aligned_free(ptr) +void *genericMalloc(size_t alignment, size_t size) { + void *ret = _aligned_malloc(size, alignment); + ABORT_IF(!ret, "Failed to allocate memory on CPU"); + return ret; +} +void genericFree(void *ptr) { + _aligned_free(ptr); +} #else -#define FREE(ptr) free(ptr) +// Linux and OS X. There is no fallback to malloc because we need the memory to be aligned. +void *genericMalloc(size_t alignment, size_t size) { + // On macOS, aligned_alloc is available only with C++17. + // Furthermore, it requires that the memory requested is an exact multiple of the alignment, otherwise it fails. + // posix_memalign is available on both Mac (since 2016) and Linux, and in both gcc and clang. + void *result; + // An error can be detected via the return value or by the pointer remaining nullptr.
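As an aside, this portable aligned-allocation pattern can be exercised on its own. The sketch below is illustrative only: the names alignedMalloc/alignedFree are made up, and error handling is reduced to a null check instead of ABORT_IF.

#include <cstdint>
#include <cstdio>
#include <cstdlib>
#ifdef _WIN32
#include <malloc.h>
#endif

// Same strategy as genericMalloc/genericFree here: _aligned_malloc on Windows,
// posix_memalign elsewhere, because aligned_alloc is not reliably usable on macOS.
void* alignedMalloc(size_t alignment, size_t size) {
#ifdef _WIN32
  return _aligned_malloc(size, alignment);
#else
  void* p = nullptr;
  // alignment must be a power of two and a multiple of sizeof(void*); returns 0 on success
  return posix_memalign(&p, alignment, size) == 0 ? p : nullptr;
#endif
}

void alignedFree(void* p) {
#ifdef _WIN32
  _aligned_free(p);
#else
  free(p);
#endif
}

int main() {
  void* buf = alignedMalloc(/*alignment=*/64, /*size=*/256);  // 64 bytes suits AVX512 loads/stores
  std::printf("aligned: %d\n", buf && reinterpret_cast<uintptr_t>(buf) % 64 == 0);
  alignedFree(buf);
}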
+ ABORT_IF(posix_memalign(&result, alignment, size), "Failed to allocate memory on CPU"); + return result; +} +void genericFree(void *ptr) { + free(ptr); +} #endif +} // namespace + Device::~Device() { - FREE(data_); + genericFree(data_); } void Device::reserve(size_t size) { @@ -38,14 +49,12 @@ void Device::reserve(size_t size) { ABORT_IF(size < size_ || size == 0, "New size must be larger than old size and larger than 0"); + uint8_t *temp = static_cast(genericMalloc(alignment_, size)); if(data_) { - uint8_t *temp = static_cast(MALLOC(size)); std::copy(data_, data_ + size_, temp); - FREE(data_); - data_ = temp; - } else { - data_ = static_cast(MALLOC(size)); + genericFree(data_); } + data_ = temp; size_ = size; } } // namespace cpu diff --git a/src/tensors/cpu/fbgemm/expanded_gemm.h b/src/tensors/cpu/fbgemm/expanded_gemm.h index cf53fe1a..32cc6b12 100644 --- a/src/tensors/cpu/fbgemm/expanded_gemm.h +++ b/src/tensors/cpu/fbgemm/expanded_gemm.h @@ -17,6 +17,7 @@ #endif using namespace fbgemm; +// @TODO: don't use using namespace ...; in header files. Just don't. [UG] #endif // USE_FBGEMM namespace marian { @@ -96,7 +97,7 @@ struct FbgemmPacked16PackNodeOp : public UnaryNodeOp { const std::string type() override { return "packMatFp16"; } - Shape newShape(Expr a, bool transpose) { + Shape newShape(Expr MAYBE_UNUSED a, bool MAYBE_UNUSED transpose) { #if USE_FBGEMM auto shapeMat = a->shape(); // Should be 2D - weight matrix @@ -115,15 +116,14 @@ struct FbgemmPacked16PackNodeOp : public UnaryNodeOp { packsize_); Shape outShape({(int)packsize_}); - return outShape; -#else // USE_FBGEMM +#else ABORT("Packed GEMM requires a build with USE_FBGEMM enabled"); return Shape(); #endif // USE_FBGEMM } }; - + ; // Pack a matrix (int8) into cache utilization efficient way (block format) together with quantization into int8 // PackMatrix packMat_: the type of packed matrix - A or B matrix // marian::Type packType_: the type the input matrix is packed - packed8avx2 or packed8avx512 @@ -132,6 +132,7 @@ struct FbgemmPacked16PackNodeOp : public UnaryNodeOp { // int ncol_: the number of columns // uint64_t packsize_: the size of the packed matrix // (the size of int8 packed B from fbgemm:PackAWithQuantRowOffset + quantization scale, offset and zero point) + struct FbgemmPacked8PackNodeOp : public UnaryNodeOp { PackMatrix packMat_; marian::Type packType_; @@ -180,19 +181,21 @@ struct FbgemmPacked8PackNodeOp : public UnaryNodeOp { const std::string type() override { return "packMatInt8"; } - Shape newShape(Expr a, bool transpose) { #if USE_FBGEMM + Shape newShape(Expr a, bool transpose) { fbgemmPacked8PackInfo(a->shape(), packType_, transpose, nrow_, ncol_, packsize_); Shape outShape({(int)packsize_}); - return outShape; -#else // USE_FBGEMM + } +#else + Shape newShape(Expr /*a*/, bool /*transpose*/) { ABORT("Packed GEMM requires a build with USE_FBGEMM enabled"); return Shape(); -#endif // USE_FBGEMM } +#endif // USE_FBGEMM }; + // Affine transform (matrix multiplication) using packed B matrix // float scalar_: scalar multiplier // size_t m_: the number of rows in A and C @@ -202,7 +205,6 @@ struct FbgemmPacked8PackNodeOp : public UnaryNodeOp { // bool transB_: transpose B class FbgemmPacked16AffineNodeOp : public NaryNodeOp { private: - float scalar_; size_t m_; size_t n_; size_t k_; @@ -210,9 +212,8 @@ private: bool transB_; public: - FbgemmPacked16AffineNodeOp(const std::vector& nodes, Shape bShape, bool transA, bool transB, float scalar) - : NaryNodeOp(nodes, newShape(nodes[0], bShape, transA, transB), 
Type::float32), - scalar_(scalar) { + FbgemmPacked16AffineNodeOp(const std::vector& nodes, Shape bShape, bool transA, bool transB, float /*scalar*/) + : NaryNodeOp(nodes, newShape(nodes[0], bShape, transA, transB), Type::float32)/*, scalar_(scalar)*/ { transA_ = transA; transB_ = transB; m_ = nodes[0]->shape().elements() / nodes[0]->shape()[-1]; @@ -281,7 +282,6 @@ public: // bool transB_: transpose B class FbgemmPacked8AffineNodeOp : public NaryNodeOp { private: - float scalar_; size_t m_; size_t n_; size_t k_; @@ -289,9 +289,8 @@ private: bool transB_; public: - FbgemmPacked8AffineNodeOp(const std::vector& nodes, Shape bShape, bool transA, bool transB, float scalar) - : NaryNodeOp(nodes, newShape(nodes[0], bShape, transA, transB), Type::float32), - scalar_(scalar) { + FbgemmPacked8AffineNodeOp(const std::vector& nodes, Shape bShape, bool transA, bool transB, float /*scalar*/) + : NaryNodeOp(nodes, newShape(nodes[0], bShape, transA, transB), Type::float32)/*, scalar_(scalar) */ { transA_ = transA; transB_ = transB; m_ = nodes[0]->shape().elements() / nodes[0]->shape()[-1]; @@ -302,7 +301,7 @@ public: size_t l = bShape.elements() / bShape[-1]; n_ = bShape[-1]; if(transB) - std::swap(l, n_); + std::swap(l, n_); } Shape newShape(Expr a, Shape bShape, bool transA, bool transB) { @@ -369,9 +368,9 @@ static inline Expr affine(Expr a, Expr b, Shape bShape, Expr c, bool transA, boo Type elementType = b->value_type(); if (elementType == Type::packed16) - return Expression(nodes, bShape, transA, transB, scalar); + return Expression(nodes, bShape, transA, transB, scalar); else if (isPacked(elementType) && sizeOf(elementType) == 1) - return Expression(nodes, bShape, transA, transB, scalar); + return Expression(nodes, bShape, transA, transB, scalar); else { ABORT("Only int8 and fp16 are available. {}", elementType); return nullptr; @@ -380,9 +379,9 @@ static inline Expr affine(Expr a, Expr b, Shape bShape, Expr c, bool transA, boo static inline Expr pack(Type elementType, Expr a, PackMatrix packMat, bool transpose, float clipValue) { if (elementType == Type::packed16) - return Expression(a, packMat, transpose, clipValue); + return Expression(a, packMat, transpose, clipValue); else if (isPacked(elementType) && sizeOf(elementType) == 1) - return Expression(a, packMat, elementType, transpose, clipValue); + return Expression(a, packMat, elementType, transpose, clipValue); else { ABORT("Only int8 and fp16 are available. {}", elementType); return nullptr; @@ -394,9 +393,9 @@ static inline Expr dot(Expr a, Expr b, Shape bShape, bool transA, bool transB, f Type elementType = b->value_type(); if (elementType == Type::packed16) - return Expression(nodes, bShape, transA, transB, scalar); + return Expression(nodes, bShape, transA, transB, scalar); else if (isPacked(elementType) && sizeOf(elementType) == 1) - return Expression(nodes, bShape, transA, transB, scalar); + return Expression(nodes, bShape, transA, transB, scalar); else { ABORT("Only int8 and fp16 are available. 
{}", elementType); return nullptr; diff --git a/src/tensors/cpu/tensor_operators.cpp b/src/tensors/cpu/tensor_operators.cpp index 91e058b8..ae5eeed5 100755 --- a/src/tensors/cpu/tensor_operators.cpp +++ b/src/tensors/cpu/tensor_operators.cpp @@ -20,7 +20,7 @@ namespace marian { namespace cpu { -void IsNaN(const Tensor in, Ptr allocator, bool& /*isNaN*/, bool& /*isInf*/) { + void IsNaN(const Tensor /*in*/, Ptr /*allocator*/, bool& /*isNaN*/, bool& /*isInf*/) { ABORT("Not implemented"); } @@ -214,9 +214,11 @@ void Transpose0213(Tensor out, Tensor in) { } } +// This function is called only when MKL is available. +#if MKL_FOUND // Given a 4D array, transpose (swap) the initial 3 dimensions while keeping the last dimension. // e.g. 1234 --> 2134, 1234 --> 3214 (4 is always kept). -// This is an optimized version for swapping first 3 dimensions +// This is an optimized version for swapping first 3 dimensions // assuming the last dimension is large enough to get benefits from vectorized copy. // // @param out output tensor @@ -225,14 +227,13 @@ void Transpose0213(Tensor out, Tensor in) { template void TransposeFirst3In4(Tensor out, Tensor in, const std::vector& vAxis) { ABORT_IF(vAxis.size() != 4, "This function handles only 4D arrays."); -#if MKL_FOUND int innermost = in->shape()[-1]; int l1 = in->shape()[vAxis[0]]; int l2 = in->shape()[vAxis[1]]; int l3 = in->shape()[vAxis[2]]; - // find the mapping between the transposed output dimensional indices (oi, oj, ok) + // find the mapping between the transposed output dimensional indices (oi, oj, ok) // and original input dimensional indices (i, j, k) int oi, oj, ok; #pragma omp parallel for @@ -275,11 +276,8 @@ void TransposeFirst3In4(Tensor out, Tensor in, const std::vector& vAxis) { } } } -#else - // it shouldn't come into here. This function is called only when MKL is available. - ABORT("Should not get here"); -#endif // MKL_FOUND } +#endif // MKL_FOUND inline void transpose4x4_SSE(const float* A, float* B, @@ -656,7 +654,7 @@ void SelectAxis2(Tensor out, functional::Shape outShape = out->shape(); functional::Shape inShape = in->shape(); - + auto idxData = indices->data(); auto odata = out->data(); const auto idata = in->data(); diff --git a/src/tensors/rand.h b/src/tensors/rand.h index f3e356fb..94b44a97 100644 --- a/src/tensors/rand.h +++ b/src/tensors/rand.h @@ -15,11 +15,11 @@ protected: public: RandomGenerator(size_t seed) : seed_(seed) { } - + virtual ~RandomGenerator() {} virtual void uniform(Tensor, float a, float b) = 0; virtual void normal(Tensor, float mean, float stddev) = 0; }; Ptr createRandomGenerator(size_t /*seed*/, DeviceId); -} \ No newline at end of file +} diff --git a/src/tensors/tensor_operators.h b/src/tensors/tensor_operators.h index d7565240..227bc953 100644 --- a/src/tensors/tensor_operators.h +++ b/src/tensors/tensor_operators.h @@ -25,7 +25,7 @@ namespace marian { template -void copy(Ptr backend, const InIt beg, const InIt end, OutIt it) { +void copy(Ptr& MAYBE_UNUSED backend, const InIt beg, const InIt end, OutIt it) { #ifdef CUDA_FOUND if(backend->getDeviceId().type == DeviceType::gpu) gpu::copy(backend, beg, end, it); @@ -119,7 +119,7 @@ DISPATCH3(Concatenate, marian::Tensor, const std::vector&, int) // clang-format on -// Bernoulli(tensor, 0.5f, 2.f, -1.f) generates a tensor composed of 50% of 1 and 50% of -1. +// Bernoulli(tensor, 0.5f, 2.f, -1.f) generates a tensor composed of 50% of 1 and 50% of -1. 
static inline void Bernoulli(Tensor resultTensor, float keepProb, float scale = 1.f, float shift = 0.f) { // in-place uniform distribution auto rnd = resultTensor->getBackend()->getRandomGenerator(); @@ -190,7 +190,7 @@ void LayerNormalizationGrad(Tensor gradX, } static inline void LayerNormalizationGrad( - Ptr allocator, + Ptr MAYBE_UNUSED allocator, Tensor gradX, Tensor gradGamma, Tensor gradBeta, diff --git a/src/tests/prod.cpp b/src/tests/prod.cpp index 37809bac..698712b9 100644 --- a/src/tests/prod.cpp +++ b/src/tests/prod.cpp @@ -1,7 +1,7 @@ #include "marian.h" #include "common/timer.h" -int main(int argc, char** argv) { +int main(int /*argc*/, char** /*argv*/) { using namespace marian; { diff --git a/src/tests/sqlite.cpp b/src/tests/sqlite.cpp index 21748514..f822dbc5 100644 --- a/src/tests/sqlite.cpp +++ b/src/tests/sqlite.cpp @@ -8,6 +8,8 @@ #include int main(int argc, char** argv) { + ABORT_IF(argc != 3, "FATAL ERROR: Incorrect number of command line arguments " + "(expected: 2) for command {}.",argv[0]); SQLite::Database db("corpus.db", SQLite::OPEN_READWRITE|SQLite::OPEN_CREATE); db.exec("PRAGMA temp_store_directory = '/data1/marcinjd';"); diff --git a/src/training/communicator.cpp b/src/training/communicator.cpp index be9b23be..98b7be7f 100644 --- a/src/training/communicator.cpp +++ b/src/training/communicator.cpp @@ -38,7 +38,7 @@ Ptr createCommunicator( } // the actual implementation is inside communicator.cu - return New(graphs, mpi); + return New(graphs, mpi); #else // no CUDA or no NCCL noNccl; // (unused) return New(graphs, mpi); @@ -141,7 +141,7 @@ public: FakeMPIWrapper(bool) { LOG(warn, "Compiled without MPI support. Falling back to FakeMPIWrapper"); } - + virtual ~FakeMPIWrapper() {} virtual size_t myMPIRank() const override { return 0; }; virtual size_t numMPIProcesses() const override { return 1; }; diff --git a/src/training/communicator.h b/src/training/communicator.h index 6ccce204..47274491 100644 --- a/src/training/communicator.h +++ b/src/training/communicator.h @@ -156,11 +156,8 @@ public: void scatterReduceAndResetGrads() const override { const_cast(this)->lazyInit(); - int totalSize = (int)graphs_[0]->params()->vals()->size(); - int shardSize = (int)ceil(totalSize / (float)graphs_.size()); - // Gather gradients from different devices into current gradient shards - auto scatter = [this, shardSize](size_t idx, size_t begin, size_t end) { + auto scatter = [this](size_t idx, size_t begin, size_t end) { auto curGrad = graphs_[idx]->params()->grads()->subtensor(begin, end-begin); // collect and sum gradients @@ -176,7 +173,7 @@ public: }; // reset gradients outside current shard - auto reset = [this, shardSize](size_t idx, size_t begin, size_t end) { + auto reset = [this](size_t idx, size_t begin, size_t end) { auto grad = graphs_[idx]->params()->grads(); if (begin > 0) grad->subtensor(0, begin)->set(0); @@ -189,11 +186,9 @@ public: } void allGatherParams() const override { - int totalSize = (int)graphs_[0]->params()->vals()->size(); - int shardSize = (int)ceil(totalSize / (float)graphs_.size()); // Update all graphs with parameter shard - auto gather = [this, shardSize](size_t idx, size_t begin, size_t end) { + auto gather = [this](size_t idx, size_t begin, size_t end) { auto getShard = [&](Ptr graph) { return graph->params()->vals()->subtensor(begin, end-begin); }; diff --git a/src/training/gradient_dropping/sparse_tensor.h b/src/training/gradient_dropping/sparse_tensor.h index 5effa3d7..652a77ec 100644 --- a/src/training/gradient_dropping/sparse_tensor.h +++ 
b/src/training/gradient_dropping/sparse_tensor.h @@ -118,7 +118,7 @@ public: } // Convert a tensor into a sparse tensor format - void fromDense(Tensor t) { + void fromDense(Tensor MAYBE_UNUSED t) { if(backend_->getDeviceId().type == DeviceType::cpu) { ABORT("Gradient Dropping for CPU is not yet supported"); } diff --git a/src/training/graph_group.h b/src/training/graph_group.h index 265f9785..56b8afe3 100644 --- a/src/training/graph_group.h +++ b/src/training/graph_group.h @@ -54,10 +54,10 @@ public: * number of devices, which is passed in as the 'multiplier'. */ // @TODO: Can this be made const? It seems wrong to have a stateful method that still returns a result. - virtual Ptr collectStats(Ptr graph, - Ptr model, - const std::vector>& vocabs, - double multiplier = 1.) { + Ptr collectStats(Ptr graph, + Ptr model, + const std::vector>& vocabs, + double multiplier = 1.) { auto stats = New(); size_t numFiles = options_->get>("train-sets").size(); @@ -92,8 +92,8 @@ public: maxBatch *= 2; } - // Do a binary search for maxmimum batch size that fits into given workspace memory - // for a tested sentence length. + // Do a binary search for the maximum batch size that fits into the given workspace memory + // for a tested sentence length. for(size_t i = step; i <= maxLength; i += step) { size_t start = 1; size_t end = maxBatch; diff --git a/src/training/graph_group_async.h b/src/training/graph_group_async.h index 0e27a665..dd5254bb 100644 --- a/src/training/graph_group_async.h +++ b/src/training/graph_group_async.h @@ -64,7 +64,7 @@ public: void save(Ptr, bool final = false); // @TODO: give it a fake batch generator which owns vocabs instead of passing vocabs - Ptr collectStats(const std::vector>& vocabs) { + virtual Ptr collectStats(const std::vector>& vocabs) { return GraphGroup::collectStats(graphs_[0], builders_[0], vocabs); } diff --git a/src/training/graph_group_multinode_sync.h b/src/training/graph_group_multinode_sync.h index ff1fe4e9..0a089d7b 100644 --- a/src/training/graph_group_multinode_sync.h +++ b/src/training/graph_group_multinode_sync.h @@ -63,7 +63,6 @@ private: Tensor paramsAvg_; std::vector accGradientsSync_cpu; std::vector receiveBuffer_cpu; - bool synchronization_happened{false}; Ptr syncOptimizer_; diff --git a/src/training/graph_group_sync.h b/src/training/graph_group_sync.h index b127296d..067d245c 100644 --- a/src/training/graph_group_sync.h +++ b/src/training/graph_group_sync.h @@ -26,7 +26,6 @@ class SyncGraphGroup : public GraphGroup, public ExponentialSmoothing { // state for update() bool first_{ true }; // gets interpreted and cleared by update() std::vector> pendingBatches_; // in case of dynamic MB-size scaling, we temporarily buffer up batches across update() calls until enough - size_t typicalTrgWords_{}; // typical batch size in words (labels), 0 if unknown (e.g.
specified in sentences) double updateMultiplier_{1}; // multiplier not applied in collectStats() (no multiplier if not mini-batch-fit) void initialize(const Ptr& exampleBatch); diff --git a/src/training/training_state.h b/src/training/training_state.h index d39173be..1eb187b7 100644 --- a/src/training/training_state.h +++ b/src/training/training_state.h @@ -13,6 +13,7 @@ class TrainingState; class TrainingObserver { public: + virtual ~TrainingObserver() {} virtual void init(TrainingState&) {} virtual void actAfterEpoch(TrainingState&) {} virtual void actAfterBatches(TrainingState&) {} diff --git a/src/training/validator.h b/src/training/validator.h index e97f0da6..1658dff3 100755 --- a/src/training/validator.h +++ b/src/training/validator.h @@ -36,6 +36,7 @@ protected: public: ValidatorBase(bool lowerIsBetter) : lowerIsBetter_(lowerIsBetter), lastBest_{initScore()} {} + virtual ~ValidatorBase() {} virtual float validate(const std::vector>& graphs, Ptr state) = 0; @@ -51,6 +52,7 @@ public: template // @TODO: BuilderType doesn't really serve a purpose here? Review and remove. class Validator : public ValidatorBase { public: + virtual ~Validator() {} Validator(std::vector> vocabs, Ptr options, bool lowerIsBetter = true) : ValidatorBase(lowerIsBetter), vocabs_(vocabs), @@ -137,6 +139,7 @@ class CrossEntropyValidator : public Validator> vocabs, Ptr options); + virtual ~CrossEntropyValidator() {} std::string type() override { return options_->get("cost-type"); } @@ -148,6 +151,7 @@ protected: class AccuracyValidator : public Validator { public: AccuracyValidator(std::vector> vocabs, Ptr options); + virtual ~AccuracyValidator() {} std::string type() override { return "accuracy"; } @@ -161,6 +165,7 @@ private: public: BertAccuracyValidator(std::vector> vocabs, Ptr options, bool evalMaskedLM); + virtual ~BertAccuracyValidator() {} std::string type() override { if(evalMaskedLM_) @@ -177,6 +182,7 @@ protected: class ScriptValidator : public Validator { public: ScriptValidator(std::vector> vocabs, Ptr options); + virtual ~ScriptValidator() {} virtual float validate(const std::vector>& graphs, Ptr /*ignored*/) override; @@ -193,6 +199,7 @@ protected: class TranslationValidator : public Validator { public: TranslationValidator(std::vector> vocabs, Ptr options); + virtual ~TranslationValidator() {} virtual float validate(const std::vector>& graphs, Ptr state) override; @@ -212,6 +219,7 @@ protected: class BleuValidator : public Validator { public: BleuValidator(std::vector> vocabs, Ptr options, bool detok = false); + virtual ~BleuValidator() {} virtual float validate(const std::vector>& graphs, Ptr state) override; diff --git a/src/translator/output_collector.h b/src/translator/output_collector.h index 9e0e12b5..ffcbd2d5 100644 --- a/src/translator/output_collector.h +++ b/src/translator/output_collector.h @@ -11,6 +11,7 @@ namespace marian { class PrintingStrategy { public: + virtual ~PrintingStrategy() {} virtual bool shouldBePrinted(long) = 0; }; diff --git a/src/translator/scorers.h b/src/translator/scorers.h index c936514c..a5a0be2c 100755 --- a/src/translator/scorers.h +++ b/src/translator/scorers.h @@ -10,6 +10,8 @@ namespace marian { class ScorerState { public: + virtual ~ScorerState(){} + virtual Logits getLogProbs() const = 0; virtual void blacklist(Expr /*totalCosts*/, Ptr /*batch*/){}; @@ -24,6 +26,8 @@ public: Scorer(const std::string& name, float weight) : name_(name), weight_(weight) {} + virtual ~Scorer(){} + std::string getName() { return name_; } float getWeight() { return weight_; } @@ 
-53,6 +57,7 @@ protected: public: ScorerWrapperState(Ptr state) : state_(state) {} + virtual ~ScorerWrapperState() {} virtual Ptr getState() { return state_; } @@ -88,6 +93,8 @@ public: encdec_(std::static_pointer_cast(encdec)), ptr_{ptr} {} + virtual ~ScorerWrapper() {} + virtual void init(Ptr graph) override { graph->switchParams(getName()); if(ptr_)