fixed all warnings discovered by Visual Studio

2024-09-11 06:15:56 +03:00 · 2018-08-31 19:21:14 -07:00 · 2018-08-31 19:21:14 -07:00 · 14631160db
commit 14631160db
parent 2bf44365ff
93 changed files with 440 additions and 397 deletions
--- a/src/3rd_party/cnpy/cnpy.cpp
+++ b/src/3rd_party/cnpy/cnpy.cpp
@ -70,19 +70,19 @@ void cnpy::parse_npy_header(FILE* fp, unsigned int& word_size, unsigned int*& sh
    int loc1, loc2;

    //fortran order
-    loc1 = header.find("fortran_order")+16;
+    loc1 = (int)header.find("fortran_order")+16;
    fortran_order = (header.substr(loc1,5) == "True" ? true : false);

    //shape
-    loc1 = header.find("(");
-    loc2 = header.find(")");
+    loc1 = (int)header.find("(");
+    loc2 = (int)header.find(")");
    std::string str_shape = header.substr(loc1+1,loc2-loc1-1);
    if(str_shape.length() == 0) ndims = 0;
    else if(str_shape[str_shape.size()-1] == ',') ndims = 1;
-    else ndims = std::count(str_shape.begin(),str_shape.end(),',')+1;
+    else ndims = (unsigned int)std::count(str_shape.begin(),str_shape.end(),',')+1;
    shape = new unsigned int[ndims];
    for(unsigned int i = 0;i < ndims;i++) {
-        loc1 = str_shape.find(",");
+        loc1 = (int)str_shape.find(",");
        shape[i] = atoi(str_shape.substr(0,loc1).c_str());
        str_shape = str_shape.substr(loc1+1);
    }
@ -90,15 +90,15 @@ void cnpy::parse_npy_header(FILE* fp, unsigned int& word_size, unsigned int*& sh
    //endian, word size, data type
    //byte order code | stands for not applicable.
    //not sure when this applies except for byte array
-    loc1 = header.find("descr")+9;
+    loc1 = (int)header.find("descr")+9;
    bool littleEndian = (header[loc1] == '<' || header[loc1] == '|' ? true : false);
-    assert(littleEndian);
+    assert(littleEndian); littleEndian;

    //char type = header[loc1+1];
    //assert(type == map_type(T));

    std::string str_ws = header.substr(loc1+2);
-    loc2 = str_ws.find("'");
+    loc2 = (int)str_ws.find("'");
    word_size = atoi(str_ws.substr(0,loc2).c_str());
 }

--- a/src/3rd_party/cnpy/cnpy.h
+++ b/src/3rd_party/cnpy/cnpy.h
@ -70,7 +70,7 @@ namespace cnpy {
    template<> std::vector<char>& operator+=(std::vector<char>& lhs, const char* rhs);


-    template<typename T> std::string tostring(T i, int pad = 0, char padval = ' ') {
+    template<typename T> std::string tostring(T i, int /*pad*/ = 0, char /*padval*/ = ' ') {
        std::stringstream s;
        s << i;
        return s.str();
@ -162,7 +162,7 @@ namespace cnpy {

        unsigned long nels = 1;
        for (int m=0; m<ndims; m++ ) nels *= shape[m];
-        int nbytes = nels*sizeof(T) + npy_header.size();
+        auto nbytes = nels*sizeof(T) + npy_header.size();

        //get the CRC of the data to be added
        unsigned int crc = crc32(0L,(unsigned char*)&npy_header[0],npy_header.size());
@ -250,7 +250,7 @@ namespace cnpy {
            name(name), type(type_)
        {
            shape = dataShape;
-            word_size = word_size_;
+            word_size = (unsigned int)word_size_;
            bytes.resize(data.size());
            std::copy(data.begin(), data.end(), bytes.begin());
        }
@ -278,15 +278,15 @@ namespace cnpy {
            const auto* shape     = item.shape.data();
            const auto  type      = item.type;
            const auto  word_size = item.word_size;
-            const unsigned int ndims = item.shape.size();
+            const unsigned int ndims = (unsigned int)item.shape.size();
            std::vector<char> npy_header = create_npy_header(type,word_size,shape,ndims);

            unsigned long nels = 1;
-            for (int m=0; m<ndims; m++ ) nels *= shape[m];
-            int nbytes = nels*word_size + npy_header.size();
+            for (size_t m=0; m<ndims; m++ ) nels *= shape[m];
+            auto nbytes = nels*word_size + npy_header.size();

            //get the CRC of the data to be added
-            unsigned int crc = crc32(0L,(unsigned char*)&npy_header[0],npy_header.size());
+            unsigned int crc = crc32(0L,(unsigned char*)&npy_header[0],(uInt)npy_header.size());
            crc = crc32(crc,(unsigned char*)data,nels*word_size);

            //build the local header
@ -330,7 +330,7 @@ namespace cnpy {
        fwrite(&global_header[0],sizeof(char),global_header.size(),fp);

        //build footer
-        unsigned short nrecs = items.size();
+        auto nrecs = items.size();
        std::vector<char> footer;
        footer += "PK"; //first part of sig
        footer += (unsigned short) 0x0605; //second part of sig
@ -347,7 +347,7 @@ namespace cnpy {

        //close up
        fflush(fp);
-        bool bad = ferror(fp);
+        bool bad = ferror(fp) != 0;
        fclose(fp);

        // move to final location (atomically)
@ -370,7 +370,7 @@ namespace cnpy {
        dict += tostring(word_size);
        dict += "', 'fortran_order': False, 'shape': (";
        dict += tostring(shape[0]);
-        for(int i = 1;i < ndims;i++) {
+        for(size_t i = 1;i < ndims;i++) {
            dict += ", ";
            dict += tostring(shape[i]);
        }
@ -382,7 +382,7 @@ namespace cnpy {
        dict.back() = '\n';

        std::vector<char> header;
-        header += (char) 0x93;
+        header += (char) (0x93 - 0x100);
        header += "NUMPY";
        header += (char) 0x01; //major version of numpy format
        header += (char) 0x00; //minor version of numpy format
--- a/src/3rd_party/svd/svd.cpp
+++ b/src/3rd_party/svd/svd.cpp
@ -33,7 +33,7 @@ static double PYTHAG(double a, double b)

 int dsvd(float *a, int m, int n, float *w, float *v)
 {
-    int flag, i, its, j, jj, k, l, nm;
+    int flag, i, its, j, jj, k, l = 0, nm = 0; // (initializing to keep compiler happy)
    double c, f, h, s, x, y, z;
    double anorm = 0.0, g = 0.0, scale = 0.0;
    double *rv1;
--- a/src/3rd_party/yaml-cpp/binary_renamed.cpp
+++ b/src/3rd_party/yaml-cpp/binary_renamed.cpp
@ -79,11 +79,11 @@ std::vector<unsigned char> DecodeBase64(const std::string &input) {

    value = (value << 6) | d;
    if (i % 4 == 3) {
-      *out++ = value >> 16;
+      *out++ = (unsigned char)(value >> 16);
      if (i > 0 && input[i - 1] != '=')
-        *out++ = value >> 8;
+        *out++ = (unsigned char)(value >> 8);
      if (input[i] != '=')
-        *out++ = value;
+        *out++ = (unsigned char)value;
    }
  }

--- a/src/3rd_party/yaml-cpp/collectionstack.h
+++ b/src/3rd_party/yaml-cpp/collectionstack.h
@ -27,7 +27,7 @@ class CollectionStack {
    collectionStack.push(type);
  }
  void PopCollectionType(CollectionType::value type) {
-    assert(type == GetCurCollectionType());
+    assert(type == GetCurCollectionType()); type;
    collectionStack.pop();
  }

--- a/src/3rd_party/yaml-cpp/emitterstate.cpp
+++ b/src/3rd_party/yaml-cpp/emitterstate.cpp
@ -98,10 +98,6 @@ EmitterNodeType::value EmitterState::NextGroupType(
    else
      return EmitterNodeType::FlowMap;
  }
-
-  // can't happen
-  assert(false);
-  return EmitterNodeType::NoType;
 }

 void EmitterState::StartedDoc() {
--- a/src/3rd_party/yaml-cpp/emitterstate.h
+++ b/src/3rd_party/yaml-cpp/emitterstate.h
@ -167,10 +167,6 @@ class EmitterState {
        else
          return EmitterNodeType::BlockMap;
      }
-
-      // can't get here
-      assert(false);
-      return EmitterNodeType::NoType;
    }
  };

--- a/src/3rd_party/yaml-cpp/node/convert.h
+++ b/src/3rd_party/yaml-cpp/node/convert.h
@ -126,6 +126,8 @@ struct convert<_Null> {
    }                                                                    \
  }

+#pragma warning(push)
+#pragma warning(disable: 4127) // conditional expression is constant (the std::numeric_limits constants in macro above)
 #define YAML_DEFINE_CONVERT_STREAMABLE_SIGNED(type) \
  YAML_DEFINE_CONVERT_STREAMABLE(type, -)

@ -148,6 +150,7 @@ YAML_DEFINE_CONVERT_STREAMABLE_UNSIGNED(unsigned char);
 YAML_DEFINE_CONVERT_STREAMABLE_SIGNED(float);
 YAML_DEFINE_CONVERT_STREAMABLE_SIGNED(double);
 YAML_DEFINE_CONVERT_STREAMABLE_SIGNED(long double);
+#pragma warning(pop)

 #undef YAML_DEFINE_CONVERT_STREAMABLE_SIGNED
 #undef YAML_DEFINE_CONVERT_STREAMABLE_UNSIGNED
--- a/src/3rd_party/yaml-cpp/node_data.cpp
+++ b/src/3rd_party/yaml-cpp/node_data.cpp
@ -91,7 +91,6 @@ std::size_t node_data::size() const {
    default:
      return 0;
  }
-  return 0;
 }

 void node_data::compute_seq_size() const {
--- a/src/3rd_party/yaml-cpp/singledocparser.cpp
+++ b/src/3rd_party/yaml-cpp/singledocparser.cpp
@ -166,10 +166,10 @@ void SingleDocParser::HandleBlockSequence(EventHandler& eventHandler) {

    // check for null
    if (!m_scanner.empty()) {
-      const Token& token = m_scanner.peek();
-      if (token.type == Token::BLOCK_ENTRY ||
-          token.type == Token::BLOCK_SEQ_END) {
-        eventHandler.OnNull(token.mark, NullAnchor);
+      const Token& token1 = m_scanner.peek();
+      if (token1.type == Token::BLOCK_ENTRY ||
+          token1.type == Token::BLOCK_SEQ_END) {
+        eventHandler.OnNull(token1.mark, NullAnchor);
        continue;
      }
    }
--- a/src/command/marian.cpp
+++ b/src/command/marian.cpp
@ -68,6 +68,8 @@ bool configureMPI(int argc, char** argv, bool sync) {
      "Your version of MPI does not support multi-threaded communication.");

  enable = true;
+#else
+  argc; argv; sync; // (unused)
 #endif
  return enable;
 }
--- a/src/common/cli_helper.h
+++ b/src/common/cli_helper.h
@ -10,7 +10,7 @@ namespace cli {

 // helper to replace environment-variable expressions of the form ${VARNAME} in
 // a string
-static std::string InterpolateEnvVars(std::string str) {
+static inline std::string InterpolateEnvVars(std::string str) {
 // temporary workaround for MS-internal PhillyOnAzure cluster: warm storage
 // presently has the form /hdfs/VC instead of /{gfs,hdfs}/CLUSTER/VC
 #if 1
--- a/src/common/compile_time_crc32.h
+++ b/src/common/compile_time_crc32.h
@ -74,7 +74,7 @@ constexpr uint32_t crc32(const char* str) {

 // This is the stop-recursion function
 template <>
-constexpr uint32_t crc32<size_t(-1)>(const char* str) {
+constexpr uint32_t crc32<size_t(-1)>(const char*) {
  return 0xFFFFFFFF;
 }

--- a/src/common/config.cpp
+++ b/src/common/config.cpp
@ -12,7 +12,7 @@ namespace marian {
 size_t Config::seed = (size_t)time(0);

 bool Config::has(const std::string& key) const {
-  return config_[key];
+  return !!config_[key];
 }

 YAML::Node Config::get(const std::string& key) const {
--- a/src/common/config.h
+++ b/src/common/config.h
@ -30,7 +30,7 @@ public:
         bool validate = false) {
    std::vector<std::string> sargv;
    utils::Split(options, sargv, " ");
-    int argc = sargv.size();
+    int argc = (int)sargv.size();

    std::vector<char*> argv(argc);
    for(int i = 0; i < argc; ++i)
@ -67,7 +67,7 @@ public:
        try {
          if(!get<bool>("ignore-model-config"))
            loadModelParameters(get<std::string>("model"));
-        } catch(std::runtime_error& e) {
+        } catch(std::runtime_error&) {
          LOG(info, "[config] No model configuration found in model file");
        }
      }
@ -76,7 +76,7 @@ public:
      try {
        if(!get<bool>("ignore-model-config"))
          loadModelParameters(model);
-      } catch(std::runtime_error& e) {
+      } catch(std::runtime_error&) {
        LOG(info, "[config] No model configuration found in model file");
      }
    }
--- a/src/common/config_parser.cpp
+++ b/src/common/config_parser.cpp
@ -56,7 +56,7 @@ uint16_t guess_terminal_width(uint16_t max_width) {
 #endif
  // couldn't determine terminal width
  if(cols == 0)
-    cols = po::options_description::m_default_line_length;
+    cols = (uint16_t)po::options_description::m_default_line_length;
  return max_width ? std::min(cols, max_width) : cols;
 }

@ -73,7 +73,7 @@ const std::set<std::string> PATHS = {"model",


 bool ConfigParser::has(const std::string& key) const {
-  return config_[key];
+  return !!config_[key];
 }

 void ConfigParser::validateOptions() const {
@ -288,6 +288,8 @@ void ConfigParser::addOptionsModel(po::options_description& desc) {
     "Tie all embedding layers and output layer")
    ("transformer-heads", po::value<int>()->default_value(8),
     "Number of heads in multi-head attention (transformer)")
+    ("transformer-dim-ffn", po::value<int>()->default_value(2048),
+     "Size of position-wise feed-forward network (transformer)")
    ("transformer-no-projection", po::value<bool>()->zero_tokens()->default_value(false),
     "Omit linear projection after multi-head attention (transformer)")
    ("transformer-dim-ffn", po::value<int>()->default_value(2048),
@ -332,6 +334,20 @@ void ConfigParser::addOptionsModel(po::options_description& desc) {
      ->multitoken(),
     "Convolution window widths in char-s2s model")
 #endif
+    // Frank's experiments
+    // Note: Don't forget to add these also in encoder_decoder.cpp, EncoderDecoder().
+    ("use-direct-sent-end-prob", po::value<bool>()->zero_tokens()->default_value(false),
+     "Enable Frank's direct sentence-end model (experimental) (transformer, requires --transformer-heads-top)")
+    ("transformer-heads-top", po::value<int>(), //->default_value(8),
+     "Number of heads in top layer, multi-head attention (transformer)")
+    ("transformer-coverage", po::value<bool>()->zero_tokens()->default_value(false),
+     "Enable Frank's coverage model, top layer only (experimental) (transformer)")
+    ("transformer-coverage-all", po::value<bool>()->zero_tokens()->default_value(false),
+     "Enable Frank's coverage model, all layers (experimental) (transformer)")
+    ("transformer-alignment-weight-heads", po::value<bool>()->zero_tokens()->default_value(false),
+     "If deriving alignment and/or coverage from multi-head, learn interpolation weights (experimental) (transformer)")
+    ("transformer-offset-embedding-range", po::value<int>()->default_value(0),
+     "Clipping range of offset embedding, 0 to disable (transformer)")
    ;

  if(mode_ == ConfigMode::training) {
@ -488,7 +504,7 @@ void ConfigParser::addOptionsTraining(po::options_description& desc) {
     "Epsilon for label smoothing (0 to disable)")
    ("clip-norm", po::value<double>()->default_value(1.f),
     "Clip gradient norm to  arg  (0 to disable)")
-    ("exponential-smoothing", po::value<float>()->default_value(0.f)->implicit_value(1e-4, "1e-4"),
+    ("exponential-smoothing", po::value<float>()->default_value(0.f)->implicit_value(1e-4f, "1e-4"),
     "Maintain smoothed version of parameters for validation and saving with smoothing factor arg. "
     " 0 to disable.")
    ("guided-alignment", po::value<std::string>(),
@ -754,7 +770,7 @@ void ConfigParser::parseOptions(int argc, char** argv, bool doValidate) {
    return str;
  };

-  bool loadConfig = vm_.count("config");
+  bool loadConfig = vm_.count("config") != 0;
  bool reloadConfig
      = (mode_ == ConfigMode::training)
        && boost::filesystem::exists(InterpolateEnvVarsIfRequested(
@ -832,6 +848,14 @@ void ConfigParser::parseOptions(int argc, char** argv, bool doValidate) {
  SET_OPTION("transformer-tied-layers", std::vector<size_t>);
  SET_OPTION("transformer-guided-alignment-layer", std::string);

+  // Frank's experiments:
+  SET_OPTION("use-direct-sent-end-prob", bool);
+  SET_OPTION_NONDEFAULT("transformer-heads-top", int);
+  SET_OPTION("transformer-coverage", bool);
+  SET_OPTION("transformer-coverage-all", bool);
+  SET_OPTION("transformer-alignment-weight-heads", bool);
+  SET_OPTION("transformer-offset-embedding-range", int);
+
 #ifdef CUDNN
  SET_OPTION("char-stride", int);
  SET_OPTION("char-highway", int);
--- a/src/common/definitions.h
+++ b/src/common/definitions.h
@ -108,7 +108,7 @@ KEY(axis, int);
 KEY(shape, Shape);
 KEY(value, float);
 KEY(fixed, bool);
-KEY(prefix, std::string);
+//KEY(prefix, std::string); // (conflicts with local variables named prefix)
 KEY(final, bool);
 KEY(output_last, bool);
 KEY(mask, Expr);
@ -132,5 +132,5 @@ KEY(valid, Ptr<RunBase>);
 KEY(lex_probs, Ptr<LexProbs>);
 }  // namespace keywords

-const float NEMATUS_LN_EPS = 1e-5;
+const float NEMATUS_LN_EPS = 1e-5f;
 }  // namespace marian
--- a/src/common/file_stream.h
+++ b/src/common/file_stream.h
@ -3,7 +3,12 @@
 #include <boost/filesystem.hpp>
 #include <boost/filesystem/fstream.hpp>
 #include <boost/iostreams/device/file_descriptor.hpp>
+#pragma warning(push)
+#pragma warning(disable: 4458) // declaration of 'traits_type' hides class member
+#pragma warning(disable: 4456) // declaration of 'c' hides previous local declaration
+#pragma warning(disable: 4244) // conversion from 'int' to 'char', possible loss of data
 #include <boost/iostreams/filter/gzip.hpp>
+#pragma warning(pop)
 #include <boost/iostreams/filtering_stream.hpp>
 #include <iostream>
 #include "3rd_party/exception.h"
--- a/src/common/io.cpp
+++ b/src/common/io.cpp
@ -83,11 +83,11 @@ void loadItemsFromNpz(const std::string& fileName, std::vector<Item>& items) {
    if(it.second->shape.size() == 1) {
      shape.resize(2);
      shape.set(0, 1);
-      shape.set(1, it.second->shape[0]);
+      shape.set(1, (size_t)it.second->shape[0]);
    } else {
      shape.resize(it.second->shape.size());
-      for(size_t i = 0; i < it.second->shape.size(); ++i)
-        shape.set(i, it.second->shape[i]);
+      for(int i = 0; i < it.second->shape.size(); ++i)
+        shape.set(i, (size_t)it.second->shape[i]);
    }

    Item item;
--- a/src/common/keywords.h
+++ b/src/common/keywords.h
@ -71,7 +71,7 @@ public:
   *
   * @arg val The value to store in this object
   */
-  Keyword(Value value) : value_(value) {}
+  Keyword(Value val) : value_(val) {}

  /**
   * @brief Constructs a <code>Keyword</code> with no specified value.
@ -90,8 +90,8 @@ public:
   *
   * @return  a new <code>Keyword</code> object containing the specified value
   */
-  Keyword<key, Value> operator=(Value value) const {
-    return Keyword<key, Value>(value);
+  Keyword<key, Value> operator=(Value val) const {
+    return Keyword<key, Value>(val);
  }

  /**
@ -141,22 +141,22 @@ struct True {};
 struct False {};

 template <typename Match, typename... Args>
-typename Match::value_type opt(True foo,
-                               typename Match::value_type dflt,
+typename Match::value_type opt(True /*foo*/,
+                               typename Match::value_type /*dflt*/,
                               Args... args) {
  std::tuple<Args...> t(args...);
  return std::get<Index<Match, std::tuple<Args...>>::value>(t)();
 }

 template <typename Match, typename... Args>
-typename Match::value_type opt(False foo,
+typename Match::value_type opt(False /*foo*/,
                               typename Match::value_type dflt,
-                               Args... args) {
+                               Args... /*args*/) {
  return dflt;
 }

 template <typename Match, typename... Args>
-typename Match::value_type Get(Match key,
+typename Match::value_type Get(Match /*key*/,
                               typename Match::value_type dflt,
                               Args... args) {
  constexpr bool match = is_one_of<Match, Args...>::value;
@ -165,7 +165,7 @@ typename Match::value_type Get(Match key,
 }

 template <typename Match, typename... Args>
-constexpr bool Has(Match key, Args... args) {
+constexpr bool Has(Match /*key*/, Args... args) {
  return is_one_of<Match, Args...>::value;
 }

--- a/src/common/options.h
+++ b/src/common/options.h
@ -74,6 +74,6 @@ public:
      return defaultValue;
  }

-  bool has(const std::string& key) const { return options_[key]; }
+  bool has(const std::string& key) const { return !!options_[key]; }
 };
 }  // namespace marian
--- a/src/common/shape.h
+++ b/src/common/shape.h
@ -38,7 +38,10 @@ public:
  const int* data() const { return shape_.data(); }
  int* data() { return shape_.data(); }

-  inline void set(int i, int val) { dim(i) = val; }
+  inline void set(int    i, int val) { dim(i) = val; }
+  inline void set(size_t i, int val) { dim(i) = val; }
+  inline void set(int    i, size_t val) { dim(i) = (int)val; }
+  inline void set(size_t i, size_t val) { dim(i) = (int)val; }

  inline int& dim(int i) {
    if(i >= 0) {
@ -55,20 +58,24 @@ public:
      return shape_[size() + i];
    }
  }
-
  inline const int& dim(int i) const {
    return const_cast<Shape&>(*this).dim(i);
  }

+  inline       int& dim(size_t i)       { return dim(int(i)); }
+  inline const int& dim(size_t i) const { return dim(int(i)); }
+
  inline int operator[](int i) const { return dim(i); }
-  inline int operator[](int i) { return dim(i); }
+  inline int operator[](int i)       { return dim(i); }
+  inline int operator[](size_t i) const { return dim(i); }
+  inline int operator[](size_t i)       { return dim(i); }

  inline int back() const { return shape_.back(); }
  inline int& back() { return shape_.back(); }

  inline int stride(int i) const {
    std::vector<int> stride(shape_.size(), 1);
-    for(int j = shape_.size() - 2; j >= 0; --j)
+    for(int j = (int)shape_.size() - 2; j >= 0; --j)
      stride[j] = stride[j + 1] * shape_[j + 1];

    if(i >= 0)
@ -88,7 +95,7 @@ public:
    d.resize(shape_.size());

    std::vector<int> stride(shape_.size(), 1);
-    for(int j = shape_.size() - 2; j >= 0; --j)
+    for(int j = (int)shape_.size() - 2; j >= 0; --j)
      stride[j] = stride[j + 1] * shape_[j + 1];

    for(size_t j = 0; j < d.size(); ++j)
@ -116,7 +123,7 @@ public:
  std::string toString() const {
    std::stringstream strm;
    strm << "shape=" << (*this)[0];
-    for(size_t i = 1; i < size(); ++i)
+    for(int i = 1; i < size(); ++i)
      strm << "x" << (*this)[i];
    strm << " size=" << elements();
    return strm.str();
@ -135,7 +142,7 @@ public:

  int axis(int ax) const {
    if(ax < 0)
-      return size() + ax;
+      return (int)size() + ax;
    else
      return ax;
  }
--- a/src/common/types.h
+++ b/src/common/types.h
@ -40,11 +40,11 @@ static inline size_t sizeOf(Type type) {
 }

 static inline bool isSignedInt(Type type) {
-  return TypeClass::signed_type & type;
+  return (TypeClass::signed_type & type) != 0;
 }

 static inline bool isUnsignedInt(Type type) {
-  return TypeClass::unsigned_type & type;
+  return (TypeClass::unsigned_type & type) != 0;
 }

 static inline bool isInt(Type type) {
@ -52,7 +52,7 @@ static inline bool isInt(Type type) {
 }

 static inline bool isFloat(Type type) {
-  return TypeClass::float_type & type;
+  return (TypeClass::float_type & type) != 0;
 }

 template <typename T>
--- a/src/data/batch.h
+++ b/src/data/batch.h
@ -10,7 +10,7 @@ namespace data {
 class Batch {
 public:
  virtual size_t size() const = 0;
-  virtual size_t words(int which = 0) const { return 0; };
+  virtual size_t words(int /*which*/ = 0) const { return 0; };
  virtual size_t width() const { return 0; };

  virtual size_t sizeTrg() const { return 0; };
--- a/src/data/batch_generator.h
+++ b/src/data/batch_generator.h
@ -110,7 +110,7 @@ private:
    while(!maxiBatch->empty()) {
      // push item onto batch
      batchVector.push_back(maxiBatch->top());
-      currentWords += batchVector.back()[0].size();
+      currentWords += (int)batchVector.back()[0].size();
      maxiBatch->pop();

      // Batch size based on sentences
--- a/src/data/corpus.h
+++ b/src/data/corpus.h
@ -67,7 +67,7 @@ public:
        maxDims.resize(ex.size(), 0);
      for(size_t i = 0; i < ex.size(); ++i) {
        if(ex[i].size() > (size_t)maxDims[i])
-          maxDims[i] = ex[i].size();
+          maxDims[i] = (int)ex[i].size();
      }
      sentenceIds.push_back(ex.getId());
    }
--- a/src/data/corpus_base.cpp
+++ b/src/data/corpus_base.cpp
@ -216,9 +216,9 @@ void CorpusBase::addWeightsToSentenceTuple(const std::string& line,

 void CorpusBase::addAlignmentsToBatch(Ptr<CorpusBatch> batch,
                                      const std::vector<sample>& batchVector) {
-  int srcWords = batch->front()->batchWidth();
-  int trgWords = batch->back()->batchWidth();
-  int dimBatch = batch->getSentenceIds().size();
+  int srcWords = (int)batch->front()->batchWidth();
+  int trgWords = (int)batch->back()->batchWidth();
+  int dimBatch = (int)batch->getSentenceIds().size();

  std::vector<float> aligns(srcWords * dimBatch * trgWords, 0.f);

@ -235,8 +235,8 @@ void CorpusBase::addAlignmentsToBatch(Ptr<CorpusBatch> batch,

 void CorpusBase::addWeightsToBatch(Ptr<CorpusBatch> batch,
                                   const std::vector<sample>& batchVector) {
-  int dimBatch = batch->size();
-  int trgWords = batch->back()->batchWidth();
+  int dimBatch = (int)batch->size();
+  int trgWords = (int)batch->back()->batchWidth();

  auto sentenceLevel
      = options_->get<std::string>("data-weighting-type") == "sentence";
--- a/src/data/corpus_base.h
+++ b/src/data/corpus_base.h
@ -122,7 +122,7 @@ public:
   * @param size Number of sentences
   * @param width Number of words in the longest sentence
   */
-  SubBatch(int size, int width, const Ptr<Vocab>& vocab)
+  SubBatch(size_t size, size_t width, const Ptr<Vocab>& vocab)
      : indices_(size * width, 0),
        mask_(size * width, 0),
        size_(size),
@ -176,31 +176,31 @@ public:
    ABORT_IF(size_ == 0, "Encoutered sub-batch size of 0");

    std::vector<Ptr<SubBatch>> splits;
-    size_t subSize = std::ceil(size_ / (float)n);
+    size_t subSize = size_t(std::ceil(size_ / (float)n));

    size_t restSize = size_;
    size_t pos = 0;
    for(size_t k = 0; k < n; ++k) {
-      size_t __size__ = std::min(subSize, restSize);
-      if(__size__ > 0) {
-        auto sb = New<SubBatch>(__size__, width_, vocab_);
+      size_t size = std::min(subSize, restSize);
+      if(size > 0) {
+        auto sb = New<SubBatch>(size, width_, vocab_);

-        size_t __words__ = 0;
+        size_t words = 0;
        for(size_t j = 0; j < width_; ++j) {
-          for(size_t i = 0; i < __size__; ++i) {
-            sb->data()[j * __size__ + i] = indices_[j * size_ + pos + i];
-            sb->mask()[j * __size__ + i] = mask_[j * size_ + pos + i];
+          for(size_t i = 0; i < size; ++i) {
+            sb->data()[j * size + i] = indices_[j * size_ + pos + i];
+            sb->mask()[j * size + i] = mask_[j * size_ + pos + i];

            if(mask_[j * size_ + pos + i] != 0)
-              __words__++;
+              words++;
          }
        }

-        sb->setWords(__words__);
+        sb->setWords(words);
        splits.push_back(sb);

-        restSize -= __size__;
-        pos += __size__;
+        restSize -= size;
+        pos += size;
      }
    }
    return splits;
@ -309,7 +309,7 @@ public:
      // set word indices to different values to avoid same hashes
      std::fill(sb->data().begin(), sb->data().end(), idx++);
      // mask: no items are being masked out
-      std::fill(sb->mask().begin(), sb->mask().end(), 1);
+      std::fill(sb->mask().begin(), sb->mask().end(), 1.f);

      batches.push_back(sb);
    }
@ -326,7 +326,7 @@ public:
    }

    if(options->has("data-weighting")) {
-      int weightsSize = batchSize;
+      auto weightsSize = batchSize;
      if(options->get<std::string>("data-weighting-type") != "sentence")
        weightsSize *= lengths.back();
      std::vector<float> weights(weightsSize, 1.f);
--- a/src/data/corpus_nbest.h
+++ b/src/data/corpus_nbest.h
@ -58,7 +58,7 @@ public:
        maxDims.resize(ex.size(), 0);
      for(size_t i = 0; i < ex.size(); ++i) {
        if(ex[i].size() > (size_t)maxDims[i])
-          maxDims[i] = ex[i].size();
+          maxDims[i] = (int)ex[i].size();
      }
      sentenceIds.push_back(ex.getId());
    }
--- a/src/data/dataset.h
+++ b/src/data/dataset.h
@ -82,7 +82,7 @@ public:

  void push_back(Input input) { inputs_.push_back(input); }

-  virtual std::vector<Ptr<Batch>> split(size_t n) override { ABORT("Not implemented"); }
+  virtual std::vector<Ptr<Batch>> split(size_t /*n*/) override { ABORT("Not implemented"); }

  Data& features() { return inputs_[0].data(); }

@ -115,7 +115,7 @@ public:
  void shuffle() override { std::shuffle(examples_.begin(), examples_.end(), eng_); }

  batch_ptr toBatch(const Examples& batchVector) override {
-    int batchSize = batchVector.size();
+    int batchSize = (int)batchVector.size();

    std::vector<int> maxDims;
    for(auto& ex : batchVector) {
@ -123,7 +123,7 @@ public:
        maxDims.resize(ex.size(), 0);
      for(size_t i = 0; i < ex.size(); ++i) {
        if(ex[i].size() > (size_t)maxDims[i])
-          maxDims[i] = ex[i].size();
+          maxDims[i] = (int)ex[i].size();
      }
    }

--- a/src/data/rng_engine.h
+++ b/src/data/rng_engine.h
@ -16,8 +16,8 @@ protected:
  std::mt19937 eng_;

 public:
-  RNGEngine() : eng_(Config::seed) {}
-  RNGEngine(size_t eng) : eng_(eng) {}
+  RNGEngine() : eng_((unsigned int)Config::seed) {}
+  RNGEngine(size_t eng) : eng_((unsigned int)eng) {}

  std::string getRNGState() {
    std::ostringstream oss;
--- a/src/data/shortlist.h
+++ b/src/data/shortlist.h
@ -79,7 +79,7 @@ public:
      for(auto i : srcBatch->data())
        idxSet.insert(i);

-    std::uniform_int_distribution<> dis(firstNum_, maxVocab_);
+    std::uniform_int_distribution<> dis((int)firstNum_, (int)maxVocab_);
    while(idxSet.size() < total_ && idxSet.size() < maxVocab_)
      idxSet.insert(dis(gen_));

--- a/src/data/text_input.h
+++ b/src/data/text_input.h
@ -66,7 +66,7 @@ public:
        maxDims.resize(ex.size(), 0);
      for(size_t i = 0; i < ex.size(); ++i) {
        if(ex[i].size() > (size_t)maxDims[i])
-          maxDims[i] = ex[i].size();
+          maxDims[i] = (int)ex[i].size();
      }
      sentenceIds.push_back(ex.getId());
    }
--- a/src/data/vocab.h
+++ b/src/data/vocab.h
@ -51,8 +51,8 @@ private:
  typedef std::vector<std::string> Id2Str;
  Id2Str id2str_;

-  Word eosId_ = -1;
-  Word unkId_ = -1;
+  Word eosId_ = (Word)-1;
+  Word unkId_ = (Word)-1;

  class VocabFreqOrderer;
 };
--- a/src/functional/approx.h
+++ b/src/functional/approx.h
@ -59,7 +59,7 @@ struct Approx {
    if(x <= -radius)
      return 0;
    if(x < radius)  // +1 because 0 holds value for x < -radius
-      return (x + radius - offset) / ((2.f * radius) / pieces) + 1;
+      return int((x + radius - offset) / ((2.f * radius) / pieces) + 1);
    return pieces + 1;
  }

--- a/src/functional/operands.h
+++ b/src/functional/operands.h
@ -13,7 +13,7 @@ using IsClass = typename std::enable_if<std::is_class<C>::value, C>::type;
 template <int N>
 struct Select {
  template <typename T, typename... Args>
-  __HDI__ static auto apply(T&& arg, Args&&... args)
+  __HDI__ static auto apply(T&& /*arg*/, Args&&... args)
      -> decltype(Select<N - 1>::apply(args...)) {
    return Select<N - 1>::apply(args...);
  }
@ -22,7 +22,7 @@ struct Select {
 template <>
 struct Select<0> {
  template <typename T, typename... Args>
-  __HDI__ static T apply(T&& arg, Args&&... args) {
+  __HDI__ static T apply(T&& arg, Args&&... /*args*/) {
    return arg;
  }
 };
@ -49,7 +49,7 @@ struct Capture {
  Capture(float val) : value(val){};

  template <typename... Args>
-  __HDI__ float operator()(Args&&... args) {
+  __HDI__ float operator()(Args&&... /*args*/) {
    return value;
  }

--- a/src/functional/predicates.h
+++ b/src/functional/predicates.h
@ -51,10 +51,10 @@ struct BinaryFunctor {
  template <class X>                                     \
  using name = UnaryFunctor<elem::name, X>;              \
  template <typename X>                                  \
-  name<IsClass<X>> name2(X x) {                          \
+  static inline name<IsClass<X>> name2(X x) {            \
    return name<X>(x);                                   \
  }                                                      \
-  static name<Capture> name2(Capture x) { return name<Capture>(x); }
+  static inline name<Capture> name2(Capture x) { return name<Capture>(x); }

 #define BINARY(name, name2, func)                                 \
  namespace elem {                                                \
@ -120,7 +120,7 @@ BINARY(Or, operator||, x || y);

 template <typename T>
 __HDI__ T sgn(T val) {
-  return (float(0) < val) - (val < float(0));
+  return T((0 < val) - (val < 0));
 }

 UNARY(Sgn, sgn, sgn(x));
--- a/src/functional/shape.h
+++ b/src/functional/shape.h
@ -89,7 +89,7 @@ struct ConstantShape {

  __HDI__ static constexpr size_t size() { return N; }

-  __HDI__ int elements() const { return elements_; }
+  __HDI__ int elements() const { return (int)elements_; }

  __HDI__ int index(const Array<int, N>& d) const {
    int i = 0;
--- a/src/graph/chainable.h
+++ b/src/graph/chainable.h
@ -51,7 +51,8 @@ class ExpressionGraph;
 *   or formally \f$\bar{w}_i = \frac{\partial y}{\partial w_i}\f$
 */
 template <class DataType>
-struct Chainable {
+class Chainable {
+public:
  Chainable() {}
  virtual ~Chainable(){};

--- a/src/graph/expression_graph.h
+++ b/src/graph/expression_graph.h
@ -203,7 +203,7 @@ public:
      tensors_->throwAtReallocation(true);
      backprop();
      tensors_->throwAtReallocation(false);
-    } catch(AllocationException& e) {
+    } catch(AllocationException&) {
      tensors_->throwAtReallocation(false);
      return false;
    }
--- a/src/graph/expression_operators.cpp
+++ b/src/graph/expression_operators.cpp
@ -353,12 +353,12 @@ Expr affine(Expr a, Expr b, Expr bias, bool transA, bool transB, float scale) {
 // swap the last two axes
 Expr transpose(Expr a) {
  std::vector<int> axes(a->shape().size());
-  for(size_t i = 0; i < axes.size(); ++i) {
+  for(int i = 0; i < (int)axes.size(); ++i) {  // cast keeps the comparison signed vs. signed
    axes[i] = i;
  }
  if(axes.size() > 1) {
-    axes[axes.size() - 1] = axes.size() - 2;
-    axes[axes.size() - 2] = axes.size() - 1;
+    axes[axes.size() - 1] = (int)axes.size() - 2;
+    axes[axes.size() - 2] = (int)axes.size() - 1;
  }
  return Expression<TransposeNodeOp>(a, axes);
 }
@ -405,7 +405,7 @@ Expr leakyrelu(const std::vector<Expr>&) {
  ABORT("Not implemented");
 }

-Expr prelu(const std::vector<Expr>&, float alpha) {
+Expr prelu(const std::vector<Expr>&, float /*alpha*/) {
  ABORT("Not implemented");
 }

--- a/src/graph/node_initializers.cpp
+++ b/src/graph/node_initializers.cpp
@ -23,7 +23,7 @@ float xor128() {
  y = z;
  z = w;
  w = (w ^ (w >> 19) ^ t ^ (t >> 8)) % 1000;
-  return 0.1 * ((w % 1000) / 1000.f) - 0.05;
+  return 0.1f * ((w % 1000) / 1000.f) - 0.05f;
 }

 void zeros(Tensor t) {
@ -50,7 +50,7 @@ NodeInitializer diag(float val) {
  };
 }

-NodeInitializer normal(float scale, bool ortho /*= true*/) {
+NodeInitializer normal(float scale, bool /*ortho*/ /*= true*/) {
  return [scale](Tensor t) {
    distribution<std::normal_distribution<float>>(t, 0, scale);
  };
@ -113,7 +113,10 @@ NodeInitializer from_vector(const std::vector<float>& v) {
 }

 NodeInitializer from_vector(const std::vector<size_t>& v) {
-  std::vector<float> vf(v.begin(), v.end());
+  auto n = v.size();
+  std::vector<float> vf(n);
+  for (size_t i = 0; i < n; i++)
+    vf[i] = (float)v[i];
  return from_vector(vf);
 }

--- a/src/graph/node_initializers.h
+++ b/src/graph/node_initializers.h
@ -28,7 +28,7 @@ NodeInitializer diag(float val);

 template <class Distribution, class Iterator>
 void distribution(Iterator begin, Iterator end, float a, float b) {
-  std::default_random_engine engine(Config::seed++);
+  std::default_random_engine engine((unsigned int)Config::seed++);
  Distribution dist(a, b);
  auto gen = std::bind(dist, engine);
  std::generate(begin, end, gen);
--- a/src/graph/node_operators_binary.h
+++ b/src/graph/node_operators_binary.h
--- a/src/graph/parameters.h
+++ b/src/graph/parameters.h
@ -119,12 +119,10 @@ public:

  virtual Tensor vals() override {
    ABORT("Not implemented for memory-mapped parameters");
-    return nullptr;
  }

  virtual Tensor grads() override {
    ABORT("Not implemented for memory-mapped parameters");
-    return nullptr;
  }

  virtual void clear() override {
--- a/src/layers/generic.h
+++ b/src/layers/generic.h
@ -155,7 +155,7 @@ public:
    return affine(input, W_, b_, false, transposeW_);
  }

-  virtual Expr apply(const std::vector<Expr>& inputs) override {
+  virtual Expr apply(const std::vector<Expr>& /*inputs*/) override {
    ABORT("Not implemented");
  };
 };
--- a/src/layers/guided_alignment.h
+++ b/src/layers/guided_alignment.h
@ -38,13 +38,13 @@ static inline Expr guidedAlignmentCost(Ptr<ExpressionGraph> graph,
  }

  Expr alnCost;
-  float eps = 1e-6;
+  float epsilon = 1e-6f;
  if(guidedCostType == "mse") {
-    alnCost = sum(flatten(square(att - aln))) / (2 * div);
+    alnCost = sum(flatten(square(att - aln))) / (float)(2 * div);
  } else if(guidedCostType == "mult") {
-    alnCost = -log(sum(flatten(att * aln)) + eps) / div;
+    alnCost = -log(sum(flatten(att * aln)) + epsilon) / (float)div;
  } else if(guidedCostType == "ce") {
-    alnCost = -sum(flatten(aln * log(att + eps))) / div;
+    alnCost = -sum(flatten(aln * log(att + epsilon))) / (float)div;
  } else {
    ABORT("Unknown alignment cost type");
  }
--- a/src/layers/loss.cpp
+++ b/src/layers/loss.cpp
@ -24,13 +24,11 @@ Expr LossBase::getCrossEntropy(Expr logits,
                               Expr indices,
                               Expr mask,
                               Expr weights) {
-  using namespace keywords;
-
  auto ce = cross_entropy(logits, indices);

  if(smoothing_ > 0) {
    // @TODO: add this to CE kernels instead
-    auto ceq = mean(logsoftmax(logits), axis = -1);
+    auto ceq = mean(logsoftmax(logits), /*axis=*/ -1);
    ce = (1 - smoothing_) * ce - smoothing_ * ceq;
  }

--- a/src/layers/weight.cpp
+++ b/src/layers/weight.cpp
@ -13,8 +13,8 @@ Expr DataWeighting::getWeights(Ptr<ExpressionGraph> graph,
  ABORT_IF(batch->getDataWeights().empty(),
           "Vector of weights is unexpectedly empty!");
  bool sentenceWeighting = weightingType_ == "sentence";
-  int dimBatch = batch->size();
-  int dimWords = sentenceWeighting ? 1 : batch->back()->batchWidth();
+  int dimBatch = (int)batch->size();
+  int dimWords = sentenceWeighting ? 1 : (int)batch->back()->batchWidth();
  auto weights = graph->constant({1, dimWords, dimBatch, 1},
                                 inits::from_vector(batch->getDataWeights()));
  return weights;
--- a/src/models/amun.h
+++ b/src/models/amun.h
@ -37,9 +37,7 @@ public:

  void load(Ptr<ExpressionGraph> graph,
            const std::string& name,
-            bool markedReloaded = true) override {
-    using namespace keywords;
-
+            bool /*markedReloaded*/ = true) override {
    std::map<std::string, std::string> nameMap
        = {{"decoder_U", "decoder_cell1_U"},
           {"decoder_Ux", "decoder_cell1_Ux"},
--- a/src/models/costs.h
+++ b/src/models/costs.h
@ -175,9 +175,9 @@ public:
    return cost_->apply(nextState);
  }

-  virtual Expr build(Ptr<ExpressionGraph> graph,
-                     Ptr<data::CorpusBatch> batch,
-                     bool clearGraph = true) override {
+  virtual Expr build(Ptr<ExpressionGraph> /*graph*/,
+                     Ptr<data::CorpusBatch> /*batch*/,
+                     bool /*clearGraph*/ = true) override {
    ABORT("Wrong wrapper. Use models::Trainer or models::Scorer");
    return nullptr;
  }
--- a/src/models/decoder.h
+++ b/src/models/decoder.h
@ -60,8 +60,8 @@ public:
    auto yEmb = yEmbFactory.construct();

    auto subBatch = (*batch)[batchIndex_];
-    int dimBatch = subBatch->batchSize();
-    int dimWords = subBatch->batchWidth();
+    int dimBatch = (int)subBatch->batchSize();
+    int dimWords = (int)subBatch->batchWidth();

    auto chosenEmbeddings = rows(yEmb, subBatch->data());

@ -119,7 +119,7 @@ public:
    state->setTargetEmbeddings(selectedEmbs);
  }

-  virtual const std::vector<Expr> getAlignments(int i = 0) { return {}; };
+  virtual const std::vector<Expr> getAlignments(int /*i*/ = 0) { return {}; };

  virtual Ptr<data::Shortlist> getShortlist() { return shortlist_; }
  virtual void setShortlist(Ptr<data::Shortlist> shortlist) {
--- a/src/models/encoder.h
+++ b/src/models/encoder.h
@ -21,9 +21,9 @@ protected:

    auto subBatch = (*batch)[batchIndex_];

-    int dimBatch = subBatch->batchSize();
+    int dimBatch = (int)subBatch->batchSize();
    int dimEmb = srcEmbeddings->shape()[-1];
-    int dimWords = subBatch->batchWidth();
+    int dimWords = (int)subBatch->batchWidth();

    auto chosenEmbeddings = rows(srcEmbeddings, subBatch->data());

--- a/src/models/hardatt.h
+++ b/src/models/hardatt.h
@ -215,7 +215,7 @@ public:
    Expr logits;
    if(type == "hard-soft-att") {
      std::vector<Expr> alignedContexts;
-      for(size_t k = 0; k < state->getEncoderStates().size(); ++k) {
+      for(int k = 0; k < (int)state->getEncoderStates().size(); ++k) {  // avoid signed/unsigned comparison
        // retrieve all the aligned contexts computed by the attention mechanism
        auto att = rnn_->at(0)
                       ->as<rnn::StackedCell>()
@ -257,8 +257,8 @@ public:
    DecoderBase::embeddingsFromBatch(graph, state, batch);

    auto subBatch = (*batch)[batchIndex_];
-    int dimBatch = subBatch->batchSize();
-    int dimWords = subBatch->batchWidth();
+    int dimBatch = (int)subBatch->batchSize();
+    int dimWords = (int)subBatch->batchWidth();

    std::vector<size_t> attentionIndices(dimBatch, 0);
    std::vector<size_t> currentPos(dimBatch, 0);
--- a/src/models/nematus.h
+++ b/src/models/nematus.h
@ -29,7 +29,7 @@ public:

  void load(Ptr<ExpressionGraph> graph,
            const std::string& name,
-            bool markedReloaded = true) override {
+            bool /*markedReloaded*/ = true) override {
    graph->load(name, nameMap_);
  }

--- a/src/models/s2s.h
+++ b/src/models/s2s.h
@ -274,7 +274,7 @@ public:

      start = mlp->apply(meanContexts);
    } else {
-      int dimBatch = batch->size();
+      int dimBatch = (int)batch->size();
      int dimRnn = opt<int>("dim-rnn");

      start = graph->constant({dimBatch, dimRnn}, inits::zeros);
@ -309,7 +309,7 @@ public:
    rnn::States decoderStates = rnn_->lastCellStates();

    std::vector<Expr> alignedContexts;
-    for(size_t k = 0; k < state->getEncoderStates().size(); ++k) {
+    for(int k = 0; k < (int)state->getEncoderStates().size(); ++k) {  // avoid signed/unsigned comparison
      // retrieve all the aligned contexts computed by the attention mechanism
      auto att = rnn_->at(0)
                     ->as<rnn::StackedCell>()
@ -337,7 +337,7 @@ public:

      int dimTrgVoc = opt<std::vector<int>>("dim-vocabs")[batchIndex_];

-      auto final = mlp::output(graph)           //
+      auto last = mlp::output(graph)           //
          ("prefix", prefix_ + "_ff_logit_l2")  //
          ("dim", dimTrgVoc);

@ -345,17 +345,17 @@ public:
        std::string tiedPrefix = prefix_ + "_Wemb";
        if(opt<bool>("tied-embeddings-all") || opt<bool>("tied-embeddings-src"))
          tiedPrefix = "Wemb";
-        final.tie_transposed("W", tiedPrefix);
+        last.tie_transposed("W", tiedPrefix);
      }

      if(shortlist_)
-        final.set_shortlist(shortlist_);
+        last.set_shortlist(shortlist_);

      // assemble layers into MLP and apply to embeddings, decoder context and
      // aligned source context
      output_ = mlp::mlp(graph)         //
                    .push_back(hidden)  //
-                    .push_back(final)
+                    .push_back(last)
                    .construct();
    }

--- a/src/models/transformer.h
+++ b/src/models/transformer.h
@ -51,7 +51,7 @@ public:
    int dimEmb   = input->shape()[-1];
    int dimWords = input->shape()[-3];

-    float num_timescales = dimEmb / 2;
+    float num_timescales = (float)(dimEmb / 2); // halve as int first (pre-patch behaviour); (float)dimEmb / 2 gives x.5 for odd dimEmb
    float log_timescale_increment = std::log(10000.f) / (num_timescales - 1.f);

    std::vector<float> vPos(dimEmb * dimWords, 0);
@ -59,7 +59,7 @@ public:
      for(int i = 0; i < num_timescales; ++i) {
        float v = p * std::exp(i * -log_timescale_increment);
        vPos[(p - start) * dimEmb + i] = std::sin(v);
-        vPos[(p - start) * dimEmb + num_timescales + i] = std::cos(v);
+        vPos[(p - start) * dimEmb + (int)num_timescales + i] = std::cos(v); // @TODO: is int vs. float correct for num_timescales?
      }
    }

@ -134,7 +134,7 @@ public:
    int dimModel = x->shape()[-1];
    auto scale = graph_->param(prefix + "_ln_scale" + suffix, { 1, dimModel }, inits::ones);
    auto bias  = graph_->param(prefix + "_ln_bias"  + suffix, { 1, dimModel }, inits::zeros);
-    return marian::layerNorm(x, scale, bias, 1e-6);
+    return marian::layerNorm(x, scale, bias, 1e-6f);
  }

  Expr preProcess(std::string prefix, std::string ops, Expr input, float dropProb = 0.0f) const {
@ -212,7 +212,7 @@ public:
    // time steps and batch entries), also add mask for illegal connections

    // multiplicative attention with flattened softmax
-    float scale = 1.0 / std::sqrt((float)dk); // scaling to avoid extreme values due to matrix multiplication
+    float scale = 1.0f / std::sqrt((float)dk); // scaling to avoid extreme values due to matrix multiplication
    auto z = bdot(q, k, false, true, scale); // [-4: beam depth * batch size, -3: num heads, -2: max tgt length, -1: max src length]

    // mask out garbage beyond end of sequences
@ -425,7 +425,7 @@ public:
    auto output = input;
    if(startPos > 0) {
      // we are decoding at a position after 0
-      output = (prevDecoderState.output * startPos + input) / (startPos + 1);
+      output = (prevDecoderState.output * (float)startPos + input) / float(startPos + 1);
    }
    else if(startPos == 0 && output->shape()[-2] > 1) {
      // we are training or scoring, because there is no history and
@ -444,7 +444,7 @@ public:
                       std::string prefix,
                       Expr input,
                       Expr selfMask,
-                       int startPos) const {
+                       int /*startPos*/) const {
    float dropoutRnn = inference_ ? 0.f : opt<float>("dropout-rnn");

    auto rnn = rnn::rnn(graph_)                                    //
@ -479,7 +479,7 @@ public:

  // returns the embedding matrix based on options
  // and based on batchIndex_.
-  Expr wordEmbeddings(int subBatchIndex) const {
+  Expr wordEmbeddings(size_t subBatchIndex) const {
    // standard encoder word embeddings

    int dimVoc = opt<std::vector<int>>("dim-vocabs")[subBatchIndex];
@ -513,8 +513,8 @@ public:

  Ptr<EncoderState> apply(Ptr<data::CorpusBatch> batch) {
    int dimEmb = opt<int>("dim-emb");
-    int dimBatch = batch->size();
-    int dimSrcWords = (*batch)[batchIndex_]->batchWidth();
+    int dimBatch = (int)batch->size();
+    int dimSrcWords = (int)(*batch)[batchIndex_]->batchWidth();

    auto embeddings = wordEmbeddings(batchIndex_); // embedding matrix, considering tying and some other options

@ -531,7 +531,7 @@ public:
    }

    // according to paper embeddings are scaled up by \sqrt(d_m)
-    auto scaledEmbeddings = std::sqrt(dimEmb) * batchEmbeddings;
+    auto scaledEmbeddings = std::sqrt((float)dimEmb) * batchEmbeddings;

    scaledEmbeddings = addPositionalEmbeddings(scaledEmbeddings);

@ -637,7 +637,7 @@ public:

    std::string layerType = opt<std::string>("transformer-decoder-autoreg", "self-attention");
    if (layerType == "rnn") {
-      int dimBatch = batch->size();
+      int dimBatch = (int)batch->size();
      int dim = opt<int>("dim-emb");

      auto start = graph->constant({1, 1, dimBatch, dim}, inits::zeros);
@ -678,12 +678,12 @@ public:
      dimBeam = embeddings->shape()[-4];

    // according to paper embeddings are scaled by \sqrt(d_m)
-    auto scaledEmbeddings = std::sqrt(dimEmb) * embeddings;
+    auto scaledEmbeddings = std::sqrt((float)dimEmb) * embeddings;

    // set current target token position during decoding or training. At training
    // this should be 0. During translation the current length of the translation.
    // Used for position embeddings and creating new decoder states.
-    int startPos = state->getPosition();
+    int startPos = (int)state->getPosition();

    scaledEmbeddings
      = addPositionalEmbeddings(scaledEmbeddings, startPos);
@ -828,7 +828,7 @@ public:

  // helper function for guided alignment
  // @TODO: const vector<> seems wrong. Either make it non-const or a const& (more efficient but dangerous)
-  virtual const std::vector<Expr> getAlignments(int i = 0) override {
+  virtual const std::vector<Expr> getAlignments(int /*i*/ = 0) override {
    return alignments_;
  }

--- a/src/optimizers/optimizers.cpp
+++ b/src/optimizers/optimizers.cpp
@ -19,7 +19,7 @@ void Adagrad::updateImpl(Tensor params, Tensor grads) {
    alloc_ = New<TensorAllocator>(params->getBackend());

  if(!gt_) {
-    int elements = params->size();
+    int elements = (int)params->size();
    alloc_->reserveExact(params->memory()->size());
    alloc_->allocate(gt_, {1, elements});
    gt_->set(0.f);
@ -68,13 +68,13 @@ void Adagrad::load(const std::string& name,
  }

  // get the size of params which should go
-  size_t shardSize = ceil(totalSize / (float)backends.size());
+  size_t shardSize = (totalSize + backends.size() - 1) / backends.size(); // exact integer ceiling; float drops low bits above 2^24 elements

  size_t id = 0;
  for(auto optBase : opts) {
    auto opt = std::dynamic_pointer_cast<Adagrad>(optBase);

-    int size = std::min(shardSize, totalSize);
+    int size = (int)std::min(shardSize, totalSize);
    totalSize -= size;

    if(!opt->alloc_)
@ -95,7 +95,7 @@ void Adagrad::load(const std::string& name,

 void Adagrad::save(const std::string& name,
                   std::vector<Ptr<OptimizerBase>> opts,
-                   size_t totalSize) {
+                   size_t /*totalSize*/) {
  LOG(info, "Saving Adagrad parameters to {}", name);

  std::vector<float> vGt;
@ -130,7 +130,7 @@ void Adam::updateImpl(Tensor params, Tensor grads) {
    alloc_ = New<TensorAllocator>(params->getBackend());

  if(!mt_) {
-    int elements = params->size();
+    int elements = (int)params->size();
    alloc_->reserveExact(2 * params->memory()->size());
    alloc_->allocate(mt_, {1, elements});
    mt_->set(0.f);
@ -140,8 +140,8 @@ void Adam::updateImpl(Tensor params, Tensor grads) {
  }

  t_++;
-  float denom1 = 1 - std::pow(beta1_, t_);
-  float denom2 = 1 - std::pow(beta2_, t_);
+  float denom1 = 1 - (float)std::pow(beta1_, t_);
+  float denom2 = 1 - (float)std::pow(beta2_, t_);

  using namespace functional;

@ -193,13 +193,13 @@ void Adam::load(const std::string& name,
  }

  // get the size of params which should go
-  size_t shardSize = ceil(totalSize / (float)backends.size());
+  size_t shardSize = (totalSize + backends.size() - 1) / backends.size(); // exact integer ceiling; float drops low bits above 2^24 elements

  size_t id = 0;
  for(auto optBase : opts) {
    auto opt = std::dynamic_pointer_cast<Adam>(optBase);

-    int size = std::min(shardSize, totalSize);
+    int size = (int)std::min(shardSize, totalSize);
    totalSize -= size;

    if(!opt->alloc_)
@ -223,7 +223,7 @@ void Adam::load(const std::string& name,

 void Adam::save(const std::string& name,
                std::vector<Ptr<OptimizerBase>> opts,
-                size_t totalSize) {
+                size_t /*totalSize*/) {
  LOG(info, "Saving Adam parameters to {}", name);

  std::vector<float> vMt;
@ -267,13 +267,13 @@ void Adam::resetStats() {
 }

 Ptr<OptimizerBase> Optimizer(Ptr<Config> options) {
-  float lrate = options->get<double>("learn-rate");
+  float lrate = (float)options->get<double>("learn-rate"); // @TODO: should this be <float>?
  auto params = options->has("optimizer-params")
                    ? options->get<std::vector<float>>("optimizer-params")
                    : std::vector<float>({});

  Ptr<ClipperBase> clipper = nullptr;
-  float clipNorm = options->get<double>("clip-norm");
+  float clipNorm = (float)options->get<double>("clip-norm"); // @TODO: should this be <float>?
  if(clipNorm > 0)
    clipper = Clipper<Norm>(clipNorm);

--- a/src/optimizers/optimizers.h
+++ b/src/optimizers/optimizers.h
@ -63,12 +63,12 @@ public:

  void setParams(const std::vector<float>& params) { parseParams(params); }

-  virtual void load(const std::string& name,
-                    std::vector<Ptr<OptimizerBase>> opts,
-                    std::vector<Ptr<Backend>> backends) {}
-  virtual void save(const std::string& name,
-                    std::vector<Ptr<OptimizerBase>> opts,
-                    size_t totalSize) {}
+  virtual void load(const std::string& /*name*/,
+                    std::vector<Ptr<OptimizerBase>> /*opts*/,
+                    std::vector<Ptr<Backend>> /*backends*/) {}
+  virtual void save(const std::string& /*name*/,
+                    std::vector<Ptr<OptimizerBase>> /*opts*/,
+                    size_t /*totalSize*/) {}

 protected:
  virtual void updateImpl(Tensor params, Tensor grads) = 0;
@ -94,7 +94,7 @@ public:
 private:
  void updateImpl(Tensor params, Tensor grads) override;

-  virtual void parseParams(const std::vector<float>& params) override {}
+  virtual void parseParams(const std::vector<float>& /*params*/) override {}
  virtual void resetStats() override {}
 };

@ -124,7 +124,7 @@ private:
      eps_ = params[0];
  }

-  float eps_ = 1e-8;
+  float eps_ = 1e-8f;
  Ptr<TensorAllocator> alloc_;
  Tensor gt_;
 };
@ -159,9 +159,9 @@ private:
      eps_ = params[2];
  }

-  float beta1_ = 0.9;
-  float beta2_ = 0.999;
-  float eps_ = 1e-8;
+  float beta1_ = 0.9f;
+  float beta2_ = 0.999f;
+  float eps_ = 1e-8f;
  size_t t_;

  Ptr<TensorAllocator> alloc_;
--- a/src/rnn/rnn.h
+++ b/src/rnn/rnn.h
@ -64,10 +64,10 @@ private:

    auto xWs = cell_->applyInput({input});

-    size_t timeSteps = input->shape()[-3];
+    auto timeSteps = input->shape()[-3];

    States outputs;
-    for(size_t i = 0; i < timeSteps; ++i) {
+    for(int i = 0; i < timeSteps; ++i) {
      int j = i;

      if(direction_ == dir::backward)
--- a/src/rnn/types.h
+++ b/src/rnn/types.h
@ -28,7 +28,7 @@ private:

    sel = atleast_4d(sel);

-    int dimBatch = selIdx.size() / beamSize;
+    int dimBatch = (int)selIdx.size() / beamSize;
    int dimDepth = sel->shape()[-1];
    int dimTime  = isBatchMajor ? sel->shape()[-2] : sel->shape()[-3];

@ -93,7 +93,7 @@ public:
 };

 class Cell;
-struct CellInput;
+class CellInput;

 class Stackable : public std::enable_shared_from_this<Stackable> {
 protected:
--- a/src/tensors/allocator.h
+++ b/src/tensors/allocator.h
@ -91,7 +91,7 @@ private:
  std::unordered_map<uint8_t*, Ptr<MemoryPiece>> allocated_;

  size_t align(size_t size) {
-    return ceil(size / (float)alignment_) * alignment_;
+    return (size + alignment_ - 1) / alignment_ * alignment_; // exact integer round-up; float math can return less than size above 2^24
  }

  void grow(size_t add) {
@ -168,7 +168,7 @@ public:
    reserve(bytes);
  }

-  Allocator(DeviceId deviceId,
+  Allocator(DeviceId /*deviceId*/,
            Ptr<Device> device,
            size_t bytes,
            size_t step,
--- a/src/tensors/cpu/add.h
+++ b/src/tensors/cpu/add.h
@ -28,7 +28,7 @@ void gAddGeneric(Functor functor,

  constexpr size_t N = functional::Shape::size();
  functional::Array<int, N> len;
-  for(size_t i = 0; i < N; ++i)
+  for(int i = 0; i < (int)N; ++i)  // N is size_t; cast avoids a signed/unsigned comparison
    len[i] = full[i] / out.shape()[i];

  functional::Array<int, N> dims;
@ -101,7 +101,7 @@ template <class Functor, class... Tensors>
 void Add(Functor functor, float scale, marian::Tensor out, Tensors... tensors) {
  auto full = marian::Shape::broadcast({out, tensors...});

-  int length = out->shape().elements();
+  //int length = out->shape().elements();

  constexpr size_t K = sizeof...(Tensors);

@ -109,8 +109,8 @@ void Add(Functor functor, float scale, marian::Tensor out, Tensors... tensors) {
  functional::Array<functional::Tensor<float>, K> gIns = {tensors...};

  if(full.back() != 1 && out->shape().back() == 1) {
-    size_t m = full.elements() / length;
-    size_t k = full.back();
+    //size_t m = full.elements() / length;
+    //size_t k = full.back();
    cpu::gAddReduce(functor, full, gOut, gIns, scale);
  } else if(out->shape() == full) {
    bool broadcast = false;
--- a/src/tensors/cpu/backend.h
+++ b/src/tensors/cpu/backend.h
@ -15,7 +15,7 @@ private:

 public:
  Backend(DeviceId deviceId, size_t seed)
-      : marian::Backend(deviceId, seed), gen_(seed_) {}
+      : marian::Backend(deviceId, seed), gen_((unsigned int)seed_) {}

  void setDevice() override {}

--- a/src/tensors/cpu/prod.cpp
+++ b/src/tensors/cpu/prod.cpp
@ -131,21 +131,21 @@ void ProdBatched(marian::Tensor C,
  auto strideA = batchA == 1 ? 0 : m * k;
  auto strideC = n * m;

-  int batchC = std::max(batchA, batchB);
-  for(int i = 0; i < batchC; ++i) {
+  auto batchC = std::max(batchA, batchB);
+  for(size_t i = 0; i < batchC; ++i) {
    sgemm(transA,
          transB,
-          m,
-          n,
-          k,
+          (int)m,
+          (int)n,
+          (int)k,
          alpha,
          A->data() + (i % batchA) * strideA,
-          lda,
+          (int)lda,
          B->data() + (i % batchB) * strideB,
-          ldb,
+          (int)ldb,
          beta,
          C->data() + i * strideC,
-          ldc);
+          (int)ldc);
  }
 #else
  ABORT("Not implemented!");
--- a/src/tensors/cpu/sharp/int_gemm.h
+++ b/src/tensors/cpu/sharp/int_gemm.h
@ -56,8 +56,8 @@ void SSE_MatrixMult16(const __m128i* A,

 static inline void Quantize16(marian::Tensor out,
                              const marian::Tensor in,
-                              float clipValue) {
-  float quant_mult = pow(2.0, (float)BITS);
+                              float /*clipValue*/) {
+  float quant_mult = (float)pow(2.0, BITS);
 #ifdef __AVX512F__
  AVX_Quantize16(
      in->data(), out->data<int16_t>(), quant_mult, in->shape().elements());
@ -76,6 +76,7 @@ static inline void Quantize8(marian::Tensor out,
  AVX_Quantize8(
      in->data(), out->data<int8_t>(), quant_mult, in->shape().elements());
 #else
+  (void)out; (void)in; (void)clipValue; // explicit discard: marks parameters as used without a no-effect statement
  ABORT("8-bit is currently only AVX512");
 #endif
 }
@ -118,19 +119,19 @@ static void AddBias(marian::Tensor C, const marian::Tensor Bias) {
  }
 }

-static void ProdInt16(marian::Tensor C,
-                      const marian::Tensor A,
-                      const marian::Tensor B,
+static inline void ProdInt16(marian::Tensor C,
+                             const marian::Tensor A,
+                             const marian::Tensor B,
                      float scale) {
  ABORT_IF(scale != 1, "Scale other than 1 not supported");

  // @TODO: make this a parameter
-  float quant_mult = pow(2.0, (float)BITS);
+  float quant_mult = (float)pow(2.0, BITS);

  // If we quantize to n bits and then multiple the values together, the result
  // will be quantized to n^2 bits. So we must divide by 1.0/(n^2) to get back
  // the original value.
-  float unquant_mult = 1.0 / (quant_mult * quant_mult);
+  float unquant_mult = 1.0f / (quant_mult * quant_mult);

  float* fC = C->data();
  int num_A_rows = A->shape().elements() / A->shape()[-1];
@ -155,11 +156,11 @@ static void ProdInt16(marian::Tensor C,
 #endif
 }

-static void ProdInt8(marian::Tensor C,
-                     const marian::Tensor A,
-                     const marian::Tensor B,
-                     float scale,
-                     float clipValue) {
+static inline void ProdInt8(marian::Tensor C,
+                            const marian::Tensor A,
+                            const marian::Tensor B,
+                            float scale,
+                            float clipValue) {
 #ifdef __AVX512F__
  // This would be easy...
  ABORT_IF(scale != 1, "Scale other than 1 not supported");
@ -178,6 +179,7 @@ static void ProdInt8(marian::Tensor C,
                  num_B_rows,
                  width);
 #else
+  (void)C; (void)A; (void)B; (void)scale; (void)clipValue; // explicit discard: marks parameters as used without a no-effect statement
  ABORT("8-bit is currently only AVX512");
 #endif
 }
--- a/src/tensors/cpu/tensor_operators.cpp
+++ b/src/tensors/cpu/tensor_operators.cpp
@ -17,10 +17,10 @@ namespace cpu {
 inline float stableSigmoid(float x) {
  if(x >= 0) {
    float z = expf(-x);
-    return 1.0 / (1.0 + z);
+    return 1.0f / (1.0f + z);
  } else {
    float z = expf(x);
-    return z / (1.0 + z);
+    return z / (1.0f + z);
  }
 }

@ -228,9 +228,9 @@ void Transpose10(Tensor out, const Tensor in) {
 template <bool add>
 void TransposeGeneric(Tensor out, Tensor in, const std::vector<int>& vAxis) {
  functional::Array<int, functional::Shape::size()> permute;
-  int diff = functional::Shape::size() - vAxis.size();
-  for(size_t i = 0; i < permute.size(); ++i)
-    if((int)i < diff)
+  int diff = int(functional::Shape::size() - vAxis.size());
+  for(int i = 0; i < (int)permute.size(); ++i)  // avoid signed/unsigned comparison
+    if(i < diff)
      permute[i] = i;
    else
      permute[i] = vAxis[i - diff] + diff;
@ -483,7 +483,7 @@ void Select(Tensor out,

  for(int index = 0; index < length; ++index) {
    outShape.dims(index, dims);
-    dims[axis] = indices[dims[axis]];
+    dims[axis] = (int)indices[dims[axis]];
    int inIndex = inShape.index(dims);
    out->data()[index] = in->data()[inIndex];
  }
@ -505,7 +505,7 @@ void Insert(Tensor out,

  for(int index = 0; index < length; ++index) {
    inShape.dims(index, dims);
-    dims[axis] = indices[dims[axis]];
+    dims[axis] = (int)indices[dims[axis]];
    int outIndex = outShape.index(dims);
    out->data()[outIndex] += in->data()[index];
  }
@ -547,8 +547,8 @@ void GRUFastForward(Tensor out_, std::vector<Tensor> inputs, bool final) {
      else
        h = std::tanh(xWrow[l] + sUrow[l] * r + b[l]);

-      float out = (1.0f - z) * h + z * rowState[i];
-      rowOut[i] = m * out + (1 - m) * rowState[i];
+      float o = (1.0f - z) * h + z * rowState[i];
+      rowOut[i] = m * o + (1 - m) * rowState[i];
    }
  }
 }
@ -599,16 +599,16 @@ void GRUFastBackward(std::vector<Tensor> outputs,
      else
        h = std::tanh(rowXW[l] + rowSU[l] * r + b[l]);

-      float adj = rowAdj[i];
+      float a = rowAdj[i];

      float t = (1 - z) * (1 - h * h);

      // df/ds
      if(outState)
-        rowOutState[i] += (m * z - m + 1) * adj;
+        rowOutState[i] += (m * z - m + 1) * a;

      // df/d(xW_r) ...
-      float dfdxW_r = m * r * (1 - r) * t * adj;
+      float dfdxW_r = m * r * (1 - r) * t * a;
      if(final)
        dfdxW_r *= rowSU[l] + b[l];
      else
@ -621,7 +621,7 @@ void GRUFastBackward(std::vector<Tensor> outputs,
        outB[i] += dfdxW_r;

      // df/d(xW_z) ...
-      float dfdxW_z = m * (1 - z) * z * (rowState[i] - h) * adj;
+      float dfdxW_z = m * (1 - z) * z * (rowState[i] - h) * a;
      if(outXW)
        rowOutXW[k] += dfdxW_z;
      if(outSU)
@ -630,7 +630,7 @@ void GRUFastBackward(std::vector<Tensor> outputs,
        outB[k] += dfdxW_z;

      // df/d(xW_x) ...
-      float dfdxW_x = m * t * adj;
+      float dfdxW_x = m * t * a;
      if(outXW)
        rowOutXW[l] += dfdxW_x;
      if(outSU)
@ -671,7 +671,7 @@ void CrossEntropyPick(Tensor out_, Tensor in_, Tensor pick_) {
    }

    // cross-entropy
-    int i = pick[j];
+    int i = (int)pick[j];
    // This appears to be safe i.e. that i >= 0 && i < cols is known
    out[j] = std::log(sum) - sp[i] + max;
  }
@ -960,7 +960,7 @@ void Shift(Tensor out_,
           float padValue,
           bool invert) {
  int offset = 0;
-  for(size_t i = 0; i < shift.size(); ++i)
+  for(int i = 0; i < (int)shift.size(); ++i)  // avoid signed/unsigned comparison
    offset += in_->shape().stride(i) * shift[i];

  if(invert)
@ -983,7 +983,7 @@ void Shift(Tensor out_,

 void ShiftGrad(Tensor out_, Tensor in_, marian::Shape shift, bool invert) {
  int offset = 0;
-  for(size_t i = 0; i < shift.size(); ++i)
+  for(int i = 0; i < (int)shift.size(); ++i)  // avoid signed/unsigned comparison
    offset += in_->shape().stride(i) * shift[i];

  if(invert)
@ -1004,7 +1004,7 @@ void ShiftGrad(Tensor out_, Tensor in_, marian::Shape shift, bool invert) {
 void SetSparse(float* out,
               const std::vector<size_t>& indices,
               const std::vector<float>& values) {
-  int length = indices.size();
+  int length = (int)indices.size();
  for(int index = 0; index < length; ++index) {
    out[indices[index]] = values[index];
  }
@ -1112,15 +1112,15 @@ void LSTMCellBackward(std::vector<Tensor> outputs,
      int l = i + 2 * cols;
      float gc = std::tanh(xWrow[l] + sUrow[l] + b[l]);

-      float adj = rowAdj[i];
+      float a = rowAdj[i];

      // dc/dx_{t-1}
      if(outCell) {
-        rowOutCell[i] += (m * gf - m + 1) * adj;
+        rowOutCell[i] += (m * gf - m + 1) * a;
      }

      // dc/d(b_f) = dc/d(xW_f) ...
-      float dcdxf = m * rowCell[i] * gf * (1 - gf) * adj;
+      float dcdxf = m * rowCell[i] * gf * (1 - gf) * a;
      if(outXW) {
        rowOutXW[i] += dcdxf;
      }
@ -1132,7 +1132,7 @@ void LSTMCellBackward(std::vector<Tensor> outputs,
      }

      // dc/d(b_i) ...
-      float dcdb_i = m * gc * gi * (1 - gi) * adj;
+      float dcdb_i = m * gc * gi * (1 - gi) * a;
      if(outXW) {
        rowOutXW[k] += dcdb_i;
      }
@ -1144,7 +1144,7 @@ void LSTMCellBackward(std::vector<Tensor> outputs,
      }

      // dc/d(b_c) ...
-      float dcdxc = m * gi * (1 - gc * gc) * adj;
+      float dcdxc = m * gi * (1 - gc * gc) * a;
      if(outXW) {
        rowOutXW[l] += dcdxc;
      }
@ -1193,15 +1193,15 @@ void LSTMOutputBackward(std::vector<Tensor> outputs,

      float t = std::tanh(rowCell[i]);

-      float adj = rowAdj[i];
+      float a = rowAdj[i];

      // dc/dc_{t-1}
      if(outCell) {
-        rowOutCell[i] += go * (1 - t * t) * adj;
+        rowOutCell[i] += go * (1 - t * t) * a;
      }

      // dc/d(b_o) = dc/d(xW_f) ...
-      float dcdxo = t * go * (1 - go) * adj;
+      float dcdxo = t * go * (1 - go) * a;
      if(outXW) {
        rowOutXW[k] += dcdxo;
      }
@ -1240,30 +1240,30 @@ void HighwayForward(Tensor out,
  }
 }

-void HighwayBackward(Tensor out1,
-                     Tensor out2,
-                     Tensor outt,
-                     const Tensor in1,
-                     const Tensor in2,
-                     const Tensor t,
-                     const Tensor adj) {
+void HighwayBackward(Tensor /*out1*/,
+                     Tensor /*out2*/,
+                     Tensor /*outt*/,
+                     const Tensor /*in1*/,
+                     const Tensor /*in2*/,
+                     const Tensor /*t*/,
+                     const Tensor /*adj*/) {
  ABORT("Not implemented!");
 }

-void PoolingWithMaskingForward(Tensor out,
-                               Tensor in,
-                               Tensor mask,
-                               int width,
-                               bool isEven) {
+void PoolingWithMaskingForward(Tensor /*out*/,
+                               Tensor /*in*/,
+                               Tensor /*mask*/,
+                               int /*width*/,
+                               bool /*isEven*/) {
  ABORT("Not implemented!");
 }

-void PoolingWithMaskingBackward(Tensor adj,
-                                Tensor adjIn,
-                                Tensor in,
-                                Tensor mask,
-                                int width,
-                                bool isEven) {
+void PoolingWithMaskingBackward(Tensor /*adj*/,
+                                Tensor /*adjIn*/,
+                                Tensor /*in*/,
+                                Tensor /*mask*/,
+                                int /*width*/,
+                                bool /*isEven*/) {
  ABORT("Not implemented!");
 }
 }  // namespace cpu
--- a/src/tensors/device.h
+++ b/src/tensors/device.h
@ -16,7 +16,7 @@ protected:
  size_t alignment_;

  size_t align(size_t size) {
-    return ceil(size / (float)alignment_) * alignment_;
+    return size_t(ceil(size / (float)alignment_) * alignment_);
  }

 public:
--- a/src/tensors/tensor.h
+++ b/src/tensors/tensor.h
@ -91,7 +91,7 @@ public:
             request<float>(),
             type_);

-    float temp;
+    float temp = 0; // (initialize to keep compiler happy)
    if(backend_->getDeviceId().type == DeviceType::cpu) {
      std::copy(data() + i, data() + i + 1, &temp);
    }
@ -183,16 +183,16 @@ public:
  void set(T value) {
    if(!matchType<T>(type_)) {
      switch(type_) {
-        case Type::float32: set<float>(value); break;
-        case Type::float64: set<double>(value); break;
-        case Type::int8: set<int8_t>(value); break;
-        case Type::int16: set<int16_t>(value); break;
-        case Type::int32: set<int32_t>(value); break;
-        case Type::int64: set<int64_t>(value); break;
-        case Type::uint8: set<uint8_t>(value); break;
-        case Type::uint16: set<uint16_t>(value); break;
-        case Type::uint32: set<uint32_t>(value); break;
-        case Type::uint64: set<uint64_t>(value); break;
+        case Type::float32: set<float   >((float   )value); break;
+        case Type::float64: set<double  >((double  )value); break;
+        case Type::int8:    set<int8_t  >((int8_t  )value); break;
+        case Type::int16:   set<int16_t >((int16_t )value); break;
+        case Type::int32:   set<int32_t >((int32_t )value); break;
+        case Type::int64:   set<int64_t >((int64_t )value); break;
+        case Type::uint8:   set<uint8_t >((uint8_t )value); break;
+        case Type::uint16:  set<uint16_t>((uint16_t)value); break;
+        case Type::uint32:  set<uint32_t>((uint32_t)value); break;
+        case Type::uint64:  set<uint64_t>((uint64_t)value); break;
        default:
          ABORT(
              "Requested type ({}) cannot be converted to underlying type ({})",
@ -273,19 +273,19 @@ public:
    else
      strm << std::fixed << std::setprecision(0) << std::setfill(' ');

-    for(size_t i = 0; i < values.size(); ++i) {
+    for(int i = 0; i < values.size(); ++i) {
      std::vector<int> dims;
      shape().dims(i, dims);

      bool disp = true;
-      for(size_t j = 0; j < dims.size(); ++j)
+      for(int j = 0; j < dims.size(); ++j)
        disp = disp && (dims[j] < dispCols || dims[j] >= shape()[j] - dispCols);

      if(disp) {
        if(dims.back() == 0) {
          bool par = true;
          std::vector<std::string> p;
-          for(int j = dims.size() - 1; j >= 0; --j) {
+          for(int j = (int)dims.size() - 1; j >= 0; --j) {
            if(dims[j] != 0)
              par = false;

@ -307,7 +307,7 @@ public:
        strm << " ";

        if(dims.back() + 1 == shape().back()) {
-          for(int j = dims.size() - 1; j >= 0; --j) {
+          for(int j = (int)dims.size() - 1; j >= 0; --j) {
            if(dims[j] + 1 != shape()[j])
              break;
            strm << "]";
@ -316,7 +316,7 @@ public:
        }

        bool prev = true;
-        for(int j = dims.size() - 1; j >= 0; --j) {
+        for(int j = (int)dims.size() - 1; j >= 0; --j) {
          if(j < (int)dims.size() - 1)
            prev = prev && dims[j + 1] + 1 == shape()[j + 1];
          if(prev && dims[j] + 1 == dispCols && shape()[j] > 2 * dispCols) {
--- a/src/tensors/tensor_allocator.h
+++ b/src/tensors/tensor_allocator.h
@ -35,7 +35,7 @@ public:
  }

  void reserve(size_t bytes = 0) {
-    float mult = bytes / GROW + 1;
+    auto mult = bytes / GROW + 1;
    LOG(info,
        "[memory] Extending reserved space to {} MB (device {})",
        mult * CHUNK,
@ -78,8 +78,8 @@ public:

  Tensor asTensor() {
    auto mem = allocator_->memory();
-    int size = mem->size() / sizeof(float);
-    return Tensor(new TensorBase(mem, {1, size}, backend_));
+    auto size = mem->size() / sizeof(float);
+    return Tensor(new TensorBase(mem, {1, (int)size}, backend_));
  }

  size_t size() { return allocator_->size() / sizeof(float); }
--- a/src/training/communicator.cpp
+++ b/src/training/communicator.cpp
@ -7,7 +7,7 @@ namespace marian {
 #ifndef CUDA_FOUND
 Ptr<Communicator> createCommunicator(
    const std::vector<Ptr<ExpressionGraph>>& graphs,
-    bool noNccl) {
+    bool /*noNccl*/) {
  return New<DefaultCommunicator>(graphs);
 }
 #endif
--- a/src/training/communicator.h
+++ b/src/training/communicator.h
@ -17,8 +17,8 @@ public:
  virtual ~Communicator() {}

  virtual void foreach(const std::function<void(size_t, int)>& func) {
-    int totalSize = graphs_[0]->params()->vals()->size();
-    int shardSize = ceil(totalSize / (float)graphs_.size());
+    int totalSize = (int)graphs_[0]->params()->vals()->size();
+    int shardSize = (int)ceil(totalSize / (float)graphs_.size());

    int pos = 0;
    std::vector<std::thread> group;
@ -50,8 +50,8 @@ private:

  void init() {
    if(tmpTensors_.size() == 0) {
-      int totalSize = graphs_[0]->params()->vals()->size();
-      int shardSize = ceil(totalSize / (float)graphs_.size());
+      int totalSize = (int)graphs_[0]->params()->vals()->size();
+      int shardSize = (int)ceil(totalSize / (float)graphs_.size());

      int pos = 0;
      for(auto graph : graphs_) {
@ -83,8 +83,8 @@ public:
  void scatterReduce() override {
    init();

-    int totalSize = graphs_[0]->params()->vals()->size();
-    int shardSize = ceil(totalSize / (float)graphs_.size());
+    int totalSize = (int)graphs_[0]->params()->vals()->size();
+    int shardSize = (int)ceil(totalSize / (float)graphs_.size());

    // Gather gradients from different devices into current gradient shards
    auto scatter = [this, shardSize](size_t idx, int pos) {
@ -107,8 +107,8 @@ public:
  }

  void allGather() override {
-    int totalSize = graphs_[0]->params()->vals()->size();
-    int shardSize = ceil(totalSize / (float)graphs_.size());
+    int totalSize = (int)graphs_[0]->params()->vals()->size();
+    int shardSize = (int)ceil(totalSize / (float)graphs_.size());

    // Update all graphs with parameter shard
    auto gather = [this, shardSize](size_t idx, int pos) {
@ -133,7 +133,7 @@ public:
    auto copy = [this, params](size_t idx, int pos) {
      // copy parameter shard to each graph
      auto subParam
-          = graphs_[idx]->params()->vals()->subtensor(pos, params[idx]->size());
+          = graphs_[idx]->params()->vals()->subtensor(pos, (int)params[idx]->size());
      params[idx]->copyFrom(subParam);
    };

@ -147,7 +147,7 @@ public:
      // copy parameter shard to each graph
      for(auto graph : graphs_) {
        auto subParam
-            = graph->params()->vals()->subtensor(pos, params[idx]->size());
+            = graph->params()->vals()->subtensor(pos, (int)params[idx]->size());
        subParam->copyFrom(params[idx]);
      }
    };
@ -162,17 +162,17 @@ public:
      // copy parameter shard to each graph, apart from last graph
      for(int i = 0; i < (int)graphs_.size() - 1; ++i) {
        auto subParam
-            = graphs_[i]->params()->vals()->subtensor(pos, params[idx]->size());
+            = graphs_[i]->params()->vals()->subtensor(pos, (int)params[idx]->size());
        subParam->copyFrom(params[idx]);
      }

      // back-up shard from last graph
-      auto subParamLast = graphs_.back()->params()->vals()->subtensor(
-          pos, params[idx]->size());
+      auto subParamLast =
+          graphs_.back()->params()->vals()->subtensor(pos, (int)params[idx]->size());
      params[idx]->copyFrom(subParamLast);

      auto subParamFirst
-          = graphs_[0]->params()->vals()->subtensor(pos, params[idx]->size());
+          = graphs_[0]->params()->vals()->subtensor(pos, (int)params[idx]->size());
      subParamLast->copyFrom(subParamFirst);
    };
    // execute for each shard
--- a/src/training/gradient_dropping/sparse_tensor.h
+++ b/src/training/gradient_dropping/sparse_tensor.h
@ -102,6 +102,8 @@ public:
      gpu::copy(backend_, ndata, ndata + nsize, data());
      gpu::copy(backend_, nindices, nindices + nsize, indices());
    }
+#else
+    ndata; nindices; // (unused)
 #endif
  }

@ -137,6 +139,8 @@ public:
    else {
      gpu::scatterAdd(t, data(), indices(), size(), offset);
    }
+#else
+    t; offset; // (unused)
 #endif
  }

@ -149,6 +153,8 @@ public:
    else {
      gpu::scatterUpdate(t, data(), indices(), size(), offset);
    }
+#else
+    t; offset; // (unused)
 #endif
  }

@ -161,6 +167,8 @@ public:
    else {
      gpu::gather(t, data(), indices(), size(), offset);
    }
+#else
+    t; offset; // (unused)
 #endif
  }

--- a/src/training/graph_group.h
+++ b/src/training/graph_group.h
@ -63,7 +63,7 @@ public:
    size_t step = options_->get<size_t>("mini-batch-fit-step");

    size_t maxLength = options_->get<size_t>("max-length");
-    maxLength = std::ceil(maxLength / (float)step) * step;
+    maxLength = (size_t)(std::ceil(maxLength / (float)step) * step);

    // @TODO: ugly
    auto toptions = New<Options>();
@ -85,7 +85,7 @@ public:
      size_t end = maxBatch;

      std::vector<size_t> lengths(numFiles, i);
-      bool fits = true;
+      fits = true;

      do {
        size_t current = (start + end) / 2;
--- a/src/training/graph_group_async.cpp
+++ b/src/training/graph_group_async.cpp
@ -36,7 +36,7 @@ void AsyncGraphGroup::setScheduler(Ptr<Scheduler> scheduler) {

 void AsyncGraphGroup::fetchParams(Tensor oldParams,
                                  const std::vector<Tensor>& params,
-                                  int device_id) {
+                                  int /*device_id*/) {
  // @TODO read guard on parameters
  int pos = 0;

@ -46,7 +46,7 @@ void AsyncGraphGroup::fetchParams(Tensor oldParams,
        [&](int idx, int pos) {
          // individual mutex per-shard
          std::lock_guard<std::mutex> guard(shardSync_[idx]);
-          oldParams->subtensor(pos, params[idx]->size())->copyFrom(params[idx]);
+          oldParams->subtensor((int)pos, (int)params[idx]->size())->copyFrom(params[idx]);
        },
        idx,
        pos));
@ -60,7 +60,7 @@ void AsyncGraphGroup::fetchParams(Tensor oldParams,

 void AsyncGraphGroup::pushGradients(Tensor newGrads,
                                    size_t batch_words,
-                                    int device_id) {
+                                    int /*device_id*/) {
  // add instead of copy?
  std::vector<std::thread> threads;
  int pos = 0;
@ -69,7 +69,7 @@ void AsyncGraphGroup::pushGradients(Tensor newGrads,
        [&](int idx, int pos) {
          // individual mutex per-shard
          std::lock_guard<std::mutex> guard(shardSync_[idx]);
-          grads_[idx]->copyFrom(newGrads->subtensor(pos, grads_[idx]->size()));
+          grads_[idx]->copyFrom(newGrads->subtensor(pos, (int)grads_[idx]->size()));

          if(scaleLearningRate_) {
            shardOpt_[idx]->update(
@ -105,8 +105,8 @@ void AsyncGraphGroup::init(Ptr<data::Batch> batch) {
  }

  if(params_.empty()) {
-    int totalSize = graphs_[0]->params()->vals()->size();
-    shardSize_ = ceil(totalSize / (float)devices_.size());
+    int totalSize = (int)graphs_[0]->params()->vals()->size();
+    shardSize_ = (int)ceil(totalSize / (float)devices_.size());

    int pos = 0;
    // parameter sharding
@ -128,7 +128,7 @@ void AsyncGraphGroup::init(Ptr<data::Batch> batch) {
    }
  }
  if(grads_.empty()) {
-    int totalSize = graphs_[0]->params()->vals()->size();
+    int totalSize = (int)graphs_[0]->params()->vals()->size();

    for(auto graph : graphs_) {
      int __size__ = std::min(shardSize_, totalSize);
@ -154,7 +154,7 @@ void AsyncGraphGroup::init(Ptr<data::Batch> batch) {
      graphAvg->forward();
    }

-    int totalSize = graphs_[0]->params()->vals()->size();
+    int totalSize = (int)graphs_[0]->params()->vals()->size();

    int i = 0;
    for(auto graph : graphs_) {
@ -203,7 +203,7 @@ void AsyncGraphGroup::execute(Ptr<data::Batch> batch) {

    if(!graph) {
      std::lock_guard<std::mutex> lock(sync_);
-      t_id = i;
+      t_id = (int)i;
      graph = graphs_[i];
      builder = builders_[i++];
    }
--- a/src/training/graph_group_async_drop.cpp
+++ b/src/training/graph_group_async_drop.cpp
@ -31,7 +31,7 @@ void AsyncGraphGroupDrop::fetchParams(Tensor oldParams,
          sparseShard->gather(params[idx]);
          sparseGrad->copyFrom(sparseShard);
          sparseGrad->scatterUpdate(
-              oldParams->subtensor(pos, params[idx]->size()));
+              oldParams->subtensor((int)pos, (int)params[idx]->size()));
        },
        idx,
        pos));
@ -59,7 +59,7 @@ void AsyncGraphGroupDrop::pushGradients(Tensor newGrads,
          auto dropper = droppers_[device_id][idx];
          auto sparseGrad = sparseGrads_[device_id][idx];
          auto sparseShard = sparseShards_[device_id][idx];
-          auto tensor = newGrads->subtensor(pos, grads_[idx]->size());
+          auto tensor = newGrads->subtensor((int)pos, (int)grads_[idx]->size());
          // individual mutex per-shard
          std::lock_guard<std::mutex> guard(shardSync_[idx]);

@ -107,8 +107,8 @@ void AsyncGraphGroupDrop::init(Ptr<data::Batch> batch) {
      fetch_ready.push_back(false);

      // Size of the sparse tensor
-      int totalSize = graphs_[0]->params()->vals()->size();
-      int sparseCap = totalSize * 1.2 * (1.0 - droping_rate);
+      int totalSize = (int)graphs_[0]->params()->vals()->size();
+      int sparseCap = (int)(totalSize * 1.2 * (1.0 - droping_rate));

      // prepare droppers
      std::vector<GradientDrop> tmpDropper;
@ -120,13 +120,13 @@ void AsyncGraphGroupDrop::init(Ptr<data::Batch> batch) {
      std::vector<SparseTensor> tmp;
      for(int j = 0; j < devices_.size(); j++)
        tmp.push_back(SparseTensor(new SparseTensorBase(
-            sparseCap / devices_.size(), graphs_[i]->getBackend())));
+            sparseCap / (int)devices_.size(), graphs_[i]->getBackend())));
      sparseGrads_.push_back(tmp);

      std::vector<SparseTensor> tmp2;
      for(int j = 0; j < devices_.size(); j++)
        tmp2.push_back(SparseTensor(new SparseTensorBase(
-            sparseCap / devices_.size(), graphs_[j]->getBackend())));
+            sparseCap / (int)devices_.size(), graphs_[j]->getBackend())));
      sparseShards_.push_back(tmp2);
    }
    drop_first = false;
--- a/src/training/graph_group_multinode.cpp
+++ b/src/training/graph_group_multinode.cpp
@ -60,8 +60,8 @@ void MultiNodeGraphGroup::init(Ptr<data::Batch> batch) {
    for(int i = 0; i < mpi_comm_world_size_; i++) {
      // Shard buffers across GPUs
      auto backend = clientGraphs_[i % devices_.size()]->getBackend();
-      Tensor accGrad = newTensor(nodeSizes_[i], backend);
-      Tensor accGradBuff = newTensor(nodeSizes_[i], backend);
+      Tensor accGrad     = newTensor((int)nodeSizes_[i], backend);
+      Tensor accGradBuff = newTensor((int)nodeSizes_[i], backend);
      accGradients.push_back(accGrad);
      accGradientBuffer.push_back(accGradBuff);
    }
@ -113,7 +113,7 @@ void MultiNodeGraphGroup::runBatchThroughClientGraphs(Ptr<data::Batch> batch) {
 */
 void MultiNodeGraphGroup::calculateNodeSizes() {
  size_t modelSize = clientGraphs_[0]->params()->vals()->size();
-  size_t nodeSize = ceilf(((float)modelSize) / mpi_comm_world_size_);
+  size_t nodeSize = (size_t)ceilf(((float)modelSize) / mpi_comm_world_size_);
  for(int node = 0; node < mpi_comm_world_size_; node++) {
    size_t remainingModelSize = modelSize - (nodeSize * node);
    // Takes care of edge case where last node is smaller than the others
@ -166,11 +166,11 @@ void MultiNodeGraphGroup::initClientCommOverlapGpuTensors() {
  for(size_t client = 0; client < devices_.size(); client++) {
    // Communication overlap buffer (for grads + params)
    Tensor commOverlapBuffer
-        = newTensor(modelSize, clientGraphs_[client]->getBackend());
+        = newTensor((int)modelSize, clientGraphs_[client]->getBackend());
    commOverlapBuffer->copyFrom(clientGraphs_[0]->params()->vals());
    clientCommOverlapBuffersGPU_.push_back(commOverlapBuffer);
    // Gradients local sum buffer
-    Tensor sumGrads = newTensor(modelSize, clientGraphs_[client]->getBackend());
+    Tensor sumGrads = newTensor((int)modelSize, clientGraphs_[client]->getBackend());
    sumGrads->set(0);
    clientSummedGradsGPU.push_back(sumGrads);
    // Local optimizer to apply summed gradients
@ -207,7 +207,7 @@ void MultiNodeGraphGroup::setupServerShards() {
 */
 void MultiNodeGraphGroup::calculateShardSizes() {
  size_t nodeSize = nodeSizes_[mpi_my_rank_];
-  size_t shardSize = ceilf(((float)nodeSize) / devices_.size());
+  size_t shardSize = (size_t)ceilf(((float)nodeSize) / devices_.size());
  for(size_t shard = 0; shard < devices_.size(); shard++) {
    size_t remainingNodeSize = nodeSize - (shardSize * shard);
    // Takes care of edge case where last shard is smaller than the others
@ -226,12 +226,12 @@ void MultiNodeGraphGroup::initShardGpuTensors() {
  }
  for(size_t shard = 0; shard < devices_.size(); shard++) {
    Tensor gpuParams
-        = newTensor(shardSizes_[shard], clientGraphs_[shard]->getBackend());
+        = newTensor((int)shardSizes_[shard], clientGraphs_[shard]->getBackend());
    gpuParams->copyFrom(clientGraphs_[0]->params()->vals()->subtensor(
-        offset, shardSizes_[shard]));
+        (int)offset, (int)shardSizes_[shard]));
    shardParams_.push_back(gpuParams);
    shardGrads_.push_back(
-        newTensor(shardSizes_[shard], clientGraphs_[shard]->getBackend()));
+        newTensor((int)shardSizes_[shard], clientGraphs_[shard]->getBackend()));
    offset += shardSizes_[shard];
  }
 }
@ -519,6 +519,8 @@ void MultiNodeGraphGroup::synchronizeWithServerShards(Tensor newGrads,

    offset += nodeSize;
  }
+#else
+  newGrads; oldParams; gpu; batchWords; // (unused)
 #endif
 }

@ -578,7 +580,7 @@ void MultiNodeGraphGroup::execute(Ptr<data::Batch> batch) {
    if(!clientCommOverlap) {
      synchronizeWithServerShards(graph->params()->grads(),
                                  graph->params()->vals(),
-                                  my_id,
+                                  (int)my_id,
                                  batch->wordsTrg());
    }

--- a/src/training/graph_group_multinode.h
+++ b/src/training/graph_group_multinode.h
@ -388,7 +388,7 @@ protected:
    numberClientsOfNodes_ = std::vector<int>(mpi_comm_world_size_, 0);
    while(index < deviceConfig.size()) {
      if(numberClientsOfNodes_[node] == 0) {
-        numberClientsOfNodes_[node] = deviceConfig[index];
+        numberClientsOfNodes_[node] = (int)deviceConfig[index];
        nClientsSeen = 0;
      } else if(nClientsSeen < numberClientsOfNodes_[node]) {
        if(node == mpi_my_rank_) {
--- a/src/training/graph_group_multinode_sync.cpp
+++ b/src/training/graph_group_multinode_sync.cpp
@ -45,7 +45,7 @@ Tensor MultiNodeGraphGroupSync::newTensor(int size, Ptr<Backend> backend) {
 void MultiNodeGraphGroupSync::init(Ptr<data::Batch> batch) {
  // Setup clients and shards
  setupClients(batch);
-  int network_size = clientGraphs_[0]->params()->vals()->size();
+  int network_size = (int)clientGraphs_[0]->params()->vals()->size();
  LOG(info, "model size = {} float params", network_size);
  if(movingAvg_)
    paramsAvg_ = newTensor(network_size, clientGraphs_.back()->getBackend());
--- a/src/training/graph_group_multinode_sync.h
+++ b/src/training/graph_group_multinode_sync.h
@ -166,7 +166,7 @@ protected:
    numberClientsOfNodes_ = std::vector<int>(mpi_comm_world_size_, 0);
    while(index < deviceConfig.size()) {
      if(numberClientsOfNodes_[node] == 0) {
-        numberClientsOfNodes_[node] = deviceConfig[index];
+        numberClientsOfNodes_[node] = (int)deviceConfig[index];
        nClientsSeen = 0;
      } else if(nClientsSeen < numberClientsOfNodes_[node]) {
        if(node == mpi_my_rank_) {
--- a/src/training/graph_group_sync.cpp
+++ b/src/training/graph_group_sync.cpp
@ -64,8 +64,8 @@ void SyncGraphGroup::initializeAvg() {
    graphAvg->forward();
  }

-  int totalSize = graphs_[0]->params()->vals()->size();
-  shardSize_ = ceil(totalSize / (float)devices_.size());
+  int totalSize = (int)graphs_[0]->params()->vals()->size();
+  shardSize_ = (int)ceil(totalSize / (float)devices_.size());

  int pos = 0;
  for(auto graph : graphs_) {
@ -99,7 +99,7 @@ void SyncGraphGroup::execute(Ptr<data::Batch> batch) {
  size_t devs = devices_.size();
  auto batches = batch->split(delay_ * devs);

-  float div = batches.size();  // no. of batches
+  float div = (float)batches.size();  // no. of batches
  // do not average gradients if cost type is sum.
  if(options_->get<std::string>("cost-type") == "ce-sum")
    div = 1;
@ -131,7 +131,7 @@ void SyncGraphGroup::execute(Ptr<data::Batch> batch) {
    }

    // Execute single forward/backward step
-    auto forwardBackward = [this, &costs, curBatches, t](size_t idx, int pos) {
+    auto forwardBackward = [this, &costs, curBatches, t](size_t idx, int /*pos*/) {
      auto graph = graphs_[idx];
      auto batch = curBatches[idx];

@ -152,8 +152,8 @@ void SyncGraphGroup::execute(Ptr<data::Batch> batch) {

    // Update parameter shard with gradient shard
    auto update = [this, div](size_t idx, int pos) {
-      int totalSize = graphs_[0]->params()->vals()->size();
-      int shardSize = ceil(totalSize / (float)devices_.size());
+      int totalSize = (int)graphs_[0]->params()->vals()->size();
+      int shardSize = (int)ceil(totalSize / (float)devices_.size());

      int size = std::min(totalSize - pos, shardSize);

--- a/src/training/scheduler.h
+++ b/src/training/scheduler.h
@ -20,12 +20,12 @@ private:
  float getLearningRate(TrainingState& state) {
    float baselr = options_->get<float>("learn-rate");

-    float bno = state.batches - state.warmupStart;
+    auto bno = state.batches - state.warmupStart;

    size_t warmup = options_->get<size_t>("lr-warmup");
    float mult1 = 1.f;
    if(warmup > 0) {
-      mult1 = std::min(1.f, bno / (float)warmup);
+      mult1 = std::min(1.f, (float)bno / (float)warmup);
    }

    size_t decayGoogle = options_->get<size_t>("lr-decay-inv-sqrt");
@ -164,8 +164,8 @@ public:
  void update(float cost, const std::vector<Ptr<data::Batch>>& batches) {
    state_->validated = false;

-    auto batchSize = 0;    // number of sentences in batch
-    auto batchLabels = 0;  // number of target words in batch
+    size_t batchSize = 0;    // number of sentences in batch
+    size_t batchLabels = 0;  // number of target words in batch

    for(const auto& batch : batches) {
      batchSize += batch->size();
@ -303,7 +303,7 @@ public:
  }

  void actAfterEpoch(TrainingState& state) override {
-    float factor = options_->get<double>("lr-decay");
+    float factor = (float)options_->get<double>("lr-decay"); // @TODO: <float>?

    float baselr = getLearningRate(state);
    state.eta = baselr * state.factor;
@ -355,7 +355,7 @@ public:
  }

  void actAfterBatches(TrainingState& state) override {
-    float factor = options_->get<double>("lr-decay");
+    float factor = (float)options_->get<double>("lr-decay"); // @TODO: <float>?
    state.reset = false;

    float baselr = getLearningRate(state);
@ -365,7 +365,7 @@ public:
      if("batches" == options_->get<std::string>("lr-decay-strategy")) {
        size_t start
            = options_->get<std::vector<size_t>>("lr-decay-start").front();
-        int freq = options_->get<size_t>("lr-decay-freq");
+        size_t freq = options_->get<size_t>("lr-decay-freq");

        if(start > 0 && freq > 0 && state.batches >= start
           && ((state.batches - start) % freq == 0)) {
@ -403,7 +403,7 @@ public:
  }

  void actAfterStalled(TrainingState& state) override {
-    float factor = options_->get<double>("lr-decay");
+    float factor = (float)options_->get<double>("lr-decay"); // @TODO: <float>?
    state.reset = false;

    float baselr = getLearningRate(state);
@ -411,7 +411,7 @@ public:

    if(factor > 0.0) {
      if(options_->get<std::string>("lr-decay-strategy") == "stalled") {
-        int startStalled
+        size_t startStalled
            = options_->get<std::vector<size_t>>("lr-decay-start").front();
        if(startStalled && state.stalled && state.stalled % startStalled == 0) {
          state.factor *= factor;
--- a/src/training/training_state.h
+++ b/src/training/training_state.h
@ -11,11 +11,11 @@ class TrainingState;

 class TrainingObserver {
 public:
-  virtual void init(TrainingState& state) {}
-  virtual void actAfterEpoch(TrainingState& state) {}
-  virtual void actAfterBatches(TrainingState& state) {}
-  virtual void actAfterStalled(TrainingState& state) {}
-  virtual void actAfterLoaded(TrainingState& state) {}
+  virtual void init(TrainingState&) {}
+  virtual void actAfterEpoch(TrainingState&) {}
+  virtual void actAfterBatches(TrainingState&) {}
+  virtual void actAfterStalled(TrainingState&) {}
+  virtual void actAfterLoaded(TrainingState&) {}
 };

 class TrainingState {
--- a/src/training/validator.h
+++ b/src/training/validator.h
@ -209,7 +209,7 @@ public:

    auto command = options_->get<std::string>("valid-script-path");
    auto valStr = utils::Exec(command);
-    float val = std::atof(valStr.c_str());
+    float val = (float)std::atof(valStr.c_str());
    updateStalled(graphs, val);

    return val;
@ -219,8 +219,8 @@ public:

 protected:
  virtual float validateBG(
-      const std::vector<Ptr<ExpressionGraph>>& graphs,
-      Ptr<data::BatchGenerator<data::Corpus>> batchGenerator) override {
+      const std::vector<Ptr<ExpressionGraph>>& /*graphs*/,
+      Ptr<data::BatchGenerator<data::Corpus>> /*batchGenerator*/) override {
    return 0;
  }
 };
@ -332,7 +332,7 @@ public:
            std::stringstream best1;
            std::stringstream bestn;
            printer->print(history, best1, bestn);
-            collector->Write(history->GetLineNum(),
+            collector->Write((long)history->GetLineNum(),
                             best1.str(),
                             bestn.str(),
                             options_->get<bool>("n-best"));
@ -357,7 +357,7 @@ public:
      auto command
          = options_->get<std::string>("valid-script-path") + " " + fileName;
      auto valStr = utils::Exec(command);
-      val = std::atof(valStr.c_str());
+      val = (float)std::atof(valStr.c_str());
      updateStalled(graphs, val);
    }

@ -370,8 +370,8 @@ protected:
  bool quiet_{false};

  virtual float validateBG(
-      const std::vector<Ptr<ExpressionGraph>>& graphs,
-      Ptr<data::BatchGenerator<data::Corpus>> batchGenerator) override {
+      const std::vector<Ptr<ExpressionGraph>>& /*graphs*/,
+      Ptr<data::BatchGenerator<data::Corpus>> /*batchGenerator*/) override {
    return 0;
  }
 };
@ -562,8 +562,8 @@ protected:
  }

  virtual float validateBG(
-      const std::vector<Ptr<ExpressionGraph>>& graphs,
-      Ptr<data::BatchGenerator<data::Corpus>> batchGenerator) override {
+      const std::vector<Ptr<ExpressionGraph>>& /*graphs*/,
+      Ptr<data::BatchGenerator<data::Corpus>> /*batchGenerator*/) override {
    return 0;
  }
 };
--- a/src/translator/beam_search.h
+++ b/src/translator/beam_search.h
@ -15,8 +15,8 @@ private:
  Ptr<Options> options_;
  std::vector<Ptr<Scorer>> scorers_;
  size_t beamSize_;
-  Word trgEosId_ = -1;
-  Word trgUnkId_ = -1;
+  Word trgEosId_ = (Word)-1;
+  Word trgUnkId_ = (Word)-1;

 public:
  BeamSearch(Ptr<Options> options,
@ -49,8 +49,8 @@ public:
    for(size_t i = 0; i < keys.size(); ++i) {
      // Keys contains indices to vocab items in the entire beam.
      // Values can be between 0 and beamSize * vocabSize.
-      int embIdx = keys[i] % vocabSize;
-      int beamIdx = i / beamSize;
+      size_t embIdx = keys[i] % vocabSize;
+      auto beamIdx = i / beamSize;

      // Retrieve short list for final softmax (based on words aligned
      // to source sentences). If short list has been set, map the indices
@ -63,15 +63,15 @@ public:
        auto& beam = beams[beamIdx];
        auto& newBeam = newBeams[beamIdx];

-        int hypIdx = keys[i] / vocabSize;
+        size_t hypIdx = keys[i] / vocabSize;
        float pathScore = pathScores[i];

-        int hypIdxTrans
+        size_t hypIdxTrans
            = (hypIdx / beamSize) + (hypIdx % beamSize) * beams.size();
        if(first)
          hypIdxTrans = hypIdx;

-        int beamHypIdx = hypIdx % beamSize;
+        size_t beamHypIdx = hypIdx % beamSize;
        if(beamHypIdx >= (int)beam.size())
          beamHypIdx = beamHypIdx % beam.size();

@ -85,7 +85,7 @@ public:
          std::vector<float> breakDown(states.size(), 0);
          beam[beamHypIdx]->GetScoreBreakdown().resize(states.size(), 0);
          for(size_t j = 0; j < states.size(); ++j) {
-            int key = embIdx + hypIdxTrans * vocabSize;
+            size_t key = embIdx + hypIdxTrans * vocabSize;
            breakDown[j] = states[j]->breakDown(key)
                           + beam[beamHypIdx]->GetScoreBreakdown()[j];
          }
@ -95,7 +95,7 @@ public:
        // Set alignments
        if(!align.empty()) {
          hyp->SetAlignment(
-              getAlignmentsForHypothesis(align, batch, beamHypIdx, beamIdx));
+              getAlignmentsForHypothesis(align, batch, (int)beamHypIdx, (int)beamIdx));
        }

        newBeam.push_back(hyp);
@ -156,7 +156,7 @@ public:
  
  // main decoding function
  Histories search(Ptr<ExpressionGraph> graph, Ptr<data::CorpusBatch> batch) {
-    int dimBatch = batch->size();
+    int dimBatch = (int)batch->size();

    Histories histories;
    for(int i = 0; i < dimBatch; ++i) {
@ -212,7 +212,7 @@ public:
      } else {
        std::vector<float> beamScores;

-        int dimBatch = batch->size();
+        dimBatch = (int)batch->size();

        for(size_t i = 0; i < localBeamSize; ++i) {
          for(size_t j = 0; j < beams.size(); ++j) { // loop over batch entries (active sentences)
@ -240,7 +240,7 @@ public:

      for(size_t i = 0; i < scorers_.size(); ++i) {
        states[i] = scorers_[i]->step(
-            graph, states[i], hypIndices, embIndices, dimBatch, localBeamSize);
+            graph, states[i], hypIndices, embIndices, dimBatch, (int)localBeamSize);

        if(scorers_[i]->getWeight() != 1.f)
          pathScores = pathScores + scorers_[i]->getWeight() * states[i]->getLogProbs();
--- a/src/translator/helpers.cpp
+++ b/src/translator/helpers.cpp
@ -19,7 +19,7 @@ void SetColumn(Tensor in_, size_t col, float value) {

  float* in = in_->data();
  for(int rowNumber = 0; rowNumber < nRows; ++rowNumber) {
-    int index = col + rowNumber * nColumns;
+    auto index = col + rowNumber * nColumns;
    in[index] = value;
  }
 }
--- a/src/translator/nth_element.cpp
+++ b/src/translator/nth_element.cpp
@ -30,8 +30,8 @@ void NthElementCPU::getNBestList(float* scores,
  std::vector<int> idxs(numProbs);
  std::iota(idxs.begin(), idxs.end(), 0);

-  int numBatches = batchFirstElementIdxs.size() - 1;
-  for(int batchIdx = 0; batchIdx < numBatches; ++batchIdx) {
+  size_t numBatches = batchFirstElementIdxs.size() - 1;
+  for(size_t batchIdx = 0; batchIdx < numBatches; ++batchIdx) {
    int pos = cumulativeBeamSizes[batchIdx];
    int beamSize = cumulativeBeamSizes[batchIdx + 1] - pos;

@ -61,9 +61,9 @@ void NthElementCPU::getNBestList(const std::vector<size_t>& beamSizes,
  std::vector<int> cumulativeBeamSizes(beamSizes.size() + 1, 0);
  std::vector<int> batchFirstElementIdxs(beamSizes.size() + 1, 0);

-  size_t vocabSize = scores->shape()[-1];
-  for(size_t i = 0; i < beamSizes.size(); ++i) {
-    cumulativeBeamSizes[i + 1] = cumulativeBeamSizes[i] + beamSizes[i];
+  auto vocabSize = scores->shape()[-1];
+  for(int i = 0; i < beamSizes.size(); ++i) {
+    cumulativeBeamSizes[i + 1] = cumulativeBeamSizes[i] + (int)beamSizes[i];
    batchFirstElementIdxs[i + 1]
        += (isFirst ? i + 1 : cumulativeBeamSizes[i + 1]) * vocabSize;
  }
--- a/src/translator/output_collector.h
+++ b/src/translator/output_collector.h
@ -29,7 +29,7 @@ class GeometricPrinting : public PrintingStrategy {
 public:
  bool shouldBePrinted(long id) override {
    if(id == 0)
-      next_ = start_;
+      next_ = (long)start_;
    if(id <= 5)
      return true;
    if(next_ == id) {
--- a/src/translator/output_printer.cpp
+++ b/src/translator/output_printer.cpp
@ -24,8 +24,6 @@ std::string OutputPrinter::getAlignment(const Ptr<Hypothesis>& hyp) {
  } else {
    ABORT("Unrecognized word alignment type");
  }
-
-  return "";
 }

 }  // namespace marian
--- a/src/translator/scorers.cpp
+++ b/src/translator/scorers.cpp
@ -69,7 +69,7 @@ std::vector<Ptr<Scorer>> createScorers(Ptr<Config> options) {
    try {
      if(!options->get<bool>("ignore-model-config"))
        modelOptions->loadModelParameters(model);
-    } catch(std::runtime_error& e) {
+    } catch(std::runtime_error&) {
      LOG(warn, "No model settings found in model file");
    }

@ -96,7 +96,7 @@ std::vector<Ptr<Scorer>> createScorers(Ptr<Config> options,
    try {
      if(!options->get<bool>("ignore-model-config"))
        modelOptions->loadModelParameters(ptr);
-    } catch(std::runtime_error& e) {
+    } catch(std::runtime_error&) {
      LOG(warn, "No model settings found in model file");
    }

--- a/src/translator/translator.h
+++ b/src/translator/translator.h
@ -111,7 +111,7 @@ public:
          std::stringstream best1;
          std::stringstream bestn;
          printer->print(history, best1, bestn);
-          collector->Write(history->GetLineNum(),
+          collector->Write((long)history->GetLineNum(),
                           best1.str(),
                           bestn.str(),
                           options_->get<bool>("n-best"));
--- a/vs/Marian.vcxproj
+++ b/vs/Marian.vcxproj
@ -63,14 +63,14 @@
    <ClCompile>
      <PrecompiledHeader>
      </PrecompiledHeader>
-      <WarningLevel>Level1</WarningLevel>
+      <WarningLevel>Level4</WarningLevel>
      <Optimization>Disabled</Optimization>
      <PreprocessorDefinitions>MKL_FOUND=1; BLAS_FOUND=1; MKL_ILP64; WIN32;_DEBUG;_CONSOLE;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
      <SDLCheck>true</SDLCheck>
-      <TreatWarningAsError>false</TreatWarningAsError>
+      <TreatWarningAsError>true</TreatWarningAsError>
      <AdditionalOptions>/bigobj %(AdditionalOptions)</AdditionalOptions>
      <RuntimeLibrary Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">MultiThreadedDebugDLL</RuntimeLibrary>
-      <DisableSpecificWarnings>4996;4244</DisableSpecificWarnings>
+      <DisableSpecificWarnings>4996; 4702</DisableSpecificWarnings>
      <MultiProcessorCompilation>true</MultiProcessorCompilation>
      <MinimalRebuild>false</MinimalRebuild>
    </ClCompile>
@ -79,6 +79,7 @@
      <GenerateDebugInformation>true</GenerateDebugInformation>
      <AdditionalDependencies>zlib.lib; mkl_intel_ilp64.lib; mkl_sequential.lib; mkl_core.lib; kernel32.lib; user32.lib; gdi32.lib; winspool.lib; comdlg32.lib; advapi32.lib; shell32.lib; ole32.lib; oleaut32.lib; uuid.lib; odbc32.lib; odbccp32.lib; %(AdditionalDependencies)</AdditionalDependencies>
      <StackReserveSize>100000000</StackReserveSize>
+      <TreatLinkerWarningAsErrors>true</TreatLinkerWarningAsErrors>
    </Link>
  </ItemDefinitionGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
@ -93,12 +94,12 @@
      <SDLCheck>true</SDLCheck>
      <FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
      <AdditionalOptions>/d2Zi+ /bigobj %(AdditionalOptions)</AdditionalOptions>
-      <TreatWarningAsError>false</TreatWarningAsError>
+      <TreatWarningAsError>true</TreatWarningAsError>
      <RuntimeLibrary Condition="'$(Configuration)|$(Platform)'=='Release|x64'">MultiThreadedDLL</RuntimeLibrary>
      <RuntimeLibrary Condition="'$(Configuration)|$(Platform)'=='Release_NoOpt|x64'">MultiThreaded</RuntimeLibrary>
      <InlineFunctionExpansion>AnySuitable</InlineFunctionExpansion>
      <OmitFramePointers>true</OmitFramePointers>
-      <DisableSpecificWarnings>4996</DisableSpecificWarnings>
+      <DisableSpecificWarnings>4996; 4702</DisableSpecificWarnings>
      <MultiProcessorCompilation>true</MultiProcessorCompilation>
    </ClCompile>
    <Link>
@ -108,6 +109,7 @@
      <OptimizeReferences>true</OptimizeReferences>
      <AdditionalDependencies>zlib.lib; mkl_intel_ilp64.lib; mkl_sequential.lib; mkl_core.lib; kernel32.lib; user32.lib; gdi32.lib; winspool.lib; comdlg32.lib; advapi32.lib; shell32.lib; ole32.lib; oleaut32.lib; uuid.lib; odbc32.lib; odbccp32.lib; %(AdditionalDependencies)</AdditionalDependencies>
      <StackReserveSize>100000000</StackReserveSize>
+      <TreatLinkerWarningAsErrors>true</TreatLinkerWarningAsErrors>
    </Link>
  </ItemDefinitionGroup>
  <ItemGroup>
--- a/vs/Marian.vcxproj.filters
+++ b/vs/Marian.vcxproj.filters
@ -220,9 +220,6 @@
    <ClCompile Include="..\src\common\binary.cpp">
      <Filter>common</Filter>
    </ClCompile>
-    <ClCompile Include="..\src\command\marian_conv.cpp">
-      <Filter>command</Filter>
-    </ClCompile>
    <ClCompile Include="..\src\data\alignment.cpp">
      <Filter>data</Filter>
    </ClCompile>
@ -241,9 +238,6 @@
    <ClCompile Include="..\src\3rd_party\yaml-cpp\binary_renamed.cpp">
      <Filter>3rd_party\yaml-cpp</Filter>
    </ClCompile>
-    <ClCompile Include="..\src\command\marian.cpp">
-      <Filter>command</Filter>
-    </ClCompile>
    <ClCompile Include="..\src\training\graph_group_multinode_sync.cpp">
      <Filter>training</Filter>
    </ClCompile>
@ -1033,6 +1027,13 @@
    <ClInclude Include="..\src\command\marian_vocab.cpp">
      <Filter>command</Filter>
    </ClInclude>
+    <ClInclude Include="..\src\command\marian.cpp">
+      <Filter>command</Filter>
+    </ClInclude>
+    <ClInclude Include="..\src\command\marian_decoder.cpp">
+      <Filter>command</Filter>
+    </ClInclude>
+    <ClInclude Include="..\src\command\marian_conv.cpp" />
  </ItemGroup>
  <ItemGroup>
    <Filter Include="3rd_party">