From 07c39c7d76587c439379a2ff4c208bad956cff87 Mon Sep 17 00:00:00 2001
From: Roman Grundkiewicz <rgrundkiewicz@gmail.com>
Date: Fri, 28 Jan 2022 14:16:41 +0000
Subject: [PATCH] Cherry picked cleaning/refeactoring patches (#905)

Cherry-picked updates from pull request #457

Co-authored-by: Mateusz Chudyk <mateuszchudyk@gmail.com>
---
 src/common/cli_wrapper.cpp         | 13 ++++++--
 src/common/cli_wrapper.h           | 50 ++++++++++++++----------------
 src/common/config.h                |  2 +-
 src/common/config_parser.cpp       | 26 ++++++++--------
 src/common/definitions.h           |  2 +-
 src/common/logging.cpp             |  4 +--
 src/common/logging.h               | 12 +++----
 src/common/options.h               |  2 +-
 src/data/corpus_base.h             |  6 ++--
 src/data/dataset.h                 |  2 +-
 src/data/vocab.h                   |  2 +-
 src/graph/expression_graph.h       | 34 ++++++++++----------
 src/graph/expression_operators.cpp | 18 +++++------
 src/layers/convolution.h           |  4 +--
 src/layers/loss.h                  |  4 +--
 src/layers/word2vec_reader.h       |  2 +-
 src/models/bert.h                  |  4 +--
 src/models/encoder_classifier.h    |  2 +-
 src/models/encoder_decoder.cpp     |  2 +-
 src/models/transformer.h           |  2 +-
 src/training/communicator.cpp      |  4 +--
 src/training/graph_group.h         | 24 +++++++-------
 22 files changed, 113 insertions(+), 108 deletions(-)

diff --git a/src/common/cli_wrapper.cpp b/src/common/cli_wrapper.cpp
index 211dd0b9..fee50a2c 100644
--- a/src/common/cli_wrapper.cpp
+++ b/src/common/cli_wrapper.cpp
@@ -113,10 +113,10 @@ std::string CLIWrapper::switchGroup(std::string name) {
   return name;
 }
 
-void CLIWrapper::parse(int argc, char **argv) {
+void CLIWrapper::parse(int argc, char** argv) {
   try {
     app_->parse(argc, argv);
-  } catch(const CLI::ParseError &e) {
+  } catch(const CLI::ParseError& e) {
     exit(app_->exit(e));
   }
 
@@ -182,6 +182,13 @@ void CLIWrapper::parseAliases() {
   }
 }
 
+std::string CLIWrapper::keyName(const std::string& args) const {
+  // re-use existing functions from CLI11 to keep option names consistent
+  return std::get<1>(
+              CLI::detail::get_names(CLI::detail::split_names(args)))  // get long names only
+      .front();                                                        // get first long name
+}
+
 void CLIWrapper::updateConfig(const YAML::Node &config, cli::OptionPriority priority, const std::string &errorMsg) {
   auto cmdOptions = getParsedOptionNames();
   // Keep track of unrecognized options from the provided config
@@ -276,7 +283,7 @@ std::vector<std::string> CLIWrapper::getOrderedOptionNames() const {
   for(auto const &it : options_)
     keys.push_back(it.first);
   // sort option names by creation index
-  sort(keys.begin(), keys.end(), [this](const std::string &a, const std::string &b) {
+  sort(keys.begin(), keys.end(), [this](const std::string& a, const std::string& b) {
     return options_.at(a).idx < options_.at(b).idx;
   });
   return keys;
diff --git a/src/common/cli_wrapper.h b/src/common/cli_wrapper.h
index 349d353b..da8ebd6d 100644
--- a/src/common/cli_wrapper.h
+++ b/src/common/cli_wrapper.h
@@ -44,7 +44,7 @@ struct CLIAliasTuple {
 class CLIFormatter : public CLI::Formatter {
 public:
   CLIFormatter(size_t columnWidth, size_t screenWidth);
-  virtual std::string make_option_desc(const CLI::Option *) const override;
+  virtual std::string make_option_desc(const CLI::Option*) const override;
 
 private:
   size_t screenWidth_{0};
@@ -85,12 +85,7 @@ private:
 
   // Extract option name from a comma-separated list of long and short options, e.g. 'help' from
   // '--help,-h'
-  std::string keyName(const std::string &args) const {
-    // re-use existing functions from CLI11 to keep option names consistent
-    return std::get<1>(
-               CLI::detail::get_names(CLI::detail::split_names(args)))  // get long names only
-        .front();                                                       // get first long name
-  }
+  std::string keyName(const std::string &args) const;
 
   // Get names of options passed via command-line
   std::unordered_set<std::string> getParsedOptionNames() const;
@@ -134,7 +129,7 @@ public:
    * @return Option object
    */
   template <typename T>
-  CLI::Option *add(const std::string &args, const std::string &help, T val) {
+  CLI::Option* add(const std::string& args, const std::string& help, T val) {
     return addOption<T>(keyName(args),
                         args,
                         help,
@@ -159,7 +154,7 @@ public:
    * @TODO: require to always state the default value creating the parser as this will be clearer
    */
   template <typename T>
-  CLI::Option *add(const std::string &args, const std::string &help) {
+  CLI::Option* add(const std::string& args, const std::string& help) {
     return addOption<T>(keyName(args),
                         args,
                         help,
@@ -206,7 +201,7 @@ public:
   std::string switchGroup(std::string name = "");
 
   // Parse command-line arguments. Handles --help and --version options
-  void parse(int argc, char **argv);
+  void parse(int argc, char** argv);
 
   /**
    * @brief Expand aliases based on arguments parsed with parse(int, char**)
@@ -240,11 +235,12 @@ public:
   std::string dumpConfig(bool skipUnmodified = false) const;
 
 private:
-  template <typename T,
-            // options with numeric and string-like values
-            CLI::enable_if_t<!CLI::is_bool<T>::value && !CLI::is_vector<T>::value,
-                             CLI::detail::enabler> = CLI::detail::dummy>
-  CLI::Option *addOption(const std::string &key,
+  template <typename T>
+  using EnableIfNumbericOrString = CLI::enable_if_t<!CLI::is_bool<T>::value
+                                   && !CLI::is_vector<T>::value, CLI::detail::enabler>;
+
+  template <typename T, EnableIfNumbericOrString<T> = CLI::detail::dummy>
+  CLI::Option* addOption(const std::string &key,
                          const std::string &args,
                          const std::string &help,
                          T val,
@@ -261,7 +257,7 @@ private:
     CLI::callback_t fun = [this, key](CLI::results_t res) {
       options_[key].priority = cli::OptionPriority::CommandLine;
       // get variable associated with the option
-      auto &var = options_[key].var->as<T>();
+      auto& var = options_[key].var->as<T>();
       // store parser result in var
       auto ret = CLI::detail::lexical_cast(res[0], var);
       // update YAML entry
@@ -288,10 +284,11 @@ private:
     return options_[key].opt;
   }
 
-  template <typename T,
-            // options with vector values
-            CLI::enable_if_t<CLI::is_vector<T>::value, CLI::detail::enabler> = CLI::detail::dummy>
-  CLI::Option *addOption(const std::string &key,
+  template <typename T>
+  using EnableIfVector = CLI::enable_if_t<CLI::is_vector<T>::value, CLI::detail::enabler>;
+
+  template <typename T, EnableIfVector<T> = CLI::detail::dummy>
+  CLI::Option* addOption(const std::string &key,
                          const std::string &args,
                          const std::string &help,
                          T val,
@@ -308,7 +305,7 @@ private:
     CLI::callback_t fun = [this, key](CLI::results_t res) {
       options_[key].priority = cli::OptionPriority::CommandLine;
       // get vector variable associated with the option
-      auto &vec = options_[key].var->as<T>();
+      auto& vec = options_[key].var->as<T>();
       vec.clear();
       bool ret = true;
       // handle '[]' as an empty vector
@@ -316,7 +313,7 @@ private:
         ret = true;
       } else {
         // populate the vector with parser results
-        for(const auto &a : res) {
+        for(const auto& a : res) {
           vec.emplace_back();
           ret &= CLI::detail::lexical_cast(a, vec.back());
         }
@@ -345,10 +342,11 @@ private:
     return options_[key].opt;
   }
 
-  template <typename T,
-            // options with boolean values, called flags in CLI11
-            CLI::enable_if_t<CLI::is_bool<T>::value, CLI::detail::enabler> = CLI::detail::dummy>
-  CLI::Option *addOption(const std::string &key,
+  template <typename T>
+  using EnableIfBoolean = CLI::enable_if_t<CLI::is_bool<T>::value, CLI::detail::enabler>;
+
+  template <typename T, EnableIfBoolean<T> = CLI::detail::dummy>
+  CLI::Option* addOption(const std::string &key,
                          const std::string &args,
                          const std::string &help,
                          T val,
diff --git a/src/common/config.h b/src/common/config.h
index 255c50ad..c5a016e6 100644
--- a/src/common/config.h
+++ b/src/common/config.h
@@ -107,7 +107,7 @@ private:
  * @param mode change the set of available command-line options, e.g. training, translation, etc.
  * @param validate validate parsed options and abort on failure
  *
- * @return parsed otions
+ * @return parsed options
  */
 Ptr<Options> parseOptions(int argc,
                           char** argv,
diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp
index 9705d5b7..333d87a7 100644
--- a/src/common/config_parser.cpp
+++ b/src/common/config_parser.cpp
@@ -119,10 +119,10 @@ void ConfigParser::addOptionsGeneral(cli::CLIWrapper& cli) {
   cli.add<std::vector<std::string>>("--config,-c",
     "Configuration file(s). If multiple, later overrides earlier");
   cli.add<size_t>("--workspace,-w",
-    "Preallocate  arg  MB of work space",
+    "Preallocate arg MB of work space",
     defaultWorkspace);
   cli.add<std::string>("--log",
-    "Log training process information to file given by  arg");
+    "Log training process information to file given by arg");
   cli.add<std::string>("--log-level",
     "Set verbosity level of logging: trace, debug, info, warn, err(or), critical, off",
     "info");
@@ -392,17 +392,17 @@ void ConfigParser::addOptionsTraining(cli::CLIWrapper& cli) {
       "Finish after this many chosen training units, 0 is infinity (e.g. 100e = 100 epochs, 10Gt = 10 billion target labels, 100Ku = 100,000 updates",
       "0e");
   cli.add<std::string/*SchedulerPeriod*/>("--disp-freq",
-      "Display information every  arg  updates (append 't' for every  arg  target labels)",
+      "Display information every arg updates (append 't' for every arg target labels)",
       "1000u");
   cli.add<size_t>("--disp-first",
-      "Display information for the first  arg  updates");
+      "Display information for the first arg updates");
   cli.add<bool>("--disp-label-counts",
       "Display label counts when logging loss progress",
       true);
 //   cli.add<int>("--disp-label-index",
 //       "Display label counts based on i-th input stream (-1 is last)", -1);
   cli.add<std::string/*SchedulerPeriod*/>("--save-freq",
-      "Save model file every  arg  updates (append 't' for every  arg  target labels)",
+      "Save model file every arg updates (append 't' for every arg target labels)",
       "10000u");
   cli.add<std::vector<std::string>>("--logical-epoch",
       "Redefine logical epoch counter as multiple of data epochs (e.g. 1e), updates (e.g. 100Ku) or labels (e.g. 1Gt). "
@@ -473,12 +473,12 @@ void ConfigParser::addOptionsTraining(cli::CLIWrapper& cli) {
   cli.add<bool>("--lr-decay-repeat-warmup",
      "Repeat learning rate warmup when learning rate is decayed");
   cli.add<std::vector<std::string/*SchedulerPeriod*/>>("--lr-decay-inv-sqrt",
-     "Decrease learning rate at arg / sqrt(no. batches) starting at arg  (append 't' or 'e' for sqrt(target labels or epochs)). "
+     "Decrease learning rate at arg / sqrt(no. batches) starting at arg (append 't' or 'e' for sqrt(target labels or epochs)). "
      "Add second argument to define the starting point (default: same as first value)",
      {"0"});
 
   cli.add<std::string/*SchedulerPeriod*/>("--lr-warmup",
-     "Increase learning rate linearly for  arg  first batches (append 't' for  arg  first target labels)",
+     "Increase learning rate linearly for arg first batches (append 't' for arg first target labels)",
       "0");
   cli.add<float>("--lr-warmup-start-rate",
      "Start value for learning rate warmup");
@@ -492,7 +492,7 @@ void ConfigParser::addOptionsTraining(cli::CLIWrapper& cli) {
   cli.add<double>("--factor-weight",
      "Weight for loss function for factors (factored vocab only) (1 to disable)", 1.0f);
   cli.add<float>("--clip-norm",
-     "Clip gradient norm to  arg  (0 to disable)",
+     "Clip gradient norm to arg (0 to disable)",
      1.f); // @TODO: this is currently wrong with ce-sum and should rather be disabled or fixed by multiplying with labels
   cli.add<float>("--exponential-smoothing",
      "Maintain smoothed version of parameters for validation and saving with smoothing factor. 0 to disable. "
@@ -575,7 +575,7 @@ void ConfigParser::addOptionsValidation(cli::CLIWrapper& cli) {
   cli.add<std::vector<std::string>>("--valid-sets",
       "Paths to validation corpora: source target");
   cli.add<std::string/*SchedulerPeriod*/>("--valid-freq",
-      "Validate model every  arg  updates (append 't' for every  arg  target labels)",
+      "Validate model every arg updates (append 't' for every arg target labels)",
       "10000u");
   cli.add<std::vector<std::string>>("--valid-metrics",
       "Metric to use during validation: cross-entropy, ce-mean-words, perplexity, valid-script, "
@@ -585,7 +585,7 @@ void ConfigParser::addOptionsValidation(cli::CLIWrapper& cli) {
   cli.add<bool>("--valid-reset-stalled",
      "Reset all stalled validation metrics when the training is restarted");
   cli.add<size_t>("--early-stopping",
-     "Stop if the first validation metric does not improve for  arg  consecutive validation steps",
+     "Stop if the first validation metric does not improve for arg consecutive validation steps",
      10);
   cli.add<std::string>("--early-stopping-on",
       "Decide if early stopping should take into account first, all, or any validation metrics"
@@ -637,7 +637,7 @@ void ConfigParser::addOptionsValidation(cli::CLIWrapper& cli) {
   cli.add<bool>("--keep-best",
       "Keep best model for each validation metric");
   cli.add<std::string>("--valid-log",
-     "Log validation scores to file given by  arg");
+     "Log validation scores to file given by arg");
   cli.switchGroup(previous_group);
   // clang-format on
 }
@@ -942,10 +942,10 @@ void ConfigParser::addSuboptionsULR(cli::CLIWrapper& cli) {
   cli.add<std::string>("--ulr-query-vectors",
       "Path to file with universal sources embeddings from projection into universal space",
       "");
-  // keys: EK in Fig2 : is the keys of the target embbedings projected to unified space (i.e. ENU in
+  // keys: EK in Fig2 : is the keys of the target embeddings projected to unified space (i.e. ENU in
   // multi-lingual case)
   cli.add<std::string>("--ulr-keys-vectors",
-      "Path to file with universal sources embeddings of traget keys from projection into universal space",
+      "Path to file with universal sources embeddings of target keys from projection into universal space",
       "");
   cli.add<bool>("--ulr-trainable-transformation",
       "Make Query Transformation Matrix A trainable");
diff --git a/src/common/definitions.h b/src/common/definitions.h
index d2cf8aa4..159791d0 100644
--- a/src/common/definitions.h
+++ b/src/common/definitions.h
@@ -10,7 +10,7 @@
 #include <string>
 #include <vector>
 
-#define THREAD_GUARD(body) [&]() { body; }() // test if THREAD_GUARD is neccessary, remove if no problems occur.
+#define THREAD_GUARD(body) [&]() { body; }() // test if THREAD_GUARD is necessary, remove if no problems occur.
 #define NodeOp(op) [=]() { op; }
 
 // helper macro to disable optimization (gcc only)
diff --git a/src/common/logging.cpp b/src/common/logging.cpp
index 6ecc4099..f77a41df 100644
--- a/src/common/logging.cpp
+++ b/src/common/logging.cpp
@@ -136,11 +136,11 @@ static void setErrorHandlers() {
 
 // modify the log pattern for the "general" logger to include the MPI rank
 // This is called upon initializing MPI. It is needed to associated error messages to ranks.
-void switchtoMultinodeLogging(std::string nodeIdStr) {
+void switchToMultinodeLogging(std::string nodeIdStr) {
   Logger log = spdlog::get("general");
   if(log)
     log->set_pattern(fmt::format("[%Y-%m-%d %T mpi:{}] %v", nodeIdStr));
-  
+
   Logger valid = spdlog::get("valid");
   if(valid)
     valid->set_pattern(fmt::format("[%Y-%m-%d %T mpi:{}] [valid] %v", nodeIdStr));
diff --git a/src/common/logging.h b/src/common/logging.h
index 855bda90..b350d603 100644
--- a/src/common/logging.h
+++ b/src/common/logging.h
@@ -12,19 +12,19 @@ namespace marian {
   std::string getCallStack(size_t skipLevels);
 
   // Marian gives a basic exception guarantee. If you catch a
-  // MarianRuntimeError you must assume that the object can be 
+  // MarianRuntimeError you must assume that the object can be
   // safely destructed, but cannot be used otherwise.
 
-  // Internal multi-threading in exception-throwing mode is not 
+  // Internal multi-threading in exception-throwing mode is not
   // allowed; and constructing a thread-pool will cause an exception.
-  
+
   class MarianRuntimeException : public std::runtime_error {
   private:
     std::string callStack_;
 
   public:
-    MarianRuntimeException(const std::string& message, const std::string& callStack) 
-    : std::runtime_error(message), 
+    MarianRuntimeException(const std::string& message, const std::string& callStack)
+    : std::runtime_error(message),
       callStack_(callStack) {}
 
     const char* getCallStack() const throw() {
@@ -178,4 +178,4 @@ void checkedLog(std::string logger, std::string level, Args... args) {
 }
 
 void createLoggers(const marian::Config* options = nullptr);
-void switchtoMultinodeLogging(std::string nodeIdStr);
+void switchToMultinodeLogging(std::string nodeIdStr);
diff --git a/src/common/options.h b/src/common/options.h
index 08c6a3ca..992be876 100644
--- a/src/common/options.h
+++ b/src/common/options.h
@@ -98,7 +98,7 @@ public:
    * @brief Splice options from a YAML node
    *
    * By default, only options with keys that do not already exist in options_ are extracted from
-   * node. These options are cloned if overwirte is true.
+   * node. These options are cloned if overwrite is true.
    *
    * @param node a YAML node to transfer the options from
    * @param overwrite overwrite all options
diff --git a/src/data/corpus_base.h b/src/data/corpus_base.h
index 63a6fb99..d504a7ea 100644
--- a/src/data/corpus_base.h
+++ b/src/data/corpus_base.h
@@ -379,7 +379,7 @@ public:
    * @see marian::data::SubBatch::split(size_t n)
    */
   std::vector<Ptr<Batch>> split(size_t n, size_t sizeLimit /*=SIZE_MAX*/) override {
-    ABORT_IF(size() == 0, "Encoutered batch size of 0");
+    ABORT_IF(size() == 0, "Encountered batch size of 0");
 
     std::vector<std::vector<Ptr<SubBatch>>> subs; // [subBatchIndex][streamIndex]
     // split each stream separately
@@ -523,8 +523,8 @@ class CorpusBase : public DatasetBase<SentenceTuple, CorpusIterator, CorpusBatch
 public:
   typedef SentenceTuple Sample;
 
-  CorpusBase(Ptr<Options> options, 
-             bool translate = false, 
+  CorpusBase(Ptr<Options> options,
+             bool translate = false,
              size_t seed = Config::seed);
 
   CorpusBase(const std::vector<std::string>& paths,
diff --git a/src/data/dataset.h b/src/data/dataset.h
index 881126a3..3cdccec9 100644
--- a/src/data/dataset.h
+++ b/src/data/dataset.h
@@ -44,7 +44,7 @@ public:
   virtual void prepare() {}
   virtual void restore(Ptr<TrainingState>) {}
 
-  // @TODO: remove after cleaning traininig/training.h
+  // @TODO: remove after cleaning training/training.h
   virtual Ptr<Options> options() { return options_; }
 };
 
diff --git a/src/data/vocab.h b/src/data/vocab.h
index 4af82e8e..7eeca290 100644
--- a/src/data/vocab.h
+++ b/src/data/vocab.h
@@ -10,7 +10,7 @@ namespace marian {
 class IVocab;
 
 // Wrapper around vocabulary types. Can choose underlying
-// vocabulary implementation (vImpl_) based on speficied path
+// vocabulary implementation (vImpl_) based on specified path
 // and suffix.
 // Vocabulary implementations can currently be:
 // * DefaultVocabulary for YAML (*.yml and *.yaml) and TXT (any other non-specific ending)
diff --git a/src/graph/expression_graph.h b/src/graph/expression_graph.h
index c532abff..7e2a5704 100644
--- a/src/graph/expression_graph.h
+++ b/src/graph/expression_graph.h
@@ -76,7 +76,7 @@ public:
 
   Ptr<Allocator>       getAllocator() { return tensors_->allocator(); }
   Ptr<TensorAllocator> getTensorAllocator() { return tensors_; }
-  
+
   Expr findOrRemember(Expr node) {
     size_t hash = node->hash();
     // memoize constant nodes that are not parameters
@@ -359,9 +359,9 @@ private:
 
   // Find the named parameter and its typed parent parameter object (params) and return both.
   // If the parameter is not found return the parent parameter object that the parameter should be added to.
-  // Return [nullptr, nullptr] if no matching parent parameter object exists. 
-  std::tuple<Expr, Ptr<Parameters>> findParams(const std::string& name, 
-                                               Type elementType, 
+  // Return [nullptr, nullptr] if no matching parent parameter object exists.
+  std::tuple<Expr, Ptr<Parameters>> findParams(const std::string& name,
+                                               Type elementType,
                                                bool typeSpecified) const {
     Expr p; Ptr<Parameters> params;
     if(typeSpecified) { // type has been specified, so we are only allowed to look for a parameter with that type
@@ -373,12 +373,12 @@ private:
     } else { // type has not been specified, so we take any type as long as the name matches
       for(auto kvParams : paramsByElementType_) {
         p = kvParams.second->get(name);
-        
+
         if(p) { // p has been found, return with matching params object
           params = kvParams.second;
           break;
         }
-        
+
         if(kvParams.first == elementType) // even if p has not been found, set the params object to be returned
           params = kvParams.second;
       }
@@ -399,8 +399,8 @@ private:
 
     Expr p; Ptr<Parameters> params; std::tie
     (p, params) = findParams(name, elementType, typeSpecified);
-    
-    if(!params) { 
+
+    if(!params) {
       params = New<Parameters>(elementType);
       params->init(backend_);
       paramsByElementType_.insert({elementType, params});
@@ -632,13 +632,13 @@ public:
    * Return the Parameters object related to the graph.
    * The Parameters object holds the whole set of the parameter nodes.
    */
-  Ptr<Parameters>& params() { 
+  Ptr<Parameters>& params() {
     // There are no parameter objects, that's weird.
     ABORT_IF(paramsByElementType_.empty(), "No parameter object has been created");
-    
+
     // Safeguard against accessing parameters from the outside with multiple parameter types, not yet supported
     ABORT_IF(paramsByElementType_.size() > 1, "Calling of params() is currently not supported with multiple ({}) parameters", paramsByElementType_.size());
-    
+
     // Safeguard against accessing parameters from the outside with other than default parameter type, not yet supported
     auto it = paramsByElementType_.find(defaultElementType_);
     ABORT_IF(it == paramsByElementType_.end(), "Parameter object for type {} does not exist", defaultElementType_);
@@ -650,7 +650,7 @@ public:
    * Return the Parameters object related to the graph by elementType.
    * The Parameters object holds the whole set of the parameter nodes of the given type.
    */
-  Ptr<Parameters>& params(Type elementType) {     
+  Ptr<Parameters>& params(Type elementType) {
     auto it = paramsByElementType_.find(elementType);
     ABORT_IF(it == paramsByElementType_.end(), "Parameter object for type {} does not exist", defaultElementType_);
     return it->second;
@@ -661,8 +661,8 @@ public:
    * The default value is used if some node type is not specified.
    */
   void setDefaultElementType(Type defaultElementType) {
-    ABORT_IF(!paramsByElementType_.empty() && defaultElementType != defaultElementType_, 
-             "Parameter objects already exist, cannot change default type from {} to {}", 
+    ABORT_IF(!paramsByElementType_.empty() && defaultElementType != defaultElementType_,
+             "Parameter objects already exist, cannot change default type from {} to {}",
              defaultElementType_, defaultElementType);
     defaultElementType_ = defaultElementType;
   }
@@ -746,7 +746,7 @@ public:
       // skip over special parameters starting with "special:"
       if(pName.substr(0, 8) == "special:")
         continue;
-      
+
       // if during loading the loaded type is of the same type class as the default element type, allow conversion;
       // otherwise keep the loaded type. This is used when e.g. loading a float32 model as a float16 model as both
       // have type class TypeClass::float_type.
@@ -781,9 +781,9 @@ public:
 
     LOG(info, "Memory mapping model at {}", ptr);
     auto items = io::mmapItems(ptr);
-    
+
     // Deal with default parameter set object that might not be a mapped object.
-    // This gets assigned during ExpressionGraph::setDevice(...) and by default 
+    // This gets assigned during ExpressionGraph::setDevice(...) and by default
     // would contain allocated tensors. Here we replace it with a mmapped version.
     auto it = paramsByElementType_.find(defaultElementType_);
     if(it != paramsByElementType_.end()) {
diff --git a/src/graph/expression_operators.cpp b/src/graph/expression_operators.cpp
index 560ab4e7..322a29ad 100644
--- a/src/graph/expression_operators.cpp
+++ b/src/graph/expression_operators.cpp
@@ -27,12 +27,12 @@ Expr checkpoint(Expr a) {
   return a;
 }
 
-Expr lambda(const std::vector<Expr>& nodes, Shape shape, Type type, 
+Expr lambda(const std::vector<Expr>& nodes, Shape shape, Type type,
             LambdaNodeFunctor fwd, size_t hash) {
   return Expression<LambdaNodeOp>(nodes, shape, type, fwd, hash);
 }
 
-Expr lambda(const std::vector<Expr>& nodes, Shape shape, Type type, 
+Expr lambda(const std::vector<Expr>& nodes, Shape shape, Type type,
             LambdaNodeFunctor fwd, LambdaNodeFunctor bwd, size_t hash) {
   return Expression<LambdaNodeOp>(nodes, shape, type, fwd, bwd, hash);
 }
@@ -436,7 +436,7 @@ Expr std(Expr a, int ax) {
   return Expression<ReduceNodeOp>(a - mean(a, ax), ax, ReduceNodeOpCode::rms);
 }
 
-Expr var(Expr a, int ax) { 
+Expr var(Expr a, int ax) {
   if(a->shape()[ax] == 1) // nothing to reduce, var(a) = 0
     return a - a;
   return Expression<ReduceNodeOp>(a - mean(a, ax), ax, ReduceNodeOpCode::meanSqr);
@@ -575,8 +575,8 @@ Expr affineDefault(Expr a, Expr b, Expr bias, bool transA, bool transB, float sc
   return Expression<AffineNodeOp>(nodes, transA, transB, scale);
 }
 
-// This operation used to implement auto-tuning. We have removed it for now due to complexity, but plan to revisit it in the future. 
-// The last branch with auto-tuner is: 
+// This operation used to implement auto-tuning. We have removed it for now due to complexity, but plan to revisit it in the future.
+// The last branch with auto-tuner is:
 // youki/packed-model-pr-backup1031
 // https://machinetranslation.visualstudio.com/Marian/_git/marian-dev?version=GByouki%2Fpacked-model-pr-backup1031
 // SHA: 3456a7ed1d1608cfad74cd2c414e7e8fe141aa52
@@ -660,8 +660,8 @@ Expr affine(Expr a, Expr b, Expr bias, bool transA, bool transB, float scale) {
     }
   } else {
     // Default GEMM
-    ABORT_IF(!isFloat(aElementType) || !isFloat(bElementType), 
-             "GPU-based GEMM only supports float types, you have A: {} and B: {}", 
+    ABORT_IF(!isFloat(aElementType) || !isFloat(bElementType),
+             "GPU-based GEMM only supports float types, you have A: {} and B: {}",
              aElementType, bElementType);
     return affineDefault(a, b, bias, transA, transB, scale);
   }
@@ -669,7 +669,7 @@ Expr affine(Expr a, Expr b, Expr bias, bool transA, bool transB, float scale) {
 
 Expr affineWithRelu(Expr a, Expr b, Expr bias, bool transA, bool transB, float scale) {
   auto graph = a->graph();
-  
+
   if(graph->isInference() && graph->getDeviceId().type == DeviceType::gpu)
     return Expression<AffineWithReluNodeOp>(a, b, bias, transA, transB, scale);
   else
@@ -775,7 +775,7 @@ Expr unlikelihood(Expr logits, Expr indices) {
   int dimBatch = logits->shape()[-2];
   int dimTime  = logits->shape()[-3];
 
-  // @TODO: fix this outside of this function in decoder.h etc. 
+  // @TODO: fix this outside of this function in decoder.h etc.
   auto indicesWithLayout = reshape(indices, {1, dimTime, dimBatch, 1});
 
   // This is currently implemented with multiple ops, might be worth doing a special operation like for cross_entropy
diff --git a/src/layers/convolution.h b/src/layers/convolution.h
index c6024f63..463419e8 100644
--- a/src/layers/convolution.h
+++ b/src/layers/convolution.h
@@ -70,9 +70,9 @@ public:
       outputs.push_back(output2);
     }
 
-    auto concated = concatenate(outputs, -1);
+    auto concatenated = concatenate(outputs, -1);
 
-    return concated;
+    return concatenated;
   }
 
 protected:
diff --git a/src/layers/loss.h b/src/layers/loss.h
index c662f991..5dbb5e55 100644
--- a/src/layers/loss.h
+++ b/src/layers/loss.h
@@ -67,7 +67,7 @@ public:
     return count_->val()->scalar<T>();
   }
 
-  // @TODO: add a funtion for returning maybe ratio?
+  // @TODO: add a function for returning maybe ratio?
 
   size_t size() const {
     ABORT_IF(!count_, "Labels have not been defined");
@@ -189,7 +189,7 @@ public:
  *
  * L = sum_i^N L_i + N/M sum_j^M L_j
  *
- * We set labels to N. When reporting L/N this is equvalient to sum of means.
+ * We set labels to N. When reporting L/N this is equivalent to sum of means.
  * Compare to sum of means below where N is factored into the loss, but labels
  * are set to 1.
  */
diff --git a/src/layers/word2vec_reader.h b/src/layers/word2vec_reader.h
index 4bfc6709..c76d3a9b 100644
--- a/src/layers/word2vec_reader.h
+++ b/src/layers/word2vec_reader.h
@@ -76,7 +76,7 @@ private:
     float scale = sqrtf(2.0f / (dimVoc + dimEmb));
 
     // @TODO: switch to new random generator back-end.
-    // This is rarly used however.
+    // This is rarely used however.
     std::random_device rd;
     std::mt19937 engine(rd());
 
diff --git a/src/models/bert.h b/src/models/bert.h
index 51427457..99dfae55 100644
--- a/src/models/bert.h
+++ b/src/models/bert.h
@@ -8,7 +8,7 @@
 namespace marian {
 
 /**
- * This file contains nearly all BERT-related code and adds BERT-funtionality
+ * This file contains nearly all BERT-related code and adds BERT-functionality
  * on top of existing classes like TansformerEncoder and Classifier.
  */
 
@@ -82,7 +82,7 @@ public:
     // Initialize to sample random vocab id
     randomWord_.reset(new std::uniform_int_distribution<WordIndex>(0, (WordIndex)vocab.size()));
 
-    // Intialize to sample random percentage
+    // Initialize to sample random percentage
     randomPercent_.reset(new std::uniform_real_distribution<float>(0.f, 1.f));
 
     auto& words = subBatch->data();
diff --git a/src/models/encoder_classifier.h b/src/models/encoder_classifier.h
index 5c8ddb5a..bb8d2856 100644
--- a/src/models/encoder_classifier.h
+++ b/src/models/encoder_classifier.h
@@ -14,7 +14,7 @@ namespace marian {
  * Can be used to train sequence classifiers like language detection, BERT-next-sentence-prediction etc.
  * Already has support for multi-objective training.
  *
- * @TODO: this should probably be unified somehow with EncoderDecoder which could allow for deocder/classifier
+ * @TODO: this should probably be unified somehow with EncoderDecoder which could allow for decoder/classifier
  * multi-objective training.
  */
 class EncoderClassifierBase : public models::IModel {
diff --git a/src/models/encoder_decoder.cpp b/src/models/encoder_decoder.cpp
index bb938ee5..5711ea1b 100644
--- a/src/models/encoder_decoder.cpp
+++ b/src/models/encoder_decoder.cpp
@@ -220,7 +220,7 @@ Ptr<DecoderState> EncoderDecoder::stepAll(Ptr<ExpressionGraph> graph,
   if(clearGraph)
     clear(graph);
 
-  // Required first step, also intializes shortlist
+  // Required first step, also initializes shortlist
   auto state = startState(graph, batch);
 
   // Fill state with embeddings from batch (ground truth)
diff --git a/src/models/transformer.h b/src/models/transformer.h
index af877600..ec68b801 100644
--- a/src/models/transformer.h
+++ b/src/models/transformer.h
@@ -70,7 +70,7 @@ public:
       // Hack for translating with length longer than trained embeddings
       // We check if the embedding matrix "Wpos" already exist so we can
       // check the number of positions in that loaded parameter.
-      // We then have to restict the maximum length to the maximum positon
+      // We then have to restrict the maximum length to the maximum positon
       // and positions beyond this will be the maximum position.
       Expr seenEmb = graph_->get("Wpos");
       int numPos = seenEmb ? seenEmb->shape()[-2] : maxLength;
diff --git a/src/training/communicator.cpp b/src/training/communicator.cpp
index 55d4991b..602f7daa 100644
--- a/src/training/communicator.cpp
+++ b/src/training/communicator.cpp
@@ -101,7 +101,7 @@ public:
       std::string maxRankStr = std::to_string(MPIWrapper::numMPIProcesses() -1);
       while (rankStr.size() < maxRankStr.size()) // pad so that logs across MPI processes line up nicely
         rankStr.insert(rankStr.begin(), ' ');
-      switchtoMultinodeLogging(rankStr);
+      switchToMultinodeLogging(rankStr);
     }
 
     // log hostnames in order, and test
@@ -261,7 +261,7 @@ void finalizeMPI(Ptr<IMPIWrapper>&& mpi) {
   ABORT_IF(mpi == nullptr || mpi != s_mpi, "attempted to finalize an inconsistent MPI instance. This should not be possible.");
   mpi = nullptr; // destruct caller's handle
   ABORT_IF(s_mpiUseCount == 0, "finalize called too many times. This should not be possible.");
-  if (s_mpiUseCount == 1) { // last call finalizes MPI, i.e. tells MPI that we sucessfully completed computation
+  if (s_mpiUseCount == 1) { // last call finalizes MPI, i.e. tells MPI that we successfully completed computation
     ABORT_IF(s_mpi.use_count() != 1, "dangling reference to MPI??"); // caller kept another shared_ptr to this instance
     s_mpi->finalize(); // signal successful completion to MPI
     s_mpi = nullptr;   // release the singleton instance upon last finalization
diff --git a/src/training/graph_group.h b/src/training/graph_group.h
index 422990b1..0e4a68dc 100644
--- a/src/training/graph_group.h
+++ b/src/training/graph_group.h
@@ -13,7 +13,7 @@ namespace marian {
 
 // With -Ofast enabled gcc will fail to identify NaN or Inf. Safeguard here.
 static inline bool isFinite(float x) {
-#ifdef __GNUC__ 
+#ifdef __GNUC__
   ABORT_IF(std::isfinite(0.f / 0.f), "NaN detection unreliable. Disable -Ofast compiler option.");
 #endif
   return std::isfinite(x);
@@ -27,7 +27,7 @@ static inline bool isFinite(float x) {
 // if one value is nonfinite propagate Nan into the reduction.
 static inline void accNanOrNorm(float& lhs, float rhs) {
   if(isFinite(lhs) && isFinite(rhs)) {
-    lhs = sqrtf(lhs * lhs + rhs * rhs); 
+    lhs = sqrtf(lhs * lhs + rhs * rhs);
   } else
     lhs = std::numeric_limits<float>::quiet_NaN();
 }
@@ -42,20 +42,20 @@ static inline void accNanOrNorm(float& lhs, float rhs) {
 class GraphGroup {
 protected:
   Ptr<Options> options_;
-  
+
   Ptr<ICommunicator> comm_; // [not null] communicator, e.g. NCCLCommunicator
   Ptr<IMPIWrapper> mpi_;    // [not null] all MPI-like communication goes through this (this is a dummy implementation if no MPI run)
 
   std::vector<DeviceId> devices_;                   // [deviceIndex]
-  ShardingMode shardingMode_{ShardingMode::global}; // If local and multi-node training, shard only on local devices and do full sync (faster). If global shard across entire set of GPUs (more RAM). 
-  
+  ShardingMode shardingMode_{ShardingMode::global}; // If local and multi-node training, shard only on local devices and do full sync (faster). If global shard across entire set of GPUs (more RAM).
+
   // common for all graph groups, individual graph groups decide how to fill them
   std::vector<Ptr<ExpressionGraph>> graphs_;            // [deviceIndex]
   std::vector<Ptr<models::ICriterionFunction>> models_; // [deviceIndex]
   std::vector<Ptr<OptimizerBase>> optimizerShards_;     // [deviceIndex]
 
   Ptr<Scheduler> scheduler_; // scheduler that keeps track of how much has been processed
-  
+
   bool finalized_{false};    // 'true' if training has completed (further updates are no longer allowed)
   double typicalTrgBatchWords_{0}; // for dynamic batch sizing: typical batch size in words
   bool mbRoundUp_{true}; // round up batches for more efficient training but can make batch size less stable, disable with --mini-batch-round-up=false
@@ -100,16 +100,16 @@ public:
 
   virtual void load();
   virtual void save(bool isFinal = false);
-  
+
 private:
   void load(const OptimizerBase::ScatterStateFunc& scatterFn);
   void save(bool isFinal,
             const OptimizerBase::GatherStateFunc& gatherOptimizerStateFn);
 
-  bool restoreFromCheckpoint(const std::string& modelFileName, 
+  bool restoreFromCheckpoint(const std::string& modelFileName,
                              const OptimizerBase::ScatterStateFunc& scatterFn);
 
-  void saveCheckpoint(const std::string& modelFileName, 
+  void saveCheckpoint(const std::string& modelFileName,
                       const OptimizerBase::GatherStateFunc& gatherFn);
 
 public:
@@ -128,11 +128,11 @@ public:
   float executeAndCollectNorm(const std::function<float(size_t, size_t, size_t)>& task);
 
   float computeNormalizationFactor(float gNorm, size_t updateTrgWords);
-  
+
   /**
    * Determine maximal batch size that can fit into the given workspace
    * so that reallocation does not happen. Rather adjust the batch size
-   * based on the stastistics collected here. Activated with
+   * based on the statistics collected here. Activated with
    * `--mini-batch-fit`.
    * In a multi-GPU scenario, the first GPU is used to determine the size.
    * The actual allowed size is then determined by multiplying it with the
@@ -151,4 +151,4 @@ public:
   void updateAverageTrgBatchWords(size_t trgBatchWords);
 };
 
-}  // namespace marian
\ No newline at end of file
+}  // namespace marian