Enables model ensembles (#450)

* Enables model ensembles

  Adds the ability to use ensembles of models. This supports ensembles of binary- or npz-format models, as well as mixtures of both. When all models in the ensemble are in binary format, the load-from-memory path is used; otherwise, they are loaded via the file system. Enable log-level debug for output related to this.
* Fix formatting
* Fix WASM bindings for MemoryBundle

  For now, this does not support ensembles.
* Remove shared_ptr wrapping the AlignedMemory of models.
* Fix formatting
2024-11-20 17:40:06 +03:00 · 2023-08-01 19:35:11 +01:00 · 2023-08-01 19:35:11 +01:00 · 4b0da8d434
commit 4b0da8d434
parent 8011f9c849
5 changed files with 45 additions and 31 deletions
--- a/src/translator/byte_array_util.cpp
+++ b/src/translator/byte_array_util.cpp
@ -91,21 +91,24 @@ AlignedMemory loadFileToMemory(const std::string& path, size_t alignment) {
  return alignedMemory;
 }

-AlignedMemory getModelMemoryFromConfig(marian::Ptr<marian::Options> options) {
+std::vector<AlignedMemory> getModelMemoryFromConfig(marian::Ptr<marian::Options> options) {
  auto models = options->get<std::vector<std::string>>("models");
-  ABORT_IF(models.size() != 1, "Loading multiple binary models is not supported for now as it is not necessary.");

-  // If binary model we load into aligned memory. If .npz we leave it be to
-  // return empty aligned memory, thus allowing traditional file system loads.
-  if (marian::io::isBin(models[0])) {
-    AlignedMemory alignedMemory = loadFileToMemory(models[0], 256);
-    return alignedMemory;
-  } else if (marian::io::isNpz(models[0])) {
-    return AlignedMemory();
-  } else {
-    ABORT("Unknown extension for model: {}, should be one of `.bin` or `.npz`", models[0]);
+  std::vector<AlignedMemory> modelMemories(models.size());
+  for (size_t i = 0; i < models.size(); ++i) {
+    const auto model = models[i];
+    if (marian::io::isBin(model)) {
+      modelMemories[i] = loadFileToMemory(model, 256);
+    } else if (marian::io::isNpz(model)) {
+      // if any of the models are npz format, we revert to loading from file for all models.
+      LOG(debug, "Encountered an npz file {}; will use file loading for {} models", model, models.size());
+      return {};
+    } else {
+      ABORT("Unknown extension for model: {}, should be one of `.bin` or `.npz`", model);
+    }
  }
-  return AlignedMemory();
+
+  return modelMemories;
 }

 AlignedMemory getShortlistMemoryFromConfig(marian::Ptr<marian::Options> options) {
@ -153,7 +156,7 @@ AlignedMemory getQualityEstimatorModel(MemoryBundle& memoryBundle, const marian:

 MemoryBundle getMemoryBundleFromConfig(marian::Ptr<marian::Options> options) {
  MemoryBundle memoryBundle;
-  memoryBundle.model = getModelMemoryFromConfig(options);
+  memoryBundle.models = getModelMemoryFromConfig(options);
  memoryBundle.shortlist = getShortlistMemoryFromConfig(options);
  getVocabsMemoryFromConfig(options, memoryBundle.vocabs);
  memoryBundle.ssplitPrefixFile = getSsplitPrefixFileMemoryFromConfig(options);
--- a/src/translator/byte_array_util.h
+++ b/src/translator/byte_array_util.h
@ -5,7 +5,7 @@ namespace marian {
 namespace bergamot {

 AlignedMemory loadFileToMemory(const std::string& path, size_t alignment);
-AlignedMemory getModelMemoryFromConfig(marian::Ptr<marian::Options> options);
+std::vector<AlignedMemory> getModelMemoryFromConfig(marian::Ptr<marian::Options> options);
 AlignedMemory getQualityEstimatorModel(const marian::Ptr<marian::Options>& options);
 AlignedMemory getQualityEstimatorModel(MemoryBundle& memoryBundle, const marian::Ptr<marian::Options>& options);
 AlignedMemory getShortlistMemoryFromConfig(marian::Ptr<marian::Options> options);
--- a/src/translator/definitions.h
+++ b/src/translator/definitions.h
@ -19,8 +19,8 @@ typedef AlignedVector<char> AlignedMemory;
 /// Memory bundle for all byte-arrays.
 /// Can be a set/subset of model, shortlist, vocabs and ssplitPrefixFile bytes.
 struct MemoryBundle {
-  AlignedMemory model{};      ///< Byte-array of model (aligned to 256)
-  AlignedMemory shortlist{};  ///< Byte-array of shortlist (aligned to 64)
+  std::vector<AlignedMemory> models{};  ///< Byte-array of model (each element is aligned to 256)
+  AlignedMemory shortlist{};            ///< Byte-array of shortlist (aligned to 64)

  /// Vector of vocabulary memories (aligned to 64).
  /// If two vocabularies are the same (based on the filenames), two entries (shared
--- a/src/translator/translation_model.cpp
+++ b/src/translator/translation_model.cpp
@ -61,24 +61,35 @@ void TranslationModel::loadBackend(size_t idx) {
  graph->getBackend()->configureDevice(options_);
  graph->reserveWorkspaceMB(options_->get<size_t>("workspace"));

-  // Marian Model: Load from memoryBundle or shortList
-  if (memory_.model.size() > 0 &&
-      memory_.model.begin() !=
-          nullptr) {  // If we have provided a byte array that contains the model memory, we can initialise the
-                      // model from there, as opposed to from reading in the config file
-    ABORT_IF((uintptr_t)memory_.model.begin() % 256 != 0,
-             "The provided memory is not aligned to 256 bytes and will crash when vector instructions are used on it.");
-    if (options_->get<bool>("check-bytearray", false)) {
-      ABORT_IF(!validateBinaryModel(memory_.model, memory_.model.size()),
-               "The binary file is invalid. Incomplete or corrupted download?");
-    }
-    const std::vector<const void *> container = {
-        memory_.model.begin()};  // Marian supports multiple models initialised in this manner hence std::vector.
-                                 // However we will only ever use 1 during decoding.
+  // if memory_.models is populated, then all models were of binary format
+  if (memory_.models.size() >= 1) {
+    const std::vector<const void *> container = std::invoke([&]() {
+      std::vector<const void *> model_ptrs(memory_.models.size());
+      for (size_t i = 0; i < memory_.models.size(); ++i) {
+        const AlignedMemory &model = memory_.models[i];
+
+        ABORT_IF(model.size() == 0 || model.begin() == nullptr, "The provided memory is empty. Cannot load the model.");
+        ABORT_IF(
+            (uintptr_t)model.begin() % 256 != 0,
+            "The provided memory is not aligned to 256 bytes and will crash when vector instructions are used on it.");
+        if (options_->get<bool>("check-bytearray", false)) {
+          ABORT_IF(!validateBinaryModel(model, model.size()),
+                   "The binary file is invalid. Incomplete or corrupted download?");
+        }
+
+        model_ptrs[i] = model.begin();
+        LOG(debug, "Loaded model {} of {} from memory", (i + 1), model_ptrs.size());
+      }
+      return model_ptrs;
+    });
+
    scorerEnsemble = createScorers(options_, container);
  } else {
+    // load npz format models, or a mixture of binary/npz formats
    scorerEnsemble = createScorers(options_);
+    LOG(debug, "Loaded {} model(s) from file", scorerEnsemble.size());
  }
+
  for (auto scorer : scorerEnsemble) {
    scorer->init(graph);
    if (shortlistGenerator_) {
--- a/wasm/bindings/service_bindings.cpp
+++ b/wasm/bindings/service_bindings.cpp
@ -48,7 +48,7 @@ MemoryBundle prepareMemoryBundle(AlignedMemory* modelMemory, AlignedMemory* shor
                                 std::vector<AlignedMemory*> uniqueVocabsMemories,
                                 AlignedMemory* qualityEstimatorMemory) {
  MemoryBundle memoryBundle;
-  memoryBundle.model = std::move(*modelMemory);
+  memoryBundle.models.emplace_back(std::move(*modelMemory));
  memoryBundle.shortlist = std::move(*shortlistMemory);
  memoryBundle.vocabs = std::move(prepareVocabsSmartMemories(uniqueVocabsMemories));
  if (qualityEstimatorMemory != nullptr) {