From e0b9bad0581963f4ae1f953f1b52de5619672384 Mon Sep 17 00:00:00 2001
From: Abhishek Aggarwal <aaggarwal@mozilla.com>
Date: Wed, 12 May 2021 14:39:23 +0200
Subject: [PATCH 01/12] Updated wasm README to update for passing vocabs as
 bytes

 - Updated Using JS APIs section to pass vocabs as bytes
---
 wasm/README.md | 25 +++++++++++++++++--------
 1 file changed, 17 insertions(+), 8 deletions(-)
diff --git a/wasm/README.md b/wasm/README.md
index 337ae1b..eceac15 100644
--- a/wasm/README.md
+++ b/wasm/README.md
@@ -7,8 +7,8 @@ Please note that [Using JS APIs](#Using-JS-APIs) and [Demo](#Demo) section below
 
 ```bash
 cd test_page
-mkdir models
 git clone --depth 1 --branch main --single-branch https://github.com/mozilla-applied-ml/bergamot-models
+mkdir models
 cp -rf bergamot-models/prod/* models
 gunzip models/*/*
 ```
@@ -18,10 +18,7 @@ gunzip models/*/*
 ```js
 // The model configuration as YAML formatted string. For available configuration options, please check: https://marian-nmt.github.io/docs/cmd/marian-decoder/
 // This example captures some of the most relevant options
-const modelConfig = `vocabs:
-  - /esen/vocab.esen.spm
-  - /esen/vocab.esen.spm
-beam-size: 1
+const modelConfig = `beam-size: 1
 normalize: 1.0
 word-penalty: 0
 max-length-break: 128
@@ -35,19 +32,31 @@ quiet-translation: true
 gemm-precision: int8shift
 `;
 
-// Download model and shortlist files and read them into buffers
+// Download model, shortlist and vocabulary files and read them into buffers
 const modelFile = `models/esen/model.esen.intgemm.alphas.bin`;
 const shortlistFile = `models/esen/lex.50.50.esen.s2t.bin`;
-const downloadedBuffers = await Promise.all([downloadAsArrayBuffer(modelFile), downloadAsArrayBuffer(shortlistFile)]); // Please refer to bergamot.html in test_page folder for this function
+const vocabFiles = [`models/esen/vocab.esen.spm`,
+                    `models/esen/vocab.esen.spm`];
+const uniqueVocabFiles = new Set(vocabFiles);
+
+// Please refer to bergamot.html in test_page folder for downloadAsArrayBuffer function
+const downloadedBuffers = await Promise.all([downloadAsArrayBuffer(modelFile), downloadAsArrayBuffer(shortlistFile)]);
+const downloadedVocabBuffers = [];
+for (let item of uniqueVocabFiles.values()) {
+  downloadedVocabBuffers.push(await downloadAsArrayBuffer(item));
+}
+
 const modelBuffer = downloadedBuffers[0];
 const shortListBuffer = downloadedBuffers[1];
 
 // Construct AlignedMemory instances from the buffers
 var alignedModelMemory = constructAlignedMemoryFromBuffer(modelBuffer, 256); // Please refer to bergamot.html in test_page folder for this function
 var alignedShortlistMemory = constructAlignedMemoryFromBuffer(shortListBuffer, 64); // Please refer to bergamot.html in test_page folder for this function
+var alignedVocabsMemoryList = new Module.AlignedMemoryList;
+downloadedVocabBuffers.forEach(item => alignedVocabsMemoryList.push_back(constructAlignedMemoryFromBuffer(item, 64)));
 
 // Instantiate the TranslationModel
-const model = new Module.TranslationModel(modelConfig, alignedModelMemory, alignedShortlistMemory);
+const model = new Module.TranslationModel(modelConfig, alignedModelMemory, alignedShortlistMemory, alignedVocabsMemoryList);
 
 // Instantiate the arguments of translate() API i.e. TranslationRequest and input (vector<string>)
 const request = new Module.TranslationRequest();

From 0189500160eaac8ae7e8a7b8d03ae91c98063684 Mon Sep 17 00:00:00 2001
From: Abhishek Aggarwal <aaggarwal@mozilla.com>
Date: Wed, 12 May 2021 14:44:33 +0200
Subject: [PATCH 02/12] Updated README to remove packaging steps for wasm
 compilation

 - We don't need to package model, shortlist or vocab files into wasm
   binary at build time
---
 README.md | 94 +++++++++++++++++--------------------------------------
 1 file changed, 28 insertions(+), 66 deletions(-)

diff --git a/README.md b/README.md
index 456af70..f48c981 100644
--- a/README.md
+++ b/README.md
@@ -5,85 +5,47 @@ Bergamot translator provides a unified API for ([Marian NMT](https://marian-nmt.
 ## Build Instructions
 
 ### Build Natively
-1. Clone the repository using these instructions:
-    ```bash
-    git clone https://github.com/browsermt/bergamot-translator
-    cd bergamot-translator
-    ```
-2. Compile
+Create a folder where you want to build all the artifacts (`build-native` in this case) and compile
 
-    Create a folder where you want to build all the artifacts (`build-native` in this case) and compile in that folder
-    ```bash
-    mkdir build-native
-    cd build-native
-    cmake ../
-    make -j
-    ```
+```bash
+mkdir build-native
+cd build-native
+cmake ../
+make -j3
+```
 
 ### Build WASM
-#### Compiling for the first time
+#### Prerequisite
 
-1. Download and Install Emscripten using following instructions
-    * Get the latest sdk: `git clone https://github.com/emscripten-core/emsdk.git`
-    * Enter the cloned directory: `cd emsdk`
-    * Install the lastest sdk tools: `./emsdk install latest`
-    * Activate the latest sdk tools: `./emsdk activate latest`
-    * Activate path variables: `source ./emsdk_env.sh`
+Building for wasm requires the Emscripten toolchain. It can be downloaded and installed using the following instructions:
 
-2. Clone the repository using these instructions:
+* Get the latest sdk: `git clone https://github.com/emscripten-core/emsdk.git`
+* Enter the cloned directory: `cd emsdk`
+* Install the latest sdk tools: `./emsdk install latest`
+* Activate the latest sdk tools: `./emsdk activate latest`
+* Activate path variables: `source ./emsdk_env.sh`
+
+#### <a name="Compile"></a> Compile
+
+1. Create a folder where you want to build all the artifacts (`build-wasm` in this case) and compile
     ```bash
-    git clone https://github.com/browsermt/bergamot-translator
-    cd bergamot-translator
+    mkdir build-wasm
+    cd build-wasm
+    emcmake cmake -DCOMPILE_WASM=on ../
+    emmake make -j3
     ```
 
-3. Download files (only required if you want to perform inference using build artifacts)
+    The wasm artifacts (.js and .wasm files) will be available in the build directory ("build-wasm" in this case).
 
-    It packages the vocabulary files into wasm binary, which is required only if you want to perform inference.
-    The compilation commands will preload these files in Emscripten’s virtual file system.
-
-    If you want to package bergamot project specific files, please follow these instructions:
+2. Enable SIMD Wormhole via Wasm instantiation API in generated artifacts
     ```bash
-    git clone --depth 1 --branch main --single-branch https://github.com/mozilla-applied-ml/bergamot-models
-    mkdir models
-    cp -rf bergamot-models/prod/* models
-    gunzip models/*/*
-    find models \( -type f -name "model*" -or -type f -name "lex*" \) -delete
+    bash ../wasm/patch-artifacts-enable-wormhole.sh
     ```
 
-4. Compile
-    1. Create a folder where you want to build all the artefacts (`build-wasm` in this case)
-        ```bash
-        mkdir build-wasm
-        cd build-wasm
-        ```
-
-    2. Compile the artefacts
-        * If you want to package files into wasm binary then execute following commands (Replace `FILES_TO_PACKAGE` with the
-        directory containing all the files to be packaged)
-
-            ```bash
-            emcmake cmake -DCOMPILE_WASM=on -DPACKAGE_DIR=FILES_TO_PACKAGE ../
-            emmake make -j
-            ```
-            e.g. If you want to package bergamot project specific files (downloaded using step 3 above) then
-            replace `FILES_TO_PACKAGE` with `../models`
-
-        * If you don't want to package any file into wasm binary then execute following commands:
-            ```bash
-            emcmake cmake -DCOMPILE_WASM=on ../
-            emmake make -j
-            ```
-
-        The wasm artifacts (.js and .wasm files) will be available in the build directory ("build-wasm" in this case).
-
-    3. Enable SIMD Wormhole via Wasm instantiation API in generated artifacts
-        ```bash
-        bash ../wasm/patch-artifacts-enable-wormhole.sh
-        ```
-
 #### Recompiling
-As long as you don't update any submodule, just follow steps in `4.ii` and `4.iii` to recompile.\
-If you update a submodule, execute following command before executing steps in `4.ii` and `4.iii` to recompile.
+As long as you don't update any submodule, just follow [Compile](#Compile) steps.\
+If you update a submodule, execute the following command in the repository root folder before executing
+the [Compile](#Compile) steps.
 ```bash
 git submodule update --init --recursive
 ```

From 6c063c607ee5a5ffc00a2fe64a6c32164699ceab Mon Sep 17 00:00:00 2001
From: Abhishek Aggarwal <aaggarwal@mozilla.com>
Date: Wed, 12 May 2021 14:46:02 +0200
Subject: [PATCH 03/12] Updated CMakeLists.txt to remove packaging steps for
 wasm compilation

 - Removed PACKAGE_DIR cmake option
 - Removed Workerfs, FORCE_FILESYSTEM=1 in wasm builds
   -- File system support is not needed any more (since model,
     shortlist and vocabs are being passed as bytes now)
---
 CMakeLists.txt      | 2 --
 wasm/CMakeLists.txt | 9 +--------
 2 files changed, 1 insertion(+), 10 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index ef64863..332aed1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -42,8 +42,6 @@ option(COMPILE_WASM "Compile for WASM" OFF)
 cmake_dependent_option(USE_WASM_COMPATIBLE_SOURCE "Use wasm compatible sources" OFF "NOT COMPILE_WASM" ON)
 option(COMPILE_TESTS "Compile bergamot-tests" OFF)
 
-SET(PACKAGE_DIR "" CACHE STRING "Directory including all the files to be packaged (pre-loaded) in wasm builds")
-
 # Set 3rd party submodule specific cmake options for this project
 SET(COMPILE_CUDA OFF CACHE BOOL "Compile GPU version")
 SET(USE_SENTENCEPIECE ON CACHE BOOL "Download and compile SentencePiece")
diff --git a/wasm/CMakeLists.txt b/wasm/CMakeLists.txt
index 4962da0..7feef75 100644
--- a/wasm/CMakeLists.txt
+++ b/wasm/CMakeLists.txt
@@ -14,14 +14,7 @@ target_include_directories(bergamot-translator-worker
 target_compile_definitions(bergamot-translator-worker PRIVATE WASM_BINDINGS)
 target_compile_options(bergamot-translator-worker PRIVATE ${WASM_COMPILE_FLAGS})
 
-set(LINKER_FLAGS "-g2 --bind -s ASSERTIONS=0 -s DISABLE_EXCEPTION_CATCHING=1 -s FORCE_FILESYSTEM=1 -s ALLOW_MEMORY_GROWTH=1 -s NO_DYNAMIC_EXECUTION=1 -s EXPORTED_RUNTIME_METHODS=[addOnPreMain]")
-if (NOT PACKAGE_DIR STREQUAL "")
-  get_filename_component(REALPATH_PACKAGE_DIR ${PACKAGE_DIR} REALPATH BASE_DIR ${CMAKE_BINARY_DIR})
-  set(LINKER_FLAGS "${LINKER_FLAGS} --preload-file ${REALPATH_PACKAGE_DIR}@/")
-endif()
-
-# Enable worker file system
-set(LINKER_FLAGS "${LINKER_FLAGS} -lworkerfs.js")
+set(LINKER_FLAGS "-g2 --bind -s ASSERTIONS=0 -s DISABLE_EXCEPTION_CATCHING=1 -s ALLOW_MEMORY_GROWTH=1 -s NO_DYNAMIC_EXECUTION=1 -s EXPORTED_RUNTIME_METHODS=[addOnPreMain]")
 
 # Avoid node.js-code in emscripten glue-code
 set(LINKER_FLAGS "${LINKER_FLAGS} -s ENVIRONMENT=web,worker")

From 6c7e6156ab2816310d90e5f4d9489f96f10decb2 Mon Sep 17 00:00:00 2001
From: Qianqian Zhu <qianqian.zhu@hotmail.com>
Date: Thu, 13 May 2021 13:18:08 +0100
Subject: [PATCH 04/12] Bundle AlignedMemory inputs with MemoryBundle (#147)

---
 app/service-cli.cpp                        | 12 ++---
 src/translator/byte_array_util.cpp         |  8 ++++
 src/translator/byte_array_util.h           |  1 +
 src/translator/definitions.h               | 36 ++++++++++++++
 src/translator/service.cpp                 |  9 ++--
 src/translator/service.h                   | 56 +++++++---------------
 wasm/bindings/TranslationModelBindings.cpp | 16 +++++--
 7 files changed, 81 insertions(+), 57 deletions(-)

diff --git a/app/service-cli.cpp b/app/service-cli.cpp
index 0e958d6..fbf0131 100644
--- a/app/service-cli.cpp
+++ b/app/service-cli.cpp
@@ -16,19 +16,15 @@ int main(int argc, char *argv[]) {
   auto cp = marian::bergamot::createConfigParser();
   auto options = cp.parseOptions(argc, argv, true);
 
-  // Prepare memories for model and shortlist
-  marian::bergamot::AlignedMemory modelBytes, shortlistBytes;
-  std::vector<std::shared_ptr<marian::bergamot::AlignedMemory>> vocabsBytes;
+  // Prepare memories for bytearrays (including model, shortlist and vocabs)
+  marian::bergamot::MemoryBundle memoryBundle;
 
   if (options->get<bool>("check-bytearray")) {
     // Load legit values into bytearrays.
-    modelBytes = marian::bergamot::getModelMemoryFromConfig(options);
-    shortlistBytes = marian::bergamot::getShortlistMemoryFromConfig(options);
-    marian::bergamot::getVocabsMemoryFromConfig(options, vocabsBytes);
+    memoryBundle = marian::bergamot::getMemoryBundleFromConfig(options);
   }
 
-  marian::bergamot::Service service(options, std::move(modelBytes),
-                                    std::move(shortlistBytes), std::move(vocabsBytes));
+  marian::bergamot::Service service(options, std::move(memoryBundle));
 
   // Read a large input text blob from stdin
   std::ostringstream std_input;
diff --git a/src/translator/byte_array_util.cpp b/src/translator/byte_array_util.cpp
index 00beaa6..69564d2 100644
--- a/src/translator/byte_array_util.cpp
+++ b/src/translator/byte_array_util.cpp
@@ -117,5 +117,13 @@ void getVocabsMemoryFromConfig(marian::Ptr<marian::Options> options,
   }
 }
 
+MemoryBundle getMemoryBundleFromConfig(marian::Ptr<marian::Options> options) {  // gathers model, shortlist and vocabs
+  MemoryBundle bundle;
+  bundle.model = getModelMemoryFromConfig(options);
+  bundle.shortlist = getShortlistMemoryFromConfig(options);
+  getVocabsMemoryFromConfig(options, bundle.vocabs);  // out-parameter: taken by non-const reference
+  return bundle;
+}
+
 } // namespace bergamot
 } // namespace marian
diff --git a/src/translator/byte_array_util.h b/src/translator/byte_array_util.h
index 3cbf3d3..14c79b3 100644
--- a/src/translator/byte_array_util.h
+++ b/src/translator/byte_array_util.h
@@ -10,5 +10,6 @@ AlignedMemory getShortlistMemoryFromConfig(marian::Ptr<marian::Options> options)
 void getVocabsMemoryFromConfig(marian::Ptr<marian::Options> options,
                                std::vector<std::shared_ptr<AlignedMemory>>& vocabMemories);
 bool validateBinaryModel(const AlignedMemory& model, uint64_t fileSize);
+MemoryBundle getMemoryBundleFromConfig(marian::Ptr<marian::Options> options);
 } // namespace bergamot
 } // namespace marian
diff --git a/src/translator/definitions.h b/src/translator/definitions.h
index 58fd4b3..175397d 100644
--- a/src/translator/definitions.h
+++ b/src/translator/definitions.h
@@ -15,6 +15,42 @@ typedef std::vector<Segment> Segments;
 /// Shortcut to AlignedVector<char> for byte arrays
 typedef AlignedVector<char> AlignedMemory;
 
+/// Memory bundle for all byte-arrays.
+/// Can be a set/subset of model, shortlist, vocabs and ssplitPrefixFile bytes.
+struct MemoryBundle {
+  AlignedMemory model;  ///< Byte-array of model (aligned to 256)
+  AlignedMemory shortlist;  ///< Byte-array of shortlist (aligned to 64)
+  /// Vector of vocabulary memories (aligned to 64).
+  /// If two vocabularies are the same (based on the filenames), two entries (shared
+  /// pointers) will be generated which share the same AlignedMemory object.
+  std::vector<std::shared_ptr<AlignedMemory>> vocabs;
+  /// @todo Not implemented yet
+  AlignedMemory ssplitPrefixFile;
+
+  MemoryBundle() = default;
+
+  /// Move constructor: takes over every byte-array held by `from` (including its vocabs).
+  MemoryBundle(MemoryBundle &&from){
+    model = std::move(from.model);
+    shortlist = std::move(from.shortlist);
+    vocabs = std::move(from.vocabs);
+    ssplitPrefixFile = std::move(from.ssplitPrefixFile);
+  }
+
+  /// Move assignment: takes over every byte-array held by `from` (including its vocabs).
+  MemoryBundle &operator=(MemoryBundle &&from) {
+    model = std::move(from.model);
+    shortlist = std::move(from.shortlist);
+    vocabs = std::move(from.vocabs);
+    ssplitPrefixFile = std::move(from.ssplitPrefixFile);
+    return *this;
+  }
+
+  // Move-only type: copy construction and copy assignment are deleted.
+  MemoryBundle(const MemoryBundle&) = delete;
+  MemoryBundle& operator=(const MemoryBundle&) = delete;
+};
+
 } // namespace bergamot
 } // namespace marian
 
diff --git a/src/translator/service.cpp b/src/translator/service.cpp
index 385a2a5..16c4743 100644
--- a/src/translator/service.cpp
+++ b/src/translator/service.cpp
@@ -41,14 +41,13 @@ loadVocabularies(marian::Ptr<marian::Options> options,
 namespace marian {
 namespace bergamot {
 
-Service::Service(Ptr<Options> options, AlignedMemory modelMemory, AlignedMemory shortlistMemory,
-                 std::vector<std::shared_ptr<AlignedMemory>> vocabMemories)
+Service::Service(Ptr<Options> options, MemoryBundle memoryBundle)
     : requestId_(0), options_(options),
-      vocabs_(std::move(loadVocabularies(options, std::move(vocabMemories)))),
+      vocabs_(std::move(loadVocabularies(options, std::move(memoryBundle.vocabs)))),
       text_processor_(vocabs_, options), batcher_(options),
       numWorkers_(options->get<int>("cpu-threads")),
-      modelMemory_(std::move(modelMemory)),
-      shortlistMemory_(std::move(shortlistMemory))
+      modelMemory_(std::move(memoryBundle.model)),
+      shortlistMemory_(std::move(memoryBundle.shortlist))
 #ifndef WASM_COMPATIBLE_SOURCE
       // 0 elements in PCQueue is illegal and can lead to failures. Adding a
       // guard to have at least one entry allocated. In the single-threaded
diff --git a/src/translator/service.h b/src/translator/service.h
index 721d436..9d0a67d 100644
--- a/src/translator/service.h
+++ b/src/translator/service.h
@@ -55,53 +55,29 @@ namespace bergamot {
 /// // Do things with response.
 /// ```
 ///
-/// Optionally Service can be initialized by also passing model memory for
-/// purposes of efficiency (which defaults to nullpointer and then reads from
+/// Optionally Service can be initialized by also passing bytearray memories
+/// for purposes of efficiency (which defaults to empty and then reads from
 /// file supplied through config).
 ///
 class Service {
 
 public:
+  /// Construct Service from Marian options. If memoryBundle is empty, Service is
+  /// initialized from file-based loading. Otherwise, Service is initialized from
+  /// the given bytearray memories.
   /// @param options Marian options object
-  /// @param modelMemory byte array (aligned to 256!!!) that contains the bytes
-  /// of a model.bin.
-  /// @param shortlistMemory byte array of shortlist (aligned to 64)
-  /// @param vocabMemories vector of vocabulary memories (aligned to 64)
-  explicit Service(Ptr<Options> options, AlignedMemory modelMemory,
-                   AlignedMemory shortlistMemory,
-                   std::vector<std::shared_ptr<AlignedMemory>> vocabMemories);
+  /// @param memoryBundle holds all byte-array memories. Can be a set/subset of
+  /// model, shortlist, vocabs and ssplitPrefixFile bytes. Optional.
+  explicit Service(Ptr<Options> options, MemoryBundle memoryBundle={});
 
-  /// Construct Service purely from Options. This expects options which
-  /// marian-decoder expects to be set for loading model shortlist and
-  /// vocabularies from files in addition to parameters that set unset desired
-  /// features (e.g: alignments, quality-scores).
-  ///
-  /// This is equivalent to a call to:
-  /// ```cpp
-  ///    Service(options, AlignedMemory(), AlignedMemory(), {})
-  /// ```
-  /// wherein empty memory is passed and internal flow defaults to file-based
-  /// model, shortlist loading. AlignedMemory() corresponds to empty memory
-  explicit Service(Ptr<Options> options)
-      : Service(options, AlignedMemory(), AlignedMemory(), {}) {}
-
-  /// Construct Service from a string configuration.
-  /// @param [in] config string parsable as YAML expected to adhere with marian
-  /// config
-  /// @param [in] modelMemory byte array (aligned to 256!!!) that contains the
-  /// bytes of a model.bin. Optional. AlignedMemory() corresponds to empty memory
-  /// @param [in] shortlistMemory byte array of shortlist (aligned to 64). Optional.
-  /// @param [in] vocabMemories vector of vocabulary memories (aligned to 64). Optional.
-  /// If two vocabularies are the same (based on the filenames), two entries (shared
-  /// pointers) will be generated which share the same AlignedMemory object.
-  explicit Service(const std::string &config,
-                   AlignedMemory modelMemory = AlignedMemory(),
-                   AlignedMemory shortlistMemory = AlignedMemory(),
-                   std::vector<std::shared_ptr<AlignedMemory>> vocabsMemories = {})
-      : Service(parseOptions(config, /*validate=*/false),
-                std::move(modelMemory),
-                std::move(shortlistMemory),
-                std::move(vocabsMemories)) {}
+  /// Construct Service from a string configuration. If memoryBundle is empty, Service is
+  /// initialized from file-based loading. Otherwise, Service is initialized from
+  /// the given bytearray memories.
+  /// @param [in] config string parsable as YAML expected to adhere with marian config
+  /// @param [in] memoryBundle holds all byte-array memories. Can be a set/subset of
+  /// model, shortlist, vocabs and ssplitPrefixFile bytes. Optional.
+  explicit Service(const std::string &config, MemoryBundle memoryBundle={})
+      : Service(parseOptions(config, /*validate=*/false), std::move(memoryBundle)) {}
 
   /// Explicit destructor to clean up after any threads initialized in
   /// asynchronous operation mode.
diff --git a/wasm/bindings/TranslationModelBindings.cpp b/wasm/bindings/TranslationModelBindings.cpp
index 4ee9265..1db7401 100644
--- a/wasm/bindings/TranslationModelBindings.cpp
+++ b/wasm/bindings/TranslationModelBindings.cpp
@@ -48,14 +48,22 @@ std::vector<std::shared_ptr<AlignedMemory>> prepareVocabsSmartMemories(std::vect
   return vocabsSmartMemories;
 }
 
+/// Packs the given memories into one MemoryBundle; *modelMemory and *shortlistMemory are moved from.
+marian::bergamot::MemoryBundle prepareMemoryBundle(AlignedMemory* modelMemory,
+                                                   AlignedMemory* shortlistMemory,
+                                                   std::vector<AlignedMemory*> uniqueVocabsMemories){
+  marian::bergamot::MemoryBundle memoryBundle;
+  memoryBundle.model = std::move(*modelMemory);
+  memoryBundle.shortlist = std::move(*shortlistMemory);
+  memoryBundle.vocabs = prepareVocabsSmartMemories(uniqueVocabsMemories);  // call result is already an rvalue
+  return memoryBundle;
+}
+
 TranslationModel* TranslationModelFactory(const std::string &config,
                                           AlignedMemory* modelMemory,
                                           AlignedMemory* shortlistMemory,
                                           std::vector<AlignedMemory*> uniqueVocabsMemories) {
-  return new TranslationModel(config,
-                              std::move(*modelMemory),
-                              std::move(*shortlistMemory),
-                              std::move(prepareVocabsSmartMemories(uniqueVocabsMemories)));
+  return new TranslationModel(config, prepareMemoryBundle(modelMemory, shortlistMemory, uniqueVocabsMemories));  // temporary: no std::move
 }
 
 EMSCRIPTEN_BINDINGS(translation_model) {

From 77424a3df155a03c61338287a11b3c6c28815681 Mon Sep 17 00:00:00 2001
From: Jerin Philip <jphilip@ed.ac.uk>
Date: Mon, 17 May 2021 11:42:47 +0100
Subject: [PATCH 05/12] Enabling ccache on github builds for Ubuntu (#95)

* CI Changes to add tiny regression tests

* Adding an inspect cache step

* Removing ccache, pursue in another

* Incorporating Nick's changes through submodule merge

* Submodule now points to master

* Restoring ccache enabled workflow file

* Restoring ccache enabled CMakeLists

* cache -> ccache typo fix

* Moving CCACHE setup to GitHub runner file

* Find also uses CCACHE dir

* Updating CMakeLists not to override env

* Cache compiler binary's contents

* Changing a few names to trigger new build; Testing cache looks fun

* USE_CCACHE=on, -L for inspection

* Adding a ccache_cmd, but will only use in next commit

* Using ccache_cmd

* Removing "

* Adding compiler hash script

* Bunch of absolute paths

* GITHUB_WORKSPACE typo

* Nah, I'll keep -L and trigger another build

* Trying something with compiler hash on cache key backup as well

* builtin, bash it seems

* Empty commit #1

* Move ccache stats to after compile

* Reshuffling ccache vars

* No comments

* Updates to Github output set syntax

* Empty Commit 1

* Empty Commit 2

* Empty commit 3

* /bin/bash -> bash; ccache_cmd for consistency

* Adding ccache -s before and after build

* Adding comments to compiler-hash script

* Let's build cached and non-cached variants together for comparison

* Fixing quotes, /bin/bash -> bash

* Minor var/env adjustment

* Adding ccache -z before the job

* Reverting CMakeLists.txt without CCACHE

* Switching to CMAKE_LANG_COMPILER_LAUNCHER instead of CMakeLists.txt rule

* 5G -> 1G cache size

* 1G -> 2G; Hyperparameter tuning
---
 .github/workflows/native-ubuntu.yml | 89 ++++++++++++++++++++++++++++-
 scripts/ci/compiler-hash.sh         | 35 ++++++++++++
 2 files changed, 122 insertions(+), 2 deletions(-)
 create mode 100644 scripts/ci/compiler-hash.sh

diff --git a/.github/workflows/native-ubuntu.yml b/.github/workflows/native-ubuntu.yml
index dc8016b..563daf7 100644
--- a/.github/workflows/native-ubuntu.yml
+++ b/.github/workflows/native-ubuntu.yml
@@ -15,6 +15,8 @@ jobs:
           - name: "full-marian"
             os: ubuntu-latest
             gcc: 8
+            force_recache: false
+            ccache_cmd: "bash ${GITHUB_WORKSPACE}/scripts/ci/compiler-hash.sh %compiler%"
             cpu: 'ON'
             gpu: 'OFF'
             test_tags: ""
@@ -24,10 +26,14 @@ jobs:
               USE_WASM_COMPATIBLE_SOURCE: "OFF"
               COMPILE_SERVER: "OFF"
               COMPILE_EXAMPLES: "OFF"
+              CMAKE_C_COMPILER_LAUNCHER: "ccache"
+              CMAKE_CXX_COMPILER_LAUNCHER: "ccache"
 
           - name: "minimal-marian"
             os: ubuntu-latest
             gcc: 8
+            force_recache: false
+            ccache_cmd: "bash ${GITHUB_WORKSPACE}/scripts/ci/compiler-hash.sh %compiler%"
             cpu: 'ON'
             gpu: 'OFF'
             test_tags: "'#wasm'"
@@ -37,6 +43,42 @@ jobs:
               USE_WASM_COMPATIBLE_SOURCE: "ON"
               COMPILE_SERVER: "OFF"
               COMPILE_EXAMPLES: "OFF"
+              CMAKE_C_COMPILER_LAUNCHER: "ccache"
+              CMAKE_CXX_COMPILER_LAUNCHER: "ccache"
+
+          - name: "full-marian-force-recache"
+            os: ubuntu-latest
+            gcc: 8
+            force_recache: true
+            ccache_cmd: "bash ${GITHUB_WORKSPACE}/scripts/ci/compiler-hash.sh %compiler%"
+            cpu: 'ON'
+            gpu: 'OFF'
+            test_tags: ""
+            cmake: 
+              CMAKE_BUILD_TYPE: "Release"
+              COMPILE_TESTS: "ON"
+              USE_WASM_COMPATIBLE_SOURCE: "OFF"
+              COMPILE_SERVER: "OFF"
+              COMPILE_EXAMPLES: "OFF"
+              CMAKE_C_COMPILER_LAUNCHER: "ccache"
+              CMAKE_CXX_COMPILER_LAUNCHER: "ccache"
+
+          - name: "minimal-marian-force-recache"
+            os: ubuntu-latest
+            gcc: 8
+            force_recache: true
+            ccache_cmd: "bash ${GITHUB_WORKSPACE}/scripts/ci/compiler-hash.sh %compiler%"
+            cpu: 'ON'
+            gpu: 'OFF'
+            test_tags: "'#wasm'"
+            cmake:
+              CMAKE_BUILD_TYPE: "Release"
+              COMPILE_TESTS: "OFF" # Minimal marian has no sqlite support and COMPILE_TEST=ON fails.
+              USE_WASM_COMPATIBLE_SOURCE: "ON"
+              COMPILE_SERVER: "OFF"
+              COMPILE_EXAMPLES: "OFF"
+              CMAKE_C_COMPILER_LAUNCHER: "ccache"
+              CMAKE_CXX_COMPILER_LAUNCHER: "ccache"
 
 
     runs-on: ${{ matrix.os }}
@@ -57,7 +99,7 @@ jobs:
         sudo apt-get update 
         sudo apt-get install -y \
             libgoogle-perftools-dev libprotobuf-dev protobuf-compiler  \
-            libboost-all-dev g++-${{ matrix.gcc }} 
+            libboost-all-dev g++-${{ matrix.gcc }} ccache
 
     # https://software.intel.com/content/www/us/en/develop/articles/installing-intel-free-libs-and-python-apt-repo.html
     - name: Install MKL
@@ -68,6 +110,42 @@ jobs:
         sudo apt-get install -y --no-install-recommends intel-mkl-64bit-2020.0-088
       if: matrix.cmake.USE_WASM_COMPATIBLE_SOURCE == 'OFF'
 
+    - name: Generate ccache_vars  # hash and timestamp outputs are used in the actions/cache key
+      id: ccache_vars
+      shell: bash
+      run: |  # pass the real compiler: the %compiler% placeholder is substituted only by ccache itself
+          echo "::set-output name=hash::$(bash ${GITHUB_WORKSPACE}/scripts/ci/compiler-hash.sh g++-${{ matrix.gcc }})"
+          echo "::set-output name=timestamp::$(date '+%Y-%m-%dT%H.%M.%S')"
+
+    - name: Setup ccache environment variables
+      run: | 
+        echo "CCACHE_COMPILERCHECK=${{ matrix.ccache_cmd }}" >> $GITHUB_ENV 
+        echo "CCACHE_BASE_DIR=${{ github.workspace }}" >> $GITHUB_ENV
+        echo "CCACHE_DIR=${{ github.workspace }}/.ccache" >> $GITHUB_ENV
+        echo "CCACHE_COMPRESS=true" >> $GITHUB_ENV
+        echo "CCACHE_COMPRESSLEVEL=6" >> $GITHUB_ENV
+        echo "CCACHE_MAXSIZE=2G" >> $GITHUB_ENV
+
+    - name: Setup ccache recache on
+      run: |
+        echo "CCACHE_RECACHE=" >> $GITHUB_ENV 
+      if: matrix.force_recache == true
+
+    - name: Cache-op for build-cache through ccache 
+      uses: actions/cache@v2
+      with:
+        path: ${{ env.CCACHE_DIR }}
+        key: ccache-${{ matrix.name }}-${{ steps.ccache_vars.outputs.hash }}-${{ github.ref }}-${{ steps.ccache_vars.outputs.timestamp }}
+        restore-keys: |
+           ccache-${{ matrix.name }}-${{ steps.ccache_vars.outputs.hash }}-${{ github.ref }}- 
+           ccache-${{ matrix.name }}-${{ steps.ccache_vars.outputs.hash }}- 
+           ccache-${{ matrix.name }}- 
+
+    - name: Cache stats before build
+      run: |
+          ccache -s
+          ccache -z
+
     # Boost is installed on GitHub-hosted runners in a non-standard location
     # https://github.com/actions/virtual-environments/issues/687#issuecomment-610471671
     - name: Configure CMake
@@ -75,17 +153,24 @@ jobs:
         mkdir -p build
         cd build
         CC=/usr/bin/gcc-${{ matrix.gcc }} CXX=/usr/bin/g++-${{ matrix.gcc }} CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }} \
-        cmake .. \
+        cmake -L .. \
           -DCMAKE_BUILD_TYPE=${{ matrix.cmake.CMAKE_BUILD_TYPE }}\
           -DCOMPILE_TESTS=${{ matrix.cmake.COMPILE_TESTS }}\
           -DCOMPILE_EXAMPLES=${{ matrix.cmake.COMPILE_EXAMPLES }} \
           -DCOMPILE_SERVER=${{ matrix.cmake.COMPILE_SERVER }} \
           -DUSE_WASM_COMPATIBLE_SOURCE=${{ matrix.cmake.USE_WASM_COMPATIBLE_SOURCE }} \
+          -DCMAKE_C_COMPILER_LAUNCHER=${{ matrix.cmake.CMAKE_C_COMPILER_LAUNCHER}} \
+          -DCMAKE_CXX_COMPILER_LAUNCHER=${{ matrix.cmake.CMAKE_CXX_COMPILER_LAUNCHER}} 
+
 
     - name: Compile bergamot-translator
       working-directory: build
       run: make -j2
 
+    - name: Cache stats after build
+      run: |
+          ccache -s
+
     - name: Run unit tests
       working-directory: build
       run: make test
diff --git a/scripts/ci/compiler-hash.sh b/scripts/ci/compiler-hash.sh
new file mode 100644
index 0000000..a770dfd
--- /dev/null
+++ b/scripts/ci/compiler-hash.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+
+# Uses the command from https://stackoverflow.com/a/9355840/4565794.
+# -v displays the commands executed to run compilation. Of these, the cc1 line
+#    carries the additional flags triggered by -march=native, which we need.
+# -E stops after the preprocessing stage.
+
+# Output on a linux machine with gcc-8 looks as follows:
+
+# $ gcc -march=native -E -v - </dev/null 2>&1 | grep cc1
+#       /usr/lib/gcc/x86_64-linux-gnu/8/cc1 -E -quiet -v -imultiarch x86_64-linux-gnu
+#       - -march=skylake-avx512 -mmmx -mno-3dnow -msse -msse2 -msse3 -mssse3
+#       -mno-sse4a -mcx16 -msahf -mmovbe -maes -mno-sha -mpclmul -mpopcnt -mabm
+#       -mno-lwp -mfma -mno-fma4 -mno-xop -mbmi -mno-sgx -mbmi2 -mno-pconfig
+#       -mno-wbnoinvd -mno-tbm -mavx -mavx2 -msse4.2 -msse4.1 -mlzcnt -mno-rtm
+#       -mno-hle -mrdrnd -mf16c -mfsgsbase -mrdseed -mprfchw -madx -mfxsr -mxsave
+#       -mxsaveopt -mavx512f -mno-avx512er -mavx512cd -mno-avx512pf -mno-prefetchwt1
+#       -mclflushopt -mxsavec -mxsaves -mavx512dq -mavx512bw -mavx512vl
+#       -mno-avx512ifma -mno-avx512vbmi -mno-avx5124fmaps -mno-avx5124vnniw -mclwb
+#       -mno-mwaitx -mno-clzero -mpku -mno-rdpid -mno-gfni -mno-shstk
+#       -mno-avx512vbmi2 -mavx512vnni -mno-vaes -mno-vpclmulqdq -mno-avx512bitalg
+#       -mno-movdiri -mno-movdir64b --param l1-cache-size=32 --param
+#       l1-cache-line-size=64 --param l2-cache-size=28160 -mtune=skylake-avx512
+#       -fstack-protector-strong -Wformat -Wformat-security
+
+# The sha256sum of the output is computed, and stripped to the first 8
+# characters for use in ccache and github cache store key. Can effectively be
+# considered as a hash of the compiler version and the flags activated by
+# -march=native.
+
+COMPILER=$1
+
+$COMPILER -march=native -E -v - < /dev/null 2>&1 | grep cc1 \
+    | sha256sum | cut -c1-8
+

From 5bd1fc6b83d934198d6eed3b9bf50e751fa3d950 Mon Sep 17 00:00:00 2001
From: Qianqian Zhu <qianqian.zhu@hotmail.com>
Date: Mon, 17 May 2021 13:09:03 +0100
Subject: [PATCH 06/12] Refactor vocabs in Service (#143)

Co-authored-by: Nikolay Bogoychev <nheart@gmail.com>
---
 src/translator/batch_translator.cpp | 21 ++++----
 src/translator/batch_translator.h   |  5 +-
 src/translator/definitions.h        | 21 --------
 src/translator/response_builder.cpp |  3 +-
 src/translator/response_builder.h   |  7 +--
 src/translator/service.cpp          | 35 +------------
 src/translator/service.h            |  3 +-
 src/translator/text_processor.cpp   |  8 +--
 src/translator/text_processor.h     |  8 +--
 src/translator/vocabs.h             | 81 +++++++++++++++++++++++++++++
 10 files changed, 111 insertions(+), 81 deletions(-)
 create mode 100644 src/translator/vocabs.h

diff --git a/src/translator/batch_translator.cpp b/src/translator/batch_translator.cpp
index c627172..b35c4ce 100644
--- a/src/translator/batch_translator.cpp
+++ b/src/translator/batch_translator.cpp
@@ -10,11 +10,11 @@ namespace marian {
 namespace bergamot {
 
 BatchTranslator::BatchTranslator(DeviceId const device,
-                                 std::vector<Ptr<Vocab const>> &vocabs,
+                                 Vocabs &vocabs,
                                  Ptr<Options> options,
                                  const AlignedMemory* modelMemory,
                                  const AlignedMemory* shortlistMemory)
-    : device_(device), options_(options), vocabs_(&vocabs),
+    : device_(device), options_(options), vocabs_(vocabs),
     modelMemory_(modelMemory), shortlistMemory_(shortlistMemory) {}
 
 void BatchTranslator::initialize() {
@@ -22,17 +22,17 @@ void BatchTranslator::initialize() {
   bool check = options_->get<bool>("check-bytearray",false); // Flag holds whether validate the bytearray (model and shortlist)
   if (options_->hasAndNotEmpty("shortlist")) {
     int srcIdx = 0, trgIdx = 1;
-    bool shared_vcb = vocabs_->front() == vocabs_->back();
+    bool shared_vcb = vocabs_.sources().front() == vocabs_.target(); // vocabs_.sources().front() is used because we currently support only one source vocab
     if (shortlistMemory_->size() > 0 && shortlistMemory_->begin() != nullptr) {
       slgen_ = New<data::BinaryShortlistGenerator>(shortlistMemory_->begin(), shortlistMemory_->size(),
-                                                     vocabs_->front(), vocabs_->back(),
-                                                     srcIdx, trgIdx, shared_vcb, check);
+                                                   vocabs_.sources().front(), vocabs_.target(),
+                                                   srcIdx, trgIdx, shared_vcb, check);
     }
     else {
       // Changed to BinaryShortlistGenerator to enable loading binary shortlist file
       // This class also supports text shortlist file
-      slgen_ = New<data::BinaryShortlistGenerator>(options_, vocabs_->front(),
-                                                    vocabs_->back(), srcIdx,
+      slgen_ = New<data::BinaryShortlistGenerator>(options_, vocabs_.sources().front(),
+                                                    vocabs_.target(), srcIdx,
                                                     trgIdx, shared_vcb);
     }
   }
@@ -97,7 +97,7 @@ void BatchTranslator::translate(Batch &batch) {
   std::vector<Ptr<SubBatch>> subBatches;
   for (size_t j = 0; j < maxDims.size(); ++j) {
     subBatches.emplace_back(
-        New<SubBatch>(batchSize, maxDims[j], vocabs_->at(j)));
+        New<SubBatch>(batchSize, maxDims[j], vocabs_.sources().at(j)));
   }
 
   std::vector<size_t> words(maxDims.size(), 0);
@@ -116,9 +116,8 @@ void BatchTranslator::translate(Batch &batch) {
 
   auto corpus_batch = Ptr<CorpusBatch>(new CorpusBatch(subBatches));
   corpus_batch->setSentenceIds(sentenceIds);
-
-  auto trgVocab = vocabs_->back();
-  auto search = New<BeamSearch>(options_, scorers_, trgVocab);
+  
+  auto search = New<BeamSearch>(options_, scorers_, vocabs_.target());
 
   auto histories = std::move(search->search(graph_, corpus_batch));
   batch.completeBatch(histories);
diff --git a/src/translator/batch_translator.h b/src/translator/batch_translator.h
index 761a534..048ba77 100644
--- a/src/translator/batch_translator.h
+++ b/src/translator/batch_translator.h
@@ -11,6 +11,7 @@
 #include "request.h"
 #include "translator/history.h"
 #include "translator/scorers.h"
+#include "vocabs.h"
 
 #ifndef WASM_COMPATIBLE_SOURCE
 #include "pcqueue.h"
@@ -34,7 +35,7 @@ public:
    * @param modelMemory byte array (aligned to 256!!!) that contains the bytes of a model.bin. Provide a nullptr if not used.
    * @param shortlistMemory byte array of shortlist (aligned to 64)
    */
-  explicit BatchTranslator(DeviceId const device, std::vector<Ptr<Vocab const>> &vocabs,
+  explicit BatchTranslator(DeviceId const device, Vocabs &vocabs,
                   Ptr<Options> options, const AlignedMemory* modelMemory, const AlignedMemory* shortlistMemory);
 
   // convenience function for logging. TODO(jerin)
@@ -45,7 +46,7 @@ public:
 private:
   Ptr<Options> options_;
   DeviceId device_;
-  std::vector<Ptr<Vocab const>> *vocabs_;
+  const Vocabs&  vocabs_;
   Ptr<ExpressionGraph> graph_;
   std::vector<Ptr<Scorer>> scorers_;
   Ptr<data::ShortlistGenerator const> slgen_;
diff --git a/src/translator/definitions.h b/src/translator/definitions.h
index 175397d..bf1cb57 100644
--- a/src/translator/definitions.h
+++ b/src/translator/definitions.h
@@ -28,27 +28,6 @@ struct MemoryBundle {
 
   /// @todo Not implemented yet
   AlignedMemory ssplitPrefixFile;
-
-  MemoryBundle() = default;
-
-  MemoryBundle(MemoryBundle &&from){
-    model = std::move(from.model);
-    shortlist = std::move(from.shortlist);
-    vocabs = std::move(vocabs);
-    ssplitPrefixFile = std::move(from.ssplitPrefixFile);
-  }
-
-  MemoryBundle &operator=(MemoryBundle &&from) {
-    model = std::move(from.model);
-    shortlist = std::move(from.shortlist);
-    vocabs = std::move(vocabs);
-    ssplitPrefixFile = std::move(from.ssplitPrefixFile);
-    return *this;
-  }
-
-  // Delete copy constructors
-  MemoryBundle(const MemoryBundle&) = delete;
-  MemoryBundle& operator=(const MemoryBundle&) = delete;
 };
 
 } // namespace bergamot
diff --git a/src/translator/response_builder.cpp b/src/translator/response_builder.cpp
index f68bd31..037d456 100644
--- a/src/translator/response_builder.cpp
+++ b/src/translator/response_builder.cpp
@@ -65,11 +65,10 @@ void ResponseBuilder::buildTranslatedText(Histories &histories,
 
     Result result = onebest[0]; // Expecting only one result;
     Words words = std::get<0>(result);
-    auto targetVocab = vocabs_->back();
 
     std::string decoded;
     std::vector<string_view> targetSentenceMappings;
-    targetVocab->decodeWithByteRanges(words, decoded, targetSentenceMappings);
+    vocabs_.target()->decodeWithByteRanges(words, decoded, targetSentenceMappings);
 
     switch (responseOptions_.concatStrategy) {
     case ConcatStrategy::FAITHFUL: {
diff --git a/src/translator/response_builder.h b/src/translator/response_builder.h
index 85caffb..b8a8dd4 100644
--- a/src/translator/response_builder.h
+++ b/src/translator/response_builder.h
@@ -4,6 +4,7 @@
 #include "data/types.h"
 #include "response.h"
 #include "response_options.h"
+#include "vocabs.h"
 
 // For now we will work with this, to avoid complaints another structure is hard
 // to operate with.
@@ -24,10 +25,10 @@ public:
   /// @param [in] vocabs: marian vocab object (used in decoding)
   /// @param [in] promise: promise to set with the constructed Response.
   ResponseBuilder(ResponseOptions responseOptions, AnnotatedText &&source,
-                  std::vector<Ptr<Vocab const>> &vocabs,
+                  Vocabs &vocabs,
                   std::promise<Response> &&promise)
       : responseOptions_(responseOptions), source_(std::move(source)),
-        vocabs_(&vocabs), promise_(std::move(promise)) {}
+        vocabs_(vocabs), promise_(std::move(promise)) {}
 
   /// Constructs and sets the promise of a Response object from obtained
   /// histories after translating.
@@ -81,7 +82,7 @@ private:
   // Data members are context/curried args for the functor.
 
   ResponseOptions responseOptions_;
-  std::vector<Ptr<Vocab const>> *vocabs_; // vocabs are required for decoding
+  const Vocabs& vocabs_; // vocabs are required for decoding
                                           // and any source validation checks.
   std::promise<Response> promise_; //  To be set when callback triggered and
                                    //  after Response constructed.
diff --git a/src/translator/service.cpp b/src/translator/service.cpp
index 16c4743..5439667 100644
--- a/src/translator/service.cpp
+++ b/src/translator/service.cpp
@@ -5,45 +5,12 @@
 #include <string>
 #include <utility>
 
-inline std::vector<marian::Ptr<const marian::Vocab>>
-loadVocabularies(marian::Ptr<marian::Options> options,
-                 std::vector<std::shared_ptr<marian::bergamot::AlignedMemory>>&& vocabMemories) {
-  // @TODO: parallelize vocab loading for faster startup
-  std::vector<marian::Ptr<marian::Vocab const>> vocabs;
-  if(!vocabMemories.empty()){
-    // load vocabs from buffer
-    ABORT_IF(vocabMemories.size() < 2, "Insufficient number of vocabularies.");
-    vocabs.resize(vocabMemories.size());
-    for (size_t i = 0; i < vocabs.size(); i++) {
-      marian::Ptr<marian::Vocab> vocab = marian::New<marian::Vocab>(options, i);
-      vocab->loadFromSerialized(absl::string_view(vocabMemories[i]->begin(), vocabMemories[i]->size()));
-      vocabs[i] = vocab;
-    }
-  } else {
-    // load vocabs from file
-    auto vfiles = options->get<std::vector<std::string>>("vocabs");
-    // with the current setup, we need at least two vocabs: src and trg
-    ABORT_IF(vfiles.size() < 2, "Insufficient number of vocabularies.");
-    vocabs.resize(vfiles.size());
-    std::unordered_map<std::string, marian::Ptr<marian::Vocab>> vmap;
-    for (size_t i = 0; i < vocabs.size(); ++i) {
-      auto m = vmap.emplace(std::make_pair(vfiles[i], marian::Ptr<marian::Vocab>()));
-      if (m.second) { // new: load the vocab
-        m.first->second = marian::New<marian::Vocab>(options, i);
-        m.first->second->load(vfiles[i]);
-      }
-      vocabs[i] = m.first->second;
-    }
-  }
-  return vocabs;
-}
-
 namespace marian {
 namespace bergamot {
 
 Service::Service(Ptr<Options> options, MemoryBundle memoryBundle)
     : requestId_(0), options_(options),
-      vocabs_(std::move(loadVocabularies(options, std::move(memoryBundle.vocabs)))),
+      vocabs_(options, std::move(memoryBundle.vocabs)),
       text_processor_(vocabs_, options), batcher_(options),
       numWorkers_(options->get<int>("cpu-threads")),
       modelMemory_(std::move(memoryBundle.model)),
diff --git a/src/translator/service.h b/src/translator/service.h
index 9d0a67d..6c60888 100644
--- a/src/translator/service.h
+++ b/src/translator/service.h
@@ -9,6 +9,7 @@
 #include "response_builder.h"
 #include "text_processor.h"
 #include "translator/parser.h"
+#include "vocabs.h"
 
 #ifndef WASM_COMPATIBLE_SOURCE
 #include "pcqueue.h"
@@ -172,7 +173,7 @@ private:
 
   size_t requestId_;
   /// Store vocabs representing source and target.
-  std::vector<Ptr<Vocab const>> vocabs_; // ORDER DEPENDENCY (text_processor_)
+  Vocabs vocabs_; // ORDER DEPENDENCY (text_processor_)
 
   /// TextProcesser takes a blob of text and converts into format consumable by
   /// the batch-translator and annotates sentences and words.
diff --git a/src/translator/text_processor.cpp b/src/translator/text_processor.cpp
index fb66901..457e2b9 100644
--- a/src/translator/text_processor.cpp
+++ b/src/translator/text_processor.cpp
@@ -4,7 +4,6 @@
 #include "annotation.h"
 
 #include "common/options.h"
-#include "data/vocab.h"
 #include <vector>
 
 namespace marian {
@@ -12,13 +11,14 @@ namespace bergamot {
 
 Segment TextProcessor::tokenize(const string_view &segment,
                                 std::vector<string_view> &wordRanges) {
-  return vocabs_->front()->encodeWithByteRanges(
+  // vocabs_.sources().front() is used because we currently support only one source vocab
+  return vocabs_.sources().front()->encodeWithByteRanges(
       segment, wordRanges, /*addEOS=*/false, /*inference=*/true);
 }
 
-TextProcessor::TextProcessor(std::vector<Ptr<Vocab const>> &vocabs,
+TextProcessor::TextProcessor(Vocabs &vocabs,
                              Ptr<Options> options)
-    : vocabs_(&vocabs), sentence_splitter_(options) {
+    : vocabs_(vocabs), sentence_splitter_(options) {
 
   max_length_break_ = options->get<int>("max-length-break");
   max_length_break_ = max_length_break_ - 1;
diff --git a/src/translator/text_processor.h b/src/translator/text_processor.h
index 698e36e..f5d4d88 100644
--- a/src/translator/text_processor.h
+++ b/src/translator/text_processor.h
@@ -7,6 +7,7 @@
 #include "annotation.h"
 
 #include "sentence_splitter.h"
+#include "vocabs.h"
 
 #include <vector>
 
@@ -21,7 +22,7 @@ class TextProcessor {
   // sentences (vector of words). In addition, the ByteRanges of the
   // source-tokens in unnormalized text are provided as string_views.
 public:
-  explicit TextProcessor(std::vector<Ptr<Vocab const>> &vocabs, Ptr<Options>);
+  explicit TextProcessor(Vocabs &vocabs, Ptr<Options>);
 
   void process(AnnotatedText &source, Segments &segments);
 
@@ -36,9 +37,10 @@ private:
                 Segments &segments, AnnotatedText &source);
 
   // shorthand, used only in truncate()
-  const Word sourceEosId() const { return vocabs_->front()->getEosId(); }
+  // vocabs_.sources().front() is used because we currently support only one source vocab
+  const Word sourceEosId() const { return vocabs_.sources().front()->getEosId(); }
 
-  std::vector<Ptr<Vocab const>> *vocabs_;
+  const Vocabs& vocabs_;
   SentenceSplitter sentence_splitter_;
   size_t max_length_break_;
 };
diff --git a/src/translator/vocabs.h b/src/translator/vocabs.h
new file mode 100644
index 0000000..89aed4b
--- /dev/null
+++ b/src/translator/vocabs.h
@@ -0,0 +1,81 @@
+#pragma once
+
+namespace marian {
+namespace bergamot {
+
+/// Wrapper around the Marian Vocab objects needed by the translator.
+/// Holds one or more source vocabularies and exactly one target vocabulary; the last vocab supplied (buffer or path) is treated as the target.
+class Vocabs {
+public:
+  /// Construct from in-memory byte arrays when vocabMemories is non-empty; otherwise from the file paths listed under the "vocabs" option.
+  Vocabs(Ptr<Options> options, std::vector<std::shared_ptr<AlignedMemory>>&& vocabMemories): options_(options){
+    if (!vocabMemories.empty()){
+      // buffers were supplied: load vocabs from them
+      load(std::move(vocabMemories));
+    }
+    else{
+      // no buffers supplied: load vocabs from the paths in the "vocabs" option
+      auto vocabPaths = options->get<std::vector<std::string>>("vocabs");
+      load(vocabPaths);
+    }
+  }
+
+  /// Get all source vocabularies (every loaded vocab except the last one)
+  const std::vector<Ptr<Vocab const>>& sources() const {
+    return srcVocabs_;
+  }
+
+  /// Get the target vocabulary (the last vocab that was loaded)
+  const Ptr<Vocab const>& target() const {
+    return trgVocab_;
+  }
+
+private:
+  std::vector<Ptr<Vocab const>> srcVocabs_;  // source vocabularies
+  Ptr<Vocab const> trgVocab_;                // target vocabulary
+  Ptr<Options> options_;  // options passed to every Vocab constructed by load()
+
+  // Load vocabs from byte buffers; the last buffer becomes the target vocab.
+  void load(std::vector<std::shared_ptr<AlignedMemory>>&& vocabMemories) {
+    // At least two vocabs are required: src and trg
+    ABORT_IF(vocabMemories.size() < 2, "Insufficient number of vocabularies.");
+    srcVocabs_.resize(vocabMemories.size());
+    // vmap avoids loading the same vocab twice when several entries point at the same buffer;
+    // loading vocabs (either from buffers or files) is the biggest speed bottleneck.
+    // Keys are the addresses held by the shared_ptr<AlignedMemory> entries, stored as uintptr_t.
+    std::unordered_map<uintptr_t, Ptr<Vocab>> vmap;
+    for (size_t i = 0; i < srcVocabs_.size(); i++) {
+      auto m = vmap.emplace(std::make_pair(reinterpret_cast<uintptr_t>(vocabMemories[i].get()), Ptr<Vocab>()));
+      if (m.second) { // new: load the vocab
+        m.first->second = New<Vocab>(options_, i);
+        m.first->second->loadFromSerialized(absl::string_view(vocabMemories[i]->begin(), vocabMemories[i]->size()));
+      }
+      srcVocabs_[i] = m.first->second;
+    }
+    // The last loaded vocab is the target: keep it in trgVocab_ and drop it from the source list.
+    trgVocab_ = srcVocabs_.back();
+    srcVocabs_.pop_back();
+  }
+
+  // Load vocabs from files; the last path becomes the target vocab.
+  void load(const std::vector<std::string>& vocabPaths){
+    // with the current setup, we need at least two vocabs: src and trg
+    ABORT_IF(vocabPaths.size() < 2, "Insufficient number of vocabularies.");
+    srcVocabs_.resize(vocabPaths.size());
+    std::unordered_map<std::string, Ptr<Vocab>> vmap;  // keyed by path so a repeated path is loaded only once
+    for (size_t i = 0; i < srcVocabs_.size(); ++i) {
+      auto m = vmap.emplace(std::make_pair(vocabPaths[i], Ptr<Vocab>()));
+      if (m.second) { // new: load the vocab
+        m.first->second = New<Vocab>(options_, i);
+        m.first->second->load(vocabPaths[i]);
+      }
+      srcVocabs_[i] = m.first->second;
+    }
+    // The last loaded vocab is the target: keep it in trgVocab_ and drop it from the source list.
+    trgVocab_ = srcVocabs_.back();
+    srcVocabs_.pop_back();
+  }
+};
+
+} // namespace bergamot
+} // namespace marian

From 3e7058767222a8add4a505518c09ff0e7c6a2810 Mon Sep 17 00:00:00 2001
From: Kenneth Heafield <kpu@users.noreply.github.com>
Date: Mon, 17 May 2021 16:42:18 +0100
Subject: [PATCH 07/12] Rewrite annotation class to remove corner cases (#135)

---
 src/tests/annotation_tests.cpp      |  30 ++---
 src/translator/annotation.cpp       | 166 ++++++++---------------
 src/translator/annotation.h         | 197 +++++++++++++++++-----------
 src/translator/response_builder.cpp |  11 +-
 src/translator/text_processor.cpp   |  14 +-
 src/translator/text_processor.h     |   6 +-
 6 files changed, 197 insertions(+), 227 deletions(-)

diff --git a/src/tests/annotation_tests.cpp b/src/tests/annotation_tests.cpp
index d323b9d..0f02a7a 100644
--- a/src/tests/annotation_tests.cpp
+++ b/src/tests/annotation_tests.cpp
@@ -23,9 +23,6 @@ TEST_CASE("Test Annotation API with random sentences") {
   std::mt19937 randomIntGen_;
   randomIntGen_.seed(42);
 
-  AnnotatedText testAnnotation; // This the container we add through API and
-                                // check if the access is correct.
-
   // External book-keeping so we have ground truths. Each element represents a
   // sentence.
 
@@ -45,7 +42,7 @@ TEST_CASE("Test Annotation API with random sentences") {
   //
   //     4-0 4-1 4-2 4-3
   //
-  // Words are separated by space units.
+  // Tokens are contiguous because that's how SentencePiece works.
   //
   // Below, we accumulate the text with intended structure as above, and
   // ground-truth tables populated to be aware of the ByteRanges where they are
@@ -53,9 +50,10 @@ TEST_CASE("Test Annotation API with random sentences") {
   if (debug) {
     std::cout << "Preparing text and ground truth-tables" << std::endl;
   }
+  std::string text;
   for (size_t idx = 0; idx < sentences; idx++) {
     if (idx != 0)
-      testAnnotation.text += "\n";
+      text += "\n";
 
     // Words can be zero, we need to support empty word sentences as well.
     size_t numWords = randomIntGen_() % maxWords;
@@ -65,23 +63,16 @@ TEST_CASE("Test Annotation API with random sentences") {
 
     // For empty sentence, we expect it to be empty and marked in position where
     // the existing string is if needed to be pointed out.
-    size_t before = testAnnotation.text.size() - 1;
+    size_t before = text.size() - 1;
     size_t sentenceBegin{before}, sentenceEnd{before};
 
     for (size_t idw = 0; idw < numWords; idw++) {
-      if (idw != 0) {
-        testAnnotation.text += " ";
-        if (debug) {
-          std::cout << " ";
-        }
-      }
-
       // Get new beginning, accounting for space above.
-      before = testAnnotation.text.size();
+      before = text.size();
 
       // Add the word
       std::string word = std::to_string(idx) + "-" + std::to_string(idw);
-      testAnnotation.text += word;
+      text += word;
 
       // Do math, before, before + new-word's size.
       wordByteRanges.push_back((ByteRange){before, before + word.size()});
@@ -105,6 +96,9 @@ TEST_CASE("Test Annotation API with random sentences") {
     groundTruthSentences.push_back((ByteRange){sentenceBegin, sentenceEnd});
   }
 
+  AnnotatedText testAnnotation(std::move(text)); // This is the container we add to through the API and
+                                                 // check if the access is correct.
+
   // We prepare string_views now with the known ByteRanges and use the
   // string_view based AnnotatedText.addSentence(...) API to add sentences to
   // transparently convert from string_views to ByteRanges, rebasing/working out
@@ -116,6 +110,7 @@ TEST_CASE("Test Annotation API with random sentences") {
   }
 
   std::vector<std::vector<marian::string_view>> wordStringViews;
+  std::vector<ByteRange>::const_iterator sentence_iter = groundTruthSentences.begin();
   for (auto &sentence : groundTruthWords) {
     std::vector<marian::string_view> wordByteRanges;
     bool first{true};
@@ -132,7 +127,8 @@ TEST_CASE("Test Annotation API with random sentences") {
         std::cout << std::string(wordView);
       }
     }
-    testAnnotation.addSentence(wordByteRanges);
+    testAnnotation.recordExistingSentence(wordByteRanges.begin(), wordByteRanges.end(), testAnnotation.text.data() + sentence_iter->begin);
+    ++sentence_iter;
     wordStringViews.push_back(wordByteRanges);
     if (debug) {
       std::cout << std::endl;
@@ -207,7 +203,7 @@ TEST_CASE("Test Annotation API with random sentences") {
   // Sentence if the random test above does not cover it for some reason.
   int emptySentenceIdx = sentences;
   std::vector<marian::string_view> emptySentence;
-  testAnnotation.addSentence(emptySentence);
+  testAnnotation.recordExistingSentence(emptySentence.begin(), emptySentence.end(), testAnnotation.text.data() + testAnnotation.text.size());
 
   // There are no words.
   CHECK(testAnnotation.numWords(emptySentenceIdx) == 0);
diff --git a/src/translator/annotation.cpp b/src/translator/annotation.cpp
index c27d784..90e02e0 100644
--- a/src/translator/annotation.cpp
+++ b/src/translator/annotation.cpp
@@ -1,130 +1,68 @@
 #include "annotation.h"
 #include <cassert>
-#include <iostream>
 
 namespace marian {
 namespace bergamot {
 
-void Annotation::addSentence(std::vector<ByteRange> &sentence) {
-  flatByteRanges_.insert(std::end(flatByteRanges_), std::begin(sentence),
-                         std::end(sentence));
-  size_t size = flatByteRanges_.size();
-  sentenceEndIds_.push_back(size);
+AnnotatedText::AnnotatedText(std::string &&t) : text(std::move(t)) {
+  // Treat the entire text as a gap that recordExistingSentence will break.
+  annotation.token_begin_.back() = text.size();
 }
 
-size_t Annotation::numWords(size_t sentenceIdx) const {
-  size_t bosId, eosId;
-  bosId = sentenceEndIds_[sentenceIdx]; // Half interval, so;
-  eosId = sentenceEndIds_[sentenceIdx + 1];
-  // Difference between eosId and bosId is the number of words.
-  return eosId - bosId;
+void AnnotatedText::appendSentence(string_view prefix, std::vector<string_view>::iterator begin, std::vector<string_view>::iterator end) {
+  assert(annotation.token_begin_.back() == text.size());
+  // We'll be adding tokens from the sentence and another gap.
+  annotation.token_begin_.reserve(annotation.token_begin_.size() + (end - begin) + 1);
+
+  // The prefix is whitespace that extends the gap ending the previous sentence.
+  appendEndingWhitespace(prefix);
+
+  // Appending sentence text.
+  std::size_t offset = text.size();
+  for (std::vector<string_view>::iterator token = begin; token != end; ++token) {
+    offset += token->size();
+    annotation.token_begin_.push_back(offset);
+  }
+  if (begin != end) {
+    text.append(begin->data(), (end - 1)->data() + (end - 1)->size());
+    assert(offset == text.size()); // Tokens should be contiguous.
+  }
+
+  // Add the gap after the sentence.  This is empty for now, but will be
+  // extended with appendEndingWhitespace or another appendSentence.
+  annotation.gap_.push_back(annotation.token_begin_.size() - 1);
+  annotation.token_begin_.push_back(offset);
 }
 
-ByteRange Annotation::sentence(size_t sentenceIdx) const {
-  size_t bosId, eosId;
-  bosId = sentenceEndIds_[sentenceIdx]; // Half interval, so;
-  eosId = sentenceEndIds_[sentenceIdx + 1];
-  ByteRange sentenceByteRange;
+void AnnotatedText::appendEndingWhitespace(string_view whitespace) {
+  text.append(whitespace.data(), whitespace.size());
+  annotation.token_begin_.back() = text.size();
+}
 
-  if (bosId == eosId) {
-    // We have an empty sentence. However, we want to be able to point where in
-    // target this happened through the ranges. We are looking for the end of
-    // the flatByteRange and non-empty sentence before this happened and
-    // construct empty string-view equivalent ByteRange.
-    ByteRange eos = flatByteRanges_[eosId - 1];
-    sentenceByteRange = ByteRange{eos.end, eos.end};
+void AnnotatedText::recordExistingSentence(std::vector<string_view>::iterator begin, std::vector<string_view>::iterator end, const char *sentence_begin) {
+  assert(sentence_begin >= text.data());
+  assert(sentence_begin <= text.data() + text.size());
+  assert(begin == end || sentence_begin == begin->data());
+  assert(!annotation.token_begin_.empty());
+  assert(annotation.token_begin_.back() == text.size());
+  // Clip off the trailing entry that records text.size(); it is added back below.
+  annotation.token_begin_.resize(annotation.token_begin_.size() - 1);
+  for (std::vector<string_view>::iterator i = begin; i != end; ++i) {
+    assert(i->data() >= text.data()); // In range.
+    assert(i->data() + i->size() <= text.data() + text.size()); // In range
+    assert(i + 1 == end || i->data() + i->size() == (i+1)->data()); // Contiguous
+    annotation.token_begin_.push_back(i->data() - text.data());
+  }
+  // Gap token after sentence.
+  annotation.gap_.push_back(annotation.token_begin_.size());
+  if (begin != end) {
+    annotation.token_begin_.push_back((end - 1)->data() + (end - 1)->size() - text.data());
   } else {
-    ByteRange bos = flatByteRanges_[bosId];
-    ByteRange eos = flatByteRanges_[eosId - 1];
-    sentenceByteRange = ByteRange{bos.begin, eos.end};
+    // empty sentence.
+    annotation.token_begin_.push_back(sentence_begin - text.data());
   }
-  return sentenceByteRange;
-}
-
-ByteRange Annotation::word(size_t sentenceIdx, size_t wordIdx) const {
-  size_t bosOffset = sentenceEndIds_[sentenceIdx];
-  return flatByteRanges_[bosOffset + wordIdx];
-}
-
-string_view AnnotatedText::word(size_t sentenceIdx, size_t wordIdx) const {
-  auto terminals = annotation.word(sentenceIdx, wordIdx);
-  return string_view(&text[terminals.begin], terminals.size());
-}
-
-string_view AnnotatedText::sentence(size_t sentenceIdx) const {
-  auto sentenceAsByteRange = annotation.sentence(sentenceIdx);
-  return asStringView(sentenceAsByteRange);
-}
-
-void AnnotatedText::appendSentence(std::string prefix, std::string &reference,
-                                   std::vector<string_view> &wordRanges) {
-  text += prefix;
-  size_t offset = text.size(); // Get size before to do ByteRange arithmetic
-  text += reference;           // Append reference to text
-  std::vector<ByteRange> sentence;
-  for (auto &wordView : wordRanges) {
-    size_t thisWordBegin = offset + wordView.data() - reference.data();
-    sentence.push_back(
-        ByteRange{thisWordBegin, thisWordBegin + wordView.size()});
-  }
-  annotation.addSentence(sentence);
-}
-
-void AnnotatedText::addSentence(std::vector<string_view> &wordRanges) {
-  addSentence(std::begin(wordRanges), std::end(wordRanges));
-};
-
-void AnnotatedText::addSentence(std::vector<string_view>::iterator begin,
-                                std::vector<string_view>::iterator end) {
-  std::vector<ByteRange> sentence;
-  for (auto p = begin; p != end; p++) {
-    size_t begin_offset = p->data() - text.data();
-    sentence.push_back(ByteRange{begin_offset, begin_offset + p->size()});
-  }
-  annotation.addSentence(sentence);
-};
-
-ByteRange AnnotatedText::wordAsByteRange(size_t sentenceIdx,
-                                         size_t wordIdx) const {
-  return annotation.word(sentenceIdx, wordIdx);
-}
-
-ByteRange AnnotatedText::sentenceAsByteRange(size_t sentenceIdx) const {
-  return annotation.sentence(sentenceIdx);
-}
-
-string_view AnnotatedText::asStringView(const ByteRange &byteRange) const {
-  const char *data = &text[byteRange.begin];
-  size_t size = byteRange.size();
-  return string_view(data, size);
-}
-
-string_view AnnotatedText::gap(size_t sentenceIdx) const {
-  // Find start of filler-text before, there's a corner case when there's no
-  // sentence before.
-  const char *start = nullptr;
-  if (sentenceIdx == 0) {
-    // If first sentence, filler begins at start of whole-text.
-    start = text.data();
-  } else {
-    // Otherwise, filler begins at end of previous sentence.
-    string_view sentenceBefore = sentence(sentenceIdx - 1);
-    start = sentenceBefore.data() + sentenceBefore.size();
-  }
-
-  // Find end of filler-text, but there is a corner-case to handle.
-  const char *end = nullptr;
-  if (sentenceIdx == numSentences()) {
-    // If last sentence, manually find end of whole-text.
-    const char *begin = text.data();
-    end = begin + text.size();
-  } else {
-    // Otherwise, the filler ends at the start of next sentence.
-    string_view sentenceAfter = sentence(sentenceIdx);
-    end = sentenceAfter.data();
-  }
-
-  return string_view(start, end - start);
+  // Add back size token ending.
+  annotation.token_begin_.push_back(text.size());
 }
 
 } // namespace bergamot
diff --git a/src/translator/annotation.h b/src/translator/annotation.h
index 8cb7caf..555ab53 100644
--- a/src/translator/annotation.h
+++ b/src/translator/annotation.h
@@ -17,83 +17,99 @@ struct ByteRange {
   const size_t size() const { return end - begin; }
 };
 
-/// An Annotation is a collection of ByteRanges used to denote ancillary
-/// information of sentences and words on a text of string. Annotation is meant
-/// for consumption on platforms where `string_view` creates problems (eg:
-/// exports through WASM) conveniently rebasing them as required into
-/// ByteRanges. See AnnotatedText for cases where this is a non-issue.
+/// Annotation expresses sentence and token boundary information as ranges of
+/// bytes in a string, but does not itself own the string.  
+/// 
+/// See also AnnotatedText, which owns Annotation and the string. AnnotatedText
+/// wraps these ByteRange functions to provide a string_view interface.
 ///
-/// **Usage**
+/// Text is divided into gaps (whitespace between sentences) and sentences like
+/// so:
+///   gap sentence gap sentence gap
+/// Because gaps appear at the beginning and end of the text, there's always
+/// one more gap than there are sentences.
 ///
-/// To ensure rebasing is consistent during creation and updation, use
-/// `Annotation` best through `AnnotatedText`, which also holds the reference
-/// string and can work with `string_views`.
+/// The entire text is an unbroken sequence of tokens (i.e. the end of a token
+/// is the beginning of the next token).  A gap is exactly one token containing
+/// whatever whitespace is between the sentences.  A sentence is a sequence of
+/// tokens.
 ///
-/// If used separately, it is on the user to ensure the reference string
-/// is the same as what the Annotation refers to. For best results, an instance
-/// is expected to be read only in this mode of operation.
+/// Since we are using SentencePiece, a token can include whitespace.  The term
+/// "word" is used, somewhat incorrectly, as a synonym of token.
 ///
-/// **Idea**
-///
-/// Annotation is intended to be the same structure conceptually as below,
-/// except the `std::vector<std::vector<ByteRange>>` hammered into a flat
-/// structure to avoid multiple reallocs keeping efficiency in mind. This is
-/// achieved by having markers of where sentence ends in the flat container
-/// storing word ByteRanges.
-///
-/// ```cpp
-/// typedef ByteRange Word;
-/// // std::vector<ByteRange>, a single sentence
-/// typedef std::vector<Word> Sentence;
-/// std::vector<std::vector<ByteRange> // multiple sentences
-/// typedef std::vector<Sentence> Annotation;
-///
-/// Annotation example;
-/// ```
-/// This structure exists to provide a consistent API to access the nested
-/// sentences of varying lengths, which occur in source-text processed into
-/// multiple sentences, and target-text translated from source as multiple
-/// sentences, both composed of (sub)-words, providing a List[List] like access
-/// while storing it in a compact and efficient manner.
+/// A gap can be empty (for example there may not have been whitespace at the
+/// beginning).  A sentence can also be empty (typically the translation system
+/// produced empty output).  That's fine, these are just empty ranges as you
+/// would expect.
 class Annotation {
 public:
-  /// Annotation is constructed empty. See `addSentence()` to populate it with
-  /// annotations.
+  /// Initially an empty string.  Populated by AnnotatedText.
   Annotation() {
-    // The -1-th sentence ends at 0.
-    sentenceEndIds_.push_back(0);
+    token_begin_.push_back(0);
+    token_begin_.push_back(0);
+    gap_.push_back(0);
   }
 
-  size_t numSentences() const { return sentenceEndIds_.size() - 1; }
+  size_t numSentences() const { return gap_.size() - 1; }
 
   /// Returns number of words in the sentence identified by `sentenceIdx`.
-  size_t numWords(size_t sentenceIdx) const;
-
-  /// Adds a sentences from `vector<ByteRange>` representation, internally doing
-  /// extra book-keeping for the sentence terminal markings. Sentences are
-  /// expected to be added in order as they occur in text.
-  void addSentence(std::vector<ByteRange> &sentence);
+  size_t numWords(size_t sentenceIdx) const {
+    return gap_[sentenceIdx + 1] - gap_[sentenceIdx] - 1 /* minus the gap */;
+  }
 
   /// Returns a ByteRange representing `wordIdx` in sentence indexed by
   /// `sentenceIdx`. `wordIdx` follows 0-based indexing, and should be less than
   /// `.numWords()` for `sentenceIdx` for defined behaviour.
-  ByteRange word(size_t sentenceIdx, size_t wordIdx) const;
+  ByteRange word(size_t sentenceIdx, size_t wordIdx) const {
+    size_t tokenIdx = gap_[sentenceIdx] + 1 + wordIdx;
+    return ByteRange {token_begin_[tokenIdx], token_begin_[tokenIdx + 1]};
+  }
 
   /// Returns a ByteRange representing sentence corresponding to `sentenceIdx`.
   /// `sentenceIdx` follows 0-based indexing, and behaviour is defined only when
   /// less than `.numSentences()`.
-  ByteRange sentence(size_t sentenceIdx) const;
+  ByteRange sentence(size_t sentenceIdx) const {
+    return ByteRange {
+      token_begin_[gap_[sentenceIdx] + 1], /*end of whitespace before */
+      token_begin_[gap_[sentenceIdx + 1]] /*beginning of whitespace after */
+    };
+  }
+
+  ByteRange gap(size_t gapIdx) const {
+    size_t tokenIdx = gap_[gapIdx];
+    return ByteRange {token_begin_[tokenIdx], token_begin_[tokenIdx + 1]};
+  }
 
 private:
-  /// A flat storage for ByteRanges. Composed of word ByteRanges, extra
-  /// information in sentenceEndIds_ to denote sentence boundary markers as
-  /// indices.
-  std::vector<ByteRange> flatByteRanges_;
+  friend class AnnotatedText;
+  /// Map from token index to byte offset at which it begins.  Token i is:
+  ///   [token_begin_[i], token_begin_[i+1])
+  /// The vector is padded so that these indices are always valid, even at the
+  /// end.  So token_begin_.size() is the number of tokens plus 1.
+  std::vector<size_t> token_begin_;
 
-  /// Stores indices onto flatByteRanges_ of where sentences end (not inclusive,
-  /// aligned with C++ half interval notions). There is a 0 marker to simplify
-  /// sources, indicating where the -1-th sentence ends.
-  std::vector<size_t> sentenceEndIds_;
+  /// Indices of tokens that correspond to gaps between sentences.  These are
+  /// indices into token_begin_.
+  /// Gap g is byte range:
+  ///   [token_begin_[gap_[g]], token_begin_[gap_[g]+1])
+  /// Sentence s is byte range:
+  ///   [token_begin_[gap_[s]+1], token_begin_[gap_[s+1]])
+  /// A sentence does not include whitespace at the beginning or end.
+  ///
+  /// gap_.size() == numSentences() + 1.
+  ///
+  /// Example: empty text "" -> just an empty gap.
+  /// token_begin_ = {0, 0};
+  /// gap_ = {0};
+  ///
+  /// Example: only space " " -> just a gap containing the space.
+  /// token_begin_ = {0, 1};
+  /// gap_ = {0};
+  ///
+  /// Example: one token "hi" -> empty gap, sentence with one token, empty gap
+  /// token_begin_ = {0, 0, 2, 2};
+  /// gap_ = {0, 2};
+  std::vector<size_t> gap_;
 };
 
 /// AnnotatedText is effectively std::string text + Annotation, providing the
@@ -107,7 +123,6 @@ private:
 ///
 /// 3. Bind the text and annotations together, to move around as a meaningful
 /// unit.
-
 struct AnnotatedText {
 public:
   std::string text;      ///< Blob of string elements in annotation refers to.
@@ -122,7 +137,31 @@ public:
 
   /// Construct moving in a string (for efficiency purposes, copying string
   /// constructor is disallowed).
-  AnnotatedText(std::string &&text) : text(std::move(text)){};
+  AnnotatedText(std::string &&text);
+
+  /// Appends a sentence to the existing text and transparently rebases
+  /// string_views.  Since this tracks only the prefix, remember to call
+  /// appendEndingWhitespace after the last sentence.
+  /// The string_views must not already be in text.
+  void appendSentence(
+      string_view prefix,
+      std::vector<string_view>::iterator tokens_begin,
+      std::vector<string_view>::iterator tokens_end);
+
+  /// Append the whitespace at the end of input. string_view must not be in
+  /// text.
+  void appendEndingWhitespace(string_view whitespace);
+
+  /// Record the existence of a sentence that is already in text.  The
+  /// iterators are over string_views for each token that must be in text
+  /// already.  This function must be called to record sentences in order.
+  /// Normally the beginning of the sentence can be inferred from
+  /// tokens_begin->data() but the tokens could be empty, so sentence_begin is
+  /// required to know where the sentence is.
+  void recordExistingSentence(
+      std::vector<string_view>::iterator tokens_begin,
+      std::vector<string_view>::iterator tokens_end,
+      const char *sentence_begin);
 
   /// Returns the number of sentences in the annotation structure.
   const size_t numSentences() const { return annotation.numSentences(); }
@@ -132,46 +171,44 @@ public:
     return annotation.numWords(sentenceIdx);
   }
 
-  /// Appends a sentence to the existing text and transparently rebases
-  /// string_views
-  void appendSentence(std::string prefix, std::string &reference,
-                      std::vector<string_view> &wordRanges);
-
-  /// Adds a sentence, used to load from SentencePiece annotations conveniently.
-  void addSentence(std::vector<string_view> &wordRanges);
-
-  /// Adds a sentence between two iterators, often useful while constructing
-  /// from parts of a container.
-  void addSentence(std::vector<string_view>::iterator begin,
-                   std::vector<string_view>::iterator end);
-
   /// Returns a string_view representing wordIdx in sentenceIdx
-  string_view word(size_t sentenceIdx, size_t wordIdx) const;
+  string_view word(size_t sentenceIdx, size_t wordIdx) const {
+    return asStringView(annotation.word(sentenceIdx, wordIdx));
+  }
 
   /// Returns a string_view representing sentence corresponding to sentenceIdx.
-  string_view sentence(size_t sentenceIdx) const;
+  string_view sentence(size_t sentenceIdx) const {
+    return asStringView(annotation.sentence(sentenceIdx));
+  }
 
   /// Returns the string_view of the gap between two sentences in the container.
   ///
   /// More precisely where `i = sentenceIdx, N = numSentences()` for brevity:
   ///
-  /// * For `i = 0`: The gap between the start of text and the first sentence.
+  /// * For `i = 0`: The gap between the start of text and the 0th sentence.
   /// * For `i = 1...N-1`, returns the text comprising of the gap
-  ///   between the `i-1`-th and `i`-th sentence.
-  /// * For `i = N`, the gap between the last sentence and end of
+  ///   between the `i-1`-th and `i`-th sentence.
+  /// * For `i = N`, the gap between the last (N-1th) sentence and end of
   ///   text.
-
   /// @param sentenceIdx: Can be between `[0, numSentences()]`.
-  string_view gap(size_t sentenceIdx) const;
+  string_view gap(size_t sentenceIdx) const {
+    return asStringView(annotation.gap(sentenceIdx));
+  }
 
   /// Returns a ByteRange representing wordIdx in sentenceIdx
-  ByteRange wordAsByteRange(size_t sentenceIdx, size_t wordIdx) const;
+  ByteRange wordAsByteRange(size_t sentenceIdx, size_t wordIdx) const {
+    return annotation.word(sentenceIdx, wordIdx);
+  }
 
   /// Returns a ByteRange representing sentence corresponding to sentenceIdx.
-  ByteRange sentenceAsByteRange(size_t sentenceIdx) const;
+  ByteRange sentenceAsByteRange(size_t sentenceIdx) const {
+    return annotation.sentence(sentenceIdx);
+  }
 
 private:
-  string_view asStringView(const ByteRange &byteRange) const;
+  string_view asStringView(const ByteRange &byteRange) const {
+    return string_view(text.data() + byteRange.begin, byteRange.size());
+  }
 };
 
 } // namespace bergamot
diff --git a/src/translator/response_builder.cpp b/src/translator/response_builder.cpp
index 037d456..b2f561b 100644
--- a/src/translator/response_builder.cpp
+++ b/src/translator/response_builder.cpp
@@ -75,22 +75,19 @@ void ResponseBuilder::buildTranslatedText(Histories &histories,
       // For each sentence, prepend the filler text between the corresponding
       // source-sentence and the source-sentence before.
       string_view pre = response.source.gap(sentenceIdx);
-      response.target.appendSentence(std::string(pre.data(), pre.size()),
-                                     decoded, targetSentenceMappings);
+      response.target.appendSentence(pre, targetSentenceMappings.begin(), targetSentenceMappings.end());
 
       // If this is the last history to be decoded and translated-text
       // constructed, append the text till the end, which could be spaces or
       // empty.
       if (sentenceIdx + 1 == histories.size()) {
-        string_view post = response.source.gap(sentenceIdx + 1);
-        response.target.text += std::string(post.data(), post.size());
+        response.target.appendEndingWhitespace(response.source.gap(sentenceIdx + 1));
       }
       break;
     }
     case ConcatStrategy::SPACE: {
-      std::string delimiter = (sentenceIdx == 0) ? "" : " ";
-      response.target.appendSentence(delimiter, decoded,
-                                     targetSentenceMappings);
+      string_view delimiter = (sentenceIdx == 0) ? "" : " ";
+      response.target.appendSentence(delimiter, targetSentenceMappings.begin(), targetSentenceMappings.end());
       break;
     }
 
diff --git a/src/translator/text_processor.cpp b/src/translator/text_processor.cpp
index 457e2b9..bca5fd1 100644
--- a/src/translator/text_processor.cpp
+++ b/src/translator/text_processor.cpp
@@ -41,15 +41,16 @@ void TextProcessor::process(AnnotatedText &source, Segments &segments) {
     // There are some cases where SentencePiece or vocab returns no words
     // after normalization. 0 prevents any empty entries from being added.
     if (segment.size() > 0) {
-      // Truncate segment into max_input_size segments.
-      truncate(segment, wordRanges, segments, source);
+      // Wrap segment into sentences of at most max_length_break_ tokens and
+      // tell source about them.
+      wrap(segment, wordRanges, segments, source);
     }
   }
 }
 
-void TextProcessor::truncate(Segment &segment,
-                             std::vector<string_view> &wordRanges,
-                             Segments &segments, AnnotatedText &source) {
+void TextProcessor::wrap(Segment &segment,
+                         std::vector<string_view> &wordRanges,
+                         Segments &segments, AnnotatedText &source) {
   for (size_t offset = 0; offset < segment.size();
        offset += max_length_break_) {
     auto start = segment.begin() + offset;
@@ -61,7 +62,8 @@ void TextProcessor::truncate(Segment &segment,
     segments.back().push_back(sourceEosId());
 
     auto astart = wordRanges.begin() + offset;
-    source.addSentence(astart, astart + diff);
+    // diff > 0
+    source.recordExistingSentence(astart, astart + diff, astart->data());
   }
 }
 
diff --git a/src/translator/text_processor.h b/src/translator/text_processor.h
index f5d4d88..7328877 100644
--- a/src/translator/text_processor.h
+++ b/src/translator/text_processor.h
@@ -32,9 +32,9 @@ private:
   Segment tokenize(const string_view &input,
                    std::vector<string_view> &tokenRanges);
 
-  // Truncate sentence into max_input_size segments.
-  void truncate(Segment &sentence, std::vector<string_view> &tokenRanges,
-                Segments &segments, AnnotatedText &source);
+  // Wrap into sentences of at most max_length_break_ tokens and add to source.
+  void wrap(Segment &sentence, std::vector<string_view> &tokenRanges,
+            Segments &segments, AnnotatedText &source);
 
   // shorthand, used only in truncate()
   // vocabs_->sources().front() is invoked as we currently only support one source vocab

From c1ef6f2bcb08fcd4f9e2432ae443bb7a81813594 Mon Sep 17 00:00:00 2001
From: Abhishek Aggarwal <aaggarwal@mozilla.com>
Date: Mon, 17 May 2021 17:33:23 +0200
Subject: [PATCH 08/12] Added cmake file to compute version information

 - Reads BERGAMOT_VERSION file for generating various strings
   for versioning
---
 cmake/GetVersionFromFile.cmake | 60 ++++++++++++++++++++++++++++++++++
 1 file changed, 60 insertions(+)
 create mode 100644 cmake/GetVersionFromFile.cmake

diff --git a/cmake/GetVersionFromFile.cmake b/cmake/GetVersionFromFile.cmake
new file mode 100644
index 0000000..2eadb42
--- /dev/null
+++ b/cmake/GetVersionFromFile.cmake
@@ -0,0 +1,60 @@
+##
+# This CMake module sets the project version from a version file.
+#
+# The module sets the following variables:
+#
+# * PROJECT_VERSION_STRING
+# * PROJECT_VERSION_STRING_FULL
+# * PROJECT_VERSION_MAJOR
+# * PROJECT_VERSION_MINOR
+# * PROJECT_VERSION_PATCH
+# * PROJECT_VERSION_TWEAK
+# * PROJECT_VERSION_GIT_SHA
+#
+# This module is public domain, use it as it fits you best.
+##
+
+# Get full string version from file
+if(PROJECT_VERSION_FILE)
+  file(STRINGS ${PROJECT_VERSION_FILE} PROJECT_VERSION_STRING)
+else()
+  file(STRINGS ${CMAKE_CURRENT_SOURCE_DIR}/BERGAMOT_VERSION PROJECT_VERSION_STRING)
+endif()
+
+# Get current commit SHA from git
+execute_process(COMMAND ${GIT_EXECUTABLE} rev-parse --short HEAD
+  WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
+  OUTPUT_VARIABLE PROJECT_VERSION_GIT_SHA
+  OUTPUT_STRIP_TRAILING_WHITESPACE)
+
+# Get partial versions into a list
+string(REGEX MATCHALL "-.*$|[0-9]+" PROJECT_PARTIAL_VERSION_LIST
+  ${PROJECT_VERSION_STRING})
+
+# Set the version numbers
+list(GET PROJECT_PARTIAL_VERSION_LIST 0 PROJECT_VERSION_MAJOR)
+list(GET PROJECT_PARTIAL_VERSION_LIST 1 PROJECT_VERSION_MINOR)
+list(GET PROJECT_PARTIAL_VERSION_LIST 2 PROJECT_VERSION_PATCH)
+
+# The tweak part is optional, so check if the list contains it
+list(LENGTH PROJECT_PARTIAL_VERSION_LIST PROJECT_PARTIAL_VERSION_LIST_LEN)
+if(PROJECT_PARTIAL_VERSION_LIST_LEN GREATER 3)
+  list(GET PROJECT_PARTIAL_VERSION_LIST 3 PROJECT_VERSION_TWEAK)
+  string(SUBSTRING ${PROJECT_VERSION_TWEAK} 1 -1 PROJECT_VERSION_TWEAK)
+endif()
+
+# Unset the list
+unset(PROJECT_PARTIAL_VERSION_LIST)
+
+# Set full project version string
+set(PROJECT_VERSION_STRING_FULL
+  ${PROJECT_VERSION_STRING}+${PROJECT_VERSION_GIT_SHA})
+
+# Print all variables for debugging
+#message(STATUS ${PROJECT_VERSION_STRING_FULL})
+#message(STATUS ${PROJECT_VERSION_STRING})
+#message(STATUS ${PROJECT_VERSION_MAJOR})
+#message(STATUS ${PROJECT_VERSION_MINOR})
+#message(STATUS ${PROJECT_VERSION_PATCH})
+#message(STATUS ${PROJECT_VERSION_TWEAK})
+#message(STATUS ${PROJECT_VERSION_GIT_SHA})

From c44868e1fdd56e1562afee18773a9ee1d08d7689 Mon Sep 17 00:00:00 2001
From: Abhishek Aggarwal <aaggarwal@mozilla.com>
Date: Mon, 17 May 2021 17:34:57 +0200
Subject: [PATCH 09/12] Import GetVersionFromFile cmake file in root level
 CMakeLists.txt

---
 CMakeLists.txt | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 332aed1..e561ed9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,4 +1,5 @@
 cmake_minimum_required(VERSION 3.5.1)
+set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
 
 if (POLICY CMP0074)
   cmake_policy(SET CMP0074 NEW) # CMake 3.12
@@ -71,6 +72,11 @@ if(GIT_FOUND AND EXISTS "${PROJECT_SOURCE_DIR}/.git")
     endif()
 endif()
 
+# Project versioning
+include(GetVersionFromFile)
+message(STATUS "Project name: ${PROJECT_NAME}")
+message(STATUS "Project version: ${PROJECT_VERSION_STRING_FULL}")
+
 if(NOT COMPILE_WASM)
   # Set BUILD_ARCH to native only while compiling for non wasm platform
   set(BUILD_ARCH native CACHE STRING "Compile for this CPU architecture.")

From 2e5880d3d499094e80eaae235fabf86068eb2f00 Mon Sep 17 00:00:00 2001
From: Abhishek Aggarwal <aaggarwal@mozilla.com>
Date: Mon, 17 May 2021 17:37:10 +0200
Subject: [PATCH 10/12] Modified wasm cmake file to include version information
 in built artifacts

---
 wasm/CMakeLists.txt        | 7 +++++++
 wasm/project_version.js.in | 1 +
 2 files changed, 8 insertions(+)
 create mode 100644 wasm/project_version.js.in

diff --git a/wasm/CMakeLists.txt b/wasm/CMakeLists.txt
index 7feef75..72b22c1 100644
--- a/wasm/CMakeLists.txt
+++ b/wasm/CMakeLists.txt
@@ -4,6 +4,10 @@ add_executable(bergamot-translator-worker
     bindings/TranslationResultBindings.cpp
 )
 
+# Generate version file that can be included in the wasm artifacts
+configure_file(${CMAKE_CURRENT_SOURCE_DIR}/project_version.js.in
+               ${CMAKE_CURRENT_SOURCE_DIR}/project_version.js @ONLY)
+
 # This header inclusion needs to go away later as path to public headers of bergamot
 # translator should be directly available from "bergamot-translator" target
 target_include_directories(bergamot-translator-worker
@@ -19,6 +23,9 @@ set(LINKER_FLAGS "-g2 --bind -s ASSERTIONS=0 -s DISABLE_EXCEPTION_CATCHING=1 -s
 # Avoid node.js-code in emscripten glue-code
 set(LINKER_FLAGS "${LINKER_FLAGS} -s ENVIRONMENT=web,worker")
 
+# Prepend version information to the JavaScript artifact
+set(LINKER_FLAGS "${LINKER_FLAGS} --extern-pre-js ${CMAKE_CURRENT_SOURCE_DIR}/project_version.js")
+
 set_target_properties(bergamot-translator-worker PROPERTIES
                         SUFFIX ".js"
                         LINK_FLAGS ${LINKER_FLAGS}
diff --git a/wasm/project_version.js.in b/wasm/project_version.js.in
new file mode 100644
index 0000000..9a4095f
--- /dev/null
+++ b/wasm/project_version.js.in
@@ -0,0 +1 @@
+var BERGAMOT_VERSION_FULL = "@PROJECT_VERSION_STRING_FULL@";
\ No newline at end of file

From 0ad583cc34affac322e36ab0e4b5d4310309db74 Mon Sep 17 00:00:00 2001
From: Abhishek Aggarwal <aaggarwal@mozilla.com>
Date: Mon, 17 May 2021 17:41:59 +0200
Subject: [PATCH 11/12] Generate project version file for native builds

 - The header file exposes a function that provides version information
   for native binaries
---
 src/translator/CMakeLists.txt       |  4 ++++
 src/translator/project_version.h.in | 19 +++++++++++++++++++
 2 files changed, 23 insertions(+)
 create mode 100644 src/translator/project_version.h.in

diff --git a/src/translator/CMakeLists.txt b/src/translator/CMakeLists.txt
index a7ba0d3..1d48d59 100644
--- a/src/translator/CMakeLists.txt
+++ b/src/translator/CMakeLists.txt
@@ -1,3 +1,7 @@
+# Generate version file
+configure_file(${CMAKE_CURRENT_SOURCE_DIR}/project_version.h.in
+               ${CMAKE_CURRENT_SOURCE_DIR}/project_version.h @ONLY)
+
 add_library(bergamot-translator STATIC
     byte_array_util.cpp
     text_processor.cpp
diff --git a/src/translator/project_version.h.in b/src/translator/project_version.h.in
new file mode 100644
index 0000000..b7a0d04
--- /dev/null
+++ b/src/translator/project_version.h.in
@@ -0,0 +1,19 @@
+#pragma once
+
+/*
+ * File project_version.h is generated using CMake. Do not modify project_version.h manually!
+ * Edit project_version.h.in file instead.
+ */
+
+#include <string>
+
+namespace marian {
+namespace bergamot {
+
+inline std::string bergamotBuildVersion() {  // inline: defined in a header, must not violate the ODR
+    // Placeholder is substituted by CMake's configure_file, e.g. v1.2.3-alpha.1.1+abc123d
+    return "@PROJECT_VERSION_STRING_FULL@";
+}
+
+} // namespace bergamot
+} // namespace marian

From 067076fbc180bac04492252eadda8497c22065c3 Mon Sep 17 00:00:00 2001
From: Abhishek Aggarwal <aaggarwal@mozilla.com>
Date: Mon, 17 May 2021 17:38:00 +0200
Subject: [PATCH 12/12] Bumped version to 0.3.0

 - This brings the version info in sync with the various releases
   of extension
---
 BERGAMOT_VERSION | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/BERGAMOT_VERSION b/BERGAMOT_VERSION
index ae39fab..268b033 100644
--- a/BERGAMOT_VERSION
+++ b/BERGAMOT_VERSION
@@ -1 +1 @@
-v0.0.0
+v0.3.0