From e0b9bad0581963f4ae1f953f1b52de5619672384 Mon Sep 17 00:00:00 2001 From: Abhishek Aggarwal Date: Wed, 12 May 2021 14:39:23 +0200 Subject: [PATCH 01/12] Updated wasm README to update for passing vocabs as bytes - Updated Using JS APIs section to pass vocabs as bytes --- wasm/README.md | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/wasm/README.md b/wasm/README.md index 337ae1b..eceac15 100644 --- a/wasm/README.md +++ b/wasm/README.md @@ -7,8 +7,8 @@ Please note that [Using JS APIs](#Using-JS-APIs) and [Demo](#Demo) section below ```bash cd test_page -mkdir models git clone --depth 1 --branch main --single-branch https://github.com/mozilla-applied-ml/bergamot-models +mkdir models cp -rf bergamot-models/prod/* models gunzip models/*/* ``` @@ -18,10 +18,7 @@ gunzip models/*/* ```js // The model configuration as YAML formatted string. For available configuration options, please check: https://marian-nmt.github.io/docs/cmd/marian-decoder/ // This example captures some of the most relevant options -const modelConfig = `vocabs: - - /esen/vocab.esen.spm - - /esen/vocab.esen.spm -beam-size: 1 +const modelConfig = `beam-size: 1 normalize: 1.0 word-penalty: 0 max-length-break: 128 @@ -35,19 +32,31 @@ quiet-translation: true gemm-precision: int8shift `; -// Download model and shortlist files and read them into buffers +// Download model, shortlist and vocabulary files and read them into buffers const modelFile = `models/esen/model.esen.intgemm.alphas.bin`; const shortlistFile = `models/esen/lex.50.50.esen.s2t.bin`; -const downloadedBuffers = await Promise.all([downloadAsArrayBuffer(modelFile), downloadAsArrayBuffer(shortlistFile)]); // Please refer to bergamot.html in test_page folder for this function +const vocabFiles = [`models/esen/vocab.esen.spm`, + `models/esen/vocab.esen.spm`]; +const uniqueVocabFiles = new Set(vocabFiles); + +// Please refer to bergamot.html in 
test_page folder for downloadAsArrayBuffer function +const downloadedBuffers = await Promise.all([downloadAsArrayBuffer(modelFile), downloadAsArrayBuffer(shortlistFile)]); +const downloadedVocabBuffers = []; +for (let item of uniqueVocabFiles.values()) { + downloadedVocabBuffers.push(await downloadAsArrayBuffer(item)); +} + const modelBuffer = downloadedBuffers[0]; const shortListBuffer = downloadedBuffers[1]; // Construct AlignedMemory instances from the buffers var alignedModelMemory = constructAlignedMemoryFromBuffer(modelBuffer, 256); // Please refer to bergamot.html in test_page folder for this function var alignedShortlistMemory = constructAlignedMemoryFromBuffer(shortListBuffer, 64); // Please refer to bergamot.html in test_page folder for this function +var alignedVocabsMemoryList = new Module.AlignedMemoryList; +downloadedVocabBuffers.forEach(item => alignedVocabsMemoryList.push_back(constructAlignedMemoryFromBuffer(item, 64))); // Instantiate the TranslationModel -const model = new Module.TranslationModel(modelConfig, alignedModelMemory, alignedShortlistMemory); +const model = new Module.TranslationModel(modelConfig, alignedModelMemory, alignedShortlistMemory, alignedVocabsMemoryList); // Instantiate the arguments of translate() API i.e. TranslationRequest and input (vector) const request = new Module.TranslationRequest(); From 0189500160eaac8ae7e8a7b8d03ae91c98063684 Mon Sep 17 00:00:00 2001 From: Abhishek Aggarwal Date: Wed, 12 May 2021 14:44:33 +0200 Subject: [PATCH 02/12] Updated README to remove packaging steps for wasm compilation - We don't need to package model, shortlist or vocab files into wasm binary at build time --- README.md | 94 +++++++++++++++++-------------------------------------- 1 file changed, 28 insertions(+), 66 deletions(-) diff --git a/README.md b/README.md index 456af70..f48c981 100644 --- a/README.md +++ b/README.md @@ -5,85 +5,47 @@ Bergamot translator provides a unified API for ([Marian NMT](https://marian-nmt. 
## Build Instructions ### Build Natively -1. Clone the repository using these instructions: - ```bash - git clone https://github.com/browsermt/bergamot-translator - cd bergamot-translator - ``` -2. Compile +Create a folder where you want to build all the artifacts (`build-native` in this case) and compile - Create a folder where you want to build all the artifacts (`build-native` in this case) and compile in that folder - ```bash - mkdir build-native - cd build-native - cmake ../ - make -j - ``` +```bash +mkdir build-native +cd build-native +cmake ../ +make -j3 +``` ### Build WASM -#### Compiling for the first time +#### Prerequisite -1. Download and Install Emscripten using following instructions - * Get the latest sdk: `git clone https://github.com/emscripten-core/emsdk.git` - * Enter the cloned directory: `cd emsdk` - * Install the lastest sdk tools: `./emsdk install latest` - * Activate the latest sdk tools: `./emsdk activate latest` - * Activate path variables: `source ./emsdk_env.sh` +Building for wasm requires the Emscripten toolchain. It can be downloaded and installed using the following instructions: -2. Clone the repository using these instructions: +* Get the latest sdk: `git clone https://github.com/emscripten-core/emsdk.git` +* Enter the cloned directory: `cd emsdk` +* Install the latest sdk tools: `./emsdk install latest` +* Activate the latest sdk tools: `./emsdk activate latest` +* Activate path variables: `source ./emsdk_env.sh` + +#### Compile + +1. Create a folder where you want to build all the artifacts (`build-wasm` in this case) and compile ```bash - git clone https://github.com/browsermt/bergamot-translator - cd bergamot-translator + mkdir build-wasm + cd build-wasm + emcmake cmake -DCOMPILE_WASM=on ../ + emmake make -j3 ``` -3. Download files (only required if you want to perform inference using build artifacts) + The wasm artifacts (.js and .wasm files) will be available in the build directory ("build-wasm" in this case). 
- It packages the vocabulary files into wasm binary, which is required only if you want to perform inference. - The compilation commands will preload these files in Emscripten’s virtual file system. - - If you want to package bergamot project specific files, please follow these instructions: +2. Enable SIMD Wormhole via Wasm instantiation API in generated artifacts ```bash - git clone --depth 1 --branch main --single-branch https://github.com/mozilla-applied-ml/bergamot-models - mkdir models - cp -rf bergamot-models/prod/* models - gunzip models/*/* - find models \( -type f -name "model*" -or -type f -name "lex*" \) -delete + bash ../wasm/patch-artifacts-enable-wormhole.sh ``` -4. Compile - 1. Create a folder where you want to build all the artefacts (`build-wasm` in this case) - ```bash - mkdir build-wasm - cd build-wasm - ``` - - 2. Compile the artefacts - * If you want to package files into wasm binary then execute following commands (Replace `FILES_TO_PACKAGE` with the - directory containing all the files to be packaged) - - ```bash - emcmake cmake -DCOMPILE_WASM=on -DPACKAGE_DIR=FILES_TO_PACKAGE ../ - emmake make -j - ``` - e.g. If you want to package bergamot project specific files (downloaded using step 3 above) then - replace `FILES_TO_PACKAGE` with `../models` - - * If you don't want to package any file into wasm binary then execute following commands: - ```bash - emcmake cmake -DCOMPILE_WASM=on ../ - emmake make -j - ``` - - The wasm artifacts (.js and .wasm files) will be available in the build directory ("build-wasm" in this case). - - 3. Enable SIMD Wormhole via Wasm instantiation API in generated artifacts - ```bash - bash ../wasm/patch-artifacts-enable-wormhole.sh - ``` - #### Recompiling -As long as you don't update any submodule, just follow steps in `4.ii` and `4.iii` to recompile.\ -If you update a submodule, execute following command before executing steps in `4.ii` and `4.iii` to recompile. 
+As long as you don't update any submodule, just follow [Compile](#Compile) steps.\ +If you update a submodule, execute following command in repository root folder before executing +[Compile](#Compile) steps. ```bash git submodule update --init --recursive ``` From 6c063c607ee5a5ffc00a2fe64a6c32164699ceab Mon Sep 17 00:00:00 2001 From: Abhishek Aggarwal Date: Wed, 12 May 2021 14:46:02 +0200 Subject: [PATCH 03/12] Updated CMakeLists.txt to remove packaging steps for wasm compilation - Removed PACKAGE_DIR cmake option - Removed Workerfs, FORCE_FILESYSTEM=1 in wasm builds -- File system support is not needed any more (since model, shortlist and vocabs are being passed as bytes now) --- CMakeLists.txt | 2 -- wasm/CMakeLists.txt | 9 +-------- 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ef64863..332aed1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -42,8 +42,6 @@ option(COMPILE_WASM "Compile for WASM" OFF) cmake_dependent_option(USE_WASM_COMPATIBLE_SOURCE "Use wasm compatible sources" OFF "NOT COMPILE_WASM" ON) option(COMPILE_TESTS "Compile bergamot-tests" OFF) -SET(PACKAGE_DIR "" CACHE STRING "Directory including all the files to be packaged (pre-loaded) in wasm builds") - # Set 3rd party submodule specific cmake options for this project SET(COMPILE_CUDA OFF CACHE BOOL "Compile GPU version") SET(USE_SENTENCEPIECE ON CACHE BOOL "Download and compile SentencePiece") diff --git a/wasm/CMakeLists.txt b/wasm/CMakeLists.txt index 4962da0..7feef75 100644 --- a/wasm/CMakeLists.txt +++ b/wasm/CMakeLists.txt @@ -14,14 +14,7 @@ target_include_directories(bergamot-translator-worker target_compile_definitions(bergamot-translator-worker PRIVATE WASM_BINDINGS) target_compile_options(bergamot-translator-worker PRIVATE ${WASM_COMPILE_FLAGS}) -set(LINKER_FLAGS "-g2 --bind -s ASSERTIONS=0 -s DISABLE_EXCEPTION_CATCHING=1 -s FORCE_FILESYSTEM=1 -s ALLOW_MEMORY_GROWTH=1 -s NO_DYNAMIC_EXECUTION=1 -s 
EXPORTED_RUNTIME_METHODS=[addOnPreMain]") -if (NOT PACKAGE_DIR STREQUAL "") - get_filename_component(REALPATH_PACKAGE_DIR ${PACKAGE_DIR} REALPATH BASE_DIR ${CMAKE_BINARY_DIR}) - set(LINKER_FLAGS "${LINKER_FLAGS} --preload-file ${REALPATH_PACKAGE_DIR}@/") -endif() - -# Enable worker file system -set(LINKER_FLAGS "${LINKER_FLAGS} -lworkerfs.js") +set(LINKER_FLAGS "-g2 --bind -s ASSERTIONS=0 -s DISABLE_EXCEPTION_CATCHING=1 -s ALLOW_MEMORY_GROWTH=1 -s NO_DYNAMIC_EXECUTION=1 -s EXPORTED_RUNTIME_METHODS=[addOnPreMain]") # Avoid node.js-code in emscripten glue-code set(LINKER_FLAGS "${LINKER_FLAGS} -s ENVIRONMENT=web,worker") From 6c7e6156ab2816310d90e5f4d9489f96f10decb2 Mon Sep 17 00:00:00 2001 From: Qianqian Zhu Date: Thu, 13 May 2021 13:18:08 +0100 Subject: [PATCH 04/12] Bundle AlignedMemory inputs with MemoryBundle (#147) --- app/service-cli.cpp | 12 ++--- src/translator/byte_array_util.cpp | 8 ++++ src/translator/byte_array_util.h | 1 + src/translator/definitions.h | 36 ++++++++++++++ src/translator/service.cpp | 9 ++-- src/translator/service.h | 56 +++++++--------------- wasm/bindings/TranslationModelBindings.cpp | 16 +++++-- 7 files changed, 81 insertions(+), 57 deletions(-) diff --git a/app/service-cli.cpp b/app/service-cli.cpp index 0e958d6..fbf0131 100644 --- a/app/service-cli.cpp +++ b/app/service-cli.cpp @@ -16,19 +16,15 @@ int main(int argc, char *argv[]) { auto cp = marian::bergamot::createConfigParser(); auto options = cp.parseOptions(argc, argv, true); - // Prepare memories for model and shortlist - marian::bergamot::AlignedMemory modelBytes, shortlistBytes; - std::vector> vocabsBytes; + // Prepare memories for bytearrays (including model, shortlist and vocabs) + marian::bergamot::MemoryBundle memoryBundle; if (options->get("check-bytearray")) { // Load legit values into bytearrays. 
- modelBytes = marian::bergamot::getModelMemoryFromConfig(options); - shortlistBytes = marian::bergamot::getShortlistMemoryFromConfig(options); - marian::bergamot::getVocabsMemoryFromConfig(options, vocabsBytes); + memoryBundle = marian::bergamot::getMemoryBundleFromConfig(options); } - marian::bergamot::Service service(options, std::move(modelBytes), - std::move(shortlistBytes), std::move(vocabsBytes)); + marian::bergamot::Service service(options, std::move(memoryBundle)); // Read a large input text blob from stdin std::ostringstream std_input; diff --git a/src/translator/byte_array_util.cpp b/src/translator/byte_array_util.cpp index 00beaa6..69564d2 100644 --- a/src/translator/byte_array_util.cpp +++ b/src/translator/byte_array_util.cpp @@ -117,5 +117,13 @@ void getVocabsMemoryFromConfig(marian::Ptr options, } } +MemoryBundle getMemoryBundleFromConfig(marian::Ptr options){ + MemoryBundle memoryBundle; + memoryBundle.model = getModelMemoryFromConfig(options); + memoryBundle.shortlist = getShortlistMemoryFromConfig(options); + getVocabsMemoryFromConfig(options, memoryBundle.vocabs); + return memoryBundle; +} + } // namespace bergamot } // namespace marian diff --git a/src/translator/byte_array_util.h b/src/translator/byte_array_util.h index 3cbf3d3..14c79b3 100644 --- a/src/translator/byte_array_util.h +++ b/src/translator/byte_array_util.h @@ -10,5 +10,6 @@ AlignedMemory getShortlistMemoryFromConfig(marian::Ptr options) void getVocabsMemoryFromConfig(marian::Ptr options, std::vector>& vocabMemories); bool validateBinaryModel(const AlignedMemory& model, uint64_t fileSize); +MemoryBundle getMemoryBundleFromConfig(marian::Ptr options); } // namespace bergamot } // namespace marian diff --git a/src/translator/definitions.h b/src/translator/definitions.h index 58fd4b3..175397d 100644 --- a/src/translator/definitions.h +++ b/src/translator/definitions.h @@ -15,6 +15,42 @@ typedef std::vector Segments; /// Shortcut to AlignedVector for byte arrays typedef AlignedVector 
AlignedMemory; +/// Memory bundle for all byte-arrays. +/// Can be a set/subset of model, shortlist, vocabs and ssplitPrefixFile bytes. +struct MemoryBundle { + AlignedMemory model; ///< Byte-array of model (aligned to 256) + AlignedMemory shortlist; ///< Byte-array of shortlist (aligned to 64) + + /// Vector of vocabulary memories (aligned to 64). + /// If two vocabularies are the same (based on the filenames), two entries (shared + /// pointers) will be generated which share the same AlignedMemory object. + std::vector> vocabs; + + /// @todo Not implemented yet + AlignedMemory ssplitPrefixFile; + + MemoryBundle() = default; + + MemoryBundle(MemoryBundle &&from){ + model = std::move(from.model); + shortlist = std::move(from.shortlist); + vocabs = std::move(vocabs); + ssplitPrefixFile = std::move(from.ssplitPrefixFile); + } + + MemoryBundle &operator=(MemoryBundle &&from) { + model = std::move(from.model); + shortlist = std::move(from.shortlist); + vocabs = std::move(vocabs); + ssplitPrefixFile = std::move(from.ssplitPrefixFile); + return *this; + } + + // Delete copy constructors + MemoryBundle(const MemoryBundle&) = delete; + MemoryBundle& operator=(const MemoryBundle&) = delete; +}; + } // namespace bergamot } // namespace marian diff --git a/src/translator/service.cpp b/src/translator/service.cpp index 385a2a5..16c4743 100644 --- a/src/translator/service.cpp +++ b/src/translator/service.cpp @@ -41,14 +41,13 @@ loadVocabularies(marian::Ptr options, namespace marian { namespace bergamot { -Service::Service(Ptr options, AlignedMemory modelMemory, AlignedMemory shortlistMemory, - std::vector> vocabMemories) +Service::Service(Ptr options, MemoryBundle memoryBundle) : requestId_(0), options_(options), - vocabs_(std::move(loadVocabularies(options, std::move(vocabMemories)))), + vocabs_(std::move(loadVocabularies(options, std::move(memoryBundle.vocabs)))), text_processor_(vocabs_, options), batcher_(options), numWorkers_(options->get("cpu-threads")), - 
modelMemory_(std::move(modelMemory)), - shortlistMemory_(std::move(shortlistMemory)) + modelMemory_(std::move(memoryBundle.model)), + shortlistMemory_(std::move(memoryBundle.shortlist)) #ifndef WASM_COMPATIBLE_SOURCE // 0 elements in PCQueue is illegal and can lead to failures. Adding a // guard to have at least one entry allocated. In the single-threaded diff --git a/src/translator/service.h b/src/translator/service.h index 721d436..9d0a67d 100644 --- a/src/translator/service.h +++ b/src/translator/service.h @@ -55,53 +55,29 @@ namespace bergamot { /// // Do things with response. /// ``` /// -/// Optionally Service can be initialized by also passing model memory for -/// purposes of efficiency (which defaults to nullpointer and then reads from +/// Optionally Service can be initialized by also passing bytearray memories +/// for purposes of efficiency (which defaults to empty and then reads from /// file supplied through config). /// class Service { public: + /// Construct Service from Marian options. If memoryBundle is empty, Service is + /// initialized from file-based loading. Otherwise, Service is initialized from + /// the given bytearray memories. /// @param options Marian options object - /// @param modelMemory byte array (aligned to 256!!!) that contains the bytes - /// of a model.bin. - /// @param shortlistMemory byte array of shortlist (aligned to 64) - /// @param vocabMemories vector of vocabulary memories (aligned to 64) - explicit Service(Ptr options, AlignedMemory modelMemory, - AlignedMemory shortlistMemory, - std::vector> vocabMemories); + /// @param memoryBundle holds all byte-array memories. Can be a set/subset of + /// model, shortlist, vocabs and ssplitPrefixFile bytes. Optional. + explicit Service(Ptr options, MemoryBundle memoryBundle={}); - /// Construct Service purely from Options. 
This expects options which - /// marian-decoder expects to be set for loading model shortlist and - /// vocabularies from files in addition to parameters that set unset desired - /// features (e.g: alignments, quality-scores). - /// - /// This is equivalent to a call to: - /// ```cpp - /// Service(options, AlignedMemory(), AlignedMemory(), {}) - /// ``` - /// wherein empty memory is passed and internal flow defaults to file-based - /// model, shortlist loading. AlignedMemory() corresponds to empty memory - explicit Service(Ptr options) - : Service(options, AlignedMemory(), AlignedMemory(), {}) {} - - /// Construct Service from a string configuration. - /// @param [in] config string parsable as YAML expected to adhere with marian - /// config - /// @param [in] modelMemory byte array (aligned to 256!!!) that contains the - /// bytes of a model.bin. Optional. AlignedMemory() corresponds to empty memory - /// @param [in] shortlistMemory byte array of shortlist (aligned to 64). Optional. - /// @param [in] vocabMemories vector of vocabulary memories (aligned to 64). Optional. - /// If two vocabularies are the same (based on the filenames), two entries (shared - /// pointers) will be generated which share the same AlignedMemory object. - explicit Service(const std::string &config, - AlignedMemory modelMemory = AlignedMemory(), - AlignedMemory shortlistMemory = AlignedMemory(), - std::vector> vocabsMemories = {}) - : Service(parseOptions(config, /*validate=*/false), - std::move(modelMemory), - std::move(shortlistMemory), - std::move(vocabsMemories)) {} + /// Construct Service from a string configuration. If memoryBundle is empty, Service is + /// initialized from file-based loading. Otherwise, Service is initialized from + /// the given bytearray memories. + /// @param [in] config string parsable as YAML expected to adhere with marian config + /// @param [in] memoryBundle holds all byte-array memories. 
Can be a set/subset of + /// model, shortlist, vocabs and ssplitPrefixFile bytes. Optional. + explicit Service(const std::string &config, MemoryBundle memoryBundle={}) + : Service(parseOptions(config, /*validate=*/false), std::move(memoryBundle)) {} /// Explicit destructor to clean up after any threads initialized in /// asynchronous operation mode. diff --git a/wasm/bindings/TranslationModelBindings.cpp b/wasm/bindings/TranslationModelBindings.cpp index 4ee9265..1db7401 100644 --- a/wasm/bindings/TranslationModelBindings.cpp +++ b/wasm/bindings/TranslationModelBindings.cpp @@ -48,14 +48,22 @@ std::vector> prepareVocabsSmartMemories(std::vect return vocabsSmartMemories; } +marian::bergamot::MemoryBundle prepareMemoryBundle(AlignedMemory* modelMemory, + AlignedMemory* shortlistMemory, + std::vector uniqueVocabsMemories){ + marian::bergamot::MemoryBundle memoryBundle; + memoryBundle.model = std::move(*modelMemory); + memoryBundle.shortlist = std::move(*shortlistMemory); + memoryBundle.vocabs = std::move(prepareVocabsSmartMemories(uniqueVocabsMemories)); + + return memoryBundle; +} + TranslationModel* TranslationModelFactory(const std::string &config, AlignedMemory* modelMemory, AlignedMemory* shortlistMemory, std::vector uniqueVocabsMemories) { - return new TranslationModel(config, - std::move(*modelMemory), - std::move(*shortlistMemory), - std::move(prepareVocabsSmartMemories(uniqueVocabsMemories))); + return new TranslationModel(config, std::move(prepareMemoryBundle(modelMemory, shortlistMemory, uniqueVocabsMemories))); } EMSCRIPTEN_BINDINGS(translation_model) { From 77424a3df155a03c61338287a11b3c6c28815681 Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Mon, 17 May 2021 11:42:47 +0100 Subject: [PATCH 05/12] Enabling ccache on github builds for Ubuntu (#95) * CI Changes to add tiny regression tests * Adding an inspect cache step * Removing ccache, pursue in another * Incorporating Nick's changes through submodule merge * Submodule now points to master * Restoring 
ccache enabled workflow file * Restoring ccache enabled CMakeLists * cache -> ccache typo fix * Moving CCACHE setup to GitHub runner file * Find also uses CCACHE dir * Updating CMakeLists not to override env * Cache compiler binary's contents * Changing a few names to trigger new build; Testing cache looks fun * USE_CCACHE=on, -L for inspection * Adding a ccache_cmd, but will only use in next commit * Using ccache_cmd * Removing " * Adding compiler hash script * Bunch of absolute paths * GITHUB_WORKSPACE typo * Nah, I'll keep -L and trigger another build * Trying something with compiler hash on cache key backup as well * builtin, bash it seems * Empty commit #1 * Move ccache stats to after compile * Reshuffling ccache vars * No comments * Updates to Github output set syntax * Empty Commit 1 * Empty Commit 2 * Empty commit 3 * /bin/bash -> bash; ccache_cmd for consistency * Adding ccache -s before and after build * Adding comments to compiler-hash script * Let's build cached and non-cached variants together for comparison * Fixing quotes, /bin/bash -> bash * Minor var/env adjustment * Adding ccache -z before the job * Reverting CMakeLists.txt without CCACHE * Switching to CMAKE_LANG_COMPILER_LAUNCHER instead of CMakeLists.txt rule * 5G -> 1G cache size * 1G -> 2G; Hyperparameter tuning --- .github/workflows/native-ubuntu.yml | 89 ++++++++++++++++++++++++++++- scripts/ci/compiler-hash.sh | 35 ++++++++++++ 2 files changed, 122 insertions(+), 2 deletions(-) create mode 100644 scripts/ci/compiler-hash.sh diff --git a/.github/workflows/native-ubuntu.yml b/.github/workflows/native-ubuntu.yml index dc8016b..563daf7 100644 --- a/.github/workflows/native-ubuntu.yml +++ b/.github/workflows/native-ubuntu.yml @@ -15,6 +15,8 @@ jobs: - name: "full-marian" os: ubuntu-latest gcc: 8 + force_recache: false + ccache_cmd: "bash ${GITHUB_WORKSPACE}/scripts/ci/compiler-hash.sh %compiler%" cpu: 'ON' gpu: 'OFF' test_tags: "" @@ -24,10 +26,14 @@ jobs: USE_WASM_COMPATIBLE_SOURCE: "OFF" 
COMPILE_SERVER: "OFF" COMPILE_EXAMPLES: "OFF" + CMAKE_C_COMPILER_LAUNCHER: "ccache" + CMAKE_CXX_COMPILER_LAUNCHER: "ccache" - name: "minimal-marian" os: ubuntu-latest gcc: 8 + force_recache: false + ccache_cmd: "bash ${GITHUB_WORKSPACE}/scripts/ci/compiler-hash.sh %compiler%" cpu: 'ON' gpu: 'OFF' test_tags: "'#wasm'" @@ -37,6 +43,42 @@ jobs: USE_WASM_COMPATIBLE_SOURCE: "ON" COMPILE_SERVER: "OFF" COMPILE_EXAMPLES: "OFF" + CMAKE_C_COMPILER_LAUNCHER: "ccache" + CMAKE_CXX_COMPILER_LAUNCHER: "ccache" + + - name: "full-marian-force-recache" + os: ubuntu-latest + gcc: 8 + force_recache: true + ccache_cmd: "bash ${GITHUB_WORKSPACE}/scripts/ci/compiler-hash.sh %compiler%" + cpu: 'ON' + gpu: 'OFF' + test_tags: "" + cmake: + CMAKE_BUILD_TYPE: "Release" + COMPILE_TESTS: "ON" + USE_WASM_COMPATIBLE_SOURCE: "OFF" + COMPILE_SERVER: "OFF" + COMPILE_EXAMPLES: "OFF" + CMAKE_C_COMPILER_LAUNCHER: "ccache" + CMAKE_CXX_COMPILER_LAUNCHER: "ccache" + + - name: "minimal-marian-force-recache" + os: ubuntu-latest + gcc: 8 + force_recache: true + ccache_cmd: "bash ${GITHUB_WORKSPACE}/scripts/ci/compiler-hash.sh %compiler%" + cpu: 'ON' + gpu: 'OFF' + test_tags: "'#wasm'" + cmake: + CMAKE_BUILD_TYPE: "Release" + COMPILE_TESTS: "OFF" # Minimal marian has no sqlite support and COMPILE_TEST=ON fails. 
+ USE_WASM_COMPATIBLE_SOURCE: "ON" + COMPILE_SERVER: "OFF" + COMPILE_EXAMPLES: "OFF" + CMAKE_C_COMPILER_LAUNCHER: "ccache" + CMAKE_CXX_COMPILER_LAUNCHER: "ccache" runs-on: ${{ matrix.os }} @@ -57,7 +99,7 @@ jobs: sudo apt-get update sudo apt-get install -y \ libgoogle-perftools-dev libprotobuf-dev protobuf-compiler \ - libboost-all-dev g++-${{ matrix.gcc }} + libboost-all-dev g++-${{ matrix.gcc }} ccache # https://software.intel.com/content/www/us/en/develop/articles/installing-intel-free-libs-and-python-apt-repo.html - name: Install MKL @@ -68,6 +110,42 @@ jobs: sudo apt-get install -y --no-install-recommends intel-mkl-64bit-2020.0-088 if: matrix.cmake.USE_WASM_COMPATIBLE_SOURCE == 'OFF' + - name: Generate ccache_vars + id: ccache_vars + shell: bash + run: | + echo "::set-output name=hash::$(${{ matrix.ccache_cmd }})" + echo "::set-output name=timestamp::$(date '+%Y-%m-%dT%H.%M.%S')" + + - name: Setup ccache environment variables + run: | + echo "CCACHE_COMPILERCHECK=${{ matrix.ccache_cmd }}" >> $GITHUB_ENV + echo "CCACHE_BASEDIR=${{ github.workspace }}" >> $GITHUB_ENV + echo "CCACHE_DIR=${{ github.workspace }}/.ccache" >> $GITHUB_ENV + echo "CCACHE_COMPRESS=true" >> $GITHUB_ENV + echo "CCACHE_COMPRESSLEVEL=6" >> $GITHUB_ENV + echo "CCACHE_MAXSIZE=2G" >> $GITHUB_ENV + + - name: Setup ccache recache on + run: | + echo "CCACHE_RECACHE=" >> $GITHUB_ENV + if: matrix.force_recache == true + + - name: Cache-op for build-cache through ccache + uses: actions/cache@v2 + with: + path: ${{ env.CCACHE_DIR }} + key: ccache-${{ matrix.name }}-${{ steps.ccache_vars.outputs.hash }}-${{ github.ref }}-${{ steps.ccache_vars.outputs.timestamp }} + restore-keys: | + ccache-${{ matrix.name }}-${{ steps.ccache_vars.outputs.hash }}-${{ github.ref }}- + ccache-${{ matrix.name }}-${{ steps.ccache_vars.outputs.hash }}- + ccache-${{ matrix.name }}- + + - name: Cache stats before build + run: | + ccache -s + ccache -z + # Boost is installed on GitHub-hosted runners in a non-standard location 
# https://github.com/actions/virtual-environments/issues/687#issuecomment-610471671 - name: Configure CMake @@ -75,17 +153,24 @@ jobs: mkdir -p build cd build CC=/usr/bin/gcc-${{ matrix.gcc }} CXX=/usr/bin/g++-${{ matrix.gcc }} CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }} \ - cmake .. \ + cmake -L .. \ -DCMAKE_BUILD_TYPE=${{ matrix.cmake.CMAKE_BUILD_TYPE }}\ -DCOMPILE_TESTS=${{ matrix.cmake.COMPILE_TESTS }}\ -DCOMPILE_EXAMPLES=${{ matrix.cmake.COMPILE_EXAMPLES }} \ -DCOMPILE_SERVER=${{ matrix.cmake.COMPILE_SERVER }} \ -DUSE_WASM_COMPATIBLE_SOURCE=${{ matrix.cmake.USE_WASM_COMPATIBLE_SOURCE }} \ + -DCMAKE_C_COMPILER_LAUNCHER=${{ matrix.cmake.CMAKE_C_COMPILER_LAUNCHER}} \ + -DCMAKE_CXX_COMPILER_LAUNCHER=${{ matrix.cmake.CMAKE_CXX_COMPILER_LAUNCHER}} + - name: Compile bergamot-translator working-directory: build run: make -j2 + - name: Cache stats after build + run: | + ccache -s + - name: Run unit tests working-directory: build run: make test diff --git a/scripts/ci/compiler-hash.sh b/scripts/ci/compiler-hash.sh new file mode 100644 index 0000000..a770dfd --- /dev/null +++ b/scripts/ci/compiler-hash.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +# Uses the command from https://stackoverflow.com/a/9355840/4565794. +# -v displays the commands executed to run compilation. Of this cc1 additional +# flags which contain the flags triggered by -march=native is what we need. +# -E stop after preprocessing stage. 
+ +# Output on a linux machine with gcc-8 looks as follows: + +# $ gcc -march=native -E -v - < /dev/null 2>&1 | grep cc1 +# /usr/lib/gcc/x86_64-linux-gnu/8/cc1 -E -quiet -v -imultiarch x86_64-linux-gnu +# - -march=skylake-avx512 -mmmx -mno-3dnow -msse -msse2 -msse3 -mssse3 +# -mno-sse4a -mcx16 -msahf -mmovbe -maes -mno-sha -mpclmul -mpopcnt -mabm +# -mno-lwp -mfma -mno-fma4 -mno-xop -mbmi -mno-sgx -mbmi2 -mno-pconfig +# -mno-wbnoinvd -mno-tbm -mavx -mavx2 -msse4.2 -msse4.1 -mlzcnt -mno-rtm +# -mno-hle -mrdrnd -mf16c -mfsgsbase -mrdseed -mprfchw -madx -mfxsr -mxsave +# -mxsaveopt -mavx512f -mno-avx512er -mavx512cd -mno-avx512pf -mno-prefetchwt1 +# -mclflushopt -mxsavec -mxsaves -mavx512dq -mavx512bw -mavx512vl +# -mno-avx512ifma -mno-avx512vbmi -mno-avx5124fmaps -mno-avx5124vnniw -mclwb +# -mno-mwaitx -mno-clzero -mpku -mno-rdpid -mno-gfni -mno-shstk +# -mno-avx512vbmi2 -mavx512vnni -mno-vaes -mno-vpclmulqdq -mno-avx512bitalg +# -mno-movdiri -mno-movdir64b --param l1-cache-size=32 --param +# l1-cache-line-size=64 --param l2-cache-size=28160 -mtune=skylake-avx512 +# -fstack-protector-strong -Wformat -Wformat-security + +# The sha256sum of the output is computed, and stripped to the first 8 +# characters for use in ccache and github cache store key. Can effectively be +# considered as a hash of the compiler version and the flags activated by +# -march=native. 
+ +COMPILER=$1 + +$COMPILER -march=native -E -v - < /dev/null 2>&1 | grep cc1 \ + | sha256sum | cut -c1-8 + From 5bd1fc6b83d934198d6eed3b9bf50e751fa3d950 Mon Sep 17 00:00:00 2001 From: Qianqian Zhu Date: Mon, 17 May 2021 13:09:03 +0100 Subject: [PATCH 06/12] Refactor vocabs in Service (#143) Co-authored-by: Nikolay Bogoychev --- src/translator/batch_translator.cpp | 21 ++++---- src/translator/batch_translator.h | 5 +- src/translator/definitions.h | 21 -------- src/translator/response_builder.cpp | 3 +- src/translator/response_builder.h | 7 +-- src/translator/service.cpp | 35 +------------ src/translator/service.h | 3 +- src/translator/text_processor.cpp | 8 +-- src/translator/text_processor.h | 8 +-- src/translator/vocabs.h | 81 +++++++++++++++++++++++++++++ 10 files changed, 111 insertions(+), 81 deletions(-) create mode 100644 src/translator/vocabs.h diff --git a/src/translator/batch_translator.cpp b/src/translator/batch_translator.cpp index c627172..b35c4ce 100644 --- a/src/translator/batch_translator.cpp +++ b/src/translator/batch_translator.cpp @@ -10,11 +10,11 @@ namespace marian { namespace bergamot { BatchTranslator::BatchTranslator(DeviceId const device, - std::vector> &vocabs, + Vocabs &vocabs, Ptr options, const AlignedMemory* modelMemory, const AlignedMemory* shortlistMemory) - : device_(device), options_(options), vocabs_(&vocabs), + : device_(device), options_(options), vocabs_(vocabs), modelMemory_(modelMemory), shortlistMemory_(shortlistMemory) {} void BatchTranslator::initialize() { @@ -22,17 +22,17 @@ void BatchTranslator::initialize() { bool check = options_->get("check-bytearray",false); // Flag holds whether validate the bytearray (model and shortlist) if (options_->hasAndNotEmpty("shortlist")) { int srcIdx = 0, trgIdx = 1; - bool shared_vcb = vocabs_->front() == vocabs_->back(); + bool shared_vcb = vocabs_.sources().front() == vocabs_.target(); // vocabs_->sources().front() is invoked as we currently only support one source vocab if 
(shortlistMemory_->size() > 0 && shortlistMemory_->begin() != nullptr) { slgen_ = New(shortlistMemory_->begin(), shortlistMemory_->size(), - vocabs_->front(), vocabs_->back(), - srcIdx, trgIdx, shared_vcb, check); + vocabs_.sources().front(), vocabs_.target(), + srcIdx, trgIdx, shared_vcb, check); } else { // Changed to BinaryShortlistGenerator to enable loading binary shortlist file // This class also supports text shortlist file - slgen_ = New(options_, vocabs_->front(), - vocabs_->back(), srcIdx, + slgen_ = New(options_, vocabs_.sources().front(), + vocabs_.target(), srcIdx, trgIdx, shared_vcb); } } @@ -97,7 +97,7 @@ void BatchTranslator::translate(Batch &batch) { std::vector> subBatches; for (size_t j = 0; j < maxDims.size(); ++j) { subBatches.emplace_back( - New(batchSize, maxDims[j], vocabs_->at(j))); + New(batchSize, maxDims[j], vocabs_.sources().at(j))); } std::vector words(maxDims.size(), 0); @@ -116,9 +116,8 @@ void BatchTranslator::translate(Batch &batch) { auto corpus_batch = Ptr(new CorpusBatch(subBatches)); corpus_batch->setSentenceIds(sentenceIds); - - auto trgVocab = vocabs_->back(); - auto search = New(options_, scorers_, trgVocab); + + auto search = New(options_, scorers_, vocabs_.target()); auto histories = std::move(search->search(graph_, corpus_batch)); batch.completeBatch(histories); diff --git a/src/translator/batch_translator.h b/src/translator/batch_translator.h index 761a534..048ba77 100644 --- a/src/translator/batch_translator.h +++ b/src/translator/batch_translator.h @@ -11,6 +11,7 @@ #include "request.h" #include "translator/history.h" #include "translator/scorers.h" +#include "vocabs.h" #ifndef WASM_COMPATIBLE_SOURCE #include "pcqueue.h" @@ -34,7 +35,7 @@ public: * @param modelMemory byte array (aligned to 256!!!) that contains the bytes of a model.bin. Provide a nullptr if not used. 
* @param shortlistMemory byte array of shortlist (aligned to 64) */ - explicit BatchTranslator(DeviceId const device, std::vector> &vocabs, + explicit BatchTranslator(DeviceId const device, Vocabs &vocabs, Ptr options, const AlignedMemory* modelMemory, const AlignedMemory* shortlistMemory); // convenience function for logging. TODO(jerin) @@ -45,7 +46,7 @@ public: private: Ptr options_; DeviceId device_; - std::vector> *vocabs_; + const Vocabs& vocabs_; Ptr graph_; std::vector> scorers_; Ptr slgen_; diff --git a/src/translator/definitions.h b/src/translator/definitions.h index 175397d..bf1cb57 100644 --- a/src/translator/definitions.h +++ b/src/translator/definitions.h @@ -28,27 +28,6 @@ struct MemoryBundle { /// @todo Not implemented yet AlignedMemory ssplitPrefixFile; - - MemoryBundle() = default; - - MemoryBundle(MemoryBundle &&from){ - model = std::move(from.model); - shortlist = std::move(from.shortlist); - vocabs = std::move(vocabs); - ssplitPrefixFile = std::move(from.ssplitPrefixFile); - } - - MemoryBundle &operator=(MemoryBundle &&from) { - model = std::move(from.model); - shortlist = std::move(from.shortlist); - vocabs = std::move(vocabs); - ssplitPrefixFile = std::move(from.ssplitPrefixFile); - return *this; - } - - // Delete copy constructors - MemoryBundle(const MemoryBundle&) = delete; - MemoryBundle& operator=(const MemoryBundle&) = delete; }; } // namespace bergamot diff --git a/src/translator/response_builder.cpp b/src/translator/response_builder.cpp index f68bd31..037d456 100644 --- a/src/translator/response_builder.cpp +++ b/src/translator/response_builder.cpp @@ -65,11 +65,10 @@ void ResponseBuilder::buildTranslatedText(Histories &histories, Result result = onebest[0]; // Expecting only one result; Words words = std::get<0>(result); - auto targetVocab = vocabs_->back(); std::string decoded; std::vector targetSentenceMappings; - targetVocab->decodeWithByteRanges(words, decoded, targetSentenceMappings); + 
vocabs_.target()->decodeWithByteRanges(words, decoded, targetSentenceMappings); switch (responseOptions_.concatStrategy) { case ConcatStrategy::FAITHFUL: { diff --git a/src/translator/response_builder.h b/src/translator/response_builder.h index 85caffb..b8a8dd4 100644 --- a/src/translator/response_builder.h +++ b/src/translator/response_builder.h @@ -4,6 +4,7 @@ #include "data/types.h" #include "response.h" #include "response_options.h" +#include "vocabs.h" // For now we will work with this, to avoid complaints another structure is hard // to operate with. @@ -24,10 +25,10 @@ public: /// @param [in] vocabs: marian vocab object (used in decoding) /// @param [in] promise: promise to set with the constructed Response. ResponseBuilder(ResponseOptions responseOptions, AnnotatedText &&source, - std::vector> &vocabs, + Vocabs &vocabs, std::promise &&promise) : responseOptions_(responseOptions), source_(std::move(source)), - vocabs_(&vocabs), promise_(std::move(promise)) {} + vocabs_(vocabs), promise_(std::move(promise)) {} /// Constructs and sets the promise of a Response object from obtained /// histories after translating. @@ -81,7 +82,7 @@ private: // Data members are context/curried args for the functor. ResponseOptions responseOptions_; - std::vector> *vocabs_; // vocabs are required for decoding + const Vocabs& vocabs_; // vocabs are required for decoding // and any source validation checks. std::promise promise_; // To be set when callback triggered and // after Response constructed. 
diff --git a/src/translator/service.cpp b/src/translator/service.cpp index 16c4743..5439667 100644 --- a/src/translator/service.cpp +++ b/src/translator/service.cpp @@ -5,45 +5,12 @@ #include #include -inline std::vector> -loadVocabularies(marian::Ptr options, - std::vector>&& vocabMemories) { - // @TODO: parallelize vocab loading for faster startup - std::vector> vocabs; - if(!vocabMemories.empty()){ - // load vocabs from buffer - ABORT_IF(vocabMemories.size() < 2, "Insufficient number of vocabularies."); - vocabs.resize(vocabMemories.size()); - for (size_t i = 0; i < vocabs.size(); i++) { - marian::Ptr vocab = marian::New(options, i); - vocab->loadFromSerialized(absl::string_view(vocabMemories[i]->begin(), vocabMemories[i]->size())); - vocabs[i] = vocab; - } - } else { - // load vocabs from file - auto vfiles = options->get>("vocabs"); - // with the current setup, we need at least two vocabs: src and trg - ABORT_IF(vfiles.size() < 2, "Insufficient number of vocabularies."); - vocabs.resize(vfiles.size()); - std::unordered_map> vmap; - for (size_t i = 0; i < vocabs.size(); ++i) { - auto m = vmap.emplace(std::make_pair(vfiles[i], marian::Ptr())); - if (m.second) { // new: load the vocab - m.first->second = marian::New(options, i); - m.first->second->load(vfiles[i]); - } - vocabs[i] = m.first->second; - } - } - return vocabs; -} - namespace marian { namespace bergamot { Service::Service(Ptr options, MemoryBundle memoryBundle) : requestId_(0), options_(options), - vocabs_(std::move(loadVocabularies(options, std::move(memoryBundle.vocabs)))), + vocabs_(options, std::move(memoryBundle.vocabs)), text_processor_(vocabs_, options), batcher_(options), numWorkers_(options->get("cpu-threads")), modelMemory_(std::move(memoryBundle.model)), diff --git a/src/translator/service.h b/src/translator/service.h index 9d0a67d..6c60888 100644 --- a/src/translator/service.h +++ b/src/translator/service.h @@ -9,6 +9,7 @@ #include "response_builder.h" #include "text_processor.h" #include 
"translator/parser.h" +#include "vocabs.h" #ifndef WASM_COMPATIBLE_SOURCE #include "pcqueue.h" @@ -172,7 +173,7 @@ private: size_t requestId_; /// Store vocabs representing source and target. - std::vector> vocabs_; // ORDER DEPENDENCY (text_processor_) + Vocabs vocabs_; // ORDER DEPENDENCY (text_processor_) /// TextProcesser takes a blob of text and converts into format consumable by /// the batch-translator and annotates sentences and words. diff --git a/src/translator/text_processor.cpp b/src/translator/text_processor.cpp index fb66901..457e2b9 100644 --- a/src/translator/text_processor.cpp +++ b/src/translator/text_processor.cpp @@ -4,7 +4,6 @@ #include "annotation.h" #include "common/options.h" -#include "data/vocab.h" #include namespace marian { @@ -12,13 +11,14 @@ namespace bergamot { Segment TextProcessor::tokenize(const string_view &segment, std::vector &wordRanges) { - return vocabs_->front()->encodeWithByteRanges( + // vocabs_->sources().front() is invoked as we currently only support one source vocab + return vocabs_.sources().front()->encodeWithByteRanges( segment, wordRanges, /*addEOS=*/false, /*inference=*/true); } -TextProcessor::TextProcessor(std::vector> &vocabs, +TextProcessor::TextProcessor(Vocabs &vocabs, Ptr options) - : vocabs_(&vocabs), sentence_splitter_(options) { + : vocabs_(vocabs), sentence_splitter_(options) { max_length_break_ = options->get("max-length-break"); max_length_break_ = max_length_break_ - 1; diff --git a/src/translator/text_processor.h b/src/translator/text_processor.h index 698e36e..f5d4d88 100644 --- a/src/translator/text_processor.h +++ b/src/translator/text_processor.h @@ -7,6 +7,7 @@ #include "annotation.h" #include "sentence_splitter.h" +#include "vocabs.h" #include @@ -21,7 +22,7 @@ class TextProcessor { // sentences (vector of words). In addition, the ByteRanges of the // source-tokens in unnormalized text are provided as string_views. 
public: - explicit TextProcessor(std::vector> &vocabs, Ptr); + explicit TextProcessor(Vocabs &vocabs, Ptr); void process(AnnotatedText &source, Segments &segments); @@ -36,9 +37,10 @@ private: Segments &segments, AnnotatedText &source); // shorthand, used only in truncate() - const Word sourceEosId() const { return vocabs_->front()->getEosId(); } + // vocabs_->sources().front() is invoked as we currently only support one source vocab + const Word sourceEosId() const { return vocabs_.sources().front()->getEosId(); } - std::vector> *vocabs_; + const Vocabs& vocabs_; SentenceSplitter sentence_splitter_; size_t max_length_break_; }; diff --git a/src/translator/vocabs.h b/src/translator/vocabs.h new file mode 100644 index 0000000..89aed4b --- /dev/null +++ b/src/translator/vocabs.h @@ -0,0 +1,81 @@ +#pragma once + +namespace marian { +namespace bergamot { + +/// Wrapper of Marian Vocab objects needed for translator. +/// Holds multiple source vocabularies and one target vocabulary +class Vocabs { +public: + /// Construct vocabs object from either byte-arrays or files + Vocabs(Ptr options, std::vector>&& vocabMemories): options_(options){ + if (!vocabMemories.empty()){ + // load vocabs from buffer + load(std::move(vocabMemories)); + } + else{ + // load vocabs from file + auto vocabPaths = options->get>("vocabs"); + load(vocabPaths); + } + } + + /// Get all source vocabularies (as a vector) + const std::vector>& sources() const { + return srcVocabs_; + } + + /// Get the target vocabulary + const Ptr& target() const { + return trgVocab_; + } + +private: + std::vector> srcVocabs_; // source vocabularies + Ptr trgVocab_; // target vocabulary + Ptr options_; + + // load from buffer + void load(std::vector>&& vocabMemories) { + // At least two vocabs: src and trg + ABORT_IF(vocabMemories.size() < 2, "Insufficient number of vocabularies."); + srcVocabs_.resize(vocabMemories.size()); + // hashMap is introduced to avoid double loading the same vocab + // loading vocabs (either 
from buffers or files) is the biggest bottleneck of the speed + // uintptr_t holds unique keys (address) for shared_ptr + std::unordered_map> vmap; + for (size_t i = 0; i < srcVocabs_.size(); i++) { + auto m = vmap.emplace(std::make_pair(reinterpret_cast(vocabMemories[i].get()), Ptr())); + if (m.second) { // new: load the vocab + m.first->second = New(options_, i); + m.first->second->loadFromSerialized(absl::string_view(vocabMemories[i]->begin(), vocabMemories[i]->size())); + } + srcVocabs_[i] = m.first->second; + } + // Initialize target vocab + trgVocab_ = srcVocabs_.back(); + srcVocabs_.pop_back(); + } + + // load from file + void load(const std::vector& vocabPaths){ + // with the current setup, we need at least two vocabs: src and trg + ABORT_IF(vocabPaths.size() < 2, "Insufficient number of vocabularies."); + srcVocabs_.resize(vocabPaths.size()); + std::unordered_map> vmap; + for (size_t i = 0; i < srcVocabs_.size(); ++i) { + auto m = vmap.emplace(std::make_pair(vocabPaths[i], Ptr())); + if (m.second) { // new: load the vocab + m.first->second = New(options_, i); + m.first->second->load(vocabPaths[i]); + } + srcVocabs_[i] = m.first->second; + } + // Initialize target vocab + trgVocab_ = srcVocabs_.back(); + srcVocabs_.pop_back(); + } +}; + +} // namespace bergamot +} // namespace marian From 3e7058767222a8add4a505518c09ff0e7c6a2810 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Mon, 17 May 2021 16:42:18 +0100 Subject: [PATCH 07/12] Rewrite annotation class to remove corner cases (#135) --- src/tests/annotation_tests.cpp | 30 ++--- src/translator/annotation.cpp | 166 ++++++++--------------- src/translator/annotation.h | 197 +++++++++++++++++----------- src/translator/response_builder.cpp | 11 +- src/translator/text_processor.cpp | 14 +- src/translator/text_processor.h | 6 +- 6 files changed, 197 insertions(+), 227 deletions(-) diff --git a/src/tests/annotation_tests.cpp b/src/tests/annotation_tests.cpp index d323b9d..0f02a7a 100644 ---
a/src/tests/annotation_tests.cpp +++ b/src/tests/annotation_tests.cpp @@ -23,9 +23,6 @@ TEST_CASE("Test Annotation API with random sentences") { std::mt19937 randomIntGen_; randomIntGen_.seed(42); - AnnotatedText testAnnotation; // This the container we add through API and - // check if the access is correct. - // External book-keeping so we have ground truths. Each element represents a // sentence. @@ -45,7 +42,7 @@ TEST_CASE("Test Annotation API with random sentences") { // // 4-0 4-1 4-2 4-3 // - // Words are separated by space units. + // Tokens are contiguous because that's how SentencePiece works. // // Below, we accumulate the text with intended structure as above, and // ground-truth tables populated to be aware of the ByteRanges where they are @@ -53,9 +50,10 @@ TEST_CASE("Test Annotation API with random sentences") { if (debug) { std::cout << "Preparing text and ground truth-tables" << std::endl; } + std::string text; for (size_t idx = 0; idx < sentences; idx++) { if (idx != 0) - testAnnotation.text += "\n"; + text += "\n"; // Words can be zero, we need to support empty word sentences as well. size_t numWords = randomIntGen_() % maxWords; @@ -65,23 +63,16 @@ TEST_CASE("Test Annotation API with random sentences") { // For empty sentence, we expect it to be empty and marked in position where // the existing string is if needed to be pointed out. - size_t before = testAnnotation.text.size() - 1; + size_t before = text.size() - 1; size_t sentenceBegin{before}, sentenceEnd{before}; for (size_t idw = 0; idw < numWords; idw++) { - if (idw != 0) { - testAnnotation.text += " "; - if (debug) { - std::cout << " "; - } - } - // Get new beginning, accounting for space above. - before = testAnnotation.text.size(); + before = text.size(); // Add the word std::string word = std::to_string(idx) + "-" + std::to_string(idw); - testAnnotation.text += word; + text += word; // Do math, before, before + new-word's size. 
wordByteRanges.push_back((ByteRange){before, before + word.size()}); @@ -105,6 +96,9 @@ TEST_CASE("Test Annotation API with random sentences") { groundTruthSentences.push_back((ByteRange){sentenceBegin, sentenceEnd}); } + AnnotatedText testAnnotation(std::move(text)); // This is the container we add through API and + // check if the access is correct. + // We prepare string_views now with the known ByteRanges and use the // string_view based AnnotatedText.addSentence(...) API to add sentences to // transparently convert from string_views to ByteRanges, rebasing/working out @@ -116,6 +110,7 @@ TEST_CASE("Test Annotation API with random sentences") { } std::vector> wordStringViews; + std::vector::const_iterator sentence_iter = groundTruthSentences.begin(); for (auto &sentence : groundTruthWords) { std::vector wordByteRanges; bool first{true}; @@ -132,7 +127,8 @@ TEST_CASE("Test Annotation API with random sentences") { std::cout << std::string(wordView); } } - testAnnotation.addSentence(wordByteRanges); + testAnnotation.recordExistingSentence(wordByteRanges.begin(), wordByteRanges.end(), testAnnotation.text.data() + sentence_iter->begin); + ++sentence_iter; wordStringViews.push_back(wordByteRanges); if (debug) { std::cout << std::endl; @@ -207,7 +203,7 @@ TEST_CASE("Test Annotation API with random sentences") { // Sentence if the random test above does not cover it for some reason. int emptySentenceIdx = sentences; std::vector emptySentence; - testAnnotation.addSentence(emptySentence); + testAnnotation.recordExistingSentence(emptySentence.begin(), emptySentence.end(), testAnnotation.text.data() + testAnnotation.text.size()); // There are no words.
CHECK(testAnnotation.numWords(emptySentenceIdx) == 0); diff --git a/src/translator/annotation.cpp b/src/translator/annotation.cpp index c27d784..90e02e0 100644 --- a/src/translator/annotation.cpp +++ b/src/translator/annotation.cpp @@ -1,130 +1,68 @@ #include "annotation.h" #include -#include namespace marian { namespace bergamot { -void Annotation::addSentence(std::vector &sentence) { - flatByteRanges_.insert(std::end(flatByteRanges_), std::begin(sentence), - std::end(sentence)); - size_t size = flatByteRanges_.size(); - sentenceEndIds_.push_back(size); +AnnotatedText::AnnotatedText(std::string &&t) : text(std::move(t)) { + // Treat the entire text as a gap that recordExistingSentence will break. + annotation.token_begin_.back() = text.size(); } -size_t Annotation::numWords(size_t sentenceIdx) const { - size_t bosId, eosId; - bosId = sentenceEndIds_[sentenceIdx]; // Half interval, so; - eosId = sentenceEndIds_[sentenceIdx + 1]; - // Difference between eosId and bosId is the number of words. - return eosId - bosId; +void AnnotatedText::appendSentence(string_view prefix, std::vector::iterator begin, std::vector::iterator end) { + assert(annotation.token_begin_.back() == text.size()); + // We'll be adding tokens from the sentence and another gap. + annotation.token_begin_.reserve(annotation.token_begin_.size() + (end - begin) + 1); + + // prefix is just end of the previous one. + appendEndingWhitespace(prefix); + + // Appending sentence text. + std::size_t offset = text.size(); + for (std::vector::iterator token = begin; token != end; ++token) { + offset += token->size(); + annotation.token_begin_.push_back(offset); + } + if (begin != end) { + text.append(begin->data(), (end - 1)->data() + (end - 1)->size()); + assert(offset == text.size()); // Tokens should be contiguous. + } + + // Add the gap after the sentence. This is empty for now, but will be + // extended with appendEndingWhitespace or another appendSentence. 
+ annotation.gap_.push_back(annotation.token_begin_.size() - 1); + annotation.token_begin_.push_back(offset); } -ByteRange Annotation::sentence(size_t sentenceIdx) const { - size_t bosId, eosId; - bosId = sentenceEndIds_[sentenceIdx]; // Half interval, so; - eosId = sentenceEndIds_[sentenceIdx + 1]; - ByteRange sentenceByteRange; +void AnnotatedText::appendEndingWhitespace(string_view whitespace) { + text.append(whitespace.data(), whitespace.size()); + annotation.token_begin_.back() = text.size(); +} - if (bosId == eosId) { - // We have an empty sentence. However, we want to be able to point where in - // target this happened through the ranges. We are looking for the end of - // the flatByteRange and non-empty sentence before this happened and - // construct empty string-view equivalent ByteRange. - ByteRange eos = flatByteRanges_[eosId - 1]; - sentenceByteRange = ByteRange{eos.end, eos.end}; +void AnnotatedText::recordExistingSentence(std::vector::iterator begin, std::vector::iterator end, const char *sentence_begin) { + assert(sentence_begin >= text.data()); + assert(sentence_begin <= text.data() + text.size()); + assert(begin == end || sentence_begin == begin->data()); + assert(!annotation.token_begin_.empty()); + assert(annotation.token_begin_.back() == text.size()); + // Clip off size token ending. + annotation.token_begin_.resize(annotation.token_begin_.size() - 1); + for (std::vector::iterator i = begin; i != end; ++i) { + assert(i->data() >= text.data()); // In range. + assert(i->data() + i->size() <= text.data() + text.size()); // In range + assert(i + 1 == end || i->data() + i->size() == (i+1)->data()); // Contiguous + annotation.token_begin_.push_back(i->data() - text.data()); + } + // Gap token after sentence. 
+ annotation.gap_.push_back(annotation.token_begin_.size()); + if (begin != end) { + annotation.token_begin_.push_back((end - 1)->data() + (end - 1)->size() - text.data()); } else { - ByteRange bos = flatByteRanges_[bosId]; - ByteRange eos = flatByteRanges_[eosId - 1]; - sentenceByteRange = ByteRange{bos.begin, eos.end}; + // empty sentence. + annotation.token_begin_.push_back(sentence_begin - text.data()); } - return sentenceByteRange; -} - -ByteRange Annotation::word(size_t sentenceIdx, size_t wordIdx) const { - size_t bosOffset = sentenceEndIds_[sentenceIdx]; - return flatByteRanges_[bosOffset + wordIdx]; -} - -string_view AnnotatedText::word(size_t sentenceIdx, size_t wordIdx) const { - auto terminals = annotation.word(sentenceIdx, wordIdx); - return string_view(&text[terminals.begin], terminals.size()); -} - -string_view AnnotatedText::sentence(size_t sentenceIdx) const { - auto sentenceAsByteRange = annotation.sentence(sentenceIdx); - return asStringView(sentenceAsByteRange); -} - -void AnnotatedText::appendSentence(std::string prefix, std::string &reference, - std::vector &wordRanges) { - text += prefix; - size_t offset = text.size(); // Get size before to do ByteRange arithmetic - text += reference; // Append reference to text - std::vector sentence; - for (auto &wordView : wordRanges) { - size_t thisWordBegin = offset + wordView.data() - reference.data(); - sentence.push_back( - ByteRange{thisWordBegin, thisWordBegin + wordView.size()}); - } - annotation.addSentence(sentence); -} - -void AnnotatedText::addSentence(std::vector &wordRanges) { - addSentence(std::begin(wordRanges), std::end(wordRanges)); -}; - -void AnnotatedText::addSentence(std::vector::iterator begin, - std::vector::iterator end) { - std::vector sentence; - for (auto p = begin; p != end; p++) { - size_t begin_offset = p->data() - text.data(); - sentence.push_back(ByteRange{begin_offset, begin_offset + p->size()}); - } - annotation.addSentence(sentence); -}; - -ByteRange 
AnnotatedText::wordAsByteRange(size_t sentenceIdx, - size_t wordIdx) const { - return annotation.word(sentenceIdx, wordIdx); -} - -ByteRange AnnotatedText::sentenceAsByteRange(size_t sentenceIdx) const { - return annotation.sentence(sentenceIdx); -} - -string_view AnnotatedText::asStringView(const ByteRange &byteRange) const { - const char *data = &text[byteRange.begin]; - size_t size = byteRange.size(); - return string_view(data, size); -} - -string_view AnnotatedText::gap(size_t sentenceIdx) const { - // Find start of filler-text before, there's a corner case when there's no - // sentence before. - const char *start = nullptr; - if (sentenceIdx == 0) { - // If first sentence, filler begins at start of whole-text. - start = text.data(); - } else { - // Otherwise, filler begins at end of previous sentence. - string_view sentenceBefore = sentence(sentenceIdx - 1); - start = sentenceBefore.data() + sentenceBefore.size(); - } - - // Find end of filler-text, but there is a corner-case to handle. - const char *end = nullptr; - if (sentenceIdx == numSentences()) { - // If last sentence, manually find end of whole-text. - const char *begin = text.data(); - end = begin + text.size(); - } else { - // Otherwise, the filler ends at the start of next sentence. - string_view sentenceAfter = sentence(sentenceIdx); - end = sentenceAfter.data(); - } - - return string_view(start, end - start); + // Add back size token ending. + annotation.token_begin_.push_back(text.size()); } } // namespace bergamot diff --git a/src/translator/annotation.h b/src/translator/annotation.h index 8cb7caf..555ab53 100644 --- a/src/translator/annotation.h +++ b/src/translator/annotation.h @@ -17,83 +17,99 @@ struct ByteRange { const size_t size() const { return end - begin; } }; -/// An Annotation is a collection of ByteRanges used to denote ancillary -/// information of sentences and words on a text of string. 
Annotation is meant -/// for consumption on platforms where `string_view` creates problems (eg: -/// exports through WASM) conveniently rebasing them as required into -/// ByteRanges. See AnnotatedText for cases where this is a non-issue. +/// Annotation expresses sentence and token boundary information as ranges of +/// bytes in a string, but does not itself own the string. +/// +/// See also AnnotatedText, which owns Annotation and the string. AnnotatedText +/// wraps these ByteRange functions to provide a string_view interface. /// -/// **Usage** +/// Text is divided into gaps (whitespace between sentences) and sentences like +/// so: +/// gap sentence gap sentence gap +/// Because gaps appear at the beginning and end of the text, there's always +/// one more gap than there are sentences. /// -/// To ensure rebasing is consistent during creation and updation, use -/// `Annotation` best through `AnnotatedText`, which also holds the reference -/// string and can work with `string_views`. +/// The entire text is an unbroken sequence of tokens (i.e. the end of a token +/// is the beginning of the next token). A gap is exactly one token containing +/// whatever whitespace is between the sentences. A sentence is a sequence of +/// tokens. /// -/// If used separately, it is on the user to ensure the reference string -/// is the same as what the Annotation refers to. For best results, an instance -/// is expected to be read only in this mode of operation. +/// Since we are using SentencePiece, a token can include whitespace. The term +/// "word" is used, somewhat incorrectly, as a synonym of token. /// -/// **Idea** -/// -/// Annotation is intended to be the same structure conceptually as below, -/// except the `std::vector>` hammered into a flat -/// structure to avoid multiple reallocs keeping efficiency in mind. This is -/// achieved by having markers of where sentence ends in the flat container -/// storing word ByteRanges.
-/// -/// ```cpp -/// typedef ByteRange Word; -/// // std::vector, a single sentence -/// typedef std::vector Sentence; -/// std::vector // multiple sentences -/// typedef std::vector Annotation; -/// -/// Annotation example; -/// ``` -/// This structure exists to provide a consistent API to access the nested -/// sentences of varying lengths, which occur in source-text processed into -/// multiple sentences, and target-text translated from source as multiple -/// sentences, both composed of (sub)-words, providing a List[List] like access -/// while storing it in a compact and efficient manner. +/// A gap can be empty (for example there may not have been whitespace at the +/// beginning). A sentence can also be empty (typically the translation system +/// produced empty output). That's fine, these are just empty ranges as you +/// would expect. class Annotation { public: - /// Annotation is constructed empty. See `addSentence()` to populate it with - /// annotations. + /// Initially an empty string. Populated by AnnotatedText. Annotation() { - // The -1-th sentence ends at 0. - sentenceEndIds_.push_back(0); + token_begin_.push_back(0); + token_begin_.push_back(0); + gap_.push_back(0); } - size_t numSentences() const { return sentenceEndIds_.size() - 1; } + size_t numSentences() const { return gap_.size() - 1; } /// Returns number of words in the sentence identified by `sentenceIdx`. - size_t numWords(size_t sentenceIdx) const; - - /// Adds a sentences from `vector` representation, internally doing - /// extra book-keeping for the sentence terminal markings. Sentences are - /// expected to be added in order as they occur in text. - void addSentence(std::vector &sentence); + size_t numWords(size_t sentenceIdx) const { + return gap_[sentenceIdx + 1] - gap_[sentenceIdx] - 1 /* minus the gap */; + } /// Returns a ByteRange representing `wordIdx` in sentence indexed by /// `sentenceIdx`. 
`wordIdx` follows 0-based indexing, and should be less than /// `.numWords()` for `sentenceIdx` for defined behaviour. - ByteRange word(size_t sentenceIdx, size_t wordIdx) const; + ByteRange word(size_t sentenceIdx, size_t wordIdx) const { + size_t tokenIdx = gap_[sentenceIdx] + 1 + wordIdx; + return ByteRange {token_begin_[tokenIdx], token_begin_[tokenIdx + 1]}; + } /// Returns a ByteRange representing sentence corresponding to `sentenceIdx`. /// `sentenceIdx` follows 0-based indexing, and behaviour is defined only when /// less than `.numSentences()`. - ByteRange sentence(size_t sentenceIdx) const; + ByteRange sentence(size_t sentenceIdx) const { + return ByteRange { + token_begin_[gap_[sentenceIdx] + 1], /*end of whitespace before */ + token_begin_[gap_[sentenceIdx + 1]] /*beginning of whitespace after */ + }; + } + + ByteRange gap(size_t gapIdx) const { + size_t tokenIdx = gap_[gapIdx]; + return ByteRange {token_begin_[tokenIdx], token_begin_[tokenIdx + 1]}; + } private: - /// A flat storage for ByteRanges. Composed of word ByteRanges, extra - /// information in sentenceEndIds_ to denote sentence boundary markers as - /// indices. - std::vector flatByteRanges_; + friend class AnnotatedText; + /// Map from token index to byte offset at which it begins. Token i is: + /// [token_begin_[i], token_begin_[i+1]) + /// The vector is padded so that these indices are always valid, even at the + /// end. So token_begin_.size() is the number of tokens plus 1. + std::vector token_begin_; - /// Stores indices onto flatByteRanges_ of where sentences end (not inclusive, - /// aligned with C++ half interval notions). There is a 0 marker to simplify - /// sources, indicating where the -1-th sentence ends. - std::vector sentenceEndIds_; + /// Indices of tokens that correspond to gaps between sentences. These are + /// indices into token_begin_.
+ /// Gap g is byte range: + /// [token_begin_[gap_[g]], token_begin_[gap_[g]+1]) + /// Sentence s is byte range: + /// [token_begin_[gap_[s]+1], token_begin_[gap_[s+1]]) + /// A sentence does not include whitespace at the beginning or end. + /// + /// gap_.size() == numSentences() + 1. + /// + /// Example: empty text "" -> just an empty gap. + /// token_begin_ = {0, 0}; + /// gap_ = {0}; + /// + /// Example: only space " " -> just a gap containing the space. + /// token_begin_ = {0, 1}; + /// gap_ = {0}; + /// + /// Example: one token "hi" -> empty gap, sentence with one token, empty gap + /// token_begin_ = {0, 0, 2, 2}; + /// gap_ = {0, 2}; + std::vector gap_; }; /// AnnotatedText is effectively std::string text + Annotation, providing the @@ -107,7 +123,6 @@ private: /// /// 3. Bind the text and annotations together, to move around as a meaningful /// unit. - struct AnnotatedText { public: std::string text; ///< Blob of string elements in annotation refers to. @@ -122,7 +137,31 @@ public: /// Construct moving in a string (for efficiency purposes, copying string /// constructor is disallowed). - AnnotatedText(std::string &&text) : text(std::move(text)){}; + AnnotatedText(std::string &&text); + + /// Appends a sentence to the existing text and transparently rebases + /// string_views. Since this tracks only prefix, remember + /// appendEndingWhitespace. + /// The string_views must not already be in text. + void appendSentence( + string_view prefix, + std::vector::iterator tokens_begin, + std::vector::iterator tokens_end); + + /// Append the whitespace at the end of input. string_view must not be in + /// text. + void appendEndingWhitespace(string_view whitespace); + + /// Record the existence of a sentence that is already in text. The + /// iterators are over string_views for each token that must be in text + /// already. This function must be called to record sentences in order.
+ /// Normally the beginning of the sentence can be inferred from + /// tokens_begin->data() but the tokens could be empty, so sentence_begin is + /// required to know where the sentence is. + void recordExistingSentence( + std::vector::iterator tokens_begin, + std::vector::iterator tokens_end, + const char *sentence_begin); /// Returns the number of sentences in the annotation structure. const size_t numSentences() const { return annotation.numSentences(); } @@ -132,46 +171,44 @@ public: return annotation.numWords(sentenceIdx); } - /// Appends a sentence to the existing text and transparently rebases - /// string_views - void appendSentence(std::string prefix, std::string &reference, - std::vector &wordRanges); - - /// Adds a sentence, used to load from SentencePiece annotations conveniently. - void addSentence(std::vector &wordRanges); - - /// Adds a sentence between two iterators, often useful while constructing - /// from parts of a container. - void addSentence(std::vector::iterator begin, - std::vector::iterator end); - /// Returns a string_view representing wordIdx in sentenceIdx - string_view word(size_t sentenceIdx, size_t wordIdx) const; + string_view word(size_t sentenceIdx, size_t wordIdx) const { + return asStringView(annotation.word(sentenceIdx, wordIdx)); + } /// Returns a string_view representing sentence corresponding to sentenceIdx. - string_view sentence(size_t sentenceIdx) const; + string_view sentence(size_t sentenceIdx) const { + return asStringView(annotation.sentence(sentenceIdx)); + } /// Returns the string_view of the gap between two sentences in the container. /// /// More precisely where `i = sentenceIdx, N = numSentences()` for brevity: /// - /// * For `i = 0`: The gap between the start of text and the first sentence. + /// * For `i = 0`: The gap between the start of text and the 0th sentence. /// * For `i = 1...N-1`, returns the text comprising of the gap - /// between the `i-1`-th and `i`-th sentence. 
- /// * For `i = N`, the gap between the last sentence and end of + /// between the `i-1`-th and `i`-th sentence. + /// * For `i = N`, the gap between the last (N-1th) sentence and end of /// text. - /// @param sentenceIdx: Can be between `[0, numSentences()]`. - string_view gap(size_t sentenceIdx) const; + string_view gap(size_t sentenceIdx) const { + return asStringView(annotation.gap(sentenceIdx)); + } /// Returns a ByteRange representing wordIdx in sentenceIdx - ByteRange wordAsByteRange(size_t sentenceIdx, size_t wordIdx) const; + ByteRange wordAsByteRange(size_t sentenceIdx, size_t wordIdx) const { + return annotation.word(sentenceIdx, wordIdx); + } /// Returns a ByteRange representing sentence corresponding to sentenceIdx. - ByteRange sentenceAsByteRange(size_t sentenceIdx) const; + ByteRange sentenceAsByteRange(size_t sentenceIdx) const { + return annotation.sentence(sentenceIdx); + } private: - string_view asStringView(const ByteRange &byteRange) const; + string_view asStringView(const ByteRange &byteRange) const { + return string_view(text.data() + byteRange.begin, byteRange.size()); + } }; } // namespace bergamot diff --git a/src/translator/response_builder.cpp b/src/translator/response_builder.cpp index 037d456..b2f561b 100644 --- a/src/translator/response_builder.cpp +++ b/src/translator/response_builder.cpp @@ -75,22 +75,19 @@ void ResponseBuilder::buildTranslatedText(Histories &histories, // For each sentence, prepend the filler text between the corresponding // source-sentence and the source-sentence before. string_view pre = response.source.gap(sentenceIdx); - response.target.appendSentence(std::string(pre.data(), pre.size()), - decoded, targetSentenceMappings); + response.target.appendSentence(pre, targetSentenceMappings.begin(), targetSentenceMappings.end()); // If this is the last history to be decoded and translated-text // constructed, append the text till the end, which could be spaces or // empty.
if (sentenceIdx + 1 == histories.size()) { - string_view post = response.source.gap(sentenceIdx + 1); - response.target.text += std::string(post.data(), post.size()); + response.target.appendEndingWhitespace(response.source.gap(sentenceIdx + 1)); } break; } case ConcatStrategy::SPACE: { - std::string delimiter = (sentenceIdx == 0) ? "" : " "; - response.target.appendSentence(delimiter, decoded, - targetSentenceMappings); + string_view delimiter = (sentenceIdx == 0) ? "" : " "; + response.target.appendSentence(delimiter, targetSentenceMappings.begin(), targetSentenceMappings.end()); break; } diff --git a/src/translator/text_processor.cpp b/src/translator/text_processor.cpp index 457e2b9..bca5fd1 100644 --- a/src/translator/text_processor.cpp +++ b/src/translator/text_processor.cpp @@ -41,15 +41,16 @@ void TextProcessor::process(AnnotatedText &source, Segments &segments) { // There are some cases where SentencePiece or vocab returns no words // after normalization. 0 prevents any empty entries from being added. if (segment.size() > 0) { - // Truncate segment into max_input_size segments. - truncate(segment, wordRanges, segments, source); + // Wrap segment into sentences of at most max_length_break_ tokens and + // tell source about them. 
+ wrap(segment, wordRanges, segments, source); } } } -void TextProcessor::truncate(Segment &segment, - std::vector &wordRanges, - Segments &segments, AnnotatedText &source) { +void TextProcessor::wrap(Segment &segment, + std::vector &wordRanges, + Segments &segments, AnnotatedText &source) { for (size_t offset = 0; offset < segment.size(); offset += max_length_break_) { auto start = segment.begin() + offset; @@ -61,7 +62,8 @@ void TextProcessor::truncate(Segment &segment, segments.back().push_back(sourceEosId()); auto astart = wordRanges.begin() + offset; - source.addSentence(astart, astart + diff); + // diff > 0 + source.recordExistingSentence(astart, astart + diff, astart->data()); } } diff --git a/src/translator/text_processor.h b/src/translator/text_processor.h index f5d4d88..7328877 100644 --- a/src/translator/text_processor.h +++ b/src/translator/text_processor.h @@ -32,9 +32,9 @@ private: Segment tokenize(const string_view &input, std::vector &tokenRanges); - // Truncate sentence into max_input_size segments. - void truncate(Segment &sentence, std::vector &tokenRanges, - Segments &segments, AnnotatedText &source); + // Wrap into sentences of at most max_length_break_ tokens and add to source. 
+ void wrap(Segment &sentence, std::vector &tokenRanges, + Segments &segments, AnnotatedText &source); // shorthand, used only in truncate() // vocabs_->sources().front() is invoked as we currently only support one source vocab From c1ef6f2bcb08fcd4f9e2432ae443bb7a81813594 Mon Sep 17 00:00:00 2001 From: Abhishek Aggarwal Date: Mon, 17 May 2021 17:33:23 +0200 Subject: [PATCH 08/12] Added cmake file to compute version information - Reads BERGAMOT_VERSION file for generating various strings for versioning --- cmake/GetVersionFromFile.cmake | 60 ++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 cmake/GetVersionFromFile.cmake diff --git a/cmake/GetVersionFromFile.cmake b/cmake/GetVersionFromFile.cmake new file mode 100644 index 0000000..2eadb42 --- /dev/null +++ b/cmake/GetVersionFromFile.cmake @@ -0,0 +1,60 @@ +## +# This CMake module sets the project version from a version file. +# +# The module sets the following variables: +# +# * PROJECT_VERSION_STRING +# * PROJECT_VERSION_STRING_FULL +# * PROJECT_VERSION_MAJOR +# * PROJECT_VERSION_MINOR +# * PROJECT_VERSION_PATCH +# * PROJECT_VERSION_TWEAK +# * PROJECT_VERSION_GIT_SHA +# +# This module is public domain, use it as it fits you best.
+## + +# Get full string version from file +if(PROJECT_VERSION_FILE) + file(STRINGS ${PROJECT_VERSION_FILE} PROJECT_VERSION_STRING) +else() + file(STRINGS ${CMAKE_CURRENT_SOURCE_DIR}/BERGAMOT_VERSION PROJECT_VERSION_STRING) +endif() + +# Get current commit SHA from git +execute_process(COMMAND ${GIT_EXECUTABLE} rev-parse --short HEAD + WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} + OUTPUT_VARIABLE PROJECT_VERSION_GIT_SHA + OUTPUT_STRIP_TRAILING_WHITESPACE) + +# Get partial versions into a list +string(REGEX MATCHALL "-.*$|[0-9]+" PROJECT_PARTIAL_VERSION_LIST + ${PROJECT_VERSION_STRING}) + +# Set the version numbers +list(GET PROJECT_PARTIAL_VERSION_LIST 0 PROJECT_VERSION_MAJOR) +list(GET PROJECT_PARTIAL_VERSION_LIST 1 PROJECT_VERSION_MINOR) +list(GET PROJECT_PARTIAL_VERSION_LIST 2 PROJECT_VERSION_PATCH) + +# The tweak part is optional, so check if the list contains it +list(LENGTH PROJECT_PARTIAL_VERSION_LIST PROJECT_PARTIAL_VERSION_LIST_LEN) +if(PROJECT_PARTIAL_VERSION_LIST_LEN GREATER 3) + list(GET PROJECT_PARTIAL_VERSION_LIST 3 PROJECT_VERSION_TWEAK) + string(SUBSTRING ${PROJECT_VERSION_TWEAK} 1 -1 PROJECT_VERSION_TWEAK) +endif() + +# Unset the list +unset(PROJECT_PARTIAL_VERSION_LIST) + +# Set full project version string +set(PROJECT_VERSION_STRING_FULL + ${PROJECT_VERSION_STRING}+${PROJECT_VERSION_GIT_SHA}) + +# Print all variables for debugging +#message(STATUS ${PROJECT_VERSION_STRING_FULL}) +#message(STATUS ${PROJECT_VERSION_STRING}) +#message(STATUS ${PROJECT_VERSION_MAJOR}) +#message(STATUS ${PROJECT_VERSION_MINOR}) +#message(STATUS ${PROJECT_VERSION_PATCH}) +#message(STATUS ${PROJECT_VERSION_TWEAK}) +#message(STATUS ${PROJECT_VERSION_GIT_SHA}) From c44868e1fdd56e1562afee18773a9ee1d08d7689 Mon Sep 17 00:00:00 2001 From: Abhishek Aggarwal Date: Mon, 17 May 2021 17:34:57 +0200 Subject: [PATCH 09/12] Import GetVersionFromFile cmake file in root level CMakeLists.txt --- CMakeLists.txt | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/CMakeLists.txt 
b/CMakeLists.txt index 332aed1..e561ed9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,5 @@ cmake_minimum_required(VERSION 3.5.1) +set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake) if (POLICY CMP0074) cmake_policy(SET CMP0074 NEW) # CMake 3.12 @@ -71,6 +72,11 @@ if(GIT_FOUND AND EXISTS "${PROJECT_SOURCE_DIR}/.git") endif() endif() +# Project versioning +include(GetVersionFromFile) +message(STATUS "Project name: ${PROJECT_NAME}") +message(STATUS "Project version: ${PROJECT_VERSION_STRING_FULL}") + if(NOT COMPILE_WASM) # Set BUILD_ARCH to native only while compiling for non wasm platform set(BUILD_ARCH native CACHE STRING "Compile for this CPU architecture.") From 2e5880d3d499094e80eaae235fabf86068eb2f00 Mon Sep 17 00:00:00 2001 From: Abhishek Aggarwal Date: Mon, 17 May 2021 17:37:10 +0200 Subject: [PATCH 10/12] Modified wasm cmake file to include version information in built artifacts --- wasm/CMakeLists.txt | 7 +++++++ wasm/project_version.js.in | 1 + 2 files changed, 8 insertions(+) create mode 100644 wasm/project_version.js.in diff --git a/wasm/CMakeLists.txt b/wasm/CMakeLists.txt index 7feef75..72b22c1 100644 --- a/wasm/CMakeLists.txt +++ b/wasm/CMakeLists.txt @@ -4,6 +4,10 @@ add_executable(bergamot-translator-worker bindings/TranslationResultBindings.cpp ) +# Generate version file that can be included in the wasm artifacts +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/project_version.js.in + ${CMAKE_CURRENT_SOURCE_DIR}/project_version.js @ONLY) + # This header inclusion needs to go away later as path to public headers of bergamot # translator should be directly available from "bergamot-translator" target target_include_directories(bergamot-translator-worker @@ -19,6 +23,9 @@ set(LINKER_FLAGS "-g2 --bind -s ASSERTIONS=0 -s DISABLE_EXCEPTION_CATCHING=1 -s # Avoid node.js-code in emscripten glue-code set(LINKER_FLAGS "${LINKER_FLAGS} -s ENVIRONMENT=web,worker") +# Append version information in the Javascript artifact +set(LINKER_FLAGS 
"${LINKER_FLAGS} --extern-pre-js ${CMAKE_CURRENT_SOURCE_DIR}/project_version.js") + set_target_properties(bergamot-translator-worker PROPERTIES SUFFIX ".js" LINK_FLAGS ${LINKER_FLAGS} diff --git a/wasm/project_version.js.in b/wasm/project_version.js.in new file mode 100644 index 0000000..9a4095f --- /dev/null +++ b/wasm/project_version.js.in @@ -0,0 +1 @@ +var BERGAMOT_VERSION_FULL = "@PROJECT_VERSION_STRING_FULL@"; \ No newline at end of file From 0ad583cc34affac322e36ab0e4b5d4310309db74 Mon Sep 17 00:00:00 2001 From: Abhishek Aggarwal Date: Mon, 17 May 2021 17:41:59 +0200 Subject: [PATCH 11/12] Generate project version file for native builds - The header file exposes a function that provides version information for native binaries --- src/translator/CMakeLists.txt | 4 ++++ src/translator/project_version.h.in | 19 +++++++++++++++++++ 2 files changed, 23 insertions(+) create mode 100644 src/translator/project_version.h.in diff --git a/src/translator/CMakeLists.txt b/src/translator/CMakeLists.txt index a7ba0d3..1d48d59 100644 --- a/src/translator/CMakeLists.txt +++ b/src/translator/CMakeLists.txt @@ -1,3 +1,7 @@ +# Generate version file +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/project_version.h.in + ${CMAKE_CURRENT_SOURCE_DIR}/project_version.h @ONLY) + add_library(bergamot-translator STATIC byte_array_util.cpp text_processor.cpp diff --git a/src/translator/project_version.h.in b/src/translator/project_version.h.in new file mode 100644 index 0000000..b7a0d04 --- /dev/null +++ b/src/translator/project_version.h.in @@ -0,0 +1,19 @@ +#pragma once + +/* + * File project_version.h is generated using CMake. Do not modify project_version.h manually! + * Edit project_version.h.in file instead. + */ + +#include + +namespace marian { +namespace bergamot { + +std::string bergamotBuildVersion() { + // e.g. 
v1.2.3-alpha.1.1+abc123d + return "@PROJECT_VERSION_STRING_FULL@"; +} + +} // namespace bergamot +} // namespace marian From 067076fbc180bac04492252eadda8497c22065c3 Mon Sep 17 00:00:00 2001 From: Abhishek Aggarwal Date: Mon, 17 May 2021 17:38:00 +0200 Subject: [PATCH 12/12] Bumped version to 0.3.0 - This brings the version info in sync with the various releases of extension --- BERGAMOT_VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/BERGAMOT_VERSION b/BERGAMOT_VERSION index ae39fab..268b033 100644 --- a/BERGAMOT_VERSION +++ b/BERGAMOT_VERSION @@ -1 +1 @@ -v0.0.0 +v0.3.0