Merge remote-tracking branch 'upstream/main' into main

- Sync with upstream (https://github.com/browsermt/bergamot-translator)
This commit is contained in:
Abhishek Aggarwal 2021-05-18 08:48:41 +02:00
commit b73714e222
28 changed files with 633 additions and 430 deletions

View File

@ -15,6 +15,8 @@ jobs:
- name: "full-marian"
os: ubuntu-latest
gcc: 8
force_recache: false
ccache_cmd: "bash ${GITHUB_WORKSPACE}/scripts/ci/compiler-hash.sh %compiler%"
cpu: 'ON'
gpu: 'OFF'
test_tags: ""
@ -24,10 +26,14 @@ jobs:
USE_WASM_COMPATIBLE_SOURCE: "OFF"
COMPILE_SERVER: "OFF"
COMPILE_EXAMPLES: "OFF"
CMAKE_C_COMPILER_LAUNCHER: "ccache"
CMAKE_CXX_COMPILER_LAUNCHER: "ccache"
- name: "minimal-marian"
os: ubuntu-latest
gcc: 8
force_recache: false
ccache_cmd: "bash ${GITHUB_WORKSPACE}/scripts/ci/compiler-hash.sh %compiler%"
cpu: 'ON'
gpu: 'OFF'
test_tags: "'#wasm'"
@ -37,6 +43,42 @@ jobs:
USE_WASM_COMPATIBLE_SOURCE: "ON"
COMPILE_SERVER: "OFF"
COMPILE_EXAMPLES: "OFF"
CMAKE_C_COMPILER_LAUNCHER: "ccache"
CMAKE_CXX_COMPILER_LAUNCHER: "ccache"
- name: "full-marian-force-recache"
os: ubuntu-latest
gcc: 8
force_recache: true
ccache_cmd: "bash ${GITHUB_WORKSPACE}/scripts/ci/compiler-hash.sh %compiler%"
cpu: 'ON'
gpu: 'OFF'
test_tags: ""
cmake:
CMAKE_BUILD_TYPE: "Release"
COMPILE_TESTS: "ON"
USE_WASM_COMPATIBLE_SOURCE: "OFF"
COMPILE_SERVER: "OFF"
COMPILE_EXAMPLES: "OFF"
CMAKE_C_COMPILER_LAUNCHER: "ccache"
CMAKE_CXX_COMPILER_LAUNCHER: "ccache"
- name: "minimal-marian-force-recache"
os: ubuntu-latest
gcc: 8
force_recache: true
ccache_cmd: "bash ${GITHUB_WORKSPACE}/scripts/ci/compiler-hash.sh %compiler%"
cpu: 'ON'
gpu: 'OFF'
test_tags: "'#wasm'"
cmake:
CMAKE_BUILD_TYPE: "Release"
COMPILE_TESTS: "OFF" # Minimal marian has no sqlite support and COMPILE_TEST=ON fails.
USE_WASM_COMPATIBLE_SOURCE: "ON"
COMPILE_SERVER: "OFF"
COMPILE_EXAMPLES: "OFF"
CMAKE_C_COMPILER_LAUNCHER: "ccache"
CMAKE_CXX_COMPILER_LAUNCHER: "ccache"
runs-on: ${{ matrix.os }}
@ -57,7 +99,7 @@ jobs:
sudo apt-get update
sudo apt-get install -y \
libgoogle-perftools-dev libprotobuf-dev protobuf-compiler \
libboost-all-dev g++-${{ matrix.gcc }}
libboost-all-dev g++-${{ matrix.gcc }} ccache
# https://software.intel.com/content/www/us/en/develop/articles/installing-intel-free-libs-and-python-apt-repo.html
- name: Install MKL
@ -68,6 +110,42 @@ jobs:
sudo apt-get install -y --no-install-recommends intel-mkl-64bit-2020.0-088
if: matrix.cmake.USE_WASM_COMPATIBLE_SOURCE == 'OFF'
- name: Generate ccache_vars
id: ccache_vars
shell: bash
run: |
echo "::set-output name=hash::$(${{ matrix.ccache_cmd }})"
echo "::set-output name=timestamp::$(date '+%Y-%m-%dT%H.%M.%S')"
- name: Setup ccache environment variables
run: |
echo "CCACHE_COMPILERCHECK=${{ matrix.ccache_cmd }}" >> $GITHUB_ENV
echo "CCACHE_BASE_DIR=${{ github.workspace }}" >> $GITHUB_ENV
echo "CCACHE_DIR=${{ github.workspace }}/.ccache" >> $GITHUB_ENV
echo "CCACHE_COMPRESS=true" >> $GITHUB_ENV
echo "CCACHE_COMPRESSLEVEL=6" >> $GITHUB_ENV
echo "CCACHE_MAXSIZE=2G" >> $GITHUB_ENV
- name: Setup ccache recache on
run: |
echo "CCACHE_RECACHE=" >> $GITHUB_ENV
if: matrix.force_recache == true
- name: Cache-op for build-cache through ccache
uses: actions/cache@v2
with:
path: ${{ env.CCACHE_DIR }}
key: ccache-${{ matrix.name }}-${{ steps.ccache_vars.outputs.hash }}-${{ github.ref }}-${{ steps.ccache_vars.outputs.timestamp }}
restore-keys: |
ccache-${{ matrix.name }}-${{ steps.ccache_vars.outputs.hash }}-${{ github.ref }}-
ccache-${{ matrix.name }}-${{ steps.ccache_vars.outputs.hash }}-
ccache-${{ matrix.name }}-
- name: Cache stats before build
run: |
ccache -s
ccache -z
# Boost is installed on GitHub-hosted runners in a non-standard location
# https://github.com/actions/virtual-environments/issues/687#issuecomment-610471671
- name: Configure CMake
@ -75,17 +153,24 @@ jobs:
mkdir -p build
cd build
CC=/usr/bin/gcc-${{ matrix.gcc }} CXX=/usr/bin/g++-${{ matrix.gcc }} CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }} \
cmake .. \
cmake -L .. \
-DCMAKE_BUILD_TYPE=${{ matrix.cmake.CMAKE_BUILD_TYPE }}\
-DCOMPILE_TESTS=${{ matrix.cmake.COMPILE_TESTS }}\
-DCOMPILE_EXAMPLES=${{ matrix.cmake.COMPILE_EXAMPLES }} \
-DCOMPILE_SERVER=${{ matrix.cmake.COMPILE_SERVER }} \
-DUSE_WASM_COMPATIBLE_SOURCE=${{ matrix.cmake.USE_WASM_COMPATIBLE_SOURCE }} \
-DCMAKE_C_COMPILER_LAUNCHER=${{ matrix.cmake.CMAKE_C_COMPILER_LAUNCHER}} \
-DCMAKE_CXX_COMPILER_LAUNCHER=${{ matrix.cmake.CMAKE_CXX_COMPILER_LAUNCHER}}
- name: Compile bergamot-translator
working-directory: build
run: make -j2
- name: Cache stats after build
run: |
ccache -s
- name: Run unit tests
working-directory: build
run: make test

View File

@ -1 +1 @@
v0.0.0
v0.3.0

View File

@ -1,4 +1,5 @@
cmake_minimum_required(VERSION 3.5.1)
set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
if (POLICY CMP0074)
cmake_policy(SET CMP0074 NEW) # CMake 3.12
@ -42,8 +43,6 @@ option(COMPILE_WASM "Compile for WASM" OFF)
cmake_dependent_option(USE_WASM_COMPATIBLE_SOURCE "Use wasm compatible sources" OFF "NOT COMPILE_WASM" ON)
option(COMPILE_TESTS "Compile bergamot-tests" OFF)
SET(PACKAGE_DIR "" CACHE STRING "Directory including all the files to be packaged (pre-loaded) in wasm builds")
# Set 3rd party submodule specific cmake options for this project
SET(COMPILE_CUDA OFF CACHE BOOL "Compile GPU version")
SET(USE_SENTENCEPIECE ON CACHE BOOL "Download and compile SentencePiece")
@ -73,6 +72,11 @@ if(GIT_FOUND AND EXISTS "${PROJECT_SOURCE_DIR}/.git")
endif()
endif()
# Project versioning
include(GetVersionFromFile)
message(STATUS "Project name: ${PROJECT_NAME}")
message(STATUS "Project version: ${PROJECT_VERSION_STRING_FULL}")
if(NOT COMPILE_WASM)
# Set BUILD_ARCH to native only while compiling for non wasm platform
set(BUILD_ARCH native CACHE STRING "Compile for this CPU architecture.")

View File

@ -7,85 +7,47 @@ Bergamot translator provides a unified API for ([Marian NMT](https://marian-nmt.
## Build Instructions
### Build Natively
1. Clone the repository using these instructions:
```bash
git clone https://github.com/mozilla/bergamot-translator
cd bergamot-translator
```
2. Compile
Create a folder where you want to build all the artifacts (`build-native` in this case) and compile
Create a folder where you want to build all the artifacts (`build-native` in this case) and compile in that folder
```bash
mkdir build-native
cd build-native
cmake ../
make -j
```
```bash
mkdir build-native
cd build-native
cmake ../
make -j3
```
### Build WASM
#### Compiling for the first time
#### Prerequisite
1. Download and Install Emscripten using following instructions
* Get the latest sdk: `git clone https://github.com/emscripten-core/emsdk.git`
* Enter the cloned directory: `cd emsdk`
* Install the latest sdk tools: `./emsdk install latest`
* Activate the latest sdk tools: `./emsdk activate latest`
* Activate path variables: `source ./emsdk_env.sh`
Building on wasm requires the Emscripten toolchain. It can be downloaded and installed using the following instructions:
2. Clone the repository using these instructions:
* Get the latest sdk: `git clone https://github.com/emscripten-core/emsdk.git`
* Enter the cloned directory: `cd emsdk`
* Install the latest sdk tools: `./emsdk install latest`
* Activate the latest sdk tools: `./emsdk activate latest`
* Activate path variables: `source ./emsdk_env.sh`
#### <a name="Compile"></a> Compile
1. Create a folder where you want to build all the artifacts (`build-wasm` in this case) and compile
```bash
git clone https://github.com/mozilla/bergamot-translator
cd bergamot-translator
mkdir build-wasm
cd build-wasm
emcmake cmake -DCOMPILE_WASM=on ../
emmake make -j3
```
3. Download files (only required if you want to perform inference using build artifacts)
The wasm artifacts (.js and .wasm files) will be available in the build directory ("build-wasm" in this case).
It packages the vocabulary files into wasm binary, which is required only if you want to perform inference.
The compilation commands will preload these files in Emscripten's virtual file system.
If you want to package bergamot project specific files, please follow these instructions:
2. Enable SIMD Wormhole via Wasm instantiation API in generated artifacts
```bash
git clone --depth 1 --branch main --single-branch https://github.com/mozilla-applied-ml/bergamot-models
mkdir models
cp -rf bergamot-models/prod/* models
gunzip models/*/*
find models \( -type f -name "model*" -or -type f -name "lex*" \) -delete
bash ../wasm/patch-artifacts-enable-wormhole.sh
```
4. Compile
1. Create a folder where you want to build all the artifacts (`build-wasm` in this case)
```bash
mkdir build-wasm
cd build-wasm
```
2. Compile the artifacts
* If you want to package files into wasm binary then execute following commands (Replace `FILES_TO_PACKAGE` with the
directory containing all the files to be packaged)
```bash
emcmake cmake -DCOMPILE_WASM=on -DPACKAGE_DIR=FILES_TO_PACKAGE ../
emmake make -j
```
e.g. If you want to package bergamot project specific files (downloaded using step 3 above) then
replace `FILES_TO_PACKAGE` with `../models`
* If you don't want to package any file into wasm binary then execute following commands:
```bash
emcmake cmake -DCOMPILE_WASM=on ../
emmake make -j
```
The wasm artifacts (.js and .wasm files) will be available in the build directory ("build-wasm" in this case).
3. Enable SIMD Wormhole via Wasm instantiation API in generated artifacts
```bash
bash ../wasm/patch-artifacts-enable-wormhole.sh
```
#### Recompiling
As long as you don't update any submodule, just follow steps in `4.ii` and `4.iii` to recompile.\
If you update a submodule, execute following command before executing steps in `4.ii` and `4.iii` to recompile.
As long as you don't update any submodule, just follow [Compile](#Compile) steps.\
If you update a submodule, execute following command in repository root folder before executing
[Compile](#Compile) steps.
```bash
git submodule update --init --recursive
```

View File

@ -16,19 +16,15 @@ int main(int argc, char *argv[]) {
auto cp = marian::bergamot::createConfigParser();
auto options = cp.parseOptions(argc, argv, true);
// Prepare memories for model and shortlist
marian::bergamot::AlignedMemory modelBytes, shortlistBytes;
std::vector<std::shared_ptr<marian::bergamot::AlignedMemory>> vocabsBytes;
// Prepare memories for bytearrays (including model, shortlist and vocabs)
marian::bergamot::MemoryBundle memoryBundle;
if (options->get<bool>("check-bytearray")) {
// Load legit values into bytearrays.
modelBytes = marian::bergamot::getModelMemoryFromConfig(options);
shortlistBytes = marian::bergamot::getShortlistMemoryFromConfig(options);
marian::bergamot::getVocabsMemoryFromConfig(options, vocabsBytes);
memoryBundle = marian::bergamot::getMemoryBundleFromConfig(options);
}
marian::bergamot::Service service(options, std::move(modelBytes),
std::move(shortlistBytes), std::move(vocabsBytes));
marian::bergamot::Service service(options, std::move(memoryBundle));
// Read a large input text blob from stdin
std::ostringstream std_input;

View File

@ -0,0 +1,60 @@
##
# This CMake module sets the project version from a version file.
#
# The module sets the following variables:
#
# * PROJECT_VERSION_STRING
# * PROJECT_VERSION_STRING_FULL
# * PROJECT_VERSION_MAJOR
# * PROJECT_VERSION_MINOR
# * PROJECT_VERSION_PATCH
# * PROJECT_VERSION_TWEAK
# * PROJECT_VERSION_GIT_SHA
#
# This module is public domain, use it as it fits you best.
##
# Read the full version string either from a user-supplied file
# (PROJECT_VERSION_FILE) or from the BERGAMOT_VERSION file next to the
# including CMakeLists.txt. Expansions are quoted so paths with spaces work.
if(PROJECT_VERSION_FILE)
  file(STRINGS "${PROJECT_VERSION_FILE}" PROJECT_VERSION_STRING)
else()
  file(STRINGS "${CMAKE_CURRENT_SOURCE_DIR}/BERGAMOT_VERSION" PROJECT_VERSION_STRING)
endif()
# Get the short SHA of the current commit. Relies on GIT_EXECUTABLE having
# been set by a prior find_package(Git) in the including project — TODO
# confirm the caller guarantees this. ERROR_QUIET keeps non-git builds
# (e.g. from a release tarball) working: the SHA is then simply empty.
execute_process(COMMAND "${GIT_EXECUTABLE}" rev-parse --short HEAD
  WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
  OUTPUT_VARIABLE PROJECT_VERSION_GIT_SHA
  ERROR_QUIET
  OUTPUT_STRIP_TRAILING_WHITESPACE)
# Split "v<major>.<minor>.<patch>[-tweak]" into a list: each run of digits
# becomes one element, and any trailing "-..." suffix becomes a final element.
string(REGEX MATCHALL "-.*$|[0-9]+" PROJECT_PARTIAL_VERSION_LIST
  "${PROJECT_VERSION_STRING}")
# Set the numeric version components.
list(GET PROJECT_PARTIAL_VERSION_LIST 0 PROJECT_VERSION_MAJOR)
list(GET PROJECT_PARTIAL_VERSION_LIST 1 PROJECT_VERSION_MINOR)
list(GET PROJECT_PARTIAL_VERSION_LIST 2 PROJECT_VERSION_PATCH)
# The tweak part is optional, so check whether the list contains it.
list(LENGTH PROJECT_PARTIAL_VERSION_LIST PROJECT_PARTIAL_VERSION_LIST_LEN)
if(PROJECT_PARTIAL_VERSION_LIST_LEN GREATER 3)
  list(GET PROJECT_PARTIAL_VERSION_LIST 3 PROJECT_VERSION_TWEAK)
  # Drop the leading "-" captured by the regex above.
  string(SUBSTRING "${PROJECT_VERSION_TWEAK}" 1 -1 PROJECT_VERSION_TWEAK)
endif()
# The helper list is no longer needed.
unset(PROJECT_PARTIAL_VERSION_LIST)
# Full project version string: "<version>+<git sha>".
set(PROJECT_VERSION_STRING_FULL
  "${PROJECT_VERSION_STRING}+${PROJECT_VERSION_GIT_SHA}")

View File

@ -0,0 +1,35 @@
#!/bin/bash
# Prints a short, stable hash identifying a compiler together with the flags
# that -march=native expands to on this machine. Intended for use as a
# ccache / GitHub Actions cache key component.
#
# Uses the command from https://stackoverflow.com/a/9355840/4565794.
# -v displays the commands executed during compilation; of these, the cc1
#    invocation carries the concrete flags triggered by -march=native, which
#    is what we need.
# -E stops after the preprocessing stage, so nothing is actually compiled.
#
# Output on a linux machine with gcc-8 looks as follows:
# $ gcc -march=native -E -v - </dev/null 2>&1 | grep cc1
# /usr/lib/gcc/x86_64-linux-gnu/8/cc1 -E -quiet -v -imultiarch x86_64-linux-gnu
# - -march=skylake-avx512 -mmmx -mno-3dnow -msse -msse2 -msse3 -mssse3
# -mno-sse4a -mcx16 -msahf -mmovbe -maes -mno-sha -mpclmul -mpopcnt -mabm
# ... (remaining -m feature flags and --param cache sizes elided) ...
# -fstack-protector-strong -Wformat -Wformat-security
#
# The sha256sum of that cc1 line is computed and truncated to the first 8
# characters for use in the ccache and GitHub cache store key. It can
# effectively be considered a hash of the compiler version and the flags
# activated by -march=native.
#
# Usage: compiler-hash.sh <compiler>
# Fail with a usage message instead of hashing empty output when no
# compiler argument was given.
COMPILER="${1:?Usage: $0 <compiler>}"
"$COMPILER" -march=native -E -v - </dev/null 2>&1 | grep cc1 \
  | sha256sum | cut -c1-8

View File

@ -23,9 +23,6 @@ TEST_CASE("Test Annotation API with random sentences") {
std::mt19937 randomIntGen_;
randomIntGen_.seed(42);
AnnotatedText testAnnotation; // This the container we add through API and
// check if the access is correct.
// External book-keeping so we have ground truths. Each element represents a
// sentence.
@ -45,7 +42,7 @@ TEST_CASE("Test Annotation API with random sentences") {
//
// 4-0 4-1 4-2 4-3
//
// Words are separated by space units.
// Tokens are contiguous because that's how SentencePiece works.
//
// Below, we accumulate the text with intended structure as above, and
// ground-truth tables populated to be aware of the ByteRanges where they are
@ -53,9 +50,10 @@ TEST_CASE("Test Annotation API with random sentences") {
if (debug) {
std::cout << "Preparing text and ground truth-tables" << std::endl;
}
std::string text;
for (size_t idx = 0; idx < sentences; idx++) {
if (idx != 0)
testAnnotation.text += "\n";
text += "\n";
// Words can be zero, we need to support empty word sentences as well.
size_t numWords = randomIntGen_() % maxWords;
@ -65,23 +63,16 @@ TEST_CASE("Test Annotation API with random sentences") {
// For empty sentence, we expect it to be empty and marked in position where
// the existing string is if needed to be pointed out.
size_t before = testAnnotation.text.size() - 1;
size_t before = text.size() - 1;
size_t sentenceBegin{before}, sentenceEnd{before};
for (size_t idw = 0; idw < numWords; idw++) {
if (idw != 0) {
testAnnotation.text += " ";
if (debug) {
std::cout << " ";
}
}
// Get new beginning, accounting for space above.
before = testAnnotation.text.size();
before = text.size();
// Add the word
std::string word = std::to_string(idx) + "-" + std::to_string(idw);
testAnnotation.text += word;
text += word;
// Do math, before, before + new-word's size.
wordByteRanges.push_back((ByteRange){before, before + word.size()});
@ -105,6 +96,9 @@ TEST_CASE("Test Annotation API with random sentences") {
groundTruthSentences.push_back((ByteRange){sentenceBegin, sentenceEnd});
}
AnnotatedText testAnnotation(std::move(text)); // This the container we add through API and
// check if the access is correct.
// We prepare string_views now with the known ByteRanges and use the
// string_view based AnnotatedText.addSentence(...) API to add sentences to
// transparently convert from string_views to ByteRanges, rebasing/working out
@ -116,6 +110,7 @@ TEST_CASE("Test Annotation API with random sentences") {
}
std::vector<std::vector<marian::string_view>> wordStringViews;
std::vector<ByteRange>::const_iterator sentence_iter = groundTruthSentences.begin();
for (auto &sentence : groundTruthWords) {
std::vector<marian::string_view> wordByteRanges;
bool first{true};
@ -132,7 +127,8 @@ TEST_CASE("Test Annotation API with random sentences") {
std::cout << std::string(wordView);
}
}
testAnnotation.addSentence(wordByteRanges);
testAnnotation.recordExistingSentence(wordByteRanges.begin(), wordByteRanges.end(), testAnnotation.text.data() + sentence_iter->begin);
++sentence_iter;
wordStringViews.push_back(wordByteRanges);
if (debug) {
std::cout << std::endl;
@ -207,7 +203,7 @@ TEST_CASE("Test Annotation API with random sentences") {
// Sentence if the random test above does not cover it for some reason.
int emptySentenceIdx = sentences;
std::vector<marian::string_view> emptySentence;
testAnnotation.addSentence(emptySentence);
testAnnotation.recordExistingSentence(emptySentence.begin(), emptySentence.end(), testAnnotation.text.data() + testAnnotation.text.size());
// There are no words.
CHECK(testAnnotation.numWords(emptySentenceIdx) == 0);

View File

@ -1,3 +1,7 @@
# Generate version file
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/project_version.h.in
${CMAKE_CURRENT_SOURCE_DIR}/project_version.h @ONLY)
add_library(bergamot-translator STATIC
byte_array_util.cpp
text_processor.cpp

View File

@ -1,130 +1,68 @@
#include "annotation.h"
#include <cassert>
#include <iostream>
namespace marian {
namespace bergamot {
void Annotation::addSentence(std::vector<ByteRange> &sentence) {
flatByteRanges_.insert(std::end(flatByteRanges_), std::begin(sentence),
std::end(sentence));
size_t size = flatByteRanges_.size();
sentenceEndIds_.push_back(size);
AnnotatedText::AnnotatedText(std::string &&t) : text(std::move(t)) {
// Treat the entire text as a gap that recordExistingSentence will break.
annotation.token_begin_.back() = text.size();
}
size_t Annotation::numWords(size_t sentenceIdx) const {
size_t bosId, eosId;
bosId = sentenceEndIds_[sentenceIdx]; // Half interval, so;
eosId = sentenceEndIds_[sentenceIdx + 1];
// Difference between eosId and bosId is the number of words.
return eosId - bosId;
void AnnotatedText::appendSentence(string_view prefix, std::vector<string_view>::iterator begin, std::vector<string_view>::iterator end) {
assert(annotation.token_begin_.back() == text.size());
// We'll be adding tokens from the sentence and another gap.
annotation.token_begin_.reserve(annotation.token_begin_.size() + (end - begin) + 1);
// prefix is just end of the previous one.
appendEndingWhitespace(prefix);
// Appending sentence text.
std::size_t offset = text.size();
for (std::vector<string_view>::iterator token = begin; token != end; ++token) {
offset += token->size();
annotation.token_begin_.push_back(offset);
}
if (begin != end) {
text.append(begin->data(), (end - 1)->data() + (end - 1)->size());
assert(offset == text.size()); // Tokens should be contiguous.
}
// Add the gap after the sentence. This is empty for now, but will be
// extended with appendEndingWhitespace or another appendSentence.
annotation.gap_.push_back(annotation.token_begin_.size() - 1);
annotation.token_begin_.push_back(offset);
}
ByteRange Annotation::sentence(size_t sentenceIdx) const {
size_t bosId, eosId;
bosId = sentenceEndIds_[sentenceIdx]; // Half interval, so;
eosId = sentenceEndIds_[sentenceIdx + 1];
ByteRange sentenceByteRange;
void AnnotatedText::appendEndingWhitespace(string_view whitespace) {
text.append(whitespace.data(), whitespace.size());
annotation.token_begin_.back() = text.size();
}
if (bosId == eosId) {
// We have an empty sentence. However, we want to be able to point where in
// target this happened through the ranges. We are looking for the end of
// the flatByteRange and non-empty sentence before this happened and
// construct empty string-view equivalent ByteRange.
ByteRange eos = flatByteRanges_[eosId - 1];
sentenceByteRange = ByteRange{eos.end, eos.end};
void AnnotatedText::recordExistingSentence(std::vector<string_view>::iterator begin, std::vector<string_view>::iterator end, const char *sentence_begin) {
assert(sentence_begin >= text.data());
assert(sentence_begin <= text.data() + text.size());
assert(begin == end || sentence_begin == begin->data());
assert(!annotation.token_begin_.empty());
assert(annotation.token_begin_.back() == text.size());
// Clip off size token ending.
annotation.token_begin_.resize(annotation.token_begin_.size() - 1);
for (std::vector<string_view>::iterator i = begin; i != end; ++i) {
assert(i->data() >= text.data()); // In range.
assert(i->data() + i->size() <= text.data() + text.size()); // In range
assert(i + 1 == end || i->data() + i->size() == (i+1)->data()); // Contiguous
annotation.token_begin_.push_back(i->data() - text.data());
}
// Gap token after sentence.
annotation.gap_.push_back(annotation.token_begin_.size());
if (begin != end) {
annotation.token_begin_.push_back((end - 1)->data() + (end - 1)->size() - text.data());
} else {
ByteRange bos = flatByteRanges_[bosId];
ByteRange eos = flatByteRanges_[eosId - 1];
sentenceByteRange = ByteRange{bos.begin, eos.end};
// empty sentence.
annotation.token_begin_.push_back(sentence_begin - text.data());
}
return sentenceByteRange;
}
ByteRange Annotation::word(size_t sentenceIdx, size_t wordIdx) const {
size_t bosOffset = sentenceEndIds_[sentenceIdx];
return flatByteRanges_[bosOffset + wordIdx];
}
string_view AnnotatedText::word(size_t sentenceIdx, size_t wordIdx) const {
auto terminals = annotation.word(sentenceIdx, wordIdx);
return string_view(&text[terminals.begin], terminals.size());
}
string_view AnnotatedText::sentence(size_t sentenceIdx) const {
auto sentenceAsByteRange = annotation.sentence(sentenceIdx);
return asStringView(sentenceAsByteRange);
}
void AnnotatedText::appendSentence(std::string prefix, std::string &reference,
std::vector<string_view> &wordRanges) {
text += prefix;
size_t offset = text.size(); // Get size before to do ByteRange arithmetic
text += reference; // Append reference to text
std::vector<ByteRange> sentence;
for (auto &wordView : wordRanges) {
size_t thisWordBegin = offset + wordView.data() - reference.data();
sentence.push_back(
ByteRange{thisWordBegin, thisWordBegin + wordView.size()});
}
annotation.addSentence(sentence);
}
void AnnotatedText::addSentence(std::vector<string_view> &wordRanges) {
addSentence(std::begin(wordRanges), std::end(wordRanges));
};
void AnnotatedText::addSentence(std::vector<string_view>::iterator begin,
std::vector<string_view>::iterator end) {
std::vector<ByteRange> sentence;
for (auto p = begin; p != end; p++) {
size_t begin_offset = p->data() - text.data();
sentence.push_back(ByteRange{begin_offset, begin_offset + p->size()});
}
annotation.addSentence(sentence);
};
ByteRange AnnotatedText::wordAsByteRange(size_t sentenceIdx,
size_t wordIdx) const {
return annotation.word(sentenceIdx, wordIdx);
}
ByteRange AnnotatedText::sentenceAsByteRange(size_t sentenceIdx) const {
return annotation.sentence(sentenceIdx);
}
string_view AnnotatedText::asStringView(const ByteRange &byteRange) const {
const char *data = &text[byteRange.begin];
size_t size = byteRange.size();
return string_view(data, size);
}
string_view AnnotatedText::gap(size_t sentenceIdx) const {
// Find start of filler-text before, there's a corner case when there's no
// sentence before.
const char *start = nullptr;
if (sentenceIdx == 0) {
// If first sentence, filler begins at start of whole-text.
start = text.data();
} else {
// Otherwise, filler begins at end of previous sentence.
string_view sentenceBefore = sentence(sentenceIdx - 1);
start = sentenceBefore.data() + sentenceBefore.size();
}
// Find end of filler-text, but there is a corner-case to handle.
const char *end = nullptr;
if (sentenceIdx == numSentences()) {
// If last sentence, manually find end of whole-text.
const char *begin = text.data();
end = begin + text.size();
} else {
// Otherwise, the filler ends at the start of next sentence.
string_view sentenceAfter = sentence(sentenceIdx);
end = sentenceAfter.data();
}
return string_view(start, end - start);
// Add back size token ending.
annotation.token_begin_.push_back(text.size());
}
} // namespace bergamot

View File

@ -17,83 +17,99 @@ struct ByteRange {
const size_t size() const { return end - begin; }
};
/// An Annotation is a collection of ByteRanges used to denote ancillary
/// information of sentences and words on a text of string. Annotation is meant
/// for consumption on platforms where `string_view` creates problems (eg:
/// exports through WASM) conveniently rebasing them as required into
/// ByteRanges. See AnnotatedText for cases where this is a non-issue.
/// Annotation expresses sentence and token boundary information as ranges of
/// bytes in a string, but does not itself own the string.
///
/// See also AnnotatedText, which owns Annotation and the string. AnnotatedText
/// wraps these ByteRange functions to provide a string_view interface.
///
/// **Usage**
/// Text is divided into gaps (whitespace between sentences) and sentences like
/// so:
/// gap sentence gap sentence gap
/// Because gaps appear at the beginning and end of the text, there's always
/// one more gap than there are sentences.
///
/// To ensure rebasing is consistent during creation and updation, use
/// `Annotation` best through `AnnotatedText`, which also holds the reference
/// string and can work with `string_views`.
/// The entire text is a unbroken sequence of tokens (i.e. the end of a token
/// is the beginning of the next token). A gap is exactly one token containing
/// whatever whitespace is between the sentences. A sentence is a sequence of
/// tokens.
///
/// If used separately, it is on the user to ensure the reference string
/// is the same as what the Annotation refers to. For best results, an instance
/// is expected to be read only in this mode of operation.
/// Since we are using SentencePiece, a token can include whitespace. The term
/// "word" is used, somewhat incorrectly, as a synonym of token.
///
/// **Idea**
///
/// Annotation is intended to be the same structure conceptually as below,
/// except the `std::vector<std::vector<ByteRange>>` hammered into a flat
/// structure to avoid multiple reallocs keeping efficiency in mind. This is
/// achieved by having markers of where sentence ends in the flat container
/// storing word ByteRanges.
///
/// ```cpp
/// typedef ByteRange Word;
/// // std::vector<ByteRange>, a single sentence
/// typedef std::vector<Word> Sentence;
/// std::vector<std::vector<ByteRange> // multiple sentences
/// typedef std::vector<Sentence> Annotation;
///
/// Annotation example;
/// ```
/// This structure exists to provide a consistent API to access the nested
/// sentences of varying lengths, which occur in source-text processed into
/// multiple sentences, and target-text translated from source as multiple
/// sentences, both composed of (sub)-words, providing a List[List] like access
/// while storing it in a compact and efficient manner.
/// A gap can be empty (for example there may not have been whitespace at the
/// beginning). A sentence can also be empty (typically the translation system
/// produced empty output). That's fine, these are just empty ranges as you
/// would expect.
class Annotation {
public:
/// Annotation is constructed empty. See `addSentence()` to populate it with
/// annotations.
/// Initially an empty string. Populated by AnnotatedText.
Annotation() {
// The -1-th sentence ends at 0.
sentenceEndIds_.push_back(0);
token_begin_.push_back(0);
token_begin_.push_back(0);
gap_.push_back(0);
}
size_t numSentences() const { return sentenceEndIds_.size() - 1; }
size_t numSentences() const { return gap_.size() - 1; }
/// Returns number of words in the sentence identified by `sentenceIdx`.
size_t numWords(size_t sentenceIdx) const;
/// Adds a sentences from `vector<ByteRange>` representation, internally doing
/// extra book-keeping for the sentence terminal markings. Sentences are
/// expected to be added in order as they occur in text.
void addSentence(std::vector<ByteRange> &sentence);
size_t numWords(size_t sentenceIdx) const {
return gap_[sentenceIdx + 1] - gap_[sentenceIdx] - 1 /* minus the gap */;
}
/// Returns a ByteRange representing `wordIdx` in sentence indexed by
/// `sentenceIdx`. `wordIdx` follows 0-based indexing, and should be less than
/// `.numWords()` for `sentenceIdx` for defined behaviour.
ByteRange word(size_t sentenceIdx, size_t wordIdx) const;
ByteRange word(size_t sentenceIdx, size_t wordIdx) const {
size_t tokenIdx = gap_[sentenceIdx] + 1 + wordIdx;
return ByteRange {token_begin_[tokenIdx], token_begin_[tokenIdx + 1]};
}
/// Returns a ByteRange representing sentence corresponding to `sentenceIdx`.
/// `sentenceIdx` follows 0-based indexing, and behaviour is defined only when
/// less than `.numSentences()`.
ByteRange sentence(size_t sentenceIdx) const;
ByteRange sentence(size_t sentenceIdx) const {
return ByteRange {
token_begin_[gap_[sentenceIdx] + 1], /*end of whitespace before */
token_begin_[gap_[sentenceIdx + 1]] /*beginning of whitespace after */
};
}
ByteRange gap(size_t gapIdx) const {
size_t tokenIdx = gap_[gapIdx];
return ByteRange {token_begin_[tokenIdx], token_begin_[tokenIdx + 1]};
}
private:
/// A flat storage for ByteRanges. Composed of word ByteRanges, extra
/// information in sentenceEndIds_ to denote sentence boundary markers as
/// indices.
std::vector<ByteRange> flatByteRanges_;
friend class AnnotatedText;
/// Map from token index to byte offset at which it begins. Token i is:
/// [token_begin_[i], token_begin_[i+1])
/// The vector is padded so that these indices are always valid, even at the
/// end. So tokens_begin_.size() is the number of tokens plus 1.
std::vector<size_t> token_begin_;
/// Stores indices onto flatByteRanges_ of where sentences end (not inclusive,
/// aligned with C++ half interval notions). There is a 0 marker to simplify
/// sources, indicating where the -1-th sentence ends.
std::vector<size_t> sentenceEndIds_;
/// Indices of tokens that correspond to gaps between sentences. These are
/// indices into token_begin_.
/// Gap g is byte range:
/// [token_begin_[gap_[w]], token_begin_[gap_[w]+1])
/// Sentence s is byte range:
/// [token_begin_[gap_[s]+1], token_begin_[gap_[s+1]])
/// A sentence does not include whitespace at the beginning or end.
///
/// gap_.size() == numSentences() + 1.
///
/// Example: empty text "" -> just an empty gap.
/// token_begin_ = {0, 0};
/// gap_ = {0};
///
/// Example: only space " " -> just a gap containing the space.
/// token_begin_ = {0, 1};
/// gap_ = {0};
///
/// Example: one token "hi" -> empty gap, sentence with one token, empty gap
/// token_begin_ = {0, 0, 2, 2};
/// gap_ = {0, 2};
std::vector<size_t> gap_;
};
/// AnnotatedText is effectively std::string text + Annotation, providing the
@ -107,7 +123,6 @@ private:
///
/// 3. Bind the text and annotations together, to move around as a meaningful
/// unit.
struct AnnotatedText {
public:
std::string text; ///< Blob of string elements in annotation refers to.
@ -122,7 +137,31 @@ public:
/// Construct moving in a string (for efficiency purposes, copying string
/// constructor is disallowed).
AnnotatedText(std::string &&text) : text(std::move(text)){};
AnnotatedText(std::string &&text);
/// Appends a sentence to the existing text and transparently rebases
/// string_views. Since this tracks only prefix, remember
/// appendEndingWhitespace.
/// The string_views must not already be in text.
void appendSentence(
string_view prefix,
std::vector<string_view>::iterator tokens_begin,
std::vector<string_view>::iterator tokens_end);
/// Append the whitespace at the end of input. string_view must not be in
/// text.
void appendEndingWhitespace(string_view whitespace);
/// Record the existence of a sentence that is already in text. The
/// iterators are over string_views for each token that must be in text
/// already. This function must be called to record sentences in order.
/// Normally the beginning of the sentence can be inferred from
/// tokens_begin->data() but the tokens could be empty, so sentence_begin is
/// required to know where the sentence is.
void recordExistingSentence(
std::vector<string_view>::iterator tokens_begin,
std::vector<string_view>::iterator tokens_end,
const char *sentence_begin);
/// Returns the number of sentences in the annotation structure.
const size_t numSentences() const { return annotation.numSentences(); }
@ -132,46 +171,44 @@ public:
return annotation.numWords(sentenceIdx);
}
/// Appends a sentence to the existing text and transparently rebases
/// string_views
void appendSentence(std::string prefix, std::string &reference,
std::vector<string_view> &wordRanges);
/// Adds a sentence, used to load from SentencePiece annotations conveniently.
void addSentence(std::vector<string_view> &wordRanges);
/// Adds a sentence between two iterators, often useful while constructing
/// from parts of a container.
void addSentence(std::vector<string_view>::iterator begin,
std::vector<string_view>::iterator end);
/// Returns a string_view representing wordIdx in sentenceIdx
string_view word(size_t sentenceIdx, size_t wordIdx) const;
string_view word(size_t sentenceIdx, size_t wordIdx) const {
  // Resolve the annotation's byte range, then view it on top of `text`.
  ByteRange range = annotation.word(sentenceIdx, wordIdx);
  return asStringView(range);
}
/// Returns a string_view representing sentence corresponding to sentenceIdx.
string_view sentence(size_t sentenceIdx) const;
string_view sentence(size_t sentenceIdx) const {
  // Resolve the annotation's byte range, then view it on top of `text`.
  ByteRange range = annotation.sentence(sentenceIdx);
  return asStringView(range);
}
/// Returns the string_view of the gap between two sentences in the container.
///
/// More precisely where `i = sentenceIdx, N = numSentences()` for brevity:
///
/// * For `i = 0`: The gap between the start of text and the first sentence.
/// * For `i = 0`: The gap between the start of text and the 0th sentence.
/// * For `i = 1...N-1`, returns the text comprising of the gap
/// between the `i-1`-th and `i`-th sentence.
/// * For `i = N`, the gap between the last sentence and end of
/// between the `i`-th and `i+1`-th sentence.
/// * For `i = N`, the gap between the last (N-1th) sentence and end of
/// text.
/// @param sentenceIdx: Can be between `[0, numSentences()]`.
string_view gap(size_t sentenceIdx) const;
string_view gap(size_t sentenceIdx) const {
  // Resolve the annotation's byte range, then view it on top of `text`.
  ByteRange range = annotation.gap(sentenceIdx);
  return asStringView(range);
}
/// Returns a ByteRange representing wordIdx in sentenceIdx
ByteRange wordAsByteRange(size_t sentenceIdx, size_t wordIdx) const;
ByteRange wordAsByteRange(size_t sentenceIdx, size_t wordIdx) const {
  // Thin forwarder to the underlying Annotation.
  return annotation.word(sentenceIdx, wordIdx);
}
/// Returns a ByteRange representing sentence corresponding to sentenceIdx.
ByteRange sentenceAsByteRange(size_t sentenceIdx) const;
ByteRange sentenceAsByteRange(size_t sentenceIdx) const {
  // Thin forwarder to the underlying Annotation.
  return annotation.sentence(sentenceIdx);
}
private:
string_view asStringView(const ByteRange &byteRange) const;
string_view asStringView(const ByteRange &byteRange) const {
  // Build a non-owning view into `text` for the given byte range.
  const char *viewBegin = text.data() + byteRange.begin;
  return string_view(viewBegin, byteRange.size());
}
};
} // namespace bergamot

View File

@ -10,11 +10,11 @@ namespace marian {
namespace bergamot {
BatchTranslator::BatchTranslator(DeviceId const device,
std::vector<Ptr<Vocab const>> &vocabs,
Vocabs &vocabs,
Ptr<Options> options,
const AlignedMemory* modelMemory,
const AlignedMemory* shortlistMemory)
: device_(device), options_(options), vocabs_(&vocabs),
: device_(device), options_(options), vocabs_(vocabs),
modelMemory_(modelMemory), shortlistMemory_(shortlistMemory) {}
void BatchTranslator::initialize() {
@ -22,17 +22,17 @@ void BatchTranslator::initialize() {
bool check = options_->get<bool>("check-bytearray",false); // Flag holds whether validate the bytearray (model and shortlist)
if (options_->hasAndNotEmpty("shortlist")) {
int srcIdx = 0, trgIdx = 1;
bool shared_vcb = vocabs_->front() == vocabs_->back();
bool shared_vcb = vocabs_.sources().front() == vocabs_.target(); // vocabs_->sources().front() is invoked as we currently only support one source vocab
if (shortlistMemory_->size() > 0 && shortlistMemory_->begin() != nullptr) {
slgen_ = New<data::BinaryShortlistGenerator>(shortlistMemory_->begin(), shortlistMemory_->size(),
vocabs_->front(), vocabs_->back(),
srcIdx, trgIdx, shared_vcb, check);
vocabs_.sources().front(), vocabs_.target(),
srcIdx, trgIdx, shared_vcb, check);
}
else {
// Changed to BinaryShortlistGenerator to enable loading binary shortlist file
// This class also supports text shortlist file
slgen_ = New<data::BinaryShortlistGenerator>(options_, vocabs_->front(),
vocabs_->back(), srcIdx,
slgen_ = New<data::BinaryShortlistGenerator>(options_, vocabs_.sources().front(),
vocabs_.target(), srcIdx,
trgIdx, shared_vcb);
}
}
@ -97,7 +97,7 @@ void BatchTranslator::translate(Batch &batch) {
std::vector<Ptr<SubBatch>> subBatches;
for (size_t j = 0; j < maxDims.size(); ++j) {
subBatches.emplace_back(
New<SubBatch>(batchSize, maxDims[j], vocabs_->at(j)));
New<SubBatch>(batchSize, maxDims[j], vocabs_.sources().at(j)));
}
std::vector<size_t> words(maxDims.size(), 0);
@ -116,9 +116,8 @@ void BatchTranslator::translate(Batch &batch) {
auto corpus_batch = Ptr<CorpusBatch>(new CorpusBatch(subBatches));
corpus_batch->setSentenceIds(sentenceIds);
auto trgVocab = vocabs_->back();
auto search = New<BeamSearch>(options_, scorers_, trgVocab);
auto search = New<BeamSearch>(options_, scorers_, vocabs_.target());
auto histories = std::move(search->search(graph_, corpus_batch));
batch.completeBatch(histories);

View File

@ -11,6 +11,7 @@
#include "request.h"
#include "translator/history.h"
#include "translator/scorers.h"
#include "vocabs.h"
#ifndef WASM_COMPATIBLE_SOURCE
#include "pcqueue.h"
@ -34,7 +35,7 @@ public:
* @param modelMemory byte array (aligned to 256!!!) that contains the bytes of a model.bin. Provide a nullptr if not used.
* @param shortlistMemory byte array of shortlist (aligned to 64)
*/
explicit BatchTranslator(DeviceId const device, std::vector<Ptr<Vocab const>> &vocabs,
explicit BatchTranslator(DeviceId const device, Vocabs &vocabs,
Ptr<Options> options, const AlignedMemory* modelMemory, const AlignedMemory* shortlistMemory);
// convenience function for logging. TODO(jerin)
@ -45,7 +46,7 @@ public:
private:
Ptr<Options> options_;
DeviceId device_;
std::vector<Ptr<Vocab const>> *vocabs_;
const Vocabs& vocabs_;
Ptr<ExpressionGraph> graph_;
std::vector<Ptr<Scorer>> scorers_;
Ptr<data::ShortlistGenerator const> slgen_;

View File

@ -117,5 +117,13 @@ void getVocabsMemoryFromConfig(marian::Ptr<marian::Options> options,
}
}
// Gathers all byte-array memories (model, shortlist, vocabularies) named in
// the options into a single MemoryBundle.
MemoryBundle getMemoryBundleFromConfig(marian::Ptr<marian::Options> options){
  MemoryBundle bundle;
  bundle.model = getModelMemoryFromConfig(options);
  bundle.shortlist = getShortlistMemoryFromConfig(options);
  getVocabsMemoryFromConfig(options, bundle.vocabs);
  return bundle;
}
} // namespace bergamot
} // namespace marian

View File

@ -10,5 +10,6 @@ AlignedMemory getShortlistMemoryFromConfig(marian::Ptr<marian::Options> options)
void getVocabsMemoryFromConfig(marian::Ptr<marian::Options> options,
std::vector<std::shared_ptr<AlignedMemory>>& vocabMemories);
bool validateBinaryModel(const AlignedMemory& model, uint64_t fileSize);
MemoryBundle getMemoryBundleFromConfig(marian::Ptr<marian::Options> options);
} // namespace bergamot
} // namespace marian

View File

@ -15,6 +15,21 @@ typedef std::vector<Segment> Segments;
/// Shortcut to AlignedVector<char> for byte arrays
typedef AlignedVector<char> AlignedMemory;
/// Memory bundle for all byte-arrays.
/// Can be a set/subset of model, shortlist, vocabs and ssplitPrefixFile bytes.
struct MemoryBundle {
  AlignedMemory model;     ///< Byte-array of model (aligned to 256)
  AlignedMemory shortlist; ///< Byte-array of shortlist (aligned to 64)

  /// Vector of vocabulary memories (aligned to 64).
  /// If two vocabularies are the same (based on the filenames), two entries (shared
  /// pointers) will be generated which share the same AlignedMemory object.
  std::vector<std::shared_ptr<AlignedMemory>> vocabs;

  /// Byte-array of the sentence-splitter prefix file.
  /// @todo Not implemented yet
  AlignedMemory ssplitPrefixFile;
};
} // namespace bergamot
} // namespace marian

View File

@ -0,0 +1,19 @@
#pragma once
/*
* File project_version.h is generated using CMake. Do not modify project_version.h manually!
* Edit project_version.h.in file instead.
*/
#include <string>
namespace marian {
namespace bergamot {
/// Returns the full bergamot build version string,
/// e.g. "v1.2.3-alpha.1.1+abc123d".
/// The placeholder below is substituted by CMake's configure_file; do not
/// edit the generated header, edit project_version.h.in instead.
std::string bergamotBuildVersion() {
  return std::string("@PROJECT_VERSION_STRING_FULL@");
}
} // namespace bergamot
} // namespace marian

View File

@ -65,33 +65,29 @@ void ResponseBuilder::buildTranslatedText(Histories &histories,
Result result = onebest[0]; // Expecting only one result;
Words words = std::get<0>(result);
auto targetVocab = vocabs_->back();
std::string decoded;
std::vector<string_view> targetSentenceMappings;
targetVocab->decodeWithByteRanges(words, decoded, targetSentenceMappings);
vocabs_.target()->decodeWithByteRanges(words, decoded, targetSentenceMappings);
switch (responseOptions_.concatStrategy) {
case ConcatStrategy::FAITHFUL: {
// For each sentence, prepend the filler text between the corresponding
// source-sentence and the source-sentence before.
string_view pre = response.source.gap(sentenceIdx);
response.target.appendSentence(std::string(pre.data(), pre.size()),
decoded, targetSentenceMappings);
response.target.appendSentence(pre, targetSentenceMappings.begin(), targetSentenceMappings.end());
// If this is the last history to be decoded and translated-text
// constructed, append the text till the end, which could be spaces or
// empty.
if (sentenceIdx + 1 == histories.size()) {
string_view post = response.source.gap(sentenceIdx + 1);
response.target.text += std::string(post.data(), post.size());
response.target.appendEndingWhitespace(response.source.gap(sentenceIdx + 1));
}
break;
}
case ConcatStrategy::SPACE: {
std::string delimiter = (sentenceIdx == 0) ? "" : " ";
response.target.appendSentence(delimiter, decoded,
targetSentenceMappings);
string_view delimiter = (sentenceIdx == 0) ? "" : " ";
response.target.appendSentence(delimiter, targetSentenceMappings.begin(), targetSentenceMappings.end());
break;
}

View File

@ -4,6 +4,7 @@
#include "data/types.h"
#include "response.h"
#include "response_options.h"
#include "vocabs.h"
// For now we will work with this, to avoid complaints another structure is hard
// to operate with.
@ -24,10 +25,10 @@ public:
/// @param [in] vocabs: marian vocab object (used in decoding)
/// @param [in] promise: promise to set with the constructed Response.
ResponseBuilder(ResponseOptions responseOptions, AnnotatedText &&source,
std::vector<Ptr<Vocab const>> &vocabs,
Vocabs &vocabs,
std::promise<Response> &&promise)
: responseOptions_(responseOptions), source_(std::move(source)),
vocabs_(&vocabs), promise_(std::move(promise)) {}
vocabs_(vocabs), promise_(std::move(promise)) {}
/// Constructs and sets the promise of a Response object from obtained
/// histories after translating.
@ -81,7 +82,7 @@ private:
// Data members are context/curried args for the functor.
ResponseOptions responseOptions_;
std::vector<Ptr<Vocab const>> *vocabs_; // vocabs are required for decoding
const Vocabs& vocabs_; // vocabs are required for decoding
// and any source validation checks.
std::promise<Response> promise_; // To be set when callback triggered and
// after Response constructed.

View File

@ -5,50 +5,16 @@
#include <string>
#include <utility>
// Loads the vocabularies either from the given byte-buffers or, when none are
// provided, from the files listed under the "vocabs" option. When loading from
// files, identical paths share a single Vocab object.
inline std::vector<marian::Ptr<const marian::Vocab>>
loadVocabularies(marian::Ptr<marian::Options> options,
                 std::vector<std::shared_ptr<marian::bergamot::AlignedMemory>>&& vocabMemories) {
  // @TODO: parallelize vocab loading for faster startup
  std::vector<marian::Ptr<marian::Vocab const>> vocabs;
  if (vocabMemories.empty()) {
    // Load vocabs from file. With the current setup we need at least two
    // vocabs: src and trg.
    auto vfiles = options->get<std::vector<std::string>>("vocabs");
    ABORT_IF(vfiles.size() < 2, "Insufficient number of vocabularies.");
    vocabs.resize(vfiles.size());
    // Map path -> loaded Vocab so duplicate paths are loaded only once.
    std::unordered_map<std::string, marian::Ptr<marian::Vocab>> byPath;
    for (size_t i = 0; i < vocabs.size(); ++i) {
      auto inserted = byPath.emplace(vfiles[i], marian::Ptr<marian::Vocab>());
      if (inserted.second) { // first occurrence of this path: load it
        inserted.first->second = marian::New<marian::Vocab>(options, i);
        inserted.first->second->load(vfiles[i]);
      }
      vocabs[i] = inserted.first->second;
    }
  } else {
    // Load vocabs from the provided buffers.
    ABORT_IF(vocabMemories.size() < 2, "Insufficient number of vocabularies.");
    vocabs.resize(vocabMemories.size());
    for (size_t i = 0; i < vocabs.size(); ++i) {
      marian::Ptr<marian::Vocab> vocab = marian::New<marian::Vocab>(options, i);
      vocab->loadFromSerialized(absl::string_view(vocabMemories[i]->begin(), vocabMemories[i]->size()));
      vocabs[i] = vocab;
    }
  }
  return vocabs;
}
namespace marian {
namespace bergamot {
Service::Service(Ptr<Options> options, AlignedMemory modelMemory, AlignedMemory shortlistMemory,
std::vector<std::shared_ptr<AlignedMemory>> vocabMemories)
Service::Service(Ptr<Options> options, MemoryBundle memoryBundle)
: requestId_(0), options_(options),
vocabs_(std::move(loadVocabularies(options, std::move(vocabMemories)))),
vocabs_(options, std::move(memoryBundle.vocabs)),
text_processor_(vocabs_, options), batcher_(options),
numWorkers_(options->get<int>("cpu-threads")),
modelMemory_(std::move(modelMemory)),
shortlistMemory_(std::move(shortlistMemory))
modelMemory_(std::move(memoryBundle.model)),
shortlistMemory_(std::move(memoryBundle.shortlist))
#ifndef WASM_COMPATIBLE_SOURCE
// 0 elements in PCQueue is illegal and can lead to failures. Adding a
// guard to have at least one entry allocated. In the single-threaded

View File

@ -9,6 +9,7 @@
#include "response_builder.h"
#include "text_processor.h"
#include "translator/parser.h"
#include "vocabs.h"
#ifndef WASM_COMPATIBLE_SOURCE
#include "pcqueue.h"
@ -55,53 +56,29 @@ namespace bergamot {
/// // Do things with response.
/// ```
///
/// Optionally Service can be initialized by also passing model memory for
/// purposes of efficiency (which defaults to nullpointer and then reads from
/// Optionally Service can be initialized by also passing bytearray memories
/// for purposes of efficiency (which defaults to empty and then reads from
/// file supplied through config).
///
class Service {
public:
/// Construct Service from Marian options. If memoryBundle is empty, Service is
/// initialized from file-based loading. Otherwise, Service is initialized from
/// the given bytearray memories.
/// @param options Marian options object
/// @param modelMemory byte array (aligned to 256!!!) that contains the bytes
/// of a model.bin.
/// @param shortlistMemory byte array of shortlist (aligned to 64)
/// @param vocabMemories vector of vocabulary memories (aligned to 64)
explicit Service(Ptr<Options> options, AlignedMemory modelMemory,
AlignedMemory shortlistMemory,
std::vector<std::shared_ptr<AlignedMemory>> vocabMemories);
/// @param memoryBundle holds all byte-array memories. Can be a set/subset of
/// model, shortlist, vocabs and ssplitPrefixFile bytes. Optional.
explicit Service(Ptr<Options> options, MemoryBundle memoryBundle={});
/// Construct Service purely from Options. This expects options which
/// marian-decoder expects to be set for loading model shortlist and
/// vocabularies from files in addition to parameters that set unset desired
/// features (e.g: alignments, quality-scores).
///
/// This is equivalent to a call to:
/// ```cpp
/// Service(options, AlignedMemory(), AlignedMemory(), {})
/// ```
/// wherein empty memory is passed and internal flow defaults to file-based
/// model, shortlist loading. AlignedMemory() corresponds to empty memory
explicit Service(Ptr<Options> options)
: Service(options, AlignedMemory(), AlignedMemory(), {}) {}
/// Construct Service from a string configuration.
/// @param [in] config string parsable as YAML expected to adhere with marian
/// config
/// @param [in] modelMemory byte array (aligned to 256!!!) that contains the
/// bytes of a model.bin. Optional. AlignedMemory() corresponds to empty memory
/// @param [in] shortlistMemory byte array of shortlist (aligned to 64). Optional.
/// @param [in] vocabMemories vector of vocabulary memories (aligned to 64). Optional.
/// If two vocabularies are the same (based on the filenames), two entries (shared
/// pointers) will be generated which share the same AlignedMemory object.
explicit Service(const std::string &config,
AlignedMemory modelMemory = AlignedMemory(),
AlignedMemory shortlistMemory = AlignedMemory(),
std::vector<std::shared_ptr<AlignedMemory>> vocabsMemories = {})
: Service(parseOptions(config, /*validate=*/false),
std::move(modelMemory),
std::move(shortlistMemory),
std::move(vocabsMemories)) {}
/// Construct Service from a string configuration. If memoryBundle is empty, Service is
/// initialized from file-based loading. Otherwise, Service is initialized from
/// the given bytearray memories.
/// @param [in] config string parsable as YAML expected to adhere with marian config
/// @param [in] memoryBundle holds all byte-array memories. Can be a set/subset of
/// model, shortlist, vocabs and ssplitPrefixFile bytes. Optional.
explicit Service(const std::string &config, MemoryBundle memoryBundle={})
: Service(parseOptions(config, /*validate=*/false), std::move(memoryBundle)) {}
/// Explicit destructor to clean up after any threads initialized in
/// asynchronous operation mode.
@ -196,7 +173,7 @@ private:
size_t requestId_;
/// Store vocabs representing source and target.
std::vector<Ptr<Vocab const>> vocabs_; // ORDER DEPENDENCY (text_processor_)
Vocabs vocabs_; // ORDER DEPENDENCY (text_processor_)
/// TextProcesser takes a blob of text and converts into format consumable by
/// the batch-translator and annotates sentences and words.

View File

@ -4,7 +4,6 @@
#include "annotation.h"
#include "common/options.h"
#include "data/vocab.h"
#include <vector>
namespace marian {
@ -12,13 +11,14 @@ namespace bergamot {
Segment TextProcessor::tokenize(const string_view &segment,
std::vector<string_view> &wordRanges) {
return vocabs_->front()->encodeWithByteRanges(
// vocabs_->sources().front() is invoked as we currently only support one source vocab
return vocabs_.sources().front()->encodeWithByteRanges(
segment, wordRanges, /*addEOS=*/false, /*inference=*/true);
}
TextProcessor::TextProcessor(std::vector<Ptr<Vocab const>> &vocabs,
TextProcessor::TextProcessor(Vocabs &vocabs,
Ptr<Options> options)
: vocabs_(&vocabs), sentence_splitter_(options) {
: vocabs_(vocabs), sentence_splitter_(options) {
max_length_break_ = options->get<int>("max-length-break");
max_length_break_ = max_length_break_ - 1;
@ -41,15 +41,16 @@ void TextProcessor::process(AnnotatedText &source, Segments &segments) {
// There are some cases where SentencePiece or vocab returns no words
// after normalization. 0 prevents any empty entries from being added.
if (segment.size() > 0) {
// Truncate segment into max_input_size segments.
truncate(segment, wordRanges, segments, source);
// Wrap segment into sentences of at most max_length_break_ tokens and
// tell source about them.
wrap(segment, wordRanges, segments, source);
}
}
}
void TextProcessor::truncate(Segment &segment,
std::vector<string_view> &wordRanges,
Segments &segments, AnnotatedText &source) {
void TextProcessor::wrap(Segment &segment,
std::vector<string_view> &wordRanges,
Segments &segments, AnnotatedText &source) {
for (size_t offset = 0; offset < segment.size();
offset += max_length_break_) {
auto start = segment.begin() + offset;
@ -61,7 +62,8 @@ void TextProcessor::truncate(Segment &segment,
segments.back().push_back(sourceEosId());
auto astart = wordRanges.begin() + offset;
source.addSentence(astart, astart + diff);
// diff > 0
source.recordExistingSentence(astart, astart + diff, astart->data());
}
}

View File

@ -7,6 +7,7 @@
#include "annotation.h"
#include "sentence_splitter.h"
#include "vocabs.h"
#include <vector>
@ -21,7 +22,7 @@ class TextProcessor {
// sentences (vector of words). In addition, the ByteRanges of the
// source-tokens in unnormalized text are provided as string_views.
public:
explicit TextProcessor(std::vector<Ptr<Vocab const>> &vocabs, Ptr<Options>);
explicit TextProcessor(Vocabs &vocabs, Ptr<Options>);
void process(AnnotatedText &source, Segments &segments);
@ -31,14 +32,15 @@ private:
Segment tokenize(const string_view &input,
std::vector<string_view> &tokenRanges);
// Truncate sentence into max_input_size segments.
void truncate(Segment &sentence, std::vector<string_view> &tokenRanges,
Segments &segments, AnnotatedText &source);
// Wrap into sentences of at most max_length_break_ tokens and add to source.
void wrap(Segment &sentence, std::vector<string_view> &tokenRanges,
Segments &segments, AnnotatedText &source);
// shorthand, used only in truncate()
const Word sourceEosId() const { return vocabs_->front()->getEosId(); }
// vocabs_->sources().front() is invoked as we currently only support one source vocab
const Word sourceEosId() const { return vocabs_.sources().front()->getEosId(); }
std::vector<Ptr<Vocab const>> *vocabs_;
const Vocabs& vocabs_;
SentenceSplitter sentence_splitter_;
size_t max_length_break_;
};

81
src/translator/vocabs.h Normal file
View File

@ -0,0 +1,81 @@
#pragma once
namespace marian {
namespace bergamot {
/// Wrapper of Marian Vocab objects needed for translator.
/// Holds multiple source vocabularies and one target vocabulary
class Vocabs {
public:
/// Construct vocabs object from either byte-arrays or files
Vocabs(Ptr<Options> options, std::vector<std::shared_ptr<AlignedMemory>>&& vocabMemories): options_(options){
if (!vocabMemories.empty()){
// load vocabs from buffer
load(std::move(vocabMemories));
}
else{
// load vocabs from file
auto vocabPaths = options->get<std::vector<std::string>>("vocabs");
load(vocabPaths);
}
}
/// Get all source vocabularies (as a vector)
const std::vector<Ptr<Vocab const>>& sources() const {
return srcVocabs_;
}
/// Get the target vocabulary
const Ptr<Vocab const>& target() const {
return trgVocab_;
}
private:
std::vector<Ptr<Vocab const>> srcVocabs_; // source vocabularies
Ptr<Vocab const> trgVocab_; // target vocabulary
Ptr<Options> options_;
// load from buffer
void load(std::vector<std::shared_ptr<AlignedMemory>>&& vocabMemories) {
// At least two vocabs: src and trg
ABORT_IF(vocabMemories.size() < 2, "Insufficient number of vocabularies.");
srcVocabs_.resize(vocabMemories.size());
// hashMap is introduced to avoid double loading the same vocab
// loading vocabs (either from buffers or files) is the biggest bottleneck of the speed
// uintptr_t holds unique keys (address) for share_ptr<AlignedMemory>
std::unordered_map<uintptr_t, Ptr<Vocab>> vmap;
for (size_t i = 0; i < srcVocabs_.size(); i++) {
auto m = vmap.emplace(std::make_pair(reinterpret_cast<uintptr_t>(vocabMemories[i].get()), Ptr<Vocab>()));
if (m.second) { // new: load the vocab
m.first->second = New<Vocab>(options_, i);
m.first->second->loadFromSerialized(absl::string_view(vocabMemories[i]->begin(), vocabMemories[i]->size()));
}
srcVocabs_[i] = m.first->second;
}
// Initialize target vocab
trgVocab_ = srcVocabs_.back();
srcVocabs_.pop_back();
}
// load from file
void load(const std::vector<std::string>& vocabPaths){
// with the current setup, we need at least two vocabs: src and trg
ABORT_IF(vocabPaths.size() < 2, "Insufficient number of vocabularies.");
srcVocabs_.resize(vocabPaths.size());
std::unordered_map<std::string, Ptr<Vocab>> vmap;
for (size_t i = 0; i < srcVocabs_.size(); ++i) {
auto m = vmap.emplace(std::make_pair(vocabPaths[i], Ptr<Vocab>()));
if (m.second) { // new: load the vocab
m.first->second = New<Vocab>(options_, i);
m.first->second->load(vocabPaths[i]);
}
srcVocabs_[i] = m.first->second;
}
// Initialize target vocab
trgVocab_ = srcVocabs_.back();
srcVocabs_.pop_back();
}
};
} // namespace bergamot
} // namespace marian

View File

@ -4,6 +4,10 @@ add_executable(bergamot-translator-worker
bindings/TranslationResultBindings.cpp
)
# Generate version file that can be included in the wasm artifacts
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/project_version.js.in
${CMAKE_CURRENT_SOURCE_DIR}/project_version.js @ONLY)
# This header inclusion needs to go away later as path to public headers of bergamot
# translator should be directly available from "bergamot-translator" target
target_include_directories(bergamot-translator-worker
@ -14,18 +18,14 @@ target_include_directories(bergamot-translator-worker
target_compile_definitions(bergamot-translator-worker PRIVATE WASM_BINDINGS)
target_compile_options(bergamot-translator-worker PRIVATE ${WASM_COMPILE_FLAGS})
set(LINKER_FLAGS "-g2 --bind -s ASSERTIONS=0 -s DISABLE_EXCEPTION_CATCHING=1 -s FORCE_FILESYSTEM=1 -s ALLOW_MEMORY_GROWTH=1 -s NO_DYNAMIC_EXECUTION=1 -s EXPORTED_RUNTIME_METHODS=[addOnPreMain]")
if (NOT PACKAGE_DIR STREQUAL "")
get_filename_component(REALPATH_PACKAGE_DIR ${PACKAGE_DIR} REALPATH BASE_DIR ${CMAKE_BINARY_DIR})
set(LINKER_FLAGS "${LINKER_FLAGS} --preload-file ${REALPATH_PACKAGE_DIR}@/")
endif()
# Enable worker file system
set(LINKER_FLAGS "${LINKER_FLAGS} -lworkerfs.js")
set(LINKER_FLAGS "-g2 --bind -s ASSERTIONS=0 -s DISABLE_EXCEPTION_CATCHING=1 -s ALLOW_MEMORY_GROWTH=1 -s NO_DYNAMIC_EXECUTION=1 -s EXPORTED_RUNTIME_METHODS=[addOnPreMain]")
# Avoid node.js-code in emscripten glue-code
set(LINKER_FLAGS "${LINKER_FLAGS} -s ENVIRONMENT=web,worker")
# Append version information in the Javascript artifact
set(LINKER_FLAGS "${LINKER_FLAGS} --extern-pre-js ${CMAKE_CURRENT_SOURCE_DIR}/project_version.js")
set_target_properties(bergamot-translator-worker PROPERTIES
SUFFIX ".js"
LINK_FLAGS ${LINKER_FLAGS}

View File

@ -7,8 +7,8 @@ Please note that [Using JS APIs](#Using-JS-APIs) and [Demo](#Demo) section below
```bash
cd test_page
mkdir models
git clone --depth 1 --branch main --single-branch https://github.com/mozilla-applied-ml/bergamot-models
mkdir models
cp -rf bergamot-models/prod/* models
gunzip models/*/*
```
@ -18,10 +18,7 @@ gunzip models/*/*
```js
// The model configuration as YAML formatted string. For available configuration options, please check: https://marian-nmt.github.io/docs/cmd/marian-decoder/
// This example captures some of the most relevant options
const modelConfig = `vocabs:
- /esen/vocab.esen.spm
- /esen/vocab.esen.spm
beam-size: 1
const modelConfig = `beam-size: 1
normalize: 1.0
word-penalty: 0
max-length-break: 128
@ -35,19 +32,31 @@ quiet-translation: true
gemm-precision: int8shift
`;
// Download model and shortlist files and read them into buffers
// Download model, shortlist and vocabulary files and read them into buffers
const modelFile = `models/esen/model.esen.intgemm.alphas.bin`;
const shortlistFile = `models/esen/lex.50.50.esen.s2t.bin`;
const downloadedBuffers = await Promise.all([downloadAsArrayBuffer(modelFile), downloadAsArrayBuffer(shortlistFile)]); // Please refer to bergamot.html in test_page folder for this function
const vocabFiles = [`models/${languagePair}/vocab.${vocabLanguagePair}.spm`,
`models/${languagePair}/vocab.${vocabLanguagePair}.spm`];
const uniqueVocabFiles = new Set(vocabFiles);
// Please refer to bergamot.html in test_page folder for downloadAsArrayBuffer function
const downloadedBuffers = await Promise.all([downloadAsArrayBuffer(modelFile), downloadAsArrayBuffer(shortlistFile)]);
const downloadedVocabBuffers = [];
for (let item of uniqueVocabFiles.values()) {
downloadedVocabBuffers.push(await downloadAsArrayBuffer(item));
}
const modelBuffer = downloadedBuffers[0];
const shortListBuffer = downloadedBuffers[1];
// Construct AlignedMemory instances from the buffers
var alignedModelMemory = constructAlignedMemoryFromBuffer(modelBuffer, 256); // Please refer to bergamot.html in test_page folder for this function
var alignedShortlistMemory = constructAlignedMemoryFromBuffer(shortListBuffer, 64); // Please refer to bergamot.html in test_page folder for this function
var alignedVocabsMemoryList = new Module.AlignedMemoryList;
downloadedVocabBuffers.forEach(item => alignedVocabsMemoryList.push_back(constructAlignedMemoryFromBuffer(item, 64)));
// Instantiate the TranslationModel
const model = new Module.TranslationModel(modelConfig, alignedModelMemory, alignedShortlistMemory);
const model = new Module.TranslationModel(modelConfig, alignedModelMemory, alignedShortlistMemory, alignedVocabsMemoryList);
// Instantiate the arguments of translate() API i.e. TranslationRequest and input (vector<string>)
const request = new Module.TranslationRequest();

View File

@ -48,14 +48,22 @@ std::vector<std::shared_ptr<AlignedMemory>> prepareVocabsSmartMemories(std::vect
return vocabsSmartMemories;
}
/// Assembles a MemoryBundle by taking ownership of the pointed-to aligned
/// memories; the originals are left in a moved-from state.
/// @param modelMemory          byte array of the model (aligned to 256)
/// @param shortlistMemory      byte array of the shortlist (aligned to 64)
/// @param uniqueVocabsMemories deduplicated vocabulary memories (aligned to 64)
marian::bergamot::MemoryBundle prepareMemoryBundle(AlignedMemory* modelMemory,
                                                   AlignedMemory* shortlistMemory,
                                                   std::vector<AlignedMemory*> uniqueVocabsMemories){
  marian::bergamot::MemoryBundle memoryBundle;
  memoryBundle.model = std::move(*modelMemory);
  memoryBundle.shortlist = std::move(*shortlistMemory);
  // prepareVocabsSmartMemories returns a prvalue; wrapping it in std::move is
  // redundant, so assign it directly.
  memoryBundle.vocabs = prepareVocabsSmartMemories(uniqueVocabsMemories);
  return memoryBundle;
}
/// Factory used by the Emscripten bindings to construct a TranslationModel
/// from a YAML config string plus pre-loaded aligned memories.
/// Caller owns the returned pointer.
TranslationModel* TranslationModelFactory(const std::string &config,
                                          AlignedMemory* modelMemory,
                                          AlignedMemory* shortlistMemory,
                                          std::vector<AlignedMemory*> uniqueVocabsMemories) {
  // prepareMemoryBundle returns a prvalue; passing it directly avoids a
  // redundant std::move and lets the constructor bind the temporary itself.
  return new TranslationModel(config, prepareMemoryBundle(modelMemory, shortlistMemory, uniqueVocabsMemories));
}
EMSCRIPTEN_BINDINGS(translation_model) {

View File

@ -0,0 +1 @@
var BERGAMOT_VERSION_FULL = "@PROJECT_VERSION_STRING_FULL@";