Mirror of https://github.com/browsermt/bergamot-translator.git (synced 2024-08-15 16:40:26 +03:00)

Commit b73714e222: Merge remote-tracking branch 'upstream/main' into main
- Sync with upstream (https://github.com/browsermt/bergamot-translator)
89  .github/workflows/native-ubuntu.yml  (vendored)
@@ -15,6 +15,8 @@ jobs:
          - name: "full-marian"
            os: ubuntu-latest
            gcc: 8
            force_recache: false
            ccache_cmd: "bash ${GITHUB_WORKSPACE}/scripts/ci/compiler-hash.sh %compiler%"
            cpu: 'ON'
            gpu: 'OFF'
            test_tags: ""
@@ -24,10 +26,14 @@ jobs:
              USE_WASM_COMPATIBLE_SOURCE: "OFF"
              COMPILE_SERVER: "OFF"
              COMPILE_EXAMPLES: "OFF"
              CMAKE_C_COMPILER_LAUNCHER: "ccache"
              CMAKE_CXX_COMPILER_LAUNCHER: "ccache"

          - name: "minimal-marian"
            os: ubuntu-latest
            gcc: 8
            force_recache: false
            ccache_cmd: "bash ${GITHUB_WORKSPACE}/scripts/ci/compiler-hash.sh %compiler%"
            cpu: 'ON'
            gpu: 'OFF'
            test_tags: "'#wasm'"
@@ -37,6 +43,42 @@ jobs:
              USE_WASM_COMPATIBLE_SOURCE: "ON"
              COMPILE_SERVER: "OFF"
              COMPILE_EXAMPLES: "OFF"
              CMAKE_C_COMPILER_LAUNCHER: "ccache"
              CMAKE_CXX_COMPILER_LAUNCHER: "ccache"

          - name: "full-marian-force-recache"
            os: ubuntu-latest
            gcc: 8
            force_recache: true
            ccache_cmd: "bash ${GITHUB_WORKSPACE}/scripts/ci/compiler-hash.sh %compiler%"
            cpu: 'ON'
            gpu: 'OFF'
            test_tags: ""
            cmake:
              CMAKE_BUILD_TYPE: "Release"
              COMPILE_TESTS: "ON"
              USE_WASM_COMPATIBLE_SOURCE: "OFF"
              COMPILE_SERVER: "OFF"
              COMPILE_EXAMPLES: "OFF"
              CMAKE_C_COMPILER_LAUNCHER: "ccache"
              CMAKE_CXX_COMPILER_LAUNCHER: "ccache"

          - name: "minimal-marian-force-recache"
            os: ubuntu-latest
            gcc: 8
            force_recache: true
            ccache_cmd: "bash ${GITHUB_WORKSPACE}/scripts/ci/compiler-hash.sh %compiler%"
            cpu: 'ON'
            gpu: 'OFF'
            test_tags: "'#wasm'"
            cmake:
              CMAKE_BUILD_TYPE: "Release"
              COMPILE_TESTS: "OFF" # Minimal marian has no sqlite support and COMPILE_TEST=ON fails.
              USE_WASM_COMPATIBLE_SOURCE: "ON"
              COMPILE_SERVER: "OFF"
              COMPILE_EXAMPLES: "OFF"
              CMAKE_C_COMPILER_LAUNCHER: "ccache"
              CMAKE_CXX_COMPILER_LAUNCHER: "ccache"

    runs-on: ${{ matrix.os }}
@@ -57,7 +99,7 @@ jobs:
          sudo apt-get update
          sudo apt-get install -y \
            libgoogle-perftools-dev libprotobuf-dev protobuf-compiler \
            libboost-all-dev g++-${{ matrix.gcc }}
            libboost-all-dev g++-${{ matrix.gcc }} ccache

      # https://software.intel.com/content/www/us/en/develop/articles/installing-intel-free-libs-and-python-apt-repo.html
      - name: Install MKL
@@ -68,6 +110,42 @@ jobs:
          sudo apt-get install -y --no-install-recommends intel-mkl-64bit-2020.0-088
        if: matrix.cmake.USE_WASM_COMPATIBLE_SOURCE == 'OFF'

      - name: Generate ccache_vars
        id: ccache_vars
        shell: bash
        run: |
          echo "::set-output name=hash::$(${{ matrix.ccache_cmd }})"
          echo "::set-output name=timestamp::$(date '+%Y-%m-%dT%H.%M.%S')"

      - name: Setup ccache environment variables
        run: |
          echo "CCACHE_COMPILERCHECK=${{ matrix.ccache_cmd }}" >> $GITHUB_ENV
          echo "CCACHE_BASE_DIR=${{ github.workspace }}" >> $GITHUB_ENV
          echo "CCACHE_DIR=${{ github.workspace }}/.ccache" >> $GITHUB_ENV
          echo "CCACHE_COMPRESS=true" >> $GITHUB_ENV
          echo "CCACHE_COMPRESSLEVEL=6" >> $GITHUB_ENV
          echo "CCACHE_MAXSIZE=2G" >> $GITHUB_ENV

      - name: Setup ccache recache on
        run: |
          echo "CCACHE_RECACHE=" >> $GITHUB_ENV
        if: matrix.force_recache == true

      - name: Cache-op for build-cache through ccache
        uses: actions/cache@v2
        with:
          path: ${{ env.CCACHE_DIR }}
          key: ccache-${{ matrix.name }}-${{ steps.ccache_vars.outputs.hash }}-${{ github.ref }}-${{ steps.ccache_vars.outputs.timestamp }}
          restore-keys: |
            ccache-${{ matrix.name }}-${{ steps.ccache_vars.outputs.hash }}-${{ github.ref }}-
            ccache-${{ matrix.name }}-${{ steps.ccache_vars.outputs.hash }}-
            ccache-${{ matrix.name }}-

      - name: Cache stats before build
        run: |
          ccache -s
          ccache -z

      # Boost is installed on GitHub-hosted runners in a non-standard location
      # https://github.com/actions/virtual-environments/issues/687#issuecomment-610471671
      - name: Configure CMake
@@ -75,17 +153,24 @@ jobs:
          mkdir -p build
          cd build
          CC=/usr/bin/gcc-${{ matrix.gcc }} CXX=/usr/bin/g++-${{ matrix.gcc }} CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }} \
          cmake .. \
          cmake -L .. \
            -DCMAKE_BUILD_TYPE=${{ matrix.cmake.CMAKE_BUILD_TYPE }}\
            -DCOMPILE_TESTS=${{ matrix.cmake.COMPILE_TESTS }}\
            -DCOMPILE_EXAMPLES=${{ matrix.cmake.COMPILE_EXAMPLES }} \
            -DCOMPILE_SERVER=${{ matrix.cmake.COMPILE_SERVER }} \
            -DUSE_WASM_COMPATIBLE_SOURCE=${{ matrix.cmake.USE_WASM_COMPATIBLE_SOURCE }} \
            -DCMAKE_C_COMPILER_LAUNCHER=${{ matrix.cmake.CMAKE_C_COMPILER_LAUNCHER}} \
            -DCMAKE_CXX_COMPILER_LAUNCHER=${{ matrix.cmake.CMAKE_CXX_COMPILER_LAUNCHER}}

      - name: Compile bergamot-translator
        working-directory: build
        run: make -j2

      - name: Cache stats after build
        run: |
          ccache -s

      - name: Run unit tests
        working-directory: build
        run: make test
@@ -1 +1 @@
v0.0.0
v0.3.0
@@ -1,4 +1,5 @@
cmake_minimum_required(VERSION 3.5.1)
set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)

if (POLICY CMP0074)
  cmake_policy(SET CMP0074 NEW) # CMake 3.12
@@ -42,8 +43,6 @@ option(COMPILE_WASM "Compile for WASM" OFF)
cmake_dependent_option(USE_WASM_COMPATIBLE_SOURCE "Use wasm compatible sources" OFF "NOT COMPILE_WASM" ON)
option(COMPILE_TESTS "Compile bergamot-tests" OFF)

SET(PACKAGE_DIR "" CACHE STRING "Directory including all the files to be packaged (pre-loaded) in wasm builds")

# Set 3rd party submodule specific cmake options for this project
SET(COMPILE_CUDA OFF CACHE BOOL "Compile GPU version")
SET(USE_SENTENCEPIECE ON CACHE BOOL "Download and compile SentencePiece")
@@ -73,6 +72,11 @@ if(GIT_FOUND AND EXISTS "${PROJECT_SOURCE_DIR}/.git")
  endif()
endif()

# Project versioning
include(GetVersionFromFile)
message(STATUS "Project name: ${PROJECT_NAME}")
message(STATUS "Project version: ${PROJECT_VERSION_STRING_FULL}")

if(NOT COMPILE_WASM)
  # Set BUILD_ARCH to native only while compiling for non wasm platform
  set(BUILD_ARCH native CACHE STRING "Compile for this CPU architecture.")
94  README.md
@@ -7,85 +7,47 @@ Bergamot translator provides a unified API for ([Marian NMT](https://marian-nmt.
## Build Instructions

### Build Natively
1. Clone the repository using these instructions:
    ```bash
    git clone https://github.com/mozilla/bergamot-translator
    cd bergamot-translator
    ```
2. Compile

    Create a folder where you want to build all the artifacts (`build-native` in this case) and compile

    Create a folder where you want to build all the artifacts (`build-native` in this case) and compile in that folder
    ```bash
    mkdir build-native
    cd build-native
    cmake ../
    make -j
    ```
    ```bash
    mkdir build-native
    cd build-native
    cmake ../
    make -j3
    ```

### Build WASM
#### Compiling for the first time
#### Prerequisite

1. Download and install Emscripten using the following instructions:
    * Get the latest sdk: `git clone https://github.com/emscripten-core/emsdk.git`
    * Enter the cloned directory: `cd emsdk`
    * Install the latest sdk tools: `./emsdk install latest`
    * Activate the latest sdk tools: `./emsdk activate latest`
    * Activate path variables: `source ./emsdk_env.sh`
Building for WASM requires the Emscripten toolchain. It can be downloaded and installed using the following instructions:

2. Clone the repository using these instructions:
    * Get the latest sdk: `git clone https://github.com/emscripten-core/emsdk.git`
    * Enter the cloned directory: `cd emsdk`
    * Install the latest sdk tools: `./emsdk install latest`
    * Activate the latest sdk tools: `./emsdk activate latest`
    * Activate path variables: `source ./emsdk_env.sh`

#### <a name="Compile"></a> Compile

1. Create a folder where you want to build all the artifacts (`build-wasm` in this case) and compile
    ```bash
    git clone https://github.com/mozilla/bergamot-translator
    cd bergamot-translator
    mkdir build-wasm
    cd build-wasm
    emcmake cmake -DCOMPILE_WASM=on ../
    emmake make -j3
    ```

3. Download files (only required if you want to perform inference using build artifacts)
    The wasm artifacts (.js and .wasm files) will be available in the build directory ("build-wasm" in this case).

    It packages the vocabulary files into the wasm binary, which is required only if you want to perform inference.
    The compilation commands will preload these files in Emscripten’s virtual file system.

    If you want to package bergamot project specific files, please follow these instructions:
2. Enable SIMD Wormhole via Wasm instantiation API in generated artifacts
    ```bash
    git clone --depth 1 --branch main --single-branch https://github.com/mozilla-applied-ml/bergamot-models
    mkdir models
    cp -rf bergamot-models/prod/* models
    gunzip models/*/*
    find models \( -type f -name "model*" -or -type f -name "lex*" \) -delete
    bash ../wasm/patch-artifacts-enable-wormhole.sh
    ```

4. Compile
1. Create a folder where you want to build all the artefacts (`build-wasm` in this case)
    ```bash
    mkdir build-wasm
    cd build-wasm
    ```

2. Compile the artefacts
    * If you want to package files into the wasm binary, then execute the following commands (replace `FILES_TO_PACKAGE` with the
      directory containing all the files to be packaged):

      ```bash
      emcmake cmake -DCOMPILE_WASM=on -DPACKAGE_DIR=FILES_TO_PACKAGE ../
      emmake make -j
      ```
      e.g. if you want to package bergamot project specific files (downloaded using step 3 above), then
      replace `FILES_TO_PACKAGE` with `../models`

    * If you don't want to package any files into the wasm binary, then execute the following commands:
      ```bash
      emcmake cmake -DCOMPILE_WASM=on ../
      emmake make -j
      ```

    The wasm artifacts (.js and .wasm files) will be available in the build directory ("build-wasm" in this case).

3. Enable SIMD Wormhole via Wasm instantiation API in generated artifacts
    ```bash
    bash ../wasm/patch-artifacts-enable-wormhole.sh
    ```

#### Recompiling
As long as you don't update any submodule, just follow steps in `4.ii` and `4.iii` to recompile.\
If you update a submodule, execute the following command before executing steps in `4.ii` and `4.iii` to recompile.
As long as you don't update any submodule, just follow [Compile](#Compile) steps.\
If you update a submodule, execute the following command in the repository root folder before executing
[Compile](#Compile) steps.
```bash
git submodule update --init --recursive
```
@@ -16,19 +16,15 @@ int main(int argc, char *argv[]) {
  auto cp = marian::bergamot::createConfigParser();
  auto options = cp.parseOptions(argc, argv, true);

  // Prepare memories for model and shortlist
  marian::bergamot::AlignedMemory modelBytes, shortlistBytes;
  std::vector<std::shared_ptr<marian::bergamot::AlignedMemory>> vocabsBytes;
  // Prepare memories for bytearrays (including model, shortlist and vocabs)
  marian::bergamot::MemoryBundle memoryBundle;

  if (options->get<bool>("check-bytearray")) {
    // Load legit values into bytearrays.
    modelBytes = marian::bergamot::getModelMemoryFromConfig(options);
    shortlistBytes = marian::bergamot::getShortlistMemoryFromConfig(options);
    marian::bergamot::getVocabsMemoryFromConfig(options, vocabsBytes);
    memoryBundle = marian::bergamot::getMemoryBundleFromConfig(options);
  }

  marian::bergamot::Service service(options, std::move(modelBytes),
                                    std::move(shortlistBytes), std::move(vocabsBytes));
  marian::bergamot::Service service(options, std::move(memoryBundle));

  // Read a large input text blob from stdin
  std::ostringstream std_input;
60  cmake/GetVersionFromFile.cmake  (new file)
@@ -0,0 +1,60 @@
##
# This CMake module sets the project version from a version file.
#
# The module sets the following variables:
#
# * PROJECT_VERSION_STRING
# * PROJECT_VERSION_STRING_FULL
# * PROJECT_VERSION_MAJOR
# * PROJECT_VERSION_MINOR
# * PROJECT_VERSION_PATCH
# * PROJECT_VERSION_TWEAK
# * PROJECT_VERSION_GIT_SHA
#
# This module is public domain, use it as it fits you best.
##

# Get full string version from file
if(PROJECT_VERSION_FILE)
  file(STRINGS ${PROJECT_VERSION_FILE} PROJECT_VERSION_STRING)
else()
  file(STRINGS ${CMAKE_CURRENT_SOURCE_DIR}/BERGAMOT_VERSION PROJECT_VERSION_STRING)
endif()

# Get current commit SHA from git
execute_process(COMMAND ${GIT_EXECUTABLE} rev-parse --short HEAD
  WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
  OUTPUT_VARIABLE PROJECT_VERSION_GIT_SHA
  OUTPUT_STRIP_TRAILING_WHITESPACE)

# Get partial versions into a list
string(REGEX MATCHALL "-.*$|[0-9]+" PROJECT_PARTIAL_VERSION_LIST
  ${PROJECT_VERSION_STRING})

# Set the version numbers
list(GET PROJECT_PARTIAL_VERSION_LIST 0 PROJECT_VERSION_MAJOR)
list(GET PROJECT_PARTIAL_VERSION_LIST 1 PROJECT_VERSION_MINOR)
list(GET PROJECT_PARTIAL_VERSION_LIST 2 PROJECT_VERSION_PATCH)

# The tweak part is optional, so check if the list contains it
list(LENGTH PROJECT_PARTIAL_VERSION_LIST PROJECT_PARTIAL_VERSION_LIST_LEN)
if(PROJECT_PARTIAL_VERSION_LIST_LEN GREATER 3)
  list(GET PROJECT_PARTIAL_VERSION_LIST 3 PROJECT_VERSION_TWEAK)
  string(SUBSTRING ${PROJECT_VERSION_TWEAK} 1 -1 PROJECT_VERSION_TWEAK)
endif()

# Unset the list
unset(PROJECT_PARTIAL_VERSION_LIST)

# Set full project version string
set(PROJECT_VERSION_STRING_FULL
  ${PROJECT_VERSION_STRING}+${PROJECT_VERSION_GIT_SHA})

# Print all variables for debugging
#message(STATUS ${PROJECT_VERSION_STRING_FULL})
#message(STATUS ${PROJECT_VERSION_STRING})
#message(STATUS ${PROJECT_VERSION_MAJOR})
#message(STATUS ${PROJECT_VERSION_MINOR})
#message(STATUS ${PROJECT_VERSION_PATCH})
#message(STATUS ${PROJECT_VERSION_TWEAK})
#message(STATUS ${PROJECT_VERSION_GIT_SHA})
35  scripts/ci/compiler-hash.sh  (new file)
@@ -0,0 +1,35 @@
#!/bin/bash

# Uses the command from https://stackoverflow.com/a/9355840/4565794.
# -v displays the commands executed during compilation. From that output we
# need the cc1 invocation, whose additional flags include everything
# triggered by -march=native.
# -E stops after the preprocessing stage.

# Output on a linux machine with gcc-8 looks as follows:

# $ gcc -march=native -E -v - </dev/null 2>&1 | grep cc1
# /usr/lib/gcc/x86_64-linux-gnu/8/cc1 -E -quiet -v -imultiarch x86_64-linux-gnu
# - -march=skylake-avx512 -mmmx -mno-3dnow -msse -msse2 -msse3 -mssse3
# -mno-sse4a -mcx16 -msahf -mmovbe -maes -mno-sha -mpclmul -mpopcnt -mabm
# -mno-lwp -mfma -mno-fma4 -mno-xop -mbmi -mno-sgx -mbmi2 -mno-pconfig
# -mno-wbnoinvd -mno-tbm -mavx -mavx2 -msse4.2 -msse4.1 -mlzcnt -mno-rtm
# -mno-hle -mrdrnd -mf16c -mfsgsbase -mrdseed -mprfchw -madx -mfxsr -mxsave
# -mxsaveopt -mavx512f -mno-avx512er -mavx512cd -mno-avx512pf -mno-prefetchwt1
# -mclflushopt -mxsavec -mxsaves -mavx512dq -mavx512bw -mavx512vl
# -mno-avx512ifma -mno-avx512vbmi -mno-avx5124fmaps -mno-avx5124vnniw -mclwb
# -mno-mwaitx -mno-clzero -mpku -mno-rdpid -mno-gfni -mno-shstk
# -mno-avx512vbmi2 -mavx512vnni -mno-vaes -mno-vpclmulqdq -mno-avx512bitalg
# -mno-movdiri -mno-movdir64b --param l1-cache-size=32 --param
# l1-cache-line-size=64 --param l2-cache-size=28160 -mtune=skylake-avx512
# -fstack-protector-strong -Wformat -Wformat-security

# The sha256sum of the output is computed and stripped to the first 8
# characters for use in the ccache and GitHub cache store keys. It can
# effectively be considered a hash of the compiler version and the flags
# activated by -march=native.

COMPILER=$1

$COMPILER -march=native -E -v - < /dev/null 2>&1 | grep cc1 \
  | sha256sum | cut -c1-8
@@ -23,9 +23,6 @@ TEST_CASE("Test Annotation API with random sentences") {
  std::mt19937 randomIntGen_;
  randomIntGen_.seed(42);

  AnnotatedText testAnnotation; // This is the container we add through the API
                                // and check if the access is correct.

  // External book-keeping so we have ground truths. Each element represents a
  // sentence.

@@ -45,7 +42,7 @@ TEST_CASE("Test Annotation API with random sentences") {
  //
  // 4-0 4-1 4-2 4-3
  //
  // Words are separated by space units.
  // Tokens are contiguous because that's how SentencePiece works.
  //
  // Below, we accumulate the text with intended structure as above, and
  // ground-truth tables populated to be aware of the ByteRanges where they are
@@ -53,9 +50,10 @@ TEST_CASE("Test Annotation API with random sentences") {
  if (debug) {
    std::cout << "Preparing text and ground truth-tables" << std::endl;
  }
  std::string text;
  for (size_t idx = 0; idx < sentences; idx++) {
    if (idx != 0)
      testAnnotation.text += "\n";
      text += "\n";

    // Words can be zero, we need to support empty word sentences as well.
    size_t numWords = randomIntGen_() % maxWords;
@@ -65,23 +63,16 @@ TEST_CASE("Test Annotation API with random sentences") {

    // For empty sentence, we expect it to be empty and marked in position where
    // the existing string is if needed to be pointed out.
    size_t before = testAnnotation.text.size() - 1;
    size_t before = text.size() - 1;
    size_t sentenceBegin{before}, sentenceEnd{before};

    for (size_t idw = 0; idw < numWords; idw++) {
      if (idw != 0) {
        testAnnotation.text += " ";
        if (debug) {
          std::cout << " ";
        }
      }

      // Get new beginning, accounting for space above.
      before = testAnnotation.text.size();
      before = text.size();

      // Add the word
      std::string word = std::to_string(idx) + "-" + std::to_string(idw);
      testAnnotation.text += word;
      text += word;

      // Do math, before, before + new-word's size.
      wordByteRanges.push_back((ByteRange){before, before + word.size()});
@@ -105,6 +96,9 @@ TEST_CASE("Test Annotation API with random sentences") {
    groundTruthSentences.push_back((ByteRange){sentenceBegin, sentenceEnd});
  }

  AnnotatedText testAnnotation(std::move(text)); // This is the container we add
                                                 // through the API and check if
                                                 // the access is correct.

  // We prepare string_views now with the known ByteRanges and use the
  // string_view based AnnotatedText.addSentence(...) API to add sentences to
  // transparently convert from string_views to ByteRanges, rebasing/working out
@@ -116,6 +110,7 @@ TEST_CASE("Test Annotation API with random sentences") {
  }

  std::vector<std::vector<marian::string_view>> wordStringViews;
  std::vector<ByteRange>::const_iterator sentence_iter = groundTruthSentences.begin();
  for (auto &sentence : groundTruthWords) {
    std::vector<marian::string_view> wordByteRanges;
    bool first{true};
@@ -132,7 +127,8 @@ TEST_CASE("Test Annotation API with random sentences") {
        std::cout << std::string(wordView);
      }
    }
    testAnnotation.addSentence(wordByteRanges);
    testAnnotation.recordExistingSentence(wordByteRanges.begin(), wordByteRanges.end(), testAnnotation.text.data() + sentence_iter->begin);
    ++sentence_iter;
    wordStringViews.push_back(wordByteRanges);
    if (debug) {
      std::cout << std::endl;
@@ -207,7 +203,7 @@ TEST_CASE("Test Annotation API with random sentences") {
  // Sentence if the random test above does not cover it for some reason.
  int emptySentenceIdx = sentences;
  std::vector<marian::string_view> emptySentence;
  testAnnotation.addSentence(emptySentence);
  testAnnotation.recordExistingSentence(emptySentence.begin(), emptySentence.end(), testAnnotation.text.data() + testAnnotation.text.size());

  // There are no words.
  CHECK(testAnnotation.numWords(emptySentenceIdx) == 0);
@@ -1,3 +1,7 @@
# Generate version file
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/project_version.h.in
  ${CMAKE_CURRENT_SOURCE_DIR}/project_version.h @ONLY)

add_library(bergamot-translator STATIC
  byte_array_util.cpp
  text_processor.cpp
@@ -1,130 +1,68 @@
#include "annotation.h"
#include <cassert>
#include <iostream>

namespace marian {
namespace bergamot {

void Annotation::addSentence(std::vector<ByteRange> &sentence) {
  flatByteRanges_.insert(std::end(flatByteRanges_), std::begin(sentence),
                         std::end(sentence));
  size_t size = flatByteRanges_.size();
  sentenceEndIds_.push_back(size);
AnnotatedText::AnnotatedText(std::string &&t) : text(std::move(t)) {
  // Treat the entire text as a gap that recordExistingSentence will break.
  annotation.token_begin_.back() = text.size();
}

size_t Annotation::numWords(size_t sentenceIdx) const {
  size_t bosId, eosId;
  bosId = sentenceEndIds_[sentenceIdx]; // Half interval, so;
  eosId = sentenceEndIds_[sentenceIdx + 1];
  // Difference between eosId and bosId is the number of words.
  return eosId - bosId;
void AnnotatedText::appendSentence(string_view prefix, std::vector<string_view>::iterator begin, std::vector<string_view>::iterator end) {
  assert(annotation.token_begin_.back() == text.size());
  // We'll be adding tokens from the sentence and another gap.
  annotation.token_begin_.reserve(annotation.token_begin_.size() + (end - begin) + 1);

  // prefix is just end of the previous one.
  appendEndingWhitespace(prefix);

  // Appending sentence text.
  std::size_t offset = text.size();
  for (std::vector<string_view>::iterator token = begin; token != end; ++token) {
    offset += token->size();
    annotation.token_begin_.push_back(offset);
  }
  if (begin != end) {
    text.append(begin->data(), (end - 1)->data() + (end - 1)->size());
    assert(offset == text.size()); // Tokens should be contiguous.
  }

  // Add the gap after the sentence. This is empty for now, but will be
  // extended with appendEndingWhitespace or another appendSentence.
  annotation.gap_.push_back(annotation.token_begin_.size() - 1);
  annotation.token_begin_.push_back(offset);
}

ByteRange Annotation::sentence(size_t sentenceIdx) const {
  size_t bosId, eosId;
  bosId = sentenceEndIds_[sentenceIdx]; // Half interval, so;
  eosId = sentenceEndIds_[sentenceIdx + 1];
  ByteRange sentenceByteRange;
void AnnotatedText::appendEndingWhitespace(string_view whitespace) {
  text.append(whitespace.data(), whitespace.size());
  annotation.token_begin_.back() = text.size();
}

  if (bosId == eosId) {
    // We have an empty sentence. However, we want to be able to point where in
    // target this happened through the ranges. We are looking for the end of
    // the flatByteRange and non-empty sentence before this happened and
    // construct empty string-view equivalent ByteRange.
    ByteRange eos = flatByteRanges_[eosId - 1];
    sentenceByteRange = ByteRange{eos.end, eos.end};
void AnnotatedText::recordExistingSentence(std::vector<string_view>::iterator begin, std::vector<string_view>::iterator end, const char *sentence_begin) {
  assert(sentence_begin >= text.data());
  assert(sentence_begin <= text.data() + text.size());
  assert(begin == end || sentence_begin == begin->data());
  assert(!annotation.token_begin_.empty());
  assert(annotation.token_begin_.back() == text.size());
  // Clip off size token ending.
  annotation.token_begin_.resize(annotation.token_begin_.size() - 1);
  for (std::vector<string_view>::iterator i = begin; i != end; ++i) {
    assert(i->data() >= text.data());                           // In range.
    assert(i->data() + i->size() <= text.data() + text.size()); // In range
    assert(i + 1 == end || i->data() + i->size() == (i+1)->data()); // Contiguous
    annotation.token_begin_.push_back(i->data() - text.data());
  }
  // Gap token after sentence.
  annotation.gap_.push_back(annotation.token_begin_.size());
  if (begin != end) {
    annotation.token_begin_.push_back((end - 1)->data() + (end - 1)->size() - text.data());
  } else {
    ByteRange bos = flatByteRanges_[bosId];
    ByteRange eos = flatByteRanges_[eosId - 1];
    sentenceByteRange = ByteRange{bos.begin, eos.end};
    // empty sentence.
    annotation.token_begin_.push_back(sentence_begin - text.data());
  }
  return sentenceByteRange;
}

ByteRange Annotation::word(size_t sentenceIdx, size_t wordIdx) const {
  size_t bosOffset = sentenceEndIds_[sentenceIdx];
  return flatByteRanges_[bosOffset + wordIdx];
}

string_view AnnotatedText::word(size_t sentenceIdx, size_t wordIdx) const {
  auto terminals = annotation.word(sentenceIdx, wordIdx);
  return string_view(&text[terminals.begin], terminals.size());
}

string_view AnnotatedText::sentence(size_t sentenceIdx) const {
  auto sentenceAsByteRange = annotation.sentence(sentenceIdx);
  return asStringView(sentenceAsByteRange);
}

void AnnotatedText::appendSentence(std::string prefix, std::string &reference,
                                   std::vector<string_view> &wordRanges) {
  text += prefix;
  size_t offset = text.size(); // Get size before to do ByteRange arithmetic
  text += reference;           // Append reference to text
  std::vector<ByteRange> sentence;
  for (auto &wordView : wordRanges) {
    size_t thisWordBegin = offset + wordView.data() - reference.data();
    sentence.push_back(
        ByteRange{thisWordBegin, thisWordBegin + wordView.size()});
  }
  annotation.addSentence(sentence);
}

void AnnotatedText::addSentence(std::vector<string_view> &wordRanges) {
  addSentence(std::begin(wordRanges), std::end(wordRanges));
};

void AnnotatedText::addSentence(std::vector<string_view>::iterator begin,
                                std::vector<string_view>::iterator end) {
  std::vector<ByteRange> sentence;
  for (auto p = begin; p != end; p++) {
    size_t begin_offset = p->data() - text.data();
    sentence.push_back(ByteRange{begin_offset, begin_offset + p->size()});
  }
  annotation.addSentence(sentence);
};

ByteRange AnnotatedText::wordAsByteRange(size_t sentenceIdx,
                                         size_t wordIdx) const {
  return annotation.word(sentenceIdx, wordIdx);
}

ByteRange AnnotatedText::sentenceAsByteRange(size_t sentenceIdx) const {
  return annotation.sentence(sentenceIdx);
}

string_view AnnotatedText::asStringView(const ByteRange &byteRange) const {
  const char *data = &text[byteRange.begin];
  size_t size = byteRange.size();
  return string_view(data, size);
}

string_view AnnotatedText::gap(size_t sentenceIdx) const {
  // Find start of filler-text before, there's a corner case when there's no
  // sentence before.
  const char *start = nullptr;
  if (sentenceIdx == 0) {
    // If first sentence, filler begins at start of whole-text.
    start = text.data();
  } else {
    // Otherwise, filler begins at end of previous sentence.
    string_view sentenceBefore = sentence(sentenceIdx - 1);
    start = sentenceBefore.data() + sentenceBefore.size();
  }

  // Find end of filler-text, but there is a corner-case to handle.
  const char *end = nullptr;
  if (sentenceIdx == numSentences()) {
    // If last sentence, manually find end of whole-text.
    const char *begin = text.data();
    end = begin + text.size();
  } else {
    // Otherwise, the filler ends at the start of next sentence.
    string_view sentenceAfter = sentence(sentenceIdx);
    end = sentenceAfter.data();
  }

  return string_view(start, end - start);
  // Add back size token ending.
  annotation.token_begin_.push_back(text.size());
}

} // namespace bergamot
@@ -17,83 +17,99 @@ struct ByteRange {
  const size_t size() const { return end - begin; }
};

/// An Annotation is a collection of ByteRanges used to denote ancillary
/// information of sentences and words on a text of string. Annotation is meant
/// for consumption on platforms where `string_view` creates problems (eg:
/// exports through WASM) conveniently rebasing them as required into
/// ByteRanges. See AnnotatedText for cases where this is a non-issue.
/// Annotation expresses sentence and token boundary information as ranges of
/// bytes in a string, but does not itself own the string.
///
/// See also AnnotatedText, which owns Annotation and the string. AnnotatedText
/// wraps these ByteRange functions to provide a string_view interface.
///
/// **Usage**
/// Text is divided into gaps (whitespace between sentences) and sentences like
/// so:
///   gap sentence gap sentence gap
/// Because gaps appear at the beginning and end of the text, there's always
/// one more gap than there are sentences.
///
/// To ensure rebasing is consistent during creation and updating, use
/// `Annotation` best through `AnnotatedText`, which also holds the reference
/// string and can work with `string_views`.
/// The entire text is an unbroken sequence of tokens (i.e. the end of a token
/// is the beginning of the next token). A gap is exactly one token containing
/// whatever whitespace is between the sentences. A sentence is a sequence of
/// tokens.
///
/// If used separately, it is on the user to ensure the reference string
/// is the same as what the Annotation refers to. For best results, an instance
/// is expected to be read only in this mode of operation.
/// Since we are using SentencePiece, a token can include whitespace. The term
/// "word" is used, somewhat incorrectly, as a synonym of token.
///
/// **Idea**
///
/// Annotation is intended to be the same structure conceptually as below,
/// except the `std::vector<std::vector<ByteRange>>` hammered into a flat
/// structure to avoid multiple reallocs, keeping efficiency in mind. This is
/// achieved by having markers of where sentences end in the flat container
/// storing word ByteRanges.
///
/// ```cpp
/// typedef ByteRange Word;
/// // std::vector<ByteRange>, a single sentence
/// typedef std::vector<Word> Sentence;
/// std::vector<std::vector<ByteRange>> // multiple sentences
/// typedef std::vector<Sentence> Annotation;
///
/// Annotation example;
/// ```
/// This structure exists to provide a consistent API to access the nested
/// sentences of varying lengths, which occur in source-text processed into
/// multiple sentences, and target-text translated from source as multiple
/// sentences, both composed of (sub)-words, providing a List[List] like access
/// while storing it in a compact and efficient manner.
/// A gap can be empty (for example there may not have been whitespace at the
/// beginning). A sentence can also be empty (typically the translation system
/// produced empty output). That's fine, these are just empty ranges as you
/// would expect.
class Annotation {
public:
  /// Annotation is constructed empty. See `addSentence()` to populate it with
  /// annotations.
  /// Initially an empty string. Populated by AnnotatedText.
  Annotation() {
    // The -1-th sentence ends at 0.
    sentenceEndIds_.push_back(0);
    token_begin_.push_back(0);
    token_begin_.push_back(0);
    gap_.push_back(0);
  }

  size_t numSentences() const { return sentenceEndIds_.size() - 1; }
  size_t numSentences() const { return gap_.size() - 1; }

  /// Returns number of words in the sentence identified by `sentenceIdx`.
  size_t numWords(size_t sentenceIdx) const;

  /// Adds a sentence from `vector<ByteRange>` representation, internally doing
  /// extra book-keeping for the sentence terminal markings. Sentences are
  /// expected to be added in order as they occur in text.
  void addSentence(std::vector<ByteRange> &sentence);
  size_t numWords(size_t sentenceIdx) const {
    return gap_[sentenceIdx + 1] - gap_[sentenceIdx] - 1 /* minus the gap */;
  }

  /// Returns a ByteRange representing `wordIdx` in sentence indexed by
  /// `sentenceIdx`. `wordIdx` follows 0-based indexing, and should be less than
  /// `.numWords()` for `sentenceIdx` for defined behaviour.
  ByteRange word(size_t sentenceIdx, size_t wordIdx) const;
  ByteRange word(size_t sentenceIdx, size_t wordIdx) const {
    size_t tokenIdx = gap_[sentenceIdx] + 1 + wordIdx;
    return ByteRange {token_begin_[tokenIdx], token_begin_[tokenIdx + 1]};
  }

  /// Returns a ByteRange representing sentence corresponding to `sentenceIdx`.
  /// `sentenceIdx` follows 0-based indexing, and behaviour is defined only when
  /// less than `.numSentences()`.
  ByteRange sentence(size_t sentenceIdx) const;
  ByteRange sentence(size_t sentenceIdx) const {
    return ByteRange {
      token_begin_[gap_[sentenceIdx] + 1], /* end of whitespace before */
      token_begin_[gap_[sentenceIdx + 1]]  /* beginning of whitespace after */
    };
  }

  ByteRange gap(size_t gapIdx) const {
    size_t tokenIdx = gap_[gapIdx];
    return ByteRange {token_begin_[tokenIdx], token_begin_[tokenIdx + 1]};
  }

private:
  /// A flat storage for ByteRanges. Composed of word ByteRanges, extra
  /// information in sentenceEndIds_ to denote sentence boundary markers as
  /// indices.
  std::vector<ByteRange> flatByteRanges_;
  friend class AnnotatedText;
  /// Map from token index to byte offset at which it begins. Token i is:
  ///   [token_begin_[i], token_begin_[i+1])
  /// The vector is padded so that these indices are always valid, even at the
  /// end. So token_begin_.size() is the number of tokens plus 1.
  std::vector<size_t> token_begin_;

  /// Stores indices onto flatByteRanges_ of where sentences end (not inclusive,
  /// aligned with C++ half interval notions). There is a 0 marker to simplify
  /// sources, indicating where the -1-th sentence ends.
  std::vector<size_t> sentenceEndIds_;
  /// Indices of tokens that correspond to gaps between sentences. These are
  /// indices into token_begin_.
  /// Gap g is byte range:
  ///   [token_begin_[gap_[g]], token_begin_[gap_[g]+1])
  /// Sentence s is byte range:
  ///   [token_begin_[gap_[s]+1], token_begin_[gap_[s+1]])
  /// A sentence does not include whitespace at the beginning or end.
  ///
  /// gap_.size() == numSentences() + 1.
  ///
  /// Example: empty text "" -> just an empty gap.
  ///   token_begin_ = {0, 0};
  ///   gap_ = {0};
  ///
  /// Example: only space " " -> just a gap containing the space.
  ///   token_begin_ = {0, 1};
  ///   gap_ = {0};
  ///
  /// Example: one token "hi" -> empty gap, sentence with one token, empty gap
  ///   token_begin_ = {0, 0, 2, 2};
  ///   gap_ = {0, 2};
  std::vector<size_t> gap_;
};
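To sanity-check the index arithmetic documented above, here is a minimal standalone sketch that replays the "hi" example with plain vectors instead of the Annotation class:

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

// Replays the documented invariants for the text "hi": an empty leading gap,
// one sentence with a single token, and an empty trailing gap.
int main() {
  std::vector<std::size_t> token_begin = {0, 0, 2, 2};
  std::vector<std::size_t> gap = {0, 2};

  std::size_t numSentences = gap.size() - 1;            // 1
  std::size_t numWords = gap[1] - gap[0] - 1;           // 1, minus the gap token
  std::size_t sentenceBegin = token_begin[gap[0] + 1];  // 0
  std::size_t sentenceEnd = token_begin[gap[1]];        // 2, i.e. "hi"

  assert(numSentences == 1 && numWords == 1);
  assert(sentenceBegin == 0 && sentenceEnd == 2);
  return 0;
}
```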
/// AnnotatedText is effectively std::string text + Annotation, providing the
@@ -107,7 +123,6 @@ private:
///
/// 3. Bind the text and annotations together, to move around as a meaningful
/// unit.

struct AnnotatedText {
public:
  std::string text; ///< Blob of string the elements in annotation refer to.
@@ -122,7 +137,31 @@ public:

  /// Construct moving in a string (for efficiency purposes, copying string
  /// constructor is disallowed).
  AnnotatedText(std::string &&text) : text(std::move(text)){};
  AnnotatedText(std::string &&text);

  /// Appends a sentence to the existing text and transparently rebases
  /// string_views. Since this tracks only prefix, remember
  /// appendEndingWhitespace.
  /// The string_views must not already be in text.
  void appendSentence(
      string_view prefix,
      std::vector<string_view>::iterator tokens_begin,
      std::vector<string_view>::iterator tokens_end);

  /// Append the whitespace at the end of input. string_view must not be in
  /// text.
  void appendEndingWhitespace(string_view whitespace);

  /// Record the existence of a sentence that is already in text. The
  /// iterators are over string_views for each token that must be in text
  /// already. This function must be called to record sentences in order.
  /// Normally the beginning of the sentence can be inferred from
  /// tokens_begin->data() but the tokens could be empty, so sentence_begin is
  /// required to know where the sentence is.
  void recordExistingSentence(
      std::vector<string_view>::iterator tokens_begin,
      std::vector<string_view>::iterator tokens_end,
      const char *sentence_begin);
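For orientation, a sketch of how these two construction modes pair up; the strings and token boundaries are invented for the example, and the include path is an assumption:

```cpp
#include <string>
#include <vector>
#include "translator/annotation.h"  // assumed include path

using marian::bergamot::AnnotatedText;

void sketch() {
  // Mode 1: build target text incrementally; token views must point into
  // storage outside of target.text.
  std::string decoded = "hello world";
  std::vector<marian::string_view> tokens = {
      marian::string_view(decoded.data(), 5),       // "hello"
      marian::string_view(decoded.data() + 5, 6)};  // " world"
  AnnotatedText target(std::string{});
  target.appendSentence("", tokens.begin(), tokens.end());
  target.appendEndingWhitespace("\n");

  // Mode 2: record sentences over text that already exists in full.
  AnnotatedText source(std::string(decoded));
  std::vector<marian::string_view> sourceTokens = {
      marian::string_view(source.text.data(), 5),
      marian::string_view(source.text.data() + 5, 6)};
  source.recordExistingSentence(sourceTokens.begin(), sourceTokens.end(),
                                source.text.data());
}
```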
  /// Returns the number of sentences in the annotation structure.
  const size_t numSentences() const { return annotation.numSentences(); }
@@ -132,46 +171,44 @@ public:
    return annotation.numWords(sentenceIdx);
  }

  /// Appends a sentence to the existing text and transparently rebases
  /// string_views
  void appendSentence(std::string prefix, std::string &reference,
                      std::vector<string_view> &wordRanges);

  /// Adds a sentence, used to load from SentencePiece annotations conveniently.
  void addSentence(std::vector<string_view> &wordRanges);

  /// Adds a sentence between two iterators, often useful while constructing
  /// from parts of a container.
  void addSentence(std::vector<string_view>::iterator begin,
                   std::vector<string_view>::iterator end);

  /// Returns a string_view representing wordIdx in sentenceIdx
  string_view word(size_t sentenceIdx, size_t wordIdx) const;
  string_view word(size_t sentenceIdx, size_t wordIdx) const {
    return asStringView(annotation.word(sentenceIdx, wordIdx));
  }

  /// Returns a string_view representing sentence corresponding to sentenceIdx.
  string_view sentence(size_t sentenceIdx) const;
  string_view sentence(size_t sentenceIdx) const {
    return asStringView(annotation.sentence(sentenceIdx));
  }

  /// Returns the string_view of the gap between two sentences in the container.
  ///
  /// More precisely where `i = sentenceIdx, N = numSentences()` for brevity:
  ///
  /// * For `i = 0`: The gap between the start of text and the first sentence.
  /// * For `i = 0`: The gap between the start of text and the 0th sentence.
  /// * For `i = 1...N-1`, returns the text comprising of the gap
  ///   between the `i-1`-th and `i`-th sentence.
  /// * For `i = N`, the gap between the last sentence and end of
  ///   between the `i`-th and `i+1`-th sentence.
  /// * For `i = N`, the gap between the last (N-1th) sentence and end of
  ///   text.

  /// @param sentenceIdx: Can be between `[0, numSentences()]`.
  string_view gap(size_t sentenceIdx) const;
  string_view gap(size_t sentenceIdx) const {
    return asStringView(annotation.gap(sentenceIdx));
  }

  /// Returns a ByteRange representing wordIdx in sentenceIdx
  ByteRange wordAsByteRange(size_t sentenceIdx, size_t wordIdx) const;
  ByteRange wordAsByteRange(size_t sentenceIdx, size_t wordIdx) const {
    return annotation.word(sentenceIdx, wordIdx);
  }

  /// Returns a ByteRange representing sentence corresponding to sentenceIdx.
  ByteRange sentenceAsByteRange(size_t sentenceIdx) const;
  ByteRange sentenceAsByteRange(size_t sentenceIdx) const {
    return annotation.sentence(sentenceIdx);
  }

private:
  string_view asStringView(const ByteRange &byteRange) const;
  string_view asStringView(const ByteRange &byteRange) const {
    return string_view(text.data() + byteRange.begin, byteRange.size());
  }
};

} // namespace bergamot
@@ -10,11 +10,11 @@ namespace marian {
namespace bergamot {

BatchTranslator::BatchTranslator(DeviceId const device,
                                 std::vector<Ptr<Vocab const>> &vocabs,
                                 Vocabs &vocabs,
                                 Ptr<Options> options,
                                 const AlignedMemory* modelMemory,
                                 const AlignedMemory* shortlistMemory)
    : device_(device), options_(options), vocabs_(&vocabs),
    : device_(device), options_(options), vocabs_(vocabs),
      modelMemory_(modelMemory), shortlistMemory_(shortlistMemory) {}

void BatchTranslator::initialize() {
@@ -22,17 +22,17 @@ void BatchTranslator::initialize() {
  bool check = options_->get<bool>("check-bytearray", false); // Flag holds whether to validate the bytearray (model and shortlist)
  if (options_->hasAndNotEmpty("shortlist")) {
    int srcIdx = 0, trgIdx = 1;
    bool shared_vcb = vocabs_->front() == vocabs_->back();
    bool shared_vcb = vocabs_.sources().front() == vocabs_.target(); // vocabs_.sources().front() is invoked as we currently only support one source vocab
    if (shortlistMemory_->size() > 0 && shortlistMemory_->begin() != nullptr) {
      slgen_ = New<data::BinaryShortlistGenerator>(shortlistMemory_->begin(), shortlistMemory_->size(),
                                                   vocabs_->front(), vocabs_->back(),
                                                   srcIdx, trgIdx, shared_vcb, check);
                                                   vocabs_.sources().front(), vocabs_.target(),
                                                   srcIdx, trgIdx, shared_vcb, check);
    }
    else {
      // Changed to BinaryShortlistGenerator to enable loading binary shortlist file
      // This class also supports text shortlist file
      slgen_ = New<data::BinaryShortlistGenerator>(options_, vocabs_->front(),
                                                   vocabs_->back(), srcIdx,
      slgen_ = New<data::BinaryShortlistGenerator>(options_, vocabs_.sources().front(),
                                                   vocabs_.target(), srcIdx,
                                                   trgIdx, shared_vcb);
    }
  }
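The Vocabs class replacing std::vector<Ptr<Vocab const>> is not itself part of this diff. Judging purely from the call sites above (vocabs_.sources().front(), vocabs_.target()), a plausible minimal shape is sketched below; this is hypothetical, the real definition lives in src/translator/vocabs.h:

```cpp
// Hypothetical sketch, not the actual class from src/translator/vocabs.h.
class Vocabs {
public:
  // Source vocabularies, one per input stream; currently only one is used.
  const std::vector<Ptr<Vocab const>>& sources() const { return srcVocabs_; }

  // The single target vocabulary, used for shortlists and decoding.
  const Ptr<Vocab const>& target() const { return trgVocab_; }

private:
  std::vector<Ptr<Vocab const>> srcVocabs_;
  Ptr<Vocab const> trgVocab_;
};
```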
@@ -97,7 +97,7 @@ void BatchTranslator::translate(Batch &batch) {
  std::vector<Ptr<SubBatch>> subBatches;
  for (size_t j = 0; j < maxDims.size(); ++j) {
    subBatches.emplace_back(
        New<SubBatch>(batchSize, maxDims[j], vocabs_->at(j)));
        New<SubBatch>(batchSize, maxDims[j], vocabs_.sources().at(j)));
  }

  std::vector<size_t> words(maxDims.size(), 0);
@@ -116,9 +116,8 @@ void BatchTranslator::translate(Batch &batch) {

  auto corpus_batch = Ptr<CorpusBatch>(new CorpusBatch(subBatches));
  corpus_batch->setSentenceIds(sentenceIds);

  auto trgVocab = vocabs_->back();
  auto search = New<BeamSearch>(options_, scorers_, trgVocab);

  auto search = New<BeamSearch>(options_, scorers_, vocabs_.target());

  auto histories = std::move(search->search(graph_, corpus_batch));
  batch.completeBatch(histories);

@@ -11,6 +11,7 @@
#include "request.h"
#include "translator/history.h"
#include "translator/scorers.h"
#include "vocabs.h"

#ifndef WASM_COMPATIBLE_SOURCE
#include "pcqueue.h"
@@ -34,7 +35,7 @@ public:
 * @param modelMemory byte array (aligned to 256!!!) that contains the bytes of a model.bin. Provide a nullptr if not used.
 * @param shortlistMemory byte array of shortlist (aligned to 64)
 */
explicit BatchTranslator(DeviceId const device, std::vector<Ptr<Vocab const>> &vocabs,
explicit BatchTranslator(DeviceId const device, Vocabs &vocabs,
                         Ptr<Options> options, const AlignedMemory* modelMemory, const AlignedMemory* shortlistMemory);

// convenience function for logging. TODO(jerin)
@@ -45,7 +46,7 @@ public:
private:
  Ptr<Options> options_;
  DeviceId device_;
  std::vector<Ptr<Vocab const>> *vocabs_;
  const Vocabs& vocabs_;
  Ptr<ExpressionGraph> graph_;
  std::vector<Ptr<Scorer>> scorers_;
  Ptr<data::ShortlistGenerator const> slgen_;
@@ -117,5 +117,13 @@ void getVocabsMemoryFromConfig(marian::Ptr<marian::Options> options,
  }
}

MemoryBundle getMemoryBundleFromConfig(marian::Ptr<marian::Options> options){
  MemoryBundle memoryBundle;
  memoryBundle.model = getModelMemoryFromConfig(options);
  memoryBundle.shortlist = getShortlistMemoryFromConfig(options);
  getVocabsMemoryFromConfig(options, memoryBundle.vocabs);
  return memoryBundle;
}

} // namespace bergamot
} // namespace marian

@@ -10,5 +10,6 @@ AlignedMemory getShortlistMemoryFromConfig(marian::Ptr<marian::Options> options)
void getVocabsMemoryFromConfig(marian::Ptr<marian::Options> options,
                               std::vector<std::shared_ptr<AlignedMemory>>& vocabMemories);
bool validateBinaryModel(const AlignedMemory& model, uint64_t fileSize);
MemoryBundle getMemoryBundleFromConfig(marian::Ptr<marian::Options> options);
} // namespace bergamot
} // namespace marian
@@ -15,6 +15,21 @@ typedef std::vector<Segment> Segments;
/// Shortcut to AlignedVector<char> for byte arrays
typedef AlignedVector<char> AlignedMemory;

/// Memory bundle for all byte-arrays.
/// Can be a set/subset of model, shortlist, vocabs and ssplitPrefixFile bytes.
struct MemoryBundle {
  AlignedMemory model;     ///< Byte-array of model (aligned to 256)
  AlignedMemory shortlist; ///< Byte-array of shortlist (aligned to 64)

  /// Vector of vocabulary memories (aligned to 64).
  /// If two vocabularies are the same (based on the filenames), two entries (shared
  /// pointers) will be generated which share the same AlignedMemory object.
  std::vector<std::shared_ptr<AlignedMemory>> vocabs;

  /// @todo Not implemented yet
  AlignedMemory ssplitPrefixFile;
};

} // namespace bergamot
} // namespace marian
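Putting MemoryBundle together with getMemoryBundleFromConfig() from byte_array_util above, construction of a Service typically looks like this sketch (options is a parsed marian options object, as in the service-cli main above; include paths are assumptions):

```cpp
#include "translator/byte_array_util.h"  // assumed include paths
#include "translator/definitions.h"
#include "translator/service.h"

void run(marian::Ptr<marian::Options> options) {
  // Load model, shortlist and vocabulary bytes up front ...
  marian::bergamot::MemoryBundle bundle =
      marian::bergamot::getMemoryBundleFromConfig(options);

  // ... and move them into Service; AlignedMemory members make the bundle
  // move-only, hence std::move.
  marian::bergamot::Service service(options, std::move(bundle));

  // Issue translation requests against `service` from here on.
}
```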
19  src/translator/project_version.h.in  (new file)
@@ -0,0 +1,19 @@
#pragma once

/*
 * File project_version.h is generated using CMake. Do not modify project_version.h manually!
 * Edit the project_version.h.in file instead.
 */

#include <string>

namespace marian {
namespace bergamot {

std::string bergamotBuildVersion() {
  // e.g. v1.2.3-alpha.1.1+abc123d
  return "@PROJECT_VERSION_STRING_FULL@";
}

} // namespace bergamot
} // namespace marian
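Once CMake substitutes PROJECT_VERSION_STRING_FULL into this template, client code can report the build it links against. A small sketch; the include path is an assumption:

```cpp
#include <iostream>
#include "translator/project_version.h"  // generated from project_version.h.in

int main() {
  // Prints something like "bergamot-translator v0.3.0+b73714e".
  std::cout << "bergamot-translator "
            << marian::bergamot::bergamotBuildVersion() << std::endl;
  return 0;
}
```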
@@ -65,33 +65,29 @@ void ResponseBuilder::buildTranslatedText(Histories &histories,

  Result result = onebest[0]; // Expecting only one result;
  Words words = std::get<0>(result);
  auto targetVocab = vocabs_->back();

  std::string decoded;
  std::vector<string_view> targetSentenceMappings;
  targetVocab->decodeWithByteRanges(words, decoded, targetSentenceMappings);
  vocabs_.target()->decodeWithByteRanges(words, decoded, targetSentenceMappings);

  switch (responseOptions_.concatStrategy) {
    case ConcatStrategy::FAITHFUL: {
      // For each sentence, prepend the filler text between the corresponding
      // source-sentence and the source-sentence before.
      string_view pre = response.source.gap(sentenceIdx);
      response.target.appendSentence(std::string(pre.data(), pre.size()),
                                     decoded, targetSentenceMappings);
      response.target.appendSentence(pre, targetSentenceMappings.begin(), targetSentenceMappings.end());

      // If this is the last history to be decoded and translated-text
      // constructed, append the text till the end, which could be spaces or
      // empty.
      if (sentenceIdx + 1 == histories.size()) {
        string_view post = response.source.gap(sentenceIdx + 1);
        response.target.text += std::string(post.data(), post.size());
        response.target.appendEndingWhitespace(response.source.gap(sentenceIdx + 1));
      }
      break;
    }
    case ConcatStrategy::SPACE: {
      std::string delimiter = (sentenceIdx == 0) ? "" : " ";
      response.target.appendSentence(delimiter, decoded,
                                     targetSentenceMappings);
      string_view delimiter = (sentenceIdx == 0) ? "" : " ";
      response.target.appendSentence(delimiter, targetSentenceMappings.begin(), targetSentenceMappings.end());
      break;
    }
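The practical difference between the two strategies, as a hypothetical illustration (the strings are invented for the example):

```cpp
// Source: "Hello.  World."  (two sentences, two spaces in the gap)
// ConcatStrategy::FAITHFUL reuses the source gaps verbatim:
//   target == "Bonjour.  Monde."
// ConcatStrategy::SPACE joins sentences with a single space:
//   target == "Bonjour. Monde."
```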
@@ -4,6 +4,7 @@
#include "data/types.h"
#include "response.h"
#include "response_options.h"
#include "vocabs.h"

// For now we will work with this, to avoid complaints another structure is hard
// to operate with.
@@ -24,10 +25,10 @@ public:
  /// @param [in] vocabs: marian vocab object (used in decoding)
  /// @param [in] promise: promise to set with the constructed Response.
  ResponseBuilder(ResponseOptions responseOptions, AnnotatedText &&source,
                  std::vector<Ptr<Vocab const>> &vocabs,
                  Vocabs &vocabs,
                  std::promise<Response> &&promise)
      : responseOptions_(responseOptions), source_(std::move(source)),
        vocabs_(&vocabs), promise_(std::move(promise)) {}
        vocabs_(vocabs), promise_(std::move(promise)) {}

  /// Constructs and sets the promise of a Response object from obtained
  /// histories after translating.
@@ -81,7 +82,7 @@ private:
  // Data members are context/curried args for the functor.

  ResponseOptions responseOptions_;
  std::vector<Ptr<Vocab const>> *vocabs_; // vocabs are required for decoding
  const Vocabs& vocabs_;                  // vocabs are required for decoding
                                          // and any source validation checks.
  std::promise<Response> promise_; // To be set when callback triggered and
                                   // after Response constructed.
@@ -5,50 +5,16 @@
#include <string>
#include <utility>

inline std::vector<marian::Ptr<const marian::Vocab>>
loadVocabularies(marian::Ptr<marian::Options> options,
                 std::vector<std::shared_ptr<marian::bergamot::AlignedMemory>>&& vocabMemories) {
  // @TODO: parallelize vocab loading for faster startup
  std::vector<marian::Ptr<marian::Vocab const>> vocabs;
  if(!vocabMemories.empty()){
    // load vocabs from buffer
    ABORT_IF(vocabMemories.size() < 2, "Insufficient number of vocabularies.");
    vocabs.resize(vocabMemories.size());
    for (size_t i = 0; i < vocabs.size(); i++) {
      marian::Ptr<marian::Vocab> vocab = marian::New<marian::Vocab>(options, i);
      vocab->loadFromSerialized(absl::string_view(vocabMemories[i]->begin(), vocabMemories[i]->size()));
      vocabs[i] = vocab;
    }
  } else {
    // load vocabs from file
    auto vfiles = options->get<std::vector<std::string>>("vocabs");
    // with the current setup, we need at least two vocabs: src and trg
    ABORT_IF(vfiles.size() < 2, "Insufficient number of vocabularies.");
    vocabs.resize(vfiles.size());
    std::unordered_map<std::string, marian::Ptr<marian::Vocab>> vmap;
    for (size_t i = 0; i < vocabs.size(); ++i) {
      auto m = vmap.emplace(std::make_pair(vfiles[i], marian::Ptr<marian::Vocab>()));
      if (m.second) { // new: load the vocab
        m.first->second = marian::New<marian::Vocab>(options, i);
        m.first->second->load(vfiles[i]);
      }
      vocabs[i] = m.first->second;
    }
  }
  return vocabs;
}

namespace marian {
namespace bergamot {

Service::Service(Ptr<Options> options, AlignedMemory modelMemory, AlignedMemory shortlistMemory,
                 std::vector<std::shared_ptr<AlignedMemory>> vocabMemories)
Service::Service(Ptr<Options> options, MemoryBundle memoryBundle)
    : requestId_(0), options_(options),
      vocabs_(std::move(loadVocabularies(options, std::move(vocabMemories)))),
      vocabs_(options, std::move(memoryBundle.vocabs)),
      text_processor_(vocabs_, options), batcher_(options),
      numWorkers_(options->get<int>("cpu-threads")),
      modelMemory_(std::move(modelMemory)),
      shortlistMemory_(std::move(shortlistMemory))
      modelMemory_(std::move(memoryBundle.model)),
      shortlistMemory_(std::move(memoryBundle.shortlist))
#ifndef WASM_COMPATIBLE_SOURCE
      // 0 elements in PCQueue is illegal and can lead to failures. Adding a
      // guard to have at least one entry allocated. In the single-threaded
@@ -9,6 +9,7 @@
#include "response_builder.h"
#include "text_processor.h"
#include "translator/parser.h"
#include "vocabs.h"

#ifndef WASM_COMPATIBLE_SOURCE
#include "pcqueue.h"
@@ -55,53 +56,29 @@ namespace bergamot {
/// // Do things with response.
/// ```
///
/// Optionally Service can be initialized by also passing model memory for
/// purposes of efficiency (which defaults to nullpointer and then reads from
/// Optionally Service can be initialized by also passing bytearray memories
/// for purposes of efficiency (which defaults to empty and then reads from
/// file supplied through config).
///
class Service {

public:
  /// Construct Service from Marian options. If memoryBundle is empty, Service is
  /// initialized from file-based loading. Otherwise, Service is initialized from
  /// the given bytearray memories.
  /// @param options Marian options object
  /// @param modelMemory byte array (aligned to 256!!!) that contains the bytes
  /// of a model.bin.
  /// @param shortlistMemory byte array of shortlist (aligned to 64)
  /// @param vocabMemories vector of vocabulary memories (aligned to 64)
  explicit Service(Ptr<Options> options, AlignedMemory modelMemory,
                   AlignedMemory shortlistMemory,
                   std::vector<std::shared_ptr<AlignedMemory>> vocabMemories);
  /// @param memoryBundle holds all byte-array memories. Can be a set/subset of
  /// model, shortlist, vocabs and ssplitPrefixFile bytes. Optional.
  explicit Service(Ptr<Options> options, MemoryBundle memoryBundle={});

  /// Construct Service purely from Options. This expects options which
  /// marian-decoder expects to be set for loading model, shortlist and
  /// vocabularies from files, in addition to parameters that set/unset desired
  /// features (e.g: alignments, quality-scores).
  ///
  /// This is equivalent to a call to:
  /// ```cpp
  /// Service(options, AlignedMemory(), AlignedMemory(), {})
  /// ```
  /// wherein empty memory is passed and internal flow defaults to file-based
  /// model, shortlist loading. AlignedMemory() corresponds to empty memory
  explicit Service(Ptr<Options> options)
      : Service(options, AlignedMemory(), AlignedMemory(), {}) {}

  /// Construct Service from a string configuration.
  /// @param [in] config string parsable as YAML expected to adhere with marian
  /// config
  /// @param [in] modelMemory byte array (aligned to 256!!!) that contains the
  /// bytes of a model.bin. Optional. AlignedMemory() corresponds to empty memory
  /// @param [in] shortlistMemory byte array of shortlist (aligned to 64). Optional.
  /// @param [in] vocabMemories vector of vocabulary memories (aligned to 64). Optional.
  /// If two vocabularies are the same (based on the filenames), two entries (shared
  /// pointers) will be generated which share the same AlignedMemory object.
  explicit Service(const std::string &config,
                   AlignedMemory modelMemory = AlignedMemory(),
                   AlignedMemory shortlistMemory = AlignedMemory(),
                   std::vector<std::shared_ptr<AlignedMemory>> vocabsMemories = {})
      : Service(parseOptions(config, /*validate=*/false),
                std::move(modelMemory),
                std::move(shortlistMemory),
                std::move(vocabsMemories)) {}
  /// Construct Service from a string configuration. If memoryBundle is empty, Service is
  /// initialized from file-based loading. Otherwise, Service is initialized from
  /// the given bytearray memories.
  /// @param [in] config string parsable as YAML expected to adhere with marian config
  /// @param [in] memoryBundle holds all byte-array memories. Can be a set/subset of
  /// model, shortlist, vocabs and ssplitPrefixFile bytes. Optional.
  explicit Service(const std::string &config, MemoryBundle memoryBundle={})
      : Service(parseOptions(config, /*validate=*/false), std::move(memoryBundle)) {}
|
||||
|
||||
/// Explicit destructor to clean up after any threads initialized in
|
||||
/// asynchronous operation mode.
|
||||
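For orientation, a minimal usage sketch of the two constructor flavours above. This is a hedged illustration, not library documentation: the config contents are abbreviated placeholders, and MemoryBundle is assumed to be default-constructible with the model/shortlist/vocabs fields used by the WASM bindings later in this commit.

```cpp
#include <memory>
#include <string>
#include <vector>

#include "translator/service.h"

using namespace marian::bergamot;

// File-based loading: the default (empty) MemoryBundle makes Service read
// model, shortlist and vocabularies from the paths named in the config.
void fromFiles(const std::string &configYAML) {
  Service service(configYAML);
}

// Byte-array loading: move previously loaded buffers into the bundle.
void fromMemory(const std::string &configYAML,
                AlignedMemory modelBytes,      // aligned to 256, per the docs above
                AlignedMemory shortlistBytes,  // aligned to 64
                std::vector<std::shared_ptr<AlignedMemory>> vocabBytes) {
  MemoryBundle bundle;
  bundle.model = std::move(modelBytes);
  bundle.shortlist = std::move(shortlistBytes);
  bundle.vocabs = std::move(vocabBytes);
  Service service(configYAML, std::move(bundle));
}
```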
@ -196,7 +173,7 @@ private:

  size_t requestId_;
  /// Store vocabs representing source and target.
  std::vector<Ptr<Vocab const>> vocabs_; // ORDER DEPENDENCY (text_processor_)
  Vocabs vocabs_; // ORDER DEPENDENCY (text_processor_)

  /// TextProcessor takes a blob of text, converts it into a format consumable
  /// by the batch-translator, and annotates sentences and words.
@ -4,7 +4,6 @@
#include "annotation.h"

#include "common/options.h"
#include "data/vocab.h"
#include <vector>

namespace marian {
@ -12,13 +11,14 @@ namespace bergamot {

Segment TextProcessor::tokenize(const string_view &segment,
                                std::vector<string_view> &wordRanges) {
  return vocabs_->front()->encodeWithByteRanges(
  // vocabs_.sources().front() is invoked as we currently only support one source vocab
  return vocabs_.sources().front()->encodeWithByteRanges(
      segment, wordRanges, /*addEOS=*/false, /*inference=*/true);
}

TextProcessor::TextProcessor(std::vector<Ptr<Vocab const>> &vocabs,
TextProcessor::TextProcessor(Vocabs &vocabs,
                             Ptr<Options> options)
    : vocabs_(&vocabs), sentence_splitter_(options) {
    : vocabs_(vocabs), sentence_splitter_(options) {

  max_length_break_ = options->get<int>("max-length-break");
  max_length_break_ = max_length_break_ - 1;
@ -41,15 +41,16 @@ void TextProcessor::process(AnnotatedText &source, Segments &segments) {
    // There are some cases where SentencePiece or vocab returns no words
    // after normalization. 0 prevents any empty entries from being added.
    if (segment.size() > 0) {
      // Truncate segment into max_input_size segments.
      truncate(segment, wordRanges, segments, source);
      // Wrap segment into sentences of at most max_length_break_ tokens and
      // tell source about them.
      wrap(segment, wordRanges, segments, source);
    }
  }
}

void TextProcessor::truncate(Segment &segment,
                             std::vector<string_view> &wordRanges,
                             Segments &segments, AnnotatedText &source) {
void TextProcessor::wrap(Segment &segment,
                         std::vector<string_view> &wordRanges,
                         Segments &segments, AnnotatedText &source) {
  for (size_t offset = 0; offset < segment.size();
       offset += max_length_break_) {
    auto start = segment.begin() + offset;
@ -61,7 +62,8 @@ void TextProcessor::truncate(Segment &segment,
    segments.back().push_back(sourceEosId());

    auto astart = wordRanges.begin() + offset;
    source.addSentence(astart, astart + diff);
    // diff > 0
    source.recordExistingSentence(astart, astart + diff, astart->data());
  }
}

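To make the wrapping arithmetic concrete: the constructor stores max_length_break_ as max-length-break minus one, presumably leaving one slot per wrapped sentence for the EOS token pushed above. A standalone sketch (illustrative only, not library code), assuming a 300-token segment and max-length-break of 128:

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdio>

int main() {
  const std::size_t segmentSize = 300;        // tokens in one segment (assumed)
  const std::size_t maxLengthBreak = 128 - 1; // mirrors max_length_break_ = value - 1
  for (std::size_t offset = 0; offset < segmentSize; offset += maxLengthBreak) {
    // Tokens [offset, offset + diff) form one wrapped sentence; EOS follows.
    const std::size_t diff = std::min(maxLengthBreak, segmentSize - offset);
    std::printf("sentence: tokens [%zu, %zu) + EOS\n", offset, offset + diff);
  }
  return 0;
}
// Prints [0, 127), [127, 254), [254, 300): three sentences, each EOS-terminated.
```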
@ -7,6 +7,7 @@
#include "annotation.h"

#include "sentence_splitter.h"
#include "vocabs.h"

#include <vector>

@ -21,7 +22,7 @@ class TextProcessor {
  // sentences (vector of words). In addition, the ByteRanges of the
  // source-tokens in unnormalized text are provided as string_views.
public:
  explicit TextProcessor(std::vector<Ptr<Vocab const>> &vocabs, Ptr<Options>);
  explicit TextProcessor(Vocabs &vocabs, Ptr<Options>);

  void process(AnnotatedText &source, Segments &segments);

@ -31,14 +32,15 @@ private:
  Segment tokenize(const string_view &input,
                   std::vector<string_view> &tokenRanges);

  // Truncate sentence into max_input_size segments.
  void truncate(Segment &sentence, std::vector<string_view> &tokenRanges,
                Segments &segments, AnnotatedText &source);
  // Wrap into sentences of at most max_length_break_ tokens and add to source.
  void wrap(Segment &sentence, std::vector<string_view> &tokenRanges,
            Segments &segments, AnnotatedText &source);

  // shorthand, used only in wrap()
  const Word sourceEosId() const { return vocabs_->front()->getEosId(); }
  // vocabs_.sources().front() is invoked as we currently only support one source vocab
  const Word sourceEosId() const { return vocabs_.sources().front()->getEosId(); }

  std::vector<Ptr<Vocab const>> *vocabs_;
  const Vocabs& vocabs_;
  SentenceSplitter sentence_splitter_;
  size_t max_length_break_;
};
81
src/translator/vocabs.h
Normal file
@ -0,0 +1,81 @@
#pragma once

namespace marian {
namespace bergamot {

/// Wrapper of Marian Vocab objects needed for the translator.
/// Holds multiple source vocabularies and one target vocabulary.
class Vocabs {
public:
  /// Construct a Vocabs object from either byte-arrays or files.
  Vocabs(Ptr<Options> options, std::vector<std::shared_ptr<AlignedMemory>>&& vocabMemories): options_(options){
    if (!vocabMemories.empty()){
      // load vocabs from buffer
      load(std::move(vocabMemories));
    }
    else{
      // load vocabs from file
      auto vocabPaths = options->get<std::vector<std::string>>("vocabs");
      load(vocabPaths);
    }
  }

  /// Get all source vocabularies (as a vector).
  const std::vector<Ptr<Vocab const>>& sources() const {
    return srcVocabs_;
  }

  /// Get the target vocabulary.
  const Ptr<Vocab const>& target() const {
    return trgVocab_;
  }

private:
  std::vector<Ptr<Vocab const>> srcVocabs_; // source vocabularies
  Ptr<Vocab const> trgVocab_; // target vocabulary
  Ptr<Options> options_;

  // load from buffer
  void load(std::vector<std::shared_ptr<AlignedMemory>>&& vocabMemories) {
    // At least two vocabs: src and trg
    ABORT_IF(vocabMemories.size() < 2, "Insufficient number of vocabularies.");
    srcVocabs_.resize(vocabMemories.size());
    // The hash map avoids loading the same vocab twice;
    // loading vocabs (either from buffers or files) is the biggest speed bottleneck.
    // uintptr_t holds unique keys (addresses) for shared_ptr<AlignedMemory>
    std::unordered_map<uintptr_t, Ptr<Vocab>> vmap;
    for (size_t i = 0; i < srcVocabs_.size(); i++) {
      auto m = vmap.emplace(std::make_pair(reinterpret_cast<uintptr_t>(vocabMemories[i].get()), Ptr<Vocab>()));
      if (m.second) { // new: load the vocab
        m.first->second = New<Vocab>(options_, i);
        m.first->second->loadFromSerialized(absl::string_view(vocabMemories[i]->begin(), vocabMemories[i]->size()));
      }
      srcVocabs_[i] = m.first->second;
    }
    // Initialize target vocab
    trgVocab_ = srcVocabs_.back();
    srcVocabs_.pop_back();
  }

  // load from file
  void load(const std::vector<std::string>& vocabPaths){
    // with the current setup, we need at least two vocabs: src and trg
    ABORT_IF(vocabPaths.size() < 2, "Insufficient number of vocabularies.");
    srcVocabs_.resize(vocabPaths.size());
    std::unordered_map<std::string, Ptr<Vocab>> vmap;
    for (size_t i = 0; i < srcVocabs_.size(); ++i) {
      auto m = vmap.emplace(std::make_pair(vocabPaths[i], Ptr<Vocab>()));
      if (m.second) { // new: load the vocab
        m.first->second = New<Vocab>(options_, i);
        m.first->second->load(vocabPaths[i]);
      }
      srcVocabs_[i] = m.first->second;
    }
    // Initialize target vocab
    trgVocab_ = srcVocabs_.back();
    srcVocabs_.pop_back();
  }
};

} // namespace bergamot
} // namespace marian
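A minimal usage sketch of the new class (hypothetical paths; parseOptions from translator/parser.h is assumed to be in scope): when both config entries name the same file, the map above loads it only once, so sources() ends up holding a single entry that aliases the object returned by target().

```cpp
using marian::bergamot::Vocabs;

// Config where source and target share one SentencePiece model, as in the
// esen model used by the test page.
auto options = marian::bergamot::parseOptions(
    "vocabs:\n"
    "  - /esen/vocab.esen.spm\n"
    "  - /esen/vocab.esen.spm\n",
    /*validate=*/false);

Vocabs vocabs(options, {}); // empty memory vector: load from the config paths
const auto &src = vocabs.sources().front(); // the single deduplicated source vocab
const auto &trg = vocabs.target();          // same underlying Vocab object here
```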
@ -4,6 +4,10 @@ add_executable(bergamot-translator-worker
  bindings/TranslationResultBindings.cpp
)

# Generate version file that can be included in the wasm artifacts
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/project_version.js.in
               ${CMAKE_CURRENT_SOURCE_DIR}/project_version.js @ONLY)

# This header inclusion needs to go away later as path to public headers of bergamot
# translator should be directly available from "bergamot-translator" target
target_include_directories(bergamot-translator-worker
@ -14,18 +18,14 @@ target_include_directories(bergamot-translator-worker
target_compile_definitions(bergamot-translator-worker PRIVATE WASM_BINDINGS)
target_compile_options(bergamot-translator-worker PRIVATE ${WASM_COMPILE_FLAGS})

set(LINKER_FLAGS "-g2 --bind -s ASSERTIONS=0 -s DISABLE_EXCEPTION_CATCHING=1 -s FORCE_FILESYSTEM=1 -s ALLOW_MEMORY_GROWTH=1 -s NO_DYNAMIC_EXECUTION=1 -s EXPORTED_RUNTIME_METHODS=[addOnPreMain]")
if (NOT PACKAGE_DIR STREQUAL "")
  get_filename_component(REALPATH_PACKAGE_DIR ${PACKAGE_DIR} REALPATH BASE_DIR ${CMAKE_BINARY_DIR})
  set(LINKER_FLAGS "${LINKER_FLAGS} --preload-file ${REALPATH_PACKAGE_DIR}@/")
endif()

# Enable worker file system
set(LINKER_FLAGS "${LINKER_FLAGS} -lworkerfs.js")
set(LINKER_FLAGS "-g2 --bind -s ASSERTIONS=0 -s DISABLE_EXCEPTION_CATCHING=1 -s ALLOW_MEMORY_GROWTH=1 -s NO_DYNAMIC_EXECUTION=1 -s EXPORTED_RUNTIME_METHODS=[addOnPreMain]")

# Avoid node.js-code in emscripten glue-code
set(LINKER_FLAGS "${LINKER_FLAGS} -s ENVIRONMENT=web,worker")

# Append version information in the Javascript artifact
set(LINKER_FLAGS "${LINKER_FLAGS} --extern-pre-js ${CMAKE_CURRENT_SOURCE_DIR}/project_version.js")

set_target_properties(bergamot-translator-worker PROPERTIES
                      SUFFIX ".js"
                      LINK_FLAGS ${LINKER_FLAGS}
@ -7,8 +7,8 @@ Please note that [Using JS APIs](#Using-JS-APIs) and [Demo](#Demo) section below

```bash
cd test_page
mkdir models
git clone --depth 1 --branch main --single-branch https://github.com/mozilla-applied-ml/bergamot-models
mkdir models
cp -rf bergamot-models/prod/* models
gunzip models/*/*
```
@ -18,10 +18,7 @@ gunzip models/*/*
```js
// The model configuration as YAML formatted string. For available configuration options, please check: https://marian-nmt.github.io/docs/cmd/marian-decoder/
// This example captures some of the most relevant options
const modelConfig = `vocabs:
- /esen/vocab.esen.spm
- /esen/vocab.esen.spm
beam-size: 1
const modelConfig = `beam-size: 1
normalize: 1.0
word-penalty: 0
max-length-break: 128
@ -35,19 +32,31 @@ quiet-translation: true
gemm-precision: int8shift
`;

// Download model and shortlist files and read them into buffers
// Download model, shortlist and vocabulary files and read them into buffers
const modelFile = `models/esen/model.esen.intgemm.alphas.bin`;
const shortlistFile = `models/esen/lex.50.50.esen.s2t.bin`;
const downloadedBuffers = await Promise.all([downloadAsArrayBuffer(modelFile), downloadAsArrayBuffer(shortlistFile)]); // Please refer to bergamot.html in test_page folder for this function
const vocabFiles = [`models/${languagePair}/vocab.${vocabLanguagePair}.spm`,
                    `models/${languagePair}/vocab.${vocabLanguagePair}.spm`];
const uniqueVocabFiles = new Set(vocabFiles);

// Please refer to bergamot.html in test_page folder for downloadAsArrayBuffer function
const downloadedBuffers = await Promise.all([downloadAsArrayBuffer(modelFile), downloadAsArrayBuffer(shortlistFile)]);
const downloadedVocabBuffers = [];
for (let item of uniqueVocabFiles.values()) {
  downloadedVocabBuffers.push(await downloadAsArrayBuffer(item));
}
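// downloadAsArrayBuffer itself is defined in bergamot.html; a minimal
// illustrative sketch of such a helper (an assumption, not the actual code):
async function downloadAsArrayBuffer(url) {
  const response = await fetch(url);
  if (!response.ok) throw new Error(`Failed to fetch ${url}`);
  return response.arrayBuffer();
}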

const modelBuffer = downloadedBuffers[0];
const shortListBuffer = downloadedBuffers[1];

// Construct AlignedMemory instances from the buffers
var alignedModelMemory = constructAlignedMemoryFromBuffer(modelBuffer, 256); // Please refer to bergamot.html in test_page folder for this function
var alignedShortlistMemory = constructAlignedMemoryFromBuffer(shortListBuffer, 64); // Please refer to bergamot.html in test_page folder for this function
var alignedVocabsMemoryList = new Module.AlignedMemoryList;
downloadedVocabBuffers.forEach(item => alignedVocabsMemoryList.push_back(constructAlignedMemoryFromBuffer(item, 64)));

// Instantiate the TranslationModel
const model = new Module.TranslationModel(modelConfig, alignedModelMemory, alignedShortlistMemory);
const model = new Module.TranslationModel(modelConfig, alignedModelMemory, alignedShortlistMemory, alignedVocabsMemoryList);

// Instantiate the arguments of translate() API i.e. TranslationRequest and input (vector<string>)
const request = new Module.TranslationRequest();
@ -48,14 +48,22 @@ std::vector<std::shared_ptr<AlignedMemory>> prepareVocabsSmartMemories(std::vect
  return vocabsSmartMemories;
}

marian::bergamot::MemoryBundle prepareMemoryBundle(AlignedMemory* modelMemory,
                                                   AlignedMemory* shortlistMemory,
                                                   std::vector<AlignedMemory*> uniqueVocabsMemories){
  marian::bergamot::MemoryBundle memoryBundle;
  memoryBundle.model = std::move(*modelMemory);
  memoryBundle.shortlist = std::move(*shortlistMemory);
  memoryBundle.vocabs = std::move(prepareVocabsSmartMemories(uniqueVocabsMemories));

  return memoryBundle;
}

TranslationModel* TranslationModelFactory(const std::string &config,
                                          AlignedMemory* modelMemory,
                                          AlignedMemory* shortlistMemory,
                                          std::vector<AlignedMemory*> uniqueVocabsMemories) {
  return new TranslationModel(config,
                              std::move(*modelMemory),
                              std::move(*shortlistMemory),
                              std::move(prepareVocabsSmartMemories(uniqueVocabsMemories)));
  return new TranslationModel(config, std::move(prepareMemoryBundle(modelMemory, shortlistMemory, uniqueVocabsMemories)));
}

EMSCRIPTEN_BINDINGS(translation_model) {
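For clarity, a hedged sketch of the ownership transfer at the call site above: prepareMemoryBundle moves the contents out of the JS-owned AlignedMemory objects, leaving them in a moved-from state.

```cpp
// Names reuse the factory parameters above; illustrative only.
marian::bergamot::MemoryBundle bundle =
    prepareMemoryBundle(modelMemory, shortlistMemory, uniqueVocabsMemories);
// bundle.model / bundle.shortlist / bundle.vocabs now own the buffers, while
// *modelMemory and *shortlistMemory are empty shells. The std::move wrapped
// around prepareMemoryBundle(...) in TranslationModelFactory is redundant
// (the call expression is already an rvalue) but harmless.
```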
1
wasm/project_version.js.in
Normal file
@ -0,0 +1 @@
var BERGAMOT_VERSION_FULL = "@PROJECT_VERSION_STRING_FULL@";