mirror of
https://github.com/browsermt/bergamot-translator.git
synced 2024-10-26 05:43:59 +03:00
Merge remote-tracking branch 'upstream/main' into upstream-sync
This commit is contained in:
commit
1574a4586c
32
.github/workflows/macos-custom-marian-native.yml
vendored
32
.github/workflows/macos-custom-marian-native.yml
vendored
@ -1,32 +0,0 @@
|
|||||||
# Native macOS build of bergamot-translator using the project's default CMake
# options (no -D overrides are passed to cmake).
name: MacOS Native (Custom)

# Run for pushes to, and pull requests against, the listed branches.
on:
  push:
    branches:
      - main
      - ci-sandbox
  pull_request:
    branches:
      - main
      - ci-sandbox

jobs:
  build-macos:
    name: Native (With Custom Marian)
    runs-on: macos-10.15

    steps:
      # Submodules are checked out recursively together with the repository.
      - name: Checkout
        uses: actions/checkout@v2
        with:
          submodules: recursive

      # Out-of-source configure inside build-native/.
      - name: Configure CMake
        run: |
          mkdir -p build-native
          cd build-native
          cmake ..

      - name: Compile
        working-directory: build-native
        run: make -j2

      # Smoke test: the freshly built app must at least start and report its version.
      - name: Print versions
        working-directory: build-native
        run: ./app/bergamot-translator-app --version
|
|
33
.github/workflows/native-custom_marian-mac.yml
vendored
Normal file
33
.github/workflows/native-custom_marian-mac.yml
vendored
Normal file
@ -0,0 +1,33 @@
|
|||||||
|
# Native macOS build of bergamot-translator using the project's default CMake
# options (no -D overrides are passed to cmake).
name: Native (Custom Marian) MacOS

# Run for pushes to, and pull requests against, the listed branches.
on:
  push:
    branches:
      - main
      - ci-sandbox
  pull_request:
    branches:
      - main
      - ci-sandbox

jobs:
  build-macos:
    name: MacOS
    # NOTE(review): GitHub has retired older macOS runner images over time;
    # confirm that the macos-10.15 label still resolves to a hosted runner.
    runs-on: macos-10.15

    steps:
      # Submodules are checked out recursively together with the repository.
      - name: Checkout
        uses: actions/checkout@v2
        with:
          submodules: recursive

      # Out-of-source configure inside build/.
      - name: Configure CMake
        run: |
          mkdir -p build
          cd build
          cmake ..

      - name: Compile
        working-directory: build
        run: make -j2

      # Smoke test: the freshly built app must at least start and report its version.
      - name: Print versions
        working-directory: build
        run: |
          ./app/bergamot-translator-app --version
|
33
.github/workflows/native-custom_marian-ubuntu.yml
vendored
Normal file
33
.github/workflows/native-custom_marian-ubuntu.yml
vendored
Normal file
@ -0,0 +1,33 @@
|
|||||||
|
# Native Ubuntu build of bergamot-translator using the project's default CMake
# options (no -D overrides are passed to cmake).
name: Native (Custom Marian) Ubuntu

# Run for pushes to, and pull requests against, the listed branches.
on:
  push:
    branches: [ main, ci-sandbox ]
  pull_request:
    branches: [ main, ci-sandbox ]

jobs:
  # The job id names the platform it actually runs on (ubuntu-latest).
  build-ubuntu:
    name: Ubuntu
    runs-on: ubuntu-latest

    steps:
      # Submodules are checked out recursively together with the repository.
      - name: Checkout
        uses: actions/checkout@v2
        with:
          submodules: recursive

      # Out-of-source configure inside build/.
      - name: Configure CMake
        run: |
          mkdir -p build
          cd build
          cmake ..

      - name: Compile
        working-directory: build
        run: make -j2

      # Smoke test: the freshly built app must at least start and report its version.
      - name: Print versions
        working-directory: build
        run: |
          ./app/bergamot-translator-app --version
|
@ -1,4 +1,4 @@
|
|||||||
name: MacOS
|
name: Native (Full Marian) MacOS
|
||||||
|
|
||||||
on:
|
on:
|
||||||
push:
|
push:
|
||||||
@ -39,16 +39,15 @@ jobs:
|
|||||||
-DUSE_FBGEMM=on \
|
-DUSE_FBGEMM=on \
|
||||||
-DUSE_SENTENCEPIECE=on \
|
-DUSE_SENTENCEPIECE=on \
|
||||||
-DUSE_STATIC_LIBS=off \
|
-DUSE_STATIC_LIBS=off \
|
||||||
-DUSE_WASM_COMPATIBLE_SOURCES=off
|
-DUSE_WASM_COMPATIBLE_SOURCE=off
|
||||||
|
|
||||||
- name: Compile
|
- name: Compile
|
||||||
working-directory: build
|
working-directory: build
|
||||||
run: make -j2
|
run: make -j2
|
||||||
|
|
||||||
# Removing unit-tests, taken care of in browsermt/marian-dev
|
- name: Run unit tests
|
||||||
# - name: Run unit tests
|
working-directory: build
|
||||||
# - working-directory: build
|
run: make test
|
||||||
# - run: make test
|
|
||||||
|
|
||||||
- name: Print versions
|
- name: Print versions
|
||||||
working-directory: build
|
working-directory: build
|
@ -1,10 +1,10 @@
|
|||||||
name: Ubuntu
|
name: Native (Full Marian) Ubuntu
|
||||||
|
|
||||||
on:
|
on:
|
||||||
push:
|
push:
|
||||||
branches: [ main, ci-sandbox ]
|
branches: [ main, ci-test ]
|
||||||
pull_request:
|
pull_request:
|
||||||
branches: [ main, ci-sandbox ]
|
branches: [ main, ci-test ]
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
build-ubuntu:
|
build-ubuntu:
|
||||||
@ -15,7 +15,7 @@ jobs:
|
|||||||
- name: "Ubuntu CPU-only"
|
- name: "Ubuntu CPU-only"
|
||||||
os: ubuntu-latest
|
os: ubuntu-latest
|
||||||
cuda: ""
|
cuda: ""
|
||||||
gcc: 7
|
gcc: 8
|
||||||
cpu: true
|
cpu: true
|
||||||
gpu: false
|
gpu: false
|
||||||
# GPU Builds are commented out, for bergamot-translator CI runs.
|
# GPU Builds are commented out, for bergamot-translator CI runs.
|
||||||
@ -62,7 +62,7 @@ jobs:
|
|||||||
# No need to install libprotobuf{17,10,9v5} on Ubuntu {20,18,16}.04 because
|
# No need to install libprotobuf{17,10,9v5} on Ubuntu {20,18,16}.04 because
|
||||||
# it is installed together with libprotobuf-dev
|
# it is installed together with libprotobuf-dev
|
||||||
- name: Install dependencies
|
- name: Install dependencies
|
||||||
run: sudo apt-get install -y libgoogle-perftools-dev libprotobuf-dev protobuf-compiler libboost-all-dev
|
run: sudo apt-get update && sudo apt-get install -y libgoogle-perftools-dev libprotobuf-dev protobuf-compiler libboost-all-dev g++-8
|
||||||
|
|
||||||
# https://software.intel.com/content/www/us/en/develop/articles/installing-intel-free-libs-and-python-apt-repo.html
|
# https://software.intel.com/content/www/us/en/develop/articles/installing-intel-free-libs-and-python-apt-repo.html
|
||||||
- name: Install MKL
|
- name: Install MKL
|
||||||
@ -97,19 +97,17 @@ jobs:
|
|||||||
-DUSE_FBGEMM=${{ matrix.cpu }} \
|
-DUSE_FBGEMM=${{ matrix.cpu }} \
|
||||||
-DUSE_SENTENCEPIECE=on \
|
-DUSE_SENTENCEPIECE=on \
|
||||||
-DUSE_STATIC_LIBS=on \
|
-DUSE_STATIC_LIBS=on \
|
||||||
-DUSE_WASM_COMPATIBLE_SOURCES=off
|
-DUSE_WASM_COMPATIBLE_SOURCE=off
|
||||||
|
|
||||||
- name: Compile
|
- name: Compile
|
||||||
working-directory: build
|
working-directory: build
|
||||||
run: make -j2
|
run: make -j2
|
||||||
|
|
||||||
# Removing unit-tests, taken care of in browsermt/marian-dev
|
- name: Run unit tests
|
||||||
# TODO: add a flag to CMake to compile unit tests only on CPU
|
working-directory: build
|
||||||
# - name: Run unit tests
|
run: make test
|
||||||
# working-directory: build
|
# GitHub-hosted VMs do not have GPUs, so can not be run in CUDA builds
|
||||||
# run: make test
|
if: matrix.gpu == false
|
||||||
# # GitHub-hosted VMs do not have GPUs, so can not be run in CUDA builds
|
|
||||||
# if: matrix.gpu == false
|
|
||||||
|
|
||||||
- name: Print versions
|
- name: Print versions
|
||||||
working-directory: build
|
working-directory: build
|
@ -1,4 +1,4 @@
|
|||||||
name: MacOS WASM (Custom)
|
name: WASM (Custom Marian) MacOS
|
||||||
|
|
||||||
on:
|
on:
|
||||||
push:
|
push:
|
||||||
@ -8,7 +8,7 @@ on:
|
|||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
build-wasm:
|
build-wasm:
|
||||||
name: WASM (With Custom Marian)
|
name: WASM (Custom Marian) MacOS
|
||||||
runs-on: macos-10.15
|
runs-on: macos-10.15
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
@ -35,10 +35,7 @@ jobs:
|
|||||||
|
|
||||||
- name: Instantiate simd wormhole
|
- name: Instantiate simd wormhole
|
||||||
working-directory: build-wasm
|
working-directory: build-wasm
|
||||||
run: |
|
run: bash ../wasm/patch-artifacts-enable-wormhole.sh
|
||||||
sed -i.bak 's/var result = WebAssembly.instantiateStreaming(response, info);/var result = WebAssembly.instantiateStreaming(response, info, {simdWormhole:true});/g' wasm/bergamot-translator-worker.js
|
|
||||||
sed -i.bak 's/return WebAssembly.instantiate(binary, info);/return WebAssembly.instantiate(binary, info, {simdWormhole:true});/g' wasm/bergamot-translator-worker.js
|
|
||||||
sed -i.bak 's/var module = new WebAssembly.Module(bytes);/var module = new WebAssembly.Module(bytes, {simdWormhole:true});/g' wasm/bergamot-translator-worker.js
|
|
||||||
|
|
||||||
- name: Check artifacts
|
- name: Check artifacts
|
||||||
working-directory: build-wasm
|
working-directory: build-wasm
|
51
.github/workflows/wasm-custom_marian-ubuntu.yml
vendored
Normal file
51
.github/workflows/wasm-custom_marian-ubuntu.yml
vendored
Normal file
@ -0,0 +1,51 @@
|
|||||||
|
# WebAssembly build of bergamot-translator on Ubuntu: sets up Emscripten,
# compiles with COMPILE_WASM=on, patches the generated artifacts and checks
# that the .wasm/.js outputs exist.
name: WASM (Custom Marian) Ubuntu

# Run for pushes to, and pull requests against, main.
on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main

jobs:
  build-wasm:
    name: WASM (Custom Marian) Ubuntu
    runs-on: ubuntu-latest

    steps:
      - name: Setup Emscripten toolchain
        uses: mymindstorm/setup-emsdk@v8

      # Fails early if the compiler is not on PATH after the setup step.
      - name: Verify Emscripten setup
        run: emcc -v

      # Submodules are checked out recursively together with the repository.
      - name: Checkout
        uses: actions/checkout@v2
        with:
          submodules: recursive

      # Out-of-source configure inside build-wasm/ through the emcmake wrapper.
      - name: Configure builds
        run: |
          mkdir -p build-wasm
          cd build-wasm
          emcmake cmake -DCOMPILE_WASM=on ..

      - name: Compile
        working-directory: build-wasm
        run: emmake make -j2

      # Post-processes the generated artifacts with the repository's patch script.
      - name: Instantiate simd wormhole
        working-directory: build-wasm
        run: bash ../wasm/patch-artifacts-enable-wormhole.sh

      # The job fails unless at least one .wasm and one .js file were produced.
      - name: Check artifacts
        working-directory: build-wasm
        run: |
          export WASM_ARTIFACTS_DIR=wasm
          ls -all ${WASM_ARTIFACTS_DIR}
          if ls ${WASM_ARTIFACTS_DIR}/*.wasm &>/dev/null && ls ${WASM_ARTIFACTS_DIR}/*.js &>/dev/null
          then
            echo "Artifacts Successfully Generated"
          else
            echo "Failure: Artifacts Not Present"
            exit 1
          fi
|
2
3rd_party/marian-dev
vendored
2
3rd_party/marian-dev
vendored
@ -1 +1 @@
|
|||||||
Subproject commit 8ddb73fad1001ae4c1697d2514ac1e5bd43e2ed3
|
Subproject commit 0f0bcf99626c660227bb68b76267a8d2451e7172
|
1
BERGAMOT_VERSION
Normal file
1
BERGAMOT_VERSION
Normal file
@ -0,0 +1 @@
|
|||||||
|
v0.0.0
|
@ -13,29 +13,21 @@ include(CMakeDependentOption)
|
|||||||
|
|
||||||
# Project specific cmake options
|
# Project specific cmake options
|
||||||
option(COMPILE_WASM "Compile for WASM" OFF)
|
option(COMPILE_WASM "Compile for WASM" OFF)
|
||||||
option(USE_WASM_COMPATIBLE_SOURCES "Use wasm compatible sources" ON)
|
option(USE_WASM_COMPATIBLE_SOURCE "Use wasm compatible sources" ON)
|
||||||
|
option(COMPILE_TESTS "Compile bergamot-tests" OFF)
|
||||||
|
|
||||||
SET(PACKAGE_DIR "" CACHE STRING "Directory including all the files to be packaged (pre-loaded) in wasm builds")
|
SET(PACKAGE_DIR "" CACHE STRING "Directory including all the files to be packaged (pre-loaded) in wasm builds")
|
||||||
|
|
||||||
# Set marian (3rd party submodule) cmake options to compile for this project
|
# Set 3rd party submodule specific cmake options for this project
|
||||||
SET(COMPILE_CUDA OFF CACHE BOOL "Compile GPU version")
|
SET(COMPILE_CUDA OFF CACHE BOOL "Compile GPU version")
|
||||||
SET(USE_SENTENCEPIECE ON CACHE BOOL "Download and compile SentencePiece")
|
SET(USE_SENTENCEPIECE ON CACHE BOOL "Download and compile SentencePiece")
|
||||||
SET(USE_STATIC_LIBS ON CACHE BOOL "Link statically against non-system libs")
|
SET(USE_STATIC_LIBS ON CACHE BOOL "Link statically against non-system libs")
|
||||||
if (USE_WASM_COMPATIBLE_SOURCES)
|
if (USE_WASM_COMPATIBLE_SOURCE)
|
||||||
# If using wasm compatible marian then set following flags
|
|
||||||
SET(COMPILE_LIBRARY_ONLY ON CACHE BOOL "Build only the Marian library and exclude all executables.")
|
SET(COMPILE_LIBRARY_ONLY ON CACHE BOOL "Build only the Marian library and exclude all executables.")
|
||||||
SET(USE_MKL OFF CACHE BOOL "Compile with MKL support")
|
SET(USE_MKL OFF CACHE BOOL "Compile with MKL support")
|
||||||
SET(COMPILE_DECODER_ONLY ON CACHE BOOL "Compile marian-decoder only")
|
# # Setting the ssplit-cpp submodule specific cmake options for wasm
|
||||||
SET(COMPILE_WITH_PTHREADS OFF CACHE BOOL "Compile with pthreads support")
|
SET(USE_INTERNAL_PCRE2 ON CACHE BOOL "Use internal PCRE2 instead of system PCRE2")
|
||||||
SET(USE_WASM_COMPATIBLE_BLAS ON CACHE BOOL "Compile with a WASM compatible blas for decoder only builds")
|
|
||||||
SET(COMPILE_WITHOUT_EXCEPTIONS ON CACHE BOOL "Compile without exceptions")
|
|
||||||
if(COMPILE_WASM)
|
|
||||||
# Set WORMHOLE to ON for marian whenever compiling for wasm platform
|
|
||||||
SET(WORMHOLE ON CACHE BOOL "Use WASM wormhole in intgemm https://bugzilla.mozilla.org/show_bug.cgi?id=1672160")
|
|
||||||
endif()
|
endif()
|
||||||
endif()
|
|
||||||
# Set ssplit (3rd party submodule) cmake options to compile for this project
|
|
||||||
CMAKE_DEPENDENT_OPTION(USE_INTERNAL_PCRE2 "Use internal PCRE2 instead of system PCRE2" ON
|
|
||||||
"USE_WASM_COMPATIBLE_SOURCES" OFF)
|
|
||||||
|
|
||||||
# Documentation: https://cliutils.gitlab.io/modern-cmake/chapters/projects/submodule.html
|
# Documentation: https://cliutils.gitlab.io/modern-cmake/chapters/projects/submodule.html
|
||||||
# Ensures the submodules are set correctly during a build.
|
# Ensures the submodules are set correctly during a build.
|
||||||
@ -60,11 +52,17 @@ if(NOT COMPILE_WASM)
|
|||||||
endif()
|
endif()
|
||||||
|
|
||||||
if(COMPILE_WASM)
|
if(COMPILE_WASM)
|
||||||
|
set(WORMHOLE ON CACHE BOOL "Use WASM wormhole in intgemm https://bugzilla.mozilla.org/show_bug.cgi?id=1672160")
|
||||||
list(APPEND WASM_COMPILE_FLAGS -pthread -O3 -g2 -fPIC -mssse3 -msimd128)
|
list(APPEND WASM_COMPILE_FLAGS -pthread -O3 -g2 -fPIC -mssse3 -msimd128)
|
||||||
list(APPEND WASM_COMPILE_FLAGS "SHELL:-s WASM=1" "SHELL:-s ASSERTIONS=0" "SHELL:-s DISABLE_EXCEPTION_CATCHING=1" "SHELL:-s LLD_REPORT_UNDEFINED" "SHELL:-s FORCE_FILESYSTEM=1" "SHELL:-s ALLOW_MEMORY_GROWTH=1")
|
list(APPEND WASM_COMPILE_FLAGS "SHELL:-s WASM=1" "SHELL:-s ASSERTIONS=0" "SHELL:-s DISABLE_EXCEPTION_CATCHING=1" "SHELL:-s LLD_REPORT_UNDEFINED" "SHELL:-s FORCE_FILESYSTEM=1" "SHELL:-s ALLOW_MEMORY_GROWTH=1")
|
||||||
list(APPEND WASM_COMPILE_FLAGS -Wno-error=pthreads-mem-growth)
|
list(APPEND WASM_COMPILE_FLAGS -Wno-error=pthreads-mem-growth)
|
||||||
endif(COMPILE_WASM)
|
endif(COMPILE_WASM)
|
||||||
|
|
||||||
|
# Needs to be enabled before including the folder containing tests (src/tests)
|
||||||
|
if(COMPILE_TESTS)
|
||||||
|
enable_testing()
|
||||||
|
endif(COMPILE_TESTS)
|
||||||
|
|
||||||
add_subdirectory(3rd_party)
|
add_subdirectory(3rd_party)
|
||||||
add_subdirectory(src)
|
add_subdirectory(src)
|
||||||
|
|
||||||
@ -73,3 +71,4 @@ if(COMPILE_WASM)
|
|||||||
else()
|
else()
|
||||||
add_subdirectory(app)
|
add_subdirectory(app)
|
||||||
endif(COMPILE_WASM)
|
endif(COMPILE_WASM)
|
||||||
|
|
||||||
|
2494
Doxyfile.in
Normal file
2494
Doxyfile.in
Normal file
File diff suppressed because it is too large
Load Diff
13
README.md
13
README.md
@ -77,17 +77,16 @@ Bergamot translator provides a unified API for ([Marian NMT](https://marian-nmt.
|
|||||||
emmake make -j
|
emmake make -j
|
||||||
```
|
```
|
||||||
|
|
||||||
|
The wasm artifacts (.js and .wasm files) will be available in the `wasm` folder of the build directory ("build-wasm" in this case).
|
||||||
|
|
||||||
3. Enable SIMD Wormhole via Wasm instantiation API in generated artifacts
|
3. Enable SIMD Wormhole via Wasm instantiation API in generated artifacts
|
||||||
|
```bash
|
||||||
|
bash ../wasm/patch-artifacts-enable-wormhole.sh
|
||||||
```
|
```
|
||||||
sed -i.bak 's/var result = WebAssembly.instantiateStreaming(response, info);/var result = WebAssembly.instantiateStreaming(response, info, {simdWormhole:true});/g' wasm/bergamot-translator-worker.js
|
|
||||||
sed -i.bak 's/return WebAssembly.instantiate(binary, info);/return WebAssembly.instantiate(binary, info, {simdWormhole:true});/g' wasm/bergamot-translator-worker.js
|
|
||||||
sed -i.bak 's/var module = new WebAssembly.Module(bytes);/var module = new WebAssembly.Module(bytes, {simdWormhole:true});/g' wasm/bergamot-translator-worker.js
|
|
||||||
```
|
|
||||||
The artefacts (.js and .wasm files) will be available in `wasm` folder of build directory ("build-wasm" in this case).
|
|
||||||
|
|
||||||
#### Recompiling
|
#### Recompiling
|
||||||
As long as you don't update any submodule, just follow steps in `4.ii` to recompile.\
|
As long as you don't update any submodule, just follow steps in `4.ii` and `4.iii` to recompile.\
|
||||||
If you update a submodule, execute following command before executing steps in `4.ii` to recompile.
|
If you update a submodule, execute the following command before executing the steps in `4.ii` and `4.iii` to recompile.
|
||||||
```bash
|
```bash
|
||||||
git submodule update --init --recursive
|
git submodule update --init --recursive
|
||||||
```
|
```
|
||||||
|
@ -1,10 +1,16 @@
|
|||||||
add_executable(bergamot-translator-app main.cpp)
|
add_executable(bergamot-translator-app bergamot-translator-app.cpp)
|
||||||
target_link_libraries(bergamot-translator-app PRIVATE bergamot-translator)
|
target_link_libraries(bergamot-translator-app PRIVATE bergamot-translator)
|
||||||
|
|
||||||
if (NOT USE_WASM_COMPATIBLE_SOURCES)
|
add_executable(bergamot-translator-app-bytearray bergamot-translator-app-bytearray.cpp)
|
||||||
add_executable(service-cli main-mts.cpp)
|
target_link_libraries(bergamot-translator-app-bytearray PRIVATE bergamot-translator)
|
||||||
|
|
||||||
|
if (NOT USE_WASM_COMPATIBLE_SOURCE)
|
||||||
|
add_executable(service-cli service-cli.cpp)
|
||||||
target_link_libraries(service-cli PRIVATE bergamot-translator)
|
target_link_libraries(service-cli PRIVATE bergamot-translator)
|
||||||
|
|
||||||
|
add_executable(service-cli-bytearray service-cli-bytearray.cpp)
|
||||||
|
target_link_libraries(service-cli-bytearray PRIVATE bergamot-translator)
|
||||||
|
|
||||||
add_executable(marian-decoder-new marian-decoder-new.cpp)
|
add_executable(marian-decoder-new marian-decoder-new.cpp)
|
||||||
target_link_libraries(marian-decoder-new PRIVATE bergamot-translator)
|
target_link_libraries(marian-decoder-new PRIVATE bergamot-translator)
|
||||||
endif()
|
endif()
|
||||||
|
42
app/bergamot-translator-app-bytearray.cpp
Normal file
42
app/bergamot-translator-app-bytearray.cpp
Normal file
@ -0,0 +1,42 @@
|
|||||||
|
/*
|
||||||
|
* bergamot-translator-app-bytearray.cpp
|
||||||
|
*
|
||||||
|
* An example application to demonstrate the use of Bergamot translator.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <iostream>
|
||||||
|
|
||||||
|
#include "TranslationModel.h"
|
||||||
|
#include "translator/parser.h"
|
||||||
|
#include "translator/byte_array_util.h"
|
||||||
|
|
||||||
|
int main(int argc, char **argv) {
|
||||||
|
|
||||||
|
// Create a configParser and load command line parameters into a YAML config
|
||||||
|
// string.
|
||||||
|
auto configParser = marian::bergamot::createConfigParser();
|
||||||
|
auto options = configParser.parseOptions(argc, argv, true);
|
||||||
|
std::string config = options->asYamlString();
|
||||||
|
|
||||||
|
// Route the config string to construct marian model through TranslationModel
|
||||||
|
TranslationModel model(config, marian::bergamot::getModelMemoryFromConfig(options));
|
||||||
|
|
||||||
|
TranslationRequest translationRequest;
|
||||||
|
std::vector<std::string> texts;
|
||||||
|
|
||||||
|
for (std::string line; std::getline(std::cin, line);) {
|
||||||
|
texts.emplace_back(line);
|
||||||
|
}
|
||||||
|
|
||||||
|
auto results = model.translate(std::move(texts), translationRequest);
|
||||||
|
|
||||||
|
// Resolve the future and get the actual result
|
||||||
|
//std::vector<TranslationResult> results = futureResults.get();
|
||||||
|
|
||||||
|
for (auto &result : results) {
|
||||||
|
std::cout << result.getTranslatedText() << std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
43
app/bergamot-translator-app.cpp
Normal file
43
app/bergamot-translator-app.cpp
Normal file
@ -0,0 +1,43 @@
|
|||||||
|
/*
|
||||||
|
* bergamot-translator-app.cpp
|
||||||
|
*
|
||||||
|
* An application which accepts line separated texts in stdin and returns translated ones in stdout.
|
||||||
|
* It is convenient for batch processing and can be used with tools like SacreBLEU.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <iostream>
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
#include "TranslationModel.h"
|
||||||
|
#include "translator/parser.h"
|
||||||
|
|
||||||
|
int main(int argc, char **argv) {
|
||||||
|
|
||||||
|
// Create a configParser and load command line parameters into a YAML config
|
||||||
|
// string.
|
||||||
|
auto configParser = marian::bergamot::createConfigParser();
|
||||||
|
auto options = configParser.parseOptions(argc, argv, true);
|
||||||
|
std::string config = options->asYamlString();
|
||||||
|
|
||||||
|
// Route the config string to construct marian model through TranslationModel
|
||||||
|
TranslationModel model(config);
|
||||||
|
|
||||||
|
TranslationRequest translationRequest;
|
||||||
|
std::vector<std::string> texts;
|
||||||
|
|
||||||
|
for (std::string line; std::getline(std::cin, line);) {
|
||||||
|
texts.emplace_back(line);
|
||||||
|
}
|
||||||
|
|
||||||
|
auto results = model.translate(std::move(texts), translationRequest);
|
||||||
|
|
||||||
|
// Resolve the future and get the actual result
|
||||||
|
//std::vector<TranslationResult> results = futureResults.get();
|
||||||
|
|
||||||
|
for (auto &result : results) {
|
||||||
|
std::cout << result.getTranslatedText() << std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
@ -1,33 +0,0 @@
|
|||||||
#include <cstdlib>
|
|
||||||
#include <future>
|
|
||||||
#include <iostream>
|
|
||||||
#include <sstream>
|
|
||||||
|
|
||||||
#include "common/definitions.h"
|
|
||||||
#include "common/utils.h"
|
|
||||||
#include "marian.h"
|
|
||||||
#include "translator/parser.h"
|
|
||||||
#include "translator/response.h"
|
|
||||||
#include "translator/service.h"
|
|
||||||
|
|
||||||
int main(int argc, char *argv[]) {
|
|
||||||
auto cp = marian::bergamot::createConfigParser();
|
|
||||||
auto options = cp.parseOptions(argc, argv, true);
|
|
||||||
marian::bergamot::Service service(options);
|
|
||||||
|
|
||||||
// Read a large input text blob from stdin
|
|
||||||
std::ostringstream std_input;
|
|
||||||
std_input << std::cin.rdbuf();
|
|
||||||
std::string input = std_input.str();
|
|
||||||
using marian::bergamot::Response;
|
|
||||||
|
|
||||||
// Wait on future until Response is complete
|
|
||||||
std::future<Response> responseFuture = service.translate(std::move(input));
|
|
||||||
responseFuture.wait();
|
|
||||||
Response response = responseFuture.get();
|
|
||||||
std::cout << response.translation() << std::endl;
|
|
||||||
|
|
||||||
// Stop Service.
|
|
||||||
service.stop();
|
|
||||||
return 0;
|
|
||||||
}
|
|
67
app/main.cpp
67
app/main.cpp
@ -1,67 +0,0 @@
|
|||||||
/*
|
|
||||||
* main.cpp
|
|
||||||
*
|
|
||||||
* An example application to demonstrate the use of Bergamot translator.
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include <iostream>
|
|
||||||
|
|
||||||
#include "AbstractTranslationModel.h"
|
|
||||||
#include "TranslationRequest.h"
|
|
||||||
#include "TranslationResult.h"
|
|
||||||
#include "translator/parser.h"
|
|
||||||
|
|
||||||
int main(int argc, char **argv) {
|
|
||||||
|
|
||||||
// Create a configParser and load command line parameters into a YAML config
|
|
||||||
// string.
|
|
||||||
auto configParser = marian::bergamot::createConfigParser();
|
|
||||||
auto options = configParser.parseOptions(argc, argv, true);
|
|
||||||
std::string config = options->asYamlString();
|
|
||||||
|
|
||||||
// Route the config string to construct marian model through
|
|
||||||
// AbstractTranslationModel
|
|
||||||
std::shared_ptr<AbstractTranslationModel> model =
|
|
||||||
AbstractTranslationModel::createInstance(config);
|
|
||||||
|
|
||||||
TranslationRequest translationRequest;
|
|
||||||
std::vector<std::string> texts;
|
|
||||||
texts.emplace_back(
|
|
||||||
"The Bergamot project will add and improve client-side machine "
|
|
||||||
"translation in a web browser. Unlike current cloud-based "
|
|
||||||
"options, running directly on users’ machines empowers citizens to "
|
|
||||||
"preserve their privacy and increases the uptake of language "
|
|
||||||
"technologies in Europe in various sectors that require "
|
|
||||||
"confidentiality.");
|
|
||||||
texts.emplace_back(
|
|
||||||
"Free software integrated with an open-source web "
|
|
||||||
"browser, such as Mozilla Firefox, will enable bottom-up adoption "
|
|
||||||
"by non-experts, resulting in cost savings for private and public "
|
|
||||||
"sector users who would otherwise procure translation or operate "
|
|
||||||
"monolingually. Bergamot is a consortium coordinated by the "
|
|
||||||
"University of Edinburgh with partners Charles University in "
|
|
||||||
"Prague, the University of Sheffield, University of Tartu, and "
|
|
||||||
"Mozilla.");
|
|
||||||
|
|
||||||
auto results = model->translate(std::move(texts), translationRequest);
|
|
||||||
|
|
||||||
// Resolve the future and get the actual result
|
|
||||||
//std::vector<TranslationResult> results = futureResults.get();
|
|
||||||
|
|
||||||
for (auto &result : results) {
|
|
||||||
std::cout << "[original]: " << result.getOriginalText() << std::endl;
|
|
||||||
std::cout << "[translated]: " << result.getTranslatedText() << std::endl;
|
|
||||||
auto mappings = result.getSentenceMappings();
|
|
||||||
for (auto &p : mappings) {
|
|
||||||
std::string_view src = p.first;
|
|
||||||
std::string_view tgt = p.second;
|
|
||||||
|
|
||||||
std::cout << " [src Sentence]: " << src << std::endl;
|
|
||||||
std::cout << " [tgt Sentence]: " << tgt << std::endl;
|
|
||||||
}
|
|
||||||
std::cout << std::endl;
|
|
||||||
}
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
}
|
|
@ -14,25 +14,11 @@
|
|||||||
#include "translator/response.h"
|
#include "translator/response.h"
|
||||||
#include "translator/service.h"
|
#include "translator/service.h"
|
||||||
|
|
||||||
void marian_decoder_minimal(const marian::Histories &histories,
|
void marian_decoder_minimal(const marian::bergamot::Response &response,
|
||||||
marian::Ptr<marian::Vocab const> targetVocab,
|
|
||||||
marian::Ptr<marian::Options> options) {
|
marian::Ptr<marian::Options> options) {
|
||||||
|
// We are no longer marian-decoder compatible. Server ideas are on hold.
|
||||||
bool doNbest = options->get<bool>("n-best");
|
for (size_t sentenceIdx = 0; sentenceIdx < response.size(); sentenceIdx++) {
|
||||||
auto collector =
|
std::cout << response.target.sentence(sentenceIdx) << "\n";
|
||||||
marian::New<marian::OutputCollector>(options->get<std::string>("output"));
|
|
||||||
|
|
||||||
// There is a dependency of vocabs here.
|
|
||||||
auto printer = marian::New<marian::OutputPrinter>(options, targetVocab);
|
|
||||||
if (options->get<bool>("quiet-translation"))
|
|
||||||
collector->setPrintingStrategy(marian::New<marian::QuietPrinting>());
|
|
||||||
|
|
||||||
for (auto &history : histories) {
|
|
||||||
std::stringstream best1;
|
|
||||||
std::stringstream bestn;
|
|
||||||
printer->print(history, best1, bestn);
|
|
||||||
collector->Write((long)history->getLineNum(), best1.str(), bestn.str(),
|
|
||||||
doNbest);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -53,9 +39,8 @@ int main(int argc, char *argv[]) {
|
|||||||
responseFuture.wait();
|
responseFuture.wait();
|
||||||
const Response &response = responseFuture.get();
|
const Response &response = responseFuture.get();
|
||||||
|
|
||||||
marian_decoder_minimal(response.histories(), service.targetVocab(), options);
|
marian_decoder_minimal(response, options);
|
||||||
|
|
||||||
LOG(info, "Total time: {:.5f}s wall", decoderTimer.elapsed());
|
LOG(info, "Total time: {:.5f}s wall", decoderTimer.elapsed());
|
||||||
service.stop();
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
86
app/service-cli-bytearray.cpp
Normal file
86
app/service-cli-bytearray.cpp
Normal file
@ -0,0 +1,86 @@
|
|||||||
|
#include <cstdlib>
|
||||||
|
#include <future>
|
||||||
|
#include <iostream>
|
||||||
|
#include <sstream>
|
||||||
|
|
||||||
|
#include "common/definitions.h"
|
||||||
|
#include "common/utils.h"
|
||||||
|
#include "marian.h"
|
||||||
|
#include "translator/parser.h"
|
||||||
|
#include "translator/response.h"
|
||||||
|
#include "translator/service.h"
|
||||||
|
#include "translator/byte_array_util.h"
|
||||||
|
|
||||||
|
int main(int argc, char *argv[]) {
|
||||||
|
auto cp = marian::bergamot::createConfigParser();
|
||||||
|
auto options = cp.parseOptions(argc, argv, true);
|
||||||
|
|
||||||
|
// Prepare memories for model and shortlist
|
||||||
|
marian::bergamot::AlignedMemory modelBytes = marian::bergamot::getModelMemoryFromConfig(options);
|
||||||
|
marian::bergamot::AlignedMemory shortlistBytes = marian::bergamot::getShortlistMemoryFromConfig(options);
|
||||||
|
|
||||||
|
marian::bergamot::Service service(options, std::move(modelBytes), std::move(shortlistBytes));
|
||||||
|
|
||||||
|
// Read a large input text blob from stdin
|
||||||
|
std::ostringstream std_input;
|
||||||
|
std_input << std::cin.rdbuf();
|
||||||
|
std::string input = std_input.str();
|
||||||
|
using marian::bergamot::Response;
|
||||||
|
|
||||||
|
// Wait on future until Response is complete
|
||||||
|
std::future<Response> responseFuture = service.translate(std::move(input));
|
||||||
|
responseFuture.wait();
|
||||||
|
Response response = responseFuture.get();
|
||||||
|
|
||||||
|
std::cout << "[original]: " << response.source.text << '\n';
|
||||||
|
std::cout << "[translated]: " << response.target.text << '\n';
|
||||||
|
for (int sentenceIdx = 0; sentenceIdx < response.size(); sentenceIdx++) {
|
||||||
|
std::cout << " [src Sentence]: " << response.source.sentence(sentenceIdx)
|
||||||
|
<< '\n';
|
||||||
|
std::cout << " [tgt Sentence]: " << response.target.sentence(sentenceIdx)
|
||||||
|
<< '\n';
|
||||||
|
std::cout << "Alignments" << '\n';
|
||||||
|
typedef std::pair<size_t, float> Point;
|
||||||
|
|
||||||
|
// Initialize a point vector.
|
||||||
|
std::vector<std::vector<Point>> aggregate(
|
||||||
|
response.source.numWords(sentenceIdx));
|
||||||
|
|
||||||
|
// Handle alignments
|
||||||
|
auto &alignments = response.alignments[sentenceIdx];
|
||||||
|
for (auto &p : alignments) {
|
||||||
|
aggregate[p.src].emplace_back(p.tgt, p.prob);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (size_t src = 0; src < aggregate.size(); src++) {
|
||||||
|
std::cout << response.source.word(sentenceIdx, src) << ": ";
|
||||||
|
for (auto &p : aggregate[src]) {
|
||||||
|
std::cout << response.target.word(sentenceIdx, p.first) << "("
|
||||||
|
<< p.second << ") ";
|
||||||
|
}
|
||||||
|
std::cout << '\n';
|
||||||
|
}
|
||||||
|
|
||||||
|
// Handle quality.
|
||||||
|
auto &quality = response.qualityScores[sentenceIdx];
|
||||||
|
std::cout << "Quality: whole(" << quality.sequence
|
||||||
|
<< "), tokens below:" << '\n';
|
||||||
|
size_t wordIdx = 0;
|
||||||
|
bool first = true;
|
||||||
|
for (auto &p : quality.word) {
|
||||||
|
if (first) {
|
||||||
|
first = false;
|
||||||
|
} else {
|
||||||
|
std::cout << " ";
|
||||||
|
}
|
||||||
|
std::cout << response.target.word(sentenceIdx, wordIdx) << "(" << p
|
||||||
|
<< ")";
|
||||||
|
wordIdx++;
|
||||||
|
}
|
||||||
|
std::cout << '\n';
|
||||||
|
}
|
||||||
|
std::cout << "--------------------------\n";
|
||||||
|
std::cout << '\n';
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
80
app/service-cli.cpp
Normal file
80
app/service-cli.cpp
Normal file
@ -0,0 +1,80 @@
|
|||||||
|
#include <cstdlib>
|
||||||
|
#include <future>
|
||||||
|
#include <iostream>
|
||||||
|
#include <sstream>
|
||||||
|
|
||||||
|
#include "common/definitions.h"
|
||||||
|
#include "common/utils.h"
|
||||||
|
#include "marian.h"
|
||||||
|
#include "translator/parser.h"
|
||||||
|
#include "translator/response.h"
|
||||||
|
#include "translator/service.h"
|
||||||
|
|
||||||
|
int main(int argc, char *argv[]) {
|
||||||
|
auto cp = marian::bergamot::createConfigParser();
|
||||||
|
auto options = cp.parseOptions(argc, argv, true);
|
||||||
|
marian::bergamot::Service service(options);
|
||||||
|
|
||||||
|
// Read a large input text blob from stdin
|
||||||
|
std::ostringstream std_input;
|
||||||
|
std_input << std::cin.rdbuf();
|
||||||
|
std::string input = std_input.str();
|
||||||
|
using marian::bergamot::Response;
|
||||||
|
|
||||||
|
// Wait on future until Response is complete
|
||||||
|
std::future<Response> responseFuture = service.translate(std::move(input));
|
||||||
|
responseFuture.wait();
|
||||||
|
Response response = responseFuture.get();
|
||||||
|
|
||||||
|
std::cout << "[original]: " << response.source.text << '\n';
|
||||||
|
std::cout << "[translated]: " << response.target.text << '\n';
|
||||||
|
for (int sentenceIdx = 0; sentenceIdx < response.size(); sentenceIdx++) {
|
||||||
|
std::cout << " [src Sentence]: " << response.source.sentence(sentenceIdx)
|
||||||
|
<< '\n';
|
||||||
|
std::cout << " [tgt Sentence]: " << response.target.sentence(sentenceIdx)
|
||||||
|
<< '\n';
|
||||||
|
std::cout << "Alignments" << '\n';
|
||||||
|
typedef std::pair<size_t, float> Point;
|
||||||
|
|
||||||
|
// Initialize a point vector.
|
||||||
|
std::vector<std::vector<Point>> aggregate(
|
||||||
|
response.source.numWords(sentenceIdx));
|
||||||
|
|
||||||
|
// Handle alignments
|
||||||
|
auto &alignments = response.alignments[sentenceIdx];
|
||||||
|
for (auto &p : alignments) {
|
||||||
|
aggregate[p.src].emplace_back(p.tgt, p.prob);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (size_t src = 0; src < aggregate.size(); src++) {
|
||||||
|
std::cout << response.source.word(sentenceIdx, src) << ": ";
|
||||||
|
for (auto &p : aggregate[src]) {
|
||||||
|
std::cout << response.target.word(sentenceIdx, p.first) << "("
|
||||||
|
<< p.second << ") ";
|
||||||
|
}
|
||||||
|
std::cout << '\n';
|
||||||
|
}
|
||||||
|
|
||||||
|
// Handle quality.
|
||||||
|
auto &quality = response.qualityScores[sentenceIdx];
|
||||||
|
std::cout << "Quality: whole(" << quality.sequence
|
||||||
|
<< "), tokens below:" << '\n';
|
||||||
|
size_t wordIdx = 0;
|
||||||
|
bool first = true;
|
||||||
|
for (auto &p : quality.word) {
|
||||||
|
if (first) {
|
||||||
|
first = false;
|
||||||
|
} else {
|
||||||
|
std::cout << " ";
|
||||||
|
}
|
||||||
|
std::cout << response.target.word(sentenceIdx, wordIdx) << "(" << p
|
||||||
|
<< ")";
|
||||||
|
wordIdx++;
|
||||||
|
}
|
||||||
|
std::cout << '\n';
|
||||||
|
}
|
||||||
|
std::cout << "--------------------------\n";
|
||||||
|
std::cout << '\n';
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
4
doc/.gitignore
vendored
Normal file
4
doc/.gitignore
vendored
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
api
|
||||||
|
build
|
||||||
|
doxygen
|
||||||
|
venv
|
51
doc/README.md
Normal file
51
doc/README.md
Normal file
@ -0,0 +1,51 @@
|
|||||||
|
# Marian NMT code documentation and library API
|
||||||
|
|
||||||
|
This directory contains code documentation and library API for developers of Marian NMT.
|
||||||
|
|
||||||
|
The documentation is generated using
|
||||||
|
[Sphinx](https://www.sphinx-doc.org/en/master/usage/quickstart.html) +
|
||||||
|
[Breathe](https://breathe.readthedocs.io/en/latest/directives.html) +
|
||||||
|
[Doxygen](http://www.doxygen.nl/manual/docblocks.html) +
|
||||||
|
[Exhale](https://exhale.readthedocs.io/en/latest/usage.html).
|
||||||
|
The documentation source code is written in `.rst` or `.md` files with special directives that allow
|
||||||
|
referencing C++ source code and documentation. The source documents are then built into static
|
||||||
|
HTML pages.
|
||||||
|
|
||||||
|
|
||||||
|
## Installation
|
||||||
|
|
||||||
|
On Ubuntu 20.04, install the following packages:
|
||||||
|
|
||||||
|
sudo apt-get install python3 python3-pip python3-setuptools doxygen
|
||||||
|
|
||||||
|
Then set up a Python environment and install modules:
|
||||||
|
|
||||||
|
pip3 install virtualenv
|
||||||
|
virtualenv venv -p python3
|
||||||
|
source venv/bin/activate
|
||||||
|
pip install -r requirements.txt
|
||||||
|
|
||||||
|
Documentation building should also work on Windows, but it has not been tested.
|
||||||
|
|
||||||
|
|
||||||
|
## Generation
|
||||||
|
|
||||||
|
The documentation can be generated by running:
|
||||||
|
|
||||||
|
make html
|
||||||
|
|
||||||
|
The website will be generated into `build/html` and can be accessed by opening _index.html_ in your
|
||||||
|
browser.
|
||||||
|
|
||||||
|
Directories:
|
||||||
|
|
||||||
|
- `build` - automatically generated output directory for HTML documentation
|
||||||
|
- `doxygen` - automatically generated Doxygen XML files
|
||||||
|
- `api` - automatic library API generated with Exhale
|
||||||
|
- `.rst` and `.md` files in this directory and its subdirectories are documentation source files
|
||||||
|
- `_static` - custom CSS and JavaScript files
|
||||||
|
|
||||||
|
|
||||||
|
## Writing documentation
|
||||||
|
|
||||||
|
To be documented...
|
4
doc/_static/css/custom.css
vendored
Normal file
4
doc/_static/css/custom.css
vendored
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
.wy-body-for-nav > .wy-grid-for-nav > .wy-nav-side {
|
||||||
|
border-bottom: 5px solid #28bbee;
|
||||||
|
/*background-color: #494d55;*/
|
||||||
|
}
|
120
doc/conf.py
Normal file
120
doc/conf.py
Normal file
@ -0,0 +1,120 @@
|
|||||||
|
# Configuration file for the Sphinx documentation builder.
|
||||||
|
#
|
||||||
|
# This file only contains a selection of the most common options. For a full
|
||||||
|
# list see the documentation:
|
||||||
|
# https://www.sphinx-doc.org/en/master/usage/configuration.html
|
||||||
|
|
||||||
|
# -- Path setup --------------------------------------------------------------
|
||||||
|
|
||||||
|
# If extensions (or modules to document with autodoc) are in another directory,
|
||||||
|
# add these directories to sys.path here. If the directory is relative to the
|
||||||
|
# documentation root, use os.path.abspath to make it absolute, like shown here.
|
||||||
|
#
|
||||||
|
import os
|
||||||
|
import datetime
|
||||||
|
import sys
|
||||||
|
|
||||||
|
sys.path.insert(0, os.path.abspath('.'))
|
||||||
|
|
||||||
|
|
||||||
|
# -- Project information -----------------------------------------------------
|
||||||
|
|
||||||
|
project = 'Bergamot Translator'
|
||||||
|
copyright = '2021, Bergamot Translator Team'
|
||||||
|
author = 'Bergamot Translator Team'
|
||||||
|
|
||||||
|
# The full version, including alpha/beta/rc tags
|
||||||
|
# TODO: add GitHub commit hash to the version
|
||||||
|
version_file = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'BERGAMOT_VERSION')
|
||||||
|
with open(os.path.abspath(version_file)) as f:
|
||||||
|
version = f.read().strip()
|
||||||
|
release = version + ' ' + str(datetime.date.today())
|
||||||
|
|
||||||
|
|
||||||
|
# -- General configuration ---------------------------------------------------
|
||||||
|
|
||||||
|
# Add any Sphinx extension module names here, as strings. They can be
|
||||||
|
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
|
||||||
|
# ones.
|
||||||
|
extensions = [
|
||||||
|
'sphinx.ext.imgmath',
|
||||||
|
'sphinx.ext.todo',
|
||||||
|
'breathe',
|
||||||
|
'exhale',
|
||||||
|
'recommonmark',
|
||||||
|
]
|
||||||
|
|
||||||
|
# Add any paths that contain templates here, relative to this directory.
|
||||||
|
templates_path = ['_templates']
|
||||||
|
|
||||||
|
# List of patterns, relative to source directory, that match files and
|
||||||
|
# directories to ignore when looking for source files.
|
||||||
|
# This pattern also affects html_static_path and html_extra_path.
|
||||||
|
exclude_patterns = [
|
||||||
|
'build',
|
||||||
|
'doxygen',
|
||||||
|
'venv',
|
||||||
|
'README.md',
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
# -- Options for HTML output -------------------------------------------------
|
||||||
|
|
||||||
|
# The theme to use for HTML and HTML Help pages. See the documentation for
|
||||||
|
# a list of builtin themes.
|
||||||
|
#
|
||||||
|
html_theme = 'sphinx_rtd_theme'
|
||||||
|
htmlhelp_basename = 'bergamot-translator'
|
||||||
|
|
||||||
|
# Add any paths that contain custom static files (such as style sheets) here,
|
||||||
|
# relative to this directory. They are copied after the builtin static files,
|
||||||
|
# so a file named "default.css" will overwrite the builtin "default.css".
|
||||||
|
html_static_path = ['_static']
|
||||||
|
html_css_files = ['css/custom.css']
|
||||||
|
|
||||||
|
# The base URL which points to the root of the HTML documentation
|
||||||
|
html_baseurl = 'http://jerinphilip.github.io/bergamot-translator'
|
||||||
|
|
||||||
|
|
||||||
|
# -- Extension configuration -------------------------------------------------
|
||||||
|
|
||||||
|
breathe_projects = { 'bergamot-translator': './doxygen/xml' }
|
||||||
|
breathe_default_project = 'bergamot-translator'
|
||||||
|
|
||||||
|
doxygen_config = """
|
||||||
|
INPUT = ../src
|
||||||
|
EXCLUDE += ../3rd_party
|
||||||
|
EXCLUDE += ../src/tests
|
||||||
|
EXCLUDE_PATTERNS = *.md *.txt
|
||||||
|
FILE_PATTERNS += *.cu
|
||||||
|
EXTENSION_MAPPING += cu=C++ inc=C++
|
||||||
|
ENABLE_PREPROCESSING = YES
|
||||||
|
JAVADOC_AUTOBRIEF = YES
|
||||||
|
WARN_IF_UNDOCUMENTED = NO
|
||||||
|
"""
|
||||||
|
|
||||||
|
exhale_args = {
|
||||||
|
'containmentFolder' : './api',
|
||||||
|
'rootFileName' : 'library_index.rst',
|
||||||
|
'rootFileTitle' : 'Library API',
|
||||||
|
'doxygenStripFromPath' : '..',
|
||||||
|
'createTreeView' : True,
|
||||||
|
'exhaleExecutesDoxygen' : True,
|
||||||
|
'exhaleDoxygenStdin' : doxygen_config.strip(),
|
||||||
|
}
|
||||||
|
|
||||||
|
primary_domain = 'cpp'
|
||||||
|
highlight_language = 'cpp'
|
||||||
|
|
||||||
|
# A trick to include markdown files from outside the source directory using
|
||||||
|
# 'mdinclude'. Warning: all other markdown files not included via 'mdinclude'
|
||||||
|
# will be rendered using recommonmark as recommended by Sphinx
|
||||||
|
from m2r import MdInclude
|
||||||
|
|
||||||
|
def setup(app):
    """Register what the ``mdinclude`` directive from m2r needs.

    Adds the four m2r configuration values (each defaulting to ``False`` and
    registered with the ``'env'`` rebuild setting) and then the ``mdinclude``
    directive itself.

    :param app: the application object passed in by Sphinx; only its
        ``add_config_value`` and ``add_directive`` methods are used here.
    """
    # from m2r to make `mdinclude` work
    m2r_options = (
        'no_underscore_emphasis',
        'm2r_parse_relative_links',
        'm2r_anonymous_references',
        'm2r_disable_inline_math',
    )
    for option_name in m2r_options:
        app.add_config_value(option_name, False, 'env')
    app.add_directive('mdinclude', MdInclude)
|
38
doc/index.rst
Normal file
38
doc/index.rst
Normal file
@ -0,0 +1,38 @@
|
|||||||
|
Welcome to Bergamot Translator's documentation!
|
||||||
|
===============================================
|
||||||
|
|
||||||
|
|buildcpu| |tests| |release| |license|
|
||||||
|
|
||||||
|
Bergamot translator provides a unified API for (Marian NMT framework based)
|
||||||
|
neural machine translation functionality in accordance with the Bergamot
|
||||||
|
project that focuses on improving client-side machine translation in a web
|
||||||
|
browser.
|
||||||
|
|
||||||
|
This is developer documentation.
|
||||||
|
|
||||||
|
.. toctree::
|
||||||
|
:maxdepth: 2
|
||||||
|
:caption: Contents:
|
||||||
|
|
||||||
|
marian-integration
|
||||||
|
api/library_index
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
Indices and tables
|
||||||
|
------------------
|
||||||
|
|
||||||
|
* :ref:`genindex`
|
||||||
|
|
||||||
|
|
||||||
|
.. |buildcpu| image:: https://img.shields.io/jenkins/s/http/vali.inf.ed.ac.uk/jenkins/view/browsermt/job/bergamot-translator.svg?label=CPU%20Build
|
||||||
|
:target: http://vali.inf.ed.ac.uk/jenkins/job/bergamot-translator
|
||||||
|
:alt: CPU build status
|
||||||
|
|
||||||
|
.. |tests| image:: https://img.shields.io/jenkins/s/http/vali.inf.ed.ac.uk/jenkins/view/marian/job/bergamot-translator-regression-tests.svg?label=Tests
|
||||||
|
:target: http://vali.inf.ed.ac.uk/jenkins/job/bergamot-translator-regression-tests/
|
||||||
|
:alt: Tests status
|
||||||
|
|
||||||
|
.. |license| image:: https://img.shields.io/badge/License-MPL%202.0-brightgreen.svg
|
||||||
|
:target: https://opensource.org/licenses/MPL-2.0
|
||||||
|
:alt: License: MPL
|
35
doc/make.bat
Normal file
35
doc/make.bat
Normal file
@ -0,0 +1,35 @@
|
|||||||
|
@ECHO OFF

pushd %~dp0

REM Command file for Sphinx documentation
REM
REM Usage: make.bat ^<target^>   (for example: make.bat html)
REM Runs "sphinx-build -M <target>" on the documentation in this directory
REM and writes the result below BUILDDIR. With no target, prints the
REM sphinx-build help.

if "%SPHINXBUILD%" == "" (
	set SPHINXBUILD=sphinx-build
)
REM conf.py and index.rst live in the same directory as this script, so the
REM Sphinx source directory is the current directory, not a "source" subfolder.
set SOURCEDIR=.
set BUILDDIR=build

if "%1" == "" goto help

REM Probe for sphinx-build; errorlevel 9009 means the command was not found.
%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
	echo.
	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
	echo.installed, then set the SPHINXBUILD environment variable to point
	echo.to the full path of the 'sphinx-build' executable. Alternatively you
	echo.may add the Sphinx directory to PATH.
	echo.
	echo.If you don't have Sphinx installed, grab it from
	echo.http://sphinx-doc.org/
	exit /b 1
)

%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end

:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%

:end
popd
|
@ -1,4 +1,4 @@
|
|||||||
# Marian Integration
|
# Building marian code for bergamot
|
||||||
|
|
||||||
This document summarizes the minimal build instructions developed for the
|
This document summarizes the minimal build instructions developed for the
|
||||||
marian-code powering bergamot-translator.
|
marian-code powering bergamot-translator.
|
||||||
@ -10,7 +10,7 @@ $ git clone https://github.com/browsermt/bergamot-translator
|
|||||||
$ cd bergamot-translator
|
$ cd bergamot-translator
|
||||||
$ mkdir build
|
$ mkdir build
|
||||||
$ cd build
|
$ cd build
|
||||||
$ cmake .. -DUSE_WASM_COMPATIBLE_SOURCES=off -DCMAKE_BUILD_TYPE=Release
|
$ cmake .. -DUSE_WASM_COMPATIBLE_SOURCE=off -DCMAKE_BUILD_TYPE=Release
|
||||||
$ make -j
|
$ make -j
|
||||||
```
|
```
|
||||||
|
|
||||||
|
0
doc/references.bib
Normal file
0
doc/references.bib
Normal file
6
doc/requirements.txt
Normal file
6
doc/requirements.txt
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
sphinx==2.4.4
|
||||||
|
breathe==4.13.0
|
||||||
|
exhale
|
||||||
|
sphinx_rtd_theme
|
||||||
|
recommonmark
|
||||||
|
m2r
|
@ -1,68 +0,0 @@
|
|||||||
/*
|
|
||||||
* AbstractTranslationModel.h
|
|
||||||
*
|
|
||||||
* An interface for a translation model for translating a plain (without any
|
|
||||||
* markups and emojis) UTF-8 encoded text. The model supports translation from 1
|
|
||||||
* source language to 1 target language. There can be different implementations
|
|
||||||
* of this interface.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#ifndef SRC_TRANSLATOR_ABSTRACTTRANSLATIONMODEL_H_
|
|
||||||
#define SRC_TRANSLATOR_ABSTRACTTRANSLATIONMODEL_H_
|
|
||||||
|
|
||||||
#include <future>
|
|
||||||
#include <memory>
|
|
||||||
#include <string>
|
|
||||||
#include <vector>
|
|
||||||
|
|
||||||
#include "TranslationRequest.h"
|
|
||||||
#include "TranslationResult.h"
|
|
||||||
|
|
||||||
/* An interface for a translation model for translating a plain (without any
|
|
||||||
* markups and emojis) UTF-8 encoded text. The model supports translation from 1
|
|
||||||
* source language to 1 target language.
|
|
||||||
*/
|
|
||||||
class AbstractTranslationModel {
|
|
||||||
public:
|
|
||||||
/* A Factory method to create and return an instance of an implementation of
|
|
||||||
* AbstractTranslationModel. The instance is created using translation model
|
|
||||||
* configuration provided as yaml-formatted string.
|
|
||||||
*/
|
|
||||||
static std::shared_ptr<AbstractTranslationModel>
|
|
||||||
createInstance(const std::string &config);
|
|
||||||
|
|
||||||
AbstractTranslationModel() = default;
|
|
||||||
|
|
||||||
virtual ~AbstractTranslationModel() = default;
|
|
||||||
|
|
||||||
/* This method performs translation on a list of (UTF-8 encoded) texts and
|
|
||||||
* returns a list of results in the same order. Each text entry can either be
|
|
||||||
* a word, a phrase, a sentence or a list of sentences and should contain
|
|
||||||
* plain text (without any markups or emojis). Additional information related
|
|
||||||
* to the translated text can be requested via TranslationRequest which is
|
|
||||||
* applied equally to each text entry.
|
|
||||||
*
|
|
||||||
* The translated text corresponding to each text entry and the additional
|
|
||||||
* information (as specified in the TranslationRequest) is encapsulated and
|
|
||||||
* returned in TranslationResult.
|
|
||||||
*
|
|
||||||
* The API splits each text entry into sentences internally, which are then
|
|
||||||
* translated independent of each other. The translated sentences are then
|
|
||||||
* joined together and returned in TranslationResult. Please refer to the
|
|
||||||
* TranslationRequest class to find out what additional information can be
|
|
||||||
* requested. The alignment information can only be requested if the model
|
|
||||||
* supports it (check isAlignmentSupported() API).
|
|
||||||
*
|
|
||||||
* The texts argument will become empty after the execution of this API (each
|
|
||||||
* entry of texts list will be moved to its corresponding TranslationResult
|
|
||||||
* object).
|
|
||||||
*/
|
|
||||||
virtual std::vector<TranslationResult>
|
|
||||||
translate(std::vector<std::string> &&texts, TranslationRequest request) = 0;
|
|
||||||
|
|
||||||
/* Check if the model can provide alignment information b/w original and
|
|
||||||
* translated text. */
|
|
||||||
virtual bool isAlignmentSupported() const = 0;
|
|
||||||
};
|
|
||||||
|
|
||||||
#endif /* SRC_TRANSLATOR_ABSTRACTTRANSLATIONMODEL_H_ */
|
|
@ -1 +1,7 @@
|
|||||||
add_subdirectory(translator)
|
add_subdirectory(translator)
|
||||||
|
|
||||||
|
if(COMPILE_TESTS)
|
||||||
|
# Catch currently comes from marian sources.
|
||||||
|
add_subdirectory(tests)
|
||||||
|
endif(COMPILE_TESTS)
|
||||||
|
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
/*
|
/*
|
||||||
* TranslationModel.h
|
* TranslationModel.h
|
||||||
*
|
*
|
||||||
* A implementation of AbstractTranslationModel interface.
|
* Main interface for translation API.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#ifndef SRC_TRANSLATOR_TRANSLATIONMODEL_H_
|
#ifndef SRC_TRANSLATOR_TRANSLATIONMODEL_H_
|
||||||
@ -15,19 +15,28 @@
|
|||||||
#include "3rd_party/marian-dev/src/common/options.h"
|
#include "3rd_party/marian-dev/src/common/options.h"
|
||||||
|
|
||||||
// All local project includes
|
// All local project includes
|
||||||
#include "AbstractTranslationModel.h"
|
#include "TranslationRequest.h"
|
||||||
#include "translator/service_base.h"
|
#include "TranslationResult.h"
|
||||||
|
#include "translator/definitions.h"
|
||||||
|
#include "translator/service.h"
|
||||||
|
|
||||||
/* A Translation model that translates a plain (without any markups and emojis)
|
/* A Translation model that translates a plain (without any markups and emojis)
|
||||||
* UTF-8 encoded text. This implementation supports translation from 1 source
|
* UTF-8 encoded text. This implementation supports translation from 1 source
|
||||||
* language to 1 target language.
|
* language to 1 target language.
|
||||||
*/
|
*/
|
||||||
class TranslationModel : public AbstractTranslationModel {
|
class TranslationModel {
|
||||||
public:
|
public:
|
||||||
/* Construct the model using the model configuration options as yaml-formatted
|
/* Construct the model using the model configuration options as yaml-formatted
|
||||||
* string
|
* string
|
||||||
*/
|
*/
|
||||||
TranslationModel(const std::string &config);
|
/**
|
||||||
|
* @param config Marian yml config file in the form of a string
|
||||||
|
* @param model_memory optional byte array (aligned to 64!!!) that contains
|
||||||
|
* the bytes of a model.bin.
|
||||||
|
*/
|
||||||
|
TranslationModel(const std::string &config,
|
||||||
|
marian::bergamot::AlignedMemory modelMemory = marian::bergamot::AlignedMemory(),
|
||||||
|
marian::bergamot::AlignedMemory shortlistMemory = marian::bergamot::AlignedMemory());
|
||||||
|
|
||||||
~TranslationModel();
|
~TranslationModel();
|
||||||
|
|
||||||
@ -56,16 +65,16 @@ public:
|
|||||||
* object).
|
* object).
|
||||||
*/
|
*/
|
||||||
std::vector<TranslationResult> translate(std::vector<std::string> &&texts,
|
std::vector<TranslationResult> translate(std::vector<std::string> &&texts,
|
||||||
TranslationRequest request) override;
|
TranslationRequest request);
|
||||||
|
|
||||||
/* Check if the model can provide alignment information b/w original and
|
/* Check if the model can provide alignment information b/w original and
|
||||||
* translated text. */
|
* translated text. */
|
||||||
bool isAlignmentSupported() const override;
|
bool isAlignmentSupported() const;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
// Model configuration options
|
// Model configuration options
|
||||||
std::shared_ptr<marian::Options> configOptions_; // ORDER DEPENDECNY
|
std::shared_ptr<marian::Options> configOptions_; // ORDER DEPENDECNY
|
||||||
marian::bergamot::NonThreadedService service_; // ORDER DEPENDENCY
|
marian::bergamot::Service service_; // ORDER DEPENDENCY
|
||||||
};
|
};
|
||||||
|
|
||||||
#endif /* SRC_TRANSLATOR_TRANSLATIONMODEL_H_ */
|
#endif /* SRC_TRANSLATOR_TRANSLATIONMODEL_H_ */
|
@ -2,7 +2,7 @@
|
|||||||
* TranslationRequest.h
|
* TranslationRequest.h
|
||||||
*
|
*
|
||||||
* This file defines the translation request class to be used in
|
* This file defines the translation request class to be used in
|
||||||
* AbstractTranslationModel::translate() API.
|
* TranslationModel::translate() API.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#ifndef SRC_TRANSLATOR_TRANSLATIONREQUEST_H_
|
#ifndef SRC_TRANSLATOR_TRANSLATIONREQUEST_H_
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
/*
|
/*
|
||||||
* TranslationResult.h
|
* TranslationResult.h
|
||||||
*
|
*
|
||||||
* The class that represents the result of AbstractTranslationModel::translate()
|
* The class that represents the result of TranslationModel::translate()
|
||||||
* API for each of its text entry and TranslationRequest.
|
* API for each of its text entry and TranslationRequest.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
@ -13,7 +13,7 @@
|
|||||||
|
|
||||||
#include "QualityScore.h"
|
#include "QualityScore.h"
|
||||||
|
|
||||||
/* This class represents the result of AbstractTranslationModel::translate() API
|
/* This class represents the result of TranslationModel::translate() API
|
||||||
* for each of its text entry and TranslationRequest.
|
* for each of its text entry and TranslationRequest.
|
||||||
*/
|
*/
|
||||||
class TranslationResult {
|
class TranslationResult {
|
||||||
|
22
src/tests/CMakeLists.txt
Normal file
22
src/tests/CMakeLists.txt
Normal file
@ -0,0 +1,22 @@
|
|||||||
|
# Unit tests
#
# Every name listed in UNIT_TESTS is built into an executable called
# run_<name> from run_tests.cpp plus <name>.cpp, and registered with CTest
# under <name>.
set(UNIT_TESTS
    annotation_tests
)

foreach(test ${UNIT_TESTS})
  add_executable("run_${test}" run_tests.cpp "${test}.cpp")
  target_include_directories("run_${test}" PRIVATE ${CATCH_INCLUDE_DIR} "${CMAKE_SOURCE_DIR}/src")

  if(CUDA_FOUND)
    target_link_libraries("run_${test}" ${EXT_LIBS} marian ${EXT_LIBS} marian_cuda ${EXT_LIBS} Catch bergamot-translator)
  else(CUDA_FOUND)
    target_link_libraries("run_${test}" marian ${EXT_LIBS} Catch bergamot-translator)
  endif(CUDA_FOUND)

  # CMake variable names are case-sensitive: the compiler variable is MSVC,
  # and the scope keyword of target_compile_options must be upper-case.
  if(MSVC)
    # Disable C4305: truncation from 'double' to '_Ty'
    target_compile_options("run_${test}" PUBLIC /wd4305)
  endif(MSVC)

  add_test(NAME ${test} COMMAND "run_${test}")
endforeach(test)
|
220
src/tests/annotation_tests.cpp
Normal file
220
src/tests/annotation_tests.cpp
Normal file
@ -0,0 +1,220 @@
|
|||||||
|
#include "catch.hpp"
|
||||||
|
#include "translator/sentence_ranges.h"
|
||||||
|
#include <random>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
using namespace marian::bergamot;
|
||||||
|
|
||||||
|
TEST_CASE("Test Annotation API with random sentences") {
|
||||||
|
/// Objective here is to test insertion for sentences, and that whatever comes
|
||||||
|
/// out adheres to the way it was inserted. Towards this, we keep externally
|
||||||
|
/// which sentence went in where and try to use accessor methods on
|
||||||
|
/// AnnotatedText to check if what we have as ground-truth by construction is
|
||||||
|
/// consistent with what is returned.
|
||||||
|
size_t sentences = 500;
|
||||||
|
size_t maxWords = 40;
|
||||||
|
|
||||||
|
// Set in case needed to see output. The output is in lines of #sentences +
|
||||||
|
// header, which can be split and compared for easy understanding. The ideal
|
||||||
|
// way to inspect what is going wrong is to redirect output and use to split
|
||||||
|
// the different stages by sentences + 1 lines and check the diff.
|
||||||
|
bool debug{false};
|
||||||
|
|
||||||
|
std::mt19937 randomIntGen_;
|
||||||
|
randomIntGen_.seed(42);
|
||||||
|
|
||||||
|
AnnotatedText testAnnotation; // This the container we add through API and
|
||||||
|
// check if the access is correct.
|
||||||
|
|
||||||
|
// External book-keeping so we have ground truths. Each element represents a
|
||||||
|
// sentence.
|
||||||
|
|
||||||
|
// word byte ranges - for testAnnotation.word(sId, wId)
|
||||||
|
std::vector<std::vector<ByteRange>> groundTruthWords;
|
||||||
|
// sentence byte ranges - for testAnnotation.sentence(sId, wId)
|
||||||
|
std::vector<ByteRange> groundTruthSentences;
|
||||||
|
|
||||||
|
// Prepare the text and construct ByteRanges as intended for sentences and
|
||||||
|
// words. The ByteRanges we construct here are expected to be the
|
||||||
|
// ground-truths for words and sentences. The string being constructed is like
|
||||||
|
// as follows:
|
||||||
|
//
|
||||||
|
// 0-0 0-1 0-2 0-3
|
||||||
|
// 1-0 1-1 1-2 1-3 1-4
|
||||||
|
// 2-0 2-1
|
||||||
|
//
|
||||||
|
// 4-0 4-1 4-2 4-3
|
||||||
|
//
|
||||||
|
// Words are separated by space units.
|
||||||
|
//
|
||||||
|
// Below, we accumulate the text with intended structure as above, and
|
||||||
|
// ground-truth tables populated to be aware of the ByteRanges where they are
|
||||||
|
// meant to be.
|
||||||
|
if (debug) {
|
||||||
|
std::cout << "Preparing text and ground truth-tables" << std::endl;
|
||||||
|
}
|
||||||
|
for (size_t idx = 0; idx < sentences; idx++) {
|
||||||
|
if (idx != 0)
|
||||||
|
testAnnotation.text += "\n";
|
||||||
|
|
||||||
|
// Words can be zero, we need to support empty word sentences as well.
|
||||||
|
size_t numWords = randomIntGen_() % maxWords;
|
||||||
|
|
||||||
|
std::vector<ByteRange> wordByteRanges;
|
||||||
|
wordByteRanges.reserve(numWords);
|
||||||
|
|
||||||
|
// For empty sentence, we expect it to be empty and marked in position where
|
||||||
|
// the existing string is if needed to be pointed out.
|
||||||
|
size_t before = testAnnotation.text.size() - 1;
|
||||||
|
size_t sentenceBegin{before}, sentenceEnd{before};
|
||||||
|
|
||||||
|
for (size_t idw = 0; idw < numWords; idw++) {
|
||||||
|
if (idw != 0) {
|
||||||
|
testAnnotation.text += " ";
|
||||||
|
if (debug) {
|
||||||
|
std::cout << " ";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get new beginning, accounting for space above.
|
||||||
|
before = testAnnotation.text.size();
|
||||||
|
|
||||||
|
// Add the word
|
||||||
|
std::string word = std::to_string(idx) + "-" + std::to_string(idw);
|
||||||
|
testAnnotation.text += word;
|
||||||
|
|
||||||
|
// Do math, before, before + new-word's size.
|
||||||
|
wordByteRanges.push_back((ByteRange){before, before + word.size()});
|
||||||
|
|
||||||
|
if (debug) {
|
||||||
|
std::cout << word;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (idw == 0) {
|
||||||
|
sentenceBegin = before;
|
||||||
|
}
|
||||||
|
if (idw == numWords - 1) {
|
||||||
|
sentenceEnd = before + word.size();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (debug) {
|
||||||
|
std::cout << std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
groundTruthWords.push_back(wordByteRanges);
|
||||||
|
groundTruthSentences.push_back((ByteRange){sentenceBegin, sentenceEnd});
|
||||||
|
}
|
||||||
|
|
||||||
|
// We prepare string_views now with the known ByteRanges and use the
|
||||||
|
// string_view based AnnotatedText.addSentence(...) API to add sentences to
|
||||||
|
// transparently convert from string_views to ByteRanges, rebasing/working out
|
||||||
|
// the math underneath.
|
||||||
|
|
||||||
|
if (debug) {
|
||||||
|
std::cout << "Inserting words onto container and save ground-truth-table:"
|
||||||
|
<< std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<std::vector<marian::string_view>> wordStringViews;
|
||||||
|
for (auto &sentence : groundTruthWords) {
|
||||||
|
std::vector<marian::string_view> wordByteRanges;
|
||||||
|
bool first{true};
|
||||||
|
for (auto &word : sentence) {
|
||||||
|
marian::string_view wordView(&testAnnotation.text[word.begin],
|
||||||
|
word.size());
|
||||||
|
wordByteRanges.push_back(wordView);
|
||||||
|
if (debug) {
|
||||||
|
if (first) {
|
||||||
|
first = false;
|
||||||
|
} else {
|
||||||
|
std::cout << " ";
|
||||||
|
}
|
||||||
|
std::cout << std::string(wordView);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
testAnnotation.addSentence(wordByteRanges);
|
||||||
|
wordStringViews.push_back(wordByteRanges);
|
||||||
|
if (debug) {
|
||||||
|
std::cout << std::endl;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (debug) {
|
||||||
|
std::cout
|
||||||
|
<< "Inserting sentences onto container and save ground-truth-table"
|
||||||
|
<< std::endl;
|
||||||
|
}
|
||||||
|
std::vector<marian::string_view> sentenceStringViews;
|
||||||
|
for (auto &sentenceByteRange : groundTruthSentences) {
|
||||||
|
char *data = &(testAnnotation.text[sentenceByteRange.begin]);
|
||||||
|
marian::string_view sentenceView(data, sentenceByteRange.size());
|
||||||
|
sentenceStringViews.push_back(sentenceView);
|
||||||
|
|
||||||
|
if (debug) {
|
||||||
|
std::cout << sentenceView << std::endl;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Access from the sentence(sentenceIdx) API and confirm that the ground truth
|
||||||
|
// we expect is same as what comes out of the container.
|
||||||
|
if (debug) {
|
||||||
|
std::cout << "From container: Sentences" << std::endl;
|
||||||
|
}
|
||||||
|
for (int idx = 0; idx < groundTruthSentences.size(); idx++) {
|
||||||
|
ByteRange expected = groundTruthSentences[idx];
|
||||||
|
ByteRange obtained = testAnnotation.sentenceAsByteRange(idx);
|
||||||
|
if (debug) {
|
||||||
|
std::cout << std::string(testAnnotation.sentence(idx)) << std::endl;
|
||||||
|
}
|
||||||
|
CHECK(expected.begin == obtained.begin);
|
||||||
|
CHECK(expected.end == obtained.end);
|
||||||
|
std::string expected_string = std::string(sentenceStringViews[idx]);
|
||||||
|
std::string obtained_string = std::string(testAnnotation.sentence(idx));
|
||||||
|
CHECK(expected_string == obtained_string);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Access the word(sentenceIdx, wordIdx) API and confirm what we hold as
|
||||||
|
/// expected words are the same as those obtained from the container.
|
||||||
|
if (debug) {
|
||||||
|
std::cout << "From container: Words" << std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
CHECK(groundTruthWords.size() == testAnnotation.numSentences());
|
||||||
|
for (int idx = 0; idx < groundTruthWords.size(); idx++) {
|
||||||
|
CHECK(groundTruthWords[idx].size() == testAnnotation.numWords(idx));
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int idx = 0; idx < groundTruthWords.size(); idx++) {
|
||||||
|
for (int idw = 0; idw < groundTruthWords[idx].size(); idw++) {
|
||||||
|
ByteRange expected = groundTruthWords[idx][idw];
|
||||||
|
ByteRange obtained = testAnnotation.wordAsByteRange(idx, idw);
|
||||||
|
if (debug) {
|
||||||
|
std::cout << std::string(testAnnotation.word(idx, idw)) << " ";
|
||||||
|
}
|
||||||
|
CHECK(expected.begin == obtained.begin);
|
||||||
|
CHECK(expected.end == obtained.end);
|
||||||
|
|
||||||
|
std::string expected_string = std::string(wordStringViews[idx][idw]);
|
||||||
|
std::string obtained_string = std::string(testAnnotation.word(idx, idw));
|
||||||
|
CHECK(expected_string == obtained_string);
|
||||||
|
}
|
||||||
|
if (debug) {
|
||||||
|
std::cout << std::endl;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Try inserting an empty Sentence. This is ensuring we check for empty
|
||||||
|
// Sentence if the random test above does not cover it for some reason.
|
||||||
|
int emptySentenceIdx = sentences;
|
||||||
|
std::vector<marian::string_view> emptySentence;
|
||||||
|
testAnnotation.addSentence(emptySentence);
|
||||||
|
|
||||||
|
// There are no words.
|
||||||
|
CHECK(testAnnotation.numWords(emptySentenceIdx) == 0);
|
||||||
|
|
||||||
|
// Empty sentence expected at output.
|
||||||
|
std::string expectedEmptyString = "";
|
||||||
|
marian::string_view emptyView = testAnnotation.sentence(emptySentenceIdx);
|
||||||
|
std::string obtainedString = std::string(emptyView.data(), emptyView.size());
|
||||||
|
CHECK(expectedEmptyString == obtainedString);
|
||||||
|
}
|
2
src/tests/run_tests.cpp
Normal file
2
src/tests/run_tests.cpp
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
#define CATCH_CONFIG_MAIN
|
||||||
|
#include "catch.hpp"
|
@ -1,14 +0,0 @@
|
|||||||
/*
|
|
||||||
* AbstractTranslationModel.cpp
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
#include <memory>
|
|
||||||
|
|
||||||
// All local includes
|
|
||||||
#include "AbstractTranslationModel.h"
|
|
||||||
#include "TranslationModel.h"
|
|
||||||
|
|
||||||
std::shared_ptr<AbstractTranslationModel>
|
|
||||||
AbstractTranslationModel::createInstance(const std::string &config) {
|
|
||||||
return std::make_shared<TranslationModel>(config);
|
|
||||||
}
|
|
@ -1,32 +1,26 @@
|
|||||||
if (NOT USE_WASM_COMPATIBLE_SOURCES)
|
|
||||||
set(MULTITHREADED_SERVICE_SOURCE "service.cpp")
|
|
||||||
endif()
|
|
||||||
|
|
||||||
add_library(bergamot-translator STATIC
|
add_library(bergamot-translator STATIC
|
||||||
AbstractTranslationModel.cpp
|
|
||||||
TranslationModel.cpp
|
TranslationModel.cpp
|
||||||
|
byte_array_util.cpp
|
||||||
# Following files added from browsermt/mts@nuke
|
|
||||||
text_processor.cpp
|
text_processor.cpp
|
||||||
sentence_splitter.cpp
|
sentence_splitter.cpp
|
||||||
batch_translator.cpp
|
batch_translator.cpp
|
||||||
multifactor_priority.cpp
|
multifactor_priority.cpp
|
||||||
request.cpp
|
request.cpp
|
||||||
service_base.cpp
|
|
||||||
${MULTITHREADED_SERVICE_SOURCE}
|
|
||||||
batcher.cpp
|
batcher.cpp
|
||||||
response.cpp
|
response.cpp
|
||||||
batch.cpp
|
batch.cpp
|
||||||
sentence_ranges.cpp
|
sentence_ranges.cpp
|
||||||
|
service.cpp
|
||||||
)
|
)
|
||||||
if (COMPILE_DECODER_ONLY)
|
if (USE_WASM_COMPATIBLE_SOURCE)
|
||||||
# A dirty hack because of marian's bad cmake practices
|
# Using wasm compatible sources should include this compile definition;
|
||||||
target_compile_definitions(bergamot-translator PUBLIC DECODER_ONLY)
|
# Has to be done here because we are including marian headers + some sources
|
||||||
|
# in local repository use these definitions
|
||||||
|
target_compile_definitions(bergamot-translator PUBLIC USE_SSE2 WASM_COMPATIBLE_SOURCE)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if(COMPILE_WASM)
|
if(COMPILE_WASM)
|
||||||
# A dirty hack because of marian's bad cmake practices
|
target_compile_definitions(bergamot-translator PUBLIC WASM)
|
||||||
target_compile_definitions(bergamot-translator PUBLIC USE_SSE2 WASM)
|
|
||||||
# Enable code that is required for generating JS bindings
|
# Enable code that is required for generating JS bindings
|
||||||
target_compile_definitions(bergamot-translator PRIVATE WASM_BINDINGS)
|
target_compile_definitions(bergamot-translator PRIVATE WASM_BINDINGS)
|
||||||
target_compile_options(bergamot-translator PRIVATE ${WASM_COMPILE_FLAGS})
|
target_compile_options(bergamot-translator PRIVATE ${WASM_COMPILE_FLAGS})
|
||||||
@ -35,7 +29,5 @@ endif(COMPILE_WASM)
|
|||||||
target_link_libraries(bergamot-translator marian ssplit)
|
target_link_libraries(bergamot-translator marian ssplit)
|
||||||
|
|
||||||
target_include_directories(bergamot-translator
|
target_include_directories(bergamot-translator
|
||||||
PRIVATE ${CMAKE_SOURCE_DIR}
|
PUBLIC ${CMAKE_SOURCE_DIR}
|
||||||
PUBLIC ${CMAKE_SOURCE_DIR}/src)
|
PUBLIC ${CMAKE_SOURCE_DIR}/src)
|
||||||
|
|
||||||
|
|
||||||
|
@ -6,53 +6,15 @@
|
|||||||
#include <future>
|
#include <future>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
// All 3rd party includes
|
|
||||||
#include "3rd_party/marian-dev/src/3rd_party/yaml-cpp/yaml.h"
|
|
||||||
#include "3rd_party/marian-dev/src/common/config_parser.h"
|
|
||||||
#include "common/config_validator.h"
|
|
||||||
#include "common/options.h"
|
|
||||||
|
|
||||||
// All local project includes
|
// All local project includes
|
||||||
#include "TranslationModel.h"
|
#include "TranslationModel.h"
|
||||||
#include "translator/parser.h"
|
#include "translator/parser.h"
|
||||||
#include "translator/service_base.h"
|
#include "translator/service.h"
|
||||||
|
|
||||||
std::shared_ptr<marian::Options> parseOptions(const std::string &config) {
|
TranslationModel::TranslationModel(const std::string &config,
|
||||||
marian::Options options;
|
marian::bergamot::AlignedMemory model_memory,
|
||||||
|
marian::bergamot::AlignedMemory lexical_memory)
|
||||||
// @TODO(jerinphilip) There's something off here, @XapaJIaMnu suggests
|
: service_(config, std::move(model_memory), std::move(lexical_memory)) {}
|
||||||
// that should not be using the defaultConfig. This function only has access
|
|
||||||
// to std::string config and needs to be able to construct Options from the
|
|
||||||
// same.
|
|
||||||
|
|
||||||
// Absent the following code-segment, there is a parsing exception thrown on
|
|
||||||
// rebuilding YAML.
|
|
||||||
//
|
|
||||||
// Error: Unhandled exception of type 'N4YAML11InvalidNodeE': invalid node;
|
|
||||||
// this may result from using a map iterator as a sequence iterator, or
|
|
||||||
// vice-versa
|
|
||||||
//
|
|
||||||
// Error: Aborted from void unhandledException() in
|
|
||||||
// 3rd_party/marian-dev/src/common/logging.cpp:113
|
|
||||||
|
|
||||||
marian::ConfigParser configParser = marian::bergamot::createConfigParser();
|
|
||||||
const YAML::Node &defaultConfig = configParser.getConfig();
|
|
||||||
|
|
||||||
options.merge(defaultConfig);
|
|
||||||
|
|
||||||
// Parse configs onto defaultConfig.
|
|
||||||
options.parse(config);
|
|
||||||
YAML::Node configCopy = options.cloneToYamlNode();
|
|
||||||
|
|
||||||
marian::ConfigValidator validator(configCopy);
|
|
||||||
validator.validateOptions(marian::cli::mode::translation);
|
|
||||||
|
|
||||||
return std::make_shared<marian::Options>(options);
|
|
||||||
}
|
|
||||||
|
|
||||||
TranslationModel::TranslationModel(const std::string &config)
|
|
||||||
: configOptions_(std::move(parseOptions(config))),
|
|
||||||
AbstractTranslationModel(), service_(configOptions_) {}
|
|
||||||
|
|
||||||
TranslationModel::~TranslationModel() {}
|
TranslationModel::~TranslationModel() {}
|
||||||
|
|
||||||
@ -71,23 +33,18 @@ TranslationModel::translate(std::vector<std::string> &&texts,
|
|||||||
intermediate.wait();
|
intermediate.wait();
|
||||||
auto marianResponse(std::move(intermediate.get()));
|
auto marianResponse(std::move(intermediate.get()));
|
||||||
|
|
||||||
// This mess because marian::string_view != std::string_view
|
|
||||||
std::string source, translation;
|
|
||||||
marian::bergamot::Response::SentenceMappings mSentenceMappings;
|
|
||||||
marianResponse.move(source, translation, mSentenceMappings);
|
|
||||||
|
|
||||||
// Convert to UnifiedAPI::TranslationResult
|
|
||||||
TranslationResult::SentenceMappings sentenceMappings;
|
TranslationResult::SentenceMappings sentenceMappings;
|
||||||
for (auto &p : mSentenceMappings) {
|
for (size_t idx = 0; idx < marianResponse.size(); idx++) {
|
||||||
std::string_view src(p.first.data(), p.first.size()),
|
marian::string_view src = marianResponse.source.sentence(idx);
|
||||||
tgt(p.second.data(), p.second.size());
|
marian::string_view tgt = marianResponse.target.sentence(idx);
|
||||||
sentenceMappings.emplace_back(src, tgt);
|
sentenceMappings.emplace_back(std::string_view(src.data(), src.size()),
|
||||||
|
std::string_view(tgt.data(), tgt.size()));
|
||||||
}
|
}
|
||||||
|
|
||||||
// In place construction.
|
// In place construction.
|
||||||
translationResults.emplace_back(
|
translationResults.emplace_back(
|
||||||
std::move(source), // &&marianResponse.source_
|
std::move(marianResponse.source.text), // &&marianResponse.source_
|
||||||
std::move(translation), // &&marianResponse.translation_
|
std::move(marianResponse.target.text), // &&marianResponse.translation_
|
||||||
std::move(sentenceMappings) // &&sentenceMappings
|
std::move(sentenceMappings) // &&sentenceMappings
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
71
src/translator/aligned.h
Normal file
71
src/translator/aligned.h
Normal file
@ -0,0 +1,71 @@
|
|||||||
|
#pragma once
|
||||||
|
#include <cstdlib>
|
||||||
|
#include <new>
|
||||||
|
#ifdef _MSC_VER
|
||||||
|
#include <malloc.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Aligned simple vector.
|
||||||
|
|
||||||
|
namespace marian {
|
||||||
|
namespace bergamot {
|
||||||
|
|
||||||
|
/// Move-only owner of a raw buffer holding `size` objects of type T whose
/// first byte lies on an `alignment`-byte boundary.
///
/// The storage comes straight from the platform's aligned allocator
/// (`_aligned_malloc` on MSVC, `posix_memalign` elsewhere); elements are
/// neither constructed nor destroyed by this class.
template <class T> class AlignedVector {
public:
  /// Creates an empty vector that owns no memory.
  AlignedVector() : mem_(nullptr), size_(0) {}

  /// Allocates uninitialised storage for `size` elements.
  /// @param size      number of T elements to reserve room for
  /// @param alignment required alignment of the buffer, in bytes
  /// @throws std::bad_alloc if the allocator reports failure
  explicit AlignedVector(std::size_t size, std::size_t alignment = 64 /* CPU cares about this */)
      : mem_(nullptr), size_(size) {
#ifdef _MSC_VER
    mem_ = static_cast<T*>(_aligned_malloc(size * sizeof(T), alignment));
    if (!mem_) throw std::bad_alloc();
#else
    if (posix_memalign(reinterpret_cast<void **>(&mem_), alignment, size * sizeof(T))) {
      throw std::bad_alloc();
    }
#endif
  }

  /// Takes over the buffer of `from`, leaving `from` empty.
  AlignedVector(AlignedVector &&from) noexcept : mem_(from.mem_), size_(from.size_) {
    from.mem_ = nullptr;
    from.size_ = 0;
  }

  /// Releases the buffer currently owned (if any) and takes over the buffer
  /// of `from`, leaving `from` empty. Assigning an object to itself is a no-op.
  AlignedVector &operator=(AlignedVector &&from) noexcept {
    if (this != &from) {
      // Free our own allocation first; otherwise it would be leaked once
      // mem_ is overwritten below.
      deallocate(mem_);
      mem_ = from.mem_;
      size_ = from.size_;
      from.mem_ = nullptr;
      from.size_ = 0;
    }
    return *this;
  }

  // Copying is disabled: there must be exactly one owner of the buffer.
  AlignedVector(const AlignedVector&) = delete;
  AlignedVector& operator=(const AlignedVector&) = delete;

  ~AlignedVector() { deallocate(mem_); }

  /// Number of elements the buffer was allocated for.
  std::size_t size() const { return size_; }

  /// Unchecked element access.
  T &operator[](std::size_t offset) { return mem_[offset]; }
  const T &operator[](std::size_t offset) const { return mem_[offset]; }

  T *begin() { return mem_; }
  const T *begin() const { return mem_; }
  T *end() { return mem_ + size_; }
  const T *end() const { return mem_ + size_; }

  /// Reinterprets the start of the buffer as a pointer to ReturnType.
  template <typename ReturnType>
  ReturnType *as() { return reinterpret_cast<ReturnType*>(mem_); }

private:
  /// Returns `mem` to the allocator it was obtained from; null is accepted.
  static void deallocate(T *mem) {
#ifdef _MSC_VER
    _aligned_free(mem);
#else
    std::free(mem);
#endif
  }

  T *mem_;
  std::size_t size_;
};
|
||||||
|
} // namespace bergamot
|
||||||
|
} // namespace marian
|
@ -10,26 +10,46 @@ namespace bergamot {
|
|||||||
|
|
||||||
BatchTranslator::BatchTranslator(DeviceId const device,
|
BatchTranslator::BatchTranslator(DeviceId const device,
|
||||||
std::vector<Ptr<Vocab const>> &vocabs,
|
std::vector<Ptr<Vocab const>> &vocabs,
|
||||||
Ptr<Options> options)
|
Ptr<Options> options,
|
||||||
: device_(device), options_(options), vocabs_(&vocabs) {}
|
const AlignedMemory* modelMemory,
|
||||||
|
const AlignedMemory* shortlistMemory)
|
||||||
|
: device_(device), options_(options), vocabs_(&vocabs),
|
||||||
|
modelMemory_(modelMemory), shortlistMemory_(shortlistMemory) {}
|
||||||
|
|
||||||
void BatchTranslator::initialize() {
|
void BatchTranslator::initialize() {
|
||||||
// Initializes the graph.
|
// Initializes the graph.
|
||||||
if (options_->hasAndNotEmpty("shortlist")) {
|
if (options_->hasAndNotEmpty("shortlist")) {
|
||||||
int srcIdx = 0, trgIdx = 1;
|
int srcIdx = 0, trgIdx = 1;
|
||||||
bool shared_vcb = vocabs_->front() == vocabs_->back();
|
bool shared_vcb = vocabs_->front() == vocabs_->back();
|
||||||
slgen_ = New<data::LexicalShortlistGenerator>(options_, vocabs_->front(),
|
if (shortlistMemory_->size() > 0 && shortlistMemory_->begin() != nullptr) {
|
||||||
|
bool check = options_->get<bool>("check-bytearray",true);
|
||||||
|
slgen_ = New<data::BinaryShortlistGenerator>(shortlistMemory_->begin(), shortlistMemory_->size(),
|
||||||
|
vocabs_->front(), vocabs_->back(),
|
||||||
|
srcIdx, trgIdx, shared_vcb, check);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
// Changed to BinaryShortlistGenerator to enable loading binary shortlist file
|
||||||
|
// This class also supports text shortlist file
|
||||||
|
slgen_ = New<data::BinaryShortlistGenerator>(options_, vocabs_->front(),
|
||||||
vocabs_->back(), srcIdx,
|
vocabs_->back(), srcIdx,
|
||||||
trgIdx, shared_vcb);
|
trgIdx, shared_vcb);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
graph_ = New<ExpressionGraph>(true); // always optimize
|
graph_ = New<ExpressionGraph>(true); // set the graph to be inference only
|
||||||
auto prec = options_->get<std::vector<std::string>>("precision", {"float32"});
|
auto prec = options_->get<std::vector<std::string>>("precision", {"float32"});
|
||||||
graph_->setDefaultElementType(typeFromString(prec[0]));
|
graph_->setDefaultElementType(typeFromString(prec[0]));
|
||||||
graph_->setDevice(device_);
|
graph_->setDevice(device_);
|
||||||
graph_->getBackend()->configureDevice(options_);
|
graph_->getBackend()->configureDevice(options_);
|
||||||
graph_->reserveWorkspaceMB(options_->get<size_t>("workspace"));
|
graph_->reserveWorkspaceMB(options_->get<size_t>("workspace"));
|
||||||
|
if (modelMemory_->size() > 0 && modelMemory_->begin() != nullptr) { // If we have provided a byte array that contains the model memory, we can initialise the model from there, as opposed to from reading in the config file
|
||||||
|
ABORT_IF((uintptr_t)modelMemory_->begin() % 256 != 0,
|
||||||
|
"The provided memory is not aligned to 256 bytes and will crash when vector instructions are used on it.");
|
||||||
|
const std::vector<const void *> container = {modelMemory_->begin()}; // Marian supports multiple models initialised in this manner hence std::vector. However we will only ever use 1 during decoding.
|
||||||
|
scorers_ = createScorers(options_, container);
|
||||||
|
} else {
|
||||||
scorers_ = createScorers(options_);
|
scorers_ = createScorers(options_);
|
||||||
|
}
|
||||||
for (auto scorer : scorers_) {
|
for (auto scorer : scorers_) {
|
||||||
scorer->init(graph_);
|
scorer->init(graph_);
|
||||||
if (slgen_) {
|
if (slgen_) {
|
||||||
|
@ -12,7 +12,7 @@
|
|||||||
#include "translator/history.h"
|
#include "translator/history.h"
|
||||||
#include "translator/scorers.h"
|
#include "translator/scorers.h"
|
||||||
|
|
||||||
#ifdef WITH_PTHREADS
|
#ifndef WASM_COMPATIBLE_SOURCE
|
||||||
#include "pcqueue.h"
|
#include "pcqueue.h"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@ -26,8 +26,16 @@ class BatchTranslator {
|
|||||||
// shut down in Service which calls join() on the threads.
|
// shut down in Service which calls join() on the threads.
|
||||||
|
|
||||||
public:
|
public:
|
||||||
BatchTranslator(DeviceId const device, std::vector<Ptr<Vocab const>> &vocabs,
|
/**
|
||||||
Ptr<Options> options);
|
* Initialise the marian translator.
|
||||||
|
* @param device DeviceId that performs translation. Could be CPU or GPU
|
||||||
|
* @param vocabs Vector that contains ptrs to two vocabs
|
||||||
|
* @param options Marian options object
|
||||||
|
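* @param modelMemory byte array (aligned to 256!!!) that contains the bytes of a model.bin. The pointer is dereferenced during initialize(), so it must not be null; pass an empty AlignedMemory (size 0) when no in-memory model is used.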
|
||||||
|
* @param shortlistMemory byte array of shortlist (aligned to 64)
|
||||||
|
*/
|
||||||
|
explicit BatchTranslator(DeviceId const device, std::vector<Ptr<Vocab const>> &vocabs,
|
||||||
|
Ptr<Options> options, const AlignedMemory* modelMemory, const AlignedMemory* shortlistMemory);
|
||||||
|
|
||||||
// convenience function for logging. TODO(jerin)
|
// convenience function for logging. TODO(jerin)
|
||||||
std::string _identifier() { return "worker" + std::to_string(device_.no); }
|
std::string _identifier() { return "worker" + std::to_string(device_.no); }
|
||||||
@ -41,6 +49,8 @@ private:
|
|||||||
Ptr<ExpressionGraph> graph_;
|
Ptr<ExpressionGraph> graph_;
|
||||||
std::vector<Ptr<Scorer>> scorers_;
|
std::vector<Ptr<Scorer>> scorers_;
|
||||||
Ptr<data::ShortlistGenerator const> slgen_;
|
Ptr<data::ShortlistGenerator const> slgen_;
|
||||||
|
const AlignedMemory* modelMemory_{nullptr};
|
||||||
|
const AlignedMemory* shortlistMemory_{nullptr};
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace bergamot
|
} // namespace bergamot
|
||||||
|
@ -7,7 +7,7 @@
|
|||||||
#include "definitions.h"
|
#include "definitions.h"
|
||||||
#include "request.h"
|
#include "request.h"
|
||||||
|
|
||||||
#ifdef WITH_PTHREADS
|
#ifndef WASM_COMPATIBLE_SOURCE
|
||||||
#include "pcqueue.h"
|
#include "pcqueue.h"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
108
src/translator/byte_array_util.cpp
Normal file
108
src/translator/byte_array_util.cpp
Normal file
@ -0,0 +1,108 @@
|
|||||||
|
#include "byte_array_util.h"
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <iostream>
|
||||||
|
|
||||||
|
namespace marian {
|
||||||
|
namespace bergamot {
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
|
||||||
|
// This is a basic validator that checks if the file has not been truncated
|
||||||
|
// it basically loads up the header and checks
|
||||||
|
|
||||||
|
// This struct and the getter are copied from the marian source, because it's located
|
||||||
|
// inside src/common/binary.cpp:15 and we can't include it.
|
||||||
|
// Fixed-size record describing one item stored in a binary model file.
// The lengths below are what validateBinaryModel() uses to work out how
// large a complete file has to be.
struct Header {
  uint64_t nameLength;   // item name length, counted in chars
  uint64_t type;         // type field; not read by validateBinaryModel()
  uint64_t shapeLength;  // item shape length, counted in ints
  uint64_t dataLength;   // item payload length, counted in bytes
};
|
||||||
|
|
||||||
|
// cast current void pointer to T pointer and move forward by num elements
|
||||||
|
// Interprets `current` as pointing at an array of T, returns a pointer to its
// first element and advances `current` past `num` elements.
template <typename T>
const T* get(const void*& current, uint64_t num = 1) {
  const T* const head = (const T*)current;
  current = head + num;
  return head;
}
|
||||||
|
|
||||||
|
// Checks that a binary model buffer is large enough to hold everything its
// own headers announce, i.e. that the file has not been truncated.
//
// @param model    buffer holding the raw bytes of the model file
// @param fileSize number of bytes of `model` that may be inspected
// @return true if every section described by the headers fits into
//         `fileSize` bytes, false otherwise
//
// Only sizes are verified; the tensor data itself is never read.
bool validateBinaryModel(AlignedMemory& model, uint64_t fileSize) {
  const void * current = &model[0];
  // Bytes of the file that have not yet been claimed by a section.
  uint64_t remaining = fileSize;

  // Claims count * elemSize bytes from `remaining`. The comparison is done by
  // division so that absurd lengths in a corrupted header cannot wrap around
  // and make the check pass by accident.
  auto consume = [&remaining](uint64_t count, uint64_t elemSize) {
    if (count > remaining / elemSize) {
      return false;
    }
    remaining -= count * elemSize;
    return true;
  };

  // File version followed by the number of item headers.
  if (!consume(2, sizeof(uint64_t))) {
    return false;
  }
  get<uint64_t>(current);  // binary file version; skipped, not validated
  const uint64_t numHeaders = *get<uint64_t>(current);  // number of item headers that follow

  if (!consume(numHeaders, sizeof(Header))) {
    return false;
  }
  const Header* headers = get<Header>(current, numHeaders);  // read that many headers

  // Account for the bytes taken up by the names and the shapes, advancing the
  // cursor only once the section is known to fit.
  for (uint64_t i = 0; i < numHeaders; i++) {
    if (!consume(headers[i].nameLength, sizeof(char)) ||
        !consume(headers[i].shapeLength, sizeof(int))) {
      return false;
    }
    get<char>(current, headers[i].nameLength);
    get<int>(current, headers[i].shapeLength);
  }

  // Before the data starts there is a small padding to ensure alignment. Its
  // length is stored in a uint64_t, which itself must lie inside the file
  // before it may be dereferenced.
  if (!consume(1, sizeof(uint64_t))) {
    return false;
  }
  const uint64_t aligned_offset = *get<uint64_t>(current);  // Offset to align memory to 256 size
  if (!consume(aligned_offset, 1)) {
    return false;
  }

  // Finally the tensor size:
  for (uint64_t i = 0; i < numHeaders; i++) {
    if (!consume(headers[i].dataLength, 1)) {
      return false;
    }
  }

  // Every section fits: the file is at least big enough to contain the model.
  return true;
}
|
||||||
|
|
||||||
|
} // Anonymous namespace
|
||||||
|
|
||||||
|
// Reads the file at `path` into a newly allocated buffer whose start address
// is aligned to `alignment` bytes.
//
// @param path      file to load
// @param alignment alignment of the returned buffer, in bytes
// @return buffer constructed with the file's size and filled from the stream
//
// Aborts (ABORT_IF) if the stream cannot be opened or the read falls short.
AlignedMemory loadFileToMemory(const std::string& path, size_t alignment){
  uint64_t fileSize = filesystem::fileSize(path);
  io::InputFileStream in(path);
  ABORT_IF(in.bad(), "Failed opening file stream: {}", path);
  AlignedMemory alignedMemory(fileSize, alignment);
  in.read(reinterpret_cast<char *>(alignedMemory.begin()), fileSize);
  // Comparing alignedMemory.size() with fileSize can never fail because the
  // buffer was constructed with that very size. Ask the stream instead:
  // read() raises failbit when fewer than fileSize bytes could be extracted.
  // NOTE(review): relies on io::InputFileStream exposing std::istream::fail(),
  // as it already does for bad() and read() - confirm.
  ABORT_IF(in.fail(), "Error reading file {}", path);
  return alignedMemory;
}
|
||||||
|
|
||||||
|
// Loads the single binary model named by the "models" option into memory
// aligned to 256 bytes and runs the truncation check on it.
// Aborts unless exactly one model is configured, its path ends in ".bin",
// and validateBinaryModel() accepts the loaded bytes.
AlignedMemory getModelMemoryFromConfig(marian::Ptr<marian::Options> options){
  auto modelFiles = options->get<std::vector<std::string>>("models");
  const bool exactlyOneModel = (modelFiles.size() == 1);
  ABORT_IF(!exactlyOneModel, "Loading multiple binary models is not supported for now as it is not necessary.");

  marian::filesystem::Path binaryPath(modelFiles[0]);
  const marian::filesystem::Path expectedExtension(".bin");
  ABORT_IF(binaryPath.extension() != expectedExtension, "The file of binary model should end with .bin");

  AlignedMemory modelBytes = loadFileToMemory(modelFiles[0], 256);
  const bool looksComplete = validateBinaryModel(modelBytes, modelBytes.size());
  ABORT_IF(!looksComplete, "The binary file is invalid. Incomplete or corrupted download?");
  return modelBytes;
}
|
||||||
|
|
||||||
|
// Loads the file given as the first entry of the "shortlist" option into
// memory aligned to 64 bytes. Aborts if the option holds no entries.
AlignedMemory getShortlistMemoryFromConfig(marian::Ptr<marian::Options> options){
  auto shortlistArgs = options->get<std::vector<std::string>>("shortlist");
  const bool pathMissing = shortlistArgs.empty();
  ABORT_IF(pathMissing, "No path to shortlist file is given.");
  const std::string &shortlistPath = shortlistArgs[0];
  return loadFileToMemory(shortlistPath, 64);
}
|
||||||
|
|
||||||
|
} // namespace bergamot
|
||||||
|
} // namespace marian
|
12
src/translator/byte_array_util.h
Normal file
12
src/translator/byte_array_util.h
Normal file
@ -0,0 +1,12 @@
|
|||||||
|
#include "marian.h"
|
||||||
|
#include "definitions.h"
|
||||||
|
|
||||||
|
namespace marian {
|
||||||
|
namespace bergamot {
|
||||||
|
|
||||||
|
AlignedMemory loadFileToMemory(const std::string& path, size_t alignment);
|
||||||
|
AlignedMemory getModelMemoryFromConfig(marian::Ptr<marian::Options> options);
|
||||||
|
AlignedMemory getShortlistMemoryFromConfig(marian::Ptr<marian::Options> options);
|
||||||
|
|
||||||
|
} // namespace bergamot
|
||||||
|
} // namespace marian
|
@ -3,6 +3,7 @@
|
|||||||
|
|
||||||
#include "data/types.h"
|
#include "data/types.h"
|
||||||
#include "data/vocab_base.h"
|
#include "data/vocab_base.h"
|
||||||
|
#include "aligned.h"
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
namespace marian {
|
namespace marian {
|
||||||
@ -21,6 +22,9 @@ template <class T, typename... Args> UPtr<T> UNew(Args &&... args) {
|
|||||||
|
|
||||||
template <class T> UPtr<T> UNew(UPtr<T> p) { return UPtr<T>(p); }
|
template <class T> UPtr<T> UNew(UPtr<T> p) { return UPtr<T>(p); }
|
||||||
|
|
||||||
|
/// Shortcut to AlignedVector<const void*> for byte arrays
|
||||||
|
typedef AlignedVector<const void*> AlignedMemory;
|
||||||
|
|
||||||
} // namespace bergamot
|
} // namespace bergamot
|
||||||
} // namespace marian
|
} // namespace marian
|
||||||
|
|
||||||
|
@ -1,6 +1,10 @@
|
|||||||
#ifndef SRC_BERGAMOT_PARSER_H
|
#ifndef SRC_BERGAMOT_PARSER_H
|
||||||
#define SRC_BERGAMOT_PARSER_H
|
#define SRC_BERGAMOT_PARSER_H
|
||||||
|
|
||||||
|
#include "3rd_party/yaml-cpp/yaml.h"
|
||||||
|
#include "common/config_parser.h"
|
||||||
|
#include "common/config_validator.h"
|
||||||
|
#include "common/options.h"
|
||||||
#include "marian.h"
|
#include "marian.h"
|
||||||
|
|
||||||
namespace marian {
|
namespace marian {
|
||||||
@ -19,9 +23,47 @@ inline marian::ConfigParser createConfigParser() {
|
|||||||
"--max-length-break", "Bergamot Options",
|
"--max-length-break", "Bergamot Options",
|
||||||
"Maximum input tokens to be processed in a single sentence.", 128);
|
"Maximum input tokens to be processed in a single sentence.", 128);
|
||||||
|
|
||||||
|
cp.addOption<bool>(
|
||||||
|
"--check-bytearray", "Bergamot Options",
|
||||||
|
"Flag holds whether to check the content of the bytearray (true by default)", true);
|
||||||
|
|
||||||
return cp;
|
return cp;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
inline std::shared_ptr<marian::Options>
|
||||||
|
parseOptions(const std::string &config) {
|
||||||
|
marian::Options options;
|
||||||
|
|
||||||
|
// @TODO(jerinphilip) There's something off here, @XapaJIaMnu suggests
|
||||||
|
// that should not be using the defaultConfig. This function only has access
|
||||||
|
// to std::string config and needs to be able to construct Options from the
|
||||||
|
// same.
|
||||||
|
|
||||||
|
// Absent the following code-segment, there is a parsing exception thrown on
|
||||||
|
// rebuilding YAML.
|
||||||
|
//
|
||||||
|
// Error: Unhandled exception of type 'N4YAML11InvalidNodeE': invalid node;
|
||||||
|
// this may result from using a map iterator as a sequence iterator, or
|
||||||
|
// vice-versa
|
||||||
|
//
|
||||||
|
// Error: Aborted from void unhandledException() in
|
||||||
|
// 3rd_party/marian-dev/src/common/logging.cpp:113
|
||||||
|
|
||||||
|
marian::ConfigParser configParser = createConfigParser();
|
||||||
|
const YAML::Node &defaultConfig = configParser.getConfig();
|
||||||
|
|
||||||
|
options.merge(defaultConfig);
|
||||||
|
|
||||||
|
// Parse configs onto defaultConfig.
|
||||||
|
options.parse(config);
|
||||||
|
YAML::Node configCopy = options.cloneToYamlNode();
|
||||||
|
|
||||||
|
marian::ConfigValidator validator(configCopy);
|
||||||
|
validator.validateOptions(marian::cli::mode::translation);
|
||||||
|
|
||||||
|
return std::make_shared<marian::Options>(options);
|
||||||
|
}
|
||||||
|
|
||||||
} // namespace bergamot
|
} // namespace bergamot
|
||||||
} // namespace marian
|
} // namespace marian
|
||||||
|
|
||||||
|
@ -12,12 +12,10 @@ namespace bergamot {
|
|||||||
|
|
||||||
// -----------------------------------------------------------------
|
// -----------------------------------------------------------------
|
||||||
Request::Request(size_t Id, size_t lineNumberBegin,
|
Request::Request(size_t Id, size_t lineNumberBegin,
|
||||||
std::vector<Ptr<Vocab const>> &vocabs, std::string &&source,
|
std::vector<Ptr<Vocab const>> &vocabs, AnnotatedText &&source,
|
||||||
Segments &&segments, SentenceRanges &&sourceRanges,
|
Segments &&segments, std::promise<Response> responsePromise)
|
||||||
std::promise<Response> responsePromise)
|
|
||||||
: Id_(Id), lineNumberBegin_(lineNumberBegin), vocabs_(&vocabs),
|
: Id_(Id), lineNumberBegin_(lineNumberBegin), vocabs_(&vocabs),
|
||||||
source_(std::move(source)), segments_(std::move(segments)),
|
source_(std::move(source)), segments_(std::move(segments)),
|
||||||
sourceRanges_(std::move(sourceRanges)),
|
|
||||||
response_(std::move(responsePromise)) {
|
response_(std::move(responsePromise)) {
|
||||||
|
|
||||||
counter_ = segments_.size();
|
counter_ = segments_.size();
|
||||||
@ -48,8 +46,7 @@ void Request::processHistory(size_t index, Ptr<History> history) {
|
|||||||
void Request::completeRequest() {
|
void Request::completeRequest() {
|
||||||
// Request no longer needs to hold the content, can transfer it to
|
// Request no longer needs to hold the content, can transfer it to
|
||||||
// Response.
|
// Response.
|
||||||
Response response(std::move(source_), std::move(sourceRanges_),
|
Response response(std::move(source_), std::move(histories_), *vocabs_);
|
||||||
std::move(histories_), *vocabs_);
|
|
||||||
response_.set_value(std::move(response));
|
response_.set_value(std::move(response));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,9 +1,9 @@
|
|||||||
//
|
//
|
||||||
// Defines:
|
// Defines:
|
||||||
//
|
//
|
||||||
// Request: holds the input blob of a text, Segments (vector<Words>) which are
|
// Request: holds the input text of a text, Segments (vector<Words>) which are
|
||||||
// to go to the batching mechanism and alignments between the processed
|
// to go to the batching mechanism and alignments between the processed
|
||||||
// segments and the input blob (sourceTokenRanges). In addition, Request takes
|
// segments and the input text (sourceTokenRanges). In addition, Request takes
|
||||||
// care of the barrier which fires when all the Segments in a request are done
|
// care of the barrier which fires when all the Segments in a request are done
|
||||||
// translating by the workers (BatchTranslator).
|
// translating by the workers (BatchTranslator).
|
||||||
// TODO(jerinphilip): Extend Request with notions of Priority (sequence,
|
// TODO(jerinphilip): Extend Request with notions of Priority (sequence,
|
||||||
@ -36,9 +36,8 @@ namespace bergamot {
|
|||||||
class Request {
|
class Request {
|
||||||
public:
|
public:
|
||||||
Request(size_t Id, size_t lineNumberBegin,
|
Request(size_t Id, size_t lineNumberBegin,
|
||||||
std::vector<Ptr<Vocab const>> &vocabs_, std::string &&source,
|
std::vector<Ptr<Vocab const>> &vocabs_, AnnotatedText &&source,
|
||||||
Segments &&segments, SentenceRanges &&sourceTokenRanges,
|
Segments &&segments, std::promise<Response> responsePromise);
|
||||||
std::promise<Response> responsePromise);
|
|
||||||
|
|
||||||
// Obtain the count of tokens in the segment corresponding to index. Used to
|
// Obtain the count of tokens in the segment corresponding to index. Used to
|
||||||
// insert sentence from multiple requests into the corresponding size bucket.
|
// insert sentence from multiple requests into the corresponding size bucket.
|
||||||
@ -77,9 +76,8 @@ private:
|
|||||||
// string_views of the text corresponding to these words, pointing to
|
// string_views of the text corresponding to these words, pointing to
|
||||||
// sequences in source_. histories_ is a buffer which eventually stores the
|
// sequences in source_. histories_ is a buffer which eventually stores the
|
||||||
// translations of each segment in the corresponding index.
|
// translations of each segment in the corresponding index.
|
||||||
std::string source_;
|
AnnotatedText source_;
|
||||||
Segments segments_;
|
Segments segments_;
|
||||||
SentenceRanges sourceRanges_;
|
|
||||||
std::vector<Ptr<History>> histories_;
|
std::vector<Ptr<History>> histories_;
|
||||||
|
|
||||||
// Members above are moved into newly constructed Response on completion
|
// Members above are moved into newly constructed Response on completion
|
||||||
|
@ -1,97 +1,105 @@
|
|||||||
#include "response.h"
|
#include "response.h"
|
||||||
#include "sentence_ranges.h"
|
|
||||||
#include "common/logging.h"
|
#include "common/logging.h"
|
||||||
#include "data/alignment.h"
|
#include "data/alignment.h"
|
||||||
|
#include "sentence_ranges.h"
|
||||||
|
|
||||||
#include <utility>
|
#include <utility>
|
||||||
|
|
||||||
namespace marian {
|
namespace marian {
|
||||||
namespace bergamot {
|
namespace bergamot {
|
||||||
|
|
||||||
Response::Response(std::string &&source, SentenceRanges &&sourceRanges,
|
Response::Response(AnnotatedText &&source, Histories &&histories,
|
||||||
Histories &&histories, std::vector<Ptr<Vocab const>> &vocabs)
|
std::vector<Ptr<Vocab const>> &vocabs)
|
||||||
: source_(std::move(source)), sourceRanges_(std::move(sourceRanges)),
|
: source(std::move(source)) {
|
||||||
histories_(std::move(histories)), vocabs_(&vocabs) {}
|
|
||||||
|
|
||||||
void Response::move(std::string &source, std::string &translation,
|
|
||||||
SentenceMappings &sentenceMappings) {
|
|
||||||
|
|
||||||
// Construct required stuff first.
|
|
||||||
constructTranslation();
|
|
||||||
constructSentenceMappings(sentenceMappings);
|
|
||||||
|
|
||||||
// Move content out.
|
|
||||||
source = std::move(source_);
|
|
||||||
translation = std::move(translation_);
|
|
||||||
|
|
||||||
// The above assignment expects source, target be moved.
|
|
||||||
// which makes the following invalid, hence required to be cleared.
|
|
||||||
sourceRanges_.clear();
|
|
||||||
targetRanges_.clear();
|
|
||||||
histories_.clear();
|
|
||||||
}
|
|
||||||
|
|
||||||
void Response::constructTranslation() {
|
|
||||||
if (translationConstructed_) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Reserving length at least as much as source_ seems like a reasonable thing
|
// Reserving length at least as much as source_ seems like a reasonable thing
|
||||||
// to do to avoid reallocations.
|
// to do to avoid reallocations.
|
||||||
translation_.reserve(source_.size());
|
target.text.reserve(source.text.size());
|
||||||
|
|
||||||
// In a first step, the decoded units (individual sentences) are compiled
|
// In a first step, the decoded units (individual sentences) are compiled
|
||||||
// into a huge string. This is done by computing indices first and appending
|
// into a huge string. This is done by computing indices first and appending
|
||||||
// to the string as each sentence is decoded.
|
// to the string as each sentence is decoded.
|
||||||
std::vector<std::pair<size_t, size_t>> translationRanges;
|
std::vector<std::pair<size_t, size_t>> translationRanges;
|
||||||
|
std::vector<size_t> sentenceBegins;
|
||||||
|
|
||||||
size_t offset{0};
|
size_t offset{0};
|
||||||
bool first{true};
|
bool first{true};
|
||||||
|
|
||||||
for (auto &history : histories_) {
|
for (auto &history : histories) {
|
||||||
// TODO(jerin): Change hardcode of nBest = 1
|
// TODO(jerin): Change hardcode of nBest = 1
|
||||||
NBestList onebest = history->nBest(1);
|
NBestList onebest = history->nBest(1);
|
||||||
|
|
||||||
Result result = onebest[0]; // Expecting only one result;
|
Result result = onebest[0]; // Expecting only one result;
|
||||||
Words words = std::get<0>(result);
|
Words words = std::get<0>(result);
|
||||||
auto targetVocab = vocabs_->back();
|
auto targetVocab = vocabs.back();
|
||||||
std::string decoded = targetVocab->decode(words);
|
|
||||||
|
std::string decoded;
|
||||||
|
std::vector<string_view> targetMappings;
|
||||||
|
targetVocab->decodeWithByteRanges(words, decoded, targetMappings);
|
||||||
|
|
||||||
if (first) {
|
if (first) {
|
||||||
first = false;
|
first = false;
|
||||||
} else {
|
} else {
|
||||||
translation_ += " ";
|
target.text += " ";
|
||||||
++offset;
|
++offset;
|
||||||
}
|
}
|
||||||
|
|
||||||
translation_ += decoded;
|
sentenceBegins.push_back(translationRanges.size());
|
||||||
translationRanges.emplace_back(offset, decoded.size());
|
target.text += decoded;
|
||||||
|
auto decodedStringBeginMarker = targetMappings.front().begin();
|
||||||
|
for (auto &sview : targetMappings) {
|
||||||
|
size_t startIdx = offset + sview.begin() - decodedStringBeginMarker;
|
||||||
|
translationRanges.emplace_back(startIdx, startIdx + sview.size());
|
||||||
|
}
|
||||||
|
|
||||||
offset += decoded.size();
|
offset += decoded.size();
|
||||||
|
|
||||||
|
// Alignments
|
||||||
|
// TODO(jerinphilip): The following double conversion might not be
|
||||||
|
// necessary. Hard alignment can directly be exported, but this would mean
|
||||||
|
// WASM bindings for a structure deep within marian source.
|
||||||
|
auto hyp = std::get<1>(result);
|
||||||
|
auto softAlignment = hyp->tracebackAlignment();
|
||||||
|
auto hardAlignment = data::ConvertSoftAlignToHardAlign(
|
||||||
|
softAlignment, /*threshold=*/0.2f); // TODO(jerinphilip): Make this a
|
||||||
|
// configurable parameter.
|
||||||
|
|
||||||
|
Alignment unified_alignment;
|
||||||
|
for (auto &p : hardAlignment) {
|
||||||
|
unified_alignment.emplace_back((Point){p.srcPos, p.tgtPos, p.prob});
|
||||||
}
|
}
|
||||||
|
|
||||||
// Once the entire string is constructed, there are no further possibility of
|
alignments.push_back(std::move(unified_alignment));
|
||||||
// reallocation in the string's storage, the indices are converted into
|
|
||||||
// string_views.
|
|
||||||
|
|
||||||
for (auto &range : translationRanges) {
|
// Quality scores: Sequence level is obtained as normalized path scores.
|
||||||
// TODO(@jerinphilip): Currently considers target tokens as whole text.
|
// Word level using hypothesis traceback. These are most-likely logprobs.
|
||||||
// Needs to be further enhanced in marian-dev to extract alignments.
|
auto normalizedPathScore = std::get<2>(result);
|
||||||
|
auto wordQualities = hyp->tracebackWordScores();
|
||||||
|
wordQualities.pop_back();
|
||||||
|
qualityScores.push_back((Quality){normalizedPathScore, wordQualities});
|
||||||
|
}
|
||||||
|
|
||||||
|
// Once we have the indices in translation (which might be resized a few
|
||||||
|
// times) ready, we can prepare and store the string_view as annotations
|
||||||
|
// instead. This is accomplished by iterating over available sentences using
|
||||||
|
// sentenceBegin and using addSentence(...) API from Annotation.
|
||||||
|
|
||||||
|
for (size_t i = 1; i <= sentenceBegins.size(); i++) {
|
||||||
std::vector<string_view> targetMappings;
|
std::vector<string_view> targetMappings;
|
||||||
|
size_t begin = sentenceBegins[i - 1];
|
||||||
|
size_t safe_end = (i == sentenceBegins.size()) ? translationRanges.size()
|
||||||
|
: sentenceBegins[i];
|
||||||
|
|
||||||
const char *begin = &translation_[range.first];
|
for (size_t idx = begin; idx < safe_end; idx++) {
|
||||||
targetMappings.emplace_back(begin, range.second);
|
auto &p = translationRanges[idx];
|
||||||
targetRanges_.addSentence(targetMappings);
|
size_t begin_idx = p.first;
|
||||||
|
size_t end_idx = p.second;
|
||||||
|
|
||||||
|
const char *data = &target.text[begin_idx];
|
||||||
|
size_t size = end_idx - begin_idx;
|
||||||
|
targetMappings.emplace_back(data, size);
|
||||||
}
|
}
|
||||||
|
|
||||||
translationConstructed_ = true;
|
target.addSentence(targetMappings);
|
||||||
}
|
|
||||||
|
|
||||||
void Response::constructSentenceMappings(
|
|
||||||
Response::SentenceMappings &sentenceMappings) {
|
|
||||||
|
|
||||||
for (size_t i = 0; i < sourceRanges_.numSentences(); i++) {
|
|
||||||
string_view src = sourceRanges_.sentence(i);
|
|
||||||
string_view tgt = targetRanges_.sentence(i);
|
|
||||||
sentenceMappings.emplace_back(src, tgt);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} // namespace bergamot
|
} // namespace bergamot
|
||||||
|
@ -1,9 +1,10 @@
|
|||||||
#ifndef SRC_BERGAMOT_RESPONSE_H_
|
#ifndef SRC_BERGAMOT_RESPONSE_H_
|
||||||
#define SRC_BERGAMOT_RESPONSE_H_
|
#define SRC_BERGAMOT_RESPONSE_H_
|
||||||
|
|
||||||
#include "sentence_ranges.h"
|
#include "data/alignment.h"
|
||||||
#include "data/types.h"
|
#include "data/types.h"
|
||||||
#include "definitions.h"
|
#include "definitions.h"
|
||||||
|
#include "sentence_ranges.h"
|
||||||
#include "translator/beam_search.h"
|
#include "translator/beam_search.h"
|
||||||
|
|
||||||
#include <cassert>
|
#include <cassert>
|
||||||
@ -12,86 +13,79 @@
|
|||||||
|
|
||||||
namespace marian {
|
namespace marian {
|
||||||
namespace bergamot {
|
namespace bergamot {
|
||||||
|
|
||||||
|
/// Alignment is stored as a sparse matrix, this pretty much aligns with marian
|
||||||
|
/// internals but is brought here to maintain translator
|
||||||
|
/// agnosticism/independence.
|
||||||
|
struct Point {
|
||||||
|
size_t src; ///< Index pointing to source ByteRange
|
||||||
|
size_t tgt; ///< Index pointing to target ByteRange
|
||||||
|
float prob; ///< Score in [0, 1] indicating the degree of alignment.
|
||||||
|
};
|
||||||
|
|
||||||
|
/// Alignment is a sparse matrix, where Points represent entries with values.
|
||||||
|
typedef std::vector<Point> Alignment;
|
||||||
|
|
||||||
|
/// -loglikelihoods of the sequence components as proxy to quality.
|
||||||
|
struct Quality {
|
||||||
|
/// Certainty/uncertainty score for sequence.
|
||||||
|
float sequence;
|
||||||
|
/// Certainty/uncertainty for each word in the sequence.
|
||||||
|
std::vector<float> word;
|
||||||
|
};
|
||||||
|
|
||||||
|
/// Response holds AnnotatedText(s) of source-text and translated text,
|
||||||
|
/// alignment information between source and target sub-words and sentences.
|
||||||
|
///
|
||||||
|
/// AnnotatedText provides an API to access markings of (sub)-word and
|
||||||
|
/// sentences boundaries, which are required to interpret Quality and
|
||||||
|
/// Alignment (s) at the moment.
|
||||||
class Response {
|
class Response {
|
||||||
// Response is a marian internal class (not a bergamot-translator class)
|
|
||||||
// holding source blob of text, vector of TokenRanges corresponding to each
|
|
||||||
// sentence in the source text blob and histories obtained from translating
|
|
||||||
// these sentences.
|
|
||||||
//
|
|
||||||
// This class provides an API at a higher level in comparison to History to
|
|
||||||
// access translations and additionally use string_view manipulations to
|
|
||||||
// recover structure in translation from source-text's structure known through
|
|
||||||
// reference string and string_view. As many of these computations are not
|
|
||||||
// required until invoked, they are computed as required and stored in data
|
|
||||||
// members where it makes sense to do so (translation,translationTokenRanges).
|
|
||||||
//
|
|
||||||
// Examples of such use-cases are:
|
|
||||||
// translation()
|
|
||||||
// translationInSourceStructure() TODO(@jerinphilip)
|
|
||||||
// alignment(idx) TODO(@jerinphilip)
|
|
||||||
// sentenceMappings (for bergamot-translator)
|
|
||||||
|
|
||||||
public:
|
public:
|
||||||
Response(std::string &&source, SentenceRanges &&sourceRanges,
|
///
|
||||||
Histories &&histories,
|
Response(AnnotatedText &&source, Histories &&histories,
|
||||||
// Required for constructing translation and TokenRanges within
|
|
||||||
// translation lazily.
|
|
||||||
std::vector<Ptr<Vocab const>> &vocabs);
|
std::vector<Ptr<Vocab const>> &vocabs);
|
||||||
|
|
||||||
|
/// \cond HIDDEN_PUBLIC
|
||||||
// Move constructor.
|
// Move constructor.
|
||||||
Response(Response &&other)
|
Response(Response &&other)
|
||||||
: source_(std::move(other.source_)),
|
: source(std::move(other.source)), target(std::move(other.target)),
|
||||||
translation_(std::move(other.translation_)),
|
alignments(std::move(other.alignments)),
|
||||||
sourceRanges_(std::move(other.sourceRanges_)),
|
qualityScores(std::move(other.qualityScores)){};
|
||||||
targetRanges_(std::move(other.targetRanges_)),
|
|
||||||
histories_(std::move(other.histories_)),
|
// The following copy bans are not strictly required anymore since Annotation
|
||||||
vocabs_(std::move(other.vocabs_)){};
|
// is composed of the ByteRange primitive (which was previously string_view
|
||||||
|
// and required to be bound to string), but makes movement efficient by
|
||||||
|
// banning these letting compiler complain about copies.
|
||||||
|
|
||||||
// Prevents CopyConstruction and CopyAssignment. sourceRanges_ is constituted
|
|
||||||
// by string_view and copying invalidates the data member.
|
|
||||||
Response(const Response &) = delete;
|
Response(const Response &) = delete;
|
||||||
Response &operator=(const Response &) = delete;
|
Response &operator=(const Response &) = delete;
|
||||||
|
|
||||||
typedef std::vector<std::pair<const string_view, const string_view>>
|
/// \endcond
|
||||||
SentenceMappings;
|
|
||||||
|
|
||||||
// Moves source sentence into source, translated text into translation.
|
/// Number of sentences translated. The processing of a text into sentences
|
||||||
// Pairs of string_views to corresponding sentences in
|
/// are handled internally, and this information can be used to iterate
|
||||||
// source and translation are loaded into sentenceMappings. These string_views
|
/// through meaningful units of translation for which alignment and quality
|
||||||
// reference the new source and translation.
|
/// information are available.
|
||||||
//
|
const size_t size() const { return source.numSentences(); }
|
||||||
// Calling move() invalidates the Response object as ownership is transferred.
|
|
||||||
// Exists for moving strc
|
|
||||||
void move(std::string &source, std::string &translation,
|
|
||||||
SentenceMappings &sentenceMappings);
|
|
||||||
|
|
||||||
const Histories &histories() const { return histories_; }
|
/// source text and annotations of (sub-)words and sentences.
|
||||||
const std::string &source() const { return source_; }
|
AnnotatedText source;
|
||||||
const std::string &translation() {
|
|
||||||
constructTranslation();
|
|
||||||
return translation_;
|
|
||||||
}
|
|
||||||
|
|
||||||
// A convenience function provided to return translated text placed within
|
/// translated text and annotations of (sub-)words and sentences.
|
||||||
// source's structure. This is useful when the source text is a multi-line
|
AnnotatedText target;
|
||||||
// paragraph or string_views extracted from structured text like HTML and it's
|
|
||||||
// desirable to place the individual sentences in the locations of the source
|
|
||||||
// sentences.
|
|
||||||
// const std::string translationInSourceStructure();
|
|
||||||
// const PendingAlignmentType alignment(size_t idx);
|
|
||||||
|
|
||||||
private:
|
/// -logprob of each word and negative log likelihood of sequence (sentence)
|
||||||
void constructTranslation();
|
/// normalized by length, for each sentence processed by the translator.
|
||||||
void constructSentenceMappings(SentenceMappings &);
|
/// Indices correspond to ranges accessible through respective Annotation on
|
||||||
|
/// source or target.
|
||||||
|
std::vector<Quality> qualityScores;
|
||||||
|
|
||||||
std::string source_;
|
/// Alignments between source and target. Each Alignment is a
|
||||||
SentenceRanges sourceRanges_;
|
/// sparse matrix representation with indices corresponding
|
||||||
Histories histories_;
|
/// to (sub-)words accessible through Annotation.
|
||||||
|
std::vector<Alignment> alignments;
|
||||||
std::vector<Ptr<Vocab const>> *vocabs_;
|
|
||||||
bool translationConstructed_{false};
|
|
||||||
std::string translation_;
|
|
||||||
SentenceRanges targetRanges_;
|
|
||||||
};
|
};
|
||||||
} // namespace bergamot
|
} // namespace bergamot
|
||||||
} // namespace marian
|
} // namespace marian
|
||||||
|
@ -5,40 +5,83 @@
|
|||||||
namespace marian {
|
namespace marian {
|
||||||
namespace bergamot {
|
namespace bergamot {
|
||||||
|
|
||||||
void SentenceRanges::addSentence(std::vector<string_view> &wordRanges) {
|
void Annotation::addSentence(std::vector<ByteRange> &sentence) {
|
||||||
addSentence(std::begin(wordRanges), std::end(wordRanges));
|
flatByteRanges_.insert(std::end(flatByteRanges_), std::begin(sentence),
|
||||||
}
|
std::end(sentence));
|
||||||
|
|
||||||
void SentenceRanges::addSentence(WordIterator begin, WordIterator end) {
|
|
||||||
size_t size = flatByteRanges_.size();
|
size_t size = flatByteRanges_.size();
|
||||||
flatByteRanges_.insert(std::end(flatByteRanges_), begin, end);
|
sentenceEndIds_.push_back(size);
|
||||||
sentenceBeginIds_.push_back(size);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
string_view SentenceRanges::sentence(size_t index) const {
|
size_t Annotation::numWords(size_t sentenceIdx) const {
|
||||||
size_t bos_id;
|
size_t bosId, eosId;
|
||||||
string_view eos, bos;
|
bosId = sentenceEndIds_[sentenceIdx]; // Half interval, so;
|
||||||
|
eosId = sentenceEndIds_[sentenceIdx + 1];
|
||||||
|
// Difference between eosId and bosId is the number of words.
|
||||||
|
return eosId - bosId;
|
||||||
|
}
|
||||||
|
|
||||||
bos_id = sentenceBeginIds_[index];
|
ByteRange Annotation::sentence(size_t sentenceIdx) const {
|
||||||
bos = flatByteRanges_[bos_id];
|
size_t bosId, eosId;
|
||||||
|
bosId = sentenceEndIds_[sentenceIdx]; // Half interval, so;
|
||||||
|
eosId = sentenceEndIds_[sentenceIdx + 1];
|
||||||
|
ByteRange sentenceByteRange;
|
||||||
|
|
||||||
if (index + 1 == numSentences()) {
|
if (bosId == eosId) {
|
||||||
eos = flatByteRanges_.back();
|
// We have an empty sentence. However, we want to be able to point where in
|
||||||
|
// target this happened through the ranges. We are looking for the end of
|
||||||
|
// the flatByteRange and non-empty sentence before this happened and
|
||||||
|
// construct empty string-view equivalent ByteRange.
|
||||||
|
ByteRange eos = flatByteRanges_[eosId - 1];
|
||||||
|
sentenceByteRange = (ByteRange){eos.end, eos.end};
|
||||||
} else {
|
} else {
|
||||||
assert(index < numSentences());
|
ByteRange bos = flatByteRanges_[bosId];
|
||||||
size_t eos_id = sentenceBeginIds_[index + 1];
|
ByteRange eos = flatByteRanges_[eosId - 1];
|
||||||
--eos_id;
|
sentenceByteRange = (ByteRange){bos.begin, eos.end};
|
||||||
eos = flatByteRanges_[eos_id];
|
}
|
||||||
|
return sentenceByteRange;
|
||||||
}
|
}
|
||||||
|
|
||||||
return sentenceBetween(bos, eos);
|
ByteRange Annotation::word(size_t sentenceIdx, size_t wordIdx) const {
|
||||||
|
size_t bosOffset = sentenceEndIds_[sentenceIdx];
|
||||||
|
return flatByteRanges_[bosOffset + wordIdx];
|
||||||
}
|
}
|
||||||
|
|
||||||
string_view SentenceRanges::sentenceBetween(string_view firstWord,
|
string_view AnnotatedText::word(size_t sentenceIdx, size_t wordIdx) const {
|
||||||
string_view lastWord) const {
|
auto terminals = annotation.word(sentenceIdx, wordIdx);
|
||||||
|
return string_view(&text[terminals.begin], terminals.size());
|
||||||
|
}
|
||||||
|
|
||||||
const char *data = firstWord.data();
|
string_view AnnotatedText::sentence(size_t sentenceIdx) const {
|
||||||
size_t size = lastWord.data() + lastWord.size() - firstWord.data();
|
auto sentenceAsByteRange = annotation.sentence(sentenceIdx);
|
||||||
|
return asStringView(sentenceAsByteRange);
|
||||||
|
}
|
||||||
|
|
||||||
|
void AnnotatedText::addSentence(std::vector<string_view> &wordRanges) {
|
||||||
|
addSentence(std::begin(wordRanges), std::end(wordRanges));
|
||||||
|
};
|
||||||
|
|
||||||
|
void AnnotatedText::addSentence(std::vector<string_view>::iterator begin,
|
||||||
|
std::vector<string_view>::iterator end) {
|
||||||
|
std::vector<ByteRange> sentence;
|
||||||
|
for (auto p = begin; p != end; p++) {
|
||||||
|
size_t begin_offset = p->data() - &text[0];
|
||||||
|
sentence.push_back((ByteRange){begin_offset, begin_offset + p->size()});
|
||||||
|
}
|
||||||
|
annotation.addSentence(sentence);
|
||||||
|
};
|
||||||
|
|
||||||
|
ByteRange AnnotatedText::wordAsByteRange(size_t sentenceIdx,
|
||||||
|
size_t wordIdx) const {
|
||||||
|
return annotation.word(sentenceIdx, wordIdx);
|
||||||
|
}
|
||||||
|
|
||||||
|
ByteRange AnnotatedText::sentenceAsByteRange(size_t sentenceIdx) const {
|
||||||
|
return annotation.sentence(sentenceIdx);
|
||||||
|
}
|
||||||
|
|
||||||
|
string_view AnnotatedText::asStringView(const ByteRange &byteRange) const {
|
||||||
|
const char *data = &text[byteRange.begin];
|
||||||
|
size_t size = byteRange.size();
|
||||||
return string_view(data, size);
|
return string_view(data, size);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -3,50 +3,165 @@
|
|||||||
|
|
||||||
#include "data/types.h"
|
#include "data/types.h"
|
||||||
#include <cassert>
|
#include <cassert>
|
||||||
|
#include <utility>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
namespace marian {
|
namespace marian {
|
||||||
namespace bergamot {
|
namespace bergamot {
|
||||||
|
|
||||||
class SentenceRanges {
|
/// ByteRange stores indices for half-interval [begin, end) in a string. Can be
|
||||||
// SentenceRanges stores string_views into a source text, with additional
|
/// used to represent a sentence, word.
|
||||||
// annotations to mark sentence boundaries.
|
struct ByteRange {
|
||||||
//
|
size_t begin;
|
||||||
// Given the availability annotations, this container provides capabilty to
|
size_t end;
|
||||||
// add sentences, and access individual sentences.
|
const size_t size() const { return end - begin; }
|
||||||
|
};
|
||||||
|
|
||||||
|
/// An Annotation is a collection of ByteRanges used to denote ancillary
|
||||||
|
/// information of sentences and words on a text of string. Annotation is meant
|
||||||
|
/// for consumption on platforms where `string_view` creates problems (eg:
|
||||||
|
/// exports through WASM) conveniently rebasing them as required into
|
||||||
|
/// ByteRanges. See AnnotatedText for cases where this is a non-issue.
|
||||||
|
///
|
||||||
|
/// **Usage**
|
||||||
|
///
|
||||||
|
/// To ensure rebasing is consistent during creation and updation, use
|
||||||
|
/// `Annotation` best through `AnnotatedText`, which also holds the reference
|
||||||
|
/// string and can work with `string_views`.
|
||||||
|
///
|
||||||
|
/// If used separately, it is on the user to ensure the reference string
|
||||||
|
/// is the same as what the Annotation refers to. For best results, an instance
|
||||||
|
/// is expected to be read only in this mode of operation.
|
||||||
|
///
|
||||||
|
/// **Idea**
|
||||||
|
///
|
||||||
|
/// Annotation is intended to be the same structure conceptually as below,
|
||||||
|
/// except the `std::vector<std::vector<ByteRange>>` hammered into a flat
|
||||||
|
/// structure to avoid multiple reallocs keeping efficiency in mind. This is
|
||||||
|
/// achieved by having markers of where sentence ends in the flat container
|
||||||
|
/// storing word ByteRanges.
|
||||||
|
///
|
||||||
|
/// ```cpp
|
||||||
|
/// typedef ByteRange Word;
|
||||||
|
/// // std::vector<ByteRange>, a single sentence
|
||||||
|
/// typedef std::vector<Word> Sentence;
|
||||||
|
/// // std::vector<std::vector<ByteRange>>, multiple sentences
|
||||||
|
/// typedef std::vector<Sentence> Annotation;
|
||||||
|
///
|
||||||
|
/// Annotation example;
|
||||||
|
/// ```
|
||||||
|
/// This structure exists to provide a consistent API to access the nested
|
||||||
|
/// sentences of varying lengths, which occur in source-text processed into
|
||||||
|
/// multiple sentences, and target-text translated from source as multiple
|
||||||
|
/// sentences, both composed of (sub)-words, providing a List[List] like access
|
||||||
|
/// while storing it in a compact and efficient manner.
|
||||||
|
class Annotation {
|
||||||
public:
|
public:
|
||||||
typedef std::vector<string_view>::iterator WordIterator;
|
/// Annotation is constructed empty. See `addSentence()` to populate it with
|
||||||
|
/// annotations.
|
||||||
void addSentence(std::vector<string_view> &wordRanges);
|
Annotation() {
|
||||||
void addSentence(WordIterator begin, WordIterator end);
|
// The -1-th sentence ends at 0.
|
||||||
|
sentenceEndIds_.push_back(0);
|
||||||
void clear() {
|
|
||||||
flatByteRanges_.clear();
|
|
||||||
sentenceBeginIds_.clear();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t numSentences() const { return sentenceBeginIds_.size(); }
|
/// Returns the number of sentences annotated in a text.
|
||||||
|
size_t numSentences() const { return sentenceEndIds_.size() - 1; }
|
||||||
|
|
||||||
// Returns a string_view into the ith sentence.
|
/// Returns number of words in the sentence identified by `sentenceIdx`.
|
||||||
string_view sentence(size_t index) const;
|
size_t numWords(size_t sentenceIdx) const;
|
||||||
|
|
||||||
|
/// Adds a sentence from `vector<ByteRange>` representation, internally doing
|
||||||
|
/// extra book-keeping for the sentence terminal markings. Sentences are
|
||||||
|
/// expected to be added in order as they occur in text.
|
||||||
|
void addSentence(std::vector<ByteRange> &sentence);
|
||||||
|
|
||||||
|
/// Returns a ByteRange representing `wordIdx` in sentence indexed by
|
||||||
|
/// `sentenceIdx`. `wordIdx` follows 0-based indexing, and should be less than
|
||||||
|
/// `.numWords()` for `sentenceIdx` for defined behaviour.
|
||||||
|
ByteRange word(size_t sentenceIdx, size_t wordIdx) const;
|
||||||
|
|
||||||
|
/// Returns a ByteRange representing sentence corresponding to `sentenceIdx`.
|
||||||
|
/// `sentenceIdx` follows 0-based indexing, and behaviour is defined only when
|
||||||
|
/// less than `.numSentences()`.
|
||||||
|
ByteRange sentence(size_t sentenceIdx) const;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
// A flat storage for string_views. Can be words or sentences.
|
/// A flat storage for ByteRanges. Composed of word ByteRanges, extra
|
||||||
std::vector<string_view> flatByteRanges_;
|
/// information in sentenceEndIds_ to denote sentence boundary markers as
|
||||||
|
/// indices.
|
||||||
|
std::vector<ByteRange> flatByteRanges_;
|
||||||
|
|
||||||
// The container grows dynamically with addSentence. size_t marking index is
|
/// Stores indices onto flatByteRanges_ of where sentences end (not inclusive,
|
||||||
// used to ensure the sentence boundaries stay same while underlying storage
|
/// aligned with C++ half interval notions). There is a 0 marker to simplify
|
||||||
// might be changed during reallocation.
|
/// sources, indicating where the -1-th sentence ends.
|
||||||
std::vector<size_t> sentenceBeginIds_;
|
std::vector<size_t> sentenceEndIds_;
|
||||||
|
};
|
||||||
|
|
||||||
// Utility function to extract the string starting at firstWord and ending at
|
/// AnnotatedText is effectively std::string text + Annotation, providing the
|
||||||
// lastWord as a single string-view.
|
/// following additional desiderata.
|
||||||
string_view sentenceBetween(string_view firstWord,
|
///
|
||||||
string_view lastWord) const;
|
/// 1. Access to processed string_views for convenience rather than ByteRanges
|
||||||
|
/// (which only provides index information).
|
||||||
|
///
|
||||||
|
/// 2. Transparently convert string_views into ByteRanges for the Annotation
|
||||||
|
/// referring to the text bound by this structure.
|
||||||
|
///
|
||||||
|
/// 3. Bind the text and annotations together, to move around as a meaningful
|
||||||
|
/// unit.
|
||||||
|
|
||||||
|
struct AnnotatedText {
|
||||||
|
public:
|
||||||
|
std::string text; ///< Blob of string elements in annotation refers to.
|
||||||
|
Annotation annotation; ///< sentence and (sub-) word annotations.
|
||||||
|
|
||||||
|
/// Construct an empty AnnotatedText. This is useful when the target string or
|
||||||
|
/// ByteRanges are not known yet, but the public members can be used to
|
||||||
|
/// populate it. One use-case, when translated-text is created decoding from
|
||||||
|
/// histories and the ByteRanges only known after the string has been
|
||||||
|
/// constructed.
|
||||||
|
AnnotatedText() {}
|
||||||
|
|
||||||
|
/// Construct moving in a string (for efficiency purposes, copying string
|
||||||
|
/// constructor is disallowed).
|
||||||
|
AnnotatedText(std::string &&text) : text(std::move(text)){};
|
||||||
|
|
||||||
|
AnnotatedText(AnnotatedText &&annotatedBlob)
|
||||||
|
: text(std::move(annotatedBlob.text)),
|
||||||
|
annotation(std::move(annotatedBlob.annotation)) {}
|
||||||
|
|
||||||
|
/// Returns the number of sentences in the annotation structure.
|
||||||
|
const size_t numSentences() const { return annotation.numSentences(); }
|
||||||
|
|
||||||
|
/// Returns number of words in the sentence identified by sentenceIdx.
|
||||||
|
const size_t numWords(size_t sentenceIdx) const {
|
||||||
|
return annotation.numWords(sentenceIdx);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Adds a sentence, used to load from SentencePiece annotations conveniently.
|
||||||
|
void addSentence(std::vector<string_view> &wordRanges);
|
||||||
|
|
||||||
|
/// Adds a sentence between two iterators, often useful while constructing
|
||||||
|
/// from parts of a container.
|
||||||
|
void addSentence(std::vector<string_view>::iterator begin,
|
||||||
|
std::vector<string_view>::iterator end);
|
||||||
|
|
||||||
|
/// Returns a string_view representing wordIdx in sentenceIdx
|
||||||
|
string_view word(size_t sentenceIdx, size_t wordIdx) const;
|
||||||
|
|
||||||
|
/// Returns a string_view representing sentence corresponding to sentenceIdx.
|
||||||
|
string_view sentence(size_t sentenceIdx) const;
|
||||||
|
|
||||||
|
/// Returns a ByteRange representing wordIdx in sentenceIdx
|
||||||
|
ByteRange wordAsByteRange(size_t sentenceIdx, size_t wordIdx) const;
|
||||||
|
|
||||||
|
/// Returns a ByteRange representing sentence corresponding to sentenceIdx.
|
||||||
|
ByteRange sentenceAsByteRange(size_t sentenceIdx) const;
|
||||||
|
|
||||||
|
private:
|
||||||
|
string_view asStringView(const ByteRange &byteRange) const;
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace bergamot
|
} // namespace bergamot
|
||||||
|
|
||||||
} // namespace marian
|
} // namespace marian
|
||||||
|
|
||||||
#endif // BERGAMOT_SENTENCE_RANGES_H_
|
#endif // BERGAMOT_SENTENCE_RANGES_H_
|
||||||
|
@ -5,25 +5,78 @@
|
|||||||
#include <string>
|
#include <string>
|
||||||
#include <utility>
|
#include <utility>
|
||||||
|
|
||||||
|
inline std::vector<marian::Ptr<const marian::Vocab>>
|
||||||
|
loadVocabularies(marian::Ptr<marian::Options> options) {
|
||||||
|
// @TODO: parallelize vocab loading for faster startup
|
||||||
|
auto vfiles = options->get<std::vector<std::string>>("vocabs");
|
||||||
|
// with the current setup, we need at least two vocabs: src and trg
|
||||||
|
ABORT_IF(vfiles.size() < 2, "Insufficient number of vocabularies.");
|
||||||
|
std::vector<marian::Ptr<marian::Vocab const>> vocabs(vfiles.size());
|
||||||
|
std::unordered_map<std::string, marian::Ptr<marian::Vocab>> vmap;
|
||||||
|
for (size_t i = 0; i < vocabs.size(); ++i) {
|
||||||
|
auto m =
|
||||||
|
vmap.emplace(std::make_pair(vfiles[i], marian::Ptr<marian::Vocab>()));
|
||||||
|
if (m.second) { // new: load the vocab
|
||||||
|
m.first->second = marian::New<marian::Vocab>(options, i);
|
||||||
|
m.first->second->load(vfiles[i]);
|
||||||
|
}
|
||||||
|
vocabs[i] = m.first->second;
|
||||||
|
}
|
||||||
|
return vocabs;
|
||||||
|
}
|
||||||
|
|
||||||
namespace marian {
|
namespace marian {
|
||||||
namespace bergamot {
|
namespace bergamot {
|
||||||
|
|
||||||
Service::Service(Ptr<Options> options)
|
Service::Service(Ptr<Options> options, AlignedMemory modelMemory, AlignedMemory shortlistMemory)
|
||||||
: ServiceBase(options), numWorkers_(options->get<int>("cpu-threads")),
|
: requestId_(0), vocabs_(std::move(loadVocabularies(options))),
|
||||||
pcqueue_(numWorkers_) {
|
text_processor_(vocabs_, options), batcher_(options),
|
||||||
|
numWorkers_(options->get<int>("cpu-threads")),
|
||||||
|
modelMemory_(std::move(modelMemory)), shortlistMemory_(std::move(shortlistMemory))
|
||||||
|
#ifndef WASM_COMPATIBLE_SOURCE
|
||||||
|
// 0 elements in PCQueue is illegal and can lead to failures. Adding a
|
||||||
|
// guard to have at least one entry allocated. In the single-threaded
|
||||||
|
// case, while initialized pcqueue_ remains unused.
|
||||||
|
,
|
||||||
|
pcqueue_(std::max<size_t>(1, numWorkers_))
|
||||||
|
#endif
|
||||||
|
{
|
||||||
|
|
||||||
if (numWorkers_ == 0) {
|
if (numWorkers_ == 0) {
|
||||||
ABORT("Fatal: Attempt to create multithreaded instance with --cpu-threads "
|
build_translators(options, /*numTranslators=*/1);
|
||||||
"0. ");
|
initialize_blocking_translator();
|
||||||
|
} else {
|
||||||
|
build_translators(options, numWorkers_);
|
||||||
|
initialize_async_translators();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
translators_.reserve(numWorkers_);
|
void Service::build_translators(Ptr<Options> options, size_t numTranslators) {
|
||||||
|
translators_.reserve(numTranslators);
|
||||||
|
for (size_t cpuId = 0; cpuId < numTranslators; cpuId++) {
|
||||||
|
marian::DeviceId deviceId(cpuId, DeviceType::cpu);
|
||||||
|
translators_.emplace_back(deviceId, vocabs_, options, &modelMemory_, &shortlistMemory_);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void Service::initialize_blocking_translator() {
|
||||||
|
translators_.back().initialize();
|
||||||
|
}
|
||||||
|
|
||||||
|
void Service::blocking_translate() {
|
||||||
|
Batch batch;
|
||||||
|
while (batcher_ >> batch) {
|
||||||
|
auto &translator = translators_.back();
|
||||||
|
translator.translate(batch);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifndef WASM_COMPATIBLE_SOURCE
|
||||||
|
void Service::initialize_async_translators() {
|
||||||
workers_.reserve(numWorkers_);
|
workers_.reserve(numWorkers_);
|
||||||
|
|
||||||
for (size_t cpuId = 0; cpuId < numWorkers_; cpuId++) {
|
for (size_t cpuId = 0; cpuId < numWorkers_; cpuId++) {
|
||||||
marian::DeviceId deviceId(cpuId, DeviceType::cpu);
|
auto &translator = translators_[cpuId];
|
||||||
translators_.emplace_back(deviceId, vocabs_, options);
|
|
||||||
auto &translator = translators_.back();
|
|
||||||
|
|
||||||
workers_.emplace_back([&translator, this] {
|
workers_.emplace_back([&translator, this] {
|
||||||
translator.initialize();
|
translator.initialize();
|
||||||
|
|
||||||
@ -42,29 +95,58 @@ Service::Service(Ptr<Options> options)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void Service::enqueue() {
|
void Service::async_translate() {
|
||||||
Batch batch;
|
Batch batch;
|
||||||
while (batcher_ >> batch) {
|
while (batcher_ >> batch) {
|
||||||
pcqueue_.ProduceSwap(batch);
|
pcqueue_.ProduceSwap(batch);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
#else // WASM_COMPATIBLE_SOURCE
|
||||||
|
void Service::initialize_async_translators() {
|
||||||
|
ABORT("Cannot run in async mode without multithreading.");
|
||||||
|
}
|
||||||
|
|
||||||
|
void Service::async_translate() {
|
||||||
|
ABORT("Cannot run in async mode without multithreading.");
|
||||||
|
}
|
||||||
|
#endif // WASM_COMPATIBLE_SOURCE
|
||||||
|
|
||||||
|
std::future<Response> Service::translate(std::string &&input) {
|
||||||
|
Segments segments;
|
||||||
|
AnnotatedText source(std::move(input));
|
||||||
|
text_processor_.process(source, segments);
|
||||||
|
|
||||||
|
std::promise<Response> responsePromise;
|
||||||
|
auto future = responsePromise.get_future();
|
||||||
|
|
||||||
|
Ptr<Request> request = New<Request>(
|
||||||
|
requestId_++, /* lineNumberBegin = */ 0, vocabs_, std::move(source),
|
||||||
|
std::move(segments), std::move(responsePromise));
|
||||||
|
|
||||||
|
batcher_.addWholeRequest(request);
|
||||||
|
if (numWorkers_ == 0) {
|
||||||
|
blocking_translate();
|
||||||
|
} else {
|
||||||
|
async_translate();
|
||||||
|
}
|
||||||
|
return future;
|
||||||
|
}
|
||||||
|
|
||||||
|
Service::~Service() {
|
||||||
|
#ifndef WASM_COMPATIBLE_SOURCE
|
||||||
|
for (size_t workerId = 0; workerId < numWorkers_; workerId++) {
|
||||||
|
|
||||||
void Service::stop() {
|
|
||||||
for (auto &worker : workers_) {
|
|
||||||
Batch poison = Batch::poison();
|
Batch poison = Batch::poison();
|
||||||
pcqueue_.ProduceSwap(poison);
|
pcqueue_.ProduceSwap(poison);
|
||||||
}
|
}
|
||||||
|
|
||||||
for (auto &worker : workers_) {
|
for (size_t workerId = 0; workerId < numWorkers_; workerId++) {
|
||||||
if (worker.joinable()) {
|
if (workers_[workerId].joinable()) {
|
||||||
worker.join();
|
workers_[workerId].join();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
workers_.clear();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
Service::~Service() { stop(); }
|
|
||||||
|
|
||||||
} // namespace bergamot
|
} // namespace bergamot
|
||||||
} // namespace marian
|
} // namespace marian
|
||||||
|
@ -4,10 +4,13 @@
|
|||||||
#include "batch_translator.h"
|
#include "batch_translator.h"
|
||||||
#include "batcher.h"
|
#include "batcher.h"
|
||||||
#include "data/types.h"
|
#include "data/types.h"
|
||||||
#include "pcqueue.h"
|
|
||||||
#include "response.h"
|
#include "response.h"
|
||||||
#include "service_base.h"
|
|
||||||
#include "text_processor.h"
|
#include "text_processor.h"
|
||||||
|
#include "translator/parser.h"
|
||||||
|
|
||||||
|
#ifndef WASM_COMPATIBLE_SOURCE
|
||||||
|
#include "pcqueue.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
#include <queue>
|
#include <queue>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
@ -15,39 +18,104 @@
|
|||||||
namespace marian {
|
namespace marian {
|
||||||
namespace bergamot {
|
namespace bergamot {
|
||||||
|
|
||||||
class Service : public ServiceBase {
|
/// Service exposes methods to translate an incoming blob of text to the
|
||||||
|
/// Consumer of bergamot API.
|
||||||
// Service exposes methods to translate an incoming blob of text to the
|
///
|
||||||
// Consumer of bergamot API.
|
/// An example use of this API looks as follows:
|
||||||
//
|
///
|
||||||
// An example use of this API looks as follows:
|
/// options = ...;
|
||||||
//
|
/// service = Service(options);
|
||||||
// options = ...;
|
/// std::string input_text = "Hello World";
|
||||||
// service = Service(options);
|
/// std::future<Response>
|
||||||
// std::string input_blob = "Hello World";
|
/// response = service.translate(std::move(input_text));
|
||||||
// std::future<Response>
|
/// response.wait();
|
||||||
// response = service.translate(std::move(input_blob));
|
/// Response result = response.get();
|
||||||
// response.wait();
|
///
|
||||||
// Response result = response.get();
|
/// Optionally Service can be initialized by also passing model_memory for
|
||||||
|
/// purposes of efficiency (which defaults to nullpointer and then reads from
|
||||||
|
/// file supplied through config).
|
||||||
|
class Service {
|
||||||
|
|
||||||
public:
|
public:
|
||||||
explicit Service(Ptr<Options> options);
|
/// @param options Marian options object
|
||||||
// Implements enqueue and stop through blocking methods.
|
/// @param modelMemory byte array (aligned to 256!!!) that contains the bytes
|
||||||
void stop() override;
|
/// of a model.bin. Optional, defaults to nullptr when not used
|
||||||
|
/// @param shortlistMemory byte array of shortlist (aligned to 64)
|
||||||
|
explicit Service(Ptr<Options> options, AlignedMemory modelMemory, AlignedMemory shortlistMemory);
|
||||||
|
|
||||||
|
explicit Service(Ptr<Options> options) : Service(options, AlignedMemory(), AlignedMemory()){}
|
||||||
|
|
||||||
|
/// Construct Service from a string configuration.
|
||||||
|
/// @param [in] config string parsable as YAML expected to adhere with marian
|
||||||
|
/// config
|
||||||
|
/// @param [in] modelMemory byte array (aligned to 256!!!) that contains the
|
||||||
|
/// bytes of a model.bin. Optional.
|
||||||
|
/// @param [in] shortlistMemory byte array of shortlist (aligned to 64)
|
||||||
|
explicit Service(const std::string &config,
|
||||||
|
AlignedMemory modelMemory = AlignedMemory(), AlignedMemory shortlistMemory = AlignedMemory())
|
||||||
|
: Service(parseOptions(config), std::move(modelMemory), std::move(shortlistMemory)) {}
|
||||||
|
|
||||||
|
/// Explicit destructor to clean up after any threads initialized in
|
||||||
|
/// asynchronous operation mode.
|
||||||
~Service();
|
~Service();
|
||||||
|
|
||||||
|
/// To stay efficient and to refer to the string for alignments, expects
|
||||||
|
/// ownership be moved through std::move(..)
|
||||||
|
///
|
||||||
|
/// @param [in] input rvalue reference of string to be translated.
|
||||||
|
std::future<Response> translate(std::string &&input);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
void enqueue() override;
|
/// Build numTranslators number of translators with options from options
|
||||||
|
void build_translators(Ptr<Options> options, size_t numTranslators);
|
||||||
|
/// Initializes a blocking translator without using std::thread
|
||||||
|
void initialize_blocking_translator();
|
||||||
|
/// Translates through direct interaction between batcher_ and translators_
|
||||||
|
void blocking_translate();
|
||||||
|
|
||||||
// In addition to the common members (text_processor, requestId, vocabs_,
|
/// Launches multiple workers of translators using std::thread
|
||||||
// batcher) extends with a producer-consumer queue, vector of translator
|
/// Reduces to ABORT if called when compiled with WASM_COMPATIBLE_SOURCE
|
||||||
// instances owned by service each listening to the pcqueue in separate
|
void initialize_async_translators();
|
||||||
// threads.
|
/// Async translate produces to a producer-consumer queue as batches are
|
||||||
|
/// generated by Batcher. In another thread, the translators consume from
|
||||||
|
/// producer-consumer queue.
|
||||||
|
/// Reduces to ABORT if called when compiled with WASM_COMPATIBLE_SOURCE
|
||||||
|
void async_translate();
|
||||||
|
|
||||||
size_t numWorkers_; // ORDER DEPENDENCY
|
/// Number of workers to launch.
|
||||||
PCQueue<Batch> pcqueue_; // ORDER DEPENDENCY
|
size_t numWorkers_; // ORDER DEPENDENCY (pcqueue_)
|
||||||
|
/// Model memory to load model passed as bytes.
|
||||||
|
AlignedMemory modelMemory_; // ORDER DEPENDENCY (translators_)
|
||||||
|
/// Shortlist memory passed as bytes.
|
||||||
|
AlignedMemory shortlistMemory_; // ORDER DEPENDENCY (translators_)
|
||||||
|
|
||||||
|
/// Holds instances of batch translators, just one in case
|
||||||
|
/// of single-threaded application, numWorkers_ in case of multithreaded
|
||||||
|
/// setting.
|
||||||
|
std::vector<BatchTranslator> translators_; // ORDER DEPENDENCY (modelMemory_, shortlistMemory_)
|
||||||
|
|
||||||
|
/// Stores requestId of active request. Used to establish
|
||||||
|
/// ordering among requests and logging/book-keeping.
|
||||||
|
|
||||||
|
size_t requestId_;
|
||||||
|
|
||||||
|
/// Store vocabs representing source and target.
|
||||||
|
std::vector<Ptr<Vocab const>> vocabs_; // ORDER DEPENDENCY (text_processor_)
|
||||||
|
|
||||||
|
/// TextProcessor takes a blob of text and converts it into a format consumable by
|
||||||
|
/// the batch-translator and annotates sentences and words.
|
||||||
|
TextProcessor text_processor_; // ORDER DEPENDENCY (vocabs_)
|
||||||
|
|
||||||
|
/// Batcher handles generation of batches from a request, subject to
|
||||||
|
/// packing-efficiency and priority optimization heuristics.
|
||||||
|
Batcher batcher_;
|
||||||
|
|
||||||
|
// The following constructs are available providing full capabilities on a non
|
||||||
|
// WASM platform, where one does not have to hide threads.
|
||||||
|
#ifndef WASM_COMPATIBLE_SOURCE
|
||||||
|
PCQueue<Batch> pcqueue_; // ORDER DEPENDENCY (numWorkers_)
|
||||||
std::vector<std::thread> workers_;
|
std::vector<std::thread> workers_;
|
||||||
std::vector<BatchTranslator> translators_;
|
#endif // WASM_COMPATIBLE_SOURCE
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace bergamot
|
} // namespace bergamot
|
||||||
|
@ -1,42 +0,0 @@
|
|||||||
#include "service_base.h"
|
|
||||||
|
|
||||||
namespace marian {
|
|
||||||
namespace bergamot {
|
|
||||||
|
|
||||||
ServiceBase::ServiceBase(Ptr<Options> options)
|
|
||||||
: requestId_(0), vocabs_(std::move(loadVocabularies(options))),
|
|
||||||
text_processor_(vocabs_, options), batcher_(options) {}
|
|
||||||
|
|
||||||
std::future<Response> ServiceBase::translate(std::string &&input) {
|
|
||||||
Segments segments;
|
|
||||||
SentenceRanges sourceRanges;
|
|
||||||
text_processor_.process(input, segments, sourceRanges);
|
|
||||||
|
|
||||||
std::promise<Response> responsePromise;
|
|
||||||
auto future = responsePromise.get_future();
|
|
||||||
|
|
||||||
Ptr<Request> request = New<Request>(
|
|
||||||
requestId_++, /* lineNumberBegin = */ 0, vocabs_, std::move(input),
|
|
||||||
std::move(segments), std::move(sourceRanges), std::move(responsePromise));
|
|
||||||
|
|
||||||
batcher_.addWholeRequest(request);
|
|
||||||
enqueue();
|
|
||||||
return future;
|
|
||||||
}
|
|
||||||
|
|
||||||
NonThreadedService::NonThreadedService(Ptr<Options> options)
|
|
||||||
: ServiceBase(options),
|
|
||||||
translator_(DeviceId(0, DeviceType::cpu), vocabs_, options) {
|
|
||||||
translator_.initialize();
|
|
||||||
}
|
|
||||||
|
|
||||||
void NonThreadedService::enqueue() {
|
|
||||||
// Queue single-threaded
|
|
||||||
Batch batch;
|
|
||||||
while (batcher_ >> batch) {
|
|
||||||
translator_.translate(batch);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
} // namespace bergamot
|
|
||||||
} // namespace marian
|
|
@ -1,80 +0,0 @@
|
|||||||
#ifndef SRC_BERGAMOT_SERVICE_BASE_H_
|
|
||||||
#define SRC_BERGAMOT_SERVICE_BASE_H_
|
|
||||||
#include "batch_translator.h"
|
|
||||||
#include "batcher.h"
|
|
||||||
#include "data/types.h"
|
|
||||||
#include "response.h"
|
|
||||||
#include "text_processor.h"
|
|
||||||
|
|
||||||
#include <queue>
|
|
||||||
#include <vector>
|
|
||||||
|
|
||||||
namespace marian {
|
|
||||||
namespace bergamot {
|
|
||||||
// This file describes the base class ServiceBase, and a non-threaded subclass
|
|
||||||
// implementing translation functionality called NonThreadedService.
|
|
||||||
|
|
||||||
class ServiceBase {
|
|
||||||
public:
|
|
||||||
explicit ServiceBase(Ptr<Options> options);
|
|
||||||
|
|
||||||
// Transfers ownership of input string to Service, returns a future containing
|
|
||||||
// an object which provides access to translations, other features like
|
|
||||||
// sentencemappings and (tentatively) alignments.
|
|
||||||
std::future<Response> translate(std::string &&input);
|
|
||||||
|
|
||||||
// Convenience accessor methods to extract these vocabulary outside service.
|
|
||||||
// e.g: For use in decoding histories for marian-decoder replacement.
|
|
||||||
Ptr<Vocab const> sourceVocab() const { return vocabs_.front(); }
|
|
||||||
Ptr<Vocab const> targetVocab() const { return vocabs_.back(); }
|
|
||||||
|
|
||||||
// Wraps up any thread related destruction code.
|
|
||||||
virtual void stop() = 0;
|
|
||||||
|
|
||||||
protected:
|
|
||||||
// Enqueue queues a request for translation, this can be synchronous, blocking
|
|
||||||
// or asynchronous and queued in the background.
|
|
||||||
virtual void enqueue() = 0;
|
|
||||||
|
|
||||||
size_t requestId_;
|
|
||||||
std::vector<Ptr<Vocab const>> vocabs_; // ORDER DEPENDENCY
|
|
||||||
TextProcessor text_processor_; // ORDER DEPENDENCY
|
|
||||||
Batcher batcher_;
|
|
||||||
};
|
|
||||||
|
|
||||||
class NonThreadedService : public ServiceBase {
|
|
||||||
public:
|
|
||||||
explicit NonThreadedService(Ptr<Options> options);
|
|
||||||
void stop() override{};
|
|
||||||
|
|
||||||
private:
|
|
||||||
// NonThreaded service overrides unimplemented functions in base-class using
|
|
||||||
// blocking mechanisms.
|
|
||||||
void enqueue() override;
|
|
||||||
// There's a single translator, launched as part of the main process.
|
|
||||||
BatchTranslator translator_;
|
|
||||||
};
|
|
||||||
|
|
||||||
// Used across Services
|
|
||||||
inline std::vector<Ptr<const Vocab>> loadVocabularies(Ptr<Options> options) {
|
|
||||||
// @TODO: parallelize vocab loading for faster startup
|
|
||||||
auto vfiles = options->get<std::vector<std::string>>("vocabs");
|
|
||||||
// with the current setup, we need at least two vocabs: src and trg
|
|
||||||
ABORT_IF(vfiles.size() < 2, "Insufficient number of vocabularies.");
|
|
||||||
std::vector<Ptr<Vocab const>> vocabs(vfiles.size());
|
|
||||||
std::unordered_map<std::string, Ptr<Vocab>> vmap;
|
|
||||||
for (size_t i = 0; i < vocabs.size(); ++i) {
|
|
||||||
auto m = vmap.emplace(std::make_pair(vfiles[i], Ptr<Vocab>()));
|
|
||||||
if (m.second) { // new: load the vocab
|
|
||||||
m.first->second = New<Vocab>(options, i);
|
|
||||||
m.first->second->load(vfiles[i]);
|
|
||||||
}
|
|
||||||
vocabs[i] = m.first->second;
|
|
||||||
}
|
|
||||||
return vocabs;
|
|
||||||
}
|
|
||||||
|
|
||||||
} // namespace bergamot
|
|
||||||
} // namespace marian
|
|
||||||
|
|
||||||
#endif // SRC_BERGAMOT_SERVICE_BASE_H_
|
|
@ -25,9 +25,9 @@ TextProcessor::TextProcessor(std::vector<Ptr<Vocab const>> &vocabs,
|
|||||||
ABORT_IF(max_length_break_ < 0, "max-length-break cannot be < 0");
|
ABORT_IF(max_length_break_ < 0, "max-length-break cannot be < 0");
|
||||||
}
|
}
|
||||||
|
|
||||||
void TextProcessor::process(const string_view &query, Segments &segments,
|
void TextProcessor::process(AnnotatedText &source, Segments &segments) {
|
||||||
SentenceRanges &sourceRanges) {
|
|
||||||
|
|
||||||
|
string_view query = string_view(source.text);
|
||||||
auto sentenceStream = sentence_splitter_.createSentenceStream(query);
|
auto sentenceStream = sentence_splitter_.createSentenceStream(query);
|
||||||
std::string_view sentenceStringPiece;
|
std::string_view sentenceStringPiece;
|
||||||
|
|
||||||
@ -42,14 +42,14 @@ void TextProcessor::process(const string_view &query, Segments &segments,
|
|||||||
// after normalization. 0 prevents any empty entries from being added.
|
// after normalization. 0 prevents any empty entries from being added.
|
||||||
if (segment.size() > 0) {
|
if (segment.size() > 0) {
|
||||||
// Truncate segment into max_input_size segments.
|
// Truncate segment into max_input_size segments.
|
||||||
truncate(segment, wordRanges, segments, sourceRanges);
|
truncate(segment, wordRanges, segments, source);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void TextProcessor::truncate(Segment &segment,
|
void TextProcessor::truncate(Segment &segment,
|
||||||
std::vector<string_view> &wordRanges,
|
std::vector<string_view> &wordRanges,
|
||||||
Segments &segments, SentenceRanges &sourceRanges) {
|
Segments &segments, AnnotatedText &source) {
|
||||||
for (size_t offset = 0; offset < segment.size();
|
for (size_t offset = 0; offset < segment.size();
|
||||||
offset += max_length_break_) {
|
offset += max_length_break_) {
|
||||||
auto start = segment.begin() + offset;
|
auto start = segment.begin() + offset;
|
||||||
@ -61,7 +61,7 @@ void TextProcessor::truncate(Segment &segment,
|
|||||||
segments.back().push_back(sourceEosId());
|
segments.back().push_back(sourceEosId());
|
||||||
|
|
||||||
auto astart = wordRanges.begin() + offset;
|
auto astart = wordRanges.begin() + offset;
|
||||||
sourceRanges.addSentence(astart, astart + diff);
|
source.addSentence(astart, astart + diff);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -23,8 +23,7 @@ class TextProcessor {
|
|||||||
public:
|
public:
|
||||||
explicit TextProcessor(std::vector<Ptr<Vocab const>> &vocabs, Ptr<Options>);
|
explicit TextProcessor(std::vector<Ptr<Vocab const>> &vocabs, Ptr<Options>);
|
||||||
|
|
||||||
void process(const string_view &query, Segments &segments,
|
void process(AnnotatedText &source, Segments &segments);
|
||||||
SentenceRanges &sourceRanges);
|
|
||||||
|
|
||||||
private:
|
private:
|
||||||
// Tokenizes an input string, returns Words corresponding. Loads the
|
// Tokenizes an input string, returns Words corresponding. Loads the
|
||||||
@ -34,7 +33,7 @@ private:
|
|||||||
|
|
||||||
// Truncate sentence into max_input_size segments.
|
// Truncate sentence into max_input_size segments.
|
||||||
void truncate(Segment &sentence, std::vector<string_view> &tokenRanges,
|
void truncate(Segment &sentence, std::vector<string_view> &tokenRanges,
|
||||||
Segments &segments, SentenceRanges &sourceRanges);
|
Segments &segments, AnnotatedText &source);
|
||||||
|
|
||||||
// shorthand, used only in truncate()
|
// shorthand, used only in truncate()
|
||||||
const Word sourceEosId() const { return vocabs_->front()->getEosId(); }
|
const Word sourceEosId() const { return vocabs_->front()->getEosId(); }
|
||||||
|
7
wasm/patch-artifacts-enable-wormhole.sh
Normal file
7
wasm/patch-artifacts-enable-wormhole.sh
Normal file
@ -0,0 +1,7 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
echo "Patching wasm artifacts to enable wormhole via APIs that compile and instantiate wasm module"
|
||||||
|
sed -i.bak 's/var result = WebAssembly.instantiateStreaming(response, info);/var result = WebAssembly.instantiateStreaming(response, info, {simdWormhole:true});/g' wasm/bergamot-translator-worker.js
|
||||||
|
sed -i.bak 's/return WebAssembly.instantiate(binary, info);/return WebAssembly.instantiate(binary, info, {simdWormhole:true});/g' wasm/bergamot-translator-worker.js
|
||||||
|
sed -i.bak 's/var module = new WebAssembly.Module(bytes);/var module = new WebAssembly.Module(bytes, {simdWormhole:true});/g' wasm/bergamot-translator-worker.js
|
||||||
|
echo "Done"
|
@ -4,13 +4,6 @@ cp ../../build-wasm/wasm/bergamot-translator-worker.data .
|
|||||||
cp ../../build-wasm/wasm/bergamot-translator-worker.js .
|
cp ../../build-wasm/wasm/bergamot-translator-worker.js .
|
||||||
cp ../../build-wasm/wasm/bergamot-translator-worker.wasm .
|
cp ../../build-wasm/wasm/bergamot-translator-worker.wasm .
|
||||||
cp ../../build-wasm/wasm/bergamot-translator-worker.worker.js .
|
cp ../../build-wasm/wasm/bergamot-translator-worker.worker.js .
|
||||||
echo "Done----"
|
|
||||||
|
|
||||||
echo "Start: Enabling wormhole via APIs that compile and instantiate wasm module-------"
|
|
||||||
sed -i.bak 's/var result = WebAssembly.instantiateStreaming(response, info);/var result = WebAssembly.instantiateStreaming(response, info, {simdWormhole:true});/g' bergamot-translator-worker.js
|
|
||||||
sed -i.bak 's/return WebAssembly.instantiate(binary, info);/return WebAssembly.instantiate(binary, info, {simdWormhole:true});/g' bergamot-translator-worker.js
|
|
||||||
sed -i.bak 's/var module = new WebAssembly.Module(bytes);/var module = new WebAssembly.Module(bytes, {simdWormhole:true});/g' bergamot-translator-worker.js
|
|
||||||
echo "Done: Enabling wormhole via APIs that compile and instantiate wasm module--------"
|
|
||||||
|
|
||||||
npm install
|
npm install
|
||||||
echo "Start httpserver"
|
echo "Start httpserver"
|
||||||
|
Loading…
Reference in New Issue
Block a user