Merge pull request #8 from browsermt/integration

Integration: marian new on-the-fly-decoder for bergamot
This commit is contained in:
abhi-agg 2021-02-26 18:56:29 +01:00 committed by GitHub
commit a9e0d800ae
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
64 changed files with 3693 additions and 362 deletions

View File

@ -0,0 +1,32 @@
# CI workflow: configures and builds the project natively on macOS using the
# default CMake options, then runs the resulting command-line app once.
name: MacOS Native (Custom)
# Triggered by pushes to, and pull requests against, main and ci-sandbox.
on:
push:
branches: [ main, ci-sandbox ]
pull_request:
branches: [ main, ci-sandbox ]
jobs:
build-macos:
name: Native (With Custom Marian)
runs-on: macos-10.15
steps:
# Check out the repository together with all of its git submodules.
- name: Checkout
uses: actions/checkout@v2
with:
submodules: recursive
# Out-of-source configure in build-native; no extra CMake options are passed.
- name: Configure CMake
run: |
mkdir -p build-native
cd build-native
cmake ..
# Build with two parallel make jobs.
- name: Compile
working-directory: build-native
run: make -j2
# Run the freshly built app with --version; the step fails if it exits non-zero.
- name: Print versions
working-directory: build-native
run: ./app/bergamot-translator-app --version

View File

@ -0,0 +1,47 @@
# CI workflow: builds the project for WebAssembly with Emscripten on macOS and
# checks that the .wasm and .js artifacts were produced.
name: MacOS WASM (Custom)
# Triggered by pushes to, and pull requests against, main and ci-sandbox.
on:
push:
branches: [ main, ci-sandbox ]
pull_request:
branches: [ main, ci-sandbox ]
jobs:
build-wasm:
name: WASM (With Custom Marian)
runs-on: macos-10.15
steps:
# Install the Emscripten SDK so that emcc/emcmake/emmake are available.
- name: Setup Emscripten toolchain
uses: mymindstorm/setup-emsdk@v8
# Print the compiler version to confirm the toolchain is on PATH.
- name: Verify Emscripten setup
run: emcc -v
# Check out the repository together with all of its git submodules.
- name: Checkout
uses: actions/checkout@v2
with:
submodules: recursive
# Out-of-source configure in build-wasm with the WASM build switched on.
- name: Configure builds
run: |
mkdir -p build-wasm
cd build-wasm
emcmake cmake -DCOMPILE_WASM=on ..
# Build with two parallel make jobs through the Emscripten wrapper.
- name: Compile
working-directory: build-wasm
run: emmake make -j2
# Fail the job unless build-wasm/wasm contains at least one .wasm file and
# at least one .js file.
- name: Check artifacts
working-directory: build-wasm
run: |
export WASM_ARTIFACTS_DIR=wasm
ls -all ${WASM_ARTIFACTS_DIR}
if ls ${WASM_ARTIFACTS_DIR}/*.wasm &>/dev/null && ls ${WASM_ARTIFACTS_DIR}/*.js &>/dev/null
then
echo "Artifacts Successfully Generated"
else
echo "Failure: Artifacts Not Present"
exit 1
fi

60
.github/workflows/macos.yml vendored Normal file
View File

@ -0,0 +1,60 @@
name: MacOS
on:
push:
branches: [ main, ci-sandbox ]
pull_request:
branches: [ main, ci-sandbox ]
jobs:
build-macos:
name: MacOS CPU-only
runs-on: macos-10.15
steps:
- name: Checkout
uses: actions/checkout@v2
with:
submodules: recursive
- name: Install dependencies
run: brew install openblas protobuf
# Openblas location is exported explicitly because openblas is keg-only,
# which means it was not symlinked into /usr/local/.
# CMake cannot find BLAS on GitHub runners if Marian is being compiled
# statically, hence USE_STATIC_LIBS=off
- name: Configure CMake
run: |
export LDFLAGS="-L/usr/local/opt/openblas/lib"
export CPPFLAGS="-I/usr/local/opt/openblas/include"
mkdir -p build
cd build
cmake .. \
-DCOMPILE_CPU=on \
-DCOMPILE_CUDA=off \
-DCOMPILE_EXAMPLES=on \
-DCOMPILE_SERVER=on \
-DCOMPILE_TESTS=on \
-DUSE_FBGEMM=on \
-DUSE_SENTENCEPIECE=on \
-DUSE_STATIC_LIBS=off \
-DUSE_WASM_COMPATIBLE_SOURCES=off
- name: Compile
working-directory: build
run: make -j2
# Removing unit-tests, taken care of in browsermt/marian-dev
# - name: Run unit tests
#   working-directory: build
#   run: make test
- name: Print versions
working-directory: build
run: |
./marian --version
./marian-decoder --version
./marian-scorer --version
./spm_encode --version

122
.github/workflows/ubuntu.yml vendored Normal file
View File

@ -0,0 +1,122 @@
name: Ubuntu
on:
push:
branches: [ main, ci-sandbox ]
pull_request:
branches: [ main, ci-sandbox ]
jobs:
build-ubuntu:
strategy:
matrix:
include:
# Ubuntu CPU-only build
- name: "Ubuntu CPU-only"
os: ubuntu-latest
cuda: ""
gcc: 7
cpu: true
gpu: false
# GPU Builds are commented out, for bergamot-translator CI runs.
# Ubuntu GPU-only build
# - name: "Ubuntu GPU-only"
# os: ubuntu-latest
# cuda: "10.2"
# gcc: 7
# cpu: false
# gpu: true
# Ubuntu 20.04 supports CUDA 11+
#- name: "Ubuntu 20.04 CUDA 11.0 gcc-9"
#os: ubuntu-20.04
#cuda: "11.0"
#gcc: 9
#cpu: false
#gpu: true
# Ubuntu 18.04 supports CUDA 10.1+
# - name: "Ubuntu 18.04 CUDA 10.2 gcc-8"
# os: ubuntu-18.04
# cuda: "10.2"
# gcc: 8
# cpu: true
# gpu: true
# Ubuntu 16.04 supports CUDA 8+
# - name: "Ubuntu 16.04 CUDA 9.2 gcc-7"
# os: ubuntu-16.04
# cuda: "9.2"
# gcc: 7
# cpu: true
# gpu: true
runs-on: ${{ matrix.os }}
name: ${{ matrix.name }}
steps:
- name: Checkout
uses: actions/checkout@v2
with:
submodules: recursive
# The following packages are already installed on GitHub-hosted runners:
# build-essential openssl libssl-dev
# No need to install libprotobuf{17,10,9v5} on Ubuntu {20,18,16}.04 because
# it is installed together with libprotobuf-dev
- name: Install dependencies
  # Refresh the package index first: the lists baked into the hosted runner
  # image can be stale, which makes `apt-get install` fail on packages whose
  # versions have since been superseded on the mirrors.
  run: |
    sudo apt-get update
    sudo apt-get install -y libgoogle-perftools-dev libprotobuf-dev protobuf-compiler libboost-all-dev
# https://software.intel.com/content/www/us/en/develop/articles/installing-intel-free-libs-and-python-apt-repo.html
- name: Install MKL
run: |
wget -qO- "https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS-2019.PUB" | sudo apt-key add -
sudo sh -c "echo deb https://apt.repos.intel.com/mkl all main > /etc/apt/sources.list.d/intel-mkl.list"
sudo apt-get update -o Dir::Etc::sourcelist="/etc/apt/sources.list.d/intel-mkl.list"
sudo apt-get install -y --no-install-recommends intel-mkl-64bit-2020.0-088
if: matrix.cpu == true
# The script simplifies installation of different versions of CUDA
- name: Install CUDA
run: ./3rd_party/marian-dev/scripts/ci/install_cuda_ubuntu.sh ${{ matrix.cuda }}
if: matrix.gpu == true
# Boost is installed on GitHub-hosted runners in a non-standard location
# https://github.com/actions/virtual-environments/issues/687#issuecomment-610471671
- name: Configure CMake
run: |
mkdir -p build
cd build
CC=/usr/bin/gcc-${{ matrix.gcc }} CXX=/usr/bin/g++-${{ matrix.gcc }} CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }} \
cmake .. \
-DBoost_ARCHITECTURE=-x64 \
-DCMAKE_BUILD_TYPE=Release \
-DCOMPILE_CPU=${{ matrix.cpu }} \
-DCOMPILE_CUDA=${{ matrix.gpu }} \
-DCOMPILE_EXAMPLES=on \
-DCOMPILE_SERVER=on \
-DCOMPILE_TESTS=on \
-DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-${{ matrix.cuda }} \
-DUSE_FBGEMM=${{ matrix.cpu }} \
-DUSE_SENTENCEPIECE=on \
-DUSE_STATIC_LIBS=on \
-DUSE_WASM_COMPATIBLE_SOURCES=off
- name: Compile
working-directory: build
run: make -j2
# Removing unit-tests, taken care of in browsermt/marian-dev
# TODO: add a flag to CMake to compile unit tests only on CPU
# - name: Run unit tests
# working-directory: build
# run: make test
# # GitHub-hosted VMs do not have GPUs, so can not be run in CUDA builds
# if: matrix.gpu == false
- name: Print versions
working-directory: build
run: |
./marian --version
./marian-decoder --version
./marian-scorer --version
./marian-server --version
./spm_encode --version

130
.github/workflows/windows.yml vendored Normal file
View File

@ -0,0 +1,130 @@
name: Windows
on:
push:
branches: [ main, ci-sandbox ]
pull_request:
branches: [ main, ci-sandbox ]
env:
MKL_URL: "https://romang.blob.core.windows.net/mariandev/ci/mkl-2020.1-windows-static.zip"
jobs:
build-windows:
strategy:
matrix:
include:
# Windows CPU-only build
- name: "Windows CPU-only"
cuda: ""
gpu: false
# GPU Builds are commented out, for bergamot-translator CI runs.
# Windows CPU+GPU build
# - name: "Windows CPU+CUDA"
# cuda: "10.2"
# gpu: true
runs-on: windows-2019
name: ${{ matrix.name }}
steps:
- name: Checkout
uses: actions/checkout@v2
with:
submodules: recursive
- name: Download MKL
run: |
# Wget retries downloading files and is faster than Invoke-WebRequest
C:\msys64\usr\bin\wget.exe -nv ${{ env.MKL_URL }} -O mkl.zip
Expand-Archive -Force mkl.zip ${{ github.workspace }}\mkl
# Set MKLROOT environment variable so that CMake can find MKL
echo "MKLROOT=${{ github.workspace }}\mkl" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
shell: powershell
- name: Install CUDA
run: |
.\3rd_party\marian-dev\scripts\ci\install_cuda_windows.ps1 "10.2"
# Set CUDA_PATH environment variable so that CMake can find CUDA
echo "CUDA_PATH=$env:CUDA_PATH" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
echo "$env:CUDA_PATH/bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
shell: powershell
if: matrix.gpu == true
- name: Prepare vcpkg
uses: lukka/run-vcpkg@v4
with:
vcpkgArguments: protobuf
vcpkgGitCommitId: 6185aa76504a5025f36754324abf307cc776f3da
vcpkgDirectory: ${{ github.workspace }}/vcpkg/
vcpkgTriplet: x64-windows-static
# Windows CUDA builds use USE_NCCL=off due to compilation errors.
- name: Build Debug
uses: lukka/run-cmake@v3
with:
buildDirectory: ${{ github.workspace }}/build/Debug
cmakeAppendedArgs: '-G Ninja
-DCMAKE_BUILD_TYPE="Debug"
-DOPENSSL_USE_STATIC_LIBS="TRUE"
-DOPENSSL_MSVC_STATIC_RT="TRUE"
-DCOMPILE_CPU="TRUE"
-DCOMPILE_CUDA="${{ matrix.gpu }}"
-DCOMPILE_SERVER="FALSE"
-DCOMPILE_TESTS="TRUE"
-DUSE_FBGEMM="TRUE"
-DUSE_MPI="FALSE"
-DUSE_NCCL="FALSE"
-DUSE_SENTENCEPIECE="TRUE"
-DUSE_STATIC_LIBS="TRUE"'
cmakeListsOrSettingsJson: CMakeListsTxtAdvanced
cmakeListsTxtPath: ${{ github.workspace }}/CMakeLists.txt
useVcpkgToolchainFile: true
# Building in Debug is sufficient for the all-in CPU+GPU compilation;
# its main purpose is to detect warnings that the Release build is not
# able to find sometimes.
if: matrix.gpu == true
# Windows CUDA builds use USE_NCCL=off due to compilation errors
# Boost is pre-installed on Azure/GitHub-hosted Windows runners
# https://github.com/actions/virtual-environments/blob/main/images/win/Windows2019-Readme.md#boost
# (not used yet)
- name: Build Release
uses: lukka/run-cmake@v3
with:
buildDirectory: ${{ github.workspace }}/build/
cmakeAppendedArgs: '-G Ninja
-DBOOST_ROOT="$(BOOST_ROOT_1_72_0)"
-DBOOST_INCLUDEDIR="$(BOOST_ROOT_1_72_0)/include"
-DBOOST_LIBRARYDIR="$(BOOST_ROOT_1_72_0)/lib"
-DCMAKE_BUILD_TYPE="Release"
-DOPENSSL_USE_STATIC_LIBS="TRUE"
-DOPENSSL_MSVC_STATIC_RT="TRUE"
-DCOMPILE_CPU="TRUE"
-DCOMPILE_CUDA="${{ matrix.gpu }}"
-DCOMPILE_SERVER="FALSE"
-DCOMPILE_TESTS="TRUE"
-DUSE_FBGEMM="TRUE"
-DUSE_MPI="FALSE"
-DUSE_NCCL="FALSE"
-DUSE_SENTENCEPIECE="TRUE"
-DUSE_STATIC_LIBS="TRUE"'
cmakeListsOrSettingsJson: CMakeListsTxtAdvanced
cmakeListsTxtPath: ${{ github.workspace }}/CMakeLists.txt
useVcpkgToolchainFile: true
# Removing unit-tests, taken care of in browsermt/marian-dev
# - name: Run unit tests
# working-directory: build/
# run: ctest
# # Not run in GPU builds because GitHub-hosted VMs do not have GPUs
# if: matrix.gpu == false
- name: Print versions
working-directory: build/
run: |
.\marian.exe --version
.\marian-decoder.exe --version
.\marian-scorer.exe --version
dir *.exe
shell: cmd

22
.gitignore vendored Normal file
View File

@ -0,0 +1,22 @@
# vim temporary files
*.swp
*.swo
# CMake
CMakeLists.txt.user
CMakeCache.txt
CMakeFiles
CMakeScripts
Testing
Makefile
cmake_install.cmake
install_manifest.txt
compile_commands.json
CTestTestfile.cmake
_deps
wasm/test_page/node_modules
build-*
models
wasm/test_page/bergamot-translator-worker.*

2
.gitmodules vendored
View File

@ -1,6 +1,6 @@
[submodule "3rd_party/ssplit-cpp"]
path = 3rd_party/ssplit-cpp
url = https://github.com/ugermann/ssplit-cpp
url = https://github.com/abhi-agg/ssplit-cpp
[submodule "3rd_party/marian-dev"]
path = 3rd_party/marian-dev
url = https://github.com/browsermt/marian-dev

View File

@ -1,6 +1,23 @@
add_subdirectory(marian-dev)
# Add include directories for marian target to be able to use it anywhere in the project without
# explicitly specifying its include directories. Once marian fixes this problem, it can be removed.
if(COMPILE_WASM)
# This is a bad way of adding compilation flags. Will be improved soon.
add_compile_options(${WASM_COMPILE_FLAGS})
endif(COMPILE_WASM)
add_subdirectory(ssplit-cpp)
# Add include directories for 3rd party targets so that they can be used anywhere in the
# project without explicitly specifying their include directories. Once the 3rd party
# projects fix this problem, this can be removed.
get_property(INCDIRS DIRECTORY marian-dev/src PROPERTY INCLUDE_DIRECTORIES)
target_include_directories(marian PUBLIC ${INCDIRS})
get_property(INCLUDE_DIRECTORIES DIRECTORY ssplit-cpp/src PROPERTY INCLUDE_DIRECTORIES)
target_include_directories(ssplit PUBLIC ${INCLUDE_DIRECTORIES})
# Compilation flags
get_directory_property(CMAKE_C_FLAGS DIRECTORY marian-dev DEFINITION CMAKE_C_FLAGS)
get_directory_property(CMAKE_CXX_FLAGS DIRECTORY marian-dev DEFINITION CMAKE_CXX_FLAGS)
set(CMAKE_C_FLAGS ${CMAKE_C_FLAGS} PARENT_SCOPE)
set(CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS} PARENT_SCOPE)

@ -1 +1 @@
Subproject commit 69894793ebd93256d824a1590924780a6d54cae8
Subproject commit 05f2517f58de493d2f42236c2d23db95a9edbd8f

@ -1 +1 @@
Subproject commit f5d022992f4a00c860eb809389748908bb85ffcf
Subproject commit 432208826ee27e7b3984b53774b1a16d74256d77

View File

@ -8,13 +8,68 @@ project(bergamot_translator CXX C)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(BUILD_ARCH native CACHE STRING "Compile for this CPU architecture.")
# Custom CMake options to compile marian (a 3rd party submodule) for this project
option(COMPILE_CUDA "Compile GPU version" OFF)
option(USE_SENTENCEPIECE "Download and compile SentencePiece" ON)
option(USE_STATIC_LIBS "Link statically against non-system libs" ON)
include(CMakeDependentOption)
# Project specific cmake options
option(COMPILE_WASM "Compile for WASM" OFF)
option(USE_WASM_COMPATIBLE_SOURCES "Use wasm compatible sources" ON)
SET(PACKAGE_DIR "" CACHE STRING "Directory including all the files to be packaged (pre-loaded) in wasm builds")
# Set marian (3rd party submodule) cmake options to compile for this project
SET(COMPILE_CUDA OFF CACHE BOOL "Compile GPU version")
SET(USE_SENTENCEPIECE ON CACHE BOOL "Download and compile SentencePiece")
SET(USE_STATIC_LIBS ON CACHE BOOL "Link statically against non-system libs")
if (USE_WASM_COMPATIBLE_SOURCES)
# If using wasm compatible marian then set following flags
SET(COMPILE_LIBRARY_ONLY ON CACHE BOOL "Build only the Marian library and exclude all executables.")
SET(USE_MKL OFF CACHE BOOL "Compile with MKL support")
SET(COMPILE_DECODER_ONLY ON CACHE BOOL "Compile marian-decoder only")
SET(COMPILE_WITH_PTHREADS OFF CACHE BOOL "Compile with pthreads support")
SET(USE_WASM_COMPATIBLE_BLAS ON CACHE BOOL "Compile with a WASM compatible blas for decoder only builds")
SET(COMPILE_WITHOUT_EXCEPTIONS ON CACHE BOOL "Compile without exceptions")
if(COMPILE_WASM)
# Set WORMHOLE to ON for marian whenever compiling for wasm platform
SET(WORMHOLE ON CACHE BOOL "Use WASM wormhole in intgemm https://bugzilla.mozilla.org/show_bug.cgi?id=1672160")
endif()
endif()
# Set ssplit (3rd party submodule) cmake options to compile for this project
CMAKE_DEPENDENT_OPTION(USE_INTERNAL_PCRE2 "Use internal PCRE2 instead of system PCRE2" ON
"USE_WASM_COMPATIBLE_SOURCES" OFF)
# Documentation: https://cliutils.gitlab.io/modern-cmake/chapters/projects/submodule.html
# Ensures the submodules are set correctly during a build.
find_package(Git QUIET)
if(GIT_FOUND AND EXISTS "${PROJECT_SOURCE_DIR}/.git")
# Update submodules as needed
option(GIT_SUBMODULE "Check submodules during build" ON)
if(GIT_SUBMODULE)
message(STATUS "Submodule update")
execute_process(COMMAND ${GIT_EXECUTABLE} submodule update --init --recursive
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
RESULT_VARIABLE GIT_SUBMOD_RESULT)
if(NOT GIT_SUBMOD_RESULT EQUAL "0")
message(FATAL_ERROR "git submodule update --init failed with ${GIT_SUBMOD_RESULT}, please checkout submodules")
endif()
endif()
endif()
if(NOT COMPILE_WASM)
# Set BUILD_ARCH to native only while compiling for non wasm platform
set(BUILD_ARCH native CACHE STRING "Compile for this CPU architecture.")
endif()
if(COMPILE_WASM)
list(APPEND WASM_COMPILE_FLAGS -pthread -O3 -g2 -fPIC -mssse3 -msimd128)
list(APPEND WASM_COMPILE_FLAGS "SHELL:-s WASM=1" "SHELL:-s ASSERTIONS=0" "SHELL:-s DISABLE_EXCEPTION_CATCHING=1" "SHELL:-s LLD_REPORT_UNDEFINED" "SHELL:-s FORCE_FILESYSTEM=1" "SHELL:-s ALLOW_MEMORY_GROWTH=1")
list(APPEND WASM_COMPILE_FLAGS -Wno-error=pthreads-mem-growth)
endif(COMPILE_WASM)
add_subdirectory(3rd_party)
add_subdirectory(src)
add_subdirectory(app)
if(COMPILE_WASM)
add_subdirectory(wasm)
else()
add_subdirectory(app)
endif(COMPILE_WASM)

View File

@ -3,15 +3,95 @@
Bergamot translator provides a unified API for ([Marian NMT](https://marian-nmt.github.io/) framework based) neural machine translation functionality in accordance with the [Bergamot](https://browser.mt/) project that focuses on improving client-side machine translation in a web browser.
## Build Instructions
```
$ git clone https://github.com/browsermt/bergamot-translator
$ cd bergamot-translator
$ mkdir build
$ cd build
$ cmake ../
$ make -j
### Build Natively
1. Clone the repository using these instructions:
```bash
git clone https://github.com/browsermt/bergamot-translator
cd bergamot-translator
```
2. Compile
Create a folder where you want to build all the artifacts (`build-native` in this case) and compile in that folder
```bash
mkdir build-native
cd build-native
cmake ../
make -j
```
### Build WASM
#### Compiling for the first time
1. Download and Install Emscripten using following instructions
* Get the latest sdk: `git clone https://github.com/emscripten-core/emsdk.git`
* Enter the cloned directory: `cd emsdk`
* Install the latest sdk tools: `./emsdk install latest`
* Activate the latest sdk tools: `./emsdk activate latest`
* Activate path variables: `source ./emsdk_env.sh`
2. Clone the repository using these instructions:
```bash
git clone https://github.com/browsermt/bergamot-translator
cd bergamot-translator
```
3. Download files (only required if you want to package files in wasm binary)
This step is only required if you want to package files (e.g. models, vocabularies etc.)
into wasm binary. If you don't then just skip this step.
The build preloads the files in Emscripten's virtual file system.
If you want to package bergamot project specific models, please follow these instructions:
```bash
mkdir models
git clone https://github.com/mozilla-applied-ml/bergamot-models
cp -rf bergamot-models/* models
gunzip models/*/*
```
4. Compile
1. Create a folder where you want to build all the artefacts (`build-wasm` in this case)
```bash
mkdir build-wasm
cd build-wasm
```
2. Compile the artefacts
* If you want to package files into wasm binary then execute following commands (Replace `FILES_TO_PACKAGE` with the path of the
directory containing the files to be packaged in wasm binary)
```bash
emcmake cmake -DCOMPILE_WASM=on -DPACKAGE_DIR=FILES_TO_PACKAGE ../
emmake make -j
```
e.g. If you want to package bergamot project specific models (downloaded using step 3 above) then
replace `FILES_TO_PACKAGE` with `../models`
* If you don't want to package any file into wasm binary then execute following commands:
```bash
emcmake cmake -DCOMPILE_WASM=on ../
emmake make -j
```
The artefacts (.js and .wasm files) will be available in `wasm` folder of build directory ("build-wasm" in this case).
#### Recompiling
As long as you don't update any submodule, just follow steps in `4.ii` to recompile.\
If you update a submodule, execute following command before executing steps in `4.ii` to recompile.
```bash
git submodule update --init --recursive
```
## Using Bergamot Translator
The build will generate the library that can be linked to any project. All the public header files are specified in `src` folder.
## How to use
### Using Native version
The build generates a library that can be integrated into any project. All the public header files are specified in the `src` folder.\
A short example of how to use the APIs is provided in `app/main.cpp` file.
### Using WASM version
Please follow the `README` inside the `wasm` folder of this repository that demonstrates how to use the translator in JavaScript.

View File

@ -1,3 +1,10 @@
# Main command-line app, linked against the bergamot-translator library.
add_executable(bergamot-translator-app main.cpp)
target_link_libraries(bergamot-translator-app PRIVATE bergamot-translator)
# The additional command-line tools below are only built when
# USE_WASM_COMPATIBLE_SOURCES is off.
if (NOT USE_WASM_COMPATIBLE_SOURCES)
# service-cli: built from main-mts.cpp.
add_executable(service-cli main-mts.cpp)
target_link_libraries(service-cli PRIVATE bergamot-translator)
# marian-decoder-new: built from marian-decoder-new.cpp.
add_executable(marian-decoder-new marian-decoder-new.cpp)
target_link_libraries(marian-decoder-new PRIVATE bergamot-translator)
endif()

33
app/main-mts.cpp Normal file
View File

@ -0,0 +1,33 @@
#include <cstdlib>
#include <future>
#include <iostream>
#include <sstream>
#include "common/definitions.h"
#include "common/utils.h"
#include "marian.h"
#include "translator/parser.h"
#include "translator/response.h"
#include "translator/service.h"
// Entry point of service-cli: translates everything read from stdin with a
// marian::bergamot::Service configured from the command line, and prints the
// translation to stdout.
int main(int argc, char *argv[]) {
  // Turn the command-line arguments into the options the Service is built from.
  auto configParser = marian::bergamot::createConfigParser();
  auto options = configParser.parseOptions(argc, argv, true);
  marian::bergamot::Service service(options);

  // Read all of stdin into one string; the whole blob is submitted as a
  // single translation request.
  std::ostringstream inputBuffer;
  inputBuffer << std::cin.rdbuf();
  std::string input = inputBuffer.str();

  using marian::bergamot::Response;

  // std::future::get() blocks until the Response is complete, so no separate
  // wait() call is needed.
  std::future<Response> pendingResponse = service.translate(std::move(input));
  Response response = pendingResponse.get();
  std::cout << response.translation() << std::endl;

  // Stop Service.
  service.stop();
  return 0;
}

View File

@ -7,29 +7,61 @@
#include <iostream>
#include "TranslationModelConfiguration.h"
#include "AbstractTranslationModel.h"
#include "TranslationRequest.h"
#include "TranslationResult.h"
#include "translator/parser.h"
int main(int argc, char **argv) {
int main(int argc, char** argv) {
// Create a configParser and load command line parameters into a YAML config
// string.
auto configParser = marian::bergamot::createConfigParser();
auto options = configParser.parseOptions(argc, argv, true);
std::string config = options->asYamlString();
// Create an instance of AbstractTranslationModel with a dummy model configuration
TranslationModelConfiguration config("dummy_modelFilePath",
"dummy_sourceVocabPath",
"dummy_targetVocabPath");
std::shared_ptr<AbstractTranslationModel> model =
AbstractTranslationModel::createInstance(config);
// Route the config string to construct marian model through
// AbstractTranslationModel
std::shared_ptr<AbstractTranslationModel> model =
AbstractTranslationModel::createInstance(config);
// Call to translate a dummy (empty) texts with a dummy (empty) translation request
TranslationRequest req;
std::vector<std::string> texts;
auto result = model->translate(std::move(texts), req);
TranslationRequest translationRequest;
std::vector<std::string> texts;
texts.emplace_back(
"The Bergamot project will add and improve client-side machine "
"translation in a web browser. Unlike current cloud-based "
"options, running directly on users machines empowers citizens to "
"preserve their privacy and increases the uptake of language "
"technologies in Europe in various sectors that require "
"confidentiality.");
texts.emplace_back(
"Free software integrated with an open-source web "
"browser, such as Mozilla Firefox, will enable bottom-up adoption "
"by non-experts, resulting in cost savings for private and public "
"sector users who would otherwise procure translation or operate "
"monolingually. Bergamot is a consortium coordinated by the "
"University of Edinburgh with partners Charles University in "
"Prague, the University of Sheffield, University of Tartu, and "
"Mozilla.");
// Resolve the future and get the actual result
std::vector<TranslationResult> res = result.get();
auto results = model->translate(std::move(texts), translationRequest);
std::cout << "Count is: " << res.size() << std::endl;
return 0;
// Resolve the future and get the actual result
//std::vector<TranslationResult> results = futureResults.get();
for (auto &result : results) {
std::cout << "[original]: " << result.getOriginalText() << std::endl;
std::cout << "[translated]: " << result.getTranslatedText() << std::endl;
auto mappings = result.getSentenceMappings();
for (auto &p : mappings) {
std::string_view src = p.first;
std::string_view tgt = p.second;
std::cout << " [src Sentence]: " << src << std::endl;
std::cout << " [tgt Sentence]: " << tgt << std::endl;
}
std::cout << std::endl;
}
return 0;
}

View File

@ -0,0 +1,61 @@
#include <cstdlib>
#include <future>
#include <iostream>
#include <sstream>
#include "common/definitions.h"
#include "common/timer.h"
#include "common/utils.h"
#include "marian.h"
#include "translator/history.h"
#include "translator/output_collector.h"
#include "translator/output_printer.h"
#include "translator/parser.h"
#include "translator/response.h"
#include "translator/service.h"
// Renders every history in `histories` with a marian::OutputPrinter and hands
// the rendered text to a marian::OutputCollector, keyed by the history's line
// number.
//
// histories:   translation histories to print.
// targetVocab: target vocabulary handed to the OutputPrinter.
// options:     supplies the "n-best", "output" and "quiet-translation"
//              settings read below, and is also passed to the OutputPrinter.
void marian_decoder_minimal(const marian::Histories &histories,
                            marian::Ptr<marian::Vocab const> targetVocab,
                            marian::Ptr<marian::Options> options) {
  const bool doNbest = options->get<bool>("n-best");

  const auto outputTarget = options->get<std::string>("output");
  auto collector = marian::New<marian::OutputCollector>(outputTarget);

  // There is a dependency of vocabs here.
  auto printer = marian::New<marian::OutputPrinter>(options, targetVocab);

  if (options->get<bool>("quiet-translation")) {
    collector->setPrintingStrategy(marian::New<marian::QuietPrinting>());
  }

  for (const auto &history : histories) {
    // The printer fills one stream with the single-best output and the other
    // with the n-best output; the collector receives both plus the n-best flag.
    std::stringstream oneBest;
    std::stringstream nBest;
    printer->print(history, oneBest, nBest);

    const long lineNumber = static_cast<long>(history->getLineNum());
    collector->Write(lineNumber, oneBest.str(), nBest.str(), doNbest);
  }
}
// Entry point of marian-decoder-new: translates everything read from stdin
// through a marian::bergamot::Service, prints the resulting histories via
// marian_decoder_minimal() and logs the elapsed wall time.
int main(int argc, char *argv[]) {
  auto configParser = marian::bergamot::createConfigParser();
  auto options = configParser.parseOptions(argc, argv, true);

  // The timer is created before the Service; assuming it starts on
  // construction, the logged time also covers Service construction.
  marian::timer::Timer decoderTimer;
  marian::bergamot::Service service(options);

  // Read a large input text blob from stdin
  std::ostringstream inputBuffer;
  inputBuffer << std::cin.rdbuf();
  std::string input = inputBuffer.str();

  using marian::bergamot::Response;

  // std::future::get() blocks until the Response is complete, so no separate
  // wait() call is needed.
  std::future<Response> pendingResponse = service.translate(std::move(input));
  const Response &response = pendingResponse.get();

  marian_decoder_minimal(response.histories(), service.targetVocab(), options);
  LOG(info, "Total time: {:.5f}s wall", decoderTimer.elapsed());

  service.stop();
  return 0;
}

85
doc/marian-integration.md Normal file
View File

@ -0,0 +1,85 @@
# Marian Integration
This document summarizes the minimal build instructions developed for the
marian-code powering bergamot-translator.
## Build Instructions
```
$ git clone https://github.com/browsermt/bergamot-translator
$ cd bergamot-translator
$ mkdir build
$ cd build
$ cmake .. -DUSE_WASM_COMPATIBLE_SOURCES=off -DCMAKE_BUILD_TYPE=Release
$ make -j
```
The build will generate the library that can be linked to any project. All the
public header files are specified in `src` folder.
## Command line apps
The following executables are created by the build:
1. `app/service-cli`: Extends marian with the capability to work with string_views.
`service-cli` exists to check whether the underlying code, without the
integration, works or not.
2. `app/bergamot-translator-app`: App which integrates service-cli's
functionality into the translator agnostic API specified as part of the
project. Integration failures are detected if the same arguments work with
`service-cli` but do not work with `bergamot-translator-app`.
3. `app/marian-decoder-new`: Helper executable to conveniently benchmark the new
implementation against the optimized upstream marian-decoder.
The models required to run the command-line are available at
[data.statmt.org/bergamot/models/](http://data.statmt.org/bergamot/models/).
The following example uses an English to German tiny11 student model, available
at:
* [data.statmt.org/bergamot/models/deen/ende.student.tiny11.tar.gz](http://data.statmt.org/bergamot/models/deen/ende.student.tiny11.tar.gz)
<details>
<summary> Example run of commandline: Click to expand </summary>
<p>
```bash
MODEL_DIR=... # path to where the model-files are.
ARGS=(
-m $MODEL_DIR/model.intgemm.alphas.bin # Path to model file.
--vocabs
$MODEL_DIR/vocab.deen.spm # source-vocabulary
$MODEL_DIR/vocab.deen.spm # target-vocabulary
# The following increases speed through one-best-decoding, shortlist and quantization.
--beam-size 1 --skip-cost --shortlist $MODEL_DIR/lex.s2t.gz 50 50 --int8shiftAlphaAll
# Number of CPU threads (workers to launch). Parallelizes over cores and improves speed.
# A value of 0 allows a path with no worker thread-launches and a single-thread.
--cpu-threads 4
# Maximum size of a sentence allowed. If a sentence is above this length,
# it's broken into pieces of less than or equal to this size.
--max-length-break 1024
# Maximum number of tokens that can be fit in a batch. The optimal value
# for the parameter is dependent on hardware and can be obtained by running
# with variations and benchmarking.
--mini-batch-words 1024
# Three modes are supported
# - sentence: One sentence per line
# - paragraph: One paragraph per line.
# - wrapped_text: Paragraphs are separated by empty line.
--ssplit-mode paragraph
)
./app/service-cli "${ARGS[@]}" < path-to-input-file
./app/bergamot-translator-app "${ARGS[@]}" < path-to-input-file
```
</p>
</details>

View File

@ -1,61 +1,68 @@
/*
* AbstractTranslationModel.h
*
* An interface for a translation model for translating a plain (without any markups and emojis) UTF-8 encoded text.
* The model supports translation from 1 source language to 1 target language. There can be different implementations
* An interface for a translation model for translating a plain (without any
* markups and emojis) UTF-8 encoded text. The model supports translation from 1
* source language to 1 target language. There can be different implementations
* of this interface.
*/
#ifndef SRC_TRANSLATOR_ABSTRACTTRANSLATIONMODEL_H_
#define SRC_TRANSLATOR_ABSTRACTTRANSLATIONMODEL_H_
#include <vector>
#include <string>
#include <future>
#include <memory>
#include <string>
#include <vector>
#include "TranslationModelConfiguration.h"
#include "TranslationRequest.h"
#include "TranslationResult.h"
/* An interface for a translation model for translating a plain (without any markups and emojis) UTF-8 encoded text.
* The model supports translation from 1 source language to 1 target language.
/* An interface for a translation model for translating a plain (without any
* markups and emojis) UTF-8 encoded text. The model supports translation from 1
* source language to 1 target language.
*/
class AbstractTranslationModel {
public:
/* A Factory method to create and return an instance of an implementation of
* AbstractTranslationModel. The instance is created using translation model
* configuration provided as yaml-formatted string.
*/
static std::shared_ptr<AbstractTranslationModel>
createInstance(const std::string &config);
/* A Factory method to create and return an instance of an implementation of
* AbstractTranslationModel. The instance is created using translation model configuration
* (TranslationModelConfiguration).
*/
static std::shared_ptr<AbstractTranslationModel>
createInstance(const TranslationModelConfiguration& config);
AbstractTranslationModel() = default;
AbstractTranslationModel() = default;
virtual ~AbstractTranslationModel() = default;
virtual ~AbstractTranslationModel() = default;
/* This method performs translation on a list of (UTF-8 encoded) texts and
* returns a list of results in the same order. Each text entry can either be
* a word, a phrase, a sentence or a list of sentences and should contain
* plain text (without any markups or emojis). Additional information related
* to the translated text can be requested via TranslationRequest which is
* applied equally to each text entry.
*
* The translated text corresponding to each text entry and the additional
* information (as specified in the TranslationRequest) is encapsulated and
* returned in TranslationResult.
*
* The API splits each text entry into sentences internally, which are then
* translated independent of each other. The translated sentences are then
* joined together and returned in TranslationResult. Please refer to the
* TranslationRequest class to find out what additional information can be
* requested. The alignment information can only be requested if the model
* supports it (check isAlignmentSupported() API).
*
* The texts argument will become empty after the execution of this API (each
* entry of texts list will be moved to its corresponding TranslationResult
* object).
*/
virtual std::vector<TranslationResult>
translate(std::vector<std::string> &&texts, TranslationRequest request) = 0;
/* This method performs translation on a list of (UTF-8 encoded) texts and returns a list of results in the same order.
* Each text entry can either be a word, a phrase, a sentence or a list of sentences and should contain plain text
* (without any markups or emojis). Additional information related to the translated text can be requested via
* TranslationRequest which is applied equally to each text entry.
*
* The translated text corresponding to each text entry and the additional information (as specified in the
* TranslationRequest) is encapsulated and returned in TranslationResult.
*
* The API splits each text entry into sentences internally, which are then translated independent of each other.
* The translated sentences are then joined together and returned in TranslationResult.
* Please refer to the TranslationRequest class to find out what additional information can be requested.
* The alignment information can only be requested if the model supports it (check isAlignmentSupported() API).
*
* The texts argument will become empty after the execution of this API (each entry of texts list will be moved to its
* corresponding TranslationResult object).
*/
virtual std::future<std::vector<TranslationResult>> translate(
std::vector<std::string> &&texts, TranslationRequest request) = 0;
/* Check if the model can provide alignment information b/w original and translated text. */
virtual bool isAlignmentSupported() const = 0;
/* Check if the model can provide alignment information b/w original and
* translated text. */
virtual bool isAlignmentSupported() const = 0;
};
#endif /* SRC_TRANSLATOR_ABSTRACTTRANSLATIONMODEL_H_ */

View File

@ -6,31 +6,32 @@
#ifndef SRC_TRANSLATOR_QUALITYSCORE_H_
#define SRC_TRANSLATOR_QUALITYSCORE_H_
#include <vector>
#include <string>
#include <vector>
/* All possible Granularities for which Quality Scores can be returned for translated text. */
/* All possible Granularities for which Quality Scores can be returned for
* translated text. */
enum class QualityScoreGranularity {
WORD, SENTENCE, NONE,
WORD,
SENTENCE,
NONE,
};
/* This class represents the Quality Scores for various spans of a translated text at a specific granularity. */
/* This class represents the Quality Scores for various spans of a translated
* text at a specific granularity. */
class QualityScore {
private:
// Sections of the translated text for the Quality Scores.
std::vector<std::string_view> textViews;
// Sections of the translated text for the Quality Scores.
std::vector<std::string_view> textViews;
// Quality Scores corresponding to each entry of textViews in the same order
std::vector<float> textScores;
// Quality Scores corresponding to each entry of textViews in the same order
std::vector<float> textScores;
// Granularity of the text for the Quality scores above
QualityScoreGranularity textGranularity;
// Granularity of the text for the Quality scores above
QualityScoreGranularity textGranularity;
public:
// ToDo: Public Methods
// ToDo: Public Methods
};
#endif /* SRC_TRANSLATOR_QUALITYSCORE_H_ */

View File

@ -1,68 +0,0 @@
/*
* TranslationModelConfiguration.h
*
*/
#ifndef SRC_TRANSLATOR_TRANSLATIONMODELCONFIGURATION_H_
#define SRC_TRANSLATOR_TRANSLATIONMODELCONFIGURATION_H_
#include <string>
/* This class encapsulates the configuration that is required by a translation model to perform
* translation.
*/
class TranslationModelConfiguration {
public:
  /* Build a configuration from the three file paths a model needs.
   *
   * modelFilePath:   path of the translation model file
   * sourceVocabPath: path of the source language vocabulary file
   * targetVocabPath: path of the target language vocabulary file
   *
   * Each path is copied into the object.
   */
  TranslationModelConfiguration(const std::string &modelFilePath,
                                const std::string &sourceVocabPath,
                                const std::string &targetVocabPath)
      : modelPath(modelFilePath), sourceLanguageVocabPath(sourceVocabPath),
        targetLanguageVocabPath(targetVocabPath) {}

  // Copy constructor: member-wise copy of the three paths.
  TranslationModelConfiguration(const TranslationModelConfiguration &rhs) =
      default;

  // Move constructor. The members are const, so "moving" them can only ever
  // copy; defaulting it states that honestly and avoids calling std::move,
  // for which this header never included <utility>.
  TranslationModelConfiguration(TranslationModelConfiguration &&rhs) = default;

  // Return the path of the model file
  const std::string &getModelFilePath() const { return modelPath; }

  // Return the path of the source language vocabulary file
  const std::string &getSourceVocabularyPath() const {
    return sourceLanguageVocabPath;
  }

  // Return the path of the target language vocabulary file
  const std::string &getTargetVocabularyPath() const {
    return targetLanguageVocabPath;
  }

private:
  // Path to the translation model file
  const std::string modelPath;

  // Path to the source vocabulary file to be used by the model
  const std::string sourceLanguageVocabPath;

  // Path to the target vocabulary file to be used by the model
  const std::string targetLanguageVocabPath;

  // ToDo: Add other user configurable options (e.g. min batch size)
};
#endif /* SRC_TRANSLATOR_TRANSLATIONMODELCONFIGURATION_H_ */

View File

@ -1,7 +1,8 @@
/*
* TranslationRequest.h
*
* This file defines the translation request class to be used in AbstractTranslationModel::translate() API.
* This file defines the translation request class to be used in
* AbstractTranslationModel::translate() API.
*/
#ifndef SRC_TRANSLATOR_TRANSLATIONREQUEST_H_
@ -9,66 +10,75 @@
#include "QualityScore.h"
/* This class specifies the information related to the translated text (e.g. quality of the translation etc.) that
* can be included in the TranslationResult. These optional requests are set/unset independent of each other i.e. setting
* any one of them doesnt have the side effect of setting any of the others.
/* This class specifies the information related to the translated text (e.g.
* quality of the translation etc.) that can be included in the
* TranslationResult. These optional requests are set/unset independent of each
* other i.e. setting any one of them doesn't have the side effect of setting
* any of the others.
*/
class TranslationRequest {
private:
// The granularity for which Quality scores of the translated text will be included in TranslationResult.
// QualityScoreGranularity::NONE means the scores are not included in TranslationResult.
QualityScoreGranularity qualityScoreGranularity = QualityScoreGranularity::NONE;
// The granularity for which Quality scores of the translated text will be
// included in TranslationResult. QualityScoreGranularity::NONE means the
// scores are not included in TranslationResult.
QualityScoreGranularity qualityScoreGranularity =
QualityScoreGranularity::NONE;
// A flag to include/exclude the information regarding how individual sentences of original text map to
// corresponding translated sentences in joined translated text in the TranslationResult.
// An example of sentence mappings:
// originalText (containing 2 sentences) = "What is your name? My name is Abc."
// translatedText (containing 2 translated sentences) = "Was ist dein Name? Mein Name ist Abc."
// sentenceMappings = [
// {"What is your name?", "Was ist dein Name?"}, // Pair(originalText[0],translatedText[0])
// {"My name is Abc", "Mein Name ist Abc."} // Pair(originalText[1],translatedText[1])
// ]
bool includeSentenceMapping = false;
// A flag to include/exclude the information regarding how individual
// sentences of original text map to corresponding translated sentences in
// joined translated text in the TranslationResult.
// An example of sentence mappings:
//   originalText (containing 2 sentences) =
//     "What is your name? My name is Abc."
//   translatedText (containing 2 translated sentences) =
//     "Was ist dein Name? Mein Name ist Abc."
//   sentenceMappings = [
//     // Pair(originalText[0], translatedText[0])
//     {"What is your name?", "Was ist dein Name?"},
//     // Pair(originalText[1], translatedText[1])
//     {"My name is Abc.", "Mein Name ist Abc."}
//   ]
bool includeSentenceMapping = false;
public:
TranslationRequest() {}
TranslationRequest() {}
TranslationRequest(const TranslationRequest& request) :
qualityScoreGranularity(request.qualityScoreGranularity),
includeSentenceMapping(request.includeSentenceMapping) {
}
TranslationRequest(const TranslationRequest &request)
: qualityScoreGranularity(request.qualityScoreGranularity),
includeSentenceMapping(request.includeSentenceMapping) {}
~TranslationRequest() {}
~TranslationRequest() {}
/* Set the granularity for which the Quality scores of translated text should be included in the TranslationResult.
* By default (QualityScoreGranularity::NONE), scores are not included.
*/
void setQualityScoreGranularity(QualityScoreGranularity granularity) {
qualityScoreGranularity = granularity;
}
/* Set the granularity for which the Quality scores of translated text should
* be included in the TranslationResult. By default
* (QualityScoreGranularity::NONE), scores are not included.
*/
void setQualityScoreGranularity(QualityScoreGranularity granularity) {
qualityScoreGranularity = granularity;
}
/* Set to true/false to include/exclude the information regarding how individual sentences of original text map to
* corresponding translated sentences in joined translated text in the TranslationResult. By default (false), this
* information is not included.
*/
void sentenceMappingInResult(bool includeMapping) {
includeSentenceMapping = includeMapping;
}
/* Set to true/false to include/exclude the information regarding how
* individual sentences of original text map to corresponding translated
* sentences in joined translated text in the TranslationResult. By default
* (false), this information is not included.
*/
void sentenceMappingInResult(bool includeMapping) {
includeSentenceMapping = includeMapping;
}
/* Return the granularity for which the Quality scores of the translated text will be included in TranslationResult.
* QualityScoreGranularity::NONE means the scores will not be included.
*/
QualityScoreGranularity getQualityScoreGranularity() const {
return qualityScoreGranularity;
}
/* Return the granularity for which the Quality scores of the translated text
* will be included in TranslationResult. QualityScoreGranularity::NONE means
* the scores will not be included.
*/
QualityScoreGranularity getQualityScoreGranularity() const {
return qualityScoreGranularity;
}
/* Return whether the information regarding how individual sentences of original text map to corresponding translated
* sentences in joined translated text will be included in the TranslationResult. By default (false) means this
* information will not be included.
*/
bool sentenceMappingInResult() const {
return includeSentenceMapping;
}
/* Return whether the information regarding how individual sentences of
* original text map to corresponding translated sentences in joined
* translated text will be included in the TranslationResult. By default
* (false) means this information will not be included.
*/
bool sentenceMappingInResult() const { return includeSentenceMapping; }
};
#endif /* SRC_TRANSLATOR_TRANSLATIONREQUEST_H_ */

View File

@ -1,76 +1,108 @@
/*
* TranslationResult.h
*
* The class that represents the result of AbstractTranslationModel::translate() API for each of its text entry and
* TranslationRequest.
* The class that represents the result of AbstractTranslationModel::translate()
* API for each of its text entry and TranslationRequest.
*/
#ifndef SRC_TRANSLATOR_TRANSLATIONRESULT_H_
#define SRC_TRANSLATOR_TRANSLATIONRESULT_H_
#include <vector>
#include <string>
#include <vector>
#include "QualityScore.h"
/* This class represents the result of AbstractTranslationModel::translate() API for each of its text entry and
* TranslationRequest.
/* This class represents the result of AbstractTranslationModel::translate() API
* for each of its text entry and TranslationRequest.
*/
class TranslationResult {
public:
typedef std::vector<std::pair<std::string_view, std::string_view>> SentenceMappings;
typedef std::vector<std::pair<std::string_view, std::string_view>>
SentenceMappings;
#ifdef WASM_BINDINGS
TranslationResult(const std::string &original, const std::string &translation)
: originalText(original), translatedText(translation),
sentenceMappings() {}
#endif
TranslationResult(const std::string &original, const std::string &translation,
SentenceMappings &sentenceMappings)
: originalText(original), translatedText(translation),
sentenceMappings(sentenceMappings) {}
TranslationResult(const std::string &original, const std::string &translation) :
originalText(original), translatedText(translation) {}
TranslationResult(TranslationResult &&other)
: originalText(std::move(other.originalText)),
translatedText(std::move(other.translatedText)),
sentenceMappings(std::move(other.sentenceMappings)) {}
TranslationResult(std::string &&original, std::string &&translation) :
originalText(std::move(original)), translatedText(std::move(translation)) {}
#ifdef WASM_BINDINGS
TranslationResult(const TranslationResult &other)
: originalText(other.originalText),
translatedText(other.translatedText),
sentenceMappings(other.sentenceMappings) {}
#endif
/* Return the original text. */
const std::string& getOriginalText() const {
return originalText;
}
TranslationResult(std::string &&original, std::string &&translation,
SentenceMappings &&sentenceMappings)
: originalText(std::move(original)),
translatedText(std::move(translation)),
sentenceMappings(std::move(sentenceMappings)) {}
/* Return the translated text. */
const std::string& getTranslatedText() const {
return translatedText;
}
#ifndef WASM_BINDINGS
TranslationResult &operator=(const TranslationResult &) = delete;
#else
TranslationResult &operator=(const TranslationResult &result) {
originalText = result.originalText;
translatedText = result.translatedText;
sentenceMappings = result.sentenceMappings;
return *this;
}
#endif
/* Return the Quality scores of the translated text. */
const QualityScore& getQualityScore() const {
return qualityScore;
}
/* Return the original text. */
const std::string &getOriginalText() const { return originalText; }
/* Return the Sentence mappings (information regarding how individual sentences of originalText map to
* corresponding translated sentences in translatedText).
*/
const SentenceMappings& getSentenceMappings() const {
return sentenceMappings;
}
/* Return the translated text. */
const std::string &getTranslatedText() const { return translatedText; }
/* Return the Quality scores of the translated text. */
const QualityScore &getQualityScore() const { return qualityScore; }
/* Return the Sentence mappings (information regarding how individual
* sentences of originalText map to corresponding translated sentences in
* translatedText).
*/
const SentenceMappings &getSentenceMappings() const {
return sentenceMappings;
}
private:
// Original text (in UTF-8 encoded format) that was supposed to be translated
std::string originalText;
// Original text (in UTF-8 encoded format) that was supposed to be translated
std::string originalText;
// Translation (in UTF-8 encoded format) of the originalText
std::string translatedText;
// Translation (in UTF-8 encoded format) of the originalText
std::string translatedText;
// Quality score of the translated text at the granularity specified in TranslationRequest.
// It is an optional result (it will have no information if not requested in TranslationRequest)
QualityScore qualityScore;
// Quality score of the translated text at the granularity specified in
// TranslationRequest. It is an optional result (it will have no information
// if not requested in TranslationRequest)
QualityScore qualityScore;
// Information regarding how individual sentences of originalText map to corresponding translated sentences
// in joined translated text (translatedText)
// An example of sentence mapping:
// originalText (contains 2 sentences) = "What is your name? My name is Abc."
// translatedText (contains 2 translated sentences) = "Was ist dein Name? Mein Name ist Abc."
// sentenceMappings = [
// {"What is your name?", "Was ist dein Name?"}, // Pair(originalText[0],translatedText[0])
// {"My name is Abc", "Mein Name ist Abc."} // Pair(originalText[1],translatedText[1])
// ]
//
// It is an optional result (it will be empty if not requested in TranslationRequest).
SentenceMappings sentenceMappings;
// Information regarding how individual sentences of originalText map to
// corresponding translated sentences in joined translated text
// (translatedText).
// An example of sentence mapping:
//   originalText (contains 2 sentences) =
//     "What is your name? My name is Abc."
//   translatedText (contains 2 translated sentences) =
//     "Was ist dein Name? Mein Name ist Abc."
//   sentenceMappings = [
//     // Pair(originalText[0], translatedText[0])
//     {"What is your name?", "Was ist dein Name?"},
//     // Pair(originalText[1], translatedText[1])
//     {"My name is Abc.", "Mein Name ist Abc."}
//   ]
//
// It is an optional result (it will be empty if not requested in
// TranslationRequest).
SentenceMappings sentenceMappings;
};
#endif /* SRC_TRANSLATOR_TRANSLATIONRESULT_H_ */

View File

@ -4,18 +4,11 @@
*/
#include <memory>
// All 3rd party includes
#include "3rd_party/marian-dev/src/common/options.h"
// All local includes
#include "AbstractTranslationModel.h"
#include "TranslationModel.h"
#include "TranslationModelConfigToOptionsAdaptor.h"
std::shared_ptr<AbstractTranslationModel>
AbstractTranslationModel::createInstance(const TranslationModelConfiguration& config) {
TranslationModelConfigToOptionsAdaptor adaptor;
auto options = adaptor.adapt(config);
return std::make_shared<TranslationModel>(options);
AbstractTranslationModel::createInstance(const std::string &config) {
return std::make_shared<TranslationModel>(config);
}

View File

@ -1,11 +1,41 @@
if (NOT USE_WASM_COMPATIBLE_SOURCES)
set(MULTITHREADED_SERVICE_SOURCE "service.cpp")
endif()
add_library(bergamot-translator STATIC
AbstractTranslationModel.cpp
TranslationModel.cpp
TranslationModelConfigToOptionsAdaptor.cpp)
target_link_libraries(bergamot-translator marian)
# Following files added from browsermt/mts@nuke
text_processor.cpp
sentence_splitter.cpp
batch_translator.cpp
multifactor_priority.cpp
request.cpp
service_base.cpp
${MULTITHREADED_SERVICE_SOURCE}
batcher.cpp
response.cpp
batch.cpp
sentence_ranges.cpp
)
if (COMPILE_DECODER_ONLY)
# A dirty hack because of marian's bad cmake practices
target_compile_definitions(bergamot-translator PUBLIC DECODER_ONLY)
endif()
if(COMPILE_WASM)
# A dirty hack because of marian's bad cmake practices
target_compile_definitions(bergamot-translator PUBLIC USE_SSE2 WASM)
# Enable code that is required for generating JS bindings
target_compile_definitions(bergamot-translator PRIVATE WASM_BINDINGS)
target_compile_options(bergamot-translator PRIVATE ${WASM_COMPILE_FLAGS})
endif(COMPILE_WASM)
target_link_libraries(bergamot-translator marian ssplit)
target_include_directories(bergamot-translator
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
PRIVATE ${CMAKE_SOURCE_DIR}
PUBLIC ${CMAKE_SOURCE_DIR}/src)

View File

@ -6,23 +6,93 @@
#include <future>
#include <vector>
#include "TranslationModel.h"
// All 3rd party includes
#include "3rd_party/marian-dev/src/3rd_party/yaml-cpp/yaml.h"
#include "3rd_party/marian-dev/src/common/config_parser.h"
#include "common/config_validator.h"
#include "common/options.h"
TranslationModel::TranslationModel(std::shared_ptr<marian::Options> options) :
configOptions(std::move(options)), AbstractTranslationModel() {
// All local project includes
#include "TranslationModel.h"
#include "translator/parser.h"
#include "translator/service_base.h"
std::shared_ptr<marian::Options> parseOptions(const std::string &config) {
marian::Options options;
// @TODO(jerinphilip) There's something off here, @XapaJIaMnu suggests
// that should not be using the defaultConfig. This function only has access
// to std::string config and needs to be able to construct Options from the
// same.
// Absent the following code-segment, there is a parsing exception thrown on
// rebuilding YAML.
//
// Error: Unhandled exception of type 'N4YAML11InvalidNodeE': invalid node;
// this may result from using a map iterator as a sequence iterator, or
// vice-versa
//
// Error: Aborted from void unhandledException() in
// 3rd_party/marian-dev/src/common/logging.cpp:113
marian::ConfigParser configParser = marian::bergamot::createConfigParser();
const YAML::Node &defaultConfig = configParser.getConfig();
options.merge(defaultConfig);
// Parse configs onto defaultConfig.
options.parse(config);
YAML::Node configCopy = options.cloneToYamlNode();
marian::ConfigValidator validator(configCopy);
validator.validateOptions(marian::cli::mode::translation);
return std::make_shared<marian::Options>(options);
}
/* Construct the model from a YAML-formatted configuration string.
 *
 * The initializers are written in the order they actually run: the base class
 * first, then the members. service_ is built from configOptions_, so
 * configOptions_ has to be initialized before it.
 * parseOptions() already returns a temporary, so it is passed on directly;
 * wrapping it in std::move would only inhibit copy elision.
 */
TranslationModel::TranslationModel(const std::string &config)
    : AbstractTranslationModel(), configOptions_(parseOptions(config)),
      service_(configOptions_) {}
TranslationModel::~TranslationModel() {}
std::future<std::vector<TranslationResult>> TranslationModel::translate(
std::vector<std::string> &&texts, TranslationRequest request) {
//ToDo: Replace this code with the actual implementation
return std::async([]() {
std::vector<TranslationResult> results;
return results;
});
std::vector<TranslationResult>
TranslationModel::translate(std::vector<std::string> &&texts,
TranslationRequest request) {
// Implementing a non-async version first. Unpleasant, but should work.
std::promise<std::vector<TranslationResult>> promise;
auto future = promise.get_future();
// This code, move into async?
std::vector<TranslationResult> translationResults;
for (auto &text : texts) {
// Collect future as marian::bergamot::TranslationResult
auto intermediate = service_.translate(std::move(text));
intermediate.wait();
auto marianResponse(std::move(intermediate.get()));
// This mess because marian::string_view != std::string_view
std::string source, translation;
marian::bergamot::Response::SentenceMappings mSentenceMappings;
marianResponse.move(source, translation, mSentenceMappings);
// Convert to UnifiedAPI::TranslationResult
TranslationResult::SentenceMappings sentenceMappings;
for (auto &p : mSentenceMappings) {
std::string_view src(p.first.data(), p.first.size()),
tgt(p.second.data(), p.second.size());
sentenceMappings.emplace_back(src, tgt);
}
// In place construction.
translationResults.emplace_back(
std::move(source), // &&marianResponse.source_
std::move(translation), // &&marianResponse.translation_
std::move(sentenceMappings) // &&sentenceMappings
);
}
return translationResults;
}
bool TranslationModel::isAlignmentSupported() const {
return false;
}
bool TranslationModel::isAlignmentSupported() const { return false; }

View File

@ -7,58 +7,65 @@
#ifndef SRC_TRANSLATOR_TRANSLATIONMODEL_H_
#define SRC_TRANSLATOR_TRANSLATIONMODEL_H_
#include <vector>
#include <string>
#include <future>
#include <string>
#include <vector>
// All 3rd party includes
#include "3rd_party/marian-dev/src/common/options.h"
// All local project includes
#include "AbstractTranslationModel.h"
#include "TranslationModelConfiguration.h"
#include "translator/service_base.h"
/* A Translation model that translates a plain (without any markups and emojis) UTF-8 encoded text.
* This implementation supports translation from 1 source language to 1 target language.
/* A Translation model that translates a plain (without any markups and emojis)
* UTF-8 encoded text. This implementation supports translation from 1 source
* language to 1 target language.
*/
class TranslationModel: public AbstractTranslationModel {
class TranslationModel : public AbstractTranslationModel {
public:
/* Construct the model using the model configuration options.
*/
TranslationModel(std::shared_ptr<marian::Options> options);
/* Construct the model using the model configuration options as yaml-formatted
* string
*/
TranslationModel(const std::string &config);
~TranslationModel();
~TranslationModel();
/* This method performs translation on a list of UTF-8 encoded plain text (without any markups
* or emojis) and returns a list of results in the same order. The model supports translation
* from 1 source language to 1 target language.
*
* Each text entry can either be a word, a phrase, a sentence or a list of sentences. Additional
* information related to the translated text can be requested via TranslationRequest which is
* applied equally to each text entry. The translated text corresponding to each text entry and
* the additional information (as specified in the TranslationRequest) is encapsulated and
* returned in TranslationResult.
*
* The API splits each text entry into sentences internally, which are then translated
* independent of each other. The translated sentences are then joined back together and returned
* in TranslationResult.
*
* Please refer to the TranslationRequest class to find out what additional information can be
* requested. The alignment information can only be requested if the model supports it (check
* isAlignmentSupported() API).
*
* The texts argument will become empty after the execution of this API (each entry of texts list
* will be moved to its corresponding TranslationResult object).
*/
std::future<std::vector<TranslationResult>> translate(
std::vector<std::string> &&texts, TranslationRequest request) override;
/* This method performs translation on a list of UTF-8 encoded plain text
* (without any markups or emojis) and returns a list of results in the same
* order. The model supports translation from 1 source language to 1 target
* language.
*
* Each text entry can either be a word, a phrase, a sentence or a list of
* sentences. Additional information related to the translated text can be
* requested via TranslationRequest which is applied equally to each text
* entry. The translated text corresponding to each text entry and the
* additional information (as specified in the TranslationRequest) is
* encapsulated and returned in TranslationResult.
*
* The API splits each text entry into sentences internally, which are then
* translated independent of each other. The translated sentences are then
* joined back together and returned in TranslationResult.
*
* Please refer to the TranslationRequest class to find out what additional
* information can be requested. The alignment information can only be
* requested if the model supports it (check isAlignmentSupported() API).
*
* The texts argument will become empty after the execution of this API (each
* entry of texts list will be moved to its corresponding TranslationResult
* object).
*/
std::vector<TranslationResult> translate(std::vector<std::string> &&texts,
TranslationRequest request) override;
/* Check if the model can provide alignment information b/w original and translated text. */
bool isAlignmentSupported() const override;
/* Check if the model can provide alignment information b/w original and
* translated text. */
bool isAlignmentSupported() const override;
private:
// Model configuration options
std::shared_ptr<marian::Options> configOptions;
// Model configuration options
std::shared_ptr<marian::Options> configOptions_; // ORDER DEPENDENCY
marian::bergamot::NonThreadedService service_; // ORDER DEPENDENCY
};
#endif /* SRC_TRANSLATOR_TRANSLATIONMODEL_H_ */

View File

@ -1,17 +0,0 @@
/*
* TranslationModelConfigToOptionsAdaptor.cpp
*
*/
#include <memory>
#include "TranslationModelConfigToOptionsAdaptor.h"
// Construction and destruction have nothing to do.
TranslationModelConfigToOptionsAdaptor::
    TranslationModelConfigToOptionsAdaptor() = default;

TranslationModelConfigToOptionsAdaptor::
    ~TranslationModelConfigToOptionsAdaptor() = default;

/* Placeholder conversion: the configuration is not inspected yet and a
 * default-constructed marian::Options is returned.
 */
std::shared_ptr<marian::Options> TranslationModelConfigToOptionsAdaptor::adapt(
    const TranslationModelConfiguration &config) {
  // ToDo: Add actual implementation
  auto emptyOptions = std::make_shared<marian::Options>();
  return emptyOptions;
}

View File

@ -1,32 +0,0 @@
/*
* This class adapts the TranslationModelConfiguration object to marian::Options object.
* marian::Options is a class that is specific to Marian and is used heavily inside it
* as configuration options (even for translation workflow). It makes sense to work with
* this class internally in the implementation files.
*/
#ifndef SRC_TRANSLATOR_TRANSLATIONMODELCONFIGTOOPTIONSADAPTOR_H_
#define SRC_TRANSLATOR_TRANSLATIONMODELCONFIGTOOPTIONSADAPTOR_H_
#include <memory>
// All 3rd party includes
#include "3rd_party/marian-dev/src/common/options.h"
// All local includes
#include "TranslationModelConfiguration.h"
// Converts a TranslationModelConfiguration into the marian::Options object
// that the implementation files work with. The class declares no data members.
class TranslationModelConfigToOptionsAdaptor {
public:
TranslationModelConfigToOptionsAdaptor();
~TranslationModelConfigToOptionsAdaptor();
/* Create an Options object from the translation model configuration object.
 * config: the model configuration to convert.
 * Returns a shared pointer to the resulting marian::Options.
*/
std::shared_ptr<marian::Options> adapt(const TranslationModelConfiguration& config);
};
#endif /* SRC_TRANSLATOR_TRANSLATIONMODELCONFIGTOOPTIONSADAPTOR_H_ */

28
src/translator/batch.cpp Normal file
View File

@ -0,0 +1,28 @@
#include "batch.h"
#include "request.h"
namespace marian {
namespace bergamot {
// Logs batch statistics at info level: the total number of tokens, the token
// count of the longest sentence, and the number of sentences in the batch.
void Batch::log() {
  size_t totalTokens = 0;
  size_t longest = 0;
  for (auto &entry : sentences_) {
    const size_t length = static_cast<size_t>(entry.numTokens());
    totalTokens += length;
    if (length > longest) {
      longest = length;
    }
  }

  LOG(info, "Batch(tokens={}, max-length={}, sentences_={})", totalTokens,
      longest, sentences_.size());
}
// Appends a copy of the given sentence to the end of this batch.
void Batch::add(const RequestSentence &sentence) {
  sentences_.emplace_back(sentence);
}
void Batch::completeBatch(const Histories &histories) {
for (size_t i = 0; i < sentences_.size(); i++) {
sentences_[i].completeSentence(histories[i]);
}
}
} // namespace bergamot
} // namespace marian

52
src/translator/batch.h Normal file
View File

@ -0,0 +1,52 @@
#ifndef SRC_BERGAMOT_BATCH_H
#define SRC_BERGAMOT_BATCH_H
#include "request.h"
#include "translator/beam_search.h"
namespace marian {
namespace bergamot {
// A collection of RequestSentences that is handed over for translation as one
// unit. A Batch can additionally be flagged as "poison" (see poison()).
class Batch {
public:
  Batch() {}

  // Removes all sentences from the batch. The poison flag is not changed.
  void clear() { sentences_.clear(); }

  // Methods to construct and determine poison.
  // poison() returns an empty batch with the poison flag set.
  // NOTE(review): presumably used as the shutdown marker on the PCQueue of
  // batches mentioned below — confirm with the consumer.
  static Batch poison() {
    Batch batch;
    batch.poison_ = true;
    return batch;
  }
  bool isPoison() const { return poison_; }

  // Number of sentences currently held.
  size_t size() const { return sentences_.size(); }

  // Appends a sentence to the batch.
  void add(const RequestSentence &sentence);

  // Accessors to read from a Batch. For use in BatchTranslator (consumer on a
  // PCQueue holding batches).
  //
  // sentences() are used to access sentences to construct marian internal
  // batch. Read-only access, hence const-qualified so it can also be called
  // on a const Batch.
  const RequestSentences &sentences() const { return sentences_; }

  // On obtaining Histories after translating a batch, completeBatch can be
  // called with Histories, which forwards the call to Request through
  // RequestSentence and triggers completion, by setting the promised value to
  // the future given to client.
  void completeBatch(const Histories &histories);

  // Convenience function to log batch-statistics. numTokens, max-length.
  void log();

private:
  // True only for batches created through poison().
  bool poison_{false};
  // Sentences in insertion order.
  RequestSentences sentences_;
};
} // namespace bergamot
} // namespace marian
#endif // SRC_BERGAMOT_BATCH_H_

View File

@ -0,0 +1,100 @@
#include "batch_translator.h"
#include "batch.h"
#include "common/logging.h"
#include "data/corpus.h"
#include "data/text_input.h"
#include "translator/beam_search.h"
namespace marian {
namespace bergamot {
// Stores the device, the options and a pointer to the caller-owned vocabs
// vector. No graph or scorer is created here; that happens in initialize().
//
// The initializers follow the declaration order of the members in
// BatchTranslator (options_, device_, vocabs_), which is the order in which
// they are actually initialized.
BatchTranslator::BatchTranslator(DeviceId const device,
                                 std::vector<Ptr<Vocab const>> &vocabs,
                                 Ptr<Options> options)
    : options_(options), device_(device), vocabs_(&vocabs) {}
// Prepares everything translate() needs: an optional lexical shortlist
// generator, the expression graph, and the scorers bound to that graph.
void BatchTranslator::initialize() {
  // A shortlist generator is only built when the "shortlist" option is set.
  if (options_->hasAndNotEmpty("shortlist")) {
    int sourceIndex = 0;
    int targetIndex = 1;
    // Source and target share a vocabulary when first and last entry match.
    bool vocabIsShared = vocabs_->front() == vocabs_->back();
    slgen_ = New<data::LexicalShortlistGenerator>(
        options_, vocabs_->front(), vocabs_->back(), sourceIndex, targetIndex,
        vocabIsShared);
  }

  // Graph setup.
  graph_ = New<ExpressionGraph>(true); // always optimize
  auto precision =
      options_->get<std::vector<std::string>>("precision", {"float32"});
  graph_->setDefaultElementType(typeFromString(precision[0]));
  graph_->setDevice(device_);
  graph_->getBackend()->configureDevice(options_);
  graph_->reserveWorkspaceMB(options_->get<size_t>("workspace"));

  // Scorers: initialize each on the graph and attach the shortlist generator
  // if one was created above.
  scorers_ = createScorers(options_);
  for (const auto &scorer : scorers_) {
    scorer->init(graph_);
    if (slgen_) {
      scorer->setShortlistGenerator(slgen_);
    }
  }

  graph_->forward();
}
// Translates all sentences of batch in one beam-search pass and reports the
// resulting histories back through batch.completeBatch().
//
// Steps: wrap each sentence in a SentenceTuple, find the maximum length per
// stream, pack the tuples into SubBatches (data + mask), build a CorpusBatch
// from them, and run BeamSearch against the last vocabulary.
void BatchTranslator::translate(Batch &batch) {
  std::vector<data::SentenceTuple> batchVector;
  batchVector.reserve(batch.size());

  auto &sentences = batch.sentences();
  for (auto &sentence : sentences) {
    data::SentenceTuple sentence_tuple(sentence.lineNumber());
    sentence_tuple.push_back(sentence.getUnderlyingSegment());
    // The tuple is not used again in this iteration, so move it in.
    batchVector.push_back(std::move(sentence_tuple));
  }

  size_t batchSize = batchVector.size();
  std::vector<size_t> sentenceIds;
  sentenceIds.reserve(batchSize);

  // maxDims[i] is the longest stream i over all tuples.
  std::vector<int> maxDims;
  for (auto &ex : batchVector) {
    if (maxDims.size() < ex.size())
      maxDims.resize(ex.size(), 0);
    for (size_t i = 0; i < ex.size(); ++i) {
      if (ex[i].size() > (size_t)maxDims[i])
        maxDims[i] = (int)ex[i].size();
    }
    sentenceIds.push_back(ex.getId());
  }

  using SubBatch = marian::data::SubBatch;
  using CorpusBatch = marian::data::CorpusBatch;

  std::vector<Ptr<SubBatch>> subBatches;
  subBatches.reserve(maxDims.size());
  for (size_t j = 0; j < maxDims.size(); ++j) {
    subBatches.emplace_back(
        New<SubBatch>(batchSize, maxDims[j], vocabs_->at(j)));
  }

  // Fill data and mask; position k of sentence i lives at k * batchSize + i.
  std::vector<size_t> words(maxDims.size(), 0);
  for (size_t i = 0; i < batchSize; ++i) {
    for (size_t j = 0; j < maxDims.size(); ++j) {
      for (size_t k = 0; k < batchVector[i][j].size(); ++k) {
        subBatches[j]->data()[k * batchSize + i] = batchVector[i][j][k];
        subBatches[j]->mask()[k * batchSize + i] = 1.f;
        words[j]++;
      }
    }
  }

  for (size_t j = 0; j < maxDims.size(); ++j)
    subBatches[j]->setWords(words[j]);

  auto corpus_batch = New<CorpusBatch>(subBatches);
  corpus_batch->setSentenceIds(sentenceIds);

  auto trgVocab = vocabs_->back();
  auto search = New<BeamSearch>(options_, scorers_, trgVocab);

  // search() returns a temporary; binding it directly allows copy elision,
  // which an explicit std::move would prevent.
  auto histories = search->search(graph_, corpus_batch);
  batch.completeBatch(histories);
}
} // namespace bergamot
} // namespace marian

View File

@ -0,0 +1,49 @@
#ifndef SRC_BERGAMOT_BATCH_TRANSLATOR_H_
#define SRC_BERGAMOT_BATCH_TRANSLATOR_H_
#include <string>
#include <vector>
#include "batch.h"
#include "common/utils.h"
#include "data/shortlist.h"
#include "definitions.h"
#include "request.h"
#include "translator/history.h"
#include "translator/scorers.h"
#ifdef WITH_PTHREADS
#include "pcqueue.h"
#endif
namespace marian {
namespace bergamot {
class BatchTranslator {
// Runs minimal marian translation (only CPU at the moment) for one device.
// initialize() prepares the graph and scorers; translate() processes one Batch.
// NOTE(review): an earlier version of this comment said the constructor
// launches a worker thread running mainloop(). No mainloop() is declared here;
// Service creates the threads and calls initialize()/translate() from them,
// then joins them on shutdown.
public:
BatchTranslator(DeviceId const device, std::vector<Ptr<Vocab const>> &vocabs,
Ptr<Options> options);
// Convenience function for logging: "worker" followed by the device number.
// TODO(jerin)
std::string _identifier() { return "worker" + std::to_string(device_.no); }
// Translates all sentences in batch and completes the batch with the results.
void translate(Batch &batch);
// Sets up graph_ and scorers_; must run before translate().
void initialize();
private:
Ptr<Options> options_;
DeviceId device_;
// Non-owning pointer to the vocabularies; translate() uses the last entry as
// the target vocabulary.
std::vector<Ptr<Vocab const>> *vocabs_;
Ptr<ExpressionGraph> graph_;
std::vector<Ptr<Scorer>> scorers_;
// Optional shortlist generator, attached to every scorer when set.
Ptr<data::ShortlistGenerator const> slgen_;
};
} // namespace bergamot
} // namespace marian
#endif // SRC_BERGAMOT_BATCH_TRANSLATOR_H_

View File

@ -0,0 +1,61 @@
#include "batcher.h"
#include "batch.h"
#include "common/logging.h"
#include <cassert>
namespace marian {
namespace bergamot {
// Reads the batching limits from options: the per-batch word budget
// ("mini-batch-words") and the longest admissible sentence
// ("max-length-break"). One bucket is created per sentence length
// 0..max-length-break. Aborts when a sentence of maximum length could never
// fit into a batch.
Batcher::Batcher(Ptr<Options> options) {
  miniBatchWords = options->get<int>("mini-batch-words");

  const int maxLengthBreak = options->get<int>("max-length-break");
  bucket_.resize(maxLengthBreak + 1);

  const size_t longestSentence = bucket_.size() - 1;
  ABORT_IF(longestSentence > miniBatchWords,
           "Fatal: max-length-break > mini-batch-words  will lead to sentences "
           "longer than what can fit in a batch.");
}
// Files the sentence into the bucket that matches its token count. Within a
// bucket, sentences are ordered by RequestSentence's operator<.
void Batcher::addSentenceWithPriority(RequestSentence &sentence) {
  const size_t numTokens = sentence.numTokens();
  // Sentences are expected to be at most max-length-break tokens long.
  assert(numTokens < bucket_.size());
  auto &bucket = bucket_[numTokens];
  bucket.insert(sentence);
}
// Stream-style alias for cleaveBatch(): fills `batch` and reports whether it
// holds at least one sentence.
bool Batcher::operator>>(Batch &batch) {
  const bool hasSentences = cleaveBatch(batch);
  return hasSentences;
}
// Greedily fills `batch` from the length buckets, shortest sentences first,
// until adding one more sentence would push the padded size
// ((sentences + 1) * current length) past miniBatchWords. Sentences placed in
// the batch are removed from their bucket. Returns true when the batch holds
// at least one sentence.
//
// This is a baseline; it does not yet optimise over priority. It should be at
// least as fast as marian's maxi-batch with the full corpus as maxi-batch.
bool Batcher::cleaveBatch(Batch &batch) {
  batch.clear();

  for (size_t length = 0; length < bucket_.size(); ++length) {
    auto &bucket = bucket_[length];
    for (auto it = bucket.begin(); it != bucket.end();) {
      const size_t paddedBatchSize = (batch.size() + 1) * length;
      if (paddedBatchSize > miniBatchWords) {
        // Budget exhausted: the batch collected so far is what we return.
        assert(batch.size() > 0);
        return true;
      }
      batch.add(*it);
      // erase() returns the iterator following the removed element.
      it = bucket.erase(it);
    }
  }

  // All buckets drained; the batch is valid only if something was added.
  return batch.size() > 0;
}
// Queues every segment of `request` as an individual RequestSentence.
void Batcher::addWholeRequest(Ptr<Request> request) {
  size_t index = 0;
  while (index < request->numSegments()) {
    RequestSentence sentence(index, request);
    addSentenceWithPriority(sentence);
    ++index;
  }
}
} // namespace bergamot
} // namespace marian

43
src/translator/batcher.h Normal file
View File

@ -0,0 +1,43 @@
#ifndef SRC_BERGAMOT_BATCHER_H_
#define SRC_BERGAMOT_BATCHER_H_
#include "batch.h"
#include "common/options.h"
#include "data/corpus_base.h"
#include "definitions.h"
#include "request.h"
#ifdef WITH_PTHREADS
#include "pcqueue.h"
#endif
#include <set>
#include <vector>
namespace marian {
namespace bergamot {
class Batcher {
// Collects sentences from (possibly several) requests into per-length buckets
// and cleaves them into batches bounded by a word budget.
public:
// Reads "mini-batch-words" and "max-length-break" from options.
explicit Batcher(Ptr<Options> options);
// RequestSentence incorporates (tentative) notions of priority with each
// sentence. This method inserts the sentence into the internal data-structure
// which maintains priority among sentences from multiple concurrent requests.
void addSentenceWithPriority(RequestSentence &sentence);
// Inserts every segment of the request via addSentenceWithPriority().
void addWholeRequest(Ptr<Request> request);
// Returns true while a non-empty batch could be produced.
bool operator>>(Batch &batch); // alias for cleaveBatch
private:
// Fills batch with sentences compiled from (tentatively) multiple requests,
// optimizing for both padding and priority. Returns false when no sentence
// was available.
bool cleaveBatch(Batch &batch);
// Upper bound on the padded size (sentences * length) of one batch.
size_t miniBatchWords;
// bucket_[n] holds the pending sentences that are exactly n tokens long.
std::vector<std::set<RequestSentence>> bucket_;
// NOTE(review): not referenced by any Batcher method visible in batcher.cpp.
size_t batchNumber_{0};
};
} // namespace bergamot
} // namespace marian
#endif // SRC_BERGAMOT_BATCHER_H_

View File

@ -0,0 +1,27 @@
#ifndef SRC_BERGAMOT_DEFINITIONS_H_
#define SRC_BERGAMOT_DEFINITIONS_H_
#include "data/types.h"
#include "data/vocab_base.h"
#include <vector>
namespace marian {
namespace bergamot {
// A Segment is the token (word-id) sequence of one sentence.
typedef marian::Words Segment;
// All segments of one request.
typedef std::vector<Segment> Segments;
// string_views, one per token, for one sentence.
typedef std::vector<marian::string_view> TokenRanges;
// TokenRanges for every sentence.
typedef std::vector<TokenRanges> SentenceTokenRanges;
/** @brief Creates a UPtr<T> owning a new T, forwarding all arguments to a
 * matching constructor of T. */
template <class T, typename... Args> UPtr<T> UNew(Args &&... args) {
  return UPtr<T>(new T(std::forward<Args>(args)...));
}
/** @brief Pass-through overload: takes an existing UPtr<T> by value and
 * returns it.
 *
 * Returning the by-value parameter moves it implicitly. The previous
 * `UPtr<T>(p)` copy-constructed from an lvalue, which cannot compile for a
 * move-only pointer (assumes UPtr aliases std::unique_ptr; TODO confirm). */
template <class T> UPtr<T> UNew(UPtr<T> p) { return p; }
} // namespace bergamot
} // namespace marian
#endif // SRC_BERGAMOT_DEFINITIONS_H_

View File

@ -0,0 +1,7 @@
#include "multifactor_priority.h"
namespace marian {
namespace bergamot {
} // namespace bergamot
} // namespace marian

View File

@ -0,0 +1,20 @@
#ifndef SRC_BERGAMOT_MULTIFACTOR_PRIORITY_H_
#define SRC_BERGAMOT_MULTIFACTOR_PRIORITY_H_
#include "data/types.h"
#include "definitions.h"
#include "sys/time.h"
namespace marian {
namespace bergamot {
// Placeholder for a priority built from several factors. At present only Id
// contributes to the value returned by priority().
struct MultiFactorPriority {
  // User-configurable priority, set per request.
  int nice;
  unsigned int Id;

  // What else should priority depend on?
  double priority() {
    const double value = Id;
    return value;
  }
};
} // namespace bergamot
} // namespace marian
#endif // SRC_BERGAMOT_MULTIFACTOR_PRIORITY_H_

28
src/translator/parser.h Normal file
View File

@ -0,0 +1,28 @@
#ifndef SRC_BERGAMOT_PARSER_H
#define SRC_BERGAMOT_PARSER_H
#include "marian.h"
namespace marian {
namespace bergamot {
// Builds marian's translation-mode ConfigParser and registers the extra
// command-line options used here: the sentence-splitter prefix file, the
// sentence-splitting mode (default "paragraph") and the maximum number of
// tokens per sentence (default 128).
inline marian::ConfigParser createConfigParser() {
  marian::ConfigParser parser(marian::cli::mode::translation);

  parser.addOption<std::string>(
      "--ssplit-prefix-file", "Bergamot Options",
      "File with nonbreaking prefixes for sentence splitting.");

  parser.addOption<std::string>(
      "--ssplit-mode", "Server Options",
      "[paragraph, sentence, wrapped_text]", "paragraph");

  parser.addOption<int>(
      "--max-length-break", "Bergamot Options",
      "Maximum input tokens to be processed in a single sentence.", 128);

  return parser;
}
} // namespace bergamot
} // namespace marian
#endif // SRC_BERGAMOT_PARSER_H

299
src/translator/pcqueue.h Normal file
View File

@ -0,0 +1,299 @@
#ifndef SRC_BERGAMOT_PCQUEUE_H_
#define SRC_BERGAMOT_PCQUEUE_H_
#include "common/logging.h"
#include <algorithm>
#include <cerrno>
#include <iostream>
#include <memory>
#include <mutex>
#ifdef __APPLE__
#include <mach/mach.h>
#include <mach/mach_traps.h>
#include <mach/semaphore.h>
#include <mach/task.h>
#elif defined(__linux)
#include <semaphore.h>
#else
#include <boost/interprocess/sync/interprocess_semaphore.hpp>
#endif
// UTIL_UNLIKELY(x) evaluates to the truth value of x. With GCC-compatible
// compilers (__GNUC__ >= 3) it also hints that x is expected to be false;
// elsewhere it expands to plain (x).
#if __GNUC__ >= 3
#define UTIL_UNLIKELY(x) __builtin_expect(!!(x), 0)
#else
#define UTIL_UNLIKELY(x) (x)
#endif
namespace marian {
namespace bergamot {
/* OS X Maverick and Boost interprocess were doing "Function not implemented."
* So this is my own wrapper around the mach kernel APIs.
*/
#ifdef __APPLE__
// Counting semaphore backed by a mach kernel semaphore (macOS).
class Semaphore {
public:
  // Creates a FIFO-policy semaphore with the given initial count; aborts on
  // failure.
  explicit Semaphore(int value) : task_(mach_task_self()) {
    ABORT_IF(KERN_SUCCESS !=
                 semaphore_create(task_, &back_, SYNC_POLICY_FIFO, value),
             "Could not create semaphore");
  }

  // The object owns a kernel handle. A copy would destroy the same handle
  // twice, so copying is disabled.
  Semaphore(const Semaphore &) = delete;
  Semaphore &operator=(const Semaphore &) = delete;

  // Destroys the kernel semaphore. Aborts the process on failure, since a
  // destructor cannot report the error.
  ~Semaphore() {
    if (KERN_SUCCESS != semaphore_destroy(task_, back_)) {
      std::cerr << "Could not destroy semaphore" << std::endl;
      abort();
    }
  }

  // Waits on the semaphore; aborts if the kernel call fails.
  void wait() {
    ABORT_IF(KERN_SUCCESS != semaphore_wait(back_),
             "Wait for semaphore failed");
  }

  // Signals the semaphore; aborts if the kernel call fails.
  void post() {
    ABORT_IF(KERN_SUCCESS != semaphore_signal(back_),
             "Could not post to semaphore");
  }

private:
  semaphore_t back_;
  task_t task_;
};

// Uniform entry point used by the queues, so call sites do not depend on
// which Semaphore implementation is compiled in.
inline void WaitSemaphore(Semaphore &semaphore) { semaphore.wait(); }
#elif defined(__linux)
// Counting semaphore backed by an unnamed POSIX semaphore (Linux).
class Semaphore {
public:
  // Initialises a process-private semaphore (pshared == 0) with the given
  // count; aborts on failure.
  explicit Semaphore(unsigned int value) {
    ABORT_IF(sem_init(&sem_, 0, value), "Could not create semaphore");
  }

  // POSIX leaves operations on a copy of a sem_t undefined, and a copy would
  // also be destroyed twice, so copying is disabled.
  Semaphore(const Semaphore &) = delete;
  Semaphore &operator=(const Semaphore &) = delete;

  // Destroys the semaphore. Aborts the process on failure, since a destructor
  // cannot report the error.
  ~Semaphore() {
    if (-1 == sem_destroy(&sem_)) {
      std::cerr << "Could not destroy semaphore " << std::endl;
      abort();
    }
  }

  // Waits on the semaphore, retrying when the wait is interrupted by a signal
  // (EINTR); any other error aborts.
  void wait() {
    while (UTIL_UNLIKELY(-1 == sem_wait(&sem_))) {
      ABORT_IF(errno != EINTR, "Wait for semaphore failed");
    }
  }

  // Posts to the semaphore; aborts on failure.
  void post() {
    ABORT_IF(-1 == sem_post(&sem_), "Could not post to semaphore");
  }

private:
  sem_t sem_;
};

// Uniform entry point used by the queues, so call sites do not depend on
// which Semaphore implementation is compiled in.
inline void WaitSemaphore(Semaphore &semaphore) { semaphore.wait(); }
#else
// Fallback for platforms other than Apple/Linux: use Boost's interprocess
// semaphore directly.
typedef boost::interprocess::interprocess_semaphore Semaphore;

// Waits on the semaphore, retrying when the wait fails with EINTR. Any other
// interprocess_exception is rethrown to the caller.
inline void WaitSemaphore(Semaphore &on) {
  for (bool acquired = false; !acquired;) {
    try {
      on.wait();
      acquired = true;
    } catch (boost::interprocess::interprocess_exception &e) {
      const bool interrupted = (e.get_native_error() == EINTR);
      if (!interrupted) {
        throw;
      }
    }
  }
}
#endif // Apple
/**
* Bounded producer consumer queue safe for multiple producers and multiple
* consumers, implemented as a ring buffer of `size` slots.
* T must be default constructible and have operator=.
* The value is copied twice for Consume(T &out) or three times for Consume(),
* so larger objects should be passed via pointer.
* Strong exception guarantee if operator= throws. Undefined if semaphores
* throw.
*/
template <class T> class PCQueue {
public:
// Allocates `size` default-constructed slots; all of them start out empty.
explicit PCQueue(size_t size)
: empty_(size), used_(0), storage_(new T[size]),
end_(storage_.get() + size), produce_at_(storage_.get()),
consume_at_(storage_.get()) {}
// Add a value to the queue. Blocks while the queue is full.
void Produce(const T &val) {
// Reserve a free slot.
WaitSemaphore(empty_);
{
std::lock_guard<std::mutex> produce_lock(produce_at_mutex_);
try {
*produce_at_ = val;
} catch (...) {
// Assignment failed: give the reserved slot back before rethrowing.
empty_.post();
throw;
}
// Advance the write position, wrapping around at the end of storage_.
if (++produce_at_ == end_)
produce_at_ = storage_.get();
}
// Publish the filled slot to consumers.
used_.post();
}
// Add a value to the queue, but swap it into place. Afterwards val holds
// whatever the slot held before. Blocks while the queue is full.
void ProduceSwap(T &val) {
WaitSemaphore(empty_);
{
std::lock_guard<std::mutex> produce_lock(produce_at_mutex_);
try {
std::swap(*produce_at_, val);
} catch (...) {
// Swap failed: give the reserved slot back before rethrowing.
empty_.post();
throw;
}
if (++produce_at_ == end_)
produce_at_ = storage_.get();
}
used_.post();
}
// Consume a value, assigning it to out. Blocks while the queue is empty.
T &Consume(T &out) {
// Wait for a filled slot.
WaitSemaphore(used_);
{
std::lock_guard<std::mutex> consume_lock(consume_at_mutex_);
try {
out = *consume_at_;
} catch (...) {
// Assignment failed: leave the slot marked as filled before rethrowing.
used_.post();
throw;
}
// Advance the read position, wrapping around at the end of storage_.
if (++consume_at_ == end_)
consume_at_ = storage_.get();
}
// Hand the slot back to producers.
empty_.post();
return out;
}
// Consume a value, swapping it to out. The previous content of out is left
// in the slot. Blocks while the queue is empty.
T &ConsumeSwap(T &out) {
WaitSemaphore(used_);
{
std::lock_guard<std::mutex> consume_lock(consume_at_mutex_);
try {
std::swap(out, *consume_at_);
} catch (...) {
// Swap failed: leave the slot marked as filled before rethrowing.
used_.post();
throw;
}
if (++consume_at_ == end_)
consume_at_ = storage_.get();
}
empty_.post();
return out;
}
// Convenience version of Consume that copies the value to return.
// The other version is faster.
T Consume() {
T ret;
Consume(ret);
return ret;
}
private:
// Number of empty spaces in storage_.
Semaphore empty_;
// Number of occupied spaces in storage_.
Semaphore used_;
std::unique_ptr<T[]> storage_;
// One past the last slot of storage_.
T *const end_;
// Position of the next write in storage_; guarded by produce_at_mutex_.
T *produce_at_;
std::mutex produce_at_mutex_;
// Position of the next read from storage_; guarded by consume_at_mutex_.
T *consume_at_;
std::mutex consume_at_mutex_;
};
// One fixed-size page of the linked list that backs UnboundedSingleQueue.
template <class T> struct UnboundedPage {
  UnboundedPage() : next{nullptr} {}

  // Following page in the chain, or nullptr while this is the last page.
  UnboundedPage *next;
  // Payload slots of this page.
  T entries[1023];
};
template <class T> class UnboundedSingleQueue {
public:
UnboundedSingleQueue() : valid_(0) {
SetFilling(new UnboundedPage<T>());
SetReading(filling_);
}
void Produce(T &&val) {
if (filling_current_ == filling_end_) {
UnboundedPage<T> *next = new UnboundedPage<T>();
filling_->next = next;
SetFilling(next);
}
*(filling_current_++) = std::move(val);
valid_.post();
}
void Produce(const T &val) { Produce(T(val)); }
T &Consume(T &out) {
WaitSemaphore(valid_);
if (reading_current_ == reading_end_) {
SetReading(reading_->next);
}
out = std::move(*(reading_current_++));
return out;
}
// Warning: very much a no-guarantees race-condition-rich implementation!
// But sufficient for our specific purpose: The single thread that consumes
// is also the only one that checks Empty, and knows that it's racing.
bool Empty() const { return reading_current_ == filling_current_; }
private:
void SetFilling(UnboundedPage<T> *to) {
filling_ = to;
filling_current_ = to->entries;
filling_end_ = filling_current_ + sizeof(to->entries) / sizeof(T);
}
void SetReading(UnboundedPage<T> *to) {
reading_.reset(to);
reading_current_ = to->entries;
reading_end_ = reading_current_ + sizeof(to->entries) / sizeof(T);
}
Semaphore valid_;
UnboundedPage<T> *filling_;
std::unique_ptr<UnboundedPage<T>> reading_;
T *filling_current_;
T *filling_end_;
T *reading_current_;
T *reading_end_;
UnboundedSingleQueue(const UnboundedSingleQueue &) = delete;
UnboundedSingleQueue &operator=(const UnboundedSingleQueue &) = delete;
};
} // namespace bergamot
} // namespace marian
#endif // SRC_BERGAMOT_PCQUEUE_H_

View File

@ -0,0 +1,95 @@
#include "request.h"
#include "definitions.h"
#include "response.h"
#include "sentence_ranges.h"
#include "common/logging.h"
#include <string>
namespace marian {
namespace bergamot {
// -----------------------------------------------------------------
// Takes ownership of the source text, its segments and ranges, and of the
// promise through which the Response is delivered. counter_ starts at the
// number of segments and is decremented by processHistory().
//
// A request with no segments (e.g. empty input) never reaches
// processHistory(), so it is completed here. Otherwise the promise would never
// be fulfilled and the caller's future would block forever.
//
// Initializers are listed in member declaration order.
Request::Request(size_t Id, size_t lineNumberBegin,
                 std::vector<Ptr<Vocab const>> &vocabs, std::string &&source,
                 Segments &&segments, SentenceRanges &&sourceRanges,
                 std::promise<Response> responsePromise)
    : Id_(Id), lineNumberBegin_(lineNumberBegin),
      source_(std::move(source)), segments_(std::move(segments)),
      sourceRanges_(std::move(sourceRanges)),
      response_(std::move(responsePromise)), vocabs_(&vocabs) {
  counter_ = segments_.size();
  histories_.resize(segments_.size(), nullptr);

  if (segments_.empty()) {
    completeRequest();
  }
}
// Line number assigned to the first segment of this request.
size_t Request::lineNumberBegin() const {
  return lineNumberBegin_;
}
// Number of segments held by this request.
size_t Request::numSegments() const {
  const size_t count = segments_.size();
  return count;
}
size_t Request::segmentTokens(size_t index) const {
return (segments_[index].size());
}
// Returns a copy of the segment at `index` (no bounds check).
Segment Request::getSegment(size_t index) const {
  return segments_[index];
}
// Called concurrently by several workers, each time the translation of one
// segment is ready. Stores the history in the slot for that segment. The call
// that brings the count of outstanding segments to zero completes the
// request, which sets the value of the promise.
void Request::processHistory(size_t index, Ptr<History> history) {
  histories_[index] = history;

  const bool lastSegment = (--counter_ == 0);
  if (!lastSegment) {
    return;
  }
  completeRequest();
}
void Request::completeRequest() {
// Request no longer needs to hold the content, can transfer it to
// Response.
Response response(std::move(source_), std::move(sourceRanges_),
std::move(histories_), *vocabs_);
response_.set_value(std::move(response));
}
// Orders requests by their sequence id only: a smaller id sorts first.
bool Request::operator<(const Request &b) const {
  const bool sortsFirst = Id_ < b.Id_;
  return sortsFirst;
}
// ------------------------------------------------------------------
// Binds the view to segment `index` of `request`. The shared pointer keeps
// the request alive as long as the view exists.
RequestSentence::RequestSentence(size_t index, Ptr<Request> request)
    : index_(index),
      request_(request) {
}
// Token count of the underlying segment.
size_t RequestSentence::numTokens() const {
  const size_t tokens = request_->segmentTokens(index_);
  return tokens;
}
// Line number of this sentence: the request's first line number plus the
// sentence's index within the request.
size_t RequestSentence::lineNumber() const {
  const size_t firstLine = request_->lineNumberBegin();
  return firstLine + index_;
}
// Passes the finished history on to the owning request, tagged with this
// sentence's index.
void RequestSentence::completeSentence(Ptr<History> history) {
  Request &owner = *request_;
  owner.processHistory(index_, history);
}
// Returns a copy of the segment this view refers to.
Segment RequestSentence::getUnderlyingSegment() const {
  const Request &owner = *request_;
  return owner.getSegment(index_);
}
// Ordering used by the std::set buckets in Batcher.
//
// Sentences of the same request are ordered by their index. Sentences of
// different requests are ordered by Request::operator<, i.e. by sequence id.
// The previous code compared the shared pointers themselves, which orders by
// memory address and never consulted the request priority.
//
// If two distinct requests compare equivalent, the pointer comparison is kept
// as a tie-break so that neither sentence is treated as a duplicate by the
// set.
bool operator<(const RequestSentence &a, const RequestSentence &b) {
  if (a.request_ == b.request_) {
    return a.index_ < b.index_;
  }
  if (*a.request_ < *b.request_) {
    return true;
  }
  if (*b.request_ < *a.request_) {
    return false;
  }
  return a.request_ < b.request_;
}
// ----------------------------------------------------------------------
} // namespace bergamot
} // namespace marian

126
src/translator/request.h Normal file
View File

@ -0,0 +1,126 @@
//
// Defines:
//
// Request: holds the input blob of a text, Segments (vector<Words>) which are
// to go to the batching mechanism and alignments between the processed
// segments and the input blob (sourceTokenRanges). In addition, Request takes
// care of the barrier which fires when all the Segments in a request are done
// translating by the workers (BatchTranslator).
// TODO(jerinphilip): Extend Request with notions of Priority (sequence,
// user-given).
//
// RequestSentence: is a tuple of (index, Ptr<Request>). This provides the
// batching mechanism access to the segment within the request. The backref to
// Request allows event triggering the barrier upon completion of the last
// sentence by a worker.
#ifndef SRC_BERGAMOT_REQUEST_H_
#define SRC_BERGAMOT_REQUEST_H_
#include "definitions.h"
#include "response.h"
#include "sentence_ranges.h"
#include "common/logging.h"
#include "data/types.h"
#include "translator/beam_search.h"
#include <cassert>
#include <future>
#include <vector>
namespace marian {
namespace bergamot {
class Request {
public:
// Takes ownership of source, segments, sourceTokenRanges and the promise;
// vocabs_ is stored as a non-owning pointer.
Request(size_t Id, size_t lineNumberBegin,
std::vector<Ptr<Vocab const>> &vocabs_, std::string &&source,
Segments &&segments, SentenceRanges &&sourceTokenRanges,
std::promise<Response> responsePromise);
// Obtain the count of tokens in the segment corresponding to index. Used to
// insert sentence from multiple requests into the corresponding size bucket.
size_t segmentTokens(size_t index) const;
// Obtain number of segments in a request.
size_t numSegments() const;
// Line number assigned to the first segment of the request.
size_t lineNumberBegin() const;
// Obtains segment corresponding to index to create a batch of segments among
// several requests.
Segment getSegment(size_t index) const;
// For notions of priority among requests, used to enable std::set in
// Batcher. Compares the request ids.
bool operator<(const Request &request) const;
// Processes a history obtained after translating in a heterogeneous batch
// compiled from requests.
void processHistory(size_t index, Ptr<History> history);
// On completion of last segment, sets value of the promise.
void completeRequest();
private:
size_t Id_;
size_t lineNumberBegin_;
// Multiple translation-workers can concurrently access the same Request. The
// following atomic atomically operates on the variable holding sentences
// remaining to be translated.
std::atomic<int> counter_;
// source_ holds the source string to be translated. segments_ hold the
// sentences generated from source_ in vector<Words>. sourceRanges_ are
// string_views of the text corresponding to these words, pointing to
// sequences in source_. histories_ is a buffer which eventually stores the
// translations of each segment in the corresponding index.
std::string source_;
Segments segments_;
SentenceRanges sourceRanges_;
std::vector<Ptr<History>> histories_;
// Members above are moved into newly constructed Response on completion
// of translation of all segments. The promise below is set to this Response
// value. future to this promise is made available to the user through
// Service.
std::promise<Response> response_;
// Constructing Response requires the vocabs_ used to generate Request.
std::vector<Ptr<Vocab const>> *vocabs_;
};
class RequestSentence {
// A RequestSentence provides a view to a sentence within a Request. Existence
// of this class allows the sentences and associated information to be kept
// within Request.
public:
// Arguments: index of the sentence within the request, and the request.
RequestSentence(size_t, Ptr<Request>);
// Number of tokens in the sentence.
size_t numTokens() const;
// lineNumber in Request, used for matching marian-decoder. SentenceTuple
// requires lineNumber to be set for Corpus based batches.
size_t lineNumber() const;
// Accessor to the segment represented by the RequestSentence.
Segment getUnderlyingSegment() const;
// Forwards call to Request, checking for completion.
void completeSentence(Ptr<History> history);
// Ordering used when RequestSentences are stored in a std::set.
friend bool operator<(const RequestSentence &a, const RequestSentence &b);
private:
// Position of the sentence within request_.
size_t index_;
// Shared ownership of the request this sentence belongs to.
Ptr<Request> request_;
};
typedef std::vector<RequestSentence> RequestSentences;
} // namespace bergamot
} // namespace marian
#endif // SRC_BERGAMOT_REQUEST_H_

View File

@ -0,0 +1,98 @@
#include "response.h"
#include "sentence_ranges.h"
#include "common/logging.h"
#include "data/alignment.h"
#include <utility>
namespace marian {
namespace bergamot {
// Takes ownership of the source text, its sentence ranges and the translation
// histories. vocabs is kept as a non-owning pointer and is used later to
// decode the translation on demand.
Response::Response(std::string &&source, SentenceRanges &&sourceRanges,
                   Histories &&histories,
                   std::vector<Ptr<Vocab const>> &vocabs)
    : source_(std::move(source)),
      sourceRanges_(std::move(sourceRanges)),
      histories_(std::move(histories)),
      vocabs_(&vocabs) {
}
void Response::move(std::string &source, std::string &translation,
SentenceMappings &sentenceMappings) {
// Construct required stuff first.
constructTranslation();
constructSentenceMappings(sentenceMappings);
// Move content out.
source = std::move(source_);
translation = std::move(translation_);
// The above assignment expects source, target be moved.
// which makes the following invalid, hence required to be cleared.
sourceRanges_.clear();
targetRanges_.clear();
histories_.clear();
}
// Lazily builds translation_ and targetRanges_ from histories_. Does nothing
// when the translation has already been constructed.
void Response::constructTranslation() {
  if (translationConstructed_) {
    return;
  }

  // Reserving at least as much as source_ is a reasonable guess that avoids
  // some reallocations.
  translation_.reserve(source_.size());

  // Step 1: decode every sentence and append it to one big string, separated
  // by single spaces. Only (offset, length) pairs are recorded here, because
  // the string's storage may still move while it grows.
  std::vector<std::pair<size_t, size_t>> translationRanges;
  size_t offset{0};
  for (auto &history : histories_) {
    // TODO(jerin): Change hardcode of nBest = 1
    NBestList onebest = history->nBest(1);
    Result result = onebest[0]; // Expecting only one result;
    Words words = std::get<0>(result);
    auto targetVocab = vocabs_->back();
    std::string decoded = targetVocab->decode(words);

    // Every sentence but the first is preceded by a separator.
    const bool needsSeparator = !translationRanges.empty();
    if (needsSeparator) {
      translation_ += " ";
      ++offset;
    }

    translation_ += decoded;
    translationRanges.emplace_back(offset, decoded.size());
    offset += decoded.size();
  }

  // Step 2: the string is complete and will not reallocate any more, so the
  // recorded pairs can be converted into string_views.
  for (size_t i = 0; i < translationRanges.size(); ++i) {
    const size_t begin = translationRanges[i].first;
    const size_t length = translationRanges[i].second;
    // TODO(@jerinphilip): Currently considers target tokens as whole text.
    // Needs to be further enhanced in marian-dev to extract alignments.
    std::vector<string_view> targetMappings;
    const char *data = &translation_[begin];
    targetMappings.emplace_back(data, length);
    targetRanges_.addSentence(targetMappings);
  }

  translationConstructed_ = true;
}
// Appends one (source sentence, translated sentence) pair of string_views to
// sentenceMappings for each sentence in sourceRanges_.
void Response::constructSentenceMappings(
    Response::SentenceMappings &sentenceMappings) {
  const size_t total = sourceRanges_.numSentences();
  for (size_t idx = 0; idx < total; ++idx) {
    sentenceMappings.emplace_back(sourceRanges_.sentence(idx),
                                  targetRanges_.sentence(idx));
  }
}
} // namespace bergamot
} // namespace marian

99
src/translator/response.h Normal file
View File

@ -0,0 +1,99 @@
#ifndef SRC_BERGAMOT_RESPONSE_H_
#define SRC_BERGAMOT_RESPONSE_H_
#include "sentence_ranges.h"
#include "data/types.h"
#include "definitions.h"
#include "translator/beam_search.h"
#include <cassert>
#include <string>
#include <vector>
namespace marian {
namespace bergamot {
class Response {
  // Response is a marian internal class (not a bergamot-translator class)
  // holding source blob of text, vector of TokenRanges corresponding to each
  // sentence in the source text blob and histories obtained from translating
  // these sentences.
  //
  // This class provides an API at a higher level in comparison to History to
  // access translations and additionally use string_view manipulations to
  // recover structure in translation from source-text's structure known
  // through reference string and string_view. As many of these computations
  // are not required until invoked, they are computed as required and stored
  // in data members where it makes sense to do so (translation_,
  // targetRanges_).
  //
  // Examples of such use-cases are:
  //    translation()
  //    translationInSourceStructure() TODO(@jerinphilip)
  //    alignment(idx) TODO(@jerinphilip)
  //    sentenceMappings (for bergamot-translator)

public:
  Response(std::string &&source, SentenceRanges &&sourceRanges,
           Histories &&histories,
           // Required for constructing translation and TokenRanges within
           // translation lazily.
           std::vector<Ptr<Vocab const>> &vocabs);

  // Move constructor.
  //
  // translation_ and targetRanges_ are a lazily built cache, guarded by
  // translationConstructed_. The cache is deliberately NOT transferred:
  //  * the previous code moved translation_ but left translationConstructed_
  //    false, so the next constructTranslation() appended the whole
  //    translation a second time;
  //  * targetRanges_ holds string_views into translation_, and moving a
  //    std::string may relocate its characters (short strings are stored
  //    inline), which would leave those views dangling.
  // The new object rebuilds the cache on first use from histories_ and
  // vocabs_. Initializers follow member declaration order.
  Response(Response &&other)
      : source_(std::move(other.source_)),
        sourceRanges_(std::move(other.sourceRanges_)),
        histories_(std::move(other.histories_)), vocabs_(other.vocabs_) {}

  // Prevents CopyConstruction and CopyAssignment. sourceRanges_ is constituted
  // by string_view and copying invalidates the data member.
  Response(const Response &) = delete;
  Response &operator=(const Response &) = delete;

  typedef std::vector<std::pair<const string_view, const string_view>>
      SentenceMappings;

  // Moves source sentence into source, translated text into translation.
  // Pairs of string_views to corresponding sentences in
  // source and translation are loaded into sentenceMappings. These
  // string_views reference the new source and translation.
  //
  // Calling move() invalidates the Response object as ownership is
  // transferred. It exists so the strings can be handed to the caller without
  // copying them.
  void move(std::string &source, std::string &translation,
            SentenceMappings &sentenceMappings);

  const Histories &histories() const { return histories_; }
  const std::string &source() const { return source_; }
  // Builds the translation on first use and returns it.
  const std::string &translation() {
    constructTranslation();
    return translation_;
  }

  // A convenience function provided to return translated text placed within
  // source's structure. This is useful when the source text is a multi-line
  // paragraph or string_views extracted from structured text like HTML and
  // it's desirable to place the individual sentences in the locations of the
  // source sentences.
  // const std::string translationInSourceStructure();
  // const PendingAlignmentType alignment(size_t idx);

private:
  // Decodes histories_ into translation_ / targetRanges_ (once).
  void constructTranslation();
  // Appends one (source, target) sentence view pair per sentence.
  void constructSentenceMappings(SentenceMappings &);

  std::string source_;
  SentenceRanges sourceRanges_;
  Histories histories_;

  // Non-owning; the last vocabulary is used to decode the translation.
  std::vector<Ptr<Vocab const>> *vocabs_;
  // True once translation_ and targetRanges_ have been built.
  bool translationConstructed_{false};
  std::string translation_;
  SentenceRanges targetRanges_;
};
} // namespace bergamot
} // namespace marian
#endif // SRC_BERGAMOT_RESPONSE_H_

View File

@ -0,0 +1,46 @@
#include "sentence_ranges.h"
#include <cassert>
#include <iostream>
namespace marian {
namespace bergamot {
void SentenceRanges::addSentence(std::vector<string_view> &wordRanges) {
addSentence(std::begin(wordRanges), std::end(wordRanges));
}
// Appends the views in [begin, end) as a new sentence. The position in the
// flat storage where the sentence starts is recorded as its boundary.
void SentenceRanges::addSentence(WordIterator begin, WordIterator end) {
  const size_t sentenceStart = flatByteRanges_.size();
  flatByteRanges_.insert(flatByteRanges_.end(), begin, end);
  sentenceBeginIds_.push_back(sentenceStart);
}
// Returns a single view that spans the sentence at `index`, from the start of
// its first word to the end of its last word.
//
// The index is asserted before any storage is read. A sentence that was added
// without words yields an empty view; previously that case read the first word
// of the following sentence, or past the end of the storage.
string_view SentenceRanges::sentence(size_t index) const {
  assert(index < numSentences());

  const size_t beginId = sentenceBeginIds_[index];
  // One past the last word: the start of the next sentence, or the end of the
  // flat storage for the final sentence.
  const size_t endId = (index + 1 == numSentences())
                           ? flatByteRanges_.size()
                           : sentenceBeginIds_[index + 1];

  if (beginId == endId) {
    return string_view();
  }
  return sentenceBetween(flatByteRanges_[beginId], flatByteRanges_[endId - 1]);
}
// Builds one view that starts where firstWord starts and ends where lastWord
// ends. Both views are expected to point into the same underlying buffer.
string_view SentenceRanges::sentenceBetween(string_view firstWord,
                                            string_view lastWord) const {
  const char *start = firstWord.data();
  const char *finish = lastWord.data() + lastWord.size();
  size_t length = finish - start;
  return string_view(start, length);
}
} // namespace bergamot
} // namespace marian

View File

@ -0,0 +1,52 @@
#ifndef BERGAMOT_SENTENCE_RANGES_H_
#define BERGAMOT_SENTENCE_RANGES_H_
#include "data/types.h"
#include <cassert>
#include <vector>
namespace marian {
namespace bergamot {
class SentenceRanges {
// SentenceRanges stores string_views into a source text, with additional
// annotations to mark sentence boundaries.
//
// Given these annotations, this container provides the capability to add
// sentences and to access individual sentences.
public:
typedef std::vector<string_view>::iterator WordIterator;
// Appends all views in wordRanges as one new sentence.
void addSentence(std::vector<string_view> &wordRanges);
// Appends the views in [begin, end) as one new sentence.
void addSentence(WordIterator begin, WordIterator end);
// Removes all sentences.
void clear() {
flatByteRanges_.clear();
sentenceBeginIds_.clear();
}
// Number of sentences added so far.
size_t numSentences() const { return sentenceBeginIds_.size(); }
// Returns a string_view into the ith sentence.
string_view sentence(size_t index) const;
private:
// A flat storage for string_views. Can be words or sentences.
std::vector<string_view> flatByteRanges_;
// The container grows dynamically with addSentence. size_t marking index is
// used to ensure the sentence boundaries stay same while underlying storage
// might be changed during reallocation.
std::vector<size_t> sentenceBeginIds_;
// Utility function to extract the string starting at firstWord and ending at
// lastWord as a single string-view.
string_view sentenceBetween(string_view firstWord,
string_view lastWord) const;
};
} // namespace bergamot
} // namespace marian
#endif // BERGAMOT_SENTENCE_RANGES_H_

View File

@ -0,0 +1,53 @@
#include "sentence_splitter.h"
#include "common/cli_helper.h"
#include "common/logging.h"
#include "common/options.h"
#include <string>
namespace marian {
namespace bergamot {
// Configures the splitter from options: the split mode comes from
// "ssplit-mode", and the list of protected prefixes is loaded from
// "ssplit-prefix-file" when that option is set. Without a prefix file only a
// warning is logged.
SentenceSplitter::SentenceSplitter(marian::Ptr<marian::Options> options)
    : options_(options) {
  const std::string modeName = options_->get<std::string>("ssplit-mode", "");
  mode_ = string2splitmode(modeName);

  std::string prefixFile = options_->get<std::string>("ssplit-prefix-file", "");
  if (prefixFile.empty()) {
    LOG(warn, "Missing list of protected prefixes for sentence splitting. "
              "Set with --ssplit-prefix-file.");
    return;
  }

  // The configured path may contain environment variables.
  prefixFile = marian::cli::interpolateEnvVars(prefixFile);
  LOG(info, "Loading protected prefixes for sentence splitting from {}",
      prefixFile);
  ssplit_.load(prefixFile);
}
// Creates a SentenceStream over `input` that uses this splitter's prefix list
// and split mode. The stream refers to the characters of `input`; it does not
// copy them here.
ug::ssplit::SentenceStream
SentenceSplitter::createSentenceStream(const string_view &input) {
  // Convert marian's string_view into the std::string_view the stream takes.
  std::string_view input_converted(input.data(), input.size());
  // Return the temporary directly: wrapping it in std::move() would prevent
  // copy elision.
  return ug::ssplit::SentenceStream(input_converted, this->ssplit_, mode_);
}
// Maps the textual split mode to the ssplit enum. Unrecognised values are
// reported with a warning and treated as wrapped text.
ug::ssplit::SentenceStream::splitmode
SentenceSplitter::string2splitmode(const std::string &m) {
  using splitmode = ug::ssplit::SentenceStream::splitmode;
  // @TODO: throw Exception on error
  if (m == "sentence" || m == "Sentence") {
    return splitmode::one_sentence_per_line;
  }
  if (m == "paragraph" || m == "Paragraph") {
    return splitmode::one_paragraph_per_line;
  }

  const bool isWrappedText =
      (m == "wrapped_text" || m == "WrappedText" || m == "wrappedText");
  if (!isWrappedText) {
    LOG(warn, "Ignoring unknown text input format specification: {}.", m);
  }
  return splitmode::wrapped_text;
}
} // namespace bergamot
} // namespace marian

View File

@ -0,0 +1,31 @@
#ifndef SRC_BERGAMOT_SENTENCE_SPLITTER_H_
#define SRC_BERGAMOT_SENTENCE_SPLITTER_H_
#include "common/options.h"
#include "data/types.h"
#include "ssplit.h"
#include <string>
namespace marian {
namespace bergamot {
class SentenceSplitter {
// A wrapper around @ugermann's ssplit-cpp compiled from several places in
// mts. Constructed based on options. Used to create sentence-streams, which
// provide access to one sentence from blob of text at a time.
public:
// Reads "ssplit-mode" and "ssplit-prefix-file" from options.
explicit SentenceSplitter(Ptr<Options> options);
// Returns a stream over the sentences of input.
ug::ssplit::SentenceStream createSentenceStream(string_view const &input);
private:
// Underlying splitter holding the protected prefixes.
ug::ssplit::SentenceSplitter ssplit_;
Ptr<Options> options_;
// Split mode parsed from the "ssplit-mode" option.
ug::ssplit::SentenceStream::splitmode mode_;
// Converts the option string into a split mode; unknown strings fall back to
// wrapped_text with a warning.
ug::ssplit::SentenceStream::splitmode string2splitmode(const std::string &m);
};
} // namespace bergamot
} // namespace marian
#endif // SRC_BERGAMOT_SENTENCE_SPLITTER_H_

View File

@ -0,0 +1,70 @@
#include "service.h"
#include "batch.h"
#include "definitions.h"
#include <string>
#include <utility>
namespace marian {
namespace bergamot {
// Creates one translator and one worker thread per CPU thread requested via
// "cpu-threads". Each worker initializes its translator, then consumes batches
// from the producer-consumer queue until it receives a poison batch.
// Aborts when "cpu-threads" is 0.
Service::Service(Ptr<Options> options)
    : ServiceBase(options), numWorkers_(options->get<int>("cpu-threads")),
      pcqueue_(numWorkers_) {
  if (numWorkers_ == 0) {
    ABORT("Fatal: Attempt to create multithreaded instance with --cpu-threads "
          "0. ");
  }

  // Reserve up front: each worker keeps a reference to its translator, so
  // translators_ must not reallocate while it is being filled (assumes
  // translators_ is a std::vector, TODO confirm in service.h).
  translators_.reserve(numWorkers_);
  workers_.reserve(numWorkers_);

  for (size_t cpuId = 0; cpuId < numWorkers_; cpuId++) {
    marian::DeviceId deviceId(cpuId, DeviceType::cpu);
    translators_.emplace_back(deviceId, vocabs_, options);
    auto &translator = translators_.back();

    workers_.emplace_back([&translator, this] {
      translator.initialize();

      // Run thread mainloop
      Batch batch;
      while (true) {
        pcqueue_.ConsumeSwap(batch);
        if (batch.isPoison()) {
          return;
        } else {
          translator.translate(batch);
        }
      }
    });
  }
}
// Moves every batch the batcher can currently produce onto the
// producer-consumer queue, where the worker threads pick them up.
void Service::enqueue() {
  for (Batch batch; batcher_ >> batch;) {
    pcqueue_.ProduceSwap(batch);
  }
}
// Shuts the workers down: queues one poison batch per worker (each worker
// returns after consuming one), joins every thread, then drops the handles so
// a repeated call has nothing left to do.
void Service::stop() {
  for (size_t i = 0; i < workers_.size(); ++i) {
    Batch poison = Batch::poison();
    pcqueue_.ProduceSwap(poison);
  }

  for (std::thread &worker : workers_) {
    if (!worker.joinable()) {
      continue;
    }
    worker.join();
  }

  workers_.clear();
}
// Joins the worker threads (via stop()) before any member is destroyed.
Service::~Service() { stop(); }
} // namespace bergamot
} // namespace marian

56
src/translator/service.h Normal file
View File

@ -0,0 +1,56 @@
#ifndef SRC_BERGAMOT_SERVICE_H_
#define SRC_BERGAMOT_SERVICE_H_
#include "batch_translator.h"
#include "batcher.h"
#include "data/types.h"
#include "pcqueue.h"
#include "response.h"
#include "service_base.h"
#include "text_processor.h"
#include <queue>
#include <vector>
namespace marian {
namespace bergamot {
class Service : public ServiceBase {
// Service exposes methods to translate an incoming blob of text to the
// Consumer of bergamot API. This is the multi-threaded variant: translation
// runs on worker threads fed through a producer-consumer queue.
//
// An example use of this API looks as follows:
//
// options = ...;
// service = Service(options);
// std::string input_blob = "Hello World";
// std::future<Response>
// response = service.translate(std::move(input_blob));
// response.wait();
// Response result = response.get();
public:
// Starts one worker thread per "cpu-threads"; aborts when that option is 0.
explicit Service(Ptr<Options> options);
// Implements enqueue and stop through blocking methods: stop() sends each
// worker a poison batch and returns only after all threads are joined.
void stop() override;
// Calls stop().
~Service();
private:
// Pushes every batch the batcher yields onto pcqueue_.
void enqueue() override;
// In addition to the common members (text_processor, requestId, vocabs_,
// batcher) extends with a producer-consumer queue, vector of translator
// instances owned by service each listening to the pcqueue in separate
// threads.
// numWorkers_ must be declared before pcqueue_: the constructor sizes the
// queue from it, and members are initialized in declaration order.
size_t numWorkers_; // ORDER DEPENDENCY
PCQueue<Batch> pcqueue_; // ORDER DEPENDENCY
std::vector<std::thread> workers_;
std::vector<BatchTranslator> translators_;
};
} // namespace bergamot
} // namespace marian
#endif // SRC_BERGAMOT_SERVICE_H_

View File

@ -0,0 +1,42 @@
#include "service_base.h"
namespace marian {
namespace bergamot {
// Sets up the state shared by all services. vocabs_ is initialized before
// text_processor_ because the text processor keeps a pointer to it.
// loadVocabularies() returns by value, so its result initializes vocabs_
// directly; wrapping it in std::move would only inhibit copy elision.
ServiceBase::ServiceBase(Ptr<Options> options)
    : requestId_(0), vocabs_(loadVocabularies(options)),
      text_processor_(vocabs_, options), batcher_(options) {}
std::future<Response> ServiceBase::translate(std::string &&input) {
Segments segments;
SentenceRanges sourceRanges;
text_processor_.process(input, segments, sourceRanges);
std::promise<Response> responsePromise;
auto future = responsePromise.get_future();
Ptr<Request> request = New<Request>(
requestId_++, /* lineNumberBegin = */ 0, vocabs_, std::move(input),
std::move(segments), std::move(sourceRanges), std::move(responsePromise));
batcher_.addWholeRequest(request);
enqueue();
return future;
}
// Builds the single-threaded service: after the shared ServiceBase state, one
// CPU translator (device 0) is constructed and initialized immediately on the
// calling thread.
NonThreadedService::NonThreadedService(Ptr<Options> options)
: ServiceBase(options),
translator_(DeviceId(0, DeviceType::cpu), vocabs_, options) {
translator_.initialize();
}
// Translates every batch the batcher can currently produce, synchronously on
// the calling thread.
void NonThreadedService::enqueue() {
  for (Batch batch; batcher_ >> batch;) {
    translator_.translate(batch);
  }
}
} // namespace bergamot
} // namespace marian

View File

@ -0,0 +1,80 @@
#ifndef SRC_BERGAMOT_SERVICE_BASE_H_
#define SRC_BERGAMOT_SERVICE_BASE_H_
#include "batch_translator.h"
#include "batcher.h"
#include "data/types.h"
#include "response.h"
#include "text_processor.h"
#include <queue>
#include <vector>
namespace marian {
namespace bergamot {
// This file describes the base class ServiceBase, and a non-threaded subclass
// implementing translation functionality called NonThreadedService.
// Common base of the translation services. It owns the vocabularies, the text
// processor and the batcher; subclasses decide how batches get translated by
// implementing enqueue() and stop().
class ServiceBase {
public:
  explicit ServiceBase(Ptr<Options> options);

  // The class is used polymorphically (pure virtual stop()/enqueue()), so the
  // destructor must be virtual: destroying a derived service through a
  // ServiceBase pointer is undefined behaviour otherwise.
  virtual ~ServiceBase() = default;

  // Transfers ownership of input string to Service, returns a future containing
  // an object which provides access to translations, other features like
  // sentencemappings and (tentatively) alignments.
  std::future<Response> translate(std::string &&input);

  // Convenience accessor methods to extract these vocabulary outside service.
  // e.g: For use in decoding histories for marian-decoder replacement.
  // The source vocabulary is the first loaded vocabulary, the target the last.
  Ptr<Vocab const> sourceVocab() const { return vocabs_.front(); }
  Ptr<Vocab const> targetVocab() const { return vocabs_.back(); }

  // Wraps up any thread related destruction code.
  virtual void stop() = 0;

protected:
  // Enqueue queues a request for translation, this can be synchronous, blocking
  // or asynchronous and queued in the background.
  virtual void enqueue() = 0;

  // Incremented for every call to translate().
  size_t requestId_;
  // vocabs_ must be declared before text_processor_: the text processor is
  // constructed from (and keeps a pointer to) vocabs_.
  std::vector<Ptr<Vocab const>> vocabs_; // ORDER DEPENDENCY
  TextProcessor text_processor_;         // ORDER DEPENDENCY
  Batcher batcher_;
};
// Single-threaded service: translation happens synchronously inside
// enqueue(), on the thread that called translate().
class NonThreadedService : public ServiceBase {
public:
  explicit NonThreadedService(Ptr<Options> options);

  // No threads are owned here, so there is nothing to shut down.
  void stop() override {}

private:
  // Implements the base-class hook with a blocking loop over the batcher.
  void enqueue() override;

  // The one translator, run as part of the calling process.
  BatchTranslator translator_;
};
// Used across Services
// Loads the vocabularies listed under the "vocabs" option, in order. A file
// that appears more than once is loaded a single time and the same Vocab
// instance is shared between its positions. Aborts when fewer than two
// vocabularies (source and target) are configured.
inline std::vector<Ptr<const Vocab>> loadVocabularies(Ptr<Options> options) {
  // @TODO: parallelize vocab loading for faster startup
  const auto paths = options->get<std::vector<std::string>>("vocabs");
  // with the current setup, we need at least two vocabs: src and trg
  ABORT_IF(paths.size() < 2, "Insufficient number of vocabularies.");

  std::unordered_map<std::string, Ptr<Vocab>> loaded;
  std::vector<Ptr<Vocab const>> vocabs;
  vocabs.reserve(paths.size());

  for (size_t i = 0; i < paths.size(); ++i) {
    auto entry = loaded.find(paths[i]);
    if (entry == loaded.end()) {
      // First occurrence of this file: create and load the vocabulary.
      Ptr<Vocab> vocab = New<Vocab>(options, i);
      vocab->load(paths[i]);
      entry = loaded.emplace(paths[i], vocab).first;
    }
    vocabs.push_back(entry->second);
  }
  return vocabs;
}
} // namespace bergamot
} // namespace marian
#endif // SRC_BERGAMOT_SERVICE_BASE_H_

View File

@ -0,0 +1,69 @@
#include "text_processor.h"
#include "data/types.h"
#include "definitions.h"
#include "sentence_ranges.h"
#include "common/options.h"
#include "data/vocab.h"
#include <vector>
namespace marian {
namespace bergamot {
// Encodes `segment` with the source (first) vocabulary, without appending EOS
// and in inference mode; the byte range of each token is written to
// `wordRanges`.
Segment TextProcessor::tokenize(const string_view &segment,
                                std::vector<string_view> &wordRanges) {
  const auto &sourceVocab = vocabs_->front();
  return sourceVocab->encodeWithByteRanges(segment, wordRanges,
                                           /*addEOS=*/false,
                                           /*inference=*/true);
}
// Keeps a pointer to the caller-owned vocabularies, builds the sentence
// splitter and derives the per-segment token limit from "max-length-break".
//
// max_length_break_ is unsigned, so the option is validated as a signed value
// BEFORE the conversion: the former `max_length_break_ < 0` check could never
// fire. A value of 0 wrapped around to a huge limit, and a value of 1 produced
// a limit of 0, which makes truncate() advance by 0 and never terminate.
TextProcessor::TextProcessor(std::vector<Ptr<Vocab const>> &vocabs,
                             Ptr<Options> options)
    : vocabs_(&vocabs), sentence_splitter_(options) {
  const int maxLengthBreak = options->get<int>("max-length-break");
  ABORT_IF(maxLengthBreak < 2,
           "max-length-break must be at least 2 (one token plus EOS)");
  // One slot is reserved for the EOS token that truncate() appends to every
  // segment.
  max_length_break_ = static_cast<size_t>(maxLengthBreak - 1);
}
// Splits `query` into sentences, tokenizes each one and appends the resulting
// (possibly truncated) segments and their source byte ranges to `segments`
// and `sourceRanges`.
void TextProcessor::process(const string_view &query, Segments &segments,
                            SentenceRanges &sourceRanges) {
  auto sentenceStream = sentence_splitter_.createSentenceStream(query);

  for (std::string_view piece; sentenceStream >> piece;) {
    // Convert from std::string_view to marian's string_view type.
    marian::string_view sentence(piece.data(), piece.size());

    std::vector<string_view> wordRanges;
    Segment segment = tokenize(sentence, wordRanges);

    // There are some cases where SentencePiece or vocab returns no words
    // after normalization. Skipping them keeps empty entries out.
    if (segment.size() == 0) {
      continue;
    }

    // Break the segment into chunks of at most max_length_break_ tokens.
    truncate(segment, wordRanges, segments, sourceRanges);
  }
}
// Cuts `segment` into consecutive chunks of at most max_length_break_ tokens.
// Every chunk is appended to `segments` followed by the source EOS id, and the
// matching slice of `wordRanges` is added to `sourceRanges` as one sentence.
void TextProcessor::truncate(Segment &segment,
                             std::vector<string_view> &wordRanges,
                             Segments &segments, SentenceRanges &sourceRanges) {
  size_t offset = 0;
  while (offset < segment.size()) {
    const size_t remaining = segment.size() - offset;
    const size_t count = std::min(max_length_break_, remaining);

    auto chunkBegin = segment.begin() + offset;
    segments.emplace_back(chunkBegin, chunkBegin + count);
    segments.back().push_back(sourceEosId());

    auto rangeBegin = wordRanges.begin() + offset;
    sourceRanges.addSentence(rangeBegin, rangeBegin + count);

    offset += max_length_break_;
  }
}
} // namespace bergamot
} // namespace marian

View File

@ -0,0 +1,50 @@
#ifndef SRC_BERGAMOT_TEXT_PROCESSOR_H_
#define SRC_BERGAMOT_TEXT_PROCESSOR_H_
#include "data/types.h"
#include "data/vocab.h"
#include "definitions.h"
#include "sentence_ranges.h"
#include "sentence_splitter.h"
#include <vector>
namespace marian {
namespace bergamot {
class TextProcessor {
// TextProcessor tokenizes text with the source vocabulary it is given and
// also contains an instance of sentence-splitter based on ssplit.
//
// Used in Service to convert an incoming blob of text to a vector of
// sentences (vector of words). In addition, the ByteRanges of the
// source-tokens in unnormalized text are provided as string_views.
public:
// Stores a pointer to `vocabs` (not a copy), so the vector must outlive this
// object; also reads the "max-length-break" option.
explicit TextProcessor(std::vector<Ptr<Vocab const>> &vocabs, Ptr<Options>);
// Splits `query` into sentences, tokenizes them and appends the resulting
// segments and their byte ranges to `segments` / `sourceRanges`.
void process(const string_view &query, Segments &segments,
SentenceRanges &sourceRanges);
private:
// Tokenizes an input string, returns Words corresponding. Loads the
// corresponding byte-ranges into tokenRanges.
Segment tokenize(const string_view &input,
std::vector<string_view> &tokenRanges);
// Truncate sentence into max_input_size segments.
void truncate(Segment &sentence, std::vector<string_view> &tokenRanges,
Segments &segments, SentenceRanges &sourceRanges);
// shorthand, used only in truncate()
const Word sourceEosId() const { return vocabs_->front()->getEosId(); }
// Non-owning pointer to the vocabularies passed to the constructor.
std::vector<Ptr<Vocab const>> *vocabs_;
SentenceSplitter sentence_splitter_;
// Maximum number of tokens per segment, excluding the EOS that truncate()
// appends (the constructor stores the option value minus one).
size_t max_length_break_;
};
} // namespace bergamot
} // namespace marian
#endif // SRC_BERGAMOT_TEXT_PROCESSOR_H_

28
wasm/CMakeLists.txt Normal file
View File

@ -0,0 +1,28 @@
# Builds the WebAssembly worker: the embind binding sources linked against the
# bergamot-translator library and emitted as a .js/.wasm pair.
add_executable(bergamot-translator-worker
    bindings/TranslationModelBindings.cpp
    bindings/TranslationRequestBindings.cpp
    bindings/TranslationResultBindings.cpp
)

# This header inclusion needs to go away later as path to public headers of bergamot
# translator should be directly available from "bergamot-translator" target
target_include_directories(bergamot-translator-worker
    PRIVATE ${CMAKE_SOURCE_DIR}/src/translator
    PRIVATE ${CMAKE_SOURCE_DIR}
)

# This compile definition is required for generating binding code properly
target_compile_definitions(bergamot-translator-worker PRIVATE WASM_BINDINGS)
target_compile_options(bergamot-translator-worker PRIVATE ${WASM_COMPILE_FLAGS})

set(LINKER_FLAGS "--bind -s ASSERTIONS=0 -s DISABLE_EXCEPTION_CATCHING=1 -s FORCE_FILESYSTEM=1 -s ALLOW_MEMORY_GROWTH=1 -s NO_DYNAMIC_EXECUTION=1")

# Preload the model files only when a package directory was actually given.
# The expansion is quoted on purpose: with the unquoted form, an UNDEFINED
# PACKAGE_DIR is compared as the literal string "PACKAGE_DIR", the test passes
# and an empty path would be resolved and preloaded.
if (NOT "${PACKAGE_DIR}" STREQUAL "")
  get_filename_component(REALPATH_PACKAGE_DIR ${PACKAGE_DIR} REALPATH BASE_DIR ${CMAKE_BINARY_DIR})
  set(LINKER_FLAGS "${LINKER_FLAGS} --preload-file ${REALPATH_PACKAGE_DIR}@/")
endif()

# LINKER_FLAGS is one space-separated string; quote it so it always stays a
# single property value.
set_target_properties(bergamot-translator-worker PROPERTIES
    SUFFIX ".js"
    LINK_FLAGS "${LINKER_FLAGS}"
)

target_link_libraries(bergamot-translator-worker bergamot-translator)

65
wasm/README.md Normal file
View File

@ -0,0 +1,65 @@
## Using Bergamot Translator in JavaScript
The example file `bergamot.html` in the folder `test_page` demonstrates how to use the bergamot translator in JavaScript via a `<script>` tag.
Please note that everything below assumes that the [Bergamot project-specific model files](https://github.com/mozilla-applied-ml/bergamot-models) have been packaged into the wasm binary (using the compile instructions given in the top-level README).
### Using JS APIs
```js
// The model configuration as YAML formatted string. For available configuration options, please check: https://marian-nmt.github.io/docs/cmd/marian-decoder/
// This example captures the most relevant options: model file, vocabulary files and shortlist file
const modelConfig = "{\"models\":[\"/esen/model.esen.npz\"],\"vocabs\":[\"/esen/vocab.esen.spm\",\"/esen/vocab.esen.spm\"],\"shortlist\":[\"/esen/lex.esen.s2t\"],\"beam-size\":1}";
// Instantiate the TranslationModel
const model = new Module.TranslationModel(modelConfig);
// Instantiate the arguments of translate() API i.e. TranslationRequest and input (vector<string>)
const request = new Module.TranslationRequest();
const input = new Module.VectorString;
// Initialize the input
input.push_back("Hola"); input.push_back("Mundo");
// translate the input; the result is a vector<TranslationResult>
const result = model.translate(input, request);
// Print original and translated text from each entry of vector<TranslationResult>
for (let i = 0; i < result.size(); i++) {
console.log(' original=' + result.get(i).getOriginalText() + ', translation=' + result.get(i).getTranslatedText());
}
// Don't forget to clean up the instances
model.delete();
request.delete();
input.delete();
```
### Demo (see everything in action)
* Start the test webserver (ensure you have the latest nodejs installed)
```bash
cd test_page
bash start_server.sh
```
* Open any of the browsers below
* Firefox Nightly 87+: make sure the following prefs are enabled (about:config)
```
dom.postMessage.sharedArrayBuffer.bypassCOOP_COEP.insecure.enabled = true
javascript.options.wasm_simd = true
javascript.options.wasm_simd_wormhole = true
```
* Chrome Canary 90+: start with the following argument
```
--js-flags="--experimental-wasm-simd"
```
* Browse to the following page:
```
http://localhost:8000/bergamot.html
```
* Run some translations:
* Choose a model and press `Load Model`
* Type a sentence to be translated in the `From` textbox and press `Translate`
* See the results in the `To` and `Log` textboxes

View File

@ -0,0 +1,23 @@
/*
* TranslationModelBindings.cpp
*
* Bindings for TranslationModel class
*/
#include <emscripten/bind.h>
#include "TranslationModel.h"
using namespace emscripten;
// Binding code
// Registers TranslationModel with embind so that JavaScript can construct it
// from a string and call translate() and isAlignmentSupported() on it.
EMSCRIPTEN_BINDINGS(translation_model) {
class_<TranslationModel>("TranslationModel")
.constructor<std::string>()
.function("translate", &TranslationModel::translate)
.function("isAlignmentSupported", &TranslationModel::isAlignmentSupported)
;
// Vector types exposed to JavaScript under the names VectorString and
// VectorTranslationResult.
register_vector<std::string>("VectorString");
register_vector<TranslationResult>("VectorTranslationResult");
}

View File

@ -0,0 +1,17 @@
/*
* Bindings for TranslationRequest class
*
*/
#include <emscripten/bind.h>
#include "TranslationRequest.h"
using namespace emscripten;
// Binding code
// Registers TranslationRequest with embind; only its default constructor is
// exposed to JavaScript.
EMSCRIPTEN_BINDINGS(translation_request) {
class_<TranslationRequest>("TranslationRequest")
.constructor<>()
;
}

View File

@ -0,0 +1,20 @@
/*
* Bindings for TranslationResult class
*
*/
#include <emscripten/bind.h>
#include <vector>
#include "TranslationResult.h"
using namespace emscripten;
// Binding code
// Registers TranslationResult with embind: a three-argument constructor plus
// the getOriginalText() and getTranslatedText() accessors.
EMSCRIPTEN_BINDINGS(translation_result) {
class_<TranslationResult>("TranslationResult")
.constructor<std::string, std::string, TranslationResult::SentenceMappings>()
.function("getOriginalText", &TranslationResult::getOriginalText)
.function("getTranslatedText", &TranslationResult::getTranslatedText)
;
}

View File

@ -0,0 +1,35 @@
// Minimal static file server for the WASM test page.
// helper.js assigns the global `Helper` object used below, which is why the
// result of this require() is not kept.
require(__dirname + '/helper.js');
var http = require('http');
var express = require('express');
var app = express();
var server = http.createServer(app);
var fs = require('fs');
var url = require('url');
const nocache = require('nocache');
const cors = require('cors');
// CORS and no-cache middleware are registered for every request.
app.use(cors())
app.use(nocache());
// Any GET whose path contains a dot is treated as a file request: the MIME
// type is derived from the extension and the file is served by serveFile().
app.get('/*.*' , cors(), function(req, res) {
var options = url.parse(req.url, true);
var mime = Helper.getMime(options);
serveFile(res, options.pathname, mime);
});
function serveFile(res, pathName, mime) {
mime = mime || 'text/html';
fs.readFile(__dirname + '/' + pathName, function (err, data) {
if (err) {
res.writeHead(500, {"Content-Type": "text/plain"});
return res.end('Error loading ' + pathName + " with Error: " + err);
}
res.header('Cross-Origin-Embedder-Policy','require-corp');
res.header('Cross-Origin-Opener-Policy','same-origin');
res.writeHead(200, {"Content-Type": mime});
res.end(data);
});
}
// Port 8000 matches the URL given in the README (http://localhost:8000/bergamot.html).
server.listen(8000);
console.log('HTTP and BinaryJS server started on port 8000');

View File

@ -0,0 +1,200 @@
<!doctype html>
<html>
<head>
<!-- Empty data: URL as the icon; presumably there to stop the browser from requesting /favicon.ico (confirm). -->
<link rel="icon" href="data:,">
<!-- NOTE(review): the sample text in the "from" textarea contains accented characters. If this file is saved as UTF-8, the ISO-8859-1 declaration below makes the browser mis-decode them; confirm the file encoding. -->
<meta http-equiv="Content-Type" content="text/html;charset=ISO-8859-1">
</head>
<style>
  /* Uniform 1% margin and padding around the page and every div. */
  body, html, div {
    margin: 1%;
    padding: 1%;
  }

  /* Text areas fill their container. */
  textarea, #to, #from {
    width: 100%;
    max-width: 100%;
  }

  /* Each section is a left-floated block taking 80% of the width. */
  div {
    float: left;
    width: 80%;
  }
</style>
<body>
<div id="divradios">
<label>Choose the model to use</label>
<input type="radio" name="modellang" value="enes"/><label>English to Spanish</label>
<input type="radio" name="modellang" value="esen" checked/><label>Spanish to English</label>
<input type="button" id="load" value="Load Model"/>
</div>
<div id="divtranslation">
<label for="from">From</label>
<textarea id="from" name="from">
Una estrategia republicana para obstaculizar la reelección de Obama. Los dirigentes republicanos justificaron su política por la necesidad de luchar contra el fraude electoral.
Ahora bien, el Centro Brennan considera esto último un mito y afirma que el fraude electoral es menos frecuente en los Estados Unidos que el número de personas que mueren a causa de la caída de un rayo.
De hecho, los abogados republicanos no han encontrado más que 300 casos de fraude electoral en los Estados Unidos en diez años. Una cosa es cierta: esas nuevas disposiciones afectarán negativamente a la tasa de participación.
En ese sentido, estas medidas minarán en parte el sistema democrático americano. Al contrario de lo que ocurre en Canadá, los estados americanos son responsables de la organización de las elecciones federales en los Estados Unidos.
Y en esa misma línea una mayoría de los gobiernos americanos promulgaron, a partir de 2009, nuevas leyes que dificultaban el proceso de inscripción o de votación. Este fenómeno se ha extendido tras las elecciones de noviembre de 2010, que vieron el aumento de 675 nuevos representantes republicanos en 26 estados.
En consecuencia, durante el año 2011 se introdujeron 180 proyectos de ley que restringían el ejercicio del derecho de voto en 41 estados.
</textarea>
<br><br>
<label for="to">To</label>
<textarea id="to" name="to" readonly></textarea>
<br><br>
<input type="button" id="translate" value="Translate"/>
</div>
<div id="divlog">
<label for="log">Log:</label><br>
<textarea id="log" name="log" rows="50" cols="75"></textarea>
</div>
<script>
// `model` holds the currently loaded TranslationModel (set by loadModel).
// `request` and `input` are shadowed by the locals of the same name declared
// inside translate().
var model, request, input = undefined;
// Builds the YAML configuration for the `from` -> `to` language pair and
// (re)creates the global `model` from it, deleting any previously loaded
// model first.
// The lines between the backticks are runtime data (the YAML text handed to
// TranslationModel); their whitespace is significant and must not be
// reformatted.
const loadModel = (from, to) => {
const languagePair = `${from}${to}`;
// Vocab files are re-used in both translation directions
const vocabLanguagePair = from === "en" ? `${to}${from}` : languagePair;
// Set the Model Configuration as YAML formatted string.
// For available configuration options, please check: https://marian-nmt.github.io/docs/cmd/marian-decoder/
const modelConfig = `models:
- /${languagePair}/model.${languagePair}.npz
vocabs:
- /${vocabLanguagePair}/vocab.${vocabLanguagePair}.spm
- /${vocabLanguagePair}/vocab.${vocabLanguagePair}.spm
beam-size: 1
normalize: 1.0
word-penalty: 0
max-length-break: 128
mini-batch-words: 1024
workspace: 128
max-length-factor: 2.0
skip-cost: true
cpu-threads: 0
quiet: true
quiet-translation: true
shortlist:
- /${languagePair}/lex.${languagePair}.s2t
- 50
- 50
`;
/*
This config is not valid anymore in new APIs
mini-batch: 32
maxi-batch: 100
maxi-batch-sort: src
*/
// TODO: Use in model config when wormhole is enabled:
// gemm-precision: int8shift
// TODO: Use in model config when loading of binary models is supported and we use model.intgemm.alphas.bin:
// gemm-precision: int8shiftAlphaAll
console.debug("modelConfig: ", modelConfig);
// Instantiate the TranslationModel
// The previous instance is released explicitly before it is replaced.
if (model) model.delete();
model = new Module.TranslationModel(modelConfig);
}
/**
 * Translates the given paragraphs with the currently loaded `model`.
 *
 * @param {string[]} paragraphs - input paragraphs; blank ones are skipped.
 * @returns {string[]} the translated text of every non-blank paragraph.
 *
 * Objects created through the wasm bindings are not garbage collected, so
 * everything obtained here (request, input vector, result vector and each
 * element handle) is deleted explicitly, also when translation throws.
 */
const translate = (paragraphs) => {
  // Instantiate the arguments of translate() API i.e. TranslationRequest and input (vector<string>)
  const request = new Module.TranslationRequest();
  const input = new Module.VectorString;
  let result;

  try {
    // Initialize the input
    paragraphs.forEach(paragraph => {
      const trimmed = paragraph.trim();
      // prevent empty paragraph - it breaks the translation
      if (trimmed === "") {
        return;
      }
      input.push_back(trimmed);
    });

    // Access input (just for debugging)
    console.log('Input size=', input.size());
    /*
    for (let i = 0; i < input.size(); i++) {
      console.log(' val:' + input.get(i));
    }
    */

    // Translate the input; the result is a vector<TranslationResult>
    result = model.translate(input, request);

    // Access original and translated text from each entry of vector<TranslationResult>
    //console.log('Result size=', result.size(), ' - TimeDiff - ', (Date.now() - start)/1000);
    const translatedParagraphs = [];
    for (let i = 0; i < result.size(); i++) {
      // get() hands back a handle of its own, which must be released as well.
      const entry = result.get(i);
      translatedParagraphs.push(entry.getTranslatedText());
      entry.delete();
    }
    console.log({ translatedParagraphs });
    return translatedParagraphs;
  } finally {
    if (result) {
      result.delete();
    }
    request.delete();
    input.delete();
  }
}
// "Load Model" button: the checked radio value is a four-letter pair such as
// "esen"; the first two letters are the source language, the last two the
// target language. Loading time is written to the log box.
document.querySelector("#load").addEventListener("click", () => {
  const checkedRadio = document.querySelector('input[name="modellang"]:checked');
  const lang = checkedRadio.value;
  const [from, to] = [lang.substring(0, 2), lang.substring(2, 4)];

  const startedAt = Date.now();
  loadModel(from, to)
  const elapsedSecs = (Date.now() - startedAt) / 1000;
  log(`model ${from}${to} loaded in ${elapsedSecs} secs`);
  //log('Model Alignment:', model.isAlignmentSupported());
});
// Translates the content of the "from" textarea line by line, writes the
// result to the "to" textarea and logs the word count and throughput.
const translateCall = () => {
  const paragraphs = document.querySelector('#from').value.split("\n");

  // Count the non-empty, space-separated words over all lines.
  const countWords = (sentence) =>
    sentence.trim().split(" ").filter(word => word.trim() !== "").length;
  const wordCount = paragraphs.reduce((total, sentence) => total + countWords(sentence), 0);

  const startedAt = Date.now();
  const translatedParagraphs = translate(paragraphs);
  const secs = (Date.now() - startedAt) / 1000;

  log(`Translation of (${wordCount}) words took ${secs} secs (${Math.round(wordCount / secs)} words per second)`);
  document.querySelector('#to').value = translatedParagraphs.join("\n");
}
// Translate when the button is clicked ...
document.querySelector("#translate").addEventListener("click", () => translateCall());

// ... and whenever a key with key code 13 (Enter) is released inside the
// "from" textarea.
document.querySelector("#from").addEventListener('keyup', (event) => {
  if (event.keyCode !== 13) {
    return;
  }
  translateCall();
});
// Appends `message` and a newline to the "log" textarea.
const log = (message) => {
  const logBox = document.querySelector("#log");
  logBox.value += message + "\n";
}
// Timestamp taken when this inline script runs; used to report the time until
// preRun.
const start = Date.now();
let moduleLoadStart;
// Global `Module` object with two timing hooks. Presumably it is picked up by
// the bergamot-translator-worker.js script loaded right after this one
// (confirm against the generated glue code).
var Module = {
preRun: [function() {
log(`Time until Module.preRun: ${(Date.now() - start) / 1000} secs`);
moduleLoadStart = Date.now();
}],
onRuntimeInitialized: function() {
log(`Wasm Runtime initialized (preRun -> onRuntimeInitialized) in ${(Date.now() - moduleLoadStart) / 1000} secs`);
}
};
</script>
<script src="bergamot-translator-worker.js"></script>
</body>
</html>

40
wasm/test_page/helper.js Normal file
View File

@ -0,0 +1,40 @@
/*
* @author - Based on a file from Gist here: https://gist.github.com/1757658
*
* @modified - Mike Newell - it was on Gist so I figure I can use it
*
* @Description - Added support for a few more mime types including the new
* .ogv, .webm, and .mp4 file types for HTML5 video.
*
*/
/*
* @modified - Andre Natal - removed unused types for the purpose of this use
case
*/
// Global MIME-type helper used by the test HTTP server.
// It is assigned on globalThis explicitly, so it stays a global (the server
// reads `Helper` without importing anything) and the assignment also works
// when this file is evaluated in strict mode.
globalThis.Helper = {
  // Known file extensions (lower case, without the dot) and their MIME types.
  types: {
      "wasm" : "application/wasm"
    , "js" : "application/javascript"
    , "html" : "text/html"
    , "htm" : "text/html"
    , "ico" : "image/vnd.microsoft.icon",
  },

  // Returns the MIME type for a parsed URL `u` (only `u.pathname` is read);
  // unknown or missing extensions map to 'application/octet-stream'.
  getMime: function(u) {
    var ext = this.getExt(u.pathname).replace('.', '').toLowerCase();
    // Own-property check: a plain `types[ext]` lookup would also find
    // inherited members such as "constructor" and return a function.
    return Object.prototype.hasOwnProperty.call(this.types, ext)
      ? this.types[ext]
      : 'application/octet-stream';
  },

  // Returns everything from the last '.' of `path` onwards (dot included), or
  // '' when the path contains no dot.
  getExt: function(path) {
    var i = path.lastIndexOf('.');
    return (i < 0) ? '' : path.slice(i);
  }
};

391
wasm/test_page/package-lock.json generated Normal file
View File

@ -0,0 +1,391 @@
{
"requires": true,
"lockfileVersion": 1,
"dependencies": {
"accepts": {
"version": "1.3.7",
"resolved": "https://registry.npmjs.org/accepts/-/accepts-1.3.7.tgz",
"integrity": "sha512-Il80Qs2WjYlJIBNzNkK6KYqlVMTbZLXgHx2oT0pU/fjRHyEp+PEfEPY0R3WCwAGVOtauxh1hOxNgIf5bv7dQpA==",
"requires": {
"mime-types": "~2.1.24",
"negotiator": "0.6.2"
}
},
"array-flatten": {
"version": "1.1.1",
"resolved": "https://registry.npmjs.org/array-flatten/-/array-flatten-1.1.1.tgz",
"integrity": "sha1-ml9pkFGx5wczKPKgCJaLZOopVdI="
},
"body-parser": {
"version": "1.19.0",
"resolved": "https://registry.npmjs.org/body-parser/-/body-parser-1.19.0.tgz",
"integrity": "sha512-dhEPs72UPbDnAQJ9ZKMNTP6ptJaionhP5cBb541nXPlW60Jepo9RV/a4fX4XWW9CuFNK22krhrj1+rgzifNCsw==",
"requires": {
"bytes": "3.1.0",
"content-type": "~1.0.4",
"debug": "2.6.9",
"depd": "~1.1.2",
"http-errors": "1.7.2",
"iconv-lite": "0.4.24",
"on-finished": "~2.3.0",
"qs": "6.7.0",
"raw-body": "2.4.0",
"type-is": "~1.6.17"
}
},
"bytes": {
"version": "3.1.0",
"resolved": "https://registry.npmjs.org/bytes/-/bytes-3.1.0.tgz",
"integrity": "sha512-zauLjrfCG+xvoyaqLoV8bLVXXNGC4JqlxFCutSDWA6fJrTo2ZuvLYTqZ7aHBLZSMOopbzwv8f+wZcVzfVTI2Dg=="
},
"content-disposition": {
"version": "0.5.3",
"resolved": "https://registry.npmjs.org/content-disposition/-/content-disposition-0.5.3.tgz",
"integrity": "sha512-ExO0774ikEObIAEV9kDo50o+79VCUdEB6n6lzKgGwupcVeRlhrj3qGAfwq8G6uBJjkqLrhT0qEYFcWng8z1z0g==",
"requires": {
"safe-buffer": "5.1.2"
}
},
"content-type": {
"version": "1.0.4",
"resolved": "https://registry.npmjs.org/content-type/-/content-type-1.0.4.tgz",
"integrity": "sha512-hIP3EEPs8tB9AT1L+NUqtwOAps4mk2Zob89MWXMHjHWg9milF/j4osnnQLXBCBFBk/tvIG/tUc9mOUJiPBhPXA=="
},
"cookie": {
"version": "0.4.0",
"resolved": "https://registry.npmjs.org/cookie/-/cookie-0.4.0.tgz",
"integrity": "sha512-+Hp8fLp57wnUSt0tY0tHEXh4voZRDnoIrZPqlo3DPiI4y9lwg/jqx+1Om94/W6ZaPDOUbnjOt/99w66zk+l1Xg=="
},
"cookie-signature": {
"version": "1.0.6",
"resolved": "https://registry.npmjs.org/cookie-signature/-/cookie-signature-1.0.6.tgz",
"integrity": "sha1-4wOogrNCzD7oylE6eZmXNNqzriw="
},
"cors": {
"version": "2.8.5",
"resolved": "https://registry.npmjs.org/cors/-/cors-2.8.5.tgz",
"integrity": "sha512-KIHbLJqu73RGr/hnbrO9uBeixNGuvSQjul/jdFvS/KFSIH1hWVd1ng7zOHx+YrEfInLG7q4n6GHQ9cDtxv/P6g==",
"requires": {
"object-assign": "^4",
"vary": "^1"
}
},
"debug": {
"version": "2.6.9",
"resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz",
"integrity": "sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA==",
"requires": {
"ms": "2.0.0"
}
},
"depd": {
"version": "1.1.2",
"resolved": "https://registry.npmjs.org/depd/-/depd-1.1.2.tgz",
"integrity": "sha1-m81S4UwJd2PnSbJ0xDRu0uVgtak="
},
"destroy": {
"version": "1.0.4",
"resolved": "https://registry.npmjs.org/destroy/-/destroy-1.0.4.tgz",
"integrity": "sha1-l4hXRCxEdJ5CBmE+N5RiBYJqvYA="
},
"ee-first": {
"version": "1.1.1",
"resolved": "https://registry.npmjs.org/ee-first/-/ee-first-1.1.1.tgz",
"integrity": "sha1-WQxhFWsK4vTwJVcyoViyZrxWsh0="
},
"encodeurl": {
"version": "1.0.2",
"resolved": "https://registry.npmjs.org/encodeurl/-/encodeurl-1.0.2.tgz",
"integrity": "sha1-rT/0yG7C0CkyL1oCw6mmBslbP1k="
},
"escape-html": {
"version": "1.0.3",
"resolved": "https://registry.npmjs.org/escape-html/-/escape-html-1.0.3.tgz",
"integrity": "sha1-Aljq5NPQwJdN4cFpGI7wBR0dGYg="
},
"etag": {
"version": "1.8.1",
"resolved": "https://registry.npmjs.org/etag/-/etag-1.8.1.tgz",
"integrity": "sha1-Qa4u62XvpiJorr/qg6x9eSmbCIc="
},
"express": {
"version": "4.17.1",
"resolved": "https://registry.npmjs.org/express/-/express-4.17.1.tgz",
"integrity": "sha512-mHJ9O79RqluphRrcw2X/GTh3k9tVv8YcoyY4Kkh4WDMUYKRZUq0h1o0w2rrrxBqM7VoeUVqgb27xlEMXTnYt4g==",
"requires": {
"accepts": "~1.3.7",
"array-flatten": "1.1.1",
"body-parser": "1.19.0",
"content-disposition": "0.5.3",
"content-type": "~1.0.4",
"cookie": "0.4.0",
"cookie-signature": "1.0.6",
"debug": "2.6.9",
"depd": "~1.1.2",
"encodeurl": "~1.0.2",
"escape-html": "~1.0.3",
"etag": "~1.8.1",
"finalhandler": "~1.1.2",
"fresh": "0.5.2",
"merge-descriptors": "1.0.1",
"methods": "~1.1.2",
"on-finished": "~2.3.0",
"parseurl": "~1.3.3",
"path-to-regexp": "0.1.7",
"proxy-addr": "~2.0.5",
"qs": "6.7.0",
"range-parser": "~1.2.1",
"safe-buffer": "5.1.2",
"send": "0.17.1",
"serve-static": "1.14.1",
"setprototypeof": "1.1.1",
"statuses": "~1.5.0",
"type-is": "~1.6.18",
"utils-merge": "1.0.1",
"vary": "~1.1.2"
}
},
"finalhandler": {
"version": "1.1.2",
"resolved": "https://registry.npmjs.org/finalhandler/-/finalhandler-1.1.2.tgz",
"integrity": "sha512-aAWcW57uxVNrQZqFXjITpW3sIUQmHGG3qSb9mUah9MgMC4NeWhNOlNjXEYq3HjRAvL6arUviZGGJsBg6z0zsWA==",
"requires": {
"debug": "2.6.9",
"encodeurl": "~1.0.2",
"escape-html": "~1.0.3",
"on-finished": "~2.3.0",
"parseurl": "~1.3.3",
"statuses": "~1.5.0",
"unpipe": "~1.0.0"
}
},
"forwarded": {
"version": "0.1.2",
"resolved": "https://registry.npmjs.org/forwarded/-/forwarded-0.1.2.tgz",
"integrity": "sha1-mMI9qxF1ZXuMBXPozszZGw/xjIQ="
},
"fresh": {
"version": "0.5.2",
"resolved": "https://registry.npmjs.org/fresh/-/fresh-0.5.2.tgz",
"integrity": "sha1-PYyt2Q2XZWn6g1qx+OSyOhBWBac="
},
"http-errors": {
"version": "1.7.2",
"resolved": "https://registry.npmjs.org/http-errors/-/http-errors-1.7.2.tgz",
"integrity": "sha512-uUQBt3H/cSIVfch6i1EuPNy/YsRSOUBXTVfZ+yR7Zjez3qjBz6i9+i4zjNaoqcoFVI4lQJ5plg63TvGfRSDCRg==",
"requires": {
"depd": "~1.1.2",
"inherits": "2.0.3",
"setprototypeof": "1.1.1",
"statuses": ">= 1.5.0 < 2",
"toidentifier": "1.0.0"
}
},
"iconv-lite": {
"version": "0.4.24",
"resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.4.24.tgz",
"integrity": "sha512-v3MXnZAcvnywkTUEZomIActle7RXXeedOR31wwl7VlyoXO4Qi9arvSenNQWne1TcRwhCL1HwLI21bEqdpj8/rA==",
"requires": {
"safer-buffer": ">= 2.1.2 < 3"
}
},
"inherits": {
"version": "2.0.3",
"resolved": "https://registry.npmjs.org/inherits/-/inherits-2.0.3.tgz",
"integrity": "sha1-Yzwsg+PaQqUC9SRmAiSA9CCCYd4="
},
"ipaddr.js": {
"version": "1.9.1",
"resolved": "https://registry.npmjs.org/ipaddr.js/-/ipaddr.js-1.9.1.tgz",
"integrity": "sha512-0KI/607xoxSToH7GjN1FfSbLoU0+btTicjsQSWQlh/hZykN8KpmMf7uYwPW3R+akZ6R/w18ZlXSHBYXiYUPO3g=="
},
"media-typer": {
"version": "0.3.0",
"resolved": "https://registry.npmjs.org/media-typer/-/media-typer-0.3.0.tgz",
"integrity": "sha1-hxDXrwqmJvj/+hzgAWhUUmMlV0g="
},
"merge-descriptors": {
"version": "1.0.1",
"resolved": "https://registry.npmjs.org/merge-descriptors/-/merge-descriptors-1.0.1.tgz",
"integrity": "sha1-sAqqVW3YtEVoFQ7J0blT8/kMu2E="
},
"methods": {
"version": "1.1.2",
"resolved": "https://registry.npmjs.org/methods/-/methods-1.1.2.tgz",
"integrity": "sha1-VSmk1nZUE07cxSZmVoNbD4Ua/O4="
},
"mime": {
"version": "1.6.0",
"resolved": "https://registry.npmjs.org/mime/-/mime-1.6.0.tgz",
"integrity": "sha512-x0Vn8spI+wuJ1O6S7gnbaQg8Pxh4NNHb7KSINmEWKiPE4RKOplvijn+NkmYmmRgP68mc70j2EbeTFRsrswaQeg=="
},
"mime-db": {
"version": "1.45.0",
"resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.45.0.tgz",
"integrity": "sha512-CkqLUxUk15hofLoLyljJSrukZi8mAtgd+yE5uO4tqRZsdsAJKv0O+rFMhVDRJgozy+yG6md5KwuXhD4ocIoP+w=="
},
"mime-types": {
"version": "2.1.28",
"resolved": "https://registry.npmjs.org/mime-types/-/mime-types-2.1.28.tgz",
"integrity": "sha512-0TO2yJ5YHYr7M2zzT7gDU1tbwHxEUWBCLt0lscSNpcdAfFyJOVEpRYNS7EXVcTLNj/25QO8gulHC5JtTzSE2UQ==",
"requires": {
"mime-db": "1.45.0"
}
},
"ms": {
"version": "2.0.0",
"resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz",
"integrity": "sha1-VgiurfwAvmwpAd9fmGF4jeDVl8g="
},
"negotiator": {
"version": "0.6.2",
"resolved": "https://registry.npmjs.org/negotiator/-/negotiator-0.6.2.tgz",
"integrity": "sha512-hZXc7K2e+PgeI1eDBe/10Ard4ekbfrrqG8Ep+8Jmf4JID2bNg7NvCPOZN+kfF574pFQI7mum2AUqDidoKqcTOw=="
},
"nocache": {
"version": "2.1.0",
"resolved": "https://registry.npmjs.org/nocache/-/nocache-2.1.0.tgz",
"integrity": "sha512-0L9FvHG3nfnnmaEQPjT9xhfN4ISk0A8/2j4M37Np4mcDesJjHgEUfgPhdCyZuFI954tjokaIj/A3NdpFNdEh4Q=="
},
"object-assign": {
"version": "4.1.1",
"resolved": "https://registry.npmjs.org/object-assign/-/object-assign-4.1.1.tgz",
"integrity": "sha1-IQmtx5ZYh8/AXLvUQsrIv7s2CGM="
},
"on-finished": {
"version": "2.3.0",
"resolved": "https://registry.npmjs.org/on-finished/-/on-finished-2.3.0.tgz",
"integrity": "sha1-IPEzZIGwg811M3mSoWlxqi2QaUc=",
"requires": {
"ee-first": "1.1.1"
}
},
"parseurl": {
"version": "1.3.3",
"resolved": "https://registry.npmjs.org/parseurl/-/parseurl-1.3.3.tgz",
"integrity": "sha512-CiyeOxFT/JZyN5m0z9PfXw4SCBJ6Sygz1Dpl0wqjlhDEGGBP1GnsUVEL0p63hoG1fcj3fHynXi9NYO4nWOL+qQ=="
},
"path-to-regexp": {
"version": "0.1.7",
"resolved": "https://registry.npmjs.org/path-to-regexp/-/path-to-regexp-0.1.7.tgz",
"integrity": "sha1-32BBeABfUi8V60SQ5yR6G/qmf4w="
},
"proxy-addr": {
"version": "2.0.6",
"resolved": "https://registry.npmjs.org/proxy-addr/-/proxy-addr-2.0.6.tgz",
"integrity": "sha512-dh/frvCBVmSsDYzw6n926jv974gddhkFPfiN8hPOi30Wax25QZyZEGveluCgliBnqmuM+UJmBErbAUFIoDbjOw==",
"requires": {
"forwarded": "~0.1.2",
"ipaddr.js": "1.9.1"
}
},
"qs": {
"version": "6.7.0",
"resolved": "https://registry.npmjs.org/qs/-/qs-6.7.0.tgz",
"integrity": "sha512-VCdBRNFTX1fyE7Nb6FYoURo/SPe62QCaAyzJvUjwRaIsc+NePBEniHlvxFmmX56+HZphIGtV0XeCirBtpDrTyQ=="
},
"range-parser": {
"version": "1.2.1",
"resolved": "https://registry.npmjs.org/range-parser/-/range-parser-1.2.1.tgz",
"integrity": "sha512-Hrgsx+orqoygnmhFbKaHE6c296J+HTAQXoxEF6gNupROmmGJRoyzfG3ccAveqCBrwr/2yxQ5BVd/GTl5agOwSg=="
},
"raw-body": {
"version": "2.4.0",
"resolved": "https://registry.npmjs.org/raw-body/-/raw-body-2.4.0.tgz",
"integrity": "sha512-4Oz8DUIwdvoa5qMJelxipzi/iJIi40O5cGV1wNYp5hvZP8ZN0T+jiNkL0QepXs+EsQ9XJ8ipEDoiH70ySUJP3Q==",
"requires": {
"bytes": "3.1.0",
"http-errors": "1.7.2",
"iconv-lite": "0.4.24",
"unpipe": "1.0.0"
}
},
"safe-buffer": {
"version": "5.1.2",
"resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.1.2.tgz",
"integrity": "sha512-Gd2UZBJDkXlY7GbJxfsE8/nvKkUEU1G38c1siN6QP6a9PT9MmHB8GnpscSmMJSoF8LOIrt8ud/wPtojys4G6+g=="
},
"safer-buffer": {
"version": "2.1.2",
"resolved": "https://registry.npmjs.org/safer-buffer/-/safer-buffer-2.1.2.tgz",
"integrity": "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg=="
},
"send": {
"version": "0.17.1",
"resolved": "https://registry.npmjs.org/send/-/send-0.17.1.tgz",
"integrity": "sha512-BsVKsiGcQMFwT8UxypobUKyv7irCNRHk1T0G680vk88yf6LBByGcZJOTJCrTP2xVN6yI+XjPJcNuE3V4fT9sAg==",
"requires": {
"debug": "2.6.9",
"depd": "~1.1.2",
"destroy": "~1.0.4",
"encodeurl": "~1.0.2",
"escape-html": "~1.0.3",
"etag": "~1.8.1",
"fresh": "0.5.2",
"http-errors": "~1.7.2",
"mime": "1.6.0",
"ms": "2.1.1",
"on-finished": "~2.3.0",
"range-parser": "~1.2.1",
"statuses": "~1.5.0"
},
"dependencies": {
"ms": {
"version": "2.1.1",
"resolved": "https://registry.npmjs.org/ms/-/ms-2.1.1.tgz",
"integrity": "sha512-tgp+dl5cGk28utYktBsrFqA7HKgrhgPsg6Z/EfhWI4gl1Hwq8B/GmY/0oXZ6nF8hDVesS/FpnYaD/kOWhYQvyg=="
}
}
},
"serve-static": {
"version": "1.14.1",
"resolved": "https://registry.npmjs.org/serve-static/-/serve-static-1.14.1.tgz",
"integrity": "sha512-JMrvUwE54emCYWlTI+hGrGv5I8dEwmco/00EvkzIIsR7MqrHonbD9pO2MOfFnpFntl7ecpZs+3mW+XbQZu9QCg==",
"requires": {
"encodeurl": "~1.0.2",
"escape-html": "~1.0.3",
"parseurl": "~1.3.3",
"send": "0.17.1"
}
},
"setprototypeof": {
"version": "1.1.1",
"resolved": "https://registry.npmjs.org/setprototypeof/-/setprototypeof-1.1.1.tgz",
"integrity": "sha512-JvdAWfbXeIGaZ9cILp38HntZSFSo3mWg6xGcJJsd+d4aRMOqauag1C63dJfDw7OaMYwEbHMOxEZ1lqVRYP2OAw=="
},
"statuses": {
"version": "1.5.0",
"resolved": "https://registry.npmjs.org/statuses/-/statuses-1.5.0.tgz",
"integrity": "sha1-Fhx9rBd2Wf2YEfQ3cfqZOBR4Yow="
},
"toidentifier": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/toidentifier/-/toidentifier-1.0.0.tgz",
"integrity": "sha512-yaOH/Pk/VEhBWWTlhI+qXxDFXlejDGcQipMlyxda9nthulaxLZUNcUqFxokp0vcYnvteJln5FNQDRrxj3YcbVw=="
},
"type-is": {
"version": "1.6.18",
"resolved": "https://registry.npmjs.org/type-is/-/type-is-1.6.18.tgz",
"integrity": "sha512-TkRKr9sUTxEH8MdfuCSP7VizJyzRNMjj2J2do2Jr3Kym598JVdEksuzPQCnlFPW4ky9Q+iA+ma9BGm06XQBy8g==",
"requires": {
"media-typer": "0.3.0",
"mime-types": "~2.1.24"
}
},
"unpipe": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/unpipe/-/unpipe-1.0.0.tgz",
"integrity": "sha1-sr9O6FFKrmFltIF4KdIbLvSZBOw="
},
"utils-merge": {
"version": "1.0.1",
"resolved": "https://registry.npmjs.org/utils-merge/-/utils-merge-1.0.1.tgz",
"integrity": "sha1-n5VxD1CiZ5R7LMwSR0HBAoQn5xM="
},
"vary": {
"version": "1.1.2",
"resolved": "https://registry.npmjs.org/vary/-/vary-1.1.2.tgz",
"integrity": "sha1-IpnwLG3tMNSllhsLn3RSShj2NPw="
}
}
}

View File

@ -0,0 +1,7 @@
{
"dependencies": {
"cors": "^2.8.5",
"express": "^4.17.1",
"nocache": "^2.1.0"
}
}

View File

@ -0,0 +1,8 @@
#!/bin/bash
# Copies the WASM build artifacts next to the test page, installs the npm
# dependencies and starts the test HTTP server.
# Paths are relative, so the script has to be run from this directory.
for suffix in data js wasm worker.js; do
  cp "../../build-wasm/wasm/bergamot-translator-worker.${suffix}" .
done
npm install
node bergamot-httpserver.js