mirror of
https://github.com/browsermt/bergamot-translator.git
synced 2024-10-26 05:43:59 +03:00
Merge pull request #38 from browsermt/wasm-integration
wasm-integration -> integration
This commit is contained in:
commit
c28687fffb
4
.gitignore
vendored
4
.gitignore
vendored
@ -16,3 +16,7 @@ CTestTestfile.cmake
|
||||
_deps
|
||||
|
||||
|
||||
wasm/test_page/node_modules
|
||||
build-*
|
||||
models
|
||||
wasm/test_page/bergamot-translator-worker.*
|
||||
|
2
.gitmodules
vendored
2
.gitmodules
vendored
@ -1,6 +1,6 @@
|
||||
[submodule "3rd_party/ssplit-cpp"]
|
||||
path = 3rd_party/ssplit-cpp
|
||||
url = https://github.com/ugermann/ssplit-cpp
|
||||
url = https://github.com/abhi-agg/ssplit-cpp
|
||||
[submodule "3rd_party/marian-dev"]
|
||||
path = 3rd_party/marian-dev
|
||||
url = https://github.com/browsermt/marian-dev
|
||||
|
6
3rd_party/CMakeLists.txt
vendored
6
3rd_party/CMakeLists.txt
vendored
@ -1,4 +1,10 @@
|
||||
add_subdirectory(marian-dev)
|
||||
|
||||
if(COMPILE_WASM)
|
||||
# This is a bad way of adding compilation flags. Will be improved soon.
|
||||
add_compile_options(${WASM_COMPILE_FLAGS})
|
||||
endif(COMPILE_WASM)
|
||||
|
||||
add_subdirectory(ssplit-cpp)
|
||||
|
||||
# Add include directories for 3rd party targets to be able to use it anywhere in the
|
||||
|
2
3rd_party/marian-dev
vendored
2
3rd_party/marian-dev
vendored
@ -1 +1 @@
|
||||
Subproject commit 2f65280459737c37c270e4ad0b6d41de215d11e0
|
||||
Subproject commit 467c43a292a68b7913af2a00d353de97c1740f92
|
2
3rd_party/ssplit-cpp
vendored
2
3rd_party/ssplit-cpp
vendored
@ -1 +1 @@
|
||||
Subproject commit 01e71b4964fdc351f932a7a23cab4cb80b9698e8
|
||||
Subproject commit 432208826ee27e7b3984b53774b1a16d74256d77
|
@ -8,19 +8,69 @@ project(bergamot_translator CXX C)
|
||||
|
||||
set(CMAKE_CXX_STANDARD 17)
|
||||
set(CMAKE_CXX_STANDARD_REQUIRED ON)
|
||||
set(BUILD_ARCH native CACHE STRING "Compile for this CPU architecture.")
|
||||
|
||||
# Custom CMake options to compile marian (a 3rd party submodule) for this project
|
||||
option(COMPILE_CUDA "Compile GPU version" OFF)
|
||||
option(USE_SENTENCEPIECE "Download and compile SentencePiece" ON)
|
||||
option(USE_STATIC_LIBS "Link statically against non-system libs" ON)
|
||||
option(USE_MKL "Compile with MKL support" ON)
|
||||
include(CMakeDependentOption)
|
||||
|
||||
execute_process(COMMAND git submodule update --init --recursive --no-fetch
|
||||
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
|
||||
# Project specific cmake options
|
||||
option(COMPILE_WASM "Compile for WASM" OFF)
|
||||
option(USE_WASM_COMPATIBLE_MARIAN "Use wasm compatible marian backend" ON)
|
||||
CMAKE_DEPENDENT_OPTION(COMPILE_THREAD_VARIANT "Compile the project with thread support" OFF
|
||||
"USE_WASM_COMPATIBLE_MARIAN" ON)
|
||||
SET(PACKAGE_DIR "" CACHE STRING "Directory including all the files to be packaged (pre-loaded) in wasm builds")
|
||||
|
||||
# Set marian (3rd party submodule) cmake options to compile for this project
|
||||
SET(COMPILE_CUDA OFF CACHE BOOL "Compile GPU version")
|
||||
SET(USE_SENTENCEPIECE ON CACHE BOOL "Download and compile SentencePiece")
|
||||
SET(USE_STATIC_LIBS ON CACHE BOOL "Link statically against non-system libs")
|
||||
SET(COMPILE_LIBRARY_ONLY ON CACHE BOOL "Build only the Marian library and exclude all executables.")
|
||||
if (USE_WASM_COMPATIBLE_MARIAN)
|
||||
# If using wasm compatible marian then set following flags
|
||||
SET(USE_MKL OFF CACHE BOOL "Compile with MKL support")
|
||||
SET(COMPILE_DECODER_ONLY ON CACHE BOOL "Compile marian-decoder only")
|
||||
SET(COMPILE_WITH_PTHREADS OFF CACHE BOOL "Compile with pthreads support")
|
||||
SET(USE_WASM_COMPATIBLE_BLAS ON CACHE BOOL "Compile with a WASM compatible blas for decoder only builds")
|
||||
SET(COMPILE_WITHOUT_EXCEPTIONS ON CACHE BOOL "Compile without exceptions")
|
||||
if(COMPILE_WASM)
|
||||
# Set WORMHOLE to ON for marian whenever compiling for wasm platform
|
||||
SET(WORMHOLE ON CACHE BOOL "Use WASM wormhole in intgemm https://bugzilla.mozilla.org/show_bug.cgi?id=1672160")
|
||||
endif()
|
||||
endif()
|
||||
# Set ssplit (3rd party submodule) cmake options to compile for this project
|
||||
SET(USE_INTERNAL_PCRE2 ON CACHE BOOL "Use internal PCRE2 instead of system PCRE2")
|
||||
|
||||
# Documentation: https://cliutils.gitlab.io/modern-cmake/chapters/projects/submodule.html
|
||||
# Ensures the submodules are set correctly during a build.
|
||||
find_package(Git QUIET)
|
||||
if(GIT_FOUND AND EXISTS "${PROJECT_SOURCE_DIR}/.git")
|
||||
# Update submodules as needed
|
||||
option(GIT_SUBMODULE "Check submodules during build" ON)
|
||||
if(GIT_SUBMODULE)
|
||||
message(STATUS "Submodule update")
|
||||
execute_process(COMMAND ${GIT_EXECUTABLE} submodule update --init --recursive
|
||||
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
|
||||
RESULT_VARIABLE GIT_SUBMOD_RESULT)
|
||||
if(NOT GIT_SUBMOD_RESULT EQUAL "0")
|
||||
message(FATAL_ERROR "git submodule update --init failed with ${GIT_SUBMOD_RESULT}, please checkout submodules")
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if(NOT COMPILE_WASM)
|
||||
# Set BUILD_ARCH to native only while compiling for non wasm platform
|
||||
set(BUILD_ARCH native CACHE STRING "Compile for this CPU architecture.")
|
||||
endif()
|
||||
|
||||
if(COMPILE_WASM)
|
||||
list(APPEND WASM_COMPILE_FLAGS -pthread -O3 -g2 -fPIC -mssse3 -msimd128)
|
||||
list(APPEND WASM_COMPILE_FLAGS "SHELL:-s WASM=1" "SHELL:-s ASSERTIONS=0" "SHELL:-s DISABLE_EXCEPTION_CATCHING=1" "SHELL:-s LLD_REPORT_UNDEFINED" "SHELL:-s FORCE_FILESYSTEM=1" "SHELL:-s ALLOW_MEMORY_GROWTH=1")
|
||||
list(APPEND WASM_COMPILE_FLAGS -Wno-error=pthreads-mem-growth)
|
||||
endif(COMPILE_WASM)
|
||||
|
||||
add_subdirectory(3rd_party)
|
||||
add_subdirectory(src)
|
||||
add_subdirectory(app)
|
||||
|
||||
|
||||
if(NOT COMPILE_WASM)
|
||||
add_subdirectory(app)
|
||||
endif()
|
||||
if(COMPILE_WASM)
|
||||
add_subdirectory(wasm)
|
||||
endif(COMPILE_WASM)
|
||||
|
136
README.md
136
README.md
@ -3,58 +3,92 @@
|
||||
Bergamot translator provides a unified API for ([Marian NMT](https://marian-nmt.github.io/) framework based) neural machine translation functionality in accordance with the [Bergamot](https://browser.mt/) project that focuses on improving client-side machine translation in a web browser.
|
||||
|
||||
## Build Instructions
|
||||
```
|
||||
$ git clone https://github.com/browsermt/bergamot-translator
|
||||
$ cd bergamot-translator
|
||||
$ mkdir build
|
||||
$ cd build
|
||||
$ cmake ../
|
||||
$ make -j
|
||||
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
### Bergamot Translator
|
||||
|
||||
The build will generate the library that can be linked to any project. All the public header files are specified in `src` folder.
|
||||
|
||||
### `service-cli`
|
||||
|
||||
An executable `service-cli` is generated by the build in the `app` folder and
|
||||
provides command line interface to the underlying translator. The models
|
||||
required to run the command-line are available at
|
||||
[data.statmt.org/bergamot/models/](http://data.statmt.org/bergamot/models/).
|
||||
The following example uses an English to German tiny11 student model, available
|
||||
at:
|
||||
|
||||
* [data.statmt.org/bergamot/models/deen/ende.student.tiny11.tar.gz](http://data.statmt.org/bergamot/models/deen/ende.student.tiny11.tar.gz)
|
||||
### Build Natively
|
||||
|
||||
```bash
|
||||
MODEL_DIR=... # path to where the model-files are.
|
||||
ARGS=(
|
||||
-m $MODEL_DIR/model.intgemm.alphas.bin # Path to model file.
|
||||
--vocabs
|
||||
$MODEL_DIR/vocab.deen.spm # source-vocabulary
|
||||
$MODEL_DIR/vocab.deen.spm # target-vocabulary
|
||||
|
||||
# The following increases speed through one-best-decoding, shortlist and quantization.
|
||||
--beam-size 1 --skip-cost --shortlist $MODEL_DIR/lex.s2t.gz 50 50 --int8shiftAlphaAll
|
||||
|
||||
# Number of CPU threads (workers to launch). Parallelizes over cores and improves speed.
|
||||
--cpu-threads 4
|
||||
|
||||
# Hyperparameters of how many tokens to be accounted for in a batch and maximum tokens in a sentence.
|
||||
--max-input-sentence-tokens 1024 --max-input-tokens 1024
|
||||
|
||||
# Three modes are supported
|
||||
# - sentence: One sentence per line
|
||||
# - paragraph: One paragraph per line.
|
||||
# - wrapped text: Paragraphs are separated by empty line.
|
||||
|
||||
--ssplit-mode paragraph
|
||||
|
||||
)
|
||||
|
||||
./app/service-cli "${ARGS[@]}" < path-to-input-file
|
||||
git clone --recursive https://github.com/browsermt/bergamot-translator
|
||||
cd bergamot-translator
|
||||
mkdir build
|
||||
cd build
|
||||
cmake ../
|
||||
make -j
|
||||
```
|
||||
|
||||
### Build WASM
|
||||
#### Compiling for the first time
|
||||
|
||||
1. Download and Install Emscripten using following instructions
|
||||
* Get the latest sdk: `git clone https://github.com/emscripten-core/emsdk.git`
|
||||
* Enter the cloned directory: `cd emsdk`
|
||||
* Install the latest sdk tools: `./emsdk install latest`
|
||||
* Activate the latest sdk tools: `./emsdk activate latest`
|
||||
* Activate path variables: `source ./emsdk_env.sh`
|
||||
|
||||
2. Clone the repository and checkout the appropriate branch using these instructions:
|
||||
```bash
|
||||
git clone https://github.com/browsermt/bergamot-translator
|
||||
cd bergamot-translator
|
||||
git checkout -b wasm-integration origin/wasm-integration
|
||||
git submodule update --init --recursive
|
||||
```
|
||||
|
||||
3. Download files (only required if you want to package files in wasm binary)
|
||||
|
||||
This step is only required if you want to package files (e.g. models, vocabularies etc.)
|
||||
into wasm binary. If you don't then just skip this step.
|
||||
|
||||
The build preloads the files in Emscripten’s virtual file system.
|
||||
|
||||
If you want to package bergamot project specific models, please follow these instructions:
|
||||
```bash
|
||||
mkdir models
|
||||
git clone https://github.com/mozilla-applied-ml/bergamot-models
|
||||
cp -rf bergamot-models/* models
|
||||
gunzip models/*/*
|
||||
```
|
||||
|
||||
4. Compile
|
||||
1. Create a folder where you want to build all the artefacts (`build-wasm` in this case)
|
||||
```bash
|
||||
mkdir build-wasm
|
||||
cd build-wasm
|
||||
```
|
||||
|
||||
2. Compile the artefacts
|
||||
* If you want to package files into wasm binary then execute following commands (Replace `FILES_TO_PACKAGE` with the path of the
|
||||
directory containing the files to be packaged in wasm binary)
|
||||
|
||||
```bash
|
||||
emcmake cmake -DCOMPILE_WASM=on -DPACKAGE_DIR=FILES_TO_PACKAGE ../
|
||||
emmake make -j
|
||||
```
|
||||
e.g. If you want to package bergamot project specific models (downloaded using step 3 above) then
|
||||
replace `FILES_TO_PACKAGE` with `../models`
|
||||
|
||||
* If you don't want to package any file into wasm binary then execute following commands:
|
||||
```bash
|
||||
emcmake cmake -DCOMPILE_WASM=on ../
|
||||
emmake make -j
|
||||
```
|
||||
|
||||
The artefacts (.js and .wasm files) will be available in `wasm` folder of build directory ("build-wasm" in this case).
|
||||
|
||||
#### Recompiling
|
||||
As long as you don't update any submodule, just follow steps in `4.ii` to recompile.\
|
||||
If you update a submodule, execute following command before executing steps in `4.ii` to recompile.
|
||||
```bash
|
||||
git submodule update --init --recursive
|
||||
```
|
||||
|
||||
|
||||
## How to use
|
||||
|
||||
### Using Native version
|
||||
|
||||
The builds generate a library that can be integrated into any project. All the public header files are specified in the `src` folder.\
|
||||
A short example of how to use the APIs is provided in `app/main.cpp` file.
|
||||
|
||||
### Using WASM version
|
||||
|
||||
Please follow the `README` inside the `wasm` folder of this repository that demonstrates how to use the translator in JavaScript.
|
||||
|
@ -7,8 +7,8 @@
|
||||
#include "common/utils.h"
|
||||
#include "marian.h"
|
||||
#include "translator/parser.h"
|
||||
#include "translator/response.h"
|
||||
#include "translator/service.h"
|
||||
#include "translator/translation_result.h"
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
auto cp = marian::bergamot::createConfigParser();
|
||||
@ -19,14 +19,13 @@ int main(int argc, char *argv[]) {
|
||||
std::ostringstream std_input;
|
||||
std_input << std::cin.rdbuf();
|
||||
std::string input = std_input.str();
|
||||
using marian::bergamot::TranslationResult;
|
||||
using marian::bergamot::Response;
|
||||
|
||||
// Wait on future until TranslationResult is complete
|
||||
std::future<TranslationResult> translation_result_future =
|
||||
service.translate(std::move(input));
|
||||
translation_result_future.wait();
|
||||
const TranslationResult &translation_result = translation_result_future.get();
|
||||
std::cout << translation_result.getTranslatedText() << std::endl;
|
||||
// Wait on future until Response is complete
|
||||
std::future<Response> responseFuture = service.translate(std::move(input));
|
||||
responseFuture.wait();
|
||||
Response response = responseFuture.get();
|
||||
std::cout << response.translation() << std::endl;
|
||||
|
||||
// Stop Service.
|
||||
service.stop();
|
||||
|
@ -44,10 +44,10 @@ int main(int argc, char **argv) {
|
||||
"Prague, the University of Sheffield, University of Tartu, and "
|
||||
"Mozilla.");
|
||||
|
||||
auto futureResults = model->translate(std::move(texts), translationRequest);
|
||||
auto results = model->translate(std::move(texts), translationRequest);
|
||||
|
||||
// Resolve the future and get the actual result
|
||||
std::vector<TranslationResult> results = futureResults.get();
|
||||
//std::vector<TranslationResult> results = futureResults.get();
|
||||
|
||||
for (auto &result : results) {
|
||||
std::cout << "[original]: " << result.getOriginalText() << std::endl;
|
||||
|
@ -11,8 +11,8 @@
|
||||
#include "translator/output_collector.h"
|
||||
#include "translator/output_printer.h"
|
||||
#include "translator/parser.h"
|
||||
#include "translator/response.h"
|
||||
#include "translator/service.h"
|
||||
#include "translator/translation_result.h"
|
||||
|
||||
void marian_decoder_minimal(const marian::Histories &histories,
|
||||
marian::Ptr<marian::Vocab const> targetVocab,
|
||||
@ -46,16 +46,14 @@ int main(int argc, char *argv[]) {
|
||||
std::ostringstream std_input;
|
||||
std_input << std::cin.rdbuf();
|
||||
std::string input = std_input.str();
|
||||
using marian::bergamot::TranslationResult;
|
||||
using marian::bergamot::Response;
|
||||
|
||||
// Wait on future until TranslationResult is complete
|
||||
std::future<TranslationResult> translation_result_future =
|
||||
service.translate(std::move(input));
|
||||
translation_result_future.wait();
|
||||
const TranslationResult &translation_result = translation_result_future.get();
|
||||
// Wait on future until Response is complete
|
||||
std::future<Response> responseFuture = service.translate(std::move(input));
|
||||
responseFuture.wait();
|
||||
const Response &response = responseFuture.get();
|
||||
|
||||
marian_decoder_minimal(translation_result.getHistories(),
|
||||
service.targetVocab(), options);
|
||||
marian_decoder_minimal(response.histories(), service.targetVocab(), options);
|
||||
|
||||
LOG(info, "Total time: {:.5f}s wall", decoderTimer.elapsed());
|
||||
service.stop();
|
||||
|
85
doc/marian-integration.md
Normal file
85
doc/marian-integration.md
Normal file
@ -0,0 +1,85 @@
|
||||
# Marian Integration
|
||||
|
||||
This document summarizes the minimal build instructions developed for the
|
||||
marian-code powering bergamot-translator.
|
||||
|
||||
## Build Instructions
|
||||
|
||||
```
|
||||
$ git clone https://github.com/browsermt/bergamot-translator
|
||||
$ cd bergamot-translator
|
||||
$ mkdir build
|
||||
$ cd build
|
||||
$ cmake .. -DUSE_WASM_COMPATIBLE_MARIAN=off -DCMAKE_BUILD_TYPE=Release
|
||||
$ make -j
|
||||
```
|
||||
|
||||
|
||||
The build will generate the library that can be linked to any project. All the
|
||||
public header files are specified in `src` folder.
|
||||
|
||||
## Command line apps
|
||||
|
||||
The following executables are created by the build:
|
||||
|
||||
1. `app/service-cli`: Extends marian with the capability to work with string_views.
|
||||
`service-cli` exists to check if the underlying code, without the
|
||||
integration works or not.
|
||||
2. `app/bergamot-translator-app`: App which integrates service-cli's
|
||||
functionality into the translator agnostic API specified as part of the
|
||||
project. Integration failures are detected if the same arguments work with
|
||||
`service-cli` but do not work with `bergamot-translator-app`.
|
||||
3. `app/marian-decoder-new`: Helper executable to conveniently benchmark new
|
||||
implementation with the optimized upstream marian-decoder.
|
||||
|
||||
The models required to run the command-line are available at
|
||||
[data.statmt.org/bergamot/models/](http://data.statmt.org/bergamot/models/).
|
||||
The following example uses an English to German tiny11 student model, available
|
||||
at:
|
||||
|
||||
* [data.statmt.org/bergamot/models/deen/ende.student.tiny11.tar.gz](http://data.statmt.org/bergamot/models/deen/ende.student.tiny11.tar.gz)
|
||||
|
||||
<details>
|
||||
<summary> Example run of commandline: Click to expand </summary>
|
||||
<p>
|
||||
|
||||
```bash
|
||||
MODEL_DIR=... # path to where the model-files are.
|
||||
ARGS=(
|
||||
-m $MODEL_DIR/model.intgemm.alphas.bin # Path to model file.
|
||||
--vocabs
|
||||
$MODEL_DIR/vocab.deen.spm # source-vocabulary
|
||||
$MODEL_DIR/vocab.deen.spm # target-vocabulary
|
||||
|
||||
# The following increases speed through one-best-decoding, shortlist and quantization.
|
||||
--beam-size 1 --skip-cost --shortlist $MODEL_DIR/lex.s2t.gz 50 50 --int8shiftAlphaAll
|
||||
|
||||
# Number of CPU threads (workers to launch). Parallelizes over cores and improves speed.
|
||||
# A value of 0 allows a path with no worker thread-launches and a single-thread.
|
||||
--cpu-threads 4
|
||||
|
||||
# Maximum size of a sentence allowed. If a sentence is above this length,
|
||||
# it's broken into pieces of less than or equal to this size.
|
||||
--max-length-break 1024
|
||||
|
||||
# Maximum number of tokens that can be fit in a batch. The optimal value
|
||||
# for the parameter is dependent on hardware and can be obtained by running
|
||||
# with variations and benchmarking.
|
||||
--mini-batch-words 1024
|
||||
|
||||
# Three modes are supported
|
||||
# - sentence: One sentence per line
|
||||
# - paragraph: One paragraph per line.
|
||||
# - wrapped_text: Paragraphs are separated by empty line.
|
||||
--ssplit-mode paragraph
|
||||
)
|
||||
|
||||
./app/service-cli "${ARGS[@]}" < path-to-input-file
|
||||
./app/bergamot-translator-app "${ARGS[@]}" < path-to-input-file
|
||||
|
||||
```
|
||||
</p>
|
||||
|
||||
</summary>
|
||||
</details>
|
||||
|
@ -57,7 +57,7 @@ public:
|
||||
* entry of texts list will be moved to its corresponding TranslationResult
|
||||
* object).
|
||||
*/
|
||||
virtual std::future<std::vector<TranslationResult>>
|
||||
virtual std::vector<TranslationResult>
|
||||
translate(std::vector<std::string> &&texts, TranslationRequest request) = 0;
|
||||
|
||||
/* Check if the model can provide alignment information b/w original and
|
||||
|
@ -20,7 +20,11 @@ class TranslationResult {
|
||||
public:
|
||||
typedef std::vector<std::pair<std::string_view, std::string_view>>
|
||||
SentenceMappings;
|
||||
|
||||
#ifdef WASM_BINDINGS
|
||||
TranslationResult(const std::string &original, const std::string &translation)
|
||||
: originalText(original), translatedText(translation),
|
||||
sentenceMappings() {}
|
||||
#endif
|
||||
TranslationResult(const std::string &original, const std::string &translation,
|
||||
SentenceMappings &sentenceMappings)
|
||||
: originalText(original), translatedText(translation),
|
||||
@ -31,13 +35,29 @@ public:
|
||||
translatedText(std::move(other.translatedText)),
|
||||
sentenceMappings(std::move(other.sentenceMappings)) {}
|
||||
|
||||
#ifdef WASM_BINDINGS
|
||||
TranslationResult(const TranslationResult &other)
|
||||
: originalText(other.originalText),
|
||||
translatedText(other.translatedText),
|
||||
sentenceMappings(other.sentenceMappings) {}
|
||||
#endif
|
||||
|
||||
TranslationResult(std::string &&original, std::string &&translation,
|
||||
SentenceMappings &&sentenceMappings)
|
||||
: originalText(std::move(original)),
|
||||
translatedText(std::move(translation)),
|
||||
sentenceMappings(std::move(sentenceMappings)) {}
|
||||
|
||||
#ifndef WASM_BINDINGS
|
||||
TranslationResult &operator=(const TranslationResult &) = delete;
|
||||
#else
|
||||
TranslationResult &operator=(const TranslationResult &result) {
|
||||
originalText = result.originalText;
|
||||
translatedText = result.translatedText;
|
||||
sentenceMappings = result.sentenceMappings;
|
||||
return *this;
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Return the original text. */
|
||||
const std::string &getOriginalText() const { return originalText; }
|
||||
|
@ -10,8 +10,26 @@ add_library(bergamot-translator STATIC
|
||||
request.cpp
|
||||
service.cpp
|
||||
batcher.cpp
|
||||
translation_result.cpp
|
||||
response.cpp
|
||||
batch.cpp
|
||||
sentence_ranges.cpp
|
||||
)
|
||||
if (COMPILE_DECODER_ONLY)
|
||||
# A dirty hack because of marian's bad cmake practices
|
||||
target_compile_definitions(bergamot-translator PUBLIC DECODER_ONLY)
|
||||
endif()
|
||||
|
||||
if(COMPILE_WASM)
|
||||
# A dirty hack because of marian's bad cmake practices
|
||||
target_compile_definitions(bergamot-translator PUBLIC USE_SSE2 WASM)
|
||||
# Enable code that is required for generating JS bindings
|
||||
target_compile_definitions(bergamot-translator PRIVATE WASM_BINDINGS)
|
||||
target_compile_options(bergamot-translator PRIVATE ${WASM_COMPILE_FLAGS})
|
||||
endif(COMPILE_WASM)
|
||||
|
||||
if (COMPILE_THREAD_VARIANT)
|
||||
target_compile_definitions(bergamot-translator PRIVATE WITH_PTHREADS)
|
||||
endif()
|
||||
|
||||
target_link_libraries(bergamot-translator marian ssplit)
|
||||
|
||||
|
@ -14,6 +14,7 @@
|
||||
|
||||
// All local project includes
|
||||
#include "TranslationModel.h"
|
||||
#include "translator/parser.h"
|
||||
#include "translator/service.h"
|
||||
|
||||
std::shared_ptr<marian::Options> parseOptions(const std::string &config) {
|
||||
@ -34,7 +35,7 @@ std::shared_ptr<marian::Options> parseOptions(const std::string &config) {
|
||||
// Error: Aborted from void unhandledException() in
|
||||
// 3rd_party/marian-dev/src/common/logging.cpp:113
|
||||
|
||||
marian::ConfigParser configParser(marian::cli::mode::translation);
|
||||
marian::ConfigParser configParser = marian::bergamot::createConfigParser();
|
||||
const YAML::Node &defaultConfig = configParser.getConfig();
|
||||
|
||||
options.merge(defaultConfig);
|
||||
@ -55,7 +56,7 @@ TranslationModel::TranslationModel(const std::string &config)
|
||||
|
||||
TranslationModel::~TranslationModel() {}
|
||||
|
||||
std::future<std::vector<TranslationResult>>
|
||||
std::vector<TranslationResult>
|
||||
TranslationModel::translate(std::vector<std::string> &&texts,
|
||||
TranslationRequest request) {
|
||||
// Implementing a non-async version first. Unpleasant, but should work.
|
||||
@ -68,24 +69,30 @@ TranslationModel::translate(std::vector<std::string> &&texts,
|
||||
// Collect future as marian::bergamot::TranslationResult
|
||||
auto intermediate = service_.translate(std::move(text));
|
||||
intermediate.wait();
|
||||
auto mTranslationResult(std::move(intermediate.get()));
|
||||
auto marianResponse(std::move(intermediate.get()));
|
||||
|
||||
// This mess because marian::string_view != std::string_view
|
||||
std::string source, translation;
|
||||
marian::bergamot::Response::SentenceMappings mSentenceMappings;
|
||||
marianResponse.move(source, translation, mSentenceMappings);
|
||||
|
||||
// Convert to UnifiedAPI::TranslationResult
|
||||
TranslationResult::SentenceMappings sentenceMappings;
|
||||
for (auto &p : mTranslationResult.getSentenceMappings()) {
|
||||
for (auto &p : mSentenceMappings) {
|
||||
std::string_view src(p.first.data(), p.first.size()),
|
||||
tgt(p.second.data(), p.second.size());
|
||||
sentenceMappings.emplace_back(src, tgt);
|
||||
}
|
||||
|
||||
// In place construction.
|
||||
translationResults.emplace_back(std::move(mTranslationResult.source_),
|
||||
std::move(mTranslationResult.translation_),
|
||||
std::move(sentenceMappings));
|
||||
translationResults.emplace_back(
|
||||
std::move(source), // &&marianResponse.source_
|
||||
std::move(translation), // &&marianResponse.translation_
|
||||
std::move(sentenceMappings) // &&sentenceMappings
|
||||
);
|
||||
}
|
||||
|
||||
promise.set_value(std::move(translationResults));
|
||||
return future;
|
||||
return translationResults;
|
||||
}
|
||||
|
||||
bool TranslationModel::isAlignmentSupported() const { return false; }
|
||||
|
@ -24,7 +24,8 @@
|
||||
*/
|
||||
class TranslationModel : public AbstractTranslationModel {
|
||||
public:
|
||||
/* Construct the model using the model configuration options as yaml-formatted string
|
||||
/* Construct the model using the model configuration options as yaml-formatted
|
||||
* string
|
||||
*/
|
||||
TranslationModel(const std::string &config);
|
||||
|
||||
@ -54,7 +55,7 @@ public:
|
||||
* entry of texts list will be moved to its corresponding TranslationResult
|
||||
* object).
|
||||
*/
|
||||
std::future<std::vector<TranslationResult>>
|
||||
std::vector<TranslationResult>
|
||||
translate(std::vector<std::string> &&texts,
|
||||
TranslationRequest request) override;
|
||||
|
||||
|
28
src/translator/batch.cpp
Normal file
28
src/translator/batch.cpp
Normal file
@ -0,0 +1,28 @@
|
||||
#include "batch.h"
|
||||
#include "request.h"
|
||||
|
||||
namespace marian {
|
||||
namespace bergamot {
|
||||
|
||||
// Logs summary statistics for this batch at info level: the total number of
// tokens across all sentences, the token count of the longest sentence, and
// the number of sentences held.
void Batch::log() {
|
||||
size_t numTokens{0}, maxLength{0};
|
||||
for (auto &sentence : sentences_) {
|
||||
numTokens += sentence.numTokens();
|
||||
// numTokens() is cast to size_t so both std::max arguments share a type.
maxLength = std::max(maxLength, static_cast<size_t>(sentence.numTokens()));
|
||||
}
|
||||
|
||||
LOG(info, "Batch(tokens={}, max-length={}, sentences_={})", numTokens,
|
||||
maxLength, sentences_.size());
|
||||
}
|
||||
|
||||
// Appends a copy of `sentence` to the end of this batch's sentence list.
void Batch::add(const RequestSentence &sentence) {
|
||||
sentences_.push_back(sentence);
|
||||
}
|
||||
|
||||
// Hands each translated history back to its sentence: the i-th entry of
// `histories` is passed to completeSentence() on the i-th sentence of the batch.
// NOTE(review): histories is indexed without a bounds check, so this assumes
// it holds at least sentences_.size() entries in batch order - confirm at the
// call site.
void Batch::completeBatch(const Histories &histories) {
|
||||
for (size_t i = 0; i < sentences_.size(); i++) {
|
||||
sentences_[i].completeSentence(histories[i]);
|
||||
}
|
||||
}
|
||||
} // namespace bergamot
|
||||
} // namespace marian
|
52
src/translator/batch.h
Normal file
52
src/translator/batch.h
Normal file
@ -0,0 +1,52 @@
|
||||
#ifndef SRC_BERGAMOT_BATCH_H
|
||||
#define SRC_BERGAMOT_BATCH_H
|
||||
|
||||
#include "request.h"
|
||||
#include "translator/beam_search.h"
|
||||
|
||||
namespace marian {
|
||||
namespace bergamot {
|
||||
|
||||
// A Batch is a collection of RequestSentence objects plus a "poison" flag.
// A poison batch carries no work; it is a sentinel value that a consumer can
// test for with isPoison().
class Batch {
|
||||
public:
|
||||
// Constructs an empty, non-poison batch.
Batch() {}
|
||||
// Removes all sentences from the batch; the poison flag is left untouched.
void clear() { sentences_.clear(); }
|
||||
|
||||
// Methods to construct and determine poison.
|
||||
// Returns a new empty batch whose poison flag is set.
static Batch poison() {
|
||||
Batch batch;
|
||||
batch.poison_ = true;
|
||||
return batch;
|
||||
}
|
||||
|
||||
// True if this batch was created through poison().
bool isPoison() const { return poison_; }
|
||||
|
||||
// Number of sentences currently held in the batch.
size_t size() const { return sentences_.size(); }
|
||||
|
||||
// Appends a sentence to the batch.
void add(const RequestSentence &sentence);
|
||||
|
||||
// Accessors to read from a Batch. For use in BatchTranslator (consumer on a
|
||||
// PCQueue holding batches).
|
||||
//
|
||||
// sentences() are used to access sentences to construct marian internal
|
||||
// batch.
|
||||
const RequestSentences &sentences() { return sentences_; }
|
||||
|
||||
// On obtaining Histories after translating a batch, completeBatch can be
|
||||
// called with Histories, which forwards the call to Request through
|
||||
// RequestSentence and triggers completion, by setting the promised value to
|
||||
// the future given to client.
|
||||
void completeBatch(const Histories &histories);
|
||||
|
||||
// Convenience function to log batch-statistics. numTokens, max-length.
|
||||
void log();
|
||||
|
||||
private:
|
||||
// Sentinel flag; false for ordinary batches, true only for poison().
bool poison_{false};
|
||||
// Sentences that make up this batch, in insertion order.
RequestSentences sentences_;
|
||||
};
|
||||
|
||||
} // namespace bergamot
|
||||
} // namespace marian
|
||||
|
||||
#endif // SRC_BERGAMOT_BATCH_H
|
@ -1,4 +1,5 @@
|
||||
#include "batch_translator.h"
|
||||
#include "batch.h"
|
||||
#include "common/logging.h"
|
||||
#include "data/corpus.h"
|
||||
#include "data/text_input.h"
|
||||
@ -8,15 +9,12 @@ namespace marian {
|
||||
namespace bergamot {
|
||||
|
||||
BatchTranslator::BatchTranslator(DeviceId const device,
|
||||
PCQueue<PCItem> &pcqueue,
|
||||
std::vector<Ptr<Vocab const>> &vocabs,
|
||||
Ptr<Options> options)
|
||||
: device_(device), options_(options), pcqueue_(&pcqueue), vocabs_(&vocabs) {
|
||||
: device_(device), options_(options), vocabs_(&vocabs) {}
|
||||
|
||||
thread_ = std::thread([this] { this->mainloop(); });
|
||||
}
|
||||
|
||||
void BatchTranslator::initGraph() {
|
||||
void BatchTranslator::initialize() {
|
||||
// Initializes the graph.
|
||||
if (options_->hasAndNotEmpty("shortlist")) {
|
||||
int srcIdx = 0, trgIdx = 1;
|
||||
bool shared_vcb = vocabs_->front() == vocabs_->back();
|
||||
@ -38,15 +36,14 @@ void BatchTranslator::initGraph() {
|
||||
scorer->setShortlistGenerator(slgen_);
|
||||
}
|
||||
}
|
||||
|
||||
graph_->forward();
|
||||
}
|
||||
|
||||
void BatchTranslator::translate(RequestSentences &requestSentences,
|
||||
Histories &histories) {
|
||||
void BatchTranslator::translate(Batch &batch) {
|
||||
std::vector<data::SentenceTuple> batchVector;
|
||||
|
||||
for (auto &sentence : requestSentences) {
|
||||
auto &sentences = batch.sentences();
|
||||
for (auto &sentence : sentences) {
|
||||
data::SentenceTuple sentence_tuple(sentence.lineNumber());
|
||||
Segment segment = sentence.getUnderlyingSegment();
|
||||
sentence_tuple.push_back(segment);
|
||||
@ -89,35 +86,32 @@ void BatchTranslator::translate(RequestSentences &requestSentences,
|
||||
for (size_t j = 0; j < maxDims.size(); ++j)
|
||||
subBatches[j]->setWords(words[j]);
|
||||
|
||||
auto batch = Ptr<CorpusBatch>(new CorpusBatch(subBatches));
|
||||
batch->setSentenceIds(sentenceIds);
|
||||
auto corpus_batch = Ptr<CorpusBatch>(new CorpusBatch(subBatches));
|
||||
corpus_batch->setSentenceIds(sentenceIds);
|
||||
|
||||
auto trgVocab = vocabs_->back();
|
||||
auto search = New<BeamSearch>(options_, scorers_, trgVocab);
|
||||
|
||||
histories = std::move(search->search(graph_, batch));
|
||||
auto histories = std::move(search->search(graph_, corpus_batch));
|
||||
batch.completeBatch(histories);
|
||||
}
|
||||
|
||||
void BatchTranslator::mainloop() {
|
||||
initGraph();
|
||||
#ifdef WITH_PTHREADS
|
||||
|
||||
PCItem pcitem;
|
||||
void BatchTranslator::consumeFrom(PCQueue<Batch> &pcqueue) {
|
||||
Batch batch;
|
||||
Histories histories;
|
||||
|
||||
while (true) {
|
||||
pcqueue_->ConsumeSwap(pcitem);
|
||||
if (pcitem.isPoison()) {
|
||||
pcqueue.ConsumeSwap(batch);
|
||||
if (batch.isPoison()) {
|
||||
return;
|
||||
} else {
|
||||
translate(pcitem.sentences, histories);
|
||||
for (int i = 0; i < pcitem.sentences.size(); i++) {
|
||||
pcitem.sentences[i].completeSentence(histories[i]);
|
||||
}
|
||||
translate(batch);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void BatchTranslator::join() { thread_.join(); }
|
||||
#endif
|
||||
|
||||
} // namespace bergamot
|
||||
} // namespace marian
|
||||
|
@ -4,14 +4,18 @@
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "batch.h"
|
||||
#include "common/utils.h"
|
||||
#include "data/shortlist.h"
|
||||
#include "definitions.h"
|
||||
#include "pcqueue.h"
|
||||
#include "request.h"
|
||||
#include "translator/history.h"
|
||||
#include "translator/scorers.h"
|
||||
|
||||
#ifdef WITH_PTHREADS
|
||||
#include "pcqueue.h"
|
||||
#endif
|
||||
|
||||
namespace marian {
|
||||
namespace bergamot {
|
||||
|
||||
@ -22,29 +26,27 @@ class BatchTranslator {
|
||||
// shut down in Service which calls join() on the threads.
|
||||
|
||||
public:
|
||||
BatchTranslator(DeviceId const device, PCQueue<PCItem> &pcqueue,
|
||||
std::vector<Ptr<Vocab const>> &vocabs, Ptr<Options> options);
|
||||
void join();
|
||||
BatchTranslator(DeviceId const device, std::vector<Ptr<Vocab const>> &vocabs,
|
||||
Ptr<Options> options);
|
||||
|
||||
// convenience function for logging. TODO(jerin)
|
||||
std::string _identifier() { return "worker" + std::to_string(device_.no); }
|
||||
void translate(Batch &batch);
|
||||
void initialize();
|
||||
|
||||
#ifdef WITH_PTHREADS
|
||||
void consumeFrom(PCQueue<Batch> &pcqueue);
|
||||
#endif
|
||||
|
||||
private:
|
||||
void initGraph();
|
||||
void translate(RequestSentences &requestSentences, Histories &histories);
|
||||
void mainloop();
|
||||
|
||||
Ptr<Options> options_;
|
||||
|
||||
DeviceId device_;
|
||||
std::vector<Ptr<Vocab const>> *vocabs_;
|
||||
Ptr<ExpressionGraph> graph_;
|
||||
std::vector<Ptr<Scorer>> scorers_;
|
||||
Ptr<data::ShortlistGenerator const> slgen_;
|
||||
|
||||
PCQueue<PCItem> *pcqueue_;
|
||||
std::thread thread_;
|
||||
};
|
||||
|
||||
} // namespace bergamot
|
||||
} // namespace marian
|
||||
|
||||
|
@ -1,4 +1,5 @@
|
||||
#include "batcher.h"
|
||||
#include "batch.h"
|
||||
#include "common/logging.h"
|
||||
#include <cassert>
|
||||
|
||||
@ -6,49 +7,64 @@ namespace marian {
|
||||
namespace bergamot {
|
||||
|
||||
Batcher::Batcher(Ptr<Options> options) {
|
||||
max_input_tokens_ = options->get<int>("max-input-tokens");
|
||||
bucket_.resize(options->get<int>("max-input-sentence-tokens") + 1);
|
||||
ABORT_IF(
|
||||
max_input_tokens_ < bucket_.size() - 1,
|
||||
"max-input-tokens cannot be less than than max-input-sentence-tokens, "
|
||||
"batcher fail");
|
||||
miniBatchWords = options->get<int>("mini-batch-words");
|
||||
bucket_.resize(options->get<int>("max-length-break") + 1);
|
||||
ABORT_IF(bucket_.size() - 1 > miniBatchWords,
|
||||
"Fatal: max-length-break > mini-batch-words will lead to sentences "
|
||||
"longer than what can fit in a batch.");
|
||||
}
|
||||
|
||||
void Batcher::addSentenceWithPriority(RequestSentence &sentence) {
|
||||
int bucket_id = sentence.numTokens();
|
||||
size_t bucket_id = sentence.numTokens();
|
||||
assert(bucket_id < bucket_.size());
|
||||
bucket_[bucket_id].insert(sentence);
|
||||
}
|
||||
|
||||
void Batcher::cleaveBatch(RequestSentences &sentences) {
|
||||
bool Batcher::operator>>(Batch &batch) { return cleaveBatch(batch); }
|
||||
|
||||
bool Batcher::cleaveBatch(Batch &batch) {
|
||||
// For now simply iterates on buckets and converts batches greedily. This
|
||||
// has to be enhanced with optimizing over priority. The baseline
|
||||
// implementation should at least be as fast as marian's maxi-batch with full
|
||||
// corpus size as maxi-batch size.
|
||||
batch.clear();
|
||||
size_t paddedBatchSize = 0;
|
||||
|
||||
int segments_added = 0;
|
||||
int current_input_tokens = 0;
|
||||
int padded_batch_size = 0;
|
||||
int prev_padded_batch_size;
|
||||
|
||||
for (int i = 0; i < bucket_.size(); i++) {
|
||||
auto p = bucket_[i].begin();
|
||||
while (p != bucket_[i].end()) {
|
||||
padded_batch_size = (segments_added + 1) * i;
|
||||
if (padded_batch_size <= max_input_tokens_) {
|
||||
auto q = p;
|
||||
++p;
|
||||
current_input_tokens += i;
|
||||
sentences.push_back(*q);
|
||||
++segments_added;
|
||||
bucket_[i].erase(q);
|
||||
prev_padded_batch_size = padded_batch_size;
|
||||
for (size_t length = 0; length < bucket_.size(); length++) {
|
||||
auto p = bucket_[length].begin();
|
||||
while (p != bucket_[length].end()) {
|
||||
paddedBatchSize = (batch.size() + 1) * length;
|
||||
if (paddedBatchSize <= miniBatchWords) {
|
||||
auto q = p++;
|
||||
batch.add(*q);
|
||||
bucket_[length].erase(q);
|
||||
} else {
|
||||
return;
|
||||
// Check if elements exist
|
||||
assert(batch.size() > 0);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bool isValidBatch = batch.size() > 0;
|
||||
return isValidBatch;
|
||||
}
|
||||
|
||||
void Batcher::addWholeRequest(Ptr<Request> request) {
|
||||
for (size_t i = 0; i < request->numSegments(); i++) {
|
||||
RequestSentence requestSentence(i, request);
|
||||
addSentenceWithPriority(requestSentence);
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef WITH_PTHREADS
|
||||
void Batcher::produceTo(PCQueue<Batch> &pcqueue) {
|
||||
Batch batch;
|
||||
while (cleaveBatch(batch)) {
|
||||
pcqueue.ProduceSwap(batch);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
} // namespace bergamot
|
||||
} // namespace marian
|
||||
|
@ -1,11 +1,16 @@
|
||||
#ifndef SRC_BERGAMOT_BATCHER_H_
|
||||
#define SRC_BERGAMOT_BATCHER_H_
|
||||
|
||||
#include "batch.h"
|
||||
#include "common/options.h"
|
||||
#include "data/corpus_base.h"
|
||||
#include "definitions.h"
|
||||
#include "request.h"
|
||||
|
||||
#ifdef WITH_PTHREADS
|
||||
#include "pcqueue.h"
|
||||
#endif
|
||||
|
||||
#include <set>
|
||||
#include <vector>
|
||||
|
||||
@ -19,14 +24,20 @@ public:
|
||||
// sentence. This method inserts the sentence into the internal data-structure
|
||||
// which maintains priority among sentences from multiple concurrent requests.
|
||||
void addSentenceWithPriority(RequestSentence &sentence);
|
||||
void addWholeRequest(Ptr<Request> request);
|
||||
#ifdef WITH_PTHREADS
|
||||
void produceTo(PCQueue<Batch> &pcqueue);
|
||||
#endif
|
||||
|
||||
// Loads sentences with sentences compiled from (tentatively) multiple
|
||||
// requests optimizing for both padding and priority.
|
||||
void cleaveBatch(RequestSentences &sentences);
|
||||
bool cleaveBatch(Batch &batch);
|
||||
bool operator>>(Batch &batch); // alias
|
||||
|
||||
private:
|
||||
unsigned int max_input_tokens_;
|
||||
size_t miniBatchWords;
|
||||
std::vector<std::set<RequestSentence>> bucket_;
|
||||
size_t batchNumber_{0};
|
||||
};
|
||||
|
||||
} // namespace bergamot
|
||||
|
@ -5,7 +5,8 @@
|
||||
|
||||
namespace marian {
|
||||
namespace bergamot {
|
||||
marian::ConfigParser createConfigParser() {
|
||||
|
||||
inline marian::ConfigParser createConfigParser() {
|
||||
marian::ConfigParser cp(marian::cli::mode::translation);
|
||||
cp.addOption<std::string>(
|
||||
"--ssplit-prefix-file", "Bergamot Options",
|
||||
@ -15,14 +16,9 @@ marian::ConfigParser createConfigParser() {
|
||||
"[paragraph, sentence, wrapped_text]", "paragraph");
|
||||
|
||||
cp.addOption<int>(
|
||||
"--max-input-sentence-tokens", "Bergamot Options",
|
||||
"--max-length-break", "Bergamot Options",
|
||||
"Maximum input tokens to be processed in a single sentence.", 128);
|
||||
|
||||
cp.addOption<int>("--max-input-tokens", "Bergamot Options",
|
||||
"Maximum input tokens in a batch. control for"
|
||||
"Bergamot Queue",
|
||||
1024);
|
||||
|
||||
return cp;
|
||||
}
|
||||
|
||||
|
@ -1,7 +1,7 @@
|
||||
#include "request.h"
|
||||
|
||||
#include "definitions.h"
|
||||
#include "translation_result.h"
|
||||
#include "response.h"
|
||||
#include "sentence_ranges.h"
|
||||
|
||||
#include "common/logging.h"
|
||||
|
||||
@ -10,15 +10,15 @@
|
||||
namespace marian {
|
||||
namespace bergamot {
|
||||
|
||||
Request::Request(unsigned int Id, int lineNumberBegin,
|
||||
// -----------------------------------------------------------------
|
||||
Request::Request(size_t Id, size_t lineNumberBegin,
|
||||
std::vector<Ptr<Vocab const>> &vocabs, std::string &&source,
|
||||
Segments &&segments,
|
||||
std::vector<TokenRanges> &&sourceAlignments,
|
||||
std::promise<TranslationResult> translationResultPromise)
|
||||
Segments &&segments, SentenceRanges &&sourceRanges,
|
||||
std::promise<Response> responsePromise)
|
||||
: Id_(Id), lineNumberBegin_(lineNumberBegin), vocabs_(&vocabs),
|
||||
source_(std::move(source)), segments_(std::move(segments)),
|
||||
sourceAlignments_(std::move(sourceAlignments)),
|
||||
response_(std::move(translationResultPromise)) {
|
||||
sourceRanges_(std::move(sourceRanges)),
|
||||
response_(std::move(responsePromise)) {
|
||||
|
||||
counter_ = segments_.size();
|
||||
histories_.resize(segments_.size(), nullptr);
|
||||
@ -47,11 +47,10 @@ void Request::processHistory(size_t index, Ptr<History> history) {
|
||||
|
||||
void Request::completeRequest() {
|
||||
// Request no longer needs to hold the content, can transfer it to
|
||||
// TranslationResult.
|
||||
TranslationResult translation_result(std::move(source_),
|
||||
std::move(sourceAlignments_),
|
||||
std::move(histories_), *vocabs_);
|
||||
response_.set_value(std::move(translation_result));
|
||||
// Response.
|
||||
Response response(std::move(source_), std::move(sourceRanges_),
|
||||
std::move(histories_), *vocabs_);
|
||||
response_.set_value(std::move(response));
|
||||
}
|
||||
|
||||
bool Request::operator<(const Request &b) const {
|
||||
@ -59,6 +58,8 @@ bool Request::operator<(const Request &b) const {
|
||||
return Id_ < b.Id_;
|
||||
}
|
||||
|
||||
// ------------------------------------------------------------------
|
||||
|
||||
RequestSentence::RequestSentence(size_t index, Ptr<Request> request)
|
||||
: index_(index), request_(request) {}
|
||||
|
||||
@ -88,5 +89,7 @@ bool operator<(const RequestSentence &a, const RequestSentence &b) {
|
||||
return a.request_ < b.request_;
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
|
||||
} // namespace bergamot
|
||||
} // namespace marian
|
||||
|
@ -3,30 +3,30 @@
|
||||
//
|
||||
// Request: holds the input blob of a text, Segments (vector<Words>) which are
|
||||
// to go to the batching mechanism and alignments between the processed
|
||||
// segments and the input blob (sourceAlignments). In addition, Request takes
|
||||
// segments and the input blob (sourceTokenRanges). In addition, Request takes
|
||||
// care of the barrier which fires when all the Segments in a request are done
|
||||
// translating by the workers (BatchTranslator). Request is to be extended with
|
||||
// notions of Priority (sequence, user-given).
|
||||
// translating by the workers (BatchTranslator).
|
||||
// TODO(jerinphilip): Extend Request with notions of Priority (sequence,
|
||||
// user-given).
|
||||
//
|
||||
// RequestSentence: is a tuple of (index, Request*). This provides the
|
||||
// RequestSentence: is a tuple of (index, Ptr<Request>). This provides the
|
||||
// batching mechanism access to the segment within the request. The backref to
|
||||
// Request allows event triggering the barrier upon completion of the last
|
||||
// sentence by a worker.
|
||||
//
|
||||
// PCItem: is a vector of RequestSentences and a batchNumber, which is what the
|
||||
// PCQueue holds. The batches are constructed from segments returned by a
|
||||
// RequestSentence. Can be enhanced with paddingSize, countTokens eventually for
|
||||
// logging.
|
||||
|
||||
#ifndef SRC_BERGAMOT_REQUEST_H_
|
||||
#define SRC_BERGAMOT_REQUEST_H_
|
||||
|
||||
#include "definitions.h"
|
||||
#include "translation_result.h"
|
||||
#include "response.h"
|
||||
#include "sentence_ranges.h"
|
||||
|
||||
#include "common/logging.h"
|
||||
#include "data/types.h"
|
||||
#include "translator/beam_search.h"
|
||||
|
||||
#include <cassert>
|
||||
|
||||
#include <future>
|
||||
#include <vector>
|
||||
|
||||
@ -34,24 +34,11 @@ namespace marian {
|
||||
namespace bergamot {
|
||||
|
||||
class Request {
|
||||
private:
|
||||
unsigned int Id_;
|
||||
int lineNumberBegin_;
|
||||
std::string source_;
|
||||
std::atomic<int> counter_;
|
||||
std::vector<Ptr<Vocab const>> *vocabs_;
|
||||
|
||||
Segments segments_;
|
||||
std::vector<TokenRanges> sourceAlignments_;
|
||||
std::vector<Ptr<History>> histories_;
|
||||
|
||||
std::promise<TranslationResult> response_;
|
||||
|
||||
public:
|
||||
Request(unsigned int Id, int lineNumberBegin,
|
||||
Request(size_t Id, size_t lineNumberBegin,
|
||||
std::vector<Ptr<Vocab const>> &vocabs_, std::string &&source,
|
||||
Segments &&segments, std::vector<TokenRanges> &&sourceAlignments,
|
||||
std::promise<TranslationResult> translationResultPromise);
|
||||
Segments &&segments, SentenceRanges &&sourceTokenRanges,
|
||||
std::promise<Response> responsePromise);
|
||||
|
||||
// Obtain the count of tokens in the segment corresponding to index. Used to
|
||||
// insert sentence from multiple requests into the corresponding size bucket.
|
||||
@ -65,7 +52,8 @@ public:
|
||||
// several requests.
|
||||
Segment getSegment(size_t index) const;
|
||||
|
||||
// For notions of priority among requests (used to enable <set> in Batcher).
|
||||
// For notions of priority among requests, used to enable std::set in
|
||||
// Batcher.
|
||||
bool operator<(const Request &request) const;
|
||||
|
||||
// Processes a history obtained after translating in a heterogenous batch
|
||||
@ -74,40 +62,64 @@ public:
|
||||
|
||||
// On completion of last segment, sets value of the promise.
|
||||
void completeRequest();
|
||||
|
||||
private:
|
||||
size_t Id_;
|
||||
size_t lineNumberBegin_;
|
||||
|
||||
// Multiple translation-workers can concurrently access the same Request. The
|
||||
// following atomic atomically operates on the variable holding sentences
|
||||
// remaining to be translated.
|
||||
std::atomic<int> counter_;
|
||||
|
||||
// source_ holds the source string to be translated. segments_ hold the
|
||||
// sentences generated from source_ in vector<Words>. sourceRanges_ are
|
||||
// string_views of the text corresponding to these words, pointing to
|
||||
// sequences in source_. histories_ is a buffer which eventually stores the
|
||||
// translations of each segment in the corresponding index.
|
||||
std::string source_;
|
||||
Segments segments_;
|
||||
SentenceRanges sourceRanges_;
|
||||
std::vector<Ptr<History>> histories_;
|
||||
|
||||
// Members above are moved into newly constructed Response on completion
|
||||
// of translation of all segments. The promise below is set to this Response
|
||||
// value. future to this promise is made available to the user through
|
||||
// Service.
|
||||
std::promise<Response> response_;
|
||||
|
||||
// Constructing Response requires the vocabs_ used to generate Request.
|
||||
std::vector<Ptr<Vocab const>> *vocabs_;
|
||||
};
|
||||
|
||||
class RequestSentence {
|
||||
private:
|
||||
size_t index_;
|
||||
Ptr<Request> request_;
|
||||
// A RequestSentence provides a view to a sentence within a Request. Existence
|
||||
// of this class allows the sentences and associated information to be kept
|
||||
// within Request.
|
||||
|
||||
public:
|
||||
RequestSentence(size_t, Ptr<Request>);
|
||||
size_t numTokens() const;
|
||||
|
||||
// lineNumber in Request, used for matching marian-decoder. SentenceTuple
|
||||
// requires lineNumber to be set for Corpus based batches.
|
||||
size_t lineNumber() const;
|
||||
|
||||
// Accessor to the segment represented by the RequestSentence.
|
||||
Segment getUnderlyingSegment() const;
|
||||
|
||||
// Forwards call to Request, checking for completion.
|
||||
void completeSentence(Ptr<History> history);
|
||||
|
||||
friend bool operator<(const RequestSentence &a, const RequestSentence &b);
|
||||
|
||||
private:
|
||||
size_t index_;
|
||||
Ptr<Request> request_;
|
||||
};
|
||||
|
||||
typedef std::vector<RequestSentence> RequestSentences;
|
||||
|
||||
struct PCItem {
|
||||
int batchNumber;
|
||||
RequestSentences sentences;
|
||||
|
||||
// PCItem should be default constructible for PCQueue. Default constructed
|
||||
// element is poison.
|
||||
PCItem() : batchNumber(-1) {}
|
||||
|
||||
// PCItem constructor to construct a legit PCItem.
|
||||
explicit PCItem(int batchNumber, RequestSentences &&sentences)
|
||||
: batchNumber(batchNumber), sentences(std::move(sentences)) {}
|
||||
|
||||
// Convenience function to determine poison.
|
||||
bool isPoison() { return (batchNumber == -1); }
|
||||
};
|
||||
|
||||
} // namespace bergamot
|
||||
} // namespace marian
|
||||
|
||||
|
98
src/translator/response.cpp
Normal file
98
src/translator/response.cpp
Normal file
@ -0,0 +1,98 @@
|
||||
#include "response.h"
|
||||
#include "sentence_ranges.h"
|
||||
#include "common/logging.h"
|
||||
#include "data/alignment.h"
|
||||
|
||||
#include <utility>
|
||||
|
||||
namespace marian {
|
||||
namespace bergamot {
|
||||
|
||||
// Takes ownership of the source text, the ranges describing its sentences and
// the translation histories. Only the address of vocabs is stored.
Response::Response(std::string &&source, SentenceRanges &&sourceRanges,
                   Histories &&histories, std::vector<Ptr<Vocab const>> &vocabs)
    : source_(std::move(source)),
      sourceRanges_(std::move(sourceRanges)),
      histories_(std::move(histories)),
      vocabs_(&vocabs) {
}
|
||||
|
||||
void Response::move(std::string &source, std::string &translation,
|
||||
SentenceMappings &sentenceMappings) {
|
||||
|
||||
// Construct required stuff first.
|
||||
constructTranslation();
|
||||
constructSentenceMappings(sentenceMappings);
|
||||
|
||||
// Move content out.
|
||||
source = std::move(source_);
|
||||
translation = std::move(translation_);
|
||||
|
||||
// The above assignment expects source, target be moved.
|
||||
// which makes the following invalid, hence required to be cleared.
|
||||
sourceRanges_.clear();
|
||||
targetRanges_.clear();
|
||||
histories_.clear();
|
||||
}
|
||||
|
||||
void Response::constructTranslation() {
|
||||
if (translationConstructed_) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Reserving length at least as much as source_ seems like a reasonable thing
|
||||
// to do to avoid reallocations.
|
||||
translation_.reserve(source_.size());
|
||||
|
||||
// In a first step, the decoded units (individual senteneces) are compiled
|
||||
// into a huge string. This is done by computing indices first and appending
|
||||
// to the string as each sentences are decoded.
|
||||
std::vector<std::pair<size_t, size_t>> translationRanges;
|
||||
|
||||
size_t offset{0};
|
||||
bool first{true};
|
||||
|
||||
for (auto &history : histories_) {
|
||||
// TODO(jerin): Change hardcode of nBest = 1
|
||||
NBestList onebest = history->nBest(1);
|
||||
|
||||
Result result = onebest[0]; // Expecting only one result;
|
||||
Words words = std::get<0>(result);
|
||||
auto targetVocab = vocabs_->back();
|
||||
std::string decoded = targetVocab->decode(words);
|
||||
if (first) {
|
||||
first = false;
|
||||
} else {
|
||||
translation_ += " ";
|
||||
++offset;
|
||||
}
|
||||
|
||||
translation_ += decoded;
|
||||
translationRanges.emplace_back(offset, decoded.size());
|
||||
offset += decoded.size();
|
||||
}
|
||||
|
||||
// Once the entire string is constructed, there are no further possibility of
|
||||
// reallocation in the string's storage, the indices are converted into
|
||||
// string_views.
|
||||
|
||||
for (auto &range : translationRanges) {
|
||||
// TODO(@jerinphilip): Currently considers target tokens as whole text.
|
||||
// Needs to be further enhanced in marian-dev to extract alignments.
|
||||
std::vector<string_view> targetMappings;
|
||||
|
||||
const char *begin = &translation_[range.first];
|
||||
targetMappings.emplace_back(begin, range.second);
|
||||
targetRanges_.addSentence(targetMappings);
|
||||
}
|
||||
|
||||
translationConstructed_ = true;
|
||||
}
|
||||
|
||||
// Appends, for every sentence in sourceRanges_, the pair formed by the source
// sentence view and the view of the same index in targetRanges_.
void Response::constructSentenceMappings(
    Response::SentenceMappings &sentenceMappings) {
  size_t idx = 0;
  while (idx < sourceRanges_.numSentences()) {
    string_view sourceSentence = sourceRanges_.sentence(idx);
    string_view targetSentence = targetRanges_.sentence(idx);
    sentenceMappings.emplace_back(sourceSentence, targetSentence);
    ++idx;
  }
}
|
||||
} // namespace bergamot
|
||||
} // namespace marian
|
99
src/translator/response.h
Normal file
99
src/translator/response.h
Normal file
@ -0,0 +1,99 @@
|
||||
#ifndef SRC_BERGAMOT_RESPONSE_H_
|
||||
#define SRC_BERGAMOT_RESPONSE_H_
|
||||
|
||||
#include "sentence_ranges.h"
|
||||
#include "data/types.h"
|
||||
#include "definitions.h"
|
||||
#include "translator/beam_search.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
namespace marian {
|
||||
namespace bergamot {
|
||||
class Response {
|
||||
// Response is a marian internal class (not a bergamot-translator class)
|
||||
// holding source blob of text, vector of TokenRanges corresponding to each
|
||||
// sentence in the source text blob and histories obtained from translating
|
||||
// these sentences.
|
||||
//
|
||||
// This class provides an API at a higher level in comparison to History to
|
||||
// access translations and additionally use string_view manipulations to
|
||||
// recover structure in translation from source-text's structure known through
|
||||
// reference string and string_view. As many of these computations are not
|
||||
// required until invoked, they are computed as required and stored in data
|
||||
// members where it makes sense to do so (translation,translationTokenRanges).
|
||||
//
|
||||
// Examples of such use-cases are:
|
||||
// translation()
|
||||
// translationInSourceStructure() TODO(@jerinphilip)
|
||||
// alignment(idx) TODO(@jerinphilip)
|
||||
// sentenceMappings (for bergamot-translator)
|
||||
|
||||
public:
|
||||
Response(std::string &&source, SentenceRanges &&sourceRanges,
|
||||
Histories &&histories,
|
||||
// Required for constructing translation and TokenRanges within
|
||||
// translation lazily.
|
||||
std::vector<Ptr<Vocab const>> &vocabs);
|
||||
|
||||
// Move constructor.
|
||||
Response(Response &&other)
|
||||
: source_(std::move(other.source_)),
|
||||
translation_(std::move(other.translation_)),
|
||||
sourceRanges_(std::move(other.sourceRanges_)),
|
||||
targetRanges_(std::move(other.targetRanges_)),
|
||||
histories_(std::move(other.histories_)),
|
||||
vocabs_(std::move(other.vocabs_)){};
|
||||
|
||||
// Prevents CopyConstruction and CopyAssignment. sourceRanges_ is constituted
|
||||
// by string_view and copying invalidates the data member.
|
||||
Response(const Response &) = delete;
|
||||
Response &operator=(const Response &) = delete;
|
||||
|
||||
typedef std::vector<std::pair<const string_view, const string_view>>
|
||||
SentenceMappings;
|
||||
|
||||
// Moves source sentence into source, translated text into translation.
|
||||
// Pairs of string_views to corresponding sentences in
|
||||
// source and translation are loaded into sentenceMappings. These string_views
|
||||
// reference the new source and translation.
|
||||
//
|
||||
// Calling move() invalidates the Response object as ownership is transferred.
|
||||
// Exists for moving strc
|
||||
void move(std::string &source, std::string &translation,
|
||||
SentenceMappings &sentenceMappings);
|
||||
|
||||
const Histories &histories() const { return histories_; }
|
||||
const std::string &source() const { return source_; }
|
||||
const std::string &translation() {
|
||||
constructTranslation();
|
||||
return translation_;
|
||||
}
|
||||
|
||||
// A convenience function provided to return translated text placed within
|
||||
// source's structure. This is useful when the source text is a multi-line
|
||||
// paragraph or string_views extracted from structured text like HTML and it's
|
||||
// desirable to place the individual sentences in the locations of the source
|
||||
// sentences.
|
||||
// const std::string translationInSourceStructure();
|
||||
// const PendingAlignmentType alignment(size_t idx);
|
||||
|
||||
private:
|
||||
void constructTranslation();
|
||||
void constructSentenceMappings(SentenceMappings &);
|
||||
|
||||
std::string source_;
|
||||
SentenceRanges sourceRanges_;
|
||||
Histories histories_;
|
||||
|
||||
std::vector<Ptr<Vocab const>> *vocabs_;
|
||||
bool translationConstructed_{false};
|
||||
std::string translation_;
|
||||
SentenceRanges targetRanges_;
|
||||
};
|
||||
} // namespace bergamot
|
||||
} // namespace marian
|
||||
|
||||
#endif // SRC_BERGAMOT_RESPONSE_H_
|
46
src/translator/sentence_ranges.cpp
Normal file
46
src/translator/sentence_ranges.cpp
Normal file
@ -0,0 +1,46 @@
|
||||
#include "sentence_ranges.h"
|
||||
#include <cassert>
|
||||
#include <iostream>
|
||||
|
||||
namespace marian {
|
||||
namespace bergamot {
|
||||
|
||||
void SentenceRanges::addSentence(std::vector<string_view> &wordRanges) {
|
||||
addSentence(std::begin(wordRanges), std::end(wordRanges));
|
||||
}
|
||||
|
||||
// Appends the views in [begin, end) to the flat storage and records the index
// at which this sentence starts.
void SentenceRanges::addSentence(WordIterator begin, WordIterator end) {
  const size_t firstWordId = flatByteRanges_.size();
  flatByteRanges_.insert(flatByteRanges_.end(), begin, end);
  sentenceBeginIds_.push_back(firstWordId);
}
|
||||
|
||||
// Returns a single view spanning from the first to the last stored range of
// the sentence at position index.
//
// index must be smaller than numSentences() (checked with assert). A sentence
// that was added without any ranges yields an empty string_view.
string_view SentenceRanges::sentence(size_t index) const {
  assert(index < numSentences());

  // The ranges of sentence `index` occupy [beginId, endId) in the flat
  // storage; the last sentence extends to the end of the storage.
  const size_t beginId = sentenceBeginIds_[index];
  const size_t endId = (index + 1 < numSentences())
                           ? sentenceBeginIds_[index + 1]
                           : flatByteRanges_.size();

  // Nothing was stored for this sentence: there is no first or last range to
  // read, so reading flatByteRanges_[beginId] would fetch a range belonging
  // to another sentence or run past the end of the storage.
  if (beginId == endId) {
    return string_view();
  }

  return sentenceBetween(flatByteRanges_[beginId], flatByteRanges_[endId - 1]);
}
|
||||
|
||||
// Produces one view that starts where firstWord starts and stops where
// lastWord stops.
string_view SentenceRanges::sentenceBetween(string_view firstWord,
                                            string_view lastWord) const {
  const char *spanBegin = firstWord.data();
  const char *spanEnd = lastWord.data() + lastWord.size();
  return string_view(spanBegin, static_cast<size_t>(spanEnd - spanBegin));
}
|
||||
|
||||
} // namespace bergamot
|
||||
} // namespace marian
|
52
src/translator/sentence_ranges.h
Normal file
52
src/translator/sentence_ranges.h
Normal file
@ -0,0 +1,52 @@
|
||||
#ifndef BERGAMOT_SENTENCE_RANGES_H_
|
||||
#define BERGAMOT_SENTENCE_RANGES_H_
|
||||
|
||||
#include "data/types.h"
|
||||
#include <cassert>
|
||||
#include <vector>
|
||||
|
||||
namespace marian {
|
||||
namespace bergamot {
|
||||
|
||||
class SentenceRanges {
|
||||
// SentenceRanges stores string_views into a source text, with additional
|
||||
// annotations to mark sentence boundaries.
|
||||
//
|
||||
// Given the availability annotations, this container provides capabilty to
|
||||
// add sentences, and access individual sentences.
|
||||
public:
|
||||
typedef std::vector<string_view>::iterator WordIterator;
|
||||
|
||||
void addSentence(std::vector<string_view> &wordRanges);
|
||||
void addSentence(WordIterator begin, WordIterator end);
|
||||
|
||||
void clear() {
|
||||
flatByteRanges_.clear();
|
||||
sentenceBeginIds_.clear();
|
||||
}
|
||||
|
||||
size_t numSentences() const { return sentenceBeginIds_.size(); }
|
||||
|
||||
// Returns a string_view into the ith sentence.
|
||||
string_view sentence(size_t index) const;
|
||||
|
||||
private:
|
||||
// A flat storage for string_views. Can be words or sentences.
|
||||
std::vector<string_view> flatByteRanges_;
|
||||
|
||||
// The container grows dynamically with addSentence. size_t marking index is
|
||||
// used to ensure the sentence boundaries stay same while underlying storage
|
||||
// might be changed during reallocation.
|
||||
std::vector<size_t> sentenceBeginIds_;
|
||||
|
||||
// Utility function to extract the string starting at firstWord and ending at
|
||||
// lastWord as a single string-view.
|
||||
string_view sentenceBetween(string_view firstWord,
|
||||
string_view lastWord) const;
|
||||
};
|
||||
|
||||
} // namespace bergamot
|
||||
|
||||
} // namespace marian
|
||||
|
||||
#endif // BERGAMOT_SENTENCE_RANGES_H_
|
@ -1,7 +1,7 @@
|
||||
#include "sentence_splitter.h"
|
||||
#include "common/cli_helper.h"
|
||||
#include "common/logging.h"
|
||||
#include "common/options.h"
|
||||
#include "sentence_splitter.h"
|
||||
#include <string>
|
||||
|
||||
namespace marian {
|
||||
@ -30,8 +30,9 @@ SentenceSplitter::SentenceSplitter(marian::Ptr<marian::Options> options)
|
||||
|
||||
ug::ssplit::SentenceStream
|
||||
SentenceSplitter::createSentenceStream(const string_view &input) {
|
||||
return std::move(ug::ssplit::SentenceStream(input.data(), input.size(),
|
||||
this->ssplit_, mode_));
|
||||
std::string_view input_converted(input.data(), input.size());
|
||||
return std::move(
|
||||
ug::ssplit::SentenceStream(input_converted, this->ssplit_, mode_));
|
||||
}
|
||||
|
||||
ug::ssplit::SentenceStream::splitmode
|
||||
|
@ -1,4 +1,5 @@
|
||||
#include "service.h"
|
||||
#include "batch.h"
|
||||
#include "definitions.h"
|
||||
|
||||
#include <string>
|
||||
@ -8,26 +9,53 @@ namespace marian {
|
||||
namespace bergamot {
|
||||
|
||||
Service::Service(Ptr<Options> options)
|
||||
: requestId_(0), batchNumber_(0),
|
||||
numWorkers_(options->get<int>("cpu-threads")),
|
||||
: requestId_(0), numWorkers_(options->get<int>("cpu-threads")),
|
||||
vocabs_(std::move(loadVocabularies(options))),
|
||||
text_processor_(vocabs_, options), batcher_(options),
|
||||
pcqueue_(2 * options->get<int>("cpu-threads")) {
|
||||
text_processor_(vocabs_, options), batcher_(options)
|
||||
#ifdef WITH_PTHREADS
|
||||
,
|
||||
pcqueue_(2 * options->get<int>("cpu-threads"))
|
||||
#endif // WITH_PTHREADS
|
||||
{
|
||||
|
||||
workers_.reserve(numWorkers_);
|
||||
if (numWorkers_ == 0) {
|
||||
// In case workers are 0, a single-translator is created and initialized
|
||||
// in the main thread.
|
||||
marian::DeviceId deviceId(/*cpuId=*/0, DeviceType::cpu);
|
||||
translators_.emplace_back(deviceId, vocabs_, options);
|
||||
translators_.back().initialize();
|
||||
} else {
|
||||
#ifdef WITH_PTHREADS
|
||||
// If workers specified are greater than 0, translators_ are populated with
|
||||
// uninitialized instances. These are then initialized inside
|
||||
// individual threads and set to consume from producer-consumer queue.
|
||||
workers_.reserve(numWorkers_);
|
||||
translators_.reserve(numWorkers_);
|
||||
for (size_t cpuId = 0; cpuId < numWorkers_; cpuId++) {
|
||||
marian::DeviceId deviceId(cpuId, DeviceType::cpu);
|
||||
translators_.emplace_back(deviceId, vocabs_, options);
|
||||
|
||||
for (int i = 0; i < numWorkers_; i++) {
|
||||
marian::DeviceId deviceId(i, DeviceType::cpu);
|
||||
workers_.emplace_back(deviceId, pcqueue_, vocabs_, options);
|
||||
auto &translator = translators_.back();
|
||||
workers_.emplace_back([&translator, this] {
|
||||
translator.initialize();
|
||||
translator.consumeFrom(pcqueue_);
|
||||
});
|
||||
}
|
||||
#else // WITH_PTHREADS
|
||||
ABORT(
|
||||
"Fatal: Service started requesting multiple threadswhile compiled with "
|
||||
"COMPILE_THREAD_VARIANT=off. Please check your cmake build "
|
||||
"configuration");
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
std::future<TranslationResult> Service::translateWithCopy(std::string input) {
|
||||
std::future<Response> Service::translateWithCopy(std::string input) {
|
||||
return translate(std::move(input));
|
||||
}
|
||||
|
||||
std::future<TranslationResult> Service::translate(std::string &&input) {
|
||||
// Takes in a blob of text. Segments and std::vector<TokenRanges> are
|
||||
std::future<Response> Service::translate(std::string &&input) {
|
||||
// Takes in a blob of text. Segments and SentenceRanges are
|
||||
// extracted from the input (blob of text) and used to construct a Request
|
||||
// along with a promise. promise value is set by the worker completing a
|
||||
// request.
|
||||
@ -40,56 +68,46 @@ std::future<TranslationResult> Service::translate(std::string &&input) {
|
||||
// returns future corresponding to the promise.
|
||||
|
||||
Segments segments;
|
||||
std::vector<TokenRanges> sourceAlignments;
|
||||
text_processor_.process(input, segments, sourceAlignments);
|
||||
SentenceRanges sourceRanges;
|
||||
text_processor_.process(input, segments, sourceRanges);
|
||||
|
||||
std::promise<TranslationResult> translationResultPromise;
|
||||
auto future = translationResultPromise.get_future();
|
||||
std::promise<Response> responsePromise;
|
||||
auto future = responsePromise.get_future();
|
||||
|
||||
Ptr<Request> request = New<Request>(
|
||||
requestId_++, /* lineNumberBegin = */ 0, vocabs_, std::move(input),
|
||||
std::move(segments), std::move(sourceAlignments),
|
||||
std::move(translationResultPromise));
|
||||
std::move(segments), std::move(sourceRanges), std::move(responsePromise));
|
||||
|
||||
for (int i = 0; i < request->numSegments(); i++) {
|
||||
RequestSentence requestSentence(i, request);
|
||||
batcher_.addSentenceWithPriority(requestSentence);
|
||||
batcher_.addWholeRequest(request);
|
||||
|
||||
if (numWorkers_ > 0) {
|
||||
#ifdef WITH_PTHREADS
|
||||
batcher_.produceTo(pcqueue_);
|
||||
#endif
|
||||
} else {
|
||||
// Queue single-threaded
|
||||
Batch batch;
|
||||
while (batcher_ >> batch) {
|
||||
translators_[0].translate(batch);
|
||||
}
|
||||
}
|
||||
|
||||
int numSentences;
|
||||
do {
|
||||
RequestSentences batchSentences;
|
||||
batcher_.cleaveBatch(batchSentences);
|
||||
numSentences = batchSentences.size();
|
||||
|
||||
if (numSentences > 0) {
|
||||
PCItem pcitem(batchNumber_++, std::move(batchSentences));
|
||||
pcqueue_.ProduceSwap(pcitem);
|
||||
}
|
||||
|
||||
if (batchNumber_ % 500 == 0) {
|
||||
LOG(info, "Queuing batch {}", batchNumber_);
|
||||
}
|
||||
} while (numSentences > 0);
|
||||
|
||||
return future;
|
||||
}
|
||||
|
||||
void Service::stop() {
|
||||
int counter = 0;
|
||||
#ifdef WITH_PTHREADS
|
||||
for (auto &worker : workers_) {
|
||||
PCItem pcitem;
|
||||
pcqueue_.ProduceSwap(pcitem);
|
||||
++counter;
|
||||
Batch poison = Batch::poison();
|
||||
pcqueue_.ProduceSwap(poison);
|
||||
}
|
||||
|
||||
counter = 0;
|
||||
for (auto &worker : workers_) {
|
||||
worker.join();
|
||||
++counter;
|
||||
}
|
||||
|
||||
workers_.clear(); // Takes care of idempotency.
|
||||
#endif
|
||||
}
|
||||
|
||||
Service::~Service() { stop(); }
|
||||
|
@ -3,15 +3,18 @@
|
||||
|
||||
#include "batch_translator.h"
|
||||
#include "batcher.h"
|
||||
#include "pcqueue.h"
|
||||
#include "response.h"
|
||||
#include "text_processor.h"
|
||||
#include "translation_result.h"
|
||||
|
||||
#include <queue>
|
||||
#include <vector>
|
||||
|
||||
#include "data/types.h"
|
||||
|
||||
#ifdef WITH_PTHREADS
|
||||
#include "pcqueue.h"
|
||||
#endif
|
||||
|
||||
namespace marian {
|
||||
namespace bergamot {
|
||||
|
||||
@ -25,17 +28,17 @@ class Service {
|
||||
// options = ...;
|
||||
// service = Service(options);
|
||||
// std::string input_blob = "Hello World";
|
||||
// std::future<TranslationResult>
|
||||
// std::future<Response>
|
||||
// response = service.translate(std::move(input_blob));
|
||||
// response.wait();
|
||||
// TranslationResult result = response.get();
|
||||
// Response result = response.get();
|
||||
|
||||
public:
|
||||
explicit Service(Ptr<Options> options);
|
||||
|
||||
// Constructs new string copying, calls translate internally.
|
||||
std::future<TranslationResult> translateWithCopy(std::string input);
|
||||
std::future<TranslationResult> translate(std::string &&input);
|
||||
std::future<Response> translateWithCopy(std::string input);
|
||||
std::future<Response> translate(std::string &&input);
|
||||
|
||||
void stop();
|
||||
|
||||
@ -45,12 +48,11 @@ public:
|
||||
~Service();
|
||||
|
||||
private:
|
||||
unsigned int requestId_;
|
||||
unsigned int batchNumber_;
|
||||
int numWorkers_;
|
||||
size_t requestId_;
|
||||
size_t numWorkers_;
|
||||
|
||||
// vocabs are used to construct a Request, which later uses it to construct
|
||||
// TranslationResult (decode from words to string).
|
||||
// Response (decode from words to string).
|
||||
std::vector<Ptr<Vocab const>> vocabs_; // ORDER DEPENDENCY
|
||||
|
||||
// Consists of:
|
||||
@ -68,8 +70,12 @@ private:
|
||||
|
||||
TextProcessor text_processor_; // ORDER DEPENDENCY
|
||||
Batcher batcher_;
|
||||
PCQueue<PCItem> pcqueue_;
|
||||
std::vector<BatchTranslator> workers_;
|
||||
std::vector<BatchTranslator> translators_;
|
||||
|
||||
#ifdef WITH_PTHREADS
|
||||
PCQueue<Batch> pcqueue_;
|
||||
std::vector<std::thread> workers_;
|
||||
#endif
|
||||
};
|
||||
|
||||
std::vector<Ptr<const Vocab>> loadVocabularies(Ptr<Options> options);
|
||||
|
@ -1,6 +1,7 @@
|
||||
#include "text_processor.h"
|
||||
#include "data/types.h"
|
||||
#include "definitions.h"
|
||||
#include "sentence_ranges.h"
|
||||
|
||||
#include "common/options.h"
|
||||
#include "data/vocab.h"
|
||||
@ -10,23 +11,22 @@ namespace marian {
|
||||
namespace bergamot {
|
||||
|
||||
Segment TextProcessor::tokenize(const string_view &segment,
|
||||
TokenRanges &tokenRanges) {
|
||||
std::vector<string_view> &wordRanges) {
|
||||
return vocabs_->front()->encodeWithByteRanges(
|
||||
segment, tokenRanges, /*addEOS=*/false, /*inference=*/true);
|
||||
segment, wordRanges, /*addEOS=*/false, /*inference=*/true);
|
||||
}
|
||||
|
||||
TextProcessor::TextProcessor(std::vector<Ptr<Vocab const>> &vocabs,
|
||||
Ptr<Options> options)
|
||||
: vocabs_(&vocabs), sentence_splitter_(options) {
|
||||
|
||||
max_input_sentence_tokens_ = options->get<int>("max-input-sentence-tokens");
|
||||
max_input_sentence_tokens_ = max_input_sentence_tokens_ - 1;
|
||||
ABORT_IF(max_input_sentence_tokens_ < 0,
|
||||
"max-input-sentence-tokens cannot be < 0");
|
||||
max_length_break_ = options->get<int>("max-length-break");
|
||||
max_length_break_ = max_length_break_ - 1;
|
||||
ABORT_IF(max_length_break_ < 0, "max-length-break cannot be < 0");
|
||||
}
|
||||
|
||||
void TextProcessor::process(const string_view &query, Segments &segments,
|
||||
std::vector<TokenRanges> &sourceRanges) {
|
||||
SentenceRanges &sourceRanges) {
|
||||
|
||||
auto sentenceStream = sentence_splitter_.createSentenceStream(query);
|
||||
std::string_view sentenceStringPiece;
|
||||
@ -34,33 +34,34 @@ void TextProcessor::process(const string_view &query, Segments &segments,
|
||||
while (sentenceStream >> sentenceStringPiece) {
|
||||
marian::string_view sentence(sentenceStringPiece.data(),
|
||||
sentenceStringPiece.size());
|
||||
TokenRanges tokenRanges;
|
||||
Segment segment = tokenize(sentence, tokenRanges);
|
||||
|
||||
std::vector<string_view> wordRanges;
|
||||
Segment segment = tokenize(sentence, wordRanges);
|
||||
|
||||
// There are some cases where SentencePiece or vocab returns no words
|
||||
// after normalization. 0 prevents any empty entries from being added.
|
||||
if (segment.size() > 0) {
|
||||
// Truncate segment into max_input_size segments.
|
||||
truncate(segment, tokenRanges, segments, sourceRanges);
|
||||
truncate(segment, wordRanges, segments, sourceRanges);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void TextProcessor::truncate(Segment &segment, TokenRanges &tokenRanges,
|
||||
Segments &segments,
|
||||
std::vector<TokenRanges> &sourceRanges) {
|
||||
for (int offset = 0; offset < segment.size();
|
||||
offset += max_input_sentence_tokens_) {
|
||||
void TextProcessor::truncate(Segment &segment,
|
||||
std::vector<string_view> &wordRanges,
|
||||
Segments &segments, SentenceRanges &sourceRanges) {
|
||||
for (size_t offset = 0; offset < segment.size();
|
||||
offset += max_length_break_) {
|
||||
auto start = segment.begin() + offset;
|
||||
|
||||
unsigned int left = segment.size() - offset;
|
||||
unsigned int diff = std::min(max_input_sentence_tokens_, left);
|
||||
size_t left = segment.size() - offset;
|
||||
size_t diff = std::min(max_length_break_, left);
|
||||
|
||||
segments.emplace_back(start, start + diff);
|
||||
segments.back().push_back(sourceEosId());
|
||||
|
||||
auto astart = tokenRanges.begin() + offset;
|
||||
sourceRanges.emplace_back(astart, astart + diff);
|
||||
auto astart = wordRanges.begin() + offset;
|
||||
sourceRanges.addSentence(astart, astart + diff);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -4,6 +4,7 @@
|
||||
#include "data/types.h"
|
||||
#include "data/vocab.h"
|
||||
#include "definitions.h"
|
||||
#include "sentence_ranges.h"
|
||||
|
||||
#include "sentence_splitter.h"
|
||||
|
||||
@ -23,23 +24,24 @@ public:
|
||||
explicit TextProcessor(std::vector<Ptr<Vocab const>> &vocabs, Ptr<Options>);
|
||||
|
||||
void process(const string_view &query, Segments &segments,
|
||||
std::vector<TokenRanges> &sourceRanges);
|
||||
SentenceRanges &sourceRanges);
|
||||
|
||||
private:
|
||||
// Tokenizes an input string, returns Words corresponding. Loads the
|
||||
// corresponding byte-ranges into tokenRanges.
|
||||
Segment tokenize(const string_view &input, TokenRanges &tokenRanges);
|
||||
Segment tokenize(const string_view &input,
|
||||
std::vector<string_view> &tokenRanges);
|
||||
|
||||
// Truncate sentence into max_input_size segments.
|
||||
void truncate(Segment &sentence, TokenRanges &tokenRanges, Segments &segments,
|
||||
std::vector<TokenRanges> &sourceRanges);
|
||||
void truncate(Segment &sentence, std::vector<string_view> &tokenRanges,
|
||||
Segments &segments, SentenceRanges &sourceRanges);
|
||||
|
||||
// shorthand, used only in truncate()
|
||||
const Word sourceEosId() const { return vocabs_->front()->getEosId(); }
|
||||
|
||||
std::vector<Ptr<Vocab const>> *vocabs_;
|
||||
SentenceSplitter sentence_splitter_;
|
||||
unsigned int max_input_sentence_tokens_;
|
||||
size_t max_length_break_;
|
||||
};
|
||||
|
||||
} // namespace bergamot
|
||||
|
@ -1,72 +0,0 @@
|
||||
#include "translation_result.h"
|
||||
#include "common/logging.h"
|
||||
#include "data/alignment.h"
|
||||
|
||||
#include <utility>
|
||||
|
||||
namespace marian {
|
||||
namespace bergamot {
|
||||
|
||||
// Builds a TranslationResult from the source text, the token-level ranges of
// each source sentence and the decoder histories (one per sentence).
//
// - source:       input text; moved into source_.
// - sourceRanges: per sentence, the string_views of its tokens; moved into
//                 sourceRanges_. Every entry must be non-empty, because
//                 front()/back() are taken on it below.
// - histories:    one History per sentence; the 1-best hypothesis of each is
//                 decoded with the last vocabulary in `vocabs`.
//
// On return, translation_ holds the decoded sentences joined by single spaces
// and sentenceMappings_ pairs each source sentence view with the view of its
// translation inside translation_.
TranslationResult::TranslationResult(std::string &&source,
                                     std::vector<TokenRanges> &&sourceRanges,
                                     Histories &&histories,
                                     std::vector<Ptr<Vocab const>> &vocabs)
    : source_(std::move(source)), sourceRanges_(std::move(sourceRanges)),
      histories_(std::move(histories)) {

  std::vector<string_view> sourceMappings;
  std::vector<string_view> targetMappings;

  // Collapse the token-level ranges of every sentence into one sentence-level
  // view that runs from the first token's begin to the last token's end.
  sourceMappings.reserve(sourceRanges_.size());
  for (const auto &tokenRanges : sourceRanges_) {
    string_view first = tokenRanges.front();
    string_view last = tokenRanges.back();
    sourceMappings.emplace_back(first.data(), last.end() - first.begin());
  }

  // Compile the translations into the single std::string translation_.
  // Current implementation uses += on std::string, multiple resizes.
  // Ranges are stored as (offset, length) first and turned into string_views
  // only once translation_ has stopped growing, because a reallocation would
  // invalidate views taken earlier.
  // TODO(jerin): Add token level string_views here as well.
  std::vector<std::pair<size_t, size_t>> translationRanges;
  translationRanges.reserve(histories_.size());
  size_t offset{0};
  bool first{true};
  for (auto &history : histories_) {
    // TODO(jerin): Change hardcode of nBest = 1
    NBestList onebest = history->nBest(1);

    const Result &result = onebest[0]; // Expecting only one result;
    const Words &words = std::get<0>(result);
    std::string decoded = (vocabs.back())->decode(words);
    if (first) {
      first = false;
    } else {
      // Sentences are separated by one space, which belongs to no range.
      translation_ += " ";
      ++offset;
    }

    translation_ += decoded;
    translationRanges.emplace_back(offset, decoded.size());
    offset += decoded.size();
  }

  // Convert the (offset, length) ranges into string_views over translation_.
  targetMappings.reserve(translationRanges.size());
  for (const auto &range : translationRanges) {
    const char *begin = &translation_[range.first];
    targetMappings.emplace_back(begin, range.second);
  }

  // Pair source and target sentence views; stops at the shorter of the two.
  for (auto src = sourceMappings.begin(), tgt = targetMappings.begin();
       src != sourceMappings.end() && tgt != targetMappings.end();
       ++src, ++tgt) {
    sentenceMappings_.emplace_back(*src, *tgt);
  }
}
|
||||
|
||||
} // namespace bergamot
|
||||
} // namespace marian
|
@ -1,76 +0,0 @@
|
||||
#ifndef SRC_BERGAMOT_TRANSLATION_RESULT_H_
|
||||
#define SRC_BERGAMOT_TRANSLATION_RESULT_H_
|
||||
|
||||
#include "data/types.h"
|
||||
#include "definitions.h"
|
||||
#include "translator/beam_search.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
namespace marian {
|
||||
namespace bergamot {
|
||||
class TranslationResult {
|
||||
public:
|
||||
TranslationResult(std::string &&source,
|
||||
std::vector<TokenRanges> &&sourceRanges,
|
||||
Histories &&histories,
|
||||
std::vector<Ptr<Vocab const>> &vocabs);
|
||||
|
||||
TranslationResult(TranslationResult &&other)
|
||||
: source_(std::move(other.source_)),
|
||||
translation_(std::move(other.translation_)),
|
||||
sourceRanges_(std::move(other.sourceRanges_)),
|
||||
sentenceMappings_(std::move(other.sentenceMappings_)),
|
||||
histories_(std::move(other.histories_)){};
|
||||
|
||||
TranslationResult(const TranslationResult &) = delete;
|
||||
TranslationResult &operator=(const TranslationResult &) = delete;
|
||||
|
||||
// Returns const references to source and translated texts, for external
|
||||
// consumption.
|
||||
|
||||
const std::string &getOriginalText() const { return source_; }
|
||||
const std::string &getTranslatedText() const { return translation_; }
|
||||
|
||||
// A mapping of string_views in the source_ and translation_ are provide as a
|
||||
// pair for external consumption. Each entry corresponding
|
||||
// to a (source-sentence, target-sentence).
|
||||
|
||||
typedef std::vector<std::pair<const string_view, const string_view>>
|
||||
SentenceMappings;
|
||||
const SentenceMappings &getSentenceMappings() const {
|
||||
return sentenceMappings_;
|
||||
}
|
||||
|
||||
// Return the Quality scores of the translated text.
|
||||
// Not implemented currently, commenting out.
|
||||
// const QualityScore &getQualityScore() const { return qualityScore; }
|
||||
|
||||
// For development use to benchmark with marian-decoder.
|
||||
const Histories &getHistories() const { return histories_; }
|
||||
|
||||
// @jerinphilip: Why are these members no longer-private? For move-semantics
|
||||
// with consistent string_views for bergamot-translator.
|
||||
|
||||
std::string source_;
|
||||
std::string translation_;
|
||||
// Adding the following to complete bergamot-translator spec, redundant while
|
||||
// sourceMappings_ and targetMappings_ exists or vice-versa.
|
||||
|
||||
SentenceMappings sentenceMappings_;
|
||||
|
||||
private:
|
||||
// Histories are currently required for interoperability with OutputPrinter
|
||||
// and OutputCollector and hence comparisons with marian-decoder.
|
||||
// Future hook to gain alignments.
|
||||
Histories histories_;
|
||||
|
||||
// string_views at the token level.
|
||||
std::vector<TokenRanges> sourceRanges_;
|
||||
};
|
||||
} // namespace bergamot
|
||||
} // namespace marian
|
||||
|
||||
#endif // SRC_BERGAMOT_TRANSLATION_RESULT_H_
|
28
wasm/CMakeLists.txt
Normal file
28
wasm/CMakeLists.txt
Normal file
@ -0,0 +1,28 @@
|
||||
# WebAssembly worker target: the embind binding sources are linked against
# the bergamot-translator library and emitted as a JavaScript loader (".js").
add_executable(bergamot-translator-worker
  bindings/TranslationModelBindings.cpp
  bindings/TranslationRequestBindings.cpp
  bindings/TranslationResultBindings.cpp
)

# This header inclusion needs to go away later as path to public headers of bergamot
# translator should be directly available from "bergamot-translator" target.
# PROJECT_SOURCE_DIR (rather than CMAKE_SOURCE_DIR) keeps the paths correct
# when this project is itself added as a subdirectory of a larger build.
target_include_directories(bergamot-translator-worker
  PRIVATE
    "${PROJECT_SOURCE_DIR}/src/translator"
    "${PROJECT_SOURCE_DIR}"
)

# This compile definition is required for generating binding code properly
target_compile_definitions(bergamot-translator-worker PRIVATE WASM_BINDINGS)
target_compile_options(bergamot-translator-worker PRIVATE ${WASM_COMPILE_FLAGS})

set(LINKER_FLAGS "--bind -s ASSERTIONS=0 -s DISABLE_EXCEPTION_CATCHING=1 -s FORCE_FILESYSTEM=1 -s ALLOW_MEMORY_GROWTH=1 -s NO_DYNAMIC_EXECUTION=1")

# Optionally preload the contents of PACKAGE_DIR at the root ("/") of the
# virtual file system. The expansion is quoted so that an undefined or empty
# PACKAGE_DIR compares as the empty string instead of as the literal name.
if(NOT "${PACKAGE_DIR}" STREQUAL "")
  # A relative PACKAGE_DIR is resolved against the top-level build directory.
  get_filename_component(REALPATH_PACKAGE_DIR "${PACKAGE_DIR}" REALPATH BASE_DIR "${CMAKE_BINARY_DIR}")
  set(LINKER_FLAGS "${LINKER_FLAGS} --preload-file ${REALPATH_PACKAGE_DIR}@/")
endif()

# LINK_FLAGS takes one string; quoting keeps it a single argument whatever
# characters the package path contains.
set_target_properties(bergamot-translator-worker PROPERTIES
  SUFFIX ".js"
  LINK_FLAGS "${LINKER_FLAGS}"
)

target_link_libraries(bergamot-translator-worker PRIVATE bergamot-translator)
|
65
wasm/README.md
Normal file
65
wasm/README.md
Normal file
@ -0,0 +1,65 @@
|
||||
## Using Bergamot Translator in JavaScript
|
||||
The example file `bergamot.html` in the folder `test_page` demonstrates how to use the bergamot translator in JavaScript via a `<script>` tag.
|
||||
|
||||
Please note that everything below assumes that the [bergamot project specific model files](https://github.com/mozilla-applied-ml/bergamot-models) have been packaged into the wasm binary (using the compile instructions given in the top-level README).
|
||||
|
||||
### Using JS APIs
|
||||
|
||||
```js
|
||||
// The model configuration as a YAML-formatted string (the JSON used below is also valid YAML). For available configuration options, please check: https://marian-nmt.github.io/docs/cmd/marian-decoder/
|
||||
// This example captures the most relevant options: model file, vocabulary files and shortlist file
|
||||
const modelConfig = "{\"models\":[\"/esen/model.esen.npz\"],\"vocabs\":[\"/esen/vocab.esen.spm\",\"/esen/vocab.esen.spm\"],\"shortlist\":[\"/esen/lex.esen.s2t\"],\"beam-size\":1}";
|
||||
|
||||
// Instantiate the TranslationModel
|
||||
const model = new Module.TranslationModel(modelConfig);
|
||||
|
||||
// Instantiate the arguments of translate() API i.e. TranslationRequest and input (vector<string>)
|
||||
const request = new Module.TranslationRequest();
|
||||
const input = new Module.VectorString;
|
||||
|
||||
// Initialize the input
|
||||
input.push_back("Hola"); input.push_back("Mundo");
|
||||
|
||||
// translate the input; the result is a vector<TranslationResult>
|
||||
const result = model.translate(input, request);
|
||||
|
||||
// Print original and translated text from each entry of vector<TranslationResult>
|
||||
for (let i = 0; i < result.size(); i++) {
|
||||
console.log(' original=' + result.get(i).getOriginalText() + ', translation=' + result.get(i).getTranslatedText());
|
||||
}
|
||||
|
||||
// Don't forget to clean up the instances
|
||||
model.delete();
|
||||
request.delete();
|
||||
input.delete();
|
||||
```
|
||||
|
||||
### Demo (see everything in action)
|
||||
|
||||
* Start the test webserver (ensure you have the latest nodejs installed)
|
||||
```bash
|
||||
cd test_page
|
||||
bash start_server.sh
|
||||
```
|
||||
* Open any of the browsers below
|
||||
* Firefox Nightly +87: make sure the following prefs are on (about:config)
|
||||
```
|
||||
dom.postMessage.sharedArrayBuffer.bypassCOOP_COEP.insecure.enabled = true
|
||||
javascript.options.wasm_simd = true
|
||||
javascript.options.wasm_simd_wormhole = true
|
||||
```
|
||||
|
||||
* Chrome Canary +90: start with the following argument
|
||||
```
|
||||
--js-flags="--experimental-wasm-simd"
|
||||
```
|
||||
|
||||
* Browse to the following page:
|
||||
```
|
||||
http://localhost:8000/bergamot.html
|
||||
```
|
||||
|
||||
* Run some translations:
|
||||
* Choose a model and press `Load Model`
|
||||
* Type a sentence to be translated in the `From` textbox and press `Translate`
|
||||
* See the results in the `To` and `Log` textboxes
|
23
wasm/bindings/TranslationModelBindings.cpp
Normal file
23
wasm/bindings/TranslationModelBindings.cpp
Normal file
@ -0,0 +1,23 @@
|
||||
/*
|
||||
* TranslationModelBindings.cpp
|
||||
*
|
||||
* Bindings for TranslationModel class
|
||||
*/
|
||||
|
||||
#include <emscripten/bind.h>
|
||||
|
||||
#include "TranslationModel.h"
|
||||
|
||||
using namespace emscripten;
|
||||
|
||||
// Binding code
|
||||
// Registers TranslationModel with embind under the JavaScript name
// "TranslationModel", together with the two vector types registered as
// "VectorString" and "VectorTranslationResult".
EMSCRIPTEN_BINDINGS(translation_model) {
  emscripten::class_<TranslationModel> modelClass("TranslationModel");
  // Constructible from a single std::string argument.
  modelClass.constructor<std::string>();
  modelClass.function("translate", &TranslationModel::translate);
  modelClass.function("isAlignmentSupported",
                      &TranslationModel::isAlignmentSupported);

  emscripten::register_vector<std::string>("VectorString");
  emscripten::register_vector<TranslationResult>("VectorTranslationResult");
}
|
17
wasm/bindings/TranslationRequestBindings.cpp
Normal file
17
wasm/bindings/TranslationRequestBindings.cpp
Normal file
@ -0,0 +1,17 @@
|
||||
/*
|
||||
* Bindings for TranslationRequest class
|
||||
*
|
||||
*/
|
||||
|
||||
#include <emscripten/bind.h>
|
||||
|
||||
#include "TranslationRequest.h"
|
||||
|
||||
using namespace emscripten;
|
||||
|
||||
// Binding code
|
||||
// Registers TranslationRequest with embind under the JavaScript name
// "TranslationRequest"; only its default constructor is exposed.
EMSCRIPTEN_BINDINGS(translation_request) {
  emscripten::class_<TranslationRequest> requestClass("TranslationRequest");
  requestClass.constructor<>();
}
|
20
wasm/bindings/TranslationResultBindings.cpp
Normal file
20
wasm/bindings/TranslationResultBindings.cpp
Normal file
@ -0,0 +1,20 @@
|
||||
/*
|
||||
* Bindings for TranslationResult class
|
||||
*
|
||||
*/
|
||||
|
||||
#include <emscripten/bind.h>
|
||||
#include <vector>
|
||||
|
||||
#include "TranslationResult.h"
|
||||
|
||||
using namespace emscripten;
|
||||
|
||||
// Binding code
|
||||
// Registers TranslationResult with embind under the JavaScript name
// "TranslationResult": a three-argument constructor and the two text getters.
EMSCRIPTEN_BINDINGS(translation_result) {
  emscripten::class_<TranslationResult> resultClass("TranslationResult");
  resultClass.constructor<std::string, std::string,
                          TranslationResult::SentenceMappings>();
  resultClass.function("getOriginalText", &TranslationResult::getOriginalText);
  resultClass.function("getTranslatedText",
                       &TranslationResult::getTranslatedText);
}
|
35
wasm/test_page/bergamot-httpserver.js
Normal file
35
wasm/test_page/bergamot-httpserver.js
Normal file
@ -0,0 +1,35 @@
|
||||
require(__dirname + '/helper.js');
|
||||
|
||||
var http = require('http');
|
||||
var express = require('express');
|
||||
var app = express();
|
||||
var server = http.createServer(app);
|
||||
var fs = require('fs');
|
||||
var url = require('url');
|
||||
const nocache = require('nocache');
|
||||
const cors = require('cors');
|
||||
|
||||
app.use(cors())
|
||||
app.use(nocache());
|
||||
app.get('/*.*' , cors(), function(req, res) {
|
||||
var options = url.parse(req.url, true);
|
||||
var mime = Helper.getMime(options);
|
||||
serveFile(res, options.pathname, mime);
|
||||
});
|
||||
|
||||
/**
 * Sends one file from this script's directory as the HTTP response.
 *
 * @param {object} res       Response object of the current request.
 * @param {string} pathName  Path component of the requested URL.
 * @param {string} [mime]    Content-Type to send; defaults to 'text/html'.
 *
 * Replies 403 when the path resolves outside this directory, 500 when the
 * file cannot be read, and 200 with the file contents otherwise.
 */
function serveFile(res, pathName, mime) {
  const path = require('path');
  mime = mime || 'text/html';

  // path.join() collapses any ".." segments, so the prefix check below sees
  // the location that would really be read.
  const root = path.resolve(__dirname);
  const target = path.join(root, pathName);
  if (!target.startsWith(root + path.sep)) {
    res.writeHead(403, {"Content-Type": "text/plain"});
    return res.end('Forbidden: ' + pathName);
  }

  fs.readFile(target, function (err, data) {
    if (err) {
      res.writeHead(500, {"Content-Type": "text/plain"});
      return res.end('Error loading ' + pathName + " with Error: " + err);
    }
    // Cross-origin isolation headers, set on every successful response.
    res.header('Cross-Origin-Embedder-Policy','require-corp');
    res.header('Cross-Origin-Opener-Policy','same-origin');
    res.writeHead(200, {"Content-Type": mime});
    res.end(data);
  });
}
|
||||
|
||||
server.listen(8000);
|
||||
console.log('HTTP and BinaryJS server started on port 8000');
|
199
wasm/test_page/bergamot.html
Normal file
199
wasm/test_page/bergamot.html
Normal file
@ -0,0 +1,199 @@
|
||||
<!doctype html>
|
||||
<html>
|
||||
<head>
|
||||
<link rel="icon" href="data:,">
|
||||
<meta http-equiv="Content-Type" content="text/html;charset=ISO-8859-1">
|
||||
</head>
|
||||
<style>
|
||||
body, html, div {
|
||||
margin-left: 1%;
|
||||
margin-right: 1%;
|
||||
margin-bottom: 1%;
|
||||
margin-top: 1%;
|
||||
padding-left: 1%;
|
||||
padding-right: 1%;
|
||||
padding-bottom: 1%;
|
||||
padding-top: 1%;
|
||||
}
|
||||
|
||||
textarea, #to, #from {
|
||||
width: 100%;
|
||||
max-width: 100%;
|
||||
}
|
||||
|
||||
div {
|
||||
float: left;
|
||||
width: 80%;
|
||||
}
|
||||
</style>
|
||||
<body>
|
||||
|
||||
<div id="divradios">
|
||||
<label>Choose the model to use</label>
|
||||
<input type="radio" name="modellang" value="enes"/><label>English to Spanish</label>
|
||||
<input type="radio" name="modellang" value="esen" checked/><label>Spanish to English</label>
|
||||
<input type="button" id="load" value="Load Model"/>
|
||||
</div>
|
||||
|
||||
<div id="divtranslation">
|
||||
<label for="from">From</label>
|
||||
<textarea id="from" name="from">
|
||||
Una estrategia republicana para obstaculizar la reelección de Obama. Los dirigentes republicanos justificaron su política por la necesidad de luchar contra el fraude electoral.
|
||||
Ahora bien, el Centro Brennan considera esto último un mito y afirma que el fraude electoral es menos frecuente en los Estados Unidos que el número de personas que mueren a causa de la caída de un rayo. De hecho, los abogados republicanos no han encontrado más que 300 casos de fraude electoral en los Estados Unidos en diez años. Una cosa es cierta: esas nuevas disposiciones afectarán negativamente a la tasa de participación.
|
||||
En ese sentido, estas medidas minarán en parte el sistema democrático americano.
|
||||
Al contrario de lo que ocurre en Canadá, los estados americanos son responsables de la organización de las elecciones federales en los Estados Unidos. Y en esa misma línea una mayoría de los gobiernos americanos promulgaron, a partir de 2009, nuevas leyes que dificultaban el proceso de inscripción o de votación.
|
||||
Este fenómeno se ha extendido tras las elecciones de noviembre de 2010, que vieron el aumento de 675 nuevos representantes republicanos en 26 estados. En consecuencia, durante el año 2011 se introdujeron 180 proyectos de ley que restringían el ejercicio del derecho de voto en 41 estados.
|
||||
</textarea>
|
||||
<br><br>
|
||||
<label for="to">To</label>
|
||||
<textarea id="to" name="to" readonly></textarea>
|
||||
<br><br>
|
||||
<input type="button" id="translate" value="Translate"/>
|
||||
</div>
|
||||
|
||||
<div id="divlog">
|
||||
<label for="log">Log:</label><br>
|
||||
<textarea id="log" name="log" rows="50" cols="75"></textarea>
|
||||
</div>
|
||||
|
||||
<script>
|
||||
|
||||
var model, request, input = undefined;
|
||||
const loadModel = (from, to) => {
|
||||
|
||||
const languagePair = `${from}${to}`;
|
||||
|
||||
// Vocab files are re-used in both translation directions
|
||||
const vocabLanguagePair = from === "en" ? `${to}${from}` : languagePair;
|
||||
|
||||
// Set the Model Configuration as YAML formatted string.
|
||||
// For available configuration options, please check: https://marian-nmt.github.io/docs/cmd/marian-decoder/
|
||||
const modelConfig = `models:
|
||||
- /${languagePair}/model.${languagePair}.npz
|
||||
vocabs:
|
||||
- /${vocabLanguagePair}/vocab.${vocabLanguagePair}.spm
|
||||
- /${vocabLanguagePair}/vocab.${vocabLanguagePair}.spm
|
||||
beam-size: 1
|
||||
normalize: 1.0
|
||||
word-penalty: 0
|
||||
max-length-break: 128
|
||||
mini-batch-words: 1024
|
||||
workspace: 128
|
||||
max-length-factor: 2.0
|
||||
skip-cost: true
|
||||
cpu-threads: 0
|
||||
quiet: true
|
||||
quiet-translation: true
|
||||
shortlist:
|
||||
- /${languagePair}/lex.${languagePair}.s2t
|
||||
- 50
|
||||
- 50
|
||||
`;
|
||||
/*
|
||||
This config is not valid anymore in new APIs
|
||||
mini-batch: 32
|
||||
maxi-batch: 100
|
||||
maxi-batch-sort: src
|
||||
*/
|
||||
// TODO: Use in model config when wormhole is enabled:
|
||||
// gemm-precision: int8shift
|
||||
// TODO: Use in model config when loading of binary models is supported and we use model.intgemm.alphas.bin:
|
||||
// gemm-precision: int8shiftAlphaAll
|
||||
|
||||
console.debug("modelConfig: ", modelConfig);
|
||||
|
||||
// Instantiate the TranslationModel
|
||||
if (model) model.delete();
|
||||
model = new Module.TranslationModel(modelConfig);
|
||||
}
|
||||
|
||||
/**
 * Translates a list of paragraphs with the currently loaded model.
 *
 * Blank paragraphs are dropped (an empty paragraph breaks the translation)
 * and the remaining ones are trimmed before being passed on.
 *
 * @param {string[]} paragraphs  Texts to translate.
 * @returns {string[]} Translated text of every non-blank paragraph, in order.
 *
 * Every embind object created or returned here is released before returning,
 * including on failure: such objects live on the wasm heap and are not
 * reclaimed unless delete() is called.
 */
const translate = (paragraphs) => {

  // Instantiate the arguments of translate() API i.e. TranslationRequest and input (vector<string>)
  const request = new Module.TranslationRequest();
  const input = new Module.VectorString();
  let result;

  try {
    // Initialize the input
    paragraphs.forEach(paragraph => {
      const trimmed = paragraph.trim();
      // prevent empty paragraph - it breaks the translation
      if (trimmed === "") {
        return;
      }
      input.push_back(trimmed);
    });

    // Access input (just for debugging)
    console.log('Input size=', input.size());

    // Translate the input; the result is a vector<TranslationResult>
    result = model.translate(input, request);

    // Collect the translated text from each entry of vector<TranslationResult>
    const translatedParagraphs = [];
    for (let i = 0; i < result.size(); i++) {
      // get() hands back a handle of its own, which has to be released too.
      const entry = result.get(i);
      translatedParagraphs.push(entry.getTranslatedText());
      entry.delete();
    }
    console.log({ translatedParagraphs });
    return translatedParagraphs;
  } finally {
    if (result) {
      result.delete();
    }
    request.delete();
    input.delete();
  }
}
|
||||
|
||||
document.querySelector("#load").addEventListener("click", () => {
|
||||
const lang = document.querySelector('input[name="modellang"]:checked').value;
|
||||
const from = lang.substring(0, 2);
|
||||
const to = lang.substring(2, 4);
|
||||
let start = Date.now();
|
||||
loadModel(from, to)
|
||||
log(`model ${from}${to} loaded in ${(Date.now() - start) / 1000} secs`);
|
||||
//log('Model Alignment:', model.isAlignmentSupported());
|
||||
});
|
||||
|
||||
const translateCall = () => {
|
||||
const text = document.querySelector('#from').value;
|
||||
const paragraphs = text.split("\n");
|
||||
let wordCount = 0;
|
||||
paragraphs.forEach(sentence => {
|
||||
wordCount += sentence.trim().split(" ").filter(word => word.trim() !== "").length;
|
||||
})
|
||||
const start = Date.now();
|
||||
const translatedParagraphs = translate(paragraphs);
|
||||
const secs = (Date.now() - start) / 1000;
|
||||
log(`Translation of (${wordCount}) words took ${secs} secs (${Math.round(wordCount / secs)} words per second)`);
|
||||
|
||||
document.querySelector('#to').value = translatedParagraphs.join("\n");
|
||||
}
|
||||
|
||||
document.querySelector("#translate").addEventListener("click", () => {
|
||||
translateCall();
|
||||
});
|
||||
|
||||
document.querySelector("#from").addEventListener('keyup', function(event) {
|
||||
if (event.keyCode === 13) {
|
||||
translateCall();
|
||||
}
|
||||
});
|
||||
|
||||
const log = (message) => {
|
||||
document.querySelector("#log").value += message + "\n";
|
||||
}
|
||||
|
||||
const start = Date.now();
|
||||
let moduleLoadStart;
|
||||
var Module = {
|
||||
preRun: [function() {
|
||||
log(`Time until Module.preRun: ${(Date.now() - start) / 1000} secs`);
|
||||
moduleLoadStart = Date.now();
|
||||
}],
|
||||
onRuntimeInitialized: function() {
|
||||
log(`Wasm Runtime initialized (preRun -> onRuntimeInitialized) in ${(Date.now() - moduleLoadStart) / 1000} secs`);
|
||||
}
|
||||
};
|
||||
</script>
|
||||
<script src="bergamot-translator-worker.js"></script>
|
||||
</body>
|
||||
</html>
|
40
wasm/test_page/helper.js
Normal file
40
wasm/test_page/helper.js
Normal file
@ -0,0 +1,40 @@
|
||||
/*
|
||||
* @author - Based of a file from Gist here: https://gist.github.com/1757658
|
||||
*
|
||||
* @modified - Mike Newell - it was on Gist so I figure I can use it
|
||||
*
|
||||
* @Description - Added support for a few more mime types including the new
|
||||
* .ogv, .webm, and .mp4 file types for HTML5 video.
|
||||
*
|
||||
*/
|
||||
|
||||
/*
|
||||
* @modified - Andre Natal - removed unused types for the purpose of this use
|
||||
case
|
||||
*/
|
||||
|
||||
Helper = {
|
||||
|
||||
types: {
|
||||
"wasm" : "application/wasm"
|
||||
, "js" : "application/javascript"
|
||||
, "html" : "text/html"
|
||||
, "htm" : "text/html"
|
||||
, "ico" : "image/vnd.microsoft.icon",
|
||||
},
|
||||
|
||||
getMime: function(u) {
|
||||
|
||||
var ext = this.getExt(u.pathname).replace('.', '');
|
||||
|
||||
return this.types[ext.toLowerCase()] || 'application/octet-stream';
|
||||
|
||||
},
|
||||
|
||||
getExt: function(path) {
|
||||
var i = path.lastIndexOf('.');
|
||||
|
||||
return (i < 0) ? '' : path.substr(i);
|
||||
}
|
||||
|
||||
};
|
391
wasm/test_page/package-lock.json
generated
Normal file
391
wasm/test_page/package-lock.json
generated
Normal file
@ -0,0 +1,391 @@
|
||||
{
|
||||
"requires": true,
|
||||
"lockfileVersion": 1,
|
||||
"dependencies": {
|
||||
"accepts": {
|
||||
"version": "1.3.7",
|
||||
"resolved": "https://registry.npmjs.org/accepts/-/accepts-1.3.7.tgz",
|
||||
"integrity": "sha512-Il80Qs2WjYlJIBNzNkK6KYqlVMTbZLXgHx2oT0pU/fjRHyEp+PEfEPY0R3WCwAGVOtauxh1hOxNgIf5bv7dQpA==",
|
||||
"requires": {
|
||||
"mime-types": "~2.1.24",
|
||||
"negotiator": "0.6.2"
|
||||
}
|
||||
},
|
||||
"array-flatten": {
|
||||
"version": "1.1.1",
|
||||
"resolved": "https://registry.npmjs.org/array-flatten/-/array-flatten-1.1.1.tgz",
|
||||
"integrity": "sha1-ml9pkFGx5wczKPKgCJaLZOopVdI="
|
||||
},
|
||||
"body-parser": {
|
||||
"version": "1.19.0",
|
||||
"resolved": "https://registry.npmjs.org/body-parser/-/body-parser-1.19.0.tgz",
|
||||
"integrity": "sha512-dhEPs72UPbDnAQJ9ZKMNTP6ptJaionhP5cBb541nXPlW60Jepo9RV/a4fX4XWW9CuFNK22krhrj1+rgzifNCsw==",
|
||||
"requires": {
|
||||
"bytes": "3.1.0",
|
||||
"content-type": "~1.0.4",
|
||||
"debug": "2.6.9",
|
||||
"depd": "~1.1.2",
|
||||
"http-errors": "1.7.2",
|
||||
"iconv-lite": "0.4.24",
|
||||
"on-finished": "~2.3.0",
|
||||
"qs": "6.7.0",
|
||||
"raw-body": "2.4.0",
|
||||
"type-is": "~1.6.17"
|
||||
}
|
||||
},
|
||||
"bytes": {
|
||||
"version": "3.1.0",
|
||||
"resolved": "https://registry.npmjs.org/bytes/-/bytes-3.1.0.tgz",
|
||||
"integrity": "sha512-zauLjrfCG+xvoyaqLoV8bLVXXNGC4JqlxFCutSDWA6fJrTo2ZuvLYTqZ7aHBLZSMOopbzwv8f+wZcVzfVTI2Dg=="
|
||||
},
|
||||
"content-disposition": {
|
||||
"version": "0.5.3",
|
||||
"resolved": "https://registry.npmjs.org/content-disposition/-/content-disposition-0.5.3.tgz",
|
||||
"integrity": "sha512-ExO0774ikEObIAEV9kDo50o+79VCUdEB6n6lzKgGwupcVeRlhrj3qGAfwq8G6uBJjkqLrhT0qEYFcWng8z1z0g==",
|
||||
"requires": {
|
||||
"safe-buffer": "5.1.2"
|
||||
}
|
||||
},
|
||||
"content-type": {
|
||||
"version": "1.0.4",
|
||||
"resolved": "https://registry.npmjs.org/content-type/-/content-type-1.0.4.tgz",
|
||||
"integrity": "sha512-hIP3EEPs8tB9AT1L+NUqtwOAps4mk2Zob89MWXMHjHWg9milF/j4osnnQLXBCBFBk/tvIG/tUc9mOUJiPBhPXA=="
|
||||
},
|
||||
"cookie": {
|
||||
"version": "0.4.0",
|
||||
"resolved": "https://registry.npmjs.org/cookie/-/cookie-0.4.0.tgz",
|
||||
"integrity": "sha512-+Hp8fLp57wnUSt0tY0tHEXh4voZRDnoIrZPqlo3DPiI4y9lwg/jqx+1Om94/W6ZaPDOUbnjOt/99w66zk+l1Xg=="
|
||||
},
|
||||
"cookie-signature": {
|
||||
"version": "1.0.6",
|
||||
"resolved": "https://registry.npmjs.org/cookie-signature/-/cookie-signature-1.0.6.tgz",
|
||||
"integrity": "sha1-4wOogrNCzD7oylE6eZmXNNqzriw="
|
||||
},
|
||||
"cors": {
|
||||
"version": "2.8.5",
|
||||
"resolved": "https://registry.npmjs.org/cors/-/cors-2.8.5.tgz",
|
||||
"integrity": "sha512-KIHbLJqu73RGr/hnbrO9uBeixNGuvSQjul/jdFvS/KFSIH1hWVd1ng7zOHx+YrEfInLG7q4n6GHQ9cDtxv/P6g==",
|
||||
"requires": {
|
||||
"object-assign": "^4",
|
||||
"vary": "^1"
|
||||
}
|
||||
},
|
||||
"debug": {
|
||||
"version": "2.6.9",
|
||||
"resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz",
|
||||
"integrity": "sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA==",
|
||||
"requires": {
|
||||
"ms": "2.0.0"
|
||||
}
|
||||
},
|
||||
"depd": {
|
||||
"version": "1.1.2",
|
||||
"resolved": "https://registry.npmjs.org/depd/-/depd-1.1.2.tgz",
|
||||
"integrity": "sha1-m81S4UwJd2PnSbJ0xDRu0uVgtak="
|
||||
},
|
||||
"destroy": {
|
||||
"version": "1.0.4",
|
||||
"resolved": "https://registry.npmjs.org/destroy/-/destroy-1.0.4.tgz",
|
||||
"integrity": "sha1-l4hXRCxEdJ5CBmE+N5RiBYJqvYA="
|
||||
},
|
||||
"ee-first": {
|
||||
"version": "1.1.1",
|
||||
"resolved": "https://registry.npmjs.org/ee-first/-/ee-first-1.1.1.tgz",
|
||||
"integrity": "sha1-WQxhFWsK4vTwJVcyoViyZrxWsh0="
|
||||
},
|
||||
"encodeurl": {
|
||||
"version": "1.0.2",
|
||||
"resolved": "https://registry.npmjs.org/encodeurl/-/encodeurl-1.0.2.tgz",
|
||||
"integrity": "sha1-rT/0yG7C0CkyL1oCw6mmBslbP1k="
|
||||
},
|
||||
"escape-html": {
|
||||
"version": "1.0.3",
|
||||
"resolved": "https://registry.npmjs.org/escape-html/-/escape-html-1.0.3.tgz",
|
||||
"integrity": "sha1-Aljq5NPQwJdN4cFpGI7wBR0dGYg="
|
||||
},
|
||||
"etag": {
|
||||
"version": "1.8.1",
|
||||
"resolved": "https://registry.npmjs.org/etag/-/etag-1.8.1.tgz",
|
||||
"integrity": "sha1-Qa4u62XvpiJorr/qg6x9eSmbCIc="
|
||||
},
|
||||
"express": {
|
||||
"version": "4.17.1",
|
||||
"resolved": "https://registry.npmjs.org/express/-/express-4.17.1.tgz",
|
||||
"integrity": "sha512-mHJ9O79RqluphRrcw2X/GTh3k9tVv8YcoyY4Kkh4WDMUYKRZUq0h1o0w2rrrxBqM7VoeUVqgb27xlEMXTnYt4g==",
|
||||
"requires": {
|
||||
"accepts": "~1.3.7",
|
||||
"array-flatten": "1.1.1",
|
||||
"body-parser": "1.19.0",
|
||||
"content-disposition": "0.5.3",
|
||||
"content-type": "~1.0.4",
|
||||
"cookie": "0.4.0",
|
||||
"cookie-signature": "1.0.6",
|
||||
"debug": "2.6.9",
|
||||
"depd": "~1.1.2",
|
||||
"encodeurl": "~1.0.2",
|
||||
"escape-html": "~1.0.3",
|
||||
"etag": "~1.8.1",
|
||||
"finalhandler": "~1.1.2",
|
||||
"fresh": "0.5.2",
|
||||
"merge-descriptors": "1.0.1",
|
||||
"methods": "~1.1.2",
|
||||
"on-finished": "~2.3.0",
|
||||
"parseurl": "~1.3.3",
|
||||
"path-to-regexp": "0.1.7",
|
||||
"proxy-addr": "~2.0.5",
|
||||
"qs": "6.7.0",
|
||||
"range-parser": "~1.2.1",
|
||||
"safe-buffer": "5.1.2",
|
||||
"send": "0.17.1",
|
||||
"serve-static": "1.14.1",
|
||||
"setprototypeof": "1.1.1",
|
||||
"statuses": "~1.5.0",
|
||||
"type-is": "~1.6.18",
|
||||
"utils-merge": "1.0.1",
|
||||
"vary": "~1.1.2"
|
||||
}
|
||||
},
|
||||
"finalhandler": {
|
||||
"version": "1.1.2",
|
||||
"resolved": "https://registry.npmjs.org/finalhandler/-/finalhandler-1.1.2.tgz",
|
||||
"integrity": "sha512-aAWcW57uxVNrQZqFXjITpW3sIUQmHGG3qSb9mUah9MgMC4NeWhNOlNjXEYq3HjRAvL6arUviZGGJsBg6z0zsWA==",
|
||||
"requires": {
|
||||
"debug": "2.6.9",
|
||||
"encodeurl": "~1.0.2",
|
||||
"escape-html": "~1.0.3",
|
||||
"on-finished": "~2.3.0",
|
||||
"parseurl": "~1.3.3",
|
||||
"statuses": "~1.5.0",
|
||||
"unpipe": "~1.0.0"
|
||||
}
|
||||
},
|
||||
"forwarded": {
|
||||
"version": "0.1.2",
|
||||
"resolved": "https://registry.npmjs.org/forwarded/-/forwarded-0.1.2.tgz",
|
||||
"integrity": "sha1-mMI9qxF1ZXuMBXPozszZGw/xjIQ="
|
||||
},
|
||||
"fresh": {
|
||||
"version": "0.5.2",
|
||||
"resolved": "https://registry.npmjs.org/fresh/-/fresh-0.5.2.tgz",
|
||||
"integrity": "sha1-PYyt2Q2XZWn6g1qx+OSyOhBWBac="
|
||||
},
|
||||
"http-errors": {
|
||||
"version": "1.7.2",
|
||||
"resolved": "https://registry.npmjs.org/http-errors/-/http-errors-1.7.2.tgz",
|
||||
"integrity": "sha512-uUQBt3H/cSIVfch6i1EuPNy/YsRSOUBXTVfZ+yR7Zjez3qjBz6i9+i4zjNaoqcoFVI4lQJ5plg63TvGfRSDCRg==",
|
||||
"requires": {
|
||||
"depd": "~1.1.2",
|
||||
"inherits": "2.0.3",
|
||||
"setprototypeof": "1.1.1",
|
||||
"statuses": ">= 1.5.0 < 2",
|
||||
"toidentifier": "1.0.0"
|
||||
}
|
||||
},
|
||||
"iconv-lite": {
|
||||
"version": "0.4.24",
|
||||
"resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.4.24.tgz",
|
||||
"integrity": "sha512-v3MXnZAcvnywkTUEZomIActle7RXXeedOR31wwl7VlyoXO4Qi9arvSenNQWne1TcRwhCL1HwLI21bEqdpj8/rA==",
|
||||
"requires": {
|
||||
"safer-buffer": ">= 2.1.2 < 3"
|
||||
}
|
||||
},
|
||||
"inherits": {
|
||||
"version": "2.0.3",
|
||||
"resolved": "https://registry.npmjs.org/inherits/-/inherits-2.0.3.tgz",
|
||||
"integrity": "sha1-Yzwsg+PaQqUC9SRmAiSA9CCCYd4="
|
||||
},
|
||||
"ipaddr.js": {
|
||||
"version": "1.9.1",
|
||||
"resolved": "https://registry.npmjs.org/ipaddr.js/-/ipaddr.js-1.9.1.tgz",
|
||||
"integrity": "sha512-0KI/607xoxSToH7GjN1FfSbLoU0+btTicjsQSWQlh/hZykN8KpmMf7uYwPW3R+akZ6R/w18ZlXSHBYXiYUPO3g=="
|
||||
},
|
||||
"media-typer": {
|
||||
"version": "0.3.0",
|
||||
"resolved": "https://registry.npmjs.org/media-typer/-/media-typer-0.3.0.tgz",
|
||||
"integrity": "sha1-hxDXrwqmJvj/+hzgAWhUUmMlV0g="
|
||||
},
|
||||
"merge-descriptors": {
|
||||
"version": "1.0.1",
|
||||
"resolved": "https://registry.npmjs.org/merge-descriptors/-/merge-descriptors-1.0.1.tgz",
|
||||
"integrity": "sha1-sAqqVW3YtEVoFQ7J0blT8/kMu2E="
|
||||
},
|
||||
"methods": {
|
||||
"version": "1.1.2",
|
||||
"resolved": "https://registry.npmjs.org/methods/-/methods-1.1.2.tgz",
|
||||
"integrity": "sha1-VSmk1nZUE07cxSZmVoNbD4Ua/O4="
|
||||
},
|
||||
"mime": {
|
||||
"version": "1.6.0",
|
||||
"resolved": "https://registry.npmjs.org/mime/-/mime-1.6.0.tgz",
|
||||
"integrity": "sha512-x0Vn8spI+wuJ1O6S7gnbaQg8Pxh4NNHb7KSINmEWKiPE4RKOplvijn+NkmYmmRgP68mc70j2EbeTFRsrswaQeg=="
|
||||
},
|
||||
"mime-db": {
|
||||
"version": "1.45.0",
|
||||
"resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.45.0.tgz",
|
||||
"integrity": "sha512-CkqLUxUk15hofLoLyljJSrukZi8mAtgd+yE5uO4tqRZsdsAJKv0O+rFMhVDRJgozy+yG6md5KwuXhD4ocIoP+w=="
|
||||
},
|
||||
"mime-types": {
|
||||
"version": "2.1.28",
|
||||
"resolved": "https://registry.npmjs.org/mime-types/-/mime-types-2.1.28.tgz",
|
||||
"integrity": "sha512-0TO2yJ5YHYr7M2zzT7gDU1tbwHxEUWBCLt0lscSNpcdAfFyJOVEpRYNS7EXVcTLNj/25QO8gulHC5JtTzSE2UQ==",
|
||||
"requires": {
|
||||
"mime-db": "1.45.0"
|
||||
}
|
||||
},
|
||||
"ms": {
|
||||
"version": "2.0.0",
|
||||
"resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz",
|
||||
"integrity": "sha1-VgiurfwAvmwpAd9fmGF4jeDVl8g="
|
||||
},
|
||||
"negotiator": {
|
||||
"version": "0.6.2",
|
||||
"resolved": "https://registry.npmjs.org/negotiator/-/negotiator-0.6.2.tgz",
|
||||
"integrity": "sha512-hZXc7K2e+PgeI1eDBe/10Ard4ekbfrrqG8Ep+8Jmf4JID2bNg7NvCPOZN+kfF574pFQI7mum2AUqDidoKqcTOw=="
|
||||
},
|
||||
"nocache": {
|
||||
"version": "2.1.0",
|
||||
"resolved": "https://registry.npmjs.org/nocache/-/nocache-2.1.0.tgz",
|
||||
"integrity": "sha512-0L9FvHG3nfnnmaEQPjT9xhfN4ISk0A8/2j4M37Np4mcDesJjHgEUfgPhdCyZuFI954tjokaIj/A3NdpFNdEh4Q=="
|
||||
},
|
||||
"object-assign": {
|
||||
"version": "4.1.1",
|
||||
"resolved": "https://registry.npmjs.org/object-assign/-/object-assign-4.1.1.tgz",
|
||||
"integrity": "sha1-IQmtx5ZYh8/AXLvUQsrIv7s2CGM="
|
||||
},
|
||||
"on-finished": {
|
||||
"version": "2.3.0",
|
||||
"resolved": "https://registry.npmjs.org/on-finished/-/on-finished-2.3.0.tgz",
|
||||
"integrity": "sha1-IPEzZIGwg811M3mSoWlxqi2QaUc=",
|
||||
"requires": {
|
||||
"ee-first": "1.1.1"
|
||||
}
|
||||
},
|
||||
"parseurl": {
|
||||
"version": "1.3.3",
|
||||
"resolved": "https://registry.npmjs.org/parseurl/-/parseurl-1.3.3.tgz",
|
||||
"integrity": "sha512-CiyeOxFT/JZyN5m0z9PfXw4SCBJ6Sygz1Dpl0wqjlhDEGGBP1GnsUVEL0p63hoG1fcj3fHynXi9NYO4nWOL+qQ=="
|
||||
},
|
||||
"path-to-regexp": {
|
||||
"version": "0.1.7",
|
||||
"resolved": "https://registry.npmjs.org/path-to-regexp/-/path-to-regexp-0.1.7.tgz",
|
||||
"integrity": "sha1-32BBeABfUi8V60SQ5yR6G/qmf4w="
|
||||
},
|
||||
"proxy-addr": {
|
||||
"version": "2.0.6",
|
||||
"resolved": "https://registry.npmjs.org/proxy-addr/-/proxy-addr-2.0.6.tgz",
|
||||
"integrity": "sha512-dh/frvCBVmSsDYzw6n926jv974gddhkFPfiN8hPOi30Wax25QZyZEGveluCgliBnqmuM+UJmBErbAUFIoDbjOw==",
|
||||
"requires": {
|
||||
"forwarded": "~0.1.2",
|
||||
"ipaddr.js": "1.9.1"
|
||||
}
|
||||
},
|
||||
"qs": {
|
||||
"version": "6.7.0",
|
||||
"resolved": "https://registry.npmjs.org/qs/-/qs-6.7.0.tgz",
|
||||
"integrity": "sha512-VCdBRNFTX1fyE7Nb6FYoURo/SPe62QCaAyzJvUjwRaIsc+NePBEniHlvxFmmX56+HZphIGtV0XeCirBtpDrTyQ=="
|
||||
},
|
||||
"range-parser": {
|
||||
"version": "1.2.1",
|
||||
"resolved": "https://registry.npmjs.org/range-parser/-/range-parser-1.2.1.tgz",
|
||||
"integrity": "sha512-Hrgsx+orqoygnmhFbKaHE6c296J+HTAQXoxEF6gNupROmmGJRoyzfG3ccAveqCBrwr/2yxQ5BVd/GTl5agOwSg=="
|
||||
},
|
||||
"raw-body": {
|
||||
"version": "2.4.0",
|
||||
"resolved": "https://registry.npmjs.org/raw-body/-/raw-body-2.4.0.tgz",
|
||||
"integrity": "sha512-4Oz8DUIwdvoa5qMJelxipzi/iJIi40O5cGV1wNYp5hvZP8ZN0T+jiNkL0QepXs+EsQ9XJ8ipEDoiH70ySUJP3Q==",
|
||||
"requires": {
|
||||
"bytes": "3.1.0",
|
||||
"http-errors": "1.7.2",
|
||||
"iconv-lite": "0.4.24",
|
||||
"unpipe": "1.0.0"
|
||||
}
|
||||
},
|
||||
"safe-buffer": {
|
||||
"version": "5.1.2",
|
||||
"resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.1.2.tgz",
|
||||
"integrity": "sha512-Gd2UZBJDkXlY7GbJxfsE8/nvKkUEU1G38c1siN6QP6a9PT9MmHB8GnpscSmMJSoF8LOIrt8ud/wPtojys4G6+g=="
|
||||
},
|
||||
"safer-buffer": {
|
||||
"version": "2.1.2",
|
||||
"resolved": "https://registry.npmjs.org/safer-buffer/-/safer-buffer-2.1.2.tgz",
|
||||
"integrity": "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg=="
|
||||
},
|
||||
"send": {
|
||||
"version": "0.17.1",
|
||||
"resolved": "https://registry.npmjs.org/send/-/send-0.17.1.tgz",
|
||||
"integrity": "sha512-BsVKsiGcQMFwT8UxypobUKyv7irCNRHk1T0G680vk88yf6LBByGcZJOTJCrTP2xVN6yI+XjPJcNuE3V4fT9sAg==",
|
||||
"requires": {
|
||||
"debug": "2.6.9",
|
||||
"depd": "~1.1.2",
|
||||
"destroy": "~1.0.4",
|
||||
"encodeurl": "~1.0.2",
|
||||
"escape-html": "~1.0.3",
|
||||
"etag": "~1.8.1",
|
||||
"fresh": "0.5.2",
|
||||
"http-errors": "~1.7.2",
|
||||
"mime": "1.6.0",
|
||||
"ms": "2.1.1",
|
||||
"on-finished": "~2.3.0",
|
||||
"range-parser": "~1.2.1",
|
||||
"statuses": "~1.5.0"
|
||||
},
|
||||
"dependencies": {
|
||||
"ms": {
|
||||
"version": "2.1.1",
|
||||
"resolved": "https://registry.npmjs.org/ms/-/ms-2.1.1.tgz",
|
||||
"integrity": "sha512-tgp+dl5cGk28utYktBsrFqA7HKgrhgPsg6Z/EfhWI4gl1Hwq8B/GmY/0oXZ6nF8hDVesS/FpnYaD/kOWhYQvyg=="
|
||||
}
|
||||
}
|
||||
},
|
||||
"serve-static": {
|
||||
"version": "1.14.1",
|
||||
"resolved": "https://registry.npmjs.org/serve-static/-/serve-static-1.14.1.tgz",
|
||||
"integrity": "sha512-JMrvUwE54emCYWlTI+hGrGv5I8dEwmco/00EvkzIIsR7MqrHonbD9pO2MOfFnpFntl7ecpZs+3mW+XbQZu9QCg==",
|
||||
"requires": {
|
||||
"encodeurl": "~1.0.2",
|
||||
"escape-html": "~1.0.3",
|
||||
"parseurl": "~1.3.3",
|
||||
"send": "0.17.1"
|
||||
}
|
||||
},
|
||||
"setprototypeof": {
|
||||
"version": "1.1.1",
|
||||
"resolved": "https://registry.npmjs.org/setprototypeof/-/setprototypeof-1.1.1.tgz",
|
||||
"integrity": "sha512-JvdAWfbXeIGaZ9cILp38HntZSFSo3mWg6xGcJJsd+d4aRMOqauag1C63dJfDw7OaMYwEbHMOxEZ1lqVRYP2OAw=="
|
||||
},
|
||||
"statuses": {
|
||||
"version": "1.5.0",
|
||||
"resolved": "https://registry.npmjs.org/statuses/-/statuses-1.5.0.tgz",
|
||||
"integrity": "sha1-Fhx9rBd2Wf2YEfQ3cfqZOBR4Yow="
|
||||
},
|
||||
"toidentifier": {
|
||||
"version": "1.0.0",
|
||||
"resolved": "https://registry.npmjs.org/toidentifier/-/toidentifier-1.0.0.tgz",
|
||||
"integrity": "sha512-yaOH/Pk/VEhBWWTlhI+qXxDFXlejDGcQipMlyxda9nthulaxLZUNcUqFxokp0vcYnvteJln5FNQDRrxj3YcbVw=="
|
||||
},
|
||||
"type-is": {
|
||||
"version": "1.6.18",
|
||||
"resolved": "https://registry.npmjs.org/type-is/-/type-is-1.6.18.tgz",
|
||||
"integrity": "sha512-TkRKr9sUTxEH8MdfuCSP7VizJyzRNMjj2J2do2Jr3Kym598JVdEksuzPQCnlFPW4ky9Q+iA+ma9BGm06XQBy8g==",
|
||||
"requires": {
|
||||
"media-typer": "0.3.0",
|
||||
"mime-types": "~2.1.24"
|
||||
}
|
||||
},
|
||||
"unpipe": {
|
||||
"version": "1.0.0",
|
||||
"resolved": "https://registry.npmjs.org/unpipe/-/unpipe-1.0.0.tgz",
|
||||
"integrity": "sha1-sr9O6FFKrmFltIF4KdIbLvSZBOw="
|
||||
},
|
||||
"utils-merge": {
|
||||
"version": "1.0.1",
|
||||
"resolved": "https://registry.npmjs.org/utils-merge/-/utils-merge-1.0.1.tgz",
|
||||
"integrity": "sha1-n5VxD1CiZ5R7LMwSR0HBAoQn5xM="
|
||||
},
|
||||
"vary": {
|
||||
"version": "1.1.2",
|
||||
"resolved": "https://registry.npmjs.org/vary/-/vary-1.1.2.tgz",
|
||||
"integrity": "sha1-IpnwLG3tMNSllhsLn3RSShj2NPw="
|
||||
}
|
||||
}
|
||||
}
|
7
wasm/test_page/package.json
Normal file
7
wasm/test_page/package.json
Normal file
@ -0,0 +1,7 @@
|
||||
{
|
||||
"dependencies": {
|
||||
"cors": "^2.8.5",
|
||||
"express": "^4.17.1",
|
||||
"nocache": "^2.1.0"
|
||||
}
|
||||
}
|
8
wasm/test_page/start_server.sh
Normal file
8
wasm/test_page/start_server.sh
Normal file
@ -0,0 +1,8 @@
|
||||
#!/bin/bash

# Copy the generated bergamot-translator-worker artifacts from the wasm build
# directory into the current directory, install the npm dependencies and start
# the test page's HTTP server.
# All paths are relative, so the script depends on the directory it is run from.

for artifact in data js wasm worker.js; do
  cp "../../build-wasm/wasm/bergamot-translator-worker.${artifact}" .
done

npm install
node bergamot-httpserver.js
|
Loading…
Reference in New Issue
Block a user