Merge pull request #30 from browsermt/jp/absorb-batch-translator

Sync thread adjustments with wasm-integration
This commit is contained in:
abhi-agg 2021-02-23 16:32:26 +01:00 committed by GitHub
commit c0efc21c6e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
36 changed files with 1065 additions and 680 deletions

14
.gitignore vendored
View File

@ -2,6 +2,20 @@
*.swp
*.swo
# CMake
CMakeLists.txt.user
CMakeCache.txt
CMakeFiles
CMakeScripts
Testing
Makefile
cmake_install.cmake
install_manifest.txt
compile_commands.json
CTestTestfile.cmake
_deps
wasm/test_page/node_modules
build-*
models

@ -1 +1 @@
Subproject commit 16864967b7313e76e3b107d11ec39d8d5cedff1e
Subproject commit 432208826ee27e7b3984b53774b1a16d74256d77

View File

@ -9,28 +9,51 @@ project(bergamot_translator CXX C)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
include(CMakeDependentOption)
# Project specific cmake options
option(COMPILE_WASM "Compile for WASM" OFF)
option(COMPILE_THREAD_VARIANT "Compile with thread support" OFF)
option(USE_WASM_COMPATIBLE_MARIAN "Use wasm compatible marian backend" ON)
CMAKE_DEPENDENT_OPTION(COMPILE_THREAD_VARIANT "Compile the project with thread support" OFF
"USE_WASM_COMPATIBLE_MARIAN" ON)
SET(PACKAGE_DIR "" CACHE STRING "Directory including all the files to be packaged (pre-loaded) in wasm builds")
# Set marian (3rd party submodule) cmake options to compile for this project
SET(COMPILE_CUDA OFF CACHE BOOL "Compile GPU version")
SET(USE_SENTENCEPIECE ON CACHE BOOL "Download and compile SentencePiece")
SET(USE_STATIC_LIBS ON CACHE BOOL "Link statically against non-system libs")
SET(USE_MKL OFF CACHE BOOL "Compile with MKL support")
SET(COMPILE_DECODER_ONLY ON CACHE BOOL "Compile marian-decoder only")
SET(COMPILE_WITH_PTHREADS OFF CACHE BOOL "Compile with pthreads support")
SET(USE_WASM_COMPATIBLE_BLAS ON CACHE BOOL "Compile with a WASM compatible blas for decoder only builds")
SET(COMPILE_LIBRARY_ONLY ON CACHE BOOL "Build only the Marian library and exclude all executables.")
SET(COMPILE_WITHOUT_EXCEPTIONS ON CACHE BOOL "Compile without exceptions")
if(COMPILE_WASM)
# Set WORMHOLE to ON for marian whenever compiling for wasm platform
SET(WORMHOLE ON CACHE BOOL "Use WASM wormhole in intgemm https://bugzilla.mozilla.org/show_bug.cgi?id=1672160")
if (USE_WASM_COMPATIBLE_MARIAN)
# If using wasm compatible marian then set following flags
SET(USE_MKL OFF CACHE BOOL "Compile with MKL support")
SET(COMPILE_DECODER_ONLY ON CACHE BOOL "Compile marian-decoder only")
SET(COMPILE_WITH_PTHREADS OFF CACHE BOOL "Compile with pthreads support")
SET(USE_WASM_COMPATIBLE_BLAS ON CACHE BOOL "Compile with a WASM compatible blas for decoder only builds")
SET(COMPILE_WITHOUT_EXCEPTIONS ON CACHE BOOL "Compile without exceptions")
if(COMPILE_WASM)
# Set WORMHOLE to ON for marian whenever compiling for wasm platform
SET(WORMHOLE ON CACHE BOOL "Use WASM wormhole in intgemm https://bugzilla.mozilla.org/show_bug.cgi?id=1672160")
endif()
endif()
# Set ssplit (3rd party submodule) cmake options to compile for this project
SET(USE_INTERNAL_PCRE2 ON CACHE BOOL "Use internal PCRE2 instead of system PCRE2")
execute_process(COMMAND git submodule update --init --recursive --no-fetch
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
# Documentation: https://cliutils.gitlab.io/modern-cmake/chapters/projects/submodule.html
# Ensures the submodules are set correctly during a build.
find_package(Git QUIET)
if(GIT_FOUND AND EXISTS "${PROJECT_SOURCE_DIR}/.git")
# Update submodules as needed
option(GIT_SUBMODULE "Check submodules during build" ON)
if(GIT_SUBMODULE)
message(STATUS "Submodule update")
execute_process(COMMAND ${GIT_EXECUTABLE} submodule update --init --recursive
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
RESULT_VARIABLE GIT_SUBMOD_RESULT)
if(NOT GIT_SUBMOD_RESULT EQUAL "0")
message(FATAL_ERROR "git submodule update --init failed with ${GIT_SUBMOD_RESULT}, please checkout submodules")
endif()
endif()
endif()
if(NOT COMPILE_WASM)
# Set BUILD_ARCH to native only while compiling for non wasm platform

View File

@ -3,3 +3,6 @@ target_link_libraries(bergamot-translator-app PRIVATE bergamot-translator)
add_executable(service-cli main-mts.cpp)
target_link_libraries(service-cli PRIVATE bergamot-translator)
add_executable(marian-decoder-new marian-decoder-new.cpp)
target_link_libraries(marian-decoder-new PRIVATE bergamot-translator)

View File

@ -7,8 +7,8 @@
#include "common/utils.h"
#include "marian.h"
#include "translator/parser.h"
#include "translator/response.h"
#include "translator/service.h"
#include "translator/translation_result.h"
int main(int argc, char *argv[]) {
auto cp = marian::bergamot::createConfigParser();
@ -19,27 +19,13 @@ int main(int argc, char *argv[]) {
std::ostringstream std_input;
std_input << std::cin.rdbuf();
std::string input = std_input.str();
using marian::bergamot::TranslationResult;
using marian::bergamot::Response;
// Wait on future until TranslationResult is complete
std::future<TranslationResult> translation_result_future =
service.translate(std::move(input));
translation_result_future.wait();
const TranslationResult &translation_result = translation_result_future.get();
std::cout << "service-cli [Source text]: ";
std::cout << translation_result.getOriginalText() << std::endl;
std::cout << "service-cli [Translated text]: ";
std::cout << translation_result.getTranslatedText() << std::endl;
// Obtain sentenceMappings and print them as Proof of Concept.
const TranslationResult::SentenceMappings &sentenceMappings =
translation_result.getSentenceMappings();
for (auto &p : sentenceMappings) {
std::cout << "service-cli [src] " << p.first << "\n";
std::cout << "service-cli [tgt] " << p.second << "\n";
}
// Wait on future until Response is complete
std::future<Response> responseFuture = service.translate(std::move(input));
responseFuture.wait();
Response response = responseFuture.get();
std::cout << response.translation() << std::endl;
// Stop Service.
service.stop();

View File

@ -0,0 +1,61 @@
#include <cstdlib>
#include <future>
#include <iostream>
#include <sstream>
#include "common/definitions.h"
#include "common/timer.h"
#include "common/utils.h"
#include "marian.h"
#include "translator/history.h"
#include "translator/output_collector.h"
#include "translator/output_printer.h"
#include "translator/parser.h"
#include "translator/response.h"
#include "translator/service.h"
// Prints translation histories through marian's OutputCollector/OutputPrinter,
// mimicking upstream marian-decoder's output path ((n-)best translations keyed
// by line number).
void marian_decoder_minimal(const marian::Histories &histories,
marian::Ptr<marian::Vocab const> targetVocab,
marian::Ptr<marian::Options> options) {
// Whether to emit the full n-best list rather than only the best hypothesis.
bool doNbest = options->get<bool>("n-best");
auto collector =
marian::New<marian::OutputCollector>(options->get<std::string>("output"));
// There is a dependency of vocabs here.
auto printer = marian::New<marian::OutputPrinter>(options, targetVocab);
if (options->get<bool>("quiet-translation"))
collector->setPrintingStrategy(marian::New<marian::QuietPrinting>());
for (auto &history : histories) {
std::stringstream best1;
std::stringstream bestn;
printer->print(history, best1, bestn);
// Keyed by line number so the collector can restore input order.
collector->Write((long)history->getLineNum(), best1.str(), bestn.str(),
doNbest);
}
}
// Reads the whole of stdin as one blob, translates it via bergamot's Service,
// and prints results through marian_decoder_minimal — intended for
// benchmarking against the upstream marian-decoder.
int main(int argc, char *argv[]) {
auto cp = marian::bergamot::createConfigParser();
auto options = cp.parseOptions(argc, argv, true);
// Timer started before Service construction; the logged wall time includes
// model/graph setup.
marian::timer::Timer decoderTimer;
marian::bergamot::Service service(options);
// Read a large input text blob from stdin
std::ostringstream std_input;
std_input << std::cin.rdbuf();
std::string input = std_input.str();
using marian::bergamot::Response;
// Wait on future until Response is complete
std::future<Response> responseFuture = service.translate(std::move(input));
responseFuture.wait();
const Response &response = responseFuture.get();
marian_decoder_minimal(response.histories(), service.targetVocab(), options);
LOG(info, "Total time: {:.5f}s wall", decoderTimer.elapsed());
service.stop();
return 0;
}

85
doc/marian-integration.md Normal file
View File

@ -0,0 +1,85 @@
# Marian Integration
This document summarizes the minimal build instructions for developing the
marian code powering bergamot-translator.
## Build Instructions
```
$ git clone https://github.com/browsermt/bergamot-translator
$ cd bergamot-translator
$ mkdir build
$ cd build
$ cmake .. -DUSE_WASM_COMPATIBLE_MARIAN=off -DCMAKE_BUILD_TYPE=Release
$ make -j
```
The build will generate the library that can be linked to any project. All the
public header files are specified in `src` folder.
## Command line apps
The following executables are created by the build:
1. `app/service-cli`: Extends marian with the capability to work with
   string_views. `service-cli` exists to check whether the underlying code
   works without the integration.
2. `app/bergamot-translator-app`: App which integrates service-cli's
   functionality into the translator-agnostic API specified as part of the
   project. Integration failures are detected when the same arguments work with
   `service-cli` but not with `bergamot-translator-app`.
3. `app/marian-decoder-new`: Helper executable to conveniently benchmark new
implementation with the optimized upstream marian-decoder.
The models required to run the command-line are available at
[data.statmt.org/bergamot/models/](http://data.statmt.org/bergamot/models/).
The following example uses an English to German tiny11 student model, available
at:
* [data.statmt.org/bergamot/models/deen/ende.student.tiny11.tar.gz](http://data.statmt.org/bergamot/models/deen/ende.student.tiny11.tar.gz)
<details>
<summary> Example run of commandline: Click to expand </summary>
<p>
```bash
MODEL_DIR=... # path to where the model-files are.
ARGS=(
-m $MODEL_DIR/model.intgemm.alphas.bin # Path to model file.
--vocabs
$MODEL_DIR/vocab.deen.spm # source-vocabulary
$MODEL_DIR/vocab.deen.spm # target-vocabulary
# The following increases speed through one-best-decoding, shortlist and quantization.
--beam-size 1 --skip-cost --shortlist $MODEL_DIR/lex.s2t.gz 50 50 --int8shiftAlphaAll
# Number of CPU threads (workers to launch). Parallelizes over cores and improves speed.
# A value of 0 allows a path with no worker thread-launches and a single-thread.
--cpu-threads 4
# Maximum size of a sentence allowed. If a sentence is above this length,
# it's broken into pieces of less than or equal to this size.
--max-length-break 1024
# Maximum number of tokens that can be fit in a batch. The optimal value
# for the parameter is dependent on hardware and can be obtained by running
# with variations and benchmarking.
--mini-batch-words 1024
# Three modes are supported
# - sentence: One sentence per line
# - paragraph: One paragraph per line.
# - wrapped_text: Paragraphs are separated by empty line.
--ssplit-mode paragraph
)
./app/service-cli "${ARGS[@]}" < path-to-input-file
./app/bergamot-translator-app "${ARGS[@]}" < path-to-input-file
```
</p>
</details>

View File

@ -3,13 +3,16 @@ add_library(bergamot-translator STATIC
TranslationModel.cpp
# Following files added from browsermt/mts@nuke
textops.cpp
text_processor.cpp
sentence_splitter.cpp
batch_translator.cpp
multifactor_priority.cpp
request.cpp
service.cpp
batcher.cpp
translation_result.cpp
response.cpp
batch.cpp
sentence_ranges.cpp
)
if (COMPILE_DECODER_ONLY)
# A dirty hack because of marian's bad cmake practices
@ -34,3 +37,4 @@ target_include_directories(bergamot-translator
PRIVATE ${CMAKE_SOURCE_DIR}
PUBLIC ${CMAKE_SOURCE_DIR}/src)

View File

@ -14,9 +14,8 @@
// All local project includes
#include "TranslationModel.h"
#include "translator/service.h"
#include "translator/parser.h"
#include "translator/service.h"
std::shared_ptr<marian::Options> parseOptions(const std::string &config) {
marian::Options options;
@ -70,20 +69,27 @@ TranslationModel::translate(std::vector<std::string> &&texts,
// Collect future as marian::bergamot::TranslationResult
auto intermediate = service_.translate(std::move(text));
intermediate.wait();
auto mTranslationResult(std::move(intermediate.get()));
auto marianResponse(std::move(intermediate.get()));
// This mess because marian::string_view != std::string_view
std::string source, translation;
marian::bergamot::Response::SentenceMappings mSentenceMappings;
marianResponse.move(source, translation, mSentenceMappings);
// Convert to UnifiedAPI::TranslationResult
TranslationResult::SentenceMappings sentenceMappings;
for (auto &p : mTranslationResult.getSentenceMappings()) {
for (auto &p : mSentenceMappings) {
std::string_view src(p.first.data(), p.first.size()),
tgt(p.second.data(), p.second.size());
sentenceMappings.emplace_back(src, tgt);
}
// In place construction.
translationResults.emplace_back(std::move(mTranslationResult.source_),
std::move(mTranslationResult.translation_),
std::move(sentenceMappings));
translationResults.emplace_back(
std::move(source), // &&marianResponse.source_
std::move(translation), // &&marianResponse.translation_
std::move(sentenceMappings) // &&sentenceMappings
);
}
return translationResults;

View File

@ -24,7 +24,8 @@
*/
class TranslationModel : public AbstractTranslationModel {
public:
/* Construct the model using the model configuration options as yaml-formatted string
/* Construct the model using the model configuration options as yaml-formatted
* string
*/
TranslationModel(const std::string &config);

28
src/translator/batch.cpp Normal file
View File

@ -0,0 +1,28 @@
#include "batch.h"
#include "request.h"
namespace marian {
namespace bergamot {
// Logs summary statistics for this batch at info level: total token count,
// maximum sentence length and number of sentences.
void Batch::log() {
size_t numTokens{0}, maxLength{0};
for (auto &sentence : sentences_) {
numTokens += sentence.numTokens();
maxLength = std::max(maxLength, static_cast<size_t>(sentence.numTokens()));
}
LOG(info, "Batch(tokens={}, max-length={}, sentences_={})", numTokens,
maxLength, sentences_.size());
}
// Appends a sentence (by copy) to this batch.
void Batch::add(const RequestSentence &sentence) {
sentences_.push_back(sentence);
}
// Forwards the i-th history to the i-th sentence, triggering per-request
// completion bookkeeping. Assumes histories is index-aligned with sentences_
// and at least as large — not checked here; TODO(review) confirm at call-site.
void Batch::completeBatch(const Histories &histories) {
for (size_t i = 0; i < sentences_.size(); i++) {
sentences_[i].completeSentence(histories[i]);
}
}
} // namespace bergamot
} // namespace marian

52
src/translator/batch.h Normal file
View File

@ -0,0 +1,52 @@
#ifndef SRC_BERGAMOT_BATCH_H
#define SRC_BERGAMOT_BATCH_H
#include "request.h"
#include "translator/beam_search.h"
namespace marian {
namespace bergamot {
// A Batch groups RequestSentences to be translated together. A Batch
// constructed via poison() carries no sentences and acts as a shutdown
// signal for consumers reading batches from a producer-consumer queue.
class Batch {
public:
Batch() {}
// Drops all sentences; the poison_ flag is left untouched.
void clear() { sentences_.clear(); }
// Methods to construct and determine poison.
static Batch poison() {
Batch batch;
batch.poison_ = true;
return batch;
}
bool isPoison() const { return poison_; }
size_t size() const { return sentences_.size(); }
void add(const RequestSentence &sentence);
// Accessors to read from a Batch. For use in BatchTranslator (consumer on a
// PCQueue holding batches).
//
// sentences() are used to access sentences to construct marian internal
// batch.
const RequestSentences &sentences() { return sentences_; }
// On obtaining Histories after translating a batch, completeBatch can be
// called with Histories, which forwards the call to Request through
// RequestSentence and triggers completion, by setting the promised value to
// the future given to client.
void completeBatch(const Histories &histories);
// Convenience function to log batch-statistics. numTokens, max-length.
void log();
private:
bool poison_{false};
RequestSentences sentences_;
};
} // namespace bergamot
} // namespace marian
#endif // SRC_BERGAMOT_BATCH_H

View File

@ -1,27 +1,20 @@
#include "batch_translator.h"
#include "batch.h"
#include "common/logging.h"
#include "data/corpus.h"
#include "data/text_input.h"
#include "sanelogging.h"
#include "translator/beam_search.h"
namespace marian {
namespace bergamot {
BatchTranslator::BatchTranslator(DeviceId const device,
PCQueue<PCItem> &pcqueue,
std::vector<Ptr<Vocab const>> &vocabs,
Ptr<Options> options)
: device_(device), options_(options), pcqueue_(&pcqueue), vocabs_(&vocabs) {
: device_(device), options_(options), vocabs_(&vocabs) {}
#ifdef WITH_PTHREADS
thread_ = std::thread([this] { this->mainloop(); });
#else
this->initGraph();
#endif
}
void BatchTranslator::initGraph() {
void BatchTranslator::initialize() {
// Initializes the graph.
if (options_->hasAndNotEmpty("shortlist")) {
int srcIdx = 0, trgIdx = 1;
bool shared_vcb = vocabs_->front() == vocabs_->back();
@ -43,15 +36,14 @@ void BatchTranslator::initGraph() {
scorer->setShortlistGenerator(slgen_);
}
}
graph_->forward();
}
void BatchTranslator::translate(RequestSentences &requestSentences,
Histories &histories) {
void BatchTranslator::translate(Batch &batch) {
std::vector<data::SentenceTuple> batchVector;
for (auto &sentence : requestSentences) {
auto &sentences = batch.sentences();
for (auto &sentence : sentences) {
data::SentenceTuple sentence_tuple(sentence.lineNumber());
Segment segment = sentence.getUnderlyingSegment();
sentence_tuple.push_back(segment);
@ -94,45 +86,32 @@ void BatchTranslator::translate(RequestSentences &requestSentences,
for (size_t j = 0; j < maxDims.size(); ++j)
subBatches[j]->setWords(words[j]);
auto batch = Ptr<CorpusBatch>(new CorpusBatch(subBatches));
batch->setSentenceIds(sentenceIds);
auto corpus_batch = Ptr<CorpusBatch>(new CorpusBatch(subBatches));
corpus_batch->setSentenceIds(sentenceIds);
auto trgVocab = vocabs_->back();
auto search = New<BeamSearch>(options_, scorers_, trgVocab);
histories = std::move(search->search(graph_, batch));
auto histories = std::move(search->search(graph_, corpus_batch));
batch.completeBatch(histories);
}
void BatchTranslator::mainloop() {
#ifdef WITH_PTHREADS
initGraph();
#endif
PCItem pcitem;
void BatchTranslator::consumeFrom(PCQueue<Batch> &pcqueue) {
Batch batch;
Histories histories;
#ifdef WITH_PTHREADS
while (true) {
#endif
pcqueue_->ConsumeSwap(pcitem);
if (pcitem.isPoison()) {
pcqueue.ConsumeSwap(batch);
if (batch.isPoison()) {
return;
} else {
translate(pcitem.sentences, histories);
for (int i = 0; i < pcitem.sentences.size(); i++) {
pcitem.sentences[i].completeSentence(histories[i]);
}
translate(batch);
}
#ifdef WITH_PTHREADS
}
#endif
}
void BatchTranslator::join() {
#ifdef WITH_PTHREADS
thread_.join();
#endif
}
} // namespace bergamot
} // namespace marian

View File

@ -4,14 +4,18 @@
#include <string>
#include <vector>
#include "batch.h"
#include "common/utils.h"
#include "data/shortlist.h"
#include "definitions.h"
#include "pcqueue.h"
#include "request.h"
#include "translator/history.h"
#include "translator/scorers.h"
#ifdef WITH_PTHREADS
#include "pcqueue.h"
#endif
namespace marian {
namespace bergamot {
@ -22,37 +26,27 @@ class BatchTranslator {
// shut down in Service which calls join() on the threads.
public:
BatchTranslator(DeviceId const device, PCQueue<PCItem> &pcqueue,
std::vector<Ptr<Vocab const>> &vocabs, Ptr<Options> options);
void join();
BatchTranslator(DeviceId const device, std::vector<Ptr<Vocab const>> &vocabs,
Ptr<Options> options);
// convenience function for logging. TODO(jerin)
std::string _identifier() { return "worker" + std::to_string(device_.no); }
void translate(Batch &batch);
void initialize();
#ifndef WITH_PTHREADS
void mainloop();
#ifdef WITH_PTHREADS
void consumeFrom(PCQueue<Batch> &pcqueue);
#endif
private:
void initGraph();
void translate(RequestSentences &requestSentences, Histories &histories);
#ifdef WITH_PTHREADS
void mainloop();
#endif
Ptr<Options> options_;
DeviceId device_;
std::vector<Ptr<Vocab const>> *vocabs_;
Ptr<ExpressionGraph> graph_;
std::vector<Ptr<Scorer>> scorers_;
Ptr<data::ShortlistGenerator const> slgen_;
PCQueue<PCItem> *pcqueue_;
#ifdef WITH_PTHREADS
std::thread thread_;
#endif
};
} // namespace bergamot
} // namespace marian

View File

@ -1,55 +1,70 @@
#include "batcher.h"
#include "batch.h"
#include "common/logging.h"
#include "sanelogging.h"
#include <cassert>
namespace marian {
namespace bergamot {
Batcher::Batcher(Ptr<Options> options) {
max_input_tokens_ = options->get<int>("max-input-tokens");
bucket_.resize(options->get<int>("max-input-sentence-tokens") + 1);
ABORT_IF(
max_input_tokens_ < bucket_.size() - 1,
"max-input-tokens cannot be less than than max-input-sentence-tokens, "
"batcher fail");
miniBatchWords = options->get<int>("mini-batch-words");
bucket_.resize(options->get<int>("max-length-break") + 1);
ABORT_IF(bucket_.size() - 1 > miniBatchWords,
"Fatal: max-length-break > mini-batch-words will lead to sentences "
"longer than what can fit in a batch.");
}
void Batcher::addSentenceWithPriority(RequestSentence &sentence) {
int bucket_id = sentence.numTokens();
size_t bucket_id = sentence.numTokens();
assert(bucket_id < bucket_.size());
bucket_[bucket_id].insert(sentence);
}
void Batcher::cleaveBatch(RequestSentences &sentences) {
bool Batcher::operator>>(Batch &batch) { return cleaveBatch(batch); }
bool Batcher::cleaveBatch(Batch &batch) {
// For now simply iterates on buckets and converts batches greedily. This
// has to be enhanced with optimizing over priority. The baseline
// implementation should at least be as fast as marian's maxi-batch with full
// corpus size as maxi-batch size.
batch.clear();
size_t paddedBatchSize = 0;
int segments_added = 0;
int current_input_tokens = 0;
int padded_batch_size = 0;
int prev_padded_batch_size;
for (int i = 0; i < bucket_.size(); i++) {
auto p = bucket_[i].begin();
while (p != bucket_[i].end()) {
padded_batch_size = (segments_added + 1) * i;
if (padded_batch_size <= max_input_tokens_) {
auto q = p;
++p;
current_input_tokens += i;
sentences.push_back(*q);
++segments_added;
bucket_[i].erase(q);
prev_padded_batch_size = padded_batch_size;
for (size_t length = 0; length < bucket_.size(); length++) {
auto p = bucket_[length].begin();
while (p != bucket_[length].end()) {
paddedBatchSize = (batch.size() + 1) * length;
if (paddedBatchSize <= miniBatchWords) {
auto q = p++;
batch.add(*q);
bucket_[length].erase(q);
} else {
return;
// Check if elements exist
assert(batch.size() > 0);
return true;
}
}
}
bool isValidBatch = batch.size() > 0;
return isValidBatch;
}
void Batcher::addWholeRequest(Ptr<Request> request) {
for (size_t i = 0; i < request->numSegments(); i++) {
RequestSentence requestSentence(i, request);
addSentenceWithPriority(requestSentence);
}
}
#ifdef WITH_PTHREADS
void Batcher::produceTo(PCQueue<Batch> &pcqueue) {
Batch batch;
while (cleaveBatch(batch)) {
pcqueue.ProduceSwap(batch);
}
}
#endif
} // namespace bergamot
} // namespace marian

View File

@ -1,11 +1,16 @@
#ifndef SRC_BERGAMOT_BATCHER_H_
#define SRC_BERGAMOT_BATCHER_H_
#include "batch.h"
#include "common/options.h"
#include "data/corpus_base.h"
#include "definitions.h"
#include "request.h"
#ifdef WITH_PTHREADS
#include "pcqueue.h"
#endif
#include <set>
#include <vector>
@ -19,14 +24,20 @@ public:
// sentence. This method inserts the sentence into the internal data-structure
// which maintains priority among sentences from multiple concurrent requests.
void addSentenceWithPriority(RequestSentence &sentence);
void addWholeRequest(Ptr<Request> request);
#ifdef WITH_PTHREADS
void produceTo(PCQueue<Batch> &pcqueue);
#endif
// Loads sentences with sentences compiled from (tentatively) multiple
// requests optimizing for both padding and priority.
void cleaveBatch(RequestSentences &sentences);
bool cleaveBatch(Batch &batch);
bool operator>>(Batch &batch); // alias
private:
unsigned int max_input_tokens_;
size_t miniBatchWords;
std::vector<std::set<RequestSentence>> bucket_;
size_t batchNumber_{0};
};
} // namespace bergamot

View File

@ -5,7 +5,8 @@
namespace marian {
namespace bergamot {
marian::ConfigParser createConfigParser() {
inline marian::ConfigParser createConfigParser() {
marian::ConfigParser cp(marian::cli::mode::translation);
cp.addOption<std::string>(
"--ssplit-prefix-file", "Bergamot Options",
@ -15,14 +16,9 @@ marian::ConfigParser createConfigParser() {
"[paragraph, sentence, wrapped_text]", "paragraph");
cp.addOption<int>(
"--max-input-sentence-tokens", "Bergamot Options",
"--max-length-break", "Bergamot Options",
"Maximum input tokens to be processed in a single sentence.", 128);
cp.addOption<int>("--max-input-tokens", "Bergamot Options",
"Maximum input tokens in a batch. control for"
"Bergamot Queue",
1024);
return cp;
}

View File

@ -9,7 +9,6 @@
#include <memory>
#include <mutex>
#ifdef WITH_PTHREADS
#ifdef __APPLE__
#include <mach/mach.h>
#include <mach/mach_traps.h>
@ -20,7 +19,6 @@
#else
#include <boost/interprocess/sync/interprocess_semaphore.hpp>
#endif
#endif // WITH_PTHREADS
#if __GNUC__ >= 3
#define UTIL_UNLIKELY(x) __builtin_expect(!!(x), 0)
@ -31,7 +29,6 @@
namespace marian {
namespace bergamot {
#ifdef WITH_PTHREADS
/* OS X Maverick and Boost interprocess were doing "Function not implemented."
* So this is my own wrapper around the mach kernel APIs.
*/
@ -117,20 +114,6 @@ inline void WaitSemaphore(Semaphore &on) {
}
#endif // Apple
#else // WITH_PTHREADS
// A dummy Semaphore class that does nothing
class Semaphore {
public:
explicit Semaphore(unsigned int value) : count(value) {}
~Semaphore() {}
void wait() {}
void post() {}
private:
unsigned int count;
};
inline void WaitSemaphore(Semaphore &semaphore) { semaphore.wait(); }
#endif // WITH_PTHREADS
/**
* Producer consumer queue safe for multiple producers and multiple consumers.
@ -151,9 +134,7 @@ public:
void Produce(const T &val) {
WaitSemaphore(empty_);
{
#ifdef WITH_PTHREADS
std::lock_guard<std::mutex> produce_lock(produce_at_mutex_);
#endif
try {
*produce_at_ = val;
} catch (...) {
@ -170,9 +151,7 @@ public:
void ProduceSwap(T &val) {
WaitSemaphore(empty_);
{
#ifdef WITH_PTHREADS
std::lock_guard<std::mutex> produce_lock(produce_at_mutex_);
#endif
try {
std::swap(*produce_at_, val);
} catch (...) {
@ -189,9 +168,7 @@ public:
T &Consume(T &out) {
WaitSemaphore(used_);
{
#ifdef WITH_PTHREADS
std::lock_guard<std::mutex> consume_lock(consume_at_mutex_);
#endif
try {
out = *consume_at_;
} catch (...) {
@ -209,9 +186,7 @@ public:
T &ConsumeSwap(T &out) {
WaitSemaphore(used_);
{
#ifdef WITH_PTHREADS
std::lock_guard<std::mutex> consume_lock(consume_at_mutex_);
#endif
try {
std::swap(out, *consume_at_);
} catch (...) {
@ -245,15 +220,11 @@ private:
// Index for next write in storage_.
T *produce_at_;
#ifdef WITH_PTHREADS
std::mutex produce_at_mutex_;
#endif
// Index for next read from storage_.
T *consume_at_;
#ifdef WITH_PTHREADS
std::mutex consume_at_mutex_;
#endif
};
template <class T> struct UnboundedPage {

View File

@ -1,7 +1,7 @@
#include "request.h"
#include "definitions.h"
#include "translation_result.h"
#include "response.h"
#include "sentence_ranges.h"
#include "common/logging.h"
@ -10,15 +10,15 @@
namespace marian {
namespace bergamot {
Request::Request(unsigned int Id, int lineNumberBegin,
// -----------------------------------------------------------------
Request::Request(size_t Id, size_t lineNumberBegin,
std::vector<Ptr<Vocab const>> &vocabs, std::string &&source,
Segments &&segments,
std::vector<TokenRanges> &&sourceAlignments,
std::promise<TranslationResult> translationResultPromise)
Segments &&segments, SentenceRanges &&sourceRanges,
std::promise<Response> responsePromise)
: Id_(Id), lineNumberBegin_(lineNumberBegin), vocabs_(&vocabs),
source_(std::move(source)), segments_(std::move(segments)),
sourceAlignments_(std::move(sourceAlignments)),
response_(std::move(translationResultPromise)) {
sourceRanges_(std::move(sourceRanges)),
response_(std::move(responsePromise)) {
counter_ = segments_.size();
histories_.resize(segments_.size(), nullptr);
@ -47,11 +47,10 @@ void Request::processHistory(size_t index, Ptr<History> history) {
void Request::completeRequest() {
// Request no longer needs to hold the content, can transfer it to
// TranslationResult.
TranslationResult translation_result(std::move(source_),
std::move(sourceAlignments_),
std::move(histories_), *vocabs_);
response_.set_value(std::move(translation_result));
// Response.
Response response(std::move(source_), std::move(sourceRanges_),
std::move(histories_), *vocabs_);
response_.set_value(std::move(response));
}
bool Request::operator<(const Request &b) const {
@ -59,6 +58,8 @@ bool Request::operator<(const Request &b) const {
return Id_ < b.Id_;
}
// ------------------------------------------------------------------
RequestSentence::RequestSentence(size_t index, Ptr<Request> request)
: index_(index), request_(request) {}
@ -88,5 +89,7 @@ bool operator<(const RequestSentence &a, const RequestSentence &b) {
return a.request_ < b.request_;
}
// ----------------------------------------------------------------------
} // namespace bergamot
} // namespace marian

View File

@ -3,30 +3,30 @@
//
// Request: holds the input blob of a text, Segments (vector<Words>) which are
// to go to the batching mechanism and alignments between the processed
// segments and the input blob (sourceAlignments). In addition, Request takes
// segments and the input blob (sourceTokenRanges). In addition, Request takes
// care of the barrier which fires when all the Segments in a request are done
// translating by the workers (BatchTranslator). Request is to be extended with
// notions of Priority (sequence, user-given).
// translating by the workers (BatchTranslator).
// TODO(jerinphilip): Extend Request with notions of Priority (sequence,
// user-given).
//
// RequestSentence: is a tuple of (index, Request*). This provides the
// RequestSentence: is a tuple of (index, Ptr<Request>). This provides the
// batching mechanism access to the segment within the request. The backref to
// Request allows event triggering the barrier upon completion of the last
// sentence by a worker.
//
// PCItem: is a vector of RequestSentences and a batchNumber, which is what the
// PCQueue holds. The batches are constructed from segments returned by a
// RequestSentence. Can be enhanced with paddingSize, countTokens eventually for
// logging.
#ifndef SRC_BERGAMOT_REQUEST_H_
#define SRC_BERGAMOT_REQUEST_H_
#include "definitions.h"
#include "translation_result.h"
#include "response.h"
#include "sentence_ranges.h"
#include "common/logging.h"
#include "data/types.h"
#include "translator/beam_search.h"
#include <cassert>
#include <future>
#include <vector>
@ -34,24 +34,11 @@ namespace marian {
namespace bergamot {
class Request {
private:
unsigned int Id_;
int lineNumberBegin_;
std::string source_;
std::atomic<int> counter_;
std::vector<Ptr<Vocab const>> *vocabs_;
Segments segments_;
std::vector<TokenRanges> sourceAlignments_;
std::vector<Ptr<History>> histories_;
std::promise<TranslationResult> response_;
public:
Request(unsigned int Id, int lineNumberBegin,
Request(size_t Id, size_t lineNumberBegin,
std::vector<Ptr<Vocab const>> &vocabs_, std::string &&source,
Segments &&segments, std::vector<TokenRanges> &&sourceAlignments,
std::promise<TranslationResult> translationResultPromise);
Segments &&segments, SentenceRanges &&sourceTokenRanges,
std::promise<Response> responsePromise);
// Obtain the count of tokens in the segment correponding to index. Used to
// insert sentence from multiple requests into the corresponding size bucket.
@ -65,7 +52,8 @@ public:
// several requests.
Segment getSegment(size_t index) const;
// For notions of priority among requests (used to enable <set> in Batcher).
// For notions of priority among requests, used to enable std::set in
// Batcher.
bool operator<(const Request &request) const;
// Processes a history obtained after translating in a heterogenous batch
@ -74,40 +62,64 @@ public:
// On completion of last segment, sets value of the promise.
void completeRequest();
private:
size_t Id_;
size_t lineNumberBegin_;
// Multiple translation-workers can concurrently access the same Request. The
// following atomic atomically operates on the variable holding sentences
// remaining to be translated.
std::atomic<int> counter_;
// source_ holds the source string to be translated. segments_ hold the
// sentences generated from source_ in vector<Words>. sourceRanges_ are
// string_views of the text corresponding to these words, pointing to
// sequences in source_. histories_ is a buffer which eventually stores the
// translations of each segment in the corresponding index.
std::string source_;
Segments segments_;
SentenceRanges sourceRanges_;
std::vector<Ptr<History>> histories_;
// Members above are moved into newly constructed Response on completion
// of translation of all segments. The promise below is set to this Response
// value. future to this promise is made available to the user through
// Service.
std::promise<Response> response_;
// Constructing Response requires the vocabs_ used to generate Request.
std::vector<Ptr<Vocab const>> *vocabs_;
};
// A RequestSentence provides a view to a single sentence (segment) held
// inside a Request, so that sentences from many requests can be batched
// together while their data stays owned by the Request.
// NOTE(review): index_/request_ are declared twice below (before and after
// the public section); this appears to be merged-diff residue — confirm
// against the repository, as duplicate members will not compile.
class RequestSentence {

private:
  size_t index_;
  Ptr<Request> request_;

  // A RequestSentence provides a view to a sentence within a Request. Existence
  // of this class allows the sentences and associated information to be kept
  // within Request.

public:
  RequestSentence(size_t, Ptr<Request>);

  // Number of tokens in the sentence this object views.
  size_t numTokens() const;

  // lineNumber in Request, used for matching marian-decoder. SentenceTuple
  // requires lineNumber to be set for Corpus based batches.
  size_t lineNumber() const;

  // Accessor to the segment represented by the RequestSentence.
  Segment getUnderlyingSegment() const;

  // Forwards call to Request, checking for completion.
  void completeSentence(Ptr<History> history);

  // Ordering delegates to the underlying Request's priority.
  friend bool operator<(const RequestSentence &a, const RequestSentence &b);

private:
  size_t index_;
  Ptr<Request> request_;
};
typedef std::vector<RequestSentence> RequestSentences;
struct PCItem {
  // Batch index assigned by the producer; -1 marks the poison pill that
  // tells a consumer thread to shut down.
  int batchNumber;
  // The sentences making up this batch.
  RequestSentences sentences;

  // PCItem must be default constructible for PCQueue. The default-constructed
  // element doubles as poison.
  PCItem() : batchNumber(-1) {}

  // Constructs a legitimate (non-poison) PCItem, taking ownership of the
  // sentence batch.
  explicit PCItem(int batchNumber, RequestSentences &&sentences)
      : batchNumber(batchNumber), sentences(std::move(sentences)) {}

  // Convenience function to determine poison. Marked const so it can be
  // queried through const references; does not mutate the item.
  bool isPoison() const { return (batchNumber == -1); }
};
} // namespace bergamot
} // namespace marian

View File

@ -0,0 +1,98 @@
#include "response.h"
#include "sentence_ranges.h"
#include "common/logging.h"
#include "data/alignment.h"
#include <utility>
namespace marian {
namespace bergamot {
// Takes ownership of the source text, its per-sentence ranges and the
// translation histories. Stores a pointer to vocabs (not owned) so the
// translation can be decoded lazily later.
Response::Response(std::string &&source, SentenceRanges &&sourceRanges,
                   Histories &&histories, std::vector<Ptr<Vocab const>> &vocabs)
    : source_(std::move(source)), sourceRanges_(std::move(sourceRanges)),
      histories_(std::move(histories)), vocabs_(&vocabs) {}
// Transfers ownership of the source text and the (lazily built) translation
// to the caller, and fills sentenceMappings with per-sentence string_view
// pairs pointing into those two strings. Invalidates this Response.
void Response::move(std::string &source, std::string &translation,
                    SentenceMappings &sentenceMappings) {
  // Build translation_ and the mappings before the strings are moved out;
  // the string_views in sentenceMappings reference the strings' heap
  // buffers, which survive the moves below.
  constructTranslation();
  constructSentenceMappings(sentenceMappings);
  // Move content out.
  source = std::move(source_);
  translation = std::move(translation_);
  // The string_views held in sourceRanges_/targetRanges_ pointed into the
  // strings just moved out; clear them (and histories_) so stale views
  // cannot be dereferenced through this object.
  sourceRanges_.clear();
  targetRanges_.clear();
  histories_.clear();
}
// Lazily builds translation_ (all decoded sentences joined by single spaces)
// and targetRanges_ (a string_view per sentence into translation_). Runs at
// most once; subsequent calls are no-ops.
void Response::constructTranslation() {
  if (translationConstructed_) {
    return;
  }

  // Reserving length at least as much as source_ seems like a reasonable thing
  // to do to avoid reallocations.
  translation_.reserve(source_.size());

  // Step 1: decode each history and append it to translation_, recording
  // (offset, size) index pairs per sentence. Indices rather than
  // string_views are stored here because the growing string may still
  // reallocate its storage.
  std::vector<std::pair<size_t, size_t>> translationRanges;
  // The target vocabulary is the last entry; loop-invariant, hoisted out.
  auto targetVocab = vocabs_->back();
  size_t offset{0};
  bool first{true};
  for (auto &history : histories_) {
    // TODO(jerin): Change hardcode of nBest = 1
    NBestList onebest = history->nBest(1);
    Result result = onebest[0]; // Expecting only one result;
    Words words = std::get<0>(result);
    std::string decoded = targetVocab->decode(words);
    if (first) {
      first = false;
    } else {
      // Sentences are joined with a single space; account for it in offset.
      translation_ += " ";
      ++offset;
    }
    translation_ += decoded;
    translationRanges.emplace_back(offset, decoded.size());
    offset += decoded.size();
  }

  // Step 2: translation_ no longer grows, so its storage is stable and the
  // stored indices can be materialized as string_views.
  for (auto &range : translationRanges) {
    // TODO(@jerinphilip): Currently considers target tokens as whole text.
    // Needs to be further enhanced in marian-dev to extract alignments.
    std::vector<string_view> targetMappings;
    const char *begin = &translation_[range.first];
    targetMappings.emplace_back(begin, range.second);
    targetRanges_.addSentence(targetMappings);
  }

  translationConstructed_ = true;
}
// Pairs the i-th source sentence with the i-th translated sentence and
// appends each (src, tgt) pair to sentenceMappings. Assumes targetRanges_
// has already been populated by constructTranslation().
void Response::constructSentenceMappings(
    Response::SentenceMappings &sentenceMappings) {
  const size_t sentenceCount = sourceRanges_.numSentences();
  for (size_t idx = 0; idx < sentenceCount; ++idx) {
    sentenceMappings.emplace_back(sourceRanges_.sentence(idx),
                                  targetRanges_.sentence(idx));
  }
}
} // namespace bergamot
} // namespace marian

99
src/translator/response.h Normal file
View File

@ -0,0 +1,99 @@
#ifndef SRC_BERGAMOT_RESPONSE_H_
#define SRC_BERGAMOT_RESPONSE_H_
#include "sentence_ranges.h"
#include "data/types.h"
#include "definitions.h"
#include "translator/beam_search.h"
#include <cassert>
#include <string>
#include <vector>
namespace marian {
namespace bergamot {
class Response {
  // Response is a marian internal class (not a bergamot-translator class)
  // holding source blob of text, vector of TokenRanges corresponding to each
  // sentence in the source text blob and histories obtained from translating
  // these sentences.
  //
  // This class provides an API at a higher level in comparison to History to
  // access translations and additionally use string_view manipulations to
  // recover structure in translation from source-text's structure known through
  // reference string and string_view. As many of these computations are not
  // required until invoked, they are computed as required and stored in data
  // members where it makes sense to do so (translation_, targetRanges_).
  //
  // Examples of such use-cases are:
  //    translation()
  //    translationInSourceStructure() TODO(@jerinphilip)
  //    alignment(idx) TODO(@jerinphilip)
  //    sentenceMappings (for bergamot-translator)

public:
  // vocabs is required for constructing translation lazily; only the last
  // (target) vocabulary is consulted for decoding.
  Response(std::string &&source, SentenceRanges &&sourceRanges,
           Histories &&histories,
           std::vector<Ptr<Vocab const>> &vocabs);

  // Move constructor. Fix: translationConstructed_ must travel along with
  // translation_/targetRanges_ — otherwise a Response moved after its
  // translation was built would rebuild and append the translation a second
  // time on the next translation() call. Members are initialized in
  // declaration order.
  Response(Response &&other)
      : source_(std::move(other.source_)),
        sourceRanges_(std::move(other.sourceRanges_)),
        histories_(std::move(other.histories_)),
        vocabs_(std::move(other.vocabs_)),
        translationConstructed_(other.translationConstructed_),
        translation_(std::move(other.translation_)),
        targetRanges_(std::move(other.targetRanges_)){};

  // Prevents CopyConstruction and CopyAssignment. sourceRanges_ is constituted
  // by string_view and copying invalidates the data member.
  Response(const Response &) = delete;
  Response &operator=(const Response &) = delete;

  typedef std::vector<std::pair<const string_view, const string_view>>
      SentenceMappings;

  // Moves source sentence into source, translated text into translation.
  // Pairs of string_views to corresponding sentences in
  // source and translation are loaded into sentenceMappings. These string_views
  // reference the new source and translation.
  //
  // Calling move() invalidates the Response object as ownership is transferred.
  void move(std::string &source, std::string &translation,
            SentenceMappings &sentenceMappings);

  const Histories &histories() const { return histories_; }
  const std::string &source() const { return source_; }

  // Lazily builds (once) and returns the full translated text.
  const std::string &translation() {
    constructTranslation();
    return translation_;
  }

  // A convenience function provided to return translated text placed within
  // source's structure. This is useful when the source text is a multi-line
  // paragraph or string_views extracted from structured text like HTML and it's
  // desirable to place the individual sentences in the locations of the source
  // sentences.
  // const std::string translationInSourceStructure();
  // const PendingAlignmentType alignment(size_t idx);

private:
  // Builds translation_ and targetRanges_ from histories_; idempotent.
  void constructTranslation();
  // Pairs source and target sentence views; requires constructTranslation().
  void constructSentenceMappings(SentenceMappings &);

  std::string source_;
  SentenceRanges sourceRanges_;
  Histories histories_;
  // Not owned; used only to decode histories into text.
  std::vector<Ptr<Vocab const>> *vocabs_;
  // True once translation_/targetRanges_ have been built.
  bool translationConstructed_{false};
  std::string translation_;
  SentenceRanges targetRanges_;
};
} // namespace bergamot
} // namespace marian
#endif // SRC_BERGAMOT_RESPONSE_H_

View File

@ -1,44 +0,0 @@
#ifndef SRC_BERGAMOT_SANELOGGING_H_
#define SRC_BERGAMOT_SANELOGGING_H_
#include "spdlog/spdlog.h"
#include <iostream>
namespace marian {
#define PLOG(worker, level, ...)
#define _PLOG(worker, level, ...) checkedPLog(worker, #level, __VA_ARGS__)
template <class... Args>
void checkedPLog(std::string logger, std::string level, Args... args) {
Logger log = spdlog::get(logger);
if (!log) {
try {
log = spdlog::daily_logger_st(logger, "logs/" + logger + ".log");
} catch (const spdlog::spdlog_ex &ex) {
std::cout << "Log initialization failed: " << ex.what() << std::endl;
}
}
if (level == "trace")
log->trace(args...);
else if (level == "debug")
log->debug(args...);
else if (level == "info")
log->info(args...);
else if (level == "warn")
log->warn(args...);
else if (level == "error")
log->error(args...);
else if (level == "critical")
log->critical(args...);
else {
log->warn("Unknown log level '{}' for logger '{}'", level, logger);
}
// Not required when threads clean-exit.
log->flush();
}
} // namespace marian
#endif // SRC_BERGAMOT_SANELOGGING_H_

View File

@ -0,0 +1,46 @@
#include "sentence_ranges.h"
#include <cassert>
#include <iostream>
namespace marian {
namespace bergamot {
void SentenceRanges::addSentence(std::vector<string_view> &wordRanges) {
addSentence(std::begin(wordRanges), std::end(wordRanges));
}
// Appends the word ranges [begin, end) as a new sentence. The sentence's
// start is recorded as an index (not an iterator/pointer) so it survives
// reallocation of the flat storage.
void SentenceRanges::addSentence(WordIterator begin, WordIterator end) {
  sentenceBeginIds_.push_back(flatByteRanges_.size());
  flatByteRanges_.insert(flatByteRanges_.end(), begin, end);
}
// Returns one string_view spanning the index-th sentence: from the first
// byte of its first word to the last byte of its last word.
string_view SentenceRanges::sentence(size_t index) const {
  const string_view firstWord = flatByteRanges_[sentenceBeginIds_[index]];
  string_view lastWord;
  if (index + 1 == numSentences()) {
    // Last sentence: its final word is the final word overall.
    lastWord = flatByteRanges_.back();
  } else {
    assert(index < numSentences());
    // Otherwise the final word sits just before the next sentence's start.
    const size_t lastWordId = sentenceBeginIds_[index + 1] - 1;
    lastWord = flatByteRanges_[lastWordId];
  }
  return sentenceBetween(firstWord, lastWord);
}
// Builds a single view covering everything from the start of firstWord to
// the end of lastWord; assumes both views point into the same buffer with
// firstWord not after lastWord.
string_view SentenceRanges::sentenceBetween(string_view firstWord,
                                            string_view lastWord) const {
  const char *begin = firstWord.data();
  const char *end = lastWord.data() + lastWord.size();
  return string_view(begin, end - begin);
}
} // namespace bergamot
} // namespace marian

View File

@ -0,0 +1,52 @@
#ifndef BERGAMOT_SENTENCE_RANGES_H_
#define BERGAMOT_SENTENCE_RANGES_H_
#include "data/types.h"
#include <cassert>
#include <vector>
namespace marian {
namespace bergamot {
class SentenceRanges {
  // SentenceRanges stores string_views into a source text, with additional
  // annotations to mark sentence boundaries.
  //
  // Given these annotations, this container provides the capability to
  // add sentences and to access individual sentences.

public:
  typedef std::vector<string_view>::iterator WordIterator;

  // Appends all word ranges in wordRanges as one sentence.
  void addSentence(std::vector<string_view> &wordRanges);
  // Appends the word ranges [begin, end) as one sentence.
  void addSentence(WordIterator begin, WordIterator end);

  // Drops all stored ranges and sentence boundaries.
  void clear() {
    flatByteRanges_.clear();
    sentenceBeginIds_.clear();
  }

  size_t numSentences() const { return sentenceBeginIds_.size(); }

  // Returns a string_view into the ith sentence.
  string_view sentence(size_t index) const;

private:
  // A flat storage for string_views. Can be words or sentences.
  std::vector<string_view> flatByteRanges_;

  // The container grows dynamically with addSentence. size_t marking index is
  // used to ensure the sentence boundaries stay same while underlying storage
  // might be changed during reallocation.
  std::vector<size_t> sentenceBeginIds_;

  // Utility function to extract the string starting at firstWord and ending at
  // lastWord as a single string-view.
  string_view sentenceBetween(string_view firstWord,
                              string_view lastWord) const;
};
} // namespace bergamot
} // namespace marian
#endif // BERGAMOT_SENTENCE_RANGES_H_

View File

@ -0,0 +1,53 @@
#include "sentence_splitter.h"
#include "common/cli_helper.h"
#include "common/logging.h"
#include "common/options.h"
#include <string>
namespace marian {
namespace bergamot {
// Configures the splitter from options: split mode (--ssplit-mode) and an
// optional list of protected prefixes (--ssplit-prefix-file), with
// environment variables in the path interpolated before loading.
SentenceSplitter::SentenceSplitter(marian::Ptr<marian::Options> options)
    : options_(options) {
  std::string smode_str = options_->get<std::string>("ssplit-mode", "");
  mode_ = string2splitmode(smode_str);
  std::string ssplit_prefix_file =
      options_->get<std::string>("ssplit-prefix-file", "");
  if (ssplit_prefix_file.size()) {
    ssplit_prefix_file = marian::cli::interpolateEnvVars(ssplit_prefix_file);
    LOG(info, "Loading protected prefixes for sentence splitting from {}",
        ssplit_prefix_file);
    ssplit_.load(ssplit_prefix_file);
  } else {
    // Missing prefix list degrades splitting quality but is not fatal.
    LOG(warn, "Missing list of protected prefixes for sentence splitting. "
              "Set with --ssplit-prefix-file.");
  }
}
// Creates a stream that yields one sentence at a time from the input view.
ug::ssplit::SentenceStream
SentenceSplitter::createSentenceStream(const string_view &input) {
  // Re-wrap marian::string_view as the std::string_view ssplit expects.
  std::string_view input_converted(input.data(), input.size());
  // Return the temporary directly: wrapping a prvalue in std::move disables
  // copy elision (-Wpessimizing-move) and never helps.
  return ug::ssplit::SentenceStream(input_converted, this->ssplit_, mode_);
}
// Maps a textual mode name to an ssplit splitmode. Unknown values fall back
// to wrapped_text with a warning.
ug::ssplit::SentenceStream::splitmode
SentenceSplitter::string2splitmode(const std::string &m) {
  using splitmode = ug::ssplit::SentenceStream::splitmode;
  // @TODO: throw Exception on error
  if (m == "sentence" || m == "Sentence") {
    return splitmode::one_sentence_per_line;
  }
  if (m == "paragraph" || m == "Paragraph") {
    return splitmode::one_paragraph_per_line;
  }
  const bool isWrapped =
      (m == "wrapped_text" || m == "WrappedText" || m == "wrappedText");
  if (!isWrapped) {
    LOG(warn, "Ignoring unknown text input format specification: {}.", m);
  }
  return splitmode::wrapped_text;
}
} // namespace bergamot
} // namespace marian

View File

@ -0,0 +1,31 @@
#ifndef SRC_BERGAMOT_SENTENCE_SPLITTER_H_
#define SRC_BERGAMOT_SENTENCE_SPLITTER_H_
#include "common/options.h"
#include "data/types.h"
#include "ssplit.h"
#include <string>
namespace marian {
namespace bergamot {
class SentenceSplitter {
  // A wrapper around @ugermann's ssplit-cpp compiled from several places in
  // mts. Constructed based on options. Used in TextProcessor below to create
  // sentence-streams, which provide access to one sentence from blob of text at
  // a time.

public:
  explicit SentenceSplitter(Ptr<Options> options);
  // Creates a stream yielding one sentence at a time from input.
  ug::ssplit::SentenceStream createSentenceStream(string_view const &input);

private:
  ug::ssplit::SentenceSplitter ssplit_;
  Ptr<Options> options_;
  // Split mode parsed from --ssplit-mode at construction.
  ug::ssplit::SentenceStream::splitmode mode_;
  // Parses the textual --ssplit-mode value; unknown values map to
  // wrapped_text.
  ug::ssplit::SentenceStream::splitmode string2splitmode(const std::string &m);
};
} // namespace bergamot
} // namespace marian
#endif // SRC_BERGAMOT_SENTENCE_SPLITTER_H_

View File

@ -1,6 +1,6 @@
#include "service.h"
#include "batch.h"
#include "definitions.h"
#include "sanelogging.h"
#include <string>
#include <utility>
@ -9,26 +9,53 @@ namespace marian {
namespace bergamot {
Service::Service(Ptr<Options> options)
: requestId_(0), batchNumber_(0),
numWorkers_(options->get<int>("cpu-threads")),
: requestId_(0), numWorkers_(options->get<int>("cpu-threads")),
vocabs_(std::move(loadVocabularies(options))),
text_processor_(vocabs_, options), batcher_(options),
pcqueue_(2 * options->get<int>("cpu-threads")) {
text_processor_(vocabs_, options), batcher_(options)
#ifdef WITH_PTHREADS
,
pcqueue_(2 * options->get<int>("cpu-threads"))
#endif // WITH_PTHREADS
{
workers_.reserve(numWorkers_);
if (numWorkers_ == 0) {
// In case workers are 0, a single-translator is created and initialized
// in the main thread.
marian::DeviceId deviceId(/*cpuId=*/0, DeviceType::cpu);
translators_.emplace_back(deviceId, vocabs_, options);
translators_.back().initialize();
} else {
#ifdef WITH_PTHREADS
// If workers specified are greater than 0, translators_ are populated with
// unitialized instances. These are then initialized inside
// individual threads and set to consume from producer-consumer queue.
workers_.reserve(numWorkers_);
translators_.reserve(numWorkers_);
for (size_t cpuId = 0; cpuId < numWorkers_; cpuId++) {
marian::DeviceId deviceId(cpuId, DeviceType::cpu);
translators_.emplace_back(deviceId, vocabs_, options);
for (int i = 0; i < numWorkers_; i++) {
marian::DeviceId deviceId(i, DeviceType::cpu);
workers_.emplace_back(deviceId, pcqueue_, vocabs_, options);
auto &translator = translators_.back();
workers_.emplace_back([&translator, this] {
translator.initialize();
translator.consumeFrom(pcqueue_);
});
}
#else // WITH_PTHREADS
ABORT(
"Fatal: Service started requesting multiple threadswhile compiled with "
"COMPILE_THREAD_VARIANT=off. Please check your cmake build "
"configuration");
#endif
}
}
std::future<TranslationResult> Service::translateWithCopy(std::string input) {
std::future<Response> Service::translateWithCopy(std::string input) {
return translate(std::move(input));
}
std::future<TranslationResult> Service::translate(std::string &&input) {
// Takes in a blob of text. Segments and std::vector<TokenRanges> are
std::future<Response> Service::translate(std::string &&input) {
// Takes in a blob of text. Segments and SentenceRanges are
// extracted from the input (blob of text) and used to construct a Request
// along with a promise. promise value is set by the worker completing a
// request.
@ -41,59 +68,46 @@ std::future<TranslationResult> Service::translate(std::string &&input) {
// returns future corresponding to the promise.
Segments segments;
std::vector<TokenRanges> sourceAlignments;
text_processor_.process(input, segments, sourceAlignments);
SentenceRanges sourceRanges;
text_processor_.process(input, segments, sourceRanges);
std::promise<TranslationResult> translationResultPromise;
auto future = translationResultPromise.get_future();
std::promise<Response> responsePromise;
auto future = responsePromise.get_future();
Ptr<Request> request = New<Request>(
requestId_++, /* lineNumberBegin = */ 0, vocabs_, std::move(input),
std::move(segments), std::move(sourceAlignments),
std::move(translationResultPromise));
std::move(segments), std::move(sourceRanges), std::move(responsePromise));
for (int i = 0; i < request->numSegments(); i++) {
RequestSentence requestSentence(i, request);
batcher_.addSentenceWithPriority(requestSentence);
batcher_.addWholeRequest(request);
if (numWorkers_ > 0) {
#ifdef WITH_PTHREADS
batcher_.produceTo(pcqueue_);
#endif
} else {
// Queue single-threaded
Batch batch;
while (batcher_ >> batch) {
translators_[0].translate(batch);
}
}
int numSentences;
do {
RequestSentences batchSentences;
batcher_.cleaveBatch(batchSentences);
numSentences = batchSentences.size();
if (numSentences > 0) {
PCItem pcitem(batchNumber_++, std::move(batchSentences));
pcqueue_.ProduceSwap(pcitem);
}
if (batchNumber_ % 500 == 0) {
LOG(info, "Queuing batch {}", batchNumber_);
}
} while (numSentences > 0);
#ifndef WITH_PTHREADS
workers_[0].mainloop();
#endif
return future;
}
void Service::stop() {
int counter = 0;
#ifdef WITH_PTHREADS
for (auto &worker : workers_) {
PCItem pcitem;
pcqueue_.ProduceSwap(pcitem);
++counter;
Batch poison = Batch::poison();
pcqueue_.ProduceSwap(poison);
}
counter = 0;
for (auto &worker : workers_) {
worker.join();
++counter;
}
workers_.clear(); // Takes care of idempotency.
#endif
}
Service::~Service() { stop(); }

View File

@ -3,15 +3,18 @@
#include "batch_translator.h"
#include "batcher.h"
#include "pcqueue.h"
#include "textops.h"
#include "translation_result.h"
#include "response.h"
#include "text_processor.h"
#include <queue>
#include <vector>
#include "data/types.h"
#ifdef WITH_PTHREADS
#include "pcqueue.h"
#endif
namespace marian {
namespace bergamot {
@ -25,17 +28,17 @@ class Service {
// options = ...;
// service = Service(options);
// std::string input_blob = "Hello World";
// std::future<TranslationResult>
// std::future<Response>
// response = service.translate(std::move(input_blob));
// response.wait();
// TranslationResult result = response.get();
// Response result = response.get();
public:
explicit Service(Ptr<Options> options);
// Constructs new string copying, calls translate internally.
std::future<TranslationResult> translateWithCopy(std::string input);
std::future<TranslationResult> translate(std::string &&input);
std::future<Response> translateWithCopy(std::string input);
std::future<Response> translate(std::string &&input);
void stop();
@ -45,12 +48,11 @@ public:
~Service();
private:
unsigned int requestId_;
unsigned int batchNumber_;
int numWorkers_;
size_t requestId_;
size_t numWorkers_;
// vocabs are used to construct a Request, which later uses it to construct
// TranslationResult (decode from words to string).
// Response (decode from words to string).
std::vector<Ptr<Vocab const>> vocabs_; // ORDER DEPENDENCY
// Consists of:
@ -68,8 +70,12 @@ private:
TextProcessor text_processor_; // ORDER DEPENDENCY
Batcher batcher_;
PCQueue<PCItem> pcqueue_;
std::vector<BatchTranslator> workers_;
std::vector<BatchTranslator> translators_;
#ifdef WITH_PTHREADS
PCQueue<Batch> pcqueue_;
std::vector<std::thread> workers_;
#endif
};
std::vector<Ptr<const Vocab>> loadVocabularies(Ptr<Options> options);

View File

@ -0,0 +1,69 @@
#include "text_processor.h"
#include "data/types.h"
#include "definitions.h"
#include "sentence_ranges.h"
#include "common/options.h"
#include "data/vocab.h"
#include <vector>
namespace marian {
namespace bergamot {
// Encodes one sentence into vocabulary ids (no EOS, inference mode), filling
// wordRanges with the byte-range of each produced token in the input.
Segment TextProcessor::tokenize(const string_view &segment,
                                std::vector<string_view> &wordRanges) {
  // The source vocabulary is the first entry.
  auto const &sourceVocab = vocabs_->front();
  return sourceVocab->encodeWithByteRanges(segment, wordRanges,
                                           /*addEOS=*/false,
                                           /*inference=*/true);
}
// Holds a pointer to the (externally owned) vocabularies and configures the
// sentence splitter; caches --max-length-break minus one token of headroom
// for the EOS appended in truncate().
TextProcessor::TextProcessor(std::vector<Ptr<Vocab const>> &vocabs,
                             Ptr<Options> options)
    : vocabs_(&vocabs), sentence_splitter_(options) {
  // Validate BEFORE the decrement: max_length_break_ is size_t, so the old
  // post-decrement check (max_length_break_ < 0) could never fire, and a
  // configured value of 0 would wrap around to SIZE_MAX.
  int maxLengthBreak = options->get<int>("max-length-break");
  ABORT_IF(maxLengthBreak < 1, "max-length-break cannot be < 1");
  max_length_break_ = maxLengthBreak - 1;
}
// Splits query into sentences, tokenizes each, and appends the resulting
// (possibly length-truncated) segments to segments with their source byte
// ranges recorded in sourceRanges.
void TextProcessor::process(const string_view &query, Segments &segments,
                            SentenceRanges &sourceRanges) {
  auto sentenceStream = sentence_splitter_.createSentenceStream(query);
  std::string_view sentencePiece;
  while (sentenceStream >> sentencePiece) {
    // Re-wrap as marian::string_view for the vocabulary API.
    marian::string_view sentence(sentencePiece.data(), sentencePiece.size());
    std::vector<string_view> wordRanges;
    Segment segment = tokenize(sentence, wordRanges);
    // SentencePiece/vocab can normalize a sentence down to zero tokens;
    // skip those so no empty entries are added.
    if (!segment.empty()) {
      truncate(segment, wordRanges, segments, sourceRanges);
    }
  }
}
// Splits segment into pieces of at most max_length_break_ tokens, appending
// EOS to each piece, and records each piece's word byte-ranges as one
// sentence in sourceRanges.
void TextProcessor::truncate(Segment &segment,
                             std::vector<string_view> &wordRanges,
                             Segments &segments, SentenceRanges &sourceRanges) {
  const size_t total = segment.size();
  for (size_t offset = 0; offset < total; offset += max_length_break_) {
    const size_t pieceSize = std::min(max_length_break_, total - offset);
    auto wordBegin = segment.begin() + offset;
    segments.emplace_back(wordBegin, wordBegin + pieceSize);
    // Each truncated piece is closed with the source EOS token.
    segments.back().push_back(sourceEosId());
    auto rangeBegin = wordRanges.begin() + offset;
    sourceRanges.addSentence(rangeBegin, rangeBegin + pieceSize);
  }
}
} // namespace bergamot
} // namespace marian

View File

@ -0,0 +1,50 @@
#ifndef SRC_BERGAMOT_TEXT_PROCESSOR_H_
#define SRC_BERGAMOT_TEXT_PROCESSOR_H_
#include "data/types.h"
#include "data/vocab.h"
#include "definitions.h"
#include "sentence_ranges.h"
#include "sentence_splitter.h"
#include <vector>
namespace marian {
namespace bergamot {
class TextProcessor {
  // TextProcessor handles loading the sentencepiece vocabulary and also
  // contains an instance of sentence-splitter based on ssplit.
  //
  // Used in Service to convert an incoming blob of text to a vector of
  // sentences (vector of words). In addition, the ByteRanges of the
  // source-tokens in unnormalized text are provided as string_views.
public:
  explicit TextProcessor(std::vector<Ptr<Vocab const>> &vocabs, Ptr<Options>);

  // Splits query into sentences, tokenizes each and appends the resulting
  // segments (with their source byte-ranges) to segments/sourceRanges.
  void process(const string_view &query, Segments &segments,
               SentenceRanges &sourceRanges);

private:
  // Tokenizes an input string, returns the corresponding Words. Loads the
  // corresponding byte-ranges into tokenRanges.
  Segment tokenize(const string_view &input,
                   std::vector<string_view> &tokenRanges);

  // Truncate sentence into max_input_size segments.
  void truncate(Segment &sentence, std::vector<string_view> &tokenRanges,
                Segments &segments, SentenceRanges &sourceRanges);

  // shorthand, used only in truncate()
  const Word sourceEosId() const { return vocabs_->front()->getEosId(); }

  // Not owned; first entry is the source vocabulary.
  std::vector<Ptr<Vocab const>> *vocabs_;
  SentenceSplitter sentence_splitter_;
  // Maximum tokens per segment minus one (headroom for the appended EOS).
  size_t max_length_break_;
};
} // namespace bergamot
} // namespace marian
#endif // SRC_BERGAMOT_TEXT_PROCESSOR_H_

View File

@ -1,109 +0,0 @@
#include "textops.h"
#include "common/timer.h"
#include <pcrecpp.h>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
namespace marian {
namespace bergamot {
// (Removed-file variant.) Configures the splitter from options: split mode
// (--ssplit-mode) and an optional protected-prefix list
// (--ssplit-prefix-file) with environment variables interpolated.
SentenceSplitter::SentenceSplitter(marian::Ptr<marian::Options> options)
    : options_(options) {
  std::string smode_str = options_->get<std::string>("ssplit-mode", "");
  mode_ = string2splitmode(smode_str);
  std::string ssplit_prefix_file =
      options_->get<std::string>("ssplit-prefix-file", "");
  if (ssplit_prefix_file.size()) {
    ssplit_prefix_file = marian::cli::interpolateEnvVars(ssplit_prefix_file);
    LOG(info, "Loading protected prefixes for sentence splitting from {}",
        ssplit_prefix_file);
    ssplit_.load(ssplit_prefix_file);
  } else {
    // Missing prefix list degrades splitting quality but is not fatal.
    LOG(warn, "Missing list of protected prefixes for sentence splitting. "
              "Set with --ssplit-prefix-file.");
  }
}
// Creates a stream that yields one sentence at a time from the input view.
ug::ssplit::SentenceStream
SentenceSplitter::createSentenceStream(const string_view &input) {
  // Wrap marian::string_view for ssplit's pcrecpp-based API.
  pcrecpp::StringPiece spiece(input.begin(), input.size());
  // Return the temporary directly: wrapping a prvalue in std::move disables
  // copy elision (-Wpessimizing-move) and never helps.
  return ug::ssplit::SentenceStream(spiece, this->ssplit_, mode_);
}
// Maps a textual mode name to an ssplit splitmode; unknown values fall back
// to wrapped_text after logging a warning.
ug::ssplit::SentenceStream::splitmode
SentenceSplitter::string2splitmode(const std::string &m) {
  typedef ug::ssplit::SentenceStream::splitmode splitmode;
  // @TODO: throw Exception on error
  if (m == "sentence" || m == "Sentence")
    return splitmode::one_sentence_per_line;
  if (m == "paragraph" || m == "Paragraph")
    return splitmode::one_paragraph_per_line;
  if (m != "wrapped_text" && m != "WrappedText" && m != "wrappedText") {
    LOG(warn, "Ignoring unknown text input format specification: {}.", m);
  }
  return splitmode::wrapped_text;
}
// Encodes one sentence into vocabulary ids (no EOS, inference mode), filling
// tokenRanges with each token's byte-range in the input.
Segment TextProcessor::tokenize(const string_view &segment,
                                TokenRanges &tokenRanges) {
  return vocabs_->front()->encodeWithByteRanges(
      segment, tokenRanges, /*addEOS=*/false, /*inference=*/true);
}
// Caches --max-input-sentence-tokens minus one (headroom for the EOS token
// appended in truncate()).
TextProcessor::TextProcessor(std::vector<Ptr<Vocab const>> &vocabs,
                             Ptr<Options> options)
    : vocabs_(&vocabs), sentence_splitter_(options) {
  max_input_sentence_tokens_ = options->get<int>("max-input-sentence-tokens");
  max_input_sentence_tokens_ = max_input_sentence_tokens_ - 1;
  // NOTE(review): max_input_sentence_tokens_ is unsigned (see the header), so
  // this < 0 check can never fire and a configured value of 0 wraps around —
  // validation should happen before the decrement.
  ABORT_IF(max_input_sentence_tokens_ < 0,
           "max-input-sentence-tokens cannot be < 0");
}
// Splits query into sentences, tokenizes each and appends the resulting
// (possibly truncated) segments plus their source token byte-ranges.
void TextProcessor::process(const string_view &query, Segments &segments,
                            std::vector<TokenRanges> &sourceRanges) {
  auto sentenceStream = sentence_splitter_.createSentenceStream(query);
  pcrecpp::StringPiece sentenceStringPiece;
  while (sentenceStream >> sentenceStringPiece) {
    // Re-wrap the pcrecpp piece as marian::string_view for the vocab API.
    string_view sentence(sentenceStringPiece.data(),
                         sentenceStringPiece.size());
    TokenRanges tokenRanges;
    Segment segment = tokenize(sentence, tokenRanges);
    // There are some cases where SentencePiece or vocab returns no words
    // after normalization. 0 prevents any empty entries from being added.
    if (segment.size() > 0) {
      // Truncate segment into max_input_size segments.
      truncate(segment, tokenRanges, segments, sourceRanges);
    }
  }
}
// Splits segment into pieces of at most max_input_sentence_tokens_ tokens,
// appending EOS to each piece and recording each piece's token byte-ranges.
void TextProcessor::truncate(Segment &segment, TokenRanges &tokenRanges,
                             Segments &segments,
                             std::vector<TokenRanges> &sourceRanges) {
  // NOTE(review): signed `offset` is compared against the unsigned
  // segment.size() — works for realistic sizes but triggers
  // -Wsign-compare; size_t would be cleaner.
  for (int offset = 0; offset < segment.size();
       offset += max_input_sentence_tokens_) {
    auto start = segment.begin() + offset;
    unsigned int left = segment.size() - offset;
    unsigned int diff = std::min(max_input_sentence_tokens_, left);
    segments.emplace_back(start, start + diff);
    // Each truncated piece is closed with the source EOS token.
    segments.back().push_back(sourceEosId());
    auto astart = tokenRanges.begin() + offset;
    sourceRanges.emplace_back(astart, astart + diff);
  }
}
} // namespace bergamot
} // namespace marian

View File

@ -1,71 +0,0 @@
#ifndef SRC_BERGAMOT_TEXTOPS_H_
#define SRC_BERGAMOT_TEXTOPS_H_
#include "common/definitions.h"
#include "common/logging.h"
#include "common/options.h"
#include "common/types.h" // missing in shortlist.h
#include "common/utils.h"
#include "data/sentencepiece_vocab.h"
#include "data/shortlist.h"
#include "definitions.h"
#include "ssplit.h"
#include <cassert>
#include <iostream>
#include <string>
#include <vector>
namespace marian {
namespace bergamot {
class SentenceSplitter {
  // A wrapper around @ugermann's ssplit-cpp compiled from several places in
  // mts. Constructed based on options. Used in TextProcessor below to create
  // sentence-streams, which provide access to one sentence from blob of text at
  // a time.
public:
  explicit SentenceSplitter(Ptr<Options> options);
  // Creates a stream yielding one sentence at a time from input.
  ug::ssplit::SentenceStream createSentenceStream(string_view const &input);

private:
  ug::ssplit::SentenceSplitter ssplit_;
  Ptr<Options> options_;
  // Split mode parsed from --ssplit-mode at construction.
  ug::ssplit::SentenceStream::splitmode mode_;
  // Parses the textual --ssplit-mode value; unknown values map to
  // wrapped_text.
  ug::ssplit::SentenceStream::splitmode string2splitmode(const std::string &m);
};
class TextProcessor {
  // TextProcessor handles loading the sentencepiece vocabulary and also
  // contains an instance of sentence-splitter based on ssplit.
  //
  // Used in Service to convert an incoming blob of text to a vector of
  // sentences (vector of words). In addition, the ByteRanges of the
  // source-tokens in unnormalized text are provided as string_views.
public:
  explicit TextProcessor(std::vector<Ptr<Vocab const>> &vocabs, Ptr<Options>);

  // Splits query into sentences, tokenizes each and appends the resulting
  // segments together with their source token byte-ranges.
  void process(const string_view &query, Segments &segments,
               std::vector<TokenRanges> &sourceRanges);

private:
  // Tokenizes an input string, returns the corresponding Words. Loads the
  // corresponding byte-ranges into tokenRanges.
  Segment tokenize(const string_view &input, TokenRanges &tokenRanges);

  // Truncate sentence into max_input_size segments.
  void truncate(Segment &sentence, TokenRanges &tokenRanges, Segments &segments,
                std::vector<TokenRanges> &sourceRanges);

  // shorthand, used only in truncate()
  const Word sourceEosId() const { return vocabs_->front()->getEosId(); }

  // Not owned; first entry is the source vocabulary.
  std::vector<Ptr<Vocab const>> *vocabs_;
  SentenceSplitter sentence_splitter_;
  // Maximum tokens per segment minus one (headroom for the appended EOS).
  unsigned int max_input_sentence_tokens_;
};
} // namespace bergamot
} // namespace marian
#endif // SRC_BERGAMOT_TEXTOPS_H_

View File

@ -1,72 +0,0 @@
#include "translation_result.h"
#include "common/logging.h"
#include "data/alignment.h"
#include <utility>
namespace marian {
namespace bergamot {
// Takes ownership of the source text, per-sentence token ranges and the
// translation histories; eagerly decodes all histories into translation_
// and builds sentence-level (source, target) string_view pairs.
TranslationResult::TranslationResult(std::string &&source,
                                     std::vector<TokenRanges> &&sourceRanges,
                                     Histories &&histories,
                                     std::vector<Ptr<Vocab const>> &vocabs)
    : source_(std::move(source)), sourceRanges_(std::move(sourceRanges)),
      histories_(std::move(histories)) {
  std::vector<string_view> sourceMappings;
  std::vector<string_view> targetMappings;

  // Compile each sentence's token ranges into a single per-sentence
  // string_view spanning first to last token.
  sourceMappings.reserve(sourceRanges_.size());
  for (int i = 0; i < sourceRanges_.size(); i++) {
    string_view first = sourceRanges_[i].front();
    string_view last = sourceRanges_[i].back();
    sourceMappings.emplace_back(first.data(), last.end() - first.begin());
  }

  // Compiles translations into a single std::string translation_
  // Current implementation uses += on std::string, multiple resizes.
  // Stores ByteRanges as indices first, followed by conversion into
  // string_views.
  // TODO(jerin): Add token level string_views here as well.
  std::vector<std::pair<int, int>> translationRanges;
  size_t offset{0};
  bool first{true};
  for (auto &history : histories_) {
    // TODO(jerin): Change hardcode of nBest = 1
    NBestList onebest = history->nBest(1);
    Result result = onebest[0]; // Expecting only one result;
    Words words = std::get<0>(result);
    std::string decoded = (vocabs.back())->decode(words);
    if (first) {
      first = false;
    } else {
      // Sentences are joined with a single space; account for it in offset.
      translation_ += " ";
      ++offset;
    }

    translation_ += decoded;
    translationRanges.emplace_back(offset, decoded.size());
    offset += decoded.size();
  }

  // Converting ByteRanges as indices into string_views.
  targetMappings.reserve(translationRanges.size());
  for (auto &range : translationRanges) {
    const char *begin = &translation_[range.first];
    targetMappings.emplace_back(begin, range.second);
  }

  // Pair each source sentence view with the corresponding target view.
  for (auto src = sourceMappings.begin(), tgt = targetMappings.begin();
       src != sourceMappings.end() && tgt != targetMappings.end();
       ++src, ++tgt) {
    sentenceMappings_.emplace_back(*src, *tgt);
    // NOTE(review): `t` is never used — dead statement left over from
    // debugging.
    auto &t = sentenceMappings_.back();
  }
}
} // namespace bergamot
} // namespace marian

View File

@ -1,76 +0,0 @@
#ifndef SRC_BERGAMOT_TRANSLATION_RESULT_H_
#define SRC_BERGAMOT_TRANSLATION_RESULT_H_
#include "data/types.h"
#include "definitions.h"
#include "translator/beam_search.h"
#include <cassert>
#include <string>
#include <vector>
namespace marian {
namespace bergamot {
// Holds the source text, the assembled translation, and sentence-level
// alignments between them. Move-only: the string_views in sentenceMappings_
// point into source_ and translation_, so a copy would leave dangling views.
class TranslationResult {
public:
// Takes ownership of the source text, its per-sentence token ranges and the
// decoder histories; vocabs are borrowed only to decode target-side tokens.
TranslationResult(std::string &&source,
std::vector<TokenRanges> &&sourceRanges,
Histories &&histories,
std::vector<Ptr<Vocab const>> &vocabs);
// Move constructor transfers the owned strings together with the mappings
// that view into them, keeping sentenceMappings_ valid after the move.
TranslationResult(TranslationResult &&other)
: source_(std::move(other.source_)),
translation_(std::move(other.translation_)),
sourceRanges_(std::move(other.sourceRanges_)),
sentenceMappings_(std::move(other.sentenceMappings_)),
histories_(std::move(other.histories_)){};
// Copying is disallowed; see class comment.
TranslationResult(const TranslationResult &) = delete;
TranslationResult &operator=(const TranslationResult &) = delete;
// Returns const references to source and translated texts, for external
// consumption.
const std::string &getOriginalText() const { return source_; }
const std::string &getTranslatedText() const { return translation_; }
// A mapping of string_views in the source_ and translation_ is provided as a
// pair for external consumption. Each entry corresponds
// to a (source-sentence, target-sentence).
typedef std::vector<std::pair<const string_view, const string_view>>
SentenceMappings;
const SentenceMappings &getSentenceMappings() const {
return sentenceMappings_;
}
// Return the Quality scores of the translated text.
// Not implemented currently, commenting out.
// const QualityScore &getQualityScore() const { return qualityScore; }
// For development use to benchmark with marian-decoder.
const Histories &getHistories() const { return histories_; }
// NOTE(review): these members are intentionally public (not private) to
// support move-semantics with consistent string_views in
// bergamot-translator; callers should still treat them as read-only.
std::string source_;
std::string translation_;
// Kept to complete the bergamot-translator spec; redundant while
// sourceMappings_ and targetMappings_ exist, or vice-versa.
SentenceMappings sentenceMappings_;
private:
// Histories are currently required for interoperability with OutputPrinter
// and OutputCollector and hence comparisons with marian-decoder.
// Future hook to gain alignments.
Histories histories_;
// Token-level string_views into source_, one vector per sentence.
std::vector<TokenRanges> sourceRanges_;
};
} // namespace bergamot
} // namespace marian
#endif // SRC_BERGAMOT_TRANSLATION_RESULT_H_

View File

@ -38,16 +38,11 @@
<div id="divtranslation">
<label for="from">From</label>
<textarea id="from" name="from">
Una estrategia republicana para obstaculizar la reelección de Obama
Los dirigentes republicanos justificaron su política por la necesidad de luchar contra el fraude electoral.
Ahora bien, el Centro Brennan considera esto último un mito y afirma que el fraude electoral es menos frecuente en los Estados Unidos que el número de personas que mueren a causa de la caída de un rayo.
De hecho, los abogados republicanos no han encontrado más que 300 casos de fraude electoral en los Estados Unidos en diez años.
Una cosa es cierta: esas nuevas disposiciones afectarán negativamente a la tasa de participación.
En ese sentido, estas medidas minarán en parte el sistema democrático americano.
Al contrario de lo que ocurre en Canadá, los estados americanos son responsables de la organización de las elecciones federales en los Estados Unidos.
Y en esa misma línea una mayoría de los gobiernos americanos promulgaron, a partir de 2009, nuevas leyes que dificultaban el proceso de inscripción o de votación.
Este fenómeno se ha extendido tras las elecciones de noviembre de 2010, que vieron el aumento de 675 nuevos representantes republicanos en 26 estados.
En consecuencia, durante el año 2011 se introdujeron 180 proyectos de ley que restringían el ejercicio del derecho de voto en 41 estados.
Una estrategia republicana para obstaculizar la reelección de Obama. Los dirigentes republicanos justificaron su política por la necesidad de luchar contra el fraude electoral.
Ahora bien, el Centro Brennan considera esto último un mito y afirma que el fraude electoral es menos frecuente en los Estados Unidos que el número de personas que mueren a causa de la caída de un rayo. De hecho, los abogados republicanos no han encontrado más que 300 casos de fraude electoral en los Estados Unidos en diez años. Una cosa es cierta: esas nuevas disposiciones afectarán negativamente a la tasa de participación.
En ese sentido, estas medidas minarán en parte el sistema democrático americano.
Al contrario de lo que ocurre en Canadá, los estados americanos son responsables de la organización de las elecciones federales en los Estados Unidos. Y en esa misma línea una mayoría de los gobiernos americanos promulgaron, a partir de 2009, nuevas leyes que dificultaban el proceso de inscripción o de votación.
Este fenómeno se ha extendido tras las elecciones de noviembre de 2010, que vieron el aumento de 675 nuevos representantes republicanos en 26 estados. En consecuencia, durante el año 2011 se introdujeron 180 proyectos de ley que restringían el ejercicio del derecho de voto en 41 estados.
</textarea>
<br><br>
<label for="to">To</label>
@ -81,12 +76,12 @@ vocabs:
beam-size: 1
normalize: 1.0
word-penalty: 0
max-input-sentence-tokens: 128
max-input-tokens: 1024
max-length-break: 128
mini-batch-words: 1024
workspace: 128
max-length-factor: 2.0
skip-cost: true
cpu-threads: 1
cpu-threads: 0
quiet: true
quiet-translation: true
shortlist:
@ -112,19 +107,19 @@ maxi-batch-sort: src
model = new Module.TranslationModel(modelConfig);
}
const translate = (sentences) => {
const translate = (paragraphs) => {
// Instantiate the arguments of translate() API i.e. TranslationRequest and input (vector<string>)
var request = new Module.TranslationRequest();
let input = new Module.VectorString;
// Initialize the input
sentences.forEach(sentence => {
// prevent empty sentences - it breaks the translation
if (sentence.trim() === "") {
paragraphs.forEach(paragraph => {
// prevent empty paragraph - it breaks the translation
if (paragraph.trim() === "") {
return;
}
input.push_back(sentence.trim())
input.push_back(paragraph.trim())
})
// Access input (just for debugging)
console.log('Input size=', input.size());
@ -138,14 +133,14 @@ maxi-batch-sort: src
let result = model.translate(input, request);
// Access original and translated text from each entry of vector<TranslationResult>
//console.log('Result size=', result.size(), ' - TimeDiff - ', (Date.now() - start)/1000);
const translatedSentences = [];
const translatedParagraphs = [];
for (let i = 0; i < result.size(); i++) {
translatedSentences.push(result.get(i).getTranslatedText());
translatedParagraphs.push(result.get(i).getTranslatedText());
}
console.log({ translatedSentences });
console.log({ translatedParagraphs });
request.delete();
input.delete();
return translatedSentences;
return translatedParagraphs;
}
document.querySelector("#load").addEventListener("click", () => {
@ -160,17 +155,17 @@ maxi-batch-sort: src
const translateCall = () => {
const text = document.querySelector('#from').value;
const sentences = text.split("\n");
const paragraphs = text.split("\n");
let wordCount = 0;
sentences.forEach(sentence => {
paragraphs.forEach(sentence => {
wordCount += sentence.trim().split(" ").filter(word => word.trim() !== "").length;
})
const start = Date.now();
const translatedSentences = translate(sentences);
const translatedParagraphs = translate(paragraphs);
const secs = (Date.now() - start) / 1000;
log(`Translation of ${translatedSentences.length} sentences (wordCount ${wordCount}) took ${secs} secs (${Math.round(wordCount / secs)} words per second)`);
log(`Translation of (${wordCount}) words took ${secs} secs (${Math.round(wordCount / secs)} words per second)`);
document.querySelector('#to').value = translatedSentences.join("\n");
document.querySelector('#to').value = translatedParagraphs.join("\n");
}
document.querySelector("#translate").addEventListener("click", () => {