mirror of
https://github.com/browsermt/bergamot-translator.git
synced 2024-08-15 08:30:46 +03:00
WASM Bindings collapse (#87)
* Safe transfer of bindings through typedefs * Removing Translation* files and bringing in counterparts * Remove previously commented out code * Removing commented out include * Absorb Translation* documentation Co-authored-by: abhi-agg <66322306+abhi-agg@users.noreply.github.com>
This commit is contained in:
parent
4908e4019e
commit
36b3c7291a
@ -7,9 +7,9 @@
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "TranslationModel.h"
|
||||
#include "translator/parser.h"
|
||||
#include "translator/byte_array_util.h"
|
||||
#include "translator/parser.h"
|
||||
#include "translator/service.h"
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
|
||||
@ -20,19 +20,17 @@ int main(int argc, char **argv) {
|
||||
std::string config = options->asYamlString();
|
||||
|
||||
// Route the config string to construct marian model through TranslationModel
|
||||
TranslationModel model(config, marian::bergamot::getModelMemoryFromConfig(options));
|
||||
marian::bergamot::Service model(
|
||||
config, marian::bergamot::getModelMemoryFromConfig(options));
|
||||
|
||||
TranslationRequest translationRequest;
|
||||
std::vector<std::string> texts;
|
||||
|
||||
for (std::string line; std::getline(std::cin, line);) {
|
||||
texts.emplace_back(line);
|
||||
texts.emplace_back(line);
|
||||
}
|
||||
|
||||
auto results = model.translate(std::move(texts), translationRequest);
|
||||
|
||||
// Resolve the future and get the actual result
|
||||
//std::vector<TranslationResult> results = futureResults.get();
|
||||
auto results = model.translateMultiple(std::move(texts), translationRequest);
|
||||
|
||||
for (auto &result : results) {
|
||||
std::cout << result.getTranslatedText() << std::endl;
|
||||
|
@ -1,16 +1,17 @@
|
||||
/*
|
||||
* main.cpp
|
||||
*
|
||||
* An application which accepts line separated texts in stdin and returns translated ones in stdout.
|
||||
* It is convenient for batch processing and can be used with tools like SacreBLEU.
|
||||
* An application which accepts line separated texts in stdin and returns
|
||||
* translated ones in stdout. It is convenient for batch processing and can be
|
||||
* used with tools like SacreBLEU.
|
||||
*
|
||||
*/
|
||||
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
|
||||
#include "TranslationModel.h"
|
||||
#include "translator/parser.h"
|
||||
#include "translator/service.h"
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
|
||||
@ -21,19 +22,16 @@ int main(int argc, char **argv) {
|
||||
std::string config = options->asYamlString();
|
||||
|
||||
// Route the config string to construct marian model through TranslationModel
|
||||
TranslationModel model(config);
|
||||
marian::bergamot::Service model(config);
|
||||
|
||||
TranslationRequest translationRequest;
|
||||
std::vector<std::string> texts;
|
||||
|
||||
for (std::string line; std::getline(std::cin, line);) {
|
||||
texts.emplace_back(line);
|
||||
texts.emplace_back(line);
|
||||
}
|
||||
|
||||
auto results = model.translate(std::move(texts), translationRequest);
|
||||
|
||||
// Resolve the future and get the actual result
|
||||
//std::vector<TranslationResult> results = futureResults.get();
|
||||
auto results = model.translateMultiple(std::move(texts), translationRequest);
|
||||
|
||||
for (auto &result : results) {
|
||||
std::cout << result.getTranslatedText() << std::endl;
|
||||
|
@ -1,80 +0,0 @@
|
||||
/*
|
||||
* TranslationModel.h
|
||||
*
|
||||
* Main interface for translation API.
|
||||
*/
|
||||
|
||||
#ifndef SRC_TRANSLATOR_TRANSLATIONMODEL_H_
|
||||
#define SRC_TRANSLATOR_TRANSLATIONMODEL_H_
|
||||
|
||||
#include <future>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
// All 3rd party includes
|
||||
#include "3rd_party/marian-dev/src/common/options.h"
|
||||
|
||||
// All local project includes
|
||||
#include "TranslationRequest.h"
|
||||
#include "TranslationResult.h"
|
||||
#include "translator/definitions.h"
|
||||
#include "translator/service.h"
|
||||
|
||||
/* A Translation model that translates a plain (without any markups and emojis)
|
||||
* UTF-8 encoded text. This implementation supports translation from 1 source
|
||||
* language to 1 target language.
|
||||
*/
|
||||
class TranslationModel {
|
||||
public:
|
||||
/* Construct the model using the model configuration options as yaml-formatted
|
||||
* string
|
||||
*/
|
||||
/**
|
||||
* @param config Marian yml config file in the form of a string
|
||||
* @param model_memory optional byte array (aligned to 64!!!) that contains
|
||||
* the bytes of a model.bin.
|
||||
*/
|
||||
TranslationModel(const std::string &config,
|
||||
marian::bergamot::AlignedMemory modelMemory = marian::bergamot::AlignedMemory(),
|
||||
marian::bergamot::AlignedMemory shortlistMemory = marian::bergamot::AlignedMemory());
|
||||
|
||||
~TranslationModel();
|
||||
|
||||
/* This method performs translation on a list of UTF-8 encoded plain text
|
||||
* (without any markups or emojis) and returns a list of results in the same
|
||||
* order. The model supports translation from 1 source language to 1 target
|
||||
* language.
|
||||
*
|
||||
* Each text entry can either be a word, a phrase, a sentence or a list of
|
||||
* sentences. Additional information related to the translated text can be
|
||||
* requested via TranslationRequest which is applied equally to each text
|
||||
* entry. The translated text corresponding to each text entry and the
|
||||
* additional information (as specified in the TranslationRequest) is
|
||||
* encapsulated and returned in TranslationResult.
|
||||
*
|
||||
* The API splits each text entry into sentences internally, which are then
|
||||
* translated independent of each other. The translated sentences are then
|
||||
* joined back together and returned in TranslationResult.
|
||||
*
|
||||
* Please refer to the TranslationRequest class to find out what additional
|
||||
* information can be requested. The alignment information can only be
|
||||
* requested if the model supports it (check isAlignmentSupported() API).
|
||||
*
|
||||
* The texts argument will become empty after the execution of this API (each
|
||||
* entry of texts list will be moved to its corresponding TranslationResult
|
||||
* object).
|
||||
*/
|
||||
std::vector<TranslationResult> translate(std::vector<std::string> &&texts,
|
||||
TranslationRequest request);
|
||||
|
||||
/* Check if the model can provide alignment information b/w original and
|
||||
* translated text. */
|
||||
bool isAlignmentSupported() const;
|
||||
|
||||
private:
|
||||
// Model configuration options
|
||||
std::shared_ptr<marian::Options> configOptions_; // ORDER DEPENDECNY
|
||||
marian::bergamot::Service service_; // ORDER DEPENDENCY
|
||||
};
|
||||
|
||||
#endif /* SRC_TRANSLATOR_TRANSLATIONMODEL_H_ */
|
@ -1,108 +0,0 @@
|
||||
/*
|
||||
* TranslationResult.h
|
||||
*
|
||||
* The class that represents the result of TranslationModel::translate()
|
||||
* API for each of its text entry and TranslationRequest.
|
||||
*/
|
||||
|
||||
#ifndef SRC_TRANSLATOR_TRANSLATIONRESULT_H_
|
||||
#define SRC_TRANSLATOR_TRANSLATIONRESULT_H_
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "QualityScore.h"
|
||||
|
||||
/* This class represents the result of TranslationModel::translate() API
|
||||
* for each of its text entry and TranslationRequest.
|
||||
*/
|
||||
class TranslationResult {
|
||||
public:
|
||||
typedef std::vector<std::pair<std::string_view, std::string_view>>
|
||||
SentenceMappings;
|
||||
#ifdef WASM_BINDINGS
|
||||
TranslationResult(const std::string &original, const std::string &translation)
|
||||
: originalText(original), translatedText(translation),
|
||||
sentenceMappings() {}
|
||||
#endif
|
||||
TranslationResult(const std::string &original, const std::string &translation,
|
||||
SentenceMappings &sentenceMappings)
|
||||
: originalText(original), translatedText(translation),
|
||||
sentenceMappings(sentenceMappings) {}
|
||||
|
||||
TranslationResult(TranslationResult &&other)
|
||||
: originalText(std::move(other.originalText)),
|
||||
translatedText(std::move(other.translatedText)),
|
||||
sentenceMappings(std::move(other.sentenceMappings)) {}
|
||||
|
||||
#ifdef WASM_BINDINGS
|
||||
TranslationResult(const TranslationResult &other)
|
||||
: originalText(other.originalText),
|
||||
translatedText(other.translatedText),
|
||||
sentenceMappings(other.sentenceMappings) {}
|
||||
#endif
|
||||
|
||||
TranslationResult(std::string &&original, std::string &&translation,
|
||||
SentenceMappings &&sentenceMappings)
|
||||
: originalText(std::move(original)),
|
||||
translatedText(std::move(translation)),
|
||||
sentenceMappings(std::move(sentenceMappings)) {}
|
||||
|
||||
#ifndef WASM_BINDINGS
|
||||
TranslationResult &operator=(const TranslationResult &) = delete;
|
||||
#else
|
||||
TranslationResult &operator=(const TranslationResult &result) {
|
||||
originalText = result.originalText;
|
||||
translatedText = result.translatedText;
|
||||
sentenceMappings = result.sentenceMappings;
|
||||
return *this;
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Return the original text. */
|
||||
const std::string &getOriginalText() const { return originalText; }
|
||||
|
||||
/* Return the translated text. */
|
||||
const std::string &getTranslatedText() const { return translatedText; }
|
||||
|
||||
/* Return the Quality scores of the translated text. */
|
||||
const QualityScore &getQualityScore() const { return qualityScore; }
|
||||
|
||||
/* Return the Sentence mappings (information regarding how individual
|
||||
* sentences of originalText map to corresponding translated sentences in
|
||||
* translatedText).
|
||||
*/
|
||||
const SentenceMappings &getSentenceMappings() const {
|
||||
return sentenceMappings;
|
||||
}
|
||||
|
||||
private:
|
||||
// Original text (in UTF-8 encoded format) that was supposed to be translated
|
||||
std::string originalText;
|
||||
|
||||
// Translation (in UTF-8 encoded format) of the originalText
|
||||
std::string translatedText;
|
||||
|
||||
// Quality score of the translated text at the granularity specified in
|
||||
// TranslationRequest. It is an optional result (it will have no information
|
||||
// if not requested in TranslationRequest)
|
||||
QualityScore qualityScore;
|
||||
|
||||
// Information regarding how individual sentences of originalText map to
|
||||
// corresponding translated sentences in joined translated text
|
||||
// (translatedText) An example of sentence mapping:
|
||||
// originalText (contains 2 sentences) = "What is your name?
|
||||
// My name is Abc." translatedText (contains 2 translated sentences) =
|
||||
// "Was ist dein Name? Mein Name ist Abc." sentenceMappings = [
|
||||
// {"What is your name?", "Was ist dein Name?"}, //
|
||||
// Pair(originalText[0],translatedText[0])
|
||||
// {"My name is Abc", "Mein Name ist Abc."} //
|
||||
// Pair(originalText[1],translatedText[1])
|
||||
// ]
|
||||
//
|
||||
// It is an optional result (it will be empty if not requested in
|
||||
// TranslationRequest).
|
||||
SentenceMappings sentenceMappings;
|
||||
};
|
||||
|
||||
#endif /* SRC_TRANSLATOR_TRANSLATIONRESULT_H_ */
|
@ -1,5 +1,4 @@
|
||||
add_library(bergamot-translator STATIC
|
||||
TranslationModel.cpp
|
||||
byte_array_util.cpp
|
||||
text_processor.cpp
|
||||
sentence_splitter.cpp
|
||||
|
@ -1,50 +0,0 @@
|
||||
/*
|
||||
* TranslationModel.cpp
|
||||
*
|
||||
*/
|
||||
|
||||
#include <future>
|
||||
#include <vector>
|
||||
|
||||
// All local project includes
|
||||
#include "TranslationModel.h"
|
||||
#include "translator/parser.h"
|
||||
#include "translator/response.h"
|
||||
#include "translator/service.h"
|
||||
|
||||
TranslationModel::TranslationModel(const std::string &config,
|
||||
marian::bergamot::AlignedMemory model_memory,
|
||||
marian::bergamot::AlignedMemory lexical_memory)
|
||||
: service_(config, std::move(model_memory), std::move(lexical_memory)) {}
|
||||
|
||||
TranslationModel::~TranslationModel() {}
|
||||
|
||||
std::vector<TranslationResult>
|
||||
TranslationModel::translate(std::vector<std::string> &&texts,
|
||||
TranslationRequest request) {
|
||||
|
||||
// This code, move into async?
|
||||
std::vector<TranslationResult> translationResults;
|
||||
std::vector<marian::bergamot::Response> responses =
|
||||
service_.translateMultiple(std::move(texts), request);
|
||||
for (auto &response : responses) {
|
||||
TranslationResult::SentenceMappings sentenceMappings;
|
||||
for (size_t idx = 0; idx < response.size(); idx++) {
|
||||
marian::string_view src = response.source.sentence(idx);
|
||||
marian::string_view tgt = response.target.sentence(idx);
|
||||
sentenceMappings.emplace_back(std::string_view(src.data(), src.size()),
|
||||
std::string_view(tgt.data(), tgt.size()));
|
||||
}
|
||||
|
||||
// In place construction.
|
||||
translationResults.emplace_back(
|
||||
std::move(response.source.text), // &&response.source_
|
||||
std::move(response.target.text), // &&response.translation_
|
||||
std::move(sentenceMappings) // &&sentenceMappings
|
||||
);
|
||||
}
|
||||
|
||||
return translationResults;
|
||||
}
|
||||
|
||||
bool TranslationModel::isAlignmentSupported() const { return false; }
|
@ -64,6 +64,10 @@ struct Response {
|
||||
/// sparse matrix representation with indices corresponding
|
||||
/// to (sub-)words accessible through Annotation.
|
||||
std::vector<Alignment> alignments;
|
||||
|
||||
const std::string &getOriginalText() const { return source.text; }
|
||||
|
||||
const std::string &getTranslatedText() const { return target.text; }
|
||||
};
|
||||
} // namespace bergamot
|
||||
} // namespace marian
|
||||
|
@ -28,8 +28,8 @@ loadVocabularies(marian::Ptr<marian::Options> options) {
|
||||
namespace marian {
|
||||
namespace bergamot {
|
||||
|
||||
Service::Service(Ptr<Options> options, AlignedMemory modelMemory, AlignedMemory shortlistMemory)
|
||||
: requestId_(0), vocabs_(std::move(loadVocabularies(options))),
|
||||
Service::Service(Ptr<Options> options, AlignedMemory modelMemory, AlignedMemory shortlistMemory)
|
||||
: requestId_(0), options_(options), vocabs_(std::move(loadVocabularies(options))),
|
||||
text_processor_(vocabs_, options), batcher_(options),
|
||||
numWorkers_(options->get<int>("cpu-threads")),
|
||||
modelMemory_(std::move(modelMemory)), shortlistMemory_(std::move(shortlistMemory))
|
||||
|
@ -20,15 +20,22 @@
|
||||
namespace marian {
|
||||
namespace bergamot {
|
||||
|
||||
/// Service offers methods create an asynchronous translation service. This is
|
||||
/// intended to be similar to the ones provided for training or decoding in ML
|
||||
/// pipelines with the following additional capabilities:
|
||||
/// Service offers methods to create an asynchronous translation service that
|
||||
/// translates a plain (without any markups and emojis) UTF-8 encoded text.
|
||||
/// This implementation supports translation from 1 source language to 1 target
|
||||
/// language.
|
||||
///
|
||||
/// This is intended to be similar to the ones provided for training or
|
||||
/// decoding in ML pipelines with the following additional capabilities:
|
||||
///
|
||||
/// 1. Provision of a request -> response based translation flow unlike the
|
||||
/// usual line-based translation or decoding provided in most ML frameworks.
|
||||
/// 2. Internal handling of normalization etc which changes source text to
|
||||
/// provide to client translation meta-information like alignments consistent
|
||||
/// with the unnormalized input text.
|
||||
/// 3. The API splits each text entry into sentences internally, which are then
|
||||
/// translated independent of each other. The translated sentences are then
|
||||
/// joined back together and returned in Response.
|
||||
///
|
||||
/// Service exposes methods to instantiate the service from a string
|
||||
/// configuration (which can cover most translators) and to translate an
|
||||
@ -48,9 +55,10 @@ namespace bergamot {
|
||||
/// // Do things with response.
|
||||
/// ```
|
||||
///
|
||||
/// Optionally Service can be initialized by also passing model_memory for
|
||||
/// Optionally Service can be initialized by also passing model memory for
|
||||
/// purposes of efficiency (which defaults to nullpointer and then reads from
|
||||
/// file supplied through config).
|
||||
///
|
||||
class Service {
|
||||
|
||||
public:
|
||||
@ -84,8 +92,8 @@ public:
|
||||
explicit Service(const std::string &config,
|
||||
AlignedMemory modelMemory = AlignedMemory(),
|
||||
AlignedMemory shortlistMemory = AlignedMemory())
|
||||
: Service(parseOptions(config, /*validate=*/false), std::move(modelMemory),
|
||||
std::move(shortlistMemory)) {}
|
||||
: Service(parseOptions(config, /*validate=*/false),
|
||||
std::move(modelMemory), std::move(shortlistMemory)) {}
|
||||
|
||||
/// Explicit destructor to clean up after any threads initialized in
|
||||
/// asynchronous operation mode.
|
||||
@ -108,12 +116,18 @@ public:
|
||||
std::future<Response> translate(std::string &&source,
|
||||
ResponseOptions options);
|
||||
|
||||
/// Translate an input, providing TranslationRequest across all texts to
|
||||
/// construct Response. Provides the browser with the ability to break texts
|
||||
/// into multiple Request keeping gains from efficiently batching internally.
|
||||
/// Also useful when one has to set/unset alignments or quality in the
|
||||
/// Response to save compute spent in constructing these objects.
|
||||
|
||||
/// Translate multiple text-blobs in a single *blocking* API call, providing
|
||||
/// TranslationRequest which applies across all text-blobs dictating how to
|
||||
/// construct Response. TranslationRequest can be used to enable/disable
|
||||
/// additional information like quality-scores, alignments etc.
|
||||
///
|
||||
/// All texts are combined to efficiently construct batches together providing
|
||||
/// speedups compared to calling translate() independently on individual
|
||||
/// text-blob. Note that there will be minor differences in output when
|
||||
/// text-blobs are individually translated due to approximations but similar
|
||||
/// quality nonetheless. If you have async/multithread capabilities, it is
|
||||
/// recommended to work with futures and translate() API.
|
||||
///
|
||||
/// @param [in] source: rvalue reference of the strings (text-blobs) to be translated
|
||||
/// @param [in] translationRequest: TranslationRequest (Unified API)
|
||||
/// indicating whether or not to include some member in the Response, also
|
||||
@ -123,6 +137,11 @@ public:
|
||||
translateMultiple(std::vector<std::string> &&source,
|
||||
TranslationRequest translationRequest);
|
||||
|
||||
/// Returns if model is alignment capable or not.
|
||||
bool isAlignmentSupported() const {
|
||||
return options_->hasAndNotEmpty("alignment");
|
||||
}
|
||||
|
||||
private:
|
||||
/// Queue an input for translation.
|
||||
std::future<Response> queueRequest(std::string &&input,
|
||||
@ -149,6 +168,10 @@ private:
|
||||
|
||||
/// Number of workers to launch.
|
||||
size_t numWorkers_; // ORDER DEPENDENCY (pcqueue_)
|
||||
|
||||
/// Options object holding the options Service was instantiated with.
|
||||
Ptr<Options> options_;
|
||||
|
||||
/// Model memory to load model passed as bytes.
|
||||
AlignedMemory modelMemory_; // ORDER DEPENDENCY (translators_)
|
||||
/// Shortlist memory passed as bytes.
|
||||
|
@ -6,10 +6,14 @@
|
||||
|
||||
#include <emscripten/bind.h>
|
||||
|
||||
#include "TranslationModel.h"
|
||||
#include "response.h"
|
||||
#include "service.h"
|
||||
|
||||
using namespace emscripten;
|
||||
|
||||
typedef marian::bergamot::Service TranslationModel;
|
||||
typedef marian::bergamot::Response TranslationResult;
|
||||
|
||||
val getByteArrayView(marian::bergamot::AlignedMemory& alignedMemory) {
|
||||
return val(typed_memory_view(alignedMemory.size(), alignedMemory.as<char>()));
|
||||
}
|
||||
@ -31,9 +35,11 @@ TranslationModel* TranslationModelFactory(const std::string &config,
|
||||
EMSCRIPTEN_BINDINGS(translation_model) {
|
||||
class_<TranslationModel>("TranslationModel")
|
||||
.constructor(&TranslationModelFactory, allow_raw_pointers())
|
||||
.function("translate", &TranslationModel::translate)
|
||||
.function("translate", &TranslationModel::translateMultiple)
|
||||
.function("isAlignmentSupported", &TranslationModel::isAlignmentSupported)
|
||||
;
|
||||
// ^ We redirect Service::translateMultiple to WASMBound::translate instead. Sane API is
|
||||
// translate. If and when async comes, we can be done with this inconsistency.
|
||||
|
||||
register_vector<std::string>("VectorString");
|
||||
register_vector<TranslationResult>("VectorTranslationResult");
|
||||
|
@ -6,15 +6,16 @@
|
||||
#include <emscripten/bind.h>
|
||||
#include <vector>
|
||||
|
||||
#include "TranslationResult.h"
|
||||
#include "response.h"
|
||||
|
||||
typedef marian::bergamot::Response TranslationResult;
|
||||
|
||||
using namespace emscripten;
|
||||
|
||||
// Binding code
|
||||
EMSCRIPTEN_BINDINGS(translation_result) {
|
||||
class_<TranslationResult>("TranslationResult")
|
||||
.constructor<std::string, std::string, TranslationResult::SentenceMappings>()
|
||||
.function("getOriginalText", &TranslationResult::getOriginalText)
|
||||
.function("getTranslatedText", &TranslationResult::getTranslatedText)
|
||||
;
|
||||
.constructor<>()
|
||||
.function("getOriginalText", &TranslationResult::getOriginalText)
|
||||
.function("getTranslatedText", &TranslationResult::getTranslatedText);
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user