diff --git a/app/bergamot-translator-app-bytearray.cpp b/app/bergamot-translator-app-bytearray.cpp index 1fa5748..91353c0 100644 --- a/app/bergamot-translator-app-bytearray.cpp +++ b/app/bergamot-translator-app-bytearray.cpp @@ -7,9 +7,9 @@ #include -#include "TranslationModel.h" -#include "translator/parser.h" #include "translator/byte_array_util.h" +#include "translator/parser.h" +#include "translator/service.h" int main(int argc, char **argv) { @@ -20,19 +20,17 @@ int main(int argc, char **argv) { std::string config = options->asYamlString(); // Route the config string to construct marian model through TranslationModel - TranslationModel model(config, marian::bergamot::getModelMemoryFromConfig(options)); + marian::bergamot::Service model( + config, marian::bergamot::getModelMemoryFromConfig(options)); TranslationRequest translationRequest; std::vector texts; for (std::string line; std::getline(std::cin, line);) { - texts.emplace_back(line); + texts.emplace_back(line); } - auto results = model.translate(std::move(texts), translationRequest); - - // Resolve the future and get the actual result - //std::vector results = futureResults.get(); + auto results = model.translateMultiple(std::move(texts), translationRequest); for (auto &result : results) { std::cout << result.getTranslatedText() << std::endl; diff --git a/app/bergamot-translator-app.cpp b/app/bergamot-translator-app.cpp index 4fba00b..c487969 100644 --- a/app/bergamot-translator-app.cpp +++ b/app/bergamot-translator-app.cpp @@ -1,16 +1,17 @@ /* * main.cpp * - * An application which accepts line separated texts in stdin and returns translated ones in stdout. - * It is convenient for batch processing and can be used with tools like SacreBLEU. + * An application which accepts line separated texts in stdin and returns + * translated ones in stdout. It is convenient for batch processing and can be + * used with tools like SacreBLEU. 
* */ #include #include -#include "TranslationModel.h" #include "translator/parser.h" +#include "translator/service.h" int main(int argc, char **argv) { @@ -21,19 +22,16 @@ int main(int argc, char **argv) { std::string config = options->asYamlString(); // Route the config string to construct marian model through TranslationModel - TranslationModel model(config); + marian::bergamot::Service model(config); TranslationRequest translationRequest; std::vector texts; for (std::string line; std::getline(std::cin, line);) { - texts.emplace_back(line); + texts.emplace_back(line); } - auto results = model.translate(std::move(texts), translationRequest); - - // Resolve the future and get the actual result - //std::vector results = futureResults.get(); + auto results = model.translateMultiple(std::move(texts), translationRequest); for (auto &result : results) { std::cout << result.getTranslatedText() << std::endl; diff --git a/src/TranslationModel.h b/src/TranslationModel.h deleted file mode 100644 index 4b1be23..0000000 --- a/src/TranslationModel.h +++ /dev/null @@ -1,80 +0,0 @@ -/* - * TranslationModel.h - * - * Main interface for translation API. - */ - -#ifndef SRC_TRANSLATOR_TRANSLATIONMODEL_H_ -#define SRC_TRANSLATOR_TRANSLATIONMODEL_H_ - -#include -#include -#include - -// All 3rd party includes -#include "3rd_party/marian-dev/src/common/options.h" - -// All local project includes -#include "TranslationRequest.h" -#include "TranslationResult.h" -#include "translator/definitions.h" -#include "translator/service.h" - -/* A Translation model that translates a plain (without any markups and emojis) - * UTF-8 encoded text. This implementation supports translation from 1 source - * language to 1 target language. 
- */ -class TranslationModel { -public: - /* Construct the model using the model configuration options as yaml-formatted - * string - */ - /** - * @param config Marian yml config file in the form of a string - * @param model_memory optional byte array (aligned to 64!!!) that contains - * the bytes of a model.bin. - */ - TranslationModel(const std::string &config, - marian::bergamot::AlignedMemory modelMemory = marian::bergamot::AlignedMemory(), - marian::bergamot::AlignedMemory shortlistMemory = marian::bergamot::AlignedMemory()); - - ~TranslationModel(); - - /* This method performs translation on a list of UTF-8 encoded plain text - * (without any markups or emojis) and returns a list of results in the same - * order. The model supports translation from 1 source language to 1 target - * language. - * - * Each text entry can either be a word, a phrase, a sentence or a list of - * sentences. Additional information related to the translated text can be - * requested via TranslationRequest which is applied equally to each text - * entry. The translated text corresponding to each text entry and the - * additional information (as specified in the TranslationRequest) is - * encapsulated and returned in TranslationResult. - * - * The API splits each text entry into sentences internally, which are then - * translated independent of each other. The translated sentences are then - * joined back together and returned in TranslationResult. - * - * Please refer to the TranslationRequest class to find out what additional - * information can be requested. The alignment information can only be - * requested if the model supports it (check isAlignmentSupported() API). - * - * The texts argument will become empty after the execution of this API (each - * entry of texts list will be moved to its corresponding TranslationResult - * object). 
- */ - std::vector translate(std::vector &&texts, - TranslationRequest request); - - /* Check if the model can provide alignment information b/w original and - * translated text. */ - bool isAlignmentSupported() const; - -private: - // Model configuration options - std::shared_ptr configOptions_; // ORDER DEPENDECNY - marian::bergamot::Service service_; // ORDER DEPENDENCY -}; - -#endif /* SRC_TRANSLATOR_TRANSLATIONMODEL_H_ */ diff --git a/src/TranslationResult.h b/src/TranslationResult.h deleted file mode 100644 index 8c6c806..0000000 --- a/src/TranslationResult.h +++ /dev/null @@ -1,108 +0,0 @@ -/* - * TranslationResult.h - * - * The class that represents the result of TranslationModel::translate() - * API for each of its text entry and TranslationRequest. - */ - -#ifndef SRC_TRANSLATOR_TRANSLATIONRESULT_H_ -#define SRC_TRANSLATOR_TRANSLATIONRESULT_H_ - -#include -#include - -#include "QualityScore.h" - -/* This class represents the result of TranslationModel::translate() API - * for each of its text entry and TranslationRequest. 
- */ -class TranslationResult { -public: - typedef std::vector> - SentenceMappings; -#ifdef WASM_BINDINGS - TranslationResult(const std::string &original, const std::string &translation) - : originalText(original), translatedText(translation), - sentenceMappings() {} -#endif - TranslationResult(const std::string &original, const std::string &translation, - SentenceMappings &sentenceMappings) - : originalText(original), translatedText(translation), - sentenceMappings(sentenceMappings) {} - - TranslationResult(TranslationResult &&other) - : originalText(std::move(other.originalText)), - translatedText(std::move(other.translatedText)), - sentenceMappings(std::move(other.sentenceMappings)) {} - -#ifdef WASM_BINDINGS - TranslationResult(const TranslationResult &other) - : originalText(other.originalText), - translatedText(other.translatedText), - sentenceMappings(other.sentenceMappings) {} -#endif - - TranslationResult(std::string &&original, std::string &&translation, - SentenceMappings &&sentenceMappings) - : originalText(std::move(original)), - translatedText(std::move(translation)), - sentenceMappings(std::move(sentenceMappings)) {} - -#ifndef WASM_BINDINGS - TranslationResult &operator=(const TranslationResult &) = delete; -#else - TranslationResult &operator=(const TranslationResult &result) { - originalText = result.originalText; - translatedText = result.translatedText; - sentenceMappings = result.sentenceMappings; - return *this; - } -#endif - - /* Return the original text. */ - const std::string &getOriginalText() const { return originalText; } - - /* Return the translated text. */ - const std::string &getTranslatedText() const { return translatedText; } - - /* Return the Quality scores of the translated text. */ - const QualityScore &getQualityScore() const { return qualityScore; } - - /* Return the Sentence mappings (information regarding how individual - * sentences of originalText map to corresponding translated sentences in - * translatedText). 
- */ - const SentenceMappings &getSentenceMappings() const { - return sentenceMappings; - } - -private: - // Original text (in UTF-8 encoded format) that was supposed to be translated - std::string originalText; - - // Translation (in UTF-8 encoded format) of the originalText - std::string translatedText; - - // Quality score of the translated text at the granularity specified in - // TranslationRequest. It is an optional result (it will have no information - // if not requested in TranslationRequest) - QualityScore qualityScore; - - // Information regarding how individual sentences of originalText map to - // corresponding translated sentences in joined translated text - // (translatedText) An example of sentence mapping: - // originalText (contains 2 sentences) = "What is your name? - // My name is Abc." translatedText (contains 2 translated sentences) = - // "Was ist dein Name? Mein Name ist Abc." sentenceMappings = [ - // {"What is your name?", "Was ist dein Name?"}, // - // Pair(originalText[0],translatedText[0]) - // {"My name is Abc", "Mein Name ist Abc."} // - // Pair(originalText[1],translatedText[1]) - // ] - // - // It is an optional result (it will be empty if not requested in - // TranslationRequest). 
- SentenceMappings sentenceMappings; -}; - -#endif /* SRC_TRANSLATOR_TRANSLATIONRESULT_H_ */ diff --git a/src/translator/CMakeLists.txt b/src/translator/CMakeLists.txt index d7c8e3c..25ca916 100644 --- a/src/translator/CMakeLists.txt +++ b/src/translator/CMakeLists.txt @@ -1,5 +1,4 @@ add_library(bergamot-translator STATIC - TranslationModel.cpp byte_array_util.cpp text_processor.cpp sentence_splitter.cpp diff --git a/src/translator/TranslationModel.cpp b/src/translator/TranslationModel.cpp deleted file mode 100644 index 026a126..0000000 --- a/src/translator/TranslationModel.cpp +++ /dev/null @@ -1,50 +0,0 @@ -/* - * TranslationModel.cpp - * - */ - -#include -#include - -// All local project includes -#include "TranslationModel.h" -#include "translator/parser.h" -#include "translator/response.h" -#include "translator/service.h" - -TranslationModel::TranslationModel(const std::string &config, - marian::bergamot::AlignedMemory model_memory, - marian::bergamot::AlignedMemory lexical_memory) - : service_(config, std::move(model_memory), std::move(lexical_memory)) {} - -TranslationModel::~TranslationModel() {} - -std::vector -TranslationModel::translate(std::vector &&texts, - TranslationRequest request) { - - // This code, move into async? - std::vector translationResults; - std::vector responses = - service_.translateMultiple(std::move(texts), request); - for (auto &response : responses) { - TranslationResult::SentenceMappings sentenceMappings; - for (size_t idx = 0; idx < response.size(); idx++) { - marian::string_view src = response.source.sentence(idx); - marian::string_view tgt = response.target.sentence(idx); - sentenceMappings.emplace_back(std::string_view(src.data(), src.size()), - std::string_view(tgt.data(), tgt.size())); - } - - // In place construction. 
- translationResults.emplace_back( - std::move(response.source.text), // &&response.source_ - std::move(response.target.text), // &&response.translation_ - std::move(sentenceMappings) // &&sentenceMappings - ); - } - - return translationResults; -} - -bool TranslationModel::isAlignmentSupported() const { return false; } diff --git a/src/translator/response.h b/src/translator/response.h index 3b1f48d..0f7ecb5 100644 --- a/src/translator/response.h +++ b/src/translator/response.h @@ -64,6 +64,10 @@ struct Response { /// sparse matrix representation with indices corresponding /// to (sub-)words accessible through Annotation. std::vector alignments; + + const std::string &getOriginalText() const { return source.text; } + + const std::string &getTranslatedText() const { return target.text; } }; } // namespace bergamot } // namespace marian diff --git a/src/translator/service.cpp b/src/translator/service.cpp index f676797..3d19f5e 100644 --- a/src/translator/service.cpp +++ b/src/translator/service.cpp @@ -28,8 +28,8 @@ loadVocabularies(marian::Ptr options) { namespace marian { namespace bergamot { -Service::Service(Ptr options, AlignedMemory modelMemory, AlignedMemory shortlistMemory) - : requestId_(0), vocabs_(std::move(loadVocabularies(options))), +Service::Service(Ptr options, AlignedMemory modelMemory, AlignedMemory shortlistMemory) + : requestId_(0), options_(options), vocabs_(std::move(loadVocabularies(options))), text_processor_(vocabs_, options), batcher_(options), numWorkers_(options->get("cpu-threads")), modelMemory_(std::move(modelMemory)), shortlistMemory_(std::move(shortlistMemory)) diff --git a/src/translator/service.h b/src/translator/service.h index a731653..288c649 100644 --- a/src/translator/service.h +++ b/src/translator/service.h @@ -20,15 +20,22 @@ namespace marian { namespace bergamot { -/// Service offers methods create an asynchronous translation service. 
This is -/// intended to be similar to the ones provided for training or decoding in ML -/// pipelines with the following additional capabilities: +/// Service offers methods to create an asynchronous translation service that +/// translates a plain (without any markups and emojis) UTF-8 encoded text. +/// This implementation supports translation from 1 source language to 1 target +/// language. +/// +/// This is intended to be similar to the ones provided for training or +/// decoding in ML pipelines with the following additional capabilities: /// /// 1. Provision of a request -> response based translation flow unlike the /// usual a line based translation or decoding provided in most ML frameworks. /// 2. Internal handling of normalization etc which changes source text to /// provide to client translation meta-information like alignments consistent /// with the unnormalized input text. +/// 3. The API splits each text entry into sentences internally, which are then +/// translated independently of each other. The translated sentences are then +/// joined back together and returned in Response. /// /// Service exposes methods to instantiate the service from a string /// configuration (which can cover most translators) and to translate an @@ -48,9 +55,10 @@ namespace bergamot { /// // Do things with response. /// ``` /// -/// Optionally Service can be initialized by also passing model_memory for +/// Optionally Service can be initialized by also passing model memory for /// purposes of efficiency (which defaults to nullpointer and then reads from /// file supplied through config). 
+/// class Service { public: @@ -84,8 +92,8 @@ public: explicit Service(const std::string &config, AlignedMemory modelMemory = AlignedMemory(), AlignedMemory shortlistMemory = AlignedMemory()) - : Service(parseOptions(config, /*validate=*/false), std::move(modelMemory), - std::move(shortlistMemory)) {} + : Service(parseOptions(config, /*validate=*/false), + std::move(modelMemory), std::move(shortlistMemory)) {} /// Explicit destructor to clean up after any threads initialized in /// asynchronous operation mode. @@ -108,12 +116,18 @@ public: std::future translate(std::string &&source, ResponseOptions options); - /// Translate an input, providing TranslationRequest across all texts to - /// construct Response. Provides the browser with the ability to break texts - /// into multiple Request keeping gains from efficiently batching internally. - /// Also useful when one has to set/unset alignments or quality in the - /// Response to save compute spent in constructing these objects. - + /// Translate multiple text-blobs in a single *blocking* API call, providing + /// TranslationRequest which applies across all text-blobs dictating how to + /// construct Response. TranslationRequest can be used to enable/disable + /// additional information like quality-scores, alignments etc. + /// + /// All texts are combined to efficiently construct batches together providing + /// speedups compared to calling translate() independently on individual + /// text-blob. Note that there will be minor differences in output when + /// text-blobs are individually translated due to approximations but similar + /// quality nonetheless. If you have async/multithread capabilities, it is + /// recommended to work with futures and translate() API. 
+ /// /// @param [in] source: rvalue reference of the string to be translated /// @param [in] translationRequest: TranslationRequest (Unified API) /// indicating whether or not to include some member in the Response, also @@ -123,6 +137,11 @@ public: translateMultiple(std::vector &&source, TranslationRequest translationRequest); + /// Returns if model is alignment capable or not. + bool isAlignmentSupported() const { + return options_->hasAndNotEmpty("alignment"); + } + private: /// Queue an input for translation. std::future queueRequest(std::string &&input, @@ -149,6 +168,10 @@ private: /// Number of workers to launch. size_t numWorkers_; // ORDER DEPENDENCY (pcqueue_) + + /// Options object holding the options Service was instantiated with. + Ptr options_; + /// Model memory to load model passed as bytes. AlignedMemory modelMemory_; // ORDER DEPENDENCY (translators_) /// Shortlist memory passed as bytes. diff --git a/wasm/bindings/TranslationModelBindings.cpp b/wasm/bindings/TranslationModelBindings.cpp index cb9cf4b..41b9c2e 100644 --- a/wasm/bindings/TranslationModelBindings.cpp +++ b/wasm/bindings/TranslationModelBindings.cpp @@ -6,10 +6,14 @@ #include -#include "TranslationModel.h" +#include "response.h" +#include "service.h" using namespace emscripten; +typedef marian::bergamot::Service TranslationModel; +typedef marian::bergamot::Response TranslationResult; + val getByteArrayView(marian::bergamot::AlignedMemory& alignedMemory) { return val(typed_memory_view(alignedMemory.size(), alignedMemory.as())); } @@ -31,9 +35,11 @@ TranslationModel* TranslationModelFactory(const std::string &config, EMSCRIPTEN_BINDINGS(translation_model) { class_("TranslationModel") .constructor(&TranslationModelFactory, allow_raw_pointers()) - .function("translate", &TranslationModel::translate) + .function("translate", &TranslationModel::translateMultiple) .function("isAlignmentSupported", &TranslationModel::isAlignmentSupported) ; + // ^ We redirect Service::translateMultiple to 
WASMBound::translate instead. Sane API is + // translate. If and when async comes, we can be done with this inconsistency. register_vector("VectorString"); register_vector("VectorTranslationResult"); diff --git a/wasm/bindings/TranslationResultBindings.cpp b/wasm/bindings/TranslationResultBindings.cpp index a3713a1..c1c0ca8 100644 --- a/wasm/bindings/TranslationResultBindings.cpp +++ b/wasm/bindings/TranslationResultBindings.cpp @@ -6,15 +6,16 @@ #include #include -#include "TranslationResult.h" +#include "response.h" + +typedef marian::bergamot::Response TranslationResult; using namespace emscripten; // Binding code EMSCRIPTEN_BINDINGS(translation_result) { class_("TranslationResult") - .constructor() - .function("getOriginalText", &TranslationResult::getOriginalText) - .function("getTranslatedText", &TranslationResult::getTranslatedText) - ; + .constructor<>() + .function("getOriginalText", &TranslationResult::getOriginalText) + .function("getTranslatedText", &TranslationResult::getTranslatedText); }