From 5340b19eae8fcc91a2a79205e0b3dd65ad61ad4c Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Tue, 18 May 2021 19:53:23 +0000 Subject: [PATCH] Format update with clang-format --- src/tests/annotation_tests.cpp | 28 ++++---- src/translator/annotation.cpp | 19 +++--- src/translator/annotation.h | 67 ++++++++------------ src/translator/batch.cpp | 12 ++-- src/translator/batch.h | 10 +-- src/translator/batch_translator.cpp | 65 ++++++++++--------- src/translator/batch_translator.h | 21 +++--- src/translator/batcher.cpp | 8 ++- src/translator/batcher.h | 12 ++-- src/translator/byte_array_util.cpp | 37 ++++++----- src/translator/byte_array_util.h | 6 +- src/translator/definitions.h | 17 ++--- src/translator/parser.h | 29 ++++----- src/translator/request.cpp | 36 ++++------- src/translator/request.h | 31 ++++----- src/translator/response.h | 24 +++---- src/translator/response_builder.cpp | 64 +++++++++---------- src/translator/response_builder.h | 27 ++++---- src/translator/response_options.h | 10 +-- src/translator/sentence_splitter.cpp | 38 +++++------ src/translator/sentence_splitter.h | 15 +++-- src/translator/service.cpp | 55 ++++++---------- src/translator/service.h | 46 ++++++-------- src/translator/text_processor.cpp | 36 ++++------- src/translator/text_processor.h | 25 ++++---- src/translator/vocabs.h | 29 ++++----- wasm/bindings/TranslationModelBindings.cpp | 30 ++++----- wasm/bindings/TranslationRequestBindings.cpp | 6 +- wasm/bindings/TranslationResultBindings.cpp | 1 + 29 files changed, 364 insertions(+), 440 deletions(-) diff --git a/src/tests/annotation_tests.cpp b/src/tests/annotation_tests.cpp index 0f02a7a..d7178f4 100644 --- a/src/tests/annotation_tests.cpp +++ b/src/tests/annotation_tests.cpp @@ -1,8 +1,9 @@ -#include "catch.hpp" -#include "translator/annotation.h" #include #include +#include "catch.hpp" +#include "translator/annotation.h" + using namespace marian::bergamot; TEST_CASE("Test Annotation API with random sentences") { @@ -52,8 +53,7 @@ TEST_CASE("Test Annotation API with random sentences") { } std::string text; for (size_t idx = 0; idx < sentences; idx++) { - if (idx != 0) - text += "\n"; + if (idx != 0) text += "\n"; // Words can be zero, we need to support empty word sentences as well. size_t numWords = randomIntGen_() % maxWords; @@ -96,8 +96,8 @@ TEST_CASE("Test Annotation API with random sentences") { groundTruthSentences.push_back((ByteRange){sentenceBegin, sentenceEnd}); } - AnnotatedText testAnnotation(std::move(text)); // This the container we add through API and - // check if the access is correct. + AnnotatedText testAnnotation(std::move(text)); // This the container we add through API and + // check if the access is correct. // We prepare string_views now with the known ByteRanges and use the // string_view based AnnotatedText.addSentence(...) API to add sentences to @@ -105,8 +105,7 @@ TEST_CASE("Test Annotation API with random sentences") { // the math underneath. if (debug) { - std::cout << "Inserting words onto container and save ground-truth-table:" - << std::endl; + std::cout << "Inserting words onto container and save ground-truth-table:" << std::endl; } std::vector> wordStringViews; @@ -115,8 +114,7 @@ TEST_CASE("Test Annotation API with random sentences") { std::vector wordByteRanges; bool first{true}; for (auto &word : sentence) { - marian::string_view wordView(&testAnnotation.text[word.begin], - word.size()); + marian::string_view wordView(&testAnnotation.text[word.begin], word.size()); wordByteRanges.push_back(wordView); if (debug) { if (first) { @@ -127,7 +125,8 @@ TEST_CASE("Test Annotation API with random sentences") { std::cout << std::string(wordView); } } - testAnnotation.recordExistingSentence(wordByteRanges.begin(), wordByteRanges.end(), testAnnotation.text.data() + sentence_iter->begin); + testAnnotation.recordExistingSentence(wordByteRanges.begin(), wordByteRanges.end(), + testAnnotation.text.data() + sentence_iter->begin); ++sentence_iter; wordStringViews.push_back(wordByteRanges); if (debug) { @@ -136,9 +135,7 @@ TEST_CASE("Test Annotation API with random sentences") { } if (debug) { - std::cout - << "Inserting sentences onto container and save ground-truth-table" - << std::endl; + std::cout << "Inserting sentences onto container and save ground-truth-table" << std::endl; } std::vector sentenceStringViews; for (auto &sentenceByteRange : groundTruthSentences) { @@ -203,7 +200,8 @@ TEST_CASE("Test Annotation API with random sentences") { // Sentence if the random test above does not cover it for some reason. int emptySentenceIdx = sentences; std::vector emptySentence; - testAnnotation.recordExistingSentence(emptySentence.begin(), emptySentence.end(), testAnnotation.text.data() + testAnnotation.text.size()); + testAnnotation.recordExistingSentence(emptySentence.begin(), emptySentence.end(), + testAnnotation.text.data() + testAnnotation.text.size()); // There are no words. CHECK(testAnnotation.numWords(emptySentenceIdx) == 0); diff --git a/src/translator/annotation.cpp b/src/translator/annotation.cpp index 90e02e0..35f5c69 100644 --- a/src/translator/annotation.cpp +++ b/src/translator/annotation.cpp @@ -1,4 +1,5 @@ #include "annotation.h" + #include namespace marian { @@ -9,7 +10,8 @@ AnnotatedText::AnnotatedText(std::string &&t) : text(std::move(t)) { annotation.token_begin_.back() = text.size(); } -void AnnotatedText::appendSentence(string_view prefix, std::vector::iterator begin, std::vector::iterator end) { +void AnnotatedText::appendSentence(string_view prefix, std::vector::iterator begin, + std::vector::iterator end) { assert(annotation.token_begin_.back() == text.size()); // We'll be adding tokens from the sentence and another gap. annotation.token_begin_.reserve(annotation.token_begin_.size() + (end - begin) + 1); @@ -25,7 +27,7 @@ void AnnotatedText::appendSentence(string_view prefix, std::vector: } if (begin != end) { text.append(begin->data(), (end - 1)->data() + (end - 1)->size()); - assert(offset == text.size()); // Tokens should be contiguous. + assert(offset == text.size()); // Tokens should be contiguous. } // Add the gap after the sentence. This is empty for now, but will be @@ -39,7 +41,8 @@ void AnnotatedText::appendEndingWhitespace(string_view whitespace) { annotation.token_begin_.back() = text.size(); } -void AnnotatedText::recordExistingSentence(std::vector::iterator begin, std::vector::iterator end, const char *sentence_begin) { +void AnnotatedText::recordExistingSentence(std::vector::iterator begin, + std::vector::iterator end, const char *sentence_begin) { assert(sentence_begin >= text.data()); assert(sentence_begin <= text.data() + text.size()); assert(begin == end || sentence_begin == begin->data()); @@ -48,9 +51,9 @@ void AnnotatedText::recordExistingSentence(std::vector::iterator be // Clip off size token ending. annotation.token_begin_.resize(annotation.token_begin_.size() - 1); for (std::vector::iterator i = begin; i != end; ++i) { - assert(i->data() >= text.data()); // In range. - assert(i->data() + i->size() <= text.data() + text.size()); // In range - assert(i + 1 == end || i->data() + i->size() == (i+1)->data()); // Contiguous + assert(i->data() >= text.data()); // In range. + assert(i->data() + i->size() <= text.data() + text.size()); // In range + assert(i + 1 == end || i->data() + i->size() == (i + 1)->data()); // Contiguous annotation.token_begin_.push_back(i->data() - text.data()); } // Gap token after sentence. @@ -65,5 +68,5 @@ void AnnotatedText::recordExistingSentence(std::vector::iterator be annotation.token_begin_.push_back(text.size()); } -} // namespace bergamot -} // namespace marian +} // namespace bergamot +} // namespace marian diff --git a/src/translator/annotation.h b/src/translator/annotation.h index 555ab53..dde340b 100644 --- a/src/translator/annotation.h +++ b/src/translator/annotation.h @@ -1,11 +1,12 @@ #ifndef BERGAMOT_SENTENCE_RANGES_H_ #define BERGAMOT_SENTENCE_RANGES_H_ -#include "data/types.h" #include #include #include +#include "data/types.h" + namespace marian { namespace bergamot { @@ -18,8 +19,8 @@ struct ByteRange { }; /// Annotation expresses sentence and token boundary information as ranges of -/// bytes in a string, but does not itself own the string. -/// +/// bytes in a string, but does not itself own the string. +/// /// See also AnnotatedText, which owns Annotation and the string. AnnotatedText /// wraps these ByteRange functions to provide a string_view interface. /// @@ -42,7 +43,7 @@ struct ByteRange { /// produced empty output). That's fine, these are just empty ranges as you /// would expect. class Annotation { -public: + public: /// Initially an empty string. Populated by AnnotatedText. Annotation() { token_begin_.push_back(0); @@ -62,25 +63,25 @@ public: /// `.numWords()` for `sentenceIdx` for defined behaviour. ByteRange word(size_t sentenceIdx, size_t wordIdx) const { size_t tokenIdx = gap_[sentenceIdx] + 1 + wordIdx; - return ByteRange {token_begin_[tokenIdx], token_begin_[tokenIdx + 1]}; + return ByteRange{token_begin_[tokenIdx], token_begin_[tokenIdx + 1]}; } /// Returns a ByteRange representing sentence corresponding to `sentenceIdx`. /// `sentenceIdx` follows 0-based indexing, and behaviour is defined only when /// less than `.numSentences()`. ByteRange sentence(size_t sentenceIdx) const { - return ByteRange { - token_begin_[gap_[sentenceIdx] + 1], /*end of whitespace before */ - token_begin_[gap_[sentenceIdx + 1]] /*beginning of whitespace after */ + return ByteRange{ + token_begin_[gap_[sentenceIdx] + 1], /*end of whitespace before */ + token_begin_[gap_[sentenceIdx + 1]] /*beginning of whitespace after */ }; } ByteRange gap(size_t gapIdx) const { size_t tokenIdx = gap_[gapIdx]; - return ByteRange {token_begin_[tokenIdx], token_begin_[tokenIdx + 1]}; + return ByteRange{token_begin_[tokenIdx], token_begin_[tokenIdx + 1]}; } -private: + private: friend class AnnotatedText; /// Map from token index to byte offset at which it begins. Token i is: /// [token_begin_[i], token_begin_[i+1]) @@ -124,9 +125,9 @@ private: /// 3. Bind the text and annotations together, to move around as a meaningful /// unit. struct AnnotatedText { -public: - std::string text; ///< Blob of string elements in annotation refers to. - Annotation annotation; ///< sentence and (sub-) word annotations. + public: + std::string text; ///< Blob of string elements in annotation refers to. + Annotation annotation; ///< sentence and (sub-) word annotations. /// Construct an empty AnnotatedText. This is useful when the target string or /// ByteRanges are not known yet, but the public members can be used to @@ -143,10 +144,8 @@ public: /// string_views. Since this tracks only prefix, remember /// appendEndingWhitespace. /// The string_views must not already be in text. - void appendSentence( - string_view prefix, - std::vector::iterator tokens_begin, - std::vector::iterator tokens_end); + void appendSentence(string_view prefix, std::vector::iterator tokens_begin, + std::vector::iterator tokens_end); /// Append the whitespace at the end of input. string_view must not be in /// text. @@ -158,18 +157,14 @@ public: /// Normally the beginning of the sentence can be inferred from /// tokens_begin->data() but the tokens could be empty, so sentence_begin is /// required to know where the sentence is. - void recordExistingSentence( - std::vector::iterator tokens_begin, - std::vector::iterator tokens_end, - const char *sentence_begin); + void recordExistingSentence(std::vector::iterator tokens_begin, + std::vector::iterator tokens_end, const char *sentence_begin); /// Returns the number of sentences in the annotation structure. const size_t numSentences() const { return annotation.numSentences(); } /// Returns number of words in the sentece identified by sentenceIdx. - const size_t numWords(size_t sentenceIdx) const { - return annotation.numWords(sentenceIdx); - } + const size_t numWords(size_t sentenceIdx) const { return annotation.numWords(sentenceIdx); } /// Returns a string_view representing wordIdx in sentenceIdx string_view word(size_t sentenceIdx, size_t wordIdx) const { @@ -177,9 +172,7 @@ public: } /// Returns a string_view representing sentence corresponding to sentenceIdx. - string_view sentence(size_t sentenceIdx) const { - return asStringView(annotation.sentence(sentenceIdx)); - } + string_view sentence(size_t sentenceIdx) const { return asStringView(annotation.sentence(sentenceIdx)); } /// Returns the string_view of the gap between two sentences in the container. /// @@ -191,27 +184,21 @@ public: /// * For `i = N`, the gap between the last (N-1th) sentence and end of /// text. /// @param sentenceIdx: Can be between `[0, numSentences()]`. - string_view gap(size_t sentenceIdx) const { - return asStringView(annotation.gap(sentenceIdx)); - } + string_view gap(size_t sentenceIdx) const { return asStringView(annotation.gap(sentenceIdx)); } /// Returns a ByteRange representing wordIdx in sentenceIdx - ByteRange wordAsByteRange(size_t sentenceIdx, size_t wordIdx) const { - return annotation.word(sentenceIdx, wordIdx); - } + ByteRange wordAsByteRange(size_t sentenceIdx, size_t wordIdx) const { return annotation.word(sentenceIdx, wordIdx); } /// Returns a ByteRange representing sentence corresponding to sentenceIdx. - ByteRange sentenceAsByteRange(size_t sentenceIdx) const { - return annotation.sentence(sentenceIdx); - } + ByteRange sentenceAsByteRange(size_t sentenceIdx) const { return annotation.sentence(sentenceIdx); } -private: + private: string_view asStringView(const ByteRange &byteRange) const { return string_view(text.data() + byteRange.begin, byteRange.size()); } }; -} // namespace bergamot -} // namespace marian +} // namespace bergamot +} // namespace marian -#endif // BERGAMOT_SENTENCE_RANGES_H_ +#endif // BERGAMOT_SENTENCE_RANGES_H_ diff --git a/src/translator/batch.cpp b/src/translator/batch.cpp index 82ebbfb..08d3d02 100644 --- a/src/translator/batch.cpp +++ b/src/translator/batch.cpp @@ -1,4 +1,5 @@ #include "batch.h" + #include "request.h" namespace marian { @@ -11,18 +12,15 @@ void Batch::log() { maxLength = std::max(maxLength, static_cast(sentence.numTokens())); } - LOG(info, "Batch(tokens={}, max-length={}, sentences_={})", numTokens, - maxLength, sentences_.size()); + LOG(info, "Batch(tokens={}, max-length={}, sentences_={})", numTokens, maxLength, sentences_.size()); } -void Batch::add(const RequestSentence &sentence) { - sentences_.push_back(sentence); -} +void Batch::add(const RequestSentence &sentence) { sentences_.push_back(sentence); } void Batch::completeBatch(const Histories &histories) { for (size_t i = 0; i < sentences_.size(); i++) { sentences_[i].completeSentence(histories[i]); } } -} // namespace bergamot -} // namespace marian +} // namespace bergamot +} // namespace marian diff --git a/src/translator/batch.h b/src/translator/batch.h index 5f86a2f..8a09c39 100644 --- a/src/translator/batch.h +++ b/src/translator/batch.h @@ -8,7 +8,7 @@ namespace marian { namespace bergamot { class Batch { -public: + public: Batch() {} void clear() { sentences_.clear(); } @@ -41,12 +41,12 @@ public: // Convenience function to log batch-statistics. numTokens, max-length. void log(); -private: + private: bool poison_{false}; RequestSentences sentences_; }; -} // namespace bergamot -} // namespace marian +} // namespace bergamot +} // namespace marian -#endif // SRC_BERGAMOT_BATCH_H_ +#endif // SRC_BERGAMOT_BATCH_H_ diff --git a/src/translator/batch_translator.cpp b/src/translator/batch_translator.cpp index b35c4ce..c27edc1 100644 --- a/src/translator/batch_translator.cpp +++ b/src/translator/batch_translator.cpp @@ -1,56 +1,63 @@ #include "batch_translator.h" + #include "batch.h" +#include "byte_array_util.h" #include "common/logging.h" #include "data/corpus.h" #include "data/text_input.h" #include "translator/beam_search.h" -#include "byte_array_util.h" namespace marian { namespace bergamot { -BatchTranslator::BatchTranslator(DeviceId const device, - Vocabs &vocabs, - Ptr options, - const AlignedMemory* modelMemory, - const AlignedMemory* shortlistMemory) - : device_(device), options_(options), vocabs_(vocabs), - modelMemory_(modelMemory), shortlistMemory_(shortlistMemory) {} +BatchTranslator::BatchTranslator(DeviceId const device, Vocabs &vocabs, Ptr options, + const AlignedMemory *modelMemory, const AlignedMemory *shortlistMemory) + : device_(device), + options_(options), + vocabs_(vocabs), + modelMemory_(modelMemory), + shortlistMemory_(shortlistMemory) {} void BatchTranslator::initialize() { // Initializes the graph. - bool check = options_->get("check-bytearray",false); // Flag holds whether validate the bytearray (model and shortlist) + bool check = + options_->get("check-bytearray", false); // Flag holds whether validate the bytearray (model and shortlist) if (options_->hasAndNotEmpty("shortlist")) { int srcIdx = 0, trgIdx = 1; - bool shared_vcb = vocabs_.sources().front() == vocabs_.target(); // vocabs_->sources().front() is invoked as we currently only support one source vocab + bool shared_vcb = + vocabs_.sources().front() == + vocabs_.target(); // vocabs_->sources().front() is invoked as we currently only support one source vocab if (shortlistMemory_->size() > 0 && shortlistMemory_->begin() != nullptr) { slgen_ = New(shortlistMemory_->begin(), shortlistMemory_->size(), - vocabs_.sources().front(), vocabs_.target(), - srcIdx, trgIdx, shared_vcb, check); - } - else { + vocabs_.sources().front(), vocabs_.target(), srcIdx, trgIdx, + shared_vcb, check); + } else { // Changed to BinaryShortlistGenerator to enable loading binary shortlist file // This class also supports text shortlist file - slgen_ = New(options_, vocabs_.sources().front(), - vocabs_.target(), srcIdx, - trgIdx, shared_vcb); + slgen_ = New(options_, vocabs_.sources().front(), vocabs_.target(), srcIdx, + trgIdx, shared_vcb); } } - graph_ = New(true); // set the graph to be inference only + graph_ = New(true); // set the graph to be inference only auto prec = options_->get>("precision", {"float32"}); graph_->setDefaultElementType(typeFromString(prec[0])); graph_->setDevice(device_); graph_->getBackend()->configureDevice(options_); graph_->reserveWorkspaceMB(options_->get("workspace")); - if (modelMemory_->size() > 0 && modelMemory_->begin() != nullptr) { // If we have provided a byte array that contains the model memory, we can initialise the model from there, as opposed to from reading in the config file + if (modelMemory_->size() > 0 && + modelMemory_->begin() != + nullptr) { // If we have provided a byte array that contains the model memory, we can initialise the model + // from there, as opposed to from reading in the config file ABORT_IF((uintptr_t)modelMemory_->begin() % 256 != 0, "The provided memory is not aligned to 256 bytes and will crash when vector instructions are used on it."); if (check) { ABORT_IF(!validateBinaryModel(*modelMemory_, modelMemory_->size()), "The binary file is invalid. Incomplete or corrupted download?"); } - const std::vector container = {modelMemory_->begin()}; // Marian supports multiple models initialised in this manner hence std::vector. However we will only ever use 1 during decoding. + const std::vector container = { + modelMemory_->begin()}; // Marian supports multiple models initialised in this manner hence std::vector. + // However we will only ever use 1 during decoding. scorers_ = createScorers(options_, container); } else { scorers_ = createScorers(options_); @@ -82,11 +89,9 @@ void BatchTranslator::translate(Batch &batch) { std::vector sentenceIds; std::vector maxDims; for (auto &ex : batchVector) { - if (maxDims.size() < ex.size()) - maxDims.resize(ex.size(), 0); + if (maxDims.size() < ex.size()) maxDims.resize(ex.size(), 0); for (size_t i = 0; i < ex.size(); ++i) { - if (ex[i].size() > (size_t)maxDims[i]) - maxDims[i] = (int)ex[i].size(); + if (ex[i].size() > (size_t)maxDims[i]) maxDims[i] = (int)ex[i].size(); } sentenceIds.push_back(ex.getId()); } @@ -96,8 +101,7 @@ void BatchTranslator::translate(Batch &batch) { std::vector> subBatches; for (size_t j = 0; j < maxDims.size(); ++j) { - subBatches.emplace_back( - New(batchSize, maxDims[j], vocabs_.sources().at(j))); + subBatches.emplace_back(New(batchSize, maxDims[j], vocabs_.sources().at(j))); } std::vector words(maxDims.size(), 0); @@ -111,17 +115,16 @@ void BatchTranslator::translate(Batch &batch) { } } - for (size_t j = 0; j < maxDims.size(); ++j) - subBatches[j]->setWords(words[j]); + for (size_t j = 0; j < maxDims.size(); ++j) subBatches[j]->setWords(words[j]); auto corpus_batch = Ptr(new CorpusBatch(subBatches)); corpus_batch->setSentenceIds(sentenceIds); - + auto search = New(options_, scorers_, vocabs_.target()); auto histories = std::move(search->search(graph_, corpus_batch)); batch.completeBatch(histories); } -} // namespace bergamot -} // namespace marian +} // namespace bergamot +} // namespace marian diff --git a/src/translator/batch_translator.h b/src/translator/batch_translator.h index 048ba77..2d1ddfe 100644 --- a/src/translator/batch_translator.h +++ b/src/translator/batch_translator.h @@ -26,27 +26,28 @@ class BatchTranslator { // mainloop runs until until it receives poison from the PCQueue. Threads are // shut down in Service which calls join() on the threads. -public: + public: /** * Initialise the marian translator. * @param device DeviceId that performs translation. Could be CPU or GPU * @param vocabs Vector that contains ptrs to two vocabs * @param options Marian options object - * @param modelMemory byte array (aligned to 256!!!) that contains the bytes of a model.bin. Provide a nullptr if not used. + * @param modelMemory byte array (aligned to 256!!!) that contains the bytes of a model.bin. Provide a nullptr if not + * used. * @param shortlistMemory byte array of shortlist (aligned to 64) */ - explicit BatchTranslator(DeviceId const device, Vocabs &vocabs, - Ptr options, const AlignedMemory* modelMemory, const AlignedMemory* shortlistMemory); + explicit BatchTranslator(DeviceId const device, Vocabs& vocabs, Ptr options, + const AlignedMemory* modelMemory, const AlignedMemory* shortlistMemory); // convenience function for logging. TODO(jerin) std::string _identifier() { return "worker" + std::to_string(device_.no); } - void translate(Batch &batch); + void translate(Batch& batch); void initialize(); -private: + private: Ptr options_; DeviceId device_; - const Vocabs& vocabs_; + const Vocabs& vocabs_; Ptr graph_; std::vector> scorers_; Ptr slgen_; @@ -54,7 +55,7 @@ private: const AlignedMemory* shortlistMemory_{nullptr}; }; -} // namespace bergamot -} // namespace marian +} // namespace bergamot +} // namespace marian -#endif // SRC_BERGAMOT_BATCH_TRANSLATOR_H_ +#endif // SRC_BERGAMOT_BATCH_TRANSLATOR_H_ diff --git a/src/translator/batcher.cpp b/src/translator/batcher.cpp index 2b9c551..3cf69fd 100644 --- a/src/translator/batcher.cpp +++ b/src/translator/batcher.cpp @@ -1,7 +1,9 @@ #include "batcher.h" + +#include + #include "batch.h" #include "common/logging.h" -#include namespace marian { namespace bergamot { @@ -57,5 +59,5 @@ void Batcher::addWholeRequest(Ptr request) { } } -} // namespace bergamot -} // namespace marian +} // namespace bergamot +} // namespace marian diff --git a/src/translator/batcher.h b/src/translator/batcher.h index e5ac086..c76a16f 100644 --- a/src/translator/batcher.h +++ b/src/translator/batcher.h @@ -17,7 +17,7 @@ namespace marian { namespace bergamot { class Batcher { -public: + public: explicit Batcher(Ptr options); // RequestSentence incorporates (tentative) notions of priority with each @@ -26,9 +26,9 @@ public: void addSentenceWithPriority(RequestSentence &sentence); void addWholeRequest(Ptr request); - bool operator>>(Batch &batch); // alias for cleaveBatch + bool operator>>(Batch &batch); // alias for cleaveBatch -private: + private: // Loads sentences with sentences compiled from (tentatively) multiple // requests optimizing for both padding and priority. bool cleaveBatch(Batch &batch); @@ -37,7 +37,7 @@ private: size_t batchNumber_{0}; }; -} // namespace bergamot -} // namespace marian +} // namespace bergamot +} // namespace marian -#endif // SRC_BERGAMOT_BATCHER_H_ +#endif // SRC_BERGAMOT_BATCHER_H_ diff --git a/src/translator/byte_array_util.cpp b/src/translator/byte_array_util.cpp index 69564d2..247d164 100644 --- a/src/translator/byte_array_util.cpp +++ b/src/translator/byte_array_util.cpp @@ -1,5 +1,7 @@ #include "byte_array_util.h" + #include + #include #include @@ -26,29 +28,30 @@ const T* get(const void*& current, uint64_t num = 1) { current = (const T*)current + num; return ptr; } -} // Anonymous namespace +} // Anonymous namespace bool validateBinaryModel(const AlignedMemory& model, uint64_t fileSize) { - const void * current = model.begin(); - uint64_t memoryNeeded = sizeof(uint64_t)*2; // We keep track of how much memory we would need if we have a complete file + const void* current = model.begin(); + uint64_t memoryNeeded = + sizeof(uint64_t) * 2; // We keep track of how much memory we would need if we have a complete file uint64_t numHeaders; - if (fileSize >= memoryNeeded) { // We have enough filesize to fetch the headers. + if (fileSize >= memoryNeeded) { // We have enough filesize to fetch the headers. uint64_t binaryFileVersion = *get(current); - numHeaders = *get(current); // number of item headers that follow + numHeaders = *get(current); // number of item headers that follow } else { return false; } - memoryNeeded += numHeaders*sizeof(Header); + memoryNeeded += numHeaders * sizeof(Header); const Header* headers; if (fileSize >= memoryNeeded) { - headers = get
(current, numHeaders); // read that many headers + headers = get
(current, numHeaders); // read that many headers } else { return false; } // Calculate how many bytes we are going to for reading just the names and the shape for (uint64_t i = 0; i < numHeaders; i++) { - memoryNeeded += headers[i].nameLength + headers[i].shapeLength*sizeof(int); + memoryNeeded += headers[i].nameLength + headers[i].shapeLength * sizeof(int); // Advance the pointers. get(current, headers[i].nameLength); get(current, headers[i].shapeLength); @@ -58,7 +61,7 @@ bool validateBinaryModel(const AlignedMemory& model, uint64_t fileSize) { // Read that in, before calculating the actual tensor memory requirements. uint64_t aligned_offset; if (fileSize >= memoryNeeded) { - aligned_offset = *get(current); // Offset to align memory to 256 size + aligned_offset = *get(current); // Offset to align memory to 256 size memoryNeeded += aligned_offset + sizeof(uint64_t); } else { return false; @@ -77,17 +80,17 @@ bool validateBinaryModel(const AlignedMemory& model, uint64_t fileSize) { } } -AlignedMemory loadFileToMemory(const std::string& path, size_t alignment){ +AlignedMemory loadFileToMemory(const std::string& path, size_t alignment) { uint64_t fileSize = filesystem::fileSize(path); io::InputFileStream in(path); ABORT_IF(in.bad(), "Failed opening file stream: {}", path); AlignedMemory alignedMemory(fileSize, alignment); - in.read(reinterpret_cast(alignedMemory.begin()), fileSize); + in.read(reinterpret_cast(alignedMemory.begin()), fileSize); ABORT_IF(alignedMemory.size() != fileSize, "Error reading file {}", path); return alignedMemory; } -AlignedMemory getModelMemoryFromConfig(marian::Ptr options){ +AlignedMemory getModelMemoryFromConfig(marian::Ptr options) { auto models = options->get>("models"); ABORT_IF(models.size() != 1, "Loading multiple binary models is not supported for now as it is not necessary."); marian::filesystem::Path modelPath(models[0]); @@ -96,14 +99,14 @@ AlignedMemory getModelMemoryFromConfig(marian::Ptr options){ return alignedMemory; } -AlignedMemory getShortlistMemoryFromConfig(marian::Ptr options){ +AlignedMemory getShortlistMemoryFromConfig(marian::Ptr options) { auto shortlist = options->get>("shortlist"); ABORT_IF(shortlist.empty(), "No path to shortlist file is given."); return loadFileToMemory(shortlist[0], 64); } void getVocabsMemoryFromConfig(marian::Ptr options, - std::vector>& vocabMemories){ + std::vector>& vocabMemories) { auto vfiles = options->get>("vocabs"); ABORT_IF(vfiles.size() < 2, "Insufficient number of vocabularies."); vocabMemories.resize(vfiles.size()); @@ -117,7 +120,7 @@ void getVocabsMemoryFromConfig(marian::Ptr options, } } -MemoryBundle getMemoryBundleFromConfig(marian::Ptr options){ +MemoryBundle getMemoryBundleFromConfig(marian::Ptr options) { MemoryBundle memoryBundle; memoryBundle.model = getModelMemoryFromConfig(options); memoryBundle.shortlist = getShortlistMemoryFromConfig(options); @@ -125,5 +128,5 @@ MemoryBundle getMemoryBundleFromConfig(marian::Ptr options){ return memoryBundle; } -} // namespace bergamot -} // namespace marian +} // namespace bergamot +} // namespace marian diff --git a/src/translator/byte_array_util.h b/src/translator/byte_array_util.h index 14c79b3..63aa3a6 100644 --- a/src/translator/byte_array_util.h +++ b/src/translator/byte_array_util.h @@ -1,5 +1,5 @@ -#include "marian.h" #include "definitions.h" +#include "marian.h" namespace marian { namespace bergamot { @@ -11,5 +11,5 @@ void getVocabsMemoryFromConfig(marian::Ptr options, std::vector>& vocabMemories); bool validateBinaryModel(const AlignedMemory& model, uint64_t fileSize); MemoryBundle getMemoryBundleFromConfig(marian::Ptr options); -} // namespace bergamot -} // namespace marian +} // namespace bergamot +} // namespace marian diff --git a/src/translator/definitions.h b/src/translator/definitions.h index bf1cb57..b1f874c 100644 --- a/src/translator/definitions.h +++ b/src/translator/definitions.h @@ -1,10 +1,11 @@ #ifndef SRC_BERGAMOT_DEFINITIONS_H_ #define SRC_BERGAMOT_DEFINITIONS_H_ +#include + +#include "aligned.h" #include "data/types.h" #include "data/vocab_base.h" -#include "aligned.h" -#include namespace marian { namespace bergamot { @@ -18,7 +19,7 @@ typedef AlignedVector AlignedMemory; /// Memory bundle for all byte-arrays. /// Can be a set/subset of model, shortlist, vocabs and ssplitPrefixFile bytes. struct MemoryBundle { - AlignedMemory model; ///< Byte-array of model (aligned to 256) + AlignedMemory model; ///< Byte-array of model (aligned to 256) AlignedMemory shortlist; ///< Byte-array of shortlist (aligned to 64) /// Vector of vocabulary memories (aligned to 64). @@ -30,8 +31,8 @@ struct MemoryBundle { AlignedMemory ssplitPrefixFile; }; -} // namespace bergamot -} // namespace marian +} // namespace bergamot +} // namespace marian // @TODO at the moment the usage of string_view in this repository is a hot mess and a disaster waiting to happen. // ssplit uses std::string_view if the compiler supports c++17, else falls back to c++11 and absl::string_view @@ -44,10 +45,10 @@ struct MemoryBundle { #if defined(__GNUC__) && __GNUC__ < 6 && !defined(__clang__) #include namespace std { - using string_view = std::experimental::string_view; -} // namespace std +using string_view = std::experimental::string_view; +} // namespace std #else #include #endif -#endif // SRC_BERGAMOT_DEFINITIONS_H_ +#endif // SRC_BERGAMOT_DEFINITIONS_H_ diff --git a/src/translator/parser.h b/src/translator/parser.h index 207890c..b2b0a80 100644 --- a/src/translator/parser.h +++ b/src/translator/parser.h @@ -12,26 +12,21 @@ namespace bergamot { inline marian::ConfigParser createConfigParser() { marian::ConfigParser cp(marian::cli::mode::translation); - cp.addOption( - "--ssplit-prefix-file", "Bergamot Options", - "File with nonbreaking prefixes for sentence splitting."); + cp.addOption("--ssplit-prefix-file", "Bergamot Options", + "File with nonbreaking prefixes for sentence splitting."); - cp.addOption("--ssplit-mode", "Server Options", - "[paragraph, sentence, wrapped_text]", "paragraph"); + cp.addOption("--ssplit-mode", "Server Options", "[paragraph, sentence, wrapped_text]", "paragraph"); - cp.addOption( - "--max-length-break", "Bergamot Options", - "Maximum input tokens to be processed in a single sentence.", 128); + cp.addOption("--max-length-break", "Bergamot Options", + "Maximum input tokens to be processed in a single sentence.", 128); - cp.addOption( - "--check-bytearray", "Bergamot Options", - "Flag holds whether to check the content of the bytearray (true by default)", true); + cp.addOption("--check-bytearray", "Bergamot Options", + "Flag holds whether to check the content of the bytearray (true by default)", true); - return cp; + return cp; } -inline std::shared_ptr -parseOptions(const std::string &config, bool validate = true) { +inline std::shared_ptr parseOptions(const std::string &config, bool validate = true) { marian::Options options; // @TODO(jerinphilip) There's something off here, @XapaJIaMnu suggests @@ -67,7 +62,7 @@ parseOptions(const std::string &config, bool validate = true) { return std::make_shared(options); } -} // namespace bergamot -} // namespace marian +} // namespace bergamot +} // namespace marian -#endif // SRC_BERGAMOT_PARSER_H +#endif // SRC_BERGAMOT_PARSER_H diff --git a/src/translator/request.cpp b/src/translator/request.cpp index 37b164f..9bdae9f 100644 --- a/src/translator/request.cpp +++ b/src/translator/request.cpp @@ -1,23 +1,22 @@ #include "request.h" -#include "definitions.h" -#include "response.h" -#include "annotation.h" - -#include "common/logging.h" #include +#include "annotation.h" +#include "common/logging.h" +#include "definitions.h" +#include "response.h" + namespace marian { namespace bergamot { // ----------------------------------------------------------------- -Request::Request(size_t Id, Segments &&segments, - ResponseBuilder &&responseBuilder) - : Id_(Id), segments_(std::move(segments)), +Request::Request(size_t Id, Segments &&segments, ResponseBuilder &&responseBuilder) + : Id_(Id), + segments_(std::move(segments)), responseBuilder_(std::move(responseBuilder)) { - counter_ = segments_.size(); histories_.resize(segments_.size(), nullptr); @@ -31,9 +30,7 @@ Request::Request(size_t Id, Segments &&segments, size_t Request::numSegments() const { return segments_.size(); } -size_t Request::segmentTokens(size_t index) const { - return (segments_[index].size()); -} +size_t Request::segmentTokens(size_t index) const { return (segments_[index].size()); } Segment Request::getSegment(size_t index) const { return segments_[index]; } @@ -56,12 +53,9 @@ bool Request::operator<(const Request &b) const { // ------------------------------------------------------------------ -RequestSentence::RequestSentence(size_t index, Ptr request) - : index_(index), request_(request) {} +RequestSentence::RequestSentence(size_t index, Ptr request) : index_(index), request_(request) {} -size_t RequestSentence::numTokens() const { - return (request_->segmentTokens(index_)); -} +size_t RequestSentence::numTokens() const { return (request_->segmentTokens(index_)); } void RequestSentence::completeSentence(Ptr history) { // Relays completeSentence into request's processHistory, using index @@ -69,9 +63,7 @@ void RequestSentence::completeSentence(Ptr history) { request_->processHistory(index_, history); } -Segment RequestSentence::getUnderlyingSegment() const { - return request_->getSegment(index_); -} +Segment RequestSentence::getUnderlyingSegment() const { return request_->getSegment(index_); } bool operator<(const RequestSentence &a, const RequestSentence &b) { // Operator overload for usage in priority-queue / set. @@ -83,5 +75,5 @@ bool operator<(const RequestSentence &a, const RequestSentence &b) { // ---------------------------------------------------------------------- -} // namespace bergamot -} // namespace marian +} // namespace bergamot +} // namespace marian diff --git a/src/translator/request.h b/src/translator/request.h index 7983680..a2ea1af 100644 --- a/src/translator/request.h +++ b/src/translator/request.h @@ -1,20 +1,18 @@ #ifndef SRC_BERGAMOT_REQUEST_H_ #define SRC_BERGAMOT_REQUEST_H_ +#include +#include +#include + +#include "annotation.h" +#include "common/logging.h" +#include "data/types.h" #include "definitions.h" #include "response.h" #include "response_builder.h" -#include "annotation.h" - -#include "common/logging.h" -#include "data/types.h" #include "translator/beam_search.h" -#include - -#include -#include - namespace marian { namespace bergamot { @@ -37,7 +35,7 @@ namespace bergamot { /// corresponding to the Request and set value of the promise which triggers the /// future at client. class Request { -public: + public: /// Constructs an internal representation of the Request identified by Id, /// processed Segments and accepts a callback (ResponseBuilder) which builds /// the Response upon completion of the Request. @@ -69,7 +67,7 @@ public: /// compiled from requests. void processHistory(size_t index, Ptr history); -private: + private: size_t Id_; /// Multiple translation-workers can concurrently access the same Request. The @@ -95,8 +93,7 @@ private: /// within Request, while batching mechanism (Batcher) compiles Batch from /// RequestSentence-s coming from different Requests. class RequestSentence { - -public: + public: RequestSentence(size_t, Ptr); /// Number of tokens in the segment this RequestSentence represents. Used to @@ -112,14 +109,14 @@ public: friend bool operator<(const RequestSentence &a, const RequestSentence &b); -private: + private: size_t index_; Ptr request_; }; typedef std::vector RequestSentences; -} // namespace bergamot -} // namespace marian +} // namespace bergamot +} // namespace marian -#endif // SRC_BERGAMOT_REQUEST_H_ +#endif // SRC_BERGAMOT_REQUEST_H_ diff --git a/src/translator/response.h b/src/translator/response.h index 5d4ea78..5c92441 100644 --- a/src/translator/response.h +++ b/src/translator/response.h @@ -1,16 +1,16 @@ #ifndef SRC_BERGAMOT_RESPONSE_H_ #define SRC_BERGAMOT_RESPONSE_H_ -#include "data/alignment.h" -#include "data/types.h" -#include "definitions.h" -#include "annotation.h" -#include "translator/beam_search.h" - #include #include #include +#include "annotation.h" +#include "data/alignment.h" +#include "data/types.h" +#include "definitions.h" +#include "translator/beam_search.h" + namespace marian { namespace bergamot { @@ -18,9 +18,9 @@ namespace bergamot { /// internals but is brought here to maintain translator /// agnosticism/independence. struct Point { - size_t src; ///< Index pointing to source ByteRange - size_t tgt; ///< Index pointing to target ByteRange - float prob; ///< Score between [0, 1] on indicating degree of alignment. + size_t src; ///< Index pointing to source ByteRange + size_t tgt; ///< Index pointing to target ByteRange + float prob; ///< Score between [0, 1] on indicating degree of alignment. }; /// Alignment is a sparse matrix, where Points represent entries with values. @@ -69,7 +69,7 @@ struct Response { const std::string &getTranslatedText() const { return target.text; } }; -} // namespace bergamot -} // namespace marian +} // namespace bergamot +} // namespace marian -#endif // SRC_BERGAMOT_RESPONSE_H_ +#endif // SRC_BERGAMOT_RESPONSE_H_ diff --git a/src/translator/response_builder.cpp b/src/translator/response_builder.cpp index b2f561b..6d60b6c 100644 --- a/src/translator/response_builder.cpp +++ b/src/translator/response_builder.cpp @@ -1,17 +1,17 @@ #include "response_builder.h" + #include "response_options.h" namespace marian { namespace bergamot { -void ResponseBuilder::buildQualityScores(Histories &histories, - Response &response) { +void ResponseBuilder::buildQualityScores(Histories &histories, Response &response) { std::vector qualityScores; for (auto &history : histories) { // TODO(jerin): Change hardcode of nBest = 1 NBestList onebest = history->nBest(1); - Result result = onebest[0]; // Expecting only one result; + Result result = onebest[0]; // Expecting only one result; Words words = std::get<0>(result); auto hyp = std::get<1>(result); // Quality scores: Sequence level is obtained as normalized path scores. @@ -20,18 +20,16 @@ void ResponseBuilder::buildQualityScores(Histories &histories, auto normalizedPathScore = std::get<2>(result); auto wordQualities = hyp->tracebackWordScores(); wordQualities.pop_back(); - response.qualityScores.push_back( - Quality{normalizedPathScore, wordQualities}); + response.qualityScores.push_back(Quality{normalizedPathScore, wordQualities}); } } -void ResponseBuilder::buildAlignments(Histories &histories, - Response &response) { +void ResponseBuilder::buildAlignments(Histories &histories, Response &response) { for (auto &history : histories) { // TODO(jerin): Change hardcode of nBest = 1 NBestList onebest = history->nBest(1); - Result result = onebest[0]; // Expecting only one result; + Result result = onebest[0]; // Expecting only one result; Words words = std::get<0>(result); // Alignments // TODO(jerinphilip): The following double conversion might not be @@ -40,8 +38,7 @@ void ResponseBuilder::buildAlignments(Histories &histories, auto hyp = std::get<1>(result); auto softAlignment = hyp->tracebackAlignment(); auto threshold = responseOptions_.alignmentThreshold; - auto hardAlignment = - data::ConvertSoftAlignToHardAlign(softAlignment, threshold); + auto hardAlignment = data::ConvertSoftAlignToHardAlign(softAlignment, threshold); Alignment unified_alignment; for (auto &p : hardAlignment) { unified_alignment.emplace_back(Point{p.srcPos, p.tgtPos, p.prob}); @@ -51,8 +48,7 @@ void ResponseBuilder::buildAlignments(Histories &histories, } } -void ResponseBuilder::buildTranslatedText(Histories &histories, - Response &response) { +void ResponseBuilder::buildTranslatedText(Histories &histories, Response &response) { // Reserving length at least as much as source_ seems like a reasonable // thing to do to avoid reallocations. response.target.text.reserve(response.source.text.size()); @@ -63,7 +59,7 @@ void ResponseBuilder::buildTranslatedText(Histories &histories, auto &history = histories[sentenceIdx]; NBestList onebest = history->nBest(1); - Result result = onebest[0]; // Expecting only one result; + Result result = onebest[0]; // Expecting only one result; Words words = std::get<0>(result); std::string decoded; @@ -71,31 +67,31 @@ void ResponseBuilder::buildTranslatedText(Histories &histories, vocabs_.target()->decodeWithByteRanges(words, decoded, targetSentenceMappings); switch (responseOptions_.concatStrategy) { - case ConcatStrategy::FAITHFUL: { - // For each sentence, prepend the filler text between the corresponding - // source-sentence and the source-sentence before. - string_view pre = response.source.gap(sentenceIdx); - response.target.appendSentence(pre, targetSentenceMappings.begin(), targetSentenceMappings.end()); + case ConcatStrategy::FAITHFUL: { + // For each sentence, prepend the filler text between the corresponding + // source-sentence and the source-sentence before. + string_view pre = response.source.gap(sentenceIdx); + response.target.appendSentence(pre, targetSentenceMappings.begin(), targetSentenceMappings.end()); - // If this is the last history to be decoded and translated-text - // constructed, append the text till the end, which could be spaces or - // empty. - if (sentenceIdx + 1 == histories.size()) { - response.target.appendEndingWhitespace(response.source.gap(sentenceIdx + 1)); + // If this is the last history to be decoded and translated-text + // constructed, append the text till the end, which could be spaces or + // empty. + if (sentenceIdx + 1 == histories.size()) { + response.target.appendEndingWhitespace(response.source.gap(sentenceIdx + 1)); + } + break; + } + case ConcatStrategy::SPACE: { + string_view delimiter = (sentenceIdx == 0) ? "" : " "; + response.target.appendSentence(delimiter, targetSentenceMappings.begin(), targetSentenceMappings.end()); + break; } - break; - } - case ConcatStrategy::SPACE: { - string_view delimiter = (sentenceIdx == 0) ? "" : " "; - response.target.appendSentence(delimiter, targetSentenceMappings.begin(), targetSentenceMappings.end()); - break; - } - default: - ABORT("Unknown concat-strategy"); + default: + ABORT("Unknown concat-strategy"); } } } -} // namespace bergamot -} // namespace marian +} // namespace bergamot +} // namespace marian diff --git a/src/translator/response_builder.h b/src/translator/response_builder.h index b8a8dd4..84ea09a 100644 --- a/src/translator/response_builder.h +++ b/src/translator/response_builder.h @@ -19,16 +19,14 @@ namespace bergamot { /// paragraph). class ResponseBuilder { -public: + public: /// @param [in] responseOptions: ResponseOptions, indicating what to include /// or not in the response and any additional configurable parameters. /// @param [in] vocabs: marian vocab object (used in decoding) /// @param [in] promise: promise to set with the constructed Response. - ResponseBuilder(ResponseOptions responseOptions, AnnotatedText &&source, - Vocabs &vocabs, + ResponseBuilder(ResponseOptions responseOptions, AnnotatedText &&source, Vocabs &vocabs, std::promise &&promise) - : responseOptions_(responseOptions), source_(std::move(source)), - vocabs_(vocabs), promise_(std::move(promise)) {} + : responseOptions_(responseOptions), source_(std::move(source)), vocabs_(vocabs), promise_(std::move(promise)) {} /// Constructs and sets the promise of a Response object from obtained /// histories after translating. @@ -38,8 +36,7 @@ public: // TODO(jerinphilip) load ResponseOptions into options and turn build // functions on or off. // responseOptions_ is unused, but we can try something here. - ABORT_IF(source_.numSentences() != histories.size(), - "Mismatch in source and translated sentences"); + ABORT_IF(source_.numSentences() != histories.size(), "Mismatch in source and translated sentences"); Response response; // Move source_ into response. @@ -61,7 +58,7 @@ public: promise_.set_value(std::move(response)); } -private: + private: /// Builds qualityScores from histories and writes to response. expects /// buildTranslatedText to be run before to be able to obtain target text and /// subword information. @@ -82,13 +79,13 @@ private: // Data members are context/curried args for the functor. ResponseOptions responseOptions_; - const Vocabs& vocabs_; // vocabs are required for decoding - // and any source validation checks. - std::promise promise_; // To be set when callback triggered and - // after Response constructed. + const Vocabs &vocabs_; // vocabs are required for decoding + // and any source validation checks. + std::promise promise_; // To be set when callback triggered and + // after Response constructed. AnnotatedText source_; }; -} // namespace bergamot -} // namespace marian +} // namespace bergamot +} // namespace marian -#endif // SRC_BERGAMOT_RESPONSE_BUILDER_H_ +#endif // SRC_BERGAMOT_RESPONSE_BUILDER_H_ diff --git a/src/translator/response_options.h b/src/translator/response_options.h index ed3cce3..b74f578 100644 --- a/src/translator/response_options.h +++ b/src/translator/response_options.h @@ -26,8 +26,8 @@ enum QualityScoreType { /// ResponseOptions dictate how to construct a Response for an input string of /// text to be translated. struct ResponseOptions { - bool qualityScores{false}; ///< Include quality-scores or not. - bool alignment{false}; ///< Include alignments or not. + bool qualityScores{false}; ///< Include quality-scores or not. + bool alignment{false}; ///< Include alignments or not. /// Whether to include sentenceMappings or not. Alignments require /// sentenceMappings and are available irrespective of this option if @@ -44,7 +44,7 @@ struct ResponseOptions { ConcatStrategy concatStrategy{ConcatStrategy::FAITHFUL}; }; -} // namespace bergamot -} // namespace marian +} // namespace bergamot +} // namespace marian -#endif // SRC_BERGAMOT_RESPONSE_OPTIONS_H_ +#endif // SRC_BERGAMOT_RESPONSE_OPTIONS_H_ diff --git a/src/translator/sentence_splitter.cpp b/src/translator/sentence_splitter.cpp index 0370125..d0ada81 100644 --- a/src/translator/sentence_splitter.cpp +++ b/src/translator/sentence_splitter.cpp @@ -1,53 +1,47 @@ #include "sentence_splitter.h" + +#include + #include "common/cli_helper.h" #include "common/logging.h" #include "common/options.h" -#include namespace marian { namespace bergamot { -SentenceSplitter::SentenceSplitter(marian::Ptr options) - : options_(options) { - +SentenceSplitter::SentenceSplitter(marian::Ptr options) : options_(options) { std::string smode_str = options_->get("ssplit-mode", ""); mode_ = string2splitmode(smode_str); - std::string ssplit_prefix_file = - options_->get("ssplit-prefix-file", ""); + std::string ssplit_prefix_file = options_->get("ssplit-prefix-file", ""); if (ssplit_prefix_file.size()) { ssplit_prefix_file = marian::cli::interpolateEnvVars(ssplit_prefix_file); - LOG(info, "Loading protected prefixes for sentence splitting from {}", - ssplit_prefix_file); + LOG(info, "Loading protected prefixes for sentence splitting from {}", ssplit_prefix_file); ssplit_.load(ssplit_prefix_file); } else { - LOG(warn, "Missing list of protected prefixes for sentence splitting. " - "Set with --ssplit-prefix-file."); + LOG(warn, + "Missing list of protected prefixes for sentence splitting. " + "Set with --ssplit-prefix-file."); } } -ug::ssplit::SentenceStream -SentenceSplitter::createSentenceStream(const string_view &input) { +ug::ssplit::SentenceStream SentenceSplitter::createSentenceStream(const string_view &input) { std::string_view input_converted(input.data(), input.size()); - return std::move( - ug::ssplit::SentenceStream(input_converted, this->ssplit_, mode_)); + return std::move(ug::ssplit::SentenceStream(input_converted, this->ssplit_, mode_)); } -ug::ssplit::SentenceStream::splitmode -SentenceSplitter::string2splitmode(const std::string &m) { +ug::ssplit::SentenceStream::splitmode SentenceSplitter::string2splitmode(const std::string &m) { typedef ug::ssplit::SentenceStream::splitmode splitmode; // @TODO: throw Exception on error - if (m == "sentence" || m == "Sentence") - return splitmode::one_sentence_per_line; - if (m == "paragraph" || m == "Paragraph") - return splitmode::one_paragraph_per_line; + if (m == "sentence" || m == "Sentence") return splitmode::one_sentence_per_line; + if (m == "paragraph" || m == "Paragraph") return splitmode::one_paragraph_per_line; if (m != "wrapped_text" && m != "WrappedText" && m != "wrappedText") { LOG(warn, "Ignoring unknown text input format specification: {}.", m); } return splitmode::wrapped_text; } -} // namespace bergamot -} // namespace marian +} // namespace bergamot +} // namespace marian diff --git a/src/translator/sentence_splitter.h b/src/translator/sentence_splitter.h index 1c4742e..9c58165 100644 --- a/src/translator/sentence_splitter.h +++ b/src/translator/sentence_splitter.h @@ -1,11 +1,12 @@ #ifndef SRC_BERGAMOT_SENTENCE_SPLITTER_H_ #define SRC_BERGAMOT_SENTENCE_SPLITTER_H_ +#include + #include "common/options.h" #include "data/types.h" -#include "ssplit.h" #include "definitions.h" -#include +#include "ssplit.h" namespace marian { namespace bergamot { @@ -15,18 +16,18 @@ class SentenceSplitter { // mts. Constructed based on options. Used in TextProcessor below to create // sentence-streams, which provide access to one sentence from blob of text at // a time. -public: + public: explicit SentenceSplitter(Ptr options); ug::ssplit::SentenceStream createSentenceStream(string_view const &input); -private: + private: ug::ssplit::SentenceSplitter ssplit_; Ptr options_; ug::ssplit::SentenceStream::splitmode mode_; ug::ssplit::SentenceStream::splitmode string2splitmode(const std::string &m); }; -} // namespace bergamot -} // namespace marian +} // namespace bergamot +} // namespace marian -#endif // SRC_BERGAMOT_SENTENCE_SPLITTER_H_ +#endif // SRC_BERGAMOT_SENTENCE_SPLITTER_H_ diff --git a/src/translator/service.cpp b/src/translator/service.cpp index 265695a..9f3fa21 100644 --- a/src/translator/service.cpp +++ b/src/translator/service.cpp @@ -1,17 +1,20 @@ #include "service.h" -#include "batch.h" -#include "definitions.h" #include #include +#include "batch.h" +#include "definitions.h" + namespace marian { namespace bergamot { Service::Service(Ptr options, MemoryBundle memoryBundle) - : requestId_(0), options_(options), + : requestId_(0), + options_(options), vocabs_(options, std::move(memoryBundle.vocabs)), - text_processor_(vocabs_, options), batcher_(options), + text_processor_(vocabs_, options), + batcher_(options), numWorkers_(options->get("cpu-threads")), modelMemory_(std::move(memoryBundle.model)), shortlistMemory_(std::move(memoryBundle.shortlist)) @@ -41,9 +44,7 @@ void Service::build_translators(Ptr options, size_t numTranslators) { } } -void Service::initialize_blocking_translator() { - translators_.back().initialize(); -} +void Service::initialize_blocking_translator() { translators_.back().initialize(); } void Service::blocking_translate() { Batch batch; @@ -83,31 +84,23 @@ void Service::async_translate() { pcqueue_.ProduceSwap(batch); } } -#else // WASM_COMPATIBLE_SOURCE -void Service::initialize_async_translators() { - ABORT("Cannot run in async mode without multithreading."); -} +#else // WASM_COMPATIBLE_SOURCE +void Service::initialize_async_translators() { ABORT("Cannot run in async mode without multithreading."); } -void Service::async_translate() { - ABORT("Cannot run in async mode without multithreading."); -} -#endif // WASM_COMPATIBLE_SOURCE +void Service::async_translate() { ABORT("Cannot run in async mode without multithreading."); } +#endif // WASM_COMPATIBLE_SOURCE std::future Service::translate(std::string &&input) { ResponseOptions responseOptions; // Hardcode responseOptions for now return translate(std::move(input), responseOptions); } -std::vector -Service::translateMultiple(std::vector &&inputs, - ResponseOptions responseOptions) { - +std::vector Service::translateMultiple(std::vector &&inputs, ResponseOptions responseOptions) { // We queue the individual Requests so they get compiled at batches to be // efficiently translated. std::vector> responseFutures; for (auto &input : inputs) { - std::future inputResponse = - queueRequest(std::move(input), responseOptions); + std::future inputResponse = queueRequest(std::move(input), responseOptions); responseFutures.push_back(std::move(inputResponse)); } @@ -126,8 +119,7 @@ Service::translateMultiple(std::vector &&inputs, return responses; } -std::future Service::queueRequest(std::string &&input, - ResponseOptions responseOptions) { +std::future Service::queueRequest(std::string &&input, ResponseOptions responseOptions) { Segments segments; AnnotatedText source(std::move(input)); text_processor_.process(source, segments); @@ -135,19 +127,15 @@ std::future Service::queueRequest(std::string &&input, std::promise responsePromise; auto future = responsePromise.get_future(); - ResponseBuilder responseBuilder(responseOptions, std::move(source), vocabs_, - std::move(responsePromise)); - Ptr request = New(requestId_++, std::move(segments), - std::move(responseBuilder)); + ResponseBuilder responseBuilder(responseOptions, std::move(source), vocabs_, std::move(responsePromise)); + Ptr request = New(requestId_++, std::move(segments), std::move(responseBuilder)); batcher_.addWholeRequest(request); return future; } -std::future Service::translate(std::string &&input, - ResponseOptions responseOptions) { - std::future future = - queueRequest(std::move(input), responseOptions); +std::future Service::translate(std::string &&input, ResponseOptions responseOptions) { + std::future future = queueRequest(std::move(input), responseOptions); dispatchTranslate(); return future; } @@ -163,7 +151,6 @@ void Service::dispatchTranslate() { Service::~Service() { #ifndef WASM_COMPATIBLE_SOURCE for (size_t workerId = 0; workerId < numWorkers_; workerId++) { - Batch poison = Batch::poison(); pcqueue_.ProduceSwap(poison); } @@ -176,5 +163,5 @@ Service::~Service() { #endif } -} // namespace bergamot -} // namespace marian +} // namespace bergamot +} // namespace marian diff --git a/src/translator/service.h b/src/translator/service.h index a678d53..1af34af 100644 --- a/src/translator/service.h +++ b/src/translator/service.h @@ -60,15 +60,14 @@ namespace bergamot { /// file supplied through config). /// class Service { - -public: + public: /// Construct Service from Marian options. If memoryBundle is empty, Service is /// initialized from file-based loading. Otherwise, Service is initialized from /// the given bytearray memories. /// @param options Marian options object /// @param memoryBundle holds all byte-array memories. Can be a set/subset of /// model, shortlist, vocabs and ssplitPrefixFile bytes. Optional. - explicit Service(Ptr options, MemoryBundle memoryBundle={}); + explicit Service(Ptr options, MemoryBundle memoryBundle = {}); /// Construct Service from a string configuration. If memoryBundle is empty, Service is /// initialized from file-based loading. Otherwise, Service is initialized from @@ -76,7 +75,7 @@ public: /// @param [in] config string parsable as YAML expected to adhere with marian config /// @param [in] memoryBundle holds all byte-array memories. Can be a set/subset of /// model, shortlist, vocabs and ssplitPrefixFile bytes. Optional. - explicit Service(const std::string &config, MemoryBundle memoryBundle={}) + explicit Service(const std::string &config, MemoryBundle memoryBundle = {}) : Service(parseOptions(config, /*validate=*/false), std::move(memoryBundle)) {} /// Explicit destructor to clean up after any threads initialized in @@ -97,8 +96,7 @@ public: /// @param [in] responseOptions: Options indicating whether or not to include /// some member in the Response, also specify any additional configurable /// parameters. - std::future translate(std::string &&source, - ResponseOptions options); + std::future translate(std::string &&source, ResponseOptions options); /// Translate multiple text-blobs in a single *blocking* API call, providing /// ResponseOptions which applies across all text-blobs dictating how to @@ -117,19 +115,14 @@ public: /// to include some member in the Response, also specify any additional /// configurable parameters. - std::vector - translateMultiple(std::vector &&source, - ResponseOptions responseOptions); + std::vector translateMultiple(std::vector &&source, ResponseOptions responseOptions); /// Returns if model is alignment capable or not. - bool isAlignmentSupported() const { - return options_->hasAndNotEmpty("alignment"); - } + bool isAlignmentSupported() const { return options_->hasAndNotEmpty("alignment"); } -private: + private: /// Queue an input for translation. - std::future queueRequest(std::string &&input, - ResponseOptions responseOptions); + std::future queueRequest(std::string &&input, ResponseOptions responseOptions); /// Dispatch call to translate after inserting in queue void dispatchTranslate(); @@ -151,32 +144,31 @@ private: void async_translate(); /// Number of workers to launch. - size_t numWorkers_; // ORDER DEPENDENCY (pcqueue_) + size_t numWorkers_; // ORDER DEPENDENCY (pcqueue_) /// Options object holding the options Service was instantiated with. Ptr options_; /// Model memory to load model passed as bytes. - AlignedMemory modelMemory_; // ORDER DEPENDENCY (translators_) + AlignedMemory modelMemory_; // ORDER DEPENDENCY (translators_) /// Shortlist memory passed as bytes. - AlignedMemory shortlistMemory_; // ORDER DEPENDENCY (translators_) + AlignedMemory shortlistMemory_; // ORDER DEPENDENCY (translators_) /// Holds instances of batch translators, just one in case /// of single-threaded application, numWorkers_ in case of multithreaded /// setting. - std::vector - translators_; // ORDER DEPENDENCY (modelMemory_, shortlistMemory_) + std::vector translators_; // ORDER DEPENDENCY (modelMemory_, shortlistMemory_) /// Stores requestId of active request. Used to establish /// ordering among requests and logging/book-keeping. size_t requestId_; /// Store vocabs representing source and target. - Vocabs vocabs_; // ORDER DEPENDENCY (text_processor_) + Vocabs vocabs_; // ORDER DEPENDENCY (text_processor_) /// TextProcesser takes a blob of text and converts into format consumable by /// the batch-translator and annotates sentences and words. - TextProcessor text_processor_; // ORDER DEPENDENCY (vocabs_) + TextProcessor text_processor_; // ORDER DEPENDENCY (vocabs_) /// Batcher handles generation of batches from a request, subject to /// packing-efficiency and priority optimization heuristics. @@ -185,12 +177,12 @@ private: // The following constructs are available providing full capabilities on a non // WASM platform, where one does not have to hide threads. #ifndef WASM_COMPATIBLE_SOURCE - PCQueue pcqueue_; // ORDER DEPENDENCY (numWorkers_) + PCQueue pcqueue_; // ORDER DEPENDENCY (numWorkers_) std::vector workers_; -#endif // WASM_COMPATIBLE_SOURCE +#endif // WASM_COMPATIBLE_SOURCE }; -} // namespace bergamot -} // namespace marian +} // namespace bergamot +} // namespace marian -#endif // SRC_BERGAMOT_SERVICE_H_ +#endif // SRC_BERGAMOT_SERVICE_H_ diff --git a/src/translator/text_processor.cpp b/src/translator/text_processor.cpp index bca5fd1..234c484 100644 --- a/src/translator/text_processor.cpp +++ b/src/translator/text_processor.cpp @@ -1,39 +1,33 @@ #include "text_processor.h" + +#include + +#include "annotation.h" +#include "common/options.h" #include "data/types.h" #include "definitions.h" -#include "annotation.h" - -#include "common/options.h" -#include namespace marian { namespace bergamot { -Segment TextProcessor::tokenize(const string_view &segment, - std::vector &wordRanges) { +Segment TextProcessor::tokenize(const string_view &segment, std::vector &wordRanges) { // vocabs_->sources().front() is invoked as we currently only support one source vocab - return vocabs_.sources().front()->encodeWithByteRanges( - segment, wordRanges, /*addEOS=*/false, /*inference=*/true); + return vocabs_.sources().front()->encodeWithByteRanges(segment, wordRanges, /*addEOS=*/false, /*inference=*/true); } -TextProcessor::TextProcessor(Vocabs &vocabs, - Ptr options) - : vocabs_(vocabs), sentence_splitter_(options) { - +TextProcessor::TextProcessor(Vocabs &vocabs, Ptr options) : vocabs_(vocabs), sentence_splitter_(options) { max_length_break_ = options->get("max-length-break"); max_length_break_ = max_length_break_ - 1; ABORT_IF(max_length_break_ < 0, "max-length-break cannot be < 0"); } void TextProcessor::process(AnnotatedText &source, Segments &segments) { - string_view query = string_view(source.text); auto sentenceStream = sentence_splitter_.createSentenceStream(query); std::string_view sentenceStringPiece; while (sentenceStream >> sentenceStringPiece) { - marian::string_view sentence(sentenceStringPiece.data(), - sentenceStringPiece.size()); + marian::string_view sentence(sentenceStringPiece.data(), sentenceStringPiece.size()); std::vector wordRanges; Segment segment = tokenize(sentence, wordRanges); @@ -48,11 +42,9 @@ void TextProcessor::process(AnnotatedText &source, Segments &segments) { } } -void TextProcessor::wrap(Segment &segment, - std::vector &wordRanges, - Segments &segments, AnnotatedText &source) { - for (size_t offset = 0; offset < segment.size(); - offset += max_length_break_) { +void TextProcessor::wrap(Segment &segment, std::vector &wordRanges, Segments &segments, + AnnotatedText &source) { + for (size_t offset = 0; offset < segment.size(); offset += max_length_break_) { auto start = segment.begin() + offset; size_t left = segment.size() - offset; @@ -67,5 +59,5 @@ void TextProcessor::wrap(Segment &segment, } } -} // namespace bergamot -} // namespace marian +} // namespace bergamot +} // namespace marian diff --git a/src/translator/text_processor.h b/src/translator/text_processor.h index 7328877..be37c3d 100644 --- a/src/translator/text_processor.h +++ b/src/translator/text_processor.h @@ -1,16 +1,15 @@ #ifndef SRC_BERGAMOT_TEXT_PROCESSOR_H_ #define SRC_BERGAMOT_TEXT_PROCESSOR_H_ +#include + +#include "annotation.h" #include "data/types.h" #include "data/vocab.h" #include "definitions.h" -#include "annotation.h" - #include "sentence_splitter.h" #include "vocabs.h" -#include - namespace marian { namespace bergamot { @@ -21,31 +20,29 @@ class TextProcessor { // Used in Service to convert an incoming blog of text to a vector of // sentences (vector of words). In addition, the ByteRanges of the // source-tokens in unnormalized text are provided as string_views. -public: + public: explicit TextProcessor(Vocabs &vocabs, Ptr); void process(AnnotatedText &source, Segments &segments); -private: + private: // Tokenizes an input string, returns Words corresponding. Loads the // corresponding byte-ranges into tokenRanges. - Segment tokenize(const string_view &input, - std::vector &tokenRanges); + Segment tokenize(const string_view &input, std::vector &tokenRanges); // Wrap into sentences of at most max_length_break_ tokens and add to source. - void wrap(Segment &sentence, std::vector &tokenRanges, - Segments &segments, AnnotatedText &source); + void wrap(Segment &sentence, std::vector &tokenRanges, Segments &segments, AnnotatedText &source); // shorthand, used only in truncate() // vocabs_->sources().front() is invoked as we currently only support one source vocab const Word sourceEosId() const { return vocabs_.sources().front()->getEosId(); } - const Vocabs& vocabs_; + const Vocabs &vocabs_; SentenceSplitter sentence_splitter_; size_t max_length_break_; }; -} // namespace bergamot -} // namespace marian +} // namespace bergamot +} // namespace marian -#endif // SRC_BERGAMOT_TEXT_PROCESSOR_H_ +#endif // SRC_BERGAMOT_TEXT_PROCESSOR_H_ diff --git a/src/translator/vocabs.h b/src/translator/vocabs.h index 89aed4b..bc5ef14 100644 --- a/src/translator/vocabs.h +++ b/src/translator/vocabs.h @@ -6,14 +6,13 @@ namespace bergamot { /// Wrapper of Marian Vocab objects needed for translator. /// Holds multiple source vocabularies and one target vocabulary class Vocabs { -public: + public: /// Construct vocabs object from either byte-arrays or files - Vocabs(Ptr options, std::vector>&& vocabMemories): options_(options){ - if (!vocabMemories.empty()){ + Vocabs(Ptr options, std::vector>&& vocabMemories) : options_(options) { + if (!vocabMemories.empty()) { // load vocabs from buffer load(std::move(vocabMemories)); - } - else{ + } else { // load vocabs from file auto vocabPaths = options->get>("vocabs"); load(vocabPaths); @@ -21,16 +20,12 @@ public: } /// Get all source vocabularies (as a vector) - const std::vector>& sources() const { - return srcVocabs_; - } + const std::vector>& sources() const { return srcVocabs_; } /// Get the target vocabulary - const Ptr& target() const { - return trgVocab_; - } + const Ptr& target() const { return trgVocab_; } -private: + private: std::vector> srcVocabs_; // source vocabularies Ptr trgVocab_; // target vocabulary Ptr options_; @@ -46,7 +41,7 @@ private: std::unordered_map> vmap; for (size_t i = 0; i < srcVocabs_.size(); i++) { auto m = vmap.emplace(std::make_pair(reinterpret_cast(vocabMemories[i].get()), Ptr())); - if (m.second) { // new: load the vocab + if (m.second) { // new: load the vocab m.first->second = New(options_, i); m.first->second->loadFromSerialized(absl::string_view(vocabMemories[i]->begin(), vocabMemories[i]->size())); } @@ -58,14 +53,14 @@ private: } // load from file - void load(const std::vector& vocabPaths){ + void load(const std::vector& vocabPaths) { // with the current setup, we need at least two vocabs: src and trg ABORT_IF(vocabPaths.size() < 2, "Insufficient number of vocabularies."); srcVocabs_.resize(vocabPaths.size()); std::unordered_map> vmap; for (size_t i = 0; i < srcVocabs_.size(); ++i) { auto m = vmap.emplace(std::make_pair(vocabPaths[i], Ptr())); - if (m.second) { // new: load the vocab + if (m.second) { // new: load the vocab m.first->second = New(options_, i); m.first->second->load(vocabPaths[i]); } @@ -77,5 +72,5 @@ private: } }; -} // namespace bergamot -} // namespace marian +} // namespace bergamot +} // namespace marian diff --git a/wasm/bindings/TranslationModelBindings.cpp b/wasm/bindings/TranslationModelBindings.cpp index 1db7401..64203a1 100644 --- a/wasm/bindings/TranslationModelBindings.cpp +++ b/wasm/bindings/TranslationModelBindings.cpp @@ -21,12 +21,11 @@ val getByteArrayView(AlignedMemory& alignedMemory) { EMSCRIPTEN_BINDINGS(aligned_memory) { class_("AlignedMemory") - .constructor() - .function("size", &AlignedMemory::size) - .function("getByteArrayView", &getByteArrayView) - ; + .constructor() + .function("size", &AlignedMemory::size) + .function("getByteArrayView", &getByteArrayView); - register_vector("AlignedMemoryList"); + register_vector("AlignedMemoryList"); } // When source and target vocab files are same, only one memory object is passed from JS to @@ -41,16 +40,14 @@ std::vector> prepareVocabsSmartMemories(std::vect if (vocabsMemories.size() == 2) { auto targetVocabMemory = std::make_shared(std::move(*(vocabsMemories[1]))); vocabsSmartMemories.push_back(std::move(targetVocabMemory)); - } - else { + } else { vocabsSmartMemories.push_back(sourceVocabMemory); } return vocabsSmartMemories; } -marian::bergamot::MemoryBundle prepareMemoryBundle(AlignedMemory* modelMemory, - AlignedMemory* shortlistMemory, - std::vector uniqueVocabsMemories){ +marian::bergamot::MemoryBundle prepareMemoryBundle(AlignedMemory* modelMemory, AlignedMemory* shortlistMemory, + std::vector uniqueVocabsMemories) { marian::bergamot::MemoryBundle memoryBundle; memoryBundle.model = std::move(*modelMemory); memoryBundle.shortlist = std::move(*shortlistMemory); @@ -59,19 +56,18 @@ marian::bergamot::MemoryBundle prepareMemoryBundle(AlignedMemory* modelMemory, return memoryBundle; } -TranslationModel* TranslationModelFactory(const std::string &config, - AlignedMemory* modelMemory, +TranslationModel* TranslationModelFactory(const std::string& config, AlignedMemory* modelMemory, AlignedMemory* shortlistMemory, std::vector uniqueVocabsMemories) { - return new TranslationModel(config, std::move(prepareMemoryBundle(modelMemory, shortlistMemory, uniqueVocabsMemories))); + return new TranslationModel(config, + std::move(prepareMemoryBundle(modelMemory, shortlistMemory, uniqueVocabsMemories))); } EMSCRIPTEN_BINDINGS(translation_model) { class_("TranslationModel") - .constructor(&TranslationModelFactory, allow_raw_pointers()) - .function("translate", &TranslationModel::translateMultiple) - .function("isAlignmentSupported", &TranslationModel::isAlignmentSupported) - ; + .constructor(&TranslationModelFactory, allow_raw_pointers()) + .function("translate", &TranslationModel::translateMultiple) + .function("isAlignmentSupported", &TranslationModel::isAlignmentSupported); // ^ We redirect Service::translateMultiple to WASMBound::translate instead. Sane API is // translate. If and when async comes, we can be done with this inconsistency. diff --git a/wasm/bindings/TranslationRequestBindings.cpp b/wasm/bindings/TranslationRequestBindings.cpp index 7d5cd1e..42ac6c6 100644 --- a/wasm/bindings/TranslationRequestBindings.cpp +++ b/wasm/bindings/TranslationRequestBindings.cpp @@ -12,8 +12,4 @@ typedef marian::bergamot::ResponseOptions TranslationRequest; using namespace emscripten; // Binding code -EMSCRIPTEN_BINDINGS(translation_request) { - class_("TranslationRequest") - .constructor<>() - ; -} +EMSCRIPTEN_BINDINGS(translation_request) { class_("TranslationRequest").constructor<>(); } diff --git a/wasm/bindings/TranslationResultBindings.cpp b/wasm/bindings/TranslationResultBindings.cpp index c1c0ca8..f02bef9 100644 --- a/wasm/bindings/TranslationResultBindings.cpp +++ b/wasm/bindings/TranslationResultBindings.cpp @@ -4,6 +4,7 @@ */ #include + #include #include "response.h"