From 38e8b3cd6d5a2db561ce201c3e69fb79c676389c Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Fri, 5 Feb 2021 12:55:57 +0000 Subject: [PATCH 01/39] Updates: marian-dev, ssplit for marian-decoder-new Updates marian-dev and ssplit submodules to point to the upstream commits which implements the following: - marian-dev: encodeWithByteRanges(...) to get source token byte-ranges - ssplit: Has a trivial sentencesplitter functionality implemented, and now is faster to benchmark with marian-decoder. This enables a marian-decoder replacement written through ssplit in this source to be benchmarked constantly with existing marian-decoder. Nits: Removes logging introduced for multiple workers, and respective log statements. --- .gitignore | 14 +++++ 3rd_party/marian-dev | 2 +- 3rd_party/ssplit-cpp | 2 +- app/CMakeLists.txt | 3 + app/main-mts.cpp | 13 ---- app/marian-decoder-new.cpp | 63 +++++++++++++++++++ src/translator/CMakeLists.txt | 4 +- src/translator/batch_translator.cpp | 1 - src/translator/batcher.cpp | 1 - src/translator/sanelogging.h | 44 ------------- src/translator/sentence_splitter.cpp | 52 +++++++++++++++ src/translator/sentence_splitter.h | 31 +++++++++ src/translator/service.cpp | 1 - src/translator/service.h | 2 +- .../{textops.cpp => text_processor.cpp} | 61 +++--------------- .../{textops.h => text_processor.h} | 37 +++-------- 16 files changed, 186 insertions(+), 145 deletions(-) create mode 100644 app/marian-decoder-new.cpp delete mode 100644 src/translator/sanelogging.h create mode 100644 src/translator/sentence_splitter.cpp create mode 100644 src/translator/sentence_splitter.h rename src/translator/{textops.cpp => text_processor.cpp} (52%) rename src/translator/{textops.h => text_processor.h} (56%) diff --git a/.gitignore b/.gitignore index e63aee1..54493b9 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,17 @@ *.swp *.swo +# CMake +CMakeLists.txt.user +CMakeCache.txt +CMakeFiles +CMakeScripts +Testing +Makefile +cmake_install.cmake +install_manifest.txt +compile_commands.json +CTestTestfile.cmake +_deps + + diff --git a/3rd_party/marian-dev b/3rd_party/marian-dev index ee56e02..2f65280 160000 --- a/3rd_party/marian-dev +++ b/3rd_party/marian-dev @@ -1 +1 @@ -Subproject commit ee56e02f0525a4651157a07f74b44f456db14c8c +Subproject commit 2f65280459737c37c270e4ad0b6d41de215d11e0 diff --git a/3rd_party/ssplit-cpp b/3rd_party/ssplit-cpp index f5d0229..01e71b4 160000 --- a/3rd_party/ssplit-cpp +++ b/3rd_party/ssplit-cpp @@ -1 +1 @@ -Subproject commit f5d022992f4a00c860eb809389748908bb85ffcf +Subproject commit 01e71b4964fdc351f932a7a23cab4cb80b9698e8 diff --git a/app/CMakeLists.txt b/app/CMakeLists.txt index 6e71e9e..24bd0b4 100644 --- a/app/CMakeLists.txt +++ b/app/CMakeLists.txt @@ -3,3 +3,6 @@ target_link_libraries(bergamot-translator-app PRIVATE bergamot-translator) add_executable(service-cli main-mts.cpp) target_link_libraries(service-cli PRIVATE bergamot-translator) + +add_executable(marian-decoder-new marian-decoder-new.cpp) +target_link_libraries(marian-decoder-new PRIVATE bergamot-translator) diff --git a/app/main-mts.cpp b/app/main-mts.cpp index 44a019a..c94ff30 100644 --- a/app/main-mts.cpp +++ b/app/main-mts.cpp @@ -26,21 +26,8 @@ int main(int argc, char *argv[]) { service.translate(std::move(input)); translation_result_future.wait(); const TranslationResult &translation_result = translation_result_future.get(); - - std::cout << "service-cli [Source text]: "; - std::cout << translation_result.getOriginalText() << std::endl; - - std::cout << "service-cli [Translated text]: "; std::cout << translation_result.getTranslatedText() << std::endl; - // Obtain sentenceMappings and print them as Proof of Concept. - const TranslationResult::SentenceMappings &sentenceMappings = - translation_result.getSentenceMappings(); - for (auto &p : sentenceMappings) { - std::cout << "service-cli [src] " << p.first << "\n"; - std::cout << "service-cli [tgt] " << p.second << "\n"; - } - // Stop Service. service.stop(); return 0; diff --git a/app/marian-decoder-new.cpp b/app/marian-decoder-new.cpp new file mode 100644 index 0000000..62b1bb4 --- /dev/null +++ b/app/marian-decoder-new.cpp @@ -0,0 +1,63 @@ +#include +#include +#include +#include + +#include "common/definitions.h" +#include "common/timer.h" +#include "common/utils.h" +#include "marian.h" +#include "translator/history.h" +#include "translator/output_collector.h" +#include "translator/output_printer.h" +#include "translator/parser.h" +#include "translator/service.h" +#include "translator/translation_result.h" + +void marian_decoder_minimal(const marian::Histories &histories, + marian::Ptr targetVocab, + marian::Ptr options) { + + bool doNbest = options->get("n-best"); + auto collector = + marian::New(options->get("output")); + + // There is a dependency of vocabs here. + auto printer = marian::New(options, targetVocab); + if (options->get("quiet-translation")) + collector->setPrintingStrategy(marian::New()); + + for (auto &history : histories) { + std::stringstream best1; + std::stringstream bestn; + printer->print(history, best1, bestn); + collector->Write((long)history->getLineNum(), best1.str(), bestn.str(), + doNbest); + } +} + +int main(int argc, char *argv[]) { + auto cp = marian::bergamot::createConfigParser(); + auto options = cp.parseOptions(argc, argv, true); + marian::timer::Timer decoderTimer; + + marian::bergamot::Service service(options); + // Read a large input text blob from stdin + std::ostringstream std_input; + std_input << std::cin.rdbuf(); + std::string input = std_input.str(); + using marian::bergamot::TranslationResult; + + // Wait on future until TranslationResult is complete + std::future translation_result_future = + service.translate(std::move(input)); + translation_result_future.wait(); + const TranslationResult &translation_result = translation_result_future.get(); + + marian_decoder_minimal(translation_result.getHistories(), + service.targetVocab(), options); + + LOG(info, "Total time: {:.5f}s wall", decoderTimer.elapsed()); + service.stop(); + return 0; +} diff --git a/src/translator/CMakeLists.txt b/src/translator/CMakeLists.txt index b6fcf69..16c3db9 100644 --- a/src/translator/CMakeLists.txt +++ b/src/translator/CMakeLists.txt @@ -3,7 +3,8 @@ add_library(bergamot-translator STATIC TranslationModel.cpp # Following files added from browsermt/mts@nuke - textops.cpp + text_processor.cpp + sentence_splitter.cpp batch_translator.cpp multifactor_priority.cpp request.cpp @@ -18,3 +19,4 @@ target_include_directories(bergamot-translator PRIVATE ${CMAKE_SOURCE_DIR} PUBLIC ${CMAKE_SOURCE_DIR}/src) + diff --git a/src/translator/batch_translator.cpp b/src/translator/batch_translator.cpp index 6380a00..860255c 100644 --- a/src/translator/batch_translator.cpp +++ b/src/translator/batch_translator.cpp @@ -2,7 +2,6 @@ #include "common/logging.h" #include "data/corpus.h" #include "data/text_input.h" -#include "sanelogging.h" #include "translator/beam_search.h" namespace marian { diff --git a/src/translator/batcher.cpp b/src/translator/batcher.cpp index 22ee46d..2fa4eaf 100644 --- a/src/translator/batcher.cpp +++ b/src/translator/batcher.cpp @@ -1,6 +1,5 @@ #include "batcher.h" #include "common/logging.h" -#include "sanelogging.h" #include namespace marian { diff --git a/src/translator/sanelogging.h b/src/translator/sanelogging.h deleted file mode 100644 index 21f70dd..0000000 --- a/src/translator/sanelogging.h +++ /dev/null @@ -1,44 +0,0 @@ -#ifndef SRC_BERGAMOT_SANELOGGING_H_ -#define SRC_BERGAMOT_SANELOGGING_H_ - -#include "spdlog/spdlog.h" -#include - -namespace marian { - -#define PLOG(worker, level, ...) -#define _PLOG(worker, level, ...) checkedPLog(worker, #level, __VA_ARGS__) - -template -void checkedPLog(std::string logger, std::string level, Args... args) { - Logger log = spdlog::get(logger); - if (!log) { - try { - log = spdlog::daily_logger_st(logger, "logs/" + logger + ".log"); - } catch (const spdlog::spdlog_ex &ex) { - std::cout << "Log initialization failed: " << ex.what() << std::endl; - } - } - - if (level == "trace") - log->trace(args...); - else if (level == "debug") - log->debug(args...); - else if (level == "info") - log->info(args...); - else if (level == "warn") - log->warn(args...); - else if (level == "error") - log->error(args...); - else if (level == "critical") - log->critical(args...); - else { - log->warn("Unknown log level '{}' for logger '{}'", level, logger); - } - // Not required when threads clean-exit. - log->flush(); -} - -} // namespace marian - -#endif // SRC_BERGAMOT_SANELOGGING_H_ diff --git a/src/translator/sentence_splitter.cpp b/src/translator/sentence_splitter.cpp new file mode 100644 index 0000000..0f9be01 --- /dev/null +++ b/src/translator/sentence_splitter.cpp @@ -0,0 +1,52 @@ +#include "common/cli_helper.h" +#include "common/logging.h" +#include "common/options.h" +#include "sentence_splitter.h" +#include + +namespace marian { +namespace bergamot { + +SentenceSplitter::SentenceSplitter(marian::Ptr options) + : options_(options) { + + std::string smode_str = options_->get("ssplit-mode", ""); + mode_ = string2splitmode(smode_str); + std::string ssplit_prefix_file = + options_->get("ssplit-prefix-file", ""); + + if (ssplit_prefix_file.size()) { + ssplit_prefix_file = marian::cli::interpolateEnvVars(ssplit_prefix_file); + + LOG(info, "Loading protected prefixes for sentence splitting from {}", + ssplit_prefix_file); + + ssplit_.load(ssplit_prefix_file); + } else { + LOG(warn, "Missing list of protected prefixes for sentence splitting. " + "Set with --ssplit-prefix-file."); + } +} + +ug::ssplit::SentenceStream +SentenceSplitter::createSentenceStream(const string_view &input) { + return std::move(ug::ssplit::SentenceStream(input.data(), input.size(), + this->ssplit_, mode_)); +} + +ug::ssplit::SentenceStream::splitmode +SentenceSplitter::string2splitmode(const std::string &m) { + typedef ug::ssplit::SentenceStream::splitmode splitmode; + // @TODO: throw Exception on error + if (m == "sentence" || m == "Sentence") + return splitmode::one_sentence_per_line; + if (m == "paragraph" || m == "Paragraph") + return splitmode::one_paragraph_per_line; + if (m != "wrapped_text" && m != "WrappedText" && m != "wrappedText") { + LOG(warn, "Ignoring unknown text input format specification: {}.", m); + } + return splitmode::wrapped_text; +} + +} // namespace bergamot +} // namespace marian diff --git a/src/translator/sentence_splitter.h b/src/translator/sentence_splitter.h new file mode 100644 index 0000000..5175176 --- /dev/null +++ b/src/translator/sentence_splitter.h @@ -0,0 +1,31 @@ +#ifndef SRC_BERGAMOT_SENTENCE_SPLITTER_H_ +#define SRC_BERGAMOT_SENTENCE_SPLITTER_H_ + +#include "common/options.h" +#include "data/types.h" +#include "ssplit.h" +#include + +namespace marian { +namespace bergamot { + +class SentenceSplitter { + // A wrapper around @ugermann's ssplit-cpp compiled from several places in + // mts. Constructed based on options. Used in TextProcessor below to create + // sentence-streams, which provide access to one sentence from blob of text at + // a time. +public: + explicit SentenceSplitter(Ptr options); + ug::ssplit::SentenceStream createSentenceStream(string_view const &input); + +private: + ug::ssplit::SentenceSplitter ssplit_; + Ptr options_; + ug::ssplit::SentenceStream::splitmode mode_; + ug::ssplit::SentenceStream::splitmode string2splitmode(const std::string &m); +}; + +} // namespace bergamot +} // namespace marian + +#endif // SRC_BERGAMOT_SENTENCE_SPLITTER_H_ diff --git a/src/translator/service.cpp b/src/translator/service.cpp index 4a5af30..2acbbdb 100644 --- a/src/translator/service.cpp +++ b/src/translator/service.cpp @@ -1,6 +1,5 @@ #include "service.h" #include "definitions.h" -#include "sanelogging.h" #include #include diff --git a/src/translator/service.h b/src/translator/service.h index 4069d13..0ed8d0c 100644 --- a/src/translator/service.h +++ b/src/translator/service.h @@ -4,7 +4,7 @@ #include "batch_translator.h" #include "batcher.h" #include "pcqueue.h" -#include "textops.h" +#include "text_processor.h" #include "translation_result.h" #include diff --git a/src/translator/textops.cpp b/src/translator/text_processor.cpp similarity index 52% rename from src/translator/textops.cpp rename to src/translator/text_processor.cpp index 25e48f1..8114855 100644 --- a/src/translator/textops.cpp +++ b/src/translator/text_processor.cpp @@ -1,58 +1,17 @@ -#include "textops.h" -#include "common/timer.h" -#include -#include -#include -#include +#include "text_processor.h" +#include "data/types.h" +#include "definitions.h" + +#include "common/options.h" +#include "data/vocab.h" #include namespace marian { namespace bergamot { -SentenceSplitter::SentenceSplitter(marian::Ptr options) - : options_(options) { - - std::string smode_str = options_->get("ssplit-mode", ""); - mode_ = string2splitmode(smode_str); - std::string ssplit_prefix_file = - options_->get("ssplit-prefix-file", ""); - - if (ssplit_prefix_file.size()) { - ssplit_prefix_file = marian::cli::interpolateEnvVars(ssplit_prefix_file); - - LOG(info, "Loading protected prefixes for sentence splitting from {}", - ssplit_prefix_file); - - ssplit_.load(ssplit_prefix_file); - } else { - LOG(warn, "Missing list of protected prefixes for sentence splitting. " - "Set with --ssplit-prefix-file."); - } -} - -ug::ssplit::SentenceStream -SentenceSplitter::createSentenceStream(const string_view &input) { - pcrecpp::StringPiece spiece(input.begin(), input.size()); - return std::move(ug::ssplit::SentenceStream(spiece, this->ssplit_, mode_)); -} - -ug::ssplit::SentenceStream::splitmode -SentenceSplitter::string2splitmode(const std::string &m) { - typedef ug::ssplit::SentenceStream::splitmode splitmode; - // @TODO: throw Exception on error - if (m == "sentence" || m == "Sentence") - return splitmode::one_sentence_per_line; - if (m == "paragraph" || m == "Paragraph") - return splitmode::one_paragraph_per_line; - if (m != "wrapped_text" && m != "WrappedText" && m != "wrappedText") { - LOG(warn, "Ignoring unknown text input format specification: {}.", m); - } - return splitmode::wrapped_text; -} - Segment TextProcessor::tokenize(const string_view &segment, TokenRanges &tokenRanges) { - return vocabs_->front()->encodePreservingSource( + return vocabs_->front()->encodeWithByteRanges( segment, tokenRanges, /*addEOS=*/false, /*inference=*/true); } @@ -70,11 +29,11 @@ void TextProcessor::process(const string_view &query, Segments &segments, std::vector &sourceRanges) { auto sentenceStream = sentence_splitter_.createSentenceStream(query); - pcrecpp::StringPiece sentenceStringPiece; + std::string_view sentenceStringPiece; while (sentenceStream >> sentenceStringPiece) { - string_view sentence(sentenceStringPiece.data(), - sentenceStringPiece.size()); + marian::string_view sentence(sentenceStringPiece.data(), + sentenceStringPiece.size()); TokenRanges tokenRanges; Segment segment = tokenize(sentence, tokenRanges); diff --git a/src/translator/textops.h b/src/translator/text_processor.h similarity index 56% rename from src/translator/textops.h rename to src/translator/text_processor.h index 79a5040..111ae00 100644 --- a/src/translator/textops.h +++ b/src/translator/text_processor.h @@ -1,40 +1,17 @@ -#ifndef SRC_BERGAMOT_TEXTOPS_H_ -#define SRC_BERGAMOT_TEXTOPS_H_ +#ifndef SRC_BERGAMOT_TEXT_PROCESSOR_H_ +#define SRC_BERGAMOT_TEXT_PROCESSOR_H_ -#include "common/definitions.h" -#include "common/logging.h" -#include "common/options.h" -#include "common/types.h" // missing in shortlist.h -#include "common/utils.h" -#include "data/sentencepiece_vocab.h" -#include "data/shortlist.h" +#include "data/types.h" +#include "data/vocab.h" #include "definitions.h" -#include "ssplit.h" -#include -#include -#include +#include "sentence_splitter.h" + #include namespace marian { namespace bergamot { -class SentenceSplitter { - // A wrapper around @ugermann's ssplit-cpp compiled from several places in - // mts. Constructed based on options. Used in TextProcessor below to create - // sentence-streams, which provide access to one sentence from blob of text at - // a time. -public: - explicit SentenceSplitter(Ptr options); - ug::ssplit::SentenceStream createSentenceStream(string_view const &input); - -private: - ug::ssplit::SentenceSplitter ssplit_; - Ptr options_; - ug::ssplit::SentenceStream::splitmode mode_; - ug::ssplit::SentenceStream::splitmode string2splitmode(const std::string &m); -}; - class TextProcessor { // TextProcessor handles loading the sentencepiece vocabulary and also // contains an instance of sentence-splitter based on ssplit. @@ -68,4 +45,4 @@ private: } // namespace bergamot } // namespace marian -#endif // SRC_BERGAMOT_TEXTOPS_H_ +#endif // SRC_BERGAMOT_TEXT_PROCESSOR_H_ From 4764f11e95cb2ec3c2766949ba58a74ee0d2cc90 Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Sat, 13 Feb 2021 10:55:07 +0000 Subject: [PATCH 02/39] Move BatchTranslator::thread_ to Service (#10) Service now holds an std::vector instead of BatchTranslators. --- src/translator/batch_translator.cpp | 26 +++++++++++--------------- src/translator/batch_translator.h | 19 ++++++++----------- src/translator/service.cpp | 8 +++++--- src/translator/service.h | 2 +- 4 files changed, 25 insertions(+), 30 deletions(-) diff --git a/src/translator/batch_translator.cpp b/src/translator/batch_translator.cpp index 860255c..7f801c9 100644 --- a/src/translator/batch_translator.cpp +++ b/src/translator/batch_translator.cpp @@ -8,15 +8,10 @@ namespace marian { namespace bergamot { BatchTranslator::BatchTranslator(DeviceId const device, - PCQueue &pcqueue, std::vector> &vocabs, Ptr options) - : device_(device), options_(options), pcqueue_(&pcqueue), vocabs_(&vocabs) { - - thread_ = std::thread([this] { this->mainloop(); }); -} - -void BatchTranslator::initGraph() { + : device_(device), options_(options), vocabs_(&vocabs) { + // Initializes the graph. if (options_->hasAndNotEmpty("shortlist")) { int srcIdx = 0, trgIdx = 1; bool shared_vcb = vocabs_->front() == vocabs_->back(); @@ -38,7 +33,6 @@ void BatchTranslator::initGraph() { scorer->setShortlistGenerator(slgen_); } } - graph_->forward(); } @@ -98,18 +92,22 @@ void BatchTranslator::translate(RequestSentences &requestSentences, histories = std::move(search->search(graph_, batch)); } -void BatchTranslator::mainloop() { - initGraph(); +// void BatchTranslator::join() { thread_.join(); } + +void translation_loop(DeviceId const &device, PCQueue &pcqueue, + std::vector> &vocabs, + Ptr options) { + + BatchTranslator translator(device, vocabs, options); PCItem pcitem; Histories histories; - while (true) { - pcqueue_->ConsumeSwap(pcitem); + pcqueue.ConsumeSwap(pcitem); if (pcitem.isPoison()) { return; } else { - translate(pcitem.sentences, histories); + translator.translate(pcitem.sentences, histories); for (int i = 0; i < pcitem.sentences.size(); i++) { pcitem.sentences[i].completeSentence(histories[i]); } @@ -117,7 +115,5 @@ void BatchTranslator::mainloop() { } } -void BatchTranslator::join() { thread_.join(); } - } // namespace bergamot } // namespace marian diff --git a/src/translator/batch_translator.h b/src/translator/batch_translator.h index 069155e..c718b32 100644 --- a/src/translator/batch_translator.h +++ b/src/translator/batch_translator.h @@ -22,29 +22,26 @@ class BatchTranslator { // shut down in Service which calls join() on the threads. public: - BatchTranslator(DeviceId const device, PCQueue &pcqueue, - std::vector> &vocabs, Ptr options); - void join(); + BatchTranslator(DeviceId const device, std::vector> &vocabs, + Ptr options); // convenience function for logging. TODO(jerin) std::string _identifier() { return "worker" + std::to_string(device_.no); } + void translate(RequestSentences &requestSentences, Histories &histories); private: - void initGraph(); - void translate(RequestSentences &requestSentences, Histories &histories); - void mainloop(); - Ptr options_; - DeviceId device_; std::vector> *vocabs_; Ptr graph_; std::vector> scorers_; Ptr slgen_; - - PCQueue *pcqueue_; - std::thread thread_; }; + +void translation_loop(DeviceId const &device, PCQueue &pcqueue, + std::vector> &vocabs, + Ptr options); + } // namespace bergamot } // namespace marian diff --git a/src/translator/service.cpp b/src/translator/service.cpp index 2acbbdb..62073f9 100644 --- a/src/translator/service.cpp +++ b/src/translator/service.cpp @@ -16,9 +16,11 @@ Service::Service(Ptr options) workers_.reserve(numWorkers_); - for (int i = 0; i < numWorkers_; i++) { - marian::DeviceId deviceId(i, DeviceType::cpu); - workers_.emplace_back(deviceId, pcqueue_, vocabs_, options); + for (int cpuId = 0; cpuId < numWorkers_; cpuId++) { + workers_.emplace_back([&] { + marian::DeviceId deviceId(cpuId, DeviceType::cpu); + translation_loop(deviceId, pcqueue_, vocabs_, options); + }); } } diff --git a/src/translator/service.h b/src/translator/service.h index 0ed8d0c..e516bba 100644 --- a/src/translator/service.h +++ b/src/translator/service.h @@ -69,7 +69,7 @@ private: TextProcessor text_processor_; // ORDER DEPENDENCY Batcher batcher_; PCQueue pcqueue_; - std::vector workers_; + std::vector workers_; }; std::vector> loadVocabularies(Ptr options); From f1d9f67b56ed5d84f74236b166fd592c060bf8d2 Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Sat, 13 Feb 2021 11:42:57 +0000 Subject: [PATCH 03/39] single-threaded run with --cpu-threads 0 (#10) --- src/translator/batch_translator.cpp | 13 +++---- src/translator/batch_translator.h | 2 +- src/translator/batcher.cpp | 25 +++++++++++++ src/translator/batcher.h | 4 ++ src/translator/service.cpp | 57 ++++++++++++++++------------- src/translator/service.h | 3 ++ 6 files changed, 70 insertions(+), 34 deletions(-) diff --git a/src/translator/batch_translator.cpp b/src/translator/batch_translator.cpp index 7f801c9..3d2ec41 100644 --- a/src/translator/batch_translator.cpp +++ b/src/translator/batch_translator.cpp @@ -36,8 +36,7 @@ BatchTranslator::BatchTranslator(DeviceId const device, graph_->forward(); } -void BatchTranslator::translate(RequestSentences &requestSentences, - Histories &histories) { +void BatchTranslator::translate(RequestSentences &requestSentences) { std::vector batchVector; for (auto &sentence : requestSentences) { @@ -89,7 +88,10 @@ void BatchTranslator::translate(RequestSentences &requestSentences, auto trgVocab = vocabs_->back(); auto search = New(options_, scorers_, trgVocab); - histories = std::move(search->search(graph_, batch)); + auto histories = std::move(search->search(graph_, batch)); + for (int i = 0; i < requestSentences.size(); i++) { + requestSentences[i].completeSentence(histories[i]); + } } // void BatchTranslator::join() { thread_.join(); } @@ -107,10 +109,7 @@ void translation_loop(DeviceId const &device, PCQueue &pcqueue, if (pcitem.isPoison()) { return; } else { - translator.translate(pcitem.sentences, histories); - for (int i = 0; i < pcitem.sentences.size(); i++) { - pcitem.sentences[i].completeSentence(histories[i]); - } + translator.translate(pcitem.sentences); } } } diff --git a/src/translator/batch_translator.h b/src/translator/batch_translator.h index c718b32..4067e59 100644 --- a/src/translator/batch_translator.h +++ b/src/translator/batch_translator.h @@ -27,7 +27,7 @@ public: // convenience function for logging. TODO(jerin) std::string _identifier() { return "worker" + std::to_string(device_.no); } - void translate(RequestSentences &requestSentences, Histories &histories); + void translate(RequestSentences &requestSentences); private: Ptr options_; diff --git a/src/translator/batcher.cpp b/src/translator/batcher.cpp index 2fa4eaf..18bf5fd 100644 --- a/src/translator/batcher.cpp +++ b/src/translator/batcher.cpp @@ -50,5 +50,30 @@ void Batcher::cleaveBatch(RequestSentences &sentences) { } } +void Batcher::addWholeRequest(Ptr request) { + for (int i = 0; i < request->numSegments(); i++) { + RequestSentence requestSentence(i, request); + addSentenceWithPriority(requestSentence); + } +} + +void Batcher::enqueue(PCQueue &pcqueue) { + int numSentences; + do { + RequestSentences batchSentences; + cleaveBatch(batchSentences); + numSentences = batchSentences.size(); + + if (numSentences > 0) { + PCItem pcitem(batchNumber_++, std::move(batchSentences)); + pcqueue.ProduceSwap(pcitem); + } + + if (batchNumber_ % 500 == 0) { + LOG(info, "Queuing batch {}", batchNumber_); + } + } while (numSentences > 0); +} + } // namespace bergamot } // namespace marian diff --git a/src/translator/batcher.h b/src/translator/batcher.h index b60b642..2499cd2 100644 --- a/src/translator/batcher.h +++ b/src/translator/batcher.h @@ -4,6 +4,7 @@ #include "common/options.h" #include "data/corpus_base.h" #include "definitions.h" +#include "pcqueue.h" #include "request.h" #include @@ -19,6 +20,8 @@ public: // sentence. This method inserts the sentence into the internal data-structure // which maintains priority among sentences from multiple concurrent requests. void addSentenceWithPriority(RequestSentence &sentence); + void addWholeRequest(Ptr request); + void enqueue(PCQueue &pcqueue); // Loads sentences with sentences compiled from (tentatively) multiple // requests optimizing for both padding and priority. @@ -27,6 +30,7 @@ public: private: unsigned int max_input_tokens_; std::vector> bucket_; + unsigned int batchNumber_{0}; }; } // namespace bergamot diff --git a/src/translator/service.cpp b/src/translator/service.cpp index 62073f9..fc71385 100644 --- a/src/translator/service.cpp +++ b/src/translator/service.cpp @@ -14,13 +14,17 @@ Service::Service(Ptr options) text_processor_(vocabs_, options), batcher_(options), pcqueue_(2 * options->get("cpu-threads")) { - workers_.reserve(numWorkers_); - - for (int cpuId = 0; cpuId < numWorkers_; cpuId++) { - workers_.emplace_back([&] { - marian::DeviceId deviceId(cpuId, DeviceType::cpu); - translation_loop(deviceId, pcqueue_, vocabs_, options); - }); + if (numWorkers_ > 0) { + workers_.reserve(numWorkers_); + for (int cpuId = 0; cpuId < numWorkers_; cpuId++) { + workers_.emplace_back([&] { + marian::DeviceId deviceId(cpuId, DeviceType::cpu); + translation_loop(deviceId, pcqueue_, vocabs_, options); + }); + } + } else { + marian::DeviceId deviceId(/*cpuId=*/0, DeviceType::cpu); + translator = new BatchTranslator(deviceId, vocabs_, options); } } @@ -53,27 +57,28 @@ std::future Service::translate(std::string &&input) { std::move(segments), std::move(sourceAlignments), std::move(translationResultPromise)); - for (int i = 0; i < request->numSegments(); i++) { - RequestSentence requestSentence(i, request); - batcher_.addSentenceWithPriority(requestSentence); + batcher_.addWholeRequest(request); + if (numWorkers_ > 0) { + batcher_.enqueue(pcqueue_); + } else { + // Queue single-threaded + int numSentences; + do { + RequestSentences batchSentences; + batcher_.cleaveBatch(batchSentences); + numSentences = batchSentences.size(); + + if (numSentences > 0) { + translator->translate(batchSentences); + batchNumber_++; + } + + if (batchNumber_ % 500 == 0) { + LOG(info, "Tranlsating batch {}", batchNumber_); + } + } while (numSentences > 0); } - int numSentences; - do { - RequestSentences batchSentences; - batcher_.cleaveBatch(batchSentences); - numSentences = batchSentences.size(); - - if (numSentences > 0) { - PCItem pcitem(batchNumber_++, std::move(batchSentences)); - pcqueue_.ProduceSwap(pcitem); - } - - if (batchNumber_ % 500 == 0) { - LOG(info, "Queuing batch {}", batchNumber_); - } - } while (numSentences > 0); - return future; } diff --git a/src/translator/service.h b/src/translator/service.h index e516bba..951398d 100644 --- a/src/translator/service.h +++ b/src/translator/service.h @@ -70,6 +70,9 @@ private: Batcher batcher_; PCQueue pcqueue_; std::vector workers_; + + // Optional + BatchTranslator *translator{nullptr}; }; std::vector> loadVocabularies(Ptr options); From 77a600b637afd854a189f96b052f37896d37acb7 Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Sat, 13 Feb 2021 14:19:10 +0000 Subject: [PATCH 04/39] Removing join() (#10) --- src/translator/batch_translator.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/translator/batch_translator.cpp b/src/translator/batch_translator.cpp index 3d2ec41..b944bed 100644 --- a/src/translator/batch_translator.cpp +++ b/src/translator/batch_translator.cpp @@ -94,8 +94,6 @@ void BatchTranslator::translate(RequestSentences &requestSentences) { } } -// void BatchTranslator::join() { thread_.join(); } - void translation_loop(DeviceId const &device, PCQueue &pcqueue, std::vector> &vocabs, Ptr options) { From 73a56a8f4fa447fb58e230905c7c6e3d25c366da Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Sat, 13 Feb 2021 15:48:23 +0000 Subject: [PATCH 05/39] Refactoring batching-mechanisms into Batcher Guided by an objective to move batching mechanism and queueing request to generate batches into a diffenrent thread. This commit is in preparation for this functionality. First, PCItem from the looks of it is *Batch*. Renamed to reflect the same. Fingers crossed, hopefully no naming conflicts with marian. BatchTranslator translates a "Batch" now, instead of vector. Additional data members are setup at Batch to enable development. Workflows previously in Service, but more adequate in Batcher are now moved, preparing to move Batcher/enqueuing of a request into a new thread making it non-blocking. This will allow service to queue requests into the batcher thread and exit, without waiting until the full-request is queued. Batcher now has a path with and without pcqueue. --- src/translator/batch_translator.cpp | 25 +++++----- src/translator/batch_translator.h | 4 +- src/translator/batcher.cpp | 73 +++++++++++++++-------------- src/translator/batcher.h | 7 +-- src/translator/request.h | 22 +++++---- src/translator/service.cpp | 27 ++++------- src/translator/service.h | 3 +- 7 files changed, 78 insertions(+), 83 deletions(-) diff --git a/src/translator/batch_translator.cpp b/src/translator/batch_translator.cpp index b944bed..a6e6b93 100644 --- a/src/translator/batch_translator.cpp +++ b/src/translator/batch_translator.cpp @@ -36,10 +36,10 @@ BatchTranslator::BatchTranslator(DeviceId const device, graph_->forward(); } -void BatchTranslator::translate(RequestSentences &requestSentences) { +void BatchTranslator::translate(Batch &batch) { std::vector batchVector; - for (auto &sentence : requestSentences) { + for (auto &sentence : batch.sentences) { data::SentenceTuple sentence_tuple(sentence.lineNumber()); Segment segment = sentence.getUnderlyingSegment(); sentence_tuple.push_back(segment); @@ -82,32 +82,31 @@ void BatchTranslator::translate(RequestSentences &requestSentences) { for (size_t j = 0; j < maxDims.size(); ++j) subBatches[j]->setWords(words[j]); - auto batch = Ptr(new CorpusBatch(subBatches)); - batch->setSentenceIds(sentenceIds); + auto corpus_batch = Ptr(new CorpusBatch(subBatches)); + corpus_batch->setSentenceIds(sentenceIds); auto trgVocab = vocabs_->back(); auto search = New(options_, scorers_, trgVocab); - auto histories = std::move(search->search(graph_, batch)); - for (int i = 0; i < requestSentences.size(); i++) { - requestSentences[i].completeSentence(histories[i]); + auto histories = std::move(search->search(graph_, corpus_batch)); + for (int i = 0; i < batch.sentences.size(); i++) { + batch.sentences[i].completeSentence(histories[i]); } } -void translation_loop(DeviceId const &device, PCQueue &pcqueue, +void translation_loop(DeviceId const &device, PCQueue &pcqueue, std::vector> &vocabs, Ptr options) { BatchTranslator translator(device, vocabs, options); - - PCItem pcitem; + Batch batch; Histories histories; while (true) { - pcqueue.ConsumeSwap(pcitem); - if (pcitem.isPoison()) { + pcqueue.ConsumeSwap(batch); + if (batch.isPoison()) { return; } else { - translator.translate(pcitem.sentences); + translator.translate(batch); } } } diff --git a/src/translator/batch_translator.h b/src/translator/batch_translator.h index 4067e59..2ee4e04 100644 --- a/src/translator/batch_translator.h +++ b/src/translator/batch_translator.h @@ -27,7 +27,7 @@ public: // convenience function for logging. TODO(jerin) std::string _identifier() { return "worker" + std::to_string(device_.no); } - void translate(RequestSentences &requestSentences); + void translate(Batch &batch); private: Ptr options_; @@ -38,7 +38,7 @@ private: Ptr slgen_; }; -void translation_loop(DeviceId const &device, PCQueue &pcqueue, +void translation_loop(DeviceId const &device, PCQueue &pcqueue, std::vector> &vocabs, Ptr options); diff --git a/src/translator/batcher.cpp b/src/translator/batcher.cpp index 18bf5fd..13b5635 100644 --- a/src/translator/batcher.cpp +++ b/src/translator/batcher.cpp @@ -6,10 +6,10 @@ namespace marian { namespace bergamot { Batcher::Batcher(Ptr options) { - max_input_tokens_ = options->get("max-input-tokens"); + miniBatchWords = options->get("max-input-tokens"); bucket_.resize(options->get("max-input-sentence-tokens") + 1); ABORT_IF( - max_input_tokens_ < bucket_.size() - 1, + miniBatchWords < bucket_.size() - 1, "max-input-tokens cannot be less than than max-input-sentence-tokens, " "batcher fail"); } @@ -20,34 +20,48 @@ void Batcher::addSentenceWithPriority(RequestSentence &sentence) { bucket_[bucket_id].insert(sentence); } -void Batcher::cleaveBatch(RequestSentences &sentences) { +bool Batcher::operator>>(Batch &batch) { return cleaveBatch(batch); } + +bool Batcher::cleaveBatch(Batch &batch) { // For now simply iterates on buckets and converts batches greedily. This // has to be enhanced with optimizing over priority. The baseline // implementation should at least be as fast as marian's maxi-batch with full // corpus size as maxi-batch size. + batch.reset(); + int paddedBatchSize = 0; - int segments_added = 0; - int current_input_tokens = 0; - int padded_batch_size = 0; - int prev_padded_batch_size; - - for (int i = 0; i < bucket_.size(); i++) { - auto p = bucket_[i].begin(); - while (p != bucket_[i].end()) { - padded_batch_size = (segments_added + 1) * i; - if (padded_batch_size <= max_input_tokens_) { + for (int length = 0; length < bucket_.size(); length++) { + auto p = bucket_[length].begin(); + while (p != bucket_[length].end()) { + paddedBatchSize = (batch.sentences.size() + 1) * length; + if (paddedBatchSize <= miniBatchWords) { auto q = p; ++p; - current_input_tokens += i; - sentences.push_back(*q); - ++segments_added; - bucket_[i].erase(q); - prev_padded_batch_size = padded_batch_size; + + batch.numTokens += length; + batch.sentences.push_back(*q); + batch.maxLength = std::max(batch.maxLength, length); + + bucket_[length].erase(q); } else { - return; + // Check if elements exist + assert(batch.sentences.size() > 0); + batch.Id = ++batchNumber_; + if (batchId % 500 == 0) { + batch.log(); + } + return true; } } } + + if (batch.sentences.size()) { + batch.Id = ++batchNumber_; + batch.log(); + return true; + } else { + return false; + } } void Batcher::addWholeRequest(Ptr request) { @@ -57,22 +71,11 @@ void Batcher::addWholeRequest(Ptr request) { } } -void Batcher::enqueue(PCQueue &pcqueue) { - int numSentences; - do { - RequestSentences batchSentences; - cleaveBatch(batchSentences); - numSentences = batchSentences.size(); - - if (numSentences > 0) { - PCItem pcitem(batchNumber_++, std::move(batchSentences)); - pcqueue.ProduceSwap(pcitem); - } - - if (batchNumber_ % 500 == 0) { - LOG(info, "Queuing batch {}", batchNumber_); - } - } while (numSentences > 0); +void Batcher::enqueue(PCQueue &pcqueue) { + Batch batch; + while (cleaveBatch(batch)) { + pcqueue.ProduceSwap(batch); + } } } // namespace bergamot diff --git a/src/translator/batcher.h b/src/translator/batcher.h index 2499cd2..d6b85f3 100644 --- a/src/translator/batcher.h +++ b/src/translator/batcher.h @@ -21,14 +21,15 @@ public: // which maintains priority among sentences from multiple concurrent requests. void addSentenceWithPriority(RequestSentence &sentence); void addWholeRequest(Ptr request); - void enqueue(PCQueue &pcqueue); + void enqueue(PCQueue &pcqueue); // Loads sentences with sentences compiled from (tentatively) multiple // requests optimizing for both padding and priority. - void cleaveBatch(RequestSentences &sentences); + bool cleaveBatch(Batch &batch); + bool operator>>(Batch &batch); // alias private: - unsigned int max_input_tokens_; + unsigned int miniBatchWords; std::vector> bucket_; unsigned int batchNumber_{0}; }; diff --git a/src/translator/request.h b/src/translator/request.h index 6f268ba..673f88c 100644 --- a/src/translator/request.h +++ b/src/translator/request.h @@ -24,6 +24,7 @@ #include "definitions.h" #include "translation_result.h" +#include "common/logging.h" #include "data/types.h" #include "translator/beam_search.h" @@ -92,20 +93,23 @@ public: typedef std::vector RequestSentences; -struct PCItem { - int batchNumber; +struct Batch { + int Id; + int numTokens, maxLength; RequestSentences sentences; - // PCItem should be default constructible for PCQueue. Default constructed + // Batch should be default constructible for PCQueue. Default constructed // element is poison. - PCItem() : batchNumber(-1) {} - - // PCItem constructor to construct a legit PCItem. - explicit PCItem(int batchNumber, RequestSentences &&sentences) - : batchNumber(batchNumber), sentences(std::move(sentences)) {} + Batch() { reset(); } + void reset() { Id = -1, numTokens = 0, maxLength = 0, sentences.clear(); } // Convenience function to determine poison. - bool isPoison() { return (batchNumber == -1); } + bool isPoison() { return (Id == -1); } + + void log() { + LOG(info, "Batch(Id={}, tokens={}, max-length={}, sentences={})", Id, + numTokens, maxLength, sentences.size()); + } }; } // namespace bergamot diff --git a/src/translator/service.cpp b/src/translator/service.cpp index fc71385..3701955 100644 --- a/src/translator/service.cpp +++ b/src/translator/service.cpp @@ -8,8 +8,7 @@ namespace marian { namespace bergamot { Service::Service(Ptr options) - : requestId_(0), batchNumber_(0), - numWorkers_(options->get("cpu-threads")), + : requestId_(0), numWorkers_(options->get("cpu-threads")), vocabs_(std::move(loadVocabularies(options))), text_processor_(vocabs_, options), batcher_(options), pcqueue_(2 * options->get("cpu-threads")) { @@ -58,25 +57,15 @@ std::future Service::translate(std::string &&input) { std::move(translationResultPromise)); batcher_.addWholeRequest(request); + if (numWorkers_ > 0) { batcher_.enqueue(pcqueue_); } else { // Queue single-threaded - int numSentences; - do { - RequestSentences batchSentences; - batcher_.cleaveBatch(batchSentences); - numSentences = batchSentences.size(); - - if (numSentences > 0) { - translator->translate(batchSentences); - batchNumber_++; - } - - if (batchNumber_ % 500 == 0) { - LOG(info, "Tranlsating batch {}", batchNumber_); - } - } while (numSentences > 0); + Batch batch; + while (batcher_ >> batch) { + translator->translate(batch); + } } return future; @@ -85,8 +74,8 @@ std::future Service::translate(std::string &&input) { void Service::stop() { int counter = 0; for (auto &worker : workers_) { - PCItem pcitem; - pcqueue_.ProduceSwap(pcitem); + Batch batch; + pcqueue_.ProduceSwap(batch); ++counter; } diff --git a/src/translator/service.h b/src/translator/service.h index 951398d..c57e609 100644 --- a/src/translator/service.h +++ b/src/translator/service.h @@ -46,7 +46,6 @@ public: private: unsigned int requestId_; - unsigned int batchNumber_; int numWorkers_; // vocabs are used to construct a Request, which later uses it to construct @@ -68,7 +67,7 @@ private: TextProcessor text_processor_; // ORDER DEPENDENCY Batcher batcher_; - PCQueue pcqueue_; + PCQueue pcqueue_; std::vector workers_; // Optional From e585a9e7861934e40d3d4e2a5793724be3a9e3a6 Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Sat, 13 Feb 2021 16:31:30 +0000 Subject: [PATCH 06/39] Sanitizing Batch construction Batch Ids cannot be set by outside classes to values < 0. Batch.Id_ = -1 : Poison, for use in PCQueue 0 : Default constructed, invalid batch. >0 : Legit batch. Book-keeping for batch metrics (maxLength, numTokens, etc) and logging are now moved to Batch. Batch is now a class instead of a struct with accessors controlling how members can be modified to suit above. --- src/translator/batch_translator.cpp | 7 ++-- src/translator/batcher.cpp | 23 ++++--------- src/translator/request.h | 53 ++++++++++++++++++++++------- src/translator/service.cpp | 4 +-- 4 files changed, 53 insertions(+), 34 deletions(-) diff --git a/src/translator/batch_translator.cpp b/src/translator/batch_translator.cpp index a6e6b93..13eb58a 100644 --- a/src/translator/batch_translator.cpp +++ b/src/translator/batch_translator.cpp @@ -39,7 +39,8 @@ BatchTranslator::BatchTranslator(DeviceId const device, void BatchTranslator::translate(Batch &batch) { std::vector batchVector; - for (auto &sentence : batch.sentences) { + auto &sentences = batch.sentences(); + for (auto &sentence : sentences) { data::SentenceTuple sentence_tuple(sentence.lineNumber()); Segment segment = sentence.getUnderlyingSegment(); sentence_tuple.push_back(segment); @@ -89,9 +90,7 @@ void BatchTranslator::translate(Batch &batch) { auto search = New(options_, scorers_, trgVocab); auto histories = std::move(search->search(graph_, corpus_batch)); - for (int i = 0; i < batch.sentences.size(); i++) { - batch.sentences[i].completeSentence(histories[i]); - } + batch.completeBatch(histories); } void translation_loop(DeviceId const &device, PCQueue &pcqueue, diff --git a/src/translator/batcher.cpp b/src/translator/batcher.cpp index 13b5635..5fdcc3a 100644 --- a/src/translator/batcher.cpp +++ b/src/translator/batcher.cpp @@ -33,31 +33,22 @@ bool Batcher::cleaveBatch(Batch &batch) { for (int length = 0; length < bucket_.size(); length++) { auto p = bucket_[length].begin(); while (p != bucket_[length].end()) { - paddedBatchSize = (batch.sentences.size() + 1) * length; + paddedBatchSize = (batch.size() + 1) * length; if (paddedBatchSize <= miniBatchWords) { - auto q = p; - ++p; - - batch.numTokens += length; - batch.sentences.push_back(*q); - batch.maxLength = std::max(batch.maxLength, length); - + auto q = p++; + batch.add(*q); bucket_[length].erase(q); } else { // Check if elements exist - assert(batch.sentences.size() > 0); - batch.Id = ++batchNumber_; - if (batchId % 500 == 0) { - batch.log(); - } + assert(batch.size() > 0); + batch.setId(++batchNumber_); return true; } } } - if (batch.sentences.size()) { - batch.Id = ++batchNumber_; - batch.log(); + if (batch.size()) { + batch.setId(++batchNumber_); return true; } else { return false; diff --git a/src/translator/request.h b/src/translator/request.h index 673f88c..5fb9c3c 100644 --- a/src/translator/request.h +++ b/src/translator/request.h @@ -28,6 +28,8 @@ #include "data/types.h" #include "translator/beam_search.h" +#include + #include #include @@ -93,23 +95,50 @@ public: typedef std::vector RequestSentences; -struct Batch { - int Id; - int numTokens, maxLength; - RequestSentences sentences; - - // Batch should be default constructible for PCQueue. Default constructed - // element is poison. +class Batch { +public: Batch() { reset(); } - void reset() { Id = -1, numTokens = 0, maxLength = 0, sentences.clear(); } - + void reset() { Id_ = 0, numTokens_ = 0, maxLength_ = 0, sentences_.clear(); } // Convenience function to determine poison. - bool isPoison() { return (Id == -1); } + bool isPoison() { return (Id_ == -1); } + static Batch poison() { + Batch poison_; + poison_.Id_ = -1; + return poison_; + } void log() { - LOG(info, "Batch(Id={}, tokens={}, max-length={}, sentences={})", Id, - numTokens, maxLength, sentences.size()); + LOG(info, "Batch(Id_={}, tokens={}, max-length={}, sentences_={})", Id_, + numTokens_, maxLength_, sentences_.size()); } + + void add(const RequestSentence &sentence) { + sentences_.push_back(sentence); + maxLength_ = std::max(sentence.numTokens(), maxLength_); + numTokens_ += sentence.numTokens(); + } + + size_t size() { return sentences_.size(); } + + void setId(int Id) { + assert(Id > 0); + Id_ = Id; + if (Id % 500 == 0) { + log(); + } + } + + const RequestSentences &sentences() { return sentences_; } + void completeBatch(const Histories &histories) { + for (int i = 0; i < sentences_.size(); i++) { + sentences_[i].completeSentence(histories[i]); + } + } + +private: + int Id_; + size_t numTokens_, maxLength_; + RequestSentences sentences_; }; } // namespace bergamot diff --git a/src/translator/service.cpp b/src/translator/service.cpp index 3701955..c93aa5f 100644 --- a/src/translator/service.cpp +++ b/src/translator/service.cpp @@ -74,8 +74,8 @@ std::future Service::translate(std::string &&input) { void Service::stop() { int counter = 0; for (auto &worker : workers_) { - Batch batch; - pcqueue_.ProduceSwap(batch); + Batch poison = Batch::poison(); + pcqueue_.ProduceSwap(poison); ++counter; } From 47323d21b93795e19d82a499bfb13b71f7032c40 Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Sun, 14 Feb 2021 13:05:05 +0000 Subject: [PATCH 07/39] Getting rid of unused variables in Batch --- src/translator/request.h | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/src/translator/request.h b/src/translator/request.h index 5fb9c3c..eab0d4b 100644 --- a/src/translator/request.h +++ b/src/translator/request.h @@ -98,7 +98,10 @@ typedef std::vector RequestSentences; class Batch { public: Batch() { reset(); } - void reset() { Id_ = 0, numTokens_ = 0, maxLength_ = 0, sentences_.clear(); } + void reset() { + Id_ = 0; + sentences_.clear(); + } // Convenience function to determine poison. bool isPoison() { return (Id_ == -1); } static Batch poison() { @@ -108,15 +111,17 @@ public: } void log() { + int numTokens{0}, maxLength{0}; + for (auto &sentence : sentences_) { + numTokens += sentence.numTokens(); + maxLength = std::max(maxLength, static_cast(sentence.numTokens())); + } + LOG(info, "Batch(Id_={}, tokens={}, max-length={}, sentences_={})", Id_, - numTokens_, maxLength_, sentences_.size()); + numTokens, maxLength, sentences_.size()); } - void add(const RequestSentence &sentence) { - sentences_.push_back(sentence); - maxLength_ = std::max(sentence.numTokens(), maxLength_); - numTokens_ += sentence.numTokens(); - } + void add(const RequestSentence &sentence) { sentences_.push_back(sentence); } size_t size() { return sentences_.size(); } @@ -137,7 +142,6 @@ public: private: int Id_; - size_t numTokens_, maxLength_; RequestSentences sentences_; }; From ecc91c51e3b439b32173e3e4a821fdfe1a538436 Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Sun, 14 Feb 2021 13:23:46 +0000 Subject: [PATCH 08/39] BatchTranslator* -> unique_ptr --- src/translator/service.cpp | 3 ++- src/translator/service.h | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/translator/service.cpp b/src/translator/service.cpp index c93aa5f..bdfb7e9 100644 --- a/src/translator/service.cpp +++ b/src/translator/service.cpp @@ -23,7 +23,8 @@ Service::Service(Ptr options) } } else { marian::DeviceId deviceId(/*cpuId=*/0, DeviceType::cpu); - translator = new BatchTranslator(deviceId, vocabs_, options); + translator = + UPtr(new BatchTranslator(deviceId, vocabs_, options)); } } diff --git a/src/translator/service.h b/src/translator/service.h index c57e609..db01468 100644 --- a/src/translator/service.h +++ b/src/translator/service.h @@ -71,7 +71,7 @@ private: std::vector workers_; // Optional - BatchTranslator *translator{nullptr}; + UPtr translator{nullptr}; }; std::vector> loadVocabularies(Ptr options); From 5bd4a1a3c0ef388249794298b5ed2c0b1cf92d05 Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Sun, 14 Feb 2021 19:58:29 +0000 Subject: [PATCH 09/39] Refactor: marian-TranslationResult and associated marian-TranslationResult has more guards in place. Switching to a construction on demand model for sentenceMappings. These changes propogate to bergamot translation results. Integration broke with the change in marian's internals, which are updated accordingly to get back functionality. Changes revealed a few bugs, which are fixed: - ConfigParser already discovered in wasm-integration (https://github.com/browsermt/bergamot-translator/commit/a06530e92b6d16527487c8fa0ead4ae04f0ddbb5). - Lambda captures and undefined values in DeviceId --- app/main-mts.cpp | 2 +- app/marian-decoder-new.cpp | 4 +- src/translator/TranslationModel.cpp | 18 +++++-- src/translator/parser.h | 3 +- src/translator/service.cpp | 10 ++-- src/translator/translation_result.cpp | 67 +++++++++++++++++---------- src/translator/translation_result.h | 42 ++++------------- 7 files changed, 76 insertions(+), 70 deletions(-) diff --git a/app/main-mts.cpp b/app/main-mts.cpp index c94ff30..d8e7567 100644 --- a/app/main-mts.cpp +++ b/app/main-mts.cpp @@ -26,7 +26,7 @@ int main(int argc, char *argv[]) { service.translate(std::move(input)); translation_result_future.wait(); const TranslationResult &translation_result = translation_result_future.get(); - std::cout << translation_result.getTranslatedText() << std::endl; + std::cout << translation_result.translation() << std::endl; // Stop Service. service.stop(); diff --git a/app/marian-decoder-new.cpp b/app/marian-decoder-new.cpp index 62b1bb4..6e44fb7 100644 --- a/app/marian-decoder-new.cpp +++ b/app/marian-decoder-new.cpp @@ -54,8 +54,8 @@ int main(int argc, char *argv[]) { translation_result_future.wait(); const TranslationResult &translation_result = translation_result_future.get(); - marian_decoder_minimal(translation_result.getHistories(), - service.targetVocab(), options); + marian_decoder_minimal(translation_result.histories(), service.targetVocab(), + options); LOG(info, "Total time: {:.5f}s wall", decoderTimer.elapsed()); service.stop(); diff --git a/src/translator/TranslationModel.cpp b/src/translator/TranslationModel.cpp index f501678..9c55422 100644 --- a/src/translator/TranslationModel.cpp +++ b/src/translator/TranslationModel.cpp @@ -14,6 +14,7 @@ // All local project includes #include "TranslationModel.h" +#include "translator/parser.h" #include "translator/service.h" std::shared_ptr parseOptions(const std::string &config) { @@ -34,7 +35,7 @@ std::shared_ptr parseOptions(const std::string &config) { // Error: Aborted from void unhandledException() in // 3rd_party/marian-dev/src/common/logging.cpp:113 - marian::ConfigParser configParser(marian::cli::mode::translation); + marian::ConfigParser configParser = marian::bergamot::createConfigParser(); const YAML::Node &defaultConfig = configParser.getConfig(); options.merge(defaultConfig); @@ -70,18 +71,25 @@ TranslationModel::translate(std::vector &&texts, intermediate.wait(); auto mTranslationResult(std::move(intermediate.get())); + // This mess because marian::string_view != std::string_view + std::string source, translation; + marian::bergamot::TranslationResult::SentenceMappings mSentenceMappings; + mTranslationResult.move(source, translation, mSentenceMappings); + // Convert to UnifiedAPI::TranslationResult TranslationResult::SentenceMappings sentenceMappings; - for (auto &p : mTranslationResult.getSentenceMappings()) { + for (auto &p : mSentenceMappings) { std::string_view src(p.first.data(), p.first.size()), tgt(p.second.data(), p.second.size()); sentenceMappings.emplace_back(src, tgt); } // In place construction. - translationResults.emplace_back(std::move(mTranslationResult.source_), - std::move(mTranslationResult.translation_), - std::move(sentenceMappings)); + translationResults.emplace_back( + std::move(source), // &&mTranslationResult.source_ + std::move(translation), // &&mTranslationResult.translation_ + std::move(sentenceMappings) // &&sentenceMappings + ); } promise.set_value(std::move(translationResults)); diff --git a/src/translator/parser.h b/src/translator/parser.h index e273d6a..606b6a4 100644 --- a/src/translator/parser.h +++ b/src/translator/parser.h @@ -5,7 +5,8 @@ namespace marian { namespace bergamot { -marian::ConfigParser createConfigParser() { + +inline marian::ConfigParser createConfigParser() { marian::ConfigParser cp(marian::cli::mode::translation); cp.addOption( "--ssplit-prefix-file", "Bergamot Options", diff --git a/src/translator/service.cpp b/src/translator/service.cpp index bdfb7e9..ef2bacb 100644 --- a/src/translator/service.cpp +++ b/src/translator/service.cpp @@ -15,11 +15,11 @@ Service::Service(Ptr options) if (numWorkers_ > 0) { workers_.reserve(numWorkers_); - for (int cpuId = 0; cpuId < numWorkers_; cpuId++) { - workers_.emplace_back([&] { - marian::DeviceId deviceId(cpuId, DeviceType::cpu); - translation_loop(deviceId, pcqueue_, vocabs_, options); - }); + for (size_t cpuId = 0; cpuId < numWorkers_; cpuId++) { + marian::DeviceId deviceId(cpuId, DeviceType::cpu); + workers_.emplace_back(translation_loop, // Function + deviceId, std::ref(pcqueue_), std::ref(vocabs_), + options); } } else { marian::DeviceId deviceId(/*cpuId=*/0, DeviceType::cpu); diff --git a/src/translator/translation_result.cpp b/src/translator/translation_result.cpp index d69259f..ee147be 100644 --- a/src/translator/translation_result.cpp +++ b/src/translator/translation_result.cpp @@ -14,22 +14,26 @@ TranslationResult::TranslationResult(std::string &&source, : source_(std::move(source)), sourceRanges_(std::move(sourceRanges)), histories_(std::move(histories)) { - std::vector sourceMappings; - std::vector targetMappings; + constructTargetProperties(vocabs); +} - // Process sourceMappings into sourceMappings. - sourceMappings.reserve(sourceRanges_.size()); - for (int i = 0; i < sourceRanges_.size(); i++) { - string_view first = sourceRanges_[i].front(); - string_view last = sourceRanges_[i].back(); - sourceMappings.emplace_back(first.data(), last.end() - first.begin()); - } +void TranslationResult::move(std::string &source, std::string &translation, + SentenceMappings &sentenceMappings) { - // Compiles translations into a single std::string translation_ - // Current implementation uses += on std::string, multiple resizes. - // Stores ByteRanges as indices first, followed by conversion into - // string_views. - // TODO(jerin): Add token level string_views here as well. + constructSentenceMappings(sentenceMappings); + // Totally illegal stuff. + source = std::move(source_); + translation = std::move(translation_); + + // The above assignment expects source, target be moved. + // which makes the following invalid, hence required to be cleared. + sourceRanges_.clear(); + targetRanges_.clear(); + histories_.clear(); +} + +void TranslationResult::constructTargetProperties( + std::vector> &vocabs) { std::vector> translationRanges; size_t offset{0}; bool first{true}; @@ -52,21 +56,36 @@ TranslationResult::TranslationResult(std::string &&source, offset += decoded.size(); } - // Converting ByteRanges as indices into string_views. - targetMappings.reserve(translationRanges.size()); + // TODO(@jerinphilip): + // Currently considers target tokens as whole text. Needs + // to be further enhanced in marian-dev to extract alignments. for (auto &range : translationRanges) { + std::vector targetMappings; const char *begin = &translation_[range.first]; targetMappings.emplace_back(begin, range.second); - } - - // Surely, let's add sentenceMappings_ - for (auto src = sourceMappings.begin(), tgt = targetMappings.begin(); - src != sourceMappings.end() && tgt != targetMappings.end(); - ++src, ++tgt) { - sentenceMappings_.emplace_back(*src, *tgt); - auto &t = sentenceMappings_.back(); + targetRanges_.push_back(std::move(targetMappings)); } } +void TranslationResult::constructSentenceMappings( + TranslationResult::SentenceMappings &sentenceMappings) { + + for (int i = 0; i < sourceRanges_.size(); i++) { + string_view first, last; + + // Handle source-sentence + first = sourceRanges_[i].front(); + last = sourceRanges_[i].back(); + string_view src_sentence(first.data(), last.end() - first.begin()); + + // Handle target-sentence + first = targetRanges_[i].front(); + last = targetRanges_[i].back(); + string_view tgt_sentence(first.data(), last.end() - first.begin()); + + // Add both into sentence-mappings + sentenceMappings.emplace_back(src_sentence, tgt_sentence); + } +} } // namespace bergamot } // namespace marian diff --git a/src/translator/translation_result.h b/src/translator/translation_result.h index edc9a8d..5903145 100644 --- a/src/translator/translation_result.h +++ b/src/translator/translation_result.h @@ -22,53 +22,31 @@ public: : source_(std::move(other.source_)), translation_(std::move(other.translation_)), sourceRanges_(std::move(other.sourceRanges_)), - sentenceMappings_(std::move(other.sentenceMappings_)), + targetRanges_(std::move(other.targetRanges_)), histories_(std::move(other.histories_)){}; TranslationResult(const TranslationResult &) = delete; TranslationResult &operator=(const TranslationResult &) = delete; - // Returns const references to source and translated texts, for external - // consumption. - - const std::string &getOriginalText() const { return source_; } - const std::string &getTranslatedText() const { return translation_; } - - // A mapping of string_views in the source_ and translation_ are provide as a - // pair for external consumption. Each entry corresponding - // to a (source-sentence, target-sentence). - typedef std::vector> SentenceMappings; - const SentenceMappings &getSentenceMappings() const { - return sentenceMappings_; - } - // Return the Quality scores of the translated text. - // Not implemented currently, commenting out. - // const QualityScore &getQualityScore() const { return qualityScore; } + void move(std::string &source, std::string &target, + SentenceMappings &sentenceMappings); - // For development use to benchmark with marian-decoder. - const Histories &getHistories() const { return histories_; } + const Histories &histories() const { return histories_; } + const std::string &source() const { return source_; } + const std::string &translation() const { return translation_; } - // @jerinphilip: Why are these members no longer-private? For move-semantics - // with consistent string_views for bergamot-translator. +private: + void constructTargetProperties(std::vector> &vocabs); + void constructSentenceMappings(SentenceMappings &); std::string source_; std::string translation_; - // Adding the following to complete bergamot-translator spec, redundant while - // sourceMappings_ and targetMappings_ exists or vice-versa. - - SentenceMappings sentenceMappings_; - -private: - // Histories are currently required for interoperability with OutputPrinter - // and OutputCollector and hence comparisons with marian-decoder. - // Future hook to gain alignments. Histories histories_; - - // string_views at the token level. std::vector sourceRanges_; + std::vector targetRanges_; }; } // namespace bergamot } // namespace marian From 0fc6105df49a4e0f05e1d382ea9909776ad3aeec Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Sun, 14 Feb 2021 20:27:53 +0000 Subject: [PATCH 10/39] No more two TranslationResults (sort-of) To avoid confusion, this commit renames marian::bergamot::TranslationResult -> marian::bergamot::Response. Usages of marian::bergamot::TranslationResults are updated across the source to be consistent with the change and get source back working. --- app/main-mts.cpp | 13 ++++++------- app/marian-decoder-new.cpp | 14 ++++++-------- src/translator/TranslationModel.cpp | 10 +++++----- src/translator/TranslationModel.h | 3 ++- src/translator/request.cpp | 11 +++++------ src/translator/request.h | 4 ++-- src/translator/service.cpp | 16 ++++++++-------- src/translator/service.h | 10 +++++----- src/translator/translation_result.cpp | 17 ++++++++--------- src/translator/translation_result.h | 14 ++++++-------- 10 files changed, 53 insertions(+), 59 deletions(-) diff --git a/app/main-mts.cpp b/app/main-mts.cpp index d8e7567..b5a4938 100644 --- a/app/main-mts.cpp +++ b/app/main-mts.cpp @@ -19,14 +19,13 @@ int main(int argc, char *argv[]) { std::ostringstream std_input; std_input << std::cin.rdbuf(); std::string input = std_input.str(); - using marian::bergamot::TranslationResult; + using marian::bergamot::Response; - // Wait on future until TranslationResult is complete - std::future translation_result_future = - service.translate(std::move(input)); - translation_result_future.wait(); - const TranslationResult &translation_result = translation_result_future.get(); - std::cout << translation_result.translation() << std::endl; + // Wait on future until Response is complete + std::future responseFuture = service.translate(std::move(input)); + responseFuture.wait(); + const Response &response = responseFuture.get(); + std::cout << response.translation() << std::endl; // Stop Service. service.stop(); diff --git a/app/marian-decoder-new.cpp b/app/marian-decoder-new.cpp index 6e44fb7..8988310 100644 --- a/app/marian-decoder-new.cpp +++ b/app/marian-decoder-new.cpp @@ -46,16 +46,14 @@ int main(int argc, char *argv[]) { std::ostringstream std_input; std_input << std::cin.rdbuf(); std::string input = std_input.str(); - using marian::bergamot::TranslationResult; + using marian::bergamot::Response; - // Wait on future until TranslationResult is complete - std::future translation_result_future = - service.translate(std::move(input)); - translation_result_future.wait(); - const TranslationResult &translation_result = translation_result_future.get(); + // Wait on future until Response is complete + std::future responseFuture = service.translate(std::move(input)); + responseFuture.wait(); + const Response &response = responseFuture.get(); - marian_decoder_minimal(translation_result.histories(), service.targetVocab(), - options); + marian_decoder_minimal(response.histories(), service.targetVocab(), options); LOG(info, "Total time: {:.5f}s wall", decoderTimer.elapsed()); service.stop(); diff --git a/src/translator/TranslationModel.cpp b/src/translator/TranslationModel.cpp index 9c55422..a5d396e 100644 --- a/src/translator/TranslationModel.cpp +++ b/src/translator/TranslationModel.cpp @@ -69,12 +69,12 @@ TranslationModel::translate(std::vector &&texts, // Collect future as marian::bergamot::TranslationResult auto intermediate = service_.translate(std::move(text)); intermediate.wait(); - auto mTranslationResult(std::move(intermediate.get())); + auto marianResponse(std::move(intermediate.get())); // This mess because marian::string_view != std::string_view std::string source, translation; - marian::bergamot::TranslationResult::SentenceMappings mSentenceMappings; - mTranslationResult.move(source, translation, mSentenceMappings); + marian::bergamot::Response::SentenceMappings mSentenceMappings; + marianResponse.move(source, translation, mSentenceMappings); // Convert to UnifiedAPI::TranslationResult TranslationResult::SentenceMappings sentenceMappings; @@ -86,8 +86,8 @@ TranslationModel::translate(std::vector &&texts, // In place construction. translationResults.emplace_back( - std::move(source), // &&mTranslationResult.source_ - std::move(translation), // &&mTranslationResult.translation_ + std::move(source), // &&marianResponse.source_ + std::move(translation), // &&marianResponse.translation_ std::move(sentenceMappings) // &&sentenceMappings ); } diff --git a/src/translator/TranslationModel.h b/src/translator/TranslationModel.h index c922538..5f590d9 100644 --- a/src/translator/TranslationModel.h +++ b/src/translator/TranslationModel.h @@ -24,7 +24,8 @@ */ class TranslationModel : public AbstractTranslationModel { public: - /* Construct the model using the model configuration options as yaml-formatted string + /* Construct the model using the model configuration options as yaml-formatted + * string */ TranslationModel(const std::string &config); diff --git a/src/translator/request.cpp b/src/translator/request.cpp index a743389..5433699 100644 --- a/src/translator/request.cpp +++ b/src/translator/request.cpp @@ -14,11 +14,11 @@ Request::Request(unsigned int Id, int lineNumberBegin, std::vector> &vocabs, std::string &&source, Segments &&segments, std::vector &&sourceAlignments, - std::promise translationResultPromise) + std::promise responsePromise) : Id_(Id), lineNumberBegin_(lineNumberBegin), vocabs_(&vocabs), source_(std::move(source)), segments_(std::move(segments)), sourceAlignments_(std::move(sourceAlignments)), - response_(std::move(translationResultPromise)) { + response_(std::move(responsePromise)) { counter_ = segments_.size(); histories_.resize(segments_.size(), nullptr); @@ -47,10 +47,9 @@ void Request::processHistory(size_t index, Ptr history) { void Request::completeRequest() { // Request no longer needs to hold the content, can transfer it to - // TranslationResult. - TranslationResult translation_result(std::move(source_), - std::move(sourceAlignments_), - std::move(histories_), *vocabs_); + // Response. + Response translation_result(std::move(source_), std::move(sourceAlignments_), + std::move(histories_), *vocabs_); response_.set_value(std::move(translation_result)); } diff --git a/src/translator/request.h b/src/translator/request.h index eab0d4b..ddd6ccc 100644 --- a/src/translator/request.h +++ b/src/translator/request.h @@ -48,13 +48,13 @@ private: std::vector sourceAlignments_; std::vector> histories_; - std::promise response_; + std::promise response_; public: Request(unsigned int Id, int lineNumberBegin, std::vector> &vocabs_, std::string &&source, Segments &&segments, std::vector &&sourceAlignments, - std::promise translationResultPromise); + std::promise responsePromise); // Obtain the count of tokens in the segment correponding to index. Used to // insert sentence from multiple requests into the corresponding size bucket. diff --git a/src/translator/service.cpp b/src/translator/service.cpp index ef2bacb..4ab539f 100644 --- a/src/translator/service.cpp +++ b/src/translator/service.cpp @@ -28,11 +28,11 @@ Service::Service(Ptr options) } } -std::future Service::translateWithCopy(std::string input) { +std::future Service::translateWithCopy(std::string input) { return translate(std::move(input)); } -std::future Service::translate(std::string &&input) { +std::future Service::translate(std::string &&input) { // Takes in a blob of text. Segments and std::vector are // extracted from the input (blob of text) and used to construct a Request // along with a promise. promise value is set by the worker completing a @@ -49,13 +49,13 @@ std::future Service::translate(std::string &&input) { std::vector sourceAlignments; text_processor_.process(input, segments, sourceAlignments); - std::promise translationResultPromise; - auto future = translationResultPromise.get_future(); + std::promise responsePromise; + auto future = responsePromise.get_future(); - Ptr request = New( - requestId_++, /* lineNumberBegin = */ 0, vocabs_, std::move(input), - std::move(segments), std::move(sourceAlignments), - std::move(translationResultPromise)); + Ptr request = + New(requestId_++, /* lineNumberBegin = */ 0, vocabs_, + std::move(input), std::move(segments), + std::move(sourceAlignments), std::move(responsePromise)); batcher_.addWholeRequest(request); diff --git a/src/translator/service.h b/src/translator/service.h index db01468..6f26bc8 100644 --- a/src/translator/service.h +++ b/src/translator/service.h @@ -25,17 +25,17 @@ class Service { // options = ...; // service = Service(options); // std::string input_blob = "Hello World"; - // std::future + // std::future // response = service.translate(std::move(input_blob)); // response.wait(); - // TranslationResult result = response.get(); + // Response result = response.get(); public: explicit Service(Ptr options); // Constructs new string copying, calls translate internally. - std::future translateWithCopy(std::string input); - std::future translate(std::string &&input); + std::future translateWithCopy(std::string input); + std::future translate(std::string &&input); void stop(); @@ -49,7 +49,7 @@ private: int numWorkers_; // vocabs are used to construct a Request, which later uses it to construct - // TranslationResult (decode from words to string). + // Response (decode from words to string). std::vector> vocabs_; // ORDER DEPENDENCY // Consists of: diff --git a/src/translator/translation_result.cpp b/src/translator/translation_result.cpp index ee147be..58f0926 100644 --- a/src/translator/translation_result.cpp +++ b/src/translator/translation_result.cpp @@ -7,18 +7,17 @@ namespace marian { namespace bergamot { -TranslationResult::TranslationResult(std::string &&source, - std::vector &&sourceRanges, - Histories &&histories, - std::vector> &vocabs) +Response::Response(std::string &&source, + std::vector &&sourceRanges, + Histories &&histories, std::vector> &vocabs) : source_(std::move(source)), sourceRanges_(std::move(sourceRanges)), histories_(std::move(histories)) { constructTargetProperties(vocabs); } -void TranslationResult::move(std::string &source, std::string &translation, - SentenceMappings &sentenceMappings) { +void Response::move(std::string &source, std::string &translation, + SentenceMappings &sentenceMappings) { constructSentenceMappings(sentenceMappings); // Totally illegal stuff. @@ -32,7 +31,7 @@ void TranslationResult::move(std::string &source, std::string &translation, histories_.clear(); } -void TranslationResult::constructTargetProperties( +void Response::constructTargetProperties( std::vector> &vocabs) { std::vector> translationRanges; size_t offset{0}; @@ -67,8 +66,8 @@ void TranslationResult::constructTargetProperties( } } -void TranslationResult::constructSentenceMappings( - TranslationResult::SentenceMappings &sentenceMappings) { +void Response::constructSentenceMappings( + Response::SentenceMappings &sentenceMappings) { for (int i = 0; i < sourceRanges_.size(); i++) { string_view first, last; diff --git a/src/translator/translation_result.h b/src/translator/translation_result.h index 5903145..6ed8927 100644 --- a/src/translator/translation_result.h +++ b/src/translator/translation_result.h @@ -11,22 +11,20 @@ namespace marian { namespace bergamot { -class TranslationResult { +class Response { public: - TranslationResult(std::string &&source, - std::vector &&sourceRanges, - Histories &&histories, - std::vector> &vocabs); + Response(std::string &&source, std::vector &&sourceRanges, + Histories &&histories, std::vector> &vocabs); - TranslationResult(TranslationResult &&other) + Response(Response &&other) : source_(std::move(other.source_)), translation_(std::move(other.translation_)), sourceRanges_(std::move(other.sourceRanges_)), targetRanges_(std::move(other.targetRanges_)), histories_(std::move(other.histories_)){}; - TranslationResult(const TranslationResult &) = delete; - TranslationResult &operator=(const TranslationResult &) = delete; + Response(const Response &) = delete; + Response &operator=(const Response &) = delete; typedef std::vector> SentenceMappings; From 370e9e2fb619b5f45693a3d4e6e3dac1442b6fed Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Sun, 14 Feb 2021 20:35:41 +0000 Subject: [PATCH 11/39] {translation_result -> response}.h; propogates; --- app/main-mts.cpp | 2 +- app/marian-decoder-new.cpp | 2 +- src/translator/CMakeLists.txt | 2 +- src/translator/request.cpp | 2 +- src/translator/request.h | 2 +- src/translator/{translation_result.cpp => response.cpp} | 2 +- src/translator/{translation_result.h => response.h} | 6 +++--- src/translator/service.h | 2 +- 8 files changed, 10 insertions(+), 10 deletions(-) rename src/translator/{translation_result.cpp => response.cpp} (98%) rename src/translator/{translation_result.h => response.h} (91%) diff --git a/app/main-mts.cpp b/app/main-mts.cpp index b5a4938..78967be 100644 --- a/app/main-mts.cpp +++ b/app/main-mts.cpp @@ -7,8 +7,8 @@ #include "common/utils.h" #include "marian.h" #include "translator/parser.h" +#include "translator/response.h" #include "translator/service.h" -#include "translator/translation_result.h" int main(int argc, char *argv[]) { auto cp = marian::bergamot::createConfigParser(); diff --git a/app/marian-decoder-new.cpp b/app/marian-decoder-new.cpp index 8988310..f807909 100644 --- a/app/marian-decoder-new.cpp +++ b/app/marian-decoder-new.cpp @@ -11,8 +11,8 @@ #include "translator/output_collector.h" #include "translator/output_printer.h" #include "translator/parser.h" +#include "translator/response.h" #include "translator/service.h" -#include "translator/translation_result.h" void marian_decoder_minimal(const marian::Histories &histories, marian::Ptr targetVocab, diff --git a/src/translator/CMakeLists.txt b/src/translator/CMakeLists.txt index 16c3db9..c279ab9 100644 --- a/src/translator/CMakeLists.txt +++ b/src/translator/CMakeLists.txt @@ -10,7 +10,7 @@ add_library(bergamot-translator STATIC request.cpp service.cpp batcher.cpp - translation_result.cpp + response.cpp ) target_link_libraries(bergamot-translator marian ssplit) diff --git a/src/translator/request.cpp b/src/translator/request.cpp index 5433699..23bd679 100644 --- a/src/translator/request.cpp +++ b/src/translator/request.cpp @@ -1,7 +1,7 @@ #include "request.h" #include "definitions.h" -#include "translation_result.h" +#include "response.h" #include "common/logging.h" diff --git a/src/translator/request.h b/src/translator/request.h index ddd6ccc..8912a49 100644 --- a/src/translator/request.h +++ b/src/translator/request.h @@ -22,7 +22,7 @@ #define SRC_BERGAMOT_REQUEST_H_ #include "definitions.h" -#include "translation_result.h" +#include "response.h" #include "common/logging.h" #include "data/types.h" diff --git a/src/translator/translation_result.cpp b/src/translator/response.cpp similarity index 98% rename from src/translator/translation_result.cpp rename to src/translator/response.cpp index 58f0926..d40f88d 100644 --- a/src/translator/translation_result.cpp +++ b/src/translator/response.cpp @@ -1,4 +1,4 @@ -#include "translation_result.h" +#include "response.h" #include "common/logging.h" #include "data/alignment.h" diff --git a/src/translator/translation_result.h b/src/translator/response.h similarity index 91% rename from src/translator/translation_result.h rename to src/translator/response.h index 6ed8927..5737717 100644 --- a/src/translator/translation_result.h +++ b/src/translator/response.h @@ -1,5 +1,5 @@ -#ifndef SRC_BERGAMOT_TRANSLATION_RESULT_H_ -#define SRC_BERGAMOT_TRANSLATION_RESULT_H_ +#ifndef SRC_BERGAMOT_RESPONSE_H_ +#define SRC_BERGAMOT_RESPONSE_H_ #include "data/types.h" #include "definitions.h" @@ -49,4 +49,4 @@ private: } // namespace bergamot } // namespace marian -#endif // SRC_BERGAMOT_TRANSLATION_RESULT_H_ +#endif // SRC_BERGAMOT_RESPONSE_H_ diff --git a/src/translator/service.h b/src/translator/service.h index 6f26bc8..38a45c6 100644 --- a/src/translator/service.h +++ b/src/translator/service.h @@ -4,8 +4,8 @@ #include "batch_translator.h" #include "batcher.h" #include "pcqueue.h" +#include "response.h" #include "text_processor.h" -#include "translation_result.h" #include #include From be455a3da101132c5d7c3a283b90cc1cffd8a119 Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Sun, 14 Feb 2021 22:08:17 +0000 Subject: [PATCH 12/39] Straightening multithreading in translator workers BatchTranslators are now held in Service. Threads are separate, and constructed via lambdas. Retaining BatchTranslator class and member function (Probably a matter of taste I guess). This should eliminate complaints in (#10), hopefully. --- src/translator/batch_translator.cpp | 12 +++++------- src/translator/batch_translator.h | 6 ++---- src/translator/service.cpp | 28 +++++++++++++++++++--------- src/translator/service.h | 4 +--- 4 files changed, 27 insertions(+), 23 deletions(-) diff --git a/src/translator/batch_translator.cpp b/src/translator/batch_translator.cpp index 13eb58a..7da63cf 100644 --- a/src/translator/batch_translator.cpp +++ b/src/translator/batch_translator.cpp @@ -10,7 +10,9 @@ namespace bergamot { BatchTranslator::BatchTranslator(DeviceId const device, std::vector> &vocabs, Ptr options) - : device_(device), options_(options), vocabs_(&vocabs) { + : device_(device), options_(options), vocabs_(&vocabs) {} + +void BatchTranslator::initialize() { // Initializes the graph. if (options_->hasAndNotEmpty("shortlist")) { int srcIdx = 0, trgIdx = 1; @@ -93,11 +95,7 @@ void BatchTranslator::translate(Batch &batch) { batch.completeBatch(histories); } -void translation_loop(DeviceId const &device, PCQueue &pcqueue, - std::vector> &vocabs, - Ptr options) { - - BatchTranslator translator(device, vocabs, options); +void BatchTranslator::consumeFrom(PCQueue &pcqueue) { Batch batch; Histories histories; while (true) { @@ -105,7 +103,7 @@ void translation_loop(DeviceId const &device, PCQueue &pcqueue, if (batch.isPoison()) { return; } else { - translator.translate(batch); + translate(batch); } } } diff --git a/src/translator/batch_translator.h b/src/translator/batch_translator.h index 2ee4e04..83b911c 100644 --- a/src/translator/batch_translator.h +++ b/src/translator/batch_translator.h @@ -28,6 +28,8 @@ public: // convenience function for logging. TODO(jerin) std::string _identifier() { return "worker" + std::to_string(device_.no); } void translate(Batch &batch); + void initialize(); + void consumeFrom(PCQueue &pcqueue); private: Ptr options_; @@ -38,10 +40,6 @@ private: Ptr slgen_; }; -void translation_loop(DeviceId const &device, PCQueue &pcqueue, - std::vector> &vocabs, - Ptr options); - } // namespace bergamot } // namespace marian diff --git a/src/translator/service.cpp b/src/translator/service.cpp index 4ab539f..1b33558 100644 --- a/src/translator/service.cpp +++ b/src/translator/service.cpp @@ -13,18 +13,28 @@ Service::Service(Ptr options) text_processor_(vocabs_, options), batcher_(options), pcqueue_(2 * options->get("cpu-threads")) { - if (numWorkers_ > 0) { + if (numWorkers_ == 0) { + // In case workers are 0, a single-translator is created and initialized + // in the main thread. + marian::DeviceId deviceId(/*cpuId=*/0, DeviceType::cpu); + translators_.emplace_back(deviceId, vocabs_, options); + translators_.back().initialize(); + } else { + // If workers specified are greater than 0, translators_ are populated with + // unitialized instances. These are then initialized inside + // individual threads and set to consume from producer-consumer queue. workers_.reserve(numWorkers_); + translators_.reserve(numWorkers_); for (size_t cpuId = 0; cpuId < numWorkers_; cpuId++) { marian::DeviceId deviceId(cpuId, DeviceType::cpu); - workers_.emplace_back(translation_loop, // Function - deviceId, std::ref(pcqueue_), std::ref(vocabs_), - options); + translators_.emplace_back(deviceId, vocabs_, options); + + auto &translator = translators_.back(); + workers_.emplace_back([&translator, this] { + translator.initialize(); + translator.consumeFrom(pcqueue_); + }); } - } else { - marian::DeviceId deviceId(/*cpuId=*/0, DeviceType::cpu); - translator = - UPtr(new BatchTranslator(deviceId, vocabs_, options)); } } @@ -65,7 +75,7 @@ std::future Service::translate(std::string &&input) { // Queue single-threaded Batch batch; while (batcher_ >> batch) { - translator->translate(batch); + translators_[0].translate(batch); } } diff --git a/src/translator/service.h b/src/translator/service.h index 38a45c6..55b754a 100644 --- a/src/translator/service.h +++ b/src/translator/service.h @@ -68,10 +68,8 @@ private: TextProcessor text_processor_; // ORDER DEPENDENCY Batcher batcher_; PCQueue pcqueue_; + std::vector translators_; std::vector workers_; - - // Optional - UPtr translator{nullptr}; }; std::vector> loadVocabularies(Ptr options); From 45a8309c6972b121d62f1e9329267f752b8c796b Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Sun, 14 Feb 2021 22:28:08 +0000 Subject: [PATCH 13/39] Missed translation_result -> response rename --- src/translator/request.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/translator/request.cpp b/src/translator/request.cpp index 23bd679..9317f69 100644 --- a/src/translator/request.cpp +++ b/src/translator/request.cpp @@ -48,9 +48,9 @@ void Request::processHistory(size_t index, Ptr history) { void Request::completeRequest() { // Request no longer needs to hold the content, can transfer it to // Response. - Response translation_result(std::move(source_), std::move(sourceAlignments_), - std::move(histories_), *vocabs_); - response_.set_value(std::move(translation_result)); + Response response(std::move(source_), std::move(sourceAlignments_), + std::move(histories_), *vocabs_); + response_.set_value(std::move(response)); } bool Request::operator<(const Request &b) const { From ca6ca154b9ee74899f1a801a8a3c91972ca10043 Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Mon, 15 Feb 2021 15:22:31 +0000 Subject: [PATCH 14/39] Changing fn name from enqueue to produceTo(pcqueue) --- src/translator/batcher.cpp | 2 +- src/translator/batcher.h | 2 +- src/translator/service.cpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/translator/batcher.cpp b/src/translator/batcher.cpp index 5fdcc3a..9ba0d03 100644 --- a/src/translator/batcher.cpp +++ b/src/translator/batcher.cpp @@ -62,7 +62,7 @@ void Batcher::addWholeRequest(Ptr request) { } } -void Batcher::enqueue(PCQueue &pcqueue) { +void Batcher::produceTo(PCQueue &pcqueue) { Batch batch; while (cleaveBatch(batch)) { pcqueue.ProduceSwap(batch); diff --git a/src/translator/batcher.h b/src/translator/batcher.h index d6b85f3..3427257 100644 --- a/src/translator/batcher.h +++ b/src/translator/batcher.h @@ -21,7 +21,7 @@ public: // which maintains priority among sentences from multiple concurrent requests. void addSentenceWithPriority(RequestSentence &sentence); void addWholeRequest(Ptr request); - void enqueue(PCQueue &pcqueue); + void produceTo(PCQueue &pcqueue); // Loads sentences with sentences compiled from (tentatively) multiple // requests optimizing for both padding and priority. diff --git a/src/translator/service.cpp b/src/translator/service.cpp index 1b33558..96f391c 100644 --- a/src/translator/service.cpp +++ b/src/translator/service.cpp @@ -70,7 +70,7 @@ std::future Service::translate(std::string &&input) { batcher_.addWholeRequest(request); if (numWorkers_ > 0) { - batcher_.enqueue(pcqueue_); + batcher_.produceTo(pcqueue_); } else { // Queue single-threaded Batch batch; From d5a5e754510aeb158fea3e82939426e4d29885ed Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Mon, 15 Feb 2021 20:21:10 +0000 Subject: [PATCH 15/39] Renaming variables; Enhancing documentation --- src/translator/request.cpp | 45 +++++++++++- src/translator/request.h | 142 ++++++++++++++++++++++--------------- src/translator/service.cpp | 6 +- 3 files changed, 130 insertions(+), 63 deletions(-) diff --git a/src/translator/request.cpp b/src/translator/request.cpp index 9317f69..303f9cc 100644 --- a/src/translator/request.cpp +++ b/src/translator/request.cpp @@ -10,14 +10,15 @@ namespace marian { namespace bergamot { +// ----------------------------------------------------------------- Request::Request(unsigned int Id, int lineNumberBegin, std::vector> &vocabs, std::string &&source, Segments &&segments, - std::vector &&sourceAlignments, + std::vector &&sourceTokenRanges, std::promise responsePromise) : Id_(Id), lineNumberBegin_(lineNumberBegin), vocabs_(&vocabs), source_(std::move(source)), segments_(std::move(segments)), - sourceAlignments_(std::move(sourceAlignments)), + sourceTokenRanges_(std::move(sourceTokenRanges)), response_(std::move(responsePromise)) { counter_ = segments_.size(); @@ -48,7 +49,7 @@ void Request::processHistory(size_t index, Ptr history) { void Request::completeRequest() { // Request no longer needs to hold the content, can transfer it to // Response. - Response response(std::move(source_), std::move(sourceAlignments_), + Response response(std::move(source_), std::move(sourceTokenRanges_), std::move(histories_), *vocabs_); response_.set_value(std::move(response)); } @@ -58,6 +59,8 @@ bool Request::operator<(const Request &b) const { return Id_ < b.Id_; } +// ------------------------------------------------------------------ + RequestSentence::RequestSentence(size_t index, Ptr request) : index_(index), request_(request) {} @@ -87,5 +90,41 @@ bool operator<(const RequestSentence &a, const RequestSentence &b) { return a.request_ < b.request_; } +// ---------------------------------------------------------------------- + +void Batch::reset() { + Id_ = 0; + sentences_.clear(); +} + +void Batch::log() { + int numTokens{0}, maxLength{0}; + for (auto &sentence : sentences_) { + numTokens += sentence.numTokens(); + maxLength = std::max(maxLength, static_cast(sentence.numTokens())); + } + + LOG(info, "Batch(Id_={}, tokens={}, max-length={}, sentences_={})", Id_, + numTokens, maxLength, sentences_.size()); +} + +void Batch::add(const RequestSentence &sentence) { + sentences_.push_back(sentence); +} + +void Batch::setId(int Id) { + assert(Id > 0); + Id_ = Id; + if (Id % 500 == 0) { + log(); + } +} + +void Batch::completeBatch(const Histories &histories) { + for (int i = 0; i < sentences_.size(); i++) { + sentences_[i].completeSentence(histories[i]); + } +} + } // namespace bergamot } // namespace marian diff --git a/src/translator/request.h b/src/translator/request.h index 8912a49..095a03c 100644 --- a/src/translator/request.h +++ b/src/translator/request.h @@ -3,20 +3,19 @@ // // Request: holds the input blob of a text, Segments (vector) which are // to go to the batching mechanism and alignments between the processed -// segments and the input blob (sourceAlignments). In addition, Request takes +// segments and the input blob (sourceTokenRanges). In addition, Request takes // care of the barrier which fires when all the Segments in a request are done -// translating by the workers (BatchTranslator). Request is to be extended with -// notions of Priority (sequence, user-given). +// translating by the workers (BatchTranslator). +// TODO(jerinphilip): Extend Request with notions of Priority (sequence, +// user-given). // -// RequestSentence: is a tuple of (index, Request*). This provides the +// RequestSentence: is a tuple of (index, Ptr). This provides the // batching mechanism access to the segment within the request. The backref to // Request allows event triggering the barrier upon completion of the last // sentence by a worker. // -// PCItem: is a vector of RequestSentences and a batchNumber, which is what the -// PCQueue holds. The batches are constructed from segments returned by a -// RequestSentence. Can be enhanced with paddingSize, countTokens eventually for -// logging. +// Batch: is a vector of RequestSentences tagged with a batchNumber, which is +// what the PCQueue holds. Batch is "produced" by the Batcher. #ifndef SRC_BERGAMOT_REQUEST_H_ #define SRC_BERGAMOT_REQUEST_H_ @@ -37,23 +36,10 @@ namespace marian { namespace bergamot { class Request { -private: - unsigned int Id_; - int lineNumberBegin_; - std::string source_; - std::atomic counter_; - std::vector> *vocabs_; - - Segments segments_; - std::vector sourceAlignments_; - std::vector> histories_; - - std::promise response_; - public: Request(unsigned int Id, int lineNumberBegin, std::vector> &vocabs_, std::string &&source, - Segments &&segments, std::vector &&sourceAlignments, + Segments &&segments, std::vector &&sourceTokenRanges, std::promise responsePromise); // Obtain the count of tokens in the segment correponding to index. Used to @@ -68,7 +54,8 @@ public: // several requests. Segment getSegment(size_t index) const; - // For notions of priority among requests (used to enable in Batcher). + // For notions of priority among requests, used to enable std::set in + // Batcher. bool operator<(const Request &request) const; // Processes a history obtained after translating in a heterogenous batch @@ -77,20 +64,60 @@ public: // On completion of last segment, sets value of the promise. void completeRequest(); + +private: + unsigned int Id_; + int lineNumberBegin_; + + // Multiple translation-workers can concurrently access the same Request. The + // following atomic atomically operates on the variable holding sentences + // remaining to be translated. + std::atomic counter_; + + // source_ holds the source string to be translated. segments_ hold the + // sentences generated from source_ in vector. sourceTokenRanges_ are + // string_views of the text corresponding to these words, pointing to + // sequences in source_. histories_ is a buffer which eventually stores the + // translations of each segment in the corresponding index. + std::string source_; + Segments segments_; + std::vector sourceTokenRanges_; + std::vector> histories_; + + // Members above are moved into newly constructed Response on completion + // of translation of all segments. The promise below is set to this Response + // value. future to this promise is made available to the user through + // Service. + std::promise response_; + + // Constructing Response requires the vocabs_ used to generate Request. + std::vector> *vocabs_; }; class RequestSentence { -private: - size_t index_; - Ptr request_; + // A RequestSentence provides a view to a sentence within a Request. Existence + // of this class allows the sentences and associated information to be kept + // within Request. public: RequestSentence(size_t, Ptr); size_t numTokens() const; + + // lineNumber in Request, used for matching marian-decoder. SentenceTuple + // requires lineNumber to be set for Corpus based batches. size_t lineNumber() const; + + // Accessor to the segment represented by the RequestSentence. Segment getUnderlyingSegment() const; + + // Forwards call to Request, checking for completion. void completeSentence(Ptr history); + friend bool operator<(const RequestSentence &a, const RequestSentence &b); + +private: + size_t index_; + Ptr request_; }; typedef std::vector RequestSentences; @@ -98,47 +125,48 @@ typedef std::vector RequestSentences; class Batch { public: Batch() { reset(); } - void reset() { - Id_ = 0; - sentences_.clear(); - } - // Convenience function to determine poison. - bool isPoison() { return (Id_ == -1); } + // Reset is required to reuse the same batch by consumer. + void reset(); + + // Methods to construct and determine poison. static Batch poison() { Batch poison_; poison_.Id_ = -1; return poison_; } + bool isPoison() const { return (Id_ == -1); } - void log() { - int numTokens{0}, maxLength{0}; - for (auto &sentence : sentences_) { - numTokens += sentence.numTokens(); - maxLength = std::max(maxLength, static_cast(sentence.numTokens())); - } + size_t size() const { return sentences_.size(); } - LOG(info, "Batch(Id_={}, tokens={}, max-length={}, sentences_={})", Id_, - numTokens, maxLength, sentences_.size()); - } + // Accessors to load data into a batch. Use add(...) to add sentences into a + // batch. Once complete with a legal batch, use setId to set Id_ accordingly. + // setId only allows setting Id > 0. For use in Batcher, which acts as a + // producer to a PCQueue holding "Batch"es. + // + // Id_ = + // -1 : Batch::Poison + // 0 : Empty Batch + // >0 : Legal batch containing sentences - void add(const RequestSentence &sentence) { sentences_.push_back(sentence); } - - size_t size() { return sentences_.size(); } - - void setId(int Id) { - assert(Id > 0); - Id_ = Id; - if (Id % 500 == 0) { - log(); - } - } + void add(const RequestSentence &sentence); + void setId(int Id); + // Accessors to read from a Batch. For use in BatchTranslator (consumer on a + // PCQueue holding batches). + // + // sentences() are used to access sentences to construct marian internal + // batch. const RequestSentences &sentences() { return sentences_; } - void completeBatch(const Histories &histories) { - for (int i = 0; i < sentences_.size(); i++) { - sentences_[i].completeSentence(histories[i]); - } - } + + // On obtaining Histories after translating a batch, completeBatch can be + // called with Histories , which forwards the call to Request through + // RequestSentence and triggers completion, by setting the promised value to + // the future given to client. + void completeBatch(const Histories &histories); + + // Convenience function to log batch-statistics. numTokens, max-length. + // TODO(jerinphilip): Use to log and report packing efficiency. + void log(); private: int Id_; diff --git a/src/translator/service.cpp b/src/translator/service.cpp index 96f391c..2163eef 100644 --- a/src/translator/service.cpp +++ b/src/translator/service.cpp @@ -56,8 +56,8 @@ std::future Service::translate(std::string &&input) { // returns future corresponding to the promise. Segments segments; - std::vector sourceAlignments; - text_processor_.process(input, segments, sourceAlignments); + std::vector sourceTokenRanges; + text_processor_.process(input, segments, sourceTokenRanges); std::promise responsePromise; auto future = responsePromise.get_future(); @@ -65,7 +65,7 @@ std::future Service::translate(std::string &&input) { Ptr request = New(requestId_++, /* lineNumberBegin = */ 0, vocabs_, std::move(input), std::move(segments), - std::move(sourceAlignments), std::move(responsePromise)); + std::move(sourceTokenRanges), std::move(responsePromise)); batcher_.addWholeRequest(request); From 65e74069709784614d6936a1ea55623ee001d04b Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Tue, 16 Feb 2021 17:00:53 +0000 Subject: [PATCH 16/39] Comments and lazy stuff to response --- app/main-mts.cpp | 2 +- src/translator/response.cpp | 30 +++++++++++-------- src/translator/response.h | 59 +++++++++++++++++++++++++++++++++---- 3 files changed, 72 insertions(+), 19 deletions(-) diff --git a/app/main-mts.cpp b/app/main-mts.cpp index 78967be..a8382b2 100644 --- a/app/main-mts.cpp +++ b/app/main-mts.cpp @@ -24,7 +24,7 @@ int main(int argc, char *argv[]) { // Wait on future until Response is complete std::future responseFuture = service.translate(std::move(input)); responseFuture.wait(); - const Response &response = responseFuture.get(); + Response response = responseFuture.get(); std::cout << response.translation() << std::endl; // Stop Service. diff --git a/src/translator/response.cpp b/src/translator/response.cpp index d40f88d..6af089d 100644 --- a/src/translator/response.cpp +++ b/src/translator/response.cpp @@ -11,16 +11,12 @@ Response::Response(std::string &&source, std::vector &&sourceRanges, Histories &&histories, std::vector> &vocabs) : source_(std::move(source)), sourceRanges_(std::move(sourceRanges)), - histories_(std::move(histories)) { - - constructTargetProperties(vocabs); -} + histories_(std::move(histories)), vocabs_(&vocabs) {} void Response::move(std::string &source, std::string &translation, SentenceMappings &sentenceMappings) { constructSentenceMappings(sentenceMappings); - // Totally illegal stuff. source = std::move(source_); translation = std::move(translation_); @@ -31,18 +27,23 @@ void Response::move(std::string &source, std::string &translation, histories_.clear(); } -void Response::constructTargetProperties( - std::vector> &vocabs) { - std::vector> translationRanges; +void Response::constructTranslation() { + + // In a first step, the decoded units (individual senteneces) are compiled + // into a huge string. This is done by computing indices first and appending + // to the string as each sentences are decoded. + std::vector> translationRanges; + size_t offset{0}; bool first{true}; + for (auto &history : histories_) { // TODO(jerin): Change hardcode of nBest = 1 NBestList onebest = history->nBest(1); Result result = onebest[0]; // Expecting only one result; Words words = std::get<0>(result); - std::string decoded = (vocabs.back())->decode(words); + std::string decoded = vocabs_->back()->decode(words); if (first) { first = false; } else { @@ -55,13 +56,18 @@ void Response::constructTargetProperties( offset += decoded.size(); } - // TODO(@jerinphilip): - // Currently considers target tokens as whole text. Needs - // to be further enhanced in marian-dev to extract alignments. + // Once the entire string is constructed, there are no further possibility of + // reallocation in the string's storage, the indices are converted into + // string_views. + for (auto &range : translationRanges) { + // TODO(@jerinphilip): Currently considers target tokens as whole text. + // Needs to be further enhanced in marian-dev to extract alignments. std::vector targetMappings; + const char *begin = &translation_[range.first]; targetMappings.emplace_back(begin, range.second); + targetRanges_.push_back(std::move(targetMappings)); } } diff --git a/src/translator/response.h b/src/translator/response.h index 5737717..66891cc 100644 --- a/src/translator/response.h +++ b/src/translator/response.h @@ -12,10 +12,32 @@ namespace marian { namespace bergamot { class Response { + // Response is a marian internal class (not a bergamot-translator class) + // holding source blob of text, vector of TokenRanges corresponding to each + // sentence in the source text blob and histories obtained from translating + // these sentences. + // + // This class provides an API at a higher level in comparison to History to + // access translations and additionally use string_view manipulations to + // recover structure in translation from source-text's structure known through + // reference string and string_view. As many of these computations are not + // required until invoked, they are computed as required and stored in data + // members where it makes sense to do so (translation,translationTokenRanges). + // + // Examples of such use-cases are: + // translation() + // translationInSourceStructure() TODO(@jerinphilip) + // alignment(idx) TODO(@jerinphilip) + // sentenceMappings (for bergamot-translator) + public: Response(std::string &&source, std::vector &&sourceRanges, - Histories &&histories, std::vector> &vocabs); + Histories &&histories, + // Required for constructing translation and TokenRanges within + // translation lazily. + std::vector> &vocabs); + // Move constructor. Response(Response &&other) : source_(std::move(other.source_)), translation_(std::move(other.translation_)), @@ -23,27 +45,52 @@ public: targetRanges_(std::move(other.targetRanges_)), histories_(std::move(other.histories_)){}; + // Prevents CopyConstruction and CopyAssignment. sourceRanges_ is constituted + // by string_view and copying invalidates the data member. Response(const Response &) = delete; Response &operator=(const Response &) = delete; typedef std::vector> SentenceMappings; - void move(std::string &source, std::string &target, + // Moves source sentence into source, translated text into translation. + // Pairs of string_views to corresponding sentences in + // source and translation are loaded into sentenceMappings. These string_views + // reference the new source and translation. + // + // Calling move() invalidates the Response object as ownership is transferred. + // Exists for moving strc + void move(std::string &source, std::string &translation, SentenceMappings &sentenceMappings); const Histories &histories() const { return histories_; } const std::string &source() const { return source_; } - const std::string &translation() const { return translation_; } + const std::string &translation() { + if (!translationConstructed) { + constructTranslation(); + } + return translation_; + } + + // A convenience function provided to return translated text placed within + // source's structure. This is useful when the source text is a multi-line + // paragraph or string_views extracted from structured text like HTML and it's + // desirable to place the individual sentences in the locations of the source + // sentences. + // const std::string translationInSourceStructure(); + // const PendingAlignmentType alignment(size_t idx); private: - void constructTargetProperties(std::vector> &vocabs); + void constructTranslation(); void constructSentenceMappings(SentenceMappings &); std::string source_; - std::string translation_; - Histories histories_; std::vector sourceRanges_; + Histories histories_; + + std::vector> *vocabs_{nullptr}; + bool translationConstructed{false}; + std::string translation_; std::vector targetRanges_; }; } // namespace bergamot From 4c8b655ac5c46c6eaaafa6dd3eea32232866a182 Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Tue, 16 Feb 2021 19:46:40 +0000 Subject: [PATCH 17/39] Batch cleanup Moves Batch into batch.{h,cpp}. - Id_ no longer used due to overflow concerns. (#27) - size_t for places where signed integer is not preferred. - Adjustments to response.{h,cpp} --- src/translator/CMakeLists.txt | 1 + src/translator/batch.cpp | 28 +++++++++++++++ src/translator/batch.h | 52 +++++++++++++++++++++++++++ src/translator/batch_translator.cpp | 1 + src/translator/batch_translator.h | 1 + src/translator/batcher.cpp | 12 +++---- src/translator/batcher.h | 1 + src/translator/request.cpp | 34 ------------------ src/translator/request.h | 54 ----------------------------- src/translator/response.cpp | 17 +++++++-- src/translator/response.h | 11 +++--- src/translator/service.cpp | 1 + 12 files changed, 109 insertions(+), 104 deletions(-) create mode 100644 src/translator/batch.cpp create mode 100644 src/translator/batch.h diff --git a/src/translator/CMakeLists.txt b/src/translator/CMakeLists.txt index c279ab9..e83c615 100644 --- a/src/translator/CMakeLists.txt +++ b/src/translator/CMakeLists.txt @@ -11,6 +11,7 @@ add_library(bergamot-translator STATIC service.cpp batcher.cpp response.cpp + batch.cpp ) target_link_libraries(bergamot-translator marian ssplit) diff --git a/src/translator/batch.cpp b/src/translator/batch.cpp new file mode 100644 index 0000000..973762f --- /dev/null +++ b/src/translator/batch.cpp @@ -0,0 +1,28 @@ +#include "batch.h" +#include "request.h" + +namespace marian { +namespace bergamot { + +void Batch::log() { + size_t numTokens{0}, maxLength{0}; + for (auto &sentence : sentences_) { + numTokens += sentence.numTokens(); + maxLength = std::max(maxLength, static_cast(sentence.numTokens())); + } + + LOG(info, "Batch(tokens={}, max-length={}, sentences_={})", numTokens, + maxLength, sentences_.size()); +} + +void Batch::add(const RequestSentence &sentence) { + sentences_.push_back(sentence); +} + +void Batch::completeBatch(const Histories &histories) { + for (int i = 0; i < sentences_.size(); i++) { + sentences_[i].completeSentence(histories[i]); + } +} +} // namespace bergamot +} // namespace marian diff --git a/src/translator/batch.h b/src/translator/batch.h new file mode 100644 index 0000000..5f86a2f --- /dev/null +++ b/src/translator/batch.h @@ -0,0 +1,52 @@ +#ifndef SRC_BERGAMOT_BATCH_H +#define SRC_BERGAMOT_BATCH_H + +#include "request.h" +#include "translator/beam_search.h" + +namespace marian { +namespace bergamot { + +class Batch { +public: + Batch() {} + void clear() { sentences_.clear(); } + + // Methods to construct and determine poison. + static Batch poison() { + Batch batch; + batch.poison_ = true; + return batch; + } + + bool isPoison() const { return poison_; } + + size_t size() const { return sentences_.size(); } + + void add(const RequestSentence &sentence); + + // Accessors to read from a Batch. For use in BatchTranslator (consumer on a + // PCQueue holding batches). + // + // sentences() are used to access sentences to construct marian internal + // batch. + const RequestSentences &sentences() { return sentences_; } + + // On obtaining Histories after translating a batch, completeBatch can be + // called with Histories , which forwards the call to Request through + // RequestSentence and triggers completion, by setting the promised value to + // the future given to client. + void completeBatch(const Histories &histories); + + // Convenience function to log batch-statistics. numTokens, max-length. + void log(); + +private: + bool poison_{false}; + RequestSentences sentences_; +}; + +} // namespace bergamot +} // namespace marian + +#endif // SRC_BERGAMOT_BATCH_H_ diff --git a/src/translator/batch_translator.cpp b/src/translator/batch_translator.cpp index 7da63cf..2679f02 100644 --- a/src/translator/batch_translator.cpp +++ b/src/translator/batch_translator.cpp @@ -1,4 +1,5 @@ #include "batch_translator.h" +#include "batch.h" #include "common/logging.h" #include "data/corpus.h" #include "data/text_input.h" diff --git a/src/translator/batch_translator.h b/src/translator/batch_translator.h index 83b911c..721cee4 100644 --- a/src/translator/batch_translator.h +++ b/src/translator/batch_translator.h @@ -4,6 +4,7 @@ #include #include +#include "batch.h" #include "common/utils.h" #include "data/shortlist.h" #include "definitions.h" diff --git a/src/translator/batcher.cpp b/src/translator/batcher.cpp index 9ba0d03..657a78a 100644 --- a/src/translator/batcher.cpp +++ b/src/translator/batcher.cpp @@ -1,4 +1,5 @@ #include "batcher.h" +#include "batch.h" #include "common/logging.h" #include @@ -27,7 +28,7 @@ bool Batcher::cleaveBatch(Batch &batch) { // has to be enhanced with optimizing over priority. The baseline // implementation should at least be as fast as marian's maxi-batch with full // corpus size as maxi-batch size. - batch.reset(); + batch.clear(); int paddedBatchSize = 0; for (int length = 0; length < bucket_.size(); length++) { @@ -41,18 +42,13 @@ bool Batcher::cleaveBatch(Batch &batch) { } else { // Check if elements exist assert(batch.size() > 0); - batch.setId(++batchNumber_); return true; } } } - if (batch.size()) { - batch.setId(++batchNumber_); - return true; - } else { - return false; - } + bool isValidBatch = batch.size() > 0; + return isValidBatch; } void Batcher::addWholeRequest(Ptr request) { diff --git a/src/translator/batcher.h b/src/translator/batcher.h index 3427257..adf2dbe 100644 --- a/src/translator/batcher.h +++ b/src/translator/batcher.h @@ -1,6 +1,7 @@ #ifndef SRC_BERGAMOT_BATCHER_H_ #define SRC_BERGAMOT_BATCHER_H_ +#include "batch.h" #include "common/options.h" #include "data/corpus_base.h" #include "definitions.h" diff --git a/src/translator/request.cpp b/src/translator/request.cpp index 303f9cc..82c4868 100644 --- a/src/translator/request.cpp +++ b/src/translator/request.cpp @@ -92,39 +92,5 @@ bool operator<(const RequestSentence &a, const RequestSentence &b) { // ---------------------------------------------------------------------- -void Batch::reset() { - Id_ = 0; - sentences_.clear(); -} - -void Batch::log() { - int numTokens{0}, maxLength{0}; - for (auto &sentence : sentences_) { - numTokens += sentence.numTokens(); - maxLength = std::max(maxLength, static_cast(sentence.numTokens())); - } - - LOG(info, "Batch(Id_={}, tokens={}, max-length={}, sentences_={})", Id_, - numTokens, maxLength, sentences_.size()); -} - -void Batch::add(const RequestSentence &sentence) { - sentences_.push_back(sentence); -} - -void Batch::setId(int Id) { - assert(Id > 0); - Id_ = Id; - if (Id % 500 == 0) { - log(); - } -} - -void Batch::completeBatch(const Histories &histories) { - for (int i = 0; i < sentences_.size(); i++) { - sentences_[i].completeSentence(histories[i]); - } -} - } // namespace bergamot } // namespace marian diff --git a/src/translator/request.h b/src/translator/request.h index 095a03c..8aea3d8 100644 --- a/src/translator/request.h +++ b/src/translator/request.h @@ -13,9 +13,6 @@ // batching mechanism access to the segment within the request. The backref to // Request allows event triggering the barrier upon completion of the last // sentence by a worker. -// -// Batch: is a vector of RequestSentences tagged with a batchNumber, which is -// what the PCQueue holds. Batch is "produced" by the Batcher. #ifndef SRC_BERGAMOT_REQUEST_H_ #define SRC_BERGAMOT_REQUEST_H_ @@ -122,57 +119,6 @@ private: typedef std::vector RequestSentences; -class Batch { -public: - Batch() { reset(); } - // Reset is required to reuse the same batch by consumer. - void reset(); - - // Methods to construct and determine poison. - static Batch poison() { - Batch poison_; - poison_.Id_ = -1; - return poison_; - } - bool isPoison() const { return (Id_ == -1); } - - size_t size() const { return sentences_.size(); } - - // Accessors to load data into a batch. Use add(...) to add sentences into a - // batch. Once complete with a legal batch, use setId to set Id_ accordingly. - // setId only allows setting Id > 0. For use in Batcher, which acts as a - // producer to a PCQueue holding "Batch"es. - // - // Id_ = - // -1 : Batch::Poison - // 0 : Empty Batch - // >0 : Legal batch containing sentences - - void add(const RequestSentence &sentence); - void setId(int Id); - - // Accessors to read from a Batch. For use in BatchTranslator (consumer on a - // PCQueue holding batches). - // - // sentences() are used to access sentences to construct marian internal - // batch. - const RequestSentences &sentences() { return sentences_; } - - // On obtaining Histories after translating a batch, completeBatch can be - // called with Histories , which forwards the call to Request through - // RequestSentence and triggers completion, by setting the promised value to - // the future given to client. - void completeBatch(const Histories &histories); - - // Convenience function to log batch-statistics. numTokens, max-length. - // TODO(jerinphilip): Use to log and report packing efficiency. - void log(); - -private: - int Id_; - RequestSentences sentences_; -}; - } // namespace bergamot } // namespace marian diff --git a/src/translator/response.cpp b/src/translator/response.cpp index 6af089d..97af625 100644 --- a/src/translator/response.cpp +++ b/src/translator/response.cpp @@ -16,7 +16,11 @@ Response::Response(std::string &&source, void Response::move(std::string &source, std::string &translation, SentenceMappings &sentenceMappings) { + // Construct required stuff first. + constructTranslation(); constructSentenceMappings(sentenceMappings); + + // Move content out. source = std::move(source_); translation = std::move(translation_); @@ -28,6 +32,13 @@ void Response::move(std::string &source, std::string &translation, } void Response::constructTranslation() { + if (translationConstructed_) { + return; + } + + // Reserving length at least as much as source_ seems like a reasonable thing + // to do to avoid reallocations. + translation_.reserve(source_.size()); // In a first step, the decoded units (individual senteneces) are compiled // into a huge string. This is done by computing indices first and appending @@ -43,7 +54,8 @@ void Response::constructTranslation() { Result result = onebest[0]; // Expecting only one result; Words words = std::get<0>(result); - std::string decoded = vocabs_->back()->decode(words); + auto targetVocab = vocabs_->back(); + std::string decoded = targetVocab->decode(words); if (first) { first = false; } else { @@ -67,9 +79,10 @@ void Response::constructTranslation() { const char *begin = &translation_[range.first]; targetMappings.emplace_back(begin, range.second); - targetRanges_.push_back(std::move(targetMappings)); } + + translationConstructed_ = true; } void Response::constructSentenceMappings( diff --git a/src/translator/response.h b/src/translator/response.h index 66891cc..54e14fb 100644 --- a/src/translator/response.h +++ b/src/translator/response.h @@ -43,7 +43,8 @@ public: translation_(std::move(other.translation_)), sourceRanges_(std::move(other.sourceRanges_)), targetRanges_(std::move(other.targetRanges_)), - histories_(std::move(other.histories_)){}; + histories_(std::move(other.histories_)), + vocabs_(std::move(other.vocabs_)){}; // Prevents CopyConstruction and CopyAssignment. sourceRanges_ is constituted // by string_view and copying invalidates the data member. @@ -66,9 +67,7 @@ public: const Histories &histories() const { return histories_; } const std::string &source() const { return source_; } const std::string &translation() { - if (!translationConstructed) { - constructTranslation(); - } + constructTranslation(); return translation_; } @@ -88,8 +87,8 @@ private: std::vector sourceRanges_; Histories histories_; - std::vector> *vocabs_{nullptr}; - bool translationConstructed{false}; + std::vector> *vocabs_; + bool translationConstructed_{false}; std::string translation_; std::vector targetRanges_; }; diff --git a/src/translator/service.cpp b/src/translator/service.cpp index 2163eef..151fcb6 100644 --- a/src/translator/service.cpp +++ b/src/translator/service.cpp @@ -1,4 +1,5 @@ #include "service.h" +#include "batch.h" #include "definitions.h" #include From 9c907ea605af4a568fb945272247ada8e70a6a9c Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Tue, 16 Feb 2021 20:04:30 +0000 Subject: [PATCH 18/39] another int to size_t --- src/translator/batch.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/translator/batch.cpp b/src/translator/batch.cpp index 973762f..82ebbfb 100644 --- a/src/translator/batch.cpp +++ b/src/translator/batch.cpp @@ -20,7 +20,7 @@ void Batch::add(const RequestSentence &sentence) { } void Batch::completeBatch(const Histories &histories) { - for (int i = 0; i < sentences_.size(); i++) { + for (size_t i = 0; i < sentences_.size(); i++) { sentences_[i].completeSentence(histories[i]); } } From d7556bc1681048856e8e3e5f83087b2856221c0e Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Wed, 17 Feb 2021 00:31:44 +0000 Subject: [PATCH 19/39] SentenceRanges: Class to work with string_views Adds SentenceRanges in sentence_ranges.{h,cpp} and propogates use of the class into the rest of the pipeline. SentenceRanges previously a vector> is now converted into a flat single vector. Annotations marking sentence boundaries are additionally stored in the class, enabling sentence string_view access through methods. --- src/translator/CMakeLists.txt | 1 + src/translator/request.cpp | 9 +++--- src/translator/request.h | 7 ++-- src/translator/response.cpp | 25 ++++---------- src/translator/response.h | 7 ++-- src/translator/sentence_ranges.cpp | 46 ++++++++++++++++++++++++++ src/translator/sentence_ranges.h | 52 ++++++++++++++++++++++++++++++ src/translator/service.cpp | 13 ++++---- src/translator/text_processor.cpp | 24 +++++++------- src/translator/text_processor.h | 10 +++--- 10 files changed, 143 insertions(+), 51 deletions(-) create mode 100644 src/translator/sentence_ranges.cpp create mode 100644 src/translator/sentence_ranges.h diff --git a/src/translator/CMakeLists.txt b/src/translator/CMakeLists.txt index e83c615..07d7ef0 100644 --- a/src/translator/CMakeLists.txt +++ b/src/translator/CMakeLists.txt @@ -12,6 +12,7 @@ add_library(bergamot-translator STATIC batcher.cpp response.cpp batch.cpp + sentence_ranges.cpp ) target_link_libraries(bergamot-translator marian ssplit) diff --git a/src/translator/request.cpp b/src/translator/request.cpp index 82c4868..8c858e6 100644 --- a/src/translator/request.cpp +++ b/src/translator/request.cpp @@ -1,5 +1,5 @@ #include "request.h" - +#include "sentence_ranges.h" #include "definitions.h" #include "response.h" @@ -13,12 +13,11 @@ namespace bergamot { // ----------------------------------------------------------------- Request::Request(unsigned int Id, int lineNumberBegin, std::vector> &vocabs, std::string &&source, - Segments &&segments, - std::vector &&sourceTokenRanges, + Segments &&segments, SentenceRanges &&sourceRanges, std::promise responsePromise) : Id_(Id), lineNumberBegin_(lineNumberBegin), vocabs_(&vocabs), source_(std::move(source)), segments_(std::move(segments)), - sourceTokenRanges_(std::move(sourceTokenRanges)), + sourceRanges_(std::move(sourceRanges)), response_(std::move(responsePromise)) { counter_ = segments_.size(); @@ -49,7 +48,7 @@ void Request::processHistory(size_t index, Ptr history) { void Request::completeRequest() { // Request no longer needs to hold the content, can transfer it to // Response. - Response response(std::move(source_), std::move(sourceTokenRanges_), + Response response(std::move(source_), std::move(sourceRanges_), std::move(histories_), *vocabs_); response_.set_value(std::move(response)); } diff --git a/src/translator/request.h b/src/translator/request.h index 8aea3d8..0bc3c27 100644 --- a/src/translator/request.h +++ b/src/translator/request.h @@ -17,6 +17,7 @@ #ifndef SRC_BERGAMOT_REQUEST_H_ #define SRC_BERGAMOT_REQUEST_H_ +#include "sentence_ranges.h" #include "definitions.h" #include "response.h" @@ -36,7 +37,7 @@ class Request { public: Request(unsigned int Id, int lineNumberBegin, std::vector> &vocabs_, std::string &&source, - Segments &&segments, std::vector &&sourceTokenRanges, + Segments &&segments, SentenceRanges &&sourceTokenRanges, std::promise responsePromise); // Obtain the count of tokens in the segment correponding to index. Used to @@ -72,13 +73,13 @@ private: std::atomic counter_; // source_ holds the source string to be translated. segments_ hold the - // sentences generated from source_ in vector. sourceTokenRanges_ are + // sentences generated from source_ in vector. sourceRanges_ are // string_views of the text corresponding to these words, pointing to // sequences in source_. histories_ is a buffer which eventually stores the // translations of each segment in the corresponding index. std::string source_; Segments segments_; - std::vector sourceTokenRanges_; + SentenceRanges sourceRanges_; std::vector> histories_; // Members above are moved into newly constructed Response on completion diff --git a/src/translator/response.cpp b/src/translator/response.cpp index 97af625..b731755 100644 --- a/src/translator/response.cpp +++ b/src/translator/response.cpp @@ -1,4 +1,5 @@ #include "response.h" +#include "sentence_ranges.h" #include "common/logging.h" #include "data/alignment.h" @@ -7,8 +8,7 @@ namespace marian { namespace bergamot { -Response::Response(std::string &&source, - std::vector &&sourceRanges, +Response::Response(std::string &&source, SentenceRanges &&sourceRanges, Histories &&histories, std::vector> &vocabs) : source_(std::move(source)), sourceRanges_(std::move(sourceRanges)), histories_(std::move(histories)), vocabs_(&vocabs) {} @@ -79,7 +79,7 @@ void Response::constructTranslation() { const char *begin = &translation_[range.first]; targetMappings.emplace_back(begin, range.second); - targetRanges_.push_back(std::move(targetMappings)); + targetRanges_.addSentence(targetMappings); } translationConstructed_ = true; @@ -88,21 +88,10 @@ void Response::constructTranslation() { void Response::constructSentenceMappings( Response::SentenceMappings &sentenceMappings) { - for (int i = 0; i < sourceRanges_.size(); i++) { - string_view first, last; - - // Handle source-sentence - first = sourceRanges_[i].front(); - last = sourceRanges_[i].back(); - string_view src_sentence(first.data(), last.end() - first.begin()); - - // Handle target-sentence - first = targetRanges_[i].front(); - last = targetRanges_[i].back(); - string_view tgt_sentence(first.data(), last.end() - first.begin()); - - // Add both into sentence-mappings - sentenceMappings.emplace_back(src_sentence, tgt_sentence); + for (size_t i = 0; i < sourceRanges_.numSentences(); i++) { + string_view src = sourceRanges_.sentence(i); + string_view tgt = targetRanges_.sentence(i); + sentenceMappings.emplace_back(src, tgt); } } } // namespace bergamot diff --git a/src/translator/response.h b/src/translator/response.h index 54e14fb..17fee05 100644 --- a/src/translator/response.h +++ b/src/translator/response.h @@ -1,6 +1,7 @@ #ifndef SRC_BERGAMOT_RESPONSE_H_ #define SRC_BERGAMOT_RESPONSE_H_ +#include "sentence_ranges.h" #include "data/types.h" #include "definitions.h" #include "translator/beam_search.h" @@ -31,7 +32,7 @@ class Response { // sentenceMappings (for bergamot-translator) public: - Response(std::string &&source, std::vector &&sourceRanges, + Response(std::string &&source, SentenceRanges &&sourceRanges, Histories &&histories, // Required for constructing translation and TokenRanges within // translation lazily. @@ -84,13 +85,13 @@ private: void constructSentenceMappings(SentenceMappings &); std::string source_; - std::vector sourceRanges_; + SentenceRanges sourceRanges_; Histories histories_; std::vector> *vocabs_; bool translationConstructed_{false}; std::string translation_; - std::vector targetRanges_; + SentenceRanges targetRanges_; }; } // namespace bergamot } // namespace marian diff --git a/src/translator/sentence_ranges.cpp b/src/translator/sentence_ranges.cpp new file mode 100644 index 0000000..a9ee8c5 --- /dev/null +++ b/src/translator/sentence_ranges.cpp @@ -0,0 +1,46 @@ +#include "sentence_ranges.h" +#include +#include + +namespace marian { +namespace bergamot { + +void SentenceRanges::addSentence(std::vector &wordRanges) { + addSentence(std::begin(wordRanges), std::end(wordRanges)); +} + +void SentenceRanges::addSentence(WordIterator begin, WordIterator end) { + size_t size = flatByteRanges_.size(); + flatByteRanges_.insert(std::end(flatByteRanges_), begin, end); + sentenceBeginIds_.push_back(size); +} + +string_view SentenceRanges::sentence(size_t index) const { + size_t bos_id; + string_view eos, bos; + + bos_id = sentenceBeginIds_[index]; + bos = flatByteRanges_[bos_id]; + + if (index + 1 == numSentences()) { + eos = flatByteRanges_.back(); + } else { + assert(index < numSentences()); + size_t eos_id = sentenceBeginIds_[index + 1]; + --eos_id; + eos = flatByteRanges_[eos_id]; + } + + return sentenceBetween(bos, eos); +} + +string_view SentenceRanges::sentenceBetween(string_view firstWord, + string_view lastWord) const { + + const char *data = firstWord.data(); + size_t size = lastWord.data() + lastWord.size() - firstWord.data(); + return string_view(data, size); +} + +} // namespace bergamot +} // namespace marian diff --git a/src/translator/sentence_ranges.h b/src/translator/sentence_ranges.h new file mode 100644 index 0000000..c6a0770 --- /dev/null +++ b/src/translator/sentence_ranges.h @@ -0,0 +1,52 @@ +#ifndef BERGAMOT_SENTENCE_RANGES_H_ +#define BERGAMOT_SENTENCE_RANGES_H_ + +#include "data/types.h" +#include +#include + +namespace marian { +namespace bergamot { + +class SentenceRanges { + // SentenceRanges stores string_views into a source text, with additional + // annotations to mark sentence boundaries. + // + // Given the availability annotations, this container provides capabilty to + // add sentences, and access individual sentences. +public: + typedef std::vector::iterator WordIterator; + + void addSentence(std::vector &wordRanges); + void addSentence(WordIterator begin, WordIterator end); + + void clear() { + flatByteRanges_.clear(); + sentenceBeginIds_.clear(); + } + + size_t numSentences() const { return sentenceBeginIds_.size(); } + + // Returns a string_view into the ith sentence. + string_view sentence(size_t index) const; + +private: + // A flat storage for string_views. Can be words or sentences. + std::vector flatByteRanges_; + + // The container grows dynamically with addSentence. size_t marking index is + // used to ensure the sentence boundaries stay same while underlying storage + // might be changed during reallocation. + std::vector sentenceBeginIds_; + + // Utility function to extract the string starting at firstWord and ending at + // lastWord as a single string-view. + string_view sentenceBetween(string_view firstWord, + string_view lastWord) const; +}; + +} // namespace bergamot + +} // namespace marian + +#endif // BERGAMOT_SENTENCE_RANGES_H_ diff --git a/src/translator/service.cpp b/src/translator/service.cpp index 151fcb6..6d68f18 100644 --- a/src/translator/service.cpp +++ b/src/translator/service.cpp @@ -44,7 +44,7 @@ std::future Service::translateWithCopy(std::string input) { } std::future Service::translate(std::string &&input) { - // Takes in a blob of text. Segments and std::vector are + // Takes in a blob of text. Segments and SentenceRanges are // extracted from the input (blob of text) and used to construct a Request // along with a promise. promise value is set by the worker completing a // request. @@ -57,16 +57,15 @@ std::future Service::translate(std::string &&input) { // returns future corresponding to the promise. Segments segments; - std::vector sourceTokenRanges; - text_processor_.process(input, segments, sourceTokenRanges); + SentenceRanges sourceRanges; + text_processor_.process(input, segments, sourceRanges); std::promise responsePromise; auto future = responsePromise.get_future(); - Ptr request = - New(requestId_++, /* lineNumberBegin = */ 0, vocabs_, - std::move(input), std::move(segments), - std::move(sourceTokenRanges), std::move(responsePromise)); + Ptr request = New( + requestId_++, /* lineNumberBegin = */ 0, vocabs_, std::move(input), + std::move(segments), std::move(sourceRanges), std::move(responsePromise)); batcher_.addWholeRequest(request); diff --git a/src/translator/text_processor.cpp b/src/translator/text_processor.cpp index 8114855..ee13dbd 100644 --- a/src/translator/text_processor.cpp +++ b/src/translator/text_processor.cpp @@ -1,4 +1,5 @@ #include "text_processor.h" +#include "sentence_ranges.h" #include "data/types.h" #include "definitions.h" @@ -10,9 +11,9 @@ namespace marian { namespace bergamot { Segment TextProcessor::tokenize(const string_view &segment, - TokenRanges &tokenRanges) { + std::vector &wordRanges) { return vocabs_->front()->encodeWithByteRanges( - segment, tokenRanges, /*addEOS=*/false, /*inference=*/true); + segment, wordRanges, /*addEOS=*/false, /*inference=*/true); } TextProcessor::TextProcessor(std::vector> &vocabs, @@ -26,7 +27,7 @@ TextProcessor::TextProcessor(std::vector> &vocabs, } void TextProcessor::process(const string_view &query, Segments &segments, - std::vector &sourceRanges) { + SentenceRanges &sourceRanges) { auto sentenceStream = sentence_splitter_.createSentenceStream(query); std::string_view sentenceStringPiece; @@ -34,21 +35,22 @@ void TextProcessor::process(const string_view &query, Segments &segments, while (sentenceStream >> sentenceStringPiece) { marian::string_view sentence(sentenceStringPiece.data(), sentenceStringPiece.size()); - TokenRanges tokenRanges; - Segment segment = tokenize(sentence, tokenRanges); + + std::vector wordRanges; + Segment segment = tokenize(sentence, wordRanges); // There are some cases where SentencePiece or vocab returns no words // after normalization. 0 prevents any empty entries from being added. if (segment.size() > 0) { // Truncate segment into max_input_size segments. - truncate(segment, tokenRanges, segments, sourceRanges); + truncate(segment, wordRanges, segments, sourceRanges); } } } -void TextProcessor::truncate(Segment &segment, TokenRanges &tokenRanges, - Segments &segments, - std::vector &sourceRanges) { +void TextProcessor::truncate(Segment &segment, + std::vector &wordRanges, + Segments &segments, SentenceRanges &sourceRanges) { for (int offset = 0; offset < segment.size(); offset += max_input_sentence_tokens_) { auto start = segment.begin() + offset; @@ -59,8 +61,8 @@ void TextProcessor::truncate(Segment &segment, TokenRanges &tokenRanges, segments.emplace_back(start, start + diff); segments.back().push_back(sourceEosId()); - auto astart = tokenRanges.begin() + offset; - sourceRanges.emplace_back(astart, astart + diff); + auto astart = wordRanges.begin() + offset; + sourceRanges.addSentence(astart, astart + diff); } } diff --git a/src/translator/text_processor.h b/src/translator/text_processor.h index 111ae00..d366bf2 100644 --- a/src/translator/text_processor.h +++ b/src/translator/text_processor.h @@ -1,6 +1,7 @@ #ifndef SRC_BERGAMOT_TEXT_PROCESSOR_H_ #define SRC_BERGAMOT_TEXT_PROCESSOR_H_ +#include "sentence_ranges.h" #include "data/types.h" #include "data/vocab.h" #include "definitions.h" @@ -23,16 +24,17 @@ public: explicit TextProcessor(std::vector> &vocabs, Ptr); void process(const string_view &query, Segments &segments, - std::vector &sourceRanges); + SentenceRanges &sourceRanges); private: // Tokenizes an input string, returns Words corresponding. Loads the // corresponding byte-ranges into tokenRanges. - Segment tokenize(const string_view &input, TokenRanges &tokenRanges); + Segment tokenize(const string_view &input, + std::vector &tokenRanges); // Truncate sentence into max_input_size segments. - void truncate(Segment &sentence, TokenRanges &tokenRanges, Segments &segments, - std::vector &sourceRanges); + void truncate(Segment &sentence, std::vector &tokenRanges, + Segments &segments, SentenceRanges &sourceRanges); // shorthand, used only in truncate() const Word sourceEosId() const { return vocabs_->front()->getEosId(); } From 0296a38cd48970234753cf4f95e7d46269aa089f Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Wed, 17 Feb 2021 00:45:19 +0000 Subject: [PATCH 20/39] Bunch of integers on containers to size_ts --- src/translator/batcher.cpp | 8 ++++---- src/translator/batcher.h | 4 ++-- src/translator/request.cpp | 4 ++-- src/translator/request.h | 8 ++++---- src/translator/service.cpp | 4 ---- src/translator/service.h | 4 ++-- src/translator/text_processor.cpp | 8 ++++---- src/translator/text_processor.h | 4 ++-- 8 files changed, 20 insertions(+), 24 deletions(-) diff --git a/src/translator/batcher.cpp b/src/translator/batcher.cpp index 657a78a..bb223c9 100644 --- a/src/translator/batcher.cpp +++ b/src/translator/batcher.cpp @@ -16,7 +16,7 @@ Batcher::Batcher(Ptr options) { } void Batcher::addSentenceWithPriority(RequestSentence &sentence) { - int bucket_id = sentence.numTokens(); + size_t bucket_id = sentence.numTokens(); assert(bucket_id < bucket_.size()); bucket_[bucket_id].insert(sentence); } @@ -29,9 +29,9 @@ bool Batcher::cleaveBatch(Batch &batch) { // implementation should at least be as fast as marian's maxi-batch with full // corpus size as maxi-batch size. batch.clear(); - int paddedBatchSize = 0; + size_t paddedBatchSize = 0; - for (int length = 0; length < bucket_.size(); length++) { + for (size_t length = 0; length < bucket_.size(); length++) { auto p = bucket_[length].begin(); while (p != bucket_[length].end()) { paddedBatchSize = (batch.size() + 1) * length; @@ -52,7 +52,7 @@ bool Batcher::cleaveBatch(Batch &batch) { } void Batcher::addWholeRequest(Ptr request) { - for (int i = 0; i < request->numSegments(); i++) { + for (size_t i = 0; i < request->numSegments(); i++) { RequestSentence requestSentence(i, request); addSentenceWithPriority(requestSentence); } diff --git a/src/translator/batcher.h b/src/translator/batcher.h index adf2dbe..1b1bc21 100644 --- a/src/translator/batcher.h +++ b/src/translator/batcher.h @@ -30,9 +30,9 @@ public: bool operator>>(Batch &batch); // alias private: - unsigned int miniBatchWords; + size_t miniBatchWords; std::vector> bucket_; - unsigned int batchNumber_{0}; + size_t batchNumber_{0}; }; } // namespace bergamot diff --git a/src/translator/request.cpp b/src/translator/request.cpp index 8c858e6..42dfb35 100644 --- a/src/translator/request.cpp +++ b/src/translator/request.cpp @@ -1,7 +1,7 @@ #include "request.h" -#include "sentence_ranges.h" #include "definitions.h" #include "response.h" +#include "sentence_ranges.h" #include "common/logging.h" @@ -11,7 +11,7 @@ namespace marian { namespace bergamot { // ----------------------------------------------------------------- -Request::Request(unsigned int Id, int lineNumberBegin, +Request::Request(size_t Id, size_t lineNumberBegin, std::vector> &vocabs, std::string &&source, Segments &&segments, SentenceRanges &&sourceRanges, std::promise responsePromise) diff --git a/src/translator/request.h b/src/translator/request.h index 0bc3c27..3909019 100644 --- a/src/translator/request.h +++ b/src/translator/request.h @@ -17,9 +17,9 @@ #ifndef SRC_BERGAMOT_REQUEST_H_ #define SRC_BERGAMOT_REQUEST_H_ -#include "sentence_ranges.h" #include "definitions.h" #include "response.h" +#include "sentence_ranges.h" #include "common/logging.h" #include "data/types.h" @@ -35,7 +35,7 @@ namespace bergamot { class Request { public: - Request(unsigned int Id, int lineNumberBegin, + Request(size_t Id, size_t lineNumberBegin, std::vector> &vocabs_, std::string &&source, Segments &&segments, SentenceRanges &&sourceTokenRanges, std::promise responsePromise); @@ -64,8 +64,8 @@ public: void completeRequest(); private: - unsigned int Id_; - int lineNumberBegin_; + size_t Id_; + size_t lineNumberBegin_; // Multiple translation-workers can concurrently access the same Request. The // following atomic atomically operates on the variable holding sentences diff --git a/src/translator/service.cpp b/src/translator/service.cpp index 6d68f18..c7d15af 100644 --- a/src/translator/service.cpp +++ b/src/translator/service.cpp @@ -83,17 +83,13 @@ std::future Service::translate(std::string &&input) { } void Service::stop() { - int counter = 0; for (auto &worker : workers_) { Batch poison = Batch::poison(); pcqueue_.ProduceSwap(poison); - ++counter; } - counter = 0; for (auto &worker : workers_) { worker.join(); - ++counter; } workers_.clear(); // Takes care of idempotency. diff --git a/src/translator/service.h b/src/translator/service.h index 55b754a..6c1d799 100644 --- a/src/translator/service.h +++ b/src/translator/service.h @@ -45,8 +45,8 @@ public: ~Service(); private: - unsigned int requestId_; - int numWorkers_; + size_t requestId_; + size_t numWorkers_; // vocabs are used to construct a Request, which later uses it to construct // Response (decode from words to string). diff --git a/src/translator/text_processor.cpp b/src/translator/text_processor.cpp index ee13dbd..277b9ca 100644 --- a/src/translator/text_processor.cpp +++ b/src/translator/text_processor.cpp @@ -1,7 +1,7 @@ #include "text_processor.h" -#include "sentence_ranges.h" #include "data/types.h" #include "definitions.h" +#include "sentence_ranges.h" #include "common/options.h" #include "data/vocab.h" @@ -51,12 +51,12 @@ void TextProcessor::process(const string_view &query, Segments &segments, void TextProcessor::truncate(Segment &segment, std::vector &wordRanges, Segments &segments, SentenceRanges &sourceRanges) { - for (int offset = 0; offset < segment.size(); + for (size_t offset = 0; offset < segment.size(); offset += max_input_sentence_tokens_) { auto start = segment.begin() + offset; - unsigned int left = segment.size() - offset; - unsigned int diff = std::min(max_input_sentence_tokens_, left); + size_t left = segment.size() - offset; + size_t diff = std::min(max_input_sentence_tokens_, left); segments.emplace_back(start, start + diff); segments.back().push_back(sourceEosId()); diff --git a/src/translator/text_processor.h b/src/translator/text_processor.h index d366bf2..ad88ab9 100644 --- a/src/translator/text_processor.h +++ b/src/translator/text_processor.h @@ -1,10 +1,10 @@ #ifndef SRC_BERGAMOT_TEXT_PROCESSOR_H_ #define SRC_BERGAMOT_TEXT_PROCESSOR_H_ -#include "sentence_ranges.h" #include "data/types.h" #include "data/vocab.h" #include "definitions.h" +#include "sentence_ranges.h" #include "sentence_splitter.h" @@ -41,7 +41,7 @@ private: std::vector> *vocabs_; SentenceSplitter sentence_splitter_; - unsigned int max_input_sentence_tokens_; + size_t max_input_sentence_tokens_; }; } // namespace bergamot From 69201ba44ccd1611fe03ce5a6d225cad118c2ba1 Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Wed, 17 Feb 2021 00:54:30 +0000 Subject: [PATCH 21/39] Unify options with marian Service specific options are renamed to align with marian-option naming as follows: 1. max-input-sentence-tokens -> max-length-break (There's a max-length-crop in marian, this is the same, except breaks into multiple sentences than truncate/crop). 2. max-input-tokens -> mini-batch-words. --- src/translator/batcher.cpp | 4 ++-- src/translator/parser.h | 7 +------ src/translator/text_processor.cpp | 11 +++++------ src/translator/text_processor.h | 2 +- 4 files changed, 9 insertions(+), 15 deletions(-) diff --git a/src/translator/batcher.cpp b/src/translator/batcher.cpp index bb223c9..3a384d4 100644 --- a/src/translator/batcher.cpp +++ b/src/translator/batcher.cpp @@ -7,8 +7,8 @@ namespace marian { namespace bergamot { Batcher::Batcher(Ptr options) { - miniBatchWords = options->get("max-input-tokens"); - bucket_.resize(options->get("max-input-sentence-tokens") + 1); + miniBatchWords = options->get("mini-batch-words"); + bucket_.resize(options->get("max-length-break") + 1); ABORT_IF( miniBatchWords < bucket_.size() - 1, "max-input-tokens cannot be less than than max-input-sentence-tokens, " diff --git a/src/translator/parser.h b/src/translator/parser.h index 606b6a4..7fb53e1 100644 --- a/src/translator/parser.h +++ b/src/translator/parser.h @@ -16,14 +16,9 @@ inline marian::ConfigParser createConfigParser() { "[paragraph, sentence, wrapped_text]", "paragraph"); cp.addOption( - "--max-input-sentence-tokens", "Bergamot Options", + "--max-length-break", "Bergamot Options", "Maximum input tokens to be processed in a single sentence.", 128); - cp.addOption("--max-input-tokens", "Bergamot Options", - "Maximum input tokens in a batch. control for" - "Bergamot Queue", - 1024); - return cp; } diff --git a/src/translator/text_processor.cpp b/src/translator/text_processor.cpp index 277b9ca..9d6733e 100644 --- a/src/translator/text_processor.cpp +++ b/src/translator/text_processor.cpp @@ -20,10 +20,9 @@ TextProcessor::TextProcessor(std::vector> &vocabs, Ptr options) : vocabs_(&vocabs), sentence_splitter_(options) { - max_input_sentence_tokens_ = options->get("max-input-sentence-tokens"); - max_input_sentence_tokens_ = max_input_sentence_tokens_ - 1; - ABORT_IF(max_input_sentence_tokens_ < 0, - "max-input-sentence-tokens cannot be < 0"); + max_length_break_ = options->get("max-length-break"); + max_length_break_ = max_length_break_ - 1; + ABORT_IF(max_length_break_ < 0, "max-length-break cannot be < 0"); } void TextProcessor::process(const string_view &query, Segments &segments, @@ -52,11 +51,11 @@ void TextProcessor::truncate(Segment &segment, std::vector &wordRanges, Segments &segments, SentenceRanges &sourceRanges) { for (size_t offset = 0; offset < segment.size(); - offset += max_input_sentence_tokens_) { + offset += max_length_break_) { auto start = segment.begin() + offset; size_t left = segment.size() - offset; - size_t diff = std::min(max_input_sentence_tokens_, left); + size_t diff = std::min(max_length_break_, left); segments.emplace_back(start, start + diff); segments.back().push_back(sourceEosId()); diff --git a/src/translator/text_processor.h b/src/translator/text_processor.h index ad88ab9..4cd1761 100644 --- a/src/translator/text_processor.h +++ b/src/translator/text_processor.h @@ -41,7 +41,7 @@ private: std::vector> *vocabs_; SentenceSplitter sentence_splitter_; - size_t max_input_sentence_tokens_; + size_t max_length_break_; }; } // namespace bergamot From fba44bec8f754e0a17c7bbc76d50697121bf97a3 Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Wed, 17 Feb 2021 01:05:20 +0000 Subject: [PATCH 22/39] Improving Batcher error message with new option names --- src/translator/batcher.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/translator/batcher.cpp b/src/translator/batcher.cpp index 3a384d4..f671ffc 100644 --- a/src/translator/batcher.cpp +++ b/src/translator/batcher.cpp @@ -9,10 +9,9 @@ namespace bergamot { Batcher::Batcher(Ptr options) { miniBatchWords = options->get("mini-batch-words"); bucket_.resize(options->get("max-length-break") + 1); - ABORT_IF( - miniBatchWords < bucket_.size() - 1, - "max-input-tokens cannot be less than than max-input-sentence-tokens, " - "batcher fail"); + ABORT_IF(bucket_.size() - 1 > miniBatchWords, + "Fatal: max-length-break > mini-batch-words will lead to sentences " + "longer than what can fit in a batch."); } void Batcher::addSentenceWithPriority(RequestSentence &sentence) { From c205c82585752b32f6f60191edcbb8bdebd687f7 Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Wed, 17 Feb 2021 01:12:30 +0000 Subject: [PATCH 23/39] Updates to README with option changes --- README.md | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 52f60b2..2019fbf 100644 --- a/README.md +++ b/README.md @@ -42,18 +42,23 @@ ARGS=( --beam-size 1 --skip-cost --shortlist $MODEL_DIR/lex.s2t.gz 50 50 --int8shiftAlphaAll # Number of CPU threads (workers to launch). Parallelizes over cores and improves speed. + # A value of 0 allows a path with no worker thread-launches and a single-thread. --cpu-threads 4 - # Hyperparameters of how many tokens to be accounted for in a batch and maximum tokens in a sentence. - --max-input-sentence-tokens 1024 --max-input-tokens 1024 + # Maximum size of a sentence allowed. If a sentence is above this length, + # it's broken into pieces of less than or equal to this size. + --max-length-break 1024 + + # Maximum number of tokens that can be fit in a batch. The optimal value + # for the parameter is dependant on hardware and can be obtained by running + # with variations and benchmarking. + --mini-batch-words 1024 # Three modes are supported # - sentence: One sentence per line # - paragraph: One paragraph per line. - # - wrapped text: Paragraphs are separated by empty line. - + # - wrapped_text: Paragraphs are separated by empty line. --ssplit-mode paragraph - ) ./app/service-cli "${ARGS[@]}" < path-to-input-file From 44a44fa1562319178e5246205ef6c5b5fbc49f10 Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Wed, 17 Feb 2021 11:48:00 +0000 Subject: [PATCH 24/39] CMake build with submodule recursive clones --- CMakeLists.txt | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ce48a90..8202511 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -16,8 +16,22 @@ option(USE_SENTENCEPIECE "Download and compile SentencePiece" ON) option(USE_STATIC_LIBS "Link statically against non-system libs" ON) option(USE_MKL "Compile with MKL support" ON) -execute_process(COMMAND git submodule update --init --recursive --no-fetch - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) +# Documentation: https://cliutils.gitlab.io/modern-cmake/chapters/projects/submodule.html +# Ensures the submodules are set correctly during a build. +find_package(Git QUIET) +if(GIT_FOUND AND EXISTS "${PROJECT_SOURCE_DIR}/.git") +# Update submodules as needed + option(GIT_SUBMODULE "Check submodules during build" ON) + if(GIT_SUBMODULE) + message(STATUS "Submodule update") + execute_process(COMMAND ${GIT_EXECUTABLE} submodule update --init --recursive + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + RESULT_VARIABLE GIT_SUBMOD_RESULT) + if(NOT GIT_SUBMOD_RESULT EQUAL "0") + message(FATAL_ERROR "git submodule update --init failed with ${GIT_SUBMOD_RESULT}, please checkout submodules") + endif() + endif() +endif() add_subdirectory(3rd_party) add_subdirectory(src) From d005f73cb9c67e39766d19130cb4499c71adb9d0 Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Wed, 17 Feb 2021 13:10:39 +0000 Subject: [PATCH 25/39] Reverting changes to PCQueue --- src/translator/pcqueue.h | 29 ----------------------------- 1 file changed, 29 deletions(-) diff --git a/src/translator/pcqueue.h b/src/translator/pcqueue.h index 79d6b75..f0b3541 100644 --- a/src/translator/pcqueue.h +++ b/src/translator/pcqueue.h @@ -9,7 +9,6 @@ #include #include -#ifdef WITH_PTHREADS #ifdef __APPLE__ #include #include @@ -20,7 +19,6 @@ #else #include #endif -#endif // WITH_PTHREADS #if __GNUC__ >= 3 #define UTIL_UNLIKELY(x) __builtin_expect(!!(x), 0) @@ -31,7 +29,6 @@ namespace marian { namespace bergamot { -#ifdef WITH_PTHREADS /* OS X Maverick and Boost interprocess were doing "Function not implemented." * So this is my own wrapper around the mach kernel APIs. */ @@ -117,20 +114,6 @@ inline void WaitSemaphore(Semaphore &on) { } #endif // Apple -#else // WITH_PTHREADS -// A dummy Semaphore class that does nothing -class Semaphore { -public: - explicit Semaphore(unsigned int value) : count(value) {} - ~Semaphore() {} - void wait() {} - void post() {} -private: - unsigned int count; -}; - -inline void WaitSemaphore(Semaphore &semaphore) { semaphore.wait(); } -#endif // WITH_PTHREADS /** * Producer consumer queue safe for multiple producers and multiple consumers. @@ -151,9 +134,7 @@ public: void Produce(const T &val) { WaitSemaphore(empty_); { - #ifdef WITH_PTHREADS std::lock_guard produce_lock(produce_at_mutex_); - #endif try { *produce_at_ = val; } catch (...) { @@ -170,9 +151,7 @@ public: void ProduceSwap(T &val) { WaitSemaphore(empty_); { - #ifdef WITH_PTHREADS std::lock_guard produce_lock(produce_at_mutex_); - #endif try { std::swap(*produce_at_, val); } catch (...) { @@ -189,9 +168,7 @@ public: T &Consume(T &out) { WaitSemaphore(used_); { - #ifdef WITH_PTHREADS std::lock_guard consume_lock(consume_at_mutex_); - #endif try { out = *consume_at_; } catch (...) { @@ -209,9 +186,7 @@ public: T &ConsumeSwap(T &out) { WaitSemaphore(used_); { - #ifdef WITH_PTHREADS std::lock_guard consume_lock(consume_at_mutex_); - #endif try { std::swap(out, *consume_at_); } catch (...) { @@ -245,15 +220,11 @@ private: // Index for next write in storage_. T *produce_at_; -#ifdef WITH_PTHREADS std::mutex produce_at_mutex_; -#endif // Index for next read from storage_. T *consume_at_; -#ifdef WITH_PTHREADS std::mutex consume_at_mutex_; -#endif }; template struct UnboundedPage { From 72848ba0f6070be711cbf7f8bbbdaf5dd534e9fb Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Wed, 17 Feb 2021 13:28:58 +0000 Subject: [PATCH 26/39] Fixes UEdin builds after wasm-integration merge A bug which crept in during manual merge is now fixed. PCItem -> Batch on a PCQueue. docs/marian-integration.md provides instructions to compile successfully for multithread. --- doc/marian-integration.md | 4 +++- src/translator/batch_translator.h | 1 - 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/doc/marian-integration.md b/doc/marian-integration.md index 8fce52f..f72543a 100644 --- a/doc/marian-integration.md +++ b/doc/marian-integration.md @@ -10,7 +10,9 @@ $ git clone https://github.com/browsermt/bergamot-translator $ cd bergamot-translator $ mkdir build $ cd build -$ cmake ../ +$ cmake .. -DCOMPILE_CUDA=off -DCMAKE_BUILD_TYPE=Release \ + -DCOMPILE_DECODER_ONLY=off -DCOMPILE_LIBRARY_ONLY=off -DUSE_MKL=on \ + -DCOMPILE_THREAD_VARIANT=on $ make -j diff --git a/src/translator/batch_translator.h b/src/translator/batch_translator.h index b53bb99..78027a2 100644 --- a/src/translator/batch_translator.h +++ b/src/translator/batch_translator.h @@ -47,7 +47,6 @@ private: Ptr slgen_; #ifdef WITH_PTHREADS - PCQueue *pcqueue_; std::thread thread_; #endif }; From 47b9db0c455528984e868a03709e0fb4bedddb37 Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Wed, 17 Feb 2021 13:35:10 +0000 Subject: [PATCH 27/39] Documentation formatting/syntax fix --- doc/marian-integration.md | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/marian-integration.md b/doc/marian-integration.md index f72543a..6d603a3 100644 --- a/doc/marian-integration.md +++ b/doc/marian-integration.md @@ -14,6 +14,7 @@ $ cmake .. -DCOMPILE_CUDA=off -DCMAKE_BUILD_TYPE=Release \ -DCOMPILE_DECODER_ONLY=off -DCOMPILE_LIBRARY_ONLY=off -DUSE_MKL=on \ -DCOMPILE_THREAD_VARIANT=on $ make -j +``` The build will generate the library that can be linked to any project. All the From 7b10c3548333d8c5b67b1c6c32b1e279647c5e68 Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Wed, 17 Feb 2021 13:50:42 +0000 Subject: [PATCH 28/39] Hard abort if multithread path launched without multithread-support --- src/translator/service.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/translator/service.cpp b/src/translator/service.cpp index a864a88..986d4a4 100644 --- a/src/translator/service.cpp +++ b/src/translator/service.cpp @@ -41,6 +41,11 @@ Service::Service(Ptr options) translator.consumeFrom(pcqueue_); }); } +#else // WITH_PTHREADS + ABORT( + "Fatal: Service started requesting multiple threadswhile compiled with " + "COMPILE_THREAD_VARIANT=off. Please check your cmake build " + "configuration"); #endif } } From 70b57ee3e7ed1f0569d6c838851354753fe88c8d Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Wed, 17 Feb 2021 16:38:47 +0000 Subject: [PATCH 29/39] Redundant parser include fixed --- src/translator/TranslationModel.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/translator/TranslationModel.cpp b/src/translator/TranslationModel.cpp index 65b0fc0..45b93d1 100644 --- a/src/translator/TranslationModel.cpp +++ b/src/translator/TranslationModel.cpp @@ -16,8 +16,6 @@ #include "TranslationModel.h" #include "translator/parser.h" #include "translator/service.h" -#include "translator/parser.h" - std::shared_ptr parseOptions(const std::string &config) { marian::Options options; From d72343567c978d9a8605193f7027c23edff728b3 Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Wed, 17 Feb 2021 16:41:04 +0000 Subject: [PATCH 30/39] BatchTranslator doesn't do thread_, residue from merge removed --- src/translator/batch_translator.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/translator/batch_translator.h b/src/translator/batch_translator.h index 78027a2..b927c01 100644 --- a/src/translator/batch_translator.h +++ b/src/translator/batch_translator.h @@ -45,10 +45,6 @@ private: Ptr graph_; std::vector> scorers_; Ptr slgen_; - -#ifdef WITH_PTHREADS - std::thread thread_; -#endif }; } // namespace bergamot From b9d081dd4552ac8f6baf536046b1eebbfb7792ef Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Wed, 17 Feb 2021 19:25:19 +0000 Subject: [PATCH 31/39] Temporary: Updating marian-dev to wasm branch --- 3rd_party/marian-dev | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/3rd_party/marian-dev b/3rd_party/marian-dev index 2f65280..467c43a 160000 --- a/3rd_party/marian-dev +++ b/3rd_party/marian-dev @@ -1 +1 @@ -Subproject commit 2f65280459737c37c270e4ad0b6d41de215d11e0 +Subproject commit 467c43a292a68b7913af2a00d353de97c1740f92 From d249dcbfaa45f44cda022482b314143a9b9e2b80 Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Wed, 17 Feb 2021 21:15:35 +0000 Subject: [PATCH 32/39] Build doc updated with wasm-branch compatible command --- doc/marian-integration.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/doc/marian-integration.md b/doc/marian-integration.md index 6d603a3..b7089c0 100644 --- a/doc/marian-integration.md +++ b/doc/marian-integration.md @@ -11,8 +11,9 @@ $ cd bergamot-translator $ mkdir build $ cd build $ cmake .. -DCOMPILE_CUDA=off -DCMAKE_BUILD_TYPE=Release \ - -DCOMPILE_DECODER_ONLY=off -DCOMPILE_LIBRARY_ONLY=off -DUSE_MKL=on \ - -DCOMPILE_THREAD_VARIANT=on + -DCOMPILE_DECODER_ONLY=on -DUSE_MKL=on -DCOMPILE_THREAD_VARIANT=on \ + -DUSE_WASM_COMPATIBLE_BLAS=off -DCOMPILE_MAIN_EIGEN=off + $ make -j ``` From ca9aa64926684f41a36c29e4cba8b23ac9e550ec Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Thu, 18 Feb 2021 11:07:31 +0000 Subject: [PATCH 33/39] Switch to work with ssplit-cpp both pcre2 and pcrecpp --- src/translator/sentence_splitter.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/translator/sentence_splitter.cpp b/src/translator/sentence_splitter.cpp index 0f9be01..0370125 100644 --- a/src/translator/sentence_splitter.cpp +++ b/src/translator/sentence_splitter.cpp @@ -1,7 +1,7 @@ +#include "sentence_splitter.h" #include "common/cli_helper.h" #include "common/logging.h" #include "common/options.h" -#include "sentence_splitter.h" #include namespace marian { @@ -30,8 +30,9 @@ SentenceSplitter::SentenceSplitter(marian::Ptr options) ug::ssplit::SentenceStream SentenceSplitter::createSentenceStream(const string_view &input) { - return std::move(ug::ssplit::SentenceStream(input.data(), input.size(), - this->ssplit_, mode_)); + std::string_view input_converted(input.data(), input.size()); + return std::move( + ug::ssplit::SentenceStream(input_converted, this->ssplit_, mode_)); } ug::ssplit::SentenceStream::splitmode From fbff7389d1f735272133db68f5029772dabc78c0 Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Thu, 18 Feb 2021 11:20:01 +0000 Subject: [PATCH 34/39] Temporary: Switch to abhi-agg/ssplit-cpp@wasm --- 3rd_party/ssplit-cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/3rd_party/ssplit-cpp b/3rd_party/ssplit-cpp index 01e71b4..1686496 160000 --- a/3rd_party/ssplit-cpp +++ b/3rd_party/ssplit-cpp @@ -1 +1 @@ -Subproject commit 01e71b4964fdc351f932a7a23cab4cb80b9698e8 +Subproject commit 16864967b7313e76e3b107d11ec39d8d5cedff1e From 5dcbb721faadc55ded424b6df5b74854e54d600e Mon Sep 17 00:00:00 2001 From: Abhishek Aggarwal Date: Mon, 22 Feb 2021 18:03:53 +0100 Subject: [PATCH 35/39] Update ssplit submodule to master branch - This submodule brings pcre2 lib compiled from sources --- 3rd_party/ssplit-cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/3rd_party/ssplit-cpp b/3rd_party/ssplit-cpp index 1686496..4322088 160000 --- a/3rd_party/ssplit-cpp +++ b/3rd_party/ssplit-cpp @@ -1 +1 @@ -Subproject commit 16864967b7313e76e3b107d11ec39d8d5cedff1e +Subproject commit 432208826ee27e7b3984b53774b1a16d74256d77 From fa4a1ed67d2313efc88d4ac7930da08ab5e882b7 Mon Sep 17 00:00:00 2001 From: Abhishek Aggarwal Date: Mon, 22 Feb 2021 18:28:32 +0100 Subject: [PATCH 36/39] Adapted model config in test example of bergamot - Replaced deprecated names with new names mini-batch-words and max-length-break - Set cpu-threads to 0 --- wasm/test_page/bergamot.html | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/wasm/test_page/bergamot.html b/wasm/test_page/bergamot.html index 7956544..8b3cfa8 100644 --- a/wasm/test_page/bergamot.html +++ b/wasm/test_page/bergamot.html @@ -81,12 +81,12 @@ vocabs: beam-size: 1 normalize: 1.0 word-penalty: 0 -max-input-sentence-tokens: 128 -max-input-tokens: 1024 +max-length-break: 128 +mini-batch-words: 1024 workspace: 128 max-length-factor: 2.0 skip-cost: true -cpu-threads: 1 +cpu-threads: 0 quiet: true quiet-translation: true shortlist: From 462a850d8ae554e835f0e3e64aef897aa3f93dad Mon Sep 17 00:00:00 2001 From: Abhishek Aggarwal Date: Mon, 22 Feb 2021 18:48:59 +0100 Subject: [PATCH 37/39] Changed Sentences to Paragraphs in test page of WASM - Sentence Splitter works now => No more sentence splitting in test code - Changed example to include some paragraphs --- wasm/test_page/bergamot.html | 43 ++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 24 deletions(-) diff --git a/wasm/test_page/bergamot.html b/wasm/test_page/bergamot.html index 8b3cfa8..d529e42 100644 --- a/wasm/test_page/bergamot.html +++ b/wasm/test_page/bergamot.html @@ -38,16 +38,11 @@


@@ -112,19 +107,19 @@ maxi-batch-sort: src model = new Module.TranslationModel(modelConfig); } - const translate = (sentences) => { + const translate = (paragraphs) => { // Instantiate the arguments of translate() API i.e. TranslationRequest and input (vector) var request = new Module.TranslationRequest(); let input = new Module.VectorString; // Initialize the input - sentences.forEach(sentence => { - // prevent empty sentences - it breaks the translation - if (sentence.trim() === "") { + paragraphs.forEach(paragraph => { + // prevent empty paragraph - it breaks the translation + if (paragraph.trim() === "") { return; } - input.push_back(sentence.trim()) + input.push_back(paragraph.trim()) }) // Access input (just for debugging) console.log('Input size=', input.size()); @@ -138,14 +133,14 @@ maxi-batch-sort: src let result = model.translate(input, request); // Access original and translated text from each entry of vector //console.log('Result size=', result.size(), ' - TimeDiff - ', (Date.now() - start)/1000); - const translatedSentences = []; + const translatedParagraphs = []; for (let i = 0; i < result.size(); i++) { - translatedSentences.push(result.get(i).getTranslatedText()); + translatedParagraphs.push(result.get(i).getTranslatedText()); } - console.log({ translatedSentences }); + console.log({ translatedParagraphs }); request.delete(); input.delete(); - return translatedSentences; + return translatedParagraphs; } document.querySelector("#load").addEventListener("click", () => { @@ -160,17 +155,17 @@ maxi-batch-sort: src const translateCall = () => { const text = document.querySelector('#from').value; - const sentences = text.split("\n"); + const paragraphs = text.split("\n"); let wordCount = 0; - sentences.forEach(sentence => { + paragraphs.forEach(sentence => { wordCount += sentence.trim().split(" ").filter(word => word.trim() !== "").length; }) const start = Date.now(); - const translatedSentences = translate(sentences); + const translatedParagraphs = translate(paragraphs); const secs = (Date.now() - start) / 1000; - log(`Translation of ${translatedSentences.length} sentences (wordCount ${wordCount}) took ${secs} secs (${Math.round(wordCount / secs)} words per second)`); + log(`Translation of (${wordCount}) words took ${secs} secs (${Math.round(wordCount / secs)} words per second)`); - document.querySelector('#to').value = translatedSentences.join("\n"); + document.querySelector('#to').value = translatedParagraphs.join("\n"); } document.querySelector("#translate").addEventListener("click", () => { From 458176c0501880e3f308cf90bfc8b8fb7e9504c6 Mon Sep 17 00:00:00 2001 From: Abhishek Aggarwal Date: Mon, 22 Feb 2021 18:51:48 +0100 Subject: [PATCH 38/39] Enable building pcre2 from sources for ssplit submodule - USE_INTERNAL_PCRE2 is set to ON - Sentence splitting is working (tested it via wasm test page) --- CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index cb027b7..aed7c81 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -28,6 +28,8 @@ if(COMPILE_WASM) # Set WORMHOLE to ON for marian whenever compiling for wasm platform SET(WORMHOLE ON CACHE BOOL "Use WASM wormhole in intgemm https://bugzilla.mozilla.org/show_bug.cgi?id=1672160") endif() +# Set ssplit (3rd party submodule) cmake options to compile for this project +SET(USE_INTERNAL_PCRE2 ON CACHE BOOL "Use internal PCRE2 instead of system PCRE2") # Documentation: https://cliutils.gitlab.io/modern-cmake/chapters/projects/submodule.html # Ensures the submodules are set correctly during a build. From 415d16bd1de06e67f46d02c54edabbf4e980bd9b Mon Sep 17 00:00:00 2001 From: Abhishek Aggarwal Date: Tue, 23 Feb 2021 15:53:05 +0100 Subject: [PATCH 39/39] Single cmake option to enable/disable wasm compatible marian compilation - USE_WASM_COMPATIBLE_MARIAN=off will start using vanilla Marian i.e. with full threading support, with exceptions, with MKL - Changed the relevant documentation --- CMakeLists.txt | 25 ++++++++++++++++--------- doc/marian-integration.md | 5 +---- 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index aed7c81..622a41b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -9,24 +9,31 @@ project(bergamot_translator CXX C) set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) +include(CMakeDependentOption) + # Project specific cmake options option(COMPILE_WASM "Compile for WASM" OFF) -option(COMPILE_THREAD_VARIANT "Compile with thread support" OFF) +option(USE_WASM_COMPATIBLE_MARIAN "Use wasm compatible marian backend" ON) +CMAKE_DEPENDENT_OPTION(COMPILE_THREAD_VARIANT "Compile the project with thread support" OFF + "USE_WASM_COMPATIBLE_MARIAN" ON) SET(PACKAGE_DIR "" CACHE STRING "Directory including all the files to be packaged (pre-loaded) in wasm builds") # Set marian (3rd party submodule) cmake options to compile for this project SET(COMPILE_CUDA OFF CACHE BOOL "Compile GPU version") SET(USE_SENTENCEPIECE ON CACHE BOOL "Download and compile SentencePiece") SET(USE_STATIC_LIBS ON CACHE BOOL "Link statically against non-system libs") -SET(USE_MKL OFF CACHE BOOL "Compile with MKL support") -SET(COMPILE_DECODER_ONLY ON CACHE BOOL "Compile marian-decoder only") -SET(COMPILE_WITH_PTHREADS OFF CACHE BOOL "Compile with pthreads support") -SET(USE_WASM_COMPATIBLE_BLAS ON CACHE BOOL "Compile with a WASM compatible blas for decoder only builds") SET(COMPILE_LIBRARY_ONLY ON CACHE BOOL "Build only the Marian library and exclude all executables.") -SET(COMPILE_WITHOUT_EXCEPTIONS ON CACHE BOOL "Compile without exceptions") -if(COMPILE_WASM) - # Set WORMHOLE to ON for marian whenever compiling for wasm platform - SET(WORMHOLE ON CACHE BOOL "Use WASM wormhole in intgemm https://bugzilla.mozilla.org/show_bug.cgi?id=1672160") +if (USE_WASM_COMPATIBLE_MARIAN) + # If using wasm compatible marian then set following flags + SET(USE_MKL OFF CACHE BOOL "Compile with MKL support") + SET(COMPILE_DECODER_ONLY ON CACHE BOOL "Compile marian-decoder only") + SET(COMPILE_WITH_PTHREADS OFF CACHE BOOL "Compile with pthreads support") + SET(USE_WASM_COMPATIBLE_BLAS ON CACHE BOOL "Compile with a WASM compatible blas for decoder only builds") + SET(COMPILE_WITHOUT_EXCEPTIONS ON CACHE BOOL "Compile without exceptions") + if(COMPILE_WASM) + # Set WORMHOLE to ON for marian whenever compiling for wasm platform + SET(WORMHOLE ON CACHE BOOL "Use WASM wormhole in intgemm https://bugzilla.mozilla.org/show_bug.cgi?id=1672160") + endif() endif() # Set ssplit (3rd party submodule) cmake options to compile for this project SET(USE_INTERNAL_PCRE2 ON CACHE BOOL "Use internal PCRE2 instead of system PCRE2") diff --git a/doc/marian-integration.md b/doc/marian-integration.md index b7089c0..7621024 100644 --- a/doc/marian-integration.md +++ b/doc/marian-integration.md @@ -10,10 +10,7 @@ $ git clone https://github.com/browsermt/bergamot-translator $ cd bergamot-translator $ mkdir build $ cd build -$ cmake .. -DCOMPILE_CUDA=off -DCMAKE_BUILD_TYPE=Release \ - -DCOMPILE_DECODER_ONLY=on -DUSE_MKL=on -DCOMPILE_THREAD_VARIANT=on \ - -DUSE_WASM_COMPATIBLE_BLAS=off -DCOMPILE_MAIN_EIGEN=off - +$ cmake .. -DUSE_WASM_COMPATIBLE_MARIAN=off -DCMAKE_BUILD_TYPE=Release $ make -j ```