From 38e8b3cd6d5a2db561ce201c3e69fb79c676389c Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Fri, 5 Feb 2021 12:55:57 +0000 Subject: [PATCH] Updates: marian-dev, ssplit for marian-decoder-new Updates marian-dev and ssplit submodules to point to the upstream commits which implements the following: - marian-dev: encodeWithByteRanges(...) to get source token byte-ranges - ssplit: Has a trivial sentencesplitter functionality implemented, and now is faster to benchmark with marian-decoder. This enables a marian-decoder replacement written through ssplit in this source to be benchmarked constantly with existing marian-decoder. Nits: Removes logging introduced for multiple workers, and respective log statements. --- .gitignore | 14 +++++ 3rd_party/marian-dev | 2 +- 3rd_party/ssplit-cpp | 2 +- app/CMakeLists.txt | 3 + app/main-mts.cpp | 13 ---- app/marian-decoder-new.cpp | 63 +++++++++++++++++++ src/translator/CMakeLists.txt | 4 +- src/translator/batch_translator.cpp | 1 - src/translator/batcher.cpp | 1 - src/translator/sanelogging.h | 44 ------------- src/translator/sentence_splitter.cpp | 52 +++++++++++++++ src/translator/sentence_splitter.h | 31 +++++++++ src/translator/service.cpp | 1 - src/translator/service.h | 2 +- .../{textops.cpp => text_processor.cpp} | 61 +++--------------- .../{textops.h => text_processor.h} | 37 +++-------- 16 files changed, 186 insertions(+), 145 deletions(-) create mode 100644 app/marian-decoder-new.cpp delete mode 100644 src/translator/sanelogging.h create mode 100644 src/translator/sentence_splitter.cpp create mode 100644 src/translator/sentence_splitter.h rename src/translator/{textops.cpp => text_processor.cpp} (52%) rename src/translator/{textops.h => text_processor.h} (56%) diff --git a/.gitignore b/.gitignore index e63aee1..54493b9 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,17 @@ *.swp *.swo +# CMake +CMakeLists.txt.user +CMakeCache.txt +CMakeFiles +CMakeScripts +Testing +Makefile +cmake_install.cmake +install_manifest.txt +compile_commands.json +CTestTestfile.cmake +_deps + + diff --git a/3rd_party/marian-dev b/3rd_party/marian-dev index ee56e02..2f65280 160000 --- a/3rd_party/marian-dev +++ b/3rd_party/marian-dev @@ -1 +1 @@ -Subproject commit ee56e02f0525a4651157a07f74b44f456db14c8c +Subproject commit 2f65280459737c37c270e4ad0b6d41de215d11e0 diff --git a/3rd_party/ssplit-cpp b/3rd_party/ssplit-cpp index f5d0229..01e71b4 160000 --- a/3rd_party/ssplit-cpp +++ b/3rd_party/ssplit-cpp @@ -1 +1 @@ -Subproject commit f5d022992f4a00c860eb809389748908bb85ffcf +Subproject commit 01e71b4964fdc351f932a7a23cab4cb80b9698e8 diff --git a/app/CMakeLists.txt b/app/CMakeLists.txt index 6e71e9e..24bd0b4 100644 --- a/app/CMakeLists.txt +++ b/app/CMakeLists.txt @@ -3,3 +3,6 @@ target_link_libraries(bergamot-translator-app PRIVATE bergamot-translator) add_executable(service-cli main-mts.cpp) target_link_libraries(service-cli PRIVATE bergamot-translator) + +add_executable(marian-decoder-new marian-decoder-new.cpp) +target_link_libraries(marian-decoder-new PRIVATE bergamot-translator) diff --git a/app/main-mts.cpp b/app/main-mts.cpp index 44a019a..c94ff30 100644 --- a/app/main-mts.cpp +++ b/app/main-mts.cpp @@ -26,21 +26,8 @@ int main(int argc, char *argv[]) { service.translate(std::move(input)); translation_result_future.wait(); const TranslationResult &translation_result = translation_result_future.get(); - - std::cout << "service-cli [Source text]: "; - std::cout << translation_result.getOriginalText() << std::endl; - - std::cout << "service-cli [Translated text]: "; std::cout << translation_result.getTranslatedText() << std::endl; - // Obtain sentenceMappings and print them as Proof of Concept. - const TranslationResult::SentenceMappings &sentenceMappings = - translation_result.getSentenceMappings(); - for (auto &p : sentenceMappings) { - std::cout << "service-cli [src] " << p.first << "\n"; - std::cout << "service-cli [tgt] " << p.second << "\n"; - } - // Stop Service. service.stop(); return 0; diff --git a/app/marian-decoder-new.cpp b/app/marian-decoder-new.cpp new file mode 100644 index 0000000..62b1bb4 --- /dev/null +++ b/app/marian-decoder-new.cpp @@ -0,0 +1,63 @@ +#include +#include +#include +#include + +#include "common/definitions.h" +#include "common/timer.h" +#include "common/utils.h" +#include "marian.h" +#include "translator/history.h" +#include "translator/output_collector.h" +#include "translator/output_printer.h" +#include "translator/parser.h" +#include "translator/service.h" +#include "translator/translation_result.h" + +void marian_decoder_minimal(const marian::Histories &histories, + marian::Ptr targetVocab, + marian::Ptr options) { + + bool doNbest = options->get("n-best"); + auto collector = + marian::New(options->get("output")); + + // There is a dependency of vocabs here. + auto printer = marian::New(options, targetVocab); + if (options->get("quiet-translation")) + collector->setPrintingStrategy(marian::New()); + + for (auto &history : histories) { + std::stringstream best1; + std::stringstream bestn; + printer->print(history, best1, bestn); + collector->Write((long)history->getLineNum(), best1.str(), bestn.str(), + doNbest); + } +} + +int main(int argc, char *argv[]) { + auto cp = marian::bergamot::createConfigParser(); + auto options = cp.parseOptions(argc, argv, true); + marian::timer::Timer decoderTimer; + + marian::bergamot::Service service(options); + // Read a large input text blob from stdin + std::ostringstream std_input; + std_input << std::cin.rdbuf(); + std::string input = std_input.str(); + using marian::bergamot::TranslationResult; + + // Wait on future until TranslationResult is complete + std::future translation_result_future = + service.translate(std::move(input)); + translation_result_future.wait(); + const TranslationResult &translation_result = translation_result_future.get(); + + marian_decoder_minimal(translation_result.getHistories(), + service.targetVocab(), options); + + LOG(info, "Total time: {:.5f}s wall", decoderTimer.elapsed()); + service.stop(); + return 0; +} diff --git a/src/translator/CMakeLists.txt b/src/translator/CMakeLists.txt index b6fcf69..16c3db9 100644 --- a/src/translator/CMakeLists.txt +++ b/src/translator/CMakeLists.txt @@ -3,7 +3,8 @@ add_library(bergamot-translator STATIC TranslationModel.cpp # Following files added from browsermt/mts@nuke - textops.cpp + text_processor.cpp + sentence_splitter.cpp batch_translator.cpp multifactor_priority.cpp request.cpp @@ -18,3 +19,4 @@ target_include_directories(bergamot-translator PRIVATE ${CMAKE_SOURCE_DIR} PUBLIC ${CMAKE_SOURCE_DIR}/src) + diff --git a/src/translator/batch_translator.cpp b/src/translator/batch_translator.cpp index 6380a00..860255c 100644 --- a/src/translator/batch_translator.cpp +++ b/src/translator/batch_translator.cpp @@ -2,7 +2,6 @@ #include "common/logging.h" #include "data/corpus.h" #include "data/text_input.h" -#include "sanelogging.h" #include "translator/beam_search.h" namespace marian { diff --git a/src/translator/batcher.cpp b/src/translator/batcher.cpp index 22ee46d..2fa4eaf 100644 --- a/src/translator/batcher.cpp +++ b/src/translator/batcher.cpp @@ -1,6 +1,5 @@ #include "batcher.h" #include "common/logging.h" -#include "sanelogging.h" #include namespace marian { diff --git a/src/translator/sanelogging.h b/src/translator/sanelogging.h deleted file mode 100644 index 21f70dd..0000000 --- a/src/translator/sanelogging.h +++ /dev/null @@ -1,44 +0,0 @@ -#ifndef SRC_BERGAMOT_SANELOGGING_H_ -#define SRC_BERGAMOT_SANELOGGING_H_ - -#include "spdlog/spdlog.h" -#include - -namespace marian { - -#define PLOG(worker, level, ...) -#define _PLOG(worker, level, ...) checkedPLog(worker, #level, __VA_ARGS__) - -template -void checkedPLog(std::string logger, std::string level, Args... args) { - Logger log = spdlog::get(logger); - if (!log) { - try { - log = spdlog::daily_logger_st(logger, "logs/" + logger + ".log"); - } catch (const spdlog::spdlog_ex &ex) { - std::cout << "Log initialization failed: " << ex.what() << std::endl; - } - } - - if (level == "trace") - log->trace(args...); - else if (level == "debug") - log->debug(args...); - else if (level == "info") - log->info(args...); - else if (level == "warn") - log->warn(args...); - else if (level == "error") - log->error(args...); - else if (level == "critical") - log->critical(args...); - else { - log->warn("Unknown log level '{}' for logger '{}'", level, logger); - } - // Not required when threads clean-exit. - log->flush(); -} - -} // namespace marian - -#endif // SRC_BERGAMOT_SANELOGGING_H_ diff --git a/src/translator/sentence_splitter.cpp b/src/translator/sentence_splitter.cpp new file mode 100644 index 0000000..0f9be01 --- /dev/null +++ b/src/translator/sentence_splitter.cpp @@ -0,0 +1,52 @@ +#include "common/cli_helper.h" +#include "common/logging.h" +#include "common/options.h" +#include "sentence_splitter.h" +#include + +namespace marian { +namespace bergamot { + +SentenceSplitter::SentenceSplitter(marian::Ptr options) + : options_(options) { + + std::string smode_str = options_->get("ssplit-mode", ""); + mode_ = string2splitmode(smode_str); + std::string ssplit_prefix_file = + options_->get("ssplit-prefix-file", ""); + + if (ssplit_prefix_file.size()) { + ssplit_prefix_file = marian::cli::interpolateEnvVars(ssplit_prefix_file); + + LOG(info, "Loading protected prefixes for sentence splitting from {}", + ssplit_prefix_file); + + ssplit_.load(ssplit_prefix_file); + } else { + LOG(warn, "Missing list of protected prefixes for sentence splitting. " + "Set with --ssplit-prefix-file."); + } +} + +ug::ssplit::SentenceStream +SentenceSplitter::createSentenceStream(const string_view &input) { + return std::move(ug::ssplit::SentenceStream(input.data(), input.size(), + this->ssplit_, mode_)); +} + +ug::ssplit::SentenceStream::splitmode +SentenceSplitter::string2splitmode(const std::string &m) { + typedef ug::ssplit::SentenceStream::splitmode splitmode; + // @TODO: throw Exception on error + if (m == "sentence" || m == "Sentence") + return splitmode::one_sentence_per_line; + if (m == "paragraph" || m == "Paragraph") + return splitmode::one_paragraph_per_line; + if (m != "wrapped_text" && m != "WrappedText" && m != "wrappedText") { + LOG(warn, "Ignoring unknown text input format specification: {}.", m); + } + return splitmode::wrapped_text; +} + +} // namespace bergamot +} // namespace marian diff --git a/src/translator/sentence_splitter.h b/src/translator/sentence_splitter.h new file mode 100644 index 0000000..5175176 --- /dev/null +++ b/src/translator/sentence_splitter.h @@ -0,0 +1,31 @@ +#ifndef SRC_BERGAMOT_SENTENCE_SPLITTER_H_ +#define SRC_BERGAMOT_SENTENCE_SPLITTER_H_ + +#include "common/options.h" +#include "data/types.h" +#include "ssplit.h" +#include + +namespace marian { +namespace bergamot { + +class SentenceSplitter { + // A wrapper around @ugermann's ssplit-cpp compiled from several places in + // mts. Constructed based on options. Used in TextProcessor below to create + // sentence-streams, which provide access to one sentence from blob of text at + // a time. +public: + explicit SentenceSplitter(Ptr options); + ug::ssplit::SentenceStream createSentenceStream(string_view const &input); + +private: + ug::ssplit::SentenceSplitter ssplit_; + Ptr options_; + ug::ssplit::SentenceStream::splitmode mode_; + ug::ssplit::SentenceStream::splitmode string2splitmode(const std::string &m); +}; + +} // namespace bergamot +} // namespace marian + +#endif // SRC_BERGAMOT_SENTENCE_SPLITTER_H_ diff --git a/src/translator/service.cpp b/src/translator/service.cpp index 4a5af30..2acbbdb 100644 --- a/src/translator/service.cpp +++ b/src/translator/service.cpp @@ -1,6 +1,5 @@ #include "service.h" #include "definitions.h" -#include "sanelogging.h" #include #include diff --git a/src/translator/service.h b/src/translator/service.h index 4069d13..0ed8d0c 100644 --- a/src/translator/service.h +++ b/src/translator/service.h @@ -4,7 +4,7 @@ #include "batch_translator.h" #include "batcher.h" #include "pcqueue.h" -#include "textops.h" +#include "text_processor.h" #include "translation_result.h" #include diff --git a/src/translator/textops.cpp b/src/translator/text_processor.cpp similarity index 52% rename from src/translator/textops.cpp rename to src/translator/text_processor.cpp index 25e48f1..8114855 100644 --- a/src/translator/textops.cpp +++ b/src/translator/text_processor.cpp @@ -1,58 +1,17 @@ -#include "textops.h" -#include "common/timer.h" -#include -#include -#include -#include +#include "text_processor.h" +#include "data/types.h" +#include "definitions.h" + +#include "common/options.h" +#include "data/vocab.h" #include namespace marian { namespace bergamot { -SentenceSplitter::SentenceSplitter(marian::Ptr options) - : options_(options) { - - std::string smode_str = options_->get("ssplit-mode", ""); - mode_ = string2splitmode(smode_str); - std::string ssplit_prefix_file = - options_->get("ssplit-prefix-file", ""); - - if (ssplit_prefix_file.size()) { - ssplit_prefix_file = marian::cli::interpolateEnvVars(ssplit_prefix_file); - - LOG(info, "Loading protected prefixes for sentence splitting from {}", - ssplit_prefix_file); - - ssplit_.load(ssplit_prefix_file); - } else { - LOG(warn, "Missing list of protected prefixes for sentence splitting. " - "Set with --ssplit-prefix-file."); - } -} - -ug::ssplit::SentenceStream -SentenceSplitter::createSentenceStream(const string_view &input) { - pcrecpp::StringPiece spiece(input.begin(), input.size()); - return std::move(ug::ssplit::SentenceStream(spiece, this->ssplit_, mode_)); -} - -ug::ssplit::SentenceStream::splitmode -SentenceSplitter::string2splitmode(const std::string &m) { - typedef ug::ssplit::SentenceStream::splitmode splitmode; - // @TODO: throw Exception on error - if (m == "sentence" || m == "Sentence") - return splitmode::one_sentence_per_line; - if (m == "paragraph" || m == "Paragraph") - return splitmode::one_paragraph_per_line; - if (m != "wrapped_text" && m != "WrappedText" && m != "wrappedText") { - LOG(warn, "Ignoring unknown text input format specification: {}.", m); - } - return splitmode::wrapped_text; -} - Segment TextProcessor::tokenize(const string_view &segment, TokenRanges &tokenRanges) { - return vocabs_->front()->encodePreservingSource( + return vocabs_->front()->encodeWithByteRanges( segment, tokenRanges, /*addEOS=*/false, /*inference=*/true); } @@ -70,11 +29,11 @@ void TextProcessor::process(const string_view &query, Segments &segments, std::vector &sourceRanges) { auto sentenceStream = sentence_splitter_.createSentenceStream(query); - pcrecpp::StringPiece sentenceStringPiece; + std::string_view sentenceStringPiece; while (sentenceStream >> sentenceStringPiece) { - string_view sentence(sentenceStringPiece.data(), - sentenceStringPiece.size()); + marian::string_view sentence(sentenceStringPiece.data(), + sentenceStringPiece.size()); TokenRanges tokenRanges; Segment segment = tokenize(sentence, tokenRanges); diff --git a/src/translator/textops.h b/src/translator/text_processor.h similarity index 56% rename from src/translator/textops.h rename to src/translator/text_processor.h index 79a5040..111ae00 100644 --- a/src/translator/textops.h +++ b/src/translator/text_processor.h @@ -1,40 +1,17 @@ -#ifndef SRC_BERGAMOT_TEXTOPS_H_ -#define SRC_BERGAMOT_TEXTOPS_H_ +#ifndef SRC_BERGAMOT_TEXT_PROCESSOR_H_ +#define SRC_BERGAMOT_TEXT_PROCESSOR_H_ -#include "common/definitions.h" -#include "common/logging.h" -#include "common/options.h" -#include "common/types.h" // missing in shortlist.h -#include "common/utils.h" -#include "data/sentencepiece_vocab.h" -#include "data/shortlist.h" +#include "data/types.h" +#include "data/vocab.h" #include "definitions.h" -#include "ssplit.h" -#include -#include -#include +#include "sentence_splitter.h" + #include namespace marian { namespace bergamot { -class SentenceSplitter { - // A wrapper around @ugermann's ssplit-cpp compiled from several places in - // mts. Constructed based on options. Used in TextProcessor below to create - // sentence-streams, which provide access to one sentence from blob of text at - // a time. -public: - explicit SentenceSplitter(Ptr options); - ug::ssplit::SentenceStream createSentenceStream(string_view const &input); - -private: - ug::ssplit::SentenceSplitter ssplit_; - Ptr options_; - ug::ssplit::SentenceStream::splitmode mode_; - ug::ssplit::SentenceStream::splitmode string2splitmode(const std::string &m); -}; - class TextProcessor { // TextProcessor handles loading the sentencepiece vocabulary and also // contains an instance of sentence-splitter based on ssplit. @@ -68,4 +45,4 @@ private: } // namespace bergamot } // namespace marian -#endif // SRC_BERGAMOT_TEXTOPS_H_ +#endif // SRC_BERGAMOT_TEXT_PROCESSOR_H_