Updates: marian-dev, ssplit for marian-decoder-new

Updates the marian-dev and ssplit submodules to point to the upstream commits that implement the following: - marian-dev: encodeWithByteRanges(...) to get source-token byte ranges - ssplit: has a trivial sentence-splitter functionality implemented, and is now faster to benchmark with marian-decoder. This enables a marian-decoder replacement, written through ssplit in this source tree, to be benchmarked continuously against the existing marian-decoder. Nits: Removes the logging introduced for multiple workers, along with the respective log statements.
2024-09-11 05:35:33 +03:00 · 2021-02-05 12:55:57 +00:00 · 2021-02-05 12:55:57 +00:00 · 38e8b3cd6d
commit 38e8b3cd6d
parent 2929077324
16 changed files with 186 additions and 145 deletions
--- a/.gitignore
+++ b/.gitignore
@ -2,3 +2,17 @@
 *.swp
 *.swo

+# CMake
+CMakeLists.txt.user
+CMakeCache.txt
+CMakeFiles
+CMakeScripts
+Testing
+Makefile
+cmake_install.cmake
+install_manifest.txt
+compile_commands.json
+CTestTestfile.cmake
+_deps
+
+
--- a/3rd_party/marian-dev
+++ b/3rd_party/marian-dev
@ -1 +1 @@
-Subproject commit ee56e02f0525a4651157a07f74b44f456db14c8c
+Subproject commit 2f65280459737c37c270e4ad0b6d41de215d11e0
--- a/3rd_party/ssplit-cpp
+++ b/3rd_party/ssplit-cpp
@ -1 +1 @@
-Subproject commit f5d022992f4a00c860eb809389748908bb85ffcf
+Subproject commit 01e71b4964fdc351f932a7a23cab4cb80b9698e8
--- a/app/CMakeLists.txt
+++ b/app/CMakeLists.txt
@ -3,3 +3,6 @@ target_link_libraries(bergamot-translator-app PRIVATE bergamot-translator)

 add_executable(service-cli main-mts.cpp)
 target_link_libraries(service-cli PRIVATE bergamot-translator)
+
+add_executable(marian-decoder-new marian-decoder-new.cpp)
+target_link_libraries(marian-decoder-new PRIVATE bergamot-translator)
--- a/app/main-mts.cpp
+++ b/app/main-mts.cpp
@ -26,21 +26,8 @@ int main(int argc, char *argv[]) {
      service.translate(std::move(input));
  translation_result_future.wait();
  const TranslationResult &translation_result = translation_result_future.get();
-
-  std::cout << "service-cli [Source text]: ";
-  std::cout << translation_result.getOriginalText() << std::endl;
-
-  std::cout << "service-cli [Translated text]: ";
  std::cout << translation_result.getTranslatedText() << std::endl;

-  // Obtain sentenceMappings and print them as Proof of Concept.
-  const TranslationResult::SentenceMappings &sentenceMappings =
-      translation_result.getSentenceMappings();
-  for (auto &p : sentenceMappings) {
-    std::cout << "service-cli [src] " << p.first << "\n";
-    std::cout << "service-cli [tgt] " << p.second << "\n";
-  }
-
  // Stop Service.
  service.stop();
  return 0;
--- a/app/marian-decoder-new.cpp
+++ b/app/marian-decoder-new.cpp
@ -0,0 +1,63 @@
+#include <cstdlib>
+#include <future>
+#include <iostream>
+#include <sstream>
+
+#include "common/definitions.h"
+#include "common/timer.h"
+#include "common/utils.h"
+#include "marian.h"
+#include "translator/history.h"
+#include "translator/output_collector.h"
+#include "translator/output_printer.h"
+#include "translator/parser.h"
+#include "translator/service.h"
+#include "translator/translation_result.h"
+
+// Renders every translation history through marian's OutputPrinter and hands
+// the resulting strings to an OutputCollector.
+//
+//   histories   - translation histories to print, one per input line.
+//   targetVocab - vocabulary handed to the OutputPrinter.
+//   options     - supplies "n-best", "output" and "quiet-translation".
+void marian_decoder_minimal(const marian::Histories &histories,
+                            marian::Ptr<marian::Vocab const> targetVocab,
+                            marian::Ptr<marian::Options> options) {
+
+  const bool wantNbest = options->get<bool>("n-best");
+  auto sink =
+      marian::New<marian::OutputCollector>(options->get<std::string>("output"));
+
+  // The printer is constructed from the options and the target vocabulary.
+  auto renderer = marian::New<marian::OutputPrinter>(options, targetVocab);
+  if (options->get<bool>("quiet-translation")) {
+    sink->setPrintingStrategy(marian::New<marian::QuietPrinting>());
+  }
+
+  for (auto &history : histories) {
+    std::stringstream oneBest;
+    std::stringstream nBest;
+    renderer->print(history, oneBest, nBest);
+
+    // The collector is keyed by the history's line number.
+    const long lineNumber = static_cast<long>(history->getLineNum());
+    sink->Write(lineNumber, oneBest.str(), nBest.str(), wantNbest);
+  }
+}
+
+// Entry point: reads all of stdin, translates it through the bergamot Service
+// and prints the resulting histories via marian_decoder_minimal(), logging the
+// elapsed wall time at the end.
+int main(int argc, char *argv[]) {
+  using marian::bergamot::TranslationResult;
+
+  auto configParser = marian::bergamot::createConfigParser();
+  auto options = configParser.parseOptions(argc, argv, true);
+
+  // Started after option parsing, so the reported time covers service
+  // construction, reading stdin, translation and printing.
+  marian::timer::Timer wallClock;
+
+  marian::bergamot::Service service(options);
+
+  // Slurp the whole of stdin into a single text blob.
+  std::ostringstream stdinBuffer;
+  stdinBuffer << std::cin.rdbuf();
+  std::string sourceText = stdinBuffer.str();
+
+  // Block on the future until the TranslationResult is complete.
+  std::future<TranslationResult> pending =
+      service.translate(std::move(sourceText));
+  pending.wait();
+  const TranslationResult &result = pending.get();
+
+  marian_decoder_minimal(result.getHistories(), service.targetVocab(),
+                         options);
+
+  LOG(info, "Total time: {:.5f}s wall", wallClock.elapsed());
+  service.stop();
+  return 0;
+}
--- a/src/translator/CMakeLists.txt
+++ b/src/translator/CMakeLists.txt
@ -3,7 +3,8 @@ add_library(bergamot-translator STATIC
    TranslationModel.cpp

    # Following files added from browsermt/mts@nuke
-    textops.cpp
+    text_processor.cpp
+    sentence_splitter.cpp
    batch_translator.cpp 
    multifactor_priority.cpp 
    request.cpp 
@ -18,3 +19,4 @@ target_include_directories(bergamot-translator
    PRIVATE ${CMAKE_SOURCE_DIR}
    PUBLIC ${CMAKE_SOURCE_DIR}/src)

+
--- a/src/translator/batch_translator.cpp
+++ b/src/translator/batch_translator.cpp
@ -2,7 +2,6 @@
 #include "common/logging.h"
 #include "data/corpus.h"
 #include "data/text_input.h"
-#include "sanelogging.h"
 #include "translator/beam_search.h"

 namespace marian {
--- a/src/translator/batcher.cpp
+++ b/src/translator/batcher.cpp
@ -1,6 +1,5 @@
 #include "batcher.h"
 #include "common/logging.h"
-#include "sanelogging.h"
 #include <cassert>

 namespace marian {
--- a/src/translator/sanelogging.h
+++ b/src/translator/sanelogging.h
@ -1,44 +0,0 @@
-#ifndef SRC_BERGAMOT_SANELOGGING_H_
-#define SRC_BERGAMOT_SANELOGGING_H_
-
-#include "spdlog/spdlog.h"
-#include <iostream>
-
-namespace marian {
-
-#define PLOG(worker, level, ...)
-#define _PLOG(worker, level, ...) checkedPLog(worker, #level, __VA_ARGS__)
-
-template <class... Args>
-void checkedPLog(std::string logger, std::string level, Args... args) {
-  Logger log = spdlog::get(logger);
-  if (!log) {
-    try {
-      log = spdlog::daily_logger_st(logger, "logs/" + logger + ".log");
-    } catch (const spdlog::spdlog_ex &ex) {
-      std::cout << "Log initialization failed: " << ex.what() << std::endl;
-    }
-  }
-
-  if (level == "trace")
-    log->trace(args...);
-  else if (level == "debug")
-    log->debug(args...);
-  else if (level == "info")
-    log->info(args...);
-  else if (level == "warn")
-    log->warn(args...);
-  else if (level == "error")
-    log->error(args...);
-  else if (level == "critical")
-    log->critical(args...);
-  else {
-    log->warn("Unknown log level '{}' for logger '{}'", level, logger);
-  }
-  // Not required when threads clean-exit.
-  log->flush();
-}
-
-} // namespace marian
-
-#endif // SRC_BERGAMOT_SANELOGGING_H_
--- a/src/translator/sentence_splitter.cpp
+++ b/src/translator/sentence_splitter.cpp
@ -0,0 +1,52 @@
+#include "common/cli_helper.h"
+#include "common/logging.h"
+#include "common/options.h"
+#include "sentence_splitter.h"
+#include <string>
+
+namespace marian {
+namespace bergamot {
+
+// Configures the splitter from options: "ssplit-mode" selects the split mode
+// and "ssplit-prefix-file" (optional) names the protected-prefix list to load.
+// Both options default to the empty string when absent.
+SentenceSplitter::SentenceSplitter(marian::Ptr<marian::Options> options)
+    : options_(options) {
+
+  mode_ = string2splitmode(options_->get<std::string>("ssplit-mode", ""));
+
+  std::string prefixFile = options_->get<std::string>("ssplit-prefix-file", "");
+  if (prefixFile.empty()) {
+    // No prefix list configured: warn and skip loading.
+    LOG(warn, "Missing list of protected prefixes for sentence splitting. "
+              "Set with --ssplit-prefix-file.");
+    return;
+  }
+
+  // Environment variables in the configured path are interpolated before use.
+  prefixFile = marian::cli::interpolateEnvVars(prefixFile);
+
+  LOG(info, "Loading protected prefixes for sentence splitting from {}",
+      prefixFile);
+
+  ssplit_.load(prefixFile);
+}
+
+// Creates a sentence stream over `input`, using this splitter's loaded prefix
+// data and configured split mode.
+//
+// NOTE(review): the stream is constructed from input.data()/input.size(), so
+// it presumably refers to the caller's buffer rather than copying it; confirm
+// against ssplit and keep `input`'s storage alive while the stream is in use.
+ug::ssplit::SentenceStream
+SentenceSplitter::createSentenceStream(const string_view &input) {
+  // Return the temporary directly. Wrapping it in std::move() would turn the
+  // prvalue into an xvalue, which disables copy elision and forces an extra
+  // move construction.
+  return ug::ssplit::SentenceStream(input.data(), input.size(), this->ssplit_,
+                                    mode_);
+}
+
+// Maps the textual split-mode option onto ssplit's splitmode enum.
+// "sentence"/"Sentence" and "paragraph"/"Paragraph" select the per-line modes;
+// everything else yields wrapped_text, with a warning unless the value is one
+// of the accepted wrapped-text spellings.
+ug::ssplit::SentenceStream::splitmode
+SentenceSplitter::string2splitmode(const std::string &m) {
+  using splitmode = ug::ssplit::SentenceStream::splitmode;
+  // @TODO: throw Exception on error
+  if (m == "sentence" || m == "Sentence") {
+    return splitmode::one_sentence_per_line;
+  }
+  if (m == "paragraph" || m == "Paragraph") {
+    return splitmode::one_paragraph_per_line;
+  }
+
+  const bool isWrappedText =
+      m == "wrapped_text" || m == "WrappedText" || m == "wrappedText";
+  if (!isWrappedText) {
+    LOG(warn, "Ignoring unknown text input format specification: {}.", m);
+  }
+  return splitmode::wrapped_text;
+}
+
+} // namespace bergamot
+} // namespace marian
--- a/src/translator/sentence_splitter.h
+++ b/src/translator/sentence_splitter.h
@ -0,0 +1,31 @@
+#ifndef SRC_BERGAMOT_SENTENCE_SPLITTER_H_
+#define SRC_BERGAMOT_SENTENCE_SPLITTER_H_
+
+#include "common/options.h"
+#include "data/types.h"
+#include "ssplit.h"
+#include <string>
+
+namespace marian {
+namespace bergamot {
+
+class SentenceSplitter {
+  // A wrapper around @ugermann's ssplit-cpp compiled from several places in
+  // mts. Constructed based on options. Used by TextProcessor to create
+  // sentence-streams, which provide access to one sentence from blob of text at
+  // a time.
+public:
+  // Reads the "ssplit-mode" and "ssplit-prefix-file" options; when a prefix
+  // file is given it is loaded into the underlying ssplit splitter.
+  explicit SentenceSplitter(Ptr<Options> options);
+  // Returns a sentence stream over `input`, built with the loaded prefix data
+  // and the configured split mode.
+  ug::ssplit::SentenceStream createSentenceStream(string_view const &input);
+
+private:
+  // Underlying ssplit-cpp splitter; holds the protected prefixes when loaded.
+  ug::ssplit::SentenceSplitter ssplit_;
+  // Options this splitter was constructed from.
+  Ptr<Options> options_;
+  // Split mode derived from the "ssplit-mode" option.
+  ug::ssplit::SentenceStream::splitmode mode_;
+  // Converts the textual mode to the enum; unrecognised values fall back to
+  // wrapped_text.
+  ug::ssplit::SentenceStream::splitmode string2splitmode(const std::string &m);
+};
+
+} // namespace bergamot
+} // namespace marian
+
+#endif //  SRC_BERGAMOT_SENTENCE_SPLITTER_H_
--- a/src/translator/service.cpp
+++ b/src/translator/service.cpp
@ -1,6 +1,5 @@
 #include "service.h"
 #include "definitions.h"
-#include "sanelogging.h"

 #include <string>
 #include <utility>
--- a/src/translator/service.h
+++ b/src/translator/service.h
@ -4,7 +4,7 @@
 #include "batch_translator.h"
 #include "batcher.h"
 #include "pcqueue.h"
-#include "textops.h"
+#include "text_processor.h"
 #include "translation_result.h"

 #include <queue>
--- a/src/translator/text_processor.cpp
+++ b/src/translator/text_processor.cpp
@ -1,58 +1,17 @@
-#include "textops.h"
-#include "common/timer.h"
-#include <pcrecpp.h>
-#include <string>
-#include <unordered_map>
-#include <utility>
+#include "text_processor.h"
+#include "data/types.h"
+#include "definitions.h"
+
+#include "common/options.h"
+#include "data/vocab.h"
 #include <vector>

 namespace marian {
 namespace bergamot {

-SentenceSplitter::SentenceSplitter(marian::Ptr<marian::Options> options)
-    : options_(options) {
-
-  std::string smode_str = options_->get<std::string>("ssplit-mode", "");
-  mode_ = string2splitmode(smode_str);
-  std::string ssplit_prefix_file =
-      options_->get<std::string>("ssplit-prefix-file", "");
-
-  if (ssplit_prefix_file.size()) {
-    ssplit_prefix_file = marian::cli::interpolateEnvVars(ssplit_prefix_file);
-
-    LOG(info, "Loading protected prefixes for sentence splitting from {}",
-        ssplit_prefix_file);
-
-    ssplit_.load(ssplit_prefix_file);
-  } else {
-    LOG(warn, "Missing list of protected prefixes for sentence splitting. "
-              "Set with --ssplit-prefix-file.");
-  }
-}
-
-ug::ssplit::SentenceStream
-SentenceSplitter::createSentenceStream(const string_view &input) {
-  pcrecpp::StringPiece spiece(input.begin(), input.size());
-  return std::move(ug::ssplit::SentenceStream(spiece, this->ssplit_, mode_));
-}
-
-ug::ssplit::SentenceStream::splitmode
-SentenceSplitter::string2splitmode(const std::string &m) {
-  typedef ug::ssplit::SentenceStream::splitmode splitmode;
-  // @TODO: throw Exception on error
-  if (m == "sentence" || m == "Sentence")
-    return splitmode::one_sentence_per_line;
-  if (m == "paragraph" || m == "Paragraph")
-    return splitmode::one_paragraph_per_line;
-  if (m != "wrapped_text" && m != "WrappedText" && m != "wrappedText") {
-    LOG(warn, "Ignoring unknown text input format specification: {}.", m);
-  }
-  return splitmode::wrapped_text;
-}
-
 Segment TextProcessor::tokenize(const string_view &segment,
                                TokenRanges &tokenRanges) {
-  return vocabs_->front()->encodePreservingSource(
+  return vocabs_->front()->encodeWithByteRanges(
      segment, tokenRanges, /*addEOS=*/false, /*inference=*/true);
 }

@ -70,11 +29,11 @@ void TextProcessor::process(const string_view &query, Segments &segments,
                            std::vector<TokenRanges> &sourceRanges) {

  auto sentenceStream = sentence_splitter_.createSentenceStream(query);
-  pcrecpp::StringPiece sentenceStringPiece;
+  std::string_view sentenceStringPiece;

  while (sentenceStream >> sentenceStringPiece) {
-    string_view sentence(sentenceStringPiece.data(),
-                         sentenceStringPiece.size());
+    marian::string_view sentence(sentenceStringPiece.data(),
+                                 sentenceStringPiece.size());
    TokenRanges tokenRanges;
    Segment segment = tokenize(sentence, tokenRanges);

--- a/src/translator/text_processor.h
+++ b/src/translator/text_processor.h
@ -1,40 +1,17 @@
-#ifndef SRC_BERGAMOT_TEXTOPS_H_
-#define SRC_BERGAMOT_TEXTOPS_H_
+#ifndef SRC_BERGAMOT_TEXT_PROCESSOR_H_
+#define SRC_BERGAMOT_TEXT_PROCESSOR_H_

-#include "common/definitions.h"
-#include "common/logging.h"
-#include "common/options.h"
-#include "common/types.h" // missing in shortlist.h
-#include "common/utils.h"
-#include "data/sentencepiece_vocab.h"
-#include "data/shortlist.h"
+#include "data/types.h"
+#include "data/vocab.h"
 #include "definitions.h"
-#include "ssplit.h"

-#include <cassert>
-#include <iostream>
-#include <string>
+#include "sentence_splitter.h"
+
 #include <vector>

 namespace marian {
 namespace bergamot {

-class SentenceSplitter {
-  // A wrapper around @ugermann's ssplit-cpp compiled from several places in
-  // mts. Constructed based on options. Used in TextProcessor below to create
-  // sentence-streams, which provide access to one sentence from blob of text at
-  // a time.
-public:
-  explicit SentenceSplitter(Ptr<Options> options);
-  ug::ssplit::SentenceStream createSentenceStream(string_view const &input);
-
-private:
-  ug::ssplit::SentenceSplitter ssplit_;
-  Ptr<Options> options_;
-  ug::ssplit::SentenceStream::splitmode mode_;
-  ug::ssplit::SentenceStream::splitmode string2splitmode(const std::string &m);
-};
-
 class TextProcessor {
  // TextProcessor handles loading the sentencepiece vocabulary and also
  // contains an instance of sentence-splitter based on ssplit.
@ -68,4 +45,4 @@ private:
 } // namespace bergamot
 } // namespace marian

-#endif // SRC_BERGAMOT_TEXTOPS_H_
+#endif // SRC_BERGAMOT_TEXT_PROCESSOR_H_