Updates: marian-dev, ssplit for marian-decoder-new

Updates marian-dev and ssplit submodules to point to the upstream
commits which implement the following:

 - marian-dev: encodeWithByteRanges(...) to get source token byte-ranges
 - ssplit: Has trivial sentence-splitter functionality implemented, and
   is now faster to benchmark with marian-decoder.

This enables a marian-decoder replacement written through ssplit in this
source to be benchmarked constantly with existing marian-decoder.

Nits: Removes logging introduced for multiple workers, and respective
log statements.
This commit is contained in:
Jerin Philip 2021-02-05 12:55:57 +00:00
parent 2929077324
commit 38e8b3cd6d
16 changed files with 186 additions and 145 deletions

14
.gitignore vendored
View File

@ -2,3 +2,17 @@
*.swp
*.swo
# CMake
CMakeLists.txt.user
CMakeCache.txt
CMakeFiles
CMakeScripts
Testing
Makefile
cmake_install.cmake
install_manifest.txt
compile_commands.json
CTestTestfile.cmake
_deps

@ -1 +1 @@
Subproject commit ee56e02f0525a4651157a07f74b44f456db14c8c
Subproject commit 2f65280459737c37c270e4ad0b6d41de215d11e0

@ -1 +1 @@
Subproject commit f5d022992f4a00c860eb809389748908bb85ffcf
Subproject commit 01e71b4964fdc351f932a7a23cab4cb80b9698e8

View File

@ -3,3 +3,6 @@ target_link_libraries(bergamot-translator-app PRIVATE bergamot-translator)
add_executable(service-cli main-mts.cpp)
target_link_libraries(service-cli PRIVATE bergamot-translator)
add_executable(marian-decoder-new marian-decoder-new.cpp)
target_link_libraries(marian-decoder-new PRIVATE bergamot-translator)

View File

@ -26,21 +26,8 @@ int main(int argc, char *argv[]) {
service.translate(std::move(input));
translation_result_future.wait();
const TranslationResult &translation_result = translation_result_future.get();
std::cout << "service-cli [Source text]: ";
std::cout << translation_result.getOriginalText() << std::endl;
std::cout << "service-cli [Translated text]: ";
std::cout << translation_result.getTranslatedText() << std::endl;
// Obtain sentenceMappings and print them as Proof of Concept.
const TranslationResult::SentenceMappings &sentenceMappings =
translation_result.getSentenceMappings();
for (auto &p : sentenceMappings) {
std::cout << "service-cli [src] " << p.first << "\n";
std::cout << "service-cli [tgt] " << p.second << "\n";
}
// Stop Service.
service.stop();
return 0;

View File

@ -0,0 +1,63 @@
#include <cstdlib>
#include <future>
#include <iostream>
#include <sstream>
#include "common/definitions.h"
#include "common/timer.h"
#include "common/utils.h"
#include "marian.h"
#include "translator/history.h"
#include "translator/output_collector.h"
#include "translator/output_printer.h"
#include "translator/parser.h"
#include "translator/service.h"
#include "translator/translation_result.h"
// Prints every history in `histories` through marian's OutputPrinter and
// hands the resulting text to an OutputCollector writing to the "output"
// option.
//
// histories:   translation histories to print, one Write() call per entry.
// targetVocab: vocabulary handed to the OutputPrinter.
// options:     supplies "n-best", "output" and "quiet-translation".
void marian_decoder_minimal(const marian::Histories &histories,
                            marian::Ptr<marian::Vocab const> targetVocab,
                            marian::Ptr<marian::Options> options) {
  const bool nbestRequested = options->get<bool>("n-best");

  auto outputCollector =
      marian::New<marian::OutputCollector>(options->get<std::string>("output"));

  // The printer is constructed from the target vocabulary.
  auto outputPrinter = marian::New<marian::OutputPrinter>(options, targetVocab);

  if (options->get<bool>("quiet-translation")) {
    outputCollector->setPrintingStrategy(marian::New<marian::QuietPrinting>());
  }

  for (const auto &entry : histories) {
    std::stringstream topHypothesis;
    std::stringstream nbestList;
    outputPrinter->print(entry, topHypothesis, nbestList);

    // The line number from the history is passed as the id of the write.
    const long lineNumber = static_cast<long>(entry->getLineNum());
    outputCollector->Write(lineNumber, topHypothesis.str(), nbestList.str(),
                           nbestRequested);
  }
}
int main(int argc, char *argv[]) {
auto cp = marian::bergamot::createConfigParser();
auto options = cp.parseOptions(argc, argv, true);
marian::timer::Timer decoderTimer;
marian::bergamot::Service service(options);
// Read a large input text blob from stdin
std::ostringstream std_input;
std_input << std::cin.rdbuf();
std::string input = std_input.str();
using marian::bergamot::TranslationResult;
// Wait on future until TranslationResult is complete
std::future<TranslationResult> translation_result_future =
service.translate(std::move(input));
translation_result_future.wait();
const TranslationResult &translation_result = translation_result_future.get();
marian_decoder_minimal(translation_result.getHistories(),
service.targetVocab(), options);
LOG(info, "Total time: {:.5f}s wall", decoderTimer.elapsed());
service.stop();
return 0;
}

View File

@ -3,7 +3,8 @@ add_library(bergamot-translator STATIC
TranslationModel.cpp
# Following files added from browsermt/mts@nuke
textops.cpp
text_processor.cpp
sentence_splitter.cpp
batch_translator.cpp
multifactor_priority.cpp
request.cpp
@ -18,3 +19,4 @@ target_include_directories(bergamot-translator
PRIVATE ${CMAKE_SOURCE_DIR}
PUBLIC ${CMAKE_SOURCE_DIR}/src)

View File

@ -2,7 +2,6 @@
#include "common/logging.h"
#include "data/corpus.h"
#include "data/text_input.h"
#include "sanelogging.h"
#include "translator/beam_search.h"
namespace marian {

View File

@ -1,6 +1,5 @@
#include "batcher.h"
#include "common/logging.h"
#include "sanelogging.h"
#include <cassert>
namespace marian {

View File

@ -1,44 +0,0 @@
#ifndef SRC_BERGAMOT_SANELOGGING_H_
#define SRC_BERGAMOT_SANELOGGING_H_
#include "spdlog/spdlog.h"
#include <iostream>
namespace marian {
#define PLOG(worker, level, ...)
#define _PLOG(worker, level, ...) checkedPLog(worker, #level, __VA_ARGS__)
// Logs `args...` at the level named by `level` on the spdlog logger named
// `logger`, creating that logger on first use.
//
// logger: name of the spdlog logger; when it does not exist yet, a daily
//         logger writing to "logs/<logger>.log" is created.
// level:  one of "trace", "debug", "info", "warn", "error", "critical". Any
//         other value produces a warning on the same logger instead.
// args:   forwarded unchanged to the spdlog call (format string + values).
//
// If the logger cannot be created, the failure is reported on stdout and the
// message is dropped rather than dereferencing a null logger.
template <class... Args>
void checkedPLog(std::string logger, std::string level, Args... args) {
  Logger log = spdlog::get(logger);
  if (!log) {
    try {
      log = spdlog::daily_logger_st(logger, "logs/" + logger + ".log");
    } catch (const spdlog::spdlog_ex &ex) {
      std::cout << "Log initialization failed: " << ex.what() << std::endl;
    }
  }

  // Creation may have thrown above, in which case `log` is still empty and
  // nothing can be written.
  if (!log) {
    return;
  }

  if (level == "trace")
    log->trace(args...);
  else if (level == "debug")
    log->debug(args...);
  else if (level == "info")
    log->info(args...);
  else if (level == "warn")
    log->warn(args...);
  else if (level == "error")
    log->error(args...);
  else if (level == "critical")
    log->critical(args...);
  else {
    log->warn("Unknown log level '{}' for logger '{}'", level, logger);
  }
  // Not required when threads clean-exit.
  log->flush();
}
} // namespace marian
#endif // SRC_BERGAMOT_SANELOGGING_H_

View File

@ -0,0 +1,52 @@
#include "common/cli_helper.h"
#include "common/logging.h"
#include "common/options.h"
#include "sentence_splitter.h"
#include <string>
namespace marian {
namespace bergamot {
// Configures the splitter from `options`: "ssplit-mode" selects the split
// mode and "ssplit-prefix-file" (optional) names the protected-prefix list.
// A missing prefix file only produces a warning.
SentenceSplitter::SentenceSplitter(marian::Ptr<marian::Options> options)
    : options_(options) {
  mode_ = string2splitmode(options_->get<std::string>("ssplit-mode", ""));

  std::string prefixFile =
      options_->get<std::string>("ssplit-prefix-file", "");
  if (prefixFile.empty()) {
    LOG(warn, "Missing list of protected prefixes for sentence splitting. "
              "Set with --ssplit-prefix-file.");
    return;
  }

  // Environment variables in the configured path are expanded before loading.
  prefixFile = marian::cli::interpolateEnvVars(prefixFile);
  LOG(info, "Loading protected prefixes for sentence splitting from {}",
      prefixFile);
  ssplit_.load(prefixFile);
}
// Creates a SentenceStream over `input` using this splitter's loaded
// prefixes and configured split mode.
//
// The stream is constructed from input.data()/input.size().
// NOTE(review): presumably the stream only references those bytes, so `input`
// must outlive the returned stream - confirm against ssplit.
//
// The temporary is returned directly: wrapping a prvalue in std::move would
// prevent copy elision and force a move construction.
ug::ssplit::SentenceStream
SentenceSplitter::createSentenceStream(const string_view &input) {
  return ug::ssplit::SentenceStream(input.data(), input.size(), this->ssplit_,
                                    mode_);
}
// Maps the textual "ssplit-mode" value onto an ssplit split mode.
//   "sentence" / "Sentence"    -> one_sentence_per_line
//   "paragraph" / "Paragraph"  -> one_paragraph_per_line
//   anything else              -> wrapped_text, with a warning unless the
//                                 value is one of the wrapped-text spellings.
ug::ssplit::SentenceStream::splitmode
SentenceSplitter::string2splitmode(const std::string &m) {
  using SplitMode = ug::ssplit::SentenceStream::splitmode;
  // @TODO: throw Exception on error
  if (m == "sentence" || m == "Sentence") {
    return SplitMode::one_sentence_per_line;
  }
  if (m == "paragraph" || m == "Paragraph") {
    return SplitMode::one_paragraph_per_line;
  }

  const bool isWrappedText =
      (m == "wrapped_text" || m == "WrappedText" || m == "wrappedText");
  if (!isWrappedText) {
    LOG(warn, "Ignoring unknown text input format specification: {}.", m);
  }
  return SplitMode::wrapped_text;
}
} // namespace bergamot
} // namespace marian

View File

@ -0,0 +1,31 @@
#ifndef SRC_BERGAMOT_SENTENCE_SPLITTER_H_
#define SRC_BERGAMOT_SENTENCE_SPLITTER_H_
#include "common/options.h"
#include "data/types.h"
#include "ssplit.h"
#include <string>
namespace marian {
namespace bergamot {
class SentenceSplitter {
// A wrapper around @ugermann's ssplit-cpp compiled from several places in
// mts. Constructed based on options. Used by TextProcessor to create
// sentence-streams, which provide access to one sentence from a blob of text
// at a time.
public:
// Configures the splitter from `options`; the implementation reads the
// "ssplit-mode" and "ssplit-prefix-file" entries.
explicit SentenceSplitter(Ptr<Options> options);
// Returns a sentence stream over `input` that uses this splitter's prefixes
// and split mode.
// NOTE(review): presumably `input` must outlive the returned stream - confirm
// against ssplit.
ug::ssplit::SentenceStream createSentenceStream(string_view const &input);
private:
// Underlying ssplit splitter; protected prefixes are loaded into it.
ug::ssplit::SentenceSplitter ssplit_;
// Options this splitter was constructed from.
Ptr<Options> options_;
// Split mode parsed from the options at construction time.
ug::ssplit::SentenceStream::splitmode mode_;
// Converts the textual split-mode option into the ssplit enum value.
ug::ssplit::SentenceStream::splitmode string2splitmode(const std::string &m);
};
} // namespace bergamot
} // namespace marian
#endif // SRC_BERGAMOT_SENTENCE_SPLITTER_H_

View File

@ -1,6 +1,5 @@
#include "service.h"
#include "definitions.h"
#include "sanelogging.h"
#include <string>
#include <utility>

View File

@ -4,7 +4,7 @@
#include "batch_translator.h"
#include "batcher.h"
#include "pcqueue.h"
#include "textops.h"
#include "text_processor.h"
#include "translation_result.h"
#include <queue>

View File

@ -1,58 +1,17 @@
#include "textops.h"
#include "common/timer.h"
#include <pcrecpp.h>
#include <string>
#include <unordered_map>
#include <utility>
#include "text_processor.h"
#include "data/types.h"
#include "definitions.h"
#include "common/options.h"
#include "data/vocab.h"
#include <vector>
namespace marian {
namespace bergamot {
// Configures the splitter from `options`: "ssplit-mode" selects the split
// mode and "ssplit-prefix-file" (optional) names the protected-prefix list.
// A missing prefix file only produces a warning.
SentenceSplitter::SentenceSplitter(marian::Ptr<marian::Options> options)
    : options_(options) {
  mode_ = string2splitmode(options_->get<std::string>("ssplit-mode", ""));

  std::string prefixFile =
      options_->get<std::string>("ssplit-prefix-file", "");
  if (prefixFile.empty()) {
    LOG(warn, "Missing list of protected prefixes for sentence splitting. "
              "Set with --ssplit-prefix-file.");
    return;
  }

  // Environment variables in the configured path are expanded before loading.
  prefixFile = marian::cli::interpolateEnvVars(prefixFile);
  LOG(info, "Loading protected prefixes for sentence splitting from {}",
      prefixFile);
  ssplit_.load(prefixFile);
}
// Creates a SentenceStream over `input` using this splitter's loaded
// prefixes and configured split mode. The text is handed to ssplit as a
// pcrecpp::StringPiece built from input.begin()/input.size().
//
// The temporary is returned directly: wrapping a prvalue in std::move would
// prevent copy elision and force a move construction.
ug::ssplit::SentenceStream
SentenceSplitter::createSentenceStream(const string_view &input) {
  pcrecpp::StringPiece spiece(input.begin(), input.size());
  return ug::ssplit::SentenceStream(spiece, this->ssplit_, mode_);
}
// Maps the textual "ssplit-mode" value onto an ssplit split mode.
//   "sentence" / "Sentence"    -> one_sentence_per_line
//   "paragraph" / "Paragraph"  -> one_paragraph_per_line
//   anything else              -> wrapped_text, with a warning unless the
//                                 value is one of the wrapped-text spellings.
ug::ssplit::SentenceStream::splitmode
SentenceSplitter::string2splitmode(const std::string &m) {
  using SplitMode = ug::ssplit::SentenceStream::splitmode;
  // @TODO: throw Exception on error
  if (m == "sentence" || m == "Sentence") {
    return SplitMode::one_sentence_per_line;
  }
  if (m == "paragraph" || m == "Paragraph") {
    return SplitMode::one_paragraph_per_line;
  }

  const bool isWrappedText =
      (m == "wrapped_text" || m == "WrappedText" || m == "wrappedText");
  if (!isWrappedText) {
    LOG(warn, "Ignoring unknown text input format specification: {}.", m);
  }
  return SplitMode::wrapped_text;
}
Segment TextProcessor::tokenize(const string_view &segment,
TokenRanges &tokenRanges) {
return vocabs_->front()->encodePreservingSource(
return vocabs_->front()->encodeWithByteRanges(
segment, tokenRanges, /*addEOS=*/false, /*inference=*/true);
}
@ -70,11 +29,11 @@ void TextProcessor::process(const string_view &query, Segments &segments,
std::vector<TokenRanges> &sourceRanges) {
auto sentenceStream = sentence_splitter_.createSentenceStream(query);
pcrecpp::StringPiece sentenceStringPiece;
std::string_view sentenceStringPiece;
while (sentenceStream >> sentenceStringPiece) {
string_view sentence(sentenceStringPiece.data(),
sentenceStringPiece.size());
marian::string_view sentence(sentenceStringPiece.data(),
sentenceStringPiece.size());
TokenRanges tokenRanges;
Segment segment = tokenize(sentence, tokenRanges);

View File

@ -1,40 +1,17 @@
#ifndef SRC_BERGAMOT_TEXTOPS_H_
#define SRC_BERGAMOT_TEXTOPS_H_
#ifndef SRC_BERGAMOT_TEXT_PROCESSOR_H_
#define SRC_BERGAMOT_TEXT_PROCESSOR_H_
#include "common/definitions.h"
#include "common/logging.h"
#include "common/options.h"
#include "common/types.h" // missing in shortlist.h
#include "common/utils.h"
#include "data/sentencepiece_vocab.h"
#include "data/shortlist.h"
#include "data/types.h"
#include "data/vocab.h"
#include "definitions.h"
#include "ssplit.h"
#include <cassert>
#include <iostream>
#include <string>
#include "sentence_splitter.h"
#include <vector>
namespace marian {
namespace bergamot {
class SentenceSplitter {
// A wrapper around @ugermann's ssplit-cpp compiled from several places in
// mts. Constructed based on options. Used in TextProcessor below to create
// sentence-streams, which provide access to one sentence from a blob of text
// at a time.
public:
// Configures the splitter from `options`; the implementation reads the
// "ssplit-mode" and "ssplit-prefix-file" entries.
explicit SentenceSplitter(Ptr<Options> options);
// Returns a sentence stream over `input` that uses this splitter's prefixes
// and split mode.
// NOTE(review): presumably `input` must outlive the returned stream - confirm
// against ssplit.
ug::ssplit::SentenceStream createSentenceStream(string_view const &input);
private:
// Underlying ssplit splitter; protected prefixes are loaded into it.
ug::ssplit::SentenceSplitter ssplit_;
// Options this splitter was constructed from.
Ptr<Options> options_;
// Split mode parsed from the options at construction time.
ug::ssplit::SentenceStream::splitmode mode_;
// Converts the textual split-mode option into the ssplit enum value.
ug::ssplit::SentenceStream::splitmode string2splitmode(const std::string &m);
};
class TextProcessor {
// TextProcessor handles loading the sentencepiece vocabulary and also
// contains an instance of sentence-splitter based on ssplit.
@ -68,4 +45,4 @@ private:
} // namespace bergamot
} // namespace marian
#endif // SRC_BERGAMOT_TEXTOPS_H_
#endif // SRC_BERGAMOT_TEXT_PROCESSOR_H_