Add option to print word-level scores (#501)

* Add printing word level scores
* Add option --no-spm-decode
* Fix precision for word-level scores
* Fix getting the no-spm-decode option
* Update CHANGELOG
* Add comments and refactor
* Print word-level scores next to other scores in an n-best list
* Remove --word-scores from marian-scorer
* Add --no-spm-decode only if compiled with SentencePiece
* Add comments
* Printing word scores before model scores in n-best lists
* Update VERSION

Co-authored-by: Marcin Junczys-Dowmunt <Marcin.JunczysDowmunt@microsoft.com>
2024-10-26 09:09:10 +03:00 · 2020-01-04 03:10:21 +00:00 · 2020-01-04 03:10:21 +00:00 · 24f062cd27
commit 24f062cd27
parent 2bd986d8a7
8 changed files with 94 additions and 42 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -9,6 +9,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 ## [Unreleased]

 ### Added
+- An option to print word-level translation scores
+- An option to turn off automatic detokenization from SentencePiece
 - Separate quantization types for 8-bit FBGEMM for AVX2 and AVX512
 - Sequence-level unliklihood training
 - Allow file name templated valid-translation-output files
--- a/VERSION
+++ b/VERSION
@ -1 +1 @@
-v1.8.35
+v1.8.36
--- a/src/common/config_parser.cpp
+++ b/src/common/config_parser.cpp
@ -545,6 +545,8 @@ void ConfigParser::addOptionsValidation(cli::CLIWrapper& cli) {
      "Allow unknown words to appear in output");
  cli.add<bool>("--n-best",
      "Generate n-best list");
+  cli.add<bool>("--word-scores",
+      "Print word-level scores");

  // efficiency options
  cli.add<int>("--valid-mini-batch",
@ -607,6 +609,12 @@ void ConfigParser::addOptionsTranslation(cli::CLIWrapper& cli) {
  cli.add<std::string>("--alignment",
     "Return word alignment. Possible values: 0.0-1.0, hard, soft")
    ->implicit_val("1");
+  cli.add<bool>("--word-scores",
+      "Print word-level scores");
+#ifdef USE_SENTENCEPIECE
+  cli.add<bool>("--no-spm-decode",
+      "Keep the output segmented into SentencePiece subwords");
+#endif

  addSuboptionsDevices(cli);
  addSuboptionsInputLength(cli);
--- a/src/data/alignment.cpp
+++ b/src/data/alignment.cpp
@ -8,9 +8,7 @@ namespace data {

 WordAlignment::WordAlignment() {}

-WordAlignment::WordAlignment(
-    const std::vector<Point>& align)
-    : data_(align) {}
+WordAlignment::WordAlignment(const std::vector<Point>& align) : data_(align) {}

 WordAlignment::WordAlignment(const std::string& line) {
  std::vector<std::string> atok = utils::splitAny(line, " -");
--- a/src/data/sentencepiece_vocab.cpp
+++ b/src/data/sentencepiece_vocab.cpp
@ -36,17 +36,18 @@ private:
  std::mt19937 generator_;
  std::uniform_int_distribution<int> randInt_; // from 0 to INT_MAX

+  // Keeps sentences segmented into subword units
+  bool keepEncoded_{false};
+
  // Sample from one file, based on first algorithm from:
  // https://en.wikipedia.org/wiki/Reservoir_sampling
  void reservoirSampling(std::vector<std::string>& sample, size_t& seenLines,
                        const std::string& trainPath, size_t maxLines, size_t maxBytes) {
-
    ABORT_IF(maxLines == 0, "Sample needs to be larger 0");

-    std::unique_ptr<std::istream> trainStrm(
-			    trainPath == "stdin" ? new std::istream(std::cin.rdbuf())
-                           : new io::InputFileStream(trainPath)
-    );
+    std::unique_ptr<std::istream> trainStrm(trainPath == "stdin"
+                                                ? new std::istream(std::cin.rdbuf())
+                                                : new io::InputFileStream(trainPath));

    std::string line;
    while(getline(*trainStrm, line)) {
@ -109,8 +110,10 @@ private:

 public:
  SentencePieceVocab(Ptr<Options> options, size_t batchIndex)
-    : options_(options), batchIndex_(batchIndex), generator_((uint32_t)Config::seed) {
-
+      : options_(options),
+        batchIndex_(batchIndex),
+        generator_((uint32_t)Config::seed),
+        keepEncoded_(options->get<bool>("no-spm-decode", false)) {
    if(options_->has("sentencepiece-alphas")) {
      auto alphas = options_->get<std::vector<float>>("sentencepiece-alphas");
      if(alphas.size() <= batchIndex)
@ -221,11 +224,18 @@ public:

  std::string decode(const Words& sentence, bool /*ignoreEOS*/) const override {
    std::string line;
+    if(keepEncoded_) {  // i.e. keep the sentence segmented into subword units
+      for(const Word& id : sentence)
+        line += (*this)[id] + " ";
+      line.pop_back();  // trim the trailing whitespace
+    } else {
      // convert vector of Word to vector of int
-    std::vector<int> spmSentence; spmSentence.reserve(sentence.size());
-    for (auto&& word : sentence)
+      std::vector<int> spmSentence;
+      spmSentence.reserve(sentence.size());
+      for(auto&& word : sentence)
        spmSentence.push_back(word.toWordIndex());
      spm_->Decode(spmSentence, &line);
+    }
    return line;
  }

--- a/src/translator/hypothesis.h
+++ b/src/translator/hypothesis.h
@ -42,29 +42,40 @@ public:
  float getPathScore() const { return pathScore_; }

  const std::vector<float>& getScoreBreakdown() { return scoreBreakdown_; }
-  void setScoreBreakdown(const std::vector<float>& scoreBreaddown) { scoreBreakdown_ = scoreBreaddown; }
+  void setScoreBreakdown(const std::vector<float>& scoreBreakdown) { scoreBreakdown_ = scoreBreakdown; }

  const std::vector<float>& getAlignment() { return alignment_; }
  void setAlignment(const std::vector<float>& align) { alignment_ = align; };

-  // helpers to trace back paths referenced from this hypothesis
-  Words tracebackWords()
-  {
+  // trace back paths referenced from this hypothesis
+  Words tracebackWords() {
    Words targetWords;
-      for (auto hyp = this; hyp->getPrevHyp(); hyp = hyp->getPrevHyp().get()) {
+    for(auto hyp = this; hyp->getPrevHyp(); hyp = hyp->getPrevHyp().get()) {
      targetWords.push_back(hyp->getWord());
-        // std::cerr << hyp->getWord() << " " << hyp << std::endl;
    }
    std::reverse(targetWords.begin(), targetWords.end());
    return targetWords;
  }

+  // Calculate word-level scores for each target word by de-aggregating the path score.
+  // Returns one score per hypothesis on the path that has a predecessor, ordered from the
+  // first target word to the word of this hypothesis.
+  std::vector<float> tracebackWordScores() {
+    std::vector<float> scores;
+    // traverse hypotheses backward; the loop condition guarantees that the current
+    // hypothesis has a predecessor, so no separate null check is needed in the body
+    for(auto hyp = this; hyp->getPrevHyp(); hyp = hyp->getPrevHyp().get()) {
+      // a path score is a cumulative score including scores from all preceding hypotheses (words),
+      // so a word-level score is the current path score minus the previous path score
+      scores.push_back(hyp->pathScore_ - hyp->getPrevHyp().get()->pathScore_);
+    }
+    // scores were collected last-to-first; restore target word order
+    std::reverse(scores.begin(), scores.end());
+    return scores;
+  }
+
  // get soft alignments [t][s] -> P(s|t) for each target word starting from the hyp one
  typedef data::SoftAlignment SoftAlignment;
-  SoftAlignment tracebackAlignment()
-  {
+  SoftAlignment tracebackAlignment() {
    SoftAlignment align;
-      for (auto hyp = this; hyp->getPrevHyp(); hyp = hyp->getPrevHyp().get()) {
+    for(auto hyp = this; hyp->getPrevHyp(); hyp = hyp->getPrevHyp().get()) {
      align.push_back(hyp->getAlignment());
    }
    std::reverse(align.begin(), align.end());
--- a/src/translator/output_printer.cpp
+++ b/src/translator/output_printer.cpp
@ -1,5 +1,7 @@
 #include "output_printer.h"

+#include <sstream>
+
 namespace marian {

 std::string OutputPrinter::getAlignment(const Hypothesis::PtrType& hyp) {
@ -19,11 +21,18 @@ std::string OutputPrinter::getAlignment(const Hypothesis::PtrType& hyp) {
  } else if(alignment_ == "hard") {
    return data::ConvertSoftAlignToHardAlign(align, 1.f).toString();
  } else if(alignmentThreshold_ > 0.f) {
-    return data::ConvertSoftAlignToHardAlign(align, alignmentThreshold_)
-        .toString();
+    return data::ConvertSoftAlignToHardAlign(align, alignmentThreshold_).toString();
  } else {
    ABORT("Unrecognized word alignment type");
  }
 }

+std::string OutputPrinter::getWordScores(const Hypothesis::PtrType& hyp) {
+  // Render the word-level scores of `hyp` as text: every score is preceded by a single
+  // space and written in fixed-point notation with 5 digits after the decimal point.
+  std::ostringstream formatted;
+  formatted << std::fixed;  // the notation flag is sticky, so setting it once is enough
+  formatted.precision(5);
+  const std::vector<float> wordScores = hyp->tracebackWordScores();
+  for(const float value : wordScores)
+    formatted << " " << value;
+  return formatted.str();
+}
+
 }  // namespace marian
--- a/src/translator/output_printer.h
+++ b/src/translator/output_printer.h
@ -20,12 +20,14 @@ public:
                   ? options->get<size_t>("beam-size")
                   : 0),
        alignment_(options->get<std::string>("alignment", "")),
-        alignmentThreshold_(getAlignmentThreshold(alignment_)) {}
+        alignmentThreshold_(getAlignmentThreshold(alignment_)),
+        wordScores_(options->get<bool>("word-scores")) {}

  template <class OStream>
  void print(Ptr<const History> history, OStream& best1, OStream& bestn) {
    const auto& nbl = history->nBest(nbest_);

+    // prepare n-best list output
    for(size_t i = 0; i < nbl.size(); ++i) {
      const auto& result = nbl[i];
      const auto& hypo = std::get<1>(result);
@ -40,6 +42,9 @@ public:
      if(!alignment_.empty())
        bestn << " ||| " << getAlignment(hypo);

+      if(wordScores_)
+        bestn << " ||| WordScores=" << getWordScores(hypo);
+
      bestn << " |||";
      if(hypo->getScoreBreakdown().empty()) {
        bestn << " F0=" << hypo->getPathScore();
@ -72,17 +77,26 @@ public:
      best1 << " ||| " << getAlignment(hypo);
    }

+    if(wordScores_) {
+      const auto& hypo = std::get<1>(result);
+      best1 << " ||| WordScores=" << getWordScores(hypo);
+    }
+
    best1 << std::flush;
  }

 private:
  Ptr<Vocab const> vocab_;
-  bool reverse_{false};
-  size_t nbest_{0};
-  std::string alignment_;
-  float alignmentThreshold_{0.f};
+  bool reverse_{false};            // If it is a right-to-left model that needs reversed word order
+  size_t nbest_{0};                // Size of the n-best list to print
+  std::string alignment_;          // A non-empty string indicates the type of word alignment
+  float alignmentThreshold_{0.f};  // Threshold for converting attention into hard word alignment
+  bool wordScores_{false};         // Whether to print word-level scores or not

+  // Get word alignment pairs or soft alignment
  std::string getAlignment(const Hypothesis::PtrType& hyp);
+  // Get word-level scores
+  std::string getWordScores(const Hypothesis::PtrType& hyp);

  float getAlignmentThreshold(const std::string& str) {
    try {