Faithful to source-structure translation (#115)

* First draft of faithful translation * Comments explaining pre and post * Comments on response_builder * Updating bergamot-translator-tests with new outputs * Cosmetic changes in response target text construction * Replacing &(x[0]) -> x.data() to avoid illegal indices * Removing nullptr given both branches init pointer with legal values * pre, post -> gap(i) addressing review comments Functions which were pre and post before are subsumed by gap(i), and the algorithm in ResponseBuilder adjusted to fix. `x = nullptr` is back, should be harmless. * Updating brt with paragraph outputs * Bumping brt with updated outputs, buffer text at begin as well * Bumping BRT with sync after bytearray collapse merge * Pointing BRT to main after merge Co-authored-by: Nikolay Bogoychev <nheart@gmail.com>
2024-08-15 16:40:26 +03:00 · 2021-05-06 16:19:27 +01:00 · 2021-05-06 16:19:27 +01:00 · b86c76b004
commit b86c76b004
parent bc2e4eee5c
4 changed files with 73 additions and 14 deletions
--- a/2
+++ b/2
@ -1 +1 @@
-Subproject commit 442edcfea34dc1c53c90b5775347958fba1ffd08
+Subproject commit 9209aa51e71f57b90172ffd259cf3021c4890bcf
--- a/src/translator/response_builder.cpp
+++ b/src/translator/response_builder.cpp
@ -1,4 +1,5 @@
 #include "response_builder.h"
+#include "response_options.h"

 namespace marian {
 namespace bergamot {
@ -56,11 +57,10 @@ void ResponseBuilder::buildTranslatedText(Histories &histories,
  // thing to do to avoid reallocations.
  response.target.text.reserve(response.source.text.size());

-  size_t offset{0};
-  bool first{true};
-
-  for (auto &history : histories) {
+  for (size_t sentenceIdx = 0; sentenceIdx < histories.size(); sentenceIdx++) {
    // TODO(jerin): Change hardcode of nBest = 1
+
+    auto &history = histories[sentenceIdx];
    NBestList onebest = history->nBest(1);

    Result result = onebest[0]; // Expecting only one result;
@ -71,15 +71,33 @@ void ResponseBuilder::buildTranslatedText(Histories &histories,
    std::vector<string_view> targetSentenceMappings;
    targetVocab->decodeWithByteRanges(words, decoded, targetSentenceMappings);

-    // delimiter can be used to fill in the blanks from source as well.
-    std::string delimiter;
-    if (first) {
-      first = false;
-    } else {
-      delimiter = " ";
+    switch (responseOptions_.concatStrategy) {
+    case ConcatStrategy::FAITHFUL: {
+      // For each sentence, prepend the filler text between the corresponding
+      // source-sentence and the source-sentence before.
+      string_view pre = response.source.gap(sentenceIdx);
+      response.target.appendSentence(std::string(pre.data(), pre.size()),
+                                     decoded, targetSentenceMappings);
+
+      // If this is the last history to be decoded and translated-text
+      // constructed, append the text till the end, which could be spaces or
+      // empty.
+      if (sentenceIdx + 1 == histories.size()) {
+        string_view post = response.source.gap(sentenceIdx + 1);
+        response.target.text += std::string(post.data(), post.size());
+      }
+      break;
+    }
+    case ConcatStrategy::SPACE: {
+      std::string delimiter = (sentenceIdx == 0) ? "" : " ";
+      response.target.appendSentence(delimiter, decoded,
+                                     targetSentenceMappings);
+      break;
    }

-    response.target.appendSentence(delimiter, decoded, targetSentenceMappings);
+    default:
+      ABORT("Unknown concat-strategy");
+    }
  }
 }

--- a/src/translator/sentence_ranges.cpp
+++ b/src/translator/sentence_ranges.cpp
@ -63,7 +63,7 @@ void AnnotatedText::appendSentence(std::string prefix, std::string &reference,
  text += reference;           // Append reference to text
  std::vector<ByteRange> sentence;
  for (auto &wordView : wordRanges) {
-    size_t thisWordBegin = offset + wordView.data() - &reference[0];
+    size_t thisWordBegin = offset + wordView.data() - reference.data();
    sentence.push_back(
        ByteRange{thisWordBegin, thisWordBegin + wordView.size()});
  }
@ -78,7 +78,7 @@ void AnnotatedText::addSentence(std::vector<string_view>::iterator begin,
                                std::vector<string_view>::iterator end) {
  std::vector<ByteRange> sentence;
  for (auto p = begin; p != end; p++) {
-    size_t begin_offset = p->data() - &text[0];
+    size_t begin_offset = p->data() - text.data();
    sentence.push_back(ByteRange{begin_offset, begin_offset + p->size()});
  }
  annotation.addSentence(sentence);
@ -99,5 +99,33 @@ string_view AnnotatedText::asStringView(const ByteRange &byteRange) const {
  return string_view(data, size);
 }

+string_view AnnotatedText::gap(size_t sentenceIdx) const {
+  // The gap starts where the previous sentence ends; when there is no
+  // previous sentence (sentenceIdx == 0) it starts at the beginning of text.
+  const char *start = nullptr;
+  if (sentenceIdx == 0) {
+    // No sentence before: the gap begins at the start of the whole text.
+    start = text.data();
+  } else {
+    // Otherwise the gap begins one past the last byte of the previous sentence.
+    string_view sentenceBefore = sentence(sentenceIdx - 1);
+    start = sentenceBefore.data() + sentenceBefore.size();
+  }
+
+  // The gap ends where sentence sentenceIdx starts, or at the end of text when sentenceIdx == numSentences().
+  const char *end = nullptr;
+  if (sentenceIdx == numSentences()) {
+    // Past the last sentence: the gap runs to the end of the whole text.
+    const char *begin = text.data();
+    end = begin + text.size();
+  } else {
+    // Otherwise the gap ends at the first byte of sentence sentenceIdx.
+    string_view sentenceAfter = sentence(sentenceIdx);
+    end = sentenceAfter.data();
+  }
+
+  return string_view(start, end - start);
+}
+
 } // namespace bergamot
 } // namespace marian
--- a/src/translator/sentence_ranges.h
+++ b/src/translator/sentence_ranges.h
@ -151,6 +151,19 @@ public:
  /// Returns a string_view representing sentence corresponding to sentenceIdx.
  string_view sentence(size_t sentenceIdx) const;

+  /// Returns the string_view of the gap between two sentences in the container.
+  ///
+  /// More precisely where `i = sentenceIdx, N = numSentences()` for brevity:
+  ///
+  /// * For `i = 0`: The gap between the start of text and the first sentence.
+  /// * For `i = 1...N-1`, the gap between the `(i-1)`-th and `i`-th
+  ///   sentence.
+  /// * For `i = N`, the gap between the last sentence and end of
+  ///   text.
+  ///
+  /// @param sentenceIdx: Can be between `[0, numSentences()]`.
+  string_view gap(size_t sentenceIdx) const;
+
  /// Returns a ByteRange representing wordIdx in sentenceIdx
  ByteRange wordAsByteRange(size_t sentenceIdx, size_t wordIdx) const;