diff --git a/bergamot-translator-tests b/bergamot-translator-tests index 442edcf..9209aa5 160000 --- a/bergamot-translator-tests +++ b/bergamot-translator-tests @@ -1 +1 @@ -Subproject commit 442edcfea34dc1c53c90b5775347958fba1ffd08 +Subproject commit 9209aa51e71f57b90172ffd259cf3021c4890bcf diff --git a/src/translator/response_builder.cpp b/src/translator/response_builder.cpp index c624707..f68bd31 100644 --- a/src/translator/response_builder.cpp +++ b/src/translator/response_builder.cpp @@ -1,4 +1,5 @@ #include "response_builder.h" +#include "response_options.h" namespace marian { namespace bergamot { @@ -56,11 +57,10 @@ void ResponseBuilder::buildTranslatedText(Histories &histories, // thing to do to avoid reallocations. response.target.text.reserve(response.source.text.size()); - size_t offset{0}; - bool first{true}; - - for (auto &history : histories) { + for (size_t sentenceIdx = 0; sentenceIdx < histories.size(); sentenceIdx++) { // TODO(jerin): Change hardcode of nBest = 1 + + auto &history = histories[sentenceIdx]; NBestList onebest = history->nBest(1); Result result = onebest[0]; // Expecting only one result; @@ -71,15 +71,33 @@ void ResponseBuilder::buildTranslatedText(Histories &histories, std::vector targetSentenceMappings; targetVocab->decodeWithByteRanges(words, decoded, targetSentenceMappings); - // delimiter can be used to fill in the blanks from source as well. - std::string delimiter; - if (first) { - first = false; - } else { - delimiter = " "; + switch (responseOptions_.concatStrategy) { + case ConcatStrategy::FAITHFUL: { + // For each sentence, prepend the filler text between the corresponding + // source-sentence and the source-sentence before. 
+ string_view pre = response.source.gap(sentenceIdx); + response.target.appendSentence(std::string(pre.data(), pre.size()), + decoded, targetSentenceMappings); + + // After the last history has been appended, also copy the source gap + // that follows it (the text remaining till the end), which could be + // spaces or empty. + if (sentenceIdx + 1 == histories.size()) { + string_view post = response.source.gap(sentenceIdx + 1); + response.target.text += std::string(post.data(), post.size()); + } + break; + } + case ConcatStrategy::SPACE: { + std::string delimiter = (sentenceIdx == 0) ? "" : " "; + response.target.appendSentence(delimiter, decoded, + targetSentenceMappings); + break; } - response.target.appendSentence(delimiter, decoded, targetSentenceMappings); + default: + ABORT("Unknown concat-strategy"); + } } } diff --git a/src/translator/sentence_ranges.cpp b/src/translator/sentence_ranges.cpp index da9d3ee..603d9bf 100644 --- a/src/translator/sentence_ranges.cpp +++ b/src/translator/sentence_ranges.cpp @@ -63,7 +63,7 @@ void AnnotatedText::appendSentence(std::string prefix, std::string &reference, text += reference; // Append reference to text std::vector sentence; for (auto &wordView : wordRanges) { - size_t thisWordBegin = offset + wordView.data() - &reference[0]; + size_t thisWordBegin = offset + wordView.data() - reference.data(); sentence.push_back( ByteRange{thisWordBegin, thisWordBegin + wordView.size()}); } @@ -78,7 +78,7 @@ void AnnotatedText::addSentence(std::vector::iterator begin, std::vector::iterator end) { std::vector sentence; for (auto p = begin; p != end; p++) { - size_t begin_offset = p->data() - &text[0]; + size_t begin_offset = p->data() - text.data(); sentence.push_back(ByteRange{begin_offset, begin_offset + p->size()}); } annotation.addSentence(sentence); @@ -99,5 +99,33 @@ string_view AnnotatedText::asStringView(const ByteRange &byteRange) const { return string_view(data, size); } +string_view AnnotatedText::gap(size_t sentenceIdx) const { + // Find 
start of filler-text before, there's a corner case when there's no + // sentence before. + const char *start = nullptr; + if (sentenceIdx == 0) { + // Gap before the first sentence: filler begins at start of whole-text. + start = text.data(); + } else { + // Otherwise, filler begins at end of previous sentence. + string_view sentenceBefore = sentence(sentenceIdx - 1); + start = sentenceBefore.data() + sentenceBefore.size(); + } + + // Find end of filler-text, but there is a corner-case to handle. + const char *end = nullptr; + if (sentenceIdx == numSentences()) { + // Gap after the last sentence: filler runs to the end of whole-text. + const char *begin = text.data(); + end = begin + text.size(); + } else { + // Otherwise, the filler ends where sentence `sentenceIdx` starts. + string_view sentenceAfter = sentence(sentenceIdx); + end = sentenceAfter.data(); + } + + return string_view(start, end - start); +} + } // namespace bergamot } // namespace marian diff --git a/src/translator/sentence_ranges.h b/src/translator/sentence_ranges.h index f9c881e..8cb7caf 100644 --- a/src/translator/sentence_ranges.h +++ b/src/translator/sentence_ranges.h @@ -151,6 +151,19 @@ public: /// Returns a string_view representing sentence corresponding to sentenceIdx. string_view sentence(size_t sentenceIdx) const; + /// Returns the string_view of the gap between two sentences in the container. + /// + /// More precisely where `i = sentenceIdx, N = numSentences()` for brevity: + /// + /// * For `i = 0`: The gap between the start of text and the first sentence. + /// * For `i = 1...N-1`: The gap between the `i-1`-th and `i`-th + /// sentence. + /// * For `i = N`: The gap between the last sentence and end of + /// text. + /// + /// @param sentenceIdx Index of the gap, in the range `[0, numSentences()]`. + string_view gap(size_t sentenceIdx) const; + /// Returns a ByteRange representing wordIdx in sentenceIdx ByteRange wordAsByteRange(size_t sentenceIdx, size_t wordIdx) const;