Faithful to source-structure translation (#115)

* First draft of faithful translation

* Comments explaining pre and post

* Comments on response_builder

* Updating bergamot-translator-tests with new outputs

* Cosmetic changes in response target text construction

* Replacing &(x[0]) -> x.data() to avoid illegal indices

* Removing nullptr given both branches init pointer with legal values

* pre, post -> gap(i) addressing review comments

The functions that were previously pre and post are now subsumed by gap(i),
and the algorithm in ResponseBuilder is adjusted accordingly.

`x = nullptr` is back, should be harmless.

* Updating brt with paragraph outputs

* Bumping brt with updated outputs, buffer text at begin as well

* Bumping BRT with sync after bytearray collapse merge

* Pointing BRT to main after merge

Co-authored-by: Nikolay Bogoychev <nheart@gmail.com>
This commit is contained in:
Jerin Philip 2021-05-06 16:19:27 +01:00 committed by GitHub
parent bc2e4eee5c
commit b86c76b004
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 73 additions and 14 deletions

@ -1 +1 @@
Subproject commit 442edcfea34dc1c53c90b5775347958fba1ffd08
Subproject commit 9209aa51e71f57b90172ffd259cf3021c4890bcf

View File

@ -1,4 +1,5 @@
#include "response_builder.h"
#include "response_options.h"
namespace marian {
namespace bergamot {
@ -56,11 +57,10 @@ void ResponseBuilder::buildTranslatedText(Histories &histories,
// thing to do to avoid reallocations.
response.target.text.reserve(response.source.text.size());
size_t offset{0};
bool first{true};
for (auto &history : histories) {
for (size_t sentenceIdx = 0; sentenceIdx < histories.size(); sentenceIdx++) {
// TODO(jerin): Change hardcode of nBest = 1
auto &history = histories[sentenceIdx];
NBestList onebest = history->nBest(1);
Result result = onebest[0]; // Expecting only one result;
@ -71,15 +71,33 @@ void ResponseBuilder::buildTranslatedText(Histories &histories,
std::vector<string_view> targetSentenceMappings;
targetVocab->decodeWithByteRanges(words, decoded, targetSentenceMappings);
// delimiter can be used to fill in the blanks from source as well.
std::string delimiter;
if (first) {
first = false;
} else {
delimiter = " ";
switch (responseOptions_.concatStrategy) {
case ConcatStrategy::FAITHFUL: {
// For each sentence, prepend the filler text between the corresponding
// source-sentence and the source-sentence before.
string_view pre = response.source.gap(sentenceIdx);
response.target.appendSentence(std::string(pre.data(), pre.size()),
decoded, targetSentenceMappings);
// If this is the last history to be decoded and translated-text
// constructed, append the text till the end, which could be spaces or
// empty.
if (sentenceIdx + 1 == histories.size()) {
string_view post = response.source.gap(sentenceIdx + 1);
response.target.text += std::string(post.data(), post.size());
}
break;
}
case ConcatStrategy::SPACE: {
std::string delimiter = (sentenceIdx == 0) ? "" : " ";
response.target.appendSentence(delimiter, decoded,
targetSentenceMappings);
break;
}
response.target.appendSentence(delimiter, decoded, targetSentenceMappings);
default:
ABORT("Unknown concat-strategy");
}
}
}

View File

@ -63,7 +63,7 @@ void AnnotatedText::appendSentence(std::string prefix, std::string &reference,
text += reference; // Append reference to text
std::vector<ByteRange> sentence;
for (auto &wordView : wordRanges) {
size_t thisWordBegin = offset + wordView.data() - &reference[0];
size_t thisWordBegin = offset + wordView.data() - reference.data();
sentence.push_back(
ByteRange{thisWordBegin, thisWordBegin + wordView.size()});
}
@ -78,7 +78,7 @@ void AnnotatedText::addSentence(std::vector<string_view>::iterator begin,
std::vector<string_view>::iterator end) {
std::vector<ByteRange> sentence;
for (auto p = begin; p != end; p++) {
size_t begin_offset = p->data() - &text[0];
size_t begin_offset = p->data() - text.data();
sentence.push_back(ByteRange{begin_offset, begin_offset + p->size()});
}
annotation.addSentence(sentence);
@ -99,5 +99,33 @@ string_view AnnotatedText::asStringView(const ByteRange &byteRange) const {
return string_view(data, size);
}
/// Returns the filler text that sits immediately before sentence
/// `sentenceIdx`; for `sentenceIdx == numSentences()` this is the trailing
/// text after the final sentence.
///
/// The result is computed with pointer arithmetic, so it assumes the views
/// returned by `sentence()` point into `text`.
///
/// @param sentenceIdx: index in `[0, numSentences()]`; no bounds check is
/// performed here.
string_view AnnotatedText::gap(size_t sentenceIdx) const {
  // Left edge of the gap: the start of the whole text by default, moved to
  // one-past-the-end of the preceding sentence whenever there is one.
  const char *gapBegin = text.data();
  if (sentenceIdx != 0) {
    const string_view previous = sentence(sentenceIdx - 1);
    gapBegin = previous.data() + previous.size();
  }

  // Right edge of the gap: the end of the whole text by default, pulled back
  // to the start of sentence `sentenceIdx` unless the index is one past the
  // final sentence.
  const char *gapEnd = text.data() + text.size();
  if (sentenceIdx != numSentences()) {
    gapEnd = sentence(sentenceIdx).data();
  }

  return string_view(gapBegin, gapEnd - gapBegin);
}
} // namespace bergamot
} // namespace marian

View File

@ -151,6 +151,19 @@ public:
/// Returns a string_view representing sentence corresponding to sentenceIdx.
string_view sentence(size_t sentenceIdx) const;
/// Returns the string_view of the gap between two sentences in the container.
///
/// More precisely where `i = sentenceIdx, N = numSentences()` for brevity:
///
/// * For `i = 0`: The gap between the start of text and the first sentence.
/// * For `i = 1...N-1`: The gap between the `i-1`-th and `i`-th
/// sentence.
/// * For `i = N`, the gap between the last sentence and end of
/// text.
/// @param sentenceIdx: Can be between `[0, numSentences()]`.
string_view gap(size_t sentenceIdx) const;
/// Returns a ByteRange representing wordIdx in sentenceIdx
ByteRange wordAsByteRange(size_t sentenceIdx, size_t wordIdx) const;