mirror of
https://github.com/browsermt/bergamot-translator.git
synced 2024-08-15 16:40:26 +03:00
Faithful to source-structure translation (#115)
* First draft of faithful translation
* Comments explaining pre and post
* Comments on response_builder
* Updating bergamot-translator-tests with new outputs
* Cosmetic changes in response target text construction
* Replacing &(x[0]) -> x.data() to avoid illegal indices
* Removing nullptr given both branches init pointer with legal values
* pre, post -> gap(i) addressing review comments

  Functions which were pre and post before are subsumed by gap(i), and the algorithm in ResponseBuilder adjusted to fix. `x = nullptr` is back, should be harmless.
* Updating brt with paragraph outputs
* Bumping brt with updated outputs, buffer text at begin as well
* Bumping BRT with sync after bytearray collapse merge
* Pointing BRT to main after merge

Co-authored-by: Nikolay Bogoychev <nheart@gmail.com>
This commit is contained in:
parent
bc2e4eee5c
commit
b86c76b004
@ -1 +1 @@
|
||||
Subproject commit 442edcfea34dc1c53c90b5775347958fba1ffd08
|
||||
Subproject commit 9209aa51e71f57b90172ffd259cf3021c4890bcf
|
@ -1,4 +1,5 @@
|
||||
#include "response_builder.h"
|
||||
#include "response_options.h"
|
||||
|
||||
namespace marian {
|
||||
namespace bergamot {
|
||||
@ -56,11 +57,10 @@ void ResponseBuilder::buildTranslatedText(Histories &histories,
|
||||
// thing to do to avoid reallocations.
|
||||
response.target.text.reserve(response.source.text.size());
|
||||
|
||||
size_t offset{0};
|
||||
bool first{true};
|
||||
|
||||
for (auto &history : histories) {
|
||||
for (size_t sentenceIdx = 0; sentenceIdx < histories.size(); sentenceIdx++) {
|
||||
// TODO(jerin): Change hardcode of nBest = 1
|
||||
|
||||
auto &history = histories[sentenceIdx];
|
||||
NBestList onebest = history->nBest(1);
|
||||
|
||||
Result result = onebest[0]; // Expecting only one result;
|
||||
@ -71,15 +71,33 @@ void ResponseBuilder::buildTranslatedText(Histories &histories,
|
||||
std::vector<string_view> targetSentenceMappings;
|
||||
targetVocab->decodeWithByteRanges(words, decoded, targetSentenceMappings);
|
||||
|
||||
// delimiter can be used to fill in the blanks from source as well.
|
||||
std::string delimiter;
|
||||
if (first) {
|
||||
first = false;
|
||||
} else {
|
||||
delimiter = " ";
|
||||
switch (responseOptions_.concatStrategy) {
|
||||
case ConcatStrategy::FAITHFUL: {
|
||||
// For each sentence, prepend the filler text between the corresponding
|
||||
// source-sentence and the source-sentence before.
|
||||
string_view pre = response.source.gap(sentenceIdx);
|
||||
response.target.appendSentence(std::string(pre.data(), pre.size()),
|
||||
decoded, targetSentenceMappings);
|
||||
|
||||
// If this is the last history to be decoded and translated-text
|
||||
// constructed, append the text till the end, which could be spaces or
|
||||
// empty.
|
||||
if (sentenceIdx + 1 == histories.size()) {
|
||||
string_view post = response.source.gap(sentenceIdx + 1);
|
||||
response.target.text += std::string(post.data(), post.size());
|
||||
}
|
||||
break;
|
||||
}
|
||||
case ConcatStrategy::SPACE: {
|
||||
std::string delimiter = (sentenceIdx == 0) ? "" : " ";
|
||||
response.target.appendSentence(delimiter, decoded,
|
||||
targetSentenceMappings);
|
||||
break;
|
||||
}
|
||||
|
||||
response.target.appendSentence(delimiter, decoded, targetSentenceMappings);
|
||||
default:
|
||||
ABORT("Unknown concat-strategy");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -63,7 +63,7 @@ void AnnotatedText::appendSentence(std::string prefix, std::string &reference,
|
||||
text += reference; // Append reference to text
|
||||
std::vector<ByteRange> sentence;
|
||||
for (auto &wordView : wordRanges) {
|
||||
size_t thisWordBegin = offset + wordView.data() - &reference[0];
|
||||
size_t thisWordBegin = offset + wordView.data() - reference.data();
|
||||
sentence.push_back(
|
||||
ByteRange{thisWordBegin, thisWordBegin + wordView.size()});
|
||||
}
|
||||
@ -78,7 +78,7 @@ void AnnotatedText::addSentence(std::vector<string_view>::iterator begin,
|
||||
std::vector<string_view>::iterator end) {
|
||||
std::vector<ByteRange> sentence;
|
||||
for (auto p = begin; p != end; p++) {
|
||||
size_t begin_offset = p->data() - &text[0];
|
||||
size_t begin_offset = p->data() - text.data();
|
||||
sentence.push_back(ByteRange{begin_offset, begin_offset + p->size()});
|
||||
}
|
||||
annotation.addSentence(sentence);
|
||||
@ -99,5 +99,33 @@ string_view AnnotatedText::asStringView(const ByteRange &byteRange) const {
|
||||
return string_view(data, size);
|
||||
}
|
||||
|
||||
string_view AnnotatedText::gap(size_t sentenceIdx) const {
  // Left edge of the filler text: the beginning of the whole text when there
  // is no sentence before sentenceIdx, otherwise one past the end of the
  // sentence at sentenceIdx - 1.
  const char *fillerBegin = text.data();
  if (sentenceIdx > 0) {
    const string_view previous = sentence(sentenceIdx - 1);
    fillerBegin = previous.data() + previous.size();
  }

  // Right edge of the filler text: the end of the whole text when sentenceIdx
  // equals numSentences(), otherwise the first byte of the sentence at
  // sentenceIdx.
  const char *fillerEnd = (sentenceIdx == numSentences())
                              ? text.data() + text.size()
                              : sentence(sentenceIdx).data();

  return string_view(fillerBegin, fillerEnd - fillerBegin);
}
|
||||
|
||||
} // namespace bergamot
|
||||
} // namespace marian
|
||||
|
@ -151,6 +151,19 @@ public:
|
||||
/// Returns a string_view representing sentence corresponding to sentenceIdx.
|
||||
string_view sentence(size_t sentenceIdx) const;
|
||||
|
||||
/// Returns the string_view of the gap between two sentences in the container.
|
||||
///
|
||||
/// More precisely where `i = sentenceIdx, N = numSentences()` for brevity:
|
||||
///
|
||||
/// * For `i = 0`: The gap between the start of text and the first sentence.
|
||||
/// * For `i = 1...N-1`, returns the text comprising the gap
|
||||
/// between the `i-1`-th and `i`-th sentence.
|
||||
/// * For `i = N`, the gap between the last sentence and end of
|
||||
/// text.
|
||||
|
||||
/// @param sentenceIdx: Can be between `[0, numSentences()]`.
|
||||
string_view gap(size_t sentenceIdx) const;
|
||||
|
||||
/// Returns a ByteRange representing wordIdx in sentenceIdx
|
||||
ByteRange wordAsByteRange(size_t sentenceIdx, size_t wordIdx) const;
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user