mirror of
https://github.com/browsermt/bergamot-translator.git
synced 2024-10-26 05:43:59 +03:00
Rewrite annotation class to remove corner cases (#135)
This commit is contained in:
parent
5bd1fc6b83
commit
3e70587672
@ -23,9 +23,6 @@ TEST_CASE("Test Annotation API with random sentences") {
|
|||||||
std::mt19937 randomIntGen_;
|
std::mt19937 randomIntGen_;
|
||||||
randomIntGen_.seed(42);
|
randomIntGen_.seed(42);
|
||||||
|
|
||||||
AnnotatedText testAnnotation; // This the container we add through API and
|
|
||||||
// check if the access is correct.
|
|
||||||
|
|
||||||
// External book-keeping so we have ground truths. Each element represents a
|
// External book-keeping so we have ground truths. Each element represents a
|
||||||
// sentence.
|
// sentence.
|
||||||
|
|
||||||
@ -45,7 +42,7 @@ TEST_CASE("Test Annotation API with random sentences") {
|
|||||||
//
|
//
|
||||||
// 4-0 4-1 4-2 4-3
|
// 4-0 4-1 4-2 4-3
|
||||||
//
|
//
|
||||||
// Words are separated by space units.
|
// Tokens are contiguous because that's how SentencePiece works.
|
||||||
//
|
//
|
||||||
// Below, we accumulate the text with intended structure as above, and
|
// Below, we accumulate the text with intended structure as above, and
|
||||||
// ground-truth tables populated to be aware of the ByteRanges where they are
|
// ground-truth tables populated to be aware of the ByteRanges where they are
|
||||||
@ -53,9 +50,10 @@ TEST_CASE("Test Annotation API with random sentences") {
|
|||||||
if (debug) {
|
if (debug) {
|
||||||
std::cout << "Preparing text and ground truth-tables" << std::endl;
|
std::cout << "Preparing text and ground truth-tables" << std::endl;
|
||||||
}
|
}
|
||||||
|
std::string text;
|
||||||
for (size_t idx = 0; idx < sentences; idx++) {
|
for (size_t idx = 0; idx < sentences; idx++) {
|
||||||
if (idx != 0)
|
if (idx != 0)
|
||||||
testAnnotation.text += "\n";
|
text += "\n";
|
||||||
|
|
||||||
// Words can be zero, we need to support empty word sentences as well.
|
// Words can be zero, we need to support empty word sentences as well.
|
||||||
size_t numWords = randomIntGen_() % maxWords;
|
size_t numWords = randomIntGen_() % maxWords;
|
||||||
@ -65,23 +63,16 @@ TEST_CASE("Test Annotation API with random sentences") {
|
|||||||
|
|
||||||
// For empty sentence, we expect it to be empty and marked in position where
|
// For empty sentence, we expect it to be empty and marked in position where
|
||||||
// the existing string is if needed to be pointed out.
|
// the existing string is if needed to be pointed out.
|
||||||
size_t before = testAnnotation.text.size() - 1;
|
size_t before = text.size() - 1;
|
||||||
size_t sentenceBegin{before}, sentenceEnd{before};
|
size_t sentenceBegin{before}, sentenceEnd{before};
|
||||||
|
|
||||||
for (size_t idw = 0; idw < numWords; idw++) {
|
for (size_t idw = 0; idw < numWords; idw++) {
|
||||||
if (idw != 0) {
|
|
||||||
testAnnotation.text += " ";
|
|
||||||
if (debug) {
|
|
||||||
std::cout << " ";
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Get new beginning, accounting for space above.
|
// Get new beginning, accounting for space above.
|
||||||
before = testAnnotation.text.size();
|
before = text.size();
|
||||||
|
|
||||||
// Add the word
|
// Add the word
|
||||||
std::string word = std::to_string(idx) + "-" + std::to_string(idw);
|
std::string word = std::to_string(idx) + "-" + std::to_string(idw);
|
||||||
testAnnotation.text += word;
|
text += word;
|
||||||
|
|
||||||
// Do math, before, before + new-word's size.
|
// Do math, before, before + new-word's size.
|
||||||
wordByteRanges.push_back((ByteRange){before, before + word.size()});
|
wordByteRanges.push_back((ByteRange){before, before + word.size()});
|
||||||
@ -105,6 +96,9 @@ TEST_CASE("Test Annotation API with random sentences") {
|
|||||||
groundTruthSentences.push_back((ByteRange){sentenceBegin, sentenceEnd});
|
groundTruthSentences.push_back((ByteRange){sentenceBegin, sentenceEnd});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
AnnotatedText testAnnotation(std::move(text)); // This the container we add through API and
|
||||||
|
// check if the access is correct.
|
||||||
|
|
||||||
// We prepare string_views now with the known ByteRanges and use the
|
// We prepare string_views now with the known ByteRanges and use the
|
||||||
// string_view based AnnotatedText.addSentence(...) API to add sentences to
|
// string_view based AnnotatedText.addSentence(...) API to add sentences to
|
||||||
// transparently convert from string_views to ByteRanges, rebasing/working out
|
// transparently convert from string_views to ByteRanges, rebasing/working out
|
||||||
@ -116,6 +110,7 @@ TEST_CASE("Test Annotation API with random sentences") {
|
|||||||
}
|
}
|
||||||
|
|
||||||
std::vector<std::vector<marian::string_view>> wordStringViews;
|
std::vector<std::vector<marian::string_view>> wordStringViews;
|
||||||
|
std::vector<ByteRange>::const_iterator sentence_iter = groundTruthSentences.begin();
|
||||||
for (auto &sentence : groundTruthWords) {
|
for (auto &sentence : groundTruthWords) {
|
||||||
std::vector<marian::string_view> wordByteRanges;
|
std::vector<marian::string_view> wordByteRanges;
|
||||||
bool first{true};
|
bool first{true};
|
||||||
@ -132,7 +127,8 @@ TEST_CASE("Test Annotation API with random sentences") {
|
|||||||
std::cout << std::string(wordView);
|
std::cout << std::string(wordView);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
testAnnotation.addSentence(wordByteRanges);
|
testAnnotation.recordExistingSentence(wordByteRanges.begin(), wordByteRanges.end(), testAnnotation.text.data() + sentence_iter->begin);
|
||||||
|
++sentence_iter;
|
||||||
wordStringViews.push_back(wordByteRanges);
|
wordStringViews.push_back(wordByteRanges);
|
||||||
if (debug) {
|
if (debug) {
|
||||||
std::cout << std::endl;
|
std::cout << std::endl;
|
||||||
@ -207,7 +203,7 @@ TEST_CASE("Test Annotation API with random sentences") {
|
|||||||
// Sentence if the random test above does not cover it for some reason.
|
// Sentence if the random test above does not cover it for some reason.
|
||||||
int emptySentenceIdx = sentences;
|
int emptySentenceIdx = sentences;
|
||||||
std::vector<marian::string_view> emptySentence;
|
std::vector<marian::string_view> emptySentence;
|
||||||
testAnnotation.addSentence(emptySentence);
|
testAnnotation.recordExistingSentence(emptySentence.begin(), emptySentence.end(), testAnnotation.text.data() + testAnnotation.text.size());
|
||||||
|
|
||||||
// There are no words.
|
// There are no words.
|
||||||
CHECK(testAnnotation.numWords(emptySentenceIdx) == 0);
|
CHECK(testAnnotation.numWords(emptySentenceIdx) == 0);
|
||||||
|
@ -1,130 +1,68 @@
|
|||||||
#include "annotation.h"
|
#include "annotation.h"
|
||||||
#include <cassert>
|
#include <cassert>
|
||||||
#include <iostream>
|
|
||||||
|
|
||||||
namespace marian {
|
namespace marian {
|
||||||
namespace bergamot {
|
namespace bergamot {
|
||||||
|
|
||||||
void Annotation::addSentence(std::vector<ByteRange> &sentence) {
|
AnnotatedText::AnnotatedText(std::string &&t) : text(std::move(t)) {
|
||||||
flatByteRanges_.insert(std::end(flatByteRanges_), std::begin(sentence),
|
// Treat the entire text as a gap that recordExistingSentence will break.
|
||||||
std::end(sentence));
|
annotation.token_begin_.back() = text.size();
|
||||||
size_t size = flatByteRanges_.size();
|
|
||||||
sentenceEndIds_.push_back(size);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t Annotation::numWords(size_t sentenceIdx) const {
|
void AnnotatedText::appendSentence(string_view prefix, std::vector<string_view>::iterator begin, std::vector<string_view>::iterator end) {
|
||||||
size_t bosId, eosId;
|
assert(annotation.token_begin_.back() == text.size());
|
||||||
bosId = sentenceEndIds_[sentenceIdx]; // Half interval, so;
|
// We'll be adding tokens from the sentence and another gap.
|
||||||
eosId = sentenceEndIds_[sentenceIdx + 1];
|
annotation.token_begin_.reserve(annotation.token_begin_.size() + (end - begin) + 1);
|
||||||
// Difference between eosId and bosId is the number of words.
|
|
||||||
return eosId - bosId;
|
// prefix is just end of the previous one.
|
||||||
|
appendEndingWhitespace(prefix);
|
||||||
|
|
||||||
|
// Appending sentence text.
|
||||||
|
std::size_t offset = text.size();
|
||||||
|
for (std::vector<string_view>::iterator token = begin; token != end; ++token) {
|
||||||
|
offset += token->size();
|
||||||
|
annotation.token_begin_.push_back(offset);
|
||||||
|
}
|
||||||
|
if (begin != end) {
|
||||||
|
text.append(begin->data(), (end - 1)->data() + (end - 1)->size());
|
||||||
|
assert(offset == text.size()); // Tokens should be contiguous.
|
||||||
|
}
|
||||||
|
|
||||||
|
// Add the gap after the sentence. This is empty for now, but will be
|
||||||
|
// extended with appendEndingWhitespace or another appendSentence.
|
||||||
|
annotation.gap_.push_back(annotation.token_begin_.size() - 1);
|
||||||
|
annotation.token_begin_.push_back(offset);
|
||||||
}
|
}
|
||||||
|
|
||||||
ByteRange Annotation::sentence(size_t sentenceIdx) const {
|
void AnnotatedText::appendEndingWhitespace(string_view whitespace) {
|
||||||
size_t bosId, eosId;
|
text.append(whitespace.data(), whitespace.size());
|
||||||
bosId = sentenceEndIds_[sentenceIdx]; // Half interval, so;
|
annotation.token_begin_.back() = text.size();
|
||||||
eosId = sentenceEndIds_[sentenceIdx + 1];
|
}
|
||||||
ByteRange sentenceByteRange;
|
|
||||||
|
|
||||||
if (bosId == eosId) {
|
void AnnotatedText::recordExistingSentence(std::vector<string_view>::iterator begin, std::vector<string_view>::iterator end, const char *sentence_begin) {
|
||||||
// We have an empty sentence. However, we want to be able to point where in
|
assert(sentence_begin >= text.data());
|
||||||
// target this happened through the ranges. We are looking for the end of
|
assert(sentence_begin <= text.data() + text.size());
|
||||||
// the flatByteRange and non-empty sentence before this happened and
|
assert(begin == end || sentence_begin == begin->data());
|
||||||
// construct empty string-view equivalent ByteRange.
|
assert(!annotation.token_begin_.empty());
|
||||||
ByteRange eos = flatByteRanges_[eosId - 1];
|
assert(annotation.token_begin_.back() == text.size());
|
||||||
sentenceByteRange = ByteRange{eos.end, eos.end};
|
// Temporarily remove the trailing end-of-text offset (text.size()); it is re-added below.
|
||||||
|
annotation.token_begin_.resize(annotation.token_begin_.size() - 1);
|
||||||
|
for (std::vector<string_view>::iterator i = begin; i != end; ++i) {
|
||||||
|
assert(i->data() >= text.data()); // In range.
|
||||||
|
assert(i->data() + i->size() <= text.data() + text.size()); // In range
|
||||||
|
assert(i + 1 == end || i->data() + i->size() == (i+1)->data()); // Contiguous
|
||||||
|
annotation.token_begin_.push_back(i->data() - text.data());
|
||||||
|
}
|
||||||
|
// Gap token after sentence.
|
||||||
|
annotation.gap_.push_back(annotation.token_begin_.size());
|
||||||
|
if (begin != end) {
|
||||||
|
annotation.token_begin_.push_back((end - 1)->data() + (end - 1)->size() - text.data());
|
||||||
} else {
|
} else {
|
||||||
ByteRange bos = flatByteRanges_[bosId];
|
// empty sentence.
|
||||||
ByteRange eos = flatByteRanges_[eosId - 1];
|
annotation.token_begin_.push_back(sentence_begin - text.data());
|
||||||
sentenceByteRange = ByteRange{bos.begin, eos.end};
|
|
||||||
}
|
}
|
||||||
return sentenceByteRange;
|
// Add back size token ending.
|
||||||
}
|
annotation.token_begin_.push_back(text.size());
|
||||||
|
|
||||||
ByteRange Annotation::word(size_t sentenceIdx, size_t wordIdx) const {
|
|
||||||
size_t bosOffset = sentenceEndIds_[sentenceIdx];
|
|
||||||
return flatByteRanges_[bosOffset + wordIdx];
|
|
||||||
}
|
|
||||||
|
|
||||||
string_view AnnotatedText::word(size_t sentenceIdx, size_t wordIdx) const {
|
|
||||||
auto terminals = annotation.word(sentenceIdx, wordIdx);
|
|
||||||
return string_view(&text[terminals.begin], terminals.size());
|
|
||||||
}
|
|
||||||
|
|
||||||
string_view AnnotatedText::sentence(size_t sentenceIdx) const {
|
|
||||||
auto sentenceAsByteRange = annotation.sentence(sentenceIdx);
|
|
||||||
return asStringView(sentenceAsByteRange);
|
|
||||||
}
|
|
||||||
|
|
||||||
void AnnotatedText::appendSentence(std::string prefix, std::string &reference,
|
|
||||||
std::vector<string_view> &wordRanges) {
|
|
||||||
text += prefix;
|
|
||||||
size_t offset = text.size(); // Get size before to do ByteRange arithmetic
|
|
||||||
text += reference; // Append reference to text
|
|
||||||
std::vector<ByteRange> sentence;
|
|
||||||
for (auto &wordView : wordRanges) {
|
|
||||||
size_t thisWordBegin = offset + wordView.data() - reference.data();
|
|
||||||
sentence.push_back(
|
|
||||||
ByteRange{thisWordBegin, thisWordBegin + wordView.size()});
|
|
||||||
}
|
|
||||||
annotation.addSentence(sentence);
|
|
||||||
}
|
|
||||||
|
|
||||||
void AnnotatedText::addSentence(std::vector<string_view> &wordRanges) {
|
|
||||||
addSentence(std::begin(wordRanges), std::end(wordRanges));
|
|
||||||
};
|
|
||||||
|
|
||||||
void AnnotatedText::addSentence(std::vector<string_view>::iterator begin,
|
|
||||||
std::vector<string_view>::iterator end) {
|
|
||||||
std::vector<ByteRange> sentence;
|
|
||||||
for (auto p = begin; p != end; p++) {
|
|
||||||
size_t begin_offset = p->data() - text.data();
|
|
||||||
sentence.push_back(ByteRange{begin_offset, begin_offset + p->size()});
|
|
||||||
}
|
|
||||||
annotation.addSentence(sentence);
|
|
||||||
};
|
|
||||||
|
|
||||||
ByteRange AnnotatedText::wordAsByteRange(size_t sentenceIdx,
|
|
||||||
size_t wordIdx) const {
|
|
||||||
return annotation.word(sentenceIdx, wordIdx);
|
|
||||||
}
|
|
||||||
|
|
||||||
ByteRange AnnotatedText::sentenceAsByteRange(size_t sentenceIdx) const {
|
|
||||||
return annotation.sentence(sentenceIdx);
|
|
||||||
}
|
|
||||||
|
|
||||||
string_view AnnotatedText::asStringView(const ByteRange &byteRange) const {
|
|
||||||
const char *data = &text[byteRange.begin];
|
|
||||||
size_t size = byteRange.size();
|
|
||||||
return string_view(data, size);
|
|
||||||
}
|
|
||||||
|
|
||||||
string_view AnnotatedText::gap(size_t sentenceIdx) const {
|
|
||||||
// Find start of filler-text before, there's a corner case when there's no
|
|
||||||
// sentence before.
|
|
||||||
const char *start = nullptr;
|
|
||||||
if (sentenceIdx == 0) {
|
|
||||||
// If first sentence, filler begins at start of whole-text.
|
|
||||||
start = text.data();
|
|
||||||
} else {
|
|
||||||
// Otherwise, filler begins at end of previous sentence.
|
|
||||||
string_view sentenceBefore = sentence(sentenceIdx - 1);
|
|
||||||
start = sentenceBefore.data() + sentenceBefore.size();
|
|
||||||
}
|
|
||||||
|
|
||||||
// Find end of filler-text, but there is a corner-case to handle.
|
|
||||||
const char *end = nullptr;
|
|
||||||
if (sentenceIdx == numSentences()) {
|
|
||||||
// If last sentence, manually find end of whole-text.
|
|
||||||
const char *begin = text.data();
|
|
||||||
end = begin + text.size();
|
|
||||||
} else {
|
|
||||||
// Otherwise, the filler ends at the start of next sentence.
|
|
||||||
string_view sentenceAfter = sentence(sentenceIdx);
|
|
||||||
end = sentenceAfter.data();
|
|
||||||
}
|
|
||||||
|
|
||||||
return string_view(start, end - start);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace bergamot
|
} // namespace bergamot
|
||||||
|
@ -17,83 +17,99 @@ struct ByteRange {
|
|||||||
const size_t size() const { return end - begin; }
|
const size_t size() const { return end - begin; }
|
||||||
};
|
};
|
||||||
|
|
||||||
/// An Annotation is a collection of ByteRanges used to denote ancillary
|
/// Annotation expresses sentence and token boundary information as ranges of
|
||||||
/// information of sentences and words on a text of string. Annotation is meant
|
/// bytes in a string, but does not itself own the string.
|
||||||
/// for consumption on platforms where `string_view` creates problems (eg:
|
|
||||||
/// exports through WASM) conveniently rebasing them as required into
|
|
||||||
/// ByteRanges. See AnnotatedText for cases where this is a non-issue.
|
|
||||||
///
|
///
|
||||||
/// **Usage**
|
/// See also AnnotatedText, which owns Annotation and the string. AnnotatedText
|
||||||
|
/// wraps these ByteRange functions to provide a string_view interface.
|
||||||
///
|
///
|
||||||
/// To ensure rebasing is consistent during creation and updation, use
|
/// Text is divided into gaps (whitespace between sentences) and sentences like
|
||||||
/// `Annotation` best through `AnnotatedText`, which also holds the reference
|
/// so:
|
||||||
/// string and can work with `string_views`.
|
/// gap sentence gap sentence gap
|
||||||
|
/// Because gaps appear at the beginning and end of the text, there's always
|
||||||
|
/// one more gap than there are sentences.
|
||||||
///
|
///
|
||||||
/// If used separately, it is on the user to ensure the reference string
|
/// The entire text is an unbroken sequence of tokens (i.e. the end of a token
|
||||||
/// is the same as what the Annotation refers to. For best results, an instance
|
/// is the beginning of the next token). A gap is exactly one token containing
|
||||||
/// is expected to be read only in this mode of operation.
|
/// whatever whitespace is between the sentences. A sentence is a sequence of
|
||||||
|
/// tokens.
|
||||||
///
|
///
|
||||||
/// **Idea**
|
/// Since we are using SentencePiece, a token can include whitespace. The term
|
||||||
|
/// "word" is used, somewhat incorrectly, as a synonym of token.
|
||||||
///
|
///
|
||||||
/// Annotation is intended to be the same structure conceptually as below,
|
/// A gap can be empty (for example there may not have been whitespace at the
|
||||||
/// except the `std::vector<std::vector<ByteRange>>` hammered into a flat
|
/// beginning). A sentence can also be empty (typically the translation system
|
||||||
/// structure to avoid multiple reallocs keeping efficiency in mind. This is
|
/// produced empty output). That's fine, these are just empty ranges as you
|
||||||
/// achieved by having markers of where sentence ends in the flat container
|
/// would expect.
|
||||||
/// storing word ByteRanges.
|
|
||||||
///
|
|
||||||
/// ```cpp
|
|
||||||
/// typedef ByteRange Word;
|
|
||||||
/// // std::vector<ByteRange>, a single sentence
|
|
||||||
/// typedef std::vector<Word> Sentence;
|
|
||||||
/// std::vector<std::vector<ByteRange> // multiple sentences
|
|
||||||
/// typedef std::vector<Sentence> Annotation;
|
|
||||||
///
|
|
||||||
/// Annotation example;
|
|
||||||
/// ```
|
|
||||||
/// This structure exists to provide a consistent API to access the nested
|
|
||||||
/// sentences of varying lengths, which occur in source-text processed into
|
|
||||||
/// multiple sentences, and target-text translated from source as multiple
|
|
||||||
/// sentences, both composed of (sub)-words, providing a List[List] like access
|
|
||||||
/// while storing it in a compact and efficient manner.
|
|
||||||
class Annotation {
|
class Annotation {
|
||||||
public:
|
public:
|
||||||
/// Annotation is constructed empty. See `addSentence()` to populate it with
|
/// Initially an empty string. Populated by AnnotatedText.
|
||||||
/// annotations.
|
|
||||||
Annotation() {
|
Annotation() {
|
||||||
// The -1-th sentence ends at 0.
|
token_begin_.push_back(0);
|
||||||
sentenceEndIds_.push_back(0);
|
token_begin_.push_back(0);
|
||||||
|
gap_.push_back(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t numSentences() const { return sentenceEndIds_.size() - 1; }
|
size_t numSentences() const { return gap_.size() - 1; }
|
||||||
|
|
||||||
/// Returns number of words in the sentence identified by `sentenceIdx`.
|
/// Returns number of words in the sentence identified by `sentenceIdx`.
|
||||||
size_t numWords(size_t sentenceIdx) const;
|
size_t numWords(size_t sentenceIdx) const {
|
||||||
|
return gap_[sentenceIdx + 1] - gap_[sentenceIdx] - 1 /* minus the gap */;
|
||||||
/// Adds a sentences from `vector<ByteRange>` representation, internally doing
|
}
|
||||||
/// extra book-keeping for the sentence terminal markings. Sentences are
|
|
||||||
/// expected to be added in order as they occur in text.
|
|
||||||
void addSentence(std::vector<ByteRange> &sentence);
|
|
||||||
|
|
||||||
/// Returns a ByteRange representing `wordIdx` in sentence indexed by
|
/// Returns a ByteRange representing `wordIdx` in sentence indexed by
|
||||||
/// `sentenceIdx`. `wordIdx` follows 0-based indexing, and should be less than
|
/// `sentenceIdx`. `wordIdx` follows 0-based indexing, and should be less than
|
||||||
/// `.numWords()` for `sentenceIdx` for defined behaviour.
|
/// `.numWords()` for `sentenceIdx` for defined behaviour.
|
||||||
ByteRange word(size_t sentenceIdx, size_t wordIdx) const;
|
ByteRange word(size_t sentenceIdx, size_t wordIdx) const {
|
||||||
|
size_t tokenIdx = gap_[sentenceIdx] + 1 + wordIdx;
|
||||||
|
return ByteRange {token_begin_[tokenIdx], token_begin_[tokenIdx + 1]};
|
||||||
|
}
|
||||||
|
|
||||||
/// Returns a ByteRange representing sentence corresponding to `sentenceIdx`.
|
/// Returns a ByteRange representing sentence corresponding to `sentenceIdx`.
|
||||||
/// `sentenceIdx` follows 0-based indexing, and behaviour is defined only when
|
/// `sentenceIdx` follows 0-based indexing, and behaviour is defined only when
|
||||||
/// less than `.numSentences()`.
|
/// less than `.numSentences()`.
|
||||||
ByteRange sentence(size_t sentenceIdx) const;
|
ByteRange sentence(size_t sentenceIdx) const {
|
||||||
|
return ByteRange {
|
||||||
|
token_begin_[gap_[sentenceIdx] + 1], /*end of whitespace before */
|
||||||
|
token_begin_[gap_[sentenceIdx + 1]] /*beginning of whitespace after */
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
ByteRange gap(size_t gapIdx) const {
|
||||||
|
size_t tokenIdx = gap_[gapIdx];
|
||||||
|
return ByteRange {token_begin_[tokenIdx], token_begin_[tokenIdx + 1]};
|
||||||
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
/// A flat storage for ByteRanges. Composed of word ByteRanges, extra
|
friend class AnnotatedText;
|
||||||
/// information in sentenceEndIds_ to denote sentence boundary markers as
|
/// Map from token index to byte offset at which it begins. Token i is:
|
||||||
/// indices.
|
/// [token_begin_[i], token_begin_[i+1])
|
||||||
std::vector<ByteRange> flatByteRanges_;
|
/// The vector is padded so that these indices are always valid, even at the
|
||||||
|
/// end. So token_begin_.size() is the number of tokens plus 1.
|
||||||
|
std::vector<size_t> token_begin_;
|
||||||
|
|
||||||
/// Stores indices onto flatByteRanges_ of where sentences end (not inclusive,
|
/// Indices of tokens that correspond to gaps between sentences. These are
|
||||||
/// aligned with C++ half interval notions). There is a 0 marker to simplify
|
/// indices into token_begin_.
|
||||||
/// sources, indicating where the -1-th sentence ends.
|
/// Gap g is byte range:
|
||||||
std::vector<size_t> sentenceEndIds_;
|
/// [token_begin_[gap_[g]], token_begin_[gap_[g]+1])
|
||||||
|
/// Sentence s is byte range:
|
||||||
|
/// [token_begin_[gap_[s]+1], token_begin_[gap_[s+1]])
|
||||||
|
/// A sentence does not include whitespace at the beginning or end.
|
||||||
|
///
|
||||||
|
/// gap_.size() == numSentences() + 1.
|
||||||
|
///
|
||||||
|
/// Example: empty text "" -> just an empty gap.
|
||||||
|
/// token_begin_ = {0, 0};
|
||||||
|
/// gap_ = {0};
|
||||||
|
///
|
||||||
|
/// Example: only space " " -> just a gap containing the space.
|
||||||
|
/// token_begin_ = {0, 1};
|
||||||
|
/// gap_ = {0};
|
||||||
|
///
|
||||||
|
/// Example: one token "hi" -> empty gap, sentence with one token, empty gap
|
||||||
|
/// token_begin_ = {0, 0, 2, 2};
|
||||||
|
/// gap_ = {0, 2};
|
||||||
|
std::vector<size_t> gap_;
|
||||||
};
|
};
|
||||||
|
|
||||||
/// AnnotatedText is effectively std::string text + Annotation, providing the
|
/// AnnotatedText is effectively std::string text + Annotation, providing the
|
||||||
@ -107,7 +123,6 @@ private:
|
|||||||
///
|
///
|
||||||
/// 3. Bind the text and annotations together, to move around as a meaningful
|
/// 3. Bind the text and annotations together, to move around as a meaningful
|
||||||
/// unit.
|
/// unit.
|
||||||
|
|
||||||
struct AnnotatedText {
|
struct AnnotatedText {
|
||||||
public:
|
public:
|
||||||
std::string text; ///< Blob of string elements in annotation refers to.
|
std::string text; ///< Blob of string elements in annotation refers to.
|
||||||
@ -122,7 +137,31 @@ public:
|
|||||||
|
|
||||||
/// Construct moving in a string (for efficiency purposes, copying string
|
/// Construct moving in a string (for efficiency purposes, copying string
|
||||||
/// constructor is disallowed).
|
/// constructor is disallowed).
|
||||||
AnnotatedText(std::string &&text) : text(std::move(text)){};
|
AnnotatedText(std::string &&text);
|
||||||
|
|
||||||
|
/// Appends a sentence to the existing text and transparently rebases
|
||||||
|
/// string_views. Since this tracks only prefix, remember
|
||||||
|
/// appendEndingWhitespace.
|
||||||
|
/// The string_views must not already be in text.
|
||||||
|
void appendSentence(
|
||||||
|
string_view prefix,
|
||||||
|
std::vector<string_view>::iterator tokens_begin,
|
||||||
|
std::vector<string_view>::iterator tokens_end);
|
||||||
|
|
||||||
|
/// Append the whitespace at the end of input. string_view must not be in
|
||||||
|
/// text.
|
||||||
|
void appendEndingWhitespace(string_view whitespace);
|
||||||
|
|
||||||
|
/// Record the existence of a sentence that is already in text. The
|
||||||
|
/// iterators are over string_views for each token that must be in text
|
||||||
|
/// already. This function must be called to record sentences in order.
|
||||||
|
/// Normally the beginning of the sentence can be inferred from
|
||||||
|
/// tokens_begin->data() but the tokens could be empty, so sentence_begin is
|
||||||
|
/// required to know where the sentence is.
|
||||||
|
void recordExistingSentence(
|
||||||
|
std::vector<string_view>::iterator tokens_begin,
|
||||||
|
std::vector<string_view>::iterator tokens_end,
|
||||||
|
const char *sentence_begin);
|
||||||
|
|
||||||
/// Returns the number of sentences in the annotation structure.
|
/// Returns the number of sentences in the annotation structure.
|
||||||
const size_t numSentences() const { return annotation.numSentences(); }
|
const size_t numSentences() const { return annotation.numSentences(); }
|
||||||
@ -132,46 +171,44 @@ public:
|
|||||||
return annotation.numWords(sentenceIdx);
|
return annotation.numWords(sentenceIdx);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Appends a sentence to the existing text and transparently rebases
|
|
||||||
/// string_views
|
|
||||||
void appendSentence(std::string prefix, std::string &reference,
|
|
||||||
std::vector<string_view> &wordRanges);
|
|
||||||
|
|
||||||
/// Adds a sentence, used to load from SentencePiece annotations conveniently.
|
|
||||||
void addSentence(std::vector<string_view> &wordRanges);
|
|
||||||
|
|
||||||
/// Adds a sentence between two iterators, often useful while constructing
|
|
||||||
/// from parts of a container.
|
|
||||||
void addSentence(std::vector<string_view>::iterator begin,
|
|
||||||
std::vector<string_view>::iterator end);
|
|
||||||
|
|
||||||
/// Returns a string_view representing wordIdx in sentenceIdx
|
/// Returns a string_view representing wordIdx in sentenceIdx
|
||||||
string_view word(size_t sentenceIdx, size_t wordIdx) const;
|
string_view word(size_t sentenceIdx, size_t wordIdx) const {
|
||||||
|
return asStringView(annotation.word(sentenceIdx, wordIdx));
|
||||||
|
}
|
||||||
|
|
||||||
/// Returns a string_view representing sentence corresponding to sentenceIdx.
|
/// Returns a string_view representing sentence corresponding to sentenceIdx.
|
||||||
string_view sentence(size_t sentenceIdx) const;
|
string_view sentence(size_t sentenceIdx) const {
|
||||||
|
return asStringView(annotation.sentence(sentenceIdx));
|
||||||
|
}
|
||||||
|
|
||||||
/// Returns the string_view of the gap between two sentences in the container.
|
/// Returns the string_view of the gap between two sentences in the container.
|
||||||
///
|
///
|
||||||
/// More precisely where `i = sentenceIdx, N = numSentences()` for brevity:
|
/// More precisely where `i = sentenceIdx, N = numSentences()` for brevity:
|
||||||
///
|
///
|
||||||
/// * For `i = 0`: The gap between the start of text and the first sentence.
|
/// * For `i = 0`: The gap between the start of text and the 0th sentence.
|
||||||
/// * For `i = 1...N-1`, returns the text comprising of the gap
|
/// * For `i = 1...N-1`, returns the text comprising of the gap
|
||||||
/// between the `i-1`-th and `i`-th sentence.
|
/// between the `i-1`-th and `i`-th sentence (0-based).
|
||||||
/// * For `i = N`, the gap between the last sentence and end of
|
/// * For `i = N`, the gap between the last (N-1th) sentence and end of
|
||||||
/// text.
|
/// text.
|
||||||
|
|
||||||
/// @param sentenceIdx: Can be between `[0, numSentences()]`.
|
/// @param sentenceIdx: Can be between `[0, numSentences()]`.
|
||||||
string_view gap(size_t sentenceIdx) const;
|
string_view gap(size_t sentenceIdx) const {
|
||||||
|
return asStringView(annotation.gap(sentenceIdx));
|
||||||
|
}
|
||||||
|
|
||||||
/// Returns a ByteRange representing wordIdx in sentenceIdx
|
/// Returns a ByteRange representing wordIdx in sentenceIdx
|
||||||
ByteRange wordAsByteRange(size_t sentenceIdx, size_t wordIdx) const;
|
ByteRange wordAsByteRange(size_t sentenceIdx, size_t wordIdx) const {
|
||||||
|
return annotation.word(sentenceIdx, wordIdx);
|
||||||
|
}
|
||||||
|
|
||||||
/// Returns a ByteRange representing sentence corresponding to sentenceIdx.
|
/// Returns a ByteRange representing sentence corresponding to sentenceIdx.
|
||||||
ByteRange sentenceAsByteRange(size_t sentenceIdx) const;
|
ByteRange sentenceAsByteRange(size_t sentenceIdx) const {
|
||||||
|
return annotation.sentence(sentenceIdx);
|
||||||
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
string_view asStringView(const ByteRange &byteRange) const;
|
string_view asStringView(const ByteRange &byteRange) const {
|
||||||
|
return string_view(text.data() + byteRange.begin, byteRange.size());
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace bergamot
|
} // namespace bergamot
|
||||||
|
@ -75,22 +75,19 @@ void ResponseBuilder::buildTranslatedText(Histories &histories,
|
|||||||
// For each sentence, prepend the filler text between the corresponding
|
// For each sentence, prepend the filler text between the corresponding
|
||||||
// source-sentence and the source-sentence before.
|
// source-sentence and the source-sentence before.
|
||||||
string_view pre = response.source.gap(sentenceIdx);
|
string_view pre = response.source.gap(sentenceIdx);
|
||||||
response.target.appendSentence(std::string(pre.data(), pre.size()),
|
response.target.appendSentence(pre, targetSentenceMappings.begin(), targetSentenceMappings.end());
|
||||||
decoded, targetSentenceMappings);
|
|
||||||
|
|
||||||
// If this is the last history to be decoded and translated-text
|
// If this is the last history to be decoded and translated-text
|
||||||
// constructed, append the text till the end, which could be spaces or
|
// constructed, append the text till the end, which could be spaces or
|
||||||
// empty.
|
// empty.
|
||||||
if (sentenceIdx + 1 == histories.size()) {
|
if (sentenceIdx + 1 == histories.size()) {
|
||||||
string_view post = response.source.gap(sentenceIdx + 1);
|
response.target.appendEndingWhitespace(response.source.gap(sentenceIdx + 1));
|
||||||
response.target.text += std::string(post.data(), post.size());
|
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case ConcatStrategy::SPACE: {
|
case ConcatStrategy::SPACE: {
|
||||||
std::string delimiter = (sentenceIdx == 0) ? "" : " ";
|
string_view delimiter = (sentenceIdx == 0) ? "" : " ";
|
||||||
response.target.appendSentence(delimiter, decoded,
|
response.target.appendSentence(delimiter, targetSentenceMappings.begin(), targetSentenceMappings.end());
|
||||||
targetSentenceMappings);
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -41,15 +41,16 @@ void TextProcessor::process(AnnotatedText &source, Segments &segments) {
|
|||||||
// There are some cases where SentencePiece or vocab returns no words
|
// There are some cases where SentencePiece or vocab returns no words
|
||||||
// after normalization. 0 prevents any empty entries from being added.
|
// after normalization. 0 prevents any empty entries from being added.
|
||||||
if (segment.size() > 0) {
|
if (segment.size() > 0) {
|
||||||
// Truncate segment into max_input_size segments.
|
// Wrap segment into sentences of at most max_length_break_ tokens and
|
||||||
truncate(segment, wordRanges, segments, source);
|
// tell source about them.
|
||||||
|
wrap(segment, wordRanges, segments, source);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void TextProcessor::truncate(Segment &segment,
|
void TextProcessor::wrap(Segment &segment,
|
||||||
std::vector<string_view> &wordRanges,
|
std::vector<string_view> &wordRanges,
|
||||||
Segments &segments, AnnotatedText &source) {
|
Segments &segments, AnnotatedText &source) {
|
||||||
for (size_t offset = 0; offset < segment.size();
|
for (size_t offset = 0; offset < segment.size();
|
||||||
offset += max_length_break_) {
|
offset += max_length_break_) {
|
||||||
auto start = segment.begin() + offset;
|
auto start = segment.begin() + offset;
|
||||||
@ -61,7 +62,8 @@ void TextProcessor::truncate(Segment &segment,
|
|||||||
segments.back().push_back(sourceEosId());
|
segments.back().push_back(sourceEosId());
|
||||||
|
|
||||||
auto astart = wordRanges.begin() + offset;
|
auto astart = wordRanges.begin() + offset;
|
||||||
source.addSentence(astart, astart + diff);
|
// diff > 0
|
||||||
|
source.recordExistingSentence(astart, astart + diff, astart->data());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -32,9 +32,9 @@ private:
|
|||||||
Segment tokenize(const string_view &input,
|
Segment tokenize(const string_view &input,
|
||||||
std::vector<string_view> &tokenRanges);
|
std::vector<string_view> &tokenRanges);
|
||||||
|
|
||||||
// Truncate sentence into max_input_size segments.
|
// Wrap into sentences of at most max_length_break_ tokens and add to source.
|
||||||
void truncate(Segment &sentence, std::vector<string_view> &tokenRanges,
|
void wrap(Segment &sentence, std::vector<string_view> &tokenRanges,
|
||||||
Segments &segments, AnnotatedText &source);
|
Segments &segments, AnnotatedText &source);
|
||||||
|
|
||||||
// shorthand, used only in truncate()
|
// shorthand, used only in wrap()
|
||||||
// vocabs_->sources().front() is invoked as we currently only support one source vocab
|
// vocabs_->sources().front() is invoked as we currently only support one source vocab
|
||||||
|
Loading…
Reference in New Issue
Block a user