Strengthen the Annotation class: Handle empty sentences and tests (#85)

* Changing Annotation to adhere to [begin, end)

* Stronger unit tests on sentences + num words, num sentences

* Hotfix with empty string view from EOS

* No more absolving empty-sentence; Added tests now defined behaviour

* Uncommenting important section in unit test

* Ensure empty string view default, beginning at end so marker points

* Further strengthen and comment unit-tests, mark exactly where empty sentence is happening

* Review comments: Dummy sentence + docs

- What should be a simple fast accessor is turning into compute.
  Normally the way to deal with this, for better or worse, is to put 0 at
  the beginning of sentenceEndIds_. (Putting 0 at the beginning of
  sentenceEndIds_)

- Indices into what? Mentioned to be flatByteRanges_.

* Documentation updates

* More changes to docs

Co-authored-by: abhi-agg <66322306+abhi-agg@users.noreply.github.com>
This commit is contained in:
Jerin Philip 2021-04-12 17:05:23 +01:00 committed by GitHub
parent b345b0e035
commit 3daa024eb3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 258 additions and 84 deletions

View File

@ -11,63 +11,210 @@ TEST_CASE("Test Annotation API with random sentences") {
/// which sentence went in where and try to use accessor methods on
/// AnnotatedText to check if what we have as ground-truth by construction is
/// consistent with what is returned.
size_t sentences = 20;
size_t sentences = 500;
size_t maxWords = 40;
// Set in case needed to see output. The output is in lines of #sentences +
// header, which can be split and compared for easy understanding. The ideal
// way to inspect what is going wrong is to redirect output and use to split
// the different stages by sentences + 1 lines and check the diff.
bool debug{false};
std::mt19937 randomIntGen_;
randomIntGen_.seed(42);
AnnotatedText testAnnotation;
std::vector<std::vector<ByteRange>> sentenceWords;
std::vector<ByteRange> Words;
AnnotatedText testAnnotation; // This the container we add through API and
// check if the access is correct.
// External book-keeping so we have ground truths. Each element represents a
// sentence.
// word byte ranges - for testAnnotation.word(sId, wId)
std::vector<std::vector<ByteRange>> groundTruthWords;
// sentence byte ranges - for testAnnotation.sentence(sId, wId)
std::vector<ByteRange> groundTruthSentences;
// Prepare the text and construct ByteRanges as intended for sentences and
// words. The ByteRanges we construct here are expected to be the
// ground-truths for words and sentences. The string being constructed is like
// as follows:
//
// 0-0 0-1 0-2 0-3
// 1-0 1-1 1-2 1-3 1-4
// 2-0 2-1
//
// 4-0 4-1 4-2 4-3
//
// Words are separated by space units.
//
// Below, we accumulate the text with intended structure as above, and
// ground-truth tables populated to be aware of the ByteRanges where they are
// meant to be.
if (debug) {
std::cout << "Preparing text and ground truth-tables" << std::endl;
}
for (size_t idx = 0; idx < sentences; idx++) {
if (idx != 0)
testAnnotation.text += "\n";
Words.clear();
size_t words = randomIntGen_() % maxWords + 1;
Words.reserve(words);
for (size_t idw = 0; idw < words; idw++) {
size_t before = testAnnotation.text.size();
// Words can be zero, we need to support empty word sentences as well.
size_t numWords = randomIntGen_() % maxWords;
std::vector<ByteRange> wordByteRanges;
wordByteRanges.reserve(numWords);
// For empty sentence, we expect it to be empty and marked in position where
// the existing string is if needed to be pointed out.
size_t before = testAnnotation.text.size() - 1;
size_t sentenceBegin{before}, sentenceEnd{before};
for (size_t idw = 0; idw < numWords; idw++) {
if (idw != 0) {
testAnnotation.text += " ";
if (debug) {
std::cout << " ";
}
}
// Get new beginning, accounting for space above.
before = testAnnotation.text.size();
// Add the word
std::string word = std::to_string(idx) + "-" + std::to_string(idw);
testAnnotation.text += word;
if (idw != 0)
testAnnotation.text += " ";
Words.push_back((ByteRange){before, before + word.size() - 1});
}
// std::cout << std::endl;
sentenceWords.push_back(Words);
// Do math, before, before + new-word's size.
wordByteRanges.push_back((ByteRange){before, before + word.size()});
if (debug) {
std::cout << word;
}
if (idw == 0) {
sentenceBegin = before;
}
if (idw == numWords - 1) {
sentenceEnd = before + word.size();
}
}
if (debug) {
std::cout << std::endl;
}
groundTruthWords.push_back(wordByteRanges);
groundTruthSentences.push_back((ByteRange){sentenceBegin, sentenceEnd});
}
// std::cout << "Inserting words:" << std::endl;
std::vector<std::vector<marian::string_view>> byteRanges;
for (auto &sentence : sentenceWords) {
// We prepare string_views now with the known ByteRanges and use the
// string_view based AnnotatedText.addSentence(...) API to add sentences to
// transparently convert from string_views to ByteRanges, rebasing/working out
// the math underneath.
if (debug) {
std::cout << "Inserting words onto container and save ground-truth-table:"
<< std::endl;
}
std::vector<std::vector<marian::string_view>> wordStringViews;
for (auto &sentence : groundTruthWords) {
std::vector<marian::string_view> wordByteRanges;
bool first{true};
for (auto &word : sentence) {
marian::string_view wordView(&testAnnotation.text[word.begin],
word.end - word.begin);
word.size());
wordByteRanges.push_back(wordView);
// std::cout << std::string(wordView) << " ";
if (debug) {
if (first) {
first = false;
} else {
std::cout << " ";
}
std::cout << std::string(wordView);
}
}
testAnnotation.addSentence(wordByteRanges);
byteRanges.push_back(wordByteRanges);
// std::cout << std::endl;
wordStringViews.push_back(wordByteRanges);
if (debug) {
std::cout << std::endl;
}
}
// std::cout << "From container: " << std::endl;
for (int idx = 0; idx < sentenceWords.size(); idx++) {
for (int idw = 0; idw < sentenceWords[idx].size(); idw++) {
ByteRange expected = sentenceWords[idx][idw];
if (debug) {
std::cout
<< "Inserting sentences onto container and save ground-truth-table"
<< std::endl;
}
std::vector<marian::string_view> sentenceStringViews;
for (auto &sentenceByteRange : groundTruthSentences) {
char *data = &(testAnnotation.text[sentenceByteRange.begin]);
marian::string_view sentenceView(data, sentenceByteRange.size());
sentenceStringViews.push_back(sentenceView);
if (debug) {
std::cout << sentenceView << std::endl;
}
}
// Access from the sentence(sentenceIdx) API and confirm that the ground truth
// we expect is same as what comes out of the container.
if (debug) {
std::cout << "From container: Sentences" << std::endl;
}
for (int idx = 0; idx < groundTruthSentences.size(); idx++) {
ByteRange expected = groundTruthSentences[idx];
ByteRange obtained = testAnnotation.sentenceAsByteRange(idx);
if (debug) {
std::cout << std::string(testAnnotation.sentence(idx)) << std::endl;
}
CHECK(expected.begin == obtained.begin);
CHECK(expected.end == obtained.end);
std::string expected_string = std::string(sentenceStringViews[idx]);
std::string obtained_string = std::string(testAnnotation.sentence(idx));
CHECK(expected_string == obtained_string);
}
/// Access the word(sentenceIdx, wordIdx) API and confirm what we hold as
/// expected words are the same as those obtained from the container.
if (debug) {
std::cout << "From container: Words" << std::endl;
}
CHECK(groundTruthWords.size() == testAnnotation.numSentences());
for (int idx = 0; idx < groundTruthWords.size(); idx++) {
CHECK(groundTruthWords[idx].size() == testAnnotation.numWords(idx));
}
for (int idx = 0; idx < groundTruthWords.size(); idx++) {
for (int idw = 0; idw < groundTruthWords[idx].size(); idw++) {
ByteRange expected = groundTruthWords[idx][idw];
ByteRange obtained = testAnnotation.wordAsByteRange(idx, idw);
// std::cout << std::string(testAnnotation.word(idx, idw)) << " ";
if (debug) {
std::cout << std::string(testAnnotation.word(idx, idw)) << " ";
}
CHECK(expected.begin == obtained.begin);
CHECK(expected.end == obtained.end);
std::string expected_string = std::string(byteRanges[idx][idw]);
CHECK(expected_string == std::string(testAnnotation.word(idx, idw)));
std::string expected_string = std::string(wordStringViews[idx][idw]);
std::string obtained_string = std::string(testAnnotation.word(idx, idw));
CHECK(expected_string == obtained_string);
}
if (debug) {
std::cout << std::endl;
}
// std::cout << std::endl;
}
// Try inserting an empty Sentence. This is ensuring we check for empty
// Sentence if the random test above does not cover it for some reason.
int emptySentenceIdx = sentences;
std::vector<marian::string_view> emptySentence;
testAnnotation.addSentence(emptySentence);
// There are no words.
CHECK(testAnnotation.numWords(emptySentenceIdx) == 0);
// Empty sentence expected at output.
std::string expectedEmptyString = "";
marian::string_view emptyView = testAnnotation.sentence(emptySentenceIdx);
std::string obtainedString = std::string(emptyView.data(), emptyView.size());
CHECK(expectedEmptyString == obtainedString);
}

View File

@ -6,48 +6,44 @@ namespace marian {
namespace bergamot {
void Annotation::addSentence(std::vector<ByteRange> &sentence) {
size_t size = flatByteRanges_.size();
flatByteRanges_.insert(std::end(flatByteRanges_), std::begin(sentence),
std::end(sentence));
sentenceBeginIds_.push_back(size);
size_t size = flatByteRanges_.size();
sentenceEndIds_.push_back(size);
}
size_t Annotation::numWords(size_t sentenceIdx) const {
auto terminals = sentenceTerminalIds(sentenceIdx);
return terminals.second - terminals.first + 1;
}
std::pair<size_t, size_t>
Annotation::sentenceTerminalIds(size_t sentenceIdx) const {
size_t bosId, eosId;
bosId = sentenceBeginIds_[sentenceIdx];
eosId = sentenceIdx + 1 < numSentences()
? sentenceBeginIds_[sentenceIdx + 1] - 1
: flatByteRanges_.size() - 1;
// Out of bound checks.
assert(bosId < flatByteRanges_.size());
assert(eosId < flatByteRanges_.size());
return std::make_pair(bosId, eosId);
}
std::pair<ByteRange, ByteRange>
Annotation::sentenceTerminals(size_t sentenceIdx) const {
auto terminals = sentenceTerminalIds(sentenceIdx);
return std::make_pair(flatByteRanges_[terminals.first],
flatByteRanges_[terminals.second]);
bosId = sentenceEndIds_[sentenceIdx]; // Half interval, so;
eosId = sentenceEndIds_[sentenceIdx + 1];
// Difference between eosId and bosId is the number of words.
return eosId - bosId;
}
ByteRange Annotation::sentence(size_t sentenceIdx) const {
auto terminals = sentenceTerminals(sentenceIdx);
return (ByteRange){terminals.first.begin, terminals.second.end};
size_t bosId, eosId;
bosId = sentenceEndIds_[sentenceIdx]; // Half interval, so;
eosId = sentenceEndIds_[sentenceIdx + 1];
ByteRange sentenceByteRange;
if (bosId == eosId) {
// We have an empty sentence. However, we want to be able to point where in
// target this happened through the ranges. We are looking for the end of
// the flatByteRange and non-empty sentence before this happened and
// construct empty string-view equivalent ByteRange.
ByteRange eos = flatByteRanges_[eosId - 1];
sentenceByteRange = (ByteRange){eos.end, eos.end};
} else {
ByteRange bos = flatByteRanges_[bosId];
ByteRange eos = flatByteRanges_[eosId - 1];
sentenceByteRange = (ByteRange){bos.begin, eos.end};
}
return sentenceByteRange;
}
ByteRange Annotation::word(size_t sentenceIdx, size_t wordIdx) const {
size_t offset = sentenceBeginIds_[sentenceIdx];
// auto terminals = sentenceTerminals(sentenceIdx);
// assert(offset + wordIdx <= terminals.second);
return flatByteRanges_[offset + wordIdx];
size_t bosOffset = sentenceEndIds_[sentenceIdx];
return flatByteRanges_[bosOffset + wordIdx];
}
string_view AnnotatedText::word(size_t sentenceIdx, size_t wordIdx) const {

View File

@ -19,51 +19,82 @@ struct ByteRange {
/// An Annotation is a collection of ByteRanges used to denote ancillary
/// information of sentences and words on a text of string. Annotation is meant
/// for consumption on platforms where string_view creates problems (eg: exports
/// through WASM). See AnnotatedText for cases where this is a non-issue.
/// for consumption on platforms where `string_view` creates problems (eg:
/// exports through WASM) conveniently rebasing them as required into
/// ByteRanges. See AnnotatedText for cases where this is a non-issue.
///
/// **Usage**
///
/// To ensure rebasing is consistent during creation and updation, use
/// `Annotation` best through `AnnotatedText`, which also holds the reference
/// string and can work with `string_views`.
///
/// If used separately, it is on the user to ensure the reference string
/// is the same as what the Annotation refers to. For best results, an instance
/// is expected to be read only in this mode of operation.
///
/// **Idea**
///
/// Annotation is intended to be the same structure conceptually as below,
/// except the `std::vector<std::vector<ByteRange>>` hammered into a flat
/// structure to avoid multiple reallocs keeping efficiency in mind. This is
/// achieved by having markers of where sentence ends in the flat container
/// storing word ByteRanges.
///
/// ```cpp
/// typedef ByteRange Word;
/// // std::vector<ByteRange>, a single sentence
/// typedef std::vector<Word> Sentence;
/// std::vector<std::vector<ByteRange> // multiple sentences
/// typedef std::vector<Sentence> Annotation;
///
/// Annotation example;
/// ```
/// This structure exists to provide a consistent API to access the nested
/// sentences of varying lengths, which occur in source-text processed into
/// multiple sentences, and target-text translated from source as multiple
/// sentences, both composed of (sub)-words, providing a List[List] like access
/// while storing it in a compact and efficient manner.
class Annotation {
public:
/// Annotation is constructed empty. See addSentence to populate it with
/// Annotation is constructed empty. See `addSentence()` to populate it with
/// annotations.
Annotation() {}
Annotation() {
// The -1-th sentence ends at 0.
sentenceEndIds_.push_back(0);
}
/// Returns the number of sentences annotated in a text.
size_t numSentences() const { return sentenceBeginIds_.size(); }
size_t numSentences() const { return sentenceEndIds_.size() - 1; }
/// Returns number of words in the sentece identified by sentenceIdx.
/// Returns number of words in the sentence identified by `sentenceIdx`.
size_t numWords(size_t sentenceIdx) const;
/// Adds a sentences from vector<ByteRange> representation, internally doing
/// Adds a sentences from `vector<ByteRange>` representation, internally doing
/// extra book-keeping for the sentence terminal markings. Sentences are
/// expected to be added in order as they occur in text.
void addSentence(std::vector<ByteRange> &sentence);
/// Returns a ByteRange representing wordIdx in sentenceIdx
/// Returns a ByteRange representing `wordIdx` in sentence indexed by
/// `sentenceIdx`. `wordIdx` follows 0-based indexing, and should be less than
/// `.numWords()` for `sentenceIdx` for defined behaviour.
ByteRange word(size_t sentenceIdx, size_t wordIdx) const;
/// Returns a ByteRange representing sentence corresponding to sentenceIdx.
/// Returns a ByteRange representing sentence corresponding to `sentenceIdx`.
/// `sentenceIdx` follows 0-based indexing, and behaviour is defined only when
/// less than `.numSentences()`.
ByteRange sentence(size_t sentenceIdx) const;
private:
/// A flat storage for ByteRanges. Composed of word ByteRanges, extra
/// information in sentenceBeginIds_ to denote sentence boundary markers as
/// information in sentenceEndIds_ to denote sentence boundary markers as
/// indices.
std::vector<ByteRange> flatByteRanges_;
/// Stores indices where sentences begin
std::vector<size_t> sentenceBeginIds_;
/// Returns ByteRanges corresponding to beginning and end words of sentence
/// corresponding to sentenceIdx. This is useful in using the information to
/// construct a ByteRange of a sentence taking the begin from the first and
/// end from the second.
std::pair<ByteRange, ByteRange> sentenceTerminals(size_t sentenceIdx) const;
/// Returns indices of terminal (word) ByteRanges in sentenceIds_ of a
/// sentence corresponding to sentenceIdx. The distance can be used to compute
/// number of words in a sentence (numWords) and also to construct the
/// terminal ByteRanges (sentenceTerminals).
std::pair<size_t, size_t> sentenceTerminalIds(size_t sentenceIdx) const;
/// Stores indices onto flatByteRanges_ of where sentences end (not inclusive,
/// aligned with C++ half interval notions). There is a 0 marker to simplify
/// sources, indicating where the -1-th sentence ends.
std::vector<size_t> sentenceEndIds_;
};
/// AnnotatedText is effectively std::string text + Annotation, providing the