From 3daa024eb303b52e483f326b26d6c976b42b2fe9 Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Mon, 12 Apr 2021 17:05:23 +0100 Subject: [PATCH] Strengthen the Annotation class: Handle empty sentences and tests (#85) * Changing Annotation to adhere to [begin, end) * Stronger unit tests on sentences + num words, num sentences * Hotfix with empty string view from EOS * No more absolving empty-sentence; Added tests now defined behaviour * Uncommenting important section in unit test * Ensure empty string view default, beginning at end so marker points * Further strengthen and comment unit-tests, mark exactly where empty sentence is happening * Review comments: Dummy sentence + docs - What should be a simple fast accessor is turning into compute. Normally the way to deal with this, for better or worse, is to put 0 at the beginning of sentenceEndIds_. (Putting 0 at the beginning of sentenceEndIds_) - Indices into what? Mentioned to be flatByteRanges_. * Documentation updates * More changes to docs Co-authored-by: abhi-agg <66322306+abhi-agg@users.noreply.github.com> --- src/tests/annotation_tests.cpp | 207 ++++++++++++++++++++++++----- src/translator/sentence_ranges.cpp | 56 ++++---- src/translator/sentence_ranges.h | 79 +++++++---- 3 files changed, 258 insertions(+), 84 deletions(-) diff --git a/src/tests/annotation_tests.cpp b/src/tests/annotation_tests.cpp index 2284011..3e42bae 100644 --- a/src/tests/annotation_tests.cpp +++ b/src/tests/annotation_tests.cpp @@ -11,63 +11,210 @@ TEST_CASE("Test Annotation API with random sentences") { /// which sentence went in where and try to use accessor methods on /// AnnotatedText to check if what we have as ground-truth by construction is /// consistent with what is returned. - size_t sentences = 20; + size_t sentences = 500; size_t maxWords = 40; + // Set in case needed to see output. The output is in lines of #sentences + + // header, which can be split and compared for easy understanding. The ideal + // way to inspect what is going wrong is to redirect output and use to split + // the different stages by sentences + 1 lines and check the diff. + bool debug{false}; + std::mt19937 randomIntGen_; randomIntGen_.seed(42); - AnnotatedText testAnnotation; - std::vector> sentenceWords; - std::vector Words; + AnnotatedText testAnnotation; // This the container we add through API and + // check if the access is correct. + // External book-keeping so we have ground truths. Each element represents a + // sentence. + + // word byte ranges - for testAnnotation.word(sId, wId) + std::vector> groundTruthWords; + // sentence byte ranges - for testAnnotation.sentence(sId, wId) + std::vector groundTruthSentences; + + // Prepare the text and construct ByteRanges as intended for sentences and + // words. The ByteRanges we construct here are expected to be the + // ground-truths for words and sentences. The string being constructed is like + // as follows: + // + // 0-0 0-1 0-2 0-3 + // 1-0 1-1 1-2 1-3 1-4 + // 2-0 2-1 + // + // 4-0 4-1 4-2 4-3 + // + // Words are separated by space units. + // + // Below, we accumulate the text with intended structure as above, and + // ground-truth tables populated to be aware of the ByteRanges where they are + // meant to be. + if (debug) { + std::cout << "Preparing text and ground truth-tables" << std::endl; + } for (size_t idx = 0; idx < sentences; idx++) { if (idx != 0) testAnnotation.text += "\n"; - Words.clear(); - size_t words = randomIntGen_() % maxWords + 1; - Words.reserve(words); - for (size_t idw = 0; idw < words; idw++) { - size_t before = testAnnotation.text.size(); + // Words can be zero, we need to support empty word sentences as well. + size_t numWords = randomIntGen_() % maxWords; + + std::vector wordByteRanges; + wordByteRanges.reserve(numWords); + + // For empty sentence, we expect it to be empty and marked in position where + // the existing string is if needed to be pointed out. + size_t before = testAnnotation.text.size() - 1; + size_t sentenceBegin{before}, sentenceEnd{before}; + + for (size_t idw = 0; idw < numWords; idw++) { + if (idw != 0) { + testAnnotation.text += " "; + if (debug) { + std::cout << " "; + } + } + + // Get new beginning, accounting for space above. + before = testAnnotation.text.size(); + + // Add the word std::string word = std::to_string(idx) + "-" + std::to_string(idw); testAnnotation.text += word; - if (idw != 0) - testAnnotation.text += " "; - Words.push_back((ByteRange){before, before + word.size() - 1}); - } - // std::cout << std::endl; - sentenceWords.push_back(Words); + // Do math, before, before + new-word's size. + wordByteRanges.push_back((ByteRange){before, before + word.size()}); + + if (debug) { + std::cout << word; + } + + if (idw == 0) { + sentenceBegin = before; + } + if (idw == numWords - 1) { + sentenceEnd = before + word.size(); + } + } + if (debug) { + std::cout << std::endl; + } + + groundTruthWords.push_back(wordByteRanges); + groundTruthSentences.push_back((ByteRange){sentenceBegin, sentenceEnd}); } - // std::cout << "Inserting words:" << std::endl; - std::vector> byteRanges; - for (auto &sentence : sentenceWords) { + // We prepare string_views now with the known ByteRanges and use the + // string_view based AnnotatedText.addSentence(...) API to add sentences to + // transparently convert from string_views to ByteRanges, rebasing/working out + // the math underneath. + + if (debug) { + std::cout << "Inserting words onto container and save ground-truth-table:" + << std::endl; + } + + std::vector> wordStringViews; + for (auto &sentence : groundTruthWords) { std::vector wordByteRanges; + bool first{true}; for (auto &word : sentence) { marian::string_view wordView(&testAnnotation.text[word.begin], - word.end - word.begin); + word.size()); wordByteRanges.push_back(wordView); - // std::cout << std::string(wordView) << " "; + if (debug) { + if (first) { + first = false; + } else { + std::cout << " "; + } + std::cout << std::string(wordView); + } } testAnnotation.addSentence(wordByteRanges); - byteRanges.push_back(wordByteRanges); - // std::cout << std::endl; + wordStringViews.push_back(wordByteRanges); + if (debug) { + std::cout << std::endl; + } } - // std::cout << "From container: " << std::endl; - for (int idx = 0; idx < sentenceWords.size(); idx++) { - for (int idw = 0; idw < sentenceWords[idx].size(); idw++) { - ByteRange expected = sentenceWords[idx][idw]; + if (debug) { + std::cout + << "Inserting sentences onto container and save ground-truth-table" + << std::endl; + } + std::vector sentenceStringViews; + for (auto &sentenceByteRange : groundTruthSentences) { + char *data = &(testAnnotation.text[sentenceByteRange.begin]); + marian::string_view sentenceView(data, sentenceByteRange.size()); + sentenceStringViews.push_back(sentenceView); + + if (debug) { + std::cout << sentenceView << std::endl; + } + } + + // Access from the sentence(sentenceIdx) API and confirm that the ground truth + // we expect is same as what comes out of the container. + if (debug) { + std::cout << "From container: Sentences" << std::endl; + } + for (int idx = 0; idx < groundTruthSentences.size(); idx++) { + ByteRange expected = groundTruthSentences[idx]; + ByteRange obtained = testAnnotation.sentenceAsByteRange(idx); + if (debug) { + std::cout << std::string(testAnnotation.sentence(idx)) << std::endl; + } + CHECK(expected.begin == obtained.begin); + CHECK(expected.end == obtained.end); + std::string expected_string = std::string(sentenceStringViews[idx]); + std::string obtained_string = std::string(testAnnotation.sentence(idx)); + CHECK(expected_string == obtained_string); + } + + /// Access the word(sentenceIdx, wordIdx) API and confirm what we hold as + /// expected words are the same as those obtained from the container. + if (debug) { + std::cout << "From container: Words" << std::endl; + } + + CHECK(groundTruthWords.size() == testAnnotation.numSentences()); + for (int idx = 0; idx < groundTruthWords.size(); idx++) { + CHECK(groundTruthWords[idx].size() == testAnnotation.numWords(idx)); + } + + for (int idx = 0; idx < groundTruthWords.size(); idx++) { + for (int idw = 0; idw < groundTruthWords[idx].size(); idw++) { + ByteRange expected = groundTruthWords[idx][idw]; ByteRange obtained = testAnnotation.wordAsByteRange(idx, idw); - // std::cout << std::string(testAnnotation.word(idx, idw)) << " "; + if (debug) { + std::cout << std::string(testAnnotation.word(idx, idw)) << " "; + } CHECK(expected.begin == obtained.begin); CHECK(expected.end == obtained.end); - std::string expected_string = std::string(byteRanges[idx][idw]); - CHECK(expected_string == std::string(testAnnotation.word(idx, idw))); + std::string expected_string = std::string(wordStringViews[idx][idw]); + std::string obtained_string = std::string(testAnnotation.word(idx, idw)); + CHECK(expected_string == obtained_string); + } + if (debug) { + std::cout << std::endl; } - // std::cout << std::endl; } + + // Try inserting an empty Sentence. This is ensuring we check for empty + // Sentence if the random test above does not cover it for some reason. + int emptySentenceIdx = sentences; + std::vector emptySentence; + testAnnotation.addSentence(emptySentence); + + // There are no words. + CHECK(testAnnotation.numWords(emptySentenceIdx) == 0); + + // Empty sentence expected at output. + std::string expectedEmptyString = ""; + marian::string_view emptyView = testAnnotation.sentence(emptySentenceIdx); + std::string obtainedString = std::string(emptyView.data(), emptyView.size()); + CHECK(expectedEmptyString == obtainedString); } diff --git a/src/translator/sentence_ranges.cpp b/src/translator/sentence_ranges.cpp index 053eeaa..aae9dd3 100644 --- a/src/translator/sentence_ranges.cpp +++ b/src/translator/sentence_ranges.cpp @@ -6,48 +6,44 @@ namespace marian { namespace bergamot { void Annotation::addSentence(std::vector &sentence) { - size_t size = flatByteRanges_.size(); flatByteRanges_.insert(std::end(flatByteRanges_), std::begin(sentence), std::end(sentence)); - sentenceBeginIds_.push_back(size); + size_t size = flatByteRanges_.size(); + sentenceEndIds_.push_back(size); } size_t Annotation::numWords(size_t sentenceIdx) const { - auto terminals = sentenceTerminalIds(sentenceIdx); - return terminals.second - terminals.first + 1; -} - -std::pair -Annotation::sentenceTerminalIds(size_t sentenceIdx) const { size_t bosId, eosId; - bosId = sentenceBeginIds_[sentenceIdx]; - eosId = sentenceIdx + 1 < numSentences() - ? sentenceBeginIds_[sentenceIdx + 1] - 1 - : flatByteRanges_.size() - 1; - - // Out of bound checks. - assert(bosId < flatByteRanges_.size()); - assert(eosId < flatByteRanges_.size()); - return std::make_pair(bosId, eosId); -} - -std::pair -Annotation::sentenceTerminals(size_t sentenceIdx) const { - auto terminals = sentenceTerminalIds(sentenceIdx); - return std::make_pair(flatByteRanges_[terminals.first], - flatByteRanges_[terminals.second]); + bosId = sentenceEndIds_[sentenceIdx]; // Half interval, so; + eosId = sentenceEndIds_[sentenceIdx + 1]; + // Difference between eosId and bosId is the number of words. + return eosId - bosId; } ByteRange Annotation::sentence(size_t sentenceIdx) const { - auto terminals = sentenceTerminals(sentenceIdx); - return (ByteRange){terminals.first.begin, terminals.second.end}; + size_t bosId, eosId; + bosId = sentenceEndIds_[sentenceIdx]; // Half interval, so; + eosId = sentenceEndIds_[sentenceIdx + 1]; + ByteRange sentenceByteRange; + + if (bosId == eosId) { + // We have an empty sentence. However, we want to be able to point where in + // target this happened through the ranges. We are looking for the end of + // the flatByteRange and non-empty sentence before this happened and + // construct empty string-view equivalent ByteRange. + ByteRange eos = flatByteRanges_[eosId - 1]; + sentenceByteRange = (ByteRange){eos.end, eos.end}; + } else { + ByteRange bos = flatByteRanges_[bosId]; + ByteRange eos = flatByteRanges_[eosId - 1]; + sentenceByteRange = (ByteRange){bos.begin, eos.end}; + } + return sentenceByteRange; } ByteRange Annotation::word(size_t sentenceIdx, size_t wordIdx) const { - size_t offset = sentenceBeginIds_[sentenceIdx]; - // auto terminals = sentenceTerminals(sentenceIdx); - // assert(offset + wordIdx <= terminals.second); - return flatByteRanges_[offset + wordIdx]; + size_t bosOffset = sentenceEndIds_[sentenceIdx]; + return flatByteRanges_[bosOffset + wordIdx]; } string_view AnnotatedText::word(size_t sentenceIdx, size_t wordIdx) const { diff --git a/src/translator/sentence_ranges.h b/src/translator/sentence_ranges.h index a0dc8c9..b3986e3 100644 --- a/src/translator/sentence_ranges.h +++ b/src/translator/sentence_ranges.h @@ -19,51 +19,82 @@ struct ByteRange { /// An Annotation is a collection of ByteRanges used to denote ancillary /// information of sentences and words on a text of string. Annotation is meant -/// for consumption on platforms where string_view creates problems (eg: exports -/// through WASM). See AnnotatedText for cases where this is a non-issue. +/// for consumption on platforms where `string_view` creates problems (eg: +/// exports through WASM) conveniently rebasing them as required into +/// ByteRanges. See AnnotatedText for cases where this is a non-issue. +/// +/// **Usage** +/// +/// To ensure rebasing is consistent during creation and updation, use +/// `Annotation` best through `AnnotatedText`, which also holds the reference +/// string and can work with `string_views`. +/// +/// If used separately, it is on the user to ensure the reference string +/// is the same as what the Annotation refers to. For best results, an instance +/// is expected to be read only in this mode of operation. +/// +/// **Idea** +/// +/// Annotation is intended to be the same structure conceptually as below, +/// except the `std::vector>` hammered into a flat +/// structure to avoid multiple reallocs keeping efficiency in mind. This is +/// achieved by having markers of where sentence ends in the flat container +/// storing word ByteRanges. +/// +/// ```cpp +/// typedef ByteRange Word; +/// // std::vector, a single sentence +/// typedef std::vector Sentence; +/// std::vector // multiple sentences +/// typedef std::vector Annotation; +/// +/// Annotation example; +/// ``` +/// This structure exists to provide a consistent API to access the nested +/// sentences of varying lengths, which occur in source-text processed into +/// multiple sentences, and target-text translated from source as multiple +/// sentences, both composed of (sub)-words, providing a List[List] like access +/// while storing it in a compact and efficient manner. class Annotation { public: - /// Annotation is constructed empty. See addSentence to populate it with + /// Annotation is constructed empty. See `addSentence()` to populate it with /// annotations. - Annotation() {} + Annotation() { + // The -1-th sentence ends at 0. + sentenceEndIds_.push_back(0); + } /// Returns the number of sentences annotated in a text. - size_t numSentences() const { return sentenceBeginIds_.size(); } + size_t numSentences() const { return sentenceEndIds_.size() - 1; } - /// Returns number of words in the sentece identified by sentenceIdx. + /// Returns number of words in the sentence identified by `sentenceIdx`. size_t numWords(size_t sentenceIdx) const; - /// Adds a sentences from vector representation, internally doing + /// Adds a sentences from `vector` representation, internally doing /// extra book-keeping for the sentence terminal markings. Sentences are /// expected to be added in order as they occur in text. void addSentence(std::vector &sentence); - /// Returns a ByteRange representing wordIdx in sentenceIdx + /// Returns a ByteRange representing `wordIdx` in sentence indexed by + /// `sentenceIdx`. `wordIdx` follows 0-based indexing, and should be less than + /// `.numWords()` for `sentenceIdx` for defined behaviour. ByteRange word(size_t sentenceIdx, size_t wordIdx) const; - /// Returns a ByteRange representing sentence corresponding to sentenceIdx. + /// Returns a ByteRange representing sentence corresponding to `sentenceIdx`. + /// `sentenceIdx` follows 0-based indexing, and behaviour is defined only when + /// less than `.numSentences()`. ByteRange sentence(size_t sentenceIdx) const; private: /// A flat storage for ByteRanges. Composed of word ByteRanges, extra - /// information in sentenceBeginIds_ to denote sentence boundary markers as + /// information in sentenceEndIds_ to denote sentence boundary markers as /// indices. std::vector flatByteRanges_; - /// Stores indices where sentences begin - std::vector sentenceBeginIds_; - - /// Returns ByteRanges corresponding to beginning and end words of sentence - /// corresponding to sentenceIdx. This is useful in using the information to - /// construct a ByteRange of a sentence taking the begin from the first and - /// end from the second. - std::pair sentenceTerminals(size_t sentenceIdx) const; - - /// Returns indices of terminal (word) ByteRanges in sentenceIds_ of a - /// sentence corresponding to sentenceIdx. The distance can be used to compute - /// number of words in a sentence (numWords) and also to construct the - /// terminal ByteRanges (sentenceTerminals). - std::pair sentenceTerminalIds(size_t sentenceIdx) const; + /// Stores indices onto flatByteRanges_ of where sentences end (not inclusive, + /// aligned with C++ half interval notions). There is a 0 marker to simplify + /// sources, indicating where the -1-th sentence ends. + std::vector sentenceEndIds_; }; /// AnnotatedText is effectively std::string text + Annotation, providing the