diff --git a/src/tests/annotation_tests.cpp b/src/tests/annotation_tests.cpp index d323b9d..0f02a7a 100644 --- a/src/tests/annotation_tests.cpp +++ b/src/tests/annotation_tests.cpp @@ -23,9 +23,6 @@ TEST_CASE("Test Annotation API with random sentences") { std::mt19937 randomIntGen_; randomIntGen_.seed(42); - AnnotatedText testAnnotation; // This the container we add through API and - // check if the access is correct. - // External book-keeping so we have ground truths. Each element represents a // sentence. @@ -45,7 +42,7 @@ TEST_CASE("Test Annotation API with random sentences") { // // 4-0 4-1 4-2 4-3 // - // Words are separated by space units. + // Tokens are contiguous because that's how SentencePiece works. // // Below, we accumulate the text with intended structure as above, and // ground-truth tables populated to be aware of the ByteRanges where they are @@ -53,9 +50,10 @@ TEST_CASE("Test Annotation API with random sentences") { if (debug) { std::cout << "Preparing text and ground truth-tables" << std::endl; } + std::string text; for (size_t idx = 0; idx < sentences; idx++) { if (idx != 0) - testAnnotation.text += "\n"; + text += "\n"; // Words can be zero, we need to support empty word sentences as well. size_t numWords = randomIntGen_() % maxWords; @@ -65,23 +63,16 @@ TEST_CASE("Test Annotation API with random sentences") { // For empty sentence, we expect it to be empty and marked in position where // the existing string is if needed to be pointed out. - size_t before = testAnnotation.text.size() - 1; + size_t before = text.size() - 1; size_t sentenceBegin{before}, sentenceEnd{before}; for (size_t idw = 0; idw < numWords; idw++) { - if (idw != 0) { - testAnnotation.text += " "; - if (debug) { - std::cout << " "; - } - } - // Get new beginning, accounting for space above. 
- before = testAnnotation.text.size(); + before = text.size(); // Add the word std::string word = std::to_string(idx) + "-" + std::to_string(idw); - testAnnotation.text += word; + text += word; // Do math, before, before + new-word's size. wordByteRanges.push_back((ByteRange){before, before + word.size()}); @@ -105,6 +96,9 @@ TEST_CASE("Test Annotation API with random sentences") { groundTruthSentences.push_back((ByteRange){sentenceBegin, sentenceEnd}); } + AnnotatedText testAnnotation(std::move(text)); // This is the container we add through API and + // check if the access is correct. + // We prepare string_views now with the known ByteRanges and use the // string_view based AnnotatedText.addSentence(...) API to add sentences to // transparently convert from string_views to ByteRanges, rebasing/working out @@ -116,6 +110,7 @@ TEST_CASE("Test Annotation API with random sentences") { } std::vector> wordStringViews; + std::vector::const_iterator sentence_iter = groundTruthSentences.begin(); for (auto &sentence : groundTruthWords) { std::vector wordByteRanges; bool first{true}; @@ -132,7 +127,8 @@ TEST_CASE("Test Annotation API with random sentences") { std::cout << std::string(wordView); } } - testAnnotation.addSentence(wordByteRanges); + testAnnotation.recordExistingSentence(wordByteRanges.begin(), wordByteRanges.end(), testAnnotation.text.data() + sentence_iter->begin); + ++sentence_iter; wordStringViews.push_back(wordByteRanges); if (debug) { std::cout << std::endl; @@ -207,7 +203,7 @@ TEST_CASE("Test Annotation API with random sentences") { // Sentence if the random test above does not cover it for some reason. int emptySentenceIdx = sentences; std::vector emptySentence; - testAnnotation.addSentence(emptySentence); + testAnnotation.recordExistingSentence(emptySentence.begin(), emptySentence.end(), testAnnotation.text.data() + testAnnotation.text.size()); // There are no words. 
CHECK(testAnnotation.numWords(emptySentenceIdx) == 0); diff --git a/src/translator/annotation.cpp b/src/translator/annotation.cpp index c27d784..90e02e0 100644 --- a/src/translator/annotation.cpp +++ b/src/translator/annotation.cpp @@ -1,130 +1,68 @@ #include "annotation.h" #include -#include namespace marian { namespace bergamot { -void Annotation::addSentence(std::vector &sentence) { - flatByteRanges_.insert(std::end(flatByteRanges_), std::begin(sentence), - std::end(sentence)); - size_t size = flatByteRanges_.size(); - sentenceEndIds_.push_back(size); +AnnotatedText::AnnotatedText(std::string &&t) : text(std::move(t)) { + // Treat the entire text as a gap that recordExistingSentence will break. + annotation.token_begin_.back() = text.size(); } -size_t Annotation::numWords(size_t sentenceIdx) const { - size_t bosId, eosId; - bosId = sentenceEndIds_[sentenceIdx]; // Half interval, so; - eosId = sentenceEndIds_[sentenceIdx + 1]; - // Difference between eosId and bosId is the number of words. - return eosId - bosId; +void AnnotatedText::appendSentence(string_view prefix, std::vector::iterator begin, std::vector::iterator end) { + assert(annotation.token_begin_.back() == text.size()); + // We'll be adding tokens from the sentence and another gap. + annotation.token_begin_.reserve(annotation.token_begin_.size() + (end - begin) + 1); + + // prefix is just end of the previous one. + appendEndingWhitespace(prefix); + + // Appending sentence text. + std::size_t offset = text.size(); + for (std::vector::iterator token = begin; token != end; ++token) { + offset += token->size(); + annotation.token_begin_.push_back(offset); + } + if (begin != end) { + text.append(begin->data(), (end - 1)->data() + (end - 1)->size()); + assert(offset == text.size()); // Tokens should be contiguous. + } + + // Add the gap after the sentence. This is empty for now, but will be + // extended with appendEndingWhitespace or another appendSentence. 
+ annotation.gap_.push_back(annotation.token_begin_.size() - 1); + annotation.token_begin_.push_back(offset); } -ByteRange Annotation::sentence(size_t sentenceIdx) const { - size_t bosId, eosId; - bosId = sentenceEndIds_[sentenceIdx]; // Half interval, so; - eosId = sentenceEndIds_[sentenceIdx + 1]; - ByteRange sentenceByteRange; +void AnnotatedText::appendEndingWhitespace(string_view whitespace) { + text.append(whitespace.data(), whitespace.size()); + annotation.token_begin_.back() = text.size(); +} - if (bosId == eosId) { - // We have an empty sentence. However, we want to be able to point where in - // target this happened through the ranges. We are looking for the end of - // the flatByteRange and non-empty sentence before this happened and - // construct empty string-view equivalent ByteRange. - ByteRange eos = flatByteRanges_[eosId - 1]; - sentenceByteRange = ByteRange{eos.end, eos.end}; +void AnnotatedText::recordExistingSentence(std::vector::iterator begin, std::vector::iterator end, const char *sentence_begin) { + assert(sentence_begin >= text.data()); + assert(sentence_begin <= text.data() + text.size()); + assert(begin == end || sentence_begin == begin->data()); + assert(!annotation.token_begin_.empty()); + assert(annotation.token_begin_.back() == text.size()); + // Clip off size token ending. + annotation.token_begin_.resize(annotation.token_begin_.size() - 1); + for (std::vector::iterator i = begin; i != end; ++i) { + assert(i->data() >= text.data()); // In range. + assert(i->data() + i->size() <= text.data() + text.size()); // In range + assert(i + 1 == end || i->data() + i->size() == (i+1)->data()); // Contiguous + annotation.token_begin_.push_back(i->data() - text.data()); + } + // Gap token after sentence. 
+ annotation.gap_.push_back(annotation.token_begin_.size()); + if (begin != end) { + annotation.token_begin_.push_back((end - 1)->data() + (end - 1)->size() - text.data()); } else { - ByteRange bos = flatByteRanges_[bosId]; - ByteRange eos = flatByteRanges_[eosId - 1]; - sentenceByteRange = ByteRange{bos.begin, eos.end}; + // empty sentence. + annotation.token_begin_.push_back(sentence_begin - text.data()); } - return sentenceByteRange; -} - -ByteRange Annotation::word(size_t sentenceIdx, size_t wordIdx) const { - size_t bosOffset = sentenceEndIds_[sentenceIdx]; - return flatByteRanges_[bosOffset + wordIdx]; -} - -string_view AnnotatedText::word(size_t sentenceIdx, size_t wordIdx) const { - auto terminals = annotation.word(sentenceIdx, wordIdx); - return string_view(&text[terminals.begin], terminals.size()); -} - -string_view AnnotatedText::sentence(size_t sentenceIdx) const { - auto sentenceAsByteRange = annotation.sentence(sentenceIdx); - return asStringView(sentenceAsByteRange); -} - -void AnnotatedText::appendSentence(std::string prefix, std::string &reference, - std::vector &wordRanges) { - text += prefix; - size_t offset = text.size(); // Get size before to do ByteRange arithmetic - text += reference; // Append reference to text - std::vector sentence; - for (auto &wordView : wordRanges) { - size_t thisWordBegin = offset + wordView.data() - reference.data(); - sentence.push_back( - ByteRange{thisWordBegin, thisWordBegin + wordView.size()}); - } - annotation.addSentence(sentence); -} - -void AnnotatedText::addSentence(std::vector &wordRanges) { - addSentence(std::begin(wordRanges), std::end(wordRanges)); -}; - -void AnnotatedText::addSentence(std::vector::iterator begin, - std::vector::iterator end) { - std::vector sentence; - for (auto p = begin; p != end; p++) { - size_t begin_offset = p->data() - text.data(); - sentence.push_back(ByteRange{begin_offset, begin_offset + p->size()}); - } - annotation.addSentence(sentence); -}; - -ByteRange 
AnnotatedText::wordAsByteRange(size_t sentenceIdx, - size_t wordIdx) const { - return annotation.word(sentenceIdx, wordIdx); -} - -ByteRange AnnotatedText::sentenceAsByteRange(size_t sentenceIdx) const { - return annotation.sentence(sentenceIdx); -} - -string_view AnnotatedText::asStringView(const ByteRange &byteRange) const { - const char *data = &text[byteRange.begin]; - size_t size = byteRange.size(); - return string_view(data, size); -} - -string_view AnnotatedText::gap(size_t sentenceIdx) const { - // Find start of filler-text before, there's a corner case when there's no - // sentence before. - const char *start = nullptr; - if (sentenceIdx == 0) { - // If first sentence, filler begins at start of whole-text. - start = text.data(); - } else { - // Otherwise, filler begins at end of previous sentence. - string_view sentenceBefore = sentence(sentenceIdx - 1); - start = sentenceBefore.data() + sentenceBefore.size(); - } - - // Find end of filler-text, but there is a corner-case to handle. - const char *end = nullptr; - if (sentenceIdx == numSentences()) { - // If last sentence, manually find end of whole-text. - const char *begin = text.data(); - end = begin + text.size(); - } else { - // Otherwise, the filler ends at the start of next sentence. - string_view sentenceAfter = sentence(sentenceIdx); - end = sentenceAfter.data(); - } - - return string_view(start, end - start); + // Add back size token ending. + annotation.token_begin_.push_back(text.size()); } } // namespace bergamot diff --git a/src/translator/annotation.h b/src/translator/annotation.h index 8cb7caf..555ab53 100644 --- a/src/translator/annotation.h +++ b/src/translator/annotation.h @@ -17,83 +17,99 @@ struct ByteRange { const size_t size() const { return end - begin; } }; -/// An Annotation is a collection of ByteRanges used to denote ancillary -/// information of sentences and words on a text of string. 
Annotation is meant -/// for consumption on platforms where `string_view` creates problems (eg: -/// exports through WASM) conveniently rebasing them as required into -/// ByteRanges. See AnnotatedText for cases where this is a non-issue. +/// Annotation expresses sentence and token boundary information as ranges of +/// bytes in a string, but does not itself own the string. +/// +/// See also AnnotatedText, which owns Annotation and the string. AnnotatedText +/// wraps these ByteRange functions to provide a string_view interface. /// -/// **Usage** +/// Text is divided into gaps (whitespace between sentences) and sentences like +/// so: +/// gap sentence gap sentence gap +/// Because gaps appear at the beginning and end of the text, there's always +/// one more gap than there are sentences. /// -/// To ensure rebasing is consistent during creation and updation, use -/// `Annotation` best through `AnnotatedText`, which also holds the reference -/// string and can work with `string_views`. +/// The entire text is an unbroken sequence of tokens (i.e. the end of a token +/// is the beginning of the next token). A gap is exactly one token containing +/// whatever whitespace is between the sentences. A sentence is a sequence of +/// tokens. /// -/// If used separately, it is on the user to ensure the reference string -/// is the same as what the Annotation refers to. For best results, an instance -/// is expected to be read only in this mode of operation. +/// Since we are using SentencePiece, a token can include whitespace. The term +/// "word" is used, somewhat incorrectly, as a synonym of token. /// -/// **Idea** -/// -/// Annotation is intended to be the same structure conceptually as below, -/// except the `std::vector>` hammered into a flat -/// structure to avoid multiple reallocs keeping efficiency in mind. This is -/// achieved by having markers of where sentence ends in the flat container -/// storing word ByteRanges. 
-/// -/// ```cpp -/// typedef ByteRange Word; -/// // std::vector, a single sentence -/// typedef std::vector Sentence; -/// std::vector // multiple sentences -/// typedef std::vector Annotation; -/// -/// Annotation example; -/// ``` -/// This structure exists to provide a consistent API to access the nested -/// sentences of varying lengths, which occur in source-text processed into -/// multiple sentences, and target-text translated from source as multiple -/// sentences, both composed of (sub)-words, providing a List[List] like access -/// while storing it in a compact and efficient manner. +/// A gap can be empty (for example there may not have been whitespace at the +/// beginning). A sentence can also be empty (typically the translation system +/// produced empty output). That's fine, these are just empty ranges as you +/// would expect. class Annotation { public: - /// Annotation is constructed empty. See `addSentence()` to populate it with - /// annotations. + /// Initially an empty string. Populated by AnnotatedText. Annotation() { - // The -1-th sentence ends at 0. - sentenceEndIds_.push_back(0); + token_begin_.push_back(0); + token_begin_.push_back(0); + gap_.push_back(0); } - size_t numSentences() const { return sentenceEndIds_.size() - 1; } + size_t numSentences() const { return gap_.size() - 1; } /// Returns number of words in the sentence identified by `sentenceIdx`. - size_t numWords(size_t sentenceIdx) const; - - /// Adds a sentences from `vector` representation, internally doing - /// extra book-keeping for the sentence terminal markings. Sentences are - /// expected to be added in order as they occur in text. - void addSentence(std::vector &sentence); + size_t numWords(size_t sentenceIdx) const { + return gap_[sentenceIdx + 1] - gap_[sentenceIdx] - 1 /* minus the gap */; + } /// Returns a ByteRange representing `wordIdx` in sentence indexed by /// `sentenceIdx`. 
`wordIdx` follows 0-based indexing, and should be less than /// `.numWords()` for `sentenceIdx` for defined behaviour. - ByteRange word(size_t sentenceIdx, size_t wordIdx) const; + ByteRange word(size_t sentenceIdx, size_t wordIdx) const { + size_t tokenIdx = gap_[sentenceIdx] + 1 + wordIdx; + return ByteRange {token_begin_[tokenIdx], token_begin_[tokenIdx + 1]}; + } /// Returns a ByteRange representing sentence corresponding to `sentenceIdx`. /// `sentenceIdx` follows 0-based indexing, and behaviour is defined only when /// less than `.numSentences()`. - ByteRange sentence(size_t sentenceIdx) const; + ByteRange sentence(size_t sentenceIdx) const { + return ByteRange { + token_begin_[gap_[sentenceIdx] + 1], /*end of whitespace before */ + token_begin_[gap_[sentenceIdx + 1]] /*beginning of whitespace after */ + }; + } + + ByteRange gap(size_t gapIdx) const { + size_t tokenIdx = gap_[gapIdx]; + return ByteRange {token_begin_[tokenIdx], token_begin_[tokenIdx + 1]}; + } private: - /// A flat storage for ByteRanges. Composed of word ByteRanges, extra - /// information in sentenceEndIds_ to denote sentence boundary markers as - /// indices. - std::vector flatByteRanges_; + friend class AnnotatedText; + /// Map from token index to byte offset at which it begins. Token i is: + /// [token_begin_[i], token_begin_[i+1]) + /// The vector is padded so that these indices are always valid, even at the + /// end. So token_begin_.size() is the number of tokens plus 1. + std::vector token_begin_; - /// Stores indices onto flatByteRanges_ of where sentences end (not inclusive, - /// aligned with C++ half interval notions). There is a 0 marker to simplify - /// sources, indicating where the -1-th sentence ends. - std::vector sentenceEndIds_; + /// Indices of tokens that correspond to gaps between sentences. These are + /// indices into token_begin_. 
+ /// Gap g is byte range: + /// [token_begin_[gap_[g]], token_begin_[gap_[g]+1]) + /// Sentence s is byte range: + /// [token_begin_[gap_[s]+1], token_begin_[gap_[s+1]]) + /// A sentence does not include whitespace at the beginning or end. + /// + /// gap_.size() == numSentences() + 1. + /// + /// Example: empty text "" -> just an empty gap. + /// token_begin_ = {0, 0}; + /// gap_ = {0}; + /// + /// Example: only space " " -> just a gap containing the space. + /// token_begin_ = {0, 1}; + /// gap_ = {0}; + /// + /// Example: one token "hi" -> empty gap, sentence with one token, empty gap + /// token_begin_ = {0, 0, 2, 2}; + /// gap_ = {0, 2}; + std::vector gap_; }; /// AnnotatedText is effectively std::string text + Annotation, providing the @@ -107,7 +123,6 @@ private: /// /// 3. Bind the text and annotations together, to move around as a meaningful /// unit. - struct AnnotatedText { public: std::string text; ///< Blob of string elements in annotation refers to. @@ -122,7 +137,31 @@ public: /// Construct moving in a string (for efficiency purposes, copying string /// constructor is disallowed). - AnnotatedText(std::string &&text) : text(std::move(text)){}; + AnnotatedText(std::string &&text); + + /// Appends a sentence to the existing text and transparently rebases + /// string_views. Since this tracks only prefix, remember + /// appendEndingWhitespace. + /// The string_views must not already be in text. + void appendSentence( + string_view prefix, + std::vector::iterator tokens_begin, + std::vector::iterator tokens_end); + + /// Append the whitespace at the end of input. string_view must not be in + /// text. + void appendEndingWhitespace(string_view whitespace); + + /// Record the existence of a sentence that is already in text. The + /// iterators are over string_views for each token that must be in text + /// already. This function must be called to record sentences in order. 
+ /// Normally the beginning of the sentence can be inferred from + /// tokens_begin->data() but the tokens could be empty, so sentence_begin is + /// required to know where the sentence is. + void recordExistingSentence( + std::vector::iterator tokens_begin, + std::vector::iterator tokens_end, + const char *sentence_begin); /// Returns the number of sentences in the annotation structure. const size_t numSentences() const { return annotation.numSentences(); } @@ -132,46 +171,44 @@ public: return annotation.numWords(sentenceIdx); } - /// Appends a sentence to the existing text and transparently rebases - /// string_views - void appendSentence(std::string prefix, std::string &reference, - std::vector &wordRanges); - - /// Adds a sentence, used to load from SentencePiece annotations conveniently. - void addSentence(std::vector &wordRanges); - - /// Adds a sentence between two iterators, often useful while constructing - /// from parts of a container. - void addSentence(std::vector::iterator begin, - std::vector::iterator end); - /// Returns a string_view representing wordIdx in sentenceIdx - string_view word(size_t sentenceIdx, size_t wordIdx) const; + string_view word(size_t sentenceIdx, size_t wordIdx) const { + return asStringView(annotation.word(sentenceIdx, wordIdx)); + } /// Returns a string_view representing sentence corresponding to sentenceIdx. - string_view sentence(size_t sentenceIdx) const; + string_view sentence(size_t sentenceIdx) const { + return asStringView(annotation.sentence(sentenceIdx)); + } /// Returns the string_view of the gap between two sentences in the container. /// /// More precisely where `i = sentenceIdx, N = numSentences()` for brevity: /// - /// * For `i = 0`: The gap between the start of text and the first sentence. + /// * For `i = 0`: The gap between the start of text and the 0th sentence. /// * For `i = 1...N-1`, returns the text comprising of the gap - /// between the `i-1`-th and `i`-th sentence. 
- /// * For `i = N`, the gap between the last sentence and end of + /// between the `i-1`-th and `i`-th sentence. + /// * For `i = N`, the gap between the last (N-1th) sentence and end of /// text. - /// @param sentenceIdx: Can be between `[0, numSentences()]`. - string_view gap(size_t sentenceIdx) const; + string_view gap(size_t sentenceIdx) const { + return asStringView(annotation.gap(sentenceIdx)); + } /// Returns a ByteRange representing wordIdx in sentenceIdx - ByteRange wordAsByteRange(size_t sentenceIdx, size_t wordIdx) const; + ByteRange wordAsByteRange(size_t sentenceIdx, size_t wordIdx) const { + return annotation.word(sentenceIdx, wordIdx); + } /// Returns a ByteRange representing sentence corresponding to sentenceIdx. - ByteRange sentenceAsByteRange(size_t sentenceIdx) const; + ByteRange sentenceAsByteRange(size_t sentenceIdx) const { + return annotation.sentence(sentenceIdx); + } private: - string_view asStringView(const ByteRange &byteRange) const; + string_view asStringView(const ByteRange &byteRange) const { + return string_view(text.data() + byteRange.begin, byteRange.size()); + } }; } // namespace bergamot diff --git a/src/translator/response_builder.cpp b/src/translator/response_builder.cpp index 037d456..b2f561b 100644 --- a/src/translator/response_builder.cpp +++ b/src/translator/response_builder.cpp @@ -75,22 +75,19 @@ void ResponseBuilder::buildTranslatedText(Histories &histories, // For each sentence, prepend the filler text between the corresponding // source-sentence and the source-sentence before. string_view pre = response.source.gap(sentenceIdx); - response.target.appendSentence(std::string(pre.data(), pre.size()), - decoded, targetSentenceMappings); + response.target.appendSentence(pre, targetSentenceMappings.begin(), targetSentenceMappings.end()); // If this is the last history to be decoded and translated-text // constructed, append the text till the end, which could be spaces or // empty. 
if (sentenceIdx + 1 == histories.size()) { - string_view post = response.source.gap(sentenceIdx + 1); - response.target.text += std::string(post.data(), post.size()); + response.target.appendEndingWhitespace(response.source.gap(sentenceIdx + 1)); } break; } case ConcatStrategy::SPACE: { - std::string delimiter = (sentenceIdx == 0) ? "" : " "; - response.target.appendSentence(delimiter, decoded, - targetSentenceMappings); + string_view delimiter = (sentenceIdx == 0) ? "" : " "; + response.target.appendSentence(delimiter, targetSentenceMappings.begin(), targetSentenceMappings.end()); break; } diff --git a/src/translator/text_processor.cpp b/src/translator/text_processor.cpp index 457e2b9..bca5fd1 100644 --- a/src/translator/text_processor.cpp +++ b/src/translator/text_processor.cpp @@ -41,15 +41,16 @@ void TextProcessor::process(AnnotatedText &source, Segments &segments) { // There are some cases where SentencePiece or vocab returns no words // after normalization. 0 prevents any empty entries from being added. if (segment.size() > 0) { - // Truncate segment into max_input_size segments. - truncate(segment, wordRanges, segments, source); + // Wrap segment into sentences of at most max_length_break_ tokens and + // tell source about them. 
+ wrap(segment, wordRanges, segments, source); } } } -void TextProcessor::truncate(Segment &segment, - std::vector &wordRanges, - Segments &segments, AnnotatedText &source) { +void TextProcessor::wrap(Segment &segment, + std::vector &wordRanges, + Segments &segments, AnnotatedText &source) { for (size_t offset = 0; offset < segment.size(); offset += max_length_break_) { auto start = segment.begin() + offset; @@ -61,7 +62,8 @@ void TextProcessor::truncate(Segment &segment, segments.back().push_back(sourceEosId()); auto astart = wordRanges.begin() + offset; - source.addSentence(astart, astart + diff); + // diff > 0 + source.recordExistingSentence(astart, astart + diff, astart->data()); } } diff --git a/src/translator/text_processor.h b/src/translator/text_processor.h index f5d4d88..7328877 100644 --- a/src/translator/text_processor.h +++ b/src/translator/text_processor.h @@ -32,9 +32,9 @@ private: Segment tokenize(const string_view &input, std::vector &tokenRanges); - // Truncate sentence into max_input_size segments. - void truncate(Segment &sentence, std::vector &tokenRanges, - Segments &segments, AnnotatedText &source); + // Wrap into sentences of at most max_length_break_ tokens and add to source. + void wrap(Segment &sentence, std::vector &tokenRanges, + Segments &segments, AnnotatedText &source); // shorthand, used only in truncate() // vocabs_->sources().front() is invoked as we currently only support one source vocab