SentenceRanges: Class to work with string_views

Adds SentenceRanges in sentence_ranges.{h,cpp} and propagates use of the
class into the rest of the pipeline.

SentenceRanges, previously a vector<vector<...>>, is now a single flat
vector<string_view>. Annotations marking sentence boundaries are stored
alongside in the class, enabling access to individual sentences as
string_views through methods (a conceptual sketch of the layout appears
below).
Jerin Philip 2021-02-17 00:31:44 +00:00
parent 9c907ea605
commit d7556bc168
10 changed files with 143 additions and 51 deletions
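
A conceptual sketch of the layout change, for illustration only. The member
names flatByteRanges_ and sentenceBeginIds_ come from sentence_ranges.h below;
TokenRanges is presumably a std::vector<string_view>, everything else here is
hypothetical:

// Before: one vector of word string_views per sentence.
//   std::vector<TokenRanges> sourceTokenRanges;   // TokenRanges ~ std::vector<string_view>
//
// After: a single flat vector of word string_views, plus indices that mark
// where each sentence begins.
//   std::vector<string_view> flatByteRanges_;     // all words, all sentences
//   std::vector<size_t> sentenceBeginIds_;        // flatByteRanges_[sentenceBeginIds_[i]]
//                                                 // is the first word of sentence i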

View File

@ -12,6 +12,7 @@ add_library(bergamot-translator STATIC
batcher.cpp
response.cpp
batch.cpp
sentence_ranges.cpp
)
target_link_libraries(bergamot-translator marian ssplit)

View File

@ -1,5 +1,5 @@
#include "request.h"
#include "sentence_ranges.h"
#include "definitions.h"
#include "response.h"
@ -13,12 +13,11 @@ namespace bergamot {
// -----------------------------------------------------------------
Request::Request(unsigned int Id, int lineNumberBegin,
std::vector<Ptr<Vocab const>> &vocabs, std::string &&source,
Segments &&segments,
std::vector<TokenRanges> &&sourceTokenRanges,
Segments &&segments, SentenceRanges &&sourceRanges,
std::promise<Response> responsePromise)
: Id_(Id), lineNumberBegin_(lineNumberBegin), vocabs_(&vocabs),
source_(std::move(source)), segments_(std::move(segments)),
sourceTokenRanges_(std::move(sourceTokenRanges)),
sourceRanges_(std::move(sourceRanges)),
response_(std::move(responsePromise)) {
counter_ = segments_.size();
@ -49,7 +48,7 @@ void Request::processHistory(size_t index, Ptr<History> history) {
void Request::completeRequest() {
// Request no longer needs to hold the content; it can be transferred to
// Response.
Response response(std::move(source_), std::move(sourceTokenRanges_),
Response response(std::move(source_), std::move(sourceRanges_),
std::move(histories_), *vocabs_);
response_.set_value(std::move(response));
}

View File

@ -17,6 +17,7 @@
#ifndef SRC_BERGAMOT_REQUEST_H_
#define SRC_BERGAMOT_REQUEST_H_
#include "sentence_ranges.h"
#include "definitions.h"
#include "response.h"
@ -36,7 +37,7 @@ class Request {
public:
Request(unsigned int Id, int lineNumberBegin,
std::vector<Ptr<Vocab const>> &vocabs_, std::string &&source,
Segments &&segments, std::vector<TokenRanges> &&sourceTokenRanges,
Segments &&segments, SentenceRanges &&sourceTokenRanges,
std::promise<Response> responsePromise);
// Obtain the count of tokens in the segment corresponding to index. Used to
@ -72,13 +73,13 @@ private:
std::atomic<int> counter_;
// source_ holds the source string to be translated. segments_ hold the
// sentences generated from source_ in vector<Words>. sourceTokenRanges_ are
// sentences generated from source_ in vector<Words>. sourceRanges_ are
// string_views of the text corresponding to these words, pointing to
// sequences in source_. histories_ is a buffer which eventually stores the
// translations of each segment in the corresponding index.
std::string source_;
Segments segments_;
std::vector<TokenRanges> sourceTokenRanges_;
SentenceRanges sourceRanges_;
std::vector<Ptr<History>> histories_;
// Members above are moved into newly constructed Response on completion

View File

@ -1,4 +1,5 @@
#include "response.h"
#include "sentence_ranges.h"
#include "common/logging.h"
#include "data/alignment.h"
@ -7,8 +8,7 @@
namespace marian {
namespace bergamot {
Response::Response(std::string &&source,
std::vector<TokenRanges> &&sourceRanges,
Response::Response(std::string &&source, SentenceRanges &&sourceRanges,
Histories &&histories, std::vector<Ptr<Vocab const>> &vocabs)
: source_(std::move(source)), sourceRanges_(std::move(sourceRanges)),
histories_(std::move(histories)), vocabs_(&vocabs) {}
@ -79,7 +79,7 @@ void Response::constructTranslation() {
const char *begin = &translation_[range.first];
targetMappings.emplace_back(begin, range.second);
targetRanges_.push_back(std::move(targetMappings));
targetRanges_.addSentence(targetMappings);
}
translationConstructed_ = true;
@ -88,21 +88,10 @@ void Response::constructTranslation() {
void Response::constructSentenceMappings(
Response::SentenceMappings &sentenceMappings) {
for (int i = 0; i < sourceRanges_.size(); i++) {
string_view first, last;
// Handle source-sentence
first = sourceRanges_[i].front();
last = sourceRanges_[i].back();
string_view src_sentence(first.data(), last.end() - first.begin());
// Handle target-sentence
first = targetRanges_[i].front();
last = targetRanges_[i].back();
string_view tgt_sentence(first.data(), last.end() - first.begin());
// Add both into sentence-mappings
sentenceMappings.emplace_back(src_sentence, tgt_sentence);
for (size_t i = 0; i < sourceRanges_.numSentences(); i++) {
string_view src = sourceRanges_.sentence(i);
string_view tgt = targetRanges_.sentence(i);
sentenceMappings.emplace_back(src, tgt);
}
}
} // namespace bergamot

View File

@ -1,6 +1,7 @@
#ifndef SRC_BERGAMOT_RESPONSE_H_
#define SRC_BERGAMOT_RESPONSE_H_
#include "sentence_ranges.h"
#include "data/types.h"
#include "definitions.h"
#include "translator/beam_search.h"
@ -31,7 +32,7 @@ class Response {
// sentenceMappings (for bergamot-translator)
public:
Response(std::string &&source, std::vector<TokenRanges> &&sourceRanges,
Response(std::string &&source, SentenceRanges &&sourceRanges,
Histories &&histories,
// Required for constructing translation and TokenRanges within
// translation lazily.
@ -84,13 +85,13 @@ private:
void constructSentenceMappings(SentenceMappings &);
std::string source_;
std::vector<TokenRanges> sourceRanges_;
SentenceRanges sourceRanges_;
Histories histories_;
std::vector<Ptr<Vocab const>> *vocabs_;
bool translationConstructed_{false};
std::string translation_;
std::vector<TokenRanges> targetRanges_;
SentenceRanges targetRanges_;
};
} // namespace bergamot
} // namespace marian

View File

@ -0,0 +1,46 @@
#include "sentence_ranges.h"
#include <cassert>
#include <iostream>
namespace marian {
namespace bergamot {
void SentenceRanges::addSentence(std::vector<string_view> &wordRanges) {
addSentence(std::begin(wordRanges), std::end(wordRanges));
}
void SentenceRanges::addSentence(WordIterator begin, WordIterator end) {
size_t size = flatByteRanges_.size();
flatByteRanges_.insert(std::end(flatByteRanges_), begin, end);
sentenceBeginIds_.push_back(size);
}
string_view SentenceRanges::sentence(size_t index) const {
size_t bos_id;
string_view eos, bos;
bos_id = sentenceBeginIds_[index];
bos = flatByteRanges_[bos_id];
if (index + 1 == numSentences()) {
eos = flatByteRanges_.back();
} else {
assert(index < numSentences());
size_t eos_id = sentenceBeginIds_[index + 1];
--eos_id;
eos = flatByteRanges_[eos_id];
}
return sentenceBetween(bos, eos);
}
string_view SentenceRanges::sentenceBetween(string_view firstWord,
string_view lastWord) const {
const char *data = firstWord.data();
size_t size = lastWord.data() + lastWord.size() - firstWord.data();
return string_view(data, size);
}
} // namespace bergamot
} // namespace marian

View File

@ -0,0 +1,52 @@
#ifndef BERGAMOT_SENTENCE_RANGES_H_
#define BERGAMOT_SENTENCE_RANGES_H_
#include "data/types.h"
#include <cassert>
#include <vector>
namespace marian {
namespace bergamot {
class SentenceRanges {
// SentenceRanges stores string_views into a source text, with additional
// annotations to mark sentence boundaries.
//
// Given the availability of annotations, this container provides the
// capability to add sentences and access individual sentences.
public:
typedef std::vector<string_view>::iterator WordIterator;
void addSentence(std::vector<string_view> &wordRanges);
void addSentence(WordIterator begin, WordIterator end);
void clear() {
flatByteRanges_.clear();
sentenceBeginIds_.clear();
}
size_t numSentences() const { return sentenceBeginIds_.size(); }
// Returns a string_view into the ith sentence.
string_view sentence(size_t index) const;
private:
// A flat storage for string_views. Can be words or sentences.
std::vector<string_view> flatByteRanges_;
// The container grows dynamically with addSentence. Indices (size_t) are
// stored rather than iterators so that sentence boundaries remain valid even
// if the underlying storage is reallocated as it grows.
std::vector<size_t> sentenceBeginIds_;
// Utility function to extract the string starting at firstWord and ending at
// lastWord as a single string-view.
string_view sentenceBetween(string_view firstWord,
string_view lastWord) const;
};
} // namespace bergamot
} // namespace marian
#endif // BERGAMOT_SENTENCE_RANGES_H_
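
A minimal usage sketch of the class above. It assumes marian::string_view is
constructible from (const char *, size_t), as it is used elsewhere in this
commit; the text and byte offsets are made up for illustration:

#include "sentence_ranges.h"
#include <string>
#include <vector>

int main() {
  using marian::string_view;
  marian::bergamot::SentenceRanges ranges;

  std::string text = "Hello world. Goodbye.";

  // Sentence 0: two word-level string_views into text.
  std::vector<string_view> first = {string_view(text.data(), 5),       // "Hello"
                                    string_view(text.data() + 6, 6)};  // "world."
  ranges.addSentence(first);

  // Sentence 1: a single word.
  std::vector<string_view> second = {string_view(text.data() + 13, 8)};  // "Goodbye."
  ranges.addSentence(second);

  // numSentences() == 2; sentence(i) spans from the first to the last word.
  string_view src0 = ranges.sentence(0);  // "Hello world."
  string_view src1 = ranges.sentence(1);  // "Goodbye."
  return 0;
}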

View File

@ -44,7 +44,7 @@ std::future<Response> Service::translateWithCopy(std::string input) {
}
std::future<Response> Service::translate(std::string &&input) {
// Takes in a blob of text. Segments and std::vector<TokenRanges> are
// Takes in a blob of text. Segments and SentenceRanges are
// extracted from the input (blob of text) and used to construct a Request
// along with a promise, whose value is set by the worker completing the
// request.
@ -57,16 +57,15 @@ std::future<Response> Service::translate(std::string &&input) {
// returns future corresponding to the promise.
Segments segments;
std::vector<TokenRanges> sourceTokenRanges;
text_processor_.process(input, segments, sourceTokenRanges);
SentenceRanges sourceRanges;
text_processor_.process(input, segments, sourceRanges);
std::promise<Response> responsePromise;
auto future = responsePromise.get_future();
Ptr<Request> request =
New<Request>(requestId_++, /* lineNumberBegin = */ 0, vocabs_,
std::move(input), std::move(segments),
std::move(sourceTokenRanges), std::move(responsePromise));
Ptr<Request> request = New<Request>(
requestId_++, /* lineNumberBegin = */ 0, vocabs_, std::move(input),
std::move(segments), std::move(sourceRanges), std::move(responsePromise));
batcher_.addWholeRequest(request);
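
For context on the promise/future flow described in the Service::translate
comments above, a hedged caller-side sketch. Only translate()'s signature
comes from this diff; the service.h header name and the helper itself are
assumptions:

#include "service.h"  // assumed header declaring marian::bergamot::Service

#include <future>
#include <string>
#include <utility>

// Hypothetical helper: block until the worker completes the Request and
// fulfils the promise, then return the finished Response.
marian::bergamot::Response
translateBlocking(marian::bergamot::Service &service, std::string &&input) {
  std::future<marian::bergamot::Response> responseFuture =
      service.translate(std::move(input));
  responseFuture.wait();        // worker sets the promise value on completion
  return responseFuture.get();  // moves the completed Response out
}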

View File

@ -1,4 +1,5 @@
#include "text_processor.h"
#include "sentence_ranges.h"
#include "data/types.h"
#include "definitions.h"
@ -10,9 +11,9 @@ namespace marian {
namespace bergamot {
Segment TextProcessor::tokenize(const string_view &segment,
TokenRanges &tokenRanges) {
std::vector<string_view> &wordRanges) {
return vocabs_->front()->encodeWithByteRanges(
segment, tokenRanges, /*addEOS=*/false, /*inference=*/true);
segment, wordRanges, /*addEOS=*/false, /*inference=*/true);
}
TextProcessor::TextProcessor(std::vector<Ptr<Vocab const>> &vocabs,
@ -26,7 +27,7 @@ TextProcessor::TextProcessor(std::vector<Ptr<Vocab const>> &vocabs,
}
void TextProcessor::process(const string_view &query, Segments &segments,
std::vector<TokenRanges> &sourceRanges) {
SentenceRanges &sourceRanges) {
auto sentenceStream = sentence_splitter_.createSentenceStream(query);
std::string_view sentenceStringPiece;
@ -34,21 +35,22 @@ void TextProcessor::process(const string_view &query, Segments &segments,
while (sentenceStream >> sentenceStringPiece) {
marian::string_view sentence(sentenceStringPiece.data(),
sentenceStringPiece.size());
TokenRanges tokenRanges;
Segment segment = tokenize(sentence, tokenRanges);
std::vector<string_view> wordRanges;
Segment segment = tokenize(sentence, wordRanges);
// There are some cases where SentencePiece or vocab returns no words
// after normalization. The size check prevents empty entries from being added.
if (segment.size() > 0) {
// Truncate the segment into chunks of at most max_input_sentence_tokens_ tokens.
truncate(segment, tokenRanges, segments, sourceRanges);
truncate(segment, wordRanges, segments, sourceRanges);
}
}
}
void TextProcessor::truncate(Segment &segment, TokenRanges &tokenRanges,
Segments &segments,
std::vector<TokenRanges> &sourceRanges) {
void TextProcessor::truncate(Segment &segment,
std::vector<string_view> &wordRanges,
Segments &segments, SentenceRanges &sourceRanges) {
for (int offset = 0; offset < segment.size();
offset += max_input_sentence_tokens_) {
auto start = segment.begin() + offset;
@ -59,8 +61,8 @@ void TextProcessor::truncate(Segment &segment, TokenRanges &tokenRanges,
segments.emplace_back(start, start + diff);
segments.back().push_back(sourceEosId());
auto astart = tokenRanges.begin() + offset;
sourceRanges.emplace_back(astart, astart + diff);
auto astart = wordRanges.begin() + offset;
sourceRanges.addSentence(astart, astart + diff);
}
}
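
To illustrate the chunking arithmetic in TextProcessor::truncate above, a
standalone sketch using hypothetical stand-ins (ints for Words, -1 for EOS)
rather than the real Segment/Segments types:

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

int main() {
  std::vector<int> segment = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
  const std::size_t maxInputSentenceTokens = 4;  // stand-in for max_input_sentence_tokens_
  const int eosId = -1;                          // stand-in for sourceEosId()

  std::vector<std::vector<int>> segments;
  for (std::size_t offset = 0; offset < segment.size();
       offset += maxInputSentenceTokens) {
    std::size_t diff =
        std::min(maxInputSentenceTokens, segment.size() - offset);
    // Copy the chunk [offset, offset + diff) and append EOS, mirroring how
    // truncate() emplaces a sub-Segment and then pushes sourceEosId().
    segments.emplace_back(segment.begin() + offset,
                          segment.begin() + offset + diff);
    segments.back().push_back(eosId);
  }

  // Prints "5 5 3": chunks of 4, 4 and 2 tokens, each with EOS appended.
  for (const auto &chunk : segments)
    std::cout << chunk.size() << " ";
  std::cout << "\n";
}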

View File

@ -1,6 +1,7 @@
#ifndef SRC_BERGAMOT_TEXT_PROCESSOR_H_
#define SRC_BERGAMOT_TEXT_PROCESSOR_H_
#include "sentence_ranges.h"
#include "data/types.h"
#include "data/vocab.h"
#include "definitions.h"
@ -23,16 +24,17 @@ public:
explicit TextProcessor(std::vector<Ptr<Vocab const>> &vocabs, Ptr<Options>);
void process(const string_view &query, Segments &segments,
std::vector<TokenRanges> &sourceRanges);
SentenceRanges &sourceRanges);
private:
// Tokenizes an input string and returns the corresponding Words. Loads the
// byte-ranges of the tokens into tokenRanges.
Segment tokenize(const string_view &input, TokenRanges &tokenRanges);
Segment tokenize(const string_view &input,
std::vector<string_view> &tokenRanges);
// Truncate a sentence into chunks of at most max_input_sentence_tokens_ tokens.
void truncate(Segment &sentence, TokenRanges &tokenRanges, Segments &segments,
std::vector<TokenRanges> &sourceRanges);
void truncate(Segment &sentence, std::vector<string_view> &tokenRanges,
Segments &segments, SentenceRanges &sourceRanges);
// shorthand, used only in truncate()
const Word sourceEosId() const { return vocabs_->front()->getEosId(); }