Renaming variables; Enhancing documentation

Jerin Philip 2021-02-15 20:21:10 +00:00
parent ca6ca154b9
commit d5a5e75451
3 changed files with 130 additions and 63 deletions

View File

@@ -10,14 +10,15 @@
namespace marian {
namespace bergamot {
// -----------------------------------------------------------------
Request::Request(unsigned int Id, int lineNumberBegin,
std::vector<Ptr<Vocab const>> &vocabs, std::string &&source,
Segments &&segments,
-std::vector<TokenRanges> &&sourceAlignments,
+std::vector<TokenRanges> &&sourceTokenRanges,
std::promise<Response> responsePromise)
: Id_(Id), lineNumberBegin_(lineNumberBegin), vocabs_(&vocabs),
source_(std::move(source)), segments_(std::move(segments)),
-sourceAlignments_(std::move(sourceAlignments)),
+sourceTokenRanges_(std::move(sourceTokenRanges)),
response_(std::move(responsePromise)) {
counter_ = segments_.size();
@@ -48,7 +49,7 @@ void Request::processHistory(size_t index, Ptr<History> history) {
void Request::completeRequest() {
// Request no longer needs to hold the content and can transfer it to
// Response.
-Response response(std::move(source_), std::move(sourceAlignments_),
+Response response(std::move(source_), std::move(sourceTokenRanges_),
std::move(histories_), *vocabs_);
response_.set_value(std::move(response));
}
@@ -58,6 +59,8 @@ bool Request::operator<(const Request &b) const {
return Id_ < b.Id_;
}
// ------------------------------------------------------------------
RequestSentence::RequestSentence(size_t index, Ptr<Request> request)
: index_(index), request_(request) {}
@@ -87,5 +90,41 @@ bool operator<(const RequestSentence &a, const RequestSentence &b) {
return a.request_ < b.request_;
}
// ----------------------------------------------------------------------
void Batch::reset() {
Id_ = 0;
sentences_.clear();
}
void Batch::log() {
int numTokens{0}, maxLength{0};
for (auto &sentence : sentences_) {
numTokens += sentence.numTokens();
maxLength = std::max(maxLength, static_cast<int>(sentence.numTokens()));
}
LOG(info, "Batch(Id_={}, tokens={}, max-length={}, sentences_={})", Id_,
numTokens, maxLength, sentences_.size());
}
void Batch::add(const RequestSentence &sentence) {
sentences_.push_back(sentence);
}
void Batch::setId(int Id) {
assert(Id > 0);
Id_ = Id;
if (Id % 500 == 0) {
log();
}
}
void Batch::completeBatch(const Histories &histories) {
for (int i = 0; i < sentences_.size(); i++) {
sentences_[i].completeSentence(histories[i]);
}
}
} // namespace bergamot
} // namespace marian

View File

@@ -3,20 +3,19 @@
//
// Request: holds the input blob of text, Segments (vector<Words>) which are
// to go to the batching mechanism and alignments between the processed
-// segments and the input blob (sourceAlignments). In addition, Request takes
+// segments and the input blob (sourceTokenRanges). In addition, Request takes
// care of the barrier which fires when all the Segments in a request are done
-// translating by the workers (BatchTranslator). Request is to be extended with
-// notions of Priority (sequence, user-given).
+// translating by the workers (BatchTranslator).
+// TODO(jerinphilip): Extend Request with notions of Priority (sequence,
+// user-given).
//
-// RequestSentence: is a tuple of (index, Request*). This provides the
+// RequestSentence: is a tuple of (index, Ptr<Request>). This provides the
// batching mechanism access to the segment within the request. The backref to
// Request allows triggering the barrier upon completion of the last
// sentence by a worker.
//
-// PCItem: is a vector of RequestSentences and a batchNumber, which is what the
-// PCQueue holds. The batches are constructed from segments returned by a
-// RequestSentence. Can be enhanced with paddingSize, countTokens eventually for
-// logging.
+// Batch: is a vector of RequestSentences tagged with a batchNumber, which is
+// what the PCQueue holds. Batch is "produced" by the Batcher.
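To make the relationships described above concrete, here is a minimal standalone sketch. The Mini* names and the Word/Segment aliases are simplified stand-ins invented for illustration, not the actual bergamot types: a request owns the source text and its segments, a request-sentence is an (index, pointer-to-request) view into one segment, and a batch is a numbered vector of such views.

#include <iostream>
#include <memory>
#include <string>
#include <vector>

// Simplified stand-ins: a Word is a token id, a Segment is one sentence.
using Word = unsigned int;
using Segment = std::vector<Word>;
using Segments = std::vector<Segment>;

// Owns the input blob and the segments cut out of it (cf. Request).
struct MiniRequest {
  std::string source;
  Segments segments;
};

// (index, pointer-to-request): a view into one segment (cf. RequestSentence).
struct MiniRequestSentence {
  size_t index;
  std::shared_ptr<MiniRequest> request;
  const Segment &segment() const { return request->segments[index]; }
};

// A numbered vector of views; what the PCQueue would carry (cf. Batch).
struct MiniBatch {
  int id = 0;
  std::vector<MiniRequestSentence> sentences;
};

int main() {
  auto request = std::make_shared<MiniRequest>(
      MiniRequest{"Hello world. How are you?", {{11, 12}, {13, 14, 15}}});

  MiniBatch batch;
  batch.id = 1;
  for (size_t i = 0; i < request->segments.size(); ++i)
    batch.sentences.push_back({i, request});

  std::cout << "batch " << batch.id << " carries " << batch.sentences.size()
            << " sentences; sentence 1 has "
            << batch.sentences[1].segment().size() << " tokens\n";
}

Keeping the data inside the request and handing out only (index, pointer) views is what lets the batching mechanism regroup sentences freely without copying them.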
#ifndef SRC_BERGAMOT_REQUEST_H_
#define SRC_BERGAMOT_REQUEST_H_
@@ -37,23 +36,10 @@ namespace marian {
namespace bergamot {
class Request {
-private:
-unsigned int Id_;
-int lineNumberBegin_;
-std::string source_;
-std::atomic<int> counter_;
-std::vector<Ptr<Vocab const>> *vocabs_;
-Segments segments_;
-std::vector<TokenRanges> sourceAlignments_;
-std::vector<Ptr<History>> histories_;
-std::promise<Response> response_;
public:
Request(unsigned int Id, int lineNumberBegin,
std::vector<Ptr<Vocab const>> &vocabs_, std::string &&source,
-Segments &&segments, std::vector<TokenRanges> &&sourceAlignments,
+Segments &&segments, std::vector<TokenRanges> &&sourceTokenRanges,
std::promise<Response> responsePromise);
// Obtain the count of tokens in the segment corresponding to index. Used to
@@ -68,7 +54,8 @@ public:
// several requests.
Segment getSegment(size_t index) const;
-// For notions of priority among requests (used to enable <set> in Batcher).
+// For notions of priority among requests, used to enable std::set in
+// Batcher.
bool operator<(const Request &request) const;
// Processes a history obtained after translating in a heterogeneous batch
@@ -77,20 +64,60 @@ public:
// On completion of last segment, sets value of the promise.
void completeRequest();
private:
unsigned int Id_;
int lineNumberBegin_;
// Multiple translation-workers can concurrently access the same Request. The
// following counter is therefore atomic; it holds the number of sentences
// remaining to be translated.
std::atomic<int> counter_;
// source_ holds the source string to be translated. segments_ holds the
// sentences generated from source_ as vector<Words>. sourceTokenRanges_ are
// string_views of the text corresponding to these words, pointing into
// source_. histories_ is a buffer which eventually stores the translation of
// each segment at the corresponding index.
std::string source_;
Segments segments_;
std::vector<TokenRanges> sourceTokenRanges_;
std::vector<Ptr<History>> histories_;
// The members above are moved into a newly constructed Response once all
// segments have been translated. The promise below is set to this Response
// value; the future to this promise is made available to the user through
// Service.
std::promise<Response> response_;
// Constructing Response requires the vocabs_ used to generate Request.
std::vector<Ptr<Vocab const>> *vocabs_;
};
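A minimal standalone sketch of the counter-and-promise pattern documented in the private section above. MiniRequest is a simplified stand-in, not the real class: each worker records its result and decrements the atomic counter, and whichever worker brings it to zero fulfils the promise.

#include <atomic>
#include <future>
#include <iostream>
#include <string>
#include <thread>
#include <vector>

// Simplified stand-in for Request: a barrier over N segments backed by a
// promise, fulfilled once every segment's translation has been recorded.
struct MiniRequest {
  explicit MiniRequest(size_t numSegments)
      : counter(static_cast<int>(numSegments)), translations(numSegments) {}

  void processHistory(size_t index, std::string translation) {
    translations[index] = std::move(translation);
    // fetch_sub returns the previous value; 1 means this was the last segment.
    if (counter.fetch_sub(1) == 1) {
      response.set_value(translations);  // cf. completeRequest()
    }
  }

  std::atomic<int> counter;
  std::vector<std::string> translations;  // stand-in for histories_
  std::promise<std::vector<std::string>> response;
};

int main() {
  MiniRequest request(3);
  auto future = request.response.get_future();

  // Three "workers" complete segments concurrently, in any order.
  std::vector<std::thread> workers;
  for (size_t i = 0; i < 3; ++i) {
    workers.emplace_back([&request, i] {
      request.processHistory(i, "translation " + std::to_string(i));
    });
  }
  for (auto &w : workers) w.join();

  for (const auto &t : future.get()) std::cout << t << "\n";
}

The real Request additionally moves source_, sourceTokenRanges_ and histories_ into a Response before setting the promise; the sketch keeps only the barrier logic.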
class RequestSentence {
-private:
-size_t index_;
-Ptr<Request> request_;
// A RequestSentence provides a view into a sentence within a Request. This
// class allows the sentences and associated information to be kept within
// Request.
public:
RequestSentence(size_t, Ptr<Request>);
size_t numTokens() const;
// lineNumber in Request, used for matching marian-decoder. SentenceTuple
// requires lineNumber to be set for Corpus based batches.
size_t lineNumber() const;
// Accessor to the segment represented by the RequestSentence.
Segment getUnderlyingSegment() const;
// Forwards call to Request, checking for completion.
void completeSentence(Ptr<History> history);
friend bool operator<(const RequestSentence &a, const RequestSentence &b);
private:
size_t index_;
Ptr<Request> request_;
};
typedef std::vector<RequestSentence> RequestSentences;
@@ -98,47 +125,48 @@ typedef std::vector<RequestSentence> RequestSentences;
class Batch {
public:
Batch() { reset(); }
-void reset() {
-Id_ = 0;
-sentences_.clear();
-}
-// Convenience function to determine poison.
-bool isPoison() { return (Id_ == -1); }
// Reset is required to reuse the same batch by consumer.
void reset();
// Methods to construct and determine poison.
static Batch poison() {
Batch poison_;
poison_.Id_ = -1;
return poison_;
}
bool isPoison() const { return (Id_ == -1); }
-void log() {
-int numTokens{0}, maxLength{0};
-for (auto &sentence : sentences_) {
-numTokens += sentence.numTokens();
-maxLength = std::max(maxLength, static_cast<int>(sentence.numTokens()));
-}
+size_t size() const { return sentences_.size(); }
-LOG(info, "Batch(Id_={}, tokens={}, max-length={}, sentences_={})", Id_,
-numTokens, maxLength, sentences_.size());
-}
// Accessors to load data into a batch. Use add(...) to add sentences into a
// batch. Once a legal batch is complete, use setId to set Id_ accordingly.
// setId only allows setting Id > 0. For use in Batcher, which acts as a
// producer for a PCQueue holding "Batch"es.
//
// Id_ =
// -1 : Batch::Poison
// 0 : Empty Batch
// >0 : Legal batch containing sentences
-void add(const RequestSentence &sentence) { sentences_.push_back(sentence); }
-size_t size() { return sentences_.size(); }
-void setId(int Id) {
-assert(Id > 0);
-Id_ = Id;
-if (Id % 500 == 0) {
-log();
-}
-}
void add(const RequestSentence &sentence);
void setId(int Id);
// Accessors to read from a Batch. For use in BatchTranslator (consumer on a
// PCQueue holding batches).
//
// sentences() is used to access the sentences to construct the marian
// internal batch.
const RequestSentences &sentences() { return sentences_; }
-void completeBatch(const Histories &histories) {
-for (int i = 0; i < sentences_.size(); i++) {
-sentences_[i].completeSentence(histories[i]);
-}
-}
// On obtaining Histories after translating a batch, completeBatch can be
// called with the Histories, which forwards the call to Request through
// RequestSentence and triggers completion by setting the promised value to
// the future given to the client.
void completeBatch(const Histories &histories);
// Convenience function to log batch statistics (numTokens, max-length).
// TODO(jerinphilip): Use to log and report packing efficiency.
void log();
private:
int Id_;
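A minimal standalone sketch of the poison-pill protocol implied by Batch::poison(), isPoison() and the Id_ convention above. MiniBatch and MiniQueue are simplified stand-ins (the project's PCQueue is not part of this diff): the producer plays the Batcher role and pushes numbered batches followed by a poison batch, and the consumer plays the BatchTranslator role and drains the queue until it sees poison.

#include <condition_variable>
#include <iostream>
#include <mutex>
#include <queue>
#include <thread>
#include <vector>

// Stand-in for Batch: id == -1 is poison, 0 is empty, > 0 is a legal batch.
struct MiniBatch {
  int id = 0;
  std::vector<int> sentences;  // stand-in for RequestSentences
  static MiniBatch poison() { return MiniBatch{-1, {}}; }
  bool isPoison() const { return id == -1; }
};

// Tiny producer/consumer queue; the project's PCQueue will differ in detail.
class MiniQueue {
public:
  void push(MiniBatch batch) {
    {
      std::lock_guard<std::mutex> lock(mutex_);
      queue_.push(std::move(batch));
    }
    ready_.notify_one();
  }
  MiniBatch pop() {
    std::unique_lock<std::mutex> lock(mutex_);
    ready_.wait(lock, [this] { return !queue_.empty(); });
    MiniBatch batch = std::move(queue_.front());
    queue_.pop();
    return batch;
  }
private:
  std::mutex mutex_;
  std::condition_variable ready_;
  std::queue<MiniBatch> queue_;
};

int main() {
  MiniQueue pcqueue;

  // Consumer: the BatchTranslator role; exits on poison.
  std::thread consumer([&pcqueue] {
    while (true) {
      MiniBatch batch = pcqueue.pop();
      if (batch.isPoison()) break;
      std::cout << "translating batch " << batch.id << " with "
                << batch.sentences.size() << " sentences\n";
    }
  });

  // Producer: the Batcher role; legal batches get Id > 0.
  for (int id = 1; id <= 3; ++id) pcqueue.push(MiniBatch{id, {0, 1, 2}});
  pcqueue.push(MiniBatch::poison());

  consumer.join();
}

In the real pipeline the consumer would, after translating, call completeBatch(histories) so that each RequestSentence forwards its History back to its Request.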

View File

@@ -56,8 +56,8 @@ std::future<Response> Service::translate(std::string &&input) {
// returns future corresponding to the promise.
Segments segments;
-std::vector<TokenRanges> sourceAlignments;
-text_processor_.process(input, segments, sourceAlignments);
+std::vector<TokenRanges> sourceTokenRanges;
+text_processor_.process(input, segments, sourceTokenRanges);
std::promise<Response> responsePromise;
auto future = responsePromise.get_future();
@@ -65,7 +65,7 @@ std::future<Response> Service::translate(std::string &&input) {
Ptr<Request> request =
New<Request>(requestId_++, /* lineNumberBegin = */ 0, vocabs_,
std::move(input), std::move(segments),
-std::move(sourceAlignments), std::move(responsePromise));
+std::move(sourceTokenRanges), std::move(responsePromise));
batcher_.addWholeRequest(request);
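The promise created here surfaces to the caller as the std::future<Response> returned by Service::translate. A hypothetical client-side sketch follows; only translate()'s signature is taken from this commit, while the includes, Service construction and Response's accessors are assumptions and are therefore omitted or left unused.

// Hypothetical client code; requires the bergamot headers, which are not
// named in this diff, so the include is left out on purpose.
#include <future>
#include <string>
#include <utility>

using marian::bergamot::Response;
using marian::bergamot::Service;

void translateOne(Service &service, std::string input) {
  // translate() moves the input into a Request and returns immediately; the
  // future is fulfilled once every segment of the request is translated.
  std::future<Response> responseFuture = service.translate(std::move(input));

  responseFuture.wait();  // returns after completeRequest() sets the promise
  Response response = responseFuture.get();
  (void)response;  // downstream use of Response is not part of this diff
}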