Renaming variables; Enhancing documentation

Jerin Philip 2021-02-15 20:21:10 +00:00
parent ca6ca154b9
commit d5a5e75451
3 changed files with 130 additions and 63 deletions

View File

@@ -10,14 +10,15 @@
namespace marian {
namespace bergamot {
// -----------------------------------------------------------------
Request::Request(unsigned int Id, int lineNumberBegin,
std::vector<Ptr<Vocab const>> &vocabs, std::string &&source,
Segments &&segments,
-std::vector<TokenRanges> &&sourceAlignments,
+std::vector<TokenRanges> &&sourceTokenRanges,
std::promise<Response> responsePromise)
: Id_(Id), lineNumberBegin_(lineNumberBegin), vocabs_(&vocabs),
source_(std::move(source)), segments_(std::move(segments)),
-sourceAlignments_(std::move(sourceAlignments)),
+sourceTokenRanges_(std::move(sourceTokenRanges)),
response_(std::move(responsePromise)) {
counter_ = segments_.size();
@@ -48,7 +49,7 @@ void Request::processHistory(size_t index, Ptr<History> history) {
void Request::completeRequest() {
// Request no longer needs to hold the content and can transfer it to
// Response.
-Response response(std::move(source_), std::move(sourceAlignments_),
+Response response(std::move(source_), std::move(sourceTokenRanges_),
std::move(histories_), *vocabs_);
response_.set_value(std::move(response));
}
@@ -58,6 +59,8 @@ bool Request::operator<(const Request &b) const {
return Id_ < b.Id_;
}
// ------------------------------------------------------------------
RequestSentence::RequestSentence(size_t index, Ptr<Request> request)
: index_(index), request_(request) {}
@@ -87,5 +90,41 @@ bool operator<(const RequestSentence &a, const RequestSentence &b) {
return a.request_ < b.request_;
}
// ----------------------------------------------------------------------
void Batch::reset() {
Id_ = 0;
sentences_.clear();
}
void Batch::log() {
int numTokens{0}, maxLength{0};
for (auto &sentence : sentences_) {
numTokens += sentence.numTokens();
maxLength = std::max(maxLength, static_cast<int>(sentence.numTokens()));
}
LOG(info, "Batch(Id_={}, tokens={}, max-length={}, sentences_={})", Id_,
numTokens, maxLength, sentences_.size());
}
void Batch::add(const RequestSentence &sentence) {
sentences_.push_back(sentence);
}
void Batch::setId(int Id) {
assert(Id > 0);
Id_ = Id;
if (Id % 500 == 0) {
log();
}
}
void Batch::completeBatch(const Histories &histories) {
for (int i = 0; i < sentences_.size(); i++) {
sentences_[i].completeSentence(histories[i]);
}
}
} // namespace bergamot
} // namespace marian

View File

@@ -3,20 +3,19 @@
//
// Request: holds the input blob of text, Segments (vector<Words>) which are
// to go to the batching mechanism and alignments between the processed
-// segments and the input blob (sourceAlignments). In addition, Request takes
+// segments and the input blob (sourceTokenRanges). In addition, Request takes
// care of the barrier which fires when all the Segments in a request are done
-// translating by the workers (BatchTranslator). Request is to be extended with
-// notions of Priority (sequence, user-given).
+// translating by the workers (BatchTranslator).
+// TODO(jerinphilip): Extend Request with notions of Priority (sequence,
+// user-given).
//
-// RequestSentence: is a tuple of (index, Request*). This provides the
+// RequestSentence: is a tuple of (index, Ptr<Request>). This provides the
// batching mechanism access to the segment within the request. The backref to
// Request allows triggering the barrier upon completion of the last
// sentence by a worker.
//
-// PCItem: is a vector of RequestSentences and a batchNumber, which is what the
-// PCQueue holds. The batches are constructed from segments returned by a
-// RequestSentence. Can be enhanced with paddingSize, countTokens eventually for
-// logging.
+// Batch: is a vector of RequestSentences tagged with a batchNumber, which is
+// what the PCQueue holds. Batch is "produced" by the Batcher.
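To make the relationships described above concrete, here is a minimal standalone sketch. The Mini* names and the Word/Segment aliases are simplified stand-ins invented for illustration, not the actual bergamot types: a request owns the source text and its segments, a request-sentence is an (index, pointer-to-request) view into one segment, and a batch is a numbered vector of such views.

#include <iostream>
#include <memory>
#include <string>
#include <vector>

// Simplified stand-ins: a Word is a token id, a Segment is one sentence.
using Word = unsigned int;
using Segment = std::vector<Word>;
using Segments = std::vector<Segment>;

// Owns the input blob and the segments cut out of it (cf. Request).
struct MiniRequest {
  std::string source;
  Segments segments;
};

// (index, pointer-to-request): a view into one segment (cf. RequestSentence).
struct MiniRequestSentence {
  size_t index;
  std::shared_ptr<MiniRequest> request;
  const Segment &segment() const { return request->segments[index]; }
};

// A numbered vector of views; what the PCQueue would carry (cf. Batch).
struct MiniBatch {
  int id = 0;
  std::vector<MiniRequestSentence> sentences;
};

int main() {
  auto request = std::make_shared<MiniRequest>(
      MiniRequest{"Hello world. How are you?", {{11, 12}, {13, 14, 15}}});

  MiniBatch batch;
  batch.id = 1;
  for (size_t i = 0; i < request->segments.size(); ++i)
    batch.sentences.push_back({i, request});

  std::cout << "batch " << batch.id << " carries " << batch.sentences.size()
            << " sentences; sentence 1 has "
            << batch.sentences[1].segment().size() << " tokens\n";
}

Keeping the data inside the request and handing out only (index, pointer) views is what lets the batching mechanism regroup sentences freely without copying them.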
#ifndef SRC_BERGAMOT_REQUEST_H_
#define SRC_BERGAMOT_REQUEST_H_
@@ -37,23 +36,10 @@ namespace marian {
namespace bergamot {
class Request {
-private:
-unsigned int Id_;
-int lineNumberBegin_;
-std::string source_;
-std::atomic<int> counter_;
-std::vector<Ptr<Vocab const>> *vocabs_;
-Segments segments_;
-std::vector<TokenRanges> sourceAlignments_;
-std::vector<Ptr<History>> histories_;
-std::promise<Response> response_;
public:
Request(unsigned int Id, int lineNumberBegin,
std::vector<Ptr<Vocab const>> &vocabs_, std::string &&source,
-Segments &&segments, std::vector<TokenRanges> &&sourceAlignments,
+Segments &&segments, std::vector<TokenRanges> &&sourceTokenRanges,
std::promise<Response> responsePromise);
// Obtain the count of tokens in the segment corresponding to index. Used to
@@ -68,7 +54,8 @@ public:
// several requests.
Segment getSegment(size_t index) const;
-// For notions of priority among requests (used to enable <set> in Batcher).
+// For notions of priority among requests, used to enable std::set in
+// Batcher.
bool operator<(const Request &request) const;
// Processes a history obtained after translating in a heterogeneous batch
@@ -77,20 +64,60 @@ public:
// On completion of last segment, sets value of the promise.
void completeRequest();
private:
unsigned int Id_;
int lineNumberBegin_;
// Multiple translation-workers can concurrently access the same Request. The
// following counter is therefore atomic; it holds the number of sentences
// remaining to be translated.
std::atomic<int> counter_;
// source_ holds the source string to be translated. segments_ holds the
// sentences generated from source_ as vector<Words>. sourceTokenRanges_ are
// string_views of the text corresponding to these words, pointing into
// source_. histories_ is a buffer which eventually stores the translation of
// each segment at the corresponding index.
std::string source_;
Segments segments_;
std::vector<TokenRanges> sourceTokenRanges_;
std::vector<Ptr<History>> histories_;
// The members above are moved into a newly constructed Response once all
// segments have been translated. The promise below is set to this Response
// value; the future to this promise is made available to the user through
// Service.
std::promise<Response> response_;
// Constructing Response requires the vocabs_ used to generate Request.
std::vector<Ptr<Vocab const>> *vocabs_;
};
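A minimal standalone sketch of the counter-and-promise pattern documented in the private section above. MiniRequest is a simplified stand-in, not the real class: each worker records its result and decrements the atomic counter, and whichever worker brings it to zero fulfils the promise.

#include <atomic>
#include <future>
#include <iostream>
#include <string>
#include <thread>
#include <vector>

// Simplified stand-in for Request: a barrier over N segments backed by a
// promise, fulfilled once every segment's translation has been recorded.
struct MiniRequest {
  explicit MiniRequest(size_t numSegments)
      : counter(static_cast<int>(numSegments)), translations(numSegments) {}

  void processHistory(size_t index, std::string translation) {
    translations[index] = std::move(translation);
    // fetch_sub returns the previous value; 1 means this was the last segment.
    if (counter.fetch_sub(1) == 1) {
      response.set_value(translations);  // cf. completeRequest()
    }
  }

  std::atomic<int> counter;
  std::vector<std::string> translations;  // stand-in for histories_
  std::promise<std::vector<std::string>> response;
};

int main() {
  MiniRequest request(3);
  auto future = request.response.get_future();

  // Three "workers" complete segments concurrently, in any order.
  std::vector<std::thread> workers;
  for (size_t i = 0; i < 3; ++i) {
    workers.emplace_back([&request, i] {
      request.processHistory(i, "translation " + std::to_string(i));
    });
  }
  for (auto &w : workers) w.join();

  for (const auto &t : future.get()) std::cout << t << "\n";
}

The real Request additionally moves source_, sourceTokenRanges_ and histories_ into a Response before setting the promise; the sketch keeps only the barrier logic.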
class RequestSentence {
-private:
-size_t index_;
-Ptr<Request> request_;
// A RequestSentence provides a view into a sentence within a Request. This
// class allows the sentences and associated information to be kept within
// Request.
public:
RequestSentence(size_t, Ptr<Request>);
size_t numTokens() const;
// lineNumber in Request, used for matching marian-decoder. SentenceTuple
// requires lineNumber to be set for Corpus based batches.
size_t lineNumber() const;
// Accessor to the segment represented by the RequestSentence.
Segment getUnderlyingSegment() const;
// Forwards call to Request, checking for completion.
void completeSentence(Ptr<History> history);
friend bool operator<(const RequestSentence &a, const RequestSentence &b);
private:
size_t index_;
Ptr<Request> request_;
};
typedef std::vector<RequestSentence> RequestSentences;
@@ -98,47 +125,48 @@ typedef std::vector<RequestSentence> RequestSentences;
class Batch {
public:
Batch() { reset(); }
-void reset() {
-Id_ = 0;
-sentences_.clear();
-}
-// Convenience function to determine poison.
-bool isPoison() { return (Id_ == -1); }
// Reset is required to reuse the same batch by consumer.
void reset();
// Methods to construct and determine poison.
static Batch poison() {
Batch poison_;
poison_.Id_ = -1;
return poison_;
}
bool isPoison() const { return (Id_ == -1); }
-void log() {
-int numTokens{0}, maxLength{0};
-for (auto &sentence : sentences_) {
-numTokens += sentence.numTokens();
-maxLength = std::max(maxLength, static_cast<int>(sentence.numTokens()));
-}
+size_t size() const { return sentences_.size(); }
-LOG(info, "Batch(Id_={}, tokens={}, max-length={}, sentences_={})", Id_,
-numTokens, maxLength, sentences_.size());
-}
// Accessors to load data into a batch. Use add(...) to add sentences into a
// batch. Once a legal batch is complete, use setId to set Id_ accordingly.
// setId only allows setting Id > 0. For use in Batcher, which acts as a
// producer for a PCQueue holding "Batch"es.
//
// Id_ =
// -1 : Batch::Poison
// 0 : Empty Batch
// >0 : Legal batch containing sentences
-void add(const RequestSentence &sentence) { sentences_.push_back(sentence); }
-size_t size() { return sentences_.size(); }
-void setId(int Id) {
-assert(Id > 0);
-Id_ = Id;
-if (Id % 500 == 0) {
-log();
-}
-}
void add(const RequestSentence &sentence);
void setId(int Id);
// Accessors to read from a Batch. For use in BatchTranslator (consumer on a
// PCQueue holding batches).
//
// sentences() is used to access the sentences to construct the marian
// internal batch.
const RequestSentences &sentences() { return sentences_; }
-void completeBatch(const Histories &histories) {
-for (int i = 0; i < sentences_.size(); i++) {
-sentences_[i].completeSentence(histories[i]);
-}
-}
// On obtaining Histories after translating a batch, completeBatch can be
// called with the Histories, which forwards the call to Request through
// RequestSentence and triggers completion by setting the promised value to
// the future given to the client.
void completeBatch(const Histories &histories);
// Convenience function to log batch statistics (numTokens, max-length).
// TODO(jerinphilip): Use to log and report packing efficiency.
void log();
private:
int Id_;
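A minimal standalone sketch of the poison-pill protocol implied by Batch::poison(), isPoison() and the Id_ convention above. MiniBatch and MiniQueue are simplified stand-ins (the project's PCQueue is not part of this diff): the producer plays the Batcher role and pushes numbered batches followed by a poison batch, and the consumer plays the BatchTranslator role and drains the queue until it sees poison.

#include <condition_variable>
#include <iostream>
#include <mutex>
#include <queue>
#include <thread>
#include <vector>

// Stand-in for Batch: id == -1 is poison, 0 is empty, > 0 is a legal batch.
struct MiniBatch {
  int id = 0;
  std::vector<int> sentences;  // stand-in for RequestSentences
  static MiniBatch poison() { return MiniBatch{-1, {}}; }
  bool isPoison() const { return id == -1; }
};

// Tiny producer/consumer queue; the project's PCQueue will differ in detail.
class MiniQueue {
public:
  void push(MiniBatch batch) {
    {
      std::lock_guard<std::mutex> lock(mutex_);
      queue_.push(std::move(batch));
    }
    ready_.notify_one();
  }
  MiniBatch pop() {
    std::unique_lock<std::mutex> lock(mutex_);
    ready_.wait(lock, [this] { return !queue_.empty(); });
    MiniBatch batch = std::move(queue_.front());
    queue_.pop();
    return batch;
  }
private:
  std::mutex mutex_;
  std::condition_variable ready_;
  std::queue<MiniBatch> queue_;
};

int main() {
  MiniQueue pcqueue;

  // Consumer: the BatchTranslator role; exits on poison.
  std::thread consumer([&pcqueue] {
    while (true) {
      MiniBatch batch = pcqueue.pop();
      if (batch.isPoison()) break;
      std::cout << "translating batch " << batch.id << " with "
                << batch.sentences.size() << " sentences\n";
    }
  });

  // Producer: the Batcher role; legal batches get Id > 0.
  for (int id = 1; id <= 3; ++id) pcqueue.push(MiniBatch{id, {0, 1, 2}});
  pcqueue.push(MiniBatch::poison());

  consumer.join();
}

In the real pipeline the consumer would, after translating, call completeBatch(histories) so that each RequestSentence forwards its History back to its Request.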

View File

@@ -56,8 +56,8 @@ std::future<Response> Service::translate(std::string &&input) {
// returns future corresponding to the promise.
Segments segments;
-std::vector<TokenRanges> sourceAlignments;
-text_processor_.process(input, segments, sourceAlignments);
+std::vector<TokenRanges> sourceTokenRanges;
+text_processor_.process(input, segments, sourceTokenRanges);
std::promise<Response> responsePromise;
auto future = responsePromise.get_future();
@@ -65,7 +65,7 @@ std::future<Response> Service::translate(std::string &&input) {
Ptr<Request> request =
New<Request>(requestId_++, /* lineNumberBegin = */ 0, vocabs_,
std::move(input), std::move(segments),
-std::move(sourceAlignments), std::move(responsePromise));
+std::move(sourceTokenRanges), std::move(responsePromise));
batcher_.addWholeRequest(request);
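The promise created here surfaces to the caller as the std::future<Response> returned by Service::translate. A hypothetical client-side sketch follows; only translate()'s signature is taken from this commit, while the includes, Service construction and Response's accessors are assumptions and are therefore omitted or left unused.

// Hypothetical client code; requires the bergamot headers, which are not
// named in this diff, so the include is left out on purpose.
#include <future>
#include <string>
#include <utility>

using marian::bergamot::Response;
using marian::bergamot::Service;

void translateOne(Service &service, std::string input) {
  // translate() moves the input into a Request and returns immediately; the
  // future is fulfilled once every segment of the request is translated.
  std::future<Response> responseFuture = service.translate(std::move(input));

  responseFuture.wait();  // returns after completeRequest() sets the promise
  Response response = responseFuture.get();
  (void)response;  // downstream use of Response is not part of this diff
}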