Merged PR 11188: Handle empty inputs with batch purging

The previous mechanism to remove empty inputs does not play well with batch purging (removal of finished sentences). Now we reuse the batch purging mechanism to get rid of empty inputs: for each empty source batch entry we force EOS on all beam entries of the corresponding batch entry. The purging then takes care of the rest. We set the probability to log(1) = 0.
This commit is contained in:
Martin Junczys-Dowmunt 2020-01-17 21:52:33 +00:00
parent b822cd4d12
commit b3a23108b4
4 changed files with 27 additions and 26 deletions

View File

@ -34,6 +34,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
- Gradient-checkpointing
### Fixed
- Fix empty source batch entries with batch purging
- Clear RNN cache in transformer model, add correct hash functions to nodes
- Gather-operation for all index sizes
- Fix word weighting with max length cropping

View File

@ -1 +1 @@
v1.8.39
v1.8.40

@ -1 +1 @@
Subproject commit c09a206a28ab3e3196b3cc123b6ec6a181e50c2c
Subproject commit 6a08849b23f6c14eefbe12f4eb73dc638b962587

View File

@ -39,6 +39,7 @@ public:
const std::vector<Ptr<ScorerState /*const*/>>& states,
Ptr<data::CorpusBatch /*const*/> batch, // for alignments only
Ptr<FactoredVocab/*const*/> factoredVocab, size_t factorGroup,
const std::vector<bool>& dropBatchEntries, // [origDimBatch] - empty source batch entries are marked with true, should be cleared after first use.
const std::vector<IndexType>& batchIdxMap) const { // [origBatchIdx -> currentBatchIdx]
std::vector<float> align; // collects alignment information from the last executed time step
if(options_->hasAndNotEmpty("alignment") && factorGroup == 0)
@ -49,9 +50,10 @@ public:
// create a reverse batchMap to obtain original batchIdx in the starting batch size
// and calculate the current batch size based on non-empty beams
std::vector<IndexType> reverseBatchIdxMap(batchIdxMap.size());
std::vector<IndexType> reverseBatchIdxMap; // empty if not purging batch entries
size_t currentDimBatch = beams.size();
if(PURGE_BATCH) {
reverseBatchIdxMap.resize(batchIdxMap.size()); // adjust size if doing batch purging.
currentDimBatch = 0;
for(int i = 0; i < batchIdxMap.size(); ++i) {
reverseBatchIdxMap[batchIdxMap[i]] = i; // reverse batch index mapping, multiple occurrences get overwritten with the last one,
@ -66,16 +68,22 @@ public:
// They can be between 0 and (vocabSize * nBestBeamSize * batchSize)-1.
// (beamHypIdx refers to the GPU tensors, *not* the beams[] array; they are not the same in case of purging)
const auto key = nBestKeys[i];
const float pathScore = nBestPathScores[i]; // expanded path score for (batchIdx, beamHypIdx, word)
// decompose key into individual indices (batchIdx, beamHypIdx, wordIdx)
const auto wordIdx = (WordIndex)(key % vocabSize);
const auto beamHypIdx = (key / vocabSize) % nBestBeamSize;
const auto currentBatchIdx = (key / vocabSize) / nBestBeamSize;
const auto beamHypIdx = (key / vocabSize) % nBestBeamSize;
const auto currentBatchIdx = (key / vocabSize) / nBestBeamSize;
const auto origBatchIdx = reverseBatchIdxMap.empty() ? currentBatchIdx : reverseBatchIdxMap[currentBatchIdx]; // map currentBatchIdx back into original position within starting maximal batch size, required to find correct beam
auto origBatchIdx = currentBatchIdx;
if(PURGE_BATCH)
origBatchIdx = reverseBatchIdxMap[currentBatchIdx]; // map currentBatchIdx back into original position within starting maximal batch size, required to find correct beam
bool dropHyp = !dropBatchEntries.empty() && dropBatchEntries[origBatchIdx];
// if we force-drop the hypothesis, assign EOS, otherwise the expected word id.
const auto wordIdx = dropHyp ? trgVocab_->getEosId().toWordIndex() : (WordIndex)(key % vocabSize);
// @TODO: We currently assign a log probability of 0 to all beam entries of the dropped batch entry, instead it might be a good idea to use
// the per Hyp pathScore without the current expansion (a bit hard to obtain).
// For the case where we drop empty inputs, 0 is fine. For other use cases like a forced stop, the penultimate pathScore might be better.
// For the empty hyp this would naturally result in 0, too.
const float pathScore = dropHyp ? 0.f : nBestPathScores[i]; // 0 (Prob = 1, maximum score) if dropped or expanded path score for (batchIdx, beamHypIdx, word)
const auto& beam = beams[origBatchIdx];
auto& newBeam = newBeams[origBatchIdx]; // extended hypotheses are going to be placed in this new beam
@ -85,7 +93,7 @@ public:
if (pathScore <= INVALID_PATH_SCORE) // (dummy slot or word that cannot be expanded by current factor)
continue;
ABORT_IF(beamHypIdx >= beam.size(), "Out of bounds beamHypIdx??");
ABORT_IF(beamHypIdx >= beam.size(), "Out of bounds beamHypIdx??"); // effectively this is equivalent to ABORT_IF(beams[origBatchIdx].empty(), ...)
// map wordIdx to word
auto prevBeamHypIdx = beamHypIdx; // back pointer
@ -287,31 +295,22 @@ public:
states.push_back(scorer->startState(graph, batch));
}
const auto srcEosId = batch->front()->vocab()->getEosId();
// create one beam per batch entry with sentence-start hypothesis
Beams beams(origDimBatch, Beam(beamSize_, Hypothesis::New())); // array [origDimBatch] of array [maxBeamSize] of Hypothesis, keeps full size through search.
// batch purging is determined from an empty sub-beam.
std::vector<IndexType> batchIdxMap(origDimBatch); // Record at which batch entry a beam is looking.
// By default that corresponds to position in array,
// but shifts in the course of removing batch entries when they are finished.
const std::vector<bool> emptyBatchEntries; // used for recording if there are empty input batch entries
for(int origBatchIdx = 0; origBatchIdx < origDimBatch; ++origBatchIdx) {
batchIdxMap[origBatchIdx] = origBatchIdx; // map to same position on initialization
auto& beam = beams[origBatchIdx];
histories[origBatchIdx]->add(beam, trgEosId); // add beams with start-hypotheses to traceback grid
// Handle batch entries that consist only of source <EOS> i.e. these are empty lines
if(batch->front()->data()[origBatchIdx] == srcEosId) {
// create a target <EOS> hypothesis that extends the start-hypothesis
auto eosHyp = Hypothesis::New(/*prevHyp=*/ beam[0],
/*currWord=*/ trgEosId,
/*prevHypIdx=*/ 0,
/*pathScore=*/ 0.f);
auto eosBeam = Beam(beamSize_, eosHyp); // create a dummy beam filled with <EOS>-hyps
histories[origBatchIdx]->add(eosBeam, trgEosId); // push dummy <EOS>-beam to traceback grid
beam.clear(); // Zero out current beam, so it does not get used for further symbols as empty beams get omitted everywhere.
// The corresponding neural states will be purged further down.
}
// Mark batch entries that consist only of source <EOS> i.e. these are empty lines. They will be forced to EOS and purged from batch
const auto& srcEosId = batch->front()->vocab()->getEosId();
const_cast<std::vector<bool>&>(emptyBatchEntries).push_back(batch->front()->data()[origBatchIdx] == srcEosId); // const_cast during construction
}
// determine index of UNK in the log prob vectors if we want to suppress it in the decoding process
@ -496,6 +495,7 @@ public:
states, // used for keeping track of per-ensemble-member path score
batch, // only used for propagating alignment info
factoredVocab, factorGroup,
emptyBatchEntries, // [origDimBatch] - empty source batch entries are marked with true
batchIdxMap); // used to create a reverse batch index map to recover original batch indices for this step
} // END FOR factorGroup = 0 .. numFactorGroups-1