Merged PR 11188: Handle empty inputs with batch purging

The previous mechanism to remove empty inputs does not play well with batch purging (removal of finished sentences). Now we reuse the batch purging mechanism to get rid of empty inputs: for each empty source batch entry we force EOS on all beam entries of the corresponding batch entry. The purging then takes care of the rest. We set the probability to log(1) = 0.
This commit is contained in:
Martin Junczys-Dowmunt 2020-01-17 21:52:33 +00:00
parent b822cd4d12
commit b3a23108b4
4 changed files with 27 additions and 26 deletions

View File

@ -34,6 +34,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
- Gradient-checkpointing
### Fixed
- Fix empty source batch entries with batch purging
- Clear RNN cache in transformer model, add correct hash functions to nodes
- Gather-operation for all index sizes
- Fix word weighting with max length cropping

View File

@ -1 +1 @@
v1.8.39
v1.8.40

@ -1 +1 @@
Subproject commit c09a206a28ab3e3196b3cc123b6ec6a181e50c2c
Subproject commit 6a08849b23f6c14eefbe12f4eb73dc638b962587

View File

@ -39,6 +39,7 @@ public:
const std::vector<Ptr<ScorerState /*const*/>>& states,
Ptr<data::CorpusBatch /*const*/> batch, // for alignments only
Ptr<FactoredVocab/*const*/> factoredVocab, size_t factorGroup,
const std::vector<bool>& dropBatchEntries, // [origDimBatch] - empty source batch entries are marked with true, should be cleared after first use.
const std::vector<IndexType>& batchIdxMap) const { // [origBatchIdx -> currentBatchIdx]
std::vector<float> align; // collects alignment information from the last executed time step
if(options_->hasAndNotEmpty("alignment") && factorGroup == 0)
@ -49,9 +50,10 @@ public:
// create a reverse batchMap to obtain original batchIdx in the starting batch size
// and calculate the current batch size based on non-empty beams
std::vector<IndexType> reverseBatchIdxMap(batchIdxMap.size());
std::vector<IndexType> reverseBatchIdxMap; // empty if not purging batch entries
size_t currentDimBatch = beams.size();
if(PURGE_BATCH) {
reverseBatchIdxMap.resize(batchIdxMap.size()); // adjust size if doing batch purging.
currentDimBatch = 0;
for(int i = 0; i < batchIdxMap.size(); ++i) {
reverseBatchIdxMap[batchIdxMap[i]] = i; // reverse batch index mapping, multiple occurrences get overwritten with the last one,
@ -66,16 +68,22 @@ public:
// They can be between 0 and (vocabSize * nBestBeamSize * batchSize)-1.
// (beamHypIdx refers to the GPU tensors, *not* the beams[] array; they are not the same in case of purging)
const auto key = nBestKeys[i];
const float pathScore = nBestPathScores[i]; // expanded path score for (batchIdx, beamHypIdx, word)
// decompose key into individual indices (batchIdx, beamHypIdx, wordIdx)
const auto wordIdx = (WordIndex)(key % vocabSize);
const auto beamHypIdx = (key / vocabSize) % nBestBeamSize;
const auto currentBatchIdx = (key / vocabSize) / nBestBeamSize;
const auto beamHypIdx = (key / vocabSize) % nBestBeamSize;
const auto currentBatchIdx = (key / vocabSize) / nBestBeamSize;
const auto origBatchIdx = reverseBatchIdxMap.empty() ? currentBatchIdx : reverseBatchIdxMap[currentBatchIdx]; // map currentBatchIdx back into original position within starting maximal batch size, required to find correct beam
auto origBatchIdx = currentBatchIdx;
if(PURGE_BATCH)
origBatchIdx = reverseBatchIdxMap[currentBatchIdx]; // map currentBatchIdx back into original position within starting maximal batch size, required to find correct beam
bool dropHyp = !dropBatchEntries.empty() && dropBatchEntries[origBatchIdx];
// if we force-drop the hypothesis, assign EOS, otherwise the expected word id.
const auto wordIdx = dropHyp ? trgVocab_->getEosId().toWordIndex() : (WordIndex)(key % vocabSize);
// @TODO: We currently assign a log probability of 0 to all beam entries of the dropped batch entry, instead it might be a good idea to use
// the per Hyp pathScore without the current expansion (a bit hard to obtain).
// For the case where we drop empty inputs, 0 is fine. For other use cases like a forced stop, the penultimate pathScore might be better.
// For the empty hyp this would naturally result in 0, too.
const float pathScore = dropHyp ? 0.f : nBestPathScores[i]; // 0 (Prob = 1, maximum score) if dropped or expanded path score for (batchIdx, beamHypIdx, word)
const auto& beam = beams[origBatchIdx];
auto& newBeam = newBeams[origBatchIdx]; // extended hypotheses are going to be placed in this new beam
@ -85,7 +93,7 @@ public:
if (pathScore <= INVALID_PATH_SCORE) // (dummy slot or word that cannot be expanded by current factor)
continue;
ABORT_IF(beamHypIdx >= beam.size(), "Out of bounds beamHypIdx??");
ABORT_IF(beamHypIdx >= beam.size(), "Out of bounds beamHypIdx??"); // effectively this is equivalent to ABORT_IF(beams[origBatchIdx].empty(), ...)
// map wordIdx to word
auto prevBeamHypIdx = beamHypIdx; // back pointer
@ -287,31 +295,22 @@ public:
states.push_back(scorer->startState(graph, batch));
}
const auto srcEosId = batch->front()->vocab()->getEosId();
// create one beam per batch entry with sentence-start hypothesis
Beams beams(origDimBatch, Beam(beamSize_, Hypothesis::New())); // array [origDimBatch] of array [maxBeamSize] of Hypothesis, keeps full size through search.
// batch purging is determined from an empty sub-beam.
std::vector<IndexType> batchIdxMap(origDimBatch); // Record at which batch entry a beam is looking.
// By default that corresponds to position in array,
// but shifts in the course of removing batch entries when they are finished.
const std::vector<bool> emptyBatchEntries; // used for recording if there are empty input batch entries
for(int origBatchIdx = 0; origBatchIdx < origDimBatch; ++origBatchIdx) {
batchIdxMap[origBatchIdx] = origBatchIdx; // map to same position on initialization
auto& beam = beams[origBatchIdx];
histories[origBatchIdx]->add(beam, trgEosId); // add beams with start-hypotheses to traceback grid
// Handle batch entries that consist only of source <EOS> i.e. these are empty lines
if(batch->front()->data()[origBatchIdx] == srcEosId) {
// create a target <EOS> hypothesis that extends the start-hypothesis
auto eosHyp = Hypothesis::New(/*prevHyp=*/ beam[0],
/*currWord=*/ trgEosId,
/*prevHypIdx=*/ 0,
/*pathScore=*/ 0.f);
auto eosBeam = Beam(beamSize_, eosHyp); // create a dummy beam filled with <EOS>-hyps
histories[origBatchIdx]->add(eosBeam, trgEosId); // push dummy <EOS>-beam to traceback grid
beam.clear(); // Zero out current beam, so it does not get used for further symbols as empty beams get omitted everywhere.
// The corresponding neural states will be purged further down.
}
// Mark batch entries that consist only of source <EOS> i.e. these are empty lines. They will be forced to EOS and purged from batch
const auto& srcEosId = batch->front()->vocab()->getEosId();
const_cast<std::vector<bool>&>(emptyBatchEntries).push_back(batch->front()->data()[origBatchIdx] == srcEosId); // const_cast during construction
}
// determine index of UNK in the log prob vectors if we want to suppress it in the decoding process
@ -496,6 +495,7 @@ public:
states, // used for keeping track of per-ensemble-member path score
batch, // only used for propagating alignment info
factoredVocab, factorGroup,
emptyBatchEntries, // [origDimBatch] - empty source batch entries are marked with true
batchIdxMap); // used to create a reverse batch index map to recover original batch indices for this step
} // END FOR factorGroup = 0 .. numFactorGroups-1