Merged PR 19860: Case augmented data, if not using factored vocab must not set guided alignments

This change allows marking SentenceTuples as 'altered', if they were generated or modified by data augmentation internally in such a way so as to impact processing. In particular, for such sentence tuples, we do not want to try setting guided alignments if the externally provided guided alignments might no longer be correct after that alteration.
This commit is contained in:
Rohit Jain 2021-07-17 23:03:16 +00:00 committed by Martin Junczys-Dowmunt
parent 8e88071ae8
commit 056c4bef5b
5 changed files with 37 additions and 7 deletions

View File

@ -32,6 +32,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
- Integrate a shortlist converter (which can convert a text lexical shortlist to a binary shortlist) into marian-conv with --shortlist option
### Fixed
- Do not set guided alignments for case augmented data if vocab is not factored
- Various fixes to enable LSH in Quicksand
- Added support to MPIWrappest::bcast (and similar) for count of type size_t
- Adding new validation metrics when training is restarted and --reset-valid-stalled is used

View File

@ -7,6 +7,7 @@
#include "common/filesystem.h"
#include "data/corpus.h"
#include "data/factored_vocab.h"
namespace marian {
namespace data {
@ -26,13 +27,16 @@ Corpus::Corpus(std::vector<std::string> paths,
allCapsEvery_(options_->get<size_t>("all-caps-every", 0)),
titleCaseEvery_(options_->get<size_t>("english-title-case-every", 0)) {}
void Corpus::preprocessLine(std::string& line, size_t streamId) {
void Corpus::preprocessLine(std::string& line, size_t streamId, bool& altered) {
bool isFactoredVocab = vocabs_.back()->tryAs<FactoredVocab>() != nullptr;
altered = false;
if (allCapsEvery_ != 0 && pos_ % allCapsEvery_ == 0 && !inference_) {
line = vocabs_[streamId]->toUpper(line);
if (streamId == 0)
LOG_ONCE(info, "[data] Source all-caps'ed line to: {}", line);
else
LOG_ONCE(info, "[data] Target all-caps'ed line to: {}", line);
altered = isFactoredVocab ? false : true; // FS vocab does not really "alter" the token lemma for all caps
}
else if (titleCaseEvery_ != 0 && pos_ % titleCaseEvery_ == 1 && !inference_ && streamId == 0) {
// Only applied to stream 0 (source) since this feature is aimed at robustness against
@ -43,6 +47,7 @@ void Corpus::preprocessLine(std::string& line, size_t streamId) {
LOG_ONCE(info, "[data] Source English-title-case'd line to: {}", line);
else
LOG_ONCE(info, "[data] Target English-title-case'd line to: {}", line);
altered = isFactoredVocab ? false : true; // FS vocab does not really "alter" the token lemma for title casing
}
}
@ -103,7 +108,10 @@ SentenceTuple Corpus::next() {
++shift;
} else {
size_t vocabId = j - shift;
preprocessLine(fields[j], vocabId);
bool altered;
preprocessLine(fields[j], vocabId, /*out=*/altered);
if (altered)
tup.markAltered();
addWordsToSentenceTuple(fields[j], vocabId, tup);
}
}
@ -116,7 +124,10 @@ SentenceTuple Corpus::next() {
addWeightsToSentenceTuple(fields[weightFileIdx_], tup);
} else {
preprocessLine(line, i);
bool altered;
preprocessLine(line, i, /*out=*/altered);
if (altered)
tup.markAltered();
addWordsToSentenceTuple(line, i, tup);
}
}

View File

@ -30,7 +30,7 @@ private:
// for pre-processing
size_t allCapsEvery_{0}; // if set, convert every N-th input sentence (after randomization) to all-caps (source and target)
size_t titleCaseEvery_{0}; // ditto for title case (source only)
void preprocessLine(std::string& line, size_t streamId);
void preprocessLine(std::string& line, size_t streamId, bool& altered); // altered => whether the segmentation was altered in marian
public:
// @TODO: check if translate can be replaced by an option in options

View File

@ -447,9 +447,15 @@ void CorpusBase::addAlignmentsToBatch(Ptr<CorpusBatch> batch,
std::vector<float> aligns(srcWords * dimBatch * trgWords, 0.f);
for(int b = 0; b < dimBatch; ++b) {
for(auto p : batchVector[b].getAlignment()) {
size_t idx = p.srcPos * dimBatch * trgWords + b * trgWords + p.tgtPos;
aligns[idx] = 1.f;
// If the batch vector is altered within marian by, for example, case augmentation,
// the guided alignments we received for this tuple cease to be valid.
// Hence skip setting alignments for that sentence tuple..
if (!batchVector[b].isAltered()) {
for(auto p : batchVector[b].getAlignment()) {
size_t idx = p.srcPos * dimBatch * trgWords + b * trgWords + p.tgtPos;
aligns[idx] = 1.f;
}
}
}
batch->setGuidedAlignment(std::move(aligns));

View File

@ -28,6 +28,7 @@ private:
std::vector<Words> tuple_; // [stream index][step index]
std::vector<float> weights_; // [stream index]
WordAlignment alignment_;
bool altered_ = false;
public:
typedef Words value_type;
@ -44,6 +45,17 @@ public:
*/
size_t getId() const { return id_; }
/**
* @brief Returns whether this Tuple was altered or augmented from what
* was provided to Marian in input.
*/
bool isAltered() const { return altered_; }
/**
* @brief Mark that this Tuple was internally altered or augmented by Marian
*/
void markAltered() { altered_ = true; }
/**
* @brief Adds a new sentence at the end of the tuple.
*