Merged PR 22939: Fix case augmentation with multi-threaded reading

This PR fixes case augmentation with multi-threaded reading. The fix is to stop reading iterator::pos_ during lazy processing and instead pass the current line id as an argument to the lazy function.
This commit is contained in:
Marcin Junczys-Dowmunt 2022-03-07 16:57:32 +00:00
parent adaaf087e4
commit 310d2f42f6
4 changed files with 7 additions and 6 deletions

View File

@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
### Added
### Fixed
- Fixed case augmentation with multi-threaded reading.
- Scripts using PyYAML now use `safe_load`; see https://msg.pyyaml.org/load
### Changed

View File

@ -1 +1 @@
v1.11.3
v1.11.4

View File

@ -39,10 +39,10 @@ Corpus::Corpus(std::vector<std::string> paths,
}
void Corpus::preprocessLine(std::string& line, size_t streamId, bool& altered) {
void Corpus::preprocessLine(std::string& line, size_t streamId, size_t lineId, bool& altered) {
bool isFactoredVocab = vocabs_.back()->tryAs<FactoredVocab>() != nullptr;
altered = false;
if (allCapsEvery_ != 0 && pos_ % allCapsEvery_ == 0 && !inference_) {
if (allCapsEvery_ != 0 && lineId % allCapsEvery_ == 0 && !inference_) {
line = vocabs_[streamId]->toUpper(line);
if (streamId == 0)
LOG_ONCE(info, "[data] Source all-caps'ed line to: {}", line);
@ -50,7 +50,7 @@ void Corpus::preprocessLine(std::string& line, size_t streamId, bool& altered) {
LOG_ONCE(info, "[data] Target all-caps'ed line to: {}", line);
altered = isFactoredVocab ? false : true; // FS vocab does not really "alter" the token lemma for all caps
}
else if (titleCaseEvery_ != 0 && pos_ % titleCaseEvery_ == 1 && !inference_ && streamId == 0) {
else if (titleCaseEvery_ != 0 && lineId % titleCaseEvery_ == 1 && !inference_ && streamId == 0) {
// Only applied to stream 0 (source) since this feature is aimed at robustness against
// title case in the source (and not at translating into title case).
// Note: It is the user's responsibility to not enable this if the source language is not English.
@ -127,7 +127,7 @@ SentenceTuple Corpus::next() {
} else {
size_t vocabId = i - shift;
bool altered;
preprocessLine(fields[i], vocabId, /*out=*/altered);
preprocessLine(fields[i], vocabId, curId, /*out=*/altered);
if (altered)
tup.markAltered();
addWordsToSentenceTuple(fields[i], vocabId, tup);

View File

@ -33,7 +33,7 @@ private:
// for pre-processing
size_t allCapsEvery_{0}; // if set, convert every N-th input sentence (after randomization) to all-caps (source and target)
size_t titleCaseEvery_{0}; // ditto for title case (source only)
void preprocessLine(std::string& line, size_t streamId, bool& altered); // altered => whether the segmentation was altered in marian
void preprocessLine(std::string& line, size_t streamId, size_t curId, bool& altered); // altered => whether the segmentation was altered in marian
public:
// @TODO: check if translate can be replaced by an option in options