diff --git a/CHANGELOG.md b/CHANGELOG.md index 4e2a40d5..5cb28e12 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ### Added ### Fixed +- Fixed case augmentation with multi-threaded reading. - Scripts using PyYAML now use `safe_load`; see https://msg.pyyaml.org/load ### Changed diff --git a/VERSION b/VERSION index 3d461ead..f5f1545d 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -v1.11.3 +v1.11.4 diff --git a/src/data/corpus.cpp b/src/data/corpus.cpp index 2fbe4982..835d9d76 100644 --- a/src/data/corpus.cpp +++ b/src/data/corpus.cpp @@ -39,10 +39,10 @@ Corpus::Corpus(std::vector paths, } -void Corpus::preprocessLine(std::string& line, size_t streamId, bool& altered) { +void Corpus::preprocessLine(std::string& line, size_t streamId, size_t lineId, bool& altered) { bool isFactoredVocab = vocabs_.back()->tryAs() != nullptr; altered = false; - if (allCapsEvery_ != 0 && pos_ % allCapsEvery_ == 0 && !inference_) { + if (allCapsEvery_ != 0 && lineId % allCapsEvery_ == 0 && !inference_) { line = vocabs_[streamId]->toUpper(line); if (streamId == 0) LOG_ONCE(info, "[data] Source all-caps'ed line to: {}", line); @@ -50,7 +50,7 @@ void Corpus::preprocessLine(std::string& line, size_t streamId, bool& altered) { LOG_ONCE(info, "[data] Target all-caps'ed line to: {}", line); altered = isFactoredVocab ? false : true; // FS vocab does not really "alter" the token lemma for all caps } - else if (titleCaseEvery_ != 0 && pos_ % titleCaseEvery_ == 1 && !inference_ && streamId == 0) { + else if (titleCaseEvery_ != 0 && lineId % titleCaseEvery_ == 1 && !inference_ && streamId == 0) { // Only applied to stream 0 (source) since this feature is aimed at robustness against // title case in the source (and not at translating into title case). // Note: It is user's responsibility to not enable this if the source language is not English. 
@@ -127,7 +127,7 @@ SentenceTuple Corpus::next() { } else { size_t vocabId = i - shift; bool altered; - preprocessLine(fields[i], vocabId, /*out=*/altered); + preprocessLine(fields[i], vocabId, curId, /*out=*/altered); if (altered) tup.markAltered(); addWordsToSentenceTuple(fields[i], vocabId, tup); diff --git a/src/data/corpus.h b/src/data/corpus.h index 281d43a2..20200e93 100644 --- a/src/data/corpus.h +++ b/src/data/corpus.h @@ -33,7 +33,7 @@ private: // for pre-processing size_t allCapsEvery_{0}; // if set, convert every N-th input sentence (after randomization) to all-caps (source and target) size_t titleCaseEvery_{0}; // ditto for title case (source only) - void preprocessLine(std::string& line, size_t streamId, bool& altered); // altered => whether the segmentation was altered in marian + void preprocessLine(std::string& line, size_t streamId, size_t lineId, bool& altered); // lineId => line index used to select every N-th line for case augmentation; altered => whether the segmentation was altered in marian public: // @TODO: check if translate can be replaced by an option in options