mirror of
https://github.com/marian-nmt/marian.git
synced 2024-09-17 09:47:34 +03:00
Merged PR 22939: Fix case augmentation with multi-threaded reading
This PR fixes case augmentation with multi-threaded reading. The solution is to stop reading iterator::pos_ during lazy processing and instead pass the line position as an argument to the lazy function.
This commit is contained in:
parent
adaaf087e4
commit
310d2f42f6
@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
|
|||||||
### Added
|
### Added
|
||||||
|
|
||||||
### Fixed
|
### Fixed
|
||||||
|
- Fixed case augmentation with multi-threaded reading.
|
||||||
- Scripts using PyYAML now use `safe_load`; see https://msg.pyyaml.org/load
|
- Scripts using PyYAML now use `safe_load`; see https://msg.pyyaml.org/load
|
||||||
|
|
||||||
### Changed
|
### Changed
|
||||||
|
@ -39,10 +39,10 @@ Corpus::Corpus(std::vector<std::string> paths,
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void Corpus::preprocessLine(std::string& line, size_t streamId, bool& altered) {
|
void Corpus::preprocessLine(std::string& line, size_t streamId, size_t lineId, bool& altered) {
|
||||||
bool isFactoredVocab = vocabs_.back()->tryAs<FactoredVocab>() != nullptr;
|
bool isFactoredVocab = vocabs_.back()->tryAs<FactoredVocab>() != nullptr;
|
||||||
altered = false;
|
altered = false;
|
||||||
if (allCapsEvery_ != 0 && pos_ % allCapsEvery_ == 0 && !inference_) {
|
if (allCapsEvery_ != 0 && lineId % allCapsEvery_ == 0 && !inference_) {
|
||||||
line = vocabs_[streamId]->toUpper(line);
|
line = vocabs_[streamId]->toUpper(line);
|
||||||
if (streamId == 0)
|
if (streamId == 0)
|
||||||
LOG_ONCE(info, "[data] Source all-caps'ed line to: {}", line);
|
LOG_ONCE(info, "[data] Source all-caps'ed line to: {}", line);
|
||||||
@ -50,7 +50,7 @@ void Corpus::preprocessLine(std::string& line, size_t streamId, bool& altered) {
|
|||||||
LOG_ONCE(info, "[data] Target all-caps'ed line to: {}", line);
|
LOG_ONCE(info, "[data] Target all-caps'ed line to: {}", line);
|
||||||
altered = isFactoredVocab ? false : true; // FS vocab does not really "alter" the token lemma for all caps
|
altered = isFactoredVocab ? false : true; // FS vocab does not really "alter" the token lemma for all caps
|
||||||
}
|
}
|
||||||
else if (titleCaseEvery_ != 0 && pos_ % titleCaseEvery_ == 1 && !inference_ && streamId == 0) {
|
else if (titleCaseEvery_ != 0 && lineId % titleCaseEvery_ == 1 && !inference_ && streamId == 0) {
|
||||||
// Only applied to stream 0 (source) since this feature is aimed at robustness against
|
// Only applied to stream 0 (source) since this feature is aimed at robustness against
|
||||||
// title case in the source (and not at translating into title case).
|
// title case in the source (and not at translating into title case).
|
||||||
// Note: It is user's responsibility to not enable this if the source language is not English.
|
// Note: It is user's responsibility to not enable this if the source language is not English.
|
||||||
@ -127,7 +127,7 @@ SentenceTuple Corpus::next() {
|
|||||||
} else {
|
} else {
|
||||||
size_t vocabId = i - shift;
|
size_t vocabId = i - shift;
|
||||||
bool altered;
|
bool altered;
|
||||||
preprocessLine(fields[i], vocabId, /*out=*/altered);
|
preprocessLine(fields[i], vocabId, curId, /*out=*/altered);
|
||||||
if (altered)
|
if (altered)
|
||||||
tup.markAltered();
|
tup.markAltered();
|
||||||
addWordsToSentenceTuple(fields[i], vocabId, tup);
|
addWordsToSentenceTuple(fields[i], vocabId, tup);
|
||||||
|
@ -33,7 +33,7 @@ private:
|
|||||||
// for pre-processing
|
// for pre-processing
|
||||||
size_t allCapsEvery_{0}; // if set, convert every N-th input sentence (after randomization) to all-caps (source and target)
|
size_t allCapsEvery_{0}; // if set, convert every N-th input sentence (after randomization) to all-caps (source and target)
|
||||||
size_t titleCaseEvery_{0}; // ditto for title case (source only)
|
size_t titleCaseEvery_{0}; // ditto for title case (source only)
|
||||||
void preprocessLine(std::string& line, size_t streamId, bool& altered); // altered => whether the segmentation was altered in marian
|
void preprocessLine(std::string& line, size_t streamId, size_t curId, bool& altered); // altered => whether the segmentation was altered in marian
|
||||||
|
|
||||||
public:
|
public:
|
||||||
// @TODO: check if translate can be replaced by an option in options
|
// @TODO: check if translate can be replaced by an option in options
|
||||||
|
Loading…
Reference in New Issue
Block a user