Mirror of https://github.com/marian-nmt/marian.git (synced 2024-11-03 20:13:47 +03:00)

Commit 8da539e835 (parent 266b931daa): merged with master
@@ -6,6 +6,13 @@
 # 3. Choose "Existing Azure Pipelines YAML file" and specify path to this file
 # 4. "More actions" > "Save"
 
+parameters:
+# Allow skipping the entire 'Build' stage
+- name: runBuilds
+  displayName: Run builds? Uncheck to run regression tests only.
+  type: boolean
+  default: true
+
 # The pipeline CI trigger is set on the branch master only and PR trigger on a
 # (non-draft) pull request to any branch
 trigger:

@@ -45,6 +52,7 @@ stages:
 
   ######################################################################
   - job: BuildWindows
+    condition: eq(${{ parameters.runBuilds }}, true)
     displayName: Windows
 
     strategy:

@@ -180,6 +188,7 @@ stages:
 
   ######################################################################
   - job: BuildUbuntu
+    condition: eq(${{ parameters.runBuilds }}, true)
    displayName: Ubuntu
    timeoutInMinutes: 90
 
@@ -237,17 +246,7 @@ stages:
         examples: true
         static: true
       ################################################################
-      # Ubuntu 16.04 supports CUDA 8+
-      "16.04 CUDA 9.2 gcc-7":
-        image: ubuntu-16.04
-        boost: true
-        cpu: true
-        gpu: true
-        cuda: 9.2
-        gcc: 7
-        unit_tests: true
-        examples: true
-        static: false
+      # Ubuntu 16.04 is no longer available on Azure-hosted machines
 
     pool:
       vmImage: $(image)

@@ -322,18 +321,17 @@ stages:
 
   ######################################################################
   - job: BuildUbuntuMinimal
-    displayName: Ubuntu CPU+GPU gcc-5 cmake 3.5
+    condition: eq(${{ parameters.runBuilds }}, true)
+    displayName: Ubuntu CPU+GPU gcc-7 cmake 3.5
 
    pool:
-     vmImage: ubuntu-16.04
+     vmImage: ubuntu-18.04
 
    steps:
    - checkout: self
      submodules: true
 
    # The script simplifies installation of different versions of CUDA.
-   # Ubuntu 16.04 on Azure-hosted VMs have GCC 5.5 as gcc-5, which is not compatible with CUDA 9.
-   # Downgrading to GCC 5.4 (the default gcc on Ubuntu 16.04) would be more work...
    - bash: ./scripts/ci/install_cuda_ubuntu.sh "10.0"
      displayName: Install CUDA

@@ -346,10 +344,10 @@ stages:
 
    # GCC 5 is the minimum version supported
    - bash: |
-       /usr/bin/gcc-5 --version
+       /usr/bin/gcc-7 --version
        mkdir -p build
        cd build
-       CC=/usr/bin/gcc-5 CXX=/usr/bin/g++-5 CUDAHOSTCXX=/usr/bin/g++-5 \
+       CC=/usr/bin/gcc-7 CXX=/usr/bin/g++-7 CUDAHOSTCXX=/usr/bin/g++-7 \
        ../cmake-3.5.1-Linux-x86_64/bin/cmake .. \
        -DCOMPILE_CPU=on \
        -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-10.0

@@ -368,10 +366,11 @@ stages:
 
   ######################################################################
   - job: BuildMacOS
+    condition: eq(${{ parameters.runBuilds }}, true)
    displayName: macOS CPU clang
 
    pool:
-     vmImage: macos-latest
+     vmImage: macos-10.15
 
    steps:
    - checkout: self

@@ -416,6 +415,7 @@ stages:
 
   ######################################################################
   - job: BuildInstall
+    condition: eq(${{ parameters.runBuilds }}, true)
    displayName: Linux CPU library install
 
    pool:

@@ -580,7 +580,7 @@ stages:
 
    # Avoid using $(Build.SourcesDirectory) in bash tasks because on Windows pools it uses '\'
    # instead of '/', which often breaks the job
-   - bash: MARIAN=../marian-dev/build bash ./run_mrt.sh '#cpu' '#basics'
+   - bash: MARIAN=../marian-dev/build TIMEOUT=10m bash ./run_mrt.sh '#cpu' '#basics' '#devops'
     continueOnError: true
     displayName: Run tests
     workingDirectory: marian-prod-tests

@@ -677,7 +677,7 @@ stages:
        AWS_SECRET_SAS_TOKEN: $(blob-sas-token)
      workingDirectory: marian-prod-tests
 
-   - bash: MARIAN=../marian-dev/build bash ./run_mrt.sh '#cpu' '#basics'
+   - bash: MARIAN=../marian-dev/build bash ./run_mrt.sh '#cpu' '#basics' '#devops'
     continueOnError: true
     displayName: Run tests
     workingDirectory: marian-prod-tests
@@ -31,8 +31,8 @@ void ConfigParser::addAliases(cli::CLIWrapper& cli) {
   cli.alias("fp16", "true", [&](YAML::Node& config) {
     if(mode_ == cli::mode::training) {
       config["precision"] = std::vector<std::string>({"float16", "float32"}); // inference type, optimization type, save type
-      // scaling factor (power of 2), frequency, multiplier at increase, tolerance, range, minimum factor
-      config["cost-scaling"] = std::vector<std::string>({"0", "1000", "2", "0.05", "10", "1e-5"});
+      // scaling factor, frequency, multiplier at increase, minimum scaling factor
+      config["cost-scaling"] = std::vector<std::string>({"256.f", "1000", "2.f", "256.f"});
     } else {
       config["precision"] = std::vector<std::string>({"float16"}); // for inference we do not need the other types
     }

@@ -267,10 +267,16 @@ void ConfigParser::addOptionsModel(cli::CLIWrapper& cli) {
       "Pool encoder states instead of using cross attention (selects first encoder state, best used with special token)");
   cli.add<int>("--transformer-dim-ffn",
       "Size of position-wise feed-forward network (transformer)",
-      2048);
+      2048);
+  cli.add<int>("--transformer-decoder-dim-ffn",
+      "Size of position-wise feed-forward network in decoder (transformer). Uses --transformer-dim-ffn if 0.",
+      0);
   cli.add<int>("--transformer-ffn-depth",
       "Depth of filters (transformer)",
       2);
+  cli.add<int>("--transformer-decoder-ffn-depth",
+      "Depth of filters in decoder (transformer). Uses --transformer-ffn-depth if 0",
+      0);
   cli.add<std::string>("--transformer-ffn-activation",
       "Activation between filters: swish or relu (transformer)",
       "swish");

@@ -528,15 +534,15 @@ void ConfigParser::addOptionsTraining(cli::CLIWrapper& cli) {
   // mixed precision training
   cli.add<bool>("--fp16",
       "Shortcut for mixed precision training with float16 and cost-scaling, "
-      "corresponds to: --precision float16 float32 --cost-scaling 0 1000 2 0.05 10 1e-5f");
+      "corresponds to: --precision float16 float32 --cost-scaling 256.f 1000 2.f 256.f");
   cli.add<std::vector<std::string>>("--precision",
       "Mixed precision training for forward/backward pass and optimization. "
      "Defines types for: forward/backward pass, optimization.",
      {"float32", "float32"});
   cli.add<std::vector<std::string>>("--cost-scaling",
       "Dynamic cost scaling for mixed precision training: "
-      "power of 2, scaling window, scaling factor, tolerance, range, minimum factor")
-      ->implicit_val("0.f 1000 2.f 0.05f 10 1e-5f");
+      "scaling factor, frequency, multiplier, minimum factor")
+      ->implicit_val("256.f 1000 2.f 256.f");
   cli.add<size_t>("--gradient-norm-average-window",
       "Window size over which the exponential average of the gradient norm is recorded (for logging and scaling). "
       "After this many updates about 90% of the mass of the exponential average comes from these updates",
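Editor's note on the new cost-scaling defaults: the four values "256.f 1000 2.f 256.f" read as starting factor, upscaling frequency, multiplier, and minimum factor. The sketch below is illustrative only (it is not Marian's implementation and the `CostScaler` type is made up); it shows how such parameters are typically interpreted in dynamic loss scaling for fp16 training.

```cpp
// Illustrative sketch, not Marian code: interpretation of
// --cost-scaling "256.f 1000 2.f 256.f" (factor, frequency, multiplier, minimum).
#include <algorithm>
#include <cmath>

struct CostScaler {
  float scale      = 256.f;  // current scaling factor applied to the loss
  int   frequency  = 1000;   // updates between attempts to raise the scale
  float multiplier = 2.f;    // factor used when raising/lowering the scale
  float minScale   = 256.f;  // never drop below this
  int   goodSteps  = 0;      // consecutive updates with finite gradients

  // Call once per update, reporting whether the scaled gradients were finite.
  void update(bool gradientsFinite) {
    if(!gradientsFinite) {                 // overflow: shrink the scale, skip the update
      scale = std::max(scale / multiplier, minScale);
      goodSteps = 0;
    } else if(++goodSteps >= frequency) {  // stable for a while: try a larger scale
      scale *= multiplier;
      goodSteps = 0;
    }
  }
};
```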
@@ -702,9 +708,10 @@ void ConfigParser::addOptionsTranslation(cli::CLIWrapper& cli) {
       "Use softmax shortlist: path first best prune");
   cli.add<std::vector<float>>("--weights",
       "Scorer weights");
-  cli.add<bool>("--output-sampling",
-      "Noise output layer with gumbel noise",
-      false);
+  cli.add<std::vector<std::string>>("--output-sampling",
+      "Noise output layer with gumbel noise. Implicit default is 'full' for sampling from full distribution. "
+      " Also accepts 'topk num' (e.g. topk 100) for top-100 sampling.")
+      ->implicit_val("full");
   cli.add<std::vector<int>>("--output-approx-knn",
       "Use approximate knn search in output layer (currently only in transformer)")
       ->implicit_val("100 1024");
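Because `--output-sampling` is now a list-valued option, consumers read it as a vector of strings. The sketch below mirrors the handling added in `createModelFromOptions()` later in this commit; the wrapping function `configureSampling` is hypothetical and only illustrates the intended parsing.

```cpp
// Sketch only: how the vector-valued --output-sampling option can be consumed.
#include <string>
#include <vector>
#include "common/definitions.h"  // Ptr<>
#include "common/options.h"

void configureSampling(marian::Ptr<marian::Options> options) {
  if(options->hasAndNotEmpty("output-sampling")) {
    auto sampling = options->get<std::vector<std::string>>("output-sampling", {});
    std::string method = sampling.empty() ? "full" : sampling[0];  // "full" or "topk"
    int k = sampling.size() > 1 ? std::stoi(sampling[1]) : 10;     // e.g. "topk 100"
    (void)k;
    // ... choose full-distribution or top-k Gumbel sampling here ...
  }
}
```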
@@ -889,6 +896,10 @@ void ConfigParser::addSuboptionsBatching(cli::CLIWrapper& cli) {
   if(mode_ == cli::mode::training) {
     cli.add<bool>("--shuffle-in-ram",
         "Keep shuffled corpus in RAM, do not write to temp file");
+
+    cli.add<size_t>("--data-threads",
+        "Number of concurrent threads to use during data reading and processing", 1);
+
     // @TODO: Consider making the next two options options of the vocab instead, to make it more local in scope.
     cli.add<size_t>("--all-caps-every",
         "When forming minibatches, preprocess every Nth line on the fly to all-caps. Assumes UTF-8");

@@ -907,6 +918,9 @@ void ConfigParser::addSuboptionsBatching(cli::CLIWrapper& cli) {
     cli.add<bool>("--mini-batch-round-up",
         "Round up batch size to next power of 2 for more efficient training, but this can make batch size less stable. Disable with --mini-batch-round-up=false",
         true);
+  } else {
+    cli.add<size_t>("--data-threads",
+        "Number of concurrent threads to use during data reading and processing", 1);
   }
   // clang-format on
 }
@@ -106,24 +106,24 @@ using Weak = std::weak_ptr<T>;
 /** @brief Creates shared_ptr of any type, passes all arguments to any available
  * constructor */
 template <class T, typename... Args>
-Ptr<T> New(Args&&... args) {
-  return Ptr<T>(new T(std::forward<Args>(args)...));
+inline Ptr<T> New(Args&&... args) {
+  return std::make_shared<T>(std::forward<Args>(args)...);
 }
 
 template <class T>
-Ptr<T> New(Ptr<T> p) {
+inline Ptr<T> New(Ptr<T> p) {
   return Ptr<T>(p);
 }
 
 /** @brief Creates IntrusivePtr of any type, passes all arguments to any available
  * constructor */
 template <class T, typename... Args>
-IPtr<T> INew(Args&&... args) {
+inline IPtr<T> INew(Args&&... args) {
   return IPtr<T>(new T(std::forward<Args>(args)...));
 }
 
 template <class T>
-IPtr<T> INew(Ptr<T> p) {
+inline IPtr<T> INew(Ptr<T> p) {
   return IPtr<T>(p);
 }
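The switch to `std::make_shared` keeps every `New<T>(...)` call site unchanged while allocating the object and its control block in one go. A minimal, self-contained sketch of the usage (the `Widget` type here is a made-up example, not a Marian class):

```cpp
// Sketch of the New<T>() helper; Widget is an illustrative example type.
#include <memory>
#include <string>
#include <utility>

template <class T>
using Ptr = std::shared_ptr<T>;

template <class T, typename... Args>
inline Ptr<T> New(Args&&... args) {
  return std::make_shared<T>(std::forward<Args>(args)...);
}

struct Widget {
  std::string name;
  int size;
  Widget(std::string n, int s) : name(std::move(n)), size(s) {}
};

int main() {
  Ptr<Widget> w = New<Widget>("encoder", 512);  // single allocation for object + control block
  return w->size == 512 ? 0 : 1;
}
```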
@@ -70,22 +70,20 @@ void split(const std::string& line,
 // the function guarantees that the output has as many elements as requested
 void splitTsv(const std::string& line, std::vector<std::string>& fields, size_t numFields) {
-  fields.clear();
+  fields.resize(numFields); // make sure there are as many elements as requested
 
   size_t begin = 0;
   size_t pos = 0;
   for(size_t i = 0; i < numFields; ++i) {
     pos = line.find('\t', begin);
     if(pos == std::string::npos) {
-      fields.push_back(line.substr(begin));
+      fields[i] = line.substr(begin);
       break;
     }
-    fields.push_back(line.substr(begin, pos - begin));
+    fields[i] = line.substr(begin, pos - begin);
     begin = pos + 1;
   }
 
-  if(fields.size() < numFields) // make sure there is as many elements as requested
-    fields.resize(numFields);
-
   ABORT_IF(pos != std::string::npos, "Excessive field(s) in the tab-separated line: '{}'", line);
 }
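A small usage sketch of `splitTsv()` (not taken from the Marian test suite; it assumes the declaration lives in src/common/utils.h, as the `utils::splitTsv` call sites elsewhere in this commit suggest): the function always yields exactly `numFields` entries, leaving missing trailing fields empty and aborting on extra tab-separated fields.

```cpp
// Illustrative usage of splitTsv(); assumes the declaration below from utils.h.
#include <cassert>
#include <string>
#include <vector>

namespace marian { namespace utils {
void splitTsv(const std::string& line, std::vector<std::string>& fields, size_t numFields);
}}

int main() {
  std::vector<std::string> fields;
  marian::utils::splitTsv("src sentence\ttgt sentence", fields, 3);
  assert(fields.size() == 3);
  assert(fields[0] == "src sentence");
  assert(fields[1] == "tgt sentence");
  assert(fields[2].empty());   // missing third field comes back as an empty string
  return 0;
}
```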
@@ -2,6 +2,7 @@
 
 #include "common/options.h"
 #include "common/signal_handling.h"
+#include "common/timer.h"
 #include "data/batch_stats.h"
 #include "data/rng_engine.h"
 #include "training/training_state.h"

@@ -92,6 +93,8 @@ private:
 
   // this runs on a bg thread; sequencing is handled by caller, but locking is done in here
   std::deque<BatchPtr> fetchBatches() {
+    timer::Timer total;
+
     typedef typename Sample::value_type Item;
     auto itemCmp = [](const Item& sa, const Item& sb) { return sa.size() < sb.size(); }; // sort by element length, not content
 
@@ -135,19 +138,29 @@ private:
       if(current_ != data_->end())
         ++current_;
     }
-    size_t sets = 0;
-    while(current_ != data_->end() && maxiBatch->size() < maxSize) { // loop over data
+
+    Samples maxiBatchTemp;
+    while(current_ != data_->end() && maxiBatchTemp.size() < maxSize) { // loop over data
       if (saveAndExitRequested()) // stop generating batches
         return std::deque<BatchPtr>();
-      maxiBatch->push(*current_);
-      sets = current_->size();
+
+      maxiBatchTemp.push_back(*current_);
+
       // do not consume more than required for the maxi batch as this causes
       // that line-by-line translation is delayed by one sentence
-      bool last = maxiBatch->size() == maxSize;
+      bool last = maxiBatchTemp.size() == maxSize;
       if(!last)
         ++current_; // this actually reads the next line and pre-processes it
     }
-    size_t numSentencesRead = maxiBatch->size();
+    size_t numSentencesRead = maxiBatchTemp.size();
+
+    size_t sets = 0;
+    for(auto&& s : maxiBatchTemp) {
+      if(!s.empty()) {
+        sets = s.size();
+        maxiBatch->push(s);
+      }
+    }
 
     // construct the actual batches and place them in the queue
     Samples batchVector;

@@ -163,6 +176,7 @@ private:
     BatchStats::const_iterator cachedStatsIter;
     if (stats_)
       cachedStatsIter = stats_->begin();
 
     while(!maxiBatch->empty()) { // while there are sentences in the queue
       if (saveAndExitRequested()) // stop generating batches
         return std::deque<BatchPtr>();

@@ -178,12 +192,7 @@ private:
           lengths[i] = batchVector.back()[i].size(); // record max lengths so far
 
         maxBatchSize = stats_->findBatchSize(lengths, cachedStatsIter);
-        // this optimization makes no difference indeed
-#if 0   // sanity check: would we find the same entry if searching from the start?
-        auto it = stats_->lower_bound(lengths);
-        auto maxBatchSize1 = stats_->findBatchSize(lengths, it);
-        ABORT_IF(maxBatchSize != maxBatchSize1, "findBatchSize iter caching logic is borked");
-#endif
 
         makeBatch = batchVector.size() >= maxBatchSize;
         // if last added sentence caused a bump then we likely have bad padding, so rather move it into the next batch
         if(batchVector.size() > maxBatchSize) {

@@ -231,6 +240,8 @@ private:
     LOG(debug, "[data] fetched {} batches with {} sentences. Per batch: {} sentences, {} labels.",
         tempBatches.size(), numSentencesRead,
         (double)totalSent / (double)totalDenom, (double)totalLabels / (double)totalDenom);
+    LOG(debug, "[data] fetching batches took {:.2f} seconds, {:.2f} sents/s", total.elapsed(), (double)numSentencesRead / total.elapsed());
+
     return tempBatches;
   }
 
@@ -14,18 +14,30 @@ namespace data {
 
 Corpus::Corpus(Ptr<Options> options, bool translate /*= false*/, size_t seed /*= Config:seed*/)
     : CorpusBase(options, translate, seed),
-      shuffleInRAM_(options_->get<bool>("shuffle-in-ram", false)),
-      allCapsEvery_(options_->get<size_t>("all-caps-every", 0)),
-      titleCaseEvery_(options_->get<size_t>("english-title-case-every", 0)) {}
+      shuffleInRAM_(options_->get<bool>("shuffle-in-ram", false)),
+      allCapsEvery_(options_->get<size_t>("all-caps-every", 0)),
+      titleCaseEvery_(options_->get<size_t>("english-title-case-every", 0)) {
+
+  auto numThreads = options_->get<size_t>("data-threads", 1);
+  if(numThreads > 1)
+    threadPool_.reset(new ThreadPool(numThreads));
+
+}
 
 Corpus::Corpus(std::vector<std::string> paths,
                std::vector<Ptr<Vocab>> vocabs,
                Ptr<Options> options,
                size_t seed /*= Config:seed*/)
     : CorpusBase(paths, vocabs, options, seed),
-      shuffleInRAM_(options_->get<bool>("shuffle-in-ram", false)),
-      allCapsEvery_(options_->get<size_t>("all-caps-every", 0)),
-      titleCaseEvery_(options_->get<size_t>("english-title-case-every", 0)) {}
+      shuffleInRAM_(options_->get<bool>("shuffle-in-ram", false)),
+      allCapsEvery_(options_->get<size_t>("all-caps-every", 0)),
+      titleCaseEvery_(options_->get<size_t>("english-title-case-every", 0)) {
+
+  auto numThreads = options_->get<size_t>("data-threads", 1);
+  if(numThreads > 1)
+    threadPool_.reset(new ThreadPool(numThreads));
+
+}
 
 void Corpus::preprocessLine(std::string& line, size_t streamId, bool& altered) {
   bool isFactoredVocab = vocabs_.back()->tryAs<FactoredVocab>() != nullptr;

@@ -52,16 +64,10 @@ void Corpus::preprocessLine(std::string& line, size_t streamId, bool& altered) {
 }
 
 SentenceTuple Corpus::next() {
-  // Used for handling TSV inputs
-  // Determine the total number of fields including alignments or weights
-  auto tsvNumAllFields = tsvNumInputFields_;
-  if(alignFileIdx_ > -1)
-    ++tsvNumAllFields;
-  if(weightFileIdx_ > -1)
-    ++tsvNumAllFields;
-  std::vector<std::string> fields(tsvNumAllFields);
+  size_t numStreams = corpusInRAM_.empty() ? files_.size() : corpusInRAM_.size();
+  std::vector<std::string> fields(numStreams);
 
-  for(;;) { // (this is a retry loop for skipping invalid sentences)
+  while(true) { // retry loop
     // get index of the current sentence
     size_t curId = pos_; // note: at end, pos_ == total size
     // if corpus has been shuffled, ids_ contains sentence indexes
@@ -69,83 +75,91 @@ SentenceTuple Corpus::next() {
       curId = ids_[pos_];
     pos_++;
 
-    // fill up the sentence tuple with sentences from all input files
-    SentenceTuple tup(curId);
     size_t eofsHit = 0;
-    size_t numStreams = corpusInRAM_.empty() ? files_.size() : corpusInRAM_.size();
-    for(size_t i = 0; i < numStreams; ++i) {
-      std::string line;
+    for(size_t i = 0; i < numStreams; ++i) { // looping of all streams
       // fetch line, from cached copy in RAM or actual file
       if (!corpusInRAM_.empty()) {
         if (curId < corpusInRAM_[i].size())
-          line = corpusInRAM_[i][curId];
+          fields[i] = corpusInRAM_[i][curId];
         else {
           eofsHit++;
           continue;
         }
       }
       else {
-        bool gotLine = io::getline(*files_[i], line).good();
+        bool gotLine = io::getline(*files_[i], fields[i]).good();
         if(!gotLine) {
           eofsHit++;
           continue;
         }
       }
-
-      if(i > 0 && i == alignFileIdx_) {
-        addAlignmentToSentenceTuple(line, tup);
-      } else if(i > 0 && i == weightFileIdx_) {
-        addWeightsToSentenceTuple(line, tup);
-      } else {
-        if(tsv_) { // split TSV input and add each field into the sentence tuple
-          utils::splitTsv(line, fields, tsvNumAllFields);
-          size_t shift = 0;
-          for(size_t j = 0; j < tsvNumAllFields; ++j) {
-            // index j needs to be shifted to get the proper vocab index if guided-alignment or
-            // data-weighting are preceding source or target sequences in TSV input
-            if(j == alignFileIdx_ || j == weightFileIdx_) {
-              ++shift;
-            } else {
-              size_t vocabId = j - shift;
-              bool altered;
-              preprocessLine(fields[j], vocabId, /*out=*/altered);
-              if (altered)
-                tup.markAltered();
-              addWordsToSentenceTuple(fields[j], vocabId, tup);
-            }
-          }
-
-          // weights are added last to the sentence tuple, because this runs a validation that needs
-          // length of the target sequence
-          if(alignFileIdx_ > -1)
-            addAlignmentToSentenceTuple(fields[alignFileIdx_], tup);
-          if(weightFileIdx_ > -1)
-            addWeightsToSentenceTuple(fields[weightFileIdx_], tup);
-
-        } else {
-          bool altered;
-          preprocessLine(line, i, /*out=*/altered);
-          if (altered)
-            tup.markAltered();
-          addWordsToSentenceTuple(line, i, tup);
-        }
-      }
     }
 
-    if (eofsHit == numStreams)
-      return SentenceTuple(0);
+    if(eofsHit == numStreams)
+      return SentenceTuple(); // uninitialized SentenceTuple which will be invalid when tested
+
     ABORT_IF(eofsHit != 0, "not all input files have the same number of lines");
 
-    // check if all streams are valid, that is, non-empty and no longer than maximum allowed length
-    if(std::all_of(tup.begin(), tup.end(), [=](const Words& words) {
-         return words.size() > 0 && words.size() <= maxLength_;
-       }))
-      return tup;
+    auto makeSentenceTuple = [this](size_t curId, std::vector<std::string> fields) {
+      if(tsv_) {
+        // with tsv input data, there is only one input stream, hence we only have one field
+        // which needs to be tokenized into tab-separated fields
+        ABORT_IF(fields.size() != 1, "Reading TSV file, but we don't have exactly one stream??");
+        size_t numAllFields = tsvNumInputFields_;
+        if(alignFileIdx_ > -1)
+          ++numAllFields;
+        if(weightFileIdx_ > -1)
+          ++numAllFields;
+        // replace single-element fields array with extracted tsv fields
+        std::vector<std::string> tmpFields;
+        utils::splitTsv(fields[0], tmpFields, numAllFields); // this verifies the number of fields
+        fields.swap(tmpFields);
+      }
 
-    // otherwise skip this sentence and try the next one
-    // @TODO: tail recursion?
-  }
+      // fill up the sentence tuple with sentences from all input files
+      SentenceTupleImpl tup(curId);
+      size_t shift = 0;
+      for(size_t i = 0; i < fields.size(); ++i) {
+        // index i needs to be shifted to get the proper vocab index if guided-alignment or
+        // data-weighting are preceding source or target sequences in TSV input
+        if(i == alignFileIdx_ || i == weightFileIdx_) {
+          ++shift;
+        } else {
+          size_t vocabId = i - shift;
+          bool altered;
+          preprocessLine(fields[i], vocabId, /*out=*/altered);
+          if (altered)
+            tup.markAltered();
+          addWordsToSentenceTuple(fields[i], vocabId, tup);
+        }
+
+        // weights are added last to the sentence tuple, because this runs a validation that needs
+        // length of the target sequence
+        if(alignFileIdx_ > -1)
+          addAlignmentToSentenceTuple(fields[alignFileIdx_], tup);
+        if(weightFileIdx_ > -1)
+          addWeightsToSentenceTuple(fields[weightFileIdx_], tup);
+      }
+
+      // check if all streams are valid, that is, non-empty and no longer than maximum allowed length
+      if(std::all_of(tup.begin(), tup.end(), [=](const Words& words) {
+           return words.size() > 0 && words.size() <= maxLength_;
+         })) {
+        return tup;
+      } else {
+        return SentenceTupleImpl(); // return an empty tuple if above test does not pass
+      }
+    };
+
+    if(threadPool_) { // use thread pool if available
+      return SentenceTuple(threadPool_->enqueue(makeSentenceTuple, curId, fields));
+    } else { // otherwise launch here and just pass the result into the wrapper
+      auto tup = makeSentenceTuple(curId, fields);
+      if(!tup.empty())
+        return SentenceTuple(tup);
+    }
+
+  } // end of retry loop
 }
 
 // reset and initialize shuffled reading
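The asynchronous branch above hands `makeSentenceTuple` to a thread pool and wraps the resulting future in a `SentenceTuple`. A minimal, self-contained sketch of that enqueue-then-wrap pattern, with `std::async` standing in for Marian's `ThreadPool::enqueue` and a made-up `TupleImpl` type in place of `SentenceTupleImpl`:

```cpp
// Illustrative sketch only: produce a tuple on a worker, consume it lazily.
#include <future>
#include <iostream>
#include <string>
#include <utility>
#include <vector>

struct TupleImpl {                         // stands in for SentenceTupleImpl
  std::vector<std::string> streams;
  bool empty() const { return streams.empty(); }
};

TupleImpl makeTuple(size_t id, std::vector<std::string> fields) {
  (void)id;                                // heavy preprocessing would happen here
  return TupleImpl{std::move(fields)};
}

int main() {
  // enqueue: the caller gets a future immediately and keeps reading input
  std::future<TupleImpl> fut = std::async(std::launch::async, makeTuple, size_t(0),
                                          std::vector<std::string>{"hello", "welt"});
  // get(): the consumer blocks only when it actually needs the data
  TupleImpl tup = fut.get();
  std::cout << tup.streams.size() << " streams\n";
  return 0;
}
```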
@@ -167,6 +181,8 @@ void Corpus::reset() {
   pos_ = 0;
   for (size_t i = 0; i < paths_.size(); ++i) {
     if(paths_[i] == "stdin" || paths_[i] == "-") {
+      std::cin.tie(0);
+      std::ios_base::sync_with_stdio(false);
       files_[i].reset(new std::istream(std::cin.rdbuf()));
       // Probably not necessary, unless there are some buffers
       // that we want flushed.
@@ -4,6 +4,7 @@
 #include <iostream>
 #include <random>
 
+#include "3rd_party/threadpool.h"
 #include "common/definitions.h"
 #include "common/file_stream.h"
 #include "common/options.h"

@@ -20,6 +21,8 @@ class Corpus : public CorpusBase {
 private:
   std::vector<UPtr<io::TemporaryFile>> tempFiles_;
   std::vector<size_t> ids_;
 
+  UPtr<ThreadPool> threadPool_; // thread pool for parallelized data reading
+
   // for shuffle-in-ram
   bool shuffleInRAM_{false};
@@ -12,7 +12,24 @@ typedef std::vector<float> MaskBatch;
 typedef std::pair<WordBatch, MaskBatch> WordMask;
 typedef std::vector<WordMask> SentBatch;
 
-CorpusIterator::CorpusIterator() : pos_(-1), tup_(0) {}
+void SentenceTupleImpl::setWeights(const std::vector<float>& weights) {
+  if(weights.size() != 1) { // this assumes a single sentence-level weight is always fine
+    ABORT_IF(empty(), "Source and target sequences should be added to a tuple before data weights");
+    auto numWeights = weights.size();
+    auto numTrgWords = back().size();
+    // word-level weights may or may not contain a weight for EOS tokens
+    if(numWeights != numTrgWords && numWeights != numTrgWords - 1)
+      LOG(warn,
+          "[warn] "
+          "Number of weights ({}) does not match the number of target words ({}) in line #{}",
+          numWeights,
+          numTrgWords,
+          id_);
+  }
+  weights_ = weights;
+}
+
+CorpusIterator::CorpusIterator() : pos_(-1) {}
 
 CorpusIterator::CorpusIterator(CorpusBase* corpus)
     : corpus_(corpus), pos_(0), tup_(corpus_->next()) {}

@@ -23,7 +40,7 @@ void CorpusIterator::increment() {
 }
 
 bool CorpusIterator::equal(CorpusIterator const& other) const {
-  return this->pos_ == other.pos_ || (this->tup_.empty() && other.tup_.empty());
+  return this->pos_ == other.pos_ || (!this->tup_.valid() && !other.tup_.valid());
 }
 
 const SentenceTuple& CorpusIterator::dereference() const {
@@ -390,7 +407,7 @@ CorpusBase::CorpusBase(Ptr<Options> options, bool translate, size_t seed)
 
 void CorpusBase::addWordsToSentenceTuple(const std::string& line,
                                          size_t batchIndex,
-                                         SentenceTuple& tup) const {
+                                         SentenceTupleImpl& tup) const {
   // This turns a string in to a sequence of numerical word ids. Depending
   // on the vocabulary type, this can be non-trivial, e.g. when SentencePiece
   // is used.

@@ -411,7 +428,7 @@ void CorpusBase::addWordsToSentenceTuple(const std::string& line,
 }
 
 void CorpusBase::addAlignmentToSentenceTuple(const std::string& line,
-                                             SentenceTuple& tup) const {
+                                             SentenceTupleImpl& tup) const {
   ABORT_IF(rightLeft_,
            "Guided alignment and right-left model cannot be used "
            "together at the moment");

@@ -420,7 +437,7 @@ void CorpusBase::addAlignmentToSentenceTuple(const std::string& line,
   tup.setAlignment(align);
 }
 
-void CorpusBase::addWeightsToSentenceTuple(const std::string& line, SentenceTuple& tup) const {
+void CorpusBase::addWeightsToSentenceTuple(const std::string& line, SentenceTupleImpl& tup) const {
   auto elements = utils::split(line, " ");
 
   if(!elements.empty()) {

@@ -549,6 +566,7 @@ size_t CorpusBase::getNumberOfTSVInputFields(Ptr<Options> options) {
   return 0;
 }
 
+<<<<<<< HEAD
 void SentenceTuple::setWeights(const std::vector<float>& weights) {
   if(weights.size() != 1) { // this assumes a single sentence-level weight is always fine
     ABORT_IF(empty(), "Source and target sequences should be added to a tuple before data weights");
@@ -564,6 +582,55 @@ void SentenceTuple::setWeights(const std::vector<float>& weights) {
         id_);
   }
   weights_ = weights;
+=======
+// experimental: hide inline-fix source tokens from cross attention
+std::vector<float> SubBatch::crossMaskWithInlineFixSourceSuppressed() const
+{
+  const auto& srcVocab = *vocab();
+
+  auto factoredVocab = vocab()->tryAs<FactoredVocab>();
+  size_t inlineFixGroupIndex = 0, inlineFixSrc = 0;
+  auto hasInlineFixFactors = factoredVocab && factoredVocab->tryGetFactor(FactoredVocab_INLINE_FIX_WHAT_serialized, /*out*/ inlineFixGroupIndex, /*out*/ inlineFixSrc);
+
+  auto fixSrcId = srcVocab[FactoredVocab_FIX_SRC_ID_TAG];
+  auto fixTgtId = srcVocab[FactoredVocab_FIX_TGT_ID_TAG];
+  auto fixEndId = srcVocab[FactoredVocab_FIX_END_ID_TAG];
+  auto unkId = srcVocab.getUnkId();
+  auto hasInlineFixTags = fixSrcId != unkId && fixTgtId != unkId && fixEndId != unkId;
+
+  auto m = mask(); // default return value, which we will modify in-place below in case we need to
+  if (hasInlineFixFactors || hasInlineFixTags) {
+    LOG_ONCE(info, "[data] Suppressing cross-attention into inline-fix source tokens");
+
+    // example: force French translation of name "frank" to always be "franck"
+    //  - hasInlineFixFactors: "frank|is franck|it", "frank|is" cannot be cross-attended to
+    //  - hasInlineFixTags: "<IOPEN> frank <IDELIM> franck <ICLOSE>", "frank" and all tags cannot be cross-attended to
+    auto dimBatch = batchSize();  // number of sentences in the batch
+    auto dimWidth = batchWidth(); // number of words in the longest sentence in the batch
+    const auto& d = data();
+    size_t numWords = 0;
+    for (size_t b = 0; b < dimBatch; b++) { // loop over batch entries
+      bool inside = false;
+      for (size_t s = 0; s < dimWidth; s++) { // loop over source positions
+        auto i = locate(/*batchIdx=*/b, /*wordPos=*/s);
+        if (!m[i])
+          break;
+        numWords++;
+        // keep track of entering/exiting the inline-fix source tags
+        auto w = d[i];
+        if (w == fixSrcId)
+          inside = true;
+        else if (w == fixTgtId)
+          inside = false;
+        bool wHasSrcIdFactor = hasInlineFixFactors && factoredVocab->getFactor(w, inlineFixGroupIndex) == inlineFixSrc;
+        if (inside || w == fixSrcId || w == fixTgtId || w == fixEndId || wHasSrcIdFactor)
+          m[i] = 0.0f; // decoder must not look at embedded source, nor the markup tokens
+      }
+    }
+    ABORT_IF(batchWords() != 0/*n/a*/ && numWords != batchWords(), "batchWords() inconsistency??");
+  }
+  return m;
+>>>>>>> master
 }
 
 } // namespace data
@@ -11,6 +11,8 @@
 #include "data/rng_engine.h"
 #include "data/vocab.h"
 
+#include <future>
+
 namespace marian {
 namespace data {
 
@@ -22,7 +24,7 @@ namespace data {
  * construction of marian::data::CorpusBatch objects. They are not a part of
  * marian::data::CorpusBatch.
  */
-class SentenceTuple {
+class SentenceTupleImpl {
 private:
   size_t id_;
   std::vector<Words> tuple_; // [stream index][step index]
@@ -33,12 +35,17 @@ private:
 public:
   typedef Words value_type;
 
+  /**
+   * @brief Creates an empty tuple with 0 id (default constructor).
+   */
+  SentenceTupleImpl() : id_(0) {}
+
   /**
    * @brief Creates an empty tuple with the given Id.
    */
-  SentenceTuple(size_t id) : id_(id) {}
+  SentenceTupleImpl(size_t id) : id_(id) {}
 
-  ~SentenceTuple() { tuple_.clear(); }
+  ~SentenceTupleImpl() {}
 
   /**
    * @brief Returns the sentence's ID.
@@ -114,6 +121,92 @@ public:
   void setAlignment(const WordAlignment& alignment) { alignment_ = alignment; }
 };
 
+class SentenceTuple {
+private:
+  std::shared_ptr<std::future<SentenceTupleImpl>> fImpl_;
+  mutable std::shared_ptr<SentenceTupleImpl> impl_;
+
+public:
+  typedef Words value_type;
+
+  /**
+   * @brief Creates an empty tuple with no associated future.
+   */
+  SentenceTuple() {}
+
+  SentenceTuple(const SentenceTupleImpl& tupImpl)
+    : impl_(std::make_shared<SentenceTupleImpl>(tupImpl)) {}
+
+  SentenceTuple(std::future<SentenceTupleImpl>&& fImpl)
+    : fImpl_(new std::future<SentenceTupleImpl>(std::move(fImpl))) {}
+
+  SentenceTupleImpl& get() const {
+    if(!impl_) {
+      ABORT_IF(!fImpl_ || !fImpl_->valid(), "No future tuple associated with SentenceTuple");
+      impl_ = std::make_shared<SentenceTupleImpl>(fImpl_->get());
+    }
+    return *impl_;
+  }
+
+  /**
+   * @brief Returns the sentence's ID.
+   */
+  size_t getId() const { return get().getId(); }
+
+  /**
+   * @brief Returns whether this Tuple was altered or augmented from what
+   * was provided to Marian in input.
+   */
+  bool isAltered() const { return get().isAltered(); }
+
+  /**
+   * @brief The size of the tuple, e.g. two for parallel data with a source and
+   * target sentences.
+   */
+  size_t size() const { return get().size(); }
+
+  /**
+   * @brief confirms that the tuple has been populated with data
+   */
+  bool valid() const {
+    return fImpl_ || impl_;
+  }
+
+  /**
+   * @brief The i-th tuple sentence.
+   *
+   * @param i Tuple's index.
+   */
+  Words& operator[](size_t i) { return get()[i]; }
+  const Words& operator[](size_t i) const { return get()[i]; }
+
+  /**
+   * @brief The last tuple sentence, i.e. the target sentence.
+   */
+  Words& back() { return get().back(); }
+  const Words& back() const { return get().back(); }
+
+  /**
+   * @brief Checks whether the tuple is empty.
+   */
+  bool empty() const { return get().empty(); }
+
+  auto begin() const -> decltype(get().begin()) { return get().begin(); }
+  auto end() const -> decltype(get().end()) { return get().end(); }
+
+  auto rbegin() const -> decltype(get().rbegin()) { return get().rbegin(); }
+  auto rend() const -> decltype(get().rend()) { return get().rend(); }
+
+  /**
+   * @brief Get sentence weights.
+   *
+   * For sentence-level weights the vector contains only one element.
+   */
+  const std::vector<float>& getWeights() const { return get().getWeights(); }
+
+  const WordAlignment& getAlignment() const { return get().getAlignment(); }
+};
+
 /**
  * @brief Batch of sentences represented as word indices with masking.
  */
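For readers of the new wrapper class: a `SentenceTuple` can hold either a ready `SentenceTupleImpl` or a `std::future` that `get()` resolves lazily on first access. The sketch below is illustrative only (it assumes this header is included and uses `std::async` in place of the thread pool shown in the corpus code):

```cpp
// Illustrative sketch of lazy resolution via the future-backed SentenceTuple.
#include <future>
#include "data/corpus_base.h"

using marian::data::SentenceTuple;
using marian::data::SentenceTupleImpl;

SentenceTuple makeAsyncTuple() {
  // produce the impl on another thread; the wrapper stores the future
  std::future<SentenceTupleImpl> f = std::async(std::launch::async, [] {
    SentenceTupleImpl impl(/*id=*/42);
    // ... addWordsToSentenceTuple() etc. would fill the streams here ...
    return impl;
  });
  return SentenceTuple(std::move(f));
}

void consume() {
  SentenceTuple tup = makeAsyncTuple();
  if(tup.valid()) {          // true as soon as a future or impl is attached
    size_t n = tup.size();   // first accessor triggers fImpl_->get() exactly once
    (void)n;                 // later accessors reuse the cached impl_
  }
}
```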
@@ -583,17 +676,17 @@ protected:
    * @brief Helper function converting a line of text into words using the i-th
    * vocabulary and adding them to the sentence tuple.
    */
-  void addWordsToSentenceTuple(const std::string& line, size_t batchIndex, SentenceTuple& tup) const;
+  void addWordsToSentenceTuple(const std::string& line, size_t batchIndex, SentenceTupleImpl& tup) const;
   /**
    * @brief Helper function parsing a line with word alignments and adding them
    * to the sentence tuple.
    */
-  void addAlignmentToSentenceTuple(const std::string& line, SentenceTuple& tup) const;
+  void addAlignmentToSentenceTuple(const std::string& line, SentenceTupleImpl& tup) const;
   /**
    * @brief Helper function parsing a line of weights and adding them to the
    * sentence tuple.
    */
-  void addWeightsToSentenceTuple(const std::string& line, SentenceTuple& tup) const;
+  void addWeightsToSentenceTuple(const std::string& line, SentenceTupleImpl& tup) const;
 
   void addAlignmentsToBatch(Ptr<CorpusBatch> batch, const std::vector<Sample>& batchVector);
 
@@ -43,7 +43,7 @@ SentenceTuple CorpusNBest::next() {
     pos_++;
 
     // fill up the sentence tuple with sentences from all input files
-    SentenceTuple tup(curId);
+    SentenceTupleImpl tup(curId);
 
     std::string line;
     lastLines_.resize(files_.size() - 1);

@@ -74,9 +74,10 @@ SentenceTuple CorpusNBest::next() {
     if(cont && std::all_of(tup.begin(), tup.end(), [=](const Words& words) {
          return words.size() > 0 && words.size() <= maxLength_;
        }))
-      return tup;
+      return SentenceTuple(tup);
   }
-  return SentenceTuple(0);
+
+  return SentenceTuple();
 }
 
 void CorpusNBest::reset() {
@@ -109,7 +109,7 @@ SentenceTuple CorpusSQLite::next() {
   while(select_->executeStep()) {
     // fill up the sentence tuple with sentences from all input files
     size_t curId = select_->getColumn(0).getInt();
-    SentenceTuple tup(curId);
+    SentenceTupleImpl tup(curId);
 
     for(size_t i = 0; i < files_.size(); ++i) {
       auto line = select_->getColumn((int)(i + 1));

@@ -126,9 +126,9 @@ SentenceTuple CorpusSQLite::next() {
     if(std::all_of(tup.begin(), tup.end(), [=](const Words& words) {
          return words.size() > 0 && words.size() <= maxLength_;
        }))
-      return tup;
+      return SentenceTuple(tup);
   }
-  return SentenceTuple(0);
+  return SentenceTuple();
 }
 
 void CorpusSQLite::shuffle() {
@@ -236,18 +236,20 @@ public:
     return words;
   }
 
-  std::string decode(const Words& sentence, bool /*ignoreEOS*/) const override {
+  std::string decode(const Words& sentence, bool ignoreEOS) const override {
     std::string line;
     if(keepEncoded_) { // i.e. keep the sentence segmented into subword units
       for(const Word& id : sentence)
-        line += (*this)[id] + " ";
+        if(!ignoreEOS || id != getEosId())
+          line += (*this)[id] + " ";
       line.pop_back(); // trim the trailing whitespace
     } else {
       // convert vector of Word to vector of int
       std::vector<int> spmSentence;
       spmSentence.reserve(sentence.size());
       for(auto&& word : sentence)
-        spmSentence.push_back(word.toWordIndex());
+        if(!ignoreEOS || word != getEosId())
+          spmSentence.push_back(word.toWordIndex());
       spm_->Decode(spmSentence, &line);
     }
     return line;
@@ -40,7 +40,7 @@ SentenceTuple TextInput::next() {
   size_t curId = pos_++;
 
   // fill up the sentence tuple with source and/or target sentences
-  SentenceTuple tup(curId);
+  SentenceTupleImpl tup(curId);
   for(size_t i = 0; i < files_.size(); ++i) {
     std::string line;
     if(io::getline(*files_[i], line)) {

@@ -57,9 +57,9 @@ SentenceTuple TextInput::next() {
   }
 
   if(tup.size() == files_.size()) // check if each input file provided an example
-    return tup;
+    return SentenceTuple(tup);
   else if(tup.size() == 0) // if no file provided examples we are done
-    return SentenceTuple(0);
+    return SentenceTuple();
   else // neither all nor none => we have at least one missing entry
     ABORT("There are missing entries in the text tuples.");
 }
@@ -357,6 +357,13 @@ Expr gather(Expr a, int axis, Expr indices) {
   return Expression<GatherNodeOp>(a, axis, indices);
 }
 
+// scatter() -- scatter arbitrary elements along an axis; batched or non-batched
+// This is the reverse operation to gather.
+Expr scatter(Expr a, int axis, Expr indices, Expr source) {
+  return Expression<ScatterNodeOp>(a, axis, indices, source);
+}
+
+
 // index_select() -- gather arbitrary elements along an axis from an unbatched
 // input 'a'. Indices are specified as a 1D vector.
 // This is used e.g. for embedding lookup.
@@ -687,10 +687,23 @@ Expr stopGradient(Expr a);
  * @param indices The indices to be gathered
  * @returns Gathered expression with the same shape as @p indices
  * @note @p a and @p indices must have the same rank
- * @note The non-target axes of @p a and @p indicies must have the same size, or be broadcastable.
+ * @note The non-target axes of @p a and @p indices must have the same size, or be broadcastable.
  */
 Expr gather(Expr a, int axis, Expr indices);
 
+/**
+ * Scatter elements from source along an axis into a. Unindexed elements from a remain unchanged.
+ * This is the reverse operation to gather.
+ * @param a The input expression
+ * @param axis The axis along which to index
+ * @param indices The indices to be scattered
+ * @param source Expression with values to scatter.
+ * @returns Scattered expression with the same shape as @p a now containing values from @p source in positions @p indices
+ * @note @p source and @p indices must have the same rank
+ * @note In this version @p source and @p indices must have the same shape
+ */
+Expr scatter(Expr a, int axis, Expr indices, Expr source);
+
 #if 0
 // reverse operation to gather. a is the expression into which values from b are inserted at positions indices along axis,
 // with broadcasting
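As a worked illustration of the scatter() semantics documented above (plain arrays rather than marian `Expr`s; the helper below is not part of this commit): the output starts as a copy of `a`, then `out[..., indices[j]] = source[..., j]` along the last axis, which is the only axis the new `ScatterNodeOp` supports.

```cpp
// Plain-C++ illustration of scatter along the last axis.
#include <cstddef>
#include <vector>

std::vector<float> scatterLastAxis(std::vector<float> a,            // copied: unindexed values stay
                                   std::size_t lastDim,
                                   const std::vector<int>& indices,
                                   const std::vector<float>& source) {
  std::size_t rows = a.size() / lastDim;
  std::size_t k = indices.size() / rows;   // indices and source have the same shape
  for(std::size_t r = 0; r < rows; ++r)
    for(std::size_t j = 0; j < k; ++j)
      a[r * lastDim + indices[r * k + j]] = source[r * k + j];
  return a;
}

// Example: a = {0,0,0,0}, lastDim = 4, indices = {1,3}, source = {5,7}  ->  {0,5,0,7}
```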
@@ -1033,12 +1033,14 @@ struct GatherNodeOp : public NaryNodeOp {
 
   NodeOps forwardOps() override {
     return {NodeOp(
+      // @TODO: rename to gather
       Select(val_, child(0)->val(), child(1)->val(), axis_))};
   }
 
   NodeOps backwardOps() override {
     return {NodeOp(
-      Insert(child(0)->grad(), adj_, child(1)->val(), axis_))};
+      // @TODO: rename to scatter
+      Insert</*add=*/true>(child(0)->grad(), adj_, child(1)->val(), axis_))};
   }
 
   Shape newShape(Expr a, int axis, Expr indices) {

@@ -1046,7 +1048,6 @@ struct GatherNodeOp : public NaryNodeOp {
     axis = shape.axis(axis);
     auto rank = shape.size();
     ABORT_IF(rank != indices->shape().size(), "Mismatching ranks for input ({}) and indices ({})", std::string(shape), std::string(indices->shape()));
-    axis = a->shape().axis(axis);
     shape.set(axis, indices->shape()[axis]);
     for (size_t i = 0; i < rank; ++i) {
       if (i != axis) {
@@ -1086,6 +1087,62 @@ private:
   int axis_;
 };
 
+struct ScatterNodeOp : public NaryNodeOp {
+  ScatterNodeOp(Expr a, int axis, Expr indices, Expr source)
+      : NaryNodeOp({a, indices, source}, newShape(a, axis, indices, source), a->value_type()),
+        axis_(a->shape().axis(axis)) {
+    matchOrAbort<IndexType>(indices->value_type());
+  }
+
+  NodeOps forwardOps() override {
+    return {NodeOp(
+      CopyCast(val_, child(0)->val()); // @TODO: use normal copy
+      Insert</*add=*/false>(val_, child(2)->val(), child(1)->val(), axis_)
+    )};
+  }
+
+  NodeOps backwardOps() override {
+    ABORT("backward for ScatterNodeOp not yet implemented");
+  }
+
+  Shape newShape(Expr a, int axis, Expr indices, Expr source) {
+    ABORT_IF(axis != -1, "only last dimensions");
+    ABORT_IF(indices->shape() != source->shape(), "Shapes must match");
+
+    Shape shape = a->shape();
+    // @TODO: do proper checking
+    return shape;
+  }
+
+  const std::string type() override { return "scatter"; }
+
+  const std::string color() override { return "orange"; }
+
+  virtual size_t hash() override {
+    if(!hash_) {
+      size_t seed = NaryNodeOp::hash();
+      util::hash_combine(seed, axis_);
+      hash_ = seed;
+    }
+    return hash_;
+  }
+
+  virtual bool equal(Expr node) override {
+    if(!NaryNodeOp::equal(node))
+      return false;
+    auto cnode = std::dynamic_pointer_cast<ScatterNodeOp>(node);
+    if(!cnode)
+      return false;
+    if(axis_ != cnode->axis_)
+      return false;
+    return true;
+  }
+
+private:
+  friend class SerializationHelpers;
+  int axis_;
+};
+
 struct ColsNodeOp : public NaryNodeOp {
   ColsNodeOp(Expr a, Expr indices)
       : NaryNodeOp({a, indices}, newShape(a, indices), a->value_type()) {
@@ -133,7 +133,7 @@ public:
   }
 
   void backward() override {
-    Insert(/*out*/child(0)->grad(), adj_, val_, axis_);
+    Insert</*add=*/true>(/*out*/child(0)->grad(), adj_, val_, axis_);
   }
 
   const std::string type() override { return "topk"; }
@@ -309,14 +309,24 @@ Logits Output::applyAsLogits(Expr input) /*override final*/ {
     }
     return Logits(std::move(allLogits), factoredVocab_);
   } else if(shortlist_) {
-    return Logits(affineOrDot(input,
-                              shortlist_->getCachedShortWt(),
-                              shortlist_->getCachedShortb(),
+    const Shape &inputShape = input->shape();
+    assert(inputShape[1] == 1); // time dimension always 1 for decoding
+    input = reshape(input, {inputShape[0], inputShape[2], 1, inputShape[3]});
+
+    Expr Wt = shortlist_->getCachedShortWt();
+    Expr b = shortlist_->getCachedShortb();
+    Expr ret = affineShortlist(input,
+                              Wt,
+                              b,
                               false,
-                              /*transB=*/isLegacyUntransposedW ? false : true));
+                              /*transB=*/isLegacyUntransposedW ? false : true);
+    const Shape &retShape = ret->shape();
+    assert(retShape[2] == 1); // time dimension always 1 for decoding
+    ret = reshape(ret, {retShape[0], 1, retShape[1], retShape[3]});
+    return Logits(ret);
   } else {
-    return Logits(
-        affineOrDot(input, Wt_, b_, false, /*transB=*/isLegacyUntransposedW ? false : true));
+    Expr ret = affineOrDot(input, Wt_, b_, false, /*transB=*/isLegacyUntransposedW ? false : true);
+    return Logits(ret);
   }
 }
 
@@ -10,5 +10,40 @@ Ptr<DecoderState> LogSoftmaxStep::apply(Ptr<DecoderState> state) {
   return state;
 }
 
+Ptr<DecoderState> GumbelSoftmaxStep::apply(Ptr<DecoderState> state) {
+  state->setLogProbs(state->getLogProbs().applyUnaryFunctions(
+      [](Expr logits) { // lemma gets gumbelled
+        return logsoftmax(logits + constant_like(logits, inits::gumbel()));
+      },
+      logsoftmax)); // factors don't
+  return state;
+}
+
+TopkGumbelSoftmaxStep::TopkGumbelSoftmaxStep(int k) : k_{k} {}
+
+Ptr<DecoderState> TopkGumbelSoftmaxStep::apply(Ptr<DecoderState> state) {
+  state->setLogProbs(state->getLogProbs().applyUnaryFunctions(
+      [=](Expr logits) { // lemma gets gumbelled
+        // create logits-sized tensor consisting only of invalid path scores
+        float invalidPathScore = NumericLimits<float>(logits->value_type()).lowest;
+        Expr invalidLogits = constant_like(logits, inits::fromValue(invalidPathScore));
+
+        // select top-k values
+        Expr val, idx;
+        std::tie(val, idx) = topk(logits, k_, /*axis=*/-1, /*descending=*/true);
+
+        // uncomment below to display probability mass in top-k selection
+        // debug(sum(gather(softmax(logits), -1, idx), -1), "sum");
+
+        // Add Gumbel noise to top-k values only and compute logsoftmax, used for argmax sampling later in beam-search
+        Expr gumbelVal = logsoftmax(val + constant_like(val, inits::gumbel()));
+
+        // Scatter gumbelled values back into logits to fill with usable values
+        return scatter(invalidLogits, -1, idx, gumbelVal);
+      },
+      logsoftmax)); // factors don't
+  return state;
+}
+
 } // namespace models
 } // namespace marian
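The new `TopkGumbelSoftmaxStep` applies the Gumbel-max trick to the k highest-scoring logits only. A self-contained illustration of that idea on a plain vector of logits (this is not Marian code; the function name and structure are made up for exposition):

```cpp
// Illustration of Gumbel-max top-k sampling: restrict to the k largest logits,
// perturb only those with Gumbel noise, and take the argmax of the perturbed values.
#include <algorithm>
#include <cmath>
#include <limits>
#include <numeric>
#include <random>
#include <vector>

int sampleTopK(const std::vector<float>& logits, int k, std::mt19937& rng) {
  // indices of the k largest logits
  std::vector<int> idx(logits.size());
  std::iota(idx.begin(), idx.end(), 0);
  std::partial_sort(idx.begin(), idx.begin() + k, idx.end(),
                    [&](int a, int b) { return logits[a] > logits[b]; });

  // Gumbel noise g = -log(-log(u)); argmax over (logit + g) restricted to the
  // top-k set samples proportionally to the softmax of those k logits.
  std::uniform_real_distribution<float> uni(1e-9f, 1.f);
  int best = idx[0];
  float bestScore = -std::numeric_limits<float>::infinity();
  for(int j = 0; j < k; ++j) {
    float gumbel = -std::log(-std::log(uni(rng)));
    float score = logits[idx[j]] + gumbel;
    if(score > bestScore) { bestScore = score; best = idx[j]; }
  }
  return best;  // sampled vocabulary index
}
```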
@@ -297,20 +297,30 @@ public:
   virtual Ptr<DecoderState> apply(Ptr<DecoderState> state) override;
 };
 
-// Gumbel-max noising for sampling during beam-search
-// Seems to work well enough with beam-size=1. Turn on
-// with --output-sampling during translation with marian-decoder
+// Gumbel-max noising for sampling during translation.
+// Produces accurate sampling with beam=1. Turn on
+// with --output-sampling [full] during translation
+// with marian-decoder for sampling from the full
+// softmax distribution.
 class GumbelSoftmaxStep : public ILogProbStep {
 public:
   virtual ~GumbelSoftmaxStep() {}
-  virtual Ptr<DecoderState> apply(Ptr<DecoderState> state) override {
-    state->setLogProbs(state->getLogProbs().applyUnaryFunctions(
-        [](Expr logits) { // lemma gets gumbelled
-          return logsoftmax(logits + constant_like(logits, inits::gumbel()));
-        },
-        logsoftmax)); // factors don't
-    return state;
-  }
+  virtual Ptr<DecoderState> apply(Ptr<DecoderState> state) override;
 };
 
+
+// Gumbel-max noising for top-k sampling during translation.
+// Produces accurate sampling with beam=1. Turn on
+// with --output-sampling topk [10] during translation
+// with marian-decoder for top-10 sampling.
+class TopkGumbelSoftmaxStep : public ILogProbStep {
+private:
+  int k_{1};
+
+public:
+  TopkGumbelSoftmaxStep(int k);
+  virtual ~TopkGumbelSoftmaxStep() {}
+  virtual Ptr<DecoderState> apply(Ptr<DecoderState> state) override;
+};
+
 // class to wrap an IEncoderDecoder and a ILogProbStep that are executed in sequence,
@@ -38,7 +38,9 @@ EncoderDecoder::EncoderDecoder(Ptr<ExpressionGraph> graph, Ptr<Options> options)
   modelFeatures_.insert("transformer-heads");
   modelFeatures_.insert("transformer-no-projection");
   modelFeatures_.insert("transformer-dim-ffn");
+  modelFeatures_.insert("transformer-decoder-dim-ffn");
   modelFeatures_.insert("transformer-ffn-depth");
+  modelFeatures_.insert("transformer-decoder-ffn-depth");
   modelFeatures_.insert("transformer-ffn-activation");
   modelFeatures_.insert("transformer-dim-aan");
   modelFeatures_.insert("transformer-aan-depth");
@@ -370,10 +370,25 @@ Ptr<IModel> createModelFromOptions(Ptr<Options> options, usage use) {
   // add (log)softmax if requested
   if (use == usage::translation) {
     if(std::dynamic_pointer_cast<EncoderDecoder>(baseModel)) {
-      if(options->get<bool>("output-sampling", false))
-        return New<Stepwise>(std::dynamic_pointer_cast<EncoderDecoder>(baseModel), New<GumbelSoftmaxStep>());
-      else
+      if(options->hasAndNotEmpty("output-sampling")) {
+        auto sampling = options->get<std::vector<std::string>>("output-sampling", {});
+        std::string method = sampling.size() > 0 ? sampling[0] : "full";
+
+        if(method == "full" || method == "1" /*for backwards-compat when output-sampling: true in yaml file*/) {
+          LOG(info, "Output sampling from the full softmax distribution");
+          return New<Stepwise>(std::dynamic_pointer_cast<EncoderDecoder>(baseModel), New<GumbelSoftmaxStep>());
+        } else if(method == "topk") {
+          int k = sampling.size() > 1 ? std::stoi(sampling[1]) : 10;
+          if(k == 1)
+            LOG(info, "Output sampling with k=1 is equivalent to beam search with beam size 1");
+          LOG(info, "Output sampling via top-{} sampling", k);
+          return New<Stepwise>(std::dynamic_pointer_cast<EncoderDecoder>(baseModel), New<TopkGumbelSoftmaxStep>(k));
+        } else {
+          ABORT("Unknown sampling method: {}", method);
+        }
+      } else {
         return New<Stepwise>(std::dynamic_pointer_cast<EncoderDecoder>(baseModel), New<LogSoftmaxStep>());
+      }
     }
 #ifdef COMPILE_EXAMPLES
     // note: 'usage::translation' here means 'inference'
@@ -148,8 +148,7 @@ public:
 
     int dimDepth = dimModel / dimHeads;
 
-    auto output
-        = reshape(input, {dimBatch * dimBeam, dimSteps, dimHeads, dimDepth});
+    auto output = reshape(input, {dimBatch * dimBeam, dimSteps, dimHeads, dimDepth});
 
     return transpose(output, {0, 2, 1, 3}); // [dimBatch*dimBeam, dimHeads, dimSteps, dimDepth]
   }

@@ -364,9 +363,9 @@ public:
 
   Expr LayerAttention(std::string prefix,
                       Expr input,         // [-4: beam depth, -3: batch size, -2: max length, -1: vector dim]
-                      const Expr& keys,   // [-4: beam depth=1, -3: batch size, -2: max length, -1: vector dim]
-                      const Expr& values, // ...?
-                      const Expr& mask,   // [-4: batch size, -3: num heads broadcast=1, -2: max length broadcast=1, -1: max length]
+                      Expr keys,          // [-4: beam depth=1, -3: batch size, -2: max length, -1: vector dim]
+                      Expr values,        // ...?
+                      Expr mask,          // [-4: batch size, -3: num heads broadcast=1, -2: max length broadcast=1, -1: max length]
                       int dimHeads,
                       bool cache = false,
                       bool saveAttentionWeights = false) {

@@ -376,6 +375,12 @@ public:
     auto opsPre = opt<std::string>("transformer-preprocess");
     auto output = preProcess(prefix + "_Wo", opsPre, input, dropProb);
 
+    // fixes missing norm for keys and values in self-attention with pre-norm
+    if(input == keys)
+      keys = output;
+    if(input == values)
+      values = output;
+
     // multi-head self-attention over previous input
     output = MultiHead(prefix, dimModel, dimHeads, output, keys, values, mask, cache, saveAttentionWeights);
 
@@ -403,7 +408,7 @@ public:
                           opt<int>("transformer-heads"), /*cache=*/false);
   }
 
-  Expr LayerFFN(std::string prefix, Expr input) const {
+  Expr LayerFFN(std::string prefix, Expr input, bool isDecoder=false) const {
     int dimModel = input->shape()[-1];
 
     float dropProb = inference_ ? 0 : opt<float>("transformer-dropout");

@@ -411,13 +416,22 @@ public:
     auto output = preProcess(prefix + "_ffn", opsPre, input, dropProb);
 
     auto actName = opt<std::string>("transformer-ffn-activation");
 
     int dimFfn = opt<int>("transformer-dim-ffn");
     int depthFfn = opt<int>("transformer-ffn-depth");
-    float ffnDropProb
-      = inference_ ? 0 : opt<float>("transformer-dropout-ffn");
+
+    if(isDecoder) {
+      int decDimFfn = opt<int>("transformer-decoder-dim-ffn", 0);
+      if(decDimFfn != 0)
+        dimFfn = decDimFfn;
+
+      int decDepthFfn = opt<int>("transformer-decoder-ffn-depth", 0);
+      if(decDepthFfn != 0)
+        depthFfn = decDepthFfn;
+    }
+
+    ABORT_IF(depthFfn < 1, "Filter depth {} is smaller than 1", depthFfn);
+
+    float ffnDropProb = inference_ ? 0 : opt<float>("transformer-dropout-ffn");
 
     auto initFn = inits::glorotUniform(true, true, depthScaling_ ? 1.f / sqrtf((float)depth_) : 1.f);
 
     // the stack of FF layers
@@ -866,7 +880,7 @@ public:
       // remember decoder state
       decoderStates.push_back(decoderState);
 
-      query = LayerFFN(prefix_ + "_l" + layerNo + "_ffn", query); // [-4: beam depth=1, -3: batch size, -2: max length, -1: vector dim]
+      query = LayerFFN(prefix_ + "_l" + layerNo + "_ffn", query, /*isDecoder=*/true); // [-4: beam depth=1, -3: batch size, -2: max length, -1: vector dim]
 
       checkpoint(query);
     }
@@ -24,6 +24,10 @@ void IsNaN(const Tensor /*in*/, Ptr<Allocator> /*allocator*/, bool& /*isNaN*/, bool& /*isInf*/) {
   ABORT("Not implemented");
 }
 
+bool SanitizeGradient(marian::Tensor /*in*/, Ptr<Allocator> /*allocator*/, bool /*pruneNaN*/, bool /*clipInf*/) {
+  ABORT("Not implemented");
+}
+
 template <bool add, typename To, typename From>
 void CopyCastTo(To* out, const From* in, int length) {
   for(int i = 0; i < length; ++i)

@@ -735,6 +739,7 @@ void Select(Tensor out,
   }
 }
 
+template <bool add>
 void Insert(Tensor out,
             const Tensor in,
             const Tensor indices,

@@ -756,10 +761,16 @@ void Insert(Tensor out,
     int idxIndex = idxShape.bindex(dims); // broadcast index into indices tensor
     dims[axisCPU] = (int)indices->data<IndexType>()[idxIndex];
     int outIndex = outShape.index(dims);
-    out->data()[outIndex] += in->data()[index];
+    if(add)
+      out->data()[outIndex] += in->data()[index];
+    else
+      out->data()[outIndex] = in->data()[index];
   }
 }
 
+template void Insert<true>(Tensor out, const Tensor in, const Tensor indices, int axis);
+template void Insert<false>(Tensor out, const Tensor in, const Tensor indices, int axis);
+
 void GRUFastForward(Tensor out_, std::vector<Tensor> inputs, bool final) {
   int rows = out_->shape().elements() / out_->shape().back();
   int cols = out_->shape().back();
@ -29,7 +29,9 @@ __global__ void gElement(
      indices[i] = tensors[i].shape().bindex(dims);
    }

    tensors[0].data()[index] = functional::apply(functor, tensors, indices);
    // This performs the internal application of the functor in float32 regardless of the input type.
    // It seems there are no speed penalties but improved precision.
    tensors[0].data()[index] = (T)functional::applyWithCast<float>(functor, tensors, indices);
    }
  }
}
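The replacement line above follows a promote-compute-demote pattern: operands are widened to float32 for the arithmetic and only the final result is cast back to the storage type T (e.g. half). A minimal standalone sketch of that idea, outside of Marian's functional framework (all names here are illustrative, not from the diff):

    #include <cstdio>

    // Compute in float32, store back in the (possibly lower-precision) type T.
    template <typename T, class Op>
    T applyInFloat32(Op op, T a, T b) {
      float r = op(static_cast<float>(a), static_cast<float>(b)); // arithmetic in float32
      return static_cast<T>(r);                                   // demote to storage type
    }

    int main() {
      auto add = [](float x, float y) { return x + y; };
      std::printf("%f\n", applyInFloat32(add, 0.1f, 0.2f)); // T=float used only for the demo
      return 0;
    }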
@ -65,13 +67,7 @@ void Element(Functor functor, Tensor out, Tensors... tensors) {
    ElementTyped<float>(functor, out, tensors...);
  } else if(out->type() == Type::float16) {
#if COMPILE_FP16
    std::vector<marian::Tensor> ts({out, tensors...});
    bool div2 = std::all_of(ts.cbegin(), ts.cend(), [](marian::Tensor t){ return t->shape()[-1] % 2 == 0; });
    if(div2) {
      ElementTyped<halfx2>(functor, out, tensors...);
    } else {
      ElementTyped<half>(functor, out, tensors...);
    }
    ElementTyped<half>(functor, out, tensors...);
#else
    ABORT("FP16 not supported with chosen current hardware or CUDA version");
#endif
@ -562,7 +562,11 @@ void ProdBatchedLegacy(marian::Tensor C,
    ProdBatchedTypedLegacy<float, float>(C, allocator, A, B, transA, transB, beta, scalar);
#if COMPILE_FP16
  } else if(C->type() == Type::float16) { // not a *.cu file
    ProdBatchedTypedLegacy<half, half>(C, allocator, A, B, transA, transB, __float2half(beta), __float2half(scalar));
    // we use computeType=float here for fp16 training as this seems more stable and roughly as fast
    ProdBatchedTypedLegacy<half, float>(C, allocator, A, B, transA, transB, beta, scalar);

    // original for reference:
    // ProdBatchedTypedLegacy<half, half>(C, allocator, A, B, transA, transB, __float2half(beta), __float2half(scalar));
#endif
  } else {
    ABORT("ProdBatchedLegacy not implemented for element type {}", C->type());
@ -16,15 +16,12 @@ namespace gpu {
namespace atomics {

static inline __device__ void atomicAdd(float *address, float val) {
  //*address += val;
  ::atomicAdd(address, val);
}

#if COMPILE_FP16
// @TODO: copied from CuTorch, adapt this better, give credit.
static inline __device__ void atomicAdd(half *address, half val) {
  //*address += val;

#if __CUDA_ARCH__ >= 700 && CUDA_VERSION >= 10000 // compute capability 70 and higher with CUDA 10
  ::atomicAdd(address, val);
#else // __CUDA_ARCH__ < 700
@ -50,7 +47,8 @@ static inline __device__ void atomicAdd(half *address, half val) {
  } while (assumed != old);
#endif // __CUDA_ARCH__
}
#endif
#endif // COMPILE_FP16


}
@ -96,6 +94,81 @@ void IsNaN(const Tensor in, Ptr<Allocator> allocator, bool& isNaN, bool& isInf)
  cudaStreamSynchronize(0);
}

template <typename T>
__global__ void gSanitizeGradient(T* in, int length,
                                  bool* isNaN, bool* isInf,
                                  bool pruneNaN, bool clipInf,
                                  float forNaN = 0.f, float forInf = 65504.f, float forInfNeg = -65504.f) {
  for(int bid = 0; bid < length; bid += blockDim.x * gridDim.x) {
    int index = bid + blockDim.x * blockIdx.x + threadIdx.x;
    if(index < length) {
      float v = (float)in[index];
      // handle NaN
      if(isnan(v)) {
        if(pruneNaN) {
          in[index] = (T)forNaN;
        } else {
          *isNaN = true;
        }
      }
      // handle +/- Inf
      if(isinf(v)) {
        if(clipInf) {
          in[index] = v > 0 ? (T)forInf : (T)forInfNeg;
        } else {
          *isInf = true;
        }
      }
    }
  }
}

// This function is meant to clean gradients, i.e. clip infinities and prune NaNs if required.
// If all NaNs and Infs have been removed we return `true` for indicating a sane gradient.
// If `clipInf` is set, infinities are replaced with the maximum/minimum non-inf value for the tensor.
// In that case infinities do not result in a bad gradient, since they get clipped.
// If `pruneNaN` is set, NaNs are replaced with 0. Since NaNs get removed now they do not result
// in a bad gradient.
// If NaNs or infinities are detected but not removed (either because of `pruneNaN=false` or `clipInf=false`),
// we return `false` indicating a bad gradient.
bool SanitizeGradient(marian::Tensor in, Ptr<Allocator> allocator, bool pruneNaN, bool clipInf) {
  cudaSetDevice(in->getDeviceId().no);

  int length = in->size();

  int threads = std::min(MAX_THREADS, length);
  int blocks = std::min(MAX_BLOCKS, length / threads + (length % threads != 0));

  auto mem = allocator->alloc<bool>(2);
  bool* dIsNaN = &mem->data<bool>()[0];
  bool* dIsInf = &mem->data<bool>()[1];
  fill(in->getBackend(), dIsNaN, dIsNaN + 2, false);

  float forNaN = 0.f;
  float forInf = NumericLimits<float>(in->type()).max;
  float forInfNeg = NumericLimits<float>(in->type()).lowest;

  if(in->type() == Type::float32) {
    gSanitizeGradient<<<blocks, threads>>>(in->data<float>(), length, dIsNaN, dIsInf, pruneNaN, clipInf, forNaN, forInf, forInfNeg);
#if COMPILE_FP16
  } else if(in->type() == Type::float16) {
    gSanitizeGradient<<<blocks, threads>>>(in->data<half>(), length, dIsNaN, dIsInf, pruneNaN, clipInf, forNaN, forInf, forInfNeg);
#endif
  } else {
    ABORT("gSanitizeGradient for type {} not implemented", in->type());
  }

  bool isNaN, isInf;
  CudaCopy(dIsNaN, dIsNaN + 1, &isNaN);
  CudaCopy(dIsInf, dIsInf + 1, &isInf);

  allocator->free(mem);

  cudaStreamSynchronize(0);

  return !isNaN && !isInf;
}
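A sketch of how this routine is driven from the training side, mirroring the checkNanOrNorm changes further below in this commit (the surrounding variable names are assumptions for illustration, not code from the diff):

    // Prune NaNs only once the cost-scaling factor sits at its minimum, always clip infinities,
    // and signal a skipped update by propagating a NaN norm when the gradient is still bad.
    bool pruneNaN = costScalingFactor == costScalingFactorMinimum;
    bool clipInf  = true;
    if(!SanitizeGradient(grad, allocator, pruneNaN, clipInf))
      gradNorm = std::numeric_limits<float>::quiet_NaN(); // caller treats a NaN norm as "skip this update"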

template <bool add, typename To, typename From>
__global__ void gCopyCastTo(To* out, const From* in, int length) {
  for(int bid = 0; bid < length; bid += blockDim.x * gridDim.x) {
@ -1090,7 +1163,7 @@ void PasteRows(Tensor out,
  size_t rowsToCopy = indices->size();

  int threads = std::min(MAX_THREADS, (int)cols);
#if 1 // @TODO: make this configurable with a 'deterministic' flag
#if 0 // @TODO: make this configurable with a 'deterministic' flag
  // If we only use one block, then each core operates on a different column,
  // hence the summation becomes deterministic.
  // However, we only use e.g. 512 cores out of possibly 3000+, so this will be
@ -1236,7 +1309,7 @@ __global__ void gSelect(T* out,
  }
}

template <typename T>
template <bool add, typename T>
__global__ void gInsert(T* out,
                        functional::Shape outShape,
                        const T* in,
@ -1254,7 +1327,10 @@ __global__ void gInsert(T* out,
      int idxIndex = idxShape.bindex(dims); // broadcast index into indices tensor
      dims[axis] = (int)d_indices[idxIndex];
      int outIndex = outShape.index(dims);
      out[outIndex] += in[index]; // this is probably wrong, atomicAdd?
      if(add)
        out[outIndex] += in[index]; // this is probably wrong, atomicAdd?
      else
        out[outIndex] = in[index];
    }
  }
}
@ -1276,21 +1352,21 @@ void Select(Tensor out,

  if(out->type() == Type::float32) {
    gSelect<<<blocks, threads>>>(out->data<float>(),
                                 out->shape(),
                                 in->data<float>(),
                                 in->shape(),
                                 axisGPU,
                                 indices->data<IndexType>(),
                                 indices->shape());
                                 out->shape(),
                                 in->data<float>(),
                                 in->shape(),
                                 axisGPU,
                                 indices->data<IndexType>(),
                                 indices->shape());
#if COMPILE_FP16
  } else if (out->type() == Type::float16) {
    gSelect<<<blocks, threads>>>(out->data<half>(),
                                 out->shape(),
                                 in->data<half>(),
                                 in->shape(),
                                 axisGPU,
                                 indices->data<IndexType>(),
                                 indices->shape());
                                 out->shape(),
                                 in->data<half>(),
                                 in->shape(),
                                 axisGPU,
                                 indices->data<IndexType>(),
                                 indices->shape());
#endif
  } else if(out->type() == Type::uint32) {
    gSelect<<<blocks, threads>>>(out->data<IndexType>(),
@ -1305,6 +1381,7 @@ void Select(Tensor out,
  }
}

template <bool add>
void Insert(Tensor out,
            const Tensor in,
            const Tensor indices,
@ -1320,28 +1397,31 @@ void Insert(Tensor out,
  int axisGPU = axis + functional::Shape::size() - out->shape().size();

  if(out->type() == Type::float32) {
    gInsert<<<blocks, threads>>>(out->data<float>(),
                                 out->shape(),
                                 in->data<float>(),
                                 in->shape(),
                                 axisGPU,
                                 indices->data<IndexType>(),
                                 indices->shape());
    gInsert<add><<<blocks, threads>>>(out->data<float>(),
                                      out->shape(),
                                      in->data<float>(),
                                      in->shape(),
                                      axisGPU,
                                      indices->data<IndexType>(),
                                      indices->shape());
#if COMPILE_FP16
  } else if (out->type() == Type::float16) {
    gInsert<<<blocks, threads>>>(out->data<half>(),
                                 out->shape(),
                                 in->data<half>(),
                                 in->shape(),
                                 axisGPU,
                                 indices->data<IndexType>(),
                                 indices->shape());
    gInsert<add><<<blocks, threads>>>(out->data<half>(),
                                      out->shape(),
                                      in->data<half>(),
                                      in->shape(),
                                      axisGPU,
                                      indices->data<IndexType>(),
                                      indices->shape());
#endif
  } else {
    ABORT("Insert not implemented for type {}", out->type());
  }
}

template void Insert<true>(Tensor out, const Tensor in, const Tensor indices, int axis);
template void Insert<false>(Tensor out, const Tensor in, const Tensor indices, int axis);

template <typename T>
__global__ void gGRUFastForward(T* out,
                                const T* state,
@ -1355,7 +1435,7 @@ __global__ void gGRUFastForward(T* out,
  for(int bid = 0; bid < rows; bid += gridDim.x) {
    int j = bid + blockIdx.x;
    if(j < rows) {
      T m = !mask || mask[j];
      float m = !mask || mask[j];
      T* rowOut = out + j * cols;
      const T* rowState = state + j * cols;

@ -1365,21 +1445,21 @@ __global__ void gGRUFastForward(T* out,
      for(int tid = 0; tid < cols; tid += blockDim.x) {
        int i = tid + threadIdx.x;
        if(i < cols) {
          T r = functional::Ops<T>::sigmoid(xWrow[i] + sUrow[i] + b[i]);
          float r = functional::Ops<float>::sigmoid((float)xWrow[i] + (float)sUrow[i] + (float)b[i]);

          int k = i + cols;

          T z = functional::Ops<T>::sigmoid(xWrow[k] + sUrow[k] + b[k]);
          float z = functional::Ops<float>::sigmoid((float)xWrow[k] + (float)sUrow[k] + (float)b[k]);

          int l = i + 2 * cols;
          T h;
          float h;
          if(final)
            h = functional::Ops<T>::tanh(xWrow[l] + (sUrow[l] + b[l]) * r);
            h = functional::Ops<float>::tanh((float)xWrow[l] + ((float)sUrow[l] + (float)b[l]) * r);
          else
            h = functional::Ops<T>::tanh(xWrow[l] + sUrow[l] * r + b[l]);
            h = functional::Ops<float>::tanh((float)xWrow[l] + (float)sUrow[l] * r + (float)b[l]);

          T out = ((T)1.f - z) * h + z * rowState[i];
          rowOut[i] = m * out + ((T)1.f - m) * rowState[i];
          float out = (1.f - z) * h + z * (float)rowState[i];
          rowOut[i] = (T)(m * out + (1.f - m) * (float)rowState[i]);
        }
      }
    }
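For reference, the gate arithmetic this kernel evaluates (now carried out internally in float32) is the standard GRU cell, reading the column blocks i, i+cols and i+2*cols as the reset, update and candidate gates; with previous state s, mask m and the final flag selecting where the bias enters:

    \begin{aligned}
    r &= \sigma(xW_r + sU_r + b_r) \\
    z &= \sigma(xW_z + sU_z + b_z) \\
    h &= \tanh\big(xW_h + (sU_h + b_h)\,r\big) \;\text{ if final, else }\; \tanh\big(xW_h + sU_h\,r + b_h\big) \\
    o &= (1 - z)\,h + z\,s \\
    \text{out} &= m\,o + (1 - m)\,s
    \end{aligned}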
@ -1441,7 +1521,7 @@ __global__ void gGRUFastBackward(T* outState,
  for(int bid = 0; bid < rows; bid += gridDim.x) {
    int j = bid + blockIdx.x;
    if(j < rows) {
      T m = !mask || mask[j];
      float m = !mask || mask[j];

      T* rowOutState = outState + j * cols;
      T* rowOutXW = outXW + j * cols * 3;
@ -1459,56 +1539,56 @@ __global__ void gGRUFastBackward(T* outState,
          int k = i + cols;
          int l = i + 2 * cols;

          T r = functional::Ops<T>::sigmoid(rowXW[i] + rowSU[i] + b[i]);
          T z = functional::Ops<T>::sigmoid(rowXW[k] + rowSU[k] + b[k]);
          float r = functional::Ops<float>::sigmoid((float)rowXW[i] + (float)rowSU[i] + (float)b[i]);
          float z = functional::Ops<float>::sigmoid((float)rowXW[k] + (float)rowSU[k] + (float)b[k]);

          T h;
          float h;
          if(final)
            h = functional::Ops<T>::tanh(rowXW[l] + (rowSU[l] + b[l]) * r);
            h = functional::Ops<float>::tanh((float)rowXW[l] + ((float)rowSU[l] + (float)b[l]) * r);
          else
            h = functional::Ops<T>::tanh(rowXW[l] + rowSU[l] * r + b[l]);
            h = functional::Ops<float>::tanh((float)rowXW[l] + (float)rowSU[l] * r + (float)b[l]);

          T adj = rowAdj[i];
          float adj = rowAdj[i];

          T t = ((T)1.f - z) * ((T)1.f - h * h);
          float t = (1.f - z) * (1.f - h * h);

          // df/ds
          if(outState)
            rowOutState[i] += (m * z - m + (T)1.f) * adj;
            rowOutState[i] += (T)((m * z - m + 1.f) * adj);

          // df/d(xW_r) ...
          T dfdxW_r = m * r * ((T)1.f - r) * t * adj;
          float dfdxW_r = m * r * (1.f - r) * t * adj;
          if(final)
            dfdxW_r *= rowSU[l] + b[l];
            dfdxW_r *= (float)rowSU[l] + (float)b[l];
          else
            dfdxW_r *= rowSU[l];
            dfdxW_r *= (float)rowSU[l];
          if(outXW)
            rowOutXW[i] += dfdxW_r;
            rowOutXW[i] += (T)dfdxW_r;
          if(outSU)
            rowOutSU[i] += dfdxW_r;
            rowOutSU[i] += (T)dfdxW_r;
          if(outB)
            rowOutB[i] += dfdxW_r;
            rowOutB[i] += (T)dfdxW_r;

          // df/d(xW_z) ...
          T dfdxW_z = m * ((T)1.f - z) * z * (rowState[i] - h) * adj;
          float dfdxW_z = m * (1.f - z) * z * ((float)rowState[i] - h) * adj;
          if(outXW)
            rowOutXW[k] += dfdxW_z;
            rowOutXW[k] += (T)dfdxW_z;
          if(outSU)
            rowOutSU[k] += dfdxW_z;
            rowOutSU[k] += (T)dfdxW_z;
          if(outB)
            rowOutB[k] += dfdxW_z;
            rowOutB[k] += (T)dfdxW_z;

          // df/d(xW_x) ...
          T dfdxW_x = m * t * adj;
          float dfdxW_x = m * t * adj;
          if(outXW)
            rowOutXW[l] += dfdxW_x;
            rowOutXW[l] += (T)dfdxW_x;
          if(outSU)
            rowOutSU[l] += dfdxW_x * r;
            rowOutSU[l] += (T)(dfdxW_x * r);
          if(outB)
            if(final)
              rowOutB[l] += dfdxW_x * r;
              rowOutB[l] += (T)(dfdxW_x * r);
            else
              rowOutB[l] += dfdxW_x;
              rowOutB[l] += (T)dfdxW_x;
        }
      }
    }
@ -41,6 +41,25 @@ DISPATCH2(CopyCast, marian::Tensor, const marian::Tensor);
DISPATCH2(AddCast, marian::Tensor, const marian::Tensor);
DISPATCH4(IsNaN, const Tensor, Ptr<Allocator>, bool&, bool&);

#ifdef CUDA_FOUND
namespace gpu {
bool SanitizeGradient(marian::Tensor in, Ptr<Allocator> allocator, bool pruneNaN, bool clipInf);
}
#endif

namespace cpu {
bool SanitizeGradient(marian::Tensor in, Ptr<Allocator> allocator, bool pruneNaN, bool clipInf);
}

static inline bool SanitizeGradient(marian::Tensor in, Ptr<Allocator> allocator, bool pruneNaN, bool clipInf) {
#ifdef CUDA_FOUND
  if(in->getBackend()->getDeviceId().type == DeviceType::gpu)
    return gpu::SanitizeGradient(in, allocator, pruneNaN, clipInf);
  else
#endif
    return cpu::SanitizeGradient(in, allocator, pruneNaN, clipInf);
}

template <class Functor, class... Tensors>
void Element(Functor functor, marian::Tensor out, Tensors... tensors) {
#ifdef CUDA_FOUND
@ -278,7 +297,28 @@ DISPATCH3(CopyCols, marian::Tensor, const marian::Tensor, const marian::Tensor)
DISPATCH3(PasteCols, marian::Tensor, const marian::Tensor, const marian::Tensor)

DISPATCH4(Select, marian::Tensor, const marian::Tensor, const marian::Tensor, int)
DISPATCH4(Insert, marian::Tensor, const marian::Tensor, const marian::Tensor, int)

#ifdef CUDA_FOUND
namespace gpu {
template <bool add>
void Insert(Tensor out, const Tensor in, const Tensor indices, int axis);
}
#endif

namespace cpu {
template <bool add>
void Insert(Tensor out, const Tensor in, const Tensor indices, int axis);
}

template <bool add>
static inline void Insert(Tensor out, const Tensor in, const Tensor indices, int axis) {
#ifdef CUDA_FOUND
  if(out->getBackend()->getDeviceId().type == DeviceType::gpu)
    gpu::Insert<add>(out, in, indices, axis);
  else
#endif
    cpu::Insert<add>(out, in, indices, axis);
}
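The boolean template parameter chooses between accumulate and overwrite semantics, matching the CPU and GPU kernels changed above. A usage sketch (tensor setup omitted; the names and the axis value are illustrative assumptions):

    // Accumulate: out[..., indices[j], ...] += in[..., j, ...]  (e.g. scattering gradients)
    Insert<true>(gradOut, gradIn, indices, /*axis=*/-2);
    // Overwrite:  out[..., indices[j], ...]  = in[..., j, ...]  (plain scatter)
    Insert<false>(valsOut, valsIn, indices, /*axis=*/-2);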

DISPATCH7(TopK, marian::Tensor, marian::Tensor, Ptr<Allocator>, const marian::Tensor, int, int, bool);

@ -10,25 +10,19 @@ GraphGroup::GraphGroup(Ptr<Options> options, Ptr<IMPIWrapper> mpi)
      mbRoundUp_(options_->get<bool>("mini-batch-round-up", true)) {
  if(options_->hasAndNotEmpty("cost-scaling")) {
    auto vcs = options_->get<std::vector<std::string>>("cost-scaling");
    costScale_ = true;
    float costExponent = std::stof(vcs[0]);
    costScaleFactor_ = std::pow(2.0f, costExponent);

    if(vcs.size() > 1) costScaleFreq_ = std::stoul(vcs[1]);
    if(vcs.size() > 2) costScaleMultiplier_ = std::stof(vcs[2]);
    if(vcs.size() > 3) costScaleNanTolerance_ = std::stof(vcs[3]);
    if(vcs.size() > 4) costScaleNanRange_ = std::stoul(vcs[4]);
    if(vcs.size() > 5) costScaleFactorMinimum_ = std::stof(vcs[5]);

    costScaling_ = true;
    costScalingFactor_ = std::stof( vcs[0]);
    if(vcs.size() > 1) costScalingFreq_ = std::stoul(vcs[1]);
    if(vcs.size() > 2) costScalingMultiplier_ = std::stof( vcs[2]);
    if(vcs.size() > 3) costScalingFactorMinimum_ = std::stof( vcs[3]);

    LOG_ONCE(info,
             "Training with cost scaling - factor: 2^{} = {}, frequency: {}, multiplier: {}, tolerance: {}, range: {}, minimum: {}",
             costExponent,
             costScaleFactor_,
             costScaleFreq_,
             costScaleMultiplier_,
             costScaleNanTolerance_,
             costScaleNanRange_,
             costScaleFactorMinimum_);
             "Training with cost scaling - factor: {}, frequency: {}, multiplier: {}, minimum: {}",
             costScalingFactor_,
             costScalingFreq_,
             costScalingMultiplier_,
             costScalingFactorMinimum_);
  }
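The replacement code reads the option values positionally rather than as a power-of-two exponent. Purely as an illustration of that mapping (the numbers below are hypothetical, not defaults taken from this commit), a cost-scaling value of "256 1000 2 1" would be interpreted as:

    std::vector<std::string> vcs = {"256", "1000", "2", "1"};  // hypothetical option value
    float  factor     = std::stof (vcs[0]);  // 256: loss (and hence gradient) multiplier
    size_t freq       = std::stoul(vcs[1]);  // 1000: NaN-free updates before the factor is raised
    float  multiplier = std::stof (vcs[2]);  // 2: step applied when raising/lowering the factor
    float  minimum    = std::stof (vcs[3]);  // 1: the factor is never lowered below this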

  if(options_->hasAndNotEmpty("dynamic-gradient-scaling")) {
@ -37,11 +31,16 @@ GraphGroup::GraphGroup(Ptr<Options> options, Ptr<IMPIWrapper> mpi)

    if(vgc.size() > 0) dynamicGradientScalingFactor_ = std::stof(vgc[0]);
    if(vgc.size() > 1) dynamicGradientScalingUseLogs_ = vgc[1] == "log";
    if(vgc.size() > 2) dynamicGradientScalingFadeout_ = std::stoul(vgc[2]);

    LOG_ONCE(info,
             "Re-scaling gradient to have average gradient norm if (log={}) gradient norm diverges from average by {} sigmas",
             dynamicGradientScalingUseLogs_,
             dynamicGradientScalingFactor_);
    if(dynamicGradientScalingFadeout_ > 0)
      LOG_ONCE(info,
               "Dynamic gradient re-scaling will fade out linearly after {} updates",
               dynamicGradientScalingFadeout_);
  }

  if(options_->get<bool>("check-gradient-nan")) {
@ -96,21 +95,17 @@ void GraphGroup::initGraphsAndOpts() {
// given number of iterations. Usually we increase by 2 which adds
// one more bit for precision.
void GraphGroup::increaseCostScaleFactor() {
  if(!costScale_)
  if(!costScaling_)
    return;

  noNanSeen_++;

  size_t total = nanSeen_ + noNanSeen_;
  float nanPercent = noNanSeen_ == (float)nanSeen_ / (float)total; // total is at least 1 because of noNanSeen_++

  if(noNanSeen_ % costScaleFreq_ == 0) {
    costScaleFactor_ *= costScaleMultiplier_;
    LOG(debug,
        "NaN/Inf percentage {:.2f} after {} gradient updates. Increasing cost-scaling factor to {}",
        nanPercent,
        total,
        costScaleFactor_);
  if(noNanSeen_ % costScalingFreq_ == 0) {
    costScalingFactor_ *= costScalingMultiplier_;
    if(isMainProcess())
      LOG(debug, "No NaN/Inf after {} gradient updates. Increasing cost-scaling factor to {}", total, costScalingFactor_);

    // Resetting counts after cost-scale change
    noNanSeen_ = 0;
@ -120,48 +115,56 @@ void GraphGroup::increaseCostScaleFactor() {

// call when a NaN was seen to decrease cost-scaling factor
void GraphGroup::decreaseCostScaleFactor() {
  if(!costScale_)
  if(!costScaling_)
    return;

  nanSeen_++;

  size_t total = nanSeen_ + noNanSeen_;
  float nanPercent = (float)nanSeen_ / (float)total; // total is at least 1 because of nanSeen_++
  if(total >= costScaleNanRange_ && nanPercent > costScaleNanTolerance_) {
    if(costScaleFactor_ > costScaleFactorMinimum_) {
      costScaleFactor_ /= costScaleMultiplier_;
      LOG(debug,
          "NaN/Inf percentage {:.2f} in {} gradient updates, reducing cost-scaling factor to {}",
          nanPercent,
          total,
          costScaleFactor_);
    } else {
      // @TODO: think if should this rather abort?
      LOG(warn,
          "NaN/Inf percentage {:.2f} in {} gradient updates, but cost-scaling factor {} is already at minimum",
          nanPercent,
          total,
          costScaleFactor_);
    }

    // Resetting counts after cost-scale change
    noNanSeen_ = 0;
    nanSeen_ = 0;
  // do not reduce cost-scaling factor below minimum
  if(costScalingFactor_ > costScalingFactorMinimum_)
    costScalingFactor_ /= costScalingMultiplier_;

  if(isMainProcess()) {
    if(costScalingFactor_ > costScalingFactorMinimum_)
      LOG(debug, "Seen NaN/Inf after {} gradient updates. Reduced cost-scaling factor to {}", total, costScalingFactor_);
    else
      LOG(debug, "Seen NaN/Inf after {} gradient updates, Reduced cost-scaling factor to minimum {}. Pruning NaNs now.", total, costScalingFactor_);
  }

  // Resetting counts after cost-scale change
  noNanSeen_ = 0;
  nanSeen_ = 0;
}

float GraphGroup::checkNanOrNorm(size_t i, size_t begin, size_t end) {
  auto curGrad = graphs_[i]->params()->grads()->subtensor(begin, end-begin);

  if(checkGradientNan_ || costScale_) {
    bool hasNan = false, hasInf = false;
    IsNaN(curGrad, graphs_[i]->allocator(), hasNan, hasInf); // @TODO: make safe with different compiler options
    if(hasNan || hasInf) {
      LOG(debug, "Found Nan ({}) or Inf ({})", hasNan, hasInf);
  // If costScaling_ then check for NaN values if the costScalingFactor_ is larger than
  // the minimum. If a NaN value is seen we exit here and will reduce the factor next and
  // this skips an update.
  // If costScalingFactor_ is already at the minimum, prune the NaN values away. This replaces
  // NaNs with 0. Updates are not skipped any more.
  // Regardless of NaNs, we clip +/-inf to the largest corresponding values for the gradient value type.
  // This changes the gradient but seems to be quite stable. In effect, for fp16 this is equivalent
  // to gradient clipping at (65504.f / costScalingFactor_) which in most cases is still large.
  if(costScaling_ || checkGradientNan_) {
    bool pruneNaN = !checkGradientNan_ && costScalingFactor_ == costScalingFactorMinimum_;
    bool clipInf = !checkGradientNan_;
    bool saneGradient = SanitizeGradient(curGrad, graphs_[i]->allocator(), pruneNaN, clipInf);

    // This should never happen, if it does, something is wrong with the kernel above and needs to be fixed.
    ABORT_IF(pruneNaN && clipInf && !saneGradient, "We are removing NaNs and clipping Infs, but gradient is still not sane??");

    if(!saneGradient) {
      LOG(debug, "Found NaN");
      return std::numeric_limits<float>::quiet_NaN();
    }
  }
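    // Illustrative arithmetic (numbers not from the diff): with a hypothetical costScalingFactor_ of 128,
    // clipping the scaled fp16 gradient at 65504.f corresponds to clipping the unscaled gradient at
    // 65504.f / 128 = 511.75, i.e. still a very permissive threshold.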


  // The optional clipping above will affect the norm here. The norm can be non-finite despite the above
  // gradient sanitization, hence check again and propagate a NaN.
  if(dynamicGradientScaling_) {
    auto gNorm = L2Norm(curGrad, graphs_[i]->allocator());
    if(isFinite(gNorm) && gNorm > 0.0)
@ -197,8 +200,8 @@ float GraphGroup::executeAndCollectNorm(const std::function<float(size_t, size_t
float GraphGroup::computeNormalizationFactor(float gNorm, size_t updateTrgWords) {
  float normalizationFactor = 1.f;

  if(costScale_)
    normalizationFactor *= costScaleFactor_;
  if(costScaling_)
    normalizationFactor *= costScalingFactor_;

  if(options_->get<bool>("normalize-gradient"))
    normalizationFactor *= updateTrgWords;
@ -207,9 +210,9 @@ float GraphGroup::computeNormalizationFactor(float gNorm, size_t updateTrgWords)
    return normalizationFactor;

  if(dynamicGradientScaling_) {
    // make gradient norm invariant to changes in costScaleFactor_, luckily norm(c * g) = c * norm(g)
    if(costScale_)
      gNorm = gNorm / costScaleFactor_;
    // make gradient norm invariant to changes in costScalingFactor_, luckily norm(c * g) = c * norm(g)
    if(costScaling_)
      gNorm = gNorm / costScalingFactor_;

    // Normalize gradient norm w.r.t. number of labels in batch for statistics,
    // there should be no gradient normalization before this point, @TODO: check this
@ -231,11 +234,17 @@ float GraphGroup::computeNormalizationFactor(float gNorm, size_t updateTrgWords)
    auto deltaTransform = gNormTransform - gNormAvgTransform; // compute the difference between the current transformer gradient norm and the running average.
    auto gNormStdTransform = std::sqrt(gNormVarTransform); // compute STD for the running average of (log) gradient norms.

    float fadeoutMultiplier = 1.f;
    if(dynamicGradientScalingFadeout_ > 0ul) // fade out linearly after that many updates @TODO: allow units other than updates
      fadeoutMultiplier = (float)std::max(dynamicGradientScalingFadeout_, scheduler_->numberOfBatches()) / (float)dynamicGradientScalingFadeout_;

    float dynamicGradientScalingFactorWithFadeout = dynamicGradientScalingFactor_ * fadeoutMultiplier; // if fadeoutMultiplier increases dynamic gradient scaling becomes less and less likely to happen over time.
    // delta of (log) gradient norm vs (log) gradient norm average is larger than N standard deviations
    // hence rescale gradient using the average.
    if(scheduler_->numberOfBatches() >= window && deltaTransform > dynamicGradientScalingFactor_ * gNormStdTransform) {
      LOG(debug, "log gradient norms: {} :: {:.4f} - {:.4f} = {:.4f} > {:.4f} * {:.4f}",
          dynamicGradientScalingUseLogs_, gNormTransform, gNormAvgTransform, deltaTransform, dynamicGradientScalingFactor_, gNormStdTransform);
    if(scheduler_->numberOfBatches() >= window && deltaTransform > dynamicGradientScalingFactorWithFadeout * gNormStdTransform) {
      if(isMainProcess())
        LOG(debug, "log gradient norms: {} :: {:.4f} - {:.4f} = {:.4f} > {:.4f} * {:.4f} - scaling gradient by {:.4f}",
            dynamicGradientScalingUseLogs_, gNormTransform, gNormAvgTransform, deltaTransform, dynamicGradientScalingFactorWithFadeout, gNormStdTransform, gNormAvg / gNorm);

      normalizationFactor *= gNorm / gNormAvg; // since we later do gradient / normalizationFactor this divides by norm and multiplies by the average, rescaling to the average.
    }
@ -288,9 +297,7 @@ void GraphGroup::load(const OptimizerBase::ScatterStateFunc& scatterFn) {
    restoreFromCheckpoint(modelFileName, scatterFn);
  } else if(options_->hasAndNotEmpty("pretrained-model")) {
    std::string nameInit = options_->get<std::string>("pretrained-model");
    LOG(info,
        "[training] Initializing model weights with pre-trained model {}",
        nameInit);
    LOG(info, "[training] Initializing model weights with pre-trained model {}", nameInit);

    size_t i = 0;
    for(auto graph : graphs_)
@ -60,21 +60,21 @@ protected:
  double typicalTrgBatchWords_{0}; // for dynamic batch sizing: typical batch size in words
  bool mbRoundUp_{true}; // round up batches for more efficient training but can make batch size less stable, disable with --mini-batch-round-up=false

  bool costScale_{false};
  float costScaleFactor_{1.f}; // @TODO, add current costScaleFactor_ to trainingState for serialization
  size_t costScaleFreq_{2000};
  float costScaleMultiplier_{2.f};
  float costScaleNanTolerance_{0.f};
  size_t costScaleNanRange_{1};
  float costScaleFactorMinimum_{1.f}; // @TODO make this configureable
  bool costScaling_{false};
  float costScalingFactor_{1.f}; // @TODO, add current costScalingFactor_ to trainingState for serialization
  size_t costScalingFreq_{2000};
  float costScalingMultiplier_{2.f};
  float costScalingFactorMinimum_{1.f};

  size_t noNanSeen_{0}; // @TODO, add current noNanSeen_ to trainingState for serialization
  size_t nanSeen_{0};

  bool checkGradientNan_{false};

  bool dynamicGradientScaling_{false};
  float dynamicGradientScalingFactor_{2.f};
  bool dynamicGradientScalingUseLogs_{false};

  bool checkGradientNan_{false};
  size_t dynamicGradientScalingFadeout_{0ul};

  // determines the number of input streams (i.e. input files or fields in the TSV input) that need
  // to be included in the batch, i.e. without alignments and weights
@ -143,13 +143,13 @@ void AsyncGraphGroup::execute(Ptr<data::Batch> batch) {
  thread_local Tensor accGradients;
  thread_local Ptr<TensorAllocator> accAlloc;

  ABORT_IF(costScale_ ,"Cost-scaling not implemented for AsyncSGD");
  ABORT_IF(costScaling_ ,"Cost-scaling not implemented for AsyncSGD");

  auto graph = graphs_[tid];
  Ptr<RationalLoss> dynamicLoss = models_[tid]->build(graph, batch);
  if(costScaleFactor_ != 1.f) {
  if(costScalingFactor_ != 1.f) {
    // it's ok to go out of scope, this will still insert the new top node into the graph
    auto costNode = dynamicLoss->loss() * costScaleFactor_;
    auto costNode = dynamicLoss->loss() * costScalingFactor_;
  }

  if(t % optimizerDelay_ == 0) {
@ -16,16 +16,16 @@ void SingletonGraph::execute(Ptr<data::Batch> batch) {
  auto opt = optimizerShards_[0];

  auto lossNode = model->build(graph, batch);
  if(costScaleFactor_ != 1.f) {
  if(costScalingFactor_ != 1.f) {
    // for fp16 training, it's ok to go out of scope, we do not use the scaled version for anything
    auto scaledLoss = lossNode->loss() * costScaleFactor_;
    auto scaledLoss = lossNode->loss() * costScalingFactor_;
  }

  graph->forward();
  graph->backward();

  bool noNanOrInf = true;
  if(costScale_) {
  if(costScaling_) {
    // Are there NaNs in the gradient?
    bool hasNan = false, hasInf = false;
    IsNaN(graph->params()->grads(), graph->allocator(), hasNan, hasInf);
@ -39,7 +39,7 @@ void SingletonGraph::execute(Ptr<data::Batch> batch) {
  opt->update(graph->params()->vals(),
              graph->params()->grads(),
              batch->wordsTrg(),
              costScaleFactor_);
              costScalingFactor_);

  if(scheduler_) {
    scheduler_->update(*lossNode, batch);
@ -252,8 +252,8 @@ void SyncGraphGroup::update(std::vector<Ptr<data::Batch>> subBatches, size_t num

      { // let loss go out of scope, frees memory
        auto rationalLoss = models_[localDeviceIndex]->build(graph, subBatch);
        if(costScaleFactor_ != 1.f)
          rationalLoss->loss() * costScaleFactor_;
        if(costScalingFactor_ != 1.f)
          rationalLoss->loss() * costScalingFactor_;
        graph->forward();

        localDeviceLosses[localDeviceIndex] += *rationalLoss;
@ -262,7 +262,7 @@ void SyncGraphGroup::update(std::vector<Ptr<data::Batch>> subBatches, size_t num
      graph->backward(/*zero=*/false); // (gradients are reset before we get here)
    }

#if 1
#if 0 // @TODO: this can probably be removed now, keep around until confirmed.
    // experimental and should eventually be somewhere else
    // Handle local gradient explosion but only clip to largest possible value
    // given number of GPUs and type. Should clip rarely. Also clips inf
@ -284,7 +284,7 @@ void SyncGraphGroup::update(std::vector<Ptr<data::Batch>> subBatches, size_t num
  comm_->scatterReduceAndResetGrads(); // reduce gradients across all devices (globally) into shards

  float gradNorm = 0.f;
  if(costScale_ || dynamicGradientScaling_ || checkGradientNan_) {
  if(costScaling_ || dynamicGradientScaling_ || checkGradientNan_) {
    // Wrapping member function
    auto checkNanOrNorm = [&](size_t i, size_t begin, size_t end) {
      return GraphGroup::checkNanOrNorm(i, begin, end);
@ -94,7 +94,7 @@ Beams BeamSearch::toHyps(const std::vector<unsigned int>& nBestKeys, // [current
      // For factored decoding, the word is built over multiple decoding steps,
      // starting with the lemma, then adding factors one by one.
      if (factorGroup == 0) {
        word = factoredVocab->lemma2Word(shortlist ? shortlist->reverseMap((int) prevBeamHypIdx, (int) currentBatchIdx, wordIdx) : wordIdx); // @BUGBUG: reverseMap is only correct if factoredVocab_->getGroupRange(0).first == 0
        word = factoredVocab->lemma2Word(shortlist ? shortlist->reverseMap((int) prevBeamHypIdx, (int) currentBatchIdx, wordIdx) : wordIdx);
        std::vector<size_t> factorIndices; factoredVocab->word2factors(word, factorIndices);
        //LOG(info, "{} + {} ({}) -> {} -> {}",
        //    factoredVocab->decode(prevHyp->tracebackWords()),
@ -115,7 +115,7 @@ Beams BeamSearch::toHyps(const std::vector<unsigned int>& nBestKeys, // [current
        }
      }
      else if (shortlist)
        word = Word::fromWordIndex(shortlist->reverseMap((int) prevBeamHypIdx, (int) origBatchIdx, wordIdx));
        word = Word::fromWordIndex(shortlist->reverseMap((int) prevBeamHypIdx, (int) currentBatchIdx, wordIdx));
      else
        word = Word::fromWordIndex(wordIdx);

@ -330,6 +330,7 @@ Histories BeamSearch::search(Ptr<ExpressionGraph> graph, Ptr<data::CorpusBatch>
  auto prevBatchIdxMap = batchIdxMap; // [origBatchIdx -> currentBatchIdx] but shifted by one time step
  // main loop over output time steps
  for (size_t t = 0; ; t++) {
    //std::cerr << "\nstep=" << t << std::endl;
    ABORT_IF(origDimBatch != beams.size(), "Lost a batch entry??");
    // determine beam size for next output time step, as max over still-active sentences
    // E.g. if all batch entries are down from beam 5 to no more than 4 surviving hyps, then
@ -3,7 +3,9 @@
 * SPDX-License-Identifier: MIT
 */

#include "common/utils.h"
#include "translator/nth_element.h"

#include <algorithm>
#include <iterator>
#include <limits>
@ -122,7 +122,7 @@ public:
      threadPool.enqueue(task, device, id++);
    }

    if(options_->get<bool>("output-sampling", false)) {
    if(options_->hasAndNotEmpty("output-sampling")) {
      if(options_->get<size_t>("beam-size") > 1)
        LOG(warn,
            "[warning] Output sampling and beam search (beam-size > 1) are contradictory methods "