Merged PR 10996: A number of smaller changes and clean-up

* Downgrade NCCL to 2.3.7 as 2.4.2 is buggy (hangs with larger models)
* Actually enable gradient-checkpointing, previous option was inactive
* Clean up training-only options that should not be displayed for the decoder and scorer
* Re-enable conversion to FP16 if element types are compatible (belong to the same type class)
* Fix a few typos and make log messages more verbose
Martin Junczys-Dowmunt 2020-01-05 23:16:13 +00:00
parent 24f062cd27
commit 88d9980589
15 changed files with 72 additions and 54 deletions

View File

@@ -53,6 +53,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 - Compilation with CUDA 10.1
 ### Changed
+- Downgrade NCCL to 2.3.7 as 2.4.2 is buggy (hangs with larger models)
 - Return error signal on SIGTERM
 - Dropped support for CUDA 8.0, CUDA 9.0 is now minimal requirement
 - Removed autotuner for now, will be switched back on later

View File

@@ -1 +1 @@
-v1.8.36
+v1.8.37

src/3rd_party/nccl vendored

@@ -1 +1 @@
-Subproject commit 8e3a3f7c5b520babff49cec54a866fa3eda3a3b6
+Subproject commit b56650c7f59b8cd40d18809784a6d6be38ef8acb

View File

@@ -22,7 +22,7 @@ int main(int argc, char** argv) {
 " ./marian-conv -f model.npz -t model.bin --gemm-type packed16");
 cli->add<std::string>("--from,-f", "Input model", "model.npz");
 cli->add<std::string>("--to,-t", "Output model", "model.bin");
-cli->add<std::string>("--gemm-type,-g", "GEMM Type to be used: float32, packed16, packed8", "float32");
+cli->add<std::string>("--gemm-type,-g", "GEMM Type to be used: float32, packed16, packed8avx2, packed8avx512", "float32");
 cli->parse(argc, argv);
 options->merge(config);
 }
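Following the tool's own usage string above, a conversion to one of the new AVX2 packed 8-bit formats would look like ./marian-conv -f model.npz -t model.bin --gemm-type packed8avx2 (the model file names here are the placeholder defaults from the help text).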

View File

@@ -504,7 +504,7 @@ void ConfigParser::addOptionsTraining(cli::CLIWrapper& cli) {
 true);
 // add ULR settings
-addSuboptionsULR(cli);
+addSuboptionsULR(cli);
 cli.add<std::vector<std::string>>("--task",
 "Use predefined set of options. Possible values: transformer, transformer-big");
@@ -638,8 +638,10 @@ void ConfigParser::addOptionsTranslation(cli::CLIWrapper& cli) {
 "Noise output layer with gumbel noise",
 false);
+#if 0 // @TODO: Ask Hany if there are any decoding-time options
 // add ULR settings
 addSuboptionsULR(cli);
+#endif
 cli.switchGroup(previous_group);
 // clang-format on
@@ -749,29 +751,31 @@ void ConfigParser::addSuboptionsBatching(cli::CLIWrapper& cli) {
 "Sorting strategy for maxi-batch: none, src, trg (not available for decoder)",
 defaultMaxiBatchSort);
-cli.add<bool>("--shuffle-in-ram",
-"Keep shuffled corpus in RAM, do not write to temp file");
-// @TODO: Consider making the next two options options of the vocab instead, to make it more local in scope.
-cli.add<size_t>("--all-caps-every",
-"When forming minibatches, preprocess every Nth line on the fly to all-caps. Assumes UTF-8");
-cli.add<size_t>("--english-title-case-every",
-"When forming minibatches, preprocess every Nth line on the fly to title-case. Assumes English (ASCII only)");
-cli.add<int>("--mini-batch-words-ref",
-"If given, the following hyper parameters are adjusted as-if we had this mini-batch size: "
-"--learn-rate, --optimizer-params, --exponential-smoothing, --mini-batch-warmup");
-cli.add<std::string/*SchedulerPeriod*/>("--mini-batch-warmup",
-"Linear ramp-up of MB size, up to this #updates (append 't' for up to this #target labels). "
-"Auto-adjusted to --mini-batch-words-ref if given",
-{"0"});
-cli.add<bool>("--mini-batch-track-lr",
-"Dynamically track mini-batch size inverse to actual learning rate (not considering lr-warmup)");
-cli.add<size_t>("--mini-batch-overstuff",
-"[experimental] Stuff this much more data into a minibatch, but scale down the LR and progress counter",
-1);
-cli.add<size_t>("--mini-batch-understuff",
-"[experimental] Break each batch into this many updates",
-1);
+if(mode_ == cli::mode::training) {
+  cli.add<bool>("--shuffle-in-ram",
+      "Keep shuffled corpus in RAM, do not write to temp file");
+  // @TODO: Consider making the next two options options of the vocab instead, to make it more local in scope.
+  cli.add<size_t>("--all-caps-every",
+      "When forming minibatches, preprocess every Nth line on the fly to all-caps. Assumes UTF-8");
+  cli.add<size_t>("--english-title-case-every",
+      "When forming minibatches, preprocess every Nth line on the fly to title-case. Assumes English (ASCII only)");
+  cli.add<int>("--mini-batch-words-ref",
+      "If given, the following hyper parameters are adjusted as-if we had this mini-batch size: "
+      "--learn-rate, --optimizer-params, --exponential-smoothing, --mini-batch-warmup");
+  cli.add<std::string/*SchedulerPeriod*/>("--mini-batch-warmup",
+      "Linear ramp-up of MB size, up to this #updates (append 't' for up to this #target labels). "
+      "Auto-adjusted to --mini-batch-words-ref if given",
+      {"0"});
+  cli.add<bool>("--mini-batch-track-lr",
+      "Dynamically track mini-batch size inverse to actual learning rate (not considering lr-warmup)");
+  cli.add<size_t>("--mini-batch-overstuff",
+      "[experimental] Stuff this much more data into a minibatch, but scale down the LR and progress counter",
+      1);
+  cli.add<size_t>("--mini-batch-understuff",
+      "[experimental] Break each batch into this many updates",
+      1);
+}
 // clang-format on
 }

View File

@@ -223,7 +223,8 @@ enum class TypeClass : size_t {
 avx2_type = 0x1000, // processor-specific layout for avx2, currently used for FBGEMM only
 avx512_type = 0x2000, // processor-specific layout for avx512, currently used for FBGEMM only
-size_mask = 0x00FF
+size_mask = 0x00FF,
+class_mask = 0xFF00
 };
 constexpr inline size_t operator+(TypeClass typeClass, size_t val) {
@@ -260,6 +261,10 @@ static inline size_t operator&(TypeClass typeClass, Type type) {
 return (size_t)typeClass & (size_t)type;
 }
+static inline bool isSameTypeClass(Type type1, Type type2) {
+  return (TypeClass::class_mask & type1) == (TypeClass::class_mask & type2);
+}
 static inline size_t sizeOf(Type type) {
 return TypeClass::size_mask & type;
 }
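The mask arithmetic above is easiest to see with concrete values. Below is a small standalone sketch, not Marian source: the two example type encodings follow the class-in-high-byte, size-in-low-byte convention established by the enum, while everything else about the real types.h (more classes and types, operator overloads) is elided.

// toy_types.cpp -- illustrative sketch only, not Marian source
#include <cstddef>
#include <cstdio>

enum class TypeClass : size_t {
  float_type = 0x0100,
  size_mask  = 0x00FF, // low byte: width of the type in bytes
  class_mask = 0xFF00  // high byte: type class
};

// A Type packs its class into the high byte and its size into the low byte,
// e.g. a 2-byte and a 4-byte float:
enum class Type : size_t {
  float16 = (size_t)TypeClass::float_type + 2u,
  float32 = (size_t)TypeClass::float_type + 4u
};

static bool isSameTypeClass(Type a, Type b) {
  return ((size_t)a & (size_t)TypeClass::class_mask)
      == ((size_t)b & (size_t)TypeClass::class_mask);
}

static size_t sizeOf(Type t) {
  return (size_t)t & (size_t)TypeClass::size_mask;
}

int main() {
  // float16 and float32 share the float class, so conversion between them is allowed:
  std::printf("same class: %d\n", isSameTypeClass(Type::float16, Type::float32)); // prints 1
  std::printf("sizeOf(float16): %zu\n", sizeOf(Type::float16));                   // prints 2
}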

View File

@@ -13,17 +13,17 @@ namespace data {
 Corpus::Corpus(Ptr<Options> options, bool translate /*= false*/)
 : CorpusBase(options, translate),
-shuffleInRAM_(options_->get<bool>("shuffle-in-ram")),
-allCapsEvery_(options_->get<size_t>("all-caps-every")),
-titleCaseEvery_(options_->get<size_t>("english-title-case-every")) {}
+shuffleInRAM_(options_->get<bool>("shuffle-in-ram", false)),
+allCapsEvery_(options_->get<size_t>("all-caps-every", 0)),
+titleCaseEvery_(options_->get<size_t>("english-title-case-every", 0)) {}
 Corpus::Corpus(std::vector<std::string> paths,
 std::vector<Ptr<Vocab>> vocabs,
 Ptr<Options> options)
 : CorpusBase(paths, vocabs, options),
-shuffleInRAM_(options_->get<bool>("shuffle-in-ram")),
-allCapsEvery_(options_->get<size_t>("all-caps-every")),
-titleCaseEvery_(options_->get<size_t>("english-title-case-every")) {}
+shuffleInRAM_(options_->get<bool>("shuffle-in-ram", false)),
+allCapsEvery_(options_->get<size_t>("all-caps-every", 0)),
+titleCaseEvery_(options_->get<size_t>("english-title-case-every", 0)) {}
 void Corpus::preprocessLine(std::string& line, size_t streamId) {
 if (allCapsEvery_ != 0 && pos_ % allCapsEvery_ == 0 && !inference_) {
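The explicit fallback values matter because these three options are now registered only for training (see the config_parser.cpp change above): when the decoder or scorer constructs a Corpus, the keys are absent, and the defaults (false and 0) simply disable shuffle-in-RAM and the all-caps/title-case preprocessing.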

View File

@@ -354,7 +354,7 @@ public:
 const Ptr<inits::NodeInitializer>& init,
 const Type elementType,
 bool fixed = false) {
-// since this param is called with out a specified type, we assume defaultElementType but allow to check for a different type
+// this param is called with a specified type
 return param(pname, shape, init, elementType, fixed, /*typeSpecified=*/true);
 }
@@ -362,7 +362,7 @@ public:
 const Shape& shape,
 const Ptr<inits::NodeInitializer>& init,
 bool fixed = false) {
-// since this param is called with out a specified type, we assume defaultElementType but allow to check for a different type
+// since this param is called without a specified type, we assume defaultElementType but allow to check for a different type
 return param(pname, shape, init, defaultElementType_, fixed, /*typeSpecified=*/false);
 }
@@ -497,7 +497,12 @@ public:
 // skip over special parameters starting with "special:"
 if(pName.substr(0, 8) == "special:")
 continue;
-param(pName, item.shape, inits::fromItem(item), item.type, /*fixed=*/false);
+// if during loading the loaded type is of the same type class as the default element type, allow conversion;
+// otherwise keep the loaded type. This is used when e.g. loading a float32 model as a float16 model as both
+// have type class TypeClass::float_type.
+auto loadElementType = isSameTypeClass(item.type, defaultElementType_) ? defaultElementType_ : item.type;
+param(pName, item.shape, inits::fromItem(item), loadElementType, /*fixed=*/false);
 }
 if(markReloaded)
 setReloaded(true);
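Concretely, when the graph's default element type is float16 (e.g. set from the first --precision entry, as in the scorer change below), a stored float32 parameter is converted on load since both belong to TypeClass::float_type, while a parameter from a different class, such as the processor-specific packed layouts, keeps its stored type.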

View File

@@ -617,10 +617,10 @@ Expr unlikelihood(Expr logits, Expr indices) {
 int dimBatch = logits->shape()[-2];
 int dimTime = logits->shape()[-3];
-// @TODO: fix the outside of this function in decoder.h etc.
+// @TODO: fix this outside of this function in decoder.h etc.
 auto indicesWithLayout = reshape(indices, {1, dimTime, dimBatch, 1});
-// This is currently implemented with mutliple ops, might be worth doing a special operation like for cross_entropy
+// This is currently implemented with multiple ops, might be worth doing a special operation like for cross_entropy
 return -log(gather(1.f - softmax(logits), /*axis=*/-1, indicesWithLayout));
 }
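That is, the function returns the unlikelihood term -log(1 - p(y)) for each indexed token y, with p taken from the softmax over the logits.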

View File

@@ -24,7 +24,11 @@ namespace marian {
 ABORT_IF(empty(), "Attempted to read out logits on empty Logits object");
 auto firstLogits = logits_.front()->loss();
-ABORT_IF(labels.size() * firstLogits->shape()[-1] != firstLogits->shape().elements(), "Labels not matching logits shape??");
+ABORT_IF(labels.size() * firstLogits->shape()[-1] != firstLogits->shape().elements(),
+         "Labels not matching logits shape ({} != {}, {})??",
+         labels.size() * firstLogits->shape()[-1],
+         firstLogits->shape().elements(),
+         firstLogits->shape());
 // base case (no factors)
 if (!factoredVocab_) {
View File

@@ -534,6 +534,7 @@ public:
 layer, // values
 layerMask); // [batch size, num heads broadcast=1, max length broadcast=1, max length]
 layer = LayerFFN(prefix_ + "_l" + std::to_string(i) + "_ffn", layer);
+checkpoint(layer); // sets a manually specified checkpoint if gradient checkpointing is enabled, does nothing otherwise.
 }
 // restore organization of batch and time steps. This is currently required
@@ -700,6 +701,9 @@ public:
 encoderContexts.push_back(encoderContext);
 encoderMasks.push_back(encoderMask);
+checkpoint(encoderContext);
+checkpoint(encoderMask);
 }
 rnn::States prevDecoderStates = state->getStates();
@@ -735,6 +739,8 @@ public:
 ABORT("Unknown auto-regressive layer type in transformer decoder {}",
 layerType);
+checkpoint(query);
 // source-target attention
 // Iterate over multiple encoders and simply stack the attention blocks
 if(encoderContexts.size() > 0) {
@@ -771,10 +777,14 @@ public:
 }
 }
+checkpoint(query);
 // remember decoder state
 decoderStates.push_back(decoderState);
 query = LayerFFN(prefix_ + "_l" + layerNo + "_ffn", query); // [-4: beam depth=1, -3: batch size, -2: max length, -1: vector dim]
+checkpoint(query);
 }
 auto decoderContext = transposeTimeBatch(query); // [-4: beam depth=1, -3: max length, -2: batch size, -1: vector dim]
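The checkpoint(...) calls above only mark nodes; the memory saving comes from the backward pass keeping just the marked values and recomputing everything in between on demand. The following toy program, plain C++ and not Marian code, sketches that recompute-on-backward trade-off for a chain of scalar "stages" standing in for the transformer sub-layers:

// toy_checkpointing.cpp -- illustrative sketch only, not Marian source
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <functional>
#include <vector>

int main() {
  // A chain f = g3(g2(g1(g0(x)))) of scalar stages standing in for sub-layers.
  std::vector<std::function<double(double)>> stages = {
      [](double x) { return std::tanh(x); },
      [](double x) { return 2.0 * x; },
      [](double x) { return std::tanh(x); },
      [](double x) { return 0.5 * x; }};
  // derivative of each stage w.r.t. its input
  std::vector<std::function<double(double)>> grads = {
      [](double x) { double t = std::tanh(x); return 1.0 - t * t; },
      [](double)   { return 2.0; },
      [](double x) { double t = std::tanh(x); return 1.0 - t * t; },
      [](double)   { return 0.5; }};

  double x0 = 0.3;
  // Forward pass: keep only every 2nd intermediate value (the "checkpoints")
  // instead of all activations.
  std::vector<double> ckpt = {x0};
  double x = x0;
  for(size_t i = 0; i < stages.size(); ++i) {
    x = stages[i](x);
    if(i % 2 == 1 && i + 1 < stages.size())
      ckpt.push_back(x); // analogous to checkpoint(layer) above
  }

  // Backward pass: walk the checkpoints back to front; for each segment,
  // recompute its inner activations from the checkpoint, then apply the
  // chain rule through the segment.
  double grad = 1.0;
  for(size_t seg = ckpt.size(); seg-- > 0;) {
    size_t begin = seg * 2;
    size_t end = std::min(begin + 2, stages.size());
    std::vector<double> inputs = {ckpt[seg]}; // recomputed inputs of stages begin..end-1
    for(size_t i = begin; i + 1 < end; ++i)
      inputs.push_back(stages[i](inputs.back()));
    for(size_t i = end; i-- > begin;)
      grad *= grads[i](inputs[i - begin]);
  }
  std::printf("df/dx at x=%.2f: %.6f\n", x0, grad); // matches full backprop
}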

View File

@@ -69,11 +69,10 @@ public:
 for(auto device : devices) {
 auto graph = New<ExpressionGraph>(true);
 graph->setDevice(device);
 auto precison = options_->get<std::vector<std::string>>("precision", {"float32"});
 graph->setDefaultElementType(typeFromString(precison[0])); // only use first type, used for parameter type in graph
-graph->setDevice(device);
 graph->getBackend()->setClip(options_->get<float>("clip-gemm"));
 if (device.type == DeviceType::cpu) {
 graph->getBackend()->setOptimized(options_->get<bool>("optimize"));

View File

@@ -18,6 +18,7 @@ AsyncGraphGroup::AsyncGraphGroup(Ptr<Options> config, Ptr<IMPIWrapper> mpi)
 for(auto device : devices_) {
 auto graph = New<ExpressionGraph>();
 graph->setDevice(device);
+graph->setCheckpointing(options_->get<bool>("gradient-checkpointing"));
 graph->getBackend()->setClip(options_->get<float>("clip-gemm"));
 graph->reserveWorkspaceMB(options_->get<size_t>("workspace"));
 graphs_.push_back(graph);
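Together with the identical lines added to the singleton and sync graph groups below, this is the "actually enable gradient-checkpointing" part of the description: the --gradient-checkpointing option was accepted before but never propagated to the graphs, so it had no effect.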

View File

@@ -34,6 +34,7 @@ public:
 // Initialize graph
 graph_ = New<ExpressionGraph>();
 graph_->setDevice(deviceId);
+graph_->setCheckpointing(options_->get<bool>("gradient-checkpointing"));
 graph_->getBackend()->setClip(options_->get<float>("clip-gemm"));
 graph_->reserveWorkspaceMB(options_->get<size_t>("workspace"));
 opt_ = Optimizer(options_);

View File

@@ -10,6 +10,7 @@ SyncGraphGroup::SyncGraphGroup(Ptr<Options> config, Ptr<IMPIWrapper> mpi)
 for(auto device : devices_) {
 auto graph = New<ExpressionGraph>();
 graph->setDevice(device);
+graph->setCheckpointing(options_->get<bool>("gradient-checkpointing"));
 graph->reserveWorkspaceMB(options_->get<size_t>("workspace"));
 graph->getBackend()->setClip(options_->get<float>("clip-gemm"));
@@ -57,19 +58,6 @@ void SyncGraphGroup::initialize(const Ptr<data::Batch>& exampleBatch) {
 if (i > 0)
 graphs_[i]->params()->vals()->copyFrom(graphs_[0]->params()->vals());
 });
-//ThreadPool pool(graphs_.size() - 1, graphs_.size() - 1);
-//for(size_t i = 1; i < graphs_.size(); ++i) {
-//  auto init = [&](size_t i) {
-//    // initialize i-th graph and weights
-//    builders_[i]->build(graphs_[i], exampleBatch);
-//    graphs_[i]->forward();
-//    // overwrite weights of i-th graph with weights from 0-th graph
-//    graphs_[i]->params()->vals()->copyFrom(graphs_[0]->params()->vals());
-//  };
-//  pool.enqueue(init, i);
-//}
-//// ThreadPool destructor waits until completion of all tasks.
-//// @TODO: can we use comm_->foreach()?
 }
 void SyncGraphGroup::initializeAvg() {
void SyncGraphGroup::initializeAvg() {