diff --git a/src/layers/rnn.h b/src/layers/rnn.h
index 0635748e..e4f585a9 100644
--- a/src/layers/rnn.h
+++ b/src/layers/rnn.h
@@ -392,7 +392,6 @@ class GRU {
   Expr apply1(Expr input) {
     if(dropMaskX_)
       input = dropout(input, keywords::mask=dropMaskX_);
-    debug(input, "in");
     auto xW = dot(input, W_);
     if(layerNorm_)
       xW = layer_norm(xW, gamma1_);
@@ -403,7 +402,6 @@ class GRU {
               Expr mask = nullptr) {
     if(dropMaskS_)
       state = dropout(state, keywords::mask=dropMaskS_);
-    debug(state, "state");

     auto sU = dot(state, U_);

diff --git a/src/models/gnmt.h b/src/models/gnmt.h
index 634f03e9..1464f557 100644
--- a/src/models/gnmt.h
+++ b/src/models/gnmt.h
@@ -26,12 +26,19 @@ namespace marian {
     bool skipDepth = options_->get<bool>("skip");
     size_t encoderLayers = options_->get<size_t>("layers-enc");
     float dropoutRnn = options_->get<float>("dropout-rnn");
+    float dropoutSrc = options_->get<float>("dropout-src");

     auto xEmb = Embedding("Wemb", dimSrcVoc, dimSrcEmb)(graph);

     Expr x, xMask;
     std::tie(x, xMask) = prepareSource(xEmb, batch, batchIdx);

+    if(dropoutSrc) {
+      int srcWords = x->shape()[2];
+      auto srcWordDrop = graph->dropout(dropoutSrc, {1, 1, srcWords});
+      x = dropout(x, mask=srcWordDrop);
+    }
+
     auto xFw = RNN(graph, "encoder_bi",
                    dimSrcEmb, dimEncState,
                    normalize=layerNorm,
@@ -45,7 +52,6 @@
                    dropout_prob=dropoutRnn)
                   (x, mask=xMask);

-    debug(xFw, "xFw");

     if(encoderLayers > 1) {
       auto xBi = concatenate({xFw, xBw}, axis=1);
@@ -90,9 +96,16 @@ class DecoderGNMT : public DecoderBase {
     bool skipDepth = options_->get<bool>("skip");
     size_t decoderLayers = options_->get<size_t>("layers-dec");
     float dropoutRnn = options_->get<float>("dropout-rnn");
+    float dropoutTrg = options_->get<float>("dropout-trg");

     auto graph = embeddings->graph();

+    if(dropoutTrg) {
+      int trgWords = embeddings->shape()[2];
+      auto trgWordDrop = graph->dropout(dropoutTrg, {1, 1, trgWords});
+      embeddings = dropout(embeddings, mask=trgWordDrop);
+    }
+
     if(!attention_)
       attention_ = New("decoder",
                        context, dimDecState,
diff --git a/src/test/marian_translate.cu b/src/test/marian_translate.cu
index 63c40878..1922f3d9 100644
--- a/src/test/marian_translate.cu
+++ b/src/test/marian_translate.cu
@@ -220,7 +220,7 @@ int main(int argc, char** argv) {
   target->load("../benchmark/marian32K/train.tok.true.bpe.de.json", 50000);

   auto encdec = New(options);
-  encdec->load(graph, "../benchmark/marian32K/modelML6.200000.npz");
+  encdec->load(graph, "../benchmark/marian32K/model8.110000.npz");

   graph->reserveWorkspaceMB(128);

diff --git a/src/training/config.cpp b/src/training/config.cpp
index ecd58d7a..15f7180e 100644
--- a/src/training/config.cpp
+++ b/src/training/config.cpp
@@ -199,6 +199,10 @@ void Config::addOptions(int argc, char** argv, bool doValidate) {
      "Enable layer normalization")
    ("dropout-rnn", po::value<float>()->default_value(0),
     "Scaling dropout along rnn layers and time (0 = no dropout)")
+   ("dropout-src", po::value<float>()->default_value(0),
+    "Dropout source words (0 = no dropout)")
+   ("dropout-trg", po::value<float>()->default_value(0),
+    "Dropout target words (0 = no dropout)")
  ;

  po::options_description opt("Optimizer options");
@@ -307,6 +311,8 @@ void Config::addOptions(int argc, char** argv, bool doValidate) {
  SET_OPTION("no-shuffle", bool);
  SET_OPTION("normalize", bool);
  SET_OPTION("dropout-rnn", float);
+ SET_OPTION("dropout-src", float);
+ SET_OPTION("dropout-trg", float);
  SET_OPTION("skip", bool);

  if(doValidate)
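
Note on the word-dropout hunks in src/models/gnmt.h above: graph->dropout(p, {1, 1, nWords}) builds one mask value per source/target position, and dropout(x, mask=...) broadcasts that value over the whole embedding vector, so entire words are dropped rather than individual units. The stand-alone C++ sketch below is not Marian code; the word-major memory layout and the inverted 1/(1-p) scaling of kept words are assumptions made only to illustrate the effect of such a per-word mask.

// Hypothetical sketch of word-level dropout: one Bernoulli draw per word,
// broadcast over the embedding dimension (mask shape {1, 1, nWords}).
#include <iostream>
#include <random>
#include <vector>

int main() {
  const int dimEmb = 4;    // embedding size (first shape dimension)
  const int nWords = 6;    // number of source/target positions (third dimension)
  const float p = 0.1f;    // e.g. a value passed as --dropout-src or --dropout-trg

  // Dummy embeddings laid out word-major: emb[w * dimEmb + d].
  std::vector<float> emb(nWords * dimEmb, 1.0f);

  // One Bernoulli(1-p) draw per word; kept words are scaled by 1/(1-p)
  // (inverted dropout, assumed here) so the expected value is unchanged.
  std::mt19937 rng(42);
  std::bernoulli_distribution keep(1.0 - p);
  std::vector<float> wordMask(nWords);
  for(int w = 0; w < nWords; ++w)
    wordMask[w] = keep(rng) ? 1.0f / (1.0f - p) : 0.0f;

  // Broadcast the per-word mask over every embedding dimension,
  // zeroing or rescaling whole word vectors at once.
  for(int w = 0; w < nWords; ++w)
    for(int d = 0; d < dimEmb; ++d)
      emb[w * dimEmb + d] *= wordMask[w];

  for(int w = 0; w < nWords; ++w)
    std::cout << "word " << w << ": "
              << (wordMask[w] == 0.0f ? "dropped" : "kept") << "\n";
  return 0;
}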