From 22bbac328701cb2df0fd7026fb2609144379d6d6 Mon Sep 17 00:00:00 2001
From: Andre Martins
Date: Sat, 17 Sep 2016 01:55:09 +0100
Subject: [PATCH 1/3] Eliminated non-deterministic behaviour.

---
 src/param_initializers.h        |  7 +++++--
 src/validate_encoder_decoder.cu | 18 +++++++++---------
 2 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/src/param_initializers.h b/src/param_initializers.h
index 2698d36f..084e829c 100644
--- a/src/param_initializers.h
+++ b/src/param_initializers.h
@@ -9,6 +9,9 @@
 
 namespace marian {
 
+// Use a constant seed for deterministic behaviour.
+std::default_random_engine engine(42);
+
 void zeros(Tensor t) {
   t.set(0.f);
 }
@@ -19,8 +22,8 @@ void ones(Tensor t) {
 
 template <class Distribution>
 void distribution(Tensor t, float a, float b) {
-  std::random_device device;
-  std::default_random_engine engine(device());
+  //std::random_device device;
+  //std::default_random_engine engine(device());
   Distribution dist(a, b);
   auto gen = std::bind(dist, engine);
 
diff --git a/src/validate_encoder_decoder.cu b/src/validate_encoder_decoder.cu
index 2dffef14..8ff84936 100644
--- a/src/validate_encoder_decoder.cu
+++ b/src/validate_encoder_decoder.cu
@@ -43,13 +43,13 @@ ExpressionGraph build_graph(int source_vocabulary_size,
 
   // Source RNN parameters.
   Expr Wxh = named(g.param(shape={embedding_size, hidden_size},
-                           init=uniform()), "Wxh");
+                           init=uniform(-0.1, 0.1)), "Wxh");
   Expr Whh = named(g.param(shape={hidden_size, hidden_size},
-                           init=uniform()), "Whh");
+                           init=uniform(-0.1, 0.1)), "Whh");
   Expr bh = named(g.param(shape={1, hidden_size},
-                          init=uniform()), "bh");
+                          init=uniform(-0.1, 0.1)), "bh");
   Expr h0 = named(g.param(shape={1, hidden_size},
-                          init=uniform()), "h0");
+                          init=uniform(-0.1, 0.1)), "h0");
 
   std::cerr << "Building encoder RNN..." << std::endl;
   H.emplace_back(tanh(dot(dot(X[0], E), Wxh) + dot(h0, Whh) + bh));
@@ -59,11 +59,11 @@ ExpressionGraph build_graph(int source_vocabulary_size,
 
   // Target RNN parameters.
   Expr Wxh_d = named(g.param(shape={output_size, hidden_size},
-                             init=uniform()), "Wxh_d");
+                             init=uniform(-0.1, 0.1)), "Wxh_d");
   Expr Whh_d = named(g.param(shape={hidden_size, hidden_size},
-                             init=uniform()), "Whh_d");
+                             init=uniform(-0.1, 0.1)), "Whh_d");
   Expr bh_d = named(g.param(shape={1, hidden_size},
-                            init=uniform()), "bh_d");
+                            init=uniform(-0.1, 0.1)), "bh_d");
 
   std::cerr << "Building decoder RNN..." << std::endl;
   auto h0_d = H[num_inputs];
@@ -74,9 +74,9 @@ ExpressionGraph build_graph(int source_vocabulary_size,
 
   // Output linear layer before softmax.
   Expr Why = named(g.param(shape={hidden_size, output_size},
-                           init=uniform()), "Why");
+                           init=uniform(-0.1, 0.1)), "Why");
   Expr by = named(g.param(shape={1, output_size},
-                          init=uniform()), "by");
+                          init=uniform(-0.1, 0.1)), "by");
 
   std::cerr << "Building output layer..." << std::endl;

From eb57df2a3e9d94c6e03673d710afe599253ceb4f Mon Sep 17 00:00:00 2001
From: Andre Martins
Date: Sat, 17 Sep 2016 02:00:03 +0100
Subject: [PATCH 2/3] Some cleaning.

---
 src/validate_encoder_decoder.cu | 62 ---------------------------------
 1 file changed, 62 deletions(-)

diff --git a/src/validate_encoder_decoder.cu b/src/validate_encoder_decoder.cu
index 8ff84936..ec9951ec 100644
--- a/src/validate_encoder_decoder.cu
+++ b/src/validate_encoder_decoder.cu
@@ -1,4 +1,3 @@
-
 #include "marian.h"
 #include "mnist.h"
 #include "vocab.h"
@@ -96,7 +95,6 @@ ExpressionGraph build_graph(int source_vocabulary_size,
 }
 
 int main(int argc, char** argv) {
-#if 1
   std::cerr << "Loading the data... ";
   Vocab source_vocab, target_vocab;
 
@@ -193,66 +191,6 @@ int main(int argc, char** argv) {
     g[ss.str()] = Yt;
   }
 
-#else
-
-  int source_vocabulary_size = 10;
-  int target_vocabulary_size = 15;
-  int embedding_size = 8;
-  int hidden_size = 5;
-  int batch_size = 25;
-  int num_source_tokens = 8;
-  int num_target_tokens = 6;
-
-  // Build the encoder-decoder computation graph.
-  ExpressionGraph g = build_graph(0, // cuda device.
-                                  source_vocabulary_size,
-                                  target_vocabulary_size,
-                                  embedding_size,
-                                  hidden_size,
-                                  num_source_tokens,
-                                  num_target_tokens);
-
-  int input_size = source_vocabulary_size;
-  int output_size = target_vocabulary_size;
-  int num_inputs = num_source_tokens;
-  int num_outputs = num_target_tokens;
-
-  // Generate input data (include the stop symbol).
-  for (int t = 0; t <= num_inputs; ++t) {
-    Tensor Xt({batch_size, input_size});
-    float max = 1.;
-    std::vector<float> values(batch_size * input_size);
-    std::vector<float> classes(batch_size * output_size, 0.0);
-    int k = 0;
-    for (int i = 0; i < batch_size; ++i) {
-      for (int j = 0; j < input_size; ++j, ++k) {
-        values[k] = max * (2.0*static_cast<float>(rand()) / RAND_MAX - 1.0);
-      }
-    }
-    thrust::copy(values.begin(), values.end(), Xt.begin());
-    std::stringstream ss;
-    ss << "X" << t;
-    g[ss.str()] = Xt;
-  }
-
-  // Generate output data (include the stop symbol).
-  for (int t = 0; t <= num_outputs; ++t) {
-    Tensor Yt({batch_size, output_size});
-
-    std::vector<float> classes(batch_size * output_size, 0.0);
-    int l = 0;
-    for (int i = 0; i < batch_size; ++i) {
-      int gold = output_size * static_cast<float>(rand()) / RAND_MAX;
-      classes[l + gold] = 1.0;
-      l += output_size;
-    }
-    thrust::copy(classes.begin(), classes.end(), Yt.begin());
-    std::stringstream ss;
-    ss << "Y" << t;
-    g[ss.str()] = Yt;
-  }
-#endif
-
   std::cerr << "Printing the computation graph..." << std::endl;
   std::cout << g.graphviz() << std::endl;

From f6de1677e1add7621dbfab8a3ababbc7882e7b25 Mon Sep 17 00:00:00 2001
From: Andre Martins
Date: Sat, 17 Sep 2016 02:42:11 +0100
Subject: [PATCH 3/3] Implemented safe softmax (but doesn't solve the problem yet, we need log-softmax).

---
 src/node_operators.h    |  1 +
 src/optimizers.h        |  2 +-
 src/tensor_operators.cu | 50 +++++++++++++++++++++++++++++++++++++++++
 src/tensor_operators.h  |  4 ++++
 4 files changed, 56 insertions(+), 1 deletion(-)

diff --git a/src/node_operators.h b/src/node_operators.h
index e7994c0a..c444f24f 100644
--- a/src/node_operators.h
+++ b/src/node_operators.h
@@ -156,6 +156,7 @@
   void forward() {
     // B = softmax(A).
     val_ = a_->val();
+    SubtractMax(&val_); // Safe version of softmax.
     Softmax(&val_);
   }
 
diff --git a/src/optimizers.h b/src/optimizers.h
index a977d7f8..184b063f 100644
--- a/src/optimizers.h
+++ b/src/optimizers.h
@@ -95,4 +95,4 @@ class Adam {
   std::vector<float> vt_;
 };
 
-}
\ No newline at end of file
+}
diff --git a/src/tensor_operators.cu b/src/tensor_operators.cu
index aa92f0dd..34ab874a 100644
--- a/src/tensor_operators.cu
+++ b/src/tensor_operators.cu
@@ -55,6 +55,56 @@ void SubtractMean(Tensor* Out, Tensor &Weights) {
   cudaStreamSynchronize(0);
 }
 
+__global__ void gSubtractMax(float* out, size_t rows, size_t cols) {
+  for(int bid = 0; bid < rows; bid += gridDim.x) {
+    int j = bid + blockIdx.x;
+    if (j < rows) {
+      extern __shared__ float _share[];
+      float* _max = _share + blockDim.x;
+      float* sp = out + j * cols;
+      _max[threadIdx.x] = sp[threadIdx.x];
+      for(int tid = 1; tid < cols; tid += blockDim.x) {
+        int id = tid + threadIdx.x;
+        if (id < cols) {
+          if (sp[id] > _max[threadIdx.x]) _max[threadIdx.x] = sp[id];
+        }
+      }
+      __syncthreads();
+      int len = blockDim.x;
+      while(len != 1) {
+        __syncthreads();
+        int skip = (len + 1) >> 1;
+        if (threadIdx.x < (len >> 1)) {
+          if (_max[threadIdx.x + skip] > _max[threadIdx.x]) {
+            _max[threadIdx.x] = _max[threadIdx.x + skip];
+          }
+        }
+        len = (len + 1) >> 1;
+      }
+      __syncthreads();
+      for(int tid = 0; tid < cols; tid += blockDim.x){
+        int id = tid + threadIdx.x;
+        if(id < cols)
+          sp[id] -= _max[0];
+      }
+    }
+  }
+}
+
+void SubtractMax(Tensor* Out) {
+  // Out is a m-by-k matrix, passed as input.
+  // The max element of each row of Out is computed and subtracted from Out.
+  // Out is both input and output.
+  size_t m = Out->shape()[0];
+  size_t k = Out->shape()[1];
+
+  int blocks = std::min(MAX_BLOCKS, (int) m);
+  int threads = std::min(MAX_THREADS, (int) k);
+  int shared = sizeof(float) * threads * 2;
+  gSubtractMax<<<blocks, threads, shared>>>(Out->data(), m, k);
+  cudaStreamSynchronize(0);
+}
+
 ///////////////////////////////////////////////////////
 __global__ void gSoftMax(float* softMaxP, size_t rows, size_t cols) {
   for(int bid = 0; bid < rows; bid += gridDim.x) {
diff --git a/src/tensor_operators.h b/src/tensor_operators.h
index 60e989d2..039e6f39 100644
--- a/src/tensor_operators.h
+++ b/src/tensor_operators.h
@@ -147,6 +147,10 @@ __global__ void gSubtractMean(float* out, float* weights,
 
 void SubtractMean(Tensor* Out, Tensor &Weights);
 
+__global__ void gSubtractMax(float* out, size_t rows, size_t cols);
+
+void SubtractMax(Tensor* Out);
+
 __global__ void gSoftMax(float* softMaxP, size_t rows, size_t cols);
 
 void Softmax(Tensor* Out);
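
A note on patch 1/3: seeding std::default_random_engine from std::random_device drew a fresh, unpredictable seed on every run, so two runs never started from the same parameters. A constant seed makes every run draw the same sequence, which makes initialization, and therefore training behaviour, repeatable. Below is a minimal standalone sketch of the same pattern; it is illustrative only, not repo code, and the file and variable names are ours.

    // seed_demo.cc: with a fixed seed this prints the same four numbers on
    // every run. Build with: g++ -std=c++11 seed_demo.cc -o seed_demo
    #include <functional>
    #include <iostream>
    #include <random>
    #include <vector>

    // Constant seed at file scope, mirroring the patch.
    std::default_random_engine engine(42);

    int main() {
      std::uniform_real_distribution<float> dist(-0.1, 0.1);
      auto gen = std::bind(dist, engine);  // same bind pattern as distribution()
      std::vector<float> weights(4);
      for (auto& w : weights) w = gen();   // deterministic draws
      for (float w : weights) std::cout << w << " ";
      std::cout << std::endl;
    }

One subtlety worth noting: std::bind copies the engine, so each call site that rebinds starts again from the engine's current state. That is still fully deterministic, which is all this patch is after.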
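A note on patch 3/3: Softmax exponentiates raw scores, and expf overflows single precision once its argument exceeds roughly 88, turning whole rows into inf and then nan. Subtracting each row's maximum first leaves the result unchanged, because the common factor exp(-max) cancels between numerator and denominator, while keeping every exponent at or below zero. Here is a minimal CPU sketch of the identity that SubtractMax followed by Softmax implements on the GPU (illustrative; safe_softmax is our name, not a repo function):

    #include <algorithm>
    #include <cmath>
    #include <vector>

    // softmax(x) == softmax(x - max(x)). After the shift every exponent is
    // at most 0, so exp() stays in (0, 1] and the row sum is at least 1.
    std::vector<float> safe_softmax(std::vector<float> x) {
      const float m = *std::max_element(x.begin(), x.end());
      float sum = 0.f;
      for (float& v : x) {
        v = std::exp(v - m);
        sum += v;
      }
      for (float& v : x) v /= sum;
      return x;
    }

The gSubtractMax kernel does the same work per row: blocks stride over the rows, each thread scans a strided subset of the columns, and the per-thread maxima are then combined by a tree reduction in shared memory before the subtraction pass.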
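A note on the problem that remains: as the commit message says, safe softmax alone does not fix training. Even after the shift, exp(x - max) can underflow to exact zero for low-scoring classes, and a cross-entropy loss that later evaluates log on such a probability produces -inf. Computing log-softmax in one step never forms the tiny probability at all. A minimal CPU sketch of the formula (illustrative; log_softmax is our name, and no such function exists in the repo at this point):

    #include <algorithm>
    #include <cmath>
    #include <vector>

    // log(softmax(x))_i = x_i - m - log(sum_j exp(x_j - m)), with m = max(x).
    // The sum lies in [1, n], so its log is well defined and finite.
    std::vector<float> log_softmax(std::vector<float> x) {
      const float m = *std::max_element(x.begin(), x.end());
      float sum = 0.f;
      for (float v : x) sum += std::exp(v - m);
      const float log_sum = std::log(sum);
      for (float& v : x) v = v - m - log_sum;
      return x;
    }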