diff --git a/CMakeLists.txt b/CMakeLists.txt
index 55448e2e..995bf902 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -20,3 +20,14 @@ endif(Boost_FOUND)
 
 include_directories(${marian_SOURCE_DIR}/src)
 add_subdirectory(src)
+
+# add a target to generate API documentation with Doxygen
+find_package(Doxygen)
+if(DOXYGEN_FOUND)
+  configure_file(${CMAKE_CURRENT_SOURCE_DIR}/Doxyfile.in ${CMAKE_CURRENT_BINARY_DIR}/Doxyfile @ONLY)
+  add_custom_target(doc
+    ${DOXYGEN_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/Doxyfile
+    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+    COMMENT "Generating API documentation with Doxygen" VERBATIM
+  )
+endif(DOXYGEN_FOUND)
diff --git a/Doxyfile b/Doxyfile.in
similarity index 99%
rename from Doxyfile
rename to Doxyfile.in
index 0e1a68ce..e82982ff 100644
--- a/Doxyfile
+++ b/Doxyfile.in
@@ -743,7 +743,7 @@ WARN_LOGFILE =
 # spaces.
 # Note: If this tag is empty the current directory is searched.
 
-INPUT                  = src
+INPUT                  = @CMAKE_CURRENT_SOURCE_DIR@/src
 
 # This tag can be used to specify the character encoding of the source files
 # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
diff --git a/LICENSE.md b/LICENSE.md
index 54b0725f..7ec78493 100644
--- a/LICENSE.md
+++ b/LICENSE.md
@@ -1,5 +1,3 @@
-The MIT License (MIT)
-
 Copyright (c) 2016 Marcin Junczys-Dowmunt
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
@@ -9,8 +7,8 @@ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the Software is
 furnished to do so, subject to the following conditions:
 
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
 
 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
diff --git a/README.md b/README.md
index 67685375..7026c615 100644
--- a/README.md
+++ b/README.md
@@ -29,3 +29,6 @@ Compilation with `cmake > 3.5`:
 
     cmake ..
    make -j
+
+To compile API documentation using Doxygen, first cd to the build directory, and then:
+
+    make doc
diff --git a/src/expression_graph.cu b/src/expression_graph.cu
index 52a68893..fbdaa084 100644
--- a/src/expression_graph.cu
+++ b/src/expression_graph.cu
@@ -1,8 +1,6 @@
 #include <sstream>
 #include "expression_graph.h"
 
-using namespace std;
-
 namespace marian {
 
 Expr::Expr(ExpressionGraphPtr g, Chainable<Tensor>* chainable)
@@ -32,19 +30,10 @@ Expr::operator ChainPtr() {
 
 std::string Expr::Debug() const {
-  stringstream strm;
+  std::stringstream strm;
   const Shape &shape = pimpl_->shape();
   strm << marian::Debug(shape);
   return strm.str();
 }
 
-///////////////////////////////////////////////////////
-//ExpressionGraph::ExpressionGraph(int cudaDevice)
-//: stack_(new ChainableStack)
-//{
-//  std::srand (time(NULL));
-//  cudaSetDevice(0);
-//
-//}
-
 }
diff --git a/src/expression_operators.cu b/src/expression_operators.cu
index 9f648768..59c1c52d 100644
--- a/src/expression_operators.cu
+++ b/src/expression_operators.cu
@@ -29,7 +29,7 @@ Expr operator-(Expr a) {
   return Expr(a.graph(), new NegNodeOp(a));
 };
 
-Expr softmax_fast(Expr a) {
+Expr softmax(Expr a) {
   return Expr(a.graph(), new SoftmaxNodeOp(a));
 }
diff --git a/src/expression_operators.h b/src/expression_operators.h
index 2072ddba..6a9b4e53 100644
--- a/src/expression_operators.h
+++ b/src/expression_operators.h
@@ -72,12 +72,12 @@ inline Expr sum(Expr a, Args ...args) {
 
 // inefficient
 template <typename ...Args>
-Expr softmax(Expr a, Args ...args) {
+Expr softmax_slow(Expr a, Args ...args) {
   Expr e = exp(a);
   return e / sum(e, args...);
 }
 
-Expr softmax_fast(Expr a);
+Expr softmax(Expr a);
 
 Expr argmax(Expr a);
diff --git a/src/node_operators.h b/src/node_operators.h
index 1de5e37d..e7994c0a 100644
--- a/src/node_operators.h
+++ b/src/node_operators.h
@@ -163,6 +163,13 @@ struct SoftmaxNodeOp : public UnaryNodeOp {
     // For each row, the Jacobian times vector is given by:
     // J * dy = p .* (dy - avg*1)
    // where avg = p'*dy and p is the softmax output (probabilities).
+    //
+    // For more information, see sec. 2.5 of the following reference:
+    // André F. T. Martins and Ramon Astudillo.
+    // "From Softmax to Sparsemax: A Sparse Model of Attention and Multi-Label
+    // Classification." ICML 2016.
+    // http://jmlr.org/proceedings/papers/v48/martins16.pdf
+
     Tensor result(adj_.shape());
     thrust::copy(adj_.begin(), adj_.end(), result.begin());
     SubtractMean(&result, val_);
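
Note: the Jacobian-vector product documented in SoftmaxNodeOp above can be checked numerically with a small standalone sketch. This is an editor-added illustration in plain C++, not part of the patch; it uses std::vector in place of Marian's Tensor:

    // softmax_jvp_check.cpp -- verify J * dy = p .* (dy - avg), avg = p'*dy.
    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <vector>

    int main() {
      std::vector<double> x  = {1.0, 2.0, 3.0};   // logits (one row)
      std::vector<double> dy = {0.1, -0.2, 0.3};  // incoming adjoint

      // Forward: p = softmax(x), subtracting the row max for stability.
      double mx = *std::max_element(x.begin(), x.end());
      std::vector<double> p(x.size());
      double sum = 0.0;
      for (size_t i = 0; i < x.size(); ++i) { p[i] = std::exp(x[i] - mx); sum += p[i]; }
      for (double& v : p) v /= sum;

      // Backward: avg = p' * dy, then dx = p .* (dy - avg).
      double avg = 0.0;
      for (size_t i = 0; i < p.size(); ++i) avg += p[i] * dy[i];
      for (size_t i = 0; i < p.size(); ++i)
        std::printf("dx[%zu] = % .6f\n", i, p[i] * (dy[i] - avg));
      return 0;
    }

The resulting dx equals (diag(p) - p*p') * dy, the full softmax Jacobian applied to dy, which is why the row-wise subtract-then-scale formulation is valid.
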
diff --git a/src/sgd.h b/src/sgd.h
index 7d3f3200..fe0470b1 100644
--- a/src/sgd.h
+++ b/src/sgd.h
@@ -14,7 +14,8 @@ class Sgd {
     graph.backprop(batchSize);
 
     for(auto& param : graph.params())
-      Element(_1 -= eta_ * _2, param.val(), param.grad());
+      Element(_1 -= eta_ * _2,
+              param.val(), param.grad());
   }
 
 private:
@@ -26,16 +27,18 @@ class Adagrad {
   Adagrad(float eta=0.1) : eta_(eta) {}
 
   void operator()(ExpressionGraph& graph, int batchSize) {
+    float fudgeFactor = 1e-6;
+    graph.backprop(batchSize);
+
     if(history_.size() < graph.params().size())
       for(auto& param : graph.params())
         history_.emplace_back(Tensor(param.grad().shape(), 0));
 
-    graph.backprop(batchSize);
-
     auto it = history_.begin();
     for(auto& param : graph.params()) {
-      Element(_1 -= eta_ / Sqrt(_2) * _3, param.val(), *it, param.grad());
       Element(_1 += _2 * _2, *it, param.grad());
+      Element(_1 -= eta_ / (fudgeFactor + Sqrt(_2)) * _3,
+              param.val(), *it, param.grad());
       it++;
     }
   }
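
Note: per parameter, the Adagrad update above is G += g*g followed by w -= eta * g / (fudgeFactor + sqrt(G)). The reordering matters: the previous code applied the step before accumulating, dividing by the square root of a still-zero history on the first update. A scalar sketch of the corrected rule (editor-added, standalone C++, illustrative names):

    // adagrad_sketch.cpp -- one-parameter Adagrad on f(w) = (w - 3)^2.
    #include <cmath>
    #include <cstdio>

    int main() {
      double w = 0.0, G = 0.0;
      const double eta = 0.1, fudgeFactor = 1e-6;  // matches the values above
      for (int step = 0; step < 5; ++step) {
        double g = 2.0 * (w - 3.0);                // gradient of (w - 3)^2
        G += g * g;                                // Element(_1 += _2 * _2, ...)
        w -= eta / (fudgeFactor + std::sqrt(G)) * g;
        std::printf("step %d: w = %.6f\n", step, w);
      }
      return 0;
    }

The fudge factor only guards against division by zero on the earliest updates; once G has grown it is negligible.
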
diff --git a/src/test.cu b/src/test.cu
index 1201ad99..8c7dfc54 100644
--- a/src/test.cu
+++ b/src/test.cu
@@ -100,10 +100,10 @@ int main(int argc, char** argv) {
   std::cerr << "Building output layer..." << std::endl;
 
   std::vector<Expr> Yp;
-  Yp.emplace_back(softmax_fast(dot(H[0], Why) + by));
+  Yp.emplace_back(softmax(dot(H[0], Why) + by));
   Expr cross_entropy = sum(Y[0] * log(Yp[0]), axis=1);
   for (int t = 1; t < num_inputs; ++t) {
-    Yp.emplace_back(softmax_fast(dot(H[t], Why) + by));
+    Yp.emplace_back(softmax(dot(H[t], Why) + by));
     cross_entropy = cross_entropy + sum(Y[t] * log(Yp[t]), axis=1);
   }
   auto graph = -mean(cross_entropy, axis=0, name="cost");
diff --git a/src/train_mnist.cu b/src/train_mnist.cu
index 2dda8fde..e726ee83 100644
--- a/src/train_mnist.cu
+++ b/src/train_mnist.cu
@@ -25,7 +25,7 @@ int main(int argc, char** argv) {
   Expr b = named(g.param(shape={1, LABEL_SIZE}), "b");
 
   auto scores = dot(x, w) + b;
-  auto lr = softmax_fast(scores);
+  auto lr = softmax(scores);
   auto cost = named(-mean(sum(y * log(lr), axis=1), axis=0), "cost");
   cerr << "lr=" << lr.Debug() << endl;
diff --git a/src/validate_encoder_decoder.cu b/src/validate_encoder_decoder.cu
index d1f54bde..2dffef14 100644
--- a/src/validate_encoder_decoder.cu
+++ b/src/validate_encoder_decoder.cu
@@ -7,17 +7,19 @@ using namespace marian;
 using namespace keywords;
 
-const int input_size = 10;
-const int output_size = 15;
-const int embedding_size = 8;
-const int hidden_size = 5;
-const int batch_size = 25;
-const int num_inputs = 8;
-const int num_outputs = 6;
-
-ExpressionGraph build_graph() {
+ExpressionGraph build_graph(int source_vocabulary_size,
+                            int target_vocabulary_size,
+                            int embedding_size,
+                            int hidden_size,
+                            int num_source_tokens,
+                            int num_target_tokens) {
   std::cerr << "Building computation graph..." << std::endl;
 
+  int input_size = source_vocabulary_size;
+  int output_size = target_vocabulary_size;
+  int num_inputs = num_source_tokens;
+  int num_outputs = num_target_tokens;
+
   ExpressionGraph g;
 
   std::vector<Expr> X, Y, H, S;
@@ -25,14 +27,14 @@
   for (int t = 0; t <= num_inputs; ++t) {
     std::stringstream ss;
     ss << "X" << t;
-    X.emplace_back(named(g.input(shape={batch_size, input_size}), ss.str()));
+    X.emplace_back(named(g.input(shape={whatevs, input_size}), ss.str()));
   }
 
   // We're including the stop symbol here.
   for (int t = 0; t <= num_outputs; ++t) {
     std::stringstream ss;
     ss << "Y" << t;
-    Y.emplace_back(named(g.input(shape={batch_size, output_size}), ss.str()));
+    Y.emplace_back(named(g.input(shape={whatevs, output_size}), ss.str()));
   }
 
   // Source embeddings.
@@ -80,10 +82,10 @@ ExpressionGraph build_graph() {
 
   // Softmax layer and cost function.
   std::vector<Expr> Yp;
-  Yp.emplace_back(named(softmax_fast(dot(h0_d, Why) + by), "pred"));
+  Yp.emplace_back(named(softmax(dot(h0_d, Why) + by), "pred"));
   Expr cross_entropy = sum(Y[0] * log(Yp[0]), axis=1);
   for (int t = 1; t <= num_outputs; ++t) {
-    Yp.emplace_back(named(softmax_fast(dot(S[t-1], Why) + by), "pred"));
+    Yp.emplace_back(named(softmax(dot(S[t-1], Why) + by), "pred"));
     cross_entropy = cross_entropy + sum(Y[t] * log(Yp[t]), axis=1);
   }
   auto cost = named(-mean(cross_entropy, axis=0), "cost");
@@ -96,30 +98,124 @@ int main(int argc, char** argv) {
 #if 1
   std::cerr << "Loading the data... ";
-  Vocab sourceVocab, targetVocab;
+  Vocab source_vocab, target_vocab;
 
   // read parallel corpus from file
-  std::fstream sourceFile("../examples/mt/dev/newstest2013.de");
-  std::fstream targetFile("../examples/mt/dev/newstest2013.en");
+  std::fstream source_file("../examples/mt/dev/newstest2013.de");
+  std::fstream target_file("../examples/mt/dev/newstest2013.en");
 
+  // Right now we're only reading the first few sentence pairs, and defining
+  // that as the batch size.
+  int batch_size = 64;
+  int num_source_tokens = -1;
+  int num_target_tokens = -1;
   std::vector<std::vector<size_t> > source_sentences, target_sentences;
-  std::string sourceLine, targetLine;
-  while (getline(sourceFile, sourceLine)) {
-    getline(targetFile, targetLine);
-    std::vector<size_t> sourceIds = sourceVocab.ProcessSentence(sourceLine);
-    std::vector<size_t> targetIds = targetVocab.ProcessSentence(targetLine);
-    source_sentences.push_back(sourceIds);
-    target_sentences.push_back(targetIds);
+  std::string source_line, target_line;
+  while (getline(source_file, source_line)) {
+    getline(target_file, target_line);
+    std::vector<size_t> source_ids = source_vocab.ProcessSentence(source_line);
+    source_ids.push_back(source_vocab.GetEOS());  // Append EOS token.
+    std::vector<size_t> target_ids = target_vocab.ProcessSentence(target_line);
+    target_ids.push_back(target_vocab.GetEOS());  // Append EOS token.
+    source_sentences.push_back(source_ids);
+    target_sentences.push_back(target_ids);
+    if (num_source_tokens < 0 || source_ids.size() > num_source_tokens) {
+      num_source_tokens = source_ids.size();
+    }
+    if (num_target_tokens < 0 || target_ids.size() > num_target_tokens) {
+      num_target_tokens = target_ids.size();
+    }
+    if (source_sentences.size() == batch_size) break;
   }
   std::cerr << "Done." << std::endl;
   std::cerr << source_sentences.size() << " sentence pairs read." << std::endl;
-  std::cerr << "Source vocabulary size: " << sourceVocab.Size() << std::endl;
-  std::cerr << "Target vocabulary size: " << targetVocab.Size() << std::endl;
-#endif
+  std::cerr << "Source vocabulary size: " << source_vocab.Size() << std::endl;
+  std::cerr << "Target vocabulary size: " << target_vocab.Size() << std::endl;
+  std::cerr << "Max source tokens: " << num_source_tokens << std::endl;
+  std::cerr << "Max target tokens: " << num_target_tokens << std::endl;
+
+  // Padding the source and target sentences.
+  for (auto &sentence : source_sentences) {
+    for (int i = sentence.size(); i < num_source_tokens; ++i) {
+      sentence.push_back(source_vocab.GetPAD());
+    }
+  }
+  for (auto &sentence : target_sentences) {
+    for (int i = sentence.size(); i < num_target_tokens; ++i) {
+      sentence.push_back(target_vocab.GetPAD());
+    }
+  }
+
+  std::cerr << "Building the encoder-decoder computation graph..." << std::endl;
 
   // Build the encoder-decoder computation graph.
-  ExpressionGraph g = build_graph();
+  int embedding_size = 50;
+  int hidden_size = 100;
+  ExpressionGraph g = build_graph(source_vocab.Size(),
+                                  target_vocab.Size(),
+                                  embedding_size,
+                                  hidden_size,
+                                  num_source_tokens-1,
+                                  num_target_tokens-1);
+
+  std::cerr << "Attaching the data to the computation graph..." << std::endl;
+
+  // Convert the data to dense one-hot vectors.
+  // TODO: make the graph handle sparse indices with a proper lookup layer.
+  for (int t = 0; t < num_source_tokens; ++t) {
+    Tensor Xt({batch_size, static_cast<int>(source_vocab.Size())});
+    std::vector<float> values(batch_size * source_vocab.Size(), 0.0);
+    int k = 0;
+    for (int i = 0; i < batch_size; ++i) {
+      values[k + source_sentences[i][t]] = 1.0;
+      k += source_vocab.Size();
+    }
+    thrust::copy(values.begin(), values.end(), Xt.begin());
+    // Attach this slice to the graph.
+    std::stringstream ss;
+    ss << "X" << t;
+    g[ss.str()] = Xt;
+  }
+
+  for (int t = 0; t < num_target_tokens; ++t) {
+    Tensor Yt({batch_size, static_cast<int>(target_vocab.Size())});
+    std::vector<float> values(batch_size * target_vocab.Size(), 0.0);
+    int k = 0;
+    for (int i = 0; i < batch_size; ++i) {
+      values[k + target_sentences[i][t]] = 1.0;
+      k += target_vocab.Size();
+    }
+    thrust::copy(values.begin(), values.end(), Yt.begin());
+    // Attach this slice to the graph.
+    std::stringstream ss;
+    ss << "Y" << t;
+    g[ss.str()] = Yt;
+  }
+
+#else
+
+  int source_vocabulary_size = 10;
+  int target_vocabulary_size = 15;
+  int embedding_size = 8;
+  int hidden_size = 5;
+  int batch_size = 25;
+  int num_source_tokens = 8;
+  int num_target_tokens = 6;
+
+  // Build the encoder-decoder computation graph.
+  ExpressionGraph g = build_graph(source_vocabulary_size,
+                                  target_vocabulary_size,
+                                  embedding_size,
+                                  hidden_size,
+                                  num_source_tokens,
+                                  num_target_tokens);
+
+  int input_size = source_vocabulary_size;
+  int output_size = target_vocabulary_size;
+  int num_inputs = num_source_tokens;
+  int num_outputs = num_target_tokens;
 
   // Generate input data (include the stop symbol).
   for (int t = 0; t <= num_inputs; ++t) {
@@ -155,6 +251,7 @@ int main(int argc, char** argv) {
     ss << "Y" << t;
     g[ss.str()] = Yt;
   }
+#endif
 
   std::cerr << "Printing the computation graph..." << std::endl;
   std::cout << g.graphviz() << std::endl;
@@ -167,6 +264,7 @@
 
   std::cerr << g["cost"].val().Debug() << std::endl;
 
+#if 0
   std::cerr << g["X0"].val().Debug() << std::endl;
   std::cerr << g["Y0"].val().Debug() << std::endl;
   std::cerr << g["Whh"].grad().Debug() << std::endl;
@@ -175,6 +273,7 @@
   std::cerr << g["by"].grad().Debug() << std::endl;
   std::cerr << g["Wxh"].grad().Debug() << std::endl;
   std::cerr << g["h0"].grad().Debug() << std::endl;
+#endif
 
   return 0;
 }
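
Note: the one-hot conversion in validate_encoder_decoder.cu above writes a single 1.0 per row into a flat batch_size x vocab_size buffer, striding by the vocabulary size. A minimal CPU-only sketch of the same layout (editor-added, illustrative names, no Marian types):

    // one_hot_sketch.cpp -- dense one-hot rows for a single time step t.
    #include <cstdio>
    #include <vector>

    int main() {
      const int batch_size = 3, vocab_size = 5;
      const int token_ids[] = {2, 0, 4};  // stands in for sentences[i][t]
      std::vector<float> values(batch_size * vocab_size, 0.0f);
      int k = 0;
      for (int i = 0; i < batch_size; ++i) {
        values[k + token_ids[i]] = 1.0f;  // one 1.0 per row
        k += vocab_size;                  // advance to the next row
      }
      for (int i = 0; i < batch_size; ++i) {
        for (int j = 0; j < vocab_size; ++j)
          std::printf("%.0f ", values[i * vocab_size + j]);
        std::printf("\n");
      }
      return 0;
    }

As the TODO in the patch notes, a lookup layer over sparse indices would avoid materializing these dense rows at all.
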
diff --git a/src/validate_mnist.cu b/src/validate_mnist.cu
index f9bc0dcf..690d6f40 100644
--- a/src/validate_mnist.cu
+++ b/src/validate_mnist.cu
@@ -32,7 +32,7 @@ ExpressionGraph build_graph() {
                init=from_vector(bData)), "b");
 
   auto probs = named(
-    softmax_fast(dot(x, w) + b), //, axis=1),
+    softmax(dot(x, w) + b), //, axis=1),
     "probs"
   );
diff --git a/src/validate_mnist_batch.cu b/src/validate_mnist_batch.cu
index d37e9ca3..76379eda 100644
--- a/src/validate_mnist_batch.cu
+++ b/src/validate_mnist_batch.cu
@@ -68,7 +68,7 @@ int main(int argc, char** argv) {
   std::cerr << "Building model...";
 
   auto layer1 = tanh(dot(x, w1) + b1);
-  auto layer2 = softmax(dot(layer1, w2) + b2, axis=1, name="layer2");
+  auto layer2 = softmax(dot(layer1, w2) + b2);
   auto predict = layer2;
 
   std::cerr << "Done." << std::endl;
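
A closing note on the softmax/softmax_slow split running through these files: the templated exp-over-sum version (now softmax_slow) is marked "inefficient" in expression_operators.h, and the naive formulation also overflows for large logits, which a fused kernel can avoid by first subtracting the row max. A scalar illustration (editor-added, standalone C++; whether Marian's SoftmaxNodeOp kernel does exactly this is not shown in the patch):

    // stable_softmax_sketch.cpp -- naive vs. max-subtracted softmax.
    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <vector>

    int main() {
      std::vector<double> logits = {1000.0, 1001.0, 1002.0};

      double naive = 0.0;
      for (double v : logits) naive += std::exp(v);  // overflows to inf
      std::printf("naive sum: %g\n", naive);

      // Subtracting the max leaves the probabilities unchanged but bounded.
      double mx = *std::max_element(logits.begin(), logits.end());
      double sum = 0.0;
      for (double v : logits) sum += std::exp(v - mx);
      for (double v : logits) std::printf("%.6f ", std::exp(v - mx) / sum);
      std::printf("\n");
      return 0;
    }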