diff --git a/README.md b/README.md
index 6bee418b..67685375 100644
--- a/README.md
+++ b/README.md
@@ -11,14 +11,15 @@ Installation
 
 Requirements:
 
-* g++ with C++14
+* g++ with C++11
 * CUDA and CuDNN
+* Boost (>= 1.56)
 
 Exporting some paths for CuDNN may be required (put it, for example, in your `.bashrc` file):
 
     export PATH=$PATH:$HOME/.local/bin:/usr/local/cuda/bin
-    export LIBRARY_PATH=$LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cudnn-5/lib64
-    export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cudnn-5/lib64
+    export LIBRARY_PATH=$LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs:/usr/local/cudnn-5/lib64
+    export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs:/usr/local/cudnn-5/lib64
     export CPATH=$CPATH:/usr/local/cudnn-5/include
 
 Compilation with `cmake > 3.5`:
diff --git a/marian/.cproject b/marian/.cproject
index 2d8c666b..48ccc0b2 100644
--- a/marian/.cproject
+++ b/marian/.cproject
@@ -56,11 +56,11 @@
 							</tool>
 						</toolChain>
 					</folderInfo>
-					<fileInfo id="com.nvidia.cuda.ide.seven_five.configuration.debug.1479727693.924444438" name="train_mnist.cu" rcbsApplicability="disable" resourcePath="src/train_mnist.cu" toolsToInvoke="nvcc.compiler.base.1979453423.2078504098">
-						<tool id="nvcc.compiler.base.1979453423.2078504098" name="NVCC Compiler" superClass="nvcc.compiler.base.1979453423"/>
+					<fileInfo id="com.nvidia.cuda.ide.seven_five.configuration.debug.1479727693.843925199" name="validate_mnist_batch.cu" rcbsApplicability="disable" resourcePath="src/validate_mnist_batch.cu" toolsToInvoke="nvcc.compiler.base.1979453423.378728796">
+						<tool id="nvcc.compiler.base.1979453423.378728796" name="NVCC Compiler" superClass="nvcc.compiler.base.1979453423"/>
 					</fileInfo>
 					<sourceEntries>
-						<entry excluding="src/train_mnist.cu|src/validate_mnist.cu|src/npz_converter.cpp" flags="VALUE_WORKSPACE_PATH|RESOLVED" kind="sourcePath" name=""/>
+						<entry excluding="src/validate_mnist_batch.cu|src/train_mnist.cu|src/validate_mnist.cu|src/npz_converter.cpp" flags="VALUE_WORKSPACE_PATH|RESOLVED" kind="sourcePath" name=""/>
 					</sourceEntries>
 				</configuration>
 			</storageModule>
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index cb121111..6dc37391 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -5,6 +5,7 @@ cuda_add_library(marian_lib
   cnpy/cnpy.cpp
   exception.cpp
   expressions.cu 
+  sgd.cu
   tensor.cu	
   tensor_operators.cu
 )
diff --git a/src/sgd.cu b/src/sgd.cu
new file mode 100644
index 00000000..26121f2f
--- /dev/null
+++ b/src/sgd.cu
@@ -0,0 +1,135 @@
+#include <algorithm>
+#include <cstdlib>
+#include <ctime>
+#include <iostream>
+#include <numeric>
+#include <vector>
+
+#include "sgd.h"
+#include "thrust_functions.h"
+
+namespace marian {
+
+// Stores non-owning pointers to the cost expression and the two input
+// expressions, a copy of the parameter list, and references to the caller's
+// training data. The referenced expressions and data vectors must outlive
+// this object.
+SGD::SGD(Expr& cost_func, Expr& inX, Expr& inY,
+    const std::vector<Expr*> params, float eta,
+    std::vector<float>& xData, size_t numFeatures,
+    std::vector<float>& yData, size_t numClasses,
+    size_t epochs, size_t batchSize)
+: cost_function_(&cost_func),
+  inX_(&inX),
+  inY_(&inY),
+  params_(params),
+  eta_(eta),
+  xData_(xData),
+  numFeatures_(numFeatures),
+  yData_(yData),
+  numClasses_(numClasses),
+  epochs_(epochs),
+  maxBatchSize_(batchSize)
+{}
+
+// Runs epochs_ passes of mini-batch gradient descent. The visiting order is
+// shuffled once, before the first epoch, and reused for every epoch. Each step
+// fills the input expressions with one full batch, calls forward/backward on
+// the cost expression and then UpdateModel(). A trailing batch with fewer than
+// maxBatchSize_ examples is skipped.
+void SGD::Run()
+{
+  if (numFeatures_ == 0 || maxBatchSize_ == 0) {
+    // Zero features would divide by zero below; a zero batch size would never
+    // advance the batch window, so the loop would not terminate.
+    std::cerr << "SGD::Run: numFeatures and batchSize must be non-zero" << std::endl;
+    return;
+  }
+
+  // Seed std::rand, which std::random_shuffle typically draws from.
+  std::srand(static_cast<unsigned>(std::time(0)));
+
+  const size_t numExamples = xData_.size() / numFeatures_;
+  // One row per batch example, matching the buffers PrepareBatch() fills:
+  // numFeatures_ columns for x and numClasses_ columns for y.
+  Tensor xt({static_cast<int>(maxBatchSize_), static_cast<int>(numFeatures_)}, 0.0f);
+  Tensor yt({static_cast<int>(maxBatchSize_), static_cast<int>(numClasses_)}, 0.0f);
+
+  const std::vector<size_t> shuffle = CreateShuffle(numExamples);
+
+  for (size_t numEpoch = 0; numEpoch < epochs_; ++numEpoch) {
+    std::cerr << "Starting epoch #" << numEpoch << std::endl;
+    size_t startId = 0;
+    size_t endId = startId + maxBatchSize_;
+
+    // "<=" so the last full batch, which ends exactly at numExamples, is used.
+    while (endId <= numExamples) {
+      PrepareBatch(startId, endId, maxBatchSize_, shuffle, xt, yt);
+      *inX_ = xt;
+      *inY_ = yt;
+
+      cost_function_->forward(maxBatchSize_);
+      cost_function_->backward();
+
+      UpdateModel();
+
+      startId += maxBatchSize_;
+      endId += maxBatchSize_;
+    }
+  }
+}
+
+// Returns the indices 0 .. numExamples-1 in shuffled order.
+std::vector<size_t> SGD::CreateShuffle(size_t numExamples) const {
+  std::vector<size_t> ret(numExamples);
+  std::iota(ret.begin(), ret.end(), 0);
+  std::random_shuffle(ret.begin(), ret.end());
+  return ret;
+}
+
+// Copies the examples selected by shuffle[startId .. endId) into two temporary
+// std::vector buffers of batchSize rows each and hands them to xt.set() /
+// yt.set(). Requires endId - startId <= batchSize; unfilled rows stay 0.
+void SGD::PrepareBatch(
+    size_t startId,
+    size_t endId,
+    size_t batchSize,
+    const std::vector<size_t> &shuffle,
+    Tensor& xt,
+    Tensor& yt) {
+  std::vector<float> x(batchSize * numFeatures_);
+  std::vector<float> y(batchSize * numClasses_);
+
+  size_t startXId = 0;
+  size_t startYId = 0;
+
+  for (size_t i = startId; i < endId; ++i) {
+    const size_t ind = shuffle[i];
+    const size_t startXDataId = ind * numFeatures_;
+    const size_t startYDataId = ind * numClasses_;
+
+    std::copy(xData_.begin() + startXDataId,
+        xData_.begin() + startXDataId + numFeatures_,
+        x.begin() + startXId);
+
+    std::copy(yData_.begin() + startYDataId,
+        yData_.begin() + startYDataId + numClasses_,
+        y.begin() + startYId);
+
+    startXId += numFeatures_;
+    startYId += numClasses_;
+  }
+
+  xt.set(x);
+  yt.set(y);
+}
+
+// Applies val = val - eta_ * grad to every parameter through Element().
+void SGD::UpdateModel() {
+  for (auto& param : params_) {
+    using namespace thrust::placeholders;
+    Element(_1 = _1 - eta_ * _2, param->val(), param->grad());
+  }
+}
+
+} // namespace marian
diff --git a/src/sgd.h b/src/sgd.h
index 0dab8df0..33364049 100644
--- a/src/sgd.h
+++ b/src/sgd.h
@@ -5,6 +5,7 @@
 
 #include "expressions.h"
 #include "thrust_functions.h"
+#include "tensor_operators.h"
 
 namespace marian {
 
@@ -14,67 +15,14 @@ class SGD {
         const std::vector<Expr*> params, float eta,
         std::vector<float>& xData, size_t numFeatures,
         std::vector<float>& yData, size_t numClasses,
-        size_t epochs, size_t batchSize)
-    : cost_function_(&cost_func),
-      inX_(&inX),
-      inY_(&inY),
-      params_(params),
-      eta_(eta),
-      xData_(xData),
-      numFeatures_(numFeatures),
-      yData_(yData),
-      numClasses_(numClasses),
-      epochs_(epochs),
-      batchSize_(batchSize)
-  {}
+        size_t epochs, size_t batchSize);
 
-    void Run() {
-      size_t numExamples = xData_.size()/ numFeatures_;
-      Tensor xt({(int)batchSize_, (int)numExamples}, 0.0f);
-      Tensor yt({(int)batchSize_, (int)numClasses_}, 0.0f);
-
-      for (size_t numEpoch = 0; numEpoch < epochs_; ++numEpoch) {
-        std::cerr << "Starting epoch #" << numEpoch << std::endl;
-        size_t startId = 0;
-        size_t endId = startId + batchSize_;
-
-        while (endId < numExamples) {
-          PrepareBatch(startId, endId, xt, yt);
-          *inX_ = xt;
-          *inY_ = yt;
-
-          cost_function_->forward(batchSize_);
-          cost_function_->backward();
-
-          UpdateModel();
-
-          startId += batchSize_;
-          endId += batchSize_;
-        }
-      }
-    }
-
-    void PrepareBatch(size_t startId, size_t endId, Tensor& xt, Tensor& yt) {
-      std::vector<float> x(xData_.begin() + startId * numFeatures_,
-                           xData_.begin() + endId * numFeatures_);
-      std::vector<float> y(yData_.begin() + startId * numClasses_,
-                           yData_.begin() + endId * numClasses_);
-
-      xt.set(x);
-      yt.set(y);
-    }
-
-    void UpdateModel() {
-      for (auto& param : params_) {
-        using namespace thrust::placeholders;
-        Element(_1 = _1 - eta_ * _2, param->val(), param->grad());
-      }
-    }
+    void Run();
 
   private:
-    std::shared_ptr<Expr> cost_function_;
-    std::shared_ptr<Expr> inX_;
-    std::shared_ptr<Expr> inY_;
+    Expr *cost_function_;
+    Expr *inX_;
+    Expr *inY_;
     std::vector<Expr*> params_;
     const float eta_;
     std::vector<float>& xData_;
@@ -82,7 +30,18 @@ class SGD {
     std::vector<float>& yData_;
     const size_t numClasses_;
     const size_t epochs_;
-    const size_t batchSize_;
+    const size_t maxBatchSize_;
+
+    std::vector<size_t> CreateShuffle(size_t numExamples) const;
+    void PrepareBatch(
+    		size_t startId,
+    		size_t endId,
+    		size_t batchSize,
+    		const std::vector<size_t> &shuffle,
+    		Tensor& xt,
+    		Tensor& yt);
+
+    void UpdateModel();
 };
 
 } // namespace marian
diff --git a/src/validate_mnist.cu b/src/validate_mnist.cu
index 9d9cdf8b..43e1fedc 100644
--- a/src/validate_mnist.cu
+++ b/src/validate_mnist.cu
@@ -21,7 +21,7 @@ int main(int argc, char** argv) {
   std::cerr << "Done." << std::endl;
 
   std::cerr << "Loading model params...";
-  NpzConverter converter("../scripts/test_model/model.npz");
+  NpzConverter converter("../scripts/test_model_single/model.npz");
 
   std::vector<float> wData, bData;
   Shape wShape, bShape;
diff --git a/src/validate_mnist_batch.cu b/src/validate_mnist_batch.cu
index ac4e7359..1c66198a 100644
--- a/src/validate_mnist_batch.cu
+++ b/src/validate_mnist_batch.cu
@@ -21,22 +21,39 @@ int main(int argc, char** argv) {
   std::cerr << "\tDone." << std::endl;
 
   std::cerr << "Loading model params...";
-  NpzConverter converter("../scripts/test_model/model.npz");
 
-  std::vector<float> wData;
-  Shape wShape;
-  converter.Load("weights", wData, wShape);
+  NpzConverter converter("../scripts/test_model_single/model.npz");
 
-  std::vector<float> bData;
-  Shape bShape;
-  converter.Load("bias", bData, bShape);
+  std::vector<float> wData1;
+  Shape wShape1;
+  converter.Load("weights1", wData1, wShape1);
+  
+  std::vector<float> bData1;
+  Shape bShape1;
+  converter.Load("bias1", bData1, bShape1);
+  
+  std::vector<float> wData2;
+  Shape wShape2;
+  converter.Load("weights2", wData2, wShape2);
+  
+  std::vector<float> bData2;
+  Shape bShape2;
+  converter.Load("bias2", bData2, bShape2);
 
-  auto initW = [wData](Tensor t) {
-    t.set(wData);
+  auto initW1 = [wData1](Tensor t) {
+    t.set(wData1);
   };
 
-  auto initB = [bData](Tensor t) {
-    t.set(bData);
+  auto initB1 = [bData1](Tensor t) {
+    t.set(bData1);
+  };
+  
+  auto initW2 = [wData2](Tensor t) {
+    t.set(wData2);
+  };
+
+  auto initB2 = [bData2](Tensor t) {
+    t.set(bData2);
   };
 
   std::cerr << "\tDone." << std::endl;
@@ -45,11 +62,15 @@ int main(int argc, char** argv) {
   auto x = input(shape={whatevs, IMAGE_SIZE}, name="X");
   auto y = input(shape={whatevs, LABEL_SIZE}, name="Y");
 
-  auto w = param(shape={IMAGE_SIZE, LABEL_SIZE}, name="W0", init=initW);
-  auto b = param(shape={1, LABEL_SIZE}, name="b0", init=initB);
+  auto w1 = param(shape={IMAGE_SIZE, 100}, name="W0", init=initW1);
+  auto b1 = param(shape={1, 100}, name="b0", init=initB1);
+  auto w2 = param(shape={100, LABEL_SIZE}, name="W1", init=initW2);
+  auto b2 = param(shape={1, LABEL_SIZE}, name="b1", init=initB2);
 
   std::cerr << "Building model...";
-  auto predict = softmax(dot(x, w) + b, axis=1, name="pred");
+  auto layer1 = tanh(dot(x, w1) + b1);
+  auto layer2 = softmax(dot(layer1, w2) + b2, axis=1, name="layer2");
+  auto predict = layer2;
 
   std::cerr << "Done." << std::endl;
 
@@ -77,6 +98,7 @@ int main(int argc, char** argv) {
         if (testLabels[startId * LABEL_SIZE + i + j]) correct = j;
         if (results[i + j] > results[i + predicted]) predicted = j;
       }
+      /*std::cerr << "CORRECT: " << correct << " PREDICTED: " << predicted << std::endl;*/
       acc += (correct == predicted);
     }