diff --git a/.gitignore b/.gitignore
index 4dfd397b..53468680 100644
--- a/.gitignore
+++ b/.gitignore
@@ -39,3 +39,4 @@ build
 
 # Examples
 examples/*/*.gz
+examples/mnist/*ubyte
diff --git a/examples/mnist/Makefile b/examples/mnist/Makefile
index 7e4e812f..051d5a60 100644
--- a/examples/mnist/Makefile
+++ b/examples/mnist/Makefile
@@ -2,10 +2,13 @@
 
 all: download
 
-download: train-images-idx3-ubyte.gz train-labels-idx1-ubyte.gz t10k-images-idx3-ubyte.gz t10k-labels-idx3-ubyte.gz
+download: train-images-idx3-ubyte train-labels-idx1-ubyte t10k-images-idx3-ubyte t10k-labels-idx1-ubyte
 
-%.gz:
-	wget http://yann.lecun.com/exdb/mnist/$*.gz -O $@
+%-ubyte: %-ubyte.gz
+	gzip -d < $^ > $@
+
+%-ubyte.gz:
+	wget http://yann.lecun.com/exdb/mnist/$*-ubyte.gz -O $@
 
 clean:
-	rm -f *.gz
+	rm -f *.gz *-ubyte
diff --git a/examples/xor/label.txt b/examples/xor/label.txt
new file mode 100644
index 00000000..d9ff83f1
--- /dev/null
+++ b/examples/xor/label.txt
@@ -0,0 +1,4 @@
+1
+1
+0
+0
diff --git a/examples/xor/train.txt b/examples/xor/train.txt
new file mode 100644
index 00000000..e6610d1d
--- /dev/null
+++ b/examples/xor/train.txt
@@ -0,0 +1,4 @@
+0 0
+1 1
+1 0
+0 1
\ No newline at end of file
diff --git a/marian/.cproject b/marian/.cproject
index 29c37771..195ef668 100644
--- a/marian/.cproject
+++ b/marian/.cproject
@@ -56,11 +56,11 @@
-
-
+
+
-
+
diff --git a/marian/.project b/marian/.project
index 215485f6..d1163076 100644
--- a/marian/.project
+++ b/marian/.project
@@ -26,94 +26,9 @@
-		<link>
-			<name>CMakeLists.txt</name>
-			<type>1</type>
-			<locationURI>PARENT-1-PROJECT_LOC/src/CMakeLists.txt</locationURI>
-		</link>
-		<link>
-			<name>compile_time_crc32.h</name>
-			<type>1</type>
-			<locationURI>PARENT-1-PROJECT_LOC/src/compile_time_crc32.h</locationURI>
-		</link>
-		<link>
-			<name>definitions.h</name>
-			<type>1</type>
-			<locationURI>PARENT-1-PROJECT_LOC/src/definitions.h</locationURI>
-		</link>
-		<link>
-			<name>exception.cpp</name>
-			<type>1</type>
-			<locationURI>PARENT-1-PROJECT_LOC/src/exception.cpp</locationURI>
-		</link>
-		<link>
-			<name>exception.h</name>
-			<type>1</type>
-			<locationURI>PARENT-1-PROJECT_LOC/src/exception.h</locationURI>
-		</link>
-		<link>
-			<name>expression_operators.h</name>
-			<type>1</type>
-			<locationURI>PARENT-1-PROJECT_LOC/src/expression_operators.h</locationURI>
-		</link>
-		<link>
-			<name>expressions.cu</name>
-			<type>1</type>
-			<locationURI>PARENT-1-PROJECT_LOC/src/expressions.cu</locationURI>
-		</link>
-		<link>
-			<name>expressions.h</name>
-			<type>1</type>
-			<locationURI>PARENT-1-PROJECT_LOC/src/expressions.h</locationURI>
-		</link>
-		<link>
-			<name>graph.h</name>
-			<type>1</type>
-			<locationURI>PARENT-1-PROJECT_LOC/src/graph.h</locationURI>
-		</link>
-		<link>
-			<name>graph_operators.h</name>
-			<type>1</type>
-			<locationURI>PARENT-1-PROJECT_LOC/src/graph_operators.h</locationURI>
-		</link>
-		<link>
-			<name>keywords.h</name>
-			<type>1</type>
-			<locationURI>PARENT-1-PROJECT_LOC/src/keywords.h</locationURI>
-		</link>
-		<link>
-			<name>marian.h</name>
-			<type>1</type>
-			<locationURI>PARENT-1-PROJECT_LOC/src/marian.h</locationURI>
-		</link>
-		<link>
-			<name>tensor.cu</name>
-			<type>1</type>
-			<locationURI>PARENT-1-PROJECT_LOC/src/tensor.cu</locationURI>
-		</link>
-		<link>
-			<name>tensor.h</name>
-			<type>1</type>
-			<locationURI>PARENT-1-PROJECT_LOC/src/tensor.h</locationURI>
-		</link>
-		<link>
-			<name>tensor_operators.cu</name>
-			<type>1</type>
-			<locationURI>PARENT-1-PROJECT_LOC/src/tensor_operators.cu</locationURI>
-		</link>
-		<link>
-			<name>tensor_operators.h</name>
-			<type>1</type>
-			<locationURI>PARENT-1-PROJECT_LOC/src/tensor_operators.h</locationURI>
-		</link>
-		<link>
-			<name>test.cu</name>
-			<type>1</type>
-			<locationURI>PARENT-1-PROJECT_LOC/src/test.cu</locationURI>
-		</link>
-		<link>
-			<name>thrust_functions.h</name>
-			<type>1</type>
-			<locationURI>PARENT-1-PROJECT_LOC/src/thrust_functions.h</locationURI>
-		</link>
+		<link>
+			<name>src</name>
+			<type>2</type>
+			<locationURI>PARENT-1-PROJECT_LOC/src</locationURI>
+		</link>
diff --git a/scripts/train_test_model.py b/scripts/train_test_model.py
new file mode 100755
index 00000000..4f3236a9
--- /dev/null
+++ b/scripts/train_test_model.py
@@ -0,0 +1,91 @@
+#!/usr/bin/env python
+
+import sys
+import os
+import numpy as np
+from keras.datasets import mnist
+from keras.utils import np_utils
+from keras.models import Sequential
+from keras.layers import Dense
+from keras.layers import Dropout
+
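+# Row-wise softmax over a batch of logits. Note: there is no max-subtraction
+# for numerical stability, so very large logits could overflow np.exp;
+# that is acceptable for this small test script.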
+def softmax(x):
+    return np.exp(x) / np.sum(np.exp(x), axis=1)[:, None]
+
+
+def baseline_model(pixels_count, classes_count):
+    model = Sequential()
+    # model.add(Dense(pixels_count, input_dim=pixels_count, init='normal', activation='relu'))
+    model.add(Dense(classes_count, input_dim=pixels_count, init='normal', activation='softmax'))
+    model.compile(loss='categorical_crossentropy', optimizer='adam',
+                  metrics=['accuracy'])
+    return model
+
+
+if __name__ == "__main__":
+    ### Load the training set from MNIST
+
+    (X_train, y_train), (X_test, y_test) = mnist.load_data()
+
+    ### Flatten pictures into vectors
+
+    pixels_count = X_train.shape[1] * X_train.shape[2]
+    X_train = X_train.reshape(X_train.shape[0], pixels_count).astype('float32')
+    print "X shape: ", X_train.shape
+
+    X_test = X_test.reshape(X_test.shape[0], pixels_count).astype('float32')
+
+    ### Normalize data to (0, 1)
+
+    X_train = X_train / 255
+    X_test = X_test / 255
+
+    ### Change classes to one-hot encoding matrices
+
+    y_train = np_utils.to_categorical(y_train)
+    classes_count = y_train.shape[1]
+    print "Y shape: ", y_train.shape
+
+    y_test = np_utils.to_categorical(y_test)
+
+    # Train weight matrix
+
+    # Build the model
+    model = baseline_model(pixels_count, classes_count)
+    # Fit the model
+    model.fit(X_train, y_train, validation_data=(X_test, y_test), nb_epoch=10, batch_size=200, verbose=2)
+    # Final evaluation of the model
+    scores = model.evaluate(X_test, y_test, verbose=0)
+    print("Baseline Error: %.2f%%" % (100 - scores[1] * 100))
+
+    ### Weight and bias matrices - we extract them from the model
+
+    # weights_ones = np.ones((pixels_count, classes_count))
+    # print weights_ones.shape
+
+    weights, bias = model.get_weights()
+    print weights.shape
+    print bias.shape
+    print bias
+
+    ### We calculate lr using softmax!
+
+    dot_out = np.dot(X_train, weights)
+    print "dot_out shape: ", dot_out.shape
+    # print dot_out[:10]
+
+    add_out = np.add(bias, dot_out)
+    print "add_out shape: ", add_out.shape
+    # print add_out[:10]
+
+    # lr = np.around(softmax(add_out), decimals=6)
+    lr = softmax(add_out)
+    print "lr shape: ", lr.shape
+    # print lr[:10]
+    # print np.count_nonzero(lr)
+
+    ### Save model to npz files
+    if not os.path.exists("test_model"):
+        os.makedirs("test_model")
+    np.savez("test_model/model", weights=weights, bias=bias)
+
+    print "Model saved! Check test_model directory"
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 244977db..49832492 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -10,6 +10,7 @@ cuda_add_executable(
   test.cu
   expressions.cu
   tensor_operators.cu
+  tensor.cu
   $
 )
diff --git a/src/expression_operators.h b/src/expression_operators.h
index 8eabbd04..3d42400f 100644
--- a/src/expression_operators.h
+++ b/src/expression_operators.h
@@ -171,6 +171,13 @@ inline Expr softmax(Expr a, Args ...args) {
   return e / sum(e, args...);
 }
 
+template <typename ...Args>
+inline Expr softmax_fast(Expr a, Args ...args) {
+  Expr e = Expr(new SoftmaxNodeOp(a, args...));
+  return e;
+}
+
+
 // inefficient
 template <typename ...Args>
 inline Expr mean(Expr a, Args ...args) {
diff --git a/src/graph_operators.h b/src/graph_operators.h
index 30456153..5a12f807 100644
--- a/src/graph_operators.h
+++ b/src/graph_operators.h
@@ -101,6 +101,30 @@ struct TanhNodeOp : public UnaryNodeOp {
   }
 };
 
+struct SoftmaxNodeOp : public UnaryNodeOp {
+  template <typename ...Args>
+  SoftmaxNodeOp(ChainPtr a, Args ...args)
+    : UnaryNodeOp(a, keywords::shape=newShape(a),
+                  args...) { }
+
+  Shape newShape(ChainPtr a) {
+    Shape shape = a->shape();
+    return shape;
+  }
+
+  void forward() {
+    // B = softmax(A).
+    val_ = a_->val();
+    Softmax(&val_);
+  }
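+
+  // TODO: the Element() update in backward() is a placeholder; it matches
+  // exp's gradient pattern, not softmax's. The correct row-wise update is
+  //   dA_i += p_i * (dB_i - sum_j dB_j * p_j),  with p = softmax(A).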
+  void backward() {
+    // TODO
+    Element(_1 += _2 * Exp(_3),
+            a_->grad(), adj_, a_->val());
+  }
+};
+
 struct LogNodeOp : public UnaryNodeOp {
   template <typename ...Args>
   LogNodeOp(Args ...args)
diff --git a/src/mnist.h b/src/mnist.h
new file mode 100644
index 00000000..43931ed2
--- /dev/null
+++ b/src/mnist.h
@@ -0,0 +1,105 @@
+//#pragma once
+
+#include <cassert>
+#include <fstream>
+#include <iostream>
+#include <stdexcept>
+#include <vector>
+
+namespace datasets {
+namespace mnist {
+
+typedef unsigned char uchar;
+
+
+const size_t IMAGE_MAGIC_NUMBER = 2051;
+const size_t LABEL_MAGIC_NUMBER = 2049;
+
+auto reverseInt = [](int i) {
+  unsigned char c1, c2, c3, c4;
+  c1 = i & 255, c2 = (i >> 8) & 255, c3 = (i >> 16) & 255, c4 = (i >> 24) & 255;
+  return ((int)c1 << 24) + ((int)c2 << 16) + ((int)c3 << 8) + c4;
+};
+
+std::vector<float> ReadImages(const std::string& full_path, int& number_of_images, int imgSize) {
+  std::ifstream file(full_path);
+
+  if (! file.is_open())
+    throw std::runtime_error("Cannot open file `" + full_path + "`!");
+
+  int magic_number = 0;
+  file.read((char *)&magic_number, sizeof(magic_number));
+  magic_number = reverseInt(magic_number);
+
+  if (magic_number != IMAGE_MAGIC_NUMBER)
+    throw std::runtime_error("Invalid MNIST image file!");
+
+  int n_rows = 0;
+  int n_cols = 0;
+  file.read((char *)&number_of_images, sizeof(number_of_images)), number_of_images = reverseInt(number_of_images);
+  file.read((char *)&n_rows, sizeof(n_rows)), n_rows = reverseInt(n_rows);
+  file.read((char *)&n_cols, sizeof(n_cols)), n_cols = reverseInt(n_cols);
+
+  assert(n_rows * n_cols == imgSize);
+
+  int n = number_of_images * imgSize;
+  std::vector<float> _dataset(n);
+
+  for (int i = 0; i < n; i++) {
+    unsigned char pixel = 0;
+    file.read((char*)&pixel, sizeof(pixel));
+    _dataset[i] = pixel / 255.0f;
+  }
+  return _dataset;
+}
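+
+// The IDX label file stores one byte per example; each label byte is
+// expanded below into a one-hot row of length label_size.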
+std::vector<float> ReadLabels(const std::string& full_path, int& number_of_labels, int label_size) {
+  std::ifstream file(full_path);
+
+  if (! file.is_open())
+    throw std::runtime_error("Cannot open file `" + full_path + "`!");
+
+  int magic_number = 0;
+  file.read((char *)&magic_number, sizeof(magic_number));
+  magic_number = reverseInt(magic_number);
+
+  if (magic_number != LABEL_MAGIC_NUMBER)
+    throw std::runtime_error("Invalid MNIST label file!");
+
+  file.read((char *)&number_of_labels, sizeof(number_of_labels)), number_of_labels = reverseInt(number_of_labels);
+
+  int n = number_of_labels * label_size;
+  std::vector<float> _dataset(n, 0.0f);
+
+  for (int i = 0; i < number_of_labels; i++) {
+    unsigned char label;
+    file.read((char*)&label, 1);
+    _dataset[(i * label_size) + (int)(label)] = 1.0f;
+  }
+
+  return _dataset;
+}
+
+} // namespace mnist
+} // namespace datasets
+
+
+//int main(int argc, const char *argv[]) {
+  //int numImg = 0;
+  //auto images = datasets::mnist::ReadImages("../examples/mnist/t10k-images-idx3-ubyte", numImg);
+  //auto labels = datasets::mnist::ReadLabels("../examples/mnist/t10k-labels-idx1-ubyte", numImg);
+
+  //std::cout << "Number of images: " << numImg << std::endl;
+
+  //for (int i = 0; i < 3; i++) {
+    //for (int j = 0; j < datasets::mnist::IMAGE_SIZE; j++) {
+      //std::cout << images[(i * datasets::mnist::IMAGE_SIZE) + j] << ",";
+    //}
+    //std::cout << "\nlabels= ";
+    //for (int k = 0; k < 10; k++) {
+      //std::cout << labels[(i * 10) + k] << ",";
+    //}
+    //std::cout << std::endl;
+  //}
+  //return 0;
+//}
diff --git a/src/tensor.cu b/src/tensor.cu
new file mode 100644
index 00000000..6048bb43
--- /dev/null
+++ b/src/tensor.cu
@@ -0,0 +1,93 @@
+#include <fstream>
+#include <numeric>
+#include <sstream>
+#include "tensor.h"
+
+using namespace std;
+
+namespace marian {
+
+inline std::vector<std::string> Tokenize(const std::string& str,
+                                         const std::string& delimiters = " \t")
+{
+  std::vector<std::string> tokens;
+  // Skip delimiters at beginning.
+  std::string::size_type lastPos = str.find_first_not_of(delimiters, 0);
+  // Find first "non-delimiter".
+  std::string::size_type pos = str.find_first_of(delimiters, lastPos);
+
+  while (std::string::npos != pos || std::string::npos != lastPos) {
+    // Found a token, add it to the vector.
+    tokens.push_back(str.substr(lastPos, pos - lastPos));
+    // Skip delimiters. Note the "not_of"
+    lastPos = str.find_first_not_of(delimiters, pos);
+    // Find next "non-delimiter"
+    pos = str.find_first_of(delimiters, lastPos);
+  }
+
+  return tokens;
+}
+
+//! convert a string to a variable of type T. Used for reading floats, ints etc. from files
+template<typename T>
+T Scan(const std::string &input)
+{
+  std::stringstream stream(input);
+  T ret;
+  stream >> ret;
+  return ret;
+}
+
+//! convert a vector of strings to a vector of type T variables
+template<typename T>
+inline std::vector<T> Scan(const std::vector< std::string > &input)
+{
+  std::vector<T> output(input.size());
+  for (size_t i = 0 ; i < input.size() ; i++) {
+    output[i] = Scan<T>( input[i] );
+  }
+  return output;
+}
+
+//! tokenise an input string into a vector of type T
+template<typename T>
+inline std::vector<T> Tokenize( const std::string &input
+                              , const std::string& delimiters = " \t")
+{
+  std::vector<std::string> stringVector = Tokenize(input, delimiters);
+  return Scan<T>( stringVector );
+}
+
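+// Load a tensor from a plain-text file: one line per row, with values
+// separated by spaces or tabs, filled into the tensor in row-major order.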
+void Tensor::Load(const std::string &path)
+{
+  size_t totSize = std::accumulate(pimpl_->shape().begin(), pimpl_->shape().end(),
+                                   1, std::multiplies<int>());
+  cerr << "totSize=" << totSize << endl;
+  std::vector<float> hostData(totSize);
+
+  fstream strm;
+  strm.open(path.c_str());
+
+  string line;
+  size_t ind = 0;
+  while ( getline (strm, line) )
+  {
+    cerr << line << '\n';
+    vector<float> toks = Tokenize<float>(line);
+    for (size_t i = 0; i < toks.size(); ++i) {
+      hostData[ind] = toks[i];
+      ++ind;
+    }
+  }
+  strm.close();
+
+  Load(hostData);
+}
+
+void Tensor::Load(const std::vector<float> &values)
+{
+  pimpl_->set(values);
+}
+
+}
diff --git a/src/tensor.h b/src/tensor.h
index cd4f642c..6792e9ac 100644
--- a/src/tensor.h
+++ b/src/tensor.h
@@ -152,6 +152,14 @@
     thrust::fill(data_.begin(), data_.end(), value);
   }
 
+  void set(const std::vector<float> &values) {
+    size_t totSize = std::accumulate(shape().begin(), shape().end(),
+                                     1, std::multiplies<int>());
+    std::cerr << "tensor size=" << totSize << " vector size=" << values.size() << std::endl;
+    assert(totSize == values.size());
+    thrust::copy(values.begin(), values.end(), data_.begin());
+  }
+
   std::string Debug() const
   {
     std::stringstream strm;
@@ -240,6 +248,16 @@
     return pimpl_->Debug();
   }
 
+  void Print() const {
+    for (int i = 0; i < size(); ++i) {
+      std::cerr << (*this)[i] << " ";
+    }
+    std::cerr << std::endl;
+  }
+
+  void Load(const std::string &path);
+  void Load(const std::vector<float> &values);
+
 };
 
 }
diff --git a/src/tensor_operators.cu b/src/tensor_operators.cu
index a8f72893..2d1d541d 100644
--- a/src/tensor_operators.cu
+++ b/src/tensor_operators.cu
@@ -2,6 +2,53 @@
 
 namespace marian {
 
+// Row-wise softmax, in place: each row of the rows x cols matrix is
+// exponentiated and then normalized by its row sum (no max-subtraction).
+__global__ void gSoftMax(float* softMaxP, size_t rows, size_t cols) {
+  for(int bid = 0; bid < rows; bid += gridDim.x) {
+    int j = bid + blockIdx.x;
+    if(j < rows) {
+      extern __shared__ float _share[];
+      float* _sum = _share + blockDim.x;
+      float* sp = softMaxP + j * cols;
+      _sum[threadIdx.x] = 0.0;
+      for(int tid = 0; tid < cols; tid += blockDim.x) {
+        int id = tid + threadIdx.x;
+        if(id < cols) {
+          sp[id] = __expf(sp[id]);
+          _sum[threadIdx.x] += sp[id];
+        }
+      }
+      __syncthreads();
+      int len = blockDim.x;
+      while(len != 1) {
+        __syncthreads();
+        int skip = (len + 1) >> 1;
+        if(threadIdx.x < (len >> 1))
+          _sum[threadIdx.x] += _sum[threadIdx.x + skip];
+        len = (len + 1) >> 1;
+      }
+      __syncthreads();
+      for(int tid = 0; tid < cols; tid += blockDim.x){
+        int id = tid + threadIdx.x;
+        if(id < cols)
+          sp[id] /= _sum[0];
+      }
+    }
+  }
+}
+
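+// Launch configuration: one block per row (capped at MAX_BLOCKS) and one
+// thread per column (capped at MAX_THREADS); the dynamic shared memory
+// provides 2 * threads floats, whose upper half holds the per-thread
+// partial sums for the reduction in gSoftMax.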
+void Softmax(Tensor* Out) {
+  size_t m = Out->shape()[0];
+  size_t k = Out->shape()[1];
+
+  int blocks = std::min(MAX_BLOCKS, (int) m);
+  int threads = std::min(MAX_THREADS, (int) k);
+  int shared = sizeof(float) * threads * 2;
+  gSoftMax<<<blocks, threads, shared>>>(Out->data(), m, k);
+  cudaStreamSynchronize(0);
+}
+
 Tensor Prod(cublasHandle_t handle, Tensor C, const Tensor A, const Tensor B,
             bool transA, bool transB, Float beta) {
   Float alpha = 1.0;
diff --git a/src/tensor_operators.h b/src/tensor_operators.h
index 7ec4ca68..a0c30104 100644
--- a/src/tensor_operators.h
+++ b/src/tensor_operators.h
@@ -142,6 +142,10 @@ void Element(Functor functor,
   cudaStreamSynchronize(0);
 }
 
+__global__ void gSoftMax(float* softMaxP, size_t rows, size_t cols);
+
+void Softmax(Tensor* Out);
+
 Tensor Prod(cublasHandle_t handle, Tensor C, const Tensor A, const Tensor B,
             bool transA, bool transB, Float beta);
diff --git a/src/test.cu b/src/test.cu
index b5dbba18..a71939b4 100644
--- a/src/test.cu
+++ b/src/test.cu
@@ -1,36 +1,123 @@
 #include "marian.h"
+#include "mnist.h"
 
 using namespace std;
 
 int main(int argc, char** argv) {
+  /*int numImg = 0;*/
+  /*auto images = datasets::mnist::ReadImages("../examples/mnist/t10k-images-idx3-ubyte", numImg);*/
+  /*auto labels = datasets::mnist::ReadLabels("../examples/mnist/t10k-labels-idx1-ubyte", numImg);*/
 
   using namespace marian;
   using namespace keywords;
 
-  Expr x = input(name="X");
-  Expr y = input(name="Y");
-
-  Expr w = param(shape={784, 10}, name="W0");
-  Expr b = param(shape={1, 10}, name="b0");
-
-  Expr pred = softmax(dot(x, w) + b, axis=1);
-  cerr << "lr=" << pred.Debug() << endl;
+  const size_t IMAGE_SIZE = 784;
+  const size_t LABEL_SIZE = 10;
 
-  Expr graph = -mean(sum(y * log(pred), axis=1),
-                     axis=0, name="cost");
+  Expr x = input(shape={whatevs, IMAGE_SIZE}, name="X");
+  Expr y = input(shape={whatevs, LABEL_SIZE}, name="Y");
 
-  Tensor tx({500, 784}, 1);
-  Tensor ty({500, 10}, 1);
+  Expr w = param(shape={IMAGE_SIZE, LABEL_SIZE}, name="W0");
+  Expr b = param(shape={1, LABEL_SIZE}, name="b0");
 
+  auto scores = dot(x, w) + b;
+  auto lr = softmax_fast(scores, axis=1, name="pred");
+  auto graph = -mean(sum(y * log(lr), axis=1), axis=0, name="cost");
+  cerr << "lr=" << lr.Debug() << endl;
+
+#if 0
+  int numofdata;
+  vector<float> images = datasets::mnist::ReadImages("../examples/mnist/t10k-images-idx3-ubyte", numofdata, IMAGE_SIZE);
+  vector<float> labels = datasets::mnist::ReadLabels("../examples/mnist/t10k-labels-idx1-ubyte", numofdata, LABEL_SIZE);
+  cerr << "images=" << images.size() << " labels=" << labels.size() << endl;
+  cerr << "numofdata=" << numofdata << endl;
+
+  Tensor tx({numofdata, IMAGE_SIZE}, 1);
+  Tensor ty({numofdata, LABEL_SIZE}, 1);
+
+  tx.Load(images);
+  ty.Load(labels);
+
   cerr << "tx=" << tx.Debug() << endl;
   cerr << "ty=" << ty.Debug() << endl;
+#else
+  Tensor tx({500, 784}, 1);
+  Tensor ty({500, 10}, 1);
+#endif
 
   x = tx;
   y = ty;
 
   graph.forward(500);
+
+  std::cerr << "Result: ";
+  for (auto val : scores.val().shape()) {
+    std::cerr << val << " ";
+  }
+  std::cerr << std::endl;
+  std::cerr << "Result: ";
+  for (auto val : lr.val().shape()) {
+    std::cerr << val << " ";
+  }
+  std::cerr << std::endl;
+  lr.val().Print();
+  std::cerr << "Log-likelihood: ";
+  for (auto val : graph.val().shape()) {
+    std::cerr << val << " ";
+  }
+  std::cerr << std::endl;
+  graph.val().Print();
+
   graph.backward();
+
+  //std::cerr << graph["pred"].val()[0] << std::endl;
+
+
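+  // The commented-out block below wires the same pipeline to the XOR data
+  // added above in examples/xor (train.txt / label.txt).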
+  // XOR
+  /*
+  Expr x = input(shape={whatevs, 2}, name="X");
+  Expr y = input(shape={whatevs, 2}, name="Y");
+
+  Expr w = param(shape={2, 1}, name="W0");
+  Expr b = param(shape={1, 1}, name="b0");
+
+  Expr n5 = dot(x, w);
+  Expr n6 = n5 + b;
+  Expr lr = softmax(n6, axis=1, name="pred");
+  cerr << "lr=" << lr.Debug() << endl;
+
+  Expr graph = -mean(sum(y * log(lr), axis=1), axis=0, name="cost");
+
+  Tensor tx({4, 2}, 1);
+  Tensor ty({4, 1}, 1);
+  cerr << "tx=" << tx.Debug() << endl;
+  cerr << "ty=" << ty.Debug() << endl;
+
+  tx.Load("../examples/xor/train.txt");
+  ty.Load("../examples/xor/label.txt");
+  */
+
+#if 0
+  hook0(graph);
+  graph.autodiff();
+  std::cerr << graph["cost"].val()[0] << std::endl;
+  //hook1(graph);
+  for(auto p : graph.params()) {
+    auto update = _1 = _1 - alpha * _2;
+    Element(update, p.val(), p.grad());
+  }
+  hook2(graph);
+
+  auto opt = adadelta(cost_function=cost,
+                      eta=0.9, gamma=0.1,
+                      set_batch=set,
+                      before_update=before,
+                      after_update=after,
+                      set_valid=valid,
+                      validation_freq=100,
+                      verbose=1, epochs=3, early_stopping=10);
+  opt.run();
+#endif
 
   return 0;
 }