diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 49832492..2aa3fdc9 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -3,6 +3,7 @@ include_directories(.)
 
 add_library(libcommon OBJECT
   exception.cpp
+  cnpy/cnpy.cpp
 )
 
 cuda_add_executable(
@@ -14,7 +15,25 @@ cuda_add_executable(
   $<TARGET_OBJECTS:libcommon>
 )
 
-foreach(exec marian)
+cuda_add_executable(
+  train_mnist
+  train_mnist.cu
+  expressions.cu
+  tensor_operators.cu
+  tensor.cu
+  $<TARGET_OBJECTS:libcommon>
+)
+
+cuda_add_executable(
+  validate_mnist
+  validate_mnist.cu
+  expressions.cu
+  tensor_operators.cu
+  tensor.cu
+  $<TARGET_OBJECTS:libcommon>
+)
+
+foreach(exec marian train_mnist validate_mnist)
   target_link_libraries(${exec} ${EXT_LIBS} cuda cudnn)
   cuda_add_cublas_to_target(${exec})
   set_target_properties(${exec} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}")
diff --git a/src/cnpy/LICENSE b/src/cnpy/LICENSE
new file mode 100644
index 00000000..e60eadbc
--- /dev/null
+++ b/src/cnpy/LICENSE
@@ -0,0 +1,21 @@
+The MIT License
+
+Copyright (c) Carl Rogers, 2011
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
diff --git a/src/cnpy/cnpy.cpp b/src/cnpy/cnpy.cpp
new file mode 100644
index 00000000..85978dc8
--- /dev/null
+++ b/src/cnpy/cnpy.cpp
@@ -0,0 +1,251 @@
+//Copyright (C) 2011  Carl Rogers
+//Released under MIT License
+//license available in LICENSE file, or at http://www.opensource.org/licenses/mit-license.php
+
+#include"cnpy.h"
+#include<complex>
+#include<cstdlib>
+#include<algorithm>
+#include<cstring>
+#include<iomanip>
+
+/**
+ * Probe the byte order of the host.
+ *
+ * @return '<' when the low-order byte is stored first (little endian),
+ *         '>' otherwise. create_npy_header() appends this character to the
+ *         'descr' entry it writes.
+ */
+char cnpy::BigEndianTest() {
+    const unsigned char x[] = {1,0};
+    // Copy the bytes instead of dereferencing a casted pointer: reading a
+    // short through an unsigned char array violates the aliasing rules and
+    // the array is not guaranteed to be aligned for short.
+    short y = 0;
+    memcpy(&y, x, sizeof(x) < sizeof(y) ? sizeof(x) : sizeof(y));
+    return y == 1 ? '<' : '>';
+}
+
+// Translate a C++ type into the one-character kind code used in a .npy
+// 'descr' string: 'f' floating point, 'i' signed integer (plain char is
+// counted here), 'u' unsigned integer, 'b' bool, 'c' complex.
+// Any type that is not listed yields '?'.
+char cnpy::map_type(const std::type_info& t)
+{
+    const bool is_floating =
+        t == typeid(float) || t == typeid(double) || t == typeid(long double);
+    if(is_floating) return 'f';
+
+    const bool is_signed_int =
+        t == typeid(int) || t == typeid(char) || t == typeid(short) ||
+        t == typeid(long) || t == typeid(long long);
+    if(is_signed_int) return 'i';
+
+    const bool is_unsigned_int =
+        t == typeid(unsigned char) || t == typeid(unsigned short) ||
+        t == typeid(unsigned long) || t == typeid(unsigned long long) ||
+        t == typeid(unsigned int);
+    if(is_unsigned_int) return 'u';
+
+    if(t == typeid(bool)) return 'b';
+
+    const bool is_complex =
+        t == typeid(std::complex<float>) || t == typeid(std::complex<double>) ||
+        t == typeid(std::complex<long double>);
+    return is_complex ? 'c' : '?';
+}
+
+// Specialisation for std::string: append the characters of rhs to lhs.
+// Nothing but the characters is written (no terminator, no length prefix).
+template<> std::vector<char>& cnpy::operator+=(std::vector<char>& lhs, const std::string rhs) {
+    for(std::string::const_iterator ch = rhs.begin(); ch != rhs.end(); ++ch)
+        lhs.push_back(*ch);
+    return lhs;
+}
+
+// Specialisation for C strings: append the characters of rhs, without the
+// terminating NUL, to lhs. The characters are copied in string order, so
+// byte order plays no role here.
+template<> std::vector<char>& cnpy::operator+=(std::vector<char>& lhs, const char* rhs) {
+    // A single range insert lets the vector grow once to size() + strlen(rhs);
+    // reserve(strlen(rhs)) alone would not account for the bytes already held.
+    lhs.insert(lhs.end(), rhs, rhs + strlen(rhs));
+    return lhs;
+}
+
+/**
+ * Read the header of a .npy stream and extract the array metadata.
+ *
+ * The first 11 bytes are read and discarded (the 10-byte preamble that
+ * create_npy_header() writes plus the first character of the dictionary);
+ * the rest of the header is then read up to its terminating newline
+ * (at most 255 characters).
+ *
+ * @param fp            stream positioned at the start of the .npy data
+ * @param word_size     receives the number that follows the type character in 'descr'
+ * @param shape         receives a new[]-allocated array of ndims extents;
+ *                      the caller must delete[] it
+ * @param ndims         receives the number of dimensions
+ * @param fortran_order receives the value of the 'fortran_order' entry
+ * @throws std::runtime_error if the preamble or the header line cannot be read
+ */
+void cnpy::parse_npy_header(FILE* fp, unsigned int& word_size, unsigned int*& shape, unsigned int& ndims, bool& fortran_order) {
+    char buffer[256];
+    size_t res = fread(buffer,sizeof(char),11,fp);
+    if(res != 11)
+        throw std::runtime_error("parse_npy_header: failed fread");
+    // fgets returns NULL at end of file or on error, and constructing a
+    // std::string from NULL is undefined, so check before converting.
+    if(fgets(buffer,256,fp) == NULL)
+        throw std::runtime_error("parse_npy_header: failed fgets");
+    std::string header = buffer;
+    assert(header[header.size()-1] == '\n');
+
+    int loc1, loc2;
+
+    //fortran order: the value starts 16 characters after the start of the key
+    //(13 for "fortran_order" plus 3 for "': "). Compare exactly four
+    //characters; a five character window reads "True," and never matches.
+    loc1 = header.find("fortran_order")+16;
+    fortran_order = (header.substr(loc1,4) == "True");
+
+    //shape: the text between the parentheses. "" means zero dimensions and a
+    //trailing comma ("3,") marks a one-dimensional array.
+    loc1 = header.find("(");
+    loc2 = header.find(")");
+    std::string str_shape = header.substr(loc1+1,loc2-loc1-1);
+    if(str_shape.length() == 0) ndims = 0;
+    else if(str_shape[str_shape.size()-1] == ',') ndims = 1;
+    else ndims = std::count(str_shape.begin(),str_shape.end(),',')+1;
+    shape = new unsigned int[ndims];
+    for(unsigned int i = 0;i < ndims;i++) {
+        //consume one comma separated extent per iteration
+        loc1 = str_shape.find(",");
+        shape[i] = atoi(str_shape.substr(0,loc1).c_str());
+        str_shape = str_shape.substr(loc1+1);
+    }
+
+    //endian, word size, data type
+    //byte order code | stands for not applicable.
+    //not sure when this applies except for byte array
+    loc1 = header.find("descr")+9;
+    bool littleEndian = (header[loc1] == '<' || header[loc1] == '|');
+    assert(littleEndian);
+
+    //char type = header[loc1+1];
+    //assert(type == map_type(T));
+
+    //the word size is the number between the type character and the closing quote
+    std::string str_ws = header.substr(loc1+2);
+    loc2 = str_ws.find("'");
+    word_size = atoi(str_ws.substr(0,loc2).c_str());
+}
+
+/**
+ * Read the last 22 bytes of a zip archive (the footer that npz_save()
+ * writes) and report the record count and the size and offset of the
+ * global header block.
+ *
+ * The disk numbers, the per-disk record count and the comment length are
+ * only verified with assert().
+ *
+ * @param fp                    open archive; its position is moved to the end
+ * @param nrecs                 receives the total number of records
+ * @param global_header_size    receives the byte size of the global headers
+ * @param global_header_offset  receives the file offset of the global headers
+ * @throws std::runtime_error if fewer than 22 bytes could be read
+ */
+void cnpy::parse_zip_footer(FILE* fp, unsigned short& nrecs, unsigned int& global_header_size, unsigned int& global_header_offset)
+{
+    std::vector<char> eocd(22);
+    fseek(fp,-22,SEEK_END);
+    if(fread(&eocd[0],sizeof(char),22,fp) != 22)
+        throw std::runtime_error("parse_zip_footer: failed fread");
+
+    char* const base = &eocd[0];
+
+    //values handed back to the caller
+    nrecs                = *(unsigned short*) (base+10);
+    global_header_size   = *(unsigned int*)   (base+12);
+    global_header_offset = *(unsigned int*)   (base+16);
+
+    //values that are only sanity checked
+    unsigned short disk_no       = *(unsigned short*) (base+4);
+    unsigned short disk_start    = *(unsigned short*) (base+6);
+    unsigned short nrecs_on_disk = *(unsigned short*) (base+8);
+    unsigned short comment_len   = *(unsigned short*) (base+20);
+
+    assert(disk_no == 0);
+    assert(disk_start == 0);
+    assert(nrecs_on_disk == nrecs);
+    assert(comment_len == 0);
+}
+
+/**
+ * Read one complete array (header followed by the raw element data) from fp.
+ *
+ * @param fp stream positioned at the start of a .npy header
+ * @return   an NpyArray whose data buffer was allocated with new[]; the
+ *           caller releases it with NpyArray::destruct()
+ * @throws std::runtime_error if the header or the element data cannot be read
+ */
+cnpy::NpyArray load_the_npy_file(FILE* fp) {
+    unsigned int* shape;
+    unsigned int ndims, word_size;
+    bool fortran_order;
+    cnpy::parse_npy_header(fp,word_size,shape,ndims,fortran_order);
+
+    //copy the metadata into the result first so the temporary extent array
+    //can be released straight away
+    cnpy::NpyArray arr;
+    arr.word_size = word_size;
+    arr.fortran_order = fortran_order;
+    arr.shape.assign(shape,shape+ndims);
+    delete[] shape;
+
+    unsigned long long size = 1; //long long so no overflow when multiplying by word_size
+    for(size_t i = 0;i < arr.shape.size();i++) size *= arr.shape[i];
+
+    arr.data = new char[size*word_size];
+    size_t nread = fread(arr.data,word_size,size,fp);
+    if(nread != size) {
+        //nothing else would free the buffer once the exception leaves this function
+        delete[] arr.data;
+        throw std::runtime_error("load_the_npy_file: failed fread");
+    }
+    return arr;
+}
+
+/**
+ * Load every array stored in the .npz archive fname.
+ *
+ * Entries are read one after another until a 30-byte header no longer has
+ * the bytes 0x03 0x04 at offsets 2 and 3, which is taken as the start of the
+ * global header. The last four characters of each entry name (".npy") are
+ * removed and the remainder becomes the map key.
+ *
+ * The process is aborted when the file cannot be opened.
+ *
+ * @return map from variable name to array; the caller frees the buffers
+ *         with npz_t::destruct()
+ * @throws std::runtime_error on a short read or an entry name shorter than
+ *         four characters
+ */
+cnpy::npz_t cnpy::npz_load(std::string fname) {
+    FILE* fp = fopen(fname.c_str(),"rb");
+
+    //abort explicitly, as the other loaders do: an assert() disappears under
+    //NDEBUG and would let the NULL stream reach fread
+    if(!fp) {
+        printf("npz_load: Error! Unable to open file %s!\n",fname.c_str());
+        abort();
+    }
+
+    cnpy::npz_t arrays;
+
+    try {
+        while(1) {
+            std::vector<char> local_header(30);
+            size_t headerres = fread(&local_header[0],sizeof(char),30,fp);
+            if(headerres != 30)
+                throw std::runtime_error("npz_load: failed fread");
+
+            //if we've reached the global header, stop reading
+            if(local_header[2] != 0x03 || local_header[3] != 0x04) break;
+
+            //read in the variable name
+            unsigned short name_len = *(unsigned short*) &local_header[26];
+            std::string varname(name_len,' ');
+            size_t vname_res = fread(&varname[0],sizeof(char),name_len,fp);
+            if(vname_res != name_len)
+                throw std::runtime_error("npz_load: failed fread");
+
+            //erase the lagging .npy; a shorter name cannot carry that suffix
+            if(name_len < 4)
+                throw std::runtime_error("npz_load: entry name shorter than \".npy\"");
+            varname.erase(varname.end()-4,varname.end());
+
+            //read in the extra field
+            unsigned short extra_field_len = *(unsigned short*) &local_header[28];
+            if(extra_field_len > 0) {
+                std::vector<char> buff(extra_field_len);
+                size_t efield_res = fread(&buff[0],sizeof(char),extra_field_len,fp);
+                if(efield_res != extra_field_len)
+                    throw std::runtime_error("npz_load: failed fread");
+            }
+
+            //load into a local first so a failed read leaves the map untouched
+            cnpy::NpyArray arr = load_the_npy_file(fp);
+            arrays[varname] = arr;
+        }
+    }
+    catch(...) {
+        //release the buffers loaded so far and the file handle, then pass the error on
+        arrays.destruct();
+        fclose(fp);
+        throw;
+    }
+
+    fclose(fp);
+    return arrays;
+}
+
+/**
+ * Load the single array named varname from the .npz archive fname.
+ *
+ * Entries are scanned in file order. For an entry whose name (minus its
+ * last four characters, ".npy") differs from varname, the number of bytes
+ * given by the 32-bit field at offset 22 of its header is skipped.
+ *
+ * The process is aborted when the file cannot be opened or when no entry
+ * matches varname.
+ *
+ * @return the array; the caller frees its buffer with NpyArray::destruct()
+ * @throws std::runtime_error on a short read or an entry name shorter than
+ *         four characters
+ */
+cnpy::NpyArray cnpy::npz_load(std::string fname, std::string varname) {
+    FILE* fp = fopen(fname.c_str(),"rb");
+
+    if(!fp) {
+        printf("npz_load: Error! Unable to open file %s!\n",fname.c_str());
+        abort();
+    }
+
+    try {
+        while(1) {
+            std::vector<char> local_header(30);
+            size_t header_res = fread(&local_header[0],sizeof(char),30,fp);
+            if(header_res != 30)
+                throw std::runtime_error("npz_load: failed fread");
+
+            //if we've reached the global header, stop reading
+            if(local_header[2] != 0x03 || local_header[3] != 0x04) break;
+
+            //read in the variable name
+            unsigned short name_len = *(unsigned short*) &local_header[26];
+            std::string vname(name_len,' ');
+            size_t vname_res = fread(&vname[0],sizeof(char),name_len,fp);
+            if(vname_res != name_len)
+                throw std::runtime_error("npz_load: failed fread");
+
+            //erase the lagging .npy; a shorter name cannot carry that suffix
+            if(name_len < 4)
+                throw std::runtime_error("npz_load: entry name shorter than \".npy\"");
+            vname.erase(vname.end()-4,vname.end());
+
+            //read in the extra field
+            unsigned short extra_field_len = *(unsigned short*) &local_header[28];
+            fseek(fp,extra_field_len,SEEK_CUR); //skip past the extra field
+
+            if(vname == varname) {
+                NpyArray array = load_the_npy_file(fp);
+                fclose(fp);
+                return array;
+            }
+            else {
+                //skip past the data
+                unsigned int size = *(unsigned int*) &local_header[22];
+                fseek(fp,size,SEEK_CUR);
+            }
+        }
+    }
+    catch(...) {
+        //do not leave the file open when a read fails
+        fclose(fp);
+        throw;
+    }
+
+    fclose(fp);
+    printf("npz_load: Error! Variable name %s not found in %s!\n",varname.c_str(),fname.c_str());
+    abort();
+}
+
+/**
+ * Load the array stored in the .npy file fname.
+ *
+ * The process is aborted when the file cannot be opened.
+ *
+ * @return the array; the caller frees its buffer with NpyArray::destruct()
+ * @throws std::runtime_error if the header or the data cannot be read
+ */
+cnpy::NpyArray cnpy::npy_load(std::string fname) {
+
+    FILE* fp = fopen(fname.c_str(), "rb");
+
+    if(!fp) {
+        printf("npy_load: Error! Unable to open file %s!\n",fname.c_str());
+        abort();
+    }
+
+    NpyArray arr;
+    try {
+        arr = load_the_npy_file(fp);
+    }
+    catch(...) {
+        //close the handle before the error leaves this function
+        fclose(fp);
+        throw;
+    }
+
+    fclose(fp);
+    return arr;
+}
+
+
+
diff --git a/src/cnpy/cnpy.h b/src/cnpy/cnpy.h
new file mode 100644
index 00000000..b11013b9
--- /dev/null
+++ b/src/cnpy/cnpy.h
@@ -0,0 +1,241 @@
+//Copyright (C) 2011  Carl Rogers
+//Released under MIT License
+//license available in LICENSE file, or at http://www.opensource.org/licenses/mit-license.php
+
+#ifndef LIBCNPY_H_
+#define LIBCNPY_H_
+
+#include<string>
+#include<stdexcept>
+#include<sstream>
+#include<vector>
+#include<cstdio>
+#include<typeinfo>
+#include<iostream>
+#include<cassert>
+#include<zlib.h>
+#include<map>
+
+namespace cnpy {
+
+    // One array read from a .npy/.npz file: a raw byte buffer plus the
+    // metadata parsed from its header. The struct never frees `data` by
+    // itself; the owner calls destruct() when the buffer is no longer needed.
+    // Copies of an NpyArray share the same buffer.
+    struct NpyArray {
+        char* data;                      // buffer allocated with new[], or 0 when empty
+        std::vector<unsigned int> shape; // extent of each dimension
+        unsigned int word_size;          // number following the type character in 'descr'
+        bool fortran_order;              // value of the header's 'fortran_order' entry
+
+        // Start out empty so that destruct() is valid on an array that was
+        // never filled in.
+        NpyArray() : data(0), word_size(0), fortran_order(false) {}
+
+        // Release the buffer. Resetting the pointer makes a repeated call on
+        // the same object harmless.
+        void destruct() {delete[] data; data = 0;}
+    };
+    
+    // Map from variable name to array, as returned by npz_load().
+    // destruct() calls NpyArray::destruct() on every stored array; the map
+    // entries themselves are left in place.
+    struct npz_t : public std::map<std::string, NpyArray>
+    {
+        void destruct()
+        {
+            for(npz_t::iterator entry = this->begin(); entry != this->end(); ++entry)
+                entry->second.destruct();
+        }
+    };
+
+    char BigEndianTest();
+    char map_type(const std::type_info& t);
+    template<typename T> std::vector<char> create_npy_header(const T* data, const unsigned int* shape, const unsigned int ndims);
+    void parse_npy_header(FILE* fp,unsigned int& word_size, unsigned int*& shape, unsigned int& ndims, bool& fortran_order);
+    void parse_zip_footer(FILE* fp, unsigned short& nrecs, unsigned int& global_header_size, unsigned int& global_header_offset);
+    npz_t npz_load(std::string fname);
+    NpyArray npz_load(std::string fname, std::string varname);
+    NpyArray npy_load(std::string fname);
+
+    // Append the object representation of rhs to lhs, byte by byte in memory
+    // order (the result is little endian only on a little-endian host).
+    template<typename T> std::vector<char>& operator+=(std::vector<char>& lhs, const T rhs) {
+        const char* bytes = (const char*)&rhs;
+        // size_t counter: a char counter cannot count past CHAR_MAX and mixes
+        // signed and unsigned operands in the comparison with sizeof(T).
+        for(size_t byte = 0; byte < sizeof(T); byte++) {
+            lhs.push_back(bytes[byte]);
+        }
+        return lhs;
+    }
+
+    template<> std::vector<char>& operator+=(std::vector<char>& lhs, const std::string rhs); 
+    template<> std::vector<char>& operator+=(std::vector<char>& lhs, const char* rhs); 
+
+
+    // Format i with operator<< and return the resulting text. The pad and
+    // padval arguments are accepted but do not influence the output.
+    template<typename T> std::string tostring(T i, int pad = 0, char padval = ' ') {
+        std::ostringstream out;
+        out << i;
+        return out.str();
+    }
+
+    /**
+     * Write an array to fname in .npy format, or append to an existing file.
+     *
+     * @param fname file to write
+     * @param data  elements to write; shape[0]*...*shape[ndims-1] values are read
+     * @param shape extent of each of the ndims dimensions
+     * @param ndims number of dimensions
+     * @param mode  "a" appends along the first dimension when fname can be
+     *              opened for update; with any other value, or when the file
+     *              cannot be opened, a new file is written
+     * @throws std::runtime_error if the file cannot be created, or from
+     *         parse_npy_header() when appending
+     */
+    template<typename T> void npy_save(std::string fname, const T* data, const unsigned int* shape, const unsigned int ndims, std::string mode = "w") {
+        FILE* fp = NULL;
+
+        if(mode == "a") fp = fopen(fname.c_str(),"r+b");
+
+        if(fp) {
+            //file exists. we need to append to it. read the header, modify the array size
+            unsigned int word_size, tmp_dims;
+            unsigned int* tmp_shape = 0;
+            bool fortran_order;
+            parse_npy_header(fp,word_size,tmp_shape,tmp_dims,fortran_order);
+            assert(!fortran_order);
+
+            if(word_size != sizeof(T)) {
+                std::cout<<"libnpy error: "<<fname<<" has word size "<<word_size<<" but npy_save appending data sized "<<sizeof(T)<<"\n";
+                assert( word_size == sizeof(T) );
+            }
+            if(tmp_dims != ndims) {
+                std::cout<<"libnpy error: npy_save attempting to append misdimensioned data to "<<fname<<"\n";
+                assert(tmp_dims == ndims);
+            }
+
+            //every extent except the first must agree with the stored array
+            for(unsigned int i = 1; i < ndims; i++) {
+                if(shape[i] != tmp_shape[i]) {
+                    std::cout<<"libnpy error: npy_save attempting to append misshaped data to "<<fname<<"\n";
+                    assert(shape[i] == tmp_shape[i]);
+                }
+            }
+            tmp_shape[0] += shape[0];
+
+            //rewrite the header in place with the enlarged first extent, then
+            //move to the end of the file where the new elements go.
+            //NOTE(review): this relies on the new header being exactly as long
+            //as the old one; a longer header would overwrite stored data - confirm.
+            fseek(fp,0,SEEK_SET);
+            std::vector<char> header = create_npy_header(data,tmp_shape,ndims);
+            fwrite(&header[0],sizeof(char),header.size(),fp);
+            fseek(fp,0,SEEK_END);
+
+            delete[] tmp_shape;
+        }
+        else {
+            fp = fopen(fname.c_str(),"wb");
+            //fwrite/fclose on a NULL stream is undefined, so report the failure instead
+            if(!fp) throw std::runtime_error("npy_save: unable to open " + fname + " for writing");
+            std::vector<char> header = create_npy_header(data,shape,ndims);
+            fwrite(&header[0],sizeof(char),header.size(),fp);
+        }
+
+        //size_t so the element count of a large array does not wrap at 32 bits
+        size_t nels = 1;
+        for(unsigned int i = 0;i < ndims;i++) nels *= shape[i];
+
+        fwrite(data,sizeof(T),nels,fp);
+        fclose(fp);
+    }
+
+    /**
+     * Store an array as the uncompressed entry "<fname>.npy" of the zip
+     * archive zipname.
+     *
+     * @param zipname archive to write
+     * @param fname   variable name; ".npy" is appended to form the entry name
+     * @param data    elements to write; shape[0]*...*shape[ndims-1] values are read
+     * @param shape   extent of each of the ndims dimensions
+     * @param ndims   number of dimensions
+     * @param mode    "a" adds the entry to an existing archive when it can be
+     *                opened for update; with any other value, or when the
+     *                archive cannot be opened, a new archive is written
+     * @throws std::runtime_error if the archive cannot be created or its
+     *         existing global header cannot be read
+     */
+    template<typename T> void npz_save(std::string zipname, std::string fname, const T* data, const unsigned int* shape, const unsigned int ndims, std::string mode = "w")
+    {
+        //first, append a .npy to the fname
+        fname += ".npy";
+
+        //now, on with the show
+        FILE* fp = NULL;
+        unsigned short nrecs = 0;
+        unsigned int global_header_offset = 0;
+        std::vector<char> global_header;
+
+        if(mode == "a") fp = fopen(zipname.c_str(),"r+b");
+
+        if(fp) {
+            //zip file exists. we need to add a new npy file to it.
+            //first read the footer. this gives us the offset and size of the global header
+            //then read and store the global header.
+            //below, we will write the new data at the start of the global header then append the global header and footer below it
+            unsigned int global_header_size;
+            parse_zip_footer(fp,nrecs,global_header_size,global_header_offset);
+            fseek(fp,global_header_offset,SEEK_SET);
+            global_header.resize(global_header_size);
+            size_t res = fread(&global_header[0],sizeof(char),global_header_size,fp);
+            if(res != global_header_size){
+                throw std::runtime_error("npz_save: header read error while adding to existing zip");
+            }
+            fseek(fp,global_header_offset,SEEK_SET);
+        }
+        else {
+            fp = fopen(zipname.c_str(),"wb");
+            //fwrite/fclose on a NULL stream is undefined, so report the failure instead
+            if(!fp) throw std::runtime_error("npz_save: unable to open " + zipname + " for writing");
+        }
+
+        std::vector<char> npy_header = create_npy_header(data,shape,ndims);
+
+        unsigned long nels = 1;
+        for (unsigned int m=0; m<ndims; m++ ) nels *= shape[m];
+        //unsigned byte count; it is narrowed to the 32-bit zip fields below
+        const size_t nbytes = nels*sizeof(T) + npy_header.size();
+
+        //get the CRC of the data to be added
+        unsigned int crc = crc32(0L,(unsigned char*)&npy_header[0],npy_header.size());
+        crc = crc32(crc,(unsigned char*)data,nels*sizeof(T));
+
+        //build the local header
+        std::vector<char> local_header;
+        local_header += "PK"; //first part of sig
+        local_header += (unsigned short) 0x0403; //second part of sig
+        local_header += (unsigned short) 20; //min version to extract
+        local_header += (unsigned short) 0; //general purpose bit flag
+        local_header += (unsigned short) 0; //compression method
+        local_header += (unsigned short) 0; //file last mod time
+        local_header += (unsigned short) 0;     //file last mod date
+        local_header += (unsigned int) crc; //crc
+        local_header += (unsigned int) nbytes; //compressed size
+        local_header += (unsigned int) nbytes; //uncompressed size
+        local_header += (unsigned short) fname.size(); //fname length
+        local_header += (unsigned short) 0; //extra field length
+        local_header += fname;
+
+        //build global header
+        global_header += "PK"; //first part of sig
+        global_header += (unsigned short) 0x0201; //second part of sig
+        global_header += (unsigned short) 20; //version made by
+        //bytes 4..29 of the local header are repeated verbatim in the global header
+        global_header.insert(global_header.end(),local_header.begin()+4,local_header.begin()+30);
+        global_header += (unsigned short) 0; //file comment length
+        global_header += (unsigned short) 0; //disk number where file starts
+        global_header += (unsigned short) 0; //internal file attributes
+        global_header += (unsigned int) 0; //external file attributes
+        global_header += (unsigned int) global_header_offset; //relative offset of local file header, since it begins where the global header used to begin
+        global_header += fname;
+
+        //build footer
+        std::vector<char> footer;
+        footer += "PK"; //first part of sig
+        footer += (unsigned short) 0x0605; //second part of sig
+        footer += (unsigned short) 0; //number of this disk
+        footer += (unsigned short) 0; //disk where footer starts
+        footer += (unsigned short) (nrecs+1); //number of records on this disk
+        footer += (unsigned short) (nrecs+1); //total number of records
+        footer += (unsigned int) global_header.size(); //nbytes of global headers
+        footer += (unsigned int) (global_header_offset + nbytes + local_header.size()); //offset of start of global headers, since global header now starts after newly written array
+        footer += (unsigned short) 0; //zip file comment length
+
+        //write everything
+        fwrite(&local_header[0],sizeof(char),local_header.size(),fp);
+        fwrite(&npy_header[0],sizeof(char),npy_header.size(),fp);
+        fwrite(data,sizeof(T),nels,fp);
+        fwrite(&global_header[0],sizeof(char),global_header.size(),fp);
+        fwrite(&footer[0],sizeof(char),footer.size(),fp);
+        fclose(fp);
+    }
+
+    /**
+     * Build the complete .npy header (preamble plus dictionary) for an array.
+     *
+     * @param data  only used to deduce the element type T; it is never read
+     * @param shape extent of each of the ndims dimensions; not read when
+     *              ndims is 0
+     * @param ndims number of dimensions; 0 produces the empty shape "()"
+     * @return the header bytes, padded with spaces and ending in '\n'
+     */
+    template<typename T> std::vector<char> create_npy_header(const T* data, const unsigned int* shape, const unsigned int ndims) {
+
+        std::vector<char> dict;
+        dict += "{'descr': '";
+        dict += BigEndianTest();
+        dict += map_type(typeid(T));
+        dict += tostring(sizeof(T));
+        dict += "', 'fortran_order': False, 'shape': (";
+        //join the extents with ", "; looping from 0 keeps shape[0] from being
+        //read when there are no dimensions at all
+        for(unsigned int i = 0;i < ndims;i++) {
+            if(i > 0) dict += ", ";
+            dict += tostring(shape[i]);
+        }
+        //a one-dimensional shape is written with a trailing comma, e.g. "(3,)"
+        if(ndims == 1) dict += ",";
+        dict += "), }";
+        //pad with spaces so that preamble+dict is modulo 16 bytes. preamble is 10 bytes. dict needs to end with \n
+        //the remainder is always between 1 and 16, so there is room for the newline
+        const size_t remainder = 16 - (10 + dict.size()) % 16;
+        dict.insert(dict.end(),remainder,' ');
+        dict.back() = '\n';
+
+        std::vector<char> header;
+        header += (char) 0x93;
+        header += "NUMPY";
+        header += (char) 0x01; //major version of numpy format
+        header += (char) 0x00; //minor version of numpy format
+        header += (unsigned short) dict.size();
+        header.insert(header.end(),dict.begin(),dict.end());
+
+        return header;
+    }
+
+
+}
+
+#endif
diff --git a/src/definitions.h b/src/definitions.h
index ea52024e..c1f24663 100644
--- a/src/definitions.h
+++ b/src/definitions.h
@@ -5,13 +5,13 @@
 #include <functional>
 
 namespace marian {
-  typedef float Float;  
+  typedef float Float;
   typedef std::vector<int> Shape;
   const int whatevs{-1};
 }
 
 #include "keywords.h"
-#include "tensor.h"
+// #include "tensor.h"
 
 namespace marian {
   class Tensor;
diff --git a/src/expressions.cu b/src/expressions.cu
index 2d656ce1..a95b1bef 100644
--- a/src/expressions.cu
+++ b/src/expressions.cu
@@ -10,7 +10,7 @@ Expr::Expr(Chainable<Tensor>* chainable) : pimpl_(chainable) {}
 Expr::Expr(Float v) : pimpl_(new ConstantNode(keywords::value=v,
                                               keywords::shape={1,1})) {}
 
-Tensor &Expr::val() {
+Tensor Expr::val() {
   return pimpl_->val();
 }
 
diff --git a/src/expressions.h b/src/expressions.h
index 09d0edfa..43016dac 100644
--- a/src/expressions.h
+++ b/src/expressions.h
@@ -9,25 +9,25 @@ class Expr {
   public:
     Expr(Chainable<Tensor>* chainable);
     Expr(Float v);
-    
+
     Expr operator=(Tensor t) {
       pimpl_->setVal(t);
       return *this;
     }
-    
-    Tensor &val();
+
+    Tensor val();
     Tensor grad();
-    
+
     void forward(size_t batchSize);
     void backward();
-    
+
     ChainPtr node();
     operator ChainPtr();
-    
+
     std::string Debug() const;
 
   private:
-    ChainPtr pimpl_; 
+    ChainPtr pimpl_;
 };
 
 }
diff --git a/src/marian.h b/src/marian.h
index 8c987ccf..0876d4cd 100644
--- a/src/marian.h
+++ b/src/marian.h
@@ -5,4 +5,5 @@
 #include "graph_operators.h"
 #include "expressions.h"
 #include "expression_operators.h"
+#include "param_initializers.h"
 
diff --git a/src/npz_converter.cpp b/src/npz_converter.cpp
new file mode 100644
index 00000000..1ecbc11c
--- /dev/null
+++ b/src/npz_converter.cpp
@@ -0,0 +1,39 @@
+#include "common/npz_converter.h"
+
+
+
+// Load every array of the .npz archive `file` into model_ and mark the
+// buffers as not yet released.
+NpzConverter::NpzConverter(const std::string& file)
+  : model_(cnpy::npz_load(file))
+  , destructed_(false)
+{}
+
+// Release the array buffers, unless Destruct() has already done so.
+NpzConverter::~NpzConverter() {
+  if(destructed_)
+    return;
+  model_.destruct();
+}
+
+// Release the array buffers ahead of destruction. Calling it again is a
+// no-op: destructing the model a second time would free every buffer twice.
+void NpzConverter::Destruct() {
+  if(destructed_)
+    return;
+  model_.destruct();
+  destructed_ = true;
+}
+
+// Look up `key` in the loaded archive and return its contents as a matrix
+// built from a row-major float view of the stored buffer. A missing key is
+// reported on std::cerr and yields a default-constructed matrix.
+mblas::Matrix NpzConverter::operator[](const std::string& key) const {
+  typedef blaze::CustomMatrix<float, blaze::unaligned,
+    blaze::unpadded, blaze::rowMajor> BlazeWrapper;
+  mblas::Matrix matrix;
+  auto it = model_.find(key);
+  if(it != model_.end()) {
+    NpyMatrixWrapper np(it->second);
+    matrix = BlazeWrapper(np.data(), np.size1(), np.size2());
+  }
+  else {
+    std::cerr << "Missing " << key << std::endl;
+  }
+  // Return the local by name: wrapping it in std::move disables copy elision.
+  return matrix;
+}
+
+// Return the matrix stored under `key`, transposed with mblas::Trans when
+// `transpose` is true. The flag is honoured here; applying mblas::Trans
+// unconditionally would make the argument meaningless.
+mblas::Matrix NpzConverter::operator()(const std::string& key, bool transpose) const {
+  mblas::Matrix matrix = (*this)[key];
+  if(transpose)
+    mblas::Trans(matrix);
+  // Return the local by name so that copy elision stays possible.
+  return matrix;
+}
diff --git a/src/npz_converter.h b/src/npz_converter.h
new file mode 100644
index 00000000..96060cfc
--- /dev/null
+++ b/src/npz_converter.h
@@ -0,0 +1,77 @@
+#pragma once
+
+#include "cnpy/cnpy.h"
+#include "tensor.h"
+
+// Loads a .npz archive with cnpy and hands out its arrays as flat float
+// vectors together with a two-element shape.
+//
+// The converter owns the buffers of the loaded arrays: they are released by
+// Destruct() or, at the latest, by the destructor. Copies of a converter
+// share those buffers, so only one copy may be allowed to release them.
+//
+// NOTE(review): npz_converter.cpp also defines the constructor, destructor
+// and Destruct() out of line; if that file is compiled against this header
+// the definitions clash - confirm which set is meant to stay.
+class NpzConverter {
+  private:
+    // Read-only view of an NpyArray as a matrix of floats with size1() rows
+    // and size2() columns. It keeps a reference, so the array must outlive it.
+    class NpyMatrixWrapper {
+      public:
+        NpyMatrixWrapper(const cnpy::NpyArray& npy)
+        : npy_(npy) {}
+
+        // Total number of elements.
+        size_t size() const {
+          return size1() * size2();
+        }
+
+        // The raw buffer reinterpreted as floats.
+        float* data() const {
+          return (float*)npy_.data;
+        }
+
+        // Element (i, j), with rows stored one after another.
+        float operator()(size_t i, size_t j) const {
+          return ((float*)npy_.data)[i * size2() + j];
+        }
+
+        // First extent; reads shape[0], so the array needs at least one dimension.
+        size_t size1() const {
+          return npy_.shape[0];
+        }
+
+        // Second extent; a one-dimensional array counts as a single column.
+        size_t size2() const {
+          if(npy_.shape.size() == 1)
+            return 1;
+          else
+            return npy_.shape[1];
+        }
+
+      private:
+        const cnpy::NpyArray& npy_;
+    };
+
+  public:
+    // Load every array of the archive `file`.
+    NpzConverter(const std::string& file)
+      : model_(cnpy::npz_load(file)),
+        destructed_(false) {
+      }
+
+    // Release the buffers unless Destruct() already did.
+    ~NpzConverter() {
+      if(!destructed_)
+        model_.destruct();
+    }
+
+    // Release the buffers ahead of destruction. A repeated call is a no-op;
+    // destructing the model a second time would free every buffer twice.
+    void Destruct() {
+      if(destructed_)
+        return;
+      model_.destruct();
+      destructed_ = true;
+    }
+
+    // Copy the array stored under `key` into `data` and write its two
+    // extents to `shape`. When the key is missing a message is printed to
+    // std::cerr and both outputs are left unchanged.
+    void Load(const std::string& key, std::vector<float>& data, marian::Shape& shape) const {
+      auto it = model_.find(key);
+      if(it != model_.end()) {
+        NpyMatrixWrapper np(it->second);
+        data.clear();
+        data.resize(np.size());
+        std::copy(np.data(), np.data() + np.size(), data.begin());
+
+        shape.clear();
+        shape.push_back(np.size1());
+        shape.push_back(np.size2());
+
+      }
+      else {
+        std::cerr << "Missing " << key << std::endl;
+      }
+    }
+
+  private:
+    cnpy::npz_t model_;  // arrays loaded from the archive, keyed by name
+    bool destructed_;    // true once the buffers in model_ have been released
+};
diff --git a/src/param_initializers.h b/src/param_initializers.h
new file mode 100644
index 00000000..ab781064
--- /dev/null
+++ b/src/param_initializers.h
@@ -0,0 +1,34 @@
+#pragma once
+
+#include <random>
+#include <algorithm>
+#include <iterator>
+#include <functional>
+
+#include "tensor.h"
+
+namespace marian {
+
+// Overwrite every element of t with 0. The values are staged in a host
+// vector of t.size() floats and copied to t.begin().
+// Declared inline because the definition lives in a header: without it,
+// every translation unit that includes this file emits its own external
+// definition and linking them together fails.
+inline void zeros(Tensor t) {
+  std::vector<float> vals(t.size(), 0.0f);
+  thrust::copy(vals.begin(), vals.end(), t.begin());
+}
+
+// Overwrite every element of t with 1. The values are staged in a host
+// vector of t.size() floats and copied to t.begin().
+// Declared inline because the definition lives in a header: without it,
+// every translation unit that includes this file emits its own external
+// definition and linking them together fails.
+inline void ones(Tensor t) {
+  std::vector<float> vals(t.size(), 1.0f);
+  thrust::copy(vals.begin(), vals.end(), t.begin());
+}
+
+// Fill t with values drawn from std::uniform_real_distribution<>(0, 1).
+// A fresh engine is seeded from std::random_device on every call, so the
+// values differ between calls and between runs.
+// Declared inline because the definition lives in a header: without it,
+// every translation unit that includes this file emits its own external
+// definition and linking them together fails.
+inline void randreal(Tensor t) {
+  std::random_device device;
+  std::default_random_engine engine(device());
+  std::uniform_real_distribution<> dist(0, 1);
+  auto gen = std::bind(dist, engine);
+
+  // Generate on the host, then copy the whole buffer into the tensor.
+  std::vector<float> vals(t.size());
+  std::generate(begin(vals), end(vals), gen);
+
+  thrust::copy(vals.begin(), vals.end(), t.begin());
+}
+
+} // namespace marian
diff --git a/src/sgd.h b/src/sgd.h
new file mode 100644
index 00000000..298cd358
--- /dev/null
+++ b/src/sgd.h
@@ -0,0 +1,88 @@
+#pragma once
+
+#include <memory>
+#include <iostream>
+
+#include "expressions.h"
+#include "thrust_functions.h"
+
+namespace marian {
+
+/**
+ * Mini-batch stochastic gradient descent driver.
+ *
+ * Nothing handed to the constructor is owned by the optimizer: the
+ * expressions, the expressions behind the parameter pointers and the two
+ * data vectors must stay alive for as long as the SGD object is used.
+ */
+class SGD {
+  public:
+    /**
+     * @param cost_func   expression on which forward()/backward() is run per batch
+     * @param inX         expression that is assigned the batch inputs
+     * @param inY         expression that is assigned the batch targets
+     * @param params      expressions updated as val = val - eta * grad after each batch
+     * @param eta         learning rate
+     * @param xData       inputs, numFeatures consecutive values per example
+     * @param numFeatures number of values per example in xData
+     * @param yData       targets, numClasses consecutive values per example
+     * @param numClasses  number of values per example in yData
+     * @param epochs      number of passes over the data
+     * @param batchSize   number of examples per batch
+     */
+    SGD(Expr& cost_func, Expr& inX, Expr& inY,
+        const std::vector<Expr*> params, float eta,
+        std::vector<float>& xData, size_t numFeatures,
+        std::vector<float>& yData, size_t numClasses,
+        size_t epochs, size_t batchSize)
+    : cost_function_(&cost_func),
+      inX_(&inX),
+      inY_(&inY),
+      params_(params),
+      eta_(eta),
+      xData_(xData),
+      numFeatures_(numFeatures),
+      yData_(yData),
+      numClasses_(numClasses),
+      epochs_(epochs),
+      batchSize_(batchSize)
+  {}
+
+    /**
+     * Train for epochs_ passes. Each pass walks the examples in order in
+     * steps of batchSize_; examples left over after the last full batch are
+     * not used.
+     */
+    void Run() {
+      size_t numExamples = xData_.size()/ numFeatures_;
+      // One row per example of the batch and one column per feature (resp.
+      // class), matching the batchSize_ * numFeatures_ and
+      // batchSize_ * numClasses_ values that PrepareBatch loads.
+      Tensor xt({(int)batchSize_, (int)numFeatures_}, 0.0f);
+      Tensor yt({(int)batchSize_, (int)numClasses_}, 0.0f);
+
+      for (size_t numEpoch = 0; numEpoch < epochs_; ++numEpoch) {
+        std::cerr << "Starting epoch #" << numEpoch << std::endl;
+        size_t startId = 0;
+        size_t endId = startId + batchSize_;
+
+        // endId is exclusive, so a batch that ends exactly at numExamples
+        // is still a complete batch and must be processed.
+        while (endId <= numExamples) {
+          PrepareBatch(startId, endId, xt, yt);
+          *inX_ = xt;
+          *inY_ = yt;
+
+          cost_function_->forward(batchSize_);
+          cost_function_->backward();
+
+          UpdateModel();
+
+          startId += batchSize_;
+          endId += batchSize_;
+        }
+      }
+    }
+
+    /**
+     * Copy the inputs and targets of examples [startId, endId) into xt and yt.
+     */
+    void PrepareBatch(size_t startId, size_t endId, Tensor& xt, Tensor& yt) {
+      std::vector<float> x(xData_.begin() + startId * numFeatures_,
+                           xData_.begin() + endId * numFeatures_);
+      std::vector<float> y(yData_.begin() + startId * numClasses_,
+                           yData_.begin() + endId * numClasses_);
+
+      xt.Load(x);
+      yt.Load(y);
+    }
+
+    /**
+     * Apply one gradient step, val = val - eta_ * grad, to every parameter.
+     */
+    void UpdateModel() {
+      for (auto& param : params_) {
+        using namespace thrust::placeholders;
+        Element(_1 = _1 - eta_ * _2, param->val(), param->grad());
+      }
+    }
+
+  private:
+    // Plain pointers: the expressions belong to the caller. Wrapping the
+    // address of a caller-owned object in a std::shared_ptr would delete it
+    // when the optimizer is destroyed.
+    Expr* cost_function_;
+    Expr* inX_;
+    Expr* inY_;
+    std::vector<Expr*> params_;
+    const float eta_;
+    std::vector<float>& xData_;
+    const size_t numFeatures_;
+    std::vector<float>& yData_;
+    const size_t numClasses_;
+    const size_t epochs_;
+    const size_t batchSize_;
+};
+
+} // namespace marian
diff --git a/src/tensor.cu b/src/tensor.cu
index 398b696a..09355b21 100644
--- a/src/tensor.cu
+++ b/src/tensor.cu
@@ -83,6 +83,12 @@ void Tensor::Load(const std::string &path)
   Load(hostData.begin(), hostData.begin());
 }
 
+// Hand the full range of a host vector to the implementation object's
+// set(begin, end).
+void Tensor::Load(const std::vector<float>& data)
+{
+	const std::vector<float>::const_iterator first = data.begin();
+	const std::vector<float>::const_iterator last = data.end();
+	pimpl_->set(first, last);
+}
+
+
 void Tensor::Load(const std::vector<float>::const_iterator &begin, const std::vector<float>::const_iterator &end)
 {
 	pimpl_->set(begin, end);
diff --git a/src/tensor.h b/src/tensor.h
index ff6ecd0b..b9c81a91 100644
--- a/src/tensor.h
+++ b/src/tensor.h
@@ -16,16 +16,16 @@ namespace marian {
 struct Handles {
   cudnnHandle_t cudnnHandle;
   cublasHandle_t cublasHandle;
-  
-  cudnnOpTensorDescriptor_t add;  
-  
+
+  cudnnOpTensorDescriptor_t add;
+
   Handles() {
     cudnnCreate(&cudnnHandle);
     cublasCreate(&cublasHandle);
     cudnnCreateOpTensorDescriptor(&add);
     cudnnSetOpTensorDescriptor(add, CUDNN_OP_TENSOR_ADD, CUDNN_DATA_FLOAT, CUDNN_NOT_PROPAGATE_NAN);
   }
-  
+
   ~Handles() {
     cudnnDestroy(cudnnHandle);
     cublasDestroy(cublasHandle);
@@ -35,7 +35,7 @@ struct Handles {
 
 const Handles handles;
 
-typedef std::vector<int> Shape;
+// typedef std::vector<int> Shape;
 
 inline std::string Debug(const Shape &shape)
 {
@@ -63,7 +63,7 @@ class TensorImpl {
     cudnnTensorDescriptor_t desc_;
     size_t tno_;
     static size_t tensorCounter;
-    
+
     cudnnDataType_t dataType() {
       switch(sizeof(Float)) {
         case 2: return CUDNN_DATA_HALF;
@@ -74,15 +74,15 @@ class TensorImpl {
 
   public:
     typedef Float value_type;
-    
+
     TensorImpl(const Shape& shape, value_type value = 0)
     : shape_(shape), tno_(tensorCounter++)
     {
-      
-      // @TODO: 
+
+      // @TODO:
       UTIL_THROW_IF2(shape_.size() != 2,
                      "For now, only 2D Tensors, will be fixed later.");
-      
+
       UTIL_THROW_IF2(shape_.size() < 1 || shape_.size() > 4,
                      "Wrong number of dimensions: " << shape_.size());
 
@@ -106,54 +106,54 @@ class TensorImpl {
                                      shape_[0], shape_[1], shape_[2], shape_[3]); break;
       }
     }
-   
+
     TensorImpl(const TensorImpl&) = delete;
     TensorImpl(TensorImpl&&) = delete;
-         
+
     ~TensorImpl() {
       cudnnDestroyTensorDescriptor(desc_);
     }
-   
+
    value_type operator[](size_t i) const {
       return data_[i];
     }
-      
+
     auto begin() -> decltype( data_.begin() ) {
       return data_.begin();
     }
-   
+
     auto begin() const -> decltype( data_.begin() ) {
       return data_.begin();
     }
-   
+
     auto end() -> decltype( data_.end() ) {
       return data_.end();
     }
-   
+
     auto end() const -> decltype( data_.end() ) {
       return data_.end();
     }
-   
+
     const Shape& shape() const {
         return shape_;
     }
-    
+
     size_t size() const {
       return data_.size();
     }
-    
+
     value_type* data() {
       return thrust::raw_pointer_cast(data_.data());
     }
-    
+
     cudnnTensorDescriptor_t desc() const {
       return desc_;
     }
-    
+
     size_t id() const {
       return tno_;
     }
-    
+
     void set(value_type value) {
       thrust::fill(data_.begin(), data_.end(), value);
     }
@@ -194,70 +194,70 @@ size_t TensorImpl<Type>::tensorCounter = 0;
 class Tensor {
   private:
     std::shared_ptr<TensorImpl<Float>> pimpl_;
-    
+
   public:
     typedef TensorImpl<Float>::value_type value_type;
-    
+
     Tensor() {}
-    Tensor(Shape shape, value_type value = 0) {
+    Tensor(const Shape& shape, value_type value = 0) {
       allocate(shape, value);
     }
-    
+
     ~Tensor() {}
-    
-    void allocate(Shape shape, value_type value = 0) {
+
+    void allocate(const Shape& shape, value_type value = 0) {
       if(!pimpl_)
         pimpl_.reset(new TensorImpl<Float>(shape, value));
     }
-    
+
     value_type operator[](size_t i) const {
       return (*pimpl_)[i];
     }
-    
+
     size_t size() const {
       return pimpl_->size();
     }
-    
+
     value_type* data() {
       return pimpl_->data();
     }
-    
+
     const value_type* data() const {
       return pimpl_->data();
     }
-    
+
     auto begin() -> decltype( pimpl_->begin() ) {
       return pimpl_->begin();
     }
-   
+
     auto begin() const -> decltype( pimpl_->begin() ) {
       return pimpl_->begin();
     }
-   
+
     auto end() -> decltype( pimpl_->begin() ) {
-      return pimpl_->begin();
+      return pimpl_->end();  // one-past-the-end of the underlying storage, so [begin(), end()) is not empty
     }
-   
+
     auto end() const -> decltype( pimpl_->begin() ) {
-      return pimpl_->begin();
+      return pimpl_->end();  // one-past-the-end of the underlying storage, so [begin(), end()) is not empty
     }
-    
+
     const Shape& shape() const {
       return pimpl_->shape();
     }
-    
+
     cudnnTensorDescriptor_t desc() const {
       return pimpl_->desc();
     }
-    
+
     void set(value_type value) {
       pimpl_->set(value);
     }
-    
+
     size_t id() const {
       return pimpl_->id();
     }
-    
+
     operator bool() {
       return pimpl_ != nullptr;
     }
@@ -275,6 +275,7 @@ class Tensor {
     }
 
     void Load(const std::string &path);
+    void Load(const std::vector<float>& data);
     void Load(const std::vector<float>::const_iterator &begin, const std::vector<float>::const_iterator &end);
 
 };
diff --git a/src/test.cu b/src/test.cu
index 0285e3a5..a86c60ee 100644
--- a/src/test.cu
+++ b/src/test.cu
@@ -20,6 +20,7 @@ int main(int argc, char** argv) {
   Expr y = input(shape={whatevs, LABEL_SIZE}, name="Y");
   
   Expr w = param(shape={IMAGE_SIZE, LABEL_SIZE}, name="W0");
+  // Expr w = param(shape={IMAGE_SIZE, LABEL_SIZE}, name="W0", init=randreal);
   Expr b = param(shape={1, LABEL_SIZE}, name="b0");
     
   Expr z = dot(x, w) + b;
diff --git a/src/train_mnist.cu b/src/train_mnist.cu
new file mode 100644
index 00000000..aa21597a
--- /dev/null
+++ b/src/train_mnist.cu
@@ -0,0 +1,37 @@
+
+#include "marian.h"
+#include "mnist.h"
+#include "sgd.h"
+
+using namespace std;
+
+int main(int argc, char** argv) {
+  // Row widths handed to the MNIST readers and reused for the layer shapes.
+  const size_t IMAGE_SIZE = 784;
+  const size_t LABEL_SIZE = 10;
+  int numofdata;  // passed to both readers; presumably receives the example count
+
+  // NOTE(review): the t10k files are loaded here although the data is used for training - confirm.
+  std::vector<float> trainImages = datasets::mnist::ReadImages("../examples/mnist/t10k-images-idx3-ubyte", numofdata, IMAGE_SIZE);
+  std::vector<float> trainLabels = datasets::mnist::ReadLabels("../examples/mnist/t10k-labels-idx1-ubyte", numofdata, LABEL_SIZE);
+
+  using namespace marian;
+  using namespace keywords;
+
+  Expr x = input(shape={whatevs, IMAGE_SIZE}, name="X");
+  Expr y = input(shape={whatevs, LABEL_SIZE}, name="Y");
+  Expr w = param(shape={IMAGE_SIZE, LABEL_SIZE}, name="W0");
+  Expr b = param(shape={1, LABEL_SIZE}, name="b0");
+
+  std::vector<Expr*> trainable{&w, &b};  // handed to the optimizer below
+
+  // Affine layer + softmax; the cost is the negated mean (axis 0) of sum(y * log(probs)) over axis 1.
+  auto scores = dot(x, w) + b;
+  auto probs = softmax_fast(scores, axis=1, name="pred");
+  auto cost = -mean(sum(y * log(probs), axis=1), axis=0, name="cost");
+  std::cerr << "lr=" << probs.Debug() << std::endl;
+
+  SGD opt(cost, x, y, trainable, 0.9, trainImages, IMAGE_SIZE, trainLabels, LABEL_SIZE, 3, 24);
+  opt.Run();
+  return 0;
+}
diff --git a/src/validate_mnist.cu b/src/validate_mnist.cu
new file mode 100644
index 00000000..a42fa881
--- /dev/null
+++ b/src/validate_mnist.cu
@@ -0,0 +1,77 @@
+
+#include "marian.h"
+#include "mnist.h"
+#include "npz_converter.h"
+
+using namespace marian;
+using namespace keywords;
+
+int main(int argc, char** argv) {
+  const size_t IMAGE_SIZE = 784;
+  const size_t LABEL_SIZE = 10;
+  int numofdata;  // passed to both readers; used below as the number of examples
+
+  std::cerr << "Loading test set...";
+  std::vector<float> testImages = datasets::mnist::ReadImages("../examples/mnist/t10k-images-idx3-ubyte", numofdata, IMAGE_SIZE);
+  std::vector<float> testLabels = datasets::mnist::ReadLabels("../examples/mnist/t10k-labels-idx1-ubyte", numofdata, LABEL_SIZE);
+  std::cerr << "\tDone." << std::endl;
+
+  std::cerr << "Loading model params...";
+  NpzConverter converter("../scripts/test_model/model.npz");
+
+  // The shapes are read alongside the data but are not used afterwards.
+  std::vector<float> wData;
+  Shape wShape;
+  converter.Load("weights", wData, wShape);
+  std::vector<float> bData;
+  Shape bShape;
+  converter.Load("bias", bData, bShape);
+
+  // Parameter initializers: copy the loaded host values to the start of the tensor.
+  auto initW = [&wData](Tensor dst) {
+    thrust::copy(wData.begin(), wData.end(), dst.begin());
+  };
+  auto initB = [&bData](Tensor dst) {
+    thrust::copy(bData.begin(), bData.end(), dst.begin());
+  };
+
+  std::cerr << "\tDone." << std::endl;
+
+  Expr x = input(shape={whatevs, IMAGE_SIZE}, name="X");
+  Expr w = param(shape={IMAGE_SIZE, LABEL_SIZE}, name="W0", init=initW);
+  Expr b = param(shape={1, LABEL_SIZE}, name="b0", init=initB);
+
+  std::cerr << "Building model...";
+  auto scores = dot(x, w) + b;
+  auto predict = softmax(scores, axis=1, name="pred");
+  std::cerr << "\tDone." << std::endl;
+
+  // NOTE(review): xt receives the test images but nothing visible here attaches
+  // it to x before forward() runs - confirm how the input is meant to be fed.
+  Tensor xt({numofdata, IMAGE_SIZE});
+  xt.Load(testImages);
+
+  predict.forward(numofdata);
+  auto results = predict.val();
+
+  // Per row: index of the last non-zero label vs. index of the largest prediction (first on ties).
+  size_t acc = 0;
+  for (size_t row = 0; row < testLabels.size(); row += LABEL_SIZE) {
+    size_t correct = 0;
+    size_t predicted = 0;
+    for (size_t col = 0; col < LABEL_SIZE; ++col) {
+      if (testLabels[row + col]) correct = col;
+      if (results[row + col] > results[row + predicted]) predicted = col;
+    }
+    if (correct == predicted) ++acc;
+
+    std::cerr << "corect: " << correct << " | " << predicted << "(";
+    for (size_t col = 0; col < LABEL_SIZE; ++col) {
+      std::cerr << results[row + col] << " ";
+    }
+    std::cerr << std::endl;
+  }
+  std::cerr << "ACC: " << static_cast<float>(acc) / numofdata << std::endl;
+
+  return 0;
+}