Merge branch 'master' of https://github.com/emjotde/Marian

2024-09-17 09:47:34 +03:00 · 2016-09-16 18:15:11 +02:00 · 2016-09-16 18:15:11 +02:00 · 6534403830
commit 6534403830
parent e897f9d6a5 732bb9fa7a
19 changed files with 581 additions and 336 deletions
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@ -5,7 +5,6 @@ cuda_add_library(marian_lib
  cnpy/cnpy.cpp
  exception.cpp
  expression_graph.cu 
  sgd.cu
  tensor.cu
  tensor_operators.cu
  expression_operators.cu
--- a/src/chainable.h
+++ b/src/chainable.h
@ -18,7 +18,7 @@ struct Chainable {
    virtual void allocate(size_t) = 0;
    virtual std::string graphviz() = 0;
-
+    virtual const std::string &name() const = 0;
    virtual const Shape& shape() = 0;
    virtual DataType &val() = 0;
@ -33,4 +33,4 @@ typedef std::shared_ptr<ChainableStack> ChainableStackPtr;
 typedef std::shared_ptr<Chainable<Tensor>> ChainPtr;
-}
+}
--- a/src/expression_graph.cu
+++ b/src/expression_graph.cu
@ -39,12 +39,12 @@ std::string Expr::Debug() const
 }
 ///////////////////////////////////////////////////////
-ExpressionGraph::ExpressionGraph(int cudaDevice)
+//ExpressionGraph::ExpressionGraph(int cudaDevice)
-: stack_(new ChainableStack)
+//: stack_(new ChainableStack)
-{
+//{
-  std::srand (time(NULL));
+//  std::srand (time(NULL));
-  cudaSetDevice(0);
+//  cudaSetDevice(0);
-
+//
-}
+//}
 }
--- a/src/expression_graph.h
+++ b/src/expression_graph.h
@ -38,9 +38,14 @@ class Expr {
 class ExpressionGraph {
  public:
-    ExpressionGraph(int cudaDevice);
+    ExpressionGraph() : stack_(new ChainableStack) {}
-    void forward(size_t batchSize) {
+    void backprop(int batchSize) {
      forward(batchSize);
      backward();
    }
    void forward(int batchSize) {
      for(auto&& v : *stack_) {
        v->allocate(batchSize);
      }
@ -48,18 +53,6 @@ class ExpressionGraph {
        v->forward();    
    }
    std::string graphviz() {
      std::stringstream ss;
      ss << "digraph ExpressionGraph {" << std::endl;
      ss << "rankdir=BT" << std::endl;
      typedef typename ChainableStack::reverse_iterator It;
      for(It it = stack_->rbegin(); it != stack_->rend(); ++it)
        ss << (*it)->graphviz();
      ss << "}" << std::endl;
      return ss.str();
    }
    void backward() {
      for(auto&& v : *stack_)
        v->set_zero_adjoint();
@ -70,9 +63,25 @@ class ExpressionGraph {
        (*it)->backward();
    }
    /**
     * Render the graph as Graphviz DOT text.
     *
     * Emits a "digraph ExpressionGraph" with bottom-to-top rank direction and
     * appends the graphviz() output of every node on stack_, visiting the
     * stack from its last element to its first.
     *
     * @return The complete DOT document as a string.
     */
    std::string graphviz() {
      std::stringstream dot;
      dot << "digraph ExpressionGraph {" << std::endl;
      dot << "rankdir=BT" << std::endl;

      auto node = stack_->rbegin();
      while(node != stack_->rend()) {
        dot << (*node)->graphviz();
        ++node;
      }

      dot << "}" << std::endl;
      return dot.str();
    }
    /*********************************************************/
    template <typename ...Args>
    inline Expr input(Args ...args) {
-      return Expr(this, new InputNode(args...));
+      Expr e(this, new InputNode(args...));
      inputs_.emplace_back(e);
      return e;
    }
    template <typename ...Args>
@ -117,14 +126,20 @@ class ExpressionGraph {
      named_.emplace(name, e);
    }
    // Returns a mutable reference to inputs_, the list of input expressions
    // held by this graph.
    std::vector<Expr>& inputs() {
      return inputs_;
    }
    // Returns a mutable reference to params_, the list of parameter
    // expressions held by this graph.
    std::vector<Expr>& params() {
      return params_;
    }
  private:
    ChainableStackPtr stack_;
    std::map<std::string, Expr> named_;
    std::vector<Expr> params_;
    std::vector<Expr> inputs_;
 };
 }
--- a/src/node.h
+++ b/src/node.h
@ -67,6 +67,8 @@ class Node : public Chainable<Tensor>,
    // Returns this node's shape_ member by const reference (no copy).
    virtual const Shape& shape() {
      return shape_;    
    }
    // Returns this node's name_ member by const reference (no copy).
    const std::string &name() const { return name_; }
  protected:
    Shape shape_;
--- a/src/sgd.cu
+++ b/src/sgd.cu
@ -1,140 +0,0 @@
 #include <ctime>
 #include <algorithm>
 #include <vector>
 #include "sgd.h"
 #include "thrust_functions.h"
 using namespace std;
 namespace marian {
 SGD::SGD(ExpressionGraph& g, float eta,
    std::vector<float>& xData, size_t numFeatures,
    std::vector<float>& yData, size_t numClasses,
    size_t epochs, size_t batchSize)
 : graph_(g),
  eta_(eta),
  xData_(xData),
  numFeatures_(numFeatures),
  yData_(yData),
  numClasses_(numClasses),
  epochs_(epochs),
  maxBatchSize_(batchSize)
 {}
 void SGD::Run()
 {
  size_t numExamples = xData_.size()/ numFeatures_;
  Tensor xt({(int)maxBatchSize_, (int)numExamples}, 0.0f);
  Tensor yt({(int)maxBatchSize_, (int)numClasses_}, 0.0f);
  vector<size_t> shuffle = CreateShuffle(numExamples);
  //vector<size_t> shuffle;
  for (size_t numEpoch = 0; numEpoch < epochs_; ++numEpoch) {
    std::cerr << "Starting epoch #" << numEpoch << std::endl;
    size_t startId = 0;
    while (startId < numExamples) {
      size_t batchSize = std::min(maxBatchSize_, numExamples - startId);
      size_t endId = startId + batchSize;
      PrepareBatch(startId, endId, batchSize, shuffle, xt, yt);
      graph_["x"] = xt;
      graph_["y"] = yt;
      graph_.forward(maxBatchSize_);
      graph_.backward();
      UpdateModel();
      startId += maxBatchSize_;
    }
  }
 }
 std::vector<size_t> SGD::CreateShuffle(size_t numExamples) const {
  vector<size_t> ret(numExamples);
  std::iota(ret.begin(), ret.end(), 0);
  std::random_shuffle ( ret.begin(), ret.end() );
  /*
  cerr << "shuffled" << endl;
  for (size_t i = 0; i < ret.size(); ++i) {
    cerr << ret[i] << " ";
  }
  */
  return ret;
 }
 void SGD::PrepareBatch(
 		size_t startId,
 		size_t endId,
 		size_t batchSize,
 		const std::vector<size_t> &shuffle,
 		Tensor& xt,
 		Tensor& yt) {
  /*
  std::vector<float> x(xData_.begin() + startId * numFeatures_,
                       xData_.begin() + endId * numFeatures_);
  std::vector<float> y(yData_.begin() + startId * numClasses_,
                       yData_.begin() + endId * numClasses_);
  */
  std::vector<float> x(batchSize * numFeatures_);
  std::vector<float> y(batchSize * numClasses_);
  //cerr << "batchSize=" << batchSize << endl;
  /*
  cerr << "startId=" << startId
       << " " << endId
       << " " << batchSize
       << endl;
  cerr << "numExamples=" << shuffle.size() << endl;
  cerr << "numFeatures_=" << numFeatures_ << " " << numClasses_ << endl;
  cerr << "sizes=" << x.size() 
       << " " << y.size() 
       << " " << xData_.size()
       << " " << yData_.size()
       << endl;
  */
  size_t startXId = 0;
  size_t startYId = 0;
  for (size_t i = startId; i < endId; ++i) {
    size_t ind = shuffle[i];
    size_t startXDataId = ind * numFeatures_;
    size_t startYDataId = ind * numClasses_;
    size_t endXDataId = startXDataId + numFeatures_;
    size_t endYDataId = startYDataId + numClasses_;
    /*
    cerr << "i=" << i
    	 << " " << ind
    	 << " " << startXDataId << "-" << endXDataId
 	 << " " << startYDataId << "-" << endYDataId
 	 << endl;
    */
    std::copy(xData_.begin() + startXDataId,
        xData_.begin() + endXDataId,
        x.begin() + startXId);
    std::copy(yData_.begin() + startYDataId,
        yData_.begin() + endYDataId,
        y.begin() + startYId);
    startXId += numFeatures_;
    startYId += numClasses_;
  }
  xt.set(x);
  yt.set(y);
 }
 void SGD::UpdateModel() {
  for (auto& param : graph_.params()) {
    using namespace thrust::placeholders;
    Element(_1 -= eta_ * _2, param.val(), param.grad());
  }
 }
 } // namespace
--- a/src/sgd.h
+++ b/src/sgd.h
@ -1,43 +1,48 @@
 #pragma once
-#include <memory>
+#include <map>
-#include <iostream>
+#include <boost/any.hpp>
 #include "expression_graph.h"
 #include "thrust_functions.h"
 #include "tensor_operators.h"
 namespace marian {
-class SGD {
+class Sgd {
  public:
-    SGD(ExpressionGraph& g, float eta,
+    Sgd(float eta=0.1) : eta_(eta) {}
-        std::vector<float>& xData, size_t numFeatures,
+    
-        std::vector<float>& yData, size_t numClasses,
+    void operator()(ExpressionGraph& graph, int batchSize) {
-        size_t epochs, size_t batchSize);
+      graph.backprop(batchSize);
-
+      
-    void Run();
+      for(auto& param : graph.params())
-
+        Element(_1 -= eta_ * _2, param.val(), param.grad());
    }
  private:
-    ExpressionGraph& graph_;
+    float eta_;
    const float eta_;
    std::vector<float>& xData_;
    const size_t numFeatures_;
    std::vector<float>& yData_;
    const size_t numClasses_;
    const size_t epochs_;
    const size_t maxBatchSize_;
    std::vector<size_t> CreateShuffle(size_t numExamples) const;
    void PrepareBatch(
    		size_t startId,
    		size_t endId,
    		size_t batchSize,
    		const std::vector<size_t> &shuffle,
    		Tensor& xt,
    		Tensor& yt);
    void UpdateModel();
 };
-} // namespace marian
+class Adagrad {
  public:
    Adagrad(float eta=0.1) : eta_(eta) {}
    /**
     * @brief Run one Adagrad step: backprop, then update every parameter.
     *
     * history_ holds one accumulator per parameter (running sum of squared
     * gradients). The accumulator is updated BEFORE it is used as divisor and
     * a small epsilon is added to its root, so the first step does not divide
     * by sqrt(0), which would turn the parameter values into inf/NaN.
     *
     * @param graph Expression graph whose params() are updated in place.
     * @param batchSize Batch size forwarded to graph.backprop().
     */
    void operator()(ExpressionGraph& graph, int batchSize) {
      // Keeps the divisor strictly positive even for all-zero gradients.
      const float epsilon = 1e-6f;

      // Create accumulators only for parameters that do not have one yet, so
      // history_ stays aligned with params() if the graph grew between calls.
      auto& params = graph.params();
      for(size_t i = history_.size(); i < params.size(); ++i)
        history_.emplace_back(Tensor(params[i].grad().shape(), 0));

      graph.backprop(batchSize);

      auto it = history_.begin();
      for(auto& param : params) {
        // Accumulate first, then scale the step by the updated history.
        Element(_1 += _2 * _2, *it, param.grad());
        Element(_1 -= eta_ / (Sqrt(_2) + epsilon) * _3, param.val(), *it, param.grad());
        ++it;
      }
    }
  private:
    float eta_;
    std::vector<Tensor> history_;
 };
 }
--- a/src/tensor.cu
+++ b/src/tensor.cu
@ -1,8 +1,6 @@
 #include <fstream>
 #include "tensor.h"
 using namespace std;
 namespace marian {
 void Tensor::set(const std::vector<float>& data)
--- a/src/tensor.h
+++ b/src/tensor.h
@ -1,4 +1,21 @@
 #pragma once
 /* Copyright (C) 
 * 2016 - MLAMU & friends
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 * 
 */
 #include <cublas_v2.h>
 #include <thrust/device_vector.h>
@ -12,6 +29,13 @@
 namespace marian {
 /**
 * @brief Debug shape by printing it. 
 *
 * @param shape Shape of Tensor.
 *
 * @return String of shape.
 */
 inline std::string Debug(const Shape &shape)
 {
 	std::stringstream strm;
@ -23,6 +47,13 @@ inline std::string Debug(const Shape &shape)
 	return strm.str();
 }
 /**
 * @brief Calculate the vector size based on Tensor shape. 
 *
 * @param shape Shape of Tensor.
 *
 * @return Size of Tensor vector.
 */
 inline size_t GetTotalSize(const Shape &shape)
 {
 	size_t ret = std::accumulate(shape.begin(), shape.end(),
@ -30,17 +61,28 @@ inline size_t GetTotalSize(const Shape &shape)
 	return ret;
 }
 /**
 * @brief This class manages the Tensor on the GPU. 
 *
 * @tparam Float Data type.
 */
 template<class Float>
 class TensorImpl {
  private:
-    Shape shape_;
+    Shape shape_; /*!< Dimensions of Tensor */
-    thrust::device_vector<Float> data_;
+    thrust::device_vector<Float> data_; /*!< Vector of data that Tensor is managing on GPU. */
-    size_t tno_;
+    size_t tno_; /*!< Tensor number */
-    static size_t tensorCounter;
+    static size_t tensorCounter; /*!< Static counter of created Tensors */
  public:
-    typedef Float value_type;
+    typedef Float value_type; /*!< Tensor value type */
    /**
     * @brief Constructor
     *
     * @param shape Shape of Tensor.
     * @param value Value to fill Tensor's vector with.
     */
    TensorImpl(const Shape& shape, value_type value = 0)
    : shape_(shape), tno_(tensorCounter++)
    {
@ -59,54 +101,122 @@ class TensorImpl {
    TensorImpl(const TensorImpl&) = delete;
    TensorImpl(TensorImpl&&) = delete;
    /**
     * @brief Get the i-th element of Tensor vector.
     *
     * @param i Index.
     *
     * @return Value of Tensor vector indexed with i.
     */
   value_type operator[](size_t i) const {
      return data_[i];
    }
   /**
    * @brief Get begin iterator of Tensor's vector.
    *
    * @return Vector begin iterator.
    */
    auto begin() -> decltype( data_.begin() ) {
      return data_.begin();
    }
   /**
    * @brief Get begin iterator of Tensor's vector (const).
    *
    * @return Vector begin iterator (const)
    */
    auto begin() const -> decltype( data_.begin() ) {
      return data_.begin();
    }
   /**
    * @brief Get end iterator of Tensor's vector.
    *
    * @return Vector end iterator
    */
    auto end() -> decltype( data_.end() ) {
      return data_.end();
    }
   /**
    * @brief Get end iterator of Tensor's vector (const).
    *
    * @return Vector end iterator (const)
    */
    auto end() const -> decltype( data_.end() ) {
      return data_.end();
    }
    /**
     * @brief Get Tensor's shape (const)
     *
     * @return Shape of Tensor
     */
    const Shape& shape() const {
        return shape_;
    }
    /**
     * @brief Get size of Tensor's vector.
     *
     * @return Length of Tensor's vector.
     */
    size_t size() const {
      return data_.size();
    }
    /**
     * @brief Get a raw pointer to the Tensor's data stored on the GPU.
     *
     * @return Pointer of value_type array.
     */
    value_type* data() {
      return thrust::raw_pointer_cast(data_.data());
    }
    /**
     * @brief Get Tensor id (number).
     *
     * @return Tensor id.
     */
    size_t id() const {
      return tno_;
    }
    /**
     * @brief Fill Tensor's vector with specified value on the GPU.
     *
     * @param value Value to fill vector with.
     */
    void set(value_type value) {
      thrust::fill(data_.begin(), data_.end(), value);
    }
    /**
     * @brief Set Tensor's vector to values of specified vector by copying it to GPU.
     *
     * @param begin Begin iterator of a vector.
     * @param end End iterator of a vector.
     */
    void set(const std::vector<float>::const_iterator &begin, const std::vector<float>::const_iterator &end) {
 	  thrust::copy(begin, end, data_.begin());
    }
    /**
     * @brief Copy Tensor's vector from GPU to vector variable on CPU.
     *
     * @param out Iterator to the beginning of the destination vector on the CPU.
     */
    void get(std::vector<float>::iterator out) {
 	  thrust::copy(data_.begin(), data_.end(), out);      
    }
    /**
     * @brief Debug function.
     *
     * @return Vector in string form.
     */
    std::string Debug() const
    {
    	std::stringstream strm;
@ -133,78 +243,170 @@ class TensorImpl {
 template <typename Type>
 size_t TensorImpl<Type>::tensorCounter = 0;
 /**
 * @brief Class that communicates with GPU's Tensor.
 */
 class Tensor {
  private:
-    std::shared_ptr<TensorImpl<Float>> pimpl_;
+    std::shared_ptr<TensorImpl<Float>> pimpl_; /*!< Pointer to Tensor working on GPU */
  public:
-    typedef TensorImpl<Float>::value_type value_type;
+    typedef TensorImpl<Float>::value_type value_type; /*!< Get value type of GPU's Tensor data */
    /**
     * @brief Default constructor
     */
    Tensor() {}
    /**
     * @brief Constructor that allocates memory.
     *
     * @param shape Shape of Tensor. 
     * @param value Value to fill Tensor's vector with.
     */
    Tensor(const Shape& shape, value_type value = 0) {
      allocate(shape, value);
    }
    /**
     * @brief Default destructor
     */
    ~Tensor() {}
    /**
     * @brief Allocate memory if Tensor doesn't exist on GPU. Otherwise, do nothing.
     *
     * @param shape Shape of Tensor.
     * @param value Value to fill Tensor's vector with.
     */
    void allocate(const Shape& shape, value_type value = 0) {
      if(!pimpl_)
        pimpl_.reset(new TensorImpl<Float>(shape, value));
    }
    /**
     * @brief Get i-th element of GPU Tensor vector (const).
     *
     * @param i Index.
     *
     * @return Value of specified element of Tensor.
     */
    value_type operator[](size_t i) const {
      return (*pimpl_)[i];
    }
    /**
     * @brief Get size of GPU Tensor's vector.
     *
     * @return Size of Tensor vector.
     */
    size_t size() const {
      return pimpl_->size();
    }
    /**
     * @brief Return pointer to GPU Tensor's data.
     *
     * @return Pointer to GPU Tensor's data.
     */
    value_type* data() {
      return pimpl_->data();
    }
    /**
     * @brief Return pointer to GPU Tensor's data (const).
     *
     * @return Pointer to GPU Tensor's data.
     */
    const value_type* data() const {
      return pimpl_->data();
    }
   /**
    * @brief Get begin iterator of GPU Tensor's vector.
    *
    * @return Vector begin iterator.
    */
    auto begin() -> decltype( pimpl_->begin() ) {
      return pimpl_->begin();
    }
   /**
    * @brief Get begin iterator of GPU Tensor's vector (const).
    *
    * @return Vector begin iterator (const)
    */
    auto begin() const -> decltype( pimpl_->begin() ) {
      return pimpl_->begin();
    }
   /**
    * @brief Get end iterator of Tensor's vector.
    *
    * @return Vector end iterator
    */
    auto end() -> decltype( pimpl_->end() ) {
      return pimpl_->end();
    }
   /**
    * @brief Get end iterator of Tensor's vector (const).
    *
    * @return Vector end iterator (const)
    */
    auto end() const -> decltype( pimpl_->end() ) {
      return pimpl_->end();
    }
    /**
     * @brief Get GPU Tensor's shape.
     *
     * @return Tensor's shape.
     */
    const Shape& shape() const {
      return pimpl_->shape();
    }
    /**
     * @brief Fill GPU Tensor's vector with specified value.
     *
     * @param value Value to fill Tensor with.
     */
    void set(value_type value) {
      pimpl_->set(value);
    }
    /**
     * @brief Get GPU Tensor id (number).
     *
     * @return Tensor id.
     */
    size_t id() const {
      return pimpl_->id();
    }
    /**
     * @brief Check if Tensor is allocated.
     *
     * @return True or False
     */
    operator bool() {
      return pimpl_ != nullptr;
    }
    /**
     * @brief Run Debug on GPU Tensor.
     *
     * @return String of Tensor's data.
     */
    std::string Debug() const
    {
    	return pimpl_->Debug();
    }
    /**
     * @brief Print every element of the Tensor to std::cerr (const).
     */
    void Print() const {
      for (int i = 0; i < size(); ++i) {
        std::cerr << (*this)[i] << " ";
@ -213,21 +415,59 @@ class Tensor {
    }
    //void Load(const std::string &path);
    /**
     * @brief Set GPU Tensor's vector to values of specified vector.
     *
     * @param data Vector copied to GPU.
     */
    void set(const std::vector<float>& data);
    /**
     * @brief Fill GPU Tensor's vector using values from the specified vector.
     *
     * @param begin Begin iterator of vector being copied.
     * @param end End iterator of vector being copied.
     */
    void set(const std::vector<float>::const_iterator &begin, const std::vector<float>::const_iterator &end);
    /**
     * @brief Copy Tensor's vector from GPU to vector variable on CPU (const).
     *
     * @param out Vector iterator used in copying.
     */
    void get(std::vector<float>::iterator out) const {
      pimpl_->get(out);
    }
    /**
     * @brief Copy Tensor's vector from GPU to vector variable on CPU.
     *
     * @param vout Vector to copy data to; it is resized to the Tensor's size first.
     */
    void get(std::vector<float> &vout) const {
      vout.resize(size());
      pimpl_->get(vout.begin());
    }
 };
 /**
 * @brief Operator to set data on Tensor using vector.
 *
 * @param t Tensor.
 * @param vec Vector used to set data in Tensor.
 *
 * @return Tensor with assigned data.
 */
 Tensor& operator<<(Tensor& t, const std::vector<float> &vec);
 /**
 * @brief Operator to get data from Tensor to vector.
 *
 * @param vec Vector to save copied data.
 * @param t Tensor to copy data from.
 *
 * @return Vector with copied data.
 */
 std::vector<float>& operator<<(std::vector<float> &vec, const Tensor& t);
 }
--- a/src/tensor_operators.cu
+++ b/src/tensor_operators.cu
@ -1,5 +1,7 @@
 #include "tensor_operators.h"
 using namespace std;
 namespace marian {
 __global__ void gSubtractMean(float* out, float* weights,
@ -53,6 +55,7 @@ void SubtractMean(Tensor* Out, Tensor &Weights) {
  cudaStreamSynchronize(0);
 }
 ///////////////////////////////////////////////////////
 __global__ void gSoftMax(float* softMaxP, size_t rows, size_t cols) {
  for(int bid = 0; bid < rows; bid += gridDim.x) {
    int j = bid + blockIdx.x;
@ -97,6 +100,35 @@ void Softmax(Tensor* Out) {
  gSoftMax<<<blocks, threads, shared>>>(Out->data(), m, k);
  cudaStreamSynchronize(0);
 }
 ///////////////////////////////////////////////////////
 /**
  * @brief Per-row argmax kernel: block blockIdx.x handles row blockIdx.x.
  *
  * Scans the cols values of the row starting at data[row * cols] and stores
  * the column index of the first largest value in out[row] (as a float).
  *
  * @param out  Output array, one element per row.
  * @param data Input values, indexed as data[row * cols + col].
  * @param rows Number of rows; blocks with blockIdx.x >= rows do nothing.
  * @param cols Number of columns per row; an empty row yields index 0.
  */
 __global__ void gArgMax(float *out, const float *data, size_t rows, size_t cols) {
   size_t row = blockIdx.x;
   // The scan is not split across threads, so a single thread per block does
   // the work; extra threads (and blocks past the last row) exit immediately.
   if (row >= rows || threadIdx.x != 0)
     return;

   size_t startInd = row * cols;
   // Seed the search with the first element instead of a fixed sentinel, so
   // maxInd is always initialised and rows whose scores are all very negative
   // are handled correctly.
   size_t maxInd = 0;
   if (cols > 0) {
     float maxScore = data[startInd];
     for (size_t col = 1; col < cols; ++col) {
       float score = data[startInd + col];
       if (score > maxScore) {
         maxScore = score;
         maxInd = col;
       }
     }
   }
   out[row] = static_cast<float>(maxInd);
 }
 /**
  * @brief Row-wise argmax of In, written to Out.
  *
  * Launches gArgMax with one block per row of In (In->shape()[0] rows of
  * In->shape()[1] columns) and then waits on stream 0.
  *
  * @param Out Destination tensor; receives one value per row of In.
  *            NOTE(review): its size is not checked here — callers must
  *            provide at least shape()[0] elements.
  * @param In  Source tensor, read as a rows x cols array.
  */
 void Argmax(Tensor* Out, const Tensor* In) {
   size_t m = In->shape()[0];
   size_t k = In->shape()[1];

   // Nothing to compute, and a launch with zero blocks is not a valid
   // configuration.
   if (m == 0)
     return;

   int blocks = m;
   // gArgMax computes a whole row per block and does not split the scan
   // across threads, so one thread per block is enough. Launching k threads
   // (as before) only repeated the same work and made the launch fail once k
   // exceeded the per-block thread limit.
   int threads = 1;

   gArgMax<<<blocks, threads>>>(Out->data(), In->data(), m, k);
   cudaStreamSynchronize(0);
 }
 ///////////////////////////////////////////////////////
 Tensor Prod(cublasHandle_t handle, Tensor C, const Tensor A, const Tensor B,
             bool transA, bool transB, Float beta) {
@ -137,4 +169,4 @@ Tensor Prod(Tensor C, const Tensor A, const Tensor B,
  return temp;
 }
-}
+}
--- a/src/tensor_operators.h
+++ b/src/tensor_operators.h
@ -151,6 +151,10 @@ __global__ void gSoftMax(float* softMaxP, size_t rows, size_t cols);
 void Softmax(Tensor* Out);
 __global__ void gArgMax(float *out, const float *data, size_t rows, size_t cols);
 void Argmax(Tensor* Out, const Tensor* In);
 Tensor Prod(cublasHandle_t handle, Tensor C, const Tensor A, const Tensor B,
             bool transA, bool transB, Float beta);
--- a/src/test.cu
+++ b/src/test.cu
@ -3,7 +3,51 @@
 #include "mnist.h"
 #include "vocab.h"
 #include "tensor_operators.h"
 using namespace std;
 ///////////////////////////////////////////////////////
 // Formats vec as text: every element is streamed in order and followed by a
 // single space, so a non-empty result ends with a trailing space and an
 // empty vector gives an empty string.
 string output(const std::vector<float> &vec)
 {
   stringstream joined;
   for (const float value : vec)
     joined << value << " ";
   return joined.str();
 }
 // Ad-hoc check of the gArgMax kernel. Copies a hard-coded 8-float buffer to
 // the GPU, launches gArgMax on it as a 4x2 (rows x cols) matrix, copies the
 // input and the per-row results back to the host and prints them to stderr.
 // NOTE: ends with exit(0), so the process terminates here and control never
 // returns to the caller.
 void temp()
 {
  using namespace std;
  using namespace marian;
  // Host-side input: 4 rows of 2 values each.
  std::vector<float> hVec({29,19,  49,39,  79,99,  79,39});
        cerr << "hVec =" << output(hVec) << endl;
  // Device copy of the input, plus the raw pointer the kernel needs.
  thrust::device_vector<float> dVec(8);
  thrust::copy(hVec.begin(), hVec.end(), dVec.begin());
  float *data = thrust::raw_pointer_cast(dVec.data());
  // Device buffer for the results: one slot per row.
  thrust::device_vector<float> dLabel(4);
  float *labelPtr = thrust::raw_pointer_cast(dLabel.data());
  // Launch with 4 blocks of 1 thread and sizeof(float) bytes of dynamic
  // shared memory; kernel arguments are rows = 4, cols = 2.
  // NOTE(review): the gArgMax definition does not appear to use shared
  // memory — confirm whether the third launch argument is needed.
  gArgMax<<<4, 1, sizeof(float)>>>(labelPtr, data, 4, 2);
  // Copy the input back from the device and print it again.
  std::vector<float> hVec2(8);
  thrust::copy(dVec.begin(), dVec.end(), hVec2.begin());
  cerr << "hVec2=" << output(hVec2) << endl;
  // Copy the per-row results back and print them. There is no explicit
  // synchronisation after the launch; presumably the device-to-host copies
  // wait for the kernel — TODO confirm.
  std::vector<float> hLabel(4);
  thrust::copy(dLabel.begin(), dLabel.end(), hLabel.begin());
  cerr << "hLabel=" << output(hLabel) << endl;
  // Terminates the whole program with status 0.
  exit(0);
 }
 ///////////////////////////////////////////////////////
 int main(int argc, char** argv) {
  temp();
  using namespace std;
  using namespace marian;
@ -21,7 +65,7 @@ int main(int argc, char** argv) {
  std::vector<Expr> Y;
  std::vector<Expr> H;
-  ExpressionGraph g(0);
+  ExpressionGraph g;
  for (int t = 0; t < num_inputs; ++t) {
    X.emplace_back(g.input(shape={batch_size, input_size}));
@ -39,10 +83,9 @@ int main(int argc, char** argv) {
  string sourceLine, targetLine;
  while (getline(sourceFile, sourceLine)) {
-	  getline(targetFile, targetLine);
+    getline(targetFile, targetLine);
-
+    std::vector<size_t> sourceIds = sourceVocab.ProcessSentence(sourceLine);
-	  std::vector<size_t> sourceIds = sourceVocab.ProcessSentence(sourceLine);
+    std::vector<size_t> targetIds = sourceVocab.ProcessSentence(targetLine);
 	  std::vector<size_t> targetIds = sourceVocab.ProcessSentence(targetLine);
  }
  std::cerr << "Building RNN..." << std::endl;
--- a/src/thrust_functions.h
+++ b/src/thrust_functions.h
@ -85,6 +85,19 @@ namespace thrust
        return compose(unary_operator<unary_tanh>(), _1);
      }
      // Unary functor returning the square root of its argument, computed with
      // sqrtf; callable from both host and device code.
      // NOTE(review): sqrtf is the single-precision routine, so this presumably
      // assumes T is float — confirm before instantiating with double.
      template<typename T>
      struct unary_sqrt : public thrust::unary_function<T,T> {
        __host__ __device__
        T operator()(const T &x) const { return sqrtf(x); }
      };
      // Square root for placeholder expressions: composes unary_sqrt with the
      // given actor and returns the resulting actor, so it can be used inside
      // larger placeholder expressions (e.g. Sqrt(_2)).
      // Note that the parameter is itself named _1; it is the actor argument.
      template<typename Eval>
      __host__ __device__
      actor<composite<unary_operator<unary_sqrt>, actor<Eval>>>
      Sqrt(const actor<Eval> &_1) {
        return compose(unary_operator<unary_sqrt>(), _1);
      }
      template<typename T1, typename T2>
      __host__ __device__
      actor<composite<binary_operator<thrust::maximum>, actor<T1>, actor<T2>>>
--- a/src/train_mnist.cu
+++ b/src/train_mnist.cu
@ -11,12 +11,12 @@ int main(int argc, char** argv) {
  int numofdata;
  vector<float> trainImages = datasets::mnist::ReadImages("../examples/mnist/t10k-images-idx3-ubyte", numofdata, IMAGE_SIZE);
-  vector<float>trainLabels = datasets::mnist::ReadLabels("../examples/mnist/t10k-labels-idx1-ubyte", numofdata, LABEL_SIZE);
+  vector<float> trainLabels = datasets::mnist::ReadLabels("../examples/mnist/t10k-labels-idx1-ubyte", numofdata, LABEL_SIZE);
  using namespace marian;
  using namespace keywords;
-  ExpressionGraph g(0);
+  ExpressionGraph g;
  Expr x = named(g.input(shape={whatevs, IMAGE_SIZE}), "x");
  Expr y = named(g.input(shape={whatevs, LABEL_SIZE}), "y");
@ -24,16 +24,13 @@ int main(int argc, char** argv) {
  Expr w = named(g.param(shape={IMAGE_SIZE, LABEL_SIZE}), "w");
  Expr b = named(g.param(shape={1, LABEL_SIZE}), "b");
  std::vector<Expr*> params;
  params.push_back(&w);
  params.push_back(&b);
  auto scores = dot(x, w) + b;
  auto lr = softmax_fast(scores);
  auto cost = named(-mean(sum(y * log(lr), axis=1), axis=0), "cost");
  cerr << "lr=" << lr.Debug() << endl;
-  SGD opt(g, 0.9, trainImages, IMAGE_SIZE, trainLabels, LABEL_SIZE, 3, 24);
+  Adagrad opt;
-  opt.Run();
+  opt(g, 300);
  return 0;
 }
--- a/src/validate_encoder_decoder.cu
+++ b/src/validate_encoder_decoder.cu
@ -1,97 +1,129 @@
 #include "marian.h"
 #include "mnist.h"
 #include "vocab.h"
 #include <assert.h>
 using namespace marian;
 using namespace keywords;
 const int input_size = 10;
 const int output_size = 15;
 const int embedding_size = 8;
 const int hidden_size = 5;
 const int batch_size = 25;
 const int num_inputs = 8;
 const int num_outputs = 6;
 #if 0
 ExpressionGraph build_graph() {
-  std::cerr << "Loading model params...";
+  std::cerr << "Building computation graph..." << std::endl;
  ExpressionGraph g;
  std::vector<Expr> X, Y, H, S;
  // We're including the stop symbol here.
  for (int t = 0; t <= num_inputs; ++t) {
    std::stringstream ss;
    ss << "X" << t;
    X.emplace_back(named(g.input(shape={batch_size, input_size}), ss.str()));
  }
  // We're including the stop symbol here.
  for (int t = 0; t <= num_outputs; ++t) {
    std::stringstream ss;
    ss << "Y" << t;
    Y.emplace_back(named(g.input(shape={batch_size, output_size}), ss.str()));
  }
  // Source embeddings.
  Expr E = named(g.param(shape={input_size, embedding_size},
                         init=uniform()), "E");
  // Source RNN parameters.
  Expr Wxh = named(g.param(shape={embedding_size, hidden_size},
                   init=uniform()), "Wxh");
  Expr Whh = named(g.param(shape={hidden_size, hidden_size},
                   init=uniform()), "Whh");
  Expr bh = named(g.param(shape={1, hidden_size},
                  init=uniform()), "bh");
  Expr h0 = named(g.param(shape={1, hidden_size},
                  init=uniform()), "h0");
  std::cerr << "Building encoder RNN..." << std::endl;
  H.emplace_back(tanh(dot(dot(X[0], E), Wxh) + dot(h0, Whh) + bh));
  for (int t = 1; t <= num_inputs; ++t) {
    H.emplace_back(tanh(dot(dot(X[t], E), Wxh) + dot(H[t-1], Whh) + bh));
  }
  // Target RNN parameters.
  Expr Wxh_d = named(g.param(shape={output_size, hidden_size},
                     init=uniform()), "Wxh_d");
  Expr Whh_d = named(g.param(shape={hidden_size, hidden_size},
                     init=uniform()), "Whh_d");
  Expr bh_d = named(g.param(shape={1, hidden_size},
                    init=uniform()), "bh_d");
  std::cerr << "Building decoder RNN..." << std::endl;
  auto h0_d = H[num_inputs];
  S.emplace_back(tanh(dot(Y[0], Wxh_d) + dot(h0_d, Whh_d) + bh_d));
  for (int t = 1; t < num_outputs; ++t) {
    S.emplace_back(tanh(dot(Y[t], Wxh_d) + dot(S[t-1], Whh_d) + bh_d));
  }
  // Output linear layer before softmax.
  Expr Why = named(g.param(shape={hidden_size, output_size},
                           init=uniform()), "Why");
  Expr by = named(g.param(shape={1, output_size},
                          init=uniform()), "by");
  std::cerr << "Building output layer..." << std::endl;
  // Softmax layer and cost function.
  std::vector<Expr> Yp;
  Yp.emplace_back(named(softmax_fast(dot(h0_d, Why) + by), "pred"));
  Expr cross_entropy = sum(Y[0] * log(Yp[0]), axis=1);
  for (int t = 1; t <= num_outputs; ++t) {
    Yp.emplace_back(named(softmax_fast(dot(S[t-1], Why) + by), "pred"));
    cross_entropy = cross_entropy + sum(Y[t] * log(Yp[t]), axis=1);
  }
  auto cost = named(-mean(cross_entropy, axis=0), "cost");
  std::cerr << "Done." << std::endl;
  return g;
 }
 int main(int argc, char** argv) {
 #if 1
  std::cerr << "Loading the data... ";
  Vocab sourceVocab, targetVocab;
  // read parallel corpus from file
  std::fstream sourceFile("../examples/mt/dev/newstest2013.de");
  std::fstream targetFile("../examples/mt/dev/newstest2013.en");
  std::vector<std::vector<size_t> > source_sentences, target_sentences;
  std::string sourceLine, targetLine;
  while (getline(sourceFile, sourceLine)) {
    getline(targetFile, targetLine);
    std::vector<size_t> sourceIds = sourceVocab.ProcessSentence(sourceLine);
-    std::vector<size_t> targetIds = sourceVocab.ProcessSentence(targetLine);
+    std::vector<size_t> targetIds = targetVocab.ProcessSentence(targetLine);
    source_sentences.push_back(sourceIds);
    target_sentences.push_back(targetIds);
  }
  std::cerr << "Done." << std::endl;
  std::cerr << source_sentences.size()
            << " sentence pairs read." << std::endl;
  std::cerr << "Source vocabulary size: " << sourceVocab.Size() << std::endl;
  std::cerr << "Target vocabulary size: " << targetVocab.Size() << std::endl;
 #endif
  // Build the encoder-decoder computation graph.
  ExpressionGraph g = build_graph();
-int main(int argc, char** argv) {
+  // Generate input data (include the stop symbol).
  using namespace marian;
  using namespace keywords;
  int input_size = 10;
  int output_size = 15;
  int batch_size = 25;
  int hidden_size = 5;
  int num_inputs = 8;
  int num_outputs = 6;
  ExpressionGraph g(0);
  std::vector<Expr*> X(num_inputs+1); // For the stop symbol.
  std::vector<Expr*> Y(num_outputs);
  std::vector<Expr*> H(num_inputs+1); // For the stop symbol.
  std::vector<Expr*> S(num_outputs);
  // For the stop symbol.
  for (int t = 0; t <= num_inputs; ++t) {
    X[t] = new Expr(g.input(shape={batch_size, input_size}));
  }
  // For the stop symbol.
  for (int t = 0; t <= num_outputs; ++t) {
    Y[t] = new Expr(g.input(shape={batch_size, output_size}));
  }
  Expr Wxh = g.param(shape={input_size, hidden_size}, init=uniform(), name="Wxh");
  Expr Whh = g.param(shape={hidden_size, hidden_size}, init=uniform(), name="Whh");
  Expr bh = g.param(shape={1, hidden_size}, init=uniform(), name="bh");
  Expr h0 = g.param(shape={1, hidden_size}, init=uniform(), name="h0");
  std::cerr << "Building encoder RNN..." << std::endl;
  H[0] = new Expr(tanh(dot(*X[0], Wxh) + dot(h0, Whh) + bh));
  for (int t = 1; t <= num_inputs; ++t) {
    H[t] = new Expr(tanh(dot(*X[t], Wxh) + dot(*H[t-1], Whh) + bh));
  }
  Expr Wxh_d = g.param(shape={output_size, hidden_size}, init=uniform(), name="Wxh_d");
  Expr Whh_d = g.param(shape={hidden_size, hidden_size}, init=uniform(), name="Whh_d");
  Expr bh_d = g.param(shape={1, hidden_size}, init=uniform(), name="bh_d");
  std::cerr << "Building decoder RNN..." << std::endl;
  auto h0_d = *H[num_inputs];
  S[0] = new Expr(tanh(dot(*Y[0], Wxh_d) + dot(h0_d, Whh_d) + bh_d));
  for (int t = 1; t < num_outputs; ++t) {
    S[t] = new Expr(tanh(dot(*Y[t], Wxh_d) + dot(*S[t-1], Whh_d) + bh_d));
  }
  Expr Why = g.param(shape={hidden_size, output_size}, init=uniform(), name="Why");
  Expr by = g.param(shape={1, output_size}, init=uniform(), name="by");
  std::cerr << "Building output layer..." << std::endl;
  std::vector<Expr*> Yp(num_outputs+1); // For the stop symbol.
  Expr* cross_entropy = NULL;
  for (int t = 0; t <= num_outputs; ++t) {
    if (t == 0) {
      Yp[t] = new Expr(named(softmax_fast(dot(h0_d, Why) + by), "pred"));
      cross_entropy = new Expr(sum(*Y[t] * log(*Yp[t]), axis=1));
    } else {
      Yp[t] = new Expr(named(softmax_fast(dot(*S[t-1], Why) + by), "pred"));
      *cross_entropy = *cross_entropy + sum(*Y[t] * log(*Yp[t]), axis=1);
    }
  }
  auto graph = -mean(*cross_entropy, axis=0, name="cost");
  // For the stop symbol.
  for (int t = 0; t <= num_inputs; ++t) {
    Tensor Xt({batch_size, input_size});
    float max = 1.;
    std::vector<float> values(batch_size * input_size);
    std::vector<float> classes(batch_size * output_size, 0.0);
@ -101,13 +133,14 @@ int main(int argc, char** argv) {
         values[k] = max * (2.0*static_cast<float>(rand()) / RAND_MAX - 1.0);
      }
    }
    thrust::copy(values.begin(), values.end(), Xt.begin());
-
+    std::stringstream ss;
-    *X[t] = Xt;
+    ss << "X" << t;
    g[ss.str()] = Xt;
  }
-  for (int t = 0; t < num_outputs; ++t) {
+  // Generate output data (include the stop symbol).
  for (int t = 0; t <= num_outputs; ++t) {
    Tensor Yt({batch_size, output_size});
    std::vector<float> classes(batch_size * output_size, 0.0);
@ -117,26 +150,31 @@ int main(int argc, char** argv) {
      classes[l + gold] = 1.0;
      l += output_size;
    }
    thrust::copy(classes.begin(), classes.end(), Yt.begin());
-
+    std::stringstream ss;
-    *Y[t] = Yt;
+    ss << "Y" << t;
    g[ss.str()] = Yt;
  }
  std::cerr << "Printing the computation graph..." << std::endl;
  std::cout << g.graphviz() << std::endl;
  std::cerr << "Running the forward step..." << std::endl;
  g.forward(batch_size);
  std::cerr << "Running the backward step..." << std::endl;
  g.backward();
  std::cerr << "Done." << std::endl;
-  std::cerr << graph.val().Debug() << std::endl;
+  std::cerr << g["cost"].val().Debug() << std::endl;
-  std::cerr << X[0]->val().Debug() << std::endl;
+  std::cerr << g["X0"].val().Debug() << std::endl;
-  std::cerr << Y[0]->val().Debug() << std::endl;
+  std::cerr << g["Y0"].val().Debug() << std::endl;
-
+  std::cerr << g["Whh"].grad().Debug() << std::endl;
-  std::cerr << Whh.grad().Debug() << std::endl;
+  std::cerr << g["bh"].grad().Debug() << std::endl;
-  std::cerr << bh.grad().Debug() << std::endl;
+  std::cerr << g["Why"].grad().Debug() << std::endl;
-  std::cerr << Why.grad().Debug() << std::endl;
+  std::cerr << g["by"].grad().Debug() << std::endl;
-  std::cerr << by.grad().Debug() << std::endl;
+  std::cerr << g["Wxh"].grad().Debug() << std::endl;
-  std::cerr << Wxh.grad().Debug() << std::endl;
+  std::cerr << g["h0"].grad().Debug() << std::endl;
  std::cerr << h0.grad().Debug() << std::endl;
  return 0;
 }
--- a/src/validate_mnist.cu
+++ b/src/validate_mnist.cu
@ -10,7 +10,7 @@ const size_t IMAGE_SIZE = 784;
 const size_t LABEL_SIZE = 10;
 int BATCH_SIZE = 10000;
-ExpressionGraph build_graph(int cudaDevice) {
+ExpressionGraph build_graph() {
  std::cerr << "Loading model params...";
  NpzConverter converter("../scripts/test_model_single/model.npz");
@ -22,7 +22,7 @@ ExpressionGraph build_graph(int cudaDevice) {
  std::cerr << "Building model...";
-  ExpressionGraph g(cudaDevice);
+  ExpressionGraph g;
  auto x = named(g.input(shape={whatevs, IMAGE_SIZE}), "x");
  auto y = named(g.input(shape={whatevs, LABEL_SIZE}), "y");
@ -52,7 +52,7 @@ int main(int argc, char** argv) {
  std::vector<float> testLabels = datasets::mnist::ReadLabels("../examples/mnist/t10k-labels-idx1-ubyte", BATCH_SIZE, LABEL_SIZE);
  std::cerr << "Done." << std::endl;
-  ExpressionGraph g = build_graph(0);
+  ExpressionGraph g = build_graph();
  Tensor xt({BATCH_SIZE, IMAGE_SIZE});
  Tensor yt({BATCH_SIZE, LABEL_SIZE});
--- a/src/validate_mnist_batch.cu
+++ b/src/validate_mnist_batch.cu
@ -56,8 +56,7 @@ int main(int argc, char** argv) {
  std::cerr << "\tDone." << std::endl;
-
+  ExpressionGraph g;
  ExpressionGraph g(0);
  auto x = g.input(shape={whatevs, IMAGE_SIZE}, name="X");
  auto y = g.input(shape={whatevs, LABEL_SIZE}, name="Y");
--- a/src/vocab.cpp
+++ b/src/vocab.cpp
@ -24,22 +24,6 @@ inline std::vector<std::string> Tokenize(const std::string& str,
  return tokens;
 }
 ////////////////////////////////////////////////////////
 // Returns the id used for the UNK token: the largest value a size_t can hold.
 size_t Vocab::GetUNK() const
 {
 	const size_t largestId = std::numeric_limits<size_t>::max();
 	return largestId;
 }
 // Returns the id used for the padding token: one below the largest size_t value.
 size_t Vocab::GetPad() const
 {
 	const size_t largestId = std::numeric_limits<size_t>::max();
 	return largestId - 1;
 }
 // Returns the id used for the EOS token: two below the largest size_t value.
 size_t Vocab::GetEOS() const
 {
 	const size_t largestId = std::numeric_limits<size_t>::max();
 	return largestId - 2;
 }
 size_t Vocab::GetOrCreate(const std::string &word)
 {
@ -55,6 +39,12 @@ size_t Vocab::GetOrCreate(const std::string &word)
 	return id;
 }
 size_t Vocab::Get(const std::string &word) const
 {
 	Coll::const_iterator iter = coll_.find(word);
 	return iter->second;
 }
 std::vector<size_t> Vocab::ProcessSentence(const std::string &sentence)
 {
 	vector<string> toks = Tokenize(sentence);
--- a/src/vocab.h
+++ b/src/vocab.h
@ -7,12 +7,22 @@
 class Vocab
 {
 public:
-	size_t GetOrCreate(const std::string &word);
+  Vocab() {
    GetOrCreate("__UNK__");
    GetOrCreate("__PAD__");
    GetOrCreate("__EOS__");
  }
  virtual ~Vocab() {}
 public:
        size_t Size() const { return coll_.size(); }
        size_t Get(const std::string &word) const;
        size_t GetOrCreate(const std::string &word);
 	std::vector<size_t> ProcessSentence(const std::string &sentence);
-	size_t GetUNK() const;
+	size_t GetUNK() const { return Get("__UNK__"); }
-	size_t GetPad() const;
+	size_t GetPAD() const { return Get("__PAD__"); }
-	size_t GetEOS() const;
+	size_t GetEOS() const { return Get("__EOS__"); }
 protected:
 	typedef std::unordered_map<std::string, size_t> Coll;
 	Coll coll_;