Mirror of https://github.com/marian-nmt/marian.git (synced 2024-11-03 20:13:47 +03:00)

Commit 5d924dd160: Merge branch 'master' of https://github.com/emjotde/Marian
CMakeLists.txt

@@ -2,8 +2,8 @@ cmake_minimum_required(VERSION 3.5.1)
 set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)

 project(marian CXX)
-SET(CMAKE_CXX_FLAGS " -std=c++11 -g -O0 -funroll-loops -Wno-unused-result -Wno-deprecated")
-LIST(APPEND CUDA_NVCC_FLAGS --default-stream per-thread; -std=c++11; -g; -O0; -arch=sm_35; -lineinfo; --use_fast_math; -Xcompiler '-fPIC')
+SET(CMAKE_CXX_FLAGS " -std=c++11 -g -O3 -funroll-loops -Wno-unused-result -Wno-deprecated")
+LIST(APPEND CUDA_NVCC_FLAGS --default-stream per-thread; -std=c++11; -g; -O3; -arch=sm_35; -lineinfo; --use_fast_math; -Xcompiler '-fPIC')
 add_definitions(-DCUDA_API_PER_THREAD_DEFAULT_STREAM)
 SET(CUDA_PROPAGATE_HOST_FLAGS OFF)
examples/mt/download.sh (new executable file, 4 lines)

@@ -0,0 +1,4 @@
+wget http://data.statmt.org/wmt16/translation-task/dev.tgz
+tar xvf dev.tgz
src/CMakeLists.txt

@@ -4,10 +4,12 @@ include_directories(.)
 cuda_add_library(marian_lib
   cnpy/cnpy.cpp
   exception.cpp
-  expressions.cu
+  expression_graph.cu
   sgd.cu
   tensor.cu
   tensor_operators.cu
   expression_operators.cu
+  vocab.cpp
 )

 target_link_libraries(marian_lib)
src/chainable.h (new file, 34 lines)

#pragma once

#include <vector>
#include <memory>

#include "exception.h"

namespace marian {

template <class DataType>
struct Chainable {
  Chainable() { }
  virtual ~Chainable() { }
  virtual void forward() { }
  virtual void backward() { }
  virtual void init_dependent() { }
  virtual void set_zero_adjoint() { }

  virtual void allocate(size_t) = 0;

  virtual const Shape& shape() = 0;
  virtual DataType& val() = 0;
  virtual DataType grad() = 0;
  virtual void setVal(DataType t) {
    UTIL_THROW2("Tensors can only be assigned to input nodes");
  }
};

typedef std::vector<Chainable<Tensor>*> ChainableStack;
typedef std::shared_ptr<ChainableStack> ChainableStackPtr;
typedef std::shared_ptr<Chainable<Tensor>> ChainPtr;

}
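The Chainable interface above is the whole reverse-mode autodiff contract: every node is pushed onto a shared stack in creation order, which is automatically a topological order, so a forward pass just replays the stack and a backward pass seeds the root via init_dependent() and walks the stack in reverse. A minimal self-contained sketch of the same tape design, using plain C++ scalars instead of marian's Tensor (all names here are hypothetical, not marian's):

#include <cstdio>
#include <memory>
#include <vector>

// Scalar stand-in for a tape node: val is the forward value,
// adj is the adjoint (gradient of the output w.r.t. this node).
struct Node {
  float val = 0.f;
  float adj = 0.f;
  virtual ~Node() {}
  virtual void forward() {}
  virtual void backward() {}
};

using Tape = std::vector<std::shared_ptr<Node>>;

struct Input : Node {};

struct Mul : Node {
  Node *a, *b;
  Mul(Node* a, Node* b) : a(a), b(b) {}
  void forward() override { val = a->val * b->val; }
  void backward() override { a->adj += adj * b->val; b->adj += adj * a->val; }
};

struct Add : Node {
  Node *a, *b;
  Add(Node* a, Node* b) : a(a), b(b) {}
  void forward() override { val = a->val + b->val; }
  void backward() override { a->adj += adj; b->adj += adj; }
};

int main() {
  Tape tape;
  auto push = [&tape](Node* n) { tape.emplace_back(n); return n; };

  Node* x = push(new Input()); x->val = 2.f;
  Node* y = push(new Input()); y->val = 3.f;
  Node* f = push(new Add(push(new Mul(x, y)), x));  // f = x*y + x

  for (auto& n : tape) n->forward();    // creation order = topological order
  for (auto& n : tape) n->adj = 0.f;    // set_zero_adjoint
  tape.back()->adj = 1.f;               // init_dependent
  for (auto it = tape.rbegin(); it != tape.rend(); ++it) (*it)->backward();

  std::printf("f=%g df/dx=%g df/dy=%g\n", f->val, x->adj, y->adj);
  // expected: f=8, df/dx = y+1 = 4, df/dy = x = 2
  return 0;
}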
src/expression_graph.cu (new file, 41 lines)

#include <sstream>
#include "expression_graph.h"

using namespace std;

namespace marian {

Expr::Expr(ExpressionGraphPtr g, Chainable<Tensor>* chainable)
  : graph_(g), pimpl_(chainable) {
  graph_->stack()->push_back(chainable);
}

Tensor Expr::val() {
  return pimpl_->val();
}

Tensor Expr::grad() {
  return pimpl_->grad();
}

ChainPtr Expr::node() {
  return pimpl_;
}

ExpressionGraphPtr Expr::graph() {
  return graph_;
}

Expr::operator ChainPtr() {
  return pimpl_;
}

std::string Expr::Debug() const
{
  stringstream strm;
  const Shape& shape = pimpl_->shape();
  strm << marian::Debug(shape);
  return strm.str();
}

}
src/expression_graph.h (new file, 120 lines)

#pragma once

#include <map>

#include "definitions.h"
#include "chainable.h"
#include "node_operators.h"
#include "tensor.h"

namespace marian {

class ExpressionGraph;
typedef ExpressionGraph* ExpressionGraphPtr;

class Expr {
  public:
    Expr(ExpressionGraphPtr g, Chainable<Tensor>* chainable);

    Expr operator=(Tensor t) {
      pimpl_->setVal(t);
      return *this;
    }

    Tensor val();
    Tensor grad();

    ExpressionGraphPtr graph();

    ChainPtr node();
    operator ChainPtr();

    std::string Debug() const;

  private:
    ExpressionGraphPtr graph_;
    ChainPtr pimpl_;
};

class ExpressionGraph {
  public:
    ExpressionGraph()
      : stack_(new ChainableStack)
    {}

    void forward(size_t batchSize) {
      for(auto&& v : *stack_) {
        v->allocate(batchSize);
      }
      for(auto&& v : *stack_)
        v->forward();
    }

    void backward() {
      for(auto&& v : *stack_)
        v->set_zero_adjoint();

      typedef typename ChainableStack::reverse_iterator It;
      stack_->back()->init_dependent();
      for(It it = stack_->rbegin(); it != stack_->rend(); ++it)
        (*it)->backward();
    }

    template <typename ...Args>
    inline Expr input(Args ...args) {
      return Expr(this, new InputNode(args...));
    }

    template <typename ...Args>
    inline Expr param(Args ...args) {
      Expr e(this, new ParamNode(args...));
      params_.emplace_back(e);
      return e;
    }

    template <typename ...Args>
    inline Expr constant(Args ...args) {
      return Expr(this, new ConstantNode(args...));
    }

    template <typename ...Args>
    inline Expr ones(Args ...args) {
      return Expr(this, new ConstantNode(keywords::value=1, args...));
    }

    template <typename ...Args>
    inline Expr zeroes(Args ...args) {
      return Expr(this, new ConstantNode(keywords::value=0, args...));
    }

    /*********************************************************/

    ChainableStackPtr stack() {
      return stack_;
    }

    Expr& operator[](const std::string& name) {
      auto it = named_.find(name);
      UTIL_THROW_IF2(it == named_.end(), "No such named node in graph: " << name);
      return it->second;
    }

    bool has_node(const std::string& name) const {
      return named_.count(name) > 0;
    }

    void add_named_node(Expr e, const std::string& name) {
      named_.emplace(name, e);
    }

    std::vector<Expr>& params() {
      return params_;
    }

  private:
    ChainableStackPtr stack_;
    std::map<std::string, Expr> named_;
    std::vector<Expr> params_;
};

}
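This header encodes the commit's main shift: the tape (stack_) now belongs to an ExpressionGraph instance instead of a global static on Chainable, so multiple graphs can coexist and SGD can drive training through the graph alone. A hedged usage sketch, assuming this commit's marian.h (expression_graph.h, expression_operators.h, param_initializers.h) and keyword arguments; it mirrors the tests further down rather than defining any new API:

#include <iostream>
#include "marian.h"

using namespace marian;
using namespace keywords;

int main() {
  ExpressionGraph g;
  Expr x = named(g.input(shape={whatevs, 4}), "x");   // data enters here
  Expr w = g.param(shape={4, 2}, init=uniform());     // tracked in g.params()
  Expr y = named(softmax_fast(dot(x, w)), "y");

  // (in real use an input Tensor is assigned first: g["x"] = xt)
  g.forward(/*batchSize=*/32);  // allocate + run nodes in creation order
  g.backward();                 // reverse sweep fills grad() of every param

  std::cerr << g["y"].val().Debug() << std::endl;     // lookup by name
  return 0;
}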
src/expression_operators.cu (new file, 124 lines)

#include "expression_operators.h"
#include "node_operators.h"

namespace marian {

Expr named(Expr a, const std::string& name) {
  a.graph()->add_named_node(a, name);
  return a;
}

Expr logit(Expr a) {
  return Expr(a.graph(), new LogitNodeOp(a));
}

Expr tanh(Expr a) {
  return Expr(a.graph(), new TanhNodeOp(a));
}

Expr log(Expr a) {
  return Expr(a.graph(), new LogNodeOp(a));
}

Expr exp(Expr a) {
  return Expr(a.graph(), new ExpNodeOp(a));
}

Expr operator-(Expr a) {
  return Expr(a.graph(), new NegNodeOp(a));
}

Expr softmax_fast(Expr a) {
  return Expr(a.graph(), new SoftmaxNodeOp(a));
}

/*********************************************************/

static Shape newShape(ChainPtr a, ChainPtr b) {
  size_t dimsA = a->shape().size();
  size_t dimsB = b->shape().size();
  UTIL_THROW_IF2(dimsA != dimsB,
                 "Tensors have different numbers of dimensions");
  Shape shape(dimsA);
  for(size_t i = 0; i < dimsA; ++i) {
    int dimA = a->shape()[i];
    int dimB = b->shape()[i];
    bool broadcastable = (dimA == dimB || dimA == 1 || dimB == 1);
    UTIL_THROW_IF2(!broadcastable, "Different dimensions in elementwise "
                   << "operation cannot be broadcasted: " << dimA << " != " << dimB);
    shape[i] = std::max(dimA, dimB);
    if(dimA == whatevs || dimB == whatevs)
      shape[i] = whatevs;
  }
  return shape;
}

Expr broadcast(Shape bShape, Expr a) {
  const Shape& aShape = a.node()->shape();
  if(aShape == bShape) {
    return a;
  }
  else {
    size_t dimsA = aShape.size();
    size_t dimsB = bShape.size();
    UTIL_THROW_IF2(dimsA != dimsB,
                   "Tensor and shape have different number of dimensions");
    for(size_t i = 0; i < dimsA; ++i) {
      int dimA = aShape[i];
      int dimB = bShape[i];
      bool broadcastable = (dimA == dimB || dimA == 1);
      UTIL_THROW_IF2(!broadcastable,
                     "Cannot broadcast tensor dimension "
                     << dimA << " to " << dimB);
      if(dimA == 1 && dimB != 1) {
        if(i == 0) {
          Expr one = a.graph()->ones(keywords::shape={bShape[0], 1});
          a = dot(one, a);
        }
        else if(i == 1) {
          Expr one = a.graph()->ones(keywords::shape={1, bShape[1]});
          a = dot(a, one);
        }
        else {
          UTIL_THROW2("Not implemented");
        }
      }
    }
    return a;
  }
}

Expr operator+(Expr a, Expr b) {
  Shape shape = newShape(a, b);
  Expr cast_a = broadcast(shape, a);
  Expr cast_b = broadcast(shape, b);
  return Expr(a.graph(), new PlusNodeOp(cast_a, cast_b));
}

Expr operator-(Expr a, Expr b) {
  Shape shape = newShape(a, b);
  Expr cast_a = broadcast(shape, a);
  Expr cast_b = broadcast(shape, b);
  return Expr(a.graph(), new MinusNodeOp(cast_a, cast_b));
}

Expr operator*(Expr a, Expr b) {
  Shape shape = newShape(a, b);
  Expr cast_a = broadcast(shape, a);
  Expr cast_b = broadcast(shape, b);
  return Expr(a.graph(), new MultNodeOp(cast_a, cast_b));
}

Expr operator/(Expr a, Expr b) {
  Shape shape = newShape(a, b);
  Expr cast_a = broadcast(shape, a);
  Expr cast_b = broadcast(shape, b);
  return Expr(a.graph(), new DivNodeOp(cast_a, cast_b));
}

Expr dot(Expr a, Expr b) {
  return Expr(a.graph(), new DotNodeOp(a, b));
}

}
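broadcast() materializes broadcasting with a matrix product against a vector of ones: a {1,n} row becomes {m,n} via dot(ones({m,1}), a), and the dot's backward pass then sums gradients back into the original row. A self-contained numeric illustration in plain C++ (naive matmul, hypothetical helper names):

#include <cstdio>
#include <vector>

// Naive row-major matmul: C[m x n] = A[m x k] * B[k x n].
static std::vector<float> matmul(const std::vector<float>& A,
                                 const std::vector<float>& B,
                                 int m, int k, int n) {
  std::vector<float> C(m * n, 0.f);
  for (int i = 0; i < m; ++i)
    for (int p = 0; p < k; ++p)
      for (int j = 0; j < n; ++j)
        C[i * n + j] += A[i * k + p] * B[p * n + j];
  return C;
}

int main() {
  // Row vector a with shape {1, 3}; broadcast it to {4, 3}.
  std::vector<float> a = {1.f, 2.f, 3.f};
  std::vector<float> ones(4, 1.f);                  // shape {4, 1}
  std::vector<float> tiled = matmul(ones, a, 4, 1, 3);
  for (int i = 0; i < 4; ++i)
    std::printf("%g %g %g\n", tiled[i*3], tiled[i*3+1], tiled[i*3+2]);
  // Every row is 1 2 3 -- exactly what broadcast() builds.
  return 0;
}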
src/expression_operators.h

@@ -1,115 +1,36 @@
 #pragma once

-#include "graph.h"
-#include "graph_operators.h"
-#include "expressions.h"
+#include "expression_graph.h"

 namespace marian {

-template <typename ...Args>
-inline Expr input(Args ...args) {
-  return Expr(new InputNode(args...));
-}
+Expr named(Expr a, const std::string& name);

-template <typename ...Args>
-inline Expr param(Args ...args) {
-  return Expr(new ParamNode(args...));
-}
-
-template <typename ...Args>
-inline Expr constant(Args ...args) {
-  return Expr(new ConstantNode(args...));
-}
+Expr logit(Expr a);

-template <typename ...Args>
-inline Expr ones(Args ...args) {
-  return Expr(new ConstantNode(keywords::value=1, args...));
-}
+Expr tanh(Expr a);

-template <typename ...Args>
-inline Expr zeroes(Args ...args) {
-  return Expr(new ConstantNode(keywords::value=0, args...));
-}
+Expr log(Expr a);

+Expr exp(Expr a);
+
+Expr operator-(Expr a);

 /*********************************************************/

-inline Expr logit(Expr a) {
-  return Expr(new LogitNodeOp(a));
-}
+Expr operator+(Expr a, Expr b);

-inline Expr tanh(Expr a) {
-  return Expr(new TanhNodeOp(a));
-}
+Expr operator-(Expr a, Expr b);

-inline Expr log(Expr a) {
-  return Expr(new LogNodeOp(a));
-}
+Expr operator*(Expr a, Expr b);

-inline Expr exp(Expr a) {
-  return Expr(new ExpNodeOp(a));
-}
+Expr operator/(Expr a, Expr b);

-inline Expr operator-(Expr a) {
-  return Expr(new NegNodeOp(a));
-}
-
-/*********************************************************/
-
-inline Expr operator+(Expr a, Expr b) {
-  return Expr(new PlusNodeOp(a, b));
-}
-
-inline Expr operator-(Expr a, Expr b) {
-  return Expr(new MinusNodeOp(a, b));
-}
-
-inline Expr operator*(Expr a, Expr b) {
-  return Expr(new MultNodeOp(a, b));
-}
-
-inline Expr operator/(Expr a, Expr b) {
-  return Expr(new DivNodeOp(a, b));
-}
-
-inline Expr dot(Expr a, Expr b) {
-  return Expr(new DotNodeOp(a, b));
-}
+Expr dot(Expr a, Expr b);

 /******************************************************/

-Expr broadcast(Shape bShape, Expr a) {
-  const Shape& aShape = a.node()->shape();
-  if(aShape == bShape) {
-    return a;
-  }
-  else {
-    size_t dimsA = aShape.size();
-    size_t dimsB = bShape.size();
-    UTIL_THROW_IF2(dimsA != dimsB,
-                   "Tensor and shape have different number of dimensions");
-    for(size_t i = 0; i < dimsA; ++i) {
-      int dimA = aShape[i];
-      int dimB = bShape[i];
-      bool broadcastable = (dimA == dimB || dimA == 1);
-      UTIL_THROW_IF2(!broadcastable,
-                     "Cannot broadcast tensor dimension "
-                     << dimA << " to " << dimB);
-      if(dimA == 1 && dimB != 1) {
-        if(i == 0) {
-          Expr one = ones(keywords::shape={bShape[0], 1});
-          a = dot(one, a);
-        }
-        else if(i == 1) {
-          Expr one = ones(keywords::shape={1, bShape[1]});
-          a = dot(a, one);
-        }
-        else {
-          UTIL_THROW2("Not implemented");
-        }
-      }
-    }
-    return a;
-  }
-}
+Expr broadcast(Shape bShape, Expr a);

 /*********************************************************/

@@ -126,7 +47,7 @@ inline Expr sum(Expr a, Args ...args) {
       int rows = n->val().shape()[0];
       return {1, rows};
     };
-    Expr one = ones(shape={1, n->shape()[0]},
+    Expr one = a.graph()->ones(shape={1, n->shape()[0]},
                     lazy_shape=lshape);
     return dot(one, a);
   }
@@ -136,8 +57,8 @@ inline Expr sum(Expr a, Args ...args) {
       //std::cerr << "Shape will be " << cols << " by 1." << std::endl;
       return {cols, 1};
     };
-    Expr one = ones(shape={n->shape()[1], 1},
-                    lazy_shape=lshape);
+    Expr one = a.graph()->ones(shape={n->shape()[1], 1},
+                               lazy_shape=lshape);
    return dot(a, one);
  }
  else if(ax == 2) {
@@ -151,17 +72,12 @@ inline Expr sum(Expr a, Args ...args) {

 // inefficient
 template <typename ...Args>
-inline Expr softmax(Expr a, Args ...args) {
+Expr softmax(Expr a, Args ...args) {
   Expr e = exp(a);
   return e / sum(e, args...);
 }

-template <typename ...Args>
-inline Expr softmax_fast(Expr a, Args ...args) {
-  Expr e = Expr(new SoftmaxNodeOp(a, args...));
-  return e;
-}
+Expr softmax_fast(Expr a);

 // inefficient
 template <typename ...Args>
@@ -173,12 +89,12 @@ inline Expr mean(Expr a, Args ...args) {
   ChainPtr n = a.node();
   switch (ax) {
     case 0:
-      return sum(a, axis=0) / constant(shape={1, 1},
+      return sum(a, axis=0) / a.graph()->constant(shape={1, 1},
                    lazy_value=[n]() -> Float {
                      return n->val().shape()[0];
                    });
     case 1:
-      return sum(a, axis=1) / constant(shape={1, 1},
+      return sum(a, axis=1) / a.graph()->constant(shape={1, 1},
                    lazy_value=[n]() -> Float {
                      return n->val().shape()[1];
                    });
@@ -187,7 +103,7 @@ inline Expr mean(Expr a, Args ...args) {
     case 3:
       UTIL_THROW2("Not implemented");
     default:
-      return sum(a) / constant(shape={1, 1},
+      return sum(a) / a.graph()->constant(shape={1, 1},
                    lazy_value=[n]() -> Float {
                      return n->val().size();
                    });
src/expressions.cu (deleted, 59 lines)

@@ -1,59 +0,0 @@
-#include <sstream>
-#include "expressions.h"
-#include "graph_operators.h"
-
-using namespace std;
-
-namespace marian {
-
-Expr::Expr(Chainable<Tensor>* chainable) : pimpl_(chainable) {}
-Expr::Expr(Float v) : pimpl_(new ConstantNode(keywords::value=v,
-                                              keywords::shape={1,1})) {}
-
-Tensor Expr::val() {
-  return pimpl_->val();
-}
-
-Tensor Expr::grad() {
-  return pimpl_->grad();
-}
-
-ChainPtr Expr::node() {
-  return pimpl_;
-}
-
-void Expr::forward(size_t batchSize) {
-  UTIL_THROW_IF2(pimpl_.get() != Chainable<Tensor>::stack.back(),
-                 "Trying to call forward on non-root of computation graph");
-  for(auto&& v : Chainable<Tensor>::stack) {
-    v->allocate(batchSize);
-  }
-  for(auto&& v : Chainable<Tensor>::stack)
-    v->forward();
-}
-
-void Expr::backward() {
-  UTIL_THROW_IF2(pimpl_.get() != Chainable<Tensor>::stack.back(),
-                 "Trying to call backward on non-root of computation graph");
-  for(auto&& v : Chainable<Tensor>::stack)
-    v->set_zero_adjoint();
-
-  typedef typename Chainable<Tensor>::ChainableStack::reverse_iterator It;
-  pimpl_->init_dependent();
-  for(It it = Chainable<Tensor>::stack.rbegin(); it != Chainable<Tensor>::stack.rend(); ++it)
-    (*it)->backward();
-}
-
-Expr::operator ChainPtr() {
-  return pimpl_;
-}
-
-std::string Expr::Debug() const
-{
-  stringstream strm;
-  const Shape& shape = pimpl_->shape();
-  strm << marian::Debug(shape);
-  return strm.str();
-}
-
-}
src/expressions.h (deleted, 33 lines)

@@ -1,33 +0,0 @@
-#pragma once
-
-#include "definitions.h"
-#include "graph.h"
-
-namespace marian {
-
-class Expr {
-  public:
-    Expr(Chainable<Tensor>* chainable);
-    Expr(Float v);
-
-    Expr operator=(Tensor t) {
-      pimpl_->setVal(t);
-      return *this;
-    }
-
-    Tensor val();
-    Tensor grad();
-
-    void forward(size_t batchSize);
-    void backward();
-
-    ChainPtr node();
-    operator ChainPtr();
-
-    std::string Debug() const;
-
-  private:
-    ChainPtr pimpl_;
-};
-
-}
src/marian.h

@@ -1,9 +1,7 @@
 #pragma once

 #include "definitions.h"
-#include "graph.h"
-#include "graph_operators.h"
-#include "expressions.h"
-#include "expression_operators.h"
+#include "expression_graph.h"
+#include "param_initializers.h"
+#include "expression_operators.h"
src/node.h

@@ -2,36 +2,10 @@

 #include "keywords.h"
 #include "tensor.h"
+#include "chainable.h"

 namespace marian {

-template <class DataType>
-struct Chainable {
-  Chainable() { }
-  virtual ~Chainable() { }
-  virtual void forward() { }
-  virtual void backward() { }
-  virtual void init_dependent() { }
-  virtual void set_zero_adjoint() { }
-
-  virtual void allocate(size_t) = 0;
-
-  virtual const Shape& shape() = 0;
-  virtual DataType& val() = 0;
-  virtual DataType grad() = 0;
-  virtual void setVal(DataType t) {
-    UTIL_THROW2("Tensors can only be assigned to input nodes");
-  }
-
-  typedef std::vector<Chainable<DataType>*> ChainableStack;
-  static ChainableStack stack;
-};
-
-template <class DataType>
-typename Chainable<DataType>::ChainableStack Chainable<DataType>::stack;
-
-typedef std::shared_ptr<Chainable<Tensor>> ChainPtr;
-
 class Node : public Chainable<Tensor>,
              public keywords::Keywords {
   public:
@@ -40,9 +14,7 @@ class Node : public Chainable<Tensor>,
     : Keywords(args...),
       shape_(Get<Shape>(keywords::shape, {1, 1})),
       name_(Get<std::string>(keywords::name, "none"))
-  {
-    stack.push_back(this);
-  }
+  { }

   virtual ~Node() {};
src/node_operators.h

@@ -1,7 +1,6 @@
 #pragma once

-#include "expressions.h"
-#include "graph.h"
+#include "node.h"
 #include "tensor_operators.h"

 namespace marian {
@@ -108,49 +107,14 @@ struct TanhNodeOp : public UnaryNodeOp {
   }
 };

-struct ArgmaxOp : public UnaryNodeOp {
-  template <typename ...Args>
-  ArgmaxOp(ChainPtr a, Args ...args)
-    : UnaryNodeOp(a, keywords::shape=newShape(a, -1), args...),
-      axis_(-1) { }
-
-  Shape newShape(ChainPtr a, int axis) {
-    Shape shape1 = a->shape();
-    UTIL_THROW_IF2(shape1.size() > 2,
-                   "Tensors with more than 2 dimensions not supported yet");
-    if(axis == 0) {
-      shape1[0] = 1;
-    }
-    else if(axis == 1) {
-      shape1[1] = 1;
-    }
-    else {
-      shape1 = {1, 1};
-    }
-    return shape1;
-  }
-
-  void forward() {
-    //val_ = Argmax(a_->val(), axis_);
-    UTIL_THROW2("Not implemented");
-  }
-
-  void backward() {
-    UTIL_THROW2("Not implemented");
-  }
-
-  private:
-    int axis_;
-};
-
+// @TODO, make this numerically safe(r):
+// softmax(X) = softmax_safe(X - max(X, axis=1))
+// Probably best to do this directly in Softmax
+// function.
 struct SoftmaxNodeOp : public UnaryNodeOp {
   template <typename ...Args>
-  SoftmaxNodeOp(ChainPtr a, Args ...args)
-    : UnaryNodeOp(a, args...) { }
+  SoftmaxNodeOp(Args ...args)
+    : UnaryNodeOp(args...) { }

   void forward() {
     // B = softmax(A).
@@ -171,8 +135,8 @@ struct SoftmaxNodeOp : public UnaryNodeOp {

 struct LogNodeOp : public UnaryNodeOp {
   template <typename ...Args>
-  LogNodeOp(ChainPtr a, Args ...args)
-    : UnaryNodeOp(a, args...) {}
+  LogNodeOp(Args ...args)
+    : UnaryNodeOp(args...) {}

   void forward() {
     Element(_1 = Log(_2), val_, a_->val());
@@ -186,8 +150,8 @@ struct LogNodeOp : public UnaryNodeOp {

 struct ExpNodeOp : public UnaryNodeOp {
   template <typename ...Args>
-  ExpNodeOp(ChainPtr a, Args ...args)
-    : UnaryNodeOp(a, args...) { }
+  ExpNodeOp(Args ...args)
+    : UnaryNodeOp(args...) { }

   void forward() {
     Element(_1 = Exp(_2), val_, a_->val());
@@ -230,7 +194,7 @@ struct DotNodeOp : public BinaryNodeOp {
   template <typename ...Args>
   DotNodeOp(ChainPtr a, ChainPtr b, Args ...args)
     : BinaryNodeOp(a, b,
-                   keywords::shape=newShape(a,b),
+                   keywords::shape=newShape(a, b),
                    args...) { }

   Shape newShape(ChainPtr a, ChainPtr b) {
@@ -258,41 +222,11 @@ struct DotNodeOp : public BinaryNodeOp {
   }
 };

-Expr broadcast(Shape shape, Expr a);
-
-struct BroadcastingNodeOp : public BinaryNodeOp {
-  template <typename ...Args>
-  BroadcastingNodeOp(Expr a, Expr b, Args ...args)
-    : BinaryNodeOp(broadcast(newShape(a ,b), a),
-                   broadcast(newShape(a ,b), b),
-                   keywords::shape=newShape(a, b),
-                   args...) {}
-
-  static Shape newShape(ChainPtr a, ChainPtr b) {
-    size_t dimsA = a->shape().size();
-    size_t dimsB = b->shape().size();
-    UTIL_THROW_IF2(dimsA != dimsB,
-                   "Tensors have different numbers of dimensions");
-    Shape shape(dimsA);
-    for(size_t i = 0; i < dimsA; ++i) {
-      int dimA = a->shape()[i];
-      int dimB = b->shape()[i];
-      bool broadcastable = (dimA == dimB || dimA == 1 || dimB == 1);
-      UTIL_THROW_IF2(!broadcastable, "Different dimensions in elementwise "
-                     << "operation cannot be broadcasted: " << dimA << " != " << dimB);
-      shape[i] = std::max(dimA, dimB);
-      if(dimA == whatevs || dimB == whatevs)
-        shape[i] = whatevs;
-    }
-    return shape;
-  }
-};
-
-struct PlusNodeOp : public BroadcastingNodeOp {
+struct PlusNodeOp : public BinaryNodeOp {
   template <typename ...Args>
-  PlusNodeOp(Args ...args) : BroadcastingNodeOp(args...) { }
+  PlusNodeOp(ChainPtr a, ChainPtr b, Args ...args)
+    : BinaryNodeOp(a, b, keywords::shape=a->shape(), args...) { }

   void forward() {
     Element(_1 = _2 + _3,
             val_, a_->val(), b_->val());
@@ -306,10 +240,11 @@ struct PlusNodeOp : public BroadcastingNodeOp {
   }
 };

-struct MinusNodeOp : public BroadcastingNodeOp {
+struct MinusNodeOp : public BinaryNodeOp {
   template <typename ...Args>
-  MinusNodeOp(Args ...args) : BroadcastingNodeOp(args...) { }
+  MinusNodeOp(ChainPtr a, ChainPtr b, Args ...args)
+    : BinaryNodeOp(a, b, keywords::shape=a->shape(), args...) { }

   void forward() {
     Element(_1 = _2 - _3,
             val_, a_->val(), b_->val());
@@ -323,10 +258,11 @@ struct MinusNodeOp : public BroadcastingNodeOp {
   }
 };

-struct MultNodeOp : public BroadcastingNodeOp {
+struct MultNodeOp : public BinaryNodeOp {
   template <typename ...Args>
-  MultNodeOp(Args ...args) : BroadcastingNodeOp(args...) { }
+  MultNodeOp(ChainPtr a, ChainPtr b, Args ...args)
+    : BinaryNodeOp(a, b, keywords::shape=a->shape(), args...) { }

   void forward() {
     Element(_1 = _2 * _3,
             val_, a_->val(), b_->val());
@@ -340,9 +276,10 @@ struct MultNodeOp : public BroadcastingNodeOp {
   }
 };

-struct DivNodeOp : public BroadcastingNodeOp {
+struct DivNodeOp : public BinaryNodeOp {
   template <typename ...Args>
-  DivNodeOp(Args ...args) : BroadcastingNodeOp(args...) { }
+  DivNodeOp(ChainPtr a, ChainPtr b, Args ...args)
+    : BinaryNodeOp(a, b, keywords::shape=a->shape(), args...) { }

   void forward() {
     Element(_1 = _2 / _3,
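The @TODO on SoftmaxNodeOp spells out the standard fix: softmax(X) = softmax(X - max(X, axis=1)), since the row maximum cancels in exp(x-m)/sum(exp(x-m)) while keeping exp() away from overflow. A self-contained sketch of that safe variant for one row (plain C++, not marian's GPU kernel):

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// Numerically safe softmax: identical result to the naive formula,
// but exp() never sees large positive inputs.
void softmax_safe(std::vector<float>& row) {
  float m = *std::max_element(row.begin(), row.end());
  float sum = 0.f;
  for (float& x : row) { x = std::exp(x - m); sum += x; }
  for (float& x : row) x /= sum;
}

int main() {
  std::vector<float> logits = {1000.f, 1001.f, 1002.f};  // naive exp() overflows
  softmax_safe(logits);
  std::printf("%g %g %g\n", logits[0], logits[1], logits[2]);
  // ~0.0900 0.2447 0.6652 -- the softmax of {0, 1, 2}
  return 0;
}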
src/param_initializers.h

@@ -18,7 +18,7 @@ void ones(Tensor t) {
 }

 template <class Distribution>
-void distribution(Tensor t, float a=0.0, float b=0.1) {
+void distribution(Tensor t, float a, float b) {
   std::random_device device;
   std::default_random_engine engine(device());
   Distribution dist(a, b);
@@ -43,7 +43,7 @@ std::function<void(Tensor)> uniform(float a = 0.0, float b = 0.1) {
 }

 std::function<void(Tensor)> from_vector(const std::vector<float>& v) {
-  return [&v](Tensor t) {
+  return [v](Tensor t) {
     t << v;
   };
 }
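The one-character change in from_vector() (capture [&v] becomes [v]) matters: the returned initializer runs later, when the parameter tensor is finally allocated, and a by-reference capture of the vector can dangle by then. A reduced, self-contained illustration (hypothetical names, not marian code):

#include <cstdio>
#include <functional>
#include <vector>

// Returns a closure that reads v's first element when invoked later.
std::function<float()> first_by_ref(const std::vector<float>& v) {
  return [&v]() { return v[0]; };   // dangles if the argument was a temporary
}

std::function<float()> first_by_val(const std::vector<float>& v) {
  return [v]() { return v[0]; };    // copies: safe to call at any time
}

int main() {
  auto bad  = first_by_ref(std::vector<float>{1.f, 2.f});  // temporary dies here
  auto good = first_by_val(std::vector<float>{1.f, 2.f});
  // bad() is undefined behavior; good() reliably returns 1.
  std::printf("%g\n", good());
  return 0;
}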
src/sgd.cu

@@ -7,15 +7,11 @@
 using namespace std;

 namespace marian {
-SGD::SGD(Expr& cost_func, Expr& inX, Expr& inY,
-         const std::vector<Expr*> params, float eta,
+SGD::SGD(ExpressionGraph& g, float eta,
          std::vector<float>& xData, size_t numFeatures,
          std::vector<float>& yData, size_t numClasses,
          size_t epochs, size_t batchSize)
-: cost_function_(&cost_func),
-  inX_(&inX),
-  inY_(&inY),
-  params_(params),
+: graph_(g),
   eta_(eta),
   xData_(xData),
   numFeatures_(numFeatures),
@@ -45,11 +41,11 @@ void SGD::Run()
       size_t endId = startId + batchSize;

       PrepareBatch(startId, endId, batchSize, shuffle, xt, yt);
-      *inX_ = xt;
-      *inY_ = yt;
+      graph_["x"] = xt;
+      graph_["y"] = yt;

-      cost_function_->forward(maxBatchSize_);
-      cost_function_->backward();
+      graph_.forward(maxBatchSize_);
+      graph_.backward();

       UpdateModel();

@@ -136,9 +132,9 @@ void SGD::PrepareBatch(
 }

 void SGD::UpdateModel() {
-  for (auto& param : params_) {
+  for (auto& param : graph_.params()) {
     using namespace thrust::placeholders;
-    Element(_1 = _1 - eta_ * _2, param->val(), param->grad());
+    Element(_1 -= eta_ * _2, param.val(), param.grad());
   }
 }
src/sgd.h

@@ -3,7 +3,7 @@
 #include <memory>
 #include <iostream>

-#include "expressions.h"
+#include "expression_graph.h"
 #include "thrust_functions.h"
 #include "tensor_operators.h"

@@ -11,8 +11,7 @@ namespace marian {

 class SGD {
   public:
-    SGD(Expr& cost_func, Expr& inX, Expr& inY,
-        const std::vector<Expr*> params, float eta,
+    SGD(ExpressionGraph& g, float eta,
         std::vector<float>& xData, size_t numFeatures,
         std::vector<float>& yData, size_t numClasses,
         size_t epochs, size_t batchSize);
@@ -20,10 +19,7 @@ class SGD {
     void Run();

   private:
-    Expr *cost_function_;
-    Expr *inX_;
-    Expr *inY_;
-    std::vector<Expr*> params_;
+    ExpressionGraph& graph_;
     const float eta_;
     std::vector<float>& xData_;
     const size_t numFeatures_;
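SGD::UpdateModel() above now iterates graph_.params() and applies the usual rule, theta <- theta - eta * grad, elementwise through a thrust placeholder expression on the GPU. The same arithmetic as a self-contained CPU sketch:

#include <cstdio>
#include <vector>

// CPU sketch of SGD::UpdateModel(): param -= eta * grad, elementwise.
// In marian this runs on the device as Element(_1 -= eta_ * _2, val, grad).
void update(std::vector<float>& val, const std::vector<float>& grad, float eta) {
  for (size_t i = 0; i < val.size(); ++i)
    val[i] -= eta * grad[i];
}

int main() {
  std::vector<float> w = {0.5f, -0.2f};
  std::vector<float> g = {0.1f, 0.4f};
  update(w, g, /*eta=*/0.9f);
  std::printf("%g %g\n", w[0], w[1]);  // 0.41 -0.56
  return 0;
}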
src/test.cu

@@ -1,55 +1,70 @@
+#include <fstream>
 #include "marian.h"
 #include "mnist.h"
+#include "vocab.h"

 int main(int argc, char** argv) {
   cudaSetDevice(0);

   using namespace std;
   using namespace marian;
   using namespace keywords;

+  Vocab sourceVocab, targetVocab;
+
   int input_size = 10;
   int output_size = 2;
   int batch_size = 25;
   int hidden_size = 5;
   int num_inputs = 8;

-  std::vector<Expr*> X(num_inputs);
-  std::vector<Expr*> Y(num_inputs);
-  std::vector<Expr*> H(num_inputs);
+  std::vector<Expr> X;
+  std::vector<Expr> Y;
+  std::vector<Expr> H;
+
+  ExpressionGraph g;

   for (int t = 0; t < num_inputs; ++t) {
-    X[t] = new Expr(input(shape={batch_size, input_size}));
-    Y[t] = new Expr(input(shape={batch_size, output_size}));
+    X.emplace_back(g.input(shape={batch_size, input_size}));
+    Y.emplace_back(g.input(shape={batch_size, output_size}));
   }

-  Expr Wxh = param(shape={input_size, hidden_size}, init=uniform(), name="Wxh");
-  Expr Whh = param(shape={hidden_size, hidden_size}, init=uniform(), name="Whh");
-  Expr bh = param(shape={1, hidden_size}, init=uniform(), name="bh");
-  Expr h0 = param(shape={1, hidden_size}, init=uniform(), name="h0");
+  Expr Wxh = g.param(shape={input_size, hidden_size}, init=uniform(), name="Wxh");
+  Expr Whh = g.param(shape={hidden_size, hidden_size}, init=uniform(), name="Whh");
+  Expr bh = g.param(shape={1, hidden_size}, init=uniform(), name="bh");
+  Expr h0 = g.param(shape={1, hidden_size}, init=uniform(), name="h0");

+  // read parallel corpus from file
+  std::fstream sourceFile("../examples/mt/dev/newstest2013.de");
+  std::fstream targetFile("../examples/mt/dev/newstest2013.en");
+
+  string sourceLine, targetLine;
+  while (getline(sourceFile, sourceLine)) {
+    getline(targetFile, targetLine);
+
+    std::vector<size_t> sourceIds = sourceVocab.ProcessSentence(sourceLine);
+    std::vector<size_t> targetIds = sourceVocab.ProcessSentence(targetLine);
+  }
+
   std::cerr << "Building RNN..." << std::endl;
-  H[0] = new Expr(tanh(dot(*X[0], Wxh) + dot(h0, Whh) + bh));
+  H.emplace_back(tanh(dot(X[0], Wxh) + dot(h0, Whh) + bh));
   for (int t = 1; t < num_inputs; ++t) {
-    H[t] = new Expr(tanh(dot(*X[t], Wxh) + dot(*H[t-1], Whh) + bh));
+    H.emplace_back(tanh(dot(X[t], Wxh) + dot(H[t-1], Whh) + bh));
   }

-  Expr Why = param(shape={hidden_size, output_size}, init=uniform(), name="Why");
-  Expr by = param(shape={1, output_size}, init=uniform(), name="by");
+  Expr Why = g.param(shape={hidden_size, output_size}, init=uniform(), name="Why");
+  Expr by = g.param(shape={1, output_size}, init=uniform(), name="by");

   std::cerr << "Building output layer..." << std::endl;
-  std::vector<Expr*> Yp(num_inputs);
+  std::vector<Expr> Yp;

-  Expr* cross_entropy = NULL;
-  for (int t = 0; t < num_inputs; ++t) {
-    Yp[t] = new Expr(softmax_fast(dot(*H[t], Why) + by, name="pred"));
-    if (!cross_entropy) {
-      cross_entropy = new Expr(sum(*Y[t] * log(*Yp[t]), axis=1));
-    } else {
-      *cross_entropy = *cross_entropy + sum(*Y[t] * log(*Yp[t]), axis=1);
-    }
-  }
-  auto graph = -mean(*cross_entropy, axis=0, name="cost");
+  Yp.emplace_back(softmax_fast(dot(H[0], Why) + by));
+  Expr cross_entropy = sum(Y[0] * log(Yp[0]), axis=1);
+  for (int t = 1; t < num_inputs; ++t) {
+    Yp.emplace_back(softmax_fast(dot(H[t], Why) + by));
+    cross_entropy = cross_entropy + sum(Y[t] * log(Yp[t]), axis=1);
+  }
+  auto graph = -mean(cross_entropy, axis=0, name="cost");

   for (int t = 0; t < num_inputs; ++t) {
     Tensor Xt({batch_size, input_size});
@@ -72,17 +87,17 @@ int main(int argc, char** argv) {
     thrust::copy(values.begin(), values.end(), Xt.begin());
     thrust::copy(classes.begin(), classes.end(), Yt.begin());

-    *X[t] = Xt;
-    *Y[t] = Yt;
+    X[t] = Xt;
+    Y[t] = Yt;
   }

-  graph.forward(batch_size);
-  graph.backward();
+  g.forward(batch_size);
+  g.backward();

   std::cerr << graph.val().Debug() << std::endl;

-  std::cerr << X[0]->val().Debug() << std::endl;
-  std::cerr << Y[0]->val().Debug() << std::endl;
+  std::cerr << X[0].val().Debug() << std::endl;
+  std::cerr << Y[0].val().Debug() << std::endl;

   std::cerr << Whh.grad().Debug() << std::endl;
   std::cerr << bh.grad().Debug() << std::endl;
@@ -16,22 +16,24 @@ int main(int argc, char** argv) {
   using namespace marian;
   using namespace keywords;

-  Expr x = input(shape={whatevs, IMAGE_SIZE}, name="X");
-  Expr y = input(shape={whatevs, LABEL_SIZE}, name="Y");
+  ExpressionGraph g;

-  Expr w = param(shape={IMAGE_SIZE, LABEL_SIZE}, name="W0");
-  Expr b = param(shape={1, LABEL_SIZE}, name="b0");
+  Expr x = named(g.input(shape={whatevs, IMAGE_SIZE}), "x");
+  Expr y = named(g.input(shape={whatevs, LABEL_SIZE}), "y");

-  std::vector<Expr*> params;
-  params.push_back(&w);
-  params.push_back(&b);
+  Expr w = named(g.param(shape={IMAGE_SIZE, LABEL_SIZE}), "w");
+  Expr b = named(g.param(shape={1, LABEL_SIZE}), "b");

   auto scores = dot(x, w) + b;
-  auto lr = softmax_fast(scores, axis=1, name="pred");
-  auto cost = -mean(sum(y * log(lr), axis=1), axis=0, name="cost");
+  auto lr = softmax_fast(scores);
+  auto cost = named(-mean(sum(y * log(lr), axis=1), axis=0), "cost");
   cerr << "lr=" << lr.Debug() << endl;

-  SGD opt(cost, x, y, params, 0.9, trainImages, IMAGE_SIZE, trainLabels, LABEL_SIZE, 3, 24);
+  SGD opt(g, 0.9, trainImages, IMAGE_SIZE, trainLabels, LABEL_SIZE, 3, 24);
   opt.Run();
   return 0;
 }
@@ -2,24 +2,15 @@
 #include "marian.h"
 #include "mnist.h"
 #include "npz_converter.h"
+#include "param_initializers.h"

 using namespace marian;
 using namespace keywords;

-int main(int argc, char** argv) {
-
-  cudaSetDevice(1);
-
-  const size_t IMAGE_SIZE = 784;
-  const size_t LABEL_SIZE = 10;
-  int BATCH_SIZE = 10000;
-
-  std::cerr << "Loading test set...";
-  std::vector<float> testImages = datasets::mnist::ReadImages("../examples/mnist/t10k-images-idx3-ubyte", BATCH_SIZE, IMAGE_SIZE);
-  std::vector<float> testLabels = datasets::mnist::ReadLabels("../examples/mnist/t10k-labels-idx1-ubyte", BATCH_SIZE, LABEL_SIZE);
-  std::cerr << "Done." << std::endl;
+const size_t IMAGE_SIZE = 784;
+const size_t LABEL_SIZE = 10;
+int BATCH_SIZE = 10000;

+ExpressionGraph build_graph() {
   std::cerr << "Loading model params...";
   NpzConverter converter("../scripts/test_model_single/model.npz");

@@ -31,29 +22,50 @@

   std::cerr << "Building model...";

-  auto x = input(shape={whatevs, IMAGE_SIZE});
-  auto y = input(shape={whatevs, LABEL_SIZE});
+  ExpressionGraph g;
+  auto x = named(g.input(shape={whatevs, IMAGE_SIZE}), "x");
+  auto y = named(g.input(shape={whatevs, LABEL_SIZE}), "y");

-  auto w = param(shape={IMAGE_SIZE, LABEL_SIZE},
-                 init=from_vector(wData));
-  auto b = param(shape={1, LABEL_SIZE},
-                 init=from_vector(bData));
+  auto w = named(g.param(shape={IMAGE_SIZE, LABEL_SIZE},
+                         init=from_vector(wData)), "w");
+  auto b = named(g.param(shape={1, LABEL_SIZE},
+                         init=from_vector(bData)), "b");

-  auto probs = softmax_fast(dot(x, w) + b, axis=1);
-  auto cost = -mean(sum(y * log(probs), axis=1), axis=0);
+  auto probs = named(
+    softmax_fast(dot(x, w) + b), //, axis=1),
+    "probs"
+  );
+
+  auto cost = named(
+    -mean(sum(y * log(probs), axis=1), axis=0),
+    "cost"
+  );
+
+  std::cerr << "Done." << std::endl;
+  return g;
+}
+
+int main(int argc, char** argv) {
+
+  cudaSetDevice(1);
+
+  std::cerr << "Loading test set...";
+  std::vector<float> testImages = datasets::mnist::ReadImages("../examples/mnist/t10k-images-idx3-ubyte", BATCH_SIZE, IMAGE_SIZE);
+  std::vector<float> testLabels = datasets::mnist::ReadLabels("../examples/mnist/t10k-labels-idx1-ubyte", BATCH_SIZE, LABEL_SIZE);
+  std::cerr << "Done." << std::endl;
+
+  ExpressionGraph g = build_graph();

   Tensor xt({BATCH_SIZE, IMAGE_SIZE});
   Tensor yt({BATCH_SIZE, LABEL_SIZE});

-  x = xt << testImages;
-  y = yt << testLabels;
+  g["x"] = (xt << testImages);
+  g["y"] = (yt << testLabels);

-  cost.forward(BATCH_SIZE);
+  g.forward(BATCH_SIZE);

   std::vector<float> results;
-  results << probs.val();
+  results << g["probs"].val();

   size_t acc = 0;
   for (size_t i = 0; i < testLabels.size(); i += LABEL_SIZE) {
@@ -65,22 +77,22 @@ int main(int argc, char** argv) {
     }
     acc += (correct == proposed);
   }
-  std::cerr << "Cost: " << cost.val()[0] << " - Accuracy: " << float(acc) / BATCH_SIZE << std::endl;
+  std::cerr << "Cost: " << g["cost"].val()[0] << " - Accuracy: " << float(acc) / BATCH_SIZE << std::endl;

   float eta = 0.1;
   for (size_t j = 0; j < 10; ++j) {
     for(size_t i = 0; i < 60; ++i) {
-      cost.backward();
+      g.backward();

       auto update_rule = _1 -= eta * _2;
-      Element(update_rule, w.val(), w.grad());
-      Element(update_rule, b.val(), b.grad());
+      for(auto param : g.params())
+        Element(update_rule, param.val(), param.grad());

-      cost.forward(BATCH_SIZE);
+      g.forward(BATCH_SIZE);
     }
     std::cerr << "Epoch: " << j << std::endl;
     std::vector<float> results;
-    results << probs.val();
+    results << g["probs"].val();

     size_t acc = 0;
     for (size_t i = 0; i < testLabels.size(); i += LABEL_SIZE) {
@@ -92,7 +104,7 @@ int main(int argc, char** argv) {
       }
       acc += (correct == proposed);
     }
-    std::cerr << "Cost: " << cost.val()[0] << " - Accuracy: " << float(acc) / BATCH_SIZE << std::endl;
+    std::cerr << "Cost: " << g["cost"].val()[0] << " - Accuracy: " << float(acc) / BATCH_SIZE << std::endl;
   }
   return 0;
 }
@@ -59,13 +59,15 @@ int main(int argc, char** argv) {
   std::cerr << "\tDone." << std::endl;

-  auto x = input(shape={whatevs, IMAGE_SIZE}, name="X");
-  auto y = input(shape={whatevs, LABEL_SIZE}, name="Y");
+  ExpressionGraph g;

-  auto w1 = param(shape={IMAGE_SIZE, 100}, name="W0", init=initW1);
-  auto b1 = param(shape={1, 100}, name="b0", init=initB1);
-  auto w2 = param(shape={100, LABEL_SIZE}, name="W1", init=initW2);
-  auto b2 = param(shape={1, LABEL_SIZE}, name="b1", init=initB2);
+  auto x = g.input(shape={whatevs, IMAGE_SIZE}, name="X");
+  auto y = g.input(shape={whatevs, LABEL_SIZE}, name="Y");
+
+  auto w1 = g.param(shape={IMAGE_SIZE, 100}, name="W0", init=initW1);
+  auto b1 = g.param(shape={1, 100}, name="b0", init=initB1);
+  auto w2 = g.param(shape={100, LABEL_SIZE}, name="W1", init=initW2);
+  auto b2 = g.param(shape={1, LABEL_SIZE}, name="b1", init=initB2);

   std::cerr << "Building model...";
   auto layer1 = tanh(dot(x, w1) + b1);
@@ -86,7 +88,7 @@ int main(int argc, char** argv) {
     xt << tmp;
     x = xt;

-    predict.forward(BATCH_SIZE);
+    g.forward(BATCH_SIZE);

     std::vector<float> results(LABEL_SIZE * BATCH_SIZE);
     results << predict.val();
@@ -113,7 +115,7 @@ int main(int argc, char** argv) {
     xt << tmp;
     x = xt;

-    predict.forward(endId - startId);
+    g.forward(endId - startId);

     std::vector<float> results(LABEL_SIZE * BATCH_SIZE);
     results << predict.val();
src/vocab.cpp (new file, 53 lines)

#include "vocab.h"

using namespace std;

////////////////////////////////////////////////////////
inline std::vector<std::string> Tokenize(const std::string& str,
                                         const std::string& delimiters = " \t")
{
  std::vector<std::string> tokens;
  // Skip delimiters at beginning.
  std::string::size_type lastPos = str.find_first_not_of(delimiters, 0);
  // Find first "non-delimiter".
  std::string::size_type pos = str.find_first_of(delimiters, lastPos);

  while (std::string::npos != pos || std::string::npos != lastPos) {
    // Found a token, add it to the vector.
    tokens.push_back(str.substr(lastPos, pos - lastPos));
    // Skip delimiters. Note the "not_of".
    lastPos = str.find_first_not_of(delimiters, pos);
    // Find next "non-delimiter".
    pos = str.find_first_of(delimiters, lastPos);
  }

  return tokens;
}
////////////////////////////////////////////////////////

size_t Vocab::GetOrCreate(const std::string& word)
{
  size_t id;
  Coll::const_iterator iter = coll_.find(word);
  if (iter == coll_.end()) {
    id = coll_.size();
    coll_[word] = id;
  }
  else {
    id = iter->second;
  }
  return id;
}

std::vector<size_t> Vocab::ProcessSentence(const std::string& sentence)
{
  vector<string> toks = Tokenize(sentence);
  vector<size_t> ret(toks.size());

  for (size_t i = 0; i < toks.size(); ++i) {
    size_t id = GetOrCreate(toks[i]);
    ret[i] = id;
  }

  return ret;
}
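Usage of the new Vocab is straightforward; assuming vocab.h and vocab.cpp above are compiled into the program, this prints dense first-seen word ids:

#include <cstdio>
#include "vocab.h"   // the header shown below

int main() {
  Vocab v;
  // Words get ids in first-seen order; repeats reuse their id.
  std::vector<size_t> ids = v.ProcessSentence("the cat sat on the mat");
  for (size_t id : ids) std::printf("%zu ", id);   // 0 1 2 3 0 4
  std::printf("\n");
  return 0;
}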
src/vocab.h (new file, 17 lines)

#pragma once

#include <unordered_map>
#include <string>
#include <vector>

class Vocab
{
public:
  size_t GetOrCreate(const std::string& word);
  std::vector<size_t> ProcessSentence(const std::string& sentence);

protected:
  typedef std::unordered_map<std::string, size_t> Coll;
  Coll coll_;
};