diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 1ad22e9d..c772e360 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -5,6 +5,7 @@ cuda_add_library(marian_lib
   cnpy/cnpy.cpp
   exception.cpp
   expression_graph.cu
+  sgd.cu
   tensor.cu
   tensor_operators.cu
   expression_operators.cu
diff --git a/src/sgd.cu b/src/sgd.cu
index 0a276835..5fe69138 100644
--- a/src/sgd.cu
+++ b/src/sgd.cu
@@ -7,15 +7,11 @@ using namespace std;
 
 namespace marian {
 
-SGD::SGD(Expr& cost_func, Expr& inX, Expr& inY,
-         const std::vector<Expr*> params, float eta,
+SGD::SGD(ExpressionGraph& g, float eta,
          std::vector<float>& xData, size_t numFeatures,
          std::vector<float>& yData, size_t numClasses,
          size_t epochs, size_t batchSize)
-: cost_function_(&cost_func),
-  inX_(&inX),
-  inY_(&inY),
-  params_(params),
+: graph_(g),
   eta_(eta),
   xData_(xData),
   numFeatures_(numFeatures),
@@ -45,11 +41,11 @@ void SGD::Run()
       size_t endId = startId + batchSize;
 
       PrepareBatch(startId, endId, batchSize, shuffle, xt, yt);
-      *inX_ = xt;
-      *inY_ = yt;
+      graph_["x"] = xt;
+      graph_["y"] = yt;
 
-      cost_function_->forward(maxBatchSize_);
-      cost_function_->backward();
+      graph_.forward(maxBatchSize_);
+      graph_.backward();
 
       UpdateModel();
@@ -136,9 +132,9 @@ void SGD::PrepareBatch(
 }
 
 void SGD::UpdateModel() {
-  for (auto& param : params_) {
+  for (auto& param : graph_.params()) {
     using namespace thrust::placeholders;
-    Element(_1 = _1 - eta_ * _2, param->val(), param->grad());
+    Element(_1 -= eta_ * _2, param.val(), param.grad());
   }
 }
 
diff --git a/src/sgd.h b/src/sgd.h
index 33364049..a99acd75 100644
--- a/src/sgd.h
+++ b/src/sgd.h
@@ -3,7 +3,7 @@
 #include
 #include
-#include "expressions.h"
+#include "expression_graph.h"
 #include "thrust_functions.h"
 #include "tensor_operators.h"
 
 namespace marian {
@@ -11,8 +11,7 @@ namespace marian {
 
 class SGD {
   public:
-    SGD(Expr& cost_func, Expr& inX, Expr& inY,
-        const std::vector<Expr*> params, float eta,
+    SGD(ExpressionGraph& g, float eta,
         std::vector<float>& xData, size_t numFeatures,
         std::vector<float>& yData, size_t numClasses,
         size_t epochs, size_t batchSize);
@@ -20,10 +19,7 @@ class SGD {
     void Run();
 
   private:
-    Expr *cost_function_;
-    Expr *inX_;
-    Expr *inY_;
-    std::vector<Expr*> params_;
+    ExpressionGraph& graph_;
     const float eta_;
     std::vector<float>& xData_;
     const size_t numFeatures_;
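With the parameters now coming from graph_.params(), UpdateModel() is a plain in-place SGD step: every parameter tensor is updated as w := w - eta * dL/dw, and the placeholder expression _1 -= eta_ * _2 is just a compound-assignment spelling of the older _1 = _1 - eta_ * _2. As a rough sketch of what that element-wise step does, here is the same update written against bare thrust::device_vectors; the function name and arguments are illustrative stand-ins, not Marian's Tensor/Element API:

    #include <thrust/device_vector.h>
    #include <thrust/functional.h>
    #include <thrust/transform.h>

    // Sketch only: the element-wise SGD step w := w - eta * dL/dw,
    // written with Thrust placeholder expressions over flat device vectors.
    void sgd_step(thrust::device_vector<float>& value,        // parameter values
                  const thrust::device_vector<float>& grad,   // gradients
                  float eta) {
      using namespace thrust::placeholders;
      thrust::transform(value.begin(), value.end(),  // input 1: current weights
                        grad.begin(),                // input 2: gradients
                        value.begin(),               // output: overwrite in place
                        _1 - eta * _2);              // new_w = w - eta * g
    }

Marian's Element(...) call in the hunk above applies the equivalent functor directly to each parameter's value and gradient tensors on the device.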
diff --git a/src/test.cu b/src/test.cu
index 4f63def6..d27591be 100644
--- a/src/test.cu
+++ b/src/test.cu
@@ -14,42 +14,41 @@ int main(int argc, char** argv) {
   int hidden_size = 5;
   int num_inputs = 8;
 
-  std::vector<Expr*> X(num_inputs);
-  std::vector<Expr*> Y(num_inputs);
-  std::vector<Expr*> H(num_inputs);
+  std::vector<Expr> X;
+  std::vector<Expr> Y;
+  std::vector<Expr> H;
+
+  ExpressionGraph g;
 
   for (int t = 0; t < num_inputs; ++t) {
-    X[t] = new Expr(input(shape={batch_size, input_size}));
-    Y[t] = new Expr(input(shape={batch_size, output_size}));
+    X.emplace_back(g.input(shape={batch_size, input_size}));
+    Y.emplace_back(g.input(shape={batch_size, output_size}));
   }
 
-  Expr Wxh = param(shape={input_size, hidden_size}, init=uniform(), name="Wxh");
-  Expr Whh = param(shape={hidden_size, hidden_size}, init=uniform(), name="Whh");
-  Expr bh = param(shape={1, hidden_size}, init=uniform(), name="bh");
-  Expr h0 = param(shape={1, hidden_size}, init=uniform(), name="h0");
+  Expr Wxh = g.param(shape={input_size, hidden_size}, init=uniform(), name="Wxh");
+  Expr Whh = g.param(shape={hidden_size, hidden_size}, init=uniform(), name="Whh");
+  Expr bh = g.param(shape={1, hidden_size}, init=uniform(), name="bh");
+  Expr h0 = g.param(shape={1, hidden_size}, init=uniform(), name="h0");
 
   std::cerr << "Building RNN..." << std::endl;
-  H[0] = new Expr(tanh(dot(*X[0], Wxh) + dot(h0, Whh) + bh));
+  H.emplace_back(tanh(dot(X[0], Wxh) + dot(h0, Whh) + bh));
   for (int t = 1; t < num_inputs; ++t) {
-    H[t] = new Expr(tanh(dot(*X[t], Wxh) + dot(*H[t-1], Whh) + bh));
+    H.emplace_back(tanh(dot(X[t], Wxh) + dot(H[t-1], Whh) + bh));
   }
 
-  Expr Why = param(shape={hidden_size, output_size}, init=uniform(), name="Why");
-  Expr by = param(shape={1, output_size}, init=uniform(), name="by");
+  Expr Why = g.param(shape={hidden_size, output_size}, init=uniform(), name="Why");
+  Expr by = g.param(shape={1, output_size}, init=uniform(), name="by");
 
   std::cerr << "Building output layer..." << std::endl;
-  std::vector<Expr*> Yp(num_inputs);
+  std::vector<Expr> Yp;
 
-  Expr* cross_entropy = NULL;
-  for (int t = 0; t < num_inputs; ++t) {
-    Yp[t] = new Expr(softmax_fast(dot(*H[t], Why) + by, name="pred"));
-    if (!cross_entropy) {
-      cross_entropy = new Expr(sum(*Y[t] * log(*Yp[t]), axis=1));
-    } else {
-      *cross_entropy = *cross_entropy + sum(*Y[t] * log(*Yp[t]), axis=1);
-    }
+  Yp.emplace_back(softmax_fast(dot(H[0], Why) + by));
+  Expr cross_entropy = sum(Y[0] * log(Yp[0]), axis=1);
+  for (int t = 1; t < num_inputs; ++t) {
+    Yp.emplace_back(softmax_fast(dot(H[t], Why) + by));
+    cross_entropy = cross_entropy + sum(Y[t] * log(Yp[t]), axis=1);
   }
-  auto graph = -mean(*cross_entropy, axis=0, name="cost");
+  auto graph = -mean(cross_entropy, axis=0, name="cost");
 
   for (int t = 0; t < num_inputs; ++t) {
     Tensor Xt({batch_size, input_size});
@@ -72,17 +71,17 @@ int main(int argc, char** argv) {
 
     thrust::copy(values.begin(), values.end(), Xt.begin());
     thrust::copy(classes.begin(), classes.end(), Yt.begin());
-    *X[t] = Xt;
-    *Y[t] = Yt;
+    X[t] = Xt;
+    Y[t] = Yt;
   }
 
-  graph.forward(batch_size);
-  graph.backward();
+  g.forward(batch_size);
+  g.backward();
 
   std::cerr << graph.val().Debug() << std::endl;
 
-  std::cerr << X[0]->val().Debug() << std::endl;
-  std::cerr << Y[0]->val().Debug() << std::endl;
+  std::cerr << X[0].val().Debug() << std::endl;
+  std::cerr << Y[0].val().Debug() << std::endl;
 
   std::cerr << Whh.grad().Debug() << std::endl;
   std::cerr << bh.grad().Debug() << std::endl;
diff --git a/src/train_mnist.cu b/src/train_mnist.cu
index aa21597a..64ccf564 100644
--- a/src/train_mnist.cu
+++ b/src/train_mnist.cu
@@ -16,22 +16,24 @@ int main(int argc, char** argv) {
   using namespace marian;
   using namespace keywords;
 
-  Expr x = input(shape={whatevs, IMAGE_SIZE}, name="X");
-  Expr y = input(shape={whatevs, LABEL_SIZE}, name="Y");
+  ExpressionGraph g;
+
+  Expr x = named(g.input(shape={whatevs, IMAGE_SIZE}), "x");
+  Expr y = named(g.input(shape={whatevs, LABEL_SIZE}), "y");
 
-  Expr w = param(shape={IMAGE_SIZE, LABEL_SIZE}, name="W0");
-  Expr b = param(shape={1, LABEL_SIZE}, name="b0");
+  Expr w = named(g.param(shape={IMAGE_SIZE, LABEL_SIZE}), "w");
+  Expr b = named(g.param(shape={1, LABEL_SIZE}), "b");
 
   std::vector<Expr*> params;
   params.push_back(&w);
   params.push_back(&b);
 
   auto scores = dot(x, w) + b;
-  auto lr = softmax_fast(scores, axis=1, name="pred");
-  auto cost = -mean(sum(y * log(lr), axis=1), axis=0, name="cost");
+  auto lr = softmax_fast(scores);
+  auto cost = named(-mean(sum(y * log(lr), axis=1), axis=0), "cost");
 
   cerr << "lr=" << lr.Debug() << endl;
-  SGD opt(cost, x, y, params, 0.9, trainImages, IMAGE_SIZE, trainLabels, LABEL_SIZE, 3, 24);
+  SGD opt(g, 0.9, trainImages, IMAGE_SIZE, trainLabels, LABEL_SIZE, 3, 24);
   opt.Run();
   return 0;
 }
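Both drivers above register the nodes the trainer has to touch under fixed string names: named(g.input(...), "x"), named(g.input(...), "y") and named(..., "cost"). That registry is what lets SGD::Run() re-bind every mini-batch through graph_["x"] = xt and graph_["y"] = yt without storing Expr pointers of its own. A toy sketch of that contract follows; the types are made up, not Marian's, and it assumes nothing more than a string-keyed map inside the graph:

    #include <map>
    #include <string>
    #include <vector>

    // Toy types, not Marian's: just enough structure to show the contract
    // that graph_["x"] = xt relies on inside SGD::Run().
    struct ToyNode {
      std::vector<float> value;   // stand-in for the node's Tensor
    };

    struct ToyGraph {
      // named(expr, "x") presumably files the node under a string key;
      // the trainer then re-binds fresh batch data through that key.
      std::map<std::string, ToyNode> named;
      ToyNode& operator[](const std::string& key) { return named[key]; }
    };

    // Per-batch binding, analogous to graph_["x"] = xt.
    void bind_batch(ToyGraph& g, const std::vector<float>& batch) {
      g["x"].value = batch;
    }

The practical upshot is that a training program only hands the graph and the hyper-parameters to the optimizer, as in SGD opt(g, 0.9, ...), but its inputs must be registered under exactly the names "x" and "y" that the trainer looks up.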
diff --git a/src/validate_mnist_batch.cu b/src/validate_mnist_batch.cu
index 1c66198a..50ab97b5 100644
--- a/src/validate_mnist_batch.cu
+++ b/src/validate_mnist_batch.cu
@@ -59,13 +59,15 @@ int main(int argc, char** argv) {
 
   std::cerr << "\tDone." << std::endl;
 
-  auto x = input(shape={whatevs, IMAGE_SIZE}, name="X");
-  auto y = input(shape={whatevs, LABEL_SIZE}, name="Y");
+  ExpressionGraph g;
 
-  auto w1 = param(shape={IMAGE_SIZE, 100}, name="W0", init=initW1);
-  auto b1 = param(shape={1, 100}, name="b0", init=initB1);
-  auto w2 = param(shape={100, LABEL_SIZE}, name="W1", init=initW2);
-  auto b2 = param(shape={1, LABEL_SIZE}, name="b1", init=initB2);
+  auto x = g.input(shape={whatevs, IMAGE_SIZE}, name="X");
+  auto y = g.input(shape={whatevs, LABEL_SIZE}, name="Y");
+
+  auto w1 = g.param(shape={IMAGE_SIZE, 100}, name="W0", init=initW1);
+  auto b1 = g.param(shape={1, 100}, name="b0", init=initB1);
+  auto w2 = g.param(shape={100, LABEL_SIZE}, name="W1", init=initW2);
+  auto b2 = g.param(shape={1, LABEL_SIZE}, name="b1", init=initB2);
 
   std::cerr << "Building model...";
   auto layer1 = tanh(dot(x, w1) + b1);
@@ -86,7 +88,7 @@ int main(int argc, char** argv) {
 
    xt << tmp;
    x = xt;
 
-    predict.forward(BATCH_SIZE);
+    g.forward(BATCH_SIZE);
 
    std::vector<float> results(LABEL_SIZE * BATCH_SIZE);
    results << predict.val();
@@ -113,7 +115,7 @@ int main(int argc, char** argv) {
 
    xt << tmp;
    x = xt;
 
-    predict.forward(endId - startId);
+    g.forward(endId - startId);
 
    std::vector<float> results(LABEL_SIZE * BATCH_SIZE);
    results << predict.val();
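The validation driver keeps its own batching loop and now only runs the forward pass through g.forward(...). After each pass the class scores for the batch sit in the flat results buffer of LABEL_SIZE * BATCH_SIZE floats. Assuming the usual row-major layout, one LABEL_SIZE-wide row per example (the patch sizes the buffer that way but does not spell the layout out), a hypothetical helper for turning that buffer into predicted label ids could look like this:

    #include <cstddef>
    #include <vector>

    // Hypothetical helper: interpret the flat buffer as num_rows rows of
    // row_size scores each and return the index of the best score per row.
    std::vector<std::size_t> argmax_rows(const std::vector<float>& results,
                                         std::size_t num_rows,
                                         std::size_t row_size) {
      std::vector<std::size_t> best(num_rows, 0);
      for (std::size_t r = 0; r < num_rows; ++r) {
        const float* row = results.data() + r * row_size;
        for (std::size_t c = 1; c < row_size; ++c)
          if (row[c] > row[best[r]]) best[r] = c;
      }
      return best;
    }

A call such as argmax_rows(results, BATCH_SIZE, LABEL_SIZE) after g.forward(BATCH_SIZE) would yield one predicted class per example, which can then be compared against the gold labels.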