diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 1ad22e9d..c772e360 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -5,6 +5,7 @@ cuda_add_library(marian_lib
   cnpy/cnpy.cpp
   exception.cpp
   expression_graph.cu
+  sgd.cu
   tensor.cu
   tensor_operators.cu
   expression_operators.cu
diff --git a/src/sgd.cu b/src/sgd.cu
index 0a276835..5fe69138 100644
--- a/src/sgd.cu
+++ b/src/sgd.cu
@@ -7,15 +7,11 @@ using namespace std;
 
 namespace marian {
 
-SGD::SGD(Expr& cost_func, Expr& inX, Expr& inY,
-         const std::vector<Expr*> params, float eta,
+SGD::SGD(ExpressionGraph& g, float eta,
          std::vector<float>& xData, size_t numFeatures,
          std::vector<float>& yData, size_t numClasses,
          size_t epochs, size_t batchSize)
-: cost_function_(&cost_func),
-  inX_(&inX),
-  inY_(&inY),
-  params_(params),
+: graph_(g),
   eta_(eta),
   xData_(xData),
   numFeatures_(numFeatures),
@@ -45,11 +41,11 @@ void SGD::Run()
       size_t endId = startId + batchSize;
 
       PrepareBatch(startId, endId, batchSize, shuffle, xt, yt);
-      *inX_ = xt;
-      *inY_ = yt;
+      graph_["x"] = xt;
+      graph_["y"] = yt;
 
-      cost_function_->forward(maxBatchSize_);
-      cost_function_->backward();
+      graph_.forward(maxBatchSize_);
+      graph_.backward();
 
       UpdateModel();
@@ -136,9 +132,9 @@ void SGD::PrepareBatch(
 }
 
 void SGD::UpdateModel() {
-  for (auto& param : params_) {
+  for (auto& param : graph_.params()) {
     using namespace thrust::placeholders;
-    Element(_1 = _1 - eta_ * _2, param->val(), param->grad());
+    Element(_1 -= eta_ * _2, param.val(), param.grad());
   }
 }
 
diff --git a/src/sgd.h b/src/sgd.h
index 33364049..a99acd75 100644
--- a/src/sgd.h
+++ b/src/sgd.h
@@ -3,7 +3,7 @@
 #include
 #include
-#include "expressions.h"
+#include "expression_graph.h"
 #include "thrust_functions.h"
 #include "tensor_operators.h"
 
 namespace marian {
@@ -11,8 +11,7 @@ namespace marian {
 
 class SGD {
   public:
-    SGD(Expr& cost_func, Expr& inX, Expr& inY,
-        const std::vector<Expr*> params, float eta,
+    SGD(ExpressionGraph& g, float eta,
         std::vector<float>& xData, size_t numFeatures,
         std::vector<float>& yData, size_t numClasses,
         size_t epochs, size_t batchSize);
@@ -20,10 +19,7 @@ class SGD {
     void Run();
 
   private:
-    Expr *cost_function_;
-    Expr *inX_;
-    Expr *inY_;
-    std::vector<Expr*> params_;
+    ExpressionGraph& graph_;
     const float eta_;
     std::vector<float>& xData_;
     const size_t numFeatures_;
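With the parameters now coming from graph_.params(), UpdateModel() is a plain in-place SGD step: every parameter tensor is updated as w := w - eta * dL/dw, and the placeholder expression _1 -= eta_ * _2 is just a compound-assignment spelling of the older _1 = _1 - eta_ * _2. As a rough sketch of what that element-wise step does, here is the same update written against bare thrust::device_vectors; the function name and arguments are illustrative stand-ins, not Marian's Tensor/Element API:

    #include <thrust/device_vector.h>
    #include <thrust/functional.h>
    #include <thrust/transform.h>

    // Sketch only: the element-wise SGD step w := w - eta * dL/dw,
    // written with Thrust placeholder expressions over flat device vectors.
    void sgd_step(thrust::device_vector<float>& value,        // parameter values
                  const thrust::device_vector<float>& grad,   // gradients
                  float eta) {
      using namespace thrust::placeholders;
      thrust::transform(value.begin(), value.end(),  // input 1: current weights
                        grad.begin(),                // input 2: gradients
                        value.begin(),               // output: overwrite in place
                        _1 - eta * _2);              // new_w = w - eta * g
    }

Marian's Element(...) call in the hunk above applies the equivalent functor directly to each parameter's value and gradient tensors on the device.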
diff --git a/src/test.cu b/src/test.cu
index 4f63def6..d27591be 100644
--- a/src/test.cu
+++ b/src/test.cu
@@ -14,42 +14,41 @@ int main(int argc, char** argv) {
   int hidden_size = 5;
   int num_inputs = 8;
 
-  std::vector<Expr*> X(num_inputs);
-  std::vector<Expr*> Y(num_inputs);
-  std::vector<Expr*> H(num_inputs);
+  std::vector<Expr> X;
+  std::vector<Expr> Y;
+  std::vector<Expr> H;
+
+  ExpressionGraph g;
 
   for (int t = 0; t < num_inputs; ++t) {
-    X[t] = new Expr(input(shape={batch_size, input_size}));
-    Y[t] = new Expr(input(shape={batch_size, output_size}));
+    X.emplace_back(g.input(shape={batch_size, input_size}));
+    Y.emplace_back(g.input(shape={batch_size, output_size}));
   }
 
-  Expr Wxh = param(shape={input_size, hidden_size}, init=uniform(), name="Wxh");
-  Expr Whh = param(shape={hidden_size, hidden_size}, init=uniform(), name="Whh");
-  Expr bh = param(shape={1, hidden_size}, init=uniform(), name="bh");
-  Expr h0 = param(shape={1, hidden_size}, init=uniform(), name="h0");
+  Expr Wxh = g.param(shape={input_size, hidden_size}, init=uniform(), name="Wxh");
+  Expr Whh = g.param(shape={hidden_size, hidden_size}, init=uniform(), name="Whh");
+  Expr bh = g.param(shape={1, hidden_size}, init=uniform(), name="bh");
+  Expr h0 = g.param(shape={1, hidden_size}, init=uniform(), name="h0");
 
   std::cerr << "Building RNN..." << std::endl;
-  H[0] = new Expr(tanh(dot(*X[0], Wxh) + dot(h0, Whh) + bh));
+  H.emplace_back(tanh(dot(X[0], Wxh) + dot(h0, Whh) + bh));
   for (int t = 1; t < num_inputs; ++t) {
-    H[t] = new Expr(tanh(dot(*X[t], Wxh) + dot(*H[t-1], Whh) + bh));
+    H.emplace_back(tanh(dot(X[t], Wxh) + dot(H[t-1], Whh) + bh));
   }
 
-  Expr Why = param(shape={hidden_size, output_size}, init=uniform(), name="Why");
-  Expr by = param(shape={1, output_size}, init=uniform(), name="by");
+  Expr Why = g.param(shape={hidden_size, output_size}, init=uniform(), name="Why");
+  Expr by = g.param(shape={1, output_size}, init=uniform(), name="by");
 
   std::cerr << "Building output layer..." << std::endl;
-  std::vector<Expr*> Yp(num_inputs);
+  std::vector<Expr> Yp;
 
-  Expr* cross_entropy = NULL;
-  for (int t = 0; t < num_inputs; ++t) {
-    Yp[t] = new Expr(softmax_fast(dot(*H[t], Why) + by, name="pred"));
-    if (!cross_entropy) {
-      cross_entropy = new Expr(sum(*Y[t] * log(*Yp[t]), axis=1));
-    } else {
-      *cross_entropy = *cross_entropy + sum(*Y[t] * log(*Yp[t]), axis=1);
-    }
+  Yp.emplace_back(softmax_fast(dot(H[0], Why) + by));
+  Expr cross_entropy = sum(Y[0] * log(Yp[0]), axis=1);
+  for (int t = 1; t < num_inputs; ++t) {
+    Yp.emplace_back(softmax_fast(dot(H[t], Why) + by));
+    cross_entropy = cross_entropy + sum(Y[t] * log(Yp[t]), axis=1);
   }
-  auto graph = -mean(*cross_entropy, axis=0, name="cost");
+  auto graph = -mean(cross_entropy, axis=0, name="cost");
 
   for (int t = 0; t < num_inputs; ++t) {
     Tensor Xt({batch_size, input_size});
@@ -72,17 +71,17 @@ int main(int argc, char** argv) {
 
     thrust::copy(values.begin(), values.end(), Xt.begin());
     thrust::copy(classes.begin(), classes.end(), Yt.begin());
-    *X[t] = Xt;
-    *Y[t] = Yt;
+    X[t] = Xt;
+    Y[t] = Yt;
   }
 
-  graph.forward(batch_size);
-  graph.backward();
+  g.forward(batch_size);
+  g.backward();
 
   std::cerr << graph.val().Debug() << std::endl;
 
-  std::cerr << X[0]->val().Debug() << std::endl;
-  std::cerr << Y[0]->val().Debug() << std::endl;
+  std::cerr << X[0].val().Debug() << std::endl;
+  std::cerr << Y[0].val().Debug() << std::endl;
 
   std::cerr << Whh.grad().Debug() << std::endl;
   std::cerr << bh.grad().Debug() << std::endl;
diff --git a/src/train_mnist.cu b/src/train_mnist.cu
index aa21597a..64ccf564 100644
--- a/src/train_mnist.cu
+++ b/src/train_mnist.cu
@@ -16,22 +16,24 @@ int main(int argc, char** argv) {
   using namespace marian;
   using namespace keywords;
 
-  Expr x = input(shape={whatevs, IMAGE_SIZE}, name="X");
-  Expr y = input(shape={whatevs, LABEL_SIZE}, name="Y");
+  ExpressionGraph g;
+
+  Expr x = named(g.input(shape={whatevs, IMAGE_SIZE}), "x");
+  Expr y = named(g.input(shape={whatevs, LABEL_SIZE}), "y");
 
-  Expr w = param(shape={IMAGE_SIZE, LABEL_SIZE}, name="W0");
-  Expr b = param(shape={1, LABEL_SIZE}, name="b0");
+  Expr w = named(g.param(shape={IMAGE_SIZE, LABEL_SIZE}), "w");
+  Expr b = named(g.param(shape={1, LABEL_SIZE}), "b");
 
   std::vector<Expr*> params;
   params.push_back(&w);
   params.push_back(&b);
 
   auto scores = dot(x, w) + b;
-  auto lr = softmax_fast(scores, axis=1, name="pred");
-  auto cost = -mean(sum(y * log(lr), axis=1), axis=0, name="cost");
+  auto lr = softmax_fast(scores);
+  auto cost = named(-mean(sum(y * log(lr), axis=1), axis=0), "cost");
 
   cerr << "lr=" << lr.Debug() << endl;
-  SGD opt(cost, x, y, params, 0.9, trainImages, IMAGE_SIZE, trainLabels, LABEL_SIZE, 3, 24);
+  SGD opt(g, 0.9, trainImages, IMAGE_SIZE, trainLabels, LABEL_SIZE, 3, 24);
   opt.Run();
   return 0;
 }
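Both drivers above register the nodes the trainer has to touch under fixed string names: named(g.input(...), "x"), named(g.input(...), "y") and named(..., "cost"). That registry is what lets SGD::Run() re-bind every mini-batch through graph_["x"] = xt and graph_["y"] = yt without storing Expr pointers of its own. A toy sketch of that contract follows; the types are made up, not Marian's, and it assumes nothing more than a string-keyed map inside the graph:

    #include <map>
    #include <string>
    #include <vector>

    // Toy types, not Marian's: just enough structure to show the contract
    // that graph_["x"] = xt relies on inside SGD::Run().
    struct ToyNode {
      std::vector<float> value;   // stand-in for the node's Tensor
    };

    struct ToyGraph {
      // named(expr, "x") presumably files the node under a string key;
      // the trainer then re-binds fresh batch data through that key.
      std::map<std::string, ToyNode> named;
      ToyNode& operator[](const std::string& key) { return named[key]; }
    };

    // Per-batch binding, analogous to graph_["x"] = xt.
    void bind_batch(ToyGraph& g, const std::vector<float>& batch) {
      g["x"].value = batch;
    }

The practical upshot is that a training program only hands the graph and the hyper-parameters to the optimizer, as in SGD opt(g, 0.9, ...), but its inputs must be registered under exactly the names "x" and "y" that the trainer looks up.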
diff --git a/src/validate_mnist_batch.cu b/src/validate_mnist_batch.cu
index 1c66198a..50ab97b5 100644
--- a/src/validate_mnist_batch.cu
+++ b/src/validate_mnist_batch.cu
@@ -59,13 +59,15 @@ int main(int argc, char** argv) {
 
   std::cerr << "\tDone." << std::endl;
 
-  auto x = input(shape={whatevs, IMAGE_SIZE}, name="X");
-  auto y = input(shape={whatevs, LABEL_SIZE}, name="Y");
+  ExpressionGraph g;
 
-  auto w1 = param(shape={IMAGE_SIZE, 100}, name="W0", init=initW1);
-  auto b1 = param(shape={1, 100}, name="b0", init=initB1);
-  auto w2 = param(shape={100, LABEL_SIZE}, name="W1", init=initW2);
-  auto b2 = param(shape={1, LABEL_SIZE}, name="b1", init=initB2);
+  auto x = g.input(shape={whatevs, IMAGE_SIZE}, name="X");
+  auto y = g.input(shape={whatevs, LABEL_SIZE}, name="Y");
+
+  auto w1 = g.param(shape={IMAGE_SIZE, 100}, name="W0", init=initW1);
+  auto b1 = g.param(shape={1, 100}, name="b0", init=initB1);
+  auto w2 = g.param(shape={100, LABEL_SIZE}, name="W1", init=initW2);
+  auto b2 = g.param(shape={1, LABEL_SIZE}, name="b1", init=initB2);
 
   std::cerr << "Building model...";
   auto layer1 = tanh(dot(x, w1) + b1);
@@ -86,7 +88,7 @@ int main(int argc, char** argv) {
 
    xt << tmp;
    x = xt;
 
-    predict.forward(BATCH_SIZE);
+    g.forward(BATCH_SIZE);
 
    std::vector<float> results(LABEL_SIZE * BATCH_SIZE);
    results << predict.val();
@@ -113,7 +115,7 @@ int main(int argc, char** argv) {
 
    xt << tmp;
    x = xt;
 
-    predict.forward(endId - startId);
+    g.forward(endId - startId);
 
    std::vector<float> results(LABEL_SIZE * BATCH_SIZE);
    results << predict.val();
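The validation driver keeps its own batching loop and now only runs the forward pass through g.forward(...). After each pass the class scores for the batch sit in the flat results buffer of LABEL_SIZE * BATCH_SIZE floats. Assuming the usual row-major layout, one LABEL_SIZE-wide row per example (the patch sizes the buffer that way but does not spell the layout out), a hypothetical helper for turning that buffer into predicted label ids could look like this:

    #include <cstddef>
    #include <vector>

    // Hypothetical helper: interpret the flat buffer as num_rows rows of
    // row_size scores each and return the index of the best score per row.
    std::vector<std::size_t> argmax_rows(const std::vector<float>& results,
                                         std::size_t num_rows,
                                         std::size_t row_size) {
      std::vector<std::size_t> best(num_rows, 0);
      for (std::size_t r = 0; r < num_rows; ++r) {
        const float* row = results.data() + r * row_size;
        for (std::size_t c = 1; c < row_size; ++c)
          if (row[c] > row[best[r]]) best[r] = c;
      }
      return best;
    }

A call such as argmax_rows(results, BATCH_SIZE, LABEL_SIZE) after g.forward(BATCH_SIZE) would yield one predicted class per example, which can then be compared against the gold labels.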