diff --git a/src/expressions.cu b/src/expressions.cu
index a95b1bef..b2ff90ba 100644
--- a/src/expressions.cu
+++ b/src/expressions.cu
@@ -24,22 +24,17 @@ ChainPtr Expr::node() {
 
 void Expr::forward(size_t batchSize) {
   UTIL_THROW_IF2(pimpl_.get() != Chainable<Tensor>::stack.back(),
-                 "Trying to call forward on non-root of computation graph");
-  std::cerr << "forward:" << std::endl;
-
+                 "Trying to call forward on non-root of computation graph");
   for(auto&& v : Chainable<Tensor>::stack) {
     v->allocate(batchSize);
   }
-
   for(auto&& v : Chainable<Tensor>::stack)
     v->forward();
 }
 
 void Expr::backward() {
   UTIL_THROW_IF2(pimpl_.get() != Chainable<Tensor>::stack.back(),
-                 "Trying to call backward on non-root of computation graph");
-  std::cerr << "backward:" << std::endl;
-
+                 "Trying to call backward on non-root of computation graph");
   for(auto&& v : Chainable<Tensor>::stack)
     v->set_zero_adjoint();
 
@@ -56,7 +51,6 @@ Expr::operator ChainPtr() {
 
 std::string Expr::Debug() const {
   stringstream strm;
-  //const Chainable<Tensor> &ct = *pimpl_;
   const Shape &shape = pimpl_->shape();
   strm << marian::Debug(shape);
   return strm.str();
diff --git a/src/graph_operators.h b/src/graph_operators.h
index bf5a3336..2e016cac 100644
--- a/src/graph_operators.h
+++ b/src/graph_operators.h
@@ -42,7 +42,8 @@ struct ParamNode : public Node {
   template <typename ...Args>
   ParamNode(Args ...args)
   : Node(args...),
-    init_(Get<std::function<void(Tensor)>>(keywords::init, [](Tensor){ }))
+    init_(Get<std::function<void(Tensor)>>(keywords::init, [](Tensor){ })),
+    initialized_(false)
   {
     UTIL_THROW_IF2(!Has(keywords::shape) &&
                    !Has(keywords::lazy_shape),
@@ -51,14 +52,18 @@ struct ParamNode : public Node {
 
   void forward() {}
   void backward() {}
-  
+
   virtual void allocate(size_t batchSize) {
     val_.allocate(shape_);
-    init_(val_);
+    if(!initialized_) {
+      init_(val_);
+      initialized_ = true;
+    }
   }
 
   private:
     std::function<void(Tensor)> init_;
+    bool initialized_;
 };
 
 struct UnaryNodeOp : public Node {
@@ -139,6 +144,7 @@ struct SoftmaxNodeOp : public UnaryNodeOp {
   SoftmaxNodeOp(ChainPtr a, Args ...args)
   : UnaryNodeOp(a, keywords::shape=newShape(a), args...) { }
+
   Shape newShape(ChainPtr a) {
     Shape shape = a->shape();
     return shape;
   }
@@ -164,8 +170,8 @@
 
 struct LogNodeOp : public UnaryNodeOp {
   template <typename ...Args>
-  LogNodeOp(Args ...args)
-  : UnaryNodeOp(args...) {}
+  LogNodeOp(ChainPtr a, Args ...args)
+  : UnaryNodeOp(a, keywords::shape=a->shape(), args...) {}
 
   void forward() {
     Element(_1 = Log(_2), val_, a_->val());
@@ -180,14 +186,9 @@
 struct ExpNodeOp : public UnaryNodeOp {
   template <typename ...Args>
   ExpNodeOp(ChainPtr a, Args ...args)
-  : UnaryNodeOp(a, keywords::shape=newShape(a),
+  : UnaryNodeOp(a, keywords::shape=a->shape(),
                 args...) { }
 
-  Shape newShape(ChainPtr a) {
-    Shape shape = a->shape();
-    return shape;
-  }
-
   void forward() {
     Element(_1 = Exp(_2), val_, a_->val());
   }
diff --git a/src/validate_mnist.cu b/src/validate_mnist.cu
index 6342a822..c31dd85a 100644
--- a/src/validate_mnist.cu
+++ b/src/validate_mnist.cu
@@ -8,7 +8,7 @@ using namespace keywords;
 
 int main(int argc, char** argv) {
-  cudaSetDevice(0);
+  cudaSetDevice(1);
 
   const size_t IMAGE_SIZE = 784;
   const size_t LABEL_SIZE = 10;
 
@@ -30,18 +30,22 @@ int main(int argc, char** argv) {
   std::cerr << "Done."
             << std::endl;
 
   std::cerr << "Building model...";
-  auto x = input(shape={whatevs, IMAGE_SIZE}, name="X");
-  auto y = input(shape={whatevs, LABEL_SIZE}, name="Y");
-  auto w = param(shape={IMAGE_SIZE, LABEL_SIZE}, name="W0",
+  auto x = input(shape={whatevs, IMAGE_SIZE});
+  auto y = input(shape={whatevs, LABEL_SIZE});
+
+  auto w = param(shape={IMAGE_SIZE, LABEL_SIZE},
                  init=[wData](Tensor t) { t.set(wData); });
-  auto b = param(shape={1, LABEL_SIZE}, name="b0",
-                 init=[bData](Tensor t) {t.set(bData); });
+  auto b = param(shape={1, LABEL_SIZE},
+                 init=[bData](Tensor t) { t.set(bData); });
 
-  auto predict = softmax(dot(x, w) + b,
-                         axis=1, name="pred");
-  auto graph = -mean(sum(y * log(predict), axis=1),
-                     axis=0, name="cost");
+  auto zd = dot(x, w);
+  auto z = zd + b;
+  auto predict = softmax(z, axis=1);
+  auto logp = log(predict);
+  auto cost = sum(y * logp, axis=1);
+  auto graph = -mean(cost, axis=0);
+
   std::cerr << "Done." << std::endl;
 
   Tensor xt({BATCH_SIZE, IMAGE_SIZE});
@@ -51,14 +55,20 @@ int main(int argc, char** argv) {
   y = yt << testLabels;
 
   graph.forward(BATCH_SIZE);
-  graph.backward();
+  for(size_t i = 0; i < 1000; ++i) {
+    graph.backward();
+
+    auto update_rule = _1 -= 0.1 * _2;
+    Element(update_rule, w.val(), w.grad());
+    Element(update_rule, b.val(), b.grad());
+
+    graph.forward(BATCH_SIZE);
+  }
 
   auto results = predict.val();
   std::vector<float> resultsv(results.size());
   resultsv << results;
 
-  std::cerr << b.grad().Debug() << std::endl;
-
   size_t acc = 0;
   for (size_t i = 0; i < testLabels.size(); i += LABEL_SIZE) {
     size_t correct = 0;
@@ -69,7 +79,7 @@ int main(int argc, char** argv) {
     }
     acc += (correct == predicted);
   }
-  std::cerr << "Accuracy: " << float(acc)/BATCH_SIZE << std::endl;
+  std::cerr << "Accuracy: " << float(acc) / BATCH_SIZE << std::endl;
 
   return 0;
 }
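
Two notes on what this change does.

First, the decomposed graph in validate_mnist.cu is softmax regression with a mean cross-entropy objective. With B = BATCH_SIZE examples and one-hot labels y, the chain zd, z, predict, logp, cost, graph computes

    graph = -(1/B) * sum_i sum_j y[i][j] * log(softmax(x[i]*W + b)[j])

which is the same objective the replaced one-liner -mean(sum(y * log(predict), axis=1), axis=0) expressed; the split merely gives each intermediate its own named node.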
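Second, the new training loop is plain gradient descent: after each backward pass, Element(_1 -= 0.1 * _2, w.val(), w.grad()) updates every element of a parameter tensor in place with learning rate 0.1. As a sketch of the elementwise kernel such an Element call amounts to, assuming raw float* device pointers (the names sgd_update, lr, and n are illustrative, not part of this codebase):

    // Sketch only: one thread per element computes param[i] -= lr * grad[i],
    // mirroring Element(_1 -= 0.1 * _2, w.val(), w.grad()) with lr = 0.1f.
    __global__ void sgd_update(float* param, const float* grad, float lr, int n) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n)
        param[i] -= lr * grad[i];
    }

A launch such as sgd_update<<<(n + 255) / 256, 256>>>(w_ptr, g_ptr, 0.1f, n) would cover all n elements of W; the loop then reruns graph.forward(BATCH_SIZE) so the next backward pass sees the updated parameters.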