diff --git a/CMakeLists.txt b/CMakeLists.txt index 1be00783..55448e2e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,8 +2,8 @@ cmake_minimum_required(VERSION 3.5.1) set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake) project(marian CXX) -SET(CMAKE_CXX_FLAGS " -std=c++11 -g -O0 -funroll-loops -Wno-unused-result -Wno-deprecated") -LIST(APPEND CUDA_NVCC_FLAGS --default-stream per-thread; -std=c++11; -g; -O0; -arch=sm_35; -lineinfo; --use_fast_math; -Xcompiler '-fPIC') +SET(CMAKE_CXX_FLAGS " -std=c++11 -g -O3 -funroll-loops -Wno-unused-result -Wno-deprecated") +LIST(APPEND CUDA_NVCC_FLAGS --default-stream per-thread; -std=c++11; -g; -O3; -arch=sm_35; -lineinfo; --use_fast_math; -Xcompiler '-fPIC') add_definitions(-DCUDA_API_PER_THREAD_DEFAULT_STREAM) SET(CUDA_PROPAGATE_HOST_FLAGS OFF) diff --git a/examples/mt/download.sh b/examples/mt/download.sh new file mode 100755 index 00000000..1f6dd242 --- /dev/null +++ b/examples/mt/download.sh @@ -0,0 +1,4 @@ + +wget http://data.statmt.org/wmt16/translation-task/dev.tgz +tar xvf dev.tgz + diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index a0b82725..0694509c 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -4,10 +4,12 @@ include_directories(.) cuda_add_library(marian_lib cnpy/cnpy.cpp exception.cpp - expressions.cu + expression_graph.cu sgd.cu - tensor.cu + tensor.cu tensor_operators.cu + expression_operators.cu + vocab.cpp ) target_link_libraries(marian_lib) diff --git a/src/chainable.h b/src/chainable.h new file mode 100644 index 00000000..9fe6d208 --- /dev/null +++ b/src/chainable.h @@ -0,0 +1,34 @@ +#pragma once + +#include +#include + +#include "exception.h" + +namespace marian { + +template +struct Chainable { + Chainable() { } + virtual ~Chainable() { } + virtual void forward() { } + virtual void backward() { } + virtual void init_dependent() { } + virtual void set_zero_adjoint() { } + + virtual void allocate(size_t) = 0; + + virtual const Shape& shape() = 0; + virtual DataType &val() = 0; + virtual DataType grad() = 0; + virtual void setVal(DataType t) { + UTIL_THROW2("Tensors can only be assigned to input nodes"); + }; +}; + +typedef std::vector*> ChainableStack; +typedef std::shared_ptr ChainableStackPtr; +typedef std::shared_ptr> ChainPtr; + + +} \ No newline at end of file diff --git a/src/expression_graph.cu b/src/expression_graph.cu new file mode 100644 index 00000000..61f8d2b5 --- /dev/null +++ b/src/expression_graph.cu @@ -0,0 +1,41 @@ +#include +#include "expression_graph.h" + +using namespace std; + +namespace marian { + +Expr::Expr(ExpressionGraphPtr g, Chainable* chainable) + : graph_(g), pimpl_(chainable) { + graph_->stack()->push_back(chainable); +} + +Tensor Expr::val() { + return pimpl_->val(); +} + +Tensor Expr::grad() { + return pimpl_->grad(); +} + +ChainPtr Expr::node() { + return pimpl_; +} + +ExpressionGraphPtr Expr::graph() { + return graph_; +} + +Expr::operator ChainPtr() { + return pimpl_; +} + +std::string Expr::Debug() const +{ + stringstream strm; + const Shape &shape = pimpl_->shape(); + strm << marian::Debug(shape); + return strm.str(); +} + +} diff --git a/src/expression_graph.h b/src/expression_graph.h new file mode 100644 index 00000000..f0d5f233 --- /dev/null +++ b/src/expression_graph.h @@ -0,0 +1,120 @@ +#pragma once + +#include + +#include "definitions.h" +#include "chainable.h" +#include "node_operators.h" +#include "tensor.h" + +namespace marian { + +class ExpressionGraph; +typedef ExpressionGraph* ExpressionGraphPtr; + +class Expr { + public: + Expr(ExpressionGraphPtr g, 
Chainable* chainable); + + Expr operator=(Tensor t) { + pimpl_->setVal(t); + return *this; + } + + Tensor val(); + Tensor grad(); + + ExpressionGraphPtr graph(); + + ChainPtr node(); + operator ChainPtr(); + + std::string Debug() const; + + private: + ExpressionGraphPtr graph_; + ChainPtr pimpl_; +}; + +class ExpressionGraph { + public: + ExpressionGraph() + : stack_(new ChainableStack) + {} + + void forward(size_t batchSize) { + for(auto&& v : *stack_) { + v->allocate(batchSize); + } + for(auto&& v : *stack_) + v->forward(); + } + + void backward() { + for(auto&& v : *stack_) + v->set_zero_adjoint(); + + typedef typename ChainableStack::reverse_iterator It; + stack_->back()->init_dependent(); + for(It it = stack_->rbegin(); it != stack_->rend(); ++it) + (*it)->backward(); + } + + template + inline Expr input(Args ...args) { + return Expr(this, new InputNode(args...)); + } + + template + inline Expr param(Args ...args) { + Expr e(this, new ParamNode(args...)); + params_.emplace_back(e); + return e; + } + + template + inline Expr constant(Args ...args) { + return Expr(this, new ConstantNode(args...)); + } + + template + inline Expr ones(Args ...args) { + return Expr(this, new ConstantNode(keywords::value=1, args...)); + } + + template + inline Expr zeroes(Args ...args) { + return Expr(this, new ConstantNode(keywords::value=0, args...)); + } + + /*********************************************************/ + + ChainableStackPtr stack() { + return stack_; + } + + Expr& operator[](const std::string& name) { + auto it = named_.find(name); + UTIL_THROW_IF2(it == named_.end(), "No such named node in graph: " << name); + return it->second; + } + + bool has_node(const std::string& name) const { + return named_.count(name) > 0; + } + + void add_named_node(Expr e, const std::string& name) { + named_.emplace(name, e); + } + + std::vector& params() { + return params_; + } + + private: + ChainableStackPtr stack_; + std::map named_; + std::vector params_; +}; + +} diff --git a/src/expression_operators.cu b/src/expression_operators.cu new file mode 100644 index 00000000..73c23f1d --- /dev/null +++ b/src/expression_operators.cu @@ -0,0 +1,124 @@ + +#include "expression_operators.h" +#include "node_operators.h" + +namespace marian { + +Expr named(Expr a, const std::string& name) { + a.graph()->add_named_node(a, name); + return a; +} + +Expr logit(Expr a) { + return Expr(a.graph(), new LogitNodeOp(a)); +} + +Expr tanh(Expr a) { + return Expr(a.graph(), new TanhNodeOp(a)); +} + +Expr log(Expr a) { + return Expr(a.graph(), new LogNodeOp(a)); +}; + +Expr exp(Expr a) { + return Expr(a.graph(), new ExpNodeOp(a)); +}; + +Expr operator-(Expr a) { + return Expr(a.graph(), new NegNodeOp(a)); +}; + +Expr softmax_fast(Expr a) { + return Expr(a.graph(), new SoftmaxNodeOp(a)); +} + +/*********************************************************/ + +static Shape newShape(ChainPtr a, ChainPtr b) { + size_t dimsA = a->shape().size(); + size_t dimsB = b->shape().size(); + UTIL_THROW_IF2(dimsA != dimsB, + "Tensors have different numbers of dimensions"); + Shape shape(dimsA); + for(size_t i = 0; i < dimsA; ++i) { + int dimA = a->shape()[i]; + int dimB = b->shape()[i]; + bool broadcastable = (dimA == dimB || dimA == 1 || dimB == 1); + UTIL_THROW_IF2(!broadcastable, "Different dimensions in elementwise " + << "operation cannot be broadcasted: " << dimA << " != " << dimB); + shape[i] = std::max(dimA, dimB); + if(dimA == whatevs || dimB == whatevs) + shape[i] = whatevs; + } + return shape; +} + +Expr broadcast(Shape bShape, Expr a) { + 
const Shape& aShape = a.node()->shape(); + if(aShape == bShape) { + return a; + } + else { + size_t dimsA = aShape.size(); + size_t dimsB = bShape.size(); + UTIL_THROW_IF2(dimsA != dimsB, + "Tensor and shape have different number of dimensions"); + for(size_t i = 0; i < dimsA; ++i) { + int dimA = aShape[i]; + int dimB = bShape[i]; + bool broadcastable = (dimA == dimB || dimA == 1); + UTIL_THROW_IF2(!broadcastable, + "Cannot broadcast tensor dimension " + << dimA << " to " << dimB); + if(dimA == 1 && dimB != 1) { + if(i == 0) { + Expr one = a.graph()->ones(keywords::shape={bShape[0], 1}); + a = dot(one, a); + } + else if(i == 1) { + Expr one = a.graph()->ones(keywords::shape={1, bShape[1]}); + a = dot(a, one); + } + else { + UTIL_THROW2("Not implemented"); + } + } + } + return a; + } +} + +Expr operator+(Expr a, Expr b) { + Shape shape = newShape(a, b); + Expr cast_a = broadcast(shape, a); + Expr cast_b = broadcast(shape, b); + return Expr(a.graph(), new PlusNodeOp(cast_a, cast_b)); +} + +Expr operator-(Expr a, Expr b) { + Shape shape = newShape(a, b); + Expr cast_a = broadcast(shape, a); + Expr cast_b = broadcast(shape, b); + return Expr(a.graph(), new MinusNodeOp(cast_a, cast_b)); +} + +Expr operator*(Expr a, Expr b) { + Shape shape = newShape(a, b); + Expr cast_a = broadcast(shape, a); + Expr cast_b = broadcast(shape, b); + return Expr(a.graph(), new MultNodeOp(cast_a, cast_b)); +} + +Expr operator/(Expr a, Expr b) { + Shape shape = newShape(a, b); + Expr cast_a = broadcast(shape, a); + Expr cast_b = broadcast(shape, b); + return Expr(a.graph(), new DivNodeOp(cast_a, cast_b)); +} + +Expr dot(Expr a, Expr b) { + return Expr(a.graph(), new DotNodeOp(a, b)); +} + +} diff --git a/src/expression_operators.h b/src/expression_operators.h index 957ceed1..4cb69dbb 100644 --- a/src/expression_operators.h +++ b/src/expression_operators.h @@ -1,115 +1,36 @@ #pragma once -#include "graph.h" -#include "graph_operators.h" -#include "expressions.h" +#include "expression_graph.h" namespace marian { -template -inline Expr input(Args ...args) { - return Expr(new InputNode(args...)); -} +Expr named(Expr a, const std::string& name); -template -inline Expr param(Args ...args) { - return Expr(new ParamNode(args...)); -} -template -inline Expr constant(Args ...args) { - return Expr(new ConstantNode(args...)); -} +Expr logit(Expr a); -template -inline Expr ones(Args ...args) { - return Expr(new ConstantNode(keywords::value=1, args...)); -} +Expr tanh(Expr a); -template -inline Expr zeroes(Args ...args) { - return Expr(new ConstantNode(keywords::value=0, args...)); -} +Expr log(Expr a); + +Expr exp(Expr a); + +Expr operator-(Expr a); /*********************************************************/ -inline Expr logit(Expr a) { - return Expr(new LogitNodeOp(a)); -} +Expr operator+(Expr a, Expr b); -inline Expr tanh(Expr a) { - return Expr(new TanhNodeOp(a)); -} +Expr operator-(Expr a, Expr b); -inline Expr log(Expr a) { - return Expr(new LogNodeOp(a)); -}; +Expr operator*(Expr a, Expr b); -inline Expr exp(Expr a) { - return Expr(new ExpNodeOp(a)); -}; +Expr operator/(Expr a, Expr b); -inline Expr operator-(Expr a) { - return Expr(new NegNodeOp(a)); -}; - -/*********************************************************/ - -inline Expr operator+(Expr a, Expr b) { - return Expr(new PlusNodeOp(a, b)); -} - -inline Expr operator-(Expr a, Expr b) { - return Expr(new MinusNodeOp(a, b)); -} - -inline Expr operator*(Expr a, Expr b) { - return Expr(new MultNodeOp(a, b)); -} - -inline Expr operator/(Expr a, Expr b) { - return Expr(new 
DivNodeOp(a, b)); -} - -inline Expr dot(Expr a, Expr b) { - return Expr(new DotNodeOp(a, b)); -} +Expr dot(Expr a, Expr b); /******************************************************/ -Expr broadcast(Shape bShape, Expr a) { - const Shape& aShape = a.node()->shape(); - if(aShape == bShape) { - return a; - } - else { - size_t dimsA = aShape.size(); - size_t dimsB = bShape.size(); - UTIL_THROW_IF2(dimsA != dimsB, - "Tensor and shape have different number of dimensions"); - for(size_t i = 0; i < dimsA; ++i) { - int dimA = aShape[i]; - int dimB = bShape[i]; - bool broadcastable = (dimA == dimB || dimA == 1); - UTIL_THROW_IF2(!broadcastable, - "Cannot broadcast tensor dimension " - << dimA << " to " << dimB); - if(dimA == 1 && dimB != 1) { - if(i == 0) { - Expr one = ones(keywords::shape={bShape[0], 1}); - a = dot(one, a); - } - else if(i == 1) { - Expr one = ones(keywords::shape={1, bShape[1]}); - a = dot(a, one); - } - else { - UTIL_THROW2("Not implemented"); - } - } - } - return a; - } -} +Expr broadcast(Shape bShape, Expr a); /*********************************************************/ @@ -126,7 +47,7 @@ inline Expr sum(Expr a, Args ...args) { int rows = n->val().shape()[0]; return {1, rows}; }; - Expr one = ones(shape={1, n->shape()[0]}, + Expr one = a.graph()->ones(shape={1, n->shape()[0]}, lazy_shape=lshape); return dot(one, a); } @@ -136,8 +57,8 @@ inline Expr sum(Expr a, Args ...args) { //std::cerr << "Shape will be " << cols << " by 1." << std::endl; return {cols, 1}; }; - Expr one = ones(shape={n->shape()[1], 1}, - lazy_shape=lshape); + Expr one = a.graph()->ones(shape={n->shape()[1], 1}, + lazy_shape=lshape); return dot(a, one); } else if(ax == 2) { @@ -151,17 +72,12 @@ inline Expr sum(Expr a, Args ...args) { // inefficient template -inline Expr softmax(Expr a, Args ...args) { +Expr softmax(Expr a, Args ...args) { Expr e = exp(a); return e / sum(e, args...); } -template -inline Expr softmax_fast(Expr a, Args ...args) { - Expr e = Expr(new SoftmaxNodeOp(a, args...)); - return e; -} - +Expr softmax_fast(Expr a); // inefficient template @@ -173,12 +89,12 @@ inline Expr mean(Expr a, Args ...args) { ChainPtr n = a.node(); switch (ax) { case 0: - return sum(a, axis=0) / constant(shape={1, 1}, + return sum(a, axis=0) / a.graph()->constant(shape={1, 1}, lazy_value=[n]() -> Float { return n->val().shape()[0]; }); case 1: - return sum(a, axis=1) / constant(shape={1, 1}, + return sum(a, axis=1) / a.graph()->constant(shape={1, 1}, lazy_value=[n]() -> Float { return n->val().shape()[1]; }); @@ -187,7 +103,7 @@ inline Expr mean(Expr a, Args ...args) { case 3: UTIL_THROW2("Not implemented"); default: - return sum(a) / constant(shape={1, 1}, + return sum(a) / a.graph()->constant(shape={1, 1}, lazy_value=[n]() -> Float { return n->val().size(); }); diff --git a/src/expressions.cu b/src/expressions.cu deleted file mode 100644 index b2ff90ba..00000000 --- a/src/expressions.cu +++ /dev/null @@ -1,59 +0,0 @@ -#include -#include "expressions.h" -#include "graph_operators.h" - -using namespace std; - -namespace marian { - -Expr::Expr(Chainable* chainable) : pimpl_(chainable) {} -Expr::Expr(Float v) : pimpl_(new ConstantNode(keywords::value=v, - keywords::shape={1,1})) {} - -Tensor Expr::val() { - return pimpl_->val(); -} - -Tensor Expr::grad() { - return pimpl_->grad(); -} - -ChainPtr Expr::node() { - return pimpl_; -} - -void Expr::forward(size_t batchSize) { - UTIL_THROW_IF2(pimpl_.get() != Chainable::stack.back(), - "Trying to call forward on non-root of computation graph"); - for(auto&& v : 
Chainable::stack) { - v->allocate(batchSize); - } - for(auto&& v : Chainable::stack) - v->forward(); -} - -void Expr::backward() { - UTIL_THROW_IF2(pimpl_.get() != Chainable::stack.back(), - "Trying to call backward on non-root of computation graph"); - for(auto&& v : Chainable::stack) - v->set_zero_adjoint(); - - typedef typename Chainable::ChainableStack::reverse_iterator It; - pimpl_->init_dependent(); - for(It it = Chainable::stack.rbegin(); it != Chainable::stack.rend(); ++it) - (*it)->backward(); -} - -Expr::operator ChainPtr() { - return pimpl_; -} - -std::string Expr::Debug() const -{ - stringstream strm; - const Shape &shape = pimpl_->shape(); - strm << marian::Debug(shape); - return strm.str(); -} - -} diff --git a/src/expressions.h b/src/expressions.h deleted file mode 100644 index 43016dac..00000000 --- a/src/expressions.h +++ /dev/null @@ -1,33 +0,0 @@ -#pragma once - -#include "definitions.h" -#include "graph.h" - -namespace marian { - -class Expr { - public: - Expr(Chainable* chainable); - Expr(Float v); - - Expr operator=(Tensor t) { - pimpl_->setVal(t); - return *this; - } - - Tensor val(); - Tensor grad(); - - void forward(size_t batchSize); - void backward(); - - ChainPtr node(); - operator ChainPtr(); - - std::string Debug() const; - - private: - ChainPtr pimpl_; -}; - -} diff --git a/src/marian.h b/src/marian.h index 0876d4cd..5cc06dd7 100644 --- a/src/marian.h +++ b/src/marian.h @@ -1,9 +1,7 @@ #pragma once #include "definitions.h" -#include "graph.h" -#include "graph_operators.h" -#include "expressions.h" -#include "expression_operators.h" +#include "expression_graph.h" #include "param_initializers.h" +#include "expression_operators.h" diff --git a/src/graph.h b/src/node.h similarity index 68% rename from src/graph.h rename to src/node.h index 329720b4..29d240cd 100644 --- a/src/graph.h +++ b/src/node.h @@ -2,36 +2,10 @@ #include "keywords.h" #include "tensor.h" +#include "chainable.h" namespace marian { -template -struct Chainable { - Chainable() { } - virtual ~Chainable() { } - virtual void forward() { } - virtual void backward() { } - virtual void init_dependent() { } - virtual void set_zero_adjoint() { } - - virtual void allocate(size_t) = 0; - - virtual const Shape& shape() = 0; - virtual DataType &val() = 0; - virtual DataType grad() = 0; - virtual void setVal(DataType t) { - UTIL_THROW2("Tensors can only be assigned to input nodes"); - }; - - typedef std::vector*> ChainableStack; - static ChainableStack stack; -}; - -template -typename Chainable::ChainableStack Chainable::stack; - -typedef std::shared_ptr> ChainPtr; - class Node : public Chainable, public keywords::Keywords { public: @@ -40,9 +14,7 @@ class Node : public Chainable, : Keywords(args...), shape_(Get(keywords::shape, {1, 1})), name_(Get(keywords::name, "none")) - { - stack.push_back(this); - } + { } virtual ~Node() {}; diff --git a/src/graph_operators.h b/src/node_operators.h similarity index 70% rename from src/graph_operators.h rename to src/node_operators.h index a6320201..e5cf2110 100644 --- a/src/graph_operators.h +++ b/src/node_operators.h @@ -1,7 +1,6 @@ #pragma once -#include "expressions.h" -#include "graph.h" +#include "node.h" #include "tensor_operators.h" namespace marian { @@ -108,49 +107,14 @@ struct TanhNodeOp : public UnaryNodeOp { } }; -struct ArgmaxOp : public UnaryNodeOp { - template - ArgmaxOp(ChainPtr a, Args ...args) - : UnaryNodeOp(a, keywords::shape=newShape(a, -1), args...), - axis_(-1) { } - - Shape newShape(ChainPtr a, int axis) { - Shape shape1 = a->shape(); - 
UTIL_THROW_IF2(shape1.size() > 2, - "Tensors with more than 2 dimensions not supported yet"); - if(axis == 0) { - shape1[0] = 1; - } - else if(axis == 1) { - shape1[1] = 1; - } - else { - shape1 = {1, 1}; - } - return shape1; - } - - void forward() { - //val_ = Argmax(a_->val(), axis_); - UTIL_THROW2("Not implemented"); - } - - void backward() { - UTIL_THROW2("Not implemented"); - } - - private: - int axis_; -}; - // @TODO, make this numerically safe(r): // softmax(X) = softmax_safe(X - max(X, axis=1)) // Probably best to do this directly in Softmax // function. struct SoftmaxNodeOp : public UnaryNodeOp { template - SoftmaxNodeOp(ChainPtr a, Args ...args) - : UnaryNodeOp(a, args...) { } + SoftmaxNodeOp(Args ...args) + : UnaryNodeOp(args...) { } void forward() { // B = softmax(A). @@ -171,8 +135,8 @@ struct SoftmaxNodeOp : public UnaryNodeOp { struct LogNodeOp : public UnaryNodeOp { template - LogNodeOp(ChainPtr a, Args ...args) - : UnaryNodeOp(a, args...) {} + LogNodeOp(Args ...args) + : UnaryNodeOp(args...) {} void forward() { Element(_1 = Log(_2), val_, a_->val()); @@ -186,8 +150,8 @@ struct LogNodeOp : public UnaryNodeOp { struct ExpNodeOp : public UnaryNodeOp { template - ExpNodeOp(ChainPtr a, Args ...args) - : UnaryNodeOp(a, args...) { } + ExpNodeOp(Args ...args) + : UnaryNodeOp(args...) { } void forward() { Element(_1 = Exp(_2), val_, a_->val()); @@ -230,7 +194,7 @@ struct DotNodeOp : public BinaryNodeOp { template DotNodeOp(ChainPtr a, ChainPtr b, Args ...args) : BinaryNodeOp(a, b, - keywords::shape=newShape(a,b), + keywords::shape=newShape(a, b), args...) { } Shape newShape(ChainPtr a, ChainPtr b) { @@ -258,41 +222,11 @@ struct DotNodeOp : public BinaryNodeOp { } }; -Expr broadcast(Shape shape, Expr a); - -struct BroadcastingNodeOp : public BinaryNodeOp { +struct PlusNodeOp : public BinaryNodeOp { template - BroadcastingNodeOp(Expr a, Expr b, Args ...args) - : BinaryNodeOp(broadcast(newShape(a ,b), a), - broadcast(newShape(a ,b), b), - keywords::shape=newShape(a, b), - args...) {} - - static Shape newShape(ChainPtr a, ChainPtr b) { - size_t dimsA = a->shape().size(); - size_t dimsB = b->shape().size(); - UTIL_THROW_IF2(dimsA != dimsB, - "Tensors have different numbers of dimensions"); - Shape shape(dimsA); - for(size_t i = 0; i < dimsA; ++i) { - int dimA = a->shape()[i]; - int dimB = b->shape()[i]; - bool broadcastable = (dimA == dimB || dimA == 1 || dimB == 1); - UTIL_THROW_IF2(!broadcastable, "Different dimensions in elementwise " - << "operation cannot be broadcasted: " << dimA << " != " << dimB); - shape[i] = std::max(dimA, dimB); - if(dimA == whatevs || dimB == whatevs) - shape[i] = whatevs; - } - return shape; - } -}; - - -struct PlusNodeOp : public BroadcastingNodeOp { - template - PlusNodeOp(Args ...args) : BroadcastingNodeOp(args...) { } - + PlusNodeOp(ChainPtr a, ChainPtr b, Args ...args) + : BinaryNodeOp(a, b, keywords::shape=a->shape(), args...) { } + void forward() { Element(_1 = _2 + _3, val_, a_->val(), b_->val()); @@ -306,10 +240,11 @@ struct PlusNodeOp : public BroadcastingNodeOp { } }; -struct MinusNodeOp : public BroadcastingNodeOp { +struct MinusNodeOp : public BinaryNodeOp { template - MinusNodeOp(Args ...args) : BroadcastingNodeOp(args...) { } - + MinusNodeOp(ChainPtr a, ChainPtr b, Args ...args) + : BinaryNodeOp(a, b, keywords::shape=a->shape(), args...) 
{ } + void forward() { Element(_1 = _2 - _3, val_, a_->val(), b_->val()); @@ -323,10 +258,11 @@ struct MinusNodeOp : public BroadcastingNodeOp { } }; -struct MultNodeOp : public BroadcastingNodeOp { +struct MultNodeOp : public BinaryNodeOp { template - MultNodeOp(Args ...args) : BroadcastingNodeOp(args...) { } - + MultNodeOp(ChainPtr a, ChainPtr b, Args ...args) + : BinaryNodeOp(a, b, keywords::shape=a->shape(), args...) { } + void forward() { Element(_1 = _2 * _3, val_, a_->val(), b_->val()); @@ -340,9 +276,10 @@ struct MultNodeOp : public BroadcastingNodeOp { } }; -struct DivNodeOp : public BroadcastingNodeOp { +struct DivNodeOp : public BinaryNodeOp { template - DivNodeOp(Args ...args) : BroadcastingNodeOp(args...) { } + DivNodeOp(ChainPtr a, ChainPtr b, Args ...args) + : BinaryNodeOp(a, b, keywords::shape=a->shape(), args...) { } void forward() { Element(_1 = _2 / _3, diff --git a/src/param_initializers.h b/src/param_initializers.h index 04c6b48e..2698d36f 100644 --- a/src/param_initializers.h +++ b/src/param_initializers.h @@ -18,7 +18,7 @@ void ones(Tensor t) { } template -void distribution(Tensor t, float a=0.0, float b=0.1) { +void distribution(Tensor t, float a, float b) { std::random_device device; std::default_random_engine engine(device()); Distribution dist(a, b); @@ -43,7 +43,7 @@ std::function uniform(float a = 0.0, float b = 0.1) { } std::function from_vector(const std::vector& v) { - return [&v](Tensor t) { + return [v](Tensor t) { t << v; }; } diff --git a/src/sgd.cu b/src/sgd.cu index 0a276835..5fe69138 100644 --- a/src/sgd.cu +++ b/src/sgd.cu @@ -7,15 +7,11 @@ using namespace std; namespace marian { -SGD::SGD(Expr& cost_func, Expr& inX, Expr& inY, - const std::vector params, float eta, +SGD::SGD(ExpressionGraph& g, float eta, std::vector& xData, size_t numFeatures, std::vector& yData, size_t numClasses, size_t epochs, size_t batchSize) -: cost_function_(&cost_func), - inX_(&inX), - inY_(&inY), - params_(params), +: graph_(g), eta_(eta), xData_(xData), numFeatures_(numFeatures), @@ -45,11 +41,11 @@ void SGD::Run() size_t endId = startId + batchSize; PrepareBatch(startId, endId, batchSize, shuffle, xt, yt); - *inX_ = xt; - *inY_ = yt; + graph_["x"] = xt; + graph_["y"] = yt; - cost_function_->forward(maxBatchSize_); - cost_function_->backward(); + graph_.forward(maxBatchSize_); + graph_.backward(); UpdateModel(); @@ -136,9 +132,9 @@ void SGD::PrepareBatch( } void SGD::UpdateModel() { - for (auto& param : params_) { + for (auto& param : graph_.params()) { using namespace thrust::placeholders; - Element(_1 = _1 - eta_ * _2, param->val(), param->grad()); + Element(_1 -= eta_ * _2, param.val(), param.grad()); } } diff --git a/src/sgd.h b/src/sgd.h index 33364049..a99acd75 100644 --- a/src/sgd.h +++ b/src/sgd.h @@ -3,7 +3,7 @@ #include #include -#include "expressions.h" +#include "expression_graph.h" #include "thrust_functions.h" #include "tensor_operators.h" @@ -11,8 +11,7 @@ namespace marian { class SGD { public: - SGD(Expr& cost_func, Expr& inX, Expr& inY, - const std::vector params, float eta, + SGD(ExpressionGraph& g, float eta, std::vector& xData, size_t numFeatures, std::vector& yData, size_t numClasses, size_t epochs, size_t batchSize); @@ -20,10 +19,7 @@ class SGD { void Run(); private: - Expr *cost_function_; - Expr *inX_; - Expr *inY_; - std::vector params_; + ExpressionGraph& graph_; const float eta_; std::vector& xData_; const size_t numFeatures_; diff --git a/src/test.cu b/src/test.cu index 4f63def6..7da85c9d 100644 --- a/src/test.cu +++ b/src/test.cu @@ -1,55 
+1,70 @@ - +#include #include "marian.h" #include "mnist.h" +#include "vocab.h" int main(int argc, char** argv) { cudaSetDevice(0); + using namespace std; using namespace marian; using namespace keywords; + Vocab sourceVocab, targetVocab; + int input_size = 10; int output_size = 2; int batch_size = 25; int hidden_size = 5; int num_inputs = 8; - std::vector X(num_inputs); - std::vector Y(num_inputs); - std::vector H(num_inputs); + std::vector X; + std::vector Y; + std::vector H; + + ExpressionGraph g; for (int t = 0; t < num_inputs; ++t) { - X[t] = new Expr(input(shape={batch_size, input_size})); - Y[t] = new Expr(input(shape={batch_size, output_size})); + X.emplace_back(g.input(shape={batch_size, input_size})); + Y.emplace_back(g.input(shape={batch_size, output_size})); } - Expr Wxh = param(shape={input_size, hidden_size}, init=uniform(), name="Wxh"); - Expr Whh = param(shape={hidden_size, hidden_size}, init=uniform(), name="Whh"); - Expr bh = param(shape={1, hidden_size}, init=uniform(), name="bh"); - Expr h0 = param(shape={1, hidden_size}, init=uniform(), name="h0"); + Expr Wxh = g.param(shape={input_size, hidden_size}, init=uniform(), name="Wxh"); + Expr Whh = g.param(shape={hidden_size, hidden_size}, init=uniform(), name="Whh"); + Expr bh = g.param(shape={1, hidden_size}, init=uniform(), name="bh"); + Expr h0 = g.param(shape={1, hidden_size}, init=uniform(), name="h0"); + + // read parallel corpus from file + std::fstream sourceFile("../examples/mt/dev/newstest2013.de"); + std::fstream targetFile("../examples/mt/dev/newstest2013.en"); + + string sourceLine, targetLine; + while (getline(sourceFile, sourceLine)) { + getline(targetFile, targetLine); + + std::vector sourceIds = sourceVocab.ProcessSentence(sourceLine); + std::vector targetIds = sourceVocab.ProcessSentence(targetLine); + } std::cerr << "Building RNN..." << std::endl; - H[0] = new Expr(tanh(dot(*X[0], Wxh) + dot(h0, Whh) + bh)); + H.emplace_back(tanh(dot(X[0], Wxh) + dot(h0, Whh) + bh)); for (int t = 1; t < num_inputs; ++t) { - H[t] = new Expr(tanh(dot(*X[t], Wxh) + dot(*H[t-1], Whh) + bh)); + H.emplace_back(tanh(dot(X[t], Wxh) + dot(H[t-1], Whh) + bh)); } - Expr Why = param(shape={hidden_size, output_size}, init=uniform(), name="Why"); - Expr by = param(shape={1, output_size}, init=uniform(), name="by"); + Expr Why = g.param(shape={hidden_size, output_size}, init=uniform(), name="Why"); + Expr by = g.param(shape={1, output_size}, init=uniform(), name="by"); std::cerr << "Building output layer..." 
<< std::endl; - std::vector Yp(num_inputs); + std::vector Yp; - Expr* cross_entropy = NULL; - for (int t = 0; t < num_inputs; ++t) { - Yp[t] = new Expr(softmax_fast(dot(*H[t], Why) + by, name="pred")); - if (!cross_entropy) { - cross_entropy = new Expr(sum(*Y[t] * log(*Yp[t]), axis=1)); - } else { - *cross_entropy = *cross_entropy + sum(*Y[t] * log(*Yp[t]), axis=1); - } + Yp.emplace_back(softmax_fast(dot(H[0], Why) + by)); + Expr cross_entropy = sum(Y[0] * log(Yp[0]), axis=1); + for (int t = 1; t < num_inputs; ++t) { + Yp.emplace_back(softmax_fast(dot(H[t], Why) + by)); + cross_entropy = cross_entropy + sum(Y[t] * log(Yp[t]), axis=1); } - auto graph = -mean(*cross_entropy, axis=0, name="cost"); + auto graph = -mean(cross_entropy, axis=0, name="cost"); for (int t = 0; t < num_inputs; ++t) { Tensor Xt({batch_size, input_size}); @@ -72,17 +87,17 @@ int main(int argc, char** argv) { thrust::copy(values.begin(), values.end(), Xt.begin()); thrust::copy(classes.begin(), classes.end(), Yt.begin()); - *X[t] = Xt; - *Y[t] = Yt; + X[t] = Xt; + Y[t] = Yt; } - graph.forward(batch_size); - graph.backward(); + g.forward(batch_size); + g.backward(); std::cerr << graph.val().Debug() << std::endl; - std::cerr << X[0]->val().Debug() << std::endl; - std::cerr << Y[0]->val().Debug() << std::endl; + std::cerr << X[0].val().Debug() << std::endl; + std::cerr << Y[0].val().Debug() << std::endl; std::cerr << Whh.grad().Debug() << std::endl; std::cerr << bh.grad().Debug() << std::endl; diff --git a/src/train_mnist.cu b/src/train_mnist.cu index aa21597a..64ccf564 100644 --- a/src/train_mnist.cu +++ b/src/train_mnist.cu @@ -16,22 +16,24 @@ int main(int argc, char** argv) { using namespace marian; using namespace keywords; - Expr x = input(shape={whatevs, IMAGE_SIZE}, name="X"); - Expr y = input(shape={whatevs, LABEL_SIZE}, name="Y"); + ExpressionGraph g; + + Expr x = named(g.input(shape={whatevs, IMAGE_SIZE}), "x"); + Expr y = named(g.input(shape={whatevs, LABEL_SIZE}), "y"); - Expr w = param(shape={IMAGE_SIZE, LABEL_SIZE}, name="W0"); - Expr b = param(shape={1, LABEL_SIZE}, name="b0"); + Expr w = named(g.param(shape={IMAGE_SIZE, LABEL_SIZE}), "w"); + Expr b = named(g.param(shape={1, LABEL_SIZE}), "b"); std::vector params; params.push_back(&w); params.push_back(&b); auto scores = dot(x, w) + b; - auto lr = softmax_fast(scores, axis=1, name="pred"); - auto cost = -mean(sum(y * log(lr), axis=1), axis=0, name="cost"); + auto lr = softmax_fast(scores); + auto cost = named(-mean(sum(y * log(lr), axis=1), axis=0), "cost"); cerr << "lr=" << lr.Debug() << endl; - SGD opt(cost, x, y, params, 0.9, trainImages, IMAGE_SIZE, trainLabels, LABEL_SIZE, 3, 24); + SGD opt(g, 0.9, trainImages, IMAGE_SIZE, trainLabels, LABEL_SIZE, 3, 24); opt.Run(); return 0; } diff --git a/src/validate_mnist.cu b/src/validate_mnist.cu index 43e1fedc..cbd3e0a3 100644 --- a/src/validate_mnist.cu +++ b/src/validate_mnist.cu @@ -2,24 +2,15 @@ #include "marian.h" #include "mnist.h" #include "npz_converter.h" -#include "param_initializers.h" using namespace marian; using namespace keywords; -int main(int argc, char** argv) { - - cudaSetDevice(1); - - const size_t IMAGE_SIZE = 784; - const size_t LABEL_SIZE = 10; - int BATCH_SIZE = 10000; - - std::cerr << "Loading test set..."; - std::vector testImages = datasets::mnist::ReadImages("../examples/mnist/t10k-images-idx3-ubyte", BATCH_SIZE, IMAGE_SIZE); - std::vector testLabels = datasets::mnist::ReadLabels("../examples/mnist/t10k-labels-idx1-ubyte", BATCH_SIZE, LABEL_SIZE); - std::cerr << "Done." 
<< std::endl; +const size_t IMAGE_SIZE = 784; +const size_t LABEL_SIZE = 10; +int BATCH_SIZE = 10000; +ExpressionGraph build_graph() { std::cerr << "Loading model params..."; NpzConverter converter("../scripts/test_model_single/model.npz"); @@ -31,29 +22,50 @@ int main(int argc, char** argv) { std::cerr << "Building model..."; - auto x = input(shape={whatevs, IMAGE_SIZE}); - auto y = input(shape={whatevs, LABEL_SIZE}); + ExpressionGraph g; + auto x = named(g.input(shape={whatevs, IMAGE_SIZE}), "x"); + auto y = named(g.input(shape={whatevs, LABEL_SIZE}), "y"); - auto w = param(shape={IMAGE_SIZE, LABEL_SIZE}, - init=from_vector(wData)); - auto b = param(shape={1, LABEL_SIZE}, - init=from_vector(bData)); + auto w = named(g.param(shape={IMAGE_SIZE, LABEL_SIZE}, + init=from_vector(wData)), "w"); + auto b = named(g.param(shape={1, LABEL_SIZE}, + init=from_vector(bData)), "b"); - auto probs = softmax_fast(dot(x, w) + b, axis=1); - auto cost = -mean(sum(y * log(probs), axis=1), axis=0); + auto probs = named( + softmax_fast(dot(x, w) + b), //, axis=1), + "probs" + ); + + auto cost = named( + -mean(sum(y * log(probs), axis=1), axis=0), + "cost" + ); std::cerr << "Done." << std::endl; + return g; +} +int main(int argc, char** argv) { + + cudaSetDevice(1); + + std::cerr << "Loading test set..."; + std::vector testImages = datasets::mnist::ReadImages("../examples/mnist/t10k-images-idx3-ubyte", BATCH_SIZE, IMAGE_SIZE); + std::vector testLabels = datasets::mnist::ReadLabels("../examples/mnist/t10k-labels-idx1-ubyte", BATCH_SIZE, LABEL_SIZE); + std::cerr << "Done." << std::endl; + + ExpressionGraph g = build_graph(); + Tensor xt({BATCH_SIZE, IMAGE_SIZE}); Tensor yt({BATCH_SIZE, LABEL_SIZE}); - x = xt << testImages; - y = yt << testLabels; + g["x"] = (xt << testImages); + g["y"] = (yt << testLabels); - cost.forward(BATCH_SIZE); + g.forward(BATCH_SIZE); std::vector results; - results << probs.val(); + results << g["probs"].val(); size_t acc = 0; for (size_t i = 0; i < testLabels.size(); i += LABEL_SIZE) { @@ -65,22 +77,22 @@ int main(int argc, char** argv) { } acc += (correct == proposed); } - std::cerr << "Cost: " << cost.val()[0] << " - Accuracy: " << float(acc) / BATCH_SIZE << std::endl; - + std::cerr << "Cost: " << g["cost"].val()[0] << " - Accuracy: " << float(acc) / BATCH_SIZE << std::endl; + float eta = 0.1; for (size_t j = 0; j < 10; ++j) { for(size_t i = 0; i < 60; ++i) { - cost.backward(); + g.backward(); auto update_rule = _1 -= eta * _2; - Element(update_rule, w.val(), w.grad()); - Element(update_rule, b.val(), b.grad()); + for(auto param : g.params()) + Element(update_rule, param.val(), param.grad()); - cost.forward(BATCH_SIZE); + g.forward(BATCH_SIZE); } std::cerr << "Epoch: " << j << std::endl; std::vector results; - results << probs.val(); + results << g["probs"].val(); size_t acc = 0; for (size_t i = 0; i < testLabels.size(); i += LABEL_SIZE) { @@ -92,7 +104,7 @@ int main(int argc, char** argv) { } acc += (correct == proposed); } - std::cerr << "Cost: " << cost.val()[0] << " - Accuracy: " << float(acc) / BATCH_SIZE << std::endl; + std::cerr << "Cost: " << g["cost"].val()[0] << " - Accuracy: " << float(acc) / BATCH_SIZE << std::endl; } return 0; } diff --git a/src/validate_mnist_batch.cu b/src/validate_mnist_batch.cu index 1c66198a..50ab97b5 100644 --- a/src/validate_mnist_batch.cu +++ b/src/validate_mnist_batch.cu @@ -59,13 +59,15 @@ int main(int argc, char** argv) { std::cerr << "\tDone." 
<< std::endl; - auto x = input(shape={whatevs, IMAGE_SIZE}, name="X"); - auto y = input(shape={whatevs, LABEL_SIZE}, name="Y"); + ExpressionGraph g; - auto w1 = param(shape={IMAGE_SIZE, 100}, name="W0", init=initW1); - auto b1 = param(shape={1, 100}, name="b0", init=initB1); - auto w2 = param(shape={100, LABEL_SIZE}, name="W1", init=initW2); - auto b2 = param(shape={1, LABEL_SIZE}, name="b1", init=initB2); + auto x = g.input(shape={whatevs, IMAGE_SIZE}, name="X"); + auto y = g.input(shape={whatevs, LABEL_SIZE}, name="Y"); + + auto w1 = g.param(shape={IMAGE_SIZE, 100}, name="W0", init=initW1); + auto b1 = g.param(shape={1, 100}, name="b0", init=initB1); + auto w2 = g.param(shape={100, LABEL_SIZE}, name="W1", init=initW2); + auto b2 = g.param(shape={1, LABEL_SIZE}, name="b1", init=initB2); std::cerr << "Building model..."; auto layer1 = tanh(dot(x, w1) + b1); @@ -86,7 +88,7 @@ int main(int argc, char** argv) { xt << tmp; x = xt; - predict.forward(BATCH_SIZE); + g.forward(BATCH_SIZE); std::vector results(LABEL_SIZE * BATCH_SIZE); results << predict.val(); @@ -113,7 +115,7 @@ int main(int argc, char** argv) { xt << tmp; x = xt; - predict.forward(endId - startId); + g.forward(endId - startId); std::vector results(LABEL_SIZE * BATCH_SIZE); results << predict.val(); diff --git a/src/vocab.cpp b/src/vocab.cpp new file mode 100644 index 00000000..705c21b2 --- /dev/null +++ b/src/vocab.cpp @@ -0,0 +1,53 @@ +#include "vocab.h" + +using namespace std; + +//////////////////////////////////////////////////////// +inline std::vector Tokenize(const std::string& str, + const std::string& delimiters = " \t") +{ + std::vector tokens; + // Skip delimiters at beginning. + std::string::size_type lastPos = str.find_first_not_of(delimiters, 0); + // Find first "non-delimiter". + std::string::size_type pos = str.find_first_of(delimiters, lastPos); + + while (std::string::npos != pos || std::string::npos != lastPos) { + // Found a token, add it to the vector. + tokens.push_back(str.substr(lastPos, pos - lastPos)); + // Skip delimiters. Note the "not_of" + lastPos = str.find_first_not_of(delimiters, pos); + // Find next "non-delimiter" + pos = str.find_first_of(delimiters, lastPos); + } + + return tokens; +} +//////////////////////////////////////////////////////// + +size_t Vocab::GetOrCreate(const std::string &word) +{ + size_t id; + Coll::const_iterator iter = coll_.find(word); + if (iter == coll_.end()) { + id = coll_.size(); + coll_[word] = id; + } + else { + id = iter->second; + } + return id; +} + +std::vector Vocab::ProcessSentence(const std::string &sentence) +{ + vector toks = Tokenize(sentence); + vector ret(toks.size()); + + for (size_t i = 0; i < toks.size(); ++i) { + size_t id = GetOrCreate(toks[i]); + ret[i] = id; + } + + return ret; +} diff --git a/src/vocab.h b/src/vocab.h new file mode 100644 index 00000000..5e055511 --- /dev/null +++ b/src/vocab.h @@ -0,0 +1,17 @@ +#pragma once + +#include +#include +#include + +class Vocab +{ +public: + size_t GetOrCreate(const std::string &word); + std::vector ProcessSentence(const std::string &sentence); + +protected: + typedef std::unordered_map Coll; + Coll coll_; +}; +
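
The new ExpressionGraph owns the tape that used to live in the static Chainable::stack: constructing an Expr pushes its node onto the graph's stack, forward() allocates and evaluates the stack front to back, and backward() zeroes the adjoints, seeds the last node with init_dependent() and sweeps the stack in reverse. A minimal, self-contained sketch of that tape idea (plain C++ scalars with hypothetical names, not the Marian classes):

```cpp
// Tape-based reverse-mode autodiff in miniature: nodes are recorded in build
// order, forward() runs the tape front-to-back, backward() seeds the last
// node's adjoint with 1 and runs the tape back-to-front.
#include <cmath>
#include <iostream>
#include <memory>
#include <vector>

struct Node {
  double val = 0, adj = 0;                 // value and adjoint (gradient)
  virtual void forward() {}
  virtual void backward() {}
  virtual ~Node() {}
};

struct Var : Node {};                       // leaf: value set by the user

struct Mul : Node {                         // c = a * b
  Node *a, *b;
  Mul(Node* a, Node* b) : a(a), b(b) {}
  void forward() override { val = a->val * b->val; }
  void backward() override { a->adj += adj * b->val; b->adj += adj * a->val; }
};

struct Tanh : Node {                        // c = tanh(a)
  Node* a;
  explicit Tanh(Node* a) : a(a) {}
  void forward() override { val = std::tanh(a->val); }
  void backward() override { a->adj += adj * (1 - val * val); }
};

int main() {
  std::vector<std::unique_ptr<Node>> tape;  // plays the role of ChainableStack
  auto* x = new Var();     tape.emplace_back(x);
  auto* w = new Var();     tape.emplace_back(w);
  auto* p = new Mul(x, w); tape.emplace_back(p);
  auto* y = new Tanh(p);   tape.emplace_back(y);

  x->val = 0.5; w->val = 2.0;
  for (auto& n : tape) n->forward();        // like ExpressionGraph::forward()

  for (auto& n : tape) n->adj = 0;          // like set_zero_adjoint()
  tape.back()->adj = 1;                     // like init_dependent()
  for (auto it = tape.rbegin(); it != tape.rend(); ++it)
    (*it)->backward();                      // reverse sweep

  std::cout << "y = " << y->val << ", dy/dw = " << w->adj << "\n";
}
```

Keeping the tape per graph (rather than in a static member) is what lets several graphs coexist and lets SGD drive forward/backward through the graph object instead of through a designated cost Expr.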
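
operator+, -, * and / now compute a common shape with newShape() and call broadcast(), which replicates a size-1 dimension by a matrix product with a ones tensor: a {1, N} row becomes {M, N} via dot(ones({M, 1}), row), and an {M, 1} column becomes {M, N} via dot(col, ones({1, N})). A small standalone illustration of the row case (plain C++, not Marian code):

```cpp
// Replicating rows by multiplying with a matrix of ones, so that a bias row
// of shape {1, N} can be added elementwise to activations of shape {M, N}.
#include <iostream>
#include <vector>

using Matrix = std::vector<std::vector<float>>;

Matrix matmul(const Matrix& A, const Matrix& B) {
  size_t M = A.size(), K = A[0].size(), N = B[0].size();
  Matrix C(M, std::vector<float>(N, 0.f));
  for (size_t i = 0; i < M; ++i)
    for (size_t k = 0; k < K; ++k)
      for (size_t j = 0; j < N; ++j)
        C[i][j] += A[i][k] * B[k][j];
  return C;
}

int main() {
  const size_t M = 3;
  Matrix bias = {{10.f, 20.f}};                 // shape {1, N}
  Matrix ones(M, std::vector<float>(1, 1.f));   // shape {M, 1}
  Matrix tiled = matmul(ones, bias);            // shape {M, N}: row repeated M times
  for (auto& row : tiled) {
    for (float v : row) std::cout << v << ' ';
    std::cout << '\n';                          // prints "10 20" three times
  }
}
```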
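
SoftmaxNodeOp keeps the @TODO about numerical safety: the naive softmax(a) = exp(a) / sum(exp(a), axis=1) in expression_operators.h overflows for large logits, whereas softmax(X - max(X, axis=1)) gives the same probabilities without overflow. A quick standalone check of the shifted form (CPU sketch, not the CUDA kernel):

```cpp
// Numerically safe softmax: subtracting the row maximum leaves the result
// unchanged mathematically but keeps exp() inside floating-point range.
#include <algorithm>
#include <cmath>
#include <iostream>
#include <vector>

std::vector<float> softmax_safe(std::vector<float> row) {
  float mx = *std::max_element(row.begin(), row.end());
  float sum = 0.f;
  for (float& v : row) { v = std::exp(v - mx); sum += v; }
  for (float& v : row) v /= sum;
  return row;
}

int main() {
  // Naive exp(1000) would overflow to inf; the shifted version is fine.
  for (float p : softmax_safe({1000.f, 1001.f, 1002.f}))
    std::cout << p << ' ';                 // ~0.090 0.245 0.665
}
```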
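
from_vector() now captures its vector by value ([v] instead of [&v]); with the reference capture, the returned initializer could outlive the vector supplied at the call site and read freed memory. A tiny illustration of why the value capture is the safe choice (hypothetical names, not Marian code):

```cpp
// A callback that captures by value owns its own copy, so it stays valid even
// when the argument was a temporary that is gone by the time it runs.
#include <functional>
#include <iostream>
#include <vector>

std::function<void()> by_value(const std::vector<int>& v) {
  return [v]() { std::cout << v.size() << " elements\n"; };  // copies v
}

int main() {
  auto f = by_value(std::vector<int>{1, 2, 3});  // temporary is gone after this statement
  f();                                           // still prints "3 elements"
}
```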
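
SGD::UpdateModel() now walks graph_.params() and applies the Thrust placeholder expression _1 -= eta_ * _2 to each parameter's value and gradient, i.e. plain stochastic gradient descent w ← w − η·∇w. The same step written out in ordinary C++ for reference:

```cpp
// One SGD step per parameter tensor: subtract the learning rate times the
// gradient from the value, element by element.
#include <cstdio>
#include <vector>

void sgd_step(std::vector<float>& w, const std::vector<float>& grad, float eta) {
  for (size_t i = 0; i < w.size(); ++i)
    w[i] -= eta * grad[i];
}

int main() {
  std::vector<float> w{0.5f, -0.2f}, grad{0.1f, -0.4f};
  sgd_step(w, grad, 0.9f);                 // eta = 0.9 as in train_mnist.cu
  std::printf("%.2f %.2f\n", w[0], w[1]);  // 0.41 0.16
}
```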
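
train_mnist.cu and validate_mnist.cu both build cost = -mean(sum(y * log(probs), axis=1), axis=0): per example, the one-hot row y picks out the log-probability of the true class, and the batch mean is negated. A plain-C++ rendering of that arithmetic (illustrative only):

```cpp
// Cross-entropy over a batch with one-hot targets: -1/B * sum_i sum_j y_ij * log(p_ij).
#include <cmath>
#include <iostream>
#include <vector>

float cross_entropy(const std::vector<std::vector<float>>& y,
                    const std::vector<std::vector<float>>& p) {
  float total = 0.f;
  for (size_t i = 0; i < y.size(); ++i) {
    float row = 0.f;
    for (size_t j = 0; j < y[i].size(); ++j)
      row += y[i][j] * std::log(p[i][j]);  // only the true class contributes
    total += row;                          // sum(..., axis=1)
  }
  return -total / y.size();                // -mean(..., axis=0)
}

int main() {
  std::vector<std::vector<float>> y = {{1, 0}, {0, 1}};
  std::vector<std::vector<float>> p = {{0.8f, 0.2f}, {0.4f, 0.6f}};
  std::cout << cross_entropy(y, p) << "\n";  // ~0.367
}
```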
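
The new Vocab assigns ids in order of first appearance and reuses them for repeated words; ProcessSentence() tokenizes on spaces and tabs and maps each token through GetOrCreate(). A self-contained version of the same behaviour (tokenization simplified to stream extraction, which treats all whitespace alike):

```cpp
// Word-to-id mapping that grows on first sight, as in src/vocab.h.
#include <iostream>
#include <sstream>
#include <string>
#include <unordered_map>
#include <vector>

class Vocab {
 public:
  size_t GetOrCreate(const std::string& word) {
    auto it = coll_.find(word);
    if (it != coll_.end()) return it->second;  // known word: reuse its id
    size_t id = coll_.size();                  // new word: next free id
    coll_[word] = id;
    return id;
  }
  std::vector<size_t> ProcessSentence(const std::string& sentence) {
    std::istringstream in(sentence);
    std::vector<size_t> ids;
    std::string tok;
    while (in >> tok) ids.push_back(GetOrCreate(tok));
    return ids;
  }
 private:
  std::unordered_map<std::string, size_t> coll_;
};

int main() {
  Vocab v;
  for (size_t id : v.ProcessSentence("the cat sat on the mat"))
    std::cout << id << ' ';                    // 0 1 2 3 0 4
}
```

Note that test.cu currently builds both sourceIds and targetIds from sourceVocab; presumably the target sentence should go through targetVocab instead.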