mirror of
https://github.com/marian-nmt/marian.git
synced 2024-09-17 09:47:34 +03:00
Merge branch 'master' of https://github.com/emjotde/Marian
This commit is contained in:
commit
5d924dd160
@ -2,8 +2,8 @@ cmake_minimum_required(VERSION 3.5.1)
|
|||||||
set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
|
set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
|
||||||
|
|
||||||
project(marian CXX)
|
project(marian CXX)
|
||||||
SET(CMAKE_CXX_FLAGS " -std=c++11 -g -O0 -funroll-loops -Wno-unused-result -Wno-deprecated")
|
SET(CMAKE_CXX_FLAGS " -std=c++11 -g -O3 -funroll-loops -Wno-unused-result -Wno-deprecated")
|
||||||
LIST(APPEND CUDA_NVCC_FLAGS --default-stream per-thread; -std=c++11; -g; -O0; -arch=sm_35; -lineinfo; --use_fast_math; -Xcompiler '-fPIC')
|
LIST(APPEND CUDA_NVCC_FLAGS --default-stream per-thread; -std=c++11; -g; -O3; -arch=sm_35; -lineinfo; --use_fast_math; -Xcompiler '-fPIC')
|
||||||
add_definitions(-DCUDA_API_PER_THREAD_DEFAULT_STREAM)
|
add_definitions(-DCUDA_API_PER_THREAD_DEFAULT_STREAM)
|
||||||
SET(CUDA_PROPAGATE_HOST_FLAGS OFF)
|
SET(CUDA_PROPAGATE_HOST_FLAGS OFF)
|
||||||
|
|
||||||
|
4
examples/mt/download.sh
Executable file
4
examples/mt/download.sh
Executable file
@ -0,0 +1,4 @@
|
|||||||
|
|
||||||
|
wget http://data.statmt.org/wmt16/translation-task/dev.tgz
|
||||||
|
tar xvf dev.tgz
|
||||||
|
|
@ -4,10 +4,12 @@ include_directories(.)
|
|||||||
cuda_add_library(marian_lib
|
cuda_add_library(marian_lib
|
||||||
cnpy/cnpy.cpp
|
cnpy/cnpy.cpp
|
||||||
exception.cpp
|
exception.cpp
|
||||||
expressions.cu
|
expression_graph.cu
|
||||||
sgd.cu
|
sgd.cu
|
||||||
tensor.cu
|
tensor.cu
|
||||||
tensor_operators.cu
|
tensor_operators.cu
|
||||||
|
expression_operators.cu
|
||||||
|
vocab.cpp
|
||||||
)
|
)
|
||||||
|
|
||||||
target_link_libraries(marian_lib)
|
target_link_libraries(marian_lib)
|
||||||
|
34
src/chainable.h
Normal file
34
src/chainable.h
Normal file
@ -0,0 +1,34 @@
|
|||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <vector>
|
||||||
|
#include <memory>
|
||||||
|
|
||||||
|
#include "exception.h"
|
||||||
|
|
||||||
|
namespace marian {
|
||||||
|
|
||||||
|
template <class DataType>
|
||||||
|
struct Chainable {
|
||||||
|
Chainable() { }
|
||||||
|
virtual ~Chainable() { }
|
||||||
|
virtual void forward() { }
|
||||||
|
virtual void backward() { }
|
||||||
|
virtual void init_dependent() { }
|
||||||
|
virtual void set_zero_adjoint() { }
|
||||||
|
|
||||||
|
virtual void allocate(size_t) = 0;
|
||||||
|
|
||||||
|
virtual const Shape& shape() = 0;
|
||||||
|
virtual DataType &val() = 0;
|
||||||
|
virtual DataType grad() = 0;
|
||||||
|
virtual void setVal(DataType t) {
|
||||||
|
UTIL_THROW2("Tensors can only be assigned to input nodes");
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
typedef std::vector<Chainable<Tensor>*> ChainableStack;
|
||||||
|
typedef std::shared_ptr<ChainableStack> ChainableStackPtr;
|
||||||
|
typedef std::shared_ptr<Chainable<Tensor>> ChainPtr;
|
||||||
|
|
||||||
|
|
||||||
|
}
|
41
src/expression_graph.cu
Normal file
41
src/expression_graph.cu
Normal file
@ -0,0 +1,41 @@
|
|||||||
|
#include <sstream>
|
||||||
|
#include "expression_graph.h"
|
||||||
|
|
||||||
|
using namespace std;
|
||||||
|
|
||||||
|
namespace marian {
|
||||||
|
|
||||||
|
// Wraps `chainable` in a shared handle and appends it to the node stack of
// graph `g`. The stack receives the raw pointer; the shared handle stored in
// pimpl_ is what keeps the node alive.
Expr::Expr(ExpressionGraphPtr g, Chainable<Tensor>* chainable)
    : graph_(g), pimpl_(chainable) {
  ChainableStackPtr nodes = graph_->stack();
  nodes->push_back(pimpl_.get());
}
|
||||||
|
|
||||||
|
// Returns the value tensor of the wrapped node.
Tensor Expr::val() {
  Chainable<Tensor>& impl = *pimpl_;
  return impl.val();
}
|
||||||
|
|
||||||
|
// Returns the gradient tensor of the wrapped node.
Tensor Expr::grad() {
  Chainable<Tensor>& impl = *pimpl_;
  return impl.grad();
}
|
||||||
|
|
||||||
|
// Returns a shared handle to the wrapped node.
ChainPtr Expr::node() {
  return ChainPtr(pimpl_);
}
|
||||||
|
|
||||||
|
// Returns the graph this expression was registered with at construction.
ExpressionGraphPtr Expr::graph() {
  return this->graph_;
}
|
||||||
|
|
||||||
|
// Implicit conversion to the node handle, so an Expr can be passed wherever
// a ChainPtr is expected.
Expr::operator ChainPtr() {
  return ChainPtr(pimpl_);
}
|
||||||
|
|
||||||
|
std::string Expr::Debug() const
|
||||||
|
{
|
||||||
|
stringstream strm;
|
||||||
|
const Shape &shape = pimpl_->shape();
|
||||||
|
strm << marian::Debug(shape);
|
||||||
|
return strm.str();
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
120
src/expression_graph.h
Normal file
120
src/expression_graph.h
Normal file
@ -0,0 +1,120 @@
|
|||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <map>
|
||||||
|
|
||||||
|
#include "definitions.h"
|
||||||
|
#include "chainable.h"
|
||||||
|
#include "node_operators.h"
|
||||||
|
#include "tensor.h"
|
||||||
|
|
||||||
|
namespace marian {
|
||||||
|
|
||||||
|
class ExpressionGraph;
|
||||||
|
typedef ExpressionGraph* ExpressionGraphPtr;
|
||||||
|
|
||||||
|
class Expr {
|
||||||
|
public:
|
||||||
|
Expr(ExpressionGraphPtr g, Chainable<Tensor>* chainable);
|
||||||
|
|
||||||
|
Expr operator=(Tensor t) {
|
||||||
|
pimpl_->setVal(t);
|
||||||
|
return *this;
|
||||||
|
}
|
||||||
|
|
||||||
|
Tensor val();
|
||||||
|
Tensor grad();
|
||||||
|
|
||||||
|
ExpressionGraphPtr graph();
|
||||||
|
|
||||||
|
ChainPtr node();
|
||||||
|
operator ChainPtr();
|
||||||
|
|
||||||
|
std::string Debug() const;
|
||||||
|
|
||||||
|
private:
|
||||||
|
ExpressionGraphPtr graph_;
|
||||||
|
ChainPtr pimpl_;
|
||||||
|
};
|
||||||
|
|
||||||
|
class ExpressionGraph {
|
||||||
|
public:
|
||||||
|
ExpressionGraph()
|
||||||
|
: stack_(new ChainableStack)
|
||||||
|
{}
|
||||||
|
|
||||||
|
void forward(size_t batchSize) {
|
||||||
|
for(auto&& v : *stack_) {
|
||||||
|
v->allocate(batchSize);
|
||||||
|
}
|
||||||
|
for(auto&& v : *stack_)
|
||||||
|
v->forward();
|
||||||
|
}
|
||||||
|
|
||||||
|
void backward() {
|
||||||
|
for(auto&& v : *stack_)
|
||||||
|
v->set_zero_adjoint();
|
||||||
|
|
||||||
|
typedef typename ChainableStack::reverse_iterator It;
|
||||||
|
stack_->back()->init_dependent();
|
||||||
|
for(It it = stack_->rbegin(); it != stack_->rend(); ++it)
|
||||||
|
(*it)->backward();
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename ...Args>
|
||||||
|
inline Expr input(Args ...args) {
|
||||||
|
return Expr(this, new InputNode(args...));
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename ...Args>
|
||||||
|
inline Expr param(Args ...args) {
|
||||||
|
Expr e(this, new ParamNode(args...));
|
||||||
|
params_.emplace_back(e);
|
||||||
|
return e;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename ...Args>
|
||||||
|
inline Expr constant(Args ...args) {
|
||||||
|
return Expr(this, new ConstantNode(args...));
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename ...Args>
|
||||||
|
inline Expr ones(Args ...args) {
|
||||||
|
return Expr(this, new ConstantNode(keywords::value=1, args...));
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename ...Args>
|
||||||
|
inline Expr zeroes(Args ...args) {
|
||||||
|
return Expr(this, new ConstantNode(keywords::value=0, args...));
|
||||||
|
}
|
||||||
|
|
||||||
|
/*********************************************************/
|
||||||
|
|
||||||
|
ChainableStackPtr stack() {
|
||||||
|
return stack_;
|
||||||
|
}
|
||||||
|
|
||||||
|
Expr& operator[](const std::string& name) {
|
||||||
|
auto it = named_.find(name);
|
||||||
|
UTIL_THROW_IF2(it == named_.end(), "No such named node in graph: " << name);
|
||||||
|
return it->second;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool has_node(const std::string& name) const {
|
||||||
|
return named_.count(name) > 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
void add_named_node(Expr e, const std::string& name) {
|
||||||
|
named_.emplace(name, e);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<Expr>& params() {
|
||||||
|
return params_;
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
ChainableStackPtr stack_;
|
||||||
|
std::map<std::string, Expr> named_;
|
||||||
|
std::vector<Expr> params_;
|
||||||
|
};
|
||||||
|
|
||||||
|
}
|
124
src/expression_operators.cu
Normal file
124
src/expression_operators.cu
Normal file
@ -0,0 +1,124 @@
|
|||||||
|
|
||||||
|
#include "expression_operators.h"
|
||||||
|
#include "node_operators.h"
|
||||||
|
|
||||||
|
namespace marian {
|
||||||
|
|
||||||
|
// Registers `a` under `name` in its own graph and hands the expression back,
// so the call can be chained inside a larger expression.
Expr named(Expr a, const std::string& name) {
  ExpressionGraphPtr owner = a.graph();
  owner->add_named_node(a, name);
  return a;
}
|
||||||
|
|
||||||
|
// Adds a LogitNodeOp over `a` to the graph that owns `a`.
Expr logit(Expr a) {
  Expr result(a.graph(), new LogitNodeOp(a));
  return result;
}
|
||||||
|
|
||||||
|
// Adds a TanhNodeOp over `a` to the graph that owns `a`.
Expr tanh(Expr a) {
  Expr result(a.graph(), new TanhNodeOp(a));
  return result;
}
|
||||||
|
|
||||||
|
// Adds a LogNodeOp over `a` to the graph that owns `a`.
Expr log(Expr a) {
  Expr result(a.graph(), new LogNodeOp(a));
  return result;
}
|
||||||
|
|
||||||
|
// Adds an ExpNodeOp over `a` to the graph that owns `a`.
Expr exp(Expr a) {
  Expr result(a.graph(), new ExpNodeOp(a));
  return result;
}
|
||||||
|
|
||||||
|
// Unary minus: adds a NegNodeOp over `a` to the graph that owns `a`.
Expr operator-(Expr a) {
  Expr result(a.graph(), new NegNodeOp(a));
  return result;
}
|
||||||
|
|
||||||
|
// Adds a single SoftmaxNodeOp over `a` to the graph that owns `a`.
Expr softmax_fast(Expr a) {
  Expr result(a.graph(), new SoftmaxNodeOp(a));
  return result;
}
|
||||||
|
|
||||||
|
/*********************************************************/
|
||||||
|
|
||||||
|
// Computes the result shape of an element-wise operation on `a` and `b`.
//
// Both operands must have the same number of dimensions. Along every axis
// the extents must be equal or one of them must be 1; the result takes the
// larger extent. If either extent equals `whatevs`, the result extent is
// `whatevs` as well (presumably a placeholder for a size fixed later —
// TODO confirm against its definition).
static Shape newShape(ChainPtr a, ChainPtr b) {
  const Shape& lhs = a->shape();
  const Shape& rhs = b->shape();

  size_t dimsA = lhs.size();
  size_t dimsB = rhs.size();
  UTIL_THROW_IF2(dimsA != dimsB,
    "Tensors have different numbers of dimensions");

  Shape merged(dimsA);
  for(size_t axis = 0; axis < dimsA; ++axis) {
    int dimA = lhs[axis];
    int dimB = rhs[axis];

    bool broadcastable = (dimA == dimB || dimA == 1 || dimB == 1);
    UTIL_THROW_IF2(!broadcastable, "Different dimensions in elementwise "
      << "operation cannot be broadcasted: " << dimA << " != " << dimB);

    if(dimA == whatevs || dimB == whatevs)
      merged[axis] = whatevs;
    else
      merged[axis] = std::max(dimA, dimB);
  }
  return merged;
}
|
||||||
|
|
||||||
|
// Expands `a` to shape `bShape` by multiplying with matrices of ones.
//
// Returns `a` unchanged when the shapes already match. Otherwise both shapes
// must have the same number of dimensions and every extent of `a` must equal
// the target extent or be 1. An extent of 1 on axis 0 is expanded with
// dot(ones(bShape[0] x 1), a), on axis 1 with dot(a, ones(1 x bShape[1]));
// expansion on any further axis is not implemented and throws.
Expr broadcast(Shape bShape, Expr a) {
  // NOTE(review): this reference points into the node `a` wraps on entry and
  // is still read after `a` is reassigned below; that relies on the new dot
  // node keeping its operand alive — confirm.
  const Shape& aShape = a.node()->shape();
  if(aShape == bShape)
    return a;

  size_t dimsA = aShape.size();
  size_t dimsB = bShape.size();
  UTIL_THROW_IF2(dimsA != dimsB,
    "Tensor and shape have different number of dimensions");

  for(size_t i = 0; i < dimsA; ++i) {
    int dimA = aShape[i];
    int dimB = bShape[i];

    bool broadcastable = (dimA == dimB || dimA == 1);
    UTIL_THROW_IF2(!broadcastable,
      "Cannot broadcast tensor dimension "
      << dimA << " to " << dimB);

    // Nothing to expand unless the source extent is 1 and the target is not.
    if(dimA != 1 || dimB == 1)
      continue;

    switch(i) {
      case 0: {
        Expr column = a.graph()->ones(keywords::shape={bShape[0], 1});
        a = dot(column, a);
        break;
      }
      case 1: {
        Expr row = a.graph()->ones(keywords::shape={1, bShape[1]});
        a = dot(a, row);
        break;
      }
      default:
        UTIL_THROW2("Not implemented");
    }
  }
  return a;
}
|
||||||
|
|
||||||
|
// Element-wise sum. Both operands are first broadcast to their common shape;
// the statements are kept in this order because each step may append nodes
// to the graph.
Expr operator+(Expr a, Expr b) {
  Shape target = newShape(a, b);
  Expr lhs = broadcast(target, a);
  Expr rhs = broadcast(target, b);
  Expr result(a.graph(), new PlusNodeOp(lhs, rhs));
  return result;
}
|
||||||
|
|
||||||
|
// Element-wise difference. Both operands are first broadcast to their common
// shape; the statements are kept in this order because each step may append
// nodes to the graph.
Expr operator-(Expr a, Expr b) {
  Shape target = newShape(a, b);
  Expr lhs = broadcast(target, a);
  Expr rhs = broadcast(target, b);
  Expr result(a.graph(), new MinusNodeOp(lhs, rhs));
  return result;
}
|
||||||
|
|
||||||
|
// Element-wise product. Both operands are first broadcast to their common
// shape; the statements are kept in this order because each step may append
// nodes to the graph.
Expr operator*(Expr a, Expr b) {
  Shape target = newShape(a, b);
  Expr lhs = broadcast(target, a);
  Expr rhs = broadcast(target, b);
  Expr result(a.graph(), new MultNodeOp(lhs, rhs));
  return result;
}
|
||||||
|
|
||||||
|
// Element-wise quotient. Both operands are first broadcast to their common
// shape; the statements are kept in this order because each step may append
// nodes to the graph.
Expr operator/(Expr a, Expr b) {
  Shape target = newShape(a, b);
  Expr lhs = broadcast(target, a);
  Expr rhs = broadcast(target, b);
  Expr result(a.graph(), new DivNodeOp(lhs, rhs));
  return result;
}
|
||||||
|
|
||||||
|
// Adds a DotNodeOp over `a` and `b` to the graph that owns `a`. No
// broadcasting is applied here.
Expr dot(Expr a, Expr b) {
  Expr product(a.graph(), new DotNodeOp(a, b));
  return product;
}
|
||||||
|
|
||||||
|
}
|
@ -1,115 +1,36 @@
|
|||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include "graph.h"
|
#include "expression_graph.h"
|
||||||
#include "graph_operators.h"
|
|
||||||
#include "expressions.h"
|
|
||||||
|
|
||||||
namespace marian {
|
namespace marian {
|
||||||
|
|
||||||
template <typename ...Args>
|
Expr named(Expr a, const std::string& name);
|
||||||
inline Expr input(Args ...args) {
|
|
||||||
return Expr(new InputNode(args...));
|
|
||||||
}
|
|
||||||
|
|
||||||
template <typename ...Args>
|
Expr logit(Expr a);
|
||||||
inline Expr param(Args ...args) {
|
|
||||||
return Expr(new ParamNode(args...));
|
|
||||||
}
|
|
||||||
template <typename ...Args>
|
|
||||||
inline Expr constant(Args ...args) {
|
|
||||||
return Expr(new ConstantNode(args...));
|
|
||||||
}
|
|
||||||
|
|
||||||
template <typename ...Args>
|
Expr tanh(Expr a);
|
||||||
inline Expr ones(Args ...args) {
|
|
||||||
return Expr(new ConstantNode(keywords::value=1, args...));
|
|
||||||
}
|
|
||||||
|
|
||||||
template <typename ...Args>
|
Expr log(Expr a);
|
||||||
inline Expr zeroes(Args ...args) {
|
|
||||||
return Expr(new ConstantNode(keywords::value=0, args...));
|
Expr exp(Expr a);
|
||||||
}
|
|
||||||
|
Expr operator-(Expr a);
|
||||||
|
|
||||||
/*********************************************************/
|
/*********************************************************/
|
||||||
|
|
||||||
inline Expr logit(Expr a) {
|
Expr operator+(Expr a, Expr b);
|
||||||
return Expr(new LogitNodeOp(a));
|
|
||||||
}
|
|
||||||
|
|
||||||
inline Expr tanh(Expr a) {
|
Expr operator-(Expr a, Expr b);
|
||||||
return Expr(new TanhNodeOp(a));
|
|
||||||
}
|
|
||||||
|
|
||||||
inline Expr log(Expr a) {
|
Expr operator*(Expr a, Expr b);
|
||||||
return Expr(new LogNodeOp(a));
|
|
||||||
};
|
|
||||||
|
|
||||||
inline Expr exp(Expr a) {
|
Expr operator/(Expr a, Expr b);
|
||||||
return Expr(new ExpNodeOp(a));
|
|
||||||
};
|
|
||||||
|
|
||||||
inline Expr operator-(Expr a) {
|
Expr dot(Expr a, Expr b);
|
||||||
return Expr(new NegNodeOp(a));
|
|
||||||
};
|
|
||||||
|
|
||||||
/*********************************************************/
|
|
||||||
|
|
||||||
inline Expr operator+(Expr a, Expr b) {
|
|
||||||
return Expr(new PlusNodeOp(a, b));
|
|
||||||
}
|
|
||||||
|
|
||||||
inline Expr operator-(Expr a, Expr b) {
|
|
||||||
return Expr(new MinusNodeOp(a, b));
|
|
||||||
}
|
|
||||||
|
|
||||||
inline Expr operator*(Expr a, Expr b) {
|
|
||||||
return Expr(new MultNodeOp(a, b));
|
|
||||||
}
|
|
||||||
|
|
||||||
inline Expr operator/(Expr a, Expr b) {
|
|
||||||
return Expr(new DivNodeOp(a, b));
|
|
||||||
}
|
|
||||||
|
|
||||||
inline Expr dot(Expr a, Expr b) {
|
|
||||||
return Expr(new DotNodeOp(a, b));
|
|
||||||
}
|
|
||||||
|
|
||||||
/******************************************************/
|
/******************************************************/
|
||||||
|
|
||||||
Expr broadcast(Shape bShape, Expr a) {
|
Expr broadcast(Shape bShape, Expr a);
|
||||||
const Shape& aShape = a.node()->shape();
|
|
||||||
if(aShape == bShape) {
|
|
||||||
return a;
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
size_t dimsA = aShape.size();
|
|
||||||
size_t dimsB = bShape.size();
|
|
||||||
UTIL_THROW_IF2(dimsA != dimsB,
|
|
||||||
"Tensor and shape have different number of dimensions");
|
|
||||||
for(size_t i = 0; i < dimsA; ++i) {
|
|
||||||
int dimA = aShape[i];
|
|
||||||
int dimB = bShape[i];
|
|
||||||
bool broadcastable = (dimA == dimB || dimA == 1);
|
|
||||||
UTIL_THROW_IF2(!broadcastable,
|
|
||||||
"Cannot broadcast tensor dimension "
|
|
||||||
<< dimA << " to " << dimB);
|
|
||||||
if(dimA == 1 && dimB != 1) {
|
|
||||||
if(i == 0) {
|
|
||||||
Expr one = ones(keywords::shape={bShape[0], 1});
|
|
||||||
a = dot(one, a);
|
|
||||||
}
|
|
||||||
else if(i == 1) {
|
|
||||||
Expr one = ones(keywords::shape={1, bShape[1]});
|
|
||||||
a = dot(a, one);
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
UTIL_THROW2("Not implemented");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return a;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/*********************************************************/
|
/*********************************************************/
|
||||||
|
|
||||||
@ -126,7 +47,7 @@ inline Expr sum(Expr a, Args ...args) {
|
|||||||
int rows = n->val().shape()[0];
|
int rows = n->val().shape()[0];
|
||||||
return {1, rows};
|
return {1, rows};
|
||||||
};
|
};
|
||||||
Expr one = ones(shape={1, n->shape()[0]},
|
Expr one = a.graph()->ones(shape={1, n->shape()[0]},
|
||||||
lazy_shape=lshape);
|
lazy_shape=lshape);
|
||||||
return dot(one, a);
|
return dot(one, a);
|
||||||
}
|
}
|
||||||
@ -136,8 +57,8 @@ inline Expr sum(Expr a, Args ...args) {
|
|||||||
//std::cerr << "Shape will be " << cols << " by 1." << std::endl;
|
//std::cerr << "Shape will be " << cols << " by 1." << std::endl;
|
||||||
return {cols, 1};
|
return {cols, 1};
|
||||||
};
|
};
|
||||||
Expr one = ones(shape={n->shape()[1], 1},
|
Expr one = a.graph()->ones(shape={n->shape()[1], 1},
|
||||||
lazy_shape=lshape);
|
lazy_shape=lshape);
|
||||||
return dot(a, one);
|
return dot(a, one);
|
||||||
}
|
}
|
||||||
else if(ax == 2) {
|
else if(ax == 2) {
|
||||||
@ -151,17 +72,12 @@ inline Expr sum(Expr a, Args ...args) {
|
|||||||
|
|
||||||
// inefficient
|
// inefficient
|
||||||
template <typename ...Args>
|
template <typename ...Args>
|
||||||
inline Expr softmax(Expr a, Args ...args) {
|
Expr softmax(Expr a, Args ...args) {
|
||||||
Expr e = exp(a);
|
Expr e = exp(a);
|
||||||
return e / sum(e, args...);
|
return e / sum(e, args...);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename ...Args>
|
Expr softmax_fast(Expr a);
|
||||||
inline Expr softmax_fast(Expr a, Args ...args) {
|
|
||||||
Expr e = Expr(new SoftmaxNodeOp(a, args...));
|
|
||||||
return e;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
// inefficient
|
// inefficient
|
||||||
template <typename ...Args>
|
template <typename ...Args>
|
||||||
@ -173,12 +89,12 @@ inline Expr mean(Expr a, Args ...args) {
|
|||||||
ChainPtr n = a.node();
|
ChainPtr n = a.node();
|
||||||
switch (ax) {
|
switch (ax) {
|
||||||
case 0:
|
case 0:
|
||||||
return sum(a, axis=0) / constant(shape={1, 1},
|
return sum(a, axis=0) / a.graph()->constant(shape={1, 1},
|
||||||
lazy_value=[n]() -> Float {
|
lazy_value=[n]() -> Float {
|
||||||
return n->val().shape()[0];
|
return n->val().shape()[0];
|
||||||
});
|
});
|
||||||
case 1:
|
case 1:
|
||||||
return sum(a, axis=1) / constant(shape={1, 1},
|
return sum(a, axis=1) / a.graph()->constant(shape={1, 1},
|
||||||
lazy_value=[n]() -> Float {
|
lazy_value=[n]() -> Float {
|
||||||
return n->val().shape()[1];
|
return n->val().shape()[1];
|
||||||
});
|
});
|
||||||
@ -187,7 +103,7 @@ inline Expr mean(Expr a, Args ...args) {
|
|||||||
case 3:
|
case 3:
|
||||||
UTIL_THROW2("Not implemented");
|
UTIL_THROW2("Not implemented");
|
||||||
default:
|
default:
|
||||||
return sum(a) / constant(shape={1, 1},
|
return sum(a) / a.graph()->constant(shape={1, 1},
|
||||||
lazy_value=[n]() -> Float {
|
lazy_value=[n]() -> Float {
|
||||||
return n->val().size();
|
return n->val().size();
|
||||||
});
|
});
|
||||||
|
@ -1,59 +0,0 @@
|
|||||||
#include <sstream>
|
|
||||||
#include "expressions.h"
|
|
||||||
#include "graph_operators.h"
|
|
||||||
|
|
||||||
using namespace std;
|
|
||||||
|
|
||||||
namespace marian {
|
|
||||||
|
|
||||||
Expr::Expr(Chainable<Tensor>* chainable) : pimpl_(chainable) {}
|
|
||||||
Expr::Expr(Float v) : pimpl_(new ConstantNode(keywords::value=v,
|
|
||||||
keywords::shape={1,1})) {}
|
|
||||||
|
|
||||||
Tensor Expr::val() {
|
|
||||||
return pimpl_->val();
|
|
||||||
}
|
|
||||||
|
|
||||||
Tensor Expr::grad() {
|
|
||||||
return pimpl_->grad();
|
|
||||||
}
|
|
||||||
|
|
||||||
ChainPtr Expr::node() {
|
|
||||||
return pimpl_;
|
|
||||||
}
|
|
||||||
|
|
||||||
void Expr::forward(size_t batchSize) {
|
|
||||||
UTIL_THROW_IF2(pimpl_.get() != Chainable<Tensor>::stack.back(),
|
|
||||||
"Trying to call forward on non-root of computation graph");
|
|
||||||
for(auto&& v : Chainable<Tensor>::stack) {
|
|
||||||
v->allocate(batchSize);
|
|
||||||
}
|
|
||||||
for(auto&& v : Chainable<Tensor>::stack)
|
|
||||||
v->forward();
|
|
||||||
}
|
|
||||||
|
|
||||||
void Expr::backward() {
|
|
||||||
UTIL_THROW_IF2(pimpl_.get() != Chainable<Tensor>::stack.back(),
|
|
||||||
"Trying to call backward on non-root of computation graph");
|
|
||||||
for(auto&& v : Chainable<Tensor>::stack)
|
|
||||||
v->set_zero_adjoint();
|
|
||||||
|
|
||||||
typedef typename Chainable<Tensor>::ChainableStack::reverse_iterator It;
|
|
||||||
pimpl_->init_dependent();
|
|
||||||
for(It it = Chainable<Tensor>::stack.rbegin(); it != Chainable<Tensor>::stack.rend(); ++it)
|
|
||||||
(*it)->backward();
|
|
||||||
}
|
|
||||||
|
|
||||||
Expr::operator ChainPtr() {
|
|
||||||
return pimpl_;
|
|
||||||
}
|
|
||||||
|
|
||||||
std::string Expr::Debug() const
|
|
||||||
{
|
|
||||||
stringstream strm;
|
|
||||||
const Shape &shape = pimpl_->shape();
|
|
||||||
strm << marian::Debug(shape);
|
|
||||||
return strm.str();
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
@ -1,33 +0,0 @@
|
|||||||
#pragma once
|
|
||||||
|
|
||||||
#include "definitions.h"
|
|
||||||
#include "graph.h"
|
|
||||||
|
|
||||||
namespace marian {
|
|
||||||
|
|
||||||
class Expr {
|
|
||||||
public:
|
|
||||||
Expr(Chainable<Tensor>* chainable);
|
|
||||||
Expr(Float v);
|
|
||||||
|
|
||||||
Expr operator=(Tensor t) {
|
|
||||||
pimpl_->setVal(t);
|
|
||||||
return *this;
|
|
||||||
}
|
|
||||||
|
|
||||||
Tensor val();
|
|
||||||
Tensor grad();
|
|
||||||
|
|
||||||
void forward(size_t batchSize);
|
|
||||||
void backward();
|
|
||||||
|
|
||||||
ChainPtr node();
|
|
||||||
operator ChainPtr();
|
|
||||||
|
|
||||||
std::string Debug() const;
|
|
||||||
|
|
||||||
private:
|
|
||||||
ChainPtr pimpl_;
|
|
||||||
};
|
|
||||||
|
|
||||||
}
|
|
@ -1,9 +1,7 @@
|
|||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include "definitions.h"
|
#include "definitions.h"
|
||||||
#include "graph.h"
|
#include "expression_graph.h"
|
||||||
#include "graph_operators.h"
|
|
||||||
#include "expressions.h"
|
|
||||||
#include "expression_operators.h"
|
|
||||||
#include "param_initializers.h"
|
#include "param_initializers.h"
|
||||||
|
#include "expression_operators.h"
|
||||||
|
|
||||||
|
@ -2,36 +2,10 @@
|
|||||||
|
|
||||||
#include "keywords.h"
|
#include "keywords.h"
|
||||||
#include "tensor.h"
|
#include "tensor.h"
|
||||||
|
#include "chainable.h"
|
||||||
|
|
||||||
namespace marian {
|
namespace marian {
|
||||||
|
|
||||||
template <class DataType>
|
|
||||||
struct Chainable {
|
|
||||||
Chainable() { }
|
|
||||||
virtual ~Chainable() { }
|
|
||||||
virtual void forward() { }
|
|
||||||
virtual void backward() { }
|
|
||||||
virtual void init_dependent() { }
|
|
||||||
virtual void set_zero_adjoint() { }
|
|
||||||
|
|
||||||
virtual void allocate(size_t) = 0;
|
|
||||||
|
|
||||||
virtual const Shape& shape() = 0;
|
|
||||||
virtual DataType &val() = 0;
|
|
||||||
virtual DataType grad() = 0;
|
|
||||||
virtual void setVal(DataType t) {
|
|
||||||
UTIL_THROW2("Tensors can only be assigned to input nodes");
|
|
||||||
};
|
|
||||||
|
|
||||||
typedef std::vector<Chainable<DataType>*> ChainableStack;
|
|
||||||
static ChainableStack stack;
|
|
||||||
};
|
|
||||||
|
|
||||||
template <class DataType>
|
|
||||||
typename Chainable<DataType>::ChainableStack Chainable<DataType>::stack;
|
|
||||||
|
|
||||||
typedef std::shared_ptr<Chainable<Tensor>> ChainPtr;
|
|
||||||
|
|
||||||
class Node : public Chainable<Tensor>,
|
class Node : public Chainable<Tensor>,
|
||||||
public keywords::Keywords {
|
public keywords::Keywords {
|
||||||
public:
|
public:
|
||||||
@ -40,9 +14,7 @@ class Node : public Chainable<Tensor>,
|
|||||||
: Keywords(args...),
|
: Keywords(args...),
|
||||||
shape_(Get<Shape>(keywords::shape, {1, 1})),
|
shape_(Get<Shape>(keywords::shape, {1, 1})),
|
||||||
name_(Get<std::string>(keywords::name, "none"))
|
name_(Get<std::string>(keywords::name, "none"))
|
||||||
{
|
{ }
|
||||||
stack.push_back(this);
|
|
||||||
}
|
|
||||||
|
|
||||||
virtual ~Node() {};
|
virtual ~Node() {};
|
||||||
|
|
@ -1,7 +1,6 @@
|
|||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include "expressions.h"
|
#include "node.h"
|
||||||
#include "graph.h"
|
|
||||||
#include "tensor_operators.h"
|
#include "tensor_operators.h"
|
||||||
|
|
||||||
namespace marian {
|
namespace marian {
|
||||||
@ -108,49 +107,14 @@ struct TanhNodeOp : public UnaryNodeOp {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
struct ArgmaxOp : public UnaryNodeOp {
|
|
||||||
template <typename ...Args>
|
|
||||||
ArgmaxOp(ChainPtr a, Args ...args)
|
|
||||||
: UnaryNodeOp(a, keywords::shape=newShape(a, -1), args...),
|
|
||||||
axis_(-1) { }
|
|
||||||
|
|
||||||
Shape newShape(ChainPtr a, int axis) {
|
|
||||||
Shape shape1 = a->shape();
|
|
||||||
UTIL_THROW_IF2(shape1.size() > 2,
|
|
||||||
"Tensors with more than 2 dimensions not supported yet");
|
|
||||||
if(axis == 0) {
|
|
||||||
shape1[0] = 1;
|
|
||||||
}
|
|
||||||
else if(axis == 1) {
|
|
||||||
shape1[1] = 1;
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
shape1 = {1, 1};
|
|
||||||
}
|
|
||||||
return shape1;
|
|
||||||
}
|
|
||||||
|
|
||||||
void forward() {
|
|
||||||
//val_ = Argmax(a_->val(), axis_);
|
|
||||||
UTIL_THROW2("Not implemented");
|
|
||||||
}
|
|
||||||
|
|
||||||
void backward() {
|
|
||||||
UTIL_THROW2("Not implemented");
|
|
||||||
}
|
|
||||||
|
|
||||||
private:
|
|
||||||
int axis_;
|
|
||||||
};
|
|
||||||
|
|
||||||
// @TODO, make this numerically safe(r):
|
// @TODO, make this numerically safe(r):
|
||||||
// softmax(X) = softmax_safe(X - max(X, axis=1))
|
// softmax(X) = softmax_safe(X - max(X, axis=1))
|
||||||
// Probably best to do this directly in Softmax
|
// Probably best to do this directly in Softmax
|
||||||
// function.
|
// function.
|
||||||
struct SoftmaxNodeOp : public UnaryNodeOp {
|
struct SoftmaxNodeOp : public UnaryNodeOp {
|
||||||
template <typename ...Args>
|
template <typename ...Args>
|
||||||
SoftmaxNodeOp(ChainPtr a, Args ...args)
|
SoftmaxNodeOp(Args ...args)
|
||||||
: UnaryNodeOp(a, args...) { }
|
: UnaryNodeOp(args...) { }
|
||||||
|
|
||||||
void forward() {
|
void forward() {
|
||||||
// B = softmax(A).
|
// B = softmax(A).
|
||||||
@ -171,8 +135,8 @@ struct SoftmaxNodeOp : public UnaryNodeOp {
|
|||||||
|
|
||||||
struct LogNodeOp : public UnaryNodeOp {
|
struct LogNodeOp : public UnaryNodeOp {
|
||||||
template <typename ...Args>
|
template <typename ...Args>
|
||||||
LogNodeOp(ChainPtr a, Args ...args)
|
LogNodeOp(Args ...args)
|
||||||
: UnaryNodeOp(a, args...) {}
|
: UnaryNodeOp(args...) {}
|
||||||
|
|
||||||
void forward() {
|
void forward() {
|
||||||
Element(_1 = Log(_2), val_, a_->val());
|
Element(_1 = Log(_2), val_, a_->val());
|
||||||
@ -186,8 +150,8 @@ struct LogNodeOp : public UnaryNodeOp {
|
|||||||
|
|
||||||
struct ExpNodeOp : public UnaryNodeOp {
|
struct ExpNodeOp : public UnaryNodeOp {
|
||||||
template <typename ...Args>
|
template <typename ...Args>
|
||||||
ExpNodeOp(ChainPtr a, Args ...args)
|
ExpNodeOp(Args ...args)
|
||||||
: UnaryNodeOp(a, args...) { }
|
: UnaryNodeOp(args...) { }
|
||||||
|
|
||||||
void forward() {
|
void forward() {
|
||||||
Element(_1 = Exp(_2), val_, a_->val());
|
Element(_1 = Exp(_2), val_, a_->val());
|
||||||
@ -230,7 +194,7 @@ struct DotNodeOp : public BinaryNodeOp {
|
|||||||
template <typename ...Args>
|
template <typename ...Args>
|
||||||
DotNodeOp(ChainPtr a, ChainPtr b, Args ...args)
|
DotNodeOp(ChainPtr a, ChainPtr b, Args ...args)
|
||||||
: BinaryNodeOp(a, b,
|
: BinaryNodeOp(a, b,
|
||||||
keywords::shape=newShape(a,b),
|
keywords::shape=newShape(a, b),
|
||||||
args...) { }
|
args...) { }
|
||||||
|
|
||||||
Shape newShape(ChainPtr a, ChainPtr b) {
|
Shape newShape(ChainPtr a, ChainPtr b) {
|
||||||
@ -258,41 +222,11 @@ struct DotNodeOp : public BinaryNodeOp {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
Expr broadcast(Shape shape, Expr a);
|
struct PlusNodeOp : public BinaryNodeOp {
|
||||||
|
|
||||||
struct BroadcastingNodeOp : public BinaryNodeOp {
|
|
||||||
template <typename ...Args>
|
template <typename ...Args>
|
||||||
BroadcastingNodeOp(Expr a, Expr b, Args ...args)
|
PlusNodeOp(ChainPtr a, ChainPtr b, Args ...args)
|
||||||
: BinaryNodeOp(broadcast(newShape(a ,b), a),
|
: BinaryNodeOp(a, b, keywords::shape=a->shape(), args...) { }
|
||||||
broadcast(newShape(a ,b), b),
|
|
||||||
keywords::shape=newShape(a, b),
|
|
||||||
args...) {}
|
|
||||||
|
|
||||||
static Shape newShape(ChainPtr a, ChainPtr b) {
|
|
||||||
size_t dimsA = a->shape().size();
|
|
||||||
size_t dimsB = b->shape().size();
|
|
||||||
UTIL_THROW_IF2(dimsA != dimsB,
|
|
||||||
"Tensors have different numbers of dimensions");
|
|
||||||
Shape shape(dimsA);
|
|
||||||
for(size_t i = 0; i < dimsA; ++i) {
|
|
||||||
int dimA = a->shape()[i];
|
|
||||||
int dimB = b->shape()[i];
|
|
||||||
bool broadcastable = (dimA == dimB || dimA == 1 || dimB == 1);
|
|
||||||
UTIL_THROW_IF2(!broadcastable, "Different dimensions in elementwise "
|
|
||||||
<< "operation cannot be broadcasted: " << dimA << " != " << dimB);
|
|
||||||
shape[i] = std::max(dimA, dimB);
|
|
||||||
if(dimA == whatevs || dimB == whatevs)
|
|
||||||
shape[i] = whatevs;
|
|
||||||
}
|
|
||||||
return shape;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
|
|
||||||
struct PlusNodeOp : public BroadcastingNodeOp {
|
|
||||||
template <typename ...Args>
|
|
||||||
PlusNodeOp(Args ...args) : BroadcastingNodeOp(args...) { }
|
|
||||||
|
|
||||||
void forward() {
|
void forward() {
|
||||||
Element(_1 = _2 + _3,
|
Element(_1 = _2 + _3,
|
||||||
val_, a_->val(), b_->val());
|
val_, a_->val(), b_->val());
|
||||||
@ -306,10 +240,11 @@ struct PlusNodeOp : public BroadcastingNodeOp {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
struct MinusNodeOp : public BroadcastingNodeOp {
|
struct MinusNodeOp : public BinaryNodeOp {
|
||||||
template <typename ...Args>
|
template <typename ...Args>
|
||||||
MinusNodeOp(Args ...args) : BroadcastingNodeOp(args...) { }
|
MinusNodeOp(ChainPtr a, ChainPtr b, Args ...args)
|
||||||
|
: BinaryNodeOp(a, b, keywords::shape=a->shape(), args...) { }
|
||||||
|
|
||||||
void forward() {
|
void forward() {
|
||||||
Element(_1 = _2 - _3,
|
Element(_1 = _2 - _3,
|
||||||
val_, a_->val(), b_->val());
|
val_, a_->val(), b_->val());
|
||||||
@ -323,10 +258,11 @@ struct MinusNodeOp : public BroadcastingNodeOp {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
struct MultNodeOp : public BroadcastingNodeOp {
|
struct MultNodeOp : public BinaryNodeOp {
|
||||||
template <typename ...Args>
|
template <typename ...Args>
|
||||||
MultNodeOp(Args ...args) : BroadcastingNodeOp(args...) { }
|
MultNodeOp(ChainPtr a, ChainPtr b, Args ...args)
|
||||||
|
: BinaryNodeOp(a, b, keywords::shape=a->shape(), args...) { }
|
||||||
|
|
||||||
void forward() {
|
void forward() {
|
||||||
Element(_1 = _2 * _3,
|
Element(_1 = _2 * _3,
|
||||||
val_, a_->val(), b_->val());
|
val_, a_->val(), b_->val());
|
||||||
@ -340,9 +276,10 @@ struct MultNodeOp : public BroadcastingNodeOp {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
struct DivNodeOp : public BroadcastingNodeOp {
|
struct DivNodeOp : public BinaryNodeOp {
|
||||||
template <typename ...Args>
|
template <typename ...Args>
|
||||||
DivNodeOp(Args ...args) : BroadcastingNodeOp(args...) { }
|
DivNodeOp(ChainPtr a, ChainPtr b, Args ...args)
|
||||||
|
: BinaryNodeOp(a, b, keywords::shape=a->shape(), args...) { }
|
||||||
|
|
||||||
void forward() {
|
void forward() {
|
||||||
Element(_1 = _2 / _3,
|
Element(_1 = _2 / _3,
|
@ -18,7 +18,7 @@ void ones(Tensor t) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
template <class Distribution>
|
template <class Distribution>
|
||||||
void distribution(Tensor t, float a=0.0, float b=0.1) {
|
void distribution(Tensor t, float a, float b) {
|
||||||
std::random_device device;
|
std::random_device device;
|
||||||
std::default_random_engine engine(device());
|
std::default_random_engine engine(device());
|
||||||
Distribution dist(a, b);
|
Distribution dist(a, b);
|
||||||
@ -43,7 +43,7 @@ std::function<void(Tensor)> uniform(float a = 0.0, float b = 0.1) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
std::function<void(Tensor)> from_vector(const std::vector<float>& v) {
|
std::function<void(Tensor)> from_vector(const std::vector<float>& v) {
|
||||||
return [&v](Tensor t) {
|
return [v](Tensor t) {
|
||||||
t << v;
|
t << v;
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
20
src/sgd.cu
20
src/sgd.cu
@ -7,15 +7,11 @@
|
|||||||
using namespace std;
|
using namespace std;
|
||||||
|
|
||||||
namespace marian {
|
namespace marian {
|
||||||
SGD::SGD(Expr& cost_func, Expr& inX, Expr& inY,
|
SGD::SGD(ExpressionGraph& g, float eta,
|
||||||
const std::vector<Expr*> params, float eta,
|
|
||||||
std::vector<float>& xData, size_t numFeatures,
|
std::vector<float>& xData, size_t numFeatures,
|
||||||
std::vector<float>& yData, size_t numClasses,
|
std::vector<float>& yData, size_t numClasses,
|
||||||
size_t epochs, size_t batchSize)
|
size_t epochs, size_t batchSize)
|
||||||
: cost_function_(&cost_func),
|
: graph_(g),
|
||||||
inX_(&inX),
|
|
||||||
inY_(&inY),
|
|
||||||
params_(params),
|
|
||||||
eta_(eta),
|
eta_(eta),
|
||||||
xData_(xData),
|
xData_(xData),
|
||||||
numFeatures_(numFeatures),
|
numFeatures_(numFeatures),
|
||||||
@ -45,11 +41,11 @@ void SGD::Run()
|
|||||||
size_t endId = startId + batchSize;
|
size_t endId = startId + batchSize;
|
||||||
|
|
||||||
PrepareBatch(startId, endId, batchSize, shuffle, xt, yt);
|
PrepareBatch(startId, endId, batchSize, shuffle, xt, yt);
|
||||||
*inX_ = xt;
|
graph_["x"] = xt;
|
||||||
*inY_ = yt;
|
graph_["y"] = yt;
|
||||||
|
|
||||||
cost_function_->forward(maxBatchSize_);
|
graph_.forward(maxBatchSize_);
|
||||||
cost_function_->backward();
|
graph_.backward();
|
||||||
|
|
||||||
UpdateModel();
|
UpdateModel();
|
||||||
|
|
||||||
@ -136,9 +132,9 @@ void SGD::PrepareBatch(
|
|||||||
}
|
}
|
||||||
|
|
||||||
void SGD::UpdateModel() {
|
void SGD::UpdateModel() {
|
||||||
for (auto& param : params_) {
|
for (auto& param : graph_.params()) {
|
||||||
using namespace thrust::placeholders;
|
using namespace thrust::placeholders;
|
||||||
Element(_1 = _1 - eta_ * _2, param->val(), param->grad());
|
Element(_1 -= eta_ * _2, param.val(), param.grad());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
10
src/sgd.h
10
src/sgd.h
@ -3,7 +3,7 @@
|
|||||||
#include <memory>
|
#include <memory>
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
|
|
||||||
#include "expressions.h"
|
#include "expression_graph.h"
|
||||||
#include "thrust_functions.h"
|
#include "thrust_functions.h"
|
||||||
#include "tensor_operators.h"
|
#include "tensor_operators.h"
|
||||||
|
|
||||||
@ -11,8 +11,7 @@ namespace marian {
|
|||||||
|
|
||||||
class SGD {
|
class SGD {
|
||||||
public:
|
public:
|
||||||
SGD(Expr& cost_func, Expr& inX, Expr& inY,
|
SGD(ExpressionGraph& g, float eta,
|
||||||
const std::vector<Expr*> params, float eta,
|
|
||||||
std::vector<float>& xData, size_t numFeatures,
|
std::vector<float>& xData, size_t numFeatures,
|
||||||
std::vector<float>& yData, size_t numClasses,
|
std::vector<float>& yData, size_t numClasses,
|
||||||
size_t epochs, size_t batchSize);
|
size_t epochs, size_t batchSize);
|
||||||
@ -20,10 +19,7 @@ class SGD {
|
|||||||
void Run();
|
void Run();
|
||||||
|
|
||||||
private:
|
private:
|
||||||
Expr *cost_function_;
|
ExpressionGraph& graph_;
|
||||||
Expr *inX_;
|
|
||||||
Expr *inY_;
|
|
||||||
std::vector<Expr*> params_;
|
|
||||||
const float eta_;
|
const float eta_;
|
||||||
std::vector<float>& xData_;
|
std::vector<float>& xData_;
|
||||||
const size_t numFeatures_;
|
const size_t numFeatures_;
|
||||||
|
75
src/test.cu
75
src/test.cu
@ -1,55 +1,70 @@
|
|||||||
|
#include <fstream>
|
||||||
#include "marian.h"
|
#include "marian.h"
|
||||||
#include "mnist.h"
|
#include "mnist.h"
|
||||||
|
#include "vocab.h"
|
||||||
|
|
||||||
int main(int argc, char** argv) {
|
int main(int argc, char** argv) {
|
||||||
cudaSetDevice(0);
|
cudaSetDevice(0);
|
||||||
|
|
||||||
|
using namespace std;
|
||||||
using namespace marian;
|
using namespace marian;
|
||||||
using namespace keywords;
|
using namespace keywords;
|
||||||
|
|
||||||
|
Vocab sourceVocab, targetVocab;
|
||||||
|
|
||||||
int input_size = 10;
|
int input_size = 10;
|
||||||
int output_size = 2;
|
int output_size = 2;
|
||||||
int batch_size = 25;
|
int batch_size = 25;
|
||||||
int hidden_size = 5;
|
int hidden_size = 5;
|
||||||
int num_inputs = 8;
|
int num_inputs = 8;
|
||||||
|
|
||||||
std::vector<Expr*> X(num_inputs);
|
std::vector<Expr> X;
|
||||||
std::vector<Expr*> Y(num_inputs);
|
std::vector<Expr> Y;
|
||||||
std::vector<Expr*> H(num_inputs);
|
std::vector<Expr> H;
|
||||||
|
|
||||||
|
ExpressionGraph g;
|
||||||
|
|
||||||
for (int t = 0; t < num_inputs; ++t) {
|
for (int t = 0; t < num_inputs; ++t) {
|
||||||
X[t] = new Expr(input(shape={batch_size, input_size}));
|
X.emplace_back(g.input(shape={batch_size, input_size}));
|
||||||
Y[t] = new Expr(input(shape={batch_size, output_size}));
|
Y.emplace_back(g.input(shape={batch_size, output_size}));
|
||||||
}
|
}
|
||||||
|
|
||||||
Expr Wxh = param(shape={input_size, hidden_size}, init=uniform(), name="Wxh");
|
Expr Wxh = g.param(shape={input_size, hidden_size}, init=uniform(), name="Wxh");
|
||||||
Expr Whh = param(shape={hidden_size, hidden_size}, init=uniform(), name="Whh");
|
Expr Whh = g.param(shape={hidden_size, hidden_size}, init=uniform(), name="Whh");
|
||||||
Expr bh = param(shape={1, hidden_size}, init=uniform(), name="bh");
|
Expr bh = g.param(shape={1, hidden_size}, init=uniform(), name="bh");
|
||||||
Expr h0 = param(shape={1, hidden_size}, init=uniform(), name="h0");
|
Expr h0 = g.param(shape={1, hidden_size}, init=uniform(), name="h0");
|
||||||
|
|
||||||
|
// read parallel corpus from file
|
||||||
|
std::fstream sourceFile("../examples/mt/dev/newstest2013.de");
|
||||||
|
std::fstream targetFile("../examples/mt/dev/newstest2013.en");
|
||||||
|
|
||||||
|
string sourceLine, targetLine;
|
||||||
|
while (getline(sourceFile, sourceLine)) {
|
||||||
|
getline(targetFile, targetLine);
|
||||||
|
|
||||||
|
std::vector<size_t> sourceIds = sourceVocab.ProcessSentence(sourceLine);
|
||||||
|
std::vector<size_t> targetIds = sourceVocab.ProcessSentence(targetLine);
|
||||||
|
}
|
||||||
|
|
||||||
std::cerr << "Building RNN..." << std::endl;
|
std::cerr << "Building RNN..." << std::endl;
|
||||||
H[0] = new Expr(tanh(dot(*X[0], Wxh) + dot(h0, Whh) + bh));
|
H.emplace_back(tanh(dot(X[0], Wxh) + dot(h0, Whh) + bh));
|
||||||
for (int t = 1; t < num_inputs; ++t) {
|
for (int t = 1; t < num_inputs; ++t) {
|
||||||
H[t] = new Expr(tanh(dot(*X[t], Wxh) + dot(*H[t-1], Whh) + bh));
|
H.emplace_back(tanh(dot(X[t], Wxh) + dot(H[t-1], Whh) + bh));
|
||||||
}
|
}
|
||||||
|
|
||||||
Expr Why = param(shape={hidden_size, output_size}, init=uniform(), name="Why");
|
Expr Why = g.param(shape={hidden_size, output_size}, init=uniform(), name="Why");
|
||||||
Expr by = param(shape={1, output_size}, init=uniform(), name="by");
|
Expr by = g.param(shape={1, output_size}, init=uniform(), name="by");
|
||||||
|
|
||||||
std::cerr << "Building output layer..." << std::endl;
|
std::cerr << "Building output layer..." << std::endl;
|
||||||
std::vector<Expr*> Yp(num_inputs);
|
std::vector<Expr> Yp;
|
||||||
|
|
||||||
Expr* cross_entropy = NULL;
|
Yp.emplace_back(softmax_fast(dot(H[0], Why) + by));
|
||||||
for (int t = 0; t < num_inputs; ++t) {
|
Expr cross_entropy = sum(Y[0] * log(Yp[0]), axis=1);
|
||||||
Yp[t] = new Expr(softmax_fast(dot(*H[t], Why) + by, name="pred"));
|
for (int t = 1; t < num_inputs; ++t) {
|
||||||
if (!cross_entropy) {
|
Yp.emplace_back(softmax_fast(dot(H[t], Why) + by));
|
||||||
cross_entropy = new Expr(sum(*Y[t] * log(*Yp[t]), axis=1));
|
cross_entropy = cross_entropy + sum(Y[t] * log(Yp[t]), axis=1);
|
||||||
} else {
|
|
||||||
*cross_entropy = *cross_entropy + sum(*Y[t] * log(*Yp[t]), axis=1);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
auto graph = -mean(*cross_entropy, axis=0, name="cost");
|
auto graph = -mean(cross_entropy, axis=0, name="cost");
|
||||||
|
|
||||||
for (int t = 0; t < num_inputs; ++t) {
|
for (int t = 0; t < num_inputs; ++t) {
|
||||||
Tensor Xt({batch_size, input_size});
|
Tensor Xt({batch_size, input_size});
|
||||||
@ -72,17 +87,17 @@ int main(int argc, char** argv) {
|
|||||||
thrust::copy(values.begin(), values.end(), Xt.begin());
|
thrust::copy(values.begin(), values.end(), Xt.begin());
|
||||||
thrust::copy(classes.begin(), classes.end(), Yt.begin());
|
thrust::copy(classes.begin(), classes.end(), Yt.begin());
|
||||||
|
|
||||||
*X[t] = Xt;
|
X[t] = Xt;
|
||||||
*Y[t] = Yt;
|
Y[t] = Yt;
|
||||||
}
|
}
|
||||||
|
|
||||||
graph.forward(batch_size);
|
g.forward(batch_size);
|
||||||
graph.backward();
|
g.backward();
|
||||||
|
|
||||||
std::cerr << graph.val().Debug() << std::endl;
|
std::cerr << graph.val().Debug() << std::endl;
|
||||||
|
|
||||||
std::cerr << X[0]->val().Debug() << std::endl;
|
std::cerr << X[0].val().Debug() << std::endl;
|
||||||
std::cerr << Y[0]->val().Debug() << std::endl;
|
std::cerr << Y[0].val().Debug() << std::endl;
|
||||||
|
|
||||||
std::cerr << Whh.grad().Debug() << std::endl;
|
std::cerr << Whh.grad().Debug() << std::endl;
|
||||||
std::cerr << bh.grad().Debug() << std::endl;
|
std::cerr << bh.grad().Debug() << std::endl;
|
||||||
|
@ -16,22 +16,24 @@ int main(int argc, char** argv) {
|
|||||||
using namespace marian;
|
using namespace marian;
|
||||||
using namespace keywords;
|
using namespace keywords;
|
||||||
|
|
||||||
Expr x = input(shape={whatevs, IMAGE_SIZE}, name="X");
|
ExpressionGraph g;
|
||||||
Expr y = input(shape={whatevs, LABEL_SIZE}, name="Y");
|
|
||||||
|
Expr x = named(g.input(shape={whatevs, IMAGE_SIZE}), "x");
|
||||||
|
Expr y = named(g.input(shape={whatevs, LABEL_SIZE}), "y");
|
||||||
|
|
||||||
Expr w = param(shape={IMAGE_SIZE, LABEL_SIZE}, name="W0");
|
Expr w = named(g.param(shape={IMAGE_SIZE, LABEL_SIZE}), "w");
|
||||||
Expr b = param(shape={1, LABEL_SIZE}, name="b0");
|
Expr b = named(g.param(shape={1, LABEL_SIZE}), "b");
|
||||||
|
|
||||||
std::vector<Expr*> params;
|
std::vector<Expr*> params;
|
||||||
params.push_back(&w);
|
params.push_back(&w);
|
||||||
params.push_back(&b);
|
params.push_back(&b);
|
||||||
|
|
||||||
auto scores = dot(x, w) + b;
|
auto scores = dot(x, w) + b;
|
||||||
auto lr = softmax_fast(scores, axis=1, name="pred");
|
auto lr = softmax_fast(scores);
|
||||||
auto cost = -mean(sum(y * log(lr), axis=1), axis=0, name="cost");
|
auto cost = named(-mean(sum(y * log(lr), axis=1), axis=0), "cost");
|
||||||
cerr << "lr=" << lr.Debug() << endl;
|
cerr << "lr=" << lr.Debug() << endl;
|
||||||
|
|
||||||
SGD opt(cost, x, y, params, 0.9, trainImages, IMAGE_SIZE, trainLabels, LABEL_SIZE, 3, 24);
|
SGD opt(g, 0.9, trainImages, IMAGE_SIZE, trainLabels, LABEL_SIZE, 3, 24);
|
||||||
opt.Run();
|
opt.Run();
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
@ -2,24 +2,15 @@
|
|||||||
#include "marian.h"
|
#include "marian.h"
|
||||||
#include "mnist.h"
|
#include "mnist.h"
|
||||||
#include "npz_converter.h"
|
#include "npz_converter.h"
|
||||||
#include "param_initializers.h"
|
|
||||||
|
|
||||||
using namespace marian;
|
using namespace marian;
|
||||||
using namespace keywords;
|
using namespace keywords;
|
||||||
|
|
||||||
int main(int argc, char** argv) {
|
const size_t IMAGE_SIZE = 784;
|
||||||
|
const size_t LABEL_SIZE = 10;
|
||||||
cudaSetDevice(1);
|
int BATCH_SIZE = 10000;
|
||||||
|
|
||||||
const size_t IMAGE_SIZE = 784;
|
|
||||||
const size_t LABEL_SIZE = 10;
|
|
||||||
int BATCH_SIZE = 10000;
|
|
||||||
|
|
||||||
std::cerr << "Loading test set...";
|
|
||||||
std::vector<float> testImages = datasets::mnist::ReadImages("../examples/mnist/t10k-images-idx3-ubyte", BATCH_SIZE, IMAGE_SIZE);
|
|
||||||
std::vector<float> testLabels = datasets::mnist::ReadLabels("../examples/mnist/t10k-labels-idx1-ubyte", BATCH_SIZE, LABEL_SIZE);
|
|
||||||
std::cerr << "Done." << std::endl;
|
|
||||||
|
|
||||||
|
ExpressionGraph build_graph() {
|
||||||
std::cerr << "Loading model params...";
|
std::cerr << "Loading model params...";
|
||||||
NpzConverter converter("../scripts/test_model_single/model.npz");
|
NpzConverter converter("../scripts/test_model_single/model.npz");
|
||||||
|
|
||||||
@ -31,29 +22,50 @@ int main(int argc, char** argv) {
|
|||||||
|
|
||||||
std::cerr << "Building model...";
|
std::cerr << "Building model...";
|
||||||
|
|
||||||
auto x = input(shape={whatevs, IMAGE_SIZE});
|
ExpressionGraph g;
|
||||||
auto y = input(shape={whatevs, LABEL_SIZE});
|
auto x = named(g.input(shape={whatevs, IMAGE_SIZE}), "x");
|
||||||
|
auto y = named(g.input(shape={whatevs, LABEL_SIZE}), "y");
|
||||||
|
|
||||||
auto w = param(shape={IMAGE_SIZE, LABEL_SIZE},
|
auto w = named(g.param(shape={IMAGE_SIZE, LABEL_SIZE},
|
||||||
init=from_vector(wData));
|
init=from_vector(wData)), "w");
|
||||||
auto b = param(shape={1, LABEL_SIZE},
|
auto b = named(g.param(shape={1, LABEL_SIZE},
|
||||||
init=from_vector(bData));
|
init=from_vector(bData)), "b");
|
||||||
|
|
||||||
auto probs = softmax_fast(dot(x, w) + b, axis=1);
|
auto probs = named(
|
||||||
auto cost = -mean(sum(y * log(probs), axis=1), axis=0);
|
softmax_fast(dot(x, w) + b), //, axis=1),
|
||||||
|
"probs"
|
||||||
|
);
|
||||||
|
|
||||||
|
auto cost = named(
|
||||||
|
-mean(sum(y * log(probs), axis=1), axis=0),
|
||||||
|
"cost"
|
||||||
|
);
|
||||||
|
|
||||||
std::cerr << "Done." << std::endl;
|
std::cerr << "Done." << std::endl;
|
||||||
|
return g;
|
||||||
|
}
|
||||||
|
|
||||||
|
int main(int argc, char** argv) {
|
||||||
|
|
||||||
|
cudaSetDevice(1);
|
||||||
|
|
||||||
|
std::cerr << "Loading test set...";
|
||||||
|
std::vector<float> testImages = datasets::mnist::ReadImages("../examples/mnist/t10k-images-idx3-ubyte", BATCH_SIZE, IMAGE_SIZE);
|
||||||
|
std::vector<float> testLabels = datasets::mnist::ReadLabels("../examples/mnist/t10k-labels-idx1-ubyte", BATCH_SIZE, LABEL_SIZE);
|
||||||
|
std::cerr << "Done." << std::endl;
|
||||||
|
|
||||||
|
ExpressionGraph g = build_graph();
|
||||||
|
|
||||||
Tensor xt({BATCH_SIZE, IMAGE_SIZE});
|
Tensor xt({BATCH_SIZE, IMAGE_SIZE});
|
||||||
Tensor yt({BATCH_SIZE, LABEL_SIZE});
|
Tensor yt({BATCH_SIZE, LABEL_SIZE});
|
||||||
|
|
||||||
x = xt << testImages;
|
g["x"] = (xt << testImages);
|
||||||
y = yt << testLabels;
|
g["y"] = (yt << testLabels);
|
||||||
|
|
||||||
cost.forward(BATCH_SIZE);
|
g.forward(BATCH_SIZE);
|
||||||
|
|
||||||
std::vector<float> results;
|
std::vector<float> results;
|
||||||
results << probs.val();
|
results << g["probs"].val();
|
||||||
|
|
||||||
size_t acc = 0;
|
size_t acc = 0;
|
||||||
for (size_t i = 0; i < testLabels.size(); i += LABEL_SIZE) {
|
for (size_t i = 0; i < testLabels.size(); i += LABEL_SIZE) {
|
||||||
@ -65,22 +77,22 @@ int main(int argc, char** argv) {
|
|||||||
}
|
}
|
||||||
acc += (correct == proposed);
|
acc += (correct == proposed);
|
||||||
}
|
}
|
||||||
std::cerr << "Cost: " << cost.val()[0] << " - Accuracy: " << float(acc) / BATCH_SIZE << std::endl;
|
std::cerr << "Cost: " << g["cost"].val()[0] << " - Accuracy: " << float(acc) / BATCH_SIZE << std::endl;
|
||||||
|
|
||||||
float eta = 0.1;
|
float eta = 0.1;
|
||||||
for (size_t j = 0; j < 10; ++j) {
|
for (size_t j = 0; j < 10; ++j) {
|
||||||
for(size_t i = 0; i < 60; ++i) {
|
for(size_t i = 0; i < 60; ++i) {
|
||||||
cost.backward();
|
g.backward();
|
||||||
|
|
||||||
auto update_rule = _1 -= eta * _2;
|
auto update_rule = _1 -= eta * _2;
|
||||||
Element(update_rule, w.val(), w.grad());
|
for(auto param : g.params())
|
||||||
Element(update_rule, b.val(), b.grad());
|
Element(update_rule, param.val(), param.grad());
|
||||||
|
|
||||||
cost.forward(BATCH_SIZE);
|
g.forward(BATCH_SIZE);
|
||||||
}
|
}
|
||||||
std::cerr << "Epoch: " << j << std::endl;
|
std::cerr << "Epoch: " << j << std::endl;
|
||||||
std::vector<float> results;
|
std::vector<float> results;
|
||||||
results << probs.val();
|
results << g["probs"].val();
|
||||||
|
|
||||||
size_t acc = 0;
|
size_t acc = 0;
|
||||||
for (size_t i = 0; i < testLabels.size(); i += LABEL_SIZE) {
|
for (size_t i = 0; i < testLabels.size(); i += LABEL_SIZE) {
|
||||||
@ -92,7 +104,7 @@ int main(int argc, char** argv) {
|
|||||||
}
|
}
|
||||||
acc += (correct == proposed);
|
acc += (correct == proposed);
|
||||||
}
|
}
|
||||||
std::cerr << "Cost: " << cost.val()[0] << " - Accuracy: " << float(acc) / BATCH_SIZE << std::endl;
|
std::cerr << "Cost: " << g["cost"].val()[0] << " - Accuracy: " << float(acc) / BATCH_SIZE << std::endl;
|
||||||
}
|
}
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
@ -59,13 +59,15 @@ int main(int argc, char** argv) {
|
|||||||
std::cerr << "\tDone." << std::endl;
|
std::cerr << "\tDone." << std::endl;
|
||||||
|
|
||||||
|
|
||||||
auto x = input(shape={whatevs, IMAGE_SIZE}, name="X");
|
ExpressionGraph g;
|
||||||
auto y = input(shape={whatevs, LABEL_SIZE}, name="Y");
|
|
||||||
|
|
||||||
auto w1 = param(shape={IMAGE_SIZE, 100}, name="W0", init=initW1);
|
auto x = g.input(shape={whatevs, IMAGE_SIZE}, name="X");
|
||||||
auto b1 = param(shape={1, 100}, name="b0", init=initB1);
|
auto y = g.input(shape={whatevs, LABEL_SIZE}, name="Y");
|
||||||
auto w2 = param(shape={100, LABEL_SIZE}, name="W1", init=initW2);
|
|
||||||
auto b2 = param(shape={1, LABEL_SIZE}, name="b1", init=initB2);
|
auto w1 = g.param(shape={IMAGE_SIZE, 100}, name="W0", init=initW1);
|
||||||
|
auto b1 = g.param(shape={1, 100}, name="b0", init=initB1);
|
||||||
|
auto w2 = g.param(shape={100, LABEL_SIZE}, name="W1", init=initW2);
|
||||||
|
auto b2 = g.param(shape={1, LABEL_SIZE}, name="b1", init=initB2);
|
||||||
|
|
||||||
std::cerr << "Building model...";
|
std::cerr << "Building model...";
|
||||||
auto layer1 = tanh(dot(x, w1) + b1);
|
auto layer1 = tanh(dot(x, w1) + b1);
|
||||||
@ -86,7 +88,7 @@ int main(int argc, char** argv) {
|
|||||||
xt << tmp;
|
xt << tmp;
|
||||||
x = xt;
|
x = xt;
|
||||||
|
|
||||||
predict.forward(BATCH_SIZE);
|
g.forward(BATCH_SIZE);
|
||||||
|
|
||||||
std::vector<float> results(LABEL_SIZE * BATCH_SIZE);
|
std::vector<float> results(LABEL_SIZE * BATCH_SIZE);
|
||||||
results << predict.val();
|
results << predict.val();
|
||||||
@ -113,7 +115,7 @@ int main(int argc, char** argv) {
|
|||||||
xt << tmp;
|
xt << tmp;
|
||||||
x = xt;
|
x = xt;
|
||||||
|
|
||||||
predict.forward(endId - startId);
|
g.forward(endId - startId);
|
||||||
|
|
||||||
std::vector<float> results(LABEL_SIZE * BATCH_SIZE);
|
std::vector<float> results(LABEL_SIZE * BATCH_SIZE);
|
||||||
results << predict.val();
|
results << predict.val();
|
||||||
|
53
src/vocab.cpp
Normal file
53
src/vocab.cpp
Normal file
@ -0,0 +1,53 @@
|
|||||||
|
#include "vocab.h"
|
||||||
|
|
||||||
|
using namespace std;
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////
|
||||||
|
/// Splits `str` into tokens separated by any of the characters in `delimiters`.
///
/// A token is a maximal run of characters none of which occurs in
/// `delimiters`. Leading, trailing and repeated delimiters never produce
/// empty tokens; an empty or all-delimiter input yields an empty vector.
///
/// @param str        Text to split.
/// @param delimiters Set of separator characters (space and tab by default).
/// @return The tokens, in order of appearance.
inline std::vector<std::string> Tokenize(const std::string& str,
                                         const std::string& delimiters = " \t")
{
  std::vector<std::string> tokens;

  // Start of the first token: first character that is NOT a delimiter.
  std::string::size_type lastPos = str.find_first_not_of(delimiters, 0);
  // End of that token: the next delimiter at or after lastPos
  // (npos when the token runs to the end of the string).
  std::string::size_type pos = str.find_first_of(delimiters, lastPos);

  while (std::string::npos != pos || std::string::npos != lastPos) {
    // Found a token. substr() clamps the count, so pos == npos simply
    // takes everything up to the end of the string.
    tokens.push_back(str.substr(lastPos, pos - lastPos));
    // Skip the delimiters that follow the token (note the "not_of").
    lastPos = str.find_first_not_of(delimiters, pos);
    // Locate the delimiter that terminates the next token.
    pos = str.find_first_of(delimiters, lastPos);
  }

  return tokens;
}
|
||||||
|
////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
size_t Vocab::GetOrCreate(const std::string &word)
|
||||||
|
{
|
||||||
|
size_t id;
|
||||||
|
Coll::const_iterator iter = coll_.find(word);
|
||||||
|
if (iter == coll_.end()) {
|
||||||
|
id = coll_.size();
|
||||||
|
coll_[word] = id;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
id = iter->second;
|
||||||
|
}
|
||||||
|
return id;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<size_t> Vocab::ProcessSentence(const std::string &sentence)
|
||||||
|
{
|
||||||
|
vector<string> toks = Tokenize(sentence);
|
||||||
|
vector<size_t> ret(toks.size());
|
||||||
|
|
||||||
|
for (size_t i = 0; i < toks.size(); ++i) {
|
||||||
|
size_t id = GetOrCreate(toks[i]);
|
||||||
|
ret[i] = id;
|
||||||
|
}
|
||||||
|
|
||||||
|
return ret;
|
||||||
|
}
|
17
src/vocab.h
Normal file
17
src/vocab.h
Normal file
@ -0,0 +1,17 @@
|
|||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <unordered_map>
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
/// Vocabulary that maps word strings to numeric ids.
class Vocab
{
public:
  /// Returns the id stored for `word`; a word that is not yet in the
  /// vocabulary is added and given a new id.
  size_t GetOrCreate(const std::string &word);

  /// Splits `sentence` into tokens and returns the id of each token, adding
  /// unknown tokens to the vocabulary along the way.
  std::vector<size_t> ProcessSentence(const std::string &sentence);

protected:
  /// Container type backing the vocabulary: word -> id.
  using Coll = std::unordered_map<std::string, size_t>;

  /// All words registered so far, together with their ids.
  Coll coll_;
};
|
||||||
|
|
Loading…
Reference in New Issue
Block a user