resolved conflicts

Marcin Junczys-Dowmunt 2016-09-20 14:14:13 +02:00
commit 99b643dcfa
12 changed files with 656 additions and 544 deletions

View File

@@ -37,11 +37,15 @@
</tool>
<tool id="nvcc.linker.base.635344589" name="NVCC Linker" superClass="nvcc.linker.base">
<option id="nvcc.linker.option.libs.1878015233" name="Libraries (-l)" superClass="nvcc.linker.option.libs" valueType="libs">
<listOptionValue builtIn="false" value="boost_chrono"/>
<listOptionValue builtIn="false" value="boost_system"/>
<listOptionValue builtIn="false" value="boost_timer"/>
<listOptionValue builtIn="false" value="cudnn"/>
<listOptionValue builtIn="false" value="cuda"/>
<listOptionValue builtIn="false" value="cublas"/>
</option>
<option id="nvcc.linker.option.paths.1326041662" name="Library search path (-L)" superClass="nvcc.linker.option.paths" valueType="libPaths">
<listOptionValue builtIn="false" value="&quot;${workspace_loc:/}/boost/lib64&quot;"/>
<listOptionValue builtIn="false" value="/usr/local/cuda/lib"/>
<listOptionValue builtIn="false" value="/usr/lib"/>
</option>
@@ -56,11 +60,11 @@
</tool>
</toolChain>
</folderInfo>
<fileInfo id="com.nvidia.cuda.ide.seven_five.configuration.debug.1479727693.843925199" name="validate_mnist_batch.cu" rcbsApplicability="disable" resourcePath="src/validate_mnist_batch.cu" toolsToInvoke="nvcc.compiler.base.1979453423.378728796">
<tool id="nvcc.compiler.base.1979453423.378728796" name="NVCC Compiler" superClass="nvcc.compiler.base.1979453423"/>
<fileInfo id="com.nvidia.cuda.ide.seven_five.configuration.debug.1479727693.799279171" name="mnist_benchmark.cu" rcbsApplicability="disable" resourcePath="src/mnist_benchmark.cu" toolsToInvoke="nvcc.compiler.base.1979453423.992734787">
<tool id="nvcc.compiler.base.1979453423.992734787" name="NVCC Compiler" superClass="nvcc.compiler.base.1979453423"/>
</fileInfo>
<sourceEntries>
<entry excluding="src/validate_mnist_batch.cu|src/train_mnist.cu|src/validate_mnist.cu|src/npz_converter.cpp" flags="VALUE_WORKSPACE_PATH|RESOLVED" kind="sourcePath" name=""/>
<entry excluding="src/mnist_benchmark.cu|src/validate_encoder_decoder.cu|src/test.cu|src/validate_mnist_batch.cu|src/train_mnist.cu|src/validate_mnist.cu|src/npz_converter.cpp" flags="VALUE_WORKSPACE_PATH|RESOLVED" kind="sourcePath" name=""/>
</sourceEntries>
</configuration>
</storageModule>
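For readers building outside the Nsight Eclipse project: the libraries and search paths added above amount to a link line along these lines (paths are illustrative and depend on the local CUDA and Boost installs):

nvcc -L"${WORKSPACE}/boost/lib64" -L/usr/local/cuda/lib -L/usr/lib \
  -lboost_chrono -lboost_system -lboost_timer -lcudnn -lcuda -lcublas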

View File

@@ -18,8 +18,6 @@ cuda_add_executable(
test.cu
)
target_link_libraries(marian marian_lib)
cuda_add_executable(
mnist_benchmark
mnist_benchmark.cu
@@ -35,11 +33,18 @@ cuda_add_executable(
validate_encoder_decoder.cu
)
cuda_add_executable(
test_nodes
test_nodes.cu
)
target_link_libraries(marian marian_lib)
target_link_libraries(mnist_benchmark marian_lib)
target_link_libraries(validate_mnist_batch marian_lib)
target_link_libraries(validate_encoder_decoder marian_lib)
target_link_libraries(test_nodes marian_lib)
foreach(exec marian mnist_benchmark validate_mnist_batch validate_encoder_decoder)
foreach(exec marian mnist_benchmark validate_mnist_batch validate_encoder_decoder test_nodes)
target_link_libraries(${exec} ${EXT_LIBS} cuda cudnn curand)
cuda_add_cublas_to_target(${exec})
set_target_properties(${exec} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}")

View File

@@ -34,6 +34,8 @@ struct Chainable {
virtual ~Chainable() { }
virtual void forward() { }
virtual void backward() { }
virtual void backward_numeric(Float delta) { }
virtual void check() { }
virtual void init_dependent() { }
virtual void set_zero_adjoint() { }
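The new backward_numeric(Float delta) hook is the entry point for finite-difference gradient checking (wired up in expression_graph.h and exercised by test_nodes.cu below). As a self-contained illustration of the underlying idea only, not this commit's implementation, a scalar central difference looks like:

#include <cmath>
#include <cstdio>
#include <functional>

// f'(x) is approximated by (f(x + delta) - f(x - delta)) / (2 * delta).
float numeric_grad(const std::function<float(float)>& f, float x, float delta) {
  return (f(x + delta) - f(x - delta)) / (2.0f * delta);
}

int main() {
  float x = 0.3f;
  // e.g. for TanhNodeOp: the analytic derivative is 1 - tanh(x)^2
  float num = numeric_grad([](float v) { return std::tanh(v); }, x, 1e-3f);
  float ana = 1.0f - std::tanh(x) * std::tanh(x);
  std::printf("numeric %.6f analytic %.6f\n", num, ana); // agree to ~1e-6
}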

View File

@@ -127,6 +127,19 @@ class ExpressionGraph {
(*it)->backward();
}
void backward_numeric(Float delta) {
for(auto&& v : *stack_)
v->set_zero_adjoint();
typedef typename ChainableStack::reverse_iterator It;
stack_->back()->init_dependent();
for(It it = stack_->rbegin(); it != stack_->rend(); ++it) {
Chainable<Tensor> *chainable = *it;
//chainable->backward();
chainable->backward_numeric(delta);
}
}
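The calling convention mirrors backward(). A minimal usage sketch, using only the API visible in this commit (x is an input Expr as in test_nodes.cu; whether forward() must be re-run before the numeric pass is an assumption):

g.forward(batch_size);
g.backward();              // analytic gradients
std::cerr << x.grad().Debug() << std::endl;
g.forward(batch_size);     // assumption: recompute values before perturbing
g.backward_numeric(0.01);  // finite-difference gradients, delta = 0.01
std::cerr << x.grad().Debug() << std::endl;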
/**
* @brief Returns a string representing this expression graph in <code>graphviz</code> notation.
*

View File

@@ -33,8 +33,8 @@ ExpressionGraph build_graph(const std::vector<int>& dims) {
layers.emplace_back(x);
}
else {
//layers.emplace_back(reluplus(dot(layers.back(), weights.back()), biases.back()));
layers.emplace_back(relu(dot(layers.back(), weights.back()) + biases.back()));
layers.emplace_back(reluplus(dot(layers.back(), weights.back()), biases.back()));
//layers.emplace_back(relu(dot(layers.back(), weights.back()) + biases.back()));
}
weights.emplace_back(

View File

@@ -23,6 +23,8 @@
#include "node.h"
#include "tensor_operators.h"
#include "node_operators_unary.h"
#include "node_operators_binary.h"
namespace marian {
@@ -109,527 +111,4 @@ struct ParamNode : public Node {
bool initialized_;
};
struct UnaryNodeOp : public Node {
ChainPtr a_;
template <typename ...Args>
UnaryNodeOp(ChainPtr a, Args ...args)
: Node(keywords::shape=a->shape(), //@TODO: Check keywords?
args...), a_(a) {}
};
struct LogitNodeOp : public UnaryNodeOp {
template <typename ...Args>
LogitNodeOp(Args ...args)
: UnaryNodeOp(args...) { }
void forward() {
Element(_1 = Sigma(_2),
val_, a_->val());
}
void backward() {
Element(_1 += _2 * _3 * (1.0f - _3),
a_->grad(), adj_, val_);
}
void check() {
}
virtual std::string graphviz() {
std::stringstream ss;
ss << "\"" << this << "\" [shape=\"box\", label=" << label("logit")
<< ", style=\"filled\", fillcolor=\"yellow\"]" << std::endl;
ss << "\"" << a_ << "\" -> \"" << this << "\"" << std::endl << std::endl;
return ss.str();
};
};
struct TanhNodeOp : public UnaryNodeOp {
template <typename ...Args>
TanhNodeOp(Args ...args)
: UnaryNodeOp(args...) { }
void forward() {
Element(_1 = Tanh(_2),
val_, a_->val());
}
void backward() {
Element(_1 += _2 * (1.0f - (_3 * _3)),
a_->grad(), adj_, val_);
}
virtual std::string graphviz() {
std::stringstream ss;
ss << "\"" << this << "\" [shape=\"box\", label=" << label("tanh")
<< ", style=\"filled\", fillcolor=\"yellow\"]" << std::endl;
ss << "\"" << a_ << "\" -> \"" << this << "\"" << std::endl << std::endl;
return ss.str();
};
};
struct ReLUNodeOp : public UnaryNodeOp {
template <typename ...Args>
ReLUNodeOp(Args ...args)
: UnaryNodeOp(args...) { }
void forward() {
Element(_1 = ReLU(_2),
val_, a_->val());
}
void backward() {
Element(_1 += _2 * ReLUback(_3),
a_->grad(), adj_, a_->val());
}
virtual std::string graphviz() {
std::stringstream ss;
ss << "\"" << this << "\" [shape=\"box\", label=" << label("ReLU")
<< ", style=\"filled\", fillcolor=\"yellow\"]" << std::endl;
ss << "\"" << a_ << "\" -> \"" << this << "\"" << std::endl << std::endl;
return ss.str();
};
};
// @TODO: slow and probably buggy
struct DropoutNodeOp : public UnaryNodeOp {
template <typename ...Args>
DropoutNodeOp(Args ...args)
: UnaryNodeOp(args...),
p_(0.5), seed_(time(0)) { }
void forward() {
//Element(_1 = Bernoulli(p_, (size_t)this) * _2,
// val_, a_->val())
Dropout(val_, a_->val(), p_, seed_++);
}
void backward() {
Element(_1 += _2 * (_3 != 0.0f), // transform non-zero to 1
a_->grad(), adj_, val_);
}
virtual std::string graphviz() {
std::stringstream ss;
ss << "\"" << this << "\" [shape=\"box\", label=" << label("dropout")
<< ", style=\"filled\", fillcolor=\"yellow\"]" << std::endl;
ss << "\"" << a_ << "\" -> \"" << this << "\"" << std::endl << std::endl;
return ss.str();
};
private:
float p_;
int seed_;
};
struct SoftmaxNodeOp : public UnaryNodeOp {
template <typename ...Args>
SoftmaxNodeOp(Args ...args)
: UnaryNodeOp(args...) { }
void forward() {
// B = softmax(A).
thrust::copy(a_->val().begin(), a_->val().end(), val_.begin());
// Safe version of softmax.
Softmax(&val_);
}
void backward() {
// For each row, the Jacobian times vector is given by:
// J * dy = p .* (dy - avg*1)
// where avg = p'*dy and p is the softmax output (probabilities).
//
// For more information, see sec. 2.5 of the following reference:
// André F. T. Martins and Ramon Astudillo.
// "From Softmax to Sparsemax: A Sparse Model of Attention and Multi-Label
// Classification." ICML 2016.
// http://jmlr.org/proceedings/papers/v48/martins16.pdf
SoftmaxGrad(a_->grad(), adj_, val_);
}
virtual std::string graphviz() {
std::stringstream ss;
ss << "\"" << this << "\" [shape=\"box\", label=" << label("softmax")
<< ", style=\"filled\", fillcolor=\"yellow\"]" << std::endl;
ss << "\"" << a_ << "\" -> \"" << this << "\"" << std::endl << std::endl;
return ss.str();
};
};
struct ArgmaxNodeOp : public UnaryNodeOp {
template <typename ...Args>
ArgmaxNodeOp(ChainPtr a, Args ...args)
: UnaryNodeOp(a, keywords::shape=newShape(a), args...) { }
void forward() {
// B = argmax(A), rowwise.
Argmax(&val_, &a_->val());
}
void backward() {
}
Shape newShape(ChainPtr a) {
Shape shape = a->shape();
shape[1] = 1;
return shape;
}
virtual std::string graphviz() {
std::stringstream ss;
ss << "\"" << this << "\" [shape=\"box\", label="
<< label("argmax") << ", style=\"filled\", fillcolor=\"yellow\"]" << std::endl;
ss << "\"" << a_ << "\" -> \"" << this << "\"" << std::endl << std::endl;
return ss.str();
};
};
struct LogNodeOp : public UnaryNodeOp {
template <typename ...Args>
LogNodeOp(Args ...args)
: UnaryNodeOp(args...) {}
void forward() {
Element(_1 = Log(_2), val_, a_->val());
}
void backward() {
Element(_1 += _2 * (1.f / _3),
a_->grad(), adj_, a_->val());
}
virtual std::string graphviz() {
std::stringstream ss;
ss << "\"" << this << "\" [shape=\"box\", label="
<< label("log") << ", style=\"filled\", fillcolor=\"yellow\"]" << std::endl;
ss << "\"" << a_ << "\" -> \"" << this << "\"" << std::endl << std::endl;
return ss.str();
};
};
struct ExpNodeOp : public UnaryNodeOp {
template <typename ...Args>
ExpNodeOp(Args ...args)
: UnaryNodeOp(args...) { }
void forward() {
Element(_1 = Exp(_2), val_, a_->val());
}
void backward() {
Element(_1 += _2 * Exp(_3),
a_->grad(), adj_, a_->val());
}
virtual std::string graphviz() {
std::stringstream ss;
ss << "\"" << this << "\" [shape=\"box\", label=" << label("exp")
<< ", style=\"filled\", fillcolor=\"yellow\"]" << std::endl;
ss << "\"" << a_ << "\" -> \"" << this << "\"" << std::endl << std::endl;
return ss.str();
};
};
struct NegNodeOp : public UnaryNodeOp {
template <typename ...Args>
NegNodeOp(Args ...args)
: UnaryNodeOp(args...) { }
void forward() {
Element(_1 = -_2, val_, a_->val());
}
void backward() {
Element(_1 += -_2, a_->grad(), adj_);
}
virtual std::string graphviz() {
std::stringstream ss;
ss << "\"" << this << "\" [shape=\"box\", label="
<< label("-") << ", style=\"filled\", fillcolor=\"yellow\"]" << std::endl;
ss << "\"" << a_ << "\" -> \"" << this << "\"" << std::endl << std::endl;
return ss.str();
};
};
/******************************************************/
struct BinaryNodeOp : public Node {
ChainPtr a_;
ChainPtr b_;
template <typename ...Args>
BinaryNodeOp(ChainPtr a, ChainPtr b, Args ...args)
: Node(args...), a_(a), b_(b) {}
};
/*** Matrix Product ***/
struct DotNodeOp : public BinaryNodeOp {
template <typename ...Args>
DotNodeOp(ChainPtr a, ChainPtr b, Args ...args)
: BinaryNodeOp(a, b,
keywords::shape=newShape(a, b),
args...) { }
Shape newShape(ChainPtr a, ChainPtr b) {
Shape shape1 = a->shape();
Shape shape2 = b->shape();
UTIL_THROW_IF2(shape1[1] != shape2[0],
"matrix product requires dimensions to match");
shape1[1] = shape2[1];
return shape1;
}
void forward() {
// C = A*B
Prod(val_, a_->val(), b_->val(), false, false);
}
void backward() {
// D is the adjoint, the matrix of derivatives
// df/dA += D*B.T
// df/dB += A.T*D
// beta set to 1.0 in gemm, C = dot(A,B) + beta * C
// to sum gradients from different graph parts
Prod(a_->grad(), adj_, b_->val(), false, true, 1.0);
Prod(b_->grad(), a_->val(), adj_, true, false, 1.0);
}
virtual std::string graphviz() {
std::stringstream ss;
ss << "\"" << this << "\" [shape=\"box\", label=" << label("×")
<< ", style=\"filled\", fillcolor=\"orange\"]" << std::endl;
ss << "\"" << a_ << "\" -> \"" << this << "\"" << std::endl;
ss << "\"" << b_ << "\" -> \"" << this << "\"" << std::endl << std::endl;
return ss.str();
};
};
struct PlusNodeOp : public BinaryNodeOp {
template <typename ...Args>
PlusNodeOp(ChainPtr a, ChainPtr b, Args ...args)
: BinaryNodeOp(a, b, keywords::shape=a->shape(), args...) { }
void forward() {
Element(_1 = _2 + _3,
val_, a_->val(), b_->val());
}
void backward() {
Element(_1 += _2,
a_->grad(), adj_);
Element(_1 += _2,
b_->grad(), adj_);
}
virtual std::string graphviz() {
std::stringstream ss;
ss << "\"" << this << "\" [shape=\"box\", label=" << label("+")
<< ", style=\"filled\", fillcolor=\"yellow\"]" << std::endl;
ss << "\"" << a_ << "\" -> \"" << this << "\"" << std::endl;
ss << "\"" << b_ << "\" -> \"" << this << "\"" << std::endl << std::endl;
return ss.str();
};
};
struct ReLUPlusNodeOp : public BinaryNodeOp {
template <typename ...Args>
ReLUPlusNodeOp(ChainPtr a, ChainPtr b, Args ...args)
: BinaryNodeOp(a, b, keywords::shape=a->shape(), args...) { }
void forward() {
// v = f(g(a, b))
Element(_1 = ReLU(_2 + _3),
val_, a_->val(), b_->val());
}
void backward() {
// df/da = adj * f'(g(a, b)) : dg/da * df/dg
// df/db = adj * f'(g(a, b)) : dg/db * df/dg
Element(_1 += _2 * ReLUback(_3 + _4),
a_->grad(), adj_, a_->val(), b_->val());
Element(_1 += _2 * ReLUback(_3 + _4),
b_->grad(), adj_, a_->val(), b_->val());
}
virtual std::string graphviz() {
std::stringstream ss;
ss << "\"" << this << "\" [shape=\"box\", label=" << label("ReLU<br/>+")
<< ", style=\"filled\", fillcolor=\"yellow\"]" << std::endl;
ss << "\"" << a_ << "\" -> \"" << this << "\"" << std::endl;
ss << "\"" << b_ << "\" -> \"" << this << "\"" << std::endl << std::endl;
return ss.str();
};
};
struct MinusNodeOp : public BinaryNodeOp {
template <typename ...Args>
MinusNodeOp(ChainPtr a, ChainPtr b, Args ...args)
: BinaryNodeOp(a, b, keywords::shape=a->shape(), args...) { }
void forward() {
Element(_1 = _2 - _3,
val_, a_->val(), b_->val());
}
void backward() {
Element(_1 += _2,
a_->grad(), adj_);
Element(_1 -= _2,
b_->grad(), adj_);
}
virtual std::string graphviz() {
std::stringstream ss;
ss << "\"" << this << "\" [shape=\"box\", label=" << label("-")
<< ", style=\"filled\", fillcolor=\"yellow\"]" << std::endl;
ss << "\"" << a_ << "\" -> \"" << this << "\"" << std::endl;
ss << "\"" << b_ << "\" -> \"" << this << "\"" << std::endl << std::endl;
return ss.str();
};
};
struct MultNodeOp : public BinaryNodeOp {
template <typename ...Args>
MultNodeOp(ChainPtr a, ChainPtr b, Args ...args)
: BinaryNodeOp(a, b, keywords::shape=a->shape(), args...) { }
void forward() {
Element(_1 = _2 * _3,
val_, a_->val(), b_->val());
}
void backward() {
Element(_1 += _2 * _3,
a_->grad(), adj_, b_->val());
Element(_1 += _2 * _3,
b_->grad(), adj_, a_->val());
}
virtual std::string graphviz() {
std::stringstream ss;
ss << "\"" << this << "\" [shape=\"box\", label=" << label("")
<< ", style=\"filled\", fillcolor=\"yellow\"]" << std::endl;
ss << "\"" << a_ << "\" -> \"" << this << "\"" << std::endl;
ss << "\"" << b_ << "\" -> \"" << this << "\"" << std::endl << std::endl;
return ss.str();
};
};
struct DivNodeOp : public BinaryNodeOp {
template <typename ...Args>
DivNodeOp(ChainPtr a, ChainPtr b, Args ...args)
: BinaryNodeOp(a, b, keywords::shape=a->shape(), args...) { }
void forward() {
Element(_1 = _2 / _3,
val_, a_->val(), b_->val());
}
void backward() {
Element(_1 += _2 * 1.0f / _3,
a_->grad(), adj_, b_->val());
Element(_1 -= _2 * _3 / (_4 * _4),
b_->grad(), adj_, a_->val(), b_->val());
}
virtual std::string graphviz() {
std::stringstream ss;
ss << "\"" << this << "\" [shape=\"box\", label=" << label("÷")
<< ", style=\"filled\", fillcolor=\"yellow\"]" << std::endl;
ss << "\"" << a_ << "\" -> \"" << this << "\"" << std::endl;
ss << "\"" << b_ << "\" -> \"" << this << "\"" << std::endl << std::endl;
return ss.str();
};
};
// Cross-entropy node. It computes -b*log(softmax(a)), summing rowwise.
struct CrossEntropyNodeOp : public BinaryNodeOp {
template <typename ...Args>
CrossEntropyNodeOp(ChainPtr a, ChainPtr b, Args ...args)
: BinaryNodeOp(a, b,
keywords::shape=newShape(a, b),
args...) { }
Shape newShape(ChainPtr a, ChainPtr b) {
Shape shape1 = a->shape();
Shape shape2 = b->shape();
UTIL_THROW_IF2(shape1[0] != shape2[0] || shape1[1] != shape2[1],
"cross entropy requires dimensions to match");
shape1[1] = 1;
return shape1;
}
// We're caching the softmax probabilities here because we'll need them for
// the backward computation.
void forward() {
// C = -dot(B, log(softmax(A))).
if (probs_) {
probs_.set(0.0);
} else {
probs_.allocate(a_->val().shape(), 0.0);
}
thrust::copy(a_->val().begin(), a_->val().end(), probs_.begin());
Softmax(&probs_); // Safe version of softmax.
Tensor result(a_->val().shape());
Element(_1 = -_2 * Log(_3), result, b_->val(), probs_);
SumRowwise(result, val_);
}
// @TODO: In most cases it's wasteful to compute the derivative with respect
// to the second input which is typically an input node in the computation
// graph. In general the backward functions can skip the computation of
// gradients wrt input nodes.
void backward() {
// For each row, the first input derivative is given by adj * (p - y),
// where y is the gold label distribution (e.g. one hot vector) and
// p is the softmax output (probabilities).
// The second input derivative is -adj*log(p).
Tensor result(probs_.shape());
// Compute first input derivative.
Element(_1 = _2 - _3, result, probs_, b_->val());
ScaleRowwise(result, adj_);
Element(_1 += _2, a_->grad(), result);
// Compute second input derivative.
Element(_1 = -Log(_2), result, probs_); // @TODO: use a cached log here.
ScaleRowwise(result, adj_);
Element(_1 += _2, b_->grad(), result);
}
virtual std::string graphviz() {
std::stringstream ss;
ss << "\"" << this << "\" [shape=\"box\", label=" << label("x-ent")
<< ", style=\"filled\", fillcolor=\"orange\"]" << std::endl;
ss << "\"" << a_ << "\" -> \"" << this << "\"" << std::endl << std::endl;
ss << "\"" << b_ << "\" -> \"" << this << "\"" << std::endl << std::endl;
return ss.str();
};
protected:
Tensor probs_;
};
}

src/node_operators_binary.h (new file, 270 lines)
View File

@@ -0,0 +1,270 @@
#include "node.h"
#include "tensor_operators.h"
namespace marian {
struct BinaryNodeOp : public Node {
ChainPtr a_;
ChainPtr b_;
template <typename ...Args>
BinaryNodeOp(ChainPtr a, ChainPtr b, Args ...args)
: Node(args...), a_(a), b_(b) {}
};
/*** Matrix Product ***/
struct DotNodeOp : public BinaryNodeOp {
template <typename ...Args>
DotNodeOp(ChainPtr a, ChainPtr b, Args ...args)
: BinaryNodeOp(a, b,
keywords::shape=newShape(a, b),
args...) { }
Shape newShape(ChainPtr a, ChainPtr b) {
Shape shape1 = a->shape();
Shape shape2 = b->shape();
UTIL_THROW_IF2(shape1[1] != shape2[0],
"matrix product requires dimensions to match");
shape1[1] = shape2[1];
return shape1;
}
void forward() {
// C = A*B
Prod(val_, a_->val(), b_->val(), false, false);
}
void backward() {
// D is the adjoint, the matrix of derivatives
// df/dA += D*B.T
// df/dB += A.T*D
// beta set to 1.0 in gemm, C = dot(A,B) + beta * C
// to sum gradients from different graph parts
Prod(a_->grad(), adj_, b_->val(), false, true, 1.0);
Prod(b_->grad(), a_->val(), adj_, true, false, 1.0);
}
virtual std::string graphviz() {
std::stringstream ss;
ss << "\"" << this << "\" [shape=\"box\", label=" << label("×")
<< ", style=\"filled\", fillcolor=\"orange\"]" << std::endl;
ss << "\"" << a_ << "\" -> \"" << this << "\"" << std::endl;
ss << "\"" << b_ << "\" -> \"" << this << "\"" << std::endl << std::endl;
return ss.str();
};
};
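The two rules in backward() can be checked by shape alone: for A of size m×k and B of size k×n, the adjoint D is m×n, so D·Bᵀ is m×k (matching A) and Aᵀ·D is k×n (matching B). A tiny CPU spot check of the dA rule, independent of the library:

#include <cstdio>

// For f = sum(A*B), the adjoint D = dF/dC is all ones, so
// df/dA = D * B^T, i.e. df/dA[0][0] = B[0][0] + B[0][1] here.
int main() {
  float A[2][2] = {{1, 2}, {3, 4}};
  float B[2][2] = {{5, 6}, {7, 8}};
  auto f = [&]() {
    float s = 0;
    for (int i = 0; i < 2; ++i)
      for (int j = 0; j < 2; ++j)
        for (int k = 0; k < 2; ++k)
          s += A[i][k] * B[k][j];
    return s;
  };
  float eps = 1e-3f;
  A[0][0] += eps; float fp = f();
  A[0][0] -= 2 * eps; float fm = f();
  A[0][0] += eps; // restore
  std::printf("numeric %.4f analytic %.4f\n",
              (fp - fm) / (2 * eps), B[0][0] + B[0][1]);
}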
struct PlusNodeOp : public BinaryNodeOp {
template <typename ...Args>
PlusNodeOp(ChainPtr a, ChainPtr b, Args ...args)
: BinaryNodeOp(a, b, keywords::shape=a->shape(), args...) { }
void forward() {
Element(_1 = _2 + _3,
val_, a_->val(), b_->val());
}
void backward() {
Element(_1 += _2,
a_->grad(), adj_);
Element(_1 += _2,
b_->grad(), adj_);
}
virtual std::string graphviz() {
std::stringstream ss;
ss << "\"" << this << "\" [shape=\"box\", label=" << label("+")
<< ", style=\"filled\", fillcolor=\"yellow\"]" << std::endl;
ss << "\"" << a_ << "\" -> \"" << this << "\"" << std::endl;
ss << "\"" << b_ << "\" -> \"" << this << "\"" << std::endl << std::endl;
return ss.str();
};
};
struct ReLUPlusNodeOp : public BinaryNodeOp {
template <typename ...Args>
ReLUPlusNodeOp(ChainPtr a, ChainPtr b, Args ...args)
: BinaryNodeOp(a, b, keywords::shape=a->shape(), args...) { }
void forward() {
// v = f(g(a, b))
Element(_1 = ReLU(_2 + _3),
val_, a_->val(), b_->val());
}
void backward() {
// df/da = adj * f'(g(a, b)) : dg/da * df/dg
// df/db = adj * f'(g(a, b)) : dg/db * df/dg
Element(_1 += _2 * ReLUback(_3 + _4),
a_->grad(), adj_, a_->val(), b_->val());
Element(_1 += _2 * ReLUback(_3 + _4),
b_->grad(), adj_, a_->val(), b_->val());
}
virtual std::string graphviz() {
std::stringstream ss;
ss << "\"" << this << "\" [shape=\"box\", label=" << label("ReLU<br/>+")
<< ", style=\"filled\", fillcolor=\"yellow\"]" << std::endl;
ss << "\"" << a_ << "\" -> \"" << this << "\"" << std::endl;
ss << "\"" << b_ << "\" -> \"" << this << "\"" << std::endl << std::endl;
return ss.str();
};
};
struct MinusNodeOp : public BinaryNodeOp {
template <typename ...Args>
MinusNodeOp(ChainPtr a, ChainPtr b, Args ...args)
: BinaryNodeOp(a, b, keywords::shape=a->shape(), args...) { }
void forward() {
Element(_1 = _2 - _3,
val_, a_->val(), b_->val());
}
void backward() {
Element(_1 += _2,
a_->grad(), adj_);
Element(_1 -= _2,
b_->grad(), adj_);
}
virtual std::string graphviz() {
std::stringstream ss;
ss << "\"" << this << "\" [shape=\"box\", label=" << label("-")
<< ", style=\"filled\", fillcolor=\"yellow\"]" << std::endl;
ss << "\"" << a_ << "\" -> \"" << this << "\"" << std::endl;
ss << "\"" << b_ << "\" -> \"" << this << "\"" << std::endl << std::endl;
return ss.str();
};
};
struct MultNodeOp : public BinaryNodeOp {
template <typename ...Args>
MultNodeOp(ChainPtr a, ChainPtr b, Args ...args)
: BinaryNodeOp(a, b, keywords::shape=a->shape(), args...) { }
void forward() {
Element(_1 = _2 * _3,
val_, a_->val(), b_->val());
}
void backward() {
Element(_1 += _2 * _3,
a_->grad(), adj_, b_->val());
Element(_1 += _2 * _3,
b_->grad(), adj_, a_->val());
}
virtual std::string graphviz() {
std::stringstream ss;
ss << "\"" << this << "\" [shape=\"box\", label=" << label("")
<< ", style=\"filled\", fillcolor=\"yellow\"]" << std::endl;
ss << "\"" << a_ << "\" -> \"" << this << "\"" << std::endl;
ss << "\"" << b_ << "\" -> \"" << this << "\"" << std::endl << std::endl;
return ss.str();
};
};
struct DivNodeOp : public BinaryNodeOp {
template <typename ...Args>
DivNodeOp(ChainPtr a, ChainPtr b, Args ...args)
: BinaryNodeOp(a, b, keywords::shape=a->shape(), args...) { }
void forward() {
Element(_1 = _2 / _3,
val_, a_->val(), b_->val());
}
void backward() {
Element(_1 += _2 * 1.0f / _3,
a_->grad(), adj_, b_->val());
Element(_1 -= _2 * _3 / (_4 * _4),
b_->grad(), adj_, a_->val(), b_->val());
}
virtual std::string graphviz() {
std::stringstream ss;
ss << "\"" << this << "\" [shape=\"box\", label=" << label("÷")
<< ", style=\"filled\", fillcolor=\"yellow\"]" << std::endl;
ss << "\"" << a_ << "\" -> \"" << this << "\"" << std::endl;
ss << "\"" << b_ << "\" -> \"" << this << "\"" << std::endl << std::endl;
return ss.str();
};
};
// Cross-entropy node. It computes -b*log(softmax(a)), summing rowwise.
struct CrossEntropyNodeOp : public BinaryNodeOp {
template <typename ...Args>
CrossEntropyNodeOp(ChainPtr a, ChainPtr b, Args ...args)
: BinaryNodeOp(a, b,
keywords::shape=newShape(a, b),
args...) { }
Shape newShape(ChainPtr a, ChainPtr b) {
Shape shape1 = a->shape();
Shape shape2 = b->shape();
UTIL_THROW_IF2(shape1[0] != shape2[0] || shape1[1] != shape2[1],
"cross entropy requires dimensions to match");
shape1[1] = 1;
return shape1;
}
// We're caching the softmax probabilities here because we'll need them for
// the backward computation.
void forward() {
// C = -dot(B, log(softmax(A))).
if (probs_) {
probs_.set(0.0);
} else {
probs_.allocate(a_->val().shape(), 0.0);
}
thrust::copy(a_->val().begin(), a_->val().end(), probs_.begin());
Softmax(&probs_); // Safe version of softmax.
Tensor result(a_->val().shape());
Element(_1 = -_2 * Log(_3), result, b_->val(), probs_);
SumRowwise(result, val_);
}
// @TODO: In most cases it's wasteful to compute the derivative with respect
// to the second input which is typically an input node in the computation
// graph. In general the backward functions can skip the computation of
// gradients wrt input nodes.
void backward() {
// For each row, the first input derivative is given by adj * (p - y),
// where y is the gold label distribution (e.g. one hot vector) and
// p is the softmax output (probabilities).
// The second input derivative is -adj*log(p).
Tensor result(probs_.shape());
// Compute first input derivative.
Element(_1 = _2 - _3, result, probs_, b_->val());
ScaleRowwise(result, adj_);
Element(_1 += _2, a_->grad(), result);
// Compute second input derivative.
Element(_1 = -Log(_2), result, probs_); // @TODO: use a cached log here.
ScaleRowwise(result, adj_);
Element(_1 += _2, b_->grad(), result);
}
virtual std::string graphviz() {
std::stringstream ss;
ss << "\"" << this << "\" [shape=\"box\", label=" << label("x-ent")
<< ", style=\"filled\", fillcolor=\"orange\"]" << std::endl;
ss << "\"" << a_ << "\" -> \"" << this << "\"" << std::endl << std::endl;
ss << "\"" << b_ << "\" -> \"" << this << "\"" << std::endl << std::endl;
return ss.str();
};
protected:
Tensor probs_;
};
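The first-derivative rule in the comment, adj * (p - y), is easy to verify numerically. A self-contained CPU check, independent of the library, for a one-hot y (so that sum_i y[i] = 1):

#include <cmath>
#include <cstdio>

// L(a) = -sum_i y[i] * log(softmax(a)[i]);  dL/da[j] should equal p[j] - y[j].
int main() {
  float a[3] = {0.5f, -1.0f, 2.0f};
  float y[3] = {0.0f, 1.0f, 0.0f};  // one-hot gold label
  auto loss = [&]() {
    float m = std::fmax(a[0], std::fmax(a[1], a[2]));  // "safe" softmax shift
    float z = 0;
    for (float v : a) z += std::exp(v - m);
    float L = 0;
    for (int i = 0; i < 3; ++i)
      L -= y[i] * ((a[i] - m) - std::log(z));  // y[i] * log p[i]
    return L;
  };
  const float eps = 1e-3f;
  const int j = 0;
  a[j] += eps; float lp = loss();
  a[j] -= 2 * eps; float lm = loss();
  a[j] += eps; // restore
  float m = std::fmax(a[0], std::fmax(a[1], a[2])), z = 0;
  for (float v : a) z += std::exp(v - m);
  float pj = std::exp(a[j] - m) / z;
  std::printf("numeric %.5f analytic %.5f\n", (lp - lm) / (2 * eps), pj - y[j]);
}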
}

src/node_operators_unary.h (new file, 264 lines)
View File

@@ -0,0 +1,264 @@
#include "node.h"
#include "tensor_operators.h"
namespace marian {
struct UnaryNodeOp : public Node {
ChainPtr a_;
template <typename ...Args>
UnaryNodeOp(ChainPtr a, Args ...args)
: Node(keywords::shape=a->shape(), //@TODO: Check keywords?
args...), a_(a) {}
};
struct LogitNodeOp : public UnaryNodeOp {
template <typename ...Args>
LogitNodeOp(Args ...args)
: UnaryNodeOp(args...) { }
void forward() {
Element(_1 = Sigma(_2),
val_, a_->val());
}
void backward() {
Element(_1 += _2 * _3 * (1.0f - _3),
a_->grad(), adj_, val_);
}
void check() {
}
virtual std::string graphviz() {
std::stringstream ss;
ss << "\"" << this << "\" [shape=\"box\", label=" << label("logit")
<< ", style=\"filled\", fillcolor=\"yellow\"]" << std::endl;
ss << "\"" << a_ << "\" -> \"" << this << "\"" << std::endl << std::endl;
return ss.str();
};
};
struct TanhNodeOp : public UnaryNodeOp {
template <typename ...Args>
TanhNodeOp(Args ...args)
: UnaryNodeOp(args...) { }
void forward() {
Element(_1 = Tanh(_2),
val_, a_->val());
}
void backward() {
Element(_1 += _2 * (1.0f - (_3 * _3)),
a_->grad(), adj_, val_);
}
virtual std::string graphviz() {
std::stringstream ss;
ss << "\"" << this << "\" [shape=\"box\", label=" << label("tanh")
<< ", style=\"filled\", fillcolor=\"yellow\"]" << std::endl;
ss << "\"" << a_ << "\" -> \"" << this << "\"" << std::endl << std::endl;
return ss.str();
};
};
struct ReLUNodeOp : public UnaryNodeOp {
template <typename ...Args>
ReLUNodeOp(Args ...args)
: UnaryNodeOp(args...) { }
void forward() {
Element(_1 = ReLU(_2),
val_, a_->val());
}
void backward() {
Element(_1 += _2 * ReLUback(_3),
a_->grad(), adj_, a_->val());
}
virtual std::string graphviz() {
std::stringstream ss;
ss << "\"" << this << "\" [shape=\"box\", label=" << label("ReLU")
<< ", style=\"filled\", fillcolor=\"yellow\"]" << std::endl;
ss << "\"" << a_ << "\" -> \"" << this << "\"" << std::endl << std::endl;
return ss.str();
};
};
// @TODO: slow and probably buggy
struct DropoutNodeOp : public UnaryNodeOp {
template <typename ...Args>
DropoutNodeOp(Args ...args)
: UnaryNodeOp(args...),
p_(0.5), seed_(time(0)) { }
void forward() {
//Element(_1 = Bernoulli(p_, (size_t)this) * _2,
// val_, a_->val())
Dropout(val_, a_->val(), p_, seed_++);
}
void backward() {
Element(_1 += _2 * (_3 != 0.0f), // transform non-zero to 1
a_->grad(), adj_, val_);
}
virtual std::string graphviz() {
std::stringstream ss;
ss << "\"" << this << "\" [shape=\"box\", label=" << label("dropout")
<< ", style=\"filled\", fillcolor=\"yellow\"]" << std::endl;
ss << "\"" << a_ << "\" -> \"" << this << "\"" << std::endl << std::endl;
return ss.str();
};
private:
float p_;
int seed_;
};
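For intuition, the intended semantics can be sketched on the CPU with an explicit keep mask. This is an illustration only, not the Dropout kernel this node calls, and it assumes p is the probability of dropping a unit:

#include <random>
#include <vector>

// out[i] = mask[i] * in[i]; backward passes the adjoint through exactly
// the surviving positions, which is what the (_3 != 0.0f) trick above does
// (valid as long as inputs are almost surely non-zero).
void dropout_cpu(const std::vector<float>& in, std::vector<float>& out,
                 std::vector<float>& mask, float p, unsigned seed) {
  std::mt19937 gen(seed);
  std::bernoulli_distribution drop(p);
  out.resize(in.size()); mask.resize(in.size());
  for (size_t i = 0; i < in.size(); ++i) {
    mask[i] = drop(gen) ? 0.0f : 1.0f;
    out[i] = mask[i] * in[i];
  }
}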
struct SoftmaxNodeOp : public UnaryNodeOp {
template <typename ...Args>
SoftmaxNodeOp(Args ...args)
: UnaryNodeOp(args...) { }
void forward() {
// B = softmax(A).
thrust::copy(a_->val().begin(), a_->val().end(), val_.begin());
// Safe version of softmax.
Softmax(&val_);
}
void backward() {
// For each row, the Jacobian times vector is given by:
// J * dy = p .* (dy - avg*1)
// where avg = p'*dy and p is the softmax output (probabilities).
//
// For more information, see sec. 2.5 of the following reference:
// André F. T. Martins and Ramon Astudillo.
// "From Softmax to Sparsemax: A Sparse Model of Attention and Multi-Label
// Classification." ICML 2016.
// http://jmlr.org/proceedings/papers/v48/martins16.pdf
SoftmaxGrad(a_->grad(), adj_, val_);
}
virtual std::string graphviz() {
std::stringstream ss;
ss << "\"" << this << "\" [shape=\"box\", label=" << label("softmax")
<< ", style=\"filled\", fillcolor=\"yellow\"]" << std::endl;
ss << "\"" << a_ << "\" -> \"" << this << "\"" << std::endl << std::endl;
return ss.str();
};
};
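The Jacobian-vector identity cited above reads, per row, J·dy = p ⊙ (dy - p·dy). A CPU reference of that rule (illustrative; SoftmaxGrad's actual kernel runs on the GPU):

#include <vector>

// grad[i] += p[i] * (dy[i] - dot(p, dy)) for one row, where p is the softmax
// output and dy the incoming adjoint (Martins & Astudillo 2016, sec. 2.5).
void softmax_jvp_row(const std::vector<float>& p,
                     const std::vector<float>& dy,
                     std::vector<float>& grad) {
  float avg = 0;
  for (size_t i = 0; i < p.size(); ++i) avg += p[i] * dy[i];
  for (size_t i = 0; i < p.size(); ++i) grad[i] += p[i] * (dy[i] - avg);
}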
struct ArgmaxNodeOp : public UnaryNodeOp {
template <typename ...Args>
ArgmaxNodeOp(ChainPtr a, Args ...args)
: UnaryNodeOp(a, keywords::shape=newShape(a), args...) { }
void forward() {
// B = argmax(A), rowwise.
Argmax(&val_, &a_->val());
}
void backward() {
}
Shape newShape(ChainPtr a) {
Shape shape = a->shape();
shape[1] = 1;
return shape;
}
virtual std::string graphviz() {
std::stringstream ss;
ss << "\"" << this << "\" [shape=\"box\", label="
<< label("argmax") << ", style=\"filled\", fillcolor=\"yellow\"]" << std::endl;
ss << "\"" << a_ << "\" -> \"" << this << "\"" << std::endl << std::endl;
return ss.str();
};
};
struct LogNodeOp : public UnaryNodeOp {
template <typename ...Args>
LogNodeOp(Args ...args)
: UnaryNodeOp(args...) {}
void forward() {
Element(_1 = Log(_2), val_, a_->val());
}
void backward() {
Element(_1 += _2 * (1.f / _3),
a_->grad(), adj_, a_->val());
}
virtual std::string graphviz() {
std::stringstream ss;
ss << "\"" << this << "\" [shape=\"box\", label="
<< label("log") << ", style=\"filled\", fillcolor=\"yellow\"]" << std::endl;
ss << "\"" << a_ << "\" -> \"" << this << "\"" << std::endl << std::endl;
return ss.str();
};
};
struct ExpNodeOp : public UnaryNodeOp {
template <typename ...Args>
ExpNodeOp(Args ...args)
: UnaryNodeOp(args...) { }
void forward() {
Element(_1 = Exp(_2), val_, a_->val());
}
void backward() {
Element(_1 += _2 * Exp(_3),
a_->grad(), adj_, a_->val());
}
virtual std::string graphviz() {
std::stringstream ss;
ss << "\"" << this << "\" [shape=\"box\", label=" << label("exp")
<< ", style=\"filled\", fillcolor=\"yellow\"]" << std::endl;
ss << "\"" << a_ << "\" -> \"" << this << "\"" << std::endl << std::endl;
return ss.str();
};
};
struct NegNodeOp : public UnaryNodeOp {
template <typename ...Args>
NegNodeOp(Args ...args)
: UnaryNodeOp(args...) { }
void forward() {
Element(_1 = -_2, val_, a_->val());
}
void backward() {
Element(_1 += -_2, a_->grad(), adj_);
}
virtual std::string graphviz() {
std::stringstream ss;
ss << "\"" << this << "\" [shape=\"box\", label="
<< label("-") << ", style=\"filled\", fillcolor=\"yellow\"]" << std::endl;
ss << "\"" << a_ << "\" -> \"" << this << "\"" << std::endl << std::endl;
return ss.str();
};
};
}

View File

@@ -207,6 +207,12 @@ class TensorImpl {
thrust::copy(begin, end, data_.begin());
}
void incr(Float incr) {
// Element-by-element access from the host; if data_ lives on the GPU
// (thrust::device_vector), this costs one transfer per element, which is
// acceptable for gradient checking but slow for anything else.
for (size_t i = 0; i < data_.size(); ++i) {
data_[i] += incr;
}
}
/**
* @brief Copy Tensor's vector from GPU to vector variable on CPU.
*
@@ -405,17 +411,12 @@
*/
std::string Debug() const
{
return pimpl_->Debug();
}
/**
* @brief Print Tensor data on CPU (?) (const).
*/
void Print() const {
for (int i = 0; i < size(); ++i) {
std::cerr << (*this)[i] << " ";
}
std::cerr << std::endl;
if (!pimpl_) {
return "Not yet set";
}
else {
return pimpl_->Debug();
}
}
//void Load(const std::string &path);
@@ -434,6 +435,10 @@
*/
void set(const std::vector<float>::const_iterator &begin, const std::vector<float>::const_iterator &end);
void incr(Float incr) {
pimpl_->incr(incr);
}
/**
* @brief Copy Tensor's vector from GPU to vector variable on CPU (const).
*

src/test_nodes.cu (new file, 71 lines)
View File

@@ -0,0 +1,71 @@
#include <vector>
#include <random>
#include "marian.h"
#include "expression_graph.h"
#include "keywords.h"
#include "definitions.h"
// Uniform-ish float in [LO, HI) built from rand().
float Rand()
{
float LO = -10;
float HI = +20;
return LO + static_cast<float>(rand()) / (static_cast<float>(RAND_MAX) / (HI - LO));
}
int main(int argc, char** argv)
{
using namespace std;
using namespace marian;
using namespace keywords;
int input_size = 10;
int output_size = 10;
int batch_size = 25;
// define graph
ExpressionGraph g;
Expr inExpr = g.input(shape={batch_size, input_size});
Expr labelExpr = g.input(shape={batch_size, output_size});
//Expr outExpr = softmax(inExpr);
//Expr outExpr = tanh(inExpr);
Expr outExpr = - inExpr;
Expr ceExpr = cross_entropy(outExpr, labelExpr);
Expr cost = mean(ceExpr, axis=0);
// create data
srand(0);
std::vector<float> values(batch_size * input_size);
generate(begin(values), end(values), Rand);
std::vector<float> labels(batch_size * input_size);
generate(begin(labels), end(labels), Rand);
Tensor inTensor({batch_size, input_size});
thrust::copy(values.begin(), values.end(), inTensor.begin());
Tensor labelTensor({batch_size, input_size});
thrust::copy(labels.begin(), labels.end(), labelTensor.begin());
inExpr = inTensor;
labelExpr = labelTensor;
// train
g.forward(batch_size);
//g.backward();
g.backward_numeric(0.01);
std::cout << g.graphviz() << std::endl;
std::cerr << "inTensor=" << inTensor.Debug() << std::endl;
Tensor outTensor = outExpr.val();
std::cerr << "outTensor=" << outTensor.Debug() << std::endl;
Tensor outGrad = outExpr.grad();
std::cerr << "outGrad=" << outGrad.Debug() << std::endl;
}

View File

@@ -133,9 +133,7 @@ int main(int argc, char** argv) {
while (getline(source_file, source_line)) {
getline(target_file, target_line);
std::vector<size_t> source_ids = source_vocab.ProcessSentence(source_line);
source_ids.push_back(source_vocab.GetEOS()); // Append EOS token.
std::vector<size_t> target_ids = target_vocab.ProcessSentence(target_line);
target_ids.push_back(target_vocab.GetEOS()); // Append EOS token.
source_sentences.push_back(source_ids);
target_sentences.push_back(target_ids);
if (num_source_tokens < 0 || source_ids.size() > num_source_tokens) {

View File

@@ -75,6 +75,7 @@ std::vector<size_t> Vocab::ProcessSentence(const std::string &sentence)
size_t id = GetOrCreate(toks[i]);
ret[i] = id;
}
ret.push_back(GetEOS()); // Append EOS token.
return ret;
}