Merge ../Marian

Hieu Hoang 2016-09-16 19:56:54 +02:00
commit ea66688a13
14 changed files with 166 additions and 56 deletions

View File

@@ -20,3 +20,14 @@ endif(Boost_FOUND)
 include_directories(${marian_SOURCE_DIR}/src)
 add_subdirectory(src)
+
+# add a target to generate API documentation with Doxygen
+find_package(Doxygen)
+if(DOXYGEN_FOUND)
+  configure_file(${CMAKE_CURRENT_SOURCE_DIR}/Doxyfile.in ${CMAKE_CURRENT_BINARY_DIR}/Doxyfile @ONLY)
+  add_custom_target(doc
+    ${DOXYGEN_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/Doxyfile
+    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+    COMMENT "Generating API documentation with Doxygen" VERBATIM
+  )
+endif(DOXYGEN_FOUND)

View File

@@ -743,7 +743,7 @@ WARN_LOGFILE =
 # spaces.
 # Note: If this tag is empty the current directory is searched.
-INPUT = src
+INPUT = @CMAKE_CURRENT_SOURCE_DIR@/src
 # This tag can be used to specify the character encoding of the source files
 # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses

View File

@@ -1,5 +1,3 @@
 The MIT License (MIT)
-
 Copyright (c) 2016 Marcin Junczys-Dowmunt
-
 Permission is hereby granted, free of charge, to any person obtaining a copy
@@ -9,8 +7,8 @@ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the Software is
 furnished to do so, subject to the following conditions:
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,

View File

@@ -29,3 +29,6 @@ Compilation with `cmake > 3.5`:
     cmake ..
     make -j
+
+To compile API documentation using Doxygen, first cd to the build directory, and then:
+
+    make doc

View File

@@ -1,8 +1,6 @@
 #include <sstream>
 #include "expression_graph.h"
-using namespace std;
 namespace marian {
 Expr::Expr(ExpressionGraphPtr g, Chainable<Tensor>* chainable)
@@ -32,19 +30,10 @@ Expr::operator ChainPtr() {
 std::string Expr::Debug() const
 {
-  stringstream strm;
+  std::stringstream strm;
   const Shape &shape = pimpl_->shape();
   strm << marian::Debug(shape);
   return strm.str();
 }
-///////////////////////////////////////////////////////
-//ExpressionGraph::ExpressionGraph(int cudaDevice)
-//: stack_(new ChainableStack)
-//{
-//  std::srand (time(NULL));
-//  cudaSetDevice(0);
-//
-//}
 }

View File

@@ -29,7 +29,7 @@ Expr operator-(Expr a) {
   return Expr(a.graph(), new NegNodeOp(a));
 };
-Expr softmax_fast(Expr a) {
+Expr softmax(Expr a) {
   return Expr(a.graph(), new SoftmaxNodeOp(a));
 }

View File

@@ -72,12 +72,12 @@ inline Expr sum(Expr a, Args ...args) {
 // inefficient
 template <typename ...Args>
-Expr softmax(Expr a, Args ...args) {
+Expr softmax_slow(Expr a, Args ...args) {
   Expr e = exp(a);
   return e / sum(e, args...);
 }
-Expr softmax_fast(Expr a);
+Expr softmax(Expr a);
 Expr argmax(Expr a);
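
The rename makes the fused kernel the default softmax and demotes the composed exp/sum template to softmax_slow. Besides avoiding the temporary e and the broadcast division, a fused softmax can subtract the per-row maximum before exponentiating, so large logits do not overflow. A minimal CPU sketch of that pattern (illustrative only; softmax_rows is a hypothetical helper, not Marian's CUDA implementation):

// Illustrative CPU reference, assuming a dense row-major (rows x cols) matrix.
// Not Marian code: it only demonstrates the max-shift trick a fused softmax
// can apply, which the composed exp/sum version above does not.
#include <algorithm>
#include <cmath>
#include <vector>

void softmax_rows(std::vector<float>& m, int rows, int cols) {
  for (int i = 0; i < rows; ++i) {
    float* row = m.data() + i * cols;
    float mx = *std::max_element(row, row + cols);
    float sum = 0.0f;
    for (int j = 0; j < cols; ++j) {
      row[j] = std::exp(row[j] - mx);  // shifting by the row max keeps exp finite
      sum += row[j];
    }
    for (int j = 0; j < cols; ++j) row[j] /= sum;  // each row now sums to 1
  }
}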

View File

@@ -163,6 +163,13 @@ struct SoftmaxNodeOp : public UnaryNodeOp {
     // For each row, the Jacobian times vector is given by:
     // J * dy = p .* (dy - avg*1)
     // where avg = p'*dy and p is the softmax output (probabilities).
+    //
+    // For more information, see sec. 2.5 of the following reference:
+    // André F. T. Martins and Ramon Astudillo.
+    // "From Softmax to Sparsemax: A Sparse Model of Attention and Multi-Label
+    // Classification." ICML 2016.
+    // http://jmlr.org/proceedings/papers/v48/martins16.pdf
     Tensor result(adj_.shape());
     thrust::copy(adj_.begin(), adj_.end(), result.begin());
     SubtractMean(&result, val_);
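
The rule quoted in the comment is easy to sanity-check outside the CUDA code. A sketch of the per-row Jacobian-vector product, with hypothetical names (not Marian API):

// J * dy = p .* (dy - avg * 1), with avg = p' * dy.
// p holds the softmax outputs of one row, dy the incoming adjoint.
#include <cstddef>
#include <vector>

std::vector<float> softmax_vjp(const std::vector<float>& p,
                               const std::vector<float>& dy) {
  float avg = 0.0f;
  for (std::size_t j = 0; j < p.size(); ++j) avg += p[j] * dy[j];  // avg = p' * dy
  std::vector<float> dx(p.size());
  for (std::size_t j = 0; j < p.size(); ++j)
    dx[j] = p[j] * (dy[j] - avg);  // p .* (dy - avg * 1)
  return dx;
}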

View File

@@ -14,7 +14,8 @@ class Sgd {
     graph.backprop(batchSize);
     for(auto& param : graph.params())
-      Element(_1 -= eta_ * _2, param.val(), param.grad());
+      Element(_1 -= eta_ * _2,
+              param.val(), param.grad());
   }
 private:
@@ -26,16 +27,18 @@ class Adagrad {
     Adagrad(float eta=0.1) : eta_(eta) {}
     void operator()(ExpressionGraph& graph, int batchSize) {
+      float fudgeFactor = 1e-6;
-      graph.backprop(batchSize);
       if(history_.size() < graph.params().size())
         for(auto& param : graph.params())
           history_.emplace_back(Tensor(param.grad().shape(), 0));
+      graph.backprop(batchSize);
       auto it = history_.begin();
       for(auto& param : graph.params()) {
-        Element(_1 -= eta_ / Sqrt(_2) * _3, param.val(), *it, param.grad());
         Element(_1 += _2 * _2, *it, param.grad());
+        Element(_1 -= eta_ / (fudgeFactor + Sqrt(_2)) * _3,
+                param.val(), *it, param.grad());
         it++;
       }
     }
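
Two fixes land in Adagrad here: the squared gradients are now accumulated before they are used, and the denominator gains a fudge factor, so the very first update no longer divides by Sqrt(0). A scalar sketch of the resulting step (illustrative only, not the Element/thrust formulation above):

#include <cmath>

// One Adagrad step for a single weight w with accumulator h and gradient g.
void adagrad_step(float& w, float& h, float g,
                  float eta, float eps = 1e-6f) {
  h += g * g;                           // accumulate first: Element(_1 += _2 * _2, ...)
  w -= eta / (eps + std::sqrt(h)) * g;  // then update: eta_ / (fudgeFactor + Sqrt(_2)) * _3
}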

View File

@@ -100,10 +100,10 @@ int main(int argc, char** argv) {
   std::cerr << "Building output layer..." << std::endl;
   std::vector<Expr> Yp;
-  Yp.emplace_back(softmax_fast(dot(H[0], Why) + by));
+  Yp.emplace_back(softmax(dot(H[0], Why) + by));
   Expr cross_entropy = sum(Y[0] * log(Yp[0]), axis=1);
   for (int t = 1; t < num_inputs; ++t) {
-    Yp.emplace_back(softmax_fast(dot(H[t], Why) + by));
+    Yp.emplace_back(softmax(dot(H[t], Why) + by));
     cross_entropy = cross_entropy + sum(Y[t] * log(Yp[t]), axis=1);
   }
   auto graph = -mean(cross_entropy, axis=0, name="cost");
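
For reference, the cost assembled above is -mean over the batch (axis=0) of the sum over timesteps and classes (axis=1) of Y[t] * log(Yp[t]). A dense CPU sketch, assuming row-major (batch x classes) slices per timestep; sequence_cost and the layout are illustrative assumptions, not Marian internals:

#include <cmath>
#include <cstddef>
#include <vector>

float sequence_cost(const std::vector<std::vector<float>>& Y,   // one-hot targets, one slice per step
                    const std::vector<std::vector<float>>& Yp,  // predicted probabilities, same shape
                    int batch, int classes) {
  std::vector<float> ce(batch, 0.0f);
  for (std::size_t t = 0; t < Y.size(); ++t)        // accumulate over timesteps
    for (int i = 0; i < batch; ++i)
      for (int j = 0; j < classes; ++j)             // sum(Y[t] * log(Yp[t]), axis=1)
        ce[i] += Y[t][i * classes + j] * std::log(Yp[t][i * classes + j]);
  float total = 0.0f;
  for (int i = 0; i < batch; ++i) total += ce[i];
  return -total / batch;                            // -mean(cross_entropy, axis=0)
}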

View File

@@ -25,7 +25,7 @@ int main(int argc, char** argv) {
   Expr b = named(g.param(shape={1, LABEL_SIZE}), "b");
   auto scores = dot(x, w) + b;
-  auto lr = softmax_fast(scores);
+  auto lr = softmax(scores);
   auto cost = named(-mean(sum(y * log(lr), axis=1), axis=0), "cost");
   cerr << "lr=" << lr.Debug() << endl;

View File

@@ -7,17 +7,19 @@
 using namespace marian;
 using namespace keywords;
-const int input_size = 10;
-const int output_size = 15;
-const int embedding_size = 8;
-const int hidden_size = 5;
-const int batch_size = 25;
-const int num_inputs = 8;
-const int num_outputs = 6;
-ExpressionGraph build_graph() {
+ExpressionGraph build_graph(int source_vocabulary_size,
+                            int target_vocabulary_size,
+                            int embedding_size,
+                            int hidden_size,
+                            int num_source_tokens,
+                            int num_target_tokens) {
   std::cerr << "Building computation graph..." << std::endl;
+  int input_size = source_vocabulary_size;
+  int output_size = target_vocabulary_size;
+  int num_inputs = num_source_tokens;
+  int num_outputs = num_target_tokens;
   ExpressionGraph g;
   std::vector<Expr> X, Y, H, S;
@@ -25,14 +27,14 @@ ExpressionGraph build_graph() {
   for (int t = 0; t <= num_inputs; ++t) {
     std::stringstream ss;
     ss << "X" << t;
-    X.emplace_back(named(g.input(shape={batch_size, input_size}), ss.str()));
+    X.emplace_back(named(g.input(shape={whatevs, input_size}), ss.str()));
   }
   // We're including the stop symbol here.
   for (int t = 0; t <= num_outputs; ++t) {
     std::stringstream ss;
     ss << "Y" << t;
-    Y.emplace_back(named(g.input(shape={batch_size, output_size}), ss.str()));
+    Y.emplace_back(named(g.input(shape={whatevs, output_size}), ss.str()));
   }
   // Source embeddings.
@@ -80,10 +82,10 @@ ExpressionGraph build_graph() {
   // Softmax layer and cost function.
   std::vector<Expr> Yp;
-  Yp.emplace_back(named(softmax_fast(dot(h0_d, Why) + by), "pred"));
+  Yp.emplace_back(named(softmax(dot(h0_d, Why) + by), "pred"));
   Expr cross_entropy = sum(Y[0] * log(Yp[0]), axis=1);
   for (int t = 1; t <= num_outputs; ++t) {
-    Yp.emplace_back(named(softmax_fast(dot(S[t-1], Why) + by), "pred"));
+    Yp.emplace_back(named(softmax(dot(S[t-1], Why) + by), "pred"));
     cross_entropy = cross_entropy + sum(Y[t] * log(Yp[t]), axis=1);
   }
   auto cost = named(-mean(cross_entropy, axis=0), "cost");
@@ -96,30 +98,124 @@ ExpressionGraph build_graph() {
 int main(int argc, char** argv) {
 #if 1
   std::cerr << "Loading the data... ";
-  Vocab sourceVocab, targetVocab;
+  Vocab source_vocab, target_vocab;
   // read parallel corpus from file
-  std::fstream sourceFile("../examples/mt/dev/newstest2013.de");
-  std::fstream targetFile("../examples/mt/dev/newstest2013.en");
+  std::fstream source_file("../examples/mt/dev/newstest2013.de");
+  std::fstream target_file("../examples/mt/dev/newstest2013.en");
+  // Right now we're only reading the first few sentence pairs, and defining
+  // that as the step size.
+  int batch_size = 64;
+  int num_source_tokens = -1;
+  int num_target_tokens = -1;
   std::vector<std::vector<size_t> > source_sentences, target_sentences;
-  std::string sourceLine, targetLine;
-  while (getline(sourceFile, sourceLine)) {
-    getline(targetFile, targetLine);
-    std::vector<size_t> sourceIds = sourceVocab.ProcessSentence(sourceLine);
-    std::vector<size_t> targetIds = targetVocab.ProcessSentence(targetLine);
-    source_sentences.push_back(sourceIds);
-    target_sentences.push_back(targetIds);
+  std::string source_line, target_line;
+  while (getline(source_file, source_line)) {
+    getline(target_file, target_line);
+    std::vector<size_t> source_ids = source_vocab.ProcessSentence(source_line);
+    source_ids.push_back(source_vocab.GetEOS()); // Append EOS token.
+    std::vector<size_t> target_ids = target_vocab.ProcessSentence(target_line);
+    target_ids.push_back(target_vocab.GetEOS()); // Append EOS token.
+    source_sentences.push_back(source_ids);
+    target_sentences.push_back(target_ids);
+    if (num_source_tokens < 0 || source_ids.size() > num_source_tokens) {
+      num_source_tokens = source_ids.size();
+    }
+    if (num_target_tokens < 0 || target_ids.size() > num_target_tokens) {
+      num_target_tokens = target_ids.size();
+    }
+    if (source_sentences.size() == batch_size) break;
   }
   std::cerr << "Done." << std::endl;
   std::cerr << source_sentences.size()
             << " sentence pairs read." << std::endl;
-  std::cerr << "Source vocabulary size: " << sourceVocab.Size() << std::endl;
-  std::cerr << "Target vocabulary size: " << targetVocab.Size() << std::endl;
-#endif
std::cerr << "Source vocabulary size: " << source_vocab.Size() << std::endl;
std::cerr << "Target vocabulary size: " << target_vocab.Size() << std::endl;
std::cerr << "Max source tokens: " << num_source_tokens << std::endl;
std::cerr << "Max target tokens: " << num_target_tokens << std::endl;
// Padding the source and target sentences.
for (auto &sentence : source_sentences) {
for (int i = sentence.size(); i < num_source_tokens; ++i) {
sentence.push_back(source_vocab.GetPAD());
}
}
for (auto &sentence : target_sentences) {
for (int i = sentence.size(); i < num_target_tokens; ++i) {
sentence.push_back(target_vocab.GetPAD());
}
}
std::cerr << "Building the encoder-decoder computation graph..." << std::endl;
// Build the encoder-decoder computation graph.
ExpressionGraph g = build_graph();
int embedding_size = 50;
int hidden_size = 100;
ExpressionGraph g = build_graph(source_vocab.Size(),
target_vocab.Size(),
embedding_size,
hidden_size,
num_source_tokens-1,
num_target_tokens-1);
std::cerr << "Attaching the data to the computation graph..." << std::endl;
// Convert the data to dense one-hot vectors.
// TODO: make the graph handle sparse indices with a proper lookup layer.
for (int t = 0; t < num_source_tokens; ++t) {
Tensor Xt({batch_size, static_cast<int>(source_vocab.Size())});
std::vector<float> values(batch_size * source_vocab.Size(), 0.0);
int k = 0;
for (int i = 0; i < batch_size; ++i) {
values[k + source_sentences[i][t]] = 1.0;
k += source_vocab.Size();
}
thrust::copy(values.begin(), values.end(), Xt.begin());
// Attach this slice to the graph.
std::stringstream ss;
ss << "X" << t;
g[ss.str()] = Xt;
}
for (int t = 0; t < num_target_tokens; ++t) {
Tensor Yt({batch_size, static_cast<int>(target_vocab.Size())});
std::vector<float> values(batch_size * target_vocab.Size(), 0.0);
int k = 0;
for (int i = 0; i < batch_size; ++i) {
values[k + target_sentences[i][t]] = 1.0;
k += target_vocab.Size();
}
thrust::copy(values.begin(), values.end(), Yt.begin());
// Attach this slice to the graph.
std::stringstream ss;
ss << "Y" << t;
g[ss.str()] = Yt;
}
+#else
+  int source_vocabulary_size = 10;
+  int target_vocabulary_size = 15;
+  int embedding_size = 8;
+  int hidden_size = 5;
+  int batch_size = 25;
+  int num_source_tokens = 8;
+  int num_target_tokens = 6;
+  // Build the encoder-decoder computation graph.
+  ExpressionGraph g = build_graph(0, // cuda device.
+                                  source_vocabulary_size,
+                                  target_vocabulary_size,
+                                  embedding_size,
+                                  hidden_size,
+                                  num_source_tokens,
+                                  num_target_tokens);
+  int input_size = source_vocabulary_size;
+  int output_size = target_vocabulary_size;
+  int num_inputs = num_source_tokens;
+  int num_outputs = num_target_tokens;
   // Generate input data (include the stop symbol).
   for (int t = 0; t <= num_inputs; ++t) {
@@ -155,6 +251,7 @@ int main(int argc, char** argv) {
     ss << "Y" << t;
     g[ss.str()] = Yt;
   }
+#endif
   std::cerr << "Printing the computation graph..." << std::endl;
   std::cout << g.graphviz() << std::endl;
@@ -167,6 +264,7 @@ int main(int argc, char** argv) {
   std::cerr << g["cost"].val().Debug() << std::endl;
+#if 0
   std::cerr << g["X0"].val().Debug() << std::endl;
   std::cerr << g["Y0"].val().Debug() << std::endl;
   std::cerr << g["Whh"].grad().Debug() << std::endl;
@@ -175,6 +273,7 @@ int main(int argc, char** argv) {
   std::cerr << g["by"].grad().Debug() << std::endl;
   std::cerr << g["Wxh"].grad().Debug() << std::endl;
   std::cerr << g["h0"].grad().Debug() << std::endl;
+#endif
   return 0;
 }
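
On the TODO above ("make the graph handle sparse indices with a proper lookup layer"): a one-hot row times an embedding matrix just selects one row of that matrix, so a lookup layer could replace the dense batch_size x vocabulary inputs built in this file. A minimal sketch of the equivalence (lookup is a hypothetical helper, assuming a row-major vocab x dim matrix; not Marian API):

#include <cstddef>
#include <vector>

// Selecting row `id` of E (vocab x dim, row-major) equals one_hot(id) * E,
// without ever materializing the one-hot vector.
std::vector<float> lookup(const std::vector<float>& E, int dim, std::size_t id) {
  return std::vector<float>(E.begin() + id * dim, E.begin() + (id + 1) * dim);
}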

View File

@@ -32,7 +32,7 @@ ExpressionGraph build_graph() {
                  init=from_vector(bData)), "b");
   auto probs = named(
-      softmax_fast(dot(x, w) + b), //, axis=1),
+      softmax(dot(x, w) + b), //, axis=1),
       "probs"
   );

View File

@@ -68,7 +68,7 @@ int main(int argc, char** argv) {
   std::cerr << "Building model...";
   auto layer1 = tanh(dot(x, w1) + b1);
-  auto layer2 = softmax(dot(layer1, w2) + b2, axis=1, name="layer2");
+  auto layer2 = softmax(dot(layer1, w2) + b2);
   auto predict = layer2;
   std::cerr << "Done." << std::endl;