mirror of https://github.com/marian-nmt/marian.git
synced 2024-11-04 14:04:24 +03:00

Merge ../Marian

commit ea66688a13
@@ -20,3 +20,14 @@ endif(Boost_FOUND)
 include_directories(${marian_SOURCE_DIR}/src)
 add_subdirectory(src)
 
+# add a target to generate API documentation with Doxygen
+find_package(Doxygen)
+if(DOXYGEN_FOUND)
+  configure_file(${CMAKE_CURRENT_SOURCE_DIR}/Doxyfile.in ${CMAKE_CURRENT_BINARY_DIR}/Doxyfile @ONLY)
+  add_custom_target(doc
+    ${DOXYGEN_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/Doxyfile
+    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+    COMMENT "Generating API documentation with Doxygen" VERBATIM
+  )
+endif(DOXYGEN_FOUND)
+
@@ -743,7 +743,7 @@ WARN_LOGFILE =
 # spaces.
 # Note: If this tag is empty the current directory is searched.
 
-INPUT                  = src
+INPUT                  = @CMAKE_CURRENT_SOURCE_DIR@/src
 
 # This tag can be used to specify the character encoding of the source files
 # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
@@ -1,5 +1,3 @@
-The MIT License (MIT)
-
 Copyright (c) 2016 Marcin Junczys-Dowmunt
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
@@ -9,8 +7,8 @@ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the Software is
 furnished to do so, subject to the following conditions:
 
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
 
 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
@@ -29,3 +29,6 @@ Compilation with `cmake > 3.5`:
     cmake ..
     make -j
 
+To compile API documentation using Doxygen, first cd to the build directory, and then:
+
+    make doc
@@ -1,8 +1,6 @@
 #include <sstream>
 #include "expression_graph.h"
 
-using namespace std;
-
 namespace marian {
 
 Expr::Expr(ExpressionGraphPtr g, Chainable<Tensor>* chainable)
@@ -32,19 +30,10 @@ Expr::operator ChainPtr() {
 
 std::string Expr::Debug() const
 {
-  stringstream strm;
+  std::stringstream strm;
   const Shape &shape = pimpl_->shape();
   strm << marian::Debug(shape);
   return strm.str();
 }
 
-///////////////////////////////////////////////////////
-//ExpressionGraph::ExpressionGraph(int cudaDevice)
-//: stack_(new ChainableStack)
-//{
-//  std::srand (time(NULL));
-//  cudaSetDevice(0);
-//
-//}
-
 }
@@ -29,7 +29,7 @@ Expr operator-(Expr a) {
   return Expr(a.graph(), new NegNodeOp(a));
 };
 
-Expr softmax_fast(Expr a) {
+Expr softmax(Expr a) {
   return Expr(a.graph(), new SoftmaxNodeOp(a));
 }
 
@@ -72,12 +72,12 @@ inline Expr sum(Expr a, Args ...args) {
 
 // inefficient
 template <typename ...Args>
-Expr softmax(Expr a, Args ...args) {
+Expr softmax_slow(Expr a, Args ...args) {
   Expr e = exp(a);
   return e / sum(e, args...);
 }
 
-Expr softmax_fast(Expr a);
+Expr softmax(Expr a);
 
 Expr argmax(Expr a);
 
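The templated softmax above is tagged "inefficient" because it builds separate exp and sum nodes in the graph rather than using the fused SoftmaxNodeOp kernel; the plain exp(a)/sum(exp(a)) form can also overflow for rows with large values. For reference, a numerically stable single-row formulation subtracts the row maximum first. This is an illustrative CPU sketch, not Marian code:

    #include <algorithm>
    #include <cmath>
    #include <cstddef>
    #include <vector>

    // Stable softmax for one row: shifting by the row max leaves the
    // result unchanged (the factor exp(-max) cancels) but avoids overflow.
    std::vector<float> stable_softmax(const std::vector<float>& row) {
      float max = *std::max_element(row.begin(), row.end());
      std::vector<float> out(row.size());
      float sum = 0.0f;
      for (std::size_t i = 0; i < row.size(); ++i) {
        out[i] = std::exp(row[i] - max);
        sum += out[i];
      }
      for (float& v : out) v /= sum;
      return out;
    }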
@@ -163,6 +163,13 @@ struct SoftmaxNodeOp : public UnaryNodeOp {
     // For each row, the Jacobian times vector is given by:
     // J * dy = p .* (dy - avg*1)
+    // where avg = p'*dy and p is the softmax output (probabilities).
+    //
+    // For more information, see sec. 2.5 of the following reference:
+    // André F. T. Martins and Ramon Astudillo.
+    // "From Softmax to Sparsemax: A Sparse Model of Attention and Multi-Label
+    // Classification." ICML 2016.
+    // http://jmlr.org/proceedings/papers/v48/martins16.pdf
 
     Tensor result(adj_.shape());
     thrust::copy(adj_.begin(), adj_.end(), result.begin());
     SubtractMean(&result, val_);
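The new comment compresses the softmax Jacobian J = diag(p) - p*p' into a Jacobian-times-vector product that never materializes the n-by-n matrix. A single-row CPU sketch of the same computation (illustrative only; the actual code operates on GPU tensors via thrust::copy and SubtractMean):

    #include <cstddef>
    #include <vector>

    // J * dy = p .* (dy - avg), where avg = p' * dy and p is the softmax
    // output for one row. O(n) instead of forming the n-by-n Jacobian.
    std::vector<float> softmax_grad_row(const std::vector<float>& p,
                                        const std::vector<float>& dy) {
      float avg = 0.0f;
      for (std::size_t i = 0; i < p.size(); ++i)
        avg += p[i] * dy[i];
      std::vector<float> jdy(p.size());
      for (std::size_t i = 0; i < p.size(); ++i)
        jdy[i] = p[i] * (dy[i] - avg);
      return jdy;
    }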
src/sgd.h
@@ -14,7 +14,8 @@ class Sgd {
     graph.backprop(batchSize);
 
     for(auto& param : graph.params())
-      Element(_1 -= eta_ * _2, param.val(), param.grad());
+      Element(_1 -= eta_ * _2,
+              param.val(), param.grad());
   }
 
 private:
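Element applies a placeholder expression elementwise on the device, with _1, _2, ... bound to the tensors passed after it; the reformatted call above is purely cosmetic. What the SGD update amounts to, as a scalar sketch (not the actual GPU kernel):

    #include <cstddef>
    #include <vector>

    // Element(_1 -= eta_ * _2, param.val(), param.grad()), per element:
    void sgd_update(std::vector<float>& val,
                    const std::vector<float>& grad, float eta) {
      for (std::size_t i = 0; i < val.size(); ++i)
        val[i] -= eta * grad[i];
    }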
@@ -26,16 +27,18 @@ class Adagrad {
   Adagrad(float eta=0.1) : eta_(eta) {}
 
   void operator()(ExpressionGraph& graph, int batchSize) {
-    graph.backprop(batchSize);
-
+    float fudgeFactor = 1e-6;
     if(history_.size() < graph.params().size())
       for(auto& param : graph.params())
         history_.emplace_back(Tensor(param.grad().shape(), 0));
 
+    graph.backprop(batchSize);
     auto it = history_.begin();
     for(auto& param : graph.params()) {
-      Element(_1 -= eta_ / Sqrt(_2) * _3, param.val(), *it, param.grad());
+      Element(_1 += _2 * _2, *it, param.grad());
+      Element(_1 -= eta_ / (fudgeFactor + Sqrt(_2)) * _3,
+              param.val(), *it, param.grad());
       it++;
     }
   }
 
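Two fixes land in Adagrad here: the squared gradient is now accumulated into the history before it is used, and the fudge factor keeps the update from dividing by Sqrt(0) on the first step. In scalar form (a sketch mirroring the Element calls, not Marian's actual kernels):

    #include <cmath>
    #include <cstddef>
    #include <vector>

    // Per-parameter Adagrad step: history accumulates squared gradients;
    // fudge avoids a zero denominator on the first update.
    void adagrad_update(std::vector<float>& val, std::vector<float>& history,
                        const std::vector<float>& grad,
                        float eta, float fudge = 1e-6f) {
      for (std::size_t i = 0; i < val.size(); ++i) {
        history[i] += grad[i] * grad[i];
        val[i] -= eta / (fudge + std::sqrt(history[i])) * grad[i];
      }
    }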
@@ -100,10 +100,10 @@ int main(int argc, char** argv) {
   std::cerr << "Building output layer..." << std::endl;
   std::vector<Expr> Yp;
 
-  Yp.emplace_back(softmax_fast(dot(H[0], Why) + by));
+  Yp.emplace_back(softmax(dot(H[0], Why) + by));
   Expr cross_entropy = sum(Y[0] * log(Yp[0]), axis=1);
   for (int t = 1; t < num_inputs; ++t) {
-    Yp.emplace_back(softmax_fast(dot(H[t], Why) + by));
+    Yp.emplace_back(softmax(dot(H[t], Why) + by));
     cross_entropy = cross_entropy + sum(Y[t] * log(Yp[t]), axis=1);
   }
   auto graph = -mean(cross_entropy, axis=0, name="cost");
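Only the call sites change here (softmax_fast becomes softmax), but the cost being assembled is worth spelling out: with one-hot targets Y[t], sum(Y[t] * log(Yp[t]), axis=1) reduces to the log-probability of the correct word at step t, and the final node is the negative mean of that sum over the batch. A scalar sketch for a single sentence (illustrative names only):

    #include <cmath>
    #include <cstddef>
    #include <vector>

    // Y[t] is a one-hot target row, Yp[t] the predicted distribution;
    // the inner sum picks out a single log-probability per time step.
    float sentence_cross_entropy(const std::vector<std::vector<float> >& Y,
                                 const std::vector<std::vector<float> >& Yp) {
      float ce = 0.0f;
      for (std::size_t t = 0; t < Y.size(); ++t)
        for (std::size_t i = 0; i < Y[t].size(); ++i)
          ce += Y[t][i] * std::log(Yp[t][i]);
      return -ce;  // the graph negates and then means over the batch
    }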
@@ -25,7 +25,7 @@ int main(int argc, char** argv) {
   Expr b = named(g.param(shape={1, LABEL_SIZE}), "b");
 
   auto scores = dot(x, w) + b;
-  auto lr = softmax_fast(scores);
+  auto lr = softmax(scores);
   auto cost = named(-mean(sum(y * log(lr), axis=1), axis=0), "cost");
   cerr << "lr=" << lr.Debug() << endl;
 
@@ -7,17 +7,19 @@
 using namespace marian;
 using namespace keywords;
 
-const int input_size = 10;
-const int output_size = 15;
-const int embedding_size = 8;
-const int hidden_size = 5;
-const int batch_size = 25;
-const int num_inputs = 8;
-const int num_outputs = 6;
-
-ExpressionGraph build_graph() {
+ExpressionGraph build_graph(int source_vocabulary_size,
+                            int target_vocabulary_size,
+                            int embedding_size,
+                            int hidden_size,
+                            int num_source_tokens,
+                            int num_target_tokens) {
   std::cerr << "Building computation graph..." << std::endl;
 
+  int input_size = source_vocabulary_size;
+  int output_size = target_vocabulary_size;
+  int num_inputs = num_source_tokens;
+  int num_outputs = num_target_tokens;
+
   ExpressionGraph g;
   std::vector<Expr> X, Y, H, S;
 
@@ -25,14 +27,14 @@ ExpressionGraph build_graph() {
   for (int t = 0; t <= num_inputs; ++t) {
     std::stringstream ss;
     ss << "X" << t;
-    X.emplace_back(named(g.input(shape={batch_size, input_size}), ss.str()));
+    X.emplace_back(named(g.input(shape={whatevs, input_size}), ss.str()));
   }
 
   // We're including the stop symbol here.
   for (int t = 0; t <= num_outputs; ++t) {
     std::stringstream ss;
     ss << "Y" << t;
-    Y.emplace_back(named(g.input(shape={batch_size, output_size}), ss.str()));
+    Y.emplace_back(named(g.input(shape={whatevs, output_size}), ss.str()));
   }
 
   // Source embeddings.
@@ -80,10 +82,10 @@ ExpressionGraph build_graph() {
 
   // Softmax layer and cost function.
   std::vector<Expr> Yp;
-  Yp.emplace_back(named(softmax_fast(dot(h0_d, Why) + by), "pred"));
+  Yp.emplace_back(named(softmax(dot(h0_d, Why) + by), "pred"));
   Expr cross_entropy = sum(Y[0] * log(Yp[0]), axis=1);
   for (int t = 1; t <= num_outputs; ++t) {
-    Yp.emplace_back(named(softmax_fast(dot(S[t-1], Why) + by), "pred"));
+    Yp.emplace_back(named(softmax(dot(S[t-1], Why) + by), "pred"));
     cross_entropy = cross_entropy + sum(Y[t] * log(Yp[t]), axis=1);
   }
   auto cost = named(-mean(cross_entropy, axis=0), "cost");
@@ -96,30 +98,124 @@ ExpressionGraph build_graph() {
 int main(int argc, char** argv) {
 #if 1
   std::cerr << "Loading the data... ";
-  Vocab sourceVocab, targetVocab;
+  Vocab source_vocab, target_vocab;
 
   // read parallel corpus from file
-  std::fstream sourceFile("../examples/mt/dev/newstest2013.de");
-  std::fstream targetFile("../examples/mt/dev/newstest2013.en");
+  std::fstream source_file("../examples/mt/dev/newstest2013.de");
+  std::fstream target_file("../examples/mt/dev/newstest2013.en");
 
+  // Right now we're only reading the first few sentence pairs, and defining
+  // that as the step size.
+  int batch_size = 64;
+  int num_source_tokens = -1;
+  int num_target_tokens = -1;
   std::vector<std::vector<size_t> > source_sentences, target_sentences;
-  std::string sourceLine, targetLine;
-  while (getline(sourceFile, sourceLine)) {
-    getline(targetFile, targetLine);
-    std::vector<size_t> sourceIds = sourceVocab.ProcessSentence(sourceLine);
-    std::vector<size_t> targetIds = targetVocab.ProcessSentence(targetLine);
-    source_sentences.push_back(sourceIds);
-    target_sentences.push_back(targetIds);
+  std::string source_line, target_line;
+  while (getline(source_file, source_line)) {
+    getline(target_file, target_line);
+    std::vector<size_t> source_ids = source_vocab.ProcessSentence(source_line);
+    source_ids.push_back(source_vocab.GetEOS()); // Append EOS token.
+    std::vector<size_t> target_ids = target_vocab.ProcessSentence(target_line);
+    target_ids.push_back(target_vocab.GetEOS()); // Append EOS token.
+    source_sentences.push_back(source_ids);
+    target_sentences.push_back(target_ids);
+    if (num_source_tokens < 0 || source_ids.size() > num_source_tokens) {
+      num_source_tokens = source_ids.size();
+    }
+    if (num_target_tokens < 0 || target_ids.size() > num_target_tokens) {
+      num_target_tokens = target_ids.size();
+    }
+    if (source_sentences.size() == batch_size) break;
   }
   std::cerr << "Done." << std::endl;
   std::cerr << source_sentences.size()
             << " sentence pairs read." << std::endl;
-  std::cerr << "Source vocabulary size: " << sourceVocab.Size() << std::endl;
-  std::cerr << "Target vocabulary size: " << targetVocab.Size() << std::endl;
-#endif
+  std::cerr << "Source vocabulary size: " << source_vocab.Size() << std::endl;
+  std::cerr << "Target vocabulary size: " << target_vocab.Size() << std::endl;
+  std::cerr << "Max source tokens: " << num_source_tokens << std::endl;
+  std::cerr << "Max target tokens: " << num_target_tokens << std::endl;
 
+  // Padding the source and target sentences.
+  for (auto &sentence : source_sentences) {
+    for (int i = sentence.size(); i < num_source_tokens; ++i) {
+      sentence.push_back(source_vocab.GetPAD());
+    }
+  }
+  for (auto &sentence : target_sentences) {
+    for (int i = sentence.size(); i < num_target_tokens; ++i) {
+      sentence.push_back(target_vocab.GetPAD());
+    }
+  }
 
-  std::cerr << "Building the encoder-decoder computation graph..." << std::endl;
-
   // Build the encoder-decoder computation graph.
-  ExpressionGraph g = build_graph();
+  int embedding_size = 50;
+  int hidden_size = 100;
+  ExpressionGraph g = build_graph(source_vocab.Size(),
+                                  target_vocab.Size(),
+                                  embedding_size,
+                                  hidden_size,
+                                  num_source_tokens-1,
+                                  num_target_tokens-1);
 
+  std::cerr << "Attaching the data to the computation graph..." << std::endl;
+
+  // Convert the data to dense one-hot vectors.
+  // TODO: make the graph handle sparse indices with a proper lookup layer.
+  for (int t = 0; t < num_source_tokens; ++t) {
+    Tensor Xt({batch_size, static_cast<int>(source_vocab.Size())});
+    std::vector<float> values(batch_size * source_vocab.Size(), 0.0);
+    int k = 0;
+    for (int i = 0; i < batch_size; ++i) {
+      values[k + source_sentences[i][t]] = 1.0;
+      k += source_vocab.Size();
+    }
+    thrust::copy(values.begin(), values.end(), Xt.begin());
+    // Attach this slice to the graph.
+    std::stringstream ss;
+    ss << "X" << t;
+    g[ss.str()] = Xt;
+  }
+
+  for (int t = 0; t < num_target_tokens; ++t) {
+    Tensor Yt({batch_size, static_cast<int>(target_vocab.Size())});
+    std::vector<float> values(batch_size * target_vocab.Size(), 0.0);
+    int k = 0;
+    for (int i = 0; i < batch_size; ++i) {
+      values[k + target_sentences[i][t]] = 1.0;
+      k += target_vocab.Size();
+    }
+    thrust::copy(values.begin(), values.end(), Yt.begin());
+    // Attach this slice to the graph.
+    std::stringstream ss;
+    ss << "Y" << t;
+    g[ss.str()] = Yt;
+  }
+
+#else
+
+  int source_vocabulary_size = 10;
+  int target_vocabulary_size = 15;
+  int embedding_size = 8;
+  int hidden_size = 5;
+  int batch_size = 25;
+  int num_source_tokens = 8;
+  int num_target_tokens = 6;
+
+  // Build the encoder-decoder computation graph.
+  ExpressionGraph g = build_graph(0, // cuda device.
+                                  source_vocabulary_size,
+                                  target_vocabulary_size,
+                                  embedding_size,
+                                  hidden_size,
+                                  num_source_tokens,
+                                  num_target_tokens);
+
+  int input_size = source_vocabulary_size;
+  int output_size = target_vocabulary_size;
+  int num_inputs = num_source_tokens;
+  int num_outputs = num_target_tokens;
+
   // Generate input data (include the stop symbol).
   for (int t = 0; t <= num_inputs; ++t) {
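The attachment loops above flatten each time slice into a row-major batch_size x vocabulary_size matrix: row i starts at offset k = i * vocab_size, and a single 1.0 marks the token id. A sketch of one slice in isolation (hypothetical helper, same layout):

    #include <cstddef>
    #include <vector>

    // Dense one-hot matrix for time step t, row-major:
    // values[i * vocab_size + token] = 1.0 for each sentence i.
    std::vector<float> one_hot_slice(
        const std::vector<std::vector<std::size_t> >& sentences,
        std::size_t t, std::size_t vocab_size) {
      std::vector<float> values(sentences.size() * vocab_size, 0.0f);
      std::size_t k = 0;
      for (std::size_t i = 0; i < sentences.size(); ++i) {
        values[k + sentences[i][t]] = 1.0f;
        k += vocab_size;
      }
      return values;
    }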
@@ -155,6 +251,7 @@ int main(int argc, char** argv) {
     ss << "Y" << t;
     g[ss.str()] = Yt;
   }
+#endif
 
   std::cerr << "Printing the computation graph..." << std::endl;
   std::cout << g.graphviz() << std::endl;
@@ -167,6 +264,7 @@ int main(int argc, char** argv) {
 
   std::cerr << g["cost"].val().Debug() << std::endl;
 
+#if 0
   std::cerr << g["X0"].val().Debug() << std::endl;
   std::cerr << g["Y0"].val().Debug() << std::endl;
   std::cerr << g["Whh"].grad().Debug() << std::endl;
@@ -175,6 +273,7 @@ int main(int argc, char** argv) {
   std::cerr << g["by"].grad().Debug() << std::endl;
   std::cerr << g["Wxh"].grad().Debug() << std::endl;
   std::cerr << g["h0"].grad().Debug() << std::endl;
+#endif
 
   return 0;
 }
@@ -32,7 +32,7 @@ ExpressionGraph build_graph() {
     init=from_vector(bData)), "b");
 
   auto probs = named(
-    softmax_fast(dot(x, w) + b), //, axis=1),
+    softmax(dot(x, w) + b), //, axis=1),
     "probs"
   );
 
@@ -68,7 +68,7 @@ int main(int argc, char** argv) {
 
   std::cerr << "Building model...";
   auto layer1 = tanh(dot(x, w1) + b1);
-  auto layer2 = softmax(dot(layer1, w2) + b2, axis=1, name="layer2");
+  auto layer2 = softmax(dot(layer1, w2) + b2);
   auto predict = layer2;
 
   std::cerr << "Done." << std::endl;