Merge ../Marian

This commit is contained in:
Hieu Hoang 2016-09-16 17:39:31 +02:00
commit 7803f44a97
7 changed files with 402 additions and 122 deletions

View File

@ -18,7 +18,7 @@ struct Chainable {
virtual void allocate(size_t) = 0;
virtual std::string graphviz() = 0;
virtual const std::string &name() const = 0;
virtual const Shape& shape() = 0;
virtual DataType &val() = 0;

View File

@ -52,10 +52,10 @@ class ExpressionGraph {
std::stringstream ss;
ss << "digraph ExpressionGraph {" << std::endl;
ss << "rankdir=BT" << std::endl;
typedef typename ChainableStack::reverse_iterator It;
for(It it = stack_->rbegin(); it != stack_->rend(); ++it)
for(It it = stack_->rbegin(); it != stack_->rend(); ++it) {
ss << (*it)->graphviz();
}
ss << "}" << std::endl;
return ss.str();
}

View File

@ -68,6 +68,8 @@ class Node : public Chainable<Tensor>,
return shape_;
}
const std::string &name() const { return name_; }
protected:
Shape shape_;
std::string name_;

View File

@ -1,4 +1,21 @@
#pragma once
/* Copyright (C)
* 2016 - MLAMU & friends
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*
*/
#include <cublas_v2.h>
#include <thrust/device_vector.h>
@ -12,6 +29,13 @@
namespace marian {
/**
* @brief Debug shape by printing it.
*
* @param shape Shape of Tensor.
*
* @return String of shape.
*/
inline std::string Debug(const Shape &shape)
{
std::stringstream strm;
@ -23,6 +47,13 @@ inline std::string Debug(const Shape &shape)
return strm.str();
}
/**
* @brief Calculate the vector size based on Tensor shape.
*
* @param shape Shape of Tensor.
*
* @return Size of Tensor vector.
*/
inline size_t GetTotalSize(const Shape &shape)
{
size_t ret = std::accumulate(shape.begin(), shape.end(),
@ -30,17 +61,28 @@ inline size_t GetTotalSize(const Shape &shape)
return ret;
}
/**
* @brief This class manages the Tensor on the GPU.
*
* @tparam Float Data type.
*/
template<class Float>
class TensorImpl {
private:
Shape shape_;
thrust::device_vector<Float> data_;
size_t tno_;
static size_t tensorCounter;
Shape shape_; /*!< Dimensions of Tensor */
thrust::device_vector<Float> data_; /*!< Vector of data that Tensor is managing on GPU. */
size_t tno_; /*!< Tensor number, taken from tensorCounter at construction */
static size_t tensorCounter; /*!< Static counter of created Tensors */
public:
typedef Float value_type;
typedef Float value_type; /*!< Tensor value type */
/**
* @brief Constructor
*
* @param shape Shape of Tensor.
* @param value Value to fill Tensor's vector with.
*/
TensorImpl(const Shape& shape, value_type value = 0)
: shape_(shape), tno_(tensorCounter++)
{
@ -59,54 +101,122 @@ class TensorImpl {
TensorImpl(const TensorImpl&) = delete;
TensorImpl(TensorImpl&&) = delete;
/**
* @brief Get the i-th element of Tensor vector.
*
* @param i Index.
*
* @return Value of Tensor vector indexed with i.
*/
value_type operator[](size_t i) const {
// Reads a single element out of the thrust::device_vector and returns it by value.
return data_[i];
}
/**
* @brief Get begin iterator of Tensor's vector.
*
* @return Vector begin iterator.
*/
auto begin() -> decltype( data_.begin() ) {
// Non-const overload: the return type is whatever data_.begin() yields on the mutable vector.
return data_.begin();
}
/**
* @brief Get begin iterator of Tensor's vector (const).
*
* @return Vector begin iterator (const)
*/
auto begin() const -> decltype( data_.begin() ) {
// Const overload: data_ is const here, so this forwards to the vector's const begin().
return data_.begin();
}
/**
* @brief Get end iterator of Tensor's vector.
*
* @return Vector end iterator
*/
auto end() -> decltype( data_.end() ) {
// Non-const overload: the return type is whatever data_.end() yields on the mutable vector.
return data_.end();
}
/**
* @brief Get end iterator of Tensor's vector (const).
*
* @return Vector end iterator (const)
*/
auto end() const -> decltype( data_.end() ) {
// Const overload: data_ is const here, so this forwards to the vector's const end().
return data_.end();
}
/**
* @brief Get Tensor's shape (const)
*
* @return Shape of Tensor
*/
const Shape& shape() const {
// Returned by const reference to the shape_ member, not as a copy.
return shape_;
}
/**
* @brief Get size of Tensor's vector.
*
* @return Length of Tensor's vector.
*/
size_t size() const {
// Element count of the underlying device vector.
return data_.size();
}
/**
* @brief Get a raw pointer to the Tensor's data vector (via thrust::raw_pointer_cast).
*
* @return Pointer of value_type array.
*/
value_type* data() {
// thrust::raw_pointer_cast turns the vector's data() pointer into a plain value_type*.
// NOTE(review): presumably this is a device address that must not be dereferenced on the host - confirm.
return thrust::raw_pointer_cast(data_.data());
}
/**
* @brief Get Tensor id (number).
*
* @return Tensor id.
*/
size_t id() const {
// tno_ is set from tensorCounter in the constructor's initializer list.
return tno_;
}
/**
* @brief Fill Tensor's vector with specified value on the GPU.
*
* @param value Value to fill vector with.
*/
void set(value_type value) {
// Overwrites every element in [data_.begin(), data_.end()) with value.
thrust::fill(data_.begin(), data_.end(), value);
}
/**
* @brief Set Tensor's vector to values of specified vector by copying it to GPU.
*
* @param begin Begin iterator of a vector.
* @param end End iterator of a vector.
*/
void set(const std::vector<float>::const_iterator &begin, const std::vector<float>::const_iterator &end) {
// No length check is made here: [begin, end) is copied starting at data_.begin().
// NOTE(review): presumably callers must pass at most size() elements - confirm.
thrust::copy(begin, end, data_.begin());
}
/**
* @brief Copy Tensor's vector from GPU to vector variable on CPU.
*
* @param out Iterator to the start of the destination range to copy data to.
*/
void get(std::vector<float>::iterator out) {
// Copies the whole vector, [data_.begin(), data_.end()), to the range starting at out.
// NOTE(review): presumably out must have room for size() elements - confirm.
thrust::copy(data_.begin(), data_.end(), out);
}
/**
* @brief Debug function.
*
* @return Vector in string form.
*/
std::string Debug() const
{
std::stringstream strm;
@ -133,78 +243,170 @@ class TensorImpl {
// Out-of-class definition of the static counter; each TensorImpl<Type>
// instantiation gets its own counter, starting at 0.
template <typename Type>
size_t TensorImpl<Type>::tensorCounter = 0;
/**
* @brief Class that communicates with GPU's Tensor.
*/
class Tensor {
private:
std::shared_ptr<TensorImpl<Float>> pimpl_;
std::shared_ptr<TensorImpl<Float>> pimpl_; /*!< Pointer to Tensor working on GPU */
public:
typedef TensorImpl<Float>::value_type value_type;
typedef TensorImpl<Float>::value_type value_type; /*!< Get value type of GPU's Tensor data */
/**
* @brief Default constructor
*/
Tensor() {}
/**
* @brief Constructor that allocates memory.
*
* @param shape Shape of Tensor.
* @param value Value to fill Tensor's vector with.
*/
Tensor(const Shape& shape, value_type value = 0) {
// pimpl_ is still empty at this point, so allocate() always creates a new TensorImpl.
allocate(shape, value);
}
/**
* @brief Default destructor
*/
~Tensor() = default;  // pimpl_ releases its reference by itself
/**
* @brief Allocate memory if Tensor doesn't exist on GPU. Otherwise, do nothing.
*
* @param shape Shape of Tensor.
* @param value Value to fill Tensor's vector with.
*/
void allocate(const Shape& shape, value_type value = 0) {
  // Already backed by an implementation: keep it and ignore both arguments.
  if(pimpl_)
    return;

  pimpl_.reset(new TensorImpl<Float>(shape, value));
}
/**
* @brief Get i-th element of GPU Tensor vector (const).
*
* @param i Index.
*
* @return Value of specified element of Tensor.
*/
value_type operator[](size_t i) const {
  // Forward the index to the implementation's own operator[].
  const TensorImpl<Float>& impl = *pimpl_;
  return impl[i];
}
/**
* @brief Get size of GPU Tensor's vector.
*
* @return Size of Tensor vector.
*/
size_t size() const {
return pimpl_->size();
}
/**
* @brief Return pointer to GPU Tensor's data.
*
* @return Pointer to GPU Tensor's data.
*/
value_type* data() {
  // Hand out the implementation's raw data pointer unchanged.
  TensorImpl<Float>& impl = *pimpl_;
  return impl.data();
}
/**
* @brief Return pointer to GPU Tensor's data (const).
*
* @return Pointer to GPU Tensor's data.
*/
const value_type* data() const {
  // The pointee of pimpl_ is not const-qualified, so TensorImpl's non-const
  // data() is callable here; only the returned pointer is made const.
  TensorImpl<Float>& impl = *pimpl_;
  return impl.data();
}
/**
* @brief Get begin iterator of GPU Tensor's vector.
*
* @return Vector begin iterator.
*/
auto begin() -> decltype( pimpl_->begin() ) {
  // Forward to the implementation's begin().
  TensorImpl<Float>& impl = *pimpl_;
  return impl.begin();
}
/**
* @brief Get begin iterator of GPU Tensor's vector (const).
*
* @return Vector begin iterator (const)
*/
auto begin() const -> decltype( pimpl_->begin() ) {
  // pimpl_ points to a non-const TensorImpl even in a const Tensor, so this
  // resolves to the implementation's non-const begin().
  TensorImpl<Float>& impl = *pimpl_;
  return impl.begin();
}
/**
* @brief Get end iterator of Tensor's vector.
*
* @return Vector end iterator
*/
auto end() -> decltype( pimpl_->end() ) {
  // Forward to the implementation's end().
  TensorImpl<Float>& impl = *pimpl_;
  return impl.end();
}
/**
* @brief Get end iterator of Tensor's vector (const).
*
* @return Vector end iterator (const)
*/
auto end() const -> decltype( pimpl_->end() ) {
  // pimpl_ points to a non-const TensorImpl even in a const Tensor, so this
  // resolves to the implementation's non-const end().
  TensorImpl<Float>& impl = *pimpl_;
  return impl.end();
}
/**
* @brief Get GPU Tensor's shape.
*
* @return Tensor's shape.
*/
const Shape& shape() const {
  // The reference returned refers to the shape stored inside the implementation.
  const TensorImpl<Float>& impl = *pimpl_;
  return impl.shape();
}
/**
* @brief Fill GPU Tensor's vector with specified value.
*
* @param value Value to fill Tensor with.
*/
void set(value_type value) {
  // Let the implementation fill its vector with the given value.
  TensorImpl<Float>& impl = *pimpl_;
  impl.set(value);
}
/**
* @brief Get GPU Tensor id (number).
*
* @return Tensor id.
*/
size_t id() const {
return pimpl_->id();
}
/**
* @brief Check if Tensor is allocated.
*
* @return True or False
*/
operator bool() {
return pimpl_ != nullptr;
}
/**
* @brief Run Debug on GPU Tensor.
*
* @return String of Tensor's data.
*/
std::string Debug() const
{
return pimpl_->Debug();
}
/**
* @brief Print the Tensor's elements to std::cerr, each followed by a space (const).
*/
void Print() const {
for (int i = 0; i < size(); ++i) {
std::cerr << (*this)[i] << " ";
@ -213,21 +415,59 @@ class Tensor {
}
//void Load(const std::string &path);
/**
* @brief Set GPU Tensor's vector to values of specified vector.
*
* @param data Vector copied to GPU.
*/
void set(const std::vector<float>& data);
/**
* @brief Fill GPU Tensor's vector using values from the specified vector.
*
* @param begin Begin iterator of vector being copied.
* @param end End iterator of vector being copied.
*/
void set(const std::vector<float>::const_iterator &begin, const std::vector<float>::const_iterator &end);
/**
* @brief Copy Tensor's vector from GPU to vector variable on CPU (const).
*
* @param out Vector iterator used in copying.
*/
void get(std::vector<float>::iterator out) const {
pimpl_->get(out);
}
/**
* @brief Copy Tensor's vector from GPU to vector variable on CPU.
*
* @param vout Vector to copy data to; it is resized to the Tensor's size first.
*/
void get(std::vector<float> &vout) const {
  // Size the destination to the Tensor's element count before copying into it.
  const size_t count = size();
  vout.resize(count);
  pimpl_->get(vout.begin());
}
};
/**
* @brief Operator to set data on Tensor using vector.
*
* @param t Tensor.
* @param vec Vector used to set data in Tensor.
*
* @return Tensor with assigned data.
*/
Tensor& operator<<(Tensor& t, const std::vector<float> &vec);
/**
* @brief Operator to get data from Tensor to vector.
*
* @param vec Vector to save copied data.
* @param t Tensor to copy data from.
*
* @return Vector with copied data.
*/
std::vector<float>& operator<<(std::vector<float> &vec, const Tensor& t);
}

View File

@ -1,97 +1,129 @@
#include "marian.h"
#include "mnist.h"
#include "vocab.h"
#include <assert.h>
#if 0
ExpressionGraph build_graph() {
std::cerr << "Loading model params...";
using namespace marian;
using namespace keywords;
const int input_size = 10;
const int output_size = 15;
const int embedding_size = 8;
const int hidden_size = 5;
const int batch_size = 25;
const int num_inputs = 8;
const int num_outputs = 6;
/**
 * @brief Build the encoder-decoder RNN expression graph.
 *
 * Creates num_inputs+1 inputs named "X0".."X<num_inputs>", num_outputs+1
 * inputs named "Y0".."Y<num_outputs>", the parameters E, Wxh, Whh, bh, h0,
 * Wxh_d, Whh_d, bh_d, Why and by, and a final node named "cost" defined as
 * the negated mean (axis 0) of the accumulated sum(Y[t] * log(Yp[t]), axis=1).
 * Sizes come from the file-level constants (batch_size, input_size, ...).
 *
 * @param cuda_device Value passed to the ExpressionGraph constructor.
 *
 * @return The graph holding all nodes created here.
 */
ExpressionGraph build_graph(int cuda_device) {
std::cerr << "Building computation graph..." << std::endl;
ExpressionGraph g(cuda_device);
std::vector<Expr> X, Y, H, S;
// We're including the stop symbol here.
// (hence t runs up to and including num_inputs)
for (int t = 0; t <= num_inputs; ++t) {
std::stringstream ss;
ss << "X" << t;
X.emplace_back(named(g.input(shape={batch_size, input_size}), ss.str()));
}
// We're including the stop symbol here.
// (hence t runs up to and including num_outputs)
for (int t = 0; t <= num_outputs; ++t) {
std::stringstream ss;
ss << "Y" << t;
Y.emplace_back(named(g.input(shape={batch_size, output_size}), ss.str()));
}
// Source embeddings.
Expr E = named(g.param(shape={input_size, embedding_size},
init=uniform()), "E");
// Source RNN parameters.
Expr Wxh = named(g.param(shape={embedding_size, hidden_size},
init=uniform()), "Wxh");
Expr Whh = named(g.param(shape={hidden_size, hidden_size},
init=uniform()), "Whh");
Expr bh = named(g.param(shape={1, hidden_size},
init=uniform()), "bh");
Expr h0 = named(g.param(shape={1, hidden_size},
init=uniform()), "h0");
std::cerr << "Building encoder RNN..." << std::endl;
// Encoder: H[t] = tanh(X[t]*E*Wxh + H[t-1]*Whh + bh); the first step uses h0
// in place of a previous state.
H.emplace_back(tanh(dot(dot(X[0], E), Wxh) + dot(h0, Whh) + bh));
for (int t = 1; t <= num_inputs; ++t) {
H.emplace_back(tanh(dot(dot(X[t], E), Wxh) + dot(H[t-1], Whh) + bh));
}
// Target RNN parameters.
Expr Wxh_d = named(g.param(shape={output_size, hidden_size},
init=uniform()), "Wxh_d");
Expr Whh_d = named(g.param(shape={hidden_size, hidden_size},
init=uniform()), "Whh_d");
Expr bh_d = named(g.param(shape={1, hidden_size},
init=uniform()), "bh_d");
std::cerr << "Building decoder RNN..." << std::endl;
// The decoder starts from the last encoder state.
auto h0_d = H[num_inputs];
// S gets num_outputs states (t = 0..num_outputs-1); S[t] consumes Y[t].
S.emplace_back(tanh(dot(Y[0], Wxh_d) + dot(h0_d, Whh_d) + bh_d));
for (int t = 1; t < num_outputs; ++t) {
S.emplace_back(tanh(dot(Y[t], Wxh_d) + dot(S[t-1], Whh_d) + bh_d));
}
// Output linear layer before softmax.
Expr Why = named(g.param(shape={hidden_size, output_size},
init=uniform()), "Why");
Expr by = named(g.param(shape={1, output_size},
init=uniform()), "by");
std::cerr << "Building output layer..." << std::endl;
// Softmax layer and cost function.
// Yp[0] is computed from h0_d, Yp[t] (t >= 1) from S[t-1]; every Yp node is
// given the same name "pred".
std::vector<Expr> Yp;
Yp.emplace_back(named(softmax_fast(dot(h0_d, Why) + by), "pred"));
Expr cross_entropy = sum(Y[0] * log(Yp[0]), axis=1);
for (int t = 1; t <= num_outputs; ++t) {
Yp.emplace_back(named(softmax_fast(dot(S[t-1], Why) + by), "pred"));
cross_entropy = cross_entropy + sum(Y[t] * log(Yp[t]), axis=1);
}
// cost is not used again in this function.
// NOTE(review): presumably named() registers it so it can be fetched as g["cost"] - confirm.
auto cost = named(-mean(cross_entropy, axis=0), "cost");
std::cerr << "Done." << std::endl;
return g;
}
int main(int argc, char** argv) {
#if 1
std::cerr << "Loading the data... ";
Vocab sourceVocab, targetVocab;
// read parallel corpus from file
std::fstream sourceFile("../examples/mt/dev/newstest2013.de");
std::fstream targetFile("../examples/mt/dev/newstest2013.en");
std::vector<std::vector<size_t> > source_sentences, target_sentences;
std::string sourceLine, targetLine;
while (getline(sourceFile, sourceLine)) {
getline(targetFile, targetLine);
std::vector<size_t> sourceIds = sourceVocab.ProcessSentence(sourceLine);
std::vector<size_t> targetIds = sourceVocab.ProcessSentence(targetLine);
std::vector<size_t> targetIds = targetVocab.ProcessSentence(targetLine);
source_sentences.push_back(sourceIds);
target_sentences.push_back(targetIds);
}
std::cerr << "Done." << std::endl;
std::cerr << source_sentences.size()
<< " sentence pairs read." << std::endl;
std::cerr << "Source vocabulary size: " << sourceVocab.Size() << std::endl;
std::cerr << "Target vocabulary size: " << targetVocab.Size() << std::endl;
#endif
// Build the encoder-decoder computation graph.
ExpressionGraph g = build_graph(0);
int main(int argc, char** argv) {
using namespace marian;
using namespace keywords;
int input_size = 10;
int output_size = 15;
int batch_size = 25;
int hidden_size = 5;
int num_inputs = 8;
int num_outputs = 6;
ExpressionGraph g(0);
std::vector<Expr*> X(num_inputs+1); // For the stop symbol.
std::vector<Expr*> Y(num_outputs);
std::vector<Expr*> H(num_inputs+1); // For the stop symbol.
std::vector<Expr*> S(num_outputs);
// For the stop symbol.
for (int t = 0; t <= num_inputs; ++t) {
X[t] = new Expr(g.input(shape={batch_size, input_size}));
}
// For the stop symbol.
for (int t = 0; t <= num_outputs; ++t) {
Y[t] = new Expr(g.input(shape={batch_size, output_size}));
}
Expr Wxh = g.param(shape={input_size, hidden_size}, init=uniform(), name="Wxh");
Expr Whh = g.param(shape={hidden_size, hidden_size}, init=uniform(), name="Whh");
Expr bh = g.param(shape={1, hidden_size}, init=uniform(), name="bh");
Expr h0 = g.param(shape={1, hidden_size}, init=uniform(), name="h0");
std::cerr << "Building encoder RNN..." << std::endl;
H[0] = new Expr(tanh(dot(*X[0], Wxh) + dot(h0, Whh) + bh));
for (int t = 1; t <= num_inputs; ++t) {
H[t] = new Expr(tanh(dot(*X[t], Wxh) + dot(*H[t-1], Whh) + bh));
}
Expr Wxh_d = g.param(shape={output_size, hidden_size}, init=uniform(), name="Wxh_d");
Expr Whh_d = g.param(shape={hidden_size, hidden_size}, init=uniform(), name="Whh_d");
Expr bh_d = g.param(shape={1, hidden_size}, init=uniform(), name="bh_d");
std::cerr << "Building decoder RNN..." << std::endl;
auto h0_d = *H[num_inputs];
S[0] = new Expr(tanh(dot(*Y[0], Wxh_d) + dot(h0_d, Whh_d) + bh_d));
for (int t = 1; t < num_outputs; ++t) {
S[t] = new Expr(tanh(dot(*Y[t], Wxh_d) + dot(*S[t-1], Whh_d) + bh_d));
}
Expr Why = g.param(shape={hidden_size, output_size}, init=uniform(), name="Why");
Expr by = g.param(shape={1, output_size}, init=uniform(), name="by");
std::cerr << "Building output layer..." << std::endl;
std::vector<Expr*> Yp(num_outputs+1); // For the stop symbol.
Expr* cross_entropy = NULL;
for (int t = 0; t <= num_outputs; ++t) {
if (t == 0) {
Yp[t] = new Expr(named(softmax_fast(dot(h0_d, Why) + by), "pred"));
cross_entropy = new Expr(sum(*Y[t] * log(*Yp[t]), axis=1));
} else {
Yp[t] = new Expr(named(softmax_fast(dot(*S[t-1], Why) + by), "pred"));
*cross_entropy = *cross_entropy + sum(*Y[t] * log(*Yp[t]), axis=1);
}
}
auto graph = -mean(*cross_entropy, axis=0, name="cost");
// For the stop symbol.
// Generate input data (include the stop symbol).
for (int t = 0; t <= num_inputs; ++t) {
Tensor Xt({batch_size, input_size});
float max = 1.;
std::vector<float> values(batch_size * input_size);
std::vector<float> classes(batch_size * output_size, 0.0);
@ -101,13 +133,14 @@ int main(int argc, char** argv) {
values[k] = max * (2.0*static_cast<float>(rand()) / RAND_MAX - 1.0);
}
}
thrust::copy(values.begin(), values.end(), Xt.begin());
*X[t] = Xt;
std::stringstream ss;
ss << "X" << t;
g[ss.str()] = Xt;
}
for (int t = 0; t < num_outputs; ++t) {
// Generate output data (include the stop symbol).
for (int t = 0; t <= num_outputs; ++t) {
Tensor Yt({batch_size, output_size});
std::vector<float> classes(batch_size * output_size, 0.0);
@ -117,26 +150,31 @@ int main(int argc, char** argv) {
classes[l + gold] = 1.0;
l += output_size;
}
thrust::copy(classes.begin(), classes.end(), Yt.begin());
*Y[t] = Yt;
std::stringstream ss;
ss << "Y" << t;
g[ss.str()] = Yt;
}
std::cerr << "Printing the computation graph..." << std::endl;
std::cout << g.graphviz() << std::endl;
std::cerr << "Running the forward step..." << std::endl;
g.forward(batch_size);
std::cerr << "Running the backward step..." << std::endl;
g.backward();
std::cerr << "Done." << std::endl;
std::cerr << graph.val().Debug() << std::endl;
std::cerr << g["cost"].val().Debug() << std::endl;
std::cerr << X[0]->val().Debug() << std::endl;
std::cerr << Y[0]->val().Debug() << std::endl;
std::cerr << Whh.grad().Debug() << std::endl;
std::cerr << bh.grad().Debug() << std::endl;
std::cerr << Why.grad().Debug() << std::endl;
std::cerr << by.grad().Debug() << std::endl;
std::cerr << Wxh.grad().Debug() << std::endl;
std::cerr << h0.grad().Debug() << std::endl;
std::cerr << g["X0"].val().Debug() << std::endl;
std::cerr << g["Y0"].val().Debug() << std::endl;
std::cerr << g["Whh"].grad().Debug() << std::endl;
std::cerr << g["bh"].grad().Debug() << std::endl;
std::cerr << g["Why"].grad().Debug() << std::endl;
std::cerr << g["by"].grad().Debug() << std::endl;
std::cerr << g["Wxh"].grad().Debug() << std::endl;
std::cerr << g["h0"].grad().Debug() << std::endl;
return 0;
}

View File

@ -24,22 +24,6 @@ inline std::vector<std::string> Tokenize(const std::string& str,
return tokens;
}
////////////////////////////////////////////////////////
size_t Vocab::GetUNK() const
{
return std::numeric_limits<size_t>::max();
}
size_t Vocab::GetPad() const
{
return std::numeric_limits<size_t>::max() - 1;
}
size_t Vocab::GetEOS() const
{
return std::numeric_limits<size_t>::max() - 2;
}
size_t Vocab::GetOrCreate(const std::string &word)
{
@ -55,6 +39,12 @@ size_t Vocab::GetOrCreate(const std::string &word)
return id;
}
size_t Vocab::Get(const std::string &word) const
{
Coll::const_iterator iter = coll_.find(word);
return iter->second;
}
std::vector<size_t> Vocab::ProcessSentence(const std::string &sentence)
{
vector<string> toks = Tokenize(sentence);

View File

@ -7,12 +7,22 @@
class Vocab
{
public:
// Registers the three reserved tokens, in this order, through GetOrCreate()
// so that GetUNK()/GetPAD()/GetEOS() can look them up by name later.
Vocab() {
GetOrCreate("__UNK__");
GetOrCreate("__PAD__");
GetOrCreate("__EOS__");
}
virtual ~Vocab() {}
public:
size_t Size() const { return coll_.size(); }
size_t Get(const std::string &word) const;
size_t GetOrCreate(const std::string &word);
std::vector<size_t> ProcessSentence(const std::string &sentence);
size_t GetUNK() const;
size_t GetPad() const;
size_t GetEOS() const;
// Id of the reserved "__UNK__" token, looked up by name through Get().
size_t GetUNK() const { return Get("__UNK__"); }
// Id of the reserved "__PAD__" token, looked up by name through Get().
size_t GetPAD() const { return Get("__PAD__"); }
// Id of the reserved "__EOS__" token, looked up by name through Get().
size_t GetEOS() const { return Get("__EOS__"); }
protected:
typedef std::unordered_map<std::string, size_t> Coll;
Coll coll_;