towards lazy allocation

Marcin Junczys-Dowmunt 2016-05-09 11:55:24 +02:00
parent 3db0f30312
commit 4b79f3e72a
5 changed files with 68 additions and 658 deletions


@@ -1,400 +0,0 @@
#pragma once
#include <memory>
#include <functional>
#include <vector>
#include <cmath>
#include <numeric>
#include <algorithm>
#include <cudnn.h>
#include <cublas_v2.h>
#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include "exception.h"
#include "thrust_functions.h"
namespace marian {
struct Handles {
cudnnHandle_t cudnnHandle;
cublasHandle_t cublasHandle;
cudnnOpTensorDescriptor_t add;
Handles() {
cudnnCreate(&cudnnHandle);
cublasCreate(&cublasHandle);
cudnnCreateOpTensorDescriptor(&add);
cudnnSetOpTensorDescriptor(add, CUDNN_OP_TENSOR_ADD, CUDNN_DATA_FLOAT, CUDNN_NOT_PROPAGATE_NAN);
}
~Handles() {
cudnnDestroy(cudnnHandle);
cublasDestroy(cublasHandle);
cudnnDestroyOpTensorDescriptor(add);
}
};
Handles handles;
typedef std::vector<int> Shape;
template<class Float>
class TensorImpl {
private:
Shape shape_;
thrust::device_vector<Float> data_;
cudnnTensorDescriptor_t desc_;
size_t tno_;
static size_t tensorCounter;
cudnnDataType_t dataType() {
switch(sizeof(Float)) {
case 2: return CUDNN_DATA_HALF;
case 8: return CUDNN_DATA_DOUBLE;
default: return CUDNN_DATA_FLOAT;
}
}
public:
typedef Float value_type;
TensorImpl(const Shape& shape, value_type value = 0)
: shape_(shape), tno_(tensorCounter++)
{
// @TODO:
UTIL_THROW_IF2(shape_.size() != 2,
"For now, only 2D Tensors, will be fixed later.");
UTIL_THROW_IF2(shape_.size() < 1 || shape_.size() > 4,
"Wrong number of dimensions: " << shape_.size());
int size = std::accumulate(shape_.begin(), shape_.end(),
1, std::multiplies<int>());
data_.resize(size, value);
cudnnCreateTensorDescriptor(&desc_);
switch (shape_.size()) {
case 1:
cudnnSetTensor4dDescriptor(desc_, CUDNN_TENSOR_NCHW, dataType(),
shape_[0], 1, 1, 1); break;
case 2:
cudnnSetTensor4dDescriptor(desc_, CUDNN_TENSOR_NCHW, dataType(),
shape_[0], shape_[1], 1, 1); break;
case 3:
cudnnSetTensor4dDescriptor(desc_, CUDNN_TENSOR_NCHW, dataType(),
shape_[0], shape_[1], shape_[2], 1); break;
case 4:
cudnnSetTensor4dDescriptor(desc_, CUDNN_TENSOR_NCHW, dataType(),
shape_[0], shape_[1], shape_[2], shape_[3]); break;
}
}
TensorImpl(const TensorImpl&) = delete;
TensorImpl(TensorImpl&&) = delete;
~TensorImpl() {
cudnnDestroyTensorDescriptor(desc_);
}
value_type operator[](size_t i) const {
return data_[i];
}
auto begin() -> decltype( data_.begin() ) {
return data_.begin();
}
auto begin() const -> decltype( data_.begin() ) {
return data_.begin();
}
auto end() -> decltype( data_.end() ) {
return data_.end();
}
auto end() const -> decltype( data_.end() ) {
return data_.end();
}
const Shape& shape() const {
return shape_;
}
size_t size() const {
return data_.size();
}
value_type* data() {
return thrust::raw_pointer_cast(data_.data());
}
cudnnTensorDescriptor_t desc() const {
return desc_;
}
size_t id() const {
return tno_;
}
void set(value_type value) {
thrust::fill(data_.begin(), data_.end(), value);
}
};
template <typename Type>
size_t TensorImpl<Type>::tensorCounter = 0;
class Tensor {
private:
std::shared_ptr<TensorImpl<float>> pimpl_;
public:
typedef TensorImpl<float>::value_type value_type;
Tensor(const Shape& shape, value_type value = 0)
: pimpl_(new TensorImpl<value_type>(shape, value)) {}
// Single value with broadcasting super powers. Might be
// worth getting rid of this performance-wise, but it saves
// so much typing when defining operators.
Tensor(value_type value)
: pimpl_(new TensorImpl<value_type>({1, 1}, value)) {}
Tensor() {}
~Tensor() {}
value_type operator[](size_t i) const {
return (*pimpl_)[i];
}
size_t size() const {
return pimpl_->size();
}
value_type* data() {
return pimpl_->data();
}
const value_type* data() const {
return pimpl_->data();
}
auto begin() -> decltype( pimpl_->begin() ) {
return pimpl_->begin();
}
auto begin() const -> decltype( pimpl_->begin() ) {
return pimpl_->begin();
}
auto end() -> decltype( pimpl_->end() ) {
return pimpl_->end();
}
auto end() const -> decltype( pimpl_->end() ) {
return pimpl_->end();
}
const Shape& shape() const {
return pimpl_->shape();
}
cudnnTensorDescriptor_t desc() const {
return pimpl_->desc();
}
void set(value_type value) {
pimpl_->set(value);
}
size_t id() const {
return pimpl_->id();
}
operator bool() {
return pimpl_ != nullptr;
}
};
Tensor uniform(Tensor t, float a=-0.1, float b=0.1) {
std::vector<float> r(t.size());
for(int i = 0; i < r.size(); i++)
r[i] = (float(rand() % 2000) - 1000.0)/10000.0;
thrust::copy(r.begin(), r.end(), t.begin());
return t;
}
using namespace thrust::placeholders;
#define MAX_THREADS 512
#define MAX_BLOCKS 65535
template <class Functor>
__global__ void gElement(Functor functor, float* out,
size_t rows, size_t cols) {
for(int bid = 0; bid < rows; bid += gridDim.x) {
int j = bid + blockIdx.x;
if(j < rows) {
float* rowOut = out + j * cols;
for(int tid = 0; tid < cols; tid += blockDim.x) {
int i = tid + threadIdx.x;
if(i < cols)
rowOut[i] = functor(rowOut[i]);
}
}
}
}
template <class Functor>
__global__ void gElement(Functor functor,
float* out, const float* in,
size_t rows, size_t cols) {
for(int bid = 0; bid < rows; bid += gridDim.x) {
int j = bid + blockIdx.x;
if(j < rows) {
float* rowOut = out + j * cols;
const float* rowIn = in + j * cols;
for(int tid = 0; tid < cols; tid += blockDim.x) {
int i = tid + threadIdx.x;
if(i < cols)
rowOut[i] = functor(rowOut[i], rowIn[i]);
}
}
}
}
template <class Functor>
__global__ void gElement(Functor functor,
float* out, const float* in1, const float* in2,
size_t rows, size_t cols) {
for(int bid = 0; bid < rows; bid += gridDim.x) {
int j = bid + blockIdx.x;
if(j < rows) {
float* rowOut = out + j * cols;
const float* rowIn1 = in1 + j * cols;
const float* rowIn2 = in2 + j * cols;
for(int tid = 0; tid < cols; tid += blockDim.x) {
int i = tid + threadIdx.x;
if(i < cols)
rowOut[i] = functor(rowOut[i], rowIn1[i], rowIn2[i]);
}
}
}
}
template <class Functor>
__global__ void gElement(Functor functor,
float* out, const float* in1,
const float* in2, const float* in3,
size_t rows, size_t cols) {
for(int bid = 0; bid < rows; bid += gridDim.x) {
int j = bid + blockIdx.x;
if(j < rows) {
float* rowOut = out + j * cols;
const float* rowIn1 = in1 + j * cols;
const float* rowIn2 = in2 + j * cols;
const float* rowIn3 = in3 + j * cols;
for(int tid = 0; tid < cols; tid += blockDim.x) {
int i = tid + threadIdx.x;
if(i < cols)
rowOut[i] = functor(rowOut[i], rowIn1[i], rowIn2[i], rowIn3[i]);
}
}
}
}
// @TODO add broadcasting
template <class Functor>
void Element(Functor functor, Tensor Out) {
float* d_out = Out.data();
int blocks = std::min(MAX_BLOCKS, (int)Out.shape()[0]);
int threads = std::min(MAX_THREADS, (int)Out.shape()[1]);
gElement<<<blocks, threads>>>(functor, d_out,
Out.shape()[0], Out.shape()[1]);
cudaStreamSynchronize(0);
}
template <class Functor>
void Element(Functor functor,
Tensor Out, const Tensor In) {
float* d_out = Out.data();
const float* d_in = In.data();
int blocks = std::min(MAX_BLOCKS, (int)Out.shape()[0]);
int threads = std::min(MAX_THREADS, (int)Out.shape()[1]);
gElement<<<blocks, threads>>>(functor, d_out, d_in,
Out.shape()[0], Out.shape()[1]);
cudaStreamSynchronize(0);
}
template <class Functor>
void Element(Functor functor,
Tensor Out, const Tensor In1, const Tensor In2) {
float* d_out = Out.data();
const float* d_in1 = In1.data();
const float* d_in2 = In2.data();
int blocks = std::min(MAX_BLOCKS, (int)Out.shape()[0]);
int threads = std::min(MAX_THREADS, (int)Out.shape()[1]);
gElement<<<blocks, threads>>>(functor, d_out, d_in1, d_in2,
Out.shape()[0], Out.shape()[1]);
cudaStreamSynchronize(0);
}
template <class Functor>
void Element(Functor functor,
Tensor Out, const Tensor In1,
const Tensor In2, const Tensor In3) {
float* d_out = Out.data();
const float* d_in1 = In1.data();
const float* d_in2 = In2.data();
const float* d_in3 = In3.data();
int blocks = std::min(MAX_BLOCKS, (int)Out.shape()[0]);
int threads = std::min(MAX_THREADS, (int)Out.shape()[1]);
gElement<<<blocks, threads>>>(functor, d_out, d_in1, d_in2, d_in3,
Out.shape()[0], Out.shape()[1]);
cudaStreamSynchronize(0);
}
Tensor Prod(cublasHandle_t handle, Tensor C, const Tensor A, const Tensor B,
bool transA, bool transB, float beta) {
float alpha = 1.0;
size_t m = A.shape()[0];
size_t k = A.shape()[1];
if(transA)
std::swap(m, k);
size_t l = B.shape()[0];
size_t n = B.shape()[1];
if(transB)
std::swap(l, n);
size_t lda = A.shape()[1];
size_t ldb = B.shape()[1];
size_t ldc = B.shape()[1];
if(transB)
ldc = B.shape()[0];
cublasOperation_t opA = transA ? CUBLAS_OP_T : CUBLAS_OP_N;
cublasOperation_t opB = transB ? CUBLAS_OP_T : CUBLAS_OP_N;
cublasSgemm(handle, opB, opA,
n, m, k, &alpha, B.data(), ldb, A.data(), lda, &beta, C.data(), ldc);
return C;
}
Tensor Prod(Tensor C, const Tensor A, const Tensor B,
bool transA, bool transB, float beta = 0) {
return Prod(handles.cublasHandle, C, A, B, transA, transB, beta);
}
}
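
A minimal usage sketch (not part of this commit) for the Tensor, Element and Prod interface deleted above, assuming the header is still available as cudnn_tensor.h and the file is compiled with nvcc; the function name example() is purely illustrative.

#include "cudnn_tensor.h"

using namespace marian;
using namespace thrust::placeholders;

// Illustrative only: exercises the Tensor/Element/Prod interface shown above.
void example() {
  Tensor A({2, 3}, 1.0); // 2x3 tensor filled with ones
  Tensor B({3, 4});      // 3x4 tensor, zero-initialised
  Tensor C({2, 4});      // will hold A * B

  uniform(B);            // fill B with small random values

  // C = A * B via cuBLAS; no transposition, beta defaults to 0.
  Prod(C, A, B, false, false);

  // Elementwise in-place update: gElement assigns
  // rowOut[i] = functor(rowOut[i], rowIn[i]), so this scales C by 0.99.
  Element(_1 - 0.01f * _2, C, C);
}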


@@ -1,7 +1,5 @@
#pragma once
#include <iostream>
#include <vector>
#include <typeinfo>
#include <typeindex>
#include <unordered_map>
@@ -12,7 +10,7 @@
namespace marian {
namespace keywords {
template <int key, typename Value>
template <unsigned key, typename Value>
class Keyword {
public:
typedef Value value_type;
@@ -72,32 +70,6 @@ namespace keywords {
#define KEY(name, value_type) \
typedef Keyword<COMPILE_TIME_CRC32_STR(#name),value_type> name ## _k; \
name ## _k name(#name);
KEY(shape, std::vector<int>)
KEY(prefix, std::string)
KEY(axis, size_t);
}
class demo : public keywords::Keywords {
public:
template <typename ...Args>
demo(size_t size, Args ...args)
: Keywords(args...),
size_(size),
prefix_(Get<std::string>(keywords::prefix, std::string("_"))),
shape_(Get<std::vector<int>>(keywords::shape, std::vector<int>()))
{}
private:
size_t size_;
std::string prefix_;
std::vector<int> shape_;
};
void demo_main() {
using namespace keywords;
demo(300, shape={1,3}, prefix="layer1_", axis=0);
}
}
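
A short sketch (not in this commit) of how the keyword mechanism above is meant to be consumed, modelled on the deleted demo class; the dense_layer name is hypothetical, and the shape and prefix keywords are assumed to still be defined as shown above.

#include <string>
#include <vector>
#include "keywords.h"

namespace marian {

// Hypothetical consumer of the keyword mechanism, mirroring the deleted demo class.
class dense_layer : public keywords::Keywords {
  public:
    template <typename ...Args>
    dense_layer(size_t size, Args ...args)
    : Keywords(args...),
      size_(size),
      prefix_(Get<std::string>(keywords::prefix, std::string("dense_"))),
      shape_(Get<std::vector<int>>(keywords::shape, std::vector<int>()))
    {}

  private:
    size_t size_;
    std::string prefix_;
    std::vector<int> shape_;
};

void keyword_demo() {
  using namespace keywords;
  // Named arguments can appear in any order and any subset; defaults fill the rest.
  dense_layer(300, shape={1, 3}, prefix="layer1_");
}

}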


@@ -1,115 +1,9 @@
#pragma once
#include <memory>
#include <functional>
#include <vector>
#include <cmath>
#include "exception.h"
#include "cudnn_tensor.h"
namespace marian {
template <class DataType>
struct Chainable : public std::enable_shared_from_this<Chainable<DataType>> {
Chainable() { }
virtual ~Chainable() { }
virtual void forward() { }
virtual void backward() { }
virtual void init_dependent() { }
virtual void set_zero_adjoint() { }
virtual DataType val() = 0;
virtual DataType grad() = 0;
};
typedef std::vector<Chainable<Tensor>*> ChainableStack;
typedef std::shared_ptr<Chainable<Tensor>> ChainPtr;
ChainableStack stack;
class Node : public Chainable<Tensor> {
public:
Node(const Tensor t) : val_(t) {
//std::cerr << "Putting node with tensor " << t.id() << " on stack" << std::endl;
stack.push_back(this);
}
virtual ~Node() {};
virtual void init_dependent() {
if(adj_) {
adj_.set(1);
}
else {
adj_ = Tensor(val_.shape(), 1);
}
}
virtual void set_zero_adjoint() {
if(adj_) {
adj_.set(0);
}
else {
adj_ = Tensor(val_.shape(), 0);
}
}
virtual Tensor val() { return val_; };
virtual Tensor grad() { return adj_; };
protected:
Tensor val_;
Tensor adj_;
};
class Var {
public:
Var() : pimpl_(nullptr) {}
Var(const Tensor t) : pimpl_(new Node(t)) {}
Var(const Tensor::value_type v) : pimpl_(new Node(Tensor(v))) {}
Var(const ChainPtr chainable) : pimpl_(chainable) {}
Var(Chainable<Tensor>* chainable) : pimpl_(chainable) {}
Tensor val() {
return pimpl_->val();
}
Tensor grad() {
return pimpl_->grad();
}
ChainPtr pimpl() {
return pimpl_;
}
void forward() {
UTIL_THROW_IF2(pimpl_.get() != stack.back(),
"Trying to call forward on non-root of computation graph");
for(auto&& v : stack)
v->forward();
}
void backward() {
UTIL_THROW_IF2(pimpl_.get() != stack.back(),
"Trying to call backward on non-root of computation graph");
for(auto&& v : stack)
v->set_zero_adjoint();
typedef ChainableStack::reverse_iterator It;
pimpl_->init_dependent();
for(It it = stack.rbegin(); it != stack.rend(); ++it)
(*it)->backward();
}
operator ChainPtr() {
return pimpl_;
}
private:
ChainPtr pimpl_;
};
}
#include "definitions.h"
#include "graph.h"
#include "graph_operators.h"
#include "expressions.h"
#include "expression_operators.h"
//#include "tensor.h"
//#include "tensor_operators.h"


@@ -183,7 +183,7 @@ Var broadcast(Shape shape, Var a) {
a = dot(a, one);
}
else {
UTIL_THROW2("Not inplemented");
UTIL_THROW2("Not implemented");
}
}
}
@@ -327,10 +327,10 @@ inline Var sum(Var a, Axis axis = Axis::undef) {
return dot(a, one);
}
else if(axis == Axis::axis2) {
UTIL_THROW2("Not inplemented");
UTIL_THROW2("Not implemented");
}
else if(axis == Axis::axis3) {
UTIL_THROW2("Not inplemented");
UTIL_THROW2("Not implemented");
}
return sum(sum(a, Axis::axis0), Axis::axis1);
}
@@ -358,13 +358,4 @@ inline Var mean(Var a, Axis axis = Axis::undef) {
}
}
// FAKE
inline Var input(const std::string& name, Var v) {
return v;
}
inline Var forsave(const std::string& name, Var v) {
return v;
}
}
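
For reference, a small sketch (not in this commit) of the sum-via-dot idea used above: reducing over an axis is written as a matrix product with a vector of ones, so the reduction reuses the existing dot machinery. The exact shapes below are illustrative.

// Illustrative: summing the rows of a 3x5 matrix with a 1x3 vector of ones.
void sum_via_dot_demo() {
  using namespace marian;
  Var a = Tensor({3, 5}, 2.0);    // 3x5 tensor of twos
  Var ones = Tensor({1, 3}, 1.0); // row vector of ones
  Var colSums = dot(ones, a);     // shape {1, 5}; every entry equals 3 * 2 = 6
}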


@@ -1,112 +1,65 @@
#include <iostream>
#include <ctime>
#include <vector>
#include <algorithm>
#include <random>
#include <boost/timer/timer.hpp>
#include <typeinfo>
#include <typeindex>
#include <unordered_map>
#include <boost/any.hpp>
#include "marian.h"
#include "operators.h"
#include "keywords.h"
using namespace marian;
int main(int argc, char** argv) {
using namespace keywords;
auto layer = demo(300, prefix="test_");
//auto x = input("X", shape={1, 768});
//auto y = input("Y", shape={1, 10});
//
//auto l = x;
//for(auto n : { 300, 200, 100, 50, 20 })
// l = dense(n, l, activation=tanh);
//
//auto w = param("W", init=orthogonal, shape={20, 10});
//auto b = param("b", init=orthogonal, shape={1, 10});
//l = sigmoid(dot(w, l) + b);
//
//auto lp = dense(10, l, activation=softmax(axis=1));
//auto cost = -mean(sum(y * log(lp), axis=1));
using namespace marian;
using namespace keywords;
auto x = data(shape={whatevs, 784}, name="X");
auto y = data(shape={whatevs, 10}, name="Y");
auto w = param(shape={784, 10}, name="W0");
auto b = param(shape={1, 10}, name="b0");
auto lr = softmax(dot(x, w) + b, axis=1);
auto cost = -mean(sum(y * log(lr), axis=1), axis=0);
//auto x1 = input(k::name="x0", k::shape={1,100});
//auto x2 = input(k::name="x1", k::shape={1,100});
//auto y = output(k::name="y", k::shape={1,10});
//
//auto l1 = dense(100,
// k::name="layer1",
// k::input={x1, x2},
// k::activation=sigmoid,
// k::init_w=orthogonal,
// k::init_b=uniform(-0.1,0.1)
// k::merge=concat);
//auto l2 = dense(100, k::input=l1, k::name="charlie"
// k::activation=tanh);
//auto lout = dense(10, k::input=l2,
// k::activation=softmax);
//
//auto cost = -mean(sum(y * log(lout), k::axis=1));
//
//auto w = cost["charlie_w"];
//auto b = cost["layer1_b"];
//
//auto opt = optimizer(cost,
// k::method=adadelta);
//
//Tensor X(k::shape={60, 768}, k::init=mnist(""));
//Tensor Y(k::shape={60, 10}, k::init=mnist(""));
//
//float c = opt.fit_batch({X1, X2}, Y, k::logger=logger);
//
//Tensor xTrain
// (shape, {60000, 784})
// (init, mnist("train.ubyte"));
//
//Tensor yTrain
// (shape, {60000, 10})
// (init, mnist("train.ubyte", true));
//
//Tensor xBatch = slice(xTrain, {0, 50, 5});
//
//Var x = input("X");
//Var y = input("Y");
//
//ry = dense(input=x, size=200, activation=tanh,
// init_w=orthogonal, init_b=uniform(-0.1. 0.1));
//
//ry = dense(ry)(size, 100)(activation, tanh);
//ry = dense(ry)(size, 10)(activation, softmax);
//
//Var cost = -mean(y * log(ry) + (1 - y) * log(1 - ry));
//
//boost::timer::auto_cpu_timer t;
//float eta = 0.01;
//for(size_t i = 0; i < 2000; ++i) {
// cost.forward();
//
// if(i % 200 == 0) {
// for(size_t j = 0; j < 4; ++j) {
// std::cerr << ry.val()[j] << std::endl;
// }
// std::cerr << i << " ct: " << cost.val()[0] << std::endl;
// }
//
// cost.backward();
// for(auto p : params) {
// auto update =
// _1 -= eta * _2;
// Element(update, p.val(), p.grad());
// }
//}
return 0;
cost.forward();
//auto set = [](size_t i, Expr c) {
// size_t bid = (i + 1) % batches;
// Tensor x = c["X"].val();
// thrust::copy(XBatches[bid].begin(), XBatches[bid].end(),
// x.begin());
// Tensor y = c["Y"].val();
// thrust::copy(YBatches[bid].begin(), YBatches[bid].end(),
// y.begin());
//};
//
//auto before = [](size_t i, Expr c) {
// for(auto&& p : c.params())
// clip(p.grad(), type=norm, max=10);
//};
//
//
//float sum;
//auto after = [&sum](size_t i, Expr c) {
// sum += c.val()[0];
//
// if(i % 100 == 0) {
// std::cerr << sum / i << std::endl;
// std::cerr << i << " : " << c.val()[0] << std::endl;
// }
//
// if(i % 10000 == 0) {
// std::cerr << "Saving model " << i << std::endl;
// std::stringstream name;
// name << "model.iter" << i << ".yml.gz";
// dump(c, name.str());
// }
//
//};
//
//auto opt = adadelta(cost_function=cost,
// eta=0.9, gamma=0.1,
// set_batch=set,
// before_update=before,
// after_update=after,
// set_valid=valid,
// validation_freq=100,
// verbose=1, epochs=3, early_stopping=10);
//opt.run();
return 0;
}
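
Finally, a hypothetical sketch of the training loop that the commented-out code above appears to be heading toward; params, eta and the iteration count are assumptions taken from the comments, and Element with thrust placeholders comes from the deleted cudnn_tensor.h.

// Hypothetical training loop, not part of this commit.
void train(Var cost, std::vector<Var>& params,
           float eta = 0.01, size_t iters = 2000) {
  using namespace thrust::placeholders;
  for(size_t i = 0; i < iters; ++i) {
    cost.forward();                              // recompute all node values
    cost.backward();                             // fill in adjoints (gradients)
    for(auto& p : params)                        // plain SGD step per parameter
      Element(_1 - eta * _2, p.val(), p.grad()); // p.val()[i] -= eta * p.grad()[i]
  }
}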