diff --git a/src/cudnn_tensor.h b/src/cudnn_tensor.h
deleted file mode 100644
index cd71d942..00000000
--- a/src/cudnn_tensor.h
+++ /dev/null
@@ -1,400 +0,0 @@
-#pragma once
-
-#include
-#include
-#include
-#include
-
-#include
-#include
-#include
-#include
-
-#include "exception.h"
-#include "thrust_functions.h"
-
-namespace marian {
-
-struct Handles {
-  cudnnHandle_t cudnnHandle;
-  cublasHandle_t cublasHandle;
-
-  cudnnOpTensorDescriptor_t add;
-
-  Handles() {
-    cudnnCreate(&cudnnHandle);
-    cublasCreate(&cublasHandle);
-    cudnnCreateOpTensorDescriptor(&add);
-    cudnnSetOpTensorDescriptor(add, CUDNN_OP_TENSOR_ADD, CUDNN_DATA_FLOAT, CUDNN_NOT_PROPAGATE_NAN);
-  }
-
-  ~Handles() {
-    cudnnDestroy(cudnnHandle);
-    cublasDestroy(cublasHandle);
-    cudnnDestroyOpTensorDescriptor(add);
-  }
-};
-
-Handles handles;
-
-typedef std::vector<int> Shape;
-
-template <class Float>
-class TensorImpl {
-  private:
-    Shape shape_;
-    thrust::device_vector<Float> data_;
-    cudnnTensorDescriptor_t desc_;
-    size_t tno_;
-    static size_t tensorCounter;
-
-    cudnnDataType_t dataType() {
-      switch(sizeof(Float)) {
-        case 2: return CUDNN_DATA_HALF;
-        case 8: return CUDNN_DATA_DOUBLE;
-        default: return CUDNN_DATA_FLOAT;
-      }
-    }
-
-  public:
-    typedef Float value_type;
-
-    TensorImpl(const Shape& shape, value_type value = 0)
-    : shape_(shape), tno_(tensorCounter++)
-    {
-      // @TODO:
-      UTIL_THROW_IF2(shape_.size() != 2,
-                     "For now, only 2D Tensors, will be fixed later.");
-
-      UTIL_THROW_IF2(shape_.size() < 1 || shape_.size() > 4,
-                     "Wrong number of dimensions: " << shape_.size());
-      int size = std::accumulate(shape_.begin(), shape_.end(),
-                                 1, std::multiplies<int>());
-      data_.resize(size, value);
-      cudnnCreateTensorDescriptor(&desc_);
-      switch (shape_.size()) {
-        case 1:
-          cudnnSetTensor4dDescriptor(desc_, CUDNN_TENSOR_NCHW, dataType(),
-                                     shape_[0], 1, 1, 1); break;
-        case 2:
-          cudnnSetTensor4dDescriptor(desc_, CUDNN_TENSOR_NCHW, dataType(),
-                                     shape_[0], shape_[1], 1, 1); break;
-        case 3:
-          cudnnSetTensor4dDescriptor(desc_, CUDNN_TENSOR_NCHW, dataType(),
-                                     shape_[0], shape_[1], shape_[2], 1); break;
-        case 4:
-          cudnnSetTensor4dDescriptor(desc_, CUDNN_TENSOR_NCHW, dataType(),
-                                     shape_[0], shape_[1], shape_[2], shape_[3]); break;
-      }
-    }
-
-    TensorImpl(const TensorImpl&) = delete;
-    TensorImpl(TensorImpl&&) = delete;
-
-    ~TensorImpl() {
-      cudnnDestroyTensorDescriptor(desc_);
-    }
-
-    value_type operator[](size_t i) const {
-      return data_[i];
-    }
-
-    auto begin() -> decltype( data_.begin() ) {
-      return data_.begin();
-    }
-
-    auto begin() const -> decltype( data_.begin() ) {
-      return data_.begin();
-    }
-
-    auto end() -> decltype( data_.end() ) {
-      return data_.end();
-    }
-
-    auto end() const -> decltype( data_.end() ) {
-      return data_.end();
-    }
-
-    const Shape& shape() const {
-      return shape_;
-    }
-
-    size_t size() const {
-      return data_.size();
-    }
-
-    value_type* data() {
-      return thrust::raw_pointer_cast(data_.data());
-    }
-
-    cudnnTensorDescriptor_t desc() const {
-      return desc_;
-    }
-
-    size_t id() const {
-      return tno_;
-    }
-
-    void set(value_type value) {
-      thrust::fill(data_.begin(), data_.end(), value);
-    }
-};
-
-template <class Float>
-size_t TensorImpl<Float>::tensorCounter = 0;
-
-class Tensor {
-  private:
-    std::shared_ptr<TensorImpl<float>> pimpl_;
-
-  public:
-    typedef TensorImpl<float>::value_type value_type;
-
-    Tensor(const Shape& shape, value_type value = 0)
-    : pimpl_(new TensorImpl<float>(shape, value)) {}
-
-    // Single value with broadcasting super powers. Might be
-    // worth getting rid of this performance-wise, but is saves
-    // so much typing when defining operators.
-    Tensor(value_type value)
-    : pimpl_(new TensorImpl<float>({1, 1}, value)) {}
-
-    Tensor() {}
-
-    ~Tensor() {}
-
-    value_type operator[](size_t i) const {
-      return (*pimpl_)[i];
-    }
-
-    size_t size() const {
-      return pimpl_->size();
-    }
-
-    value_type* data() {
-      return pimpl_->data();
-    }
-
-    const value_type* data() const {
-      return pimpl_->data();
-    }
-
-    auto begin() -> decltype( pimpl_->begin() ) {
-      return pimpl_->begin();
-    }
-
-    auto begin() const -> decltype( pimpl_->begin() ) {
-      return pimpl_->begin();
-    }
-
-    auto end() -> decltype( pimpl_->begin() ) {
-      return pimpl_->begin();
-    }
-
-    auto end() const -> decltype( pimpl_->begin() ) {
-      return pimpl_->begin();
-    }
-
-    const Shape& shape() const {
-      return pimpl_->shape();
-    }
-
-    cudnnTensorDescriptor_t desc() const {
-      return pimpl_->desc();
-    }
-
-    void set(value_type value) {
-      pimpl_->set(value);
-    }
-
-    size_t id() const {
-      return pimpl_->id();
-    }
-
-    operator bool() {
-      return pimpl_ != nullptr;
-    }
-};
-
-Tensor uniform(Tensor t, float a=-0.1, float b=0.1) {
-  std::vector<float> r(t.size());
-  for(int i = 0; i < r.size(); i++)
-    r[i] = (float(rand() % 2000) - 1000.0)/10000.0;
-  thrust::copy(r.begin(), r.end(), t.begin());
-  return t;
-};
-
-using namespace thrust::placeholders;
-#define MAX_THREADS 512
-#define MAX_BLOCKS 65535
-
-template <class Functor>
-__global__ void gElement(Functor functor, float* out,
-                         size_t rows, size_t cols) {
-  for(int bid = 0; bid < rows; bid += gridDim.x) {
-    int j = bid + blockIdx.x;
-    if(j < rows) {
-      float* rowOut = out + j * cols;
-      for(int tid = 0; tid < cols; tid += blockDim.x) {
-        int i = tid + threadIdx.x;
-        if(i < cols)
-          rowOut[i] = functor(rowOut[i]);;
-      }
-    }
-  }
-}
-
-template <class Functor>
-__global__ void gElement(Functor functor,
-                         float* out, const float* in,
-                         size_t rows, size_t cols) {
-  for(int bid = 0; bid < rows; bid += gridDim.x) {
-    int j = bid + blockIdx.x;
-    if(j < rows) {
-      float* rowOut = out + j * cols;
-      const float* rowIn = in + j * cols;
-
-      for(int tid = 0; tid < cols; tid += blockDim.x) {
-        int i = tid + threadIdx.x;
-        if(i < cols)
-          rowOut[i] = functor(rowOut[i], rowIn[i]);;
-      }
-    }
-  }
-}
-
-template <class Functor>
-__global__ void gElement(Functor functor,
-                         float* out, const float* in1, const float* in2,
-                         size_t rows, size_t cols) {
-  for(int bid = 0; bid < rows; bid += gridDim.x) {
-    int j = bid + blockIdx.x;
-    if(j < rows) {
-      float* rowOut = out + j * cols;
-      const float* rowIn1 = in1 + j * cols;
-      const float* rowIn2 = in2 + j * cols;
-
-      for(int tid = 0; tid < cols; tid += blockDim.x) {
-        int i = tid + threadIdx.x;
-        if(i < cols)
-          rowOut[i] = functor(rowOut[i], rowIn1[i], rowIn2[i]);
-      }
-    }
-  }
-}
-
-template <class Functor>
-__global__ void gElement(Functor functor,
-                         float* out, const float* in1,
-                         const float* in2, const float* in3,
-                         size_t rows, size_t cols) {
-  for(int bid = 0; bid < rows; bid += gridDim.x) {
-    int j = bid + blockIdx.x;
-    if(j < rows) {
-      float* rowOut = out + j * cols;
-      const float* rowIn1 = in1 + j * cols;
-      const float* rowIn2 = in2 + j * cols;
-      const float* rowIn3 = in3 + j * cols;
-
-      for(int tid = 0; tid < cols; tid += blockDim.x) {
-        int i = tid + threadIdx.x;
-        if(i < cols)
-          rowOut[i] = functor(rowOut[i], rowIn1[i], rowIn2[i], rowIn3[i]);
-      }
-    }
-  }
-}
-
-// @TODO add broadcasting
-
-template <class Functor>
-void Element(Functor functor, Tensor Out) {
-  float* d_out = Out.data();
-  int blocks  = std::min(MAX_BLOCKS,  (int)Out.shape()[0]);
-  int threads = std::min(MAX_THREADS, (int)Out.shape()[1]);
-  gElement<<<blocks, threads>>>(functor, d_out,
-                                Out.shape()[0], Out.shape()[1]);
-  cudaStreamSynchronize(0);
-}
-
-template <class Functor>
-void Element(Functor functor,
-             Tensor Out, const Tensor In) {
-  float* d_out = Out.data();
-  const float* d_in = In.data();
-
-  int blocks  = std::min(MAX_BLOCKS,  (int)Out.shape()[0]);
-  int threads = std::min(MAX_THREADS, (int)Out.shape()[1]);
-  gElement<<<blocks, threads>>>(functor, d_out, d_in,
-                                Out.shape()[0], Out.shape()[1]);
-  cudaStreamSynchronize(0);
-}
-
-template <class Functor>
-void Element(Functor functor,
-             Tensor Out, const Tensor In1, const Tensor In2) {
-
-  float* d_out = Out.data();
-  const float* d_in1 = In1.data();
-  const float* d_in2 = In2.data();
-
-  int blocks  = std::min(MAX_BLOCKS,  (int)Out.shape()[0]);
-  int threads = std::min(MAX_THREADS, (int)Out.shape()[1]);
-  gElement<<<blocks, threads>>>(functor, d_out, d_in1, d_in2,
-                                Out.shape()[0], Out.shape()[1]);
-  cudaStreamSynchronize(0);
-}
-
-template <class Functor>
-void Element(Functor functor,
-             Tensor Out, const Tensor In1,
-             const Tensor In2, const Tensor In3) {
-
-  float* d_out = Out.data();
-  const float* d_in1 = In1.data();
-  const float* d_in2 = In2.data();
-  const float* d_in3 = In3.data();
-
-  int blocks  = std::min(MAX_BLOCKS,  (int)Out.shape()[0]);
-  int threads = std::min(MAX_THREADS, (int)Out.shape()[1]);
-  gElement<<<blocks, threads>>>(functor, d_out, d_in1, d_in2, d_in3,
-                                Out.shape()[0], Out.shape()[1]);
-  cudaStreamSynchronize(0);
-}
-
-Tensor Prod(cublasHandle_t handle, Tensor C, const Tensor A, const Tensor B,
-            bool transA, bool transB, float beta) {
-  float alpha = 1.0;
-
-  size_t m = A.shape()[0];
-  size_t k = A.shape()[1];
-  if(transA)
-    std::swap(m, k);
-
-  size_t l = B.shape()[0];
-  size_t n = B.shape()[1];
-  if(transB)
-    std::swap(l, n);
-
-  size_t lda = A.shape()[1];
-  size_t ldb = B.shape()[1];
-  size_t ldc = B.shape()[1];
-
-  if(transB)
-    ldc = B.shape()[0];
-
-  cublasOperation_t opA = transA ? CUBLAS_OP_T : CUBLAS_OP_N;
-  cublasOperation_t opB = transB ? CUBLAS_OP_T : CUBLAS_OP_N;
-
-  cublasSgemm(handle, opB, opA,
-              n, m, k, &alpha, B.data(), ldb, A.data(), lda, &beta, C.data(), ldc);
-  return C;
-}
-
-Tensor Prod(Tensor C, const Tensor A, const Tensor B,
-            bool transA, bool transB, float beta = 0) {
-
-  return Prod(handles.cublasHandle, C, A, B, transA, transB, beta);
-}
-
-}
\ No newline at end of file
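For context on the Prod() wrapper being deleted above: cuBLAS expects column-major storage while these tensors are row-major. A row-major buffer reinterpreted as column-major is the transpose, so C = A*B is obtained by asking cuBLAS for C^T = B^T * A^T, which is why the call passes opB/opA and B before A. Below is a minimal standalone sketch of the same trick; the helper name gemmRowMajor, the toy sizes and the main() driver are illustrative only and not part of this patch.

#include <cstdio>
#include <vector>
#include <cuda_runtime.h>
#include <cublas_v2.h>

// Row-major C[m x n] = A[m x k] * B[k x n] on top of column-major cuBLAS:
// read row-major buffers as column-major transposes and request
// C^T = B^T * A^T, i.e. gemm over (n, m, k) with B passed first.
void gemmRowMajor(cublasHandle_t h, const float* A, const float* B, float* C,
                  int m, int k, int n) {
  const float alpha = 1.0f, beta = 0.0f;
  cublasSgemm(h, CUBLAS_OP_N, CUBLAS_OP_N,
              n, m, k, &alpha,
              B, n,    // B^T is n x k, leading dimension n
              A, k,    // A^T is k x m, leading dimension k
              &beta,
              C, n);   // C^T is n x m, leading dimension n
}

int main() {
  // 2x3 times 3x2 toy product; C should come back as {{22, 28}, {49, 64}}
  std::vector<float> A = {1, 2, 3, 4, 5, 6};
  std::vector<float> B = {1, 2, 3, 4, 5, 6};
  std::vector<float> C(4, 0.0f);

  float *dA, *dB, *dC;
  cudaMalloc((void**)&dA, A.size() * sizeof(float));
  cudaMalloc((void**)&dB, B.size() * sizeof(float));
  cudaMalloc((void**)&dC, C.size() * sizeof(float));
  cudaMemcpy(dA, A.data(), A.size() * sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(dB, B.data(), B.size() * sizeof(float), cudaMemcpyHostToDevice);

  cublasHandle_t handle;
  cublasCreate(&handle);
  gemmRowMajor(handle, dA, dB, dC, 2, 3, 2);
  cudaMemcpy(C.data(), dC, C.size() * sizeof(float), cudaMemcpyDeviceToHost);

  std::printf("%.0f %.0f\n%.0f %.0f\n", C[0], C[1], C[2], C[3]);

  cublasDestroy(handle);
  cudaFree(dA); cudaFree(dB); cudaFree(dC);
  return 0;
}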
diff --git a/src/keywords.h b/src/keywords.h
index a0d6d4c9..522af5d2 100644
--- a/src/keywords.h
+++ b/src/keywords.h
@@ -1,7 +1,5 @@
 #pragma once
 
-#include
-#include
 #include
 #include
 #include
@@ -12,7 +10,7 @@
 namespace marian {
 namespace keywords {
 
-  template
+  template
   class Keyword {
     public:
       typedef Value value_type;
@@ -72,32 +70,6 @@
 #define KEY(name, value_type) \
 typedef Keyword name ## _k; \
 name ## _k name(#name);
-
-  KEY(shape, std::vector<int>)
-  KEY(prefix, std::string)
-  KEY(axis, size_t);
-}
-
-class demo : public keywords::Keywords {
-  public:
-    template <typename ...Args>
-    demo(size_t size, Args ...args)
-    : Keywords(args...),
-      size_(size),
-      prefix_(Get(keywords::prefix, std::string("_"))),
-      shape_(Get<std::vector<int>>(keywords::shape, std::vector<int>()))
-    {}
-
-  private:
-    size_t size_;
-    std::string prefix_;
-    std::vector<int> shape_;
-};
-
-void demo_main() {
-  using namespace keywords;
-
-  demo(300, shape={1,3}, prefix="layer1_", axis=0);
 }
 }
\ No newline at end of file
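The keyword machinery that keywords.h keeps (and that the removed demo class exercised) is the usual tagged-argument idiom: a keyword object turns name=value into a typed pair via operator=, and the consumer scans its argument pack for the matching tag, falling back to a default. A self-contained sketch of that idiom follows; it uses deliberately different names (Kw, Arg, get, layer) so it is not mistaken for the real Keyword/Keywords API in this repository.

#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

// A (tag, value) pair produced by assigning to a keyword object.
template <int tag, typename Value>
struct Arg { Value value; };

// The keyword object itself: "kw = v" yields an Arg carrying v.
template <int tag, typename Value>
struct Kw {
  Arg<tag, Value> operator=(Value v) const { return {v}; }
};

// Keyword absent: return the default.
template <int tag, typename Value>
Value get(Value def) { return def; }

// Head matches the requested tag: take its value.
template <int tag, typename Value, typename... Rest>
Value get(Value def, Arg<tag, Value> head, Rest...) { return head.value; }

// Head does not match: keep scanning the pack.
template <int tag, typename Value, typename First, typename... Rest>
Value get(Value def, First, Rest... rest) { return get<tag, Value>(def, rest...); }

Kw<0, std::string>      prefix;
Kw<1, std::vector<int>> shape;

template <typename... Args>
void layer(size_t size, Args... args) {
  std::string      p = get<0, std::string>(std::string("_"), args...);
  std::vector<int> s = get<1, std::vector<int>>(std::vector<int>(), args...);
  std::cout << "size=" << size << " prefix=" << p
            << " dims=" << s.size() << "\n";
}

int main() {
  // Mirrors the call style of the removed demo: demo(300, shape={1,3}, prefix="layer1_").
  layer(300, prefix = "layer1_", shape = {1, 3});
  return 0;
}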
diff --git a/src/marian.h b/src/marian.h
index b8320d91..18510943 100644
--- a/src/marian.h
+++ b/src/marian.h
@@ -1,115 +1,9 @@
 #pragma once
 
-#include
-#include
-#include
-#include
-
-#include "exception.h"
-#include "cudnn_tensor.h"
-
-namespace marian {
-
-template <class DataType>
-struct Chainable : public std::enable_shared_from_this<Chainable<DataType>> {
-  Chainable() { }
-  virtual ~Chainable() { }
-  virtual void forward() { }
-  virtual void backward() { }
-  virtual void init_dependent() { }
-  virtual void set_zero_adjoint() { }
-
-  virtual DataType val() = 0;
-  virtual DataType grad() = 0;
-};
-
-typedef std::vector<Chainable<Tensor>*> ChainableStack;
-typedef std::shared_ptr<Chainable<Tensor>> ChainPtr;
-
-ChainableStack stack;
-
-class Node : public Chainable<Tensor> {
-  public:
-    Node(const Tensor t) : val_(t) {
-      //std::cerr << "Putting node with tensor " << t.id() << " on stack" << std::endl;
-      stack.push_back(this);
-    }
-
-    virtual ~Node() {};
-
-    virtual void init_dependent() {
-      if(adj_) {
-        adj_.set(1);
-      }
-      else {
-        adj_ = Tensor(val_.shape(), 1);
-      }
-    }
-
-    virtual void set_zero_adjoint() {
-      if(adj_) {
-        adj_.set(0);
-      }
-      else {
-        adj_ = Tensor(val_.shape(), 0);
-      }
-    }
-
-    virtual Tensor val()  { return val_; };
-    virtual Tensor grad() { return adj_; };
-
-  protected:
-    Tensor val_;
-    Tensor adj_;
-};
-
-class Var {
-  public:
-    Var() : pimpl_(nullptr) {}
-    Var(const Tensor t) : pimpl_(new Node(t)) {}
-    Var(const Tensor::value_type v) : pimpl_(new Node(Tensor(v))) {}
-    Var(const ChainPtr chainable) : pimpl_(chainable) {}
-    Var(Chainable<Tensor>* chainable) : pimpl_(chainable) {}
-
-    Tensor val() {
-      return pimpl_->val();
-    }
-
-    Tensor grad() {
-      return pimpl_->grad();
-    }
-
-    ChainPtr pimpl() {
-      return pimpl_;
-    }
-
-    void forward() {
-      UTIL_THROW_IF2(pimpl_.get() != stack.back(),
-                     "Trying to call forward on non-root of computation graph");
-
-      for(auto&& v : stack)
-        v->forward();
-    }
-
-    void backward() {
-      UTIL_THROW_IF2(pimpl_.get() != stack.back(),
-                     "Trying to call backward on non-root of computation graph");
-
-      for(auto&& v : stack)
-        v->set_zero_adjoint();
-
-      typedef ChainableStack::reverse_iterator It;
-      pimpl_->init_dependent();
-      for(It it = stack.rbegin(); it != stack.rend(); ++it)
-        (*it)->backward();
-    }
-
-    operator ChainPtr() {
-      return pimpl_;
-    }
-
-  private:
-    ChainPtr pimpl_;
-};
-
-}
\ No newline at end of file
+#include "definitions.h"
+#include "graph.h"
+#include "graph_operators.h"
+#include "expressions.h"
+#include "expression_operators.h"
+//#include "tensor.h"
+//#include "tensor_operators.h"
diff --git a/src/operators.h b/src/operators.h
index 340e5188..ef756dd2 100644
--- a/src/operators.h
+++ b/src/operators.h
@@ -183,7 +183,7 @@ Var broadcast(Shape shape, Var a) {
       a = dot(a, one);
     }
     else {
-      UTIL_THROW2("Not inplemented");
+      UTIL_THROW2("Not implemented");
     }
   }
 }
@@ -327,10 +327,10 @@ inline Var sum(Var a, Axis axis = Axis::undef) {
     return dot(a, one);
   }
   else if(axis == Axis::axis2) {
-    UTIL_THROW2("Not inplemented");
+    UTIL_THROW2("Not implemented");
  }
  else if(axis == Axis::axis3) {
-    UTIL_THROW2("Not inplemented");
+    UTIL_THROW2("Not implemented");
  }
  return sum(sum(a, Axis::axis0), Axis::axis1);
 }
@@ -358,13 +358,4 @@ inline Var mean(Var a, Axis axis = Axis::undef) {
   }
 }
-
-// FAKE
-inline Var input(const std::string& name, Var v) {
-  return v;
-}
-
-inline Var forsave(const std::string& name, Var v) {
-  return v;
-}
-
 }
\ No newline at end of file
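The Chainable/Node/Var machinery deleted from marian.h is tape-based reverse-mode autodiff: every node appends itself to a global stack at construction time, so the stack is already in topological order; forward() replays it front to back, while backward() zeroes all adjoints, seeds the root with 1, and replays the stack back to front. The following scalar-sized sketch shows the same pattern; all names are illustrative and do not reproduce the project's Tensor-valued classes.

#include <cmath>
#include <iostream>
#include <vector>

struct Node;
static std::vector<Node*> stack_;   // global tape, filled in construction order

struct Node {
  float val = 0, adj = 0;
  Node() { stack_.push_back(this); }
  virtual ~Node() {}
  virtual void forward() {}
  virtual void backward() {}
};

struct Input : Node {
  explicit Input(float v) { val = v; }
};

struct Mul : Node {
  Node *a, *b;
  Mul(Node* a, Node* b) : a(a), b(b) {}
  void forward() override { val = a->val * b->val; }
  void backward() override {            // d(a*b)/da = b, d(a*b)/db = a
    a->adj += adj * b->val;
    b->adj += adj * a->val;
  }
};

struct Log : Node {
  Node* a;
  explicit Log(Node* a) : a(a) {}
  void forward() override { val = std::log(a->val); }
  void backward() override { a->adj += adj / a->val; }
};

int main() {
  Input x(2.0f), y(3.0f);
  Mul xy(&x, &y);
  Log cost(&xy);                        // cost = log(x * y)

  for(auto n : stack_) n->forward();    // corresponds to Var::forward()

  for(auto n : stack_) n->adj = 0;      // set_zero_adjoint()
  cost.adj = 1;                         // init_dependent()
  for(auto it = stack_.rbegin(); it != stack_.rend(); ++it)
    (*it)->backward();

  std::cout << "cost=" << cost.val      // log(6)
            << " dcost/dx=" << x.adj    // 1/x = 0.5
            << " dcost/dy=" << y.adj    // 1/y = 0.333...
            << "\n";
  return 0;
}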
diff --git a/src/test.cu b/src/test.cu
index 4fd0d18e..d35150b5 100644
--- a/src/test.cu
+++ b/src/test.cu
@@ -1,112 +1,65 @@
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-#include
 #include "marian.h"
-#include "operators.h"
-#include "keywords.h"
-
-using namespace marian;
 
 int main(int argc, char** argv) {
-  using namespace keywords;
-
-  auto layer = demo(300, prefix="test_");
-
-  //auto x = input("X", shape={1, 768});
-  //auto y = input("Y", shape={1, 10});
-  //
-  //auto l = x;
-  //for(auto n : { 300, 200, 100, 50, 20 })
-  //  l = dense(n, l, activation=tanh);
-  //
-  //auto w = param("W", init=orthogonal, shape={20, 10});
-  //auto b = param("b", init=orthogonal, shape={1, 10});
-  //l = sigmoid(dot(w, l) + b);
-  //
-  //auto lp = dense(10, l, activation=softmax(axis=1));
-  //auto cost = -mean(sum(y * log(lp), axis=1));
+  using namespace marian;
+  using namespace keywords;
+
+  auto x = data(shape={whatevs, 784}, name="X");
+  auto y = data(shape={whatevs, 10}, name="Y");
+  auto w = param(shape={784, 10}, name="W0");
+  auto b = param(shape={1, 10}, name="b0");
+
+  auto lr = softmax(dot(x, w) + b, axis=1);
+  auto cost = -mean(sum(y * log(lr), axis=1), axis=0);
-  //auto x1 = input(k::name="x0", k::shape={1,100});
-  //auto x2 = input(k::name="x1", k::shape={1,100});
-  //auto y = output(k::name="y", k::shape={1,10});
-  //
-  //auto l1 = dense(100,
-  //                k::name="layer1",
-  //                k::input={x1, x2},
-  //                k::activation=sigmoid,
-  //                k::init_w=orthogonal,
-  //                k::init_b=uniform(-0.1,0.1)
-  //                k::merge=concat);
-  //auto l2 = dense(100, k::input=l1, k::name="charlie"
-  //                k::activation=tanh);
-  //auto lout = dense(10, k::input=l2,
-  //                  k::activation=softmax);
-  //
-  //auto cost = -mean(sum(y * log(lout), k::axis=1));
-  //
-  //auto w = cost["charlie_w"];
-  //auto b = cost["layer1_b"];
-  //
-  //auto opt = optimizer(cost,
-  //                     k::method=adadelta);
-  //
-  //Tensor X(k::shape={60, 768}, k::init=mnist(""));
-  //Tensor Y(k::shape={60, 10}, k::init=mnist(""));
-  //
-  //float c = opt.fit_batch({X1, X2}, Y, k::logger=logger);
-  //
-  //Tensor xTrain
-  //  (shape, {60000, 784})
-  //  (init, mnist("train.ubyte"));
-  //
-  //Tensor yTrain
-  //  (shape, {60000, 10})
-  //  (init, mnist("train.ubyte", true));
-  //
-  //Tensor xBatch = slice(xTrain, {0, 50, 5});
-  //
-  //Var x = input("X");
-  //Var y = input("Y");
-  //
-  //ry = dense(input=x, size=200, activation=tanh,
-  //           init_w=orthogonal, init_b=uniform(-0.1. 0.1));
-  //
-  //ry = dense(ry)(size, 100)(activation, tanh);
-  //ry = dense(ry)(size, 10)(activation, softmax);
-  //
-  //Var cost = -mean(y * log(ry) + (1 - y) * log(1 - ry));
-  //
-  //boost::timer::auto_cpu_timer t;
-  //float eta = 0.01;
-  //for(size_t i = 0; i < 2000; ++i) {
-  //  cost.forward();
-  //
-  //  if(i % 200 == 0) {
-  //    for(size_t j = 0; j < 4; ++j) {
-  //      std::cerr << ry.val()[j] << std::endl;
-  //    }
-  //    std::cerr << i << " ct: " << cost.val()[0] << std::endl;
-  //  }
-  //
-  //  cost.backward();
-  //  for(auto p : params) {
-  //    auto update =
-  //      _1 -= eta * _2;
-  //    Element(update, p.val(), p.grad());
-  //  }
-  //}
-
-    return 0;
+  cost.forward();
+
+  //auto set = [](size_t i, Expr c) {
+  //  size_t bid = (i + 1) % batches;
+  //  Tensor x = c["X"].val();
+  //  thrust::copy(XBatches[bid].begin(), XBatches[bid].end(),
+  //               x.begin());
+  //  Tensor y = c["Y"].val();
+  //  thrust::copy(YBatches[bid].begin(), YBatches[bid].end(),
+  //               y.begin());
+  //};
+  //
+  //auto before = [](size_t i, Expr c) {
+  //  for(auto&& p : c.params())
+  //    clip(p.grad(), type=norm, max=10);
+  //};
+  //
+  //
+  //float sum;
+  //auto after = [&sum](size_t i, Expr c) {
+  //  sum += c.val()[0];
+  //
+  //  if(i % 100 == 0) {
+  //    std::cerr << sum / i << std::endl;
+  //    std::cerr << i << " : " << c.val()[0] << std::endl;
+  //  }
+  //
+  //  if(i % 10000 == 0) {
+  //    std::cerr << "Saving model " << i << std::endl;
+  //    std::stringstream name;
+  //    name << "model.iter" << i << ".yml.gz";
+  //    dump(c, name.str());
+  //  }
+  //
+  //};
+  //
+  //auto opt = adadelta(cost_function=cost,
+  //                    eta=0.9, gamma=0.1,
+  //                    set_batch=set,
+  //                    before_update=before,
+  //                    after_update=after,
+  //                    set_valid=valid,
+  //                    validation_freq=100,
+  //                    verbose=1, epochs=3, early_stopping=10);
+  //opt.run();
+
+  return 0;
 }
\ No newline at end of file
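The cost built in the new test.cu, -mean(sum(y * log(softmax(x*W + b)), axis=1), axis=0), is mean categorical cross-entropy over the batch. Below is a small CPU reference for that formula, handy for sanity-checking the graph's output; the sizes, values and the helper name are made up and not part of the patch.

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// Reference: cost = -mean_over_rows( sum_over_classes( y * log softmax(logits) ) )
float crossEntropy(const std::vector<std::vector<float>>& logits,
                   const std::vector<std::vector<float>>& y) {
  float total = 0;
  for(size_t r = 0; r < logits.size(); ++r) {
    // softmax with max-subtraction for numerical stability
    float mx = logits[r][0];
    for(float v : logits[r]) mx = std::max(mx, v);
    float z = 0;
    for(float v : logits[r]) z += std::exp(v - mx);
    float rowLoss = 0;
    for(size_t c = 0; c < logits[r].size(); ++c) {
      float logp = (logits[r][c] - mx) - std::log(z);   // log softmax
      rowLoss += y[r][c] * logp;                        // one-hot picks the true class
    }
    total += rowLoss;
  }
  return -total / logits.size();
}

int main() {
  // two examples, three classes; the logits stand in for x*W + b
  std::vector<std::vector<float>> logits = {{2.f, 1.f, 0.f}, {0.f, 0.f, 3.f}};
  std::vector<std::vector<float>> y      = {{1.f, 0.f, 0.f}, {0.f, 1.f, 0.f}};
  std::printf("cost = %f\n", crossEntropy(logits, y));
  return 0;
}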