mirror of
https://github.com/marian-nmt/marian.git
synced 2024-09-19 18:59:18 +03:00
merged with memory branch
This commit is contained in:
commit
bb1e69b7ec
@ -2,8 +2,8 @@ cmake_minimum_required(VERSION 3.5.1)
|
||||
set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
|
||||
|
||||
project(marian CXX)
|
||||
SET(CMAKE_CXX_FLAGS " -std=c++11 -g -O3 -funroll-loops -Wno-unused-result -Wno-deprecated")
|
||||
LIST(APPEND CUDA_NVCC_FLAGS --default-stream per-thread; -std=c++11; -g; -O3; -arch=sm_35; -lineinfo; --use_fast_math; -Xcompiler '-fPIC')
|
||||
SET(CMAKE_CXX_FLAGS " -std=c++11 -g -O3 -Ofast -Wno-unused-result -Wno-deprecated -fPIC")
|
||||
LIST(APPEND CUDA_NVCC_FLAGS --default-stream per-thread; -std=c++11; -g; -O3; -arch=sm_61; --use_fast_math; -Xcompiler '-fPIC')
|
||||
add_definitions(-DCUDA_API_PER_THREAD_DEFAULT_STREAM)
|
||||
SET(CUDA_PROPAGATE_HOST_FLAGS OFF)
|
||||
|
||||
|
@ -3,16 +3,17 @@ include_directories(.)
|
||||
include_directories(3rd_party)
|
||||
|
||||
cuda_add_library(marian_lib
|
||||
3rd_party/cnpy/cnpy.cpp
|
||||
# 3rd_party/cnpy/cnpy.cpp
|
||||
3rd_party/exception.cpp
|
||||
expression_graph.cu
|
||||
expression_operators.cu
|
||||
node.cu
|
||||
node_operators.cu
|
||||
node_operators_unary.cu
|
||||
node_operators_binary.cu
|
||||
tensor.cu
|
||||
tensors/tensor.cu
|
||||
tensor_operators.cu
|
||||
vocab.cpp
|
||||
# vocab.cpp
|
||||
)
|
||||
|
||||
target_link_libraries(marian_lib)
|
||||
@ -23,13 +24,13 @@ cuda_add_executable(
|
||||
)
|
||||
|
||||
cuda_add_executable(
|
||||
mnist_benchmark
|
||||
mnist_benchmark.cu
|
||||
tensor_test
|
||||
tensor_test.cu
|
||||
)
|
||||
|
||||
cuda_add_executable(
|
||||
xor
|
||||
xor.cu
|
||||
mnist_benchmark
|
||||
mnist_benchmark.cu
|
||||
)
|
||||
|
||||
#cuda_add_executable(
|
||||
@ -37,18 +38,25 @@ cuda_add_executable(
|
||||
# validate_encoder_decoder.cu
|
||||
#)
|
||||
|
||||
cuda_add_executable(
|
||||
test_nodes
|
||||
test_nodes.cu
|
||||
)
|
||||
#cuda_add_executable(
|
||||
# test_nodes
|
||||
# test_nodes.cu
|
||||
#)
|
||||
|
||||
#target_link_libraries(softmax_benchmark marian_lib)
|
||||
#target_link_libraries(mnist_benchmark marian_lib)
|
||||
target_link_libraries(softmax_benchmark marian_lib)
|
||||
target_link_libraries(tensor_test marian_lib)
|
||||
target_link_libraries(mnist_benchmark marian_lib)
|
||||
#target_link_libraries(validate_encoder_decoder marian_lib)
|
||||
#target_link_libraries(test_nodes marian_lib)
|
||||
|
||||
foreach(exec mnist_benchmark softmax_benchmark xor test_nodes )
|
||||
target_link_libraries(${exec} marian_lib ${EXT_LIBS} cuda cudnn curand)
|
||||
#foreach(exec mnist_benchmark softmax_benchmark test_nodes tensor_test)
|
||||
# target_link_libraries(${exec} ${EXT_LIBS} cuda cudnn curand)
|
||||
# cuda_add_cublas_to_target(${exec})
|
||||
# set_target_properties(${exec} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}")
|
||||
#endforeach(exec)
|
||||
|
||||
foreach(exec mnist_benchmark tensor_test softmax_benchmark)
|
||||
target_link_libraries(${exec} ${EXT_LIBS} cuda cudnn curand)
|
||||
cuda_add_cublas_to_target(${exec})
|
||||
set_target_properties(${exec} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}")
|
||||
endforeach(exec)
|
||||
|
@ -131,8 +131,9 @@ class BatchGenerator {
|
||||
return currentBatch_;
|
||||
}
|
||||
|
||||
void prepare() {
|
||||
void prepare(bool shuffle=true) {
|
||||
//boost::timer::cpu_timer total;
|
||||
if(shuffle)
|
||||
data_->shuffle();
|
||||
//std::cerr << "shuffle: " << total.format(5, "%ws") << std::endl;
|
||||
current_ = data_->begin();
|
||||
|
@ -100,15 +100,8 @@ struct Chainable {
|
||||
virtual ExpressionGraphPtr graph() = 0;
|
||||
virtual const Shape& shape() = 0;
|
||||
|
||||
virtual DataType val() = 0;
|
||||
virtual DataType grad() = 0;
|
||||
|
||||
virtual void setVal(DataType t) {
|
||||
UTIL_THROW2("Tensors can only be assigned to input and parameter nodes");
|
||||
};
|
||||
virtual void setGrad(DataType t) {
|
||||
UTIL_THROW2("Gradients can only be assigned to parameter nodes");
|
||||
};
|
||||
virtual DataType& val() = 0;
|
||||
virtual DataType& grad() = 0;
|
||||
};
|
||||
|
||||
/** @brief Defines a convenience type to represent a shared pointer to a Chainable<Tensor> object. */
|
||||
|
@ -65,7 +65,7 @@ class Input {
|
||||
/** @brief Constructs a new Input object with the specified Shape */
|
||||
Input(const Shape& shape)
|
||||
: shape_(shape),
|
||||
data_(new Data(shape_.totalSize(), 0.0f)) {}
|
||||
data_(new Data(shape_.elements(), 0.0f)) {}
|
||||
|
||||
/** @brief Gets an iterator pointing to the beginning of this object's ::Data */
|
||||
Data::iterator begin() {
|
||||
|
@ -25,6 +25,9 @@
|
||||
#include <string>
|
||||
#include <functional>
|
||||
#include <memory>
|
||||
#include <cuda.h>
|
||||
|
||||
#include "shape.h"
|
||||
|
||||
namespace marian {
|
||||
/** @brief Creates shared_ptr of any type, passes all arguments to any available constructor */
|
||||
@ -33,7 +36,6 @@ namespace marian {
|
||||
return std::shared_ptr<T>(new T(std::forward<Args>(args)...));
|
||||
}
|
||||
|
||||
const size_t SHAPE_SIZE = 2;
|
||||
|
||||
typedef float Float;
|
||||
|
||||
@ -44,113 +46,14 @@ namespace marian {
|
||||
* In that case, this placeholder would be used to specify that the batch size value will be defined at some later point.
|
||||
*/
|
||||
const int whatevs{-1};
|
||||
|
||||
/**
|
||||
* @brief Represents the size of each dimension in a tensor.
|
||||
*
|
||||
* Note: this class currently is hard-coded to 2 dimensions.
|
||||
* This is likely to change.
|
||||
*/
|
||||
class Shape {
|
||||
private:
|
||||
int shape_[SHAPE_SIZE];
|
||||
|
||||
public:
|
||||
|
||||
/**
|
||||
* @brief Constructs a default shape.
|
||||
*
|
||||
* This default shape has two dimensions.
|
||||
* The size of each dimension is 1.
|
||||
*/
|
||||
Shape() : shape_{1, 1} { }
|
||||
|
||||
/**
|
||||
* @brief Constructs a shape.
|
||||
*
|
||||
* @param i A list of integers representing the size of each dimension.
|
||||
*/
|
||||
Shape(std::initializer_list<int> il) {
|
||||
std::copy(il.begin(), il.end(), begin());
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Gets a reference to the int representing the size of the <code>i</code>th dimension represented by this object.
|
||||
*
|
||||
* @return a reference to the int representing the size of the <code>i</code>th dimension represented by this object
|
||||
*/
|
||||
int& operator[](int i) {
|
||||
return shape_[i];
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Gets the size of the <code>i</code>th dimension represented by this object.
|
||||
*
|
||||
* @return the size of the <code>i</code>th dimension represented by this object
|
||||
*/
|
||||
const int& operator[](int i) const {
|
||||
return shape_[i];
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Gets the number of dimensions represented by this object
|
||||
*
|
||||
* @return the number of dimensions represented by this object
|
||||
*/
|
||||
size_t size() const {
|
||||
return SHAPE_SIZE;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Gets the total number of elements in a tensor of this shape.
|
||||
*
|
||||
* For example, if this shape represents a 5x100 tensor, this method would return 500.
|
||||
*
|
||||
* @return the total number of elements in a tensor of this shape
|
||||
*/
|
||||
size_t totalSize() const {
|
||||
size_t s = 1;
|
||||
for(int i = 0; i < size(); ++i)
|
||||
s *= shape_[i];
|
||||
return s;
|
||||
}
|
||||
|
||||
/** @brief Gets a pointer to an int that specifies the size of the first dimension represented by this object */
|
||||
int* begin() { return shape_; }
|
||||
|
||||
/** @brief Gets a pointer to an int that specifies the size of the last dimension represented by this object */
|
||||
int* end() { return shape_ + SHAPE_SIZE; }
|
||||
|
||||
/** @brief Gets a const pointer to an int that specifies the size of the first dimension represented by this object */
|
||||
const int* begin() const { return shape_; }
|
||||
|
||||
/** @brief Gets a const pointer to an int that specifies the size of the last dimension represented by this object */
|
||||
const int* end() const { return shape_+ SHAPE_SIZE; }
|
||||
|
||||
/**
|
||||
* @brief Tests this object for equality against another <code>Shape</code> object.
|
||||
*
|
||||
* @return <code>true</code> if the size of each dimension in this object
|
||||
* is equal to the size of the corresponding dimension in the other object,
|
||||
* <code>false</code> otherwise
|
||||
*/
|
||||
bool operator==(const Shape& other) const {
|
||||
return std::equal(begin(), end(), other.begin());
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Tests this object for inequality against another <code>Shape</code> object.
|
||||
*/
|
||||
bool operator!=(const Shape& other) const {
|
||||
return !(*this == other);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
#include "keywords.h"
|
||||
|
||||
namespace marian {
|
||||
class Tensor;
|
||||
class TensorBase;
|
||||
typedef std::shared_ptr<TensorBase> Tensor;
|
||||
|
||||
class OptimizerBase;
|
||||
typedef std::shared_ptr<OptimizerBase> OptimizerBasePtr;
|
||||
@ -173,7 +76,7 @@ namespace marian {
|
||||
KEY(value, float)
|
||||
KEY(lazy_shape, std::function<Shape()>)
|
||||
KEY(lazy_value, std::function<float()>)
|
||||
KEY(init, std::function<void(Tensor)>)
|
||||
KEY(init, std::function<void(Tensor&)>)
|
||||
|
||||
KEY(optimizer, OptimizerBasePtr)
|
||||
KEY(batch_size, int)
|
||||
|
@ -28,16 +28,74 @@
|
||||
#include "definitions.h"
|
||||
#include "chainable.h"
|
||||
#include "node_operators.h"
|
||||
#include "tensor.h"
|
||||
#include "batch_generator.h"
|
||||
#include "tensors/tensor_allocator.h"
|
||||
#include "tensors/tensor_gpu.h"
|
||||
|
||||
namespace marian {
|
||||
|
||||
// Forward declaration of ExpressionGraph class; this enables it to be used in the following typedef of ExpressionGraphPtr
|
||||
class ExpressionGraph;
|
||||
class Parameters {
|
||||
private:
|
||||
/** @brief List of all parameter nodes of this expression graph. */
|
||||
std::vector<Expr> params_;
|
||||
TensorAllocator vals_;
|
||||
TensorAllocator grads_;
|
||||
|
||||
/** @brief A pointer to an expression graph. */
|
||||
typedef std::shared_ptr<ExpressionGraph> ExpressionGraphPtr;
|
||||
public:
|
||||
Parameters()
|
||||
: vals_(newTensorAllocator<DeviceGPU>()),
|
||||
grads_(newTensorAllocator<DeviceGPU>())
|
||||
{}
|
||||
|
||||
auto begin() -> decltype(params_.begin()) {
|
||||
return params_.begin();
|
||||
}
|
||||
|
||||
auto end() -> decltype(params_.begin()) {
|
||||
return params_.end();
|
||||
}
|
||||
|
||||
size_t size() {
|
||||
return params_.size();
|
||||
}
|
||||
|
||||
size_t totalSize() {
|
||||
size_t sum = 0;
|
||||
for(auto p : params_)
|
||||
sum += p->shape().elements();
|
||||
return sum;
|
||||
}
|
||||
|
||||
void add(Expr p) {
|
||||
params_.push_back(p);
|
||||
}
|
||||
|
||||
void allocateForward() {
|
||||
if(vals_->capacity() == 0) {
|
||||
vals_->reserveExact(totalSize());
|
||||
for(auto p: params_)
|
||||
if(!p->val())
|
||||
vals_->allocate(p->val(), p->shape());
|
||||
}
|
||||
}
|
||||
|
||||
void allocateBackward() {
|
||||
if(grads_->capacity() == 0) {
|
||||
grads_->reserveExact(totalSize());
|
||||
for(auto p: params_)
|
||||
if(!p->grad())
|
||||
grads_->allocate(p->grad(), p->shape());
|
||||
}
|
||||
}
|
||||
|
||||
Tensor vals() {
|
||||
return vals_->asTensor();
|
||||
}
|
||||
|
||||
Tensor grads() {
|
||||
return grads_->asTensor();
|
||||
}
|
||||
};
|
||||
|
||||
template <class T, typename ...Args>
|
||||
Expr Expression(Args&& ... args);
|
||||
@ -50,7 +108,7 @@ class ExpressionGraph : public std::enable_shared_from_this<ExpressionGraph> {
|
||||
/** @brief Constructs a new expression graph
|
||||
* Constructor is private to force use of New<ExpressionGraph>()
|
||||
*/
|
||||
ExpressionGraph() {}
|
||||
ExpressionGraph() : tensors_(newTensorAllocator<DeviceGPU>()) {}
|
||||
|
||||
// delete copy and move constructors
|
||||
ExpressionGraph(const ExpressionGraph&) = delete;
|
||||
@ -69,8 +127,8 @@ class ExpressionGraph : public std::enable_shared_from_this<ExpressionGraph> {
|
||||
|
||||
for(int i = 0; i < gInputs.size(); ++i) {
|
||||
if(!gInputs[i]->val())
|
||||
gInputs[i]->setVal(Tensor(bInputs[i].shape()));
|
||||
gInputs[i]->val().set(bInputs[i].begin(), bInputs[i].end());
|
||||
tensor(gInputs[i]->val(), bInputs[i].shape());
|
||||
gInputs[i]->val()->set(bInputs[i].data());
|
||||
}
|
||||
}
|
||||
|
||||
@ -80,7 +138,7 @@ class ExpressionGraph : public std::enable_shared_from_this<ExpressionGraph> {
|
||||
* Backpropagation is implemented by first performing the forward pass
|
||||
* and then the backward pass of algorithmic differentiation (AD) on the nodes of the graph.
|
||||
*
|
||||
* @param batchSize XXX Marcin, could you provide a description of this param?
|
||||
* @param batch A batch of training data
|
||||
*/
|
||||
void backprop(data::BatchPtr batch) {
|
||||
forward(batch);
|
||||
@ -103,6 +161,8 @@ class ExpressionGraph : public std::enable_shared_from_this<ExpressionGraph> {
|
||||
* @param batch A batch of training data
|
||||
*/
|
||||
void forward(data::BatchPtr batch) {
|
||||
params_.allocateForward();
|
||||
|
||||
for(auto&& v : tape_)
|
||||
if(!v->skipped_training())
|
||||
v->allocate(batch->dim());
|
||||
@ -144,6 +204,8 @@ class ExpressionGraph : public std::enable_shared_from_this<ExpressionGraph> {
|
||||
UTIL_THROW_IF2(topNodes_.size() > 1,
|
||||
"There are more than one top most node for backward step");
|
||||
|
||||
params_.allocateBackward();
|
||||
|
||||
for(auto&& v : tape_)
|
||||
if(!v->skipped_training())
|
||||
v->set_zero_adjoint();
|
||||
@ -241,7 +303,7 @@ class ExpressionGraph : public std::enable_shared_from_this<ExpressionGraph> {
|
||||
template <typename ...Args>
|
||||
inline Expr param(Args ...args) {
|
||||
auto e = Expression<ParamNode>(shared_from_this(), args...);
|
||||
params_.emplace_back(e);
|
||||
params_.add(e);
|
||||
return e;
|
||||
}
|
||||
|
||||
@ -318,7 +380,7 @@ class ExpressionGraph : public std::enable_shared_from_this<ExpressionGraph> {
|
||||
*
|
||||
* @return the list of all parameter nodes of this expression graph
|
||||
*/
|
||||
std::vector<Expr>& params() {
|
||||
Parameters& params() {
|
||||
return params_;
|
||||
}
|
||||
|
||||
@ -346,14 +408,6 @@ class ExpressionGraph : public std::enable_shared_from_this<ExpressionGraph> {
|
||||
named_.emplace(name, e);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Returns a pointer to the list of items contained in this graph.
|
||||
*
|
||||
* The items in the list will be in the order they were created.
|
||||
*
|
||||
* @return a pointer to the list of items contained in this graph
|
||||
*/
|
||||
|
||||
void add(Expr node) {
|
||||
tape_.push_back(node);
|
||||
if(!node->skipped_training())
|
||||
@ -364,24 +418,34 @@ class ExpressionGraph : public std::enable_shared_from_this<ExpressionGraph> {
|
||||
topNodes_.erase(node);
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
 * @brief Requests a tensor from this graph's tensor allocator.
 *
 * @param t    Tensor handle passed to the allocator as its first argument
 * @param args Remaining arguments handed on to the allocator; the callers
 *             visible in this code pass a Shape
 */
template <class ...Args>
|
||||
void tensor(Tensor& t, Args&&... args) {
|
||||
// NOTE(review): args are handed on as lvalues; std::forward is not applied here.
tensors_->allocate(t, args...);
|
||||
}
|
||||
|
||||
private:
|
||||
|
||||
/** @brief Pointer to the list of nodes */
|
||||
/** @brief The full list of nodes */
|
||||
Tape tape_;
|
||||
|
||||
/** @brief Maps from name to expression node. */
|
||||
std::map<std::string, Expr> named_;
|
||||
|
||||
/** @brief List of all parameter nodes of this expression graph. */
|
||||
std::vector<Expr> params_;
|
||||
|
||||
/** @brief List of all input nodes of this expression graph. */
|
||||
std::vector<Expr> inputs_;
|
||||
|
||||
/** @brief Contains all nodes with regard to which we want to calculate derivatives */
|
||||
std::unordered_set<Expr> topNodes_;
|
||||
|
||||
Parameters params_;
|
||||
TensorAllocator tensors_;
|
||||
};
|
||||
|
||||
/** @brief A pointer to an expression graph. */
|
||||
typedef std::shared_ptr<ExpressionGraph> ExpressionGraphPtr;
|
||||
|
||||
template <class T, typename ...Args>
|
||||
Expr Expression(Args&& ... args) {
|
||||
auto e = Expr(new T(std::forward<Args>(args)...));
|
||||
@ -389,5 +453,4 @@ Expr Expression(Args&& ... args) {
|
||||
return e;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
@ -92,39 +92,9 @@ Expr reluplus(Expr a, Expr b);
|
||||
|
||||
/*********************************************************/
|
||||
|
||||
// inefficient
|
||||
template <typename ...Args>
|
||||
inline Expr sum(Expr a, Args ...args) {
|
||||
using namespace keywords;
|
||||
Keywords params(args...);
|
||||
int ax = params.Get(axis, whatevs);
|
||||
|
||||
if(ax == 0) {
|
||||
auto lshape = [a]() -> Shape {
|
||||
int rows = a->val().shape()[0];
|
||||
return {1, rows};
|
||||
};
|
||||
Expr one = a->graph()->ones(shape={1, a->shape()[0]},
|
||||
lazy_shape=lshape);
|
||||
return dot(one, a);
|
||||
}
|
||||
else if(ax == 1) {
|
||||
auto lshape = [a]() -> Shape {
|
||||
int cols = a->val().shape()[1];
|
||||
//std::cerr << "Shape will be " << cols << " by 1." << std::endl;
|
||||
return {cols, 1};
|
||||
};
|
||||
Expr one = a->graph()->ones(shape={a->shape()[1], 1},
|
||||
lazy_shape=lshape);
|
||||
return dot(a, one);
|
||||
}
|
||||
else if(ax == 2) {
|
||||
UTIL_THROW2("Not implemented");
|
||||
}
|
||||
else if(ax == 3) {
|
||||
UTIL_THROW2("Not implemented");
|
||||
}
|
||||
return sum(sum(a, axis=0), axis=1);
|
||||
Expr sum(Expr a, Args ...args) {
|
||||
return Expression<SumNodeOp>(a, args...);
|
||||
}
|
||||
|
||||
Expr softmax(Expr a);
|
||||
@ -133,34 +103,9 @@ Expr logsoftmax(Expr a);
|
||||
|
||||
Expr argmax(Expr a);
|
||||
|
||||
// inefficient
|
||||
template <typename ...Args>
|
||||
inline Expr mean(Expr a, Args ...args) {
|
||||
using namespace keywords;
|
||||
Keywords params(args...);
|
||||
size_t ax = params.Get(axis, whatevs);
|
||||
|
||||
switch (ax) {
|
||||
case 0:
|
||||
return sum(a, axis=0) / a->graph()->constant(shape={1, 1},
|
||||
lazy_value=[a]() -> Float {
|
||||
return a->val().shape()[0];
|
||||
});
|
||||
case 1:
|
||||
return sum(a, axis=1) / a->graph()->constant(shape={1, 1},
|
||||
lazy_value=[a]() -> Float {
|
||||
return a->val().shape()[1];
|
||||
});
|
||||
case 2:
|
||||
UTIL_THROW2("Not implemented");
|
||||
case 3:
|
||||
UTIL_THROW2("Not implemented");
|
||||
default:
|
||||
return sum(a) / a->graph()->constant(shape={1, 1},
|
||||
lazy_value=[a]() -> Float {
|
||||
return a->val().size();
|
||||
});
|
||||
}
|
||||
Expr mean(Expr a, Args ...args) {
|
||||
return Expression<MeanNodeOp>(a, args...);
|
||||
}
|
||||
|
||||
Expr cross_entropy(Expr a, Expr b);
|
||||
|
@ -10,6 +10,10 @@
|
||||
#include "trainer.h"
|
||||
#include "models/feedforward.h"
|
||||
|
||||
#include "tensors/tensor.h"
|
||||
#include "tensors/tensor_gpu.h"
|
||||
#include "tensors/tensor_allocator.h"
|
||||
|
||||
using namespace marian;
|
||||
using namespace keywords;
|
||||
using namespace data;
|
||||
|
143
src/node.cu
143
src/node.cu
@ -9,67 +9,96 @@ void Node::skip_training() {
|
||||
graph_->remove_top_node(shared_from_this());
|
||||
}
|
||||
|
||||
void Node::allocate(size_t batchSize) {
|
||||
auto it1 = shape_.begin();
|
||||
auto it2 = givenShape_.begin();
|
||||
while(it1 != shape_.end()) {
|
||||
if(*it2 == whatevs)
|
||||
*it1 = batchSize;
|
||||
it1++; it2++;
|
||||
}
|
||||
|
||||
graph_->tensor(val_, shape_);
|
||||
|
||||
if(Has(keywords::value))
|
||||
val_->set(Get(keywords::value, 0));
|
||||
}
|
||||
|
||||
void Node::init_dependent() {
|
||||
if(!adj_)
|
||||
graph_->tensor(adj_, shape_);
|
||||
adj_->set(1);
|
||||
}
|
||||
|
||||
void Node::set_zero_adjoint() {
|
||||
if(!adj_) {
|
||||
graph_->tensor(adj_, shape_);
|
||||
}
|
||||
adj_->set(0);
|
||||
}
|
||||
|
||||
|
||||
// GPU
|
||||
void Node::calc_numeric_grad(Float delta, Tensor input, Tensor grad) {
|
||||
using namespace std;
|
||||
|
||||
size_t inputSize = GetTotalSize(input.shape());
|
||||
size_t valSize = GetTotalSize(val_.shape());
|
||||
|
||||
UTIL_THROW_IF2(inputSize != GetTotalSize(grad.shape()),
|
||||
"inputSize != gradSize:" << inputSize << "!=" << GetTotalSize(grad.shape()));
|
||||
UTIL_THROW_IF2(valSize != GetTotalSize(adj_.shape()),
|
||||
"valSize != adjSize :" << valSize << "!=" << GetTotalSize(adj_.shape()));
|
||||
|
||||
cerr << "inputSize=grad=" << Debug(input.shape())<< "=" << inputSize << " "
|
||||
<< "valSize=adj_=" << Debug(val_.shape()) << "=" << valSize
|
||||
<< endl;
|
||||
|
||||
//cerr << "input=" << input.Debug() << endl;
|
||||
//cerr << "adj_=" << adj_.Debug() << endl;
|
||||
|
||||
std::vector<float> prevCalcGrad;
|
||||
prevCalcGrad << grad;
|
||||
//cerr << "origGrad=" << grad.Debug() << endl;
|
||||
//output("diffGrad", diffGrad);
|
||||
|
||||
//output("prevCalcGrad", prevCalcGrad.begin(), prevCalcGrad.end());
|
||||
|
||||
Tensor newValTensor(input.shape());
|
||||
|
||||
// LOOP thru each element in input & add delta
|
||||
for (size_t inputInd = 0; inputInd < inputSize; ++inputInd) {
|
||||
input.incr(inputInd, delta);
|
||||
//output("input", input.begin(), input.end());
|
||||
|
||||
forward();
|
||||
|
||||
val_.sum(newValTensor, inputInd);
|
||||
//cudaDeviceSynchronize();
|
||||
|
||||
input.incr(inputInd, -delta);
|
||||
}
|
||||
|
||||
std::vector<float> newVal;
|
||||
newVal << newValTensor;
|
||||
//cudaDeviceSynchronize();
|
||||
|
||||
// orig value
|
||||
forward();
|
||||
|
||||
float sumValOrig = val_.sum();
|
||||
//float sumValOrig = thrust::reduce(val_.begin(), val_.end(), (float) 0.0f, thrust::plus<float>());
|
||||
//cudaDeviceSynchronize();
|
||||
|
||||
//output("newVal", newVal.begin(), newVal.end());
|
||||
|
||||
// calc gradient
|
||||
Tensor prevGradTensor(input.shape());
|
||||
thrust::copy(grad.begin(), grad.end(), prevGradTensor.begin());
|
||||
|
||||
Tensor gradTensor(input.shape());
|
||||
Element(_1 = (_2 - sumValOrig) / delta, gradTensor, newValTensor);
|
||||
Element(_1 = _2 * _3 + _4, grad, adj_, gradTensor, prevGradTensor);
|
||||
// size_t inputSize = GetTotalSize(input->shape());
|
||||
// size_t valSize = GetTotalSize(val_->shape());
|
||||
//
|
||||
// UTIL_THROW_IF2(inputSize != GetTotalSize(grad->shape()),
|
||||
// "inputSize != gradSize:" << inputSize << "!=" << grad->shape()->elements());
|
||||
// UTIL_THROW_IF2(valSize != GetTotalSize(adj_->shape()),
|
||||
// "valSize != adjSize :" << valSize << "!=" << adj_->shape()->elements());
|
||||
//
|
||||
// cerr << "inputSize=grad=" << Debug(input->shape())<< "=" << inputSize << " "
|
||||
// << "valSize=adj_=" << Debug(val_->shape()) << "=" << valSize
|
||||
// << endl;
|
||||
//
|
||||
// //cerr << "input=" << input.Debug() << endl;
|
||||
// //cerr << "adj_=" << adj_.Debug() << endl;
|
||||
//
|
||||
// std::vector<float> prevCalcGrad;
|
||||
// prevCalcGrad << grad;
|
||||
// //cerr << "origGrad=" << grad.Debug() << endl;
|
||||
// //output("diffGrad", diffGrad);
|
||||
//
|
||||
// //output("prevCalcGrad", prevCalcGrad.begin(), prevCalcGrad.end());
|
||||
//
|
||||
// Tensor newValTensor(input.shape());
|
||||
//
|
||||
// // LOOP thru each element in input & add delta
|
||||
// for (size_t inputInd = 0; inputInd < inputSize; ++inputInd) {
|
||||
// input.incr(inputInd, delta);
|
||||
// //output("input", input.begin(), input.end());
|
||||
//
|
||||
// forward();
|
||||
//
|
||||
// val_.sum(newValTensor, inputInd);
|
||||
// //cudaDeviceSynchronize();
|
||||
//
|
||||
// input.incr(inputInd, -delta);
|
||||
// }
|
||||
//
|
||||
// std::vector<float> newVal;
|
||||
// newVal << newValTensor;
|
||||
// //cudaDeviceSynchronize();
|
||||
//
|
||||
// // orig value
|
||||
// forward();
|
||||
//
|
||||
// float sumValOrig = val_.sum();
|
||||
// //float sumValOrig = thrust::reduce(val_.begin(), val_.end(), (float) 0.0f, thrust::plus<float>());
|
||||
// //cudaDeviceSynchronize();
|
||||
//
|
||||
// //output("newVal", newVal.begin(), newVal.end());
|
||||
//
|
||||
// // calc gradient
|
||||
// Tensor prevGradTensor(input.shape());
|
||||
// thrust::copy(grad.begin(), grad.end(), prevGradTensor.begin());
|
||||
//
|
||||
// Tensor gradTensor(input.shape());
|
||||
// Element(_1 = (_2 - sumValOrig) / delta, gradTensor, newValTensor);
|
||||
// Element(_1 = _2 * _3 + _4, grad, adj_, gradTensor, prevGradTensor);
|
||||
}
|
||||
|
||||
/*
|
||||
|
48
src/node.h
48
src/node.h
@ -22,9 +22,10 @@
|
||||
// SOFTWARE.
|
||||
|
||||
#include <memory>
|
||||
#include <iostream>
|
||||
|
||||
#include "keywords.h"
|
||||
#include "tensor.h"
|
||||
#include "tensors/tensor.h"
|
||||
#include "chainable.h"
|
||||
|
||||
namespace marian {
|
||||
@ -60,52 +61,17 @@ class Node : public Chainable<Tensor>,
|
||||
|
||||
virtual bool skipped_training() { return skipTraining_; }
|
||||
|
||||
virtual void allocate(size_t batchSize) {
|
||||
auto it1 = shape_.begin();
|
||||
auto it2 = givenShape_.begin();
|
||||
while(it1 != shape_.end()) {
|
||||
if(*it2 == whatevs)
|
||||
*it1 = batchSize;
|
||||
it1++; it2++;
|
||||
}
|
||||
virtual void allocate(size_t batchSize);
|
||||
|
||||
if(Has(keywords::lazy_shape)) {
|
||||
auto defaultShape = [this]() -> Shape { return shape_; };
|
||||
shape_ = Get(keywords::lazy_shape, defaultShape)();
|
||||
}
|
||||
if(Has(keywords::lazy_value))
|
||||
val_.allocate(shape_, Get(
|
||||
keywords::lazy_value, []()->Float{return 0.f;})());
|
||||
else if(Has(keywords::value))
|
||||
val_.allocate(shape_, Get(keywords::value, 0));
|
||||
else
|
||||
val_.allocate(shape_);
|
||||
}
|
||||
virtual void init_dependent();
|
||||
|
||||
virtual void init_dependent() {
|
||||
if(adj_) {
|
||||
adj_.set(1);
|
||||
}
|
||||
else {
|
||||
adj_.allocate(shape_, 1);
|
||||
}
|
||||
}
|
||||
virtual void set_zero_adjoint();
|
||||
|
||||
virtual void set_zero_adjoint() {
|
||||
if(adj_) {
|
||||
adj_.set(0);
|
||||
}
|
||||
else {
|
||||
adj_.allocate(shape_, 0);
|
||||
}
|
||||
}
|
||||
|
||||
virtual Tensor val() {
|
||||
virtual Tensor& val() {
|
||||
return val_;
|
||||
};
|
||||
|
||||
virtual Tensor grad() {
|
||||
//UTIL_THROW_IF2(!adj_, "Tensor has not been allocated");
|
||||
virtual Tensor& grad() {
|
||||
return adj_;
|
||||
};
|
||||
|
||||
|
37
src/node_operators.cu
Normal file
37
src/node_operators.cu
Normal file
@ -0,0 +1,37 @@
|
||||
// This file is part of the Marian toolkit.
|
||||
// Marian is copyright (c) 2016 Marcin Junczys-Dowmunt.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in
|
||||
// all copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
#include "node_operators.h"
|
||||
#include "expression_graph.h"
|
||||
|
||||
namespace marian {
|
||||
|
||||
void ParamNode::allocate(size_t batchSize) {
|
||||
// @TODO params
|
||||
graph()->tensor(val_, shape_);
|
||||
if(!initialized_) {
|
||||
init_(val_);
|
||||
initialized_ = true;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
@ -39,12 +39,6 @@ struct InputNode : public Node {
|
||||
|
||||
~InputNode() {}
|
||||
|
||||
virtual void setVal(Tensor t) {
|
||||
val_ = t;
|
||||
shape_ = t.shape();
|
||||
//@todo, shape checking
|
||||
}
|
||||
|
||||
void forward() {}
|
||||
void backward() {}
|
||||
|
||||
@ -82,7 +76,7 @@ struct ParamNode : public Node {
|
||||
template <typename ...Args>
|
||||
ParamNode(Args ...args)
|
||||
: Node(args...),
|
||||
init_(Get(keywords::init, [](Tensor){ })),
|
||||
init_(Get(keywords::init, [](Tensor&){ })),
|
||||
initialized_(false)
|
||||
{
|
||||
UTIL_THROW_IF2(!Has(keywords::shape) &&
|
||||
@ -90,30 +84,12 @@ struct ParamNode : public Node {
|
||||
"Param items require shape information");
|
||||
}
|
||||
|
||||
virtual void setVal(Tensor t) {
|
||||
val_ = t;
|
||||
shape_ = t.shape();
|
||||
//@todo, shape checking
|
||||
};
|
||||
|
||||
~ParamNode() {}
|
||||
|
||||
virtual void setGrad(Tensor t) {
|
||||
adj_ = t;
|
||||
shape_ = t.shape();
|
||||
//@todo, shape checking
|
||||
};
|
||||
|
||||
void forward() {}
|
||||
void backward() {}
|
||||
|
||||
virtual void allocate(size_t batchSize) {
|
||||
val_.allocate(shape_);
|
||||
if(!initialized_) {
|
||||
init_(val_);
|
||||
initialized_ = true;
|
||||
}
|
||||
}
|
||||
virtual void allocate(size_t batchSize);
|
||||
|
||||
virtual std::string graphviz() {
|
||||
std::stringstream ss;
|
||||
@ -125,7 +101,7 @@ struct ParamNode : public Node {
|
||||
|
||||
|
||||
private:
|
||||
std::function<void(Tensor)> init_;
|
||||
std::function<void(Tensor&)> init_;
|
||||
bool initialized_;
|
||||
};
|
||||
|
||||
|
@ -6,4 +6,21 @@ namespace marian {
|
||||
graph_->remove_top_node(a_);
|
||||
graph_->remove_top_node(b_);
|
||||
}
|
||||
|
||||
// We're caching the logsoftmax probabilities here because we'll need them for
|
||||
// the backward computation.
|
||||
void CrossEntropyNodeOp::forward() {
|
||||
// C = sum(-B * logsoftmax(A))
|
||||
if(!probs_)
|
||||
graph_->tensor(probs_, a_->val()->shape());
|
||||
|
||||
CudnnLogSoftmax(probs_, a_->val());
|
||||
|
||||
if(!result_)
|
||||
graph_->tensor(result_, a_->val()->shape());
|
||||
Element(_1 = -_2 * _3, result_, b_->val(), probs_);
|
||||
Sum(val_, result_, 1);
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
@ -1,6 +1,7 @@
|
||||
#pragma once
|
||||
|
||||
#include "node.h"
|
||||
#include "thrust_functions.h"
|
||||
#include "tensor_operators.h"
|
||||
|
||||
namespace marian {
|
||||
@ -29,53 +30,53 @@ struct BinaryNodeOp : public Node {
|
||||
void remove_children_from_top_nodes();
|
||||
|
||||
void backward_debug(Float delta) {
|
||||
using namespace std;
|
||||
|
||||
cerr << "BinaryNodeOp::" << typeid(*this).name() << "::backward_debug()" << endl;
|
||||
|
||||
std::vector<float> preCalcGradA, diffGradA, numericalGradA;
|
||||
preCalcGradA << a_->grad();
|
||||
//output("preCalcGradA", preCalcGradA);
|
||||
|
||||
std::vector<float> preCalcGradB, diffGradB, numericalGradB;
|
||||
preCalcGradB << b_->grad();
|
||||
//output("preCalcGradB", preCalcGradB);
|
||||
|
||||
// use df/dx to calc grad
|
||||
backward();
|
||||
cerr << "orig a_->grad()=" << a_->grad().Debug() << endl;
|
||||
cerr << "orig b_->grad()=" << b_->grad().Debug() << endl;
|
||||
|
||||
diffGradA << a_->grad();
|
||||
diffGradB << b_->grad();
|
||||
|
||||
//using namespace std;
|
||||
//
|
||||
//cerr << "BinaryNodeOp::" << typeid(*this).name() << "::backward_debug()" << endl;
|
||||
//
|
||||
//std::vector<float> preCalcGradA, diffGradA, numericalGradA;
|
||||
//preCalcGradA << a_->grad();
|
||||
////output("preCalcGradA", preCalcGradA);
|
||||
//
|
||||
//std::vector<float> preCalcGradB, diffGradB, numericalGradB;
|
||||
//preCalcGradB << b_->grad();
|
||||
////output("preCalcGradB", preCalcGradB);
|
||||
//
|
||||
//// use df/dx to calc grad
|
||||
//backward();
|
||||
//cerr << "orig a_->grad()=" << a_->grad().Debug() << endl;
|
||||
//cerr << "orig b_->grad()=" << b_->grad().Debug() << endl;
|
||||
|
||||
cerr << "TENSOR A:" << endl;
|
||||
a_->grad().set(preCalcGradA);
|
||||
b_->grad().set(preCalcGradB);
|
||||
|
||||
calc_numeric_grad(delta, a_->val(), a_->grad());
|
||||
cerr << "numerical a_->grad()=" << a_->grad().Debug() << endl;
|
||||
|
||||
numericalGradA << a_->grad();
|
||||
outputL2Norm("TENSOR A", diffGradA, numericalGradA);
|
||||
|
||||
|
||||
cerr << "TENSOR B:" << endl;
|
||||
a_->grad().set(preCalcGradA);
|
||||
b_->grad().set(preCalcGradB);
|
||||
|
||||
calc_numeric_grad(delta, b_->val(), b_->grad());
|
||||
cerr << "numerical b_->grad()=" << b_->grad().Debug() << endl;
|
||||
|
||||
numericalGradB << b_->grad();
|
||||
outputL2Norm("TENSOR B", diffGradB, numericalGradB);
|
||||
|
||||
// reset to diff grad
|
||||
a_->grad().set(diffGradA);
|
||||
b_->grad().set(diffGradB);
|
||||
//
|
||||
//diffGradA << a_->grad();
|
||||
//diffGradB << b_->grad();
|
||||
//
|
||||
////cerr << "orig a_->grad()=" << a_->grad().Debug() << endl;
|
||||
////cerr << "orig b_->grad()=" << b_->grad().Debug() << endl;
|
||||
//
|
||||
//cerr << "TENSOR A:" << endl;
|
||||
//a_->grad().set(preCalcGradA);
|
||||
//b_->grad().set(preCalcGradB);
|
||||
//
|
||||
//calc_numeric_grad(delta, a_->val(), a_->grad());
|
||||
//cerr << "numerical a_->grad()=" << a_->grad().Debug() << endl;
|
||||
//
|
||||
//numericalGradA << a_->grad();
|
||||
//outputL2Norm("TENSOR A", diffGradA, numericalGradA);
|
||||
//
|
||||
//
|
||||
//cerr << "TENSOR B:" << endl;
|
||||
//a_->grad().set(preCalcGradA);
|
||||
//b_->grad().set(preCalcGradB);
|
||||
//
|
||||
//calc_numeric_grad(delta, b_->val(), b_->grad());
|
||||
//cerr << "numerical b_->grad()=" << b_->grad().Debug() << endl;
|
||||
//
|
||||
//numericalGradB << b_->grad();
|
||||
//outputL2Norm("TENSOR B", diffGradB, numericalGradB);
|
||||
//
|
||||
//// reset to diff grad
|
||||
//a_->grad().set(diffGradA);
|
||||
//b_->grad().set(diffGradB);
|
||||
}
|
||||
|
||||
|
||||
@ -231,7 +232,7 @@ struct MultNodeOp : public BinaryNodeOp {
|
||||
|
||||
virtual std::string graphviz() {
|
||||
std::stringstream ss;
|
||||
ss << "\"" << this << "\" [shape=\"box\", label=" << label("x")
|
||||
ss << "\"" << this << "\" [shape=\"box\", label=" << label("×")
|
||||
<< ", style=\"filled\", fillcolor=\"yellow\"]" << std::endl;
|
||||
ss << "\"" << a_ << "\" -> \"" << this << "\"" << std::endl;
|
||||
ss << "\"" << b_ << "\" -> \"" << this << "\"" << std::endl << std::endl;
|
||||
@ -285,22 +286,7 @@ struct CrossEntropyNodeOp : public BinaryNodeOp {
|
||||
return shape1;
|
||||
}
|
||||
|
||||
// We're caching the logsoftmax probabilities here because we'll need them for
|
||||
// the backward computation.
|
||||
void forward() {
|
||||
// C = -dot(B, logsoftmax(A)).
|
||||
if (probs_) {
|
||||
probs_.set(0.0);
|
||||
} else {
|
||||
probs_.allocate(a_->val().shape(), 0.0);
|
||||
}
|
||||
|
||||
CudnnLogSoftmax(probs_, a_->val());
|
||||
if(!result_)
|
||||
result_.allocate(a_->val().shape());
|
||||
Element(_1 = -_2 * _3, result_, b_->val(), probs_);
|
||||
SumRowwise(result_, val_);
|
||||
}
|
||||
void forward();
|
||||
|
||||
// @TODO: In most cases it's wasteful to compute the derivative with respect
|
||||
// to the second input which is typically an input node in the computation
|
||||
@ -318,15 +304,15 @@ struct CrossEntropyNodeOp : public BinaryNodeOp {
|
||||
a_->grad(), adj_, probs_, b_->val());
|
||||
|
||||
// Compute second input derivative.
|
||||
Element(_1 -= _2 * _3, b_->grad(),
|
||||
adj_, probs_);
|
||||
Element(_1 -= _2 * _3,
|
||||
b_->grad(), adj_, probs_);
|
||||
}
|
||||
|
||||
virtual std::string graphviz() {
|
||||
std::stringstream ss;
|
||||
ss << "\"" << this << "\" [shape=\"box\", label=" << label("x-ent")
|
||||
<< ", style=\"filled\", fillcolor=\"orange\"]" << std::endl;
|
||||
ss << "\"" << a_ << "\" -> \"" << this << "\"" << std::endl << std::endl;
|
||||
ss << "\"" << a_ << "\" -> \"" << this << "\"" << std::endl;
|
||||
ss << "\"" << b_ << "\" -> \"" << this << "\"" << std::endl << std::endl;
|
||||
return ss.str();
|
||||
};
|
||||
|
@ -1,7 +1,9 @@
|
||||
#pragma once
|
||||
|
||||
#include "node.h"
|
||||
#include "tensors/tensor.h"
|
||||
#include "tensor_operators.h"
|
||||
#include "thrust_functions.h"
|
||||
|
||||
namespace marian {
|
||||
|
||||
@ -26,29 +28,29 @@ struct UnaryNodeOp : public Node {
|
||||
|
||||
void backward_debug(Float delta) {
|
||||
using namespace std;
|
||||
|
||||
cerr << "UnaryNodeOp::" << typeid(*this).name() << "::backward_numeric()" << endl;
|
||||
|
||||
std::vector<float> preCalcGradA, diffGradA, numericalGradA;
|
||||
preCalcGradA << a_->grad();
|
||||
//output("preCalcGradA", preCalcGradA);
|
||||
|
||||
// use df/dx to calc grad
|
||||
backward();
|
||||
cerr << "orig a_->grad()=" << a_->grad().Debug() << endl;
|
||||
diffGradA << a_->grad();
|
||||
|
||||
a_->grad().set(preCalcGradA);
|
||||
|
||||
calc_numeric_grad(delta, a_->val(), a_->grad());
|
||||
cerr << "numerical a_->grad()=" << a_->grad().Debug() << endl;
|
||||
|
||||
numericalGradA << a_->grad();
|
||||
|
||||
outputL2Norm("", diffGradA, numericalGradA);
|
||||
|
||||
// reset to diff grad
|
||||
a_->grad().set(diffGradA);
|
||||
//
|
||||
//cerr << "UnaryNodeOp::" << typeid(*this).name() << "::backward_numeric()" << endl;
|
||||
//
|
||||
//std::vector<float> preCalcGradA, diffGradA, numericalGradA;
|
||||
//a_->grad() >> preCalcGradA ;
|
||||
////output("preCalcGradA", preCalcGradA);
|
||||
//
|
||||
//// use df/dx to calc grad
|
||||
//backward();
|
||||
//cerr << "orig a_->grad()=" << a_->grad().Debug() << endl;
|
||||
//a_->grad() >> diffGradA;
|
||||
//
|
||||
//a_->grad()->set(preCalcGradA);
|
||||
//
|
||||
//calc_numeric_grad(delta, a_->val(), a_->grad());
|
||||
////cerr << "numerical a_->grad()=" << a_->grad()->Debug() << endl;
|
||||
//
|
||||
//a_->grad() >> numericalGradA;
|
||||
//
|
||||
//outputL2Norm("", diffGradA, numericalGradA);
|
||||
//
|
||||
//// reset to diff grad
|
||||
//a_->grad()->set(diffGradA);
|
||||
}
|
||||
|
||||
};
|
||||
@ -68,10 +70,6 @@ struct LogitNodeOp : public UnaryNodeOp {
|
||||
a_->grad(), adj_, val_);
|
||||
}
|
||||
|
||||
void check() {
|
||||
|
||||
}
|
||||
|
||||
virtual std::string graphviz() {
|
||||
std::stringstream ss;
|
||||
ss << "\"" << this << "\" [shape=\"box\", label=" << label("logit")
|
||||
@ -210,7 +208,7 @@ struct SoftmaxNodeOp : public UnaryNodeOp {
|
||||
: UnaryNodeOp(args...) { }
|
||||
|
||||
void forward() {
|
||||
CudnnSoftmax(val_, a_->val());
|
||||
Softmax(val_, a_->val());
|
||||
}
|
||||
|
||||
void backward() {
|
||||
@ -292,12 +290,94 @@ struct ArgmaxNodeOp : public UnaryNodeOp {
|
||||
|
||||
};
|
||||
|
||||
/**
 * Unary graph node that reduces its input with Sum() along an axis.
 *
 * The axis is taken from the keywords::axis argument (default -1):
 * axis 0 collapses dimension 0 to size 1, axis 1 collapses dimension 1 to
 * size 1, and any other value collapses both dimensions.
 */
struct SumNodeOp : public UnaryNodeOp {
  template <typename ...Args>
  SumNodeOp(Expr a, Args ...args)
    : UnaryNodeOp(a, keywords::shape=newShape(a, args...), args...) { }

  /** Writes the reduction of a_->val() along the configured axis into val_. */
  void forward() {
    Sum(val_, a_->val(), Get(keywords::axis, -1));
  }

  /** Propagates adj_ into a_->grad() along the configured axis. */
  void backward() {
    SumBackward(a_->grad(), adj_, Get(keywords::axis, -1));
  }

  /**
   * Computes the output shape for input @p a and the axis found in @p args.
   *
   * This is static on purpose: it is evaluated inside the constructor's
   * mem-initializer list, i.e. before the UnaryNodeOp base sub-object exists.
   * Calling a non-static member function at that point is undefined behaviour.
   *
   * @return a's shape with the reduced dimension(s) set to 1
   */
  template <class ...Args>
  static Shape newShape(Expr a, Args ...args) {
    int ax = keywords::Get(keywords::axis, -1, args...);
    Shape shape = a->shape();
    if(ax == 0) {
      shape[0] = 1;
    }
    else if(ax == 1) {
      shape[1] = 1;
    }
    else {
      // No (or unknown) axis: reduce over everything.
      shape[0] = 1;
      shape[1] = 1;
    }
    return shape;
  }

  /** Emits the graphviz description of this node and its incoming edge. */
  virtual std::string graphviz() {
    std::stringstream ss;
    ss << "\"" << this << "\" [shape=\"box\", label="
       << label("sum") << ", style=\"filled\", fillcolor=\"orange\"]" << std::endl;
    ss << "\"" << a_ << "\" -> \"" << this << "\"" << std::endl << std::endl;
    return ss.str();
  }

};
|
||||
|
||||
/**
 * Unary graph node that averages its input along an axis.
 *
 * It calls the same Sum()/SumBackward() routines as a plain sum, but passes
 * `true` as the trailing (mean) argument. The axis is taken from the
 * keywords::axis argument (default -1): axis 0 collapses dimension 0 to
 * size 1, axis 1 collapses dimension 1 to size 1, and any other value
 * collapses both dimensions.
 */
struct MeanNodeOp : public UnaryNodeOp {
  template <typename ...Args>
  MeanNodeOp(Expr a, Args ...args)
    : UnaryNodeOp(a, keywords::shape=newShape(a, args...), args...) { }

  /** Writes the mean of a_->val() along the configured axis into val_. */
  void forward() {
    Sum(val_, a_->val(), Get(keywords::axis, -1), true);
  }

  /** Propagates adj_ into a_->grad() along the configured axis. */
  void backward() {
    SumBackward(a_->grad(), adj_, Get(keywords::axis, -1), true);
  }

  /**
   * Computes the output shape for input @p a and the axis found in @p args.
   *
   * This is static on purpose: it is evaluated inside the constructor's
   * mem-initializer list, i.e. before the UnaryNodeOp base sub-object exists.
   * Calling a non-static member function at that point is undefined behaviour.
   *
   * @return a's shape with the reduced dimension(s) set to 1
   */
  template <class ...Args>
  static Shape newShape(Expr a, Args ...args) {
    int ax = keywords::Get(keywords::axis, -1, args...);
    Shape shape = a->shape();
    if(ax == 0) {
      shape[0] = 1;
    }
    else if(ax == 1) {
      shape[1] = 1;
    }
    else {
      // No (or unknown) axis: reduce over everything.
      shape[0] = 1;
      shape[1] = 1;
    }
    return shape;
  }

  /** Emits the graphviz description of this node and its incoming edge. */
  virtual std::string graphviz() {
    std::stringstream ss;
    ss << "\"" << this << "\" [shape=\"box\", label="
       << label("mean") << ", style=\"filled\", fillcolor=\"orange\"]" << std::endl;
    ss << "\"" << a_ << "\" -> \"" << this << "\"" << std::endl << std::endl;
    return ss.str();
  }

};
|
||||
|
||||
|
||||
struct LogNodeOp : public UnaryNodeOp {
|
||||
template <typename ...Args>
|
||||
LogNodeOp(Args ...args)
|
||||
: UnaryNodeOp(args...) {}
|
||||
|
||||
// Element-wise natural logarithm: val_ = Log(a_->val()).
// The former std::cerr dump of the two raw tensor pointers was debugging
// residue that fired on every forward pass; it has been removed.
void forward() {
  Element(_1 = Log(_2), val_, a_->val());
}
|
||||
|
||||
|
@ -38,29 +38,35 @@ class Sgd : public OptimizerBase {
|
||||
class Adagrad : public OptimizerBase {
|
||||
public:
|
||||
Adagrad(float eta=0.01, float eps=1e-8)
|
||||
: eta_(eta), eps_(eps) {}
|
||||
: eta_(eta), eps_(eps),
|
||||
alloc_(newTensorAllocator<DeviceGPU>())
|
||||
{}
|
||||
|
||||
void update(ExpressionGraphPtr graph, data::BatchPtr batch) {
|
||||
graph->backprop(batch);
|
||||
|
||||
if(gt_.size() < graph->params().size())
|
||||
for(auto& param : graph->params())
|
||||
gt_.emplace_back(Tensor(param->grad().shape(), 0));
|
||||
|
||||
auto gtIt = gt_.begin();
|
||||
for(auto& param : graph->params()) {
|
||||
Element(_1 += (_2 * _2),
|
||||
*gtIt, param->grad());
|
||||
Element(_1 -= (eta_ / (Sqrt(_2) + eps_)) * _3,
|
||||
param->val(), *gtIt, param->grad());
|
||||
gtIt++;
|
||||
if(!gt_) {
|
||||
int totalSize = graph->params().totalSize();
|
||||
alloc_->reserveExact(totalSize);
|
||||
alloc_->allocate(gt_, {1, totalSize});
|
||||
gt_->set(0);
|
||||
}
|
||||
|
||||
Tensor pVals = graph->params().vals();
|
||||
Tensor pGrads = graph->params().grads();
|
||||
|
||||
ElementVec(_1 += (_2 * _2),
|
||||
gt_, pGrads);
|
||||
|
||||
ElementVec(_1 -= (eta_ / (Sqrt(_2) + eps_)) * _3,
|
||||
pVals, gt_, pGrads);
|
||||
}
|
||||
|
||||
private:
|
||||
float eta_;
|
||||
float eps_;
|
||||
std::vector<Tensor> gt_;
|
||||
TensorAllocator alloc_;
|
||||
Tensor gt_;
|
||||
};
|
||||
|
||||
|
||||
@ -69,34 +75,39 @@ class Adagrad : public OptimizerBase {
|
||||
class Adam : public OptimizerBase {
|
||||
public:
|
||||
Adam(float eta=0.001, float beta1=0.9, float beta2=0.999, float eps=1e-8)
|
||||
: eta_(eta), beta1_(beta1), beta2_(beta2), eps_(eps), t_(0) {}
|
||||
: eta_(eta), beta1_(beta1), beta2_(beta2), eps_(eps), t_(0),
|
||||
mtAlloc_(newTensorAllocator<DeviceGPU>()),
|
||||
vtAlloc_(newTensorAllocator<DeviceGPU>())
|
||||
{}
|
||||
|
||||
void update(ExpressionGraphPtr graph, data::BatchPtr batch) {
|
||||
graph->backprop(batch);
|
||||
|
||||
if(mt_.size() < graph->params().size()) {
|
||||
for(auto& param : graph->params()) {
|
||||
mt_.emplace_back(Tensor(param->grad().shape(), 0));
|
||||
vt_.emplace_back(Tensor(param->grad().shape(), 0));
|
||||
}
|
||||
if(!mt_) {
|
||||
int totalSize = graph->params().totalSize();
|
||||
mtAlloc_->reserveExact(totalSize);
|
||||
mtAlloc_->allocate(mt_, {1, totalSize});
|
||||
mt_->set(0);
|
||||
|
||||
vtAlloc_->reserveExact(totalSize);
|
||||
vtAlloc_->allocate(vt_, {1, totalSize});
|
||||
vt_->set(0);
|
||||
}
|
||||
|
||||
t_++;
|
||||
float denom1 = 1 - pow(beta1_, t_);
|
||||
float denom2 = 1 - pow(beta2_, t_);
|
||||
|
||||
auto mtIt = mt_.begin();
|
||||
auto vtIt = vt_.begin();
|
||||
Tensor pVals = graph->params().vals();
|
||||
Tensor pGrads = graph->params().grads();
|
||||
|
||||
for(auto& param : graph->params()) {
|
||||
Element(_1 = (beta1_ * _1) + ((1 - beta1_) * _2),
|
||||
*mtIt, param->grad());
|
||||
Element(_1 = (beta2_ * _1) + ((1 - beta2_) * (_2 * _2)),
|
||||
*vtIt, param->grad());
|
||||
Element(_1 -= eta_ * (_2 / denom1) / (Sqrt(_3 / denom2) + eps_),
|
||||
param->val(), *mtIt, *vtIt);
|
||||
mtIt++; vtIt++;
|
||||
}
|
||||
ElementVec(_1 = (beta1_ * _1) + ((1 - beta1_) * _2),
|
||||
mt_, pGrads);
|
||||
ElementVec(_1 = (beta2_ * _1) + ((1 - beta2_) * (_2 * _2)),
|
||||
vt_, pGrads);
|
||||
|
||||
ElementVec(_1 -= eta_ * (_2 / denom1) / (Sqrt(_3 / denom2) + eps_),
|
||||
pVals, mt_, vt_);
|
||||
}
|
||||
|
||||
private:
|
||||
@ -105,8 +116,11 @@ class Adam : public OptimizerBase {
|
||||
float beta2_;
|
||||
float eps_;
|
||||
size_t t_;
|
||||
std::vector<Tensor> mt_;
|
||||
std::vector<Tensor> vt_;
|
||||
|
||||
TensorAllocator mtAlloc_;
|
||||
Tensor mt_;
|
||||
TensorAllocator vtAlloc_;
|
||||
Tensor vt_;
|
||||
};
|
||||
|
||||
template <class Algorithm, typename ...Args>
|
||||
|
@ -27,7 +27,7 @@
|
||||
#include <functional>
|
||||
#include <stdint.h>
|
||||
|
||||
#include "tensor.h"
|
||||
#include "tensors/tensor.h"
|
||||
|
||||
namespace marian {
|
||||
|
||||
@ -47,58 +47,58 @@ float xor128() {
|
||||
// Use a constant seed for deterministic behaviour.
|
||||
std::default_random_engine engine(42);
|
||||
|
||||
void zeros(Tensor t) {
|
||||
t.set(0.f);
|
||||
void zeros(Tensor& t) {
|
||||
t->set(0.f);
|
||||
}
|
||||
|
||||
void ones(Tensor t) {
|
||||
t.set(1.0f);
|
||||
void ones(Tensor& t) {
|
||||
t->set(1.0f);
|
||||
}
|
||||
|
||||
template <class Distribution>
|
||||
void distribution(Tensor t, float a, float b) {
|
||||
void distribution(Tensor& t, float a, float b) {
|
||||
//std::random_device device;
|
||||
//std::default_random_engine engine(device());
|
||||
Distribution dist(a, b);
|
||||
auto gen = std::bind(dist, engine);
|
||||
|
||||
std::vector<float> vals(t.size());
|
||||
std::vector<float> vals(t->size());
|
||||
std::generate(begin(vals), end(vals), gen);
|
||||
|
||||
t << vals;
|
||||
}
|
||||
|
||||
std::function<void(Tensor)> normal(float mean = 0.0, float std = 0.05) {
|
||||
return [mean, std](Tensor t) {
|
||||
std::function<void(Tensor&)> normal(float mean = 0.0, float std = 0.05) {
|
||||
return [mean, std](Tensor& t) {
|
||||
distribution<std::normal_distribution<float>>(t, mean, std);
|
||||
};
|
||||
}
|
||||
|
||||
std::function<void(Tensor)> uniform(float a = -0.05, float b = 0.05) {
|
||||
return [a, b](Tensor t) {
|
||||
std::function<void(Tensor&)> uniform(float a = -0.05, float b = 0.05) {
|
||||
return [a, b](Tensor& t) {
|
||||
distribution<std::uniform_real_distribution<float>>(t, a, b);
|
||||
};
|
||||
}
|
||||
|
||||
void glorot_uniform(Tensor t) {
|
||||
float b = sqrtf( 6.0f / (t.shape()[0] + t.shape()[1]) );
|
||||
void glorot_uniform(Tensor& t) {
|
||||
float b = sqrtf( 6.0f / (t->shape()[0] + t->shape()[1]) );
|
||||
distribution<std::uniform_real_distribution<float>>(t, -b, b);
|
||||
}
|
||||
|
||||
void xorshift(Tensor t) {
|
||||
std::vector<float> vals(t.size());
|
||||
void xorshift(Tensor& t) {
|
||||
std::vector<float> vals(t->size());
|
||||
for(auto&& v : vals)
|
||||
v = xor128();
|
||||
t << vals;
|
||||
}
|
||||
|
||||
void glorot_normal(Tensor t) {
|
||||
float b = sqrtf( 2.0f / (t.shape()[0] + t.shape()[1]) );
|
||||
// Glorot (Xavier) normal initialisation: fills t with samples drawn from a
// zero-mean normal distribution whose standard deviation is
// sqrt(2 / (shape[0] + shape[1])).
// The previous body sampled from a *uniform* distribution over [-b, b],
// which is not what the name promises and yields a variance three times
// smaller than intended.
void glorot_normal(Tensor& t) {
  float stddev = sqrtf( 2.0f / (t->shape()[0] + t->shape()[1]) );
  distribution<std::normal_distribution<float>>(t, 0.0f, stddev);
}
|
||||
|
||||
std::function<void(Tensor)> from_vector(const std::vector<float>& v) {
|
||||
return [v](Tensor t) {
|
||||
std::function<void(Tensor&)> from_vector(const std::vector<float>& v) {
|
||||
return [v](Tensor& t) {
|
||||
t << v;
|
||||
};
|
||||
}
|
||||
|
136
src/shape.h
Normal file
136
src/shape.h
Normal file
@ -0,0 +1,136 @@
|
||||
#pragma once
|
||||
|
||||
// This file is part of the Marian toolkit.
|
||||
// Marian is copyright (c) 2016 Marcin Junczys-Dowmunt.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in
|
||||
// all copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
#include "exception.h"
|
||||
|
||||
namespace marian {
|
||||
|
||||
/**
|
||||
* @brief Represents the size of each dimension in a tensor.
|
||||
*
|
||||
* Note: this class is currently hard-coded to SHAPE_SIZE (two) dimensions.
|
||||
*/
|
||||
|
||||
/** @brief Number of dimensions every Shape carries. */
const size_t SHAPE_SIZE = 2;

/**
 * @brief Holds the size of each of the SHAPE_SIZE dimensions of a tensor.
 */
struct Shape {
  int shape_[SHAPE_SIZE];

  /**
   * @brief Constructs a default shape.
   *
   * The default shape has SHAPE_SIZE dimensions, each of size 1.
   */
  Shape() {
    // Loop instead of a literal {1, 1} so the default stays correct if
    // SHAPE_SIZE is ever changed.
    for(size_t i = 0; i < SHAPE_SIZE; ++i)
      shape_[i] = 1;
  }

  /**
   * @brief Constructs a shape from a list of dimension sizes.
   *
   * Dimensions not covered by the list keep the default size 1. Entries
   * beyond SHAPE_SIZE are ignored; the previous unchecked std::copy wrote
   * past the end of the array in that case.
   *
   * @param il A list of integers representing the size of each dimension.
   */
  Shape(std::initializer_list<int> il) : Shape() {
    size_t i = 0;
    for(auto it = il.begin(); it != il.end() && i < SHAPE_SIZE; ++it, ++i)
      shape_[i] = *it;
  }

  /** @brief Copy-constructs a shape with the same dimension sizes as @p shape. */
  Shape(const Shape& shape) {
    std::copy(shape.begin(), shape.end(), begin());
  }

  /**
   * @brief Gets a reference to the int representing the size of the <code>i</code>th dimension represented by this object.
   *
   * No bounds checking is performed on <code>i</code>.
   *
   * @return a reference to the int representing the size of the <code>i</code>th dimension represented by this object
   */
  __host__ __device__
  int& operator[](int i) {
    return shape_[i];
  }

  /**
   * @brief Gets the size of the <code>i</code>th dimension represented by this object.
   *
   * No bounds checking is performed on <code>i</code>.
   *
   * @return the size of the <code>i</code>th dimension represented by this object
   */
  __host__ __device__
  const int& operator[](int i) const {
    return shape_[i];
  }

  /**
   * @brief Gets the number of dimensions represented by this object
   *
   * @return the number of dimensions represented by this object (always SHAPE_SIZE)
   */
  size_t size() const {
    return SHAPE_SIZE;
  }

  /**
   * @brief Gets the total number of elements in a tensor of this shape.
   *
   * For example, if this shape represents a 5x100 tensor, this method would return 500.
   *
   * @return the total number of elements in a tensor of this shape
   */
  size_t elements() const {
    size_t total = 1;
    // size_t index: avoids the signed/unsigned comparison of the old int loop.
    for(size_t i = 0; i < SHAPE_SIZE; ++i)
      total *= shape_[i];
    return total;
  }

  /** @brief Gets a pointer to the int that holds the size of the first dimension */
  int* begin() { return shape_; }

  /** @brief Gets a pointer one past the int that holds the size of the last dimension */
  int* end() { return shape_ + SHAPE_SIZE; }

  /** @brief Gets a const pointer to the int that holds the size of the first dimension */
  const int* begin() const { return shape_; }

  /** @brief Gets a const pointer one past the int that holds the size of the last dimension */
  const int* end() const { return shape_ + SHAPE_SIZE; }

  /**
   * @brief Tests this object for equality against another <code>Shape</code> object.
   *
   * @return <code>true</code> if the size of each dimension in this object
   *         is equal to the size of the corresponding dimension in the other object,
   *         <code>false</code> otherwise
   */
  bool operator==(const Shape& other) const {
    return std::equal(begin(), end(), other.begin());
  }

  /**
   * @brief Tests this object for inequality against another <code>Shape</code> object.
   */
  bool operator!=(const Shape& other) const {
    return !(*this == other);
  }
};
|
||||
}
|
@ -2,10 +2,13 @@
|
||||
#include <stdio.h>
|
||||
#include <time.h>
|
||||
#include <cudnn.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include <boost/timer/timer.hpp>
|
||||
|
||||
#include "tensor.h"
|
||||
#include "tensors/tensor_allocator.h"
|
||||
#include "tensors/tensor_gpu.h"
|
||||
|
||||
#include "tensor_operators.h"
|
||||
#include "param_initializers.h"
|
||||
|
||||
@ -15,8 +18,12 @@ template <class F>
|
||||
void testForward(F f, size_t l,
|
||||
const Shape& shape,
|
||||
const std::string& desc) {
|
||||
Tensor in(shape);
|
||||
Tensor out(shape);
|
||||
|
||||
auto ta = newTensorAllocator<DeviceGPU>();
|
||||
|
||||
Tensor in, out;
|
||||
ta->allocate(in, shape);
|
||||
ta->allocate(out, shape);
|
||||
|
||||
uniform(-5, 5)(in);
|
||||
|
||||
@ -34,9 +41,14 @@ template <class F>
|
||||
void testBackward(F f, size_t l,
|
||||
const Shape& shape,
|
||||
const std::string& desc) {
|
||||
Tensor in(shape);
|
||||
Tensor adj(shape, 1);
|
||||
Tensor grad(shape);
|
||||
|
||||
auto ta = newTensorAllocator<DeviceGPU>();
|
||||
|
||||
Tensor in, adj, grad;
|
||||
ta->allocate(in, shape);
|
||||
ta->allocate(adj, shape);
|
||||
adj->set(1);
|
||||
ta->allocate(grad, shape);
|
||||
|
||||
uniform(-5, 5)(in);
|
||||
|
||||
|
@ -39,63 +39,73 @@ static cudnnHandle_t create_handle_dnn() {
|
||||
cublasHandle_t cublasHandle = create_handle();
|
||||
cudnnHandle_t cudnnHandle = create_handle_dnn();
|
||||
|
||||
void CudnnSoftmax(Tensor out, Tensor in) {
|
||||
void CudnnSoftmax(Tensor& out, Tensor& in) {
|
||||
float alpha = 1, beta = 0;
|
||||
auto inGpu = static_cast<TensorGPU*>(in.get());
|
||||
auto outGpu = static_cast<TensorGPU*>(out.get());
|
||||
cudnnSoftmaxForward(cudnnHandle,
|
||||
CUDNN_SOFTMAX_ACCURATE,
|
||||
CUDNN_SOFTMAX_MODE_CHANNEL,
|
||||
&alpha,
|
||||
in.cudnn(),
|
||||
in.data(),
|
||||
inGpu->cudnn(),
|
||||
inGpu->data(),
|
||||
&beta,
|
||||
out.cudnn(),
|
||||
out.data());
|
||||
outGpu->cudnn(),
|
||||
outGpu->data());
|
||||
cudaDeviceSynchronize();
|
||||
}
|
||||
|
||||
void CudnnLogSoftmax(Tensor out, Tensor in) {
|
||||
void CudnnLogSoftmax(Tensor& out, Tensor& in) {
|
||||
float alpha = 1, beta = 0;
|
||||
auto inGpu = static_cast<TensorGPU*>(in.get());
|
||||
auto outGpu = static_cast<TensorGPU*>(out.get());
|
||||
cudnnSoftmaxForward(cudnnHandle,
|
||||
CUDNN_SOFTMAX_LOG,
|
||||
CUDNN_SOFTMAX_MODE_CHANNEL,
|
||||
&alpha,
|
||||
in.cudnn(),
|
||||
in.data(),
|
||||
inGpu->cudnn(),
|
||||
inGpu->data(),
|
||||
&beta,
|
||||
out.cudnn(),
|
||||
out.data());
|
||||
outGpu->cudnn(),
|
||||
outGpu->data());
|
||||
cudaDeviceSynchronize();
|
||||
}
|
||||
|
||||
void CudnnSoftmaxGrad(Tensor grad, Tensor adj, Tensor val) {
|
||||
void CudnnSoftmaxGrad(Tensor& grad, Tensor& adj, Tensor& val) {
|
||||
float alpha = 1, beta = 0;
|
||||
auto valGpu = static_cast<TensorGPU*>(val.get());
|
||||
auto adjGpu = static_cast<TensorGPU*>(adj.get());
|
||||
auto gradGpu = static_cast<TensorGPU*>(grad.get());
|
||||
cudnnSoftmaxBackward(cudnnHandle,
|
||||
CUDNN_SOFTMAX_ACCURATE,
|
||||
CUDNN_SOFTMAX_MODE_CHANNEL,
|
||||
&alpha,
|
||||
val.cudnn(),
|
||||
val.data(),
|
||||
adj.cudnn(),
|
||||
adj.data(),
|
||||
valGpu->cudnn(),
|
||||
valGpu->data(),
|
||||
adjGpu->cudnn(),
|
||||
adjGpu->data(),
|
||||
&beta,
|
||||
grad.cudnn(),
|
||||
grad.data());
|
||||
gradGpu->cudnn(),
|
||||
gradGpu->data());
|
||||
cudaDeviceSynchronize();
|
||||
}
|
||||
|
||||
void CudnnLogSoftmaxGrad(Tensor grad, Tensor adj, Tensor val) {
|
||||
void CudnnLogSoftmaxGrad(Tensor& grad, Tensor& adj, Tensor& val) {
|
||||
float alpha = 1, beta = 0;
|
||||
auto valGpu = static_cast<TensorGPU*>(val.get());
|
||||
auto adjGpu = static_cast<TensorGPU*>(adj.get());
|
||||
auto gradGpu = static_cast<TensorGPU*>(grad.get());
|
||||
cudnnSoftmaxBackward(cudnnHandle,
|
||||
CUDNN_SOFTMAX_LOG,
|
||||
CUDNN_SOFTMAX_MODE_CHANNEL,
|
||||
&alpha,
|
||||
val.cudnn(),
|
||||
val.data(),
|
||||
adj.cudnn(),
|
||||
adj.data(),
|
||||
valGpu->cudnn(),
|
||||
valGpu->data(),
|
||||
adjGpu->cudnn(),
|
||||
adjGpu->data(),
|
||||
&beta,
|
||||
grad.cudnn(),
|
||||
grad.data());
|
||||
gradGpu->cudnn(),
|
||||
gradGpu->data());
|
||||
cudaDeviceSynchronize();
|
||||
}
|
||||
|
||||
@ -137,18 +147,18 @@ __global__ void gSubtractMax(float* out, const float* in,
|
||||
}
|
||||
}
|
||||
|
||||
void SubtractMax(Tensor out, Tensor in) {
|
||||
void SubtractMax(Tensor& out, Tensor& in) {
|
||||
// Out is a m-by-k matrix, passed as input.
|
||||
// The max element of each row of Out is computed and subtracted from Out.
|
||||
// Out is both input and output.
|
||||
size_t m = out.shape()[0];
|
||||
size_t k = out.shape()[1];
|
||||
size_t m = out->shape()[0];
|
||||
size_t k = out->shape()[1];
|
||||
|
||||
int blocks = std::min(MAX_BLOCKS, (int) m);
|
||||
int threads = std::min(MAX_THREADS, (int) k);
|
||||
int shared = sizeof(float) * threads * 2;
|
||||
gSubtractMax<<<blocks, threads, shared>>>(out.data(),
|
||||
in.data(), m, k);
|
||||
gSubtractMax<<<blocks, threads, shared>>>(out->data(),
|
||||
in->data(), m, k);
|
||||
cudaStreamSynchronize(0);
|
||||
}
|
||||
|
||||
@ -186,18 +196,18 @@ __global__ void gSoftMax(float* softMaxP, size_t rows, size_t cols) {
|
||||
}
|
||||
}
|
||||
|
||||
void Softmax(Tensor out, Tensor in) {
|
||||
size_t m = out.shape()[0];
|
||||
size_t k = out.shape()[1];
|
||||
void Softmax(Tensor& out, Tensor& in) {
|
||||
size_t m = out->shape()[0];
|
||||
size_t k = out->shape()[1];
|
||||
|
||||
int blocks = std::min(MAX_BLOCKS, (int) m);
|
||||
int threads = std::min(MAX_THREADS, (int) k);
|
||||
int shared = sizeof(float) * threads * 2;
|
||||
// Subtract the max rowwise for numerical stability (safe softmax).
|
||||
gSubtractMax<<<blocks, threads, shared>>>(out.data(),
|
||||
in.data(), m, k);
|
||||
gSubtractMax<<<blocks, threads, shared>>>(out->data(),
|
||||
in->data(), m, k);
|
||||
cudaStreamSynchronize(0);
|
||||
gSoftMax<<<blocks, threads, shared>>>(out.data(), m, k);
|
||||
gSoftMax<<<blocks, threads, shared>>>(out->data(), m, k);
|
||||
cudaStreamSynchronize(0);
|
||||
}
|
||||
|
||||
@ -240,18 +250,18 @@ __global__ void gSoftmaxGrad(float* grad, const float* adj, const float* val,
|
||||
}
|
||||
}
|
||||
|
||||
void SoftmaxGrad(Tensor grad, Tensor adj, Tensor val) {
|
||||
void SoftmaxGrad(Tensor& grad, Tensor& adj, Tensor& val) {
|
||||
// grad and val are both m-by-k matrices, passed as input.
|
||||
// A weighted average of each row of grad (according to the weights
|
||||
// specified in val) is computed and subtracted from Out.
|
||||
// adj is multiplied for each element to get backward step in autodiff
|
||||
int m = grad.shape()[0];
|
||||
int k = grad.shape()[1];
|
||||
int m = grad->shape()[0];
|
||||
int k = grad->shape()[1];
|
||||
|
||||
int blocks = std::min(MAX_BLOCKS, m);
|
||||
int threads = std::min(MAX_THREADS, k);
|
||||
int shared = sizeof(float) * threads * 2;
|
||||
gSoftmaxGrad<<<blocks, threads, shared>>>(grad.data(), adj.data(), val.data(),
|
||||
gSoftmaxGrad<<<blocks, threads, shared>>>(grad->data(), adj->data(), val->data(),
|
||||
m, k);
|
||||
cudaStreamSynchronize(0);
|
||||
}
|
||||
@ -293,19 +303,19 @@ __global__ void gLogSoftmaxGrad(float* grad, const float* adj, const float* val,
|
||||
}
|
||||
}
|
||||
|
||||
void LogSoftmaxGrad(Tensor grad, Tensor adj, Tensor val) {
|
||||
void LogSoftmaxGrad(Tensor& grad, Tensor& adj, Tensor& val) {
|
||||
// grad and val are both m-by-k matrices, passed as input.
|
||||
// A weighted average of each row of grad (according to the weights
|
||||
// specified in val) is computed and subtracted from Out.
|
||||
// adj is multiplied for each element to get backward step in autodiff
|
||||
int m = grad.shape()[0];
|
||||
int k = grad.shape()[1];
|
||||
int m = grad->shape()[0];
|
||||
int k = grad->shape()[1];
|
||||
|
||||
int blocks = std::min(MAX_BLOCKS, m);
|
||||
int threads = std::min(MAX_THREADS, k);
|
||||
int shared = sizeof(float) * threads * 2;
|
||||
gLogSoftmaxGrad<<<blocks, threads, shared>>>(grad.data(),
|
||||
adj.data(), val.data(),
|
||||
gLogSoftmaxGrad<<<blocks, threads, shared>>>(grad->data(),
|
||||
adj->data(), val->data(),
|
||||
m, k);
|
||||
cudaStreamSynchronize(0);
|
||||
}
|
||||
@ -327,80 +337,154 @@ __global__ void gArgmax(float *out, const float *data, size_t rows, size_t cols)
|
||||
out[row] = maxInd;
|
||||
}
|
||||
|
||||
void Argmax(Tensor* Out, const Tensor* In) {
|
||||
size_t m = In->shape()[0];
|
||||
size_t k = In->shape()[1];
|
||||
|
||||
int blocks = m; //std::min(MAX_BLOCKS, (int) m);
|
||||
int threads = k; //std::min(MAX_THREADS, (int) k);
|
||||
//int shared = sizeof(float) * threads * 2;
|
||||
gArgmax<<<blocks, threads>>>(Out->data(), In->data(), m, k);
|
||||
cudaStreamSynchronize(0);
|
||||
}
|
||||
//void Argmax(Tensor* Out, const Tensor* In) {
|
||||
// size_t m = In->shape()[0];
|
||||
// size_t k = In->shape()[1];
|
||||
//
|
||||
// int blocks = m; //std::min(MAX_BLOCKS, (int) m);
|
||||
// int threads = k; //std::min(MAX_THREADS, (int) k);
|
||||
// //int shared = sizeof(float) * threads * 2;
|
||||
// gArgmax<<<blocks, threads>>>(Out->data(), In->data(), m, k);
|
||||
// cudaStreamSynchronize(0);
|
||||
//}
|
||||
|
||||
///////////////////////////////////////////////////////
|
||||
|
||||
Tensor Prod(cublasHandle_t handle, Tensor C, const Tensor A, const Tensor B,
|
||||
void Prod(cublasHandle_t handle, Tensor& C, const Tensor& A, const Tensor& B,
|
||||
bool transA, bool transB, Float beta) {
|
||||
Float alpha = 1.0;
|
||||
|
||||
size_t m = A.shape()[0];
|
||||
size_t k = A.shape()[1];
|
||||
size_t m = A->shape()[0];
|
||||
size_t k = A->shape()[1];
|
||||
if(transA)
|
||||
std::swap(m, k);
|
||||
|
||||
size_t l = B.shape()[0];
|
||||
size_t n = B.shape()[1];
|
||||
size_t l = B->shape()[0];
|
||||
size_t n = B->shape()[1];
|
||||
if(transB)
|
||||
std::swap(l, n);
|
||||
|
||||
size_t lda = A.shape()[1];
|
||||
size_t ldb = B.shape()[1];
|
||||
size_t ldc = B.shape()[1];
|
||||
size_t lda = A->shape()[1];
|
||||
size_t ldb = B->shape()[1];
|
||||
size_t ldc = B->shape()[1];
|
||||
|
||||
if(transB)
|
||||
ldc = B.shape()[0];
|
||||
ldc = B->shape()[0];
|
||||
|
||||
cublasOperation_t opA = transA ? CUBLAS_OP_T : CUBLAS_OP_N;
|
||||
cublasOperation_t opB = transB ? CUBLAS_OP_T : CUBLAS_OP_N;
|
||||
|
||||
cublasSgemm(handle, opB, opA,
|
||||
n, m, k, &alpha, B.data(), ldb, A.data(), lda, &beta, C.data(), ldc);
|
||||
return C;
|
||||
n, m, k, &alpha, B->data(), ldb, A->data(), lda, &beta, C->data(), ldc);
|
||||
}
|
||||
|
||||
Tensor Prod(Tensor C, const Tensor A, const Tensor B,
|
||||
void Prod(Tensor& C, const Tensor& A, const Tensor& B,
|
||||
bool transA, bool transB, Float beta) {
|
||||
|
||||
Tensor temp = Prod(cublasHandle, C, A, B, transA, transB, beta);
|
||||
return temp;
|
||||
Prod(cublasHandle, C, A, B, transA, transB, beta);
|
||||
}
|
||||
|
||||
Tensor SumRowwise(cublasHandle_t handle, const Tensor A, Tensor result) {
|
||||
size_t rows = A.shape()[0];
|
||||
size_t cols = A.shape()[1];
|
||||
thrust::device_vector<float> d_ones(cols, 1.f);
|
||||
Float alpha = 1.f;
|
||||
Float beta = 0.f;
|
||||
cublasSgemv(handle, CUBLAS_OP_T, cols, rows, &alpha,
|
||||
A.data(), cols,
|
||||
thrust::raw_pointer_cast(d_ones.data()), 1, &beta,
|
||||
result.data(), 1);
|
||||
return result;
|
||||
// Reduces the 2-D tensor `in` into `out` by multiplying it with vectors of
// constant weights through Prod().
//
//   axis == 0 : out = w(1 x rows) * in            (collapses the rows)
//   axis == 1 : out = in * w(cols x 1)            (collapses the columns)
//   otherwise : out = (w(1 x rows) * in) * w(cols x 1)   (collapses both)
//
// When `mean` is true the weights are 1/rows resp. 1/cols instead of 1, so
// the result is an average rather than a sum.
// NOTE(review): `out` is presumably already allocated with the matching
// result shape -- nothing here resizes it; confirm against callers.
void Sum(Tensor& out, const Tensor& in, int axis, bool mean) {
  int numRows = in->shape()[0];
  int numCols = in->shape()[1];

  // A weight of 1 gives a plain sum; 1/n gives the mean over that dimension.
  float rowWeight = mean ? 1.f / numRows : 1.f;
  float colWeight = mean ? 1.f / numCols : 1.f;

  if(axis == 0) {
    thrust::device_vector<float> weights(numRows, rowWeight);
    Tensor left(new TensorGPU(thrust::raw_pointer_cast(weights.data()),
                              {1, numRows}));
    Prod(out, left, in, false, false);
    return;
  }

  if(axis == 1) {
    thrust::device_vector<float> weights(numCols, colWeight);
    Tensor right(new TensorGPU(thrust::raw_pointer_cast(weights.data()),
                               {numCols, 1}));
    Prod(out, in, right, false, false);
    return;
  }

  // Any other axis value: reduce over rows first, then over columns.
  thrust::device_vector<float> rowWeights(numRows, rowWeight);
  Tensor left(new TensorGPU(thrust::raw_pointer_cast(rowWeights.data()),
                            {1, numRows}));
  thrust::device_vector<float> colWeights(numCols, colWeight);
  Tensor right(new TensorGPU(thrust::raw_pointer_cast(colWeights.data()),
                             {numCols, 1}));

  // Scratch buffer holding the (1 x cols) row-reduced intermediate.
  thrust::device_vector<float> scratch(numCols, 0.f);
  Tensor rowReduced(new TensorGPU(thrust::raw_pointer_cast(scratch.data()),
                                  {1, numCols}));

  Prod(rowReduced, left, in, false, false);
  Prod(out, rowReduced, right, false, false);
}
|
||||
|
||||
Tensor SumRowwise(const Tensor A, Tensor result) {
|
||||
Tensor temp = SumRowwise(cublasHandle, A, result);
|
||||
return temp;
|
||||
// Counterpart of Sum(): expands `in` to the shape of `out` by multiplying it
// with vectors of constant weights through Prod(). The dimensions are read
// from `out`.
//
//   axis == 0 : out = w(rows x 1) * in
//   axis == 1 : out = in * w(1 x cols)
//   otherwise : out = (w(rows x 1) * in) * w(1 x cols)
//
// When `mean` is true the weights are 1/rows resp. 1/cols instead of 1.
// NOTE(review): `in` is presumably (1 x cols), (rows x 1) or (1 x 1) for the
// three cases respectively -- confirm against callers.
void SumBackward(Tensor& out, const Tensor& in, int axis, bool mean) {
  int numRows = out->shape()[0];
  int numCols = out->shape()[1];

  // A weight of 1 replicates the value; 1/n additionally applies the mean.
  float rowWeight = mean ? 1.f / numRows : 1.f;
  float colWeight = mean ? 1.f / numCols : 1.f;

  if(axis == 0) {
    thrust::device_vector<float> weights(numRows, rowWeight);
    Tensor left(new TensorGPU(thrust::raw_pointer_cast(weights.data()),
                              {numRows, 1}));
    Prod(out, left, in, false, false);
    return;
  }

  if(axis == 1) {
    thrust::device_vector<float> weights(numCols, colWeight);
    Tensor right(new TensorGPU(thrust::raw_pointer_cast(weights.data()),
                               {1, numCols}));
    Prod(out, in, right, false, false);
    return;
  }

  // Any other axis value: expand along rows first, then along columns.
  thrust::device_vector<float> rowWeights(numRows, rowWeight);
  Tensor left(new TensorGPU(thrust::raw_pointer_cast(rowWeights.data()),
                            {numRows, 1}));
  thrust::device_vector<float> colWeights(numCols, colWeight);
  Tensor right(new TensorGPU(thrust::raw_pointer_cast(colWeights.data()),
                             {1, numCols}));

  // Scratch buffer holding the (rows x 1) intermediate.
  thrust::device_vector<float> scratch(numRows, 0.f);
  Tensor rowExpanded(new TensorGPU(thrust::raw_pointer_cast(scratch.data()),
                                   {numRows, 1}));

  Prod(rowExpanded, left, in, false, false);
  Prod(out, rowExpanded, right, false, false);
}
|
||||
|
||||
void CudnnDropoutPrepare(Tensor in, float p,
|
||||
|
||||
void CudnnDropoutPrepare(Tensor& in, float p,
|
||||
cudnnDropoutDescriptor_t* dropDesc,
|
||||
void** space, size_t* spaceSize,
|
||||
void** states, size_t seed) {
|
||||
size_t statesSize;
|
||||
cudnnDropoutGetStatesSize(cudnnHandle, &statesSize);
|
||||
cudnnDropoutGetReserveSpaceSize(in.cudnn(), spaceSize);
|
||||
auto inGpu = static_cast<TensorGPU*>(in.get());
|
||||
cudnnDropoutGetReserveSpaceSize(inGpu->cudnn(), spaceSize);
|
||||
|
||||
cudaMalloc((void**)states, statesSize);
|
||||
cudaMalloc((void**)space, *spaceSize);
|
||||
@ -423,26 +507,30 @@ void CudnnDropoutDestroy(cudnnDropoutDescriptor_t dropDesc,
|
||||
|
||||
void CudnnDropoutForward(cudnnDropoutDescriptor_t dropoutDesc,
|
||||
void* space, size_t spaceSize,
|
||||
Tensor out, Tensor in) {
|
||||
Tensor& out, Tensor& in) {
|
||||
auto inGpu = static_cast<TensorGPU*>(in.get());
|
||||
auto outGpu = static_cast<TensorGPU*>(out.get());
|
||||
cudnnDropoutForward(cudnnHandle,
|
||||
dropoutDesc,
|
||||
in.cudnn(),
|
||||
in.data(),
|
||||
out.cudnn(),
|
||||
out.data(),
|
||||
inGpu->cudnn(),
|
||||
inGpu->data(),
|
||||
outGpu->cudnn(),
|
||||
outGpu->data(),
|
||||
space,
|
||||
spaceSize);
|
||||
}
|
||||
|
||||
void CudnnDropoutBackward(cudnnDropoutDescriptor_t dropoutDesc,
|
||||
void* space, size_t spaceSize,
|
||||
Tensor out, Tensor in) {
|
||||
Tensor& out, Tensor& in) {
|
||||
auto inGpu = static_cast<TensorGPU*>(in.get());
|
||||
auto outGpu = static_cast<TensorGPU*>(out.get());
|
||||
cudnnDropoutBackward(cudnnHandle,
|
||||
dropoutDesc,
|
||||
in.cudnn(),
|
||||
in.data(),
|
||||
out.cudnn(),
|
||||
out.data(),
|
||||
inGpu->cudnn(),
|
||||
inGpu->data(),
|
||||
outGpu->cudnn(),
|
||||
outGpu->data(),
|
||||
space,
|
||||
spaceSize);
|
||||
}
|
||||
|
@ -21,7 +21,10 @@
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
#include "tensor.h"
|
||||
#include <cublas_v2.h>
|
||||
#include <thrust/functional.h>
|
||||
|
||||
#include "tensors/tensor_gpu.h"
|
||||
|
||||
namespace marian {
|
||||
|
||||
@ -29,11 +32,79 @@ using namespace thrust::placeholders;
|
||||
#define MAX_THREADS 512
|
||||
#define MAX_BLOCKS 65535
|
||||
|
||||
// Element-wise kernel over two flat float buffers of `length` elements:
//   out[i] = functor(out[i], in[i])
// Launch with a 1-D grid of 1-D blocks. Any grid size is valid because each
// thread starts at its global index and advances by the total thread count.
template <class Functor>
__global__ void gElementVec(Functor functor,
                            float* out, const float* in,
                            int length) {
  int stride = blockDim.x * gridDim.x;
  for(int index = blockDim.x * blockIdx.x + threadIdx.x;
      index < length;
      index += stride) {
    out[index] = functor(out[index], in[index]);
  }
}
|
||||
|
||||
// Applies `functor` element-wise over the flat storage of two tensors:
//   out[i] = functor(out[i], in[i])   for i in [0, rows * cols)
// The element count is taken from `out`'s first two dimensions; `in` is read
// at the same flat indices, so it must hold at least as many elements.
// Blocks until the kernel has finished (synchronizes stream 0).
template <class Functor, class T1, class T2>
void ElementVec(Functor functor,
                T1 out, T2 in) {

  int rows = out->shape()[0];
  int cols = out->shape()[1];

  int length = rows * cols;

  // Nothing to do for an empty tensor. Without this guard `threads` would be
  // 0, making `length / threads` below a host-side division by zero (and a
  // zero-thread launch is invalid anyway).
  if(length <= 0)
    return;

  float* d_out = out->data();
  float* d_in = in->data();

  int threads = std::min(MAX_THREADS, length);
  // Ceiling division, capped at MAX_BLOCKS; the kernel strides over the rest.
  int blocks = std::min(MAX_BLOCKS, length / threads + (length % threads != 0));

  gElementVec<<<blocks, threads>>>(functor, d_out, d_in, length);
  cudaStreamSynchronize(0);
}
|
||||
|
||||
// Element-wise kernel over three flat float buffers of `length` elements:
//   out[i] = functor(out[i], in1[i], in2[i])
// Launch with a 1-D grid of 1-D blocks. Any grid size is valid because each
// thread starts at its global index and advances by the total thread count.
template <class Functor>
__global__ void gElementVec(Functor functor,
                            float* out,
                            const float* in1,
                            const float* in2,
                            int length) {
  int stride = blockDim.x * gridDim.x;
  for(int index = blockDim.x * blockIdx.x + threadIdx.x;
      index < length;
      index += stride) {
    out[index] = functor(out[index], in1[index], in2[index]);
  }
}
|
||||
|
||||
// Applies `functor` element-wise over the flat storage of three tensors:
//   out[i] = functor(out[i], in1[i], in2[i])   for i in [0, rows * cols)
// The element count is taken from `out`'s first two dimensions; `in1` and
// `in2` are read at the same flat indices, so each must hold at least as many
// elements. Blocks until the kernel has finished (synchronizes stream 0).
template <class Functor, class T1, class T2, class T3>
void ElementVec(Functor functor,
                T1 out, T2 in1, T3 in2) {

  int rows = out->shape()[0];
  int cols = out->shape()[1];

  int length = rows * cols;

  // Nothing to do for an empty tensor. Without this guard `threads` would be
  // 0, making `length / threads` below a host-side division by zero (and a
  // zero-thread launch is invalid anyway).
  if(length <= 0)
    return;

  float* d_out = out->data();
  float* d_in1 = in1->data();
  float* d_in2 = in2->data();

  int threads = std::min(MAX_THREADS, (int)length);
  // Ceiling division, capped at MAX_BLOCKS; the kernel strides over the rest.
  int blocks = std::min(MAX_BLOCKS, length / threads + (length % threads != 0));

  gElementVec<<<blocks, threads>>>(functor, d_out, d_in1, d_in2, length);
  cudaStreamSynchronize(0);
}
|
||||
|
||||
template <class Functor, class T>
|
||||
__global__ void gElement(Functor functor,
|
||||
T out) {
|
||||
int rows = out.rows();
|
||||
int cols = out.cols();
|
||||
int rows = out.shape()[0];
|
||||
int cols = out.shape()[1];
|
||||
|
||||
for(int bid = 0; bid < rows; bid += gridDim.x) {
|
||||
int i = bid + blockIdx.x;
|
||||
if(i < rows) {
|
||||
@ -47,14 +118,17 @@ __global__ void gElement(Functor functor,
|
||||
}
|
||||
|
||||
template <class Functor, class T>
|
||||
void Element(Functor functor, T out) {
|
||||
void Element(Functor functor, T& out) {
|
||||
|
||||
int m = out.shape()[0];
|
||||
int n = out.shape()[1];
|
||||
int m = out->shape()[0];
|
||||
int n = out->shape()[1];
|
||||
|
||||
int blocks = std::min(MAX_BLOCKS, m);
|
||||
int threads = std::min(MAX_THREADS, n);
|
||||
gElement<<<blocks, threads>>>(functor, out.gpu());
|
||||
|
||||
auto outGpu = static_cast<TensorGPU*>(out.get());
|
||||
|
||||
gElement<<<blocks, threads>>>(functor, outGpu->access());
|
||||
cudaStreamSynchronize(0);
|
||||
}
|
||||
|
||||
@ -62,38 +136,50 @@ void Element(Functor functor, T out) {
|
||||
template <class Functor, class T1, class T2>
|
||||
__global__ void gElement(Functor functor,
|
||||
T1 out, T2 in) {
|
||||
int rows = out.rows();
|
||||
int cols = out.cols();
|
||||
int rows = out.shape()[0];
|
||||
int cols = out.shape()[1];
|
||||
|
||||
for(int bid = 0; bid < rows; bid += gridDim.x) {
|
||||
int i = bid + blockIdx.x;
|
||||
if(i < rows) {
|
||||
for(int tid = 0; tid < cols; tid += blockDim.x) {
|
||||
int j = tid + threadIdx.x;
|
||||
if(j < cols)
|
||||
if(j < cols) {
|
||||
out(i, j) = functor(out(i, j), in(i, j));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <class Functor, class T1, class T2>
|
||||
void Element(Functor functor,
|
||||
T1 out, T2 in) {
|
||||
T1& out, T2& in) {
|
||||
|
||||
int m = out.shape()[0];
|
||||
int n = out.shape()[1];
|
||||
if(out->shape() == in->shape()) {
|
||||
ElementVec(functor, out, in);
|
||||
}
|
||||
else {
|
||||
int m = out->shape()[0];
|
||||
int n = out->shape()[1];
|
||||
|
||||
int blocks = std::min(MAX_BLOCKS, m);
|
||||
int threads = std::min(MAX_THREADS, n);
|
||||
gElement<<<blocks, threads>>>(functor, out.gpu(), in.gpu());
|
||||
|
||||
auto inGpu = static_cast<TensorGPU*>(in.get());
|
||||
auto outGpu = static_cast<TensorGPU*>(out.get());
|
||||
|
||||
gElement<<<blocks, threads>>>(functor,
|
||||
outGpu->access(), inGpu->access());
|
||||
cudaStreamSynchronize(0);
|
||||
}
|
||||
}
|
||||
|
||||
template <class Functor, class T1, class T2, class T3>
|
||||
__global__ void gElement(Functor functor,
|
||||
T1 out, T2 in1, T3 in2) {
|
||||
int rows = out.rows();
|
||||
int cols = out.cols();
|
||||
int rows = out.shape()[0];
|
||||
int cols = out.shape()[1];
|
||||
for(int bid = 0; bid < rows; bid += gridDim.x) {
|
||||
int i = bid + blockIdx.x;
|
||||
if(i < rows) {
|
||||
@ -108,23 +194,35 @@ __global__ void gElement(Functor functor,
|
||||
|
||||
template <class Functor, class T1, class T2, class T3>
|
||||
void Element(Functor functor,
|
||||
T1 out, T2 in1, T3 in2) {
|
||||
T1& out, T2& in1, T3& in2) {
|
||||
|
||||
int m = out.shape()[0];
|
||||
int n = out.shape()[1];
|
||||
if(out->shape() == in1->shape() && in1->shape() == in2->shape()) {
|
||||
ElementVec(functor, out, in1, in2);
|
||||
}
|
||||
else {
|
||||
auto in1Gpu = static_cast<TensorGPU*>(in1.get());
|
||||
auto in2Gpu = static_cast<TensorGPU*>(in2.get());
|
||||
auto outGpu = static_cast<TensorGPU*>(out.get());
|
||||
|
||||
int m = out->shape()[0];
|
||||
int n = out->shape()[1];
|
||||
|
||||
int blocks = std::min(MAX_BLOCKS, m);
|
||||
int threads = std::min(MAX_THREADS, n);
|
||||
gElement<<<blocks, threads>>>(functor, out.gpu(),
|
||||
in1.gpu(), in2.gpu());
|
||||
gElement<<<blocks, threads>>>(functor,
|
||||
outGpu->access(),
|
||||
in1Gpu->access(),
|
||||
in2Gpu->access());
|
||||
cudaStreamSynchronize(0);
|
||||
}
|
||||
}
|
||||
|
||||
template <class Functor, class T1, class T2, class T3, class T4>
|
||||
__global__ void gElement(Functor functor,
|
||||
T1 out, T2 in1, T3 in2, T4 in3) {
|
||||
int rows = out.rows();
|
||||
int cols = out.cols();
|
||||
int rows = out.shape()[0];
|
||||
int cols = out.shape()[1];
|
||||
|
||||
for(int bid = 0; bid < rows; bid += gridDim.x) {
|
||||
int i = bid + blockIdx.x;
|
||||
if(i < rows) {
|
||||
@ -139,48 +237,53 @@ __global__ void gElement(Functor functor,
|
||||
|
||||
template <class Functor, class T1, class T2, class T3, class T4>
|
||||
void Element(Functor functor,
|
||||
T1 out, T2 in1, T3 in2, T4 in3) {
|
||||
T1& out, T2& in1, T3& in2, T4& in3) {
|
||||
|
||||
int m = out.shape()[0];
|
||||
int n = out.shape()[1];
|
||||
auto in1Gpu = static_cast<TensorGPU*>(in1.get());
|
||||
auto in2Gpu = static_cast<TensorGPU*>(in2.get());
|
||||
auto in3Gpu = static_cast<TensorGPU*>(in3.get());
|
||||
auto outGpu = static_cast<TensorGPU*>(out.get());
|
||||
|
||||
int m = outGpu->shape()[0];
|
||||
int n = outGpu->shape()[1];
|
||||
|
||||
int blocks = std::min(MAX_BLOCKS, m);
|
||||
int threads = std::min(MAX_THREADS, n);
|
||||
gElement<<<blocks, threads>>>(functor, out.gpu(),
|
||||
in1.gpu(), in2.gpu(), in3.gpu());
|
||||
gElement<<<blocks, threads>>>(functor,
|
||||
outGpu->access(),
|
||||
in1Gpu->access(),
|
||||
in2Gpu->access(),
|
||||
in3Gpu->access());
|
||||
cudaStreamSynchronize(0);
|
||||
}
|
||||
|
||||
void ClipNorm(Tensor out, float threshold);
|
||||
void ClipNorm(Tensor& out, float threshold);
|
||||
|
||||
void SubtractMax(Tensor out, Tensor in);
|
||||
void SubtractMax(Tensor& out, Tensor& in);
|
||||
|
||||
void Softmax(Tensor out, Tensor in);
|
||||
void Softmax(Tensor& out, Tensor& in);
|
||||
|
||||
void SoftmaxGrad(Tensor grad, Tensor adj, Tensor val);
|
||||
void LogSoftmaxGrad(Tensor grad, Tensor adj, Tensor val);
|
||||
void SoftmaxGrad(Tensor& grad, Tensor& adj, Tensor& val);
|
||||
void LogSoftmaxGrad(Tensor& grad, Tensor& adj, Tensor& val);
|
||||
|
||||
void CudnnSoftmax(Tensor out, Tensor in);
|
||||
void CudnnSoftmaxGrad(Tensor grad, Tensor adj, Tensor val);
|
||||
void CudnnSoftmax(Tensor& out, Tensor& in);
|
||||
void CudnnSoftmaxGrad(Tensor& grad, Tensor& adj, Tensor& val);
|
||||
|
||||
void CudnnLogSoftmax(Tensor out, Tensor in);
|
||||
void CudnnLogSoftmaxGrad(Tensor grad, Tensor adj, Tensor val);
|
||||
void CudnnLogSoftmax(Tensor& out, Tensor& in);
|
||||
void CudnnLogSoftmaxGrad(Tensor& grad, Tensor& adj, Tensor& val);
|
||||
|
||||
void Argmax(Tensor* Out, const Tensor* In);
|
||||
void Argmax(Tensor& Out, const Tensor& In);
|
||||
|
||||
Tensor Prod(cublasHandle_t handle, Tensor C, const Tensor A, const Tensor B,
|
||||
void Prod(cublasHandle_t handle, Tensor& C, const Tensor& A, const Tensor& B,
|
||||
bool transA, bool transB, Float beta);
|
||||
|
||||
Tensor Prod(Tensor C, const Tensor A, const Tensor B,
|
||||
void Prod(Tensor& C, const Tensor& A, const Tensor& B,
|
||||
bool transA, bool transB, Float beta = 0);
|
||||
|
||||
Tensor SumRowwise(cublasHandle_t handle, const Tensor A, Tensor result);
|
||||
void Sum(Tensor& out, const Tensor& in, int axis=-1, bool mean=false);
|
||||
void SumBackward(Tensor& out, const Tensor& in, int axis=-1, bool mean=false);
|
||||
|
||||
Tensor SumRowwise(const Tensor A, Tensor result);
|
||||
|
||||
void ScaleRowwise(Tensor Out, const Tensor ScalingFactors);
|
||||
|
||||
void CudnnDropoutPrepare(Tensor in, float p,
|
||||
void CudnnDropoutPrepare(Tensor& in, float p,
|
||||
cudnnDropoutDescriptor_t* dropDesc,
|
||||
void** space, size_t* spaceSize,
|
||||
void** states, size_t seed);
|
||||
@ -190,11 +293,11 @@ void CudnnDropoutDestroy(cudnnDropoutDescriptor_t dropDesc,
|
||||
|
||||
void CudnnDropoutForward(cudnnDropoutDescriptor_t dropoutDesc,
|
||||
void* space, size_t spaceSize,
|
||||
Tensor out, Tensor in);
|
||||
Tensor& out, Tensor& in);
|
||||
|
||||
void CudnnDropoutBackward(cudnnDropoutDescriptor_t dropoutDesc,
|
||||
void* space, size_t spaceSize,
|
||||
Tensor out, Tensor in);
|
||||
Tensor& out, Tensor& in);
|
||||
|
||||
|
||||
}
|
||||
|
28
src/tensor_test.cu
Normal file
28
src/tensor_test.cu
Normal file
@ -0,0 +1,28 @@
|
||||
#include <iostream>
|
||||
|
||||
#include "tensors/tensor_allocator.h"
|
||||
#include "tensors/tensor_gpu.h"
|
||||
#include "tensor_operators.h"
|
||||
|
||||
using namespace marian;
|
||||
|
||||
// Smoke test for the GPU tensor allocator: allocates 200 tensors of shape
// 784 x 2048 one after another (printing the allocator's capacity and used
// size after each), then writes and reads back the first element of each.
int main() {
  const int kNumTensors = 200;

  TensorAllocator params = newTensorAllocator<DeviceGPU>();
  //params->allocate(100000000);

  std::vector<Tensor> tensors;
  for (int i = 0; i < kNumTensors; ++i) {
    std::cerr << i << std::endl;

    tensors.emplace_back();
    Tensor& latest = tensors.back();
    params->allocate(latest, {784,2048});

    std::cerr << latest->size() << std::endl;
    std::cerr << params->capacity() << " " << params->size() << std::endl;
  }

  // Store a distinct value in element 0 of every tensor and echo it back.
  int index = 0;
  for(auto& tensor : tensors) {
    tensor->set(0, 3.14 * index);
    std::cerr << tensor->get(0) << std::endl;
    ++index;
  }

  return 0;
}
|
@ -236,8 +236,7 @@ class TensorImpl {
|
||||
*
|
||||
* @return Vector in string form.
|
||||
*/
|
||||
std::string Debug() const
|
||||
{
|
||||
std::string Debug() const {
|
||||
std::stringstream strm;
|
||||
assert(shape_.size());
|
||||
strm << "shape=" << marian::Debug(shape_) << std::endl;
|
36
src/tensors/tensor.cu
Normal file
36
src/tensors/tensor.cu
Normal file
@ -0,0 +1,36 @@
|
||||
// This file is part of the Marian toolkit.
|
||||
// Marian is copyright (c) 2016 Marcin Junczys-Dowmunt.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in
|
||||
// all copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
#include "tensors/tensor.h"
|
||||
|
||||
namespace marian {
|
||||
|
||||
// Stream-style upload: copies the contents of `v` into tensor `t` via
// TensorBase::set(const std::vector<float>&) and returns `t` for chaining.
Tensor& operator<<(Tensor& t, const std::vector<float>& v) {
  (*t).set(v);
  return t;
}
|
||||
|
||||
// Stream-style download: copies the contents of tensor `t` into `v` via
// TensorBase::get(std::vector<float>&) and returns `t` for chaining.
Tensor& operator>>(Tensor& t, std::vector<float>& v) {
  (*t).get(v);
  return t;
}
|
||||
|
||||
}
|
76
src/tensors/tensor.h
Normal file
76
src/tensors/tensor.h
Normal file
@ -0,0 +1,76 @@
|
||||
#pragma once
|
||||
|
||||
// This file is part of the Marian toolkit.
|
||||
// Marian is copyright (c) 2016 Marcin Junczys-Dowmunt.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in
|
||||
// all copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
#include <memory>
|
||||
#include <iostream>
|
||||
|
||||
#include "definitions.h"
|
||||
|
||||
namespace marian {
|
||||
|
||||
// Abstract, non-owning view of a block of floats together with its shape.
// The class only stores the pointer it is given; it never allocates or frees
// it. Concrete subclasses supply element access and bulk transfer.
class TensorBase {
  protected:
    float* data_;   // start of the tensor's storage (not owned)
    Shape shape_;   // dimensions of the tensor

  public:
    TensorBase(float* data, Shape shape)
      : data_(data), shape_(shape) {}

    virtual ~TensorBase() {}

    // Re-points the tensor at a different storage location.
    virtual void reset(float* data) { data_ = data; }

    // Raw pointer to the first element.
    virtual float* data() { return data_; }

    // Mutable reference to the stored shape.
    virtual Shape& shape() { return shape_; }

    // Total number of elements, as reported by the shape.
    virtual size_t size() { return shape_.elements(); }

    // Single-element read / write at a flat index.
    virtual float get(size_t) = 0;
    virtual void set(size_t, float) = 0;

    // Fills the tensor with one value.
    virtual void set(float) = 0;

    // Bulk copy out of / into the tensor.
    virtual void get(std::vector<float> &v) = 0;
    virtual void set(const std::vector<float> &v) = 0;

    // Textual description for debugging.
    virtual std::string debug() = 0;
};
|
||||
|
||||
typedef std::shared_ptr<TensorBase> Tensor;
|
||||
|
||||
Tensor& operator<<(Tensor& t, const std::vector<float>& v);
|
||||
|
||||
Tensor& operator>>(Tensor& t, std::vector<float>& v);
|
||||
|
||||
}
|
137
src/tensors/tensor_allocator.h
Normal file
137
src/tensors/tensor_allocator.h
Normal file
@ -0,0 +1,137 @@
|
||||
#pragma once
|
||||
|
||||
// This file is part of the Marian toolkit.
|
||||
// Marian is copyright (c) 2016 Marcin Junczys-Dowmunt.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in
|
||||
// all copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
#include <set>
|
||||
|
||||
#include "definitions.h"
|
||||
#include "tensors/tensor.h"
|
||||
|
||||
namespace marian {
|
||||
|
||||
// Device-independent interface of a tensor allocator.
class TensorAllocatorBase {
  public:
    virtual ~TensorAllocatorBase() {}

    // Grow the underlying storage to hold at least / exactly this many floats.
    virtual void reserve(size_t) = 0;
    virtual void reserveExact(size_t) = 0;

    // Forget all tensors handed out so far.
    virtual void clear() = 0;

    // Make the given handle refer to a tensor of the given shape.
    virtual void allocate(Tensor&, Shape) = 0;

    // Reserved and used storage.
    virtual size_t capacity() = 0;
    virtual size_t size() = 0;

    // View of the used storage as a single tensor.
    virtual Tensor asTensor() = 0;
};
|
||||
|
||||
// Allocator that places tensors back to back inside one contiguous buffer
// owned by `Device`. When the buffer has to grow, the device may move it, so
// every tensor handed out so far is re-pointed into the new buffer.
template <class Device>
class TensorAllocatorDerived : public TensorAllocatorBase {
  private:
    const size_t CHUNK = 128;                              // growth step in MB
    const size_t MBYTE = 1024 * 1024;                      // bytes per MB
    const size_t FLOATS = CHUNK * MBYTE / sizeof(float);   // floats per step

    Device device_;
    std::vector<Tensor> allocated_;   // tensors in allocation order

    // First float after the most recently allocated tensor, or the start of
    // the buffer when nothing has been allocated yet.
    float* nextFree() {
      if(allocated_.empty())
        return device_.data();
      return allocated_.back()->data() + allocated_.back()->size();
    }

    // Lays all known tensors out again, back to back, from the (possibly
    // relocated) start of the device buffer.
    void resetAllocated() {
      float* cursor = device_.data();
      for(auto& tensor : allocated_) {
        tensor->reset(cursor);
        cursor += tensor->size();
      }
    }

    // Grows the buffer if a tensor of `shape` does not fit behind the last one.
    void checkSpace(Shape shape) {
      size_t available = device_.data() + device_.capacity() - nextFree();
      if(shape.elements() > available) {
        // used part (capacity - available) plus the newly required elements
        reserve(device_.capacity() - available + shape.elements());
      }
    }

  public:
    // Reserves room for `elements` floats, rounded up to the next multiple of
    // the chunk size (always at least one chunk more than elements / FLOATS).
    void reserve(size_t elements = 0) {
      float chunks = elements / FLOATS + 1;
      std::cerr << "Reserving " << chunks * CHUNK << " MB" << std::endl;
      device_.reserve(chunks * FLOATS);
      resetAllocated();
    }

    // Reserves room for exactly `elements` floats.
    void reserveExact(size_t elements = 0) {
      size_t mbytes = (elements * sizeof(float)) / MBYTE;
      std::cerr << "Reserving space for " << elements
                << " floats (" << mbytes << " MB)" << std::endl;
      device_.reserve(elements);
      resetAllocated();
    }

    // Drops the bookkeeping for all tensors; the device buffer is kept.
    void clear() {
      allocated_.clear();
    }

    // Makes `t` refer to a tensor of `shape` placed behind the last allocated
    // one. Does nothing when `t` already exists with that shape.
    void allocate(Tensor &t, Shape shape) {
      const bool alreadyFits = t && !(t->shape() != shape);
      if(alreadyFits)
        return;

      checkSpace(shape);

      // Computed after checkSpace(), which may have moved the buffer.
      float* start = nextFree();
      t.reset(new typename Device::tensor_type(start, shape));
      allocated_.push_back(t);
    }

    // A 1 x size() tensor spanning everything allocated so far.
    Tensor asTensor() {
      float* start = device_.data();
      return Tensor(new typename Device::tensor_type(start, {1, (int)size()}));
    }

    // Number of floats the device buffer can hold.
    size_t capacity() {
      return device_.capacity();
    }

    // Number of floats between the buffer start and the end of the last tensor.
    size_t size() {
      return nextFree() - device_.data();
    }
};
|
||||
|
||||
typedef std::shared_ptr<TensorAllocatorBase> TensorAllocator;
|
||||
|
||||
// Factory: creates an allocator for the given device type and returns it
// behind the device-independent TensorAllocator handle.
template <class Device>
TensorAllocator newTensorAllocator() {
  return std::make_shared<TensorAllocatorDerived<Device>>();
}
|
||||
|
||||
}
|
95
src/tensors/tensor_cpu.h
Normal file
95
src/tensors/tensor_cpu.h
Normal file
@ -0,0 +1,95 @@
|
||||
#pragma once
|
||||
|
||||
// This file is part of the Marian toolkit.
|
||||
// Marian is copyright (c) 2016 Marcin Junczys-Dowmunt.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in
|
||||
// all copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
#include <cstring>
|
||||
|
||||
#include "tensors/tensor.h"
|
||||
|
||||
namespace marian {
|
||||
|
||||
// Tensor whose storage lives in host memory. Like TensorBase it does not own
// the buffer it points to.
class TensorCPU : public TensorBase {
  public:
    TensorCPU(float* data, Shape shape)
    : TensorBase(data, shape) {}

    // Reads the element at flat index `i` (no bounds check).
    float get(size_t i) {
      return data_[i];
    }

    // Writes `value` to flat index `i` (no bounds check).
    void set(size_t i, float value) {
      data_[i] = value;
    }

    // Copies the whole tensor into `v`, resizing `v` to size() elements.
    void get(std::vector<float> &v) {
      v.resize(size());
      std::copy(data_, data_ + size(), v.begin());
    }

    // Fills every element with `value`.
    void set(float value) {
      std::fill(data_, data_ + size(), value);
    }

    // Copies all of `v` into the tensor. The caller must ensure that
    // v.size() does not exceed size(); no check is made here.
    void set(const std::vector<float> &v) {
      std::copy(v.begin(), v.end(), data_);
    }

    // Required override of the pure virtual TensorBase::debug(); without it
    // this class stays abstract and cannot be instantiated by the allocator.
    // Returns "shape=<rows>x<cols>" followed by the elements in storage
    // order, separated by spaces.
    std::string debug() {
      int rows = shape_[0];
      int cols = shape_[1];

      std::string text = "shape=" + std::to_string(rows)
                       + "x" + std::to_string(cols) + "\n";

      size_t count = size();
      for(size_t i = 0; i < count; ++i) {
        text += std::to_string(data_[i]);
        text += (i + 1 < count) ? " " : "\n";
      }
      return text;
    }
};
|
||||
|
||||
// Owner of one contiguous, growable float buffer in host memory. Serves as
// the `Device` parameter of the tensor allocator for CPU tensors.
class DeviceCPU {
  private:
    float* data_;   // owned buffer, or null before the first reserve()
    size_t size_;   // number of floats in data_

  public:
    DeviceCPU()
     : data_(0), size_(0) {}

    ~DeviceCPU() {
      if(data_)
        delete[] data_;
    }

    // The buffer is owned through a raw pointer, so an implicit copy would
    // make two objects delete[] the same memory. Copying is disabled.
    DeviceCPU(const DeviceCPU&) = delete;
    DeviceCPU& operator=(const DeviceCPU&) = delete;

    typedef TensorCPU tensor_type;

    // Grows the buffer to `size` floats, preserving the existing contents.
    // Throws (via UTIL_THROW_IF2) when `size` is smaller than the current
    // capacity. The buffer may move, which invalidates earlier data() values.
    void reserve(size_t size) {
      UTIL_THROW_IF2(size < size_, "New size must be larger than old size");
      float* temp = new float[size];

      if(data_) {
        std::memcpy(temp, data_, size_* sizeof(float));
        delete[] data_;
      }

      data_ = temp;
      size_ = size;
    }

    // Start of the buffer (null before the first reserve()).
    float* data() {
      return data_;
    }

    // Number of floats the buffer can hold.
    size_t capacity() {
      return size_;
    }
};
|
||||
|
||||
}
|
202
src/tensors/tensor_gpu.h
Normal file
202
src/tensors/tensor_gpu.h
Normal file
@ -0,0 +1,202 @@
|
||||
#pragma once
|
||||
|
||||
// This file is part of the Marian toolkit.
|
||||
// Marian is copyright (c) 2016 Marcin Junczys-Dowmunt.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in
|
||||
// all copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
#include <sstream>
|
||||
#include <cuda.h>
|
||||
#include <cudnn.h>
|
||||
#include <thrust/device_ptr.h>
|
||||
#include <thrust/device_vector.h>
|
||||
|
||||
#include "exception.h"
|
||||
#include "definitions.h"
|
||||
#include "tensors/tensor.h"
|
||||
|
||||
namespace marian {
|
||||
|
||||
// Evaluates a CUDA runtime call and hands its status, together with the call
// site, to gpuAssert(). Wrapped in do/while(0) so that `CUDA_CHECK(x);` is a
// single statement and stays correct inside an unbraced if/else.
#define CUDA_CHECK(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while(0)

// Reports a failed CUDA runtime call on stderr (error string, file, line).
// When `abort` is true (the default) the process then exits with the CUDA
// error code as its exit status. Does nothing for cudaSuccess.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
  if (code == cudaSuccess)
    return;

  fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
  if (abort) exit(code);
}
|
||||
|
||||
// Small by-value handle (pointer + shape) that kernels use to index a 2-D
// tensor. It does not own the memory it points to.
struct Access {
  float* data_;
  Shape shape_;

  Access(float* data, const Shape& shape)
  : data_(data), shape_(shape) {}

  // Element access with broadcasting: an index along a dimension of extent 1
  // is ignored, so a 1 x cols or rows x 1 tensor can be addressed with the
  // (i, j) of a larger tensor. A 1 x 1 tensor always yields its only element.
  __device__
  inline float& operator()(size_t i, size_t j) {
    int rows = shape_[0];
    int cols = shape_[1];

    const bool manyRows = rows != 1;
    const bool manyCols = cols != 1;

    size_t offset = 0;              // 1 x 1: the single element
    if(manyRows && manyCols)
      offset = i * cols + j;        // full matrix, row-major
    else if(manyRows)
      offset = i;                   // column vector: j is ignored
    else if(manyCols)
      offset = j;                   // row vector: i is ignored

    return data_[offset];
  }

  __device__ __host__
  float* data() {
    return data_;
  }

  __device__ __host__
  Shape& shape() {
    return shape_;
  }

  //Access* toDevice() {
  //  Access* ptr;
  //  cudaMalloc(&ptr, sizeof(Access));
  //  cudaMemcpy(ptr, this, sizeof(Access), cudaMemcpyHostToDevice);
  //  return ptr;
  //}
};
|
||||
|
||||
// Tensor whose elements are transferred to and from the host with
// cudaMemcpy. The data pointer and shape are handed to TensorBase; this
// class adds a cuDNN tensor descriptor plus host<->device accessors.
//
// NOTE(review): the class owns cudnnDesc_ and destroys it in the destructor
// but is still implicitly copyable; copying an instance would destroy the
// same descriptor twice. Confirm instances are only handled through
// pointers before relying on copies.
class TensorGPU : public TensorBase {
  private:
    // cuDNN stuff
    cudnnTensorDescriptor_t cudnnDesc_;

  public:
    // Creates the tensor over `data` with the given shape and sets up a
    // float cuDNN descriptor with dimensions (shape[0], shape[1], 1, 1) and
    // strides (shape[1], 1, 1, 1).
    // NOTE(review): the cuDNN status codes are not checked here.
    TensorGPU(float* data, Shape shape)
    : TensorBase(data, shape) {
      cudnnCreateTensorDescriptor(&cudnnDesc_);
      cudnnSetTensor4dDescriptorEx(cudnnDesc_, CUDNN_DATA_FLOAT,
                                   shape_[0], shape_[1], 1, 1,
                                   shape_[1], 1, 1, 1);
    }

    ~TensorGPU() {
      cudnnDestroyTensorDescriptor(cudnnDesc_);
    }

    // Copies element `i` to the host and returns it.
    // No bounds check is performed on `i`.
    float get(size_t i) {
      float temp;
      CUDA_CHECK(cudaMemcpy(&temp, data_ + i, sizeof(float),
                            cudaMemcpyDeviceToHost));
      return temp;
    }

    // Overwrites element `i` with `value`.
    // No bounds check is performed on `i`.
    void set(size_t i, float value) {
      CUDA_CHECK(cudaMemcpy(data_ + i, &value, sizeof(float),
                            cudaMemcpyHostToDevice));
    }

    // Resizes `v` to size() and copies all elements into it.
    void get(std::vector<float> &v) {
      v.resize(size());
      CUDA_CHECK(cudaMemcpy(v.data(), data_, size() * sizeof(float),
                            cudaMemcpyDeviceToHost));
    }

    // Fills every element with `value`.
    void set(float value) {
      thrust::fill(thrust::device_ptr<float>(data_),
                   thrust::device_ptr<float>(data_ + size()), value);
    }

    // Copies the contents of `v` into the start of the tensor. A vector
    // shorter than the tensor only overwrites the leading elements.
    // Throws (via UTIL_THROW_IF2) when `v` holds more elements than the
    // tensor, since the copy would otherwise write past the buffer.
    void set(const std::vector<float> &v) {
      UTIL_THROW_IF2(v.size() > size(),
                     "Vector has more elements than the tensor can hold");
      CUDA_CHECK(cudaMemcpy(data_, v.data(), v.size() * sizeof(float),
                            cudaMemcpyHostToDevice));
    }

    // cuDNN descriptor created in the constructor.
    cudnnTensorDescriptor_t& cudnn() {
      return cudnnDesc_;
    }

    // Builds an Access view over this tensor's data and shape.
    Access access() {
      return Access(data_, shape_);
    }

    // Returns a textual dump: a "shape=RxC" line followed by one line per
    // row with the values separated by spaces. Copies the whole tensor to
    // the host first.
    std::string debug() {
      std::stringstream strm;
      assert(shape_.size());
      strm << "shape=" << shape_[0] << "x" << shape_[1] << std::endl;

      // values
      size_t totSize = shape_.elements();
      std::vector<Float> values(totSize);
      get(values);

      size_t ind = 0;
      for (size_t i = 0; i < shape()[0]; ++i) {
        for (size_t j = 0; j < shape()[1]; ++j) {
          strm << values[ind] << " ";
          ++ind;
        }
        strm << std::endl;
      }
      return strm.str();
    }
};
|
||||
|
||||
// Owns a single growable float buffer allocated with cudaMalloc and
// released with cudaFree in the destructor.
class DeviceGPU {
  private:
    float* data_;   // start of the allocation, or 0 when nothing is allocated
    size_t size_;   // capacity in floats

  public:
    DeviceGPU()
    : data_(0), size_(0) {}

    // The buffer is freed exactly once by the destructor, so copies (which
    // would share data_ and free it twice) are disallowed.
    DeviceGPU(const DeviceGPU&) = delete;
    DeviceGPU& operator=(const DeviceGPU&) = delete;

    ~DeviceGPU() {
      if(data_)
        CUDA_CHECK(cudaFree(data_));
    }

    typedef TensorGPU tensor_type;

    // Replaces the buffer with a new allocation of `size` floats, copying
    // the previous contents (the first capacity() floats) into it and
    // freeing the old allocation. Pointers previously obtained from data()
    // are invalid afterwards.
    // Throws (via UTIL_THROW_IF2) when `size` is smaller than the current
    // capacity.
    void reserve(size_t size) {
      UTIL_THROW_IF2(size < size_, "New size must be larger than old size");
      float *temp;
      CUDA_CHECK(cudaMalloc(&temp, size * sizeof(float)));

      if(data_) {
        CUDA_CHECK(cudaMemcpy(temp, data_, size_* sizeof(float),
                              cudaMemcpyDeviceToDevice));
        CUDA_CHECK(cudaFree(data_));
      }

      data_ = temp;
      size_ = size;
    }

    // Pointer to the start of the buffer (0 before the first reserve()).
    float* data() {
      return data_;
    }

    // Current capacity in floats.
    size_t capacity() {
      return size_;
    }
};
|
||||
|
||||
}
|
@ -57,7 +57,7 @@ class Trainer : public RunBase,
|
||||
while(bg) {
|
||||
BatchPtr batch = bg.next();
|
||||
opt->update(graph_, batch);
|
||||
cost += (*graph_)["cost"]->val()[0] * batch->dim();
|
||||
cost += (*graph_)["cost"]->val()->get(0) * batch->dim();
|
||||
totalExamples += batch->dim();
|
||||
update++;
|
||||
}
|
||||
@ -115,7 +115,7 @@ class Validator : public RunBase,
|
||||
BatchGenerator bg(dataset_, batchSize);
|
||||
|
||||
size_t update = 0;
|
||||
bg.prepare();
|
||||
bg.prepare(false);
|
||||
|
||||
float total = 0;
|
||||
float cor = 0;
|
||||
@ -123,7 +123,7 @@ class Validator : public RunBase,
|
||||
BatchPtr batch = bg.next();
|
||||
graph_->inference(batch);
|
||||
std::vector<float> scores;
|
||||
scores << (*graph_)["scores"]->val();
|
||||
(*graph_)["scores"]->val()->get(scores);
|
||||
|
||||
cor += correct(scores, batch->inputs()[1].data());
|
||||
total += batch->dim();
|
||||
|
Loading…
Reference in New Issue
Block a user