mirror of https://github.com/marian-nmt/marian.git (synced 2024-09-17 09:47:34 +03:00)

Autoformat files

This commit is contained in:
  parent 5f2eedc6e5
  commit 6d0c75cf48
@@ -1,3 +1,3 @@
#!/bin/bash

find ./src \( -path ./src/3rd_party -o -path ./src/tests -o -path ./src/models/experimental \) -prune -o -iname *.h -o -iname *.cpp -o -iname *.cu | xargs clang-format-3.8 -i
find ./src \( -path ./src/3rd_party -o -path ./src/tests -o -path ./src/models/experimental \) -prune -o -iname *.h -o -iname *.cpp -o -iname *.cu | xargs clang-format -i

@@ -10,7 +10,6 @@
#include "training/graph_group_async_drop.h"
#endif

bool configureMPI(int, char**);

int main(int argc, char** argv) {
@@ -18,7 +17,7 @@ int main(int argc, char** argv) {

auto options = New<Config>(argc, argv);
auto devices = options->getDevices();

if(options->get<bool>("multi-node")) {
ABORT_IF(!configureMPI(argc, argv), "MPI not found.");

@@ -65,5 +65,4 @@ void Config::AddYamlToNpz(const YAML::Node& yaml,
unsigned shape = out.size() + 1;
cnpy::npz_save(fName, varName, out.c_str(), &shape, 1, "a");
}

}

@@ -76,7 +76,9 @@ public:
log();

if(has("version"))
LOG(info, "[config] Model created with Marian {}", get("version").as<std::string>());
LOG(info,
"[config] Model created with Marian {}",
get("version").as<std::string>());
}

Config(const Config& other) : config_(YAML::Clone(other.config_)) {}
@@ -110,9 +112,7 @@ public:
YAML::Node getModelParameters();
void loadModelParameters(const std::string& name);

const std::vector<DeviceId>& getDevices() {
return devices_;
}
const std::vector<DeviceId>& getDevices() { return devices_; }

void save(const std::string& name) {
OutputFileStream out(name);

@@ -2,8 +2,8 @@
#include <boost/algorithm/string.hpp>
#include <boost/regex.hpp>
#include <set>
#include <string>
#include <stdexcept>
#include <string>

#if MKL_FOUND
//#include <omp.h>
@@ -15,9 +15,9 @@
#endif
#endif

#include "3rd_party/cnpy/cnpy.h"
#include "common/definitions.h"

#include "common/config.h"
#include "common/config_parser.h"
#include "common/file_stream.h"
@@ -146,13 +146,14 @@ bool ConfigParser::has(const std::string& key) const {

void ConfigParser::validateOptions() const {
if(mode_ == ConfigMode::translating) {
UTIL_THROW_IF2(!has("vocabs") || get<std::vector<std::string>>("vocabs").empty(),
UTIL_THROW_IF2(
!has("vocabs") || get<std::vector<std::string>>("vocabs").empty(),
"Translating, but vocabularies are not given!");

for(const auto& modelFile : get<std::vector<std::string>>("models")) {
boost::filesystem::path modelPath(modelFile);
UTIL_THROW_IF2(!boost::filesystem::exists(modelPath),
"Model file does not exist: " + modelFile);
"Model file does not exist: " + modelFile);
}

return;
@@ -177,9 +178,10 @@ void ConfigParser::validateOptions() const {

if(mode_ == ConfigMode::rescoring) {
UTIL_THROW_IF2(!boost::filesystem::exists(modelPath),
"Model file does not exist: " + modelPath.string());
"Model file does not exist: " + modelPath.string());

UTIL_THROW_IF2(!has("vocabs") || get<std::vector<std::string>>("vocabs").empty(),
UTIL_THROW_IF2(
!has("vocabs") || get<std::vector<std::string>>("vocabs").empty(),
"Scoring, but vocabularies are not given!");

return;
@@ -193,8 +195,9 @@ void ConfigParser::validateOptions() const {
!modelDir.empty() && !boost::filesystem::is_directory(modelDir),
"Model directory does not exist");

UTIL_THROW_IF2(!modelDir.empty() && !(boost::filesystem::status(modelDir).permissions()
& boost::filesystem::owner_write),
UTIL_THROW_IF2(!modelDir.empty()
&& !(boost::filesystem::status(modelDir).permissions()
& boost::filesystem::owner_write),
"No write permission in model directory");

UTIL_THROW_IF2(
@@ -835,7 +838,6 @@ void ConfigParser::parseOptions(int argc, char** argv, bool doValidate) {
SET_OPTION("transformer-dim-ffn", int);
SET_OPTION("transformer-ffn-activation", std::string);

#ifdef CUDNN
SET_OPTION("char-stride", int);
SET_OPTION("char-highway", int);
@@ -976,7 +978,7 @@ void ConfigParser::parseOptions(int argc, char** argv, bool doValidate) {
SET_OPTION("relative-paths", bool);
SET_OPTION("devices", std::vector<std::string>);
SET_OPTION("cpu-threads", size_t);
//SET_OPTION("omp-threads", size_t);
// SET_OPTION("omp-threads", size_t);

SET_OPTION("mini-batch", int);
SET_OPTION("maxi-batch", int);
@@ -1021,24 +1023,22 @@ void ConfigParser::parseOptions(int argc, char** argv, bool doValidate) {
exit(0);
}

// @TODO: this should probably be in processOptionDevices()
//#ifdef BLAS_FOUND
// //omp_set_num_threads(vm_["omp-threads"].as<size_t>());
//#ifdef MKL_FOUND
// mkl_set_num_threads(vm_["omp-threads"].as<size_t>());
//#endif
//#endif
// @TODO: this should probably be in processOptionDevices()
//#ifdef BLAS_FOUND
// //omp_set_num_threads(vm_["omp-threads"].as<size_t>());
//#ifdef MKL_FOUND
// mkl_set_num_threads(vm_["omp-threads"].as<size_t>());
//#endif
//#endif
}

std::vector<DeviceId> ConfigParser::getDevices() {
std::vector<DeviceId> devices;

try {

std::string devicesStr
= Join(config_["devices"].as<std::vector<std::string>>());

if(mode_ == ConfigMode::training && get<bool>("multi-node")) {
auto parts = Split(devicesStr, ":");
for(size_t i = 1; i < parts.size(); ++i) {
@@ -1061,11 +1061,10 @@ std::vector<DeviceId> ConfigParser::getDevices() {
if(config_["cpu-threads"].as<size_t>() > 0) {
devices.clear();
for(size_t i = 0; i < config_["cpu-threads"].as<size_t>(); ++i)
devices.push_back({i, DeviceType::cpu});
devices.push_back({i, DeviceType::cpu});
}

}
catch(...) {
} catch(...) {
ABORT("Problem parsing devices, please report an issue on github");
}

@@ -1,10 +1,10 @@
#pragma once

#include <functional>
#include <iostream>
#include <memory>
#include <string>
#include <vector>
#include <iostream>

#include "common/logging.h"
#include "shape.h"
@@ -57,7 +57,6 @@ struct DeviceId {
friend bool operator==(DeviceId id1, DeviceId id2) {
return id1.no == id2.no && id1.type == id2.type;
}

};

class TensorBase;

@@ -1,195 +1,191 @@
#pragma once

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <string>
#include <sstream>
#include <string>
#include <vector>
#include <algorithm>

#include "common/logging.h"

namespace marian {

struct Shape {
public:
std::vector<int> shape_;
public:
std::vector<int> shape_;

public:
Shape() : shape_{1} {}
public:
Shape() : shape_{1} {}

Shape(std::initializer_list<int> il) : Shape() {
shape_.resize(il.size());
std::copy(il.begin(), il.end(), begin());
Shape(std::initializer_list<int> il) : Shape() {
shape_.resize(il.size());
std::copy(il.begin(), il.end(), begin());
}

void resize(size_t n) { shape_.resize(n, 1); }

const int* data() const { return shape_.data(); }

int* data() { return shape_.data(); }

Shape(const Shape& shape) : Shape() {
shape_.resize(shape.size());
std::copy(shape.begin(), shape.end(), begin());
}

inline void set(int i, int val) { dim(i) = val; }

inline int& dim(int i) {
if(i >= 0) {
ABORT_IF(i >= size(),
"Index {} is out of bounds, shape has {} dimension",
i,
size());
return shape_[i];
} else {
ABORT_IF((int)size() + i < 0,
"Negative index {} is out of bounds, shape has {} dimension",
i,
size());
return shape_[size() + i];
}
}

void resize(size_t n) {
shape_.resize(n, 1);
}
inline const int& dim(int i) const {
return const_cast<Shape&>(*this).dim(i);
}

const int* data() const {
return shape_.data();
}
inline int operator[](int i) { return dim(i); }

int* data() {
return shape_.data();
}
inline int operator[](int i) const { return dim(i); }

Shape(const Shape& shape) : Shape() {
shape_.resize(shape.size());
std::copy(shape.begin(), shape.end(), begin());
}
inline int& back() { return shape_.back(); }

inline void set(int i, int val) {
dim(i) = val;
}
inline int stride(int i) const {
std::vector<int> stride(shape_.size(), 1);
for(int j = shape_.size() - 2; j >= 0; --j)
stride[j] = stride[j + 1] * shape_[j + 1];

inline int& dim(int i) {
if(i >= 0) {
ABORT_IF(i >= size(),
"Index {} is out of bounds, shape has {} dimension", i, size());
return shape_[i];
}
else {
ABORT_IF((int)size() + i < 0,
"Negative index {} is out of bounds, shape has {} dimension", i, size());
return shape_[size() + i];
if(i >= 0)
return stride[i];
else
return stride[size() + i];
}

inline size_t size() const { return shape_.size(); }

inline int elements() const {
int el = 1;
for(auto s : shape_)
el *= s;
return el;
}

inline void dims(int i, std::vector<int>& d) const {
d.resize(shape_.size());

std::vector<int> stride(shape_.size(), 1);
for(int j = shape_.size() - 2; j >= 0; --j)
stride[j] = stride[j + 1] * shape_[j + 1];

for(int j = 0; j < d.size(); ++j)
d[j] = (i / stride[j]) % shape_[j];
}

auto begin() -> decltype(shape_.begin()) { return shape_.begin(); }
auto begin() const -> decltype(shape_.begin()) { return shape_.begin(); }

auto end() -> decltype(shape_.end()) { return shape_.end(); }
auto end() const -> decltype(shape_.end()) { return shape_.end(); }

auto rbegin() -> decltype(shape_.rbegin()) { return shape_.rbegin(); }
auto rbegin() const -> decltype(shape_.rbegin()) { return shape_.rbegin(); }

auto rend() -> decltype(shape_.rend()) { return shape_.rend(); }
auto rend() const -> decltype(shape_.rend()) { return shape_.rend(); }

bool operator==(const Shape& other) const {
return size() == other.size() && std::equal(begin(), end(), other.begin());
}

bool operator!=(const Shape& other) const { return !(*this == other); }

std::string toString() const {
std::stringstream strm;
strm << "shape=" << (*this)[0];
for(int i = 1; i < size(); ++i)
strm << "x" << (*this)[i];
strm << " size=" << elements() << " (" << elements() * sizeof(float)
<< "B)";
return strm.str();
}

friend std::ostream& operator<<(std::ostream& strm, const Shape& shape) {
strm << shape.toString();
return strm;
}

operator std::string() const {
std::stringstream ss;
ss << *this;
return ss.str();
}

int axis(int ax) {
if(ax < 0)
return size() + ax;
else
return ax;
}

static Shape broadcast(const std::vector<Shape>& shapes) {
int maxDims = 0;
for(auto& s : shapes)
if(s.size() > maxDims)
maxDims = s.size();

Shape shape;
shape.resize(maxDims);

for(auto& s : shapes) {
for(int i = 0; i < s.size(); ++i) {
ABORT_IF(shape[-i] != s[-i] && shape[-i] != 1 && s[-i] != 1,
"Shapes {} and {} cannot be broadcasted",
(std::string)shape,
(std::string)s);
shape.set(-i, std::max(shape[-i], s[-i]));
}
}
return shape;
}

inline const int& dim(int i) const { return const_cast<Shape&>(*this).dim(i); }
template <typename T>
static Shape broadcast(const std::initializer_list<T>& il) {
return broadcast(std::vector<T>(il));
}

inline int operator[](int i) { return dim(i); }
template <typename T>
static Shape broadcast(const std::vector<T>& nodes) {
int maxDims = 0;
for(auto& n : nodes)
if(n->shape().size() > maxDims)
maxDims = n->shape().size();

inline int operator[](int i) const { return dim(i); }
Shape shape;
shape.resize(maxDims);

inline int& back() { return shape_.back(); }

inline int stride(int i) const {
std::vector<int> stride(shape_.size(), 1);
for(int j = shape_.size() - 2; j >= 0; --j)
stride[j] = stride[j + 1] * shape_[j + 1];

if(i >= 0)
return stride[i];
else
return stride[size() + i];
}

inline size_t size() const { return shape_.size(); }

inline int elements() const {
int el = 1;
for(auto s : shape_)
el *= s;
return el;
}

inline void dims(int i, std::vector<int>& d) const {
d.resize(shape_.size());

std::vector<int> stride(shape_.size(), 1);
for(int j = shape_.size() - 2; j >= 0; --j)
stride[j] = stride[j + 1] * shape_[j + 1];

for(int j = 0; j < d.size(); ++j)
d[j] = (i / stride[j]) % shape_[j];
}

auto begin() -> decltype(shape_.begin()) { return shape_.begin(); }
auto begin() const -> decltype(shape_.begin()) { return shape_.begin(); }

auto end() -> decltype(shape_.end()) { return shape_.end(); }
auto end() const -> decltype(shape_.end()) { return shape_.end(); }

auto rbegin() -> decltype(shape_.rbegin()) { return shape_.rbegin(); }
auto rbegin() const -> decltype(shape_.rbegin()) { return shape_.rbegin(); }

auto rend() -> decltype(shape_.rend()) { return shape_.rend(); }
auto rend() const -> decltype(shape_.rend()) { return shape_.rend(); }

bool operator==(const Shape& other) const {
return size() == other.size() && std::equal(begin(), end(), other.begin());
}

bool operator!=(const Shape& other) const { return !(*this == other); }

std::string toString() const {
std::stringstream strm;
strm << "shape=" << (*this)[0];
for(int i = 1; i < size(); ++i)
strm << "x" << (*this)[i];
strm << " size=" << elements() << " ("
<< elements() * sizeof(float) << "B)";
return strm.str();
}

friend std::ostream& operator<<(std::ostream& strm, const Shape& shape) {
strm << shape.toString();
return strm;
}

operator std::string() const {
std::stringstream ss;
ss << *this;
return ss.str();
}

int axis(int ax) {
if(ax < 0)
return size() + ax;
else
return ax;
}

static Shape broadcast(const std::vector<Shape>& shapes) {
int maxDims = 0;
for(auto& s : shapes)
if(s.size() > maxDims)
maxDims = s.size();

Shape shape;
shape.resize(maxDims);

for(auto& s : shapes) {
for(int i = 0; i < s.size(); ++i) {
ABORT_IF(shape[-i] != s[-i] && shape[-i] != 1 && s[-i] != 1,
"Shapes {} and {} cannot be broadcasted",
(std::string)shape,
(std::string)s);
shape.set(-i, std::max(shape[-i], s[-i]));
}
for(auto& node : nodes) {
const Shape& shapen = node->shape();
for(int i = 1; i <= shapen.size(); ++i) {
ABORT_IF(shape[-i] != shapen[-i] && shape[-i] != 1 && shapen[-i] != 1,
"Shapes {} and {} cannot be broadcasted",
(std::string)shape,
(std::string)shapen);
shape.set(-i, std::max(shape[-i], shapen[-i]));
}
return shape;
}

template <typename T>
static Shape broadcast(const std::initializer_list<T>& il) {
return broadcast(std::vector<T>(il));
}

template <typename T>
static Shape broadcast(const std::vector<T>& nodes) {
int maxDims = 0;
for(auto& n : nodes)
if(n->shape().size() > maxDims)
maxDims = n->shape().size();

Shape shape;
shape.resize(maxDims);

for(auto& node : nodes) {
const Shape& shapen = node->shape();
for(int i = 1; i <= shapen.size(); ++i) {
ABORT_IF(shape[-i] != shapen[-i] && shape[-i] != 1 && shapen[-i] != 1,
"Shapes {} and {} cannot be broadcasted",
(std::string)shape,
(std::string)shapen);
shape.set(-i, std::max(shape[-i], shapen[-i]));
}
}
return shape;
}
return shape;
}
};

}

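For orientation (an illustration added here, not part of the commit): Shape::broadcast above aligns dimensions from the right and lets a dimension of 1 stretch to match the other shape. A minimal standalone C++ sketch of that rule, with hypothetical names:

#include <algorithm>
#include <cassert>
#include <vector>

// Hypothetical helper mirroring the broadcast rule shown above: dimensions are
// aligned from the right, a dimension of 1 stretches, anything else must match.
std::vector<int> broadcastDims(std::vector<int> a, const std::vector<int>& b) {
  if(a.size() < b.size())
    a.insert(a.begin(), b.size() - a.size(), 1);  // pad the shorter shape with 1s
  for(size_t i = 1; i <= b.size(); ++i) {
    int& da = a[a.size() - i];
    int db = b[b.size() - i];
    assert(da == db || da == 1 || db == 1);  // otherwise not broadcastable
    da = std::max(da, db);
  }
  return a;
}

// Example: broadcastDims({4, 1, 3}, {5, 1}) yields {4, 5, 3}.
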
@@ -1,11 +1,11 @@
#pragma once

#include <boost/timer/timer.hpp>
#include <condition_variable>
#include <deque>
#include <functional>
#include <queue>
#include <mutex>
#include <condition_variable>
#include <boost/timer/timer.hpp>
#include <queue>

#include "common/config.h"
#include "data/batch_stats.h"
@@ -47,16 +47,17 @@ private:

void fillBatches(bool shuffle = true) {
typedef typename sample::value_type Item;
auto itemCmp = [](const Item& sa, const Item& sb) {
return sa.size() < sb.size();
};
auto itemCmp
= [](const Item& sa, const Item& sb) { return sa.size() < sb.size(); };

auto cmpSrc = [itemCmp](const sample& a, const sample& b) {
return std::lexicographical_compare(a.begin(), a.end(), b.begin(), b.end(), itemCmp);
return std::lexicographical_compare(
a.begin(), a.end(), b.begin(), b.end(), itemCmp);
};

auto cmpTrg = [itemCmp](const sample& a, const sample& b) {
return std::lexicographical_compare(a.rbegin(), a.rend(), b.rbegin(), b.rend(), itemCmp);
return std::lexicographical_compare(
a.rbegin(), a.rend(), b.rbegin(), b.rend(), itemCmp);
};

auto cmpNone = [](const sample& a, const sample& b) { return &a < &b; };
@@ -168,9 +169,8 @@ public:
operator bool() const {
// wait if empty but loading
std::unique_lock<std::mutex> lock(loadMutex_);
loadCondition_.wait(lock, [this]{
return loadReady_ || !bufferedBatches_.empty();
});
loadCondition_.wait(
lock, [this] { return loadReady_ || !bufferedBatches_.empty(); });

return !bufferedBatches_.empty();
}
@@ -178,15 +178,16 @@ public:
BatchPtr next() {
{
std::unique_lock<std::mutex> lock(loadMutex_);
loadCondition_.wait(lock, [this]{
return loadReady_ || !bufferedBatches_.empty();
});
loadCondition_.wait(
lock, [this] { return loadReady_ || !bufferedBatches_.empty(); });
}

ABORT_IF(bufferedBatches_.empty(), "No batches to fetch, run prepare()");
currentBatch_ = bufferedBatches_.front();

if(loadReady_ && bufferedBatches_.size() <= std::max(options_->get<int>("maxi-batch") / 5, 1)) {
if(loadReady_
&& bufferedBatches_.size()
<= std::max(options_->get<int>("maxi-batch") / 5, 1)) {
{
std::unique_lock<std::mutex> lock(loadMutex_);
loadReady_ = false;
@@ -400,7 +400,6 @@ public:
std::cerr << std::endl;
}
}

};

class CorpusIterator;

@@ -1,7 +1,7 @@
#include <random>

#include "data/corpus_nbest.h"
#include "common/utils.h"
#include "data/corpus_nbest.h"

namespace marian {
namespace data {
@@ -10,8 +10,8 @@ CorpusNBest::CorpusNBest(Ptr<Config> options, bool translate /*= false*/)
: CorpusBase(options, translate) {}

CorpusNBest::CorpusNBest(std::vector<std::string> paths,
std::vector<Ptr<Vocab>> vocabs,
Ptr<Config> options)
std::vector<Ptr<Vocab>> vocabs,
Ptr<Config> options)
: CorpusBase(paths, vocabs, options) {}

int numFromNbest(const std::string& line) {
@@ -19,7 +19,8 @@ int numFromNbest(const std::string& line) {
Split(line, fields, " ||| ", true);
ABORT_IF(fields.size() < 4,
"Too few fields ({}) in line \"{}\", is this a correct n-best list?",
fields.size(), line);
fields.size(),
line);
return std::stoi(fields[0]);
}

@@ -28,7 +29,8 @@ std::string lineFromNbest(const std::string& line) {
Split(line, fields, " ||| ", true);
ABORT_IF(fields.size() < 4,
"Too few fields ({}) in line \"{}\", is this a correct n-best list?",
fields.size(), line);
fields.size(),
line);
return fields[1];
}

@@ -56,7 +58,8 @@ SentenceTuple CorpusNBest::next() {
for(size_t i = 0; i < last; ++i) {
if(curr_num > lastNum_) {
ABORT_IF(!std::getline((std::istream&)*files_[i], lastLines_[i]),
"Too few lines in input {}", i);
"Too few lines in input {}",
i);
}
addWordsToSentenceTuple(lastLines_[i], i, tup);
}
@@ -90,6 +93,5 @@ void CorpusNBest::reset() {
files_.emplace_back(new InputFileStream(path));
}
}

}
}

@@ -154,6 +154,5 @@ void CorpusSQLite::restore(Ptr<TrainingState> ts) {
reset();
}
}

}
}

@@ -19,7 +19,6 @@
#include <SQLiteCpp/SQLiteCpp.h>
#include <SQLiteCpp/sqlite3/sqlite3.h>

static void SQLiteRandomSeed(sqlite3_context* context,
int argc,
sqlite3_value** argv) {

@@ -138,8 +138,9 @@ void Vocab::create(const std::string& vocabPath, const std::string& trainPath) {
"Specified vocab directory {} does not exist",
dir);

ABORT_IF(!dir.empty() && !(boost::filesystem::status(dir).permissions()
& boost::filesystem::owner_write),
ABORT_IF(!dir.empty()
&& !(boost::filesystem::status(dir).permissions()
& boost::filesystem::owner_write),
"No write permission in vocab directory {}",
dir);

@@ -32,7 +32,5 @@ struct Array {
data_[i] = val;
}
};

}

}

@@ -4,93 +4,90 @@
#include "functional/operands.h"

namespace marian {
namespace functional {
namespace functional {

namespace float2unsigned {
constexpr float abs(float x) { return x < 0 ? -x : x; }
namespace float2unsigned {
constexpr float abs(float x) {
return x < 0 ? -x : x;
}

constexpr int exponent(float x) {
return abs(x) >= 2 ? exponent(x / 2) + 1 :
abs(x) < 1 ? exponent(x * 2) - 1 :
0;
}
// clang-format off
constexpr int exponent(float x) {
return abs(x) >= 2 ? exponent(x / 2) + 1 :
abs(x) < 1 ? exponent(x * 2) - 1 :
0;
}

constexpr float scalbn(float value, int exponent) {
return exponent == 0 ? value :
exponent > 0 ? scalbn(value * 2, exponent - 1) :
scalbn(value / 2, exponent + 1);
}
constexpr float scalbn(float value, int exponent) {
return exponent == 0 ? value :
exponent > 0 ? scalbn(value * 2, exponent - 1) :
scalbn(value / 2, exponent + 1);
}
// clang-format on

constexpr unsigned mantissa(float x, int exp) {
// remove hidden 1 and bias the exponent to get integer
return abs(x) < std::numeric_limits<float>::infinity() ?
scalbn(scalbn(abs(x), -exp) - 1, 23) : 0;
}
constexpr unsigned mantissa(float x, int exp) {
// remove hidden 1 and bias the exponent to get integer
return abs(x) < std::numeric_limits<float>::infinity()
? scalbn(scalbn(abs(x), -exp) - 1, 23)
: 0;
}

constexpr unsigned to_binary(float x, unsigned sign, int exp) {
return sign * (1u << 31)
+ (exp + 127) * (1u << 23)
+ mantissa(x, exp);
}
constexpr unsigned to_binary(float x, unsigned sign, int exp) {
return sign * (1u << 31) + (exp + 127) * (1u << 23) + mantissa(x, exp);
}

constexpr unsigned to_binary(float x) {
return x == 0 ? 0 : to_binary(x, x < 0, exponent(x));
}
constexpr unsigned to_binary(float x) {
return x == 0 ? 0 : to_binary(x, x < 0, exponent(x));
}
}

}
namespace unsigned2float {

namespace unsigned2float {
constexpr float sign(unsigned i) {
return (i & (1u << 31)) ? -1.f : 1.f;
}

constexpr float sign(unsigned i) {
return (i & (1u << 31)) ? -1.f : 1.f;
}
constexpr int exponent(unsigned i) {
return int((i >> 23) & 255u) - 127;
}

constexpr int exponent(unsigned i) {
return int((i >> 23) & 255u) - 127;
}
constexpr float sig(unsigned i, unsigned shift) {
return ((i >> shift) & 1u) * 1.f / (1u << (23 - shift))
+ (shift > 0 ? sig(i, shift - 1) : 0);
}

constexpr float sig(unsigned i, unsigned shift) {
return ((i >> shift) & 1u) * 1.f / (1u << (23 - shift))
+ (shift > 0 ? sig(i, shift - 1) : 0);
}
constexpr float powr(int exp) {
return exp > 0 ? 2.f * powr(exp - 1) : 1.f;
}

constexpr float powr(int exp) {
return exp > 0 ? 2.f * powr(exp - 1) : 1.f;
}
constexpr float pow(int exp) {
return exp < 0 ? 1.f / powr(-exp) : powr(exp);
}

constexpr float pow(int exp) {
return exp < 0 ? 1.f / powr(-exp) : powr(exp);
}
constexpr float from_binary(unsigned i) {
return (1.f + sig(i, 22u)) * pow(exponent(i)) * sign(i);
}
}

constexpr float from_binary(unsigned i) {
return (1.f + sig(i, 22u))
* pow(exponent(i))
* sign(i);
}
}
constexpr unsigned f2i(float x) {
return float2unsigned::to_binary(x);
}

constexpr unsigned f2i(float x) {
return float2unsigned::to_binary(x);
}
constexpr float i2f(float x) {
return unsigned2float::from_binary(x);
}

constexpr float i2f(float x) {
return unsigned2float::from_binary(x);
}

template <unsigned V>
struct F {
static constexpr auto value = i2f(V);
static constexpr auto binary = V;

template <typename ...Args>
__HDI__ constexpr float operator()(Args&&... args) const {
return value;
}

std::string to_string() {
return "F<" + std::to_string(value) + ">";
}
};
template <unsigned V>
struct F {
static constexpr auto value = i2f(V);
static constexpr auto binary = V;

template <typename... Args>
__HDI__ constexpr float operator()(Args&&... args) const {
return value;
}
}

std::string to_string() { return "F<" + std::to_string(value) + ">"; }
};
}
}

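As a sanity check (an illustration added here, not from the commit): to_binary/from_binary above reimplement the IEEE-754 single-precision encoding so it can be evaluated in constexpr context; a plain memcpy-based reference produces the same bit pattern for representable values.

#include <cstdio>
#include <cstring>

// Non-constexpr reference conversion via memcpy, for comparison with the
// constexpr to_binary()/from_binary() pair in the header above.
unsigned floatBits(float x) {
  unsigned u;
  std::memcpy(&u, &x, sizeof(u));
  return u;
}

int main() {
  // 3.5f = 1.75 * 2^1: sign 0, biased exponent 128, mantissa 0.75 * 2^23,
  // i.e. 0x40600000 on an IEEE-754 platform.
  std::printf("%08x\n", floatBits(3.5f));
  return 0;
}
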
@@ -4,25 +4,25 @@
#include "functional/predicates.h"

namespace marian {
namespace functional {
namespace functional {

template <int N>
using ref = Assignee<N>;
template <int N>
using ref = Assignee<N>;

static ref<1> _1;
static ref<2> _2;
static ref<3> _3;
static ref<4> _4;
static ref<5> _5;
static ref<6> _6;
static ref<7> _7;
static ref<8> _8;
static ref<9> _9;
static ref<1> _1;
static ref<2> _2;
static ref<3> _3;
static ref<4> _4;
static ref<5> _5;
static ref<6> _6;
static ref<7> _7;
static ref<8> _8;
static ref<9> _9;

static C<0> _0c;
static C<1> _1c;
static C<2> _2c;
static C<-1> _1cneg;
static C<-2> _2cneg;
}
static C<0> _0c;
static C<1> _1c;
static C<2> _2c;
static C<-1> _1cneg;
static C<-2> _2cneg;
}
}

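A hypothetical usage sketch (not part of the diff): _1 through _9 are Assignee placeholders, so composing them with the overloaded operators from predicates.h builds an expression-template functor that is later evaluated element-wise. This assumes the headers above compile on the host with __HDI__ expanding to nothing.

#include "functional/functional.h"

using namespace marian::functional;

float demo() {
  float a = 1.f, b = 3.f;
  auto f = _1 + _2 * _2;  // builds a nested BinaryFunctor expression tree
  return f(a, b);         // evaluates element-wise to 1 + 3 * 3 = 10
}
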
@@ -5,71 +5,69 @@
#include "functional/defs.h"

namespace marian {
namespace functional {
namespace functional {

template <class C>
using IsClass = typename std::enable_if<std::is_class<C>::value, C>::type;

template <int N>
struct Select {
template <typename T, typename ...Args>
__HDI__ static auto apply(T&& arg, Args&&... args) -> decltype(Select<N-1>::apply(args...)) {
return Select<N-1>::apply(args...);
}
};

template <>
struct Select<0> {
template <typename T, typename ...Args>
__HDI__ static T apply(T&& arg, Args&&... args) {
return arg;
}
};

/******************************************************************************/

template <int V>
struct C {
static constexpr auto value = V;

template <typename ...Args>
__HDI__ float operator()(Args&&... args) { return V; }

std::string to_string() {
return "C<" + std::to_string(V) + ">";
}
};

/******************************************************************************/

struct Capture {
float value;

Capture(float val) : value(val) {};

template <typename ...Args>
__HDI__ float operator()(Args&&... args) { return value; }

std::string to_string() {
return "Cap(" + std::to_string(value) + ")";
}
};

/******************************************************************************/

template <int N>
struct Var {
static constexpr auto index = N;

template <typename ...Args>
__HDI__ float& operator()(Args&&... args) {
return Select<N-1>::apply(args...);
}

std::string to_string() {
return "Var<" + std::to_string(N) + ">";
}
};
template <class C>
using IsClass = typename std::enable_if<std::is_class<C>::value, C>::type;

template <int N>
struct Select {
template <typename T, typename... Args>
__HDI__ static auto apply(T&& arg, Args&&... args)
-> decltype(Select<N - 1>::apply(args...)) {
return Select<N - 1>::apply(args...);
}
};

template <>
struct Select<0> {
template <typename T, typename... Args>
__HDI__ static T apply(T&& arg, Args&&... args) {
return arg;
}
};

/******************************************************************************/

template <int V>
struct C {
static constexpr auto value = V;

template <typename... Args>
__HDI__ float operator()(Args&&... args) {
return V;
}

std::string to_string() { return "C<" + std::to_string(V) + ">"; }
};

/******************************************************************************/

struct Capture {
float value;

Capture(float val) : value(val){};

template <typename... Args>
__HDI__ float operator()(Args&&... args) {
return value;
}

std::string to_string() { return "Cap(" + std::to_string(value) + ")"; }
};

/******************************************************************************/

template <int N>
struct Var {
static constexpr auto index = N;

template <typename... Args>
__HDI__ float& operator()(Args&&... args) {
return Select<N - 1>::apply(args...);
}

std::string to_string() { return "Var<" + std::to_string(N) + ">"; }
};
}
}

@@ -4,237 +4,230 @@
#include "functional/operands.h"

namespace marian {
namespace functional {
namespace functional {

template <typename Function, typename X>
struct UnaryFunctor {
X x;
template <typename Function, typename X>
struct UnaryFunctor {
X x;

template <class Arg>
UnaryFunctor(Arg a) : x(a) {}
template <class Arg>
UnaryFunctor(Arg a) : x(a) {}

template <typename ...Args>
__HDI__ float operator()(Args&&... args) {
return Function::apply(x(args...));
}
template <typename... Args>
__HDI__ float operator()(Args&&... args) {
return Function::apply(x(args...));
}

std::string to_string() {
return Function::n() + "<" + x.to_string() + ">";
}
};
std::string to_string() { return Function::n() + "<" + x.to_string() + ">"; }
};

template <class Function, class X, class Y>
struct BinaryFunctor {
X x;
Y y;
template <class Function, class X, class Y>
struct BinaryFunctor {
X x;
Y y;

template <class Arg1, class Arg2>
BinaryFunctor(Arg1 arg1, Arg2 arg2) : x(arg1), y(arg2) {}
template <class Arg1, class Arg2>
BinaryFunctor(Arg1 arg1, Arg2 arg2) : x(arg1), y(arg2) {}

template <typename ...Args>
__HDI__ float operator()(Args&&... args) {
return Function::apply(x(args...), y(args...));
}
template <typename... Args>
__HDI__ float operator()(Args&&... args) {
return Function::apply(x(args...), y(args...));
}

std::string to_string() {
return Function::n() +
"<" + x.to_string() +
"," + y.to_string() + ">";
}
};
std::string to_string() {
return Function::n() + "<" + x.to_string() + "," + y.to_string() + ">";
}
};

#define UNARY(name, name2, func) \
namespace elem { \
struct name { \
__HDI__ static float apply(float x) { return func; } \
static std::string n() { return #name; }\
}; \
}\
template <class X> using name = UnaryFunctor<elem::name, X>;\
template <typename X>\
name<IsClass<X>> name2(X x) {\
return name<X>(x);\
}\
static name<Capture> name2(Capture x) {\
return name<Capture>(x);\
}
#define UNARY(name, name2, func) \
namespace elem { \
struct name { \
__HDI__ static float apply(float x) { return func; } \
static std::string n() { return #name; } \
}; \
} \
template <class X> \
using name = UnaryFunctor<elem::name, X>; \
template <typename X> \
name<IsClass<X>> name2(X x) { \
return name<X>(x); \
} \
static name<Capture> name2(Capture x) { return name<Capture>(x); }

#define BINARY(name, name2, func) \
namespace elem { \
struct name { \
__HDI__ static float apply(float x, float y) { return func; } \
static std::string n() { return #name; }\
}; \
}\
template <class X, class Y> using name = BinaryFunctor<elem::name, X, Y>;\
template <class X, class Y>\
name<IsClass<X>, IsClass<Y>> name2(X x, Y y) {\
return name<X, Y>(x, y);\
}\
template <class Y>\
name<Capture, IsClass<Y>> name2(Capture x, Y y) {\
return name<Capture, Y>(x, y);\
}\
template <class X>\
name<IsClass<X>, Capture> name2(X x, Capture y) {\
return name<X, Capture>(x, y);\
}
#define BINARY(name, name2, func) \
namespace elem { \
struct name { \
__HDI__ static float apply(float x, float y) { return func; } \
static std::string n() { return #name; } \
}; \
} \
template <class X, class Y> \
using name = BinaryFunctor<elem::name, X, Y>; \
template <class X, class Y> \
name<IsClass<X>, IsClass<Y>> name2(X x, Y y) { \
return name<X, Y>(x, y); \
} \
template <class Y> \
name<Capture, IsClass<Y>> name2(Capture x, Y y) { \
return name<Capture, Y>(x, y); \
} \
template <class X> \
name<IsClass<X>, Capture> name2(X x, Capture y) { \
return name<X, Capture>(x, y); \
}

UNARY(Tanh, tanh, tanhf(x));
UNARY(Sin, sin, sinf(x));
UNARY(Cos, cos, cosf(x));
UNARY(Tan, tan, tanf(x));
UNARY(Log, log, logf(x));
UNARY(Exp, exp, expf(x));
UNARY(Abs, abs, fabs(x));
UNARY(Sqrt, sqrt, sqrtf(x));
UNARY(Neg, operator-, -x);
UNARY(Logit, logit, x > 0 ? (1.f / (1.f + expf(-x))) : (expf(x) / (1.f + expf(x))));
UNARY(Tanh, tanh, tanhf(x));
UNARY(Sin, sin, sinf(x));
UNARY(Cos, cos, cosf(x));
UNARY(Tan, tan, tanf(x));
UNARY(Log, log, logf(x));
UNARY(Exp, exp, expf(x));
UNARY(Abs, abs, fabs(x));
UNARY(Sqrt, sqrt, sqrtf(x));
UNARY(Neg, operator-, -x);
UNARY(Logit,
logit,
x > 0 ? (1.f / (1.f + expf(-x))) : (expf(x) / (1.f + expf(x))));

BINARY(Plus, operator+, x + y);
BINARY(Minus, operator-, x - y);
BINARY(Mult, operator*, x * y);
BINARY(Div, operator/, x / y);
BINARY(Plus, operator+, x + y);
BINARY(Minus, operator-, x - y);
BINARY(Mult, operator*, x* y);
BINARY(Div, operator/, x / y);

UNARY(Negate, operator!, !x);
BINARY(Eq, operator==, x == y);
BINARY(NEq, operator!=, x != y);
BINARY(Gt, operator>, x > y);
BINARY(Lt, operator<, x < y);
BINARY(Geq, operator>=, x >= y);
BINARY(Leq, operator<=, x <= y);
BINARY(And, operator&&, x && y);
BINARY(Or, operator||, x || y);
UNARY(Negate, operator!, !x);
BINARY(Eq, operator==, x == y);
BINARY(NEq, operator!=, x != y);
BINARY(Gt, operator>, x> y);
BINARY(Lt, operator<, x<y);
BINARY(Geq, operator>=, x >= y);
BINARY(Leq, operator<=, x <= y);
BINARY(And, operator&&, x&& y);
BINARY(Or, operator||, x || y);

template <typename T>
__HDI__ T sgn(T val) {
return (float(0) < val) - (val < float(0));
}
template <typename T>
__HDI__ T sgn(T val) {
return (float(0) < val) - (val < float(0));
}

UNARY(Sgn, sgn, sgn(x));
UNARY(Sgn, sgn, sgn(x));

BINARY(Pow, pow, pow(x, y));
BINARY(Pow, pow, pow(x, y));

BINARY(Clip, clip, fabs(x) >= y ? sgn(x) * y : x);
BINARY(Clip, clip, fabs(x) >= y ? sgn(x) * y : x);

UNARY(sReLU, ReLU, x > 0.f ? x : 0.f);
UNARY(sReLUBack, ReLUback, x > 0.f ? 1.f : 0.f);
BINARY(sPReLU, PReLU, x > 0.f ? x : x * y);
BINARY(sPReLUBack, PReLUback, x > 0.f ? 1.f : y);
UNARY(sReLU, ReLU, x > 0.f ? x : 0.f);
UNARY(sReLUBack, ReLUback, x > 0.f ? 1.f : 0.f);
BINARY(sPReLU, PReLU, x > 0.f ? x : x * y);
BINARY(sPReLUBack, PReLUback, x > 0.f ? 1.f : y);

template <class Function, class X, class Y, class Z>
struct TernaryFunctor {
X x;
Y y;
Z z;
template <class Function, class X, class Y, class Z>
struct TernaryFunctor {
X x;
Y y;
Z z;

template <class Arg1, class Arg2, class Arg3>
TernaryFunctor(Arg1 arg1, Arg2 arg2, Arg3 arg3)
: x(arg1), y(arg2), z(arg3) {}
template <class Arg1, class Arg2, class Arg3>
TernaryFunctor(Arg1 arg1, Arg2 arg2, Arg3 arg3) : x(arg1), y(arg2), z(arg3) {}

template <typename ...Args>
__HDI__ float operator()(Args&&... args) {
return Function::apply(x(args...), y(args...), z(args...));
}
};
template <typename... Args>
__HDI__ float operator()(Args&&... args) {
return Function::apply(x(args...), y(args...), z(args...));
}
};

#define TERNARY(name, name2, func) \
namespace elem { \
struct name { \
__HDI__ static float apply(float x, float y, float z) { return func; } \
}; \
}\
template <class X, class Y, class Z> using name = TernaryFunctor<elem::name, X, Y, Z>;\
template <typename X, typename Y, typename Z>\
name<IsClass<X>, IsClass<Y>, IsClass<Z>> name2(X x, Y y, Z z) {\
return name<X, Y, Z>(x, y, z);\
}\
template <typename X, typename Z>\
name<IsClass<X>, Capture, IsClass<Z>> name2(X x, Capture y, Z z) {\
return name<X, Capture, Z>(x, y, z);\
}\
template <typename Y, typename Z>\
name<Capture, IsClass<Y>, IsClass<Z>> name2(Capture x, Y y, Z z) {\
return name<Capture, Y, Z>(x, y, z);\
}\
template <typename X>\
name<IsClass<X>, Capture, Capture> name2(X x, Capture y, Capture z) {\
return name<X, Capture, Capture>(x, y, z);\
}\
template <typename Y>\
name<Capture, IsClass<Y>, Capture> name2(Capture x, Y y, Capture z) {\
return name<Capture, Y, Capture>(x, y, z);\
}\
template <typename Z>\
name<Capture, Capture, IsClass<Z>> name2(Capture x, Capture y, Z z) {\
return name<Capture, Capture, Z>(x, y, z);\
}
#define TERNARY(name, name2, func) \
namespace elem { \
struct name { \
__HDI__ static float apply(float x, float y, float z) { return func; } \
}; \
} \
template <class X, class Y, class Z> \
using name = TernaryFunctor<elem::name, X, Y, Z>; \
template <typename X, typename Y, typename Z> \
name<IsClass<X>, IsClass<Y>, IsClass<Z>> name2(X x, Y y, Z z) { \
return name<X, Y, Z>(x, y, z); \
} \
template <typename X, typename Z> \
name<IsClass<X>, Capture, IsClass<Z>> name2(X x, Capture y, Z z) { \
return name<X, Capture, Z>(x, y, z); \
} \
template <typename Y, typename Z> \
name<Capture, IsClass<Y>, IsClass<Z>> name2(Capture x, Y y, Z z) { \
return name<Capture, Y, Z>(x, y, z); \
} \
template <typename X> \
name<IsClass<X>, Capture, Capture> name2(X x, Capture y, Capture z) { \
return name<X, Capture, Capture>(x, y, z); \
} \
template <typename Y> \
name<Capture, IsClass<Y>, Capture> name2(Capture x, Y y, Capture z) { \
return name<Capture, Y, Capture>(x, y, z); \
} \
template <typename Z> \
name<Capture, Capture, IsClass<Z>> name2(Capture x, Capture y, Z z) { \
return name<Capture, Capture, Z>(x, y, z); \
}

TERNARY(IfThenElse, if_then_else, x ? y : z);
TERNARY(IfThenElse, if_then_else, x ? y : z);

template <class X, class Y>
struct Assign {
X x;
Y y;

template <class Arg1, class Arg2>
Assign(Arg1 arg1, Arg2 arg2) : x(arg1), y(arg2) {}

template <class X, class Y>
struct Assign {
X x;
Y y;
template <typename... Args>
__HDI__ float operator()(Args&&... args) {
return x(args...) = y(args...);
}
};

template <class Arg1, class Arg2>
Assign(Arg1 arg1, Arg2 arg2) : x(arg1), y(arg2) {}
template <int N>
struct Assignee {
Var<N> var;

template <typename ...Args>
__HDI__ float operator()(Args&&... args) {
return x(args...) = y(args...);
}
};
Assignee() {}
Assignee(Var<N> v) : var(v) {}

template <int N>
struct Assignee {
Var<N> var;
template <typename... Args>
__HDI__ float& operator()(Args&&... args) {
return var(args...);
}

Assignee() {}
Assignee(Var<N> v) : var(v) {}
template <class X>
Assign<Var<N>, IsClass<X>> operator=(X x) {
return Assign<Var<N>, X>(var, x);
}

template <typename ...Args>
__HDI__ float& operator()(Args&&... args) {
return var(args...);
}
Assign<Var<N>, Capture> operator=(Capture x) {
return Assign<Var<N>, Capture>(var, x);
}

template <class X>
Assign<Var<N>, IsClass<X>> operator=(X x) {
return Assign<Var<N>, X>(var, x);
}
template <class X>
auto operator+=(X x) -> decltype(*this = *this + x) {
return *this = *this + x;
}

Assign<Var<N>, Capture> operator=(Capture x) {
return Assign<Var<N>, Capture>(var, x);
}
template <class X>
auto operator-=(X x) -> decltype(*this = *this - x) {
return *this = *this - x;
}

template <class X>
auto operator+=(X x)->decltype(*this = *this + x) {
return *this = *this + x;
}
template <class X>
auto operator*=(X x) -> decltype(*this = *this * x) {
return *this = *this * x;
}

template <class X>
auto operator-=(X x)->decltype(*this = *this - x) {
return *this = *this - x;
}
template <class X>
auto operator/=(X x) -> decltype(*this = *this / x) {
return *this = *this / x;
}

template <class X>
auto operator*=(X x)->decltype(*this = *this * x) {
return *this = *this * x;
}

template <class X>
auto operator/=(X x)->decltype(*this = *this / x) {
return *this = *this / x;
}

std::string to_string() {
return var.to_string();
}
};
std::string to_string() { return var.to_string(); }
};

/******************************************************************************/

}
}
}

@@ -17,7 +17,6 @@ namespace functional {
* @brief Represents the size of each dimension in a tensor.
*/

template <const int N>
struct ConstantShape {
Array<int, N> shape_;
@@ -32,10 +31,10 @@ struct ConstantShape {
}

__HD__ ConstantShape(const ConstantShape& shape)
: shape_(shape.shape_),
stride_(shape.stride_),
bstride_(shape.bstride_),
elements_(shape.elements_) {}
: shape_(shape.shape_),
stride_(shape.stride_),
bstride_(shape.bstride_),
elements_(shape.elements_) {}

ConstantShape(const Shape& shape) {
size_t filled = shape.size();
@@ -43,7 +42,8 @@ struct ConstantShape {
ABORT_IF(filled > N,
"Recompile with CONST_SHAPE_DIMS >= " + std::to_string(filled));

std::copy(shape.shape_.begin(), shape.shape_.end(), shape_.begin() + N - filled);
std::copy(
shape.shape_.begin(), shape.shape_.end(), shape_.begin() + N - filled);
if(N - filled)
std::fill_n(shape_.begin(), N - filled, 1);
updateStrides();
@@ -51,7 +51,6 @@ struct ConstantShape {
}

__HDI__ void updateStrides() {

stride_[N - 1] = 1;
bstride_[N - 1] = shape_[N - 1] == 1 ? 0 : stride_[N - 1];

@@ -73,7 +72,6 @@ struct ConstantShape {
updateElements();
}

__HDI__ int dim(int i) { return shape_[i]; }

__HDI__ int dim(int i) const {
@@ -92,9 +90,7 @@ struct ConstantShape {

__HDI__ static constexpr size_t size() { return N; }

__HDI__ int elements() const {
return elements_;
}
__HDI__ int elements() const { return elements_; }

__HDI__ int index(const Array<int, N>& d) const {
int i = 0;
@@ -113,7 +109,7 @@ struct ConstantShape {
__HDI__ void dims(int i, Array<int, N>& d) const {
for(int j = 0; j < N; ++j)
d[j] = (i / stride_[j]) % shape_[j];
}
}

__HDI__ bool operator==(const ConstantShape& other) const {
for(int i = 0; i < N; ++i)
@@ -128,7 +124,5 @@ struct ConstantShape {
};

typedef ConstantShape<CONST_SHAPE_DIMS> Shape;

}

}

@@ -7,7 +7,7 @@
namespace marian {
namespace functional {

template<typename T>
template <typename T>
struct Tensor {
T* data_;
functional::Shape shape_;
@@ -15,19 +15,20 @@ struct Tensor {
__HD__ Tensor() {}

__HD__ Tensor(T* ptr, const functional::Shape& shape)
: data_(ptr), shape_(shape) {}
: data_(ptr), shape_(shape) {}

__H__ Tensor(marian::Tensor t)
: data_(t->data()), shape_(t->shape()) {}
__H__ Tensor(marian::Tensor t) : data_(t->data()), shape_(t->shape()) {}

__HDI__ float& operator[](size_t i) { return data_[i]; }
__HDI__ const float& operator[](size_t i) const { return data_[i]; }

__HDI__ float& operator[](const functional::Array<int, functional::Shape::size()>& indices) {
__HDI__ float& operator[](
const functional::Array<int, functional::Shape::size()>& indices) {
return data_[shape_.index(indices)];
}

__HDI__ const float& operator[](const functional::Array<int, functional::Shape::size()>& indices) const {
__HDI__ const float& operator[](
const functional::Array<int, functional::Shape::size()>& indices) const {
return data_[shape_.index(indices)];
}

@@ -37,6 +38,5 @@ struct Tensor {
__HDI__ Shape& shape() { return shape_; }
__HDI__ const Shape& shape() const { return shape_; }
};

}
}

@ -12,82 +12,86 @@ struct FApply {};
|
||||
|
||||
template <class Functor>
|
||||
struct FApply<1, Functor> {
|
||||
__HDI__ static float apply(Functor functor,
|
||||
functional::Array<functional::Tensor<float>, 1>& in,
|
||||
const functional::Array<int, 1>& indices) {
|
||||
__HDI__ static float apply(
|
||||
Functor functor,
|
||||
functional::Array<functional::Tensor<float>, 1>& in,
|
||||
const functional::Array<int, 1>& indices) {
|
||||
return functor(in[0][indices[0]]);
|
||||
}
|
||||
|
||||
__HDI__ static float apply(Functor functor,
|
||||
functional::Array<functional::Tensor<float>, 1>& in,
|
||||
int index) {
|
||||
__HDI__ static float apply(
|
||||
Functor functor,
|
||||
functional::Array<functional::Tensor<float>, 1>& in,
|
||||
int index) {
|
||||
return functor(in[0][index]);
|
||||
}
|
||||
};
|
||||
|
||||
template <class Functor>
|
||||
struct FApply<2, Functor> {
|
||||
__HDI__ static float apply(Functor functor,
|
||||
functional::Array<functional::Tensor<float>, 2>& in,
|
||||
const functional::Array<int, 2>& indices) {
|
||||
__HDI__ static float apply(
|
||||
Functor functor,
|
||||
functional::Array<functional::Tensor<float>, 2>& in,
|
||||
const functional::Array<int, 2>& indices) {
|
||||
return functor(in[0][indices[0]], in[1][indices[1]]);
|
||||
}
|
||||
|
||||
__HDI__ static float apply(Functor functor,
|
||||
functional::Array<functional::Tensor<float>, 2>& in,
|
||||
int index) {
|
||||
__HDI__ static float apply(
|
||||
Functor functor,
|
||||
functional::Array<functional::Tensor<float>, 2>& in,
|
||||
int index) {
|
||||
return functor(in[0][index], in[1][index]);
|
||||
}
|
||||
};
|
||||
|
||||
template <class Functor>
|
||||
struct FApply<3, Functor> {
|
||||
__HDI__ static float apply(Functor functor,
|
||||
functional::Array<functional::Tensor<float>, 3>& in,
|
||||
const functional::Array<int, 3>& indices) {
|
||||
__HDI__ static float apply(
|
||||
Functor functor,
|
||||
functional::Array<functional::Tensor<float>, 3>& in,
|
||||
const functional::Array<int, 3>& indices) {
|
||||
return functor(in[0][indices[0]], in[1][indices[1]], in[2][indices[2]]);
|
||||
}
|
||||
|
||||
__HDI__ static float apply(Functor functor,
|
||||
functional::Array<functional::Tensor<float>, 3>& in,
|
||||
int index) {
|
||||
__HDI__ static float apply(
|
||||
Functor functor,
|
||||
functional::Array<functional::Tensor<float>, 3>& in,
|
||||
int index) {
|
||||
return functor(in[0][index], in[1][index], in[2][index]);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
template <class Functor>
|
||||
struct FApply<4, Functor> {
|
||||
__HDI__ static float apply(Functor functor,
|
||||
functional::Array<functional::Tensor<float>, 4>& in,
|
||||
const functional::Array<int, 4>& indices) {
|
||||
__HDI__ static float apply(
|
||||
Functor functor,
|
||||
functional::Array<functional::Tensor<float>, 4>& in,
|
||||
const functional::Array<int, 4>& indices) {
|
||||
return functor(in[0][indices[0]],
|
||||
in[1][indices[1]],
|
||||
in[2][indices[2]],
|
||||
in[3][indices[3]]);
|
||||
}
|
||||
|
||||
__HDI__ static float apply(Functor functor,
|
||||
functional::Array<functional::Tensor<float>, 4>& in,
|
||||
int index) {
|
||||
return functor(in[0][index],
|
||||
in[1][index],
|
||||
in[2][index],
|
||||
in[3][index]);
|
||||
__HDI__ static float apply(
|
||||
Functor functor,
|
||||
functional::Array<functional::Tensor<float>, 4>& in,
|
||||
int index) {
|
||||
return functor(in[0][index], in[1][index], in[2][index], in[3][index]);
|
||||
}
|
||||
};
|
||||
|
||||
template <size_t K, class Functor>
|
||||
__HDI__ float apply(Functor functor,
|
||||
functional::Array<functional::Tensor<float>, K>& in,
|
||||
const functional::Array<int, K>& indices) {
|
||||
__HDI__ float apply(Functor functor,
|
||||
functional::Array<functional::Tensor<float>, K>& in,
|
||||
const functional::Array<int, K>& indices) {
|
||||
return FApply<K, Functor>::apply(functor, in, indices);
|
||||
}
|
||||
|
||||
template <size_t K, class Functor>
|
||||
__HDI__ float apply(Functor functor,
|
||||
functional::Array<functional::Tensor<float>, K>& in,
|
||||
int index) {
|
||||
__HDI__ float apply(Functor functor,
|
||||
functional::Array<functional::Tensor<float>, K>& in,
|
||||
int index) {
|
||||
return FApply<K, Functor>::apply(functor, in, index);
|
||||
}
|
||||
|
||||
@ -96,11 +100,12 @@ template <size_t K, class Functor>
template <size_t n, size_t N, size_t K>
struct Loop {
template <class Functor>
__HDI__ static float result(Functor functor,
functional::Array<functional::Tensor<float>, K>& in,
const functional::Array<int, K>& pAcc,
const functional::Array<int, N>& length,
const functional::Array<int, N>& dim) {
__HDI__ static float result(
Functor functor,
functional::Array<functional::Tensor<float>, K>& in,
const functional::Array<int, K>& pAcc,
const functional::Array<int, N>& length,
const functional::Array<int, N>& dim) {
float sum = 0;
functional::Array<int, K> acc;
for(int i = 0; i < length[N - n]; ++i) {
@ -116,11 +121,12 @@ struct Loop {
template <size_t N, size_t K>
struct Loop<1, N, K> {
template <class Functor>
__HDI__ static float result(Functor functor,
functional::Array<functional::Tensor<float>, K>& in,
const functional::Array<int, K>& pAcc,
const functional::Array<int, N>& length,
const functional::Array<int, N>& dim) {
__HDI__ static float result(
Functor functor,
functional::Array<functional::Tensor<float>, K>& in,
const functional::Array<int, K>& pAcc,
const functional::Array<int, N>& length,
const functional::Array<int, N>& dim) {
float sum = 0;
functional::Array<int, K> acc;
for(int i = 0; i < length[N - 1]; ++i) {
@ -141,6 +147,5 @@ __HDI__ float loops(Functor functor,
functional::Array<int, K> acc = {0};
return Loop<N, N, K>::result(functor, in, acc, length, dim);
}

}
}
}
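The FApply and Loop templates above implement elementwise application of a functor over K input tensors, with the arity resolved at compile time through one FApply specialization per K. A minimal standalone sketch of the same dispatch pattern follows; it uses std::array and raw float pointers instead of functional::Tensor, and all names here are illustrative only, not the marian API.

#include <array>
#include <cstdio>

// Simplified analogue of FApply<K, Functor>: one specialization per arity,
// each forwarding element `index` of every input to the functor.
template <size_t K, class Functor>
struct FApply;

template <class Functor>
struct FApply<2, Functor> {
  static float apply(Functor f, std::array<const float*, 2>& in, int index) {
    return f(in[0][index], in[1][index]);
  }
};

template <class Functor>
struct FApply<3, Functor> {
  static float apply(Functor f, std::array<const float*, 3>& in, int index) {
    return f(in[0][index], in[1][index], in[2][index]);
  }
};

// Free function that dispatches on K at compile time, as in the header above.
template <size_t K, class Functor>
float apply(Functor f, std::array<const float*, K>& in, int index) {
  return FApply<K, Functor>::apply(f, in, index);
}

int main() {
  float a[] = {1.f, 2.f, 3.f};
  float b[] = {10.f, 20.f, 30.f};
  std::array<const float*, 2> in = {a, b};
  auto add = [](float x, float y) { return x + y; };
  for(int i = 0; i < 3; ++i)
    std::printf("%f\n", apply<2>(add, in, i));  // prints 11, 22, 33
  return 0;
}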
@ -1,5 +1,5 @@
#include <sstream>
#include "graph/expression_graph.h"
#include <sstream>

#include "tensors/tensor_operators.h"

@ -18,15 +18,12 @@ void ExpressionGraph::setDevice(DeviceId deviceId) {
}

Expr ExpressionGraph::dropout(float prob, const Shape& shape) {
return Expression<ConstantNode>(shared_from_this(),
shape,
[prob, this](Tensor t) {
Dropout(t, prob);
});
return Expression<ConstantNode>(
shared_from_this(), shape, [prob, this](Tensor t) { Dropout(t, prob); });
}

void ExpressionGraph::checkNan(Tensor t) {
ABORT_IF(throwNaN_, "Not implemented");
//ABORT_IF(throwNaN_ && IsNan(t), "Tensor has NaN");
// ABORT_IF(throwNaN_ && IsNan(t), "Tensor has NaN");
}
}
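The dropout change above keeps the existing idea: a constant node is built with a NodeInitializer callback that fills its tensor (here via Dropout) when the node is first used. Below is a stripped-down standalone sketch of that lazy-initialization pattern, assuming a plain std::vector in place of Tensor; the class and function names are made up for illustration and are not the marian classes.

#include <functional>
#include <iostream>
#include <vector>

using Tensor = std::vector<float>;
using NodeInitializer = std::function<void(Tensor&)>;

// Simplified constant node: remembers its initializer and runs it lazily.
class ConstantNode {
  Tensor value_;
  NodeInitializer init_;
  bool initialized_ = false;

public:
  ConstantNode(size_t size, NodeInitializer init)
      : value_(size), init_(std::move(init)) {}

  const Tensor& val() {
    if(!initialized_) {  // initialize on first access, as graph nodes do
      init_(value_);
      initialized_ = true;
    }
    return value_;
  }
};

int main() {
  float prob = 0.5f;
  // Analogue of: Expression<ConstantNode>(graph, shape,
  //                 [prob](Tensor t) { Dropout(t, prob); });
  ConstantNode mask(4, [prob](Tensor& t) {
    for(auto& v : t)
      v = prob;  // stand-in for a real dropout fill
  });
  for(float v : mask.val())
    std::cout << v << " ";
  std::cout << "\n";
  return 0;
}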
@ -215,7 +215,9 @@ public:
ABORT_IF(shape != p->shape(),
"Requested shape {} for existing parameter '{}' does not match "
"original shape {}",
shape, name, p->shape());
shape,
name,
p->shape());

p->setTrainable(!fixed);
add(p);
@ -239,10 +241,8 @@ public:
return p;
}

Expr constant(const Shape& shape,
const NodeInitializer& init) {
return Expression<ConstantNode>(
shared_from_this(), shape, init);
Expr constant(const Shape& shape, const NodeInitializer& init) {
return Expression<ConstantNode>(shared_from_this(), shape, init);
}

Expr ones(const Shape& shape) {
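The parameter hunk above shows the reuse path of ExpressionGraph::param: an existing parameter is returned by name, but only if the requested shape matches, otherwise the call aborts. A toy stand-in for that check, with invented names and a std::map registry, might look as follows.

#include <cassert>
#include <map>
#include <memory>
#include <string>
#include <vector>

// Toy stand-in for the shape check done when a named parameter already exists.
struct Param {
  std::vector<int> shape;
  bool trainable = true;
};

class ParamStore {
  std::map<std::string, std::shared_ptr<Param>> params_;

public:
  std::shared_ptr<Param> param(const std::string& name,
                               const std::vector<int>& shape,
                               bool fixed = false) {
    auto it = params_.find(name);
    if(it != params_.end()) {
      // Same role as ABORT_IF(shape != p->shape(), ...) in the graph code.
      assert(it->second->shape == shape && "shape mismatch for existing parameter");
      it->second->trainable = !fixed;
      return it->second;
    }
    auto p = std::make_shared<Param>();
    p->shape = shape;
    p->trainable = !fixed;
    params_[name] = p;
    return p;
  }
};

int main() {
  ParamStore graph;
  auto w1 = graph.param("Wemb", {32000, 512});
  auto w2 = graph.param("Wemb", {32000, 512});  // reused, same object
  return w1 == w2 ? 0 : 1;
}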
@ -126,7 +126,6 @@ Expr repeat(Expr a, size_t repeats, keywords::axis_k ax) {
|
||||
return concatenate(std::vector<Expr>(repeats, a), ax);
|
||||
}
|
||||
|
||||
|
||||
Expr reshape(Expr a, Shape shape) {
|
||||
return Expression<ReshapeNodeOp>(a, shape);
|
||||
}
|
||||
@ -165,10 +164,7 @@ Expr flatten(Expr a) {
|
||||
}
|
||||
|
||||
Expr flatten_2d(Expr a) {
|
||||
Shape shape = {
|
||||
a->shape().elements() / a->shape()[-1],
|
||||
a->shape()[-1]
|
||||
};
|
||||
Shape shape = {a->shape().elements() / a->shape()[-1], a->shape()[-1]};
|
||||
|
||||
return Expression<ReshapeNodeOp>(a, shape);
|
||||
}
|
||||
@ -232,17 +228,16 @@ Expr step(Expr a, int step, int axis) {
|
||||
}
|
||||
|
||||
Expr cross_entropy(Expr a, Expr b) {
|
||||
//auto sOrig = a->shape();
|
||||
//auto sOut = a->shape();
|
||||
//Shape sTemp({sOrig[0] * sOrig[2] * sOrig[3], sOrig[1], 1, 1});
|
||||
//sOut.set(1, 1);
|
||||
//return reshape(Expression<CrossEntropyNodeOp>(reshape(a, sTemp), b), sOut);
|
||||
// auto sOrig = a->shape();
|
||||
// auto sOut = a->shape();
|
||||
// Shape sTemp({sOrig[0] * sOrig[2] * sOrig[3], sOrig[1], 1, 1});
|
||||
// sOut.set(1, 1);
|
||||
// return reshape(Expression<CrossEntropyNodeOp>(reshape(a, sTemp), b), sOut);
|
||||
|
||||
return Expression<CrossEntropyNodeOp>(a, b);
|
||||
}
|
||||
|
||||
Expr affine(Expr a, Expr b, Expr c,
|
||||
bool transA, bool transB, float scalar) {
|
||||
Expr affine(Expr a, Expr b, Expr c, bool transA, bool transB, float scalar) {
|
||||
std::vector<Expr> nodes = {a, b, c};
|
||||
return Expression<AffineNodeOp>(nodes, transA, transB, scalar);
|
||||
}
|
||||
@ -299,6 +294,7 @@ Expr highway(Expr y, Expr x, Expr t) {
|
||||
}
|
||||
|
||||
Expr highway(const std::string prefix, Expr x) {
|
||||
// clang-format off
|
||||
size_t outDim = x->shape()[-1];
|
||||
auto g = mlp::dense(x->graph())
|
||||
("prefix", prefix + "_highway_d1")
|
||||
@ -311,6 +307,7 @@ Expr highway(const std::string prefix, Expr x) {
|
||||
("activation", mlp::act::ReLU)
|
||||
.construct()->apply(x);
|
||||
return (g * relued) + ((1 - g) * x);
|
||||
// clang-format on
|
||||
}
|
||||
|
||||
// Expr batch_norm(Expr x, Expr gamma, Expr beta) {
|
||||
@ -334,41 +331,26 @@ Expr shift(Expr a, Shape shift) {
|
||||
|
||||
#ifdef CUDA_FOUND
|
||||
|
||||
Expr avg_pooling(
|
||||
Expr x,
|
||||
int height,
|
||||
int width,
|
||||
int padHeight,
|
||||
int padWidth,
|
||||
int strideHeight,
|
||||
int strideWidth) {
|
||||
return Expression<PoolingOp>(x,
|
||||
height,
|
||||
width,
|
||||
padHeight,
|
||||
padWidth,
|
||||
strideHeight,
|
||||
strideWidth,
|
||||
"avg");
|
||||
Expr avg_pooling(Expr x,
|
||||
int height,
|
||||
int width,
|
||||
int padHeight,
|
||||
int padWidth,
|
||||
int strideHeight,
|
||||
int strideWidth) {
|
||||
return Expression<PoolingOp>(
|
||||
x, height, width, padHeight, padWidth, strideHeight, strideWidth, "avg");
|
||||
}
|
||||
|
||||
Expr max_pooling(
|
||||
Expr x,
|
||||
int height,
|
||||
int width,
|
||||
int padHeight,
|
||||
int padWidth,
|
||||
int strideHeight,
|
||||
int strideWidth)
|
||||
{
|
||||
return Expression<PoolingOp>(x,
|
||||
height,
|
||||
width,
|
||||
padHeight,
|
||||
padWidth,
|
||||
strideHeight,
|
||||
strideWidth,
|
||||
"max");
|
||||
Expr max_pooling(Expr x,
|
||||
int height,
|
||||
int width,
|
||||
int padHeight,
|
||||
int padWidth,
|
||||
int strideHeight,
|
||||
int strideWidth) {
|
||||
return Expression<PoolingOp>(
|
||||
x, height, width, padHeight, padWidth, strideHeight, strideWidth, "max");
|
||||
}
|
||||
|
||||
Expr convert2cudnnFormat(Expr x) {
|
||||
@ -377,13 +359,13 @@ Expr convert2cudnnFormat(Expr x) {
|
||||
int embSize = x->shape()[2];
|
||||
|
||||
std::vector<size_t> newIndeces;
|
||||
for (int b = 0; b < numExamples; ++b) {
|
||||
for (int t = 0; t < numWords; ++t) {
|
||||
for(int b = 0; b < numExamples; ++b) {
|
||||
for(int t = 0; t < numWords; ++t) {
|
||||
newIndeces.push_back((t * numExamples) + b);
|
||||
}
|
||||
}
|
||||
|
||||
auto xRows = reshape(x, {x->shape()[0] * x ->shape()[1], x->shape()[2]});
|
||||
auto xRows = reshape(x, {x->shape()[0] * x->shape()[1], x->shape()[2]});
|
||||
|
||||
Shape outShape({numExamples, 1, numWords, embSize});
|
||||
return reshape(rows(xRows, newIndeces), outShape);
|
||||
@ -397,8 +379,8 @@ Expr convertFromcudnnFormat(Expr x) {
|
||||
auto reshapedX = reshape(x, {batchDim * sentenceDim, embSize});
|
||||
|
||||
std::vector<size_t> newIndeces;
|
||||
for (int t = 0; t < sentenceDim; ++t) {
|
||||
for (int b = 0; b < batchDim; ++b) {
|
||||
for(int t = 0; t < sentenceDim; ++t) {
|
||||
for(int b = 0; b < batchDim; ++b) {
|
||||
newIndeces.push_back(b * sentenceDim + t);
|
||||
}
|
||||
}
|
||||
@ -412,5 +394,4 @@ Expr pooling_with_masking(Expr x, Expr mask, int width, bool isEven) {
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
}
|
||||
|
@ -110,7 +110,6 @@ Expr mean(Expr a, keywords::axis_k ax = 0);
|
||||
|
||||
Expr cross_entropy(Expr a, Expr b);
|
||||
|
||||
|
||||
Expr scalar_product(Expr a, Expr b, keywords::axis_k ax = 0);
|
||||
|
||||
Expr weighted_average(Expr in, Expr weights, keywords::axis_k ax = 0);
|
||||
@ -161,6 +160,5 @@ Expr max_pooling(Expr x,
|
||||
int strideHeight = 1,
|
||||
int strideWidth = 1);
|
||||
|
||||
Expr pooling_with_masking(Expr x, Expr mask, int width, bool isEven=false);
|
||||
|
||||
Expr pooling_with_masking(Expr x, Expr mask, int width, bool isEven = false);
|
||||
}
|
||||
|
@ -1,4 +1,5 @@
|
||||
#include "tensors/backend.h"
|
||||
|
||||
#include "graph/expression_graph.h"
|
||||
#include "graph/node.h"
|
||||
|
||||
|
@ -33,8 +33,7 @@ protected:
|
||||
|
||||
public:
|
||||
Node(Ptr<ExpressionGraph> graph, Shape shape)
|
||||
: graph_(graph),
|
||||
shape_(shape) {}
|
||||
: graph_(graph), shape_(shape) {}
|
||||
|
||||
virtual ~Node() {
|
||||
if(destroy_) {
|
||||
@ -152,7 +151,7 @@ struct NaryNodeOp : public Node {
|
||||
}
|
||||
|
||||
NaryNodeOp(const std::vector<Expr>& nodes)
|
||||
: NaryNodeOp(nodes, nodes[0]->shape()) {}
|
||||
: NaryNodeOp(nodes, nodes[0]->shape()) {}
|
||||
|
||||
virtual ~NaryNodeOp() {}
|
||||
|
||||
|
@ -109,9 +109,8 @@ void ortho(Tensor t) {
|
||||
|
||||
NodeInitializer from_vector(const std::vector<float>& v) {
|
||||
auto vPtr = New<std::vector<float>>(v.begin(), v.end());
|
||||
return [vPtr](Tensor t) {
|
||||
t->set(vPtr->data(), vPtr->data() + vPtr->size());
|
||||
};
|
||||
return
|
||||
[vPtr](Tensor t) { t->set(vPtr->data(), vPtr->data() + vPtr->size()); };
|
||||
}
|
||||
|
||||
NodeInitializer from_vector(const std::vector<size_t>& v) {
|
||||
@ -138,9 +137,9 @@ NodeInitializer from_numpy(const cnpy::NpyArrayPtr& np) {
|
||||
|
||||
// move this somewhere else
|
||||
NodeInitializer from_word2vec(const std::string& file,
|
||||
int dimVoc,
|
||||
int dimEmb,
|
||||
bool normalize /*= false*/) {
|
||||
int dimVoc,
|
||||
int dimEmb,
|
||||
bool normalize /*= false*/) {
|
||||
return [file, dimVoc, dimEmb, normalize](Tensor t) {
|
||||
auto embs = Word2VecReader().read(file, dimVoc, dimEmb);
|
||||
|
||||
|
@ -70,9 +70,9 @@ NodeInitializer from_sparse_vector(
|
||||
NodeInitializer from_numpy(const cnpy::NpyArrayPtr& np);
|
||||
|
||||
NodeInitializer from_word2vec(const std::string& file,
|
||||
int dimVoc,
|
||||
int dimEmb,
|
||||
bool normalize = false);
|
||||
int dimVoc,
|
||||
int dimEmb,
|
||||
bool normalize = false);
|
||||
}
|
||||
|
||||
} // namespace marian
|
||||
|
@ -7,11 +7,12 @@
|
||||
namespace marian {
|
||||
|
||||
struct ConstantNode : public Node {
|
||||
ConstantNode(Ptr<ExpressionGraph> graph, const Shape& shape, const NodeInitializer& init)
|
||||
ConstantNode(Ptr<ExpressionGraph> graph,
|
||||
const Shape& shape,
|
||||
const NodeInitializer& init)
|
||||
: Node(graph, shape),
|
||||
init_(new NodeInitializer(init)),
|
||||
initialized_(false) {
|
||||
|
||||
setTrainable(false);
|
||||
}
|
||||
|
||||
@ -41,11 +42,13 @@ private:
|
||||
};
|
||||
|
||||
struct ParamNode : public Node {
|
||||
ParamNode(Ptr<ExpressionGraph> graph, const Shape& shape, const NodeInitializer& init, bool fixed = false)
|
||||
ParamNode(Ptr<ExpressionGraph> graph,
|
||||
const Shape& shape,
|
||||
const NodeInitializer& init,
|
||||
bool fixed = false)
|
||||
: Node(graph, shape),
|
||||
init_(new NodeInitializer(init)),
|
||||
initialized_(false) {
|
||||
|
||||
setTrainable(!fixed);
|
||||
}
|
||||
|
||||
|
@ -16,13 +16,8 @@ private:
|
||||
float scalar_;
|
||||
|
||||
public:
|
||||
DotNodeOp(Expr a,
|
||||
Expr b,
|
||||
bool transA,
|
||||
bool transB,
|
||||
float scalar)
|
||||
: NaryNodeOp({a, b},
|
||||
newShape(a, b, transA, transB)),
|
||||
DotNodeOp(Expr a, Expr b, bool transA, bool transB, float scalar)
|
||||
: NaryNodeOp({a, b}, newShape(a, b, transA, transB)),
|
||||
transA_(transA),
|
||||
transB_(transB),
|
||||
scalar_(scalar) {}
|
||||
@ -49,14 +44,13 @@ public:
|
||||
|
||||
NodeOps forwardOps() {
|
||||
// C = alpha * dot(op(A), op(B))
|
||||
return {NodeOp(Prod(
|
||||
val_,
|
||||
child(0)->val(),
|
||||
child(1)->val(),
|
||||
transA_,
|
||||
transB_,
|
||||
0.f,
|
||||
scalar_))};
|
||||
return {NodeOp(Prod(val_,
|
||||
child(0)->val(),
|
||||
child(1)->val(),
|
||||
transA_,
|
||||
transB_,
|
||||
0.f,
|
||||
scalar_))};
|
||||
}
|
||||
|
||||
NodeOps backwardOps() {
|
||||
@ -149,7 +143,7 @@ public:
|
||||
: NaryNodeOp(nodes, newShape(nodes[0], nodes[1], transA, transB)),
|
||||
transA_(transA),
|
||||
transB_(transB),
|
||||
scalar_(scalar){}
|
||||
scalar_(scalar) {}
|
||||
|
||||
Shape newShape(Expr a, Expr b, bool transA, bool transB) {
|
||||
auto shapeA = a->shape();
|
||||
@ -171,19 +165,17 @@ public:
|
||||
return outShape;
|
||||
}
|
||||
|
||||
|
||||
NodeOps forwardOps() {
|
||||
using namespace functional;
|
||||
return {
|
||||
NodeOp(Prod(
|
||||
val_,
|
||||
child(0)->val(),
|
||||
child(1)->val(),
|
||||
transA_,
|
||||
transB_,
|
||||
0.f,
|
||||
scalar_);
|
||||
Add(_1, val_, child(2)->val()))
|
||||
NodeOp(Prod(val_,
|
||||
child(0)->val(),
|
||||
child(1)->val(),
|
||||
transA_,
|
||||
transB_,
|
||||
0.f,
|
||||
scalar_);
|
||||
Add(_1, val_, child(2)->val()))
|
||||
};
|
||||
}
|
||||
|
||||
@ -266,7 +258,6 @@ public:
|
||||
const std::string type() { return "affine"; }
|
||||
};
|
||||
|
||||
|
||||
class DotBatchedNodeOp : public NaryNodeOp {
|
||||
private:
|
||||
bool transA_;
|
||||
@ -274,13 +265,8 @@ private:
|
||||
float scalar_;
|
||||
|
||||
public:
|
||||
DotBatchedNodeOp(Expr a,
|
||||
Expr b,
|
||||
bool transA,
|
||||
bool transB,
|
||||
float scalar)
|
||||
: NaryNodeOp({a, b},
|
||||
newShape(a, b, transA, transB)),
|
||||
DotBatchedNodeOp(Expr a, Expr b, bool transA, bool transB, float scalar)
|
||||
: NaryNodeOp({a, b}, newShape(a, b, transA, transB)),
|
||||
transA_(transA),
|
||||
transB_(transB),
|
||||
scalar_(scalar) {}
|
||||
@ -307,14 +293,13 @@ public:
|
||||
|
||||
NodeOps forwardOps() {
|
||||
// C = alpha * dot(op(A), op(B))
|
||||
return {NodeOp(ProdBatched(
|
||||
val_,
|
||||
child(0)->val(),
|
||||
child(1)->val(),
|
||||
transA_,
|
||||
transB_,
|
||||
0.f,
|
||||
scalar_))};
|
||||
return {NodeOp(ProdBatched(val_,
|
||||
child(0)->val(),
|
||||
child(1)->val(),
|
||||
transA_,
|
||||
transB_,
|
||||
0.f,
|
||||
scalar_))};
|
||||
}
|
||||
|
||||
NodeOps backwardOps() {
|
||||
@ -325,71 +310,67 @@ public:
|
||||
// to sum gradients from different graph parts
|
||||
|
||||
if(!transA_ && transB_)
|
||||
return {
|
||||
NodeOp(ProdBatched(child(0)->grad(),
|
||||
adj_,
|
||||
child(1)->val(),
|
||||
false,
|
||||
false,
|
||||
1.0,
|
||||
scalar_)),
|
||||
NodeOp(ProdBatched(child(1)->grad(),
|
||||
adj_,
|
||||
child(0)->val(),
|
||||
true,
|
||||
false,
|
||||
1.0,
|
||||
scalar_))};
|
||||
return {NodeOp(ProdBatched(child(0)->grad(),
|
||||
adj_,
|
||||
child(1)->val(),
|
||||
false,
|
||||
false,
|
||||
1.0,
|
||||
scalar_)),
|
||||
NodeOp(ProdBatched(child(1)->grad(),
|
||||
adj_,
|
||||
child(0)->val(),
|
||||
true,
|
||||
false,
|
||||
1.0,
|
||||
scalar_))};
|
||||
|
||||
if(transA_ && !transB_)
|
||||
return {
|
||||
NodeOp(ProdBatched(child(0)->grad(),
|
||||
child(1)->val(),
|
||||
adj_,
|
||||
false,
|
||||
true,
|
||||
1.0,
|
||||
scalar_)),
|
||||
NodeOp(ProdBatched(child(1)->grad(),
|
||||
child(0)->val(),
|
||||
adj_,
|
||||
false,
|
||||
false,
|
||||
1.0,
|
||||
scalar_))};
|
||||
return {NodeOp(ProdBatched(child(0)->grad(),
|
||||
child(1)->val(),
|
||||
adj_,
|
||||
false,
|
||||
true,
|
||||
1.0,
|
||||
scalar_)),
|
||||
NodeOp(ProdBatched(child(1)->grad(),
|
||||
child(0)->val(),
|
||||
adj_,
|
||||
false,
|
||||
false,
|
||||
1.0,
|
||||
scalar_))};
|
||||
|
||||
if(transA_ && transB_)
|
||||
return {
|
||||
NodeOp(ProdBatched(child(0)->grad(),
|
||||
child(1)->val(),
|
||||
adj_,
|
||||
true,
|
||||
true,
|
||||
1.0,
|
||||
scalar_)),
|
||||
NodeOp(ProdBatched(child(1)->grad(),
|
||||
adj_,
|
||||
child(0)->val(),
|
||||
true,
|
||||
true,
|
||||
1.0,
|
||||
scalar_))};
|
||||
return {NodeOp(ProdBatched(child(0)->grad(),
|
||||
child(1)->val(),
|
||||
adj_,
|
||||
true,
|
||||
true,
|
||||
1.0,
|
||||
scalar_)),
|
||||
NodeOp(ProdBatched(child(1)->grad(),
|
||||
adj_,
|
||||
child(0)->val(),
|
||||
true,
|
||||
true,
|
||||
1.0,
|
||||
scalar_))};
|
||||
|
||||
return {
|
||||
NodeOp(ProdBatched(child(0)->grad(),
|
||||
adj_,
|
||||
child(1)->val(),
|
||||
false,
|
||||
true,
|
||||
1.0,
|
||||
scalar_)),
|
||||
NodeOp(ProdBatched(child(1)->grad(),
|
||||
child(0)->val(),
|
||||
adj_,
|
||||
true,
|
||||
false,
|
||||
1.0,
|
||||
scalar_))};
|
||||
return {NodeOp(ProdBatched(child(0)->grad(),
|
||||
adj_,
|
||||
child(1)->val(),
|
||||
false,
|
||||
true,
|
||||
1.0,
|
||||
scalar_)),
|
||||
NodeOp(ProdBatched(child(1)->grad(),
|
||||
child(0)->val(),
|
||||
adj_,
|
||||
true,
|
||||
false,
|
||||
1.0,
|
||||
scalar_))};
|
||||
}
|
||||
|
||||
const std::string type() { return "•"; }
|
||||
@ -400,8 +381,7 @@ public:
|
||||
struct ScalarProductNodeOp : public NaryNodeOp {
|
||||
template <typename... Args>
|
||||
ScalarProductNodeOp(Expr a, Expr b, Args... args)
|
||||
: NaryNodeOp({a, b}, newShape(a, b, args...)) {
|
||||
}
|
||||
: NaryNodeOp({a, b}, newShape(a, b, args...)) {}
|
||||
|
||||
template <typename... Args>
|
||||
Shape newShape(Expr a, Expr b, Args... args) {
|
||||
@ -433,12 +413,9 @@ struct ScalarProductNodeOp : public NaryNodeOp {
|
||||
};
|
||||
|
||||
struct ElementBinaryNodeOp : public NaryNodeOp {
|
||||
ElementBinaryNodeOp(Expr a, Expr b)
|
||||
: NaryNodeOp({a, b}, newShape(a, b)) {}
|
||||
ElementBinaryNodeOp(Expr a, Expr b) : NaryNodeOp({a, b}, newShape(a, b)) {}
|
||||
|
||||
Shape newShape(Expr a, Expr b) {
|
||||
return Shape::broadcast({a, b});
|
||||
}
|
||||
Shape newShape(Expr a, Expr b) { return Shape::broadcast({a, b}); }
|
||||
|
||||
const std::string color() { return "yellow"; }
|
||||
};
|
||||
@ -553,8 +530,7 @@ struct DivNodeOp : public ElementBinaryNodeOp {
|
||||
|
||||
// Cross-entropy node. It computes -b*log(softmax(a)), summing rowwise.
|
||||
struct CrossEntropyNodeOp : public NaryNodeOp {
|
||||
CrossEntropyNodeOp(Expr a, Expr b)
|
||||
: NaryNodeOp({a, b}, newShape(a)) {}
|
||||
CrossEntropyNodeOp(Expr a, Expr b) : NaryNodeOp({a, b}, newShape(a)) {}
|
||||
|
||||
Shape newShape(Expr a) {
|
||||
Shape shape1 = a->shape();
|
||||
@ -578,7 +554,9 @@ struct CrossEntropyNodeOp : public NaryNodeOp {
|
||||
struct ConcatenateNodeOp : public NaryNodeOp {
|
||||
template <typename... Args>
|
||||
ConcatenateNodeOp(const std::vector<Expr>& nodes, Args... args)
|
||||
: NaryNodeOp(nodes, newShape(nodes, keywords::Get(keywords::axis, 0, args...))) {}
|
||||
: NaryNodeOp(nodes,
|
||||
newShape(nodes, keywords::Get(keywords::axis, 0, args...))) {
|
||||
}
|
||||
|
||||
Shape newShape(const std::vector<Expr>& nodes, int ax) {
|
||||
Shape shape = nodes.back()->shape();
|
||||
@ -730,38 +708,33 @@ struct HighwayNodeOp : public NaryNodeOp {
|
||||
|
||||
class ConvolutionOp : public NaryNodeOp {
|
||||
public:
|
||||
ConvolutionOp(
|
||||
const std::vector<Expr>& nodes,
|
||||
int hPad = 0,
|
||||
int wPad = 0,
|
||||
int hStride = 1,
|
||||
int wStride = 1)
|
||||
: NaryNodeOp(nodes),
|
||||
conv_(nodes[1]->shape(),
|
||||
nodes[2]->shape(),
|
||||
hPad,
|
||||
wPad,
|
||||
hStride,
|
||||
wStride) {
|
||||
ConvolutionOp(const std::vector<Expr>& nodes,
|
||||
int hPad = 0,
|
||||
int wPad = 0,
|
||||
int hStride = 1,
|
||||
int wStride = 1)
|
||||
: NaryNodeOp(nodes),
|
||||
conv_(nodes[1]->shape(),
|
||||
nodes[2]->shape(),
|
||||
hPad,
|
||||
wPad,
|
||||
hStride,
|
||||
wStride) {
|
||||
conv_.getOutputShape(nodes[0]->shape(), shape_);
|
||||
}
|
||||
|
||||
NodeOps forwardOps() {
|
||||
return {NodeOp(conv_.forward(
|
||||
child(0)->val(),
|
||||
child(1)->val(),
|
||||
child(2)->val(),
|
||||
val_))};
|
||||
child(0)->val(), child(1)->val(), child(2)->val(), val_))};
|
||||
}
|
||||
|
||||
NodeOps backwardOps() {
|
||||
return {NodeOp(conv_.backward(
|
||||
child(0)->val(),
|
||||
child(0)->grad(),
|
||||
child(1)->val(),
|
||||
child(1)->grad(),
|
||||
child(2)->grad(),
|
||||
adj_))};
|
||||
return {NodeOp(conv_.backward(child(0)->val(),
|
||||
child(0)->grad(),
|
||||
child(1)->val(),
|
||||
child(1)->grad(),
|
||||
child(2)->grad(),
|
||||
adj_))};
|
||||
}
|
||||
|
||||
const std::string type() { return "layer_convolution"; }
|
||||
@ -769,5 +742,4 @@ public:
|
||||
protected:
|
||||
ConvolutionWrapper conv_;
|
||||
};
|
||||
|
||||
}
|
||||
|
@ -12,11 +12,9 @@
|
||||
namespace marian {
|
||||
|
||||
struct UnaryNodeOp : public NaryNodeOp {
|
||||
UnaryNodeOp(Expr a, Shape shape)
|
||||
: NaryNodeOp({a}, shape) {}
|
||||
UnaryNodeOp(Expr a, Shape shape) : NaryNodeOp({a}, shape) {}
|
||||
|
||||
UnaryNodeOp(Expr a)
|
||||
: NaryNodeOp({a}, a->shape()) {}
|
||||
UnaryNodeOp(Expr a) : NaryNodeOp({a}, a->shape()) {}
|
||||
|
||||
const std::string color() { return "yellow"; }
|
||||
};
|
||||
@ -26,9 +24,7 @@ private:
|
||||
float scalar_{0};
|
||||
|
||||
public:
|
||||
ScalarAddNodeOp(Expr a, float scalar)
|
||||
: UnaryNodeOp(a),
|
||||
scalar_{scalar} {}
|
||||
ScalarAddNodeOp(Expr a, float scalar) : UnaryNodeOp(a), scalar_{scalar} {}
|
||||
|
||||
NodeOps forwardOps() {
|
||||
using namespace functional;
|
||||
@ -67,8 +63,7 @@ private:
|
||||
float scalar_{0};
|
||||
|
||||
public:
|
||||
ScalarMultNodeOp(Expr a, float scalar)
|
||||
: UnaryNodeOp(a), scalar_{scalar} {}
|
||||
ScalarMultNodeOp(Expr a, float scalar) : UnaryNodeOp(a), scalar_{scalar} {}
|
||||
|
||||
NodeOps forwardOps() {
|
||||
using namespace functional;
|
||||
@ -210,7 +205,6 @@ struct TanhNodeOp : public NaryNodeOp {
|
||||
const std::string type() { return "tanh"; }
|
||||
};
|
||||
|
||||
|
||||
struct ReLUNodeOp : public UnaryNodeOp {
|
||||
ReLUNodeOp(Expr a) : UnaryNodeOp(a) {}
|
||||
|
||||
@ -262,8 +256,7 @@ struct ReLUNodeOp : public UnaryNodeOp {
|
||||
* \f]
|
||||
*/
|
||||
struct PReLUNodeOp : public UnaryNodeOp {
|
||||
PReLUNodeOp(float alpha, Expr a)
|
||||
: UnaryNodeOp(a), alpha_(alpha) {}
|
||||
PReLUNodeOp(float alpha, Expr a) : UnaryNodeOp(a), alpha_(alpha) {}
|
||||
|
||||
NodeOps forwardOps() {
|
||||
using namespace functional;
|
||||
@ -334,11 +327,9 @@ struct SwishNodeOp : public UnaryNodeOp {
|
||||
};
|
||||
|
||||
struct SoftmaxNodeOp : public UnaryNodeOp {
|
||||
SoftmaxNodeOp(Expr a)
|
||||
: UnaryNodeOp(a), mask_(nullptr) {}
|
||||
SoftmaxNodeOp(Expr a) : UnaryNodeOp(a), mask_(nullptr) {}
|
||||
|
||||
SoftmaxNodeOp(Expr a, Expr mask)
|
||||
: UnaryNodeOp(a), mask_(mask) {}
|
||||
SoftmaxNodeOp(Expr a, Expr mask) : UnaryNodeOp(a), mask_(mask) {}
|
||||
|
||||
Expr mask_;
|
||||
|
||||
@ -407,17 +398,18 @@ struct SumNodeOp : public UnaryNodeOp {
|
||||
int ax_;
|
||||
|
||||
template <typename... Args>
|
||||
SumNodeOp(Expr a, Args... args)
|
||||
: UnaryNodeOp(a, newShape(a, args...)) {}
|
||||
SumNodeOp(Expr a, Args... args) : UnaryNodeOp(a, newShape(a, args...)) {}
|
||||
|
||||
NodeOps forwardOps() {
|
||||
using namespace functional;
|
||||
|
||||
return {NodeOp(Reduce(_1, val_, child(0)->val()))}; }
|
||||
return {NodeOp(Reduce(_1, val_, child(0)->val()))};
|
||||
}
|
||||
|
||||
NodeOps backwardOps() {
|
||||
using namespace functional;
|
||||
return {NodeOp(Add(_1, child(0)->grad(), adj_))}; }
|
||||
return {NodeOp(Add(_1, child(0)->grad(), adj_))};
|
||||
}
|
||||
|
||||
template <class... Args>
|
||||
Shape newShape(Expr a, Args... args) {
|
||||
@ -456,8 +448,7 @@ struct MeanNodeOp : public UnaryNodeOp {
|
||||
int ax_;
|
||||
|
||||
template <typename... Args>
|
||||
MeanNodeOp(Expr a, Args... args)
|
||||
: UnaryNodeOp(a, newShape(a, args...)) {}
|
||||
MeanNodeOp(Expr a, Args... args) : UnaryNodeOp(a, newShape(a, args...)) {}
|
||||
|
||||
NodeOps forwardOps() {
|
||||
using namespace functional;
|
||||
@ -543,8 +534,7 @@ struct ExpNodeOp : public UnaryNodeOp {
|
||||
struct SqrtNodeOp : public UnaryNodeOp {
|
||||
float epsilon_;
|
||||
|
||||
SqrtNodeOp(Expr a, float epsilon)
|
||||
: UnaryNodeOp(a), epsilon_(epsilon) {}
|
||||
SqrtNodeOp(Expr a, float epsilon) : UnaryNodeOp(a), epsilon_(epsilon) {}
|
||||
|
||||
NodeOps forwardOps() {
|
||||
using namespace functional;
|
||||
@ -614,8 +604,7 @@ struct NegNodeOp : public UnaryNodeOp {
|
||||
|
||||
struct RowsNodeOp : public UnaryNodeOp {
|
||||
RowsNodeOp(Expr a, const std::vector<size_t>& indeces)
|
||||
: UnaryNodeOp(a, newShape(a, indeces)),
|
||||
indices_(indeces) {}
|
||||
: UnaryNodeOp(a, newShape(a, indeces)), indices_(indeces) {}
|
||||
|
||||
NodeOps forwardOps() {
|
||||
// @TODO: solve this with a tensor!
|
||||
@ -666,8 +655,7 @@ struct RowsNodeOp : public UnaryNodeOp {
|
||||
|
||||
struct ColsNodeOp : public UnaryNodeOp {
|
||||
ColsNodeOp(Expr a, const std::vector<size_t>& indeces)
|
||||
: UnaryNodeOp(a, newShape(a, indeces)),
|
||||
indices_(indeces) {}
|
||||
: UnaryNodeOp(a, newShape(a, indeces)), indices_(indeces) {}
|
||||
|
||||
NodeOps forwardOps() {
|
||||
// @TODO: solve this with a tensor!
|
||||
@ -716,8 +704,7 @@ struct ColsNodeOp : public UnaryNodeOp {
|
||||
|
||||
struct SelectNodeOp : public UnaryNodeOp {
|
||||
SelectNodeOp(Expr a, int axis, const std::vector<size_t>& indeces)
|
||||
: UnaryNodeOp(a, newShape(a, axis, indeces)),
|
||||
indices_(indeces) {}
|
||||
: UnaryNodeOp(a, newShape(a, axis, indeces)), indices_(indeces) {}
|
||||
|
||||
NodeOps forwardOps() {
|
||||
return {NodeOp(
|
||||
@ -772,8 +759,7 @@ struct TransposeNodeOp : public UnaryNodeOp {
|
||||
std::vector<int> axes_;
|
||||
|
||||
TransposeNodeOp(Expr a, const std::vector<int>& axes)
|
||||
: UnaryNodeOp(a, newShape(a, axes)),
|
||||
axes_{axes} {}
|
||||
: UnaryNodeOp(a, newShape(a, axes)), axes_{axes} {}
|
||||
|
||||
NodeOps forwardOps() {
|
||||
return {NodeOp(TransposeND(val_, child(0)->val(), axes_))};
|
||||
@ -788,7 +774,7 @@ struct TransposeNodeOp : public UnaryNodeOp {
|
||||
Shape shape = a->shape();
|
||||
|
||||
ABORT_IF(shape.size() != axes.size(),
|
||||
"Shape and transpose axes have different number of dimensions");
|
||||
"Shape and transpose axes have different number of dimensions");
|
||||
|
||||
for(int i = 0; i < shape.size(); ++i)
|
||||
shape.set(i, a->shape()[axes[i]]);
|
||||
@ -829,8 +815,7 @@ private:
|
||||
|
||||
public:
|
||||
template <typename... Args>
|
||||
ReshapeNodeOp(Expr a, Shape shape)
|
||||
: UnaryNodeOp(a, shape), reshapee_(a) {
|
||||
ReshapeNodeOp(Expr a, Shape shape) : UnaryNodeOp(a, shape), reshapee_(a) {
|
||||
Node::destroy_ = false;
|
||||
}
|
||||
|
||||
@ -894,9 +879,7 @@ private:
|
||||
|
||||
public:
|
||||
StepNodeOp(Expr a, int step, int axis)
|
||||
: UnaryNodeOp(a, newShape(a, axis)),
|
||||
stepNode_(a),
|
||||
step_(step) {
|
||||
: UnaryNodeOp(a, newShape(a, axis)), stepNode_(a), step_(step) {
|
||||
Node::destroy_ = false;
|
||||
}
|
||||
|
||||
@ -1056,67 +1039,54 @@ public:
|
||||
padWidth,
|
||||
strideHeight,
|
||||
strideWidth,
|
||||
mode) {
|
||||
}
|
||||
mode) {}
|
||||
|
||||
NodeOps forwardOps() {
|
||||
return {NodeOp(pooling_.forward(child(0)->val(), val_))};
|
||||
}
|
||||
|
||||
NodeOps backwardOps() {
|
||||
return {NodeOp(pooling_.backward(
|
||||
child(0)->val(),
|
||||
child(0)->grad(),
|
||||
val_,
|
||||
adj_))};
|
||||
return {NodeOp(
|
||||
pooling_.backward(child(0)->val(), child(0)->grad(), val_, adj_))};
|
||||
}
|
||||
|
||||
const std::string type() { return "layer_pooling"; }
|
||||
|
||||
|
||||
protected:
|
||||
PoolingWrapper pooling_;
|
||||
};
|
||||
|
||||
class PoolingWithMaskingOp : public UnaryNodeOp {
|
||||
public:
|
||||
PoolingWithMaskingOp( Expr x, Expr mask, int width, bool isEven=false)
|
||||
: UnaryNodeOp(x),
|
||||
mask_(mask),
|
||||
width_(width),
|
||||
isEven_(isEven)
|
||||
{
|
||||
auto xShape = x->shape();
|
||||
int dimBatch = xShape[0];
|
||||
int dimWord = xShape[1];
|
||||
int cols = (isEven_) ? xShape[2] - 1 : xShape[2];
|
||||
int dimSentence = (cols / width_) + (cols % width_ != 0);
|
||||
shape_ = {dimBatch, dimWord, dimSentence};
|
||||
}
|
||||
public:
|
||||
PoolingWithMaskingOp(Expr x, Expr mask, int width, bool isEven = false)
|
||||
: UnaryNodeOp(x), mask_(mask), width_(width), isEven_(isEven) {
|
||||
auto xShape = x->shape();
|
||||
int dimBatch = xShape[0];
|
||||
int dimWord = xShape[1];
|
||||
int cols = (isEven_) ? xShape[2] - 1 : xShape[2];
|
||||
int dimSentence = (cols / width_) + (cols % width_ != 0);
|
||||
shape_ = {dimBatch, dimWord, dimSentence};
|
||||
}
|
||||
|
||||
NodeOps forwardOps() {
|
||||
return {NodeOp(PoolingWithMaskingForward(val_,
|
||||
NodeOps forwardOps() {
|
||||
return {NodeOp(PoolingWithMaskingForward(
|
||||
val_, child(0)->val(), mask_->val(), width_, isEven_))};
|
||||
}
|
||||
|
||||
NodeOps backwardOps() {
|
||||
return {NodeOp(PoolingWithMaskingBackward(adj_,
|
||||
child(0)->grad(),
|
||||
child(0)->val(),
|
||||
mask_->val(),
|
||||
width_,
|
||||
isEven_))};
|
||||
}
|
||||
}
|
||||
|
||||
NodeOps backwardOps() {
|
||||
return {NodeOp(PoolingWithMaskingBackward(adj_,
|
||||
child(0)->grad(),
|
||||
child(0)->val(),
|
||||
mask_->val(),
|
||||
width_,
|
||||
isEven_))};
|
||||
}
|
||||
const std::string type() { return "layer_pooling"; }
|
||||
|
||||
const std::string type() {return "layer_pooling";}
|
||||
|
||||
protected:
|
||||
Expr mask_;
|
||||
int width_;
|
||||
bool isEven_;
|
||||
protected:
|
||||
Expr mask_;
|
||||
int width_;
|
||||
bool isEven_;
|
||||
};
|
||||
|
||||
}
|
||||
|
@ -23,18 +23,18 @@ struct LayerFactory : public Factory {
|
||||
return as<Cast>() != nullptr;
|
||||
}
|
||||
|
||||
virtual Ptr<Layer> construct() = 0;
|
||||
virtual Ptr<Layer> construct() = 0;
|
||||
};
|
||||
|
||||
class DenseFactory : public LayerFactory {
|
||||
protected:
|
||||
//std::vector<std::pair<std::string, std::string>> tiedParams_;
|
||||
// std::vector<std::pair<std::string, std::string>> tiedParams_;
|
||||
std::vector<std::pair<std::string, std::string>> tiedParamsTransposed_;
|
||||
|
||||
public:
|
||||
DenseFactory(Ptr<ExpressionGraph> graph) : LayerFactory(graph) {}
|
||||
|
||||
//Accumulator<DenseFactory> tie(const std::string& param,
|
||||
// Accumulator<DenseFactory> tie(const std::string& param,
|
||||
// const std::string& tied) {
|
||||
// tiedParams_.push_back({param, tied});
|
||||
// return Accumulator<DenseFactory>(*this);
|
||||
@ -48,21 +48,20 @@ public:
|
||||
|
||||
Ptr<Layer> construct() {
|
||||
auto dense = New<Dense>(graph_, options_);
|
||||
//for(auto& p : tiedParams_)
|
||||
// for(auto& p : tiedParams_)
|
||||
// dense->tie(p.first, p.second);
|
||||
for(auto& p : tiedParamsTransposed_)
|
||||
dense->tie_transposed(p.first, p.second);
|
||||
return dense;
|
||||
}
|
||||
|
||||
|
||||
DenseFactory clone() {
|
||||
DenseFactory aClone(graph_);
|
||||
aClone.options_->merge(options_);
|
||||
//aClone.tiedParams_ = tiedParams_;
|
||||
// aClone.tiedParams_ = tiedParams_;
|
||||
aClone.tiedParamsTransposed_ = tiedParamsTransposed_;
|
||||
return aClone;
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
typedef Accumulator<DenseFactory> dense;
|
||||
@ -95,7 +94,6 @@ public:
|
||||
}
|
||||
|
||||
void push_back(Ptr<Layer> layer) { layers_.push_back(layer); }
|
||||
|
||||
};
|
||||
|
||||
class MLPFactory : public Factory {
|
||||
@ -121,7 +119,7 @@ public:
|
||||
layers_.push_back(New<LF>(lf));
|
||||
return Accumulator<MLPFactory>(*this);
|
||||
}
|
||||
|
||||
|
||||
MLPFactory clone() {
|
||||
MLPFactory aClone(graph_);
|
||||
aClone.options_->merge(options_);
|
||||
@ -129,7 +127,6 @@ public:
|
||||
aClone.push_back(lf->clone());
|
||||
return aClone;
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
typedef Accumulator<MLPFactory> mlp;
|
||||
|
@ -2,8 +2,7 @@
|
||||
#include "graph/node_operators_binary.h"
|
||||
|
||||
namespace marian {
|
||||
Convolution::Convolution(Ptr<ExpressionGraph> graph)
|
||||
: Factory(graph) {}
|
||||
Convolution::Convolution(Ptr<ExpressionGraph> graph) : Factory(graph) {}
|
||||
|
||||
Expr Convolution::apply(Expr x) {
|
||||
auto prefix = opt<std::string>("prefix");
|
||||
@ -13,28 +12,21 @@ Expr Convolution::apply(Expr x) {
|
||||
auto strides = opt<std::pair<int, int>>("strides", std::make_pair(1, 1));
|
||||
|
||||
int layerIn = x->shape()[1];
|
||||
auto kernel = graph_->param(prefix + "_conv_kernels",
|
||||
{layerIn,
|
||||
kernelNum,
|
||||
kernelDims.first,
|
||||
kernelDims.second},
|
||||
inits::glorot_uniform);
|
||||
auto kernel
|
||||
= graph_->param(prefix + "_conv_kernels",
|
||||
{layerIn, kernelNum, kernelDims.first, kernelDims.second},
|
||||
inits::glorot_uniform);
|
||||
|
||||
auto bias = graph_->param(prefix + "_conv_bias",
|
||||
{1, kernelNum, 1, 1},
|
||||
inits::zeros);
|
||||
auto bias = graph_->param(
|
||||
prefix + "_conv_bias", {1, kernelNum, 1, 1}, inits::zeros);
|
||||
|
||||
std::vector<Expr> nodes = {x, kernel, bias};
|
||||
return Expression<ConvolutionOp>(nodes,
|
||||
paddings.first,
|
||||
paddings.second,
|
||||
strides.first,
|
||||
strides.second);
|
||||
return Expression<ConvolutionOp>(
|
||||
nodes, paddings.first, paddings.second, strides.first, strides.second);
|
||||
}
|
||||
|
||||
Expr Convolution::apply(const std::vector<Expr>&) {
|
||||
ABORT("Can't apply convolution on many inputs at once");
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -33,6 +33,7 @@ public:
|
||||
template <class BaseFactory>
|
||||
class Accumulator : public BaseFactory {
|
||||
typedef BaseFactory Factory;
|
||||
|
||||
public:
|
||||
Accumulator() : Factory(nullptr) {}
|
||||
Accumulator(Ptr<ExpressionGraph> graph) : Factory(graph) {}
|
||||
|
@ -1,6 +1,7 @@
|
||||
#pragma once
|
||||
|
||||
#include "marian.h"
|
||||
|
||||
#include "layers/factory.h"
|
||||
|
||||
namespace marian {
|
||||
@ -75,11 +76,9 @@ public:
|
||||
if(tiedParams_.count(nameW)) {
|
||||
W = tiedParams_[nameW];
|
||||
transposeW = true;
|
||||
}
|
||||
else {
|
||||
W = g->param(name + "_" + nameW,
|
||||
{in->shape()[-1], dim},
|
||||
inits::glorot_uniform);
|
||||
} else {
|
||||
W = g->param(
|
||||
name + "_" + nameW, {in->shape()[-1], dim}, inits::glorot_uniform);
|
||||
}
|
||||
|
||||
Expr b;
|
||||
@ -87,8 +86,7 @@ public:
|
||||
if(tiedParams_.count(nameB))
|
||||
b = tiedParams_[nameB];
|
||||
else
|
||||
b = g->param(
|
||||
name + "_" + nameB, {1, dim}, inits::zeros);
|
||||
b = g->param(name + "_" + nameB, {1, dim}, inits::zeros);
|
||||
|
||||
params_.push_back(W);
|
||||
params_.push_back(b);
|
||||
@ -98,19 +96,19 @@ public:
|
||||
auto ln_s = g->param(name + "_ln_s" + std::to_string(i),
|
||||
{1, dim},
|
||||
inits::from_value(1.f));
|
||||
auto ln_b = g->param(name + "_ln_b" + std::to_string(i),
|
||||
{1, dim},
|
||||
inits::zeros);
|
||||
auto ln_b = g->param(
|
||||
name + "_ln_b" + std::to_string(i), {1, dim}, inits::zeros);
|
||||
|
||||
outputs.push_back(
|
||||
layer_norm(affine(in, W, b, false, transposeW), ln_s, ln_b, NEMATUS_LN_EPS));
|
||||
outputs.push_back(layer_norm(
|
||||
affine(in, W, b, false, transposeW), ln_s, ln_b, NEMATUS_LN_EPS));
|
||||
} else {
|
||||
auto gamma = g->param(name + "_gamma" + std::to_string(i),
|
||||
{1, dim},
|
||||
inits::from_value(1.0));
|
||||
|
||||
params_.push_back(gamma);
|
||||
outputs.push_back(layer_norm(dot(in, W, false, transposeW), gamma, b));
|
||||
outputs.push_back(
|
||||
layer_norm(dot(in, W, false, transposeW), gamma, b));
|
||||
}
|
||||
|
||||
} else {
|
||||
@ -147,11 +145,9 @@ public:
|
||||
if(tiedParams_.count(nameW)) {
|
||||
transposeW = true;
|
||||
W = tiedParams_[nameW];
|
||||
}
|
||||
else {
|
||||
W = g->param(name + "_" + nameW,
|
||||
{input->shape()[-1], dim},
|
||||
inits::glorot_uniform);
|
||||
} else {
|
||||
W = g->param(
|
||||
name + "_" + nameW, {input->shape()[-1], dim}, inits::glorot_uniform);
|
||||
}
|
||||
Expr b;
|
||||
std::string nameB = "b";
|
||||
@ -165,16 +161,14 @@ public:
|
||||
Expr out;
|
||||
if(layerNorm) {
|
||||
if(nematusNorm) {
|
||||
auto ln_s = g->param(
|
||||
name + "_ln_s", {1, dim}, inits::from_value(1.f));
|
||||
auto ln_b
|
||||
= g->param(name + "_ln_b", {1, dim}, inits::zeros);
|
||||
auto ln_s = g->param(name + "_ln_s", {1, dim}, inits::from_value(1.f));
|
||||
auto ln_b = g->param(name + "_ln_b", {1, dim}, inits::zeros);
|
||||
|
||||
out = layer_norm(affine(input, W, b, false, transposeW),
|
||||
ln_s, ln_b, NEMATUS_LN_EPS);
|
||||
out = layer_norm(
|
||||
affine(input, W, b, false, transposeW), ln_s, ln_b, NEMATUS_LN_EPS);
|
||||
} else {
|
||||
auto gamma = g->param(
|
||||
name + "_gamma", {1, dim}, inits::from_value(1.0));
|
||||
auto gamma
|
||||
= g->param(name + "_gamma", {1, dim}, inits::from_value(1.0));
|
||||
|
||||
params_.push_back(gamma);
|
||||
out = layer_norm(dot(input, W, false, transposeW), gamma, b);
|
||||
@ -217,22 +211,18 @@ struct EmbeddingFactory : public Factory {
|
||||
}
|
||||
}
|
||||
|
||||
return graph_->param(name,
|
||||
{dimVoc, dimEmb},
|
||||
initFunc,
|
||||
fixed);
|
||||
return graph_->param(name, {dimVoc, dimEmb}, initFunc, fixed);
|
||||
}
|
||||
};
|
||||
|
||||
typedef Accumulator<EmbeddingFactory> embedding;
|
||||
|
||||
static inline
|
||||
Expr Cost(Expr logits,
|
||||
Expr indices,
|
||||
Expr mask,
|
||||
std::string costType = "cross-entropy",
|
||||
float smoothing = 0,
|
||||
Expr weights = nullptr) {
|
||||
static inline Expr Cost(Expr logits,
|
||||
Expr indices,
|
||||
Expr mask,
|
||||
std::string costType = "cross-entropy",
|
||||
float smoothing = 0,
|
||||
Expr weights = nullptr) {
|
||||
using namespace keywords;
|
||||
|
||||
auto ce = cross_entropy(logits, indices);
|
||||
@ -255,15 +245,17 @@ Expr Cost(Expr logits,
|
||||
// axes:
|
||||
// - time axis (words): -3
|
||||
// - batch axis (sentences): -2
|
||||
if(costType == "ce-mean" || costType == "cross-entropy") { // sum over words; average over sentences
|
||||
if(costType == "ce-mean"
|
||||
|| costType
|
||||
== "cross-entropy") { // sum over words; average over sentences
|
||||
cost = mean(costSum, axis = -2);
|
||||
} else if(costType == "ce-mean-words") { // average over target tokens
|
||||
} else if(costType == "ce-mean-words") { // average over target tokens
|
||||
cost = sum(costSum, axis = -2) / sum(sum(mask, axis = -3), axis = -2);
|
||||
} else if(costType == "ce-sum") { // sum over target tokens
|
||||
} else if(costType == "ce-sum") { // sum over target tokens
|
||||
cost = sum(costSum, axis = -2);
|
||||
} else if(costType == "perplexity") { // ==exp('ce-mean-words')
|
||||
} else if(costType == "perplexity") { // ==exp('ce-mean-words')
|
||||
cost = exp(sum(costSum, axis = -2) / sum(sum(mask, axis = -3), axis = -2));
|
||||
} else if(costType == "ce-rescore") { // sum over words, keep batch axis
|
||||
} else if(costType == "ce-rescore") { // sum over words, keep batch axis
|
||||
cost = -costSum;
|
||||
} else { // same as ce-mean
|
||||
cost = mean(costSum, axis = -2);
|
||||
|
@ -4,20 +4,18 @@
|
||||
|
||||
namespace marian {
|
||||
|
||||
static inline
|
||||
Expr guidedAlignmentCost(Ptr<ExpressionGraph> graph,
|
||||
Ptr<data::CorpusBatch> batch,
|
||||
Ptr<Options> options,
|
||||
Expr att) {
|
||||
static inline Expr guidedAlignmentCost(Ptr<ExpressionGraph> graph,
|
||||
Ptr<data::CorpusBatch> batch,
|
||||
Ptr<Options> options,
|
||||
Expr att) {
|
||||
using namespace keywords;
|
||||
|
||||
int dimBatch = att->shape()[0];
|
||||
int dimSrc = att->shape()[2];
|
||||
int dimTrg = att->shape()[3];
|
||||
|
||||
auto aln = graph->constant(
|
||||
{dimBatch, 1, dimSrc, dimTrg},
|
||||
inits::from_vector(batch->getGuidedAlignment()));
|
||||
auto aln = graph->constant({dimBatch, 1, dimSrc, dimTrg},
|
||||
inits::from_vector(batch->getGuidedAlignment()));
|
||||
|
||||
std::string guidedCostType
|
||||
= options->get<std::string>("guided-alignment-cost");
|
||||
|
@ -2,6 +2,7 @@
|
||||
|
||||
#include "marian.h"
|
||||
#include "models/s2s.h"
|
||||
|
||||
#include "layers/convolution.h"
|
||||
|
||||
namespace marian {
|
||||
@ -29,21 +30,18 @@ public:
|
||||
|
||||
int dimEmb = opt<int>("dim-emb");
|
||||
auto convSizes = options_->get<std::vector<int>>("char-conv-filters-num");
|
||||
auto convWidths = options_->get<std::vector<int>>("char-conv-filters-widths");
|
||||
auto convWidths
|
||||
= options_->get<std::vector<int>>("char-conv-filters-widths");
|
||||
int stride = opt<int>("char-stride");
|
||||
int highwayNum = opt<int>("char-highway");
|
||||
|
||||
auto conved = CharConvPooling(
|
||||
prefix_ + "conv_pooling",
|
||||
dimEmb,
|
||||
convWidths,
|
||||
convSizes,
|
||||
stride)
|
||||
(batchEmbeddings, batchMask);
|
||||
prefix_ + "conv_pooling", dimEmb, convWidths, convSizes, stride)(
|
||||
batchEmbeddings, batchMask);
|
||||
|
||||
auto inHighway = conved;
|
||||
for (int i = 0; i < highwayNum; ++i) {
|
||||
inHighway = highway(prefix_ +"_" + std::to_string(i), inHighway);
|
||||
for(int i = 0; i < highwayNum; ++i) {
|
||||
inHighway = highway(prefix_ + "_" + std::to_string(i), inHighway);
|
||||
}
|
||||
|
||||
Expr stridedMask = getStridedMask(graph, batch, stride);
|
||||
@ -52,24 +50,26 @@ public:
|
||||
|
||||
return New<EncoderState>(context, stridedMask, batch);
|
||||
}
|
||||
|
||||
protected:
|
||||
Expr getStridedMask(Ptr<ExpressionGraph> graph, Ptr<data::CorpusBatch> batch,
|
||||
Expr getStridedMask(Ptr<ExpressionGraph> graph,
|
||||
Ptr<data::CorpusBatch> batch,
|
||||
int stride) {
|
||||
auto subBatch = (*batch)[batchIndex_];
|
||||
|
||||
int dimBatch = subBatch->batchSize();
|
||||
|
||||
std::vector<float> strided;
|
||||
for (size_t wordIdx = 0; wordIdx < subBatch->mask().size(); wordIdx += stride * dimBatch) {
|
||||
for (size_t j = wordIdx; j < wordIdx + dimBatch; ++j) {
|
||||
for(size_t wordIdx = 0; wordIdx < subBatch->mask().size();
|
||||
wordIdx += stride * dimBatch) {
|
||||
for(size_t j = wordIdx; j < wordIdx + dimBatch; ++j) {
|
||||
strided.push_back(subBatch->mask()[j]);
|
||||
}
|
||||
}
|
||||
int dimWords = strided.size() / dimBatch;
|
||||
auto stridedMask = graph->constant({dimWords, dimBatch, 1},
|
||||
inits::from_vector(strided));
|
||||
auto stridedMask
|
||||
= graph->constant({dimWords, dimBatch, 1}, inits::from_vector(strided));
|
||||
return stridedMask;
|
||||
}
|
||||
};
|
||||
|
||||
}
|
||||
|
@ -1,6 +1,7 @@
|
||||
#pragma once
|
||||
|
||||
#include "marian.h"
|
||||
|
||||
#include "layers/generic.h"
|
||||
#include "layers/guided_alignment.h"
|
||||
#include "model_base.h"
|
||||
@ -30,8 +31,8 @@ protected:
|
||||
|
||||
auto batchEmbeddings
|
||||
= reshape(chosenEmbeddings, {dimWords, dimBatch, dimEmb});
|
||||
auto batchMask = graph->constant(
|
||||
{dimWords, dimBatch, 1}, inits::from_vector(subBatch->mask()));
|
||||
auto batchMask = graph->constant({dimWords, dimBatch, 1},
|
||||
inits::from_vector(subBatch->mask()));
|
||||
|
||||
return std::make_tuple(batchEmbeddings, batchMask);
|
||||
}
|
||||
@ -129,7 +130,8 @@ public:
|
||||
virtual void selectEmbeddings(Ptr<ExpressionGraph> graph,
|
||||
Ptr<DecoderState> state,
|
||||
const std::vector<size_t>& embIdx,
|
||||
int dimBatch, int dimBeam) {
|
||||
int dimBatch,
|
||||
int dimBeam) {
|
||||
using namespace keywords;
|
||||
|
||||
int dimTrgEmb = opt<int>("dim-emb");
|
||||
@ -149,12 +151,10 @@ public:
|
||||
|
||||
Expr selectedEmbs;
|
||||
if(embIdx.empty()) {
|
||||
selectedEmbs = graph->constant({1, 1, dimBatch, dimTrgEmb},
|
||||
inits::zeros);
|
||||
selectedEmbs = graph->constant({1, 1, dimBatch, dimTrgEmb}, inits::zeros);
|
||||
} else {
|
||||
selectedEmbs = rows(yEmb, embIdx);
|
||||
selectedEmbs
|
||||
= reshape(selectedEmbs, {dimBeam, 1, dimBatch, dimTrgEmb});
|
||||
selectedEmbs = reshape(selectedEmbs, {dimBeam, 1, dimBatch, dimTrgEmb});
|
||||
}
|
||||
state->setTargetEmbeddings(selectedEmbs);
|
||||
}
|
||||
@ -174,14 +174,16 @@ public:
|
||||
virtual void selectEmbeddings(Ptr<ExpressionGraph> graph,
|
||||
Ptr<DecoderState> state,
|
||||
const std::vector<size_t>&,
|
||||
int dimBatch, int beamSize)
|
||||
int dimBatch,
|
||||
int beamSize)
|
||||
= 0;
|
||||
|
||||
virtual Ptr<DecoderState> step(Ptr<ExpressionGraph> graph,
|
||||
Ptr<DecoderState>,
|
||||
const std::vector<size_t>&,
|
||||
const std::vector<size_t>&,
|
||||
int dimBatch, int beamSize)
|
||||
int dimBatch,
|
||||
int beamSize)
|
||||
= 0;
|
||||
|
||||
virtual Ptr<DecoderState> step(Ptr<ExpressionGraph>, Ptr<DecoderState>) = 0;
|
||||
@ -224,7 +226,8 @@ protected:
|
||||
|
||||
decoder["mini-batch"] = opt<size_t>("valid-mini-batch");
|
||||
decoder["maxi-batch"] = opt<size_t>("valid-mini-batch") > 1 ? 100 : 1;
|
||||
decoder["maxi-batch-sort"] = opt<size_t>("valid-mini-batch") > 1 ? "trg" : "none";
|
||||
decoder["maxi-batch-sort"]
|
||||
= opt<size_t>("valid-mini-batch") > 1 ? "trg" : "none";
|
||||
|
||||
decoder["relative-paths"] = false;
|
||||
|
||||
@ -320,8 +323,10 @@ public:
|
||||
Ptr<DecoderState> state,
|
||||
const std::vector<size_t>& hypIndices,
|
||||
const std::vector<size_t>& embIndices,
|
||||
int dimBatch, int beamSize) {
|
||||
auto selectedState = hypIndices.empty() ? state : state->select(hypIndices, beamSize);
|
||||
int dimBatch,
|
||||
int beamSize) {
|
||||
auto selectedState
|
||||
= hypIndices.empty() ? state : state->select(hypIndices, beamSize);
|
||||
selectEmbeddings(graph, selectedState, embIndices, dimBatch, beamSize);
|
||||
selectedState->setSingleStep(true);
|
||||
auto nextState = step(graph, selectedState);
|
||||
@ -332,7 +337,8 @@ public:
|
||||
virtual void selectEmbeddings(Ptr<ExpressionGraph> graph,
|
||||
Ptr<DecoderState> state,
|
||||
const std::vector<size_t>& embIdx,
|
||||
int dimBatch, int beamSize) {
|
||||
int dimBatch,
|
||||
int beamSize) {
|
||||
decoders_[0]->selectEmbeddings(graph, state, embIdx, dimBatch, beamSize);
|
||||
}
|
||||
|
||||
@ -366,17 +372,12 @@ public:
|
||||
int dimBatch = batch->size();
|
||||
int dimWords = sentenceWeighting ? 1 : batch->back()->batchWidth();
|
||||
|
||||
weights = graph->constant(
|
||||
{1, dimWords, dimBatch, 1},
|
||||
inits::from_vector(batch->getDataWeights()));
|
||||
weights = graph->constant({1, dimWords, dimBatch, 1},
|
||||
inits::from_vector(batch->getDataWeights()));
|
||||
}
|
||||
|
||||
auto cost = Cost(nextState->getProbs(),
|
||||
trgIdx,
|
||||
trgMask,
|
||||
costType,
|
||||
ls,
|
||||
weights);
|
||||
auto cost
|
||||
= Cost(nextState->getProbs(), trgIdx, trgMask, costType, ls, weights);
|
||||
|
||||
if(options_->has("guided-alignment") && !inference_) {
|
||||
auto alignments = decoders_[0]->getAlignments();
|
||||
@ -428,7 +429,7 @@ public:
|
||||
|
||||
do {
|
||||
size_t current = (start + end) / 2;
|
||||
//std::cerr << i << " " << current << std::endl;
|
||||
// std::cerr << i << " " << current << std::endl;
|
||||
auto batch = data::CorpusBatch::fakeBatch(lengths, current, options_);
|
||||
build(graph, batch);
|
||||
fits = graph->fits();
|
||||
@ -436,8 +437,7 @@ public:
|
||||
if(fits) {
|
||||
stats->add(batch, multiplier);
|
||||
start = current + 1;
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
end = current - 1;
|
||||
}
|
||||
} while(end - start > step);
|
||||
|
@ -1,9 +1,10 @@
|
||||
#pragma once
|
||||
|
||||
#include "marian.h"
|
||||
|
||||
#include "layers/generic.h"
|
||||
#include "rnn/types.h"
|
||||
#include "rnn/attention_constructors.h"
|
||||
#include "rnn/types.h"
|
||||
|
||||
#include <numeric>
|
||||
|
||||
@ -21,13 +22,16 @@ public:
|
||||
: DecoderState(states, probs, encStates),
|
||||
attentionIndices_(attentionIndices) {}
|
||||
|
||||
virtual Ptr<DecoderState> select(const std::vector<size_t>& selIdx, int beamSize) {
|
||||
virtual Ptr<DecoderState> select(const std::vector<size_t>& selIdx,
|
||||
int beamSize) {
|
||||
std::vector<size_t> selectedAttentionIndices;
|
||||
for(auto i : selIdx)
|
||||
selectedAttentionIndices.push_back(attentionIndices_[i]);
|
||||
|
||||
return New<DecoderStateHardAtt>(
|
||||
states_.select(selIdx, beamSize), probs_, encStates_, selectedAttentionIndices);
|
||||
return New<DecoderStateHardAtt>(states_.select(selIdx, beamSize),
|
||||
probs_,
|
||||
encStates_,
|
||||
selectedAttentionIndices);
|
||||
}
|
||||
|
||||
virtual void setAttentionIndices(
|
||||
@ -85,11 +89,11 @@ public:
|
||||
Expr start;
|
||||
if(!meanContexts.empty()) {
|
||||
// apply single layer network to mean to map into decoder space
|
||||
auto mlp = mlp::mlp(graph) //
|
||||
.push_back(mlp::dense(graph) //
|
||||
("prefix", prefix_ + "_ff_state") //
|
||||
("dim", opt<int>("dim-rnn")) //
|
||||
("activation", (int)mlp::act::tanh)//
|
||||
auto mlp = mlp::mlp(graph) //
|
||||
.push_back(mlp::dense(graph) //
|
||||
("prefix", prefix_ + "_ff_state") //
|
||||
("dim", opt<int>("dim-rnn")) //
|
||||
("activation", (int)mlp::act::tanh) //
|
||||
("layer-normalization",
|
||||
opt<bool>("layer-normalization")));
|
||||
start = mlp->apply(meanContexts);
|
||||
@ -133,7 +137,8 @@ public:
|
||||
int dimBeam = trgEmbeddings->shape()[-4];
|
||||
|
||||
if(dropoutTrg) {
|
||||
trgEmbeddings = dropout(trgEmbeddings, dropoutTrg, {dimTrgWords, dimBatch, 1});
|
||||
trgEmbeddings
|
||||
= dropout(trgEmbeddings, dropoutTrg, {dimTrgWords, dimBatch, 1});
|
||||
}
|
||||
|
||||
auto flatContext = reshape(context, {dimBatch * dimSrcWords, dimContext});
|
||||
|
@ -1,7 +1,7 @@
#pragma once

#include "marian.h"
#include <string>
#include "marian.h"

namespace marian {
namespace models {
@ -1,12 +1,13 @@
#include "marian.h"

#include "models/model_factory.h"

#include "models/amun.h"
#include "models/encdec.h"
#include "models/hardatt.h"
#include "models/nematus.h"
#include "models/s2s.h"
#include "models/transformer.h"
#include "models/hardatt.h"
#include "models/amun.h"
#include "models/nematus.h"
#include "models/encdec.h"

#ifdef CUDNN
#include "models/char_s2s.h"
@ -1,8 +1,9 @@
#pragma once

#include "marian.h"
#include "layers/factory.h"

#include "encdec.h"
#include "layers/factory.h"

namespace marian {
@ -1,9 +1,10 @@
#pragma once

#include "marian.h"

#include "layers/constructors.h"
#include "rnn/constructors.h"
#include "rnn/attention_constructors.h"
#include "rnn/constructors.h"

namespace marian {
@ -48,8 +48,10 @@ public:
|
||||
virtual Expr getProbs() { return probs_; }
|
||||
virtual void setProbs(Expr probs) { probs_ = probs; }
|
||||
|
||||
virtual Ptr<DecoderState> select(const std::vector<size_t>& selIdx, int beamSize) {
|
||||
return New<DecoderState>(states_.select(selIdx, beamSize), probs_, encStates_);
|
||||
virtual Ptr<DecoderState> select(const std::vector<size_t>& selIdx,
|
||||
int beamSize) {
|
||||
return New<DecoderState>(
|
||||
states_.select(selIdx, beamSize), probs_, encStates_);
|
||||
}
|
||||
|
||||
virtual const rnn::States& getStates() { return states_; }
|
||||
|
@ -1,11 +1,12 @@
|
||||
#pragma once
|
||||
|
||||
#include "marian.h"
|
||||
#include "layers/factory.h"
|
||||
|
||||
#include "encdec.h"
|
||||
#include "layers/constructors.h"
|
||||
#include "layers/factory.h"
|
||||
#include "model_base.h"
|
||||
#include "model_factory.h"
|
||||
#include "encdec.h"
|
||||
|
||||
namespace marian {
|
||||
|
||||
@ -34,8 +35,8 @@ public:
|
||||
}
|
||||
|
||||
// shared across batch entries
|
||||
auto signal = graph->constant({dimWords, 1, dimEmb},
|
||||
inits::from_vector(vPos));
|
||||
auto signal
|
||||
= graph->constant({dimWords, 1, dimEmb}, inits::from_vector(vPos));
|
||||
return input + signal;
|
||||
}
|
||||
|
||||
@ -47,15 +48,14 @@ public:
|
||||
for(int i = 0; i < length; ++i)
|
||||
for(int j = 0; j <= i; ++j)
|
||||
vMask[i * length + j] = 1.f;
|
||||
return graph->constant({1, length, length},
|
||||
inits::from_vector(vMask));
|
||||
return graph->constant({1, length, length}, inits::from_vector(vMask));
|
||||
}
|
||||
|
||||
Expr InverseMask(Expr mask) {
|
||||
// convert 0/1 mask to transformer style -inf mask
|
||||
auto ms = mask->shape();
|
||||
mask = (1 - mask) * -99999999.f;
|
||||
return reshape(mask, {ms[-3], 1, ms[-2], ms[-1]}) ;
|
||||
return reshape(mask, {ms[-3], 1, ms[-2], ms[-1]});
|
||||
}
|
||||
|
||||
Expr SplitHeads(Expr input, int dimHeads) {
|
||||
@ -135,18 +135,17 @@ public:
|
||||
if(op == 'h') {
|
||||
auto Wh = graph->param(
|
||||
prefix + "_Wh", {dimModel, dimModel}, inits::glorot_uniform);
|
||||
auto bh
|
||||
= graph->param(prefix + "_bh", {1, dimModel}, inits::zeros);
|
||||
auto bh = graph->param(prefix + "_bh", {1, dimModel}, inits::zeros);
|
||||
|
||||
auto t = affine(prevInput, Wh, bh);
|
||||
output = highway(output, prevInput, t);
|
||||
}
|
||||
// layer normalization
|
||||
if(op == 'n') {
|
||||
auto scale = graph->param(
|
||||
prefix + "_ln_scale", {1, dimModel}, inits::ones);
|
||||
auto bias = graph->param(
|
||||
prefix + "_ln_bias", {1, dimModel}, inits::zeros);
|
||||
auto scale
|
||||
= graph->param(prefix + "_ln_scale", {1, dimModel}, inits::ones);
|
||||
auto bias
|
||||
= graph->param(prefix + "_ln_bias", {1, dimModel}, inits::zeros);
|
||||
output = layer_norm(output, scale, bias, 1e-6);
|
||||
}
|
||||
}
|
||||
@ -219,17 +218,13 @@ public:
|
||||
if(i > 0)
|
||||
prefixProj += "_enc" + std::to_string(i + 1);
|
||||
|
||||
auto Wk = graph->param(prefixProj + "_Wk",
|
||||
{dimModel, dimModel},
|
||||
inits::glorot_uniform);
|
||||
auto bk = graph->param(
|
||||
prefixProj + "_bk", {1, dimModel}, inits::zeros);
|
||||
auto Wk = graph->param(
|
||||
prefixProj + "_Wk", {dimModel, dimModel}, inits::glorot_uniform);
|
||||
auto bk = graph->param(prefixProj + "_bk", {1, dimModel}, inits::zeros);
|
||||
|
||||
auto Wv = graph->param(prefixProj + "_Wv",
|
||||
{dimModel, dimModel},
|
||||
inits::glorot_uniform);
|
||||
auto bv = graph->param(
|
||||
prefixProj + "_bv", {1, dimModel}, inits::zeros);
|
||||
auto Wv = graph->param(
|
||||
prefixProj + "_Wv", {dimModel, dimModel}, inits::glorot_uniform);
|
||||
auto bv = graph->param(prefixProj + "_bv", {1, dimModel}, inits::zeros);
|
||||
|
||||
auto kh = affine(keys[i], Wk, bk);
|
||||
auto vh = affine(values[i], Wv, bv);
|
||||
@ -254,8 +249,8 @@ public:
|
||||
|
||||
int dimAtt = output->shape()[-1];
|
||||
|
||||
auto Wo = graph->param(
|
||||
prefix + "_Wo", {dimAtt, dimOut}, inits::glorot_uniform);
|
||||
auto Wo
|
||||
= graph->param(prefix + "_Wo", {dimAtt, dimOut}, inits::glorot_uniform);
|
||||
auto bo = graph->param(prefix + "_bo", {1, dimOut}, inits::zeros);
|
||||
output = affine(output, Wo, bo);
|
||||
|
||||
@ -468,11 +463,12 @@ public:
|
||||
std::vector<Ptr<EncoderState>> &encStates)
|
||||
: DecoderState(states, probs, encStates) {}
|
||||
|
||||
virtual Ptr<DecoderState> select(const std::vector<size_t> &selIdx, int beamSize) {
|
||||
virtual Ptr<DecoderState> select(const std::vector<size_t> &selIdx,
|
||||
int beamSize) {
|
||||
rnn::States selectedStates;
|
||||
|
||||
int dimDepth = states_[0].output->shape()[-1];
|
||||
int dimTime = states_[0].output->shape()[-2];
|
||||
int dimTime = states_[0].output->shape()[-2];
|
||||
int dimBatch = selIdx.size() / beamSize;
|
||||
|
||||
std::vector<size_t> selIdx2;
|
||||
@ -553,7 +549,7 @@ public:
|
||||
decoderMask = reshape(TransposeTimeBatch(decoderMask),
|
||||
{1, dimBatch, 1, dimTrgWords});
|
||||
selfMask = selfMask * decoderMask;
|
||||
//if(dimBeam > 1)
|
||||
// if(dimBeam > 1)
|
||||
// selfMask = repeat(selfMask, dimBeam, axis = -4);
|
||||
}
|
||||
|
||||
@ -586,7 +582,8 @@ public:
|
||||
for(int i = 1; i <= opt<int>("dec-depth"); ++i) {
|
||||
auto values = query;
|
||||
if(prevDecoderStates.size() > 0)
|
||||
values = concatenate({prevDecoderStates[i - 1].output, query}, axis = -2);
|
||||
values
|
||||
= concatenate({prevDecoderStates[i - 1].output, query}, axis = -2);
|
||||
|
||||
decoderStates.push_back({values, nullptr});
|
||||
|
||||
|
@ -15,5 +15,4 @@ void Norm::clip(Tensor t) {
|
||||
if(l2Norm >= c_)
|
||||
Element(_1 = (c_ / l2Norm) * _1, t);
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -58,7 +58,8 @@ void Adagrad::load(const std::string& name,
|
||||
// extract data into vectors
|
||||
if(name == "adagrad_gt") {
|
||||
vGt.resize(totalSize);
|
||||
std::copy((float*)np->data(), (float*)np->data() + totalSize, vGt.begin());
|
||||
std::copy(
|
||||
(float*)np->data(), (float*)np->data() + totalSize, vGt.begin());
|
||||
}
|
||||
}
|
||||
|
||||
@ -178,11 +179,13 @@ void Adam::load(const std::string& name,
|
||||
// extract data into vectors
|
||||
if(name == "adam_mt") {
|
||||
vMt.resize(totalSize);
|
||||
std::copy((float*)np->data(), (float*)np->data() + totalSize, vMt.begin());
|
||||
std::copy(
|
||||
(float*)np->data(), (float*)np->data() + totalSize, vMt.begin());
|
||||
}
|
||||
if(name == "adam_vt") {
|
||||
vVt.resize(totalSize);
|
||||
std::copy((float*)np->data(), (float*)np->data() + totalSize, vVt.begin());
|
||||
std::copy(
|
||||
(float*)np->data(), (float*)np->data() + totalSize, vVt.begin());
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1,8 +1,8 @@
#pragma once

#include <algorithm>
#include <map>
#include <memory>
#include <algorithm>

#include "common/config.h"
#include "graph/expression_graph.h"
@ -42,9 +42,11 @@ private:
|
||||
public:
|
||||
Rescore(Ptr<Config> options)
|
||||
: options_(options),
|
||||
corpus_(options_->get<bool>("n-best") ?
|
||||
std::static_pointer_cast<CorpusBase>(New<CorpusNBest>(options_)) :
|
||||
std::static_pointer_cast<CorpusBase>(New<Corpus>(options_))) {
|
||||
corpus_(
|
||||
options_->get<bool>("n-best")
|
||||
? std::static_pointer_cast<CorpusBase>(
|
||||
New<CorpusNBest>(options_))
|
||||
: std::static_pointer_cast<CorpusBase>(New<Corpus>(options_))) {
|
||||
corpus_->prepare();
|
||||
|
||||
auto devices = options_->getDevices();
|
||||
@ -66,12 +68,12 @@ public:
|
||||
models_.resize(graphs_.size());
|
||||
ThreadPool pool(graphs_.size(), graphs_.size());
|
||||
for(int i = 0; i < graphs_.size(); ++i) {
|
||||
|
||||
pool.enqueue([=](int j) {
|
||||
models_[j] = New<Model>(temp);
|
||||
models_[j]->load(graphs_[j], modelFile);
|
||||
}, i);
|
||||
|
||||
pool.enqueue(
|
||||
[=](int j) {
|
||||
models_[j] = New<Model>(temp);
|
||||
models_[j]->load(graphs_[j], modelFile);
|
||||
},
|
||||
i);
|
||||
}
|
||||
}
|
||||
|
||||
@ -81,9 +83,10 @@ public:
|
||||
auto batchGenerator = New<BatchGenerator<CorpusBase>>(corpus_, options_);
|
||||
batchGenerator->prepare(false);
|
||||
|
||||
Ptr<ScoreCollector> output = options_->get<bool>("n-best") ?
|
||||
std::static_pointer_cast<ScoreCollector>(New<ScoreCollectorNBest>(options_)) :
|
||||
New<ScoreCollector>();
|
||||
Ptr<ScoreCollector> output = options_->get<bool>("n-best")
|
||||
? std::static_pointer_cast<ScoreCollector>(
|
||||
New<ScoreCollectorNBest>(options_))
|
||||
: New<ScoreCollector>();
|
||||
|
||||
bool summarize = options_->has("summary");
|
||||
std::string summary
|
||||
|
@ -13,8 +13,7 @@ namespace marian {
|
||||
|
||||
class ScoreCollector {
|
||||
public:
|
||||
ScoreCollector()
|
||||
: nextId_(0), outStrm_(new OutputFileStream(std::cout)) {};
|
||||
ScoreCollector() : nextId_(0), outStrm_(new OutputFileStream(std::cout)){};
|
||||
|
||||
virtual void Write(long id, const std::string& message) {
|
||||
boost::mutex::scoped_lock lock(mutex_);
|
||||
@ -52,9 +51,7 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
virtual void Write(long id, float value) {
|
||||
Write(id, std::to_string(value));
|
||||
}
|
||||
virtual void Write(long id, float value) { Write(id, std::to_string(value)); }
|
||||
|
||||
protected:
|
||||
long nextId_{0};
|
||||
@ -86,7 +83,9 @@ public:
|
||||
|
||||
ScoreCollectorNBest(const ScoreCollectorNBest&) = delete;
|
||||
|
||||
std::string addToNBest(const std::string nbest, const std::string feature, float score) {
|
||||
std::string addToNBest(const std::string nbest,
|
||||
const std::string feature,
|
||||
float score) {
|
||||
std::vector<std::string> fields;
|
||||
Split(nbest, fields, "|||");
|
||||
std::stringstream ss;
|
||||
@ -96,13 +95,15 @@ public:
|
||||
}
|
||||
|
||||
virtual void Write(long id, float score) {
|
||||
|
||||
std::string line;
|
||||
{
|
||||
boost::mutex::scoped_lock lock(mutex_);
|
||||
auto iter = buffer_.find(id);
|
||||
if(iter == buffer_.end()) {
|
||||
ABORT_IF(lastRead_ >= id, "Entry {} < {} already read but not in buffer", id, lastRead_);
|
||||
ABORT_IF(lastRead_ >= id,
|
||||
"Entry {} < {} already read but not in buffer",
|
||||
id,
|
||||
lastRead_);
|
||||
std::string line;
|
||||
while(lastRead_ < id && std::getline((std::istream&)*file_, line)) {
|
||||
lastRead_++;
|
||||
@ -116,6 +117,5 @@ public:
|
||||
|
||||
ScoreCollector::Write(id, addToNBest(line, fname_, score));
|
||||
}
|
||||
|
||||
};
|
||||
}
|
||||
|
@ -1,10 +1,11 @@
|
||||
#pragma once
|
||||
|
||||
#include "marian.h"
|
||||
|
||||
#include "layers/factory.h"
|
||||
#include "rnn/types.h"
|
||||
#include "rnn/constructors.h"
|
||||
#include "rnn/attention.h"
|
||||
#include "rnn/constructors.h"
|
||||
#include "rnn/types.h"
|
||||
|
||||
namespace marian {
|
||||
namespace rnn {
|
||||
|
@ -52,8 +52,7 @@ Expr gruOps(const std::vector<Expr>& nodes, bool final) {
|
||||
/******************************************************************************/
|
||||
|
||||
struct LSTMCellNodeOp : public NaryNodeOp {
|
||||
LSTMCellNodeOp(const std::vector<Expr>& nodes)
|
||||
: NaryNodeOp(nodes) {}
|
||||
LSTMCellNodeOp(const std::vector<Expr>& nodes) : NaryNodeOp(nodes) {}
|
||||
|
||||
NodeOps forwardOps() {
|
||||
std::vector<Tensor> inputs;
|
||||
@ -89,8 +88,7 @@ struct LSTMCellNodeOp : public NaryNodeOp {
|
||||
};
|
||||
|
||||
struct LSTMOutputNodeOp : public NaryNodeOp {
|
||||
LSTMOutputNodeOp(const std::vector<Expr>& nodes)
|
||||
: NaryNodeOp(nodes) {}
|
||||
LSTMOutputNodeOp(const std::vector<Expr>& nodes) : NaryNodeOp(nodes) {}
|
||||
|
||||
NodeOps forwardOps() {
|
||||
std::vector<Tensor> inputs;
|
||||
|
@ -43,8 +43,7 @@ public:
|
||||
{dimInput, dimState},
|
||||
inits::glorot_uniform);
|
||||
|
||||
b_ = graph->param(
|
||||
prefix + "_b", {1, dimState}, inits::zeros);
|
||||
b_ = graph->param(prefix + "_b", {1, dimState}, inits::zeros);
|
||||
|
||||
if(dropout_ > 0.0f) {
|
||||
if(dimInput)
|
||||
@ -520,8 +519,7 @@ public:
|
||||
{dimInput, 4 * dimState},
|
||||
inits::glorot_uniform);
|
||||
|
||||
b_ = graph->param(
|
||||
prefix + "_b", {1, 4 * dimState}, inits::zeros);
|
||||
b_ = graph->param(prefix + "_b", {1, 4 * dimState}, inits::zeros);
|
||||
|
||||
if(dropout_ > 0.0f) {
|
||||
if(dimInput)
|
||||
|
@ -1,7 +1,7 @@
|
||||
#pragma once
|
||||
|
||||
#include "marian.h"
|
||||
#include "layers/factory.h"
|
||||
#include "marian.h"
|
||||
#include "rnn/rnn.h"
|
||||
|
||||
namespace marian {
|
||||
|
@ -1,9 +1,9 @@
|
||||
#pragma once
|
||||
|
||||
#include "marian.h"
|
||||
#include "layers/generic.h"
|
||||
#include "rnn/types.h"
|
||||
#include "marian.h"
|
||||
#include "rnn/cells.h"
|
||||
#include "rnn/types.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <chrono>
|
||||
@ -74,10 +74,9 @@ private:
|
||||
j = timeSteps - i - 1;
|
||||
|
||||
std::vector<Expr> steps(xWs.size());
|
||||
std::transform(xWs.begin(),
|
||||
xWs.end(),
|
||||
steps.begin(),
|
||||
[j](Expr e) { return step(e, j, -3); });
|
||||
std::transform(xWs.begin(), xWs.end(), steps.begin(), [j](Expr e) {
|
||||
return step(e, j, -3);
|
||||
});
|
||||
|
||||
if(mask)
|
||||
state = cell_->applyState(steps, state, step(mask, j, -3));
|
||||
|
@ -9,8 +9,8 @@
|
||||
#include <vector>
|
||||
|
||||
#include "common/definitions.h"
|
||||
#include "tensors/memory_piece.h"
|
||||
#include "tensors/device.h"
|
||||
#include "tensors/memory_piece.h"
|
||||
|
||||
namespace marian {
|
||||
|
||||
@ -92,8 +92,8 @@ private:
|
||||
gaps_.swap(oldGaps);
|
||||
|
||||
for(auto gap : oldGaps)
|
||||
gaps_.insert(
|
||||
Gap(device_->data() + std::distance(oldData, gap.data()), gap.size()));
|
||||
gaps_.insert(Gap(device_->data() + std::distance(oldData, gap.data()),
|
||||
gap.size()));
|
||||
insertGap(Gap(device_->data() + oldSize, add));
|
||||
|
||||
std::unordered_map<uint8_t*, Ptr<MemoryPiece>> oldAllocated;
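
A side note on the gap rebasing above: when the device buffer grows it may move, so every recorded gap keeps its offset from the old base (std::distance(oldData, gap.data())) and is re-anchored on the new base pointer. A minimal sketch of that idea with raw pointers (names here are illustrative, not Marian's):

#include <cstdint>
#include <iterator>

// Re-anchor a pointer from a moved buffer: keep the offset, swap the base.
inline uint8_t* rebase(uint8_t* oldPtr, uint8_t* oldBase, uint8_t* newBase) {
  return newBase + std::distance(oldBase, oldPtr);
}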
|
||||
@ -141,7 +141,10 @@ private:
|
||||
}
|
||||
|
||||
public:
|
||||
Allocator(DeviceId deviceId, size_t bytes, size_t step, size_t alignment = 256)
|
||||
Allocator(DeviceId deviceId,
|
||||
size_t bytes,
|
||||
size_t step,
|
||||
size_t alignment = 256)
|
||||
: device_(DispatchDevice(deviceId, alignment)),
|
||||
step_(step),
|
||||
available_(0),
|
||||
|
@ -16,5 +16,4 @@ Ptr<Backend> BackendByDevice(DeviceId deviceId, size_t seed) {
|
||||
#endif
|
||||
return New<cpu::Backend>(deviceId, seed);
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -10,8 +10,7 @@ protected:
|
||||
size_t seed_;
|
||||
|
||||
public:
|
||||
Backend(DeviceId deviceId, size_t seed)
|
||||
: deviceId_(deviceId), seed_(seed) {}
|
||||
Backend(DeviceId deviceId, size_t seed) : deviceId_(deviceId), seed_(seed) {}
|
||||
|
||||
virtual DeviceId getDevice() { return deviceId_; };
|
||||
virtual void setDevice() = 0;
|
||||
@ -19,5 +18,4 @@ public:
|
||||
};
|
||||
|
||||
Ptr<Backend> BackendByDevice(DeviceId deviceId, size_t seed);
|
||||
|
||||
}
|
||||
|
@ -5,24 +5,22 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "tensors/tensor.h"
|
||||
#include "functional/functional.h"
|
||||
#include "functional/shape.h"
|
||||
#include "functional/tmp.h"
|
||||
#include "functional/tensor.h"
|
||||
#include "functional/tmp.h"
|
||||
#include "tensors/tensor.h"
|
||||
|
||||
namespace marian {
|
||||
|
||||
namespace cpu {
|
||||
|
||||
|
||||
template <size_t K, class Functor>
|
||||
void gAddGeneric(Functor functor,
|
||||
const functional::Shape full,
|
||||
functional::Tensor<float> out,
|
||||
functional::Array<functional::Tensor<float>, K> ins,
|
||||
float scale = 1.0) {
|
||||
|
||||
int outLength = out.shape().elements();
|
||||
bool same = outLength == full.elements();
|
||||
for(int i = 0; i < K; ++i)
|
||||
@ -73,7 +71,6 @@ void gAddReduce(Functor functor,
|
||||
functional::Tensor<float> out,
|
||||
functional::Array<functional::Tensor<float>, K> ins,
|
||||
float scale = 1.0) {
|
||||
|
||||
int rows = full.elements() / full.back();
|
||||
int cols = full.back();
|
||||
|
||||
@ -100,12 +97,8 @@ void gAddReduce(Functor functor,
|
||||
}
|
||||
}
|
||||
|
||||
template <class Functor, class ...Tensors>
|
||||
void Add(Functor functor,
|
||||
float scale,
|
||||
marian::Tensor out,
|
||||
Tensors... tensors) {
|
||||
|
||||
template <class Functor, class... Tensors>
|
||||
void Add(Functor functor, float scale, marian::Tensor out, Tensors... tensors) {
|
||||
auto full = marian::Shape::broadcast({out, tensors...});
|
||||
|
||||
int length = out->shape().elements();
|
||||
@ -113,7 +106,7 @@ void Add(Functor functor,
|
||||
constexpr size_t K = sizeof...(Tensors);
|
||||
|
||||
functional::Tensor<float> gOut = out;
|
||||
functional::Array<functional::Tensor<float>, K> gIns = {tensors ...};
|
||||
functional::Array<functional::Tensor<float>, K> gIns = {tensors...};
|
||||
|
||||
if(full.back() != 1 && out->shape().back() == 1) {
|
||||
size_t m = full.elements() / length;
|
||||
@ -128,8 +121,5 @@ void Add(Functor functor,
|
||||
cpu::gAddGeneric(functor, full, gOut, gIns, scale);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -15,17 +15,13 @@ private:
|
||||
|
||||
public:
|
||||
Backend(DeviceId deviceId, size_t seed)
|
||||
: marian::Backend(deviceId, seed),
|
||||
gen_(seed_) {}
|
||||
: marian::Backend(deviceId, seed), gen_(seed_) {}
|
||||
|
||||
void setDevice() { }
|
||||
void setDevice() {}
|
||||
|
||||
void synchronize() {}
|
||||
|
||||
std::default_random_engine& getRandomGenerator() {
|
||||
return gen_;
|
||||
}
|
||||
std::default_random_engine& getRandomGenerator() { return gen_; }
|
||||
};
|
||||
|
||||
}
|
||||
}
|
||||
|
@ -1,31 +1,31 @@
|
||||
#include <iostream>
|
||||
#include "tensors/device.h"
|
||||
#include <iostream>
|
||||
|
||||
#include <stdlib.h>
|
||||
|
||||
namespace marian {
|
||||
namespace cpu {
|
||||
|
||||
Device::~Device() {
|
||||
Device::~Device() {
|
||||
free(data_);
|
||||
data_ = nullptr;
|
||||
size_ = 0;
|
||||
}
|
||||
|
||||
void Device::reserve(size_t size) {
|
||||
size = align(size);
|
||||
ABORT_IF(size < size_ || size == 0,
|
||||
"New size must be larger than old size and larger than 0");
|
||||
|
||||
if(data_) {
|
||||
uint8_t *temp = static_cast<uint8_t *>(aligned_alloc(alignment_, size));
|
||||
std::copy(data_, data_ + size_, temp);
|
||||
free(data_);
|
||||
data_ = nullptr;
|
||||
size_ = 0;
|
||||
data_ = temp;
|
||||
} else {
|
||||
data_ = static_cast<uint8_t *>(aligned_alloc(alignment_, size));
|
||||
}
|
||||
|
||||
void Device::reserve(size_t size) {
|
||||
size = align(size);
|
||||
ABORT_IF(size < size_ || size == 0, "New size must be larger than old size and larger than 0");
|
||||
|
||||
if(data_) {
|
||||
uint8_t *temp = static_cast<uint8_t*>(aligned_alloc(alignment_, size));
|
||||
std::copy(data_, data_ + size_, temp);
|
||||
free(data_);
|
||||
data_ = temp;
|
||||
} else {
|
||||
data_ = static_cast<uint8_t*>(aligned_alloc(alignment_, size));
|
||||
}
|
||||
size_ = size;
|
||||
}
|
||||
|
||||
size_ = size;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1,19 +1,20 @@
|
||||
#include <algorithm>
|
||||
#include <random>
|
||||
|
||||
#include "tensors/tensor_operators.h"
|
||||
#include "tensors/cpu/backend.h"
|
||||
#include "tensors/tensor_operators.h"
|
||||
|
||||
namespace marian {
|
||||
namespace cpu {
|
||||
namespace cpu {
|
||||
|
||||
void Dropout(Tensor tensor, float p) {
|
||||
auto cpuBackend = std::static_pointer_cast<cpu::Backend>(tensor->getBackend());
|
||||
auto &gen = cpuBackend->getRandomGenerator();
|
||||
std::bernoulli_distribution dist(1.f - p);
|
||||
std::generate(tensor->data(), tensor->data() + tensor->size(),
|
||||
[&]() { return dist(gen) / (1.f - p); });
|
||||
}
|
||||
|
||||
}
|
||||
void Dropout(Tensor tensor, float p) {
|
||||
auto cpuBackend
|
||||
= std::static_pointer_cast<cpu::Backend>(tensor->getBackend());
|
||||
auto &gen = cpuBackend->getRandomGenerator();
|
||||
std::bernoulli_distribution dist(1.f - p);
|
||||
std::generate(tensor->data(), tensor->data() + tensor->size(), [&]() {
|
||||
return dist(gen) / (1.f - p);
|
||||
});
|
||||
}
|
||||
}
|
||||
}
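
For reference, a minimal standalone sketch of the inverted dropout the CPU kernel above implements: keep each value with probability 1 - p and scale survivors by 1/(1 - p), so no rescaling is needed at inference time (plain std::vector and std::mt19937 stand in for Marian's Tensor and backend RNG):

#include <random>
#include <vector>

// Fill `mask` with 0 or 1/(1 - p); this is the same quantity the
// generate-lambda in the kernel above writes into the tensor.
void dropoutMask(std::vector<float>& mask, float p, std::mt19937& gen) {
  std::bernoulli_distribution keep(1.f - p);
  for(auto& v : mask)
    v = keep(gen) ? 1.f / (1.f - p) : 0.f;
}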
|
||||
|
@ -13,12 +13,11 @@ namespace cpu {
|
||||
template <size_t K, bool broadcast, class Functor>
|
||||
void gElement(Functor functor,
|
||||
functional::Array<functional::Tensor<float>, K> tensors) {
|
||||
|
||||
int length = tensors[0].shape().elements();
|
||||
functional::Array<int, functional::Shape::size()> dims;
|
||||
functional::Array<int, K> indices;
|
||||
|
||||
#pragma omp parallel for simd
|
||||
#pragma omp parallel for simd
|
||||
for(int index = 0; index < length; ++index) {
|
||||
indices.fill(index);
|
||||
if(broadcast) {
|
||||
@ -30,8 +29,8 @@ void gElement(Functor functor,
|
||||
}
|
||||
}
|
||||
|
||||
template <class Functor, class ...Tensors>
|
||||
void Element(Functor functor, marian::Tensor out, Tensors ...tensors) {
|
||||
template <class Functor, class... Tensors>
|
||||
void Element(Functor functor, marian::Tensor out, Tensors... tensors) {
|
||||
constexpr size_t K = sizeof...(tensors) + 1;
|
||||
functional::Array<functional::Tensor<float>, K> gTensors = {out, tensors...};
|
||||
|
||||
@ -46,6 +45,5 @@ void Element(Functor functor, marian::Tensor out, Tensors ...tensors) {
|
||||
else
|
||||
cpu::gElement<K, false>(functor, gTensors);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
@ -1,15 +1,15 @@
|
||||
/* All or part of this file was contributed by Intel under license:
|
||||
* Copyright (C) 2017-2018 Intel Corporation
|
||||
* SPDX-License-Identifier: MIT
|
||||
*/
|
||||
/* All or part of this file was contributed by Intel under license:
|
||||
* Copyright (C) 2017-2018 Intel Corporation
|
||||
* SPDX-License-Identifier: MIT
|
||||
*/
|
||||
|
||||
#include "tensors/tensor.h"
|
||||
#include "tensors/cpu/backend.h"
|
||||
#include "tensors/tensor.h"
|
||||
|
||||
#if MKL_FOUND
|
||||
#include <mkl.h>
|
||||
#else
|
||||
#if BLAS_FOUND
|
||||
#if BLAS_FOUND
|
||||
#include <cblas.h>
|
||||
#endif
|
||||
#endif
|
||||
@ -25,7 +25,6 @@ void Prod(marian::Tensor C,
|
||||
bool transB,
|
||||
float beta,
|
||||
float scalar) {
|
||||
|
||||
#if BLAS_FOUND
|
||||
float alpha = scalar;
|
||||
|
||||
@ -46,19 +45,20 @@ void Prod(marian::Tensor C,
|
||||
if(transB)
|
||||
ldc = B->shape().elements() / B->shape()[-1];
|
||||
|
||||
cblas_sgemm(
|
||||
CblasColMajor,
|
||||
transB ? CblasTrans : CblasNoTrans,
|
||||
transA ? CblasTrans : CblasNoTrans,
|
||||
n, m, k,
|
||||
alpha,
|
||||
B->data(),
|
||||
ldb,
|
||||
A->data(),
|
||||
lda,
|
||||
beta,
|
||||
C->data(),
|
||||
ldc);
|
||||
cblas_sgemm(CblasColMajor,
|
||||
transB ? CblasTrans : CblasNoTrans,
|
||||
transA ? CblasTrans : CblasNoTrans,
|
||||
n,
|
||||
m,
|
||||
k,
|
||||
alpha,
|
||||
B->data(),
|
||||
ldb,
|
||||
A->data(),
|
||||
lda,
|
||||
beta,
|
||||
C->data(),
|
||||
ldc);
|
||||
#else
|
||||
ABORT("Not implemented!");
|
||||
#endif
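
A note on the cblas_sgemm call above: the tensors are row-major but CBLAS is driven in column-major mode, so the code passes B before A (and swaps the transpose flags) to compute C^T = op(B)^T op(A)^T, which is exactly C in row-major memory. A minimal sketch of the same trick for the plain non-transposed case (an illustrative helper, not part of the source):

#include <cblas.h>

// Row-major C[m x n] = A[m x k] * B[k x n] using a column-major sgemm:
// the row-major buffers, read column-major, are the transposed matrices,
// and C^T = B^T * A^T, so B goes first and the leading dimensions are n and k.
void gemmRowMajor(const float* A, const float* B, float* C, int m, int n, int k) {
  cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
              n, m, k, 1.0f, B, n, A, k, 0.0f, C, n);
}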
|
||||
@ -73,7 +73,7 @@ void ProdBatched(marian::Tensor C,
|
||||
float scalar) {
|
||||
#if BLAS_FOUND
|
||||
float alpha = scalar;
|
||||
|
||||
|
||||
size_t batchA = A->shape().elements() / (A->shape()[-1] * A->shape()[-2]);
|
||||
size_t batchB = B->shape().elements() / (B->shape()[-1] * B->shape()[-2]);
|
||||
|
||||
@ -95,33 +95,34 @@ void ProdBatched(marian::Tensor C,
|
||||
ldc = B->shape()[-2];
|
||||
|
||||
auto opA = transA ? CblasTrans : CblasNoTrans;
|
||||
auto opB = transB ? CblasTrans : CblasNoTrans;
|
||||
|
||||
auto opB = transB ? CblasTrans : CblasNoTrans;
|
||||
|
||||
auto strideB = batchB == 1 ? 0 : n * k;
|
||||
auto strideA = batchA == 1 ? 0 : m * k;
|
||||
auto strideC = n * m;
|
||||
|
||||
|
||||
int steps = std::max(batchA, batchB);
|
||||
|
||||
|
||||
int offsetA = 0;
|
||||
int offsetB = 0;
|
||||
int offsetC = 0;
|
||||
|
||||
|
||||
for(int i = 0; i < steps; ++i) {
|
||||
cblas_sgemm(
|
||||
CblasColMajor,
|
||||
opB,
|
||||
opA,
|
||||
n, m, k,
|
||||
alpha,
|
||||
B->data() + offsetB,
|
||||
ldb,
|
||||
A->data() + offsetA,
|
||||
lda,
|
||||
beta,
|
||||
C->data() + offsetC,
|
||||
ldc);
|
||||
|
||||
cblas_sgemm(CblasColMajor,
|
||||
opB,
|
||||
opA,
|
||||
n,
|
||||
m,
|
||||
k,
|
||||
alpha,
|
||||
B->data() + offsetB,
|
||||
ldb,
|
||||
A->data() + offsetA,
|
||||
lda,
|
||||
beta,
|
||||
C->data() + offsetC,
|
||||
ldc);
|
||||
|
||||
offsetA += strideA;
|
||||
offsetB += strideB;
|
||||
offsetC += strideC;
|
||||
@ -130,6 +131,5 @@ void ProdBatched(marian::Tensor C,
|
||||
ABORT("Not implemented!");
|
||||
#endif
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
@ -44,13 +44,13 @@ void ConcatCont(Tensor out, const std::vector<Tensor>& inputs, int axis) {
|
||||
}
|
||||
|
||||
inline void gInsertCols(float* out,
|
||||
const float* in,
|
||||
size_t rows,
|
||||
size_t cols,
|
||||
size_t cols_out,
|
||||
size_t cols_in,
|
||||
size_t offset_out,
|
||||
size_t offset_in) {
|
||||
const float* in,
|
||||
size_t rows,
|
||||
size_t cols,
|
||||
size_t cols_out,
|
||||
size_t cols_in,
|
||||
size_t offset_out,
|
||||
size_t offset_in) {
|
||||
for(int j = 0; j < rows; ++j) {
|
||||
float* rowOut = out + j * cols_out + offset_out;
|
||||
const float* rowIn = in + j * cols_in + offset_in;
|
||||
@ -68,9 +68,10 @@ void Concatenate1(Tensor out, const std::vector<Tensor>& inputs) {
|
||||
|
||||
for(auto in : inputs) {
|
||||
ABORT_IF(rows != in->shape().elements() / in->shape().back(),
|
||||
"First dimension must be equal");
|
||||
"First dimension must be equal");
|
||||
int cols_in = in->shape().back();
|
||||
cpu::gInsertCols(out->data(), in->data(), rows, cols_in, cols_out, cols_in, offset, 0);
|
||||
cpu::gInsertCols(
|
||||
out->data(), in->data(), rows, cols_in, cols_out, cols_in, offset, 0);
|
||||
offset += cols_in;
|
||||
}
|
||||
}
|
||||
@ -88,11 +89,10 @@ void Split1(std::vector<Tensor>& outputs, const Tensor in) {
|
||||
int cols_in = in->shape().back();
|
||||
for(auto out : outputs) {
|
||||
ABORT_IF(rows != out->shape().elements() / out->shape().back(),
|
||||
"First dimension must be equal");
|
||||
"First dimension must be equal");
|
||||
int cols_out = out->shape().back();
|
||||
cpu::gInsertCols(out->data(), in->data(),
|
||||
rows, cols_out, cols_out, cols_in,
|
||||
0, offset);
|
||||
cpu::gInsertCols(
|
||||
out->data(), in->data(), rows, cols_out, cols_out, cols_in, 0, offset);
|
||||
offset += cols_out;
|
||||
}
|
||||
}
|
||||
@ -158,24 +158,24 @@ void Softmax(Tensor out_, Tensor in_, Tensor mask_) {
|
||||
int rows = out_->shape().elements() / out_->shape().back();
|
||||
int cols = out_->shape().back();
|
||||
|
||||
for (int j = 0; j < rows; ++j) {
|
||||
float* so = out + j*cols;
|
||||
const float* sp = in + j*cols;
|
||||
const float* mp = mask ? mask + j*cols : nullptr;
|
||||
for(int j = 0; j < rows; ++j) {
|
||||
float* so = out + j * cols;
|
||||
const float* sp = in + j * cols;
|
||||
const float* mp = mask ? mask + j * cols : nullptr;
|
||||
|
||||
float max = sp[0];
|
||||
for (int i = 1; i < cols; ++i) {
|
||||
for(int i = 1; i < cols; ++i) {
|
||||
max = std::max(max, sp[i]);
|
||||
}
|
||||
|
||||
float sum = 0.f;
|
||||
for (int i = 0; i < cols; ++i) {
|
||||
for(int i = 0; i < cols; ++i) {
|
||||
float ex = !mask || mp[i] ? std::exp(sp[i] - max) : 0.f;
|
||||
so[i] = ex;
|
||||
sum += ex;
|
||||
}
|
||||
|
||||
for (int i = 0; i < cols; ++i) {
|
||||
for(int i = 0; i < cols; ++i) {
|
||||
so[i] /= sum;
|
||||
}
|
||||
}
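
The loop above is the usual max-shifted, optionally masked softmax; as a formula (a summary of the code, not a quotation from it):

softmax(x)_i = m_i * exp(x_i - max_j x_j) / sum_k m_k * exp(x_k - max_j x_j)

with m the 0/1 mask (all ones when mask_ is null). Subtracting the row maximum cancels between numerator and denominator, so it changes nothing mathematically but keeps std::exp from overflowing.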
|
||||
@ -188,24 +188,24 @@ void LogSoftmax(Tensor out_, Tensor in_) {
|
||||
int rows = out_->shape().elements() / out_->shape().back();
|
||||
int cols = out_->shape().back();
|
||||
|
||||
for (int j = 0; j < rows; ++j) {
|
||||
for(int j = 0; j < rows; ++j) {
|
||||
float* so = out + j * cols;
|
||||
const float* sp = in + j*cols;
|
||||
const float* sp = in + j * cols;
|
||||
|
||||
float max = sp[0];
|
||||
for (int i = 1; i < cols; ++i) {
|
||||
for(int i = 1; i < cols; ++i) {
|
||||
max = std::max(max, sp[i]);
|
||||
}
|
||||
|
||||
float sum = 0.f;
|
||||
for (int i = 0; i < cols; ++i) {
|
||||
for(int i = 0; i < cols; ++i) {
|
||||
float sm = sp[i] - max;
|
||||
float ex = std::exp(sm);
|
||||
so[i] = sm;
|
||||
sum += ex;
|
||||
}
|
||||
|
||||
for (int i = 0; i < cols; ++i) {
|
||||
for(int i = 0; i < cols; ++i) {
|
||||
so[i] -= std::log(sum);
|
||||
}
|
||||
}
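
Likewise for the LogSoftmax rows above (again a summary, not source text):

log softmax(x)_i = (x_i - max_j x_j) - log sum_k exp(x_k - max_j x_j)

which is why the loop stores the shifted value sm first and subtracts log(sum) in a second pass.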
|
||||
@ -219,17 +219,17 @@ void SoftmaxGrad(Tensor grad_, Tensor adj_, Tensor val_) {
|
||||
const float* adj = adj_->data();
|
||||
const float* val = val_->data();
|
||||
|
||||
for (size_t j = 0; j < rows; ++j) {
|
||||
float* gradRow = grad + j*cols;
|
||||
const float* adjRow = adj + j*cols;
|
||||
const float* valRow = val + j*cols;
|
||||
for(size_t j = 0; j < rows; ++j) {
|
||||
float* gradRow = grad + j * cols;
|
||||
const float* adjRow = adj + j * cols;
|
||||
const float* valRow = val + j * cols;
|
||||
|
||||
float sum = 0.f;
|
||||
for (size_t i = 0; i < cols; ++i) {
|
||||
for(size_t i = 0; i < cols; ++i) {
|
||||
sum += valRow[i] * adjRow[i];
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < cols; ++i) {
|
||||
for(size_t i = 0; i < cols; ++i) {
|
||||
gradRow[i] += valRow[i] * (adjRow[i] - sum);
|
||||
}
|
||||
}
|
||||
@ -243,62 +243,68 @@ void LogSoftmaxGrad(Tensor grad_, Tensor adj_, Tensor val_) {
|
||||
const float* adj = adj_->data();
|
||||
const float* val = val_->data();
|
||||
|
||||
for (int j = 0; j < rows; ++j) {
|
||||
float* gradRow = grad + j*cols;
|
||||
const float* adjRow = adj + j*cols;
|
||||
const float* valRow = val + j*cols;
|
||||
for(int j = 0; j < rows; ++j) {
|
||||
float* gradRow = grad + j * cols;
|
||||
const float* adjRow = adj + j * cols;
|
||||
const float* valRow = val + j * cols;
|
||||
|
||||
float sum = 0.f;
|
||||
for (int i = 0; i < cols; ++i) {
|
||||
for(int i = 0; i < cols; ++i) {
|
||||
sum += adjRow[i];
|
||||
}
|
||||
|
||||
for (int i = 0; i < cols; ++i) {
|
||||
gradRow[i] += adjRow[i] - sum*std::exp(valRow[i]);
|
||||
for(int i = 0; i < cols; ++i) {
|
||||
gradRow[i] += adjRow[i] - sum * std::exp(valRow[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void CopyRows(Tensor out_, const Tensor in_, const std::vector<size_t>& indices) {
|
||||
void CopyRows(Tensor out_,
|
||||
const Tensor in_,
|
||||
const std::vector<size_t>& indices) {
|
||||
size_t cols = in_->shape()[1];
|
||||
size_t rows = indices.size();
|
||||
|
||||
float* out = out_->data();
|
||||
const float* in = in_->data();
|
||||
|
||||
#pragma omp parallel for
|
||||
for (int j = 0; j < rows; ++j) {
|
||||
#pragma omp parallel for
|
||||
for(int j = 0; j < rows; ++j) {
|
||||
size_t dst = j;
|
||||
size_t src = indices[j];
|
||||
|
||||
float* rowOut = out + dst*cols;
|
||||
const float* rowIn = in + src*cols;
|
||||
float* rowOut = out + dst * cols;
|
||||
const float* rowIn = in + src * cols;
|
||||
|
||||
std::copy(rowIn, rowIn + cols, rowOut);
|
||||
}
|
||||
}
|
||||
|
||||
void PasteRows(Tensor out_, const Tensor in_, const std::vector<size_t>& indices) {
|
||||
void PasteRows(Tensor out_,
|
||||
const Tensor in_,
|
||||
const std::vector<size_t>& indices) {
|
||||
size_t cols = in_->shape()[-1];
|
||||
size_t rows = indices.size();
|
||||
|
||||
float* out = out_->data();
|
||||
const float* in = in_->data();
|
||||
|
||||
for (int j = 0; j < rows; ++j) {
|
||||
size_t dst = indices[j]; // not a permutation - may alias, unlike PasteCols
|
||||
for(int j = 0; j < rows; ++j) {
|
||||
size_t dst = indices[j]; // not a permutation - may alias, unlike PasteCols
|
||||
size_t src = j;
|
||||
|
||||
float* rowOut = out + dst*cols;
|
||||
const float* rowIn = in + src*cols;
|
||||
float* rowOut = out + dst * cols;
|
||||
const float* rowIn = in + src * cols;
|
||||
|
||||
for (int i = 0; i < cols; ++i) {
|
||||
for(int i = 0; i < cols; ++i) {
|
||||
rowOut[i] += rowIn[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void CopyCols(Tensor out_, const Tensor in_, const std::vector<size_t>& indices) {
|
||||
void CopyCols(Tensor out_,
|
||||
const Tensor in_,
|
||||
const std::vector<size_t>& indices) {
|
||||
size_t rows = in_->shape().elements() / in_->shape()[-1];
|
||||
size_t colsIn = in_->shape()[-1];
|
||||
size_t colsOut = indices.size();
|
||||
@ -306,18 +312,20 @@ void CopyCols(Tensor out_, const Tensor in_, const std::vector<size_t>& indices)
|
||||
float* out = out_->data();
|
||||
const float* in = in_->data();
|
||||
|
||||
#pragma omp parallel for
|
||||
for (int j = 0; j < rows; ++j) {
|
||||
const float* rowIn = in + j*colsIn;
|
||||
float* rowOut = out + j*colsOut;
|
||||
#pragma omp parallel for
|
||||
for(int j = 0; j < rows; ++j) {
|
||||
const float* rowIn = in + j * colsIn;
|
||||
float* rowOut = out + j * colsOut;
|
||||
|
||||
for (int i = 0; i < colsOut; ++i) {
|
||||
for(int i = 0; i < colsOut; ++i) {
|
||||
rowOut[i] = rowIn[indices[i]];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void PasteCols(Tensor out_, const Tensor in_, const std::vector<size_t>& indices) {
|
||||
void PasteCols(Tensor out_,
|
||||
const Tensor in_,
|
||||
const std::vector<size_t>& indices) {
|
||||
size_t rows = out_->shape().elements() / out_->shape()[-1];
|
||||
size_t colsOut = out_->shape()[-1];
|
||||
size_t colsIn = indices.size();
|
||||
@ -328,12 +336,12 @@ void PasteCols(Tensor out_, const Tensor in_, const std::vector<size_t>& indices
|
||||
/* n.b. Unlike PasteRows, currently appears safe to assume indices[i] is a
|
||||
* permutation i.e. no racy aliases, and no need to sum vs. just assign.
|
||||
*/
|
||||
for (int j = 0; j < rows; ++j) {
|
||||
const float* rowIn = in + j*colsIn;
|
||||
float* rowOut = out + j*colsOut;
|
||||
for(int j = 0; j < rows; ++j) {
|
||||
const float* rowIn = in + j * colsIn;
|
||||
float* rowOut = out + j * colsOut;
|
||||
|
||||
// @TODO: should this be a sum?
|
||||
for (int i = 0; i < colsIn; ++i) {
|
||||
for(int i = 0; i < colsIn; ++i) {
|
||||
rowOut[indices[i]] = rowIn[i];
|
||||
}
|
||||
}
|
||||
@ -367,8 +375,8 @@ void GRUFastForward(Tensor out_, std::vector<Tensor> inputs, bool final) {
|
||||
const float* b = inputs[3]->data();
|
||||
const float* mask = inputs.size() > 4 ? inputs[4]->data() : nullptr;
|
||||
|
||||
#pragma omp parallel for
|
||||
for (int j = 0; j < rows; ++j) {
|
||||
#pragma omp parallel for
|
||||
for(int j = 0; j < rows; ++j) {
|
||||
float m = !mask || mask[j];
|
||||
float* rowOut = out + j * cols;
|
||||
const float* rowState = state + j * cols;
|
||||
@ -376,8 +384,8 @@ void GRUFastForward(Tensor out_, std::vector<Tensor> inputs, bool final) {
|
||||
const float* xWrow = xW + j * cols * 3;
|
||||
const float* sUrow = sU + j * cols * 3;
|
||||
|
||||
#pragma omp simd
|
||||
for (int i = 0; i < cols; ++i) {
|
||||
#pragma omp simd
|
||||
for(int i = 0; i < cols; ++i) {
|
||||
// @TODO: stable logit
|
||||
float r = stableLogit(xWrow[i] + sUrow[i] + b[i]);
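
The stableLogit call above (flagged by the @TODO) is used as a sigmoid here. One common numerically safe way to evaluate the logistic function, given only as a general illustration rather than a claim about the helper's current implementation, branches on the sign so exp never overflows:

#include <cmath>

// Sketch of a stable sigmoid; assumed behaviour, the real helper is elsewhere.
inline float stableSigmoidSketch(float x) {
  return x >= 0.f ? 1.f / (1.f + std::exp(-x))
                  : std::exp(x) / (1.f + std::exp(x));
}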
|
||||
|
||||
@ -417,8 +425,8 @@ void GRUFastBackward(std::vector<Tensor> outputs,
|
||||
const float* mask = inputs.size() > 4 ? inputs[4]->data() : 0;
|
||||
const float* adj = adj_->data();
|
||||
|
||||
#pragma omp parallel
|
||||
for (int j = 0; j < rows; ++j) {
|
||||
#pragma omp parallel
|
||||
for(int j = 0; j < rows; ++j) {
|
||||
float m = !mask || mask[j];
|
||||
|
||||
float* rowOutState = outState + j * cols;
|
||||
@ -430,8 +438,8 @@ void GRUFastBackward(std::vector<Tensor> outputs,
|
||||
const float* rowSU = sU + j * cols * 3;
|
||||
const float* rowAdj = adj + j * cols;
|
||||
|
||||
#pragma omp for simd nowait
|
||||
for (int i = 0; i < cols; ++i) {
|
||||
#pragma omp for simd nowait
|
||||
for(int i = 0; i < cols; ++i) {
|
||||
int k = i + cols;
|
||||
int l = i + 2 * cols;
|
||||
|
||||
@ -446,10 +454,11 @@ void GRUFastBackward(std::vector<Tensor> outputs,
|
||||
|
||||
float adj = rowAdj[i];
|
||||
|
||||
float t = (1-z)*(1-h*h);
|
||||
float t = (1 - z) * (1 - h * h);
|
||||
|
||||
// df/ds
|
||||
if(outState) rowOutState[i] += (m * z - m + 1) * adj;
|
||||
if(outState)
|
||||
rowOutState[i] += (m * z - m + 1) * adj;
|
||||
|
||||
// df/d(xW_r) ...
|
||||
float dfdxW_r = m * r * (1 - r) * t * adj;
|
||||
@ -457,20 +466,28 @@ void GRUFastBackward(std::vector<Tensor> outputs,
|
||||
dfdxW_r *= rowSU[l] + b[l];
|
||||
else
|
||||
dfdxW_r *= rowSU[l];
|
||||
if(outXW) rowOutXW[i] += dfdxW_r;
|
||||
if(outSU) rowOutSU[i] += dfdxW_r;
|
||||
if(outB) outB[i] += dfdxW_r;
|
||||
if(outXW)
|
||||
rowOutXW[i] += dfdxW_r;
|
||||
if(outSU)
|
||||
rowOutSU[i] += dfdxW_r;
|
||||
if(outB)
|
||||
outB[i] += dfdxW_r;
|
||||
|
||||
// df/d(xW_z) ...
|
||||
float dfdxW_z = m * (1 - z) * z * (rowState[i] - h) * adj;
|
||||
if(outXW) rowOutXW[k] += dfdxW_z;
|
||||
if(outSU) rowOutSU[k] += dfdxW_z;
|
||||
if(outB) outB[k] += dfdxW_z;
|
||||
if(outXW)
|
||||
rowOutXW[k] += dfdxW_z;
|
||||
if(outSU)
|
||||
rowOutSU[k] += dfdxW_z;
|
||||
if(outB)
|
||||
outB[k] += dfdxW_z;
|
||||
|
||||
// df/d(xW_x) ...
|
||||
float dfdxW_x = m * t * adj;
|
||||
if(outXW) rowOutXW[l] += dfdxW_x;
|
||||
if(outSU) rowOutSU[l] += dfdxW_x * r;
|
||||
if(outXW)
|
||||
rowOutXW[l] += dfdxW_x;
|
||||
if(outSU)
|
||||
rowOutSU[l] += dfdxW_x * r;
|
||||
if(outB)
|
||||
if(final)
|
||||
outB[l] += dfdxW_x * r;
|
||||
@ -490,18 +507,18 @@ void CrossEntropyPick(Tensor out_, Tensor in_, Tensor pick_) {
|
||||
int rows = inShape.elements() / inShape.back();
|
||||
int cols = inShape.back();
|
||||
|
||||
#pragma omp parallel for
|
||||
for (int j = 0; j < rows; ++j) {
|
||||
const float* sp = in + j*cols;
|
||||
#pragma omp parallel for
|
||||
for(int j = 0; j < rows; ++j) {
|
||||
const float* sp = in + j * cols;
|
||||
float max = sp[0];
|
||||
#pragma omp simd reduction(max:max)
|
||||
for (int i = 1; i < cols; ++i) {
|
||||
#pragma omp simd reduction(max : max)
|
||||
for(int i = 1; i < cols; ++i) {
|
||||
max = std::max(max, sp[i]);
|
||||
}
|
||||
|
||||
float sum = 0.f;
|
||||
#pragma omp simd reduction(+:sum)
|
||||
for (int i = 0; i < cols; ++i) {
|
||||
#pragma omp simd reduction(+ : sum)
|
||||
for(int i = 0; i < cols; ++i) {
|
||||
sum += std::exp(sp[i] - max);
|
||||
}
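
The max and sum loops above are the two ingredients of a numerically stable cross-entropy from logits; for a picked class y of a logit row x (a general identity, not a quote of the elided remainder of this function):

CE(x, y) = -log softmax(x)_y = log sum_k exp(x_k - max_j x_j) - (x_y - max_j x_j)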
|
||||
|
||||
@ -512,7 +529,10 @@ void CrossEntropyPick(Tensor out_, Tensor in_, Tensor pick_) {
|
||||
}
|
||||
}
|
||||
|
||||
void CrossEntropyPickBackward(Tensor out_, Tensor adj_, Tensor a, Tensor pick_) {
|
||||
void CrossEntropyPickBackward(Tensor out_,
|
||||
Tensor adj_,
|
||||
Tensor a,
|
||||
Tensor pick_) {
|
||||
float* out = out_->data();
|
||||
Shape& outShape = out_->shape();
|
||||
const float* adj = adj_->data();
|
||||
@ -522,23 +542,23 @@ void CrossEntropyPickBackward(Tensor out_, Tensor adj_, Tensor a, Tensor pick_)
|
||||
int rows = outShape.elements() / outShape.back();
|
||||
int cols = outShape.back();
|
||||
|
||||
#pragma omp parallel for
|
||||
for (int j = 0; j < rows; ++j) {
|
||||
const float* sp = in + j*cols;
|
||||
float* so = out + j*cols;
|
||||
#pragma omp parallel for
|
||||
for(int j = 0; j < rows; ++j) {
|
||||
const float* sp = in + j * cols;
|
||||
float* so = out + j * cols;
|
||||
|
||||
float max = sp[0];
|
||||
for (int i = 1; i < cols; ++i) {
|
||||
for(int i = 1; i < cols; ++i) {
|
||||
max = std::max(max, sp[i]);
|
||||
}
|
||||
|
||||
float sum = 0.f;
|
||||
for (int i = 0; i < cols; ++i) {
|
||||
for(int i = 0; i < cols; ++i) {
|
||||
sum += std::exp(sp[i] - max);
|
||||
}
|
||||
|
||||
// cross-entropy
for (int i = 0; i < cols; ++i) {
for(int i = 0; i < cols; ++i) {
float sub = (float)(i == (int)pick[j]);
so[i] += adj[j] * (std::exp(sp[i] - max) / sum - sub);
}
@ -549,8 +569,8 @@ float L2Norm(Tensor in) {
float sum = 0.f;
size_t size = in->size();
const float* data = in->data();
#pragma omp parallel for simd reduction(+:sum)
for (size_t i = 0; i < size; ++i) {
#pragma omp parallel for simd reduction(+ : sum)
for(size_t i = 0; i < size; ++i) {
sum += data[i] * data[i];
}
return std::sqrt(sum);
@ -570,15 +590,15 @@ void Att(Tensor out_, Tensor va_, Tensor context_, Tensor state_) {
|
||||
int rows = m;
|
||||
int cols = k;
|
||||
|
||||
#pragma omp parallel for
|
||||
for (size_t j = 0; j < rows; ++j) {
|
||||
#pragma omp parallel for
|
||||
for(size_t j = 0; j < rows; ++j) {
|
||||
const float* vaRow = va;
|
||||
const float* ctxRow = ctx + (j % (b * t)) * cols;
|
||||
const float* stateRow = state + ((j / (b * t)) * b + j % b) * cols;
|
||||
|
||||
float sum = 0.f;
|
||||
#pragma omp simd reduction(+:sum)
|
||||
for (size_t i = 0; i < cols; ++i) {
|
||||
#pragma omp simd reduction(+ : sum)
|
||||
for(size_t i = 0; i < cols; ++i) {
|
||||
float z = ctxRow[i] + stateRow[i];
|
||||
sum += std::tanh(z) * vaRow[i];
|
||||
}
|
||||
@ -587,8 +607,12 @@ void Att(Tensor out_, Tensor va_, Tensor context_, Tensor state_) {
|
||||
}
|
||||
}
|
||||
|
||||
void AttBack(Tensor gVa_, Tensor gContext_, Tensor gState_,
|
||||
Tensor va_, Tensor context_, Tensor state_,
|
||||
void AttBack(Tensor gVa_,
|
||||
Tensor gContext_,
|
||||
Tensor gState_,
|
||||
Tensor va_,
|
||||
Tensor context_,
|
||||
Tensor state_,
|
||||
Tensor adj_) {
|
||||
float* gVa = gVa_->data();
|
||||
float* gContext = gContext_->data();
|
||||
@ -603,8 +627,8 @@ void AttBack(Tensor gVa_, Tensor gContext_, Tensor gState_,
|
||||
size_t k = context_->shape()[-1];
|
||||
size_t n = context_->shape()[-2];
|
||||
|
||||
#pragma omp parallel for reduction(+:gState[:n*k], gVa[:k])
|
||||
for (size_t j = 0; j < m; ++j) {
|
||||
#pragma omp parallel for reduction(+ : gState[ : n* k], gVa[ : k])
|
||||
for(size_t j = 0; j < m; ++j) {
|
||||
float* gcRow = gContext + j * k;
|
||||
float* gsRow = gState + (j % n) * k;
|
||||
|
||||
@ -613,8 +637,8 @@ void AttBack(Tensor gVa_, Tensor gContext_, Tensor gState_,
|
||||
|
||||
float adj_j = adj[j];
|
||||
|
||||
#pragma omp simd
|
||||
for (size_t i = 0; i < k; ++i) {
|
||||
#pragma omp simd
|
||||
for(size_t i = 0; i < k; ++i) {
|
||||
float z = cRow[i] + sRow[i];
|
||||
|
||||
float t = std::tanh(z);
|
||||
@ -642,31 +666,31 @@ void LayerNormalization(Tensor out_,
|
||||
int rows = in_->shape().elements() / in_->shape().back();
|
||||
int cols = in_->shape().back();
|
||||
|
||||
#pragma omp parallel for
|
||||
for (int j = 0; j < rows; ++j) {
|
||||
float* so = out + j*cols;
|
||||
const float* sp = in + j*cols;
|
||||
#pragma omp parallel for
|
||||
for(int j = 0; j < rows; ++j) {
|
||||
float* so = out + j * cols;
|
||||
const float* sp = in + j * cols;
|
||||
|
||||
float sum = 0.f;
|
||||
#pragma omp simd reduction(+:sum)
|
||||
for (int i = 0; i < cols; ++i) {
|
||||
#pragma omp simd reduction(+ : sum)
|
||||
for(int i = 0; i < cols; ++i) {
|
||||
sum += sp[i];
|
||||
}
|
||||
|
||||
float mean = sum / cols;
|
||||
float sqSum = 0.f;
|
||||
#pragma omp simd reduction(+:sqSum)
|
||||
for (int i = 0; i < cols; ++i) {
|
||||
#pragma omp simd reduction(+ : sqSum)
|
||||
for(int i = 0; i < cols; ++i) {
|
||||
float ex = sp[i] - mean;
|
||||
sqSum += ex*ex;
|
||||
sqSum += ex * ex;
|
||||
}
|
||||
|
||||
float sigma = std::sqrt(eps + sqSum / cols);
|
||||
|
||||
#pragma omp simd
|
||||
for (int i = 0; i < cols; ++i) {
|
||||
#pragma omp simd
|
||||
for(int i = 0; i < cols; ++i) {
|
||||
float t = alpha[i] * ((sp[i] - mean) / sigma);
|
||||
if (beta != nullptr) {
|
||||
if(beta != nullptr) {
|
||||
t += beta[i];
|
||||
}
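
In formula form, the row-wise normalization above computes (summary only):

y_i = gamma_i * (x_i - mu) / sqrt(eps + var) + beta_i,   mu = (1/n) sum_i x_i,   var = (1/n) sum_i (x_i - mu)^2

with n = cols, gamma = alpha in the code, and the beta term applied only when a bias tensor is given.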
|
||||
|
||||
@ -696,36 +720,36 @@ void LayerNormalizationGrad(Tensor gradX_,
|
||||
size_t rows = y_->shape().elements() / y_->shape()[-1];
|
||||
size_t cols = y_->shape()[-1];
|
||||
|
||||
if (beta) {
|
||||
#pragma omp parallel for reduction(+:gradGamma[:cols], gradBeta[:cols])
|
||||
for (size_t j = 0; j < rows; ++j) {
|
||||
const float* xRow = x + j*cols;
|
||||
const float* yRow = y + j*cols;
|
||||
const float* adjRow = adj + j*cols;
|
||||
float* gradXRow = gradX + j*cols;
|
||||
if(beta) {
|
||||
#pragma omp parallel for reduction(+ : gradGamma[ : cols], gradBeta[ : cols])
|
||||
for(size_t j = 0; j < rows; ++j) {
|
||||
const float* xRow = x + j * cols;
|
||||
const float* yRow = y + j * cols;
|
||||
const float* adjRow = adj + j * cols;
|
||||
float* gradXRow = gradX + j * cols;
|
||||
|
||||
float sum_x = 0.f;
|
||||
float sum_adj = 0.f;
|
||||
float sum_adj_x = 0.f;
|
||||
float sum_sqr = 0.f;
|
||||
|
||||
#pragma omp simd reduction(+:sum_x, sum_adj_x, sum_adj)
|
||||
for (size_t i = 0; i < cols; ++i) {
|
||||
#pragma omp simd reduction(+ : sum_x, sum_adj_x, sum_adj)
|
||||
for(size_t i = 0; i < cols; ++i) {
|
||||
sum_x += xRow[i];
|
||||
sum_adj_x += adjRow[i] * (yRow[i] - (beta ? beta[i] : 0.f)) / gamma[i];
|
||||
sum_adj += adjRow[i];
|
||||
}
|
||||
|
||||
float mean = sum_x / cols;
|
||||
#pragma omp simd reduction(+:sum_sqr)
|
||||
for (size_t i = 0; i < cols; ++i) {
|
||||
#pragma omp simd reduction(+ : sum_sqr)
|
||||
for(size_t i = 0; i < cols; ++i) {
|
||||
float ex = xRow[i] - mean;
|
||||
sum_sqr += ex*ex;
|
||||
sum_sqr += ex * ex;
|
||||
}
|
||||
|
||||
float sigma = std::sqrt(eps + sum_sqr / cols);
|
||||
#pragma omp simd
|
||||
for (size_t i = 0; i < cols; ++i) {
|
||||
#pragma omp simd
|
||||
for(size_t i = 0; i < cols; ++i) {
|
||||
float grad_x = 0.f;
|
||||
float x_hat = (yRow[i] - beta[i]) / gamma[i];
|
||||
grad_x += cols * adjRow[i];
|
||||
@ -739,35 +763,35 @@ void LayerNormalizationGrad(Tensor gradX_,
|
||||
}
|
||||
}
|
||||
} else {
|
||||
#pragma omp parallel for reduction(+:gradGamma[:cols])
|
||||
for (size_t j = 0; j < rows; ++j) {
|
||||
const float* xRow = x + j*cols;
|
||||
const float* yRow = y + j*cols;
|
||||
const float* adjRow = adj + j*cols;
|
||||
float* gradXRow = gradX + j*cols;
|
||||
#pragma omp parallel for reduction(+ : gradGamma[ : cols])
|
||||
for(size_t j = 0; j < rows; ++j) {
|
||||
const float* xRow = x + j * cols;
|
||||
const float* yRow = y + j * cols;
|
||||
const float* adjRow = adj + j * cols;
|
||||
float* gradXRow = gradX + j * cols;
|
||||
|
||||
float sum_x = 0.f;
|
||||
float sum_adj = 0.f;
|
||||
float sum_adj_x = 0.f;
|
||||
float sum_sqr = 0.f;
|
||||
|
||||
#pragma omp simd reduction(+:sum_x, sum_adj_x, sum_adj)
|
||||
for (size_t i = 0; i < cols; ++i) {
|
||||
#pragma omp simd reduction(+ : sum_x, sum_adj_x, sum_adj)
|
||||
for(size_t i = 0; i < cols; ++i) {
|
||||
sum_x += xRow[i];
|
||||
sum_adj_x += adjRow[i] * (yRow[i] - (beta ? beta[i] : 0.f)) / gamma[i];
|
||||
sum_adj += adjRow[i];
|
||||
}
|
||||
|
||||
float mean = sum_x / cols;
|
||||
#pragma omp simd reduction(+:sum_sqr)
|
||||
for (size_t i = 0; i < cols; ++i) {
|
||||
#pragma omp simd reduction(+ : sum_sqr)
|
||||
for(size_t i = 0; i < cols; ++i) {
|
||||
float ex = xRow[i] - mean;
|
||||
sum_sqr += ex*ex;
|
||||
sum_sqr += ex * ex;
|
||||
}
|
||||
|
||||
float sigma = std::sqrt(eps + sum_sqr / cols);
|
||||
#pragma omp simd
|
||||
for (size_t i = 0; i < cols; ++i) {
|
||||
#pragma omp simd
|
||||
for(size_t i = 0; i < cols; ++i) {
|
||||
float grad_x = 0.f;
|
||||
float x_hat = yRow[i] / gamma[i];
|
||||
grad_x += cols * adjRow[i];
|
||||
@ -794,9 +818,9 @@ void Shift(Tensor out_, Tensor in_, marian::Shape shift, bool invert) {
|
||||
const float* in = in_->data();
|
||||
|
||||
int length = out_->shape().elements();
|
||||
#pragma omp parallel for
|
||||
for (int i = 0; i < length; ++i) {
|
||||
if (i - offset < 0 || i - offset >= length) {
|
||||
#pragma omp parallel for
|
||||
for(int i = 0; i < length; ++i) {
|
||||
if(i - offset < 0 || i - offset >= length) {
|
||||
out[i] = 0.f;
|
||||
} else {
|
||||
out[i] = in[i - offset];
|
||||
@ -808,7 +832,7 @@ void SetSparse(float* out,
|
||||
const std::vector<size_t>& indices,
|
||||
const std::vector<float>& values) {
|
||||
int length = indices.size();
|
||||
for (int index = 0; index < length; ++index) {
|
||||
for(int index = 0; index < length; ++index) {
|
||||
out[indices[index]] = values[index];
|
||||
}
|
||||
}
|
||||
@ -824,26 +848,26 @@ void LSTMCellForward(Tensor out_, std::vector<Tensor> inputs) {
|
||||
const float* b = inputs[3]->data();
|
||||
const float* mask = inputs.size() > 4 ? inputs[4]->data() : nullptr;
|
||||
|
||||
for (int j = 0; j < rows; ++j) {
|
||||
for(int j = 0; j < rows; ++j) {
|
||||
float m = !mask || mask[j];
|
||||
|
||||
float* rowOut = out + j*cols;
|
||||
const float* rowCell = cell + j*cols;
|
||||
float* rowOut = out + j * cols;
|
||||
const float* rowCell = cell + j * cols;
|
||||
|
||||
const float* xWrow = xW + j*cols*4;
|
||||
const float* sUrow = sU + j*cols*4;
|
||||
const float* xWrow = xW + j * cols * 4;
|
||||
const float* sUrow = sU + j * cols * 4;
|
||||
|
||||
for (int i = 0; i < cols; ++i) {
|
||||
for(int i = 0; i < cols; ++i) {
|
||||
float gf = stableLogit(xWrow[i] + sUrow[i] + b[i]);
|
||||
|
||||
int k = i + cols;
|
||||
float gi = stableLogit(xWrow[k] + sUrow[k] + b[k]);
|
||||
|
||||
int l = i + 2*cols;
|
||||
int l = i + 2 * cols;
|
||||
float gc = std::tanh(xWrow[l] + sUrow[l] + b[l]);
|
||||
|
||||
float cout = gf*rowCell[i] + gi*gc;
|
||||
rowOut[i] = m*cout + (1-m)*rowCell[i];
|
||||
float cout = gf * rowCell[i] + gi * gc;
|
||||
rowOut[i] = m * cout + (1 - m) * rowCell[i];
|
||||
}
|
||||
}
|
||||
}
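
Written out, the cell update in the loop above and the output step in the following hunk are the standard LSTM equations (a summary of the code, with stableLogit read as the sigmoid):

f = sigmoid(xW_f + sU_f + b_f),  i = sigmoid(xW_i + sU_i + b_i),  c~ = tanh(xW_c + sU_c + b_c)
c_t = f * c_{t-1} + i * c~        (blended back to c_{t-1} where the batch mask m is 0)
o = sigmoid(xW_o + sU_o + b_o),  h_t = o * tanh(c_t)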
|
||||
@ -858,15 +882,15 @@ void LSTMOutputForward(Tensor out_, std::vector<Tensor> inputs) {
|
||||
const float* sU = inputs[2]->data();
|
||||
const float* b = inputs[3]->data();
|
||||
|
||||
for (int j = 0; j <rows; ++j) {
|
||||
float* rowOut = out + j*cols;
|
||||
const float* rowCell = cell + j*cols;
|
||||
for(int j = 0; j < rows; ++j) {
|
||||
float* rowOut = out + j * cols;
|
||||
const float* rowCell = cell + j * cols;
|
||||
|
||||
const float* xWrow = xW + j*cols*4;
|
||||
const float* sUrow = sU + j*cols*4;
|
||||
const float* xWrow = xW + j * cols * 4;
|
||||
const float* sUrow = sU + j * cols * 4;
|
||||
|
||||
for (int i = 0; i < cols; ++i) {
|
||||
int k = i + 3*cols;
|
||||
for(int i = 0; i < cols; ++i) {
|
||||
int k = i + 3 * cols;
|
||||
float go = stableLogit(xWrow[k] + sUrow[k] + b[k]);
|
||||
|
||||
rowOut[i] = go * std::tanh(rowCell[i]);
|
||||
@ -893,52 +917,70 @@ void LSTMCellBackward(std::vector<Tensor> outputs,
|
||||
const float* mask = inputs.size() > 4 ? inputs[4]->data() : nullptr;
|
||||
const float* adj = adj_->data();
|
||||
|
||||
for (int j = 0; j <rows; ++j) {
|
||||
for(int j = 0; j < rows; ++j) {
|
||||
float m = !mask || mask[j];
|
||||
|
||||
float* rowOutCell = outCell + j*cols;
|
||||
float* rowOutXW = outXW + j*cols*4;
|
||||
float* rowOutSU = outSU + j*cols*4;
|
||||
float* rowOutCell = outCell + j * cols;
|
||||
float* rowOutXW = outXW + j * cols * 4;
|
||||
float* rowOutSU = outSU + j * cols * 4;
|
||||
|
||||
const float* rowCell = cell + j*cols;
|
||||
const float* xWrow = xW + j*cols*4;
|
||||
const float* sUrow = sU + j*cols*4;
|
||||
const float* rowCell = cell + j * cols;
|
||||
const float* xWrow = xW + j * cols * 4;
|
||||
const float* sUrow = sU + j * cols * 4;
|
||||
|
||||
const float* rowAdj = adj + j*cols;
|
||||
const float* rowAdj = adj + j * cols;
|
||||
|
||||
for (int i = 0; i < cols; ++i) {
|
||||
for(int i = 0; i < cols; ++i) {
|
||||
float gf = stableLogit(xWrow[i] + sUrow[i] + b[i]);
|
||||
|
||||
int k = i + cols;
|
||||
float gi = stableLogit(xWrow[k] + sUrow[k] + b[k]);
|
||||
|
||||
int l = i + 2*cols;
|
||||
int l = i + 2 * cols;
|
||||
float gc = std::tanh(xWrow[l] + sUrow[l] + b[l]);
|
||||
|
||||
float adj = rowAdj[i];
|
||||
|
||||
// dc/dx_{t-1}
|
||||
if (outCell) {
|
||||
rowOutCell[i] += (m*gf - m + 1)*adj;
|
||||
if(outCell) {
|
||||
rowOutCell[i] += (m * gf - m + 1) * adj;
|
||||
}
|
||||
|
||||
// dc/d(b_f) = dc/d(xW_f) ...
|
||||
float dcdxf = m*rowCell[i] * gf*(1-gf) * adj;
|
||||
if (outXW) { rowOutXW[i] += dcdxf; }
|
||||
if (outSU) { rowOutSU[i] += dcdxf; }
|
||||
if (outB) { outB[i] += dcdxf; }
|
||||
float dcdxf = m * rowCell[i] * gf * (1 - gf) * adj;
|
||||
if(outXW) {
|
||||
rowOutXW[i] += dcdxf;
|
||||
}
|
||||
if(outSU) {
|
||||
rowOutSU[i] += dcdxf;
|
||||
}
|
||||
if(outB) {
|
||||
outB[i] += dcdxf;
|
||||
}
|
||||
|
||||
// dc/d(b_i) ...
|
||||
float dcdb_i = m * gc * gi*(1-gi) * adj;
|
||||
if (outXW) { rowOutXW[k] += dcdb_i; }
|
||||
if (outSU) { rowOutSU[k] += dcdb_i; }
|
||||
if (outB) { outB[k] += dcdb_i; }
|
||||
float dcdb_i = m * gc * gi * (1 - gi) * adj;
|
||||
if(outXW) {
|
||||
rowOutXW[k] += dcdb_i;
|
||||
}
|
||||
if(outSU) {
|
||||
rowOutSU[k] += dcdb_i;
|
||||
}
|
||||
if(outB) {
|
||||
outB[k] += dcdb_i;
|
||||
}
|
||||
|
||||
// dc/d(b_c) ...
|
||||
float dcdxc = m * gi * (1 - gc*gc) * adj;
|
||||
if (outXW) { rowOutXW[l] += dcdxc; }
|
||||
if (outSU) { rowOutSU[l] += dcdxc; }
|
||||
if (outB) { outB[l] += dcdxc; }
|
||||
float dcdxc = m * gi * (1 - gc * gc) * adj;
|
||||
if(outXW) {
|
||||
rowOutXW[l] += dcdxc;
|
||||
}
|
||||
if(outSU) {
|
||||
rowOutSU[l] += dcdxc;
|
||||
}
|
||||
if(outB) {
|
||||
outB[l] += dcdxc;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -961,19 +1003,19 @@ void LSTMOutputBackward(std::vector<Tensor> outputs,
|
||||
|
||||
const float* adj = adj_->data();
|
||||
|
||||
for (int j = 0; j < rows; ++j) {
|
||||
float* rowOutCell = outCell + j*cols;
|
||||
float* rowOutXW = outXW + j*cols*4;
|
||||
float* rowOutSU = outSU + j*cols*4;
|
||||
for(int j = 0; j < rows; ++j) {
|
||||
float* rowOutCell = outCell + j * cols;
|
||||
float* rowOutXW = outXW + j * cols * 4;
|
||||
float* rowOutSU = outSU + j * cols * 4;
|
||||
|
||||
const float* rowCell = cell + j*cols;
|
||||
const float* xWrow = xW + j*cols*4;
|
||||
const float* sUrow = sU + j*cols*4;
|
||||
const float* rowCell = cell + j * cols;
|
||||
const float* xWrow = xW + j * cols * 4;
|
||||
const float* sUrow = sU + j * cols * 4;
|
||||
|
||||
const float* rowAdj = adj + j*cols;
|
||||
const float* rowAdj = adj + j * cols;
|
||||
|
||||
for (int i = 0; i < cols; ++i) {
|
||||
int k = i + 3*cols;
|
||||
for(int i = 0; i < cols; ++i) {
|
||||
int k = i + 3 * cols;
|
||||
float go = stableLogit(xWrow[k] + sUrow[k] + b[k]);
|
||||
|
||||
float t = std::tanh(rowCell[i]);
|
||||
@ -981,15 +1023,21 @@ void LSTMOutputBackward(std::vector<Tensor> outputs,
|
||||
float adj = rowAdj[i];
|
||||
|
||||
// dc/dc_{t-1}
|
||||
if (outCell) {
|
||||
rowOutCell[i] += go * (1 - t*t) * adj;
|
||||
if(outCell) {
|
||||
rowOutCell[i] += go * (1 - t * t) * adj;
|
||||
}
|
||||
|
||||
// dc/d(b_o) = dc/d(xW_f) ...
|
||||
float dcdxo = t * go*(1-go) * adj;
|
||||
if (outXW) { rowOutXW[k] += dcdxo; }
|
||||
if (outSU) { rowOutSU[k] += dcdxo; }
|
||||
if (outB) { outB[k] += dcdxo; }
|
||||
float dcdxo = t * go * (1 - go) * adj;
|
||||
if(outXW) {
|
||||
rowOutXW[k] += dcdxo;
|
||||
}
|
||||
if(outSU) {
|
||||
rowOutSU[k] += dcdxo;
|
||||
}
|
||||
if(outB) {
|
||||
outB[k] += dcdxo;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1027,6 +1075,5 @@ void PoolingWithMaskingBackward(Tensor adj,
|
||||
bool isEven) {
|
||||
ABORT("Not implemented!");
|
||||
}
|
||||
|
||||
}
|
||||
} // namespace marian
|
||||
|
@ -23,7 +23,7 @@ public:
Device(DeviceId deviceId, size_t alignment = 256)
: deviceId_(deviceId), data_(0), size_(0), alignment_(alignment) {}

virtual ~Device() {};
virtual ~Device(){};

virtual void reserve(size_t size) = 0;

@ -35,30 +35,31 @@ public:
|
||||
};
|
||||
|
||||
namespace gpu {
|
||||
class Device : public marian::Device {
|
||||
public:
|
||||
Device(DeviceId deviceId, size_t alignment = 256)
|
||||
class Device : public marian::Device {
|
||||
public:
|
||||
Device(DeviceId deviceId, size_t alignment = 256)
|
||||
: marian::Device(deviceId, alignment) {}
|
||||
|
||||
~Device();
|
||||
~Device();
|
||||
|
||||
void reserve(size_t size);
|
||||
};
|
||||
void reserve(size_t size);
|
||||
};
|
||||
}
|
||||
|
||||
namespace cpu {
|
||||
class Device : public marian::Device {
|
||||
public:
|
||||
Device(DeviceId deviceId, size_t alignment = 256)
|
||||
class Device : public marian::Device {
|
||||
public:
|
||||
Device(DeviceId deviceId, size_t alignment = 256)
|
||||
: marian::Device(deviceId, alignment) {}
|
||||
|
||||
~Device();
|
||||
~Device();
|
||||
|
||||
void reserve(size_t size);
|
||||
};
|
||||
void reserve(size_t size);
|
||||
};
|
||||
}
|
||||
|
||||
static inline Ptr<Device> DispatchDevice(DeviceId deviceId, size_t alignment = 256) {
|
||||
static inline Ptr<Device> DispatchDevice(DeviceId deviceId,
|
||||
size_t alignment = 256) {
|
||||
#ifdef CUDA_FOUND
|
||||
if(deviceId.type == DeviceType::gpu)
|
||||
return New<gpu::Device>(deviceId, alignment);
|
||||
@ -71,5 +72,4 @@ static inline Ptr<Device> DispatchDevice(DeviceId deviceId, size_t alignment = 2
|
||||
return New<cpu::Device>(deviceId, alignment);
|
||||
#endif
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -2,204 +2,250 @@
|
||||
|
||||
#ifdef CUDA_FOUND
|
||||
|
||||
#define DISPATCH1(Function, Arg1) \
|
||||
namespace gpu { \
|
||||
void Function(Arg1); \
|
||||
} \
|
||||
namespace cpu { \
|
||||
void Function(Arg1); \
|
||||
} \
|
||||
void Function(Arg1 arg1) { \
|
||||
#define DISPATCH1(Function, Arg1) \
|
||||
namespace gpu { \
|
||||
void Function(Arg1); \
|
||||
} \
|
||||
namespace cpu { \
|
||||
void Function(Arg1); \
|
||||
} \
|
||||
void Function(Arg1 arg1) { \
|
||||
if(arg1->getBackend()->getDevice().type == DeviceType::gpu) \
|
||||
gpu::Function(arg1); \
|
||||
else \
|
||||
cpu::Function(arg1); \
|
||||
gpu::Function(arg1); \
|
||||
else \
|
||||
cpu::Function(arg1); \
|
||||
}
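
As a usage illustration (the call is hypothetical, not taken from this diff), DISPATCH1(Foo, marian::Tensor) under CUDA_FOUND declares gpu::Foo and cpu::Foo and defines a front-end Foo that routes on the first argument's backend, roughly:

namespace gpu { void Foo(marian::Tensor); }
namespace cpu { void Foo(marian::Tensor); }
void Foo(marian::Tensor arg1) {
  // Route to the GPU or CPU implementation depending on the tensor's device.
  if(arg1->getBackend()->getDevice().type == DeviceType::gpu)
    gpu::Foo(arg1);
  else
    cpu::Foo(arg1);
}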
|
||||
|
||||
#define DISPATCH2(Function, Arg1, Arg2) \
|
||||
namespace gpu { \
|
||||
void Function(Arg1, Arg2); \
|
||||
} \
|
||||
namespace cpu { \
|
||||
void Function(Arg1, Arg2); \
|
||||
} \
|
||||
static inline void Function(Arg1 arg1, Arg2 arg2) { \
|
||||
#define DISPATCH2(Function, Arg1, Arg2) \
|
||||
namespace gpu { \
|
||||
void Function(Arg1, Arg2); \
|
||||
} \
|
||||
namespace cpu { \
|
||||
void Function(Arg1, Arg2); \
|
||||
} \
|
||||
static inline void Function(Arg1 arg1, Arg2 arg2) { \
|
||||
if(arg1->getBackend()->getDevice().type == DeviceType::gpu) \
|
||||
gpu::Function(arg1, arg2); \
|
||||
else \
|
||||
cpu::Function(arg1, arg2); \
|
||||
gpu::Function(arg1, arg2); \
|
||||
else \
|
||||
cpu::Function(arg1, arg2); \
|
||||
}
|
||||
|
||||
#define DISPATCH3(Function, Arg1, Arg2, Arg3) \
|
||||
namespace gpu { \
|
||||
void Function(Arg1, Arg2, Arg3); \
|
||||
} \
|
||||
namespace cpu { \
|
||||
void Function(Arg1, Arg2, Arg3); \
|
||||
} \
|
||||
#define DISPATCH3(Function, Arg1, Arg2, Arg3) \
|
||||
namespace gpu { \
|
||||
void Function(Arg1, Arg2, Arg3); \
|
||||
} \
|
||||
namespace cpu { \
|
||||
void Function(Arg1, Arg2, Arg3); \
|
||||
} \
|
||||
static inline void Function(Arg1 arg1, Arg2 arg2, Arg3 arg3) { \
|
||||
if(arg1->getBackend()->getDevice().type == DeviceType::gpu) \
|
||||
gpu::Function(arg1, arg2, arg3); \
|
||||
else \
|
||||
cpu::Function(arg1, arg2, arg3); \
|
||||
if(arg1->getBackend()->getDevice().type == DeviceType::gpu) \
|
||||
gpu::Function(arg1, arg2, arg3); \
|
||||
else \
|
||||
cpu::Function(arg1, arg2, arg3); \
|
||||
}
|
||||
|
||||
#define DISPATCH4(Function, Arg1, Arg2, Arg3, Arg4) \
|
||||
namespace gpu { \
|
||||
void Function(Arg1, Arg2, Arg3, Arg4); \
|
||||
} \
|
||||
namespace cpu { \
|
||||
void Function(Arg1, Arg2, Arg3, Arg4); \
|
||||
} \
|
||||
#define DISPATCH4(Function, Arg1, Arg2, Arg3, Arg4) \
|
||||
namespace gpu { \
|
||||
void Function(Arg1, Arg2, Arg3, Arg4); \
|
||||
} \
|
||||
namespace cpu { \
|
||||
void Function(Arg1, Arg2, Arg3, Arg4); \
|
||||
} \
|
||||
static inline void Function(Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4) { \
|
||||
if(arg1->getBackend()->getDevice().type == DeviceType::gpu) \
|
||||
gpu::Function(arg1, arg2, arg3, arg4); \
|
||||
else \
|
||||
cpu::Function(arg1, arg2, arg3, arg4); \
|
||||
if(arg1->getBackend()->getDevice().type == DeviceType::gpu) \
|
||||
gpu::Function(arg1, arg2, arg3, arg4); \
|
||||
else \
|
||||
cpu::Function(arg1, arg2, arg3, arg4); \
|
||||
}
|
||||
|
||||
#define DISPATCH5(Function, Arg1, Arg2, Arg3, Arg4, Arg5) \
|
||||
namespace gpu { \
|
||||
void Function(Arg1, Arg2, Arg3, Arg4, Arg5); \
|
||||
} \
|
||||
namespace cpu { \
|
||||
void Function(Arg1, Arg2, Arg3, Arg4, Arg5); \
|
||||
} \
|
||||
static inline void Function(Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5) { \
|
||||
#define DISPATCH5(Function, Arg1, Arg2, Arg3, Arg4, Arg5) \
|
||||
namespace gpu { \
|
||||
void Function(Arg1, Arg2, Arg3, Arg4, Arg5); \
|
||||
} \
|
||||
namespace cpu { \
|
||||
void Function(Arg1, Arg2, Arg3, Arg4, Arg5); \
|
||||
} \
|
||||
static inline void Function( \
|
||||
Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5) { \
|
||||
if(arg1->getBackend()->getDevice().type == DeviceType::gpu) \
|
||||
gpu::Function(arg1, arg2, arg3, arg4, arg5); \
|
||||
else \
|
||||
cpu::Function(arg1, arg2, arg3, arg4, arg5); \
|
||||
gpu::Function(arg1, arg2, arg3, arg4, arg5); \
|
||||
else \
|
||||
cpu::Function(arg1, arg2, arg3, arg4, arg5); \
|
||||
}
|
||||
|
||||
#define DISPATCH6(Function, Arg1, Arg2, Arg3, Arg4, Arg5, Arg6) \
|
||||
namespace gpu { \
|
||||
void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6); \
|
||||
} \
|
||||
namespace cpu { \
|
||||
void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6); \
|
||||
} \
|
||||
static inline void Function(Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5, Arg6 arg6) { \
|
||||
if(arg1->getBackend()->getDevice().type == DeviceType::gpu) \
|
||||
gpu::Function(arg1, arg2, arg3, arg4, arg5, arg6); \
|
||||
else \
|
||||
cpu::Function(arg1, arg2, arg3, arg4, arg5, arg6); \
|
||||
#define DISPATCH6(Function, Arg1, Arg2, Arg3, Arg4, Arg5, Arg6) \
|
||||
namespace gpu { \
|
||||
void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6); \
|
||||
} \
|
||||
namespace cpu { \
|
||||
void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6); \
|
||||
} \
|
||||
static inline void Function( \
|
||||
Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5, Arg6 arg6) { \
|
||||
if(arg1->getBackend()->getDevice().type == DeviceType::gpu) \
|
||||
gpu::Function(arg1, arg2, arg3, arg4, arg5, arg6); \
|
||||
else \
|
||||
cpu::Function(arg1, arg2, arg3, arg4, arg5, arg6); \
|
||||
}
|
||||
|
||||
#define DISPATCH7(Function, Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7) \
|
||||
namespace gpu { \
|
||||
void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7); \
|
||||
} \
|
||||
namespace cpu { \
|
||||
void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7); \
|
||||
} \
|
||||
static inline void Function(Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5, Arg6 arg6, Arg7 arg7) { \
|
||||
if(arg1->getBackend()->getDevice().type == DeviceType::gpu) \
|
||||
gpu::Function(arg1, arg2, arg3, arg4, arg5, arg6, arg7); \
|
||||
else \
|
||||
cpu::Function(arg1, arg2, arg3, arg4, arg5, arg6, arg7); \
|
||||
namespace gpu { \
|
||||
void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7); \
|
||||
} \
|
||||
namespace cpu { \
|
||||
void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7); \
|
||||
} \
|
||||
static inline void Function(Arg1 arg1, \
|
||||
Arg2 arg2, \
|
||||
Arg3 arg3, \
|
||||
Arg4 arg4, \
|
||||
Arg5 arg5, \
|
||||
Arg6 arg6, \
|
||||
Arg7 arg7) { \
|
||||
if(arg1->getBackend()->getDevice().type == DeviceType::gpu) \
|
||||
gpu::Function(arg1, arg2, arg3, arg4, arg5, arg6, arg7); \
|
||||
else \
|
||||
cpu::Function(arg1, arg2, arg3, arg4, arg5, arg6, arg7); \
|
||||
}
|
||||
|
||||
#define DISPATCH8(Function, Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8) \
namespace gpu { \
void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8); \
} \
namespace cpu { \
void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8); \
} \
static inline void Function(Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5, Arg6 arg6, Arg7 arg7, Arg8 arg8) { \
if(arg1->getBackend()->getDevice().type == DeviceType::gpu) \
gpu::Function(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8); \
else \
cpu::Function(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8); \
namespace gpu { \
void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8); \
} \
namespace cpu { \
void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8); \
} \
static inline void Function(Arg1 arg1, \
Arg2 arg2, \
Arg3 arg3, \
Arg4 arg4, \
Arg5 arg5, \
Arg6 arg6, \
Arg7 arg7, \
Arg8 arg8) { \
if(arg1->getBackend()->getDevice().type == DeviceType::gpu) \
gpu::Function(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8); \
else \
cpu::Function(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8); \
}

#define DISPATCH9(Function, Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8, Arg9) \
namespace gpu { \
void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8, Arg9); \
} \
namespace cpu { \
void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8, Arg9); \
} \
static inline void Function(Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5, Arg6 arg6, Arg7 arg7, Arg8 arg8, Arg9 arg9) { \
if(arg1->getBackend()->getDevice().type == DeviceType::gpu) \
#define DISPATCH9( \
Function, Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8, Arg9) \
namespace gpu { \
void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8, Arg9); \
} \
namespace cpu { \
void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8, Arg9); \
} \
static inline void Function(Arg1 arg1, \
Arg2 arg2, \
Arg3 arg3, \
Arg4 arg4, \
Arg5 arg5, \
Arg6 arg6, \
Arg7 arg7, \
Arg8 arg8, \
Arg9 arg9) { \
if(arg1->getBackend()->getDevice().type == DeviceType::gpu) \
gpu::Function(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9); \
else \
else \
cpu::Function(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9); \
}

#else

#define DISPATCH1(Function, Arg1) \
namespace cpu { \
void Function(Arg1); \
} \
void Function(Arg1 arg1) { \
cpu::Function(arg1); \
}
namespace cpu { \
void Function(Arg1); \
} \
void Function(Arg1 arg1) { cpu::Function(arg1); }

#define DISPATCH2(Function, Arg1, Arg2) \
namespace cpu { \
void Function(Arg1, Arg2); \
} \
#define DISPATCH2(Function, Arg1, Arg2) \
namespace cpu { \
void Function(Arg1, Arg2); \
} \
static inline void Function(Arg1 arg1, Arg2 arg2) { \
cpu::Function(arg1, arg2); \
cpu::Function(arg1, arg2); \
}

#define DISPATCH3(Function, Arg1, Arg2, Arg3) \
namespace cpu { \
void Function(Arg1, Arg2, Arg3); \
} \
#define DISPATCH3(Function, Arg1, Arg2, Arg3) \
namespace cpu { \
void Function(Arg1, Arg2, Arg3); \
} \
static inline void Function(Arg1 arg1, Arg2 arg2, Arg3 arg3) { \
cpu::Function(arg1, arg2, arg3); \
cpu::Function(arg1, arg2, arg3); \
}

#define DISPATCH4(Function, Arg1, Arg2, Arg3, Arg4) \
namespace cpu { \
void Function(Arg1, Arg2, Arg3, Arg4); \
} \
#define DISPATCH4(Function, Arg1, Arg2, Arg3, Arg4) \
namespace cpu { \
void Function(Arg1, Arg2, Arg3, Arg4); \
} \
static inline void Function(Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4) { \
cpu::Function(arg1, arg2, arg3, arg4); \
cpu::Function(arg1, arg2, arg3, arg4); \
}

#define DISPATCH5(Function, Arg1, Arg2, Arg3, Arg4, Arg5) \
namespace cpu { \
void Function(Arg1, Arg2, Arg3, Arg4, Arg5); \
} \
static inline void Function(Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5) { \
cpu::Function(arg1, arg2, arg3, arg4, arg5); \
#define DISPATCH5(Function, Arg1, Arg2, Arg3, Arg4, Arg5) \
namespace cpu { \
void Function(Arg1, Arg2, Arg3, Arg4, Arg5); \
} \
static inline void Function( \
Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5) { \
cpu::Function(arg1, arg2, arg3, arg4, arg5); \
}

#define DISPATCH6(Function, Arg1, Arg2, Arg3, Arg4, Arg5, Arg6) \
namespace cpu { \
void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6); \
} \
static inline void Function(Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5, Arg6 arg6) { \
cpu::Function(arg1, arg2, arg3, arg4, arg5, arg6); \
#define DISPATCH6(Function, Arg1, Arg2, Arg3, Arg4, Arg5, Arg6) \
namespace cpu { \
void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6); \
} \
static inline void Function( \
Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5, Arg6 arg6) { \
cpu::Function(arg1, arg2, arg3, arg4, arg5, arg6); \
}

#define DISPATCH7(Function, Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7) \
namespace cpu { \
void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7); \
} \
static inline void Function(Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5, Arg6 arg6, Arg7 arg7) { \
cpu::Function(arg1, arg2, arg3, arg4, arg5, arg6, arg7); \
namespace cpu { \
void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7); \
} \
static inline void Function(Arg1 arg1, \
Arg2 arg2, \
Arg3 arg3, \
Arg4 arg4, \
Arg5 arg5, \
Arg6 arg6, \
Arg7 arg7) { \
cpu::Function(arg1, arg2, arg3, arg4, arg5, arg6, arg7); \
}

#define DISPATCH8(Function, Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8) \
namespace cpu { \
void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8); \
} \
static inline void Function(Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5, Arg6 arg6, Arg7 arg7, Arg8 arg8) { \
cpu::Function(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8); \
namespace cpu { \
void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8); \
} \
static inline void Function(Arg1 arg1, \
Arg2 arg2, \
Arg3 arg3, \
Arg4 arg4, \
Arg5 arg5, \
Arg6 arg6, \
Arg7 arg7, \
Arg8 arg8) { \
cpu::Function(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8); \
}

#define DISPATCH9(Function, Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8, Arg9) \
namespace cpu { \
void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8, Arg9); \
} \
static inline void Function(Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5, Arg6 arg6, Arg7 arg7, Arg8 arg8, Arg9 arg9) { \
cpu::Function(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9); \
#define DISPATCH9( \
Function, Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8, Arg9) \
namespace cpu { \
void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8, Arg9); \
} \
static inline void Function(Arg1 arg1, \
Arg2 arg2, \
Arg3 arg3, \
Arg4 arg4, \
Arg5 arg5, \
Arg6 arg6, \
Arg7 arg7, \
Arg8 arg8, \
Arg9 arg9) { \
cpu::Function(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9); \
}

#endif

@ -9,8 +9,8 @@

#include "functional/functional.h"
#include "functional/shape.h"
#include "functional/tmp.h"
#include "functional/tensor.h"
#include "functional/tmp.h"

namespace marian {

@ -22,7 +22,6 @@ __global__ void gAddGeneric(Functor functor,
functional::Tensor<float> out,
functional::Array<functional::Tensor<float>, K> ins,
float scale = 1.0) {

int outLength = out.shape().elements();
bool same = outLength == full.elements();
for(int i = 0; i < K; ++i)
@ -37,14 +36,12 @@ __global__ void gAddGeneric(Functor functor,
for(int bid = 0; bid < outLength; bid += blockDim.x * gridDim.x) {
int index = bid + blockDim.x * blockIdx.x + threadIdx.x;
if(index < outLength) {

if(same) {
out[index] += functional::apply(functor, ins, index) * scale;
} else {
out.shape().dims(index, dims);
out[index] += functional::loops(functor, ins, len, dims) * scale;
}

}
}
}
@ -81,7 +78,6 @@ __global__ void gAddReduce(Functor functor,
functional::Tensor<float> out,
functional::Array<functional::Tensor<float>, K> ins,
float scale = 1.0) {

int rows = full.elements() / full.back();
int cols = full.back();

@ -133,12 +129,8 @@ __global__ void gAddReduce(Functor functor,
}
}

template <class Functor, class ...Tensors>
void Add(Functor functor,
float scale,
marian::Tensor out,
Tensors... tensors) {

template <class Functor, class... Tensors>
void Add(Functor functor, float scale, marian::Tensor out, Tensors... tensors) {
cudaSetDevice(out->getDevice().no);

auto full = marian::Shape::broadcast({out, tensors...});
@ -148,7 +140,7 @@ void Add(Functor functor,
constexpr size_t K = sizeof...(Tensors);

functional::Tensor<float> gOut = out;
functional::Array<functional::Tensor<float>, K> gIns = {tensors ...};
functional::Array<functional::Tensor<float>, K> gIns = {tensors...};

if(full.back() != 1 && out->shape().back() == 1) {
size_t m = full.elements() / length;
@ -180,6 +172,5 @@ void Add(Functor functor,
}

#include "tensors/gpu/add.inc"

}
}

@ -6,11 +6,7 @@ namespace marian {

namespace gpu {

template <class Functor, class ...Tensors>
void Add(Functor functor,
float scale,
marian::Tensor out,
Tensors... tensors);

template <class Functor, class... Tensors>
void Add(Functor functor, float scale, marian::Tensor out, Tensors... tensors);
}
}

@ -4,45 +4,49 @@
|
||||
#include "tensors/gpu/cuda_helpers.h"
|
||||
|
||||
namespace marian {
|
||||
namespace gpu {
|
||||
template <typename T>
|
||||
void copy(Ptr<Backend> backend, const T* begin, const T* end, T* dest) {
|
||||
CUDA_CHECK(cudaSetDevice(backend->getDevice().no));
|
||||
CudaCopy(begin, end, dest);
|
||||
CUDA_CHECK(cudaStreamSynchronize(0));
|
||||
namespace gpu {
|
||||
template <typename T>
|
||||
void copy(Ptr<Backend> backend, const T* begin, const T* end, T* dest) {
|
||||
CUDA_CHECK(cudaSetDevice(backend->getDevice().no));
|
||||
CudaCopy(begin, end, dest);
|
||||
CUDA_CHECK(cudaStreamSynchronize(0));
|
||||
}
|
||||
|
||||
template void copy<float>(Ptr<Backend> backend,
|
||||
const float* begin,
|
||||
const float* end,
|
||||
float* dest);
|
||||
template void copy<int>(Ptr<Backend> backend,
|
||||
const int* begin,
|
||||
const int* end,
|
||||
int* dest);
|
||||
|
||||
__global__ void gFill(float* d_in, int size, float val) {
|
||||
for(int bid = 0; bid < size; bid += blockDim.x * gridDim.x) {
|
||||
int index = bid + threadIdx.x + blockDim.x * blockIdx.x;
|
||||
if(index < size) {
|
||||
d_in[index] = val;
|
||||
}
|
||||
|
||||
template void copy<float>(Ptr<Backend> backend, const float* begin, const float* end, float* dest);
|
||||
template void copy<int>(Ptr<Backend> backend, const int* begin, const int* end, int* dest);
|
||||
|
||||
|
||||
__global__ void gFill(float *d_in, int size, float val) {
|
||||
for(int bid = 0; bid < size; bid += blockDim.x * gridDim.x) {
|
||||
int index = bid + threadIdx.x + blockDim.x * blockIdx.x;
|
||||
if(index < size) {
|
||||
d_in[index] = val;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void fill(Ptr<Backend> backend, float* begin, float* end, float value) {
|
||||
CUDA_CHECK(cudaSetDevice(backend->getDevice().no));
|
||||
int size = end - begin;
|
||||
int threads = std::min(512, size);
|
||||
int blocks = (size / threads) + (size % threads != 0);
|
||||
gFill<<<blocks, threads>>>(begin, size, value);
|
||||
CUDA_CHECK(cudaStreamSynchronize(0));
|
||||
}
|
||||
|
||||
void setSparse(Ptr<Backend> backend,
|
||||
const std::vector<size_t>& keys,
|
||||
const std::vector<float>& values,
|
||||
float* data) {
|
||||
CUDA_CHECK(cudaSetDevice(backend->getDevice().no));
|
||||
ABORT("no SetSparse");
|
||||
//gpu::SetSparse(data, keys, values);
|
||||
CUDA_CHECK(cudaStreamSynchronize(0));
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
void fill(Ptr<Backend> backend, float* begin, float* end, float value) {
|
||||
CUDA_CHECK(cudaSetDevice(backend->getDevice().no));
|
||||
int size = end - begin;
|
||||
int threads = std::min(512, size);
|
||||
int blocks = (size / threads) + (size % threads != 0);
|
||||
gFill<<<blocks, threads>>>(begin, size, value);
|
||||
CUDA_CHECK(cudaStreamSynchronize(0));
|
||||
}
|
||||
|
||||
void setSparse(Ptr<Backend> backend,
|
||||
const std::vector<size_t>& keys,
|
||||
const std::vector<float>& values,
|
||||
float* data) {
|
||||
CUDA_CHECK(cudaSetDevice(backend->getDevice().no));
|
||||
ABORT("no SetSparse");
|
||||
// gpu::SetSparse(data, keys, values);
|
||||
CUDA_CHECK(cudaStreamSynchronize(0));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -3,12 +3,15 @@
|
||||
#include "tensors/backend.h"
|
||||
|
||||
namespace marian {
|
||||
namespace gpu {
|
||||
template <typename T>
|
||||
void copy(Ptr<Backend> backend, const T* begin, const T* end, T* dest);
|
||||
|
||||
void fill(Ptr<Backend> backend, float* begin, float* end, float value);
|
||||
namespace gpu {
|
||||
template <typename T>
|
||||
void copy(Ptr<Backend> backend, const T* begin, const T* end, T* dest);
|
||||
|
||||
void setSparse(Ptr<Backend> backend, const std::vector<size_t>&, const std::vector<float>&, float*);
|
||||
}
|
||||
void fill(Ptr<Backend> backend, float* begin, float* end, float value);
|
||||
|
||||
void setSparse(Ptr<Backend> backend,
|
||||
const std::vector<size_t>&,
|
||||
const std::vector<float>&,
|
||||
float*);
|
||||
}
|
||||
}
|
||||
|
@ -25,13 +25,9 @@ public:
|
||||
setHandles();
|
||||
}
|
||||
|
||||
void setDevice() {
|
||||
cudaSetDevice(deviceId_.no);
|
||||
}
|
||||
void setDevice() { cudaSetDevice(deviceId_.no); }
|
||||
|
||||
void synchronize() {
|
||||
cudaStreamSynchronize(0);
|
||||
}
|
||||
void synchronize() { cudaStreamSynchronize(0); }
|
||||
|
||||
cublasHandle_t getCublasHandle() { return cublasHandle_; }
|
||||
|
||||
@ -41,13 +37,11 @@ private:
|
||||
cublasHandle_t cublasHandle_;
|
||||
curandGenerator_t curandGenerator_;
|
||||
|
||||
|
||||
void setHandles() {
|
||||
cublasHandle_ = create_handle();
|
||||
curandGenerator_ = createCurandGenerator();
|
||||
}
|
||||
|
||||
|
||||
curandGenerator_t createCurandGenerator() {
|
||||
cudaSetDevice(deviceId_.no);
|
||||
curandGenerator_t generator;
|
||||
@ -67,6 +61,5 @@ private:
|
||||
return cublasHandle;
|
||||
}
|
||||
};
|
||||
|
||||
}
|
||||
}
|
||||
|
@ -13,9 +13,8 @@ const int MAX_BLOCKS = 65535;
|
||||
#define CUDA_CHECK(ans) \
|
||||
{ gpuAssert((ans), __FILE__, __LINE__); }
|
||||
|
||||
|
||||
inline void gpuAssert(cudaError_t code,
|
||||
const char *file,
|
||||
const char* file,
|
||||
int line,
|
||||
bool abort = true) {
|
||||
if(code != cudaSuccess) {
|
||||
@ -26,8 +25,8 @@ inline void gpuAssert(cudaError_t code,
|
||||
|
||||
template <typename T>
|
||||
void CudaCopy(const T* start, const T* end, T* dest) {
|
||||
CUDA_CHECK(cudaMemcpy((void*)dest, (void*)start, (end - start) * sizeof(T),
|
||||
cudaMemcpyDefault));
|
||||
CUDA_CHECK(cudaMemcpy(
|
||||
(void*)dest, (void*)start, (end - start) * sizeof(T), cudaMemcpyDefault));
|
||||
}
|
||||
|
||||
#define CUSPARSE_CHECK(x) \
|
||||
|
@ -7,34 +7,34 @@
|
||||
namespace marian {
|
||||
namespace gpu {
|
||||
|
||||
Device::~Device() {
|
||||
cudaSetDevice(deviceId_.no);
|
||||
if(data_) {
|
||||
CUDA_CHECK(cudaFree(data_));
|
||||
}
|
||||
cudaDeviceSynchronize();
|
||||
}
|
||||
|
||||
void Device::reserve(size_t size) {
|
||||
size = align(size);
|
||||
cudaSetDevice(deviceId_.no);
|
||||
|
||||
ABORT_IF(size < size_ || size == 0, "New size must be larger than old size and larger than 0");
|
||||
|
||||
if(data_) {
|
||||
// Allocate memory by going through host memory
|
||||
uint8_t *temp = new uint8_t[size_];
|
||||
CUDA_CHECK(cudaMemcpy(temp, data_, size_, cudaMemcpyDeviceToHost));
|
||||
CUDA_CHECK(cudaFree(data_));
|
||||
CUDA_CHECK(cudaMalloc(&data_, size));
|
||||
CUDA_CHECK(cudaMemcpy(data_, temp, size_, cudaMemcpyHostToDevice));
|
||||
delete[] temp;
|
||||
} else {
|
||||
CUDA_CHECK(cudaMalloc(&data_, size));
|
||||
}
|
||||
|
||||
size_ = size;
|
||||
Device::~Device() {
|
||||
cudaSetDevice(deviceId_.no);
|
||||
if(data_) {
|
||||
CUDA_CHECK(cudaFree(data_));
|
||||
}
|
||||
cudaDeviceSynchronize();
|
||||
}
|
||||
|
||||
void Device::reserve(size_t size) {
|
||||
size = align(size);
|
||||
cudaSetDevice(deviceId_.no);
|
||||
|
||||
ABORT_IF(size < size_ || size == 0,
|
||||
"New size must be larger than old size and larger than 0");
|
||||
|
||||
if(data_) {
|
||||
// Allocate memory by going through host memory
|
||||
uint8_t *temp = new uint8_t[size_];
|
||||
CUDA_CHECK(cudaMemcpy(temp, data_, size_, cudaMemcpyDeviceToHost));
|
||||
CUDA_CHECK(cudaFree(data_));
|
||||
CUDA_CHECK(cudaMalloc(&data_, size));
|
||||
CUDA_CHECK(cudaMemcpy(data_, temp, size_, cudaMemcpyHostToDevice));
|
||||
delete[] temp;
|
||||
} else {
|
||||
CUDA_CHECK(cudaMalloc(&data_, size));
|
||||
}
|
||||
|
||||
size_ = size;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -22,31 +22,29 @@
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
|
||||
namespace marian {
|
||||
namespace gpu {
|
||||
|
||||
__global__ void gScale(float* data, int n, float p) {
|
||||
int index = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
|
||||
while(index < n) {
|
||||
data[index] = (data[index] < p) / p;
|
||||
index += gridDim.x * blockDim.x;
|
||||
}
|
||||
}
|
||||
|
||||
void Dropout(Tensor tensor, float p) {
|
||||
auto gpuBackend = std::static_pointer_cast<gpu::Backend>(tensor->getBackend());
|
||||
curandGenerator_t gen = gpuBackend->getCurandGenerator();
|
||||
int n = tensor->size();
|
||||
CURAND_CALL(curandGenerateUniform(gen, tensor->data(), n));
|
||||
|
||||
int numThreads = std::min(n, 512);
|
||||
int numBlocks = n / numThreads + (n % numThreads != 0);
|
||||
|
||||
gScale<<<numBlocks, numThreads>>>(tensor->data(), n, 1.f - p);
|
||||
}
|
||||
namespace gpu {
|
||||
|
||||
__global__ void gScale(float* data, int n, float p) {
|
||||
int index = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
|
||||
while(index < n) {
|
||||
data[index] = (data[index] < p) / p;
|
||||
index += gridDim.x * blockDim.x;
|
||||
}
|
||||
}
|
||||
|
||||
void Dropout(Tensor tensor, float p) {
|
||||
auto gpuBackend
|
||||
= std::static_pointer_cast<gpu::Backend>(tensor->getBackend());
|
||||
curandGenerator_t gen = gpuBackend->getCurandGenerator();
|
||||
int n = tensor->size();
|
||||
CURAND_CALL(curandGenerateUniform(gen, tensor->data(), n));
|
||||
|
||||
int numThreads = std::min(n, 512);
|
||||
int numBlocks = n / numThreads + (n % numThreads != 0);
|
||||
|
||||
gScale<<<numBlocks, numThreads>>>(tensor->data(), n, 1.f - p);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1,6 +1,5 @@
|
||||
|
||||
|
||||
#include "tensors/gpu/element.h"
|
||||
|
||||
#include "tensors/gpu/cuda_helpers.h"
|
||||
#include "functional/array.h"
|
||||
#include "functional/tensor.h"
|
||||
@ -11,9 +10,9 @@ namespace marian {
|
||||
namespace gpu {
|
||||
|
||||
template <size_t K, bool broadcast, class Functor>
|
||||
__global__ void gElement(Functor functor,
|
||||
functional::Array<functional::Tensor<float>, K> tensors) {
|
||||
|
||||
__global__ void gElement(
|
||||
Functor functor,
|
||||
functional::Array<functional::Tensor<float>, K> tensors) {
|
||||
int length = tensors[0].shape().elements();
|
||||
functional::Array<int, functional::Shape::size()> dims;
|
||||
functional::Array<int, K> indices;
|
||||
@ -21,7 +20,6 @@ __global__ void gElement(Functor functor,
|
||||
for(int bid = 0; bid < length; bid += blockDim.x * gridDim.x) {
|
||||
int index = bid + blockDim.x * blockIdx.x + threadIdx.x;
|
||||
if(index < length) {
|
||||
|
||||
indices.fill(index);
|
||||
|
||||
if(broadcast) {
|
||||
@ -35,8 +33,8 @@ __global__ void gElement(Functor functor,
|
||||
}
|
||||
}
|
||||
|
||||
template <class Functor, class ...Tensors>
|
||||
void Element(Functor functor, Tensor out, Tensors ...tensors) {
|
||||
template <class Functor, class... Tensors>
|
||||
void Element(Functor functor, Tensor out, Tensors... tensors) {
|
||||
cudaSetDevice(out->getDevice().no);
|
||||
|
||||
constexpr size_t K = sizeof...(tensors) + 1;
|
||||
@ -57,8 +55,5 @@ void Element(Functor functor, Tensor out, Tensors ...tensors) {
|
||||
}
|
||||
|
||||
#include "tensors/gpu/element.inc"
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -5,8 +5,7 @@
|
||||
namespace marian {
|
||||
namespace gpu {
|
||||
|
||||
template <class Functor, class ...Tensors>
|
||||
void Element(Functor functor, Tensor out, Tensors ...tensors);
|
||||
|
||||
template <class Functor, class... Tensors>
|
||||
void Element(Functor functor, Tensor out, Tensors... tensors);
|
||||
}
|
||||
}
|
||||
|
@ -38,10 +38,11 @@ void Prod(marian::Tensor C,
|
||||
cublasOperation_t opA = transA ? CUBLAS_OP_T : CUBLAS_OP_N;
|
||||
cublasOperation_t opB = transB ? CUBLAS_OP_T : CUBLAS_OP_N;
|
||||
|
||||
auto cublasHandle = std::static_pointer_cast<gpu::Backend>(C->getBackend())->getCublasHandle();
|
||||
auto cublasHandle = std::static_pointer_cast<gpu::Backend>(C->getBackend())
|
||||
->getCublasHandle();
|
||||
|
||||
#if CUDA_VERSION >= 9000
|
||||
//cublasSetMathMode(cublasHandle, CUBLAS_TENSOR_OP_MATH);
|
||||
// cublasSetMathMode(cublasHandle, CUBLAS_TENSOR_OP_MATH);
|
||||
#endif
|
||||
|
||||
cublasSgemm(cublasHandle,
|
||||
@ -59,7 +60,7 @@ void Prod(marian::Tensor C,
|
||||
C->data(),
|
||||
ldc);
|
||||
#if CUDA_VERSION >= 9000
|
||||
//cublasSetMathMode(cublasHandle, CUBLAS_DEFAULT_MATH);
|
||||
// cublasSetMathMode(cublasHandle, CUBLAS_DEFAULT_MATH);
|
||||
#endif
|
||||
}
|
||||
|
||||
@ -96,10 +97,11 @@ void ProdBatched(marian::Tensor C,
|
||||
cublasOperation_t opA = transA ? CUBLAS_OP_T : CUBLAS_OP_N;
|
||||
cublasOperation_t opB = transB ? CUBLAS_OP_T : CUBLAS_OP_N;
|
||||
|
||||
auto cublasHandle = std::static_pointer_cast<gpu::Backend>(C->getBackend())->getCublasHandle();
|
||||
auto cublasHandle = std::static_pointer_cast<gpu::Backend>(C->getBackend())
|
||||
->getCublasHandle();
|
||||
|
||||
#if CUDA_VERSION >= 9000
|
||||
//cublasSetMathMode(cublasHandle, CUBLAS_TENSOR_OP_MATH);
|
||||
// cublasSetMathMode(cublasHandle, CUBLAS_TENSOR_OP_MATH);
|
||||
#endif
|
||||
cublasSgemmStridedBatched(cublasHandle,
|
||||
opB,
|
||||
@ -120,10 +122,8 @@ void ProdBatched(marian::Tensor C,
|
||||
n * m,
|
||||
std::max(batchA, batchB));
|
||||
#if CUDA_VERSION >= 9000
|
||||
//cublasSetMathMode(cublasHandle, CUBLAS_DEFAULT_MATH);
|
||||
// cublasSetMathMode(cublasHandle, CUBLAS_DEFAULT_MATH);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
|
@ -21,6 +21,5 @@ void ProdBatched(marian::Tensor C,
bool transB,
float beta = 0,
float scalar = 1);

}
}

@ -2,8 +2,8 @@
|
||||
#include <cusparse_v2.h>
|
||||
|
||||
#include "common/definitions.h"
|
||||
#include "tensors/tensor.h"
|
||||
#include "kernels/cuda_helpers.h"
|
||||
#include "tensors/tensor.h"
|
||||
|
||||
namespace marian {
|
||||
|
||||
|
@ -28,11 +28,11 @@ __device__ inline float stableLogit(float x) {
|
||||
}
|
||||
|
||||
bool IsNan(Tensor in) {
|
||||
//cudaSetDevice(in->getDevice().no);
|
||||
//thrust::device_ptr<float> begin = thrust::device_pointer_cast(in->data());
|
||||
//thrust::device_ptr<float> end
|
||||
// cudaSetDevice(in->getDevice().no);
|
||||
// thrust::device_ptr<float> begin = thrust::device_pointer_cast(in->data());
|
||||
// thrust::device_ptr<float> end
|
||||
// = thrust::device_pointer_cast(in->data() + in->size());
|
||||
//return thrust::transform_reduce(
|
||||
// return thrust::transform_reduce(
|
||||
// begin, end, isnan_test(), 0, thrust::plus<bool>());
|
||||
return false;
|
||||
}
|
||||
@ -93,10 +93,9 @@ void Concatenate1(Tensor out, const std::vector<Tensor>& inputs) {
|
||||
|
||||
for(auto in : inputs) {
|
||||
ABORT_IF(rows != in->shape().elements() / in->shape().back(),
|
||||
"First dimension must be equal");
|
||||
"First dimension must be equal");
|
||||
int cols_in = in->shape().back();
|
||||
|
||||
|
||||
int blocks = std::min(MAX_BLOCKS, rows);
|
||||
int threads = std::min(MAX_THREADS, cols_in);
|
||||
|
||||
@ -122,7 +121,7 @@ void Split1(std::vector<Tensor>& outputs, const Tensor in) {
|
||||
int cols_in = in->shape().back();
|
||||
for(auto out : outputs) {
|
||||
ABORT_IF(rows != out->shape().elements() / out->shape().back(),
|
||||
"First dimension must be equal");
|
||||
"First dimension must be equal");
|
||||
int cols_out = out->shape().back();
|
||||
|
||||
int blocks = std::min(MAX_BLOCKS, rows);
|
||||
@ -166,10 +165,10 @@ void Deconcatenate(std::vector<Tensor>& outputs, const Tensor in, int ax) {
|
||||
SplitCont(outputs, in, ax);
|
||||
}
|
||||
|
||||
__global__ void gTransposeND(functional::Tensor<float> out,
|
||||
const functional::Tensor<float> in,
|
||||
const functional::Array<int, functional::Shape::size()> permute) {
|
||||
|
||||
__global__ void gTransposeND(
|
||||
functional::Tensor<float> out,
|
||||
const functional::Tensor<float> in,
|
||||
const functional::Array<int, functional::Shape::size()> permute) {
|
||||
constexpr size_t N = functional::Shape::size();
|
||||
functional::Array<int, N> oDims;
|
||||
functional::Array<int, N> pDims;
|
||||
@ -458,7 +457,6 @@ void SoftmaxGrad(Tensor grad, Tensor adj, Tensor val) {
|
||||
int m = grad->shape().elements() / grad->shape().back();
|
||||
int k = grad->shape().back();
|
||||
|
||||
|
||||
int blocks = std::min(MAX_BLOCKS, m);
|
||||
int threads = std::min(MAX_THREADS, k);
|
||||
int shared = sizeof(float) * threads * 2;
|
||||
@ -784,7 +782,9 @@ void Select(Tensor out,
|
||||
int blocks = std::min(MAX_BLOCKS, length / threads + (length % threads != 0));
|
||||
|
||||
auto mp_indices = allocator->alloc<size_t>(indices.size());
|
||||
CudaCopy(indices.data(), indices.data() + indices.size(), mp_indices->data<size_t>());
|
||||
CudaCopy(indices.data(),
|
||||
indices.data() + indices.size(),
|
||||
mp_indices->data<size_t>());
|
||||
|
||||
int axisGPU = axis + functional::Shape::size() - out->shape().size();
|
||||
gSelect<<<blocks, threads>>>(out->data(),
|
||||
@ -810,7 +810,9 @@ void Insert(Tensor out,
|
||||
int blocks = std::min(MAX_BLOCKS, length / threads + (length % threads != 0));
|
||||
|
||||
auto mp_indices = allocator->alloc<size_t>(indices.size());
|
||||
CudaCopy(indices.data(), indices.data() + indices.size(), mp_indices->data<size_t>());
|
||||
CudaCopy(indices.data(),
|
||||
indices.data() + indices.size(),
|
||||
mp_indices->data<size_t>());
|
||||
|
||||
int axisGPU = axis + functional::Shape::size() - out->shape().size();
|
||||
gInsert<<<blocks, threads>>>(out->data(),
|
||||
@ -1174,19 +1176,18 @@ void CrossEntropyPickBackward(Tensor out, Tensor adj, Tensor a, Tensor pick) {
|
||||
out->data(), out->shape(), adj->data(), a->data(), pick->data());
|
||||
}
|
||||
|
||||
|
||||
float L2Norm(Tensor in) {
|
||||
|
||||
cudaSetDevice(in->getDevice().no);
|
||||
|
||||
int size = in->shape().elements();
|
||||
int threads = std::min(MAX_THREADS, size);
|
||||
int blocks = std::min(MAX_BLOCKS, size / threads + (size % threads != 0));
|
||||
int blocks = std::min(MAX_BLOCKS, size / threads + (size % threads != 0));
|
||||
|
||||
uint8_t* data;
|
||||
cudaMalloc(&data, blocks * sizeof(float));
|
||||
Tensor out(new TensorBase(
|
||||
New<MemoryPiece>(data, blocks * sizeof(float)), {1, blocks}, in->getBackend()));
|
||||
Tensor out(new TensorBase(New<MemoryPiece>(data, blocks * sizeof(float)),
|
||||
{1, blocks},
|
||||
in->getBackend()));
|
||||
|
||||
using namespace functional;
|
||||
ReduceAll(_1 * _1, out, in);
|
||||
@ -1203,7 +1204,7 @@ __global__ void gAtt(float* out,
|
||||
int m, // total rows (batch x time x beam)
|
||||
int k, // depth
|
||||
int b, // batch size
|
||||
int t // time of ctx
|
||||
int t // time of ctx
|
||||
) {
|
||||
int rows = m;
|
||||
int cols = k;
|
||||
@ -1255,14 +1256,8 @@ void Att(Tensor out, Tensor va, Tensor context, Tensor state) {
|
||||
int threads = std::min(MAX_THREADS, (int)k);
|
||||
int shared = sizeof(float) * threads * 2;
|
||||
|
||||
gAtt<<<blocks, threads, shared>>>(out->data(),
|
||||
va->data(),
|
||||
context->data(),
|
||||
state->data(),
|
||||
m,
|
||||
k,
|
||||
b,
|
||||
t);
|
||||
gAtt<<<blocks, threads, shared>>>(
|
||||
out->data(), va->data(), context->data(), state->data(), m, k, b, t);
|
||||
}
|
||||
|
||||
__global__ void gAttBack(float* gVa,
|
||||
@ -1576,7 +1571,6 @@ __global__ void gShift(float* out, const float* in, int length, int offset) {
|
||||
}
|
||||
|
||||
void Shift(Tensor out, Tensor in, marian::Shape shift, bool invert) {
|
||||
|
||||
ABORT_IF(in->shape().size() != shift.size(), "bad dimensions");
|
||||
|
||||
int offset = 0;
|
||||
@ -2006,21 +2000,22 @@ __global__ void gMaxPoolingForward(float* out,
|
||||
int lastWidth) {
|
||||
int tid = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
|
||||
if (tid >= outRows * outCols) return;
|
||||
if(tid >= outRows * outCols)
|
||||
return;
|
||||
|
||||
int rowId = tid / outRows;
|
||||
int colId = tid % outRows;
|
||||
|
||||
float* b = in + (rowId * inCols) + (colId * width);
|
||||
float* localMask = mask + (rowId / numKernels) * maskCols + colId * width;
|
||||
float* localMask = mask + (rowId / numKernels) * maskCols + colId * width;
|
||||
|
||||
if (colId == outRows - 1) {
|
||||
if(colId == outRows - 1) {
|
||||
width = lastWidth;
|
||||
}
|
||||
|
||||
float currentMax = b[0] * localMask[0];
|
||||
for (int i = 1; i < width; ++i) {
|
||||
if (b[i] * localMask[i] > currentMax) {
|
||||
for(int i = 1; i < width; ++i) {
|
||||
if(b[i] * localMask[i] > currentMax) {
|
||||
currentMax = b[i] * localMask[i];
|
||||
}
|
||||
}
|
||||
@ -2045,15 +2040,20 @@ void PoolingWithMaskingForward(Tensor out,
|
||||
int outRows = outShape[2];
|
||||
int outCols = outShape[0] * outShape[1];
|
||||
|
||||
int lastWidth = ((inCols - isEven) % width == 0)
|
||||
? width
|
||||
: (inCols - isEven) % width;
|
||||
int lastWidth
|
||||
= ((inCols - isEven) % width == 0) ? width : (inCols - isEven) % width;
|
||||
|
||||
gMaxPoolingForward<<<blocks, threads>>>(
|
||||
out->data(), outRows, outCols,
|
||||
in->data(), inRows, inCols,
|
||||
mask->data(), outShape[1], mask->shape()[2],
|
||||
width, lastWidth);
|
||||
gMaxPoolingForward<<<blocks, threads>>>(out->data(),
|
||||
outRows,
|
||||
outCols,
|
||||
in->data(),
|
||||
inRows,
|
||||
inCols,
|
||||
mask->data(),
|
||||
outShape[1],
|
||||
mask->shape()[2],
|
||||
width,
|
||||
lastWidth);
|
||||
}
|
||||
|
||||
__global__ void gMaxPoolingBackward(float* adj,
|
||||
@ -2067,30 +2067,31 @@ __global__ void gMaxPoolingBackward(float* adj,
|
||||
int numKernels,
|
||||
int maskCols,
|
||||
int width,
|
||||
int lastWidth)
|
||||
{
|
||||
int lastWidth) {
|
||||
int tid = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
|
||||
if (tid >= adjRows * adjCols) return;
|
||||
if(tid >= adjRows * adjCols)
|
||||
return;
|
||||
|
||||
int rowId = tid / adjRows;
|
||||
int colId = tid % adjRows;
|
||||
|
||||
float* b = in + (rowId * inCols) + (colId * width);
|
||||
|
||||
if (colId == adjRows - 1) {
|
||||
if(colId == adjRows - 1) {
|
||||
width = lastWidth;
|
||||
}
|
||||
|
||||
float* localMask = mask + (rowId / numKernels) * maskCols + colId * width;
|
||||
size_t currentMaxIdx = 0;
|
||||
for (int i = 1; i < width; ++i) {
|
||||
if (b[i] * localMask[i] > b[currentMaxIdx] * localMask[currentMaxIdx]) {
|
||||
for(int i = 1; i < width; ++i) {
|
||||
if(b[i] * localMask[i] > b[currentMaxIdx] * localMask[currentMaxIdx]) {
|
||||
currentMaxIdx = i;
|
||||
}
|
||||
}
|
||||
|
||||
adjIn[(rowId * inCols) + (colId * width) + currentMaxIdx] += adj[rowId + (colId * adjCols)];
|
||||
adjIn[(rowId * inCols) + (colId * width) + currentMaxIdx]
|
||||
+= adj[rowId + (colId * adjCols)];
|
||||
}
|
||||
|
||||
void PoolingWithMaskingBackward(Tensor adj,
|
||||
@ -2111,16 +2112,21 @@ void PoolingWithMaskingBackward(Tensor adj,
|
||||
int adjRows = adjShape[2];
|
||||
int adjCols = adjShape[0] * adjShape[1];
|
||||
|
||||
int lastWidth = ((inCols - isEven) % width == 0)
|
||||
? width
|
||||
: (inCols - isEven) % width;
|
||||
int lastWidth
|
||||
= ((inCols - isEven) % width == 0) ? width : (inCols - isEven) % width;
|
||||
|
||||
gMaxPoolingBackward<<<blocks, threads>>>(
|
||||
adj->data(), adjRows, adjCols,
|
||||
in->data(), adjIn->data(), inRows, inCols,
|
||||
mask->data(), adjShape[1], mask->shape()[2],
|
||||
width, lastWidth);
|
||||
gMaxPoolingBackward<<<blocks, threads>>>(adj->data(),
|
||||
adjRows,
|
||||
adjCols,
|
||||
in->data(),
|
||||
adjIn->data(),
|
||||
inRows,
|
||||
inCols,
|
||||
mask->data(),
|
||||
adjShape[1],
|
||||
mask->shape()[2],
|
||||
width,
|
||||
lastWidth);
|
||||
}
|
||||
|
||||
}
|
||||
} // namespace marian
|
||||
|
@ -78,7 +78,7 @@ public:
|
||||
#endif
|
||||
}
|
||||
|
||||
void get(std::vector<float> &v) {
|
||||
void get(std::vector<float>& v) {
|
||||
v.resize(size());
|
||||
if(backend_->getDevice().type == DeviceType::cpu) {
|
||||
std::copy(data(), data() + size(), v.data());
|
||||
@ -101,9 +101,7 @@ public:
|
||||
#endif
|
||||
}
|
||||
|
||||
void set(const std::vector<float> &v) {
|
||||
set(v.data(), v.data() + v.size());
|
||||
}
|
||||
void set(const std::vector<float>& v) { set(v.data(), v.data() + v.size()); }
|
||||
|
||||
void set(float value) {
|
||||
if(backend_->getDevice().type == DeviceType::cpu) {
|
||||
@ -116,8 +114,7 @@ public:
|
||||
#endif
|
||||
}
|
||||
|
||||
void setSparse(const std::vector<size_t> &k,
|
||||
const std::vector<float> &v) {
|
||||
void setSparse(const std::vector<size_t>& k, const std::vector<float>& v) {
|
||||
if(backend_->getDevice().type == DeviceType::cpu) {
|
||||
for(int i = 0; i < k.size(); ++i)
|
||||
data()[k[i]] = v[i];
|
||||
@ -130,8 +127,8 @@ public:
|
||||
}
|
||||
|
||||
void copyFrom(Tensor in) {
|
||||
if(in->getBackend()->getDevice().type == DeviceType::cpu &&
|
||||
backend_->getDevice().type == DeviceType::cpu) {
|
||||
if(in->getBackend()->getDevice().type == DeviceType::cpu
|
||||
&& backend_->getDevice().type == DeviceType::cpu) {
|
||||
std::copy(in->data(), in->data() + in->size(), data());
|
||||
}
|
||||
#ifdef CUDA_FOUND
|
||||
@ -167,7 +164,6 @@ public:
|
||||
disp = disp && (dims[j] < dispCols || dims[j] >= shape()[j] - dispCols);
|
||||
|
||||
if(disp) {
|
||||
|
||||
if(dims.back() == 0) {
|
||||
bool par = true;
|
||||
std::vector<std::string> p;
|
||||
@ -182,9 +178,7 @@ public:
|
||||
strm << " ";
|
||||
}
|
||||
|
||||
strm << std::setw(12)
|
||||
<< values[i]
|
||||
<< " ";
|
||||
strm << std::setw(12) << values[i] << " ";
|
||||
|
||||
if(dims.back() + 1 == shape().back()) {
|
||||
for(int j = dims.size() - 1; j >= 0; --j) {
|
||||
@ -214,9 +208,7 @@ public:
|
||||
strm << std::endl;
|
||||
return strm.str();
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
typedef std::shared_ptr<TensorBase> Tensor;
|
||||
|
||||
}
|
||||
|
@ -21,53 +21,47 @@
|
||||
|
||||
namespace marian {
|
||||
|
||||
template <class Functor, class ...Tensors>
|
||||
void Element(Functor functor, marian::Tensor out, Tensors ...tensors) {
|
||||
template <class Functor, class... Tensors>
|
||||
void Element(Functor functor, marian::Tensor out, Tensors... tensors) {
|
||||
#ifdef CUDA_FOUND
|
||||
if(out->getBackend()->getDevice().type == DeviceType::gpu)
|
||||
gpu::Element(functor, out, tensors...);
|
||||
else
|
||||
if(out->getBackend()->getDevice().type == DeviceType::gpu)
|
||||
gpu::Element(functor, out, tensors...);
|
||||
else
|
||||
#endif
|
||||
cpu::Element(functor, out, tensors...);
|
||||
}
|
||||
cpu::Element(functor, out, tensors...);
|
||||
}
|
||||
|
||||
template <class Functor, class ...Tensors>
|
||||
void Add(Functor functor,
|
||||
float scale,
|
||||
marian::Tensor out,
|
||||
Tensors... tensors) {
|
||||
template <class Functor, class... Tensors>
|
||||
void Add(Functor functor, float scale, marian::Tensor out, Tensors... tensors) {
|
||||
#ifdef CUDA_FOUND
|
||||
if(out->getBackend()->getDevice().type == DeviceType::gpu)
|
||||
gpu::Add(functor, scale, out, tensors...);
|
||||
else
|
||||
if(out->getBackend()->getDevice().type == DeviceType::gpu)
|
||||
gpu::Add(functor, scale, out, tensors...);
|
||||
else
|
||||
#endif
|
||||
cpu::Add(functor, scale, out, tensors...);
|
||||
}
|
||||
cpu::Add(functor, scale, out, tensors...);
|
||||
}
|
||||
|
||||
template <class Functor, class ...Tensors>
|
||||
void Add(Functor functor,
|
||||
marian::Tensor out,
|
||||
Tensors... tensors) {
|
||||
Add(functor, 1, out, tensors...);
|
||||
}
|
||||
template <class Functor, class... Tensors>
|
||||
void Add(Functor functor, marian::Tensor out, Tensors... tensors) {
|
||||
Add(functor, 1, out, tensors...);
|
||||
}
|
||||
|
||||
template <class Functor, class ...Tensors>
|
||||
void Reduce(Functor functor,
|
||||
float scale,
|
||||
marian::Tensor out,
|
||||
Tensors... tensors) {
|
||||
out->set(0);
|
||||
Add(functor, scale, out, tensors...);
|
||||
}
|
||||
template <class Functor, class... Tensors>
|
||||
void Reduce(Functor functor,
|
||||
float scale,
|
||||
marian::Tensor out,
|
||||
Tensors... tensors) {
|
||||
out->set(0);
|
||||
Add(functor, scale, out, tensors...);
|
||||
}
|
||||
|
||||
template <class Functor, class ...Tensors>
|
||||
void Reduce(Functor functor,
|
||||
marian::Tensor out,
|
||||
Tensors... tensors) {
|
||||
out->set(0);
|
||||
Add(functor, out, tensors...);
|
||||
}
|
||||
template <class Functor, class... Tensors>
|
||||
void Reduce(Functor functor, marian::Tensor out, Tensors... tensors) {
|
||||
out->set(0);
|
||||
Add(functor, out, tensors...);
|
||||
}
|
||||
|
||||
// clang-format off
|
||||
DISPATCH7(Prod, marian::Tensor, const marian::Tensor, const marian::Tensor, bool, bool, float, float)
|
||||
DISPATCH7(ProdBatched, marian::Tensor, const marian::Tensor, const marian::Tensor, bool, bool, float, float)
|
||||
|
||||
@ -86,26 +80,34 @@ namespace marian {
|
||||
DISPATCH4(Shift, marian::Tensor, marian::Tensor, marian::Shape, bool)
|
||||
|
||||
DISPATCH3(Concatenate, marian::Tensor, const std::vector<marian::Tensor>&, int)
|
||||
// clang-format on
|
||||
|
||||
#ifdef CUDA_FOUND
|
||||
namespace gpu {
|
||||
void Deconcatenate(std::vector<marian::Tensor>& outputs, const marian::Tensor in, int ax);
|
||||
}
|
||||
namespace gpu {
|
||||
void Deconcatenate(std::vector<marian::Tensor>& outputs,
|
||||
const marian::Tensor in,
|
||||
int ax);
|
||||
}
|
||||
#endif
|
||||
|
||||
namespace cpu {
|
||||
void Deconcatenate(std::vector<marian::Tensor>& outputs, const marian::Tensor in, int ax);
|
||||
}
|
||||
namespace cpu {
|
||||
void Deconcatenate(std::vector<marian::Tensor>& outputs,
|
||||
const marian::Tensor in,
|
||||
int ax);
|
||||
}
|
||||
|
||||
static inline void Deconcatenate(std::vector<marian::Tensor>& outputs, const marian::Tensor in, int ax) {
|
||||
static inline void Deconcatenate(std::vector<marian::Tensor>& outputs,
|
||||
const marian::Tensor in,
|
||||
int ax) {
|
||||
#ifdef CUDA_FOUND
|
||||
if(in->getBackend()->getDevice().type == DeviceType::gpu)
|
||||
gpu::Deconcatenate(outputs, in, ax);
|
||||
else
|
||||
if(in->getBackend()->getDevice().type == DeviceType::gpu)
|
||||
gpu::Deconcatenate(outputs, in, ax);
|
||||
else
|
||||
#endif
|
||||
cpu::Deconcatenate(outputs, in, ax);
|
||||
}
|
||||
cpu::Deconcatenate(outputs, in, ax);
|
||||
}
|
||||
|
||||
// clang-format off
|
||||
DISPATCH5(LayerNormalization, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, float)
|
||||
DISPATCH9(LayerNormalizationGrad, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, float)
|
||||
|
||||
@ -120,113 +122,116 @@ namespace marian {
|
||||
DISPATCH5(Select, marian::Tensor, marian::Tensor, int, const std::vector<size_t>&, Ptr<Allocator>)
|
||||
DISPATCH5(Insert, marian::Tensor, marian::Tensor, int, const std::vector<size_t>&, Ptr<Allocator>)
|
||||
|
||||
|
||||
DISPATCH2(LSTMCellForward, marian::Tensor, std::vector<marian::Tensor>)
|
||||
DISPATCH2(LSTMOutputForward, marian::Tensor, std::vector<marian::Tensor>);
|
||||
// clang-format on
|
||||
|
||||
#ifdef CUDA_FOUND
|
||||
namespace gpu {
|
||||
void LSTMCellBackward(std::vector<marian::Tensor> outputs,
|
||||
std::vector<marian::Tensor> inputs,
|
||||
marian::Tensor adj);
|
||||
}
|
||||
namespace gpu {
|
||||
void LSTMCellBackward(std::vector<marian::Tensor> outputs,
|
||||
std::vector<marian::Tensor> inputs,
|
||||
marian::Tensor adj);
|
||||
}
|
||||
#endif
|
||||
|
||||
namespace cpu {
|
||||
void LSTMCellBackward(std::vector<marian::Tensor> outputs,
|
||||
std::vector<marian::Tensor> inputs,
|
||||
marian::Tensor adj);
|
||||
}
|
||||
namespace cpu {
|
||||
void LSTMCellBackward(std::vector<marian::Tensor> outputs,
|
||||
std::vector<marian::Tensor> inputs,
|
||||
marian::Tensor adj);
|
||||
}
|
||||
|
||||
static inline void LSTMCellBackward(std::vector<marian::Tensor> outputs,
|
||||
static inline void LSTMCellBackward(std::vector<marian::Tensor> outputs,
|
||||
std::vector<marian::Tensor> inputs,
|
||||
marian::Tensor adj) {
|
||||
#ifdef CUDA_FOUND
|
||||
if(adj->getBackend()->getDevice().type == DeviceType::gpu)
|
||||
gpu::LSTMCellBackward(outputs, inputs, adj);
|
||||
else
|
||||
#endif
|
||||
cpu::LSTMCellBackward(outputs, inputs, adj);
|
||||
}
|
||||
|
||||
#ifdef CUDA_FOUND
|
||||
namespace gpu {
|
||||
void LSTMOutputBackward(std::vector<marian::Tensor> outputs,
|
||||
std::vector<marian::Tensor> inputs,
|
||||
marian::Tensor adj);
|
||||
}
|
||||
#endif
|
||||
|
||||
namespace cpu {
|
||||
void LSTMOutputBackward(std::vector<marian::Tensor> outputs,
|
||||
std::vector<marian::Tensor> inputs,
|
||||
marian::Tensor adj);
|
||||
}
|
||||
|
||||
static inline void LSTMOutputBackward(std::vector<marian::Tensor> outputs,
|
||||
std::vector<marian::Tensor> inputs,
|
||||
marian::Tensor adj) {
|
||||
#ifdef CUDA_FOUND
|
||||
if(adj->getBackend()->getDevice().type == DeviceType::gpu)
|
||||
gpu::LSTMCellBackward(outputs, inputs, adj);
|
||||
else
|
||||
if(adj->getBackend()->getDevice().type == DeviceType::gpu)
|
||||
gpu::LSTMOutputBackward(outputs, inputs, adj);
|
||||
else
|
||||
#endif
|
||||
cpu::LSTMCellBackward(outputs, inputs, adj);
|
||||
}
|
||||
cpu::LSTMOutputBackward(outputs, inputs, adj);
|
||||
}
|
||||
|
||||
DISPATCH3(GRUFastForward, marian::Tensor, std::vector<marian::Tensor>, bool)
|
||||
|
||||
#ifdef CUDA_FOUND
|
||||
namespace gpu {
|
||||
void LSTMOutputBackward(std::vector<marian::Tensor> outputs,
|
||||
std::vector<marian::Tensor> inputs,
|
||||
marian::Tensor adj);
|
||||
}
|
||||
namespace gpu {
|
||||
void GRUFastBackward(std::vector<marian::Tensor> outputs,
|
||||
std::vector<marian::Tensor> inputs,
|
||||
marian::Tensor adj,
|
||||
bool final);
|
||||
}
|
||||
#endif
|
||||
|
||||
namespace cpu {
|
||||
void LSTMOutputBackward(std::vector<marian::Tensor> outputs,
|
||||
std::vector<marian::Tensor> inputs,
|
||||
marian::Tensor adj);
|
||||
}
|
||||
namespace cpu {
|
||||
void GRUFastBackward(std::vector<marian::Tensor> outputs,
|
||||
std::vector<marian::Tensor> inputs,
|
||||
marian::Tensor adj,
|
||||
bool final);
|
||||
}
|
||||
|
||||
static inline void LSTMOutputBackward(std::vector<marian::Tensor> outputs,
|
||||
std::vector<marian::Tensor> inputs,
|
||||
marian::Tensor adj) {
|
||||
static inline void GRUFastBackward(std::vector<marian::Tensor> outputs,
|
||||
std::vector<marian::Tensor> inputs,
|
||||
marian::Tensor adj,
|
||||
bool final = false) {
|
||||
#ifdef CUDA_FOUND
|
||||
if(adj->getBackend()->getDevice().type == DeviceType::gpu)
|
||||
gpu::LSTMOutputBackward(outputs, inputs, adj);
|
||||
else
|
||||
if(adj->getBackend()->getDevice().type == DeviceType::gpu)
|
||||
gpu::GRUFastBackward(outputs, inputs, adj, final);
|
||||
else
|
||||
#endif
|
||||
cpu::LSTMOutputBackward(outputs, inputs, adj);
|
||||
}
|
||||
|
||||
DISPATCH3(GRUFastForward, marian::Tensor, std::vector<marian::Tensor>, bool)
|
||||
|
||||
#ifdef CUDA_FOUND
|
||||
namespace gpu {
|
||||
void GRUFastBackward(std::vector<marian::Tensor> outputs,
|
||||
std::vector<marian::Tensor> inputs,
|
||||
marian::Tensor adj,
|
||||
bool final);
|
||||
}
|
||||
#endif
|
||||
|
||||
namespace cpu {
|
||||
void GRUFastBackward(std::vector<marian::Tensor> outputs,
|
||||
std::vector<marian::Tensor> inputs,
|
||||
marian::Tensor adj,
|
||||
bool final);
|
||||
}
|
||||
|
||||
static inline void GRUFastBackward(std::vector<marian::Tensor> outputs,
|
||||
std::vector<marian::Tensor> inputs,
|
||||
marian::Tensor adj,
|
||||
bool final = false) {
|
||||
#ifdef CUDA_FOUND
|
||||
if(adj->getBackend()->getDevice().type == DeviceType::gpu)
|
||||
gpu::GRUFastBackward(outputs, inputs, adj, final);
|
||||
else
|
||||
#endif
|
||||
cpu::GRUFastBackward(outputs, inputs, adj, final);
|
||||
}
|
||||
cpu::GRUFastBackward(outputs, inputs, adj, final);
|
||||
}
|
||||
|
||||
// clang-format off
|
||||
DISPATCH4(Att, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor)
|
||||
DISPATCH7(AttBack, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor)
|
||||
// clang-format on
|
||||
|
||||
#ifdef CUDA_FOUND
|
||||
namespace gpu {
|
||||
float L2Norm(marian::Tensor in);
|
||||
}
|
||||
namespace gpu {
|
||||
float L2Norm(marian::Tensor in);
|
||||
}
|
||||
#endif
|
||||
|
||||
namespace cpu {
|
||||
float L2Norm(marian::Tensor in);
|
||||
}
|
||||
namespace cpu {
|
||||
float L2Norm(marian::Tensor in);
|
||||
}
|
||||
|
||||
static inline float L2Norm(marian::Tensor in) {
|
||||
static inline float L2Norm(marian::Tensor in) {
|
||||
#ifdef CUDA_FOUND
|
||||
if(in->getBackend()->getDevice().type == DeviceType::gpu)
|
||||
return gpu::L2Norm(in);
|
||||
else
|
||||
if(in->getBackend()->getDevice().type == DeviceType::gpu)
|
||||
return gpu::L2Norm(in);
|
||||
else
|
||||
#endif
|
||||
return cpu::L2Norm(in);
|
||||
}
|
||||
return cpu::L2Norm(in);
|
||||
}
|
||||
|
||||
// clang-format off
|
||||
DISPATCH5(PoolingWithMaskingForward, marian::Tensor, marian::Tensor, marian::Tensor, int, bool)
|
||||
DISPATCH6(PoolingWithMaskingBackward, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, int, bool)
|
||||
|
||||
// clang-format on
|
||||
}
|
||||
|
@ -104,7 +104,8 @@ void AsyncGraphGroup::init(Ptr<data::Batch> batch) {
|
||||
totalSize -= __size__;
|
||||
|
||||
Tensor param;
|
||||
Ptr<TensorAllocator> allocator = New<TensorAllocator>(graph->getBackend());
|
||||
Ptr<TensorAllocator> allocator
|
||||
= New<TensorAllocator>(graph->getBackend());
|
||||
allocator->reserveExact(__size__ * sizeof(float));
|
||||
allocator->allocate(param, {1, __size__});
|
||||
paramsAlloc_.push_back(allocator);
|
||||
@ -122,7 +123,8 @@ void AsyncGraphGroup::init(Ptr<data::Batch> batch) {
|
||||
int __size__ = std::min(shardSize_, totalSize);
|
||||
totalSize -= __size__;
|
||||
Tensor grad_;
|
||||
Ptr<TensorAllocator> allocator_ = New<TensorAllocator>(graph->getBackend());
|
||||
Ptr<TensorAllocator> allocator_
|
||||
= New<TensorAllocator>(graph->getBackend());
|
||||
|
||||
allocator_->reserveExact(__size__ * sizeof(float));
|
||||
allocator_->allocate(grad_, {1, __size__});
|
||||
@ -139,7 +141,8 @@ void AsyncGraphGroup::init(Ptr<data::Batch> batch) {
|
||||
int __size__ = std::min(shardSize_, totalSize);
|
||||
totalSize -= __size__;
|
||||
Tensor paramAvg;
|
||||
Ptr<TensorAllocator> allocator = New<TensorAllocator>(graph->getBackend());
|
||||
Ptr<TensorAllocator> allocator
|
||||
= New<TensorAllocator>(graph->getBackend());
|
||||
|
||||
allocator->reserveExact(__size__ * sizeof(float));
|
||||
allocator->allocate(paramAvg, {1, __size__});
|
||||
@ -230,7 +233,8 @@ void AsyncGraphGroup::execute(Ptr<data::Batch> batch) {
|
||||
scheduler_->update(cost, batch);
|
||||
|
||||
if(scheduler_->saving() || scheduler_->validating()) {
|
||||
// Wait with validation or saving until all other threads are done with update.
|
||||
// Wait with validation or saving until all other threads are done with
|
||||
// update.
|
||||
// We want to reuse the graphs for validation, so they need to be in
|
||||
// a safe state.
|
||||
pool_->wait_for_others(lock);
|
||||
|
@ -70,7 +70,6 @@ public:
|
||||
movingAvg_{options_->get<float>("exponential-smoothing") > 0},
|
||||
mvDecay_{options_->get<float>("exponential-smoothing")},
|
||||
tau_{options_->get<size_t>("optimizer-delay")} {
|
||||
|
||||
pool_.reset(new ThreadPool(devices_.size(), devices_.size()));
|
||||
|
||||
for(auto device : devices_) {
|
||||
|
@ -150,7 +150,8 @@ void MultiNodeGraphGroup::initClientCommOverlapGpuTensors() {
|
||||
size_t modelSize = clientGraphs_[0]->params()->vals()->size();
|
||||
for(int client = 0; client < devices_.size(); client++) {
|
||||
// Communication overlap buffer (for grads + params)
|
||||
Tensor commOverlapBuffer = newTensor(modelSize, clientGraphs_[client]->getBackend());
|
||||
Tensor commOverlapBuffer
|
||||
= newTensor(modelSize, clientGraphs_[client]->getBackend());
|
||||
commOverlapBuffer->copyFrom(clientGraphs_[0]->params()->vals());
|
||||
clientCommOverlapBuffersGPU_.push_back(commOverlapBuffer);
|
||||
// Gradients local sum buffer
|
||||
@ -206,11 +207,13 @@ void MultiNodeGraphGroup::calculateShardSizes() {
|
||||
void MultiNodeGraphGroup::initShardGpuTensors() {
|
||||
size_t offset = 0;
|
||||
for(int shard = 0; shard < devices_.size(); shard++) {
|
||||
Tensor gpuParams = newTensor(shardSizes_[shard], clientGraphs_[shard]->getBackend());
|
||||
Tensor gpuParams
|
||||
= newTensor(shardSizes_[shard], clientGraphs_[shard]->getBackend());
|
||||
gpuParams->copyFrom(clientGraphs_[0]->params()->vals()->subtensor(
|
||||
offset, shardSizes_[shard]));
|
||||
shardParams_.push_back(gpuParams);
|
||||
shardGrads_.push_back(newTensor(shardSizes_[shard], clientGraphs_[shard]->getBackend()));
|
||||
shardGrads_.push_back(
|
||||
newTensor(shardSizes_[shard], clientGraphs_[shard]->getBackend()));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1,5 +1,5 @@
|
||||
#include "tensors/tensor_operators.h"
|
||||
#include "training/graph_group_singleton.h"
|
||||
#include "tensors/tensor_operators.h"
|
||||
#include "functional/functional.h"
|
||||
|
||||
namespace marian {
|
||||
@ -15,7 +15,8 @@ void SingletonGraph::updateMovingAverage(Tensor mvAvgParams,
|
||||
Tensor params,
|
||||
size_t batches) {
|
||||
using namespace functional;
|
||||
float decay = std::max(mvDecay_, 1.f - (float)(batches + 1) / (float)(batches + 10));
|
||||
float decay
|
||||
= std::max(mvDecay_, 1.f - (float)(batches + 1) / (float)(batches + 10));
|
||||
Element(_1 = ((1.f - decay) * _1) + (decay * _2), mvAvgParams, params);
|
||||
}
|
||||
|
||||
@ -59,8 +60,7 @@ void SingletonGraph::execute(Ptr<data::Batch> batch) {
|
||||
if(mvAvg_) {
|
||||
mvAvgGraph_->reuseWorkspace(graph_);
|
||||
scheduler_->validate({mvAvgGraph_});
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
scheduler_->validate({graph_});
|
||||
}
|
||||
}
|
||||
|
@ -29,7 +29,6 @@ public:
|
||||
: GraphGroup(options),
|
||||
mvAvg_{options_->get<float>("exponential-smoothing") > 0},
|
||||
mvDecay_{options_->get<float>("exponential-smoothing")} {
|
||||
|
||||
auto deviceId = options_->getDevices()[0];
|
||||
graph_ = New<ExpressionGraph>();
|
||||
graph_->setDevice(deviceId);
|
||||
|
@ -1,6 +1,6 @@
|
||||
#include "training/graph_group_sync.h"
|
||||
#include "tensors/tensor_operators.h"
|
||||
#include "functional/functional.h"
|
||||
#include "tensors/tensor_operators.h"
|
||||
|
||||
namespace marian {
|
||||
|
||||
@ -17,7 +17,8 @@ void SyncGraphGroup::updateMovingAverage(Tensor paramsAvg,
|
||||
Tensor params,
|
||||
size_t batches) {
|
||||
using namespace functional;
|
||||
float decay = std::max(mvDecay_, 1.f - (float)(batches + 1) / (float)(batches + 10));
|
||||
float decay
|
||||
= std::max(mvDecay_, 1.f - (float)(batches + 1) / (float)(batches + 10));
|
||||
Element(_1 = ((1.f - decay) * _1) + (decay * _2), paramsAvg, params);
|
||||
}
|
||||
|
||||
@ -135,10 +136,10 @@ void SyncGraphGroup::execute(Ptr<data::Batch> batch) {
|
||||
int size = params_[idx]->size();
|
||||
int i = 0;
|
||||
|
||||
float div = devices_.size(); // no. of GPUs
|
||||
float div = devices_.size(); // no. of GPUs
|
||||
|
||||
// do not average gradients if cost type is sum.
|
||||
if (options_->get<std::string>("cost-type") == "ce-sum") {
|
||||
if(options_->get<std::string>("cost-type") == "ce-sum") {
|
||||
div = 1;
|
||||
}
|
||||
|
||||
@ -176,7 +177,7 @@ void SyncGraphGroup::execute(Ptr<data::Batch> batch) {
|
||||
float cost = 0;
|
||||
for(auto c : costs)
|
||||
cost += c;
|
||||
if (options_->get<std::string>("cost-type") != "ce-sum") {
|
||||
if(options_->get<std::string>("cost-type") != "ce-sum") {
|
||||
cost = cost / costs.size();
|
||||
}
|
||||
|
||||
|
@ -43,7 +43,6 @@ public:
|
||||
devices_{options_->getDevices()},
|
||||
movingAvg_{options_->get<float>("exponential-smoothing") > 0},
|
||||
mvDecay_{options_->get<float>("exponential-smoothing")} {
|
||||
|
||||
for(auto device : devices_) {
|
||||
auto graph = New<ExpressionGraph>();
|
||||
graph->setDevice(device);
|
||||
|
@ -100,7 +100,8 @@ public:
|
||||
return (state_->batches % options_->get<size_t>("save-freq") == 0);
|
||||
}
|
||||
|
||||
void validate(const std::vector<Ptr<ExpressionGraph>>& graphs, bool final = false) {
|
||||
void validate(const std::vector<Ptr<ExpressionGraph>>& graphs,
|
||||
bool final = false) {
|
||||
if(state_->validated
|
||||
|| (state_->batches % options_->get<size_t>("valid-freq") != 0
|
||||
&& !final))
|
||||
|
@ -7,8 +7,8 @@
|
||||
#include "tensors/tensor.h"
|
||||
#include "tensors/tensor_operators.h"
|
||||
#include "training/sparse_tensor.h"
|
||||
#include "tensors/gpu/cuda_helpers.h"
|
||||
|
||||
#include "tensors/gpu/cuda_helpers.h"
|
||||
|
||||
namespace marian {
|
||||
|
||||
@ -48,10 +48,10 @@ __global__ void gFindSubtensor(int* indices,
|
||||
}
|
||||
|
||||
SparseTensorBase::SparseTensorBase(int capacity, Ptr<Backend> backend)
|
||||
: backend_(backend), capacity_(capacity) {
|
||||
: backend_(backend), capacity_(capacity) {
|
||||
ABORT_IF(backend_->getDevice().type == DeviceType::cpu,
|
||||
"Gradient dropping is currently not implemented for CPU usage");
|
||||
|
||||
"Gradient dropping is currently not implemented for CPU usage");
|
||||
|
||||
cudaSetDevice(backend_->getDevice().no);
|
||||
CUDA_CHECK(cudaMalloc(&data_, sizeof(float) * capacity));
|
||||
CUDA_CHECK(cudaMalloc(&indices_, sizeof(int) * capacity));
|
||||
@ -64,7 +64,7 @@ SparseTensorBase::SparseTensorBase(float* data,
|
||||
int* indices,
|
||||
int size,
|
||||
Ptr<Backend> backend)
|
||||
: backend_(backend) {
|
||||
: backend_(backend) {
|
||||
data_ = data;
|
||||
indices_ = indices;
|
||||
size_ = size;
|
||||
@ -98,10 +98,10 @@ void SparseTensorBase::copyFrom(float* data,
|
||||
size_ = size;
|
||||
if(size == 0)
|
||||
return;
|
||||
|
||||
|
||||
ABORT_IF(backend_->getDevice().type == DeviceType::cpu,
|
||||
"Gradient dropping is currently not implemented for CPU usage");
|
||||
|
||||
"Gradient dropping is currently not implemented for CPU usage");
|
||||
|
||||
cudaSetDevice(backend_->getDevice().no);
|
||||
|
||||
cudaMemcpy(data_, data, size * sizeof(float), cudaMemcpyDefault);
|
||||
@ -128,7 +128,7 @@ void SparseTensorBase::setSize(int size) {
|
||||
void SparseTensorBase::toDense(Tensor t, int offset) {
|
||||
ABORT_IF(backend_->getDevice().type == DeviceType::cpu,
|
||||
"Gradient dropping is currently not implemented for CPU usage");
|
||||
|
||||
|
||||
cudaSetDevice(backend_->getDevice().no);
|
||||
int threads = 512;
|
||||
int blocks = 1 + size_ / threads;
|
||||
@ -152,8 +152,8 @@ std::shared_ptr<SparseTensorBase> SparseTensorBase::subtensor(int pos,
|
||||
int size,
|
||||
int idx) {
|
||||
ABORT_IF(backend_->getDevice().type == DeviceType::cpu,
|
||||
"Gradient dropping is currently not implemented for CPU usage");
|
||||
|
||||
"Gradient dropping is currently not implemented for CPU usage");
|
||||
|
||||
cudaSetDevice(backend_->getDevice().no);
|
||||
cudaStreamSynchronize(0);
|
||||
int* start = gstart_ + idx;
|
||||
|
@ -45,7 +45,6 @@ public:

if((options_->has("valid-sets") || options_->has("valid-script-path"))
&& options_->get<size_t>("valid-freq") > 0) {

for(auto validator : Validators(dataset->getVocabs(), options_))
scheduler->addValidator(validator);
}

@@ -4,9 +4,9 @@
#include <cstdlib>
#include <limits>

#include "3rd_party/threadpool.h"
#include "common/config.h"
#include "common/utils.h"
#include "3rd_party/threadpool.h"
#include "data/batch_generator.h"
#include "data/corpus.h"
#include "graph/expression_graph.h"

@@ -25,8 +25,7 @@ namespace marian {
class ValidatorBase : public TrainingObserver {
public:
  ValidatorBase(bool lowerIsBetter)
      : lowerIsBetter_(lowerIsBetter),
        lastBest_{initScore()} {}
      : lowerIsBetter_(lowerIsBetter), lastBest_{initScore()} {}

  virtual float validate(const std::vector<Ptr<ExpressionGraph>>& graphs) = 0;
  virtual std::string type() = 0;

@@ -98,7 +97,8 @@ protected:
                        Ptr<data::BatchGenerator<DataSet>>)
      = 0;

  void updateStalled(const std::vector<Ptr<ExpressionGraph>>& graphs, float val) {
  void updateStalled(const std::vector<Ptr<ExpressionGraph>>& graphs,
                     float val) {
    if((lowerIsBetter_ && lastBest_ > val)
       || (!lowerIsBetter_ && lastBest_ < val)) {
      stalled_ = 0;

@@ -226,7 +226,6 @@ public:
  TranslationValidator(std::vector<Ptr<Vocab>> vocabs, Ptr<Config> options)
      : Validator(vocabs, options, false),
        quiet_(options_->get<bool>("quiet-translation")) {

    Ptr<Options> opts = New<Options>();
    opts->merge(options);
    opts->set("inference", true);

@@ -314,7 +313,8 @@ public:
      scorer = scorers[id % graphs.size()];
    }

    auto search = New<BeamSearch>(options_, std::vector<Ptr<Scorer>>{scorer});
    auto search
        = New<BeamSearch>(options_, std::vector<Ptr<Scorer>>{scorer});
    auto histories = search->search(graph, batch);

    for(auto history : histories) {
@@ -34,21 +34,20 @@ public:
              std::vector<Ptr<ScorerState>>& states,
              size_t beamSize,
              bool first) {

    Beams newBeams(beams.size());
    for(int i = 0; i < keys.size(); ++i) {
      int embIdx = keys[i] % vocabSize;
      int beamIdx = i / beamSize;
      int embIdx = keys[i] % vocabSize;
      int beamIdx = i / beamSize;

      if(newBeams[beamIdx].size() < beams[beamIdx].size()) {
        auto& beam = beams[beamIdx];
        auto& newBeam = newBeams[beamIdx];

        int hypIdx = keys[i] / vocabSize;
        float cost = costs[i];
        float cost = costs[i];

        int hypIdxTrans = (hypIdx / beamSize) +
                          (hypIdx % beamSize) * beams.size();
        int hypIdxTrans
            = (hypIdx / beamSize) + (hypIdx % beamSize) * beams.size();
        if(first)
          hypIdxTrans = hypIdx;

@@ -78,7 +77,7 @@ public:

  Beams pruneBeam(const Beams& beams) {
    Beams newBeams;
    for(auto beam: beams) {
    for(auto beam : beams) {
      Beam newBeam;
      for(auto hyp : beam) {
        if(hyp->GetWord() > 0) {

@@ -90,9 +89,7 @@ public:
    return newBeams;
  }

  Histories search(Ptr<ExpressionGraph> graph,
                   Ptr<data::CorpusBatch> batch) {

  Histories search(Ptr<ExpressionGraph> graph, Ptr<data::CorpusBatch> batch) {
    int dimBatch = batch->size();
    Histories histories;
    for(int i = 0; i < dimBatch; ++i) {

@@ -140,8 +137,7 @@ public:
      Expr prevCosts;
      if(first) {
        // no cost
        prevCosts = graph->constant({1, 1, 1, 1},
                                    inits::from_value(0));
        prevCosts = graph->constant({1, 1, 1, 1}, inits::from_value(0));
      } else {
        std::vector<float> beamCosts;

@@ -155,8 +151,7 @@ public:
            hypIndices.push_back(hyp->GetPrevStateIndex());
            embIndices.push_back(hyp->GetWord());
            beamCosts.push_back(hyp->GetCost());
          }
          else {
          } else {
            hypIndices.push_back(0);
            embIndices.push_back(0);
            beamCosts.push_back(-9999);

@@ -164,9 +159,8 @@ public:
          }
        }

        prevCosts
            = graph->constant({(int)localBeamSize, 1, dimBatch, 1},
                              inits::from_vector(beamCosts));
        prevCosts = graph->constant({(int)localBeamSize, 1, dimBatch, 1},
                                    inits::from_vector(beamCosts));
      }

      //**********************************************************************

@@ -174,10 +168,12 @@ public:
      auto totalCosts = prevCosts;

      for(int i = 0; i < scorers_.size(); ++i) {
        states[i] = scorers_[i]->step(graph, states[i], hypIndices, embIndices, dimBatch, localBeamSize);
        states[i] = scorers_[i]->step(
            graph, states[i], hypIndices, embIndices, dimBatch, localBeamSize);

        if(scorers_[i]->getWeight() != 1.f)
          totalCosts = totalCosts + scorers_[i]->getWeight() * states[i]->getProbs();
          totalCosts
              = totalCosts + scorers_[i]->getWeight() * states[i]->getProbs();
        else
          totalCosts = totalCosts + states[i]->getProbs();
      }

@@ -207,12 +203,14 @@ public:
      nth->getNBestList(beamSizes, totalCosts->val(), outCosts, outKeys, first);

      int dimTrgVoc = totalCosts->shape()[-1];
      beams = toHyps(outKeys, outCosts, dimTrgVoc, beams, states, localBeamSize, first);
      beams = toHyps(
          outKeys, outCosts, dimTrgVoc, beams, states, localBeamSize, first);

      auto prunedBeams = pruneBeam(beams);
      for(int i = 0; i < dimBatch; ++i) {
        if(!beams[i].empty()) {
          final = final || histories[i]->size() >= 3 * batch->front()->batchWidth();
          final = final
                  || histories[i]->size() >= 3 * batch->front()->batchWidth();
          histories[i]->Add(beams[i], prunedBeams[i].empty() || final);
        }
      }
@@ -18,7 +18,7 @@ void SetColumn(Tensor in_, size_t col, float value) {
  int nColumns = in_->shape()[-1];

  float* in = in_->data();
  for (int rowNumber = 0; rowNumber < nRows; ++rowNumber) {
  for(int rowNumber = 0; rowNumber < nRows; ++rowNumber) {
    int index = col + rowNumber * nColumns;
    in[index] = value;
  }

@@ -31,7 +31,6 @@ void suppressUnk(Expr probs) {
void suppressWord(Expr probs, Word id) {
  SetColumn(probs->val(), id, std::numeric_limits<float>::lowest());
}

}

void suppressUnk(Expr probs) {

@@ -55,5 +54,4 @@ void suppressWord(Expr probs, Word id) {
}
#endif
}

}

@@ -44,6 +44,5 @@ void suppressUnk(Expr probs) {
void suppressWord(Expr probs, Word id) {
  SetColumn(probs->val(), id, std::numeric_limits<float>::lowest());
}

}
}
@@ -14,7 +14,6 @@ namespace cpu {
void suppressUnk(Expr probs);

void suppressWord(Expr probs, Word id);

}

namespace gpu {

@@ -22,11 +21,9 @@ namespace gpu {
void suppressUnk(Expr probs);

void suppressWord(Expr probs, Word id);

}

void suppressUnk(Expr probs);

void suppressWord(Expr probs, Word id);

}
@@ -27,7 +27,8 @@ public:
      if(beam[j]->GetWord() == 0 || last) {
        float cost = beam[j]->GetCost() / LengthPenalty(history_.size());
        topHyps_.push({history_.size(), j, cost});
        //std::cerr << "Add " << history_.size() << " " << j << " " << cost << std::endl;
        // std::cerr << "Add " << history_.size() << " " << j << " " << cost
        // << std::endl;
      }
    }
    history_.push_back(beam);

@@ -44,14 +45,14 @@ public:

    size_t start = bestHypCoord.i;
    size_t j = bestHypCoord.j;
    //float c = bestHypCoord.cost;
    //std::cerr << "h: " << start << " " << j << " " << c << std::endl;
    // float c = bestHypCoord.cost;
    // std::cerr << "h: " << start << " " << j << " " << c << std::endl;

    Words targetWords;
    Ptr<Hypothesis> bestHyp = history_[start][j];
    while(bestHyp->GetPrevHyp() != nullptr) {
      targetWords.push_back(bestHyp->GetWord());
      //std::cerr << bestHyp->GetWord() << " " << bestHyp << std::endl;
      // std::cerr << bestHyp->GetWord() << " " << bestHyp << std::endl;
      bestHyp = bestHyp->GetPrevHyp();
    }
Some files were not shown because too many files have changed in this diff.