Autoformat files

This commit is contained in:
Roman Grundkiewicz 2018-03-12 20:34:10 +00:00
parent 5f2eedc6e5
commit 6d0c75cf48
106 changed files with 2055 additions and 2103 deletions

View File

@ -1,3 +1,3 @@
#!/bin/bash
find ./src \( -path ./src/3rd_party -o -path ./src/tests -o -path ./src/models/experimental \) -prune -o -iname *.h -o -iname *.cpp -o -iname *.cu | xargs clang-format-3.8 -i
# Run clang-format in place on every .h/.cpp/.cu file under ./src, skipping the
# 3rd_party, tests and models/experimental trees. The name patterns are quoted
# so the shell cannot expand them against the current directory, the name tests
# are grouped with an explicit -print0 so the pruned directories themselves are
# not emitted, and NUL-delimited names keep paths with whitespace intact.
find ./src \( -path ./src/3rd_party -o -path ./src/tests -o -path ./src/models/experimental \) -prune -o \( -iname '*.h' -o -iname '*.cpp' -o -iname '*.cu' \) -print0 | xargs -0 clang-format -i

View File

@ -10,7 +10,6 @@
#include "training/graph_group_async_drop.h"
#endif
bool configureMPI(int, char**);
int main(int argc, char** argv) {
@ -18,7 +17,7 @@ int main(int argc, char** argv) {
auto options = New<Config>(argc, argv);
auto devices = options->getDevices();
if(options->get<bool>("multi-node")) {
ABORT_IF(!configureMPI(argc, argv), "MPI not found.");

View File

@ -65,5 +65,4 @@ void Config::AddYamlToNpz(const YAML::Node& yaml,
unsigned shape = out.size() + 1;
cnpy::npz_save(fName, varName, out.c_str(), &shape, 1, "a");
}
}

View File

@ -76,7 +76,9 @@ public:
log();
if(has("version"))
LOG(info, "[config] Model created with Marian {}", get("version").as<std::string>());
LOG(info,
"[config] Model created with Marian {}",
get("version").as<std::string>());
}
Config(const Config& other) : config_(YAML::Clone(other.config_)) {}
@ -110,9 +112,7 @@ public:
YAML::Node getModelParameters();
void loadModelParameters(const std::string& name);
const std::vector<DeviceId>& getDevices() {
return devices_;
}
const std::vector<DeviceId>& getDevices() { return devices_; }
void save(const std::string& name) {
OutputFileStream out(name);

View File

@ -2,8 +2,8 @@
#include <boost/algorithm/string.hpp>
#include <boost/regex.hpp>
#include <set>
#include <string>
#include <stdexcept>
#include <string>
#if MKL_FOUND
//#include <omp.h>
@ -15,9 +15,9 @@
#endif
#endif
#include "3rd_party/cnpy/cnpy.h"
#include "common/definitions.h"
#include "common/config.h"
#include "common/config_parser.h"
#include "common/file_stream.h"
@ -146,13 +146,14 @@ bool ConfigParser::has(const std::string& key) const {
void ConfigParser::validateOptions() const {
if(mode_ == ConfigMode::translating) {
UTIL_THROW_IF2(!has("vocabs") || get<std::vector<std::string>>("vocabs").empty(),
UTIL_THROW_IF2(
!has("vocabs") || get<std::vector<std::string>>("vocabs").empty(),
"Translating, but vocabularies are not given!");
for(const auto& modelFile : get<std::vector<std::string>>("models")) {
boost::filesystem::path modelPath(modelFile);
UTIL_THROW_IF2(!boost::filesystem::exists(modelPath),
"Model file does not exist: " + modelFile);
"Model file does not exist: " + modelFile);
}
return;
@ -177,9 +178,10 @@ void ConfigParser::validateOptions() const {
if(mode_ == ConfigMode::rescoring) {
UTIL_THROW_IF2(!boost::filesystem::exists(modelPath),
"Model file does not exist: " + modelPath.string());
"Model file does not exist: " + modelPath.string());
UTIL_THROW_IF2(!has("vocabs") || get<std::vector<std::string>>("vocabs").empty(),
UTIL_THROW_IF2(
!has("vocabs") || get<std::vector<std::string>>("vocabs").empty(),
"Scoring, but vocabularies are not given!");
return;
@ -193,8 +195,9 @@ void ConfigParser::validateOptions() const {
!modelDir.empty() && !boost::filesystem::is_directory(modelDir),
"Model directory does not exist");
UTIL_THROW_IF2(!modelDir.empty() && !(boost::filesystem::status(modelDir).permissions()
& boost::filesystem::owner_write),
UTIL_THROW_IF2(!modelDir.empty()
&& !(boost::filesystem::status(modelDir).permissions()
& boost::filesystem::owner_write),
"No write permission in model directory");
UTIL_THROW_IF2(
@ -835,7 +838,6 @@ void ConfigParser::parseOptions(int argc, char** argv, bool doValidate) {
SET_OPTION("transformer-dim-ffn", int);
SET_OPTION("transformer-ffn-activation", std::string);
#ifdef CUDNN
SET_OPTION("char-stride", int);
SET_OPTION("char-highway", int);
@ -976,7 +978,7 @@ void ConfigParser::parseOptions(int argc, char** argv, bool doValidate) {
SET_OPTION("relative-paths", bool);
SET_OPTION("devices", std::vector<std::string>);
SET_OPTION("cpu-threads", size_t);
//SET_OPTION("omp-threads", size_t);
// SET_OPTION("omp-threads", size_t);
SET_OPTION("mini-batch", int);
SET_OPTION("maxi-batch", int);
@ -1021,24 +1023,22 @@ void ConfigParser::parseOptions(int argc, char** argv, bool doValidate) {
exit(0);
}
// @TODO: this should probably be in processOptionDevices()
//#ifdef BLAS_FOUND
// //omp_set_num_threads(vm_["omp-threads"].as<size_t>());
//#ifdef MKL_FOUND
// mkl_set_num_threads(vm_["omp-threads"].as<size_t>());
//#endif
//#endif
// @TODO: this should probably be in processOptionDevices()
//#ifdef BLAS_FOUND
// //omp_set_num_threads(vm_["omp-threads"].as<size_t>());
//#ifdef MKL_FOUND
// mkl_set_num_threads(vm_["omp-threads"].as<size_t>());
//#endif
//#endif
}
std::vector<DeviceId> ConfigParser::getDevices() {
std::vector<DeviceId> devices;
try {
std::string devicesStr
= Join(config_["devices"].as<std::vector<std::string>>());
if(mode_ == ConfigMode::training && get<bool>("multi-node")) {
auto parts = Split(devicesStr, ":");
for(size_t i = 1; i < parts.size(); ++i) {
@ -1061,11 +1061,10 @@ std::vector<DeviceId> ConfigParser::getDevices() {
if(config_["cpu-threads"].as<size_t>() > 0) {
devices.clear();
for(size_t i = 0; i < config_["cpu-threads"].as<size_t>(); ++i)
devices.push_back({i, DeviceType::cpu});
devices.push_back({i, DeviceType::cpu});
}
}
catch(...) {
} catch(...) {
ABORT("Problem parsing devices, please report an issue on github");
}

View File

@ -1,10 +1,10 @@
#pragma once
#include <functional>
#include <iostream>
#include <memory>
#include <string>
#include <vector>
#include <iostream>
#include "common/logging.h"
#include "shape.h"
@ -57,7 +57,6 @@ struct DeviceId {
friend bool operator==(DeviceId id1, DeviceId id2) {
return id1.no == id2.no && id1.type == id2.type;
}
};
class TensorBase;

View File

@ -1,195 +1,191 @@
#pragma once
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <string>
#include <sstream>
#include <string>
#include <vector>
#include <algorithm>
#include "common/logging.h"
namespace marian {
struct Shape {
public:
std::vector<int> shape_;
public:
std::vector<int> shape_;
public:
Shape() : shape_{1} {}
public:
Shape() : shape_{1} {}
Shape(std::initializer_list<int> il) : Shape() {
shape_.resize(il.size());
std::copy(il.begin(), il.end(), begin());
Shape(std::initializer_list<int> il) : Shape() {
shape_.resize(il.size());
std::copy(il.begin(), il.end(), begin());
}
void resize(size_t n) { shape_.resize(n, 1); }
const int* data() const { return shape_.data(); }
int* data() { return shape_.data(); }
Shape(const Shape& shape) : Shape() {
shape_.resize(shape.size());
std::copy(shape.begin(), shape.end(), begin());
}
inline void set(int i, int val) { dim(i) = val; }
inline int& dim(int i) {
if(i >= 0) {
ABORT_IF(i >= size(),
"Index {} is out of bounds, shape has {} dimension",
i,
size());
return shape_[i];
} else {
ABORT_IF((int)size() + i < 0,
"Negative index {} is out of bounds, shape has {} dimension",
i,
size());
return shape_[size() + i];
}
}
void resize(size_t n) {
shape_.resize(n, 1);
}
inline const int& dim(int i) const {
return const_cast<Shape&>(*this).dim(i);
}
const int* data() const {
return shape_.data();
}
inline int operator[](int i) { return dim(i); }
int* data() {
return shape_.data();
}
inline int operator[](int i) const { return dim(i); }
Shape(const Shape& shape) : Shape() {
shape_.resize(shape.size());
std::copy(shape.begin(), shape.end(), begin());
}
inline int& back() { return shape_.back(); }
inline void set(int i, int val) {
dim(i) = val;
}
inline int stride(int i) const {
std::vector<int> stride(shape_.size(), 1);
for(int j = shape_.size() - 2; j >= 0; --j)
stride[j] = stride[j + 1] * shape_[j + 1];
inline int& dim(int i) {
if(i >= 0) {
ABORT_IF(i >= size(),
"Index {} is out of bounds, shape has {} dimension", i, size());
return shape_[i];
}
else {
ABORT_IF((int)size() + i < 0,
"Negative index {} is out of bounds, shape has {} dimension", i, size());
return shape_[size() + i];
if(i >= 0)
return stride[i];
else
return stride[size() + i];
}
inline size_t size() const { return shape_.size(); }
inline int elements() const {
int el = 1;
for(auto s : shape_)
el *= s;
return el;
}
// Splits the flat element index i into one coordinate per dimension and
// writes them to d, which is resized to the number of dimensions. Strides are
// built so that the last dimension has stride 1.
inline void dims(int i, std::vector<int>& d) const {
  const int rank = static_cast<int>(shape_.size());
  d.resize(shape_.size());

  std::vector<int> strides(shape_.size(), 1);
  for(int k = rank - 2; k >= 0; --k) {
    strides[k] = strides[k + 1] * shape_[k + 1];
  }

  for(int k = 0; k < rank; ++k) {
    d[k] = (i / strides[k]) % shape_[k];
  }
}
auto begin() -> decltype(shape_.begin()) { return shape_.begin(); }
auto begin() const -> decltype(shape_.begin()) { return shape_.begin(); }
auto end() -> decltype(shape_.end()) { return shape_.end(); }
auto end() const -> decltype(shape_.end()) { return shape_.end(); }
auto rbegin() -> decltype(shape_.rbegin()) { return shape_.rbegin(); }
auto rbegin() const -> decltype(shape_.rbegin()) { return shape_.rbegin(); }
auto rend() -> decltype(shape_.rend()) { return shape_.rend(); }
auto rend() const -> decltype(shape_.rend()) { return shape_.rend(); }
bool operator==(const Shape& other) const {
return size() == other.size() && std::equal(begin(), end(), other.begin());
}
bool operator!=(const Shape& other) const { return !(*this == other); }
std::string toString() const {
std::stringstream strm;
strm << "shape=" << (*this)[0];
for(int i = 1; i < size(); ++i)
strm << "x" << (*this)[i];
strm << " size=" << elements() << " (" << elements() * sizeof(float)
<< "B)";
return strm.str();
}
friend std::ostream& operator<<(std::ostream& strm, const Shape& shape) {
strm << shape.toString();
return strm;
}
operator std::string() const {
std::stringstream ss;
ss << *this;
return ss.str();
}
int axis(int ax) {
if(ax < 0)
return size() + ax;
else
return ax;
}
// Computes the shape that results from broadcasting all given shapes against
// each other. The result has as many dimensions as the longest input; shapes
// are aligned at their last dimension. Two aligned dimensions are compatible
// when they are equal or one of them is 1; otherwise the program aborts.
static Shape broadcast(const std::vector<Shape>& shapes) {
  size_t maxDims = 0;
  for(const auto& s : shapes)
    maxDims = std::max(maxDims, s.size());

  Shape shape;
  shape.resize(maxDims);

  for(const auto& s : shapes) {
    // Negative indices count from the back, so the valid range is
    // -1 .. -size(). Starting at 0 would compare the *first* dimensions of
    // both shapes (misaligned when the ranks differ) and never reach
    // -size(); this matches the loop in the templated node overload.
    const int rank = static_cast<int>(s.size());
    for(int i = 1; i <= rank; ++i) {
      ABORT_IF(shape[-i] != s[-i] && shape[-i] != 1 && s[-i] != 1,
               "Shapes {} and {} cannot be broadcasted",
               (std::string)shape,
               (std::string)s);
      shape.set(-i, std::max(shape[-i], s[-i]));
    }
  }
  return shape;
}
inline const int& dim(int i) const { return const_cast<Shape&>(*this).dim(i); }
template <typename T>
static Shape broadcast(const std::initializer_list<T>& il) {
return broadcast(std::vector<T>(il));
}
inline int operator[](int i) { return dim(i); }
template <typename T>
static Shape broadcast(const std::vector<T>& nodes) {
int maxDims = 0;
for(auto& n : nodes)
if(n->shape().size() > maxDims)
maxDims = n->shape().size();
inline int operator[](int i) const { return dim(i); }
Shape shape;
shape.resize(maxDims);
inline int& back() { return shape_.back(); }
inline int stride(int i) const {
std::vector<int> stride(shape_.size(), 1);
for(int j = shape_.size() - 2; j >= 0; --j)
stride[j] = stride[j + 1] * shape_[j + 1];
if(i >= 0)
return stride[i];
else
return stride[size() + i];
}
inline size_t size() const { return shape_.size(); }
inline int elements() const {
int el = 1;
for(auto s : shape_)
el *= s;
return el;
}
inline void dims(int i, std::vector<int>& d) const {
d.resize(shape_.size());
std::vector<int> stride(shape_.size(), 1);
for(int j = shape_.size() - 2; j >= 0; --j)
stride[j] = stride[j + 1] * shape_[j + 1];
for(int j = 0; j < d.size(); ++j)
d[j] = (i / stride[j]) % shape_[j];
}
auto begin() -> decltype(shape_.begin()) { return shape_.begin(); }
auto begin() const -> decltype(shape_.begin()) { return shape_.begin(); }
auto end() -> decltype(shape_.end()) { return shape_.end(); }
auto end() const -> decltype(shape_.end()) { return shape_.end(); }
auto rbegin() -> decltype(shape_.rbegin()) { return shape_.rbegin(); }
auto rbegin() const -> decltype(shape_.rbegin()) { return shape_.rbegin(); }
auto rend() -> decltype(shape_.rend()) { return shape_.rend(); }
auto rend() const -> decltype(shape_.rend()) { return shape_.rend(); }
bool operator==(const Shape& other) const {
return size() == other.size() && std::equal(begin(), end(), other.begin());
}
bool operator!=(const Shape& other) const { return !(*this == other); }
std::string toString() const {
std::stringstream strm;
strm << "shape=" << (*this)[0];
for(int i = 1; i < size(); ++i)
strm << "x" << (*this)[i];
strm << " size=" << elements() << " ("
<< elements() * sizeof(float) << "B)";
return strm.str();
}
friend std::ostream& operator<<(std::ostream& strm, const Shape& shape) {
strm << shape.toString();
return strm;
}
operator std::string() const {
std::stringstream ss;
ss << *this;
return ss.str();
}
int axis(int ax) {
if(ax < 0)
return size() + ax;
else
return ax;
}
static Shape broadcast(const std::vector<Shape>& shapes) {
int maxDims = 0;
for(auto& s : shapes)
if(s.size() > maxDims)
maxDims = s.size();
Shape shape;
shape.resize(maxDims);
for(auto& s : shapes) {
for(int i = 0; i < s.size(); ++i) {
ABORT_IF(shape[-i] != s[-i] && shape[-i] != 1 && s[-i] != 1,
"Shapes {} and {} cannot be broadcasted",
(std::string)shape,
(std::string)s);
shape.set(-i, std::max(shape[-i], s[-i]));
}
for(auto& node : nodes) {
const Shape& shapen = node->shape();
for(int i = 1; i <= shapen.size(); ++i) {
ABORT_IF(shape[-i] != shapen[-i] && shape[-i] != 1 && shapen[-i] != 1,
"Shapes {} and {} cannot be broadcasted",
(std::string)shape,
(std::string)shapen);
shape.set(-i, std::max(shape[-i], shapen[-i]));
}
return shape;
}
template <typename T>
static Shape broadcast(const std::initializer_list<T>& il) {
return broadcast(std::vector<T>(il));
}
template <typename T>
static Shape broadcast(const std::vector<T>& nodes) {
int maxDims = 0;
for(auto& n : nodes)
if(n->shape().size() > maxDims)
maxDims = n->shape().size();
Shape shape;
shape.resize(maxDims);
for(auto& node : nodes) {
const Shape& shapen = node->shape();
for(int i = 1; i <= shapen.size(); ++i) {
ABORT_IF(shape[-i] != shapen[-i] && shape[-i] != 1 && shapen[-i] != 1,
"Shapes {} and {} cannot be broadcasted",
(std::string)shape,
(std::string)shapen);
shape.set(-i, std::max(shape[-i], shapen[-i]));
}
}
return shape;
}
return shape;
}
};
}

View File

@ -1,11 +1,11 @@
#pragma once
#include <boost/timer/timer.hpp>
#include <condition_variable>
#include <deque>
#include <functional>
#include <queue>
#include <mutex>
#include <condition_variable>
#include <boost/timer/timer.hpp>
#include <queue>
#include "common/config.h"
#include "data/batch_stats.h"
@ -47,16 +47,17 @@ private:
void fillBatches(bool shuffle = true) {
typedef typename sample::value_type Item;
auto itemCmp = [](const Item& sa, const Item& sb) {
return sa.size() < sb.size();
};
auto itemCmp
= [](const Item& sa, const Item& sb) { return sa.size() < sb.size(); };
auto cmpSrc = [itemCmp](const sample& a, const sample& b) {
return std::lexicographical_compare(a.begin(), a.end(), b.begin(), b.end(), itemCmp);
return std::lexicographical_compare(
a.begin(), a.end(), b.begin(), b.end(), itemCmp);
};
auto cmpTrg = [itemCmp](const sample& a, const sample& b) {
return std::lexicographical_compare(a.rbegin(), a.rend(), b.rbegin(), b.rend(), itemCmp);
return std::lexicographical_compare(
a.rbegin(), a.rend(), b.rbegin(), b.rend(), itemCmp);
};
auto cmpNone = [](const sample& a, const sample& b) { return &a < &b; };
@ -168,9 +169,8 @@ public:
operator bool() const {
// wait if empty but loading
std::unique_lock<std::mutex> lock(loadMutex_);
loadCondition_.wait(lock, [this]{
return loadReady_ || !bufferedBatches_.empty();
});
loadCondition_.wait(
lock, [this] { return loadReady_ || !bufferedBatches_.empty(); });
return !bufferedBatches_.empty();
}
@ -178,15 +178,16 @@ public:
BatchPtr next() {
{
std::unique_lock<std::mutex> lock(loadMutex_);
loadCondition_.wait(lock, [this]{
return loadReady_ || !bufferedBatches_.empty();
});
loadCondition_.wait(
lock, [this] { return loadReady_ || !bufferedBatches_.empty(); });
}
ABORT_IF(bufferedBatches_.empty(), "No batches to fetch, run prepare()");
currentBatch_ = bufferedBatches_.front();
if(loadReady_ && bufferedBatches_.size() <= std::max(options_->get<int>("maxi-batch") / 5, 1)) {
if(loadReady_
&& bufferedBatches_.size()
<= std::max(options_->get<int>("maxi-batch") / 5, 1)) {
{
std::unique_lock<std::mutex> lock(loadMutex_);
loadReady_ = false;

View File

@ -400,7 +400,6 @@ public:
std::cerr << std::endl;
}
}
};
class CorpusIterator;

View File

@ -1,7 +1,7 @@
#include <random>
#include "data/corpus_nbest.h"
#include "common/utils.h"
#include "data/corpus_nbest.h"
namespace marian {
namespace data {
@ -10,8 +10,8 @@ CorpusNBest::CorpusNBest(Ptr<Config> options, bool translate /*= false*/)
: CorpusBase(options, translate) {}
CorpusNBest::CorpusNBest(std::vector<std::string> paths,
std::vector<Ptr<Vocab>> vocabs,
Ptr<Config> options)
std::vector<Ptr<Vocab>> vocabs,
Ptr<Config> options)
: CorpusBase(paths, vocabs, options) {}
int numFromNbest(const std::string& line) {
@ -19,7 +19,8 @@ int numFromNbest(const std::string& line) {
Split(line, fields, " ||| ", true);
ABORT_IF(fields.size() < 4,
"Too few fields ({}) in line \"{}\", is this a correct n-best list?",
fields.size(), line);
fields.size(),
line);
return std::stoi(fields[0]);
}
@ -28,7 +29,8 @@ std::string lineFromNbest(const std::string& line) {
Split(line, fields, " ||| ", true);
ABORT_IF(fields.size() < 4,
"Too few fields ({}) in line \"{}\", is this a correct n-best list?",
fields.size(), line);
fields.size(),
line);
return fields[1];
}
@ -56,7 +58,8 @@ SentenceTuple CorpusNBest::next() {
for(size_t i = 0; i < last; ++i) {
if(curr_num > lastNum_) {
ABORT_IF(!std::getline((std::istream&)*files_[i], lastLines_[i]),
"Too few lines in input {}", i);
"Too few lines in input {}",
i);
}
addWordsToSentenceTuple(lastLines_[i], i, tup);
}
@ -90,6 +93,5 @@ void CorpusNBest::reset() {
files_.emplace_back(new InputFileStream(path));
}
}
}
}

View File

@ -154,6 +154,5 @@ void CorpusSQLite::restore(Ptr<TrainingState> ts) {
reset();
}
}
}
}

View File

@ -19,7 +19,6 @@
#include <SQLiteCpp/SQLiteCpp.h>
#include <SQLiteCpp/sqlite3/sqlite3.h>
static void SQLiteRandomSeed(sqlite3_context* context,
int argc,
sqlite3_value** argv) {

View File

@ -138,8 +138,9 @@ void Vocab::create(const std::string& vocabPath, const std::string& trainPath) {
"Specified vocab directory {} does not exist",
dir);
ABORT_IF(!dir.empty() && !(boost::filesystem::status(dir).permissions()
& boost::filesystem::owner_write),
ABORT_IF(!dir.empty()
&& !(boost::filesystem::status(dir).permissions()
& boost::filesystem::owner_write),
"No write permission in vocab directory {}",
dir);

View File

@ -32,7 +32,5 @@ struct Array {
data_[i] = val;
}
};
}
}

View File

@ -4,93 +4,90 @@
#include "functional/operands.h"
namespace marian {
namespace functional {
namespace functional {
namespace float2unsigned {
constexpr float abs(float x) { return x < 0 ? -x : x; }
namespace float2unsigned {
constexpr float abs(float x) {
return x < 0 ? -x : x;
}
constexpr int exponent(float x) {
return abs(x) >= 2 ? exponent(x / 2) + 1 :
abs(x) < 1 ? exponent(x * 2) - 1 :
0;
}
// clang-format off
constexpr int exponent(float x) {
return abs(x) >= 2 ? exponent(x / 2) + 1 :
abs(x) < 1 ? exponent(x * 2) - 1 :
0;
}
constexpr float scalbn(float value, int exponent) {
return exponent == 0 ? value :
exponent > 0 ? scalbn(value * 2, exponent - 1) :
scalbn(value / 2, exponent + 1);
}
constexpr float scalbn(float value, int exponent) {
return exponent == 0 ? value :
exponent > 0 ? scalbn(value * 2, exponent - 1) :
scalbn(value / 2, exponent + 1);
}
// clang-format on
constexpr unsigned mantissa(float x, int exp) {
// remove hidden 1 and bias the exponent to get integer
return abs(x) < std::numeric_limits<float>::infinity() ?
scalbn(scalbn(abs(x), -exp) - 1, 23) : 0;
}
constexpr unsigned mantissa(float x, int exp) {
// remove hidden 1 and bias the exponent to get integer
return abs(x) < std::numeric_limits<float>::infinity()
? scalbn(scalbn(abs(x), -exp) - 1, 23)
: 0;
}
constexpr unsigned to_binary(float x, unsigned sign, int exp) {
return sign * (1u << 31)
+ (exp + 127) * (1u << 23)
+ mantissa(x, exp);
}
constexpr unsigned to_binary(float x, unsigned sign, int exp) {
return sign * (1u << 31) + (exp + 127) * (1u << 23) + mantissa(x, exp);
}
constexpr unsigned to_binary(float x) {
return x == 0 ? 0 : to_binary(x, x < 0, exponent(x));
}
constexpr unsigned to_binary(float x) {
return x == 0 ? 0 : to_binary(x, x < 0, exponent(x));
}
}
}
namespace unsigned2float {
namespace unsigned2float {
constexpr float sign(unsigned i) {
return (i & (1u << 31)) ? -1.f : 1.f;
}
// Returns -1.f when bit 31 of i is set and 1.f otherwise.
constexpr float sign(unsigned i) {
  return ((i >> 31) & 1u) != 0u ? -1.f : 1.f;
}
constexpr int exponent(unsigned i) {
return int((i >> 23) & 255u) - 127;
}
// Extracts the 8-bit field stored in bits 23..30 of i and subtracts 127.
constexpr int exponent(unsigned i) {
  return static_cast<int>((i >> 23) & 0xFFu) - 127;
}
constexpr float sig(unsigned i, unsigned shift) {
return ((i >> shift) & 1u) * 1.f / (1u << (23 - shift))
+ (shift > 0 ? sig(i, shift - 1) : 0);
}
// Sums the fractional contribution of bits shift..0 of i, where bit k adds
// 1 / 2^(23 - k) when set. Recurses downwards until shift reaches 0.
constexpr float sig(unsigned i, unsigned shift) {
  return static_cast<float>((i >> shift) & 1u) / (1u << (23 - shift))
         + (shift == 0 ? 0.f : sig(i, shift - 1));
}
constexpr float powr(int exp) {
return exp > 0 ? 2.f * powr(exp - 1) : 1.f;
}
// Returns 2 raised to exp for positive exp, computed recursively; any
// exp <= 0 yields 1.f.
constexpr float powr(int exp) {
  return exp <= 0 ? 1.f : powr(exp - 1) * 2.f;
}
constexpr float pow(int exp) {
return exp < 0 ? 1.f / powr(-exp) : powr(exp);
}
constexpr float pow(int exp) {
return exp < 0 ? 1.f / powr(-exp) : powr(exp);
}
constexpr float from_binary(unsigned i) {
return (1.f + sig(i, 22u)) * pow(exponent(i)) * sign(i);
}
}
constexpr float from_binary(unsigned i) {
return (1.f + sig(i, 22u))
* pow(exponent(i))
* sign(i);
}
}
constexpr unsigned f2i(float x) {
return float2unsigned::to_binary(x);
}
constexpr unsigned f2i(float x) {
return float2unsigned::to_binary(x);
}
constexpr float i2f(float x) {
return unsigned2float::from_binary(x);
}
// Decodes the 32-bit pattern x via unsigned2float::from_binary.
// The parameter is unsigned so the bit pattern reaches from_binary unchanged.
// Taking it as float made every unsigned argument round-trip through float,
// which cannot hold all 32-bit values exactly (low bits were rounded away,
// and patterns rounding up to 2^32 could not be converted back at all).
constexpr float i2f(unsigned x) {
  return unsigned2float::from_binary(x);
}
template <unsigned V>
struct F {
static constexpr auto value = i2f(V);
static constexpr auto binary = V;
template <typename ...Args>
__HDI__ constexpr float operator()(Args&&... args) const {
return value;
}
std::string to_string() {
return "F<" + std::to_string(value) + ">";
}
};
template <unsigned V>
struct F {
static constexpr auto value = i2f(V);
static constexpr auto binary = V;
template <typename... Args>
__HDI__ constexpr float operator()(Args&&... args) const {
return value;
}
}
std::string to_string() { return "F<" + std::to_string(value) + ">"; }
};
}
}

View File

@ -4,25 +4,25 @@
#include "functional/predicates.h"
namespace marian {
namespace functional {
namespace functional {
template <int N>
using ref = Assignee<N>;
template <int N>
using ref = Assignee<N>;
static ref<1> _1;
static ref<2> _2;
static ref<3> _3;
static ref<4> _4;
static ref<5> _5;
static ref<6> _6;
static ref<7> _7;
static ref<8> _8;
static ref<9> _9;
static ref<1> _1;
static ref<2> _2;
static ref<3> _3;
static ref<4> _4;
static ref<5> _5;
static ref<6> _6;
static ref<7> _7;
static ref<8> _8;
static ref<9> _9;
static C<0> _0c;
static C<1> _1c;
static C<2> _2c;
static C<-1> _1cneg;
static C<-2> _2cneg;
}
static C<0> _0c;
static C<1> _1c;
static C<2> _2c;
static C<-1> _1cneg;
static C<-2> _2cneg;
}
}

View File

@ -5,71 +5,69 @@
#include "functional/defs.h"
namespace marian {
namespace functional {
namespace functional {
template <class C>
using IsClass = typename std::enable_if<std::is_class<C>::value, C>::type;
template <int N>
struct Select {
template <typename T, typename ...Args>
__HDI__ static auto apply(T&& arg, Args&&... args) -> decltype(Select<N-1>::apply(args...)) {
return Select<N-1>::apply(args...);
}
};
template <>
struct Select<0> {
template <typename T, typename ...Args>
__HDI__ static T apply(T&& arg, Args&&... args) {
return arg;
}
};
/******************************************************************************/
template <int V>
struct C {
static constexpr auto value = V;
template <typename ...Args>
__HDI__ float operator()(Args&&... args) { return V; }
std::string to_string() {
return "C<" + std::to_string(V) + ">";
}
};
/******************************************************************************/
struct Capture {
float value;
Capture(float val) : value(val) {};
template <typename ...Args>
__HDI__ float operator()(Args&&... args) { return value; }
std::string to_string() {
return "Cap(" + std::to_string(value) + ")";
}
};
/******************************************************************************/
template <int N>
struct Var {
static constexpr auto index = N;
template <typename ...Args>
__HDI__ float& operator()(Args&&... args) {
return Select<N-1>::apply(args...);
}
std::string to_string() {
return "Var<" + std::to_string(N) + ">";
}
};
template <class C>
using IsClass = typename std::enable_if<std::is_class<C>::value, C>::type;
// Positional selector over an argument list: apply() ignores its first
// argument and passes the remaining ones on to Select<N - 1>::apply,
// returning whatever that call returns (the return type is taken from it
// via decltype).
template <int N>
struct Select {
template <typename T, typename... Args>
__HDI__ static auto apply(T&& arg, Args&&... args)
-> decltype(Select<N - 1>::apply(args...)) {
return Select<N - 1>::apply(args...);
}
};
// Specialization for index 0: apply() returns its first argument and ignores
// all others. The return type is the deduced T, so an lvalue argument is
// returned by reference and an rvalue argument by value.
template <>
struct Select<0> {
template <typename T, typename... Args>
__HDI__ static T apply(T&& arg, Args&&... args) {
return arg;
}
};
/******************************************************************************/
// Functor for the compile-time integer constant V: calling it with any
// arguments returns V as a float; to_string() renders it as "C<V>".
template <int V>
struct C {
  static constexpr auto value = V;

  template <typename... Args>
  __HDI__ float operator()(Args&&... args) {
    return static_cast<float>(V);
  }

  std::string to_string() {
    std::string text = "C<";
    text += std::to_string(V);
    text += ">";
    return text;
  }
};
/******************************************************************************/
// Functor holding a float chosen at run time: calling it with any arguments
// returns the stored value; to_string() renders it as "Cap(value)".
// The constructor is intentionally left implicit, as in the original.
struct Capture {
  float value;

  Capture(float val) : value(val) {}

  template <typename... Args>
  __HDI__ float operator()(Args&&... args) {
    return value;
  }

  std::string to_string() {
    std::string text = "Cap(";
    text += std::to_string(value);
    text += ")";
    return text;
  }
};
/******************************************************************************/
// Placeholder for the N-th call argument: operator() hands its arguments to
// Select<N - 1>::apply and returns the result as float&; to_string() renders
// it as "Var<N>".
template <int N>
struct Var {
  static constexpr auto index = N;

  template <typename... Args>
  __HDI__ float& operator()(Args&&... args) {
    using Picker = Select<N - 1>;
    return Picker::apply(args...);
  }

  std::string to_string() {
    std::string text = "Var<";
    text += std::to_string(N);
    text += ">";
    return text;
  }
};
}
}

View File

@ -4,237 +4,230 @@
#include "functional/operands.h"
namespace marian {
namespace functional {
namespace functional {
template <typename Function, typename X>
struct UnaryFunctor {
X x;
template <typename Function, typename X>
struct UnaryFunctor {
X x;
template <class Arg>
UnaryFunctor(Arg a) : x(a) {}
template <class Arg>
UnaryFunctor(Arg a) : x(a) {}
template <typename ...Args>
__HDI__ float operator()(Args&&... args) {
return Function::apply(x(args...));
}
template <typename... Args>
__HDI__ float operator()(Args&&... args) {
return Function::apply(x(args...));
}
std::string to_string() {
return Function::n() + "<" + x.to_string() + ">";
}
};
std::string to_string() { return Function::n() + "<" + x.to_string() + ">"; }
};
template <class Function, class X, class Y>
struct BinaryFunctor {
X x;
Y y;
template <class Function, class X, class Y>
struct BinaryFunctor {
X x;
Y y;
template <class Arg1, class Arg2>
BinaryFunctor(Arg1 arg1, Arg2 arg2) : x(arg1), y(arg2) {}
template <class Arg1, class Arg2>
BinaryFunctor(Arg1 arg1, Arg2 arg2) : x(arg1), y(arg2) {}
template <typename ...Args>
__HDI__ float operator()(Args&&... args) {
return Function::apply(x(args...), y(args...));
}
template <typename... Args>
__HDI__ float operator()(Args&&... args) {
return Function::apply(x(args...), y(args...));
}
std::string to_string() {
return Function::n() +
"<" + x.to_string() +
"," + y.to_string() + ">";
}
};
std::string to_string() {
return Function::n() + "<" + x.to_string() + "," + y.to_string() + ">";
}
};
#define UNARY(name, name2, func) \
namespace elem { \
struct name { \
__HDI__ static float apply(float x) { return func; } \
static std::string n() { return #name; }\
}; \
}\
template <class X> using name = UnaryFunctor<elem::name, X>;\
template <typename X>\
name<IsClass<X>> name2(X x) {\
return name<X>(x);\
}\
static name<Capture> name2(Capture x) {\
return name<Capture>(x);\
}
#define UNARY(name, name2, func) \
namespace elem { \
struct name { \
__HDI__ static float apply(float x) { return func; } \
static std::string n() { return #name; } \
}; \
} \
template <class X> \
using name = UnaryFunctor<elem::name, X>; \
template <typename X> \
name<IsClass<X>> name2(X x) { \
return name<X>(x); \
} \
static name<Capture> name2(Capture x) { return name<Capture>(x); }
#define BINARY(name, name2, func) \
namespace elem { \
struct name { \
__HDI__ static float apply(float x, float y) { return func; } \
static std::string n() { return #name; }\
}; \
}\
template <class X, class Y> using name = BinaryFunctor<elem::name, X, Y>;\
template <class X, class Y>\
name<IsClass<X>, IsClass<Y>> name2(X x, Y y) {\
return name<X, Y>(x, y);\
}\
template <class Y>\
name<Capture, IsClass<Y>> name2(Capture x, Y y) {\
return name<Capture, Y>(x, y);\
}\
template <class X>\
name<IsClass<X>, Capture> name2(X x, Capture y) {\
return name<X, Capture>(x, y);\
}
#define BINARY(name, name2, func) \
namespace elem { \
struct name { \
__HDI__ static float apply(float x, float y) { return func; } \
static std::string n() { return #name; } \
}; \
} \
template <class X, class Y> \
using name = BinaryFunctor<elem::name, X, Y>; \
template <class X, class Y> \
name<IsClass<X>, IsClass<Y>> name2(X x, Y y) { \
return name<X, Y>(x, y); \
} \
template <class Y> \
name<Capture, IsClass<Y>> name2(Capture x, Y y) { \
return name<Capture, Y>(x, y); \
} \
template <class X> \
name<IsClass<X>, Capture> name2(X x, Capture y) { \
return name<X, Capture>(x, y); \
}
UNARY(Tanh, tanh, tanhf(x));
UNARY(Sin, sin, sinf(x));
UNARY(Cos, cos, cosf(x));
UNARY(Tan, tan, tanf(x));
UNARY(Log, log, logf(x));
UNARY(Exp, exp, expf(x));
UNARY(Abs, abs, fabs(x));
UNARY(Sqrt, sqrt, sqrtf(x));
UNARY(Neg, operator-, -x);
UNARY(Logit, logit, x > 0 ? (1.f / (1.f + expf(-x))) : (expf(x) / (1.f + expf(x))));
UNARY(Tanh, tanh, tanhf(x));
UNARY(Sin, sin, sinf(x));
UNARY(Cos, cos, cosf(x));
UNARY(Tan, tan, tanf(x));
UNARY(Log, log, logf(x));
UNARY(Exp, exp, expf(x));
UNARY(Abs, abs, fabs(x));
UNARY(Sqrt, sqrt, sqrtf(x));
UNARY(Neg, operator-, -x);
UNARY(Logit,
logit,
x > 0 ? (1.f / (1.f + expf(-x))) : (expf(x) / (1.f + expf(x))));
BINARY(Plus, operator+, x + y);
BINARY(Minus, operator-, x - y);
BINARY(Mult, operator*, x * y);
BINARY(Div, operator/, x / y);
BINARY(Plus, operator+, x + y);
BINARY(Minus, operator-, x - y);
BINARY(Mult, operator*, x* y);
BINARY(Div, operator/, x / y);
UNARY(Negate, operator!, !x);
BINARY(Eq, operator==, x == y);
BINARY(NEq, operator!=, x != y);
BINARY(Gt, operator>, x > y);
BINARY(Lt, operator<, x < y);
BINARY(Geq, operator>=, x >= y);
BINARY(Leq, operator<=, x <= y);
BINARY(And, operator&&, x && y);
BINARY(Or, operator||, x || y);
UNARY(Negate, operator!, !x);
BINARY(Eq, operator==, x == y);
BINARY(NEq, operator!=, x != y);
BINARY(Gt, operator>, x> y);
BINARY(Lt, operator<, x<y);
BINARY(Geq, operator>=, x >= y);
BINARY(Leq, operator<=, x <= y);
BINARY(And, operator&&, x&& y);
BINARY(Or, operator||, x || y);
template <typename T>
__HDI__ T sgn(T val) {
return (float(0) < val) - (val < float(0));
}
// Sign function: yields 1 when val is greater than zero, -1 when it is less
// than zero and 0 otherwise, converted to T.
template <typename T>
__HDI__ T sgn(T val) {
  const bool isPositive = float(0) < val;
  const bool isNegative = val < float(0);
  return isPositive - isNegative;
}
UNARY(Sgn, sgn, sgn(x));
UNARY(Sgn, sgn, sgn(x));
BINARY(Pow, pow, pow(x, y));
BINARY(Pow, pow, pow(x, y));
BINARY(Clip, clip, fabs(x) >= y ? sgn(x) * y : x);
BINARY(Clip, clip, fabs(x) >= y ? sgn(x) * y : x);
UNARY(sReLU, ReLU, x > 0.f ? x : 0.f);
UNARY(sReLUBack, ReLUback, x > 0.f ? 1.f : 0.f);
BINARY(sPReLU, PReLU, x > 0.f ? x : x * y);
BINARY(sPReLUBack, PReLUback, x > 0.f ? 1.f : y);
UNARY(sReLU, ReLU, x > 0.f ? x : 0.f);
UNARY(sReLUBack, ReLUback, x > 0.f ? 1.f : 0.f);
BINARY(sPReLU, PReLU, x > 0.f ? x : x * y);
BINARY(sPReLUBack, PReLUback, x > 0.f ? 1.f : y);
template <class Function, class X, class Y, class Z>
struct TernaryFunctor {
X x;
Y y;
Z z;
template <class Function, class X, class Y, class Z>
struct TernaryFunctor {
X x;
Y y;
Z z;
template <class Arg1, class Arg2, class Arg3>
TernaryFunctor(Arg1 arg1, Arg2 arg2, Arg3 arg3)
: x(arg1), y(arg2), z(arg3) {}
template <class Arg1, class Arg2, class Arg3>
TernaryFunctor(Arg1 arg1, Arg2 arg2, Arg3 arg3) : x(arg1), y(arg2), z(arg3) {}
template <typename ...Args>
__HDI__ float operator()(Args&&... args) {
return Function::apply(x(args...), y(args...), z(args...));
}
};
template <typename... Args>
__HDI__ float operator()(Args&&... args) {
return Function::apply(x(args...), y(args...), z(args...));
}
};
#define TERNARY(name, name2, func) \
namespace elem { \
struct name { \
__HDI__ static float apply(float x, float y, float z) { return func; } \
}; \
}\
template <class X, class Y, class Z> using name = TernaryFunctor<elem::name, X, Y, Z>;\
template <typename X, typename Y, typename Z>\
name<IsClass<X>, IsClass<Y>, IsClass<Z>> name2(X x, Y y, Z z) {\
return name<X, Y, Z>(x, y, z);\
}\
template <typename X, typename Z>\
name<IsClass<X>, Capture, IsClass<Z>> name2(X x, Capture y, Z z) {\
return name<X, Capture, Z>(x, y, z);\
}\
template <typename Y, typename Z>\
name<Capture, IsClass<Y>, IsClass<Z>> name2(Capture x, Y y, Z z) {\
return name<Capture, Y, Z>(x, y, z);\
}\
template <typename X>\
name<IsClass<X>, Capture, Capture> name2(X x, Capture y, Capture z) {\
return name<X, Capture, Capture>(x, y, z);\
}\
template <typename Y>\
name<Capture, IsClass<Y>, Capture> name2(Capture x, Y y, Capture z) {\
return name<Capture, Y, Capture>(x, y, z);\
}\
template <typename Z>\
name<Capture, Capture, IsClass<Z>> name2(Capture x, Capture y, Z z) {\
return name<Capture, Capture, Z>(x, y, z);\
}
#define TERNARY(name, name2, func) \
namespace elem { \
struct name { \
__HDI__ static float apply(float x, float y, float z) { return func; } \
}; \
} \
template <class X, class Y, class Z> \
using name = TernaryFunctor<elem::name, X, Y, Z>; \
template <typename X, typename Y, typename Z> \
name<IsClass<X>, IsClass<Y>, IsClass<Z>> name2(X x, Y y, Z z) { \
return name<X, Y, Z>(x, y, z); \
} \
template <typename X, typename Z> \
name<IsClass<X>, Capture, IsClass<Z>> name2(X x, Capture y, Z z) { \
return name<X, Capture, Z>(x, y, z); \
} \
template <typename Y, typename Z> \
name<Capture, IsClass<Y>, IsClass<Z>> name2(Capture x, Y y, Z z) { \
return name<Capture, Y, Z>(x, y, z); \
} \
template <typename X> \
name<IsClass<X>, Capture, Capture> name2(X x, Capture y, Capture z) { \
return name<X, Capture, Capture>(x, y, z); \
} \
template <typename Y> \
name<Capture, IsClass<Y>, Capture> name2(Capture x, Y y, Capture z) { \
return name<Capture, Y, Capture>(x, y, z); \
} \
template <typename Z> \
name<Capture, Capture, IsClass<Z>> name2(Capture x, Capture y, Z z) { \
return name<Capture, Capture, Z>(x, y, z); \
}
TERNARY(IfThenElse, if_then_else, x ? y : z);
TERNARY(IfThenElse, if_then_else, x ? y : z);
template <class X, class Y>
struct Assign {
X x;
Y y;
template <class Arg1, class Arg2>
Assign(Arg1 arg1, Arg2 arg2) : x(arg1), y(arg2) {}
template <class X, class Y>
struct Assign {
X x;
Y y;
template <typename... Args>
__HDI__ float operator()(Args&&... args) {
return x(args...) = y(args...);
}
};
template <class Arg1, class Arg2>
Assign(Arg1 arg1, Arg2 arg2) : x(arg1), y(arg2) {}
template <int N>
struct Assignee {
Var<N> var;
template <typename ...Args>
__HDI__ float operator()(Args&&... args) {
return x(args...) = y(args...);
}
};
Assignee() {}
Assignee(Var<N> v) : var(v) {}
template <int N>
struct Assignee {
Var<N> var;
template <typename... Args>
__HDI__ float& operator()(Args&&... args) {
return var(args...);
}
Assignee() {}
Assignee(Var<N> v) : var(v) {}
template <class X>
Assign<Var<N>, IsClass<X>> operator=(X x) {
return Assign<Var<N>, X>(var, x);
}
template <typename ...Args>
__HDI__ float& operator()(Args&&... args) {
return var(args...);
}
Assign<Var<N>, Capture> operator=(Capture x) {
return Assign<Var<N>, Capture>(var, x);
}
template <class X>
Assign<Var<N>, IsClass<X>> operator=(X x) {
return Assign<Var<N>, X>(var, x);
}
template <class X>
auto operator+=(X x) -> decltype(*this = *this + x) {
return *this = *this + x;
}
Assign<Var<N>, Capture> operator=(Capture x) {
return Assign<Var<N>, Capture>(var, x);
}
template <class X>
auto operator-=(X x) -> decltype(*this = *this - x) {
return *this = *this - x;
}
template <class X>
auto operator+=(X x)->decltype(*this = *this + x) {
return *this = *this + x;
}
template <class X>
auto operator*=(X x) -> decltype(*this = *this * x) {
return *this = *this * x;
}
template <class X>
auto operator-=(X x)->decltype(*this = *this - x) {
return *this = *this - x;
}
template <class X>
auto operator/=(X x) -> decltype(*this = *this / x) {
return *this = *this / x;
}
template <class X>
auto operator*=(X x)->decltype(*this = *this * x) {
return *this = *this * x;
}
template <class X>
auto operator/=(X x)->decltype(*this = *this / x) {
return *this = *this / x;
}
std::string to_string() {
return var.to_string();
}
};
std::string to_string() { return var.to_string(); }
};
/******************************************************************************/
}
}
}

View File

@ -17,7 +17,6 @@ namespace functional {
* @brief Represents the size of each dimension in a tensor.
*/
template <const int N>
struct ConstantShape {
Array<int, N> shape_;
@ -32,10 +31,10 @@ struct ConstantShape {
}
__HD__ ConstantShape(const ConstantShape& shape)
: shape_(shape.shape_),
stride_(shape.stride_),
bstride_(shape.bstride_),
elements_(shape.elements_) {}
: shape_(shape.shape_),
stride_(shape.stride_),
bstride_(shape.bstride_),
elements_(shape.elements_) {}
ConstantShape(const Shape& shape) {
size_t filled = shape.size();
@ -43,7 +42,8 @@ struct ConstantShape {
ABORT_IF(filled > N,
"Recompile with CONST_SHAPE_DIMS >= " + std::to_string(filled));
std::copy(shape.shape_.begin(), shape.shape_.end(), shape_.begin() + N - filled);
std::copy(
shape.shape_.begin(), shape.shape_.end(), shape_.begin() + N - filled);
if(N - filled)
std::fill_n(shape_.begin(), N - filled, 1);
updateStrides();
@ -51,7 +51,6 @@ struct ConstantShape {
}
__HDI__ void updateStrides() {
stride_[N - 1] = 1;
bstride_[N - 1] = shape_[N - 1] == 1 ? 0 : stride_[N - 1];
@ -73,7 +72,6 @@ struct ConstantShape {
updateElements();
}
__HDI__ int dim(int i) { return shape_[i]; }
__HDI__ int dim(int i) const {
@ -92,9 +90,7 @@ struct ConstantShape {
__HDI__ static constexpr size_t size() { return N; }
__HDI__ int elements() const {
return elements_;
}
__HDI__ int elements() const { return elements_; }
__HDI__ int index(const Array<int, N>& d) const {
int i = 0;
@ -113,7 +109,7 @@ struct ConstantShape {
__HDI__ void dims(int i, Array<int, N>& d) const {
for(int j = 0; j < N; ++j)
d[j] = (i / stride_[j]) % shape_[j];
}
}
__HDI__ bool operator==(const ConstantShape& other) const {
for(int i = 0; i < N; ++i)
@ -128,7 +124,5 @@ struct ConstantShape {
};
typedef ConstantShape<CONST_SHAPE_DIMS> Shape;
}
}

View File

@ -7,7 +7,7 @@
namespace marian {
namespace functional {
template<typename T>
template <typename T>
struct Tensor {
T* data_;
functional::Shape shape_;
@ -15,19 +15,20 @@ struct Tensor {
__HD__ Tensor() {}
__HD__ Tensor(T* ptr, const functional::Shape& shape)
: data_(ptr), shape_(shape) {}
: data_(ptr), shape_(shape) {}
__H__ Tensor(marian::Tensor t)
: data_(t->data()), shape_(t->shape()) {}
__H__ Tensor(marian::Tensor t) : data_(t->data()), shape_(t->shape()) {}
__HDI__ float& operator[](size_t i) { return data_[i]; }
__HDI__ const float& operator[](size_t i) const { return data_[i]; }
__HDI__ float& operator[](const functional::Array<int, functional::Shape::size()>& indices) {
__HDI__ float& operator[](
const functional::Array<int, functional::Shape::size()>& indices) {
return data_[shape_.index(indices)];
}
__HDI__ const float& operator[](const functional::Array<int, functional::Shape::size()>& indices) const {
__HDI__ const float& operator[](
const functional::Array<int, functional::Shape::size()>& indices) const {
return data_[shape_.index(indices)];
}
@ -37,6 +38,5 @@ struct Tensor {
__HDI__ Shape& shape() { return shape_; }
__HDI__ const Shape& shape() const { return shape_; }
};
}
}

View File

@ -12,82 +12,86 @@ struct FApply {};
template <class Functor>
struct FApply<1, Functor> {
__HDI__ static float apply(Functor functor,
functional::Array<functional::Tensor<float>, 1>& in,
const functional::Array<int, 1>& indices) {
__HDI__ static float apply(
Functor functor,
functional::Array<functional::Tensor<float>, 1>& in,
const functional::Array<int, 1>& indices) {
return functor(in[0][indices[0]]);
}
__HDI__ static float apply(Functor functor,
functional::Array<functional::Tensor<float>, 1>& in,
int index) {
__HDI__ static float apply(
Functor functor,
functional::Array<functional::Tensor<float>, 1>& in,
int index) {
return functor(in[0][index]);
}
};
template <class Functor>
struct FApply<2, Functor> {
__HDI__ static float apply(Functor functor,
functional::Array<functional::Tensor<float>, 2>& in,
const functional::Array<int, 2>& indices) {
__HDI__ static float apply(
Functor functor,
functional::Array<functional::Tensor<float>, 2>& in,
const functional::Array<int, 2>& indices) {
return functor(in[0][indices[0]], in[1][indices[1]]);
}
__HDI__ static float apply(Functor functor,
functional::Array<functional::Tensor<float>, 2>& in,
int index) {
__HDI__ static float apply(
Functor functor,
functional::Array<functional::Tensor<float>, 2>& in,
int index) {
return functor(in[0][index], in[1][index]);
}
};
template <class Functor>
struct FApply<3, Functor> {
__HDI__ static float apply(Functor functor,
functional::Array<functional::Tensor<float>, 3>& in,
const functional::Array<int, 3>& indices) {
__HDI__ static float apply(
Functor functor,
functional::Array<functional::Tensor<float>, 3>& in,
const functional::Array<int, 3>& indices) {
return functor(in[0][indices[0]], in[1][indices[1]], in[2][indices[2]]);
}
__HDI__ static float apply(Functor functor,
functional::Array<functional::Tensor<float>, 3>& in,
int index) {
__HDI__ static float apply(
Functor functor,
functional::Array<functional::Tensor<float>, 3>& in,
int index) {
return functor(in[0][index], in[1][index], in[2][index]);
}
};
template <class Functor>
struct FApply<4, Functor> {
__HDI__ static float apply(Functor functor,
functional::Array<functional::Tensor<float>, 4>& in,
const functional::Array<int, 4>& indices) {
__HDI__ static float apply(
Functor functor,
functional::Array<functional::Tensor<float>, 4>& in,
const functional::Array<int, 4>& indices) {
return functor(in[0][indices[0]],
in[1][indices[1]],
in[2][indices[2]],
in[3][indices[3]]);
}
__HDI__ static float apply(Functor functor,
functional::Array<functional::Tensor<float>, 4>& in,
int index) {
return functor(in[0][index],
in[1][index],
in[2][index],
in[3][index]);
__HDI__ static float apply(
Functor functor,
functional::Array<functional::Tensor<float>, 4>& in,
int index) {
return functor(in[0][index], in[1][index], in[2][index], in[3][index]);
}
};
template <size_t K, class Functor>
__HDI__ float apply(Functor functor,
functional::Array<functional::Tensor<float>, K>& in,
const functional::Array<int, K>& indices) {
__HDI__ float apply(Functor functor,
functional::Array<functional::Tensor<float>, K>& in,
const functional::Array<int, K>& indices) {
return FApply<K, Functor>::apply(functor, in, indices);
}
template <size_t K, class Functor>
__HDI__ float apply(Functor functor,
functional::Array<functional::Tensor<float>, K>& in,
int index) {
__HDI__ float apply(Functor functor,
functional::Array<functional::Tensor<float>, K>& in,
int index) {
return FApply<K, Functor>::apply(functor, in, index);
}
@ -96,11 +100,12 @@ template <size_t K, class Functor>
template <size_t n, size_t N, size_t K>
struct Loop {
template <class Functor>
__HDI__ static float result(Functor functor,
functional::Array<functional::Tensor<float>, K>& in,
const functional::Array<int, K>& pAcc,
const functional::Array<int, N>& length,
const functional::Array<int, N>& dim) {
__HDI__ static float result(
Functor functor,
functional::Array<functional::Tensor<float>, K>& in,
const functional::Array<int, K>& pAcc,
const functional::Array<int, N>& length,
const functional::Array<int, N>& dim) {
float sum = 0;
functional::Array<int, K> acc;
for(int i = 0; i < length[N - n]; ++i) {
@ -116,11 +121,12 @@ struct Loop {
template <size_t N, size_t K>
struct Loop<1, N, K> {
template <class Functor>
__HDI__ static float result(Functor functor,
functional::Array<functional::Tensor<float>, K>& in,
const functional::Array<int, K>& pAcc,
const functional::Array<int, N>& length,
const functional::Array<int, N>& dim) {
__HDI__ static float result(
Functor functor,
functional::Array<functional::Tensor<float>, K>& in,
const functional::Array<int, K>& pAcc,
const functional::Array<int, N>& length,
const functional::Array<int, N>& dim) {
float sum = 0;
functional::Array<int, K> acc;
for(int i = 0; i < length[N - 1]; ++i) {
@ -141,6 +147,5 @@ __HDI__ float loops(Functor functor,
functional::Array<int, K> acc = {0};
return Loop<N, N, K>::result(functor, in, acc, length, dim);
}
}
}
}

View File

@ -1,5 +1,5 @@
#include <sstream>
#include "graph/expression_graph.h"
#include <sstream>
#include "tensors/tensor_operators.h"
@ -18,15 +18,12 @@ void ExpressionGraph::setDevice(DeviceId deviceId) {
}
Expr ExpressionGraph::dropout(float prob, const Shape& shape) {
return Expression<ConstantNode>(shared_from_this(),
shape,
[prob, this](Tensor t) {
Dropout(t, prob);
});
return Expression<ConstantNode>(
shared_from_this(), shape, [prob, this](Tensor t) { Dropout(t, prob); });
}
void ExpressionGraph::checkNan(Tensor t) {
ABORT_IF(throwNaN_, "Not implemented");
//ABORT_IF(throwNaN_ && IsNan(t), "Tensor has NaN");
// ABORT_IF(throwNaN_ && IsNan(t), "Tensor has NaN");
}
}

View File

@ -215,7 +215,9 @@ public:
ABORT_IF(shape != p->shape(),
"Requested shape {} for existing parameter '{}' does not match "
"original shape {}",
shape, name, p->shape());
shape,
name,
p->shape());
p->setTrainable(!fixed);
add(p);
@ -239,10 +241,8 @@ public:
return p;
}
Expr constant(const Shape& shape,
const NodeInitializer& init) {
return Expression<ConstantNode>(
shared_from_this(), shape, init);
Expr constant(const Shape& shape, const NodeInitializer& init) {
return Expression<ConstantNode>(shared_from_this(), shape, init);
}
Expr ones(const Shape& shape) {

View File

@ -126,7 +126,6 @@ Expr repeat(Expr a, size_t repeats, keywords::axis_k ax) {
return concatenate(std::vector<Expr>(repeats, a), ax);
}
Expr reshape(Expr a, Shape shape) {
return Expression<ReshapeNodeOp>(a, shape);
}
@ -165,10 +164,7 @@ Expr flatten(Expr a) {
}
Expr flatten_2d(Expr a) {
Shape shape = {
a->shape().elements() / a->shape()[-1],
a->shape()[-1]
};
Shape shape = {a->shape().elements() / a->shape()[-1], a->shape()[-1]};
return Expression<ReshapeNodeOp>(a, shape);
}
@ -232,17 +228,16 @@ Expr step(Expr a, int step, int axis) {
}
Expr cross_entropy(Expr a, Expr b) {
//auto sOrig = a->shape();
//auto sOut = a->shape();
//Shape sTemp({sOrig[0] * sOrig[2] * sOrig[3], sOrig[1], 1, 1});
//sOut.set(1, 1);
//return reshape(Expression<CrossEntropyNodeOp>(reshape(a, sTemp), b), sOut);
// auto sOrig = a->shape();
// auto sOut = a->shape();
// Shape sTemp({sOrig[0] * sOrig[2] * sOrig[3], sOrig[1], 1, 1});
// sOut.set(1, 1);
// return reshape(Expression<CrossEntropyNodeOp>(reshape(a, sTemp), b), sOut);
return Expression<CrossEntropyNodeOp>(a, b);
}
Expr affine(Expr a, Expr b, Expr c,
bool transA, bool transB, float scalar) {
Expr affine(Expr a, Expr b, Expr c, bool transA, bool transB, float scalar) {
std::vector<Expr> nodes = {a, b, c};
return Expression<AffineNodeOp>(nodes, transA, transB, scalar);
}
@ -299,6 +294,7 @@ Expr highway(Expr y, Expr x, Expr t) {
}
Expr highway(const std::string prefix, Expr x) {
// clang-format off
size_t outDim = x->shape()[-1];
auto g = mlp::dense(x->graph())
("prefix", prefix + "_highway_d1")
@ -311,6 +307,7 @@ Expr highway(const std::string prefix, Expr x) {
("activation", mlp::act::ReLU)
.construct()->apply(x);
return (g * relued) + ((1 - g) * x);
// clang-format on
}
// Expr batch_norm(Expr x, Expr gamma, Expr beta) {
@ -334,41 +331,26 @@ Expr shift(Expr a, Shape shift) {
#ifdef CUDA_FOUND
Expr avg_pooling(
Expr x,
int height,
int width,
int padHeight,
int padWidth,
int strideHeight,
int strideWidth) {
return Expression<PoolingOp>(x,
height,
width,
padHeight,
padWidth,
strideHeight,
strideWidth,
"avg");
Expr avg_pooling(Expr x,
int height,
int width,
int padHeight,
int padWidth,
int strideHeight,
int strideWidth) {
return Expression<PoolingOp>(
x, height, width, padHeight, padWidth, strideHeight, strideWidth, "avg");
}
Expr max_pooling(
Expr x,
int height,
int width,
int padHeight,
int padWidth,
int strideHeight,
int strideWidth)
{
return Expression<PoolingOp>(x,
height,
width,
padHeight,
padWidth,
strideHeight,
strideWidth,
"max");
Expr max_pooling(Expr x,
int height,
int width,
int padHeight,
int padWidth,
int strideHeight,
int strideWidth) {
return Expression<PoolingOp>(
x, height, width, padHeight, padWidth, strideHeight, strideWidth, "max");
}
Expr convert2cudnnFormat(Expr x) {
@ -377,13 +359,13 @@ Expr convert2cudnnFormat(Expr x) {
int embSize = x->shape()[2];
std::vector<size_t> newIndeces;
for (int b = 0; b < numExamples; ++b) {
for (int t = 0; t < numWords; ++t) {
for(int b = 0; b < numExamples; ++b) {
for(int t = 0; t < numWords; ++t) {
newIndeces.push_back((t * numExamples) + b);
}
}
auto xRows = reshape(x, {x->shape()[0] * x ->shape()[1], x->shape()[2]});
auto xRows = reshape(x, {x->shape()[0] * x->shape()[1], x->shape()[2]});
Shape outShape({numExamples, 1, numWords, embSize});
return reshape(rows(xRows, newIndeces), outShape);
@ -397,8 +379,8 @@ Expr convertFromcudnnFormat(Expr x) {
auto reshapedX = reshape(x, {batchDim * sentenceDim, embSize});
std::vector<size_t> newIndeces;
for (int t = 0; t < sentenceDim; ++t) {
for (int b = 0; b < batchDim; ++b) {
for(int t = 0; t < sentenceDim; ++t) {
for(int b = 0; b < batchDim; ++b) {
newIndeces.push_back(b * sentenceDim + t);
}
}
@ -412,5 +394,4 @@ Expr pooling_with_masking(Expr x, Expr mask, int width, bool isEven) {
}
#endif
}

View File

@ -110,7 +110,6 @@ Expr mean(Expr a, keywords::axis_k ax = 0);
Expr cross_entropy(Expr a, Expr b);
Expr scalar_product(Expr a, Expr b, keywords::axis_k ax = 0);
Expr weighted_average(Expr in, Expr weights, keywords::axis_k ax = 0);
@ -161,6 +160,5 @@ Expr max_pooling(Expr x,
int strideHeight = 1,
int strideWidth = 1);
Expr pooling_with_masking(Expr x, Expr mask, int width, bool isEven=false);
Expr pooling_with_masking(Expr x, Expr mask, int width, bool isEven = false);
}

View File

@ -1,4 +1,5 @@
#include "tensors/backend.h"
#include "graph/expression_graph.h"
#include "graph/node.h"

View File

@ -33,8 +33,7 @@ protected:
public:
Node(Ptr<ExpressionGraph> graph, Shape shape)
: graph_(graph),
shape_(shape) {}
: graph_(graph), shape_(shape) {}
virtual ~Node() {
if(destroy_) {
@ -152,7 +151,7 @@ struct NaryNodeOp : public Node {
}
NaryNodeOp(const std::vector<Expr>& nodes)
: NaryNodeOp(nodes, nodes[0]->shape()) {}
: NaryNodeOp(nodes, nodes[0]->shape()) {}
virtual ~NaryNodeOp() {}

View File

@ -109,9 +109,8 @@ void ortho(Tensor t) {
NodeInitializer from_vector(const std::vector<float>& v) {
auto vPtr = New<std::vector<float>>(v.begin(), v.end());
return [vPtr](Tensor t) {
t->set(vPtr->data(), vPtr->data() + vPtr->size());
};
return
[vPtr](Tensor t) { t->set(vPtr->data(), vPtr->data() + vPtr->size()); };
}
NodeInitializer from_vector(const std::vector<size_t>& v) {
@ -138,9 +137,9 @@ NodeInitializer from_numpy(const cnpy::NpyArrayPtr& np) {
// move this somewhere else
NodeInitializer from_word2vec(const std::string& file,
int dimVoc,
int dimEmb,
bool normalize /*= false*/) {
int dimVoc,
int dimEmb,
bool normalize /*= false*/) {
return [file, dimVoc, dimEmb, normalize](Tensor t) {
auto embs = Word2VecReader().read(file, dimVoc, dimEmb);

View File

@ -70,9 +70,9 @@ NodeInitializer from_sparse_vector(
NodeInitializer from_numpy(const cnpy::NpyArrayPtr& np);
NodeInitializer from_word2vec(const std::string& file,
int dimVoc,
int dimEmb,
bool normalize = false);
int dimVoc,
int dimEmb,
bool normalize = false);
}
} // namespace marian

View File

@ -7,11 +7,12 @@
namespace marian {
struct ConstantNode : public Node {
ConstantNode(Ptr<ExpressionGraph> graph, const Shape& shape, const NodeInitializer& init)
ConstantNode(Ptr<ExpressionGraph> graph,
const Shape& shape,
const NodeInitializer& init)
: Node(graph, shape),
init_(new NodeInitializer(init)),
initialized_(false) {
setTrainable(false);
}
@ -41,11 +42,13 @@ private:
};
struct ParamNode : public Node {
ParamNode(Ptr<ExpressionGraph> graph, const Shape& shape, const NodeInitializer& init, bool fixed = false)
ParamNode(Ptr<ExpressionGraph> graph,
const Shape& shape,
const NodeInitializer& init,
bool fixed = false)
: Node(graph, shape),
init_(new NodeInitializer(init)),
initialized_(false) {
setTrainable(!fixed);
}

View File

@ -16,13 +16,8 @@ private:
float scalar_;
public:
DotNodeOp(Expr a,
Expr b,
bool transA,
bool transB,
float scalar)
: NaryNodeOp({a, b},
newShape(a, b, transA, transB)),
DotNodeOp(Expr a, Expr b, bool transA, bool transB, float scalar)
: NaryNodeOp({a, b}, newShape(a, b, transA, transB)),
transA_(transA),
transB_(transB),
scalar_(scalar) {}
@ -49,14 +44,13 @@ public:
NodeOps forwardOps() {
// C = alpha * dot(op(A), op(B))
return {NodeOp(Prod(
val_,
child(0)->val(),
child(1)->val(),
transA_,
transB_,
0.f,
scalar_))};
return {NodeOp(Prod(val_,
child(0)->val(),
child(1)->val(),
transA_,
transB_,
0.f,
scalar_))};
}
NodeOps backwardOps() {
@ -149,7 +143,7 @@ public:
: NaryNodeOp(nodes, newShape(nodes[0], nodes[1], transA, transB)),
transA_(transA),
transB_(transB),
scalar_(scalar){}
scalar_(scalar) {}
Shape newShape(Expr a, Expr b, bool transA, bool transB) {
auto shapeA = a->shape();
@ -171,19 +165,17 @@ public:
return outShape;
}
NodeOps forwardOps() {
using namespace functional;
return {
NodeOp(Prod(
val_,
child(0)->val(),
child(1)->val(),
transA_,
transB_,
0.f,
scalar_);
Add(_1, val_, child(2)->val()))
NodeOp(Prod(val_,
child(0)->val(),
child(1)->val(),
transA_,
transB_,
0.f,
scalar_);
Add(_1, val_, child(2)->val()))
};
}
@ -266,7 +258,6 @@ public:
const std::string type() { return "affine"; }
};
class DotBatchedNodeOp : public NaryNodeOp {
private:
bool transA_;
@ -274,13 +265,8 @@ private:
float scalar_;
public:
DotBatchedNodeOp(Expr a,
Expr b,
bool transA,
bool transB,
float scalar)
: NaryNodeOp({a, b},
newShape(a, b, transA, transB)),
DotBatchedNodeOp(Expr a, Expr b, bool transA, bool transB, float scalar)
: NaryNodeOp({a, b}, newShape(a, b, transA, transB)),
transA_(transA),
transB_(transB),
scalar_(scalar) {}
@ -307,14 +293,13 @@ public:
NodeOps forwardOps() {
// C = alpha * dot(op(A), op(B))
return {NodeOp(ProdBatched(
val_,
child(0)->val(),
child(1)->val(),
transA_,
transB_,
0.f,
scalar_))};
return {NodeOp(ProdBatched(val_,
child(0)->val(),
child(1)->val(),
transA_,
transB_,
0.f,
scalar_))};
}
NodeOps backwardOps() {
@ -325,71 +310,67 @@ public:
// to sum gradients from different graph parts
if(!transA_ && transB_)
return {
NodeOp(ProdBatched(child(0)->grad(),
adj_,
child(1)->val(),
false,
false,
1.0,
scalar_)),
NodeOp(ProdBatched(child(1)->grad(),
adj_,
child(0)->val(),
true,
false,
1.0,
scalar_))};
return {NodeOp(ProdBatched(child(0)->grad(),
adj_,
child(1)->val(),
false,
false,
1.0,
scalar_)),
NodeOp(ProdBatched(child(1)->grad(),
adj_,
child(0)->val(),
true,
false,
1.0,
scalar_))};
if(transA_ && !transB_)
return {
NodeOp(ProdBatched(child(0)->grad(),
child(1)->val(),
adj_,
false,
true,
1.0,
scalar_)),
NodeOp(ProdBatched(child(1)->grad(),
child(0)->val(),
adj_,
false,
false,
1.0,
scalar_))};
return {NodeOp(ProdBatched(child(0)->grad(),
child(1)->val(),
adj_,
false,
true,
1.0,
scalar_)),
NodeOp(ProdBatched(child(1)->grad(),
child(0)->val(),
adj_,
false,
false,
1.0,
scalar_))};
if(transA_ && transB_)
return {
NodeOp(ProdBatched(child(0)->grad(),
child(1)->val(),
adj_,
true,
true,
1.0,
scalar_)),
NodeOp(ProdBatched(child(1)->grad(),
adj_,
child(0)->val(),
true,
true,
1.0,
scalar_))};
return {NodeOp(ProdBatched(child(0)->grad(),
child(1)->val(),
adj_,
true,
true,
1.0,
scalar_)),
NodeOp(ProdBatched(child(1)->grad(),
adj_,
child(0)->val(),
true,
true,
1.0,
scalar_))};
return {
NodeOp(ProdBatched(child(0)->grad(),
adj_,
child(1)->val(),
false,
true,
1.0,
scalar_)),
NodeOp(ProdBatched(child(1)->grad(),
child(0)->val(),
adj_,
true,
false,
1.0,
scalar_))};
return {NodeOp(ProdBatched(child(0)->grad(),
adj_,
child(1)->val(),
false,
true,
1.0,
scalar_)),
NodeOp(ProdBatched(child(1)->grad(),
child(0)->val(),
adj_,
true,
false,
1.0,
scalar_))};
}
const std::string type() { return ""; }
@ -400,8 +381,7 @@ public:
struct ScalarProductNodeOp : public NaryNodeOp {
template <typename... Args>
ScalarProductNodeOp(Expr a, Expr b, Args... args)
: NaryNodeOp({a, b}, newShape(a, b, args...)) {
}
: NaryNodeOp({a, b}, newShape(a, b, args...)) {}
template <typename... Args>
Shape newShape(Expr a, Expr b, Args... args) {
@ -433,12 +413,9 @@ struct ScalarProductNodeOp : public NaryNodeOp {
};
struct ElementBinaryNodeOp : public NaryNodeOp {
ElementBinaryNodeOp(Expr a, Expr b)
: NaryNodeOp({a, b}, newShape(a, b)) {}
ElementBinaryNodeOp(Expr a, Expr b) : NaryNodeOp({a, b}, newShape(a, b)) {}
Shape newShape(Expr a, Expr b) {
return Shape::broadcast({a, b});
}
Shape newShape(Expr a, Expr b) { return Shape::broadcast({a, b}); }
const std::string color() { return "yellow"; }
};
@ -553,8 +530,7 @@ struct DivNodeOp : public ElementBinaryNodeOp {
// Cross-entropy node. It computes -b*log(softmax(a)), summing rowwise.
struct CrossEntropyNodeOp : public NaryNodeOp {
CrossEntropyNodeOp(Expr a, Expr b)
: NaryNodeOp({a, b}, newShape(a)) {}
CrossEntropyNodeOp(Expr a, Expr b) : NaryNodeOp({a, b}, newShape(a)) {}
Shape newShape(Expr a) {
Shape shape1 = a->shape();
@ -578,7 +554,9 @@ struct CrossEntropyNodeOp : public NaryNodeOp {
struct ConcatenateNodeOp : public NaryNodeOp {
template <typename... Args>
ConcatenateNodeOp(const std::vector<Expr>& nodes, Args... args)
: NaryNodeOp(nodes, newShape(nodes, keywords::Get(keywords::axis, 0, args...))) {}
: NaryNodeOp(nodes,
newShape(nodes, keywords::Get(keywords::axis, 0, args...))) {
}
Shape newShape(const std::vector<Expr>& nodes, int ax) {
Shape shape = nodes.back()->shape();
@ -730,38 +708,33 @@ struct HighwayNodeOp : public NaryNodeOp {
class ConvolutionOp : public NaryNodeOp {
public:
ConvolutionOp(
const std::vector<Expr>& nodes,
int hPad = 0,
int wPad = 0,
int hStride = 1,
int wStride = 1)
: NaryNodeOp(nodes),
conv_(nodes[1]->shape(),
nodes[2]->shape(),
hPad,
wPad,
hStride,
wStride) {
ConvolutionOp(const std::vector<Expr>& nodes,
int hPad = 0,
int wPad = 0,
int hStride = 1,
int wStride = 1)
: NaryNodeOp(nodes),
conv_(nodes[1]->shape(),
nodes[2]->shape(),
hPad,
wPad,
hStride,
wStride) {
conv_.getOutputShape(nodes[0]->shape(), shape_);
}
NodeOps forwardOps() {
return {NodeOp(conv_.forward(
child(0)->val(),
child(1)->val(),
child(2)->val(),
val_))};
child(0)->val(), child(1)->val(), child(2)->val(), val_))};
}
NodeOps backwardOps() {
return {NodeOp(conv_.backward(
child(0)->val(),
child(0)->grad(),
child(1)->val(),
child(1)->grad(),
child(2)->grad(),
adj_))};
return {NodeOp(conv_.backward(child(0)->val(),
child(0)->grad(),
child(1)->val(),
child(1)->grad(),
child(2)->grad(),
adj_))};
}
const std::string type() { return "layer_convolution"; }
@ -769,5 +742,4 @@ public:
protected:
ConvolutionWrapper conv_;
};
}

View File

@ -12,11 +12,9 @@
namespace marian {
struct UnaryNodeOp : public NaryNodeOp {
UnaryNodeOp(Expr a, Shape shape)
: NaryNodeOp({a}, shape) {}
UnaryNodeOp(Expr a, Shape shape) : NaryNodeOp({a}, shape) {}
UnaryNodeOp(Expr a)
: NaryNodeOp({a}, a->shape()) {}
UnaryNodeOp(Expr a) : NaryNodeOp({a}, a->shape()) {}
const std::string color() { return "yellow"; }
};
@ -26,9 +24,7 @@ private:
float scalar_{0};
public:
ScalarAddNodeOp(Expr a, float scalar)
: UnaryNodeOp(a),
scalar_{scalar} {}
ScalarAddNodeOp(Expr a, float scalar) : UnaryNodeOp(a), scalar_{scalar} {}
NodeOps forwardOps() {
using namespace functional;
@ -67,8 +63,7 @@ private:
float scalar_{0};
public:
ScalarMultNodeOp(Expr a, float scalar)
: UnaryNodeOp(a), scalar_{scalar} {}
ScalarMultNodeOp(Expr a, float scalar) : UnaryNodeOp(a), scalar_{scalar} {}
NodeOps forwardOps() {
using namespace functional;
@ -210,7 +205,6 @@ struct TanhNodeOp : public NaryNodeOp {
const std::string type() { return "tanh"; }
};
struct ReLUNodeOp : public UnaryNodeOp {
ReLUNodeOp(Expr a) : UnaryNodeOp(a) {}
@ -262,8 +256,7 @@ struct ReLUNodeOp : public UnaryNodeOp {
* \f]
*/
struct PReLUNodeOp : public UnaryNodeOp {
PReLUNodeOp(float alpha, Expr a)
: UnaryNodeOp(a), alpha_(alpha) {}
PReLUNodeOp(float alpha, Expr a) : UnaryNodeOp(a), alpha_(alpha) {}
NodeOps forwardOps() {
using namespace functional;
@ -334,11 +327,9 @@ struct SwishNodeOp : public UnaryNodeOp {
};
struct SoftmaxNodeOp : public UnaryNodeOp {
SoftmaxNodeOp(Expr a)
: UnaryNodeOp(a), mask_(nullptr) {}
SoftmaxNodeOp(Expr a) : UnaryNodeOp(a), mask_(nullptr) {}
SoftmaxNodeOp(Expr a, Expr mask)
: UnaryNodeOp(a), mask_(mask) {}
SoftmaxNodeOp(Expr a, Expr mask) : UnaryNodeOp(a), mask_(mask) {}
Expr mask_;
@ -407,17 +398,18 @@ struct SumNodeOp : public UnaryNodeOp {
int ax_;
template <typename... Args>
SumNodeOp(Expr a, Args... args)
: UnaryNodeOp(a, newShape(a, args...)) {}
SumNodeOp(Expr a, Args... args) : UnaryNodeOp(a, newShape(a, args...)) {}
NodeOps forwardOps() {
using namespace functional;
return {NodeOp(Reduce(_1, val_, child(0)->val()))}; }
return {NodeOp(Reduce(_1, val_, child(0)->val()))};
}
NodeOps backwardOps() {
using namespace functional;
return {NodeOp(Add(_1, child(0)->grad(), adj_))}; }
return {NodeOp(Add(_1, child(0)->grad(), adj_))};
}
template <class... Args>
Shape newShape(Expr a, Args... args) {
@ -456,8 +448,7 @@ struct MeanNodeOp : public UnaryNodeOp {
int ax_;
template <typename... Args>
MeanNodeOp(Expr a, Args... args)
: UnaryNodeOp(a, newShape(a, args...)) {}
MeanNodeOp(Expr a, Args... args) : UnaryNodeOp(a, newShape(a, args...)) {}
NodeOps forwardOps() {
using namespace functional;
@ -543,8 +534,7 @@ struct ExpNodeOp : public UnaryNodeOp {
struct SqrtNodeOp : public UnaryNodeOp {
float epsilon_;
SqrtNodeOp(Expr a, float epsilon)
: UnaryNodeOp(a), epsilon_(epsilon) {}
SqrtNodeOp(Expr a, float epsilon) : UnaryNodeOp(a), epsilon_(epsilon) {}
NodeOps forwardOps() {
using namespace functional;
@ -614,8 +604,7 @@ struct NegNodeOp : public UnaryNodeOp {
struct RowsNodeOp : public UnaryNodeOp {
RowsNodeOp(Expr a, const std::vector<size_t>& indeces)
: UnaryNodeOp(a, newShape(a, indeces)),
indices_(indeces) {}
: UnaryNodeOp(a, newShape(a, indeces)), indices_(indeces) {}
NodeOps forwardOps() {
// @TODO: solve this with a tensor!
@ -666,8 +655,7 @@ struct RowsNodeOp : public UnaryNodeOp {
struct ColsNodeOp : public UnaryNodeOp {
ColsNodeOp(Expr a, const std::vector<size_t>& indeces)
: UnaryNodeOp(a, newShape(a, indeces)),
indices_(indeces) {}
: UnaryNodeOp(a, newShape(a, indeces)), indices_(indeces) {}
NodeOps forwardOps() {
// @TODO: solve this with a tensor!
@ -716,8 +704,7 @@ struct ColsNodeOp : public UnaryNodeOp {
struct SelectNodeOp : public UnaryNodeOp {
SelectNodeOp(Expr a, int axis, const std::vector<size_t>& indeces)
: UnaryNodeOp(a, newShape(a, axis, indeces)),
indices_(indeces) {}
: UnaryNodeOp(a, newShape(a, axis, indeces)), indices_(indeces) {}
NodeOps forwardOps() {
return {NodeOp(
@ -772,8 +759,7 @@ struct TransposeNodeOp : public UnaryNodeOp {
std::vector<int> axes_;
TransposeNodeOp(Expr a, const std::vector<int>& axes)
: UnaryNodeOp(a, newShape(a, axes)),
axes_{axes} {}
: UnaryNodeOp(a, newShape(a, axes)), axes_{axes} {}
NodeOps forwardOps() {
return {NodeOp(TransposeND(val_, child(0)->val(), axes_))};
@ -788,7 +774,7 @@ struct TransposeNodeOp : public UnaryNodeOp {
Shape shape = a->shape();
ABORT_IF(shape.size() != axes.size(),
"Shape and transpose axes have different number of dimensions");
"Shape and transpose axes have different number of dimensions");
for(int i = 0; i < shape.size(); ++i)
shape.set(i, a->shape()[axes[i]]);
@ -829,8 +815,7 @@ private:
public:
template <typename... Args>
ReshapeNodeOp(Expr a, Shape shape)
: UnaryNodeOp(a, shape), reshapee_(a) {
ReshapeNodeOp(Expr a, Shape shape) : UnaryNodeOp(a, shape), reshapee_(a) {
Node::destroy_ = false;
}
@ -894,9 +879,7 @@ private:
public:
StepNodeOp(Expr a, int step, int axis)
: UnaryNodeOp(a, newShape(a, axis)),
stepNode_(a),
step_(step) {
: UnaryNodeOp(a, newShape(a, axis)), stepNode_(a), step_(step) {
Node::destroy_ = false;
}
@ -1056,67 +1039,54 @@ public:
padWidth,
strideHeight,
strideWidth,
mode) {
}
mode) {}
// Forward pass: returns a single NodeOp that wraps pooling_.forward(), called
// with the first child's value and this node's val_.
// NOTE(review): presumably child(0)->val() is the input and val_ the output —
// confirm against PoolingWrapper::forward, which is not visible here.
NodeOps forwardOps() {
return {NodeOp(pooling_.forward(child(0)->val(), val_))};
}
NodeOps backwardOps() {
return {NodeOp(pooling_.backward(
child(0)->val(),
child(0)->grad(),
val_,
adj_))};
return {NodeOp(
pooling_.backward(child(0)->val(), child(0)->grad(), val_, adj_))};
}
// Type label of this node; always the string "layer_pooling".
const std::string type() { return "layer_pooling"; }
protected:
PoolingWrapper pooling_;
};
class PoolingWithMaskingOp : public UnaryNodeOp {
public:
PoolingWithMaskingOp( Expr x, Expr mask, int width, bool isEven=false)
: UnaryNodeOp(x),
mask_(mask),
width_(width),
isEven_(isEven)
{
auto xShape = x->shape();
int dimBatch = xShape[0];
int dimWord = xShape[1];
int cols = (isEven_) ? xShape[2] - 1 : xShape[2];
int dimSentence = (cols / width_) + (cols % width_ != 0);
shape_ = {dimBatch, dimWord, dimSentence};
}
public:
PoolingWithMaskingOp(Expr x, Expr mask, int width, bool isEven = false)
: UnaryNodeOp(x), mask_(mask), width_(width), isEven_(isEven) {
auto xShape = x->shape();
int dimBatch = xShape[0];
int dimWord = xShape[1];
int cols = (isEven_) ? xShape[2] - 1 : xShape[2];
int dimSentence = (cols / width_) + (cols % width_ != 0);
shape_ = {dimBatch, dimWord, dimSentence};
}
NodeOps forwardOps() {
return {NodeOp(PoolingWithMaskingForward(val_,
NodeOps forwardOps() {
return {NodeOp(PoolingWithMaskingForward(
val_, child(0)->val(), mask_->val(), width_, isEven_))};
}
NodeOps backwardOps() {
return {NodeOp(PoolingWithMaskingBackward(adj_,
child(0)->grad(),
child(0)->val(),
mask_->val(),
width_,
isEven_))};
}
}
NodeOps backwardOps() {
return {NodeOp(PoolingWithMaskingBackward(adj_,
child(0)->grad(),
child(0)->val(),
mask_->val(),
width_,
isEven_))};
}
const std::string type() { return "layer_pooling"; }
const std::string type() {return "layer_pooling";}
protected:
Expr mask_;
int width_;
bool isEven_;
protected:
Expr mask_;
int width_;
bool isEven_;
};
}

View File

@ -23,18 +23,18 @@ struct LayerFactory : public Factory {
return as<Cast>() != nullptr;
}
virtual Ptr<Layer> construct() = 0;
virtual Ptr<Layer> construct() = 0;
};
class DenseFactory : public LayerFactory {
protected:
//std::vector<std::pair<std::string, std::string>> tiedParams_;
// std::vector<std::pair<std::string, std::string>> tiedParams_;
std::vector<std::pair<std::string, std::string>> tiedParamsTransposed_;
public:
DenseFactory(Ptr<ExpressionGraph> graph) : LayerFactory(graph) {}
//Accumulator<DenseFactory> tie(const std::string& param,
// Accumulator<DenseFactory> tie(const std::string& param,
// const std::string& tied) {
// tiedParams_.push_back({param, tied});
// return Accumulator<DenseFactory>(*this);
@ -48,21 +48,20 @@ public:
// Builds a Dense layer from graph_ and options_, passes every pair stored in
// tiedParamsTransposed_ to tie_transposed(), and returns the layer.
Ptr<Layer> construct() {
auto dense = New<Dense>(graph_, options_);
// Non-transposed tying is disabled: the loop below is commented out (it
// appears twice here, in its old and its reformatted spelling).
//for(auto& p : tiedParams_)
// for(auto& p : tiedParams_)
// dense->tie(p.first, p.second);
for(auto& p : tiedParamsTransposed_)
dense->tie_transposed(p.first, p.second);
return dense;
}
// Returns a new DenseFactory created on the same graph_, with this factory's
// options_ merged into it and tiedParamsTransposed_ copied over.
DenseFactory clone() {
DenseFactory aClone(graph_);
aClone.options_->merge(options_);
// Copying tiedParams_ is disabled (commented out; shown in its old and its
// reformatted spelling).
//aClone.tiedParams_ = tiedParams_;
// aClone.tiedParams_ = tiedParams_;
aClone.tiedParamsTransposed_ = tiedParamsTransposed_;
return aClone;
}
};
typedef Accumulator<DenseFactory> dense;
@ -95,7 +94,6 @@ public:
}
// Appends the given layer to layers_.
void push_back(Ptr<Layer> layer) { layers_.push_back(layer); }
};
class MLPFactory : public Factory {
@ -121,7 +119,7 @@ public:
layers_.push_back(New<LF>(lf));
return Accumulator<MLPFactory>(*this);
}
MLPFactory clone() {
MLPFactory aClone(graph_);
aClone.options_->merge(options_);
@ -129,7 +127,6 @@ public:
aClone.push_back(lf->clone());
return aClone;
}
};
typedef Accumulator<MLPFactory> mlp;

View File

@ -2,8 +2,7 @@
#include "graph/node_operators_binary.h"
namespace marian {
Convolution::Convolution(Ptr<ExpressionGraph> graph)
: Factory(graph) {}
Convolution::Convolution(Ptr<ExpressionGraph> graph) : Factory(graph) {}
Expr Convolution::apply(Expr x) {
auto prefix = opt<std::string>("prefix");
@ -13,28 +12,21 @@ Expr Convolution::apply(Expr x) {
auto strides = opt<std::pair<int, int>>("strides", std::make_pair(1, 1));
int layerIn = x->shape()[1];
auto kernel = graph_->param(prefix + "_conv_kernels",
{layerIn,
kernelNum,
kernelDims.first,
kernelDims.second},
inits::glorot_uniform);
auto kernel
= graph_->param(prefix + "_conv_kernels",
{layerIn, kernelNum, kernelDims.first, kernelDims.second},
inits::glorot_uniform);
auto bias = graph_->param(prefix + "_conv_bias",
{1, kernelNum, 1, 1},
inits::zeros);
auto bias = graph_->param(
prefix + "_conv_bias", {1, kernelNum, 1, 1}, inits::zeros);
std::vector<Expr> nodes = {x, kernel, bias};
return Expression<ConvolutionOp>(nodes,
paddings.first,
paddings.second,
strides.first,
strides.second);
return Expression<ConvolutionOp>(
nodes, paddings.first, paddings.second, strides.first, strides.second);
}
// Overload taking a list of inputs. Convolution over several inputs is not
// supported, so this calls ABORT with an explanatory message.
// NOTE(review): the trailing `return nullptr` presumably only satisfies the
// return type, assuming ABORT does not return — confirm against the ABORT
// macro definition, which is not visible here.
Expr Convolution::apply(const std::vector<Expr>&) {
ABORT("Can't apply convolution on many inputs at once");
return nullptr;
}
}

View File

@ -33,6 +33,7 @@ public:
template <class BaseFactory>
class Accumulator : public BaseFactory {
typedef BaseFactory Factory;
public:
Accumulator() : Factory(nullptr) {}
Accumulator(Ptr<ExpressionGraph> graph) : Factory(graph) {}

View File

@ -1,6 +1,7 @@
#pragma once
#include "marian.h"
#include "layers/factory.h"
namespace marian {
@ -75,11 +76,9 @@ public:
if(tiedParams_.count(nameW)) {
W = tiedParams_[nameW];
transposeW = true;
}
else {
W = g->param(name + "_" + nameW,
{in->shape()[-1], dim},
inits::glorot_uniform);
} else {
W = g->param(
name + "_" + nameW, {in->shape()[-1], dim}, inits::glorot_uniform);
}
Expr b;
@ -87,8 +86,7 @@ public:
if(tiedParams_.count(nameB))
b = tiedParams_[nameB];
else
b = g->param(
name + "_" + nameB, {1, dim}, inits::zeros);
b = g->param(name + "_" + nameB, {1, dim}, inits::zeros);
params_.push_back(W);
params_.push_back(b);
@ -98,19 +96,19 @@ public:
auto ln_s = g->param(name + "_ln_s" + std::to_string(i),
{1, dim},
inits::from_value(1.f));
auto ln_b = g->param(name + "_ln_b" + std::to_string(i),
{1, dim},
inits::zeros);
auto ln_b = g->param(
name + "_ln_b" + std::to_string(i), {1, dim}, inits::zeros);
outputs.push_back(
layer_norm(affine(in, W, b, false, transposeW), ln_s, ln_b, NEMATUS_LN_EPS));
outputs.push_back(layer_norm(
affine(in, W, b, false, transposeW), ln_s, ln_b, NEMATUS_LN_EPS));
} else {
auto gamma = g->param(name + "_gamma" + std::to_string(i),
{1, dim},
inits::from_value(1.0));
params_.push_back(gamma);
outputs.push_back(layer_norm(dot(in, W, false, transposeW), gamma, b));
outputs.push_back(
layer_norm(dot(in, W, false, transposeW), gamma, b));
}
} else {
@ -147,11 +145,9 @@ public:
if(tiedParams_.count(nameW)) {
transposeW = true;
W = tiedParams_[nameW];
}
else {
W = g->param(name + "_" + nameW,
{input->shape()[-1], dim},
inits::glorot_uniform);
} else {
W = g->param(
name + "_" + nameW, {input->shape()[-1], dim}, inits::glorot_uniform);
}
Expr b;
std::string nameB = "b";
@ -165,16 +161,14 @@ public:
Expr out;
if(layerNorm) {
if(nematusNorm) {
auto ln_s = g->param(
name + "_ln_s", {1, dim}, inits::from_value(1.f));
auto ln_b
= g->param(name + "_ln_b", {1, dim}, inits::zeros);
auto ln_s = g->param(name + "_ln_s", {1, dim}, inits::from_value(1.f));
auto ln_b = g->param(name + "_ln_b", {1, dim}, inits::zeros);
out = layer_norm(affine(input, W, b, false, transposeW),
ln_s, ln_b, NEMATUS_LN_EPS);
out = layer_norm(
affine(input, W, b, false, transposeW), ln_s, ln_b, NEMATUS_LN_EPS);
} else {
auto gamma = g->param(
name + "_gamma", {1, dim}, inits::from_value(1.0));
auto gamma
= g->param(name + "_gamma", {1, dim}, inits::from_value(1.0));
params_.push_back(gamma);
out = layer_norm(dot(input, W, false, transposeW), gamma, b);
@ -217,22 +211,18 @@ struct EmbeddingFactory : public Factory {
}
}
return graph_->param(name,
{dimVoc, dimEmb},
initFunc,
fixed);
return graph_->param(name, {dimVoc, dimEmb}, initFunc, fixed);
}
};
typedef Accumulator<EmbeddingFactory> embedding;
static inline
Expr Cost(Expr logits,
Expr indices,
Expr mask,
std::string costType = "cross-entropy",
float smoothing = 0,
Expr weights = nullptr) {
static inline Expr Cost(Expr logits,
Expr indices,
Expr mask,
std::string costType = "cross-entropy",
float smoothing = 0,
Expr weights = nullptr) {
using namespace keywords;
auto ce = cross_entropy(logits, indices);
@ -255,15 +245,17 @@ Expr Cost(Expr logits,
// axes:
// - time axis (words): -3
// - batch axis (sentences): -2
if(costType == "ce-mean" || costType == "cross-entropy") { // sum over words; average over sentences
if(costType == "ce-mean"
|| costType
== "cross-entropy") { // sum over words; average over sentences
cost = mean(costSum, axis = -2);
} else if(costType == "ce-mean-words") { // average over target tokens
} else if(costType == "ce-mean-words") { // average over target tokens
cost = sum(costSum, axis = -2) / sum(sum(mask, axis = -3), axis = -2);
} else if(costType == "ce-sum") { // sum over target tokens
} else if(costType == "ce-sum") { // sum over target tokens
cost = sum(costSum, axis = -2);
} else if(costType == "perplexity") { // ==exp('ce-mean-words')
} else if(costType == "perplexity") { // ==exp('ce-mean-words')
cost = exp(sum(costSum, axis = -2) / sum(sum(mask, axis = -3), axis = -2));
} else if(costType == "ce-rescore") { // sum over words, keep batch axis
} else if(costType == "ce-rescore") { // sum over words, keep batch axis
cost = -costSum;
} else { // same as ce-mean
cost = mean(costSum, axis = -2);

View File

@ -4,20 +4,18 @@
namespace marian {
static inline
Expr guidedAlignmentCost(Ptr<ExpressionGraph> graph,
Ptr<data::CorpusBatch> batch,
Ptr<Options> options,
Expr att) {
static inline Expr guidedAlignmentCost(Ptr<ExpressionGraph> graph,
Ptr<data::CorpusBatch> batch,
Ptr<Options> options,
Expr att) {
using namespace keywords;
int dimBatch = att->shape()[0];
int dimSrc = att->shape()[2];
int dimTrg = att->shape()[3];
auto aln = graph->constant(
{dimBatch, 1, dimSrc, dimTrg},
inits::from_vector(batch->getGuidedAlignment()));
auto aln = graph->constant({dimBatch, 1, dimSrc, dimTrg},
inits::from_vector(batch->getGuidedAlignment()));
std::string guidedCostType
= options->get<std::string>("guided-alignment-cost");

View File

@ -2,6 +2,7 @@
#include "marian.h"
#include "models/s2s.h"
#include "layers/convolution.h"
namespace marian {
@ -29,21 +30,18 @@ public:
int dimEmb = opt<int>("dim-emb");
auto convSizes = options_->get<std::vector<int>>("char-conv-filters-num");
auto convWidths = options_->get<std::vector<int>>("char-conv-filters-widths");
auto convWidths
= options_->get<std::vector<int>>("char-conv-filters-widths");
int stride = opt<int>("char-stride");
int highwayNum = opt<int>("char-highway");
auto conved = CharConvPooling(
prefix_ + "conv_pooling",
dimEmb,
convWidths,
convSizes,
stride)
(batchEmbeddings, batchMask);
prefix_ + "conv_pooling", dimEmb, convWidths, convSizes, stride)(
batchEmbeddings, batchMask);
auto inHighway = conved;
for (int i = 0; i < highwayNum; ++i) {
inHighway = highway(prefix_ +"_" + std::to_string(i), inHighway);
for(int i = 0; i < highwayNum; ++i) {
inHighway = highway(prefix_ + "_" + std::to_string(i), inHighway);
}
Expr stridedMask = getStridedMask(graph, batch, stride);
@ -52,24 +50,26 @@ public:
return New<EncoderState>(context, stridedMask, batch);
}
protected:
Expr getStridedMask(Ptr<ExpressionGraph> graph, Ptr<data::CorpusBatch> batch,
Expr getStridedMask(Ptr<ExpressionGraph> graph,
Ptr<data::CorpusBatch> batch,
int stride) {
auto subBatch = (*batch)[batchIndex_];
int dimBatch = subBatch->batchSize();
std::vector<float> strided;
for (size_t wordIdx = 0; wordIdx < subBatch->mask().size(); wordIdx += stride * dimBatch) {
for (size_t j = wordIdx; j < wordIdx + dimBatch; ++j) {
for(size_t wordIdx = 0; wordIdx < subBatch->mask().size();
wordIdx += stride * dimBatch) {
for(size_t j = wordIdx; j < wordIdx + dimBatch; ++j) {
strided.push_back(subBatch->mask()[j]);
}
}
int dimWords = strided.size() / dimBatch;
auto stridedMask = graph->constant({dimWords, dimBatch, 1},
inits::from_vector(strided));
auto stridedMask
= graph->constant({dimWords, dimBatch, 1}, inits::from_vector(strided));
return stridedMask;
}
};
}

View File

@ -1,6 +1,7 @@
#pragma once
#include "marian.h"
#include "layers/generic.h"
#include "layers/guided_alignment.h"
#include "model_base.h"
@ -30,8 +31,8 @@ protected:
auto batchEmbeddings
= reshape(chosenEmbeddings, {dimWords, dimBatch, dimEmb});
auto batchMask = graph->constant(
{dimWords, dimBatch, 1}, inits::from_vector(subBatch->mask()));
auto batchMask = graph->constant({dimWords, dimBatch, 1},
inits::from_vector(subBatch->mask()));
return std::make_tuple(batchEmbeddings, batchMask);
}
@ -129,7 +130,8 @@ public:
virtual void selectEmbeddings(Ptr<ExpressionGraph> graph,
Ptr<DecoderState> state,
const std::vector<size_t>& embIdx,
int dimBatch, int dimBeam) {
int dimBatch,
int dimBeam) {
using namespace keywords;
int dimTrgEmb = opt<int>("dim-emb");
@ -149,12 +151,10 @@ public:
Expr selectedEmbs;
if(embIdx.empty()) {
selectedEmbs = graph->constant({1, 1, dimBatch, dimTrgEmb},
inits::zeros);
selectedEmbs = graph->constant({1, 1, dimBatch, dimTrgEmb}, inits::zeros);
} else {
selectedEmbs = rows(yEmb, embIdx);
selectedEmbs
= reshape(selectedEmbs, {dimBeam, 1, dimBatch, dimTrgEmb});
selectedEmbs = reshape(selectedEmbs, {dimBeam, 1, dimBatch, dimTrgEmb});
}
state->setTargetEmbeddings(selectedEmbs);
}
@ -174,14 +174,16 @@ public:
virtual void selectEmbeddings(Ptr<ExpressionGraph> graph,
Ptr<DecoderState> state,
const std::vector<size_t>&,
int dimBatch, int beamSize)
int dimBatch,
int beamSize)
= 0;
virtual Ptr<DecoderState> step(Ptr<ExpressionGraph> graph,
Ptr<DecoderState>,
const std::vector<size_t>&,
const std::vector<size_t>&,
int dimBatch, int beamSize)
int dimBatch,
int beamSize)
= 0;
virtual Ptr<DecoderState> step(Ptr<ExpressionGraph>, Ptr<DecoderState>) = 0;
@ -224,7 +226,8 @@ protected:
decoder["mini-batch"] = opt<size_t>("valid-mini-batch");
decoder["maxi-batch"] = opt<size_t>("valid-mini-batch") > 1 ? 100 : 1;
decoder["maxi-batch-sort"] = opt<size_t>("valid-mini-batch") > 1 ? "trg" : "none";
decoder["maxi-batch-sort"]
= opt<size_t>("valid-mini-batch") > 1 ? "trg" : "none";
decoder["relative-paths"] = false;
@ -320,8 +323,10 @@ public:
Ptr<DecoderState> state,
const std::vector<size_t>& hypIndices,
const std::vector<size_t>& embIndices,
int dimBatch, int beamSize) {
auto selectedState = hypIndices.empty() ? state : state->select(hypIndices, beamSize);
int dimBatch,
int beamSize) {
auto selectedState
= hypIndices.empty() ? state : state->select(hypIndices, beamSize);
selectEmbeddings(graph, selectedState, embIndices, dimBatch, beamSize);
selectedState->setSingleStep(true);
auto nextState = step(graph, selectedState);
@ -332,7 +337,8 @@ public:
virtual void selectEmbeddings(Ptr<ExpressionGraph> graph,
Ptr<DecoderState> state,
const std::vector<size_t>& embIdx,
int dimBatch, int beamSize) {
int dimBatch,
int beamSize) {
decoders_[0]->selectEmbeddings(graph, state, embIdx, dimBatch, beamSize);
}
@ -366,17 +372,12 @@ public:
int dimBatch = batch->size();
int dimWords = sentenceWeighting ? 1 : batch->back()->batchWidth();
weights = graph->constant(
{1, dimWords, dimBatch, 1},
inits::from_vector(batch->getDataWeights()));
weights = graph->constant({1, dimWords, dimBatch, 1},
inits::from_vector(batch->getDataWeights()));
}
auto cost = Cost(nextState->getProbs(),
trgIdx,
trgMask,
costType,
ls,
weights);
auto cost
= Cost(nextState->getProbs(), trgIdx, trgMask, costType, ls, weights);
if(options_->has("guided-alignment") && !inference_) {
auto alignments = decoders_[0]->getAlignments();
@ -428,7 +429,7 @@ public:
do {
size_t current = (start + end) / 2;
//std::cerr << i << " " << current << std::endl;
// std::cerr << i << " " << current << std::endl;
auto batch = data::CorpusBatch::fakeBatch(lengths, current, options_);
build(graph, batch);
fits = graph->fits();
@ -436,8 +437,7 @@ public:
if(fits) {
stats->add(batch, multiplier);
start = current + 1;
}
else {
} else {
end = current - 1;
}
} while(end - start > step);

View File

@ -1,9 +1,10 @@
#pragma once
#include "marian.h"
#include "layers/generic.h"
#include "rnn/types.h"
#include "rnn/attention_constructors.h"
#include "rnn/types.h"
#include <numeric>
@ -21,13 +22,16 @@ public:
: DecoderState(states, probs, encStates),
attentionIndices_(attentionIndices) {}
virtual Ptr<DecoderState> select(const std::vector<size_t>& selIdx, int beamSize) {
virtual Ptr<DecoderState> select(const std::vector<size_t>& selIdx,
int beamSize) {
std::vector<size_t> selectedAttentionIndices;
for(auto i : selIdx)
selectedAttentionIndices.push_back(attentionIndices_[i]);
return New<DecoderStateHardAtt>(
states_.select(selIdx, beamSize), probs_, encStates_, selectedAttentionIndices);
return New<DecoderStateHardAtt>(states_.select(selIdx, beamSize),
probs_,
encStates_,
selectedAttentionIndices);
}
virtual void setAttentionIndices(
@ -85,11 +89,11 @@ public:
Expr start;
if(!meanContexts.empty()) {
// apply single layer network to mean to map into decoder space
auto mlp = mlp::mlp(graph) //
.push_back(mlp::dense(graph) //
("prefix", prefix_ + "_ff_state") //
("dim", opt<int>("dim-rnn")) //
("activation", (int)mlp::act::tanh)//
auto mlp = mlp::mlp(graph) //
.push_back(mlp::dense(graph) //
("prefix", prefix_ + "_ff_state") //
("dim", opt<int>("dim-rnn")) //
("activation", (int)mlp::act::tanh) //
("layer-normalization",
opt<bool>("layer-normalization")));
start = mlp->apply(meanContexts);
@ -133,7 +137,8 @@ public:
int dimBeam = trgEmbeddings->shape()[-4];
if(dropoutTrg) {
trgEmbeddings = dropout(trgEmbeddings, dropoutTrg, {dimTrgWords, dimBatch, 1});
trgEmbeddings
= dropout(trgEmbeddings, dropoutTrg, {dimTrgWords, dimBatch, 1});
}
auto flatContext = reshape(context, {dimBatch * dimSrcWords, dimContext});

View File

@ -1,7 +1,7 @@
#pragma once
#include "marian.h"
#include <string>
#include "marian.h"
namespace marian {
namespace models {

View File

@ -1,12 +1,13 @@
#include "marian.h"
#include "models/model_factory.h"
#include "models/amun.h"
#include "models/encdec.h"
#include "models/hardatt.h"
#include "models/nematus.h"
#include "models/s2s.h"
#include "models/transformer.h"
#include "models/hardatt.h"
#include "models/amun.h"
#include "models/nematus.h"
#include "models/encdec.h"
#ifdef CUDNN
#include "models/char_s2s.h"

View File

@ -1,8 +1,9 @@
#pragma once
#include "marian.h"
#include "layers/factory.h"
#include "encdec.h"
#include "layers/factory.h"
namespace marian {

View File

@ -1,9 +1,10 @@
#pragma once
#include "marian.h"
#include "layers/constructors.h"
#include "rnn/constructors.h"
#include "rnn/attention_constructors.h"
#include "rnn/constructors.h"
namespace marian {

View File

@ -48,8 +48,10 @@ public:
// Returns the stored probs_ expression.
virtual Expr getProbs() { return probs_; }
// Replaces the stored probs_ expression with the given one.
virtual void setProbs(Expr probs) { probs_ = probs; }
virtual Ptr<DecoderState> select(const std::vector<size_t>& selIdx, int beamSize) {
return New<DecoderState>(states_.select(selIdx, beamSize), probs_, encStates_);
virtual Ptr<DecoderState> select(const std::vector<size_t>& selIdx,
int beamSize) {
return New<DecoderState>(
states_.select(selIdx, beamSize), probs_, encStates_);
}
// Returns a const reference to the stored states_ (no copy is made).
virtual const rnn::States& getStates() { return states_; }

View File

@ -1,11 +1,12 @@
#pragma once
#include "marian.h"
#include "layers/factory.h"
#include "encdec.h"
#include "layers/constructors.h"
#include "layers/factory.h"
#include "model_base.h"
#include "model_factory.h"
#include "encdec.h"
namespace marian {
@ -34,8 +35,8 @@ public:
}
// shared across batch entries
auto signal = graph->constant({dimWords, 1, dimEmb},
inits::from_vector(vPos));
auto signal
= graph->constant({dimWords, 1, dimEmb}, inits::from_vector(vPos));
return input + signal;
}
@ -47,15 +48,14 @@ public:
for(int i = 0; i < length; ++i)
for(int j = 0; j <= i; ++j)
vMask[i * length + j] = 1.f;
return graph->constant({1, length, length},
inits::from_vector(vMask));
return graph->constant({1, length, length}, inits::from_vector(vMask));
}
Expr InverseMask(Expr mask) {
// convert 0/1 mask to transformer style -inf mask
auto ms = mask->shape();
mask = (1 - mask) * -99999999.f;
return reshape(mask, {ms[-3], 1, ms[-2], ms[-1]}) ;
return reshape(mask, {ms[-3], 1, ms[-2], ms[-1]});
}
Expr SplitHeads(Expr input, int dimHeads) {
@ -135,18 +135,17 @@ public:
if(op == 'h') {
auto Wh = graph->param(
prefix + "_Wh", {dimModel, dimModel}, inits::glorot_uniform);
auto bh
= graph->param(prefix + "_bh", {1, dimModel}, inits::zeros);
auto bh = graph->param(prefix + "_bh", {1, dimModel}, inits::zeros);
auto t = affine(prevInput, Wh, bh);
output = highway(output, prevInput, t);
}
// layer normalization
if(op == 'n') {
auto scale = graph->param(
prefix + "_ln_scale", {1, dimModel}, inits::ones);
auto bias = graph->param(
prefix + "_ln_bias", {1, dimModel}, inits::zeros);
auto scale
= graph->param(prefix + "_ln_scale", {1, dimModel}, inits::ones);
auto bias
= graph->param(prefix + "_ln_bias", {1, dimModel}, inits::zeros);
output = layer_norm(output, scale, bias, 1e-6);
}
}
@ -219,17 +218,13 @@ public:
if(i > 0)
prefixProj += "_enc" + std::to_string(i + 1);
auto Wk = graph->param(prefixProj + "_Wk",
{dimModel, dimModel},
inits::glorot_uniform);
auto bk = graph->param(
prefixProj + "_bk", {1, dimModel}, inits::zeros);
auto Wk = graph->param(
prefixProj + "_Wk", {dimModel, dimModel}, inits::glorot_uniform);
auto bk = graph->param(prefixProj + "_bk", {1, dimModel}, inits::zeros);
auto Wv = graph->param(prefixProj + "_Wv",
{dimModel, dimModel},
inits::glorot_uniform);
auto bv = graph->param(
prefixProj + "_bv", {1, dimModel}, inits::zeros);
auto Wv = graph->param(
prefixProj + "_Wv", {dimModel, dimModel}, inits::glorot_uniform);
auto bv = graph->param(prefixProj + "_bv", {1, dimModel}, inits::zeros);
auto kh = affine(keys[i], Wk, bk);
auto vh = affine(values[i], Wv, bv);
@ -254,8 +249,8 @@ public:
int dimAtt = output->shape()[-1];
auto Wo = graph->param(
prefix + "_Wo", {dimAtt, dimOut}, inits::glorot_uniform);
auto Wo
= graph->param(prefix + "_Wo", {dimAtt, dimOut}, inits::glorot_uniform);
auto bo = graph->param(prefix + "_bo", {1, dimOut}, inits::zeros);
output = affine(output, Wo, bo);
@ -468,11 +463,12 @@ public:
std::vector<Ptr<EncoderState>> &encStates)
: DecoderState(states, probs, encStates) {}
virtual Ptr<DecoderState> select(const std::vector<size_t> &selIdx, int beamSize) {
virtual Ptr<DecoderState> select(const std::vector<size_t> &selIdx,
int beamSize) {
rnn::States selectedStates;
int dimDepth = states_[0].output->shape()[-1];
int dimTime = states_[0].output->shape()[-2];
int dimTime = states_[0].output->shape()[-2];
int dimBatch = selIdx.size() / beamSize;
std::vector<size_t> selIdx2;
@ -553,7 +549,7 @@ public:
decoderMask = reshape(TransposeTimeBatch(decoderMask),
{1, dimBatch, 1, dimTrgWords});
selfMask = selfMask * decoderMask;
//if(dimBeam > 1)
// if(dimBeam > 1)
// selfMask = repeat(selfMask, dimBeam, axis = -4);
}
@ -586,7 +582,8 @@ public:
for(int i = 1; i <= opt<int>("dec-depth"); ++i) {
auto values = query;
if(prevDecoderStates.size() > 0)
values = concatenate({prevDecoderStates[i - 1].output, query}, axis = -2);
values
= concatenate({prevDecoderStates[i - 1].output, query}, axis = -2);
decoderStates.push_back({values, nullptr});

View File

@ -15,5 +15,4 @@ void Norm::clip(Tensor t) {
if(l2Norm >= c_)
Element(_1 = (c_ / l2Norm) * _1, t);
}
}

View File

@ -58,7 +58,8 @@ void Adagrad::load(const std::string& name,
// extract data into vectors
if(name == "adagrad_gt") {
vGt.resize(totalSize);
std::copy((float*)np->data(), (float*)np->data() + totalSize, vGt.begin());
std::copy(
(float*)np->data(), (float*)np->data() + totalSize, vGt.begin());
}
}
@ -178,11 +179,13 @@ void Adam::load(const std::string& name,
// extract data into vectors
if(name == "adam_mt") {
vMt.resize(totalSize);
std::copy((float*)np->data(), (float*)np->data() + totalSize, vMt.begin());
std::copy(
(float*)np->data(), (float*)np->data() + totalSize, vMt.begin());
}
if(name == "adam_vt") {
vVt.resize(totalSize);
std::copy((float*)np->data(), (float*)np->data() + totalSize, vVt.begin());
std::copy(
(float*)np->data(), (float*)np->data() + totalSize, vVt.begin());
}
}

View File

@ -1,8 +1,8 @@
#pragma once
#include <algorithm>
#include <map>
#include <memory>
#include <algorithm>
#include "common/config.h"
#include "graph/expression_graph.h"

View File

@ -42,9 +42,11 @@ private:
public:
Rescore(Ptr<Config> options)
: options_(options),
corpus_(options_->get<bool>("n-best") ?
std::static_pointer_cast<CorpusBase>(New<CorpusNBest>(options_)) :
std::static_pointer_cast<CorpusBase>(New<Corpus>(options_))) {
corpus_(
options_->get<bool>("n-best")
? std::static_pointer_cast<CorpusBase>(
New<CorpusNBest>(options_))
: std::static_pointer_cast<CorpusBase>(New<Corpus>(options_))) {
corpus_->prepare();
auto devices = options_->getDevices();
@ -66,12 +68,12 @@ public:
models_.resize(graphs_.size());
ThreadPool pool(graphs_.size(), graphs_.size());
for(int i = 0; i < graphs_.size(); ++i) {
pool.enqueue([=](int j) {
models_[j] = New<Model>(temp);
models_[j]->load(graphs_[j], modelFile);
}, i);
pool.enqueue(
[=](int j) {
models_[j] = New<Model>(temp);
models_[j]->load(graphs_[j], modelFile);
},
i);
}
}
@ -81,9 +83,10 @@ public:
auto batchGenerator = New<BatchGenerator<CorpusBase>>(corpus_, options_);
batchGenerator->prepare(false);
Ptr<ScoreCollector> output = options_->get<bool>("n-best") ?
std::static_pointer_cast<ScoreCollector>(New<ScoreCollectorNBest>(options_)) :
New<ScoreCollector>();
Ptr<ScoreCollector> output = options_->get<bool>("n-best")
? std::static_pointer_cast<ScoreCollector>(
New<ScoreCollectorNBest>(options_))
: New<ScoreCollector>();
bool summarize = options_->has("summary");
std::string summary

View File

@ -13,8 +13,7 @@ namespace marian {
class ScoreCollector {
public:
ScoreCollector()
: nextId_(0), outStrm_(new OutputFileStream(std::cout)) {};
ScoreCollector() : nextId_(0), outStrm_(new OutputFileStream(std::cout)){};
virtual void Write(long id, const std::string& message) {
boost::mutex::scoped_lock lock(mutex_);
@ -52,9 +51,7 @@ public:
}
}
virtual void Write(long id, float value) {
Write(id, std::to_string(value));
}
virtual void Write(long id, float value) { Write(id, std::to_string(value)); }
protected:
long nextId_{0};
@ -86,7 +83,9 @@ public:
ScoreCollectorNBest(const ScoreCollectorNBest&) = delete;
std::string addToNBest(const std::string nbest, const std::string feature, float score) {
std::string addToNBest(const std::string nbest,
const std::string feature,
float score) {
std::vector<std::string> fields;
Split(nbest, fields, "|||");
std::stringstream ss;
@ -96,13 +95,15 @@ public:
}
virtual void Write(long id, float score) {
std::string line;
{
boost::mutex::scoped_lock lock(mutex_);
auto iter = buffer_.find(id);
if(iter == buffer_.end()) {
ABORT_IF(lastRead_ >= id, "Entry {} < {} already read but not in buffer", id, lastRead_);
ABORT_IF(lastRead_ >= id,
"Entry {} < {} already read but not in buffer",
id,
lastRead_);
std::string line;
while(lastRead_ < id && std::getline((std::istream&)*file_, line)) {
lastRead_++;
@ -116,6 +117,5 @@ public:
ScoreCollector::Write(id, addToNBest(line, fname_, score));
}
};
}

View File

@ -1,10 +1,11 @@
#pragma once
#include "marian.h"
#include "layers/factory.h"
#include "rnn/types.h"
#include "rnn/constructors.h"
#include "rnn/attention.h"
#include "rnn/constructors.h"
#include "rnn/types.h"
namespace marian {
namespace rnn {

View File

@ -52,8 +52,7 @@ Expr gruOps(const std::vector<Expr>& nodes, bool final) {
/******************************************************************************/
struct LSTMCellNodeOp : public NaryNodeOp {
LSTMCellNodeOp(const std::vector<Expr>& nodes)
: NaryNodeOp(nodes) {}
LSTMCellNodeOp(const std::vector<Expr>& nodes) : NaryNodeOp(nodes) {}
NodeOps forwardOps() {
std::vector<Tensor> inputs;
@ -89,8 +88,7 @@ struct LSTMCellNodeOp : public NaryNodeOp {
};
struct LSTMOutputNodeOp : public NaryNodeOp {
LSTMOutputNodeOp(const std::vector<Expr>& nodes)
: NaryNodeOp(nodes) {}
LSTMOutputNodeOp(const std::vector<Expr>& nodes) : NaryNodeOp(nodes) {}
NodeOps forwardOps() {
std::vector<Tensor> inputs;

View File

@ -43,8 +43,7 @@ public:
{dimInput, dimState},
inits::glorot_uniform);
b_ = graph->param(
prefix + "_b", {1, dimState}, inits::zeros);
b_ = graph->param(prefix + "_b", {1, dimState}, inits::zeros);
if(dropout_ > 0.0f) {
if(dimInput)
@ -520,8 +519,7 @@ public:
{dimInput, 4 * dimState},
inits::glorot_uniform);
b_ = graph->param(
prefix + "_b", {1, 4 * dimState}, inits::zeros);
b_ = graph->param(prefix + "_b", {1, 4 * dimState}, inits::zeros);
if(dropout_ > 0.0f) {
if(dimInput)

View File

@ -1,7 +1,7 @@
#pragma once
#include "marian.h"
#include "layers/factory.h"
#include "marian.h"
#include "rnn/rnn.h"
namespace marian {

View File

@ -1,9 +1,9 @@
#pragma once
#include "marian.h"
#include "layers/generic.h"
#include "rnn/types.h"
#include "marian.h"
#include "rnn/cells.h"
#include "rnn/types.h"
#include <algorithm>
#include <chrono>
@ -74,10 +74,9 @@ private:
j = timeSteps - i - 1;
std::vector<Expr> steps(xWs.size());
std::transform(xWs.begin(),
xWs.end(),
steps.begin(),
[j](Expr e) { return step(e, j, -3); });
std::transform(xWs.begin(), xWs.end(), steps.begin(), [j](Expr e) {
return step(e, j, -3);
});
if(mask)
state = cell_->applyState(steps, state, step(mask, j, -3));

View File

@ -9,8 +9,8 @@
#include <vector>
#include "common/definitions.h"
#include "tensors/memory_piece.h"
#include "tensors/device.h"
#include "tensors/memory_piece.h"
namespace marian {
@ -92,8 +92,8 @@ private:
gaps_.swap(oldGaps);
for(auto gap : oldGaps)
gaps_.insert(
Gap(device_->data() + std::distance(oldData, gap.data()), gap.size()));
gaps_.insert(Gap(device_->data() + std::distance(oldData, gap.data()),
gap.size()));
insertGap(Gap(device_->data() + oldSize, add));
std::unordered_map<uint8_t*, Ptr<MemoryPiece>> oldAllocated;
@ -141,7 +141,10 @@ private:
}
public:
Allocator(DeviceId deviceId, size_t bytes, size_t step, size_t alignment = 256)
Allocator(DeviceId deviceId,
size_t bytes,
size_t step,
size_t alignment = 256)
: device_(DispatchDevice(deviceId, alignment)),
step_(step),
available_(0),

View File

@ -16,5 +16,4 @@ Ptr<Backend> BackendByDevice(DeviceId deviceId, size_t seed) {
#endif
return New<cpu::Backend>(deviceId, seed);
}
}

View File

@ -10,8 +10,7 @@ protected:
size_t seed_;
public:
Backend(DeviceId deviceId, size_t seed)
: deviceId_(deviceId), seed_(seed) {}
Backend(DeviceId deviceId, size_t seed) : deviceId_(deviceId), seed_(seed) {}
// Returns deviceId_, the device id passed to the constructor.
virtual DeviceId getDevice() { return deviceId_; };
virtual void setDevice() = 0;
@ -19,5 +18,4 @@ public:
};
Ptr<Backend> BackendByDevice(DeviceId deviceId, size_t seed);
}

View File

@ -5,24 +5,22 @@
#pragma once
#include "tensors/tensor.h"
#include "functional/functional.h"
#include "functional/shape.h"
#include "functional/tmp.h"
#include "functional/tensor.h"
#include "functional/tmp.h"
#include "tensors/tensor.h"
namespace marian {
namespace cpu {
template <size_t K, class Functor>
void gAddGeneric(Functor functor,
const functional::Shape full,
functional::Tensor<float> out,
functional::Array<functional::Tensor<float>, K> ins,
float scale = 1.0) {
int outLength = out.shape().elements();
bool same = outLength == full.elements();
for(int i = 0; i < K; ++i)
@ -73,7 +71,6 @@ void gAddReduce(Functor functor,
functional::Tensor<float> out,
functional::Array<functional::Tensor<float>, K> ins,
float scale = 1.0) {
int rows = full.elements() / full.back();
int cols = full.back();
@ -100,12 +97,8 @@ void gAddReduce(Functor functor,
}
}
template <class Functor, class ...Tensors>
void Add(Functor functor,
float scale,
marian::Tensor out,
Tensors... tensors) {
template <class Functor, class... Tensors>
void Add(Functor functor, float scale, marian::Tensor out, Tensors... tensors) {
auto full = marian::Shape::broadcast({out, tensors...});
int length = out->shape().elements();
@ -113,7 +106,7 @@ void Add(Functor functor,
constexpr size_t K = sizeof...(Tensors);
functional::Tensor<float> gOut = out;
functional::Array<functional::Tensor<float>, K> gIns = {tensors ...};
functional::Array<functional::Tensor<float>, K> gIns = {tensors...};
if(full.back() != 1 && out->shape().back() == 1) {
size_t m = full.elements() / length;
@ -128,8 +121,5 @@ void Add(Functor functor,
cpu::gAddGeneric(functor, full, gOut, gIns, scale);
}
}
}
}

View File

@ -15,17 +15,13 @@ private:
public:
Backend(DeviceId deviceId, size_t seed)
: marian::Backend(deviceId, seed),
gen_(seed_) {}
: marian::Backend(deviceId, seed), gen_(seed_) {}
void setDevice() { }
void setDevice() {}
void synchronize() {}
std::default_random_engine& getRandomGenerator() {
return gen_;
}
std::default_random_engine& getRandomGenerator() { return gen_; }
};
}
}

View File

@ -1,31 +1,31 @@
#include <iostream>
#include "tensors/device.h"
#include <iostream>
#include <stdlib.h>
namespace marian {
namespace cpu {
Device::~Device() {
Device::~Device() {
free(data_);
data_ = nullptr;
size_ = 0;
}
void Device::reserve(size_t size) {
size = align(size);
ABORT_IF(size < size_ || size == 0,
"New size must be larger than old size and larger than 0");
if(data_) {
uint8_t *temp = static_cast<uint8_t *>(aligned_alloc(alignment_, size));
std::copy(data_, data_ + size_, temp);
free(data_);
data_ = nullptr;
size_ = 0;
data_ = temp;
} else {
data_ = static_cast<uint8_t *>(aligned_alloc(alignment_, size));
}
void Device::reserve(size_t size) {
size = align(size);
ABORT_IF(size < size_ || size == 0, "New size must be larger than old size and larger than 0");
if(data_) {
uint8_t *temp = static_cast<uint8_t*>(aligned_alloc(alignment_, size));
std::copy(data_, data_ + size_, temp);
free(data_);
data_ = temp;
} else {
data_ = static_cast<uint8_t*>(aligned_alloc(alignment_, size));
}
size_ = size;
}
size_ = size;
}
}
}

View File

@ -1,19 +1,20 @@
#include <algorithm>
#include <random>
#include "tensors/tensor_operators.h"
#include "tensors/cpu/backend.h"
#include "tensors/tensor_operators.h"
namespace marian {
namespace cpu {
namespace cpu {
void Dropout(Tensor tensor, float p) {
auto cpuBackend = std::static_pointer_cast<cpu::Backend>(tensor->getBackend());
auto &gen = cpuBackend->getRandomGenerator();
std::bernoulli_distribution dist(1.f - p);
std::generate(tensor->data(), tensor->data() + tensor->size(),
[&]() { return dist(gen) / (1.f - p); });
}
}
void Dropout(Tensor tensor, float p) {
auto cpuBackend
= std::static_pointer_cast<cpu::Backend>(tensor->getBackend());
auto &gen = cpuBackend->getRandomGenerator();
std::bernoulli_distribution dist(1.f - p);
std::generate(tensor->data(), tensor->data() + tensor->size(), [&]() {
return dist(gen) / (1.f - p);
});
}
}
}

View File

@ -13,12 +13,11 @@ namespace cpu {
template <size_t K, bool broadcast, class Functor>
void gElement(Functor functor,
functional::Array<functional::Tensor<float>, K> tensors) {
int length = tensors[0].shape().elements();
functional::Array<int, functional::Shape::size()> dims;
functional::Array<int, K> indices;
#pragma omp parallel for simd
#pragma omp parallel for simd
for(int index = 0; index < length; ++index) {
indices.fill(index);
if(broadcast) {
@ -30,8 +29,8 @@ void gElement(Functor functor,
}
}
template <class Functor, class ...Tensors>
void Element(Functor functor, marian::Tensor out, Tensors ...tensors) {
template <class Functor, class... Tensors>
void Element(Functor functor, marian::Tensor out, Tensors... tensors) {
constexpr size_t K = sizeof...(tensors) + 1;
functional::Array<functional::Tensor<float>, K> gTensors = {out, tensors...};
@ -46,6 +45,5 @@ void Element(Functor functor, marian::Tensor out, Tensors ...tensors) {
else
cpu::gElement<K, false>(functor, gTensors);
}
}
}

View File

@ -1,15 +1,15 @@
/* All or part of this file was contributed by Intel under license:
* Copyright (C) 2017-2018 Intel Corporation
* SPDX-License-Identifier: MIT
*/
/* All or part of this file was contributed by Intel under license:
* Copyright (C) 2017-2018 Intel Corporation
* SPDX-License-Identifier: MIT
*/
#include "tensors/tensor.h"
#include "tensors/cpu/backend.h"
#include "tensors/tensor.h"
#if MKL_FOUND
#include <mkl.h>
#else
#if BLAS_FOUND
#if BLAS_FOUND
#include <cblas.h>
#endif
#endif
@ -25,7 +25,6 @@ void Prod(marian::Tensor C,
bool transB,
float beta,
float scalar) {
#if BLAS_FOUND
float alpha = scalar;
@ -46,19 +45,20 @@ void Prod(marian::Tensor C,
if(transB)
ldc = B->shape().elements() / B->shape()[-1];
cblas_sgemm(
CblasColMajor,
transB ? CblasTrans : CblasNoTrans,
transA ? CblasTrans : CblasNoTrans,
n, m, k,
alpha,
B->data(),
ldb,
A->data(),
lda,
beta,
C->data(),
ldc);
cblas_sgemm(CblasColMajor,
transB ? CblasTrans : CblasNoTrans,
transA ? CblasTrans : CblasNoTrans,
n,
m,
k,
alpha,
B->data(),
ldb,
A->data(),
lda,
beta,
C->data(),
ldc);
#else
ABORT("Not implemented!");
#endif
@ -73,7 +73,7 @@ void ProdBatched(marian::Tensor C,
float scalar) {
#if BLAS_FOUND
float alpha = scalar;
size_t batchA = A->shape().elements() / (A->shape()[-1] * A->shape()[-2]);
size_t batchB = B->shape().elements() / (B->shape()[-1] * B->shape()[-2]);
@ -95,33 +95,34 @@ void ProdBatched(marian::Tensor C,
ldc = B->shape()[-2];
auto opA = transA ? CblasTrans : CblasNoTrans;
auto opB = transB ? CblasTrans : CblasNoTrans;
auto opB = transB ? CblasTrans : CblasNoTrans;
auto strideB = batchB == 1 ? 0 : n * k;
auto strideA = batchA == 1 ? 0 : m * k;
auto strideC = n * m;
int steps = std::max(batchA, batchB);
int offsetA = 0;
int offsetB = 0;
int offsetC = 0;
for(int i = 0; i < steps; ++i) {
cblas_sgemm(
CblasColMajor,
opB,
opA,
n, m, k,
alpha,
B->data() + offsetB,
ldb,
A->data() + offsetA,
lda,
beta,
C->data() + offsetC,
ldc);
cblas_sgemm(CblasColMajor,
opB,
opA,
n,
m,
k,
alpha,
B->data() + offsetB,
ldb,
A->data() + offsetA,
lda,
beta,
C->data() + offsetC,
ldc);
offsetA += strideA;
offsetB += strideB;
offsetC += strideC;
@ -130,6 +131,5 @@ void ProdBatched(marian::Tensor C,
ABORT("Not implemented!");
#endif
}
}
}

View File

@ -44,13 +44,13 @@ void ConcatCont(Tensor out, const std::vector<Tensor>& inputs, int axis) {
}
inline void gInsertCols(float* out,
const float* in,
size_t rows,
size_t cols,
size_t cols_out,
size_t cols_in,
size_t offset_out,
size_t offset_in) {
const float* in,
size_t rows,
size_t cols,
size_t cols_out,
size_t cols_in,
size_t offset_out,
size_t offset_in) {
for(int j = 0; j < rows; ++j) {
float* rowOut = out + j * cols_out + offset_out;
const float* rowIn = in + j * cols_in + offset_in;
@ -68,9 +68,10 @@ void Concatenate1(Tensor out, const std::vector<Tensor>& inputs) {
for(auto in : inputs) {
ABORT_IF(rows != in->shape().elements() / in->shape().back(),
"First dimension must be equal");
"First dimension must be equal");
int cols_in = in->shape().back();
cpu::gInsertCols(out->data(), in->data(), rows, cols_in, cols_out, cols_in, offset, 0);
cpu::gInsertCols(
out->data(), in->data(), rows, cols_in, cols_out, cols_in, offset, 0);
offset += cols_in;
}
}
@ -88,11 +89,10 @@ void Split1(std::vector<Tensor>& outputs, const Tensor in) {
int cols_in = in->shape().back();
for(auto out : outputs) {
ABORT_IF(rows != out->shape().elements() / out->shape().back(),
"First dimension must be equal");
"First dimension must be equal");
int cols_out = out->shape().back();
cpu::gInsertCols(out->data(), in->data(),
rows, cols_out, cols_out, cols_in,
0, offset);
cpu::gInsertCols(
out->data(), in->data(), rows, cols_out, cols_out, cols_in, 0, offset);
offset += cols_out;
}
}
@ -158,24 +158,24 @@ void Softmax(Tensor out_, Tensor in_, Tensor mask_) {
int rows = out_->shape().elements() / out_->shape().back();
int cols = out_->shape().back();
for (int j = 0; j < rows; ++j) {
float* so = out + j*cols;
const float* sp = in + j*cols;
const float* mp = mask ? mask + j*cols : nullptr;
for(int j = 0; j < rows; ++j) {
float* so = out + j * cols;
const float* sp = in + j * cols;
const float* mp = mask ? mask + j * cols : nullptr;
float max = sp[0];
for (int i = 1; i < cols; ++i) {
for(int i = 1; i < cols; ++i) {
max = std::max(max, sp[i]);
}
float sum = 0.f;
for (int i = 0; i < cols; ++i) {
for(int i = 0; i < cols; ++i) {
float ex = !mask || mp[i] ? std::exp(sp[i] - max) : 0.f;
so[i] = ex;
sum += ex;
}
for (int i = 0; i < cols; ++i) {
for(int i = 0; i < cols; ++i) {
so[i] /= sum;
}
}
@ -188,24 +188,24 @@ void LogSoftmax(Tensor out_, Tensor in_) {
int rows = out_->shape().elements() / out_->shape().back();
int cols = out_->shape().back();
for (int j = 0; j < rows; ++j) {
for(int j = 0; j < rows; ++j) {
float* so = out + j * cols;
const float* sp = in + j*cols;
const float* sp = in + j * cols;
float max = sp[0];
for (int i = 1; i < cols; ++i) {
for(int i = 1; i < cols; ++i) {
max = std::max(max, sp[i]);
}
float sum = 0.f;
for (int i = 0; i < cols; ++i) {
for(int i = 0; i < cols; ++i) {
float sm = sp[i] - max;
float ex = std::exp(sm);
so[i] = sm;
sum += ex;
}
for (int i = 0; i < cols; ++i) {
for(int i = 0; i < cols; ++i) {
so[i] -= std::log(sum);
}
}
@ -219,17 +219,17 @@ void SoftmaxGrad(Tensor grad_, Tensor adj_, Tensor val_) {
const float* adj = adj_->data();
const float* val = val_->data();
for (size_t j = 0; j < rows; ++j) {
float* gradRow = grad + j*cols;
const float* adjRow = adj + j*cols;
const float* valRow = val + j*cols;
for(size_t j = 0; j < rows; ++j) {
float* gradRow = grad + j * cols;
const float* adjRow = adj + j * cols;
const float* valRow = val + j * cols;
float sum = 0.f;
for (size_t i = 0; i < cols; ++i) {
for(size_t i = 0; i < cols; ++i) {
sum += valRow[i] * adjRow[i];
}
for (size_t i = 0; i < cols; ++i) {
for(size_t i = 0; i < cols; ++i) {
gradRow[i] += valRow[i] * (adjRow[i] - sum);
}
}
@ -243,62 +243,68 @@ void LogSoftmaxGrad(Tensor grad_, Tensor adj_, Tensor val_) {
const float* adj = adj_->data();
const float* val = val_->data();
for (int j = 0; j < rows; ++j) {
float* gradRow = grad + j*cols;
const float* adjRow = adj + j*cols;
const float* valRow = val + j*cols;
for(int j = 0; j < rows; ++j) {
float* gradRow = grad + j * cols;
const float* adjRow = adj + j * cols;
const float* valRow = val + j * cols;
float sum = 0.f;
for (int i = 0; i < cols; ++i) {
for(int i = 0; i < cols; ++i) {
sum += adjRow[i];
}
for (int i = 0; i < cols; ++i) {
gradRow[i] += adjRow[i] - sum*std::exp(valRow[i]);
for(int i = 0; i < cols; ++i) {
gradRow[i] += adjRow[i] - sum * std::exp(valRow[i]);
}
}
}
void CopyRows(Tensor out_, const Tensor in_, const std::vector<size_t>& indices) {
void CopyRows(Tensor out_,
const Tensor in_,
const std::vector<size_t>& indices) {
size_t cols = in_->shape()[1];
size_t rows = indices.size();
float* out = out_->data();
const float* in = in_->data();
#pragma omp parallel for
for (int j = 0; j < rows; ++j) {
#pragma omp parallel for
for(int j = 0; j < rows; ++j) {
size_t dst = j;
size_t src = indices[j];
float* rowOut = out + dst*cols;
const float* rowIn = in + src*cols;
float* rowOut = out + dst * cols;
const float* rowIn = in + src * cols;
std::copy(rowIn, rowIn + cols, rowOut);
}
}
void PasteRows(Tensor out_, const Tensor in_, const std::vector<size_t>& indices) {
void PasteRows(Tensor out_,
const Tensor in_,
const std::vector<size_t>& indices) {
size_t cols = in_->shape()[-1];
size_t rows = indices.size();
float* out = out_->data();
const float* in = in_->data();
for (int j = 0; j < rows; ++j) {
size_t dst = indices[j]; // not a permutation - may alias, unlike PasteCols
for(int j = 0; j < rows; ++j) {
size_t dst = indices[j]; // not a permutation - may alias, unlike PasteCols
size_t src = j;
float* rowOut = out + dst*cols;
const float* rowIn = in + src*cols;
float* rowOut = out + dst * cols;
const float* rowIn = in + src * cols;
for (int i = 0; i < cols; ++i) {
for(int i = 0; i < cols; ++i) {
rowOut[i] += rowIn[i];
}
}
}
void CopyCols(Tensor out_, const Tensor in_, const std::vector<size_t>& indices) {
void CopyCols(Tensor out_,
const Tensor in_,
const std::vector<size_t>& indices) {
size_t rows = in_->shape().elements() / in_->shape()[-1];
size_t colsIn = in_->shape()[-1];
size_t colsOut = indices.size();
@ -306,18 +312,20 @@ void CopyCols(Tensor out_, const Tensor in_, const std::vector<size_t>& indices)
float* out = out_->data();
const float* in = in_->data();
#pragma omp parallel for
for (int j = 0; j < rows; ++j) {
const float* rowIn = in + j*colsIn;
float* rowOut = out + j*colsOut;
#pragma omp parallel for
for(int j = 0; j < rows; ++j) {
const float* rowIn = in + j * colsIn;
float* rowOut = out + j * colsOut;
for (int i = 0; i < colsOut; ++i) {
for(int i = 0; i < colsOut; ++i) {
rowOut[i] = rowIn[indices[i]];
}
}
}
void PasteCols(Tensor out_, const Tensor in_, const std::vector<size_t>& indices) {
void PasteCols(Tensor out_,
const Tensor in_,
const std::vector<size_t>& indices) {
size_t rows = out_->shape().elements() / out_->shape()[-1];
size_t colsOut = out_->shape()[-1];
size_t colsIn = indices.size();
@ -328,12 +336,12 @@ void PasteCols(Tensor out_, const Tensor in_, const std::vector<size_t>& indices
/* n.b. Unlike PasteRows, currently appears safe to assume indices[i] is a
* permutation i.e. no racy aliases, and no need to sum vs. just assign.
*/
for (int j = 0; j < rows; ++j) {
const float* rowIn = in + j*colsIn;
float* rowOut = out + j*colsOut;
for(int j = 0; j < rows; ++j) {
const float* rowIn = in + j * colsIn;
float* rowOut = out + j * colsOut;
// @TODO: should this be a sum?
for (int i = 0; i < colsIn; ++i) {
for(int i = 0; i < colsIn; ++i) {
rowOut[indices[i]] = rowIn[i];
}
}
@ -367,8 +375,8 @@ void GRUFastForward(Tensor out_, std::vector<Tensor> inputs, bool final) {
const float* b = inputs[3]->data();
const float* mask = inputs.size() > 4 ? inputs[4]->data() : nullptr;
#pragma omp parallel for
for (int j = 0; j < rows; ++j) {
#pragma omp parallel for
for(int j = 0; j < rows; ++j) {
float m = !mask || mask[j];
float* rowOut = out + j * cols;
const float* rowState = state + j * cols;
@ -376,8 +384,8 @@ void GRUFastForward(Tensor out_, std::vector<Tensor> inputs, bool final) {
const float* xWrow = xW + j * cols * 3;
const float* sUrow = sU + j * cols * 3;
#pragma omp simd
for (int i = 0; i < cols; ++i) {
#pragma omp simd
for(int i = 0; i < cols; ++i) {
// @TODO: stable logit
float r = stableLogit(xWrow[i] + sUrow[i] + b[i]);
@ -417,8 +425,8 @@ void GRUFastBackward(std::vector<Tensor> outputs,
const float* mask = inputs.size() > 4 ? inputs[4]->data() : 0;
const float* adj = adj_->data();
#pragma omp parallel
for (int j = 0; j < rows; ++j) {
#pragma omp parallel
for(int j = 0; j < rows; ++j) {
float m = !mask || mask[j];
float* rowOutState = outState + j * cols;
@ -430,8 +438,8 @@ void GRUFastBackward(std::vector<Tensor> outputs,
const float* rowSU = sU + j * cols * 3;
const float* rowAdj = adj + j * cols;
#pragma omp for simd nowait
for (int i = 0; i < cols; ++i) {
#pragma omp for simd nowait
for(int i = 0; i < cols; ++i) {
int k = i + cols;
int l = i + 2 * cols;
@ -446,10 +454,11 @@ void GRUFastBackward(std::vector<Tensor> outputs,
float adj = rowAdj[i];
float t = (1-z)*(1-h*h);
float t = (1 - z) * (1 - h * h);
// df/ds
if(outState) rowOutState[i] += (m * z - m + 1) * adj;
if(outState)
rowOutState[i] += (m * z - m + 1) * adj;
// df/d(xW_r) ...
float dfdxW_r = m * r * (1 - r) * t * adj;
@ -457,20 +466,28 @@ void GRUFastBackward(std::vector<Tensor> outputs,
dfdxW_r *= rowSU[l] + b[l];
else
dfdxW_r *= rowSU[l];
if(outXW) rowOutXW[i] += dfdxW_r;
if(outSU) rowOutSU[i] += dfdxW_r;
if(outB) outB[i] += dfdxW_r;
if(outXW)
rowOutXW[i] += dfdxW_r;
if(outSU)
rowOutSU[i] += dfdxW_r;
if(outB)
outB[i] += dfdxW_r;
// df/d(xW_z) ...
float dfdxW_z = m * (1 - z) * z * (rowState[i] - h) * adj;
if(outXW) rowOutXW[k] += dfdxW_z;
if(outSU) rowOutSU[k] += dfdxW_z;
if(outB) outB[k] += dfdxW_z;
if(outXW)
rowOutXW[k] += dfdxW_z;
if(outSU)
rowOutSU[k] += dfdxW_z;
if(outB)
outB[k] += dfdxW_z;
// df/d(xW_x) ...
float dfdxW_x = m * t * adj;
if(outXW) rowOutXW[l] += dfdxW_x;
if(outSU) rowOutSU[l] += dfdxW_x * r;
if(outXW)
rowOutXW[l] += dfdxW_x;
if(outSU)
rowOutSU[l] += dfdxW_x * r;
if(outB)
if(final)
outB[l] += dfdxW_x * r;
@ -490,18 +507,18 @@ void CrossEntropyPick(Tensor out_, Tensor in_, Tensor pick_) {
int rows = inShape.elements() / inShape.back();
int cols = inShape.back();
#pragma omp parallel for
for (int j = 0; j < rows; ++j) {
const float* sp = in + j*cols;
#pragma omp parallel for
for(int j = 0; j < rows; ++j) {
const float* sp = in + j * cols;
float max = sp[0];
#pragma omp simd reduction(max:max)
for (int i = 1; i < cols; ++i) {
#pragma omp simd reduction(max : max)
for(int i = 1; i < cols; ++i) {
max = std::max(max, sp[i]);
}
float sum = 0.f;
#pragma omp simd reduction(+:sum)
for (int i = 0; i < cols; ++i) {
#pragma omp simd reduction(+ : sum)
for(int i = 0; i < cols; ++i) {
sum += std::exp(sp[i] - max);
}
@ -512,7 +529,10 @@ void CrossEntropyPick(Tensor out_, Tensor in_, Tensor pick_) {
}
}
void CrossEntropyPickBackward(Tensor out_, Tensor adj_, Tensor a, Tensor pick_) {
void CrossEntropyPickBackward(Tensor out_,
Tensor adj_,
Tensor a,
Tensor pick_) {
float* out = out_->data();
Shape& outShape = out_->shape();
const float* adj = adj_->data();
@ -522,23 +542,23 @@ void CrossEntropyPickBackward(Tensor out_, Tensor adj_, Tensor a, Tensor pick_)
int rows = outShape.elements() / outShape.back();
int cols = outShape.back();
#pragma omp parallel for
for (int j = 0; j < rows; ++j) {
const float* sp = in + j*cols;
float* so = out + j*cols;
#pragma omp parallel for
for(int j = 0; j < rows; ++j) {
const float* sp = in + j * cols;
float* so = out + j * cols;
float max = sp[0];
for (int i = 1; i < cols; ++i) {
for(int i = 1; i < cols; ++i) {
max = std::max(max, sp[i]);
}
float sum = 0.f;
for (int i = 0; i < cols; ++i) {
for(int i = 0; i < cols; ++i) {
sum += std::exp(sp[i] - max);
}
// cross-entropy
for (int i = 0; i < cols; ++i) {
for(int i = 0; i < cols; ++i) {
float sub = (float)(i == (int)pick[j]);
so[i] += adj[j] * (std::exp(sp[i] - max) / sum - sub);
}
@ -549,8 +569,8 @@ float L2Norm(Tensor in) {
float sum = 0.f;
size_t size = in->size();
const float* data = in->data();
#pragma omp parallel for simd reduction(+:sum)
for (size_t i = 0; i < size; ++i) {
#pragma omp parallel for simd reduction(+ : sum)
for(size_t i = 0; i < size; ++i) {
sum += data[i] * data[i];
}
return std::sqrt(sum);
@ -570,15 +590,15 @@ void Att(Tensor out_, Tensor va_, Tensor context_, Tensor state_) {
int rows = m;
int cols = k;
#pragma omp parallel for
for (size_t j = 0; j < rows; ++j) {
#pragma omp parallel for
for(size_t j = 0; j < rows; ++j) {
const float* vaRow = va;
const float* ctxRow = ctx + (j % (b * t)) * cols;
const float* stateRow = state + ((j / (b * t)) * b + j % b) * cols;
float sum = 0.f;
#pragma omp simd reduction(+:sum)
for (size_t i = 0; i < cols; ++i) {
#pragma omp simd reduction(+ : sum)
for(size_t i = 0; i < cols; ++i) {
float z = ctxRow[i] + stateRow[i];
sum += std::tanh(z) * vaRow[i];
}
@ -587,8 +607,12 @@ void Att(Tensor out_, Tensor va_, Tensor context_, Tensor state_) {
}
}
void AttBack(Tensor gVa_, Tensor gContext_, Tensor gState_,
Tensor va_, Tensor context_, Tensor state_,
void AttBack(Tensor gVa_,
Tensor gContext_,
Tensor gState_,
Tensor va_,
Tensor context_,
Tensor state_,
Tensor adj_) {
float* gVa = gVa_->data();
float* gContext = gContext_->data();
@ -603,8 +627,8 @@ void AttBack(Tensor gVa_, Tensor gContext_, Tensor gState_,
size_t k = context_->shape()[-1];
size_t n = context_->shape()[-2];
#pragma omp parallel for reduction(+:gState[:n*k], gVa[:k])
for (size_t j = 0; j < m; ++j) {
#pragma omp parallel for reduction(+ : gState[ : n* k], gVa[ : k])
for(size_t j = 0; j < m; ++j) {
float* gcRow = gContext + j * k;
float* gsRow = gState + (j % n) * k;
@ -613,8 +637,8 @@ void AttBack(Tensor gVa_, Tensor gContext_, Tensor gState_,
float adj_j = adj[j];
#pragma omp simd
for (size_t i = 0; i < k; ++i) {
#pragma omp simd
for(size_t i = 0; i < k; ++i) {
float z = cRow[i] + sRow[i];
float t = std::tanh(z);
@ -642,31 +666,31 @@ void LayerNormalization(Tensor out_,
int rows = in_->shape().elements() / in_->shape().back();
int cols = in_->shape().back();
#pragma omp parallel for
for (int j = 0; j < rows; ++j) {
float* so = out + j*cols;
const float* sp = in + j*cols;
#pragma omp parallel for
for(int j = 0; j < rows; ++j) {
float* so = out + j * cols;
const float* sp = in + j * cols;
float sum = 0.f;
#pragma omp simd reduction(+:sum)
for (int i = 0; i < cols; ++i) {
#pragma omp simd reduction(+ : sum)
for(int i = 0; i < cols; ++i) {
sum += sp[i];
}
float mean = sum / cols;
float sqSum = 0.f;
#pragma omp simd reduction(+:sqSum)
for (int i = 0; i < cols; ++i) {
#pragma omp simd reduction(+ : sqSum)
for(int i = 0; i < cols; ++i) {
float ex = sp[i] - mean;
sqSum += ex*ex;
sqSum += ex * ex;
}
float sigma = std::sqrt(eps + sqSum / cols);
#pragma omp simd
for (int i = 0; i < cols; ++i) {
#pragma omp simd
for(int i = 0; i < cols; ++i) {
float t = alpha[i] * ((sp[i] - mean) / sigma);
if (beta != nullptr) {
if(beta != nullptr) {
t += beta[i];
}
@ -696,36 +720,36 @@ void LayerNormalizationGrad(Tensor gradX_,
size_t rows = y_->shape().elements() / y_->shape()[-1];
size_t cols = y_->shape()[-1];
if (beta) {
#pragma omp parallel for reduction(+:gradGamma[:cols], gradBeta[:cols])
for (size_t j = 0; j < rows; ++j) {
const float* xRow = x + j*cols;
const float* yRow = y + j*cols;
const float* adjRow = adj + j*cols;
float* gradXRow = gradX + j*cols;
if(beta) {
#pragma omp parallel for reduction(+ : gradGamma[ : cols], gradBeta[ : cols])
for(size_t j = 0; j < rows; ++j) {
const float* xRow = x + j * cols;
const float* yRow = y + j * cols;
const float* adjRow = adj + j * cols;
float* gradXRow = gradX + j * cols;
float sum_x = 0.f;
float sum_adj = 0.f;
float sum_adj_x = 0.f;
float sum_sqr = 0.f;
#pragma omp simd reduction(+:sum_x, sum_adj_x, sum_adj)
for (size_t i = 0; i < cols; ++i) {
#pragma omp simd reduction(+ : sum_x, sum_adj_x, sum_adj)
for(size_t i = 0; i < cols; ++i) {
sum_x += xRow[i];
sum_adj_x += adjRow[i] * (yRow[i] - (beta ? beta[i] : 0.f)) / gamma[i];
sum_adj += adjRow[i];
}
float mean = sum_x / cols;
#pragma omp simd reduction(+:sum_sqr)
for (size_t i = 0; i < cols; ++i) {
#pragma omp simd reduction(+ : sum_sqr)
for(size_t i = 0; i < cols; ++i) {
float ex = xRow[i] - mean;
sum_sqr += ex*ex;
sum_sqr += ex * ex;
}
float sigma = std::sqrt(eps + sum_sqr / cols);
#pragma omp simd
for (size_t i = 0; i < cols; ++i) {
#pragma omp simd
for(size_t i = 0; i < cols; ++i) {
float grad_x = 0.f;
float x_hat = (yRow[i] - beta[i]) / gamma[i];
grad_x += cols * adjRow[i];
@ -739,35 +763,35 @@ void LayerNormalizationGrad(Tensor gradX_,
}
}
} else {
#pragma omp parallel for reduction(+:gradGamma[:cols])
for (size_t j = 0; j < rows; ++j) {
const float* xRow = x + j*cols;
const float* yRow = y + j*cols;
const float* adjRow = adj + j*cols;
float* gradXRow = gradX + j*cols;
#pragma omp parallel for reduction(+ : gradGamma[ : cols])
for(size_t j = 0; j < rows; ++j) {
const float* xRow = x + j * cols;
const float* yRow = y + j * cols;
const float* adjRow = adj + j * cols;
float* gradXRow = gradX + j * cols;
float sum_x = 0.f;
float sum_adj = 0.f;
float sum_adj_x = 0.f;
float sum_sqr = 0.f;
#pragma omp simd reduction(+:sum_x, sum_adj_x, sum_adj)
for (size_t i = 0; i < cols; ++i) {
#pragma omp simd reduction(+ : sum_x, sum_adj_x, sum_adj)
for(size_t i = 0; i < cols; ++i) {
sum_x += xRow[i];
sum_adj_x += adjRow[i] * (yRow[i] - (beta ? beta[i] : 0.f)) / gamma[i];
sum_adj += adjRow[i];
}
float mean = sum_x / cols;
#pragma omp simd reduction(+:sum_sqr)
for (size_t i = 0; i < cols; ++i) {
#pragma omp simd reduction(+ : sum_sqr)
for(size_t i = 0; i < cols; ++i) {
float ex = xRow[i] - mean;
sum_sqr += ex*ex;
sum_sqr += ex * ex;
}
float sigma = std::sqrt(eps + sum_sqr / cols);
#pragma omp simd
for (size_t i = 0; i < cols; ++i) {
#pragma omp simd
for(size_t i = 0; i < cols; ++i) {
float grad_x = 0.f;
float x_hat = yRow[i] / gamma[i];
grad_x += cols * adjRow[i];
@ -794,9 +818,9 @@ void Shift(Tensor out_, Tensor in_, marian::Shape shift, bool invert) {
const float* in = in_->data();
int length = out_->shape().elements();
#pragma omp parallel for
for (int i = 0; i < length; ++i) {
if (i - offset < 0 || i - offset >= length) {
#pragma omp parallel for
for(int i = 0; i < length; ++i) {
if(i - offset < 0 || i - offset >= length) {
out[i] = 0.f;
} else {
out[i] = in[i - offset];
@ -808,7 +832,7 @@ void SetSparse(float* out,
const std::vector<size_t>& indices,
const std::vector<float>& values) {
int length = indices.size();
for (int index = 0; index < length; ++index) {
for(int index = 0; index < length; ++index) {
out[indices[index]] = values[index];
}
}
@ -824,26 +848,26 @@ void LSTMCellForward(Tensor out_, std::vector<Tensor> inputs) {
const float* b = inputs[3]->data();
const float* mask = inputs.size() > 4 ? inputs[4]->data() : nullptr;
for (int j = 0; j < rows; ++j) {
for(int j = 0; j < rows; ++j) {
float m = !mask || mask[j];
float* rowOut = out + j*cols;
const float* rowCell = cell + j*cols;
float* rowOut = out + j * cols;
const float* rowCell = cell + j * cols;
const float* xWrow = xW + j*cols*4;
const float* sUrow = sU + j*cols*4;
const float* xWrow = xW + j * cols * 4;
const float* sUrow = sU + j * cols * 4;
for (int i = 0; i < cols; ++i) {
for(int i = 0; i < cols; ++i) {
float gf = stableLogit(xWrow[i] + sUrow[i] + b[i]);
int k = i + cols;
float gi = stableLogit(xWrow[k] + sUrow[k] + b[k]);
int l = i + 2*cols;
int l = i + 2 * cols;
float gc = std::tanh(xWrow[l] + sUrow[l] + b[l]);
float cout = gf*rowCell[i] + gi*gc;
rowOut[i] = m*cout + (1-m)*rowCell[i];
float cout = gf * rowCell[i] + gi * gc;
rowOut[i] = m * cout + (1 - m) * rowCell[i];
}
}
}
@ -858,15 +882,15 @@ void LSTMOutputForward(Tensor out_, std::vector<Tensor> inputs) {
const float* sU = inputs[2]->data();
const float* b = inputs[3]->data();
for (int j = 0; j <rows; ++j) {
float* rowOut = out + j*cols;
const float* rowCell = cell + j*cols;
for(int j = 0; j < rows; ++j) {
float* rowOut = out + j * cols;
const float* rowCell = cell + j * cols;
const float* xWrow = xW + j*cols*4;
const float* sUrow = sU + j*cols*4;
const float* xWrow = xW + j * cols * 4;
const float* sUrow = sU + j * cols * 4;
for (int i = 0; i < cols; ++i) {
int k = i + 3*cols;
for(int i = 0; i < cols; ++i) {
int k = i + 3 * cols;
float go = stableLogit(xWrow[k] + sUrow[k] + b[k]);
rowOut[i] = go * std::tanh(rowCell[i]);
@ -893,52 +917,70 @@ void LSTMCellBackward(std::vector<Tensor> outputs,
const float* mask = inputs.size() > 4 ? inputs[4]->data() : nullptr;
const float* adj = adj_->data();
for (int j = 0; j <rows; ++j) {
for(int j = 0; j < rows; ++j) {
float m = !mask || mask[j];
float* rowOutCell = outCell + j*cols;
float* rowOutXW = outXW + j*cols*4;
float* rowOutSU = outSU + j*cols*4;
float* rowOutCell = outCell + j * cols;
float* rowOutXW = outXW + j * cols * 4;
float* rowOutSU = outSU + j * cols * 4;
const float* rowCell = cell + j*cols;
const float* xWrow = xW + j*cols*4;
const float* sUrow = sU + j*cols*4;
const float* rowCell = cell + j * cols;
const float* xWrow = xW + j * cols * 4;
const float* sUrow = sU + j * cols * 4;
const float* rowAdj = adj + j*cols;
const float* rowAdj = adj + j * cols;
for (int i = 0; i < cols; ++i) {
for(int i = 0; i < cols; ++i) {
float gf = stableLogit(xWrow[i] + sUrow[i] + b[i]);
int k = i + cols;
float gi = stableLogit(xWrow[k] + sUrow[k] + b[k]);
int l = i + 2*cols;
int l = i + 2 * cols;
float gc = std::tanh(xWrow[l] + sUrow[l] + b[l]);
float adj = rowAdj[i];
// dc/dx_{t-1}
if (outCell) {
rowOutCell[i] += (m*gf - m + 1)*adj;
if(outCell) {
rowOutCell[i] += (m * gf - m + 1) * adj;
}
// dc/d(b_f) = dc/d(xW_f) ...
float dcdxf = m*rowCell[i] * gf*(1-gf) * adj;
if (outXW) { rowOutXW[i] += dcdxf; }
if (outSU) { rowOutSU[i] += dcdxf; }
if (outB) { outB[i] += dcdxf; }
float dcdxf = m * rowCell[i] * gf * (1 - gf) * adj;
if(outXW) {
rowOutXW[i] += dcdxf;
}
if(outSU) {
rowOutSU[i] += dcdxf;
}
if(outB) {
outB[i] += dcdxf;
}
// dc/d(b_i) ...
float dcdb_i = m * gc * gi*(1-gi) * adj;
if (outXW) { rowOutXW[k] += dcdb_i; }
if (outSU) { rowOutSU[k] += dcdb_i; }
if (outB) { outB[k] += dcdb_i; }
float dcdb_i = m * gc * gi * (1 - gi) * adj;
if(outXW) {
rowOutXW[k] += dcdb_i;
}
if(outSU) {
rowOutSU[k] += dcdb_i;
}
if(outB) {
outB[k] += dcdb_i;
}
// dc/d(b_c) ...
float dcdxc = m * gi * (1 - gc*gc) * adj;
if (outXW) { rowOutXW[l] += dcdxc; }
if (outSU) { rowOutSU[l] += dcdxc; }
if (outB) { outB[l] += dcdxc; }
float dcdxc = m * gi * (1 - gc * gc) * adj;
if(outXW) {
rowOutXW[l] += dcdxc;
}
if(outSU) {
rowOutSU[l] += dcdxc;
}
if(outB) {
outB[l] += dcdxc;
}
}
}
}
@ -961,19 +1003,19 @@ void LSTMOutputBackward(std::vector<Tensor> outputs,
const float* adj = adj_->data();
for (int j = 0; j < rows; ++j) {
float* rowOutCell = outCell + j*cols;
float* rowOutXW = outXW + j*cols*4;
float* rowOutSU = outSU + j*cols*4;
for(int j = 0; j < rows; ++j) {
float* rowOutCell = outCell + j * cols;
float* rowOutXW = outXW + j * cols * 4;
float* rowOutSU = outSU + j * cols * 4;
const float* rowCell = cell + j*cols;
const float* xWrow = xW + j*cols*4;
const float* sUrow = sU + j*cols*4;
const float* rowCell = cell + j * cols;
const float* xWrow = xW + j * cols * 4;
const float* sUrow = sU + j * cols * 4;
const float* rowAdj = adj + j*cols;
const float* rowAdj = adj + j * cols;
for (int i = 0; i < cols; ++i) {
int k = i + 3*cols;
for(int i = 0; i < cols; ++i) {
int k = i + 3 * cols;
float go = stableLogit(xWrow[k] + sUrow[k] + b[k]);
float t = std::tanh(rowCell[i]);
@ -981,15 +1023,21 @@ void LSTMOutputBackward(std::vector<Tensor> outputs,
float adj = rowAdj[i];
// dc/dc_{t-1}
if (outCell) {
rowOutCell[i] += go * (1 - t*t) * adj;
if(outCell) {
rowOutCell[i] += go * (1 - t * t) * adj;
}
// dc/d(b_o) = dc/d(xW_f) ...
float dcdxo = t * go*(1-go) * adj;
if (outXW) { rowOutXW[k] += dcdxo; }
if (outSU) { rowOutSU[k] += dcdxo; }
if (outB) { outB[k] += dcdxo; }
float dcdxo = t * go * (1 - go) * adj;
if(outXW) {
rowOutXW[k] += dcdxo;
}
if(outSU) {
rowOutSU[k] += dcdxo;
}
if(outB) {
outB[k] += dcdxo;
}
}
}
}
@ -1027,6 +1075,5 @@ void PoolingWithMaskingBackward(Tensor adj,
bool isEven) {
ABORT("Not implemented!");
}
}
} // namespace marian

View File

@ -23,7 +23,7 @@ public:
Device(DeviceId deviceId, size_t alignment = 256)
: deviceId_(deviceId), data_(0), size_(0), alignment_(alignment) {}
virtual ~Device() {};
virtual ~Device(){};
virtual void reserve(size_t size) = 0;
@ -35,30 +35,31 @@ public:
};
namespace gpu {
class Device : public marian::Device {
public:
Device(DeviceId deviceId, size_t alignment = 256)
class Device : public marian::Device {
public:
Device(DeviceId deviceId, size_t alignment = 256)
: marian::Device(deviceId, alignment) {}
~Device();
~Device();
void reserve(size_t size);
};
void reserve(size_t size);
};
}
namespace cpu {
class Device : public marian::Device {
public:
Device(DeviceId deviceId, size_t alignment = 256)
class Device : public marian::Device {
public:
Device(DeviceId deviceId, size_t alignment = 256)
: marian::Device(deviceId, alignment) {}
~Device();
~Device();
void reserve(size_t size);
};
void reserve(size_t size);
};
}
static inline Ptr<Device> DispatchDevice(DeviceId deviceId, size_t alignment = 256) {
static inline Ptr<Device> DispatchDevice(DeviceId deviceId,
size_t alignment = 256) {
#ifdef CUDA_FOUND
if(deviceId.type == DeviceType::gpu)
return New<gpu::Device>(deviceId, alignment);
@ -71,5 +72,4 @@ static inline Ptr<Device> DispatchDevice(DeviceId deviceId, size_t alignment = 2
return New<cpu::Device>(deviceId, alignment);
#endif
}
}

View File

@ -2,204 +2,250 @@
#ifdef CUDA_FOUND
// Declares gpu::Function(Arg1) and cpu::Function(Arg1) and defines a
// dispatcher Function(arg1) that calls the gpu:: version when arg1's backend
// device type is DeviceType::gpu and the cpu:: version otherwise.
// The dispatcher is `static inline`, like DISPATCH2..DISPATCH9, so expanding
// the macro in more than one translation unit does not create duplicate
// external definitions.
#define DISPATCH1(Function, Arg1) \
  namespace gpu { \
  void Function(Arg1); \
  } \
  namespace cpu { \
  void Function(Arg1); \
  } \
  static inline void Function(Arg1 arg1) { \
    if(arg1->getBackend()->getDevice().type == DeviceType::gpu) \
      gpu::Function(arg1); \
    else \
      cpu::Function(arg1); \
  }
// Declares gpu::Function and cpu::Function with signature (Arg1, Arg2) and
// defines a static inline dispatcher that picks the gpu:: version when arg1's
// backend device type is DeviceType::gpu, otherwise the cpu:: version.
#define DISPATCH2(Function, Arg1, Arg2) \
  namespace gpu { \
  void Function(Arg1, Arg2); \
  } \
  namespace cpu { \
  void Function(Arg1, Arg2); \
  } \
  static inline void Function(Arg1 arg1, Arg2 arg2) { \
    if(arg1->getBackend()->getDevice().type == DeviceType::gpu) { \
      gpu::Function(arg1, arg2); \
    } else { \
      cpu::Function(arg1, arg2); \
    } \
  }
// Declares gpu::Function and cpu::Function with signature (Arg1, Arg2, Arg3)
// and defines a static inline dispatcher that picks the gpu:: version when
// arg1's backend device type is DeviceType::gpu, otherwise the cpu:: version.
#define DISPATCH3(Function, Arg1, Arg2, Arg3) \
  namespace gpu { \
  void Function(Arg1, Arg2, Arg3); \
  } \
  namespace cpu { \
  void Function(Arg1, Arg2, Arg3); \
  } \
  static inline void Function(Arg1 arg1, Arg2 arg2, Arg3 arg3) { \
    if(arg1->getBackend()->getDevice().type == DeviceType::gpu) { \
      gpu::Function(arg1, arg2, arg3); \
    } else { \
      cpu::Function(arg1, arg2, arg3); \
    } \
  }
// Declares gpu::Function and cpu::Function with signature
// (Arg1, Arg2, Arg3, Arg4) and defines a static inline dispatcher that picks
// the gpu:: version when arg1's backend device type is DeviceType::gpu,
// otherwise the cpu:: version.
#define DISPATCH4(Function, Arg1, Arg2, Arg3, Arg4) \
  namespace gpu { \
  void Function(Arg1, Arg2, Arg3, Arg4); \
  } \
  namespace cpu { \
  void Function(Arg1, Arg2, Arg3, Arg4); \
  } \
  static inline void Function(Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4) { \
    if(arg1->getBackend()->getDevice().type == DeviceType::gpu) { \
      gpu::Function(arg1, arg2, arg3, arg4); \
    } else { \
      cpu::Function(arg1, arg2, arg3, arg4); \
    } \
  }
// Declares gpu::Function and cpu::Function taking five arguments and defines
// a static inline dispatcher that picks the gpu:: version when arg1's backend
// device type is DeviceType::gpu, otherwise the cpu:: version.
#define DISPATCH5(Function, Arg1, Arg2, Arg3, Arg4, Arg5) \
  namespace gpu { \
  void Function(Arg1, Arg2, Arg3, Arg4, Arg5); \
  } \
  namespace cpu { \
  void Function(Arg1, Arg2, Arg3, Arg4, Arg5); \
  } \
  static inline void Function(Arg1 arg1, \
                              Arg2 arg2, \
                              Arg3 arg3, \
                              Arg4 arg4, \
                              Arg5 arg5) { \
    if(arg1->getBackend()->getDevice().type == DeviceType::gpu) { \
      gpu::Function(arg1, arg2, arg3, arg4, arg5); \
    } else { \
      cpu::Function(arg1, arg2, arg3, arg4, arg5); \
    } \
  }
// Declares gpu::Function and cpu::Function taking six arguments and defines
// a static inline dispatcher that picks the gpu:: version when arg1's backend
// device type is DeviceType::gpu, otherwise the cpu:: version.
#define DISPATCH6(Function, Arg1, Arg2, Arg3, Arg4, Arg5, Arg6) \
  namespace gpu { \
  void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6); \
  } \
  namespace cpu { \
  void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6); \
  } \
  static inline void Function(Arg1 arg1, \
                              Arg2 arg2, \
                              Arg3 arg3, \
                              Arg4 arg4, \
                              Arg5 arg5, \
                              Arg6 arg6) { \
    if(arg1->getBackend()->getDevice().type == DeviceType::gpu) { \
      gpu::Function(arg1, arg2, arg3, arg4, arg5, arg6); \
    } else { \
      cpu::Function(arg1, arg2, arg3, arg4, arg5, arg6); \
    } \
  }
// Declares gpu::Function and cpu::Function taking seven arguments and defines
// a static inline dispatcher that picks the gpu:: version when arg1's backend
// device type is DeviceType::gpu, otherwise the cpu:: version.
#define DISPATCH7(Function, Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7) \
  namespace gpu { \
  void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7); \
  } \
  namespace cpu { \
  void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7); \
  } \
  static inline void Function(Arg1 arg1, \
                              Arg2 arg2, \
                              Arg3 arg3, \
                              Arg4 arg4, \
                              Arg5 arg5, \
                              Arg6 arg6, \
                              Arg7 arg7) { \
    if(arg1->getBackend()->getDevice().type == DeviceType::gpu) { \
      gpu::Function(arg1, arg2, arg3, arg4, arg5, arg6, arg7); \
    } else { \
      cpu::Function(arg1, arg2, arg3, arg4, arg5, arg6, arg7); \
    } \
  }
// Declares gpu::Function and cpu::Function taking eight arguments and defines
// a static inline dispatcher that picks the gpu:: version when arg1's backend
// device type is DeviceType::gpu, otherwise the cpu:: version.
#define DISPATCH8(Function, Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8) \
  namespace gpu { \
  void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8); \
  } \
  namespace cpu { \
  void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8); \
  } \
  static inline void Function(Arg1 arg1, \
                              Arg2 arg2, \
                              Arg3 arg3, \
                              Arg4 arg4, \
                              Arg5 arg5, \
                              Arg6 arg6, \
                              Arg7 arg7, \
                              Arg8 arg8) { \
    if(arg1->getBackend()->getDevice().type == DeviceType::gpu) { \
      gpu::Function(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8); \
    } else { \
      cpu::Function(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8); \
    } \
  }
// Declares gpu::Function and cpu::Function taking nine arguments and defines
// a static inline dispatcher that picks the gpu:: version when arg1's backend
// device type is DeviceType::gpu, otherwise the cpu:: version.
#define DISPATCH9( \
    Function, Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8, Arg9) \
  namespace gpu { \
  void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8, Arg9); \
  } \
  namespace cpu { \
  void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8, Arg9); \
  } \
  static inline void Function(Arg1 arg1, \
                              Arg2 arg2, \
                              Arg3 arg3, \
                              Arg4 arg4, \
                              Arg5 arg5, \
                              Arg6 arg6, \
                              Arg7 arg7, \
                              Arg8 arg8, \
                              Arg9 arg9) { \
    if(arg1->getBackend()->getDevice().type == DeviceType::gpu) { \
      gpu::Function(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9); \
    } else { \
      cpu::Function(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9); \
    } \
  }
#else
// Variant used when CUDA_FOUND is not defined: declares cpu::Function(Arg1)
// and defines Function(arg1) as a direct forward to it.
// The forwarder is `static inline`, like DISPATCH2..DISPATCH9, so expanding
// the macro in more than one translation unit does not create duplicate
// external definitions.
#define DISPATCH1(Function, Arg1) \
  namespace cpu { \
  void Function(Arg1); \
  } \
  static inline void Function(Arg1 arg1) { cpu::Function(arg1); }
// Variant used when CUDA_FOUND is not defined: declares
// cpu::Function(Arg1, Arg2) and defines a static inline Function that
// forwards both arguments to it.
#define DISPATCH2(Function, Arg1, Arg2) \
  namespace cpu { \
  void Function(Arg1, Arg2); \
  } \
  static inline void Function(Arg1 arg1, Arg2 arg2) { \
    cpu::Function(arg1, arg2); \
  }
// Variant used when CUDA_FOUND is not defined: declares
// cpu::Function(Arg1, Arg2, Arg3) and defines a static inline Function that
// forwards all arguments to it.
#define DISPATCH3(Function, Arg1, Arg2, Arg3) \
  namespace cpu { \
  void Function(Arg1, Arg2, Arg3); \
  } \
  static inline void Function(Arg1 arg1, Arg2 arg2, Arg3 arg3) { \
    cpu::Function(arg1, arg2, arg3); \
  }
// Variant used when CUDA_FOUND is not defined: declares the four-argument
// cpu::Function and defines a static inline Function that forwards all
// arguments to it.
#define DISPATCH4(Function, Arg1, Arg2, Arg3, Arg4) \
  namespace cpu { \
  void Function(Arg1, Arg2, Arg3, Arg4); \
  } \
  static inline void Function(Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4) { \
    cpu::Function(arg1, arg2, arg3, arg4); \
  }
// Variant used when CUDA_FOUND is not defined: declares the five-argument
// cpu::Function and defines a static inline Function that forwards all
// arguments to it.
#define DISPATCH5(Function, Arg1, Arg2, Arg3, Arg4, Arg5) \
  namespace cpu { \
  void Function(Arg1, Arg2, Arg3, Arg4, Arg5); \
  } \
  static inline void Function(Arg1 arg1, \
                              Arg2 arg2, \
                              Arg3 arg3, \
                              Arg4 arg4, \
                              Arg5 arg5) { \
    cpu::Function(arg1, arg2, arg3, arg4, arg5); \
  }
// Variant used when CUDA_FOUND is not defined: declares the six-argument
// cpu::Function and defines a static inline Function that forwards all
// arguments to it.
#define DISPATCH6(Function, Arg1, Arg2, Arg3, Arg4, Arg5, Arg6) \
  namespace cpu { \
  void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6); \
  } \
  static inline void Function(Arg1 arg1, \
                              Arg2 arg2, \
                              Arg3 arg3, \
                              Arg4 arg4, \
                              Arg5 arg5, \
                              Arg6 arg6) { \
    cpu::Function(arg1, arg2, arg3, arg4, arg5, arg6); \
  }
// Variant used when CUDA_FOUND is not defined: declares the seven-argument
// cpu::Function and defines a static inline Function that forwards all
// arguments to it.
#define DISPATCH7(Function, Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7) \
  namespace cpu { \
  void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7); \
  } \
  static inline void Function( \
      Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5, Arg6 arg6, \
      Arg7 arg7) { \
    cpu::Function(arg1, arg2, arg3, arg4, arg5, arg6, arg7); \
  }
// Variant used when CUDA_FOUND is not defined: declares the eight-argument
// cpu::Function and defines a static inline Function that forwards all
// arguments to it.
#define DISPATCH8(Function, Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8) \
  namespace cpu { \
  void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8); \
  } \
  static inline void Function( \
      Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5, Arg6 arg6, \
      Arg7 arg7, Arg8 arg8) { \
    cpu::Function(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8); \
  }
// Variant used when CUDA_FOUND is not defined: declares the nine-argument
// cpu::Function and defines a static inline Function that forwards all
// arguments to it.
#define DISPATCH9( \
    Function, Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8, Arg9) \
  namespace cpu { \
  void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8, Arg9); \
  } \
  static inline void Function( \
      Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5, Arg6 arg6, \
      Arg7 arg7, Arg8 arg8, Arg9 arg9) { \
    cpu::Function(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9); \
  }
#endif

View File

@ -9,8 +9,8 @@
#include "functional/functional.h"
#include "functional/shape.h"
#include "functional/tmp.h"
#include "functional/tensor.h"
#include "functional/tmp.h"
namespace marian {
@ -22,7 +22,6 @@ __global__ void gAddGeneric(Functor functor,
functional::Tensor<float> out,
functional::Array<functional::Tensor<float>, K> ins,
float scale = 1.0) {
int outLength = out.shape().elements();
bool same = outLength == full.elements();
for(int i = 0; i < K; ++i)
@ -37,14 +36,12 @@ __global__ void gAddGeneric(Functor functor,
for(int bid = 0; bid < outLength; bid += blockDim.x * gridDim.x) {
int index = bid + blockDim.x * blockIdx.x + threadIdx.x;
if(index < outLength) {
if(same) {
out[index] += functional::apply(functor, ins, index) * scale;
} else {
out.shape().dims(index, dims);
out[index] += functional::loops(functor, ins, len, dims) * scale;
}
}
}
}
@ -81,7 +78,6 @@ __global__ void gAddReduce(Functor functor,
functional::Tensor<float> out,
functional::Array<functional::Tensor<float>, K> ins,
float scale = 1.0) {
int rows = full.elements() / full.back();
int cols = full.back();
@ -133,12 +129,8 @@ __global__ void gAddReduce(Functor functor,
}
}
template <class Functor, class ...Tensors>
void Add(Functor functor,
float scale,
marian::Tensor out,
Tensors... tensors) {
template <class Functor, class... Tensors>
void Add(Functor functor, float scale, marian::Tensor out, Tensors... tensors) {
cudaSetDevice(out->getDevice().no);
auto full = marian::Shape::broadcast({out, tensors...});
@ -148,7 +140,7 @@ void Add(Functor functor,
constexpr size_t K = sizeof...(Tensors);
functional::Tensor<float> gOut = out;
functional::Array<functional::Tensor<float>, K> gIns = {tensors ...};
functional::Array<functional::Tensor<float>, K> gIns = {tensors...};
if(full.back() != 1 && out->shape().back() == 1) {
size_t m = full.elements() / length;
@ -180,6 +172,5 @@ void Add(Functor functor,
}
#include "tensors/gpu/add.inc"
}
}

View File

@ -6,11 +6,7 @@ namespace marian {
namespace gpu {
template <class Functor, class ...Tensors>
void Add(Functor functor,
float scale,
marian::Tensor out,
Tensors... tensors);
template <class Functor, class... Tensors>
void Add(Functor functor, float scale, marian::Tensor out, Tensors... tensors);
}
}

View File

@ -4,45 +4,49 @@
#include "tensors/gpu/cuda_helpers.h"
namespace marian {
namespace gpu {
template <typename T>
void copy(Ptr<Backend> backend, const T* begin, const T* end, T* dest) {
CUDA_CHECK(cudaSetDevice(backend->getDevice().no));
CudaCopy(begin, end, dest);
CUDA_CHECK(cudaStreamSynchronize(0));
namespace gpu {
template <typename T>
void copy(Ptr<Backend> backend, const T* begin, const T* end, T* dest) {
CUDA_CHECK(cudaSetDevice(backend->getDevice().no));
CudaCopy(begin, end, dest);
CUDA_CHECK(cudaStreamSynchronize(0));
}
template void copy<float>(Ptr<Backend> backend,
const float* begin,
const float* end,
float* dest);
template void copy<int>(Ptr<Backend> backend,
const int* begin,
const int* end,
int* dest);
__global__ void gFill(float* d_in, int size, float val) {
for(int bid = 0; bid < size; bid += blockDim.x * gridDim.x) {
int index = bid + threadIdx.x + blockDim.x * blockIdx.x;
if(index < size) {
d_in[index] = val;
}
template void copy<float>(Ptr<Backend> backend, const float* begin, const float* end, float* dest);
template void copy<int>(Ptr<Backend> backend, const int* begin, const int* end, int* dest);
__global__ void gFill(float *d_in, int size, float val) {
for(int bid = 0; bid < size; bid += blockDim.x * gridDim.x) {
int index = bid + threadIdx.x + blockDim.x * blockIdx.x;
if(index < size) {
d_in[index] = val;
}
}
}
void fill(Ptr<Backend> backend, float* begin, float* end, float value) {
CUDA_CHECK(cudaSetDevice(backend->getDevice().no));
int size = end - begin;
int threads = std::min(512, size);
int blocks = (size / threads) + (size % threads != 0);
gFill<<<blocks, threads>>>(begin, size, value);
CUDA_CHECK(cudaStreamSynchronize(0));
}
void setSparse(Ptr<Backend> backend,
const std::vector<size_t>& keys,
const std::vector<float>& values,
float* data) {
CUDA_CHECK(cudaSetDevice(backend->getDevice().no));
ABORT("no SetSparse");
//gpu::SetSparse(data, keys, values);
CUDA_CHECK(cudaStreamSynchronize(0));
}
}
}
void fill(Ptr<Backend> backend, float* begin, float* end, float value) {
CUDA_CHECK(cudaSetDevice(backend->getDevice().no));
int size = end - begin;
int threads = std::min(512, size);
int blocks = (size / threads) + (size % threads != 0);
gFill<<<blocks, threads>>>(begin, size, value);
CUDA_CHECK(cudaStreamSynchronize(0));
}
void setSparse(Ptr<Backend> backend,
const std::vector<size_t>& keys,
const std::vector<float>& values,
float* data) {
CUDA_CHECK(cudaSetDevice(backend->getDevice().no));
ABORT("no SetSparse");
// gpu::SetSparse(data, keys, values);
CUDA_CHECK(cudaStreamSynchronize(0));
}
}
}

View File

@ -3,12 +3,15 @@
#include "tensors/backend.h"
namespace marian {
namespace gpu {
template <typename T>
void copy(Ptr<Backend> backend, const T* begin, const T* end, T* dest);
void fill(Ptr<Backend> backend, float* begin, float* end, float value);
namespace gpu {
template <typename T>
void copy(Ptr<Backend> backend, const T* begin, const T* end, T* dest);
void setSparse(Ptr<Backend> backend, const std::vector<size_t>&, const std::vector<float>&, float*);
}
void fill(Ptr<Backend> backend, float* begin, float* end, float value);
void setSparse(Ptr<Backend> backend,
const std::vector<size_t>&,
const std::vector<float>&,
float*);
}
}

View File

@ -25,13 +25,9 @@ public:
setHandles();
}
// Makes deviceId_.no the current CUDA device.
void setDevice() {
  cudaSetDevice(deviceId_.no);
}
// Blocks until the work queued on stream 0 has finished.
void synchronize() {
  cudaStreamSynchronize(0);
}
cublasHandle_t getCublasHandle() { return cublasHandle_; }
@ -41,13 +37,11 @@ private:
cublasHandle_t cublasHandle_;
curandGenerator_t curandGenerator_;
// Creates the cuBLAS handle and the cuRAND generator owned by this backend
// and stores them in the corresponding members.
void setHandles() {
cublasHandle_ = create_handle();
curandGenerator_ = createCurandGenerator();
}
curandGenerator_t createCurandGenerator() {
cudaSetDevice(deviceId_.no);
curandGenerator_t generator;
@ -67,6 +61,5 @@ private:
return cublasHandle;
}
};
}
}

View File

@ -13,9 +13,8 @@ const int MAX_BLOCKS = 65535;
// Passes a CUDA status code to gpuAssert together with the call site's file
// and line. Wrapped in do { } while(0) so that `CUDA_CHECK(x);` is a single
// statement and stays valid as the unbraced branch of an if/else.
#define CUDA_CHECK(ans) \
  do { \
    gpuAssert((ans), __FILE__, __LINE__); \
  } while(0)
inline void gpuAssert(cudaError_t code,
const char *file,
const char* file,
int line,
bool abort = true) {
if(code != cudaSuccess) {
@ -26,8 +25,8 @@ inline void gpuAssert(cudaError_t code,
// Copies the elements in [start, end) to dest with cudaMemcpy, using
// cudaMemcpyDefault as the copy kind; a failing status goes to CUDA_CHECK.
template <typename T>
void CudaCopy(const T* start, const T* end, T* dest) {
  const auto numBytes = (end - start) * sizeof(T);
  CUDA_CHECK(
      cudaMemcpy((void*)dest, (void*)start, numBytes, cudaMemcpyDefault));
}
#define CUSPARSE_CHECK(x) \

View File

@ -7,34 +7,34 @@
namespace marian {
namespace gpu {

// Selects this object's device, frees the buffer if one was allocated, and
// then waits for the device with cudaDeviceSynchronize().
Device::~Device() {
  cudaSetDevice(deviceId_.no);
  if(data_) {
    CUDA_CHECK(cudaFree(data_));
  }
  cudaDeviceSynchronize();
}

// Ensures the device buffer holds `size` bytes after alignment. Aborts when
// the aligned size is 0 or smaller than the current size. Existing contents
// (the first size_ bytes) are preserved across the reallocation.
void Device::reserve(size_t size) {
  size = align(size);
  cudaSetDevice(deviceId_.no);

  ABORT_IF(size < size_ || size == 0,
           "New size must be larger than old size and larger than 0");

  if(!data_) {
    // Nothing allocated yet, so there is nothing to carry over.
    CUDA_CHECK(cudaMalloc(&data_, size));
    size_ = size;
    return;
  }

  // Stage the old contents in host memory so that the old device buffer is
  // freed before the new one is allocated.
  uint8_t* hostCopy = new uint8_t[size_];
  CUDA_CHECK(cudaMemcpy(hostCopy, data_, size_, cudaMemcpyDeviceToHost));
  CUDA_CHECK(cudaFree(data_));
  CUDA_CHECK(cudaMalloc(&data_, size));
  CUDA_CHECK(cudaMemcpy(data_, hostCopy, size_, cudaMemcpyHostToDevice));
  delete[] hostCopy;

  size_ = size;
}
}
}

View File

@ -22,31 +22,29 @@
} \
} while(0)
namespace marian {
namespace gpu {

// Kernel: rewrites each of the first n elements in place to
// (data[i] < p) / p, i.e. 1/p where the value is below p and 0 otherwise.
__global__ void gScale(float* data, int n, float p) {
  int index = threadIdx.x + blockIdx.x * blockDim.x;
  while(index < n) {
    data[index] = (data[index] < p) / p;
    index += gridDim.x * blockDim.x;
  }
}

// Fills `tensor` with a dropout mask: uniform samples are drawn with the
// backend's cuRAND generator and then passed through gScale with 1 - p.
void Dropout(Tensor tensor, float p) {
  // Select the tensor's device before launching, as the other GPU ops do;
  // otherwise the kernel runs on whatever device happens to be current.
  cudaSetDevice(tensor->getDevice().no);

  auto gpuBackend
      = std::static_pointer_cast<gpu::Backend>(tensor->getBackend());
  curandGenerator_t gen = gpuBackend->getCurandGenerator();
  int n = tensor->size();
  // An empty tensor would give numThreads == 0 and a division by zero in the
  // block count below; there is nothing to generate in that case.
  if(n <= 0)
    return;

  CURAND_CALL(curandGenerateUniform(gen, tensor->data(), n));
  int numThreads = std::min(n, 512);
  int numBlocks = n / numThreads + (n % numThreads != 0);

  gScale<<<numBlocks, numThreads>>>(tensor->data(), n, 1.f - p);
}
}
}

View File

@ -1,6 +1,5 @@
#include "tensors/gpu/element.h"
#include "tensors/gpu/cuda_helpers.h"
#include "functional/array.h"
#include "functional/tensor.h"
@ -11,9 +10,9 @@ namespace marian {
namespace gpu {
template <size_t K, bool broadcast, class Functor>
__global__ void gElement(Functor functor,
functional::Array<functional::Tensor<float>, K> tensors) {
__global__ void gElement(
Functor functor,
functional::Array<functional::Tensor<float>, K> tensors) {
int length = tensors[0].shape().elements();
functional::Array<int, functional::Shape::size()> dims;
functional::Array<int, K> indices;
@ -21,7 +20,6 @@ __global__ void gElement(Functor functor,
for(int bid = 0; bid < length; bid += blockDim.x * gridDim.x) {
int index = bid + blockDim.x * blockIdx.x + threadIdx.x;
if(index < length) {
indices.fill(index);
if(broadcast) {
@ -35,8 +33,8 @@ __global__ void gElement(Functor functor,
}
}
template <class Functor, class ...Tensors>
void Element(Functor functor, Tensor out, Tensors ...tensors) {
template <class Functor, class... Tensors>
void Element(Functor functor, Tensor out, Tensors... tensors) {
cudaSetDevice(out->getDevice().no);
constexpr size_t K = sizeof...(tensors) + 1;
@ -57,8 +55,5 @@ void Element(Functor functor, Tensor out, Tensors ...tensors) {
}
#include "tensors/gpu/element.inc"
}
}

View File

@ -5,8 +5,7 @@
namespace marian {
namespace gpu {
template <class Functor, class ...Tensors>
void Element(Functor functor, Tensor out, Tensors ...tensors);
template <class Functor, class... Tensors>
void Element(Functor functor, Tensor out, Tensors... tensors);
}
}

View File

@ -38,10 +38,11 @@ void Prod(marian::Tensor C,
cublasOperation_t opA = transA ? CUBLAS_OP_T : CUBLAS_OP_N;
cublasOperation_t opB = transB ? CUBLAS_OP_T : CUBLAS_OP_N;
auto cublasHandle = std::static_pointer_cast<gpu::Backend>(C->getBackend())->getCublasHandle();
auto cublasHandle = std::static_pointer_cast<gpu::Backend>(C->getBackend())
->getCublasHandle();
#if CUDA_VERSION >= 9000
//cublasSetMathMode(cublasHandle, CUBLAS_TENSOR_OP_MATH);
// cublasSetMathMode(cublasHandle, CUBLAS_TENSOR_OP_MATH);
#endif
cublasSgemm(cublasHandle,
@ -59,7 +60,7 @@ void Prod(marian::Tensor C,
C->data(),
ldc);
#if CUDA_VERSION >= 9000
//cublasSetMathMode(cublasHandle, CUBLAS_DEFAULT_MATH);
// cublasSetMathMode(cublasHandle, CUBLAS_DEFAULT_MATH);
#endif
}
@ -96,10 +97,11 @@ void ProdBatched(marian::Tensor C,
cublasOperation_t opA = transA ? CUBLAS_OP_T : CUBLAS_OP_N;
cublasOperation_t opB = transB ? CUBLAS_OP_T : CUBLAS_OP_N;
auto cublasHandle = std::static_pointer_cast<gpu::Backend>(C->getBackend())->getCublasHandle();
auto cublasHandle = std::static_pointer_cast<gpu::Backend>(C->getBackend())
->getCublasHandle();
#if CUDA_VERSION >= 9000
//cublasSetMathMode(cublasHandle, CUBLAS_TENSOR_OP_MATH);
// cublasSetMathMode(cublasHandle, CUBLAS_TENSOR_OP_MATH);
#endif
cublasSgemmStridedBatched(cublasHandle,
opB,
@ -120,10 +122,8 @@ void ProdBatched(marian::Tensor C,
n * m,
std::max(batchA, batchB));
#if CUDA_VERSION >= 9000
//cublasSetMathMode(cublasHandle, CUBLAS_DEFAULT_MATH);
// cublasSetMathMode(cublasHandle, CUBLAS_DEFAULT_MATH);
#endif
}
}
}

View File

@ -21,6 +21,5 @@ void ProdBatched(marian::Tensor C,
bool transB,
float beta = 0,
float scalar = 1);
}
}

View File

@ -2,8 +2,8 @@
#include <cusparse_v2.h>
#include "common/definitions.h"
#include "tensors/tensor.h"
#include "kernels/cuda_helpers.h"
#include "tensors/tensor.h"
namespace marian {

View File

@ -28,11 +28,11 @@ __device__ inline float stableLogit(float x) {
}
// NaN detection is currently switched off: the thrust-based reduction is
// commented out and the function returns false for every input.
bool IsNan(Tensor in) {
  // cudaSetDevice(in->getDevice().no);
  // thrust::device_ptr<float> begin = thrust::device_pointer_cast(in->data());
  // thrust::device_ptr<float> end
  //    = thrust::device_pointer_cast(in->data() + in->size());
  // return thrust::transform_reduce(
  //    begin, end, isnan_test(), 0, thrust::plus<bool>());
  return false;
}
@ -93,10 +93,9 @@ void Concatenate1(Tensor out, const std::vector<Tensor>& inputs) {
for(auto in : inputs) {
ABORT_IF(rows != in->shape().elements() / in->shape().back(),
"First dimension must be equal");
"First dimension must be equal");
int cols_in = in->shape().back();
int blocks = std::min(MAX_BLOCKS, rows);
int threads = std::min(MAX_THREADS, cols_in);
@ -122,7 +121,7 @@ void Split1(std::vector<Tensor>& outputs, const Tensor in) {
int cols_in = in->shape().back();
for(auto out : outputs) {
ABORT_IF(rows != out->shape().elements() / out->shape().back(),
"First dimension must be equal");
"First dimension must be equal");
int cols_out = out->shape().back();
int blocks = std::min(MAX_BLOCKS, rows);
@ -166,10 +165,10 @@ void Deconcatenate(std::vector<Tensor>& outputs, const Tensor in, int ax) {
SplitCont(outputs, in, ax);
}
__global__ void gTransposeND(functional::Tensor<float> out,
const functional::Tensor<float> in,
const functional::Array<int, functional::Shape::size()> permute) {
__global__ void gTransposeND(
functional::Tensor<float> out,
const functional::Tensor<float> in,
const functional::Array<int, functional::Shape::size()> permute) {
constexpr size_t N = functional::Shape::size();
functional::Array<int, N> oDims;
functional::Array<int, N> pDims;
@ -458,7 +457,6 @@ void SoftmaxGrad(Tensor grad, Tensor adj, Tensor val) {
int m = grad->shape().elements() / grad->shape().back();
int k = grad->shape().back();
int blocks = std::min(MAX_BLOCKS, m);
int threads = std::min(MAX_THREADS, k);
int shared = sizeof(float) * threads * 2;
@ -784,7 +782,9 @@ void Select(Tensor out,
int blocks = std::min(MAX_BLOCKS, length / threads + (length % threads != 0));
auto mp_indices = allocator->alloc<size_t>(indices.size());
CudaCopy(indices.data(), indices.data() + indices.size(), mp_indices->data<size_t>());
CudaCopy(indices.data(),
indices.data() + indices.size(),
mp_indices->data<size_t>());
int axisGPU = axis + functional::Shape::size() - out->shape().size();
gSelect<<<blocks, threads>>>(out->data(),
@ -810,7 +810,9 @@ void Insert(Tensor out,
int blocks = std::min(MAX_BLOCKS, length / threads + (length % threads != 0));
auto mp_indices = allocator->alloc<size_t>(indices.size());
CudaCopy(indices.data(), indices.data() + indices.size(), mp_indices->data<size_t>());
CudaCopy(indices.data(),
indices.data() + indices.size(),
mp_indices->data<size_t>());
int axisGPU = axis + functional::Shape::size() - out->shape().size();
gInsert<<<blocks, threads>>>(out->data(),
@ -1174,19 +1176,18 @@ void CrossEntropyPickBackward(Tensor out, Tensor adj, Tensor a, Tensor pick) {
out->data(), out->shape(), adj->data(), a->data(), pick->data());
}
float L2Norm(Tensor in) {
cudaSetDevice(in->getDevice().no);
int size = in->shape().elements();
int threads = std::min(MAX_THREADS, size);
int blocks = std::min(MAX_BLOCKS, size / threads + (size % threads != 0));
int blocks = std::min(MAX_BLOCKS, size / threads + (size % threads != 0));
uint8_t* data;
cudaMalloc(&data, blocks * sizeof(float));
Tensor out(new TensorBase(
New<MemoryPiece>(data, blocks * sizeof(float)), {1, blocks}, in->getBackend()));
Tensor out(new TensorBase(New<MemoryPiece>(data, blocks * sizeof(float)),
{1, blocks},
in->getBackend()));
using namespace functional;
ReduceAll(_1 * _1, out, in);
@ -1203,7 +1204,7 @@ __global__ void gAtt(float* out,
int m, // total rows (batch x time x beam)
int k, // depth
int b, // batch size
int t // time of ctx
int t // time of ctx
) {
int rows = m;
int cols = k;
@ -1255,14 +1256,8 @@ void Att(Tensor out, Tensor va, Tensor context, Tensor state) {
int threads = std::min(MAX_THREADS, (int)k);
int shared = sizeof(float) * threads * 2;
gAtt<<<blocks, threads, shared>>>(out->data(),
va->data(),
context->data(),
state->data(),
m,
k,
b,
t);
gAtt<<<blocks, threads, shared>>>(
out->data(), va->data(), context->data(), state->data(), m, k, b, t);
}
__global__ void gAttBack(float* gVa,
@ -1576,7 +1571,6 @@ __global__ void gShift(float* out, const float* in, int length, int offset) {
}
void Shift(Tensor out, Tensor in, marian::Shape shift, bool invert) {
ABORT_IF(in->shape().size() != shift.size(), "bad dimensions");
int offset = 0;
@ -2006,21 +2000,22 @@ __global__ void gMaxPoolingForward(float* out,
int lastWidth) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid >= outRows * outCols) return;
if(tid >= outRows * outCols)
return;
int rowId = tid / outRows;
int colId = tid % outRows;
float* b = in + (rowId * inCols) + (colId * width);
float* localMask = mask + (rowId / numKernels) * maskCols + colId * width;
float* localMask = mask + (rowId / numKernels) * maskCols + colId * width;
if (colId == outRows - 1) {
if(colId == outRows - 1) {
width = lastWidth;
}
float currentMax = b[0] * localMask[0];
for (int i = 1; i < width; ++i) {
if (b[i] * localMask[i] > currentMax) {
for(int i = 1; i < width; ++i) {
if(b[i] * localMask[i] > currentMax) {
currentMax = b[i] * localMask[i];
}
}
@ -2045,15 +2040,20 @@ void PoolingWithMaskingForward(Tensor out,
int outRows = outShape[2];
int outCols = outShape[0] * outShape[1];
int lastWidth = ((inCols - isEven) % width == 0)
? width
: (inCols - isEven) % width;
int lastWidth
= ((inCols - isEven) % width == 0) ? width : (inCols - isEven) % width;
gMaxPoolingForward<<<blocks, threads>>>(
out->data(), outRows, outCols,
in->data(), inRows, inCols,
mask->data(), outShape[1], mask->shape()[2],
width, lastWidth);
gMaxPoolingForward<<<blocks, threads>>>(out->data(),
outRows,
outCols,
in->data(),
inRows,
inCols,
mask->data(),
outShape[1],
mask->shape()[2],
width,
lastWidth);
}
__global__ void gMaxPoolingBackward(float* adj,
@ -2067,30 +2067,31 @@ __global__ void gMaxPoolingBackward(float* adj,
int numKernels,
int maskCols,
int width,
int lastWidth)
{
int lastWidth) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid >= adjRows * adjCols) return;
if(tid >= adjRows * adjCols)
return;
int rowId = tid / adjRows;
int colId = tid % adjRows;
float* b = in + (rowId * inCols) + (colId * width);
if (colId == adjRows - 1) {
if(colId == adjRows - 1) {
width = lastWidth;
}
float* localMask = mask + (rowId / numKernels) * maskCols + colId * width;
size_t currentMaxIdx = 0;
for (int i = 1; i < width; ++i) {
if (b[i] * localMask[i] > b[currentMaxIdx] * localMask[currentMaxIdx]) {
for(int i = 1; i < width; ++i) {
if(b[i] * localMask[i] > b[currentMaxIdx] * localMask[currentMaxIdx]) {
currentMaxIdx = i;
}
}
adjIn[(rowId * inCols) + (colId * width) + currentMaxIdx] += adj[rowId + (colId * adjCols)];
adjIn[(rowId * inCols) + (colId * width) + currentMaxIdx]
+= adj[rowId + (colId * adjCols)];
}
void PoolingWithMaskingBackward(Tensor adj,
@ -2111,16 +2112,21 @@ void PoolingWithMaskingBackward(Tensor adj,
int adjRows = adjShape[2];
int adjCols = adjShape[0] * adjShape[1];
int lastWidth = ((inCols - isEven) % width == 0)
? width
: (inCols - isEven) % width;
int lastWidth
= ((inCols - isEven) % width == 0) ? width : (inCols - isEven) % width;
gMaxPoolingBackward<<<blocks, threads>>>(
adj->data(), adjRows, adjCols,
in->data(), adjIn->data(), inRows, inCols,
mask->data(), adjShape[1], mask->shape()[2],
width, lastWidth);
gMaxPoolingBackward<<<blocks, threads>>>(adj->data(),
adjRows,
adjCols,
in->data(),
adjIn->data(),
inRows,
inCols,
mask->data(),
adjShape[1],
mask->shape()[2],
width,
lastWidth);
}
}
} // namespace marian

View File

@ -78,7 +78,7 @@ public:
#endif
}
void get(std::vector<float> &v) {
void get(std::vector<float>& v) {
v.resize(size());
if(backend_->getDevice().type == DeviceType::cpu) {
std::copy(data(), data() + size(), v.data());
@ -101,9 +101,7 @@ public:
#endif
}
void set(const std::vector<float> &v) {
set(v.data(), v.data() + v.size());
}
void set(const std::vector<float>& v) { set(v.data(), v.data() + v.size()); }
void set(float value) {
if(backend_->getDevice().type == DeviceType::cpu) {
@ -116,8 +114,7 @@ public:
#endif
}
void setSparse(const std::vector<size_t> &k,
const std::vector<float> &v) {
void setSparse(const std::vector<size_t>& k, const std::vector<float>& v) {
if(backend_->getDevice().type == DeviceType::cpu) {
for(int i = 0; i < k.size(); ++i)
data()[k[i]] = v[i];
@ -130,8 +127,8 @@ public:
}
void copyFrom(Tensor in) {
if(in->getBackend()->getDevice().type == DeviceType::cpu &&
backend_->getDevice().type == DeviceType::cpu) {
if(in->getBackend()->getDevice().type == DeviceType::cpu
&& backend_->getDevice().type == DeviceType::cpu) {
std::copy(in->data(), in->data() + in->size(), data());
}
#ifdef CUDA_FOUND
@ -167,7 +164,6 @@ public:
disp = disp && (dims[j] < dispCols || dims[j] >= shape()[j] - dispCols);
if(disp) {
if(dims.back() == 0) {
bool par = true;
std::vector<std::string> p;
@ -182,9 +178,7 @@ public:
strm << " ";
}
strm << std::setw(12)
<< values[i]
<< " ";
strm << std::setw(12) << values[i] << " ";
if(dims.back() + 1 == shape().back()) {
for(int j = dims.size() - 1; j >= 0; --j) {
@ -214,9 +208,7 @@ public:
strm << std::endl;
return strm.str();
}
};
typedef std::shared_ptr<TensorBase> Tensor;
}

View File

@ -21,53 +21,47 @@
namespace marian {
template <class Functor, class ...Tensors>
void Element(Functor functor, marian::Tensor out, Tensors ...tensors) {
template <class Functor, class... Tensors>
void Element(Functor functor, marian::Tensor out, Tensors... tensors) {
#ifdef CUDA_FOUND
if(out->getBackend()->getDevice().type == DeviceType::gpu)
gpu::Element(functor, out, tensors...);
else
if(out->getBackend()->getDevice().type == DeviceType::gpu)
gpu::Element(functor, out, tensors...);
else
#endif
cpu::Element(functor, out, tensors...);
}
cpu::Element(functor, out, tensors...);
}
template <class Functor, class ...Tensors>
void Add(Functor functor,
float scale,
marian::Tensor out,
Tensors... tensors) {
template <class Functor, class... Tensors>
void Add(Functor functor, float scale, marian::Tensor out, Tensors... tensors) {
#ifdef CUDA_FOUND
if(out->getBackend()->getDevice().type == DeviceType::gpu)
gpu::Add(functor, scale, out, tensors...);
else
if(out->getBackend()->getDevice().type == DeviceType::gpu)
gpu::Add(functor, scale, out, tensors...);
else
#endif
cpu::Add(functor, scale, out, tensors...);
}
cpu::Add(functor, scale, out, tensors...);
}
template <class Functor, class ...Tensors>
void Add(Functor functor,
marian::Tensor out,
Tensors... tensors) {
Add(functor, 1, out, tensors...);
}
template <class Functor, class... Tensors>
void Add(Functor functor, marian::Tensor out, Tensors... tensors) {
Add(functor, 1, out, tensors...);
}
template <class Functor, class ...Tensors>
void Reduce(Functor functor,
float scale,
marian::Tensor out,
Tensors... tensors) {
out->set(0);
Add(functor, scale, out, tensors...);
}
template <class Functor, class... Tensors>
void Reduce(Functor functor,
float scale,
marian::Tensor out,
Tensors... tensors) {
out->set(0);
Add(functor, scale, out, tensors...);
}
template <class Functor, class ...Tensors>
void Reduce(Functor functor,
marian::Tensor out,
Tensors... tensors) {
out->set(0);
Add(functor, out, tensors...);
}
template <class Functor, class... Tensors>
void Reduce(Functor functor, marian::Tensor out, Tensors... tensors) {
out->set(0);
Add(functor, out, tensors...);
}
// clang-format off
DISPATCH7(Prod, marian::Tensor, const marian::Tensor, const marian::Tensor, bool, bool, float, float)
DISPATCH7(ProdBatched, marian::Tensor, const marian::Tensor, const marian::Tensor, bool, bool, float, float)
@ -86,26 +80,34 @@ namespace marian {
DISPATCH4(Shift, marian::Tensor, marian::Tensor, marian::Shape, bool)
DISPATCH3(Concatenate, marian::Tensor, const std::vector<marian::Tensor>&, int)
// clang-format on
#ifdef CUDA_FOUND
namespace gpu {
void Deconcatenate(std::vector<marian::Tensor>& outputs, const marian::Tensor in, int ax);
}
namespace gpu {
void Deconcatenate(std::vector<marian::Tensor>& outputs,
const marian::Tensor in,
int ax);
}
#endif
namespace cpu {
void Deconcatenate(std::vector<marian::Tensor>& outputs, const marian::Tensor in, int ax);
}
namespace cpu {
void Deconcatenate(std::vector<marian::Tensor>& outputs,
const marian::Tensor in,
int ax);
}
static inline void Deconcatenate(std::vector<marian::Tensor>& outputs, const marian::Tensor in, int ax) {
static inline void Deconcatenate(std::vector<marian::Tensor>& outputs,
const marian::Tensor in,
int ax) {
#ifdef CUDA_FOUND
if(in->getBackend()->getDevice().type == DeviceType::gpu)
gpu::Deconcatenate(outputs, in, ax);
else
if(in->getBackend()->getDevice().type == DeviceType::gpu)
gpu::Deconcatenate(outputs, in, ax);
else
#endif
cpu::Deconcatenate(outputs, in, ax);
}
cpu::Deconcatenate(outputs, in, ax);
}
// clang-format off
DISPATCH5(LayerNormalization, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, float)
DISPATCH9(LayerNormalizationGrad, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, float)
@ -120,113 +122,116 @@ namespace marian {
DISPATCH5(Select, marian::Tensor, marian::Tensor, int, const std::vector<size_t>&, Ptr<Allocator>)
DISPATCH5(Insert, marian::Tensor, marian::Tensor, int, const std::vector<size_t>&, Ptr<Allocator>)
DISPATCH2(LSTMCellForward, marian::Tensor, std::vector<marian::Tensor>)
DISPATCH2(LSTMOutputForward, marian::Tensor, std::vector<marian::Tensor>);
// clang-format on
#ifdef CUDA_FOUND
namespace gpu {
void LSTMCellBackward(std::vector<marian::Tensor> outputs,
std::vector<marian::Tensor> inputs,
marian::Tensor adj);
}
namespace gpu {
void LSTMCellBackward(std::vector<marian::Tensor> outputs,
std::vector<marian::Tensor> inputs,
marian::Tensor adj);
}
#endif
namespace cpu {
void LSTMCellBackward(std::vector<marian::Tensor> outputs,
std::vector<marian::Tensor> inputs,
marian::Tensor adj);
}
namespace cpu {
void LSTMCellBackward(std::vector<marian::Tensor> outputs,
std::vector<marian::Tensor> inputs,
marian::Tensor adj);
}
static inline void LSTMCellBackward(std::vector<marian::Tensor> outputs,
static inline void LSTMCellBackward(std::vector<marian::Tensor> outputs,
std::vector<marian::Tensor> inputs,
marian::Tensor adj) {
#ifdef CUDA_FOUND
if(adj->getBackend()->getDevice().type == DeviceType::gpu)
gpu::LSTMCellBackward(outputs, inputs, adj);
else
#endif
cpu::LSTMCellBackward(outputs, inputs, adj);
}
#ifdef CUDA_FOUND
namespace gpu {
void LSTMOutputBackward(std::vector<marian::Tensor> outputs,
std::vector<marian::Tensor> inputs,
marian::Tensor adj);
}
#endif
namespace cpu {
void LSTMOutputBackward(std::vector<marian::Tensor> outputs,
std::vector<marian::Tensor> inputs,
marian::Tensor adj);
}
static inline void LSTMOutputBackward(std::vector<marian::Tensor> outputs,
std::vector<marian::Tensor> inputs,
marian::Tensor adj) {
#ifdef CUDA_FOUND
if(adj->getBackend()->getDevice().type == DeviceType::gpu)
gpu::LSTMCellBackward(outputs, inputs, adj);
else
if(adj->getBackend()->getDevice().type == DeviceType::gpu)
gpu::LSTMOutputBackward(outputs, inputs, adj);
else
#endif
cpu::LSTMCellBackward(outputs, inputs, adj);
}
cpu::LSTMOutputBackward(outputs, inputs, adj);
}
DISPATCH3(GRUFastForward, marian::Tensor, std::vector<marian::Tensor>, bool)
#ifdef CUDA_FOUND
namespace gpu {
void LSTMOutputBackward(std::vector<marian::Tensor> outputs,
std::vector<marian::Tensor> inputs,
marian::Tensor adj);
}
namespace gpu {
void GRUFastBackward(std::vector<marian::Tensor> outputs,
std::vector<marian::Tensor> inputs,
marian::Tensor adj,
bool final);
}
#endif
namespace cpu {
void LSTMOutputBackward(std::vector<marian::Tensor> outputs,
std::vector<marian::Tensor> inputs,
marian::Tensor adj);
}
namespace cpu {
void GRUFastBackward(std::vector<marian::Tensor> outputs,
std::vector<marian::Tensor> inputs,
marian::Tensor adj,
bool final);
}
static inline void LSTMOutputBackward(std::vector<marian::Tensor> outputs,
std::vector<marian::Tensor> inputs,
marian::Tensor adj) {
static inline void GRUFastBackward(std::vector<marian::Tensor> outputs,
std::vector<marian::Tensor> inputs,
marian::Tensor adj,
bool final = false) {
#ifdef CUDA_FOUND
if(adj->getBackend()->getDevice().type == DeviceType::gpu)
gpu::LSTMOutputBackward(outputs, inputs, adj);
else
if(adj->getBackend()->getDevice().type == DeviceType::gpu)
gpu::GRUFastBackward(outputs, inputs, adj, final);
else
#endif
cpu::LSTMOutputBackward(outputs, inputs, adj);
}
DISPATCH3(GRUFastForward, marian::Tensor, std::vector<marian::Tensor>, bool)
#ifdef CUDA_FOUND
namespace gpu {
void GRUFastBackward(std::vector<marian::Tensor> outputs,
std::vector<marian::Tensor> inputs,
marian::Tensor adj,
bool final);
}
#endif
namespace cpu {
void GRUFastBackward(std::vector<marian::Tensor> outputs,
std::vector<marian::Tensor> inputs,
marian::Tensor adj,
bool final);
}
static inline void GRUFastBackward(std::vector<marian::Tensor> outputs,
std::vector<marian::Tensor> inputs,
marian::Tensor adj,
bool final = false) {
#ifdef CUDA_FOUND
if(adj->getBackend()->getDevice().type == DeviceType::gpu)
gpu::GRUFastBackward(outputs, inputs, adj, final);
else
#endif
cpu::GRUFastBackward(outputs, inputs, adj, final);
}
cpu::GRUFastBackward(outputs, inputs, adj, final);
}
// clang-format off
DISPATCH4(Att, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor)
DISPATCH7(AttBack, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor)
// clang-format on
#ifdef CUDA_FOUND
namespace gpu {
float L2Norm(marian::Tensor in);
}
namespace gpu {
float L2Norm(marian::Tensor in);
}
#endif
namespace cpu {
float L2Norm(marian::Tensor in);
}
namespace cpu {
float L2Norm(marian::Tensor in);
}
static inline float L2Norm(marian::Tensor in) {
static inline float L2Norm(marian::Tensor in) {
#ifdef CUDA_FOUND
if(in->getBackend()->getDevice().type == DeviceType::gpu)
return gpu::L2Norm(in);
else
if(in->getBackend()->getDevice().type == DeviceType::gpu)
return gpu::L2Norm(in);
else
#endif
return cpu::L2Norm(in);
}
return cpu::L2Norm(in);
}
// clang-format off
DISPATCH5(PoolingWithMaskingForward, marian::Tensor, marian::Tensor, marian::Tensor, int, bool)
DISPATCH6(PoolingWithMaskingBackward, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, int, bool)
// clang-format on
}

View File

@ -104,7 +104,8 @@ void AsyncGraphGroup::init(Ptr<data::Batch> batch) {
totalSize -= __size__;
Tensor param;
Ptr<TensorAllocator> allocator = New<TensorAllocator>(graph->getBackend());
Ptr<TensorAllocator> allocator
= New<TensorAllocator>(graph->getBackend());
allocator->reserveExact(__size__ * sizeof(float));
allocator->allocate(param, {1, __size__});
paramsAlloc_.push_back(allocator);
@ -122,7 +123,8 @@ void AsyncGraphGroup::init(Ptr<data::Batch> batch) {
int __size__ = std::min(shardSize_, totalSize);
totalSize -= __size__;
Tensor grad_;
Ptr<TensorAllocator> allocator_ = New<TensorAllocator>(graph->getBackend());
Ptr<TensorAllocator> allocator_
= New<TensorAllocator>(graph->getBackend());
allocator_->reserveExact(__size__ * sizeof(float));
allocator_->allocate(grad_, {1, __size__});
@ -139,7 +141,8 @@ void AsyncGraphGroup::init(Ptr<data::Batch> batch) {
int __size__ = std::min(shardSize_, totalSize);
totalSize -= __size__;
Tensor paramAvg;
Ptr<TensorAllocator> allocator = New<TensorAllocator>(graph->getBackend());
Ptr<TensorAllocator> allocator
= New<TensorAllocator>(graph->getBackend());
allocator->reserveExact(__size__ * sizeof(float));
allocator->allocate(paramAvg, {1, __size__});
@ -230,7 +233,8 @@ void AsyncGraphGroup::execute(Ptr<data::Batch> batch) {
scheduler_->update(cost, batch);
if(scheduler_->saving() || scheduler_->validating()) {
// Wait with validation or saving until all other threads are done with update.
// Wait with validation or saving until all other threads are done with
// update.
// We want to reuse the graphs for validation, so they need to be in
// a safe state.
pool_->wait_for_others(lock);

View File

@ -70,7 +70,6 @@ public:
movingAvg_{options_->get<float>("exponential-smoothing") > 0},
mvDecay_{options_->get<float>("exponential-smoothing")},
tau_{options_->get<size_t>("optimizer-delay")} {
pool_.reset(new ThreadPool(devices_.size(), devices_.size()));
for(auto device : devices_) {

View File

@ -150,7 +150,8 @@ void MultiNodeGraphGroup::initClientCommOverlapGpuTensors() {
size_t modelSize = clientGraphs_[0]->params()->vals()->size();
for(int client = 0; client < devices_.size(); client++) {
// Communication overlap buffer (for grads + params)
Tensor commOverlapBuffer = newTensor(modelSize, clientGraphs_[client]->getBackend());
Tensor commOverlapBuffer
= newTensor(modelSize, clientGraphs_[client]->getBackend());
commOverlapBuffer->copyFrom(clientGraphs_[0]->params()->vals());
clientCommOverlapBuffersGPU_.push_back(commOverlapBuffer);
// Gradients local sum buffer
@ -206,11 +207,13 @@ void MultiNodeGraphGroup::calculateShardSizes() {
void MultiNodeGraphGroup::initShardGpuTensors() {
size_t offset = 0;
for(int shard = 0; shard < devices_.size(); shard++) {
Tensor gpuParams = newTensor(shardSizes_[shard], clientGraphs_[shard]->getBackend());
Tensor gpuParams
= newTensor(shardSizes_[shard], clientGraphs_[shard]->getBackend());
gpuParams->copyFrom(clientGraphs_[0]->params()->vals()->subtensor(
offset, shardSizes_[shard]));
shardParams_.push_back(gpuParams);
shardGrads_.push_back(newTensor(shardSizes_[shard], clientGraphs_[shard]->getBackend()));
shardGrads_.push_back(
newTensor(shardSizes_[shard], clientGraphs_[shard]->getBackend()));
}
}

View File

@ -1,5 +1,5 @@
#include "tensors/tensor_operators.h"
#include "training/graph_group_singleton.h"
#include "tensors/tensor_operators.h"
#include "functional/functional.h"
namespace marian {
@ -15,7 +15,8 @@ void SingletonGraph::updateMovingAverage(Tensor mvAvgParams,
Tensor params,
size_t batches) {
using namespace functional;
float decay = std::max(mvDecay_, 1.f - (float)(batches + 1) / (float)(batches + 10));
float decay
= std::max(mvDecay_, 1.f - (float)(batches + 1) / (float)(batches + 10));
Element(_1 = ((1.f - decay) * _1) + (decay * _2), mvAvgParams, params);
}
@ -59,8 +60,7 @@ void SingletonGraph::execute(Ptr<data::Batch> batch) {
if(mvAvg_) {
mvAvgGraph_->reuseWorkspace(graph_);
scheduler_->validate({mvAvgGraph_});
}
else {
} else {
scheduler_->validate({graph_});
}
}

View File

@ -29,7 +29,6 @@ public:
: GraphGroup(options),
mvAvg_{options_->get<float>("exponential-smoothing") > 0},
mvDecay_{options_->get<float>("exponential-smoothing")} {
auto deviceId = options_->getDevices()[0];
graph_ = New<ExpressionGraph>();
graph_->setDevice(deviceId);

View File

@ -1,6 +1,6 @@
#include "training/graph_group_sync.h"
#include "tensors/tensor_operators.h"
#include "functional/functional.h"
#include "tensors/tensor_operators.h"
namespace marian {
@ -17,7 +17,8 @@ void SyncGraphGroup::updateMovingAverage(Tensor paramsAvg,
Tensor params,
size_t batches) {
using namespace functional;
float decay = std::max(mvDecay_, 1.f - (float)(batches + 1) / (float)(batches + 10));
float decay
= std::max(mvDecay_, 1.f - (float)(batches + 1) / (float)(batches + 10));
Element(_1 = ((1.f - decay) * _1) + (decay * _2), paramsAvg, params);
}
@ -135,10 +136,10 @@ void SyncGraphGroup::execute(Ptr<data::Batch> batch) {
int size = params_[idx]->size();
int i = 0;
float div = devices_.size(); // no. of GPUs
float div = devices_.size(); // no. of GPUs
// do not average gradients if cost type is sum.
if (options_->get<std::string>("cost-type") == "ce-sum") {
if(options_->get<std::string>("cost-type") == "ce-sum") {
div = 1;
}
@ -176,7 +177,7 @@ void SyncGraphGroup::execute(Ptr<data::Batch> batch) {
float cost = 0;
for(auto c : costs)
cost += c;
if (options_->get<std::string>("cost-type") != "ce-sum") {
if(options_->get<std::string>("cost-type") != "ce-sum") {
cost = cost / costs.size();
}

View File

@ -43,7 +43,6 @@ public:
devices_{options_->getDevices()},
movingAvg_{options_->get<float>("exponential-smoothing") > 0},
mvDecay_{options_->get<float>("exponential-smoothing")} {
for(auto device : devices_) {
auto graph = New<ExpressionGraph>();
graph->setDevice(device);

View File

@ -100,7 +100,8 @@ public:
return (state_->batches % options_->get<size_t>("save-freq") == 0);
}
void validate(const std::vector<Ptr<ExpressionGraph>>& graphs, bool final = false) {
void validate(const std::vector<Ptr<ExpressionGraph>>& graphs,
bool final = false) {
if(state_->validated
|| (state_->batches % options_->get<size_t>("valid-freq") != 0
&& !final))

View File

@ -7,8 +7,8 @@
#include "tensors/tensor.h"
#include "tensors/tensor_operators.h"
#include "training/sparse_tensor.h"
#include "tensors/gpu/cuda_helpers.h"
#include "tensors/gpu/cuda_helpers.h"
namespace marian {
@ -48,10 +48,10 @@ __global__ void gFindSubtensor(int* indices,
}
SparseTensorBase::SparseTensorBase(int capacity, Ptr<Backend> backend)
: backend_(backend), capacity_(capacity) {
: backend_(backend), capacity_(capacity) {
ABORT_IF(backend_->getDevice().type == DeviceType::cpu,
"Gradient dropping is currently not implemented for CPU usage");
"Gradient dropping is currently not implemented for CPU usage");
cudaSetDevice(backend_->getDevice().no);
CUDA_CHECK(cudaMalloc(&data_, sizeof(float) * capacity));
CUDA_CHECK(cudaMalloc(&indices_, sizeof(int) * capacity));
@ -64,7 +64,7 @@ SparseTensorBase::SparseTensorBase(float* data,
int* indices,
int size,
Ptr<Backend> backend)
: backend_(backend) {
: backend_(backend) {
data_ = data;
indices_ = indices;
size_ = size;
@ -98,10 +98,10 @@ void SparseTensorBase::copyFrom(float* data,
size_ = size;
if(size == 0)
return;
ABORT_IF(backend_->getDevice().type == DeviceType::cpu,
"Gradient dropping is currently not implemented for CPU usage");
"Gradient dropping is currently not implemented for CPU usage");
cudaSetDevice(backend_->getDevice().no);
cudaMemcpy(data_, data, size * sizeof(float), cudaMemcpyDefault);
@ -128,7 +128,7 @@ void SparseTensorBase::setSize(int size) {
void SparseTensorBase::toDense(Tensor t, int offset) {
ABORT_IF(backend_->getDevice().type == DeviceType::cpu,
"Gradient dropping is currently not implemented for CPU usage");
cudaSetDevice(backend_->getDevice().no);
int threads = 512;
int blocks = 1 + size_ / threads;
@ -152,8 +152,8 @@ std::shared_ptr<SparseTensorBase> SparseTensorBase::subtensor(int pos,
int size,
int idx) {
ABORT_IF(backend_->getDevice().type == DeviceType::cpu,
"Gradient dropping is currently not implemented for CPU usage");
"Gradient dropping is currently not implemented for CPU usage");
cudaSetDevice(backend_->getDevice().no);
cudaStreamSynchronize(0);
int* start = gstart_ + idx;

View File

@ -45,7 +45,6 @@ public:
if((options_->has("valid-sets") || options_->has("valid-script-path"))
&& options_->get<size_t>("valid-freq") > 0) {
for(auto validator : Validators(dataset->getVocabs(), options_))
scheduler->addValidator(validator);
}

View File

@ -4,9 +4,9 @@
#include <cstdlib>
#include <limits>
#include "3rd_party/threadpool.h"
#include "common/config.h"
#include "common/utils.h"
#include "3rd_party/threadpool.h"
#include "data/batch_generator.h"
#include "data/corpus.h"
#include "graph/expression_graph.h"
@ -25,8 +25,7 @@ namespace marian {
class ValidatorBase : public TrainingObserver {
public:
ValidatorBase(bool lowerIsBetter)
: lowerIsBetter_(lowerIsBetter),
lastBest_{initScore()} {}
: lowerIsBetter_(lowerIsBetter), lastBest_{initScore()} {}
virtual float validate(const std::vector<Ptr<ExpressionGraph>>& graphs) = 0;
virtual std::string type() = 0;
@ -98,7 +97,8 @@ protected:
Ptr<data::BatchGenerator<DataSet>>)
= 0;
void updateStalled(const std::vector<Ptr<ExpressionGraph>>& graphs, float val) {
void updateStalled(const std::vector<Ptr<ExpressionGraph>>& graphs,
float val) {
if((lowerIsBetter_ && lastBest_ > val)
|| (!lowerIsBetter_ && lastBest_ < val)) {
stalled_ = 0;
@ -226,7 +226,6 @@ public:
TranslationValidator(std::vector<Ptr<Vocab>> vocabs, Ptr<Config> options)
: Validator(vocabs, options, false),
quiet_(options_->get<bool>("quiet-translation")) {
Ptr<Options> opts = New<Options>();
opts->merge(options);
opts->set("inference", true);
@ -314,7 +313,8 @@ public:
scorer = scorers[id % graphs.size()];
}
auto search = New<BeamSearch>(options_, std::vector<Ptr<Scorer>>{scorer});
auto search
= New<BeamSearch>(options_, std::vector<Ptr<Scorer>>{scorer});
auto histories = search->search(graph, batch);
for(auto history : histories) {

View File

@ -34,21 +34,20 @@ public:
std::vector<Ptr<ScorerState>>& states,
size_t beamSize,
bool first) {
Beams newBeams(beams.size());
for(int i = 0; i < keys.size(); ++i) {
int embIdx = keys[i] % vocabSize;
int beamIdx = i / beamSize;
int embIdx = keys[i] % vocabSize;
int beamIdx = i / beamSize;
if(newBeams[beamIdx].size() < beams[beamIdx].size()) {
auto& beam = beams[beamIdx];
auto& newBeam = newBeams[beamIdx];
int hypIdx = keys[i] / vocabSize;
float cost = costs[i];
float cost = costs[i];
int hypIdxTrans = (hypIdx / beamSize) +
(hypIdx % beamSize) * beams.size();
int hypIdxTrans
= (hypIdx / beamSize) + (hypIdx % beamSize) * beams.size();
if(first)
hypIdxTrans = hypIdx;
@ -78,7 +77,7 @@ public:
Beams pruneBeam(const Beams& beams) {
Beams newBeams;
for(auto beam: beams) {
for(auto beam : beams) {
Beam newBeam;
for(auto hyp : beam) {
if(hyp->GetWord() > 0) {
@ -90,9 +89,7 @@ public:
return newBeams;
}
Histories search(Ptr<ExpressionGraph> graph,
Ptr<data::CorpusBatch> batch) {
Histories search(Ptr<ExpressionGraph> graph, Ptr<data::CorpusBatch> batch) {
int dimBatch = batch->size();
Histories histories;
for(int i = 0; i < dimBatch; ++i) {
@ -140,8 +137,7 @@ public:
Expr prevCosts;
if(first) {
// no cost
prevCosts = graph->constant({1, 1, 1, 1},
inits::from_value(0));
prevCosts = graph->constant({1, 1, 1, 1}, inits::from_value(0));
} else {
std::vector<float> beamCosts;
@ -155,8 +151,7 @@ public:
hypIndices.push_back(hyp->GetPrevStateIndex());
embIndices.push_back(hyp->GetWord());
beamCosts.push_back(hyp->GetCost());
}
else {
} else {
hypIndices.push_back(0);
embIndices.push_back(0);
beamCosts.push_back(-9999);
@ -164,9 +159,8 @@ public:
}
}
prevCosts
= graph->constant({(int)localBeamSize, 1, dimBatch, 1},
inits::from_vector(beamCosts));
prevCosts = graph->constant({(int)localBeamSize, 1, dimBatch, 1},
inits::from_vector(beamCosts));
}
//**********************************************************************
@ -174,10 +168,12 @@ public:
auto totalCosts = prevCosts;
for(int i = 0; i < scorers_.size(); ++i) {
states[i] = scorers_[i]->step(graph, states[i], hypIndices, embIndices, dimBatch, localBeamSize);
states[i] = scorers_[i]->step(
graph, states[i], hypIndices, embIndices, dimBatch, localBeamSize);
if(scorers_[i]->getWeight() != 1.f)
totalCosts = totalCosts + scorers_[i]->getWeight() * states[i]->getProbs();
totalCosts
= totalCosts + scorers_[i]->getWeight() * states[i]->getProbs();
else
totalCosts = totalCosts + states[i]->getProbs();
}
@ -207,12 +203,14 @@ public:
nth->getNBestList(beamSizes, totalCosts->val(), outCosts, outKeys, first);
int dimTrgVoc = totalCosts->shape()[-1];
beams = toHyps(outKeys, outCosts, dimTrgVoc, beams, states, localBeamSize, first);
beams = toHyps(
outKeys, outCosts, dimTrgVoc, beams, states, localBeamSize, first);
auto prunedBeams = pruneBeam(beams);
for(int i = 0; i < dimBatch; ++i) {
if(!beams[i].empty()) {
final = final || histories[i]->size() >= 3 * batch->front()->batchWidth();
final = final
|| histories[i]->size() >= 3 * batch->front()->batchWidth();
histories[i]->Add(beams[i], prunedBeams[i].empty() || final);
}
}

View File

@ -18,7 +18,7 @@ void SetColumn(Tensor in_, size_t col, float value) {
int nColumns = in_->shape()[-1];
float* in = in_->data();
for (int rowNumber = 0; rowNumber < nRows; ++rowNumber) {
for(int rowNumber = 0; rowNumber < nRows; ++rowNumber) {
int index = col + rowNumber * nColumns;
in[index] = value;
}
@ -31,7 +31,6 @@ void suppressUnk(Expr probs) {
void suppressWord(Expr probs, Word id) {
SetColumn(probs->val(), id, std::numeric_limits<float>::lowest());
}
}
void suppressUnk(Expr probs) {
@ -55,5 +54,4 @@ void suppressWord(Expr probs, Word id) {
}
#endif
}
}

View File

@ -44,6 +44,5 @@ void suppressUnk(Expr probs) {
void suppressWord(Expr probs, Word id) {
SetColumn(probs->val(), id, std::numeric_limits<float>::lowest());
}
}
}

View File

@ -14,7 +14,6 @@ namespace cpu {
void suppressUnk(Expr probs);
void suppressWord(Expr probs, Word id);
}
namespace gpu {
@ -22,11 +21,9 @@ namespace gpu {
void suppressUnk(Expr probs);
void suppressWord(Expr probs, Word id);
}
void suppressUnk(Expr probs);
void suppressWord(Expr probs, Word id);
}

View File

@ -27,7 +27,8 @@ public:
if(beam[j]->GetWord() == 0 || last) {
float cost = beam[j]->GetCost() / LengthPenalty(history_.size());
topHyps_.push({history_.size(), j, cost});
//std::cerr << "Add " << history_.size() << " " << j << " " << cost << std::endl;
// std::cerr << "Add " << history_.size() << " " << j << " " << cost
// << std::endl;
}
}
history_.push_back(beam);
@ -44,14 +45,14 @@ public:
size_t start = bestHypCoord.i;
size_t j = bestHypCoord.j;
//float c = bestHypCoord.cost;
//std::cerr << "h: " << start << " " << j << " " << c << std::endl;
// float c = bestHypCoord.cost;
// std::cerr << "h: " << start << " " << j << " " << c << std::endl;
Words targetWords;
Ptr<Hypothesis> bestHyp = history_[start][j];
while(bestHyp->GetPrevHyp() != nullptr) {
targetWords.push_back(bestHyp->GetWord());
//std::cerr << bestHyp->GetWord() << " " << bestHyp << std::endl;
// std::cerr << bestHyp->GetWord() << " " << bestHyp << std::endl;
bestHyp = bestHyp->GetPrevHyp();
}

Some files were not shown because too many files have changed in this diff Show More